Index: head/sys/dev/cxgbe/t4_main.c
===================================================================
--- head/sys/dev/cxgbe/t4_main.c	(revision 348253)
+++ head/sys/dev/cxgbe/t4_main.c	(revision 348254)
@@ -1,10850 +1,10845 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/priv.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/taskqueue.h>
 #include <sys/pciio.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pci_private.h>
 #include <sys/firmware.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_dl.h>
 #include <net/if_vlan_var.h>
 #ifdef RSS
 #include <net/rss_config.h>
 #endif
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #if defined(__i386__) || defined(__amd64__)
 #include <machine/md_var.h>
 #include <machine/cputypes.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #endif
 #include <crypto/rijndael/rijndael.h>
 #ifdef DDB
 #include <ddb/ddb.h>
 #include <ddb/db_lex.h>
 #endif
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "cudbg/cudbg.h"
 #include "t4_clip.h"
 #include "t4_ioctl.h"
 #include "t4_l2t.h"
 #include "t4_mp_ring.h"
 #include "t4_if.h"
 #include "t4_smt.h"
 
 /* T4 bus driver interface */
 static int t4_probe(device_t);
 static int t4_attach(device_t);
 static int t4_detach(device_t);
 static int t4_child_location_str(device_t, device_t, char *, size_t);
 static int t4_ready(device_t);
 static int t4_read_port_device(device_t, int, device_t *);
 static device_method_t t4_methods[] = {
 	DEVMETHOD(device_probe,		t4_probe),
 	DEVMETHOD(device_attach,	t4_attach),
 	DEVMETHOD(device_detach,	t4_detach),
 
 	DEVMETHOD(bus_child_location_str, t4_child_location_str),
 
 	DEVMETHOD(t4_is_main_ready,	t4_ready),
 	DEVMETHOD(t4_read_port_device,	t4_read_port_device),
 
 	DEVMETHOD_END
 };
 static driver_t t4_driver = {
 	"t4nex",
 	t4_methods,
 	sizeof(struct adapter)
 };
 
 
 /* T4 port (cxgbe) interface */
 static int cxgbe_probe(device_t);
 static int cxgbe_attach(device_t);
 static int cxgbe_detach(device_t);
 device_method_t cxgbe_methods[] = {
 	DEVMETHOD(device_probe,		cxgbe_probe),
 	DEVMETHOD(device_attach,	cxgbe_attach),
 	DEVMETHOD(device_detach,	cxgbe_detach),
 	{ 0, 0 }
 };
 static driver_t cxgbe_driver = {
 	"cxgbe",
 	cxgbe_methods,
 	sizeof(struct port_info)
 };
 
 /* T4 VI (vcxgbe) interface */
 static int vcxgbe_probe(device_t);
 static int vcxgbe_attach(device_t);
 static int vcxgbe_detach(device_t);
 static device_method_t vcxgbe_methods[] = {
 	DEVMETHOD(device_probe,		vcxgbe_probe),
 	DEVMETHOD(device_attach,	vcxgbe_attach),
 	DEVMETHOD(device_detach,	vcxgbe_detach),
 	{ 0, 0 }
 };
 static driver_t vcxgbe_driver = {
 	"vcxgbe",
 	vcxgbe_methods,
 	sizeof(struct vi_info)
 };
 
 static d_ioctl_t t4_ioctl;
 
 static struct cdevsw t4_cdevsw = {
        .d_version = D_VERSION,
        .d_ioctl = t4_ioctl,
        .d_name = "t4nex",
 };
 
 /* T5 bus driver interface */
 static int t5_probe(device_t);
 static device_method_t t5_methods[] = {
 	DEVMETHOD(device_probe,		t5_probe),
 	DEVMETHOD(device_attach,	t4_attach),
 	DEVMETHOD(device_detach,	t4_detach),
 
 	DEVMETHOD(bus_child_location_str, t4_child_location_str),
 
 	DEVMETHOD(t4_is_main_ready,	t4_ready),
 	DEVMETHOD(t4_read_port_device,	t4_read_port_device),
 
 	DEVMETHOD_END
 };
 static driver_t t5_driver = {
 	"t5nex",
 	t5_methods,
 	sizeof(struct adapter)
 };
 
 
 /* T5 port (cxl) interface */
 static driver_t cxl_driver = {
 	"cxl",
 	cxgbe_methods,
 	sizeof(struct port_info)
 };
 
 /* T5 VI (vcxl) interface */
 static driver_t vcxl_driver = {
 	"vcxl",
 	vcxgbe_methods,
 	sizeof(struct vi_info)
 };
 
 /* T6 bus driver interface */
 static int t6_probe(device_t);
 static device_method_t t6_methods[] = {
 	DEVMETHOD(device_probe,		t6_probe),
 	DEVMETHOD(device_attach,	t4_attach),
 	DEVMETHOD(device_detach,	t4_detach),
 
 	DEVMETHOD(bus_child_location_str, t4_child_location_str),
 
 	DEVMETHOD(t4_is_main_ready,	t4_ready),
 	DEVMETHOD(t4_read_port_device,	t4_read_port_device),
 
 	DEVMETHOD_END
 };
 static driver_t t6_driver = {
 	"t6nex",
 	t6_methods,
 	sizeof(struct adapter)
 };
 
 
 /* T6 port (cc) interface */
 static driver_t cc_driver = {
 	"cc",
 	cxgbe_methods,
 	sizeof(struct port_info)
 };
 
 /* T6 VI (vcc) interface */
 static driver_t vcc_driver = {
 	"vcc",
 	vcxgbe_methods,
 	sizeof(struct vi_info)
 };
 
 /* ifnet interface */
 static void cxgbe_init(void *);
 static int cxgbe_ioctl(struct ifnet *, unsigned long, caddr_t);
 static int cxgbe_transmit(struct ifnet *, struct mbuf *);
 static void cxgbe_qflush(struct ifnet *);
 
 MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4/T5 Ethernet driver and services");
 
 /*
  * Correct lock order when you need to acquire multiple locks is t4_list_lock,
  * then ADAPTER_LOCK, then t4_uld_list_lock.
  */
 static struct sx t4_list_lock;
 SLIST_HEAD(, adapter) t4_list;
 #ifdef TCP_OFFLOAD
 static struct sx t4_uld_list_lock;
 SLIST_HEAD(, uld_info) t4_uld_list;
 #endif
 
 /*
  * Tunables.  See tweak_tunables() too.
  *
  * Each tunable is set to a default value here if it's known at compile-time.
  * Otherwise it is set to -n as an indication to tweak_tunables() that it should
  * provide a reasonable default (upto n) when the driver is loaded.
  *
  * Tunables applicable to both T4 and T5 are under hw.cxgbe.  Those specific to
  * T5 are under hw.cxl.
  */
 SYSCTL_NODE(_hw, OID_AUTO, cxgbe, CTLFLAG_RD, 0, "cxgbe(4) parameters");
 SYSCTL_NODE(_hw, OID_AUTO, cxl, CTLFLAG_RD, 0, "cxgbe(4) T5+ parameters");
 SYSCTL_NODE(_hw_cxgbe, OID_AUTO, toe, CTLFLAG_RD, 0, "cxgbe(4) TOE parameters");
 
 /*
  * Number of queues for tx and rx, NIC and offload.
  */
 #define NTXQ 16
 int t4_ntxq = -NTXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, ntxq, CTLFLAG_RDTUN, &t4_ntxq, 0,
     "Number of TX queues per port");
 TUNABLE_INT("hw.cxgbe.ntxq10g", &t4_ntxq);	/* Old name, undocumented */
 
 #define NRXQ 8
 int t4_nrxq = -NRXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nrxq, CTLFLAG_RDTUN, &t4_nrxq, 0,
     "Number of RX queues per port");
 TUNABLE_INT("hw.cxgbe.nrxq10g", &t4_nrxq);	/* Old name, undocumented */
 
 #define NTXQ_VI 1
 static int t4_ntxq_vi = -NTXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, ntxq_vi, CTLFLAG_RDTUN, &t4_ntxq_vi, 0,
     "Number of TX queues per VI");
 
 #define NRXQ_VI 1
 static int t4_nrxq_vi = -NRXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nrxq_vi, CTLFLAG_RDTUN, &t4_nrxq_vi, 0,
     "Number of RX queues per VI");
 
 static int t4_rsrv_noflowq = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, rsrv_noflowq, CTLFLAG_RDTUN, &t4_rsrv_noflowq,
     0, "Reserve TX queue 0 of each VI for non-flowid packets");
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 #define NOFLDTXQ 8
 static int t4_nofldtxq = -NOFLDTXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq, CTLFLAG_RDTUN, &t4_nofldtxq, 0,
     "Number of offload TX queues per port");
 
 #define NOFLDRXQ 2
 static int t4_nofldrxq = -NOFLDRXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq, CTLFLAG_RDTUN, &t4_nofldrxq, 0,
     "Number of offload RX queues per port");
 
 #define NOFLDTXQ_VI 1
 static int t4_nofldtxq_vi = -NOFLDTXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq_vi, CTLFLAG_RDTUN, &t4_nofldtxq_vi, 0,
     "Number of offload TX queues per VI");
 
 #define NOFLDRXQ_VI 1
 static int t4_nofldrxq_vi = -NOFLDRXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq_vi, CTLFLAG_RDTUN, &t4_nofldrxq_vi, 0,
     "Number of offload RX queues per VI");
 
 #define TMR_IDX_OFLD 1
 int t4_tmr_idx_ofld = TMR_IDX_OFLD;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_timer_idx_ofld, CTLFLAG_RDTUN,
     &t4_tmr_idx_ofld, 0, "Holdoff timer index for offload queues");
 
 #define PKTC_IDX_OFLD (-1)
 int t4_pktc_idx_ofld = PKTC_IDX_OFLD;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_pktc_idx_ofld, CTLFLAG_RDTUN,
     &t4_pktc_idx_ofld, 0, "holdoff packet counter index for offload queues");
 
 /* 0 means chip/fw default, non-zero number is value in microseconds */
 static u_long t4_toe_keepalive_idle = 0;
 SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, keepalive_idle, CTLFLAG_RDTUN,
     &t4_toe_keepalive_idle, 0, "TOE keepalive idle timer (us)");
 
 /* 0 means chip/fw default, non-zero number is value in microseconds */
 static u_long t4_toe_keepalive_interval = 0;
 SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, keepalive_interval, CTLFLAG_RDTUN,
     &t4_toe_keepalive_interval, 0, "TOE keepalive interval timer (us)");
 
 /* 0 means chip/fw default, non-zero number is # of keepalives before abort */
 static int t4_toe_keepalive_count = 0;
 SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, keepalive_count, CTLFLAG_RDTUN,
     &t4_toe_keepalive_count, 0, "Number of TOE keepalive probes before abort");
 
 /* 0 means chip/fw default, non-zero number is value in microseconds */
 static u_long t4_toe_rexmt_min = 0;
 SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, rexmt_min, CTLFLAG_RDTUN,
     &t4_toe_rexmt_min, 0, "Minimum TOE retransmit interval (us)");
 
 /* 0 means chip/fw default, non-zero number is value in microseconds */
 static u_long t4_toe_rexmt_max = 0;
 SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, rexmt_max, CTLFLAG_RDTUN,
     &t4_toe_rexmt_max, 0, "Maximum TOE retransmit interval (us)");
 
 /* 0 means chip/fw default, non-zero number is # of rexmt before abort */
 static int t4_toe_rexmt_count = 0;
 SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, rexmt_count, CTLFLAG_RDTUN,
     &t4_toe_rexmt_count, 0, "Number of TOE retransmissions before abort");
 
 /* -1 means chip/fw default, other values are raw backoff values to use */
 static int t4_toe_rexmt_backoff[16] = {
 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
 };
 SYSCTL_NODE(_hw_cxgbe_toe, OID_AUTO, rexmt_backoff, CTLFLAG_RD, 0,
     "cxgbe(4) TOE retransmit backoff values");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 0, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[0], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 1, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[1], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 2, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[2], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 3, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[3], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 4, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[4], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 5, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[5], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 6, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[6], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 7, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[7], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 8, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[8], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 9, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[9], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 10, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[10], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 11, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[11], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 12, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[12], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 13, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[13], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 14, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[14], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 15, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[15], 0, "");
 #endif
 
 #ifdef DEV_NETMAP
 #define NNMTXQ_VI 2
 static int t4_nnmtxq_vi = -NNMTXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmtxq_vi, CTLFLAG_RDTUN, &t4_nnmtxq_vi, 0,
     "Number of netmap TX queues per VI");
 
 #define NNMRXQ_VI 2
 static int t4_nnmrxq_vi = -NNMRXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmrxq_vi, CTLFLAG_RDTUN, &t4_nnmrxq_vi, 0,
     "Number of netmap RX queues per VI");
 #endif
 
 /*
  * Holdoff parameters for ports.
  */
 #define TMR_IDX 1
 int t4_tmr_idx = TMR_IDX;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_timer_idx, CTLFLAG_RDTUN, &t4_tmr_idx,
     0, "Holdoff timer index");
 TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_10G", &t4_tmr_idx);	/* Old name */
 
 #define PKTC_IDX (-1)
 int t4_pktc_idx = PKTC_IDX;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_pktc_idx, CTLFLAG_RDTUN, &t4_pktc_idx,
     0, "Holdoff packet counter index");
 TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_10G", &t4_pktc_idx);	/* Old name */
 
 /*
  * Size (# of entries) of each tx and rx queue.
  */
 unsigned int t4_qsize_txq = TX_EQ_QSIZE;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, qsize_txq, CTLFLAG_RDTUN, &t4_qsize_txq, 0,
     "Number of descriptors in each TX queue");
 
 unsigned int t4_qsize_rxq = RX_IQ_QSIZE;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, qsize_rxq, CTLFLAG_RDTUN, &t4_qsize_rxq, 0,
     "Number of descriptors in each RX queue");
 
 /*
  * Interrupt types allowed (bits 0, 1, 2 = INTx, MSI, MSI-X respectively).
  */
 int t4_intr_types = INTR_MSIX | INTR_MSI | INTR_INTX;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, interrupt_types, CTLFLAG_RDTUN, &t4_intr_types,
     0, "Interrupt types allowed (bit 0 = INTx, 1 = MSI, 2 = MSI-X)");
 
 /*
  * Configuration file.  All the _CF names here are special.
  */
 #define DEFAULT_CF	"default"
 #define BUILTIN_CF	"built-in"
 #define FLASH_CF	"flash"
 #define UWIRE_CF	"uwire"
 #define FPGA_CF		"fpga"
 static char t4_cfg_file[32] = DEFAULT_CF;
 SYSCTL_STRING(_hw_cxgbe, OID_AUTO, config_file, CTLFLAG_RDTUN, t4_cfg_file,
     sizeof(t4_cfg_file), "Firmware configuration file");
 
 /*
  * PAUSE settings (bit 0, 1, 2 = rx_pause, tx_pause, pause_autoneg respectively).
  * rx_pause = 1 to heed incoming PAUSE frames, 0 to ignore them.
  * tx_pause = 1 to emit PAUSE frames when the rx FIFO reaches its high water
  *            mark or when signalled to do so, 0 to never emit PAUSE.
  * pause_autoneg = 1 means PAUSE will be negotiated if possible and the
  *                 negotiated settings will override rx_pause/tx_pause.
  *                 Otherwise rx_pause/tx_pause are applied forcibly.
  */
 static int t4_pause_settings = PAUSE_RX | PAUSE_TX | PAUSE_AUTONEG;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, pause_settings, CTLFLAG_RDTUN,
     &t4_pause_settings, 0,
     "PAUSE settings (bit 0 = rx_pause, 1 = tx_pause, 2 = pause_autoneg)");
 
 /*
  * Forward Error Correction settings (bit 0, 1 = RS, BASER respectively).
  * -1 to run with the firmware default.  Same as FEC_AUTO (bit 5)
  *  0 to disable FEC.
  */
 static int t4_fec = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fec, CTLFLAG_RDTUN, &t4_fec, 0,
     "Forward Error Correction (bit 0 = RS, bit 1 = BASER_RS)");
 
 /*
  * Link autonegotiation.
  * -1 to run with the firmware default.
  *  0 to disable.
  *  1 to enable.
  */
 static int t4_autoneg = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, autoneg, CTLFLAG_RDTUN, &t4_autoneg, 0,
     "Link autonegotiation");
 
 /*
  * Firmware auto-install by driver during attach (0, 1, 2 = prohibited, allowed,
  * encouraged respectively).  '-n' is the same as 'n' except the firmware
  * version used in the checks is read from the firmware bundled with the driver.
  */
 static int t4_fw_install = 1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fw_install, CTLFLAG_RDTUN, &t4_fw_install, 0,
     "Firmware auto-install (0 = prohibited, 1 = allowed, 2 = encouraged)");
 
 /*
  * ASIC features that will be used.  Disable the ones you don't want so that the
  * chip resources aren't wasted on features that will not be used.
  */
 static int t4_nbmcaps_allowed = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nbmcaps_allowed, CTLFLAG_RDTUN,
     &t4_nbmcaps_allowed, 0, "Default NBM capabilities");
 
 static int t4_linkcaps_allowed = 0;	/* No DCBX, PPP, etc. by default */
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, linkcaps_allowed, CTLFLAG_RDTUN,
     &t4_linkcaps_allowed, 0, "Default link capabilities");
 
 static int t4_switchcaps_allowed = FW_CAPS_CONFIG_SWITCH_INGRESS |
     FW_CAPS_CONFIG_SWITCH_EGRESS;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, switchcaps_allowed, CTLFLAG_RDTUN,
     &t4_switchcaps_allowed, 0, "Default switch capabilities");
 
 #ifdef RATELIMIT
 static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC |
 	FW_CAPS_CONFIG_NIC_HASHFILTER | FW_CAPS_CONFIG_NIC_ETHOFLD;
 #else
 static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC |
 	FW_CAPS_CONFIG_NIC_HASHFILTER;
 #endif
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, niccaps_allowed, CTLFLAG_RDTUN,
     &t4_niccaps_allowed, 0, "Default NIC capabilities");
 
 static int t4_toecaps_allowed = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, toecaps_allowed, CTLFLAG_RDTUN,
     &t4_toecaps_allowed, 0, "Default TCP offload capabilities");
 
 static int t4_rdmacaps_allowed = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, rdmacaps_allowed, CTLFLAG_RDTUN,
     &t4_rdmacaps_allowed, 0, "Default RDMA capabilities");
 
 static int t4_cryptocaps_allowed = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, cryptocaps_allowed, CTLFLAG_RDTUN,
     &t4_cryptocaps_allowed, 0, "Default crypto capabilities");
 
 static int t4_iscsicaps_allowed = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, iscsicaps_allowed, CTLFLAG_RDTUN,
     &t4_iscsicaps_allowed, 0, "Default iSCSI capabilities");
 
 static int t4_fcoecaps_allowed = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fcoecaps_allowed, CTLFLAG_RDTUN,
     &t4_fcoecaps_allowed, 0, "Default FCoE capabilities");
 
 static int t5_write_combine = 0;
 SYSCTL_INT(_hw_cxl, OID_AUTO, write_combine, CTLFLAG_RDTUN, &t5_write_combine,
     0, "Use WC instead of UC for BAR2");
 
 static int t4_num_vis = 1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, num_vis, CTLFLAG_RDTUN, &t4_num_vis, 0,
     "Number of VIs per port");
 
 /*
  * PCIe Relaxed Ordering.
  * -1: driver should figure out a good value.
  * 0: disable RO.
  * 1: enable RO.
  * 2: leave RO alone.
  */
 static int pcie_relaxed_ordering = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, pcie_relaxed_ordering, CTLFLAG_RDTUN,
     &pcie_relaxed_ordering, 0,
     "PCIe Relaxed Ordering: 0 = disable, 1 = enable, 2 = leave alone");
 
 static int t4_panic_on_fatal_err = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, panic_on_fatal_err, CTLFLAG_RDTUN,
     &t4_panic_on_fatal_err, 0, "panic on fatal errors");
 
 #ifdef TCP_OFFLOAD
 /*
  * TOE tunables.
  */
 static int t4_cop_managed_offloading = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, cop_managed_offloading, CTLFLAG_RDTUN,
     &t4_cop_managed_offloading, 0,
     "COP (Connection Offload Policy) controls all TOE offload");
 #endif
 
 /* Functions used by VIs to obtain unique MAC addresses for each VI. */
 static int vi_mac_funcs[] = {
 	FW_VI_FUNC_ETH,
 	FW_VI_FUNC_OFLD,
 	FW_VI_FUNC_IWARP,
 	FW_VI_FUNC_OPENISCSI,
 	FW_VI_FUNC_OPENFCOE,
 	FW_VI_FUNC_FOISCSI,
 	FW_VI_FUNC_FOFCOE,
 };
 
 struct intrs_and_queues {
 	uint16_t intr_type;	/* INTx, MSI, or MSI-X */
 	uint16_t num_vis;	/* number of VIs for each port */
 	uint16_t nirq;		/* Total # of vectors */
 	uint16_t ntxq;		/* # of NIC txq's for each port */
 	uint16_t nrxq;		/* # of NIC rxq's for each port */
 	uint16_t nofldtxq;	/* # of TOE/ETHOFLD txq's for each port */
 	uint16_t nofldrxq;	/* # of TOE rxq's for each port */
 
 	/* The vcxgbe/vcxl interfaces use these and not the ones above. */
 	uint16_t ntxq_vi;	/* # of NIC txq's */
 	uint16_t nrxq_vi;	/* # of NIC rxq's */
 	uint16_t nofldtxq_vi;	/* # of TOE txq's */
 	uint16_t nofldrxq_vi;	/* # of TOE rxq's */
 	uint16_t nnmtxq_vi;	/* # of netmap txq's */
 	uint16_t nnmrxq_vi;	/* # of netmap rxq's */
 };
 
 static void setup_memwin(struct adapter *);
 static void position_memwin(struct adapter *, int, uint32_t);
 static int validate_mem_range(struct adapter *, uint32_t, uint32_t);
 static int fwmtype_to_hwmtype(int);
 static int validate_mt_off_len(struct adapter *, int, uint32_t, uint32_t,
     uint32_t *);
 static int fixup_devlog_params(struct adapter *);
 static int cfg_itype_and_nqueues(struct adapter *, struct intrs_and_queues *);
 static int contact_firmware(struct adapter *);
 static int partition_resources(struct adapter *);
 static int get_params__pre_init(struct adapter *);
 static int set_params__pre_init(struct adapter *);
 static int get_params__post_init(struct adapter *);
 static int set_params__post_init(struct adapter *);
 static void t4_set_desc(struct adapter *);
 static bool fixed_ifmedia(struct port_info *);
 static void build_medialist(struct port_info *);
 static void init_link_config(struct port_info *);
 static int fixup_link_config(struct port_info *);
 static int apply_link_config(struct port_info *);
 static int cxgbe_init_synchronized(struct vi_info *);
 static int cxgbe_uninit_synchronized(struct vi_info *);
 static void quiesce_txq(struct adapter *, struct sge_txq *);
 static void quiesce_wrq(struct adapter *, struct sge_wrq *);
 static void quiesce_iq(struct adapter *, struct sge_iq *);
 static void quiesce_fl(struct adapter *, struct sge_fl *);
 static int t4_alloc_irq(struct adapter *, struct irq *, int rid,
     driver_intr_t *, void *, char *);
 static int t4_free_irq(struct adapter *, struct irq *);
 static void get_regs(struct adapter *, struct t4_regdump *, uint8_t *);
 static void vi_refresh_stats(struct adapter *, struct vi_info *);
 static void cxgbe_refresh_stats(struct adapter *, struct port_info *);
 static void cxgbe_tick(void *);
 static void cxgbe_sysctls(struct port_info *);
 static int sysctl_int_array(SYSCTL_HANDLER_ARGS);
 static int sysctl_bitfield_8b(SYSCTL_HANDLER_ARGS);
 static int sysctl_bitfield_16b(SYSCTL_HANDLER_ARGS);
 static int sysctl_btphy(SYSCTL_HANDLER_ARGS);
 static int sysctl_noflowq(SYSCTL_HANDLER_ARGS);
 static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS);
 static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS);
 static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS);
 static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS);
 static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS);
 static int sysctl_fec(SYSCTL_HANDLER_ARGS);
 static int sysctl_autoneg(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS);
 static int sysctl_temperature(SYSCTL_HANDLER_ARGS);
 static int sysctl_loadavg(SYSCTL_HANDLER_ARGS);
 static int sysctl_cctrl(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS);
 static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_devlog(SYSCTL_HANDLER_ARGS);
 static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS);
 static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS);
 static int sysctl_meminfo(SYSCTL_HANDLER_ARGS);
 static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS);
 static int sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS);
 static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS);
 static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_tids(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS);
 static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_cpus(SYSCTL_HANDLER_ARGS);
 #ifdef TCP_OFFLOAD
 static int sysctl_tls_rx_ports(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_tick(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_timer(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_shift_cnt(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_backoff(SYSCTL_HANDLER_ARGS);
 static int sysctl_holdoff_tmr_idx_ofld(SYSCTL_HANDLER_ARGS);
 static int sysctl_holdoff_pktc_idx_ofld(SYSCTL_HANDLER_ARGS);
 #endif
 static int get_sge_context(struct adapter *, struct t4_sge_context *);
 static int load_fw(struct adapter *, struct t4_data *);
 static int load_cfg(struct adapter *, struct t4_data *);
 static int load_boot(struct adapter *, struct t4_bootrom *);
 static int load_bootcfg(struct adapter *, struct t4_data *);
 static int cudbg_dump(struct adapter *, struct t4_cudbg_dump *);
 static void free_offload_policy(struct t4_offload_policy *);
 static int set_offload_policy(struct adapter *, struct t4_offload_policy *);
 static int read_card_mem(struct adapter *, int, struct t4_mem_range *);
 static int read_i2c(struct adapter *, struct t4_i2c_data *);
 #ifdef TCP_OFFLOAD
 static int toe_capability(struct vi_info *, int);
 #endif
 static int mod_event(module_t, int, void *);
 static int notify_siblings(device_t, int);
 
 struct {
 	uint16_t device;
 	char *desc;
 } t4_pciids[] = {
 	{0xa000, "Chelsio Terminator 4 FPGA"},
 	{0x4400, "Chelsio T440-dbg"},
 	{0x4401, "Chelsio T420-CR"},
 	{0x4402, "Chelsio T422-CR"},
 	{0x4403, "Chelsio T440-CR"},
 	{0x4404, "Chelsio T420-BCH"},
 	{0x4405, "Chelsio T440-BCH"},
 	{0x4406, "Chelsio T440-CH"},
 	{0x4407, "Chelsio T420-SO"},
 	{0x4408, "Chelsio T420-CX"},
 	{0x4409, "Chelsio T420-BT"},
 	{0x440a, "Chelsio T404-BT"},
 	{0x440e, "Chelsio T440-LP-CR"},
 }, t5_pciids[] = {
 	{0xb000, "Chelsio Terminator 5 FPGA"},
 	{0x5400, "Chelsio T580-dbg"},
 	{0x5401,  "Chelsio T520-CR"},		/* 2 x 10G */
 	{0x5402,  "Chelsio T522-CR"},		/* 2 x 10G, 2 X 1G */
 	{0x5403,  "Chelsio T540-CR"},		/* 4 x 10G */
 	{0x5407,  "Chelsio T520-SO"},		/* 2 x 10G, nomem */
 	{0x5409,  "Chelsio T520-BT"},		/* 2 x 10GBaseT */
 	{0x540a,  "Chelsio T504-BT"},		/* 4 x 1G */
 	{0x540d,  "Chelsio T580-CR"},		/* 2 x 40G */
 	{0x540e,  "Chelsio T540-LP-CR"},	/* 4 x 10G */
 	{0x5410,  "Chelsio T580-LP-CR"},	/* 2 x 40G */
 	{0x5411,  "Chelsio T520-LL-CR"},	/* 2 x 10G */
 	{0x5412,  "Chelsio T560-CR"},		/* 1 x 40G, 2 x 10G */
 	{0x5414,  "Chelsio T580-LP-SO-CR"},	/* 2 x 40G, nomem */
 	{0x5415,  "Chelsio T502-BT"},		/* 2 x 1G */
 	{0x5418,  "Chelsio T540-BT"},		/* 4 x 10GBaseT */
 	{0x5419,  "Chelsio T540-LP-BT"},	/* 4 x 10GBaseT */
 	{0x541a,  "Chelsio T540-SO-BT"},	/* 4 x 10GBaseT, nomem */
 	{0x541b,  "Chelsio T540-SO-CR"},	/* 4 x 10G, nomem */
 
 	/* Custom */
 	{0x5483, "Custom T540-CR"},
 	{0x5484, "Custom T540-BT"},
 }, t6_pciids[] = {
 	{0xc006, "Chelsio Terminator 6 FPGA"},	/* T6 PE10K6 FPGA (PF0) */
 	{0x6400, "Chelsio T6-DBG-25"},		/* 2 x 10/25G, debug */
 	{0x6401, "Chelsio T6225-CR"},		/* 2 x 10/25G */
 	{0x6402, "Chelsio T6225-SO-CR"},	/* 2 x 10/25G, nomem */
 	{0x6403, "Chelsio T6425-CR"},		/* 4 x 10/25G */
 	{0x6404, "Chelsio T6425-SO-CR"},	/* 4 x 10/25G, nomem */
 	{0x6405, "Chelsio T6225-OCP-SO"},	/* 2 x 10/25G, nomem */
 	{0x6406, "Chelsio T62100-OCP-SO"},	/* 2 x 40/50/100G, nomem */
 	{0x6407, "Chelsio T62100-LP-CR"},	/* 2 x 40/50/100G */
 	{0x6408, "Chelsio T62100-SO-CR"},	/* 2 x 40/50/100G, nomem */
 	{0x6409, "Chelsio T6210-BT"},		/* 2 x 10GBASE-T */
 	{0x640d, "Chelsio T62100-CR"},		/* 2 x 40/50/100G */
 	{0x6410, "Chelsio T6-DBG-100"},		/* 2 x 40/50/100G, debug */
 	{0x6411, "Chelsio T6225-LL-CR"},	/* 2 x 10/25G */
 	{0x6414, "Chelsio T61100-OCP-SO"},	/* 1 x 40/50/100G, nomem */
 	{0x6415, "Chelsio T6201-BT"},		/* 2 x 1000BASE-T */
 
 	/* Custom */
 	{0x6480, "Custom T6225-CR"},
 	{0x6481, "Custom T62100-CR"},
 	{0x6482, "Custom T6225-CR"},
 	{0x6483, "Custom T62100-CR"},
 	{0x6484, "Custom T64100-CR"},
 	{0x6485, "Custom T6240-SO"},
 	{0x6486, "Custom T6225-SO-CR"},
 	{0x6487, "Custom T6225-CR"},
 };
 
 #ifdef TCP_OFFLOAD
 /*
  * service_iq_fl() has an iq and needs the fl.  Offset of fl from the iq should
  * be exactly the same for both rxq and ofld_rxq.
  */
 CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq));
 CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl));
 #endif
 CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE);
 
 static int
 t4_probe(device_t dev)
 {
 	int i;
 	uint16_t v = pci_get_vendor(dev);
 	uint16_t d = pci_get_device(dev);
 	uint8_t f = pci_get_function(dev);
 
 	if (v != PCI_VENDOR_ID_CHELSIO)
 		return (ENXIO);
 
 	/* Attach only to PF0 of the FPGA */
 	if (d == 0xa000 && f != 0)
 		return (ENXIO);
 
 	for (i = 0; i < nitems(t4_pciids); i++) {
 		if (d == t4_pciids[i].device) {
 			device_set_desc(dev, t4_pciids[i].desc);
 			return (BUS_PROBE_DEFAULT);
 		}
 	}
 
 	return (ENXIO);
 }
 
 static int
 t5_probe(device_t dev)
 {
 	int i;
 	uint16_t v = pci_get_vendor(dev);
 	uint16_t d = pci_get_device(dev);
 	uint8_t f = pci_get_function(dev);
 
 	if (v != PCI_VENDOR_ID_CHELSIO)
 		return (ENXIO);
 
 	/* Attach only to PF0 of the FPGA */
 	if (d == 0xb000 && f != 0)
 		return (ENXIO);
 
 	for (i = 0; i < nitems(t5_pciids); i++) {
 		if (d == t5_pciids[i].device) {
 			device_set_desc(dev, t5_pciids[i].desc);
 			return (BUS_PROBE_DEFAULT);
 		}
 	}
 
 	return (ENXIO);
 }
 
 static int
 t6_probe(device_t dev)
 {
 	int i;
 	uint16_t v = pci_get_vendor(dev);
 	uint16_t d = pci_get_device(dev);
 
 	if (v != PCI_VENDOR_ID_CHELSIO)
 		return (ENXIO);
 
 	for (i = 0; i < nitems(t6_pciids); i++) {
 		if (d == t6_pciids[i].device) {
 			device_set_desc(dev, t6_pciids[i].desc);
 			return (BUS_PROBE_DEFAULT);
 		}
 	}
 
 	return (ENXIO);
 }
 
 static void
 t5_attribute_workaround(device_t dev)
 {
 	device_t root_port;
 	uint32_t v;
 
 	/*
 	 * The T5 chips do not properly echo the No Snoop and Relaxed
 	 * Ordering attributes when replying to a TLP from a Root
 	 * Port.  As a workaround, find the parent Root Port and
 	 * disable No Snoop and Relaxed Ordering.  Note that this
 	 * affects all devices under this root port.
 	 */
 	root_port = pci_find_pcie_root_port(dev);
 	if (root_port == NULL) {
 		device_printf(dev, "Unable to find parent root port\n");
 		return;
 	}
 
 	v = pcie_adjust_config(root_port, PCIER_DEVICE_CTL,
 	    PCIEM_CTL_RELAXED_ORD_ENABLE | PCIEM_CTL_NOSNOOP_ENABLE, 0, 2);
 	if ((v & (PCIEM_CTL_RELAXED_ORD_ENABLE | PCIEM_CTL_NOSNOOP_ENABLE)) !=
 	    0)
 		device_printf(dev, "Disabled No Snoop/Relaxed Ordering on %s\n",
 		    device_get_nameunit(root_port));
 }
 
 static const struct devnames devnames[] = {
 	{
 		.nexus_name = "t4nex",
 		.ifnet_name = "cxgbe",
 		.vi_ifnet_name = "vcxgbe",
 		.pf03_drv_name = "t4iov",
 		.vf_nexus_name = "t4vf",
 		.vf_ifnet_name = "cxgbev"
 	}, {
 		.nexus_name = "t5nex",
 		.ifnet_name = "cxl",
 		.vi_ifnet_name = "vcxl",
 		.pf03_drv_name = "t5iov",
 		.vf_nexus_name = "t5vf",
 		.vf_ifnet_name = "cxlv"
 	}, {
 		.nexus_name = "t6nex",
 		.ifnet_name = "cc",
 		.vi_ifnet_name = "vcc",
 		.pf03_drv_name = "t6iov",
 		.vf_nexus_name = "t6vf",
 		.vf_ifnet_name = "ccv"
 	}
 };
 
 void
 t4_init_devnames(struct adapter *sc)
 {
 	int id;
 
 	id = chip_id(sc);
 	if (id >= CHELSIO_T4 && id - CHELSIO_T4 < nitems(devnames))
 		sc->names = &devnames[id - CHELSIO_T4];
 	else {
 		device_printf(sc->dev, "chip id %d is not supported.\n", id);
 		sc->names = NULL;
 	}
 }
 
 static int
 t4_ifnet_unit(struct adapter *sc, struct port_info *pi)
 {
 	const char *parent, *name;
 	long value;
 	int line, unit;
 
 	line = 0;
 	parent = device_get_nameunit(sc->dev);
 	name = sc->names->ifnet_name;
 	while (resource_find_dev(&line, name, &unit, "at", parent) == 0) {
 		if (resource_long_value(name, unit, "port", &value) == 0 &&
 		    value == pi->port_id)
 			return (unit);
 	}
 	return (-1);
 }
 
 static int
 t4_attach(device_t dev)
 {
 	struct adapter *sc;
 	int rc = 0, i, j, rqidx, tqidx, nports;
 	struct make_dev_args mda;
 	struct intrs_and_queues iaq;
 	struct sge *s;
 	uint32_t *buf;
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	int ofld_tqidx;
 #endif
 #ifdef TCP_OFFLOAD
 	int ofld_rqidx;
 #endif
 #ifdef DEV_NETMAP
 	int nm_rqidx, nm_tqidx;
 #endif
 	int num_vis;
 
 	sc = device_get_softc(dev);
 	sc->dev = dev;
 	TUNABLE_INT_FETCH("hw.cxgbe.dflags", &sc->debug_flags);
 
 	if ((pci_get_device(dev) & 0xff00) == 0x5400)
 		t5_attribute_workaround(dev);
 	pci_enable_busmaster(dev);
 	if (pci_find_cap(dev, PCIY_EXPRESS, &i) == 0) {
 		uint32_t v;
 
 		pci_set_max_read_req(dev, 4096);
 		v = pci_read_config(dev, i + PCIER_DEVICE_CTL, 2);
 		sc->params.pci.mps = 128 << ((v & PCIEM_CTL_MAX_PAYLOAD) >> 5);
 		if (pcie_relaxed_ordering == 0 &&
 		    (v & PCIEM_CTL_RELAXED_ORD_ENABLE) != 0) {
 			v &= ~PCIEM_CTL_RELAXED_ORD_ENABLE;
 			pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2);
 		} else if (pcie_relaxed_ordering == 1 &&
 		    (v & PCIEM_CTL_RELAXED_ORD_ENABLE) == 0) {
 			v |= PCIEM_CTL_RELAXED_ORD_ENABLE;
 			pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2);
 		}
 	}
 
 	sc->sge_gts_reg = MYPF_REG(A_SGE_PF_GTS);
 	sc->sge_kdoorbell_reg = MYPF_REG(A_SGE_PF_KDOORBELL);
 	sc->traceq = -1;
 	mtx_init(&sc->ifp_lock, sc->ifp_lockname, 0, MTX_DEF);
 	snprintf(sc->ifp_lockname, sizeof(sc->ifp_lockname), "%s tracer",
 	    device_get_nameunit(dev));
 
 	snprintf(sc->lockname, sizeof(sc->lockname), "%s",
 	    device_get_nameunit(dev));
 	mtx_init(&sc->sc_lock, sc->lockname, 0, MTX_DEF);
 	t4_add_adapter(sc);
 
 	mtx_init(&sc->sfl_lock, "starving freelists", 0, MTX_DEF);
 	TAILQ_INIT(&sc->sfl);
 	callout_init_mtx(&sc->sfl_callout, &sc->sfl_lock, 0);
 
 	mtx_init(&sc->reg_lock, "indirect register access", 0, MTX_DEF);
 
 	sc->policy = NULL;
 	rw_init(&sc->policy_lock, "connection offload policy");
 
 	rc = t4_map_bars_0_and_4(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	memset(sc->chan_map, 0xff, sizeof(sc->chan_map));
 
 	/* Prepare the adapter for operation. */
 	buf = malloc(PAGE_SIZE, M_CXGBE, M_ZERO | M_WAITOK);
 	rc = -t4_prep_adapter(sc, buf);
 	free(buf, M_CXGBE);
 	if (rc != 0) {
 		device_printf(dev, "failed to prepare adapter: %d.\n", rc);
 		goto done;
 	}
 
 	/*
 	 * This is the real PF# to which we're attaching.  Works from within PCI
 	 * passthrough environments too, where pci_get_function() could return a
 	 * different PF# depending on the passthrough configuration.  We need to
 	 * use the real PF# in all our communication with the firmware.
 	 */
 	j = t4_read_reg(sc, A_PL_WHOAMI);
 	sc->pf = chip_id(sc) <= CHELSIO_T5 ? G_SOURCEPF(j) : G_T6_SOURCEPF(j);
 	sc->mbox = sc->pf;
 
 	t4_init_devnames(sc);
 	if (sc->names == NULL) {
 		rc = ENOTSUP;
 		goto done; /* error message displayed already */
 	}
 
 	/*
 	 * Do this really early, with the memory windows set up even before the
 	 * character device.  The userland tool's register i/o and mem read
 	 * will work even in "recovery mode".
 	 */
 	setup_memwin(sc);
 	if (t4_init_devlog_params(sc, 0) == 0)
 		fixup_devlog_params(sc);
 	make_dev_args_init(&mda);
 	mda.mda_devsw = &t4_cdevsw;
 	mda.mda_uid = UID_ROOT;
 	mda.mda_gid = GID_WHEEL;
 	mda.mda_mode = 0600;
 	mda.mda_si_drv1 = sc;
 	rc = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
 	if (rc != 0)
 		device_printf(dev, "failed to create nexus char device: %d.\n",
 		    rc);
 
 	/* Go no further if recovery mode has been requested. */
 	if (TUNABLE_INT_FETCH("hw.cxgbe.sos", &i) && i != 0) {
 		device_printf(dev, "recovery mode.\n");
 		goto done;
 	}
 
 #if defined(__i386__)
 	if ((cpu_feature & CPUID_CX8) == 0) {
 		device_printf(dev, "64 bit atomics not available.\n");
 		rc = ENOTSUP;
 		goto done;
 	}
 #endif
 
 	/* Contact the firmware and try to become the master driver. */
 	rc = contact_firmware(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 	MPASS(sc->flags & FW_OK);
 
 	rc = get_params__pre_init(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	if (sc->flags & MASTER_PF) {
 		rc = partition_resources(sc);
 		if (rc != 0)
 			goto done; /* error message displayed already */
 		t4_intr_clear(sc);
 	}
 
 	rc = get_params__post_init(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	rc = set_params__post_init(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	rc = t4_map_bar_2(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	rc = t4_create_dma_tag(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	/*
 	 * First pass over all the ports - allocate VIs and initialize some
 	 * basic parameters like mac address, port type, etc.
 	 */
 	for_each_port(sc, i) {
 		struct port_info *pi;
 
 		pi = malloc(sizeof(*pi), M_CXGBE, M_ZERO | M_WAITOK);
 		sc->port[i] = pi;
 
 		/* These must be set before t4_port_init */
 		pi->adapter = sc;
 		pi->port_id = i;
 		/*
 		 * XXX: vi[0] is special so we can't delay this allocation until
 		 * pi->nvi's final value is known.
 		 */
 		pi->vi = malloc(sizeof(struct vi_info) * t4_num_vis, M_CXGBE,
 		    M_ZERO | M_WAITOK);
 
 		/*
 		 * Allocate the "main" VI and initialize parameters
 		 * like mac addr.
 		 */
 		rc = -t4_port_init(sc, sc->mbox, sc->pf, 0, i);
 		if (rc != 0) {
 			device_printf(dev, "unable to initialize port %d: %d\n",
 			    i, rc);
 			free(pi->vi, M_CXGBE);
 			free(pi, M_CXGBE);
 			sc->port[i] = NULL;
 			goto done;
 		}
 
 		snprintf(pi->lockname, sizeof(pi->lockname), "%sp%d",
 		    device_get_nameunit(dev), i);
 		mtx_init(&pi->pi_lock, pi->lockname, 0, MTX_DEF);
 		sc->chan_map[pi->tx_chan] = i;
 
 		/* All VIs on this port share this media. */
 		ifmedia_init(&pi->media, IFM_IMASK, cxgbe_media_change,
 		    cxgbe_media_status);
 
 		PORT_LOCK(pi);
 		init_link_config(pi);
 		fixup_link_config(pi);
 		build_medialist(pi);
 		if (fixed_ifmedia(pi))
 			pi->flags |= FIXED_IFMEDIA;
 		PORT_UNLOCK(pi);
 
 		pi->dev = device_add_child(dev, sc->names->ifnet_name,
 		    t4_ifnet_unit(sc, pi));
 		if (pi->dev == NULL) {
 			device_printf(dev,
 			    "failed to add device for port %d.\n", i);
 			rc = ENXIO;
 			goto done;
 		}
 		pi->vi[0].dev = pi->dev;
 		device_set_softc(pi->dev, pi);
 	}
 
 	/*
 	 * Interrupt type, # of interrupts, # of rx/tx queues, etc.
 	 */
 	nports = sc->params.nports;
 	rc = cfg_itype_and_nqueues(sc, &iaq);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	num_vis = iaq.num_vis;
 	sc->intr_type = iaq.intr_type;
 	sc->intr_count = iaq.nirq;
 
 	s = &sc->sge;
 	s->nrxq = nports * iaq.nrxq;
 	s->ntxq = nports * iaq.ntxq;
 	if (num_vis > 1) {
 		s->nrxq += nports * (num_vis - 1) * iaq.nrxq_vi;
 		s->ntxq += nports * (num_vis - 1) * iaq.ntxq_vi;
 	}
 	s->neq = s->ntxq + s->nrxq;	/* the free list in an rxq is an eq */
 	s->neq += nports;		/* ctrl queues: 1 per port */
 	s->niq = s->nrxq + 1;		/* 1 extra for firmware event queue */
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	if (is_offload(sc) || is_ethoffload(sc)) {
 		s->nofldtxq = nports * iaq.nofldtxq;
 		if (num_vis > 1)
 			s->nofldtxq += nports * (num_vis - 1) * iaq.nofldtxq_vi;
 		s->neq += s->nofldtxq;
 
 		s->ofld_txq = malloc(s->nofldtxq * sizeof(struct sge_wrq),
 		    M_CXGBE, M_ZERO | M_WAITOK);
 	}
 #endif
 #ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 		s->nofldrxq = nports * iaq.nofldrxq;
 		if (num_vis > 1)
 			s->nofldrxq += nports * (num_vis - 1) * iaq.nofldrxq_vi;
 		s->neq += s->nofldrxq;	/* free list */
 		s->niq += s->nofldrxq;
 
 		s->ofld_rxq = malloc(s->nofldrxq * sizeof(struct sge_ofld_rxq),
 		    M_CXGBE, M_ZERO | M_WAITOK);
 	}
 #endif
 #ifdef DEV_NETMAP
 	if (num_vis > 1) {
 		s->nnmrxq = nports * (num_vis - 1) * iaq.nnmrxq_vi;
 		s->nnmtxq = nports * (num_vis - 1) * iaq.nnmtxq_vi;
 	}
 	s->neq += s->nnmtxq + s->nnmrxq;
 	s->niq += s->nnmrxq;
 
 	s->nm_rxq = malloc(s->nnmrxq * sizeof(struct sge_nm_rxq),
 	    M_CXGBE, M_ZERO | M_WAITOK);
 	s->nm_txq = malloc(s->nnmtxq * sizeof(struct sge_nm_txq),
 	    M_CXGBE, M_ZERO | M_WAITOK);
 #endif
 
 	s->ctrlq = malloc(nports * sizeof(struct sge_wrq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->rxq = malloc(s->nrxq * sizeof(struct sge_rxq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->txq = malloc(s->ntxq * sizeof(struct sge_txq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->iqmap = malloc(s->niq * sizeof(struct sge_iq *), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->eqmap = malloc(s->neq * sizeof(struct sge_eq *), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	sc->irq = malloc(sc->intr_count * sizeof(struct irq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	t4_init_l2t(sc, M_WAITOK);
 	t4_init_smt(sc, M_WAITOK);
 	t4_init_tx_sched(sc);
 #ifdef RATELIMIT
 	t4_init_etid_table(sc);
 #endif
 #ifdef INET6
 	t4_init_clip_table(sc);
 #endif
 	if (sc->vres.key.size != 0)
 		sc->key_map = vmem_create("T4TLS key map", sc->vres.key.start,
 		    sc->vres.key.size, 32, 0, M_FIRSTFIT | M_WAITOK);
 
 	/*
 	 * Second pass over the ports.  This time we know the number of rx and
 	 * tx queues that each port should get.
 	 */
 	rqidx = tqidx = 0;
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	ofld_tqidx = 0;
 #endif
 #ifdef TCP_OFFLOAD
 	ofld_rqidx = 0;
 #endif
 #ifdef DEV_NETMAP
 	nm_rqidx = nm_tqidx = 0;
 #endif
 	for_each_port(sc, i) {
 		struct port_info *pi = sc->port[i];
 		struct vi_info *vi;
 
 		if (pi == NULL)
 			continue;
 
 		pi->nvi = num_vis;
 		for_each_vi(pi, j, vi) {
 			vi->pi = pi;
 			vi->qsize_rxq = t4_qsize_rxq;
 			vi->qsize_txq = t4_qsize_txq;
 
 			vi->first_rxq = rqidx;
 			vi->first_txq = tqidx;
 			vi->tmr_idx = t4_tmr_idx;
 			vi->pktc_idx = t4_pktc_idx;
 			vi->nrxq = j == 0 ? iaq.nrxq : iaq.nrxq_vi;
 			vi->ntxq = j == 0 ? iaq.ntxq : iaq.ntxq_vi;
 
 			rqidx += vi->nrxq;
 			tqidx += vi->ntxq;
 
 			if (j == 0 && vi->ntxq > 1)
 				vi->rsrv_noflowq = t4_rsrv_noflowq ? 1 : 0;
 			else
 				vi->rsrv_noflowq = 0;
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 			vi->first_ofld_txq = ofld_tqidx;
 			vi->nofldtxq = j == 0 ? iaq.nofldtxq : iaq.nofldtxq_vi;
 			ofld_tqidx += vi->nofldtxq;
 #endif
 #ifdef TCP_OFFLOAD
 			vi->ofld_tmr_idx = t4_tmr_idx_ofld;
 			vi->ofld_pktc_idx = t4_pktc_idx_ofld;
 			vi->first_ofld_rxq = ofld_rqidx;
 			vi->nofldrxq = j == 0 ? iaq.nofldrxq : iaq.nofldrxq_vi;
 
 			ofld_rqidx += vi->nofldrxq;
 #endif
 #ifdef DEV_NETMAP
 			if (j > 0) {
 				vi->first_nm_rxq = nm_rqidx;
 				vi->first_nm_txq = nm_tqidx;
 				vi->nnmrxq = iaq.nnmrxq_vi;
 				vi->nnmtxq = iaq.nnmtxq_vi;
 				nm_rqidx += vi->nnmrxq;
 				nm_tqidx += vi->nnmtxq;
 			}
 #endif
 		}
 	}
 
 	rc = t4_setup_intr_handlers(sc);
 	if (rc != 0) {
 		device_printf(dev,
 		    "failed to setup interrupt handlers: %d\n", rc);
 		goto done;
 	}
 
 	rc = bus_generic_probe(dev);
 	if (rc != 0) {
 		device_printf(dev, "failed to probe child drivers: %d\n", rc);
 		goto done;
 	}
 
 	/*
 	 * Ensure thread-safe mailbox access (in debug builds).
 	 *
 	 * So far this was the only thread accessing the mailbox but various
 	 * ifnets and sysctls are about to be created and their handlers/ioctls
 	 * will access the mailbox from different threads.
 	 */
 	sc->flags |= CHK_MBOX_ACCESS;
 
 	rc = bus_generic_attach(dev);
 	if (rc != 0) {
 		device_printf(dev,
 		    "failed to attach all child ports: %d\n", rc);
 		goto done;
 	}
 
 	device_printf(dev,
 	    "PCIe gen%d x%d, %d ports, %d %s interrupt%s, %d eq, %d iq\n",
 	    sc->params.pci.speed, sc->params.pci.width, sc->params.nports,
 	    sc->intr_count, sc->intr_type == INTR_MSIX ? "MSI-X" :
 	    (sc->intr_type == INTR_MSI ? "MSI" : "INTx"),
 	    sc->intr_count > 1 ? "s" : "", sc->sge.neq, sc->sge.niq);
 
 	t4_set_desc(sc);
 
 	notify_siblings(dev, 0);
 
 done:
 	if (rc != 0 && sc->cdev) {
 		/* cdev was created and so cxgbetool works; recover that way. */
 		device_printf(dev,
 		    "error during attach, adapter is now in recovery mode.\n");
 		rc = 0;
 	}
 
 	if (rc != 0)
 		t4_detach_common(dev);
 	else
 		t4_sysctls(sc);
 
 	return (rc);
 }
 
 static int
 t4_child_location_str(device_t bus, device_t dev, char *buf, size_t buflen)
 {
 	struct adapter *sc;
 	struct port_info *pi;
 	int i;
 
 	sc = device_get_softc(bus);
 	buf[0] = '\0';
 	for_each_port(sc, i) {
 		pi = sc->port[i];
 		if (pi != NULL && pi->dev == dev) {
 			snprintf(buf, buflen, "port=%d", pi->port_id);
 			break;
 		}
 	}
 	return (0);
 }
 
 static int
 t4_ready(device_t dev)
 {
 	struct adapter *sc;
 
 	sc = device_get_softc(dev);
 	if (sc->flags & FW_OK)
 		return (0);
 	return (ENXIO);
 }
 
 static int
 t4_read_port_device(device_t dev, int port, device_t *child)
 {
 	struct adapter *sc;
 	struct port_info *pi;
 
 	sc = device_get_softc(dev);
 	if (port < 0 || port >= MAX_NPORTS)
 		return (EINVAL);
 	pi = sc->port[port];
 	if (pi == NULL || pi->dev == NULL)
 		return (ENXIO);
 	*child = pi->dev;
 	return (0);
 }
 
 static int
 notify_siblings(device_t dev, int detaching)
 {
 	device_t sibling;
 	int error, i;
 
 	error = 0;
 	for (i = 0; i < PCI_FUNCMAX; i++) {
 		if (i == pci_get_function(dev))
 			continue;
 		sibling = pci_find_dbsf(pci_get_domain(dev), pci_get_bus(dev),
 		    pci_get_slot(dev), i);
 		if (sibling == NULL || !device_is_attached(sibling))
 			continue;
 		if (detaching)
 			error = T4_DETACH_CHILD(sibling);
 		else
 			(void)T4_ATTACH_CHILD(sibling);
 		if (error)
 			break;
 	}
 	return (error);
 }
 
 /*
  * Idempotent
  */
 static int
 t4_detach(device_t dev)
 {
 	struct adapter *sc;
 	int rc;
 
 	sc = device_get_softc(dev);
 
 	rc = notify_siblings(dev, 1);
 	if (rc) {
 		device_printf(dev,
 		    "failed to detach sibling devices: %d\n", rc);
 		return (rc);
 	}
 
 	return (t4_detach_common(dev));
 }
 
 int
 t4_detach_common(device_t dev)
 {
 	struct adapter *sc;
 	struct port_info *pi;
 	int i, rc;
 
 	sc = device_get_softc(dev);
 
 	if (sc->cdev) {
 		destroy_dev(sc->cdev);
 		sc->cdev = NULL;
 	}
 
 	sc->flags &= ~CHK_MBOX_ACCESS;
 	if (sc->flags & FULL_INIT_DONE) {
 		if (!(sc->flags & IS_VF))
 			t4_intr_disable(sc);
 	}
 
 	if (device_is_attached(dev)) {
 		rc = bus_generic_detach(dev);
 		if (rc) {
 			device_printf(dev,
 			    "failed to detach child devices: %d\n", rc);
 			return (rc);
 		}
 	}
 
 	for (i = 0; i < sc->intr_count; i++)
 		t4_free_irq(sc, &sc->irq[i]);
 
 	if ((sc->flags & (IS_VF | FW_OK)) == FW_OK)
 		t4_free_tx_sched(sc);
 
 	for (i = 0; i < MAX_NPORTS; i++) {
 		pi = sc->port[i];
 		if (pi) {
 			t4_free_vi(sc, sc->mbox, sc->pf, 0, pi->vi[0].viid);
 			if (pi->dev)
 				device_delete_child(dev, pi->dev);
 
 			mtx_destroy(&pi->pi_lock);
 			free(pi->vi, M_CXGBE);
 			free(pi, M_CXGBE);
 		}
 	}
 
 	device_delete_children(dev);
 
 	if (sc->flags & FULL_INIT_DONE)
 		adapter_full_uninit(sc);
 
 	if ((sc->flags & (IS_VF | FW_OK)) == FW_OK)
 		t4_fw_bye(sc, sc->mbox);
 
 	if (sc->intr_type == INTR_MSI || sc->intr_type == INTR_MSIX)
 		pci_release_msi(dev);
 
 	if (sc->regs_res)
 		bus_release_resource(dev, SYS_RES_MEMORY, sc->regs_rid,
 		    sc->regs_res);
 
 	if (sc->udbs_res)
 		bus_release_resource(dev, SYS_RES_MEMORY, sc->udbs_rid,
 		    sc->udbs_res);
 
 	if (sc->msix_res)
 		bus_release_resource(dev, SYS_RES_MEMORY, sc->msix_rid,
 		    sc->msix_res);
 
 	if (sc->l2t)
 		t4_free_l2t(sc->l2t);
 	if (sc->smt)
 		t4_free_smt(sc->smt);
 #ifdef RATELIMIT
 	t4_free_etid_table(sc);
 #endif
 	if (sc->key_map)
 		vmem_destroy(sc->key_map);
 #ifdef INET6
 	t4_destroy_clip_table(sc);
 #endif
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	free(sc->sge.ofld_txq, M_CXGBE);
 #endif
 #ifdef TCP_OFFLOAD
 	free(sc->sge.ofld_rxq, M_CXGBE);
 #endif
 #ifdef DEV_NETMAP
 	free(sc->sge.nm_rxq, M_CXGBE);
 	free(sc->sge.nm_txq, M_CXGBE);
 #endif
 	free(sc->irq, M_CXGBE);
 	free(sc->sge.rxq, M_CXGBE);
 	free(sc->sge.txq, M_CXGBE);
 	free(sc->sge.ctrlq, M_CXGBE);
 	free(sc->sge.iqmap, M_CXGBE);
 	free(sc->sge.eqmap, M_CXGBE);
 	free(sc->tids.ftid_tab, M_CXGBE);
 	free(sc->tids.hpftid_tab, M_CXGBE);
 	free_hftid_hash(&sc->tids);
 	free(sc->tids.atid_tab, M_CXGBE);
 	free(sc->tids.tid_tab, M_CXGBE);
 	free(sc->tt.tls_rx_ports, M_CXGBE);
 	t4_destroy_dma_tag(sc);
 	if (mtx_initialized(&sc->sc_lock)) {
 		sx_xlock(&t4_list_lock);
 		SLIST_REMOVE(&t4_list, sc, adapter, link);
 		sx_xunlock(&t4_list_lock);
 		mtx_destroy(&sc->sc_lock);
 	}
 
 	callout_drain(&sc->sfl_callout);
 	if (mtx_initialized(&sc->tids.ftid_lock)) {
 		mtx_destroy(&sc->tids.ftid_lock);
 		cv_destroy(&sc->tids.ftid_cv);
 	}
 	if (mtx_initialized(&sc->tids.atid_lock))
 		mtx_destroy(&sc->tids.atid_lock);
 	if (mtx_initialized(&sc->sfl_lock))
 		mtx_destroy(&sc->sfl_lock);
 	if (mtx_initialized(&sc->ifp_lock))
 		mtx_destroy(&sc->ifp_lock);
 	if (mtx_initialized(&sc->reg_lock))
 		mtx_destroy(&sc->reg_lock);
 
 	if (rw_initialized(&sc->policy_lock)) {
 		rw_destroy(&sc->policy_lock);
 #ifdef TCP_OFFLOAD
 		if (sc->policy != NULL)
 			free_offload_policy(sc->policy);
 #endif
 	}
 
 	for (i = 0; i < NUM_MEMWIN; i++) {
 		struct memwin *mw = &sc->memwin[i];
 
 		if (rw_initialized(&mw->mw_lock))
 			rw_destroy(&mw->mw_lock);
 	}
 
 	bzero(sc, sizeof(*sc));
 
 	return (0);
 }
 
 static int
 cxgbe_probe(device_t dev)
 {
 	char buf[128];
 	struct port_info *pi = device_get_softc(dev);
 
 	snprintf(buf, sizeof(buf), "port %d", pi->port_id);
 	device_set_desc_copy(dev, buf);
 
 	return (BUS_PROBE_DEFAULT);
 }
 
 #define T4_CAP (IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | \
     IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_JUMBO_MTU | IFCAP_LRO | \
     IFCAP_VLAN_HWTSO | IFCAP_LINKSTATE | IFCAP_HWCSUM_IPV6 | IFCAP_HWSTATS | \
     IFCAP_HWRXTSTMP)
 #define T4_CAP_ENABLE (T4_CAP)
 
 static int
 cxgbe_vi_attach(device_t dev, struct vi_info *vi)
 {
 	struct ifnet *ifp;
 	struct sbuf *sb;
 
 	vi->xact_addr_filt = -1;
 	callout_init(&vi->tick, 1);
 
 	/* Allocate an ifnet and set it up */
 	ifp = if_alloc_dev(IFT_ETHER, dev);
 	if (ifp == NULL) {
 		device_printf(dev, "Cannot allocate ifnet\n");
 		return (ENOMEM);
 	}
 	vi->ifp = ifp;
 	ifp->if_softc = vi;
 
 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 
 	ifp->if_init = cxgbe_init;
 	ifp->if_ioctl = cxgbe_ioctl;
 	ifp->if_transmit = cxgbe_transmit;
 	ifp->if_qflush = cxgbe_qflush;
 	ifp->if_get_counter = cxgbe_get_counter;
 #ifdef RATELIMIT
 	ifp->if_snd_tag_alloc = cxgbe_snd_tag_alloc;
 	ifp->if_snd_tag_modify = cxgbe_snd_tag_modify;
 	ifp->if_snd_tag_query = cxgbe_snd_tag_query;
 	ifp->if_snd_tag_free = cxgbe_snd_tag_free;
 #endif
 
 	ifp->if_capabilities = T4_CAP;
 	ifp->if_capenable = T4_CAP_ENABLE;
 #ifdef TCP_OFFLOAD
 	if (vi->nofldrxq != 0)
 		ifp->if_capabilities |= IFCAP_TOE;
 #endif
 #ifdef RATELIMIT
 	if (is_ethoffload(vi->pi->adapter) && vi->nofldtxq != 0) {
 		ifp->if_capabilities |= IFCAP_TXRTLMT;
 		ifp->if_capenable |= IFCAP_TXRTLMT;
 	}
 #endif
 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO |
 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6;
 
 	ifp->if_hw_tsomax = IP_MAXPACKET;
 	ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO;
 #ifdef RATELIMIT
 	if (is_ethoffload(vi->pi->adapter) && vi->nofldtxq != 0)
 		ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_EO_TSO;
 #endif
 	ifp->if_hw_tsomaxsegsize = 65536;
 
 	ether_ifattach(ifp, vi->hw_addr);
 #ifdef DEV_NETMAP
 	if (vi->nnmrxq != 0)
 		cxgbe_nm_attach(vi);
 #endif
 	sb = sbuf_new_auto();
 	sbuf_printf(sb, "%d txq, %d rxq (NIC)", vi->ntxq, vi->nrxq);
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	switch (ifp->if_capabilities & (IFCAP_TOE | IFCAP_TXRTLMT)) {
 	case IFCAP_TOE:
 		sbuf_printf(sb, "; %d txq (TOE)", vi->nofldtxq);
 		break;
 	case IFCAP_TOE | IFCAP_TXRTLMT:
 		sbuf_printf(sb, "; %d txq (TOE/ETHOFLD)", vi->nofldtxq);
 		break;
 	case IFCAP_TXRTLMT:
 		sbuf_printf(sb, "; %d txq (ETHOFLD)", vi->nofldtxq);
 		break;
 	}
 #endif
 #ifdef TCP_OFFLOAD
 	if (ifp->if_capabilities & IFCAP_TOE)
 		sbuf_printf(sb, ", %d rxq (TOE)", vi->nofldrxq);
 #endif
 #ifdef DEV_NETMAP
 	if (ifp->if_capabilities & IFCAP_NETMAP)
 		sbuf_printf(sb, "; %d txq, %d rxq (netmap)",
 		    vi->nnmtxq, vi->nnmrxq);
 #endif
 	sbuf_finish(sb);
 	device_printf(dev, "%s\n", sbuf_data(sb));
 	sbuf_delete(sb);
 
 	vi_sysctls(vi);
 
 	return (0);
 }
 
 static int
 cxgbe_attach(device_t dev)
 {
 	struct port_info *pi = device_get_softc(dev);
 	struct adapter *sc = pi->adapter;
 	struct vi_info *vi;
 	int i, rc;
 
 	callout_init_mtx(&pi->tick, &pi->pi_lock, 0);
 
 	rc = cxgbe_vi_attach(dev, &pi->vi[0]);
 	if (rc)
 		return (rc);
 
 	for_each_vi(pi, i, vi) {
 		if (i == 0)
 			continue;
 		vi->dev = device_add_child(dev, sc->names->vi_ifnet_name, -1);
 		if (vi->dev == NULL) {
 			device_printf(dev, "failed to add VI %d\n", i);
 			continue;
 		}
 		device_set_softc(vi->dev, vi);
 	}
 
 	cxgbe_sysctls(pi);
 
 	bus_generic_attach(dev);
 
 	return (0);
 }
 
 static void
 cxgbe_vi_detach(struct vi_info *vi)
 {
 	struct ifnet *ifp = vi->ifp;
 
 	ether_ifdetach(ifp);
 
 	/* Let detach proceed even if these fail. */
 #ifdef DEV_NETMAP
 	if (ifp->if_capabilities & IFCAP_NETMAP)
 		cxgbe_nm_detach(vi);
 #endif
 	cxgbe_uninit_synchronized(vi);
 	callout_drain(&vi->tick);
 	vi_full_uninit(vi);
 
 	if_free(vi->ifp);
 	vi->ifp = NULL;
 }
 
 static int
 cxgbe_detach(device_t dev)
 {
 	struct port_info *pi = device_get_softc(dev);
 	struct adapter *sc = pi->adapter;
 	int rc;
 
 	/* Detach the extra VIs first. */
 	rc = bus_generic_detach(dev);
 	if (rc)
 		return (rc);
 	device_delete_children(dev);
 
 	doom_vi(sc, &pi->vi[0]);
 
 	if (pi->flags & HAS_TRACEQ) {
 		sc->traceq = -1;	/* cloner should not create ifnet */
 		t4_tracer_port_detach(sc);
 	}
 
 	cxgbe_vi_detach(&pi->vi[0]);
 	callout_drain(&pi->tick);
 	ifmedia_removeall(&pi->media);
 
 	end_synchronized_op(sc, 0);
 
 	return (0);
 }
 
 static void
 cxgbe_init(void *arg)
 {
 	struct vi_info *vi = arg;
 	struct adapter *sc = vi->pi->adapter;
 
 	if (begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4init") != 0)
 		return;
 	cxgbe_init_synchronized(vi);
 	end_synchronized_op(sc, 0);
 }
 
 static int
 cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data)
 {
 	int rc = 0, mtu, flags;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct ifreq *ifr = (struct ifreq *)data;
 	uint32_t mask;
 
 	switch (cmd) {
 	case SIOCSIFMTU:
 		mtu = ifr->ifr_mtu;
 		if (mtu < ETHERMIN || mtu > MAX_MTU)
 			return (EINVAL);
 
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4mtu");
 		if (rc)
 			return (rc);
 		ifp->if_mtu = mtu;
 		if (vi->flags & VI_INIT_DONE) {
 			t4_update_fl_bufsize(ifp);
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 				rc = update_mac_settings(ifp, XGMAC_MTU);
 		}
 		end_synchronized_op(sc, 0);
 		break;
 
 	case SIOCSIFFLAGS:
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4flg");
 		if (rc)
 			return (rc);
 
 		if (ifp->if_flags & IFF_UP) {
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 				flags = vi->if_flags;
 				if ((ifp->if_flags ^ flags) &
 				    (IFF_PROMISC | IFF_ALLMULTI)) {
 					rc = update_mac_settings(ifp,
 					    XGMAC_PROMISC | XGMAC_ALLMULTI);
 				}
 			} else {
 				rc = cxgbe_init_synchronized(vi);
 			}
 			vi->if_flags = ifp->if_flags;
 		} else if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 			rc = cxgbe_uninit_synchronized(vi);
 		}
 		end_synchronized_op(sc, 0);
 		break;
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4multi");
 		if (rc)
 			return (rc);
 		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 			rc = update_mac_settings(ifp, XGMAC_MCADDRS);
 		end_synchronized_op(sc, 0);
 		break;
 
 	case SIOCSIFCAP:
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4cap");
 		if (rc)
 			return (rc);
 
 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
 		if (mask & IFCAP_TXCSUM) {
 			ifp->if_capenable ^= IFCAP_TXCSUM;
 			ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP);
 
 			if (IFCAP_TSO4 & ifp->if_capenable &&
 			    !(IFCAP_TXCSUM & ifp->if_capenable)) {
 				ifp->if_capenable &= ~IFCAP_TSO4;
 				if_printf(ifp,
 				    "tso4 disabled due to -txcsum.\n");
 			}
 		}
 		if (mask & IFCAP_TXCSUM_IPV6) {
 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
 			ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6);
 
 			if (IFCAP_TSO6 & ifp->if_capenable &&
 			    !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) {
 				ifp->if_capenable &= ~IFCAP_TSO6;
 				if_printf(ifp,
 				    "tso6 disabled due to -txcsum6.\n");
 			}
 		}
 		if (mask & IFCAP_RXCSUM)
 			ifp->if_capenable ^= IFCAP_RXCSUM;
 		if (mask & IFCAP_RXCSUM_IPV6)
 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
 
 		/*
 		 * Note that we leave CSUM_TSO alone (it is always set).  The
 		 * kernel takes both IFCAP_TSOx and CSUM_TSO into account before
 		 * sending a TSO request our way, so it's sufficient to toggle
 		 * IFCAP_TSOx only.
 		 */
 		if (mask & IFCAP_TSO4) {
 			if (!(IFCAP_TSO4 & ifp->if_capenable) &&
 			    !(IFCAP_TXCSUM & ifp->if_capenable)) {
 				if_printf(ifp, "enable txcsum first.\n");
 				rc = EAGAIN;
 				goto fail;
 			}
 			ifp->if_capenable ^= IFCAP_TSO4;
 		}
 		if (mask & IFCAP_TSO6) {
 			if (!(IFCAP_TSO6 & ifp->if_capenable) &&
 			    !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) {
 				if_printf(ifp, "enable txcsum6 first.\n");
 				rc = EAGAIN;
 				goto fail;
 			}
 			ifp->if_capenable ^= IFCAP_TSO6;
 		}
 		if (mask & IFCAP_LRO) {
 #if defined(INET) || defined(INET6)
 			int i;
 			struct sge_rxq *rxq;
 
 			ifp->if_capenable ^= IFCAP_LRO;
 			for_each_rxq(vi, i, rxq) {
 				if (ifp->if_capenable & IFCAP_LRO)
 					rxq->iq.flags |= IQ_LRO_ENABLED;
 				else
 					rxq->iq.flags &= ~IQ_LRO_ENABLED;
 			}
 #endif
 		}
 #ifdef TCP_OFFLOAD
 		if (mask & IFCAP_TOE) {
 			int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE;
 
 			rc = toe_capability(vi, enable);
 			if (rc != 0)
 				goto fail;
 
 			ifp->if_capenable ^= mask;
 		}
 #endif
 		if (mask & IFCAP_VLAN_HWTAGGING) {
 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 				rc = update_mac_settings(ifp, XGMAC_VLANEX);
 		}
 		if (mask & IFCAP_VLAN_MTU) {
 			ifp->if_capenable ^= IFCAP_VLAN_MTU;
 
 			/* Need to find out how to disable auto-mtu-inflation */
 		}
 		if (mask & IFCAP_VLAN_HWTSO)
 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
 		if (mask & IFCAP_VLAN_HWCSUM)
 			ifp->if_capenable ^= IFCAP_VLAN_HWCSUM;
 #ifdef RATELIMIT
 		if (mask & IFCAP_TXRTLMT)
 			ifp->if_capenable ^= IFCAP_TXRTLMT;
 #endif
 		if (mask & IFCAP_HWRXTSTMP) {
 			int i;
 			struct sge_rxq *rxq;
 
 			ifp->if_capenable ^= IFCAP_HWRXTSTMP;
 			for_each_rxq(vi, i, rxq) {
 				if (ifp->if_capenable & IFCAP_HWRXTSTMP)
 					rxq->iq.flags |= IQ_RX_TIMESTAMP;
 				else
 					rxq->iq.flags &= ~IQ_RX_TIMESTAMP;
 			}
 		}
 
 #ifdef VLAN_CAPABILITIES
 		VLAN_CAPABILITIES(ifp);
 #endif
 fail:
 		end_synchronized_op(sc, 0);
 		break;
 
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 	case SIOCGIFXMEDIA:
 		ifmedia_ioctl(ifp, ifr, &pi->media, cmd);
 		break;
 
 	case SIOCGI2C: {
 		struct ifi2creq i2c;
 
 		rc = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
 		if (rc != 0)
 			break;
 		if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
 			rc = EPERM;
 			break;
 		}
 		if (i2c.len > sizeof(i2c.data)) {
 			rc = EINVAL;
 			break;
 		}
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4i2c");
 		if (rc)
 			return (rc);
 		rc = -t4_i2c_rd(sc, sc->mbox, pi->port_id, i2c.dev_addr,
 		    i2c.offset, i2c.len, &i2c.data[0]);
 		end_synchronized_op(sc, 0);
 		if (rc == 0)
 			rc = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c));
 		break;
 	}
 
 	default:
 		rc = ether_ioctl(ifp, cmd, data);
 	}
 
 	return (rc);
 }
 
 static int
 cxgbe_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct sge_txq *txq;
 	void *items[1];
 	int rc;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_nextpkt == NULL);	/* not quite ready for this yet */
 
 	if (__predict_false(pi->link_cfg.link_ok == false)) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	rc = parse_pkt(sc, &m);
 	if (__predict_false(rc != 0)) {
 		MPASS(m == NULL);			/* was freed already */
 		atomic_add_int(&pi->tx_parse_error, 1);	/* rare, atomic is ok */
 		return (rc);
 	}
 #ifdef RATELIMIT
-	if (m->m_pkthdr.snd_tag != NULL) {
-		/* EAGAIN tells the stack we are not the correct interface. */
-		if (__predict_false(ifp != m->m_pkthdr.snd_tag->ifp)) {
-			m_freem(m);
-			return (EAGAIN);
-		}
-
+	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) {
+		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
 		return (ethofld_transmit(ifp, m));
 	}
 #endif
 
 	/* Select a txq. */
 	txq = &sc->sge.txq[vi->first_txq];
 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
 		txq += ((m->m_pkthdr.flowid % (vi->ntxq - vi->rsrv_noflowq)) +
 		    vi->rsrv_noflowq);
 
 	items[0] = m;
 	rc = mp_ring_enqueue(txq->r, items, 1, 4096);
 	if (__predict_false(rc != 0))
 		m_freem(m);
 
 	return (rc);
 }
 
 static void
 cxgbe_qflush(struct ifnet *ifp)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct sge_txq *txq;
 	int i;
 
 	/* queues do not exist if !VI_INIT_DONE. */
 	if (vi->flags & VI_INIT_DONE) {
 		for_each_txq(vi, i, txq) {
 			TXQ_LOCK(txq);
 			txq->eq.flags |= EQ_QFLUSH;
 			TXQ_UNLOCK(txq);
 			while (!mp_ring_is_idle(txq->r)) {
 				mp_ring_check_drainage(txq->r, 0);
 				pause("qflush", 1);
 			}
 			TXQ_LOCK(txq);
 			txq->eq.flags &= ~EQ_QFLUSH;
 			TXQ_UNLOCK(txq);
 		}
 	}
 	if_qflush(ifp);
 }
 
 static uint64_t
 vi_get_counter(struct ifnet *ifp, ift_counter c)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct fw_vi_stats_vf *s = &vi->stats;
 
 	vi_refresh_stats(vi->pi->adapter, vi);
 
 	switch (c) {
 	case IFCOUNTER_IPACKETS:
 		return (s->rx_bcast_frames + s->rx_mcast_frames +
 		    s->rx_ucast_frames);
 	case IFCOUNTER_IERRORS:
 		return (s->rx_err_frames);
 	case IFCOUNTER_OPACKETS:
 		return (s->tx_bcast_frames + s->tx_mcast_frames +
 		    s->tx_ucast_frames + s->tx_offload_frames);
 	case IFCOUNTER_OERRORS:
 		return (s->tx_drop_frames);
 	case IFCOUNTER_IBYTES:
 		return (s->rx_bcast_bytes + s->rx_mcast_bytes +
 		    s->rx_ucast_bytes);
 	case IFCOUNTER_OBYTES:
 		return (s->tx_bcast_bytes + s->tx_mcast_bytes +
 		    s->tx_ucast_bytes + s->tx_offload_bytes);
 	case IFCOUNTER_IMCASTS:
 		return (s->rx_mcast_frames);
 	case IFCOUNTER_OMCASTS:
 		return (s->tx_mcast_frames);
 	case IFCOUNTER_OQDROPS: {
 		uint64_t drops;
 
 		drops = 0;
 		if (vi->flags & VI_INIT_DONE) {
 			int i;
 			struct sge_txq *txq;
 
 			for_each_txq(vi, i, txq)
 				drops += counter_u64_fetch(txq->r->drops);
 		}
 
 		return (drops);
 
 	}
 
 	default:
 		return (if_get_counter_default(ifp, c));
 	}
 }
 
 uint64_t
 cxgbe_get_counter(struct ifnet *ifp, ift_counter c)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct port_stats *s = &pi->stats;
 
 	if (pi->nvi > 1 || sc->flags & IS_VF)
 		return (vi_get_counter(ifp, c));
 
 	cxgbe_refresh_stats(sc, pi);
 
 	switch (c) {
 	case IFCOUNTER_IPACKETS:
 		return (s->rx_frames);
 
 	case IFCOUNTER_IERRORS:
 		return (s->rx_jabber + s->rx_runt + s->rx_too_long +
 		    s->rx_fcs_err + s->rx_len_err);
 
 	case IFCOUNTER_OPACKETS:
 		return (s->tx_frames);
 
 	case IFCOUNTER_OERRORS:
 		return (s->tx_error_frames);
 
 	case IFCOUNTER_IBYTES:
 		return (s->rx_octets);
 
 	case IFCOUNTER_OBYTES:
 		return (s->tx_octets);
 
 	case IFCOUNTER_IMCASTS:
 		return (s->rx_mcast_frames);
 
 	case IFCOUNTER_OMCASTS:
 		return (s->tx_mcast_frames);
 
 	case IFCOUNTER_IQDROPS:
 		return (s->rx_ovflow0 + s->rx_ovflow1 + s->rx_ovflow2 +
 		    s->rx_ovflow3 + s->rx_trunc0 + s->rx_trunc1 + s->rx_trunc2 +
 		    s->rx_trunc3 + pi->tnl_cong_drops);
 
 	case IFCOUNTER_OQDROPS: {
 		uint64_t drops;
 
 		drops = s->tx_drop;
 		if (vi->flags & VI_INIT_DONE) {
 			int i;
 			struct sge_txq *txq;
 
 			for_each_txq(vi, i, txq)
 				drops += counter_u64_fetch(txq->r->drops);
 		}
 
 		return (drops);
 
 	}
 
 	default:
 		return (if_get_counter_default(ifp, c));
 	}
 }
 
 /*
  * The kernel picks a media from the list we had provided but we still validate
  * the requeste.
  */
 int
 cxgbe_media_change(struct ifnet *ifp)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct ifmedia *ifm = &pi->media;
 	struct link_config *lc = &pi->link_cfg;
 	struct adapter *sc = pi->adapter;
 	int rc;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mec");
 	if (rc != 0)
 		return (rc);
 	PORT_LOCK(pi);
 	if (IFM_SUBTYPE(ifm->ifm_media) == IFM_AUTO) {
 		/* ifconfig .. media autoselect */
 		if (!(lc->supported & FW_PORT_CAP32_ANEG)) {
 			rc = ENOTSUP; /* AN not supported by transceiver */
 			goto done;
 		}
 		lc->requested_aneg = AUTONEG_ENABLE;
 		lc->requested_speed = 0;
 		lc->requested_fc |= PAUSE_AUTONEG;
 	} else {
 		lc->requested_aneg = AUTONEG_DISABLE;
 		lc->requested_speed =
 		    ifmedia_baudrate(ifm->ifm_media) / 1000000;
 		lc->requested_fc = 0;
 		if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_RXPAUSE)
 			lc->requested_fc |= PAUSE_RX;
 		if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_TXPAUSE)
 			lc->requested_fc |= PAUSE_TX;
 	}
 	if (pi->up_vis > 0) {
 		fixup_link_config(pi);
 		rc = apply_link_config(pi);
 	}
 done:
 	PORT_UNLOCK(pi);
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 /*
  * Base media word (without ETHER, pause, link active, etc.) for the port at the
  * given speed.
  */
 static int
 port_mword(struct port_info *pi, uint32_t speed)
 {
 
 	MPASS(speed & M_FW_PORT_CAP32_SPEED);
 	MPASS(powerof2(speed));
 
 	switch(pi->port_type) {
 	case FW_PORT_TYPE_BT_SGMII:
 	case FW_PORT_TYPE_BT_XFI:
 	case FW_PORT_TYPE_BT_XAUI:
 		/* BaseT */
 		switch (speed) {
 		case FW_PORT_CAP32_SPEED_100M:
 			return (IFM_100_T);
 		case FW_PORT_CAP32_SPEED_1G:
 			return (IFM_1000_T);
 		case FW_PORT_CAP32_SPEED_10G:
 			return (IFM_10G_T);
 		}
 		break;
 	case FW_PORT_TYPE_KX4:
 		if (speed == FW_PORT_CAP32_SPEED_10G)
 			return (IFM_10G_KX4);
 		break;
 	case FW_PORT_TYPE_CX4:
 		if (speed == FW_PORT_CAP32_SPEED_10G)
 			return (IFM_10G_CX4);
 		break;
 	case FW_PORT_TYPE_KX:
 		if (speed == FW_PORT_CAP32_SPEED_1G)
 			return (IFM_1000_KX);
 		break;
 	case FW_PORT_TYPE_KR:
 	case FW_PORT_TYPE_BP_AP:
 	case FW_PORT_TYPE_BP4_AP:
 	case FW_PORT_TYPE_BP40_BA:
 	case FW_PORT_TYPE_KR4_100G:
 	case FW_PORT_TYPE_KR_SFP28:
 	case FW_PORT_TYPE_KR_XLAUI:
 		switch (speed) {
 		case FW_PORT_CAP32_SPEED_1G:
 			return (IFM_1000_KX);
 		case FW_PORT_CAP32_SPEED_10G:
 			return (IFM_10G_KR);
 		case FW_PORT_CAP32_SPEED_25G:
 			return (IFM_25G_KR);
 		case FW_PORT_CAP32_SPEED_40G:
 			return (IFM_40G_KR4);
 		case FW_PORT_CAP32_SPEED_50G:
 			return (IFM_50G_KR2);
 		case FW_PORT_CAP32_SPEED_100G:
 			return (IFM_100G_KR4);
 		}
 		break;
 	case FW_PORT_TYPE_FIBER_XFI:
 	case FW_PORT_TYPE_FIBER_XAUI:
 	case FW_PORT_TYPE_SFP:
 	case FW_PORT_TYPE_QSFP_10G:
 	case FW_PORT_TYPE_QSA:
 	case FW_PORT_TYPE_QSFP:
 	case FW_PORT_TYPE_CR4_QSFP:
 	case FW_PORT_TYPE_CR_QSFP:
 	case FW_PORT_TYPE_CR2_QSFP:
 	case FW_PORT_TYPE_SFP28:
 		/* Pluggable transceiver */
 		switch (pi->mod_type) {
 		case FW_PORT_MOD_TYPE_LR:
 			switch (speed) {
 			case FW_PORT_CAP32_SPEED_1G:
 				return (IFM_1000_LX);
 			case FW_PORT_CAP32_SPEED_10G:
 				return (IFM_10G_LR);
 			case FW_PORT_CAP32_SPEED_25G:
 				return (IFM_25G_LR);
 			case FW_PORT_CAP32_SPEED_40G:
 				return (IFM_40G_LR4);
 			case FW_PORT_CAP32_SPEED_50G:
 				return (IFM_50G_LR2);
 			case FW_PORT_CAP32_SPEED_100G:
 				return (IFM_100G_LR4);
 			}
 			break;
 		case FW_PORT_MOD_TYPE_SR:
 			switch (speed) {
 			case FW_PORT_CAP32_SPEED_1G:
 				return (IFM_1000_SX);
 			case FW_PORT_CAP32_SPEED_10G:
 				return (IFM_10G_SR);
 			case FW_PORT_CAP32_SPEED_25G:
 				return (IFM_25G_SR);
 			case FW_PORT_CAP32_SPEED_40G:
 				return (IFM_40G_SR4);
 			case FW_PORT_CAP32_SPEED_50G:
 				return (IFM_50G_SR2);
 			case FW_PORT_CAP32_SPEED_100G:
 				return (IFM_100G_SR4);
 			}
 			break;
 		case FW_PORT_MOD_TYPE_ER:
 			if (speed == FW_PORT_CAP32_SPEED_10G)
 				return (IFM_10G_ER);
 			break;
 		case FW_PORT_MOD_TYPE_TWINAX_PASSIVE:
 		case FW_PORT_MOD_TYPE_TWINAX_ACTIVE:
 			switch (speed) {
 			case FW_PORT_CAP32_SPEED_1G:
 				return (IFM_1000_CX);
 			case FW_PORT_CAP32_SPEED_10G:
 				return (IFM_10G_TWINAX);
 			case FW_PORT_CAP32_SPEED_25G:
 				return (IFM_25G_CR);
 			case FW_PORT_CAP32_SPEED_40G:
 				return (IFM_40G_CR4);
 			case FW_PORT_CAP32_SPEED_50G:
 				return (IFM_50G_CR2);
 			case FW_PORT_CAP32_SPEED_100G:
 				return (IFM_100G_CR4);
 			}
 			break;
 		case FW_PORT_MOD_TYPE_LRM:
 			if (speed == FW_PORT_CAP32_SPEED_10G)
 				return (IFM_10G_LRM);
 			break;
 		case FW_PORT_MOD_TYPE_NA:
 			MPASS(0);	/* Not pluggable? */
 			/* fall throough */
 		case FW_PORT_MOD_TYPE_ERROR:
 		case FW_PORT_MOD_TYPE_UNKNOWN:
 		case FW_PORT_MOD_TYPE_NOTSUPPORTED:
 			break;
 		case FW_PORT_MOD_TYPE_NONE:
 			return (IFM_NONE);
 		}
 		break;
 	case FW_PORT_TYPE_NONE:
 		return (IFM_NONE);
 	}
 
 	return (IFM_UNKNOWN);
 }
 
 void
 cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 
 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4med") != 0)
 		return;
 	PORT_LOCK(pi);
 
 	if (pi->up_vis == 0) {
 		/*
 		 * If all the interfaces are administratively down the firmware
 		 * does not report transceiver changes.  Refresh port info here
 		 * so that ifconfig displays accurate ifmedia at all times.
 		 * This is the only reason we have a synchronized op in this
 		 * function.  Just PORT_LOCK would have been enough otherwise.
 		 */
 		t4_update_port_info(pi);
 		build_medialist(pi);
 	}
 
 	/* ifm_status */
 	ifmr->ifm_status = IFM_AVALID;
 	if (lc->link_ok == false)
 		goto done;
 	ifmr->ifm_status |= IFM_ACTIVE;
 
 	/* ifm_active */
 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
 	ifmr->ifm_active &= ~(IFM_ETH_TXPAUSE | IFM_ETH_RXPAUSE);
 	if (lc->fc & PAUSE_RX)
 		ifmr->ifm_active |= IFM_ETH_RXPAUSE;
 	if (lc->fc & PAUSE_TX)
 		ifmr->ifm_active |= IFM_ETH_TXPAUSE;
 	ifmr->ifm_active |= port_mword(pi, speed_to_fwcap(lc->speed));
 done:
 	PORT_UNLOCK(pi);
 	end_synchronized_op(sc, 0);
 }
 
 static int
 vcxgbe_probe(device_t dev)
 {
 	char buf[128];
 	struct vi_info *vi = device_get_softc(dev);
 
 	snprintf(buf, sizeof(buf), "port %d vi %td", vi->pi->port_id,
 	    vi - vi->pi->vi);
 	device_set_desc_copy(dev, buf);
 
 	return (BUS_PROBE_DEFAULT);
 }
 
 static int
 alloc_extra_vi(struct adapter *sc, struct port_info *pi, struct vi_info *vi)
 {
 	int func, index, rc;
 	uint32_t param, val;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	index = vi - pi->vi;
 	MPASS(index > 0);	/* This function deals with _extra_ VIs only */
 	KASSERT(index < nitems(vi_mac_funcs),
 	    ("%s: VI %s doesn't have a MAC func", __func__,
 	    device_get_nameunit(vi->dev)));
 	func = vi_mac_funcs[index];
 	rc = t4_alloc_vi_func(sc, sc->mbox, pi->tx_chan, sc->pf, 0, 1,
 	    vi->hw_addr, &vi->rss_size, &vi->vfvld, &vi->vin, func, 0);
 	if (rc < 0) {
 		device_printf(vi->dev, "failed to allocate virtual interface %d"
 		    "for port %d: %d\n", index, pi->port_id, -rc);
 		return (-rc);
 	}
 	vi->viid = rc;
 
 	if (vi->rss_size == 1) {
 		/*
 		 * This VI didn't get a slice of the RSS table.  Reduce the
 		 * number of VIs being created (hw.cxgbe.num_vis) or modify the
 		 * configuration file (nvi, rssnvi for this PF) if this is a
 		 * problem.
 		 */
 		device_printf(vi->dev, "RSS table not available.\n");
 		vi->rss_base = 0xffff;
 
 		return (0);
 	}
 
 	param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_RSSINFO) |
 	    V_FW_PARAMS_PARAM_YZ(vi->viid);
 	rc = t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 	if (rc)
 		vi->rss_base = 0xffff;
 	else {
 		MPASS((val >> 16) == vi->rss_size);
 		vi->rss_base = val & 0xffff;
 	}
 
 	return (0);
 }
 
 static int
 vcxgbe_attach(device_t dev)
 {
 	struct vi_info *vi;
 	struct port_info *pi;
 	struct adapter *sc;
 	int rc;
 
 	vi = device_get_softc(dev);
 	pi = vi->pi;
 	sc = pi->adapter;
 
 	rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4via");
 	if (rc)
 		return (rc);
 	rc = alloc_extra_vi(sc, pi, vi);
 	end_synchronized_op(sc, 0);
 	if (rc)
 		return (rc);
 
 	rc = cxgbe_vi_attach(dev, vi);
 	if (rc) {
 		t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid);
 		return (rc);
 	}
 	return (0);
 }
 
 static int
 vcxgbe_detach(device_t dev)
 {
 	struct vi_info *vi;
 	struct adapter *sc;
 
 	vi = device_get_softc(dev);
 	sc = vi->pi->adapter;
 
 	doom_vi(sc, vi);
 
 	cxgbe_vi_detach(vi);
 	t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid);
 
 	end_synchronized_op(sc, 0);
 
 	return (0);
 }
 
 static struct callout fatal_callout;
 
 static void
 delayed_panic(void *arg)
 {
 	struct adapter *sc = arg;
 
 	panic("%s: panic on fatal error", device_get_nameunit(sc->dev));
 }
 
 void
 t4_fatal_err(struct adapter *sc, bool fw_error)
 {
 
 	t4_shutdown_adapter(sc);
 	log(LOG_ALERT, "%s: encountered fatal error, adapter stopped.\n",
 	    device_get_nameunit(sc->dev));
 	if (fw_error) {
 		ASSERT_SYNCHRONIZED_OP(sc);
 		sc->flags |= ADAP_ERR;
 	} else {
 		ADAPTER_LOCK(sc);
 		sc->flags |= ADAP_ERR;
 		ADAPTER_UNLOCK(sc);
 	}
 
 	if (t4_panic_on_fatal_err) {
 		log(LOG_ALERT, "%s: panic on fatal error after 30s",
 		    device_get_nameunit(sc->dev));
 		callout_reset(&fatal_callout, hz * 30, delayed_panic, sc);
 	}
 }
 
 void
 t4_add_adapter(struct adapter *sc)
 {
 	sx_xlock(&t4_list_lock);
 	SLIST_INSERT_HEAD(&t4_list, sc, link);
 	sx_xunlock(&t4_list_lock);
 }
 
 int
 t4_map_bars_0_and_4(struct adapter *sc)
 {
 	sc->regs_rid = PCIR_BAR(0);
 	sc->regs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
 	    &sc->regs_rid, RF_ACTIVE);
 	if (sc->regs_res == NULL) {
 		device_printf(sc->dev, "cannot map registers.\n");
 		return (ENXIO);
 	}
 	sc->bt = rman_get_bustag(sc->regs_res);
 	sc->bh = rman_get_bushandle(sc->regs_res);
 	sc->mmio_len = rman_get_size(sc->regs_res);
 	setbit(&sc->doorbells, DOORBELL_KDB);
 
 	sc->msix_rid = PCIR_BAR(4);
 	sc->msix_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
 	    &sc->msix_rid, RF_ACTIVE);
 	if (sc->msix_res == NULL) {
 		device_printf(sc->dev, "cannot map MSI-X BAR.\n");
 		return (ENXIO);
 	}
 
 	return (0);
 }
 
 int
 t4_map_bar_2(struct adapter *sc)
 {
 
 	/*
 	 * T4: only iWARP driver uses the userspace doorbells.  There is no need
 	 * to map it if RDMA is disabled.
 	 */
 	if (is_t4(sc) && sc->rdmacaps == 0)
 		return (0);
 
 	sc->udbs_rid = PCIR_BAR(2);
 	sc->udbs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
 	    &sc->udbs_rid, RF_ACTIVE);
 	if (sc->udbs_res == NULL) {
 		device_printf(sc->dev, "cannot map doorbell BAR.\n");
 		return (ENXIO);
 	}
 	sc->udbs_base = rman_get_virtual(sc->udbs_res);
 
 	if (chip_id(sc) >= CHELSIO_T5) {
 		setbit(&sc->doorbells, DOORBELL_UDB);
 #if defined(__i386__) || defined(__amd64__)
 		if (t5_write_combine) {
 			int rc, mode;
 
 			/*
 			 * Enable write combining on BAR2.  This is the
 			 * userspace doorbell BAR and is split into 128B
 			 * (UDBS_SEG_SIZE) doorbell regions, each associated
 			 * with an egress queue.  The first 64B has the doorbell
 			 * and the second 64B can be used to submit a tx work
 			 * request with an implicit doorbell.
 			 */
 
 			rc = pmap_change_attr((vm_offset_t)sc->udbs_base,
 			    rman_get_size(sc->udbs_res), PAT_WRITE_COMBINING);
 			if (rc == 0) {
 				clrbit(&sc->doorbells, DOORBELL_UDB);
 				setbit(&sc->doorbells, DOORBELL_WCWR);
 				setbit(&sc->doorbells, DOORBELL_UDBWC);
 			} else {
 				device_printf(sc->dev,
 				    "couldn't enable write combining: %d\n",
 				    rc);
 			}
 
 			mode = is_t5(sc) ? V_STATMODE(0) : V_T6_STATMODE(0);
 			t4_write_reg(sc, A_SGE_STAT_CFG,
 			    V_STATSOURCE_T5(7) | mode);
 		}
 #endif
 	}
 	sc->iwt.wc_en = isset(&sc->doorbells, DOORBELL_UDBWC) ? 1 : 0;
 
 	return (0);
 }
 
 struct memwin_init {
 	uint32_t base;
 	uint32_t aperture;
 };
 
 static const struct memwin_init t4_memwin[NUM_MEMWIN] = {
 	{ MEMWIN0_BASE, MEMWIN0_APERTURE },
 	{ MEMWIN1_BASE, MEMWIN1_APERTURE },
 	{ MEMWIN2_BASE_T4, MEMWIN2_APERTURE_T4 }
 };
 
 static const struct memwin_init t5_memwin[NUM_MEMWIN] = {
 	{ MEMWIN0_BASE, MEMWIN0_APERTURE },
 	{ MEMWIN1_BASE, MEMWIN1_APERTURE },
 	{ MEMWIN2_BASE_T5, MEMWIN2_APERTURE_T5 },
 };
 
 static void
 setup_memwin(struct adapter *sc)
 {
 	const struct memwin_init *mw_init;
 	struct memwin *mw;
 	int i;
 	uint32_t bar0;
 
 	if (is_t4(sc)) {
 		/*
 		 * Read low 32b of bar0 indirectly via the hardware backdoor
 		 * mechanism.  Works from within PCI passthrough environments
 		 * too, where rman_get_start() can return a different value.  We
 		 * need to program the T4 memory window decoders with the actual
 		 * addresses that will be coming across the PCIe link.
 		 */
 		bar0 = t4_hw_pci_read_cfg4(sc, PCIR_BAR(0));
 		bar0 &= (uint32_t) PCIM_BAR_MEM_BASE;
 
 		mw_init = &t4_memwin[0];
 	} else {
 		/* T5+ use the relative offset inside the PCIe BAR */
 		bar0 = 0;
 
 		mw_init = &t5_memwin[0];
 	}
 
 	for (i = 0, mw = &sc->memwin[0]; i < NUM_MEMWIN; i++, mw_init++, mw++) {
 		rw_init(&mw->mw_lock, "memory window access");
 		mw->mw_base = mw_init->base;
 		mw->mw_aperture = mw_init->aperture;
 		mw->mw_curpos = 0;
 		t4_write_reg(sc,
 		    PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, i),
 		    (mw->mw_base + bar0) | V_BIR(0) |
 		    V_WINDOW(ilog2(mw->mw_aperture) - 10));
 		rw_wlock(&mw->mw_lock);
 		position_memwin(sc, i, 0);
 		rw_wunlock(&mw->mw_lock);
 	}
 
 	/* flush */
 	t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, 2));
 }
 
 /*
  * Positions the memory window at the given address in the card's address space.
  * There are some alignment requirements and the actual position may be at an
  * address prior to the requested address.  mw->mw_curpos always has the actual
  * position of the window.
  */
 static void
 position_memwin(struct adapter *sc, int idx, uint32_t addr)
 {
 	struct memwin *mw;
 	uint32_t pf;
 	uint32_t reg;
 
 	MPASS(idx >= 0 && idx < NUM_MEMWIN);
 	mw = &sc->memwin[idx];
 	rw_assert(&mw->mw_lock, RA_WLOCKED);
 
 	if (is_t4(sc)) {
 		pf = 0;
 		mw->mw_curpos = addr & ~0xf;	/* start must be 16B aligned */
 	} else {
 		pf = V_PFNUM(sc->pf);
 		mw->mw_curpos = addr & ~0x7f;	/* start must be 128B aligned */
 	}
 	reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, idx);
 	t4_write_reg(sc, reg, mw->mw_curpos | pf);
 	t4_read_reg(sc, reg);	/* flush */
 }
 
 int
 rw_via_memwin(struct adapter *sc, int idx, uint32_t addr, uint32_t *val,
     int len, int rw)
 {
 	struct memwin *mw;
 	uint32_t mw_end, v;
 
 	MPASS(idx >= 0 && idx < NUM_MEMWIN);
 
 	/* Memory can only be accessed in naturally aligned 4 byte units */
 	if (addr & 3 || len & 3 || len <= 0)
 		return (EINVAL);
 
 	mw = &sc->memwin[idx];
 	while (len > 0) {
 		rw_rlock(&mw->mw_lock);
 		mw_end = mw->mw_curpos + mw->mw_aperture;
 		if (addr >= mw_end || addr < mw->mw_curpos) {
 			/* Will need to reposition the window */
 			if (!rw_try_upgrade(&mw->mw_lock)) {
 				rw_runlock(&mw->mw_lock);
 				rw_wlock(&mw->mw_lock);
 			}
 			rw_assert(&mw->mw_lock, RA_WLOCKED);
 			position_memwin(sc, idx, addr);
 			rw_downgrade(&mw->mw_lock);
 			mw_end = mw->mw_curpos + mw->mw_aperture;
 		}
 		rw_assert(&mw->mw_lock, RA_RLOCKED);
 		while (addr < mw_end && len > 0) {
 			if (rw == 0) {
 				v = t4_read_reg(sc, mw->mw_base + addr -
 				    mw->mw_curpos);
 				*val++ = le32toh(v);
 			} else {
 				v = *val++;
 				t4_write_reg(sc, mw->mw_base + addr -
 				    mw->mw_curpos, htole32(v));
 			}
 			addr += 4;
 			len -= 4;
 		}
 		rw_runlock(&mw->mw_lock);
 	}
 
 	return (0);
 }
 
 int
 alloc_atid_tab(struct tid_info *t, int flags)
 {
 	int i;
 
 	MPASS(t->natids > 0);
 	MPASS(t->atid_tab == NULL);
 
 	t->atid_tab = malloc(t->natids * sizeof(*t->atid_tab), M_CXGBE,
 	    M_ZERO | flags);
 	if (t->atid_tab == NULL)
 		return (ENOMEM);
 	mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF);
 	t->afree = t->atid_tab;
 	t->atids_in_use = 0;
 	for (i = 1; i < t->natids; i++)
 		t->atid_tab[i - 1].next = &t->atid_tab[i];
 	t->atid_tab[t->natids - 1].next = NULL;
 
 	return (0);
 }
 
 void
 free_atid_tab(struct tid_info *t)
 {
 
 	KASSERT(t->atids_in_use == 0,
 	    ("%s: %d atids still in use.", __func__, t->atids_in_use));
 
 	if (mtx_initialized(&t->atid_lock))
 		mtx_destroy(&t->atid_lock);
 	free(t->atid_tab, M_CXGBE);
 	t->atid_tab = NULL;
 }
 
 int
 alloc_atid(struct adapter *sc, void *ctx)
 {
 	struct tid_info *t = &sc->tids;
 	int atid = -1;
 
 	mtx_lock(&t->atid_lock);
 	if (t->afree) {
 		union aopen_entry *p = t->afree;
 
 		atid = p - t->atid_tab;
 		MPASS(atid <= M_TID_TID);
 		t->afree = p->next;
 		p->data = ctx;
 		t->atids_in_use++;
 	}
 	mtx_unlock(&t->atid_lock);
 	return (atid);
 }
 
 void *
 lookup_atid(struct adapter *sc, int atid)
 {
 	struct tid_info *t = &sc->tids;
 
 	return (t->atid_tab[atid].data);
 }
 
 void
 free_atid(struct adapter *sc, int atid)
 {
 	struct tid_info *t = &sc->tids;
 	union aopen_entry *p = &t->atid_tab[atid];
 
 	mtx_lock(&t->atid_lock);
 	p->next = t->afree;
 	t->afree = p;
 	t->atids_in_use--;
 	mtx_unlock(&t->atid_lock);
 }
 
 static void
 queue_tid_release(struct adapter *sc, int tid)
 {
 
 	CXGBE_UNIMPLEMENTED("deferred tid release");
 }
 
 void
 release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq)
 {
 	struct wrqe *wr;
 	struct cpl_tid_release *req;
 
 	wr = alloc_wrqe(sizeof(*req), ctrlq);
 	if (wr == NULL) {
 		queue_tid_release(sc, tid);	/* defer */
 		return;
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid);
 
 	t4_wrq_tx(sc, wr);
 }
 
 static int
 t4_range_cmp(const void *a, const void *b)
 {
 	return ((const struct t4_range *)a)->start -
 	       ((const struct t4_range *)b)->start;
 }
 
 /*
  * Verify that the memory range specified by the addr/len pair is valid within
  * the card's address space.
  */
 static int
 validate_mem_range(struct adapter *sc, uint32_t addr, uint32_t len)
 {
 	struct t4_range mem_ranges[4], *r, *next;
 	uint32_t em, addr_len;
 	int i, n, remaining;
 
 	/* Memory can only be accessed in naturally aligned 4 byte units */
 	if (addr & 3 || len & 3 || len == 0)
 		return (EINVAL);
 
 	/* Enabled memories */
 	em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
 
 	r = &mem_ranges[0];
 	n = 0;
 	bzero(r, sizeof(mem_ranges));
 	if (em & F_EDRAM0_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR);
 		r->size = G_EDRAM0_SIZE(addr_len) << 20;
 		if (r->size > 0) {
 			r->start = G_EDRAM0_BASE(addr_len) << 20;
 			if (addr >= r->start &&
 			    addr + len <= r->start + r->size)
 				return (0);
 			r++;
 			n++;
 		}
 	}
 	if (em & F_EDRAM1_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR);
 		r->size = G_EDRAM1_SIZE(addr_len) << 20;
 		if (r->size > 0) {
 			r->start = G_EDRAM1_BASE(addr_len) << 20;
 			if (addr >= r->start &&
 			    addr + len <= r->start + r->size)
 				return (0);
 			r++;
 			n++;
 		}
 	}
 	if (em & F_EXT_MEM_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR);
 		r->size = G_EXT_MEM_SIZE(addr_len) << 20;
 		if (r->size > 0) {
 			r->start = G_EXT_MEM_BASE(addr_len) << 20;
 			if (addr >= r->start &&
 			    addr + len <= r->start + r->size)
 				return (0);
 			r++;
 			n++;
 		}
 	}
 	if (is_t5(sc) && em & F_EXT_MEM1_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR);
 		r->size = G_EXT_MEM1_SIZE(addr_len) << 20;
 		if (r->size > 0) {
 			r->start = G_EXT_MEM1_BASE(addr_len) << 20;
 			if (addr >= r->start &&
 			    addr + len <= r->start + r->size)
 				return (0);
 			r++;
 			n++;
 		}
 	}
 	MPASS(n <= nitems(mem_ranges));
 
 	if (n > 1) {
 		/* Sort and merge the ranges. */
 		qsort(mem_ranges, n, sizeof(struct t4_range), t4_range_cmp);
 
 		/* Start from index 0 and examine the next n - 1 entries. */
 		r = &mem_ranges[0];
 		for (remaining = n - 1; remaining > 0; remaining--, r++) {
 
 			MPASS(r->size > 0);	/* r is a valid entry. */
 			next = r + 1;
 			MPASS(next->size > 0);	/* and so is the next one. */
 
 			while (r->start + r->size >= next->start) {
 				/* Merge the next one into the current entry. */
 				r->size = max(r->start + r->size,
 				    next->start + next->size) - r->start;
 				n--;	/* One fewer entry in total. */
 				if (--remaining == 0)
 					goto done;	/* short circuit */
 				next++;
 			}
 			if (next != r + 1) {
 				/*
 				 * Some entries were merged into r and next
 				 * points to the first valid entry that couldn't
 				 * be merged.
 				 */
 				MPASS(next->size > 0);	/* must be valid */
 				memcpy(r + 1, next, remaining * sizeof(*r));
 #ifdef INVARIANTS
 				/*
 				 * This so that the foo->size assertion in the
 				 * next iteration of the loop do the right
 				 * thing for entries that were pulled up and are
 				 * no longer valid.
 				 */
 				MPASS(n < nitems(mem_ranges));
 				bzero(&mem_ranges[n], (nitems(mem_ranges) - n) *
 				    sizeof(struct t4_range));
 #endif
 			}
 		}
 done:
 		/* Done merging the ranges. */
 		MPASS(n > 0);
 		r = &mem_ranges[0];
 		for (i = 0; i < n; i++, r++) {
 			if (addr >= r->start &&
 			    addr + len <= r->start + r->size)
 				return (0);
 		}
 	}
 
 	return (EFAULT);
 }
 
 static int
 fwmtype_to_hwmtype(int mtype)
 {
 
 	switch (mtype) {
 	case FW_MEMTYPE_EDC0:
 		return (MEM_EDC0);
 	case FW_MEMTYPE_EDC1:
 		return (MEM_EDC1);
 	case FW_MEMTYPE_EXTMEM:
 		return (MEM_MC0);
 	case FW_MEMTYPE_EXTMEM1:
 		return (MEM_MC1);
 	default:
 		panic("%s: cannot translate fw mtype %d.", __func__, mtype);
 	}
 }
 
 /*
  * Verify that the memory range specified by the memtype/offset/len pair is
  * valid and lies entirely within the memtype specified.  The global address of
  * the start of the range is returned in addr.
  */
 static int
 validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, uint32_t len,
     uint32_t *addr)
 {
 	uint32_t em, addr_len, maddr;
 
 	/* Memory can only be accessed in naturally aligned 4 byte units */
 	if (off & 3 || len & 3 || len == 0)
 		return (EINVAL);
 
 	em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
 	switch (fwmtype_to_hwmtype(mtype)) {
 	case MEM_EDC0:
 		if (!(em & F_EDRAM0_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR);
 		maddr = G_EDRAM0_BASE(addr_len) << 20;
 		break;
 	case MEM_EDC1:
 		if (!(em & F_EDRAM1_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR);
 		maddr = G_EDRAM1_BASE(addr_len) << 20;
 		break;
 	case MEM_MC:
 		if (!(em & F_EXT_MEM_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR);
 		maddr = G_EXT_MEM_BASE(addr_len) << 20;
 		break;
 	case MEM_MC1:
 		if (!is_t5(sc) || !(em & F_EXT_MEM1_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR);
 		maddr = G_EXT_MEM1_BASE(addr_len) << 20;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	*addr = maddr + off;	/* global address */
 	return (validate_mem_range(sc, *addr, len));
 }
 
 static int
 fixup_devlog_params(struct adapter *sc)
 {
 	struct devlog_params *dparams = &sc->params.devlog;
 	int rc;
 
 	rc = validate_mt_off_len(sc, dparams->memtype, dparams->start,
 	    dparams->size, &dparams->addr);
 
 	return (rc);
 }
 
 static void
 update_nirq(struct intrs_and_queues *iaq, int nports)
 {
 	int extra = T4_EXTRA_INTR;
 
 	iaq->nirq = extra;
 	iaq->nirq += nports * (iaq->nrxq + iaq->nofldrxq);
 	iaq->nirq += nports * (iaq->num_vis - 1) *
 	    max(iaq->nrxq_vi, iaq->nnmrxq_vi);
 	iaq->nirq += nports * (iaq->num_vis - 1) * iaq->nofldrxq_vi;
 }
 
 /*
  * Adjust requirements to fit the number of interrupts available.
  */
 static void
 calculate_iaq(struct adapter *sc, struct intrs_and_queues *iaq, int itype,
     int navail)
 {
 	int old_nirq;
 	const int nports = sc->params.nports;
 
 	MPASS(nports > 0);
 	MPASS(navail > 0);
 
 	bzero(iaq, sizeof(*iaq));
 	iaq->intr_type = itype;
 	iaq->num_vis = t4_num_vis;
 	iaq->ntxq = t4_ntxq;
 	iaq->ntxq_vi = t4_ntxq_vi;
 	iaq->nrxq = t4_nrxq;
 	iaq->nrxq_vi = t4_nrxq_vi;
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	if (is_offload(sc) || is_ethoffload(sc)) {
 		iaq->nofldtxq = t4_nofldtxq;
 		iaq->nofldtxq_vi = t4_nofldtxq_vi;
 	}
 #endif
 #ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 		iaq->nofldrxq = t4_nofldrxq;
 		iaq->nofldrxq_vi = t4_nofldrxq_vi;
 	}
 #endif
 #ifdef DEV_NETMAP
 	iaq->nnmtxq_vi = t4_nnmtxq_vi;
 	iaq->nnmrxq_vi = t4_nnmrxq_vi;
 #endif
 
 	update_nirq(iaq, nports);
 	if (iaq->nirq <= navail &&
 	    (itype != INTR_MSI || powerof2(iaq->nirq))) {
 		/*
 		 * This is the normal case -- there are enough interrupts for
 		 * everything.
 		 */
 		goto done;
 	}
 
 	/*
 	 * If extra VIs have been configured try reducing their count and see if
 	 * that works.
 	 */
 	while (iaq->num_vis > 1) {
 		iaq->num_vis--;
 		update_nirq(iaq, nports);
 		if (iaq->nirq <= navail &&
 		    (itype != INTR_MSI || powerof2(iaq->nirq))) {
 			device_printf(sc->dev, "virtual interfaces per port "
 			    "reduced to %d from %d.  nrxq=%u, nofldrxq=%u, "
 			    "nrxq_vi=%u nofldrxq_vi=%u, nnmrxq_vi=%u.  "
 			    "itype %d, navail %u, nirq %d.\n",
 			    iaq->num_vis, t4_num_vis, iaq->nrxq, iaq->nofldrxq,
 			    iaq->nrxq_vi, iaq->nofldrxq_vi, iaq->nnmrxq_vi,
 			    itype, navail, iaq->nirq);
 			goto done;
 		}
 	}
 
 	/*
 	 * Extra VIs will not be created.  Log a message if they were requested.
 	 */
 	MPASS(iaq->num_vis == 1);
 	iaq->ntxq_vi = iaq->nrxq_vi = 0;
 	iaq->nofldtxq_vi = iaq->nofldrxq_vi = 0;
 	iaq->nnmtxq_vi = iaq->nnmrxq_vi = 0;
 	if (iaq->num_vis != t4_num_vis) {
 		device_printf(sc->dev, "extra virtual interfaces disabled.  "
 		    "nrxq=%u, nofldrxq=%u, nrxq_vi=%u nofldrxq_vi=%u, "
 		    "nnmrxq_vi=%u.  itype %d, navail %u, nirq %d.\n",
 		    iaq->nrxq, iaq->nofldrxq, iaq->nrxq_vi, iaq->nofldrxq_vi,
 		    iaq->nnmrxq_vi, itype, navail, iaq->nirq);
 	}
 
 	/*
 	 * Keep reducing the number of NIC rx queues to the next lower power of
 	 * 2 (for even RSS distribution) and halving the TOE rx queues and see
 	 * if that works.
 	 */
 	do {
 		if (iaq->nrxq > 1) {
 			do {
 				iaq->nrxq--;
 			} while (!powerof2(iaq->nrxq));
 		}
 		if (iaq->nofldrxq > 1)
 			iaq->nofldrxq >>= 1;
 
 		old_nirq = iaq->nirq;
 		update_nirq(iaq, nports);
 		if (iaq->nirq <= navail &&
 		    (itype != INTR_MSI || powerof2(iaq->nirq))) {
 			device_printf(sc->dev, "running with reduced number of "
 			    "rx queues because of shortage of interrupts.  "
 			    "nrxq=%u, nofldrxq=%u.  "
 			    "itype %d, navail %u, nirq %d.\n", iaq->nrxq,
 			    iaq->nofldrxq, itype, navail, iaq->nirq);
 			goto done;
 		}
 	} while (old_nirq != iaq->nirq);
 
 	/* One interrupt for everything.  Ugh. */
 	device_printf(sc->dev, "running with minimal number of queues.  "
 	    "itype %d, navail %u.\n", itype, navail);
 	iaq->nirq = 1;
 	MPASS(iaq->nrxq == 1);
 	iaq->ntxq = 1;
 	if (iaq->nofldrxq > 1)
 		iaq->nofldtxq = 1;
 done:
 	MPASS(iaq->num_vis > 0);
 	if (iaq->num_vis > 1) {
 		MPASS(iaq->nrxq_vi > 0);
 		MPASS(iaq->ntxq_vi > 0);
 	}
 	MPASS(iaq->nirq > 0);
 	MPASS(iaq->nrxq > 0);
 	MPASS(iaq->ntxq > 0);
 	if (itype == INTR_MSI) {
 		MPASS(powerof2(iaq->nirq));
 	}
 }
 
 static int
 cfg_itype_and_nqueues(struct adapter *sc, struct intrs_and_queues *iaq)
 {
 	int rc, itype, navail, nalloc;
 
 	for (itype = INTR_MSIX; itype; itype >>= 1) {
 
 		if ((itype & t4_intr_types) == 0)
 			continue;	/* not allowed */
 
 		if (itype == INTR_MSIX)
 			navail = pci_msix_count(sc->dev);
 		else if (itype == INTR_MSI)
 			navail = pci_msi_count(sc->dev);
 		else
 			navail = 1;
 restart:
 		if (navail == 0)
 			continue;
 
 		calculate_iaq(sc, iaq, itype, navail);
 		nalloc = iaq->nirq;
 		rc = 0;
 		if (itype == INTR_MSIX)
 			rc = pci_alloc_msix(sc->dev, &nalloc);
 		else if (itype == INTR_MSI)
 			rc = pci_alloc_msi(sc->dev, &nalloc);
 
 		if (rc == 0 && nalloc > 0) {
 			if (nalloc == iaq->nirq)
 				return (0);
 
 			/*
 			 * Didn't get the number requested.  Use whatever number
 			 * the kernel is willing to allocate.
 			 */
 			device_printf(sc->dev, "fewer vectors than requested, "
 			    "type=%d, req=%d, rcvd=%d; will downshift req.\n",
 			    itype, iaq->nirq, nalloc);
 			pci_release_msi(sc->dev);
 			navail = nalloc;
 			goto restart;
 		}
 
 		device_printf(sc->dev,
 		    "failed to allocate vectors:%d, type=%d, req=%d, rcvd=%d\n",
 		    itype, rc, iaq->nirq, nalloc);
 	}
 
 	device_printf(sc->dev,
 	    "failed to find a usable interrupt type.  "
 	    "allowed=%d, msi-x=%d, msi=%d, intx=1", t4_intr_types,
 	    pci_msix_count(sc->dev), pci_msi_count(sc->dev));
 
 	return (ENXIO);
 }
 
 #define FW_VERSION(chip) ( \
     V_FW_HDR_FW_VER_MAJOR(chip##FW_VERSION_MAJOR) | \
     V_FW_HDR_FW_VER_MINOR(chip##FW_VERSION_MINOR) | \
     V_FW_HDR_FW_VER_MICRO(chip##FW_VERSION_MICRO) | \
     V_FW_HDR_FW_VER_BUILD(chip##FW_VERSION_BUILD))
 #define FW_INTFVER(chip, intf) (chip##FW_HDR_INTFVER_##intf)
 
 /* Just enough of fw_hdr to cover all version info. */
 struct fw_h {
 	__u8	ver;
 	__u8	chip;
 	__be16	len512;
 	__be32	fw_ver;
 	__be32	tp_microcode_ver;
 	__u8	intfver_nic;
 	__u8	intfver_vnic;
 	__u8	intfver_ofld;
 	__u8	intfver_ri;
 	__u8	intfver_iscsipdu;
 	__u8	intfver_iscsi;
 	__u8	intfver_fcoepdu;
 	__u8	intfver_fcoe;
 };
 /* Spot check a couple of fields. */
 CTASSERT(offsetof(struct fw_h, fw_ver) == offsetof(struct fw_hdr, fw_ver));
 CTASSERT(offsetof(struct fw_h, intfver_nic) == offsetof(struct fw_hdr, intfver_nic));
 CTASSERT(offsetof(struct fw_h, intfver_fcoe) == offsetof(struct fw_hdr, intfver_fcoe));
 
 struct fw_info {
 	uint8_t chip;
 	char *kld_name;
 	char *fw_mod_name;
 	struct fw_h fw_h;
 } fw_info[] = {
 	{
 		.chip = CHELSIO_T4,
 		.kld_name = "t4fw_cfg",
 		.fw_mod_name = "t4fw",
 		.fw_h = {
 			.chip = FW_HDR_CHIP_T4,
 			.fw_ver = htobe32(FW_VERSION(T4)),
 			.intfver_nic = FW_INTFVER(T4, NIC),
 			.intfver_vnic = FW_INTFVER(T4, VNIC),
 			.intfver_ofld = FW_INTFVER(T4, OFLD),
 			.intfver_ri = FW_INTFVER(T4, RI),
 			.intfver_iscsipdu = FW_INTFVER(T4, ISCSIPDU),
 			.intfver_iscsi = FW_INTFVER(T4, ISCSI),
 			.intfver_fcoepdu = FW_INTFVER(T4, FCOEPDU),
 			.intfver_fcoe = FW_INTFVER(T4, FCOE),
 		},
 	}, {
 		.chip = CHELSIO_T5,
 		.kld_name = "t5fw_cfg",
 		.fw_mod_name = "t5fw",
 		.fw_h = {
 			.chip = FW_HDR_CHIP_T5,
 			.fw_ver = htobe32(FW_VERSION(T5)),
 			.intfver_nic = FW_INTFVER(T5, NIC),
 			.intfver_vnic = FW_INTFVER(T5, VNIC),
 			.intfver_ofld = FW_INTFVER(T5, OFLD),
 			.intfver_ri = FW_INTFVER(T5, RI),
 			.intfver_iscsipdu = FW_INTFVER(T5, ISCSIPDU),
 			.intfver_iscsi = FW_INTFVER(T5, ISCSI),
 			.intfver_fcoepdu = FW_INTFVER(T5, FCOEPDU),
 			.intfver_fcoe = FW_INTFVER(T5, FCOE),
 		},
 	}, {
 		.chip = CHELSIO_T6,
 		.kld_name = "t6fw_cfg",
 		.fw_mod_name = "t6fw",
 		.fw_h = {
 			.chip = FW_HDR_CHIP_T6,
 			.fw_ver = htobe32(FW_VERSION(T6)),
 			.intfver_nic = FW_INTFVER(T6, NIC),
 			.intfver_vnic = FW_INTFVER(T6, VNIC),
 			.intfver_ofld = FW_INTFVER(T6, OFLD),
 			.intfver_ri = FW_INTFVER(T6, RI),
 			.intfver_iscsipdu = FW_INTFVER(T6, ISCSIPDU),
 			.intfver_iscsi = FW_INTFVER(T6, ISCSI),
 			.intfver_fcoepdu = FW_INTFVER(T6, FCOEPDU),
 			.intfver_fcoe = FW_INTFVER(T6, FCOE),
 		},
 	}
 };
 
 static struct fw_info *
 find_fw_info(int chip)
 {
 	int i;
 
 	for (i = 0; i < nitems(fw_info); i++) {
 		if (fw_info[i].chip == chip)
 			return (&fw_info[i]);
 	}
 	return (NULL);
 }
 
 /*
  * Is the given firmware API compatible with the one the driver was compiled
  * with?
  */
 static int
 fw_compatible(const struct fw_h *hdr1, const struct fw_h *hdr2)
 {
 
 	/* short circuit if it's the exact same firmware version */
 	if (hdr1->chip == hdr2->chip && hdr1->fw_ver == hdr2->fw_ver)
 		return (1);
 
 	/*
 	 * XXX: Is this too conservative?  Perhaps I should limit this to the
 	 * features that are supported in the driver.
 	 */
 #define SAME_INTF(x) (hdr1->intfver_##x == hdr2->intfver_##x)
 	if (hdr1->chip == hdr2->chip && SAME_INTF(nic) && SAME_INTF(vnic) &&
 	    SAME_INTF(ofld) && SAME_INTF(ri) && SAME_INTF(iscsipdu) &&
 	    SAME_INTF(iscsi) && SAME_INTF(fcoepdu) && SAME_INTF(fcoe))
 		return (1);
 #undef SAME_INTF
 
 	return (0);
 }
 
 static int
 load_fw_module(struct adapter *sc, const struct firmware **dcfg,
     const struct firmware **fw)
 {
 	struct fw_info *fw_info;
 
 	*dcfg = NULL;
 	if (fw != NULL)
 		*fw = NULL;
 
 	fw_info = find_fw_info(chip_id(sc));
 	if (fw_info == NULL) {
 		device_printf(sc->dev,
 		    "unable to look up firmware information for chip %d.\n",
 		    chip_id(sc));
 		return (EINVAL);
 	}
 
 	*dcfg = firmware_get(fw_info->kld_name);
 	if (*dcfg != NULL) {
 		if (fw != NULL)
 			*fw = firmware_get(fw_info->fw_mod_name);
 		return (0);
 	}
 
 	return (ENOENT);
 }
 
 static void
 unload_fw_module(struct adapter *sc, const struct firmware *dcfg,
     const struct firmware *fw)
 {
 
 	if (fw != NULL)
 		firmware_put(fw, FIRMWARE_UNLOAD);
 	if (dcfg != NULL)
 		firmware_put(dcfg, FIRMWARE_UNLOAD);
 }
 
 /*
  * Return values:
  * 0 means no firmware install attempted.
  * ERESTART means a firmware install was attempted and was successful.
  * +ve errno means a firmware install was attempted but failed.
  */
 static int
 install_kld_firmware(struct adapter *sc, struct fw_h *card_fw,
     const struct fw_h *drv_fw, const char *reason, int *already)
 {
 	const struct firmware *cfg, *fw;
 	const uint32_t c = be32toh(card_fw->fw_ver);
 	uint32_t d, k;
 	int rc, fw_install;
 	struct fw_h bundled_fw;
 	bool load_attempted;
 
 	cfg = fw = NULL;
 	load_attempted = false;
 	fw_install = t4_fw_install < 0 ? -t4_fw_install : t4_fw_install;
 
 	memcpy(&bundled_fw, drv_fw, sizeof(bundled_fw));
 	if (t4_fw_install < 0) {
 		rc = load_fw_module(sc, &cfg, &fw);
 		if (rc != 0 || fw == NULL) {
 			device_printf(sc->dev,
 			    "failed to load firmware module: %d. cfg %p, fw %p;"
 			    " will use compiled-in firmware version for"
 			    "hw.cxgbe.fw_install checks.\n",
 			    rc, cfg, fw);
 		} else {
 			memcpy(&bundled_fw, fw->data, sizeof(bundled_fw));
 		}
 		load_attempted = true;
 	}
 	d = be32toh(bundled_fw.fw_ver);
 
 	if (reason != NULL)
 		goto install;
 
 	if ((sc->flags & FW_OK) == 0) {
 
 		if (c == 0xffffffff) {
 			reason = "missing";
 			goto install;
 		}
 
 		rc = 0;
 		goto done;
 	}
 
 	if (!fw_compatible(card_fw, &bundled_fw)) {
 		reason = "incompatible or unusable";
 		goto install;
 	}
 
 	if (d > c) {
 		reason = "older than the version bundled with this driver";
 		goto install;
 	}
 
 	if (fw_install == 2 && d != c) {
 		reason = "different than the version bundled with this driver";
 		goto install;
 	}
 
 	/* No reason to do anything to the firmware already on the card. */
 	rc = 0;
 	goto done;
 
 install:
 	rc = 0;
 	if ((*already)++)
 		goto done;
 
 	if (fw_install == 0) {
 		device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, "
 		    "but the driver is prohibited from installing a firmware "
 		    "on the card.\n",
 		    G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c),
 		    G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason);
 
 		goto done;
 	}
 
 	/*
 	 * We'll attempt to install a firmware.  Load the module first (if it
 	 * hasn't been loaded already).
 	 */
 	if (!load_attempted) {
 		rc = load_fw_module(sc, &cfg, &fw);
 		if (rc != 0 || fw == NULL) {
 			device_printf(sc->dev,
 			    "failed to load firmware module: %d. cfg %p, fw %p\n",
 			    rc, cfg, fw);
 			/* carry on */
 		}
 	}
 	if (fw == NULL) {
 		device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, "
 		    "but the driver cannot take corrective action because it "
 		    "is unable to load the firmware module.\n",
 		    G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c),
 		    G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason);
 		rc = sc->flags & FW_OK ? 0 : ENOENT;
 		goto done;
 	}
 	k = be32toh(((const struct fw_hdr *)fw->data)->fw_ver);
 	if (k != d) {
 		MPASS(t4_fw_install > 0);
 		device_printf(sc->dev,
 		    "firmware in KLD (%u.%u.%u.%u) is not what the driver was "
 		    "expecting (%u.%u.%u.%u) and will not be used.\n",
 		    G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k),
 		    G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k),
 		    G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d),
 		    G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d));
 		rc = sc->flags & FW_OK ? 0 : EINVAL;
 		goto done;
 	}
 
 	device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, "
 	    "installing firmware %u.%u.%u.%u on card.\n",
 	    G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c),
 	    G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason,
 	    G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d),
 	    G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d));
 
 	rc = -t4_fw_upgrade(sc, sc->mbox, fw->data, fw->datasize, 0);
 	if (rc != 0) {
 		device_printf(sc->dev, "failed to install firmware: %d\n", rc);
 	} else {
 		/* Installed successfully, update the cached header too. */
 		rc = ERESTART;
 		memcpy(card_fw, fw->data, sizeof(*card_fw));
 	}
 done:
 	unload_fw_module(sc, cfg, fw);
 
 	return (rc);
 }
 
 /*
  * Establish contact with the firmware and attempt to become the master driver.
  *
  * A firmware will be installed to the card if needed (if the driver is allowed
  * to do so).
  */
 static int
 contact_firmware(struct adapter *sc)
 {
 	int rc, already = 0;
 	enum dev_state state;
 	struct fw_info *fw_info;
 	struct fw_hdr *card_fw;		/* fw on the card */
 	const struct fw_h *drv_fw;
 
 	fw_info = find_fw_info(chip_id(sc));
 	if (fw_info == NULL) {
 		device_printf(sc->dev,
 		    "unable to look up firmware information for chip %d.\n",
 		    chip_id(sc));
 		return (EINVAL);
 	}
 	drv_fw = &fw_info->fw_h;
 
 	/* Read the header of the firmware on the card */
 	card_fw = malloc(sizeof(*card_fw), M_CXGBE, M_ZERO | M_WAITOK);
 restart:
 	rc = -t4_get_fw_hdr(sc, card_fw);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "unable to read firmware header from card's flash: %d\n",
 		    rc);
 		goto done;
 	}
 
 	rc = install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw, NULL,
 	    &already);
 	if (rc == ERESTART)
 		goto restart;
 	if (rc != 0)
 		goto done;
 
 	rc = t4_fw_hello(sc, sc->mbox, sc->mbox, MASTER_MAY, &state);
 	if (rc < 0 || state == DEV_STATE_ERR) {
 		rc = -rc;
 		device_printf(sc->dev,
 		    "failed to connect to the firmware: %d, %d.  "
 		    "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW));
 #if 0
 		if (install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw,
 		    "not responding properly to HELLO", &already) == ERESTART)
 			goto restart;
 #endif
 		goto done;
 	}
 	MPASS(be32toh(card_fw->flags) & FW_HDR_FLAGS_RESET_HALT);
 	sc->flags |= FW_OK;	/* The firmware responded to the FW_HELLO. */
 
 	if (rc == sc->pf) {
 		sc->flags |= MASTER_PF;
 		rc = install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw,
 		    NULL, &already);
 		if (rc == ERESTART)
 			rc = 0;
 		else if (rc != 0)
 			goto done;
 	} else if (state == DEV_STATE_UNINIT) {
 		/*
 		 * We didn't get to be the master so we definitely won't be
 		 * configuring the chip.  It's a bug if someone else hasn't
 		 * configured it already.
 		 */
 		device_printf(sc->dev, "couldn't be master(%d), "
 		    "device not already initialized either(%d).  "
 		    "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW));
 		rc = EPROTO;
 		goto done;
 	} else {
 		/*
 		 * Some other PF is the master and has configured the chip.
 		 * This is allowed but untested.
 		 */
 		device_printf(sc->dev, "PF%d is master, device state %d.  "
 		    "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW));
 		snprintf(sc->cfg_file, sizeof(sc->cfg_file), "pf%d", rc);
 		sc->cfcsum = 0;
 		rc = 0;
 	}
 done:
 	if (rc != 0 && sc->flags & FW_OK) {
 		t4_fw_bye(sc, sc->mbox);
 		sc->flags &= ~FW_OK;
 	}
 	free(card_fw, M_CXGBE);
 	return (rc);
 }
 
 static int
 copy_cfg_file_to_card(struct adapter *sc, char *cfg_file,
     uint32_t mtype, uint32_t moff)
 {
 	struct fw_info *fw_info;
 	const struct firmware *dcfg, *rcfg = NULL;
 	const uint32_t *cfdata;
 	uint32_t cflen, addr;
 	int rc;
 
 	load_fw_module(sc, &dcfg, NULL);
 
 	/* Card specific interpretation of "default". */
 	if (strncmp(cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) {
 		if (pci_get_device(sc->dev) == 0x440a)
 			snprintf(cfg_file, sizeof(t4_cfg_file), UWIRE_CF);
 		if (is_fpga(sc))
 			snprintf(cfg_file, sizeof(t4_cfg_file), FPGA_CF);
 	}
 
 	if (strncmp(cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) {
 		if (dcfg == NULL) {
 			device_printf(sc->dev,
 			    "KLD with default config is not available.\n");
 			rc = ENOENT;
 			goto done;
 		}
 		cfdata = dcfg->data;
 		cflen = dcfg->datasize & ~3;
 	} else {
 		char s[32];
 
 		fw_info = find_fw_info(chip_id(sc));
 		if (fw_info == NULL) {
 			device_printf(sc->dev,
 			    "unable to look up firmware information for chip %d.\n",
 			    chip_id(sc));
 			rc = EINVAL;
 			goto done;
 		}
 		snprintf(s, sizeof(s), "%s_%s", fw_info->kld_name, cfg_file);
 
 		rcfg = firmware_get(s);
 		if (rcfg == NULL) {
 			device_printf(sc->dev,
 			    "unable to load module \"%s\" for configuration "
 			    "profile \"%s\".\n", s, cfg_file);
 			rc = ENOENT;
 			goto done;
 		}
 		cfdata = rcfg->data;
 		cflen = rcfg->datasize & ~3;
 	}
 
 	if (cflen > FLASH_CFG_MAX_SIZE) {
 		device_printf(sc->dev,
 		    "config file too long (%d, max allowed is %d).\n",
 		    cflen, FLASH_CFG_MAX_SIZE);
 		rc = EINVAL;
 		goto done;
 	}
 
 	rc = validate_mt_off_len(sc, mtype, moff, cflen, &addr);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "%s: addr (%d/0x%x) or len %d is not valid: %d.\n",
 		    __func__, mtype, moff, cflen, rc);
 		rc = EINVAL;
 		goto done;
 	}
 	write_via_memwin(sc, 2, addr, cfdata, cflen);
 done:
 	if (rcfg != NULL)
 		firmware_put(rcfg, FIRMWARE_UNLOAD);
 	unload_fw_module(sc, dcfg, NULL);
 	return (rc);
 }
 
 struct caps_allowed {
 	uint16_t nbmcaps;
 	uint16_t linkcaps;
 	uint16_t switchcaps;
 	uint16_t niccaps;
 	uint16_t toecaps;
 	uint16_t rdmacaps;
 	uint16_t cryptocaps;
 	uint16_t iscsicaps;
 	uint16_t fcoecaps;
 };
 
 #define FW_PARAM_DEV(param) \
 	(V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | \
 	 V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_##param))
 #define FW_PARAM_PFVF(param) \
 	(V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_PFVF) | \
 	 V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_PFVF_##param))
 
 /*
  * Provide a configuration profile to the firmware and have it initialize the
  * chip accordingly.  This may involve uploading a configuration file to the
  * card.
  */
 static int
 apply_cfg_and_initialize(struct adapter *sc, char *cfg_file,
     const struct caps_allowed *caps_allowed)
 {
 	int rc;
 	struct fw_caps_config_cmd caps;
 	uint32_t mtype, moff, finicsum, cfcsum, param, val;
 
 	rc = -t4_fw_reset(sc, sc->mbox, F_PIORSTMODE | F_PIORST);
 	if (rc != 0) {
 		device_printf(sc->dev, "firmware reset failed: %d.\n", rc);
 		return (rc);
 	}
 
 	bzero(&caps, sizeof(caps));
 	caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_READ);
 	if (strncmp(cfg_file, BUILTIN_CF, sizeof(t4_cfg_file)) == 0) {
 		mtype = 0;
 		moff = 0;
 		caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps));
 	} else if (strncmp(cfg_file, FLASH_CF, sizeof(t4_cfg_file)) == 0) {
 		mtype = FW_MEMTYPE_FLASH;
 		moff = t4_flash_cfg_addr(sc);
 		caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID |
 		    V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) |
 		    V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) |
 		    FW_LEN16(caps));
 	} else {
 		/*
 		 * Ask the firmware where it wants us to upload the config file.
 		 */
 		param = FW_PARAM_DEV(CF);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		if (rc != 0) {
 			/* No support for config file?  Shouldn't happen. */
 			device_printf(sc->dev,
 			    "failed to query config file location: %d.\n", rc);
 			goto done;
 		}
 		mtype = G_FW_PARAMS_PARAM_Y(val);
 		moff = G_FW_PARAMS_PARAM_Z(val) << 16;
 		caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID |
 		    V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) |
 		    V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) |
 		    FW_LEN16(caps));
 
 		rc = copy_cfg_file_to_card(sc, cfg_file, mtype, moff);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to upload config file to card: %d.\n", rc);
 			goto done;
 		}
 	}
 	rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps);
 	if (rc != 0) {
 		device_printf(sc->dev, "failed to pre-process config file: %d "
 		    "(mtype %d, moff 0x%x).\n", rc, mtype, moff);
 		goto done;
 	}
 
 	finicsum = be32toh(caps.finicsum);
 	cfcsum = be32toh(caps.cfcsum);	/* actual */
 	if (finicsum != cfcsum) {
 		device_printf(sc->dev,
 		    "WARNING: config file checksum mismatch: %08x %08x\n",
 		    finicsum, cfcsum);
 	}
 	sc->cfcsum = cfcsum;
 	snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", cfg_file);
 
 	/*
 	 * Let the firmware know what features will (not) be used so it can tune
 	 * things accordingly.
 	 */
 #define LIMIT_CAPS(x) do { \
 	caps.x##caps &= htobe16(caps_allowed->x##caps); \
 } while (0)
 	LIMIT_CAPS(nbm);
 	LIMIT_CAPS(link);
 	LIMIT_CAPS(switch);
 	LIMIT_CAPS(nic);
 	LIMIT_CAPS(toe);
 	LIMIT_CAPS(rdma);
 	LIMIT_CAPS(crypto);
 	LIMIT_CAPS(iscsi);
 	LIMIT_CAPS(fcoe);
 #undef LIMIT_CAPS
 	if (caps.niccaps & htobe16(FW_CAPS_CONFIG_NIC_HASHFILTER)) {
 		/*
 		 * TOE and hashfilters are mutually exclusive.  It is a config
 		 * file or firmware bug if both are reported as available.  Try
 		 * to cope with the situation in non-debug builds by disabling
 		 * TOE.
 		 */
 		MPASS(caps.toecaps == 0);
 
 		caps.toecaps = 0;
 		caps.rdmacaps = 0;
 		caps.iscsicaps = 0;
 	}
 
 	caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_WRITE);
 	caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps));
 	rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), NULL);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to process config file: %d.\n", rc);
 		goto done;
 	}
 
 	t4_tweak_chip_settings(sc);
 	set_params__pre_init(sc);
 
 	/* get basic stuff going */
 	rc = -t4_fw_initialize(sc, sc->mbox);
 	if (rc != 0) {
 		device_printf(sc->dev, "fw_initialize failed: %d.\n", rc);
 		goto done;
 	}
 done:
 	return (rc);
 }
 
 /*
  * Partition chip resources for use between various PFs, VFs, etc.
  */
 static int
 partition_resources(struct adapter *sc)
 {
 	char cfg_file[sizeof(t4_cfg_file)];
 	struct caps_allowed caps_allowed;
 	int rc;
 	bool fallback;
 
 	/* Only the master driver gets to configure the chip resources. */
 	MPASS(sc->flags & MASTER_PF);
 
 #define COPY_CAPS(x) do { \
 	caps_allowed.x##caps = t4_##x##caps_allowed; \
 } while (0)
 	bzero(&caps_allowed, sizeof(caps_allowed));
 	COPY_CAPS(nbm);
 	COPY_CAPS(link);
 	COPY_CAPS(switch);
 	COPY_CAPS(nic);
 	COPY_CAPS(toe);
 	COPY_CAPS(rdma);
 	COPY_CAPS(crypto);
 	COPY_CAPS(iscsi);
 	COPY_CAPS(fcoe);
 	fallback = sc->debug_flags & DF_DISABLE_CFG_RETRY ? false : true;
 	snprintf(cfg_file, sizeof(cfg_file), "%s", t4_cfg_file);
 retry:
 	rc = apply_cfg_and_initialize(sc, cfg_file, &caps_allowed);
 	if (rc != 0 && fallback) {
 		device_printf(sc->dev,
 		    "failed (%d) to configure card with \"%s\" profile, "
 		    "will fall back to a basic configuration and retry.\n",
 		    rc, cfg_file);
 		snprintf(cfg_file, sizeof(cfg_file), "%s", BUILTIN_CF);
 		bzero(&caps_allowed, sizeof(caps_allowed));
 		COPY_CAPS(nbm);
 		COPY_CAPS(link);
 		COPY_CAPS(switch);
 		COPY_CAPS(nic);
 		fallback = false;
 		goto retry;
 	}
 #undef COPY_CAPS
 	return (rc);
 }
 
 /*
  * Retrieve parameters that are needed (or nice to have) very early.
  */
 static int
 get_params__pre_init(struct adapter *sc)
 {
 	int rc;
 	uint32_t param[2], val[2];
 
 	t4_get_version_info(sc);
 
 	snprintf(sc->fw_version, sizeof(sc->fw_version), "%u.%u.%u.%u",
 	    G_FW_HDR_FW_VER_MAJOR(sc->params.fw_vers),
 	    G_FW_HDR_FW_VER_MINOR(sc->params.fw_vers),
 	    G_FW_HDR_FW_VER_MICRO(sc->params.fw_vers),
 	    G_FW_HDR_FW_VER_BUILD(sc->params.fw_vers));
 
 	snprintf(sc->bs_version, sizeof(sc->bs_version), "%u.%u.%u.%u",
 	    G_FW_HDR_FW_VER_MAJOR(sc->params.bs_vers),
 	    G_FW_HDR_FW_VER_MINOR(sc->params.bs_vers),
 	    G_FW_HDR_FW_VER_MICRO(sc->params.bs_vers),
 	    G_FW_HDR_FW_VER_BUILD(sc->params.bs_vers));
 
 	snprintf(sc->tp_version, sizeof(sc->tp_version), "%u.%u.%u.%u",
 	    G_FW_HDR_FW_VER_MAJOR(sc->params.tp_vers),
 	    G_FW_HDR_FW_VER_MINOR(sc->params.tp_vers),
 	    G_FW_HDR_FW_VER_MICRO(sc->params.tp_vers),
 	    G_FW_HDR_FW_VER_BUILD(sc->params.tp_vers));
 
 	snprintf(sc->er_version, sizeof(sc->er_version), "%u.%u.%u.%u",
 	    G_FW_HDR_FW_VER_MAJOR(sc->params.er_vers),
 	    G_FW_HDR_FW_VER_MINOR(sc->params.er_vers),
 	    G_FW_HDR_FW_VER_MICRO(sc->params.er_vers),
 	    G_FW_HDR_FW_VER_BUILD(sc->params.er_vers));
 
 	param[0] = FW_PARAM_DEV(PORTVEC);
 	param[1] = FW_PARAM_DEV(CCLK);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to query parameters (pre_init): %d.\n", rc);
 		return (rc);
 	}
 
 	sc->params.portvec = val[0];
 	sc->params.nports = bitcount32(val[0]);
 	sc->params.vpd.cclk = val[1];
 
 	/* Read device log parameters. */
 	rc = -t4_init_devlog_params(sc, 1);
 	if (rc == 0)
 		fixup_devlog_params(sc);
 	else {
 		device_printf(sc->dev,
 		    "failed to get devlog parameters: %d.\n", rc);
 		rc = 0;	/* devlog isn't critical for device operation */
 	}
 
 	return (rc);
 }
 
 /*
  * Any params that need to be set before FW_INITIALIZE.
  */
 static int
 set_params__pre_init(struct adapter *sc)
 {
 	int rc = 0;
 	uint32_t param, val;
 
 	if (chip_id(sc) >= CHELSIO_T6) {
 		param = FW_PARAM_DEV(HPFILTER_REGION_SUPPORT);
 		val = 1;
 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		/* firmwares < 1.20.1.0 do not have this param. */
 		if (rc == FW_EINVAL && sc->params.fw_vers <
 		    (V_FW_HDR_FW_VER_MAJOR(1) | V_FW_HDR_FW_VER_MINOR(20) |
 		    V_FW_HDR_FW_VER_MICRO(1) | V_FW_HDR_FW_VER_BUILD(0))) {
 			rc = 0;
 		}
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to enable high priority filters :%d.\n",
 			    rc);
 		}
 	}
 
 	/* Enable opaque VIIDs with firmwares that support it. */
 	param = FW_PARAM_DEV(OPAQUE_VIID_SMT_EXTN);
 	val = 1;
 	rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 	if (rc == 0 && val == 1)
 		sc->params.viid_smt_extn_support = true;
 	else
 		sc->params.viid_smt_extn_support = false;
 
 	return (rc);
 }
 
 /*
  * Retrieve various parameters that are of interest to the driver.  The device
  * has been initialized by the firmware at this point.
  */
 static int
 get_params__post_init(struct adapter *sc)
 {
 	int rc;
 	uint32_t param[7], val[7];
 	struct fw_caps_config_cmd caps;
 
 	param[0] = FW_PARAM_PFVF(IQFLINT_START);
 	param[1] = FW_PARAM_PFVF(EQ_START);
 	param[2] = FW_PARAM_PFVF(FILTER_START);
 	param[3] = FW_PARAM_PFVF(FILTER_END);
 	param[4] = FW_PARAM_PFVF(L2T_START);
 	param[5] = FW_PARAM_PFVF(L2T_END);
 	param[6] = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) |
 	    V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_VDD);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 7, param, val);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to query parameters (post_init): %d.\n", rc);
 		return (rc);
 	}
 
 	sc->sge.iq_start = val[0];
 	sc->sge.eq_start = val[1];
 	if ((int)val[3] > (int)val[2]) {
 		sc->tids.ftid_base = val[2];
 		sc->tids.ftid_end = val[3];
 		sc->tids.nftids = val[3] - val[2] + 1;
 	}
 	sc->vres.l2t.start = val[4];
 	sc->vres.l2t.size = val[5] - val[4] + 1;
 	KASSERT(sc->vres.l2t.size <= L2T_SIZE,
 	    ("%s: L2 table size (%u) larger than expected (%u)",
 	    __func__, sc->vres.l2t.size, L2T_SIZE));
 	sc->params.core_vdd = val[6];
 
 	if (chip_id(sc) >= CHELSIO_T6) {
 
 		sc->tids.tid_base = t4_read_reg(sc,
 		    A_LE_DB_ACTIVE_TABLE_START_INDEX);
 
 		param[0] = FW_PARAM_PFVF(HPFILTER_START);
 		param[1] = FW_PARAM_PFVF(HPFILTER_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			   "failed to query hpfilter parameters: %d.\n", rc);
 			return (rc);
 		}
 		if ((int)val[1] > (int)val[0]) {
 			sc->tids.hpftid_base = val[0];
 			sc->tids.hpftid_end = val[1];
 			sc->tids.nhpftids = val[1] - val[0] + 1;
 
 			/*
 			 * These should go off if the layout changes and the
 			 * driver needs to catch up.
 			 */
 			MPASS(sc->tids.hpftid_base == 0);
 			MPASS(sc->tids.tid_base == sc->tids.nhpftids);
 		}
 	}
 
 	/*
 	 * MPSBGMAP is queried separately because only recent firmwares support
 	 * it as a parameter and we don't want the compound query above to fail
 	 * on older firmwares.
 	 */
 	param[0] = FW_PARAM_DEV(MPSBGMAP);
 	val[0] = 0;
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 	if (rc == 0)
 		sc->params.mps_bg_map = val[0];
 	else
 		sc->params.mps_bg_map = 0;
 
 	/*
 	 * Determine whether the firmware supports the filter2 work request.
 	 * This is queried separately for the same reason as MPSBGMAP above.
 	 */
 	param[0] = FW_PARAM_DEV(FILTER2_WR);
 	val[0] = 0;
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 	if (rc == 0)
 		sc->params.filter2_wr_support = val[0] != 0;
 	else
 		sc->params.filter2_wr_support = 0;
 
 	/*
 	 * Find out whether we're allowed to use the ULPTX MEMWRITE DSGL.
 	 * This is queried separately for the same reason as other params above.
 	 */
 	param[0] = FW_PARAM_DEV(ULPTX_MEMWRITE_DSGL);
 	val[0] = 0;
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 	if (rc == 0)
 		sc->params.ulptx_memwrite_dsgl = val[0] != 0;
 	else
 		sc->params.ulptx_memwrite_dsgl = false;
 
 	/* get capabilites */
 	bzero(&caps, sizeof(caps));
 	caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_READ);
 	caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps));
 	rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to get card capabilities: %d.\n", rc);
 		return (rc);
 	}
 
 #define READ_CAPS(x) do { \
 	sc->x = htobe16(caps.x); \
 } while (0)
 	READ_CAPS(nbmcaps);
 	READ_CAPS(linkcaps);
 	READ_CAPS(switchcaps);
 	READ_CAPS(niccaps);
 	READ_CAPS(toecaps);
 	READ_CAPS(rdmacaps);
 	READ_CAPS(cryptocaps);
 	READ_CAPS(iscsicaps);
 	READ_CAPS(fcoecaps);
 
 	if (sc->niccaps & FW_CAPS_CONFIG_NIC_HASHFILTER) {
 		MPASS(chip_id(sc) > CHELSIO_T4);
 		MPASS(sc->toecaps == 0);
 		sc->toecaps = 0;
 
 		param[0] = FW_PARAM_DEV(NTID);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query HASHFILTER parameters: %d.\n", rc);
 			return (rc);
 		}
 		sc->tids.ntids = val[0];
 		if (sc->params.fw_vers <
 		    (V_FW_HDR_FW_VER_MAJOR(1) | V_FW_HDR_FW_VER_MINOR(20) |
 		    V_FW_HDR_FW_VER_MICRO(5) | V_FW_HDR_FW_VER_BUILD(0))) {
 			MPASS(sc->tids.ntids >= sc->tids.nhpftids);
 			sc->tids.ntids -= sc->tids.nhpftids;
 		}
 		sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS);
 		sc->params.hash_filter = 1;
 	}
 	if (sc->niccaps & FW_CAPS_CONFIG_NIC_ETHOFLD) {
 		param[0] = FW_PARAM_PFVF(ETHOFLD_START);
 		param[1] = FW_PARAM_PFVF(ETHOFLD_END);
 		param[2] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 3, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query NIC parameters: %d.\n", rc);
 			return (rc);
 		}
 		if ((int)val[1] > (int)val[0]) {
 			sc->tids.etid_base = val[0];
 			sc->tids.etid_end = val[1];
 			sc->tids.netids = val[1] - val[0] + 1;
 			sc->params.eo_wr_cred = val[2];
 			sc->params.ethoffload = 1;
 		}
 	}
 	if (sc->toecaps) {
 		/* query offload-related parameters */
 		param[0] = FW_PARAM_DEV(NTID);
 		param[1] = FW_PARAM_PFVF(SERVER_START);
 		param[2] = FW_PARAM_PFVF(SERVER_END);
 		param[3] = FW_PARAM_PFVF(TDDP_START);
 		param[4] = FW_PARAM_PFVF(TDDP_END);
 		param[5] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query TOE parameters: %d.\n", rc);
 			return (rc);
 		}
 		sc->tids.ntids = val[0];
 		if (sc->params.fw_vers <
 		    (V_FW_HDR_FW_VER_MAJOR(1) | V_FW_HDR_FW_VER_MINOR(20) |
 		    V_FW_HDR_FW_VER_MICRO(5) | V_FW_HDR_FW_VER_BUILD(0))) {
 			MPASS(sc->tids.ntids >= sc->tids.nhpftids);
 			sc->tids.ntids -= sc->tids.nhpftids;
 		}
 		sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS);
 		if ((int)val[2] > (int)val[1]) {
 			sc->tids.stid_base = val[1];
 			sc->tids.nstids = val[2] - val[1] + 1;
 		}
 		sc->vres.ddp.start = val[3];
 		sc->vres.ddp.size = val[4] - val[3] + 1;
 		sc->params.ofldq_wr_cred = val[5];
 		sc->params.offload = 1;
 	} else {
 		/*
 		 * The firmware attempts memfree TOE configuration for -SO cards
 		 * and will report toecaps=0 if it runs out of resources (this
 		 * depends on the config file).  It may not report 0 for other
 		 * capabilities dependent on the TOE in this case.  Set them to
 		 * 0 here so that the driver doesn't bother tracking resources
 		 * that will never be used.
 		 */
 		sc->iscsicaps = 0;
 		sc->rdmacaps = 0;
 	}
 	if (sc->rdmacaps) {
 		param[0] = FW_PARAM_PFVF(STAG_START);
 		param[1] = FW_PARAM_PFVF(STAG_END);
 		param[2] = FW_PARAM_PFVF(RQ_START);
 		param[3] = FW_PARAM_PFVF(RQ_END);
 		param[4] = FW_PARAM_PFVF(PBL_START);
 		param[5] = FW_PARAM_PFVF(PBL_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query RDMA parameters(1): %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.stag.start = val[0];
 		sc->vres.stag.size = val[1] - val[0] + 1;
 		sc->vres.rq.start = val[2];
 		sc->vres.rq.size = val[3] - val[2] + 1;
 		sc->vres.pbl.start = val[4];
 		sc->vres.pbl.size = val[5] - val[4] + 1;
 
 		param[0] = FW_PARAM_PFVF(SQRQ_START);
 		param[1] = FW_PARAM_PFVF(SQRQ_END);
 		param[2] = FW_PARAM_PFVF(CQ_START);
 		param[3] = FW_PARAM_PFVF(CQ_END);
 		param[4] = FW_PARAM_PFVF(OCQ_START);
 		param[5] = FW_PARAM_PFVF(OCQ_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query RDMA parameters(2): %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.qp.start = val[0];
 		sc->vres.qp.size = val[1] - val[0] + 1;
 		sc->vres.cq.start = val[2];
 		sc->vres.cq.size = val[3] - val[2] + 1;
 		sc->vres.ocq.start = val[4];
 		sc->vres.ocq.size = val[5] - val[4] + 1;
 
 		param[0] = FW_PARAM_PFVF(SRQ_START);
 		param[1] = FW_PARAM_PFVF(SRQ_END);
 		param[2] = FW_PARAM_DEV(MAXORDIRD_QP);
 		param[3] = FW_PARAM_DEV(MAXIRD_ADAPTER);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 4, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query RDMA parameters(3): %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.srq.start = val[0];
 		sc->vres.srq.size = val[1] - val[0] + 1;
 		sc->params.max_ordird_qp = val[2];
 		sc->params.max_ird_adapter = val[3];
 	}
 	if (sc->iscsicaps) {
 		param[0] = FW_PARAM_PFVF(ISCSI_START);
 		param[1] = FW_PARAM_PFVF(ISCSI_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query iSCSI parameters: %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.iscsi.start = val[0];
 		sc->vres.iscsi.size = val[1] - val[0] + 1;
 	}
 	if (sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS) {
 		param[0] = FW_PARAM_PFVF(TLS_START);
 		param[1] = FW_PARAM_PFVF(TLS_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query TLS parameters: %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.key.start = val[0];
 		sc->vres.key.size = val[1] - val[0] + 1;
 	}
 
 	t4_init_sge_params(sc);
 
 	/*
 	 * We've got the params we wanted to query via the firmware.  Now grab
 	 * some others directly from the chip.
 	 */
 	rc = t4_read_chip_settings(sc);
 
 	return (rc);
 }
 
 static int
 set_params__post_init(struct adapter *sc)
 {
 	uint32_t param, val;
 #ifdef TCP_OFFLOAD
 	int i, v, shift;
 #endif
 
 	/* ask for encapsulated CPLs */
 	param = FW_PARAM_PFVF(CPLFW4MSG_ENCAP);
 	val = 1;
 	(void)t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 
 	/* Enable 32b port caps if the firmware supports it. */
 	param = FW_PARAM_PFVF(PORT_CAPS32);
 	val = 1;
 	if (t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val) == 0)
 		sc->params.port_caps32 = 1;
 
 	/* Let filter + maskhash steer to a part of the VI's RSS region. */
 	val = 1 << (G_MASKSIZE(t4_read_reg(sc, A_TP_RSS_CONFIG_TNL)) - 1);
 	t4_set_reg_field(sc, A_TP_RSS_CONFIG_TNL, V_MASKFILTER(M_MASKFILTER),
 	    V_MASKFILTER(val - 1));
 
 #ifdef TCP_OFFLOAD
 	/*
 	 * Override the TOE timers with user provided tunables.  This is not the
 	 * recommended way to change the timers (the firmware config file is) so
 	 * these tunables are not documented.
 	 *
 	 * All the timer tunables are in microseconds.
 	 */
 	if (t4_toe_keepalive_idle != 0) {
 		v = us_to_tcp_ticks(sc, t4_toe_keepalive_idle);
 		v &= M_KEEPALIVEIDLE;
 		t4_set_reg_field(sc, A_TP_KEEP_IDLE,
 		    V_KEEPALIVEIDLE(M_KEEPALIVEIDLE), V_KEEPALIVEIDLE(v));
 	}
 	if (t4_toe_keepalive_interval != 0) {
 		v = us_to_tcp_ticks(sc, t4_toe_keepalive_interval);
 		v &= M_KEEPALIVEINTVL;
 		t4_set_reg_field(sc, A_TP_KEEP_INTVL,
 		    V_KEEPALIVEINTVL(M_KEEPALIVEINTVL), V_KEEPALIVEINTVL(v));
 	}
 	if (t4_toe_keepalive_count != 0) {
 		v = t4_toe_keepalive_count & M_KEEPALIVEMAXR2;
 		t4_set_reg_field(sc, A_TP_SHIFT_CNT,
 		    V_KEEPALIVEMAXR1(M_KEEPALIVEMAXR1) |
 		    V_KEEPALIVEMAXR2(M_KEEPALIVEMAXR2),
 		    V_KEEPALIVEMAXR1(1) | V_KEEPALIVEMAXR2(v));
 	}
 	if (t4_toe_rexmt_min != 0) {
 		v = us_to_tcp_ticks(sc, t4_toe_rexmt_min);
 		v &= M_RXTMIN;
 		t4_set_reg_field(sc, A_TP_RXT_MIN,
 		    V_RXTMIN(M_RXTMIN), V_RXTMIN(v));
 	}
 	if (t4_toe_rexmt_max != 0) {
 		v = us_to_tcp_ticks(sc, t4_toe_rexmt_max);
 		v &= M_RXTMAX;
 		t4_set_reg_field(sc, A_TP_RXT_MAX,
 		    V_RXTMAX(M_RXTMAX), V_RXTMAX(v));
 	}
 	if (t4_toe_rexmt_count != 0) {
 		v = t4_toe_rexmt_count & M_RXTSHIFTMAXR2;
 		t4_set_reg_field(sc, A_TP_SHIFT_CNT,
 		    V_RXTSHIFTMAXR1(M_RXTSHIFTMAXR1) |
 		    V_RXTSHIFTMAXR2(M_RXTSHIFTMAXR2),
 		    V_RXTSHIFTMAXR1(1) | V_RXTSHIFTMAXR2(v));
 	}
 	for (i = 0; i < nitems(t4_toe_rexmt_backoff); i++) {
 		if (t4_toe_rexmt_backoff[i] != -1) {
 			v = t4_toe_rexmt_backoff[i] & M_TIMERBACKOFFINDEX0;
 			shift = (i & 3) << 3;
 			t4_set_reg_field(sc, A_TP_TCP_BACKOFF_REG0 + (i & ~3),
 			    M_TIMERBACKOFFINDEX0 << shift, v << shift);
 		}
 	}
 #endif
 	return (0);
 }
 
 #undef FW_PARAM_PFVF
 #undef FW_PARAM_DEV
 
 static void
 t4_set_desc(struct adapter *sc)
 {
 	char buf[128];
 	struct adapter_params *p = &sc->params;
 
 	snprintf(buf, sizeof(buf), "Chelsio %s", p->vpd.id);
 
 	device_set_desc_copy(sc->dev, buf);
 }
 
 static inline void
 ifmedia_add4(struct ifmedia *ifm, int m)
 {
 
 	ifmedia_add(ifm, m, 0, NULL);
 	ifmedia_add(ifm, m | IFM_ETH_TXPAUSE, 0, NULL);
 	ifmedia_add(ifm, m | IFM_ETH_RXPAUSE, 0, NULL);
 	ifmedia_add(ifm, m | IFM_ETH_TXPAUSE | IFM_ETH_RXPAUSE, 0, NULL);
 }
 
 /*
  * This is the selected media, which is not quite the same as the active media.
  * The media line in ifconfig is "media: Ethernet selected (active)" if selected
  * and active are not the same, and "media: Ethernet selected" otherwise.
  */
 static void
 set_current_media(struct port_info *pi)
 {
 	struct link_config *lc;
 	struct ifmedia *ifm;
 	int mword;
 	u_int speed;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	/* Leave current media alone if it's already set to IFM_NONE. */
 	ifm = &pi->media;
 	if (ifm->ifm_cur != NULL &&
 	    IFM_SUBTYPE(ifm->ifm_cur->ifm_media) == IFM_NONE)
 		return;
 
 	lc = &pi->link_cfg;
 	if (lc->requested_aneg != AUTONEG_DISABLE &&
 	    lc->supported & FW_PORT_CAP32_ANEG) {
 		ifmedia_set(ifm, IFM_ETHER | IFM_AUTO);
 		return;
 	}
 	mword = IFM_ETHER | IFM_FDX;
 	if (lc->requested_fc & PAUSE_TX)
 		mword |= IFM_ETH_TXPAUSE;
 	if (lc->requested_fc & PAUSE_RX)
 		mword |= IFM_ETH_RXPAUSE;
 	if (lc->requested_speed == 0)
 		speed = port_top_speed(pi) * 1000;	/* Gbps -> Mbps */
 	else
 		speed = lc->requested_speed;
 	mword |= port_mword(pi, speed_to_fwcap(speed));
 	ifmedia_set(ifm, mword);
 }
 
 /*
  * Returns true if the ifmedia list for the port cannot change.
  */
 static bool
 fixed_ifmedia(struct port_info *pi)
 {
 
 	return (pi->port_type == FW_PORT_TYPE_BT_SGMII ||
 	    pi->port_type == FW_PORT_TYPE_BT_XFI ||
 	    pi->port_type == FW_PORT_TYPE_BT_XAUI ||
 	    pi->port_type == FW_PORT_TYPE_KX4 ||
 	    pi->port_type == FW_PORT_TYPE_KX ||
 	    pi->port_type == FW_PORT_TYPE_KR ||
 	    pi->port_type == FW_PORT_TYPE_BP_AP ||
 	    pi->port_type == FW_PORT_TYPE_BP4_AP ||
 	    pi->port_type == FW_PORT_TYPE_BP40_BA ||
 	    pi->port_type == FW_PORT_TYPE_KR4_100G ||
 	    pi->port_type == FW_PORT_TYPE_KR_SFP28 ||
 	    pi->port_type == FW_PORT_TYPE_KR_XLAUI);
 }
 
 static void
 build_medialist(struct port_info *pi)
 {
 	uint32_t ss, speed;
 	int unknown, mword, bit;
 	struct link_config *lc;
 	struct ifmedia *ifm;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	if (pi->flags & FIXED_IFMEDIA)
 		return;
 
 	/*
 	 * Rebuild the ifmedia list.
 	 */
 	ifm = &pi->media;
 	ifmedia_removeall(ifm);
 	lc = &pi->link_cfg;
 	ss = G_FW_PORT_CAP32_SPEED(lc->supported); /* Supported Speeds */
 	if (__predict_false(ss == 0)) {	/* not supposed to happen. */
 		MPASS(ss != 0);
 no_media:
 		MPASS(LIST_EMPTY(&ifm->ifm_list));
 		ifmedia_add(ifm, IFM_ETHER | IFM_NONE, 0, NULL);
 		ifmedia_set(ifm, IFM_ETHER | IFM_NONE);
 		return;
 	}
 
 	unknown = 0;
 	for (bit = S_FW_PORT_CAP32_SPEED; bit < fls(ss); bit++) {
 		speed = 1 << bit;
 		MPASS(speed & M_FW_PORT_CAP32_SPEED);
 		if (ss & speed) {
 			mword = port_mword(pi, speed);
 			if (mword == IFM_NONE) {
 				goto no_media;
 			} else if (mword == IFM_UNKNOWN)
 				unknown++;
 			else
 				ifmedia_add4(ifm, IFM_ETHER | IFM_FDX | mword);
 		}
 	}
 	if (unknown > 0) /* Add one unknown for all unknown media types. */
 		ifmedia_add4(ifm, IFM_ETHER | IFM_FDX | IFM_UNKNOWN);
 	if (lc->supported & FW_PORT_CAP32_ANEG)
 		ifmedia_add(ifm, IFM_ETHER | IFM_AUTO, 0, NULL);
 
 	set_current_media(pi);
 }
 
 /*
  * Initialize the requested fields in the link config based on driver tunables.
  */
 static void
 init_link_config(struct port_info *pi)
 {
 	struct link_config *lc = &pi->link_cfg;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	lc->requested_speed = 0;
 
 	if (t4_autoneg == 0)
 		lc->requested_aneg = AUTONEG_DISABLE;
 	else if (t4_autoneg == 1)
 		lc->requested_aneg = AUTONEG_ENABLE;
 	else
 		lc->requested_aneg = AUTONEG_AUTO;
 
 	lc->requested_fc = t4_pause_settings & (PAUSE_TX | PAUSE_RX |
 	    PAUSE_AUTONEG);
 
 	if (t4_fec == -1 || t4_fec & FEC_AUTO)
 		lc->requested_fec = FEC_AUTO;
 	else {
 		lc->requested_fec = FEC_NONE;
 		if (t4_fec & FEC_RS)
 			lc->requested_fec |= FEC_RS;
 		if (t4_fec & FEC_BASER_RS)
 			lc->requested_fec |= FEC_BASER_RS;
 	}
 }
 
 /*
  * Makes sure that all requested settings comply with what's supported by the
  * port.  Returns the number of settings that were invalid and had to be fixed.
  */
 static int
 fixup_link_config(struct port_info *pi)
 {
 	int n = 0;
 	struct link_config *lc = &pi->link_cfg;
 	uint32_t fwspeed;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	/* Speed (when not autonegotiating) */
 	if (lc->requested_speed != 0) {
 		fwspeed = speed_to_fwcap(lc->requested_speed);
 		if ((fwspeed & lc->supported) == 0) {
 			n++;
 			lc->requested_speed = 0;
 		}
 	}
 
 	/* Link autonegotiation */
 	MPASS(lc->requested_aneg == AUTONEG_ENABLE ||
 	    lc->requested_aneg == AUTONEG_DISABLE ||
 	    lc->requested_aneg == AUTONEG_AUTO);
 	if (lc->requested_aneg == AUTONEG_ENABLE &&
 	    !(lc->supported & FW_PORT_CAP32_ANEG)) {
 		n++;
 		lc->requested_aneg = AUTONEG_AUTO;
 	}
 
 	/* Flow control */
 	MPASS((lc->requested_fc & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)) == 0);
 	if (lc->requested_fc & PAUSE_TX &&
 	    !(lc->supported & FW_PORT_CAP32_FC_TX)) {
 		n++;
 		lc->requested_fc &= ~PAUSE_TX;
 	}
 	if (lc->requested_fc & PAUSE_RX &&
 	    !(lc->supported & FW_PORT_CAP32_FC_RX)) {
 		n++;
 		lc->requested_fc &= ~PAUSE_RX;
 	}
 	if (!(lc->requested_fc & PAUSE_AUTONEG) &&
 	    !(lc->supported & FW_PORT_CAP32_FORCE_PAUSE)) {
 		n++;
 		lc->requested_fc |= PAUSE_AUTONEG;
 	}
 
 	/* FEC */
 	if ((lc->requested_fec & FEC_RS &&
 	    !(lc->supported & FW_PORT_CAP32_FEC_RS)) ||
 	    (lc->requested_fec & FEC_BASER_RS &&
 	    !(lc->supported & FW_PORT_CAP32_FEC_BASER_RS))) {
 		n++;
 		lc->requested_fec = FEC_AUTO;
 	}
 
 	return (n);
 }
 
 /*
  * Apply the requested L1 settings, which are expected to be valid, to the
  * hardware.
  */
 static int
 apply_link_config(struct port_info *pi)
 {
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 	int rc;
 
 #ifdef INVARIANTS
 	ASSERT_SYNCHRONIZED_OP(sc);
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	if (lc->requested_aneg == AUTONEG_ENABLE)
 		MPASS(lc->supported & FW_PORT_CAP32_ANEG);
 	if (!(lc->requested_fc & PAUSE_AUTONEG))
 		MPASS(lc->supported & FW_PORT_CAP32_FORCE_PAUSE);
 	if (lc->requested_fc & PAUSE_TX)
 		MPASS(lc->supported & FW_PORT_CAP32_FC_TX);
 	if (lc->requested_fc & PAUSE_RX)
 		MPASS(lc->supported & FW_PORT_CAP32_FC_RX);
 	if (lc->requested_fec & FEC_RS)
 		MPASS(lc->supported & FW_PORT_CAP32_FEC_RS);
 	if (lc->requested_fec & FEC_BASER_RS)
 		MPASS(lc->supported & FW_PORT_CAP32_FEC_BASER_RS);
 #endif
 	rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc);
 	if (rc != 0) {
 		/* Don't complain if the VF driver gets back an EPERM. */
 		if (!(sc->flags & IS_VF) || rc != FW_EPERM)
 			device_printf(pi->dev, "l1cfg failed: %d\n", rc);
 	} else {
 		/*
 		 * An L1_CFG will almost always result in a link-change event if
 		 * the link is up, and the driver will refresh the actual
 		 * fec/fc/etc. when the notification is processed.  If the link
 		 * is down then the actual settings are meaningless.
 		 *
 		 * This takes care of the case where a change in the L1 settings
 		 * may not result in a notification.
 		 */
 		if (lc->link_ok && !(lc->requested_fc & PAUSE_AUTONEG))
 			lc->fc = lc->requested_fc & (PAUSE_TX | PAUSE_RX);
 	}
 	return (rc);
 }
 
 #define FW_MAC_EXACT_CHUNK	7
 
 /*
  * Program the port's XGMAC based on parameters in ifnet.  The caller also
  * indicates which parameters should be programmed (the rest are left alone).
  */
 int
 update_mac_settings(struct ifnet *ifp, int flags)
 {
 	int rc = 0;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	KASSERT(flags, ("%s: not told what to update.", __func__));
 
 	if (flags & XGMAC_MTU)
 		mtu = ifp->if_mtu;
 
 	if (flags & XGMAC_PROMISC)
 		promisc = ifp->if_flags & IFF_PROMISC ? 1 : 0;
 
 	if (flags & XGMAC_ALLMULTI)
 		allmulti = ifp->if_flags & IFF_ALLMULTI ? 1 : 0;
 
 	if (flags & XGMAC_VLANEX)
 		vlanex = ifp->if_capenable & IFCAP_VLAN_HWTAGGING ? 1 : 0;
 
 	if (flags & (XGMAC_MTU|XGMAC_PROMISC|XGMAC_ALLMULTI|XGMAC_VLANEX)) {
 		rc = -t4_set_rxmode(sc, sc->mbox, vi->viid, mtu, promisc,
 		    allmulti, 1, vlanex, false);
 		if (rc) {
 			if_printf(ifp, "set_rxmode (%x) failed: %d\n", flags,
 			    rc);
 			return (rc);
 		}
 	}
 
 	if (flags & XGMAC_UCADDR) {
 		uint8_t ucaddr[ETHER_ADDR_LEN];
 
 		bcopy(IF_LLADDR(ifp), ucaddr, sizeof(ucaddr));
 		rc = t4_change_mac(sc, sc->mbox, vi->viid, vi->xact_addr_filt,
 		    ucaddr, true, &vi->smt_idx);
 		if (rc < 0) {
 			rc = -rc;
 			if_printf(ifp, "change_mac failed: %d\n", rc);
 			return (rc);
 		} else {
 			vi->xact_addr_filt = rc;
 			rc = 0;
 		}
 	}
 
 	if (flags & XGMAC_MCADDRS) {
 		const uint8_t *mcaddr[FW_MAC_EXACT_CHUNK];
 		int del = 1;
 		uint64_t hash = 0;
 		struct ifmultiaddr *ifma;
 		int i = 0, j;
 
 		if_maddr_rlock(ifp);
 		CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_LINK)
 				continue;
 			mcaddr[i] =
 			    LLADDR((struct sockaddr_dl *)ifma->ifma_addr);
 			MPASS(ETHER_IS_MULTICAST(mcaddr[i]));
 			i++;
 
 			if (i == FW_MAC_EXACT_CHUNK) {
 				rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid,
 				    del, i, mcaddr, NULL, &hash, 0);
 				if (rc < 0) {
 					rc = -rc;
 					for (j = 0; j < i; j++) {
 						if_printf(ifp,
 						    "failed to add mc address"
 						    " %02x:%02x:%02x:"
 						    "%02x:%02x:%02x rc=%d\n",
 						    mcaddr[j][0], mcaddr[j][1],
 						    mcaddr[j][2], mcaddr[j][3],
 						    mcaddr[j][4], mcaddr[j][5],
 						    rc);
 					}
 					goto mcfail;
 				}
 				del = 0;
 				i = 0;
 			}
 		}
 		if (i > 0) {
 			rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid, del, i,
 			    mcaddr, NULL, &hash, 0);
 			if (rc < 0) {
 				rc = -rc;
 				for (j = 0; j < i; j++) {
 					if_printf(ifp,
 					    "failed to add mc address"
 					    " %02x:%02x:%02x:"
 					    "%02x:%02x:%02x rc=%d\n",
 					    mcaddr[j][0], mcaddr[j][1],
 					    mcaddr[j][2], mcaddr[j][3],
 					    mcaddr[j][4], mcaddr[j][5],
 					    rc);
 				}
 				goto mcfail;
 			}
 		}
 
 		rc = -t4_set_addr_hash(sc, sc->mbox, vi->viid, 0, hash, 0);
 		if (rc != 0)
 			if_printf(ifp, "failed to set mc address hash: %d", rc);
 mcfail:
 		if_maddr_runlock(ifp);
 	}
 
 	return (rc);
 }
 
 /*
  * {begin|end}_synchronized_op must be called from the same thread.
  */
 int
 begin_synchronized_op(struct adapter *sc, struct vi_info *vi, int flags,
     char *wmesg)
 {
 	int rc, pri;
 
 #ifdef WITNESS
 	/* the caller thinks it's ok to sleep, but is it really? */
 	if (flags & SLEEP_OK)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "begin_synchronized_op");
 #endif
 
 	if (INTR_OK)
 		pri = PCATCH;
 	else
 		pri = 0;
 
 	ADAPTER_LOCK(sc);
 	for (;;) {
 
 		if (vi && IS_DOOMED(vi)) {
 			rc = ENXIO;
 			goto done;
 		}
 
 		if (!IS_BUSY(sc)) {
 			rc = 0;
 			break;
 		}
 
 		if (!(flags & SLEEP_OK)) {
 			rc = EBUSY;
 			goto done;
 		}
 
 		if (mtx_sleep(&sc->flags, &sc->sc_lock, pri, wmesg, 0)) {
 			rc = EINTR;
 			goto done;
 		}
 	}
 
 	KASSERT(!IS_BUSY(sc), ("%s: controller busy.", __func__));
 	SET_BUSY(sc);
 #ifdef INVARIANTS
 	sc->last_op = wmesg;
 	sc->last_op_thr = curthread;
 	sc->last_op_flags = flags;
 #endif
 
 done:
 	if (!(flags & HOLD_LOCK) || rc)
 		ADAPTER_UNLOCK(sc);
 
 	return (rc);
 }
 
 /*
  * Tell if_ioctl and if_init that the VI is going away.  This is
  * special variant of begin_synchronized_op and must be paired with a
  * call to end_synchronized_op.
  */
 void
 doom_vi(struct adapter *sc, struct vi_info *vi)
 {
 
 	ADAPTER_LOCK(sc);
 	SET_DOOMED(vi);
 	wakeup(&sc->flags);
 	while (IS_BUSY(sc))
 		mtx_sleep(&sc->flags, &sc->sc_lock, 0, "t4detach", 0);
 	SET_BUSY(sc);
 #ifdef INVARIANTS
 	sc->last_op = "t4detach";
 	sc->last_op_thr = curthread;
 	sc->last_op_flags = 0;
 #endif
 	ADAPTER_UNLOCK(sc);
 }
 
 /*
  * {begin|end}_synchronized_op must be called from the same thread.
  */
 void
 end_synchronized_op(struct adapter *sc, int flags)
 {
 
 	if (flags & LOCK_HELD)
 		ADAPTER_LOCK_ASSERT_OWNED(sc);
 	else
 		ADAPTER_LOCK(sc);
 
 	KASSERT(IS_BUSY(sc), ("%s: controller not busy.", __func__));
 	CLR_BUSY(sc);
 	wakeup(&sc->flags);
 	ADAPTER_UNLOCK(sc);
 }
 
 static int
 cxgbe_init_synchronized(struct vi_info *vi)
 {
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	int rc = 0, i;
 	struct sge_txq *txq;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 		return (0);	/* already running */
 
 	if (!(sc->flags & FULL_INIT_DONE) &&
 	    ((rc = adapter_full_init(sc)) != 0))
 		return (rc);	/* error message displayed already */
 
 	if (!(vi->flags & VI_INIT_DONE) &&
 	    ((rc = vi_full_init(vi)) != 0))
 		return (rc); /* error message displayed already */
 
 	rc = update_mac_settings(ifp, XGMAC_ALL);
 	if (rc)
 		goto done;	/* error message displayed already */
 
 	PORT_LOCK(pi);
 	if (pi->up_vis == 0) {
 		t4_update_port_info(pi);
 		fixup_link_config(pi);
 		build_medialist(pi);
 		apply_link_config(pi);
 	}
 
 	rc = -t4_enable_vi(sc, sc->mbox, vi->viid, true, true);
 	if (rc != 0) {
 		if_printf(ifp, "enable_vi failed: %d\n", rc);
 		PORT_UNLOCK(pi);
 		goto done;
 	}
 
 	/*
 	 * Can't fail from this point onwards.  Review cxgbe_uninit_synchronized
 	 * if this changes.
 	 */
 
 	for_each_txq(vi, i, txq) {
 		TXQ_LOCK(txq);
 		txq->eq.flags |= EQ_ENABLED;
 		TXQ_UNLOCK(txq);
 	}
 
 	/*
 	 * The first iq of the first port to come up is used for tracing.
 	 */
 	if (sc->traceq < 0 && IS_MAIN_VI(vi)) {
 		sc->traceq = sc->sge.rxq[vi->first_rxq].iq.abs_id;
 		t4_write_reg(sc, is_t4(sc) ?  A_MPS_TRC_RSS_CONTROL :
 		    A_MPS_T5_TRC_RSS_CONTROL, V_RSSCONTROL(pi->tx_chan) |
 		    V_QUEUENUMBER(sc->traceq));
 		pi->flags |= HAS_TRACEQ;
 	}
 
 	/* all ok */
 	pi->up_vis++;
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 
 	if (pi->nvi > 1 || sc->flags & IS_VF)
 		callout_reset(&vi->tick, hz, vi_tick, vi);
 	else
 		callout_reset(&pi->tick, hz, cxgbe_tick, pi);
 	if (pi->link_cfg.link_ok)
 		t4_os_link_changed(pi);
 	PORT_UNLOCK(pi);
 done:
 	if (rc != 0)
 		cxgbe_uninit_synchronized(vi);
 
 	return (rc);
 }
 
 /*
  * Idempotent.
  */
 static int
 cxgbe_uninit_synchronized(struct vi_info *vi)
 {
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	int rc, i;
 	struct sge_txq *txq;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (!(vi->flags & VI_INIT_DONE)) {
 		if (__predict_false(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			KASSERT(0, ("uninited VI is running"));
 			if_printf(ifp, "uninited VI with running ifnet.  "
 			    "vi->flags 0x%016lx, if_flags 0x%08x, "
 			    "if_drv_flags 0x%08x\n", vi->flags, ifp->if_flags,
 			    ifp->if_drv_flags);
 		}
 		return (0);
 	}
 
 	/*
 	 * Disable the VI so that all its data in either direction is discarded
 	 * by the MPS.  Leave everything else (the queues, interrupts, and 1Hz
 	 * tick) intact as the TP can deliver negative advice or data that it's
 	 * holding in its RAM (for an offloaded connection) even after the VI is
 	 * disabled.
 	 */
 	rc = -t4_enable_vi(sc, sc->mbox, vi->viid, false, false);
 	if (rc) {
 		if_printf(ifp, "disable_vi failed: %d\n", rc);
 		return (rc);
 	}
 
 	for_each_txq(vi, i, txq) {
 		TXQ_LOCK(txq);
 		txq->eq.flags &= ~EQ_ENABLED;
 		TXQ_UNLOCK(txq);
 	}
 
 	PORT_LOCK(pi);
 	if (pi->nvi > 1 || sc->flags & IS_VF)
 		callout_stop(&vi->tick);
 	else
 		callout_stop(&pi->tick);
 	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 		PORT_UNLOCK(pi);
 		return (0);
 	}
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	pi->up_vis--;
 	if (pi->up_vis > 0) {
 		PORT_UNLOCK(pi);
 		return (0);
 	}
 
 	pi->link_cfg.link_ok = false;
 	pi->link_cfg.speed = 0;
 	pi->link_cfg.link_down_rc = 255;
 	t4_os_link_changed(pi);
 	PORT_UNLOCK(pi);
 
 	return (0);
 }
 
 /*
  * It is ok for this function to fail midway and return right away.  t4_detach
  * will walk the entire sc->irq list and clean up whatever is valid.
  */
 int
 t4_setup_intr_handlers(struct adapter *sc)
 {
 	int rc, rid, p, q, v;
 	char s[8];
 	struct irq *irq;
 	struct port_info *pi;
 	struct vi_info *vi;
 	struct sge *sge = &sc->sge;
 	struct sge_rxq *rxq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #ifdef DEV_NETMAP
 	struct sge_nm_rxq *nm_rxq;
 #endif
 #ifdef RSS
 	int nbuckets = rss_getnumbuckets();
 #endif
 
 	/*
 	 * Setup interrupts.
 	 */
 	irq = &sc->irq[0];
 	rid = sc->intr_type == INTR_INTX ? 0 : 1;
 	if (forwarding_intr_to_fwq(sc))
 		return (t4_alloc_irq(sc, irq, rid, t4_intr_all, sc, "all"));
 
 	/* Multiple interrupts. */
 	if (sc->flags & IS_VF)
 		KASSERT(sc->intr_count >= T4VF_EXTRA_INTR + sc->params.nports,
 		    ("%s: too few intr.", __func__));
 	else
 		KASSERT(sc->intr_count >= T4_EXTRA_INTR + sc->params.nports,
 		    ("%s: too few intr.", __func__));
 
 	/* The first one is always error intr on PFs */
 	if (!(sc->flags & IS_VF)) {
 		rc = t4_alloc_irq(sc, irq, rid, t4_intr_err, sc, "err");
 		if (rc != 0)
 			return (rc);
 		irq++;
 		rid++;
 	}
 
 	/* The second one is always the firmware event queue (first on VFs) */
 	rc = t4_alloc_irq(sc, irq, rid, t4_intr_evt, &sge->fwq, "evt");
 	if (rc != 0)
 		return (rc);
 	irq++;
 	rid++;
 
 	for_each_port(sc, p) {
 		pi = sc->port[p];
 		for_each_vi(pi, v, vi) {
 			vi->first_intr = rid - 1;
 
 			if (vi->nnmrxq > 0) {
 				int n = max(vi->nrxq, vi->nnmrxq);
 
 				rxq = &sge->rxq[vi->first_rxq];
 #ifdef DEV_NETMAP
 				nm_rxq = &sge->nm_rxq[vi->first_nm_rxq];
 #endif
 				for (q = 0; q < n; q++) {
 					snprintf(s, sizeof(s), "%x%c%x", p,
 					    'a' + v, q);
 					if (q < vi->nrxq)
 						irq->rxq = rxq++;
 #ifdef DEV_NETMAP
 					if (q < vi->nnmrxq)
 						irq->nm_rxq = nm_rxq++;
 
 					if (irq->nm_rxq != NULL &&
 					    irq->rxq == NULL) {
 						/* Netmap rx only */
 						rc = t4_alloc_irq(sc, irq, rid,
 						    t4_nm_intr, irq->nm_rxq, s);
 					}
 					if (irq->nm_rxq != NULL &&
 					    irq->rxq != NULL) {
 						/* NIC and Netmap rx */
 						rc = t4_alloc_irq(sc, irq, rid,
 						    t4_vi_intr, irq, s);
 					}
 #endif
 					if (irq->rxq != NULL &&
 					    irq->nm_rxq == NULL) {
 						/* NIC rx only */
 						rc = t4_alloc_irq(sc, irq, rid,
 						    t4_intr, irq->rxq, s);
 					}
 					if (rc != 0)
 						return (rc);
 #ifdef RSS
 					if (q < vi->nrxq) {
 						bus_bind_intr(sc->dev, irq->res,
 						    rss_getcpu(q % nbuckets));
 					}
 #endif
 					irq++;
 					rid++;
 					vi->nintr++;
 				}
 			} else {
 				for_each_rxq(vi, q, rxq) {
 					snprintf(s, sizeof(s), "%x%c%x", p,
 					    'a' + v, q);
 					rc = t4_alloc_irq(sc, irq, rid,
 					    t4_intr, rxq, s);
 					if (rc != 0)
 						return (rc);
 #ifdef RSS
 					bus_bind_intr(sc->dev, irq->res,
 					    rss_getcpu(q % nbuckets));
 #endif
 					irq++;
 					rid++;
 					vi->nintr++;
 				}
 			}
 #ifdef TCP_OFFLOAD
 			for_each_ofld_rxq(vi, q, ofld_rxq) {
 				snprintf(s, sizeof(s), "%x%c%x", p, 'A' + v, q);
 				rc = t4_alloc_irq(sc, irq, rid, t4_intr,
 				    ofld_rxq, s);
 				if (rc != 0)
 					return (rc);
 				irq++;
 				rid++;
 				vi->nintr++;
 			}
 #endif
 		}
 	}
 	MPASS(irq == &sc->irq[sc->intr_count]);
 
 	return (0);
 }
 
 int
 adapter_full_init(struct adapter *sc)
 {
 	int rc, i;
 #ifdef RSS
 	uint32_t raw_rss_key[RSS_KEYSIZE / sizeof(uint32_t)];
 	uint32_t rss_key[RSS_KEYSIZE / sizeof(uint32_t)];
 #endif
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 	KASSERT((sc->flags & FULL_INIT_DONE) == 0,
 	    ("%s: FULL_INIT_DONE already", __func__));
 
 	/*
 	 * queues that belong to the adapter (not any particular port).
 	 */
 	rc = t4_setup_adapter_queues(sc);
 	if (rc != 0)
 		goto done;
 
 	for (i = 0; i < nitems(sc->tq); i++) {
 		sc->tq[i] = taskqueue_create("t4 taskq", M_NOWAIT,
 		    taskqueue_thread_enqueue, &sc->tq[i]);
 		if (sc->tq[i] == NULL) {
 			device_printf(sc->dev,
 			    "failed to allocate task queue %d\n", i);
 			rc = ENOMEM;
 			goto done;
 		}
 		taskqueue_start_threads(&sc->tq[i], 1, PI_NET, "%s tq%d",
 		    device_get_nameunit(sc->dev), i);
 	}
 #ifdef RSS
 	MPASS(RSS_KEYSIZE == 40);
 	rss_getkey((void *)&raw_rss_key[0]);
 	for (i = 0; i < nitems(rss_key); i++) {
 		rss_key[i] = htobe32(raw_rss_key[nitems(rss_key) - 1 - i]);
 	}
 	t4_write_rss_key(sc, &rss_key[0], -1, 1);
 #endif
 
 	if (!(sc->flags & IS_VF))
 		t4_intr_enable(sc);
 	sc->flags |= FULL_INIT_DONE;
 done:
 	if (rc != 0)
 		adapter_full_uninit(sc);
 
 	return (rc);
 }
 
 int
 adapter_full_uninit(struct adapter *sc)
 {
 	int i;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	t4_teardown_adapter_queues(sc);
 
 	for (i = 0; i < nitems(sc->tq) && sc->tq[i]; i++) {
 		taskqueue_free(sc->tq[i]);
 		sc->tq[i] = NULL;
 	}
 
 	sc->flags &= ~FULL_INIT_DONE;
 
 	return (0);
 }
 
 #ifdef RSS
 #define SUPPORTED_RSS_HASHTYPES (RSS_HASHTYPE_RSS_IPV4 | \
     RSS_HASHTYPE_RSS_TCP_IPV4 | RSS_HASHTYPE_RSS_IPV6 | \
     RSS_HASHTYPE_RSS_TCP_IPV6 | RSS_HASHTYPE_RSS_UDP_IPV4 | \
     RSS_HASHTYPE_RSS_UDP_IPV6)
 
 /* Translates kernel hash types to hardware. */
 static int
 hashconfig_to_hashen(int hashconfig)
 {
 	int hashen = 0;
 
 	if (hashconfig & RSS_HASHTYPE_RSS_IPV4)
 		hashen |= F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN;
 	if (hashconfig & RSS_HASHTYPE_RSS_IPV6)
 		hashen |= F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN;
 	if (hashconfig & RSS_HASHTYPE_RSS_UDP_IPV4) {
 		hashen |= F_FW_RSS_VI_CONFIG_CMD_UDPEN |
 		    F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN;
 	}
 	if (hashconfig & RSS_HASHTYPE_RSS_UDP_IPV6) {
 		hashen |= F_FW_RSS_VI_CONFIG_CMD_UDPEN |
 		    F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN;
 	}
 	if (hashconfig & RSS_HASHTYPE_RSS_TCP_IPV4)
 		hashen |= F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN;
 	if (hashconfig & RSS_HASHTYPE_RSS_TCP_IPV6)
 		hashen |= F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN;
 
 	return (hashen);
 }
 
 /* Translates hardware hash types to kernel. */
 static int
 hashen_to_hashconfig(int hashen)
 {
 	int hashconfig = 0;
 
 	if (hashen & F_FW_RSS_VI_CONFIG_CMD_UDPEN) {
 		/*
 		 * If UDP hashing was enabled it must have been enabled for
 		 * either IPv4 or IPv6 (inclusive or).  Enabling UDP without
 		 * enabling any 4-tuple hash is nonsense configuration.
 		 */
 		MPASS(hashen & (F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN |
 		    F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN));
 
 		if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN)
 			hashconfig |= RSS_HASHTYPE_RSS_UDP_IPV4;
 		if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN)
 			hashconfig |= RSS_HASHTYPE_RSS_UDP_IPV6;
 	}
 	if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN)
 		hashconfig |= RSS_HASHTYPE_RSS_TCP_IPV4;
 	if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN)
 		hashconfig |= RSS_HASHTYPE_RSS_TCP_IPV6;
 	if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN)
 		hashconfig |= RSS_HASHTYPE_RSS_IPV4;
 	if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN)
 		hashconfig |= RSS_HASHTYPE_RSS_IPV6;
 
 	return (hashconfig);
 }
 #endif
 
 int
 vi_full_init(struct vi_info *vi)
 {
 	struct adapter *sc = vi->pi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	uint16_t *rss;
 	struct sge_rxq *rxq;
 	int rc, i, j;
 #ifdef RSS
 	int nbuckets = rss_getnumbuckets();
 	int hashconfig = rss_gethashconfig();
 	int extra;
 #endif
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	KASSERT((vi->flags & VI_INIT_DONE) == 0,
 	    ("%s: VI_INIT_DONE already", __func__));
 
 	sysctl_ctx_init(&vi->ctx);
 	vi->flags |= VI_SYSCTL_CTX;
 
 	/*
 	 * Allocate tx/rx/fl queues for this VI.
 	 */
 	rc = t4_setup_vi_queues(vi);
 	if (rc != 0)
 		goto done;	/* error message displayed already */
 
 	/*
 	 * Setup RSS for this VI.  Save a copy of the RSS table for later use.
 	 */
 	if (vi->nrxq > vi->rss_size) {
 		if_printf(ifp, "nrxq (%d) > hw RSS table size (%d); "
 		    "some queues will never receive traffic.\n", vi->nrxq,
 		    vi->rss_size);
 	} else if (vi->rss_size % vi->nrxq) {
 		if_printf(ifp, "nrxq (%d), hw RSS table size (%d); "
 		    "expect uneven traffic distribution.\n", vi->nrxq,
 		    vi->rss_size);
 	}
 #ifdef RSS
 	if (vi->nrxq != nbuckets) {
 		if_printf(ifp, "nrxq (%d) != kernel RSS buckets (%d);"
 		    "performance will be impacted.\n", vi->nrxq, nbuckets);
 	}
 #endif
 	rss = malloc(vi->rss_size * sizeof (*rss), M_CXGBE, M_ZERO | M_WAITOK);
 	for (i = 0; i < vi->rss_size;) {
 #ifdef RSS
 		j = rss_get_indirection_to_bucket(i);
 		j %= vi->nrxq;
 		rxq = &sc->sge.rxq[vi->first_rxq + j];
 		rss[i++] = rxq->iq.abs_id;
 #else
 		for_each_rxq(vi, j, rxq) {
 			rss[i++] = rxq->iq.abs_id;
 			if (i == vi->rss_size)
 				break;
 		}
 #endif
 	}
 
 	rc = -t4_config_rss_range(sc, sc->mbox, vi->viid, 0, vi->rss_size, rss,
 	    vi->rss_size);
 	if (rc != 0) {
 		free(rss, M_CXGBE);
 		if_printf(ifp, "rss_config failed: %d\n", rc);
 		goto done;
 	}
 
 #ifdef RSS
 	vi->hashen = hashconfig_to_hashen(hashconfig);
 
 	/*
 	 * We may have had to enable some hashes even though the global config
 	 * wants them disabled.  This is a potential problem that must be
 	 * reported to the user.
 	 */
 	extra = hashen_to_hashconfig(vi->hashen) ^ hashconfig;
 
 	/*
 	 * If we consider only the supported hash types, then the enabled hashes
 	 * are a superset of the requested hashes.  In other words, there cannot
 	 * be any supported hash that was requested but not enabled, but there
 	 * can be hashes that were not requested but had to be enabled.
 	 */
 	extra &= SUPPORTED_RSS_HASHTYPES;
 	MPASS((extra & hashconfig) == 0);
 
 	if (extra) {
 		if_printf(ifp,
 		    "global RSS config (0x%x) cannot be accommodated.\n",
 		    hashconfig);
 	}
 	if (extra & RSS_HASHTYPE_RSS_IPV4)
 		if_printf(ifp, "IPv4 2-tuple hashing forced on.\n");
 	if (extra & RSS_HASHTYPE_RSS_TCP_IPV4)
 		if_printf(ifp, "TCP/IPv4 4-tuple hashing forced on.\n");
 	if (extra & RSS_HASHTYPE_RSS_IPV6)
 		if_printf(ifp, "IPv6 2-tuple hashing forced on.\n");
 	if (extra & RSS_HASHTYPE_RSS_TCP_IPV6)
 		if_printf(ifp, "TCP/IPv6 4-tuple hashing forced on.\n");
 	if (extra & RSS_HASHTYPE_RSS_UDP_IPV4)
 		if_printf(ifp, "UDP/IPv4 4-tuple hashing forced on.\n");
 	if (extra & RSS_HASHTYPE_RSS_UDP_IPV6)
 		if_printf(ifp, "UDP/IPv6 4-tuple hashing forced on.\n");
 #else
 	vi->hashen = F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN |
 	    F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN |
 	    F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN |
 	    F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN | F_FW_RSS_VI_CONFIG_CMD_UDPEN;
 #endif
 	rc = -t4_config_vi_rss(sc, sc->mbox, vi->viid, vi->hashen, rss[0], 0, 0);
 	if (rc != 0) {
 		free(rss, M_CXGBE);
 		if_printf(ifp, "rss hash/defaultq config failed: %d\n", rc);
 		goto done;
 	}
 
 	vi->rss = rss;
 	vi->flags |= VI_INIT_DONE;
 done:
 	if (rc != 0)
 		vi_full_uninit(vi);
 
 	return (rc);
 }
 
 /*
  * Idempotent.
  */
 int
 vi_full_uninit(struct vi_info *vi)
 {
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	int i;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	struct sge_wrq *ofld_txq;
 #endif
 
 	if (vi->flags & VI_INIT_DONE) {
 
 		/* Need to quiesce queues.  */
 
 		/* XXX: Only for the first VI? */
 		if (IS_MAIN_VI(vi) && !(sc->flags & IS_VF))
 			quiesce_wrq(sc, &sc->sge.ctrlq[pi->port_id]);
 
 		for_each_txq(vi, i, txq) {
 			quiesce_txq(sc, txq);
 		}
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 		for_each_ofld_txq(vi, i, ofld_txq) {
 			quiesce_wrq(sc, ofld_txq);
 		}
 #endif
 
 		for_each_rxq(vi, i, rxq) {
 			quiesce_iq(sc, &rxq->iq);
 			quiesce_fl(sc, &rxq->fl);
 		}
 
 #ifdef TCP_OFFLOAD
 		for_each_ofld_rxq(vi, i, ofld_rxq) {
 			quiesce_iq(sc, &ofld_rxq->iq);
 			quiesce_fl(sc, &ofld_rxq->fl);
 		}
 #endif
 		free(vi->rss, M_CXGBE);
 		free(vi->nm_rss, M_CXGBE);
 	}
 
 	t4_teardown_vi_queues(vi);
 	vi->flags &= ~VI_INIT_DONE;
 
 	return (0);
 }
 
 static void
 quiesce_txq(struct adapter *sc, struct sge_txq *txq)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
 
 	(void) sc;	/* unused */
 
 #ifdef INVARIANTS
 	TXQ_LOCK(txq);
 	MPASS((eq->flags & EQ_ENABLED) == 0);
 	TXQ_UNLOCK(txq);
 #endif
 
 	/* Wait for the mp_ring to empty. */
 	while (!mp_ring_is_idle(txq->r)) {
 		mp_ring_check_drainage(txq->r, 0);
 		pause("rquiesce", 1);
 	}
 
 	/* Then wait for the hardware to finish. */
 	while (spg->cidx != htobe16(eq->pidx))
 		pause("equiesce", 1);
 
 	/* Finally, wait for the driver to reclaim all descriptors. */
 	while (eq->cidx != eq->pidx)
 		pause("dquiesce", 1);
 }
 
 static void
 quiesce_wrq(struct adapter *sc, struct sge_wrq *wrq)
 {
 
 	/* XXXTX */
 }
 
 static void
 quiesce_iq(struct adapter *sc, struct sge_iq *iq)
 {
 	(void) sc;	/* unused */
 
 	/* Synchronize with the interrupt handler */
 	while (!atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_DISABLED))
 		pause("iqfree", 1);
 }
 
 static void
 quiesce_fl(struct adapter *sc, struct sge_fl *fl)
 {
 	mtx_lock(&sc->sfl_lock);
 	FL_LOCK(fl);
 	fl->flags |= FL_DOOMED;
 	FL_UNLOCK(fl);
 	callout_stop(&sc->sfl_callout);
 	mtx_unlock(&sc->sfl_lock);
 
 	KASSERT((fl->flags & FL_STARVING) == 0,
 	    ("%s: still starving", __func__));
 }
 
 static int
 t4_alloc_irq(struct adapter *sc, struct irq *irq, int rid,
     driver_intr_t *handler, void *arg, char *name)
 {
 	int rc;
 
 	irq->rid = rid;
 	irq->res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &irq->rid,
 	    RF_SHAREABLE | RF_ACTIVE);
 	if (irq->res == NULL) {
 		device_printf(sc->dev,
 		    "failed to allocate IRQ for rid %d, name %s.\n", rid, name);
 		return (ENOMEM);
 	}
 
 	rc = bus_setup_intr(sc->dev, irq->res, INTR_MPSAFE | INTR_TYPE_NET,
 	    NULL, handler, arg, &irq->tag);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to setup interrupt for rid %d, name %s: %d\n",
 		    rid, name, rc);
 	} else if (name)
 		bus_describe_intr(sc->dev, irq->res, irq->tag, "%s", name);
 
 	return (rc);
 }
 
 static int
 t4_free_irq(struct adapter *sc, struct irq *irq)
 {
 	if (irq->tag)
 		bus_teardown_intr(sc->dev, irq->res, irq->tag);
 	if (irq->res)
 		bus_release_resource(sc->dev, SYS_RES_IRQ, irq->rid, irq->res);
 
 	bzero(irq, sizeof(*irq));
 
 	return (0);
 }
 
 static void
 get_regs(struct adapter *sc, struct t4_regdump *regs, uint8_t *buf)
 {
 
 	regs->version = chip_id(sc) | chip_rev(sc) << 10;
 	t4_get_regs(sc, buf, regs->len);
 }
 
 #define	A_PL_INDIR_CMD	0x1f8
 
 #define	S_PL_AUTOINC	31
 #define	M_PL_AUTOINC	0x1U
 #define	V_PL_AUTOINC(x)	((x) << S_PL_AUTOINC)
 #define	G_PL_AUTOINC(x)	(((x) >> S_PL_AUTOINC) & M_PL_AUTOINC)
 
 #define	S_PL_VFID	20
 #define	M_PL_VFID	0xffU
 #define	V_PL_VFID(x)	((x) << S_PL_VFID)
 #define	G_PL_VFID(x)	(((x) >> S_PL_VFID) & M_PL_VFID)
 
 #define	S_PL_ADDR	0
 #define	M_PL_ADDR	0xfffffU
 #define	V_PL_ADDR(x)	((x) << S_PL_ADDR)
 #define	G_PL_ADDR(x)	(((x) >> S_PL_ADDR) & M_PL_ADDR)
 
 #define	A_PL_INDIR_DATA	0x1fc
 
 static uint64_t
 read_vf_stat(struct adapter *sc, u_int vin, int reg)
 {
 	u32 stats[2];
 
 	mtx_assert(&sc->reg_lock, MA_OWNED);
 	if (sc->flags & IS_VF) {
 		stats[0] = t4_read_reg(sc, VF_MPS_REG(reg));
 		stats[1] = t4_read_reg(sc, VF_MPS_REG(reg + 4));
 	} else {
 		t4_write_reg(sc, A_PL_INDIR_CMD, V_PL_AUTOINC(1) |
 		    V_PL_VFID(vin) | V_PL_ADDR(VF_MPS_REG(reg)));
 		stats[0] = t4_read_reg(sc, A_PL_INDIR_DATA);
 		stats[1] = t4_read_reg(sc, A_PL_INDIR_DATA);
 	}
 	return (((uint64_t)stats[1]) << 32 | stats[0]);
 }
 
 static void
 t4_get_vi_stats(struct adapter *sc, u_int vin, struct fw_vi_stats_vf *stats)
 {
 
 #define GET_STAT(name) \
 	read_vf_stat(sc, vin, A_MPS_VF_STAT_##name##_L)
 
 	stats->tx_bcast_bytes    = GET_STAT(TX_VF_BCAST_BYTES);
 	stats->tx_bcast_frames   = GET_STAT(TX_VF_BCAST_FRAMES);
 	stats->tx_mcast_bytes    = GET_STAT(TX_VF_MCAST_BYTES);
 	stats->tx_mcast_frames   = GET_STAT(TX_VF_MCAST_FRAMES);
 	stats->tx_ucast_bytes    = GET_STAT(TX_VF_UCAST_BYTES);
 	stats->tx_ucast_frames   = GET_STAT(TX_VF_UCAST_FRAMES);
 	stats->tx_drop_frames    = GET_STAT(TX_VF_DROP_FRAMES);
 	stats->tx_offload_bytes  = GET_STAT(TX_VF_OFFLOAD_BYTES);
 	stats->tx_offload_frames = GET_STAT(TX_VF_OFFLOAD_FRAMES);
 	stats->rx_bcast_bytes    = GET_STAT(RX_VF_BCAST_BYTES);
 	stats->rx_bcast_frames   = GET_STAT(RX_VF_BCAST_FRAMES);
 	stats->rx_mcast_bytes    = GET_STAT(RX_VF_MCAST_BYTES);
 	stats->rx_mcast_frames   = GET_STAT(RX_VF_MCAST_FRAMES);
 	stats->rx_ucast_bytes    = GET_STAT(RX_VF_UCAST_BYTES);
 	stats->rx_ucast_frames   = GET_STAT(RX_VF_UCAST_FRAMES);
 	stats->rx_err_frames     = GET_STAT(RX_VF_ERR_FRAMES);
 
 #undef GET_STAT
 }
 
 static void
 t4_clr_vi_stats(struct adapter *sc, u_int vin)
 {
 	int reg;
 
 	t4_write_reg(sc, A_PL_INDIR_CMD, V_PL_AUTOINC(1) | V_PL_VFID(vin) |
 	    V_PL_ADDR(VF_MPS_REG(A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L)));
 	for (reg = A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L;
 	     reg <= A_MPS_VF_STAT_RX_VF_ERR_FRAMES_H; reg += 4)
 		t4_write_reg(sc, A_PL_INDIR_DATA, 0);
 }
 
 static void
 vi_refresh_stats(struct adapter *sc, struct vi_info *vi)
 {
 	struct timeval tv;
 	const struct timeval interval = {0, 250000};	/* 250ms */
 
 	if (!(vi->flags & VI_INIT_DONE))
 		return;
 
 	getmicrotime(&tv);
 	timevalsub(&tv, &interval);
 	if (timevalcmp(&tv, &vi->last_refreshed, <))
 		return;
 
 	mtx_lock(&sc->reg_lock);
 	t4_get_vi_stats(sc, vi->vin, &vi->stats);
 	getmicrotime(&vi->last_refreshed);
 	mtx_unlock(&sc->reg_lock);
 }
 
 static void
 cxgbe_refresh_stats(struct adapter *sc, struct port_info *pi)
 {
 	u_int i, v, tnl_cong_drops, bg_map;
 	struct timeval tv;
 	const struct timeval interval = {0, 250000};	/* 250ms */
 
 	getmicrotime(&tv);
 	timevalsub(&tv, &interval);
 	if (timevalcmp(&tv, &pi->last_refreshed, <))
 		return;
 
 	tnl_cong_drops = 0;
 	t4_get_port_stats(sc, pi->tx_chan, &pi->stats);
 	bg_map = pi->mps_bg_map;
 	while (bg_map) {
 		i = ffs(bg_map) - 1;
 		mtx_lock(&sc->reg_lock);
 		t4_read_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v, 1,
 		    A_TP_MIB_TNL_CNG_DROP_0 + i);
 		mtx_unlock(&sc->reg_lock);
 		tnl_cong_drops += v;
 		bg_map &= ~(1 << i);
 	}
 	pi->tnl_cong_drops = tnl_cong_drops;
 	getmicrotime(&pi->last_refreshed);
 }
 
 static void
 cxgbe_tick(void *arg)
 {
 	struct port_info *pi = arg;
 	struct adapter *sc = pi->adapter;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 	cxgbe_refresh_stats(sc, pi);
 
 	callout_schedule(&pi->tick, hz);
 }
 
 void
 vi_tick(void *arg)
 {
 	struct vi_info *vi = arg;
 	struct adapter *sc = vi->pi->adapter;
 
 	vi_refresh_stats(sc, vi);
 
 	callout_schedule(&vi->tick, hz);
 }
 
 /*
  * Should match fw_caps_config_<foo> enums in t4fw_interface.h
  */
 static char *caps_decoder[] = {
 	"\20\001IPMI\002NCSI",				/* 0: NBM */
 	"\20\001PPP\002QFC\003DCBX",			/* 1: link */
 	"\20\001INGRESS\002EGRESS",			/* 2: switch */
 	"\20\001NIC\002VM\003IDS\004UM\005UM_ISGL"	/* 3: NIC */
 	    "\006HASHFILTER\007ETHOFLD",
 	"\20\001TOE",					/* 4: TOE */
 	"\20\001RDDP\002RDMAC",				/* 5: RDMA */
 	"\20\001INITIATOR_PDU\002TARGET_PDU"		/* 6: iSCSI */
 	    "\003INITIATOR_CNXOFLD\004TARGET_CNXOFLD"
 	    "\005INITIATOR_SSNOFLD\006TARGET_SSNOFLD"
 	    "\007T10DIF"
 	    "\010INITIATOR_CMDOFLD\011TARGET_CMDOFLD",
 	"\20\001LOOKASIDE\002TLSKEYS",			/* 7: Crypto */
 	"\20\001INITIATOR\002TARGET\003CTRL_OFLD"	/* 8: FCoE */
 		    "\004PO_INITIATOR\005PO_TARGET",
 };
 
 void
 t4_sysctls(struct adapter *sc)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *children, *c0;
 	static char *doorbells = {"\20\1UDB\2WCWR\3UDBWC\4KDB"};
 
 	ctx = device_get_sysctl_ctx(sc->dev);
 
 	/*
 	 * dev.t4nex.X.
 	 */
 	oid = device_get_sysctl_tree(sc->dev);
 	c0 = children = SYSCTL_CHILDREN(oid);
 
 	sc->sc_do_rxcopy = 1;
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "do_rx_copy", CTLFLAG_RW,
 	    &sc->sc_do_rxcopy, 1, "Do RX copy of small frames");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nports", CTLFLAG_RD, NULL,
 	    sc->params.nports, "# of ports");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "doorbells",
 	    CTLTYPE_STRING | CTLFLAG_RD, doorbells, (uintptr_t)&sc->doorbells,
 	    sysctl_bitfield_8b, "A", "available doorbells");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "core_clock", CTLFLAG_RD, NULL,
 	    sc->params.vpd.cclk, "core clock frequency (in KHz)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_timers",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc->params.sge.timer_val,
 	    sizeof(sc->params.sge.timer_val), sysctl_int_array, "A",
 	    "interrupt holdoff timer values (us)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pkt_counts",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc->params.sge.counter_val,
 	    sizeof(sc->params.sge.counter_val), sysctl_int_array, "A",
 	    "interrupt holdoff packet counter values");
 
 	t4_sge_sysctls(sc, ctx, children);
 
 	sc->lro_timeout = 100;
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lro_timeout", CTLFLAG_RW,
 	    &sc->lro_timeout, 0, "lro inactive-flush timeout (in us)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dflags", CTLFLAG_RW,
 	    &sc->debug_flags, 0, "flags to enable runtime debugging");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "tp_version",
 	    CTLFLAG_RD, sc->tp_version, 0, "TP microcode version");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
 	    CTLFLAG_RD, sc->fw_version, 0, "firmware version");
 
 	if (sc->flags & IS_VF)
 		return;
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "hw_revision", CTLFLAG_RD,
 	    NULL, chip_rev(sc), "chip hardware revision");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "sn",
 	    CTLFLAG_RD, sc->params.vpd.sn, 0, "serial number");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "pn",
 	    CTLFLAG_RD, sc->params.vpd.pn, 0, "part number");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "ec",
 	    CTLFLAG_RD, sc->params.vpd.ec, 0, "engineering change");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "md_version",
 	    CTLFLAG_RD, sc->params.vpd.md, 0, "manufacturing diags version");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "na",
 	    CTLFLAG_RD, sc->params.vpd.na, 0, "network address");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "er_version", CTLFLAG_RD,
 	    sc->er_version, 0, "expansion ROM version");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "bs_version", CTLFLAG_RD,
 	    sc->bs_version, 0, "bootstrap firmware version");
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "scfg_version", CTLFLAG_RD,
 	    NULL, sc->params.scfg_vers, "serial config version");
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "vpd_version", CTLFLAG_RD,
 	    NULL, sc->params.vpd_vers, "VPD version");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "cf",
 	    CTLFLAG_RD, sc->cfg_file, 0, "configuration file");
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cfcsum", CTLFLAG_RD, NULL,
 	    sc->cfcsum, "config file checksum");
 
 #define SYSCTL_CAP(name, n, text) \
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, #name, \
 	    CTLTYPE_STRING | CTLFLAG_RD, caps_decoder[n], (uintptr_t)&sc->name, \
 	    sysctl_bitfield_16b, "A", "available " text " capabilities")
 
 	SYSCTL_CAP(nbmcaps, 0, "NBM");
 	SYSCTL_CAP(linkcaps, 1, "link");
 	SYSCTL_CAP(switchcaps, 2, "switch");
 	SYSCTL_CAP(niccaps, 3, "NIC");
 	SYSCTL_CAP(toecaps, 4, "TCP offload");
 	SYSCTL_CAP(rdmacaps, 5, "RDMA");
 	SYSCTL_CAP(iscsicaps, 6, "iSCSI");
 	SYSCTL_CAP(cryptocaps, 7, "crypto");
 	SYSCTL_CAP(fcoecaps, 8, "FCoE");
 #undef SYSCTL_CAP
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nfilters", CTLFLAG_RD,
 	    NULL, sc->tids.nftids, "number of filters");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT |
 	    CTLFLAG_RD, sc, 0, sysctl_temperature, "I",
 	    "chip temperature (in Celsius)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "loadavg", CTLTYPE_STRING |
 	    CTLFLAG_RD, sc, 0, sysctl_loadavg, "A",
 	    "microprocessor load averages (debug firmwares only)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "core_vdd", CTLFLAG_RD,
 	    &sc->params.core_vdd, 0, "core Vdd (in mV)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "local_cpus",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, LOCAL_CPUS,
 	    sysctl_cpus, "A", "local CPUs");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_cpus",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, INTR_CPUS,
 	    sysctl_cpus, "A", "preferred CPUs for interrupts");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "swintr", CTLFLAG_RW,
 	    &sc->swintr, 0, "software triggered interrupts");
 
 	/*
 	 * dev.t4nex.X.misc.  Marked CTLFLAG_SKIP to avoid information overload.
 	 */
 	oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "misc",
 	    CTLFLAG_RD | CTLFLAG_SKIP, NULL,
 	    "logs and miscellaneous information");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cctrl",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cctrl, "A", "congestion control");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp0",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 0 (TP0)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp1",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 1,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 1 (TP1)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ulp",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 2,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 2 (ULP)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge0",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 3,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 3 (SGE0)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge1",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 4,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 4 (SGE1)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ncsi",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 5,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 5 (NCSI)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_la",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_la,
 	    "A", "CIM logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ma_la",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cim_ma_la, "A", "CIM MA logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp0",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 0 (ULP0)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp1",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 1 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 1 (ULP1)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp2",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 2 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 2 (ULP2)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp3",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 3 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 3 (ULP3)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 4 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 4 (SGE)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ncsi",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 5 + CIM_NUM_IBQ,
 	    sysctl_cim_ibq_obq, "A", "CIM OBQ 5 (NCSI)");
 
 	if (chip_id(sc) > CHELSIO_T4) {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge0_rx",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, 6 + CIM_NUM_IBQ,
 		    sysctl_cim_ibq_obq, "A", "CIM OBQ 6 (SGE0-RX)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge1_rx",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, 7 + CIM_NUM_IBQ,
 		    sysctl_cim_ibq_obq, "A", "CIM OBQ 7 (SGE1-RX)");
 	}
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_pif_la",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cim_pif_la, "A", "CIM PIF logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_qcfg",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cim_qcfg, "A", "CIM queue configuration");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cpl_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_cpl_stats, "A", "CPL statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ddp_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_ddp_stats, "A", "non-TCP DDP statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "devlog",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_devlog, "A", "firmware's device log");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fcoe_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_fcoe_stats, "A", "FCoE statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "hw_sched",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_hw_sched, "A", "hardware scheduler ");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "l2t",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_l2t, "A", "hardware L2 table");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "smt",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_smt, "A", "hardware source MAC table");
 
 #ifdef INET6
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "clip",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_clip, "A", "active CLIP table entries");
 #endif
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "lb_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_lb_stats, "A", "loopback statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "meminfo",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_meminfo, "A", "memory regions");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "mps_tcam",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    chip_id(sc) <= CHELSIO_T5 ? sysctl_mps_tcam : sysctl_mps_tcam_t6,
 	    "A", "MPS TCAM entries");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "path_mtus",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_path_mtus, "A", "path MTUs");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pm_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_pm_stats, "A", "PM statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_rdma_stats, "A", "RDMA statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tcp_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_tcp_stats, "A", "TCP statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tids",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_tids, "A", "TID information");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_err_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_tp_err_stats, "A", "TP error statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la_mask",
 	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, sysctl_tp_la_mask, "I",
 	    "TP logic analyzer event capture mask");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_tp_la, "A", "TP logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tx_rate",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_tx_rate, "A", "Tx rate");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ulprx_la",
 	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    sysctl_ulprx_la, "A", "ULPRX logic analyzer");
 
 	if (chip_id(sc) >= CHELSIO_T5) {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "wcwr_stats",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 		    sysctl_wcwr_stats, "A", "write combined work requests");
 	}
 
 #ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 		int i;
 		char s[4];
 
 		/*
 		 * dev.t4nex.X.toe.
 		 */
 		oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "toe", CTLFLAG_RD,
 		    NULL, "TOE parameters");
 		children = SYSCTL_CHILDREN(oid);
 
 		sc->tt.cong_algorithm = -1;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_algorithm",
 		    CTLFLAG_RW, &sc->tt.cong_algorithm, 0, "congestion control "
 		    "(-1 = default, 0 = reno, 1 = tahoe, 2 = newreno, "
 		    "3 = highspeed)");
 
 		sc->tt.sndbuf = 256 * 1024;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sndbuf", CTLFLAG_RW,
 		    &sc->tt.sndbuf, 0, "max hardware send buffer size");
 
 		sc->tt.ddp = 0;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp", CTLFLAG_RW,
 		    &sc->tt.ddp, 0, "DDP allowed");
 
 		sc->tt.rx_coalesce = 1;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_coalesce",
 		    CTLFLAG_RW, &sc->tt.rx_coalesce, 0, "receive coalescing");
 
 		sc->tt.tls = 0;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tls", CTLFLAG_RW,
 		    &sc->tt.tls, 0, "Inline TLS allowed");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tls_rx_ports",
 		    CTLTYPE_INT | CTLFLAG_RW, sc, 0, sysctl_tls_rx_ports,
 		    "I", "TCP ports that use inline TLS+TOE RX");
 
 		sc->tt.tx_align = 1;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_align",
 		    CTLFLAG_RW, &sc->tt.tx_align, 0, "chop and align payload");
 
 		sc->tt.tx_zcopy = 0;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_zcopy",
 		    CTLFLAG_RW, &sc->tt.tx_zcopy, 0,
 		    "Enable zero-copy aio_write(2)");
 
 		sc->tt.cop_managed_offloading = !!t4_cop_managed_offloading;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
 		    "cop_managed_offloading", CTLFLAG_RW,
 		    &sc->tt.cop_managed_offloading, 0,
 		    "COP (Connection Offload Policy) controls all TOE offload");
 
 		sc->tt.autorcvbuf_inc = 16 * 1024;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "autorcvbuf_inc",
 		    CTLFLAG_RW, &sc->tt.autorcvbuf_inc, 0,
 		    "autorcvbuf increment");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timer_tick",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_tick, "A",
 		    "TP timer tick (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timestamp_tick",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, 1, sysctl_tp_tick, "A",
 		    "TCP timestamp tick (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_tick",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, 2, sysctl_tp_tick, "A",
 		    "DACK tick (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_timer",
 		    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, sysctl_tp_dack_timer,
 		    "IU", "DACK timer (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_min",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_RXT_MIN,
 		    sysctl_tp_timer, "LU", "Minimum retransmit interval (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_max",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_RXT_MAX,
 		    sysctl_tp_timer, "LU", "Maximum retransmit interval (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_min",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_PERS_MIN,
 		    sysctl_tp_timer, "LU", "Persist timer min (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_max",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_PERS_MAX,
 		    sysctl_tp_timer, "LU", "Persist timer max (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_idle",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_KEEP_IDLE,
 		    sysctl_tp_timer, "LU", "Keepalive idle timer (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_interval",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_KEEP_INTVL,
 		    sysctl_tp_timer, "LU", "Keepalive interval timer (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "initial_srtt",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_INIT_SRTT,
 		    sysctl_tp_timer, "LU", "Initial SRTT (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "finwait2_timer",
 		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_FINWAIT2_TIMER,
 		    sysctl_tp_timer, "LU", "FINWAIT2 timer (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "syn_rexmt_count",
 		    CTLTYPE_UINT | CTLFLAG_RD, sc, S_SYNSHIFTMAX,
 		    sysctl_tp_shift_cnt, "IU",
 		    "Number of SYN retransmissions before abort");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_count",
 		    CTLTYPE_UINT | CTLFLAG_RD, sc, S_RXTSHIFTMAXR2,
 		    sysctl_tp_shift_cnt, "IU",
 		    "Number of retransmissions before abort");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_count",
 		    CTLTYPE_UINT | CTLFLAG_RD, sc, S_KEEPALIVEMAXR2,
 		    sysctl_tp_shift_cnt, "IU",
 		    "Number of keepalive probes before abort");
 
 		oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "rexmt_backoff",
 		    CTLFLAG_RD, NULL, "TOE retransmit backoffs");
 		children = SYSCTL_CHILDREN(oid);
 		for (i = 0; i < 16; i++) {
 			snprintf(s, sizeof(s), "%u", i);
 			SYSCTL_ADD_PROC(ctx, children, OID_AUTO, s,
 			    CTLTYPE_UINT | CTLFLAG_RD, sc, i, sysctl_tp_backoff,
 			    "IU", "TOE retransmit backoff");
 		}
 	}
 #endif
 }
 
 void
 vi_sysctls(struct vi_info *vi)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *children;
 
 	ctx = device_get_sysctl_ctx(vi->dev);
 
 	/*
 	 * dev.v?(cxgbe|cxl).X.
 	 */
 	oid = device_get_sysctl_tree(vi->dev);
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "viid", CTLFLAG_RD, NULL,
 	    vi->viid, "VI identifer");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nrxq", CTLFLAG_RD,
 	    &vi->nrxq, 0, "# of rx queues");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ntxq", CTLFLAG_RD,
 	    &vi->ntxq, 0, "# of tx queues");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_rxq", CTLFLAG_RD,
 	    &vi->first_rxq, 0, "index of first rx queue");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD,
 	    &vi->first_txq, 0, "index of first tx queue");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rss_base", CTLFLAG_RD, NULL,
 	    vi->rss_base, "start of RSS indirection table");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rss_size", CTLFLAG_RD, NULL,
 	    vi->rss_size, "size of RSS indirection table");
 
 	if (IS_MAIN_VI(vi)) {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rsrv_noflowq",
 		    CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_noflowq, "IU",
 		    "Reserve queue 0 for non-flowid packets");
 	}
 
 #ifdef TCP_OFFLOAD
 	if (vi->nofldrxq != 0) {
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD,
 		    &vi->nofldrxq, 0,
 		    "# of rx queues for offloaded TCP connections");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_rxq",
 		    CTLFLAG_RD, &vi->first_ofld_rxq, 0,
 		    "index of first TOE rx queue");
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx_ofld",
 		    CTLTYPE_INT | CTLFLAG_RW, vi, 0,
 		    sysctl_holdoff_tmr_idx_ofld, "I",
 		    "holdoff timer index for TOE queues");
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx_ofld",
 		    CTLTYPE_INT | CTLFLAG_RW, vi, 0,
 		    sysctl_holdoff_pktc_idx_ofld, "I",
 		    "holdoff packet counter index for TOE queues");
 	}
 #endif
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	if (vi->nofldtxq != 0) {
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldtxq", CTLFLAG_RD,
 		    &vi->nofldtxq, 0,
 		    "# of tx queues for TOE/ETHOFLD");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_txq",
 		    CTLFLAG_RD, &vi->first_ofld_txq, 0,
 		    "index of first TOE/ETHOFLD tx queue");
 	}
 #endif
 #ifdef DEV_NETMAP
 	if (vi->nnmrxq != 0) {
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmrxq", CTLFLAG_RD,
 		    &vi->nnmrxq, 0, "# of netmap rx queues");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmtxq", CTLFLAG_RD,
 		    &vi->nnmtxq, 0, "# of netmap tx queues");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_rxq",
 		    CTLFLAG_RD, &vi->first_nm_rxq, 0,
 		    "index of first netmap rx queue");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_txq",
 		    CTLFLAG_RD, &vi->first_nm_txq, 0,
 		    "index of first netmap tx queue");
 	}
 #endif
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx",
 	    CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_holdoff_tmr_idx, "I",
 	    "holdoff timer index");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx",
 	    CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_holdoff_pktc_idx, "I",
 	    "holdoff packet counter index");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_rxq",
 	    CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_qsize_rxq, "I",
 	    "rx queue size");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_txq",
 	    CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_qsize_txq, "I",
 	    "tx queue size");
 }
 
 static void
 cxgbe_sysctls(struct port_info *pi)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *children, *children2;
 	struct adapter *sc = pi->adapter;
 	int i;
 	char name[16];
 	static char *tc_flags = {"\20\1USER\2SYNC\3ASYNC\4ERR"};
 
 	ctx = device_get_sysctl_ctx(pi->dev);
 
 	/*
 	 * dev.cxgbe.X.
 	 */
 	oid = device_get_sysctl_tree(pi->dev);
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "linkdnrc", CTLTYPE_STRING |
 	   CTLFLAG_RD, pi, 0, sysctl_linkdnrc, "A", "reason why link is down");
 	if (pi->port_type == FW_PORT_TYPE_BT_XAUI) {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature",
 		    CTLTYPE_INT | CTLFLAG_RD, pi, 0, sysctl_btphy, "I",
 		    "PHY temperature (in Celsius)");
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fw_version",
 		    CTLTYPE_INT | CTLFLAG_RD, pi, 1, sysctl_btphy, "I",
 		    "PHY firmware version");
 	}
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pause_settings",
 	    CTLTYPE_STRING | CTLFLAG_RW, pi, 0, sysctl_pause_settings, "A",
     "PAUSE settings (bit 0 = rx_pause, 1 = tx_pause, 2 = pause_autoneg)");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fec",
 	    CTLTYPE_STRING | CTLFLAG_RW, pi, 0, sysctl_fec, "A",
 	    "Forward Error Correction (bit 0 = RS, bit 1 = BASER_RS)");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "autoneg",
 	    CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_autoneg, "I",
 	    "autonegotiation (-1 = not supported)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "max_speed", CTLFLAG_RD, NULL,
 	    port_top_speed(pi), "max speed (in Gbps)");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "mps_bg_map", CTLFLAG_RD, NULL,
 	    pi->mps_bg_map, "MPS buffer group map");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_e_chan_map", CTLFLAG_RD,
 	    NULL, pi->rx_e_chan_map, "TP rx e-channel map");
 
 	if (sc->flags & IS_VF)
 		return;
 
 	/*
 	 * dev.(cxgbe|cxl).X.tc.
 	 */
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "tc", CTLFLAG_RD, NULL,
 	    "Tx scheduler traffic classes (cl_rl)");
 	children2 = SYSCTL_CHILDREN(oid);
 	SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "pktsize",
 	    CTLFLAG_RW, &pi->sched_params->pktsize, 0,
 	    "pktsize for per-flow cl-rl (0 means up to the driver )");
 	SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "burstsize",
 	    CTLFLAG_RW, &pi->sched_params->burstsize, 0,
 	    "burstsize for per-flow cl-rl (0 means up to the driver)");
 	for (i = 0; i < sc->chip_params->nsched_cls; i++) {
 		struct tx_cl_rl_params *tc = &pi->sched_params->cl_rl[i];
 
 		snprintf(name, sizeof(name), "%d", i);
 		children2 = SYSCTL_CHILDREN(SYSCTL_ADD_NODE(ctx,
 		    SYSCTL_CHILDREN(oid), OID_AUTO, name, CTLFLAG_RD, NULL,
 		    "traffic class"));
 		SYSCTL_ADD_PROC(ctx, children2, OID_AUTO, "flags",
 		    CTLTYPE_STRING | CTLFLAG_RD, tc_flags, (uintptr_t)&tc->flags,
 		    sysctl_bitfield_8b, "A", "flags");
 		SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "refcount",
 		    CTLFLAG_RD, &tc->refcount, 0, "references to this class");
 		SYSCTL_ADD_PROC(ctx, children2, OID_AUTO, "params",
 		    CTLTYPE_STRING | CTLFLAG_RD, sc, (pi->port_id << 16) | i,
 		    sysctl_tc_params, "A", "traffic class parameters");
 	}
 
 	/*
 	 * dev.cxgbe.X.stats.
 	 */
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "stats", CTLFLAG_RD,
 	    NULL, "port statistics");
 	children = SYSCTL_CHILDREN(oid);
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_parse_error", CTLFLAG_RD,
 	    &pi->tx_parse_error, 0,
 	    "# of tx packets with invalid length or # of segments");
 
 #define SYSCTL_ADD_T4_REG64(pi, name, desc, reg) \
 	SYSCTL_ADD_OID(ctx, children, OID_AUTO, name, \
 	    CTLTYPE_U64 | CTLFLAG_RD, sc, reg, \
 	    sysctl_handle_t4_reg64, "QU", desc)
 
 	SYSCTL_ADD_T4_REG64(pi, "tx_octets", "# of octets in good frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BYTES_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames", "total # of good frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_FRAMES_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_bcast_frames", "# of broadcast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_mcast_frames", "# of multicast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_MCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ucast_frames", "# of unicast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_UCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_error_frames", "# of error frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_64",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_64B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_65_127",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_65B_127B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_128_255",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_128B_255B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_256_511",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_256B_511B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_512_1023",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_512B_1023B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_1024_1518",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1024B_1518B_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_frames_1519_max",
 	    "# of tx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1519B_MAX_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_drop", "# of dropped tx frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_DROP_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_pause", "# of pause frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PAUSE_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp0", "# of PPP prio 0 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP0_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp1", "# of PPP prio 1 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP1_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp2", "# of PPP prio 2 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP2_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp3", "# of PPP prio 3 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP3_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp4", "# of PPP prio 4 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP4_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp5", "# of PPP prio 5 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP5_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp6", "# of PPP prio 6 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP6_L));
 	SYSCTL_ADD_T4_REG64(pi, "tx_ppp7", "# of PPP prio 7 frames transmitted",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP7_L));
 
 	SYSCTL_ADD_T4_REG64(pi, "rx_octets", "# of octets in good frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BYTES_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames", "total # of good frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_FRAMES_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_bcast_frames", "# of broadcast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_mcast_frames", "# of multicast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ucast_frames", "# of unicast frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_UCAST_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_too_long", "# of frames exceeding MTU",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_jabber", "# of jabber frames",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_fcs_err",
 	    "# of frames received with bad FCS",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_CRC_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_len_err",
 	    "# of frames received with length error",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LEN_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_symbol_err", "symbol errors",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_SYM_ERROR_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_runt", "# of short frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LESS_64B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_64",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_64B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_65_127",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_65B_127B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_128_255",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_128B_255B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_256_511",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_256B_511B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_512_1023",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_512B_1023B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_1024_1518",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1024B_1518B_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_frames_1519_max",
 	    "# of rx frames in this range",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1519B_MAX_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_pause", "# of pause frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PAUSE_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp0", "# of PPP prio 0 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP0_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp1", "# of PPP prio 1 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP1_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp2", "# of PPP prio 2 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP2_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp3", "# of PPP prio 3 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP3_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp4", "# of PPP prio 4 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP4_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp5", "# of PPP prio 5 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP5_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp6", "# of PPP prio 6 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP6_L));
 	SYSCTL_ADD_T4_REG64(pi, "rx_ppp7", "# of PPP prio 7 frames received",
 	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP7_L));
 
 #undef SYSCTL_ADD_T4_REG64
 
 #define SYSCTL_ADD_T4_PORTSTAT(name, desc) \
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, #name, CTLFLAG_RD, \
 	    &pi->stats.name, desc)
 
 	/* We get these from port_stats and they may be stale by up to 1s */
 	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow0,
 	    "# drops due to buffer-group 0 overflows");
 	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow1,
 	    "# drops due to buffer-group 1 overflows");
 	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow2,
 	    "# drops due to buffer-group 2 overflows");
 	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow3,
 	    "# drops due to buffer-group 3 overflows");
 	SYSCTL_ADD_T4_PORTSTAT(rx_trunc0,
 	    "# of buffer-group 0 truncated packets");
 	SYSCTL_ADD_T4_PORTSTAT(rx_trunc1,
 	    "# of buffer-group 1 truncated packets");
 	SYSCTL_ADD_T4_PORTSTAT(rx_trunc2,
 	    "# of buffer-group 2 truncated packets");
 	SYSCTL_ADD_T4_PORTSTAT(rx_trunc3,
 	    "# of buffer-group 3 truncated packets");
 
 #undef SYSCTL_ADD_T4_PORTSTAT
 
 	SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "tx_tls_records",
 	    CTLFLAG_RD, &pi->tx_tls_records,
 	    "# of TLS records transmitted");
 	SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "tx_tls_octets",
 	    CTLFLAG_RD, &pi->tx_tls_octets,
 	    "# of payload octets in transmitted TLS records");
 	SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "rx_tls_records",
 	    CTLFLAG_RD, &pi->rx_tls_records,
 	    "# of TLS records received");
 	SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "rx_tls_octets",
 	    CTLFLAG_RD, &pi->rx_tls_octets,
 	    "# of payload octets in received TLS records");
 }
 
 static int
 sysctl_int_array(SYSCTL_HANDLER_ARGS)
 {
 	int rc, *i, space = 0;
 	struct sbuf sb;
 
 	sbuf_new_for_sysctl(&sb, NULL, 64, req);
 	for (i = arg1; arg2; arg2 -= sizeof(int), i++) {
 		if (space)
 			sbuf_printf(&sb, " ");
 		sbuf_printf(&sb, "%d", *i);
 		space = 1;
 	}
 	rc = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (rc);
 }
 
 static int
 sysctl_bitfield_8b(SYSCTL_HANDLER_ARGS)
 {
 	int rc;
 	struct sbuf *sb;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return(rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb, "%b", *(uint8_t *)(uintptr_t)arg2, (char *)arg1);
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_bitfield_16b(SYSCTL_HANDLER_ARGS)
 {
 	int rc;
 	struct sbuf *sb;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return(rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb, "%b", *(uint16_t *)(uintptr_t)arg2, (char *)arg1);
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_btphy(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	int op = arg2;
 	struct adapter *sc = pi->adapter;
 	u_int v;
 	int rc;
 
 	rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4btt");
 	if (rc)
 		return (rc);
 	/* XXX: magic numbers */
 	rc = -t4_mdio_rd(sc, sc->mbox, pi->mdio_addr, 0x1e, op ? 0x20 : 0xc820,
 	    &v);
 	end_synchronized_op(sc, 0);
 	if (rc)
 		return (rc);
 	if (op == 0)
 		v /= 256;
 
 	rc = sysctl_handle_int(oidp, &v, 0, req);
 	return (rc);
 }
 
 static int
 sysctl_noflowq(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	int rc, val;
 
 	val = vi->rsrv_noflowq;
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if ((val >= 1) && (vi->ntxq > 1))
 		vi->rsrv_noflowq = 1;
 	else
 		vi->rsrv_noflowq = 0;
 
 	return (rc);
 }
 
 static int
 sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->pi->adapter;
 	int idx, rc, i;
 	struct sge_rxq *rxq;
 	uint8_t v;
 
 	idx = vi->tmr_idx;
 
 	rc = sysctl_handle_int(oidp, &idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (idx < 0 || idx >= SGE_NTIMERS)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4tmr");
 	if (rc)
 		return (rc);
 
 	v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(vi->pktc_idx != -1);
 	for_each_rxq(vi, i, rxq) {
 #ifdef atomic_store_rel_8
 		atomic_store_rel_8(&rxq->iq.intr_params, v);
 #else
 		rxq->iq.intr_params = v;
 #endif
 	}
 	vi->tmr_idx = idx;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (0);
 }
 
 static int
 sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->pi->adapter;
 	int idx, rc;
 
 	idx = vi->pktc_idx;
 
 	rc = sysctl_handle_int(oidp, &idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (idx < -1 || idx >= SGE_NCOUNTERS)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4pktc");
 	if (rc)
 		return (rc);
 
 	if (vi->flags & VI_INIT_DONE)
 		rc = EBUSY; /* cannot be changed once the queues are created */
 	else
 		vi->pktc_idx = idx;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (rc);
 }
 
 static int
 sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->pi->adapter;
 	int qsize, rc;
 
 	qsize = vi->qsize_rxq;
 
 	rc = sysctl_handle_int(oidp, &qsize, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (qsize < 128 || (qsize & 7))
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4rxqs");
 	if (rc)
 		return (rc);
 
 	if (vi->flags & VI_INIT_DONE)
 		rc = EBUSY; /* cannot be changed once the queues are created */
 	else
 		vi->qsize_rxq = qsize;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (rc);
 }
 
 static int
 sysctl_qsize_txq(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->pi->adapter;
 	int qsize, rc;
 
 	qsize = vi->qsize_txq;
 
 	rc = sysctl_handle_int(oidp, &qsize, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (qsize < 128 || qsize > 65536)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4txqs");
 	if (rc)
 		return (rc);
 
 	if (vi->flags & VI_INIT_DONE)
 		rc = EBUSY; /* cannot be changed once the queues are created */
 	else
 		vi->qsize_txq = qsize;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (rc);
 }
 
 static int
 sysctl_pause_settings(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 	int rc;
 
 	if (req->newptr == NULL) {
 		struct sbuf *sb;
 		static char *bits = "\20\1RX\2TX\3AUTO";
 
 		rc = sysctl_wire_old_buffer(req, 0);
 		if (rc != 0)
 			return(rc);
 
 		sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 		if (sb == NULL)
 			return (ENOMEM);
 
 		if (lc->link_ok) {
 			sbuf_printf(sb, "%b", (lc->fc & (PAUSE_TX | PAUSE_RX)) |
 			    (lc->requested_fc & PAUSE_AUTONEG), bits);
 		} else {
 			sbuf_printf(sb, "%b", lc->requested_fc & (PAUSE_TX |
 			    PAUSE_RX | PAUSE_AUTONEG), bits);
 		}
 		rc = sbuf_finish(sb);
 		sbuf_delete(sb);
 	} else {
 		char s[2];
 		int n;
 
 		s[0] = '0' + (lc->requested_fc & (PAUSE_TX | PAUSE_RX |
 		    PAUSE_AUTONEG));
 		s[1] = 0;
 
 		rc = sysctl_handle_string(oidp, s, sizeof(s), req);
 		if (rc != 0)
 			return(rc);
 
 		if (s[1] != 0)
 			return (EINVAL);
 		if (s[0] < '0' || s[0] > '9')
 			return (EINVAL);	/* not a number */
 		n = s[0] - '0';
 		if (n & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG))
 			return (EINVAL);	/* some other bit is set too */
 
 		rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK,
 		    "t4PAUSE");
 		if (rc)
 			return (rc);
 		PORT_LOCK(pi);
 		lc->requested_fc = n;
 		fixup_link_config(pi);
 		if (pi->up_vis > 0)
 			rc = apply_link_config(pi);
 		set_current_media(pi);
 		PORT_UNLOCK(pi);
 		end_synchronized_op(sc, 0);
 	}
 
 	return (rc);
 }
 
 static int
 sysctl_fec(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 	int rc;
 	int8_t old;
 
 	if (req->newptr == NULL) {
 		struct sbuf *sb;
 		static char *bits = "\20\1RS\2BASE-R\3RSVD1\4RSVD2\5RSVD3\6AUTO";
 
 		rc = sysctl_wire_old_buffer(req, 0);
 		if (rc != 0)
 			return(rc);
 
 		sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 		if (sb == NULL)
 			return (ENOMEM);
 
 		/*
 		 * Display the requested_fec when the link is down -- the actual
 		 * FEC makes sense only when the link is up.
 		 */
 		if (lc->link_ok) {
 			sbuf_printf(sb, "%b", (lc->fec & M_FW_PORT_CAP32_FEC) |
 			    (lc->requested_fec & FEC_AUTO), bits);
 		} else {
 			sbuf_printf(sb, "%b", lc->requested_fec, bits);
 		}
 		rc = sbuf_finish(sb);
 		sbuf_delete(sb);
 	} else {
 		char s[3];
 		int n;
 
 		snprintf(s, sizeof(s), "%d",
 		    lc->requested_fec == FEC_AUTO ? -1 :
 		    lc->requested_fec & M_FW_PORT_CAP32_FEC);
 
 		rc = sysctl_handle_string(oidp, s, sizeof(s), req);
 		if (rc != 0)
 			return(rc);
 
 		n = strtol(&s[0], NULL, 0);
 		if (n < 0 || n & FEC_AUTO)
 			n = FEC_AUTO;
 		else {
 			if (n & ~M_FW_PORT_CAP32_FEC)
 				return (EINVAL);/* some other bit is set too */
 			if (!powerof2(n))
 				return (EINVAL);/* one bit can be set at most */
 		}
 
 		rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK,
 		    "t4fec");
 		if (rc)
 			return (rc);
 		PORT_LOCK(pi);
 		old = lc->requested_fec;
 		if (n == FEC_AUTO)
 			lc->requested_fec = FEC_AUTO;
 		else if (n == 0)
 			lc->requested_fec = FEC_NONE;
 		else {
 			if ((lc->supported | V_FW_PORT_CAP32_FEC(n)) !=
 			    lc->supported) {
 				rc = ENOTSUP;
 				goto done;
 			}
 			lc->requested_fec = n;
 		}
 		fixup_link_config(pi);
 		if (pi->up_vis > 0) {
 			rc = apply_link_config(pi);
 			if (rc != 0) {
 				lc->requested_fec = old;
 				if (rc == FW_EPROTO)
 					rc = ENOTSUP;
 			}
 		}
 done:
 		PORT_UNLOCK(pi);
 		end_synchronized_op(sc, 0);
 	}
 
 	return (rc);
 }
 
 static int
 sysctl_autoneg(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 	int rc, val;
 
 	if (lc->supported & FW_PORT_CAP32_ANEG)
 		val = lc->requested_aneg == AUTONEG_DISABLE ? 0 : 1;
 	else
 		val = -1;
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 	if (val == 0)
 		val = AUTONEG_DISABLE;
 	else if (val == 1)
 		val = AUTONEG_ENABLE;
 	else
 		val = AUTONEG_AUTO;
 
 	rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK,
 	    "t4aneg");
 	if (rc)
 		return (rc);
 	PORT_LOCK(pi);
 	if (val == AUTONEG_ENABLE && !(lc->supported & FW_PORT_CAP32_ANEG)) {
 		rc = ENOTSUP;
 		goto done;
 	}
 	lc->requested_aneg = val;
 	fixup_link_config(pi);
 	if (pi->up_vis > 0)
 		rc = apply_link_config(pi);
 	set_current_media(pi);
 done:
 	PORT_UNLOCK(pi);
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int reg = arg2;
 	uint64_t val;
 
 	val = t4_read_reg64(sc, reg);
 
 	return (sysctl_handle_64(oidp, &val, 0, req));
 }
 
 static int
 sysctl_temperature(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int rc, t;
 	uint32_t param, val;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4temp");
 	if (rc)
 		return (rc);
 	param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) |
 	    V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_TMP);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 	end_synchronized_op(sc, 0);
 	if (rc)
 		return (rc);
 
 	/* unknown is returned as 0 but we display -1 in that case */
 	t = val == 0 ? -1 : val;
 
 	rc = sysctl_handle_int(oidp, &t, 0, req);
 	return (rc);
 }
 
 static int
 sysctl_loadavg(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	uint32_t param, val;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4lavg");
 	if (rc)
 		return (rc);
 	param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_LOAD);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 	end_synchronized_op(sc, 0);
 	if (rc)
 		return (rc);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	if (val == 0xffffffff) {
 		/* Only debug and custom firmwares report load averages. */
 		sbuf_printf(sb, "not available");
 	} else {
 		sbuf_printf(sb, "%d %d %d", val & 0xff, (val >> 8) & 0xff,
 		    (val >> 16) & 0xff);
 	}
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_cctrl(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 	uint16_t incr[NMTUS][NCCTRL_WIN];
 	static const char *dec_fac[] = {
 		"0.5", "0.5625", "0.625", "0.6875", "0.75", "0.8125", "0.875",
 		"0.9375"
 	};
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	t4_read_cong_tbl(sc, incr);
 
 	for (i = 0; i < NCCTRL_WIN; ++i) {
 		sbuf_printf(sb, "%2d: %4u %4u %4u %4u %4u %4u %4u %4u\n", i,
 		    incr[0][i], incr[1][i], incr[2][i], incr[3][i], incr[4][i],
 		    incr[5][i], incr[6][i], incr[7][i]);
 		sbuf_printf(sb, "%8u %4u %4u %4u %4u %4u %4u %4u %5u %s\n",
 		    incr[8][i], incr[9][i], incr[10][i], incr[11][i],
 		    incr[12][i], incr[13][i], incr[14][i], incr[15][i],
 		    sc->params.a_wnd[i], dec_fac[sc->params.b_wnd[i]]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static const char *qname[CIM_NUM_IBQ + CIM_NUM_OBQ_T5] = {
 	"TP0", "TP1", "ULP", "SGE0", "SGE1", "NC-SI",	/* ibq's */
 	"ULP0", "ULP1", "ULP2", "ULP3", "SGE", "NC-SI",	/* obq's */
 	"SGE0-RX", "SGE1-RX"	/* additional obq's (T5 onwards) */
 };
 
 static int
 sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i, n, qid = arg2;
 	uint32_t *buf, *p;
 	char *qtype;
 	u_int cim_num_obq = sc->chip_params->cim_num_obq;
 
 	KASSERT(qid >= 0 && qid < CIM_NUM_IBQ + cim_num_obq,
 	    ("%s: bad qid %d\n", __func__, qid));
 
 	if (qid < CIM_NUM_IBQ) {
 		/* inbound queue */
 		qtype = "IBQ";
 		n = 4 * CIM_IBQ_SIZE;
 		buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK);
 		rc = t4_read_cim_ibq(sc, qid, buf, n);
 	} else {
 		/* outbound queue */
 		qtype = "OBQ";
 		qid -= CIM_NUM_IBQ;
 		n = 4 * cim_num_obq * CIM_OBQ_SIZE;
 		buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK);
 		rc = t4_read_cim_obq(sc, qid, buf, n);
 	}
 
 	if (rc < 0) {
 		rc = -rc;
 		goto done;
 	}
 	n = rc * sizeof(uint32_t);	/* rc has # of words actually read */
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		goto done;
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req);
 	if (sb == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 
 	sbuf_printf(sb, "%s%d %s", qtype , qid, qname[arg2]);
 	for (i = 0, p = buf; i < n; i += 16, p += 4)
 		sbuf_printf(sb, "\n%#06x: %08x %08x %08x %08x", i, p[0], p[1],
 		    p[2], p[3]);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 done:
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static void
 sbuf_cim_la4(struct adapter *sc, struct sbuf *sb, uint32_t *buf, uint32_t cfg)
 {
 	uint32_t *p;
 
 	sbuf_printf(sb, "Status   Data      PC%s",
 	    cfg & F_UPDBGLACAPTPCONLY ? "" :
 	    "     LS0Stat  LS0Addr             LS0Data");
 
 	for (p = buf; p <= &buf[sc->params.cim_la_size - 8]; p += 8) {
 		if (cfg & F_UPDBGLACAPTPCONLY) {
 			sbuf_printf(sb, "\n  %02x   %08x %08x", p[5] & 0xff,
 			    p[6], p[7]);
 			sbuf_printf(sb, "\n  %02x   %02x%06x %02x%06x",
 			    (p[3] >> 8) & 0xff, p[3] & 0xff, p[4] >> 8,
 			    p[4] & 0xff, p[5] >> 8);
 			sbuf_printf(sb, "\n  %02x   %x%07x %x%07x",
 			    (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4,
 			    p[1] & 0xf, p[2] >> 4);
 		} else {
 			sbuf_printf(sb,
 			    "\n  %02x   %x%07x %x%07x %08x %08x "
 			    "%08x%08x%08x%08x",
 			    (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4,
 			    p[1] & 0xf, p[2] >> 4, p[2] & 0xf, p[3], p[4], p[5],
 			    p[6], p[7]);
 		}
 	}
 }
 
 static void
 sbuf_cim_la6(struct adapter *sc, struct sbuf *sb, uint32_t *buf, uint32_t cfg)
 {
 	uint32_t *p;
 
 	sbuf_printf(sb, "Status   Inst    Data      PC%s",
 	    cfg & F_UPDBGLACAPTPCONLY ? "" :
 	    "     LS0Stat  LS0Addr  LS0Data  LS1Stat  LS1Addr  LS1Data");
 
 	for (p = buf; p <= &buf[sc->params.cim_la_size - 10]; p += 10) {
 		if (cfg & F_UPDBGLACAPTPCONLY) {
 			sbuf_printf(sb, "\n  %02x   %08x %08x %08x",
 			    p[3] & 0xff, p[2], p[1], p[0]);
 			sbuf_printf(sb, "\n  %02x   %02x%06x %02x%06x %02x%06x",
 			    (p[6] >> 8) & 0xff, p[6] & 0xff, p[5] >> 8,
 			    p[5] & 0xff, p[4] >> 8, p[4] & 0xff, p[3] >> 8);
 			sbuf_printf(sb, "\n  %02x   %04x%04x %04x%04x %04x%04x",
 			    (p[9] >> 16) & 0xff, p[9] & 0xffff, p[8] >> 16,
 			    p[8] & 0xffff, p[7] >> 16, p[7] & 0xffff,
 			    p[6] >> 16);
 		} else {
 			sbuf_printf(sb, "\n  %02x   %04x%04x %04x%04x %04x%04x "
 			    "%08x %08x %08x %08x %08x %08x",
 			    (p[9] >> 16) & 0xff,
 			    p[9] & 0xffff, p[8] >> 16,
 			    p[8] & 0xffff, p[7] >> 16,
 			    p[7] & 0xffff, p[6] >> 16,
 			    p[2], p[1], p[0], p[5], p[4], p[3]);
 		}
 	}
 }
 
 static int
 sbuf_cim_la(struct adapter *sc, struct sbuf *sb, int flags)
 {
 	uint32_t cfg, *buf;
 	int rc;
 
 	rc = -t4_cim_read(sc, A_UP_UP_DBG_LA_CFG, 1, &cfg);
 	if (rc != 0)
 		return (rc);
 
 	MPASS(flags == M_WAITOK || flags == M_NOWAIT);
 	buf = malloc(sc->params.cim_la_size * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | flags);
 	if (buf == NULL)
 		return (ENOMEM);
 
 	rc = -t4_cim_read_la(sc, buf, NULL);
 	if (rc != 0)
 		goto done;
 	if (chip_id(sc) < CHELSIO_T6)
 		sbuf_cim_la4(sc, sb, buf, cfg);
 	else
 		sbuf_cim_la6(sc, sb, buf, cfg);
 
 done:
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 sysctl_cim_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	rc = sbuf_cim_la(sc, sb, M_WAITOK);
 	if (rc == 0)
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	return (rc);
 }
 
 bool
 t4_os_dump_cimla(struct adapter *sc, int arg, bool verbose)
 {
 	struct sbuf sb;
 	int rc;
 
 	if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb)
 		return (false);
 	rc = sbuf_cim_la(sc, &sb, M_NOWAIT);
 	if (rc == 0) {
 		rc = sbuf_finish(&sb);
 		if (rc == 0) {
 			log(LOG_DEBUG, "%s: CIM LA dump follows.\n%s",
 		    		device_get_nameunit(sc->dev), sbuf_data(&sb));
 		}
 	}
 	sbuf_delete(&sb);
 	return (false);
 }
 
 static int
 sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	u_int i;
 	struct sbuf *sb;
 	uint32_t *buf, *p;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(2 * CIM_MALA_SIZE * 5 * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	t4_cim_read_ma_la(sc, buf, buf + 5 * CIM_MALA_SIZE);
 	p = buf;
 
 	for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) {
 		sbuf_printf(sb, "\n%02x%08x%08x%08x%08x", p[4], p[3], p[2],
 		    p[1], p[0]);
 	}
 
 	sbuf_printf(sb, "\n\nCnt ID Tag UE       Data       RDY VLD");
 	for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) {
 		sbuf_printf(sb, "\n%3u %2u  %x   %u %08x%08x  %u   %u",
 		    (p[2] >> 10) & 0xff, (p[2] >> 7) & 7,
 		    (p[2] >> 3) & 0xf, (p[2] >> 2) & 1,
 		    (p[1] >> 2) | ((p[2] & 3) << 30),
 		    (p[0] >> 2) | ((p[1] & 3) << 30), (p[0] >> 1) & 1,
 		    p[0] & 1);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	u_int i;
 	struct sbuf *sb;
 	uint32_t *buf, *p;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(2 * CIM_PIFLA_SIZE * 6 * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	t4_cim_read_pif_la(sc, buf, buf + 6 * CIM_PIFLA_SIZE, NULL, NULL);
 	p = buf;
 
 	sbuf_printf(sb, "Cntl ID DataBE   Addr                 Data");
 	for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) {
 		sbuf_printf(sb, "\n %02x  %02x  %04x  %08x %08x%08x%08x%08x",
 		    (p[5] >> 22) & 0xff, (p[5] >> 16) & 0x3f, p[5] & 0xffff,
 		    p[4], p[3], p[2], p[1], p[0]);
 	}
 
 	sbuf_printf(sb, "\n\nCntl ID               Data");
 	for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) {
 		sbuf_printf(sb, "\n %02x  %02x %08x%08x%08x%08x",
 		    (p[4] >> 6) & 0xff, p[4] & 0x3f, p[3], p[2], p[1], p[0]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 	uint16_t base[CIM_NUM_IBQ + CIM_NUM_OBQ_T5];
 	uint16_t size[CIM_NUM_IBQ + CIM_NUM_OBQ_T5];
 	uint16_t thres[CIM_NUM_IBQ];
 	uint32_t obq_wr[2 * CIM_NUM_OBQ_T5], *wr = obq_wr;
 	uint32_t stat[4 * (CIM_NUM_IBQ + CIM_NUM_OBQ_T5)], *p = stat;
 	u_int cim_num_obq, ibq_rdaddr, obq_rdaddr, nq;
 
 	cim_num_obq = sc->chip_params->cim_num_obq;
 	if (is_t4(sc)) {
 		ibq_rdaddr = A_UP_IBQ_0_RDADDR;
 		obq_rdaddr = A_UP_OBQ_0_REALADDR;
 	} else {
 		ibq_rdaddr = A_UP_IBQ_0_SHADOW_RDADDR;
 		obq_rdaddr = A_UP_OBQ_0_SHADOW_REALADDR;
 	}
 	nq = CIM_NUM_IBQ + cim_num_obq;
 
 	rc = -t4_cim_read(sc, ibq_rdaddr, 4 * nq, stat);
 	if (rc == 0)
 		rc = -t4_cim_read(sc, obq_rdaddr, 2 * cim_num_obq, obq_wr);
 	if (rc != 0)
 		return (rc);
 
 	t4_read_cimq_cfg(sc, base, size, thres);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb,
 	    "  Queue  Base  Size Thres  RdPtr WrPtr  SOP  EOP Avail");
 
 	for (i = 0; i < CIM_NUM_IBQ; i++, p += 4)
 		sbuf_printf(sb, "\n%7s %5x %5u %5u %6x  %4x %4u %4u %5u",
 		    qname[i], base[i], size[i], thres[i], G_IBQRDADDR(p[0]),
 		    G_IBQWRADDR(p[1]), G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]),
 		    G_QUEREMFLITS(p[2]) * 16);
 	for ( ; i < nq; i++, p += 4, wr += 2)
 		sbuf_printf(sb, "\n%7s %5x %5u %12x  %4x %4u %4u %5u", qname[i],
 		    base[i], size[i], G_QUERDADDR(p[0]) & 0x3fff,
 		    wr[0] - base[i], G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]),
 		    G_QUEREMFLITS(p[2]) * 16);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_cpl_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_cpl_stats stats;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	mtx_lock(&sc->reg_lock);
 	t4_tp_get_cpl_stats(sc, &stats, 0);
 	mtx_unlock(&sc->reg_lock);
 
 	if (sc->chip_params->nchan > 2) {
 		sbuf_printf(sb, "                 channel 0  channel 1"
 		    "  channel 2  channel 3");
 		sbuf_printf(sb, "\nCPL requests:   %10u %10u %10u %10u",
 		    stats.req[0], stats.req[1], stats.req[2], stats.req[3]);
 		sbuf_printf(sb, "\nCPL responses:   %10u %10u %10u %10u",
 		    stats.rsp[0], stats.rsp[1], stats.rsp[2], stats.rsp[3]);
 	} else {
 		sbuf_printf(sb, "                 channel 0  channel 1");
 		sbuf_printf(sb, "\nCPL requests:   %10u %10u",
 		    stats.req[0], stats.req[1]);
 		sbuf_printf(sb, "\nCPL responses:   %10u %10u",
 		    stats.rsp[0], stats.rsp[1]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_ddp_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_usm_stats stats;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return(rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	t4_get_usm_stats(sc, &stats, 1);
 
 	sbuf_printf(sb, "Frames: %u\n", stats.frames);
 	sbuf_printf(sb, "Octets: %ju\n", stats.octets);
 	sbuf_printf(sb, "Drops:  %u", stats.drops);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static const char * const devlog_level_strings[] = {
 	[FW_DEVLOG_LEVEL_EMERG]		= "EMERG",
 	[FW_DEVLOG_LEVEL_CRIT]		= "CRIT",
 	[FW_DEVLOG_LEVEL_ERR]		= "ERR",
 	[FW_DEVLOG_LEVEL_NOTICE]	= "NOTICE",
 	[FW_DEVLOG_LEVEL_INFO]		= "INFO",
 	[FW_DEVLOG_LEVEL_DEBUG]		= "DEBUG"
 };
 
 static const char * const devlog_facility_strings[] = {
 	[FW_DEVLOG_FACILITY_CORE]	= "CORE",
 	[FW_DEVLOG_FACILITY_CF]		= "CF",
 	[FW_DEVLOG_FACILITY_SCHED]	= "SCHED",
 	[FW_DEVLOG_FACILITY_TIMER]	= "TIMER",
 	[FW_DEVLOG_FACILITY_RES]	= "RES",
 	[FW_DEVLOG_FACILITY_HW]		= "HW",
 	[FW_DEVLOG_FACILITY_FLR]	= "FLR",
 	[FW_DEVLOG_FACILITY_DMAQ]	= "DMAQ",
 	[FW_DEVLOG_FACILITY_PHY]	= "PHY",
 	[FW_DEVLOG_FACILITY_MAC]	= "MAC",
 	[FW_DEVLOG_FACILITY_PORT]	= "PORT",
 	[FW_DEVLOG_FACILITY_VI]		= "VI",
 	[FW_DEVLOG_FACILITY_FILTER]	= "FILTER",
 	[FW_DEVLOG_FACILITY_ACL]	= "ACL",
 	[FW_DEVLOG_FACILITY_TM]		= "TM",
 	[FW_DEVLOG_FACILITY_QFC]	= "QFC",
 	[FW_DEVLOG_FACILITY_DCB]	= "DCB",
 	[FW_DEVLOG_FACILITY_ETH]	= "ETH",
 	[FW_DEVLOG_FACILITY_OFLD]	= "OFLD",
 	[FW_DEVLOG_FACILITY_RI]		= "RI",
 	[FW_DEVLOG_FACILITY_ISCSI]	= "ISCSI",
 	[FW_DEVLOG_FACILITY_FCOE]	= "FCOE",
 	[FW_DEVLOG_FACILITY_FOISCSI]	= "FOISCSI",
 	[FW_DEVLOG_FACILITY_FOFCOE]	= "FOFCOE",
 	[FW_DEVLOG_FACILITY_CHNET]	= "CHNET",
 };
 
 static int
 sbuf_devlog(struct adapter *sc, struct sbuf *sb, int flags)
 {
 	int i, j, rc, nentries, first = 0;
 	struct devlog_params *dparams = &sc->params.devlog;
 	struct fw_devlog_e *buf, *e;
 	uint64_t ftstamp = UINT64_MAX;
 
 	if (dparams->addr == 0)
 		return (ENXIO);
 
 	MPASS(flags == M_WAITOK || flags == M_NOWAIT);
 	buf = malloc(dparams->size, M_CXGBE, M_ZERO | flags);
 	if (buf == NULL)
 		return (ENOMEM);
 
 	rc = read_via_memwin(sc, 1, dparams->addr, (void *)buf, dparams->size);
 	if (rc != 0)
 		goto done;
 
 	nentries = dparams->size / sizeof(struct fw_devlog_e);
 	for (i = 0; i < nentries; i++) {
 		e = &buf[i];
 
 		if (e->timestamp == 0)
 			break;	/* end */
 
 		e->timestamp = be64toh(e->timestamp);
 		e->seqno = be32toh(e->seqno);
 		for (j = 0; j < 8; j++)
 			e->params[j] = be32toh(e->params[j]);
 
 		if (e->timestamp < ftstamp) {
 			ftstamp = e->timestamp;
 			first = i;
 		}
 	}
 
 	if (buf[first].timestamp == 0)
 		goto done;	/* nothing in the log */
 
 	sbuf_printf(sb, "%10s  %15s  %8s  %8s  %s\n",
 	    "Seq#", "Tstamp", "Level", "Facility", "Message");
 
 	i = first;
 	do {
 		e = &buf[i];
 		if (e->timestamp == 0)
 			break;	/* end */
 
 		sbuf_printf(sb, "%10d  %15ju  %8s  %8s  ",
 		    e->seqno, e->timestamp,
 		    (e->level < nitems(devlog_level_strings) ?
 			devlog_level_strings[e->level] : "UNKNOWN"),
 		    (e->facility < nitems(devlog_facility_strings) ?
 			devlog_facility_strings[e->facility] : "UNKNOWN"));
 		sbuf_printf(sb, e->fmt, e->params[0], e->params[1],
 		    e->params[2], e->params[3], e->params[4],
 		    e->params[5], e->params[6], e->params[7]);
 
 		if (++i == nentries)
 			i = 0;
 	} while (i != first);
 done:
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 sysctl_devlog(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int rc;
 	struct sbuf *sb;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	rc = sbuf_devlog(sc, sb, M_WAITOK);
 	if (rc == 0)
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	return (rc);
 }
 
 void
 t4_os_dump_devlog(struct adapter *sc)
 {
 	int rc;
 	struct sbuf sb;
 
 	if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb)
 		return;
 	rc = sbuf_devlog(sc, &sb, M_NOWAIT);
 	if (rc == 0) {
 		rc = sbuf_finish(&sb);
 		if (rc == 0) {
 			log(LOG_DEBUG, "%s: device log follows.\n%s",
 		    		device_get_nameunit(sc->dev), sbuf_data(&sb));
 		}
 	}
 	sbuf_delete(&sb);
 }
 
 static int
 sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_fcoe_stats stats[MAX_NCHAN];
 	int i, nchan = sc->chip_params->nchan;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	for (i = 0; i < nchan; i++)
 		t4_get_fcoe_stats(sc, i, &stats[i], 1);
 
 	if (nchan > 2) {
 		sbuf_printf(sb, "                   channel 0        channel 1"
 		    "        channel 2        channel 3");
 		sbuf_printf(sb, "\noctetsDDP:  %16ju %16ju %16ju %16ju",
 		    stats[0].octets_ddp, stats[1].octets_ddp,
 		    stats[2].octets_ddp, stats[3].octets_ddp);
 		sbuf_printf(sb, "\nframesDDP:  %16u %16u %16u %16u",
 		    stats[0].frames_ddp, stats[1].frames_ddp,
 		    stats[2].frames_ddp, stats[3].frames_ddp);
 		sbuf_printf(sb, "\nframesDrop: %16u %16u %16u %16u",
 		    stats[0].frames_drop, stats[1].frames_drop,
 		    stats[2].frames_drop, stats[3].frames_drop);
 	} else {
 		sbuf_printf(sb, "                   channel 0        channel 1");
 		sbuf_printf(sb, "\noctetsDDP:  %16ju %16ju",
 		    stats[0].octets_ddp, stats[1].octets_ddp);
 		sbuf_printf(sb, "\nframesDDP:  %16u %16u",
 		    stats[0].frames_ddp, stats[1].frames_ddp);
 		sbuf_printf(sb, "\nframesDrop: %16u %16u",
 		    stats[0].frames_drop, stats[1].frames_drop);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_hw_sched(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 	unsigned int map, kbps, ipg, mode;
 	unsigned int pace_tab[NTX_SCHED];
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	map = t4_read_reg(sc, A_TP_TX_MOD_QUEUE_REQ_MAP);
 	mode = G_TIMERMODE(t4_read_reg(sc, A_TP_MOD_CONFIG));
 	t4_read_pace_tbl(sc, pace_tab);
 
 	sbuf_printf(sb, "Scheduler  Mode   Channel  Rate (Kbps)   "
 	    "Class IPG (0.1 ns)   Flow IPG (us)");
 
 	for (i = 0; i < NTX_SCHED; ++i, map >>= 2) {
 		t4_get_tx_sched(sc, i, &kbps, &ipg, 1);
 		sbuf_printf(sb, "\n    %u      %-5s     %u     ", i,
 		    (mode & (1 << i)) ? "flow" : "class", map & 3);
 		if (kbps)
 			sbuf_printf(sb, "%9u     ", kbps);
 		else
 			sbuf_printf(sb, " disabled     ");
 
 		if (ipg)
 			sbuf_printf(sb, "%13u        ", ipg);
 		else
 			sbuf_printf(sb, "     disabled        ");
 
 		if (pace_tab[i])
 			sbuf_printf(sb, "%10u", pace_tab[i]);
 		else
 			sbuf_printf(sb, "  disabled");
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_lb_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i, j;
 	uint64_t *p0, *p1;
 	struct lb_port_stats s[2];
 	static const char *stat_name[] = {
 		"OctetsOK:", "FramesOK:", "BcastFrames:", "McastFrames:",
 		"UcastFrames:", "ErrorFrames:", "Frames64:", "Frames65To127:",
 		"Frames128To255:", "Frames256To511:", "Frames512To1023:",
 		"Frames1024To1518:", "Frames1519ToMax:", "FramesDropped:",
 		"BG0FramesDropped:", "BG1FramesDropped:", "BG2FramesDropped:",
 		"BG3FramesDropped:", "BG0FramesTrunc:", "BG1FramesTrunc:",
 		"BG2FramesTrunc:", "BG3FramesTrunc:"
 	};
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	memset(s, 0, sizeof(s));
 
 	for (i = 0; i < sc->chip_params->nchan; i += 2) {
 		t4_get_lb_stats(sc, i, &s[0]);
 		t4_get_lb_stats(sc, i + 1, &s[1]);
 
 		p0 = &s[0].octets;
 		p1 = &s[1].octets;
 		sbuf_printf(sb, "%s                       Loopback %u"
 		    "           Loopback %u", i == 0 ? "" : "\n", i, i + 1);
 
 		for (j = 0; j < nitems(stat_name); j++)
 			sbuf_printf(sb, "\n%-17s %20ju %20ju", stat_name[j],
 				   *p0++, *p1++);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_linkdnrc(SYSCTL_HANDLER_ARGS)
 {
 	int rc = 0;
 	struct port_info *pi = arg1;
 	struct link_config *lc = &pi->link_cfg;
 	struct sbuf *sb;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return(rc);
 	sb = sbuf_new_for_sysctl(NULL, NULL, 64, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	if (lc->link_ok || lc->link_down_rc == 255)
 		sbuf_printf(sb, "n/a");
 	else
 		sbuf_printf(sb, "%s", t4_link_down_rc_str(lc->link_down_rc));
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 struct mem_desc {
 	unsigned int base;
 	unsigned int limit;
 	unsigned int idx;
 };
 
 static int
 mem_desc_cmp(const void *a, const void *b)
 {
 	return ((const struct mem_desc *)a)->base -
 	       ((const struct mem_desc *)b)->base;
 }
 
 static void
 mem_region_show(struct sbuf *sb, const char *name, unsigned int from,
     unsigned int to)
 {
 	unsigned int size;
 
 	if (from == to)
 		return;
 
 	size = to - from + 1;
 	if (size == 0)
 		return;
 
 	/* XXX: need humanize_number(3) in libkern for a more readable 'size' */
 	sbuf_printf(sb, "%-15s %#x-%#x [%u]\n", name, from, to, size);
 }
 
 static int
 sysctl_meminfo(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i, n;
 	uint32_t lo, hi, used, alloc;
 	static const char *memory[] = {"EDC0:", "EDC1:", "MC:", "MC0:", "MC1:"};
 	static const char *region[] = {
 		"DBQ contexts:", "IMSG contexts:", "FLM cache:", "TCBs:",
 		"Pstructs:", "Timers:", "Rx FL:", "Tx FL:", "Pstruct FL:",
 		"Tx payload:", "Rx payload:", "LE hash:", "iSCSI region:",
 		"TDDP region:", "TPT region:", "STAG region:", "RQ region:",
 		"RQUDP region:", "PBL region:", "TXPBL region:",
 		"DBVFIFO region:", "ULPRX state:", "ULPTX state:",
 		"On-chip queues:", "TLS keys:",
 	};
 	struct mem_desc avail[4];
 	struct mem_desc mem[nitems(region) + 3];	/* up to 3 holes */
 	struct mem_desc *md = mem;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	for (i = 0; i < nitems(mem); i++) {
 		mem[i].limit = 0;
 		mem[i].idx = i;
 	}
 
 	/* Find and sort the populated memory ranges */
 	i = 0;
 	lo = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
 	if (lo & F_EDRAM0_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EDRAM0_BAR);
 		avail[i].base = G_EDRAM0_BASE(hi) << 20;
 		avail[i].limit = avail[i].base + (G_EDRAM0_SIZE(hi) << 20);
 		avail[i].idx = 0;
 		i++;
 	}
 	if (lo & F_EDRAM1_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EDRAM1_BAR);
 		avail[i].base = G_EDRAM1_BASE(hi) << 20;
 		avail[i].limit = avail[i].base + (G_EDRAM1_SIZE(hi) << 20);
 		avail[i].idx = 1;
 		i++;
 	}
 	if (lo & F_EXT_MEM_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR);
 		avail[i].base = G_EXT_MEM_BASE(hi) << 20;
 		avail[i].limit = avail[i].base +
 		    (G_EXT_MEM_SIZE(hi) << 20);
 		avail[i].idx = is_t5(sc) ? 3 : 2;	/* Call it MC0 for T5 */
 		i++;
 	}
 	if (is_t5(sc) && lo & F_EXT_MEM1_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR);
 		avail[i].base = G_EXT_MEM1_BASE(hi) << 20;
 		avail[i].limit = avail[i].base +
 		    (G_EXT_MEM1_SIZE(hi) << 20);
 		avail[i].idx = 4;
 		i++;
 	}
 	if (!i)                                    /* no memory available */
 		return 0;
 	qsort(avail, i, sizeof(struct mem_desc), mem_desc_cmp);
 
 	(md++)->base = t4_read_reg(sc, A_SGE_DBQ_CTXT_BADDR);
 	(md++)->base = t4_read_reg(sc, A_SGE_IMSG_CTXT_BADDR);
 	(md++)->base = t4_read_reg(sc, A_SGE_FLM_CACHE_BADDR);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_TCB_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_TIMER_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_RX_FLST_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_TX_FLST_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_PS_FLST_BASE);
 
 	/* the next few have explicit upper bounds */
 	md->base = t4_read_reg(sc, A_TP_PMM_TX_BASE);
 	md->limit = md->base - 1 +
 		    t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE) *
 		    G_PMTXMAXPAGE(t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE));
 	md++;
 
 	md->base = t4_read_reg(sc, A_TP_PMM_RX_BASE);
 	md->limit = md->base - 1 +
 		    t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) *
 		    G_PMRXMAXPAGE(t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE));
 	md++;
 
 	if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) {
 		if (chip_id(sc) <= CHELSIO_T5)
 			md->base = t4_read_reg(sc, A_LE_DB_HASH_TID_BASE);
 		else
 			md->base = t4_read_reg(sc, A_LE_DB_HASH_TBL_BASE_ADDR);
 		md->limit = 0;
 	} else {
 		md->base = 0;
 		md->idx = nitems(region);  /* hide it */
 	}
 	md++;
 
 #define ulp_region(reg) \
 	md->base = t4_read_reg(sc, A_ULP_ ## reg ## _LLIMIT);\
 	(md++)->limit = t4_read_reg(sc, A_ULP_ ## reg ## _ULIMIT)
 
 	ulp_region(RX_ISCSI);
 	ulp_region(RX_TDDP);
 	ulp_region(TX_TPT);
 	ulp_region(RX_STAG);
 	ulp_region(RX_RQ);
 	ulp_region(RX_RQUDP);
 	ulp_region(RX_PBL);
 	ulp_region(TX_PBL);
 #undef ulp_region
 
 	md->base = 0;
 	md->idx = nitems(region);
 	if (!is_t4(sc)) {
 		uint32_t size = 0;
 		uint32_t sge_ctrl = t4_read_reg(sc, A_SGE_CONTROL2);
 		uint32_t fifo_size = t4_read_reg(sc, A_SGE_DBVFIFO_SIZE);
 
 		if (is_t5(sc)) {
 			if (sge_ctrl & F_VFIFO_ENABLE)
 				size = G_DBVFIFO_SIZE(fifo_size);
 		} else
 			size = G_T6_DBVFIFO_SIZE(fifo_size);
 
 		if (size) {
 			md->base = G_BASEADDR(t4_read_reg(sc,
 			    A_SGE_DBVFIFO_BADDR));
 			md->limit = md->base + (size << 2) - 1;
 		}
 	}
 	md++;
 
 	md->base = t4_read_reg(sc, A_ULP_RX_CTX_BASE);
 	md->limit = 0;
 	md++;
 	md->base = t4_read_reg(sc, A_ULP_TX_ERR_TABLE_BASE);
 	md->limit = 0;
 	md++;
 
 	md->base = sc->vres.ocq.start;
 	if (sc->vres.ocq.size)
 		md->limit = md->base + sc->vres.ocq.size - 1;
 	else
 		md->idx = nitems(region);  /* hide it */
 	md++;
 
 	md->base = sc->vres.key.start;
 	if (sc->vres.key.size)
 		md->limit = md->base + sc->vres.key.size - 1;
 	else
 		md->idx = nitems(region);  /* hide it */
 	md++;
 
 	/* add any address-space holes, there can be up to 3 */
 	for (n = 0; n < i - 1; n++)
 		if (avail[n].limit < avail[n + 1].base)
 			(md++)->base = avail[n].limit;
 	if (avail[n].limit)
 		(md++)->base = avail[n].limit;
 
 	n = md - mem;
 	qsort(mem, n, sizeof(struct mem_desc), mem_desc_cmp);
 
 	for (lo = 0; lo < i; lo++)
 		mem_region_show(sb, memory[avail[lo].idx], avail[lo].base,
 				avail[lo].limit - 1);
 
 	sbuf_printf(sb, "\n");
 	for (i = 0; i < n; i++) {
 		if (mem[i].idx >= nitems(region))
 			continue;                        /* skip holes */
 		if (!mem[i].limit)
 			mem[i].limit = i < n - 1 ? mem[i + 1].base - 1 : ~0;
 		mem_region_show(sb, region[mem[i].idx], mem[i].base,
 				mem[i].limit);
 	}
 
 	sbuf_printf(sb, "\n");
 	lo = t4_read_reg(sc, A_CIM_SDRAM_BASE_ADDR);
 	hi = t4_read_reg(sc, A_CIM_SDRAM_ADDR_SIZE) + lo - 1;
 	mem_region_show(sb, "uP RAM:", lo, hi);
 
 	lo = t4_read_reg(sc, A_CIM_EXTMEM2_BASE_ADDR);
 	hi = t4_read_reg(sc, A_CIM_EXTMEM2_ADDR_SIZE) + lo - 1;
 	mem_region_show(sb, "uP Extmem2:", lo, hi);
 
 	lo = t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE);
 	sbuf_printf(sb, "\n%u Rx pages of size %uKiB for %u channels\n",
 		   G_PMRXMAXPAGE(lo),
 		   t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) >> 10,
 		   (lo & F_PMRXNUMCHN) ? 2 : 1);
 
 	lo = t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE);
 	hi = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
 	sbuf_printf(sb, "%u Tx pages of size %u%ciB for %u channels\n",
 		   G_PMTXMAXPAGE(lo),
 		   hi >= (1 << 20) ? (hi >> 20) : (hi >> 10),
 		   hi >= (1 << 20) ? 'M' : 'K', 1 << G_PMTXNUMCHN(lo));
 	sbuf_printf(sb, "%u p-structs\n",
 		   t4_read_reg(sc, A_TP_CMM_MM_MAX_PSTRUCT));
 
 	for (i = 0; i < 4; i++) {
 		if (chip_id(sc) > CHELSIO_T5)
 			lo = t4_read_reg(sc, A_MPS_RX_MAC_BG_PG_CNT0 + i * 4);
 		else
 			lo = t4_read_reg(sc, A_MPS_RX_PG_RSV0 + i * 4);
 		if (is_t5(sc)) {
 			used = G_T5_USED(lo);
 			alloc = G_T5_ALLOC(lo);
 		} else {
 			used = G_USED(lo);
 			alloc = G_ALLOC(lo);
 		}
 		/* For T6 these are MAC buffer groups */
 		sbuf_printf(sb, "\nPort %d using %u pages out of %u allocated",
 		    i, used, alloc);
 	}
 	for (i = 0; i < sc->chip_params->nchan; i++) {
 		if (chip_id(sc) > CHELSIO_T5)
 			lo = t4_read_reg(sc, A_MPS_RX_LPBK_BG_PG_CNT0 + i * 4);
 		else
 			lo = t4_read_reg(sc, A_MPS_RX_PG_RSV4 + i * 4);
 		if (is_t5(sc)) {
 			used = G_T5_USED(lo);
 			alloc = G_T5_ALLOC(lo);
 		} else {
 			used = G_USED(lo);
 			alloc = G_ALLOC(lo);
 		}
 		/* For T6 these are MAC buffer groups */
 		sbuf_printf(sb,
 		    "\nLoopback %d using %u pages out of %u allocated",
 		    i, used, alloc);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static inline void
 tcamxy2valmask(uint64_t x, uint64_t y, uint8_t *addr, uint64_t *mask)
 {
 	*mask = x | y;
 	y = htobe64(y);
 	memcpy(addr, (char *)&y + 2, ETHER_ADDR_LEN);
 }
 
 static int
 sysctl_mps_tcam(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 
 	MPASS(chip_id(sc) <= CHELSIO_T5);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb,
 	    "Idx  Ethernet address     Mask     Vld Ports PF"
 	    "  VF              Replication             P0 P1 P2 P3  ML");
 	for (i = 0; i < sc->chip_params->mps_tcam_size; i++) {
 		uint64_t tcamx, tcamy, mask;
 		uint32_t cls_lo, cls_hi;
 		uint8_t addr[ETHER_ADDR_LEN];
 
 		tcamy = t4_read_reg64(sc, MPS_CLS_TCAM_Y_L(i));
 		tcamx = t4_read_reg64(sc, MPS_CLS_TCAM_X_L(i));
 		if (tcamx & tcamy)
 			continue;
 		tcamxy2valmask(tcamx, tcamy, addr, &mask);
 		cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i));
 		cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i));
 		sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x %012jx"
 			   "  %c   %#x%4u%4d", i, addr[0], addr[1], addr[2],
 			   addr[3], addr[4], addr[5], (uintmax_t)mask,
 			   (cls_lo & F_SRAM_VLD) ? 'Y' : 'N',
 			   G_PORTMAP(cls_hi), G_PF(cls_lo),
 			   (cls_lo & F_VF_VALID) ? G_VF(cls_lo) : -1);
 
 		if (cls_lo & F_REPLICATE) {
 			struct fw_ldst_cmd ldst_cmd;
 
 			memset(&ldst_cmd, 0, sizeof(ldst_cmd));
 			ldst_cmd.op_to_addrspace =
 			    htobe32(V_FW_CMD_OP(FW_LDST_CMD) |
 				F_FW_CMD_REQUEST | F_FW_CMD_READ |
 				V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS));
 			ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd));
 			ldst_cmd.u.mps.rplc.fid_idx =
 			    htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) |
 				V_FW_LDST_CMD_IDX(i));
 
 			rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
 			    "t4mps");
 			if (rc)
 				break;
 			rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd,
 			    sizeof(ldst_cmd), &ldst_cmd);
 			end_synchronized_op(sc, 0);
 
 			if (rc != 0) {
 				sbuf_printf(sb, "%36d", rc);
 				rc = 0;
 			} else {
 				sbuf_printf(sb, " %08x %08x %08x %08x",
 				    be32toh(ldst_cmd.u.mps.rplc.rplc127_96),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc95_64),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc63_32),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc31_0));
 			}
 		} else
 			sbuf_printf(sb, "%36s", "");
 
 		sbuf_printf(sb, "%4u%3u%3u%3u %#3x", G_SRAM_PRIO0(cls_lo),
 		    G_SRAM_PRIO1(cls_lo), G_SRAM_PRIO2(cls_lo),
 		    G_SRAM_PRIO3(cls_lo), (cls_lo >> S_MULTILISTEN0) & 0xf);
 	}
 
 	if (rc)
 		(void) sbuf_finish(sb);
 	else
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 
 	MPASS(chip_id(sc) > CHELSIO_T5);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb, "Idx  Ethernet address     Mask       VNI   Mask"
 	    "   IVLAN Vld DIP_Hit   Lookup  Port Vld Ports PF  VF"
 	    "                           Replication"
 	    "                                    P0 P1 P2 P3  ML\n");
 
 	for (i = 0; i < sc->chip_params->mps_tcam_size; i++) {
 		uint8_t dip_hit, vlan_vld, lookup_type, port_num;
 		uint16_t ivlan;
 		uint64_t tcamx, tcamy, val, mask;
 		uint32_t cls_lo, cls_hi, ctl, data2, vnix, vniy;
 		uint8_t addr[ETHER_ADDR_LEN];
 
 		ctl = V_CTLREQID(1) | V_CTLCMDTYPE(0) | V_CTLXYBITSEL(0);
 		if (i < 256)
 			ctl |= V_CTLTCAMINDEX(i) | V_CTLTCAMSEL(0);
 		else
 			ctl |= V_CTLTCAMINDEX(i - 256) | V_CTLTCAMSEL(1);
 		t4_write_reg(sc, A_MPS_CLS_TCAM_DATA2_CTL, ctl);
 		val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1);
 		tcamy = G_DMACH(val) << 32;
 		tcamy |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1);
 		data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1);
 		lookup_type = G_DATALKPTYPE(data2);
 		port_num = G_DATAPORTNUM(data2);
 		if (lookup_type && lookup_type != M_DATALKPTYPE) {
 			/* Inner header VNI */
 			vniy = ((data2 & F_DATAVIDH2) << 23) |
 				       (G_DATAVIDH1(data2) << 16) | G_VIDL(val);
 			dip_hit = data2 & F_DATADIPHIT;
 			vlan_vld = 0;
 		} else {
 			vniy = 0;
 			dip_hit = 0;
 			vlan_vld = data2 & F_DATAVIDH2;
 			ivlan = G_VIDL(val);
 		}
 
 		ctl |= V_CTLXYBITSEL(1);
 		t4_write_reg(sc, A_MPS_CLS_TCAM_DATA2_CTL, ctl);
 		val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1);
 		tcamx = G_DMACH(val) << 32;
 		tcamx |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1);
 		data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1);
 		if (lookup_type && lookup_type != M_DATALKPTYPE) {
 			/* Inner header VNI mask */
 			vnix = ((data2 & F_DATAVIDH2) << 23) |
 			       (G_DATAVIDH1(data2) << 16) | G_VIDL(val);
 		} else
 			vnix = 0;
 
 		if (tcamx & tcamy)
 			continue;
 		tcamxy2valmask(tcamx, tcamy, addr, &mask);
 
 		cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i));
 		cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i));
 
 		if (lookup_type && lookup_type != M_DATALKPTYPE) {
 			sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x "
 			    "%012jx %06x %06x    -    -   %3c"
 			    "      'I'  %4x   %3c   %#x%4u%4d", i, addr[0],
 			    addr[1], addr[2], addr[3], addr[4], addr[5],
 			    (uintmax_t)mask, vniy, vnix, dip_hit ? 'Y' : 'N',
 			    port_num, cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N',
 			    G_PORTMAP(cls_hi), G_T6_PF(cls_lo),
 			    cls_lo & F_T6_VF_VALID ? G_T6_VF(cls_lo) : -1);
 		} else {
 			sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x "
 			    "%012jx    -       -   ", i, addr[0], addr[1],
 			    addr[2], addr[3], addr[4], addr[5],
 			    (uintmax_t)mask);
 
 			if (vlan_vld)
 				sbuf_printf(sb, "%4u   Y     ", ivlan);
 			else
 				sbuf_printf(sb, "  -    N     ");
 
 			sbuf_printf(sb, "-      %3c  %4x   %3c   %#x%4u%4d",
 			    lookup_type ? 'I' : 'O', port_num,
 			    cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N',
 			    G_PORTMAP(cls_hi), G_T6_PF(cls_lo),
 			    cls_lo & F_T6_VF_VALID ? G_T6_VF(cls_lo) : -1);
 		}
 
 
 		if (cls_lo & F_T6_REPLICATE) {
 			struct fw_ldst_cmd ldst_cmd;
 
 			memset(&ldst_cmd, 0, sizeof(ldst_cmd));
 			ldst_cmd.op_to_addrspace =
 			    htobe32(V_FW_CMD_OP(FW_LDST_CMD) |
 				F_FW_CMD_REQUEST | F_FW_CMD_READ |
 				V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS));
 			ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd));
 			ldst_cmd.u.mps.rplc.fid_idx =
 			    htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) |
 				V_FW_LDST_CMD_IDX(i));
 
 			rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
 			    "t6mps");
 			if (rc)
 				break;
 			rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd,
 			    sizeof(ldst_cmd), &ldst_cmd);
 			end_synchronized_op(sc, 0);
 
 			if (rc != 0) {
 				sbuf_printf(sb, "%72d", rc);
 				rc = 0;
 			} else {
 				sbuf_printf(sb, " %08x %08x %08x %08x"
 				    " %08x %08x %08x %08x",
 				    be32toh(ldst_cmd.u.mps.rplc.rplc255_224),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc223_192),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc191_160),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc159_128),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc127_96),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc95_64),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc63_32),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc31_0));
 			}
 		} else
 			sbuf_printf(sb, "%72s", "");
 
 		sbuf_printf(sb, "%4u%3u%3u%3u %#x",
 		    G_T6_SRAM_PRIO0(cls_lo), G_T6_SRAM_PRIO1(cls_lo),
 		    G_T6_SRAM_PRIO2(cls_lo), G_T6_SRAM_PRIO3(cls_lo),
 		    (cls_lo >> S_T6_MULTILISTEN0) & 0xf);
 	}
 
 	if (rc)
 		(void) sbuf_finish(sb);
 	else
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_path_mtus(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	uint16_t mtus[NMTUS];
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	t4_read_mtu_tbl(sc, mtus, NULL);
 
 	sbuf_printf(sb, "%u %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u",
 	    mtus[0], mtus[1], mtus[2], mtus[3], mtus[4], mtus[5], mtus[6],
 	    mtus[7], mtus[8], mtus[9], mtus[10], mtus[11], mtus[12], mtus[13],
 	    mtus[14], mtus[15]);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_pm_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 	uint32_t tx_cnt[MAX_PM_NSTATS], rx_cnt[MAX_PM_NSTATS];
 	uint64_t tx_cyc[MAX_PM_NSTATS], rx_cyc[MAX_PM_NSTATS];
 	static const char *tx_stats[MAX_PM_NSTATS] = {
 		"Read:", "Write bypass:", "Write mem:", "Bypass + mem:",
 		"Tx FIFO wait", NULL, "Tx latency"
 	};
 	static const char *rx_stats[MAX_PM_NSTATS] = {
 		"Read:", "Write bypass:", "Write mem:", "Flush:",
 		"Rx FIFO wait", NULL, "Rx latency"
 	};
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	t4_pmtx_get_stats(sc, tx_cnt, tx_cyc);
 	t4_pmrx_get_stats(sc, rx_cnt, rx_cyc);
 
 	sbuf_printf(sb, "                Tx pcmds             Tx bytes");
 	for (i = 0; i < 4; i++) {
 		sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i],
 		    tx_cyc[i]);
 	}
 
 	sbuf_printf(sb, "\n                Rx pcmds             Rx bytes");
 	for (i = 0; i < 4; i++) {
 		sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i],
 		    rx_cyc[i]);
 	}
 
 	if (chip_id(sc) > CHELSIO_T5) {
 		sbuf_printf(sb,
 		    "\n              Total wait      Total occupancy");
 		sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i],
 		    tx_cyc[i]);
 		sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i],
 		    rx_cyc[i]);
 
 		i += 2;
 		MPASS(i < nitems(tx_stats));
 
 		sbuf_printf(sb,
 		    "\n                   Reads           Total wait");
 		sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i],
 		    tx_cyc[i]);
 		sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i],
 		    rx_cyc[i]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_rdma_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_rdma_stats stats;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	mtx_lock(&sc->reg_lock);
 	t4_tp_get_rdma_stats(sc, &stats, 0);
 	mtx_unlock(&sc->reg_lock);
 
 	sbuf_printf(sb, "NoRQEModDefferals: %u\n", stats.rqe_dfr_mod);
 	sbuf_printf(sb, "NoRQEPktDefferals: %u", stats.rqe_dfr_pkt);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_tcp_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_tcp_stats v4, v6;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	mtx_lock(&sc->reg_lock);
 	t4_tp_get_tcp_stats(sc, &v4, &v6, 0);
 	mtx_unlock(&sc->reg_lock);
 
 	sbuf_printf(sb,
 	    "                                IP                 IPv6\n");
 	sbuf_printf(sb, "OutRsts:      %20u %20u\n",
 	    v4.tcp_out_rsts, v6.tcp_out_rsts);
 	sbuf_printf(sb, "InSegs:       %20ju %20ju\n",
 	    v4.tcp_in_segs, v6.tcp_in_segs);
 	sbuf_printf(sb, "OutSegs:      %20ju %20ju\n",
 	    v4.tcp_out_segs, v6.tcp_out_segs);
 	sbuf_printf(sb, "RetransSegs:  %20ju %20ju",
 	    v4.tcp_retrans_segs, v6.tcp_retrans_segs);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_tids(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tid_info *t = &sc->tids;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	if (t->natids) {
 		sbuf_printf(sb, "ATID range: 0-%u, in use: %u\n", t->natids - 1,
 		    t->atids_in_use);
 	}
 
 	if (t->nhpftids) {
 		sbuf_printf(sb, "HPFTID range: %u-%u, in use: %u\n",
 		    t->hpftid_base, t->hpftid_end, t->hpftids_in_use);
 	}
 
 	if (t->ntids) {
 		sbuf_printf(sb, "TID range: ");
 		if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) {
 			uint32_t b, hb;
 
 			if (chip_id(sc) <= CHELSIO_T5) {
 				b = t4_read_reg(sc, A_LE_DB_SERVER_INDEX) / 4;
 				hb = t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4;
 			} else {
 				b = t4_read_reg(sc, A_LE_DB_SRVR_START_INDEX);
 				hb = t4_read_reg(sc, A_T6_LE_DB_HASH_TID_BASE);
 			}
 
 			if (b)
 				sbuf_printf(sb, "%u-%u, ", t->tid_base, b - 1);
 			sbuf_printf(sb, "%u-%u", hb, t->ntids - 1);
 		} else
 			sbuf_printf(sb, "%u-%u", t->tid_base, t->ntids - 1);
 		sbuf_printf(sb, ", in use: %u\n",
 		    atomic_load_acq_int(&t->tids_in_use));
 	}
 
 	if (t->nstids) {
 		sbuf_printf(sb, "STID range: %u-%u, in use: %u\n", t->stid_base,
 		    t->stid_base + t->nstids - 1, t->stids_in_use);
 	}
 
 	if (t->nftids) {
 		sbuf_printf(sb, "FTID range: %u-%u, in use: %u\n", t->ftid_base,
 		    t->ftid_end, t->ftids_in_use);
 	}
 
 	if (t->netids) {
 		sbuf_printf(sb, "ETID range: %u-%u, in use: %u\n", t->etid_base,
 		    t->etid_base + t->netids - 1, t->etids_in_use);
 	}
 
 	sbuf_printf(sb, "HW TID usage: %u IP users, %u IPv6 users",
 	    t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV4),
 	    t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV6));
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_err_stats stats;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	mtx_lock(&sc->reg_lock);
 	t4_tp_get_err_stats(sc, &stats, 0);
 	mtx_unlock(&sc->reg_lock);
 
 	if (sc->chip_params->nchan > 2) {
 		sbuf_printf(sb, "                 channel 0  channel 1"
 		    "  channel 2  channel 3\n");
 		sbuf_printf(sb, "macInErrs:      %10u %10u %10u %10u\n",
 		    stats.mac_in_errs[0], stats.mac_in_errs[1],
 		    stats.mac_in_errs[2], stats.mac_in_errs[3]);
 		sbuf_printf(sb, "hdrInErrs:      %10u %10u %10u %10u\n",
 		    stats.hdr_in_errs[0], stats.hdr_in_errs[1],
 		    stats.hdr_in_errs[2], stats.hdr_in_errs[3]);
 		sbuf_printf(sb, "tcpInErrs:      %10u %10u %10u %10u\n",
 		    stats.tcp_in_errs[0], stats.tcp_in_errs[1],
 		    stats.tcp_in_errs[2], stats.tcp_in_errs[3]);
 		sbuf_printf(sb, "tcp6InErrs:     %10u %10u %10u %10u\n",
 		    stats.tcp6_in_errs[0], stats.tcp6_in_errs[1],
 		    stats.tcp6_in_errs[2], stats.tcp6_in_errs[3]);
 		sbuf_printf(sb, "tnlCongDrops:   %10u %10u %10u %10u\n",
 		    stats.tnl_cong_drops[0], stats.tnl_cong_drops[1],
 		    stats.tnl_cong_drops[2], stats.tnl_cong_drops[3]);
 		sbuf_printf(sb, "tnlTxDrops:     %10u %10u %10u %10u\n",
 		    stats.tnl_tx_drops[0], stats.tnl_tx_drops[1],
 		    stats.tnl_tx_drops[2], stats.tnl_tx_drops[3]);
 		sbuf_printf(sb, "ofldVlanDrops:  %10u %10u %10u %10u\n",
 		    stats.ofld_vlan_drops[0], stats.ofld_vlan_drops[1],
 		    stats.ofld_vlan_drops[2], stats.ofld_vlan_drops[3]);
 		sbuf_printf(sb, "ofldChanDrops:  %10u %10u %10u %10u\n\n",
 		    stats.ofld_chan_drops[0], stats.ofld_chan_drops[1],
 		    stats.ofld_chan_drops[2], stats.ofld_chan_drops[3]);
 	} else {
 		sbuf_printf(sb, "                 channel 0  channel 1\n");
 		sbuf_printf(sb, "macInErrs:      %10u %10u\n",
 		    stats.mac_in_errs[0], stats.mac_in_errs[1]);
 		sbuf_printf(sb, "hdrInErrs:      %10u %10u\n",
 		    stats.hdr_in_errs[0], stats.hdr_in_errs[1]);
 		sbuf_printf(sb, "tcpInErrs:      %10u %10u\n",
 		    stats.tcp_in_errs[0], stats.tcp_in_errs[1]);
 		sbuf_printf(sb, "tcp6InErrs:     %10u %10u\n",
 		    stats.tcp6_in_errs[0], stats.tcp6_in_errs[1]);
 		sbuf_printf(sb, "tnlCongDrops:   %10u %10u\n",
 		    stats.tnl_cong_drops[0], stats.tnl_cong_drops[1]);
 		sbuf_printf(sb, "tnlTxDrops:     %10u %10u\n",
 		    stats.tnl_tx_drops[0], stats.tnl_tx_drops[1]);
 		sbuf_printf(sb, "ofldVlanDrops:  %10u %10u\n",
 		    stats.ofld_vlan_drops[0], stats.ofld_vlan_drops[1]);
 		sbuf_printf(sb, "ofldChanDrops:  %10u %10u\n\n",
 		    stats.ofld_chan_drops[0], stats.ofld_chan_drops[1]);
 	}
 
 	sbuf_printf(sb, "ofldNoNeigh:    %u\nofldCongDefer:  %u",
 	    stats.ofld_no_neigh, stats.ofld_cong_defer);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct tp_params *tpp = &sc->params.tp;
 	u_int mask;
 	int rc;
 
 	mask = tpp->la_mask >> 16;
 	rc = sysctl_handle_int(oidp, &mask, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 	if (mask > 0xffff)
 		return (EINVAL);
 	tpp->la_mask = mask << 16;
 	t4_set_reg_field(sc, A_TP_DBG_LA_CONFIG, 0xffff0000U, tpp->la_mask);
 
 	return (0);
 }
 
 struct field_desc {
 	const char *name;
 	u_int start;
 	u_int width;
 };
 
 static void
 field_desc_show(struct sbuf *sb, uint64_t v, const struct field_desc *f)
 {
 	char buf[32];
 	int line_size = 0;
 
 	while (f->name) {
 		uint64_t mask = (1ULL << f->width) - 1;
 		int len = snprintf(buf, sizeof(buf), "%s: %ju", f->name,
 		    ((uintmax_t)v >> f->start) & mask);
 
 		if (line_size + len >= 79) {
 			line_size = 8;
 			sbuf_printf(sb, "\n        ");
 		}
 		sbuf_printf(sb, "%s ", buf);
 		line_size += len + 1;
 		f++;
 	}
 	sbuf_printf(sb, "\n");
 }
 
 static const struct field_desc tp_la0[] = {
 	{ "RcfOpCodeOut", 60, 4 },
 	{ "State", 56, 4 },
 	{ "WcfState", 52, 4 },
 	{ "RcfOpcSrcOut", 50, 2 },
 	{ "CRxError", 49, 1 },
 	{ "ERxError", 48, 1 },
 	{ "SanityFailed", 47, 1 },
 	{ "SpuriousMsg", 46, 1 },
 	{ "FlushInputMsg", 45, 1 },
 	{ "FlushInputCpl", 44, 1 },
 	{ "RssUpBit", 43, 1 },
 	{ "RssFilterHit", 42, 1 },
 	{ "Tid", 32, 10 },
 	{ "InitTcb", 31, 1 },
 	{ "LineNumber", 24, 7 },
 	{ "Emsg", 23, 1 },
 	{ "EdataOut", 22, 1 },
 	{ "Cmsg", 21, 1 },
 	{ "CdataOut", 20, 1 },
 	{ "EreadPdu", 19, 1 },
 	{ "CreadPdu", 18, 1 },
 	{ "TunnelPkt", 17, 1 },
 	{ "RcfPeerFin", 16, 1 },
 	{ "RcfReasonOut", 12, 4 },
 	{ "TxCchannel", 10, 2 },
 	{ "RcfTxChannel", 8, 2 },
 	{ "RxEchannel", 6, 2 },
 	{ "RcfRxChannel", 5, 1 },
 	{ "RcfDataOutSrdy", 4, 1 },
 	{ "RxDvld", 3, 1 },
 	{ "RxOoDvld", 2, 1 },
 	{ "RxCongestion", 1, 1 },
 	{ "TxCongestion", 0, 1 },
 	{ NULL }
 };
 
 static const struct field_desc tp_la1[] = {
 	{ "CplCmdIn", 56, 8 },
 	{ "CplCmdOut", 48, 8 },
 	{ "ESynOut", 47, 1 },
 	{ "EAckOut", 46, 1 },
 	{ "EFinOut", 45, 1 },
 	{ "ERstOut", 44, 1 },
 	{ "SynIn", 43, 1 },
 	{ "AckIn", 42, 1 },
 	{ "FinIn", 41, 1 },
 	{ "RstIn", 40, 1 },
 	{ "DataIn", 39, 1 },
 	{ "DataInVld", 38, 1 },
 	{ "PadIn", 37, 1 },
 	{ "RxBufEmpty", 36, 1 },
 	{ "RxDdp", 35, 1 },
 	{ "RxFbCongestion", 34, 1 },
 	{ "TxFbCongestion", 33, 1 },
 	{ "TxPktSumSrdy", 32, 1 },
 	{ "RcfUlpType", 28, 4 },
 	{ "Eread", 27, 1 },
 	{ "Ebypass", 26, 1 },
 	{ "Esave", 25, 1 },
 	{ "Static0", 24, 1 },
 	{ "Cread", 23, 1 },
 	{ "Cbypass", 22, 1 },
 	{ "Csave", 21, 1 },
 	{ "CPktOut", 20, 1 },
 	{ "RxPagePoolFull", 18, 2 },
 	{ "RxLpbkPkt", 17, 1 },
 	{ "TxLpbkPkt", 16, 1 },
 	{ "RxVfValid", 15, 1 },
 	{ "SynLearned", 14, 1 },
 	{ "SetDelEntry", 13, 1 },
 	{ "SetInvEntry", 12, 1 },
 	{ "CpcmdDvld", 11, 1 },
 	{ "CpcmdSave", 10, 1 },
 	{ "RxPstructsFull", 8, 2 },
 	{ "EpcmdDvld", 7, 1 },
 	{ "EpcmdFlush", 6, 1 },
 	{ "EpcmdTrimPrefix", 5, 1 },
 	{ "EpcmdTrimPostfix", 4, 1 },
 	{ "ERssIp4Pkt", 3, 1 },
 	{ "ERssIp6Pkt", 2, 1 },
 	{ "ERssTcpUdpPkt", 1, 1 },
 	{ "ERssFceFipPkt", 0, 1 },
 	{ NULL }
 };
 
 static const struct field_desc tp_la2[] = {
 	{ "CplCmdIn", 56, 8 },
 	{ "MpsVfVld", 55, 1 },
 	{ "MpsPf", 52, 3 },
 	{ "MpsVf", 44, 8 },
 	{ "SynIn", 43, 1 },
 	{ "AckIn", 42, 1 },
 	{ "FinIn", 41, 1 },
 	{ "RstIn", 40, 1 },
 	{ "DataIn", 39, 1 },
 	{ "DataInVld", 38, 1 },
 	{ "PadIn", 37, 1 },
 	{ "RxBufEmpty", 36, 1 },
 	{ "RxDdp", 35, 1 },
 	{ "RxFbCongestion", 34, 1 },
 	{ "TxFbCongestion", 33, 1 },
 	{ "TxPktSumSrdy", 32, 1 },
 	{ "RcfUlpType", 28, 4 },
 	{ "Eread", 27, 1 },
 	{ "Ebypass", 26, 1 },
 	{ "Esave", 25, 1 },
 	{ "Static0", 24, 1 },
 	{ "Cread", 23, 1 },
 	{ "Cbypass", 22, 1 },
 	{ "Csave", 21, 1 },
 	{ "CPktOut", 20, 1 },
 	{ "RxPagePoolFull", 18, 2 },
 	{ "RxLpbkPkt", 17, 1 },
 	{ "TxLpbkPkt", 16, 1 },
 	{ "RxVfValid", 15, 1 },
 	{ "SynLearned", 14, 1 },
 	{ "SetDelEntry", 13, 1 },
 	{ "SetInvEntry", 12, 1 },
 	{ "CpcmdDvld", 11, 1 },
 	{ "CpcmdSave", 10, 1 },
 	{ "RxPstructsFull", 8, 2 },
 	{ "EpcmdDvld", 7, 1 },
 	{ "EpcmdFlush", 6, 1 },
 	{ "EpcmdTrimPrefix", 5, 1 },
 	{ "EpcmdTrimPostfix", 4, 1 },
 	{ "ERssIp4Pkt", 3, 1 },
 	{ "ERssIp6Pkt", 2, 1 },
 	{ "ERssTcpUdpPkt", 1, 1 },
 	{ "ERssFceFipPkt", 0, 1 },
 	{ NULL }
 };
 
 static void
 tp_la_show(struct sbuf *sb, uint64_t *p, int idx)
 {
 
 	field_desc_show(sb, *p, tp_la0);
 }
 
 static void
 tp_la_show2(struct sbuf *sb, uint64_t *p, int idx)
 {
 
 	if (idx)
 		sbuf_printf(sb, "\n");
 	field_desc_show(sb, p[0], tp_la0);
 	if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL)
 		field_desc_show(sb, p[1], tp_la0);
 }
 
 static void
 tp_la_show3(struct sbuf *sb, uint64_t *p, int idx)
 {
 
 	if (idx)
 		sbuf_printf(sb, "\n");
 	field_desc_show(sb, p[0], tp_la0);
 	if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL)
 		field_desc_show(sb, p[1], (p[0] & (1 << 17)) ? tp_la2 : tp_la1);
 }
 
 static int
 sysctl_tp_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	uint64_t *buf, *p;
 	int rc;
 	u_int i, inc;
 	void (*show_func)(struct sbuf *, uint64_t *, int);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(TPLA_SIZE * sizeof(uint64_t), M_CXGBE, M_ZERO | M_WAITOK);
 
 	t4_tp_read_la(sc, buf, NULL);
 	p = buf;
 
 	switch (G_DBGLAMODE(t4_read_reg(sc, A_TP_DBG_LA_CONFIG))) {
 	case 2:
 		inc = 2;
 		show_func = tp_la_show2;
 		break;
 	case 3:
 		inc = 2;
 		show_func = tp_la_show3;
 		break;
 	default:
 		inc = 1;
 		show_func = tp_la_show;
 	}
 
 	for (i = 0; i < TPLA_SIZE / inc; i++, p += inc)
 		(*show_func)(sb, p, i);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 sysctl_tx_rate(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	u64 nrate[MAX_NCHAN], orate[MAX_NCHAN];
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	t4_get_chan_txrate(sc, nrate, orate);
 
 	if (sc->chip_params->nchan > 2) {
 		sbuf_printf(sb, "              channel 0   channel 1"
 		    "   channel 2   channel 3\n");
 		sbuf_printf(sb, "NIC B/s:     %10ju  %10ju  %10ju  %10ju\n",
 		    nrate[0], nrate[1], nrate[2], nrate[3]);
 		sbuf_printf(sb, "Offload B/s: %10ju  %10ju  %10ju  %10ju",
 		    orate[0], orate[1], orate[2], orate[3]);
 	} else {
 		sbuf_printf(sb, "              channel 0   channel 1\n");
 		sbuf_printf(sb, "NIC B/s:     %10ju  %10ju\n",
 		    nrate[0], nrate[1]);
 		sbuf_printf(sb, "Offload B/s: %10ju  %10ju",
 		    orate[0], orate[1]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_ulprx_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	uint32_t *buf, *p;
 	int rc, i;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(ULPRX_LA_SIZE * 8 * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	t4_ulprx_read_la(sc, buf);
 	p = buf;
 
 	sbuf_printf(sb, "      Pcmd        Type   Message"
 	    "                Data");
 	for (i = 0; i < ULPRX_LA_SIZE; i++, p += 8) {
 		sbuf_printf(sb, "\n%08x%08x  %4x  %08x  %08x%08x%08x%08x",
 		    p[1], p[0], p[2], p[3], p[7], p[6], p[5], p[4]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, v;
 
 	MPASS(chip_id(sc) >= CHELSIO_T5);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	v = t4_read_reg(sc, A_SGE_STAT_CFG);
 	if (G_STATSOURCE_T5(v) == 7) {
 		int mode;
 
 		mode = is_t5(sc) ? G_STATMODE(v) : G_T6_STATMODE(v);
 		if (mode == 0) {
 			sbuf_printf(sb, "total %d, incomplete %d",
 			    t4_read_reg(sc, A_SGE_STAT_TOTAL),
 			    t4_read_reg(sc, A_SGE_STAT_MATCH));
 		} else if (mode == 1) {
 			sbuf_printf(sb, "total %d, data overflow %d",
 			    t4_read_reg(sc, A_SGE_STAT_TOTAL),
 			    t4_read_reg(sc, A_SGE_STAT_MATCH));
 		} else {
 			sbuf_printf(sb, "unknown mode %d", mode);
 		}
 	}
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_cpus(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	enum cpu_sets op = arg2;
 	cpuset_t cpuset;
 	struct sbuf *sb;
 	int i, rc;
 
 	MPASS(op == LOCAL_CPUS || op == INTR_CPUS);
 
 	CPU_ZERO(&cpuset);
 	rc = bus_get_cpus(sc->dev, op, sizeof(cpuset), &cpuset);
 	if (rc != 0)
 		return (rc);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	CPU_FOREACH(i)
 		sbuf_printf(sb, "%d ", i);
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 #ifdef TCP_OFFLOAD
 static int
 sysctl_tls_rx_ports(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int *old_ports, *new_ports;
 	int i, new_count, rc;
 
 	if (req->newptr == NULL && req->oldptr == NULL)
 		return (SYSCTL_OUT(req, NULL, imax(sc->tt.num_tls_rx_ports, 1) *
 		    sizeof(sc->tt.tls_rx_ports[0])));
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tlsrx");
 	if (rc)
 		return (rc);
 
 	if (sc->tt.num_tls_rx_ports == 0) {
 		i = -1;
 		rc = SYSCTL_OUT(req, &i, sizeof(i));
 	} else
 		rc = SYSCTL_OUT(req, sc->tt.tls_rx_ports,
 		    sc->tt.num_tls_rx_ports * sizeof(sc->tt.tls_rx_ports[0]));
 	if (rc == 0 && req->newptr != NULL) {
 		new_count = req->newlen / sizeof(new_ports[0]);
 		new_ports = malloc(new_count * sizeof(new_ports[0]), M_CXGBE,
 		    M_WAITOK);
 		rc = SYSCTL_IN(req, new_ports, new_count *
 		    sizeof(new_ports[0]));
 		if (rc)
 			goto err;
 
 		/* Allow setting to a single '-1' to clear the list. */
 		if (new_count == 1 && new_ports[0] == -1) {
 			ADAPTER_LOCK(sc);
 			old_ports = sc->tt.tls_rx_ports;
 			sc->tt.tls_rx_ports = NULL;
 			sc->tt.num_tls_rx_ports = 0;
 			ADAPTER_UNLOCK(sc);
 			free(old_ports, M_CXGBE);
 		} else {
 			for (i = 0; i < new_count; i++) {
 				if (new_ports[i] < 1 ||
 				    new_ports[i] > IPPORT_MAX) {
 					rc = EINVAL;
 					goto err;
 				}
 			}
 
 			ADAPTER_LOCK(sc);
 			old_ports = sc->tt.tls_rx_ports;
 			sc->tt.tls_rx_ports = new_ports;
 			sc->tt.num_tls_rx_ports = new_count;
 			ADAPTER_UNLOCK(sc);
 			free(old_ports, M_CXGBE);
 			new_ports = NULL;
 		}
 	err:
 		free(new_ports, M_CXGBE);
 	}
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static void
 unit_conv(char *buf, size_t len, u_int val, u_int factor)
 {
 	u_int rem = val % factor;
 
 	if (rem == 0)
 		snprintf(buf, len, "%u", val / factor);
 	else {
 		while (rem % 10 == 0)
 			rem /= 10;
 		snprintf(buf, len, "%u.%u", val / factor, rem);
 	}
 }
 
 static int
 sysctl_tp_tick(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	char buf[16];
 	u_int res, re;
 	u_int cclk_ps = 1000000000 / sc->params.vpd.cclk;
 
 	res = t4_read_reg(sc, A_TP_TIMER_RESOLUTION);
 	switch (arg2) {
 	case 0:
 		/* timer_tick */
 		re = G_TIMERRESOLUTION(res);
 		break;
 	case 1:
 		/* TCP timestamp tick */
 		re = G_TIMESTAMPRESOLUTION(res);
 		break;
 	case 2:
 		/* DACK tick */
 		re = G_DELAYEDACKRESOLUTION(res);
 		break;
 	default:
 		return (EDOOFUS);
 	}
 
 	unit_conv(buf, sizeof(buf), (cclk_ps << re), 1000000);
 
 	return (sysctl_handle_string(oidp, buf, sizeof(buf), req));
 }
 
 static int
 sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	u_int res, dack_re, v;
 	u_int cclk_ps = 1000000000 / sc->params.vpd.cclk;
 
 	res = t4_read_reg(sc, A_TP_TIMER_RESOLUTION);
 	dack_re = G_DELAYEDACKRESOLUTION(res);
 	v = ((cclk_ps << dack_re) / 1000000) * t4_read_reg(sc, A_TP_DACK_TIMER);
 
 	return (sysctl_handle_int(oidp, &v, 0, req));
 }
 
 static int
 sysctl_tp_timer(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int reg = arg2;
 	u_int tre;
 	u_long tp_tick_us, v;
 	u_int cclk_ps = 1000000000 / sc->params.vpd.cclk;
 
 	MPASS(reg == A_TP_RXT_MIN || reg == A_TP_RXT_MAX ||
 	    reg == A_TP_PERS_MIN  || reg == A_TP_PERS_MAX ||
 	    reg == A_TP_KEEP_IDLE || reg == A_TP_KEEP_INTVL ||
 	    reg == A_TP_INIT_SRTT || reg == A_TP_FINWAIT2_TIMER);
 
 	tre = G_TIMERRESOLUTION(t4_read_reg(sc, A_TP_TIMER_RESOLUTION));
 	tp_tick_us = (cclk_ps << tre) / 1000000;
 
 	if (reg == A_TP_INIT_SRTT)
 		v = tp_tick_us * G_INITSRTT(t4_read_reg(sc, reg));
 	else
 		v = tp_tick_us * t4_read_reg(sc, reg);
 
 	return (sysctl_handle_long(oidp, &v, 0, req));
 }
 
 /*
  * All fields in TP_SHIFT_CNT are 4b and the starting location of the field is
  * passed to this function.
  */
 static int
 sysctl_tp_shift_cnt(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int idx = arg2;
 	u_int v;
 
 	MPASS(idx >= 0 && idx <= 24);
 
 	v = (t4_read_reg(sc, A_TP_SHIFT_CNT) >> idx) & 0xf;
 
 	return (sysctl_handle_int(oidp, &v, 0, req));
 }
 
 static int
 sysctl_tp_backoff(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int idx = arg2;
 	u_int shift, v, r;
 
 	MPASS(idx >= 0 && idx < 16);
 
 	r = A_TP_TCP_BACKOFF_REG0 + (idx & ~3);
 	shift = (idx & 3) << 3;
 	v = (t4_read_reg(sc, r) >> shift) & M_TIMERBACKOFFINDEX0;
 
 	return (sysctl_handle_int(oidp, &v, 0, req));
 }
 
 static int
 sysctl_holdoff_tmr_idx_ofld(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->pi->adapter;
 	int idx, rc, i;
 	struct sge_ofld_rxq *ofld_rxq;
 	uint8_t v;
 
 	idx = vi->ofld_tmr_idx;
 
 	rc = sysctl_handle_int(oidp, &idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (idx < 0 || idx >= SGE_NTIMERS)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4otmr");
 	if (rc)
 		return (rc);
 
 	v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(vi->ofld_pktc_idx != -1);
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 #ifdef atomic_store_rel_8
 		atomic_store_rel_8(&ofld_rxq->iq.intr_params, v);
 #else
 		ofld_rxq->iq.intr_params = v;
 #endif
 	}
 	vi->ofld_tmr_idx = idx;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (0);
 }
 
 static int
 sysctl_holdoff_pktc_idx_ofld(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->pi->adapter;
 	int idx, rc;
 
 	idx = vi->ofld_pktc_idx;
 
 	rc = sysctl_handle_int(oidp, &idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (idx < -1 || idx >= SGE_NCOUNTERS)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4opktc");
 	if (rc)
 		return (rc);
 
 	if (vi->flags & VI_INIT_DONE)
 		rc = EBUSY; /* cannot be changed once the queues are created */
 	else
 		vi->ofld_pktc_idx = idx;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (rc);
 }
 #endif
 
 static int
 get_sge_context(struct adapter *sc, struct t4_sge_context *cntxt)
 {
 	int rc;
 
 	if (cntxt->cid > M_CTXTQID)
 		return (EINVAL);
 
 	if (cntxt->mem_id != CTXT_EGRESS && cntxt->mem_id != CTXT_INGRESS &&
 	    cntxt->mem_id != CTXT_FLM && cntxt->mem_id != CTXT_CNM)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ctxt");
 	if (rc)
 		return (rc);
 
 	if (sc->flags & FW_OK) {
 		rc = -t4_sge_ctxt_rd(sc, sc->mbox, cntxt->cid, cntxt->mem_id,
 		    &cntxt->data[0]);
 		if (rc == 0)
 			goto done;
 	}
 
 	/*
 	 * Read via firmware failed or wasn't even attempted.  Read directly via
 	 * the backdoor.
 	 */
 	rc = -t4_sge_ctxt_rd_bd(sc, cntxt->cid, cntxt->mem_id, &cntxt->data[0]);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 load_fw(struct adapter *sc, struct t4_data *fw)
 {
 	int rc;
 	uint8_t *fw_data;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldfw");
 	if (rc)
 		return (rc);
 
 	/*
 	 * The firmware, with the sole exception of the memory parity error
 	 * handler, runs from memory and not flash.  It is almost always safe to
 	 * install a new firmware on a running system.  Just set bit 1 in
 	 * hw.cxgbe.dflags or dev.<nexus>.<n>.dflags first.
 	 */
 	if (sc->flags & FULL_INIT_DONE &&
 	    (sc->debug_flags & DF_LOAD_FW_ANYTIME) == 0) {
 		rc = EBUSY;
 		goto done;
 	}
 
 	fw_data = malloc(fw->len, M_CXGBE, M_WAITOK);
 	if (fw_data == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 
 	rc = copyin(fw->data, fw_data, fw->len);
 	if (rc == 0)
 		rc = -t4_load_fw(sc, fw_data, fw->len);
 
 	free(fw_data, M_CXGBE);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 load_cfg(struct adapter *sc, struct t4_data *cfg)
 {
 	int rc;
 	uint8_t *cfg_data = NULL;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldcf");
 	if (rc)
 		return (rc);
 
 	if (cfg->len == 0) {
 		/* clear */
 		rc = -t4_load_cfg(sc, NULL, 0);
 		goto done;
 	}
 
 	cfg_data = malloc(cfg->len, M_CXGBE, M_WAITOK);
 	if (cfg_data == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 
 	rc = copyin(cfg->data, cfg_data, cfg->len);
 	if (rc == 0)
 		rc = -t4_load_cfg(sc, cfg_data, cfg->len);
 
 	free(cfg_data, M_CXGBE);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 load_boot(struct adapter *sc, struct t4_bootrom *br)
 {
 	int rc;
 	uint8_t *br_data = NULL;
 	u_int offset;
 
 	if (br->len > 1024 * 1024)
 		return (EFBIG);
 
 	if (br->pf_offset == 0) {
 		/* pfidx */
 		if (br->pfidx_addr > 7)
 			return (EINVAL);
 		offset = G_OFFSET(t4_read_reg(sc, PF_REG(br->pfidx_addr,
 		    A_PCIE_PF_EXPROM_OFST)));
 	} else if (br->pf_offset == 1) {
 		/* offset */
 		offset = G_OFFSET(br->pfidx_addr);
 	} else {
 		return (EINVAL);
 	}
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldbr");
 	if (rc)
 		return (rc);
 
 	if (br->len == 0) {
 		/* clear */
 		rc = -t4_load_boot(sc, NULL, offset, 0);
 		goto done;
 	}
 
 	br_data = malloc(br->len, M_CXGBE, M_WAITOK);
 	if (br_data == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 
 	rc = copyin(br->data, br_data, br->len);
 	if (rc == 0)
 		rc = -t4_load_boot(sc, br_data, offset, br->len);
 
 	free(br_data, M_CXGBE);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 load_bootcfg(struct adapter *sc, struct t4_data *bc)
 {
 	int rc;
 	uint8_t *bc_data = NULL;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldcf");
 	if (rc)
 		return (rc);
 
 	if (bc->len == 0) {
 		/* clear */
 		rc = -t4_load_bootcfg(sc, NULL, 0);
 		goto done;
 	}
 
 	bc_data = malloc(bc->len, M_CXGBE, M_WAITOK);
 	if (bc_data == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 
 	rc = copyin(bc->data, bc_data, bc->len);
 	if (rc == 0)
 		rc = -t4_load_bootcfg(sc, bc_data, bc->len);
 
 	free(bc_data, M_CXGBE);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 cudbg_dump(struct adapter *sc, struct t4_cudbg_dump *dump)
 {
 	int rc;
 	struct cudbg_init *cudbg;
 	void *handle, *buf;
 
 	/* buf is large, don't block if no memory is available */
 	buf = malloc(dump->len, M_CXGBE, M_NOWAIT | M_ZERO);
 	if (buf == NULL)
 		return (ENOMEM);
 
 	handle = cudbg_alloc_handle();
 	if (handle == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 
 	cudbg = cudbg_get_init(handle);
 	cudbg->adap = sc;
 	cudbg->print = (cudbg_print_cb)printf;
 
 #ifndef notyet
 	device_printf(sc->dev, "%s: wr_flash %u, len %u, data %p.\n",
 	    __func__, dump->wr_flash, dump->len, dump->data);
 #endif
 
 	if (dump->wr_flash)
 		cudbg->use_flash = 1;
 	MPASS(sizeof(cudbg->dbg_bitmap) == sizeof(dump->bitmap));
 	memcpy(cudbg->dbg_bitmap, dump->bitmap, sizeof(cudbg->dbg_bitmap));
 
 	rc = cudbg_collect(handle, buf, &dump->len);
 	if (rc != 0)
 		goto done;
 
 	rc = copyout(buf, dump->data, dump->len);
 done:
 	cudbg_free_handle(handle);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static void
 free_offload_policy(struct t4_offload_policy *op)
 {
 	struct offload_rule *r;
 	int i;
 
 	if (op == NULL)
 		return;
 
 	r = &op->rule[0];
 	for (i = 0; i < op->nrules; i++, r++) {
 		free(r->bpf_prog.bf_insns, M_CXGBE);
 	}
 	free(op->rule, M_CXGBE);
 	free(op, M_CXGBE);
 }
 
 static int
 set_offload_policy(struct adapter *sc, struct t4_offload_policy *uop)
 {
 	int i, rc, len;
 	struct t4_offload_policy *op, *old;
 	struct bpf_program *bf;
 	const struct offload_settings *s;
 	struct offload_rule *r;
 	void *u;
 
 	if (!is_offload(sc))
 		return (ENODEV);
 
 	if (uop->nrules == 0) {
 		/* Delete installed policies. */
 		op = NULL;
 		goto set_policy;
 	} if (uop->nrules > 256) { /* arbitrary */
 		return (E2BIG);
 	}
 
 	/* Copy userspace offload policy to kernel */
 	op = malloc(sizeof(*op), M_CXGBE, M_ZERO | M_WAITOK);
 	op->nrules = uop->nrules;
 	len = op->nrules * sizeof(struct offload_rule);
 	op->rule = malloc(len, M_CXGBE, M_ZERO | M_WAITOK);
 	rc = copyin(uop->rule, op->rule, len);
 	if (rc) {
 		free(op->rule, M_CXGBE);
 		free(op, M_CXGBE);
 		return (rc);
 	}
 
 	r = &op->rule[0];
 	for (i = 0; i < op->nrules; i++, r++) {
 
 		/* Validate open_type */
 		if (r->open_type != OPEN_TYPE_LISTEN &&
 		    r->open_type != OPEN_TYPE_ACTIVE &&
 		    r->open_type != OPEN_TYPE_PASSIVE &&
 		    r->open_type != OPEN_TYPE_DONTCARE) {
 error:
 			/*
 			 * Rules 0 to i have malloc'd filters that need to be
 			 * freed.  Rules i+1 to nrules have userspace pointers
 			 * and should be left alone.
 			 */
 			op->nrules = i;
 			free_offload_policy(op);
 			return (rc);
 		}
 
 		/* Validate settings */
 		s = &r->settings;
 		if ((s->offload != 0 && s->offload != 1) ||
 		    s->cong_algo < -1 || s->cong_algo > CONG_ALG_HIGHSPEED ||
 		    s->sched_class < -1 ||
 		    s->sched_class >= sc->chip_params->nsched_cls) {
 			rc = EINVAL;
 			goto error;
 		}
 
 		bf = &r->bpf_prog;
 		u = bf->bf_insns;	/* userspace ptr */
 		bf->bf_insns = NULL;
 		if (bf->bf_len == 0) {
 			/* legal, matches everything */
 			continue;
 		}
 		len = bf->bf_len * sizeof(*bf->bf_insns);
 		bf->bf_insns = malloc(len, M_CXGBE, M_ZERO | M_WAITOK);
 		rc = copyin(u, bf->bf_insns, len);
 		if (rc != 0)
 			goto error;
 
 		if (!bpf_validate(bf->bf_insns, bf->bf_len)) {
 			rc = EINVAL;
 			goto error;
 		}
 	}
 set_policy:
 	rw_wlock(&sc->policy_lock);
 	old = sc->policy;
 	sc->policy = op;
 	rw_wunlock(&sc->policy_lock);
 	free_offload_policy(old);
 
 	return (0);
 }
 
 #define MAX_READ_BUF_SIZE (128 * 1024)
 static int
 read_card_mem(struct adapter *sc, int win, struct t4_mem_range *mr)
 {
 	uint32_t addr, remaining, n;
 	uint32_t *buf;
 	int rc;
 	uint8_t *dst;
 
 	rc = validate_mem_range(sc, mr->addr, mr->len);
 	if (rc != 0)
 		return (rc);
 
 	buf = malloc(min(mr->len, MAX_READ_BUF_SIZE), M_CXGBE, M_WAITOK);
 	addr = mr->addr;
 	remaining = mr->len;
 	dst = (void *)mr->data;
 
 	while (remaining) {
 		n = min(remaining, MAX_READ_BUF_SIZE);
 		read_via_memwin(sc, 2, addr, buf, n);
 
 		rc = copyout(buf, dst, n);
 		if (rc != 0)
 			break;
 
 		dst += n;
 		remaining -= n;
 		addr += n;
 	}
 
 	free(buf, M_CXGBE);
 	return (rc);
 }
 #undef MAX_READ_BUF_SIZE
 
 static int
 read_i2c(struct adapter *sc, struct t4_i2c_data *i2cd)
 {
 	int rc;
 
 	if (i2cd->len == 0 || i2cd->port_id >= sc->params.nports)
 		return (EINVAL);
 
 	if (i2cd->len > sizeof(i2cd->data))
 		return (EFBIG);
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4i2crd");
 	if (rc)
 		return (rc);
 	rc = -t4_i2c_rd(sc, sc->mbox, i2cd->port_id, i2cd->dev_addr,
 	    i2cd->offset, i2cd->len, &i2cd->data[0]);
 	end_synchronized_op(sc, 0);
 
 	return (rc);
 }
 
 int
 t4_os_find_pci_capability(struct adapter *sc, int cap)
 {
 	int i;
 
 	return (pci_find_cap(sc->dev, cap, &i) == 0 ? i : 0);
 }
 
 int
 t4_os_pci_save_state(struct adapter *sc)
 {
 	device_t dev;
 	struct pci_devinfo *dinfo;
 
 	dev = sc->dev;
 	dinfo = device_get_ivars(dev);
 
 	pci_cfg_save(dev, dinfo, 0);
 	return (0);
 }
 
 int
 t4_os_pci_restore_state(struct adapter *sc)
 {
 	device_t dev;
 	struct pci_devinfo *dinfo;
 
 	dev = sc->dev;
 	dinfo = device_get_ivars(dev);
 
 	pci_cfg_restore(dev, dinfo);
 	return (0);
 }
 
 void
 t4_os_portmod_changed(struct port_info *pi)
 {
 	struct adapter *sc = pi->adapter;
 	struct vi_info *vi;
 	struct ifnet *ifp;
 	static const char *mod_str[] = {
 		NULL, "LR", "SR", "ER", "TWINAX", "active TWINAX", "LRM"
 	};
 
 	KASSERT((pi->flags & FIXED_IFMEDIA) == 0,
 	    ("%s: port_type %u", __func__, pi->port_type));
 
 	vi = &pi->vi[0];
 	if (begin_synchronized_op(sc, vi, HOLD_LOCK, "t4mod") == 0) {
 		PORT_LOCK(pi);
 		build_medialist(pi);
 		if (pi->mod_type != FW_PORT_MOD_TYPE_NONE) {
 			fixup_link_config(pi);
 			apply_link_config(pi);
 		}
 		PORT_UNLOCK(pi);
 		end_synchronized_op(sc, LOCK_HELD);
 	}
 
 	ifp = vi->ifp;
 	if (pi->mod_type == FW_PORT_MOD_TYPE_NONE)
 		if_printf(ifp, "transceiver unplugged.\n");
 	else if (pi->mod_type == FW_PORT_MOD_TYPE_UNKNOWN)
 		if_printf(ifp, "unknown transceiver inserted.\n");
 	else if (pi->mod_type == FW_PORT_MOD_TYPE_NOTSUPPORTED)
 		if_printf(ifp, "unsupported transceiver inserted.\n");
 	else if (pi->mod_type > 0 && pi->mod_type < nitems(mod_str)) {
 		if_printf(ifp, "%dGbps %s transceiver inserted.\n",
 		    port_top_speed(pi), mod_str[pi->mod_type]);
 	} else {
 		if_printf(ifp, "transceiver (type %d) inserted.\n",
 		    pi->mod_type);
 	}
 }
 
 void
 t4_os_link_changed(struct port_info *pi)
 {
 	struct vi_info *vi;
 	struct ifnet *ifp;
 	struct link_config *lc;
 	int v;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	for_each_vi(pi, v, vi) {
 		ifp = vi->ifp;
 		if (ifp == NULL)
 			continue;
 
 		lc = &pi->link_cfg;
 		if (lc->link_ok) {
 			ifp->if_baudrate = IF_Mbps(lc->speed);
 			if_link_state_change(ifp, LINK_STATE_UP);
 		} else {
 			if_link_state_change(ifp, LINK_STATE_DOWN);
 		}
 	}
 }
 
 void
 t4_iterate(void (*func)(struct adapter *, void *), void *arg)
 {
 	struct adapter *sc;
 
 	sx_slock(&t4_list_lock);
 	SLIST_FOREACH(sc, &t4_list, link) {
 		/*
 		 * func should not make any assumptions about what state sc is
 		 * in - the only guarantee is that sc->sc_lock is a valid lock.
 		 */
 		func(sc, arg);
 	}
 	sx_sunlock(&t4_list_lock);
 }
 
 static int
 t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag,
     struct thread *td)
 {
 	int rc;
 	struct adapter *sc = dev->si_drv1;
 
 	rc = priv_check(td, PRIV_DRIVER);
 	if (rc != 0)
 		return (rc);
 
 	switch (cmd) {
 	case CHELSIO_T4_GETREG: {
 		struct t4_reg *edata = (struct t4_reg *)data;
 
 		if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len)
 			return (EFAULT);
 
 		if (edata->size == 4)
 			edata->val = t4_read_reg(sc, edata->addr);
 		else if (edata->size == 8)
 			edata->val = t4_read_reg64(sc, edata->addr);
 		else
 			return (EINVAL);
 
 		break;
 	}
 	case CHELSIO_T4_SETREG: {
 		struct t4_reg *edata = (struct t4_reg *)data;
 
 		if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len)
 			return (EFAULT);
 
 		if (edata->size == 4) {
 			if (edata->val & 0xffffffff00000000)
 				return (EINVAL);
 			t4_write_reg(sc, edata->addr, (uint32_t) edata->val);
 		} else if (edata->size == 8)
 			t4_write_reg64(sc, edata->addr, edata->val);
 		else
 			return (EINVAL);
 		break;
 	}
 	case CHELSIO_T4_REGDUMP: {
 		struct t4_regdump *regs = (struct t4_regdump *)data;
 		int reglen = t4_get_regs_len(sc);
 		uint8_t *buf;
 
 		if (regs->len < reglen) {
 			regs->len = reglen; /* hint to the caller */
 			return (ENOBUFS);
 		}
 
 		regs->len = reglen;
 		buf = malloc(reglen, M_CXGBE, M_WAITOK | M_ZERO);
 		get_regs(sc, regs, buf);
 		rc = copyout(buf, regs->data, reglen);
 		free(buf, M_CXGBE);
 		break;
 	}
 	case CHELSIO_T4_GET_FILTER_MODE:
 		rc = get_filter_mode(sc, (uint32_t *)data);
 		break;
 	case CHELSIO_T4_SET_FILTER_MODE:
 		rc = set_filter_mode(sc, *(uint32_t *)data);
 		break;
 	case CHELSIO_T4_GET_FILTER:
 		rc = get_filter(sc, (struct t4_filter *)data);
 		break;
 	case CHELSIO_T4_SET_FILTER:
 		rc = set_filter(sc, (struct t4_filter *)data);
 		break;
 	case CHELSIO_T4_DEL_FILTER:
 		rc = del_filter(sc, (struct t4_filter *)data);
 		break;
 	case CHELSIO_T4_GET_SGE_CONTEXT:
 		rc = get_sge_context(sc, (struct t4_sge_context *)data);
 		break;
 	case CHELSIO_T4_LOAD_FW:
 		rc = load_fw(sc, (struct t4_data *)data);
 		break;
 	case CHELSIO_T4_GET_MEM:
 		rc = read_card_mem(sc, 2, (struct t4_mem_range *)data);
 		break;
 	case CHELSIO_T4_GET_I2C:
 		rc = read_i2c(sc, (struct t4_i2c_data *)data);
 		break;
 	case CHELSIO_T4_CLEAR_STATS: {
 		int i, v, bg_map;
 		u_int port_id = *(uint32_t *)data;
 		struct port_info *pi;
 		struct vi_info *vi;
 
 		if (port_id >= sc->params.nports)
 			return (EINVAL);
 		pi = sc->port[port_id];
 		if (pi == NULL)
 			return (EIO);
 
 		/* MAC stats */
 		t4_clr_port_stats(sc, pi->tx_chan);
 		pi->tx_parse_error = 0;
 		pi->tnl_cong_drops = 0;
 		mtx_lock(&sc->reg_lock);
 		for_each_vi(pi, v, vi) {
 			if (vi->flags & VI_INIT_DONE)
 				t4_clr_vi_stats(sc, vi->vin);
 		}
 		bg_map = pi->mps_bg_map;
 		v = 0;	/* reuse */
 		while (bg_map) {
 			i = ffs(bg_map) - 1;
 			t4_write_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v,
 			    1, A_TP_MIB_TNL_CNG_DROP_0 + i);
 			bg_map &= ~(1 << i);
 		}
 		mtx_unlock(&sc->reg_lock);
 
 		/*
 		 * Since this command accepts a port, clear stats for
 		 * all VIs on this port.
 		 */
 		for_each_vi(pi, v, vi) {
 			if (vi->flags & VI_INIT_DONE) {
 				struct sge_rxq *rxq;
 				struct sge_txq *txq;
 				struct sge_wrq *wrq;
 
 				for_each_rxq(vi, i, rxq) {
 #if defined(INET) || defined(INET6)
 					rxq->lro.lro_queued = 0;
 					rxq->lro.lro_flushed = 0;
 #endif
 					rxq->rxcsum = 0;
 					rxq->vlan_extraction = 0;
 				}
 
 				for_each_txq(vi, i, txq) {
 					txq->txcsum = 0;
 					txq->tso_wrs = 0;
 					txq->vlan_insertion = 0;
 					txq->imm_wrs = 0;
 					txq->sgl_wrs = 0;
 					txq->txpkt_wrs = 0;
 					txq->txpkts0_wrs = 0;
 					txq->txpkts1_wrs = 0;
 					txq->txpkts0_pkts = 0;
 					txq->txpkts1_pkts = 0;
 					txq->raw_wrs = 0;
 					mp_ring_reset_stats(txq->r);
 				}
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 				/* nothing to clear for each ofld_rxq */
 
 				for_each_ofld_txq(vi, i, wrq) {
 					wrq->tx_wrs_direct = 0;
 					wrq->tx_wrs_copied = 0;
 				}
 #endif
 
 				if (IS_MAIN_VI(vi)) {
 					wrq = &sc->sge.ctrlq[pi->port_id];
 					wrq->tx_wrs_direct = 0;
 					wrq->tx_wrs_copied = 0;
 				}
 			}
 		}
 		break;
 	}
 	case CHELSIO_T4_SCHED_CLASS:
 		rc = t4_set_sched_class(sc, (struct t4_sched_params *)data);
 		break;
 	case CHELSIO_T4_SCHED_QUEUE:
 		rc = t4_set_sched_queue(sc, (struct t4_sched_queue *)data);
 		break;
 	case CHELSIO_T4_GET_TRACER:
 		rc = t4_get_tracer(sc, (struct t4_tracer *)data);
 		break;
 	case CHELSIO_T4_SET_TRACER:
 		rc = t4_set_tracer(sc, (struct t4_tracer *)data);
 		break;
 	case CHELSIO_T4_LOAD_CFG:
 		rc = load_cfg(sc, (struct t4_data *)data);
 		break;
 	case CHELSIO_T4_LOAD_BOOT:
 		rc = load_boot(sc, (struct t4_bootrom *)data);
 		break;
 	case CHELSIO_T4_LOAD_BOOTCFG:
 		rc = load_bootcfg(sc, (struct t4_data *)data);
 		break;
 	case CHELSIO_T4_CUDBG_DUMP:
 		rc = cudbg_dump(sc, (struct t4_cudbg_dump *)data);
 		break;
 	case CHELSIO_T4_SET_OFLD_POLICY:
 		rc = set_offload_policy(sc, (struct t4_offload_policy *)data);
 		break;
 	default:
 		rc = ENOTTY;
 	}
 
 	return (rc);
 }
 
 #ifdef TCP_OFFLOAD
 static int
 toe_capability(struct vi_info *vi, int enable)
 {
 	int rc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (!is_offload(sc))
 		return (ENODEV);
 
 	if (enable) {
 		if ((vi->ifp->if_capenable & IFCAP_TOE) != 0) {
 			/* TOE is already enabled. */
 			return (0);
 		}
 
 		/*
 		 * We need the port's queues around so that we're able to send
 		 * and receive CPLs to/from the TOE even if the ifnet for this
 		 * port has never been UP'd administratively.
 		 */
 		if (!(vi->flags & VI_INIT_DONE)) {
 			rc = vi_full_init(vi);
 			if (rc)
 				return (rc);
 		}
 		if (!(pi->vi[0].flags & VI_INIT_DONE)) {
 			rc = vi_full_init(&pi->vi[0]);
 			if (rc)
 				return (rc);
 		}
 
 		if (isset(&sc->offload_map, pi->port_id)) {
 			/* TOE is enabled on another VI of this port. */
 			pi->uld_vis++;
 			return (0);
 		}
 
 		if (!uld_active(sc, ULD_TOM)) {
 			rc = t4_activate_uld(sc, ULD_TOM);
 			if (rc == EAGAIN) {
 				log(LOG_WARNING,
 				    "You must kldload t4_tom.ko before trying "
 				    "to enable TOE on a cxgbe interface.\n");
 			}
 			if (rc != 0)
 				return (rc);
 			KASSERT(sc->tom_softc != NULL,
 			    ("%s: TOM activated but softc NULL", __func__));
 			KASSERT(uld_active(sc, ULD_TOM),
 			    ("%s: TOM activated but flag not set", __func__));
 		}
 
 		/* Activate iWARP and iSCSI too, if the modules are loaded. */
 		if (!uld_active(sc, ULD_IWARP))
 			(void) t4_activate_uld(sc, ULD_IWARP);
 		if (!uld_active(sc, ULD_ISCSI))
 			(void) t4_activate_uld(sc, ULD_ISCSI);
 
 		pi->uld_vis++;
 		setbit(&sc->offload_map, pi->port_id);
 	} else {
 		pi->uld_vis--;
 
 		if (!isset(&sc->offload_map, pi->port_id) || pi->uld_vis > 0)
 			return (0);
 
 		KASSERT(uld_active(sc, ULD_TOM),
 		    ("%s: TOM never initialized?", __func__));
 		clrbit(&sc->offload_map, pi->port_id);
 	}
 
 	return (0);
 }
 
 /*
  * Add an upper layer driver to the global list.
  */
 int
 t4_register_uld(struct uld_info *ui)
 {
 	int rc = 0;
 	struct uld_info *u;
 
 	sx_xlock(&t4_uld_list_lock);
 	SLIST_FOREACH(u, &t4_uld_list, link) {
 	    if (u->uld_id == ui->uld_id) {
 		    rc = EEXIST;
 		    goto done;
 	    }
 	}
 
 	SLIST_INSERT_HEAD(&t4_uld_list, ui, link);
 	ui->refcount = 0;
 done:
 	sx_xunlock(&t4_uld_list_lock);
 	return (rc);
 }
 
 int
 t4_unregister_uld(struct uld_info *ui)
 {
 	int rc = EINVAL;
 	struct uld_info *u;
 
 	sx_xlock(&t4_uld_list_lock);
 
 	SLIST_FOREACH(u, &t4_uld_list, link) {
 	    if (u == ui) {
 		    if (ui->refcount > 0) {
 			    rc = EBUSY;
 			    goto done;
 		    }
 
 		    SLIST_REMOVE(&t4_uld_list, ui, uld_info, link);
 		    rc = 0;
 		    goto done;
 	    }
 	}
 done:
 	sx_xunlock(&t4_uld_list_lock);
 	return (rc);
 }
 
 int
 t4_activate_uld(struct adapter *sc, int id)
 {
 	int rc;
 	struct uld_info *ui;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (id < 0 || id > ULD_MAX)
 		return (EINVAL);
 	rc = EAGAIN;	/* kldoad the module with this ULD and try again. */
 
 	sx_slock(&t4_uld_list_lock);
 
 	SLIST_FOREACH(ui, &t4_uld_list, link) {
 		if (ui->uld_id == id) {
 			if (!(sc->flags & FULL_INIT_DONE)) {
 				rc = adapter_full_init(sc);
 				if (rc != 0)
 					break;
 			}
 
 			rc = ui->activate(sc);
 			if (rc == 0) {
 				setbit(&sc->active_ulds, id);
 				ui->refcount++;
 			}
 			break;
 		}
 	}
 
 	sx_sunlock(&t4_uld_list_lock);
 
 	return (rc);
 }
 
 int
 t4_deactivate_uld(struct adapter *sc, int id)
 {
 	int rc;
 	struct uld_info *ui;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (id < 0 || id > ULD_MAX)
 		return (EINVAL);
 	rc = ENXIO;
 
 	sx_slock(&t4_uld_list_lock);
 
 	SLIST_FOREACH(ui, &t4_uld_list, link) {
 		if (ui->uld_id == id) {
 			rc = ui->deactivate(sc);
 			if (rc == 0) {
 				clrbit(&sc->active_ulds, id);
 				ui->refcount--;
 			}
 			break;
 		}
 	}
 
 	sx_sunlock(&t4_uld_list_lock);
 
 	return (rc);
 }
 
 int
 uld_active(struct adapter *sc, int uld_id)
 {
 
 	MPASS(uld_id >= 0 && uld_id <= ULD_MAX);
 
 	return (isset(&sc->active_ulds, uld_id));
 }
 #endif
 
 /*
  * t  = ptr to tunable.
  * nc = number of CPUs.
  * c  = compiled in default for that tunable.
  */
 static void
 calculate_nqueues(int *t, int nc, const int c)
 {
 	int nq;
 
 	if (*t > 0)
 		return;
 	nq = *t < 0 ? -*t : c;
 	*t = min(nc, nq);
 }
 
 /*
  * Come up with reasonable defaults for some of the tunables, provided they're
  * not set by the user (in which case we'll use the values as is).
  */
 static void
 tweak_tunables(void)
 {
 	int nc = mp_ncpus;	/* our snapshot of the number of CPUs */
 
 	if (t4_ntxq < 1) {
 #ifdef RSS
 		t4_ntxq = rss_getnumbuckets();
 #else
 		calculate_nqueues(&t4_ntxq, nc, NTXQ);
 #endif
 	}
 
 	calculate_nqueues(&t4_ntxq_vi, nc, NTXQ_VI);
 
 	if (t4_nrxq < 1) {
 #ifdef RSS
 		t4_nrxq = rss_getnumbuckets();
 #else
 		calculate_nqueues(&t4_nrxq, nc, NRXQ);
 #endif
 	}
 
 	calculate_nqueues(&t4_nrxq_vi, nc, NRXQ_VI);
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	calculate_nqueues(&t4_nofldtxq, nc, NOFLDTXQ);
 	calculate_nqueues(&t4_nofldtxq_vi, nc, NOFLDTXQ_VI);
 #endif
 #ifdef TCP_OFFLOAD
 	calculate_nqueues(&t4_nofldrxq, nc, NOFLDRXQ);
 	calculate_nqueues(&t4_nofldrxq_vi, nc, NOFLDRXQ_VI);
 
 	if (t4_toecaps_allowed == -1)
 		t4_toecaps_allowed = FW_CAPS_CONFIG_TOE;
 
 	if (t4_rdmacaps_allowed == -1) {
 		t4_rdmacaps_allowed = FW_CAPS_CONFIG_RDMA_RDDP |
 		    FW_CAPS_CONFIG_RDMA_RDMAC;
 	}
 
 	if (t4_iscsicaps_allowed == -1) {
 		t4_iscsicaps_allowed = FW_CAPS_CONFIG_ISCSI_INITIATOR_PDU |
 		    FW_CAPS_CONFIG_ISCSI_TARGET_PDU |
 		    FW_CAPS_CONFIG_ISCSI_T10DIF;
 	}
 
 	if (t4_tmr_idx_ofld < 0 || t4_tmr_idx_ofld >= SGE_NTIMERS)
 		t4_tmr_idx_ofld = TMR_IDX_OFLD;
 
 	if (t4_pktc_idx_ofld < -1 || t4_pktc_idx_ofld >= SGE_NCOUNTERS)
 		t4_pktc_idx_ofld = PKTC_IDX_OFLD;
 #else
 	if (t4_toecaps_allowed == -1)
 		t4_toecaps_allowed = 0;
 
 	if (t4_rdmacaps_allowed == -1)
 		t4_rdmacaps_allowed = 0;
 
 	if (t4_iscsicaps_allowed == -1)
 		t4_iscsicaps_allowed = 0;
 #endif
 
 #ifdef DEV_NETMAP
 	calculate_nqueues(&t4_nnmtxq_vi, nc, NNMTXQ_VI);
 	calculate_nqueues(&t4_nnmrxq_vi, nc, NNMRXQ_VI);
 #endif
 
 	if (t4_tmr_idx < 0 || t4_tmr_idx >= SGE_NTIMERS)
 		t4_tmr_idx = TMR_IDX;
 
 	if (t4_pktc_idx < -1 || t4_pktc_idx >= SGE_NCOUNTERS)
 		t4_pktc_idx = PKTC_IDX;
 
 	if (t4_qsize_txq < 128)
 		t4_qsize_txq = 128;
 
 	if (t4_qsize_rxq < 128)
 		t4_qsize_rxq = 128;
 	while (t4_qsize_rxq & 7)
 		t4_qsize_rxq++;
 
 	t4_intr_types &= INTR_MSIX | INTR_MSI | INTR_INTX;
 
 	/*
 	 * Number of VIs to create per-port.  The first VI is the "main" regular
 	 * VI for the port.  The rest are additional virtual interfaces on the
 	 * same physical port.  Note that the main VI does not have native
 	 * netmap support but the extra VIs do.
 	 *
 	 * Limit the number of VIs per port to the number of available
 	 * MAC addresses per port.
 	 */
 	if (t4_num_vis < 1)
 		t4_num_vis = 1;
 	if (t4_num_vis > nitems(vi_mac_funcs)) {
 		t4_num_vis = nitems(vi_mac_funcs);
 		printf("cxgbe: number of VIs limited to %d\n", t4_num_vis);
 	}
 
 	if (pcie_relaxed_ordering < 0 || pcie_relaxed_ordering > 2) {
 		pcie_relaxed_ordering = 1;
 #if defined(__i386__) || defined(__amd64__)
 		if (cpu_vendor_id == CPU_VENDOR_INTEL)
 			pcie_relaxed_ordering = 0;
 #endif
 	}
 }
 
 #ifdef DDB
 static void
 t4_dump_tcb(struct adapter *sc, int tid)
 {
 	uint32_t base, i, j, off, pf, reg, save, tcb_addr, win_pos;
 
 	reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2);
 	save = t4_read_reg(sc, reg);
 	base = sc->memwin[2].mw_base;
 
 	/* Dump TCB for the tid */
 	tcb_addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE);
 	tcb_addr += tid * TCB_SIZE;
 
 	if (is_t4(sc)) {
 		pf = 0;
 		win_pos = tcb_addr & ~0xf;	/* start must be 16B aligned */
 	} else {
 		pf = V_PFNUM(sc->pf);
 		win_pos = tcb_addr & ~0x7f;	/* start must be 128B aligned */
 	}
 	t4_write_reg(sc, reg, win_pos | pf);
 	t4_read_reg(sc, reg);
 
 	off = tcb_addr - win_pos;
 	for (i = 0; i < 4; i++) {
 		uint32_t buf[8];
 		for (j = 0; j < 8; j++, off += 4)
 			buf[j] = htonl(t4_read_reg(sc, base + off));
 
 		db_printf("%08x %08x %08x %08x %08x %08x %08x %08x\n",
 		    buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6],
 		    buf[7]);
 	}
 
 	t4_write_reg(sc, reg, save);
 	t4_read_reg(sc, reg);
 }
 
 static void
 t4_dump_devlog(struct adapter *sc)
 {
 	struct devlog_params *dparams = &sc->params.devlog;
 	struct fw_devlog_e e;
 	int i, first, j, m, nentries, rc;
 	uint64_t ftstamp = UINT64_MAX;
 
 	if (dparams->start == 0) {
 		db_printf("devlog params not valid\n");
 		return;
 	}
 
 	nentries = dparams->size / sizeof(struct fw_devlog_e);
 	m = fwmtype_to_hwmtype(dparams->memtype);
 
 	/* Find the first entry. */
 	first = -1;
 	for (i = 0; i < nentries && !db_pager_quit; i++) {
 		rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e),
 		    sizeof(e), (void *)&e);
 		if (rc != 0)
 			break;
 
 		if (e.timestamp == 0)
 			break;
 
 		e.timestamp = be64toh(e.timestamp);
 		if (e.timestamp < ftstamp) {
 			ftstamp = e.timestamp;
 			first = i;
 		}
 	}
 
 	if (first == -1)
 		return;
 
 	i = first;
 	do {
 		rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e),
 		    sizeof(e), (void *)&e);
 		if (rc != 0)
 			return;
 
 		if (e.timestamp == 0)
 			return;
 
 		e.timestamp = be64toh(e.timestamp);
 		e.seqno = be32toh(e.seqno);
 		for (j = 0; j < 8; j++)
 			e.params[j] = be32toh(e.params[j]);
 
 		db_printf("%10d  %15ju  %8s  %8s  ",
 		    e.seqno, e.timestamp,
 		    (e.level < nitems(devlog_level_strings) ?
 			devlog_level_strings[e.level] : "UNKNOWN"),
 		    (e.facility < nitems(devlog_facility_strings) ?
 			devlog_facility_strings[e.facility] : "UNKNOWN"));
 		db_printf(e.fmt, e.params[0], e.params[1], e.params[2],
 		    e.params[3], e.params[4], e.params[5], e.params[6],
 		    e.params[7]);
 
 		if (++i == nentries)
 			i = 0;
 	} while (i != first && !db_pager_quit);
 }
 
 static struct command_table db_t4_table = LIST_HEAD_INITIALIZER(db_t4_table);
 _DB_SET(_show, t4, NULL, db_show_table, 0, &db_t4_table);
 
 DB_FUNC(devlog, db_show_devlog, db_t4_table, CS_OWN, NULL)
 {
 	device_t dev;
 	int t;
 	bool valid;
 
 	valid = false;
 	t = db_read_token();
 	if (t == tIDENT) {
 		dev = device_lookup_by_name(db_tok_string);
 		valid = true;
 	}
 	db_skip_to_eol();
 	if (!valid) {
 		db_printf("usage: show t4 devlog <nexus>\n");
 		return;
 	}
 
 	if (dev == NULL) {
 		db_printf("device not found\n");
 		return;
 	}
 
 	t4_dump_devlog(device_get_softc(dev));
 }
 
 DB_FUNC(tcb, db_show_t4tcb, db_t4_table, CS_OWN, NULL)
 {
 	device_t dev;
 	int radix, tid, t;
 	bool valid;
 
 	valid = false;
 	radix = db_radix;
 	db_radix = 10;
 	t = db_read_token();
 	if (t == tIDENT) {
 		dev = device_lookup_by_name(db_tok_string);
 		t = db_read_token();
 		if (t == tNUMBER) {
 			tid = db_tok_number;
 			valid = true;
 		}
 	}	
 	db_radix = radix;
 	db_skip_to_eol();
 	if (!valid) {
 		db_printf("usage: show t4 tcb <nexus> <tid>\n");
 		return;
 	}
 
 	if (dev == NULL) {
 		db_printf("device not found\n");
 		return;
 	}
 	if (tid < 0) {
 		db_printf("invalid tid\n");
 		return;
 	}
 
 	t4_dump_tcb(device_get_softc(dev), tid);
 }
 #endif
 
 /*
  * Borrowed from cesa_prep_aes_key().
  *
  * NB: The crypto engine wants the words in the decryption key in reverse
  * order.
  */
 void
 t4_aes_getdeckey(void *dec_key, const void *enc_key, unsigned int kbits)
 {
 	uint32_t ek[4 * (RIJNDAEL_MAXNR + 1)];
 	uint32_t *dkey;
 	int i;
 
 	rijndaelKeySetupEnc(ek, enc_key, kbits);
 	dkey = dec_key;
 	dkey += (kbits / 8) / 4;
 
 	switch (kbits) {
 	case 128:
 		for (i = 0; i < 4; i++)
 			*--dkey = htobe32(ek[4 * 10 + i]);
 		break;
 	case 192:
 		for (i = 0; i < 2; i++)
 			*--dkey = htobe32(ek[4 * 11 + 2 + i]);
 		for (i = 0; i < 4; i++)
 			*--dkey = htobe32(ek[4 * 12 + i]);
 		break;
 	case 256:
 		for (i = 0; i < 4; i++)
 			*--dkey = htobe32(ek[4 * 13 + i]);
 		for (i = 0; i < 4; i++)
 			*--dkey = htobe32(ek[4 * 14 + i]);
 		break;
 	}
 	MPASS(dkey == dec_key);
 }
 
 static struct sx mlu;	/* mod load unload */
 SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload");
 
 static int
 mod_event(module_t mod, int cmd, void *arg)
 {
 	int rc = 0;
 	static int loaded = 0;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		sx_xlock(&mlu);
 		if (loaded++ == 0) {
 			t4_sge_modload();
 			t4_register_shared_cpl_handler(CPL_SET_TCB_RPL,
 			    t4_filter_rpl, CPL_COOKIE_FILTER);
 			t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL,
 			    do_l2t_write_rpl, CPL_COOKIE_FILTER);
 			t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL,
 			    t4_hashfilter_ao_rpl, CPL_COOKIE_HASHFILTER);
 			t4_register_shared_cpl_handler(CPL_SET_TCB_RPL,
 			    t4_hashfilter_tcb_rpl, CPL_COOKIE_HASHFILTER);
 			t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS,
 			    t4_del_hashfilter_rpl, CPL_COOKIE_HASHFILTER);
 			t4_register_cpl_handler(CPL_TRACE_PKT, t4_trace_pkt);
 			t4_register_cpl_handler(CPL_T5_TRACE_PKT, t5_trace_pkt);
 			t4_register_cpl_handler(CPL_SMT_WRITE_RPL,
 			    do_smt_write_rpl);
 			sx_init(&t4_list_lock, "T4/T5 adapters");
 			SLIST_INIT(&t4_list);
 			callout_init(&fatal_callout, 1);
 #ifdef TCP_OFFLOAD
 			sx_init(&t4_uld_list_lock, "T4/T5 ULDs");
 			SLIST_INIT(&t4_uld_list);
 #endif
 #ifdef INET6
 			t4_clip_modload();
 #endif
 			t4_tracer_modload();
 			tweak_tunables();
 		}
 		sx_xunlock(&mlu);
 		break;
 
 	case MOD_UNLOAD:
 		sx_xlock(&mlu);
 		if (--loaded == 0) {
 			int tries;
 
 			sx_slock(&t4_list_lock);
 			if (!SLIST_EMPTY(&t4_list)) {
 				rc = EBUSY;
 				sx_sunlock(&t4_list_lock);
 				goto done_unload;
 			}
 #ifdef TCP_OFFLOAD
 			sx_slock(&t4_uld_list_lock);
 			if (!SLIST_EMPTY(&t4_uld_list)) {
 				rc = EBUSY;
 				sx_sunlock(&t4_uld_list_lock);
 				sx_sunlock(&t4_list_lock);
 				goto done_unload;
 			}
 #endif
 			tries = 0;
 			while (tries++ < 5 && t4_sge_extfree_refs() != 0) {
 				uprintf("%ju clusters with custom free routine "
 				    "still is use.\n", t4_sge_extfree_refs());
 				pause("t4unload", 2 * hz);
 			}
 #ifdef TCP_OFFLOAD
 			sx_sunlock(&t4_uld_list_lock);
 #endif
 			sx_sunlock(&t4_list_lock);
 
 			if (t4_sge_extfree_refs() == 0) {
 				t4_tracer_modunload();
 #ifdef INET6
 				t4_clip_modunload();
 #endif
 #ifdef TCP_OFFLOAD
 				sx_destroy(&t4_uld_list_lock);
 #endif
 				sx_destroy(&t4_list_lock);
 				t4_sge_modunload();
 				loaded = 0;
 			} else {
 				rc = EBUSY;
 				loaded++;	/* undo earlier decrement */
 			}
 		}
 done_unload:
 		sx_xunlock(&mlu);
 		break;
 	}
 
 	return (rc);
 }
 
 static devclass_t t4_devclass, t5_devclass, t6_devclass;
 static devclass_t cxgbe_devclass, cxl_devclass, cc_devclass;
 static devclass_t vcxgbe_devclass, vcxl_devclass, vcc_devclass;
 
 DRIVER_MODULE(t4nex, pci, t4_driver, t4_devclass, mod_event, 0);
 MODULE_VERSION(t4nex, 1);
 MODULE_DEPEND(t4nex, firmware, 1, 1, 1);
 #ifdef DEV_NETMAP
 MODULE_DEPEND(t4nex, netmap, 1, 1, 1);
 #endif /* DEV_NETMAP */
 
 DRIVER_MODULE(t5nex, pci, t5_driver, t5_devclass, mod_event, 0);
 MODULE_VERSION(t5nex, 1);
 MODULE_DEPEND(t5nex, firmware, 1, 1, 1);
 #ifdef DEV_NETMAP
 MODULE_DEPEND(t5nex, netmap, 1, 1, 1);
 #endif /* DEV_NETMAP */
 
 DRIVER_MODULE(t6nex, pci, t6_driver, t6_devclass, mod_event, 0);
 MODULE_VERSION(t6nex, 1);
 MODULE_DEPEND(t6nex, firmware, 1, 1, 1);
 #ifdef DEV_NETMAP
 MODULE_DEPEND(t6nex, netmap, 1, 1, 1);
 #endif /* DEV_NETMAP */
 
 DRIVER_MODULE(cxgbe, t4nex, cxgbe_driver, cxgbe_devclass, 0, 0);
 MODULE_VERSION(cxgbe, 1);
 
 DRIVER_MODULE(cxl, t5nex, cxl_driver, cxl_devclass, 0, 0);
 MODULE_VERSION(cxl, 1);
 
 DRIVER_MODULE(cc, t6nex, cc_driver, cc_devclass, 0, 0);
 MODULE_VERSION(cc, 1);
 
 DRIVER_MODULE(vcxgbe, cxgbe, vcxgbe_driver, vcxgbe_devclass, 0, 0);
 MODULE_VERSION(vcxgbe, 1);
 
 DRIVER_MODULE(vcxl, cxl, vcxl_driver, vcxl_devclass, 0, 0);
 MODULE_VERSION(vcxl, 1);
 
 DRIVER_MODULE(vcc, cc, vcc_driver, vcc_devclass, 0, 0);
 MODULE_VERSION(vcc, 1);
Index: head/sys/dev/cxgbe/t4_sched.c
===================================================================
--- head/sys/dev/cxgbe/t4_sched.c	(revision 348253)
+++ head/sys/dev/cxgbe/t4_sched.c	(revision 348254)
@@ -1,906 +1,906 @@
 /*-
  * Copyright (c) 2017 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 
 #include <sys/types.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/taskqueue.h>
 #include <sys/sysctl.h>
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_msg.h"
 
 
 static int
 in_range(int val, int lo, int hi)
 {
 
 	return (val < 0 || (val <= hi && val >= lo));
 }
 
 static int
 set_sched_class_config(struct adapter *sc, int minmax)
 {
 	int rc;
 
 	if (minmax < 0)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4sscc");
 	if (rc)
 		return (rc);
 	rc = -t4_sched_config(sc, FW_SCHED_TYPE_PKTSCHED, minmax, 1);
 	end_synchronized_op(sc, 0);
 
 	return (rc);
 }
 
 static int
 set_sched_class_params(struct adapter *sc, struct t4_sched_class_params *p,
     int sleep_ok)
 {
 	int rc, top_speed, fw_level, fw_mode, fw_rateunit, fw_ratemode;
 	struct port_info *pi;
 	struct tx_cl_rl_params *tc, old;
 	bool check_pktsize = false;
 
 	if (p->level == SCHED_CLASS_LEVEL_CL_RL)
 		fw_level = FW_SCHED_PARAMS_LEVEL_CL_RL;
 	else if (p->level == SCHED_CLASS_LEVEL_CL_WRR)
 		fw_level = FW_SCHED_PARAMS_LEVEL_CL_WRR;
 	else if (p->level == SCHED_CLASS_LEVEL_CH_RL)
 		fw_level = FW_SCHED_PARAMS_LEVEL_CH_RL;
 	else
 		return (EINVAL);
 
 	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
 		if (p->mode == SCHED_CLASS_MODE_CLASS)
 			fw_mode = FW_SCHED_PARAMS_MODE_CLASS;
 		else if (p->mode == SCHED_CLASS_MODE_FLOW) {
 			check_pktsize = true;
 			fw_mode = FW_SCHED_PARAMS_MODE_FLOW;
 		} else
 			return (EINVAL);
 	} else
 		fw_mode = 0;
 
 	/* Valid channel must always be provided. */
 	if (p->channel < 0)
 		return (EINVAL);
 	if (!in_range(p->channel, 0, sc->chip_params->nchan - 1))
 		return (ERANGE);
 
 	pi = sc->port[sc->chan_map[p->channel]];
 	if (pi == NULL)
 		return (ENXIO);
 	MPASS(pi->tx_chan == p->channel);
 	top_speed = port_top_speed(pi) * 1000000; /* Gbps -> Kbps */
 
 	if (p->level == SCHED_CLASS_LEVEL_CL_RL ||
 	    p->level == SCHED_CLASS_LEVEL_CH_RL) {
 		/*
 		 * Valid rate (mode, unit and values) must be provided.
 		 */
 
 		if (p->minrate < 0)
 			p->minrate = 0;
 		if (p->maxrate < 0)
 			return (EINVAL);
 
 		if (p->rateunit == SCHED_CLASS_RATEUNIT_BITS) {
 			fw_rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
 			/* ratemode could be relative (%) or absolute. */
 			if (p->ratemode == SCHED_CLASS_RATEMODE_REL) {
 				fw_ratemode = FW_SCHED_PARAMS_RATE_REL;
 				/* maxrate is % of port bandwidth. */
 				if (!in_range(p->minrate, 0, 100) ||
 				    !in_range(p->maxrate, 0, 100)) {
 					return (ERANGE);
 				}
 			} else if (p->ratemode == SCHED_CLASS_RATEMODE_ABS) {
 				fw_ratemode = FW_SCHED_PARAMS_RATE_ABS;
 				/* maxrate is absolute value in kbps. */
 				if (!in_range(p->minrate, 0, top_speed) ||
 				    !in_range(p->maxrate, 0, top_speed)) {
 					return (ERANGE);
 				}
 			} else
 				return (EINVAL);
 		} else if (p->rateunit == SCHED_CLASS_RATEUNIT_PKTS) {
 			/* maxrate is the absolute value in pps. */
 			check_pktsize = true;
 			fw_rateunit = FW_SCHED_PARAMS_UNIT_PKTRATE;
 		} else
 			return (EINVAL);
 	} else {
 		MPASS(p->level == SCHED_CLASS_LEVEL_CL_WRR);
 
 		/*
 		 * Valid weight must be provided.
 		 */
 		if (p->weight < 0)
 		       return (EINVAL);
 		if (!in_range(p->weight, 1, 99))
 			return (ERANGE);
 
 		fw_rateunit = 0;
 		fw_ratemode = 0;
 	}
 
 	if (p->level == SCHED_CLASS_LEVEL_CL_RL ||
 	    p->level == SCHED_CLASS_LEVEL_CL_WRR) {
 		/*
 		 * Valid scheduling class must be provided.
 		 */
 		if (p->cl < 0)
 			return (EINVAL);
 		if (!in_range(p->cl, 0, sc->chip_params->nsched_cls - 1))
 			return (ERANGE);
 	}
 
 	if (check_pktsize) {
 		if (p->pktsize < 0)
 			return (EINVAL);
 		if (!in_range(p->pktsize, 64, pi->vi[0].ifp->if_mtu))
 			return (ERANGE);
 	}
 
 	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
 		tc = &pi->sched_params->cl_rl[p->cl];
 		mtx_lock(&sc->tc_lock);
 		if (tc->refcount > 0 || tc->flags & (CLRL_SYNC | CLRL_ASYNC))
 			rc = EBUSY;
 		else {
 			tc->flags |= CLRL_SYNC | CLRL_USER;
 			tc->ratemode = fw_ratemode;
 			tc->rateunit = fw_rateunit;
 			tc->mode = fw_mode;
 			tc->maxrate = p->maxrate;
 			tc->pktsize = p->pktsize;
 			rc = 0;
 			old= *tc;
 		}
 		mtx_unlock(&sc->tc_lock);
 		if (rc != 0)
 			return (rc);
 	}
 
 	rc = begin_synchronized_op(sc, NULL,
 	    sleep_ok ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4sscp");
 	if (rc != 0) {
 		if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
 			mtx_lock(&sc->tc_lock);
 			*tc = old;
 			mtx_unlock(&sc->tc_lock);
 		}
 		return (rc);
 	}
 	rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED, fw_level, fw_mode,
 	    fw_rateunit, fw_ratemode, p->channel, p->cl, p->minrate, p->maxrate,
 	    p->weight, p->pktsize, 0, sleep_ok);
 	end_synchronized_op(sc, sleep_ok ? 0 : LOCK_HELD);
 
 	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
 		mtx_lock(&sc->tc_lock);
 		MPASS(tc->flags & CLRL_SYNC);
 		MPASS(tc->flags & CLRL_USER);
 		MPASS(tc->refcount == 0);
 
 		tc->flags &= ~CLRL_SYNC;
 		if (rc == 0)
 			tc->flags &= ~CLRL_ERR;
 		else
 			tc->flags |= CLRL_ERR;
 		mtx_unlock(&sc->tc_lock);
 	}
 
 	return (rc);
 }
 
 static void
 update_tx_sched(void *context, int pending)
 {
 	int i, j, rc;
 	struct port_info *pi;
 	struct tx_cl_rl_params *tc;
 	struct adapter *sc = context;
 	const int n = sc->chip_params->nsched_cls;
 
 	mtx_lock(&sc->tc_lock);
 	for_each_port(sc, i) {
 		pi = sc->port[i];
 		tc = &pi->sched_params->cl_rl[0];
 		for (j = 0; j < n; j++, tc++) {
 			MPASS(mtx_owned(&sc->tc_lock));
 			if ((tc->flags & CLRL_ASYNC) == 0)
 				continue;
 			mtx_unlock(&sc->tc_lock);
 
 			if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
 			    "t4utxs") != 0) {
 				mtx_lock(&sc->tc_lock);
 				continue;
 			}
 			rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED,
 			    FW_SCHED_PARAMS_LEVEL_CL_RL, tc->mode, tc->rateunit,
 			    tc->ratemode, pi->tx_chan, j, 0, tc->maxrate, 0,
 			    tc->pktsize, tc->burstsize, 1);
 			end_synchronized_op(sc, 0);
 
 			mtx_lock(&sc->tc_lock);
 			MPASS(tc->flags & CLRL_ASYNC);
 			tc->flags &= ~CLRL_ASYNC;
 			if (rc == 0)
 				tc->flags &= ~CLRL_ERR;
 			else
 				tc->flags |= CLRL_ERR;
 		}
 	}
 	mtx_unlock(&sc->tc_lock);
 }
 
 int
 t4_set_sched_class(struct adapter *sc, struct t4_sched_params *p)
 {
 
 	if (p->type != SCHED_CLASS_TYPE_PACKET)
 		return (EINVAL);
 
 	if (p->subcmd == SCHED_CLASS_SUBCMD_CONFIG)
 		return (set_sched_class_config(sc, p->u.config.minmax));
 
 	if (p->subcmd == SCHED_CLASS_SUBCMD_PARAMS)
 		return (set_sched_class_params(sc, &p->u.params, 1));
 
 	return (EINVAL);
 }
 
 static int
 bind_txq_to_traffic_class(struct adapter *sc, struct sge_txq *txq, int idx)
 {
 	struct tx_cl_rl_params *tc0, *tc;
 	int rc, old_idx;
 	uint32_t fw_mnem, fw_class;
 
 	if (!(txq->eq.flags & EQ_ALLOCATED))
 		return (EAGAIN);
 
 	mtx_lock(&sc->tc_lock);
 	if (txq->tc_idx == -2) {
 		rc = EBUSY;	/* Another bind/unbind in progress already. */
 		goto done;
 	}
 	if (idx == txq->tc_idx) {
 		rc = 0;		/* No change, nothing to do. */
 		goto done;
 	}
 
 	tc0 = &sc->port[txq->eq.tx_chan]->sched_params->cl_rl[0];
 	if (idx != -1) {
 		/*
 		 * Bind to a different class at index idx.
 		 */
 		tc = &tc0[idx];
 		if (tc->flags & CLRL_ERR) {
 			rc = ENXIO;
 			goto done;
 		} else {
 			/*
 			 * Ok to proceed.  Place a reference on the new class
 			 * while still holding on to the reference on the
 			 * previous class, if any.
 			 */
 			tc->refcount++;
 		}
 	}
 	/* Mark as busy before letting go of the lock. */
 	old_idx = txq->tc_idx;
 	txq->tc_idx = -2;
 	mtx_unlock(&sc->tc_lock);
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4btxq");
 	if (rc != 0)
 		return (rc);
 	fw_mnem = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) |
 	    V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id));
 	fw_class = idx < 0 ? 0xffffffff : idx;
 	rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_mnem, &fw_class);
 	end_synchronized_op(sc, 0);
 
 	mtx_lock(&sc->tc_lock);
 	MPASS(txq->tc_idx == -2);
 	if (rc == 0) {
 		/*
 		 * Unbind, bind, or bind to a different class succeeded.  Remove
 		 * the reference on the old traffic class, if any.
 		 */
 		if (old_idx != -1) {
 			tc = &tc0[old_idx];
 			MPASS(tc->refcount > 0);
 			tc->refcount--;
 		}
 		txq->tc_idx = idx;
 	} else {
 		/*
 		 * Unbind, bind, or bind to a different class failed.  Remove
 		 * the anticipatory reference on the new traffic class, if any.
 		 */
 		if (idx != -1) {
 			tc = &tc0[idx];
 			MPASS(tc->refcount > 0);
 			tc->refcount--;
 		}
 		txq->tc_idx = old_idx;
 	}
 done:
 	MPASS(txq->tc_idx >= -1 && txq->tc_idx < sc->chip_params->nsched_cls);
 	mtx_unlock(&sc->tc_lock);
 	return (rc);
 }
 
 int
 t4_set_sched_queue(struct adapter *sc, struct t4_sched_queue *p)
 {
 	struct port_info *pi = NULL;
 	struct vi_info *vi;
 	struct sge_txq *txq;
 	int i, rc;
 
 	if (p->port >= sc->params.nports)
 		return (EINVAL);
 
 	/*
 	 * XXX: cxgbetool allows the user to specify the physical port only.  So
 	 * we always operate on the main VI.
 	 */
 	pi = sc->port[p->port];
 	vi = &pi->vi[0];
 
 	/* Checking VI_INIT_DONE outside a synch-op is a harmless race here. */
 	if (!(vi->flags & VI_INIT_DONE))
 		return (EAGAIN);
 	MPASS(vi->ntxq > 0);
 
 	if (!in_range(p->queue, 0, vi->ntxq - 1) ||
 	    !in_range(p->cl, 0, sc->chip_params->nsched_cls - 1))
 		return (EINVAL);
 
 	if (p->queue < 0) {
 		/*
 		 * Change the scheduling on all the TX queues for the
 		 * interface.
 		 */
 		for_each_txq(vi, i, txq) {
 			rc = bind_txq_to_traffic_class(sc, txq, p->cl);
 			if (rc != 0)
 				break;
 		}
 	} else {
 		/*
 		 * If op.queue is non-negative, then we're only changing the
 		 * scheduling on a single specified TX queue.
 		 */
 		txq = &sc->sge.txq[vi->first_txq + p->queue];
 		rc = bind_txq_to_traffic_class(sc, txq, p->cl);
 	}
 
 	return (rc);
 }
 
 int
 t4_init_tx_sched(struct adapter *sc)
 {
 	int i, j;
 	const int n = sc->chip_params->nsched_cls;
 	struct port_info *pi;
 	struct tx_cl_rl_params *tc;
 
 	mtx_init(&sc->tc_lock, "tx_sched lock", NULL, MTX_DEF);
 	TASK_INIT(&sc->tc_task, 0, update_tx_sched, sc);
 	for_each_port(sc, i) {
 		pi = sc->port[i];
 		pi->sched_params = malloc(sizeof(*pi->sched_params) +
 		    n * sizeof(*tc), M_CXGBE, M_ZERO | M_WAITOK);
 		tc = &pi->sched_params->cl_rl[0];
 		for (j = 0; j < n; j++, tc++) {
 			tc->refcount = 0;
 			tc->ratemode = FW_SCHED_PARAMS_RATE_ABS;
 			tc->rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
 			tc->mode = FW_SCHED_PARAMS_MODE_CLASS;
 			tc->maxrate = 1000 * 1000;	/* 1 Gbps.  Arbitrary */
 
 			if (t4_sched_params_cl_rl_kbps(sc, pi->tx_chan, j,
 			    tc->mode, tc->maxrate, tc->pktsize, 1) != 0)
 				tc->flags = CLRL_ERR;
 		}
 	}
 
 	return (0);
 }
 
 int
 t4_free_tx_sched(struct adapter *sc)
 {
 	int i;
 
 	taskqueue_drain(taskqueue_thread, &sc->tc_task);
 
 	for_each_port(sc, i) {
 		if (sc->port[i] != NULL)
 			free(sc->port[i]->sched_params, M_CXGBE);
 	}
 
 	if (mtx_initialized(&sc->tc_lock))
 		mtx_destroy(&sc->tc_lock);
 
 	return (0);
 }
 
 void
 t4_update_tx_sched(struct adapter *sc)
 {
 
 	taskqueue_enqueue(taskqueue_thread, &sc->tc_task);
 }
 
 int
 t4_reserve_cl_rl_kbps(struct adapter *sc, int port_id, u_int maxrate,
     int *tc_idx)
 {
 	int rc = 0, fa = -1, i, pktsize, burstsize;
 	bool update;
 	struct tx_cl_rl_params *tc;
 	struct port_info *pi;
 
 	MPASS(port_id >= 0 && port_id < sc->params.nports);
 
 	pi = sc->port[port_id];
 	if (pi->sched_params->pktsize > 0)
 		pktsize = pi->sched_params->pktsize;
 	else
 		pktsize = pi->vi[0].ifp->if_mtu;
 	if (pi->sched_params->burstsize > 0)
 		burstsize = pi->sched_params->burstsize;
 	else
 		burstsize = pktsize * 4;
 	tc = &pi->sched_params->cl_rl[0];
 
 	update = false;
 	mtx_lock(&sc->tc_lock);
 	for (i = 0; i < sc->chip_params->nsched_cls; i++, tc++) {
 		if (fa < 0 && tc->refcount == 0 && !(tc->flags & CLRL_USER))
 			fa = i;		/* first available */
 
 		if (tc->ratemode == FW_SCHED_PARAMS_RATE_ABS &&
 		    tc->rateunit == FW_SCHED_PARAMS_UNIT_BITRATE &&
 		    tc->mode == FW_SCHED_PARAMS_MODE_FLOW &&
 		    tc->maxrate == maxrate && tc->pktsize == pktsize &&
 		    tc->burstsize == burstsize) {
 			tc->refcount++;
 			*tc_idx = i;
 			if ((tc->flags & (CLRL_ERR | CLRL_ASYNC | CLRL_SYNC)) ==
 			    CLRL_ERR) {
 				update = true;
 			}
 			goto done;
 		}
 	}
 	/* Not found */
 	MPASS(i == sc->chip_params->nsched_cls);
 	if (fa != -1) {
 		tc = &pi->sched_params->cl_rl[fa];
 		tc->refcount = 1;
 		tc->ratemode = FW_SCHED_PARAMS_RATE_ABS;
 		tc->rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
 		tc->mode = FW_SCHED_PARAMS_MODE_FLOW;
 		tc->maxrate = maxrate;
 		tc->pktsize = pktsize;
 		tc->burstsize = burstsize;
 		*tc_idx = fa;
 		update = true;
 	} else {
 		*tc_idx = -1;
 		rc = ENOSPC;
 	}
 done:
 	mtx_unlock(&sc->tc_lock);
 	if (update) {
 		tc->flags |= CLRL_ASYNC;
 		t4_update_tx_sched(sc);
 	}
 	return (rc);
 }
 
 void
 t4_release_cl_rl(struct adapter *sc, int port_id, int tc_idx)
 {
 	struct tx_cl_rl_params *tc;
 
 	MPASS(port_id >= 0 && port_id < sc->params.nports);
 	MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls);
 
 	mtx_lock(&sc->tc_lock);
 	tc = &sc->port[port_id]->sched_params->cl_rl[tc_idx];
 	MPASS(tc->refcount > 0);
 	tc->refcount--;
 	mtx_unlock(&sc->tc_lock);
 }
 
 int
 sysctl_tc(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct port_info *pi;
 	struct adapter *sc;
 	struct sge_txq *txq;
 	int qidx = arg2, rc, tc_idx;
 
 	MPASS(qidx >= 0 && qidx < vi->ntxq);
 	pi = vi->pi;
 	sc = pi->adapter;
 	txq = &sc->sge.txq[vi->first_txq + qidx];
 
 	tc_idx = txq->tc_idx;
 	rc = sysctl_handle_int(oidp, &tc_idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (sc->flags & IS_VF)
 		return (EPERM);
 	if (!in_range(tc_idx, 0, sc->chip_params->nsched_cls - 1))
 		return (EINVAL);
 
 	return (bind_txq_to_traffic_class(sc, txq, tc_idx));
 }
 
 int
 sysctl_tc_params(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct tx_cl_rl_params tc;
 	struct sbuf *sb;
 	int i, rc, port_id, mbps, gbps;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	port_id = arg2 >> 16;
 	MPASS(port_id < sc->params.nports);
 	MPASS(sc->port[port_id] != NULL);
 	i = arg2 & 0xffff;
 	MPASS(i < sc->chip_params->nsched_cls);
 
 	mtx_lock(&sc->tc_lock);
 	tc = sc->port[port_id]->sched_params->cl_rl[i];
 	mtx_unlock(&sc->tc_lock);
 
 	switch (tc.rateunit) {
 	case SCHED_CLASS_RATEUNIT_BITS:
 		switch (tc.ratemode) {
 		case SCHED_CLASS_RATEMODE_REL:
 			/* XXX: top speed or actual link speed? */
 			gbps = port_top_speed(sc->port[port_id]);
 			sbuf_printf(sb, "%u%% of %uGbps", tc.maxrate, gbps);
 			break;
 		case SCHED_CLASS_RATEMODE_ABS:
 			mbps = tc.maxrate / 1000;
 			gbps = tc.maxrate / 1000000;
 			if (tc.maxrate == gbps * 1000000)
 				sbuf_printf(sb, "%uGbps", gbps);
 			else if (tc.maxrate == mbps * 1000)
 				sbuf_printf(sb, "%uMbps", mbps);
 			else
 				sbuf_printf(sb, "%uKbps", tc.maxrate);
 			break;
 		default:
 			rc = ENXIO;
 			goto done;
 		}
 		break;
 	case SCHED_CLASS_RATEUNIT_PKTS:
 		sbuf_printf(sb, "%upps", tc.maxrate);
 		break;
 	default:
 		rc = ENXIO;
 		goto done;
 	}
 
 	switch (tc.mode) {
 	case SCHED_CLASS_MODE_CLASS:
 		sbuf_printf(sb, " aggregate");
 		break;
 	case SCHED_CLASS_MODE_FLOW:
 		sbuf_printf(sb, " per-flow");
 		if (tc.pktsize > 0)
 			sbuf_printf(sb, " pkt-size %u", tc.pktsize);
 		if (tc.burstsize > 0)
 			sbuf_printf(sb, " burst-size %u", tc.burstsize);
 		break;
 	default:
 		rc = ENXIO;
 		goto done;
 	}
 
 done:
 	if (rc == 0)
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 #ifdef RATELIMIT
 void
 t4_init_etid_table(struct adapter *sc)
 {
 	int i;
 	struct tid_info *t;
 
 	if (!is_ethoffload(sc))
 		return;
 
 	t = &sc->tids;
 	MPASS(t->netids > 0);
 
 	mtx_init(&t->etid_lock, "etid lock", NULL, MTX_DEF);
 	t->etid_tab = malloc(sizeof(*t->etid_tab) * t->netids, M_CXGBE,
 			M_ZERO | M_WAITOK);
 	t->efree = t->etid_tab;
 	t->etids_in_use = 0;
 	for (i = 1; i < t->netids; i++)
 		t->etid_tab[i - 1].next = &t->etid_tab[i];
 	t->etid_tab[t->netids - 1].next = NULL;
 }
 
 void
 t4_free_etid_table(struct adapter *sc)
 {
 	struct tid_info *t;
 
 	if (!is_ethoffload(sc))
 		return;
 
 	t = &sc->tids;
 	MPASS(t->netids > 0);
 
 	free(t->etid_tab, M_CXGBE);
 	t->etid_tab = NULL;
 
 	if (mtx_initialized(&t->etid_lock))
 		mtx_destroy(&t->etid_lock);
 }
 
 /* etid services */
 static int alloc_etid(struct adapter *, struct cxgbe_snd_tag *);
 static void free_etid(struct adapter *, int);
 
 static int
 alloc_etid(struct adapter *sc, struct cxgbe_snd_tag *cst)
 {
 	struct tid_info *t = &sc->tids;
 	int etid = -1;
 
 	mtx_lock(&t->etid_lock);
 	if (t->efree) {
 		union etid_entry *p = t->efree;
 
 		etid = p - t->etid_tab + t->etid_base;
 		t->efree = p->next;
 		p->cst = cst;
 		t->etids_in_use++;
 	}
 	mtx_unlock(&t->etid_lock);
 	return (etid);
 }
 
 struct cxgbe_snd_tag *
 lookup_etid(struct adapter *sc, int etid)
 {
 	struct tid_info *t = &sc->tids;
 
 	return (t->etid_tab[etid - t->etid_base].cst);
 }
 
 static void
 free_etid(struct adapter *sc, int etid)
 {
 	struct tid_info *t = &sc->tids;
 	union etid_entry *p = &t->etid_tab[etid - t->etid_base];
 
 	mtx_lock(&t->etid_lock);
 	p->next = t->efree;
 	t->efree = p;
 	t->etids_in_use--;
 	mtx_unlock(&t->etid_lock);
 }
 
 int
 cxgbe_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
     struct m_snd_tag **pt)
 {
 	int rc, schedcl;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct cxgbe_snd_tag *cst;
 
 	if (params->hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
 		return (ENOTSUP);
 
 	rc = t4_reserve_cl_rl_kbps(sc, pi->port_id,
 	    (params->rate_limit.max_rate * 8ULL / 1000), &schedcl);
 	if (rc != 0)
 		return (rc);
 	MPASS(schedcl >= 0 && schedcl < sc->chip_params->nsched_cls);
 
 	cst = malloc(sizeof(*cst), M_CXGBE, M_ZERO | M_NOWAIT);
 	if (cst == NULL) {
 failed:
 		t4_release_cl_rl(sc, pi->port_id, schedcl);
 		return (ENOMEM);
 	}
 
 	cst->etid = alloc_etid(sc, cst);
 	if (cst->etid < 0) {
 		free(cst, M_CXGBE);
 		goto failed;
 	}
 
 	mtx_init(&cst->lock, "cst_lock", NULL, MTX_DEF);
 	mbufq_init(&cst->pending_tx, INT_MAX);
 	mbufq_init(&cst->pending_fwack, INT_MAX);
-	cst->com.ifp = ifp;
+	m_snd_tag_init(&cst->com, ifp);
 	cst->flags |= EO_FLOWC_PENDING | EO_SND_TAG_REF;
 	cst->adapter = sc;
 	cst->port_id = pi->port_id;
 	cst->schedcl = schedcl;
 	cst->max_rate = params->rate_limit.max_rate;
 	cst->tx_credits = sc->params.eo_wr_cred;
 	cst->tx_total = cst->tx_credits;
 	cst->plen = 0;
 	cst->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
 	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
 	    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
 
 	/*
 	 * Queues will be selected later when the connection flowid is available.
 	 */
 
 	*pt = &cst->com;
 	return (0);
 }
 
 /*
  * Change in parameters, no change in ifp.
  */
 int
 cxgbe_snd_tag_modify(struct m_snd_tag *mst,
     union if_snd_tag_modify_params *params)
 {
 	int rc, schedcl;
 	struct cxgbe_snd_tag *cst = mst_to_cst(mst);
 	struct adapter *sc = cst->adapter;
 
 	/* XXX: is schedcl -1 ok here? */
 	MPASS(cst->schedcl >= 0 && cst->schedcl < sc->chip_params->nsched_cls);
 
 	mtx_lock(&cst->lock);
 	MPASS(cst->flags & EO_SND_TAG_REF);
 	rc = t4_reserve_cl_rl_kbps(sc, cst->port_id,
 	    (params->rate_limit.max_rate * 8ULL / 1000), &schedcl);
 	if (rc != 0)
 		return (rc);
 	MPASS(schedcl >= 0 && schedcl < sc->chip_params->nsched_cls);
 	t4_release_cl_rl(sc, cst->port_id, cst->schedcl);
 	cst->schedcl = schedcl;
 	cst->max_rate = params->rate_limit.max_rate;
 	mtx_unlock(&cst->lock);
 
 	return (0);
 }
 
 int
 cxgbe_snd_tag_query(struct m_snd_tag *mst,
     union if_snd_tag_query_params *params)
 {
 	struct cxgbe_snd_tag *cst = mst_to_cst(mst);
 
 	params->rate_limit.max_rate = cst->max_rate;
 
 #define CST_TO_MST_QLEVEL_SCALE (IF_SND_QUEUE_LEVEL_MAX / cst->tx_total)
 	params->rate_limit.queue_level =
 		(cst->tx_total - cst->tx_credits) * CST_TO_MST_QLEVEL_SCALE;
 
 	return (0);
 }
 
 /*
  * Unlocks cst and frees it.
  */
 void
 cxgbe_snd_tag_free_locked(struct cxgbe_snd_tag *cst)
 {
 	struct adapter *sc = cst->adapter;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 	MPASS((cst->flags & EO_SND_TAG_REF) == 0);
 	MPASS(cst->tx_credits == cst->tx_total);
 	MPASS(cst->plen == 0);
 	MPASS(mbufq_first(&cst->pending_tx) == NULL);
 	MPASS(mbufq_first(&cst->pending_fwack) == NULL);
 
 	if (cst->etid >= 0)
 		free_etid(sc, cst->etid);
 	if (cst->schedcl != -1)
 		t4_release_cl_rl(sc, cst->port_id, cst->schedcl);
 	mtx_unlock(&cst->lock);
 	mtx_destroy(&cst->lock);
 	free(cst, M_CXGBE);
 }
 
 void
 cxgbe_snd_tag_free(struct m_snd_tag *mst)
 {
 	struct cxgbe_snd_tag *cst = mst_to_cst(mst);
 
 	mtx_lock(&cst->lock);
 
 	/* The kernel is done with the snd_tag.  Remove its reference. */
 	MPASS(cst->flags & EO_SND_TAG_REF);
 	cst->flags &= ~EO_SND_TAG_REF;
 
 	if (cst->ncompl == 0) {
 		/*
 		 * No fw4_ack in flight.  Free the tag right away if there are
 		 * no outstanding credits.  Request the firmware to return all
 		 * credits for the etid otherwise.
 		 */
 		if (cst->tx_credits == cst->tx_total) {
 			cxgbe_snd_tag_free_locked(cst);
 			return;	/* cst is gone. */
 		}
 		send_etid_flush_wr(cst);
 	}
 	mtx_unlock(&cst->lock);
 }
 #endif
Index: head/sys/dev/cxgbe/t4_sge.c
===================================================================
--- head/sys/dev/cxgbe/t4_sge.c	(revision 348253)
+++ head/sys/dev/cxgbe/t4_sge.c	(revision 348254)
@@ -1,6054 +1,6087 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 
 #include <sys/types.h>
 #include <sys/eventhandler.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/taskqueue.h>
 #include <sys/time.h>
 #include <sys/sglist.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 #include <sys/counter.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_vlan_var.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <machine/in_cksum.h>
 #include <machine/md_var.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #ifdef DEV_NETMAP
 #include <machine/bus.h>
 #include <sys/selinfo.h>
 #include <net/if_var.h>
 #include <net/netmap.h>
 #include <dev/netmap/netmap_kern.h>
 #endif
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_msg.h"
 #include "t4_l2t.h"
 #include "t4_mp_ring.h"
 
 #ifdef T4_PKT_TIMESTAMP
 #define RX_COPY_THRESHOLD (MINCLSIZE - 8)
 #else
 #define RX_COPY_THRESHOLD MINCLSIZE
 #endif
 
 /* Internal mbuf flags stored in PH_loc.eight[1]. */
 #define	MC_RAW_WR		0x02
 
 /*
  * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
  * 0-7 are valid values.
  */
 static int fl_pktshift = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pktshift, CTLFLAG_RDTUN, &fl_pktshift, 0,
     "payload DMA offset in rx buffer (bytes)");
 
 /*
  * Pad ethernet payload up to this boundary.
  * -1: driver should figure out a good value.
  *  0: disable padding.
  *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
  */
 int fl_pad = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pad, CTLFLAG_RDTUN, &fl_pad, 0,
     "payload pad boundary (bytes)");
 
 /*
  * Status page length.
  * -1: driver should figure out a good value.
  *  64 or 128 are the only other valid values.
  */
 static int spg_len = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, spg_len, CTLFLAG_RDTUN, &spg_len, 0,
     "status page size (bytes)");
 
 /*
  * Congestion drops.
  * -1: no congestion feedback (not recommended).
  *  0: backpressure the channel instead of dropping packets right away.
  *  1: no backpressure, drop packets for the congested queue immediately.
  */
 static int cong_drop = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, cong_drop, CTLFLAG_RDTUN, &cong_drop, 0,
     "Congestion control for RX queues (0 = backpressure, 1 = drop");
 
 /*
  * Deliver multiple frames in the same free list buffer if they fit.
  * -1: let the driver decide whether to enable buffer packing or not.
  *  0: disable buffer packing.
  *  1: enable buffer packing.
  */
 static int buffer_packing = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, buffer_packing, CTLFLAG_RDTUN, &buffer_packing,
     0, "Enable buffer packing");
 
 /*
  * Start next frame in a packed buffer at this boundary.
  * -1: driver should figure out a good value.
  * T4: driver will ignore this and use the same value as fl_pad above.
  * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
  */
 static int fl_pack = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pack, CTLFLAG_RDTUN, &fl_pack, 0,
     "payload pack boundary (bytes)");
 
 /*
  * Allow the driver to create mbuf(s) in a cluster allocated for rx.
  * 0: never; always allocate mbufs from the zone_mbuf UMA zone.
  * 1: ok to create mbuf(s) within a cluster if there is room.
  */
 static int allow_mbufs_in_cluster = 1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, allow_mbufs_in_cluster, CTLFLAG_RDTUN,
     &allow_mbufs_in_cluster, 0,
     "Allow driver to create mbufs within a rx cluster");
 
 /*
  * Largest rx cluster size that the driver is allowed to allocate.
  */
 static int largest_rx_cluster = MJUM16BYTES;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, largest_rx_cluster, CTLFLAG_RDTUN,
     &largest_rx_cluster, 0, "Largest rx cluster (bytes)");
 
 /*
  * Size of cluster allocation that's most likely to succeed.  The driver will
  * fall back to this size if it fails to allocate clusters larger than this.
  */
 static int safest_rx_cluster = PAGE_SIZE;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, safest_rx_cluster, CTLFLAG_RDTUN,
     &safest_rx_cluster, 0, "Safe rx cluster (bytes)");
 
 #ifdef RATELIMIT
 /*
  * Knob to control TCP timestamp rewriting, and the granularity of the tick used
  * for rewriting.  -1 and 0-3 are all valid values.
  * -1: hardware should leave the TCP timestamps alone.
  * 0: 1ms
  * 1: 100us
  * 2: 10us
  * 3: 1us
  */
 static int tsclk = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tsclk, CTLFLAG_RDTUN, &tsclk, 0,
     "Control TCP timestamp rewriting when using pacing");
 
 static int eo_max_backlog = 1024 * 1024;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, eo_max_backlog, CTLFLAG_RDTUN, &eo_max_backlog,
     0, "Maximum backlog of ratelimited data per flow");
 #endif
 
 /*
  * The interrupt holdoff timers are multiplied by this value on T6+.
  * 1 and 3-17 (both inclusive) are legal values.
  */
 static int tscale = 1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tscale, CTLFLAG_RDTUN, &tscale, 0,
     "Interrupt holdoff timer scale on T6+");
 
 /*
  * Number of LRO entries in the lro_ctrl structure per rx queue.
  */
 static int lro_entries = TCP_LRO_ENTRIES;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_entries, CTLFLAG_RDTUN, &lro_entries, 0,
     "Number of LRO entries per RX queue");
 
 /*
  * This enables presorting of frames before they're fed into tcp_lro_rx.
  */
 static int lro_mbufs = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0,
     "Enable presorting of LRO frames");
 
 struct txpkts {
 	u_int wr_type;		/* type 0 or type 1 */
 	u_int npkt;		/* # of packets in this work request */
 	u_int plen;		/* total payload (sum of all packets) */
 	u_int len16;		/* # of 16B pieces used by this work request */
 };
 
 /* A packet's SGL.  This + m_pkthdr has all info needed for tx */
 struct sgl {
 	struct sglist sg;
 	struct sglist_seg seg[TX_SGL_SEGS];
 };
 
 static int service_iq(struct sge_iq *, int);
 static int service_iq_fl(struct sge_iq *, int);
 static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
 static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int);
 static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *);
 static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t,
     uint16_t, char *);
 static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
     bus_addr_t *, void **);
 static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
     void *);
 static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *,
     int, int);
 static int free_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *);
 static void add_iq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
     struct sge_iq *);
 static void add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *,
     struct sysctl_oid *, struct sge_fl *);
 static int alloc_fwq(struct adapter *);
 static int free_fwq(struct adapter *);
 static int alloc_ctrlq(struct adapter *, struct sge_wrq *, int,
     struct sysctl_oid *);
 static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int,
     struct sysctl_oid *);
 static int free_rxq(struct vi_info *, struct sge_rxq *);
 #ifdef TCP_OFFLOAD
 static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int,
     struct sysctl_oid *);
 static int free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *);
 #endif
 #ifdef DEV_NETMAP
 static int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int,
     struct sysctl_oid *);
 static int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *);
 static int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int,
     struct sysctl_oid *);
 static int free_nm_txq(struct vi_info *, struct sge_nm_txq *);
 #endif
 static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
 static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
 #endif
 static int alloc_eq(struct adapter *, struct vi_info *, struct sge_eq *);
 static int free_eq(struct adapter *, struct sge_eq *);
 static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *,
     struct sysctl_oid *);
 static int free_wrq(struct adapter *, struct sge_wrq *);
 static int alloc_txq(struct vi_info *, struct sge_txq *, int,
     struct sysctl_oid *);
 static int free_txq(struct vi_info *, struct sge_txq *);
 static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
 static inline void ring_fl_db(struct adapter *, struct sge_fl *);
 static int refill_fl(struct adapter *, struct sge_fl *, int);
 static void refill_sfl(void *);
 static int alloc_fl_sdesc(struct sge_fl *);
 static void free_fl_sdesc(struct adapter *, struct sge_fl *);
 static void find_best_refill_source(struct adapter *, struct sge_fl *, int);
 static void find_safe_refill_source(struct adapter *, struct sge_fl *);
 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
 
 static inline void get_pkt_gl(struct mbuf *, struct sglist *);
 static inline u_int txpkt_len16(u_int, u_int);
 static inline u_int txpkt_vm_len16(u_int, u_int);
 static inline u_int txpkts0_len16(u_int);
 static inline u_int txpkts1_len16(void);
 static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int);
 static u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *,
     struct mbuf *, u_int);
 static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *,
     struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int);
 static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int);
 static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int);
 static u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *,
     struct mbuf *, const struct txpkts *, u_int);
 static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
 static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
 static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
 static inline uint16_t read_hw_cidx(struct sge_eq *);
 static inline u_int reclaimable_tx_desc(struct sge_eq *);
 static inline u_int total_available_tx_desc(struct sge_eq *);
 static u_int reclaim_tx_descs(struct sge_txq *, u_int);
 static void tx_reclaim(void *, int);
 static __be64 get_flit(struct sglist_seg *, int, int);
 static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *);
 static void wrq_tx_drain(void *, int);
 static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *);
 
 static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
 static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
 #ifdef RATELIMIT
 static inline u_int txpkt_eo_len16(u_int, u_int, u_int);
 static int ethofld_fw4_ack(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 #endif
 
 static counter_u64_t extfree_refs;
 static counter_u64_t extfree_rels;
 
 an_handler_t t4_an_handler;
 fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES];
 cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS];
 cpl_handler_t set_tcb_rpl_handlers[NUM_CPL_COOKIES];
 cpl_handler_t l2t_write_rpl_handlers[NUM_CPL_COOKIES];
 cpl_handler_t act_open_rpl_handlers[NUM_CPL_COOKIES];
 cpl_handler_t abort_rpl_rss_handlers[NUM_CPL_COOKIES];
 cpl_handler_t fw4_ack_handlers[NUM_CPL_COOKIES];
 
 void
 t4_register_an_handler(an_handler_t h)
 {
 	uintptr_t *loc;
 
 	MPASS(h == NULL || t4_an_handler == NULL);
 
 	loc = (uintptr_t *)&t4_an_handler;
 	atomic_store_rel_ptr(loc, (uintptr_t)h);
 }
 
 void
 t4_register_fw_msg_handler(int type, fw_msg_handler_t h)
 {
 	uintptr_t *loc;
 
 	MPASS(type < nitems(t4_fw_msg_handler));
 	MPASS(h == NULL || t4_fw_msg_handler[type] == NULL);
 	/*
 	 * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL
 	 * handler dispatch table.  Reject any attempt to install a handler for
 	 * this subtype.
 	 */
 	MPASS(type != FW_TYPE_RSSCPL);
 	MPASS(type != FW6_TYPE_RSSCPL);
 
 	loc = (uintptr_t *)&t4_fw_msg_handler[type];
 	atomic_store_rel_ptr(loc, (uintptr_t)h);
 }
 
 void
 t4_register_cpl_handler(int opcode, cpl_handler_t h)
 {
 	uintptr_t *loc;
 
 	MPASS(opcode < nitems(t4_cpl_handler));
 	MPASS(h == NULL || t4_cpl_handler[opcode] == NULL);
 
 	loc = (uintptr_t *)&t4_cpl_handler[opcode];
 	atomic_store_rel_ptr(loc, (uintptr_t)h);
 }
 
 static int
 set_tcb_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
 	u_int tid;
 	int cookie;
 
 	MPASS(m == NULL);
 
 	tid = GET_TID(cpl);
 	if (is_hpftid(iq->adapter, tid) || is_ftid(iq->adapter, tid)) {
 		/*
 		 * The return code for filter-write is put in the CPL cookie so
 		 * we have to rely on the hardware tid (is_ftid) to determine
 		 * that this is a response to a filter.
 		 */
 		cookie = CPL_COOKIE_FILTER;
 	} else {
 		cookie = G_COOKIE(cpl->cookie);
 	}
 	MPASS(cookie > CPL_COOKIE_RESERVED);
 	MPASS(cookie < nitems(set_tcb_rpl_handlers));
 
 	return (set_tcb_rpl_handlers[cookie](iq, rss, m));
 }
 
 static int
 l2t_write_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1);
 	unsigned int cookie;
 
 	MPASS(m == NULL);
 
 	cookie = GET_TID(rpl) & F_SYNC_WR ? CPL_COOKIE_TOM : CPL_COOKIE_FILTER;
 	return (l2t_write_rpl_handlers[cookie](iq, rss, m));
 }
 
 static int
 act_open_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1);
 	u_int cookie = G_TID_COOKIE(G_AOPEN_ATID(be32toh(cpl->atid_status)));
 
 	MPASS(m == NULL);
 	MPASS(cookie != CPL_COOKIE_RESERVED);
 
 	return (act_open_rpl_handlers[cookie](iq, rss, m));
 }
 
 static int
 abort_rpl_rss_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	u_int cookie;
 
 	MPASS(m == NULL);
 	if (is_hashfilter(sc))
 		cookie = CPL_COOKIE_HASHFILTER;
 	else
 		cookie = CPL_COOKIE_TOM;
 
 	return (abort_rpl_rss_handlers[cookie](iq, rss, m));
 }
 
 static int
 fw4_ack_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 	u_int cookie;
 
 	MPASS(m == NULL);
 	if (is_etid(sc, tid))
 		cookie = CPL_COOKIE_ETHOFLD;
 	else
 		cookie = CPL_COOKIE_TOM;
 
 	return (fw4_ack_handlers[cookie](iq, rss, m));
 }
 
 static void
 t4_init_shared_cpl_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_SET_TCB_RPL, set_tcb_rpl_handler);
 	t4_register_cpl_handler(CPL_L2T_WRITE_RPL, l2t_write_rpl_handler);
 	t4_register_cpl_handler(CPL_ACT_OPEN_RPL, act_open_rpl_handler);
 	t4_register_cpl_handler(CPL_ABORT_RPL_RSS, abort_rpl_rss_handler);
 	t4_register_cpl_handler(CPL_FW4_ACK, fw4_ack_handler);
 }
 
 void
 t4_register_shared_cpl_handler(int opcode, cpl_handler_t h, int cookie)
 {
 	uintptr_t *loc;
 
 	MPASS(opcode < nitems(t4_cpl_handler));
 	MPASS(cookie > CPL_COOKIE_RESERVED);
 	MPASS(cookie < NUM_CPL_COOKIES);
 	MPASS(t4_cpl_handler[opcode] != NULL);
 
 	switch (opcode) {
 	case CPL_SET_TCB_RPL:
 		loc = (uintptr_t *)&set_tcb_rpl_handlers[cookie];
 		break;
 	case CPL_L2T_WRITE_RPL:
 		loc = (uintptr_t *)&l2t_write_rpl_handlers[cookie];
 		break;
 	case CPL_ACT_OPEN_RPL:
 		loc = (uintptr_t *)&act_open_rpl_handlers[cookie];
 		break;
 	case CPL_ABORT_RPL_RSS:
 		loc = (uintptr_t *)&abort_rpl_rss_handlers[cookie];
 		break;
 	case CPL_FW4_ACK:
 		loc = (uintptr_t *)&fw4_ack_handlers[cookie];
 		break;
 	default:
 		MPASS(0);
 		return;
 	}
 	MPASS(h == NULL || *loc == (uintptr_t)NULL);
 	atomic_store_rel_ptr(loc, (uintptr_t)h);
 }
 
 /*
  * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
  */
 void
 t4_sge_modload(void)
 {
 
 	if (fl_pktshift < 0 || fl_pktshift > 7) {
 		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
 		    " using 0 instead.\n", fl_pktshift);
 		fl_pktshift = 0;
 	}
 
 	if (spg_len != 64 && spg_len != 128) {
 		int len;
 
 #if defined(__i386__) || defined(__amd64__)
 		len = cpu_clflush_line_size > 64 ? 128 : 64;
 #else
 		len = 64;
 #endif
 		if (spg_len != -1) {
 			printf("Invalid hw.cxgbe.spg_len value (%d),"
 			    " using %d instead.\n", spg_len, len);
 		}
 		spg_len = len;
 	}
 
 	if (cong_drop < -1 || cong_drop > 1) {
 		printf("Invalid hw.cxgbe.cong_drop value (%d),"
 		    " using 0 instead.\n", cong_drop);
 		cong_drop = 0;
 	}
 
 	if (tscale != 1 && (tscale < 3 || tscale > 17)) {
 		printf("Invalid hw.cxgbe.tscale value (%d),"
 		    " using 1 instead.\n", tscale);
 		tscale = 1;
 	}
 
 	extfree_refs = counter_u64_alloc(M_WAITOK);
 	extfree_rels = counter_u64_alloc(M_WAITOK);
 	counter_u64_zero(extfree_refs);
 	counter_u64_zero(extfree_rels);
 
 	t4_init_shared_cpl_handlers();
 	t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg);
 	t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg);
 	t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
 	t4_register_cpl_handler(CPL_RX_PKT, t4_eth_rx);
 #ifdef RATELIMIT
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, ethofld_fw4_ack,
 	    CPL_COOKIE_ETHOFLD);
 #endif
 	t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
 	t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl);
 }
 
 void
 t4_sge_modunload(void)
 {
 
 	counter_u64_free(extfree_refs);
 	counter_u64_free(extfree_rels);
 }
 
 uint64_t
 t4_sge_extfree_refs(void)
 {
 	uint64_t refs, rels;
 
 	rels = counter_u64_fetch(extfree_rels);
 	refs = counter_u64_fetch(extfree_refs);
 
 	return (refs - rels);
 }
 
 static inline void
 setup_pad_and_pack_boundaries(struct adapter *sc)
 {
 	uint32_t v, m;
 	int pad, pack, pad_shift;
 
 	pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT :
 	    X_INGPADBOUNDARY_SHIFT;
 	pad = fl_pad;
 	if (fl_pad < (1 << pad_shift) ||
 	    fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) ||
 	    !powerof2(fl_pad)) {
 		/*
 		 * If there is any chance that we might use buffer packing and
 		 * the chip is a T4, then pick 64 as the pad/pack boundary.  Set
 		 * it to the minimum allowed in all other cases.
 		 */
 		pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift;
 
 		/*
 		 * For fl_pad = 0 we'll still write a reasonable value to the
 		 * register but all the freelists will opt out of padding.
 		 * We'll complain here only if the user tried to set it to a
 		 * value greater than 0 that was invalid.
 		 */
 		if (fl_pad > 0) {
 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
 			    " (%d), using %d instead.\n", fl_pad, pad);
 		}
 	}
 	m = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
 	v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift);
 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
 
 	if (is_t4(sc)) {
 		if (fl_pack != -1 && fl_pack != pad) {
 			/* Complain but carry on. */
 			device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored,"
 			    " using %d instead.\n", fl_pack, pad);
 		}
 		return;
 	}
 
 	pack = fl_pack;
 	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
 	    !powerof2(fl_pack)) {
 		pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
 		MPASS(powerof2(pack));
 		if (pack < 16)
 			pack = 16;
 		if (pack == 32)
 			pack = 64;
 		if (pack > 4096)
 			pack = 4096;
 		if (fl_pack != -1) {
 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
 			    " (%d), using %d instead.\n", fl_pack, pack);
 		}
 	}
 	m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
 	if (pack == 16)
 		v = V_INGPACKBOUNDARY(0);
 	else
 		v = V_INGPACKBOUNDARY(ilog2(pack) - 5);
 
 	MPASS(!is_t4(sc));	/* T4 doesn't have SGE_CONTROL2 */
 	t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
 }
 
 /*
  * adap->params.vpd.cclk must be set up before this is called.
  */
 void
 t4_tweak_chip_settings(struct adapter *sc)
 {
 	int i;
 	uint32_t v, m;
 	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
 	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
 	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
 	static int sge_flbuf_sizes[] = {
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
 		MJUMPAGESIZE - CL_METADATA_SIZE,
 		MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE,
 #endif
 		MJUM9BYTES,
 		MJUM16BYTES,
 		MCLBYTES - MSIZE - CL_METADATA_SIZE,
 		MJUM9BYTES - CL_METADATA_SIZE,
 		MJUM16BYTES - CL_METADATA_SIZE,
 	};
 
 	KASSERT(sc->flags & MASTER_PF,
 	    ("%s: trying to change chip settings when not master.", __func__));
 
 	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
 	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
 	    V_EGRSTATUSPAGESIZE(spg_len == 128);
 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
 
 	setup_pad_and_pack_boundaries(sc);
 
 	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
 	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
 
 	KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES,
 	    ("%s: hw buffer size table too big", __func__));
 	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0, 4096);
 	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE1, 65536);
 	for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) {
 		t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE15 - (4 * i),
 		    sge_flbuf_sizes[i]);
 	}
 
 	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
 	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
 	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);
 
 	KASSERT(intr_timer[0] <= timer_max,
 	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
 	    timer_max));
 	for (i = 1; i < nitems(intr_timer); i++) {
 		KASSERT(intr_timer[i] >= intr_timer[i - 1],
 		    ("%s: timers not listed in increasing order (%d)",
 		    __func__, i));
 
 		while (intr_timer[i] > timer_max) {
 			if (i == nitems(intr_timer) - 1) {
 				intr_timer[i] = timer_max;
 				break;
 			}
 			intr_timer[i] += intr_timer[i - 1];
 			intr_timer[i] /= 2;
 		}
 	}
 
 	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
 	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
 	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
 	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
 	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
 	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);
 
 	if (chip_id(sc) >= CHELSIO_T6) {
 		m = V_TSCALE(M_TSCALE);
 		if (tscale == 1)
 			v = 0;
 		else
 			v = V_TSCALE(tscale - 2);
 		t4_set_reg_field(sc, A_SGE_ITP_CONTROL, m, v);
 
 		if (sc->debug_flags & DF_DISABLE_TCB_CACHE) {
 			m = V_RDTHRESHOLD(M_RDTHRESHOLD) | F_WRTHRTHRESHEN |
 			    V_WRTHRTHRESH(M_WRTHRTHRESH);
 			t4_tp_pio_read(sc, &v, 1, A_TP_CMM_CONFIG, 1);
 			v &= ~m;
 			v |= V_RDTHRESHOLD(1) | F_WRTHRTHRESHEN |
 			    V_WRTHRTHRESH(16);
 			t4_tp_pio_write(sc, &v, 1, A_TP_CMM_CONFIG, 1);
 		}
 	}
 
 	/* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */
 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
 	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);
 
 	/*
 	 * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP.  These have been
 	 * chosen with MAXPHYS = 128K in mind.  The largest DDP buffer that we
 	 * may have to deal with is MAXPHYS + 1 page.
 	 */
 	v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4);
 	t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v);
 
 	/* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */
 	m = v = F_TDDPTAGTCB | F_ISCSITAGTCB;
 	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);
 
 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
 	    F_RESETDDPOFFSET;
 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
 	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
 }
 
 /*
  * SGE wants the buffer to be at least 64B and then a multiple of 16.  If
  * padding is in use, the buffer's start and end need to be aligned to the pad
  * boundary as well.  We'll just make sure that the size is a multiple of the
  * boundary here, it is up to the buffer allocation code to make sure the start
  * of the buffer is aligned as well.
  */
 static inline int
 hwsz_ok(struct adapter *sc, int hwsz)
 {
 	int mask = fl_pad ? sc->params.sge.pad_boundary - 1 : 16 - 1;
 
 	return (hwsz >= 64 && (hwsz & mask) == 0);
 }
 
 /*
  * XXX: driver really should be able to deal with unexpected settings.
  */
 int
 t4_read_chip_settings(struct adapter *sc)
 {
 	struct sge *s = &sc->sge;
 	struct sge_params *sp = &sc->params.sge;
 	int i, j, n, rc = 0;
 	uint32_t m, v, r;
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
 	static int sw_buf_sizes[] = {	/* Sorted by size */
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
 #endif
 		MJUM9BYTES,
 		MJUM16BYTES
 	};
 	struct sw_zone_info *swz, *safe_swz;
 	struct hw_buf_info *hwb;
 
 	m = F_RXPKTCPLMODE;
 	v = F_RXPKTCPLMODE;
 	r = sc->params.sge.sge_control;
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	/*
 	 * If this changes then every single use of PAGE_SHIFT in the driver
 	 * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift.
 	 */
 	if (sp->page_shift != PAGE_SHIFT) {
 		device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	/* Filter out unusable hw buffer sizes entirely (mark with -2). */
 	hwb = &s->hw_buf_info[0];
 	for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) {
 		r = sc->params.sge.sge_fl_buffer_size[i];
 		hwb->size = r;
 		hwb->zidx = hwsz_ok(sc, r) ? -1 : -2;
 		hwb->next = -1;
 	}
 
 	/*
 	 * Create a sorted list in decreasing order of hw buffer sizes (and so
 	 * increasing order of spare area) for each software zone.
 	 *
 	 * If padding is enabled then the start and end of the buffer must align
 	 * to the pad boundary; if packing is enabled then they must align with
 	 * the pack boundary as well.  Allocations from the cluster zones are
 	 * aligned to min(size, 4K), so the buffer starts at that alignment and
 	 * ends at hwb->size alignment.  If mbuf inlining is allowed the
 	 * starting alignment will be reduced to MSIZE and the driver will
 	 * exercise appropriate caution when deciding on the best buffer layout
 	 * to use.
 	 */
 	n = 0;	/* no usable buffer size to begin with */
 	swz = &s->sw_zone_info[0];
 	safe_swz = NULL;
 	for (i = 0; i < SW_ZONE_SIZES; i++, swz++) {
 		int8_t head = -1, tail = -1;
 
 		swz->size = sw_buf_sizes[i];
 		swz->zone = m_getzone(swz->size);
 		swz->type = m_gettype(swz->size);
 
 		if (swz->size < PAGE_SIZE) {
 			MPASS(powerof2(swz->size));
 			if (fl_pad && (swz->size % sp->pad_boundary != 0))
 				continue;
 		}
 
 		if (swz->size == safest_rx_cluster)
 			safe_swz = swz;
 
 		hwb = &s->hw_buf_info[0];
 		for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) {
 			if (hwb->zidx != -1 || hwb->size > swz->size)
 				continue;
 #ifdef INVARIANTS
 			if (fl_pad)
 				MPASS(hwb->size % sp->pad_boundary == 0);
 #endif
 			hwb->zidx = i;
 			if (head == -1)
 				head = tail = j;
 			else if (hwb->size < s->hw_buf_info[tail].size) {
 				s->hw_buf_info[tail].next = j;
 				tail = j;
 			} else {
 				int8_t *cur;
 				struct hw_buf_info *t;
 
 				for (cur = &head; *cur != -1; cur = &t->next) {
 					t = &s->hw_buf_info[*cur];
 					if (hwb->size == t->size) {
 						hwb->zidx = -2;
 						break;
 					}
 					if (hwb->size > t->size) {
 						hwb->next = *cur;
 						*cur = j;
 						break;
 					}
 				}
 			}
 		}
 		swz->head_hwidx = head;
 		swz->tail_hwidx = tail;
 
 		if (tail != -1) {
 			n++;
 			if (swz->size - s->hw_buf_info[tail].size >=
 			    CL_METADATA_SIZE)
 				sc->flags |= BUF_PACKING_OK;
 		}
 	}
 	if (n == 0) {
 		device_printf(sc->dev, "no usable SGE FL buffer size.\n");
 		rc = EINVAL;
 	}
 
 	s->safe_hwidx1 = -1;
 	s->safe_hwidx2 = -1;
 	if (safe_swz != NULL) {
 		s->safe_hwidx1 = safe_swz->head_hwidx;
 		for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) {
 			int spare;
 
 			hwb = &s->hw_buf_info[i];
 #ifdef INVARIANTS
 			if (fl_pad)
 				MPASS(hwb->size % sp->pad_boundary == 0);
 #endif
 			spare = safe_swz->size - hwb->size;
 			if (spare >= CL_METADATA_SIZE) {
 				s->safe_hwidx2 = i;
 				break;
 			}
 		}
 	}
 
 	if (sc->flags & IS_VF)
 		return (0);
 
 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
 	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
 	if (r != v) {
 		device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	m = v = F_TDDPTAGTCB;
 	r = t4_read_reg(sc, A_ULP_RX_CTL);
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
 	    F_RESETDDPOFFSET;
 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
 	r = t4_read_reg(sc, A_TP_PARA_REG5);
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	t4_init_tp_params(sc, 1);
 
 	t4_read_mtu_tbl(sc, sc->params.mtus, NULL);
 	t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd);
 
 	return (rc);
 }
 
 int
 t4_create_dma_tag(struct adapter *sc)
 {
 	int rc;
 
 	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
 	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
 	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
 	    NULL, &sc->dmat);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create main DMA tag: %d\n", rc);
 	}
 
 	return (rc);
 }
 
 void
 t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
     struct sysctl_oid_list *children)
 {
 	struct sge_params *sp = &sc->params.sge;
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
 	    CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A",
 	    "freelist buffer sizes");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
 	    NULL, sp->fl_pktshift, "payload DMA offset in rx buffer (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD,
 	    NULL, sp->pad_boundary, "payload pad boundary (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD,
 	    NULL, sp->spg_len, "status page size (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
 	    NULL, cong_drop, "congestion drop setting");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
 	    NULL, sp->pack_boundary, "payload pack boundary (bytes)");
 }
 
 int
 t4_destroy_dma_tag(struct adapter *sc)
 {
 	if (sc->dmat)
 		bus_dma_tag_destroy(sc->dmat);
 
 	return (0);
 }
 
 /*
  * Allocate and initialize the firmware event queue, control queues, and special
  * purpose rx queues owned by the adapter.
  *
  * Returns errno on failure.  Resources allocated up to that point may still be
  * allocated.  Caller is responsible for cleanup in case this function fails.
  */
 int
 t4_setup_adapter_queues(struct adapter *sc)
 {
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *children;
 	int rc, i;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	sysctl_ctx_init(&sc->ctx);
 	sc->flags |= ADAP_SYSCTL_CTX;
 
 	/*
 	 * Firmware event queue
 	 */
 	rc = alloc_fwq(sc);
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * That's all for the VF driver.
 	 */
 	if (sc->flags & IS_VF)
 		return (rc);
 
 	oid = device_get_sysctl_tree(sc->dev);
 	children = SYSCTL_CHILDREN(oid);
 
 	/*
 	 * XXX: General purpose rx queues, one per port.
 	 */
 
 	/*
 	 * Control queues, one per port.
 	 */
 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "ctrlq",
 	    CTLFLAG_RD, NULL, "control queues");
 	for_each_port(sc, i) {
 		struct sge_wrq *ctrlq = &sc->sge.ctrlq[i];
 
 		rc = alloc_ctrlq(sc, ctrlq, i, oid);
 		if (rc != 0)
 			return (rc);
 	}
 
 	return (rc);
 }
 
 /*
  * Idempotent
  */
 int
 t4_teardown_adapter_queues(struct adapter *sc)
 {
 	int i;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	/* Do this before freeing the queue */
 	if (sc->flags & ADAP_SYSCTL_CTX) {
 		sysctl_ctx_free(&sc->ctx);
 		sc->flags &= ~ADAP_SYSCTL_CTX;
 	}
 
 	if (!(sc->flags & IS_VF)) {
 		for_each_port(sc, i)
 			free_wrq(sc, &sc->sge.ctrlq[i]);
 	}
 	free_fwq(sc);
 
 	return (0);
 }
 
 /* Maximum payload that can be delivered with a single iq descriptor */
 static inline int
 mtu_to_max_payload(struct adapter *sc, int mtu, const int toe)
 {
 	int payload;
 
 #ifdef TCP_OFFLOAD
 	if (toe) {
 		int rxcs = G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2));
 
 		/* Note that COP can set rx_coalesce on/off per connection. */
 		payload = max(mtu, rxcs);
 	} else {
 #endif
 		/* large enough even when hw VLAN extraction is disabled */
 		payload = sc->params.sge.fl_pktshift + ETHER_HDR_LEN +
 		    ETHER_VLAN_ENCAP_LEN + mtu;
 #ifdef TCP_OFFLOAD
 	}
 #endif
 
 	return (payload);
 }
 
 int
 t4_setup_vi_queues(struct vi_info *vi)
 {
 	int rc = 0, i, intr_idx, iqidx;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	struct sge_wrq *ofld_txq;
 #endif
 #ifdef DEV_NETMAP
 	int saved_idx;
 	struct sge_nm_rxq *nm_rxq;
 	struct sge_nm_txq *nm_txq;
 #endif
 	char name[16];
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	struct sysctl_oid *oid = device_get_sysctl_tree(vi->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 	int maxp, mtu = ifp->if_mtu;
 
 	/* Interrupt vector to start from (when using multiple vectors) */
 	intr_idx = vi->first_intr;
 
 #ifdef DEV_NETMAP
 	saved_idx = intr_idx;
 	if (ifp->if_capabilities & IFCAP_NETMAP) {
 
 		/* netmap is supported with direct interrupts only. */
 		MPASS(!forwarding_intr_to_fwq(sc));
 
 		/*
 		 * We don't have buffers to back the netmap rx queues
 		 * right now so we create the queues in a way that
 		 * doesn't set off any congestion signal in the chip.
 		 */
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_rxq",
 		    CTLFLAG_RD, NULL, "rx queues");
 		for_each_nm_rxq(vi, i, nm_rxq) {
 			rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i, oid);
 			if (rc != 0)
 				goto done;
 			intr_idx++;
 		}
 
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_txq",
 		    CTLFLAG_RD, NULL, "tx queues");
 		for_each_nm_txq(vi, i, nm_txq) {
 			iqidx = vi->first_nm_rxq + (i % vi->nnmrxq);
 			rc = alloc_nm_txq(vi, nm_txq, iqidx, i, oid);
 			if (rc != 0)
 				goto done;
 		}
 	}
 
 	/* Normal rx queues and netmap rx queues share the same interrupts. */
 	intr_idx = saved_idx;
 #endif
 
 	/*
 	 * Allocate rx queues first because a default iqid is required when
 	 * creating a tx queue.
 	 */
 	maxp = mtu_to_max_payload(sc, mtu, 0);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
 	    CTLFLAG_RD, NULL, "rx queues");
 	for_each_rxq(vi, i, rxq) {
 
 		init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq);
 
 		snprintf(name, sizeof(name), "%s rxq%d-fl",
 		    device_get_nameunit(vi->dev), i);
 		init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name);
 
 		rc = alloc_rxq(vi, rxq,
 		    forwarding_intr_to_fwq(sc) ? -1 : intr_idx, i, oid);
 		if (rc != 0)
 			goto done;
 		intr_idx++;
 	}
 #ifdef DEV_NETMAP
 	if (ifp->if_capabilities & IFCAP_NETMAP)
 		intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq);
 #endif
 #ifdef TCP_OFFLOAD
 	maxp = mtu_to_max_payload(sc, mtu, 1);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
 	    CTLFLAG_RD, NULL, "rx queues for offloaded TCP connections");
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 
 		init_iq(&ofld_rxq->iq, sc, vi->ofld_tmr_idx, vi->ofld_pktc_idx,
 		    vi->qsize_rxq);
 
 		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
 		    device_get_nameunit(vi->dev), i);
 		init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name);
 
 		rc = alloc_ofld_rxq(vi, ofld_rxq,
 		    forwarding_intr_to_fwq(sc) ? -1 : intr_idx, i, oid);
 		if (rc != 0)
 			goto done;
 		intr_idx++;
 	}
 #endif
 
 	/*
 	 * Now the tx queues.
 	 */
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD,
 	    NULL, "tx queues");
 	for_each_txq(vi, i, txq) {
 		iqidx = vi->first_rxq + (i % vi->nrxq);
 		snprintf(name, sizeof(name), "%s txq%d",
 		    device_get_nameunit(vi->dev), i);
 		init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan,
 		    sc->sge.rxq[iqidx].iq.cntxt_id, name);
 
 		rc = alloc_txq(vi, txq, i, oid);
 		if (rc != 0)
 			goto done;
 	}
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_txq",
 	    CTLFLAG_RD, NULL, "tx queues for TOE/ETHOFLD");
 	for_each_ofld_txq(vi, i, ofld_txq) {
 		struct sysctl_oid *oid2;
 
 		snprintf(name, sizeof(name), "%s ofld_txq%d",
 		    device_get_nameunit(vi->dev), i);
 		if (vi->nofldrxq > 0) {
 			iqidx = vi->first_ofld_rxq + (i % vi->nofldrxq);
 			init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq,
 			    pi->tx_chan, sc->sge.ofld_rxq[iqidx].iq.cntxt_id,
 			    name);
 		} else {
 			iqidx = vi->first_rxq + (i % vi->nrxq);
 			init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq,
 			    pi->tx_chan, sc->sge.rxq[iqidx].iq.cntxt_id, name);
 		}
 
 		snprintf(name, sizeof(name), "%d", i);
 		oid2 = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    name, CTLFLAG_RD, NULL, "offload tx queue");
 
 		rc = alloc_wrq(sc, vi, ofld_txq, oid2);
 		if (rc != 0)
 			goto done;
 	}
 #endif
 done:
 	if (rc)
 		t4_teardown_vi_queues(vi);
 
 	return (rc);
 }
 
 /*
  * Idempotent
  */
 int
 t4_teardown_vi_queues(struct vi_info *vi)
 {
 	int i;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct sge_wrq *ofld_txq;
 #endif
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #ifdef DEV_NETMAP
 	struct sge_nm_rxq *nm_rxq;
 	struct sge_nm_txq *nm_txq;
 #endif
 
 	/* Do this before freeing the queues */
 	if (vi->flags & VI_SYSCTL_CTX) {
 		sysctl_ctx_free(&vi->ctx);
 		vi->flags &= ~VI_SYSCTL_CTX;
 	}
 
 #ifdef DEV_NETMAP
 	if (vi->ifp->if_capabilities & IFCAP_NETMAP) {
 		for_each_nm_txq(vi, i, nm_txq) {
 			free_nm_txq(vi, nm_txq);
 		}
 
 		for_each_nm_rxq(vi, i, nm_rxq) {
 			free_nm_rxq(vi, nm_rxq);
 		}
 	}
 #endif
 
 	/*
 	 * Take down all the tx queues first, as they reference the rx queues
 	 * (for egress updates, etc.).
 	 */
 
 	for_each_txq(vi, i, txq) {
 		free_txq(vi, txq);
 	}
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	for_each_ofld_txq(vi, i, ofld_txq) {
 		free_wrq(sc, ofld_txq);
 	}
 #endif
 
 	/*
 	 * Then take down the rx queues.
 	 */
 
 	for_each_rxq(vi, i, rxq) {
 		free_rxq(vi, rxq);
 	}
 #ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		free_ofld_rxq(vi, ofld_rxq);
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Interrupt handler when the driver is using only 1 interrupt.  This is a very
  * unusual scenario.
  *
  * a) Deals with errors, if any.
  * b) Services firmware event queue, which is taking interrupts for all other
  *    queues.
  */
 void
 t4_intr_all(void *arg)
 {
 	struct adapter *sc = arg;
 	struct sge_iq *fwq = &sc->sge.fwq;
 
 	MPASS(sc->intr_count == 1);
 
 	if (sc->intr_type == INTR_INTX)
 		t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
 
 	t4_intr_err(arg);
 	t4_intr_evt(fwq);
 }
 
 /*
  * Interrupt handler for errors (installed directly when multiple interrupts are
  * being used, or called by t4_intr_all).
  */
 void
 t4_intr_err(void *arg)
 {
 	struct adapter *sc = arg;
 	uint32_t v;
 	const bool verbose = (sc->debug_flags & DF_VERBOSE_SLOWINTR) != 0;
 
 	if (sc->flags & ADAP_ERR)
 		return;
 
 	v = t4_read_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE));
 	if (v & F_PFSW) {
 		sc->swintr++;
 		t4_write_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE), v);
 	}
 
 	t4_slow_intr_handler(sc, verbose);
 }
 
 /*
  * Interrupt handler for iq-only queues.  The firmware event queue is the only
  * such queue right now.
  */
 void
 t4_intr_evt(void *arg)
 {
 	struct sge_iq *iq = arg;
 
 	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
 		service_iq(iq, 0);
 		(void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
 	}
 }
 
 /*
  * Interrupt handler for iq+fl queues.
  */
 void
 t4_intr(void *arg)
 {
 	struct sge_iq *iq = arg;
 
 	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
 		service_iq_fl(iq, 0);
 		(void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
 	}
 }
 
 #ifdef DEV_NETMAP
 /*
  * Interrupt handler for netmap rx queues.
  */
 void
 t4_nm_intr(void *arg)
 {
 	struct sge_nm_rxq *nm_rxq = arg;
 
 	if (atomic_cmpset_int(&nm_rxq->nm_state, NM_ON, NM_BUSY)) {
 		service_nm_rxq(nm_rxq);
 		(void) atomic_cmpset_int(&nm_rxq->nm_state, NM_BUSY, NM_ON);
 	}
 }
 
 /*
  * Interrupt handler for vectors shared between NIC and netmap rx queues.
  */
 void
 t4_vi_intr(void *arg)
 {
 	struct irq *irq = arg;
 
 	MPASS(irq->nm_rxq != NULL);
 	t4_nm_intr(irq->nm_rxq);
 
 	MPASS(irq->rxq != NULL);
 	t4_intr(irq->rxq);
 }
 #endif
 
 /*
  * Deals with interrupts on an iq-only (no freelist) queue.
  */
 static int
 service_iq(struct sge_iq *iq, int budget)
 {
 	struct sge_iq *q;
 	struct adapter *sc = iq->adapter;
 	struct iq_desc *d = &iq->desc[iq->cidx];
 	int ndescs = 0, limit;
 	int rsp_type;
 	uint32_t lq;
 	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
 
 	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
 	KASSERT((iq->flags & IQ_HAS_FL) == 0,
 	    ("%s: called for iq %p with fl (iq->flags 0x%x)", __func__, iq,
 	    iq->flags));
 	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
 	MPASS((iq->flags & IQ_LRO_ENABLED) == 0);
 
 	limit = budget ? budget : iq->qsize / 16;
 
 	/*
 	 * We always come back and check the descriptor ring for new indirect
 	 * interrupts and other responses after running a single handler.
 	 */
 	for (;;) {
 		while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
 
 			rmb();
 
 			rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
 			lq = be32toh(d->rsp.pldbuflen_qid);
 
 			switch (rsp_type) {
 			case X_RSPD_TYPE_FLBUF:
 				panic("%s: data for an iq (%p) with no freelist",
 				    __func__, iq);
 
 				/* NOTREACHED */
 
 			case X_RSPD_TYPE_CPL:
 				KASSERT(d->rss.opcode < NUM_CPL_CMDS,
 				    ("%s: bad opcode %02x.", __func__,
 				    d->rss.opcode));
 				t4_cpl_handler[d->rss.opcode](iq, &d->rss, NULL);
 				break;
 
 			case X_RSPD_TYPE_INTR:
 				/*
 				 * There are 1K interrupt-capable queues (qids 0
 				 * through 1023).  A response type indicating a
 				 * forwarded interrupt with a qid >= 1K is an
 				 * iWARP async notification.
 				 */
 				if (__predict_true(lq >= 1024)) {
 					t4_an_handler(iq, &d->rsp);
 					break;
 				}
 
 				q = sc->sge.iqmap[lq - sc->sge.iq_start -
 				    sc->sge.iq_base];
 				if (atomic_cmpset_int(&q->state, IQS_IDLE,
 				    IQS_BUSY)) {
 					if (service_iq_fl(q, q->qsize / 16) == 0) {
 						(void) atomic_cmpset_int(&q->state,
 						    IQS_BUSY, IQS_IDLE);
 					} else {
 						STAILQ_INSERT_TAIL(&iql, q,
 						    link);
 					}
 				}
 				break;
 
 			default:
 				KASSERT(0,
 				    ("%s: illegal response type %d on iq %p",
 				    __func__, rsp_type, iq));
 				log(LOG_ERR,
 				    "%s: illegal response type %d on iq %p",
 				    device_get_nameunit(sc->dev), rsp_type, iq);
 				break;
 			}
 
 			d++;
 			if (__predict_false(++iq->cidx == iq->sidx)) {
 				iq->cidx = 0;
 				iq->gen ^= F_RSPD_GEN;
 				d = &iq->desc[0];
 			}
 			if (__predict_false(++ndescs == limit)) {
 				t4_write_reg(sc, sc->sge_gts_reg,
 				    V_CIDXINC(ndescs) |
 				    V_INGRESSQID(iq->cntxt_id) |
 				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
 				ndescs = 0;
 
 				if (budget) {
 					return (EINPROGRESS);
 				}
 			}
 		}
 
 		if (STAILQ_EMPTY(&iql))
 			break;
 
 		/*
 		 * Process the head only, and send it to the back of the list if
 		 * it's still not done.
 		 */
 		q = STAILQ_FIRST(&iql);
 		STAILQ_REMOVE_HEAD(&iql, link);
 		if (service_iq_fl(q, q->qsize / 8) == 0)
 			(void) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
 		else
 			STAILQ_INSERT_TAIL(&iql, q, link);
 	}
 
 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
 
 	return (0);
 }
 
 static inline int
 sort_before_lro(struct lro_ctrl *lro)
 {
 
 	return (lro->lro_mbuf_max != 0);
 }
 
 static inline uint64_t
 last_flit_to_ns(struct adapter *sc, uint64_t lf)
 {
 	uint64_t n = be64toh(lf) & 0xfffffffffffffff;	/* 60b, not 64b. */
 
 	if (n > UINT64_MAX / 1000000)
 		return (n / sc->params.vpd.cclk * 1000000);
 	else
 		return (n * 1000000 / sc->params.vpd.cclk);
 }
 
 /*
  * Deals with interrupts on an iq+fl queue.
  */
 static int
 service_iq_fl(struct sge_iq *iq, int budget)
 {
 	struct sge_rxq *rxq = iq_to_rxq(iq);
 	struct sge_fl *fl;
 	struct adapter *sc = iq->adapter;
 	struct iq_desc *d = &iq->desc[iq->cidx];
 	int ndescs = 0, limit;
 	int rsp_type, refill, starved;
 	uint32_t lq;
 	uint16_t fl_hw_cidx;
 	struct mbuf *m0;
 #if defined(INET) || defined(INET6)
 	const struct timeval lro_timeout = {0, sc->lro_timeout};
 	struct lro_ctrl *lro = &rxq->lro;
 #endif
 
 	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
 	MPASS(iq->flags & IQ_HAS_FL);
 
 	limit = budget ? budget : iq->qsize / 16;
 	fl = &rxq->fl;
 	fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
 
 #if defined(INET) || defined(INET6)
 	if (iq->flags & IQ_ADJ_CREDIT) {
 		MPASS(sort_before_lro(lro));
 		iq->flags &= ~IQ_ADJ_CREDIT;
 		if ((d->rsp.u.type_gen & F_RSPD_GEN) != iq->gen) {
 			tcp_lro_flush_all(lro);
 			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(1) |
 			    V_INGRESSQID((u32)iq->cntxt_id) |
 			    V_SEINTARM(iq->intr_params));
 			return (0);
 		}
 		ndescs = 1;
 	}
 #else
 	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
 #endif
 
 	while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
 
 		rmb();
 
 		refill = 0;
 		m0 = NULL;
 		rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
 		lq = be32toh(d->rsp.pldbuflen_qid);
 
 		switch (rsp_type) {
 		case X_RSPD_TYPE_FLBUF:
 
 			m0 = get_fl_payload(sc, fl, lq);
 			if (__predict_false(m0 == NULL))
 				goto out;
 			refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2;
 
 			if (iq->flags & IQ_RX_TIMESTAMP) {
 				/*
 				 * Fill up rcv_tstmp but do not set M_TSTMP.
 				 * rcv_tstmp is not in the format that the
 				 * kernel expects and we don't want to mislead
 				 * it.  For now this is only for custom code
 				 * that knows how to interpret cxgbe's stamp.
 				 */
 				m0->m_pkthdr.rcv_tstmp =
 				    last_flit_to_ns(sc, d->rsp.u.last_flit);
 #ifdef notyet
 				m0->m_flags |= M_TSTMP;
 #endif
 			}
 
 			/* fall through */
 
 		case X_RSPD_TYPE_CPL:
 			KASSERT(d->rss.opcode < NUM_CPL_CMDS,
 			    ("%s: bad opcode %02x.", __func__, d->rss.opcode));
 			t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0);
 			break;
 
 		case X_RSPD_TYPE_INTR:
 
 			/*
 			 * There are 1K interrupt-capable queues (qids 0
 			 * through 1023).  A response type indicating a
 			 * forwarded interrupt with a qid >= 1K is an
 			 * iWARP async notification.  That is the only
 			 * acceptable indirect interrupt on this queue.
 			 */
 			if (__predict_false(lq < 1024)) {
 				panic("%s: indirect interrupt on iq_fl %p "
 				    "with qid %u", __func__, iq, lq);
 			}
 
 			t4_an_handler(iq, &d->rsp);
 			break;
 
 		default:
 			KASSERT(0, ("%s: illegal response type %d on iq %p",
 			    __func__, rsp_type, iq));
 			log(LOG_ERR, "%s: illegal response type %d on iq %p",
 			    device_get_nameunit(sc->dev), rsp_type, iq);
 			break;
 		}
 
 		d++;
 		if (__predict_false(++iq->cidx == iq->sidx)) {
 			iq->cidx = 0;
 			iq->gen ^= F_RSPD_GEN;
 			d = &iq->desc[0];
 		}
 		if (__predict_false(++ndescs == limit)) {
 			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 			    V_INGRESSQID(iq->cntxt_id) |
 			    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
 			ndescs = 0;
 
 #if defined(INET) || defined(INET6)
 			if (iq->flags & IQ_LRO_ENABLED &&
 			    !sort_before_lro(lro) &&
 			    sc->lro_timeout != 0) {
 				tcp_lro_flush_inactive(lro, &lro_timeout);
 			}
 #endif
 			if (budget) {
 				FL_LOCK(fl);
 				refill_fl(sc, fl, 32);
 				FL_UNLOCK(fl);
 
 				return (EINPROGRESS);
 			}
 		}
 		if (refill) {
 			FL_LOCK(fl);
 			refill_fl(sc, fl, 32);
 			FL_UNLOCK(fl);
 			fl_hw_cidx = fl->hw_cidx;
 		}
 	}
 out:
 #if defined(INET) || defined(INET6)
 	if (iq->flags & IQ_LRO_ENABLED) {
 		if (ndescs > 0 && lro->lro_mbuf_count > 8) {
 			MPASS(sort_before_lro(lro));
 			/* hold back one credit and don't flush LRO state */
 			iq->flags |= IQ_ADJ_CREDIT;
 			ndescs--;
 		} else {
 			tcp_lro_flush_all(lro);
 		}
 	}
 #endif
 
 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
 
 	FL_LOCK(fl);
 	starved = refill_fl(sc, fl, 64);
 	FL_UNLOCK(fl);
 	if (__predict_false(starved != 0))
 		add_fl_to_sfl(sc, fl);
 
 	return (0);
 }
 
 static inline int
 cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll)
 {
 	int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0;
 
 	if (rc)
 		MPASS(cll->region3 >= CL_METADATA_SIZE);
 
 	return (rc);
 }
 
 static inline struct cluster_metadata *
 cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll,
     caddr_t cl)
 {
 
 	if (cl_has_metadata(fl, cll)) {
 		struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
 
 		return ((struct cluster_metadata *)(cl + swz->size) - 1);
 	}
 	return (NULL);
 }
 
 static void
 rxb_free(struct mbuf *m)
 {
 	uma_zone_t zone = m->m_ext.ext_arg1;
 	void *cl = m->m_ext.ext_arg2;
 
 	uma_zfree(zone, cl);
 	counter_u64_add(extfree_rels, 1);
 }
 
 /*
  * The mbuf returned by this function could be allocated from zone_mbuf or
  * constructed in spare room in the cluster.
  *
  * The mbuf carries the payload in one of these ways
  * a) frame inside the mbuf (mbuf from zone_mbuf)
  * b) m_cljset (for clusters without metadata) zone_mbuf
  * c) m_extaddref (cluster with metadata) inline mbuf
  * d) m_extaddref (cluster with metadata) zone_mbuf
  */
 static struct mbuf *
 get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
     int remaining)
 {
 	struct mbuf *m;
 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 	struct cluster_layout *cll = &sd->cll;
 	struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
 	struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx];
 	struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl);
 	int len, blen;
 	caddr_t payload;
 
 	blen = hwb->size - fl->rx_offset;	/* max possible in this buf */
 	len = min(remaining, blen);
 	payload = sd->cl + cll->region1 + fl->rx_offset;
 	if (fl->flags & FL_BUF_PACKING) {
 		const u_int l = fr_offset + len;
 		const u_int pad = roundup2(l, fl->buf_boundary) - l;
 
 		if (fl->rx_offset + len + pad < hwb->size)
 			blen = len + pad;
 		MPASS(fl->rx_offset + blen <= hwb->size);
 	} else {
 		MPASS(fl->rx_offset == 0);	/* not packing */
 	}
 
 
 	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
 
 		/*
 		 * Copy payload into a freshly allocated mbuf.
 		 */
 
 		m = fr_offset == 0 ?
 		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return (NULL);
 		fl->mbuf_allocated++;
 
 		/* copy data to mbuf */
 		bcopy(payload, mtod(m, caddr_t), len);
 
 	} else if (sd->nmbuf * MSIZE < cll->region1) {
 
 		/*
 		 * There's spare room in the cluster for an mbuf.  Create one
 		 * and associate it with the payload that's in the cluster.
 		 */
 
 		MPASS(clm != NULL);
 		m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE);
 		/* No bzero required */
 		if (m_init(m, M_NOWAIT, MT_DATA,
 		    fr_offset == 0 ? M_PKTHDR | M_NOFREE : M_NOFREE))
 			return (NULL);
 		fl->mbuf_inlined++;
 		m_extaddref(m, payload, blen, &clm->refcount, rxb_free,
 		    swz->zone, sd->cl);
 		if (sd->nmbuf++ == 0)
 			counter_u64_add(extfree_refs, 1);
 
 	} else {
 
 		/*
 		 * Grab an mbuf from zone_mbuf and associate it with the
 		 * payload in the cluster.
 		 */
 
 		m = fr_offset == 0 ?
 		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return (NULL);
 		fl->mbuf_allocated++;
 		if (clm != NULL) {
 			m_extaddref(m, payload, blen, &clm->refcount,
 			    rxb_free, swz->zone, sd->cl);
 			if (sd->nmbuf++ == 0)
 				counter_u64_add(extfree_refs, 1);
 		} else {
 			m_cljset(m, sd->cl, swz->type);
 			sd->cl = NULL;	/* consumed, not a recycle candidate */
 		}
 	}
 	if (fr_offset == 0)
 		m->m_pkthdr.len = remaining;
 	m->m_len = len;
 
 	if (fl->flags & FL_BUF_PACKING) {
 		fl->rx_offset += blen;
 		MPASS(fl->rx_offset <= hwb->size);
 		if (fl->rx_offset < hwb->size)
 			return (m);	/* without advancing the cidx */
 	}
 
 	if (__predict_false(++fl->cidx % 8 == 0)) {
 		uint16_t cidx = fl->cidx / 8;
 
 		if (__predict_false(cidx == fl->sidx))
 			fl->cidx = cidx = 0;
 		fl->hw_cidx = cidx;
 	}
 	fl->rx_offset = 0;
 
 	return (m);
 }
 
 static struct mbuf *
 get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf)
 {
 	struct mbuf *m0, *m, **pnext;
 	u_int remaining;
 	const u_int total = G_RSPD_LEN(len_newbuf);
 
 	if (__predict_false(fl->flags & FL_BUF_RESUME)) {
 		M_ASSERTPKTHDR(fl->m0);
 		MPASS(fl->m0->m_pkthdr.len == total);
 		MPASS(fl->remaining < total);
 
 		m0 = fl->m0;
 		pnext = fl->pnext;
 		remaining = fl->remaining;
 		fl->flags &= ~FL_BUF_RESUME;
 		goto get_segment;
 	}
 
 	if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) {
 		fl->rx_offset = 0;
 		if (__predict_false(++fl->cidx % 8 == 0)) {
 			uint16_t cidx = fl->cidx / 8;
 
 			if (__predict_false(cidx == fl->sidx))
 				fl->cidx = cidx = 0;
 			fl->hw_cidx = cidx;
 		}
 	}
 
 	/*
 	 * Payload starts at rx_offset in the current hw buffer.  Its length is
 	 * 'len' and it may span multiple hw buffers.
 	 */
 
 	m0 = get_scatter_segment(sc, fl, 0, total);
 	if (m0 == NULL)
 		return (NULL);
 	remaining = total - m0->m_len;
 	pnext = &m0->m_next;
 	while (remaining > 0) {
 get_segment:
 		MPASS(fl->rx_offset == 0);
 		m = get_scatter_segment(sc, fl, total - remaining, remaining);
 		if (__predict_false(m == NULL)) {
 			fl->m0 = m0;
 			fl->pnext = pnext;
 			fl->remaining = remaining;
 			fl->flags |= FL_BUF_RESUME;
 			return (NULL);
 		}
 		*pnext = m;
 		pnext = &m->m_next;
 		remaining -= m->m_len;
 	}
 	*pnext = NULL;
 
 	M_ASSERTPKTHDR(m0);
 	return (m0);
 }
 
 static int
 t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
 {
 	struct sge_rxq *rxq = iq_to_rxq(iq);
 	struct ifnet *ifp = rxq->ifp;
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_pkt *cpl = (const void *)(rss + 1);
 #if defined(INET) || defined(INET6)
 	struct lro_ctrl *lro = &rxq->lro;
 #endif
 	static const int sw_hashtype[4][2] = {
 		{M_HASHTYPE_NONE, M_HASHTYPE_NONE},
 		{M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
 		{M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
 		{M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
 	};
 
 	KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	m0->m_pkthdr.len -= sc->params.sge.fl_pktshift;
 	m0->m_len -= sc->params.sge.fl_pktshift;
 	m0->m_data += sc->params.sge.fl_pktshift;
 
 	m0->m_pkthdr.rcvif = ifp;
 	M_HASHTYPE_SET(m0, sw_hashtype[rss->hash_type][rss->ipv6]);
 	m0->m_pkthdr.flowid = be32toh(rss->hash_val);
 
 	if (cpl->csum_calc && !(cpl->err_vec & sc->params.tp.err_vec_mask)) {
 		if (ifp->if_capenable & IFCAP_RXCSUM &&
 		    cpl->l2info & htobe32(F_RXF_IP)) {
 			m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
 			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 			rxq->rxcsum++;
 		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
 		    cpl->l2info & htobe32(F_RXF_IP6)) {
 			m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
 			    CSUM_PSEUDO_HDR);
 			rxq->rxcsum++;
 		}
 
 		if (__predict_false(cpl->ip_frag))
 			m0->m_pkthdr.csum_data = be16toh(cpl->csum);
 		else
 			m0->m_pkthdr.csum_data = 0xffff;
 	}
 
 	if (cpl->vlan_ex) {
 		m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
 		m0->m_flags |= M_VLANTAG;
 		rxq->vlan_extraction++;
 	}
 
 #ifdef NUMA
 	m0->m_pkthdr.numa_domain = ifp->if_numa_domain;
 #endif
 #if defined(INET) || defined(INET6)
 	if (iq->flags & IQ_LRO_ENABLED) {
 		if (sort_before_lro(lro)) {
 			tcp_lro_queue_mbuf(lro, m0);
 			return (0); /* queued for sort, then LRO */
 		}
 		if (tcp_lro_rx(lro, m0, 0) == 0)
 			return (0); /* queued for LRO */
 	}
 #endif
 	ifp->if_input(ifp, m0);
 
 	return (0);
 }
 
 /*
  * Must drain the wrq or make sure that someone else will.
  */
 static void
 wrq_tx_drain(void *arg, int n)
 {
 	struct sge_wrq *wrq = arg;
 	struct sge_eq *eq = &wrq->eq;
 
 	EQ_LOCK(eq);
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(wrq->adapter, wrq);
 	EQ_UNLOCK(eq);
 }
 
 static void
 drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
 {
 	struct sge_eq *eq = &wrq->eq;
 	u_int available, dbdiff;	/* # of hardware descriptors */
 	u_int n;
 	struct wrqe *wr;
 	struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
 	wr = STAILQ_FIRST(&wrq->wr_list);
 	MPASS(wr != NULL);	/* Must be called with something useful to do */
 	MPASS(eq->pidx == eq->dbidx);
 	dbdiff = 0;
 
 	do {
 		eq->cidx = read_hw_cidx(eq);
 		if (eq->pidx == eq->cidx)
 			available = eq->sidx - 1;
 		else
 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 
 		MPASS(wr->wrq == wrq);
 		n = howmany(wr->wr_len, EQ_ESIZE);
 		if (available < n)
 			break;
 
 		dst = (void *)&eq->desc[eq->pidx];
 		if (__predict_true(eq->sidx - eq->pidx > n)) {
 			/* Won't wrap, won't end exactly at the status page. */
 			bcopy(&wr->wr[0], dst, wr->wr_len);
 			eq->pidx += n;
 		} else {
 			int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;
 
 			bcopy(&wr->wr[0], dst, first_portion);
 			if (wr->wr_len > first_portion) {
 				bcopy(&wr->wr[first_portion], &eq->desc[0],
 				    wr->wr_len - first_portion);
 			}
 			eq->pidx = n - (eq->sidx - eq->pidx);
 		}
 		wrq->tx_wrs_copied++;
 
 		if (available < eq->sidx / 4 &&
 		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 				/*
 				 * XXX: This is not 100% reliable with some
 				 * types of WRs.  But this is a very unusual
 				 * situation for an ofld/ctrl queue anyway.
 				 */
 			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 			    F_FW_WR_EQUEQ);
 		}
 
 		dbdiff += n;
 		if (dbdiff >= 16) {
 			ring_eq_db(sc, eq, dbdiff);
 			dbdiff = 0;
 		}
 
 		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
 		free_wrqe(wr);
 		MPASS(wrq->nwr_pending > 0);
 		wrq->nwr_pending--;
 		MPASS(wrq->ndesc_needed >= n);
 		wrq->ndesc_needed -= n;
 	} while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);
 
 	if (dbdiff)
 		ring_eq_db(sc, eq, dbdiff);
 }
 
 /*
  * Doesn't fail.  Holds on to work requests it can't send right away.
  */
 void
 t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
 {
 #ifdef INVARIANTS
 	struct sge_eq *eq = &wrq->eq;
 #endif
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 	MPASS(wr != NULL);
 	MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
 	MPASS((wr->wr_len & 0x7) == 0);
 
 	STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
 	wrq->nwr_pending++;
 	wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);
 
 	if (!TAILQ_EMPTY(&wrq->incomplete_wrs))
 		return;	/* commit_wrq_wr will drain wr_list as well. */
 
 	drain_wrq_wr_list(sc, wrq);
 
 	/* Doorbell must have caught up to the pidx. */
 	MPASS(eq->pidx == eq->dbidx);
 }
 
 void
 t4_update_fl_bufsize(struct ifnet *ifp)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sge_rxq *rxq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 	struct sge_fl *fl;
 	int i, maxp, mtu = ifp->if_mtu;
 
 	maxp = mtu_to_max_payload(sc, mtu, 0);
 	for_each_rxq(vi, i, rxq) {
 		fl = &rxq->fl;
 
 		FL_LOCK(fl);
 		find_best_refill_source(sc, fl, maxp);
 		FL_UNLOCK(fl);
 	}
 #ifdef TCP_OFFLOAD
 	maxp = mtu_to_max_payload(sc, mtu, 1);
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		fl = &ofld_rxq->fl;
 
 		FL_LOCK(fl);
 		find_best_refill_source(sc, fl, maxp);
 		FL_UNLOCK(fl);
 	}
 #endif
 }
 
 static inline int
 mbuf_nsegs(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	KASSERT(m->m_pkthdr.l5hlen > 0,
 	    ("%s: mbuf %p missing information on # of segments.", __func__, m));
 
 	return (m->m_pkthdr.l5hlen);
 }
 
 static inline void
 set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.l5hlen = nsegs;
 }
 
 static inline int
 mbuf_cflags(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.PH_loc.eight[4]);
 }
 
 static inline void
 set_mbuf_cflags(struct mbuf *m, uint8_t flags)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[4] = flags;
 }
 
 static inline int
 mbuf_len16(struct mbuf *m)
 {
 	int n;
 
 	M_ASSERTPKTHDR(m);
 	n = m->m_pkthdr.PH_loc.eight[0];
 	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
 
 	return (n);
 }
 
 static inline void
 set_mbuf_len16(struct mbuf *m, uint8_t len16)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[0] = len16;
 }
 
 #ifdef RATELIMIT
 static inline int
 mbuf_eo_nsegs(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.PH_loc.eight[1]);
 }
 
 static inline void
 set_mbuf_eo_nsegs(struct mbuf *m, uint8_t nsegs)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[1] = nsegs;
 }
 
 static inline int
 mbuf_eo_len16(struct mbuf *m)
 {
 	int n;
 
 	M_ASSERTPKTHDR(m);
 	n = m->m_pkthdr.PH_loc.eight[2];
 	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
 
 	return (n);
 }
 
 static inline void
 set_mbuf_eo_len16(struct mbuf *m, uint8_t len16)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[2] = len16;
 }
 
 static inline int
 mbuf_eo_tsclk_tsoff(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.PH_loc.eight[3]);
 }
 
 static inline void
 set_mbuf_eo_tsclk_tsoff(struct mbuf *m, uint8_t tsclk_tsoff)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[3] = tsclk_tsoff;
 }
 
 static inline int
 needs_eo(struct mbuf *m)
 {
 
-	return (m->m_pkthdr.snd_tag != NULL);
+	return (m->m_pkthdr.csum_flags & CSUM_SND_TAG);
 }
 #endif
 
 /*
  * Try to allocate an mbuf to contain a raw work request.  To make it
  * easy to construct the work request, don't allocate a chain but a
  * single mbuf.
  */
 struct mbuf *
 alloc_wr_mbuf(int len, int how)
 {
 	struct mbuf *m;
 
 	if (len <= MHLEN)
 		m = m_gethdr(how, MT_DATA);
 	else if (len <= MCLBYTES)
 		m = m_getcl(how, MT_DATA, M_PKTHDR);
 	else
 		m = NULL;
 	if (m == NULL)
 		return (NULL);
 	m->m_pkthdr.len = len;
 	m->m_len = len;
 	set_mbuf_cflags(m, MC_RAW_WR);
 	set_mbuf_len16(m, howmany(len, 16));
 	return (m);
 }
 
 static inline int
 needs_tso(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return (m->m_pkthdr.csum_flags & CSUM_TSO);
 }
 
 static inline int
 needs_l3_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO));
 }
 
 static inline int
 needs_l4_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
 	    CSUM_TCP_IPV6 | CSUM_TSO));
 }
 
 static inline int
 needs_tcp_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_TCP_IPV6 | CSUM_TSO));
 }
 
 #ifdef RATELIMIT
 static inline int
 needs_udp_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_UDP_IPV6));
 }
 #endif
 
 static inline int
 needs_vlan_insertion(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return (m->m_flags & M_VLANTAG);
 }
 
 static void *
 m_advance(struct mbuf **pm, int *poffset, int len)
 {
 	struct mbuf *m = *pm;
 	int offset = *poffset;
 	uintptr_t p = 0;
 
 	MPASS(len > 0);
 
 	for (;;) {
 		if (offset + len < m->m_len) {
 			offset += len;
 			p = mtod(m, uintptr_t) + offset;
 			break;
 		}
 		len -= m->m_len - offset;
 		m = m->m_next;
 		offset = 0;
 		MPASS(m != NULL);
 	}
 	*poffset = offset;
 	*pm = m;
 	return ((void *)p);
 }
 
 /*
  * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
  * must have at least one mbuf that's not empty.  It is possible for this
  * routine to return 0 if skip accounts for all the contents of the mbuf chain.
  */
 static inline int
 count_mbuf_nsegs(struct mbuf *m, int skip)
 {
 	vm_paddr_t lastb, next;
 	vm_offset_t va;
 	int len, nsegs;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.len > 0);
 	MPASS(m->m_pkthdr.len >= skip);
 
 	nsegs = 0;
 	lastb = 0;
 	for (; m; m = m->m_next) {
 
 		len = m->m_len;
 		if (__predict_false(len == 0))
 			continue;
 		if (skip >= len) {
 			skip -= len;
 			continue;
 		}
 		va = mtod(m, vm_offset_t) + skip;
 		len -= skip;
 		skip = 0;
 		next = pmap_kextract(va);
 		nsegs += sglist_count((void *)(uintptr_t)va, len);
 		if (lastb + 1 == next)
 			nsegs--;
 		lastb = pmap_kextract(va + len - 1);
 	}
 
 	return (nsegs);
 }
 
 /*
  * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
  * a) caller can assume it's been freed if this function returns with an error.
  * b) it may get defragged up if the gather list is too long for the hardware.
  */
 int
 parse_pkt(struct adapter *sc, struct mbuf **mp)
 {
 	struct mbuf *m0 = *mp, *m;
 	int rc, nsegs, defragged = 0, offset;
 	struct ether_header *eh;
 	void *l3hdr;
 #if defined(INET) || defined(INET6)
 	struct tcphdr *tcp;
 #endif
 	uint16_t eh_type;
 
 	M_ASSERTPKTHDR(m0);
 	if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
 		rc = EINVAL;
 fail:
 		m_freem(m0);
 		*mp = NULL;
 		return (rc);
 	}
 restart:
 	/*
 	 * First count the number of gather list segments in the payload.
 	 * Defrag the mbuf if nsegs exceeds the hardware limit.
 	 */
 	M_ASSERTPKTHDR(m0);
 	MPASS(m0->m_pkthdr.len > 0);
 	nsegs = count_mbuf_nsegs(m0, 0);
 	if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
 		if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
 			rc = EFBIG;
 			goto fail;
 		}
 		*mp = m0 = m;	/* update caller's copy after defrag */
 		goto restart;
 	}
 
 	if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) {
 		m0 = m_pullup(m0, m0->m_pkthdr.len);
 		if (m0 == NULL) {
 			/* Should have left well enough alone. */
 			rc = EFBIG;
 			goto fail;
 		}
 		*mp = m0;	/* update caller's copy after pullup */
 		goto restart;
 	}
 	set_mbuf_nsegs(m0, nsegs);
 	set_mbuf_cflags(m0, 0);
 	if (sc->flags & IS_VF)
 		set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0)));
 	else
 		set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
 
 #ifdef RATELIMIT
 	/*
 	 * Ethofld is limited to TCP and UDP for now, and only when L4 hw
 	 * checksumming is enabled.  needs_l4_csum happens to check for all the
 	 * right things.
 	 */
-	if (__predict_false(needs_eo(m0) && !needs_l4_csum(m0)))
+	if (__predict_false(needs_eo(m0) && !needs_l4_csum(m0))) {
+		m_snd_tag_rele(m0->m_pkthdr.snd_tag);
 		m0->m_pkthdr.snd_tag = NULL;
+		m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
+	}
 #endif
 
 	if (!needs_tso(m0) &&
 #ifdef RATELIMIT
 	    !needs_eo(m0) &&
 #endif
 	    !(sc->flags & IS_VF && (needs_l3_csum(m0) || needs_l4_csum(m0))))
 		return (0);
 
 	m = m0;
 	eh = mtod(m, struct ether_header *);
 	eh_type = ntohs(eh->ether_type);
 	if (eh_type == ETHERTYPE_VLAN) {
 		struct ether_vlan_header *evh = (void *)eh;
 
 		eh_type = ntohs(evh->evl_proto);
 		m0->m_pkthdr.l2hlen = sizeof(*evh);
 	} else
 		m0->m_pkthdr.l2hlen = sizeof(*eh);
 
 	offset = 0;
 	l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
 
 	switch (eh_type) {
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 	{
 		struct ip6_hdr *ip6 = l3hdr;
 
 		MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP);
 
 		m0->m_pkthdr.l3hlen = sizeof(*ip6);
 		break;
 	}
 #endif
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		struct ip *ip = l3hdr;
 
 		m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
 		break;
 	}
 #endif
 	default:
 		panic("%s: ethertype 0x%04x unknown.  if_cxgbe must be compiled"
 		    " with the same INET/INET6 options as the kernel.",
 		    __func__, eh_type);
 	}
 
 #if defined(INET) || defined(INET6)
 	if (needs_tcp_csum(m0)) {
 		tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
 		m0->m_pkthdr.l4hlen = tcp->th_off * 4;
 #ifdef RATELIMIT
 		if (tsclk >= 0 && *(uint32_t *)(tcp + 1) == ntohl(0x0101080a)) {
 			set_mbuf_eo_tsclk_tsoff(m0,
 			    V_FW_ETH_TX_EO_WR_TSCLK(tsclk) |
 			    V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1));
 		} else
 			set_mbuf_eo_tsclk_tsoff(m0, 0);
 	} else if (needs_udp_csum(m)) {
 		m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
 #endif
 	}
 #ifdef RATELIMIT
 	if (needs_eo(m0)) {
 		u_int immhdrs;
 
 		/* EO WRs have the headers in the WR and not the GL. */
 		immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen +
 		    m0->m_pkthdr.l4hlen;
 		nsegs = count_mbuf_nsegs(m0, immhdrs);
 		set_mbuf_eo_nsegs(m0, nsegs);
 		set_mbuf_eo_len16(m0,
 		    txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0)));
 	}
 #endif
 #endif
 	MPASS(m0 == *mp);
 	return (0);
 }
 
 void *
 start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie)
 {
 	struct sge_eq *eq = &wrq->eq;
 	struct adapter *sc = wrq->adapter;
 	int ndesc, available;
 	struct wrqe *wr;
 	void *w;
 
 	MPASS(len16 > 0);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC);
 
 	EQ_LOCK(eq);
 
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(sc, wrq);
 
 	if (!STAILQ_EMPTY(&wrq->wr_list)) {
 slowpath:
 		EQ_UNLOCK(eq);
 		wr = alloc_wrqe(len16 * 16, wrq);
 		if (__predict_false(wr == NULL))
 			return (NULL);
 		cookie->pidx = -1;
 		cookie->ndesc = ndesc;
 		return (&wr->wr);
 	}
 
 	eq->cidx = read_hw_cidx(eq);
 	if (eq->pidx == eq->cidx)
 		available = eq->sidx - 1;
 	else
 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 	if (available < ndesc)
 		goto slowpath;
 
 	cookie->pidx = eq->pidx;
 	cookie->ndesc = ndesc;
 	TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link);
 
 	w = &eq->desc[eq->pidx];
 	IDXINCR(eq->pidx, ndesc, eq->sidx);
 	if (__predict_false(cookie->pidx + ndesc > eq->sidx)) {
 		w = &wrq->ss[0];
 		wrq->ss_pidx = cookie->pidx;
 		wrq->ss_len = len16 * 16;
 	}
 
 	EQ_UNLOCK(eq);
 
 	return (w);
 }
 
 void
 commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie)
 {
 	struct sge_eq *eq = &wrq->eq;
 	struct adapter *sc = wrq->adapter;
 	int ndesc, pidx;
 	struct wrq_cookie *prev, *next;
 
 	if (cookie->pidx == -1) {
 		struct wrqe *wr = __containerof(w, struct wrqe, wr);
 
 		t4_wrq_tx(sc, wr);
 		return;
 	}
 
 	if (__predict_false(w == &wrq->ss[0])) {
 		int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE;
 
 		MPASS(wrq->ss_len > n);	/* WR had better wrap around. */
 		bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n);
 		bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n);
 		wrq->tx_wrs_ss++;
 	} else
 		wrq->tx_wrs_direct++;
 
 	EQ_LOCK(eq);
 	ndesc = cookie->ndesc;	/* Can be more than SGE_MAX_WR_NDESC here. */
 	pidx = cookie->pidx;
 	MPASS(pidx >= 0 && pidx < eq->sidx);
 	prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link);
 	next = TAILQ_NEXT(cookie, link);
 	if (prev == NULL) {
 		MPASS(pidx == eq->dbidx);
 		if (next == NULL || ndesc >= 16) {
 			int available;
 			struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
 
 			/*
 			 * Note that the WR via which we'll request tx updates
 			 * is at pidx and not eq->pidx, which has moved on
 			 * already.
 			 */
 			dst = (void *)&eq->desc[pidx];
 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 			if (available < eq->sidx / 4 &&
 			    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 				/*
 				 * XXX: This is not 100% reliable with some
 				 * types of WRs.  But this is a very unusual
 				 * situation for an ofld/ctrl queue anyway.
 				 */
 				dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 				    F_FW_WR_EQUEQ);
 			}
 
 			ring_eq_db(wrq->adapter, eq, ndesc);
 		} else {
 			MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc);
 			next->pidx = pidx;
 			next->ndesc += ndesc;
 		}
 	} else {
 		MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc);
 		prev->ndesc += ndesc;
 	}
 	TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link);
 
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(sc, wrq);
 
 #ifdef INVARIANTS
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
 		/* Doorbell must have caught up to the pidx. */
 		MPASS(wrq->eq.pidx == wrq->eq.dbidx);
 	}
 #endif
 	EQ_UNLOCK(eq);
 }
 
 static u_int
 can_resume_eth_tx(struct mp_ring *r)
 {
 	struct sge_eq *eq = r->cookie;
 
 	return (total_available_tx_desc(eq) > eq->sidx / 8);
 }
 
 static inline int
 cannot_use_txpkts(struct mbuf *m)
 {
 	/* maybe put a GL limit too, to avoid silliness? */
 
 	return (needs_tso(m) || (mbuf_cflags(m) & MC_RAW_WR) != 0);
 }
 
 static inline int
 discard_tx(struct sge_eq *eq)
 {
 
 	return ((eq->flags & (EQ_ENABLED | EQ_QFLUSH)) != EQ_ENABLED);
 }
 
 static inline int
 wr_can_update_eq(struct fw_eth_tx_pkts_wr *wr)
 {
 
 	switch (G_FW_WR_OP(be32toh(wr->op_pkd))) {
 	case FW_ULPTX_WR:
 	case FW_ETH_TX_PKT_WR:
 	case FW_ETH_TX_PKTS_WR:
 	case FW_ETH_TX_PKT_VM_WR:
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 /*
  * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
  * be consumed.  Return the actual number consumed.  0 indicates a stall.
  */
 static u_int
 eth_tx(struct mp_ring *r, u_int cidx, u_int pidx)
 {
 	struct sge_txq *txq = r->cookie;
 	struct sge_eq *eq = &txq->eq;
 	struct ifnet *ifp = txq->ifp;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	u_int total, remaining;		/* # of packets */
 	u_int available, dbdiff;	/* # of hardware descriptors */
 	u_int n, next_cidx;
 	struct mbuf *m0, *tail;
 	struct txpkts txp;
 	struct fw_eth_tx_pkts_wr *wr;	/* any fw WR struct will do */
 
 	remaining = IDXDIFF(pidx, cidx, r->size);
 	MPASS(remaining > 0);	/* Must not be called without work to do. */
 	total = 0;
 
 	TXQ_LOCK(txq);
 	if (__predict_false(discard_tx(eq))) {
 		while (cidx != pidx) {
 			m0 = r->items[cidx];
 			m_freem(m0);
 			if (++cidx == r->size)
 				cidx = 0;
 		}
 		reclaim_tx_descs(txq, 2048);
 		total = remaining;
 		goto done;
 	}
 
 	/* How many hardware descriptors do we have readily available. */
 	if (eq->pidx == eq->cidx)
 		available = eq->sidx - 1;
 	else
 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
 
 	while (remaining > 0) {
 
 		m0 = r->items[cidx];
 		M_ASSERTPKTHDR(m0);
 		MPASS(m0->m_nextpkt == NULL);
 
 		if (available < SGE_MAX_WR_NDESC) {
 			available += reclaim_tx_descs(txq, 64);
 			if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16))
 				break;	/* out of descriptors */
 		}
 
 		next_cidx = cidx + 1;
 		if (__predict_false(next_cidx == r->size))
 			next_cidx = 0;
 
 		wr = (void *)&eq->desc[eq->pidx];
 		if (sc->flags & IS_VF) {
 			total++;
 			remaining--;
 			ETHER_BPF_MTAP(ifp, m0);
 			n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0,
 			    available);
 		} else if (remaining > 1 &&
 		    try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) {
 
 			/* pkts at cidx, next_cidx should both be in txp. */
 			MPASS(txp.npkt == 2);
 			tail = r->items[next_cidx];
 			MPASS(tail->m_nextpkt == NULL);
 			ETHER_BPF_MTAP(ifp, m0);
 			ETHER_BPF_MTAP(ifp, tail);
 			m0->m_nextpkt = tail;
 
 			if (__predict_false(++next_cidx == r->size))
 				next_cidx = 0;
 
 			while (next_cidx != pidx) {
 				if (add_to_txpkts(r->items[next_cidx], &txp,
 				    available) != 0)
 					break;
 				tail->m_nextpkt = r->items[next_cidx];
 				tail = tail->m_nextpkt;
 				ETHER_BPF_MTAP(ifp, tail);
 				if (__predict_false(++next_cidx == r->size))
 					next_cidx = 0;
 			}
 
 			n = write_txpkts_wr(txq, wr, m0, &txp, available);
 			total += txp.npkt;
 			remaining -= txp.npkt;
 		} else if (mbuf_cflags(m0) & MC_RAW_WR) {
 			total++;
 			remaining--;
 			n = write_raw_wr(txq, (void *)wr, m0, available);
 		} else {
 			total++;
 			remaining--;
 			ETHER_BPF_MTAP(ifp, m0);
 			n = write_txpkt_wr(txq, (void *)wr, m0, available);
 		}
 		MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC);
 
 		available -= n;
 		dbdiff += n;
 		IDXINCR(eq->pidx, n, eq->sidx);
 
 		if (wr_can_update_eq(wr)) {
 			if (total_available_tx_desc(eq) < eq->sidx / 4 &&
 			    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 				wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 				    F_FW_WR_EQUEQ);
 				eq->equeqidx = eq->pidx;
 			} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >=
 			    32) {
 				wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
 				eq->equeqidx = eq->pidx;
 			}
 		}
 
 		if (dbdiff >= 16 && remaining >= 4) {
 			ring_eq_db(sc, eq, dbdiff);
 			available += reclaim_tx_descs(txq, 4 * dbdiff);
 			dbdiff = 0;
 		}
 
 		cidx = next_cidx;
 	}
 	if (dbdiff != 0) {
 		ring_eq_db(sc, eq, dbdiff);
 		reclaim_tx_descs(txq, 32);
 	}
 done:
 	TXQ_UNLOCK(txq);
 
 	return (total);
 }
 
 static inline void
 init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
     int qsize)
 {
 
 	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
 	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
 	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
 	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
 
 	iq->flags = 0;
 	iq->adapter = sc;
 	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
 	iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
 	if (pktc_idx >= 0) {
 		iq->intr_params |= F_QINTR_CNT_EN;
 		iq->intr_pktc_idx = pktc_idx;
 	}
 	iq->qsize = roundup2(qsize, 16);	/* See FW_IQ_CMD/iqsize */
 	iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE;
 }
 
 static inline void
 init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
 {
 
 	fl->qsize = qsize;
 	fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
 	strlcpy(fl->lockname, name, sizeof(fl->lockname));
 	if (sc->flags & BUF_PACKING_OK &&
 	    ((!is_t4(sc) && buffer_packing) ||	/* T5+: enabled unless 0 */
 	    (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */
 		fl->flags |= FL_BUF_PACKING;
 	find_best_refill_source(sc, fl, maxp);
 	find_safe_refill_source(sc, fl);
 }
 
 static inline void
 init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize,
     uint8_t tx_chan, uint16_t iqid, char *name)
 {
 	KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype));
 
 	eq->flags = eqtype & EQ_TYPEMASK;
 	eq->tx_chan = tx_chan;
 	eq->iqid = iqid;
 	eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
 	strlcpy(eq->lockname, name, sizeof(eq->lockname));
 }
 
 static int
 alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
     bus_dmamap_t *map, bus_addr_t *pa, void **va)
 {
 	int rc;
 
 	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
 	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
 	if (rc != 0) {
 		device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc);
 		goto done;
 	}
 
 	rc = bus_dmamem_alloc(*tag, va,
 	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
 	if (rc != 0) {
 		device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc);
 		goto done;
 	}
 
 	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
 	if (rc != 0) {
 		device_printf(sc->dev, "cannot load DMA map: %d\n", rc);
 		goto done;
 	}
 done:
 	if (rc)
 		free_ring(sc, *tag, *map, *pa, *va);
 
 	return (rc);
 }
 
 static int
 free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
     bus_addr_t pa, void *va)
 {
 	if (pa)
 		bus_dmamap_unload(tag, map);
 	if (va)
 		bus_dmamem_free(tag, va, map);
 	if (tag)
 		bus_dma_tag_destroy(tag);
 
 	return (0);
 }
 
 /*
  * Allocates the ring for an ingress queue and an optional freelist.  If the
  * freelist is specified it will be allocated and then associated with the
  * ingress queue.
  *
  * Returns errno on failure.  Resources allocated up to that point may still be
  * allocated.  Caller is responsible for cleanup in case this function fails.
  *
  * If the ingress queue will take interrupts directly then the intr_idx
  * specifies the vector, starting from 0.  -1 means the interrupts for this
  * queue should be forwarded to the fwq.
  */
 static int
 alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl,
     int intr_idx, int cong)
 {
 	int rc, i, cntxt_id;
 	size_t len;
 	struct fw_iq_cmd c;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = iq->adapter;
 	struct sge_params *sp = &sc->params.sge;
 	__be32 v = 0;
 
 	len = iq->qsize * IQ_ESIZE;
 	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
 	    (void **)&iq->desc);
 	if (rc != 0)
 		return (rc);
 
 	bzero(&c, sizeof(c));
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
 	    V_FW_IQ_CMD_VFN(0));
 
 	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
 	    FW_LEN16(c));
 
 	/* Special handling for firmware event queue */
 	if (iq == &sc->sge.fwq)
 		v |= F_FW_IQ_CMD_IQASYNCH;
 
 	if (intr_idx < 0) {
 		/* Forwarded interrupts, all headed to fwq */
 		v |= F_FW_IQ_CMD_IQANDST;
 		v |= V_FW_IQ_CMD_IQANDSTINDEX(sc->sge.fwq.cntxt_id);
 	} else {
 		KASSERT(intr_idx < sc->intr_count,
 		    ("%s: invalid direct intr_idx %d", __func__, intr_idx));
 		v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
 	}
 
 	c.type_to_iqandstindex = htobe32(v |
 	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
 	    V_FW_IQ_CMD_VIID(vi->viid) |
 	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
 	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
 	    F_FW_IQ_CMD_IQGTSMODE |
 	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
 	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
 	c.iqsize = htobe16(iq->qsize);
 	c.iqaddr = htobe64(iq->ba);
 	if (cong >= 0)
 		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
 
 	if (fl) {
 		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
 
 		len = fl->qsize * EQ_ESIZE;
 		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
 		    &fl->ba, (void **)&fl->desc);
 		if (rc)
 			return (rc);
 
 		/* Allocate space for one software descriptor per buffer. */
 		rc = alloc_fl_sdesc(fl);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to setup fl software descriptors: %d\n",
 			    rc);
 			return (rc);
 		}
 
 		if (fl->flags & FL_BUF_PACKING) {
 			fl->lowat = roundup2(sp->fl_starve_threshold2, 8);
 			fl->buf_boundary = sp->pack_boundary;
 		} else {
 			fl->lowat = roundup2(sp->fl_starve_threshold, 8);
 			fl->buf_boundary = 16;
 		}
 		if (fl_pad && fl->buf_boundary < sp->pad_boundary)
 			fl->buf_boundary = sp->pad_boundary;
 
 		c.iqns_to_fl0congen |=
 		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
 			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
 			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
 			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
 			    0));
 		if (cong >= 0) {
 			c.iqns_to_fl0congen |=
 				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
 				    F_FW_IQ_CMD_FL0CONGCIF |
 				    F_FW_IQ_CMD_FL0CONGEN);
 		}
 		c.fl0dcaen_to_fl0cidxfthresh =
 		    htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 			X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B) |
 			V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ?
 			X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B));
 		c.fl0size = htobe16(fl->qsize);
 		c.fl0addr = htobe64(fl->ba);
 	}
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create ingress queue: %d\n", rc);
 		return (rc);
 	}
 
 	iq->cidx = 0;
 	iq->gen = F_RSPD_GEN;
 	iq->intr_next = iq->intr_params;
 	iq->cntxt_id = be16toh(c.iqid);
 	iq->abs_id = be16toh(c.physiqid);
 	iq->flags |= IQ_ALLOCATED;
 
 	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
 	if (cntxt_id >= sc->sge.niq) {
 		panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
 		    cntxt_id, sc->sge.niq - 1);
 	}
 	sc->sge.iqmap[cntxt_id] = iq;
 
 	if (fl) {
 		u_int qid;
 
 		iq->flags |= IQ_HAS_FL;
 		fl->cntxt_id = be16toh(c.fl0id);
 		fl->pidx = fl->cidx = 0;
 
 		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
 		if (cntxt_id >= sc->sge.neq) {
 			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
 			    __func__, cntxt_id, sc->sge.neq - 1);
 		}
 		sc->sge.eqmap[cntxt_id] = (void *)fl;
 
 		qid = fl->cntxt_id;
 		if (isset(&sc->doorbells, DOORBELL_UDB)) {
 			uint32_t s_qpp = sc->params.sge.eq_s_qpp;
 			uint32_t mask = (1 << s_qpp) - 1;
 			volatile uint8_t *udb;
 
 			udb = sc->udbs_base + UDBS_DB_OFFSET;
 			udb += (qid >> s_qpp) << PAGE_SHIFT;
 			qid &= mask;
 			if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
 				udb += qid << UDBS_SEG_SHIFT;
 				qid = 0;
 			}
 			fl->udb = (volatile void *)udb;
 		}
 		fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db;
 
 		FL_LOCK(fl);
 		/* Enough to make sure the SGE doesn't think it's starved */
 		refill_fl(sc, fl, fl->lowat);
 		FL_UNLOCK(fl);
 	}
 
 	if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) && cong >= 0) {
 		uint32_t param, val;
 
 		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
 		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
 		if (cong == 0)
 			val = 1 << 19;
 		else {
 			val = 2 << 19;
 			for (i = 0; i < 4; i++) {
 				if (cong & (1 << i))
 					val |= 1 << (i << 2);
 			}
 		}
 
 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		if (rc != 0) {
 			/* report error but carry on */
 			device_printf(sc->dev,
 			    "failed to set congestion manager context for "
 			    "ingress queue %d: %d\n", iq->cntxt_id, rc);
 		}
 	}
 
 	/* Enable IQ interrupts */
 	atomic_store_rel_int(&iq->state, IQS_IDLE);
 	t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) |
 	    V_INGRESSQID(iq->cntxt_id));
 
 	return (0);
 }
 
 static int
 free_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl)
 {
 	int rc;
 	struct adapter *sc = iq->adapter;
 	device_t dev;
 
 	if (sc == NULL)
 		return (0);	/* nothing to do */
 
 	dev = vi ? vi->dev : sc->dev;
 
 	if (iq->flags & IQ_ALLOCATED) {
 		rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
 		    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
 		    fl ? fl->cntxt_id : 0xffff, 0xffff);
 		if (rc != 0) {
 			device_printf(dev,
 			    "failed to free queue %p: %d\n", iq, rc);
 			return (rc);
 		}
 		iq->flags &= ~IQ_ALLOCATED;
 	}
 
 	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
 
 	bzero(iq, sizeof(*iq));
 
 	if (fl) {
 		free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba,
 		    fl->desc);
 
 		if (fl->sdesc)
 			free_fl_sdesc(sc, fl);
 
 		if (mtx_initialized(&fl->fl_lock))
 			mtx_destroy(&fl->fl_lock);
 
 		bzero(fl, sizeof(*fl));
 	}
 
 	return (0);
 }
 
 static void
 add_iq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
     struct sge_iq *iq)
 {
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &iq->ba,
 	    "bus address of descriptor ring");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    iq->qsize * IQ_ESIZE, "descriptor ring size in bytes");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &iq->abs_id, 0, sysctl_uint16, "I",
 	    "absolute id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &iq->cntxt_id, 0, sysctl_uint16, "I",
 	    "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &iq->cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 }
 
 static void
 add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
     struct sysctl_oid *oid, struct sge_fl *fl)
 {
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
 	    "freelist");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &fl->ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    fl->sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I",
 	    "SGE context id of the freelist");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL,
 	    fl_pad ? 1 : 0, "padding enabled");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL,
 	    fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
 	    0, "consumer index");
 	if (fl->flags & FL_BUF_PACKING) {
 		SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset",
 		    CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
 	}
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
 	    0, "producer index");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated",
 	    CTLFLAG_RD, &fl->mbuf_allocated, "# of mbuf allocated");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined",
 	    CTLFLAG_RD, &fl->mbuf_inlined, "# of mbuf inlined in clusters");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated",
 	    CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled",
 	    CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled",
 	    CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
 }
 
 static int
 alloc_fwq(struct adapter *sc)
 {
 	int rc, intr_idx;
 	struct sge_iq *fwq = &sc->sge.fwq;
 	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE);
 	if (sc->flags & IS_VF)
 		intr_idx = 0;
 	else
 		intr_idx = sc->intr_count > 1 ? 1 : 0;
 	rc = alloc_iq_fl(&sc->port[0]->vi[0], fwq, NULL, intr_idx, -1);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create firmware event queue: %d\n", rc);
 		return (rc);
 	}
 
 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD,
 	    NULL, "firmware event queue");
 	add_iq_sysctls(&sc->ctx, oid, fwq);
 
 	return (0);
 }
 
 static int
 free_fwq(struct adapter *sc)
 {
 	return free_iq_fl(NULL, &sc->sge.fwq, NULL);
 }
 
 static int
 alloc_ctrlq(struct adapter *sc, struct sge_wrq *ctrlq, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	char name[16];
 	struct sysctl_oid_list *children;
 
 	snprintf(name, sizeof(name), "%s ctrlq%d", device_get_nameunit(sc->dev),
 	    idx);
 	init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[idx]->tx_chan,
 	    sc->sge.fwq.cntxt_id, name);
 
 	children = SYSCTL_CHILDREN(oid);
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "ctrl queue");
 	rc = alloc_wrq(sc, NULL, ctrlq, oid);
 
 	return (rc);
 }
 
 int
 tnl_cong(struct port_info *pi, int drop)
 {
 
 	if (drop == -1)
 		return (-1);
 	else if (drop == 1)
 		return (0);
 	else
 		return (pi->rx_e_chan_map);
 }
 
 static int
 alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int intr_idx, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sysctl_oid_list *children;
 	char name[16];
 
 	rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx,
 	    tnl_cong(vi->pi, cong_drop));
 	if (rc != 0)
 		return (rc);
 
 	if (idx == 0)
 		sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id;
 	else
 		KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id,
 		    ("iq_base mismatch"));
 	KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF,
 	    ("PF with non-zero iq_base"));
 
 	/*
 	 * The freelist is just barely above the starvation threshold right now,
 	 * fill it up a bit more.
 	 */
 	FL_LOCK(&rxq->fl);
 	refill_fl(sc, &rxq->fl, 128);
 	FL_UNLOCK(&rxq->fl);
 
 #if defined(INET) || defined(INET6)
 	rc = tcp_lro_init_args(&rxq->lro, vi->ifp, lro_entries, lro_mbufs);
 	if (rc != 0)
 		return (rc);
 	MPASS(rxq->lro.ifp == vi->ifp);	/* also indicates LRO init'ed */
 
 	if (vi->ifp->if_capenable & IFCAP_LRO)
 		rxq->iq.flags |= IQ_LRO_ENABLED;
 #endif
 	if (vi->ifp->if_capenable & IFCAP_HWRXTSTMP)
 		rxq->iq.flags |= IQ_RX_TIMESTAMP;
 	rxq->ifp = vi->ifp;
 
 	children = SYSCTL_CHILDREN(oid);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "rx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	add_iq_sysctls(&vi->ctx, oid, &rxq->iq);
 #if defined(INET) || defined(INET6)
 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
 	    &rxq->lro.lro_queued, 0, NULL);
 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
 	    &rxq->lro.lro_flushed, 0, NULL);
 #endif
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
 	    &rxq->rxcsum, "# of times hardware assisted with checksum");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction",
 	    CTLFLAG_RD, &rxq->vlan_extraction,
 	    "# of times hardware extracted 802.1Q tag");
 
 	add_fl_sysctls(sc, &vi->ctx, oid, &rxq->fl);
 
 	return (rc);
 }
 
 static int
 free_rxq(struct vi_info *vi, struct sge_rxq *rxq)
 {
 	int rc;
 
 #if defined(INET) || defined(INET6)
 	if (rxq->lro.ifp) {
 		tcp_lro_free(&rxq->lro);
 		rxq->lro.ifp = NULL;
 	}
 #endif
 
 	rc = free_iq_fl(vi, &rxq->iq, &rxq->fl);
 	if (rc == 0)
 		bzero(rxq, sizeof(*rxq));
 
 	return (rc);
 }
 
 #ifdef TCP_OFFLOAD
 static int
 alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq,
     int intr_idx, int idx, struct sysctl_oid *oid)
 {
 	struct port_info *pi = vi->pi;
 	int rc;
 	struct sysctl_oid_list *children;
 	char name[16];
 
 	rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx, 0);
 	if (rc != 0)
 		return (rc);
 
 	children = SYSCTL_CHILDREN(oid);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "rx queue");
 	add_iq_sysctls(&vi->ctx, oid, &ofld_rxq->iq);
 	add_fl_sysctls(pi->adapter, &vi->ctx, oid, &ofld_rxq->fl);
 
 	return (rc);
 }
 
 static int
 free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq)
 {
 	int rc;
 
 	rc = free_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl);
 	if (rc == 0)
 		bzero(ofld_rxq, sizeof(*ofld_rxq));
 
 	return (rc);
 }
 #endif
 
 #ifdef DEV_NETMAP
 static int
 alloc_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int intr_idx,
     int idx, struct sysctl_oid *oid)
 {
 	int rc;
 	struct sysctl_oid_list *children;
 	struct sysctl_ctx_list *ctx;
 	char name[16];
 	size_t len;
 	struct adapter *sc = vi->pi->adapter;
 	struct netmap_adapter *na = NA(vi->ifp);
 
 	MPASS(na != NULL);
 
 	len = vi->qsize_rxq * IQ_ESIZE;
 	rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map,
 	    &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc);
 	if (rc != 0)
 		return (rc);
 
 	len = na->num_rx_desc * EQ_ESIZE + sc->params.sge.spg_len;
 	rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map,
 	    &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc);
 	if (rc != 0)
 		return (rc);
 
 	nm_rxq->vi = vi;
 	nm_rxq->nid = idx;
 	nm_rxq->iq_cidx = 0;
 	nm_rxq->iq_sidx = vi->qsize_rxq - sc->params.sge.spg_len / IQ_ESIZE;
 	nm_rxq->iq_gen = F_RSPD_GEN;
 	nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0;
 	nm_rxq->fl_sidx = na->num_rx_desc;
 	nm_rxq->intr_idx = intr_idx;
 	nm_rxq->iq_cntxt_id = INVALID_NM_RXQ_CNTXT_ID;
 
 	ctx = &vi->ctx;
 	children = SYSCTL_CHILDREN(oid);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL,
 	    "rx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16,
 	    "I", "absolute id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16,
 	    "I", "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 
 	children = SYSCTL_CHILDREN(oid);
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
 	    "freelist");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16,
 	    "I", "SGE context id of the freelist");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
 	    &nm_rxq->fl_cidx, 0, "consumer index");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
 	    &nm_rxq->fl_pidx, 0, "producer index");
 
 	return (rc);
 }
 
 
 static int
 free_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq)
 {
 	struct adapter *sc = vi->pi->adapter;
 
 	if (vi->flags & VI_INIT_DONE)
 		MPASS(nm_rxq->iq_cntxt_id == INVALID_NM_RXQ_CNTXT_ID);
 	else
 		MPASS(nm_rxq->iq_cntxt_id == 0);
 
 	free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba,
 	    nm_rxq->iq_desc);
 	free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba,
 	    nm_rxq->fl_desc);
 
 	return (0);
 }
 
 static int
 alloc_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq, int iqidx, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	size_t len;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct netmap_adapter *na = NA(vi->ifp);
 	char name[16];
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len;
 	rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map,
 	    &nm_txq->ba, (void **)&nm_txq->desc);
 	if (rc)
 		return (rc);
 
 	nm_txq->pidx = nm_txq->cidx = 0;
 	nm_txq->sidx = na->num_tx_desc;
 	nm_txq->nid = idx;
 	nm_txq->iqidx = iqidx;
 	nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
 	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
 	    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
 	nm_txq->cntxt_id = INVALID_NM_TXQ_CNTXT_ID;
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "netmap tx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &nm_txq->cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I",
 	    "producer index");
 
 	return (rc);
 }
 
 static int
 free_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq)
 {
 	struct adapter *sc = vi->pi->adapter;
 
 	if (vi->flags & VI_INIT_DONE)
 		MPASS(nm_txq->cntxt_id == INVALID_NM_TXQ_CNTXT_ID);
 	else
 		MPASS(nm_txq->cntxt_id == 0);
 
 	free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba,
 	    nm_txq->desc);
 
 	return (0);
 }
 #endif
 
 /*
  * Returns a reasonable automatic cidx flush threshold for a given queue size.
  */
 static u_int
 qsize_to_fthresh(int qsize)
 {
 	u_int fthresh;
 
 	while (!powerof2(qsize))
 		qsize++;
 	fthresh = ilog2(qsize);
 	if (fthresh > X_CIDXFLUSHTHRESH_128)
 		fthresh = X_CIDXFLUSHTHRESH_128;
 
 	return (fthresh);
 }
 
 static int
 ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_ctrl_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
 	    V_FW_EQ_CTRL_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
 	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
 	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
 	c.physeqid_pkd = htobe32(0);
 	c.fetchszm_to_iqid =
 	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
 		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
 		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_CTRL_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
 		V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create control queue %d: %d\n", eq->tx_chan, rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 
 static int
 eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_eth_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
 	    V_FW_EQ_ETH_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
 	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
 	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
 	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid));
 	c.fetchszm_to_iqid =
 	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
 		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 	    V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 	    V_FW_EQ_ETH_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(vi->dev,
 		    "failed to create Ethernet egress queue: %d\n", rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
 	eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 static int
 ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_ofld_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
 	    V_FW_EQ_OFLD_CMD_VFN(0));
 	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
 	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
 	c.fetchszm_to_iqid =
 		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
 		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
 		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_OFLD_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
 		V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(vi->dev,
 		    "failed to create egress queue for TCP offload: %d\n", rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 #endif
 
 static int
 alloc_eq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, qsize;
 	size_t len;
 
 	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
 
 	qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 	len = qsize * EQ_ESIZE;
 	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
 	    &eq->ba, (void **)&eq->desc);
 	if (rc)
 		return (rc);
 
 	eq->pidx = eq->cidx = eq->dbidx = 0;
 	/* Note that equeqidx is not used with sge_wrq (OFLD/CTRL) queues. */
 	eq->equeqidx = 0;
 	eq->doorbells = sc->doorbells;
 
 	switch (eq->flags & EQ_TYPEMASK) {
 	case EQ_CTRL:
 		rc = ctrl_eq_alloc(sc, eq);
 		break;
 
 	case EQ_ETH:
 		rc = eth_eq_alloc(sc, vi, eq);
 		break;
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	case EQ_OFLD:
 		rc = ofld_eq_alloc(sc, vi, eq);
 		break;
 #endif
 
 	default:
 		panic("%s: invalid eq type %d.", __func__,
 		    eq->flags & EQ_TYPEMASK);
 	}
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to allocate egress queue(%d): %d\n",
 		    eq->flags & EQ_TYPEMASK, rc);
 	}
 
 	if (isset(&eq->doorbells, DOORBELL_UDB) ||
 	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
 	    isset(&eq->doorbells, DOORBELL_WCWR)) {
 		uint32_t s_qpp = sc->params.sge.eq_s_qpp;
 		uint32_t mask = (1 << s_qpp) - 1;
 		volatile uint8_t *udb;
 
 		udb = sc->udbs_base + UDBS_DB_OFFSET;
 		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
 		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
 		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
 	    		clrbit(&eq->doorbells, DOORBELL_WCWR);
 		else {
 			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
 			eq->udb_qid = 0;
 		}
 		eq->udb = (volatile void *)udb;
 	}
 
 	return (rc);
 }
 
 static int
 free_eq(struct adapter *sc, struct sge_eq *eq)
 {
 	int rc;
 
 	if (eq->flags & EQ_ALLOCATED) {
 		switch (eq->flags & EQ_TYPEMASK) {
 		case EQ_CTRL:
 			rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 
 		case EQ_ETH:
 			rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 		case EQ_OFLD:
 			rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 #endif
 
 		default:
 			panic("%s: invalid eq type %d.", __func__,
 			    eq->flags & EQ_TYPEMASK);
 		}
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to free egress queue (%d): %d\n",
 			    eq->flags & EQ_TYPEMASK, rc);
 			return (rc);
 		}
 		eq->flags &= ~EQ_ALLOCATED;
 	}
 
 	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
 
 	if (mtx_initialized(&eq->eq_lock))
 		mtx_destroy(&eq->eq_lock);
 
 	bzero(eq, sizeof(*eq));
 	return (0);
 }
 
 static int
 alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq,
     struct sysctl_oid *oid)
 {
 	int rc;
 	struct sysctl_ctx_list *ctx = vi ? &vi->ctx : &sc->ctx;
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	rc = alloc_eq(sc, vi, &wrq->eq);
 	if (rc)
 		return (rc);
 
 	wrq->adapter = sc;
 	TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
 	TAILQ_INIT(&wrq->incomplete_wrs);
 	STAILQ_INIT(&wrq->wr_list);
 	wrq->nwr_pending = 0;
 	wrq->ndesc_needed = 0;
 
 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &wrq->eq.ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    wrq->eq.sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I",
 	    "producer index");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
 	    wrq->eq.sidx, "status page index");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD,
 	    &wrq->tx_wrs_direct, "# of work requests (direct)");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD,
 	    &wrq->tx_wrs_copied, "# of work requests (copied)");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD,
 	    &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)");
 
 	return (rc);
 }
 
 static int
 free_wrq(struct adapter *sc, struct sge_wrq *wrq)
 {
 	int rc;
 
 	rc = free_eq(sc, &wrq->eq);
 	if (rc)
 		return (rc);
 
 	bzero(wrq, sizeof(*wrq));
 	return (0);
 }
 
 static int
 alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct sge_eq *eq = &txq->eq;
 	char name[16];
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
 	    M_CXGBE, M_WAITOK);
 	if (rc != 0) {
 		device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc);
 		return (rc);
 	}
 
 	rc = alloc_eq(sc, vi, eq);
 	if (rc != 0) {
 		mp_ring_free(txq->r);
 		txq->r = NULL;
 		return (rc);
 	}
 
 	/* Can't fail after this point. */
 
 	if (idx == 0)
 		sc->sge.eq_base = eq->abs_id - eq->cntxt_id;
 	else
 		KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id,
 		    ("eq_base mismatch"));
 	KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF,
 	    ("PF with non-zero eq_base"));
 
 	TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
 	txq->ifp = vi->ifp;
 	txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
 	if (sc->flags & IS_VF)
 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
 		    V_TXPKT_INTF(pi->tx_chan));
 	else
 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
 		    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
 		    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
 	txq->tc_idx = -1;
 	txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "tx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &eq->ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    eq->sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD,
 	    &eq->abs_id, 0, "absolute id of the queue");
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &eq->cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I",
 	    "producer index");
 	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
 	    eq->sidx, "status page index");
 
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "tc",
 	    CTLTYPE_INT | CTLFLAG_RW, vi, idx, sysctl_tc, "I",
 	    "traffic class (-1 means none)");
 
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
 	    &txq->txcsum, "# of times hardware assisted with checksum");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_insertion",
 	    CTLFLAG_RD, &txq->vlan_insertion,
 	    "# of times hardware inserted 802.1Q tag");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
 	    &txq->tso_wrs, "# of TSO work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
 	    &txq->imm_wrs, "# of work requests with immediate data");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
 	    &txq->sgl_wrs, "# of work requests with direct SGL");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
 	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_wrs",
 	    CTLFLAG_RD, &txq->txpkts0_wrs,
 	    "# of txpkts (type 0) work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_wrs",
 	    CTLFLAG_RD, &txq->txpkts1_wrs,
 	    "# of txpkts (type 1) work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_pkts",
 	    CTLFLAG_RD, &txq->txpkts0_pkts,
 	    "# of frames tx'd using type0 txpkts work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_pkts",
 	    CTLFLAG_RD, &txq->txpkts1_pkts,
 	    "# of frames tx'd using type1 txpkts work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD,
 	    &txq->raw_wrs, "# of raw work requests (non-packets)");
 
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues",
 	    CTLFLAG_RD, &txq->r->enqueues,
 	    "# of enqueues to the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops",
 	    CTLFLAG_RD, &txq->r->drops,
 	    "# of drops in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts",
 	    CTLFLAG_RD, &txq->r->starts,
 	    "# of normal consumer starts in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls",
 	    CTLFLAG_RD, &txq->r->stalls,
 	    "# of consumer stalls in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts",
 	    CTLFLAG_RD, &txq->r->restarts,
 	    "# of consumer restarts in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications",
 	    CTLFLAG_RD, &txq->r->abdications,
 	    "# of consumer abdications in the mp_ring for this queue");
 
 	return (0);
 }
 
 static int
 free_txq(struct vi_info *vi, struct sge_txq *txq)
 {
 	int rc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sge_eq *eq = &txq->eq;
 
 	rc = free_eq(sc, eq);
 	if (rc)
 		return (rc);
 
 	sglist_free(txq->gl);
 	free(txq->sdesc, M_CXGBE);
 	mp_ring_free(txq->r);
 
 	bzero(txq, sizeof(*txq));
 	return (0);
 }
 
 static void
 oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 {
 	bus_addr_t *ba = arg;
 
 	KASSERT(nseg == 1,
 	    ("%s meant for single segment mappings only.", __func__));
 
 	*ba = error ? 0 : segs->ds_addr;
 }
 
 static inline void
 ring_fl_db(struct adapter *sc, struct sge_fl *fl)
 {
 	uint32_t n, v;
 
 	n = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx);
 	MPASS(n > 0);
 
 	wmb();
 	v = fl->dbval | V_PIDX(n);
 	if (fl->udb)
 		*fl->udb = htole32(v);
 	else
 		t4_write_reg(sc, sc->sge_kdoorbell_reg, v);
 	IDXINCR(fl->dbidx, n, fl->sidx);
 }
 
 /*
  * Fills up the freelist by allocating up to 'n' buffers.  Buffers that are
  * recycled do not count towards this allocation budget.
  *
  * Returns non-zero to indicate that this freelist should be added to the list
  * of starving freelists.
  */
 static int
 refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
 {
 	__be64 *d;
 	struct fl_sdesc *sd;
 	uintptr_t pa;
 	caddr_t cl;
 	struct cluster_layout *cll;
 	struct sw_zone_info *swz;
 	struct cluster_metadata *clm;
 	uint16_t max_pidx;
 	uint16_t hw_cidx = fl->hw_cidx;		/* stable snapshot */
 
 	FL_LOCK_ASSERT_OWNED(fl);
 
 	/*
 	 * We always stop at the beginning of the hardware descriptor that's just
 	 * before the one with the hw cidx.  This is to avoid hw pidx = hw cidx,
 	 * which would mean an empty freelist to the chip.
 	 */
 	max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
 	if (fl->pidx == max_pidx * 8)
 		return (0);
 
 	d = &fl->desc[fl->pidx];
 	sd = &fl->sdesc[fl->pidx];
 	cll = &fl->cll_def;	/* default layout */
 	swz = &sc->sge.sw_zone_info[cll->zidx];
 
 	while (n > 0) {
 
 		if (sd->cl != NULL) {
 
 			if (sd->nmbuf == 0) {
 				/*
 				 * Fast recycle without involving any atomics on
 				 * the cluster's metadata (if the cluster has
 				 * metadata).  This happens when all frames
 				 * received in the cluster were small enough to
 				 * fit within a single mbuf each.
 				 */
 				fl->cl_fast_recycled++;
 #ifdef INVARIANTS
 				clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
 				if (clm != NULL)
 					MPASS(clm->refcount == 1);
 #endif
 				goto recycled_fast;
 			}
 
 			/*
 			 * Cluster is guaranteed to have metadata.  Clusters
 			 * without metadata always take the fast recycle path
 			 * when they're recycled.
 			 */
 			clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
 			MPASS(clm != NULL);
 
 			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
 				fl->cl_recycled++;
 				counter_u64_add(extfree_rels, 1);
 				goto recycled;
 			}
 			sd->cl = NULL;	/* gave up my reference */
 		}
 		MPASS(sd->cl == NULL);
 alloc:
 		cl = uma_zalloc(swz->zone, M_NOWAIT);
 		if (__predict_false(cl == NULL)) {
 			if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 ||
 			    fl->cll_def.zidx == fl->cll_alt.zidx)
 				break;
 
 			/* fall back to the safe zone */
 			cll = &fl->cll_alt;
 			swz = &sc->sge.sw_zone_info[cll->zidx];
 			goto alloc;
 		}
 		fl->cl_allocated++;
 		n--;
 
 		pa = pmap_kextract((vm_offset_t)cl);
 		pa += cll->region1;
 		sd->cl = cl;
 		sd->cll = *cll;
 		*d = htobe64(pa | cll->hwidx);
 		clm = cl_metadata(sc, fl, cll, cl);
 		if (clm != NULL) {
 recycled:
 #ifdef INVARIANTS
 			clm->sd = sd;
 #endif
 			clm->refcount = 1;
 		}
 		sd->nmbuf = 0;
 recycled_fast:
 		d++;
 		sd++;
 		if (__predict_false(++fl->pidx % 8 == 0)) {
 			uint16_t pidx = fl->pidx / 8;
 
 			if (__predict_false(pidx == fl->sidx)) {
 				fl->pidx = 0;
 				pidx = 0;
 				sd = fl->sdesc;
 				d = fl->desc;
 			}
 			if (pidx == max_pidx)
 				break;
 
 			if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
 				ring_fl_db(sc, fl);
 		}
 	}
 
 	if (fl->pidx / 8 != fl->dbidx)
 		ring_fl_db(sc, fl);
 
 	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
 }
 
 /*
  * Attempt to refill all starving freelists.
  */
 static void
 refill_sfl(void *arg)
 {
 	struct adapter *sc = arg;
 	struct sge_fl *fl, *fl_temp;
 
 	mtx_assert(&sc->sfl_lock, MA_OWNED);
 	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
 		FL_LOCK(fl);
 		refill_fl(sc, fl, 64);
 		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
 			TAILQ_REMOVE(&sc->sfl, fl, link);
 			fl->flags &= ~FL_STARVING;
 		}
 		FL_UNLOCK(fl);
 	}
 
 	if (!TAILQ_EMPTY(&sc->sfl))
 		callout_schedule(&sc->sfl_callout, hz / 5);
 }
 
 static int
 alloc_fl_sdesc(struct sge_fl *fl)
 {
 
 	fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	return (0);
 }
 
 static void
 free_fl_sdesc(struct adapter *sc, struct sge_fl *fl)
 {
 	struct fl_sdesc *sd;
 	struct cluster_metadata *clm;
 	struct cluster_layout *cll;
 	int i;
 
 	sd = fl->sdesc;
 	for (i = 0; i < fl->sidx * 8; i++, sd++) {
 		if (sd->cl == NULL)
 			continue;
 
 		cll = &sd->cll;
 		clm = cl_metadata(sc, fl, cll, sd->cl);
 		if (sd->nmbuf == 0)
 			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
 		else if (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1) {
 			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
 			counter_u64_add(extfree_rels, 1);
 		}
 		sd->cl = NULL;
 	}
 
 	free(fl->sdesc, M_CXGBE);
 	fl->sdesc = NULL;
 }
 
 static inline void
 get_pkt_gl(struct mbuf *m, struct sglist *gl)
 {
 	int rc;
 
 	M_ASSERTPKTHDR(m);
 
 	sglist_reset(gl);
 	rc = sglist_append_mbuf(gl, m);
 	if (__predict_false(rc != 0)) {
 		panic("%s: mbuf %p (%d segs) was vetted earlier but now fails "
 		    "with %d.", __func__, m, mbuf_nsegs(m), rc);
 	}
 
 	KASSERT(gl->sg_nseg == mbuf_nsegs(m),
 	    ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
 	    mbuf_nsegs(m), gl->sg_nseg));
 	KASSERT(gl->sg_nseg > 0 &&
 	    gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
 	    ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
 		gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
 }
 
 /*
  * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
  */
 static inline u_int
 txpkt_len16(u_int nsegs, u_int tso)
 {
 	u_int n;
 
 	MPASS(nsegs > 0);
 
 	nsegs--; /* first segment is part of ulptx_sgl */
 	n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) +
 	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
 	if (tso)
 		n += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(n, 16));
 }
 
 /*
  * len16 for a txpkt_vm WR with a GL.  Includes the firmware work
  * request header.
  */
 static inline u_int
 txpkt_vm_len16(u_int nsegs, u_int tso)
 {
 	u_int n;
 
 	MPASS(nsegs > 0);
 
 	nsegs--; /* first segment is part of ulptx_sgl */
 	n = sizeof(struct fw_eth_tx_pkt_vm_wr) +
 	    sizeof(struct cpl_tx_pkt_core) +
 	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
 	if (tso)
 		n += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(n, 16));
 }
 
 /*
  * len16 for a txpkts type 0 WR with a GL.  Does not include the firmware work
  * request header.
  */
 static inline u_int
 txpkts0_len16(u_int nsegs)
 {
 	u_int n;
 
 	MPASS(nsegs > 0);
 
 	nsegs--; /* first segment is part of ulptx_sgl */
 	n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) +
 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) +
 	    8 * ((3 * nsegs) / 2 + (nsegs & 1));
 
 	return (howmany(n, 16));
 }
 
 /*
  * len16 for a txpkts type 1 WR with a GL.  Does not include the firmware work
  * request header.
  */
 static inline u_int
 txpkts1_len16(void)
 {
 	u_int n;
 
 	n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
 
 	return (howmany(n, 16));
 }
 
 static inline u_int
 imm_payload(u_int ndesc)
 {
 	u_int n;
 
 	n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) -
 	    sizeof(struct cpl_tx_pkt_core);
 
 	return (n);
 }
 
 /*
  * Write a VM txpkt WR for this packet to the hardware descriptors, update the
  * software descriptor, and advance the pidx.  It is guaranteed that enough
  * descriptors are available.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq,
     struct fw_eth_tx_pkt_vm_wr *wr, struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
 	int csum_type, len16, ndesc, pktlen, nsegs;
 	caddr_t dst;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m0);
 	MPASS(available > 0 && available < eq->sidx);
 
 	len16 = mbuf_len16(m0);
 	nsegs = mbuf_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3[0] = 0;
 	wr->r3[1] = 0;
 	
 	/*
 	 * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci.
 	 * vlantci is ignored unless the ethtype is 0x8100, so it's
 	 * simpler to always copy it rather than making it
 	 * conditional.  Also, it seems that we do not have to set
 	 * vlantci or fake the ethtype when doing VLAN tag insertion.
 	 */
 	m_copydata(m0, 0, sizeof(struct ether_header) + 2, wr->ethmacdst);
 
 	csum_type = -1;
 	if (needs_tso(m0)) {
 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 		    m0->m_pkthdr.l4hlen > 0,
 		    ("%s: mbuf %p needs TSO but missing header lengths",
 			__func__, m0));
 
 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
 		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
 		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
 			ctrl |= V_LSO_ETHHDR_LEN(1);
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			ctrl |= F_LSO_IPV6;
 
 		lso->lso_ctrl = htobe32(ctrl);
 		lso->ipid_ofst = htobe16(0);
 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 		lso->seqno_offset = htobe32(0);
 		lso->len = htobe32(pktlen);
 
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			csum_type = TX_CSUM_TCPIP6;
 		else
 			csum_type = TX_CSUM_TCPIP;
 
 		cpl = (void *)(lso + 1);
 
 		txq->tso_wrs++;
 	} else {
 		if (m0->m_pkthdr.csum_flags & CSUM_IP_TCP)
 			csum_type = TX_CSUM_TCPIP;
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP_UDP)
 			csum_type = TX_CSUM_UDPIP;
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP6_TCP)
 			csum_type = TX_CSUM_TCPIP6;
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP6_UDP)
 			csum_type = TX_CSUM_UDPIP6;
 #if defined(INET)
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP) {
 			/*
 			 * XXX: The firmware appears to stomp on the
 			 * fragment/flags field of the IP header when
 			 * using TX_CSUM_IP.  Fall back to doing
 			 * software checksums.
 			 */
 			u_short *sump;
 			struct mbuf *m;
 			int offset;
 
 			m = m0;
 			offset = 0;
 			sump = m_advance(&m, &offset, m0->m_pkthdr.l2hlen +
 			    offsetof(struct ip, ip_sum));
 			*sump = in_cksum_skip(m0, m0->m_pkthdr.l2hlen +
 			    m0->m_pkthdr.l3hlen, m0->m_pkthdr.l2hlen);
 			m0->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 #endif
 
 		cpl = (void *)(wr + 1);
 	}
 
 	/* Checksum offload */
 	ctrl1 = 0;
 	if (needs_l3_csum(m0) == 0)
 		ctrl1 |= F_TXPKT_IPCSUM_DIS;
 	if (csum_type >= 0) {
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0,
 	    ("%s: mbuf %p needs checksum offload but missing header lengths",
 			__func__, m0));
 
 		if (chip_id(sc) <= CHELSIO_T5) {
 			ctrl1 |= V_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen -
 			    ETHER_HDR_LEN);
 		} else {
 			ctrl1 |= V_T6_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen -
 			    ETHER_HDR_LEN);
 		}
 		ctrl1 |= V_TXPKT_IPHDR_LEN(m0->m_pkthdr.l3hlen);
 		ctrl1 |= V_TXPKT_CSUM_TYPE(csum_type);
 	} else
 		ctrl1 |= F_TXPKT_L4CSUM_DIS;
 	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
 		txq->txcsum++;	/* some hardware assistance provided */
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD |
 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 		txq->vlan_insertion++;
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 
 	/*
 	 * A packet using TSO will use up an entire descriptor for the
 	 * firmware work request header, LSO CPL, and TX_PKT_XT CPL.
 	 * If this descriptor is the last descriptor in the ring, wrap
 	 * around to the front of the ring explicitly for the start of
 	 * the sgl.
 	 */
 	if (dst == (void *)&eq->desc[eq->sidx]) {
 		dst = (void *)&eq->desc[0];
 		write_gl_to_txd(txq, m0, &dst, 0);
 	} else
 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
 	txq->sgl_wrs++;
 
 	txq->txpkt_wrs++;
 
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write a raw WR to the hardware descriptors, update the software
  * descriptor, and advance the pidx.  It is guaranteed that enough
  * descriptors are available.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_raw_wr(struct sge_txq *txq, void *wr, struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct mbuf *m;
 	caddr_t dst;
 	int len16, ndesc;
 
 	len16 = mbuf_len16(m0);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	dst = wr;
 	for (m = m0; m != NULL; m = m->m_next)
 		copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
 
 	txq->raw_wrs++;
 
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write a txpkt WR for this packet to the hardware descriptors, update the
  * software descriptor, and advance the pidx.  It is guaranteed that enough
  * descriptors are available.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr,
     struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen, nsegs;
 	caddr_t dst;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m0);
 	MPASS(available > 0 && available < eq->sidx);
 
 	len16 = mbuf_len16(m0);
 	nsegs = mbuf_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	else if (pktlen <= imm_payload(2) && available >= 2) {
 		/* Immediate data.  Recalculate len16 and set nsegs to 0. */
 		ctrl += pktlen;
 		len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
 		    sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
 		nsegs = 0;
 	}
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	if (needs_tso(m0)) {
 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 		    m0->m_pkthdr.l4hlen > 0,
 		    ("%s: mbuf %p needs TSO but missing header lengths",
 			__func__, m0));
 
 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
 		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
 		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
 			ctrl |= V_LSO_ETHHDR_LEN(1);
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			ctrl |= F_LSO_IPV6;
 
 		lso->lso_ctrl = htobe32(ctrl);
 		lso->ipid_ofst = htobe16(0);
 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 		lso->seqno_offset = htobe32(0);
 		lso->len = htobe32(pktlen);
 
 		cpl = (void *)(lso + 1);
 
 		txq->tso_wrs++;
 	} else
 		cpl = (void *)(wr + 1);
 
 	/* Checksum offload */
 	ctrl1 = 0;
 	if (needs_l3_csum(m0) == 0)
 		ctrl1 |= F_TXPKT_IPCSUM_DIS;
 	if (needs_l4_csum(m0) == 0)
 		ctrl1 |= F_TXPKT_L4CSUM_DIS;
 	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
 		txq->txcsum++;	/* some hardware assistance provided */
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 		txq->vlan_insertion++;
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 	if (nsegs > 0) {
 
 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
 		txq->sgl_wrs++;
 	} else {
 		struct mbuf *m;
 
 		for (m = m0; m != NULL; m = m->m_next) {
 			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
 #ifdef INVARIANTS
 			pktlen -= m->m_len;
 #endif
 		}
 #ifdef INVARIANTS
 		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
 #endif
 		txq->imm_wrs++;
 	}
 
 	txq->txpkt_wrs++;
 
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 static int
 try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available)
 {
 	u_int needed, nsegs1, nsegs2, l1, l2;
 
 	if (cannot_use_txpkts(m) || cannot_use_txpkts(n))
 		return (1);
 
 	nsegs1 = mbuf_nsegs(m);
 	nsegs2 = mbuf_nsegs(n);
 	if (nsegs1 + nsegs2 == 2) {
 		txp->wr_type = 1;
 		l1 = l2 = txpkts1_len16();
 	} else {
 		txp->wr_type = 0;
 		l1 = txpkts0_len16(nsegs1);
 		l2 = txpkts0_len16(nsegs2);
 	}
 	txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2;
 	needed = howmany(txp->len16, EQ_ESIZE / 16);
 	if (needed > SGE_MAX_WR_NDESC || needed > available)
 		return (1);
 
 	txp->plen = m->m_pkthdr.len + n->m_pkthdr.len;
 	if (txp->plen > 65535)
 		return (1);
 
 	txp->npkt = 2;
 	set_mbuf_len16(m, l1);
 	set_mbuf_len16(n, l2);
 
 	return (0);
 }
 
 static int
 add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
 {
 	u_int plen, len16, needed, nsegs;
 
 	MPASS(txp->wr_type == 0 || txp->wr_type == 1);
 
 	if (cannot_use_txpkts(m))
 		return (1);
 
 	nsegs = mbuf_nsegs(m);
 	if (txp->wr_type == 1 && nsegs != 1)
 		return (1);
 
 	plen = txp->plen + m->m_pkthdr.len;
 	if (plen > 65535)
 		return (1);
 
 	if (txp->wr_type == 0)
 		len16 = txpkts0_len16(nsegs);
 	else
 		len16 = txpkts1_len16();
 	needed = howmany(txp->len16 + len16, EQ_ESIZE / 16);
 	if (needed > SGE_MAX_WR_NDESC || needed > available)
 		return (1);
 
 	txp->npkt++;
 	txp->plen = plen;
 	txp->len16 += len16;
 	set_mbuf_len16(m, len16);
 
 	return (0);
 }
 
 /*
  * Write a txpkts WR for the packets in txp to the hardware descriptors, update
  * the software descriptor, and advance the pidx.  It is guaranteed that enough
  * descriptors are available.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr,
     struct mbuf *m0, const struct txpkts *txp, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int ndesc, checkwrap;
 	struct mbuf *m;
 	void *flitp;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	MPASS(txp->npkt > 0);
 	MPASS(txp->plen < 65536);
 	MPASS(m0 != NULL);
 	MPASS(m0->m_nextpkt != NULL);
 	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
 	MPASS(available > 0 && available < eq->sidx);
 
 	ndesc = howmany(txp->len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
 	ctrl = V_FW_WR_LEN16(txp->len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->plen = htobe16(txp->plen);
 	wr->npkt = txp->npkt;
 	wr->r3 = 0;
 	wr->type = txp->wr_type;
 	flitp = wr + 1;
 
 	/*
 	 * At this point we are 16B into a hardware descriptor.  If checkwrap is
 	 * set then we know the WR is going to wrap around somewhere.  We'll
 	 * check for that at appropriate points.
 	 */
 	checkwrap = eq->sidx - ndesc < eq->pidx;
 	for (m = m0; m != NULL; m = m->m_nextpkt) {
 		if (txp->wr_type == 0) {
 			struct ulp_txpkt *ulpmc;
 			struct ulptx_idata *ulpsc;
 
 			/* ULP master command */
 			ulpmc = flitp;
 			ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
 			    V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
 			ulpmc->len = htobe32(mbuf_len16(m));
 
 			/* ULP subcommand */
 			ulpsc = (void *)(ulpmc + 1);
 			ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
 			    F_ULP_TX_SC_MORE);
 			ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
 
 			cpl = (void *)(ulpsc + 1);
 			if (checkwrap &&
 			    (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx])
 				cpl = (void *)&eq->desc[0];
 		} else {
 			cpl = flitp;
 		}
 
 		/* Checksum offload */
 		ctrl1 = 0;
 		if (needs_l3_csum(m) == 0)
 			ctrl1 |= F_TXPKT_IPCSUM_DIS;
 		if (needs_l4_csum(m) == 0)
 			ctrl1 |= F_TXPKT_L4CSUM_DIS;
 		if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
 		    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
 			txq->txcsum++;	/* some hardware assistance provided */
 
 		/* VLAN tag insertion */
 		if (needs_vlan_insertion(m)) {
 			ctrl1 |= F_TXPKT_VLAN_VLD |
 			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
 			txq->vlan_insertion++;
 		}
 
 		/* CPL header */
 		cpl->ctrl0 = txq->cpl_ctrl0;
 		cpl->pack = 0;
 		cpl->len = htobe16(m->m_pkthdr.len);
 		cpl->ctrl1 = htobe64(ctrl1);
 
 		flitp = cpl + 1;
 		if (checkwrap &&
 		    (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
 			flitp = (void *)&eq->desc[0];
 
 		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
 
 	}
 
 	if (txp->wr_type == 0) {
 		txq->txpkts0_pkts += txp->npkt;
 		txq->txpkts0_wrs++;
 	} else {
 		txq->txpkts1_pkts += txp->npkt;
 		txq->txpkts1_wrs++;
 	}
 
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * If the SGL ends on an address that is not 16 byte aligned, this function will
  * add a 0 filled flit at the end.
  */
 static void
 write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct sglist *gl = txq->gl;
 	struct sglist_seg *seg;
 	__be64 *flitp, *wrap;
 	struct ulptx_sgl *usgl;
 	int i, nflits, nsegs;
 
 	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	get_pkt_gl(m, gl);
 	nsegs = gl->sg_nseg;
 	MPASS(nsegs > 0);
 
 	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
 	flitp = (__be64 *)(*to);
 	wrap = (__be64 *)(&eq->desc[eq->sidx]);
 	seg = &gl->sg_segs[0];
 	usgl = (void *)flitp;
 
 	/*
 	 * We start at a 16 byte boundary somewhere inside the tx descriptor
 	 * ring, so we're at least 16 bytes away from the status page.  There is
 	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
 	 */
 
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 	usgl->len0 = htobe32(seg->ss_len);
 	usgl->addr0 = htobe64(seg->ss_paddr);
 	seg++;
 
 	if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {
 
 		/* Won't wrap around at all */
 
 		for (i = 0; i < nsegs - 1; i++, seg++) {
 			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
 			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
 		}
 		if (i & 1)
 			usgl->sge[i / 2].len[1] = htobe32(0);
 		flitp += nflits;
 	} else {
 
 		/* Will wrap somewhere in the rest of the SGL */
 
 		/* 2 flits already written, write the rest flit by flit */
 		flitp = (void *)(usgl + 1);
 		for (i = 0; i < nflits - 2; i++) {
 			if (flitp == wrap)
 				flitp = (void *)eq->desc;
 			*flitp++ = get_flit(seg, nsegs - 1, i);
 		}
 	}
 
 	if (nflits & 1) {
 		MPASS(((uintptr_t)flitp) & 0xf);
 		*flitp++ = 0;
 	}
 
 	MPASS((((uintptr_t)flitp) & 0xf) == 0);
 	if (__predict_false(flitp == wrap))
 		*to = (void *)eq->desc;
 	else
 		*to = (void *)flitp;
 }
 
 static inline void
 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
 {
 
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	if (__predict_true((uintptr_t)(*to) + len <=
 	    (uintptr_t)&eq->desc[eq->sidx])) {
 		bcopy(from, *to, len);
 		(*to) += len;
 	} else {
 		int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to);
 
 		bcopy(from, *to, portion);
 		from += portion;
 		portion = len - portion;	/* remaining */
 		bcopy(from, (void *)eq->desc, portion);
 		(*to) = (caddr_t)eq->desc + portion;
 	}
 }
 
 static inline void
 ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
 {
 	u_int db;
 
 	MPASS(n > 0);
 
 	db = eq->doorbells;
 	if (n > 1)
 		clrbit(&db, DOORBELL_WCWR);
 	wmb();
 
 	switch (ffs(db) - 1) {
 	case DOORBELL_UDB:
 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
 		break;
 
 	case DOORBELL_WCWR: {
 		volatile uint64_t *dst, *src;
 		int i;
 
 		/*
 		 * Queues whose 128B doorbell segment fits in the page do not
 		 * use relative qid (udb_qid is always 0).  Only queues with
 		 * doorbell segments can do WCWR.
 		 */
 		KASSERT(eq->udb_qid == 0 && n == 1,
 		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
 		    __func__, eq->doorbells, n, eq->dbidx, eq));
 
 		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
 		    UDBS_DB_OFFSET);
 		i = eq->dbidx;
 		src = (void *)&eq->desc[i];
 		while (src != (void *)&eq->desc[i + 1])
 			*dst++ = *src++;
 		wmb();
 		break;
 	}
 
 	case DOORBELL_UDBWC:
 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
 		wmb();
 		break;
 
 	case DOORBELL_KDB:
 		t4_write_reg(sc, sc->sge_kdoorbell_reg,
 		    V_QID(eq->cntxt_id) | V_PIDX(n));
 		break;
 	}
 
 	IDXINCR(eq->dbidx, n, eq->sidx);
 }
 
 static inline u_int
 reclaimable_tx_desc(struct sge_eq *eq)
 {
 	uint16_t hw_cidx;
 
 	hw_cidx = read_hw_cidx(eq);
 	return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx));
 }
 
 static inline u_int
 total_available_tx_desc(struct sge_eq *eq)
 {
 	uint16_t hw_cidx, pidx;
 
 	hw_cidx = read_hw_cidx(eq);
 	pidx = eq->pidx;
 
 	if (pidx == hw_cidx)
 		return (eq->sidx - 1);
 	else
 		return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1);
 }
 
 static inline uint16_t
 read_hw_cidx(struct sge_eq *eq)
 {
 	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
 	uint16_t cidx = spg->cidx;	/* stable snapshot */
 
 	return (be16toh(cidx));
 }
 
 /*
  * Reclaim 'n' descriptors approximately.
  */
 static u_int
 reclaim_tx_descs(struct sge_txq *txq, u_int n)
 {
 	struct tx_sdesc *txsd;
 	struct sge_eq *eq = &txq->eq;
 	u_int can_reclaim, reclaimed;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	MPASS(n > 0);
 
 	reclaimed = 0;
 	can_reclaim = reclaimable_tx_desc(eq);
 	while (can_reclaim && reclaimed < n) {
 		int ndesc;
 		struct mbuf *m, *nextpkt;
 
 		txsd = &txq->sdesc[eq->cidx];
 		ndesc = txsd->desc_used;
 
 		/* Firmware doesn't return "partial" credits. */
 		KASSERT(can_reclaim >= ndesc,
 		    ("%s: unexpected number of credits: %d, %d",
 		    __func__, can_reclaim, ndesc));
 		KASSERT(ndesc != 0,
 		    ("%s: descriptor with no credits: cidx %d",
 		    __func__, eq->cidx));
 
 		for (m = txsd->m; m != NULL; m = nextpkt) {
 			nextpkt = m->m_nextpkt;
 			m->m_nextpkt = NULL;
 			m_freem(m);
 		}
 		reclaimed += ndesc;
 		can_reclaim -= ndesc;
 		IDXINCR(eq->cidx, ndesc, eq->sidx);
 	}
 
 	return (reclaimed);
 }
 
 static void
 tx_reclaim(void *arg, int n)
 {
 	struct sge_txq *txq = arg;
 	struct sge_eq *eq = &txq->eq;
 
 	do {
 		if (TXQ_TRYLOCK(txq) == 0)
 			break;
 		n = reclaim_tx_descs(txq, 32);
 		if (eq->cidx == eq->pidx)
 			eq->equeqidx = eq->pidx;
 		TXQ_UNLOCK(txq);
 	} while (n > 0);
 }
 
 static __be64
 get_flit(struct sglist_seg *segs, int nsegs, int idx)
 {
 	int i = (idx / 3) * 2;
 
 	switch (idx % 3) {
 	case 0: {
 		uint64_t rc;
 
 		rc = (uint64_t)segs[i].ss_len << 32;
 		if (i + 1 < nsegs)
 			rc |= (uint64_t)(segs[i + 1].ss_len);
 
 		return (htobe64(rc));
 	}
 	case 1:
 		return (htobe64(segs[i].ss_paddr));
 	case 2:
 		return (htobe64(segs[i + 1].ss_paddr));
 	}
 
 	return (0);
 }
 
 static void
 find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp)
 {
 	int8_t zidx, hwidx, idx;
 	uint16_t region1, region3;
 	int spare, spare_needed, n;
 	struct sw_zone_info *swz;
 	struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0];
 
 	/*
 	 * Buffer Packing: Look for PAGE_SIZE or larger zone which has a bufsize
 	 * large enough for the max payload and cluster metadata.  Otherwise
 	 * settle for the largest bufsize that leaves enough room in the cluster
 	 * for metadata.
 	 *
 	 * Without buffer packing: Look for the smallest zone which has a
 	 * bufsize large enough for the max payload.  Settle for the largest
 	 * bufsize available if there's nothing big enough for max payload.
 	 */
 	spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0;
 	swz = &sc->sge.sw_zone_info[0];
 	hwidx = -1;
 	for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) {
 		if (swz->size > largest_rx_cluster) {
 			if (__predict_true(hwidx != -1))
 				break;
 
 			/*
 			 * This is a misconfiguration.  largest_rx_cluster is
 			 * preventing us from finding a refill source.  See
 			 * dev.t5nex.<n>.buffer_sizes to figure out why.
 			 */
 			device_printf(sc->dev, "largest_rx_cluster=%u leaves no"
 			    " refill source for fl %p (dma %u).  Ignored.\n",
 			    largest_rx_cluster, fl, maxp);
 		}
 		for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) {
 			hwb = &hwb_list[idx];
 			spare = swz->size - hwb->size;
 			if (spare < spare_needed)
 				continue;
 
 			hwidx = idx;		/* best option so far */
 			if (hwb->size >= maxp) {
 
 				if ((fl->flags & FL_BUF_PACKING) == 0)
 					goto done; /* stop looking (not packing) */
 
 				if (swz->size >= safest_rx_cluster)
 					goto done; /* stop looking (packing) */
 			}
 			break;		/* keep looking, next zone */
 		}
 	}
 done:
 	/* A usable hwidx has been located. */
 	MPASS(hwidx != -1);
 	hwb = &hwb_list[hwidx];
 	zidx = hwb->zidx;
 	swz = &sc->sge.sw_zone_info[zidx];
 	region1 = 0;
 	region3 = swz->size - hwb->size;
 
 	/*
 	 * Stay within this zone and see if there is a better match when mbuf
 	 * inlining is allowed.  Remember that the hwidx's are sorted in
 	 * decreasing order of size (so in increasing order of spare area).
 	 */
 	for (idx = hwidx; idx != -1; idx = hwb->next) {
 		hwb = &hwb_list[idx];
 		spare = swz->size - hwb->size;
 
 		if (allow_mbufs_in_cluster == 0 || hwb->size < maxp)
 			break;
 
 		/*
 		 * Do not inline mbufs if doing so would violate the pad/pack
 		 * boundary alignment requirement.
 		 */
 		if (fl_pad && (MSIZE % sc->params.sge.pad_boundary) != 0)
 			continue;
 		if (fl->flags & FL_BUF_PACKING &&
 		    (MSIZE % sc->params.sge.pack_boundary) != 0)
 			continue;
 
 		if (spare < CL_METADATA_SIZE + MSIZE)
 			continue;
 		n = (spare - CL_METADATA_SIZE) / MSIZE;
 		if (n > howmany(hwb->size, maxp))
 			break;
 
 		hwidx = idx;
 		if (fl->flags & FL_BUF_PACKING) {
 			region1 = n * MSIZE;
 			region3 = spare - region1;
 		} else {
 			region1 = MSIZE;
 			region3 = spare - region1;
 			break;
 		}
 	}
 
 	KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES,
 	    ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp));
 	KASSERT(hwidx >= 0 && hwidx <= SGE_FLBUF_SIZES,
 	    ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp));
 	KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 ==
 	    sc->sge.sw_zone_info[zidx].size,
 	    ("%s: bad buffer layout for fl %p, maxp %d. "
 		"cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
 		sc->sge.sw_zone_info[zidx].size, region1,
 		sc->sge.hw_buf_info[hwidx].size, region3));
 	if (fl->flags & FL_BUF_PACKING || region1 > 0) {
 		KASSERT(region3 >= CL_METADATA_SIZE,
 		    ("%s: no room for metadata.  fl %p, maxp %d; "
 		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
 		    sc->sge.sw_zone_info[zidx].size, region1,
 		    sc->sge.hw_buf_info[hwidx].size, region3));
 		KASSERT(region1 % MSIZE == 0,
 		    ("%s: bad mbuf region for fl %p, maxp %d. "
 		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
 		    sc->sge.sw_zone_info[zidx].size, region1,
 		    sc->sge.hw_buf_info[hwidx].size, region3));
 	}
 
 	fl->cll_def.zidx = zidx;
 	fl->cll_def.hwidx = hwidx;
 	fl->cll_def.region1 = region1;
 	fl->cll_def.region3 = region3;
 }
 
 static void
 find_safe_refill_source(struct adapter *sc, struct sge_fl *fl)
 {
 	struct sge *s = &sc->sge;
 	struct hw_buf_info *hwb;
 	struct sw_zone_info *swz;
 	int spare;
 	int8_t hwidx;
 
 	if (fl->flags & FL_BUF_PACKING)
 		hwidx = s->safe_hwidx2;	/* with room for metadata */
 	else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) {
 		hwidx = s->safe_hwidx2;
 		hwb = &s->hw_buf_info[hwidx];
 		swz = &s->sw_zone_info[hwb->zidx];
 		spare = swz->size - hwb->size;
 
 		/* no good if there isn't room for an mbuf as well */
 		if (spare < CL_METADATA_SIZE + MSIZE)
 			hwidx = s->safe_hwidx1;
 	} else
 		hwidx = s->safe_hwidx1;
 
 	if (hwidx == -1) {
 		/* No fallback source */
 		fl->cll_alt.hwidx = -1;
 		fl->cll_alt.zidx = -1;
 
 		return;
 	}
 
 	hwb = &s->hw_buf_info[hwidx];
 	swz = &s->sw_zone_info[hwb->zidx];
 	spare = swz->size - hwb->size;
 	fl->cll_alt.hwidx = hwidx;
 	fl->cll_alt.zidx = hwb->zidx;
 	if (allow_mbufs_in_cluster &&
 	    (fl_pad == 0 || (MSIZE % sc->params.sge.pad_boundary) == 0))
 		fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE;
 	else
 		fl->cll_alt.region1 = 0;
 	fl->cll_alt.region3 = spare - fl->cll_alt.region1;
 }
 
 static void
 add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
 {
 	mtx_lock(&sc->sfl_lock);
 	FL_LOCK(fl);
 	if ((fl->flags & FL_DOOMED) == 0) {
 		fl->flags |= FL_STARVING;
 		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
 		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
 	}
 	FL_UNLOCK(fl);
 	mtx_unlock(&sc->sfl_lock);
 }
 
 static void
 handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
 {
 	struct sge_wrq *wrq = (void *)eq;
 
 	atomic_readandclear_int(&eq->equiq);
 	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
 }
 
 static void
 handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
 {
 	struct sge_txq *txq = (void *)eq;
 
 	MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
 
 	atomic_readandclear_int(&eq->equiq);
 	mp_ring_check_drainage(txq->r, 0);
 	taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
 }
 
 static int
 handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
 	unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid));
 	struct adapter *sc = iq->adapter;
 	struct sge *s = &sc->sge;
 	struct sge_eq *eq;
 	static void (*h[])(struct adapter *, struct sge_eq *) = {NULL,
 		&handle_wrq_egr_update, &handle_eth_egr_update,
 		&handle_wrq_egr_update};
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	eq = s->eqmap[qid - s->eq_start - s->eq_base];
 	(*h[eq->flags & EQ_TYPEMASK])(sc, eq);
 
 	return (0);
 }
 
 /* handle_fw_msg works for both fw4_msg and fw6_msg because this is valid */
 CTASSERT(offsetof(struct cpl_fw4_msg, data) == \
     offsetof(struct cpl_fw6_msg, data));
 
 static int
 handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) {
 		const struct rss_header *rss2;
 
 		rss2 = (const struct rss_header *)&cpl->data[0];
 		return (t4_cpl_handler[rss2->opcode](iq, rss2, m));
 	}
 
 	return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0]));
 }
 
 /**
  *	t4_handle_wrerr_rpl - process a FW work request error message
  *	@adap: the adapter
  *	@rpl: start of the FW message
  */
 static int
 t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl)
 {
 	u8 opcode = *(const u8 *)rpl;
 	const struct fw_error_cmd *e = (const void *)rpl;
 	unsigned int i;
 
 	if (opcode != FW_ERROR_CMD) {
 		log(LOG_ERR,
 		    "%s: Received WRERR_RPL message with opcode %#x\n",
 		    device_get_nameunit(adap->dev), opcode);
 		return (EINVAL);
 	}
 	log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev),
 	    G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" :
 	    "non-fatal");
 	switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) {
 	case FW_ERROR_TYPE_EXCEPTION:
 		log(LOG_ERR, "exception info:\n");
 		for (i = 0; i < nitems(e->u.exception.info); i++)
 			log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ",
 			    be32toh(e->u.exception.info[i]));
 		log(LOG_ERR, "\n");
 		break;
 	case FW_ERROR_TYPE_HWMODULE:
 		log(LOG_ERR, "HW module regaddr %08x regval %08x\n",
 		    be32toh(e->u.hwmodule.regaddr),
 		    be32toh(e->u.hwmodule.regval));
 		break;
 	case FW_ERROR_TYPE_WR:
 		log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n",
 		    be16toh(e->u.wr.cidx),
 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)),
 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)),
 		    be32toh(e->u.wr.eqid));
 		for (i = 0; i < nitems(e->u.wr.wrhdr); i++)
 			log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ",
 			    e->u.wr.wrhdr[i]);
 		log(LOG_ERR, "\n");
 		break;
 	case FW_ERROR_TYPE_ACL:
 		log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s",
 		    be16toh(e->u.acl.cidx),
 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)),
 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)),
 		    be32toh(e->u.acl.eqid),
 		    G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? "vlanid" :
 		    "MAC");
 		for (i = 0; i < nitems(e->u.acl.val); i++)
 			log(LOG_ERR, " %02x", e->u.acl.val[i]);
 		log(LOG_ERR, "\n");
 		break;
 	default:
 		log(LOG_ERR, "type %#x\n",
 		    G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type)));
 		return (EINVAL);
 	}
 	return (0);
 }
 
 static int
 sysctl_uint16(SYSCTL_HANDLER_ARGS)
 {
 	uint16_t *id = arg1;
 	int i = *id;
 
 	return sysctl_handle_int(oidp, &i, 0, req);
 }
 
 static int
 sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
 {
 	struct sge *s = arg1;
 	struct hw_buf_info *hwb = &s->hw_buf_info[0];
 	struct sw_zone_info *swz = &s->sw_zone_info[0];
 	int i, rc;
 	struct sbuf sb;
 	char c;
 
 	sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND);
 	for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) {
 		if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster)
 			c = '*';
 		else
 			c = '\0';
 
 		sbuf_printf(&sb, "%u%c ", hwb->size, c);
 	}
 	sbuf_trim(&sb);
 	sbuf_finish(&sb);
 	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
 	sbuf_delete(&sb);
 	return (rc);
 }
 
 #ifdef RATELIMIT
 /*
  * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
  */
 static inline u_int
 txpkt_eo_len16(u_int nsegs, u_int immhdrs, u_int tso)
 {
 	u_int n;
 
 	MPASS(immhdrs > 0);
 
 	n = roundup2(sizeof(struct fw_eth_tx_eo_wr) +
 	    sizeof(struct cpl_tx_pkt_core) + immhdrs, 16);
 	if (__predict_false(nsegs == 0))
 		goto done;
 
 	nsegs--; /* first segment is part of ulptx_sgl */
 	n += sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
 	if (tso)
 		n += sizeof(struct cpl_tx_pkt_lso_core);
 
 done:
 	return (howmany(n, 16));
 }
 
 #define ETID_FLOWC_NPARAMS 6
 #define ETID_FLOWC_LEN (roundup2((sizeof(struct fw_flowc_wr) + \
     ETID_FLOWC_NPARAMS * sizeof(struct fw_flowc_mnemval)), 16))
 #define ETID_FLOWC_LEN16 (howmany(ETID_FLOWC_LEN, 16))
 
 static int
 send_etid_flowc_wr(struct cxgbe_snd_tag *cst, struct port_info *pi,
     struct vi_info *vi)
 {
 	struct wrq_cookie cookie;
 	u_int pfvf = pi->adapter->pf << S_FW_VIID_PFN;
 	struct fw_flowc_wr *flowc;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 	MPASS((cst->flags & (EO_FLOWC_PENDING | EO_FLOWC_RPL_PENDING)) ==
 	    EO_FLOWC_PENDING);
 
 	flowc = start_wrq_wr(cst->eo_txq, ETID_FLOWC_LEN16, &cookie);
 	if (__predict_false(flowc == NULL))
 		return (ENOMEM);
 
 	bzero(flowc, ETID_FLOWC_LEN);
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(ETID_FLOWC_NPARAMS) | V_FW_WR_COMPL(0));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(ETID_FLOWC_LEN16) |
 	    V_FW_WR_FLOWID(cst->etid));
 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
 	flowc->mnemval[0].val = htobe32(pfvf);
 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
 	flowc->mnemval[1].val = htobe32(pi->tx_chan);
 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
 	flowc->mnemval[2].val = htobe32(pi->tx_chan);
 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
 	flowc->mnemval[3].val = htobe32(cst->iqid);
 	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_EOSTATE;
 	flowc->mnemval[4].val = htobe32(FW_FLOWC_MNEM_EOSTATE_ESTABLISHED);
 	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
 	flowc->mnemval[5].val = htobe32(cst->schedcl);
 
 	commit_wrq_wr(cst->eo_txq, flowc, &cookie);
 
 	cst->flags &= ~EO_FLOWC_PENDING;
 	cst->flags |= EO_FLOWC_RPL_PENDING;
 	MPASS(cst->tx_credits >= ETID_FLOWC_LEN16);	/* flowc is first WR. */
 	cst->tx_credits -= ETID_FLOWC_LEN16;
 
 	return (0);
 }
 
 #define ETID_FLUSH_LEN16 (howmany(sizeof (struct fw_flowc_wr), 16))
 
 void
 send_etid_flush_wr(struct cxgbe_snd_tag *cst)
 {
 	struct fw_flowc_wr *flowc;
 	struct wrq_cookie cookie;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 
 	flowc = start_wrq_wr(cst->eo_txq, ETID_FLUSH_LEN16, &cookie);
 	if (__predict_false(flowc == NULL))
 		CXGBE_UNIMPLEMENTED(__func__);
 
 	bzero(flowc, ETID_FLUSH_LEN16 * 16);
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(0) | F_FW_WR_COMPL);
 	flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLUSH_LEN16) |
 	    V_FW_WR_FLOWID(cst->etid));
 
 	commit_wrq_wr(cst->eo_txq, flowc, &cookie);
 
 	cst->flags |= EO_FLUSH_RPL_PENDING;
 	MPASS(cst->tx_credits >= ETID_FLUSH_LEN16);
 	cst->tx_credits -= ETID_FLUSH_LEN16;
 	cst->ncompl++;
 }
 
 static void
 write_ethofld_wr(struct cxgbe_snd_tag *cst, struct fw_eth_tx_eo_wr *wr,
     struct mbuf *m0, int compl)
 {
 	struct cpl_tx_pkt_core *cpl;
 	uint64_t ctrl1;
 	uint32_t ctrl;	/* used in many unrelated places */
 	int len16, pktlen, nsegs, immhdrs;
 	caddr_t dst;
 	uintptr_t p;
 	struct ulptx_sgl *usgl;
 	struct sglist sg;
 	struct sglist_seg segs[38];	/* XXX: find real limit.  XXX: get off the stack */
 
 	mtx_assert(&cst->lock, MA_OWNED);
 	M_ASSERTPKTHDR(m0);
 	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 	    m0->m_pkthdr.l4hlen > 0,
 	    ("%s: ethofld mbuf %p is missing header lengths", __func__, m0));
 
 	len16 = mbuf_eo_len16(m0);
 	nsegs = mbuf_eo_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen;
 	ctrl += immhdrs;
 
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_EO_WR) |
 	    V_FW_ETH_TX_EO_WR_IMMDLEN(ctrl) | V_FW_WR_COMPL(!!compl));
 	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) |
 	    V_FW_WR_FLOWID(cst->etid));
 	wr->r3 = 0;
 	if (needs_udp_csum(m0)) {
 		wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG;
 		wr->u.udpseg.ethlen = m0->m_pkthdr.l2hlen;
 		wr->u.udpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
 		wr->u.udpseg.udplen = m0->m_pkthdr.l4hlen;
 		wr->u.udpseg.rtplen = 0;
 		wr->u.udpseg.r4 = 0;
 		wr->u.udpseg.mss = htobe16(pktlen - immhdrs);
 		wr->u.udpseg.schedpktsize = wr->u.udpseg.mss;
 		wr->u.udpseg.plen = htobe32(pktlen - immhdrs);
 		cpl = (void *)(wr + 1);
 	} else {
 		MPASS(needs_tcp_csum(m0));
 		wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG;
 		wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen;
 		wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
 		wr->u.tcpseg.tcplen = m0->m_pkthdr.l4hlen;
 		wr->u.tcpseg.tsclk_tsoff = mbuf_eo_tsclk_tsoff(m0);
 		wr->u.tcpseg.r4 = 0;
 		wr->u.tcpseg.r5 = 0;
 		wr->u.tcpseg.plen = htobe32(pktlen - immhdrs);
 
 		if (needs_tso(m0)) {
 			struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 			wr->u.tcpseg.mss = htobe16(m0->m_pkthdr.tso_segsz);
 
 			ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) |
 			    F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE |
 			    V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
 			    V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 			if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
 				ctrl |= V_LSO_ETHHDR_LEN(1);
 			if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 				ctrl |= F_LSO_IPV6;
 			lso->lso_ctrl = htobe32(ctrl);
 			lso->ipid_ofst = htobe16(0);
 			lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 			lso->seqno_offset = htobe32(0);
 			lso->len = htobe32(pktlen);
 
 			cpl = (void *)(lso + 1);
 		} else {
 			wr->u.tcpseg.mss = htobe16(0xffff);
 			cpl = (void *)(wr + 1);
 		}
 	}
 
 	/* Checksum offload must be requested for ethofld. */
 	ctrl1 = 0;
 	MPASS(needs_l4_csum(m0));
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD |
 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = cst->ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* Copy Ethernet, IP & TCP/UDP hdrs as immediate data */
 	p = (uintptr_t)(cpl + 1);
 	m_copydata(m0, 0, immhdrs, (void *)p);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 	if (nsegs > 0) {
 		int i, pad;
 
 		/* zero-pad upto next 16Byte boundary, if not 16Byte aligned */
 		p += immhdrs;
 		pad = 16 - (immhdrs & 0xf);
 		bzero((void *)p, pad);
 
 		usgl = (void *)(p + pad);
 		usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 		    V_ULPTX_NSGE(nsegs));
 
 		sglist_init(&sg, nitems(segs), segs);
 		for (; m0 != NULL; m0 = m0->m_next) {
 			if (__predict_false(m0->m_len == 0))
 				continue;
 			if (immhdrs >= m0->m_len) {
 				immhdrs -= m0->m_len;
 				continue;
 			}
 
 			sglist_append(&sg, mtod(m0, char *) + immhdrs,
 			    m0->m_len - immhdrs);
 			immhdrs = 0;
 		}
 		MPASS(sg.sg_nseg == nsegs);
 
 		/*
 		 * Zero pad last 8B in case the WR doesn't end on a 16B
 		 * boundary.
 		 */
 		*(uint64_t *)((char *)wr + len16 * 16 - 8) = 0;
 
 		usgl->len0 = htobe32(segs[0].ss_len);
 		usgl->addr0 = htobe64(segs[0].ss_paddr);
 		for (i = 0; i < nsegs - 1; i++) {
 			usgl->sge[i / 2].len[i & 1] = htobe32(segs[i + 1].ss_len);
 			usgl->sge[i / 2].addr[i & 1] = htobe64(segs[i + 1].ss_paddr);
 		}
 		if (i & 1)
 			usgl->sge[i / 2].len[1] = htobe32(0);
 	}
 
 }
 
 static void
 ethofld_tx(struct cxgbe_snd_tag *cst)
 {
 	struct mbuf *m;
 	struct wrq_cookie cookie;
 	int next_credits, compl;
 	struct fw_eth_tx_eo_wr *wr;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 
 	while ((m = mbufq_first(&cst->pending_tx)) != NULL) {
 		M_ASSERTPKTHDR(m);
 
 		/* How many len16 credits do we need to send this mbuf. */
 		next_credits = mbuf_eo_len16(m);
 		MPASS(next_credits > 0);
 		if (next_credits > cst->tx_credits) {
 			/*
 			 * Tx will make progress eventually because there is at
 			 * least one outstanding fw4_ack that will return
 			 * credits and kick the tx.
 			 */
 			MPASS(cst->ncompl > 0);
 			return;
 		}
 		wr = start_wrq_wr(cst->eo_txq, next_credits, &cookie);
 		if (__predict_false(wr == NULL)) {
 			/* XXX: wishful thinking, not a real assertion. */
 			MPASS(cst->ncompl > 0);
 			return;
 		}
 		cst->tx_credits -= next_credits;
 		cst->tx_nocompl += next_credits;
 		compl = cst->ncompl == 0 || cst->tx_nocompl >= cst->tx_total / 2;
 		ETHER_BPF_MTAP(cst->com.ifp, m);
 		write_ethofld_wr(cst, wr, m, compl);
 		commit_wrq_wr(cst->eo_txq, wr, &cookie);
 		if (compl) {
 			cst->ncompl++;
 			cst->tx_nocompl	= 0;
 		}
 		(void) mbufq_dequeue(&cst->pending_tx);
+
+		/*
+		 * Drop the mbuf's reference on the tag now rather
+		 * than waiting until m_freem().  This ensures that
+		 * cxgbe_snd_tag_free gets called when the inp drops
+		 * its reference on the tag and there are no more
+		 * mbufs in the pending_tx queue and can flush any
+		 * pending requests.  Otherwise if the last mbuf
+		 * doesn't request a completion the etid will never be
+		 * released.
+		 */
+		m->m_pkthdr.snd_tag = NULL;
+		m->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
+		m_snd_tag_rele(&cst->com);
+
 		mbufq_enqueue(&cst->pending_fwack, m);
 	}
 }
 
 int
 ethofld_transmit(struct ifnet *ifp, struct mbuf *m0)
 {
 	struct cxgbe_snd_tag *cst;
 	int rc;
 
 	MPASS(m0->m_nextpkt == NULL);
+	MPASS(m0->m_pkthdr.csum_flags & CSUM_SND_TAG);
 	MPASS(m0->m_pkthdr.snd_tag != NULL);
 	cst = mst_to_cst(m0->m_pkthdr.snd_tag);
 
 	mtx_lock(&cst->lock);
 	MPASS(cst->flags & EO_SND_TAG_REF);
 
 	if (__predict_false(cst->flags & EO_FLOWC_PENDING)) {
 		struct vi_info *vi = ifp->if_softc;
 		struct port_info *pi = vi->pi;
 		struct adapter *sc = pi->adapter;
 		const uint32_t rss_mask = vi->rss_size - 1;
 		uint32_t rss_hash;
 
 		cst->eo_txq = &sc->sge.ofld_txq[vi->first_ofld_txq];
 		if (M_HASHTYPE_ISHASH(m0))
 			rss_hash = m0->m_pkthdr.flowid;
 		else
 			rss_hash = arc4random();
 		/* We assume RSS hashing */
 		cst->iqid = vi->rss[rss_hash & rss_mask];
 		cst->eo_txq += rss_hash % vi->nofldtxq;
 		rc = send_etid_flowc_wr(cst, pi, vi);
 		if (rc != 0)
 			goto done;
 	}
 
 	if (__predict_false(cst->plen + m0->m_pkthdr.len > eo_max_backlog)) {
 		rc = ENOBUFS;
 		goto done;
 	}
 
 	mbufq_enqueue(&cst->pending_tx, m0);
 	cst->plen += m0->m_pkthdr.len;
 
+	/*
+	 * Hold an extra reference on the tag while generating work
+	 * requests to ensure that we don't try to free the tag during
+	 * ethofld_tx() in case we are sending the final mbuf after
+	 * the inp was freed.
+	 */
+	m_snd_tag_ref(&cst->com);
 	ethofld_tx(cst);
-	rc = 0;
+	mtx_unlock(&cst->lock);
+	m_snd_tag_rele(&cst->com);
+	return (0);
+
 done:
 	mtx_unlock(&cst->lock);
 	if (__predict_false(rc != 0))
 		m_freem(m0);
 	return (rc);
 }
 
 static int
 ethofld_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	struct mbuf *m;
 	u_int etid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 	struct cxgbe_snd_tag *cst;
 	uint8_t credits = cpl->credits;
 
 	cst = lookup_etid(sc, etid);
 	mtx_lock(&cst->lock);
 	if (__predict_false(cst->flags & EO_FLOWC_RPL_PENDING)) {
 		MPASS(credits >= ETID_FLOWC_LEN16);
 		credits -= ETID_FLOWC_LEN16;
 		cst->flags &= ~EO_FLOWC_RPL_PENDING;
 	}
 
 	KASSERT(cst->ncompl > 0,
 	    ("%s: etid %u (%p) wasn't expecting completion.",
 	    __func__, etid, cst));
 	cst->ncompl--;
 
 	while (credits > 0) {
 		m = mbufq_dequeue(&cst->pending_fwack);
 		if (__predict_false(m == NULL)) {
 			/*
 			 * The remaining credits are for the final flush that
 			 * was issued when the tag was freed by the kernel.
 			 */
 			MPASS((cst->flags &
 			    (EO_FLUSH_RPL_PENDING | EO_SND_TAG_REF)) ==
 			    EO_FLUSH_RPL_PENDING);
 			MPASS(credits == ETID_FLUSH_LEN16);
 			MPASS(cst->tx_credits + cpl->credits == cst->tx_total);
 			MPASS(cst->ncompl == 0);
 
 			cst->flags &= ~EO_FLUSH_RPL_PENDING;
 			cst->tx_credits += cpl->credits;
-freetag:
 			cxgbe_snd_tag_free_locked(cst);
 			return (0);	/* cst is gone. */
 		}
 		KASSERT(m != NULL,
 		    ("%s: too many credits (%u, %u)", __func__, cpl->credits,
 		    credits));
 		KASSERT(credits >= mbuf_eo_len16(m),
 		    ("%s: too few credits (%u, %u, %u)", __func__,
 		    cpl->credits, credits, mbuf_eo_len16(m)));
 		credits -= mbuf_eo_len16(m);
 		cst->plen -= m->m_pkthdr.len;
 		m_freem(m);
 	}
 
 	cst->tx_credits += cpl->credits;
 	MPASS(cst->tx_credits <= cst->tx_total);
 
-	m = mbufq_first(&cst->pending_tx);
-	if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m))
-		ethofld_tx(cst);
-
-	if (__predict_false((cst->flags & EO_SND_TAG_REF) == 0) &&
-	    cst->ncompl == 0) {
-		if (cst->tx_credits == cst->tx_total)
-			goto freetag;
-		else {
-			MPASS((cst->flags & EO_FLUSH_RPL_PENDING) == 0);
-			send_etid_flush_wr(cst);
-		}
+	if (cst->flags & EO_SND_TAG_REF) {
+		/*
+		 * As with ethofld_transmit(), hold an extra reference
+		 * so that the tag is stable across ethold_tx().
+		 */
+		m_snd_tag_ref(&cst->com);	
+		m = mbufq_first(&cst->pending_tx);
+		if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m))
+			ethofld_tx(cst);
+		mtx_unlock(&cst->lock);
+		m_snd_tag_rele(&cst->com);
+	} else {
+		/*
+		 * There shouldn't be any pending packets if the tag
+		 * was freed by the kernel since any pending packet
+		 * should hold a reference to the tag.
+		 */
+		MPASS(mbufq_first(&cst->pending_tx) == NULL);
+		mtx_unlock(&cst->lock);
 	}
-
-	mtx_unlock(&cst->lock);
 
 	return (0);
 }
 #endif
Index: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
===================================================================
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c	(revision 348253)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c	(revision 348254)
@@ -1,4490 +1,4491 @@
 /*-
  * Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "en.h"
 
 #include <sys/eventhandler.h>
 #include <sys/sockio.h>
 #include <machine/atomic.h>
 
 #ifndef ETH_DRIVER_VERSION
 #define	ETH_DRIVER_VERSION	"3.5.1"
 #endif
 #define DRIVER_RELDATE	"April 2019"
 
 static const char mlx5e_version[] = "mlx5en: Mellanox Ethernet driver "
 	ETH_DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
 
 static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs);
 
 struct mlx5e_channel_param {
 	struct mlx5e_rq_param rq;
 	struct mlx5e_sq_param sq;
 	struct mlx5e_cq_param rx_cq;
 	struct mlx5e_cq_param tx_cq;
 };
 
 struct media {
 	u32	subtype;
 	u64	baudrate;
 };
 
 static const struct media mlx5e_mode_table[MLX5E_LINK_SPEEDS_NUMBER][MLX5E_LINK_MODES_NUMBER] = {
 
 	[MLX5E_1000BASE_CX_SGMII][MLX5E_SGMII] = {
 		.subtype = IFM_1000_CX_SGMII,
 		.baudrate = IF_Mbps(1000ULL),
 	},
 	[MLX5E_1000BASE_KX][MLX5E_KX] = {
 		.subtype = IFM_1000_KX,
 		.baudrate = IF_Mbps(1000ULL),
 	},
 	[MLX5E_10GBASE_CX4][MLX5E_CX4] = {
 		.subtype = IFM_10G_CX4,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_KX4][MLX5E_KX4] = {
 		.subtype = IFM_10G_KX4,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_KR][MLX5E_KR] = {
 		.subtype = IFM_10G_KR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_20GBASE_KR2][MLX5E_KR2] = {
 		.subtype = IFM_20G_KR2,
 		.baudrate = IF_Gbps(20ULL),
 	},
 	[MLX5E_40GBASE_CR4][MLX5E_CR4] = {
 		.subtype = IFM_40G_CR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_KR4][MLX5E_KR4] = {
 		.subtype = IFM_40G_KR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_56GBASE_R4][MLX5E_R] = {
 		.subtype = IFM_56G_R4,
 		.baudrate = IF_Gbps(56ULL),
 	},
 	[MLX5E_10GBASE_CR][MLX5E_CR1] = {
 		.subtype = IFM_10G_CR1,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_SR][MLX5E_SR] = {
 		.subtype = IFM_10G_SR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_ER_LR][MLX5E_ER] = {
 		.subtype = IFM_10G_ER,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_ER_LR][MLX5E_LR] = {
 		.subtype = IFM_10G_LR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_40GBASE_SR4][MLX5E_SR4] = {
 		.subtype = IFM_40G_SR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_LR4_ER4][MLX5E_LR4] = {
 		.subtype = IFM_40G_LR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_LR4_ER4][MLX5E_ER4] = {
 		.subtype = IFM_40G_ER4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_100GBASE_CR4][MLX5E_CR4] = {
 		.subtype = IFM_100G_CR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GBASE_SR4][MLX5E_SR4] = {
 		.subtype = IFM_100G_SR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GBASE_KR4][MLX5E_KR4] = {
 		.subtype = IFM_100G_KR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GBASE_LR4][MLX5E_LR4] = {
 		.subtype = IFM_100G_LR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100BASE_TX][MLX5E_TX] = {
 		.subtype = IFM_100_TX,
 		.baudrate = IF_Mbps(100ULL),
 	},
 	[MLX5E_1000BASE_T][MLX5E_T] = {
 		.subtype = IFM_1000_T,
 		.baudrate = IF_Mbps(1000ULL),
 	},
 	[MLX5E_10GBASE_T][MLX5E_T] = {
 		.subtype = IFM_10G_T,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_25GBASE_CR][MLX5E_CR] = {
 		.subtype = IFM_25G_CR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GBASE_KR][MLX5E_KR] = {
 		.subtype = IFM_25G_KR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GBASE_SR][MLX5E_SR] = {
 		.subtype = IFM_25G_SR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_50GBASE_CR2][MLX5E_CR2] = {
 		.subtype = IFM_50G_CR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GBASE_KR2][MLX5E_KR2] = {
 		.subtype = IFM_50G_KR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 };
 
 static const struct media mlx5e_ext_mode_table[MLX5E_EXT_LINK_SPEEDS_NUMBER][MLX5E_LINK_MODES_NUMBER] = {
 	[MLX5E_SGMII_100M][MLX5E_SGMII] = {
 		.subtype = IFM_100_SGMII,
 		.baudrate = IF_Mbps(100),
 	},
 	[MLX5E_1000BASE_X_SGMII][MLX5E_KX] = {
 		.subtype = IFM_1000_KX,
 		.baudrate = IF_Mbps(1000),
 	},
 	[MLX5E_1000BASE_X_SGMII][MLX5E_CX_SGMII] = {
 		.subtype = IFM_1000_CX_SGMII,
 		.baudrate = IF_Mbps(1000),
 	},
 	[MLX5E_1000BASE_X_SGMII][MLX5E_CX] = {
 		.subtype = IFM_1000_CX,
 		.baudrate = IF_Mbps(1000),
 	},
 	[MLX5E_1000BASE_X_SGMII][MLX5E_LX] = {
 		.subtype = IFM_1000_LX,
 		.baudrate = IF_Mbps(1000),
 	},
 	[MLX5E_1000BASE_X_SGMII][MLX5E_SX] = {
 		.subtype = IFM_1000_SX,
 		.baudrate = IF_Mbps(1000),
 	},
 	[MLX5E_1000BASE_X_SGMII][MLX5E_T] = {
 		.subtype = IFM_1000_T,
 		.baudrate = IF_Mbps(1000),
 	},
 	[MLX5E_5GBASE_R][MLX5E_T] = {
 		.subtype = IFM_5000_T,
 		.baudrate = IF_Mbps(5000),
 	},
 	[MLX5E_5GBASE_R][MLX5E_KR] = {
 		.subtype = IFM_5000_KR,
 		.baudrate = IF_Mbps(5000),
 	},
 	[MLX5E_5GBASE_R][MLX5E_KR1] = {
 		.subtype = IFM_5000_KR1,
 		.baudrate = IF_Mbps(5000),
 	},
 	[MLX5E_5GBASE_R][MLX5E_KR_S] = {
 		.subtype = IFM_5000_KR_S,
 		.baudrate = IF_Mbps(5000),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_ER] = {
 		.subtype = IFM_10G_ER,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_KR] = {
 		.subtype = IFM_10G_KR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_LR] = {
 		.subtype = IFM_10G_LR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_SR] = {
 		.subtype = IFM_10G_SR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_T] = {
 		.subtype = IFM_10G_T,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_AOC] = {
 		.subtype = IFM_10G_AOC,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_CR1] = {
 		.subtype = IFM_10G_CR1,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_CR4] = {
 		.subtype = IFM_40G_CR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_KR4] = {
 		.subtype = IFM_40G_KR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_LR4] = {
 		.subtype = IFM_40G_LR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_SR4] = {
 		.subtype = IFM_40G_SR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_ER4] = {
 		.subtype = IFM_40G_ER4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CR] = {
 		.subtype = IFM_25G_CR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_KR] = {
 		.subtype = IFM_25G_KR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_SR] = {
 		.subtype = IFM_25G_SR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_ACC] = {
 		.subtype = IFM_25G_ACC,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_AOC] = {
 		.subtype = IFM_25G_AOC,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CR1] = {
 		.subtype = IFM_25G_CR1,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CR_S] = {
 		.subtype = IFM_25G_CR_S,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_KR1] = {
 		.subtype = IFM_5000_KR1,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_KR_S] = {
 		.subtype = IFM_25G_KR_S,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_LR] = {
 		.subtype = IFM_25G_LR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_T] = {
 		.subtype = IFM_25G_T,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_CR2] = {
 		.subtype = IFM_50G_CR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_KR2] = {
 		.subtype = IFM_50G_KR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_SR2] = {
 		.subtype = IFM_50G_SR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_LR2] = {
 		.subtype = IFM_50G_LR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_LR] = {
 		.subtype = IFM_50G_LR,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_SR] = {
 		.subtype = IFM_50G_SR,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_CP] = {
 		.subtype = IFM_50G_CP,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_FR] = {
 		.subtype = IFM_50G_FR,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_KR_PAM4] = {
 		.subtype = IFM_50G_KR_PAM4,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_CR4] = {
 		.subtype = IFM_100G_CR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_KR4] = {
 		.subtype = IFM_100G_KR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_LR4] = {
 		.subtype = IFM_100G_LR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_SR4] = {
 		.subtype = IFM_100G_SR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_SR2] = {
 		.subtype = IFM_100G_SR2,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_CP2] = {
 		.subtype = IFM_100G_CP2,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_KR2_PAM4] = {
 		.subtype = IFM_100G_KR2_PAM4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_DR4] = {
 		.subtype = IFM_200G_DR4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_LR4] = {
 		.subtype = IFM_200G_LR4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_SR4] = {
 		.subtype = IFM_200G_SR4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_FR4] = {
 		.subtype = IFM_200G_FR4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_CR4_PAM4] = {
 		.subtype = IFM_200G_CR4_PAM4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_KR4_PAM4] = {
 		.subtype = IFM_200G_KR4_PAM4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 };
 
 MALLOC_DEFINE(M_MLX5EN, "MLX5EN", "MLX5 Ethernet");
 
 static void
 mlx5e_update_carrier(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
 	u32 eth_proto_oper;
 	int error;
 	u8 port_state;
 	u8 is_er_type;
 	u8 i, j;
 	bool ext;
 	struct media media_entry = {};
 
 	port_state = mlx5_query_vport_state(mdev,
 	    MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT, 0);
 
 	if (port_state == VPORT_STATE_UP) {
 		priv->media_status_last |= IFM_ACTIVE;
 	} else {
 		priv->media_status_last &= ~IFM_ACTIVE;
 		priv->media_active_last = IFM_ETHER;
 		if_link_state_change(priv->ifp, LINK_STATE_DOWN);
 		return;
 	}
 
 	error = mlx5_query_port_ptys(mdev, out, sizeof(out),
 	    MLX5_PTYS_EN, 1);
 	if (error) {
 		priv->media_active_last = IFM_ETHER;
 		priv->ifp->if_baudrate = 1;
 		if_printf(priv->ifp, "%s: query port ptys failed: "
 		    "0x%x\n", __func__, error);
 		return;
 	}
 
 	ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet);
 	eth_proto_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
 	    eth_proto_oper);
 
 	i = ilog2(eth_proto_oper);
 
 	for (j = 0; j != MLX5E_LINK_MODES_NUMBER; j++) {
 		media_entry = ext ? mlx5e_ext_mode_table[i][j] :
 		    mlx5e_mode_table[i][j];
 		if (media_entry.baudrate != 0)
 			break;
 	}
 
 	if (media_entry.subtype == 0) {
 		if_printf(priv->ifp, "%s: Could not find operational "
 		    "media subtype\n", __func__);
 		return;
 	}
 
 	switch (media_entry.subtype) {
 	case IFM_10G_ER:
 		error = mlx5_query_pddr_range_info(mdev, 1, &is_er_type);
 		if (error != 0) {
 			if_printf(priv->ifp, "%s: query port pddr failed: %d\n",
 				  __func__, error);
 		}
 		if (error != 0 || is_er_type == 0)
 			media_entry.subtype = IFM_10G_LR;
 		break;
 	case IFM_40G_LR4:
 		error = mlx5_query_pddr_range_info(mdev, 1, &is_er_type);
 		if (error != 0) {
 			if_printf(priv->ifp, "%s: query port pddr failed: %d\n",
 				  __func__, error);
 		}
 		if (error == 0 && is_er_type != 0)
 			media_entry.subtype = IFM_40G_ER4;
 		break;
 	}
 	priv->media_active_last = media_entry.subtype | IFM_ETHER | IFM_FDX;
 	priv->ifp->if_baudrate = media_entry.baudrate;
 
 	if_link_state_change(priv->ifp, LINK_STATE_UP);
 }
 
 static void
 mlx5e_media_status(struct ifnet *dev, struct ifmediareq *ifmr)
 {
 	struct mlx5e_priv *priv = dev->if_softc;
 
 	ifmr->ifm_status = priv->media_status_last;
 	ifmr->ifm_active = priv->media_active_last |
 	    (priv->params.rx_pauseframe_control ? IFM_ETH_RXPAUSE : 0) |
 	    (priv->params.tx_pauseframe_control ? IFM_ETH_TXPAUSE : 0);
 
 }
 
 static u32
 mlx5e_find_link_mode(u32 subtype, bool ext)
 {
 	u32 i;
 	u32 j;
 	u32 link_mode = 0;
 	u32 speeds_num = 0;
 	struct media media_entry = {};
 
 	switch (subtype) {
 	case IFM_10G_LR:
 		subtype = IFM_10G_ER;
 		break;
 	case IFM_40G_ER4:
 		subtype = IFM_40G_LR4;
 		break;
 	}
 
 	speeds_num = ext ? MLX5E_EXT_LINK_SPEEDS_NUMBER :
 	    MLX5E_LINK_SPEEDS_NUMBER;
 
 	for (i = 0; i != speeds_num; i++) {
 		for (j = 0; j < MLX5E_LINK_MODES_NUMBER ; ++j) {
 			media_entry = ext ? mlx5e_ext_mode_table[i][j] :
 			    mlx5e_mode_table[i][j];
 			if (media_entry.baudrate == 0)
 				continue;
 			if (media_entry.subtype == subtype) {
 				link_mode |= MLX5E_PROT_MASK(i);
 			}
 		}
 	}
 
 	return (link_mode);
 }
 
 static int
 mlx5e_set_port_pause_and_pfc(struct mlx5e_priv *priv)
 {
 	return (mlx5_set_port_pause_and_pfc(priv->mdev, 1,
 	    priv->params.rx_pauseframe_control,
 	    priv->params.tx_pauseframe_control,
 	    priv->params.rx_priority_flow_control,
 	    priv->params.tx_priority_flow_control));
 }
 
 static int
 mlx5e_set_port_pfc(struct mlx5e_priv *priv)
 {
 	int error;
 
 	if (priv->gone != 0) {
 		error = -ENXIO;
 	} else if (priv->params.rx_pauseframe_control ||
 	    priv->params.tx_pauseframe_control) {
 		if_printf(priv->ifp,
 		    "Global pauseframes must be disabled before "
 		    "enabling PFC.\n");
 		error = -EINVAL;
 	} else {
 		error = mlx5e_set_port_pause_and_pfc(priv);
 	}
 	return (error);
 }
 
 static int
 mlx5e_media_change(struct ifnet *dev)
 {
 	struct mlx5e_priv *priv = dev->if_softc;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 eth_proto_cap;
 	u32 link_mode;
 	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
 	int was_opened;
 	int locked;
 	int error;
 	bool ext;
 
 	locked = PRIV_LOCKED(priv);
 	if (!locked)
 		PRIV_LOCK(priv);
 
 	if (IFM_TYPE(priv->media.ifm_media) != IFM_ETHER) {
 		error = EINVAL;
 		goto done;
 	}
 
 	error = mlx5_query_port_ptys(mdev, out, sizeof(out),
 	    MLX5_PTYS_EN, 1);
 	if (error != 0) {
 		if_printf(dev, "Query port media capability failed\n");
 		goto done;
 	}
 
 	ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet);
 	link_mode = mlx5e_find_link_mode(IFM_SUBTYPE(priv->media.ifm_media), ext);
 
 	/* query supported capabilities */
 	eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
 	    eth_proto_capability);
 
 	/* check for autoselect */
 	if (IFM_SUBTYPE(priv->media.ifm_media) == IFM_AUTO) {
 		link_mode = eth_proto_cap;
 		if (link_mode == 0) {
 			if_printf(dev, "Port media capability is zero\n");
 			error = EINVAL;
 			goto done;
 		}
 	} else {
 		link_mode = link_mode & eth_proto_cap;
 		if (link_mode == 0) {
 			if_printf(dev, "Not supported link mode requested\n");
 			error = EINVAL;
 			goto done;
 		}
 	}
 	if (priv->media.ifm_media & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE)) {
 		/* check if PFC is enabled */
 		if (priv->params.rx_priority_flow_control ||
 		    priv->params.tx_priority_flow_control) {
 			if_printf(dev, "PFC must be disabled before enabling global pauseframes.\n");
 			error = EINVAL;
 			goto done;
 		}
 	}
 	/* update pauseframe control bits */
 	priv->params.rx_pauseframe_control =
 	    (priv->media.ifm_media & IFM_ETH_RXPAUSE) ? 1 : 0;
 	priv->params.tx_pauseframe_control =
 	    (priv->media.ifm_media & IFM_ETH_TXPAUSE) ? 1 : 0;
 
 	/* check if device is opened */
 	was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
 
 	/* reconfigure the hardware */
 	mlx5_set_port_status(mdev, MLX5_PORT_DOWN);
 	mlx5_set_port_proto(mdev, link_mode, MLX5_PTYS_EN, ext);
 	error = -mlx5e_set_port_pause_and_pfc(priv);
 	if (was_opened)
 		mlx5_set_port_status(mdev, MLX5_PORT_UP);
 
 done:
 	if (!locked)
 		PRIV_UNLOCK(priv);
 	return (error);
 }
 
 static void
 mlx5e_update_carrier_work(struct work_struct *work)
 {
 	struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv,
 	    update_carrier_work);
 
 	PRIV_LOCK(priv);
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state))
 		mlx5e_update_carrier(priv);
 	PRIV_UNLOCK(priv);
 }
 
 #define	MLX5E_PCIE_PERF_GET_64(a,b,c,d,e,f)    \
 	s_debug->c = MLX5_GET64(mpcnt_reg, out, counter_set.f.c);
 
 #define	MLX5E_PCIE_PERF_GET_32(a,b,c,d,e,f)    \
 	s_debug->c = MLX5_GET(mpcnt_reg, out, counter_set.f.c);
 
 static void
 mlx5e_update_pcie_counters(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_port_stats_debug *s_debug = &priv->stats.port_stats_debug;
 	const unsigned sz = MLX5_ST_SZ_BYTES(mpcnt_reg);
 	void *out;
 	void *in;
 	int err;
 
 	/* allocate firmware request structures */
 	in = mlx5_vzalloc(sz);
 	out = mlx5_vzalloc(sz);
 	if (in == NULL || out == NULL)
 		goto free_out;
 
 	MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP);
 	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0);
 	if (err != 0)
 		goto free_out;
 
 	MLX5E_PCIE_PERFORMANCE_COUNTERS_64(MLX5E_PCIE_PERF_GET_64)
 	MLX5E_PCIE_PERFORMANCE_COUNTERS_32(MLX5E_PCIE_PERF_GET_32)
 
 	MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_TIMERS_AND_STATES_COUNTERS_GROUP);
 	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0);
 	if (err != 0)
 		goto free_out;
 
 	MLX5E_PCIE_TIMERS_AND_STATES_COUNTERS_32(MLX5E_PCIE_PERF_GET_32)
 
 	MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_LANE_COUNTERS_GROUP);
 	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0);
 	if (err != 0)
 		goto free_out;
 
 	MLX5E_PCIE_LANE_COUNTERS_32(MLX5E_PCIE_PERF_GET_32)
 
 free_out:
 	/* free firmware request structures */
 	kvfree(in);
 	kvfree(out);
 }
 
 /*
  * This function reads the physical port counters from the firmware
  * using a pre-defined layout defined by various MLX5E_PPORT_XXX()
  * macros. The output is converted from big-endian 64-bit values into
  * host endian ones and stored in the "priv->stats.pport" structure.
  */
 static void
 mlx5e_update_pport_counters(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_pport_stats *s = &priv->stats.pport;
 	struct mlx5e_port_stats_debug *s_debug = &priv->stats.port_stats_debug;
 	u32 *in;
 	u32 *out;
 	const u64 *ptr;
 	unsigned sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
 	unsigned x;
 	unsigned y;
 	unsigned z;
 
 	/* allocate firmware request structures */
 	in = mlx5_vzalloc(sz);
 	out = mlx5_vzalloc(sz);
 	if (in == NULL || out == NULL)
 		goto free_out;
 
 	/*
 	 * Get pointer to the 64-bit counter set which is located at a
 	 * fixed offset in the output firmware request structure:
 	 */
 	ptr = (const uint64_t *)MLX5_ADDR_OF(ppcnt_reg, out, counter_set);
 
 	MLX5_SET(ppcnt_reg, in, local_port, 1);
 
 	/* read IEEE802_3 counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0, y = MLX5E_PPORT_PER_PRIO_STATS_NUM;
 	     x != MLX5E_PPORT_IEEE802_3_STATS_NUM; x++, y++)
 		s->arg[y] = be64toh(ptr[x]);
 
 	/* read RFC2819 counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM; x++, y++)
 		s->arg[y] = be64toh(ptr[x]);
 
 	for (y = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM +
 	    MLX5E_PPORT_RFC2819_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read RFC2863 counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_RFC2863_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read physical layer stats counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read Extended Ethernet counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_ETHERNET_EXTENDED_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read Extended Statistical Group */
 	if (MLX5_CAP_GEN(mdev, pcam_reg) &&
 	    MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group) &&
 	    MLX5_CAP_PCAM_FEATURE(mdev, per_lane_error_counters)) {
 		/* read Extended Statistical counter group using predefined counter layout */
 		MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP);
 		mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 
 		for (x = 0; x != MLX5E_PPORT_STATISTICAL_DEBUG_NUM; x++, y++)
 			s_debug->arg[y] = be64toh(ptr[x]);
 	}
 
 	/* read PCIE counters */
 	mlx5e_update_pcie_counters(priv);
 
 	/* read per-priority counters */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_PRIORITY_COUNTERS_GROUP);
 
 	/* iterate all the priorities */
 	for (y = z = 0; z != MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO; z++) {
 		MLX5_SET(ppcnt_reg, in, prio_tc, z);
 		mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 
 		/* read per priority stats counter group using predefined counter layout */
 		for (x = 0; x != (MLX5E_PPORT_PER_PRIO_STATS_NUM /
 		    MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO); x++, y++)
 			s->arg[y] = be64toh(ptr[x]);
 	}
 
 free_out:
 	/* free firmware request structures */
 	kvfree(in);
 	kvfree(out);
 }
 
 static void
 mlx5e_grp_vnic_env_update_stats(struct mlx5e_priv *priv)
 {
 	u32 out[MLX5_ST_SZ_DW(query_vnic_env_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {};
 
 	if (!MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard))
 		return;
 
 	MLX5_SET(query_vnic_env_in, in, opcode,
 	    MLX5_CMD_OP_QUERY_VNIC_ENV);
 	MLX5_SET(query_vnic_env_in, in, op_mod, 0);
 	MLX5_SET(query_vnic_env_in, in, other_vport, 0);
 
 	if (mlx5_cmd_exec(priv->mdev, in, sizeof(in), out, sizeof(out)) != 0)
 		return;
 
 	priv->stats.vport.rx_steer_missed_packets =
 	    MLX5_GET64(query_vnic_env_out, out,
 	    vport_env.nic_receive_steering_discard);
 }
 
 /*
  * This function is called regularly to collect all statistics
  * counters from the firmware. The values can be viewed through the
  * sysctl interface. Execution is serialized using the priv's global
  * configuration lock.
  */
 static void
 mlx5e_update_stats_locked(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_vport_stats *s = &priv->stats.vport;
 	struct mlx5e_sq_stats *sq_stats;
 	struct buf_ring *sq_br;
 #if (__FreeBSD_version < 1100000)
 	struct ifnet *ifp = priv->ifp;
 #endif
 
 	u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)];
 	u32 *out;
 	int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out);
 	u64 tso_packets = 0;
 	u64 tso_bytes = 0;
 	u64 tx_queue_dropped = 0;
 	u64 tx_defragged = 0;
 	u64 tx_offload_none = 0;
 	u64 lro_packets = 0;
 	u64 lro_bytes = 0;
 	u64 sw_lro_queued = 0;
 	u64 sw_lro_flushed = 0;
 	u64 rx_csum_none = 0;
 	u64 rx_wqe_err = 0;
 	u64 rx_packets = 0;
 	u64 rx_bytes = 0;
 	u32 rx_out_of_buffer = 0;
 	int i;
 	int j;
 
 	out = mlx5_vzalloc(outlen);
 	if (out == NULL)
 		goto free_out;
 
 	/* Collect firts the SW counters and then HW for consistency */
 	for (i = 0; i < priv->params.num_channels; i++) {
 		struct mlx5e_channel *pch = priv->channel + i;
 		struct mlx5e_rq *rq = &pch->rq;
 		struct mlx5e_rq_stats *rq_stats = &pch->rq.stats;
 
 		/* collect stats from LRO */
 		rq_stats->sw_lro_queued = rq->lro.lro_queued;
 		rq_stats->sw_lro_flushed = rq->lro.lro_flushed;
 		sw_lro_queued += rq_stats->sw_lro_queued;
 		sw_lro_flushed += rq_stats->sw_lro_flushed;
 		lro_packets += rq_stats->lro_packets;
 		lro_bytes += rq_stats->lro_bytes;
 		rx_csum_none += rq_stats->csum_none;
 		rx_wqe_err += rq_stats->wqe_err;
 		rx_packets += rq_stats->packets;
 		rx_bytes += rq_stats->bytes;
 
 		for (j = 0; j < priv->num_tc; j++) {
 			sq_stats = &pch->sq[j].stats;
 			sq_br = pch->sq[j].br;
 
 			tso_packets += sq_stats->tso_packets;
 			tso_bytes += sq_stats->tso_bytes;
 			tx_queue_dropped += sq_stats->dropped;
 			if (sq_br != NULL)
 				tx_queue_dropped += sq_br->br_drops;
 			tx_defragged += sq_stats->defragged;
 			tx_offload_none += sq_stats->csum_offload_none;
 		}
 	}
 
 	/* update counters */
 	s->tso_packets = tso_packets;
 	s->tso_bytes = tso_bytes;
 	s->tx_queue_dropped = tx_queue_dropped;
 	s->tx_defragged = tx_defragged;
 	s->lro_packets = lro_packets;
 	s->lro_bytes = lro_bytes;
 	s->sw_lro_queued = sw_lro_queued;
 	s->sw_lro_flushed = sw_lro_flushed;
 	s->rx_csum_none = rx_csum_none;
 	s->rx_wqe_err = rx_wqe_err;
 	s->rx_packets = rx_packets;
 	s->rx_bytes = rx_bytes;
 
 	mlx5e_grp_vnic_env_update_stats(priv);
 
 	/* HW counters */
 	memset(in, 0, sizeof(in));
 
 	MLX5_SET(query_vport_counter_in, in, opcode,
 	    MLX5_CMD_OP_QUERY_VPORT_COUNTER);
 	MLX5_SET(query_vport_counter_in, in, op_mod, 0);
 	MLX5_SET(query_vport_counter_in, in, other_vport, 0);
 
 	memset(out, 0, outlen);
 
 	/* get number of out-of-buffer drops first */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0 &&
 	    mlx5_vport_query_out_of_rx_buffer(mdev, priv->counter_set_id,
 	    &rx_out_of_buffer) == 0) {
 		s->rx_out_of_buffer = rx_out_of_buffer;
 	}
 
 	/* get port statistics */
 	if (mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen) == 0) {
 #define	MLX5_GET_CTR(out, x) \
 	MLX5_GET64(query_vport_counter_out, out, x)
 
 		s->rx_error_packets =
 		    MLX5_GET_CTR(out, received_errors.packets);
 		s->rx_error_bytes =
 		    MLX5_GET_CTR(out, received_errors.octets);
 		s->tx_error_packets =
 		    MLX5_GET_CTR(out, transmit_errors.packets);
 		s->tx_error_bytes =
 		    MLX5_GET_CTR(out, transmit_errors.octets);
 
 		s->rx_unicast_packets =
 		    MLX5_GET_CTR(out, received_eth_unicast.packets);
 		s->rx_unicast_bytes =
 		    MLX5_GET_CTR(out, received_eth_unicast.octets);
 		s->tx_unicast_packets =
 		    MLX5_GET_CTR(out, transmitted_eth_unicast.packets);
 		s->tx_unicast_bytes =
 		    MLX5_GET_CTR(out, transmitted_eth_unicast.octets);
 
 		s->rx_multicast_packets =
 		    MLX5_GET_CTR(out, received_eth_multicast.packets);
 		s->rx_multicast_bytes =
 		    MLX5_GET_CTR(out, received_eth_multicast.octets);
 		s->tx_multicast_packets =
 		    MLX5_GET_CTR(out, transmitted_eth_multicast.packets);
 		s->tx_multicast_bytes =
 		    MLX5_GET_CTR(out, transmitted_eth_multicast.octets);
 
 		s->rx_broadcast_packets =
 		    MLX5_GET_CTR(out, received_eth_broadcast.packets);
 		s->rx_broadcast_bytes =
 		    MLX5_GET_CTR(out, received_eth_broadcast.octets);
 		s->tx_broadcast_packets =
 		    MLX5_GET_CTR(out, transmitted_eth_broadcast.packets);
 		s->tx_broadcast_bytes =
 		    MLX5_GET_CTR(out, transmitted_eth_broadcast.octets);
 
 		s->tx_packets = s->tx_unicast_packets +
 		    s->tx_multicast_packets + s->tx_broadcast_packets;
 		s->tx_bytes = s->tx_unicast_bytes + s->tx_multicast_bytes +
 		    s->tx_broadcast_bytes;
 
 		/* Update calculated offload counters */
 		s->tx_csum_offload = s->tx_packets - tx_offload_none;
 		s->rx_csum_good = s->rx_packets - s->rx_csum_none;
 	}
 
 	/* Get physical port counters */
 	mlx5e_update_pport_counters(priv);
 
 	s->tx_jumbo_packets =
 	    priv->stats.port_stats_debug.tx_stat_p1519to2047octets +
 	    priv->stats.port_stats_debug.tx_stat_p2048to4095octets +
 	    priv->stats.port_stats_debug.tx_stat_p4096to8191octets +
 	    priv->stats.port_stats_debug.tx_stat_p8192to10239octets;
 
 #if (__FreeBSD_version < 1100000)
 	/* no get_counters interface in fbsd 10 */
 	ifp->if_ipackets = s->rx_packets;
 	ifp->if_ierrors = priv->stats.pport.in_range_len_errors +
 	    priv->stats.pport.out_of_range_len +
 	    priv->stats.pport.too_long_errors +
 	    priv->stats.pport.check_seq_err +
 	    priv->stats.pport.alignment_err;
 	ifp->if_iqdrops = s->rx_out_of_buffer;
 	ifp->if_opackets = s->tx_packets;
 	ifp->if_oerrors = priv->stats.port_stats_debug.out_discards;
 	ifp->if_snd.ifq_drops = s->tx_queue_dropped;
 	ifp->if_ibytes = s->rx_bytes;
 	ifp->if_obytes = s->tx_bytes;
 	ifp->if_collisions =
 	    priv->stats.pport.collisions;
 #endif
 
 free_out:
 	kvfree(out);
 
 	/* Update diagnostics, if any */
 	if (priv->params_ethtool.diag_pci_enable ||
 	    priv->params_ethtool.diag_general_enable) {
 		int error = mlx5_core_get_diagnostics_full(mdev,
 		    priv->params_ethtool.diag_pci_enable ? &priv->params_pci : NULL,
 		    priv->params_ethtool.diag_general_enable ? &priv->params_general : NULL);
 		if (error != 0)
 			if_printf(priv->ifp, "Failed reading diagnostics: %d\n", error);
 	}
 }
 
 static void
 mlx5e_update_stats_work(struct work_struct *work)
 {
 	struct mlx5e_priv *priv;
 
 	priv  = container_of(work, struct mlx5e_priv, update_stats_work);
 	PRIV_LOCK(priv);
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0)
 		mlx5e_update_stats_locked(priv);
 	PRIV_UNLOCK(priv);
 }
 
 static void
 mlx5e_update_stats(void *arg)
 {
 	struct mlx5e_priv *priv = arg;
 
 	queue_work(priv->wq, &priv->update_stats_work);
 
 	callout_reset(&priv->watchdog, hz, &mlx5e_update_stats, priv);
 }
 
 static void
 mlx5e_async_event_sub(struct mlx5e_priv *priv,
     enum mlx5_dev_event event)
 {
 	switch (event) {
 	case MLX5_DEV_EVENT_PORT_UP:
 	case MLX5_DEV_EVENT_PORT_DOWN:
 		queue_work(priv->wq, &priv->update_carrier_work);
 		break;
 
 	default:
 		break;
 	}
 }
 
 static void
 mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv,
     enum mlx5_dev_event event, unsigned long param)
 {
 	struct mlx5e_priv *priv = vpriv;
 
 	mtx_lock(&priv->async_events_mtx);
 	if (test_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state))
 		mlx5e_async_event_sub(priv, event);
 	mtx_unlock(&priv->async_events_mtx);
 }
 
 static void
 mlx5e_enable_async_events(struct mlx5e_priv *priv)
 {
 	set_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state);
 }
 
 static void
 mlx5e_disable_async_events(struct mlx5e_priv *priv)
 {
 	mtx_lock(&priv->async_events_mtx);
 	clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state);
 	mtx_unlock(&priv->async_events_mtx);
 }
 
 static void mlx5e_calibration_callout(void *arg);
 static int mlx5e_calibration_duration = 20;
 static int mlx5e_fast_calibration = 1;
 static int mlx5e_normal_calibration = 30;
 
 static SYSCTL_NODE(_hw_mlx5, OID_AUTO, calibr, CTLFLAG_RW, 0,
     "MLX5 timestamp calibration parameteres");
 
 SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, duration, CTLFLAG_RWTUN,
     &mlx5e_calibration_duration, 0,
     "Duration of initial calibration");
 SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, fast, CTLFLAG_RWTUN,
     &mlx5e_fast_calibration, 0,
     "Recalibration interval during initial calibration");
 SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, normal, CTLFLAG_RWTUN,
     &mlx5e_normal_calibration, 0,
     "Recalibration interval during normal operations");
 
 /*
  * Ignites the calibration process.
  */
 static void
 mlx5e_reset_calibration_callout(struct mlx5e_priv *priv)
 {
 
 	if (priv->clbr_done == 0)
 		mlx5e_calibration_callout(priv);
 	else
 		callout_reset_curcpu(&priv->tstmp_clbr, (priv->clbr_done <
 		    mlx5e_calibration_duration ? mlx5e_fast_calibration :
 		    mlx5e_normal_calibration) * hz, mlx5e_calibration_callout,
 		    priv);
 }
 
 static uint64_t
 mlx5e_timespec2usec(const struct timespec *ts)
 {
 
 	return ((uint64_t)ts->tv_sec * 1000000000 + ts->tv_nsec);
 }
 
 static uint64_t
 mlx5e_hw_clock(struct mlx5e_priv *priv)
 {
 	struct mlx5_init_seg *iseg;
 	uint32_t hw_h, hw_h1, hw_l;
 
 	iseg = priv->mdev->iseg;
 	do {
 		hw_h = ioread32be(&iseg->internal_timer_h);
 		hw_l = ioread32be(&iseg->internal_timer_l);
 		hw_h1 = ioread32be(&iseg->internal_timer_h);
 	} while (hw_h1 != hw_h);
 	return (((uint64_t)hw_h << 32) | hw_l);
 }
 
 /*
  * The calibration callout, it runs either in the context of the
  * thread which enables calibration, or in callout.  It takes the
  * snapshot of system and adapter clocks, then advances the pointers to
  * the calibration point to allow rx path to read the consistent data
  * lockless.
  */
 static void
 mlx5e_calibration_callout(void *arg)
 {
 	struct mlx5e_priv *priv;
 	struct mlx5e_clbr_point *next, *curr;
 	struct timespec ts;
 	int clbr_curr_next;
 
 	priv = arg;
 	curr = &priv->clbr_points[priv->clbr_curr];
 	clbr_curr_next = priv->clbr_curr + 1;
 	if (clbr_curr_next >= nitems(priv->clbr_points))
 		clbr_curr_next = 0;
 	next = &priv->clbr_points[clbr_curr_next];
 
 	next->base_prev = curr->base_curr;
 	next->clbr_hw_prev = curr->clbr_hw_curr;
 
 	next->clbr_hw_curr = mlx5e_hw_clock(priv);
 	if (((next->clbr_hw_curr - curr->clbr_hw_curr) >> MLX5E_TSTMP_PREC) ==
 	    0) {
 		if (priv->clbr_done != 0) {
 			if_printf(priv->ifp, "HW failed tstmp frozen %#jx %#jx,"
 			    "disabling\n",
 			     next->clbr_hw_curr, curr->clbr_hw_prev);
 			priv->clbr_done = 0;
 		}
 		atomic_store_rel_int(&curr->clbr_gen, 0);
 		return;
 	}
 
 	nanouptime(&ts);
 	next->base_curr = mlx5e_timespec2usec(&ts);
 
 	curr->clbr_gen = 0;
 	atomic_thread_fence_rel();
 	priv->clbr_curr = clbr_curr_next;
 	atomic_store_rel_int(&next->clbr_gen, ++(priv->clbr_gen));
 
 	if (priv->clbr_done < mlx5e_calibration_duration)
 		priv->clbr_done++;
 	mlx5e_reset_calibration_callout(priv);
 }
 
 static const char *mlx5e_rq_stats_desc[] = {
 	MLX5E_RQ_STATS(MLX5E_STATS_DESC)
 };
 
 static int
 mlx5e_create_rq(struct mlx5e_channel *c,
     struct mlx5e_rq_param *param,
     struct mlx5e_rq *rq)
 {
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	char buffer[16];
 	void *rqc = param->rqc;
 	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
 	int wq_sz;
 	int err;
 	int i;
 	u32 nsegs, wqe_sz;
 
 	err = mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
 	if (err != 0)
 		goto done;
 
 	/* Create DMA descriptor TAG */
 	if ((err = -bus_dma_tag_create(
 	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
 	    1,				/* any alignment */
 	    0,				/* no boundary */
 	    BUS_SPACE_MAXADDR,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,		/* highaddr */
 	    NULL, NULL,			/* filter, filterarg */
 	    nsegs * MLX5E_MAX_RX_BYTES,	/* maxsize */
 	    nsegs,			/* nsegments */
 	    nsegs * MLX5E_MAX_RX_BYTES,	/* maxsegsize */
 	    0,				/* flags */
 	    NULL, NULL,			/* lockfunc, lockfuncarg */
 	    &rq->dma_tag)))
 		goto done;
 
 	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wq,
 	    &rq->wq_ctrl);
 	if (err)
 		goto err_free_dma_tag;
 
 	rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
 
 	err = mlx5e_get_wqe_sz(priv, &rq->wqe_sz, &rq->nsegs);
 	if (err != 0)
 		goto err_rq_wq_destroy;
 
 	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
 
-	err = -tcp_lro_init_args(&rq->lro, c->tag.m_snd_tag.ifp, TCP_LRO_ENTRIES, wq_sz);
+	err = -tcp_lro_init_args(&rq->lro, priv->ifp, TCP_LRO_ENTRIES, wq_sz);
 	if (err)
 		goto err_rq_wq_destroy;
 
 	rq->mbuf = malloc(wq_sz * sizeof(rq->mbuf[0]), M_MLX5EN, M_WAITOK | M_ZERO);
 	for (i = 0; i != wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
 		int j;
 
 		err = -bus_dmamap_create(rq->dma_tag, 0, &rq->mbuf[i].dma_map);
 		if (err != 0) {
 			while (i--)
 				bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map);
 			goto err_rq_mbuf_free;
 		}
 
 		/* set value for constant fields */
 		for (j = 0; j < rq->nsegs; j++)
 			wqe->data[j].lkey = c->mkey_be;
 	}
 
 	INIT_WORK(&rq->dim.work, mlx5e_dim_work);
 	if (priv->params.rx_cq_moderation_mode < 2) {
 		rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED;
 	} else {
 		void *cqc = container_of(param,
 		    struct mlx5e_channel_param, rq)->rx_cq.cqc;
 
 		switch (MLX5_GET(cqc, cqc, cq_period_mode)) {
 		case MLX5_CQ_PERIOD_MODE_START_FROM_EQE:
 			rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 			break;
 		case MLX5_CQ_PERIOD_MODE_START_FROM_CQE:
 			rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE;
 			break;
 		default:
 			rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED;
 			break;
 		}
 	}
 
-	rq->ifp = c->tag.m_snd_tag.ifp;
+	rq->ifp = priv->ifp;
 	rq->channel = c;
 	rq->ix = c->ix;
 
 	snprintf(buffer, sizeof(buffer), "rxstat%d", c->ix);
 	mlx5e_create_stats(&rq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    buffer, mlx5e_rq_stats_desc, MLX5E_RQ_STATS_NUM,
 	    rq->stats.arg);
 	return (0);
 
 err_rq_mbuf_free:
 	free(rq->mbuf, M_MLX5EN);
 	tcp_lro_free(&rq->lro);
 err_rq_wq_destroy:
 	mlx5_wq_destroy(&rq->wq_ctrl);
 err_free_dma_tag:
 	bus_dma_tag_destroy(rq->dma_tag);
 done:
 	return (err);
 }
 
 static void
 mlx5e_destroy_rq(struct mlx5e_rq *rq)
 {
 	int wq_sz;
 	int i;
 
 	/* destroy all sysctl nodes */
 	sysctl_ctx_free(&rq->stats.ctx);
 
 	/* free leftover LRO packets, if any */
 	tcp_lro_free(&rq->lro);
 
 	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
 	for (i = 0; i != wq_sz; i++) {
 		if (rq->mbuf[i].mbuf != NULL) {
 			bus_dmamap_unload(rq->dma_tag, rq->mbuf[i].dma_map);
 			m_freem(rq->mbuf[i].mbuf);
 		}
 		bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map);
 	}
 	free(rq->mbuf, M_MLX5EN);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 }
 
 static int
 mlx5e_enable_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param)
 {
 	struct mlx5e_channel *c = rq->channel;
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 
 	void *in;
 	void *rqc;
 	void *wq;
 	int inlen;
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(create_rq_in) +
 	    sizeof(u64) * rq->wq_ctrl.buf.npages;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
 	wq = MLX5_ADDR_OF(rqc, rqc, wq);
 
 	memcpy(rqc, param->rqc, sizeof(param->rqc));
 
 	MLX5_SET(rqc, rqc, cqn, c->rq.cq.mcq.cqn);
 	MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
 	MLX5_SET(rqc, rqc, flush_in_error_en, 1);
 	if (priv->counter_set_id >= 0)
 		MLX5_SET(rqc, rqc, counter_set_id, priv->counter_set_id);
 	MLX5_SET(wq, wq, log_wq_pg_sz, rq->wq_ctrl.buf.page_shift -
 	    PAGE_SHIFT);
 	MLX5_SET64(wq, wq, dbr_addr, rq->wq_ctrl.db.dma);
 
 	mlx5_fill_page_array(&rq->wq_ctrl.buf,
 	    (__be64 *) MLX5_ADDR_OF(wq, wq, pas));
 
 	err = mlx5_core_create_rq(mdev, in, inlen, &rq->rqn);
 
 	kvfree(in);
 
 	return (err);
 }
 
 static int
 mlx5e_modify_rq(struct mlx5e_rq *rq, int curr_state, int next_state)
 {
 	struct mlx5e_channel *c = rq->channel;
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 
 	void *in;
 	void *rqc;
 	int inlen;
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(modify_rq_in);
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
 
 	MLX5_SET(modify_rq_in, in, rqn, rq->rqn);
 	MLX5_SET(modify_rq_in, in, rq_state, curr_state);
 	MLX5_SET(rqc, rqc, state, next_state);
 
 	err = mlx5_core_modify_rq(mdev, in, inlen);
 
 	kvfree(in);
 
 	return (err);
 }
 
 static void
 mlx5e_disable_rq(struct mlx5e_rq *rq)
 {
 	struct mlx5e_channel *c = rq->channel;
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 
 	mlx5_core_destroy_rq(mdev, rq->rqn);
 }
 
 static int
 mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq)
 {
 	struct mlx5e_channel *c = rq->channel;
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_wq_ll *wq = &rq->wq;
 	int i;
 
 	for (i = 0; i < 1000; i++) {
 		if (wq->cur_sz >= priv->params.min_rx_wqes)
 			return (0);
 
 		msleep(4);
 	}
 	return (-ETIMEDOUT);
 }
 
 static int
 mlx5e_open_rq(struct mlx5e_channel *c,
     struct mlx5e_rq_param *param,
     struct mlx5e_rq *rq)
 {
 	int err;
 
 	err = mlx5e_create_rq(c, param, rq);
 	if (err)
 		return (err);
 
 	err = mlx5e_enable_rq(rq, param);
 	if (err)
 		goto err_destroy_rq;
 
 	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
 	if (err)
 		goto err_disable_rq;
 
 	c->rq.enabled = 1;
 
 	return (0);
 
 err_disable_rq:
 	mlx5e_disable_rq(rq);
 err_destroy_rq:
 	mlx5e_destroy_rq(rq);
 
 	return (err);
 }
 
 static void
 mlx5e_close_rq(struct mlx5e_rq *rq)
 {
 	mtx_lock(&rq->mtx);
 	rq->enabled = 0;
 	callout_stop(&rq->watchdog);
 	mtx_unlock(&rq->mtx);
 
 	callout_drain(&rq->watchdog);
 
 	mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR);
 }
 
 static void
 mlx5e_close_rq_wait(struct mlx5e_rq *rq)
 {
 
 	mlx5e_disable_rq(rq);
 	mlx5e_close_cq(&rq->cq);
 	cancel_work_sync(&rq->dim.work);
 	mlx5e_destroy_rq(rq);
 }
 
 void
 mlx5e_free_sq_db(struct mlx5e_sq *sq)
 {
 	int wq_sz = mlx5_wq_cyc_get_size(&sq->wq);
 	int x;
 
 	for (x = 0; x != wq_sz; x++) {
 		if (sq->mbuf[x].mbuf != NULL) {
 			bus_dmamap_unload(sq->dma_tag, sq->mbuf[x].dma_map);
 			m_freem(sq->mbuf[x].mbuf);
 		}
 		bus_dmamap_destroy(sq->dma_tag, sq->mbuf[x].dma_map);
 	}
 	free(sq->mbuf, M_MLX5EN);
 }
 
 int
 mlx5e_alloc_sq_db(struct mlx5e_sq *sq)
 {
 	int wq_sz = mlx5_wq_cyc_get_size(&sq->wq);
 	int err;
 	int x;
 
 	sq->mbuf = malloc(wq_sz * sizeof(sq->mbuf[0]), M_MLX5EN, M_WAITOK | M_ZERO);
 
 	/* Create DMA descriptor MAPs */
 	for (x = 0; x != wq_sz; x++) {
 		err = -bus_dmamap_create(sq->dma_tag, 0, &sq->mbuf[x].dma_map);
 		if (err != 0) {
 			while (x--)
 				bus_dmamap_destroy(sq->dma_tag, sq->mbuf[x].dma_map);
 			free(sq->mbuf, M_MLX5EN);
 			return (err);
 		}
 	}
 	return (0);
 }
 
 static const char *mlx5e_sq_stats_desc[] = {
 	MLX5E_SQ_STATS(MLX5E_STATS_DESC)
 };
 
 void
 mlx5e_update_sq_inline(struct mlx5e_sq *sq)
 {
 	sq->max_inline = sq->priv->params.tx_max_inline;
 	sq->min_inline_mode = sq->priv->params.tx_min_inline_mode;
 
 	/*
 	 * Check if trust state is DSCP or if inline mode is NONE which
 	 * indicates CX-5 or newer hardware.
 	 */
 	if (sq->priv->params_ethtool.trust_state != MLX5_QPTS_TRUST_PCP ||
 	    sq->min_inline_mode == MLX5_INLINE_MODE_NONE) {
 		if (MLX5_CAP_ETH(sq->priv->mdev, wqe_vlan_insert))
 			sq->min_insert_caps = MLX5E_INSERT_VLAN | MLX5E_INSERT_NON_VLAN;
 		else
 			sq->min_insert_caps = MLX5E_INSERT_NON_VLAN;
 	} else {
 		sq->min_insert_caps = 0;
 	}
 }
 
 static void
 mlx5e_refresh_sq_inline_sub(struct mlx5e_priv *priv, struct mlx5e_channel *c)
 {
 	int i;
 
 	for (i = 0; i != c->num_tc; i++) {
 		mtx_lock(&c->sq[i].lock);
 		mlx5e_update_sq_inline(&c->sq[i]);
 		mtx_unlock(&c->sq[i].lock);
 	}
 }
 
 void
 mlx5e_refresh_sq_inline(struct mlx5e_priv *priv)
 {
 	int i;
 
 	/* check if channels are closed */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0)
 		return;
 
 	for (i = 0; i < priv->params.num_channels; i++)
 		mlx5e_refresh_sq_inline_sub(priv, &priv->channel[i]);
 }
 
 static int
 mlx5e_create_sq(struct mlx5e_channel *c,
     int tc,
     struct mlx5e_sq_param *param,
     struct mlx5e_sq *sq)
 {
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	char buffer[16];
 	void *sqc = param->sqc;
 	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
 	int err;
 
 	/* Create DMA descriptor TAG */
 	if ((err = -bus_dma_tag_create(
 	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
 	    1,				/* any alignment */
 	    0,				/* no boundary */
 	    BUS_SPACE_MAXADDR,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,		/* highaddr */
 	    NULL, NULL,			/* filter, filterarg */
 	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
 	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
 	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
 	    0,				/* flags */
 	    NULL, NULL,			/* lockfunc, lockfuncarg */
 	    &sq->dma_tag)))
 		goto done;
 
 	err = mlx5_alloc_map_uar(mdev, &sq->uar);
 	if (err)
 		goto err_free_dma_tag;
 
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
 	    &sq->wq_ctrl);
 	if (err)
 		goto err_unmap_free_uar;
 
 	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
 	sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2;
 
 	err = mlx5e_alloc_sq_db(sq);
 	if (err)
 		goto err_sq_wq_destroy;
 
 	sq->mkey_be = c->mkey_be;
 	sq->ifp = priv->ifp;
 	sq->priv = priv;
 	sq->tc = tc;
 
 	mlx5e_update_sq_inline(sq);
 
 	snprintf(buffer, sizeof(buffer), "txstat%dtc%d", c->ix, tc);
 	mlx5e_create_stats(&sq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    buffer, mlx5e_sq_stats_desc, MLX5E_SQ_STATS_NUM,
 	    sq->stats.arg);
 
 	return (0);
 
 err_sq_wq_destroy:
 	mlx5_wq_destroy(&sq->wq_ctrl);
 
 err_unmap_free_uar:
 	mlx5_unmap_free_uar(mdev, &sq->uar);
 
 err_free_dma_tag:
 	bus_dma_tag_destroy(sq->dma_tag);
 done:
 	return (err);
 }
 
 static void
 mlx5e_destroy_sq(struct mlx5e_sq *sq)
 {
 	/* destroy all sysctl nodes */
 	sysctl_ctx_free(&sq->stats.ctx);
 
 	mlx5e_free_sq_db(sq);
 	mlx5_wq_destroy(&sq->wq_ctrl);
 	mlx5_unmap_free_uar(sq->priv->mdev, &sq->uar);
 }
 
 int
 mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param,
     int tis_num)
 {
 	void *in;
 	void *sqc;
 	void *wq;
 	int inlen;
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(create_sq_in) +
 	    sizeof(u64) * sq->wq_ctrl.buf.npages;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	sqc = MLX5_ADDR_OF(create_sq_in, in, ctx);
 	wq = MLX5_ADDR_OF(sqc, sqc, wq);
 
 	memcpy(sqc, param->sqc, sizeof(param->sqc));
 
 	MLX5_SET(sqc, sqc, tis_num_0, tis_num);
 	MLX5_SET(sqc, sqc, cqn, sq->cq.mcq.cqn);
 	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST);
 	MLX5_SET(sqc, sqc, tis_lst_sz, 1);
 	MLX5_SET(sqc, sqc, flush_in_error_en, 1);
 
 	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
 	MLX5_SET(wq, wq, uar_page, sq->uar.index);
 	MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift -
 	    PAGE_SHIFT);
 	MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma);
 
 	mlx5_fill_page_array(&sq->wq_ctrl.buf,
 	    (__be64 *) MLX5_ADDR_OF(wq, wq, pas));
 
 	err = mlx5_core_create_sq(sq->priv->mdev, in, inlen, &sq->sqn);
 
 	kvfree(in);
 
 	return (err);
 }
 
 int
 mlx5e_modify_sq(struct mlx5e_sq *sq, int curr_state, int next_state)
 {
 	void *in;
 	void *sqc;
 	int inlen;
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
 
 	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
 	MLX5_SET(modify_sq_in, in, sq_state, curr_state);
 	MLX5_SET(sqc, sqc, state, next_state);
 
 	err = mlx5_core_modify_sq(sq->priv->mdev, in, inlen);
 
 	kvfree(in);
 
 	return (err);
 }
 
 void
 mlx5e_disable_sq(struct mlx5e_sq *sq)
 {
 
 	mlx5_core_destroy_sq(sq->priv->mdev, sq->sqn);
 }
 
 static int
 mlx5e_open_sq(struct mlx5e_channel *c,
     int tc,
     struct mlx5e_sq_param *param,
     struct mlx5e_sq *sq)
 {
 	int err;
 
 	err = mlx5e_create_sq(c, tc, param, sq);
 	if (err)
 		return (err);
 
 	err = mlx5e_enable_sq(sq, param, c->priv->tisn[tc]);
 	if (err)
 		goto err_destroy_sq;
 
 	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
 	if (err)
 		goto err_disable_sq;
 
 	WRITE_ONCE(sq->running, 1);
 
 	return (0);
 
 err_disable_sq:
 	mlx5e_disable_sq(sq);
 err_destroy_sq:
 	mlx5e_destroy_sq(sq);
 
 	return (err);
 }
 
 static void
 mlx5e_sq_send_nops_locked(struct mlx5e_sq *sq, int can_sleep)
 {
 	/* fill up remainder with NOPs */
 	while (sq->cev_counter != 0) {
 		while (!mlx5e_sq_has_room_for(sq, 1)) {
 			if (can_sleep != 0) {
 				mtx_unlock(&sq->lock);
 				msleep(4);
 				mtx_lock(&sq->lock);
 			} else {
 				goto done;
 			}
 		}
 		/* send a single NOP */
 		mlx5e_send_nop(sq, 1);
 		atomic_thread_fence_rel();
 	}
 done:
 	/* Check if we need to write the doorbell */
 	if (likely(sq->doorbell.d64 != 0)) {
 		mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0);
 		sq->doorbell.d64 = 0;
 	}
 }
 
 void
 mlx5e_sq_cev_timeout(void *arg)
 {
 	struct mlx5e_sq *sq = arg;
 
 	mtx_assert(&sq->lock, MA_OWNED);
 
 	/* check next state */
 	switch (sq->cev_next_state) {
 	case MLX5E_CEV_STATE_SEND_NOPS:
 		/* fill TX ring with NOPs, if any */
 		mlx5e_sq_send_nops_locked(sq, 0);
 
 		/* check if completed */
 		if (sq->cev_counter == 0) {
 			sq->cev_next_state = MLX5E_CEV_STATE_INITIAL;
 			return;
 		}
 		break;
 	default:
 		/* send NOPs on next timeout */
 		sq->cev_next_state = MLX5E_CEV_STATE_SEND_NOPS;
 		break;
 	}
 
 	/* restart timer */
 	callout_reset_curcpu(&sq->cev_callout, hz, mlx5e_sq_cev_timeout, sq);
 }
 
 void
 mlx5e_drain_sq(struct mlx5e_sq *sq)
 {
 	int error;
 	struct mlx5_core_dev *mdev= sq->priv->mdev;
 
 	/*
 	 * Check if already stopped.
 	 *
 	 * NOTE: Serialization of this function is managed by the
 	 * caller ensuring the priv's state lock is locked or in case
 	 * of rate limit support, a single thread manages drain and
 	 * resume of SQs. The "running" variable can therefore safely
 	 * be read without any locks.
 	 */
 	if (READ_ONCE(sq->running) == 0)
 		return;
 
 	/* don't put more packets into the SQ */
 	WRITE_ONCE(sq->running, 0);
 
 	/* serialize access to DMA rings */
 	mtx_lock(&sq->lock);
 
 	/* teardown event factor timer, if any */
 	sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
 	callout_stop(&sq->cev_callout);
 
 	/* send dummy NOPs in order to flush the transmit ring */
 	mlx5e_sq_send_nops_locked(sq, 1);
 	mtx_unlock(&sq->lock);
 
 	/* make sure it is safe to free the callout */
 	callout_drain(&sq->cev_callout);
 
 	/* wait till SQ is empty or link is down */
 	mtx_lock(&sq->lock);
 	while (sq->cc != sq->pc &&
 	    (sq->priv->media_status_last & IFM_ACTIVE) != 0 &&
 	    mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) {
 		mtx_unlock(&sq->lock);
 		msleep(1);
 		sq->cq.mcq.comp(&sq->cq.mcq);
 		mtx_lock(&sq->lock);
 	}
 	mtx_unlock(&sq->lock);
 
 	/* error out remaining requests */
 	error = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
 	if (error != 0) {
 		if_printf(sq->ifp,
 		    "mlx5e_modify_sq() from RDY to ERR failed: %d\n", error);
 	}
 
 	/* wait till SQ is empty */
 	mtx_lock(&sq->lock);
 	while (sq->cc != sq->pc &&
 	       mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) {
 		mtx_unlock(&sq->lock);
 		msleep(1);
 		sq->cq.mcq.comp(&sq->cq.mcq);
 		mtx_lock(&sq->lock);
 	}
 	mtx_unlock(&sq->lock);
 }
 
 static void
 mlx5e_close_sq_wait(struct mlx5e_sq *sq)
 {
 
 	mlx5e_drain_sq(sq);
 	mlx5e_disable_sq(sq);
 	mlx5e_destroy_sq(sq);
 }
 
 static int
 mlx5e_create_cq(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param,
     struct mlx5e_cq *cq,
     mlx5e_cq_comp_t *comp,
     int eq_ix)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5_core_cq *mcq = &cq->mcq;
 	int eqn_not_used;
 	int irqn;
 	int err;
 	u32 i;
 
 	param->wq.buf_numa_node = 0;
 	param->wq.db_numa_node = 0;
 
 	err = mlx5_cqwq_create(mdev, &param->wq, param->cqc, &cq->wq,
 	    &cq->wq_ctrl);
 	if (err)
 		return (err);
 
 	mlx5_vector2eqn(mdev, eq_ix, &eqn_not_used, &irqn);
 
 	mcq->cqe_sz = 64;
 	mcq->set_ci_db = cq->wq_ctrl.db.db;
 	mcq->arm_db = cq->wq_ctrl.db.db + 1;
 	*mcq->set_ci_db = 0;
 	*mcq->arm_db = 0;
 	mcq->vector = eq_ix;
 	mcq->comp = comp;
 	mcq->event = mlx5e_cq_error_event;
 	mcq->irqn = irqn;
 	mcq->uar = &priv->cq_uar;
 
 	for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
 		struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
 
 		cqe->op_own = 0xf1;
 	}
 
 	cq->priv = priv;
 
 	return (0);
 }
 
 static void
 mlx5e_destroy_cq(struct mlx5e_cq *cq)
 {
 	mlx5_wq_destroy(&cq->wq_ctrl);
 }
 
 static int
 mlx5e_enable_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param, int eq_ix)
 {
 	struct mlx5_core_cq *mcq = &cq->mcq;
 	void *in;
 	void *cqc;
 	int inlen;
 	int irqn_not_used;
 	int eqn;
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
 	    sizeof(u64) * cq->wq_ctrl.buf.npages;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
 
 	memcpy(cqc, param->cqc, sizeof(param->cqc));
 
 	mlx5_fill_page_array(&cq->wq_ctrl.buf,
 	    (__be64 *) MLX5_ADDR_OF(create_cq_in, in, pas));
 
 	mlx5_vector2eqn(cq->priv->mdev, eq_ix, &eqn, &irqn_not_used);
 
 	MLX5_SET(cqc, cqc, c_eqn, eqn);
 	MLX5_SET(cqc, cqc, uar_page, mcq->uar->index);
 	MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
 	    PAGE_SHIFT);
 	MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma);
 
 	err = mlx5_core_create_cq(cq->priv->mdev, mcq, in, inlen);
 
 	kvfree(in);
 
 	if (err)
 		return (err);
 
 	mlx5e_cq_arm(cq, MLX5_GET_DOORBELL_LOCK(&cq->priv->doorbell_lock));
 
 	return (0);
 }
 
 static void
 mlx5e_disable_cq(struct mlx5e_cq *cq)
 {
 
 	mlx5_core_destroy_cq(cq->priv->mdev, &cq->mcq);
 }
 
 int
 mlx5e_open_cq(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param,
     struct mlx5e_cq *cq,
     mlx5e_cq_comp_t *comp,
     int eq_ix)
 {
 	int err;
 
 	err = mlx5e_create_cq(priv, param, cq, comp, eq_ix);
 	if (err)
 		return (err);
 
 	err = mlx5e_enable_cq(cq, param, eq_ix);
 	if (err)
 		goto err_destroy_cq;
 
 	return (0);
 
 err_destroy_cq:
 	mlx5e_destroy_cq(cq);
 
 	return (err);
 }
 
 void
 mlx5e_close_cq(struct mlx5e_cq *cq)
 {
 	mlx5e_disable_cq(cq);
 	mlx5e_destroy_cq(cq);
 }
 
 static int
 mlx5e_open_tx_cqs(struct mlx5e_channel *c,
     struct mlx5e_channel_param *cparam)
 {
 	int err;
 	int tc;
 
 	for (tc = 0; tc < c->num_tc; tc++) {
 		/* open completion queue */
 		err = mlx5e_open_cq(c->priv, &cparam->tx_cq, &c->sq[tc].cq,
 		    &mlx5e_tx_cq_comp, c->ix);
 		if (err)
 			goto err_close_tx_cqs;
 	}
 	return (0);
 
 err_close_tx_cqs:
 	for (tc--; tc >= 0; tc--)
 		mlx5e_close_cq(&c->sq[tc].cq);
 
 	return (err);
 }
 
 static void
 mlx5e_close_tx_cqs(struct mlx5e_channel *c)
 {
 	int tc;
 
 	for (tc = 0; tc < c->num_tc; tc++)
 		mlx5e_close_cq(&c->sq[tc].cq);
 }
 
 static int
 mlx5e_open_sqs(struct mlx5e_channel *c,
     struct mlx5e_channel_param *cparam)
 {
 	int err;
 	int tc;
 
 	for (tc = 0; tc < c->num_tc; tc++) {
 		err = mlx5e_open_sq(c, tc, &cparam->sq, &c->sq[tc]);
 		if (err)
 			goto err_close_sqs;
 	}
 
 	return (0);
 
 err_close_sqs:
 	for (tc--; tc >= 0; tc--)
 		mlx5e_close_sq_wait(&c->sq[tc]);
 
 	return (err);
 }
 
 static void
 mlx5e_close_sqs_wait(struct mlx5e_channel *c)
 {
 	int tc;
 
 	for (tc = 0; tc < c->num_tc; tc++)
 		mlx5e_close_sq_wait(&c->sq[tc]);
 }
 
 static void
 mlx5e_chan_mtx_init(struct mlx5e_channel *c)
 {
 	int tc;
 
 	mtx_init(&c->rq.mtx, "mlx5rx", MTX_NETWORK_LOCK, MTX_DEF);
 
 	callout_init_mtx(&c->rq.watchdog, &c->rq.mtx, 0);
 
 	for (tc = 0; tc < c->num_tc; tc++) {
 		struct mlx5e_sq *sq = c->sq + tc;
 
 		mtx_init(&sq->lock, "mlx5tx",
 		    MTX_NETWORK_LOCK " TX", MTX_DEF);
 		mtx_init(&sq->comp_lock, "mlx5comp",
 		    MTX_NETWORK_LOCK " TX", MTX_DEF);
 
 		callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
 
 		sq->cev_factor = c->priv->params_ethtool.tx_completion_fact;
 
 		/* ensure the TX completion event factor is not zero */
 		if (sq->cev_factor == 0)
 			sq->cev_factor = 1;
 	}
 }
 
 static void
 mlx5e_chan_mtx_destroy(struct mlx5e_channel *c)
 {
 	int tc;
 
 	mtx_destroy(&c->rq.mtx);
 
 	for (tc = 0; tc < c->num_tc; tc++) {
 		mtx_destroy(&c->sq[tc].lock);
 		mtx_destroy(&c->sq[tc].comp_lock);
 	}
 }
 
 static int
 mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
     struct mlx5e_channel_param *cparam,
     struct mlx5e_channel *c)
 {
 	int err;
 
 	memset(c, 0, sizeof(*c));
 
 	c->priv = priv;
 	c->ix = ix;
 	/* setup send tag */
-	c->tag.m_snd_tag.ifp = priv->ifp;
 	c->tag.type = IF_SND_TAG_TYPE_UNLIMITED;
 	c->mkey_be = cpu_to_be32(priv->mr.key);
 	c->num_tc = priv->num_tc;
 
 	/* init mutexes */
 	mlx5e_chan_mtx_init(c);
 
 	/* open transmit completion queue */
 	err = mlx5e_open_tx_cqs(c, cparam);
 	if (err)
 		goto err_free;
 
 	/* open receive completion queue */
 	err = mlx5e_open_cq(c->priv, &cparam->rx_cq, &c->rq.cq,
 	    &mlx5e_rx_cq_comp, c->ix);
 	if (err)
 		goto err_close_tx_cqs;
 
 	err = mlx5e_open_sqs(c, cparam);
 	if (err)
 		goto err_close_rx_cq;
 
 	err = mlx5e_open_rq(c, &cparam->rq, &c->rq);
 	if (err)
 		goto err_close_sqs;
 
 	/* poll receive queue initially */
 	c->rq.cq.mcq.comp(&c->rq.cq.mcq);
 
 	return (0);
 
 err_close_sqs:
 	mlx5e_close_sqs_wait(c);
 
 err_close_rx_cq:
 	mlx5e_close_cq(&c->rq.cq);
 
 err_close_tx_cqs:
 	mlx5e_close_tx_cqs(c);
 
 err_free:
 	/* destroy mutexes */
 	mlx5e_chan_mtx_destroy(c);
 	return (err);
 }
 
 static void
 mlx5e_close_channel(struct mlx5e_channel *c)
 {
 	mlx5e_close_rq(&c->rq);
 }
 
 static void
 mlx5e_close_channel_wait(struct mlx5e_channel *c)
 {
 	mlx5e_close_rq_wait(&c->rq);
 	mlx5e_close_sqs_wait(c);
 	mlx5e_close_tx_cqs(c);
 	/* destroy mutexes */
 	mlx5e_chan_mtx_destroy(c);
 }
 
 static int
 mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs)
 {
 	u32 r, n;
 
 	r = priv->params.hw_lro_en ? priv->params.lro_wqe_sz :
 	    MLX5E_SW2MB_MTU(priv->ifp->if_mtu);
 	if (r > MJUM16BYTES)
 		return (-ENOMEM);
 
 	if (r > MJUM9BYTES)
 		r = MJUM16BYTES;
 	else if (r > MJUMPAGESIZE)
 		r = MJUM9BYTES;
 	else if (r > MCLBYTES)
 		r = MJUMPAGESIZE;
 	else
 		r = MCLBYTES;
 
 	/*
 	 * n + 1 must be a power of two, because stride size must be.
 	 * Stride size is 16 * (n + 1), as the first segment is
 	 * control.
 	 */
 	for (n = howmany(r, MLX5E_MAX_RX_BYTES); !powerof2(n + 1); n++)
 		;
 
 	if (n > MLX5E_MAX_BUSDMA_RX_SEGS)
 		return (-ENOMEM);
 
 	*wqe_sz = r;
 	*nsegs = n;
 	return (0);
 }
 
 static void
 mlx5e_build_rq_param(struct mlx5e_priv *priv,
     struct mlx5e_rq_param *param)
 {
 	void *rqc = param->rqc;
 	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
 	u32 wqe_sz, nsegs;
 
 	mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
 	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
 	MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) +
 	    nsegs * sizeof(struct mlx5_wqe_data_seg)));
 	MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size);
 	MLX5_SET(wq, wq, pd, priv->pdn);
 
 	param->wq.buf_numa_node = 0;
 	param->wq.db_numa_node = 0;
 	param->wq.linear = 1;
 }
 
 static void
 mlx5e_build_sq_param(struct mlx5e_priv *priv,
     struct mlx5e_sq_param *param)
 {
 	void *sqc = param->sqc;
 	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
 
 	MLX5_SET(wq, wq, log_wq_sz, priv->params.log_sq_size);
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
 	MLX5_SET(wq, wq, pd, priv->pdn);
 
 	param->wq.buf_numa_node = 0;
 	param->wq.db_numa_node = 0;
 	param->wq.linear = 1;
 }
 
 static void
 mlx5e_build_common_cq_param(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param)
 {
 	void *cqc = param->cqc;
 
 	MLX5_SET(cqc, cqc, uar_page, priv->cq_uar.index);
 }
 
 static void
 mlx5e_get_default_profile(struct mlx5e_priv *priv, int mode, struct net_dim_cq_moder *ptr)
 {
 
 	*ptr = net_dim_get_profile(mode, MLX5E_DIM_DEFAULT_PROFILE);
 
 	/* apply LRO restrictions */
 	if (priv->params.hw_lro_en &&
 	    ptr->pkts > MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO) {
 		ptr->pkts = MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO;
 	}
 }
 
 static void
 mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param)
 {
 	struct net_dim_cq_moder curr;
 	void *cqc = param->cqc;
 
 	/*
 	 * We use MLX5_CQE_FORMAT_HASH because the RX hash mini CQE
 	 * format is more beneficial for FreeBSD use case.
 	 *
 	 * Adding support for MLX5_CQE_FORMAT_CSUM will require changes
 	 * in mlx5e_decompress_cqe.
 	 */
 	if (priv->params.cqe_zipping_en) {
 		MLX5_SET(cqc, cqc, mini_cqe_res_format, MLX5_CQE_FORMAT_HASH);
 		MLX5_SET(cqc, cqc, cqe_compression_en, 1);
 	}
 
 	MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_rq_size);
 
 	switch (priv->params.rx_cq_moderation_mode) {
 	case 0:
 		MLX5_SET(cqc, cqc, cq_period, priv->params.rx_cq_moderation_usec);
 		MLX5_SET(cqc, cqc, cq_max_count, priv->params.rx_cq_moderation_pkts);
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	case 1:
 		MLX5_SET(cqc, cqc, cq_period, priv->params.rx_cq_moderation_usec);
 		MLX5_SET(cqc, cqc, cq_max_count, priv->params.rx_cq_moderation_pkts);
 		if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe))
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
 		else
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	case 2:
 		mlx5e_get_default_profile(priv, NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE, &curr);
 		MLX5_SET(cqc, cqc, cq_period, curr.usec);
 		MLX5_SET(cqc, cqc, cq_max_count, curr.pkts);
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	case 3:
 		mlx5e_get_default_profile(priv, NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE, &curr);
 		MLX5_SET(cqc, cqc, cq_period, curr.usec);
 		MLX5_SET(cqc, cqc, cq_max_count, curr.pkts);
 		if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe))
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
 		else
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	default:
 		break;
 	}
 
 	mlx5e_dim_build_cq_param(priv, param);
 
 	mlx5e_build_common_cq_param(priv, param);
 }
 
 static void
 mlx5e_build_tx_cq_param(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param)
 {
 	void *cqc = param->cqc;
 
 	MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_sq_size);
 	MLX5_SET(cqc, cqc, cq_period, priv->params.tx_cq_moderation_usec);
 	MLX5_SET(cqc, cqc, cq_max_count, priv->params.tx_cq_moderation_pkts);
 
 	switch (priv->params.tx_cq_moderation_mode) {
 	case 0:
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	default:
 		if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe))
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
 		else
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	}
 
 	mlx5e_build_common_cq_param(priv, param);
 }
 
 static void
 mlx5e_build_channel_param(struct mlx5e_priv *priv,
     struct mlx5e_channel_param *cparam)
 {
 	memset(cparam, 0, sizeof(*cparam));
 
 	mlx5e_build_rq_param(priv, &cparam->rq);
 	mlx5e_build_sq_param(priv, &cparam->sq);
 	mlx5e_build_rx_cq_param(priv, &cparam->rx_cq);
 	mlx5e_build_tx_cq_param(priv, &cparam->tx_cq);
 }
 
 static int
 mlx5e_open_channels(struct mlx5e_priv *priv)
 {
 	struct mlx5e_channel_param cparam;
 	int err;
 	int i;
 	int j;
 
 	mlx5e_build_channel_param(priv, &cparam);
 	for (i = 0; i < priv->params.num_channels; i++) {
 		err = mlx5e_open_channel(priv, i, &cparam, &priv->channel[i]);
 		if (err)
 			goto err_close_channels;
 	}
 
 	for (j = 0; j < priv->params.num_channels; j++) {
 		err = mlx5e_wait_for_min_rx_wqes(&priv->channel[j].rq);
 		if (err)
 			goto err_close_channels;
 	}
 	return (0);
 
 err_close_channels:
 	while (i--) {
 		mlx5e_close_channel(&priv->channel[i]);
 		mlx5e_close_channel_wait(&priv->channel[i]);
 	}
 	return (err);
 }
 
 static void
 mlx5e_close_channels(struct mlx5e_priv *priv)
 {
 	int i;
 
 	for (i = 0; i < priv->params.num_channels; i++)
 		mlx5e_close_channel(&priv->channel[i]);
 	for (i = 0; i < priv->params.num_channels; i++)
 		mlx5e_close_channel_wait(&priv->channel[i]);
 }
 
 static int
 mlx5e_refresh_sq_params(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
 {
 
 	if (MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) {
 		uint8_t cq_mode;
 
 		switch (priv->params.tx_cq_moderation_mode) {
 		case 0:
 		case 2:
 			cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
 			break;
 		default:
 			cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE;
 			break;
 		}
 
 		return (mlx5_core_modify_cq_moderation_mode(priv->mdev, &sq->cq.mcq,
 		    priv->params.tx_cq_moderation_usec,
 		    priv->params.tx_cq_moderation_pkts,
 		    cq_mode));
 	}
 
 	return (mlx5_core_modify_cq_moderation(priv->mdev, &sq->cq.mcq,
 	    priv->params.tx_cq_moderation_usec,
 	    priv->params.tx_cq_moderation_pkts));
 }
 
 static int
 mlx5e_refresh_rq_params(struct mlx5e_priv *priv, struct mlx5e_rq *rq)
 {
 
 	if (MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) {
 		uint8_t cq_mode;
 		uint8_t dim_mode;
 		int retval;
 
 		switch (priv->params.rx_cq_moderation_mode) {
 		case 0:
 		case 2:
 			cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
 			dim_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 			break;
 		default:
 			cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE;
 			dim_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE;
 			break;
 		}
 
 		/* tear down dynamic interrupt moderation */
 		mtx_lock(&rq->mtx);
 		rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED;
 		mtx_unlock(&rq->mtx);
 
 		/* wait for dynamic interrupt moderation work task, if any */
 		cancel_work_sync(&rq->dim.work);
 
 		if (priv->params.rx_cq_moderation_mode >= 2) {
 			struct net_dim_cq_moder curr;
 
 			mlx5e_get_default_profile(priv, dim_mode, &curr);
 
 			retval = mlx5_core_modify_cq_moderation_mode(priv->mdev, &rq->cq.mcq,
 			    curr.usec, curr.pkts, cq_mode);
 
 			/* set dynamic interrupt moderation mode and zero defaults */
 			mtx_lock(&rq->mtx);
 			rq->dim.mode = dim_mode;
 			rq->dim.state = 0;
 			rq->dim.profile_ix = MLX5E_DIM_DEFAULT_PROFILE;
 			mtx_unlock(&rq->mtx);
 		} else {
 			retval = mlx5_core_modify_cq_moderation_mode(priv->mdev, &rq->cq.mcq,
 			    priv->params.rx_cq_moderation_usec,
 			    priv->params.rx_cq_moderation_pkts,
 			    cq_mode);
 		}
 		return (retval);
 	}
 
 	return (mlx5_core_modify_cq_moderation(priv->mdev, &rq->cq.mcq,
 	    priv->params.rx_cq_moderation_usec,
 	    priv->params.rx_cq_moderation_pkts));
 }
 
 static int
 mlx5e_refresh_channel_params_sub(struct mlx5e_priv *priv, struct mlx5e_channel *c)
 {
 	int err;
 	int i;
 
 	err = mlx5e_refresh_rq_params(priv, &c->rq);
 	if (err)
 		goto done;
 
 	for (i = 0; i != c->num_tc; i++) {
 		err = mlx5e_refresh_sq_params(priv, &c->sq[i]);
 		if (err)
 			goto done;
 	}
 done:
 	return (err);
 }
 
 int
 mlx5e_refresh_channel_params(struct mlx5e_priv *priv)
 {
 	int i;
 
 	/* check if channels are closed */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0)
 		return (EINVAL);
 
 	for (i = 0; i < priv->params.num_channels; i++) {
 		int err;
 
 		err = mlx5e_refresh_channel_params_sub(priv, &priv->channel[i]);
 		if (err)
 			return (err);
 	}
 	return (0);
 }
 
 static int
 mlx5e_open_tis(struct mlx5e_priv *priv, int tc)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
 	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
 
 	memset(in, 0, sizeof(in));
 
 	MLX5_SET(tisc, tisc, prio, tc);
 	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);
 
 	return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->tisn[tc]));
 }
 
 static void
 mlx5e_close_tis(struct mlx5e_priv *priv, int tc)
 {
 	mlx5_core_destroy_tis(priv->mdev, priv->tisn[tc]);
 }
 
 static int
 mlx5e_open_tises(struct mlx5e_priv *priv)
 {
 	int num_tc = priv->num_tc;
 	int err;
 	int tc;
 
 	for (tc = 0; tc < num_tc; tc++) {
 		err = mlx5e_open_tis(priv, tc);
 		if (err)
 			goto err_close_tises;
 	}
 
 	return (0);
 
 err_close_tises:
 	for (tc--; tc >= 0; tc--)
 		mlx5e_close_tis(priv, tc);
 
 	return (err);
 }
 
 static void
 mlx5e_close_tises(struct mlx5e_priv *priv)
 {
 	int num_tc = priv->num_tc;
 	int tc;
 
 	for (tc = 0; tc < num_tc; tc++)
 		mlx5e_close_tis(priv, tc);
 }
 
 static int
 mlx5e_open_rqt(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 *in;
 	u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {0};
 	void *rqtc;
 	int inlen;
 	int err;
 	int sz;
 	int i;
 
 	sz = 1 << priv->params.rx_hash_log_tbl_sz;
 
 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
 
 	MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
 	MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
 
 	for (i = 0; i < sz; i++) {
 		int ix = i;
 #ifdef RSS
 		ix = rss_get_indirection_to_bucket(ix);
 #endif
 		/* ensure we don't overflow */
 		ix %= priv->params.num_channels;
 
 		/* apply receive side scaling stride, if any */
 		ix -= ix % (int)priv->params.channels_rsss;
 
 		MLX5_SET(rqtc, rqtc, rq_num[i], priv->channel[ix].rq.rqn);
 	}
 
 	MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT);
 
 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
 	if (!err)
 		priv->rqtn = MLX5_GET(create_rqt_out, out, rqtn);
 
 	kvfree(in);
 
 	return (err);
 }
 
 static void
 mlx5e_close_rqt(struct mlx5e_priv *priv)
 {
 	u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {0};
 	u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)] = {0};
 
 	MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT);
 	MLX5_SET(destroy_rqt_in, in, rqtn, priv->rqtn);
 
 	mlx5_cmd_exec(priv->mdev, in, sizeof(in), out, sizeof(out));
 }
 
 static void
 mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 * tirc, int tt)
 {
 	void *hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
 	__be32 *hkey;
 
 	MLX5_SET(tirc, tirc, transport_domain, priv->tdn);
 
 #define	ROUGH_MAX_L2_L3_HDR_SZ 256
 
 #define	MLX5_HASH_IP     (MLX5_HASH_FIELD_SEL_SRC_IP   |\
 			  MLX5_HASH_FIELD_SEL_DST_IP)
 
 #define	MLX5_HASH_ALL    (MLX5_HASH_FIELD_SEL_SRC_IP   |\
 			  MLX5_HASH_FIELD_SEL_DST_IP   |\
 			  MLX5_HASH_FIELD_SEL_L4_SPORT |\
 			  MLX5_HASH_FIELD_SEL_L4_DPORT)
 
 #define	MLX5_HASH_IP_IPSEC_SPI	(MLX5_HASH_FIELD_SEL_SRC_IP   |\
 				 MLX5_HASH_FIELD_SEL_DST_IP   |\
 				 MLX5_HASH_FIELD_SEL_IPSEC_SPI)
 
 	if (priv->params.hw_lro_en) {
 		MLX5_SET(tirc, tirc, lro_enable_mask,
 		    MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO |
 		    MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO);
 		MLX5_SET(tirc, tirc, lro_max_msg_sz,
 		    (priv->params.lro_wqe_sz -
 		    ROUGH_MAX_L2_L3_HDR_SZ) >> 8);
 		/* TODO: add the option to choose timer value dynamically */
 		MLX5_SET(tirc, tirc, lro_timeout_period_usecs,
 		    MLX5_CAP_ETH(priv->mdev,
 		    lro_timer_supported_periods[2]));
 	}
 
 	/* setup parameters for hashing TIR type, if any */
 	switch (tt) {
 	case MLX5E_TT_ANY:
 		MLX5_SET(tirc, tirc, disp_type,
 		    MLX5_TIRC_DISP_TYPE_DIRECT);
 		MLX5_SET(tirc, tirc, inline_rqn,
 		    priv->channel[0].rq.rqn);
 		break;
 	default:
 		MLX5_SET(tirc, tirc, disp_type,
 		    MLX5_TIRC_DISP_TYPE_INDIRECT);
 		MLX5_SET(tirc, tirc, indirect_table,
 		    priv->rqtn);
 		MLX5_SET(tirc, tirc, rx_hash_fn,
 		    MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ);
 		hkey = (__be32 *) MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
 #ifdef RSS
 		/*
 		 * The FreeBSD RSS implementation does currently not
 		 * support symmetric Toeplitz hashes:
 		 */
 		MLX5_SET(tirc, tirc, rx_hash_symmetric, 0);
 		rss_getkey((uint8_t *)hkey);
 #else
 		MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
 		hkey[0] = cpu_to_be32(0xD181C62C);
 		hkey[1] = cpu_to_be32(0xF7F4DB5B);
 		hkey[2] = cpu_to_be32(0x1983A2FC);
 		hkey[3] = cpu_to_be32(0x943E1ADB);
 		hkey[4] = cpu_to_be32(0xD9389E6B);
 		hkey[5] = cpu_to_be32(0xD1039C2C);
 		hkey[6] = cpu_to_be32(0xA74499AD);
 		hkey[7] = cpu_to_be32(0x593D56D9);
 		hkey[8] = cpu_to_be32(0xF3253C06);
 		hkey[9] = cpu_to_be32(0x2ADC1FFC);
 #endif
 		break;
 	}
 
 	switch (tt) {
 	case MLX5E_TT_IPV4_TCP:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_TCP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV4)) {
 			MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	case MLX5E_TT_IPV6_TCP:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_TCP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV6)) {
 			MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	case MLX5E_TT_IPV4_UDP:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_UDP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV4)) {
 			MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	case MLX5E_TT_IPV6_UDP:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_UDP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV6)) {
 			MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	case MLX5E_TT_IPV4_IPSEC_AH:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	case MLX5E_TT_IPV6_IPSEC_AH:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	case MLX5E_TT_IPV4_IPSEC_ESP:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	case MLX5E_TT_IPV6_IPSEC_ESP:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	case MLX5E_TT_IPV4:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_IP);
 		break;
 
 	case MLX5E_TT_IPV6:
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
 		    MLX5_HASH_IP);
 		break;
 
 	default:
 		break;
 	}
 }
 
 static int
 mlx5e_open_tir(struct mlx5e_priv *priv, int tt)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 *in;
 	void *tirc;
 	int inlen;
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(create_tir_in);
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 	tirc = MLX5_ADDR_OF(create_tir_in, in, tir_context);
 
 	mlx5e_build_tir_ctx(priv, tirc, tt);
 
 	err = mlx5_core_create_tir(mdev, in, inlen, &priv->tirn[tt]);
 
 	kvfree(in);
 
 	return (err);
 }
 
 static void
 mlx5e_close_tir(struct mlx5e_priv *priv, int tt)
 {
 	mlx5_core_destroy_tir(priv->mdev, priv->tirn[tt]);
 }
 
 static int
 mlx5e_open_tirs(struct mlx5e_priv *priv)
 {
 	int err;
 	int i;
 
 	for (i = 0; i < MLX5E_NUM_TT; i++) {
 		err = mlx5e_open_tir(priv, i);
 		if (err)
 			goto err_close_tirs;
 	}
 
 	return (0);
 
 err_close_tirs:
 	for (i--; i >= 0; i--)
 		mlx5e_close_tir(priv, i);
 
 	return (err);
 }
 
 static void
 mlx5e_close_tirs(struct mlx5e_priv *priv)
 {
 	int i;
 
 	for (i = 0; i < MLX5E_NUM_TT; i++)
 		mlx5e_close_tir(priv, i);
 }
 
 /*
  * SW MTU does not include headers,
  * HW MTU includes all headers and checksums.
  */
 static int
 mlx5e_set_dev_port_mtu(struct ifnet *ifp, int sw_mtu)
 {
 	struct mlx5e_priv *priv = ifp->if_softc;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	int hw_mtu;
 	int err;
 
 	hw_mtu = MLX5E_SW2HW_MTU(sw_mtu);
 
 	err = mlx5_set_port_mtu(mdev, hw_mtu);
 	if (err) {
 		if_printf(ifp, "%s: mlx5_set_port_mtu failed setting %d, err=%d\n",
 		    __func__, sw_mtu, err);
 		return (err);
 	}
 
 	/* Update vport context MTU */
 	err = mlx5_set_vport_mtu(mdev, hw_mtu);
 	if (err) {
 		if_printf(ifp, "%s: Failed updating vport context with MTU size, err=%d\n",
 		    __func__, err);
 	}
 
 	ifp->if_mtu = sw_mtu;
 
 	err = mlx5_query_vport_mtu(mdev, &hw_mtu);
 	if (err || !hw_mtu) {
 		/* fallback to port oper mtu */
 		err = mlx5_query_port_oper_mtu(mdev, &hw_mtu);
 	}
 	if (err) {
 		if_printf(ifp, "Query port MTU, after setting new "
 		    "MTU value, failed\n");
 		return (err);
 	} else if (MLX5E_HW2SW_MTU(hw_mtu) < sw_mtu) {
 		err = -E2BIG,
 		if_printf(ifp, "Port MTU %d is smaller than "
                     "ifp mtu %d\n", hw_mtu, sw_mtu);
 	} else if (MLX5E_HW2SW_MTU(hw_mtu) > sw_mtu) {
 		err = -EINVAL;
                 if_printf(ifp, "Port MTU %d is bigger than "
                     "ifp mtu %d\n", hw_mtu, sw_mtu);
 	}
 	priv->params_ethtool.hw_mtu = hw_mtu;
 
 	return (err);
 }
 
 int
 mlx5e_open_locked(struct ifnet *ifp)
 {
 	struct mlx5e_priv *priv = ifp->if_softc;
 	int err;
 	u16 set_id;
 
 	/* check if already opened */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0)
 		return (0);
 
 #ifdef RSS
 	if (rss_getnumbuckets() > priv->params.num_channels) {
 		if_printf(ifp, "NOTE: There are more RSS buckets(%u) than "
 		    "channels(%u) available\n", rss_getnumbuckets(),
 		    priv->params.num_channels);
 	}
 #endif
 	err = mlx5e_open_tises(priv);
 	if (err) {
 		if_printf(ifp, "%s: mlx5e_open_tises failed, %d\n",
 		    __func__, err);
 		return (err);
 	}
 	err = mlx5_vport_alloc_q_counter(priv->mdev,
 	    MLX5_INTERFACE_PROTOCOL_ETH, &set_id);
 	if (err) {
 		if_printf(priv->ifp,
 		    "%s: mlx5_vport_alloc_q_counter failed: %d\n",
 		    __func__, err);
 		goto err_close_tises;
 	}
 	/* store counter set ID */
 	priv->counter_set_id = set_id;
 
 	err = mlx5e_open_channels(priv);
 	if (err) {
 		if_printf(ifp, "%s: mlx5e_open_channels failed, %d\n",
 		    __func__, err);
 		goto err_dalloc_q_counter;
 	}
 	err = mlx5e_open_rqt(priv);
 	if (err) {
 		if_printf(ifp, "%s: mlx5e_open_rqt failed, %d\n",
 		    __func__, err);
 		goto err_close_channels;
 	}
 	err = mlx5e_open_tirs(priv);
 	if (err) {
 		if_printf(ifp, "%s: mlx5e_open_tir failed, %d\n",
 		    __func__, err);
 		goto err_close_rqls;
 	}
 	err = mlx5e_open_flow_table(priv);
 	if (err) {
 		if_printf(ifp, "%s: mlx5e_open_flow_table failed, %d\n",
 		    __func__, err);
 		goto err_close_tirs;
 	}
 	err = mlx5e_add_all_vlan_rules(priv);
 	if (err) {
 		if_printf(ifp, "%s: mlx5e_add_all_vlan_rules failed, %d\n",
 		    __func__, err);
 		goto err_close_flow_table;
 	}
 	set_bit(MLX5E_STATE_OPENED, &priv->state);
 
 	mlx5e_update_carrier(priv);
 	mlx5e_set_rx_mode_core(priv);
 
 	return (0);
 
 err_close_flow_table:
 	mlx5e_close_flow_table(priv);
 
 err_close_tirs:
 	mlx5e_close_tirs(priv);
 
 err_close_rqls:
 	mlx5e_close_rqt(priv);
 
 err_close_channels:
 	mlx5e_close_channels(priv);
 
 err_dalloc_q_counter:
 	mlx5_vport_dealloc_q_counter(priv->mdev,
 	    MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id);
 
 err_close_tises:
 	mlx5e_close_tises(priv);
 
 	return (err);
 }
 
 static void
 mlx5e_open(void *arg)
 {
 	struct mlx5e_priv *priv = arg;
 
 	PRIV_LOCK(priv);
 	if (mlx5_set_port_status(priv->mdev, MLX5_PORT_UP))
 		if_printf(priv->ifp,
 		    "%s: Setting port status to up failed\n",
 		    __func__);
 
 	mlx5e_open_locked(priv->ifp);
 	priv->ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	PRIV_UNLOCK(priv);
 }
 
 int
 mlx5e_close_locked(struct ifnet *ifp)
 {
 	struct mlx5e_priv *priv = ifp->if_softc;
 
 	/* check if already closed */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0)
 		return (0);
 
 	clear_bit(MLX5E_STATE_OPENED, &priv->state);
 
 	mlx5e_set_rx_mode_core(priv);
 	mlx5e_del_all_vlan_rules(priv);
 	if_link_state_change(priv->ifp, LINK_STATE_DOWN);
 	mlx5e_close_flow_table(priv);
 	mlx5e_close_tirs(priv);
 	mlx5e_close_rqt(priv);
 	mlx5e_close_channels(priv);
 	mlx5_vport_dealloc_q_counter(priv->mdev,
 	    MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id);
 	mlx5e_close_tises(priv);
 
 	return (0);
 }
 
 #if (__FreeBSD_version >= 1100000)
 static uint64_t
 mlx5e_get_counter(struct ifnet *ifp, ift_counter cnt)
 {
 	struct mlx5e_priv *priv = ifp->if_softc;
 	u64 retval;
 
 	/* PRIV_LOCK(priv); XXX not allowed */
 	switch (cnt) {
 	case IFCOUNTER_IPACKETS:
 		retval = priv->stats.vport.rx_packets;
 		break;
 	case IFCOUNTER_IERRORS:
 		retval = priv->stats.pport.in_range_len_errors +
 		    priv->stats.pport.out_of_range_len +
 		    priv->stats.pport.too_long_errors +
 		    priv->stats.pport.check_seq_err +
 		    priv->stats.pport.alignment_err;
 		break;
 	case IFCOUNTER_IQDROPS:
 		retval = priv->stats.vport.rx_out_of_buffer;
 		break;
 	case IFCOUNTER_OPACKETS:
 		retval = priv->stats.vport.tx_packets;
 		break;
 	case IFCOUNTER_OERRORS:
 		retval = priv->stats.port_stats_debug.out_discards;
 		break;
 	case IFCOUNTER_IBYTES:
 		retval = priv->stats.vport.rx_bytes;
 		break;
 	case IFCOUNTER_OBYTES:
 		retval = priv->stats.vport.tx_bytes;
 		break;
 	case IFCOUNTER_IMCASTS:
 		retval = priv->stats.vport.rx_multicast_packets;
 		break;
 	case IFCOUNTER_OMCASTS:
 		retval = priv->stats.vport.tx_multicast_packets;
 		break;
 	case IFCOUNTER_OQDROPS:
 		retval = priv->stats.vport.tx_queue_dropped;
 		break;
 	case IFCOUNTER_COLLISIONS:
 		retval = priv->stats.pport.collisions;
 		break;
 	default:
 		retval = if_get_counter_default(ifp, cnt);
 		break;
 	}
 	/* PRIV_UNLOCK(priv); XXX not allowed */
 	return (retval);
 }
 #endif
 
 static void
 mlx5e_set_rx_mode(struct ifnet *ifp)
 {
 	struct mlx5e_priv *priv = ifp->if_softc;
 
 	queue_work(priv->wq, &priv->set_rx_mode_work);
 }
 
 static int
 mlx5e_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
 	struct mlx5e_priv *priv;
 	struct ifreq *ifr;
 	struct ifi2creq i2c;
 	int error = 0;
 	int mask = 0;
 	int size_read = 0;
 	int module_status;
 	int module_num;
 	int max_mtu;
 	uint8_t read_addr;
 
 	priv = ifp->if_softc;
 
 	/* check if detaching */
 	if (priv == NULL || priv->gone != 0)
 		return (ENXIO);
 
 	switch (command) {
 	case SIOCSIFMTU:
 		ifr = (struct ifreq *)data;
 
 		PRIV_LOCK(priv);
 		mlx5_query_port_max_mtu(priv->mdev, &max_mtu);
 
 		if (ifr->ifr_mtu >= MLX5E_MTU_MIN &&
 		    ifr->ifr_mtu <= MIN(MLX5E_MTU_MAX, max_mtu)) {
 			int was_opened;
 
 			was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
 			if (was_opened)
 				mlx5e_close_locked(ifp);
 
 			/* set new MTU */
 			mlx5e_set_dev_port_mtu(ifp, ifr->ifr_mtu);
 
 			if (was_opened)
 				mlx5e_open_locked(ifp);
 		} else {
 			error = EINVAL;
 			if_printf(ifp, "Invalid MTU value. Min val: %d, Max val: %d\n",
 			    MLX5E_MTU_MIN, MIN(MLX5E_MTU_MAX, max_mtu));
 		}
 		PRIV_UNLOCK(priv);
 		break;
 	case SIOCSIFFLAGS:
 		if ((ifp->if_flags & IFF_UP) &&
 		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			mlx5e_set_rx_mode(ifp);
 			break;
 		}
 		PRIV_LOCK(priv);
 		if (ifp->if_flags & IFF_UP) {
 			if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 				if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0)
 					mlx5e_open_locked(ifp);
 				ifp->if_drv_flags |= IFF_DRV_RUNNING;
 				mlx5_set_port_status(priv->mdev, MLX5_PORT_UP);
 			}
 		} else {
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 				mlx5_set_port_status(priv->mdev,
 				    MLX5_PORT_DOWN);
 				if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0)
 					mlx5e_close_locked(ifp);
 				mlx5e_update_carrier(priv);
 				ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 			}
 		}
 		PRIV_UNLOCK(priv);
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		mlx5e_set_rx_mode(ifp);
 		break;
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 	case SIOCGIFXMEDIA:
 		ifr = (struct ifreq *)data;
 		error = ifmedia_ioctl(ifp, ifr, &priv->media, command);
 		break;
 	case SIOCSIFCAP:
 		ifr = (struct ifreq *)data;
 		PRIV_LOCK(priv);
 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
 
 		if (mask & IFCAP_TXCSUM) {
 			ifp->if_capenable ^= IFCAP_TXCSUM;
 			ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP);
 
 			if (IFCAP_TSO4 & ifp->if_capenable &&
 			    !(IFCAP_TXCSUM & ifp->if_capenable)) {
 				ifp->if_capenable &= ~IFCAP_TSO4;
 				ifp->if_hwassist &= ~CSUM_IP_TSO;
 				if_printf(ifp,
 				    "tso4 disabled due to -txcsum.\n");
 			}
 		}
 		if (mask & IFCAP_TXCSUM_IPV6) {
 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
 			ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6);
 
 			if (IFCAP_TSO6 & ifp->if_capenable &&
 			    !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) {
 				ifp->if_capenable &= ~IFCAP_TSO6;
 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
 				if_printf(ifp,
 				    "tso6 disabled due to -txcsum6.\n");
 			}
 		}
 		if (mask & IFCAP_RXCSUM)
 			ifp->if_capenable ^= IFCAP_RXCSUM;
 		if (mask & IFCAP_RXCSUM_IPV6)
 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
 		if (mask & IFCAP_TSO4) {
 			if (!(IFCAP_TSO4 & ifp->if_capenable) &&
 			    !(IFCAP_TXCSUM & ifp->if_capenable)) {
 				if_printf(ifp, "enable txcsum first.\n");
 				error = EAGAIN;
 				goto out;
 			}
 			ifp->if_capenable ^= IFCAP_TSO4;
 			ifp->if_hwassist ^= CSUM_IP_TSO;
 		}
 		if (mask & IFCAP_TSO6) {
 			if (!(IFCAP_TSO6 & ifp->if_capenable) &&
 			    !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) {
 				if_printf(ifp, "enable txcsum6 first.\n");
 				error = EAGAIN;
 				goto out;
 			}
 			ifp->if_capenable ^= IFCAP_TSO6;
 			ifp->if_hwassist ^= CSUM_IP6_TSO;
 		}
 		if (mask & IFCAP_VLAN_HWFILTER) {
 			if (ifp->if_capenable & IFCAP_VLAN_HWFILTER)
 				mlx5e_disable_vlan_filter(priv);
 			else
 				mlx5e_enable_vlan_filter(priv);
 
 			ifp->if_capenable ^= IFCAP_VLAN_HWFILTER;
 		}
 		if (mask & IFCAP_VLAN_HWTAGGING)
 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
 		if (mask & IFCAP_WOL_MAGIC)
 			ifp->if_capenable ^= IFCAP_WOL_MAGIC;
 
 		VLAN_CAPABILITIES(ifp);
 		/* turn off LRO means also turn of HW LRO - if it's on */
 		if (mask & IFCAP_LRO) {
 			int was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
 			bool need_restart = false;
 
 			ifp->if_capenable ^= IFCAP_LRO;
 
 			/* figure out if updating HW LRO is needed */
 			if (!(ifp->if_capenable & IFCAP_LRO)) {
 				if (priv->params.hw_lro_en) {
 					priv->params.hw_lro_en = false;
 					need_restart = true;
 				}
 			} else {
 				if (priv->params.hw_lro_en == false &&
 				    priv->params_ethtool.hw_lro != 0) {
 					priv->params.hw_lro_en = true;
 					need_restart = true;
 				}
 			}
 			if (was_opened && need_restart) {
 				mlx5e_close_locked(ifp);
 				mlx5e_open_locked(ifp);
 			}
 		}
 		if (mask & IFCAP_HWRXTSTMP) {
 			ifp->if_capenable ^= IFCAP_HWRXTSTMP;
 			if (ifp->if_capenable & IFCAP_HWRXTSTMP) {
 				if (priv->clbr_done == 0)
 					mlx5e_reset_calibration_callout(priv);
 			} else {
 				callout_drain(&priv->tstmp_clbr);
 				priv->clbr_done = 0;
 			}
 		}
 out:
 		PRIV_UNLOCK(priv);
 		break;
 
 	case SIOCGI2C:
 		ifr = (struct ifreq *)data;
 
 		/*
 		 * Copy from the user-space address ifr_data to the
 		 * kernel-space address i2c
 		 */
 		error = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
 		if (error)
 			break;
 
 		if (i2c.len > sizeof(i2c.data)) {
 			error = EINVAL;
 			break;
 		}
 
 		PRIV_LOCK(priv);
 		/* Get module_num which is required for the query_eeprom */
 		error = mlx5_query_module_num(priv->mdev, &module_num);
 		if (error) {
 			if_printf(ifp, "Query module num failed, eeprom "
 			    "reading is not supported\n");
 			error = EINVAL;
 			goto err_i2c;
 		}
 		/* Check if module is present before doing an access */
 		module_status = mlx5_query_module_status(priv->mdev, module_num);
 		if (module_status != MLX5_MODULE_STATUS_PLUGGED_ENABLED &&
 		    module_status != MLX5_MODULE_STATUS_PLUGGED_DISABLED) {
 			error = EINVAL;
 			goto err_i2c;
 		}
 		/*
 		 * Currently 0XA0 and 0xA2 are the only addresses permitted.
 		 * The internal conversion is as follows:
 		 */
 		if (i2c.dev_addr == 0xA0)
 			read_addr = MLX5E_I2C_ADDR_LOW;
 		else if (i2c.dev_addr == 0xA2)
 			read_addr = MLX5E_I2C_ADDR_HIGH;
 		else {
 			if_printf(ifp, "Query eeprom failed, "
 			    "Invalid Address: %X\n", i2c.dev_addr);
 			error = EINVAL;
 			goto err_i2c;
 		}
 		error = mlx5_query_eeprom(priv->mdev,
 		    read_addr, MLX5E_EEPROM_LOW_PAGE,
 		    (uint32_t)i2c.offset, (uint32_t)i2c.len, module_num,
 		    (uint32_t *)i2c.data, &size_read);
 		if (error) {
 			if_printf(ifp, "Query eeprom failed, eeprom "
 			    "reading is not supported\n");
 			error = EINVAL;
 			goto err_i2c;
 		}
 
 		if (i2c.len > MLX5_EEPROM_MAX_BYTES) {
 			error = mlx5_query_eeprom(priv->mdev,
 			    read_addr, MLX5E_EEPROM_LOW_PAGE,
 			    (uint32_t)(i2c.offset + size_read),
 			    (uint32_t)(i2c.len - size_read), module_num,
 			    (uint32_t *)(i2c.data + size_read), &size_read);
 		}
 		if (error) {
 			if_printf(ifp, "Query eeprom failed, eeprom "
 			    "reading is not supported\n");
 			error = EINVAL;
 			goto err_i2c;
 		}
 
 		error = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c));
 err_i2c:
 		PRIV_UNLOCK(priv);
 		break;
 
 	default:
 		error = ether_ioctl(ifp, command, data);
 		break;
 	}
 	return (error);
 }
 
 static int
 mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev)
 {
 	/*
 	 * TODO: uncoment once FW really sets all these bits if
 	 * (!mdev->caps.eth.rss_ind_tbl_cap || !mdev->caps.eth.csum_cap ||
 	 * !mdev->caps.eth.max_lso_cap || !mdev->caps.eth.vlan_cap ||
 	 * !(mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_SCQE_BRK_MOD)) return
 	 * -ENOTSUPP;
 	 */
 
 	/* TODO: add more must-to-have features */
 
 	if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
 		return (-ENODEV);
 
 	return (0);
 }
 
 static u16
 mlx5e_get_max_inline_cap(struct mlx5_core_dev *mdev)
 {
 	uint32_t bf_buf_size = (1U << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2U;
 
 	bf_buf_size -= sizeof(struct mlx5e_tx_wqe) - 2;
 
 	/* verify against driver hardware limit */
 	if (bf_buf_size > MLX5E_MAX_TX_INLINE)
 		bf_buf_size = MLX5E_MAX_TX_INLINE;
 
 	return (bf_buf_size);
 }
 
 static int
 mlx5e_build_ifp_priv(struct mlx5_core_dev *mdev,
     struct mlx5e_priv *priv,
     int num_comp_vectors)
 {
 	int err;
 
 	/*
 	 * TODO: Consider link speed for setting "log_sq_size",
 	 * "log_rq_size" and "cq_moderation_xxx":
 	 */
 	priv->params.log_sq_size =
 	    MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
 	priv->params.log_rq_size =
 	    MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
 	priv->params.rx_cq_moderation_usec =
 	    MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ?
 	    MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE :
 	    MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC;
 	priv->params.rx_cq_moderation_mode =
 	    MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? 1 : 0;
 	priv->params.rx_cq_moderation_pkts =
 	    MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS;
 	priv->params.tx_cq_moderation_usec =
 	    MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC;
 	priv->params.tx_cq_moderation_pkts =
 	    MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
 	priv->params.min_rx_wqes =
 	    MLX5E_PARAMS_DEFAULT_MIN_RX_WQES;
 	priv->params.rx_hash_log_tbl_sz =
 	    (order_base_2(num_comp_vectors) >
 	    MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ) ?
 	    order_base_2(num_comp_vectors) :
 	    MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ;
 	priv->params.num_tc = 1;
 	priv->params.default_vlan_prio = 0;
 	priv->counter_set_id = -1;
 	priv->params.tx_max_inline = mlx5e_get_max_inline_cap(mdev);
 
 	err = mlx5_query_min_inline(mdev, &priv->params.tx_min_inline_mode);
 	if (err)
 		return (err);
 
 	/*
 	 * hw lro is currently defaulted to off. when it won't anymore we
 	 * will consider the HW capability: "!!MLX5_CAP_ETH(mdev, lro_cap)"
 	 */
 	priv->params.hw_lro_en = false;
 	priv->params.lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ;
 
 	/*
 	 * CQE zipping is currently defaulted to off. when it won't
 	 * anymore we will consider the HW capability:
 	 * "!!MLX5_CAP_GEN(mdev, cqe_compression)"
 	 */
 	priv->params.cqe_zipping_en = false;
 
 	priv->mdev = mdev;
 	priv->params.num_channels = num_comp_vectors;
 	priv->params.channels_rsss = 1;
 	priv->order_base_2_num_channels = order_base_2(num_comp_vectors);
 	priv->queue_mapping_channel_mask =
 	    roundup_pow_of_two(num_comp_vectors) - 1;
 	priv->num_tc = priv->params.num_tc;
 	priv->default_vlan_prio = priv->params.default_vlan_prio;
 
 	INIT_WORK(&priv->update_stats_work, mlx5e_update_stats_work);
 	INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work);
 	INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work);
 
 	return (0);
 }
 
 static int
 mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn,
 		  struct mlx5_core_mr *mkey)
 {
 	struct ifnet *ifp = priv->ifp;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	void *mkc;
 	u32 *in;
 	int err;
 
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL) {
 		if_printf(ifp, "%s: failed to allocate inbox\n", __func__);
 		return (-ENOMEM);
 	}
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 	MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_PA);
 	MLX5_SET(mkc, mkc, lw, 1);
 	MLX5_SET(mkc, mkc, lr, 1);
 
 	MLX5_SET(mkc, mkc, pd, pdn);
 	MLX5_SET(mkc, mkc, length64, 1);
 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
 
 	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
 	if (err)
 		if_printf(ifp, "%s: mlx5_core_create_mkey failed, %d\n",
 		    __func__, err);
 
 	kvfree(in);
 	return (err);
 }
 
 static const char *mlx5e_vport_stats_desc[] = {
 	MLX5E_VPORT_STATS(MLX5E_STATS_DESC)
 };
 
 static const char *mlx5e_pport_stats_desc[] = {
 	MLX5E_PPORT_STATS(MLX5E_STATS_DESC)
 };
 
 static void
 mlx5e_priv_mtx_init(struct mlx5e_priv *priv)
 {
 	mtx_init(&priv->async_events_mtx, "mlx5async", MTX_NETWORK_LOCK, MTX_DEF);
 	sx_init(&priv->state_lock, "mlx5state");
 	callout_init_mtx(&priv->watchdog, &priv->async_events_mtx, 0);
 	MLX5_INIT_DOORBELL_LOCK(&priv->doorbell_lock);
 }
 
 static void
 mlx5e_priv_mtx_destroy(struct mlx5e_priv *priv)
 {
 	mtx_destroy(&priv->async_events_mtx);
 	sx_destroy(&priv->state_lock);
 }
 
 static int
 sysctl_firmware(SYSCTL_HANDLER_ARGS)
 {
 	/*
 	 * %d.%d%.d the string format.
 	 * fw_rev_{maj,min,sub} return u16, 2^16 = 65536.
 	 * We need at most 5 chars to store that.
 	 * It also has: two "." and NULL at the end, which means we need 18
 	 * (5*3 + 3) chars at most.
 	 */
 	char fw[18];
 	struct mlx5e_priv *priv = arg1;
 	int error;
 
 	snprintf(fw, sizeof(fw), "%d.%d.%d", fw_rev_maj(priv->mdev), fw_rev_min(priv->mdev),
 	    fw_rev_sub(priv->mdev));
 	error = sysctl_handle_string(oidp, fw, sizeof(fw), req);
 	return (error);
 }
 
 static void
 mlx5e_disable_tx_dma(struct mlx5e_channel *ch)
 {
 	int i;
 
 	for (i = 0; i < ch->num_tc; i++)
 		mlx5e_drain_sq(&ch->sq[i]);
 }
 
 static void
 mlx5e_reset_sq_doorbell_record(struct mlx5e_sq *sq)
 {
 
 	sq->doorbell.d32[0] = cpu_to_be32(MLX5_OPCODE_NOP);
 	sq->doorbell.d32[1] = cpu_to_be32(sq->sqn << 8);
 	mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0);
 	sq->doorbell.d64 = 0;
 }
 
 void
 mlx5e_resume_sq(struct mlx5e_sq *sq)
 {
 	int err;
 
 	/* check if already enabled */
 	if (READ_ONCE(sq->running) != 0)
 		return;
 
 	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_ERR,
 	    MLX5_SQC_STATE_RST);
 	if (err != 0) {
 		if_printf(sq->ifp,
 		    "mlx5e_modify_sq() from ERR to RST failed: %d\n", err);
 	}
 
 	sq->cc = 0;
 	sq->pc = 0;
 
 	/* reset doorbell prior to moving from RST to RDY */
 	mlx5e_reset_sq_doorbell_record(sq);
 
 	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST,
 	    MLX5_SQC_STATE_RDY);
 	if (err != 0) {
 		if_printf(sq->ifp,
 		    "mlx5e_modify_sq() from RST to RDY failed: %d\n", err);
 	}
 
 	sq->cev_next_state = MLX5E_CEV_STATE_INITIAL;
 	WRITE_ONCE(sq->running, 1);
 }
 
 static void
 mlx5e_enable_tx_dma(struct mlx5e_channel *ch)
 {
         int i;
 
 	for (i = 0; i < ch->num_tc; i++)
 		mlx5e_resume_sq(&ch->sq[i]);
 }
 
 static void
 mlx5e_disable_rx_dma(struct mlx5e_channel *ch)
 {
 	struct mlx5e_rq *rq = &ch->rq;
 	int err;
 
 	mtx_lock(&rq->mtx);
 	rq->enabled = 0;
 	callout_stop(&rq->watchdog);
 	mtx_unlock(&rq->mtx);
 
 	callout_drain(&rq->watchdog);
 
 	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR);
 	if (err != 0) {
 		if_printf(rq->ifp,
 		    "mlx5e_modify_rq() from RDY to RST failed: %d\n", err);
 	}
 
 	while (!mlx5_wq_ll_is_empty(&rq->wq)) {
 		msleep(1);
 		rq->cq.mcq.comp(&rq->cq.mcq);
 	}
 
 	/*
 	 * Transitioning into RST state will allow the FW to track less ERR state queues,
 	 * thus reducing the recv queue flushing time
 	 */
 	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_ERR, MLX5_RQC_STATE_RST);
 	if (err != 0) {
 		if_printf(rq->ifp,
 		    "mlx5e_modify_rq() from ERR to RST failed: %d\n", err);
 	}
 }
 
 static void
 mlx5e_enable_rx_dma(struct mlx5e_channel *ch)
 {
 	struct mlx5e_rq *rq = &ch->rq;
 	int err;
 
 	rq->wq.wqe_ctr = 0;
 	mlx5_wq_ll_update_db_record(&rq->wq);
 	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
 	if (err != 0) {
 		if_printf(rq->ifp,
 		    "mlx5e_modify_rq() from RST to RDY failed: %d\n", err);
         }
 
 	rq->enabled = 1;
 
 	rq->cq.mcq.comp(&rq->cq.mcq);
 }
 
 void
 mlx5e_modify_tx_dma(struct mlx5e_priv *priv, uint8_t value)
 {
 	int i;
 
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0)
 		return;
 
 	for (i = 0; i < priv->params.num_channels; i++) {
 		if (value)
 			mlx5e_disable_tx_dma(&priv->channel[i]);
 		else
 			mlx5e_enable_tx_dma(&priv->channel[i]);
 	}
 }
 
 void
 mlx5e_modify_rx_dma(struct mlx5e_priv *priv, uint8_t value)
 {
 	int i;
 
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0)
 		return;
 
 	for (i = 0; i < priv->params.num_channels; i++) {
 		if (value)
 			mlx5e_disable_rx_dma(&priv->channel[i]);
 		else
 			mlx5e_enable_rx_dma(&priv->channel[i]);
 	}
 }
 
 static void
 mlx5e_add_hw_stats(struct mlx5e_priv *priv)
 {
 	SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_hw),
 	    OID_AUTO, "fw_version", CTLTYPE_STRING | CTLFLAG_RD, priv, 0,
 	    sysctl_firmware, "A", "HCA firmware version");
 
 	SYSCTL_ADD_STRING(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_hw),
 	    OID_AUTO, "board_id", CTLFLAG_RD, priv->mdev->board_id, 0,
 	    "Board ID");
 }
 
 static int
 mlx5e_sysctl_tx_priority_flow_control(SYSCTL_HANDLER_ARGS)
 {
 	struct mlx5e_priv *priv = arg1;
 	uint8_t temp[MLX5E_MAX_PRIORITY];
 	uint32_t tx_pfc;
 	int err;
 	int i;
 
 	PRIV_LOCK(priv);
 
 	tx_pfc = priv->params.tx_priority_flow_control;
 
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++)
 		temp[i] = (tx_pfc >> i) & 1;
 
 	err = SYSCTL_OUT(req, temp, MLX5E_MAX_PRIORITY);
 	if (err || !req->newptr)
 		goto done;
 	err = SYSCTL_IN(req, temp, MLX5E_MAX_PRIORITY);
 	if (err)
 		goto done;
 
 	priv->params.tx_priority_flow_control = 0;
 
 	/* range check input value */
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++) {
 		if (temp[i] > 1) {
 			err = ERANGE;
 			goto done;
 		}
 		priv->params.tx_priority_flow_control |= (temp[i] << i);
 	}
 
 	/* check if update is required */
 	if (tx_pfc != priv->params.tx_priority_flow_control)
 		err = -mlx5e_set_port_pfc(priv);
 done:
 	if (err != 0)
 		priv->params.tx_priority_flow_control= tx_pfc;
 	PRIV_UNLOCK(priv);
 
 	return (err);
 }
 
 static int
 mlx5e_sysctl_rx_priority_flow_control(SYSCTL_HANDLER_ARGS)
 {
 	struct mlx5e_priv *priv = arg1;
 	uint8_t temp[MLX5E_MAX_PRIORITY];
 	uint32_t rx_pfc;
 	int err;
 	int i;
 
 	PRIV_LOCK(priv);
 
 	rx_pfc = priv->params.rx_priority_flow_control;
 
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++)
 		temp[i] = (rx_pfc >> i) & 1;
 
 	err = SYSCTL_OUT(req, temp, MLX5E_MAX_PRIORITY);
 	if (err || !req->newptr)
 		goto done;
 	err = SYSCTL_IN(req, temp, MLX5E_MAX_PRIORITY);
 	if (err)
 		goto done;
 
 	priv->params.rx_priority_flow_control = 0;
 
 	/* range check input value */
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++) {
 		if (temp[i] > 1) {
 			err = ERANGE;
 			goto done;
 		}
 		priv->params.rx_priority_flow_control |= (temp[i] << i);
 	}
 
 	/* check if update is required */
 	if (rx_pfc != priv->params.rx_priority_flow_control)
 		err = -mlx5e_set_port_pfc(priv);
 done:
 	if (err != 0)
 		priv->params.rx_priority_flow_control= rx_pfc;
 	PRIV_UNLOCK(priv);
 
 	return (err);
 }
 
 static void
 mlx5e_setup_pauseframes(struct mlx5e_priv *priv)
 {
 #if (__FreeBSD_version < 1100000)
 	char path[96];
 #endif
 	int error;
 
 	/* enable pauseframes by default */
 	priv->params.tx_pauseframe_control = 1;
 	priv->params.rx_pauseframe_control = 1;
 
 	/* disable ports flow control, PFC, by default */
 	priv->params.tx_priority_flow_control = 0;
 	priv->params.rx_priority_flow_control = 0;
 
 #if (__FreeBSD_version < 1100000)
 	/* compute path for sysctl */
 	snprintf(path, sizeof(path), "dev.mce.%d.tx_pauseframe_control",
 	    device_get_unit(priv->mdev->pdev->dev.bsddev));
 
 	/* try to fetch tunable, if any */
 	TUNABLE_INT_FETCH(path, &priv->params.tx_pauseframe_control);
 
 	/* compute path for sysctl */
 	snprintf(path, sizeof(path), "dev.mce.%d.rx_pauseframe_control",
 	    device_get_unit(priv->mdev->pdev->dev.bsddev));
 
 	/* try to fetch tunable, if any */
 	TUNABLE_INT_FETCH(path, &priv->params.rx_pauseframe_control);
 #endif
 
 	/* register pauseframe SYSCTLs */
 	SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "tx_pauseframe_control", CTLFLAG_RDTUN,
 	    &priv->params.tx_pauseframe_control, 0,
 	    "Set to enable TX pause frames. Clear to disable.");
 
 	SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "rx_pauseframe_control", CTLFLAG_RDTUN,
 	    &priv->params.rx_pauseframe_control, 0,
 	    "Set to enable RX pause frames. Clear to disable.");
 
 	/* register priority flow control, PFC, SYSCTLs */
 	SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "tx_priority_flow_control", CTLTYPE_U8 | CTLFLAG_RWTUN |
 	    CTLFLAG_MPSAFE, priv, 0, &mlx5e_sysctl_tx_priority_flow_control, "CU",
 	    "Set to enable TX ports flow control frames for priorities 0..7. Clear to disable.");
 
 	SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "rx_priority_flow_control", CTLTYPE_U8 | CTLFLAG_RWTUN |
 	    CTLFLAG_MPSAFE, priv, 0, &mlx5e_sysctl_rx_priority_flow_control, "CU",
 	    "Set to enable RX ports flow control frames for priorities 0..7. Clear to disable.");
 
 	PRIV_LOCK(priv);
 
 	/* range check */
 	priv->params.tx_pauseframe_control =
 	    priv->params.tx_pauseframe_control ? 1 : 0;
 	priv->params.rx_pauseframe_control =
 	    priv->params.rx_pauseframe_control ? 1 : 0;
 
 	/* update firmware */
 	error = mlx5e_set_port_pause_and_pfc(priv);
 	if (error == -EINVAL) {
 		if_printf(priv->ifp,
 		    "Global pauseframes must be disabled before enabling PFC.\n");
 		priv->params.rx_priority_flow_control = 0;
 		priv->params.tx_priority_flow_control = 0;
 
 		/* update firmware */
 		(void) mlx5e_set_port_pause_and_pfc(priv);
 	}
 	PRIV_UNLOCK(priv);
 }
 
 static int
 mlx5e_ul_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	struct mlx5e_priv *priv;
 	struct mlx5e_channel *pch;
 
 	priv = ifp->if_softc;
 
 	if (unlikely(priv->gone || params->hdr.flowtype == M_HASHTYPE_NONE)) {
 		return (EOPNOTSUPP);
 	} else {
 		/* keep this code synced with mlx5e_select_queue() */
 		u32 ch = priv->params.num_channels;
 #ifdef RSS
 		u32 temp;
 
 		if (rss_hash2bucket(params->hdr.flowid,
 		    params->hdr.flowtype, &temp) == 0)
 			ch = temp % ch;
 		else
 #endif
 			ch = (params->hdr.flowid % 128) % ch;
 
 		/*
 		 * NOTE: The channels array is only freed at detach
 		 * and it safe to return a pointer to the send tag
 		 * inside the channels structure as long as we
 		 * reference the priv.
 		 */
 		pch = priv->channel + ch;
 
 		/* check if send queue is not running */
 		if (unlikely(pch->sq[0].running == 0))
 			return (ENXIO);
 		mlx5e_ref_channel(priv);
+		MPASS(pch->tag.m_snd_tag.refcount == 0);
+		m_snd_tag_init(&pch->tag.m_snd_tag, ifp);
 		*ppmt = &pch->tag.m_snd_tag;
 		return (0);
 	}
 }
 
 static int
 mlx5e_ul_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
 {
 	struct mlx5e_channel *pch =
 	    container_of(pmt, struct mlx5e_channel, tag.m_snd_tag);
 
 	params->unlimited.max_rate = -1ULL;
 	params->unlimited.queue_level = mlx5e_sq_queue_level(&pch->sq[0]);
 	return (0);
 }
 
 static void
 mlx5e_ul_snd_tag_free(struct m_snd_tag *pmt)
 {
 	struct mlx5e_channel *pch =
 	    container_of(pmt, struct mlx5e_channel, tag.m_snd_tag);
 
 	mlx5e_unref_channel(pch->priv);
 }
 
 static int
 mlx5e_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 
 	switch (params->hdr.type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
 		return (mlx5e_rl_snd_tag_alloc(ifp, params, ppmt));
 #endif
 	case IF_SND_TAG_TYPE_UNLIMITED:
 		return (mlx5e_ul_snd_tag_alloc(ifp, params, ppmt));
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 static int
 mlx5e_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
 {
 	struct mlx5e_snd_tag *tag =
 	    container_of(pmt, struct mlx5e_snd_tag, m_snd_tag);
 
 	switch (tag->type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
 		return (mlx5e_rl_snd_tag_modify(pmt, params));
 #endif
 	case IF_SND_TAG_TYPE_UNLIMITED:
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 static int
 mlx5e_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
 {
 	struct mlx5e_snd_tag *tag =
 	    container_of(pmt, struct mlx5e_snd_tag, m_snd_tag);
 
 	switch (tag->type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
 		return (mlx5e_rl_snd_tag_query(pmt, params));
 #endif
 	case IF_SND_TAG_TYPE_UNLIMITED:
 		return (mlx5e_ul_snd_tag_query(pmt, params));
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 static void
 mlx5e_snd_tag_free(struct m_snd_tag *pmt)
 {
 	struct mlx5e_snd_tag *tag =
 	    container_of(pmt, struct mlx5e_snd_tag, m_snd_tag);
 
 	switch (tag->type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
 		mlx5e_rl_snd_tag_free(pmt);
 		break;
 #endif
 	case IF_SND_TAG_TYPE_UNLIMITED:
 		mlx5e_ul_snd_tag_free(pmt);
 		break;
 	default:
 		break;
 	}
 }
 
 static void *
 mlx5e_create_ifp(struct mlx5_core_dev *mdev)
 {
 	struct ifnet *ifp;
 	struct mlx5e_priv *priv;
 	u8 dev_addr[ETHER_ADDR_LEN] __aligned(4);
 	u8 connector_type;
 	struct sysctl_oid_list *child;
 	int ncv = mdev->priv.eq_table.num_comp_vectors;
 	char unit[16];
 	struct pfil_head_args pa;
 	int err;
 	int i,j;
 	u32 eth_proto_cap;
 	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
 	bool ext = 0;
 	u32 speeds_num;
 	struct media media_entry = {};
 
 	if (mlx5e_check_required_hca_cap(mdev)) {
 		mlx5_core_dbg(mdev, "mlx5e_check_required_hca_cap() failed\n");
 		return (NULL);
 	}
 	/*
 	 * Try to allocate the priv and make room for worst-case
 	 * number of channel structures:
 	 */
 	priv = malloc(sizeof(*priv) +
 	    (sizeof(priv->channel[0]) * mdev->priv.eq_table.num_comp_vectors),
 	    M_MLX5EN, M_WAITOK | M_ZERO);
 	mlx5e_priv_mtx_init(priv);
 
 	ifp = priv->ifp = if_alloc_dev(IFT_ETHER, mdev->pdev->dev.bsddev);
 	if (ifp == NULL) {
 		mlx5_core_err(mdev, "if_alloc() failed\n");
 		goto err_free_priv;
 	}
 	ifp->if_softc = priv;
 	if_initname(ifp, "mce", device_get_unit(mdev->pdev->dev.bsddev));
 	ifp->if_mtu = ETHERMTU;
 	ifp->if_init = mlx5e_open;
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_ioctl = mlx5e_ioctl;
 	ifp->if_transmit = mlx5e_xmit;
 	ifp->if_qflush = if_qflush;
 #if (__FreeBSD_version >= 1100000)
 	ifp->if_get_counter = mlx5e_get_counter;
 #endif
 	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	/*
          * Set driver features
          */
 	ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6;
 	ifp->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING;
 	ifp->if_capabilities |= IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER;
 	ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
 	ifp->if_capabilities |= IFCAP_LRO;
 	ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO;
 	ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP;
 	ifp->if_capabilities |= IFCAP_TXRTLMT;
 	ifp->if_snd_tag_alloc = mlx5e_snd_tag_alloc;
 	ifp->if_snd_tag_free = mlx5e_snd_tag_free;
 	ifp->if_snd_tag_modify = mlx5e_snd_tag_modify;
 	ifp->if_snd_tag_query = mlx5e_snd_tag_query;
 
 	/* set TSO limits so that we don't have to drop TX packets */
 	ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
 	ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */;
 	ifp->if_hw_tsomaxsegsize = MLX5E_MAX_TX_MBUF_SIZE;
 
 	ifp->if_capenable = ifp->if_capabilities;
 	ifp->if_hwassist = 0;
 	if (ifp->if_capenable & IFCAP_TSO)
 		ifp->if_hwassist |= CSUM_TSO;
 	if (ifp->if_capenable & IFCAP_TXCSUM)
 		ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP);
 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
 		ifp->if_hwassist |= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6);
 
 	/* ifnet sysctl tree */
 	sysctl_ctx_init(&priv->sysctl_ctx);
 	priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dev),
 	    OID_AUTO, ifp->if_dname, CTLFLAG_RD, 0, "MLX5 ethernet - interface name");
 	if (priv->sysctl_ifnet == NULL) {
 		mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n");
 		goto err_free_sysctl;
 	}
 	snprintf(unit, sizeof(unit), "%d", ifp->if_dunit);
 	priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, unit, CTLFLAG_RD, 0, "MLX5 ethernet - interface unit");
 	if (priv->sysctl_ifnet == NULL) {
 		mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n");
 		goto err_free_sysctl;
 	}
 
 	/* HW sysctl tree */
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(mdev->pdev->dev.bsddev));
 	priv->sysctl_hw = SYSCTL_ADD_NODE(&priv->sysctl_ctx, child,
 	    OID_AUTO, "hw", CTLFLAG_RD, 0, "MLX5 ethernet dev hw");
 	if (priv->sysctl_hw == NULL) {
 		mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n");
 		goto err_free_sysctl;
 	}
 
 	err = mlx5e_build_ifp_priv(mdev, priv, ncv);
 	if (err) {
 		mlx5_core_err(mdev, "mlx5e_build_ifp_priv() failed (%d)\n", err);
 		goto err_free_sysctl;
 	}
 
 	/* reuse mlx5core's watchdog workqueue */
 	priv->wq = mdev->priv.health.wq_watchdog;
 
 	err = mlx5_alloc_map_uar(mdev, &priv->cq_uar);
 	if (err) {
 		if_printf(ifp, "%s: mlx5_alloc_map_uar failed, %d\n",
 		    __func__, err);
 		goto err_free_wq;
 	}
 	err = mlx5_core_alloc_pd(mdev, &priv->pdn);
 	if (err) {
 		if_printf(ifp, "%s: mlx5_core_alloc_pd failed, %d\n",
 		    __func__, err);
 		goto err_unmap_free_uar;
 	}
 	err = mlx5_alloc_transport_domain(mdev, &priv->tdn);
 	if (err) {
 		if_printf(ifp, "%s: mlx5_alloc_transport_domain failed, %d\n",
 		    __func__, err);
 		goto err_dealloc_pd;
 	}
 	err = mlx5e_create_mkey(priv, priv->pdn, &priv->mr);
 	if (err) {
 		if_printf(ifp, "%s: mlx5e_create_mkey failed, %d\n",
 		    __func__, err);
 		goto err_dealloc_transport_domain;
 	}
 	mlx5_query_nic_vport_mac_address(priv->mdev, 0, dev_addr);
 
 	/* check if we should generate a random MAC address */
 	if (MLX5_CAP_GEN(priv->mdev, vport_group_manager) == 0 &&
 	    is_zero_ether_addr(dev_addr)) {
 		random_ether_addr(dev_addr);
 		if_printf(ifp, "Assigned random MAC address\n");
 	}
 #ifdef RATELIMIT
 	err = mlx5e_rl_init(priv);
 	if (err) {
 		if_printf(ifp, "%s: mlx5e_rl_init failed, %d\n",
 		    __func__, err);
 		goto err_create_mkey;
 	}
 #endif
 
 	/* set default MTU */
 	mlx5e_set_dev_port_mtu(ifp, ifp->if_mtu);
 
 	/* Set default media status */
 	priv->media_status_last = IFM_AVALID;
 	priv->media_active_last = IFM_ETHER | IFM_AUTO |
 	    IFM_ETH_RXPAUSE | IFM_FDX;
 
 	/* setup default pauseframes configuration */
 	mlx5e_setup_pauseframes(priv);
 
 	/* Setup supported medias */
 	//TODO: If we failed to query ptys is it ok to proceed??
 	if (!mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1)) {
 		ext = MLX5_CAP_PCAM_FEATURE(mdev,
 		    ptys_extended_ethernet);
 		eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
 		    eth_proto_capability);
 		if (MLX5_CAP_PCAM_FEATURE(mdev, ptys_connector_type))
 			connector_type = MLX5_GET(ptys_reg, out,
 			    connector_type);
 	} else {
 		eth_proto_cap = 0;
 		if_printf(ifp, "%s: Query port media capability failed,"
 		    " %d\n", __func__, err);
 	}
 
 	ifmedia_init(&priv->media, IFM_IMASK | IFM_ETH_FMASK,
 	    mlx5e_media_change, mlx5e_media_status);
 
 	speeds_num = ext ? MLX5E_EXT_LINK_SPEEDS_NUMBER : MLX5E_LINK_SPEEDS_NUMBER;
 	for (i = 0; i != speeds_num; i++) {
 		for (j = 0; j < MLX5E_LINK_MODES_NUMBER ; ++j) {
 			media_entry = ext ? mlx5e_ext_mode_table[i][j] :
 			    mlx5e_mode_table[i][j];
 			if (media_entry.baudrate == 0)
 				continue;
 			if (MLX5E_PROT_MASK(i) & eth_proto_cap) {
 				ifmedia_add(&priv->media,
 				    media_entry.subtype |
 				    IFM_ETHER, 0, NULL);
 				ifmedia_add(&priv->media,
 				    media_entry.subtype |
 				    IFM_ETHER | IFM_FDX |
 				    IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE, 0, NULL);
 			}
 		}
 	}
 
 	ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL);
 	ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO | IFM_FDX |
 	    IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE, 0, NULL);
 
 	/* Set autoselect by default */
 	ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO | IFM_FDX |
 	    IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE);
 	ether_ifattach(ifp, dev_addr);
 
 	/* Register for VLAN events */
 	priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
 	    mlx5e_vlan_rx_add_vid, priv, EVENTHANDLER_PRI_FIRST);
 	priv->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
 	    mlx5e_vlan_rx_kill_vid, priv, EVENTHANDLER_PRI_FIRST);
 
 	/* Link is down by default */
 	if_link_state_change(ifp, LINK_STATE_DOWN);
 
 	mlx5e_enable_async_events(priv);
 
 	mlx5e_add_hw_stats(priv);
 
 	mlx5e_create_stats(&priv->stats.vport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    "vstats", mlx5e_vport_stats_desc, MLX5E_VPORT_STATS_NUM,
 	    priv->stats.vport.arg);
 
 	mlx5e_create_stats(&priv->stats.pport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    "pstats", mlx5e_pport_stats_desc, MLX5E_PPORT_STATS_NUM,
 	    priv->stats.pport.arg);
 
 	mlx5e_create_ethtool(priv);
 
 	mtx_lock(&priv->async_events_mtx);
 	mlx5e_update_stats(priv);
 	mtx_unlock(&priv->async_events_mtx);
 
 	SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "rx_clbr_done", CTLFLAG_RD,
 	    &priv->clbr_done, 0,
 	    "RX timestamps calibration state");
 	callout_init(&priv->tstmp_clbr, CALLOUT_DIRECT);
 	mlx5e_reset_calibration_callout(priv);
 
 	pa.pa_version = PFIL_VERSION;
 	pa.pa_flags = PFIL_IN;
 	pa.pa_type = PFIL_TYPE_ETHERNET;
 	pa.pa_headname = ifp->if_xname;
 	priv->pfil = pfil_head_register(&pa);
 
 	return (priv);
 
 #ifdef RATELIMIT
 err_create_mkey:
 	mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
 #endif
 err_dealloc_transport_domain:
 	mlx5_dealloc_transport_domain(mdev, priv->tdn);
 
 err_dealloc_pd:
 	mlx5_core_dealloc_pd(mdev, priv->pdn);
 
 err_unmap_free_uar:
 	mlx5_unmap_free_uar(mdev, &priv->cq_uar);
 
 err_free_wq:
 	flush_workqueue(priv->wq);
 
 err_free_sysctl:
 	sysctl_ctx_free(&priv->sysctl_ctx);
 	if (priv->sysctl_debug)
 		sysctl_ctx_free(&priv->stats.port_stats_debug.ctx);
 	if_free(ifp);
 
 err_free_priv:
 	mlx5e_priv_mtx_destroy(priv);
 	free(priv, M_MLX5EN);
 	return (NULL);
 }
 
 static void
 mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vpriv)
 {
 	struct mlx5e_priv *priv = vpriv;
 	struct ifnet *ifp = priv->ifp;
 
 	/* don't allow more IOCTLs */
 	priv->gone = 1;
 
 	/* XXX wait a bit to allow IOCTL handlers to complete */
 	pause("W", hz);
 
 #ifdef RATELIMIT
 	/*
 	 * The kernel can have reference(s) via the m_snd_tag's into
 	 * the ratelimit channels, and these must go away before
 	 * detaching:
 	 */
 	while (READ_ONCE(priv->rl.stats.tx_active_connections) != 0) {
 		if_printf(priv->ifp, "Waiting for all ratelimit connections "
 		    "to terminate\n");
 		pause("W", hz);
 	}
 #endif
 	/* stop watchdog timer */
 	callout_drain(&priv->watchdog);
 
 	callout_drain(&priv->tstmp_clbr);
 
 	if (priv->vlan_attach != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach);
 	if (priv->vlan_detach != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach);
 
 	/* make sure device gets closed */
 	PRIV_LOCK(priv);
 	mlx5e_close_locked(ifp);
 	PRIV_UNLOCK(priv);
 
 	/* wait for all unlimited send tags to go away */
 	while (priv->channel_refs != 0) {
 		if_printf(priv->ifp, "Waiting for all unlimited connections "
 		    "to terminate\n");
 		pause("W", hz);
 	}
 
 	/* deregister pfil */
 	if (priv->pfil != NULL) {
 		pfil_head_unregister(priv->pfil);
 		priv->pfil = NULL;
 	}
 
 	/* unregister device */
 	ifmedia_removeall(&priv->media);
 	ether_ifdetach(ifp);
 	if_free(ifp);
 
 #ifdef RATELIMIT
 	mlx5e_rl_cleanup(priv);
 #endif
 	/* destroy all remaining sysctl nodes */
 	sysctl_ctx_free(&priv->stats.vport.ctx);
 	sysctl_ctx_free(&priv->stats.pport.ctx);
 	if (priv->sysctl_debug)
 		sysctl_ctx_free(&priv->stats.port_stats_debug.ctx);
 	sysctl_ctx_free(&priv->sysctl_ctx);
 
 	mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
 	mlx5_dealloc_transport_domain(priv->mdev, priv->tdn);
 	mlx5_core_dealloc_pd(priv->mdev, priv->pdn);
 	mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar);
 	mlx5e_disable_async_events(priv);
 	flush_workqueue(priv->wq);
 	mlx5e_priv_mtx_destroy(priv);
 	free(priv, M_MLX5EN);
 }
 
 static void *
 mlx5e_get_ifp(void *vpriv)
 {
 	struct mlx5e_priv *priv = vpriv;
 
 	return (priv->ifp);
 }
 
 static struct mlx5_interface mlx5e_interface = {
 	.add = mlx5e_create_ifp,
 	.remove = mlx5e_destroy_ifp,
 	.event = mlx5e_async_event,
 	.protocol = MLX5_INTERFACE_PROTOCOL_ETH,
 	.get_dev = mlx5e_get_ifp,
 };
 
 void
 mlx5e_init(void)
 {
 	mlx5_register_interface(&mlx5e_interface);
 }
 
 void
 mlx5e_cleanup(void)
 {
 	mlx5_unregister_interface(&mlx5e_interface);
 }
 
 static void
 mlx5e_show_version(void __unused *arg)
 {
 
 	printf("%s", mlx5e_version);
 }
 SYSINIT(mlx5e_show_version, SI_SUB_DRIVERS, SI_ORDER_ANY, mlx5e_show_version, NULL);
 
 module_init_order(mlx5e_init, SI_ORDER_THIRD);
 module_exit_order(mlx5e_cleanup, SI_ORDER_THIRD);
 
 #if (__FreeBSD_version >= 1100000)
 MODULE_DEPEND(mlx5en, linuxkpi, 1, 1, 1);
 #endif
 MODULE_DEPEND(mlx5en, mlx5, 1, 1, 1);
 MODULE_VERSION(mlx5en, 1);
Index: head/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
===================================================================
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c	(revision 348253)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c	(revision 348254)
@@ -1,1573 +1,1574 @@
 /*-
  * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "en.h"
 
 #ifdef RATELIMIT
 
 static int mlx5e_rl_open_workers(struct mlx5e_priv *);
 static void mlx5e_rl_close_workers(struct mlx5e_priv *);
 static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
 static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
     struct sysctl_oid *, const char *name, const char *desc);
 static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
       struct sysctl_oid *node, const char *name, const char *desc);
 static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
 static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
 
 static void
 mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
     struct mlx5e_sq_param *param)
 {
 	void *sqc = param->sqc;
 	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
 	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
 
 	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
 	MLX5_SET(wq, wq, pd, rl->priv->pdn);
 
 	param->wq.buf_numa_node = 0;
 	param->wq.db_numa_node = 0;
 	param->wq.linear = 1;
 }
 
 static void
 mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
     struct mlx5e_cq_param *param)
 {
 	void *cqc = param->cqc;
 	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
 
 	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
 	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
 	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
 
 	switch (rl->param.tx_coalesce_mode) {
 	case 0:
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	default:
 		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
 		else
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	}
 }
 
 static void
 mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
     struct mlx5e_rl_channel_param *cparam)
 {
 	memset(cparam, 0, sizeof(*cparam));
 
 	mlx5e_rl_build_sq_param(rl, &cparam->sq);
 	mlx5e_rl_build_cq_param(rl, &cparam->cq);
 }
 
 static int
 mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
     struct mlx5e_sq_param *param, int ix)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	void *sqc = param->sqc;
 	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
 	int err;
 
 	/* Create DMA descriptor TAG */
 	if ((err = -bus_dma_tag_create(
 	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
 	    1,				/* any alignment */
 	    0,				/* no boundary */
 	    BUS_SPACE_MAXADDR,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,		/* highaddr */
 	    NULL, NULL,			/* filter, filterarg */
 	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
 	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
 	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
 	    0,				/* flags */
 	    NULL, NULL,			/* lockfunc, lockfuncarg */
 	    &sq->dma_tag)))
 		goto done;
 
 	/* use shared UAR */
 	sq->uar = priv->rl.sq_uar;
 
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
 	    &sq->wq_ctrl);
 	if (err)
 		goto err_free_dma_tag;
 
 	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
 	/*
 	 * The sq->bf_buf_size variable is intentionally left zero so
 	 * that the doorbell writes will occur at the same memory
 	 * location.
 	 */
 
 	err = mlx5e_alloc_sq_db(sq);
 	if (err)
 		goto err_sq_wq_destroy;
 
 	sq->mkey_be = cpu_to_be32(priv->mr.key);
 	sq->ifp = priv->ifp;
 	sq->priv = priv;
 
 	mlx5e_update_sq_inline(sq);
 
 	return (0);
 
 err_sq_wq_destroy:
 	mlx5_wq_destroy(&sq->wq_ctrl);
 err_free_dma_tag:
 	bus_dma_tag_destroy(sq->dma_tag);
 done:
 	return (err);
 }
 
 static void
 mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
 {
 
 	mlx5e_free_sq_db(sq);
 	mlx5_wq_destroy(&sq->wq_ctrl);
 }
 
 static int
 mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
     struct mlx5e_sq_param *param, int ix)
 {
 	int err;
 
 	err = mlx5e_rl_create_sq(priv, sq, param, ix);
 	if (err)
 		return (err);
 
 	err = mlx5e_enable_sq(sq, param, priv->rl.tisn);
 	if (err)
 		goto err_destroy_sq;
 
 	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
 	if (err)
 		goto err_disable_sq;
 
 	WRITE_ONCE(sq->running, 1);
 
 	return (0);
 
 err_disable_sq:
 	mlx5e_disable_sq(sq);
 err_destroy_sq:
 	mlx5e_rl_destroy_sq(sq);
 
 	return (err);
 }
 
 static void
 mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
 {
 	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
 	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);
 
 	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
 
 	sq->cev_factor = priv->rl.param.tx_completion_fact;
 
 	/* ensure the TX completion event factor is not zero */
 	if (sq->cev_factor == 0)
 		sq->cev_factor = 1;
 }
 
 static int
 mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
     struct mlx5e_rl_channel_param *cparam,
     struct mlx5e_sq *volatile *ppsq)
 {
 	struct mlx5e_priv *priv = rlw->priv;
 	struct mlx5e_sq *sq;
 	int err;
 
 	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);
 
 	/* init mutexes */
 	mlx5e_rl_chan_mtx_init(priv, sq);
 
 	/* open TX completion queue */
 	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
 	    &mlx5e_tx_cq_comp, eq_ix);
 	if (err)
 		goto err_free;
 
 	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
 	if (err)
 		goto err_close_tx_cq;
 
 	/* store TX channel pointer */
 	*ppsq = sq;
 
 	/* poll TX queue initially */
 	sq->cq.mcq.comp(&sq->cq.mcq);
 
 	return (0);
 
 err_close_tx_cq:
 	mlx5e_close_cq(&sq->cq);
 
 err_free:
 	/* destroy mutexes */
 	mtx_destroy(&sq->lock);
 	mtx_destroy(&sq->comp_lock);
 	free(sq, M_MLX5EN);
 	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
 	return (err);
 }
 
 static void
 mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
 {
 	struct mlx5e_sq *sq = *ppsq;
 
 	/* check if channel is already closed */
 	if (sq == NULL)
 		return;
 	/* ensure channel pointer is no longer used */
 	*ppsq = NULL;
 
 	/* teardown and destroy SQ */
 	mlx5e_drain_sq(sq);
 	mlx5e_disable_sq(sq);
 	mlx5e_rl_destroy_sq(sq);
 
 	/* close CQ */
 	mlx5e_close_cq(&sq->cq);
 
 	/* destroy mutexes */
 	mtx_destroy(&sq->lock);
 	mtx_destroy(&sq->comp_lock);
 
 	free(sq, M_MLX5EN);
 }
 
 static void
 mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
 {
 	/*
 	 * Limit the maximum distance between completion events to
 	 * half of the currently set TX queue size.
 	 *
 	 * The maximum number of queue entries a single IP packet can
 	 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
 	 *
 	 * The worst case max value is then given as below:
 	 */
 	uint64_t max = rl->param.tx_queue_size /
 	    (2 * MLX5_SEND_WQE_MAX_WQEBBS);
 
 	/*
 	 * Update the maximum completion factor value in case the
 	 * tx_queue_size field changed. Ensure we don't overflow
 	 * 16-bits.
 	 */
 	if (max < 1)
 		max = 1;
 	else if (max > 65535)
 		max = 65535;
 	rl->param.tx_completion_fact_max = max;
 
 	/*
 	 * Verify that the current TX completion factor is within the
 	 * given limits:
 	 */
 	if (rl->param.tx_completion_fact < 1)
 		rl->param.tx_completion_fact = 1;
 	else if (rl->param.tx_completion_fact > max)
 		rl->param.tx_completion_fact = max;
 }
 
 static int
 mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
 {
 	struct mlx5e_priv *priv = sq->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 
 	void *in;
 	void *sqc;
 	int inlen;
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
 
 	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
 	MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
 	MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
 	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
 	MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);
 
 	err = mlx5_core_modify_sq(mdev, in, inlen);
 
 	kvfree(in);
 
 	return (err);
 }
 
 /*
  * This function will search the configured rate limit table for the
  * best match to avoid that a single socket based application can
  * allocate all the available hardware rates. If the user selected
  * rate deviates too much from the closes rate available in the rate
  * limit table, unlimited rate will be selected.
  */
 static uint64_t
 mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
 {
 	uint64_t distance = -1ULL;
 	uint64_t diff;
 	uint64_t retval = 0;		/* unlimited */
 	uint64_t x;
 
 	/* search for closest rate */
 	for (x = 0; x != rl->param.tx_rates_def; x++) {
 		uint64_t rate = rl->rate_limit_table[x];
 		if (rate == 0)
 			continue;
 
 		if (rate > user_rate)
 			diff = rate - user_rate;
 		else
 			diff = user_rate - rate;
 
 		/* check if distance is smaller than previous rate */
 		if (diff < distance) {
 			distance = diff;
 			retval = rate;
 		}
 	}
 
 	/* range check for multiplication below */
 	if (user_rate > rl->param.tx_limit_max)
 		user_rate = rl->param.tx_limit_max;
 
 	/* fallback to unlimited, if rate deviates too much */
 	if (distance > howmany(user_rate *
 	    rl->param.tx_allowed_deviation, 1000ULL))
 		retval = 0;
 
 	return (retval);
 }
 
 /*
  * This function sets the requested rate for a rate limit channel, in
  * bits per second. The requested rate will be filtered through the
  * find best rate function above.
  */
 static int
 mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
     struct mlx5e_rl_channel *channel, uint64_t rate)
 {
 	struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
 	struct mlx5e_sq *sq;
 	uint64_t temp;
 	uint16_t index;
 	uint16_t burst;
 	int error;
 
 	if (rate != 0) {
 		MLX5E_RL_WORKER_UNLOCK(rlw);
 
 		MLX5E_RL_RLOCK(rl);
 
 		/* get current burst size in bytes */
 		temp = rl->param.tx_burst_size *
 		    MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu);
 
 		/* limit burst size to 64K currently */
 		if (temp > 65535)
 			temp = 65535;
 		burst = temp;
 
 		/* find best rate */
 		rate = mlx5e_rl_find_best_rate_locked(rl, rate);
 
 		MLX5E_RL_RUNLOCK(rl);
 
 		if (rate == 0) {
 			/* rate doesn't exist, fallback to unlimited */
 			index = 0;
 			rate = 0;
 			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
 		} else {
 			/* get a reference on the new rate */
 			error = -mlx5_rl_add_rate(rlw->priv->mdev,
 			    howmany(rate, 1000), burst, &index);
 
 			if (error != 0) {
 				/* adding rate failed, fallback to unlimited */
 				index = 0;
 				rate = 0;
 				atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
 			}
 		}
 		MLX5E_RL_WORKER_LOCK(rlw);
 	} else {
 		index = 0;
 		burst = 0;	/* default */
 	}
 
 	/* atomically swap rates */
 	temp = channel->last_rate;
 	channel->last_rate = rate;
 	rate = temp;
 
 	/* atomically swap burst size */
 	temp = channel->last_burst;
 	channel->last_burst = burst;
 	burst = temp;
 
 	MLX5E_RL_WORKER_UNLOCK(rlw);
 	/* put reference on the old rate, if any */
 	if (rate != 0) {
 		mlx5_rl_remove_rate(rlw->priv->mdev,
 		    howmany(rate, 1000), burst);
 	}
 
 	/* set new rate, if SQ is running */
 	sq = channel->sq;
 	if (sq != NULL && READ_ONCE(sq->running) != 0) {
 		error = mlx5e_rl_modify_sq(sq, index);
 		if (error != 0)
 			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
 	} else
 		error = 0;
 	MLX5E_RL_WORKER_LOCK(rlw);
 
 	return (-error);
 }
 
 static void
 mlx5e_rl_worker(void *arg)
 {
 	struct thread *td;
 	struct mlx5e_rl_worker *rlw = arg;
 	struct mlx5e_rl_channel *channel;
 	struct mlx5e_priv *priv;
 	unsigned ix;
 	uint64_t x;
 	int error;
 
 	/* set thread priority */
 	td = curthread;
 
 	thread_lock(td);
 	sched_prio(td, PI_SWI(SWI_NET));
 	thread_unlock(td);
 
 	priv = rlw->priv;
 
 	/* compute completion vector */
 	ix = (rlw - priv->rl.workers) %
 	    priv->mdev->priv.eq_table.num_comp_vectors;
 
 	/* TODO bind to CPU */
 
 	/* open all the SQs */
 	MLX5E_RL_WORKER_LOCK(rlw);
 	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
 		struct mlx5e_rl_channel *channel = rlw->channels + x;
 
 #if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
 		if (channel->state == MLX5E_RL_ST_FREE)
 			continue;
 #endif
 		MLX5E_RL_WORKER_UNLOCK(rlw);
 
 		MLX5E_RL_RLOCK(&priv->rl);
 		error = mlx5e_rl_open_channel(rlw, ix,
 		    &priv->rl.chan_param, &channel->sq);
 		MLX5E_RL_RUNLOCK(&priv->rl);
 
 		MLX5E_RL_WORKER_LOCK(rlw);
 		if (error != 0) {
 			if_printf(priv->ifp,
 			    "mlx5e_rl_open_channel failed: %d\n", error);
 			break;
 		}
 		mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
 	}
 	while (1) {
 		if (STAILQ_FIRST(&rlw->process_head) == NULL) {
 			/* check if we are tearing down */
 			if (rlw->worker_done != 0)
 				break;
 			cv_wait(&rlw->cv, &rlw->mtx);
 		}
 		/* check if we are tearing down */
 		if (rlw->worker_done != 0)
 			break;
 		channel = STAILQ_FIRST(&rlw->process_head);
 		if (channel != NULL) {
 			STAILQ_REMOVE_HEAD(&rlw->process_head, entry);
 
 			switch (channel->state) {
 			case MLX5E_RL_ST_MODIFY:
 				channel->state = MLX5E_RL_ST_USED;
 				MLX5E_RL_WORKER_UNLOCK(rlw);
 
 				/* create channel by demand */
 				if (channel->sq == NULL) {
 					MLX5E_RL_RLOCK(&priv->rl);
 					error = mlx5e_rl_open_channel(rlw, ix,
 					    &priv->rl.chan_param, &channel->sq);
 					MLX5E_RL_RUNLOCK(&priv->rl);
 
 					if (error != 0) {
 						if_printf(priv->ifp,
 						    "mlx5e_rl_open_channel failed: %d\n", error);
 					} else {
 						atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
 					}
 				} else {
 					mlx5e_resume_sq(channel->sq);
 				}
 
 				MLX5E_RL_WORKER_LOCK(rlw);
 				/* convert from bytes/s to bits/s and set new rate */
 				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
 				    channel->new_rate * 8ULL);
 				if (error != 0) {
 					if_printf(priv->ifp,
 					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
 					    error);
 				}
 				break;
 
 			case MLX5E_RL_ST_DESTROY:
 				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
 				if (error != 0) {
 					if_printf(priv->ifp,
 					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
 					    error);
 				}
 				if (channel->sq != NULL) {
 					/*
 					 * Make sure all packets are
 					 * transmitted before SQ is
 					 * returned to free list:
 					 */
 					MLX5E_RL_WORKER_UNLOCK(rlw);
 					mlx5e_drain_sq(channel->sq);
 					MLX5E_RL_WORKER_LOCK(rlw);
 				}
 				/* put the channel back into the free list */
 				STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
 				channel->state = MLX5E_RL_ST_FREE;
 				atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
 				break;
 			default:
 				/* NOP */
 				break;
 			}
 		}
 	}
 
 	/* close all the SQs */
 	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
 		struct mlx5e_rl_channel *channel = rlw->channels + x;
 
 		/* update the initial rate */
 		channel->init_rate = channel->last_rate;
 
 		/* make sure we free up the rate resource */
 		mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
 
 		if (channel->sq != NULL) {
 			MLX5E_RL_WORKER_UNLOCK(rlw);
 			mlx5e_rl_close_channel(&channel->sq);
 			atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
 			MLX5E_RL_WORKER_LOCK(rlw);
 		}
 	}
 
 	rlw->worker_done = 0;
 	cv_broadcast(&rlw->cv);
 	MLX5E_RL_WORKER_UNLOCK(rlw);
 
 	kthread_exit();
 }
 
 static int
 mlx5e_rl_open_tis(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
 	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
 
 	memset(in, 0, sizeof(in));
 
 	MLX5_SET(tisc, tisc, prio, 0);
 	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);
 
 	return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn));
 }
 
 static void
 mlx5e_rl_close_tis(struct mlx5e_priv *priv)
 {
 	mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn);
 }
 
 static void
 mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
     struct mlx5_core_dev *mdev)
 {
 	/* ratelimit workers */
 	param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
 	param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;
 
 	/* range check */
 	if (param->tx_worker_threads_def == 0 ||
 	    param->tx_worker_threads_def > param->tx_worker_threads_max)
 		param->tx_worker_threads_def = param->tx_worker_threads_max;
 
 	/* ratelimit channels */
 	param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
 	    param->tx_worker_threads_def;
 	param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;
 
 	/* range check */
 	if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
 		param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;
 
 	/* set default burst size */
 	param->tx_burst_size = 4;	/* MTUs */
 
 	/*
 	 * Set maximum burst size
 	 *
 	 * The burst size is multiplied by the MTU and clamped to the
 	 * range 0 ... 65535 bytes inclusivly before fed into the
 	 * firmware.
 	 *
 	 * NOTE: If the burst size or MTU is changed only ratelimit
 	 * connections made after the change will use the new burst
 	 * size.
 	 */
 	param->tx_burst_size_max = 255;
 
 	/* get firmware rate limits in 1000bit/s and convert them to bit/s */
 	param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
 	param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;
 
 	/* ratelimit table size */
 	param->tx_rates_max = mdev->priv.rl_table.max_size;
 
 	/* range check */
 	if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
 		param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;
 
 	/* set default number of rates */
 	param->tx_rates_def = param->tx_rates_max;
 
 	/* set maximum allowed rate deviation */
 	if (param->tx_limit_max != 0) {
 		/*
 		 * Make sure the deviation multiplication doesn't
 		 * overflow unsigned 64-bit:
 		 */
 		param->tx_allowed_deviation_max = -1ULL /
 		    param->tx_limit_max;
 	}
 	/* set default rate deviation */
 	param->tx_allowed_deviation = 50;	/* 5.0% */
 
 	/* channel parameters */
 	param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
 	param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
 	param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
 	param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
 	param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
 }
 
 static const char *mlx5e_rl_params_desc[] = {
 	MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
 };
 
 static const char *mlx5e_rl_table_params_desc[] = {
 	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
 };
 
 static const char *mlx5e_rl_stats_desc[] = {
 	MLX5E_RL_STATS(MLX5E_STATS_DESC)
 };
 
 int
 mlx5e_rl_init(struct mlx5e_priv *priv)
 {
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
 	struct sysctl_oid *node;
 	struct sysctl_oid *stats;
 	char buf[64];
 	uint64_t i;
 	uint64_t j;
 	int error;
 
 	/* check if there is support for packet pacing */
 	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
 		return (0);
 
 	rl->priv = priv;
 
 	sysctl_ctx_init(&rl->ctx);
 
 	sx_init(&rl->rl_sxlock, "ratelimit-sxlock");
 
 	/* allocate shared UAR for SQs */
 	error = mlx5_alloc_map_uar(priv->mdev, &rl->sq_uar);
 	if (error)
 		goto done;
 
 	/* open own TIS domain for ratelimit SQs */
 	error = mlx5e_rl_open_tis(priv);
 	if (error)
 		goto err_uar;
 
 	/* setup default value for parameters */
 	mlx5e_rl_set_default_params(&rl->param, priv->mdev);
 
 	/* update the completion factor */
 	mlx5e_rl_sync_tx_completion_fact(rl);
 
 	/* create root node */
 	node = SYSCTL_ADD_NODE(&rl->ctx,
 	    SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
 	    "rate_limit", CTLFLAG_RW, NULL, "Rate limiting support");
 
 	if (node != NULL) {
 		/* create SYSCTLs */
 		for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
 			mlx5e_rl_sysctl_add_u64_oid(rl,
 			    MLX5E_RL_PARAMS_INDEX(arg[i]),
 			    node, mlx5e_rl_params_desc[2 * i],
 			    mlx5e_rl_params_desc[2 * i + 1]);
 		}
 
 		stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
 		    OID_AUTO, "stats", CTLFLAG_RD, NULL,
 		    "Rate limiting statistics");
 		if (stats != NULL) {
 			/* create SYSCTLs */
 			for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
 				mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
 				    stats, mlx5e_rl_stats_desc[2 * i],
 				    mlx5e_rl_stats_desc[2 * i + 1]);
 			}
 		}
 	}
 
 	/* allocate workers array */
 	rl->workers = malloc(sizeof(rl->workers[0]) *
 	    rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);
 
 	/* allocate rate limit array */
 	rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
 	    rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);
 
 	if (node != NULL) {
 		/* create more SYSCTls */
 		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
 		    "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
 		    CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
 		    "A", "Show table of all configured TX rates");
 
 		/* try to fetch rate table from kernel environment */
 		for (i = 0; i != rl->param.tx_rates_def; i++) {
 			/* compute path for tunable */
 			snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
 			    device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
 			if (TUNABLE_QUAD_FETCH(buf, &j))
 				mlx5e_rl_tx_limit_add(rl, j);
 		}
 
 		/* setup rate table sysctls */
 		for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
 			mlx5e_rl_sysctl_add_u64_oid(rl,
 			    MLX5E_RL_PARAMS_INDEX(table_arg[i]),
 			    node, mlx5e_rl_table_params_desc[2 * i],
 			    mlx5e_rl_table_params_desc[2 * i + 1]);
 		}
 	}
 
 	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
 		struct mlx5e_rl_worker *rlw = rl->workers + j;
 
 		rlw->priv = priv;
 
 		cv_init(&rlw->cv, "mlx5-worker-cv");
 		mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
 		STAILQ_INIT(&rlw->index_list_head);
 		STAILQ_INIT(&rlw->process_head);
 
 		rlw->channels = malloc(sizeof(rlw->channels[0]) *
 		    rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);
 
 		MLX5E_RL_WORKER_LOCK(rlw);
 		for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
 			struct mlx5e_rl_channel *channel = rlw->channels + i;
 			channel->worker = rlw;
-			channel->tag.m_snd_tag.ifp = priv->ifp;
 			channel->tag.type = IF_SND_TAG_TYPE_RATE_LIMIT;
 			STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
 		}
 		MLX5E_RL_WORKER_UNLOCK(rlw);
 	}
 
 	PRIV_LOCK(priv);
 	error = mlx5e_rl_open_workers(priv);
 	PRIV_UNLOCK(priv);
 
 	if (error != 0) {
 		if_printf(priv->ifp,
 		    "mlx5e_rl_open_workers failed: %d\n", error);
 	}
 
 	return (0);
 
 err_uar:
 	mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar);
 done:
 	sysctl_ctx_free(&rl->ctx);
 	sx_destroy(&rl->rl_sxlock);
 	return (error);
 }
 
 static int
 mlx5e_rl_open_workers(struct mlx5e_priv *priv)
 {
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
 	struct thread *rl_thread = NULL;
 	struct proc *rl_proc = NULL;
 	uint64_t j;
 	int error;
 
 	if (priv->gone || rl->opened)
 		return (-EINVAL);
 
 	MLX5E_RL_WLOCK(rl);
 	/* compute channel parameters once */
 	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
 	MLX5E_RL_WUNLOCK(rl);
 
 	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
 		struct mlx5e_rl_worker *rlw = rl->workers + j;
 
 		/* start worker thread */
 		error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
 		    RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
 		if (error != 0) {
 			if_printf(rl->priv->ifp,
 			    "kproc_kthread_add failed: %d\n", error);
 			rlw->worker_done = 1;
 		}
 	}
 
 	rl->opened = 1;
 
 	return (0);
 }
 
 static void
 mlx5e_rl_close_workers(struct mlx5e_priv *priv)
 {
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
 	uint64_t y;
 
 	if (rl->opened == 0)
 		return;
 
 	/* tear down worker threads simultaneously */
 	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
 		struct mlx5e_rl_worker *rlw = rl->workers + y;
 
 		/* tear down worker before freeing SQs */
 		MLX5E_RL_WORKER_LOCK(rlw);
 		if (rlw->worker_done == 0) {
 			rlw->worker_done = 1;
 			cv_broadcast(&rlw->cv);
 		} else {
 			/* XXX thread not started */
 			rlw->worker_done = 0;
 		}
 		MLX5E_RL_WORKER_UNLOCK(rlw);
 	}
 
 	/* wait for worker threads to exit */
 	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
 		struct mlx5e_rl_worker *rlw = rl->workers + y;
 
 		/* tear down worker before freeing SQs */
 		MLX5E_RL_WORKER_LOCK(rlw);
 		while (rlw->worker_done != 0)
 			cv_wait(&rlw->cv, &rlw->mtx);
 		MLX5E_RL_WORKER_UNLOCK(rlw);
 	}
 
 	rl->opened = 0;
 }
 
 static void
 mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
 {
 	unsigned x;
 
 	MLX5E_RL_WLOCK(rl);
 	for (x = 0; x != rl->param.tx_rates_def; x++)
 		rl->rate_limit_table[x] = 0;
 	MLX5E_RL_WUNLOCK(rl);
 }
 
 void
 mlx5e_rl_cleanup(struct mlx5e_priv *priv)
 {
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
 	uint64_t y;
 
 	/* check if there is support for packet pacing */
 	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
 		return;
 
 	/* TODO check if there is support for packet pacing */
 
 	sysctl_ctx_free(&rl->ctx);
 
 	PRIV_LOCK(priv);
 	mlx5e_rl_close_workers(priv);
 	PRIV_UNLOCK(priv);
 
 	mlx5e_rl_reset_rates(rl);
 
 	/* free shared UAR for SQs */
 	mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar);
 
 	/* close TIS domain */
 	mlx5e_rl_close_tis(priv);
 
 	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
 		struct mlx5e_rl_worker *rlw = rl->workers + y;
 
 		cv_destroy(&rlw->cv);
 		mtx_destroy(&rlw->mtx);
 		free(rlw->channels, M_MLX5EN);
 	}
 	free(rl->rate_limit_table, M_MLX5EN);
 	free(rl->workers, M_MLX5EN);
 	sx_destroy(&rl->rl_sxlock);
 }
 
 static void
 mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
     struct mlx5e_rl_channel *channel)
 {
 	STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
 	cv_broadcast(&rlw->cv);
 }
 
 static void
 mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
 {
 	if (channel == NULL)
 		return;
 
 	MLX5E_RL_WORKER_LOCK(rlw);
 	switch (channel->state) {
 	case MLX5E_RL_ST_MODIFY:
 		channel->state = MLX5E_RL_ST_DESTROY;
 		break;
 	case MLX5E_RL_ST_USED:
 		channel->state = MLX5E_RL_ST_DESTROY;
 		mlx5e_rlw_queue_channel_locked(rlw, channel);
 		break;
 	default:
 		break;
 	}
 	MLX5E_RL_WORKER_UNLOCK(rlw);
 }
 
 static int
 mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
 {
 
 	MLX5E_RL_WORKER_LOCK(rlw);
 	channel->new_rate = rate;
 	switch (channel->state) {
 	case MLX5E_RL_ST_USED:
 		channel->state = MLX5E_RL_ST_MODIFY;
 		mlx5e_rlw_queue_channel_locked(rlw, channel);
 		break;
 	default:
 		break;
 	}
 	MLX5E_RL_WORKER_UNLOCK(rlw);
 
 	return (0);
 }
 
 static int
 mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel,
     union if_snd_tag_query_params *params)
 {
 	int retval;
 
 	MLX5E_RL_WORKER_LOCK(rlw);
 	switch (channel->state) {
 	case MLX5E_RL_ST_USED:
 		params->rate_limit.max_rate = channel->last_rate;
 		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
 		retval = 0;
 		break;
 	case MLX5E_RL_ST_MODIFY:
 		params->rate_limit.max_rate = channel->last_rate;
 		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
 		retval = EBUSY;
 		break;
 	default:
 		retval = EINVAL;
 		break;
 	}
 	MLX5E_RL_WORKER_UNLOCK(rlw);
 
 	return (retval);
 }
 
 static int
 mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
     struct mlx5e_rl_channel **pchannel)
 {
 	struct mlx5e_rl_channel *channel;
 	int retval = ENOMEM;
 
 	MLX5E_RL_WORKER_LOCK(rlw);
 	/* Check for available channel in free list */
 	if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
 		retval = 0;
 		/* Remove head index from available list */
 		STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
 		channel->state = MLX5E_RL_ST_USED;
 		atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
 	} else {
 		atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
 	}
 	MLX5E_RL_WORKER_UNLOCK(rlw);
 
 	*pchannel = channel;
 #ifdef RATELIMIT_DEBUG
 	if_printf(rlw->priv->ifp, "Channel pointer for rate limit connection is %p\n", channel);
 #endif
 	return (retval);
 }
 
 int
 mlx5e_rl_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	struct mlx5e_rl_channel *channel;
 	struct mlx5e_rl_worker *rlw;
 	struct mlx5e_priv *priv;
 	int error;
 
 	priv = ifp->if_softc;
 
 	/* check if there is support for packet pacing or if device is going away */
 	if (!MLX5_CAP_GEN(priv->mdev, qos) ||
 	    !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
 	    params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
 		return (EOPNOTSUPP);
 
 	/* compute worker thread this TCP connection belongs to */
 	rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
 	    priv->rl.param.tx_worker_threads_def);
 
 	error = mlx5e_find_available_tx_ring_index(rlw, &channel);
 	if (error != 0)
 		goto done;
 
 	error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
 	if (error != 0) {
 		mlx5e_rl_free(rlw, channel);
 		goto done;
 	}
 
 	/* store pointer to mbuf tag */
+	MPASS(channel->tag.m_snd_tag.refcount == 0);
+	m_snd_tag_init(&channel->tag.m_snd_tag, ifp);
 	*ppmt = &channel->tag.m_snd_tag;
 done:
 	return (error);
 }
 
 
 int
 mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
 {
 	struct mlx5e_rl_channel *channel =
 	    container_of(pmt, struct mlx5e_rl_channel, tag.m_snd_tag);
 
 	return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
 }
 
 int
 mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
 {
 	struct mlx5e_rl_channel *channel =
 	    container_of(pmt, struct mlx5e_rl_channel, tag.m_snd_tag);
 
 	return (mlx5e_rl_query(channel->worker, channel, params));
 }
 
 void
 mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
 {
 	struct mlx5e_rl_channel *channel =
 	    container_of(pmt, struct mlx5e_rl_channel, tag.m_snd_tag);
 
 	mlx5e_rl_free(channel->worker, channel);
 }
 
 static int
 mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
 {
 	struct mlx5e_rl_priv_data *rl = arg1;
 	struct mlx5e_priv *priv = rl->priv;
 	struct sbuf sbuf;
 	unsigned x;
 	int error;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 
 	PRIV_LOCK(priv);
 
 	sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);
 
 	sbuf_printf(&sbuf,
 	    "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
 	    "\t" "--------------------------------------------\n");
 
 	MLX5E_RL_RLOCK(rl);
 	for (x = 0; x != rl->param.tx_rates_def; x++) {
 		if (rl->rate_limit_table[x] == 0)
 			continue;
 
 		sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
 		    x, (unsigned)rl->param.tx_burst_size,
 		    (long long)rl->rate_limit_table[x]);
 	}
 	MLX5E_RL_RUNLOCK(rl);
 
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 
 	PRIV_UNLOCK(priv);
 
 	return (error);
 }
 
 static int
 mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
 {
 	uint64_t x;
 	uint64_t y;
 
 	MLX5E_RL_WLOCK(rl);
 	/* compute channel parameters once */
 	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
 	MLX5E_RL_WUNLOCK(rl);
 
 	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
 		struct mlx5e_rl_worker *rlw = rl->workers + y;
 
 		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
 			struct mlx5e_rl_channel *channel;
 			struct mlx5e_sq *sq;
 
 			channel = rlw->channels + x;
 			sq = channel->sq;
 
 			if (sq == NULL)
 				continue;
 
 			if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
 				mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
 				    rl->param.tx_coalesce_usecs,
 				    rl->param.tx_coalesce_pkts,
 				    rl->param.tx_coalesce_mode);
 			} else {
 				mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
 				    rl->param.tx_coalesce_usecs,
 				    rl->param.tx_coalesce_pkts);
 			}
 		}
 	}
 	return (0);
 }
 
 void
 mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
 {
 	uint64_t x;
 	uint64_t y;
 
 	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
 		struct mlx5e_rl_worker *rlw = rl->workers + y;
 
 		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
 			struct mlx5e_rl_channel *channel;
 			struct mlx5e_sq *sq;
 
 			channel = rlw->channels + x;
 			sq = channel->sq;
 
 			if (sq == NULL)
 				continue;
 
 			mtx_lock(&sq->lock);
 			mlx5e_update_sq_inline(sq);
 			mtx_unlock(&sq->lock);
 		}
 	}
 }
 
 static int
 mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
 {
 	unsigned x;
 	int error;
 
 	if (value < 1000 ||
 	    mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
 		return (EINVAL);
 
 	MLX5E_RL_WLOCK(rl);
 	error = ENOMEM;
 
 	/* check if rate already exists */
 	for (x = 0; x != rl->param.tx_rates_def; x++) {
 		if (rl->rate_limit_table[x] != value)
 			continue;
 		error = EEXIST;
 		break;
 	}
 
 	/* check if there is a free rate entry */
 	if (x == rl->param.tx_rates_def) {
 		for (x = 0; x != rl->param.tx_rates_def; x++) {
 			if (rl->rate_limit_table[x] != 0)
 				continue;
 			rl->rate_limit_table[x] = value;
 			error = 0;
 			break;
 		}
 	}
 	MLX5E_RL_WUNLOCK(rl);
 
 	return (error);
 }
 
 static int
 mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
 {
 	unsigned x;
 	int error;
 
 	if (value == 0)
 		return (EINVAL);
 
 	MLX5E_RL_WLOCK(rl);
 
 	/* check if rate already exists */
 	for (x = 0; x != rl->param.tx_rates_def; x++) {
 		if (rl->rate_limit_table[x] != value)
 			continue;
 		/* free up rate */
 		rl->rate_limit_table[x] = 0;
 		break;
 	}
 
 	/* check if there is a free rate entry */
 	if (x == rl->param.tx_rates_def)
 		error = ENOENT;
 	else
 		error = 0;
 	MLX5E_RL_WUNLOCK(rl);
 
 	return (error);
 }
 
 static int
 mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	struct mlx5e_rl_priv_data *rl = arg1;
 	struct mlx5e_priv *priv = rl->priv;
 	unsigned mode_modify;
 	unsigned was_opened;
 	uint64_t value;
 	uint64_t old;
 	int error;
 
 	PRIV_LOCK(priv);
 
 	MLX5E_RL_RLOCK(rl);
 	value = rl->param.arg[arg2];
 	MLX5E_RL_RUNLOCK(rl);
 
 	if (req != NULL) {
 		old = value;
 		error = sysctl_handle_64(oidp, &value, 0, req);
 		if (error || req->newptr == NULL ||
 		    value == rl->param.arg[arg2])
 			goto done;
 	} else {
 		old = 0;
 		error = 0;
 	}
 
 	/* check if device is gone */
 	if (priv->gone) {
 		error = ENXIO;
 		goto done;
 	}
 	was_opened = rl->opened;
 	mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);
 
 	switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
 	case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
 		if (value > rl->param.tx_worker_threads_max)
 			value = rl->param.tx_worker_threads_max;
 		else if (value < 1)
 			value = 1;
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
 		if (value > rl->param.tx_channels_per_worker_max)
 			value = rl->param.tx_channels_per_worker_max;
 		else if (value < 1)
 			value = 1;
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
 		if (value > rl->param.tx_rates_max)
 			value = rl->param.tx_rates_max;
 		else if (value < 1)
 			value = 1;
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
 		/* range check */
 		if (value < 1)
 			value = 0;
 		else if (value > MLX5E_FLD_MAX(cqc, cq_period))
 			value = MLX5E_FLD_MAX(cqc, cq_period);
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 
 		/* check to avoid down and up the network interface */
 		if (was_opened)
 			error = mlx5e_rl_refresh_channel_params(rl);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
 		/* import TX coal pkts */
 		if (value < 1)
 			value = 0;
 		else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
 			value = MLX5E_FLD_MAX(cqc, cq_max_count);
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 
 		/* check to avoid down and up the network interface */
 		if (was_opened)
 			error = mlx5e_rl_refresh_channel_params(rl);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
 		/* network interface must be down */
 		if (was_opened != 0 && mode_modify == 0)
 			mlx5e_rl_close_workers(priv);
 
 		/* import TX coalesce mode */
 		if (value != 0)
 			value = 1;
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 
 		/* restart network interface, if any */
 		if (was_opened != 0) {
 			if (mode_modify == 0)
 				mlx5e_rl_open_workers(priv);
 			else
 				error = mlx5e_rl_refresh_channel_params(rl);
 		}
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
 		/* network interface must be down */
 		if (was_opened)
 			mlx5e_rl_close_workers(priv);
 
 		/* import TX queue size */
 		if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
 			value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
 		else if (value > priv->params_ethtool.tx_queue_size_max)
 			value = priv->params_ethtool.tx_queue_size_max;
 
 		/* store actual TX queue size */
 		value = 1ULL << order_base_2(value);
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 
 		/* verify TX completion factor */
 		mlx5e_rl_sync_tx_completion_fact(rl);
 
 		/* restart network interface, if any */
 		if (was_opened)
 			mlx5e_rl_open_workers(priv);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
 		/* network interface must be down */
 		if (was_opened)
 			mlx5e_rl_close_workers(priv);
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 
 		/* verify parameter */
 		mlx5e_rl_sync_tx_completion_fact(rl);
 
 		/* restart network interface, if any */
 		if (was_opened)
 			mlx5e_rl_open_workers(priv);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
 		error = mlx5e_rl_tx_limit_add(rl, value);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
 		error = mlx5e_rl_tx_limit_clr(rl, value);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
 		/* range check */
 		if (value > rl->param.tx_allowed_deviation_max)
 			value = rl->param.tx_allowed_deviation_max;
 		else if (value < rl->param.tx_allowed_deviation_min)
 			value = rl->param.tx_allowed_deviation_min;
 
 		MLX5E_RL_WLOCK(rl);
 		rl->param.arg[arg2] = value;
 		MLX5E_RL_WUNLOCK(rl);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
 		/* range check */
 		if (value > rl->param.tx_burst_size_max)
 			value = rl->param.tx_burst_size_max;
 		else if (value < rl->param.tx_burst_size_min)
 			value = rl->param.tx_burst_size_min;
 
 		MLX5E_RL_WLOCK(rl);
 		rl->param.arg[arg2] = value;
 		MLX5E_RL_WUNLOCK(rl);
 		break;
 
 	default:
 		break;
 	}
 done:
 	PRIV_UNLOCK(priv);
 	return (error);
 }
 
 static void
 mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
     struct sysctl_oid *node, const char *name, const char *desc)
 {
 	/*
 	 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
 	 * take care of loading default sysctl value from the kernel
 	 * environment, if any:
 	 */
 	if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
 		/* read-only SYSCTLs */
 		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
 		    name, CTLTYPE_U64 | CTLFLAG_RD |
 		    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
 	} else {
 		if (strstr(name, "_def") != 0) {
 #ifdef RATELIMIT_DEBUG
 			/* tunable read-only advanced SYSCTLs */
 			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
 			    name, CTLTYPE_U64 | CTLFLAG_RDTUN |
 			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
 #endif
 		} else {
 			/* read-write SYSCTLs */
 			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
 			    name, CTLTYPE_U64 | CTLFLAG_RWTUN |
 			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
 		}
 	}
 }
 
 static void
 mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
     struct sysctl_oid *node, const char *name, const char *desc)
 {
 	/* read-only SYSCTLs */
 	SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
 	    CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
 }
 
 #endif
Index: head/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
===================================================================
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c	(revision 348253)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c	(revision 348254)
@@ -1,657 +1,642 @@
 /*-
  * Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "en.h"
 #include <machine/atomic.h>
 
 static inline bool
 mlx5e_do_send_cqe(struct mlx5e_sq *sq)
 {
 	sq->cev_counter++;
 	/* interleave the CQEs */
 	if (sq->cev_counter >= sq->cev_factor) {
 		sq->cev_counter = 0;
 		return (1);
 	}
 	return (0);
 }
 
 void
 mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt)
 {
 	u16 pi = sq->pc & sq->wq.sz_m1;
 	struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
 
 	memset(&wqe->ctrl, 0, sizeof(wqe->ctrl));
 
 	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP);
 	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
 	if (mlx5e_do_send_cqe(sq))
 		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
 	else
 		wqe->ctrl.fm_ce_se = 0;
 
 	/* Copy data for doorbell */
 	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));
 
 	sq->mbuf[pi].mbuf = NULL;
 	sq->mbuf[pi].num_bytes = 0;
 	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
 	sq->pc += sq->mbuf[pi].num_wqebbs;
 }
 
 #if (__FreeBSD_version >= 1100000)
 static uint32_t mlx5e_hash_value;
 
 static void
 mlx5e_hash_init(void *arg)
 {
 	mlx5e_hash_value = m_ether_tcpip_hash_init();
 }
 
 /* Make kernel call mlx5e_hash_init after the random stack finished initializing */
 SYSINIT(mlx5e_hash_init, SI_SUB_RANDOM, SI_ORDER_ANY, &mlx5e_hash_init, NULL);
 #endif
 
 static struct mlx5e_sq *
 mlx5e_select_queue_by_send_tag(struct ifnet *ifp, struct mbuf *mb)
 {
 	struct mlx5e_snd_tag *ptag;
 	struct mlx5e_sq *sq;
 
-	/* check for route change */
-	if (mb->m_pkthdr.snd_tag->ifp != ifp)
-		return (NULL);
-
 	/* get pointer to sendqueue */
 	ptag = container_of(mb->m_pkthdr.snd_tag,
 	    struct mlx5e_snd_tag, m_snd_tag);
 
 	switch (ptag->type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
 		sq = container_of(ptag,
 		    struct mlx5e_rl_channel, tag)->sq;
 		break;
 #endif
 	case IF_SND_TAG_TYPE_UNLIMITED:
 		sq = &container_of(ptag,
 		    struct mlx5e_channel, tag)->sq[0];
 		KASSERT(({
 		    struct mlx5e_priv *priv = ifp->if_softc;
 		    priv->channel_refs > 0; }),
 		    ("mlx5e_select_queue: Channel refs are zero for unlimited tag"));
 		break;
 	default:
 		sq = NULL;
 		break;
 	}
 
 	/* check if valid */
 	if (sq != NULL && READ_ONCE(sq->running) != 0)
 		return (sq);
 
 	return (NULL);
 }
 
 static struct mlx5e_sq *
 mlx5e_select_queue(struct ifnet *ifp, struct mbuf *mb)
 {
 	struct mlx5e_priv *priv = ifp->if_softc;
 	struct mlx5e_sq *sq;
 	u32 ch;
 	u32 tc;
 
 	/* obtain VLAN information if present */
 	if (mb->m_flags & M_VLANTAG) {
 		tc = (mb->m_pkthdr.ether_vtag >> 13);
 		if (tc >= priv->num_tc)
 			tc = priv->default_vlan_prio;
 	} else {
 		tc = priv->default_vlan_prio;
 	}
 
 	ch = priv->params.num_channels;
 
 	/* check if flowid is set */
 	if (M_HASHTYPE_GET(mb) != M_HASHTYPE_NONE) {
 #ifdef RSS
 		u32 temp;
 
 		if (rss_hash2bucket(mb->m_pkthdr.flowid,
 		    M_HASHTYPE_GET(mb), &temp) == 0)
 			ch = temp % ch;
 		else
 #endif
 			ch = (mb->m_pkthdr.flowid % 128) % ch;
 	} else {
 #if (__FreeBSD_version >= 1100000)
 		ch = m_ether_tcpip_hash(MBUF_HASHFLAG_L3 |
 		    MBUF_HASHFLAG_L4, mb, mlx5e_hash_value) % ch;
 #else
 		/*
 		 * m_ether_tcpip_hash not present in stable, so just
 		 * throw unhashed mbufs on queue 0
 		 */
 		ch = 0;
 #endif
 	}
 
 	/* check if send queue is running */
 	sq = &priv->channel[ch].sq[tc];
 	if (likely(READ_ONCE(sq->running) != 0))
 		return (sq);
 	return (NULL);
 }
 
 static inline u16
 mlx5e_get_l2_header_size(struct mlx5e_sq *sq, struct mbuf *mb)
 {
 	struct ether_vlan_header *eh;
 	uint16_t eth_type;
 	int min_inline;
 
 	eh = mtod(mb, struct ether_vlan_header *);
 	if (unlikely(mb->m_len < ETHER_HDR_LEN)) {
 		goto max_inline;
 	} else if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 		if (unlikely(mb->m_len < (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)))
 			goto max_inline;
 		eth_type = ntohs(eh->evl_proto);
 		min_inline = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 	} else {
 		eth_type = ntohs(eh->evl_encap_proto);
 		min_inline = ETHER_HDR_LEN;
 	}
 
 	switch (eth_type) {
 	case ETHERTYPE_IP:
 	case ETHERTYPE_IPV6:
 		/*
 		 * Make sure the TOS(IPv4) or traffic class(IPv6)
 		 * field gets inlined. Else the SQ may stall.
 		 */
 		min_inline += 4;
 		break;
 	default:
 		goto max_inline;
 	}
 
 	/*
 	 * m_copydata() will be used on the remaining header which
 	 * does not need to reside within the first m_len bytes of
 	 * data:
 	 */
 	if (mb->m_pkthdr.len < min_inline)
 		goto max_inline;
 	return (min_inline);
 
 max_inline:
 	return (MIN(mb->m_pkthdr.len, sq->max_inline));
 }
 
 static int
 mlx5e_get_full_header_size(struct mbuf *mb)
 {
 	struct ether_vlan_header *eh;
 	struct tcphdr *th;
 	struct ip *ip;
 	int ip_hlen, tcp_hlen;
 	struct ip6_hdr *ip6;
 	uint16_t eth_type;
 	int eth_hdr_len;
 
 	eh = mtod(mb, struct ether_vlan_header *);
 	if (mb->m_len < ETHER_HDR_LEN)
 		return (0);
 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 		if (mb->m_len < (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN))
 			return (0);
 		eth_type = ntohs(eh->evl_proto);
 		eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 	} else {
 		eth_type = ntohs(eh->evl_encap_proto);
 		eth_hdr_len = ETHER_HDR_LEN;
 	}
 	switch (eth_type) {
 	case ETHERTYPE_IP:
 		ip = (struct ip *)(mb->m_data + eth_hdr_len);
 		if (mb->m_len < eth_hdr_len + sizeof(*ip))
 			return (0);
 		switch (ip->ip_p) {
 		case IPPROTO_TCP:
 			ip_hlen = ip->ip_hl << 2;
 			eth_hdr_len += ip_hlen;
 			break;
 		case IPPROTO_UDP:
 			ip_hlen = ip->ip_hl << 2;
 			eth_hdr_len += ip_hlen + 8;
 			goto done;
 		default:
 			return (0);
 		}
 		break;
 	case ETHERTYPE_IPV6:
 		ip6 = (struct ip6_hdr *)(mb->m_data + eth_hdr_len);
 		if (mb->m_len < eth_hdr_len + sizeof(*ip6))
 			return (0);
 		switch (ip6->ip6_nxt) {
 		case IPPROTO_TCP:
 			eth_hdr_len += sizeof(*ip6);
 			break;
 		case IPPROTO_UDP:
 			eth_hdr_len += sizeof(*ip6) + 8;
 			goto done;
 		default:
 			return (0);
 		}
 		break;
 	default:
 		return (0);
 	}
 	if (mb->m_len < eth_hdr_len + sizeof(*th))
 		return (0);
 	th = (struct tcphdr *)(mb->m_data + eth_hdr_len);
 	tcp_hlen = th->th_off << 2;
 	eth_hdr_len += tcp_hlen;
 done:
 	/*
 	 * m_copydata() will be used on the remaining header which
 	 * does not need to reside within the first m_len bytes of
 	 * data:
 	 */
 	if (mb->m_pkthdr.len < eth_hdr_len)
 		return (0);
 	return (eth_hdr_len);
 }
 
 static int
 mlx5e_sq_xmit(struct mlx5e_sq *sq, struct mbuf **mbp)
 {
 	bus_dma_segment_t segs[MLX5E_MAX_TX_MBUF_FRAGS];
 	struct mlx5_wqe_data_seg *dseg;
 	struct mlx5e_tx_wqe *wqe;
 	struct ifnet *ifp;
 	int nsegs;
 	int err;
 	int x;
 	struct mbuf *mb = *mbp;
 	u16 ds_cnt;
 	u16 ihs;
 	u16 pi;
 	u8 opcode;
 
 	/* Return ENOBUFS if the queue is full */
 	if (unlikely(!mlx5e_sq_has_room_for(sq, 2 * MLX5_SEND_WQE_MAX_WQEBBS)))
 		return (ENOBUFS);
 
 	/* Align SQ edge with NOPs to avoid WQE wrap around */
 	pi = ((~sq->pc) & sq->wq.sz_m1);
 	if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1)) {
 		/* Send one multi NOP message instead of many */
 		mlx5e_send_nop(sq, (pi + 1) * MLX5_SEND_WQEBB_NUM_DS);
 		pi = ((~sq->pc) & sq->wq.sz_m1);
 		if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1))
 			return (ENOMEM);
 	}
 
 	/* Setup local variables */
 	pi = sq->pc & sq->wq.sz_m1;
 	wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
 	ifp = sq->ifp;
 
 	memset(wqe, 0, sizeof(*wqe));
 
 	/* Send a copy of the frame to the BPF listener, if any */
 	if (ifp != NULL && ifp->if_bpf != NULL)
 		ETHER_BPF_MTAP(ifp, mb);
 
 	if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) {
 		wqe->eth.cs_flags |= MLX5_ETH_WQE_L3_CSUM;
 	}
 	if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) {
 		wqe->eth.cs_flags |= MLX5_ETH_WQE_L4_CSUM;
 	}
 	if (wqe->eth.cs_flags == 0) {
 		sq->stats.csum_offload_none++;
 	}
 	if (mb->m_pkthdr.csum_flags & CSUM_TSO) {
 		u32 payload_len;
 		u32 mss = mb->m_pkthdr.tso_segsz;
 		u32 num_pkts;
 
 		wqe->eth.mss = cpu_to_be16(mss);
 		opcode = MLX5_OPCODE_LSO;
 		ihs = mlx5e_get_full_header_size(mb);
 		if (unlikely(ihs == 0)) {
 			err = EINVAL;
 			goto tx_drop;
 		}
 		payload_len = mb->m_pkthdr.len - ihs;
 		if (payload_len == 0)
 			num_pkts = 1;
 		else
 			num_pkts = DIV_ROUND_UP(payload_len, mss);
 		sq->mbuf[pi].num_bytes = payload_len + (num_pkts * ihs);
 
 		sq->stats.tso_packets++;
 		sq->stats.tso_bytes += payload_len;
 	} else {
 		opcode = MLX5_OPCODE_SEND;
 
 		switch (sq->min_inline_mode) {
 		case MLX5_INLINE_MODE_IP:
 		case MLX5_INLINE_MODE_TCP_UDP:
 			ihs = mlx5e_get_full_header_size(mb);
 			if (unlikely(ihs == 0))
 				ihs = mlx5e_get_l2_header_size(sq, mb);
 			break;
 		case MLX5_INLINE_MODE_L2:
 			ihs = mlx5e_get_l2_header_size(sq, mb);
 			break;
 		case MLX5_INLINE_MODE_NONE:
 			/* FALLTHROUGH */
 		default:
 			if ((mb->m_flags & M_VLANTAG) != 0 &&
 			    (sq->min_insert_caps & MLX5E_INSERT_VLAN) != 0) {
 				/* inlining VLAN data is not required */
 				wqe->eth.vlan_cmd = htons(0x8000); /* bit 0 CVLAN */
 				wqe->eth.vlan_hdr = htons(mb->m_pkthdr.ether_vtag);
 				ihs = 0;
 			} else if ((mb->m_flags & M_VLANTAG) == 0 &&
 				   (sq->min_insert_caps & MLX5E_INSERT_NON_VLAN) != 0) {
 				/* inlining non-VLAN data is not required */
 				ihs = 0;
 			} else {
 				/* we are forced to inlining L2 header, if any */
 				ihs = mlx5e_get_l2_header_size(sq, mb);
 			}
 			break;
 		}
 		sq->mbuf[pi].num_bytes = max_t (unsigned int,
 		    mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN);
 	}
 
 	if (likely(ihs == 0)) {
 		/* nothing to inline */
 	} else if (unlikely(ihs > sq->max_inline)) {
 		/* inline header size is too big */
 		err = EINVAL;
 		goto tx_drop;
 	} else if ((mb->m_flags & M_VLANTAG) != 0) {
 		struct ether_vlan_header *eh = (struct ether_vlan_header *)
 		    wqe->eth.inline_hdr_start;
 
 		/* Range checks */
 		if (unlikely(ihs > (MLX5E_MAX_TX_INLINE - ETHER_VLAN_ENCAP_LEN)))
 			ihs = (MLX5E_MAX_TX_INLINE - ETHER_VLAN_ENCAP_LEN);
 		else if (unlikely(ihs < ETHER_HDR_LEN)) {
 			err = EINVAL;
 			goto tx_drop;
 		}
 		m_copydata(mb, 0, ETHER_HDR_LEN, (caddr_t)eh);
 		m_adj(mb, ETHER_HDR_LEN);
 		/* Insert 4 bytes VLAN tag into data stream */
 		eh->evl_proto = eh->evl_encap_proto;
 		eh->evl_encap_proto = htons(ETHERTYPE_VLAN);
 		eh->evl_tag = htons(mb->m_pkthdr.ether_vtag);
 		/* Copy rest of header data, if any */
 		m_copydata(mb, 0, ihs - ETHER_HDR_LEN, (caddr_t)(eh + 1));
 		m_adj(mb, ihs - ETHER_HDR_LEN);
 		/* Extend header by 4 bytes */
 		ihs += ETHER_VLAN_ENCAP_LEN;
 		wqe->eth.inline_hdr_sz = cpu_to_be16(ihs);
 	} else {
 		m_copydata(mb, 0, ihs, wqe->eth.inline_hdr_start);
 		m_adj(mb, ihs);
 		wqe->eth.inline_hdr_sz = cpu_to_be16(ihs);
 	}
 
 	ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
 	if (ihs > sizeof(wqe->eth.inline_hdr_start)) {
 		ds_cnt += DIV_ROUND_UP(ihs - sizeof(wqe->eth.inline_hdr_start),
 		    MLX5_SEND_WQE_DS);
 	}
 	dseg = ((struct mlx5_wqe_data_seg *)&wqe->ctrl) + ds_cnt;
 
 	err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map,
 	    mb, segs, &nsegs, BUS_DMA_NOWAIT);
 	if (err == EFBIG) {
 		/* Update statistics */
 		sq->stats.defragged++;
 		/* Too many mbuf fragments */
 		mb = m_defrag(*mbp, M_NOWAIT);
 		if (mb == NULL) {
 			mb = *mbp;
 			goto tx_drop;
 		}
 		/* Try again */
 		err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map,
 		    mb, segs, &nsegs, BUS_DMA_NOWAIT);
 	}
 	/* Catch errors */
 	if (err != 0)
 		goto tx_drop;
 
 	/* Make sure all mbuf data, if any, is written to RAM */
 	if (nsegs != 0) {
 		bus_dmamap_sync(sq->dma_tag, sq->mbuf[pi].dma_map,
 		    BUS_DMASYNC_PREWRITE);
 	} else {
 		/* All data was inlined, free the mbuf. */
 		bus_dmamap_unload(sq->dma_tag, sq->mbuf[pi].dma_map);
 		m_freem(mb);
 		mb = NULL;
 	}
 
 	for (x = 0; x != nsegs; x++) {
 		if (segs[x].ds_len == 0)
 			continue;
 		dseg->addr = cpu_to_be64((uint64_t)segs[x].ds_addr);
 		dseg->lkey = sq->mkey_be;
 		dseg->byte_count = cpu_to_be32((uint32_t)segs[x].ds_len);
 		dseg++;
 	}
 
 	ds_cnt = (dseg - ((struct mlx5_wqe_data_seg *)&wqe->ctrl));
 
 	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode);
 	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
 	if (mlx5e_do_send_cqe(sq))
 		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
 	else
 		wqe->ctrl.fm_ce_se = 0;
 
 	/* Copy data for doorbell */
 	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));
 
 	/* Store pointer to mbuf */
 	sq->mbuf[pi].mbuf = mb;
 	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
 	sq->pc += sq->mbuf[pi].num_wqebbs;
 
 	/* Count all traffic going out */
 	sq->stats.packets++;
 	sq->stats.bytes += sq->mbuf[pi].num_bytes;
 
 	*mbp = NULL;	/* safety clear */
 	return (0);
 
 tx_drop:
 	sq->stats.dropped++;
 	*mbp = NULL;
 	m_freem(mb);
 	return err;
 }
 
 static void
 mlx5e_poll_tx_cq(struct mlx5e_sq *sq, int budget)
 {
 	u16 sqcc;
 
 	/*
 	 * sq->cc must be updated only after mlx5_cqwq_update_db_record(),
 	 * otherwise a cq overrun may occur
 	 */
 	sqcc = sq->cc;
 
 	while (budget > 0) {
 		struct mlx5_cqe64 *cqe;
 		struct mbuf *mb;
 		u16 x;
 		u16 ci;
 
 		cqe = mlx5e_get_cqe(&sq->cq);
 		if (!cqe)
 			break;
 
 		mlx5_cqwq_pop(&sq->cq.wq);
 
 		/* update budget according to the event factor */
 		budget -= sq->cev_factor;
 
 		for (x = 0; x != sq->cev_factor; x++) {
 			ci = sqcc & sq->wq.sz_m1;
 			mb = sq->mbuf[ci].mbuf;
 			sq->mbuf[ci].mbuf = NULL;
 
 			if (mb == NULL) {
 				if (sq->mbuf[ci].num_bytes == 0) {
 					/* NOP */
 					sq->stats.nop++;
 				}
 			} else {
 				bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map,
 				    BUS_DMASYNC_POSTWRITE);
 				bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map);
 
 				/* Free transmitted mbuf */
 				m_freem(mb);
 			}
 			sqcc += sq->mbuf[ci].num_wqebbs;
 		}
 	}
 
 	mlx5_cqwq_update_db_record(&sq->cq.wq);
 
 	/* Ensure cq space is freed before enabling more cqes */
 	atomic_thread_fence_rel();
 
 	sq->cc = sqcc;
 }
 
 static int
 mlx5e_xmit_locked(struct ifnet *ifp, struct mlx5e_sq *sq, struct mbuf *mb)
 {
 	int err = 0;
 
 	if (unlikely((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    READ_ONCE(sq->running) == 0)) {
 		m_freem(mb);
 		return (ENETDOWN);
 	}
 
 	/* Do transmit */
 	if (mlx5e_sq_xmit(sq, &mb) != 0) {
 		/* NOTE: m_freem() is NULL safe */
 		m_freem(mb);
 		err = ENOBUFS;
 	}
 
 	/* Check if we need to write the doorbell */
 	if (likely(sq->doorbell.d64 != 0)) {
 		mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0);
 		sq->doorbell.d64 = 0;
 	}
 
 	/*
 	 * Check if we need to start the event timer which flushes the
 	 * transmit ring on timeout:
 	 */
 	if (unlikely(sq->cev_next_state == MLX5E_CEV_STATE_INITIAL &&
 	    sq->cev_factor != 1)) {
 		/* start the timer */
 		mlx5e_sq_cev_timeout(sq);
 	} else {
 		/* don't send NOPs yet */
 		sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
 	}
 	return (err);
 }
 
 int
 mlx5e_xmit(struct ifnet *ifp, struct mbuf *mb)
 {
 	struct mlx5e_sq *sq;
 	int ret;
 
-	if (mb->m_pkthdr.snd_tag != NULL) {
+	if (mb->m_pkthdr.csum_flags & CSUM_SND_TAG) {
+		MPASS(mb->m_pkthdr.snd_tag->ifp == ifp);
 		sq = mlx5e_select_queue_by_send_tag(ifp, mb);
 		if (unlikely(sq == NULL)) {
-			/* Check for route change */
-			if (mb->m_pkthdr.snd_tag->ifp != ifp) {
-				/* Free mbuf */
-				m_freem(mb);
-
-				/*
-				 * Tell upper layers about route
-				 * change and to re-transmit this
-				 * packet:
-				 */
-				return (EAGAIN);
-			}
 			goto select_queue;
 		}
 	} else {
 select_queue:
 		sq = mlx5e_select_queue(ifp, mb);
 		if (unlikely(sq == NULL)) {
 			/* Free mbuf */
 			m_freem(mb);
 
 			/* Invalid send queue */
 			return (ENXIO);
 		}
 	}
 
 	mtx_lock(&sq->lock);
 	ret = mlx5e_xmit_locked(ifp, sq, mb);
 	mtx_unlock(&sq->lock);
 
 	return (ret);
 }
 
 void
 mlx5e_tx_cq_comp(struct mlx5_core_cq *mcq)
 {
 	struct mlx5e_sq *sq = container_of(mcq, struct mlx5e_sq, cq.mcq);
 
 	mtx_lock(&sq->comp_lock);
 	mlx5e_poll_tx_cq(sq, MLX5E_BUDGET_MAX);
 	mlx5e_cq_arm(&sq->cq, MLX5_GET_DOORBELL_LOCK(&sq->priv->doorbell_lock));
 	mtx_unlock(&sq->comp_lock);
 }
Index: head/sys/kern/kern_mbuf.c
===================================================================
--- head/sys/kern/kern_mbuf.c	(revision 348253)
+++ head/sys/kern/kern_mbuf.c	(revision 348254)
@@ -1,1151 +1,1182 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2004, 2005,
  *	Bosko Milekic <bmilekic@FreeBSD.org>.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_param.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/domainset.h>
 #include <sys/malloc.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/smp.h>
+#include <sys/socket.h>
 #include <sys/sysctl.h>
 
+#include <net/if.h>
+#include <net/if_var.h>
+
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/uma.h>
 #include <vm/uma_dbg.h>
 
 /*
  * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
  * Zones.
  *
  * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
  * Zone.  The Zone can be capped at kern.ipc.nmbclusters, if the
  * administrator so desires.
  *
  * Mbufs are allocated from a UMA Master Zone called the Mbuf
  * Zone.
  *
  * Additionally, FreeBSD provides a Packet Zone, which it
  * configures as a Secondary Zone to the Mbuf Master Zone,
  * thus sharing backend Slab kegs with the Mbuf Master Zone.
  *
  * Thus common-case allocations and locking are simplified:
  *
  *  m_clget()                m_getcl()
  *    |                         |
  *    |   .------------>[(Packet Cache)]    m_get(), m_gethdr()
  *    |   |             [     Packet   ]            |
  *  [(Cluster Cache)]   [    Secondary ]   [ (Mbuf Cache)     ]
  *  [ Cluster Zone  ]   [     Zone     ]   [ Mbuf Master Zone ]
  *        |                       \________         |
  *  [ Cluster Keg   ]                      \       /
  *        |	                         [ Mbuf Keg   ]
  *  [ Cluster Slabs ]                         |
  *        |                              [ Mbuf Slabs ]
  *         \____________(VM)_________________/
  *
  *
  * Whenever an object is allocated with uma_zalloc() out of
  * one of the Zones its _ctor_ function is executed.  The same
  * for any deallocation through uma_zfree() the _dtor_ function
  * is executed.
  *
  * Caches are per-CPU and are filled from the Master Zone.
  *
  * Whenever an object is allocated from the underlying global
  * memory pool it gets pre-initialized with the _zinit_ functions.
  * When the Keg's are overfull objects get decommissioned with
  * _zfini_ functions and free'd back to the global memory pool.
  *
  */
 
 int nmbufs;			/* limits number of mbufs */
 int nmbclusters;		/* limits number of mbuf clusters */
 int nmbjumbop;			/* limits number of page size jumbo clusters */
 int nmbjumbo9;			/* limits number of 9k jumbo clusters */
 int nmbjumbo16;			/* limits number of 16k jumbo clusters */
 
 static quad_t maxmbufmem;	/* overall real memory limit for all mbufs */
 
 SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0,
     "Maximum real memory allocatable to various mbuf types");
 
+static counter_u64_t snd_tag_count;
+SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW,
+    &snd_tag_count, "# of active mbuf send tags");
+
 /*
  * tunable_mbinit() has to be run before any mbuf allocations are done.
  */
 static void
 tunable_mbinit(void *dummy)
 {
 	quad_t realmem;
 
 	/*
 	 * The default limit for all mbuf related memory is 1/2 of all
 	 * available kernel memory (physical or kmem).
 	 * At most it can be 3/4 of available kernel memory.
 	 */
 	realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size);
 	maxmbufmem = realmem / 2;
 	TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem);
 	if (maxmbufmem > realmem / 4 * 3)
 		maxmbufmem = realmem / 4 * 3;
 
 	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
 	if (nmbclusters == 0)
 		nmbclusters = maxmbufmem / MCLBYTES / 4;
 
 	TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop);
 	if (nmbjumbop == 0)
 		nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4;
 
 	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9);
 	if (nmbjumbo9 == 0)
 		nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6;
 
 	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16);
 	if (nmbjumbo16 == 0)
 		nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6;
 
 	/*
 	 * We need at least as many mbufs as we have clusters of
 	 * the various types added together.
 	 */
 	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
 	if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16)
 		nmbufs = lmax(maxmbufmem / MSIZE / 5,
 		    nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16);
 }
 SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL);
 
 static int
 sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbclusters;
 
 	newnmbclusters = nmbclusters;
 	error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
 	if (error == 0 && req->newptr && newnmbclusters != nmbclusters) {
 		if (newnmbclusters > nmbclusters &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbclusters = newnmbclusters;
 			nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
 			EVENTHANDLER_INVOKE(nmbclusters_change);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW,
 &nmbclusters, 0, sysctl_nmbclusters, "IU",
     "Maximum number of mbuf clusters allowed");
 
 static int
 sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbjumbop;
 
 	newnmbjumbop = nmbjumbop;
 	error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
 	if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) {
 		if (newnmbjumbop > nmbjumbop &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbjumbop = newnmbjumbop;
 			nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW,
 &nmbjumbop, 0, sysctl_nmbjumbop, "IU",
     "Maximum number of mbuf page size jumbo clusters allowed");
 
 static int
 sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbjumbo9;
 
 	newnmbjumbo9 = nmbjumbo9;
 	error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
 	if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) {
 		if (newnmbjumbo9 > nmbjumbo9 &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbjumbo9 = newnmbjumbo9;
 			nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW,
 &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU",
     "Maximum number of mbuf 9k jumbo clusters allowed");
 
 static int
 sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbjumbo16;
 
 	newnmbjumbo16 = nmbjumbo16;
 	error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req);
 	if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) {
 		if (newnmbjumbo16 > nmbjumbo16 &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbjumbo16 = newnmbjumbo16;
 			nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW,
 &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU",
     "Maximum number of mbuf 16k jumbo clusters allowed");
 
 static int
 sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbufs;
 
 	newnmbufs = nmbufs;
 	error = sysctl_handle_int(oidp, &newnmbufs, 0, req);
 	if (error == 0 && req->newptr && newnmbufs != nmbufs) {
 		if (newnmbufs > nmbufs) {
 			nmbufs = newnmbufs;
 			nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
 			EVENTHANDLER_INVOKE(nmbufs_change);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT|CTLFLAG_RW,
 &nmbufs, 0, sysctl_nmbufs, "IU",
     "Maximum number of mbufs allowed");
 
 /*
  * Zones from which we allocate.
  */
 uma_zone_t	zone_mbuf;
 uma_zone_t	zone_clust;
 uma_zone_t	zone_pack;
 uma_zone_t	zone_jumbop;
 uma_zone_t	zone_jumbo9;
 uma_zone_t	zone_jumbo16;
 
 /*
  * Local prototypes.
  */
 static int	mb_ctor_mbuf(void *, int, void *, int);
 static int	mb_ctor_clust(void *, int, void *, int);
 static int	mb_ctor_pack(void *, int, void *, int);
 static void	mb_dtor_mbuf(void *, int, void *);
 static void	mb_dtor_pack(void *, int, void *);
 static int	mb_zinit_pack(void *, int, int);
 static void	mb_zfini_pack(void *, int);
 static void	mb_reclaim(uma_zone_t, int);
 static void    *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 
 /* Ensure that MSIZE is a power of 2. */
 CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
 
 /*
  * Initialize FreeBSD Network buffer allocation.
  */
 static void
 mbuf_init(void *dummy)
 {
 
 	/*
 	 * Configure UMA zones for Mbufs, Clusters, and Packets.
 	 */
 	zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE,
 	    mb_ctor_mbuf, mb_dtor_mbuf,
 #ifdef INVARIANTS
 	    trash_init, trash_fini,
 #else
 	    NULL, NULL,
 #endif
 	    MSIZE - 1, UMA_ZONE_MAXBUCKET);
 	if (nmbufs > 0)
 		nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
 	uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached");
 	uma_zone_set_maxaction(zone_mbuf, mb_reclaim);
 
 	zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
 	    mb_ctor_clust,
 #ifdef INVARIANTS
 	    trash_dtor, trash_init, trash_fini,
 #else
 	    NULL, NULL, NULL,
 #endif
 	    UMA_ALIGN_PTR, 0);
 	if (nmbclusters > 0)
 		nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
 	uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached");
 	uma_zone_set_maxaction(zone_clust, mb_reclaim);
 
 	zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
 	    mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);
 
 	/* Make jumbo frame zone too. Page size, 9k and 16k. */
 	zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE,
 	    mb_ctor_clust,
 #ifdef INVARIANTS
 	    trash_dtor, trash_init, trash_fini,
 #else
 	    NULL, NULL, NULL,
 #endif
 	    UMA_ALIGN_PTR, 0);
 	if (nmbjumbop > 0)
 		nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
 	uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached");
 	uma_zone_set_maxaction(zone_jumbop, mb_reclaim);
 
 	zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
 	    mb_ctor_clust,
 #ifdef INVARIANTS
 	    trash_dtor, trash_init, trash_fini,
 #else
 	    NULL, NULL, NULL,
 #endif
 	    UMA_ALIGN_PTR, 0);
 	uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc);
 	if (nmbjumbo9 > 0)
 		nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
 	uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached");
 	uma_zone_set_maxaction(zone_jumbo9, mb_reclaim);
 
 	zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
 	    mb_ctor_clust,
 #ifdef INVARIANTS
 	    trash_dtor, trash_init, trash_fini,
 #else
 	    NULL, NULL, NULL,
 #endif
 	    UMA_ALIGN_PTR, 0);
 	uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc);
 	if (nmbjumbo16 > 0)
 		nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
 	uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
 	uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);
 
 	/*
 	 * Hook event handler for low-memory situation, used to
 	 * drain protocols and push data back to the caches (UMA
 	 * later pushes it back to VM).
 	 */
 	EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
 	    EVENTHANDLER_PRI_FIRST);
+
+	snd_tag_count = counter_u64_alloc(M_WAITOK);
 }
 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);
 
 #ifdef NETDUMP
 /*
  * netdump makes use of a pre-allocated pool of mbufs and clusters.  When
  * netdump is configured, we initialize a set of UMA cache zones which return
  * items from this pool.  At panic-time, the regular UMA zone pointers are
  * overwritten with those of the cache zones so that drivers may allocate and
  * free mbufs and clusters without attempting to allocate physical memory.
  *
  * We keep mbufs and clusters in a pair of mbuf queues.  In particular, for
  * the purpose of caching clusters, we treat them as mbufs.
  */
 static struct mbufq nd_mbufq =
     { STAILQ_HEAD_INITIALIZER(nd_mbufq.mq_head), 0, INT_MAX };
 static struct mbufq nd_clustq =
     { STAILQ_HEAD_INITIALIZER(nd_clustq.mq_head), 0, INT_MAX };
 
 static int nd_clsize;
 static uma_zone_t nd_zone_mbuf;
 static uma_zone_t nd_zone_clust;
 static uma_zone_t nd_zone_pack;
 
 static int
 nd_buf_import(void *arg, void **store, int count, int domain __unused,
     int flags)
 {
 	struct mbufq *q;
 	struct mbuf *m;
 	int i;
 
 	q = arg;
 
 	for (i = 0; i < count; i++) {
 		m = mbufq_dequeue(q);
 		if (m == NULL)
 			break;
 		trash_init(m, q == &nd_mbufq ? MSIZE : nd_clsize, flags);
 		store[i] = m;
 	}
 	KASSERT((flags & M_WAITOK) == 0 || i == count,
 	    ("%s: ran out of pre-allocated mbufs", __func__));
 	return (i);
 }
 
 static void
 nd_buf_release(void *arg, void **store, int count)
 {
 	struct mbufq *q;
 	struct mbuf *m;
 	int i;
 
 	q = arg;
 
 	for (i = 0; i < count; i++) {
 		m = store[i];
 		(void)mbufq_enqueue(q, m);
 	}
 }
 
 static int
 nd_pack_import(void *arg __unused, void **store, int count, int domain __unused,
     int flags __unused)
 {
 	struct mbuf *m;
 	void *clust;
 	int i;
 
 	for (i = 0; i < count; i++) {
 		m = m_get(MT_DATA, M_NOWAIT);
 		if (m == NULL)
 			break;
 		clust = uma_zalloc(nd_zone_clust, M_NOWAIT);
 		if (clust == NULL) {
 			m_free(m);
 			break;
 		}
 		mb_ctor_clust(clust, nd_clsize, m, 0);
 		store[i] = m;
 	}
 	KASSERT((flags & M_WAITOK) == 0 || i == count,
 	    ("%s: ran out of pre-allocated mbufs", __func__));
 	return (i);
 }
 
 static void
 nd_pack_release(void *arg __unused, void **store, int count)
 {
 	struct mbuf *m;
 	void *clust;
 	int i;
 
 	for (i = 0; i < count; i++) {
 		m = store[i];
 		clust = m->m_ext.ext_buf;
 		uma_zfree(nd_zone_clust, clust);
 		uma_zfree(nd_zone_mbuf, m);
 	}
 }
 
 /*
  * Free the pre-allocated mbufs and clusters reserved for netdump, and destroy
  * the corresponding UMA cache zones.
  */
 void
 netdump_mbuf_drain(void)
 {
 	struct mbuf *m;
 	void *item;
 
 	if (nd_zone_mbuf != NULL) {
 		uma_zdestroy(nd_zone_mbuf);
 		nd_zone_mbuf = NULL;
 	}
 	if (nd_zone_clust != NULL) {
 		uma_zdestroy(nd_zone_clust);
 		nd_zone_clust = NULL;
 	}
 	if (nd_zone_pack != NULL) {
 		uma_zdestroy(nd_zone_pack);
 		nd_zone_pack = NULL;
 	}
 
 	while ((m = mbufq_dequeue(&nd_mbufq)) != NULL)
 		m_free(m);
 	while ((item = mbufq_dequeue(&nd_clustq)) != NULL)
 		uma_zfree(m_getzone(nd_clsize), item);
 }
 
 /*
  * Callback invoked immediately prior to starting a netdump.
  */
 void
 netdump_mbuf_dump(void)
 {
 
 	/*
 	 * All cluster zones return buffers of the size requested by the
 	 * drivers.  It's up to the driver to reinitialize the zones if the
 	 * MTU of a netdump-enabled interface changes.
 	 */
 	printf("netdump: overwriting mbuf zone pointers\n");
 	zone_mbuf = nd_zone_mbuf;
 	zone_clust = nd_zone_clust;
 	zone_pack = nd_zone_pack;
 	zone_jumbop = nd_zone_clust;
 	zone_jumbo9 = nd_zone_clust;
 	zone_jumbo16 = nd_zone_clust;
 }
 
 /*
  * Reinitialize the netdump mbuf+cluster pool and cache zones.
  */
 void
 netdump_mbuf_reinit(int nmbuf, int nclust, int clsize)
 {
 	struct mbuf *m;
 	void *item;
 
 	netdump_mbuf_drain();
 
 	nd_clsize = clsize;
 
 	nd_zone_mbuf = uma_zcache_create("netdump_" MBUF_MEM_NAME,
 	    MSIZE, mb_ctor_mbuf, mb_dtor_mbuf,
 #ifdef INVARIANTS
 	    trash_init, trash_fini,
 #else
 	    NULL, NULL,
 #endif
 	    nd_buf_import, nd_buf_release,
 	    &nd_mbufq, UMA_ZONE_NOBUCKET);
 
 	nd_zone_clust = uma_zcache_create("netdump_" MBUF_CLUSTER_MEM_NAME,
 	    clsize, mb_ctor_clust,
 #ifdef INVARIANTS
 	    trash_dtor, trash_init, trash_fini,
 #else
 	    NULL, NULL, NULL,
 #endif
 	    nd_buf_import, nd_buf_release,
 	    &nd_clustq, UMA_ZONE_NOBUCKET);
 
 	nd_zone_pack = uma_zcache_create("netdump_" MBUF_PACKET_MEM_NAME,
 	    MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL,
 	    nd_pack_import, nd_pack_release,
 	    NULL, UMA_ZONE_NOBUCKET);
 
 	while (nmbuf-- > 0) {
 		m = m_get(MT_DATA, M_WAITOK);
 		uma_zfree(nd_zone_mbuf, m);
 	}
 	while (nclust-- > 0) {
 		item = uma_zalloc(m_getzone(nd_clsize), M_WAITOK);
 		uma_zfree(nd_zone_clust, item);
 	}
 }
 #endif /* NETDUMP */
 
 /*
  * UMA backend page allocator for the jumbo frame zones.
  *
  * Allocates kernel virtual memory that is backed by contiguous physical
  * pages.
  */
 static void *
 mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
     int wait)
 {
 
 	/* Inform UMA that this allocator uses kernel_map/object. */
 	*flags = UMA_SLAB_KERNEL;
 	return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
 	    bytes, wait, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0,
 	    VM_MEMATTR_DEFAULT));
 }
 
 /*
  * Constructor for Mbuf master zone.
  *
  * The 'arg' pointer points to a mb_args structure which
  * contains call-specific information required to support the
  * mbuf allocation API.  See mbuf.h.
  */
 static int
 mb_ctor_mbuf(void *mem, int size, void *arg, int how)
 {
 	struct mbuf *m;
 	struct mb_args *args;
 	int error;
 	int flags;
 	short type;
 
 #ifdef INVARIANTS
 	trash_ctor(mem, size, arg, how);
 #endif
 	args = (struct mb_args *)arg;
 	type = args->type;
 
 	/*
 	 * The mbuf is initialized later.  The caller has the
 	 * responsibility to set up any MAC labels too.
 	 */
 	if (type == MT_NOINIT)
 		return (0);
 
 	m = (struct mbuf *)mem;
 	flags = args->flags;
 	MPASS((flags & M_NOFREE) == 0);
 
 	error = m_init(m, how, type, flags);
 
 	return (error);
 }
 
 /*
  * The Mbuf master zone destructor.
  */
 static void
 mb_dtor_mbuf(void *mem, int size, void *arg)
 {
 	struct mbuf *m;
 	unsigned long flags;
 
 	m = (struct mbuf *)mem;
 	flags = (unsigned long)arg;
 
 	KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
 	if (!(flags & MB_DTOR_SKIP) && (m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags))
 		m_tag_delete_chain(m, NULL);
 #ifdef INVARIANTS
 	trash_dtor(mem, size, arg);
 #endif
 }
 
 /*
  * The Mbuf Packet zone destructor.
  */
 static void
 mb_dtor_pack(void *mem, int size, void *arg)
 {
 	struct mbuf *m;
 
 	m = (struct mbuf *)mem;
 	if ((m->m_flags & M_PKTHDR) != 0)
 		m_tag_delete_chain(m, NULL);
 
 	/* Make sure we've got a clean cluster back. */
 	KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
 	KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__));
 	KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__));
 	KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__));
 	KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__));
 	KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__));
 	KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__));
 #ifdef INVARIANTS
 	trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg);
 #endif
 	/*
 	 * If there are processes blocked on zone_clust, waiting for pages
 	 * to be freed up, * cause them to be woken up by draining the
 	 * packet zone.  We are exposed to a race here * (in the check for
 	 * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
 	 * is deliberate. We don't want to acquire the zone lock for every
 	 * mbuf free.
 	 */
 	if (uma_zone_exhausted_nolock(zone_clust))
 		zone_drain(zone_pack);
 }
 
 /*
  * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor.
  *
  * Here the 'arg' pointer points to the Mbuf which we
  * are configuring cluster storage for.  If 'arg' is
  * empty we allocate just the cluster without setting
  * the mbuf to it.  See mbuf.h.
  */
 static int
 mb_ctor_clust(void *mem, int size, void *arg, int how)
 {
 	struct mbuf *m;
 
 #ifdef INVARIANTS
 	trash_ctor(mem, size, arg, how);
 #endif
 	m = (struct mbuf *)arg;
 	if (m != NULL) {
 		m->m_ext.ext_buf = (char *)mem;
 		m->m_data = m->m_ext.ext_buf;
 		m->m_flags |= M_EXT;
 		m->m_ext.ext_free = NULL;
 		m->m_ext.ext_arg1 = NULL;
 		m->m_ext.ext_arg2 = NULL;
 		m->m_ext.ext_size = size;
 		m->m_ext.ext_type = m_gettype(size);
 		m->m_ext.ext_flags = EXT_FLAG_EMBREF;
 		m->m_ext.ext_count = 1;
 	}
 
 	return (0);
 }
 
 /*
  * The Packet secondary zone's init routine, executed on the
  * object's transition from mbuf keg slab to zone cache.
  */
 static int
 mb_zinit_pack(void *mem, int size, int how)
 {
 	struct mbuf *m;
 
 	m = (struct mbuf *)mem;		/* m is virgin. */
 	if (uma_zalloc_arg(zone_clust, m, how) == NULL ||
 	    m->m_ext.ext_buf == NULL)
 		return (ENOMEM);
 	m->m_ext.ext_type = EXT_PACKET;	/* Override. */
 #ifdef INVARIANTS
 	trash_init(m->m_ext.ext_buf, MCLBYTES, how);
 #endif
 	return (0);
 }
 
 /*
  * The Packet secondary zone's fini routine, executed on the
  * object's transition from zone cache to keg slab.
  */
 static void
 mb_zfini_pack(void *mem, int size)
 {
 	struct mbuf *m;
 
 	m = (struct mbuf *)mem;
 #ifdef INVARIANTS
 	trash_fini(m->m_ext.ext_buf, MCLBYTES);
 #endif
 	uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
 #ifdef INVARIANTS
 	trash_dtor(mem, size, NULL);
 #endif
 }
 
 /*
  * The "packet" keg constructor.
  */
 static int
 mb_ctor_pack(void *mem, int size, void *arg, int how)
 {
 	struct mbuf *m;
 	struct mb_args *args;
 	int error, flags;
 	short type;
 
 	m = (struct mbuf *)mem;
 	args = (struct mb_args *)arg;
 	flags = args->flags;
 	type = args->type;
 	MPASS((flags & M_NOFREE) == 0);
 
 #ifdef INVARIANTS
 	trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how);
 #endif
 
 	error = m_init(m, how, type, flags);
 
 	/* m_ext is already initialized. */
 	m->m_data = m->m_ext.ext_buf;
  	m->m_flags = (flags | M_EXT);
 
 	return (error);
 }
 
 /*
  * This is the protocol drain routine.  Called by UMA whenever any of the
  * mbuf zones is closed to its limit.
  *
  * No locks should be held when this is called.  The drain routines have to
  * presently acquire some locks which raises the possibility of lock order
  * reversal.
  */
 static void
 mb_reclaim(uma_zone_t zone __unused, int pending __unused)
 {
 	struct domain *dp;
 	struct protosw *pr;
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__);
 
 	for (dp = domains; dp != NULL; dp = dp->dom_next)
 		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
 			if (pr->pr_drain != NULL)
 				(*pr->pr_drain)();
 }
 
 /*
  * Clean up after mbufs with M_EXT storage attached to them if the
  * reference count hits 1.
  */
 void
 mb_free_ext(struct mbuf *m)
 {
 	volatile u_int *refcnt;
 	struct mbuf *mref;
 	int freembuf;
 
 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
 
 	/* See if this is the mbuf that holds the embedded refcount. */
 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
 		refcnt = &m->m_ext.ext_count;
 		mref = m;
 	} else {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		refcnt = m->m_ext.ext_cnt;
 		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
 	}
 
 	/*
 	 * Check if the header is embedded in the cluster.  It is
 	 * important that we can't touch any of the mbuf fields
 	 * after we have freed the external storage, since mbuf
 	 * could have been embedded in it.  For now, the mbufs
 	 * embedded into the cluster are always of type EXT_EXTREF,
 	 * and for this type we won't free the mref.
 	 */
 	if (m->m_flags & M_NOFREE) {
 		freembuf = 0;
 		KASSERT(m->m_ext.ext_type == EXT_EXTREF ||
 		    m->m_ext.ext_type == EXT_RXRING,
 		    ("%s: no-free mbuf %p has wrong type", __func__, m));
 	} else
 		freembuf = 1;
 
 	/* Free attached storage if this mbuf is the only reference to it. */
 	if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
 		switch (m->m_ext.ext_type) {
 		case EXT_PACKET:
 			/* The packet zone is special. */
 			if (*refcnt == 0)
 				*refcnt = 1;
 			uma_zfree(zone_pack, mref);
 			break;
 		case EXT_CLUSTER:
 			uma_zfree(zone_clust, m->m_ext.ext_buf);
 			uma_zfree(zone_mbuf, mref);
 			break;
 		case EXT_JUMBOP:
 			uma_zfree(zone_jumbop, m->m_ext.ext_buf);
 			uma_zfree(zone_mbuf, mref);
 			break;
 		case EXT_JUMBO9:
 			uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
 			uma_zfree(zone_mbuf, mref);
 			break;
 		case EXT_JUMBO16:
 			uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
 			uma_zfree(zone_mbuf, mref);
 			break;
 		case EXT_SFBUF:
 		case EXT_NET_DRV:
 		case EXT_MOD_TYPE:
 		case EXT_DISPOSABLE:
 			KASSERT(mref->m_ext.ext_free != NULL,
 			    ("%s: ext_free not set", __func__));
 			mref->m_ext.ext_free(mref);
 			uma_zfree(zone_mbuf, mref);
 			break;
 		case EXT_EXTREF:
 			KASSERT(m->m_ext.ext_free != NULL,
 			    ("%s: ext_free not set", __func__));
 			m->m_ext.ext_free(m);
 			break;
 		case EXT_RXRING:
 			KASSERT(m->m_ext.ext_free == NULL,
 			    ("%s: ext_free is set", __func__));
 			break;
 		default:
 			KASSERT(m->m_ext.ext_type == 0,
 			    ("%s: unknown ext_type", __func__));
 		}
 	}
 
 	if (freembuf && m != mref)
 		uma_zfree(zone_mbuf, m);
 }
 
 /*
  * Official mbuf(9) allocation KPI for stack and drivers:
  *
  * m_get()	- a single mbuf without any attachments, sys/mbuf.h.
  * m_gethdr()	- a single mbuf initialized as M_PKTHDR, sys/mbuf.h.
  * m_getcl()	- an mbuf + 2k cluster, sys/mbuf.h.
  * m_clget()	- attach cluster to already allocated mbuf.
  * m_cljget()	- attach jumbo cluster to already allocated mbuf.
  * m_get2()	- allocate minimum mbuf that would fit size argument.
  * m_getm2()	- allocate a chain of mbufs/clusters.
  * m_extadd()	- attach external cluster to mbuf.
  *
  * m_free()	- free single mbuf with its tags and ext, sys/mbuf.h.
  * m_freem()	- free chain of mbufs.
  */
 
 int
 m_clget(struct mbuf *m, int how)
 {
 
 	KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
 	    __func__, m));
 	m->m_ext.ext_buf = (char *)NULL;
 	uma_zalloc_arg(zone_clust, m, how);
 	/*
 	 * On a cluster allocation failure, drain the packet zone and retry,
 	 * we might be able to loosen a few clusters up on the drain.
 	 */
 	if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) {
 		zone_drain(zone_pack);
 		uma_zalloc_arg(zone_clust, m, how);
 	}
 	MBUF_PROBE2(m__clget, m, how);
 	return (m->m_flags & M_EXT);
 }
 
 /*
  * m_cljget() is different from m_clget() as it can allocate clusters without
  * attaching them to an mbuf.  In that case the return value is the pointer
  * to the cluster of the requested size.  If an mbuf was specified, it gets
  * the cluster attached to it and the return value can be safely ignored.
  * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
  */
 void *
 m_cljget(struct mbuf *m, int how, int size)
 {
 	uma_zone_t zone;
 	void *retval;
 
 	if (m != NULL) {
 		KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
 		    __func__, m));
 		m->m_ext.ext_buf = NULL;
 	}
 
 	zone = m_getzone(size);
 	retval = uma_zalloc_arg(zone, m, how);
 
 	MBUF_PROBE4(m__cljget, m, how, size, retval);
 
 	return (retval);
 }
 
 /*
  * m_get2() allocates minimum mbuf that would fit "size" argument.
  */
 struct mbuf *
 m_get2(int size, int how, short type, int flags)
 {
 	struct mb_args args;
 	struct mbuf *m, *n;
 
 	args.flags = flags;
 	args.type = type;
 
 	if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0))
 		return (uma_zalloc_arg(zone_mbuf, &args, how));
 	if (size <= MCLBYTES)
 		return (uma_zalloc_arg(zone_pack, &args, how));
 
 	if (size > MJUMPAGESIZE)
 		return (NULL);
 
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	if (m == NULL)
 		return (NULL);
 
 	n = uma_zalloc_arg(zone_jumbop, m, how);
 	if (n == NULL) {
 		uma_zfree(zone_mbuf, m);
 		return (NULL);
 	}
 
 	return (m);
 }
 
 /*
  * m_getjcl() returns an mbuf with a cluster of the specified size attached.
  * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
  */
 struct mbuf *
 m_getjcl(int how, short type, int flags, int size)
 {
 	struct mb_args args;
 	struct mbuf *m, *n;
 	uma_zone_t zone;
 
 	if (size == MCLBYTES)
 		return m_getcl(how, type, flags);
 
 	args.flags = flags;
 	args.type = type;
 
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	if (m == NULL)
 		return (NULL);
 
 	zone = m_getzone(size);
 	n = uma_zalloc_arg(zone, m, how);
 	if (n == NULL) {
 		uma_zfree(zone_mbuf, m);
 		return (NULL);
 	}
 	return (m);
 }
 
 /*
  * Allocate a given length worth of mbufs and/or clusters (whatever fits
  * best) and return a pointer to the top of the allocated chain.  If an
  * existing mbuf chain is provided, then we will append the new chain
  * to the existing one and return a pointer to the provided mbuf.
  */
 struct mbuf *
 m_getm2(struct mbuf *m, int len, int how, short type, int flags)
 {
 	struct mbuf *mb, *nm = NULL, *mtail = NULL;
 
 	KASSERT(len >= 0, ("%s: len is < 0", __func__));
 
 	/* Validate flags. */
 	flags &= (M_PKTHDR | M_EOR);
 
 	/* Packet header mbuf must be first in chain. */
 	if ((flags & M_PKTHDR) && m != NULL)
 		flags &= ~M_PKTHDR;
 
 	/* Loop and append maximum sized mbufs to the chain tail. */
 	while (len > 0) {
 		if (len > MCLBYTES)
 			mb = m_getjcl(how, type, (flags & M_PKTHDR),
 			    MJUMPAGESIZE);
 		else if (len >= MINCLSIZE)
 			mb = m_getcl(how, type, (flags & M_PKTHDR));
 		else if (flags & M_PKTHDR)
 			mb = m_gethdr(how, type);
 		else
 			mb = m_get(how, type);
 
 		/* Fail the whole operation if one mbuf can't be allocated. */
 		if (mb == NULL) {
 			if (nm != NULL)
 				m_freem(nm);
 			return (NULL);
 		}
 
 		/* Book keeping. */
 		len -= M_SIZE(mb);
 		if (mtail != NULL)
 			mtail->m_next = mb;
 		else
 			nm = mb;
 		mtail = mb;
 		flags &= ~M_PKTHDR;	/* Only valid on the first mbuf. */
 	}
 	if (flags & M_EOR)
 		mtail->m_flags |= M_EOR;  /* Only valid on the last mbuf. */
 
 	/* If mbuf was supplied, append new chain to the end of it. */
 	if (m != NULL) {
 		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
 			;
 		mtail->m_next = nm;
 		mtail->m_flags &= ~M_EOR;
 	} else
 		m = nm;
 
 	return (m);
 }
 
 /*-
  * Configure a provided mbuf to refer to the provided external storage
  * buffer and setup a reference count for said buffer.
  *
  * Arguments:
  *    mb     The existing mbuf to which to attach the provided buffer.
  *    buf    The address of the provided external storage buffer.
  *    size   The size of the provided buffer.
  *    freef  A pointer to a routine that is responsible for freeing the
  *           provided external storage buffer.
  *    args   A pointer to an argument structure (of any type) to be passed
  *           to the provided freef routine (may be NULL).
  *    flags  Any other flags to be passed to the provided mbuf.
  *    type   The type that the external storage buffer should be
  *           labeled with.
  *
  * Returns:
  *    Nothing.
  */
 void
 m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef,
     void *arg1, void *arg2, int flags, int type)
 {
 
 	KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));
 
 	mb->m_flags |= (M_EXT | flags);
 	mb->m_ext.ext_buf = buf;
 	mb->m_data = mb->m_ext.ext_buf;
 	mb->m_ext.ext_size = size;
 	mb->m_ext.ext_free = freef;
 	mb->m_ext.ext_arg1 = arg1;
 	mb->m_ext.ext_arg2 = arg2;
 	mb->m_ext.ext_type = type;
 
 	if (type != EXT_EXTREF) {
 		mb->m_ext.ext_count = 1;
 		mb->m_ext.ext_flags = EXT_FLAG_EMBREF;
 	} else
 		mb->m_ext.ext_flags = 0;
 }
 
 /*
  * Free an entire chain of mbufs and associated external buffers, if
  * applicable.
  */
 void
 m_freem(struct mbuf *mb)
 {
 
 	MBUF_PROBE1(m__freem, mb);
 	while (mb != NULL)
 		mb = m_free(mb);
+}
+
+void
+m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp)
+{
+
+	if_ref(ifp);
+	mst->ifp = ifp;
+	refcount_init(&mst->refcount, 1);
+	counter_u64_add(snd_tag_count, 1);
+}
+
+void
+m_snd_tag_destroy(struct m_snd_tag *mst)
+{
+	struct ifnet *ifp;
+
+	ifp = mst->ifp;
+	ifp->if_snd_tag_free(mst);
+	if_rele(ifp);
+	counter_u64_add(snd_tag_count, -1);
 }
Index: head/sys/kern/uipc_mbuf.c
===================================================================
--- head/sys/kern/uipc_mbuf.c	(revision 348253)
+++ head/sys/kern/uipc_mbuf.c	(revision 348254)
@@ -1,1875 +1,1891 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_param.h"
 #include "opt_mbuf_stress_test.h"
 #include "opt_mbuf_profiling.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/sysctl.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/uio.h>
 #include <sys/sdt.h>
 
 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
     "struct mbuf *", "mbufinfo_t *",
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "uint32_t", "uint32_t",
     "uint32_t", "uint32_t");
 
 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr,
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get,
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl,
     "uint32_t", "uint32_t",
     "uint16_t", "uint16_t",
     "uint32_t", "uint32_t",
     "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget,
     "struct mbuf *", "mbufinfo_t *",
     "uint32_t", "uint32_t",
     "uint32_t", "uint32_t");
 
 SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget,
     "struct mbuf *", "mbufinfo_t *",
     "uint32_t", "uint32_t",
     "uint32_t", "uint32_t",
     "void*", "void*");
 
 SDT_PROBE_DEFINE(sdt, , , m__cljset);
 
 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free,
         "struct mbuf *", "mbufinfo_t *");
 
 SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem,
     "struct mbuf *", "mbufinfo_t *");
 
 #include <security/mac/mac_framework.h>
 
 int	max_linkhdr;
 int	max_protohdr;
 int	max_hdr;
 int	max_datalen;
 #ifdef MBUF_STRESS_TEST
 int	m_defragpackets;
 int	m_defragbytes;
 int	m_defraguseless;
 int	m_defragfailure;
 int	m_defragrandomfailures;
 #endif
 
 /*
  * sysctl(8) exported objects
  */
 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
 	   &max_linkhdr, 0, "Size of largest link layer header");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
 	   &max_protohdr, 0, "Size of largest protocol layer header");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD,
 	   &max_hdr, 0, "Size of largest link plus protocol header");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD,
 	   &max_datalen, 0, "Minimum space left in mbuf after max_hdr");
 #ifdef MBUF_STRESS_TEST
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
 	   &m_defragpackets, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
 	   &m_defragbytes, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
 	   &m_defraguseless, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
 	   &m_defragfailure, 0, "");
 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
 	   &m_defragrandomfailures, 0, "");
 #endif
 
 /*
  * Ensure the correct size of various mbuf parameters.  It could be off due
  * to compiler-induced padding and alignment artifacts.
  */
 CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN);
 CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN);
 
 /*
  * mbuf data storage should be 64-bit aligned regardless of architectural
  * pointer size; check this is the case with and without a packet header.
  */
 CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0);
 CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0);
 
 /*
  * While the specific values here don't matter too much (i.e., +/- a few
  * words), we do want to ensure that changes to these values are carefully
  * reasoned about and properly documented.  This is especially the case as
  * network-protocol and device-driver modules encode these layouts, and must
  * be recompiled if the structures change.  Check these values at compile time
  * against the ones documented in comments in mbuf.h.
  *
  * NB: Possibly they should be documented there via #define's and not just
  * comments.
  */
 #if defined(__LP64__)
 CTASSERT(offsetof(struct mbuf, m_dat) == 32);
 CTASSERT(sizeof(struct pkthdr) == 56);
 CTASSERT(sizeof(struct m_ext) == 48);
 #else
 CTASSERT(offsetof(struct mbuf, m_dat) == 24);
 CTASSERT(sizeof(struct pkthdr) == 48);
 CTASSERT(sizeof(struct m_ext) == 28);
 #endif
 
 /*
  * Assert that the queue(3) macros produce code of the same size as an old
  * plain pointer does.
  */
 #ifdef INVARIANTS
 static struct mbuf __used m_assertbuf;
 CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next));
 CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next));
 CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt));
 CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt));
 #endif
 
 /*
  * Attach the cluster from *m to *n, set up m_ext in *n
  * and bump the refcount of the cluster.
  */
 void
 mb_dupcl(struct mbuf *n, struct mbuf *m)
 {
 	volatile u_int *refcnt;
 
 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
 	KASSERT(!(n->m_flags & M_EXT), ("%s: M_EXT set on %p", __func__, n));
 
 	/*
 	 * Cache access optimization.  For most kinds of external
 	 * storage we don't need full copy of m_ext, since the
 	 * holder of the 'ext_count' is responsible to carry the
 	 * free routine and its arguments.  Exclusion is EXT_EXTREF,
 	 * where 'ext_cnt' doesn't point into mbuf at all.
 	 */
 	if (m->m_ext.ext_type == EXT_EXTREF)
 		bcopy(&m->m_ext, &n->m_ext, sizeof(struct m_ext));
 	else
 		bcopy(&m->m_ext, &n->m_ext, m_ext_copylen);
 	n->m_flags |= M_EXT;
 	n->m_flags |= m->m_flags & M_RDONLY;
 
 	/* See if this is the mbuf that holds the embedded refcount. */
 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
 		refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count;
 		n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF;
 	} else {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		refcnt = m->m_ext.ext_cnt;
 	}
 
 	if (*refcnt == 1)
 		*refcnt += 1;
 	else
 		atomic_add_int(refcnt, 1);
 }
 
 void
 m_demote_pkthdr(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	m_tag_delete_chain(m, NULL);
 	m->m_flags &= ~M_PKTHDR;
 	bzero(&m->m_pkthdr, sizeof(struct pkthdr));
 }
 
 /*
  * Clean up mbuf (chain) from any tags and packet headers.
  * If "all" is set then the first mbuf in the chain will be
  * cleaned too.
  */
 void
 m_demote(struct mbuf *m0, int all, int flags)
 {
 	struct mbuf *m;
 
 	for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) {
 		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p",
 		    __func__, m, m0));
 		if (m->m_flags & M_PKTHDR)
 			m_demote_pkthdr(m);
 		m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags);
 	}
 }
 
 /*
  * Sanity checks on mbuf (chain) for use in KASSERT() and general
  * debugging.
  * Returns 0 or panics when bad and 1 on all tests passed.
  * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they
  * blow up later.
  */
 int
 m_sanity(struct mbuf *m0, int sanitize)
 {
 	struct mbuf *m;
 	caddr_t a, b;
 	int pktlen = 0;
 
 #ifdef INVARIANTS
 #define	M_SANITY_ACTION(s)	panic("mbuf %p: " s, m)
 #else
 #define	M_SANITY_ACTION(s)	printf("mbuf %p: " s, m)
 #endif
 
 	for (m = m0; m != NULL; m = m->m_next) {
 		/*
 		 * Basic pointer checks.  If any of these fails then some
 		 * unrelated kernel memory before or after us is trashed.
 		 * No way to recover from that.
 		 */
 		a = M_START(m);
 		b = a + M_SIZE(m);
 		if ((caddr_t)m->m_data < a)
 			M_SANITY_ACTION("m_data outside mbuf data range left");
 		if ((caddr_t)m->m_data > b)
 			M_SANITY_ACTION("m_data outside mbuf data range right");
 		if ((caddr_t)m->m_data + m->m_len > b)
 			M_SANITY_ACTION("m_data + m_len exeeds mbuf space");
 
 		/* m->m_nextpkt may only be set on first mbuf in chain. */
 		if (m != m0 && m->m_nextpkt != NULL) {
 			if (sanitize) {
 				m_freem(m->m_nextpkt);
 				m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
 			} else
 				M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
 		}
 
 		/* packet length (not mbuf length!) calculation */
 		if (m0->m_flags & M_PKTHDR)
 			pktlen += m->m_len;
 
 		/* m_tags may only be attached to first mbuf in chain. */
 		if (m != m0 && m->m_flags & M_PKTHDR &&
 		    !SLIST_EMPTY(&m->m_pkthdr.tags)) {
 			if (sanitize) {
 				m_tag_delete_chain(m, NULL);
 				/* put in 0xDEADC0DE perhaps? */
 			} else
 				M_SANITY_ACTION("m_tags on in-chain mbuf");
 		}
 
 		/* M_PKTHDR may only be set on first mbuf in chain */
 		if (m != m0 && m->m_flags & M_PKTHDR) {
 			if (sanitize) {
 				bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
 				m->m_flags &= ~M_PKTHDR;
 				/* put in 0xDEADCODE and leave hdr flag in */
 			} else
 				M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
 		}
 	}
 	m = m0;
 	if (pktlen && pktlen != m->m_pkthdr.len) {
 		if (sanitize)
 			m->m_pkthdr.len = 0;
 		else
 			M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
 	}
 	return 1;
 
 #undef	M_SANITY_ACTION
 }
 
 /*
  * Non-inlined part of m_init().
  */
 int
 m_pkthdr_init(struct mbuf *m, int how)
 {
 #ifdef MAC
 	int error;
 #endif
 	m->m_data = m->m_pktdat;
 	bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
 #ifdef NUMA
 	m->m_pkthdr.numa_domain = M_NODOM;
 #endif
 #ifdef MAC
 	/* If the label init fails, fail the alloc */
 	error = mac_mbuf_init(m, how);
 	if (error)
 		return (error);
 #endif
 
 	return (0);
 }
 
 /*
  * "Move" mbuf pkthdr from "from" to "to".
  * "from" must have M_PKTHDR set, and "to" must be empty.
  */
 void
 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
 {
 
 #if 0
 	/* see below for why these are not enabled */
 	M_ASSERTPKTHDR(to);
 	/* Note: with MAC, this may not be a good assertion. */
 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
 	    ("m_move_pkthdr: to has tags"));
 #endif
 #ifdef MAC
 	/*
 	 * XXXMAC: It could be this should also occur for non-MAC?
 	 */
 	if (to->m_flags & M_PKTHDR)
 		m_tag_delete_chain(to, NULL);
 #endif
 	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
 	if ((to->m_flags & M_EXT) == 0)
 		to->m_data = to->m_pktdat;
 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
 	from->m_flags &= ~M_PKTHDR;
+	if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) {
+		from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
+		from->m_pkthdr.snd_tag = NULL;
+	}
 }
 
 /*
  * Duplicate "from"'s mbuf pkthdr in "to".
  * "from" must have M_PKTHDR set, and "to" must be empty.
  * In particular, this does a deep copy of the packet tags.
  */
 int
 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
 {
 
 #if 0
 	/*
 	 * The mbuf allocator only initializes the pkthdr
 	 * when the mbuf is allocated with m_gethdr(). Many users
 	 * (e.g. m_copy*, m_prepend) use m_get() and then
 	 * smash the pkthdr as needed causing these
 	 * assertions to trip.  For now just disable them.
 	 */
 	M_ASSERTPKTHDR(to);
 	/* Note: with MAC, this may not be a good assertion. */
 	KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags"));
 #endif
 	MBUF_CHECKSLEEP(how);
 #ifdef MAC
 	if (to->m_flags & M_PKTHDR)
 		m_tag_delete_chain(to, NULL);
 #endif
 	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
 	if ((to->m_flags & M_EXT) == 0)
 		to->m_data = to->m_pktdat;
 	to->m_pkthdr = from->m_pkthdr;
+	if (from->m_pkthdr.csum_flags & CSUM_SND_TAG)
+		m_snd_tag_ref(from->m_pkthdr.snd_tag);
 	SLIST_INIT(&to->m_pkthdr.tags);
 	return (m_tag_copy_chain(to, from, how));
 }
 
 /*
  * Lesser-used path for M_PREPEND:
  * allocate new mbuf to prepend to chain,
  * copy junk along.
  */
 struct mbuf *
 m_prepend(struct mbuf *m, int len, int how)
 {
 	struct mbuf *mn;
 
 	if (m->m_flags & M_PKTHDR)
 		mn = m_gethdr(how, m->m_type);
 	else
 		mn = m_get(how, m->m_type);
 	if (mn == NULL) {
 		m_freem(m);
 		return (NULL);
 	}
 	if (m->m_flags & M_PKTHDR)
 		m_move_pkthdr(mn, m);
 	mn->m_next = m;
 	m = mn;
 	if (len < M_SIZE(m))
 		M_ALIGN(m, len);
 	m->m_len = len;
 	return (m);
 }
 
 /*
  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
  * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
  * Note that the copy is read-only, because clusters are not copied,
  * only their reference counts are incremented.
  */
 struct mbuf *
 m_copym(struct mbuf *m, int off0, int len, int wait)
 {
 	struct mbuf *n, **np;
 	int off = off0;
 	struct mbuf *top;
 	int copyhdr = 0;
 
 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
 	MBUF_CHECKSLEEP(wait);
 	if (off == 0 && m->m_flags & M_PKTHDR)
 		copyhdr = 1;
 	while (off > 0) {
 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
 		if (off < m->m_len)
 			break;
 		off -= m->m_len;
 		m = m->m_next;
 	}
 	np = &top;
 	top = NULL;
 	while (len > 0) {
 		if (m == NULL) {
 			KASSERT(len == M_COPYALL,
 			    ("m_copym, length > size of mbuf chain"));
 			break;
 		}
 		if (copyhdr)
 			n = m_gethdr(wait, m->m_type);
 		else
 			n = m_get(wait, m->m_type);
 		*np = n;
 		if (n == NULL)
 			goto nospace;
 		if (copyhdr) {
 			if (!m_dup_pkthdr(n, m, wait))
 				goto nospace;
 			if (len == M_COPYALL)
 				n->m_pkthdr.len -= off0;
 			else
 				n->m_pkthdr.len = len;
 			copyhdr = 0;
 		}
 		n->m_len = min(len, m->m_len - off);
 		if (m->m_flags & M_EXT) {
 			n->m_data = m->m_data + off;
 			mb_dupcl(n, m);
 		} else
 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
 			    (u_int)n->m_len);
 		if (len != M_COPYALL)
 			len -= n->m_len;
 		off = 0;
 		m = m->m_next;
 		np = &n->m_next;
 	}
 
 	return (top);
 nospace:
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Copy an entire packet, including header (which must be present).
  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
  * Note that the copy is read-only, because clusters are not copied,
  * only their reference counts are incremented.
  * Preserve alignment of the first mbuf so if the creator has left
  * some room at the beginning (e.g. for inserting protocol headers)
  * the copies still have the room available.
  */
 struct mbuf *
 m_copypacket(struct mbuf *m, int how)
 {
 	struct mbuf *top, *n, *o;
 
 	MBUF_CHECKSLEEP(how);
 	n = m_get(how, m->m_type);
 	top = n;
 	if (n == NULL)
 		goto nospace;
 
 	if (!m_dup_pkthdr(n, m, how))
 		goto nospace;
 	n->m_len = m->m_len;
 	if (m->m_flags & M_EXT) {
 		n->m_data = m->m_data;
 		mb_dupcl(n, m);
 	} else {
 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
 	}
 
 	m = m->m_next;
 	while (m) {
 		o = m_get(how, m->m_type);
 		if (o == NULL)
 			goto nospace;
 
 		n->m_next = o;
 		n = n->m_next;
 
 		n->m_len = m->m_len;
 		if (m->m_flags & M_EXT) {
 			n->m_data = m->m_data;
 			mb_dupcl(n, m);
 		} else {
 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
 		}
 
 		m = m->m_next;
 	}
 	return top;
 nospace:
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Copy data from an mbuf chain starting "off" bytes from the beginning,
  * continuing for "len" bytes, into the indicated buffer.
  */
 void
 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
 {
 	u_int count;
 
 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
 	while (off > 0) {
 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
 		if (off < m->m_len)
 			break;
 		off -= m->m_len;
 		m = m->m_next;
 	}
 	while (len > 0) {
 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
 		count = min(m->m_len - off, len);
 		bcopy(mtod(m, caddr_t) + off, cp, count);
 		len -= count;
 		cp += count;
 		off = 0;
 		m = m->m_next;
 	}
 }
 
 /*
  * Copy a packet header mbuf chain into a completely new chain, including
  * copying any mbuf clusters.  Use this instead of m_copypacket() when
  * you need a writable copy of an mbuf chain.
  */
 struct mbuf *
 m_dup(const struct mbuf *m, int how)
 {
 	struct mbuf **p, *top = NULL;
 	int remain, moff, nsize;
 
 	MBUF_CHECKSLEEP(how);
 	/* Sanity check */
 	if (m == NULL)
 		return (NULL);
 	M_ASSERTPKTHDR(m);
 
 	/* While there's more data, get a new mbuf, tack it on, and fill it */
 	remain = m->m_pkthdr.len;
 	moff = 0;
 	p = &top;
 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
 		struct mbuf *n;
 
 		/* Get the next new mbuf */
 		if (remain >= MINCLSIZE) {
 			n = m_getcl(how, m->m_type, 0);
 			nsize = MCLBYTES;
 		} else {
 			n = m_get(how, m->m_type);
 			nsize = MLEN;
 		}
 		if (n == NULL)
 			goto nospace;
 
 		if (top == NULL) {		/* First one, must be PKTHDR */
 			if (!m_dup_pkthdr(n, m, how)) {
 				m_free(n);
 				goto nospace;
 			}
 			if ((n->m_flags & M_EXT) == 0)
 				nsize = MHLEN;
 			n->m_flags &= ~M_RDONLY;
 		}
 		n->m_len = 0;
 
 		/* Link it into the new chain */
 		*p = n;
 		p = &n->m_next;
 
 		/* Copy data from original mbuf(s) into new mbuf */
 		while (n->m_len < nsize && m != NULL) {
 			int chunk = min(nsize - n->m_len, m->m_len - moff);
 
 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
 			moff += chunk;
 			n->m_len += chunk;
 			remain -= chunk;
 			if (moff == m->m_len) {
 				m = m->m_next;
 				moff = 0;
 			}
 		}
 
 		/* Check correct total mbuf length */
 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
 		    	("%s: bogus m_pkthdr.len", __func__));
 	}
 	return (top);
 
 nospace:
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Concatenate mbuf chain n to m.
  * Both chains must be of the same type (e.g. MT_DATA).
  * Any m_pkthdr is not updated.
  */
 void
 m_cat(struct mbuf *m, struct mbuf *n)
 {
 	while (m->m_next)
 		m = m->m_next;
 	while (n) {
 		if (!M_WRITABLE(m) ||
 		    M_TRAILINGSPACE(m) < n->m_len) {
 			/* just join the two chains */
 			m->m_next = n;
 			return;
 		}
 		/* splat the data from one into the other */
 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
 		    (u_int)n->m_len);
 		m->m_len += n->m_len;
 		n = m_free(n);
 	}
 }
 
 /*
  * Concatenate two pkthdr mbuf chains.
  */
 void
 m_catpkt(struct mbuf *m, struct mbuf *n)
 {
 
 	M_ASSERTPKTHDR(m);
 	M_ASSERTPKTHDR(n);
 
 	m->m_pkthdr.len += n->m_pkthdr.len;
 	m_demote(n, 1, 0);
 
 	m_cat(m, n);
 }
 
 void
 m_adj(struct mbuf *mp, int req_len)
 {
 	int len = req_len;
 	struct mbuf *m;
 	int count;
 
 	if ((m = mp) == NULL)
 		return;
 	if (len >= 0) {
 		/*
 		 * Trim from head.
 		 */
 		while (m != NULL && len > 0) {
 			if (m->m_len <= len) {
 				len -= m->m_len;
 				m->m_len = 0;
 				m = m->m_next;
 			} else {
 				m->m_len -= len;
 				m->m_data += len;
 				len = 0;
 			}
 		}
 		if (mp->m_flags & M_PKTHDR)
 			mp->m_pkthdr.len -= (req_len - len);
 	} else {
 		/*
 		 * Trim from tail.  Scan the mbuf chain,
 		 * calculating its length and finding the last mbuf.
 		 * If the adjustment only affects this mbuf, then just
 		 * adjust and return.  Otherwise, rescan and truncate
 		 * after the remaining size.
 		 */
 		len = -len;
 		count = 0;
 		for (;;) {
 			count += m->m_len;
 			if (m->m_next == (struct mbuf *)0)
 				break;
 			m = m->m_next;
 		}
 		if (m->m_len >= len) {
 			m->m_len -= len;
 			if (mp->m_flags & M_PKTHDR)
 				mp->m_pkthdr.len -= len;
 			return;
 		}
 		count -= len;
 		if (count < 0)
 			count = 0;
 		/*
 		 * Correct length for chain is "count".
 		 * Find the mbuf with last data, adjust its length,
 		 * and toss data from remaining mbufs on chain.
 		 */
 		m = mp;
 		if (m->m_flags & M_PKTHDR)
 			m->m_pkthdr.len = count;
 		for (; m; m = m->m_next) {
 			if (m->m_len >= count) {
 				m->m_len = count;
 				if (m->m_next != NULL) {
 					m_freem(m->m_next);
 					m->m_next = NULL;
 				}
 				break;
 			}
 			count -= m->m_len;
 		}
 	}
 }
 
 /*
  * Rearange an mbuf chain so that len bytes are contiguous
  * and in the data area of an mbuf (so that mtod will work
  * for a structure of size len).  Returns the resulting
  * mbuf chain on success, frees it and returns null on failure.
  * If there is room, it will add up to max_protohdr-len extra bytes to the
  * contiguous region in an attempt to avoid being called next time.
  */
 struct mbuf *
 m_pullup(struct mbuf *n, int len)
 {
 	struct mbuf *m;
 	int count;
 	int space;
 
 	/*
 	 * If first mbuf has no cluster, and has room for len bytes
 	 * without shifting current data, pullup into it,
 	 * otherwise allocate a new mbuf to prepend to the chain.
 	 */
 	if ((n->m_flags & M_EXT) == 0 &&
 	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
 		if (n->m_len >= len)
 			return (n);
 		m = n;
 		n = n->m_next;
 		len -= m->m_len;
 	} else {
 		if (len > MHLEN)
 			goto bad;
 		m = m_get(M_NOWAIT, n->m_type);
 		if (m == NULL)
 			goto bad;
 		if (n->m_flags & M_PKTHDR)
 			m_move_pkthdr(m, n);
 	}
 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
 	do {
 		count = min(min(max(len, max_protohdr), space), n->m_len);
 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
 		  (u_int)count);
 		len -= count;
 		m->m_len += count;
 		n->m_len -= count;
 		space -= count;
 		if (n->m_len)
 			n->m_data += count;
 		else
 			n = m_free(n);
 	} while (len > 0 && n);
 	if (len > 0) {
 		(void) m_free(m);
 		goto bad;
 	}
 	m->m_next = n;
 	return (m);
 bad:
 	m_freem(n);
 	return (NULL);
 }
 
 /*
  * Like m_pullup(), except a new mbuf is always allocated, and we allow
  * the amount of empty space before the data in the new mbuf to be specified
  * (in the event that the caller expects to prepend later).
  */
 struct mbuf *
 m_copyup(struct mbuf *n, int len, int dstoff)
 {
 	struct mbuf *m;
 	int count, space;
 
 	if (len > (MHLEN - dstoff))
 		goto bad;
 	m = m_get(M_NOWAIT, n->m_type);
 	if (m == NULL)
 		goto bad;
 	if (n->m_flags & M_PKTHDR)
 		m_move_pkthdr(m, n);
 	m->m_data += dstoff;
 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
 	do {
 		count = min(min(max(len, max_protohdr), space), n->m_len);
 		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
 		    (unsigned)count);
 		len -= count;
 		m->m_len += count;
 		n->m_len -= count;
 		space -= count;
 		if (n->m_len)
 			n->m_data += count;
 		else
 			n = m_free(n);
 	} while (len > 0 && n);
 	if (len > 0) {
 		(void) m_free(m);
 		goto bad;
 	}
 	m->m_next = n;
 	return (m);
  bad:
 	m_freem(n);
 	return (NULL);
 }
 
 /*
  * Partition an mbuf chain in two pieces, returning the tail --
  * all but the first len0 bytes.  In case of failure, it returns NULL and
  * attempts to restore the chain to its original state.
  *
  * Note that the resulting mbufs might be read-only, because the new
  * mbuf can end up sharing an mbuf cluster with the original mbuf if
  * the "breaking point" happens to lie within a cluster mbuf. Use the
  * M_WRITABLE() macro to check for this case.
  */
 struct mbuf *
 m_split(struct mbuf *m0, int len0, int wait)
 {
 	struct mbuf *m, *n;
 	u_int len = len0, remain;
 
 	MBUF_CHECKSLEEP(wait);
 	for (m = m0; m && len > m->m_len; m = m->m_next)
 		len -= m->m_len;
 	if (m == NULL)
 		return (NULL);
 	remain = m->m_len - len;
 	if (m0->m_flags & M_PKTHDR && remain == 0) {
 		n = m_gethdr(wait, m0->m_type);
 		if (n == NULL)
 			return (NULL);
 		n->m_next = m->m_next;
 		m->m_next = NULL;
-		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
+		if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
+			n->m_pkthdr.snd_tag =
+			    m_snd_tag_ref(m0->m_pkthdr.snd_tag);
+			n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
+		} else
+			n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
 		m0->m_pkthdr.len = len0;
 		return (n);
 	} else if (m0->m_flags & M_PKTHDR) {
 		n = m_gethdr(wait, m0->m_type);
 		if (n == NULL)
 			return (NULL);
-		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
+		if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) {
+			n->m_pkthdr.snd_tag =
+			    m_snd_tag_ref(m0->m_pkthdr.snd_tag);
+			n->m_pkthdr.csum_flags |= CSUM_SND_TAG;
+		} else
+			n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
 		m0->m_pkthdr.len = len0;
 		if (m->m_flags & M_EXT)
 			goto extpacket;
 		if (remain > MHLEN) {
 			/* m can't be the lead packet */
 			M_ALIGN(n, 0);
 			n->m_next = m_split(m, len, wait);
 			if (n->m_next == NULL) {
 				(void) m_free(n);
 				return (NULL);
 			} else {
 				n->m_len = 0;
 				return (n);
 			}
 		} else
 			M_ALIGN(n, remain);
 	} else if (remain == 0) {
 		n = m->m_next;
 		m->m_next = NULL;
 		return (n);
 	} else {
 		n = m_get(wait, m->m_type);
 		if (n == NULL)
 			return (NULL);
 		M_ALIGN(n, remain);
 	}
 extpacket:
 	if (m->m_flags & M_EXT) {
 		n->m_data = m->m_data + len;
 		mb_dupcl(n, m);
 	} else {
 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
 	}
 	n->m_len = remain;
 	m->m_len = len;
 	n->m_next = m->m_next;
 	m->m_next = NULL;
 	return (n);
 }
 /*
  * Routine to copy from device local memory into mbufs.
  * Note that `off' argument is offset into first mbuf of target chain from
  * which to begin copying the data to.
  */
 struct mbuf *
 m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
     void (*copy)(char *from, caddr_t to, u_int len))
 {
 	struct mbuf *m;
 	struct mbuf *top = NULL, **mp = &top;
 	int len;
 
 	if (off < 0 || off > MHLEN)
 		return (NULL);
 
 	while (totlen > 0) {
 		if (top == NULL) {	/* First one, must be PKTHDR */
 			if (totlen + off >= MINCLSIZE) {
 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 				len = MCLBYTES;
 			} else {
 				m = m_gethdr(M_NOWAIT, MT_DATA);
 				len = MHLEN;
 
 				/* Place initial small packet/header at end of mbuf */
 				if (m && totlen + off + max_linkhdr <= MHLEN) {
 					m->m_data += max_linkhdr;
 					len -= max_linkhdr;
 				}
 			}
 			if (m == NULL)
 				return NULL;
 			m->m_pkthdr.rcvif = ifp;
 			m->m_pkthdr.len = totlen;
 		} else {
 			if (totlen + off >= MINCLSIZE) {
 				m = m_getcl(M_NOWAIT, MT_DATA, 0);
 				len = MCLBYTES;
 			} else {
 				m = m_get(M_NOWAIT, MT_DATA);
 				len = MLEN;
 			}
 			if (m == NULL) {
 				m_freem(top);
 				return NULL;
 			}
 		}
 		if (off) {
 			m->m_data += off;
 			len -= off;
 			off = 0;
 		}
 		m->m_len = len = min(totlen, len);
 		if (copy)
 			copy(buf, mtod(m, caddr_t), (u_int)len);
 		else
 			bcopy(buf, mtod(m, caddr_t), (u_int)len);
 		buf += len;
 		*mp = m;
 		mp = &m->m_next;
 		totlen -= len;
 	}
 	return (top);
 }
 
 /*
  * Copy data from a buffer back into the indicated mbuf chain,
  * starting "off" bytes from the beginning, extending the mbuf
  * chain if necessary.
  */
 void
 m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
 {
 	int mlen;
 	struct mbuf *m = m0, *n;
 	int totlen = 0;
 
 	if (m0 == NULL)
 		return;
 	while (off > (mlen = m->m_len)) {
 		off -= mlen;
 		totlen += mlen;
 		if (m->m_next == NULL) {
 			n = m_get(M_NOWAIT, m->m_type);
 			if (n == NULL)
 				goto out;
 			bzero(mtod(n, caddr_t), MLEN);
 			n->m_len = min(MLEN, len + off);
 			m->m_next = n;
 		}
 		m = m->m_next;
 	}
 	while (len > 0) {
 		if (m->m_next == NULL && (len > m->m_len - off)) {
 			m->m_len += min(len - (m->m_len - off),
 			    M_TRAILINGSPACE(m));
 		}
 		mlen = min (m->m_len - off, len);
 		bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
 		cp += mlen;
 		len -= mlen;
 		mlen += off;
 		off = 0;
 		totlen += mlen;
 		if (len == 0)
 			break;
 		if (m->m_next == NULL) {
 			n = m_get(M_NOWAIT, m->m_type);
 			if (n == NULL)
 				break;
 			n->m_len = min(MLEN, len);
 			m->m_next = n;
 		}
 		m = m->m_next;
 	}
 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
 		m->m_pkthdr.len = totlen;
 }
 
 /*
  * Append the specified data to the indicated mbuf chain,
  * Extend the mbuf chain if the new data does not fit in
  * existing space.
  *
  * Return 1 if able to complete the job; otherwise 0.
  */
 int
 m_append(struct mbuf *m0, int len, c_caddr_t cp)
 {
 	struct mbuf *m, *n;
 	int remainder, space;
 
 	for (m = m0; m->m_next != NULL; m = m->m_next)
 		;
 	remainder = len;
 	space = M_TRAILINGSPACE(m);
 	if (space > 0) {
 		/*
 		 * Copy into available space.
 		 */
 		if (space > remainder)
 			space = remainder;
 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
 		m->m_len += space;
 		cp += space, remainder -= space;
 	}
 	while (remainder > 0) {
 		/*
 		 * Allocate a new mbuf; could check space
 		 * and allocate a cluster instead.
 		 */
 		n = m_get(M_NOWAIT, m->m_type);
 		if (n == NULL)
 			break;
 		n->m_len = min(MLEN, remainder);
 		bcopy(cp, mtod(n, caddr_t), n->m_len);
 		cp += n->m_len, remainder -= n->m_len;
 		m->m_next = n;
 		m = n;
 	}
 	if (m0->m_flags & M_PKTHDR)
 		m0->m_pkthdr.len += len - remainder;
 	return (remainder == 0);
 }
 
 /*
  * Apply function f to the data in an mbuf chain starting "off" bytes from
  * the beginning, continuing for "len" bytes.
  */
 int
 m_apply(struct mbuf *m, int off, int len,
     int (*f)(void *, void *, u_int), void *arg)
 {
 	u_int count;
 	int rval;
 
 	KASSERT(off >= 0, ("m_apply, negative off %d", off));
 	KASSERT(len >= 0, ("m_apply, negative len %d", len));
 	while (off > 0) {
 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
 		if (off < m->m_len)
 			break;
 		off -= m->m_len;
 		m = m->m_next;
 	}
 	while (len > 0) {
 		KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
 		count = min(m->m_len - off, len);
 		rval = (*f)(arg, mtod(m, caddr_t) + off, count);
 		if (rval)
 			return (rval);
 		len -= count;
 		off = 0;
 		m = m->m_next;
 	}
 	return (0);
 }
 
 /*
  * Return a pointer to mbuf/offset of location in mbuf chain.
  */
 struct mbuf *
 m_getptr(struct mbuf *m, int loc, int *off)
 {
 
 	while (loc >= 0) {
 		/* Normal end of search. */
 		if (m->m_len > loc) {
 			*off = loc;
 			return (m);
 		} else {
 			loc -= m->m_len;
 			if (m->m_next == NULL) {
 				if (loc == 0) {
 					/* Point at the end of valid data. */
 					*off = m->m_len;
 					return (m);
 				}
 				return (NULL);
 			}
 			m = m->m_next;
 		}
 	}
 	return (NULL);
 }
 
 void
 m_print(const struct mbuf *m, int maxlen)
 {
 	int len;
 	int pdata;
 	const struct mbuf *m2;
 
 	if (m == NULL) {
 		printf("mbuf: %p\n", m);
 		return;
 	}
 
 	if (m->m_flags & M_PKTHDR)
 		len = m->m_pkthdr.len;
 	else
 		len = -1;
 	m2 = m;
 	while (m2 != NULL && (len == -1 || len)) {
 		pdata = m2->m_len;
 		if (maxlen != -1 && pdata > maxlen)
 			pdata = maxlen;
 		printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len,
 		    m2->m_next, m2->m_flags, "\20\20freelist\17skipfw"
 		    "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
 		    "\3eor\2pkthdr\1ext", pdata ? "" : "\n");
 		if (pdata)
 			printf(", %*D\n", pdata, (u_char *)m2->m_data, "-");
 		if (len != -1)
 			len -= m2->m_len;
 		m2 = m2->m_next;
 	}
 	if (len > 0)
 		printf("%d bytes unaccounted for.\n", len);
 	return;
 }
 
 u_int
 m_fixhdr(struct mbuf *m0)
 {
 	u_int len;
 
 	len = m_length(m0, NULL);
 	m0->m_pkthdr.len = len;
 	return (len);
 }
 
 u_int
 m_length(struct mbuf *m0, struct mbuf **last)
 {
 	struct mbuf *m;
 	u_int len;
 
 	len = 0;
 	for (m = m0; m != NULL; m = m->m_next) {
 		len += m->m_len;
 		if (m->m_next == NULL)
 			break;
 	}
 	if (last != NULL)
 		*last = m;
 	return (len);
 }
 
 /*
  * Defragment a mbuf chain, returning the shortest possible
  * chain of mbufs and clusters.  If allocation fails and
  * this cannot be completed, NULL will be returned, but
  * the passed in chain will be unchanged.  Upon success,
  * the original chain will be freed, and the new chain
  * will be returned.
  *
  * If a non-packet header is passed in, the original
  * mbuf (chain?) will be returned unharmed.
  */
 struct mbuf *
 m_defrag(struct mbuf *m0, int how)
 {
 	struct mbuf *m_new = NULL, *m_final = NULL;
 	int progress = 0, length;
 
 	MBUF_CHECKSLEEP(how);
 	if (!(m0->m_flags & M_PKTHDR))
 		return (m0);
 
 	m_fixhdr(m0); /* Needed sanity check */
 
 #ifdef MBUF_STRESS_TEST
 	if (m_defragrandomfailures) {
 		int temp = arc4random() & 0xff;
 		if (temp == 0xba)
 			goto nospace;
 	}
 #endif
 
 	if (m0->m_pkthdr.len > MHLEN)
 		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
 	else
 		m_final = m_gethdr(how, MT_DATA);
 
 	if (m_final == NULL)
 		goto nospace;
 
 	if (m_dup_pkthdr(m_final, m0, how) == 0)
 		goto nospace;
 
 	m_new = m_final;
 
 	while (progress < m0->m_pkthdr.len) {
 		length = m0->m_pkthdr.len - progress;
 		if (length > MCLBYTES)
 			length = MCLBYTES;
 
 		if (m_new == NULL) {
 			if (length > MLEN)
 				m_new = m_getcl(how, MT_DATA, 0);
 			else
 				m_new = m_get(how, MT_DATA);
 			if (m_new == NULL)
 				goto nospace;
 		}
 
 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
 		progress += length;
 		m_new->m_len = length;
 		if (m_new != m_final)
 			m_cat(m_final, m_new);
 		m_new = NULL;
 	}
 #ifdef MBUF_STRESS_TEST
 	if (m0->m_next == NULL)
 		m_defraguseless++;
 #endif
 	m_freem(m0);
 	m0 = m_final;
 #ifdef MBUF_STRESS_TEST
 	m_defragpackets++;
 	m_defragbytes += m0->m_pkthdr.len;
 #endif
 	return (m0);
 nospace:
 #ifdef MBUF_STRESS_TEST
 	m_defragfailure++;
 #endif
 	if (m_final)
 		m_freem(m_final);
 	return (NULL);
 }
 
 /*
  * Defragment an mbuf chain, returning at most maxfrags separate
  * mbufs+clusters.  If this is not possible NULL is returned and
  * the original mbuf chain is left in its present (potentially
  * modified) state.  We use two techniques: collapsing consecutive
  * mbufs and replacing consecutive mbufs by a cluster.
  *
  * NB: this should really be named m_defrag but that name is taken
  */
 struct mbuf *
 m_collapse(struct mbuf *m0, int how, int maxfrags)
 {
 	struct mbuf *m, *n, *n2, **prev;
 	u_int curfrags;
 
 	/*
 	 * Calculate the current number of frags.
 	 */
 	curfrags = 0;
 	for (m = m0; m != NULL; m = m->m_next)
 		curfrags++;
 	/*
 	 * First, try to collapse mbufs.  Note that we always collapse
 	 * towards the front so we don't need to deal with moving the
 	 * pkthdr.  This may be suboptimal if the first mbuf has much
 	 * less data than the following.
 	 */
 	m = m0;
 again:
 	for (;;) {
 		n = m->m_next;
 		if (n == NULL)
 			break;
 		if (M_WRITABLE(m) &&
 		    n->m_len < M_TRAILINGSPACE(m)) {
 			bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
 				n->m_len);
 			m->m_len += n->m_len;
 			m->m_next = n->m_next;
 			m_free(n);
 			if (--curfrags <= maxfrags)
 				return m0;
 		} else
 			m = n;
 	}
 	KASSERT(maxfrags > 1,
 		("maxfrags %u, but normal collapse failed", maxfrags));
 	/*
 	 * Collapse consecutive mbufs to a cluster.
 	 */
 	prev = &m0->m_next;		/* NB: not the first mbuf */
 	while ((n = *prev) != NULL) {
 		if ((n2 = n->m_next) != NULL &&
 		    n->m_len + n2->m_len < MCLBYTES) {
 			m = m_getcl(how, MT_DATA, 0);
 			if (m == NULL)
 				goto bad;
 			bcopy(mtod(n, void *), mtod(m, void *), n->m_len);
 			bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len,
 				n2->m_len);
 			m->m_len = n->m_len + n2->m_len;
 			m->m_next = n2->m_next;
 			*prev = m;
 			m_free(n);
 			m_free(n2);
 			if (--curfrags <= maxfrags)	/* +1 cl -2 mbufs */
 				return m0;
 			/*
 			 * Still not there, try the normal collapse
 			 * again before we allocate another cluster.
 			 */
 			goto again;
 		}
 		prev = &n->m_next;
 	}
 	/*
 	 * No place where we can collapse to a cluster; punt.
 	 * This can occur if, for example, you request 2 frags
 	 * but the packet requires that both be clusters (we
 	 * never reallocate the first mbuf to avoid moving the
 	 * packet header).
 	 */
 bad:
 	return NULL;
 }
 
 #ifdef MBUF_STRESS_TEST
 
 /*
  * Fragment an mbuf chain.  There's no reason you'd ever want to do
  * this in normal usage, but it's great for stress testing various
  * mbuf consumers.
  *
  * If fragmentation is not possible, the original chain will be
  * returned.
  *
  * Possible length values:
  * 0	 no fragmentation will occur
  * > 0	each fragment will be of the specified length
  * -1	each fragment will be the same random value in length
  * -2	each fragment's length will be entirely random
  * (Random values range from 1 to 256)
  */
 struct mbuf *
 m_fragment(struct mbuf *m0, int how, int length)
 {
 	struct mbuf *m_first, *m_last;
 	int divisor = 255, progress = 0, fraglen;
 
 	if (!(m0->m_flags & M_PKTHDR))
 		return (m0);
 
 	if (length == 0 || length < -2)
 		return (m0);
 	if (length > MCLBYTES)
 		length = MCLBYTES;
 	if (length < 0 && divisor > MCLBYTES)
 		divisor = MCLBYTES;
 	if (length == -1)
 		length = 1 + (arc4random() % divisor);
 	if (length > 0)
 		fraglen = length;
 
 	m_fixhdr(m0); /* Needed sanity check */
 
 	m_first = m_getcl(how, MT_DATA, M_PKTHDR);
 	if (m_first == NULL)
 		goto nospace;
 
 	if (m_dup_pkthdr(m_first, m0, how) == 0)
 		goto nospace;
 
 	m_last = m_first;
 
 	while (progress < m0->m_pkthdr.len) {
 		if (length == -2)
 			fraglen = 1 + (arc4random() % divisor);
 		if (fraglen > m0->m_pkthdr.len - progress)
 			fraglen = m0->m_pkthdr.len - progress;
 
 		if (progress != 0) {
 			struct mbuf *m_new = m_getcl(how, MT_DATA, 0);
 			if (m_new == NULL)
 				goto nospace;
 
 			m_last->m_next = m_new;
 			m_last = m_new;
 		}
 
 		m_copydata(m0, progress, fraglen, mtod(m_last, caddr_t));
 		progress += fraglen;
 		m_last->m_len = fraglen;
 	}
 	m_freem(m0);
 	m0 = m_first;
 	return (m0);
 nospace:
 	if (m_first)
 		m_freem(m_first);
 	/* Return the original chain on failure */
 	return (m0);
 }
 
 #endif
 
 /*
  * Copy the contents of uio into a properly sized mbuf chain.
  */
 struct mbuf *
 m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
 {
 	struct mbuf *m, *mb;
 	int error, length;
 	ssize_t total;
 	int progress = 0;
 
 	/*
 	 * len can be zero or an arbitrary large value bound by
 	 * the total data supplied by the uio.
 	 */
 	if (len > 0)
 		total = (uio->uio_resid < len) ? uio->uio_resid : len;
 	else
 		total = uio->uio_resid;
 
 	/*
 	 * The smallest unit returned by m_getm2() is a single mbuf
 	 * with pkthdr.  We can't align past it.
 	 */
 	if (align >= MHLEN)
 		return (NULL);
 
 	/*
 	 * Give us the full allocation or nothing.
 	 * If len is zero return the smallest empty mbuf.
 	 */
 	m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags);
 	if (m == NULL)
 		return (NULL);
 	m->m_data += align;
 
 	/* Fill all mbufs with uio data and update header information. */
 	for (mb = m; mb != NULL; mb = mb->m_next) {
 		length = min(M_TRAILINGSPACE(mb), total - progress);
 
 		error = uiomove(mtod(mb, void *), length, uio);
 		if (error) {
 			m_freem(m);
 			return (NULL);
 		}
 
 		mb->m_len = length;
 		progress += length;
 		if (flags & M_PKTHDR)
 			m->m_pkthdr.len += length;
 	}
 	KASSERT(progress == total, ("%s: progress != total", __func__));
 
 	return (m);
 }
 
 /*
  * Copy an mbuf chain into a uio limited by len if set.
  */
 int
 m_mbuftouio(struct uio *uio, const struct mbuf *m, int len)
 {
 	int error, length, total;
 	int progress = 0;
 
 	if (len > 0)
 		total = min(uio->uio_resid, len);
 	else
 		total = uio->uio_resid;
 
 	/* Fill the uio with data from the mbufs. */
 	for (; m != NULL; m = m->m_next) {
 		length = min(m->m_len, total - progress);
 
 		error = uiomove(mtod(m, void *), length, uio);
 		if (error)
 			return (error);
 
 		progress += length;
 	}
 
 	return (0);
 }
 
 /*
  * Create a writable copy of the mbuf chain.  While doing this
  * we compact the chain with a goal of producing a chain with
  * at most two mbufs.  The second mbuf in this chain is likely
  * to be a cluster.  The primary purpose of this work is to create
  * a writable packet for encryption, compression, etc.  The
  * secondary goal is to linearize the data so the data can be
  * passed to crypto hardware in the most efficient manner possible.
  */
 struct mbuf *
 m_unshare(struct mbuf *m0, int how)
 {
 	struct mbuf *m, *mprev;
 	struct mbuf *n, *mfirst, *mlast;
 	int len, off;
 
 	mprev = NULL;
 	for (m = m0; m != NULL; m = mprev->m_next) {
 		/*
 		 * Regular mbufs are ignored unless there's a cluster
 		 * in front of it that we can use to coalesce.  We do
 		 * the latter mainly so later clusters can be coalesced
 		 * also w/o having to handle them specially (i.e. convert
 		 * mbuf+cluster -> cluster).  This optimization is heavily
 		 * influenced by the assumption that we're running over
 		 * Ethernet where MCLBYTES is large enough that the max
 		 * packet size will permit lots of coalescing into a
 		 * single cluster.  This in turn permits efficient
 		 * crypto operations, especially when using hardware.
 		 */
 		if ((m->m_flags & M_EXT) == 0) {
 			if (mprev && (mprev->m_flags & M_EXT) &&
 			    m->m_len <= M_TRAILINGSPACE(mprev)) {
 				/* XXX: this ignores mbuf types */
 				memcpy(mtod(mprev, caddr_t) + mprev->m_len,
 				    mtod(m, caddr_t), m->m_len);
 				mprev->m_len += m->m_len;
 				mprev->m_next = m->m_next;	/* unlink from chain */
 				m_free(m);			/* reclaim mbuf */
 			} else {
 				mprev = m;
 			}
 			continue;
 		}
 		/*
 		 * Writable mbufs are left alone (for now).
 		 */
 		if (M_WRITABLE(m)) {
 			mprev = m;
 			continue;
 		}
 
 		/*
 		 * Not writable, replace with a copy or coalesce with
 		 * the previous mbuf if possible (since we have to copy
 		 * it anyway, we try to reduce the number of mbufs and
 		 * clusters so that future work is easier).
 		 */
 		KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
 		/* NB: we only coalesce into a cluster or larger */
 		if (mprev != NULL && (mprev->m_flags & M_EXT) &&
 		    m->m_len <= M_TRAILINGSPACE(mprev)) {
 			/* XXX: this ignores mbuf types */
 			memcpy(mtod(mprev, caddr_t) + mprev->m_len,
 			    mtod(m, caddr_t), m->m_len);
 			mprev->m_len += m->m_len;
 			mprev->m_next = m->m_next;	/* unlink from chain */
 			m_free(m);			/* reclaim mbuf */
 			continue;
 		}
 
 		/*
 		 * Allocate new space to hold the copy and copy the data.
 		 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
 		 * splitting them into clusters.  We could just malloc a
 		 * buffer and make it external but too many device drivers
 		 * don't know how to break up the non-contiguous memory when
 		 * doing DMA.
 		 */
 		n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
 		if (n == NULL) {
 			m_freem(m0);
 			return (NULL);
 		}
 		if (m->m_flags & M_PKTHDR) {
 			KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR",
 			    __func__, m0, m));
 			m_move_pkthdr(n, m);
 		}
 		len = m->m_len;
 		off = 0;
 		mfirst = n;
 		mlast = NULL;
 		for (;;) {
 			int cc = min(len, MCLBYTES);
 			memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
 			n->m_len = cc;
 			if (mlast != NULL)
 				mlast->m_next = n;
 			mlast = n;
 #if 0
 			newipsecstat.ips_clcopied++;
 #endif
 
 			len -= cc;
 			if (len <= 0)
 				break;
 			off += cc;
 
 			n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
 			if (n == NULL) {
 				m_freem(mfirst);
 				m_freem(m0);
 				return (NULL);
 			}
 		}
 		n->m_next = m->m_next;
 		if (mprev == NULL)
 			m0 = mfirst;		/* new head of chain */
 		else
 			mprev->m_next = mfirst;	/* replace old mbuf */
 		m_free(m);			/* release old mbuf */
 		mprev = mfirst;
 	}
 	return (m0);
 }
 
 #ifdef MBUF_PROFILING
 
 #define MP_BUCKETS 32 /* don't just change this as things may overflow.*/
 struct mbufprofile {
 	uintmax_t wasted[MP_BUCKETS];
 	uintmax_t used[MP_BUCKETS];
 	uintmax_t segments[MP_BUCKETS];
 } mbprof;
 
 #define MP_MAXDIGITS 21	/* strlen("16,000,000,000,000,000,000") == 21 */
 #define MP_NUMLINES 6
 #define MP_NUMSPERLINE 16
 #define MP_EXTRABYTES 64	/* > strlen("used:\nwasted:\nsegments:\n") */
 /* work out max space needed and add a bit of spare space too */
 #define MP_MAXLINE ((MP_MAXDIGITS+1) * MP_NUMSPERLINE)
 #define MP_BUFSIZE ((MP_MAXLINE * MP_NUMLINES) + 1 + MP_EXTRABYTES)
 
 char mbprofbuf[MP_BUFSIZE];
 
 void
 m_profile(struct mbuf *m)
 {
 	int segments = 0;
 	int used = 0;
 	int wasted = 0;
 
 	while (m) {
 		segments++;
 		used += m->m_len;
 		if (m->m_flags & M_EXT) {
 			wasted += MHLEN - sizeof(m->m_ext) +
 			    m->m_ext.ext_size - m->m_len;
 		} else {
 			if (m->m_flags & M_PKTHDR)
 				wasted += MHLEN - m->m_len;
 			else
 				wasted += MLEN - m->m_len;
 		}
 		m = m->m_next;
 	}
 	/* be paranoid.. it helps */
 	if (segments > MP_BUCKETS - 1)
 		segments = MP_BUCKETS - 1;
 	if (used > 100000)
 		used = 100000;
 	if (wasted > 100000)
 		wasted = 100000;
 	/* store in the appropriate bucket */
 	/* don't bother locking. if it's slightly off, so what? */
 	mbprof.segments[segments]++;
 	mbprof.used[fls(used)]++;
 	mbprof.wasted[fls(wasted)]++;
 }
 
 static void
 mbprof_textify(void)
 {
 	int offset;
 	char *c;
 	uint64_t *p;
 
 	p = &mbprof.wasted[0];
 	c = mbprofbuf;
 	offset = snprintf(c, MP_MAXLINE + 10,
 	    "wasted:\n"
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #ifdef BIG_ARRAY
 	p = &mbprof.wasted[16];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE,
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #endif
 	p = &mbprof.used[0];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE + 10,
 	    "used:\n"
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #ifdef BIG_ARRAY
 	p = &mbprof.used[16];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE,
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #endif
 	p = &mbprof.segments[0];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE + 10,
 	    "segments:\n"
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %ju\n",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #ifdef BIG_ARRAY
 	p = &mbprof.segments[16];
 	c += offset;
 	offset = snprintf(c, MP_MAXLINE,
 	    "%ju %ju %ju %ju %ju %ju %ju %ju "
 	    "%ju %ju %ju %ju %ju %ju %ju %jju",
 	    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 #endif
 }
 
 static int
 mbprof_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 
 	mbprof_textify();
 	error = SYSCTL_OUT(req, mbprofbuf, strlen(mbprofbuf) + 1);
 	return (error);
 }
 
 static int
 mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
 {
 	int clear, error;
 
 	clear = 0;
 	error = sysctl_handle_int(oidp, &clear, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	if (clear) {
 		bzero(&mbprof, sizeof(mbprof));
 	}
 
 	return (error);
 }
 
 
 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, CTLTYPE_STRING|CTLFLAG_RD,
 	    NULL, 0, mbprof_handler, "A", "mbuf profiling statistics");
 
 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, CTLTYPE_INT|CTLFLAG_RW,
 	    NULL, 0, mbprof_clr_handler, "I", "clear mbuf profiling statistics");
 #endif
 
Index: head/sys/net/bpf.c
===================================================================
--- head/sys/net/bpf.c	(revision 348253)
+++ head/sys/net/bpf.c	(revision 348254)
@@ -1,3085 +1,3085 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1990, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (c) 2019 Andrey V. Elsukov <ae@FreeBSD.org>
  *
  * This code is derived from the Stanford/CMU enet packet filter,
  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
  * Berkeley Laboratory.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *      @(#)bpf.c	8.4 (Berkeley) 1/9/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bpf.h"
 #include "opt_ddb.h"
 #include "opt_netgraph.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/time.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/filio.h>
 #include <sys/sockio.h>
 #include <sys/ttycom.h>
 #include <sys/uio.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 
 #include <sys/event.h>
 #include <sys/file.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 
 #include <sys/socket.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/bpf.h>
 #include <net/bpf_buffer.h>
 #ifdef BPF_JITTER
 #include <net/bpf_jitter.h>
 #endif
 #include <net/bpf_zerocopy.h>
 #include <net/bpfdesc.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 
 #include <net80211/ieee80211_freebsd.h>
 
 #include <security/mac/mac_framework.h>
 
 MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
 
 static struct bpf_if_ext dead_bpf_if = {
 	.bif_dlist = CK_LIST_HEAD_INITIALIZER()
 };
 
 struct bpf_if {
 #define	bif_next	bif_ext.bif_next
 #define	bif_dlist	bif_ext.bif_dlist
 	struct bpf_if_ext bif_ext;	/* public members */
 	u_int		bif_dlt;	/* link layer type */
 	u_int		bif_hdrlen;	/* length of link header */
 	struct bpfd_list bif_wlist;	/* writer-only list */
 	struct ifnet	*bif_ifp;	/* corresponding interface */
 	struct bpf_if	**bif_bpf;	/* Pointer to pointer to us */
 	volatile u_int	bif_refcnt;
 	struct epoch_context epoch_ctx;
 };
 
 CTASSERT(offsetof(struct bpf_if, bif_ext) == 0);
 
 struct bpf_program_buffer {
 	struct epoch_context	epoch_ctx;
 #ifdef BPF_JITTER
 	bpf_jit_filter		*func;
 #endif
 	void			*buffer[0];
 };
 
 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
 
 #define PRINET  26			/* interruptible */
 
 #define	SIZEOF_BPF_HDR(type)	\
     (offsetof(type, bh_hdrlen) + sizeof(((type *)0)->bh_hdrlen))
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <compat/freebsd32/freebsd32.h>
 #define BPF_ALIGNMENT32 sizeof(int32_t)
 #define	BPF_WORDALIGN32(x) roundup2(x, BPF_ALIGNMENT32)
 
 #ifndef BURN_BRIDGES
 /*
  * 32-bit version of structure prepended to each packet.  We use this header
  * instead of the standard one for 32-bit streams.  We mark the a stream as
  * 32-bit the first time we see a 32-bit compat ioctl request.
  */
 struct bpf_hdr32 {
 	struct timeval32 bh_tstamp;	/* time stamp */
 	uint32_t	bh_caplen;	/* length of captured portion */
 	uint32_t	bh_datalen;	/* original length of packet */
 	uint16_t	bh_hdrlen;	/* length of bpf header (this struct
 					   plus alignment padding) */
 };
 #endif
 
 struct bpf_program32 {
 	u_int bf_len;
 	uint32_t bf_insns;
 };
 
 struct bpf_dltlist32 {
 	u_int	bfl_len;
 	u_int	bfl_list;
 };
 
 #define	BIOCSETF32	_IOW('B', 103, struct bpf_program32)
 #define	BIOCSRTIMEOUT32	_IOW('B', 109, struct timeval32)
 #define	BIOCGRTIMEOUT32	_IOR('B', 110, struct timeval32)
 #define	BIOCGDLTLIST32	_IOWR('B', 121, struct bpf_dltlist32)
 #define	BIOCSETWF32	_IOW('B', 123, struct bpf_program32)
 #define	BIOCSETFNR32	_IOW('B', 130, struct bpf_program32)
 #endif
 
 #define BPF_LOCK()	   sx_xlock(&bpf_sx)
 #define BPF_UNLOCK()		sx_xunlock(&bpf_sx)
 #define BPF_LOCK_ASSERT()	sx_assert(&bpf_sx, SA_XLOCKED)
 /*
  * bpf_iflist is a list of BPF interface structures, each corresponding to a
  * specific DLT. The same network interface might have several BPF interface
  * structures registered by different layers in the stack (i.e., 802.11
  * frames, ethernet frames, etc).
  */
 CK_LIST_HEAD(bpf_iflist, bpf_if);
 static struct bpf_iflist bpf_iflist;
 static struct sx	bpf_sx;		/* bpf global lock */
 static int		bpf_bpfd_cnt;
 
 static void	bpfif_ref(struct bpf_if *);
 static void	bpfif_rele(struct bpf_if *);
 
 static void	bpfd_ref(struct bpf_d *);
 static void	bpfd_rele(struct bpf_d *);
 static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
 static void	bpf_detachd(struct bpf_d *);
 static void	bpf_detachd_locked(struct bpf_d *, bool);
 static void	bpfd_free(epoch_context_t);
 static int	bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
 		    struct sockaddr *, int *, struct bpf_d *);
 static int	bpf_setif(struct bpf_d *, struct ifreq *);
 static void	bpf_timed_out(void *);
 static __inline void
 		bpf_wakeup(struct bpf_d *);
 static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
 		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
 		    struct bintime *);
 static void	reset_d(struct bpf_d *);
 static int	bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
 static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
 static int	bpf_setdlt(struct bpf_d *, u_int);
 static void	filt_bpfdetach(struct knote *);
 static int	filt_bpfread(struct knote *, long);
 static void	bpf_drvinit(void *);
 static int	bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl");
 int bpf_maxinsns = BPF_MAXINSNS;
 SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
     &bpf_maxinsns, 0, "Maximum bpf program instructions");
 static int bpf_zerocopy_enable = 0;
 SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
     &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
 static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW,
     bpf_stats_sysctl, "bpf statistics portal");
 
 VNET_DEFINE_STATIC(int, bpf_optimize_writers) = 0;
 #define	V_bpf_optimize_writers VNET(bpf_optimize_writers)
 SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(bpf_optimize_writers), 0,
     "Do not send packets until BPF program is set");
 
 static	d_open_t	bpfopen;
 static	d_read_t	bpfread;
 static	d_write_t	bpfwrite;
 static	d_ioctl_t	bpfioctl;
 static	d_poll_t	bpfpoll;
 static	d_kqfilter_t	bpfkqfilter;
 
 static struct cdevsw bpf_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_open =	bpfopen,
 	.d_read =	bpfread,
 	.d_write =	bpfwrite,
 	.d_ioctl =	bpfioctl,
 	.d_poll =	bpfpoll,
 	.d_name =	"bpf",
 	.d_kqfilter =	bpfkqfilter,
 };
 
 static struct filterops bpfread_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_bpfdetach,
 	.f_event = filt_bpfread,
 };
 
 /*
  * LOCKING MODEL USED BY BPF
  *
  * Locks:
  * 1) global lock (BPF_LOCK). Sx, used to protect some global counters,
  * every bpf_iflist changes, serializes ioctl access to bpf descriptors.
  * 2) Descriptor lock. Mutex, used to protect BPF buffers and various
  * structure fields used by bpf_*tap* code.
  *
  * Lock order: global lock, then descriptor lock.
  *
  * There are several possible consumers:
  *
  * 1. The kernel registers interface pointer with bpfattach().
  * Each call allocates new bpf_if structure, references ifnet pointer
  * and links bpf_if into bpf_iflist chain. This is protected with global
  * lock.
  *
  * 2. An userland application uses ioctl() call to bpf_d descriptor.
  * All such call are serialized with global lock. BPF filters can be
  * changed, but pointer to old filter will be freed using epoch_call().
  * Thus it should be safe for bpf_tap/bpf_mtap* code to do access to
  * filter pointers, even if change will happen during bpf_tap execution.
  * Destroying of bpf_d descriptor also is doing using epoch_call().
  *
  * 3. An userland application can write packets into bpf_d descriptor.
  * There we need to be sure, that ifnet won't disappear during bpfwrite().
  *
  * 4. The kernel invokes bpf_tap/bpf_mtap* functions. The access to
  * bif_dlist is protected with net_epoch_preempt section. So, it should
  * be safe to make access to bpf_d descriptor inside the section.
  *
  * 5. The kernel invokes bpfdetach() on interface destroying. All lists
  * are modified with global lock held and actual free() is done using
  * epoch_call().
  */
 
 static void
 bpfif_free(epoch_context_t ctx)
 {
 	struct bpf_if *bp;
 
 	bp = __containerof(ctx, struct bpf_if, epoch_ctx);
 	if_rele(bp->bif_ifp);
 	free(bp, M_BPF);
 }
 
 static void
 bpfif_ref(struct bpf_if *bp)
 {
 
 	refcount_acquire(&bp->bif_refcnt);
 }
 
 static void
 bpfif_rele(struct bpf_if *bp)
 {
 
 	if (!refcount_release(&bp->bif_refcnt))
 		return;
 	epoch_call(net_epoch_preempt, &bp->epoch_ctx, bpfif_free);
 }
 
 static void
 bpfd_ref(struct bpf_d *d)
 {
 
 	refcount_acquire(&d->bd_refcnt);
 }
 
 static void
 bpfd_rele(struct bpf_d *d)
 {
 
 	if (!refcount_release(&d->bd_refcnt))
 		return;
 	epoch_call(net_epoch_preempt, &d->epoch_ctx, bpfd_free);
 }
 
 static struct bpf_program_buffer*
 bpf_program_buffer_alloc(size_t size, int flags)
 {
 
 	return (malloc(sizeof(struct bpf_program_buffer) + size,
 	    M_BPF, flags));
 }
 
 static void
 bpf_program_buffer_free(epoch_context_t ctx)
 {
 	struct bpf_program_buffer *ptr;
 
 	ptr = __containerof(ctx, struct bpf_program_buffer, epoch_ctx);
 #ifdef BPF_JITTER
 	if (ptr->func != NULL)
 		bpf_destroy_jit_filter(ptr->func);
 #endif
 	free(ptr, M_BPF);
 }
 
 /*
  * Wrapper functions for various buffering methods.  If the set of buffer
  * modes expands, we will probably want to introduce a switch data structure
  * similar to protosw, et.
  */
 static void
 bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
     u_int len)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_BUFFER:
 		return (bpf_buffer_append_bytes(d, buf, offset, src, len));
 
 	case BPF_BUFMODE_ZBUF:
 		counter_u64_add(d->bd_zcopy, 1);
 		return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));
 
 	default:
 		panic("bpf_buf_append_bytes");
 	}
 }
 
 static void
 bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
     u_int len)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_BUFFER:
 		return (bpf_buffer_append_mbuf(d, buf, offset, src, len));
 
 	case BPF_BUFMODE_ZBUF:
 		counter_u64_add(d->bd_zcopy, 1);
 		return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));
 
 	default:
 		panic("bpf_buf_append_mbuf");
 	}
 }
 
 /*
  * This function gets called when the free buffer is re-assigned.
  */
 static void
 bpf_buf_reclaimed(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_BUFFER:
 		return;
 
 	case BPF_BUFMODE_ZBUF:
 		bpf_zerocopy_buf_reclaimed(d);
 		return;
 
 	default:
 		panic("bpf_buf_reclaimed");
 	}
 }
 
 /*
  * If the buffer mechanism has a way to decide that a held buffer can be made
  * free, then it is exposed via the bpf_canfreebuf() interface.  (1) is
  * returned if the buffer can be discarded, (0) is returned if it cannot.
  */
 static int
 bpf_canfreebuf(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_ZBUF:
 		return (bpf_zerocopy_canfreebuf(d));
 	}
 	return (0);
 }
 
 /*
  * Allow the buffer model to indicate that the current store buffer is
  * immutable, regardless of the appearance of space.  Return (1) if the
  * buffer is writable, and (0) if not.
  */
 static int
 bpf_canwritebuf(struct bpf_d *d)
 {
 	BPFD_LOCK_ASSERT(d);
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_ZBUF:
 		return (bpf_zerocopy_canwritebuf(d));
 	}
 	return (1);
 }
 
 /*
  * Notify buffer model that an attempt to write to the store buffer has
  * resulted in a dropped packet, in which case the buffer may be considered
  * full.
  */
 static void
 bpf_buffull(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_ZBUF:
 		bpf_zerocopy_buffull(d);
 		break;
 	}
 }
 
 /*
  * Notify the buffer model that a buffer has moved into the hold position.
  */
 void
 bpf_bufheld(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_ZBUF:
 		bpf_zerocopy_bufheld(d);
 		break;
 	}
 }
 
 static void
 bpf_free(struct bpf_d *d)
 {
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_BUFFER:
 		return (bpf_buffer_free(d));
 
 	case BPF_BUFMODE_ZBUF:
 		return (bpf_zerocopy_free(d));
 
 	default:
 		panic("bpf_buf_free");
 	}
 }
 
 static int
 bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
 {
 
 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
 		return (EOPNOTSUPP);
 	return (bpf_buffer_uiomove(d, buf, len, uio));
 }
 
 static int
 bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
 {
 
 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
 		return (EOPNOTSUPP);
 	return (bpf_buffer_ioctl_sblen(d, i));
 }
 
 static int
 bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
 {
 
 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
 		return (EOPNOTSUPP);
 	return (bpf_zerocopy_ioctl_getzmax(td, d, i));
 }
 
 static int
 bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
 {
 
 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
 		return (EOPNOTSUPP);
 	return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
 }
 
 static int
 bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
 {
 
 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
 		return (EOPNOTSUPP);
 	return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
 }
 
 /*
  * General BPF functions.
  */
 static int
 bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp,
     struct sockaddr *sockp, int *hdrlen, struct bpf_d *d)
 {
 	const struct ieee80211_bpf_params *p;
 	struct ether_header *eh;
 	struct mbuf *m;
 	int error;
 	int len;
 	int hlen;
 	int slen;
 
 	/*
 	 * Build a sockaddr based on the data link layer type.
 	 * We do this at this level because the ethernet header
 	 * is copied directly into the data field of the sockaddr.
 	 * In the case of SLIP, there is no header and the packet
 	 * is forwarded as is.
 	 * Also, we are careful to leave room at the front of the mbuf
 	 * for the link level header.
 	 */
 	switch (linktype) {
 
 	case DLT_SLIP:
 		sockp->sa_family = AF_INET;
 		hlen = 0;
 		break;
 
 	case DLT_EN10MB:
 		sockp->sa_family = AF_UNSPEC;
 		/* XXX Would MAXLINKHDR be better? */
 		hlen = ETHER_HDR_LEN;
 		break;
 
 	case DLT_FDDI:
 		sockp->sa_family = AF_IMPLINK;
 		hlen = 0;
 		break;
 
 	case DLT_RAW:
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 0;
 		break;
 
 	case DLT_NULL:
 		/*
 		 * null interface types require a 4 byte pseudo header which
 		 * corresponds to the address family of the packet.
 		 */
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 4;
 		break;
 
 	case DLT_ATM_RFC1483:
 		/*
 		 * en atm driver requires 4-byte atm pseudo header.
 		 * though it isn't standard, vpi:vci needs to be
 		 * specified anyway.
 		 */
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 12;	/* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
 		break;
 
 	case DLT_PPP:
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 4;	/* This should match PPP_HDRLEN */
 		break;
 
 	case DLT_IEEE802_11:		/* IEEE 802.11 wireless */
 		sockp->sa_family = AF_IEEE80211;
 		hlen = 0;
 		break;
 
 	case DLT_IEEE802_11_RADIO:	/* IEEE 802.11 wireless w/ phy params */
 		sockp->sa_family = AF_IEEE80211;
 		sockp->sa_len = 12;	/* XXX != 0 */
 		hlen = sizeof(struct ieee80211_bpf_params);
 		break;
 
 	default:
 		return (EIO);
 	}
 
 	len = uio->uio_resid;
 	if (len < hlen || len - hlen > ifp->if_mtu)
 		return (EMSGSIZE);
 
 	m = m_get2(len, M_WAITOK, MT_DATA, M_PKTHDR);
 	if (m == NULL)
 		return (EIO);
 	m->m_pkthdr.len = m->m_len = len;
 	*mp = m;
 
 	error = uiomove(mtod(m, u_char *), len, uio);
 	if (error)
 		goto bad;
 
 	slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len);
 	if (slen == 0) {
 		error = EPERM;
 		goto bad;
 	}
 
 	/* Check for multicast destination */
 	switch (linktype) {
 	case DLT_EN10MB:
 		eh = mtod(m, struct ether_header *);
 		if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 			if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost,
 			    ETHER_ADDR_LEN) == 0)
 				m->m_flags |= M_BCAST;
 			else
 				m->m_flags |= M_MCAST;
 		}
 		if (d->bd_hdrcmplt == 0) {
 			memcpy(eh->ether_shost, IF_LLADDR(ifp),
 			    sizeof(eh->ether_shost));
 		}
 		break;
 	}
 
 	/*
 	 * Make room for link header, and copy it to sockaddr
 	 */
 	if (hlen != 0) {
 		if (sockp->sa_family == AF_IEEE80211) {
 			/*
 			 * Collect true length from the parameter header
 			 * NB: sockp is known to be zero'd so if we do a
 			 *     short copy unspecified parameters will be
 			 *     zero.
 			 * NB: packet may not be aligned after stripping
 			 *     bpf params
 			 * XXX check ibp_vers
 			 */
 			p = mtod(m, const struct ieee80211_bpf_params *);
 			hlen = p->ibp_len;
 			if (hlen > sizeof(sockp->sa_data)) {
 				error = EINVAL;
 				goto bad;
 			}
 		}
 		bcopy(mtod(m, const void *), sockp->sa_data, hlen);
 	}
 	*hdrlen = hlen;
 
 	return (0);
 bad:
 	m_freem(m);
 	return (error);
 }
 
 /*
  * Attach descriptor to the bpf interface, i.e. make d listen on bp,
  * then reset its buffers and counters with reset_d().
  */
 static void
 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 {
 	int op_w;
 
 	BPF_LOCK_ASSERT();
 
 	/*
 	 * Save sysctl value to protect from sysctl change
 	 * between reads
 	 */
 	op_w = V_bpf_optimize_writers || d->bd_writer;
 
 	if (d->bd_bif != NULL)
 		bpf_detachd_locked(d, false);
 	/*
 	 * Point d at bp, and add d to the interface's list.
 	 * Since there are many applications using BPF for
 	 * sending raw packets only (dhcpd, cdpd are good examples)
 	 * we can delay adding d to the list of active listeners until
 	 * some filter is configured.
 	 */
 
 	BPFD_LOCK(d);
 	/*
 	 * Hold reference to bpif while descriptor uses this interface.
 	 */
 	bpfif_ref(bp);
 	d->bd_bif = bp;
 	if (op_w != 0) {
 		/* Add to writers-only list */
 		CK_LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
 		/*
 		 * We decrement bd_writer on every filter set operation.
 		 * First BIOCSETF is done by pcap_open_live() to set up
 		 * snap length. After that appliation usually sets its own
 		 * filter.
 		 */
 		d->bd_writer = 2;
 	} else
 		CK_LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
 
 	reset_d(d);
 	BPFD_UNLOCK(d);
 	bpf_bpfd_cnt++;
 
 	CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list",
 	    __func__, d->bd_pid, d->bd_writer ? "writer" : "active");
 
 	if (op_w == 0)
 		EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
 }
 
 /*
  * Check if we need to upgrade our descriptor @d from write-only mode.
  */
 static int
 bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode,
     int flen)
 {
 	int is_snap, need_upgrade;
 
 	/*
 	 * Check if we've already upgraded or new filter is empty.
 	 */
 	if (d->bd_writer == 0 || fcode == NULL)
 		return (0);
 
 	need_upgrade = 0;
 
 	/*
 	 * Check if cmd looks like snaplen setting from
 	 * pcap_bpf.c:pcap_open_live().
 	 * Note we're not checking .k value here:
 	 * while pcap_open_live() definitely sets to non-zero value,
 	 * we'd prefer to treat k=0 (deny ALL) case the same way: e.g.
 	 * do not consider upgrading immediately
 	 */
 	if (cmd == BIOCSETF && flen == 1 &&
 	    fcode[0].code == (BPF_RET | BPF_K))
 		is_snap = 1;
 	else
 		is_snap = 0;
 
 	if (is_snap == 0) {
 		/*
 		 * We're setting first filter and it doesn't look like
 		 * setting snaplen.  We're probably using bpf directly.
 		 * Upgrade immediately.
 		 */
 		need_upgrade = 1;
 	} else {
 		/*
 		 * Do not require upgrade by first BIOCSETF
 		 * (used to set snaplen) by pcap_open_live().
 		 */
 
 		if (--d->bd_writer == 0) {
 			/*
 			 * First snaplen filter has already
 			 * been set. This is probably catch-all
 			 * filter
 			 */
 			need_upgrade = 1;
 		}
 	}
 
 	CTR5(KTR_NET,
 	    "%s: filter function set by pid %d, "
 	    "bd_writer counter %d, snap %d upgrade %d",
 	    __func__, d->bd_pid, d->bd_writer,
 	    is_snap, need_upgrade);
 
 	return (need_upgrade);
 }
 
 /*
  * Detach a file from its interface.
  */
 static void
 bpf_detachd(struct bpf_d *d)
 {
 	BPF_LOCK();
 	bpf_detachd_locked(d, false);
 	BPF_UNLOCK();
 }
 
 static void
 bpf_detachd_locked(struct bpf_d *d, bool detached_ifp)
 {
 	struct bpf_if *bp;
 	struct ifnet *ifp;
 	int error;
 
 	BPF_LOCK_ASSERT();
 	CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid);
 
 	/* Check if descriptor is attached */
 	if ((bp = d->bd_bif) == NULL)
 		return;
 
 	BPFD_LOCK(d);
 	/* Remove d from the interface's descriptor list. */
 	CK_LIST_REMOVE(d, bd_next);
 	/* Save bd_writer value */
 	error = d->bd_writer;
 	ifp = bp->bif_ifp;
 	d->bd_bif = NULL;
 	if (detached_ifp) {
 		/*
 		 * Notify descriptor as it's detached, so that any
 		 * sleepers wake up and get ENXIO.
 		 */
 		bpf_wakeup(d);
 	}
 	BPFD_UNLOCK(d);
 	bpf_bpfd_cnt--;
 
 	/* Call event handler iff d is attached */
 	if (error == 0)
 		EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0);
 
 	/*
 	 * Check if this descriptor had requested promiscuous mode.
 	 * If so and ifnet is not detached, turn it off.
 	 */
 	if (d->bd_promisc && !detached_ifp) {
 		d->bd_promisc = 0;
 		CURVNET_SET(ifp->if_vnet);
 		error = ifpromisc(ifp, 0);
 		CURVNET_RESTORE();
 		if (error != 0 && error != ENXIO) {
 			/*
 			 * ENXIO can happen if a pccard is unplugged
 			 * Something is really wrong if we were able to put
 			 * the driver into promiscuous mode, but can't
 			 * take it out.
 			 */
 			if_printf(bp->bif_ifp,
 				"bpf_detach: ifpromisc failed (%d)\n", error);
 		}
 	}
 	bpfif_rele(bp);
 }
 
 /*
  * Close the descriptor by detaching it from its interface,
  * deallocating its buffers, and marking it free.
  */
 static void
 bpf_dtor(void *data)
 {
 	struct bpf_d *d = data;
 
 	BPFD_LOCK(d);
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	d->bd_state = BPF_IDLE;
 	BPFD_UNLOCK(d);
 	funsetown(&d->bd_sigio);
 	bpf_detachd(d);
 #ifdef MAC
 	mac_bpfdesc_destroy(d);
 #endif /* MAC */
 	seldrain(&d->bd_sel);
 	knlist_destroy(&d->bd_sel.si_note);
 	callout_drain(&d->bd_callout);
 	bpfd_rele(d);
 }
 
 /*
  * Open ethernet device.  Returns ENXIO for illegal minor device number,
  * EBUSY if file is open by another process.
  */
 /* ARGSUSED */
 static	int
 bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	struct bpf_d *d;
 	int error;
 
 	d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
 	error = devfs_set_cdevpriv(d, bpf_dtor);
 	if (error != 0) {
 		free(d, M_BPF);
 		return (error);
 	}
 
 	/* Setup counters */
 	d->bd_rcount = counter_u64_alloc(M_WAITOK);
 	d->bd_dcount = counter_u64_alloc(M_WAITOK);
 	d->bd_fcount = counter_u64_alloc(M_WAITOK);
 	d->bd_wcount = counter_u64_alloc(M_WAITOK);
 	d->bd_wfcount = counter_u64_alloc(M_WAITOK);
 	d->bd_wdcount = counter_u64_alloc(M_WAITOK);
 	d->bd_zcopy = counter_u64_alloc(M_WAITOK);
 
 	/*
 	 * For historical reasons, perform a one-time initialization call to
 	 * the buffer routines, even though we're not yet committed to a
 	 * particular buffer method.
 	 */
 	bpf_buffer_init(d);
 	if ((flags & FREAD) == 0)
 		d->bd_writer = 2;
 	d->bd_hbuf_in_use = 0;
 	d->bd_bufmode = BPF_BUFMODE_BUFFER;
 	d->bd_sig = SIGIO;
 	d->bd_direction = BPF_D_INOUT;
 	d->bd_refcnt = 1;
 	BPF_PID_REFRESH(d, td);
 #ifdef MAC
 	mac_bpfdesc_init(d);
 	mac_bpfdesc_create(td->td_ucred, d);
 #endif
 	mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF);
 	callout_init_mtx(&d->bd_callout, &d->bd_lock, 0);
 	knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock);
 
 	return (0);
 }
 
 /*
  *  bpfread - read next chunk of packets from buffers
  */
 static	int
 bpfread(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct bpf_d *d;
 	int error;
 	int non_block;
 	int timed_out;
 
 	error = devfs_get_cdevpriv((void **)&d);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Restrict application to use a buffer the same size as
 	 * as kernel buffers.
 	 */
 	if (uio->uio_resid != d->bd_bufsize)
 		return (EINVAL);
 
 	non_block = ((ioflag & O_NONBLOCK) != 0);
 
 	BPFD_LOCK(d);
 	BPF_PID_REFRESH_CUR(d);
 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
 		BPFD_UNLOCK(d);
 		return (EOPNOTSUPP);
 	}
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	timed_out = (d->bd_state == BPF_TIMED_OUT);
 	d->bd_state = BPF_IDLE;
 	while (d->bd_hbuf_in_use) {
 		error = mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
 		    PRINET|PCATCH, "bd_hbuf", 0);
 		if (error != 0) {
 			BPFD_UNLOCK(d);
 			return (error);
 		}
 	}
 	/*
 	 * If the hold buffer is empty, then do a timed sleep, which
 	 * ends when the timeout expires or when enough packets
 	 * have arrived to fill the store buffer.
 	 */
 	while (d->bd_hbuf == NULL) {
 		if (d->bd_slen != 0) {
 			/*
 			 * A packet(s) either arrived since the previous
 			 * read or arrived while we were asleep.
 			 */
 			if (d->bd_immediate || non_block || timed_out) {
 				/*
 				 * Rotate the buffers and return what's here
 				 * if we are in immediate mode, non-blocking
 				 * flag is set, or this descriptor timed out.
 				 */
 				ROTATE_BUFFERS(d);
 				break;
 			}
 		}
 
 		/*
 		 * No data is available, check to see if the bpf device
 		 * is still pointed at a real interface.  If not, return
 		 * ENXIO so that the userland process knows to rebind
 		 * it before using it again.
 		 */
 		if (d->bd_bif == NULL) {
 			BPFD_UNLOCK(d);
 			return (ENXIO);
 		}
 
 		if (non_block) {
 			BPFD_UNLOCK(d);
 			return (EWOULDBLOCK);
 		}
 		error = msleep(d, &d->bd_lock, PRINET|PCATCH,
 		     "bpf", d->bd_rtout);
 		if (error == EINTR || error == ERESTART) {
 			BPFD_UNLOCK(d);
 			return (error);
 		}
 		if (error == EWOULDBLOCK) {
 			/*
 			 * On a timeout, return what's in the buffer,
 			 * which may be nothing.  If there is something
 			 * in the store buffer, we can rotate the buffers.
 			 */
 			if (d->bd_hbuf)
 				/*
 				 * We filled up the buffer in between
 				 * getting the timeout and arriving
 				 * here, so we don't need to rotate.
 				 */
 				break;
 
 			if (d->bd_slen == 0) {
 				BPFD_UNLOCK(d);
 				return (0);
 			}
 			ROTATE_BUFFERS(d);
 			break;
 		}
 	}
 	/*
 	 * At this point, we know we have something in the hold slot.
 	 */
 	d->bd_hbuf_in_use = 1;
 	BPFD_UNLOCK(d);
 
 	/*
 	 * Move data from hold buffer into user space.
 	 * We know the entire buffer is transferred since
 	 * we checked above that the read buffer is bpf_bufsize bytes.
   	 *
 	 * We do not have to worry about simultaneous reads because
 	 * we waited for sole access to the hold buffer above.
 	 */
 	error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
 
 	BPFD_LOCK(d);
 	KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf"));
 	d->bd_fbuf = d->bd_hbuf;
 	d->bd_hbuf = NULL;
 	d->bd_hlen = 0;
 	bpf_buf_reclaimed(d);
 	d->bd_hbuf_in_use = 0;
 	wakeup(&d->bd_hbuf_in_use);
 	BPFD_UNLOCK(d);
 
 	return (error);
 }
 
 /*
  * If there are processes sleeping on this descriptor, wake them up.
  */
 static __inline void
 bpf_wakeup(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 	if (d->bd_state == BPF_WAITING) {
 		callout_stop(&d->bd_callout);
 		d->bd_state = BPF_IDLE;
 	}
 	wakeup(d);
 	if (d->bd_async && d->bd_sig && d->bd_sigio)
 		pgsigio(&d->bd_sigio, d->bd_sig, 0);
 
 	selwakeuppri(&d->bd_sel, PRINET);
 	KNOTE_LOCKED(&d->bd_sel.si_note, 0);
 }
 
 static void
 bpf_timed_out(void *arg)
 {
 	struct bpf_d *d = (struct bpf_d *)arg;
 
 	BPFD_LOCK_ASSERT(d);
 
 	if (callout_pending(&d->bd_callout) ||
 	    !callout_active(&d->bd_callout))
 		return;
 	if (d->bd_state == BPF_WAITING) {
 		d->bd_state = BPF_TIMED_OUT;
 		if (d->bd_slen != 0)
 			bpf_wakeup(d);
 	}
 }
 
 static int
 bpf_ready(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	if (!bpf_canfreebuf(d) && d->bd_hlen != 0)
 		return (1);
 	if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
 	    d->bd_slen != 0)
 		return (1);
 	return (0);
 }
 
 static int
 bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct route ro;
 	struct sockaddr dst;
 	struct epoch_tracker et;
 	struct bpf_if *bp;
 	struct bpf_d *d;
 	struct ifnet *ifp;
 	struct mbuf *m, *mc;
 	int error, hlen;
 
 	error = devfs_get_cdevpriv((void **)&d);
 	if (error != 0)
 		return (error);
 
 	NET_EPOCH_ENTER(et);
 	BPFD_LOCK(d);
 	BPF_PID_REFRESH_CUR(d);
 	counter_u64_add(d->bd_wcount, 1);
 	if ((bp = d->bd_bif) == NULL) {
 		error = ENXIO;
 		goto out_locked;
 	}
 
 	ifp = bp->bif_ifp;
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		error = ENETDOWN;
 		goto out_locked;
 	}
 
 	if (uio->uio_resid == 0)
 		goto out_locked;
 
 	bzero(&dst, sizeof(dst));
 	m = NULL;
 	hlen = 0;
 
 	/*
 	 * Take extra reference, unlock d and exit from epoch section,
 	 * since bpf_movein() can sleep.
 	 */
 	bpfd_ref(d);
 	NET_EPOCH_EXIT(et);
 	BPFD_UNLOCK(d);
 
 	error = bpf_movein(uio, (int)bp->bif_dlt, ifp,
 	    &m, &dst, &hlen, d);
 
 	if (error != 0) {
 		counter_u64_add(d->bd_wdcount, 1);
 		bpfd_rele(d);
 		return (error);
 	}
 
 	BPFD_LOCK(d);
 	/*
 	 * Check that descriptor is still attached to the interface.
 	 * This can happen on bpfdetach(). To avoid access to detached
 	 * ifnet, free mbuf and return ENXIO.
 	 */
 	if (d->bd_bif == NULL) {
 		counter_u64_add(d->bd_wdcount, 1);
 		BPFD_UNLOCK(d);
 		bpfd_rele(d);
 		m_freem(m);
 		return (ENXIO);
 	}
 	counter_u64_add(d->bd_wfcount, 1);
 	if (d->bd_hdrcmplt)
 		dst.sa_family = pseudo_AF_HDRCMPLT;
 
 	if (d->bd_feedback) {
 		mc = m_dup(m, M_NOWAIT);
 		if (mc != NULL)
 			mc->m_pkthdr.rcvif = ifp;
 		/* Set M_PROMISC for outgoing packets to be discarded. */
 		if (d->bd_direction == BPF_D_INOUT)
 			m->m_flags |= M_PROMISC;
 	} else
 		mc = NULL;
 
 	m->m_pkthdr.len -= hlen;
 	m->m_len -= hlen;
 	m->m_data += hlen;	/* XXX */
 
 	CURVNET_SET(ifp->if_vnet);
 #ifdef MAC
 	mac_bpfdesc_create_mbuf(d, m);
 	if (mc != NULL)
 		mac_bpfdesc_create_mbuf(d, mc);
 #endif
 
 	bzero(&ro, sizeof(ro));
 	if (hlen != 0) {
 		ro.ro_prepend = (u_char *)&dst.sa_data;
 		ro.ro_plen = hlen;
 		ro.ro_flags = RT_HAS_HEADER;
 	}
 
 	/* Avoid possible recursion on BPFD_LOCK(). */
 	NET_EPOCH_ENTER(et);
 	BPFD_UNLOCK(d);
 	error = (*ifp->if_output)(ifp, m, &dst, &ro);
 	if (error)
 		counter_u64_add(d->bd_wdcount, 1);
 
 	if (mc != NULL) {
 		if (error == 0)
 			(*ifp->if_input)(ifp, mc);
 		else
 			m_freem(mc);
 	}
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	bpfd_rele(d);
 	return (error);
 
 out_locked:
 	counter_u64_add(d->bd_wdcount, 1);
 	NET_EPOCH_EXIT(et);
 	BPFD_UNLOCK(d);
 	return (error);
 }
 
 /*
  * Reset a descriptor by flushing its packet buffer and clearing the receive
  * and drop counts.  This is doable for kernel-only buffers, but with
  * zero-copy buffers, we can't write to (or rotate) buffers that are
  * currently owned by userspace.  It would be nice if we could encapsulate
  * this logic in the buffer code rather than here.
  */
 static void
 reset_d(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	while (d->bd_hbuf_in_use)
 		mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET,
 		    "bd_hbuf", 0);
 	if ((d->bd_hbuf != NULL) &&
 	    (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) {
 		/* Free the hold buffer. */
 		d->bd_fbuf = d->bd_hbuf;
 		d->bd_hbuf = NULL;
 		d->bd_hlen = 0;
 		bpf_buf_reclaimed(d);
 	}
 	if (bpf_canwritebuf(d))
 		d->bd_slen = 0;
 	counter_u64_zero(d->bd_rcount);
 	counter_u64_zero(d->bd_dcount);
 	counter_u64_zero(d->bd_fcount);
 	counter_u64_zero(d->bd_wcount);
 	counter_u64_zero(d->bd_wfcount);
 	counter_u64_zero(d->bd_wdcount);
 	counter_u64_zero(d->bd_zcopy);
 }
 
 /*
  *  FIONREAD		Check for read packet available.
  *  BIOCGBLEN		Get buffer len [for read()].
  *  BIOCSETF		Set read filter.
  *  BIOCSETFNR		Set read filter without resetting descriptor.
  *  BIOCSETWF		Set write filter.
  *  BIOCFLUSH		Flush read packet buffer.
  *  BIOCPROMISC		Put interface into promiscuous mode.
  *  BIOCGDLT		Get link layer type.
  *  BIOCGETIF		Get interface name.
  *  BIOCSETIF		Set interface.
  *  BIOCSRTIMEOUT	Set read timeout.
  *  BIOCGRTIMEOUT	Get read timeout.
  *  BIOCGSTATS		Get packet stats.
  *  BIOCIMMEDIATE	Set immediate mode.
  *  BIOCVERSION		Get filter language version.
  *  BIOCGHDRCMPLT	Get "header already complete" flag
  *  BIOCSHDRCMPLT	Set "header already complete" flag
  *  BIOCGDIRECTION	Get packet direction flag
  *  BIOCSDIRECTION	Set packet direction flag
  *  BIOCGTSTAMP		Get time stamp format and resolution.
  *  BIOCSTSTAMP		Set time stamp format and resolution.
  *  BIOCLOCK		Set "locked" flag
  *  BIOCFEEDBACK	Set packet feedback mode.
  *  BIOCSETZBUF		Set current zero-copy buffer locations.
  *  BIOCGETZMAX		Get maximum zero-copy buffer size.
  *  BIOCROTZBUF		Force rotation of zero-copy buffer
  *  BIOCSETBUFMODE	Set buffer mode.
  *  BIOCGETBUFMODE	Get current buffer mode.
  */
 /* ARGSUSED */
 static	int
 bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
     struct thread *td)
 {
 	struct bpf_d *d;
 	int error;
 
 	error = devfs_get_cdevpriv((void **)&d);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Refresh PID associated with this descriptor.
 	 */
 	BPFD_LOCK(d);
 	BPF_PID_REFRESH(d, td);
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	d->bd_state = BPF_IDLE;
 	BPFD_UNLOCK(d);
 
 	if (d->bd_locked == 1) {
 		switch (cmd) {
 		case BIOCGBLEN:
 		case BIOCFLUSH:
 		case BIOCGDLT:
 		case BIOCGDLTLIST:
 #ifdef COMPAT_FREEBSD32
 		case BIOCGDLTLIST32:
 #endif
 		case BIOCGETIF:
 		case BIOCGRTIMEOUT:
 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
 		case BIOCGRTIMEOUT32:
 #endif
 		case BIOCGSTATS:
 		case BIOCVERSION:
 		case BIOCGRSIG:
 		case BIOCGHDRCMPLT:
 		case BIOCSTSTAMP:
 		case BIOCFEEDBACK:
 		case FIONREAD:
 		case BIOCLOCK:
 		case BIOCSRTIMEOUT:
 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
 		case BIOCSRTIMEOUT32:
 #endif
 		case BIOCIMMEDIATE:
 		case TIOCGPGRP:
 		case BIOCROTZBUF:
 			break;
 		default:
 			return (EPERM);
 		}
 	}
 #ifdef COMPAT_FREEBSD32
 	/*
 	 * If we see a 32-bit compat ioctl, mark the stream as 32-bit so
 	 * that it will get 32-bit packet headers.
 	 */
 	switch (cmd) {
 	case BIOCSETF32:
 	case BIOCSETFNR32:
 	case BIOCSETWF32:
 	case BIOCGDLTLIST32:
 	case BIOCGRTIMEOUT32:
 	case BIOCSRTIMEOUT32:
 		if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 			BPFD_LOCK(d);
 			d->bd_compat32 = 1;
 			BPFD_UNLOCK(d);
 		}
 	}
 #endif
 
 	CURVNET_SET(TD_TO_VNET(td));
 	switch (cmd) {
 
 	default:
 		error = EINVAL;
 		break;
 
 	/*
 	 * Check for read packet available.
 	 */
 	case FIONREAD:
 		{
 			int n;
 
 			BPFD_LOCK(d);
 			n = d->bd_slen;
 			while (d->bd_hbuf_in_use)
 				mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
 				    PRINET, "bd_hbuf", 0);
 			if (d->bd_hbuf)
 				n += d->bd_hlen;
 			BPFD_UNLOCK(d);
 
 			*(int *)addr = n;
 			break;
 		}
 
 	/*
 	 * Get buffer len [for read()].
 	 */
 	case BIOCGBLEN:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_bufsize;
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Set buffer length.
 	 */
 	case BIOCSBLEN:
 		error = bpf_ioctl_sblen(d, (u_int *)addr);
 		break;
 
 	/*
 	 * Set link layer read filter.
 	 */
 	case BIOCSETF:
 	case BIOCSETFNR:
 	case BIOCSETWF:
 #ifdef COMPAT_FREEBSD32
 	case BIOCSETF32:
 	case BIOCSETFNR32:
 	case BIOCSETWF32:
 #endif
 		error = bpf_setf(d, (struct bpf_program *)addr, cmd);
 		break;
 
 	/*
 	 * Flush read packet buffer.
 	 */
 	case BIOCFLUSH:
 		BPFD_LOCK(d);
 		reset_d(d);
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Put interface into promiscuous mode.
 	 */
 	case BIOCPROMISC:
 		if (d->bd_bif == NULL) {
 			/*
 			 * No interface attached yet.
 			 */
 			error = EINVAL;
 			break;
 		}
 		if (d->bd_promisc == 0) {
 			error = ifpromisc(d->bd_bif->bif_ifp, 1);
 			if (error == 0)
 				d->bd_promisc = 1;
 		}
 		break;
 
 	/*
 	 * Get current data link type.
 	 */
 	case BIOCGDLT:
 		BPF_LOCK();
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			*(u_int *)addr = d->bd_bif->bif_dlt;
 		BPF_UNLOCK();
 		break;
 
 	/*
 	 * Get a list of supported data link types.
 	 */
 #ifdef COMPAT_FREEBSD32
 	case BIOCGDLTLIST32:
 		{
 			struct bpf_dltlist32 *list32;
 			struct bpf_dltlist dltlist;
 
 			list32 = (struct bpf_dltlist32 *)addr;
 			dltlist.bfl_len = list32->bfl_len;
 			dltlist.bfl_list = PTRIN(list32->bfl_list);
 			BPF_LOCK();
 			if (d->bd_bif == NULL)
 				error = EINVAL;
 			else {
 				error = bpf_getdltlist(d, &dltlist);
 				if (error == 0)
 					list32->bfl_len = dltlist.bfl_len;
 			}
 			BPF_UNLOCK();
 			break;
 		}
 #endif
 
 	case BIOCGDLTLIST:
 		BPF_LOCK();
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
 		BPF_UNLOCK();
 		break;
 
 	/*
 	 * Set data link type.
 	 */
 	case BIOCSDLT:
 		BPF_LOCK();
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			error = bpf_setdlt(d, *(u_int *)addr);
 		BPF_UNLOCK();
 		break;
 
 	/*
 	 * Get interface name.
 	 */
 	case BIOCGETIF:
 		BPF_LOCK();
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else {
 			struct ifnet *const ifp = d->bd_bif->bif_ifp;
 			struct ifreq *const ifr = (struct ifreq *)addr;
 
 			strlcpy(ifr->ifr_name, ifp->if_xname,
 			    sizeof(ifr->ifr_name));
 		}
 		BPF_UNLOCK();
 		break;
 
 	/*
 	 * Set interface.
 	 */
 	case BIOCSETIF:
 		{
 			int alloc_buf, size;
 
 			/*
 			 * Behavior here depends on the buffering model.  If
 			 * we're using kernel memory buffers, then we can
 			 * allocate them here.  If we're using zero-copy,
 			 * then the user process must have registered buffers
 			 * by the time we get here.
 			 */
 			alloc_buf = 0;
 			BPFD_LOCK(d);
 			if (d->bd_bufmode == BPF_BUFMODE_BUFFER &&
 			    d->bd_sbuf == NULL)
 				alloc_buf = 1;
 			BPFD_UNLOCK(d);
 			if (alloc_buf) {
 				size = d->bd_bufsize;
 				error = bpf_buffer_ioctl_sblen(d, &size);
 				if (error != 0)
 					break;
 			}
 			BPF_LOCK();
 			error = bpf_setif(d, (struct ifreq *)addr);
 			BPF_UNLOCK();
 			break;
 		}
 
 	/*
 	 * Set read timeout.
 	 */
 	case BIOCSRTIMEOUT:
 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
 	case BIOCSRTIMEOUT32:
 #endif
 		{
 			struct timeval *tv = (struct timeval *)addr;
 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
 			struct timeval32 *tv32;
 			struct timeval tv64;
 
 			if (cmd == BIOCSRTIMEOUT32) {
 				tv32 = (struct timeval32 *)addr;
 				tv = &tv64;
 				tv->tv_sec = tv32->tv_sec;
 				tv->tv_usec = tv32->tv_usec;
 			} else
 #endif
 				tv = (struct timeval *)addr;
 
 			/*
 			 * Subtract 1 tick from tvtohz() since this isn't
 			 * a one-shot timer.
 			 */
 			if ((error = itimerfix(tv)) == 0)
 				d->bd_rtout = tvtohz(tv) - 1;
 			break;
 		}
 
 	/*
 	 * Get read timeout.
 	 */
 	case BIOCGRTIMEOUT:
 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
 	case BIOCGRTIMEOUT32:
 #endif
 		{
 			struct timeval *tv;
 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
 			struct timeval32 *tv32;
 			struct timeval tv64;
 
 			if (cmd == BIOCGRTIMEOUT32)
 				tv = &tv64;
 			else
 #endif
 				tv = (struct timeval *)addr;
 
 			tv->tv_sec = d->bd_rtout / hz;
 			tv->tv_usec = (d->bd_rtout % hz) * tick;
 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
 			if (cmd == BIOCGRTIMEOUT32) {
 				tv32 = (struct timeval32 *)addr;
 				tv32->tv_sec = tv->tv_sec;
 				tv32->tv_usec = tv->tv_usec;
 			}
 #endif
 
 			break;
 		}
 
 	/*
 	 * Get packet stats.
 	 */
 	case BIOCGSTATS:
 		{
 			struct bpf_stat *bs = (struct bpf_stat *)addr;
 
 			/* XXXCSJP overflow */
 			bs->bs_recv = (u_int)counter_u64_fetch(d->bd_rcount);
 			bs->bs_drop = (u_int)counter_u64_fetch(d->bd_dcount);
 			break;
 		}
 
 	/*
 	 * Set immediate mode.
 	 */
 	case BIOCIMMEDIATE:
 		BPFD_LOCK(d);
 		d->bd_immediate = *(u_int *)addr;
 		BPFD_UNLOCK(d);
 		break;
 
 	case BIOCVERSION:
 		{
 			struct bpf_version *bv = (struct bpf_version *)addr;
 
 			bv->bv_major = BPF_MAJOR_VERSION;
 			bv->bv_minor = BPF_MINOR_VERSION;
 			break;
 		}
 
 	/*
 	 * Get "header already complete" flag
 	 */
 	case BIOCGHDRCMPLT:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_hdrcmplt;
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Set "header already complete" flag
 	 */
 	case BIOCSHDRCMPLT:
 		BPFD_LOCK(d);
 		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Get packet direction flag
 	 */
 	case BIOCGDIRECTION:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_direction;
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Set packet direction flag
 	 */
 	case BIOCSDIRECTION:
 		{
 			u_int	direction;
 
 			direction = *(u_int *)addr;
 			switch (direction) {
 			case BPF_D_IN:
 			case BPF_D_INOUT:
 			case BPF_D_OUT:
 				BPFD_LOCK(d);
 				d->bd_direction = direction;
 				BPFD_UNLOCK(d);
 				break;
 			default:
 				error = EINVAL;
 			}
 		}
 		break;
 
 	/*
 	 * Get packet timestamp format and resolution.
 	 */
 	case BIOCGTSTAMP:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_tstamp;
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Set packet timestamp format and resolution.
 	 */
 	case BIOCSTSTAMP:
 		{
 			u_int	func;
 
 			func = *(u_int *)addr;
 			if (BPF_T_VALID(func))
 				d->bd_tstamp = func;
 			else
 				error = EINVAL;
 		}
 		break;
 
 	case BIOCFEEDBACK:
 		BPFD_LOCK(d);
 		d->bd_feedback = *(u_int *)addr;
 		BPFD_UNLOCK(d);
 		break;
 
 	case BIOCLOCK:
 		BPFD_LOCK(d);
 		d->bd_locked = 1;
 		BPFD_UNLOCK(d);
 		break;
 
 	case FIONBIO:		/* Non-blocking I/O */
 		break;
 
 	case FIOASYNC:		/* Send signal on receive packets */
 		BPFD_LOCK(d);
 		d->bd_async = *(int *)addr;
 		BPFD_UNLOCK(d);
 		break;
 
 	case FIOSETOWN:
 		/*
 		 * XXX: Add some sort of locking here?
 		 * fsetown() can sleep.
 		 */
 		error = fsetown(*(int *)addr, &d->bd_sigio);
 		break;
 
 	case FIOGETOWN:
 		BPFD_LOCK(d);
 		*(int *)addr = fgetown(&d->bd_sigio);
 		BPFD_UNLOCK(d);
 		break;
 
 	/* This is deprecated, FIOSETOWN should be used instead. */
 	case TIOCSPGRP:
 		error = fsetown(-(*(int *)addr), &d->bd_sigio);
 		break;
 
 	/* This is deprecated, FIOGETOWN should be used instead. */
 	case TIOCGPGRP:
 		*(int *)addr = -fgetown(&d->bd_sigio);
 		break;
 
 	case BIOCSRSIG:		/* Set receive signal */
 		{
 			u_int sig;
 
 			sig = *(u_int *)addr;
 
 			if (sig >= NSIG)
 				error = EINVAL;
 			else {
 				BPFD_LOCK(d);
 				d->bd_sig = sig;
 				BPFD_UNLOCK(d);
 			}
 			break;
 		}
 	case BIOCGRSIG:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_sig;
 		BPFD_UNLOCK(d);
 		break;
 
 	case BIOCGETBUFMODE:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_bufmode;
 		BPFD_UNLOCK(d);
 		break;
 
 	case BIOCSETBUFMODE:
 		/*
 		 * Allow the buffering mode to be changed as long as we
 		 * haven't yet committed to a particular mode.  Our
 		 * definition of commitment, for now, is whether or not a
 		 * buffer has been allocated or an interface attached, since
 		 * that's the point where things get tricky.
 		 */
 		switch (*(u_int *)addr) {
 		case BPF_BUFMODE_BUFFER:
 			break;
 
 		case BPF_BUFMODE_ZBUF:
 			if (bpf_zerocopy_enable)
 				break;
 			/* FALLSTHROUGH */
 
 		default:
 			CURVNET_RESTORE();
 			return (EINVAL);
 		}
 
 		BPFD_LOCK(d);
 		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
 		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
 			BPFD_UNLOCK(d);
 			CURVNET_RESTORE();
 			return (EBUSY);
 		}
 		d->bd_bufmode = *(u_int *)addr;
 		BPFD_UNLOCK(d);
 		break;
 
 	case BIOCGETZMAX:
 		error = bpf_ioctl_getzmax(td, d, (size_t *)addr);
 		break;
 
 	case BIOCSETZBUF:
 		error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr);
 		break;
 
 	case BIOCROTZBUF:
 		error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr);
 		break;
 	}
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * Set d's packet filter program to fp. If this file already has a filter,
  * free it and replace it. Returns EINVAL for bogus requests.
  *
  * Note we use global lock here to serialize bpf_setf() and bpf_setif()
  * calls.
  */
 static int
 bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
 {
 #ifdef COMPAT_FREEBSD32
 	struct bpf_program fp_swab;
 	struct bpf_program32 *fp32;
 #endif
 	struct bpf_program_buffer *fcode;
 	struct bpf_insn *filter;
 #ifdef BPF_JITTER
 	bpf_jit_filter *jfunc;
 #endif
 	size_t size;
 	u_int flen;
 	bool track_event;
 
 #ifdef COMPAT_FREEBSD32
 	switch (cmd) {
 	case BIOCSETF32:
 	case BIOCSETWF32:
 	case BIOCSETFNR32:
 		fp32 = (struct bpf_program32 *)fp;
 		fp_swab.bf_len = fp32->bf_len;
 		fp_swab.bf_insns =
 		    (struct bpf_insn *)(uintptr_t)fp32->bf_insns;
 		fp = &fp_swab;
 		switch (cmd) {
 		case BIOCSETF32:
 			cmd = BIOCSETF;
 			break;
 		case BIOCSETWF32:
 			cmd = BIOCSETWF;
 			break;
 		}
 		break;
 	}
 #endif
 
 	filter = NULL;
 #ifdef BPF_JITTER
 	jfunc = NULL;
 #endif
 	/*
 	 * Check new filter validness before acquiring any locks.
 	 * Allocate memory for new filter, if needed.
 	 */
 	flen = fp->bf_len;
 	if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0))
 		return (EINVAL);
 	size = flen * sizeof(*fp->bf_insns);
 	if (size > 0) {
 		/* We're setting up new filter. Copy and check actual data. */
 		fcode = bpf_program_buffer_alloc(size, M_WAITOK);
 		filter = (struct bpf_insn *)fcode->buffer;
 		if (copyin(fp->bf_insns, filter, size) != 0 ||
 		    !bpf_validate(filter, flen)) {
 			free(fcode, M_BPF);
 			return (EINVAL);
 		}
 #ifdef BPF_JITTER
 		if (cmd != BIOCSETWF) {
 			/*
 			 * Filter is copied inside fcode and is
 			 * perfectly valid.
 			 */
 			jfunc = bpf_jitter(filter, flen);
 		}
 #endif
 	}
 
 	track_event = false;
 	fcode = NULL;
 
 	BPF_LOCK();
 	BPFD_LOCK(d);
 	/* Set up new filter. */
 	if (cmd == BIOCSETWF) {
 		if (d->bd_wfilter != NULL) {
 			fcode = __containerof((void *)d->bd_wfilter,
 			    struct bpf_program_buffer, buffer);
 #ifdef BPF_JITTER
 			fcode->func = NULL;
 #endif
 		}
 		d->bd_wfilter = filter;
 	} else {
 		if (d->bd_rfilter != NULL) {
 			fcode = __containerof((void *)d->bd_rfilter,
 			    struct bpf_program_buffer, buffer);
 #ifdef BPF_JITTER
 			fcode->func = d->bd_bfilter;
 #endif
 		}
 		d->bd_rfilter = filter;
 #ifdef BPF_JITTER
 		d->bd_bfilter = jfunc;
 #endif
 		if (cmd == BIOCSETF)
 			reset_d(d);
 
 		if (bpf_check_upgrade(cmd, d, filter, flen) != 0) {
 			/*
 			 * Filter can be set several times without
 			 * specifying interface. In this case just mark d
 			 * as reader.
 			 */
 			d->bd_writer = 0;
 			if (d->bd_bif != NULL) {
 				/*
 				 * Remove descriptor from writers-only list
 				 * and add it to active readers list.
 				 */
 				CK_LIST_REMOVE(d, bd_next);
 				CK_LIST_INSERT_HEAD(&d->bd_bif->bif_dlist,
 				    d, bd_next);
 				CTR2(KTR_NET,
 				    "%s: upgrade required by pid %d",
 				    __func__, d->bd_pid);
 				track_event = true;
 			}
 		}
 	}
 	BPFD_UNLOCK(d);
 
 	if (fcode != NULL)
 		epoch_call(net_epoch_preempt, &fcode->epoch_ctx,
 		    bpf_program_buffer_free);
 
 	if (track_event)
 		EVENTHANDLER_INVOKE(bpf_track,
 		    d->bd_bif->bif_ifp, d->bd_bif->bif_dlt, 1);
 
 	BPF_UNLOCK();
 	return (0);
 }
 
 /*
  * Detach a file from its current interface (if attached at all) and attach
  * to the interface indicated by the name stored in ifr.
  * Return an errno or 0.
  */
 static int
 bpf_setif(struct bpf_d *d, struct ifreq *ifr)
 {
 	struct bpf_if *bp;
 	struct ifnet *theywant;
 
 	BPF_LOCK_ASSERT();
 
 	theywant = ifunit(ifr->ifr_name);
 	if (theywant == NULL || theywant->if_bpf == NULL)
 		return (ENXIO);
 
 	bp = theywant->if_bpf;
 	/*
 	 * At this point, we expect the buffer is already allocated.  If not,
 	 * return an error.
 	 */
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_BUFFER:
 	case BPF_BUFMODE_ZBUF:
 		if (d->bd_sbuf == NULL)
 			return (EINVAL);
 		break;
 
 	default:
 		panic("bpf_setif: bufmode %d", d->bd_bufmode);
 	}
 	if (bp != d->bd_bif)
 		bpf_attachd(d, bp);
 	else {
 		BPFD_LOCK(d);
 		reset_d(d);
 		BPFD_UNLOCK(d);
 	}
 	return (0);
 }
 
 /*
  * Support for select() and poll() system calls
  *
  * Return true iff the specific operation will not block indefinitely.
  * Otherwise, return false but make a note that a selwakeup() must be done.
  */
 static int
 bpfpoll(struct cdev *dev, int events, struct thread *td)
 {
 	struct bpf_d *d;
 	int revents;
 
 	if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL)
 		return (events &
 		    (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM));
 
 	/*
 	 * Refresh PID associated with this descriptor.
 	 */
 	revents = events & (POLLOUT | POLLWRNORM);
 	BPFD_LOCK(d);
 	BPF_PID_REFRESH(d, td);
 	if (events & (POLLIN | POLLRDNORM)) {
 		if (bpf_ready(d))
 			revents |= events & (POLLIN | POLLRDNORM);
 		else {
 			selrecord(td, &d->bd_sel);
 			/* Start the read timeout if necessary. */
 			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
 				callout_reset(&d->bd_callout, d->bd_rtout,
 				    bpf_timed_out, d);
 				d->bd_state = BPF_WAITING;
 			}
 		}
 	}
 	BPFD_UNLOCK(d);
 	return (revents);
 }
 
 /*
  * Support for kevent() system call.  Register EVFILT_READ filters and
  * reject all others.
  */
 int
 bpfkqfilter(struct cdev *dev, struct knote *kn)
 {
 	struct bpf_d *d;
 
 	if (devfs_get_cdevpriv((void **)&d) != 0 ||
 	    kn->kn_filter != EVFILT_READ)
 		return (1);
 
 	/*
 	 * Refresh PID associated with this descriptor.
 	 */
 	BPFD_LOCK(d);
 	BPF_PID_REFRESH_CUR(d);
 	kn->kn_fop = &bpfread_filtops;
 	kn->kn_hook = d;
 	knlist_add(&d->bd_sel.si_note, kn, 1);
 	BPFD_UNLOCK(d);
 
 	return (0);
 }
 
 static void
 filt_bpfdetach(struct knote *kn)
 {
 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
 
 	knlist_remove(&d->bd_sel.si_note, kn, 0);
 }
 
 static int
 filt_bpfread(struct knote *kn, long hint)
 {
 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
 	int ready;
 
 	BPFD_LOCK_ASSERT(d);
 	ready = bpf_ready(d);
 	if (ready) {
 		kn->kn_data = d->bd_slen;
 		/*
 		 * Ignore the hold buffer if it is being copied to user space.
 		 */
 		if (!d->bd_hbuf_in_use && d->bd_hbuf)
 			kn->kn_data += d->bd_hlen;
 	} else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
 		callout_reset(&d->bd_callout, d->bd_rtout,
 		    bpf_timed_out, d);
 		d->bd_state = BPF_WAITING;
 	}
 
 	return (ready);
 }
 
 #define	BPF_TSTAMP_NONE		0
 #define	BPF_TSTAMP_FAST		1
 #define	BPF_TSTAMP_NORMAL	2
 #define	BPF_TSTAMP_EXTERN	3
 
 static int
 bpf_ts_quality(int tstype)
 {
 
 	if (tstype == BPF_T_NONE)
 		return (BPF_TSTAMP_NONE);
 	if ((tstype & BPF_T_FAST) != 0)
 		return (BPF_TSTAMP_FAST);
 
 	return (BPF_TSTAMP_NORMAL);
 }
 
 static int
 bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m)
 {
 	struct m_tag *tag;
 	int quality;
 
 	quality = bpf_ts_quality(tstype);
 	if (quality == BPF_TSTAMP_NONE)
 		return (quality);
 
 	if (m != NULL) {
 		tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL);
 		if (tag != NULL) {
 			*bt = *(struct bintime *)(tag + 1);
 			return (BPF_TSTAMP_EXTERN);
 		}
 	}
 	if (quality == BPF_TSTAMP_NORMAL)
 		binuptime(bt);
 	else
 		getbinuptime(bt);
 
 	return (quality);
 }
 
 /*
  * Incoming linkage from device drivers.  Process the packet pkt, of length
  * pktlen, which is stored in a contiguous buffer.  The packet is parsed
  * by each process' filter, and if accepted, stashed into the corresponding
  * buffer.
  */
 void
 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 {
 	struct epoch_tracker et;
 	struct bintime bt;
 	struct bpf_d *d;
 #ifdef BPF_JITTER
 	bpf_jit_filter *bf;
 #endif
 	u_int slen;
 	int gottime;
 
 	gottime = BPF_TSTAMP_NONE;
 	NET_EPOCH_ENTER(et);
 	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 		counter_u64_add(d->bd_rcount, 1);
 		/*
 		 * NB: We dont call BPF_CHECK_DIRECTION() here since there
 		 * is no way for the caller to indiciate to us whether this
 		 * packet is inbound or outbound. In the bpf_mtap() routines,
 		 * we use the interface pointers on the mbuf to figure it out.
 		 */
 #ifdef BPF_JITTER
 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
 		if (bf != NULL)
 			slen = (*(bf->func))(pkt, pktlen, pktlen);
 		else
 #endif
 		slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
 		if (slen != 0) {
 			/*
 			 * Filter matches. Let's to acquire write lock.
 			 */
 			BPFD_LOCK(d);
 			counter_u64_add(d->bd_fcount, 1);
 			if (gottime < bpf_ts_quality(d->bd_tstamp))
 				gottime = bpf_gettime(&bt, d->bd_tstamp,
 				    NULL);
 #ifdef MAC
 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, pkt, pktlen, slen,
 				    bpf_append_bytes, &bt);
 			BPFD_UNLOCK(d);
 		}
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 #define	BPF_CHECK_DIRECTION(d, r, i)				\
 	    (((d)->bd_direction == BPF_D_IN && (r) != (i)) ||	\
 	    ((d)->bd_direction == BPF_D_OUT && (r) == (i)))
 
 /*
  * Incoming linkage from device drivers, when packet is in an mbuf chain.
  * Locking model is explained in bpf_tap().
  */
 void
 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 {
 	struct epoch_tracker et;
 	struct bintime bt;
 	struct bpf_d *d;
 #ifdef BPF_JITTER
 	bpf_jit_filter *bf;
 #endif
 	u_int pktlen, slen;
 	int gottime;
 
 	/* Skip outgoing duplicate packets. */
-	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
+	if ((m->m_flags & M_PROMISC) != 0 && m_rcvif(m) == NULL) {
 		m->m_flags &= ~M_PROMISC;
 		return;
 	}
 
 	pktlen = m_length(m, NULL);
 	gottime = BPF_TSTAMP_NONE;
 
 	NET_EPOCH_ENTER(et);
 	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
-		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
+		if (BPF_CHECK_DIRECTION(d, m_rcvif(m), bp->bif_ifp))
 			continue;
 		counter_u64_add(d->bd_rcount, 1);
 #ifdef BPF_JITTER
 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
 		/* XXX We cannot handle multiple mbufs. */
 		if (bf != NULL && m->m_next == NULL)
 			slen = (*(bf->func))(mtod(m, u_char *), pktlen,
 			    pktlen);
 		else
 #endif
 		slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
 		if (slen != 0) {
 			BPFD_LOCK(d);
 
 			counter_u64_add(d->bd_fcount, 1);
 			if (gottime < bpf_ts_quality(d->bd_tstamp))
 				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
 #ifdef MAC
 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)m, pktlen, slen,
 				    bpf_append_mbuf, &bt);
 			BPFD_UNLOCK(d);
 		}
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Incoming linkage from device drivers, when packet is in
  * an mbuf chain and to be prepended by a contiguous header.
  */
 void
 bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
 {
 	struct epoch_tracker et;
 	struct bintime bt;
 	struct mbuf mb;
 	struct bpf_d *d;
 	u_int pktlen, slen;
 	int gottime;
 
 	/* Skip outgoing duplicate packets. */
 	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
 		m->m_flags &= ~M_PROMISC;
 		return;
 	}
 
 	pktlen = m_length(m, NULL);
 	/*
 	 * Craft on-stack mbuf suitable for passing to bpf_filter.
 	 * Note that we cut corners here; we only setup what's
 	 * absolutely needed--this mbuf should never go anywhere else.
 	 */
 	mb.m_next = m;
 	mb.m_data = data;
 	mb.m_len = dlen;
 	pktlen += dlen;
 
 	gottime = BPF_TSTAMP_NONE;
 
 	NET_EPOCH_ENTER(et);
 	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
 			continue;
 		counter_u64_add(d->bd_rcount, 1);
 		slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0);
 		if (slen != 0) {
 			BPFD_LOCK(d);
 
 			counter_u64_add(d->bd_fcount, 1);
 			if (gottime < bpf_ts_quality(d->bd_tstamp))
 				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
 #ifdef MAC
 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)&mb, pktlen, slen,
 				    bpf_append_mbuf, &bt);
 			BPFD_UNLOCK(d);
 		}
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 #undef	BPF_CHECK_DIRECTION
 #undef	BPF_TSTAMP_NONE
 #undef	BPF_TSTAMP_FAST
 #undef	BPF_TSTAMP_NORMAL
 #undef	BPF_TSTAMP_EXTERN
 
 static int
 bpf_hdrlen(struct bpf_d *d)
 {
 	int hdrlen;
 
 	hdrlen = d->bd_bif->bif_hdrlen;
 #ifndef BURN_BRIDGES
 	if (d->bd_tstamp == BPF_T_NONE ||
 	    BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME)
 #ifdef COMPAT_FREEBSD32
 		if (d->bd_compat32)
 			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32);
 		else
 #endif
 			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr);
 	else
 #endif
 		hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr);
 #ifdef COMPAT_FREEBSD32
 	if (d->bd_compat32)
 		hdrlen = BPF_WORDALIGN32(hdrlen);
 	else
 #endif
 		hdrlen = BPF_WORDALIGN(hdrlen);
 
 	return (hdrlen - d->bd_bif->bif_hdrlen);
 }
 
 static void
 bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype)
 {
 	struct bintime bt2, boottimebin;
 	struct timeval tsm;
 	struct timespec tsn;
 
 	if ((tstype & BPF_T_MONOTONIC) == 0) {
 		bt2 = *bt;
 		getboottimebin(&boottimebin);
 		bintime_add(&bt2, &boottimebin);
 		bt = &bt2;
 	}
 	switch (BPF_T_FORMAT(tstype)) {
 	case BPF_T_MICROTIME:
 		bintime2timeval(bt, &tsm);
 		ts->bt_sec = tsm.tv_sec;
 		ts->bt_frac = tsm.tv_usec;
 		break;
 	case BPF_T_NANOTIME:
 		bintime2timespec(bt, &tsn);
 		ts->bt_sec = tsn.tv_sec;
 		ts->bt_frac = tsn.tv_nsec;
 		break;
 	case BPF_T_BINTIME:
 		ts->bt_sec = bt->sec;
 		ts->bt_frac = bt->frac;
 		break;
 	}
 }
 
 /*
  * Move the packet data from interface memory (pkt) into the
  * store buffer.  "cpfn" is the routine called to do the actual data
  * transfer.  bcopy is passed in to copy contiguous chunks, while
  * bpf_append_mbuf is passed in to copy mbuf chains.  In the latter case,
  * pkt is really an mbuf.
  */
 static void
 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
     void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
     struct bintime *bt)
 {
 	struct bpf_xhdr hdr;
 #ifndef BURN_BRIDGES
 	struct bpf_hdr hdr_old;
 #ifdef COMPAT_FREEBSD32
 	struct bpf_hdr32 hdr32_old;
 #endif
 #endif
 	int caplen, curlen, hdrlen, totlen;
 	int do_wakeup = 0;
 	int do_timestamp;
 	int tstype;
 
 	BPFD_LOCK_ASSERT(d);
 
 	/*
 	 * Detect whether user space has released a buffer back to us, and if
 	 * so, move it from being a hold buffer to a free buffer.  This may
 	 * not be the best place to do it (for example, we might only want to
 	 * run this check if we need the space), but for now it's a reliable
 	 * spot to do it.
 	 */
 	if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) {
 		d->bd_fbuf = d->bd_hbuf;
 		d->bd_hbuf = NULL;
 		d->bd_hlen = 0;
 		bpf_buf_reclaimed(d);
 	}
 
 	/*
 	 * Figure out how many bytes to move.  If the packet is
 	 * greater or equal to the snapshot length, transfer that
 	 * much.  Otherwise, transfer the whole packet (unless
 	 * we hit the buffer size limit).
 	 */
 	hdrlen = bpf_hdrlen(d);
 	totlen = hdrlen + min(snaplen, pktlen);
 	if (totlen > d->bd_bufsize)
 		totlen = d->bd_bufsize;
 
 	/*
 	 * Round up the end of the previous packet to the next longword.
 	 *
 	 * Drop the packet if there's no room and no hope of room
 	 * If the packet would overflow the storage buffer or the storage
 	 * buffer is considered immutable by the buffer model, try to rotate
 	 * the buffer and wakeup pending processes.
 	 */
 #ifdef COMPAT_FREEBSD32
 	if (d->bd_compat32)
 		curlen = BPF_WORDALIGN32(d->bd_slen);
 	else
 #endif
 		curlen = BPF_WORDALIGN(d->bd_slen);
 	if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) {
 		if (d->bd_fbuf == NULL) {
 			/*
 			 * There's no room in the store buffer, and no
 			 * prospect of room, so drop the packet.  Notify the
 			 * buffer model.
 			 */
 			bpf_buffull(d);
 			counter_u64_add(d->bd_dcount, 1);
 			return;
 		}
 		KASSERT(!d->bd_hbuf_in_use, ("hold buffer is in use"));
 		ROTATE_BUFFERS(d);
 		do_wakeup = 1;
 		curlen = 0;
 	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
 		/*
 		 * Immediate mode is set, or the read timeout has already
 		 * expired during a select call.  A packet arrived, so the
 		 * reader should be woken up.
 		 */
 		do_wakeup = 1;
 	caplen = totlen - hdrlen;
 	tstype = d->bd_tstamp;
 	do_timestamp = tstype != BPF_T_NONE;
 #ifndef BURN_BRIDGES
 	if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) {
 		struct bpf_ts ts;
 		if (do_timestamp)
 			bpf_bintime2ts(bt, &ts, tstype);
 #ifdef COMPAT_FREEBSD32
 		if (d->bd_compat32) {
 			bzero(&hdr32_old, sizeof(hdr32_old));
 			if (do_timestamp) {
 				hdr32_old.bh_tstamp.tv_sec = ts.bt_sec;
 				hdr32_old.bh_tstamp.tv_usec = ts.bt_frac;
 			}
 			hdr32_old.bh_datalen = pktlen;
 			hdr32_old.bh_hdrlen = hdrlen;
 			hdr32_old.bh_caplen = caplen;
 			bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old,
 			    sizeof(hdr32_old));
 			goto copy;
 		}
 #endif
 		bzero(&hdr_old, sizeof(hdr_old));
 		if (do_timestamp) {
 			hdr_old.bh_tstamp.tv_sec = ts.bt_sec;
 			hdr_old.bh_tstamp.tv_usec = ts.bt_frac;
 		}
 		hdr_old.bh_datalen = pktlen;
 		hdr_old.bh_hdrlen = hdrlen;
 		hdr_old.bh_caplen = caplen;
 		bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old,
 		    sizeof(hdr_old));
 		goto copy;
 	}
 #endif
 
 	/*
 	 * Append the bpf header.  Note we append the actual header size, but
 	 * move forward the length of the header plus padding.
 	 */
 	bzero(&hdr, sizeof(hdr));
 	if (do_timestamp)
 		bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype);
 	hdr.bh_datalen = pktlen;
 	hdr.bh_hdrlen = hdrlen;
 	hdr.bh_caplen = caplen;
 	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
 
 	/*
 	 * Copy the packet data into the store buffer and update its length.
 	 */
 #ifndef BURN_BRIDGES
 copy:
 #endif
 	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen);
 	d->bd_slen = curlen + totlen;
 
 	if (do_wakeup)
 		bpf_wakeup(d);
 }
 
 /*
  * Free buffers currently in use by a descriptor.
  * Called on close.
  */
 static void
 bpfd_free(epoch_context_t ctx)
 {
 	struct bpf_d *d;
 	struct bpf_program_buffer *p;
 
 	/*
 	 * We don't need to lock out interrupts since this descriptor has
 	 * been detached from its interface and it yet hasn't been marked
 	 * free.
 	 */
 	d = __containerof(ctx, struct bpf_d, epoch_ctx);
 	bpf_free(d);
 	if (d->bd_rfilter != NULL) {
 		p = __containerof((void *)d->bd_rfilter,
 		    struct bpf_program_buffer, buffer);
 #ifdef BPF_JITTER
 		p->func = d->bd_bfilter;
 #endif
 		bpf_program_buffer_free(&p->epoch_ctx);
 	}
 	if (d->bd_wfilter != NULL) {
 		p = __containerof((void *)d->bd_wfilter,
 		    struct bpf_program_buffer, buffer);
 #ifdef BPF_JITTER
 		p->func = NULL;
 #endif
 		bpf_program_buffer_free(&p->epoch_ctx);
 	}
 
 	mtx_destroy(&d->bd_lock);
 	counter_u64_free(d->bd_rcount);
 	counter_u64_free(d->bd_dcount);
 	counter_u64_free(d->bd_fcount);
 	counter_u64_free(d->bd_wcount);
 	counter_u64_free(d->bd_wfcount);
 	counter_u64_free(d->bd_wdcount);
 	counter_u64_free(d->bd_zcopy);
 	free(d, M_BPF);
 }
 
 /*
  * Attach an interface to bpf.  dlt is the link layer type; hdrlen is the
  * fixed size of the link header (variable length headers not yet supported).
  */
 void
 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
 {
 
 	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
 }
 
 /*
  * Attach an interface to bpf.  ifp is a pointer to the structure
  * defining the interface to be attached, dlt is the link layer type,
  * and hdrlen is the fixed size of the link header (variable length
  * headers are not yet supporrted).
  */
 void
 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen,
     struct bpf_if **driverp)
 {
 	struct bpf_if *bp;
 
 	KASSERT(*driverp == NULL,
 	    ("bpfattach2: driverp already initialized"));
 
 	bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO);
 
 	CK_LIST_INIT(&bp->bif_dlist);
 	CK_LIST_INIT(&bp->bif_wlist);
 	bp->bif_ifp = ifp;
 	bp->bif_dlt = dlt;
 	bp->bif_hdrlen = hdrlen;
 	bp->bif_bpf = driverp;
 	bp->bif_refcnt = 1;
 	*driverp = bp;
 	/*
 	 * Reference ifnet pointer, so it won't freed until
 	 * we release it.
 	 */
 	if_ref(ifp);
 	BPF_LOCK();
 	CK_LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
 	BPF_UNLOCK();
 
 	if (bootverbose && IS_DEFAULT_VNET(curvnet))
 		if_printf(ifp, "bpf attached\n");
 }
 
 #ifdef VIMAGE
 /*
  * When moving interfaces between vnet instances we need a way to
  * query the dlt and hdrlen before detach so we can re-attch the if_bpf
  * after the vmove.  We unfortunately have no device driver infrastructure
  * to query the interface for these values after creation/attach, thus
  * add this as a workaround.
  */
 int
 bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u_int *bif_hdrlen)
 {
 
 	if (bp == NULL)
 		return (ENXIO);
 	if (bif_dlt == NULL && bif_hdrlen == NULL)
 		return (0);
 
 	if (bif_dlt != NULL)
 		*bif_dlt = bp->bif_dlt;
 	if (bif_hdrlen != NULL)
 		*bif_hdrlen = bp->bif_hdrlen;
 
 	return (0);
 }
 #endif
 
 /*
  * Detach bpf from an interface. This involves detaching each descriptor
  * associated with the interface. Notify each descriptor as it's detached
  * so that any sleepers wake up and get ENXIO.
  */
 void
 bpfdetach(struct ifnet *ifp)
 {
 	struct bpf_if *bp, *bp_temp;
 	struct bpf_d *d;
 
 	BPF_LOCK();
 	/* Find all bpf_if struct's which reference ifp and detach them. */
 	CK_LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) {
 		if (ifp != bp->bif_ifp)
 			continue;
 
 		CK_LIST_REMOVE(bp, bif_next);
 		*bp->bif_bpf = (struct bpf_if *)&dead_bpf_if;
 
 		CTR4(KTR_NET,
 		    "%s: sheduling free for encap %d (%p) for if %p",
 		    __func__, bp->bif_dlt, bp, ifp);
 
 		/* Detach common descriptors */
 		while ((d = CK_LIST_FIRST(&bp->bif_dlist)) != NULL) {
 			bpf_detachd_locked(d, true);
 		}
 
 		/* Detach writer-only descriptors */
 		while ((d = CK_LIST_FIRST(&bp->bif_wlist)) != NULL) {
 			bpf_detachd_locked(d, true);
 		}
 		bpfif_rele(bp);
 	}
 	BPF_UNLOCK();
 }
 
 /*
  * Get a list of available data link type of the interface.
  */
 static int
 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
 {
 	struct ifnet *ifp;
 	struct bpf_if *bp;
 	u_int *lst;
 	int error, n, n1;
 
 	BPF_LOCK_ASSERT();
 
 	ifp = d->bd_bif->bif_ifp;
 	n1 = 0;
 	CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		if (bp->bif_ifp == ifp)
 			n1++;
 	}
 	if (bfl->bfl_list == NULL) {
 		bfl->bfl_len = n1;
 		return (0);
 	}
 	if (n1 > bfl->bfl_len)
 		return (ENOMEM);
 
 	lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK);
 	n = 0;
 	CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		if (bp->bif_ifp != ifp)
 			continue;
 		lst[n++] = bp->bif_dlt;
 	}
 	error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n);
 	free(lst, M_TEMP);
 	bfl->bfl_len = n;
 	return (error);
 }
 
 /*
  * Set the data link type of a BPF instance.
  */
 static int
 bpf_setdlt(struct bpf_d *d, u_int dlt)
 {
 	int error, opromisc;
 	struct ifnet *ifp;
 	struct bpf_if *bp;
 
 	BPF_LOCK_ASSERT();
 	MPASS(d->bd_bif != NULL);
 
 	/*
 	 * It is safe to check bd_bif without BPFD_LOCK, it can not be
 	 * changed while we hold global lock.
 	 */
 	if (d->bd_bif->bif_dlt == dlt)
 		return (0);
 
 	ifp = d->bd_bif->bif_ifp;
 	CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		if (bp->bif_ifp == ifp && bp->bif_dlt == dlt)
 			break;
 	}
 	if (bp == NULL)
 		return (EINVAL);
 
 	opromisc = d->bd_promisc;
 	bpf_attachd(d, bp);
 	if (opromisc) {
 		error = ifpromisc(bp->bif_ifp, 1);
 		if (error)
 			if_printf(bp->bif_ifp, "%s: ifpromisc failed (%d)\n",
 			    __func__, error);
 		else
 			d->bd_promisc = 1;
 	}
 	return (0);
 }
 
 static void
 bpf_drvinit(void *unused)
 {
 	struct cdev *dev;
 
 	sx_init(&bpf_sx, "bpf global lock");
 	CK_LIST_INIT(&bpf_iflist);
 
 	dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf");
 	/* For compatibility */
 	make_dev_alias(dev, "bpf0");
 }
 
 /*
  * Zero out the various packet counters associated with all of the bpf
  * descriptors.  At some point, we will probably want to get a bit more
  * granular and allow the user to specify descriptors to be zeroed.
  */
 static void
 bpf_zero_counters(void)
 {
 	struct bpf_if *bp;
 	struct bpf_d *bd;
 
 	BPF_LOCK();
 	/*
 	 * We are protected by global lock here, interfaces and
 	 * descriptors can not be deleted while we hold it.
 	 */
 	CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
 			counter_u64_zero(bd->bd_rcount);
 			counter_u64_zero(bd->bd_dcount);
 			counter_u64_zero(bd->bd_fcount);
 			counter_u64_zero(bd->bd_wcount);
 			counter_u64_zero(bd->bd_wfcount);
 			counter_u64_zero(bd->bd_zcopy);
 		}
 	}
 	BPF_UNLOCK();
 }
 
 /*
  * Fill filter statistics
  */
 static void
 bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
 {
 
 	BPF_LOCK_ASSERT();
 	bzero(d, sizeof(*d));
 	d->bd_structsize = sizeof(*d);
 	d->bd_immediate = bd->bd_immediate;
 	d->bd_promisc = bd->bd_promisc;
 	d->bd_hdrcmplt = bd->bd_hdrcmplt;
 	d->bd_direction = bd->bd_direction;
 	d->bd_feedback = bd->bd_feedback;
 	d->bd_async = bd->bd_async;
 	d->bd_rcount = counter_u64_fetch(bd->bd_rcount);
 	d->bd_dcount = counter_u64_fetch(bd->bd_dcount);
 	d->bd_fcount = counter_u64_fetch(bd->bd_fcount);
 	d->bd_sig = bd->bd_sig;
 	d->bd_slen = bd->bd_slen;
 	d->bd_hlen = bd->bd_hlen;
 	d->bd_bufsize = bd->bd_bufsize;
 	d->bd_pid = bd->bd_pid;
 	strlcpy(d->bd_ifname,
 	    bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
 	d->bd_locked = bd->bd_locked;
 	d->bd_wcount = counter_u64_fetch(bd->bd_wcount);
 	d->bd_wdcount = counter_u64_fetch(bd->bd_wdcount);
 	d->bd_wfcount = counter_u64_fetch(bd->bd_wfcount);
 	d->bd_zcopy = counter_u64_fetch(bd->bd_zcopy);
 	d->bd_bufmode = bd->bd_bufmode;
 }
 
 /*
  * Handle `netstat -B' stats request
  */
 static int
 bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	static const struct xbpf_d zerostats;
 	struct xbpf_d *xbdbuf, *xbd, tempstats;
 	int index, error;
 	struct bpf_if *bp;
 	struct bpf_d *bd;
 
 	/*
 	 * XXX This is not technically correct. It is possible for non
 	 * privileged users to open bpf devices. It would make sense
 	 * if the users who opened the devices were able to retrieve
 	 * the statistics for them, too.
 	 */
 	error = priv_check(req->td, PRIV_NET_BPF);
 	if (error)
 		return (error);
 	/*
 	 * Check to see if the user is requesting that the counters be
 	 * zeroed out.  Explicitly check that the supplied data is zeroed,
 	 * as we aren't allowing the user to set the counters currently.
 	 */
 	if (req->newptr != NULL) {
 		if (req->newlen != sizeof(tempstats))
 			return (EINVAL);
 		memset(&tempstats, 0, sizeof(tempstats));
 		error = SYSCTL_IN(req, &tempstats, sizeof(tempstats));
 		if (error)
 			return (error);
 		if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0)
 			return (EINVAL);
 		bpf_zero_counters();
 		return (0);
 	}
 	if (req->oldptr == NULL)
 		return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd)));
 	if (bpf_bpfd_cnt == 0)
 		return (SYSCTL_OUT(req, 0, 0));
 	xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK);
 	BPF_LOCK();
 	if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) {
 		BPF_UNLOCK();
 		free(xbdbuf, M_BPF);
 		return (ENOMEM);
 	}
 	index = 0;
 	CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		/* Send writers-only first */
 		CK_LIST_FOREACH(bd, &bp->bif_wlist, bd_next) {
 			xbd = &xbdbuf[index++];
 			bpfstats_fill_xbpf(xbd, bd);
 		}
 		CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
 			xbd = &xbdbuf[index++];
 			bpfstats_fill_xbpf(xbd, bd);
 		}
 	}
 	BPF_UNLOCK();
 	error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd));
 	free(xbdbuf, M_BPF);
 	return (error);
 }
 
 SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,bpf_drvinit,NULL);
 
 #else /* !DEV_BPF && !NETGRAPH_BPF */
 
 /*
  * NOP stubs to allow bpf-using drivers to load and function.
  *
  * A 'better' implementation would allow the core bpf functionality
  * to be loaded at runtime.
  */
 
 void
 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 {
 }
 
 void
 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 {
 }
 
 void
 bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m)
 {
 }
 
 void
 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
 {
 
 	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
 }
 
 void
 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
 {
 
 	*driverp = (struct bpf_if *)&dead_bpf_if;
 }
 
 void
 bpfdetach(struct ifnet *ifp)
 {
 }
 
 u_int
 bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
 {
 	return -1;	/* "no filter" behaviour */
 }
 
 int
 bpf_validate(const struct bpf_insn *f, int len)
 {
 	return 0;		/* false */
 }
 
 #endif /* !DEV_BPF && !NETGRAPH_BPF */
 
 #ifdef DDB
 static void
 bpf_show_bpf_if(struct bpf_if *bpf_if)
 {
 
 	if (bpf_if == NULL)
 		return;
 	db_printf("%p:\n", bpf_if);
 #define	BPF_DB_PRINTF(f, e)	db_printf("   %s = " f "\n", #e, bpf_if->e);
 	/* bif_ext.bif_next */
 	/* bif_ext.bif_dlist */
 	BPF_DB_PRINTF("%#x", bif_dlt);
 	BPF_DB_PRINTF("%u", bif_hdrlen);
 	/* bif_wlist */
 	BPF_DB_PRINTF("%p", bif_ifp);
 	BPF_DB_PRINTF("%p", bif_bpf);
 	BPF_DB_PRINTF("%u", bif_refcnt);
 }
 
 DB_SHOW_COMMAND(bpf_if, db_show_bpf_if)
 {
 
 	if (!have_addr) {
 		db_printf("usage: show bpf_if <struct bpf_if *>\n");
 		return;
 	}
 
 	bpf_show_bpf_if((struct bpf_if *)addr);
 }
 #endif
Index: head/sys/net/if.c
===================================================================
--- head/sys/net/if.c	(revision 348253)
+++ head/sys/net/if.c	(revision 348254)
@@ -1,4598 +1,4600 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1980, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if.c	8.5 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #include "opt_inet6.h"
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/domainset.h>
 #include <sys/sbuf.h>
 #include <sys/bus.h>
 #include <sys/epoch.h>
 #include <sys/mbuf.h>
 #include <sys/systm.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/refcount.h>
 #include <sys/module.h>
 #include <sys/rwlock.h>
 #include <sys/sockio.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/taskqueue.h>
 #include <sys/domain.h>
 #include <sys/jail.h>
 #include <sys/priv.h>
 
 #include <machine/stdarg.h>
 #include <vm/uma.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_arp.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/if_vlan_var.h>
 #include <net/radix.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <net/ethernet.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_carp.h>
 #ifdef INET
 #include <netinet/if_ether.h>
 #include <netinet/netdump/netdump.h>
 #endif /* INET */
 #ifdef INET6
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
 #endif /* INET6 */
 #endif /* INET || INET6 */
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Consumers of struct ifreq such as tcpdump assume no pad between ifr_name
  * and ifr_ifru when it is used in SIOCGIFCONF.
  */
 _Static_assert(sizeof(((struct ifreq *)0)->ifr_name) ==
     offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru");
 
 __read_mostly epoch_t net_epoch_preempt;
 __read_mostly epoch_t net_epoch;
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <compat/freebsd32/freebsd32.h>
 
 struct ifreq_buffer32 {
 	uint32_t	length;		/* (size_t) */
 	uint32_t	buffer;		/* (void *) */
 };
 
 /*
  * Interface request structure used for socket
  * ioctl's.  All interface ioctl's must have parameter
  * definitions which begin with ifr_name.  The
  * remainder may be interface specific.
  */
 struct ifreq32 {
 	char	ifr_name[IFNAMSIZ];		/* if name, e.g. "en0" */
 	union {
 		struct sockaddr	ifru_addr;
 		struct sockaddr	ifru_dstaddr;
 		struct sockaddr	ifru_broadaddr;
 		struct ifreq_buffer32 ifru_buffer;
 		short		ifru_flags[2];
 		short		ifru_index;
 		int		ifru_jid;
 		int		ifru_metric;
 		int		ifru_mtu;
 		int		ifru_phys;
 		int		ifru_media;
 		uint32_t	ifru_data;
 		int		ifru_cap[2];
 		u_int		ifru_fib;
 		u_char		ifru_vlan_pcp;
 	} ifr_ifru;
 };
 CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32));
 CTASSERT(__offsetof(struct ifreq, ifr_ifru) ==
     __offsetof(struct ifreq32, ifr_ifru));
 
 struct ifgroupreq32 {
 	char	ifgr_name[IFNAMSIZ];
 	u_int	ifgr_len;
 	union {
 		char		ifgru_group[IFNAMSIZ];
 		uint32_t	ifgru_groups;
 	} ifgr_ifgru;
 };
 
 struct ifmediareq32 {
 	char		ifm_name[IFNAMSIZ];
 	int		ifm_current;
 	int		ifm_mask;
 	int		ifm_status;
 	int		ifm_active;
 	int		ifm_count;
 	uint32_t	ifm_ulist;	/* (int *) */
 };
 #define	SIOCGIFMEDIA32	_IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32)
 #define	SIOCGIFXMEDIA32	_IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32)
 
 #define	_CASE_IOC_IFGROUPREQ_32(cmd)				\
     _IOC_NEWTYPE((cmd), struct ifgroupreq32): case
 #else /* !COMPAT_FREEBSD32 */
 #define _CASE_IOC_IFGROUPREQ_32(cmd)
 #endif /* !COMPAT_FREEBSD32 */
 
 #define CASE_IOC_IFGROUPREQ(cmd)	\
     _CASE_IOC_IFGROUPREQ_32(cmd)	\
     (cmd)
 
 union ifreq_union {
 	struct ifreq	ifr;
 #ifdef COMPAT_FREEBSD32
 	struct ifreq32	ifr32;
 #endif
 };
 
 union ifgroupreq_union {
 	struct ifgroupreq ifgr;
 #ifdef COMPAT_FREEBSD32
 	struct ifgroupreq32 ifgr32;
 #endif
 };
 
 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
 
 SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN,
     &ifqmaxlen, 0, "max send queue size");
 
 /* Log link state change events */
 static int log_link_state_change = 1;
 
 SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW,
 	&log_link_state_change, 0,
 	"log interface link state change events");
 
 /* Log promiscuous mode change events */
 static int log_promisc_mode_change = 1;
 
 SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN,
 	&log_promisc_mode_change, 1,
 	"log promiscuous mode change events");
 
 /* Interface description */
 static unsigned int ifdescr_maxlen = 1024;
 SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW,
 	&ifdescr_maxlen, 0,
 	"administrative maximum length for interface description");
 
 static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions");
 
 /* global sx for non-critical path ifdescr */
 static struct sx ifdescr_sx;
 SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr");
 
 void	(*ng_ether_link_state_p)(struct ifnet *ifp, int state);
 void	(*lagg_linkstate_p)(struct ifnet *ifp, int state);
 /* These are external hooks for CARP. */
 void	(*carp_linkstate_p)(struct ifnet *ifp);
 void	(*carp_demote_adj_p)(int, char *);
 int	(*carp_master_p)(struct ifaddr *);
 #if defined(INET) || defined(INET6)
 int	(*carp_forus_p)(struct ifnet *ifp, u_char *dhost);
 int	(*carp_output_p)(struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *sa);
 int	(*carp_ioctl_p)(struct ifreq *, u_long, struct thread *);   
 int	(*carp_attach_p)(struct ifaddr *, int);
 void	(*carp_detach_p)(struct ifaddr *, bool);
 #endif
 #ifdef INET
 int	(*carp_iamatch_p)(struct ifaddr *, uint8_t **);
 #endif
 #ifdef INET6
 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6);
 caddr_t	(*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m,
     const struct in6_addr *taddr);
 #endif
 
 struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL;
 
 /*
  * XXX: Style; these should be sorted alphabetically, and unprototyped
  * static functions should be prototyped. Currently they are sorted by
  * declaration order.
  */
 static void	if_attachdomain(void *);
 static void	if_attachdomain1(struct ifnet *);
 static int	ifconf(u_long, caddr_t);
 static void	*if_grow(void);
 static void	if_input_default(struct ifnet *, struct mbuf *);
 static int	if_requestencap_default(struct ifnet *, struct if_encap_req *);
 static void	if_route(struct ifnet *, int flag, int fam);
 static int	if_setflag(struct ifnet *, int, int, int *, int);
 static int	if_transmit(struct ifnet *ifp, struct mbuf *m);
 static void	if_unroute(struct ifnet *, int flag, int fam);
 static int	if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int);
 static void	do_link_state_change(void *, int);
 static int	if_getgroup(struct ifgroupreq *, struct ifnet *);
 static int	if_getgroupmembers(struct ifgroupreq *);
 static void	if_delgroups(struct ifnet *);
 static void	if_attach_internal(struct ifnet *, int, struct if_clone *);
 static int	if_detach_internal(struct ifnet *, int, struct if_clone **);
 #ifdef VIMAGE
 static void	if_vmove(struct ifnet *, struct vnet *);
 #endif
 
 #ifdef INET6
 /*
  * XXX: declare here to avoid to include many inet6 related files..
  * should be more generalized?
  */
 extern void	nd6_setmtu(struct ifnet *);
 #endif
 
 /* ipsec helper hooks */
 VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]);
 VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]);
 
 VNET_DEFINE(int, if_index);
 int	ifqmaxlen = IFQ_MAXLEN;
 VNET_DEFINE(struct ifnethead, ifnet);	/* depend on static init XXX */
 VNET_DEFINE(struct ifgrouphead, ifg_head);
 
 VNET_DEFINE_STATIC(int, if_indexlim) = 8;
 
 /* Table of ifnet by index. */
 VNET_DEFINE(struct ifnet **, ifindex_table);
 
 #define	V_if_indexlim		VNET(if_indexlim)
 #define	V_ifindex_table		VNET(ifindex_table)
 
 /*
  * The global network interface list (V_ifnet) and related state (such as
  * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and
  * an rwlock.  Either may be acquired shared to stablize the list, but both
  * must be acquired writable to modify the list.  This model allows us to
  * both stablize the interface list during interrupt thread processing, but
  * also to stablize it over long-running ioctls, without introducing priority
  * inversions and deadlocks.
  */
 struct rwlock ifnet_rwlock;
 RW_SYSINIT_FLAGS(ifnet_rw, &ifnet_rwlock, "ifnet_rw", RW_RECURSE);
 struct sx ifnet_sxlock;
 SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE);
 
 /*
  * The allocation of network interfaces is a rather non-atomic affair; we
  * need to select an index before we are ready to expose the interface for
  * use, so will use this pointer value to indicate reservation.
  */
 #define	IFNET_HOLD	(void *)(uintptr_t)(-1)
 
 static	if_com_alloc_t *if_com_alloc[256];
 static	if_com_free_t *if_com_free[256];
 
 static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals");
 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
 
 struct ifnet *
 ifnet_byindex_locked(u_short idx)
 {
 
 	if (idx > V_if_index)
 		return (NULL);
 	if (V_ifindex_table[idx] == IFNET_HOLD)
 		return (NULL);
 	return (V_ifindex_table[idx]);
 }
 
 struct ifnet *
 ifnet_byindex(u_short idx)
 {
 	struct ifnet *ifp;
 
 	ifp = ifnet_byindex_locked(idx);
 	return (ifp);
 }
 
 struct ifnet *
 ifnet_byindex_ref(u_short idx)
 {
 	struct epoch_tracker et;
 	struct ifnet *ifp;
 
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex_locked(idx);
 	if (ifp == NULL || (ifp->if_flags & IFF_DYING)) {
 		NET_EPOCH_EXIT(et);
 		return (NULL);
 	}
 	if_ref(ifp);
 	NET_EPOCH_EXIT(et);
 	return (ifp);
 }
 
 /*
  * Allocate an ifindex array entry; return 0 on success or an error on
  * failure.
  */
 static u_short
 ifindex_alloc(void **old)
 {
 	u_short idx;
 
 	IFNET_WLOCK_ASSERT();
 	/*
 	 * Try to find an empty slot below V_if_index.  If we fail, take the
 	 * next slot.
 	 */
 	for (idx = 1; idx <= V_if_index; idx++) {
 		if (V_ifindex_table[idx] == NULL)
 			break;
 	}
 
 	/* Catch if_index overflow. */
 	if (idx >= V_if_indexlim) {
 		*old = if_grow();
 		return (USHRT_MAX);
 	}
 	if (idx > V_if_index)
 		V_if_index = idx;
 	return (idx);
 }
 
 static void
 ifindex_free_locked(u_short idx)
 {
 
 	IFNET_WLOCK_ASSERT();
 
 	V_ifindex_table[idx] = NULL;
 	while (V_if_index > 0 &&
 	    V_ifindex_table[V_if_index] == NULL)
 		V_if_index--;
 }
 
 static void
 ifindex_free(u_short idx)
 {
 
 	IFNET_WLOCK();
 	ifindex_free_locked(idx);
 	IFNET_WUNLOCK();
 }
 
 static void
 ifnet_setbyindex(u_short idx, struct ifnet *ifp)
 {
 
 	V_ifindex_table[idx] = ifp;
 }
 
 struct ifaddr *
 ifaddr_byindex(u_short idx)
 {
 	struct epoch_tracker et;
 	struct ifnet *ifp;
 	struct ifaddr *ifa = NULL;
 
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex_locked(idx);
 	if (ifp != NULL && (ifa = ifp->if_addr) != NULL)
 		ifa_ref(ifa);
 	NET_EPOCH_EXIT(et);
 	return (ifa);
 }
 
 /*
  * Network interface utility routines.
  *
  * Routines with ifa_ifwith* names take sockaddr *'s as
  * parameters.
  */
 
 static void
 vnet_if_init(const void *unused __unused)
 {
 	void *old;
 
 	CK_STAILQ_INIT(&V_ifnet);
 	CK_STAILQ_INIT(&V_ifg_head);
 	IFNET_WLOCK();
 	old = if_grow();				/* create initial table */
 	IFNET_WUNLOCK();
 	epoch_wait_preempt(net_epoch_preempt);
 	free(old, M_IFNET);
 	vnet_if_clone_init();
 }
 VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init,
     NULL);
 
 #ifdef VIMAGE
 static void
 vnet_if_uninit(const void *unused __unused)
 {
 
 	VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p "
 	    "not empty", __func__, __LINE__, &V_ifnet));
 	VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p "
 	    "not empty", __func__, __LINE__, &V_ifg_head));
 
 	free((caddr_t)V_ifindex_table, M_IFNET);
 }
 VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST,
     vnet_if_uninit, NULL);
 
 static void
 vnet_if_return(const void *unused __unused)
 {
 	struct ifnet *ifp, *nifp;
 
 	/* Return all inherited interfaces to their parent vnets. */
 	CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) {
 		if (ifp->if_home_vnet != ifp->if_vnet)
 			if_vmove(ifp, ifp->if_home_vnet);
 	}
 }
 VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY,
     vnet_if_return, NULL);
 #endif
 
 
 static void *
 if_grow(void)
 {
 	int oldlim;
 	u_int n;
 	struct ifnet **e;
 	void *old;
 
 	old = NULL;
 	IFNET_WLOCK_ASSERT();
 	oldlim = V_if_indexlim;
 	IFNET_WUNLOCK();
 	n = (oldlim << 1) * sizeof(*e);
 	e = malloc(n, M_IFNET, M_WAITOK | M_ZERO);
 	IFNET_WLOCK();
 	if (V_if_indexlim != oldlim) {
 		free(e, M_IFNET);
 		return (NULL);
 	}
 	if (V_ifindex_table != NULL) {
 		memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2);
 		old = V_ifindex_table;
 	}
 	V_if_indexlim <<= 1;
 	V_ifindex_table = e;
 	return (old);
 }
 
 /*
  * Allocate a struct ifnet and an index for an interface.  A layer 2
  * common structure will also be allocated if an allocation routine is
  * registered for the passed type.
  */
 struct ifnet *
 if_alloc_domain(u_char type, int numa_domain)
 {
 	struct ifnet *ifp;
 	u_short idx;
 	void *old;
 
 	KASSERT(numa_domain <= IF_NODOM, ("numa_domain too large"));
 	if (numa_domain == IF_NODOM)
 		ifp = malloc(sizeof(struct ifnet), M_IFNET,
 		    M_WAITOK | M_ZERO);
 	else
 		ifp = malloc_domainset(sizeof(struct ifnet), M_IFNET,
 		    DOMAINSET_PREF(numa_domain), M_WAITOK | M_ZERO);
  restart:
 	IFNET_WLOCK();
 	idx = ifindex_alloc(&old);
 	if (__predict_false(idx == USHRT_MAX)) {
 		IFNET_WUNLOCK();
 		epoch_wait_preempt(net_epoch_preempt);
 		free(old, M_IFNET);
 		goto restart;
 	}
 	ifnet_setbyindex(idx, IFNET_HOLD);
 	IFNET_WUNLOCK();
 	ifp->if_index = idx;
 	ifp->if_type = type;
 	ifp->if_alloctype = type;
 	ifp->if_numa_domain = numa_domain;
 #ifdef VIMAGE
 	ifp->if_vnet = curvnet;
 #endif
 	if (if_com_alloc[type] != NULL) {
 		ifp->if_l2com = if_com_alloc[type](type, ifp);
 		if (ifp->if_l2com == NULL) {
 			free(ifp, M_IFNET);
 			ifindex_free(idx);
 			return (NULL);
 		}
 	}
 
 	IF_ADDR_LOCK_INIT(ifp);
 	TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp);
 	ifp->if_afdata_initialized = 0;
 	IF_AFDATA_LOCK_INIT(ifp);
 	CK_STAILQ_INIT(&ifp->if_addrhead);
 	CK_STAILQ_INIT(&ifp->if_multiaddrs);
 	CK_STAILQ_INIT(&ifp->if_groups);
 #ifdef MAC
 	mac_ifnet_init(ifp);
 #endif
 	ifq_init(&ifp->if_snd, ifp);
 
 	refcount_init(&ifp->if_refcount, 1);	/* Index reference. */
 	for (int i = 0; i < IFCOUNTERS; i++)
 		ifp->if_counters[i] = counter_u64_alloc(M_WAITOK);
 	ifp->if_get_counter = if_get_counter_default;
 	ifp->if_pcp = IFNET_PCP_NONE;
 	ifnet_setbyindex(ifp->if_index, ifp);
 	return (ifp);
 }
 
 struct ifnet *
 if_alloc_dev(u_char type, device_t dev)
 {
 	int numa_domain;
 
 	if (dev == NULL || bus_get_domain(dev, &numa_domain) != 0)
 		return (if_alloc_domain(type, IF_NODOM));
 	return (if_alloc_domain(type, numa_domain));
 }
 
 struct ifnet *
 if_alloc(u_char type)
 {
 
 	return (if_alloc_domain(type, IF_NODOM));
 }
 /*
  * Do the actual work of freeing a struct ifnet, and layer 2 common
  * structure.  This call is made when the last reference to an
  * interface is released.
  */
 static void
 if_free_internal(struct ifnet *ifp)
 {
 
 	KASSERT((ifp->if_flags & IFF_DYING),
 	    ("if_free_internal: interface not dying"));
 
 	if (if_com_free[ifp->if_alloctype] != NULL)
 		if_com_free[ifp->if_alloctype](ifp->if_l2com,
 		    ifp->if_alloctype);
 
 #ifdef MAC
 	mac_ifnet_destroy(ifp);
 #endif /* MAC */
 	IF_AFDATA_DESTROY(ifp);
 	IF_ADDR_LOCK_DESTROY(ifp);
 	ifq_delete(&ifp->if_snd);
 
 	for (int i = 0; i < IFCOUNTERS; i++)
 		counter_u64_free(ifp->if_counters[i]);
 
 	free(ifp->if_description, M_IFDESCR);
 	free(ifp->if_hw_addr, M_IFADDR);
 	if (ifp->if_numa_domain == IF_NODOM)
 		free(ifp, M_IFNET);
 	else
 		free_domain(ifp, M_IFNET);
 }
 
 static void
 if_destroy(epoch_context_t ctx)
 {
 	struct ifnet *ifp;
 
 	ifp = __containerof(ctx, struct ifnet, if_epoch_ctx);
 	if_free_internal(ifp);
 }
 
 /*
  * Deregister an interface and free the associated storage.
  */
 void
 if_free(struct ifnet *ifp)
 {
 
 	ifp->if_flags |= IFF_DYING;			/* XXX: Locking */
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	IFNET_WLOCK();
 	KASSERT(ifp == ifnet_byindex_locked(ifp->if_index),
 	    ("%s: freeing unallocated ifnet", ifp->if_xname));
 
 	ifindex_free_locked(ifp->if_index);
 	IFNET_WUNLOCK();
 
 	if (refcount_release(&ifp->if_refcount))
 		epoch_call(net_epoch_preempt, &ifp->if_epoch_ctx, if_destroy);
 	CURVNET_RESTORE();
 }
 
 /*
  * Interfaces to keep an ifnet type-stable despite the possibility of the
  * driver calling if_free().  If there are additional references, we defer
  * freeing the underlying data structure.
  */
 void
 if_ref(struct ifnet *ifp)
 {
 
 	/* We don't assert the ifnet list lock here, but arguably should. */
 	refcount_acquire(&ifp->if_refcount);
 }
 
 void
 if_rele(struct ifnet *ifp)
 {
 
 	if (!refcount_release(&ifp->if_refcount))
 		return;
 	epoch_call(net_epoch_preempt, &ifp->if_epoch_ctx, if_destroy);
 }
 
 void
 ifq_init(struct ifaltq *ifq, struct ifnet *ifp)
 {
 	
 	mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
 
 	if (ifq->ifq_maxlen == 0) 
 		ifq->ifq_maxlen = ifqmaxlen;
 
 	ifq->altq_type = 0;
 	ifq->altq_disc = NULL;
 	ifq->altq_flags &= ALTQF_CANTCHANGE;
 	ifq->altq_tbr  = NULL;
 	ifq->altq_ifp  = ifp;
 }
 
 void
 ifq_delete(struct ifaltq *ifq)
 {
 	mtx_destroy(&ifq->ifq_mtx);
 }
 
 /*
  * Perform generic interface initialization tasks and attach the interface
  * to the list of "active" interfaces.  If vmove flag is set on entry
  * to if_attach_internal(), perform only a limited subset of initialization
  * tasks, given that we are moving from one vnet to another an ifnet which
  * has already been fully initialized.
  *
  * Note that if_detach_internal() removes group membership unconditionally
  * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL.
  * Thus, when if_vmove() is applied to a cloned interface, group membership
  * is lost while a cloned one always joins a group whose name is
  * ifc->ifc_name.  To recover this after if_detach_internal() and
  * if_attach_internal(), the cloner should be specified to
  * if_attach_internal() via ifc.  If it is non-NULL, if_attach_internal()
  * attempts to join a group whose name is ifc->ifc_name.
  *
  * XXX:
  *  - The decision to return void and thus require this function to
  *    succeed is questionable.
  *  - We should probably do more sanity checking.  For instance we don't
  *    do anything to insure if_xname is unique or non-empty.
  */
 void
 if_attach(struct ifnet *ifp)
 {
 
 	if_attach_internal(ifp, 0, NULL);
 }
 
 /*
  * Compute the least common TSO limit.
  */
 void
 if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax)
 {
 	/*
 	 * 1) If there is no limit currently, take the limit from
 	 * the network adapter.
 	 *
 	 * 2) If the network adapter has a limit below the current
 	 * limit, apply it.
 	 */
 	if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 &&
 	    ifp->if_hw_tsomax < pmax->tsomaxbytes)) {
 		pmax->tsomaxbytes = ifp->if_hw_tsomax;
 	}
 	if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 &&
 	    ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) {
 		pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
 	}
 	if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 &&
 	    ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) {
 		pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
 	}
 }
 
 /*
  * Update TSO limit of a network adapter.
  *
  * Returns zero if no change. Else non-zero.
  */
 int
 if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax)
 {
 	int retval = 0;
 	if (ifp->if_hw_tsomax != pmax->tsomaxbytes) {
 		ifp->if_hw_tsomax = pmax->tsomaxbytes;
 		retval++;
 	}
 	if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) {
 		ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize;
 		retval++;
 	}
 	if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) {
 		ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount;
 		retval++;
 	}
 	return (retval);
 }
 
 static void
 if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc)
 {
 	unsigned socksize, ifasize;
 	int namelen, masklen;
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 
 	if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index))
 		panic ("%s: BUG: if_attach called without if_alloc'd input()\n",
 		    ifp->if_xname);
 
 #ifdef VIMAGE
 	ifp->if_vnet = curvnet;
 	if (ifp->if_home_vnet == NULL)
 		ifp->if_home_vnet = curvnet;
 #endif
 
 	if_addgroup(ifp, IFG_ALL);
 
 	/* Restore group membership for cloned interfaces. */
 	if (vmove && ifc != NULL)
 		if_clone_addgroup(ifp, ifc);
 
 	getmicrotime(&ifp->if_lastchange);
 	ifp->if_epoch = time_uptime;
 
 	KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) ||
 	    (ifp->if_transmit != NULL && ifp->if_qflush != NULL),
 	    ("transmit and qflush must both either be set or both be NULL"));
 	if (ifp->if_transmit == NULL) {
 		ifp->if_transmit = if_transmit;
 		ifp->if_qflush = if_qflush;
 	}
 	if (ifp->if_input == NULL)
 		ifp->if_input = if_input_default;
 
 	if (ifp->if_requestencap == NULL)
 		ifp->if_requestencap = if_requestencap_default;
 
 	if (!vmove) {
 #ifdef MAC
 		mac_ifnet_create(ifp);
 #endif
 
 		/*
 		 * Create a Link Level name for this device.
 		 */
 		namelen = strlen(ifp->if_xname);
 		/*
 		 * Always save enough space for any possiable name so we
 		 * can do a rename in place later.
 		 */
 		masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ;
 		socksize = masklen + ifp->if_addrlen;
 		if (socksize < sizeof(*sdl))
 			socksize = sizeof(*sdl);
 		socksize = roundup2(socksize, sizeof(long));
 		ifasize = sizeof(*ifa) + 2 * socksize;
 		ifa = ifa_alloc(ifasize, M_WAITOK);
 		sdl = (struct sockaddr_dl *)(ifa + 1);
 		sdl->sdl_len = socksize;
 		sdl->sdl_family = AF_LINK;
 		bcopy(ifp->if_xname, sdl->sdl_data, namelen);
 		sdl->sdl_nlen = namelen;
 		sdl->sdl_index = ifp->if_index;
 		sdl->sdl_type = ifp->if_type;
 		ifp->if_addr = ifa;
 		ifa->ifa_ifp = ifp;
 		ifa->ifa_addr = (struct sockaddr *)sdl;
 		sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
 		ifa->ifa_netmask = (struct sockaddr *)sdl;
 		sdl->sdl_len = masklen;
 		while (namelen != 0)
 			sdl->sdl_data[--namelen] = 0xff;
 		CK_STAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
 		/* Reliably crash if used uninitialized. */
 		ifp->if_broadcastaddr = NULL;
 
 		if (ifp->if_type == IFT_ETHER) {
 			ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR,
 			    M_WAITOK | M_ZERO);
 		}
 
 #if defined(INET) || defined(INET6)
 		/* Use defaults for TSO, if nothing is set */
 		if (ifp->if_hw_tsomax == 0 &&
 		    ifp->if_hw_tsomaxsegcount == 0 &&
 		    ifp->if_hw_tsomaxsegsize == 0) {
 			/*
 			 * The TSO defaults needs to be such that an
 			 * NFS mbuf list of 35 mbufs totalling just
 			 * below 64K works and that a chain of mbufs
 			 * can be defragged into at most 32 segments:
 			 */
 			ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) -
 			    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
 			ifp->if_hw_tsomaxsegcount = 35;
 			ifp->if_hw_tsomaxsegsize = 2048;	/* 2K */
 
 			/* XXX some drivers set IFCAP_TSO after ethernet attach */
 			if (ifp->if_capabilities & IFCAP_TSO) {
 				if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n",
 				    ifp->if_hw_tsomax,
 				    ifp->if_hw_tsomaxsegcount,
 				    ifp->if_hw_tsomaxsegsize);
 			}
 		}
 #endif
 	}
 #ifdef VIMAGE
 	else {
 		/*
 		 * Update the interface index in the link layer address
 		 * of the interface.
 		 */
 		for (ifa = ifp->if_addr; ifa != NULL;
 		    ifa = CK_STAILQ_NEXT(ifa, ifa_link)) {
 			if (ifa->ifa_addr->sa_family == AF_LINK) {
 				sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 				sdl->sdl_index = ifp->if_index;
 			}
 		}
 	}
 #endif
 
 	IFNET_WLOCK();
 	CK_STAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link);
 #ifdef VIMAGE
 	curvnet->vnet_ifcnt++;
 #endif
 	IFNET_WUNLOCK();
 
 	if (domain_init_status >= 2)
 		if_attachdomain1(ifp);
 
 	EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
 
 	/* Announce the interface. */
 	rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
 }
 
 static void
 if_epochalloc(void *dummy __unused)
 {
 
 	net_epoch_preempt = epoch_alloc(EPOCH_PREEMPT);
 	net_epoch = epoch_alloc(0);
 }
 SYSINIT(ifepochalloc, SI_SUB_TASKQ + 1, SI_ORDER_ANY,
     if_epochalloc, NULL);
 
 static void
 if_attachdomain(void *dummy)
 {
 	struct ifnet *ifp;
 
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link)
 		if_attachdomain1(ifp);
 }
 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND,
     if_attachdomain, NULL);
 
 static void
 if_attachdomain1(struct ifnet *ifp)
 {
 	struct domain *dp;
 
 	/*
 	 * Since dp->dom_ifattach calls malloc() with M_WAITOK, we
 	 * cannot lock ifp->if_afdata initialization, entirely.
 	 */
 	IF_AFDATA_LOCK(ifp);
 	if (ifp->if_afdata_initialized >= domain_init_status) {
 		IF_AFDATA_UNLOCK(ifp);
 		log(LOG_WARNING, "%s called more than once on %s\n",
 		    __func__, ifp->if_xname);
 		return;
 	}
 	ifp->if_afdata_initialized = domain_init_status;
 	IF_AFDATA_UNLOCK(ifp);
 
 	/* address family dependent data region */
 	bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
 	for (dp = domains; dp; dp = dp->dom_next) {
 		if (dp->dom_ifattach)
 			ifp->if_afdata[dp->dom_family] =
 			    (*dp->dom_ifattach)(ifp);
 	}
 }
 
 /*
  * Remove any unicast or broadcast network addresses from an interface.
  */
 void
 if_purgeaddrs(struct ifnet *ifp)
 {
 	struct ifaddr *ifa;
 
 	while (1) {
 		struct epoch_tracker et;
 
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_LINK)
 				break;
 		}
 		NET_EPOCH_EXIT(et);
 
 		if (ifa == NULL)
 			break;
 #ifdef INET
 		/* XXX: Ugly!! ad hoc just for INET */
 		if (ifa->ifa_addr->sa_family == AF_INET) {
 			struct ifaliasreq ifr;
 
 			bzero(&ifr, sizeof(ifr));
 			ifr.ifra_addr = *ifa->ifa_addr;
 			if (ifa->ifa_dstaddr)
 				ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
 			if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
 			    NULL) == 0)
 				continue;
 		}
 #endif /* INET */
 #ifdef INET6
 		if (ifa->ifa_addr->sa_family == AF_INET6) {
 			in6_purgeaddr(ifa);
 			/* ifp_addrhead is already updated */
 			continue;
 		}
 #endif /* INET6 */
 		IF_ADDR_WLOCK(ifp);
 		CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link);
 		IF_ADDR_WUNLOCK(ifp);
 		ifa_free(ifa);
 	}
 }
 
 /*
  * Remove any multicast network addresses from an interface when an ifnet
  * is going away.
  */
 static void
 if_purgemaddrs(struct ifnet *ifp)
 {
 	struct ifmultiaddr *ifma;
 
 	IF_ADDR_WLOCK(ifp);
 	while (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) {
 		ifma = CK_STAILQ_FIRST(&ifp->if_multiaddrs);
 		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
 		if_delmulti_locked(ifp, ifma, 1);
 	}
 	IF_ADDR_WUNLOCK(ifp);
 }
 
 /*
  * Detach an interface, removing it from the list of "active" interfaces.
  * If vmove flag is set on entry to if_detach_internal(), perform only a
  * limited subset of cleanup tasks, given that we are moving an ifnet from
  * one vnet to another, where it must be fully operational.
  *
  * XXXRW: There are some significant questions about event ordering, and
  * how to prevent things from starting to use the interface during detach.
  */
 void
 if_detach(struct ifnet *ifp)
 {
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	if_detach_internal(ifp, 0, NULL);
 	CURVNET_RESTORE();
 }
 
 /*
  * The vmove flag, if set, indicates that we are called from a callpath
  * that is moving an interface to a different vnet instance.
  *
  * The shutdown flag, if set, indicates that we are called in the
  * process of shutting down a vnet instance.  Currently only the
  * vnet_if_return SYSUNINIT function sets it.  Note: we can be called
  * on a vnet instance shutdown without this flag being set, e.g., when
  * the cloned interfaces are destoyed as first thing of teardown.
  */
 static int
 if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp)
 {
 	struct ifaddr *ifa;
 	int i;
 	struct domain *dp;
  	struct ifnet *iter;
  	int found = 0;
 #ifdef VIMAGE
 	int shutdown;
 
 	shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET &&
 		 ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0;
 #endif
 	IFNET_WLOCK();
 	CK_STAILQ_FOREACH(iter, &V_ifnet, if_link)
 		if (iter == ifp) {
 			CK_STAILQ_REMOVE(&V_ifnet, ifp, ifnet, if_link);
 			if (!vmove)
 				ifp->if_flags |= IFF_DYING;
 			found = 1;
 			break;
 		}
 	IFNET_WUNLOCK();
 	if (!found) {
 		/*
 		 * While we would want to panic here, we cannot
 		 * guarantee that the interface is indeed still on
 		 * the list given we don't hold locks all the way.
 		 */
 		return (ENOENT);
 #if 0
 		if (vmove)
 			panic("%s: ifp=%p not on the ifnet tailq %p",
 			    __func__, ifp, &V_ifnet);
 		else
 			return; /* XXX this should panic as well? */
 #endif
 	}
 
 	/*
 	 * At this point we know the interface still was on the ifnet list
 	 * and we removed it so we are in a stable state.
 	 */
 #ifdef VIMAGE
 	curvnet->vnet_ifcnt--;
 #endif
 	epoch_wait_preempt(net_epoch_preempt);
 	/*
 	 * In any case (destroy or vmove) detach us from the groups
 	 * and remove/wait for pending events on the taskq.
 	 * XXX-BZ in theory an interface could still enqueue a taskq change?
 	 */
 	if_delgroups(ifp);
 
 	taskqueue_drain(taskqueue_swi, &ifp->if_linktask);
 
 	/*
 	 * Check if this is a cloned interface or not. Must do even if
 	 * shutting down as a if_vmove_reclaim() would move the ifp and
 	 * the if_clone_addgroup() will have a corrupted string overwise
 	 * from a gibberish pointer.
 	 */
 	if (vmove && ifcp != NULL)
 		*ifcp = if_clone_findifc(ifp);
 
 	if_down(ifp);
 
 #ifdef VIMAGE
 	/*
 	 * On VNET shutdown abort here as the stack teardown will do all
 	 * the work top-down for us.
 	 */
 	if (shutdown) {
 		/* Give interface users the chance to clean up. */
 		EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
 
 		/*
 		 * In case of a vmove we are done here without error.
 		 * If we would signal an error it would lead to the same
 		 * abort as if we did not find the ifnet anymore.
 		 * if_detach() calls us in void context and does not care
 		 * about an early abort notification, so life is splendid :)
 		 */
 		goto finish_vnet_shutdown;
 	}
 #endif
 
 	/*
 	 * At this point we are not tearing down a VNET and are either
 	 * going to destroy or vmove the interface and have to cleanup
 	 * accordingly.
 	 */
 
 	/*
 	 * Remove routes and flush queues.
 	 */
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(&ifp->if_snd))
 		altq_disable(&ifp->if_snd);
 	if (ALTQ_IS_ATTACHED(&ifp->if_snd))
 		altq_detach(&ifp->if_snd);
 #endif
 
 	if_purgeaddrs(ifp);
 
 #ifdef INET
 	in_ifdetach(ifp);
 #endif
 
 #ifdef INET6
 	/*
 	 * Remove all IPv6 kernel structs related to ifp.  This should be done
 	 * before removing routing entries below, since IPv6 interface direct
 	 * routes are expected to be removed by the IPv6-specific kernel API.
 	 * Otherwise, the kernel will detect some inconsistency and bark it.
 	 */
 	in6_ifdetach(ifp);
 #endif
 	if_purgemaddrs(ifp);
 
 	/* Announce that the interface is gone. */
 	rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
 	EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
 
 	if (!vmove) {
 		/*
 		 * Prevent further calls into the device driver via ifnet.
 		 */
 		if_dead(ifp);
 
 		/*
 		 * Clean up all addresses.
 		 */
 		IF_ADDR_WLOCK(ifp);
 		if (!CK_STAILQ_EMPTY(&ifp->if_addrhead)) {
 			ifa = CK_STAILQ_FIRST(&ifp->if_addrhead);
 			CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link);
 			IF_ADDR_WUNLOCK(ifp);
 			ifa_free(ifa);
 		} else
 			IF_ADDR_WUNLOCK(ifp);
 	}
 
 	rt_flushifroutes(ifp);
 
 #ifdef VIMAGE
 finish_vnet_shutdown:
 #endif
 	/*
 	 * We cannot hold the lock over dom_ifdetach calls as they might
 	 * sleep, for example trying to drain a callout, thus open up the
 	 * theoretical race with re-attaching.
 	 */
 	IF_AFDATA_LOCK(ifp);
 	i = ifp->if_afdata_initialized;
 	ifp->if_afdata_initialized = 0;
 	IF_AFDATA_UNLOCK(ifp);
 	for (dp = domains; i > 0 && dp; dp = dp->dom_next) {
 		if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) {
 			(*dp->dom_ifdetach)(ifp,
 			    ifp->if_afdata[dp->dom_family]);
 			ifp->if_afdata[dp->dom_family] = NULL;
 		}
 	}
 
 	return (0);
 }
 
 #ifdef VIMAGE
 /*
  * if_vmove() performs a limited version of if_detach() in current
  * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg.
  * An attempt is made to shrink if_index in current vnet, find an
  * unused if_index in target vnet and calls if_grow() if necessary,
  * and finally find an unused if_xname for the target vnet.
  */
 static void
 if_vmove(struct ifnet *ifp, struct vnet *new_vnet)
 {
 	struct if_clone *ifc;
 	u_int bif_dlt, bif_hdrlen;
 	void *old;
 	int rc;
 
  	/*
 	 * if_detach_internal() will call the eventhandler to notify
 	 * interface departure.  That will detach if_bpf.  We need to
 	 * safe the dlt and hdrlen so we can re-attach it later.
 	 */
 	bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen);
 
 	/*
 	 * Detach from current vnet, but preserve LLADDR info, do not
 	 * mark as dead etc. so that the ifnet can be reattached later.
 	 * If we cannot find it, we lost the race to someone else.
 	 */
 	rc = if_detach_internal(ifp, 1, &ifc);
 	if (rc != 0)
 		return;
 
 	/*
 	 * Unlink the ifnet from ifindex_table[] in current vnet, and shrink
 	 * the if_index for that vnet if possible.
 	 *
 	 * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized,
 	 * or we'd lock on one vnet and unlock on another.
 	 */
 	IFNET_WLOCK();
 	ifindex_free_locked(ifp->if_index);
 	IFNET_WUNLOCK();
 
 	/*
 	 * Perform interface-specific reassignment tasks, if provided by
 	 * the driver.
 	 */
 	if (ifp->if_reassign != NULL)
 		ifp->if_reassign(ifp, new_vnet, NULL);
 
 	/*
 	 * Switch to the context of the target vnet.
 	 */
 	CURVNET_SET_QUIET(new_vnet);
  restart:
 	IFNET_WLOCK();
 	ifp->if_index = ifindex_alloc(&old);
 	if (__predict_false(ifp->if_index == USHRT_MAX)) {
 		IFNET_WUNLOCK();
 		epoch_wait_preempt(net_epoch_preempt);
 		free(old, M_IFNET);
 		goto restart;
 	}
 	ifnet_setbyindex(ifp->if_index, ifp);
 	IFNET_WUNLOCK();
 
 	if_attach_internal(ifp, 1, ifc);
 
 	if (ifp->if_bpf == NULL)
 		bpfattach(ifp, bif_dlt, bif_hdrlen);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Move an ifnet to or from another child prison/vnet, specified by the jail id.
  */
 static int
 if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid)
 {
 	struct prison *pr;
 	struct ifnet *difp;
 	int shutdown;
 
 	/* Try to find the prison within our visibility. */
 	sx_slock(&allprison_lock);
 	pr = prison_find_child(td->td_ucred->cr_prison, jid);
 	sx_sunlock(&allprison_lock);
 	if (pr == NULL)
 		return (ENXIO);
 	prison_hold_locked(pr);
 	mtx_unlock(&pr->pr_mtx);
 
 	/* Do not try to move the iface from and to the same prison. */
 	if (pr->pr_vnet == ifp->if_vnet) {
 		prison_free(pr);
 		return (EEXIST);
 	}
 
 	/* Make sure the named iface does not exists in the dst. prison/vnet. */
 	/* XXX Lock interfaces to avoid races. */
 	CURVNET_SET_QUIET(pr->pr_vnet);
 	difp = ifunit(ifname);
 	if (difp != NULL) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EEXIST);
 	}
 
 	/* Make sure the VNET is stable. */
 	shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET &&
 		 ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0;
 	if (shutdown) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EBUSY);
 	}
 	CURVNET_RESTORE();
 
 	/* Move the interface into the child jail/vnet. */
 	if_vmove(ifp, pr->pr_vnet);
 
 	/* Report the new if_xname back to the userland. */
 	sprintf(ifname, "%s", ifp->if_xname);
 
 	prison_free(pr);
 	return (0);
 }
 
 static int
 if_vmove_reclaim(struct thread *td, char *ifname, int jid)
 {
 	struct prison *pr;
 	struct vnet *vnet_dst;
 	struct ifnet *ifp;
  	int shutdown;
 
 	/* Try to find the prison within our visibility. */
 	sx_slock(&allprison_lock);
 	pr = prison_find_child(td->td_ucred->cr_prison, jid);
 	sx_sunlock(&allprison_lock);
 	if (pr == NULL)
 		return (ENXIO);
 	prison_hold_locked(pr);
 	mtx_unlock(&pr->pr_mtx);
 
 	/* Make sure the named iface exists in the source prison/vnet. */
 	CURVNET_SET(pr->pr_vnet);
 	ifp = ifunit(ifname);		/* XXX Lock to avoid races. */
 	if (ifp == NULL) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (ENXIO);
 	}
 
 	/* Do not try to move the iface from and to the same prison. */
 	vnet_dst = TD_TO_VNET(td);
 	if (vnet_dst == ifp->if_vnet) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EEXIST);
 	}
 
 	/* Make sure the VNET is stable. */
 	shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET &&
 		 ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0;
 	if (shutdown) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EBUSY);
 	}
 
 	/* Get interface back from child jail/vnet. */
 	if_vmove(ifp, vnet_dst);
 	CURVNET_RESTORE();
 
 	/* Report the new if_xname back to the userland. */
 	sprintf(ifname, "%s", ifp->if_xname);
 
 	prison_free(pr);
 	return (0);
 }
 #endif /* VIMAGE */
 
 /*
  * Add a group to an interface
  */
 int
 if_addgroup(struct ifnet *ifp, const char *groupname)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_group	*ifg = NULL;
 	struct ifg_member	*ifgm;
 	int 			 new = 0;
 
 	if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
 	    groupname[strlen(groupname) - 1] <= '9')
 		return (EINVAL);
 
 	IFNET_WLOCK();
 	CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) {
 			IFNET_WUNLOCK();
 			return (EEXIST);
 		}
 
 	if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP,
 	    M_NOWAIT)) == NULL) {
 	    	IFNET_WUNLOCK();
 		return (ENOMEM);
 	}
 
 	if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member),
 	    M_TEMP, M_NOWAIT)) == NULL) {
 		free(ifgl, M_TEMP);
 		IFNET_WUNLOCK();
 		return (ENOMEM);
 	}
 
 	CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
 		if (!strcmp(ifg->ifg_group, groupname))
 			break;
 
 	if (ifg == NULL) {
 		if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group),
 		    M_TEMP, M_NOWAIT)) == NULL) {
 			free(ifgl, M_TEMP);
 			free(ifgm, M_TEMP);
 			IFNET_WUNLOCK();
 			return (ENOMEM);
 		}
 		strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
 		ifg->ifg_refcnt = 0;
 		CK_STAILQ_INIT(&ifg->ifg_members);
 		CK_STAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next);
 		new = 1;
 	}
 
 	ifg->ifg_refcnt++;
 	ifgl->ifgl_group = ifg;
 	ifgm->ifgm_ifp = ifp;
 
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
 	CK_STAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
 	IF_ADDR_WUNLOCK(ifp);
 
 	IFNET_WUNLOCK();
 
 	if (new)
 		EVENTHANDLER_INVOKE(group_attach_event, ifg);
 	EVENTHANDLER_INVOKE(group_change_event, groupname);
 
 	return (0);
 }
 
 /*
  * Remove a group from an interface
  */
 int
 if_delgroup(struct ifnet *ifp, const char *groupname)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_member	*ifgm;
 	int freeifgl;
 
 	IFNET_WLOCK();
 	CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
 			break;
 	if (ifgl == NULL) {
 		IFNET_WUNLOCK();
 		return (ENOENT);
 	}
 
 	freeifgl = 0;
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next);
 	IF_ADDR_WUNLOCK(ifp);
 
 	CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
 		if (ifgm->ifgm_ifp == ifp)
 			break;
 
 	if (ifgm != NULL)
 		CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next);
 
 	if (--ifgl->ifgl_group->ifg_refcnt == 0) {
 		CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next);
 		freeifgl = 1;
 	}
 	IFNET_WUNLOCK();
 
 	epoch_wait_preempt(net_epoch_preempt);
 	if (freeifgl) {
 		EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group);
 		free(ifgl->ifgl_group, M_TEMP);
 	}
 	free(ifgm, M_TEMP);
 	free(ifgl, M_TEMP);
 
 	EVENTHANDLER_INVOKE(group_change_event, groupname);
 
 	return (0);
 }
 
 /*
  * Remove an interface from all groups
  */
 static void
 if_delgroups(struct ifnet *ifp)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_member	*ifgm;
 	char groupname[IFNAMSIZ];
 	int ifglfree;
 
 	IFNET_WLOCK();
 	while (!CK_STAILQ_EMPTY(&ifp->if_groups)) {
 		ifgl = CK_STAILQ_FIRST(&ifp->if_groups);
 
 		strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ);
 
 		IF_ADDR_WLOCK(ifp);
 		CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next);
 		IF_ADDR_WUNLOCK(ifp);
 
 		CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
 			if (ifgm->ifgm_ifp == ifp)
 				break;
 
 		if (ifgm != NULL)
 			CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member,
 			    ifgm_next);
 		ifglfree = 0;
 		if (--ifgl->ifgl_group->ifg_refcnt == 0) {
 			CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next);
 			ifglfree = 1;
 		}
 
 		IFNET_WUNLOCK();
 		epoch_wait_preempt(net_epoch_preempt);
 		free(ifgm, M_TEMP);
 		if (ifglfree) {
 			EVENTHANDLER_INVOKE(group_detach_event,
 								ifgl->ifgl_group);
 			free(ifgl->ifgl_group, M_TEMP);
 		}
 		EVENTHANDLER_INVOKE(group_change_event, groupname);
 
 		IFNET_WLOCK();
 	}
 	IFNET_WUNLOCK();
 }
 
 static char *
 ifgr_group_get(void *ifgrp)
 {
 	union ifgroupreq_union *ifgrup;
 
 	ifgrup = ifgrp;
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		return (&ifgrup->ifgr32.ifgr_ifgru.ifgru_group[0]);
 #endif
 	return (&ifgrup->ifgr.ifgr_ifgru.ifgru_group[0]);
 }
 
 static struct ifg_req *
 ifgr_groups_get(void *ifgrp)
 {
 	union ifgroupreq_union *ifgrup;
 
 	ifgrup = ifgrp;
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		return ((struct ifg_req *)(uintptr_t)
 		    ifgrup->ifgr32.ifgr_ifgru.ifgru_groups);
 #endif
 	return (ifgrup->ifgr.ifgr_ifgru.ifgru_groups);
 }
 
 /*
  * Stores all groups from an interface in memory pointed to by ifgr.
  */
 static int
 if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp)
 {
 	struct epoch_tracker	 et;
 	int			 len, error;
 	struct ifg_list		*ifgl;
 	struct ifg_req		 ifgrq, *ifgp;
 
 	if (ifgr->ifgr_len == 0) {
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 			ifgr->ifgr_len += sizeof(struct ifg_req);
 		NET_EPOCH_EXIT(et);
 		return (0);
 	}
 
 	len = ifgr->ifgr_len;
 	ifgp = ifgr_groups_get(ifgr);
 	/* XXX: wire */
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
 		if (len < sizeof(ifgrq)) {
 			NET_EPOCH_EXIT(et);
 			return (EINVAL);
 		}
 		bzero(&ifgrq, sizeof ifgrq);
 		strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
 		    sizeof(ifgrq.ifgrq_group));
 		if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
 		    	NET_EPOCH_EXIT(et);
 			return (error);
 		}
 		len -= sizeof(ifgrq);
 		ifgp++;
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (0);
 }
 
 /*
  * Stores all members of a group in memory pointed to by igfr
  */
 static int
 if_getgroupmembers(struct ifgroupreq *ifgr)
 {
 	struct ifg_group	*ifg;
 	struct ifg_member	*ifgm;
 	struct ifg_req		 ifgrq, *ifgp;
 	int			 len, error;
 
 	IFNET_RLOCK();
 	CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
 		if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
 			break;
 	if (ifg == NULL) {
 		IFNET_RUNLOCK();
 		return (ENOENT);
 	}
 
 	if (ifgr->ifgr_len == 0) {
 		CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
 			ifgr->ifgr_len += sizeof(ifgrq);
 		IFNET_RUNLOCK();
 		return (0);
 	}
 
 	len = ifgr->ifgr_len;
 	ifgp = ifgr_groups_get(ifgr);
 	CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
 		if (len < sizeof(ifgrq)) {
 			IFNET_RUNLOCK();
 			return (EINVAL);
 		}
 		bzero(&ifgrq, sizeof ifgrq);
 		strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
 		    sizeof(ifgrq.ifgrq_member));
 		if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
 			IFNET_RUNLOCK();
 			return (error);
 		}
 		len -= sizeof(ifgrq);
 		ifgp++;
 	}
 	IFNET_RUNLOCK();
 
 	return (0);
 }
 
 /*
  * Return counter values from counter(9)s stored in ifnet.
  */
 uint64_t
 if_get_counter_default(struct ifnet *ifp, ift_counter cnt)
 {
 
 	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
 
 	return (counter_u64_fetch(ifp->if_counters[cnt]));
 }
 
 /*
  * Increase an ifnet counter. Usually used for counters shared
  * between the stack and a driver, but function supports them all.
  */
 void
 if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc)
 {
 
 	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
 
 	counter_u64_add(ifp->if_counters[cnt], inc);
 }
 
 /*
  * Copy data from ifnet to userland API structure if_data.
  */
 void
 if_data_copy(struct ifnet *ifp, struct if_data *ifd)
 {
 
 	ifd->ifi_type = ifp->if_type;
 	ifd->ifi_physical = 0;
 	ifd->ifi_addrlen = ifp->if_addrlen;
 	ifd->ifi_hdrlen = ifp->if_hdrlen;
 	ifd->ifi_link_state = ifp->if_link_state;
 	ifd->ifi_vhid = 0;
 	ifd->ifi_datalen = sizeof(struct if_data);
 	ifd->ifi_mtu = ifp->if_mtu;
 	ifd->ifi_metric = ifp->if_metric;
 	ifd->ifi_baudrate = ifp->if_baudrate;
 	ifd->ifi_hwassist = ifp->if_hwassist;
 	ifd->ifi_epoch = ifp->if_epoch;
 	ifd->ifi_lastchange = ifp->if_lastchange;
 
 	ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS);
 	ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS);
 	ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS);
 	ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS);
 	ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS);
 	ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES);
 	ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES);
 	ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS);
 	ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS);
 	ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS);
 	ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS);
 	ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO);
 }
 
 /*
  * Wrapper functions for struct ifnet address list locking macros.  These are
  * used by kernel modules to avoid encoding programming interface or binary
  * interface assumptions that may be violated when kernel-internal locking
  * approaches change.
  */
 void
 if_addr_rlock(struct ifnet *ifp)
 {
 
 	epoch_enter_preempt(net_epoch_preempt, curthread->td_et);
 }
 
 void
 if_addr_runlock(struct ifnet *ifp)
 {
 
 	epoch_exit_preempt(net_epoch_preempt, curthread->td_et);
 }
 
 void
 if_maddr_rlock(if_t ifp)
 {
 
 	epoch_enter_preempt(net_epoch_preempt, curthread->td_et);
 }
 
 void
 if_maddr_runlock(if_t ifp)
 {
 
 	epoch_exit_preempt(net_epoch_preempt, curthread->td_et);
 }
 
 /*
  * Initialization, destruction and refcounting functions for ifaddrs.
  */
 struct ifaddr *
 ifa_alloc(size_t size, int flags)
 {
 	struct ifaddr *ifa;
 
 	KASSERT(size >= sizeof(struct ifaddr),
 	    ("%s: invalid size %zu", __func__, size));
 
 	ifa = malloc(size, M_IFADDR, M_ZERO | flags);
 	if (ifa == NULL)
 		return (NULL);
 
 	if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 	if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 	if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 	if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 
 	refcount_init(&ifa->ifa_refcnt, 1);
 
 	return (ifa);
 
 fail:
 	/* free(NULL) is okay */
 	counter_u64_free(ifa->ifa_opackets);
 	counter_u64_free(ifa->ifa_ipackets);
 	counter_u64_free(ifa->ifa_obytes);
 	counter_u64_free(ifa->ifa_ibytes);
 	free(ifa, M_IFADDR);
 
 	return (NULL);
 }
 
 void
 ifa_ref(struct ifaddr *ifa)
 {
 
 	refcount_acquire(&ifa->ifa_refcnt);
 }
 
 static void
 ifa_destroy(epoch_context_t ctx)
 {
 	struct ifaddr *ifa;
 
 	ifa = __containerof(ctx, struct ifaddr, ifa_epoch_ctx);
 	counter_u64_free(ifa->ifa_opackets);
 	counter_u64_free(ifa->ifa_ipackets);
 	counter_u64_free(ifa->ifa_obytes);
 	counter_u64_free(ifa->ifa_ibytes);
 	free(ifa, M_IFADDR);
 }
 
 void
 ifa_free(struct ifaddr *ifa)
 {
 
 	if (refcount_release(&ifa->ifa_refcnt))
 		epoch_call(net_epoch_preempt, &ifa->ifa_epoch_ctx, ifa_destroy);
 }
 
 
 static int
 ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa,
     struct sockaddr *ia)
 {
 	struct epoch_tracker et;
 	int error;
 	struct rt_addrinfo info;
 	struct sockaddr_dl null_sdl;
 	struct ifnet *ifp;
 
 	ifp = ifa->ifa_ifp;
 
 	bzero(&info, sizeof(info));
 	if (cmd != RTM_DELETE)
 		info.rti_ifp = V_loif;
 	if (cmd == RTM_ADD) {
 		/* explicitly specify (loopback) ifa */
 		if (info.rti_ifp != NULL) {
 			NET_EPOCH_ENTER(et);
 			info.rti_ifa = ifaof_ifpforaddr(ifa->ifa_addr, info.rti_ifp);
 			if (info.rti_ifa != NULL)
 				ifa_ref(info.rti_ifa);
 			NET_EPOCH_EXIT(et);
 		}
 	}
 	info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED;
 	info.rti_info[RTAX_DST] = ia;
 	info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl;
 	link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type);
 
 	error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib);
 
 	if (error != 0 &&
 	    !(cmd == RTM_ADD && error == EEXIST) &&
 	    !(cmd == RTM_DELETE && error == ENOENT))
 		if_printf(ifp, "%s failed: %d\n", otype, error);
 
 	return (error);
 }
 
 int
 ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
 {
 
 	return (ifa_maintain_loopback_route(RTM_ADD, "insertion", ifa, ia));
 }
 
 int
 ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
 {
 
 	return (ifa_maintain_loopback_route(RTM_DELETE, "deletion", ifa, ia));
 }
 
 int
 ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
 {
 
 	return (ifa_maintain_loopback_route(RTM_CHANGE, "switch", ifa, ia));
 }
 
 /*
  * XXX: Because sockaddr_dl has deeper structure than the sockaddr
  * structs used to represent other address families, it is necessary
  * to perform a different comparison.
  */
 
 #define	sa_dl_equal(a1, a2)	\
 	((((const struct sockaddr_dl *)(a1))->sdl_len ==		\
 	 ((const struct sockaddr_dl *)(a2))->sdl_len) &&		\
 	 (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)),		\
 	       CLLADDR((const struct sockaddr_dl *)(a2)),		\
 	       ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0))
 
 /*
  * Locate an interface based on a complete address.
  */
 /*ARGSUSED*/
 struct ifaddr *
 ifa_ifwithaddr(const struct sockaddr *addr)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	MPASS(in_epoch(net_epoch_preempt));
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if (sa_equal(addr, ifa->ifa_addr)) {
 				goto done;
 			}
 			/* IP6 doesn't have broadcast */
 			if ((ifp->if_flags & IFF_BROADCAST) &&
 			    ifa->ifa_broadaddr &&
 			    ifa->ifa_broadaddr->sa_len != 0 &&
 			    sa_equal(ifa->ifa_broadaddr, addr)) {
 				goto done;
 			}
 		}
 	}
 	ifa = NULL;
 done:
 	return (ifa);
 }
 
 int
 ifa_ifwithaddr_check(const struct sockaddr *addr)
 {
 	struct epoch_tracker et;
 	int rc;
 
 	NET_EPOCH_ENTER(et);
 	rc = (ifa_ifwithaddr(addr) != NULL);
 	NET_EPOCH_EXIT(et);
 	return (rc);
 }
 
 /*
  * Locate an interface based on the broadcast address.
  */
 /* ARGSUSED */
 struct ifaddr *
 ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	MPASS(in_epoch(net_epoch_preempt));
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if ((ifp->if_flags & IFF_BROADCAST) &&
 			    ifa->ifa_broadaddr &&
 			    ifa->ifa_broadaddr->sa_len != 0 &&
 			    sa_equal(ifa->ifa_broadaddr, addr)) {
 				goto done;
 			}
 		}
 	}
 	ifa = NULL;
 done:
 	return (ifa);
 }
 
 /*
  * Locate the point to point interface with a given destination address.
  */
 /*ARGSUSED*/
 struct ifaddr *
 ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	MPASS(in_epoch(net_epoch_preempt));
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
 			continue;
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if (ifa->ifa_dstaddr != NULL &&
 			    sa_equal(addr, ifa->ifa_dstaddr)) {
 				goto done;
 			}
 		}
 	}
 	ifa = NULL;
 done:
 	return (ifa);
 }
 
 /*
  * Find an interface on a specific network.  If many, choice
  * is most specific found.
  */
 struct ifaddr *
 ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct ifaddr *ifa_maybe = NULL;
 	u_int af = addr->sa_family;
 	const char *addr_data = addr->sa_data, *cplim;
 
 	MPASS(in_epoch(net_epoch_preempt));
 	/*
 	 * AF_LINK addresses can be looked up directly by their index number,
 	 * so do that if we can.
 	 */
 	if (af == AF_LINK) {
 	    const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr;
 	    if (sdl->sdl_index && sdl->sdl_index <= V_if_index)
 		return (ifaddr_byindex(sdl->sdl_index));
 	}
 
 	/*
 	 * Scan though each interface, looking for ones that have addresses
 	 * in this address family and the requested fib.
 	 */
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			const char *cp, *cp2, *cp3;
 
 			if (ifa->ifa_addr->sa_family != af)
 next:				continue;
 			if (af == AF_INET && 
 			    ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) {
 				/*
 				 * This is a bit broken as it doesn't
 				 * take into account that the remote end may
 				 * be a single node in the network we are
 				 * looking for.
 				 * The trouble is that we don't know the
 				 * netmask for the remote end.
 				 */
 				if (ifa->ifa_dstaddr != NULL &&
 				    sa_equal(addr, ifa->ifa_dstaddr)) {
 					goto done;
 				}
 			} else {
 				/*
 				 * Scan all the bits in the ifa's address.
 				 * If a bit dissagrees with what we are
 				 * looking for, mask it with the netmask
 				 * to see if it really matters.
 				 * (A byte at a time)
 				 */
 				if (ifa->ifa_netmask == 0)
 					continue;
 				cp = addr_data;
 				cp2 = ifa->ifa_addr->sa_data;
 				cp3 = ifa->ifa_netmask->sa_data;
 				cplim = ifa->ifa_netmask->sa_len
 					+ (char *)ifa->ifa_netmask;
 				while (cp3 < cplim)
 					if ((*cp++ ^ *cp2++) & *cp3++)
 						goto next; /* next address! */
 				/*
 				 * If the netmask of what we just found
 				 * is more specific than what we had before
 				 * (if we had one), or if the virtual status
 				 * of new prefix is better than of the old one,
 				 * then remember the new one before continuing
 				 * to search for an even better one.
 				 */
 				if (ifa_maybe == NULL ||
 				    ifa_preferred(ifa_maybe, ifa) ||
 				    rn_refines((caddr_t)ifa->ifa_netmask,
 				    (caddr_t)ifa_maybe->ifa_netmask)) {
 					ifa_maybe = ifa;
 				}
 			}
 		}
 	}
 	ifa = ifa_maybe;
 	ifa_maybe = NULL;
 done:
 	return (ifa);
 }
 
 /*
  * Find an interface address specific to an interface best matching
  * a given address.
  */
 struct ifaddr *
 ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp)
 {
 	struct ifaddr *ifa;
 	const char *cp, *cp2, *cp3;
 	char *cplim;
 	struct ifaddr *ifa_maybe = NULL;
 	u_int af = addr->sa_family;
 
 	if (af >= AF_MAX)
 		return (NULL);
 
 	MPASS(in_epoch(net_epoch_preempt));
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != af)
 			continue;
 		if (ifa_maybe == NULL)
 			ifa_maybe = ifa;
 		if (ifa->ifa_netmask == 0) {
 			if (sa_equal(addr, ifa->ifa_addr) ||
 			    (ifa->ifa_dstaddr &&
 			    sa_equal(addr, ifa->ifa_dstaddr)))
 				goto done;
 			continue;
 		}
 		if (ifp->if_flags & IFF_POINTOPOINT) {
 			if (sa_equal(addr, ifa->ifa_dstaddr))
 				goto done;
 		} else {
 			cp = addr->sa_data;
 			cp2 = ifa->ifa_addr->sa_data;
 			cp3 = ifa->ifa_netmask->sa_data;
 			cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
 			for (; cp3 < cplim; cp3++)
 				if ((*cp++ ^ *cp2++) & *cp3)
 					break;
 			if (cp3 == cplim)
 				goto done;
 		}
 	}
 	ifa = ifa_maybe;
 done:
 	return (ifa);
 }
 
 /*
  * See whether new ifa is better than current one:
  * 1) A non-virtual one is preferred over virtual.
  * 2) A virtual in master state preferred over any other state.
  *
  * Used in several address selecting functions.
  */
 int
 ifa_preferred(struct ifaddr *cur, struct ifaddr *next)
 {
 
 	return (cur->ifa_carp && (!next->ifa_carp ||
 	    ((*carp_master_p)(next) && !(*carp_master_p)(cur))));
 }
 
 struct sockaddr_dl *
 link_alloc_sdl(size_t size, int flags)
 {
 
 	return (malloc(size, M_TEMP, flags));
 }
 
 void
 link_free_sdl(struct sockaddr *sa)
 {
 	free(sa, M_TEMP);
 }
 
 /*
  * Fills in given sdl with interface basic info.
  * Returns pointer to filled sdl.
  */
 struct sockaddr_dl *
 link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype)
 {
 	struct sockaddr_dl *sdl;
 
 	sdl = (struct sockaddr_dl *)paddr;
 	memset(sdl, 0, sizeof(struct sockaddr_dl));
 	sdl->sdl_len = sizeof(struct sockaddr_dl);
 	sdl->sdl_family = AF_LINK;
 	sdl->sdl_index = ifp->if_index;
 	sdl->sdl_type = iftype;
 
 	return (sdl);
 }
 
 /*
  * Mark an interface down and notify protocols of
  * the transition.
  */
 static void
 if_unroute(struct ifnet *ifp, int flag, int fam)
 {
 	struct ifaddr *ifa;
 
 	KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP"));
 
 	ifp->if_flags &= ~flag;
 	getmicrotime(&ifp->if_lastchange);
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
 			pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
 	ifp->if_qflush(ifp);
 
 	if (ifp->if_carp)
 		(*carp_linkstate_p)(ifp);
 	rt_ifmsg(ifp);
 }
 
 /*
  * Mark an interface up and notify protocols of
  * the transition.
  */
 static void
 if_route(struct ifnet *ifp, int flag, int fam)
 {
 	struct ifaddr *ifa;
 
 	KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP"));
 
 	ifp->if_flags |= flag;
 	getmicrotime(&ifp->if_lastchange);
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
 			pfctlinput(PRC_IFUP, ifa->ifa_addr);
 	if (ifp->if_carp)
 		(*carp_linkstate_p)(ifp);
 	rt_ifmsg(ifp);
 #ifdef INET6
 	in6_if_up(ifp);
 #endif
 }
 
 void	(*vlan_link_state_p)(struct ifnet *);	/* XXX: private from if_vlan */
 void	(*vlan_trunk_cap_p)(struct ifnet *);		/* XXX: private from if_vlan */
 struct ifnet *(*vlan_trunkdev_p)(struct ifnet *);
 struct	ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t);
 int	(*vlan_tag_p)(struct ifnet *, uint16_t *);
 int	(*vlan_pcp_p)(struct ifnet *, uint16_t *);
 int	(*vlan_setcookie_p)(struct ifnet *, void *);
 void	*(*vlan_cookie_p)(struct ifnet *);
 
 /*
  * Handle a change in the interface link state. To avoid LORs
  * between driver lock and upper layer locks, as well as possible
  * recursions, we post event to taskqueue, and all job
  * is done in static do_link_state_change().
  */
 void
 if_link_state_change(struct ifnet *ifp, int link_state)
 {
 	/* Return if state hasn't changed. */
 	if (ifp->if_link_state == link_state)
 		return;
 
 	ifp->if_link_state = link_state;
 
 	taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask);
 }
 
 static void
 do_link_state_change(void *arg, int pending)
 {
 	struct ifnet *ifp = (struct ifnet *)arg;
 	int link_state = ifp->if_link_state;
 	CURVNET_SET(ifp->if_vnet);
 
 	/* Notify that the link state has changed. */
 	rt_ifmsg(ifp);
 	if (ifp->if_vlantrunk != NULL)
 		(*vlan_link_state_p)(ifp);
 
 	if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) &&
 	    ifp->if_l2com != NULL)
 		(*ng_ether_link_state_p)(ifp, link_state);
 	if (ifp->if_carp)
 		(*carp_linkstate_p)(ifp);
 	if (ifp->if_bridge)
 		ifp->if_bridge_linkstate(ifp);
 	if (ifp->if_lagg)
 		(*lagg_linkstate_p)(ifp, link_state);
 
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname,
 		    (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN",
 		    NULL);
 	if (pending > 1)
 		if_printf(ifp, "%d link states coalesced\n", pending);
 	if (log_link_state_change)
 		if_printf(ifp, "link state changed to %s\n",
 		    (link_state == LINK_STATE_UP) ? "UP" : "DOWN" );
 	EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state);
 	CURVNET_RESTORE();
 }
 
 /*
  * Mark an interface down and notify protocols of
  * the transition.
  */
 void
 if_down(struct ifnet *ifp)
 {
 
 	EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN);
 	if_unroute(ifp, IFF_UP, AF_UNSPEC);
 }
 
 /*
  * Mark an interface up and notify protocols of
  * the transition.
  */
 void
 if_up(struct ifnet *ifp)
 {
 
 	if_route(ifp, IFF_UP, AF_UNSPEC);
 	EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP);
 }
 
 /*
  * Flush an interface queue.
  */
 void
 if_qflush(struct ifnet *ifp)
 {
 	struct mbuf *m, *n;
 	struct ifaltq *ifq;
 	
 	ifq = &ifp->if_snd;
 	IFQ_LOCK(ifq);
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(ifq))
 		ALTQ_PURGE(ifq);
 #endif
 	n = ifq->ifq_head;
 	while ((m = n) != NULL) {
 		n = m->m_nextpkt;
 		m_freem(m);
 	}
 	ifq->ifq_head = 0;
 	ifq->ifq_tail = 0;
 	ifq->ifq_len = 0;
 	IFQ_UNLOCK(ifq);
 }
 
 /*
  * Map interface name to interface structure pointer, with or without
  * returning a reference.
  */
 struct ifnet *
 ifunit_ref(const char *name)
 {
 	struct epoch_tracker et;
 	struct ifnet *ifp;
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 &&
 		    !(ifp->if_flags & IFF_DYING))
 			break;
 	}
 	if (ifp != NULL)
 		if_ref(ifp);
 	NET_EPOCH_EXIT(et);
 	return (ifp);
 }
 
 struct ifnet *
 ifunit(const char *name)
 {
 	struct epoch_tracker et;
 	struct ifnet *ifp;
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0)
 			break;
 	}
 	NET_EPOCH_EXIT(et);
 	return (ifp);
 }
 
 static void *
 ifr_buffer_get_buffer(void *data)
 {
 	union ifreq_union *ifrup;
 
 	ifrup = data;
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		return ((void *)(uintptr_t)
 		    ifrup->ifr32.ifr_ifru.ifru_buffer.buffer);
 #endif
 	return (ifrup->ifr.ifr_ifru.ifru_buffer.buffer);
 }
 
 static void
 ifr_buffer_set_buffer_null(void *data)
 {
 	union ifreq_union *ifrup;
 
 	ifrup = data;
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		ifrup->ifr32.ifr_ifru.ifru_buffer.buffer = 0;
 	else
 #endif
 		ifrup->ifr.ifr_ifru.ifru_buffer.buffer = NULL;
 }
 
 static size_t
 ifr_buffer_get_length(void *data)
 {
 	union ifreq_union *ifrup;
 
 	ifrup = data;
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		return (ifrup->ifr32.ifr_ifru.ifru_buffer.length);
 #endif
 	return (ifrup->ifr.ifr_ifru.ifru_buffer.length);
 }
 
 static void
 ifr_buffer_set_length(void *data, size_t len)
 {
 	union ifreq_union *ifrup;
 
 	ifrup = data;
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		ifrup->ifr32.ifr_ifru.ifru_buffer.length = len;
 	else
 #endif
 		ifrup->ifr.ifr_ifru.ifru_buffer.length = len;
 }
 
 void *
 ifr_data_get_ptr(void *ifrp)
 {
 	union ifreq_union *ifrup;
 
 	ifrup = ifrp;
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		return ((void *)(uintptr_t)
 		    ifrup->ifr32.ifr_ifru.ifru_data);
 #endif
 		return (ifrup->ifr.ifr_ifru.ifru_data);
 }
 
 /*
  * Hardware specific interface ioctls.
  */
 int
 ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
 {
 	struct ifreq *ifr;
 	int error = 0, do_ifup = 0;
 	int new_flags, temp_flags;
 	size_t namelen, onamelen;
 	size_t descrlen;
 	char *descrbuf, *odescrbuf;
 	char new_name[IFNAMSIZ];
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 
 	ifr = (struct ifreq *)data;
 	switch (cmd) {
 	case SIOCGIFINDEX:
 		ifr->ifr_index = ifp->if_index;
 		break;
 
 	case SIOCGIFFLAGS:
 		temp_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifr->ifr_flags = temp_flags & 0xffff;
 		ifr->ifr_flagshigh = temp_flags >> 16;
 		break;
 
 	case SIOCGIFCAP:
 		ifr->ifr_reqcap = ifp->if_capabilities;
 		ifr->ifr_curcap = ifp->if_capenable;
 		break;
 
 #ifdef MAC
 	case SIOCGIFMAC:
 		error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp);
 		break;
 #endif
 
 	case SIOCGIFMETRIC:
 		ifr->ifr_metric = ifp->if_metric;
 		break;
 
 	case SIOCGIFMTU:
 		ifr->ifr_mtu = ifp->if_mtu;
 		break;
 
 	case SIOCGIFPHYS:
 		/* XXXGL: did this ever worked? */
 		ifr->ifr_phys = 0;
 		break;
 
 	case SIOCGIFDESCR:
 		error = 0;
 		sx_slock(&ifdescr_sx);
 		if (ifp->if_description == NULL)
 			error = ENOMSG;
 		else {
 			/* space for terminating nul */
 			descrlen = strlen(ifp->if_description) + 1;
 			if (ifr_buffer_get_length(ifr) < descrlen)
 				ifr_buffer_set_buffer_null(ifr);
 			else
 				error = copyout(ifp->if_description,
 				    ifr_buffer_get_buffer(ifr), descrlen);
 			ifr_buffer_set_length(ifr, descrlen);
 		}
 		sx_sunlock(&ifdescr_sx);
 		break;
 
 	case SIOCSIFDESCR:
 		error = priv_check(td, PRIV_NET_SETIFDESCR);
 		if (error)
 			return (error);
 
 		/*
 		 * Copy only (length-1) bytes to make sure that
 		 * if_description is always nul terminated.  The
 		 * length parameter is supposed to count the
 		 * terminating nul in.
 		 */
 		if (ifr_buffer_get_length(ifr) > ifdescr_maxlen)
 			return (ENAMETOOLONG);
 		else if (ifr_buffer_get_length(ifr) == 0)
 			descrbuf = NULL;
 		else {
 			descrbuf = malloc(ifr_buffer_get_length(ifr),
 			    M_IFDESCR, M_WAITOK | M_ZERO);
 			error = copyin(ifr_buffer_get_buffer(ifr), descrbuf,
 			    ifr_buffer_get_length(ifr) - 1);
 			if (error) {
 				free(descrbuf, M_IFDESCR);
 				break;
 			}
 		}
 
 		sx_xlock(&ifdescr_sx);
 		odescrbuf = ifp->if_description;
 		ifp->if_description = descrbuf;
 		sx_xunlock(&ifdescr_sx);
 
 		getmicrotime(&ifp->if_lastchange);
 		free(odescrbuf, M_IFDESCR);
 		break;
 
 	case SIOCGIFFIB:
 		ifr->ifr_fib = ifp->if_fib;
 		break;
 
 	case SIOCSIFFIB:
 		error = priv_check(td, PRIV_NET_SETIFFIB);
 		if (error)
 			return (error);
 		if (ifr->ifr_fib >= rt_numfibs)
 			return (EINVAL);
 
 		ifp->if_fib = ifr->ifr_fib;
 		break;
 
 	case SIOCSIFFLAGS:
 		error = priv_check(td, PRIV_NET_SETIFFLAGS);
 		if (error)
 			return (error);
 		/*
 		 * Currently, no driver owned flags pass the IFF_CANTCHANGE
 		 * check, so we don't need special handling here yet.
 		 */
 		new_flags = (ifr->ifr_flags & 0xffff) |
 		    (ifr->ifr_flagshigh << 16);
 		if (ifp->if_flags & IFF_UP &&
 		    (new_flags & IFF_UP) == 0) {
 			if_down(ifp);
 		} else if (new_flags & IFF_UP &&
 		    (ifp->if_flags & IFF_UP) == 0) {
 			do_ifup = 1;
 		}
 		/* See if permanently promiscuous mode bit is about to flip */
 		if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) {
 			if (new_flags & IFF_PPROMISC)
 				ifp->if_flags |= IFF_PROMISC;
 			else if (ifp->if_pcount == 0)
 				ifp->if_flags &= ~IFF_PROMISC;
 			if (log_promisc_mode_change)
                                 if_printf(ifp, "permanently promiscuous mode %s\n",
                                     ((new_flags & IFF_PPROMISC) ?
                                      "enabled" : "disabled"));
 		}
 		ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
 			(new_flags &~ IFF_CANTCHANGE);
 		if (ifp->if_ioctl) {
 			(void) (*ifp->if_ioctl)(ifp, cmd, data);
 		}
 		if (do_ifup)
 			if_up(ifp);
 		getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFCAP:
 		error = priv_check(td, PRIV_NET_SETIFCAP);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		if (ifr->ifr_reqcap & ~ifp->if_capabilities)
 			return (EINVAL);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 #ifdef MAC
 	case SIOCSIFMAC:
 		error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp);
 		break;
 #endif
 
 	case SIOCSIFNAME:
 		error = priv_check(td, PRIV_NET_SETIFNAME);
 		if (error)
 			return (error);
 		error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ,
 		    NULL);
 		if (error != 0)
 			return (error);
 		if (new_name[0] == '\0')
 			return (EINVAL);
 		if (new_name[IFNAMSIZ-1] != '\0') {
 			new_name[IFNAMSIZ-1] = '\0';
 			if (strlen(new_name) == IFNAMSIZ-1)
 				return (EINVAL);
 		}
 		if (ifunit(new_name) != NULL)
 			return (EEXIST);
 
 		/*
 		 * XXX: Locking.  Nothing else seems to lock if_flags,
 		 * and there are numerous other races with the
 		 * ifunit() checks not being atomic with namespace
 		 * changes (renames, vmoves, if_attach, etc).
 		 */
 		ifp->if_flags |= IFF_RENAMING;
 		
 		/* Announce the departure of the interface. */
 		rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
 		EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
 
 		if_printf(ifp, "changing name to '%s'\n", new_name);
 
 		IF_ADDR_WLOCK(ifp);
 		strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
 		ifa = ifp->if_addr;
 		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 		namelen = strlen(new_name);
 		onamelen = sdl->sdl_nlen;
 		/*
 		 * Move the address if needed.  This is safe because we
 		 * allocate space for a name of length IFNAMSIZ when we
 		 * create this in if_attach().
 		 */
 		if (namelen != onamelen) {
 			bcopy(sdl->sdl_data + onamelen,
 			    sdl->sdl_data + namelen, sdl->sdl_alen);
 		}
 		bcopy(new_name, sdl->sdl_data, namelen);
 		sdl->sdl_nlen = namelen;
 		sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
 		bzero(sdl->sdl_data, onamelen);
 		while (namelen != 0)
 			sdl->sdl_data[--namelen] = 0xff;
 		IF_ADDR_WUNLOCK(ifp);
 
 		EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
 		/* Announce the return of the interface. */
 		rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
 
 		ifp->if_flags &= ~IFF_RENAMING;
 		break;
 
 #ifdef VIMAGE
 	case SIOCSIFVNET:
 		error = priv_check(td, PRIV_NET_SETIFVNET);
 		if (error)
 			return (error);
 		error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid);
 		break;
 #endif
 
 	case SIOCSIFMETRIC:
 		error = priv_check(td, PRIV_NET_SETIFMETRIC);
 		if (error)
 			return (error);
 		ifp->if_metric = ifr->ifr_metric;
 		getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFPHYS:
 		error = priv_check(td, PRIV_NET_SETIFPHYS);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFMTU:
 	{
 		u_long oldmtu = ifp->if_mtu;
 
 		error = priv_check(td, PRIV_NET_SETIFMTU);
 		if (error)
 			return (error);
 		if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU)
 			return (EINVAL);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0) {
 			getmicrotime(&ifp->if_lastchange);
 			rt_ifmsg(ifp);
 #ifdef INET
 			NETDUMP_REINIT(ifp);
 #endif
 		}
 		/*
 		 * If the link MTU changed, do network layer specific procedure.
 		 */
 		if (ifp->if_mtu != oldmtu) {
 #ifdef INET6
 			nd6_setmtu(ifp);
 #endif
 			rt_updatemtu(ifp);
 		}
 		break;
 	}
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		if (cmd == SIOCADDMULTI)
 			error = priv_check(td, PRIV_NET_ADDMULTI);
 		else
 			error = priv_check(td, PRIV_NET_DELMULTI);
 		if (error)
 			return (error);
 
 		/* Don't allow group membership on non-multicast interfaces. */
 		if ((ifp->if_flags & IFF_MULTICAST) == 0)
 			return (EOPNOTSUPP);
 
 		/* Don't let users screw up protocols' entries. */
 		if (ifr->ifr_addr.sa_family != AF_LINK)
 			return (EINVAL);
 
 		if (cmd == SIOCADDMULTI) {
 			struct epoch_tracker et;
 			struct ifmultiaddr *ifma;
 
 			/*
 			 * Userland is only permitted to join groups once
 			 * via the if_addmulti() KPI, because it cannot hold
 			 * struct ifmultiaddr * between calls. It may also
 			 * lose a race while we check if the membership
 			 * already exists.
 			 */
 			NET_EPOCH_ENTER(et);
 			ifma = if_findmulti(ifp, &ifr->ifr_addr);
 			NET_EPOCH_EXIT(et);
 			if (ifma != NULL)
 				error = EADDRINUSE;
 			else
 				error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
 		} else {
 			error = if_delmulti(ifp, &ifr->ifr_addr);
 		}
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFPHYADDR:
 	case SIOCDIFPHYADDR:
 #ifdef INET6
 	case SIOCSIFPHYADDR_IN6:
 #endif
 	case SIOCSIFMEDIA:
 	case SIOCSIFGENERIC:
 		error = priv_check(td, PRIV_NET_HWIOCTL);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCGIFSTATUS:
 	case SIOCGIFPSRCADDR:
 	case SIOCGIFPDSTADDR:
 	case SIOCGIFMEDIA:
 	case SIOCGIFXMEDIA:
 	case SIOCGIFGENERIC:
 	case SIOCGIFRSSKEY:
 	case SIOCGIFRSSHASH:
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		break;
 
 	case SIOCSIFLLADDR:
 		error = priv_check(td, PRIV_NET_SETLLADDR);
 		if (error)
 			return (error);
 		error = if_setlladdr(ifp,
 		    ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len);
 		break;
 
 	case SIOCGHWADDR:
 		error = if_gethwaddr(ifp, ifr);
 		break;
 
 	case CASE_IOC_IFGROUPREQ(SIOCAIFGROUP):
 		error = priv_check(td, PRIV_NET_ADDIFGROUP);
 		if (error)
 			return (error);
 		if ((error = if_addgroup(ifp,
 		    ifgr_group_get((struct ifgroupreq *)data))))
 			return (error);
 		break;
 
 	case CASE_IOC_IFGROUPREQ(SIOCGIFGROUP):
 		if ((error = if_getgroup((struct ifgroupreq *)data, ifp)))
 			return (error);
 		break;
 
 	case CASE_IOC_IFGROUPREQ(SIOCDIFGROUP):
 		error = priv_check(td, PRIV_NET_DELIFGROUP);
 		if (error)
 			return (error);
 		if ((error = if_delgroup(ifp,
 		    ifgr_group_get((struct ifgroupreq *)data))))
 			return (error);
 		break;
 
 	default:
 		error = ENOIOCTL;
 		break;
 	}
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD32
 struct ifconf32 {
 	int32_t	ifc_len;
 	union {
 		uint32_t	ifcu_buf;
 		uint32_t	ifcu_req;
 	} ifc_ifcu;
 };
 #define	SIOCGIFCONF32	_IOWR('i', 36, struct ifconf32)
 #endif
 
 #ifdef COMPAT_FREEBSD32
 static void
 ifmr_init(struct ifmediareq *ifmr, caddr_t data)
 {
 	struct ifmediareq32 *ifmr32;
 
 	ifmr32 = (struct ifmediareq32 *)data;
 	memcpy(ifmr->ifm_name, ifmr32->ifm_name,
 	    sizeof(ifmr->ifm_name));
 	ifmr->ifm_current = ifmr32->ifm_current;
 	ifmr->ifm_mask = ifmr32->ifm_mask;
 	ifmr->ifm_status = ifmr32->ifm_status;
 	ifmr->ifm_active = ifmr32->ifm_active;
 	ifmr->ifm_count = ifmr32->ifm_count;
 	ifmr->ifm_ulist = (int *)(uintptr_t)ifmr32->ifm_ulist;
 }
 
 static void
 ifmr_update(const struct ifmediareq *ifmr, caddr_t data)
 {
 	struct ifmediareq32 *ifmr32;
 
 	ifmr32 = (struct ifmediareq32 *)data;
 	ifmr32->ifm_current = ifmr->ifm_current;
 	ifmr32->ifm_mask = ifmr->ifm_mask;
 	ifmr32->ifm_status = ifmr->ifm_status;
 	ifmr32->ifm_active = ifmr->ifm_active;
 	ifmr32->ifm_count = ifmr->ifm_count;
 }
 #endif
 
 /*
  * Interface ioctls.
  */
 int
 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td)
 {
 #ifdef COMPAT_FREEBSD32
 	caddr_t saved_data = NULL;
 	struct ifmediareq ifmr;
 	struct ifmediareq *ifmrp;
 #endif
 	struct ifnet *ifp;
 	struct ifreq *ifr;
 	int error;
 	int oif_flags;
 #ifdef VIMAGE
 	int shutdown;
 #endif
 
 	CURVNET_SET(so->so_vnet);
 #ifdef VIMAGE
 	/* Make sure the VNET is stable. */
 	shutdown = (so->so_vnet->vnet_state > SI_SUB_VNET &&
 		 so->so_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0;
 	if (shutdown) {
 		CURVNET_RESTORE();
 		return (EBUSY);
 	}
 #endif
 
 
 	switch (cmd) {
 	case SIOCGIFCONF:
 		error = ifconf(cmd, data);
 		CURVNET_RESTORE();
 		return (error);
 
 #ifdef COMPAT_FREEBSD32
 	case SIOCGIFCONF32:
 		{
 			struct ifconf32 *ifc32;
 			struct ifconf ifc;
 
 			ifc32 = (struct ifconf32 *)data;
 			ifc.ifc_len = ifc32->ifc_len;
 			ifc.ifc_buf = PTRIN(ifc32->ifc_buf);
 
 			error = ifconf(SIOCGIFCONF, (void *)&ifc);
 			CURVNET_RESTORE();
 			if (error == 0)
 				ifc32->ifc_len = ifc.ifc_len;
 			return (error);
 		}
 #endif
 	}
 
 #ifdef COMPAT_FREEBSD32
 	ifmrp = NULL;
 	switch (cmd) {
 	case SIOCGIFMEDIA32:
 	case SIOCGIFXMEDIA32:
 		ifmrp = &ifmr;
 		ifmr_init(ifmrp, data);
 		cmd = _IOC_NEWTYPE(cmd, struct ifmediareq);
 		saved_data = data;
 		data = (caddr_t)ifmrp;
 	}
 #endif
 
 	ifr = (struct ifreq *)data;
 	switch (cmd) {
 #ifdef VIMAGE
 	case SIOCSIFRVNET:
 		error = priv_check(td, PRIV_NET_SETIFVNET);
 		if (error == 0)
 			error = if_vmove_reclaim(td, ifr->ifr_name,
 			    ifr->ifr_jid);
 		goto out_noref;
 #endif
 	case SIOCIFCREATE:
 	case SIOCIFCREATE2:
 		error = priv_check(td, PRIV_NET_IFCREATE);
 		if (error == 0)
 			error = if_clone_create(ifr->ifr_name,
 			    sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ?
 			    ifr_data_get_ptr(ifr) : NULL);
 		goto out_noref;
 	case SIOCIFDESTROY:
 		error = priv_check(td, PRIV_NET_IFDESTROY);
 		if (error == 0)
 			error = if_clone_destroy(ifr->ifr_name);
 		goto out_noref;
 
 	case SIOCIFGCLONERS:
 		error = if_clone_list((struct if_clonereq *)data);
 		goto out_noref;
 
 	case CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB):
 		error = if_getgroupmembers((struct ifgroupreq *)data);
 		goto out_noref;
 
 #if defined(INET) || defined(INET6)
 	case SIOCSVH:
 	case SIOCGVH:
 		if (carp_ioctl_p == NULL)
 			error = EPROTONOSUPPORT;
 		else
 			error = (*carp_ioctl_p)(ifr, cmd, td);
 		goto out_noref;
 #endif
 	}
 
 	ifp = ifunit_ref(ifr->ifr_name);
 	if (ifp == NULL) {
 		error = ENXIO;
 		goto out_noref;
 	}
 
 	error = ifhwioctl(cmd, ifp, data, td);
 	if (error != ENOIOCTL)
 		goto out_ref;
 
 	oif_flags = ifp->if_flags;
 	if (so->so_proto == NULL) {
 		error = EOPNOTSUPP;
 		goto out_ref;
 	}
 
 	/*
 	 * Pass the request on to the socket control method, and if the
 	 * latter returns EOPNOTSUPP, directly to the interface.
 	 *
 	 * Make an exception for the legacy SIOCSIF* requests.  Drivers
 	 * trust SIOCSIFADDR et al to come from an already privileged
 	 * layer, and do not perform any credentials checks or input
 	 * validation.
 	 */
 	error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data,
 	    ifp, td));
 	if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL &&
 	    cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR &&
 	    cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK)
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 
 	if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
 #ifdef INET6
 		if (ifp->if_flags & IFF_UP)
 			in6_if_up(ifp);
 #endif
 	}
 
 out_ref:
 	if_rele(ifp);
 out_noref:
 #ifdef COMPAT_FREEBSD32
 	if (ifmrp != NULL) {
 		KASSERT((cmd == SIOCGIFMEDIA || cmd == SIOCGIFXMEDIA),
 		    ("ifmrp non-NULL, but cmd is not an ifmedia req 0x%lx",
 		     cmd));
 		data = saved_data;
 		ifmr_update(ifmrp, data);
 	}
 #endif
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * The code common to handling reference counted flags,
  * e.g., in ifpromisc() and if_allmulti().
  * The "pflag" argument can specify a permanent mode flag to check,
  * such as IFF_PPROMISC for promiscuous mode; should be 0 if none.
  *
  * Only to be used on stack-owned flags, not driver-owned flags.
  */
 static int
 if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch)
 {
 	struct ifreq ifr;
 	int error;
 	int oldflags, oldcount;
 
 	/* Sanity checks to catch programming errors */
 	KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0,
 	    ("%s: setting driver-owned flag %d", __func__, flag));
 
 	if (onswitch)
 		KASSERT(*refcount >= 0,
 		    ("%s: increment negative refcount %d for flag %d",
 		    __func__, *refcount, flag));
 	else
 		KASSERT(*refcount > 0,
 		    ("%s: decrement non-positive refcount %d for flag %d",
 		    __func__, *refcount, flag));
 
 	/* In case this mode is permanent, just touch refcount */
 	if (ifp->if_flags & pflag) {
 		*refcount += onswitch ? 1 : -1;
 		return (0);
 	}
 
 	/* Save ifnet parameters for if_ioctl() may fail */
 	oldcount = *refcount;
 	oldflags = ifp->if_flags;
 	
 	/*
 	 * See if we aren't the only and touching refcount is enough.
 	 * Actually toggle interface flag if we are the first or last.
 	 */
 	if (onswitch) {
 		if ((*refcount)++)
 			return (0);
 		ifp->if_flags |= flag;
 	} else {
 		if (--(*refcount))
 			return (0);
 		ifp->if_flags &= ~flag;
 	}
 
 	/* Call down the driver since we've changed interface flags */
 	if (ifp->if_ioctl == NULL) {
 		error = EOPNOTSUPP;
 		goto recover;
 	}
 	ifr.ifr_flags = ifp->if_flags & 0xffff;
 	ifr.ifr_flagshigh = ifp->if_flags >> 16;
 	error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 	if (error)
 		goto recover;
 	/* Notify userland that interface flags have changed */
 	rt_ifmsg(ifp);
 	return (0);
 
 recover:
 	/* Recover after driver error */
 	*refcount = oldcount;
 	ifp->if_flags = oldflags;
 	return (error);
 }
 
 /*
  * Set/clear promiscuous mode on interface ifp based on the truth value
  * of pswitch.  The calls are reference counted so that only the first
  * "on" request actually has an effect, as does the final "off" request.
  * Results are undefined if the "off" and "on" requests are not matched.
  */
 int
 ifpromisc(struct ifnet *ifp, int pswitch)
 {
 	int error;
 	int oldflags = ifp->if_flags;
 
 	error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC,
 			   &ifp->if_pcount, pswitch);
 	/* If promiscuous mode status has changed, log a message */
 	if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) &&
             log_promisc_mode_change)
 		if_printf(ifp, "promiscuous mode %s\n",
 		    (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled");
 	return (error);
 }
 
 /*
  * Return interface configuration
  * of system.  List may be used
  * in later ioctl's (above) to get
  * other information.
  */
 /*ARGSUSED*/
 static int
 ifconf(u_long cmd, caddr_t data)
 {
 	struct ifconf *ifc = (struct ifconf *)data;
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct ifreq ifr;
 	struct sbuf *sb;
 	int error, full = 0, valid_len, max_len;
 
 	/* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */
 	max_len = MAXPHYS - 1;
 
 	/* Prevent hostile input from being able to crash the system */
 	if (ifc->ifc_len <= 0)
 		return (EINVAL);
 
 again:
 	if (ifc->ifc_len <= max_len) {
 		max_len = ifc->ifc_len;
 		full = 1;
 	}
 	sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
 	max_len = 0;
 	valid_len = 0;
 
 	IFNET_RLOCK();
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		struct epoch_tracker et;
 		int addrs;
 
 		/*
 		 * Zero the ifr to make sure we don't disclose the contents
 		 * of the stack.
 		 */
 		memset(&ifr, 0, sizeof(ifr));
 
 		if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
 		    >= sizeof(ifr.ifr_name)) {
 			sbuf_delete(sb);
 			IFNET_RUNLOCK();
 			return (ENAMETOOLONG);
 		}
 
 		addrs = 0;
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa = ifa->ifa_addr;
 
 			if (prison_if(curthread->td_ucred, sa) != 0)
 				continue;
 			addrs++;
 			if (sa->sa_len <= sizeof(*sa)) {
 				if (sa->sa_len < sizeof(*sa)) {
 					memset(&ifr.ifr_ifru.ifru_addr, 0,
 					    sizeof(ifr.ifr_ifru.ifru_addr));
 					memcpy(&ifr.ifr_ifru.ifru_addr, sa,
 					    sa->sa_len);
 				} else
 					ifr.ifr_ifru.ifru_addr = *sa;
 				sbuf_bcat(sb, &ifr, sizeof(ifr));
 				max_len += sizeof(ifr);
 			} else {
 				sbuf_bcat(sb, &ifr,
 				    offsetof(struct ifreq, ifr_addr));
 				max_len += offsetof(struct ifreq, ifr_addr);
 				sbuf_bcat(sb, sa, sa->sa_len);
 				max_len += sa->sa_len;
 			}
 
 			if (sbuf_error(sb) == 0)
 				valid_len = sbuf_len(sb);
 		}
 		NET_EPOCH_EXIT(et);
 		if (addrs == 0) {
 			sbuf_bcat(sb, &ifr, sizeof(ifr));
 			max_len += sizeof(ifr);
 
 			if (sbuf_error(sb) == 0)
 				valid_len = sbuf_len(sb);
 		}
 	}
 	IFNET_RUNLOCK();
 
 	/*
 	 * If we didn't allocate enough space (uncommon), try again.  If
 	 * we have already allocated as much space as we are allowed,
 	 * return what we've got.
 	 */
 	if (valid_len != max_len && !full) {
 		sbuf_delete(sb);
 		goto again;
 	}
 
 	ifc->ifc_len = valid_len;
 	sbuf_finish(sb);
 	error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len);
 	sbuf_delete(sb);
 	return (error);
 }
 
 /*
  * Just like ifpromisc(), but for all-multicast-reception mode.
  */
 int
 if_allmulti(struct ifnet *ifp, int onswitch)
 {
 
 	return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch));
 }
 
 struct ifmultiaddr *
 if_findmulti(struct ifnet *ifp, const struct sockaddr *sa)
 {
 	struct ifmultiaddr *ifma;
 
 	IF_ADDR_LOCK_ASSERT(ifp);
 
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (sa->sa_family == AF_LINK) {
 			if (sa_dl_equal(ifma->ifma_addr, sa))
 				break;
 		} else {
 			if (sa_equal(ifma->ifma_addr, sa))
 				break;
 		}
 	}
 
 	return ifma;
 }
 
 /*
  * Allocate a new ifmultiaddr and initialize based on passed arguments.  We
  * make copies of passed sockaddrs.  The ifmultiaddr will not be added to
  * the ifnet multicast address list here, so the caller must do that and
  * other setup work (such as notifying the device driver).  The reference
  * count is initialized to 1.
  */
 static struct ifmultiaddr *
 if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa,
     int mflags)
 {
 	struct ifmultiaddr *ifma;
 	struct sockaddr *dupsa;
 
 	ifma = malloc(sizeof *ifma, M_IFMADDR, mflags |
 	    M_ZERO);
 	if (ifma == NULL)
 		return (NULL);
 
 	dupsa = malloc(sa->sa_len, M_IFMADDR, mflags);
 	if (dupsa == NULL) {
 		free(ifma, M_IFMADDR);
 		return (NULL);
 	}
 	bcopy(sa, dupsa, sa->sa_len);
 	ifma->ifma_addr = dupsa;
 
 	ifma->ifma_ifp = ifp;
 	ifma->ifma_refcount = 1;
 	ifma->ifma_protospec = NULL;
 
 	if (llsa == NULL) {
 		ifma->ifma_lladdr = NULL;
 		return (ifma);
 	}
 
 	dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags);
 	if (dupsa == NULL) {
 		free(ifma->ifma_addr, M_IFMADDR);
 		free(ifma, M_IFMADDR);
 		return (NULL);
 	}
 	bcopy(llsa, dupsa, llsa->sa_len);
 	ifma->ifma_lladdr = dupsa;
 
 	return (ifma);
 }
 
 /*
  * if_freemulti: free ifmultiaddr structure and possibly attached related
  * addresses.  The caller is responsible for implementing reference
  * counting, notifying the driver, handling routing messages, and releasing
  * any dependent link layer state.
  */
 #ifdef MCAST_VERBOSE
 extern void kdb_backtrace(void);
 #endif
 static void
 if_freemulti_internal(struct ifmultiaddr *ifma)
 {
 
 	KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d",
 	    ifma->ifma_refcount));
 
 	if (ifma->ifma_lladdr != NULL)
 		free(ifma->ifma_lladdr, M_IFMADDR);
 #ifdef MCAST_VERBOSE
 	kdb_backtrace();
 	printf("%s freeing ifma: %p\n", __func__, ifma);
 #endif
 	free(ifma->ifma_addr, M_IFMADDR);
 	free(ifma, M_IFMADDR);
 }
 
 static void
 if_destroymulti(epoch_context_t ctx)
 {
 	struct ifmultiaddr *ifma;
 
 	ifma = __containerof(ctx, struct ifmultiaddr, ifma_epoch_ctx);
 	if_freemulti_internal(ifma);
 }
 
 void
 if_freemulti(struct ifmultiaddr *ifma)
 {
 	KASSERT(ifma->ifma_refcount == 0, ("if_freemulti_epoch: refcount %d",
 	    ifma->ifma_refcount));
 
 	epoch_call(net_epoch_preempt, &ifma->ifma_epoch_ctx, if_destroymulti);
 }
 
 
 /*
  * Register an additional multicast address with a network interface.
  *
  * - If the address is already present, bump the reference count on the
  *   address and return.
  * - If the address is not link-layer, look up a link layer address.
  * - Allocate address structures for one or both addresses, and attach to the
  *   multicast address list on the interface.  If automatically adding a link
  *   layer address, the protocol address will own a reference to the link
  *   layer address, to be freed when it is freed.
  * - Notify the network device driver of an addition to the multicast address
  *   list.
  *
  * 'sa' points to caller-owned memory with the desired multicast address.
  *
  * 'retifma' will be used to return a pointer to the resulting multicast
  * address reference, if desired.
  */
 int
 if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
     struct ifmultiaddr **retifma)
 {
 	struct ifmultiaddr *ifma, *ll_ifma;
 	struct sockaddr *llsa;
 	struct sockaddr_dl sdl;
 	int error;
 
 #ifdef INET
 	IN_MULTI_LIST_UNLOCK_ASSERT();
 #endif
 #ifdef INET6
 	IN6_MULTI_LIST_UNLOCK_ASSERT();
 #endif
 	/*
 	 * If the address is already present, return a new reference to it;
 	 * otherwise, allocate storage and set up a new address.
 	 */
 	IF_ADDR_WLOCK(ifp);
 	ifma = if_findmulti(ifp, sa);
 	if (ifma != NULL) {
 		ifma->ifma_refcount++;
 		if (retifma != NULL)
 			*retifma = ifma;
 		IF_ADDR_WUNLOCK(ifp);
 		return (0);
 	}
 
 	/*
 	 * The address isn't already present; resolve the protocol address
 	 * into a link layer address, and then look that up, bump its
 	 * refcount or allocate an ifma for that also.
 	 * Most link layer resolving functions returns address data which
 	 * fits inside default sockaddr_dl structure. However callback
 	 * can allocate another sockaddr structure, in that case we need to
 	 * free it later.
 	 */
 	llsa = NULL;
 	ll_ifma = NULL;
 	if (ifp->if_resolvemulti != NULL) {
 		/* Provide called function with buffer size information */
 		sdl.sdl_len = sizeof(sdl);
 		llsa = (struct sockaddr *)&sdl;
 		error = ifp->if_resolvemulti(ifp, &llsa, sa);
 		if (error)
 			goto unlock_out;
 	}
 
 	/*
 	 * Allocate the new address.  Don't hook it up yet, as we may also
 	 * need to allocate a link layer multicast address.
 	 */
 	ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT);
 	if (ifma == NULL) {
 		error = ENOMEM;
 		goto free_llsa_out;
 	}
 
 	/*
 	 * If a link layer address is found, we'll need to see if it's
 	 * already present in the address list, or allocate is as well.
 	 * When this block finishes, the link layer address will be on the
 	 * list.
 	 */
 	if (llsa != NULL) {
 		ll_ifma = if_findmulti(ifp, llsa);
 		if (ll_ifma == NULL) {
 			ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT);
 			if (ll_ifma == NULL) {
 				--ifma->ifma_refcount;
 				if_freemulti(ifma);
 				error = ENOMEM;
 				goto free_llsa_out;
 			}
 			ll_ifma->ifma_flags |= IFMA_F_ENQUEUED;
 			CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma,
 			    ifma_link);
 		} else
 			ll_ifma->ifma_refcount++;
 		ifma->ifma_llifma = ll_ifma;
 	}
 
 	/*
 	 * We now have a new multicast address, ifma, and possibly a new or
 	 * referenced link layer address.  Add the primary address to the
 	 * ifnet address list.
 	 */
 	ifma->ifma_flags |= IFMA_F_ENQUEUED;
 	CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
 
 	if (retifma != NULL)
 		*retifma = ifma;
 
 	/*
 	 * Must generate the message while holding the lock so that 'ifma'
 	 * pointer is still valid.
 	 */
 	rt_newmaddrmsg(RTM_NEWMADDR, ifma);
 	IF_ADDR_WUNLOCK(ifp);
 
 	/*
 	 * We are certain we have added something, so call down to the
 	 * interface to let them know about it.
 	 */
 	if (ifp->if_ioctl != NULL) {
 		(void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
 	}
 
 	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
 		link_free_sdl(llsa);
 
 	return (0);
 
 free_llsa_out:
 	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
 		link_free_sdl(llsa);
 
 unlock_out:
 	IF_ADDR_WUNLOCK(ifp);
 	return (error);
 }
 
 /*
  * Delete a multicast group membership by network-layer group address.
  *
  * Returns ENOENT if the entry could not be found. If ifp no longer
  * exists, results are undefined. This entry point should only be used
  * from subsystems which do appropriate locking to hold ifp for the
  * duration of the call.
  * Network-layer protocol domains must use if_delmulti_ifma().
  */
 int
 if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
 {
 	struct ifmultiaddr *ifma;
 	int lastref;
 #ifdef INVARIANTS
 	struct epoch_tracker et;
 	struct ifnet *oifp;
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link)
 		if (ifp == oifp)
 			break;
 	if (ifp != oifp)
 		ifp = NULL;
 	NET_EPOCH_EXIT(et);
 
 	KASSERT(ifp != NULL, ("%s: ifnet went away", __func__));
 #endif
 	if (ifp == NULL)
 		return (ENOENT);
 
 	IF_ADDR_WLOCK(ifp);
 	lastref = 0;
 	ifma = if_findmulti(ifp, sa);
 	if (ifma != NULL)
 		lastref = if_delmulti_locked(ifp, ifma, 0);
 	IF_ADDR_WUNLOCK(ifp);
 
 	if (ifma == NULL)
 		return (ENOENT);
 
 	if (lastref && ifp->if_ioctl != NULL) {
 		(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
 	}
 
 	return (0);
 }
 
 /*
  * Delete all multicast group membership for an interface.
  * Should be used to quickly flush all multicast filters.
  */
 void
 if_delallmulti(struct ifnet *ifp)
 {
 	struct ifmultiaddr *ifma;
 	struct ifmultiaddr *next;
 
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
 		if_delmulti_locked(ifp, ifma, 0);
 	IF_ADDR_WUNLOCK(ifp);
 }
 
 void
 if_delmulti_ifma(struct ifmultiaddr *ifma)
 {
 	if_delmulti_ifma_flags(ifma, 0);
 }
 
 /*
  * Delete a multicast group membership by group membership pointer.
  * Network-layer protocol domains must use this routine.
  *
  * It is safe to call this routine if the ifp disappeared.
  */
 void
 if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags)
 {
 	struct ifnet *ifp;
 	int lastref;
 	MCDPRINTF("%s freeing ifma: %p\n", __func__, ifma);
 #ifdef INET
 	IN_MULTI_LIST_UNLOCK_ASSERT();
 #endif
 	ifp = ifma->ifma_ifp;
 #ifdef DIAGNOSTIC
 	if (ifp == NULL) {
 		printf("%s: ifma_ifp seems to be detached\n", __func__);
 	} else {
 		struct epoch_tracker et;
 		struct ifnet *oifp;
 
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link)
 			if (ifp == oifp)
 				break;
 		if (ifp != oifp)
 			ifp = NULL;
 		NET_EPOCH_EXIT(et);
 	}
 #endif
 	/*
 	 * If and only if the ifnet instance exists: Acquire the address lock.
 	 */
 	if (ifp != NULL)
 		IF_ADDR_WLOCK(ifp);
 
 	lastref = if_delmulti_locked(ifp, ifma, flags);
 
 	if (ifp != NULL) {
 		/*
 		 * If and only if the ifnet instance exists:
 		 *  Release the address lock.
 		 *  If the group was left: update the hardware hash filter.
 		 */
 		IF_ADDR_WUNLOCK(ifp);
 		if (lastref && ifp->if_ioctl != NULL) {
 			(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
 		}
 	}
 }
 
 /*
  * Perform deletion of network-layer and/or link-layer multicast address.
  *
  * Return 0 if the reference count was decremented.
  * Return 1 if the final reference was released, indicating that the
  * hardware hash filter should be reprogrammed.
  */
 static int
 if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching)
 {
 	struct ifmultiaddr *ll_ifma;
 
 	if (ifp != NULL && ifma->ifma_ifp != NULL) {
 		KASSERT(ifma->ifma_ifp == ifp,
 		    ("%s: inconsistent ifp %p", __func__, ifp));
 		IF_ADDR_WLOCK_ASSERT(ifp);
 	}
 
 	ifp = ifma->ifma_ifp;
 	MCDPRINTF("%s freeing %p from %s \n", __func__, ifma, ifp ? ifp->if_xname : "");
 
 	/*
 	 * If the ifnet is detaching, null out references to ifnet,
 	 * so that upper protocol layers will notice, and not attempt
 	 * to obtain locks for an ifnet which no longer exists. The
 	 * routing socket announcement must happen before the ifnet
 	 * instance is detached from the system.
 	 */
 	if (detaching) {
 #ifdef DIAGNOSTIC
 		printf("%s: detaching ifnet instance %p\n", __func__, ifp);
 #endif
 		/*
 		 * ifp may already be nulled out if we are being reentered
 		 * to delete the ll_ifma.
 		 */
 		if (ifp != NULL) {
 			rt_newmaddrmsg(RTM_DELMADDR, ifma);
 			ifma->ifma_ifp = NULL;
 		}
 	}
 
 	if (--ifma->ifma_refcount > 0)
 		return 0;
 
 	if (ifp != NULL && detaching == 0 && (ifma->ifma_flags & IFMA_F_ENQUEUED)) {
 		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
 		ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 	}
 	/*
 	 * If this ifma is a network-layer ifma, a link-layer ifma may
 	 * have been associated with it. Release it first if so.
 	 */
 	ll_ifma = ifma->ifma_llifma;
 	if (ll_ifma != NULL) {
 		KASSERT(ifma->ifma_lladdr != NULL,
 		    ("%s: llifma w/o lladdr", __func__));
 		if (detaching)
 			ll_ifma->ifma_ifp = NULL;	/* XXX */
 		if (--ll_ifma->ifma_refcount == 0) {
 			if (ifp != NULL) {
 				if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) {
 					CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr,
 						ifma_link);
 					ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 				}
 			}
 			if_freemulti(ll_ifma);
 		}
 	}
 #ifdef INVARIANTS
 	if (ifp) {
 		struct ifmultiaddr *ifmatmp;
 
 		CK_STAILQ_FOREACH(ifmatmp, &ifp->if_multiaddrs, ifma_link)
 			MPASS(ifma != ifmatmp);
 	}
 #endif
 	if_freemulti(ifma);
 	/*
 	 * The last reference to this instance of struct ifmultiaddr
 	 * was released; the hardware should be notified of this change.
 	 */
 	return 1;
 }
 
 /*
  * Set the link layer address on an interface.
  *
  * At this time we only support certain types of interfaces,
  * and we don't allow the length of the address to change.
  *
  * Set noinline to be dtrace-friendly
  */
 __noinline int
 if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
 {
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 	struct ifreq ifr;
 	struct epoch_tracker et;
 	int rc;
 
 	rc = 0;
 	NET_EPOCH_ENTER(et);
 	ifa = ifp->if_addr;
 	if (ifa == NULL) {
 		rc = EINVAL;
 		goto out;
 	}
 
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	if (sdl == NULL) {
 		rc = EINVAL;
 		goto out;
 	}
 	if (len != sdl->sdl_alen) {	/* don't allow length to change */
 		rc = EINVAL;
 		goto out;
 	}
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_XETHER:
 	case IFT_L2VLAN:
 	case IFT_BRIDGE:
 	case IFT_IEEE8023ADLAG:
 		bcopy(lladdr, LLADDR(sdl), len);
 		break;
 	default:
 		rc = ENODEV;
 		goto out;
 	}
 
 	/*
 	 * If the interface is already up, we need
 	 * to re-init it in order to reprogram its
 	 * address filter.
 	 */
 	NET_EPOCH_EXIT(et);
 	if ((ifp->if_flags & IFF_UP) != 0) {
 		if (ifp->if_ioctl) {
 			ifp->if_flags &= ~IFF_UP;
 			ifr.ifr_flags = ifp->if_flags & 0xffff;
 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
 			(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 			ifp->if_flags |= IFF_UP;
 			ifr.ifr_flags = ifp->if_flags & 0xffff;
 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
 			(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 		}
 	}
 	EVENTHANDLER_INVOKE(iflladdr_event, ifp);
 	return (0);
  out:
 	NET_EPOCH_EXIT(et);
 	return (rc);
 }
 
 /*
  * Compat function for handling basic encapsulation requests.
  * Not converted stacks (FDDI, IB, ..) supports traditional
  * output model: ARP (and other similar L2 protocols) are handled
  * inside output routine, arpresolve/nd6_resolve() returns MAC
  * address instead of full prepend.
  *
  * This function creates calculated header==MAC for IPv4/IPv6 and
  * returns EAFNOSUPPORT (which is then handled in ARP code) for other
  * address families.
  */
 static int
 if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req)
 {
 
 	if (req->rtype != IFENCAP_LL)
 		return (EOPNOTSUPP);
 
 	if (req->bufsize < req->lladdr_len)
 		return (ENOMEM);
 
 	switch (req->family) {
 	case AF_INET:
 	case AF_INET6:
 		break;
 	default:
 		return (EAFNOSUPPORT);
 	}
 
 	/* Copy lladdr to storage as is */
 	memmove(req->buf, req->lladdr, req->lladdr_len);
 	req->bufsize = req->lladdr_len;
 	req->lladdr_off = 0;
 
 	return (0);
 }
 
 /*
  * Tunnel interfaces can nest, also they may cause infinite recursion
  * calls when misconfigured. We'll prevent this by detecting loops.
  * High nesting level may cause stack exhaustion. We'll prevent this
  * by introducing upper limit.
  *
  * Return 0, if tunnel nesting count is equal or less than limit.
  */
 int
 if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, uint32_t cookie,
     int limit)
 {
 	struct m_tag *mtag;
 	int count;
 
 	count = 1;
 	mtag = NULL;
 	while ((mtag = m_tag_locate(m, cookie, 0, mtag)) != NULL) {
 		if (*(struct ifnet **)(mtag + 1) == ifp) {
 			log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp));
 			return (EIO);
 		}
 		count++;
 	}
 	if (count > limit) {
 		log(LOG_NOTICE,
 		    "%s: if_output recursively called too many times(%d)\n",
 		    if_name(ifp), count);
 		return (EIO);
 	}
 	mtag = m_tag_alloc(cookie, 0, sizeof(struct ifnet *), M_NOWAIT);
 	if (mtag == NULL)
 		return (ENOMEM);
 	*(struct ifnet **)(mtag + 1) = ifp;
 	m_tag_prepend(m, mtag);
 	return (0);
 }
 
 /*
  * Get the link layer address that was read from the hardware at attach.
  *
  * This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type
  * their component interfaces as IFT_IEEE8023ADLAG.
  */
 int
 if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr)
 {
 
 	if (ifp->if_hw_addr == NULL)
 		return (ENODEV);
 
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_IEEE8023ADLAG:
 		bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen);
 		return (0);
 	default:
 		return (ENODEV);
 	}
 }
 
 /*
  * The name argument must be a pointer to storage which will last as
  * long as the interface does.  For physical devices, the result of
  * device_get_name(dev) is a good choice and for pseudo-devices a
  * static string works well.
  */
 void
 if_initname(struct ifnet *ifp, const char *name, int unit)
 {
 	ifp->if_dname = name;
 	ifp->if_dunit = unit;
 	if (unit != IF_DUNIT_NONE)
 		snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
 	else
 		strlcpy(ifp->if_xname, name, IFNAMSIZ);
 }
 
 int
 if_printf(struct ifnet *ifp, const char *fmt, ...)
 {
 	char if_fmt[256];
 	va_list ap;
 
 	snprintf(if_fmt, sizeof(if_fmt), "%s: %s", ifp->if_xname, fmt);
 	va_start(ap, fmt);
 	vlog(LOG_INFO, if_fmt, ap);
 	va_end(ap);
 	return (0);
 }
 
 void
 if_start(struct ifnet *ifp)
 {
 
 	(*(ifp)->if_start)(ifp);
 }
 
 /*
  * Backwards compatibility interface for drivers 
  * that have not implemented it
  */
 static int
 if_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	int error;
 
 	IFQ_HANDOFF(ifp, m, error);
 	return (error);
 }
 
 static void
 if_input_default(struct ifnet *ifp __unused, struct mbuf *m)
 {
 
 	m_freem(m);
 }
 
 int
 if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust)
 {
 	int active = 0;
 
 	IF_LOCK(ifq);
 	if (_IF_QFULL(ifq)) {
 		IF_UNLOCK(ifq);
 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 		m_freem(m);
 		return (0);
 	}
 	if (ifp != NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust);
 		if (m->m_flags & (M_BCAST|M_MCAST))
 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 		active = ifp->if_drv_flags & IFF_DRV_OACTIVE;
 	}
 	_IF_ENQUEUE(ifq, m);
 	IF_UNLOCK(ifq);
 	if (ifp != NULL && !active)
 		(*(ifp)->if_start)(ifp);
 	return (1);
 }
 
 void
 if_register_com_alloc(u_char type,
     if_com_alloc_t *a, if_com_free_t *f)
 {
 	
 	KASSERT(if_com_alloc[type] == NULL,
 	    ("if_register_com_alloc: %d already registered", type));
 	KASSERT(if_com_free[type] == NULL,
 	    ("if_register_com_alloc: %d free already registered", type));
 
 	if_com_alloc[type] = a;
 	if_com_free[type] = f;
 }
 
 void
 if_deregister_com_alloc(u_char type)
 {
 	
 	KASSERT(if_com_alloc[type] != NULL,
 	    ("if_deregister_com_alloc: %d not registered", type));
 	KASSERT(if_com_free[type] != NULL,
 	    ("if_deregister_com_alloc: %d free not registered", type));
 	if_com_alloc[type] = NULL;
 	if_com_free[type] = NULL;
 }
 
 /* API for driver access to network stack owned ifnet.*/
 uint64_t
 if_setbaudrate(struct ifnet *ifp, uint64_t baudrate)
 {
 	uint64_t oldbrate;
 
 	oldbrate = ifp->if_baudrate;
 	ifp->if_baudrate = baudrate;
 	return (oldbrate);
 }
 
 uint64_t
 if_getbaudrate(if_t ifp)
 {
 
 	return (((struct ifnet *)ifp)->if_baudrate);
 }
 
 int
 if_setcapabilities(if_t ifp, int capabilities)
 {
 	((struct ifnet *)ifp)->if_capabilities = capabilities;
 	return (0);
 }
 
 int
 if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit)
 {
 	((struct ifnet *)ifp)->if_capabilities |= setbit;
 	((struct ifnet *)ifp)->if_capabilities &= ~clearbit;
 
 	return (0);
 }
 
 int
 if_getcapabilities(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_capabilities;
 }
 
 int 
 if_setcapenable(if_t ifp, int capabilities)
 {
 	((struct ifnet *)ifp)->if_capenable = capabilities;
 	return (0);
 }
 
 int 
 if_setcapenablebit(if_t ifp, int setcap, int clearcap)
 {
 	if(setcap) 
 		((struct ifnet *)ifp)->if_capenable |= setcap;
 	if(clearcap)
 		((struct ifnet *)ifp)->if_capenable &= ~clearcap;
 
 	return (0);
 }
 
 const char *
 if_getdname(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_dname;
 }
 
 int 
 if_togglecapenable(if_t ifp, int togglecap)
 {
 	((struct ifnet *)ifp)->if_capenable ^= togglecap;
 	return (0);
 }
 
 int
 if_getcapenable(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_capenable;
 }
 
 /*
  * This is largely undesirable because it ties ifnet to a device, but does
  * provide flexiblity for an embedded product vendor. Should be used with
  * the understanding that it violates the interface boundaries, and should be
  * a last resort only.
  */
 int
 if_setdev(if_t ifp, void *dev)
 {
 	return (0);
 }
 
 int
 if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags)
 {
 	((struct ifnet *)ifp)->if_drv_flags |= set_flags;
 	((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags;
 
 	return (0);
 }
 
 int
 if_getdrvflags(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_drv_flags;
 }
  
 int
 if_setdrvflags(if_t ifp, int flags)
 {
 	((struct ifnet *)ifp)->if_drv_flags = flags;
 	return (0);
 }
 
 
 int
 if_setflags(if_t ifp, int flags)
 {
 	((struct ifnet *)ifp)->if_flags = flags;
 	return (0);
 }
 
 int
 if_setflagbits(if_t ifp, int set, int clear)
 {
 	((struct ifnet *)ifp)->if_flags |= set;
 	((struct ifnet *)ifp)->if_flags &= ~clear;
 
 	return (0);
 }
 
 int
 if_getflags(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_flags;
 }
 
 int
 if_clearhwassist(if_t ifp)
 {
 	((struct ifnet *)ifp)->if_hwassist = 0;
 	return (0);
 }
 
 int
 if_sethwassistbits(if_t ifp, int toset, int toclear)
 {
 	((struct ifnet *)ifp)->if_hwassist |= toset;
 	((struct ifnet *)ifp)->if_hwassist &= ~toclear;
 
 	return (0);
 }
 
 int
 if_sethwassist(if_t ifp, int hwassist_bit)
 {
 	((struct ifnet *)ifp)->if_hwassist = hwassist_bit;
 	return (0);
 }
 
 int
 if_gethwassist(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_hwassist;
 }
 
 int
 if_setmtu(if_t ifp, int mtu)
 {
 	((struct ifnet *)ifp)->if_mtu = mtu;
 	return (0);
 }
 
 int
 if_getmtu(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_mtu;
 }
 
 int
 if_getmtu_family(if_t ifp, int family)
 {
 	struct domain *dp;
 
 	for (dp = domains; dp; dp = dp->dom_next) {
 		if (dp->dom_family == family && dp->dom_ifmtu != NULL)
 			return (dp->dom_ifmtu((struct ifnet *)ifp));
 	}
 
 	return (((struct ifnet *)ifp)->if_mtu);
 }
 
 int
 if_setsoftc(if_t ifp, void *softc)
 {
 	((struct ifnet *)ifp)->if_softc = softc;
 	return (0);
 }
 
 void *
 if_getsoftc(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_softc;
 }
 
 void 
 if_setrcvif(struct mbuf *m, if_t ifp)
 {
+
+	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	m->m_pkthdr.rcvif = (struct ifnet *)ifp;
 }
 
 void 
 if_setvtag(struct mbuf *m, uint16_t tag)
 {
 	m->m_pkthdr.ether_vtag = tag;	
 }
 
 uint16_t
 if_getvtag(struct mbuf *m)
 {
 
 	return (m->m_pkthdr.ether_vtag);
 }
 
 int
 if_sendq_empty(if_t ifp)
 {
 	return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd);
 }
 
 struct ifaddr *
 if_getifaddr(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_addr;
 }
 
 int
 if_getamcount(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_amcount;
 }
 
 
 int
 if_setsendqready(if_t ifp)
 {
 	IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd);
 	return (0);
 }
 
 int
 if_setsendqlen(if_t ifp, int tx_desc_count)
 {
 	IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count);
 	((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count;
 
 	return (0);
 }
 
 int
 if_vlantrunkinuse(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0;
 }
 
 int
 if_input(if_t ifp, struct mbuf* sendmp)
 {
 	(*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp);
 	return (0);
 
 }
 
 /* XXX */
 #ifndef ETH_ADDR_LEN
 #define ETH_ADDR_LEN 6
 #endif
 
 int 
 if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max)
 {
 	struct ifmultiaddr *ifma;
 	uint8_t *lmta = (uint8_t *)mta;
 	int mcnt = 0;
 
 	CK_STAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
 
 		if (mcnt == max)
 			break;
 
 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
 		    &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN);
 		mcnt++;
 	}
 	*cnt = mcnt;
 
 	return (0);
 }
 
 int
 if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max)
 {
 	int error;
 
 	if_maddr_rlock(ifp);
 	error = if_setupmultiaddr(ifp, mta, cnt, max);
 	if_maddr_runlock(ifp);
 	return (error);
 }
 
 int
 if_multiaddr_count(if_t ifp, int max)
 {
 	struct ifmultiaddr *ifma;
 	int count;
 
 	count = 0;
 	if_maddr_rlock(ifp);
 	CK_STAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
 		count++;
 		if (count == max)
 			break;
 	}
 	if_maddr_runlock(ifp);
 	return (count);
 }
 
 int
 if_multi_apply(struct ifnet *ifp, int (*filter)(void *, struct ifmultiaddr *, int), void *arg)
 {
 	struct ifmultiaddr *ifma;
 	int cnt = 0;
 
 	if_maddr_rlock(ifp);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
 		cnt += filter(arg, ifma, cnt);
 	if_maddr_runlock(ifp);
 	return (cnt);
 }
 
 struct mbuf *
 if_dequeue(if_t ifp)
 {
 	struct mbuf *m;
 	IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m);
 
 	return (m);
 }
 
 int
 if_sendq_prepend(if_t ifp, struct mbuf *m)
 {
 	IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m);
 	return (0);
 }
 
 int
 if_setifheaderlen(if_t ifp, int len)
 {
 	((struct ifnet *)ifp)->if_hdrlen = len;
 	return (0);
 }
 
 caddr_t
 if_getlladdr(if_t ifp)
 {
 	return (IF_LLADDR((struct ifnet *)ifp));
 }
 
 void *
 if_gethandle(u_char type)
 {
 	return (if_alloc(type));
 }
 
 void
 if_bpfmtap(if_t ifh, struct mbuf *m)
 {
 	struct ifnet *ifp = (struct ifnet *)ifh;
 
 	BPF_MTAP(ifp, m);
 }
 
 void
 if_etherbpfmtap(if_t ifh, struct mbuf *m)
 {
 	struct ifnet *ifp = (struct ifnet *)ifh;
 
 	ETHER_BPF_MTAP(ifp, m);
 }
 
 void
 if_vlancap(if_t ifh)
 {
 	struct ifnet *ifp = (struct ifnet *)ifh;
 	VLAN_CAPABILITIES(ifp);
 }
 
 int
 if_sethwtsomax(if_t ifp, u_int if_hw_tsomax)
 {
 
 	((struct ifnet *)ifp)->if_hw_tsomax = if_hw_tsomax;
         return (0);
 }
 
 int
 if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount)
 {
 
 	((struct ifnet *)ifp)->if_hw_tsomaxsegcount = if_hw_tsomaxsegcount;
         return (0);
 }
 
 int
 if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize)
 {
 
 	((struct ifnet *)ifp)->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize;
         return (0);
 }
 
 u_int
 if_gethwtsomax(if_t ifp)
 {
 
 	return (((struct ifnet *)ifp)->if_hw_tsomax);
 }
 
 u_int
 if_gethwtsomaxsegcount(if_t ifp)
 {
 
 	return (((struct ifnet *)ifp)->if_hw_tsomaxsegcount);
 }
 
 u_int
 if_gethwtsomaxsegsize(if_t ifp)
 {
 
 	return (((struct ifnet *)ifp)->if_hw_tsomaxsegsize);
 }
 
 void
 if_setinitfn(if_t ifp, void (*init_fn)(void *))
 {
 	((struct ifnet *)ifp)->if_init = init_fn;
 }
 
 void
 if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t))
 {
 	((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn;
 }
 
 void
 if_setstartfn(if_t ifp, void (*start_fn)(if_t))
 {
 	((struct ifnet *)ifp)->if_start = (void *)start_fn;
 }
 
 void
 if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn)
 {
 	((struct ifnet *)ifp)->if_transmit = start_fn;
 }
 
 void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn)
 {
 	((struct ifnet *)ifp)->if_qflush = flush_fn;
 	
 }
 
 void
 if_setgetcounterfn(if_t ifp, if_get_counter_t fn)
 {
 
 	ifp->if_get_counter = fn;
 }
 
 /* Revisit these - These are inline functions originally. */
 int
 drbr_inuse_drv(if_t ifh, struct buf_ring *br)
 {
 	return drbr_inuse(ifh, br);
 }
 
 struct mbuf*
 drbr_dequeue_drv(if_t ifh, struct buf_ring *br)
 {
 	return drbr_dequeue(ifh, br);
 }
 
 int
 drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br)
 {
 	return drbr_needs_enqueue(ifh, br);
 }
 
 int
 drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m)
 {
 	return drbr_enqueue(ifh, br, m);
 
 }
Index: head/sys/net/if_ethersubr.c
===================================================================
--- head/sys/net/if_ethersubr.c	(revision 348253)
+++ head/sys/net/if_ethersubr.c	(revision 348254)
@@ -1,1443 +1,1444 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if_ethersubr.c	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_netgraph.h"
 #include "opt_mbuf_profiling.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/eventhandler.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/priv.h>
 #include <sys/random.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/uuid.h>
 
 #include <net/ieee_oui.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_arp.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/if_llc.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if_bridgevar.h>
 #include <net/if_vlan_var.h>
 #include <net/if_llatbl.h>
 #include <net/pfil.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netpfil/pf/pf_mtag.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip_carp.h>
 #include <netinet/ip_var.h>
 #endif
 #ifdef INET6
 #include <netinet6/nd6.h>
 #endif
 #include <security/mac/mac_framework.h>
 
 #include <crypto/sha1.h>
 
 #ifdef CTASSERT
 CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2);
 CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN);
 #endif
 
 VNET_DEFINE(pfil_head_t, link_pfil_head);	/* Packet filter hooks */
 
 /* netgraph node hooks for ng_ether(4) */
 void	(*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp);
 void	(*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m);
 int	(*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp);
 void	(*ng_ether_attach_p)(struct ifnet *ifp);
 void	(*ng_ether_detach_p)(struct ifnet *ifp);
 
 void	(*vlan_input_p)(struct ifnet *, struct mbuf *);
 
 /* if_bridge(4) support */
 void	(*bridge_dn_p)(struct mbuf *, struct ifnet *);
 
 /* if_lagg(4) support */
 struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); 
 
 static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] =
 			{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
 static	int ether_resolvemulti(struct ifnet *, struct sockaddr **,
 		struct sockaddr *);
 #ifdef VIMAGE
 static	void ether_reassign(struct ifnet *, struct vnet *, char *);
 #endif
 static	int ether_requestencap(struct ifnet *, struct if_encap_req *);
 
 
 #define senderr(e) do { error = (e); goto bad;} while (0)
 
 static void
 update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst)
 {
 	int csum_flags = 0;
 
 	if (src->m_pkthdr.csum_flags & CSUM_IP)
 		csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID);
 	if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
 		csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
 	if (src->m_pkthdr.csum_flags & CSUM_SCTP)
 		csum_flags |= CSUM_SCTP_VALID;
 	dst->m_pkthdr.csum_flags |= csum_flags;
 	if (csum_flags & CSUM_DATA_VALID)
 		dst->m_pkthdr.csum_data = 0xffff;
 }
 
 /*
  * Handle link-layer encapsulation requests.
  */
 static int
 ether_requestencap(struct ifnet *ifp, struct if_encap_req *req)
 {
 	struct ether_header *eh;
 	struct arphdr *ah;
 	uint16_t etype;
 	const u_char *lladdr;
 
 	if (req->rtype != IFENCAP_LL)
 		return (EOPNOTSUPP);
 
 	if (req->bufsize < ETHER_HDR_LEN)
 		return (ENOMEM);
 
 	eh = (struct ether_header *)req->buf;
 	lladdr = req->lladdr;
 	req->lladdr_off = 0;
 
 	switch (req->family) {
 	case AF_INET:
 		etype = htons(ETHERTYPE_IP);
 		break;
 	case AF_INET6:
 		etype = htons(ETHERTYPE_IPV6);
 		break;
 	case AF_ARP:
 		ah = (struct arphdr *)req->hdata;
 		ah->ar_hrd = htons(ARPHRD_ETHER);
 
 		switch(ntohs(ah->ar_op)) {
 		case ARPOP_REVREQUEST:
 		case ARPOP_REVREPLY:
 			etype = htons(ETHERTYPE_REVARP);
 			break;
 		case ARPOP_REQUEST:
 		case ARPOP_REPLY:
 		default:
 			etype = htons(ETHERTYPE_ARP);
 			break;
 		}
 
 		if (req->flags & IFENCAP_FLAG_BROADCAST)
 			lladdr = ifp->if_broadcastaddr;
 		break;
 	default:
 		return (EAFNOSUPPORT);
 	}
 
 	memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type));
 	memcpy(eh->ether_dhost, lladdr, ETHER_ADDR_LEN);
 	memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 	req->bufsize = sizeof(struct ether_header);
 
 	return (0);
 }
 
 
 static int
 ether_resolve_addr(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro, u_char *phdr,
 	uint32_t *pflags, struct llentry **plle)
 {
 	struct ether_header *eh;
 	uint32_t lleflags = 0;
 	int error = 0;
 #if defined(INET) || defined(INET6)
 	uint16_t etype;
 #endif
 
 	if (plle)
 		*plle = NULL;
 	eh = (struct ether_header *)phdr;
 
 	switch (dst->sa_family) {
 #ifdef INET
 	case AF_INET:
 		if ((m->m_flags & (M_BCAST | M_MCAST)) == 0)
 			error = arpresolve(ifp, 0, m, dst, phdr, &lleflags,
 			    plle);
 		else {
 			if (m->m_flags & M_BCAST)
 				memcpy(eh->ether_dhost, ifp->if_broadcastaddr,
 				    ETHER_ADDR_LEN);
 			else {
 				const struct in_addr *a;
 				a = &(((const struct sockaddr_in *)dst)->sin_addr);
 				ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost);
 			}
 			etype = htons(ETHERTYPE_IP);
 			memcpy(&eh->ether_type, &etype, sizeof(etype));
 			memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if ((m->m_flags & M_MCAST) == 0)
 			error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags,
 			    plle);
 		else {
 			const struct in6_addr *a6;
 			a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr);
 			ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost);
 			etype = htons(ETHERTYPE_IPV6);
 			memcpy(&eh->ether_type, &etype, sizeof(etype));
 			memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 		}
 		break;
 #endif
 	default:
 		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
 		if (m != NULL)
 			m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 
 	if (error == EHOSTDOWN) {
 		if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0)
 			error = EHOSTUNREACH;
 	}
 
 	if (error != 0)
 		return (error);
 
 	*pflags = RT_MAY_LOOP;
 	if (lleflags & LLE_IFADDR)
 		*pflags |= RT_L2_ME;
 
 	return (0);
 }
 
 /*
  * Ethernet output routine.
  * Encapsulate a packet of type family for the local net.
  * Use trailer local net encapsulation if enough data in first
  * packet leaves a multiple of 512 bytes of data in remainder.
  */
 int
 ether_output(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
 	int error = 0;
 	char linkhdr[ETHER_HDR_LEN], *phdr;
 	struct ether_header *eh;
 	struct pf_mtag *t;
 	int loop_copy = 1;
 	int hlen;	/* link layer header length */
 	uint32_t pflags;
 	struct llentry *lle = NULL;
 	int addref = 0;
 
 	phdr = NULL;
 	pflags = 0;
 	if (ro != NULL) {
 		/* XXX BPF uses ro_prepend */
 		if (ro->ro_prepend != NULL) {
 			phdr = ro->ro_prepend;
 			hlen = ro->ro_plen;
 		} else if (!(m->m_flags & (M_BCAST | M_MCAST))) {
 			if ((ro->ro_flags & RT_LLE_CACHE) != 0) {
 				lle = ro->ro_lle;
 				if (lle != NULL &&
 				    (lle->la_flags & LLE_VALID) == 0) {
 					LLE_FREE(lle);
 					lle = NULL;	/* redundant */
 					ro->ro_lle = NULL;
 				}
 				if (lle == NULL) {
 					/* if we lookup, keep cache */
 					addref = 1;
 				} else
 					/*
 					 * Notify LLE code that
 					 * the entry was used
 					 * by datapath.
 					 */
 					llentry_mark_used(lle);
 			}
 			if (lle != NULL) {
 				phdr = lle->r_linkdata;
 				hlen = lle->r_hdrlen;
 				pflags = lle->r_flags;
 			}
 		}
 	}
 
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error)
 		senderr(error);
 #endif
 
 	M_PROFILE(m);
 	if (ifp->if_flags & IFF_MONITOR)
 		senderr(ENETDOWN);
 	if (!((ifp->if_flags & IFF_UP) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING)))
 		senderr(ENETDOWN);
 
 	if (phdr == NULL) {
 		/* No prepend data supplied. Try to calculate ourselves. */
 		phdr = linkhdr;
 		hlen = ETHER_HDR_LEN;
 		error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags,
 		    addref ? &lle : NULL);
 		if (addref && lle != NULL)
 			ro->ro_lle = lle;
 		if (error != 0)
 			return (error == EWOULDBLOCK ? 0 : error);
 	}
 
 	if ((pflags & RT_L2_ME) != 0) {
 		update_mbuf_csumflags(m, m);
 		return (if_simloop(ifp, m, dst->sa_family, 0));
 	}
 	loop_copy = pflags & RT_MAY_LOOP;
 
 	/*
 	 * Add local net header.  If no space in first mbuf,
 	 * allocate another.
 	 *
 	 * Note that we do prepend regardless of RT_HAS_HEADER flag.
 	 * This is done because BPF code shifts m_data pointer
 	 * to the end of ethernet header prior to calling if_output().
 	 */
 	M_PREPEND(m, hlen, M_NOWAIT);
 	if (m == NULL)
 		senderr(ENOBUFS);
 	if ((pflags & RT_HAS_HEADER) == 0) {
 		eh = mtod(m, struct ether_header *);
 		memcpy(eh, phdr, hlen);
 	}
 
 	/*
 	 * If a simplex interface, and the packet is being sent to our
 	 * Ethernet address or a broadcast address, loopback a copy.
 	 * XXX To make a simplex device behave exactly like a duplex
 	 * device, we should copy in the case of sending to our own
 	 * ethernet address (thus letting the original actually appear
 	 * on the wire). However, we don't do that here for security
 	 * reasons and compatibility with the original behavior.
 	 */
 	if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) &&
 	    ((t = pf_find_mtag(m)) == NULL || !t->routed)) {
 		struct mbuf *n;
 
 		/*
 		 * Because if_simloop() modifies the packet, we need a
 		 * writable copy through m_dup() instead of a readonly
 		 * one as m_copy[m] would give us. The alternative would
 		 * be to modify if_simloop() to handle the readonly mbuf,
 		 * but performancewise it is mostly equivalent (trading
 		 * extra data copying vs. extra locking).
 		 *
 		 * XXX This is a local workaround.  A number of less
 		 * often used kernel parts suffer from the same bug.
 		 * See PR kern/105943 for a proposed general solution.
 		 */
 		if ((n = m_dup(m, M_NOWAIT)) != NULL) {
 			update_mbuf_csumflags(m, n);
 			(void)if_simloop(ifp, n, dst->sa_family, hlen);
 		} else
 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 	}
 
        /*
 	* Bridges require special output handling.
 	*/
 	if (ifp->if_bridge) {
 		BRIDGE_OUTPUT(ifp, m, error);
 		return (error);
 	}
 
 #if defined(INET) || defined(INET6)
 	if (ifp->if_carp &&
 	    (error = (*carp_output_p)(ifp, m, dst)))
 		goto bad;
 #endif
 
 	/* Handle ng_ether(4) processing, if any */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_output_p != NULL,
 		    ("ng_ether_output_p is NULL"));
 		if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) {
 bad:			if (m != NULL)
 				m_freem(m);
 			return (error);
 		}
 		if (m == NULL)
 			return (0);
 	}
 
 	/* Continue with link-layer output */
 	return ether_output_frame(ifp, m);
 }
 
 static bool
 ether_set_pcp(struct mbuf **mp, struct ifnet *ifp, uint8_t pcp)
 {
 	struct ether_header *eh;
 
 	eh = mtod(*mp, struct ether_header *);
 	if (ntohs(eh->ether_type) == ETHERTYPE_VLAN ||
 	    ether_8021q_frame(mp, ifp, ifp, 0, pcp))
 		return (true);
 	if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	return (false);
 }
 
 /*
  * Ethernet link layer output routine to send a raw frame to the device.
  *
  * This assumes that the 14 byte Ethernet header is present and contiguous
  * in the first mbuf (if BRIDGE'ing).
  */
 int
 ether_output_frame(struct ifnet *ifp, struct mbuf *m)
 {
 	uint8_t pcp;
 
 	pcp = ifp->if_pcp;
 	if (pcp != IFNET_PCP_NONE && ifp->if_type != IFT_L2VLAN &&
 	    !ether_set_pcp(&m, ifp, pcp))
 		return (0);
 
 	if (PFIL_HOOKED_OUT(V_link_pfil_head))
 		switch (pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_OUT,
 		    NULL)) {
 		case PFIL_DROPPED:
 			return (EACCES);
 		case PFIL_CONSUMED:
 			return (0);
 		}
 
 #ifdef EXPERIMENTAL
 #if defined(INET6) && defined(INET)
 	/* draft-ietf-6man-ipv6only-flag */
 	/* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) {
 		struct ether_header *eh;
 
 		eh = mtod(m, struct ether_header *);
 		switch (ntohs(eh->ether_type)) {
 		case ETHERTYPE_IP:
 		case ETHERTYPE_ARP:
 		case ETHERTYPE_REVARP:
 			m_freem(m);
 			return (EAFNOSUPPORT);
 			/* NOTREACHED */
 			break;
 		};
 	}
 #endif
 #endif
 
 	/*
 	 * Queue message on interface, update output statistics if
 	 * successful, and start output if interface not yet active.
 	 */
 	return ((ifp->if_transmit)(ifp, m));
 }
 
 /*
  * Process a received Ethernet packet; the packet is in the
  * mbuf chain m with the ethernet header at the front.
  */
 static void
 ether_input_internal(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ether_header *eh;
 	u_short etype;
 
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		m_freem(m);
 		return;
 	}
 #ifdef DIAGNOSTIC
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n");
 		m_freem(m);
 		return;
 	}
 #endif
 	if (m->m_len < ETHER_HDR_LEN) {
 		/* XXX maybe should pullup? */
 		if_printf(ifp, "discard frame w/o leading ethernet "
 				"header (len %u pkt len %u)\n",
 				m->m_len, m->m_pkthdr.len);
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		return;
 	}
 	eh = mtod(m, struct ether_header *);
 	etype = ntohs(eh->ether_type);
 	random_harvest_queue_ether(m, sizeof(*m));
 
 #ifdef EXPERIMENTAL
 #if defined(INET6) && defined(INET)
 	/* draft-ietf-6man-ipv6only-flag */
 	/* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) {
 
 		switch (etype) {
 		case ETHERTYPE_IP:
 		case ETHERTYPE_ARP:
 		case ETHERTYPE_REVARP:
 			m_freem(m);
 			return;
 			/* NOTREACHED */
 			break;
 		};
 	}
 #endif
 #endif
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 
 	if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 		if (ETHER_IS_BROADCAST(eh->ether_dhost))
 			m->m_flags |= M_BCAST;
 		else
 			m->m_flags |= M_MCAST;
 		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
 	}
 
 #ifdef MAC
 	/*
 	 * Tag the mbuf with an appropriate MAC label before any other
 	 * consumers can get to it.
 	 */
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 
 	/*
 	 * Give bpf a chance at the packet.
 	 */
 	ETHER_BPF_MTAP(ifp, m);
 
 	/*
 	 * If the CRC is still on the packet, trim it off. We do this once
 	 * and once only in case we are re-entered. Nothing else on the
 	 * Ethernet receive path expects to see the FCS.
 	 */
 	if (m->m_flags & M_HASFCS) {
 		m_adj(m, -ETHER_CRC_LEN);
 		m->m_flags &= ~M_HASFCS;
 	}
 
 	if (!(ifp->if_capenable & IFCAP_HWSTATS))
 		if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 
 	/* Allow monitor mode to claim this frame, after stats are updated. */
 	if (ifp->if_flags & IFF_MONITOR) {
 		m_freem(m);
 		CURVNET_RESTORE();
 		return;
 	}
 
 	/* Handle input from a lagg(4) port */
 	if (ifp->if_type == IFT_IEEE8023ADLAG) {
 		KASSERT(lagg_input_p != NULL,
 		    ("%s: if_lagg not loaded!", __func__));
 		m = (*lagg_input_p)(ifp, m);
 		if (m != NULL)
 			ifp = m->m_pkthdr.rcvif;
 		else {
 			CURVNET_RESTORE();
 			return;
 		}
 	}
 
 	/*
 	 * If the hardware did not process an 802.1Q tag, do this now,
 	 * to allow 802.1P priority frames to be passed to the main input
 	 * path correctly.
 	 * TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels.
 	 */
 	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) {
 		struct ether_vlan_header *evl;
 
 		if (m->m_len < sizeof(*evl) &&
 		    (m = m_pullup(m, sizeof(*evl))) == NULL) {
 #ifdef DIAGNOSTIC
 			if_printf(ifp, "cannot pullup VLAN header\n");
 #endif
 			if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 			CURVNET_RESTORE();
 			return;
 		}
 
 		evl = mtod(m, struct ether_vlan_header *);
 		m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
 		m->m_flags |= M_VLANTAG;
 
 		bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
 		    ETHER_HDR_LEN - ETHER_TYPE_LEN);
 		m_adj(m, ETHER_VLAN_ENCAP_LEN);
 		eh = mtod(m, struct ether_header *);
 	}
 
 	M_SETFIB(m, ifp->if_fib);
 
 	/* Allow ng_ether(4) to claim this frame. */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_input_p != NULL,
 		    ("%s: ng_ether_input_p is NULL", __func__));
 		m->m_flags &= ~M_PROMISC;
 		(*ng_ether_input_p)(ifp, &m);
 		if (m == NULL) {
 			CURVNET_RESTORE();
 			return;
 		}
 		eh = mtod(m, struct ether_header *);
 	}
 
 	/*
 	 * Allow if_bridge(4) to claim this frame.
 	 * The BRIDGE_INPUT() macro will update ifp if the bridge changed it
 	 * and the frame should be delivered locally.
 	 */
 	if (ifp->if_bridge != NULL) {
 		m->m_flags &= ~M_PROMISC;
 		BRIDGE_INPUT(ifp, m);
 		if (m == NULL) {
 			CURVNET_RESTORE();
 			return;
 		}
 		eh = mtod(m, struct ether_header *);
 	}
 
 #if defined(INET) || defined(INET6)
 	/*
 	 * Clear M_PROMISC on frame so that carp(4) will see it when the
 	 * mbuf flows up to Layer 3.
 	 * FreeBSD's implementation of carp(4) uses the inprotosw
 	 * to dispatch IPPROTO_CARP. carp(4) also allocates its own
 	 * Ethernet addresses of the form 00:00:5e:00:01:xx, which
 	 * is outside the scope of the M_PROMISC test below.
 	 * TODO: Maintain a hash table of ethernet addresses other than
 	 * ether_dhost which may be active on this ifp.
 	 */
 	if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) {
 		m->m_flags &= ~M_PROMISC;
 	} else
 #endif
 	{
 		/*
 		 * If the frame received was not for our MAC address, set the
 		 * M_PROMISC flag on the mbuf chain. The frame may need to
 		 * be seen by the rest of the Ethernet input path in case of
 		 * re-entry (e.g. bridge, vlan, netgraph) but should not be
 		 * seen by upper protocol layers.
 		 */
 		if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
 		    bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0)
 			m->m_flags |= M_PROMISC;
 	}
 
 	ether_demux(ifp, m);
 	CURVNET_RESTORE();
 }
 
 /*
  * Ethernet input dispatch; by default, direct dispatch here regardless of
  * global configuration.  However, if RSS is enabled, hook up RSS affinity
  * so that when deferred or hybrid dispatch is enabled, we can redistribute
  * load based on RSS.
  *
  * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or
  * not it had already done work distribution via multi-queue.  Then we could
  * direct dispatch in the event load balancing was already complete and
  * handle the case of interfaces with different capabilities better.
  *
  * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions
  * at multiple layers?
  *
  * XXXRW: For now, enable all this only if RSS is compiled in, although it
  * works fine without RSS.  Need to characterise the performance overhead
  * of the detour through the netisr code in the event the result is always
  * direct dispatch.
  */
 static void
 ether_nh_input(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	KASSERT(m->m_pkthdr.rcvif != NULL,
 	    ("%s: NULL interface pointer", __func__));
 	ether_input_internal(m->m_pkthdr.rcvif, m);
 }
 
 static struct netisr_handler	ether_nh = {
 	.nh_name = "ether",
 	.nh_handler = ether_nh_input,
 	.nh_proto = NETISR_ETHER,
 #ifdef RSS
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_DIRECT,
 	.nh_m2cpuid = rss_m2cpuid,
 #else
 	.nh_policy = NETISR_POLICY_SOURCE,
 	.nh_dispatch = NETISR_DISPATCH_DIRECT,
 #endif
 };
 
 static void
 ether_init(__unused void *arg)
 {
 
 	netisr_register(&ether_nh);
 }
 SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL);
 
 static void
 vnet_ether_init(__unused void *arg)
 {
 	struct pfil_head_args args;
 
 	args.pa_version = PFIL_VERSION;
 	args.pa_flags = PFIL_IN | PFIL_OUT;
 	args.pa_type = PFIL_TYPE_ETHERNET;
 	args.pa_headname = PFIL_ETHER_NAME;
 	V_link_pfil_head = pfil_head_register(&args);
 
 #ifdef VIMAGE
 	netisr_register_vnet(&ether_nh);
 #endif
 }
 VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY,
     vnet_ether_init, NULL);
  
 #ifdef VIMAGE
 static void
 vnet_ether_pfil_destroy(__unused void *arg)
 {
 
 	pfil_head_unregister(V_link_pfil_head);
 }
 VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY,
     vnet_ether_pfil_destroy, NULL);
 
 static void
 vnet_ether_destroy(__unused void *arg)
 {
 
 	netisr_unregister_vnet(&ether_nh);
 }
 VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY,
     vnet_ether_destroy, NULL);
 #endif
 
 
 
 static void
 ether_input(struct ifnet *ifp, struct mbuf *m)
 {
 
 	struct mbuf *mn;
 
 	/*
 	 * The drivers are allowed to pass in a chain of packets linked with
 	 * m_nextpkt. We split them up into separate packets here and pass
 	 * them up. This allows the drivers to amortize the receive lock.
 	 */
 	while (m) {
 		mn = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 
 		/*
 		 * We will rely on rcvif being set properly in the deferred context,
 		 * so assert it is correct here.
 		 */
+		MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 		KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p "
 		    "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp));
 		CURVNET_SET_QUIET(ifp->if_vnet);
 		netisr_dispatch(NETISR_ETHER, m);
 		CURVNET_RESTORE();
 		m = mn;
 	}
 }
 
 /*
  * Upper layer processing for a received Ethernet packet.
  */
 void
 ether_demux(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ether_header *eh;
 	int i, isr;
 	u_short ether_type;
 
 	KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__));
 
 	/* Do not grab PROMISC frames in case we are re-entered. */
 	if (PFIL_HOOKED_IN(V_link_pfil_head) && !(m->m_flags & M_PROMISC)) {
 		i = pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_IN, NULL);
 		if (i != 0 || m == NULL)
 			return;
 	}
 
 	eh = mtod(m, struct ether_header *);
 	ether_type = ntohs(eh->ether_type);
 
 	/*
 	 * If this frame has a VLAN tag other than 0, call vlan_input()
 	 * if its module is loaded. Otherwise, drop.
 	 */
 	if ((m->m_flags & M_VLANTAG) &&
 	    EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) {
 		if (ifp->if_vlantrunk == NULL) {
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			m_freem(m);
 			return;
 		}
 		KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!",
 		    __func__));
 		/* Clear before possibly re-entering ether_input(). */
 		m->m_flags &= ~M_PROMISC;
 		(*vlan_input_p)(ifp, m);
 		return;
 	}
 
 	/*
 	 * Pass promiscuously received frames to the upper layer if the user
 	 * requested this by setting IFF_PPROMISC. Otherwise, drop them.
 	 */
 	if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) {
 		m_freem(m);
 		return;
 	}
 
 	/*
 	 * Reset layer specific mbuf flags to avoid confusing upper layers.
 	 * Strip off Ethernet header.
 	 */
 	m->m_flags &= ~M_VLANTAG;
 	m_clrprotoflags(m);
 	m_adj(m, ETHER_HDR_LEN);
 
 	/*
 	 * Dispatch frame to upper layer.
 	 */
 	switch (ether_type) {
 #ifdef INET
 	case ETHERTYPE_IP:
 		isr = NETISR_IP;
 		break;
 
 	case ETHERTYPE_ARP:
 		if (ifp->if_flags & IFF_NOARP) {
 			/* Discard packet if ARP is disabled on interface */
 			m_freem(m);
 			return;
 		}
 		isr = NETISR_ARP;
 		break;
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 		isr = NETISR_IPV6;
 		break;
 #endif
 	default:
 		goto discard;
 	}
 	netisr_dispatch(isr, m);
 	return;
 
 discard:
 	/*
 	 * Packet is to be discarded.  If netgraph is present,
 	 * hand the packet to it for last chance processing;
 	 * otherwise dispose of it.
 	 */
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_input_orphan_p != NULL,
 		    ("ng_ether_input_orphan_p is NULL"));
 		/*
 		 * Put back the ethernet header so netgraph has a
 		 * consistent view of inbound packets.
 		 */
 		M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
 		(*ng_ether_input_orphan_p)(ifp, m);
 		return;
 	}
 	m_freem(m);
 }
 
 /*
  * Convert Ethernet address to printable (loggable) representation.
  * This routine is for compatibility; it's better to just use
  *
  *	printf("%6D", <pointer to address>, ":");
  *
  * since there's no static buffer involved.
  */
 char *
 ether_sprintf(const u_char *ap)
 {
 	static char etherbuf[18];
 	snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":");
 	return (etherbuf);
 }
 
 /*
  * Perform common duties while attaching to interface list
  */
 void
 ether_ifattach(struct ifnet *ifp, const u_int8_t *lla)
 {
 	int i;
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 
 	ifp->if_addrlen = ETHER_ADDR_LEN;
 	ifp->if_hdrlen = ETHER_HDR_LEN;
 	if_attach(ifp);
 	ifp->if_mtu = ETHERMTU;
 	ifp->if_output = ether_output;
 	ifp->if_input = ether_input;
 	ifp->if_resolvemulti = ether_resolvemulti;
 	ifp->if_requestencap = ether_requestencap;
 #ifdef VIMAGE
 	ifp->if_reassign = ether_reassign;
 #endif
 	if (ifp->if_baudrate == 0)
 		ifp->if_baudrate = IF_Mbps(10);		/* just a default */
 	ifp->if_broadcastaddr = etherbroadcastaddr;
 
 	ifa = ifp->if_addr;
 	KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	sdl->sdl_type = IFT_ETHER;
 	sdl->sdl_alen = ifp->if_addrlen;
 	bcopy(lla, LLADDR(sdl), ifp->if_addrlen);
 
 	if (ifp->if_hw_addr != NULL)
 		bcopy(lla, ifp->if_hw_addr, ifp->if_addrlen);
 
 	bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
 	if (ng_ether_attach_p != NULL)
 		(*ng_ether_attach_p)(ifp);
 
 	/* Announce Ethernet MAC address if non-zero. */
 	for (i = 0; i < ifp->if_addrlen; i++)
 		if (lla[i] != 0)
 			break; 
 	if (i != ifp->if_addrlen)
 		if_printf(ifp, "Ethernet address: %6D\n", lla, ":");
 
 	uuid_ether_add(LLADDR(sdl));
 
 	/* Add necessary bits are setup; announce it now. */
 	EVENTHANDLER_INVOKE(ether_ifattach_event, ifp);
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("ETHERNET", ifp->if_xname, "IFATTACH", NULL);
 }
 
 /*
  * Perform common duties while detaching an Ethernet interface
  */
 void
 ether_ifdetach(struct ifnet *ifp)
 {
 	struct sockaddr_dl *sdl;
 
 	sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr);
 	uuid_ether_del(LLADDR(sdl));
 
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_detach_p != NULL,
 		    ("ng_ether_detach_p is NULL"));
 		(*ng_ether_detach_p)(ifp);
 	}
 
 	bpfdetach(ifp);
 	if_detach(ifp);
 }
 
 #ifdef VIMAGE
 void
 ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused)
 {
 
 	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_detach_p != NULL,
 		    ("ng_ether_detach_p is NULL"));
 		(*ng_ether_detach_p)(ifp);
 	}
 
 	if (ng_ether_attach_p != NULL) {
 		CURVNET_SET_QUIET(new_vnet);
 		(*ng_ether_attach_p)(ifp);
 		CURVNET_RESTORE();
 	}
 }
 #endif
 
 SYSCTL_DECL(_net_link);
 SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet");
 
 #if 0
 /*
  * This is for reference.  We have a table-driven version
  * of the little-endian crc32 generator, which is faster
  * than the double-loop.
  */
 uint32_t
 ether_crc32_le(const uint8_t *buf, size_t len)
 {
 	size_t i;
 	uint32_t crc;
 	int bit;
 	uint8_t data;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
 			carry = (crc ^ data) & 1;
 			crc >>= 1;
 			if (carry)
 				crc = (crc ^ ETHER_CRC_POLY_LE);
 		}
 	}
 
 	return (crc);
 }
 #else
 uint32_t
 ether_crc32_le(const uint8_t *buf, size_t len)
 {
 	static const uint32_t crctab[] = {
 		0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac,
 		0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
 		0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c,
 		0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c
 	};
 	size_t i;
 	uint32_t crc;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		crc ^= buf[i];
 		crc = (crc >> 4) ^ crctab[crc & 0xf];
 		crc = (crc >> 4) ^ crctab[crc & 0xf];
 	}
 
 	return (crc);
 }
 #endif
 
 uint32_t
 ether_crc32_be(const uint8_t *buf, size_t len)
 {
 	size_t i;
 	uint32_t crc, carry;
 	int bit;
 	uint8_t data;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
 			carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01);
 			crc <<= 1;
 			if (carry)
 				crc = (crc ^ ETHER_CRC_POLY_BE) | carry;
 		}
 	}
 
 	return (crc);
 }
 
 int
 ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
 	struct ifaddr *ifa = (struct ifaddr *) data;
 	struct ifreq *ifr = (struct ifreq *) data;
 	int error = 0;
 
 	switch (command) {
 	case SIOCSIFADDR:
 		ifp->if_flags |= IFF_UP;
 
 		switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			ifp->if_init(ifp->if_softc);	/* before arpwhohas */
 			arp_ifinit(ifp, ifa);
 			break;
 #endif
 		default:
 			ifp->if_init(ifp->if_softc);
 			break;
 		}
 		break;
 
 	case SIOCGIFADDR:
 		bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0],
 		    ETHER_ADDR_LEN);
 		break;
 
 	case SIOCSIFMTU:
 		/*
 		 * Set the interface MTU.
 		 */
 		if (ifr->ifr_mtu > ETHERMTU) {
 			error = EINVAL;
 		} else {
 			ifp->if_mtu = ifr->ifr_mtu;
 		}
 		break;
 
 	case SIOCSLANPCP:
 		error = priv_check(curthread, PRIV_NET_SETLANPCP);
 		if (error != 0)
 			break;
 		if (ifr->ifr_lan_pcp > 7 &&
 		    ifr->ifr_lan_pcp != IFNET_PCP_NONE) {
 			error = EINVAL;
 		} else {
 			ifp->if_pcp = ifr->ifr_lan_pcp;
 			/* broadcast event about PCP change */
 			EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP);
 		}
 		break;
 
 	case SIOCGLANPCP:
 		ifr->ifr_lan_pcp = ifp->if_pcp;
 		break;
 
 	default:
 		error = EINVAL;			/* XXX netbsd has ENOTTY??? */
 		break;
 	}
 	return (error);
 }
 
 static int
 ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
 	struct sockaddr *sa)
 {
 	struct sockaddr_dl *sdl;
 #ifdef INET
 	struct sockaddr_in *sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
 	u_char *e_addr;
 
 	switch(sa->sa_family) {
 	case AF_LINK:
 		/*
 		 * No mapping needed. Just check that it's a valid MC address.
 		 */
 		sdl = (struct sockaddr_dl *)sa;
 		e_addr = LLADDR(sdl);
 		if (!ETHER_IS_MULTICAST(e_addr))
 			return EADDRNOTAVAIL;
 		*llsa = NULL;
 		return 0;
 
 #ifdef INET
 	case AF_INET:
 		sin = (struct sockaddr_in *)sa;
 		if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
 			return EADDRNOTAVAIL;
 		sdl = link_init_sdl(ifp, *llsa, IFT_ETHER);
 		sdl->sdl_alen = ETHER_ADDR_LEN;
 		e_addr = LLADDR(sdl);
 		ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr);
 		*llsa = (struct sockaddr *)sdl;
 		return 0;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		sin6 = (struct sockaddr_in6 *)sa;
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 			/*
 			 * An IP6 address of 0 means listen to all
 			 * of the Ethernet multicast address used for IP6.
 			 * (This is used for multicast routers.)
 			 */
 			ifp->if_flags |= IFF_ALLMULTI;
 			*llsa = NULL;
 			return 0;
 		}
 		if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
 			return EADDRNOTAVAIL;
 		sdl = link_init_sdl(ifp, *llsa, IFT_ETHER);
 		sdl->sdl_alen = ETHER_ADDR_LEN;
 		e_addr = LLADDR(sdl);
 		ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr);
 		*llsa = (struct sockaddr *)sdl;
 		return 0;
 #endif
 
 	default:
 		/*
 		 * Well, the text isn't quite right, but it's the name
 		 * that counts...
 		 */
 		return EAFNOSUPPORT;
 	}
 }
 
 static moduledata_t ether_mod = {
 	.name = "ether",
 };
 
 void
 ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen)
 {
 	struct ether_vlan_header vlan;
 	struct mbuf mv, mb;
 
 	KASSERT((m->m_flags & M_VLANTAG) != 0,
 	    ("%s: vlan information not present", __func__));
 	KASSERT(m->m_len >= sizeof(struct ether_header),
 	    ("%s: mbuf not large enough for header", __func__));
 	bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header));
 	vlan.evl_proto = vlan.evl_encap_proto;
 	vlan.evl_encap_proto = htons(ETHERTYPE_VLAN);
 	vlan.evl_tag = htons(m->m_pkthdr.ether_vtag);
 	m->m_len -= sizeof(struct ether_header);
 	m->m_data += sizeof(struct ether_header);
 	/*
 	 * If a data link has been supplied by the caller, then we will need to
 	 * re-create a stack allocated mbuf chain with the following structure:
 	 *
 	 * (1) mbuf #1 will contain the supplied data link
 	 * (2) mbuf #2 will contain the vlan header
 	 * (3) mbuf #3 will contain the original mbuf's packet data
 	 *
 	 * Otherwise, submit the packet and vlan header via bpf_mtap2().
 	 */
 	if (data != NULL) {
 		mv.m_next = m;
 		mv.m_data = (caddr_t)&vlan;
 		mv.m_len = sizeof(vlan);
 		mb.m_next = &mv;
 		mb.m_data = data;
 		mb.m_len = dlen;
 		bpf_mtap(bp, &mb);
 	} else
 		bpf_mtap2(bp, &vlan, sizeof(vlan), m);
 	m->m_len += sizeof(struct ether_header);
 	m->m_data -= sizeof(struct ether_header);
 }
 
 struct mbuf *
 ether_vlanencap(struct mbuf *m, uint16_t tag)
 {
 	struct ether_vlan_header *evl;
 
 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
 	if (m == NULL)
 		return (NULL);
 	/* M_PREPEND takes care of m_len, m_pkthdr.len for us */
 
 	if (m->m_len < sizeof(*evl)) {
 		m = m_pullup(m, sizeof(*evl));
 		if (m == NULL)
 			return (NULL);
 	}
 
 	/*
 	 * Transform the Ethernet header into an Ethernet header
 	 * with 802.1Q encapsulation.
 	 */
 	evl = mtod(m, struct ether_vlan_header *);
 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
 	    (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
 	evl->evl_tag = htons(tag);
 	return (m);
 }
 
 static SYSCTL_NODE(_net_link, IFT_L2VLAN, vlan, CTLFLAG_RW, 0,
     "IEEE 802.1Q VLAN");
 static SYSCTL_NODE(_net_link_vlan, PF_LINK, link, CTLFLAG_RW, 0,
     "for consistency");
 
 VNET_DEFINE_STATIC(int, soft_pad);
 #define	V_soft_pad	VNET(soft_pad)
 SYSCTL_INT(_net_link_vlan, OID_AUTO, soft_pad, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(soft_pad), 0,
     "pad short frames before tagging");
 
 /*
  * For now, make preserving PCP via an mbuf tag optional, as it increases
  * per-packet memory allocations and frees.  In the future, it would be
  * preferable to reuse ether_vtag for this, or similar.
  */
 int vlan_mtag_pcp = 0;
 SYSCTL_INT(_net_link_vlan, OID_AUTO, mtag_pcp, CTLFLAG_RW,
     &vlan_mtag_pcp, 0,
     "Retain VLAN PCP information as packets are passed up the stack");
 
 bool
 ether_8021q_frame(struct mbuf **mp, struct ifnet *ife, struct ifnet *p,
     uint16_t vid, uint8_t pcp)
 {
 	struct m_tag *mtag;
 	int n;
 	uint16_t tag;
 	static const char pad[8];	/* just zeros */
 
 	/*
 	 * Pad the frame to the minimum size allowed if told to.
 	 * This option is in accord with IEEE Std 802.1Q, 2003 Ed.,
 	 * paragraph C.4.4.3.b.  It can help to work around buggy
 	 * bridges that violate paragraph C.4.4.3.a from the same
 	 * document, i.e., fail to pad short frames after untagging.
 	 * E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but
 	 * untagging it will produce a 62-byte frame, which is a runt
 	 * and requires padding.  There are VLAN-enabled network
 	 * devices that just discard such runts instead or mishandle
 	 * them somehow.
 	 */
 	if (V_soft_pad && p->if_type == IFT_ETHER) {
 		for (n = ETHERMIN + ETHER_HDR_LEN - (*mp)->m_pkthdr.len;
 		     n > 0; n -= sizeof(pad)) {
 			if (!m_append(*mp, min(n, sizeof(pad)), pad))
 				break;
 		}
 		if (n > 0) {
 			m_freem(*mp);
 			*mp = NULL;
 			if_printf(ife, "cannot pad short frame");
 			return (false);
 		}
 	}
 
 	/*
 	 * If underlying interface can do VLAN tag insertion itself,
 	 * just pass the packet along. However, we need some way to
 	 * tell the interface where the packet came from so that it
 	 * knows how to find the VLAN tag to use, so we attach a
 	 * packet tag that holds it.
 	 */
 	if (vlan_mtag_pcp && (mtag = m_tag_locate(*mp, MTAG_8021Q,
 	    MTAG_8021Q_PCP_OUT, NULL)) != NULL)
 		tag = EVL_MAKETAG(vid, *(uint8_t *)(mtag + 1), 0);
 	else
 		tag = EVL_MAKETAG(vid, pcp, 0);
 	if (p->if_capenable & IFCAP_VLAN_HWTAGGING) {
 		(*mp)->m_pkthdr.ether_vtag = tag;
 		(*mp)->m_flags |= M_VLANTAG;
 	} else {
 		*mp = ether_vlanencap(*mp, tag);
 		if (*mp == NULL) {
 			if_printf(ife, "unable to prepend 802.1Q header");
 			return (false);
 		}
 	}
 	return (true);
 }
 
 /*
  * Allocate an address from the FreeBSD Foundation OUI.  This uses a
  * cryptographic hash function on the containing jail's UUID and the interface
  * name to attempt to provide a unique but stable address.  Pseudo-interfaces
  * which require a MAC address should use this function to allocate
  * non-locally-administered addresses.
  */
 void
 ether_gen_addr(struct ifnet *ifp, struct ether_addr *hwaddr)
 {
 #define	ETHER_GEN_ADDR_BUFSIZ	HOSTUUIDLEN + IFNAMSIZ + 2
 	SHA1_CTX ctx;
 	char buf[ETHER_GEN_ADDR_BUFSIZ];
 	char uuid[HOSTUUIDLEN + 1];
 	uint64_t addr;
 	int i, sz;
 	char digest[SHA1_RESULTLEN];
 
 	getcredhostuuid(curthread->td_ucred, uuid, sizeof(uuid));
 	sz = snprintf(buf, ETHER_GEN_ADDR_BUFSIZ, "%s-%s", uuid, ifp->if_xname);
 	SHA1Init(&ctx);
 	SHA1Update(&ctx, buf, sz);
 	SHA1Final(digest, &ctx);
 
 	addr = ((digest[0] << 16) | (digest[1] << 8) | digest[2]) &
 	    OUI_FREEBSD_GENERATED_MASK;
 	addr = OUI_FREEBSD(addr);
 	for (i = 0; i < ETHER_ADDR_LEN; ++i) {
 		hwaddr->octet[i] = addr >> ((ETHER_ADDR_LEN - i - 1) * 8) &
 		    0xFF;
 	}
 }
 
 DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
 MODULE_VERSION(ether, 1);
Index: head/sys/net/if_lagg.c
===================================================================
--- head/sys/net/if_lagg.c	(revision 348253)
+++ head/sys/net/if_lagg.c	(revision 348254)
@@ -1,2278 +1,2371 @@
 /*	$OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $	*/
 
 /*
  * Copyright (c) 2005, 2006 Reyk Floeter <reyk@openbsd.org>
  * Copyright (c) 2007 Andrew Thompson <thompsa@FreeBSD.org>
  * Copyright (c) 2014, 2016 Marcelo Araujo <araujo@FreeBSD.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
  * copyright notice and this permission notice appear in all copies.
  *
  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/sx.h>
 #include <sys/taskqueue.h>
 #include <sys/eventhandler.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_clone.h>
 #include <net/if_arp.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/bpf.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #endif
 #ifdef INET
 #include <netinet/in_systm.h>
 #include <netinet/if_ether.h>
 #endif
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
 #endif
 
 #include <net/if_vlan_var.h>
 #include <net/if_lagg.h>
 #include <net/ieee8023ad_lacp.h>
 
 #define	LAGG_RLOCK()	struct epoch_tracker lagg_et; epoch_enter_preempt(net_epoch_preempt, &lagg_et)
 #define	LAGG_RUNLOCK()	epoch_exit_preempt(net_epoch_preempt, &lagg_et)
 #define	LAGG_RLOCK_ASSERT()	MPASS(in_epoch(net_epoch_preempt))
 #define	LAGG_UNLOCK_ASSERT()	MPASS(!in_epoch(net_epoch_preempt))
 
 #define	LAGG_SX_INIT(_sc)	sx_init(&(_sc)->sc_sx, "if_lagg sx")
 #define	LAGG_SX_DESTROY(_sc)	sx_destroy(&(_sc)->sc_sx)
 #define	LAGG_XLOCK(_sc)		sx_xlock(&(_sc)->sc_sx)
 #define	LAGG_XUNLOCK(_sc)	sx_xunlock(&(_sc)->sc_sx)
 #define	LAGG_SXLOCK_ASSERT(_sc)	sx_assert(&(_sc)->sc_sx, SA_LOCKED)
 #define	LAGG_XLOCK_ASSERT(_sc)	sx_assert(&(_sc)->sc_sx, SA_XLOCKED)
 
 /* Special flags we should propagate to the lagg ports. */
 static struct {
 	int flag;
 	int (*func)(struct ifnet *, int);
 } lagg_pflags[] = {
 	{IFF_PROMISC, ifpromisc},
 	{IFF_ALLMULTI, if_allmulti},
 	{0, NULL}
 };
 
+struct lagg_snd_tag {
+	struct m_snd_tag com;
+	struct m_snd_tag *tag;
+};
+
 VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */
 #define	V_lagg_list	VNET(lagg_list)
 VNET_DEFINE_STATIC(struct mtx, lagg_list_mtx);
 #define	V_lagg_list_mtx	VNET(lagg_list_mtx)
 #define	LAGG_LIST_LOCK_INIT(x)		mtx_init(&V_lagg_list_mtx, \
 					"if_lagg list", NULL, MTX_DEF)
 #define	LAGG_LIST_LOCK_DESTROY(x)	mtx_destroy(&V_lagg_list_mtx)
 #define	LAGG_LIST_LOCK(x)		mtx_lock(&V_lagg_list_mtx)
 #define	LAGG_LIST_UNLOCK(x)		mtx_unlock(&V_lagg_list_mtx)
 eventhandler_tag	lagg_detach_cookie = NULL;
 
 static int	lagg_clone_create(struct if_clone *, int, caddr_t);
 static void	lagg_clone_destroy(struct ifnet *);
 VNET_DEFINE_STATIC(struct if_clone *, lagg_cloner);
 #define	V_lagg_cloner	VNET(lagg_cloner)
 static const char laggname[] = "lagg";
 static MALLOC_DEFINE(M_LAGG, laggname, "802.3AD Link Aggregation Interface");
 
 static void	lagg_capabilities(struct lagg_softc *);
 static int	lagg_port_create(struct lagg_softc *, struct ifnet *);
 static int	lagg_port_destroy(struct lagg_port *, int);
 static struct mbuf *lagg_input(struct ifnet *, struct mbuf *);
 static void	lagg_linkstate(struct lagg_softc *);
 static void	lagg_port_state(struct ifnet *, int);
 static int	lagg_port_ioctl(struct ifnet *, u_long, caddr_t);
 static int	lagg_port_output(struct ifnet *, struct mbuf *,
 		    const struct sockaddr *, struct route *);
 static void	lagg_port_ifdetach(void *arg __unused, struct ifnet *);
 #ifdef LAGG_PORT_STACKING
 static int	lagg_port_checkstacking(struct lagg_softc *);
 #endif
 static void	lagg_port2req(struct lagg_port *, struct lagg_reqport *);
 static void	lagg_init(void *);
 static void	lagg_stop(struct lagg_softc *);
 static int	lagg_ioctl(struct ifnet *, u_long, caddr_t);
 #ifdef RATELIMIT
 static int	lagg_snd_tag_alloc(struct ifnet *,
 		    union if_snd_tag_alloc_params *,
 		    struct m_snd_tag **);
+static int	lagg_snd_tag_modify(struct m_snd_tag *,
+		    union if_snd_tag_modify_params *);
+static int	lagg_snd_tag_query(struct m_snd_tag *,
+		    union if_snd_tag_query_params *);
 static void	lagg_snd_tag_free(struct m_snd_tag *);
 #endif
 static int	lagg_setmulti(struct lagg_port *);
 static int	lagg_clrmulti(struct lagg_port *);
 static	int	lagg_setcaps(struct lagg_port *, int cap);
 static	int	lagg_setflag(struct lagg_port *, int, int,
 		    int (*func)(struct ifnet *, int));
 static	int	lagg_setflags(struct lagg_port *, int status);
 static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt);
 static int	lagg_transmit(struct ifnet *, struct mbuf *);
 static void	lagg_qflush(struct ifnet *);
 static int	lagg_media_change(struct ifnet *);
 static void	lagg_media_status(struct ifnet *, struct ifmediareq *);
 static struct lagg_port *lagg_link_active(struct lagg_softc *,
 	    struct lagg_port *);
 
 /* Simple round robin */
 static void	lagg_rr_attach(struct lagg_softc *);
 static int	lagg_rr_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 
 /* Active failover */
 static int	lagg_fail_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 
 /* Loadbalancing */
 static void	lagg_lb_attach(struct lagg_softc *);
 static void	lagg_lb_detach(struct lagg_softc *);
 static int	lagg_lb_port_create(struct lagg_port *);
 static void	lagg_lb_port_destroy(struct lagg_port *);
 static int	lagg_lb_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 static int	lagg_lb_porttable(struct lagg_softc *, struct lagg_port *);
 
 /* Broadcast */
 static int    lagg_bcast_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_bcast_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 
 /* 802.3ad LACP */
 static void	lagg_lacp_attach(struct lagg_softc *);
 static void	lagg_lacp_detach(struct lagg_softc *);
 static int	lagg_lacp_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 static void	lagg_lacp_lladdr(struct lagg_softc *);
 
 /* lagg protocol table */
 static const struct lagg_proto {
 	lagg_proto	pr_num;
 	void		(*pr_attach)(struct lagg_softc *);
 	void		(*pr_detach)(struct lagg_softc *);
 	int		(*pr_start)(struct lagg_softc *, struct mbuf *);
 	struct mbuf *	(*pr_input)(struct lagg_softc *, struct lagg_port *,
 			    struct mbuf *);
 	int		(*pr_addport)(struct lagg_port *);
 	void		(*pr_delport)(struct lagg_port *);
 	void		(*pr_linkstate)(struct lagg_port *);
 	void 		(*pr_init)(struct lagg_softc *);
 	void 		(*pr_stop)(struct lagg_softc *);
 	void 		(*pr_lladdr)(struct lagg_softc *);
 	void		(*pr_request)(struct lagg_softc *, void *);
 	void		(*pr_portreq)(struct lagg_port *, void *);
 } lagg_protos[] = {
     {
 	.pr_num = LAGG_PROTO_NONE
     },
     {
 	.pr_num = LAGG_PROTO_ROUNDROBIN,
 	.pr_attach = lagg_rr_attach,
 	.pr_start = lagg_rr_start,
 	.pr_input = lagg_rr_input,
     },
     {
 	.pr_num = LAGG_PROTO_FAILOVER,
 	.pr_start = lagg_fail_start,
 	.pr_input = lagg_fail_input,
     },
     {
 	.pr_num = LAGG_PROTO_LOADBALANCE,
 	.pr_attach = lagg_lb_attach,
 	.pr_detach = lagg_lb_detach,
 	.pr_start = lagg_lb_start,
 	.pr_input = lagg_lb_input,
 	.pr_addport = lagg_lb_port_create,
 	.pr_delport = lagg_lb_port_destroy,
     },
     {
 	.pr_num = LAGG_PROTO_LACP,
 	.pr_attach = lagg_lacp_attach,
 	.pr_detach = lagg_lacp_detach,
 	.pr_start = lagg_lacp_start,
 	.pr_input = lagg_lacp_input,
 	.pr_addport = lacp_port_create,
 	.pr_delport = lacp_port_destroy,
 	.pr_linkstate = lacp_linkstate,
 	.pr_init = lacp_init,
 	.pr_stop = lacp_stop,
 	.pr_lladdr = lagg_lacp_lladdr,
 	.pr_request = lacp_req,
 	.pr_portreq = lacp_portreq,
     },
     {
 	.pr_num = LAGG_PROTO_BROADCAST,
 	.pr_start = lagg_bcast_start,
 	.pr_input = lagg_bcast_input,
     },
 };
 
 SYSCTL_DECL(_net_link);
 SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0,
     "Link Aggregation");
 
 /* Allow input on any failover links */
 VNET_DEFINE_STATIC(int, lagg_failover_rx_all);
 #define	V_lagg_failover_rx_all	VNET(lagg_failover_rx_all)
 SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(lagg_failover_rx_all), 0,
     "Accept input from any interface in a failover lagg");
 
 /* Default value for using flowid */
 VNET_DEFINE_STATIC(int, def_use_flowid) = 0;
 #define	V_def_use_flowid	VNET(def_use_flowid)
 SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN,
     &VNET_NAME(def_use_flowid), 0,
     "Default setting for using flow id for load sharing");
 
 /* Default value for using numa */
 VNET_DEFINE_STATIC(int, def_use_numa) = 1;
 #define	V_def_use_numa	VNET(def_use_numa)
 SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_numa, CTLFLAG_RWTUN,
     &VNET_NAME(def_use_numa), 0,
     "Use numa to steer flows");
 
 /* Default value for flowid shift */
 VNET_DEFINE_STATIC(int, def_flowid_shift) = 16;
 #define	V_def_flowid_shift	VNET(def_flowid_shift)
 SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN,
     &VNET_NAME(def_flowid_shift), 0,
     "Default setting for flowid shift for load sharing");
 
 static void
 vnet_lagg_init(const void *unused __unused)
 {
 
 	LAGG_LIST_LOCK_INIT();
 	SLIST_INIT(&V_lagg_list);
 	V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create,
 	    lagg_clone_destroy, 0);
 }
 VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_lagg_init, NULL);
 
 static void
 vnet_lagg_uninit(const void *unused __unused)
 {
 
 	if_clone_detach(V_lagg_cloner);
 	LAGG_LIST_LOCK_DESTROY();
 }
 VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_lagg_uninit, NULL);
 
 static int
 lagg_modevent(module_t mod, int type, void *data)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		lagg_input_p = lagg_input;
 		lagg_linkstate_p = lagg_port_state;
 		lagg_detach_cookie = EVENTHANDLER_REGISTER(
 		    ifnet_departure_event, lagg_port_ifdetach, NULL,
 		    EVENTHANDLER_PRI_ANY);
 		break;
 	case MOD_UNLOAD:
 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 		    lagg_detach_cookie);
 		lagg_input_p = NULL;
 		lagg_linkstate_p = NULL;
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 static moduledata_t lagg_mod = {
 	"if_lagg",
 	lagg_modevent,
 	0
 };
 
 DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_lagg, 1);
 
 static void
 lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr)
 {
 
 	LAGG_XLOCK_ASSERT(sc);
 	KASSERT(sc->sc_proto == LAGG_PROTO_NONE, ("%s: sc %p has proto",
 	    __func__, sc));
 
 	if (sc->sc_ifflags & IFF_DEBUG)
 		if_printf(sc->sc_ifp, "using proto %u\n", pr);
 
 	if (lagg_protos[pr].pr_attach != NULL)
 		lagg_protos[pr].pr_attach(sc);
 	sc->sc_proto = pr;
 }
 
 static void
 lagg_proto_detach(struct lagg_softc *sc)
 {
 	lagg_proto pr;
 
 	LAGG_XLOCK_ASSERT(sc);
 	pr = sc->sc_proto;
 	sc->sc_proto = LAGG_PROTO_NONE;
 
 	if (lagg_protos[pr].pr_detach != NULL)
 		lagg_protos[pr].pr_detach(sc);
 }
 
 static int
 lagg_proto_start(struct lagg_softc *sc, struct mbuf *m)
 {
 
 	return (lagg_protos[sc->sc_proto].pr_start(sc, m));
 }
 
 static struct mbuf *
 lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 
 	return (lagg_protos[sc->sc_proto].pr_input(sc, lp, m));
 }
 
 static int
 lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_addport == NULL)
 		return (0);
 	else
 		return (lagg_protos[sc->sc_proto].pr_addport(lp));
 }
 
 static void
 lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_delport != NULL)
 		lagg_protos[sc->sc_proto].pr_delport(lp);
 }
 
 static void
 lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_linkstate != NULL)
 		lagg_protos[sc->sc_proto].pr_linkstate(lp);
 }
 
 static void
 lagg_proto_init(struct lagg_softc *sc)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_init != NULL)
 		lagg_protos[sc->sc_proto].pr_init(sc);
 }
 
 static void
 lagg_proto_stop(struct lagg_softc *sc)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_stop != NULL)
 		lagg_protos[sc->sc_proto].pr_stop(sc);
 }
 
 static void
 lagg_proto_lladdr(struct lagg_softc *sc)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_lladdr != NULL)
 		lagg_protos[sc->sc_proto].pr_lladdr(sc);
 }
 
 static void
 lagg_proto_request(struct lagg_softc *sc, void *v)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_request != NULL)
 		lagg_protos[sc->sc_proto].pr_request(sc, v);
 }
 
 static void
 lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v)
 {
 
 	if (lagg_protos[sc->sc_proto].pr_portreq != NULL)
 		lagg_protos[sc->sc_proto].pr_portreq(lp, v);
 }
 
 /*
  * This routine is run via an vlan
  * config EVENT
  */
 static void
 lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
 {
 	struct lagg_softc *sc = ifp->if_softc;
 	struct lagg_port *lp;
 
 	if (ifp->if_softc !=  arg)   /* Not our event */
 		return;
 
 	LAGG_RLOCK();
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag);
 	LAGG_RUNLOCK();
 }
 
 /*
  * This routine is run via an vlan
  * unconfig EVENT
  */
 static void
 lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
 {
 	struct lagg_softc *sc = ifp->if_softc;
 	struct lagg_port *lp;
 
 	if (ifp->if_softc !=  arg)   /* Not our event */
 		return;
 
 	LAGG_RLOCK();
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag);
 	LAGG_RUNLOCK();
 }
 
 static int
 lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct lagg_softc *sc;
 	struct ifnet *ifp;
 	static const u_char eaddr[6];	/* 00:00:00:00:00:00 */
 
 	sc = malloc(sizeof(*sc), M_LAGG, M_WAITOK|M_ZERO);
 	ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		free(sc, M_LAGG);
 		return (ENOSPC);
 	}
 	LAGG_SX_INIT(sc);
 
 	LAGG_XLOCK(sc);
 	if (V_def_use_flowid)
 		sc->sc_opts |= LAGG_OPT_USE_FLOWID;
 	if (V_def_use_numa)
 		sc->sc_opts |= LAGG_OPT_USE_NUMA;
 	sc->flowid_shift = V_def_flowid_shift;
 
 	/* Hash all layers by default */
 	sc->sc_flags = MBUF_HASHFLAG_L2|MBUF_HASHFLAG_L3|MBUF_HASHFLAG_L4;
 
 	lagg_proto_attach(sc, LAGG_PROTO_DEFAULT);
 
 	CK_SLIST_INIT(&sc->sc_ports);
 
 	/* Initialise pseudo media types */
 	ifmedia_init(&sc->sc_media, 0, lagg_media_change,
 	    lagg_media_status);
 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
 	ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);
 
 	if_initname(ifp, laggname, unit);
 	ifp->if_softc = sc;
 	ifp->if_transmit = lagg_transmit;
 	ifp->if_qflush = lagg_qflush;
 	ifp->if_init = lagg_init;
 	ifp->if_ioctl = lagg_ioctl;
 	ifp->if_get_counter = lagg_get_counter;
 	ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;
 #ifdef RATELIMIT
 	ifp->if_snd_tag_alloc = lagg_snd_tag_alloc;
+	ifp->if_snd_tag_modify = lagg_snd_tag_modify;
+	ifp->if_snd_tag_query = lagg_snd_tag_query;
 	ifp->if_snd_tag_free = lagg_snd_tag_free;
 #endif
 	ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
 
 	/*
 	 * Attach as an ordinary ethernet device, children will be attached
 	 * as special device IFT_IEEE8023ADLAG.
 	 */
 	ether_ifattach(ifp, eaddr);
 
 	sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
 		lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
 	sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
 		lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);
 
 	/* Insert into the global list of laggs */
 	LAGG_LIST_LOCK();
 	SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries);
 	LAGG_LIST_UNLOCK();
 	LAGG_XUNLOCK(sc);
 
 	return (0);
 }
 
 static void
 lagg_clone_destroy(struct ifnet *ifp)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct lagg_port *lp;
 
 	LAGG_XLOCK(sc);
 	sc->sc_destroying = 1;
 	lagg_stop(sc);
 	ifp->if_flags &= ~IFF_UP;
 
 	EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach);
 	EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach);
 
 	/* Shutdown and remove lagg ports */
 	while ((lp = CK_SLIST_FIRST(&sc->sc_ports)) != NULL)
 		lagg_port_destroy(lp, 1);
 
 	/* Unhook the aggregation protocol */
 	lagg_proto_detach(sc);
 	LAGG_XUNLOCK(sc);
 
 	ifmedia_removeall(&sc->sc_media);
 	ether_ifdetach(ifp);
 	if_free(ifp);
 
 	LAGG_LIST_LOCK();
 	SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries);
 	LAGG_LIST_UNLOCK();
 
 	LAGG_SX_DESTROY(sc);
 	free(sc, M_LAGG);
 }
 
 static void
 lagg_capabilities(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 	int cap, ena, pena;
 	uint64_t hwa;
 	struct ifnet_hw_tsomax hw_tsomax;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	/* Get common enabled capabilities for the lagg ports */
 	ena = ~0;
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		ena &= lp->lp_ifp->if_capenable;
 	ena = (ena == ~0 ? 0 : ena);
 
 	/*
 	 * Apply common enabled capabilities back to the lagg ports.
 	 * May require several iterations if they are dependent.
 	 */
 	do {
 		pena = ena;
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			lagg_setcaps(lp, ena);
 			ena &= lp->lp_ifp->if_capenable;
 		}
 	} while (pena != ena);
 
 	/* Get other capabilities from the lagg ports */
 	cap = ~0;
 	hwa = ~(uint64_t)0;
 	memset(&hw_tsomax, 0, sizeof(hw_tsomax));
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		cap &= lp->lp_ifp->if_capabilities;
 		hwa &= lp->lp_ifp->if_hwassist;
 		if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax);
 	}
 	cap = (cap == ~0 ? 0 : cap);
 	hwa = (hwa == ~(uint64_t)0 ? 0 : hwa);
 
 	if (sc->sc_ifp->if_capabilities != cap ||
 	    sc->sc_ifp->if_capenable != ena ||
 	    sc->sc_ifp->if_hwassist != hwa ||
 	    if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) {
 		sc->sc_ifp->if_capabilities = cap;
 		sc->sc_ifp->if_capenable = ena;
 		sc->sc_ifp->if_hwassist = hwa;
 		getmicrotime(&sc->sc_ifp->if_lastchange);
 
 		if (sc->sc_ifflags & IFF_DEBUG)
 			if_printf(sc->sc_ifp,
 			    "capabilities 0x%08x enabled 0x%08x\n", cap, ena);
 	}
 }
 
 static int
 lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp)
 {
 	struct lagg_softc *sc_ptr;
 	struct lagg_port *lp, *tlp;
 	struct ifreq ifr;
 	int error, i, oldmtu;
 	uint64_t *pval;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	if (sc->sc_ifp == ifp) {
 		if_printf(sc->sc_ifp,
 		    "cannot add a lagg to itself as a port\n");
 		return (EINVAL);
 	}
 
 	/* Limit the maximal number of lagg ports */
 	if (sc->sc_count >= LAGG_MAX_PORTS)
 		return (ENOSPC);
 
 	/* Check if port has already been associated to a lagg */
 	if (ifp->if_lagg != NULL) {
 		/* Port is already in the current lagg? */
 		lp = (struct lagg_port *)ifp->if_lagg;
 		if (lp->lp_softc == sc)
 			return (EEXIST);
 		return (EBUSY);
 	}
 
 	/* XXX Disallow non-ethernet interfaces (this should be any of 802) */
 	if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN)
 		return (EPROTONOSUPPORT);
 
 	/* Allow the first Ethernet member to define the MTU */
 	oldmtu = -1;
 	if (CK_SLIST_EMPTY(&sc->sc_ports)) {
 		sc->sc_ifp->if_mtu = ifp->if_mtu;
 	} else if (sc->sc_ifp->if_mtu != ifp->if_mtu) {
 		if (ifp->if_ioctl == NULL) {
 			if_printf(sc->sc_ifp, "cannot change MTU for %s\n",
 			    ifp->if_xname);
 			return (EINVAL);
 		}
 		oldmtu = ifp->if_mtu;
 		strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name));
 		ifr.ifr_mtu = sc->sc_ifp->if_mtu;
 		error = (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr);
 		if (error != 0) {
 			if_printf(sc->sc_ifp, "invalid MTU for %s\n",
 			    ifp->if_xname);
 			return (error);
 		}
 		ifr.ifr_mtu = oldmtu;
 	}
 
 	lp = malloc(sizeof(struct lagg_port), M_LAGG, M_WAITOK|M_ZERO);
 	lp->lp_softc = sc;
 
 	/* Check if port is a stacked lagg */
 	LAGG_LIST_LOCK();
 	SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) {
 		if (ifp == sc_ptr->sc_ifp) {
 			LAGG_LIST_UNLOCK();
 			free(lp, M_LAGG);
 			if (oldmtu != -1)
 				(*ifp->if_ioctl)(ifp, SIOCSIFMTU,
 				    (caddr_t)&ifr);
 			return (EINVAL);
 			/* XXX disable stacking for the moment, its untested */
 #ifdef LAGG_PORT_STACKING
 			lp->lp_flags |= LAGG_PORT_STACK;
 			if (lagg_port_checkstacking(sc_ptr) >=
 			    LAGG_MAX_STACKING) {
 				LAGG_LIST_UNLOCK();
 				free(lp, M_LAGG);
 				if (oldmtu != -1)
 					(*ifp->if_ioctl)(ifp, SIOCSIFMTU,
 					    (caddr_t)&ifr);
 				return (E2BIG);
 			}
 #endif
 		}
 	}
 	LAGG_LIST_UNLOCK();
 
 	if_ref(ifp);
 	lp->lp_ifp = ifp;
 
 	bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN);
 	lp->lp_ifcapenable = ifp->if_capenable;
 	if (CK_SLIST_EMPTY(&sc->sc_ports)) {
 		bcopy(IF_LLADDR(ifp), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
 		lagg_proto_lladdr(sc);
 		EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
 	} else {
 		if_setlladdr(ifp, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
 	}
 	lagg_setflags(lp, 1);
 
 	if (CK_SLIST_EMPTY(&sc->sc_ports))
 		sc->sc_primary = lp;
 
 	/* Change the interface type */
 	lp->lp_iftype = ifp->if_type;
 	ifp->if_type = IFT_IEEE8023ADLAG;
 	ifp->if_lagg = lp;
 	lp->lp_ioctl = ifp->if_ioctl;
 	ifp->if_ioctl = lagg_port_ioctl;
 	lp->lp_output = ifp->if_output;
 	ifp->if_output = lagg_port_output;
 
 	/* Read port counters */
 	pval = lp->port_counters.val;
 	for (i = 0; i < IFCOUNTERS; i++, pval++)
 		*pval = ifp->if_get_counter(ifp, i);
 
 	/*
 	 * Insert into the list of ports.
 	 * Keep ports sorted by if_index. It is handy, when configuration
 	 * is predictable and `ifconfig laggN create ...` command
 	 * will lead to the same result each time.
 	 */
 	CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) {
 		if (tlp->lp_ifp->if_index < ifp->if_index && (
 		    CK_SLIST_NEXT(tlp, lp_entries) == NULL ||
 		    ((struct  lagg_port*)CK_SLIST_NEXT(tlp, lp_entries))->lp_ifp->if_index >
 		    ifp->if_index))
 			break;
 	}
 	if (tlp != NULL)
 		CK_SLIST_INSERT_AFTER(tlp, lp, lp_entries);
 	else
 		CK_SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries);
 	sc->sc_count++;
 
 	lagg_setmulti(lp);
 
 
 	if ((error = lagg_proto_addport(sc, lp)) != 0) {
 		/* Remove the port, without calling pr_delport. */
 		lagg_port_destroy(lp, 0);
 		if (oldmtu != -1)
 			(*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr);
 		return (error);
 	}
 
 	/* Update lagg capabilities */
 	lagg_capabilities(sc);
 	lagg_linkstate(sc);
 
 	return (0);
 }
 
 #ifdef LAGG_PORT_STACKING
 static int
 lagg_port_checkstacking(struct lagg_softc *sc)
 {
 	struct lagg_softc *sc_ptr;
 	struct lagg_port *lp;
 	int m = 0;
 
 	LAGG_SXLOCK_ASSERT(sc);
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (lp->lp_flags & LAGG_PORT_STACK) {
 			sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc;
 			m = MAX(m, lagg_port_checkstacking(sc_ptr));
 		}
 	}
 
 	return (m + 1);
 }
 #endif
 
 static void
 lagg_port_destroy_cb(epoch_context_t ec)
 {
 	struct lagg_port *lp;
 	struct ifnet *ifp;
 
 	lp = __containerof(ec, struct lagg_port, lp_epoch_ctx);
 	ifp = lp->lp_ifp;
 
 	if_rele(ifp);
 	free(lp, M_LAGG);
 }
 
 static int
 lagg_port_destroy(struct lagg_port *lp, int rundelport)
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	struct lagg_port *lp_ptr, *lp0;
 	struct ifnet *ifp = lp->lp_ifp;
 	uint64_t *pval, vdiff;
 	int i;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	if (rundelport)
 		lagg_proto_delport(sc, lp);
 
 	if (lp->lp_detaching == 0)
 		lagg_clrmulti(lp);
 
 	/* Restore interface */
 	ifp->if_type = lp->lp_iftype;
 	ifp->if_ioctl = lp->lp_ioctl;
 	ifp->if_output = lp->lp_output;
 	ifp->if_lagg = NULL;
 
 	/* Update detached port counters */
 	pval = lp->port_counters.val;
 	for (i = 0; i < IFCOUNTERS; i++, pval++) {
 		vdiff = ifp->if_get_counter(ifp, i) - *pval;
 		sc->detached_counters.val[i] += vdiff;
 	}
 
 	/* Finally, remove the port from the lagg */
 	CK_SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries);
 	sc->sc_count--;
 
 	/* Update the primary interface */
 	if (lp == sc->sc_primary) {
 		uint8_t lladdr[ETHER_ADDR_LEN];
 
 		if ((lp0 = CK_SLIST_FIRST(&sc->sc_ports)) == NULL)
 			bzero(&lladdr, ETHER_ADDR_LEN);
 		else
 			bcopy(lp0->lp_lladdr, lladdr, ETHER_ADDR_LEN);
 		sc->sc_primary = lp0;
 		if (sc->sc_destroying == 0) {
 			bcopy(lladdr, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
 			lagg_proto_lladdr(sc);
 			EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
 		}
 
 		/*
 		 * Update lladdr for each port (new primary needs update
 		 * as well, to switch from old lladdr to its 'real' one)
 		 */
 		CK_SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries)
 			if_setlladdr(lp_ptr->lp_ifp, lladdr, ETHER_ADDR_LEN);
 	}
 
 	if (lp->lp_ifflags)
 		if_printf(ifp, "%s: lp_ifflags unclean\n", __func__);
 
 	if (lp->lp_detaching == 0) {
 		lagg_setflags(lp, 0);
 		lagg_setcaps(lp, lp->lp_ifcapenable);
 		if_setlladdr(ifp, lp->lp_lladdr, ETHER_ADDR_LEN);
 	}
 
 	/*
 	 * free port and release it's ifnet reference after a grace period has
 	 * elapsed.
 	 */
 	epoch_call(net_epoch_preempt, &lp->lp_epoch_ctx, lagg_port_destroy_cb);
 	/* Update lagg capabilities */
 	lagg_capabilities(sc);
 	lagg_linkstate(sc);
 
 	return (0);
 }
 
 static int
 lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct lagg_reqport *rp = (struct lagg_reqport *)data;
 	struct lagg_softc *sc;
 	struct lagg_port *lp = NULL;
 	int error = 0;
 
 	/* Should be checked by the caller */
 	if (ifp->if_type != IFT_IEEE8023ADLAG ||
 	    (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL)
 		goto fallback;
 
 	switch (cmd) {
 	case SIOCGLAGGPORT:
 		if (rp->rp_portname[0] == '\0' ||
 		    ifunit(rp->rp_portname) != ifp) {
 			error = EINVAL;
 			break;
 		}
 
 		LAGG_RLOCK();
 		if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) {
 			error = ENOENT;
 			LAGG_RUNLOCK();
 			break;
 		}
 
 		lagg_port2req(lp, rp);
 		LAGG_RUNLOCK();
 		break;
 
 	case SIOCSIFCAP:
 		if (lp->lp_ioctl == NULL) {
 			error = EINVAL;
 			break;
 		}
 		error = (*lp->lp_ioctl)(ifp, cmd, data);
 		if (error)
 			break;
 
 		/* Update lagg interface capabilities */
 		LAGG_XLOCK(sc);
 		lagg_capabilities(sc);
 		LAGG_XUNLOCK(sc);
 		VLAN_CAPABILITIES(sc->sc_ifp);
 		break;
 
 	case SIOCSIFMTU:
 		/* Do not allow the MTU to be changed once joined */
 		error = EINVAL;
 		break;
 
 	default:
 		goto fallback;
 	}
 
 	return (error);
 
 fallback:
 	if (lp != NULL && lp->lp_ioctl != NULL)
 		return ((*lp->lp_ioctl)(ifp, cmd, data));
 
 	return (EINVAL);
 }
 
 /*
  * Requests counter @cnt data. 
  *
  * Counter value is calculated the following way:
  * 1) for each port, sum  difference between current and "initial" measurements.
  * 2) add lagg logical interface counters.
  * 3) add data from detached_counters array.
  *
  * We also do the following things on ports attach/detach:
  * 1) On port attach we store all counters it has into port_counter array. 
  * 2) On port detach we add the different between "initial" and
  *   current counters data to detached_counters array.
  */
 static uint64_t
 lagg_get_counter(struct ifnet *ifp, ift_counter cnt)
 {
 	struct lagg_softc *sc;
 	struct lagg_port *lp;
 	struct ifnet *lpifp;
 	uint64_t newval, oldval, vsum;
 
 	/* Revise this when we've got non-generic counters. */
 	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
 
 	sc = (struct lagg_softc *)ifp->if_softc;
 
 	vsum = 0;
 	LAGG_RLOCK();
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		/* Saved attached value */
 		oldval = lp->port_counters.val[cnt];
 		/* current value */
 		lpifp = lp->lp_ifp;
 		newval = lpifp->if_get_counter(lpifp, cnt);
 		/* Calculate diff and save new */
 		vsum += newval - oldval;
 	}
 	LAGG_RUNLOCK();
 
 	/*
 	 * Add counter data which might be added by upper
 	 * layer protocols operating on logical interface.
 	 */
 	vsum += if_get_counter_default(ifp, cnt);
 
 	/*
 	 * Add counter data from detached ports counters
 	 */
 	vsum += sc->detached_counters.val[cnt];
 
 
 	return (vsum);
 }
 
 /*
  * For direct output to child ports.
  */
 static int
 lagg_port_output(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
 	struct lagg_port *lp = ifp->if_lagg;
 
 	switch (dst->sa_family) {
 		case pseudo_AF_HDRCMPLT:
 		case AF_UNSPEC:
 			return ((*lp->lp_output)(ifp, m, dst, ro));
 	}
 
 	/* drop any other frames */
 	m_freem(m);
 	return (ENETDOWN);
 }
 
 static void
 lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp)
 {
 	struct lagg_port *lp;
 	struct lagg_softc *sc;
 
 	if ((lp = ifp->if_lagg) == NULL)
 		return;
 	/* If the ifnet is just being renamed, don't do anything. */
 	if (ifp->if_flags & IFF_RENAMING)
 		return;
 
 	sc = lp->lp_softc;
 
 	LAGG_XLOCK(sc);
 	lp->lp_detaching = 1;
 	lagg_port_destroy(lp, 1);
 	LAGG_XUNLOCK(sc);
 	VLAN_CAPABILITIES(sc->sc_ifp);
 }
 
 static void
 lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp)
 {
 	struct lagg_softc *sc = lp->lp_softc;
 
 	strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname));
 	strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname));
 	rp->rp_prio = lp->lp_prio;
 	rp->rp_flags = lp->lp_flags;
 	lagg_proto_portreq(sc, lp, &rp->rp_psc);
 
 	/* Add protocol specific flags */
 	switch (sc->sc_proto) {
 		case LAGG_PROTO_FAILOVER:
 			if (lp == sc->sc_primary)
 				rp->rp_flags |= LAGG_PORT_MASTER;
 			if (lp == lagg_link_active(sc, sc->sc_primary))
 				rp->rp_flags |= LAGG_PORT_ACTIVE;
 			break;
 
 		case LAGG_PROTO_ROUNDROBIN:
 		case LAGG_PROTO_LOADBALANCE:
 		case LAGG_PROTO_BROADCAST:
 			if (LAGG_PORTACTIVE(lp))
 				rp->rp_flags |= LAGG_PORT_ACTIVE;
 			break;
 
 		case LAGG_PROTO_LACP:
 			/* LACP has a different definition of active */
 			if (lacp_isactive(lp))
 				rp->rp_flags |= LAGG_PORT_ACTIVE;
 			if (lacp_iscollecting(lp))
 				rp->rp_flags |= LAGG_PORT_COLLECTING;
 			if (lacp_isdistributing(lp))
 				rp->rp_flags |= LAGG_PORT_DISTRIBUTING;
 			break;
 	}
 
 }
 
 static void
 lagg_init(void *xsc)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)xsc;
 	struct ifnet *ifp = sc->sc_ifp;
 	struct lagg_port *lp;
 
 	LAGG_XLOCK(sc);
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 		LAGG_XUNLOCK(sc);
 		return;
 	}
 
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 
 	/*
 	 * Update the port lladdrs if needed.
 	 * This might be if_setlladdr() notification
 	 * that lladdr has been changed.
 	 */
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp->lp_ifp),
 		    ETHER_ADDR_LEN) != 0)
 			if_setlladdr(lp->lp_ifp, IF_LLADDR(ifp), ETHER_ADDR_LEN);
 	}
 
 	lagg_proto_init(sc);
 
 	LAGG_XUNLOCK(sc);
 }
 
 static void
 lagg_stop(struct lagg_softc *sc)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		return;
 
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 
 	lagg_proto_stop(sc);
 }
 
 static int
 lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct lagg_reqall *ra = (struct lagg_reqall *)data;
 	struct lagg_reqopts *ro = (struct lagg_reqopts *)data;
 	struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf;
 	struct lagg_reqflags *rf = (struct lagg_reqflags *)data;
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct lagg_port *lp;
 	struct ifnet *tpif;
 	struct thread *td = curthread;
 	char *buf, *outbuf;
 	int count, buflen, len, error = 0;
 
 	bzero(&rpbuf, sizeof(rpbuf));
 
 	switch (cmd) {
 	case SIOCGLAGG:
 		LAGG_XLOCK(sc);
 		buflen = sc->sc_count * sizeof(struct lagg_reqport);
 		outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
 		ra->ra_proto = sc->sc_proto;
 		lagg_proto_request(sc, &ra->ra_psc);
 		count = 0;
 		buf = outbuf;
 		len = min(ra->ra_size, buflen);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			if (len < sizeof(rpbuf))
 				break;
 
 			lagg_port2req(lp, &rpbuf);
 			memcpy(buf, &rpbuf, sizeof(rpbuf));
 			count++;
 			buf += sizeof(rpbuf);
 			len -= sizeof(rpbuf);
 		}
 		LAGG_XUNLOCK(sc);
 		ra->ra_ports = count;
 		ra->ra_size = count * sizeof(rpbuf);
 		error = copyout(outbuf, ra->ra_port, ra->ra_size);
 		free(outbuf, M_TEMP);
 		break;
 	case SIOCSLAGG:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if (ra->ra_proto >= LAGG_PROTO_MAX) {
 			error = EPROTONOSUPPORT;
 			break;
 		}
 
 		LAGG_XLOCK(sc);
 		lagg_proto_detach(sc);
 		LAGG_UNLOCK_ASSERT();
 		lagg_proto_attach(sc, ra->ra_proto);
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCGLAGGOPTS:
 		LAGG_XLOCK(sc);
 		ro->ro_opts = sc->sc_opts;
 		if (sc->sc_proto == LAGG_PROTO_LACP) {
 			struct lacp_softc *lsc;
 
 			lsc = (struct lacp_softc *)sc->sc_psc;
 			if (lsc->lsc_debug.lsc_tx_test != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_TXTEST;
 			if (lsc->lsc_debug.lsc_rx_test != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_RXTEST;
 			if (lsc->lsc_strict_mode != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_STRICT;
 			if (lsc->lsc_fast_timeout != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_TIMEOUT;
 
 			ro->ro_active = sc->sc_active;
 		} else {
 			ro->ro_active = 0;
 			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 				ro->ro_active += LAGG_PORTACTIVE(lp);
 		}
 		ro->ro_bkt = sc->sc_bkt;
 		ro->ro_flapping = sc->sc_flapping;
 		ro->ro_flowid_shift = sc->flowid_shift;
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCSLAGGOPTS:
 		if (sc->sc_proto == LAGG_PROTO_ROUNDROBIN) {
 			if (ro->ro_bkt == 0)
 				sc->sc_bkt = 1; // Minimum 1 packet per iface.
 			else
 				sc->sc_bkt = ro->ro_bkt;
 		}
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if (ro->ro_opts == 0)
 			break;
 		/*
 		 * Set options.  LACP options are stored in sc->sc_psc,
 		 * not in sc_opts.
 		 */
 		int valid, lacp;
 
 		switch (ro->ro_opts) {
 		case LAGG_OPT_USE_FLOWID:
 		case -LAGG_OPT_USE_FLOWID:
 		case LAGG_OPT_USE_NUMA:
 		case -LAGG_OPT_USE_NUMA:
 		case LAGG_OPT_FLOWIDSHIFT:
 			valid = 1;
 			lacp = 0;
 			break;
 		case LAGG_OPT_LACP_TXTEST:
 		case -LAGG_OPT_LACP_TXTEST:
 		case LAGG_OPT_LACP_RXTEST:
 		case -LAGG_OPT_LACP_RXTEST:
 		case LAGG_OPT_LACP_STRICT:
 		case -LAGG_OPT_LACP_STRICT:
 		case LAGG_OPT_LACP_TIMEOUT:
 		case -LAGG_OPT_LACP_TIMEOUT:
 			valid = lacp = 1;
 			break;
 		default:
 			valid = lacp = 0;
 			break;
 		}
 
 		LAGG_XLOCK(sc);
 
 		if (valid == 0 ||
 		    (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) {
 			/* Invalid combination of options specified. */
 			error = EINVAL;
 			LAGG_XUNLOCK(sc);
 			break;	/* Return from SIOCSLAGGOPTS. */ 
 		}
 		/*
 		 * Store new options into sc->sc_opts except for
 		 * FLOWIDSHIFT and LACP options.
 		 */
 		if (lacp == 0) {
 			if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT)
 				sc->flowid_shift = ro->ro_flowid_shift;
 			else if (ro->ro_opts > 0)
 				sc->sc_opts |= ro->ro_opts;
 			else
 				sc->sc_opts &= ~ro->ro_opts;
 		} else {
 			struct lacp_softc *lsc;
 			struct lacp_port *lp;
 
 			lsc = (struct lacp_softc *)sc->sc_psc;
 
 			switch (ro->ro_opts) {
 			case LAGG_OPT_LACP_TXTEST:
 				lsc->lsc_debug.lsc_tx_test = 1;
 				break;
 			case -LAGG_OPT_LACP_TXTEST:
 				lsc->lsc_debug.lsc_tx_test = 0;
 				break;
 			case LAGG_OPT_LACP_RXTEST:
 				lsc->lsc_debug.lsc_rx_test = 1;
 				break;
 			case -LAGG_OPT_LACP_RXTEST:
 				lsc->lsc_debug.lsc_rx_test = 0;
 				break;
 			case LAGG_OPT_LACP_STRICT:
 				lsc->lsc_strict_mode = 1;
 				break;
 			case -LAGG_OPT_LACP_STRICT:
 				lsc->lsc_strict_mode = 0;
 				break;
 			case LAGG_OPT_LACP_TIMEOUT:
 				LACP_LOCK(lsc);
         			LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
                         		lp->lp_state |= LACP_STATE_TIMEOUT;
 				LACP_UNLOCK(lsc);
 				lsc->lsc_fast_timeout = 1;
 				break;
 			case -LAGG_OPT_LACP_TIMEOUT:
 				LACP_LOCK(lsc);
         			LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
                         		lp->lp_state &= ~LACP_STATE_TIMEOUT;
 				LACP_UNLOCK(lsc);
 				lsc->lsc_fast_timeout = 0;
 				break;
 			}
 		}
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCGLAGGFLAGS:
 		rf->rf_flags = 0;
 		LAGG_XLOCK(sc);
 		if (sc->sc_flags & MBUF_HASHFLAG_L2)
 			rf->rf_flags |= LAGG_F_HASHL2;
 		if (sc->sc_flags & MBUF_HASHFLAG_L3)
 			rf->rf_flags |= LAGG_F_HASHL3;
 		if (sc->sc_flags & MBUF_HASHFLAG_L4)
 			rf->rf_flags |= LAGG_F_HASHL4;
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCSLAGGHASH:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) {
 			error = EINVAL;
 			break;
 		}
 		LAGG_XLOCK(sc);
 		sc->sc_flags = 0;
 		if (rf->rf_flags & LAGG_F_HASHL2)
 			sc->sc_flags |= MBUF_HASHFLAG_L2;
 		if (rf->rf_flags & LAGG_F_HASHL3)
 			sc->sc_flags |= MBUF_HASHFLAG_L3;
 		if (rf->rf_flags & LAGG_F_HASHL4)
 			sc->sc_flags |= MBUF_HASHFLAG_L4;
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCGLAGGPORT:
 		if (rp->rp_portname[0] == '\0' ||
 		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
 			error = EINVAL;
 			break;
 		}
 
 		LAGG_RLOCK();
 		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
 		    lp->lp_softc != sc) {
 			error = ENOENT;
 			LAGG_RUNLOCK();
 			if_rele(tpif);
 			break;
 		}
 
 		lagg_port2req(lp, rp);
 		LAGG_RUNLOCK();
 		if_rele(tpif);
 		break;
 	case SIOCSLAGGPORT:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if (rp->rp_portname[0] == '\0' ||
 		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
 			error = EINVAL;
 			break;
 		}
 #ifdef INET6
 		/*
 		 * A laggport interface should not have inet6 address
 		 * because two interfaces with a valid link-local
 		 * scope zone must not be merged in any form.  This
 		 * restriction is needed to prevent violation of
 		 * link-local scope zone.  Attempts to add a laggport
 		 * interface which has inet6 addresses triggers
 		 * removal of all inet6 addresses on the member
 		 * interface.
 		 */
 		if (in6ifa_llaonifp(tpif)) {
 			in6_ifdetach(tpif);
 				if_printf(sc->sc_ifp,
 				    "IPv6 addresses on %s have been removed "
 				    "before adding it as a member to prevent "
 				    "IPv6 address scope violation.\n",
 				    tpif->if_xname);
 		}
 #endif
 		LAGG_XLOCK(sc);
 		error = lagg_port_create(sc, tpif);
 		LAGG_XUNLOCK(sc);
 		if_rele(tpif);
 		VLAN_CAPABILITIES(ifp);
 		break;
 	case SIOCSLAGGDELPORT:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if (rp->rp_portname[0] == '\0' ||
 		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
 			error = EINVAL;
 			break;
 		}
 
 		LAGG_XLOCK(sc);
 		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
 		    lp->lp_softc != sc) {
 			error = ENOENT;
 			LAGG_XUNLOCK(sc);
 			if_rele(tpif);
 			break;
 		}
 
 		error = lagg_port_destroy(lp, 1);
 		LAGG_XUNLOCK(sc);
 		if_rele(tpif);
 		VLAN_CAPABILITIES(ifp);
 		break;
 	case SIOCSIFFLAGS:
 		/* Set flags on ports too */
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			lagg_setflags(lp, 1);
 		}
 
 		if (!(ifp->if_flags & IFF_UP) &&
 		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			/*
 			 * If interface is marked down and it is running,
 			 * then stop and disable it.
 			 */
 			lagg_stop(sc);
 			LAGG_XUNLOCK(sc);
 		} else if ((ifp->if_flags & IFF_UP) &&
 		    !(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			/*
 			 * If interface is marked up and it is stopped, then
 			 * start it.
 			 */
 			LAGG_XUNLOCK(sc);
 			(*ifp->if_init)(sc);
 		} else
 			LAGG_XUNLOCK(sc);
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			lagg_clrmulti(lp);
 			lagg_setmulti(lp);
 		}
 		LAGG_XUNLOCK(sc);
 		error = 0;
 		break;
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
 		break;
 
 	case SIOCSIFCAP:
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			if (lp->lp_ioctl != NULL)
 				(*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
 		}
 		lagg_capabilities(sc);
 		LAGG_XUNLOCK(sc);
 		VLAN_CAPABILITIES(ifp);
 		error = 0;
 		break;
 
 	case SIOCSIFMTU:
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			if (lp->lp_ioctl != NULL)
 				error = (*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
 			else
 				error = EINVAL;
 			if (error != 0) {
 				if_printf(ifp,
 				    "failed to change MTU to %d on port %s, "
 				    "reverting all ports to original MTU (%d)\n",
 				    ifr->ifr_mtu, lp->lp_ifp->if_xname, ifp->if_mtu);
 				break;
 			}
 		}
 		if (error == 0) {
 			ifp->if_mtu = ifr->ifr_mtu;
 		} else {
 			/* set every port back to the original MTU */
 			ifr->ifr_mtu = ifp->if_mtu;
 			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 				if (lp->lp_ioctl != NULL)
 					(*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
 			}
 		}
 		LAGG_XUNLOCK(sc);
 		break;
 
 	default:
 		error = ether_ioctl(ifp, cmd, data);
 		break;
 	}
 	return (error);
 }
 
 #ifdef RATELIMIT
-static int
-lagg_snd_tag_alloc(struct ifnet *ifp,
-    union if_snd_tag_alloc_params *params,
-    struct m_snd_tag **ppmt)
+static inline struct lagg_snd_tag *
+mst_to_lst(struct m_snd_tag *mst)
 {
-	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
+
+	return (__containerof(mst, struct lagg_snd_tag, com));
+}
+
+/*
+ * Look up the port used by a specific flow.  This only works for lagg
+ * protocols with deterministic port mappings (e.g. not roundrobin).
+ * In addition protocols which use a hash to map flows to ports must
+ * be configured to use the mbuf flowid rather than hashing packet
+ * contents.
+ */
+static struct lagg_port *
+lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype)
+{
+	struct lagg_softc *sc;
 	struct lagg_port *lp;
 	struct lagg_lb *lb;
 	uint32_t p;
 
-	LAGG_RLOCK();
+	sc = ifp->if_softc;
+
 	switch (sc->sc_proto) {
 	case LAGG_PROTO_FAILOVER:
-		lp = lagg_link_active(sc, sc->sc_primary);
-		break;
+		return (lagg_link_active(sc, sc->sc_primary));
 	case LAGG_PROTO_LOADBALANCE:
 		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
-		    params->hdr.flowtype == M_HASHTYPE_NONE) {
-			LAGG_RUNLOCK();
-			return (EOPNOTSUPP);
-		}
-		p = params->hdr.flowid >> sc->flowid_shift;
+		    flowtype == M_HASHTYPE_NONE)
+			return (NULL);
+		p = flowid >> sc->flowid_shift;
 		p %= sc->sc_count;
 		lb = (struct lagg_lb *)sc->sc_psc;
 		lp = lb->lb_ports[p];
-		lp = lagg_link_active(sc, lp);
-		break;
+		return (lagg_link_active(sc, lp));
 	case LAGG_PROTO_LACP:
 		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
-		    params->hdr.flowtype == M_HASHTYPE_NONE) {
-			LAGG_RUNLOCK();
-			return (EOPNOTSUPP);
-		}
-		lp = lacp_select_tx_port_by_hash(sc, params->hdr.flowid);
-		break;
+		    flowtype == M_HASHTYPE_NONE)
+			return (NULL);
+		return (lacp_select_tx_port_by_hash(sc, flowid));
 	default:
+		return (NULL);
+	}
+}
+
+static int
+lagg_snd_tag_alloc(struct ifnet *ifp,
+    union if_snd_tag_alloc_params *params,
+    struct m_snd_tag **ppmt)
+{
+	struct lagg_snd_tag *lst;
+	struct lagg_softc *sc;
+	struct lagg_port *lp;
+	struct ifnet *lp_ifp;
+	int error;
+
+	sc = ifp->if_softc;
+
+	LAGG_RLOCK();
+	lp = lookup_snd_tag_port(ifp, params->hdr.flowid, params->hdr.flowtype);
+	if (lp == NULL) {
 		LAGG_RUNLOCK();
 		return (EOPNOTSUPP);
 	}
-	if (lp == NULL) {
+	if (lp->lp_ifp == NULL || lp->lp_ifp->if_snd_tag_alloc == NULL) {
 		LAGG_RUNLOCK();
 		return (EOPNOTSUPP);
 	}
-	ifp = lp->lp_ifp;
+	lp_ifp = lp->lp_ifp;
+	if_ref(lp_ifp);
 	LAGG_RUNLOCK();
-	if (ifp == NULL || ifp->if_snd_tag_alloc == NULL ||
-	    (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
-		return (EOPNOTSUPP);
 
-	/* forward allocation request */
-	return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
+	lst = malloc(sizeof(*lst), M_LAGG, M_NOWAIT);
+	if (lst == NULL) {
+		if_rele(lp_ifp);
+		return (ENOMEM);
+	}
+
+	error = lp_ifp->if_snd_tag_alloc(lp_ifp, params, &lst->tag);
+	if_rele(lp_ifp);
+	if (error) {
+		free(lst, M_LAGG);
+		return (error);
+	}
+
+	m_snd_tag_init(&lst->com, ifp);
+
+	*ppmt = &lst->com;
+	return (0);
 }
 
+static int
+lagg_snd_tag_modify(struct m_snd_tag *mst,
+    union if_snd_tag_modify_params *params)
+{
+	struct lagg_snd_tag *lst;
+
+	lst = mst_to_lst(mst);
+	return (lst->tag->ifp->if_snd_tag_modify(lst->tag, params));
+}
+
+static int
+lagg_snd_tag_query(struct m_snd_tag *mst,
+    union if_snd_tag_query_params *params)
+{
+	struct lagg_snd_tag *lst;
+
+	lst = mst_to_lst(mst);
+	return (lst->tag->ifp->if_snd_tag_query(lst->tag, params));
+}
+
 static void
-lagg_snd_tag_free(struct m_snd_tag *tag)
+lagg_snd_tag_free(struct m_snd_tag *mst)
 {
-	tag->ifp->if_snd_tag_free(tag);
+	struct lagg_snd_tag *lst;
+
+	lst = mst_to_lst(mst);
+	m_snd_tag_rele(lst->tag);
+	free(lst, M_LAGG);
 }
 
 #endif
 
 static int
 lagg_setmulti(struct lagg_port *lp)
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	struct ifnet *ifp = lp->lp_ifp;
 	struct ifnet *scifp = sc->sc_ifp;
 	struct lagg_mc *mc;
 	struct ifmultiaddr *ifma;
 	int error;
 
 	IF_ADDR_WLOCK(scifp);
 	CK_STAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
 		mc = malloc(sizeof(struct lagg_mc), M_LAGG, M_NOWAIT);
 		if (mc == NULL) {
 			IF_ADDR_WUNLOCK(scifp);
 			return (ENOMEM);
 		}
 		bcopy(ifma->ifma_addr, &mc->mc_addr,
 		    ifma->ifma_addr->sa_len);
 		mc->mc_addr.sdl_index = ifp->if_index;
 		mc->mc_ifma = NULL;
 		SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries);
 	}
 	IF_ADDR_WUNLOCK(scifp);
 	SLIST_FOREACH (mc, &lp->lp_mc_head, mc_entries) {
 		error = if_addmulti(ifp,
 		    (struct sockaddr *)&mc->mc_addr, &mc->mc_ifma);
 		if (error)
 			return (error);
 	}
 	return (0);
 }
 
 static int
 lagg_clrmulti(struct lagg_port *lp)
 {
 	struct lagg_mc *mc;
 
 	LAGG_XLOCK_ASSERT(lp->lp_softc);
 	while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) {
 		SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries);
 		if (mc->mc_ifma && lp->lp_detaching == 0)
 			if_delmulti_ifma(mc->mc_ifma);
 		free(mc, M_LAGG);
 	}
 	return (0);
 }
 
 static int
 lagg_setcaps(struct lagg_port *lp, int cap)
 {
 	struct ifreq ifr;
 
 	if (lp->lp_ifp->if_capenable == cap)
 		return (0);
 	if (lp->lp_ioctl == NULL)
 		return (ENXIO);
 	ifr.ifr_reqcap = cap;
 	return ((*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAP, (caddr_t)&ifr));
 }
 
 /* Handle a ref counted flag that should be set on the lagg port as well */
 static int
 lagg_setflag(struct lagg_port *lp, int flag, int status,
     int (*func)(struct ifnet *, int))
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	struct ifnet *scifp = sc->sc_ifp;
 	struct ifnet *ifp = lp->lp_ifp;
 	int error;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	status = status ? (scifp->if_flags & flag) : 0;
 	/* Now "status" contains the flag value or 0 */
 
 	/*
 	 * See if recorded ports status is different from what
 	 * we want it to be.  If it is, flip it.  We record ports
 	 * status in lp_ifflags so that we won't clear ports flag
 	 * we haven't set.  In fact, we don't clear or set ports
 	 * flags directly, but get or release references to them.
 	 * That's why we can be sure that recorded flags still are
 	 * in accord with actual ports flags.
 	 */
 	if (status != (lp->lp_ifflags & flag)) {
 		error = (*func)(ifp, status);
 		if (error)
 			return (error);
 		lp->lp_ifflags &= ~flag;
 		lp->lp_ifflags |= status;
 	}
 	return (0);
 }
 
 /*
  * Handle IFF_* flags that require certain changes on the lagg port
  * if "status" is true, update ports flags respective to the lagg
  * if "status" is false, forcedly clear the flags set on port.
  */
 static int
 lagg_setflags(struct lagg_port *lp, int status)
 {
 	int error, i;
 
 	for (i = 0; lagg_pflags[i].flag; i++) {
 		error = lagg_setflag(lp, lagg_pflags[i].flag,
 		    status, lagg_pflags[i].func);
 		if (error)
 			return (error);
 	}
 	return (0);
 }
 
 static int
 lagg_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	int error;
 
+#ifdef RATELIMIT
+	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG)
+		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
+#endif
 	LAGG_RLOCK();
 	/* We need a Tx algorithm and at least one port */
 	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
 		LAGG_RUNLOCK();
 		m_freem(m);
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENXIO);
 	}
 
 	ETHER_BPF_MTAP(ifp, m);
 
 	error = lagg_proto_start(sc, m);
 	LAGG_RUNLOCK();
 
 	if (error != 0)
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 
 	return (error);
 }
 
 /*
  * The ifp->if_qflush entry point for lagg(4) is no-op.
  */
 static void
 lagg_qflush(struct ifnet *ifp __unused)
 {
 }
 
 static struct mbuf *
 lagg_input(struct ifnet *ifp, struct mbuf *m)
 {
 	struct lagg_port *lp = ifp->if_lagg;
 	struct lagg_softc *sc = lp->lp_softc;
 	struct ifnet *scifp = sc->sc_ifp;
 
 	LAGG_RLOCK();
 	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    lp->lp_detaching != 0 ||
 	    sc->sc_proto == LAGG_PROTO_NONE) {
 		LAGG_RUNLOCK();
 		m_freem(m);
 		return (NULL);
 	}
 
 	ETHER_BPF_MTAP(scifp, m);
 
 	m = lagg_proto_input(sc, lp, m);
 	if (m != NULL && (scifp->if_flags & IFF_MONITOR) != 0) {
 		m_freem(m);
 		m = NULL;
 	}
 
 	LAGG_RUNLOCK();
 	return (m);
 }
 
 static int
 lagg_media_change(struct ifnet *ifp)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 
 	if (sc->sc_ifflags & IFF_DEBUG)
 		printf("%s\n", __func__);
 
 	/* Ignore */
 	return (0);
 }
 
 static void
 lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct lagg_port *lp;
 
 	imr->ifm_status = IFM_AVALID;
 	imr->ifm_active = IFM_ETHER | IFM_AUTO;
 
 	LAGG_RLOCK();
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (LAGG_PORTACTIVE(lp))
 			imr->ifm_status |= IFM_ACTIVE;
 	}
 	LAGG_RUNLOCK();
 }
 
 static void
 lagg_linkstate(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 	int new_link = LINK_STATE_DOWN;
 	uint64_t speed;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	/* LACP handles link state itself */
 	if (sc->sc_proto == LAGG_PROTO_LACP)
 		return;
 
 	/* Our link is considered up if at least one of our ports is active */
 	LAGG_RLOCK();
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (lp->lp_ifp->if_link_state == LINK_STATE_UP) {
 			new_link = LINK_STATE_UP;
 			break;
 		}
 	}
 	LAGG_RUNLOCK();
 	if_link_state_change(sc->sc_ifp, new_link);
 
 	/* Update if_baudrate to reflect the max possible speed */
 	switch (sc->sc_proto) {
 		case LAGG_PROTO_FAILOVER:
 			sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ?
 			    sc->sc_primary->lp_ifp->if_baudrate : 0;
 			break;
 		case LAGG_PROTO_ROUNDROBIN:
 		case LAGG_PROTO_LOADBALANCE:
 		case LAGG_PROTO_BROADCAST:
 			speed = 0;
 			LAGG_RLOCK();
 			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 				speed += lp->lp_ifp->if_baudrate;
 			LAGG_RUNLOCK();
 			sc->sc_ifp->if_baudrate = speed;
 			break;
 		case LAGG_PROTO_LACP:
 			/* LACP updates if_baudrate itself */
 			break;
 	}
 }
 
 static void
 lagg_port_state(struct ifnet *ifp, int state)
 {
 	struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg;
 	struct lagg_softc *sc = NULL;
 
 	if (lp != NULL)
 		sc = lp->lp_softc;
 	if (sc == NULL)
 		return;
 
 	LAGG_XLOCK(sc);
 	lagg_linkstate(sc);
 	lagg_proto_linkstate(sc, lp);
 	LAGG_XUNLOCK(sc);
 }
 
 struct lagg_port *
 lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp)
 {
 	struct lagg_port *lp_next, *rval = NULL;
 
 	/*
 	 * Search a port which reports an active link state.
 	 */
 
 	/*
 	 * This is called with either LAGG_RLOCK() held or
 	 * LAGG_XLOCK(sc) held.
 	 */
 	if (!in_epoch(net_epoch_preempt))
 		LAGG_XLOCK_ASSERT(sc);
 
 	if (lp == NULL)
 		goto search;
 	if (LAGG_PORTACTIVE(lp)) {
 		rval = lp;
 		goto found;
 	}
 	if ((lp_next = CK_SLIST_NEXT(lp, lp_entries)) != NULL &&
 	    LAGG_PORTACTIVE(lp_next)) {
 		rval = lp_next;
 		goto found;
 	}
 
 search:
 	CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
 		if (LAGG_PORTACTIVE(lp_next)) {
 			return (lp_next);
 		}
 	}
 found:
 	return (rval);
 }
 
 int
 lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
 {
 
+#ifdef RATELIMIT
+	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) {
+		struct lagg_snd_tag *lst;
+		struct m_snd_tag *mst;
+
+		mst = m->m_pkthdr.snd_tag;
+		lst = mst_to_lst(mst);
+		if (lst->tag->ifp != ifp) {
+			m_freem(m);
+			return (EAGAIN);
+		}
+		m->m_pkthdr.snd_tag = m_snd_tag_ref(lst->tag);
+		m_snd_tag_rele(mst);
+	}
+#endif
 	return (ifp->if_transmit)(ifp, m);
 }
 
 /*
  * Simple round robin aggregation
  */
 static void
 lagg_rr_attach(struct lagg_softc *sc)
 {
 	sc->sc_seq = 0;
 	sc->sc_bkt_count = sc->sc_bkt;
 }
 
 static int
 lagg_rr_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_port *lp;
 	uint32_t p;
 
 	if (sc->sc_bkt_count == 0 && sc->sc_bkt > 0)
 		sc->sc_bkt_count = sc->sc_bkt;
 
 	if (sc->sc_bkt > 0) {
 		atomic_subtract_int(&sc->sc_bkt_count, 1);
 	if (atomic_cmpset_int(&sc->sc_bkt_count, 0, sc->sc_bkt))
 		p = atomic_fetchadd_32(&sc->sc_seq, 1);
 	else
 		p = sc->sc_seq; 
 	} else
 		p = atomic_fetchadd_32(&sc->sc_seq, 1);
 
 	p %= sc->sc_count;
 	lp = CK_SLIST_FIRST(&sc->sc_ports);
 
 	while (p--)
 		lp = CK_SLIST_NEXT(lp, lp_entries);
 
 	/*
 	 * Check the port's link state. This will return the next active
 	 * port if the link is down or the port is NULL.
 	 */
 	if ((lp = lagg_link_active(sc, lp)) == NULL) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Send mbuf */
 	return (lagg_enqueue(lp->lp_ifp, m));
 }
 
 static struct mbuf *
 lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 
 	/* Just pass in the packet to our lagg device */
 	m->m_pkthdr.rcvif = ifp;
 
 	return (m);
 }
 
 /*
  * Broadcast mode
  */
 static int
 lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	int active_ports = 0;
 	int errors = 0;
 	int ret;
 	struct lagg_port *lp, *last = NULL;
 	struct mbuf *m0;
 
 	LAGG_RLOCK_ASSERT();
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (!LAGG_PORTACTIVE(lp))
 			continue;
 
 		active_ports++;
 
 		if (last != NULL) {
 			m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 			if (m0 == NULL) {
 				ret = ENOBUFS;
 				errors++;
 				break;
 			}
 
 			ret = lagg_enqueue(last->lp_ifp, m0);
 			if (ret != 0)
 				errors++;
 		}
 		last = lp;
 	}
 
 	if (last == NULL) {
 		m_freem(m);
 		return (ENOENT);
 	}
 	if ((last = lagg_link_active(sc, last)) == NULL) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	ret = lagg_enqueue(last->lp_ifp, m);
 	if (ret != 0)
 		errors++;
 
 	if (errors == 0)
 		return (ret);
 
 	return (0);
 }
 
 static struct mbuf*
 lagg_bcast_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 
 	/* Just pass in the packet to our lagg device */
 	m->m_pkthdr.rcvif = ifp;
 	return (m);
 }
 
 /*
  * Active failover
  */
 static int
 lagg_fail_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_port *lp;
 
 	/* Use the master port if active or the next available port */
 	if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Send mbuf */
 	return (lagg_enqueue(lp->lp_ifp, m));
 }
 
 static struct mbuf *
 lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 	struct lagg_port *tmp_tp;
 
 	if (lp == sc->sc_primary || V_lagg_failover_rx_all) {
 		m->m_pkthdr.rcvif = ifp;
 		return (m);
 	}
 
 	if (!LAGG_PORTACTIVE(sc->sc_primary)) {
 		tmp_tp = lagg_link_active(sc, sc->sc_primary);
 		/*
 		 * If tmp_tp is null, we've received a packet when all
 		 * our links are down. Weird, but process it anyways.
 		 */
 		if ((tmp_tp == NULL || tmp_tp == lp)) {
 			m->m_pkthdr.rcvif = ifp;
 			return (m);
 		}
 	}
 
 	m_freem(m);
 	return (NULL);
 }
 
 /*
  * Loadbalancing
  */
 static void
 lagg_lb_attach(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 	struct lagg_lb *lb;
 
 	LAGG_XLOCK_ASSERT(sc);
 	lb = malloc(sizeof(struct lagg_lb), M_LAGG, M_WAITOK | M_ZERO);
 	lb->lb_key = m_ether_tcpip_hash_init();
 	sc->sc_psc = lb;
 
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lagg_lb_port_create(lp);
 }
 
 static void
 lagg_lb_detach(struct lagg_softc *sc)
 {
 	struct lagg_lb *lb;
 
 	lb = (struct lagg_lb *)sc->sc_psc;
 	if (lb != NULL)
 		free(lb, M_LAGG);
 }
 
 static int
 lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp)
 {
 	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
 	struct lagg_port *lp_next;
 	int i = 0, rv;
 
 	rv = 0;
 	bzero(&lb->lb_ports, sizeof(lb->lb_ports));
 	LAGG_XLOCK_ASSERT(sc);
 	CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
 		if (lp_next == lp)
 			continue;
 		if (i >= LAGG_MAX_PORTS) {
 			rv = EINVAL;
 			break;
 		}
 		if (sc->sc_ifflags & IFF_DEBUG)
 			printf("%s: port %s at index %d\n",
 			    sc->sc_ifname, lp_next->lp_ifp->if_xname, i);
 		lb->lb_ports[i++] = lp_next;
 	}
 
 	return (rv);
 }
 
 static int
 lagg_lb_port_create(struct lagg_port *lp)
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	return (lagg_lb_porttable(sc, NULL));
 }
 
 static void
 lagg_lb_port_destroy(struct lagg_port *lp)
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	lagg_lb_porttable(sc, lp);
 }
 
 static int
 lagg_lb_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
 	struct lagg_port *lp = NULL;
 	uint32_t p = 0;
 
 	if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) &&
 	    M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
 		p = m->m_pkthdr.flowid >> sc->flowid_shift;
 	else
 		p = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key);
 	p %= sc->sc_count;
 	lp = lb->lb_ports[p];
 
 	/*
 	 * Check the port's link state. This will return the next active
 	 * port if the link is down or the port is NULL.
 	 */
 	if ((lp = lagg_link_active(sc, lp)) == NULL) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Send mbuf */
 	return (lagg_enqueue(lp->lp_ifp, m));
 }
 
 static struct mbuf *
 lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 
 	/* Just pass in the packet to our lagg device */
 	m->m_pkthdr.rcvif = ifp;
 
 	return (m);
 }
 
 /*
  * 802.3ad LACP
  */
 static void
 lagg_lacp_attach(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 
 	lacp_attach(sc);
 	LAGG_XLOCK_ASSERT(sc);
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_create(lp);
 }
 
 static void
 lagg_lacp_detach(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 	void *psc;
 
 	LAGG_XLOCK_ASSERT(sc);
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_destroy(lp);
 
 	psc = sc->sc_psc;
 	sc->sc_psc = NULL;
 	lacp_detach(psc);
 }
 
 static void
 lagg_lacp_lladdr(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 
 	LAGG_SXLOCK_ASSERT(sc);
 
 	/* purge all the lacp ports */
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_destroy(lp);
 
 	/* add them back in */
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_create(lp);
 }
 
 static int
 lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_port *lp;
 
 	lp = lacp_select_tx_port(sc, m);
 	if (lp == NULL) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Send mbuf */
 	return (lagg_enqueue(lp->lp_ifp, m));
 }
 
 static struct mbuf *
 lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 	struct ether_header *eh;
 	u_short etype;
 
 	eh = mtod(m, struct ether_header *);
 	etype = ntohs(eh->ether_type);
 
 	/* Tap off LACP control messages */
 	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) {
 		m = lacp_input(lp, m);
 		if (m == NULL)
 			return (NULL);
 	}
 
 	/*
 	 * If the port is not collecting or not in the active aggregator then
 	 * free and return.
 	 */
 	if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) {
 		m_freem(m);
 		return (NULL);
 	}
 
 	m->m_pkthdr.rcvif = ifp;
 	return (m);
 }
 
Index: head/sys/net/if_vlan.c
===================================================================
--- head/sys/net/if_vlan.c	(revision 348253)
+++ head/sys/net/if_vlan.c	(revision 348254)
@@ -1,1945 +1,2038 @@
 /*-
  * Copyright 1998 Massachusetts Institute of Technology
  * Copyright 2012 ADARA Networks, Inc.
  * Copyright 2017 Dell EMC Isilon
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to ADARA Networks, Inc.
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  * 
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * if_vlan.c - pseudo-device driver for IEEE 802.1Q virtual LANs.
  * This is sort of sneaky in the implementation, since
  * we need to pretend to be enough of an Ethernet implementation
  * to make arp work.  The way we do this is by telling everyone
  * that we are an Ethernet, and then catch the packets that
  * ether_output() sends to us via if_transmit(), rewrite them for
  * use by the real outgoing interface, and ask it to send them.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_vlan.h"
 #include "opt_ratelimit.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/rmlock.h>
 #include <sys/priv.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/taskqueue.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 #include <net/vnet.h>
 
 #ifdef INET
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #endif
 
 #define	VLAN_DEF_HWIDTH	4
 #define	VLAN_IFFLAGS	(IFF_BROADCAST | IFF_MULTICAST)
 
 #define	UP_AND_RUNNING(ifp) \
     ((ifp)->if_flags & IFF_UP && (ifp)->if_drv_flags & IFF_DRV_RUNNING)
 
 CK_SLIST_HEAD(ifvlanhead, ifvlan);
 
 struct ifvlantrunk {
 	struct	ifnet   *parent;	/* parent interface of this trunk */
 	struct	mtx	lock;
 #ifdef VLAN_ARRAY
 #define	VLAN_ARRAY_SIZE	(EVL_VLID_MASK + 1)
 	struct	ifvlan	*vlans[VLAN_ARRAY_SIZE]; /* static table */
 #else
 	struct	ifvlanhead *hash;	/* dynamic hash-list table */
 	uint16_t	hmask;
 	uint16_t	hwidth;
 #endif
 	int		refcnt;
 };
 
+#ifdef RATELIMIT
+struct vlan_snd_tag {
+	struct m_snd_tag com;
+	struct m_snd_tag *tag;
+};
+
+static inline struct vlan_snd_tag *
+mst_to_vst(struct m_snd_tag *mst)
+{
+
+	return (__containerof(mst, struct vlan_snd_tag, com));
+}
+#endif
+
 /*
  * This macro provides a facility to iterate over every vlan on a trunk with
  * the assumption that none will be added/removed during iteration.
  */
 #ifdef VLAN_ARRAY
 #define VLAN_FOREACH(_ifv, _trunk) \
 	size_t _i; \
 	for (_i = 0; _i < VLAN_ARRAY_SIZE; _i++) \
 		if (((_ifv) = (_trunk)->vlans[_i]) != NULL)
 #else /* VLAN_ARRAY */
 #define VLAN_FOREACH(_ifv, _trunk) \
 	struct ifvlan *_next; \
 	size_t _i; \
 	for (_i = 0; _i < (1 << (_trunk)->hwidth); _i++) \
 		CK_SLIST_FOREACH_SAFE((_ifv), &(_trunk)->hash[_i], ifv_list, _next)
 #endif /* VLAN_ARRAY */
 
 /*
  * This macro provides a facility to iterate over every vlan on a trunk while
  * also modifying the number of vlans on the trunk. The iteration continues
  * until some condition is met or there are no more vlans on the trunk.
  */
 #ifdef VLAN_ARRAY
 /* The VLAN_ARRAY case is simple -- just a for loop using the condition. */
 #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \
 	size_t _i; \
 	for (_i = 0; !(_cond) && _i < VLAN_ARRAY_SIZE; _i++) \
 		if (((_ifv) = (_trunk)->vlans[_i]))
 #else /* VLAN_ARRAY */
 /*
  * The hash table case is more complicated. We allow for the hash table to be
  * modified (i.e. vlans removed) while we are iterating over it. To allow for
  * this we must restart the iteration every time we "touch" something during
  * the iteration, since removal will resize the hash table and invalidate our
  * current position. If acting on the touched element causes the trunk to be
  * emptied, then iteration also stops.
  */
 #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \
 	size_t _i; \
 	bool _touch = false; \
 	for (_i = 0; \
 	    !(_cond) && _i < (1 << (_trunk)->hwidth); \
 	    _i = (_touch && ((_trunk) != NULL) ? 0 : _i + 1), _touch = false) \
 		if (((_ifv) = CK_SLIST_FIRST(&(_trunk)->hash[_i])) != NULL && \
 		    (_touch = true))
 #endif /* VLAN_ARRAY */
 
 struct vlan_mc_entry {
 	struct sockaddr_dl		mc_addr;
 	CK_SLIST_ENTRY(vlan_mc_entry)	mc_entries;
 	struct epoch_context		mc_epoch_ctx;
 };
 
 struct ifvlan {
 	struct	ifvlantrunk *ifv_trunk;
 	struct	ifnet *ifv_ifp;
 #define	TRUNK(ifv)	((ifv)->ifv_trunk)
 #define	PARENT(ifv)	((ifv)->ifv_trunk->parent)
 	void	*ifv_cookie;
 	int	ifv_pflags;	/* special flags we have set on parent */
 	int	ifv_capenable;
 	int	ifv_encaplen;	/* encapsulation length */
 	int	ifv_mtufudge;	/* MTU fudged by this much */
 	int	ifv_mintu;	/* min transmission unit */
 	uint16_t ifv_proto;	/* encapsulation ethertype */
 	uint16_t ifv_tag;	/* tag to apply on packets leaving if */
 	uint16_t ifv_vid;	/* VLAN ID */
 	uint8_t	ifv_pcp;	/* Priority Code Point (PCP). */
 	struct task lladdr_task;
 	CK_SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead;
 #ifndef VLAN_ARRAY
 	CK_SLIST_ENTRY(ifvlan) ifv_list;
 #endif
 };
 
 /* Special flags we should propagate to parent. */
 static struct {
 	int flag;
 	int (*func)(struct ifnet *, int);
 } vlan_pflags[] = {
 	{IFF_PROMISC, ifpromisc},
 	{IFF_ALLMULTI, if_allmulti},
 	{0, NULL}
 };
 
 extern int vlan_mtag_pcp;
 
 static const char vlanname[] = "vlan";
 static MALLOC_DEFINE(M_VLAN, vlanname, "802.1Q Virtual LAN Interface");
 
 static eventhandler_tag ifdetach_tag;
 static eventhandler_tag iflladdr_tag;
 
 /*
  * if_vlan uses two module-level synchronizations primitives to allow concurrent 
  * modification of vlan interfaces and (mostly) allow for vlans to be destroyed 
  * while they are being used for tx/rx. To accomplish this in a way that has 
  * acceptable performance and cooperation with other parts of the network stack
  * there is a non-sleepable epoch(9) and an sx(9).
  *
  * The performance-sensitive paths that warrant using the epoch(9) are
  * vlan_transmit and vlan_input. Both have to check for the vlan interface's
  * existence using if_vlantrunk, and being in the network tx/rx paths the use
  * of an epoch(9) gives a measureable improvement in performance.
  *
  * The reason for having an sx(9) is mostly because there are still areas that
  * must be sleepable and also have safe concurrent access to a vlan interface.
  * Since the sx(9) exists, it is used by default in most paths unless sleeping
  * is not permitted, or if it is not clear whether sleeping is permitted.
  *
  */
 #define _VLAN_SX_ID ifv_sx
 
 static struct sx _VLAN_SX_ID;
 
 #define VLAN_LOCKING_INIT() \
 	sx_init(&_VLAN_SX_ID, "vlan_sx")
 
 #define VLAN_LOCKING_DESTROY() \
 	sx_destroy(&_VLAN_SX_ID)
 
 #define	VLAN_SLOCK()			sx_slock(&_VLAN_SX_ID)
 #define	VLAN_SUNLOCK()			sx_sunlock(&_VLAN_SX_ID)
 #define	VLAN_XLOCK()			sx_xlock(&_VLAN_SX_ID)
 #define	VLAN_XUNLOCK()			sx_xunlock(&_VLAN_SX_ID)
 #define	VLAN_SLOCK_ASSERT()		sx_assert(&_VLAN_SX_ID, SA_SLOCKED)
 #define	VLAN_XLOCK_ASSERT()		sx_assert(&_VLAN_SX_ID, SA_XLOCKED)
 #define	VLAN_SXLOCK_ASSERT()		sx_assert(&_VLAN_SX_ID, SA_LOCKED)
 
 
 /*
  * We also have a per-trunk mutex that should be acquired when changing
  * its state.
  */
 #define	TRUNK_LOCK_INIT(trunk)		mtx_init(&(trunk)->lock, vlanname, NULL, MTX_DEF)
 #define	TRUNK_LOCK_DESTROY(trunk)	mtx_destroy(&(trunk)->lock)
 #define	TRUNK_WLOCK(trunk)		mtx_lock(&(trunk)->lock)
 #define	TRUNK_WUNLOCK(trunk)		mtx_unlock(&(trunk)->lock)
 #define	TRUNK_LOCK_ASSERT(trunk)	MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(trunk)->lock))
 #define	TRUNK_WLOCK_ASSERT(trunk)	mtx_assert(&(trunk)->lock, MA_OWNED);
 
 /*
  * The VLAN_ARRAY substitutes the dynamic hash with a static array
  * with 4096 entries. In theory this can give a boost in processing,
  * however in practice it does not. Probably this is because the array
  * is too big to fit into CPU cache.
  */
 #ifndef VLAN_ARRAY
 static	void vlan_inithash(struct ifvlantrunk *trunk);
 static	void vlan_freehash(struct ifvlantrunk *trunk);
 static	int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv);
 static	int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv);
 static	void vlan_growhash(struct ifvlantrunk *trunk, int howmuch);
 static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk,
 	uint16_t vid);
 #endif
 static	void trunk_destroy(struct ifvlantrunk *trunk);
 
 static	void vlan_init(void *foo);
 static	void vlan_input(struct ifnet *ifp, struct mbuf *m);
 static	int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr);
 #ifdef RATELIMIT
 static	int vlan_snd_tag_alloc(struct ifnet *,
     union if_snd_tag_alloc_params *, struct m_snd_tag **);
-static void vlan_snd_tag_free(struct m_snd_tag *);
+static	int vlan_snd_tag_modify(struct m_snd_tag *,
+    union if_snd_tag_modify_params *);
+static	int vlan_snd_tag_query(struct m_snd_tag *,
+    union if_snd_tag_query_params *);
+static	void vlan_snd_tag_free(struct m_snd_tag *);
 #endif
 static	void vlan_qflush(struct ifnet *ifp);
 static	int vlan_setflag(struct ifnet *ifp, int flag, int status,
     int (*func)(struct ifnet *, int));
 static	int vlan_setflags(struct ifnet *ifp, int status);
 static	int vlan_setmulti(struct ifnet *ifp);
 static	int vlan_transmit(struct ifnet *ifp, struct mbuf *m);
 static	void vlan_unconfig(struct ifnet *ifp);
 static	void vlan_unconfig_locked(struct ifnet *ifp, int departing);
 static	int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag);
 static	void vlan_link_state(struct ifnet *ifp);
 static	void vlan_capabilities(struct ifvlan *ifv);
 static	void vlan_trunk_capabilities(struct ifnet *ifp);
 
 static	struct ifnet *vlan_clone_match_ethervid(const char *, int *);
 static	int vlan_clone_match(struct if_clone *, const char *);
 static	int vlan_clone_create(struct if_clone *, char *, size_t, caddr_t);
 static	int vlan_clone_destroy(struct if_clone *, struct ifnet *);
 
 static	void vlan_ifdetach(void *arg, struct ifnet *ifp);
 static  void vlan_iflladdr(void *arg, struct ifnet *ifp);
 
 static  void vlan_lladdr_fn(void *arg, int pending);
 
 static struct if_clone *vlan_cloner;
 
 #ifdef VIMAGE
 VNET_DEFINE_STATIC(struct if_clone *, vlan_cloner);
 #define	V_vlan_cloner	VNET(vlan_cloner)
 #endif
 
 static void
 vlan_mc_free(struct epoch_context *ctx)
 {
 	struct vlan_mc_entry *mc = __containerof(ctx, struct vlan_mc_entry, mc_epoch_ctx);
 	free(mc, M_VLAN);
 }
 
 #ifndef VLAN_ARRAY
 #define HASH(n, m)	((((n) >> 8) ^ ((n) >> 4) ^ (n)) & (m))
 
 static void
 vlan_inithash(struct ifvlantrunk *trunk)
 {
 	int i, n;
 	
 	/*
 	 * The trunk must not be locked here since we call malloc(M_WAITOK).
 	 * It is OK in case this function is called before the trunk struct
 	 * gets hooked up and becomes visible from other threads.
 	 */
 
 	KASSERT(trunk->hwidth == 0 && trunk->hash == NULL,
 	    ("%s: hash already initialized", __func__));
 
 	trunk->hwidth = VLAN_DEF_HWIDTH;
 	n = 1 << trunk->hwidth;
 	trunk->hmask = n - 1;
 	trunk->hash = malloc(sizeof(struct ifvlanhead) * n, M_VLAN, M_WAITOK);
 	for (i = 0; i < n; i++)
 		CK_SLIST_INIT(&trunk->hash[i]);
 }
 
 static void
 vlan_freehash(struct ifvlantrunk *trunk)
 {
 #ifdef INVARIANTS
 	int i;
 
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 	for (i = 0; i < (1 << trunk->hwidth); i++)
 		KASSERT(CK_SLIST_EMPTY(&trunk->hash[i]),
 		    ("%s: hash table not empty", __func__));
 #endif
 	free(trunk->hash, M_VLAN);
 	trunk->hash = NULL;
 	trunk->hwidth = trunk->hmask = 0;
 }
 
 static int
 vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 	int i, b;
 	struct ifvlan *ifv2;
 
 	VLAN_XLOCK_ASSERT();
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 
 	b = 1 << trunk->hwidth;
 	i = HASH(ifv->ifv_vid, trunk->hmask);
 	CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list)
 		if (ifv->ifv_vid == ifv2->ifv_vid)
 			return (EEXIST);
 
 	/*
 	 * Grow the hash when the number of vlans exceeds half of the number of
 	 * hash buckets squared. This will make the average linked-list length
 	 * buckets/2.
 	 */
 	if (trunk->refcnt > (b * b) / 2) {
 		vlan_growhash(trunk, 1);
 		i = HASH(ifv->ifv_vid, trunk->hmask);
 	}
 	CK_SLIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list);
 	trunk->refcnt++;
 
 	return (0);
 }
 
 static int
 vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 	int i, b;
 	struct ifvlan *ifv2;
 
 	VLAN_XLOCK_ASSERT();
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 	
 	b = 1 << trunk->hwidth;
 	i = HASH(ifv->ifv_vid, trunk->hmask);
 	CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list)
 		if (ifv2 == ifv) {
 			trunk->refcnt--;
 			CK_SLIST_REMOVE(&trunk->hash[i], ifv2, ifvlan, ifv_list);
 			if (trunk->refcnt < (b * b) / 2)
 				vlan_growhash(trunk, -1);
 			return (0);
 		}
 
 	panic("%s: vlan not found\n", __func__);
 	return (ENOENT); /*NOTREACHED*/
 }
 
 /*
  * Grow the hash larger or smaller if memory permits.
  */
 static void
 vlan_growhash(struct ifvlantrunk *trunk, int howmuch)
 {
 	struct ifvlan *ifv;
 	struct ifvlanhead *hash2;
 	int hwidth2, i, j, n, n2;
 
 	VLAN_XLOCK_ASSERT();
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 
 	if (howmuch == 0) {
 		/* Harmless yet obvious coding error */
 		printf("%s: howmuch is 0\n", __func__);
 		return;
 	}
 
 	hwidth2 = trunk->hwidth + howmuch;
 	n = 1 << trunk->hwidth;
 	n2 = 1 << hwidth2;
 	/* Do not shrink the table below the default */
 	if (hwidth2 < VLAN_DEF_HWIDTH)
 		return;
 
 	hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_WAITOK);
 	if (hash2 == NULL) {
 		printf("%s: out of memory -- hash size not changed\n",
 		    __func__);
 		return;		/* We can live with the old hash table */
 	}
 	for (j = 0; j < n2; j++)
 		CK_SLIST_INIT(&hash2[j]);
 	for (i = 0; i < n; i++)
 		while ((ifv = CK_SLIST_FIRST(&trunk->hash[i])) != NULL) {
 			CK_SLIST_REMOVE(&trunk->hash[i], ifv, ifvlan, ifv_list);
 			j = HASH(ifv->ifv_vid, n2 - 1);
 			CK_SLIST_INSERT_HEAD(&hash2[j], ifv, ifv_list);
 		}
 	NET_EPOCH_WAIT();
 	free(trunk->hash, M_VLAN);
 	trunk->hash = hash2;
 	trunk->hwidth = hwidth2;
 	trunk->hmask = n2 - 1;
 
 	if (bootverbose)
 		if_printf(trunk->parent,
 		    "VLAN hash table resized from %d to %d buckets\n", n, n2);
 }
 
 static __inline struct ifvlan *
 vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid)
 {
 	struct ifvlan *ifv;
 
 	NET_EPOCH_ASSERT();
 
 	CK_SLIST_FOREACH(ifv, &trunk->hash[HASH(vid, trunk->hmask)], ifv_list)
 		if (ifv->ifv_vid == vid)
 			return (ifv);
 	return (NULL);
 }
 
 #if 0
 /* Debugging code to view the hashtables. */
 static void
 vlan_dumphash(struct ifvlantrunk *trunk)
 {
 	int i;
 	struct ifvlan *ifv;
 
 	for (i = 0; i < (1 << trunk->hwidth); i++) {
 		printf("%d: ", i);
 		CK_SLIST_FOREACH(ifv, &trunk->hash[i], ifv_list)
 			printf("%s ", ifv->ifv_ifp->if_xname);
 		printf("\n");
 	}
 }
 #endif /* 0 */
 #else
 
 static __inline struct ifvlan *
 vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid)
 {
 
 	return trunk->vlans[vid];
 }
 
 static __inline int
 vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 
 	if (trunk->vlans[ifv->ifv_vid] != NULL)
 		return EEXIST;
 	trunk->vlans[ifv->ifv_vid] = ifv;
 	trunk->refcnt++;
 
 	return (0);
 }
 
 static __inline int
 vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 
 	trunk->vlans[ifv->ifv_vid] = NULL;
 	trunk->refcnt--;
 
 	return (0);
 }
 
 static __inline void
 vlan_freehash(struct ifvlantrunk *trunk)
 {
 }
 
 static __inline void
 vlan_inithash(struct ifvlantrunk *trunk)
 {
 }
 
 #endif /* !VLAN_ARRAY */
 
 static void
 trunk_destroy(struct ifvlantrunk *trunk)
 {
 	VLAN_XLOCK_ASSERT();
 
 	vlan_freehash(trunk);
 	trunk->parent->if_vlantrunk = NULL;
 	TRUNK_LOCK_DESTROY(trunk);
 	if_rele(trunk->parent);
 	free(trunk, M_VLAN);
 }
 
 /*
  * Program our multicast filter. What we're actually doing is
  * programming the multicast filter of the parent. This has the
  * side effect of causing the parent interface to receive multicast
  * traffic that it doesn't really want, which ends up being discarded
  * later by the upper protocol layers. Unfortunately, there's no way
  * to avoid this: there really is only one physical interface.
  */
 static int
 vlan_setmulti(struct ifnet *ifp)
 {
 	struct ifnet		*ifp_p;
 	struct ifmultiaddr	*ifma;
 	struct ifvlan		*sc;
 	struct vlan_mc_entry	*mc;
 	int			error;
 
 	VLAN_XLOCK_ASSERT();
 
 	/* Find the parent. */
 	sc = ifp->if_softc;
 	ifp_p = PARENT(sc);
 
 	CURVNET_SET_QUIET(ifp_p->if_vnet);
 
 	/* First, remove any existing filter entries. */
 	while ((mc = CK_SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) {
 		CK_SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries);
 		(void)if_delmulti(ifp_p, (struct sockaddr *)&mc->mc_addr);
 		epoch_call(net_epoch_preempt, &mc->mc_epoch_ctx, vlan_mc_free);
 	}
 
 	/* Now program new ones. */
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
 		mc = malloc(sizeof(struct vlan_mc_entry), M_VLAN, M_NOWAIT);
 		if (mc == NULL) {
 			IF_ADDR_WUNLOCK(ifp);
 			return (ENOMEM);
 		}
 		bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len);
 		mc->mc_addr.sdl_index = ifp_p->if_index;
 		CK_SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries);
 	}
 	IF_ADDR_WUNLOCK(ifp);
 	CK_SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) {
 		error = if_addmulti(ifp_p, (struct sockaddr *)&mc->mc_addr,
 		    NULL);
 		if (error)
 			return (error);
 	}
 
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * A handler for parent interface link layer address changes.
  * If the parent interface link layer address is changed we
  * should also change it on all children vlans.
  */
 static void
 vlan_iflladdr(void *arg __unused, struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifvlan *ifv;
 	struct ifnet *ifv_ifp;
 	struct ifvlantrunk *trunk;
 	struct sockaddr_dl *sdl;
 
 	/* Need the epoch since this is run on taskqueue_swi. */
 	NET_EPOCH_ENTER(et);
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL) {
 		NET_EPOCH_EXIT(et);
 		return;
 	}
 
 	/*
 	 * OK, it's a trunk.  Loop over and change all vlan's lladdrs on it.
 	 * We need an exclusive lock here to prevent concurrent SIOCSIFLLADDR
 	 * ioctl calls on the parent garbling the lladdr of the child vlan.
 	 */
 	TRUNK_WLOCK(trunk);
 	VLAN_FOREACH(ifv, trunk) {
 		/*
 		 * Copy new new lladdr into the ifv_ifp, enqueue a task
 		 * to actually call if_setlladdr. if_setlladdr needs to
 		 * be deferred to a taskqueue because it will call into
 		 * the if_vlan ioctl path and try to acquire the global
 		 * lock.
 		 */
 		ifv_ifp = ifv->ifv_ifp;
 		bcopy(IF_LLADDR(ifp), IF_LLADDR(ifv_ifp),
 		    ifp->if_addrlen);
 		sdl = (struct sockaddr_dl *)ifv_ifp->if_addr->ifa_addr;
 		sdl->sdl_alen = ifp->if_addrlen;
 		taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task);
 	}
 	TRUNK_WUNLOCK(trunk);
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * A handler for network interface departure events.
  * Track departure of trunks here so that we don't access invalid
  * pointers or whatever if a trunk is ripped from under us, e.g.,
  * by ejecting its hot-plug card.  However, if an ifnet is simply
  * being renamed, then there's no need to tear down the state.
  */
 static void
 vlan_ifdetach(void *arg __unused, struct ifnet *ifp)
 {
 	struct ifvlan *ifv;
 	struct ifvlantrunk *trunk;
 
 	/* If the ifnet is just being renamed, don't do anything. */
 	if (ifp->if_flags & IFF_RENAMING)
 		return;
 	VLAN_XLOCK();
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL) {
 		VLAN_XUNLOCK();
 		return;
 	}
 
 	/*
 	 * OK, it's a trunk.  Loop over and detach all vlan's on it.
 	 * Check trunk pointer after each vlan_unconfig() as it will
 	 * free it and set to NULL after the last vlan was detached.
 	 */
 	VLAN_FOREACH_UNTIL_SAFE(ifv, ifp->if_vlantrunk,
 	    ifp->if_vlantrunk == NULL)
 		vlan_unconfig_locked(ifv->ifv_ifp, 1);
 
 	/* Trunk should have been destroyed in vlan_unconfig(). */
 	KASSERT(ifp->if_vlantrunk == NULL, ("%s: purge failed", __func__));
 	VLAN_XUNLOCK();
 }
 
 /*
  * Return the trunk device for a virtual interface.
  */
 static struct ifnet  *
 vlan_trunkdev(struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifvlan *ifv;
 
 	if (ifp->if_type != IFT_L2VLAN)
 		return (NULL);
 
 	NET_EPOCH_ENTER(et);
 	ifv = ifp->if_softc;
 	ifp = NULL;
 	if (ifv->ifv_trunk)
 		ifp = PARENT(ifv);
 	NET_EPOCH_EXIT(et);
 	return (ifp);
 }
 
 /*
  * Return the 12-bit VLAN VID for this interface, for use by external
  * components such as Infiniband.
  *
  * XXXRW: Note that the function name here is historical; it should be named
  * vlan_vid().
  */
 static int
 vlan_tag(struct ifnet *ifp, uint16_t *vidp)
 {
 	struct ifvlan *ifv;
 
 	if (ifp->if_type != IFT_L2VLAN)
 		return (EINVAL);
 	ifv = ifp->if_softc;
 	*vidp = ifv->ifv_vid;
 	return (0);
 }
 
 static int
 vlan_pcp(struct ifnet *ifp, uint16_t *pcpp)
 {
 	struct ifvlan *ifv;
 
 	if (ifp->if_type != IFT_L2VLAN)
 		return (EINVAL);
 	ifv = ifp->if_softc;
 	*pcpp = ifv->ifv_pcp;
 	return (0);
 }
 
 /*
  * Return a driver specific cookie for this interface.  Synchronization
  * with setcookie must be provided by the driver. 
  */
 static void *
 vlan_cookie(struct ifnet *ifp)
 {
 	struct ifvlan *ifv;
 
 	if (ifp->if_type != IFT_L2VLAN)
 		return (NULL);
 	ifv = ifp->if_softc;
 	return (ifv->ifv_cookie);
 }
 
 /*
  * Store a cookie in our softc that drivers can use to store driver
  * private per-instance data in.
  */
 static int
 vlan_setcookie(struct ifnet *ifp, void *cookie)
 {
 	struct ifvlan *ifv;
 
 	if (ifp->if_type != IFT_L2VLAN)
 		return (EINVAL);
 	ifv = ifp->if_softc;
 	ifv->ifv_cookie = cookie;
 	return (0);
 }
 
 /*
  * Return the vlan device present at the specific VID.
  */
 static struct ifnet *
 vlan_devat(struct ifnet *ifp, uint16_t vid)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifvlan *ifv;
 
 	NET_EPOCH_ENTER(et);
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL) {
 		NET_EPOCH_EXIT(et);
 		return (NULL);
 	}
 	ifp = NULL;
 	ifv = vlan_gethash(trunk, vid);
 	if (ifv)
 		ifp = ifv->ifv_ifp;
 	NET_EPOCH_EXIT(et);
 	return (ifp);
 }
 
 /*
  * Recalculate the cached VLAN tag exposed via the MIB.
  */
 static void
 vlan_tag_recalculate(struct ifvlan *ifv)
 {
 
        ifv->ifv_tag = EVL_MAKETAG(ifv->ifv_vid, ifv->ifv_pcp, 0);
 }
 
 /*
  * VLAN support can be loaded as a module.  The only place in the
  * system that's intimately aware of this is ether_input.  We hook
  * into this code through vlan_input_p which is defined there and
  * set here.  No one else in the system should be aware of this so
  * we use an explicit reference here.
  */
 extern	void (*vlan_input_p)(struct ifnet *, struct mbuf *);
 
 /* For if_link_state_change() eyes only... */
 extern	void (*vlan_link_state_p)(struct ifnet *);
 
 static int
 vlan_modevent(module_t mod, int type, void *data)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		ifdetach_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
 		    vlan_ifdetach, NULL, EVENTHANDLER_PRI_ANY);
 		if (ifdetach_tag == NULL)
 			return (ENOMEM);
 		iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event,
 		    vlan_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
 		if (iflladdr_tag == NULL)
 			return (ENOMEM);
 		VLAN_LOCKING_INIT();
 		vlan_input_p = vlan_input;
 		vlan_link_state_p = vlan_link_state;
 		vlan_trunk_cap_p = vlan_trunk_capabilities;
 		vlan_trunkdev_p = vlan_trunkdev;
 		vlan_cookie_p = vlan_cookie;
 		vlan_setcookie_p = vlan_setcookie;
 		vlan_tag_p = vlan_tag;
 		vlan_pcp_p = vlan_pcp;
 		vlan_devat_p = vlan_devat;
 #ifndef VIMAGE
 		vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match,
 		    vlan_clone_create, vlan_clone_destroy);
 #endif
 		if (bootverbose)
 			printf("vlan: initialized, using "
 #ifdef VLAN_ARRAY
 			       "full-size arrays"
 #else
 			       "hash tables with chaining"
 #endif
 			
 			       "\n");
 		break;
 	case MOD_UNLOAD:
 #ifndef VIMAGE
 		if_clone_detach(vlan_cloner);
 #endif
 		EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_tag);
 		EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_tag);
 		vlan_input_p = NULL;
 		vlan_link_state_p = NULL;
 		vlan_trunk_cap_p = NULL;
 		vlan_trunkdev_p = NULL;
 		vlan_tag_p = NULL;
 		vlan_cookie_p = NULL;
 		vlan_setcookie_p = NULL;
 		vlan_devat_p = NULL;
 		VLAN_LOCKING_DESTROY();
 		if (bootverbose)
 			printf("vlan: unloaded\n");
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 static moduledata_t vlan_mod = {
 	"if_vlan",
 	vlan_modevent,
 	0
 };
 
 DECLARE_MODULE(if_vlan, vlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_vlan, 3);
 
 #ifdef VIMAGE
 static void
 vnet_vlan_init(const void *unused __unused)
 {
 
 	vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match,
 		    vlan_clone_create, vlan_clone_destroy);
 	V_vlan_cloner = vlan_cloner;
 }
 VNET_SYSINIT(vnet_vlan_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_vlan_init, NULL);
 
 static void
 vnet_vlan_uninit(const void *unused __unused)
 {
 
 	if_clone_detach(V_vlan_cloner);
 }
 VNET_SYSUNINIT(vnet_vlan_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST,
     vnet_vlan_uninit, NULL);
 #endif
 
 /*
  * Check for <etherif>.<vlan> style interface names.
  */
 static struct ifnet *
 vlan_clone_match_ethervid(const char *name, int *vidp)
 {
 	char ifname[IFNAMSIZ];
 	char *cp;
 	struct ifnet *ifp;
 	int vid;
 
 	strlcpy(ifname, name, IFNAMSIZ);
 	if ((cp = strchr(ifname, '.')) == NULL)
 		return (NULL);
 	*cp = '\0';
 	if ((ifp = ifunit_ref(ifname)) == NULL)
 		return (NULL);
 	/* Parse VID. */
 	if (*++cp == '\0') {
 		if_rele(ifp);
 		return (NULL);
 	}
 	vid = 0;
 	for(; *cp >= '0' && *cp <= '9'; cp++)
 		vid = (vid * 10) + (*cp - '0');
 	if (*cp != '\0') {
 		if_rele(ifp);
 		return (NULL);
 	}
 	if (vidp != NULL)
 		*vidp = vid;
 
 	return (ifp);
 }
 
 static int
 vlan_clone_match(struct if_clone *ifc, const char *name)
 {
 	const char *cp;
 
 	if (vlan_clone_match_ethervid(name, NULL) != NULL)
 		return (1);
 
 	if (strncmp(vlanname, name, strlen(vlanname)) != 0)
 		return (0);
 	for (cp = name + 4; *cp != '\0'; cp++) {
 		if (*cp < '0' || *cp > '9')
 			return (0);
 	}
 
 	return (1);
 }
 
 static int
 vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 {
 	char *dp;
 	int wildcard;
 	int unit;
 	int error;
 	int vid;
 	struct ifvlan *ifv;
 	struct ifnet *ifp;
 	struct ifnet *p;
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 	struct vlanreq vlr;
 	static const u_char eaddr[ETHER_ADDR_LEN];	/* 00:00:00:00:00:00 */
 
 	/*
 	 * There are 3 (ugh) ways to specify the cloned device:
 	 * o pass a parameter block with the clone request.
 	 * o specify parameters in the text of the clone device name
 	 * o specify no parameters and get an unattached device that
 	 *   must be configured separately.
 	 * The first technique is preferred; the latter two are
 	 * supported for backwards compatibility.
 	 *
 	 * XXXRW: Note historic use of the word "tag" here.  New ioctls may be
 	 * called for.
 	 */
 	if (params) {
 		error = copyin(params, &vlr, sizeof(vlr));
 		if (error)
 			return error;
 		p = ifunit_ref(vlr.vlr_parent);
 		if (p == NULL)
 			return (ENXIO);
 		error = ifc_name2unit(name, &unit);
 		if (error != 0) {
 			if_rele(p);
 			return (error);
 		}
 		vid = vlr.vlr_tag;
 		wildcard = (unit < 0);
 	} else if ((p = vlan_clone_match_ethervid(name, &vid)) != NULL) {
 		unit = -1;
 		wildcard = 0;
 	} else {
 		p = NULL;
 		error = ifc_name2unit(name, &unit);
 		if (error != 0)
 			return (error);
 
 		wildcard = (unit < 0);
 	}
 
 	error = ifc_alloc_unit(ifc, &unit);
 	if (error != 0) {
 		if (p != NULL)
 			if_rele(p);
 		return (error);
 	}
 
 	/* In the wildcard case, we need to update the name. */
 	if (wildcard) {
 		for (dp = name; *dp != '\0'; dp++);
 		if (snprintf(dp, len - (dp-name), "%d", unit) >
 		    len - (dp-name) - 1) {
 			panic("%s: interface name too long", __func__);
 		}
 	}
 
 	ifv = malloc(sizeof(struct ifvlan), M_VLAN, M_WAITOK | M_ZERO);
 	ifp = ifv->ifv_ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		ifc_free_unit(ifc, unit);
 		free(ifv, M_VLAN);
 		if (p != NULL)
 			if_rele(p);
 		return (ENOSPC);
 	}
 	CK_SLIST_INIT(&ifv->vlan_mc_listhead);
 	ifp->if_softc = ifv;
 	/*
 	 * Set the name manually rather than using if_initname because
 	 * we don't conform to the default naming convention for interfaces.
 	 */
 	strlcpy(ifp->if_xname, name, IFNAMSIZ);
 	ifp->if_dname = vlanname;
 	ifp->if_dunit = unit;
 
 	ifp->if_init = vlan_init;
 	ifp->if_transmit = vlan_transmit;
 	ifp->if_qflush = vlan_qflush;
 	ifp->if_ioctl = vlan_ioctl;
 #ifdef RATELIMIT
 	ifp->if_snd_tag_alloc = vlan_snd_tag_alloc;
+	ifp->if_snd_tag_modify = vlan_snd_tag_modify;
+	ifp->if_snd_tag_query = vlan_snd_tag_query;
 	ifp->if_snd_tag_free = vlan_snd_tag_free;
 #endif
 	ifp->if_flags = VLAN_IFFLAGS;
 	ether_ifattach(ifp, eaddr);
 	/* Now undo some of the damage... */
 	ifp->if_baudrate = 0;
 	ifp->if_type = IFT_L2VLAN;
 	ifp->if_hdrlen = ETHER_VLAN_ENCAP_LEN;
 	ifa = ifp->if_addr;
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	sdl->sdl_type = IFT_L2VLAN;
 
 	if (p != NULL) {
 		error = vlan_config(ifv, p, vid);
 		if_rele(p);
 		if (error != 0) {
 			/*
 			 * Since we've partially failed, we need to back
 			 * out all the way, otherwise userland could get
 			 * confused.  Thus, we destroy the interface.
 			 */
 			ether_ifdetach(ifp);
 			vlan_unconfig(ifp);
 			if_free(ifp);
 			ifc_free_unit(ifc, unit);
 			free(ifv, M_VLAN);
 
 			return (error);
 		}
 	}
 
 	return (0);
 }
 
 static int
 vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
 {
 	struct ifvlan *ifv = ifp->if_softc;
 	int unit = ifp->if_dunit;
 
 	ether_ifdetach(ifp);	/* first, remove it from system-wide lists */
 	vlan_unconfig(ifp);	/* now it can be unconfigured and freed */
 	/*
 	 * We should have the only reference to the ifv now, so we can now
 	 * drain any remaining lladdr task before freeing the ifnet and the
 	 * ifvlan.
 	 */
 	taskqueue_drain(taskqueue_thread, &ifv->lladdr_task);
 	NET_EPOCH_WAIT();
 	if_free(ifp);
 	free(ifv, M_VLAN);
 	ifc_free_unit(ifc, unit);
 
 	return (0);
 }
 
 /*
  * The ifp->if_init entry point for vlan(4) is a no-op.
  */
 static void
 vlan_init(void *foo __unused)
 {
 }
 
 /*
  * The if_transmit method for vlan(4) interface.
  */
 static int
 vlan_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct epoch_tracker et;
 	struct ifvlan *ifv;
 	struct ifnet *p;
 	int error, len, mcast;
 
 	NET_EPOCH_ENTER(et);
 	ifv = ifp->if_softc;
 	if (TRUNK(ifv) == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		NET_EPOCH_EXIT(et);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 	p = PARENT(ifv);
 	len = m->m_pkthdr.len;
 	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
 
 	BPF_MTAP(ifp, m);
 
+#ifdef RATELIMIT
+	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) {
+		struct vlan_snd_tag *vst;
+		struct m_snd_tag *mst;
+
+		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
+		mst = m->m_pkthdr.snd_tag;
+		vst = mst_to_vst(mst);
+		if (vst->tag->ifp != p) {
+			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+			NET_EPOCH_EXIT(et);
+			m_freem(m);
+			return (EAGAIN);
+		}
+
+		m->m_pkthdr.snd_tag = m_snd_tag_ref(vst->tag);
+		m_snd_tag_rele(mst);
+	}
+#endif
+
 	/*
 	 * Do not run parent's if_transmit() if the parent is not up,
 	 * or parent's driver will cause a system crash.
 	 */
 	if (!UP_AND_RUNNING(p)) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		NET_EPOCH_EXIT(et);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	if (!ether_8021q_frame(&m, ifp, p, ifv->ifv_vid, ifv->ifv_pcp)) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		NET_EPOCH_EXIT(et);
 		return (0);
 	}
 
 	/*
 	 * Send it, precisely as ether_output() would have.
 	 */
 	error = (p->if_transmit)(p, m);
 	if (error == 0) {
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
 		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast);
 	} else
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
 /*
  * The ifp->if_qflush entry point for vlan(4) is a no-op.
  */
 static void
 vlan_qflush(struct ifnet *ifp __unused)
 {
 }
 
 static void
 vlan_input(struct ifnet *ifp, struct mbuf *m)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifvlan *ifv;
 	struct m_tag *mtag;
 	uint16_t vid, tag;
 
 	NET_EPOCH_ENTER(et);
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL) {
 		NET_EPOCH_EXIT(et);
 		m_freem(m);
 		return;
 	}
 
 	if (m->m_flags & M_VLANTAG) {
 		/*
 		 * Packet is tagged, but m contains a normal
 		 * Ethernet frame; the tag is stored out-of-band.
 		 */
 		tag = m->m_pkthdr.ether_vtag;
 		m->m_flags &= ~M_VLANTAG;
 	} else {
 		struct ether_vlan_header *evl;
 
 		/*
 		 * Packet is tagged in-band as specified by 802.1q.
 		 */
 		switch (ifp->if_type) {
 		case IFT_ETHER:
 			if (m->m_len < sizeof(*evl) &&
 			    (m = m_pullup(m, sizeof(*evl))) == NULL) {
 				if_printf(ifp, "cannot pullup VLAN header\n");
 				NET_EPOCH_EXIT(et);
 				return;
 			}
 			evl = mtod(m, struct ether_vlan_header *);
 			tag = ntohs(evl->evl_tag);
 
 			/*
 			 * Remove the 802.1q header by copying the Ethernet
 			 * addresses over it and adjusting the beginning of
 			 * the data in the mbuf.  The encapsulated Ethernet
 			 * type field is already in place.
 			 */
 			bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
 			      ETHER_HDR_LEN - ETHER_TYPE_LEN);
 			m_adj(m, ETHER_VLAN_ENCAP_LEN);
 			break;
 
 		default:
 #ifdef INVARIANTS
 			panic("%s: %s has unsupported if_type %u",
 			      __func__, ifp->if_xname, ifp->if_type);
 #endif
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			NET_EPOCH_EXIT(et);
 			m_freem(m);
 			return;
 		}
 	}
 
 	vid = EVL_VLANOFTAG(tag);
 
 	ifv = vlan_gethash(trunk, vid);
 	if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) {
 		NET_EPOCH_EXIT(et);
 		if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 		m_freem(m);
 		return;
 	}
 
 	if (vlan_mtag_pcp) {
 		/*
 		 * While uncommon, it is possible that we will find a 802.1q
 		 * packet encapsulated inside another packet that also had an
 		 * 802.1q header.  For example, ethernet tunneled over IPSEC
 		 * arriving over ethernet.  In that case, we replace the
 		 * existing 802.1q PCP m_tag value.
 		 */
 		mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL);
 		if (mtag == NULL) {
 			mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_IN,
 			    sizeof(uint8_t), M_NOWAIT);
 			if (mtag == NULL) {
 				if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 				NET_EPOCH_EXIT(et);
 				m_freem(m);
 				return;
 			}
 			m_tag_prepend(m, mtag);
 		}
 		*(uint8_t *)(mtag + 1) = EVL_PRIOFTAG(tag);
 	}
 
 	m->m_pkthdr.rcvif = ifv->ifv_ifp;
 	if_inc_counter(ifv->ifv_ifp, IFCOUNTER_IPACKETS, 1);
 	NET_EPOCH_EXIT(et);
 
 	/* Pass it back through the parent's input routine. */
 	(*ifv->ifv_ifp->if_input)(ifv->ifv_ifp, m);
 }
 
 static void
 vlan_lladdr_fn(void *arg, int pending __unused)
 {
 	struct ifvlan *ifv;
 	struct ifnet *ifp;
 
 	ifv = (struct ifvlan *)arg;
 	ifp = ifv->ifv_ifp;
 
 	CURVNET_SET(ifp->if_vnet);
 
 	/* The ifv_ifp already has the lladdr copied in. */
 	if_setlladdr(ifp, IF_LLADDR(ifp), ifp->if_addrlen);
 
 	CURVNET_RESTORE();
 }
 
 static int
 vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifnet *ifp;
 	int error = 0;
 
 	/*
 	 * We can handle non-ethernet hardware types as long as
 	 * they handle the tagging and headers themselves.
 	 */
 	if (p->if_type != IFT_ETHER &&
 	    (p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0)
 		return (EPROTONOSUPPORT);
 	if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS)
 		return (EPROTONOSUPPORT);
 	/*
 	 * Don't let the caller set up a VLAN VID with
 	 * anything except VLID bits.
 	 * VID numbers 0x0 and 0xFFF are reserved.
 	 */
 	if (vid == 0 || vid == 0xFFF || (vid & ~EVL_VLID_MASK))
 		return (EINVAL);
 	if (ifv->ifv_trunk)
 		return (EBUSY);
 
 	VLAN_XLOCK();
 	if (p->if_vlantrunk == NULL) {
 		trunk = malloc(sizeof(struct ifvlantrunk),
 		    M_VLAN, M_WAITOK | M_ZERO);
 		vlan_inithash(trunk);
 		TRUNK_LOCK_INIT(trunk);
 		TRUNK_WLOCK(trunk);
 		p->if_vlantrunk = trunk;
 		trunk->parent = p;
 		if_ref(trunk->parent);
 		TRUNK_WUNLOCK(trunk);
 	} else {
 		trunk = p->if_vlantrunk;
 	}
 
 	ifv->ifv_vid = vid;	/* must set this before vlan_inshash() */
 	ifv->ifv_pcp = 0;       /* Default: best effort delivery. */
 	vlan_tag_recalculate(ifv);
 	error = vlan_inshash(trunk, ifv);
 	if (error)
 		goto done;
 	ifv->ifv_proto = ETHERTYPE_VLAN;
 	ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN;
 	ifv->ifv_mintu = ETHERMIN;
 	ifv->ifv_pflags = 0;
 	ifv->ifv_capenable = -1;
 
 	/*
 	 * If the parent supports the VLAN_MTU capability,
 	 * i.e. can Tx/Rx larger than ETHER_MAX_LEN frames,
 	 * use it.
 	 */
 	if (p->if_capenable & IFCAP_VLAN_MTU) {
 		/*
 		 * No need to fudge the MTU since the parent can
 		 * handle extended frames.
 		 */
 		ifv->ifv_mtufudge = 0;
 	} else {
 		/*
 		 * Fudge the MTU by the encapsulation size.  This
 		 * makes us incompatible with strictly compliant
 		 * 802.1Q implementations, but allows us to use
 		 * the feature with other NetBSD implementations,
 		 * which might still be useful.
 		 */
 		ifv->ifv_mtufudge = ifv->ifv_encaplen;
 	}
 
 	ifv->ifv_trunk = trunk;
 	ifp = ifv->ifv_ifp;
 	/*
 	 * Initialize fields from our parent.  This duplicates some
 	 * work with ether_ifattach() but allows for non-ethernet
 	 * interfaces to also work.
 	 */
 	ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge;
 	ifp->if_baudrate = p->if_baudrate;
 	ifp->if_output = p->if_output;
 	ifp->if_input = p->if_input;
 	ifp->if_resolvemulti = p->if_resolvemulti;
 	ifp->if_addrlen = p->if_addrlen;
 	ifp->if_broadcastaddr = p->if_broadcastaddr;
 	ifp->if_pcp = ifv->ifv_pcp;
 
 	/*
 	 * Copy only a selected subset of flags from the parent.
 	 * Other flags are none of our business.
 	 */
 #define VLAN_COPY_FLAGS (IFF_SIMPLEX)
 	ifp->if_flags &= ~VLAN_COPY_FLAGS;
 	ifp->if_flags |= p->if_flags & VLAN_COPY_FLAGS;
 #undef VLAN_COPY_FLAGS
 
 	ifp->if_link_state = p->if_link_state;
 
 	NET_EPOCH_ENTER(et);
 	vlan_capabilities(ifv);
 	NET_EPOCH_EXIT(et);
 
 	/*
 	 * Set up our interface address to reflect the underlying
 	 * physical interface's.
 	 */
 	bcopy(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen);
 	((struct sockaddr_dl *)ifp->if_addr->ifa_addr)->sdl_alen =
 	    p->if_addrlen;
 
 	TASK_INIT(&ifv->lladdr_task, 0, vlan_lladdr_fn, ifv);
 
 	/* We are ready for operation now. */
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 
 	/* Update flags on the parent, if necessary. */
 	vlan_setflags(ifp, 1);
 
 	/*
 	 * Configure multicast addresses that may already be
 	 * joined on the vlan device.
 	 */
 	(void)vlan_setmulti(ifp);
 
 done:
 	if (error == 0)
 		EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_vid);
 	VLAN_XUNLOCK();
 
 	return (error);
 }
 
 static void
 vlan_unconfig(struct ifnet *ifp)
 {
 
 	VLAN_XLOCK();
 	vlan_unconfig_locked(ifp, 0);
 	VLAN_XUNLOCK();
 }
 
 static void
 vlan_unconfig_locked(struct ifnet *ifp, int departing)
 {
 	struct ifvlantrunk *trunk;
 	struct vlan_mc_entry *mc;
 	struct ifvlan *ifv;
 	struct ifnet  *parent;
 	int error;
 
 	VLAN_XLOCK_ASSERT();
 
 	ifv = ifp->if_softc;
 	trunk = ifv->ifv_trunk;
 	parent = NULL;
 
 	if (trunk != NULL) {
 		parent = trunk->parent;
 
 		/*
 		 * Since the interface is being unconfigured, we need to
 		 * empty the list of multicast groups that we may have joined
 		 * while we were alive from the parent's list.
 		 */
 		while ((mc = CK_SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) {
 			/*
 			 * If the parent interface is being detached,
 			 * all its multicast addresses have already
 			 * been removed.  Warn about errors if
 			 * if_delmulti() does fail, but don't abort as
 			 * all callers expect vlan destruction to
 			 * succeed.
 			 */
 			if (!departing) {
 				error = if_delmulti(parent,
 				    (struct sockaddr *)&mc->mc_addr);
 				if (error)
 					if_printf(ifp,
 		    "Failed to delete multicast address from parent: %d\n",
 					    error);
 			}
 			CK_SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries);
 			epoch_call(net_epoch_preempt, &mc->mc_epoch_ctx, vlan_mc_free);
 		}
 
 		vlan_setflags(ifp, 0); /* clear special flags on parent */
 
 		vlan_remhash(trunk, ifv);
 		ifv->ifv_trunk = NULL;
 
 		/*
 		 * Check if we were the last.
 		 */
 		if (trunk->refcnt == 0) {
 			parent->if_vlantrunk = NULL;
 			NET_EPOCH_WAIT();
 			trunk_destroy(trunk);
 		}
 	}
 
 	/* Disconnect from parent. */
 	if (ifv->ifv_pflags)
 		if_printf(ifp, "%s: ifv_pflags unclean\n", __func__);
 	ifp->if_mtu = ETHERMTU;
 	ifp->if_link_state = LINK_STATE_UNKNOWN;
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 
 	/*
 	 * Only dispatch an event if vlan was
 	 * attached, otherwise there is nothing
 	 * to cleanup anyway.
 	 */
 	if (parent != NULL)
 		EVENTHANDLER_INVOKE(vlan_unconfig, parent, ifv->ifv_vid);
 }
 
 /* Handle a reference counted flag that should be set on the parent as well */
 static int
 vlan_setflag(struct ifnet *ifp, int flag, int status,
 	     int (*func)(struct ifnet *, int))
 {
 	struct ifvlan *ifv;
 	int error;
 
 	VLAN_SXLOCK_ASSERT();
 
 	ifv = ifp->if_softc;
 	status = status ? (ifp->if_flags & flag) : 0;
 	/* Now "status" contains the flag value or 0 */
 
 	/*
 	 * See if recorded parent's status is different from what
 	 * we want it to be.  If it is, flip it.  We record parent's
 	 * status in ifv_pflags so that we won't clear parent's flag
 	 * we haven't set.  In fact, we don't clear or set parent's
 	 * flags directly, but get or release references to them.
 	 * That's why we can be sure that recorded flags still are
 	 * in accord with actual parent's flags.
 	 */
 	if (status != (ifv->ifv_pflags & flag)) {
 		error = (*func)(PARENT(ifv), status);
 		if (error)
 			return (error);
 		ifv->ifv_pflags &= ~flag;
 		ifv->ifv_pflags |= status;
 	}
 	return (0);
 }
 
 /*
  * Handle IFF_* flags that require certain changes on the parent:
  * if "status" is true, update parent's flags respective to our if_flags;
  * if "status" is false, forcedly clear the flags set on parent.
  */
 static int
 vlan_setflags(struct ifnet *ifp, int status)
 {
 	int error, i;
 	
 	for (i = 0; vlan_pflags[i].flag; i++) {
 		error = vlan_setflag(ifp, vlan_pflags[i].flag,
 				     status, vlan_pflags[i].func);
 		if (error)
 			return (error);
 	}
 	return (0);
 }
 
 /* Inform all vlans that their parent has changed link state */
 static void
 vlan_link_state(struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifvlan *ifv;
 
 	/* Called from a taskqueue_swi task, so we cannot sleep. */
 	NET_EPOCH_ENTER(et);
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL) {
 		NET_EPOCH_EXIT(et);
 		return;
 	}
 
 	TRUNK_WLOCK(trunk);
 	VLAN_FOREACH(ifv, trunk) {
 		ifv->ifv_ifp->if_baudrate = trunk->parent->if_baudrate;
 		if_link_state_change(ifv->ifv_ifp,
 		    trunk->parent->if_link_state);
 	}
 	TRUNK_WUNLOCK(trunk);
 	NET_EPOCH_EXIT(et);
 }
 
 static void
 vlan_capabilities(struct ifvlan *ifv)
 {
 	struct ifnet *p;
 	struct ifnet *ifp;
 	struct ifnet_hw_tsomax hw_tsomax;
 	int cap = 0, ena = 0, mena;
 	u_long hwa = 0;
 
 	VLAN_SXLOCK_ASSERT();
 	NET_EPOCH_ASSERT();
 	p = PARENT(ifv);
 	ifp = ifv->ifv_ifp;
 
 	/* Mask parent interface enabled capabilities disabled by user. */
 	mena = p->if_capenable & ifv->ifv_capenable;
 
 	/*
 	 * If the parent interface can do checksum offloading
 	 * on VLANs, then propagate its hardware-assisted
 	 * checksumming flags. Also assert that checksum
 	 * offloading requires hardware VLAN tagging.
 	 */
 	if (p->if_capabilities & IFCAP_VLAN_HWCSUM)
 		cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
 	if (p->if_capenable & IFCAP_VLAN_HWCSUM &&
 	    p->if_capenable & IFCAP_VLAN_HWTAGGING) {
 		ena |= mena & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
 		if (ena & IFCAP_TXCSUM)
 			hwa |= p->if_hwassist & (CSUM_IP | CSUM_TCP |
 			    CSUM_UDP | CSUM_SCTP);
 		if (ena & IFCAP_TXCSUM_IPV6)
 			hwa |= p->if_hwassist & (CSUM_TCP_IPV6 |
 			    CSUM_UDP_IPV6 | CSUM_SCTP_IPV6);
 	}
 
 	/*
 	 * If the parent interface can do TSO on VLANs then
 	 * propagate the hardware-assisted flag. TSO on VLANs
 	 * does not necessarily require hardware VLAN tagging.
 	 */
 	memset(&hw_tsomax, 0, sizeof(hw_tsomax));
 	if_hw_tsomax_common(p, &hw_tsomax);
 	if_hw_tsomax_update(ifp, &hw_tsomax);
 	if (p->if_capabilities & IFCAP_VLAN_HWTSO)
 		cap |= p->if_capabilities & IFCAP_TSO;
 	if (p->if_capenable & IFCAP_VLAN_HWTSO) {
 		ena |= mena & IFCAP_TSO;
 		if (ena & IFCAP_TSO)
 			hwa |= p->if_hwassist & CSUM_TSO;
 	}
 
 	/*
 	 * If the parent interface can do LRO and checksum offloading on
 	 * VLANs, then guess it may do LRO on VLANs.  False positive here
 	 * cost nothing, while false negative may lead to some confusions.
 	 */
 	if (p->if_capabilities & IFCAP_VLAN_HWCSUM)
 		cap |= p->if_capabilities & IFCAP_LRO;
 	if (p->if_capenable & IFCAP_VLAN_HWCSUM)
 		ena |= p->if_capenable & IFCAP_LRO;
 
 	/*
 	 * If the parent interface can offload TCP connections over VLANs then
 	 * propagate its TOE capability to the VLAN interface.
 	 *
 	 * All TOE drivers in the tree today can deal with VLANs.  If this
 	 * changes then IFCAP_VLAN_TOE should be promoted to a full capability
 	 * with its own bit.
 	 */
 #define	IFCAP_VLAN_TOE IFCAP_TOE
 	if (p->if_capabilities & IFCAP_VLAN_TOE)
 		cap |= p->if_capabilities & IFCAP_TOE;
 	if (p->if_capenable & IFCAP_VLAN_TOE) {
 		TOEDEV(ifp) = TOEDEV(p);
 		ena |= mena & IFCAP_TOE;
 	}
 
 	/*
 	 * If the parent interface supports dynamic link state, so does the
 	 * VLAN interface.
 	 */
 	cap |= (p->if_capabilities & IFCAP_LINKSTATE);
 	ena |= (mena & IFCAP_LINKSTATE);
 
 #ifdef RATELIMIT
 	/*
 	 * If the parent interface supports ratelimiting, so does the
 	 * VLAN interface.
 	 */
 	cap |= (p->if_capabilities & IFCAP_TXRTLMT);
 	ena |= (mena & IFCAP_TXRTLMT);
 #endif
 
 	ifp->if_capabilities = cap;
 	ifp->if_capenable = ena;
 	ifp->if_hwassist = hwa;
 }
 
 static void
 vlan_trunk_capabilities(struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifvlan *ifv;
 
 	VLAN_SLOCK();
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL) {
 		VLAN_SUNLOCK();
 		return;
 	}
 	NET_EPOCH_ENTER(et);
 	VLAN_FOREACH(ifv, trunk) {
 		vlan_capabilities(ifv);
 	}
 	NET_EPOCH_EXIT(et);
 	VLAN_SUNLOCK();
 }
 
 static int
 vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifnet *p;
 	struct ifreq *ifr;
 	struct ifaddr *ifa;
 	struct ifvlan *ifv;
 	struct ifvlantrunk *trunk;
 	struct vlanreq vlr;
 	int error = 0;
 
 	ifr = (struct ifreq *)data;
 	ifa = (struct ifaddr *) data;
 	ifv = ifp->if_softc;
 
 	switch (cmd) {
 	case SIOCSIFADDR:
 		ifp->if_flags |= IFF_UP;
 #ifdef INET
 		if (ifa->ifa_addr->sa_family == AF_INET)
 			arp_ifinit(ifp, ifa);
 #endif
 		break;
 	case SIOCGIFADDR:
 		bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0],
 		    ifp->if_addrlen);
 		break;
 	case SIOCGIFMEDIA:
 		VLAN_SLOCK();
 		if (TRUNK(ifv) != NULL) {
 			p = PARENT(ifv);
 			if_ref(p);
 			error = (*p->if_ioctl)(p, SIOCGIFMEDIA, data);
 			if_rele(p);
 			/* Limit the result to the parent's current config. */
 			if (error == 0) {
 				struct ifmediareq *ifmr;
 
 				ifmr = (struct ifmediareq *)data;
 				if (ifmr->ifm_count >= 1 && ifmr->ifm_ulist) {
 					ifmr->ifm_count = 1;
 					error = copyout(&ifmr->ifm_current,
 						ifmr->ifm_ulist,
 						sizeof(int));
 				}
 			}
 		} else {
 			error = EINVAL;
 		}
 		VLAN_SUNLOCK();
 		break;
 
 	case SIOCSIFMEDIA:
 		error = EINVAL;
 		break;
 
 	case SIOCSIFMTU:
 		/*
 		 * Set the interface MTU.
 		 */
 		VLAN_SLOCK();
 		trunk = TRUNK(ifv);
 		if (trunk != NULL) {
 			TRUNK_WLOCK(trunk);
 			if (ifr->ifr_mtu >
 			     (PARENT(ifv)->if_mtu - ifv->ifv_mtufudge) ||
 			    ifr->ifr_mtu <
 			     (ifv->ifv_mintu - ifv->ifv_mtufudge))
 				error = EINVAL;
 			else
 				ifp->if_mtu = ifr->ifr_mtu;
 			TRUNK_WUNLOCK(trunk);
 		} else
 			error = EINVAL;
 		VLAN_SUNLOCK();
 		break;
 
 	case SIOCSETVLAN:
 #ifdef VIMAGE
 		/*
 		 * XXXRW/XXXBZ: The goal in these checks is to allow a VLAN
 		 * interface to be delegated to a jail without allowing the
 		 * jail to change what underlying interface/VID it is
 		 * associated with.  We are not entirely convinced that this
 		 * is the right way to accomplish that policy goal.
 		 */
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		error = copyin(ifr_data_get_ptr(ifr), &vlr, sizeof(vlr));
 		if (error)
 			break;
 		if (vlr.vlr_parent[0] == '\0') {
 			vlan_unconfig(ifp);
 			break;
 		}
 		p = ifunit_ref(vlr.vlr_parent);
 		if (p == NULL) {
 			error = ENOENT;
 			break;
 		}
 		error = vlan_config(ifv, p, vlr.vlr_tag);
 		if_rele(p);
 		break;
 
 	case SIOCGETVLAN:
 #ifdef VIMAGE
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		bzero(&vlr, sizeof(vlr));
 		VLAN_SLOCK();
 		if (TRUNK(ifv) != NULL) {
 			strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname,
 			    sizeof(vlr.vlr_parent));
 			vlr.vlr_tag = ifv->ifv_vid;
 		}
 		VLAN_SUNLOCK();
 		error = copyout(&vlr, ifr_data_get_ptr(ifr), sizeof(vlr));
 		break;
 		
 	case SIOCSIFFLAGS:
 		/*
 		 * We should propagate selected flags to the parent,
 		 * e.g., promiscuous mode.
 		 */
 		VLAN_XLOCK();
 		if (TRUNK(ifv) != NULL)
 			error = vlan_setflags(ifp, 1);
 		VLAN_XUNLOCK();
 		break;
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/*
 		 * If we don't have a parent, just remember the membership for
 		 * when we do.
 		 *
 		 * XXX We need the rmlock here to avoid sleeping while
 		 * holding in6_multi_mtx.
 		 */
 		VLAN_XLOCK();
 		trunk = TRUNK(ifv);
 		if (trunk != NULL)
 			error = vlan_setmulti(ifp);
 		VLAN_XUNLOCK();
 
 		break;
 	case SIOCGVLANPCP:
 #ifdef VIMAGE
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		ifr->ifr_vlan_pcp = ifv->ifv_pcp;
 		break;
 
 	case SIOCSVLANPCP:
 #ifdef VIMAGE
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		error = priv_check(curthread, PRIV_NET_SETVLANPCP);
 		if (error)
 			break;
 		if (ifr->ifr_vlan_pcp > 7) {
 			error = EINVAL;
 			break;
 		}
 		ifv->ifv_pcp = ifr->ifr_vlan_pcp;
 		ifp->if_pcp = ifv->ifv_pcp;
 		vlan_tag_recalculate(ifv);
 		/* broadcast event about PCP change */
 		EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP);
 		break;
 
 	case SIOCSIFCAP:
 		VLAN_SLOCK();
 		ifv->ifv_capenable = ifr->ifr_reqcap;
 		trunk = TRUNK(ifv);
 		if (trunk != NULL) {
 			struct epoch_tracker et;
 
 			NET_EPOCH_ENTER(et);
 			vlan_capabilities(ifv);
 			NET_EPOCH_EXIT(et);
 		}
 		VLAN_SUNLOCK();
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 #ifdef RATELIMIT
 static int
 vlan_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
+	struct epoch_tracker et;
+	struct vlan_snd_tag *vst;
+	struct ifvlan *ifv;
+	struct ifnet *parent;
+	int error;
 
-	/* get trunk device */
-	ifp = vlan_trunkdev(ifp);
-	if (ifp == NULL || (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
+	NET_EPOCH_ENTER(et);
+	ifv = ifp->if_softc;
+	if (ifv->ifv_trunk != NULL)
+		parent = PARENT(ifv);
+	else
+		parent = NULL;
+	if (parent == NULL || parent->if_snd_tag_alloc == NULL) {
+		NET_EPOCH_EXIT(et);
 		return (EOPNOTSUPP);
-	/* forward allocation request */
-	return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
+	}
+	if_ref(parent);
+	NET_EPOCH_EXIT(et);
+
+	vst = malloc(sizeof(*vst), M_VLAN, M_NOWAIT);
+	if (vst == NULL) {
+		if_rele(parent);
+		return (ENOMEM);
+	}
+
+	error = parent->if_snd_tag_alloc(parent, params, &vst->tag);
+	if_rele(parent);
+	if (error) {
+		free(vst, M_VLAN);
+		return (error);
+	}
+
+	m_snd_tag_init(&vst->com, ifp);
+
+	*ppmt = &vst->com;
+	return (0);
 }
 
+static int
+vlan_snd_tag_modify(struct m_snd_tag *mst,
+    union if_snd_tag_modify_params *params)
+{
+	struct vlan_snd_tag *vst;
+
+	vst = mst_to_vst(mst);
+	return (vst->tag->ifp->if_snd_tag_modify(vst->tag, params));
+}
+
+static int
+vlan_snd_tag_query(struct m_snd_tag *mst,
+    union if_snd_tag_query_params *params)
+{
+	struct vlan_snd_tag *vst;
+
+	vst = mst_to_vst(mst);
+	return (vst->tag->ifp->if_snd_tag_query(vst->tag, params));
+}
+
 static void
-vlan_snd_tag_free(struct m_snd_tag *tag)
+vlan_snd_tag_free(struct m_snd_tag *mst)
 {
-	tag->ifp->if_snd_tag_free(tag);
+	struct vlan_snd_tag *vst;
+
+	vst = mst_to_vst(mst);
+	m_snd_tag_rele(vst->tag);
+	free(vst, M_VLAN);
 }
 #endif
Index: head/sys/net/netisr.c
===================================================================
--- head/sys/net/netisr.c	(revision 348253)
+++ head/sys/net/netisr.c	(revision 348254)
@@ -1,1533 +1,1534 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2007-2009 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * This software was developed by Robert N. M. Watson under contract
  * to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * netisr is a packet dispatch service, allowing synchronous (directly
  * dispatched) and asynchronous (deferred dispatch) processing of packets by
  * registered protocol handlers.  Callers pass a protocol identifier and
  * packet to netisr, along with a direct dispatch hint, and work will either
  * be immediately processed by the registered handler, or passed to a
  * software interrupt (SWI) thread for deferred dispatch.  Callers will
  * generally select one or the other based on:
  *
  * - Whether directly dispatching a netisr handler lead to code reentrance or
  *   lock recursion, such as entering the socket code from the socket code.
  * - Whether directly dispatching a netisr handler lead to recursive
  *   processing, such as when decapsulating several wrapped layers of tunnel
  *   information (IPSEC within IPSEC within ...).
  *
  * Maintaining ordering for protocol streams is a critical design concern.
  * Enforcing ordering limits the opportunity for concurrency, but maintains
  * the strong ordering requirements found in some protocols, such as TCP.  Of
  * related concern is CPU affinity--it is desirable to process all data
  * associated with a particular stream on the same CPU over time in order to
  * avoid acquiring locks associated with the connection on different CPUs,
  * keep connection data in one cache, and to generally encourage associated
  * user threads to live on the same CPU as the stream.  It's also desirable
  * to avoid lock migration and contention where locks are associated with
  * more than one flow.
  *
  * netisr supports several policy variations, represented by the
  * NETISR_POLICY_* constants, allowing protocols to play various roles in
  * identifying flows, assigning work to CPUs, etc.  These are described in
  * netisr.h.
  */
 
 #include "opt_ddb.h"
 #include "opt_device_polling.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/malloc.h>
 #include <sys/interrupt.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/rmlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #define	_WANT_NETISR_INTERNAL	/* Enable definitions from netisr_internal.h */
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
 #include <net/netisr_internal.h>
 #include <net/vnet.h>
 
 /*-
  * Synchronize use and modification of the registered netisr data structures;
  * acquire a read lock while modifying the set of registered protocols to
  * prevent partially registered or unregistered protocols from being run.
  *
  * The following data structures and fields are protected by this lock:
  *
  * - The netisr_proto array, including all fields of struct netisr_proto.
  * - The nws array, including all fields of struct netisr_worker.
  * - The nws_array array.
  *
  * Note: the NETISR_LOCKING define controls whether read locks are acquired
  * in packet processing paths requiring netisr registration stability.  This
  * is disabled by default as it can lead to measurable performance
  * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and
  * because netisr registration and unregistration is extremely rare at
  * runtime.  If it becomes more common, this decision should be revisited.
  *
  * XXXRW: rmlocks don't support assertions.
  */
 static struct rmlock	netisr_rmlock;
 #define	NETISR_LOCK_INIT()	rm_init_flags(&netisr_rmlock, "netisr", \
 				    RM_NOWITNESS)
 #define	NETISR_LOCK_ASSERT()
 #define	NETISR_RLOCK(tracker)	rm_rlock(&netisr_rmlock, (tracker))
 #define	NETISR_RUNLOCK(tracker)	rm_runlock(&netisr_rmlock, (tracker))
 #define	NETISR_WLOCK()		rm_wlock(&netisr_rmlock)
 #define	NETISR_WUNLOCK()	rm_wunlock(&netisr_rmlock)
 /* #define	NETISR_LOCKING */
 
 static SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr");
 
 /*-
  * Three global direct dispatch policies are supported:
  *
  * NETISR_DISPATCH_DEFERRED: All work is deferred for a netisr, regardless of
  * context (may be overriden by protocols).
  *
  * NETISR_DISPATCH_HYBRID: If the executing context allows direct dispatch,
  * and we're running on the CPU the work would be performed on, then direct
  * dispatch it if it wouldn't violate ordering constraints on the workstream.
  *
  * NETISR_DISPATCH_DIRECT: If the executing context allows direct dispatch,
  * always direct dispatch.  (The default.)
  *
  * Notice that changing the global policy could lead to short periods of
  * misordered processing, but this is considered acceptable as compared to
  * the complexity of enforcing ordering during policy changes.  Protocols can
  * override the global policy (when they're not doing that, they select
  * NETISR_DISPATCH_DEFAULT).
  */
 #define	NETISR_DISPATCH_POLICY_DEFAULT	NETISR_DISPATCH_DIRECT
 #define	NETISR_DISPATCH_POLICY_MAXSTR	20 /* Used for temporary buffers. */
 static u_int	netisr_dispatch_policy = NETISR_DISPATCH_POLICY_DEFAULT;
 static int	sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_net_isr, OID_AUTO, dispatch, CTLTYPE_STRING | CTLFLAG_RWTUN,
     0, 0, sysctl_netisr_dispatch_policy, "A",
     "netisr dispatch policy");
 
 /*
  * Allow the administrator to limit the number of threads (CPUs) to use for
  * netisr.  We don't check netisr_maxthreads before creating the thread for
  * CPU 0. This must be set at boot. We will create at most one thread per CPU.
  * By default we initialize this to 1 which would assign just 1 cpu (cpu0) and
  * therefore only 1 workstream. If set to -1, netisr would use all cpus
  * (mp_ncpus) and therefore would have those many workstreams. One workstream
  * per thread (CPU).
  */
 static int	netisr_maxthreads = 1;		/* Max number of threads. */
 SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RDTUN,
     &netisr_maxthreads, 0,
     "Use at most this many CPUs for netisr processing");
 
 static int	netisr_bindthreads = 0;		/* Bind threads to CPUs. */
 SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN,
     &netisr_bindthreads, 0, "Bind netisr threads to CPUs.");
 
 /*
  * Limit per-workstream mbuf queue limits s to at most net.isr.maxqlimit,
  * both for initial configuration and later modification using
  * netisr_setqlimit().
  */
 #define	NETISR_DEFAULT_MAXQLIMIT	10240
 static u_int	netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT;
 SYSCTL_UINT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RDTUN,
     &netisr_maxqlimit, 0,
     "Maximum netisr per-protocol, per-CPU queue depth.");
 
 /*
  * The default per-workstream mbuf queue limit for protocols that don't
  * initialize the nh_qlimit field of their struct netisr_handler.  If this is
  * set above netisr_maxqlimit, we truncate it to the maximum during boot.
  */
 #define	NETISR_DEFAULT_DEFAULTQLIMIT	256
 static u_int	netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT;
 SYSCTL_UINT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RDTUN,
     &netisr_defaultqlimit, 0,
     "Default netisr per-protocol, per-CPU queue limit if not set by protocol");
 
 /*
  * Store and export the compile-time constant NETISR_MAXPROT limit on the
  * number of protocols that can register with netisr at a time.  This is
  * required for crashdump analysis, as it sizes netisr_proto[].
  */
 static u_int	netisr_maxprot = NETISR_MAXPROT;
 SYSCTL_UINT(_net_isr, OID_AUTO, maxprot, CTLFLAG_RD,
     &netisr_maxprot, 0,
     "Compile-time limit on the number of protocols supported by netisr.");
 
 /*
  * The netisr_proto array describes all registered protocols, indexed by
  * protocol number.  See netisr_internal.h for more details.
  */
 static struct netisr_proto	netisr_proto[NETISR_MAXPROT];
 
 #ifdef VIMAGE
 /*
  * The netisr_enable array describes a per-VNET flag for registered
  * protocols on whether this netisr is active in this VNET or not.
  * netisr_register() will automatically enable the netisr for the
  * default VNET and all currently active instances.
  * netisr_unregister() will disable all active VNETs, including vnet0.
  * Individual network stack instances can be enabled/disabled by the
  * netisr_(un)register _vnet() functions.
  * With this we keep the one netisr_proto per protocol but add a
  * mechanism to stop netisr processing for vnet teardown.
  * Apart from that we expect a VNET to always be enabled.
  */
 VNET_DEFINE_STATIC(u_int,	netisr_enable[NETISR_MAXPROT]);
 #define	V_netisr_enable		VNET(netisr_enable)
 #endif
 
 /*
  * Per-CPU workstream data.  See netisr_internal.h for more details.
  */
 DPCPU_DEFINE(struct netisr_workstream, nws);
 
 /*
  * Map contiguous values between 0 and nws_count into CPU IDs appropriate for
  * accessing workstreams.  This allows constructions of the form
  * DPCPU_ID_GET(nws_array[arbitraryvalue % nws_count], nws).
  */
 static u_int				 nws_array[MAXCPU];
 
 /*
  * Number of registered workstreams.  Will be at most the number of running
  * CPUs once fully started.
  */
 static u_int				 nws_count;
 SYSCTL_UINT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD,
     &nws_count, 0, "Number of extant netisr threads.");
 
 /*
  * Synchronization for each workstream: a mutex protects all mutable fields
  * in each stream, including per-protocol state (mbuf queues).  The SWI is
  * woken up if asynchronous dispatch is required.
  */
 #define	NWS_LOCK(s)		mtx_lock(&(s)->nws_mtx)
 #define	NWS_LOCK_ASSERT(s)	mtx_assert(&(s)->nws_mtx, MA_OWNED)
 #define	NWS_UNLOCK(s)		mtx_unlock(&(s)->nws_mtx)
 #define	NWS_SIGNAL(s)		swi_sched((s)->nws_swi_cookie, 0)
 
 /*
  * Utility routines for protocols that implement their own mapping of flows
  * to CPUs.
  */
 u_int
 netisr_get_cpucount(void)
 {
 
 	return (nws_count);
 }
 
 u_int
 netisr_get_cpuid(u_int cpunumber)
 {
 
 	return (nws_array[cpunumber % nws_count]);
 }
 
 /*
  * The default implementation of flow -> CPU ID mapping.
  *
  * Non-static so that protocols can use it to map their own work to specific
  * CPUs in a manner consistent to netisr for affinity purposes.
  */
 u_int
 netisr_default_flow2cpu(u_int flowid)
 {
 
 	return (nws_array[flowid % nws_count]);
 }
 
 /*
  * Dispatch tunable and sysctl configuration.
  */
 struct netisr_dispatch_table_entry {
 	u_int		 ndte_policy;
 	const char	*ndte_policy_str;
 };
 static const struct netisr_dispatch_table_entry netisr_dispatch_table[] = {
 	{ NETISR_DISPATCH_DEFAULT, "default" },
 	{ NETISR_DISPATCH_DEFERRED, "deferred" },
 	{ NETISR_DISPATCH_HYBRID, "hybrid" },
 	{ NETISR_DISPATCH_DIRECT, "direct" },
 };
 
 static void
 netisr_dispatch_policy_to_str(u_int dispatch_policy, char *buffer,
     u_int buflen)
 {
 	const struct netisr_dispatch_table_entry *ndtep;
 	const char *str;
 	u_int i;
 
 	str = "unknown";
 	for (i = 0; i < nitems(netisr_dispatch_table); i++) {
 		ndtep = &netisr_dispatch_table[i];
 		if (ndtep->ndte_policy == dispatch_policy) {
 			str = ndtep->ndte_policy_str;
 			break;
 		}
 	}
 	snprintf(buffer, buflen, "%s", str);
 }
 
 static int
 netisr_dispatch_policy_from_str(const char *str, u_int *dispatch_policyp)
 {
 	const struct netisr_dispatch_table_entry *ndtep;
 	u_int i;
 
 	for (i = 0; i < nitems(netisr_dispatch_table); i++) {
 		ndtep = &netisr_dispatch_table[i];
 		if (strcmp(ndtep->ndte_policy_str, str) == 0) {
 			*dispatch_policyp = ndtep->ndte_policy;
 			return (0);
 		}
 	}
 	return (EINVAL);
 }
 
 static int
 sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS)
 {
 	char tmp[NETISR_DISPATCH_POLICY_MAXSTR];
 	u_int dispatch_policy;
 	int error;
 
 	netisr_dispatch_policy_to_str(netisr_dispatch_policy, tmp,
 	    sizeof(tmp));
 	error = sysctl_handle_string(oidp, tmp, sizeof(tmp), req);
 	if (error == 0 && req->newptr != NULL) {
 		error = netisr_dispatch_policy_from_str(tmp,
 		    &dispatch_policy);
 		if (error == 0 && dispatch_policy == NETISR_DISPATCH_DEFAULT)
 			error = EINVAL;
 		if (error == 0)
 			netisr_dispatch_policy = dispatch_policy;
 	}
 	return (error);
 }
 
 /*
  * Register a new netisr handler, which requires initializing per-protocol
  * fields for each workstream.  All netisr work is briefly suspended while
  * the protocol is installed.
  */
 void
 netisr_register(const struct netisr_handler *nhp)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct netisr_work *npwp;
 	const char *name;
 	u_int i, proto;
 
 	proto = nhp->nh_proto;
 	name = nhp->nh_name;
 
 	/*
 	 * Test that the requested registration is valid.
 	 */
 	KASSERT(nhp->nh_name != NULL,
 	    ("%s: nh_name NULL for %u", __func__, proto));
 	KASSERT(nhp->nh_handler != NULL,
 	    ("%s: nh_handler NULL for %s", __func__, name));
 	KASSERT(nhp->nh_policy == NETISR_POLICY_SOURCE ||
 	    nhp->nh_policy == NETISR_POLICY_FLOW ||
 	    nhp->nh_policy == NETISR_POLICY_CPU,
 	    ("%s: unsupported nh_policy %u for %s", __func__,
 	    nhp->nh_policy, name));
 	KASSERT(nhp->nh_policy == NETISR_POLICY_FLOW ||
 	    nhp->nh_m2flow == NULL,
 	    ("%s: nh_policy != FLOW but m2flow defined for %s", __func__,
 	    name));
 	KASSERT(nhp->nh_policy == NETISR_POLICY_CPU || nhp->nh_m2cpuid == NULL,
 	    ("%s: nh_policy != CPU but m2cpuid defined for %s", __func__,
 	    name));
 	KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL,
 	    ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__,
 	    name));
 	KASSERT(nhp->nh_dispatch == NETISR_DISPATCH_DEFAULT ||
 	    nhp->nh_dispatch == NETISR_DISPATCH_DEFERRED ||
 	    nhp->nh_dispatch == NETISR_DISPATCH_HYBRID ||
 	    nhp->nh_dispatch == NETISR_DISPATCH_DIRECT,
 	    ("%s: invalid nh_dispatch (%u)", __func__, nhp->nh_dispatch));
 
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u, %s): protocol too big", __func__, proto, name));
 
 	/*
 	 * Test that no existing registration exists for this protocol.
 	 */
 	NETISR_WLOCK();
 	KASSERT(netisr_proto[proto].np_name == NULL,
 	    ("%s(%u, %s): name present", __func__, proto, name));
 	KASSERT(netisr_proto[proto].np_handler == NULL,
 	    ("%s(%u, %s): handler present", __func__, proto, name));
 
 	netisr_proto[proto].np_name = name;
 	netisr_proto[proto].np_handler = nhp->nh_handler;
 	netisr_proto[proto].np_m2flow = nhp->nh_m2flow;
 	netisr_proto[proto].np_m2cpuid = nhp->nh_m2cpuid;
 	netisr_proto[proto].np_drainedcpu = nhp->nh_drainedcpu;
 	if (nhp->nh_qlimit == 0)
 		netisr_proto[proto].np_qlimit = netisr_defaultqlimit;
 	else if (nhp->nh_qlimit > netisr_maxqlimit) {
 		printf("%s: %s requested queue limit %u capped to "
 		    "net.isr.maxqlimit %u\n", __func__, name, nhp->nh_qlimit,
 		    netisr_maxqlimit);
 		netisr_proto[proto].np_qlimit = netisr_maxqlimit;
 	} else
 		netisr_proto[proto].np_qlimit = nhp->nh_qlimit;
 	netisr_proto[proto].np_policy = nhp->nh_policy;
 	netisr_proto[proto].np_dispatch = nhp->nh_dispatch;
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		bzero(npwp, sizeof(*npwp));
 		npwp->nw_qlimit = netisr_proto[proto].np_qlimit;
 	}
 
 #ifdef VIMAGE
 	/*
 	 * Test that we are in vnet0 and have a curvnet set.
 	 */
 	KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__));
 	KASSERT(IS_DEFAULT_VNET(curvnet), ("%s: curvnet %p is not vnet0 %p",
 	    __func__, curvnet, vnet0));
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		V_netisr_enable[proto] = 1;
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 #endif
 	NETISR_WUNLOCK();
 }
 
 /*
  * Clear drop counters across all workstreams for a protocol.
  */
 void
 netisr_clearqdrops(const struct netisr_handler *nhp)
 {
 	struct netisr_work *npwp;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int i, proto;
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_WLOCK();
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		npwp->nw_qdrops = 0;
 	}
 	NETISR_WUNLOCK();
 }
 
 /*
  * Query current drop counters across all workstreams for a protocol.
  */
 void
 netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp)
 {
 	struct netisr_work *npwp;
 	struct rm_priotracker tracker;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int i, proto;
 
 	*qdropp = 0;
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_RLOCK(&tracker);
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		*qdropp += npwp->nw_qdrops;
 	}
 	NETISR_RUNLOCK(&tracker);
 }
 
 /*
  * Query current per-workstream queue limit for a protocol.
  */
 void
 netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp)
 {
 	struct rm_priotracker tracker;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int proto;
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_RLOCK(&tracker);
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 	*qlimitp = netisr_proto[proto].np_qlimit;
 	NETISR_RUNLOCK(&tracker);
 }
 
 /*
  * Update the queue limit across per-workstream queues for a protocol.  We
  * simply change the limits, and don't drain overflowed packets as they will
  * (hopefully) take care of themselves shortly.
  */
 int
 netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit)
 {
 	struct netisr_work *npwp;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int i, proto;
 
 	if (qlimit > netisr_maxqlimit)
 		return (EINVAL);
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_WLOCK();
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
 	netisr_proto[proto].np_qlimit = qlimit;
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		npwp->nw_qlimit = qlimit;
 	}
 	NETISR_WUNLOCK();
 	return (0);
 }
 
 /*
  * Drain all packets currently held in a particular protocol work queue.
  */
 static void
 netisr_drain_proto(struct netisr_work *npwp)
 {
 	struct mbuf *m;
 
 	/*
 	 * We would assert the lock on the workstream but it's not passed in.
 	 */
 	while ((m = npwp->nw_head) != NULL) {
 		npwp->nw_head = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		if (npwp->nw_head == NULL)
 			npwp->nw_tail = NULL;
 		npwp->nw_len--;
 		m_freem(m);
 	}
 	KASSERT(npwp->nw_tail == NULL, ("%s: tail", __func__));
 	KASSERT(npwp->nw_len == 0, ("%s: len", __func__));
 }
 
 /*
  * Remove the registration of a network protocol, which requires clearing
  * per-protocol fields across all workstreams, including freeing all mbufs in
  * the queues at time of unregister.  All work in netisr is briefly suspended
  * while this takes place.
  */
 void
 netisr_unregister(const struct netisr_handler *nhp)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct netisr_work *npwp;
 #ifdef INVARIANTS
 	const char *name;
 #endif
 	u_int i, proto;
 
 	proto = nhp->nh_proto;
 #ifdef INVARIANTS
 	name = nhp->nh_name;
 #endif
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
 
 	NETISR_WLOCK();
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    name));
 
 #ifdef VIMAGE
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		V_netisr_enable[proto] = 0;
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 #endif
 
 	netisr_proto[proto].np_name = NULL;
 	netisr_proto[proto].np_handler = NULL;
 	netisr_proto[proto].np_m2flow = NULL;
 	netisr_proto[proto].np_m2cpuid = NULL;
 	netisr_proto[proto].np_qlimit = 0;
 	netisr_proto[proto].np_policy = 0;
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		netisr_drain_proto(npwp);
 		bzero(npwp, sizeof(*npwp));
 	}
 	NETISR_WUNLOCK();
 }
 
 #ifdef VIMAGE
 void
 netisr_register_vnet(const struct netisr_handler *nhp)
 {
 	u_int proto;
 
 	proto = nhp->nh_proto;
 
 	KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__));
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, nhp->nh_name));
 	NETISR_WLOCK();
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    nhp->nh_name));
 	
 	V_netisr_enable[proto] = 1;
 	NETISR_WUNLOCK();
 }
 
 static void
 netisr_drain_proto_vnet(struct vnet *vnet, u_int proto)
 {
 	struct netisr_workstream *nwsp;
 	struct netisr_work *npwp;
 	struct mbuf *m, *mp, *n, *ne;
 	u_int i;
 
 	KASSERT(vnet != NULL, ("%s: vnet is NULL", __func__));
 	NETISR_LOCK_ASSERT();
 
 	CPU_FOREACH(i) {
 		nwsp = DPCPU_ID_PTR(i, nws);
 		if (nwsp->nws_intr_event == NULL)
 			continue;
 		npwp = &nwsp->nws_work[proto];
 		NWS_LOCK(nwsp);
 
 		/*
 		 * Rather than dissecting and removing mbufs from the middle
 		 * of the chain, we build a new chain if the packet stays and
 		 * update the head and tail pointers at the end.  All packets
 		 * matching the given vnet are freed.
 		 */
 		m = npwp->nw_head;
 		n = ne = NULL;
 		while (m != NULL) {
 			mp = m;
 			m = m->m_nextpkt;
 			mp->m_nextpkt = NULL;
 			if (mp->m_pkthdr.rcvif->if_vnet != vnet) {
 				if (n == NULL) {
 					n = ne = mp;
 				} else {
 					ne->m_nextpkt = mp;
 					ne = mp;
 				}
 				continue;
 			}
 			/* This is a packet in the selected vnet. Free it. */
 			npwp->nw_len--;
 			m_freem(mp);
 		}
 		npwp->nw_head = n;
 		npwp->nw_tail = ne;
 		NWS_UNLOCK(nwsp);
 	}
 }
 
 void
 netisr_unregister_vnet(const struct netisr_handler *nhp)
 {
 	u_int proto;
 
 	proto = nhp->nh_proto;
 
 	KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__));
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u): protocol too big for %s", __func__, proto, nhp->nh_name));
 	NETISR_WLOCK();
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s(%u): protocol not registered for %s", __func__, proto,
 	    nhp->nh_name));
 	
 	V_netisr_enable[proto] = 0;
 
 	netisr_drain_proto_vnet(curvnet, proto);
 	NETISR_WUNLOCK();
 }
 #endif
 
 /*
  * Compose the global and per-protocol policies on dispatch, and return the
  * dispatch policy to use.
  */
 static u_int
 netisr_get_dispatch(struct netisr_proto *npp)
 {
 
 	/*
 	 * Protocol-specific configuration overrides the global default.
 	 */
 	if (npp->np_dispatch != NETISR_DISPATCH_DEFAULT)
 		return (npp->np_dispatch);
 	return (netisr_dispatch_policy);
 }
 
 /*
  * Look up the workstream given a packet and source identifier.  Do this by
  * checking the protocol's policy, and optionally call out to the protocol
  * for assistance if required.
  */
 static struct mbuf *
 netisr_select_cpuid(struct netisr_proto *npp, u_int dispatch_policy,
     uintptr_t source, struct mbuf *m, u_int *cpuidp)
 {
 	struct ifnet *ifp;
 	u_int policy;
 
 	NETISR_LOCK_ASSERT();
 
 	/*
 	 * In the event we have only one worker, shortcut and deliver to it
 	 * without further ado.
 	 */
 	if (nws_count == 1) {
 		*cpuidp = nws_array[0];
 		return (m);
 	}
 
 	/*
 	 * What happens next depends on the policy selected by the protocol.
 	 * If we want to support per-interface policies, we should do that
 	 * here first.
 	 */
 	policy = npp->np_policy;
 	if (policy == NETISR_POLICY_CPU) {
 		m = npp->np_m2cpuid(m, source, cpuidp);
 		if (m == NULL)
 			return (NULL);
 
 		/*
 		 * It's possible for a protocol not to have a good idea about
 		 * where to process a packet, in which case we fall back on
 		 * the netisr code to decide.  In the hybrid case, return the
 		 * current CPU ID, which will force an immediate direct
 		 * dispatch.  In the queued case, fall back on the SOURCE
 		 * policy.
 		 */
 		if (*cpuidp != NETISR_CPUID_NONE) {
 			*cpuidp = netisr_get_cpuid(*cpuidp);
 			return (m);
 		}
 		if (dispatch_policy == NETISR_DISPATCH_HYBRID) {
 			*cpuidp = netisr_get_cpuid(curcpu);
 			return (m);
 		}
 		policy = NETISR_POLICY_SOURCE;
 	}
 
 	if (policy == NETISR_POLICY_FLOW) {
 		if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE &&
 		    npp->np_m2flow != NULL) {
 			m = npp->np_m2flow(m, source);
 			if (m == NULL)
 				return (NULL);
 		}
 		if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
 			*cpuidp =
 			    netisr_default_flow2cpu(m->m_pkthdr.flowid);
 			return (m);
 		}
 		policy = NETISR_POLICY_SOURCE;
 	}
 
 	KASSERT(policy == NETISR_POLICY_SOURCE,
 	    ("%s: invalid policy %u for %s", __func__, npp->np_policy,
 	    npp->np_name));
 
+	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	ifp = m->m_pkthdr.rcvif;
 	if (ifp != NULL)
 		*cpuidp = nws_array[(ifp->if_index + source) % nws_count];
 	else
 		*cpuidp = nws_array[source % nws_count];
 	return (m);
 }
 
 /*
  * Process packets associated with a workstream and protocol.  For reasons of
  * fairness, we process up to one complete netisr queue at a time, moving the
  * queue to a stack-local queue for processing, but do not loop refreshing
  * from the global queue.  The caller is responsible for deciding whether to
  * loop, and for setting the NWS_RUNNING flag.  The passed workstream will be
  * locked on entry and relocked before return, but will be released while
  * processing.  The number of packets processed is returned.
  */
 static u_int
 netisr_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto)
 {
 	struct netisr_work local_npw, *npwp;
 	u_int handled;
 	struct mbuf *m;
 
 	NETISR_LOCK_ASSERT();
 	NWS_LOCK_ASSERT(nwsp);
 
 	KASSERT(nwsp->nws_flags & NWS_RUNNING,
 	    ("%s(%u): not running", __func__, proto));
 	KASSERT(proto >= 0 && proto < NETISR_MAXPROT,
 	    ("%s(%u): invalid proto\n", __func__, proto));
 
 	npwp = &nwsp->nws_work[proto];
 	if (npwp->nw_len == 0)
 		return (0);
 
 	/*
 	 * Move the global work queue to a thread-local work queue.
 	 *
 	 * Notice that this means the effective maximum length of the queue
 	 * is actually twice that of the maximum queue length specified in
 	 * the protocol registration call.
 	 */
 	handled = npwp->nw_len;
 	local_npw = *npwp;
 	npwp->nw_head = NULL;
 	npwp->nw_tail = NULL;
 	npwp->nw_len = 0;
 	nwsp->nws_pendingbits &= ~(1 << proto);
 	NWS_UNLOCK(nwsp);
 	while ((m = local_npw.nw_head) != NULL) {
 		local_npw.nw_head = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		if (local_npw.nw_head == NULL)
 			local_npw.nw_tail = NULL;
 		local_npw.nw_len--;
 		VNET_ASSERT(m->m_pkthdr.rcvif != NULL,
 		    ("%s:%d rcvif == NULL: m=%p", __func__, __LINE__, m));
 		CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
 		netisr_proto[proto].np_handler(m);
 		CURVNET_RESTORE();
 	}
 	KASSERT(local_npw.nw_len == 0,
 	    ("%s(%u): len %u", __func__, proto, local_npw.nw_len));
 	if (netisr_proto[proto].np_drainedcpu)
 		netisr_proto[proto].np_drainedcpu(nwsp->nws_cpu);
 	NWS_LOCK(nwsp);
 	npwp->nw_handled += handled;
 	return (handled);
 }
 
 /*
  * SWI handler for netisr -- processes packets in a set of workstreams that
  * it owns, woken up by calls to NWS_SIGNAL().  If this workstream is already
  * being direct dispatched, go back to sleep and wait for the dispatching
  * thread to wake us up again.
  */
 static void
 swi_net(void *arg)
 {
 #ifdef NETISR_LOCKING
 	struct rm_priotracker tracker;
 #endif
 	struct netisr_workstream *nwsp;
 	u_int bits, prot;
 
 	nwsp = arg;
 
 #ifdef DEVICE_POLLING
 	KASSERT(nws_count == 1,
 	    ("%s: device_polling but nws_count != 1", __func__));
 	netisr_poll();
 #endif
 #ifdef NETISR_LOCKING
 	NETISR_RLOCK(&tracker);
 #endif
 	NWS_LOCK(nwsp);
 	KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running"));
 	if (nwsp->nws_flags & NWS_DISPATCHING)
 		goto out;
 	nwsp->nws_flags |= NWS_RUNNING;
 	nwsp->nws_flags &= ~NWS_SCHEDULED;
 	while ((bits = nwsp->nws_pendingbits) != 0) {
 		while ((prot = ffs(bits)) != 0) {
 			prot--;
 			bits &= ~(1 << prot);
 			(void)netisr_process_workstream_proto(nwsp, prot);
 		}
 	}
 	nwsp->nws_flags &= ~NWS_RUNNING;
 out:
 	NWS_UNLOCK(nwsp);
 #ifdef NETISR_LOCKING
 	NETISR_RUNLOCK(&tracker);
 #endif
 #ifdef DEVICE_POLLING
 	netisr_pollmore();
 #endif
 }
 
 static int
 netisr_queue_workstream(struct netisr_workstream *nwsp, u_int proto,
     struct netisr_work *npwp, struct mbuf *m, int *dosignalp)
 {
 
 	NWS_LOCK_ASSERT(nwsp);
 
 	*dosignalp = 0;
 	if (npwp->nw_len < npwp->nw_qlimit) {
 		m->m_nextpkt = NULL;
 		if (npwp->nw_head == NULL) {
 			npwp->nw_head = m;
 			npwp->nw_tail = m;
 		} else {
 			npwp->nw_tail->m_nextpkt = m;
 			npwp->nw_tail = m;
 		}
 		npwp->nw_len++;
 		if (npwp->nw_len > npwp->nw_watermark)
 			npwp->nw_watermark = npwp->nw_len;
 
 		/*
 		 * We must set the bit regardless of NWS_RUNNING, so that
 		 * swi_net() keeps calling netisr_process_workstream_proto().
 		 */
 		nwsp->nws_pendingbits |= (1 << proto);
 		if (!(nwsp->nws_flags & 
 		    (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED))) {
 			nwsp->nws_flags |= NWS_SCHEDULED;
 			*dosignalp = 1;	/* Defer until unlocked. */
 		}
 		npwp->nw_queued++;
 		return (0);
 	} else {
 		m_freem(m);
 		npwp->nw_qdrops++;
 		return (ENOBUFS);
 	}
 }
 
 static int
 netisr_queue_internal(u_int proto, struct mbuf *m, u_int cpuid)
 {
 	struct netisr_workstream *nwsp;
 	struct netisr_work *npwp;
 	int dosignal, error;
 
 #ifdef NETISR_LOCKING
 	NETISR_LOCK_ASSERT();
 #endif
 	KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__,
 	    cpuid, mp_maxid));
 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
 
 	dosignal = 0;
 	error = 0;
 	nwsp = DPCPU_ID_PTR(cpuid, nws);
 	npwp = &nwsp->nws_work[proto];
 	NWS_LOCK(nwsp);
 	error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal);
 	NWS_UNLOCK(nwsp);
 	if (dosignal)
 		NWS_SIGNAL(nwsp);
 	return (error);
 }
 
 int
 netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m)
 {
 #ifdef NETISR_LOCKING
 	struct rm_priotracker tracker;
 #endif
 	u_int cpuid;
 	int error;
 
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s: invalid proto %u", __func__, proto));
 
 #ifdef NETISR_LOCKING
 	NETISR_RLOCK(&tracker);
 #endif
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s: invalid proto %u", __func__, proto));
 
 #ifdef VIMAGE
 	if (V_netisr_enable[proto] == 0) {
 		m_freem(m);
 		return (ENOPROTOOPT);
 	}
 #endif
 
 	m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_DEFERRED,
 	    source, m, &cpuid);
 	if (m != NULL) {
 		KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__,
 		    cpuid));
 		error = netisr_queue_internal(proto, m, cpuid);
 	} else
 		error = ENOBUFS;
 #ifdef NETISR_LOCKING
 	NETISR_RUNLOCK(&tracker);
 #endif
 	return (error);
 }
 
 int
 netisr_queue(u_int proto, struct mbuf *m)
 {
 
 	return (netisr_queue_src(proto, 0, m));
 }
 
 /*
  * Dispatch a packet for netisr processing; direct dispatch is permitted by
  * calling context.
  */
 int
 netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m)
 {
 #ifdef NETISR_LOCKING
 	struct rm_priotracker tracker;
 #endif
 	struct netisr_workstream *nwsp;
 	struct netisr_proto *npp;
 	struct netisr_work *npwp;
 	int dosignal, error;
 	u_int cpuid, dispatch_policy;
 
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s: invalid proto %u", __func__, proto));
 #ifdef NETISR_LOCKING
 	NETISR_RLOCK(&tracker);
 #endif
 	npp = &netisr_proto[proto];
 	KASSERT(npp->np_handler != NULL, ("%s: invalid proto %u", __func__,
 	    proto));
 
 #ifdef VIMAGE
 	if (V_netisr_enable[proto] == 0) {
 		m_freem(m);
 		return (ENOPROTOOPT);
 	}
 #endif
 
 	dispatch_policy = netisr_get_dispatch(npp);
 	if (dispatch_policy == NETISR_DISPATCH_DEFERRED)
 		return (netisr_queue_src(proto, source, m));
 
 	/*
 	 * If direct dispatch is forced, then unconditionally dispatch
 	 * without a formal CPU selection.  Borrow the current CPU's stats,
 	 * even if there's no worker on it.  In this case we don't update
 	 * nws_flags because all netisr processing will be source ordered due
 	 * to always being forced to directly dispatch.
 	 */
 	if (dispatch_policy == NETISR_DISPATCH_DIRECT) {
 		nwsp = DPCPU_PTR(nws);
 		npwp = &nwsp->nws_work[proto];
 		npwp->nw_dispatched++;
 		npwp->nw_handled++;
 		netisr_proto[proto].np_handler(m);
 		error = 0;
 		goto out_unlock;
 	}
 
 	KASSERT(dispatch_policy == NETISR_DISPATCH_HYBRID,
 	    ("%s: unknown dispatch policy (%u)", __func__, dispatch_policy));
 
 	/*
 	 * Otherwise, we execute in a hybrid mode where we will try to direct
 	 * dispatch if we're on the right CPU and the netisr worker isn't
 	 * already running.
 	 */
 	sched_pin();
 	m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_HYBRID,
 	    source, m, &cpuid);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto out_unpin;
 	}
 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
 	if (cpuid != curcpu)
 		goto queue_fallback;
 	nwsp = DPCPU_PTR(nws);
 	npwp = &nwsp->nws_work[proto];
 
 	/*-
 	 * We are willing to direct dispatch only if three conditions hold:
 	 *
 	 * (1) The netisr worker isn't already running,
 	 * (2) Another thread isn't already directly dispatching, and
 	 * (3) The netisr hasn't already been woken up.
 	 */
 	NWS_LOCK(nwsp);
 	if (nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED)) {
 		error = netisr_queue_workstream(nwsp, proto, npwp, m,
 		    &dosignal);
 		NWS_UNLOCK(nwsp);
 		if (dosignal)
 			NWS_SIGNAL(nwsp);
 		goto out_unpin;
 	}
 
 	/*
 	 * The current thread is now effectively the netisr worker, so set
 	 * the dispatching flag to prevent concurrent processing of the
 	 * stream from another thread (even the netisr worker), which could
 	 * otherwise lead to effective misordering of the stream.
 	 */
 	nwsp->nws_flags |= NWS_DISPATCHING;
 	NWS_UNLOCK(nwsp);
 	netisr_proto[proto].np_handler(m);
 	NWS_LOCK(nwsp);
 	nwsp->nws_flags &= ~NWS_DISPATCHING;
 	npwp->nw_handled++;
 	npwp->nw_hybrid_dispatched++;
 
 	/*
 	 * If other work was enqueued by another thread while we were direct
 	 * dispatching, we need to signal the netisr worker to do that work.
 	 * In the future, we might want to do some of that work in the
 	 * current thread, rather than trigger further context switches.  If
 	 * so, we'll want to establish a reasonable bound on the work done in
 	 * the "borrowed" context.
 	 */
 	if (nwsp->nws_pendingbits != 0) {
 		nwsp->nws_flags |= NWS_SCHEDULED;
 		dosignal = 1;
 	} else
 		dosignal = 0;
 	NWS_UNLOCK(nwsp);
 	if (dosignal)
 		NWS_SIGNAL(nwsp);
 	error = 0;
 	goto out_unpin;
 
 queue_fallback:
 	error = netisr_queue_internal(proto, m, cpuid);
 out_unpin:
 	sched_unpin();
 out_unlock:
 #ifdef NETISR_LOCKING
 	NETISR_RUNLOCK(&tracker);
 #endif
 	return (error);
 }
 
 int
 netisr_dispatch(u_int proto, struct mbuf *m)
 {
 
 	return (netisr_dispatch_src(proto, 0, m));
 }
 
 #ifdef DEVICE_POLLING
 /*
  * Kernel polling borrows a netisr thread to run interface polling in; this
  * function allows kernel polling to request that the netisr thread be
  * scheduled even if no packets are pending for protocols.
  */
 void
 netisr_sched_poll(void)
 {
 	struct netisr_workstream *nwsp;
 
 	nwsp = DPCPU_ID_PTR(nws_array[0], nws);
 	NWS_SIGNAL(nwsp);
 }
 #endif
 
 static void
 netisr_start_swi(u_int cpuid, struct pcpu *pc)
 {
 	char swiname[12];
 	struct netisr_workstream *nwsp;
 	int error;
 
 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
 
 	nwsp = DPCPU_ID_PTR(cpuid, nws);
 	mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF);
 	nwsp->nws_cpu = cpuid;
 	snprintf(swiname, sizeof(swiname), "netisr %u", cpuid);
 	error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp,
 	    SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie);
 	if (error)
 		panic("%s: swi_add %d", __func__, error);
 	pc->pc_netisr = nwsp->nws_intr_event;
 	if (netisr_bindthreads) {
 		error = intr_event_bind(nwsp->nws_intr_event, cpuid);
 		if (error != 0)
 			printf("%s: cpu %u: intr_event_bind: %d", __func__,
 			    cpuid, error);
 	}
 	NETISR_WLOCK();
 	nws_array[nws_count] = nwsp->nws_cpu;
 	nws_count++;
 	NETISR_WUNLOCK();
 }
 
 /*
  * Initialize the netisr subsystem.  We rely on BSS and static initialization
  * of most fields in global data structures.
  *
  * Start a worker thread for the boot CPU so that we can support network
  * traffic immediately in case the network stack is used before additional
  * CPUs are started (for example, diskless boot).
  */
 static void
 netisr_init(void *arg)
 {
 	struct pcpu *pc;
 
 	NETISR_LOCK_INIT();
 	if (netisr_maxthreads == 0 || netisr_maxthreads < -1 )
 		netisr_maxthreads = 1;		/* default behavior */
 	else if (netisr_maxthreads == -1)
 		netisr_maxthreads = mp_ncpus;	/* use max cpus */
 	if (netisr_maxthreads > mp_ncpus) {
 		printf("netisr_init: forcing maxthreads from %d to %d\n",
 		    netisr_maxthreads, mp_ncpus);
 		netisr_maxthreads = mp_ncpus;
 	}
 	if (netisr_defaultqlimit > netisr_maxqlimit) {
 		printf("netisr_init: forcing defaultqlimit from %d to %d\n",
 		    netisr_defaultqlimit, netisr_maxqlimit);
 		netisr_defaultqlimit = netisr_maxqlimit;
 	}
 #ifdef DEVICE_POLLING
 	/*
 	 * The device polling code is not yet aware of how to deal with
 	 * multiple netisr threads, so for the time being compiling in device
 	 * polling disables parallel netisr workers.
 	 */
 	if (netisr_maxthreads != 1 || netisr_bindthreads != 0) {
 		printf("netisr_init: forcing maxthreads to 1 and "
 		    "bindthreads to 0 for device polling\n");
 		netisr_maxthreads = 1;
 		netisr_bindthreads = 0;
 	}
 #endif
 
 #ifdef EARLY_AP_STARTUP
 	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
 		if (nws_count >= netisr_maxthreads)
 			break;
 		netisr_start_swi(pc->pc_cpuid, pc);
 	}
 #else
 	pc = get_pcpu();
 	netisr_start_swi(pc->pc_cpuid, pc);
 #endif
 }
 SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL);
 
 #ifndef EARLY_AP_STARTUP
 /*
  * Start worker threads for additional CPUs.  No attempt to gracefully handle
  * work reassignment, we don't yet support dynamic reconfiguration.
  */
 static void
 netisr_start(void *arg)
 {
 	struct pcpu *pc;
 
 	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
 		if (nws_count >= netisr_maxthreads)
 			break;
 		/* Worker will already be present for boot CPU. */
 		if (pc->pc_netisr != NULL)
 			continue;
 		netisr_start_swi(pc->pc_cpuid, pc);
 	}
 }
 SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL);
 #endif
 
 /*
  * Sysctl monitoring for netisr: query a list of registered protocols.
  */
 static int
 sysctl_netisr_proto(SYSCTL_HANDLER_ARGS)
 {
 	struct rm_priotracker tracker;
 	struct sysctl_netisr_proto *snpp, *snp_array;
 	struct netisr_proto *npp;
 	u_int counter, proto;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EINVAL);
 	snp_array = malloc(sizeof(*snp_array) * NETISR_MAXPROT, M_TEMP,
 	    M_ZERO | M_WAITOK);
 	counter = 0;
 	NETISR_RLOCK(&tracker);
 	for (proto = 0; proto < NETISR_MAXPROT; proto++) {
 		npp = &netisr_proto[proto];
 		if (npp->np_name == NULL)
 			continue;
 		snpp = &snp_array[counter];
 		snpp->snp_version = sizeof(*snpp);
 		strlcpy(snpp->snp_name, npp->np_name, NETISR_NAMEMAXLEN);
 		snpp->snp_proto = proto;
 		snpp->snp_qlimit = npp->np_qlimit;
 		snpp->snp_policy = npp->np_policy;
 		snpp->snp_dispatch = npp->np_dispatch;
 		if (npp->np_m2flow != NULL)
 			snpp->snp_flags |= NETISR_SNP_FLAGS_M2FLOW;
 		if (npp->np_m2cpuid != NULL)
 			snpp->snp_flags |= NETISR_SNP_FLAGS_M2CPUID;
 		if (npp->np_drainedcpu != NULL)
 			snpp->snp_flags |= NETISR_SNP_FLAGS_DRAINEDCPU;
 		counter++;
 	}
 	NETISR_RUNLOCK(&tracker);
 	KASSERT(counter <= NETISR_MAXPROT,
 	    ("sysctl_netisr_proto: counter too big (%d)", counter));
 	error = SYSCTL_OUT(req, snp_array, sizeof(*snp_array) * counter);
 	free(snp_array, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_isr, OID_AUTO, proto,
     CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_proto,
     "S,sysctl_netisr_proto",
     "Return list of protocols registered with netisr");
 
 /*
  * Sysctl monitoring for netisr: query a list of workstreams.
  */
 static int
 sysctl_netisr_workstream(SYSCTL_HANDLER_ARGS)
 {
 	struct rm_priotracker tracker;
 	struct sysctl_netisr_workstream *snwsp, *snws_array;
 	struct netisr_workstream *nwsp;
 	u_int counter, cpuid;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EINVAL);
 	snws_array = malloc(sizeof(*snws_array) * MAXCPU, M_TEMP,
 	    M_ZERO | M_WAITOK);
 	counter = 0;
 	NETISR_RLOCK(&tracker);
 	CPU_FOREACH(cpuid) {
 		nwsp = DPCPU_ID_PTR(cpuid, nws);
 		if (nwsp->nws_intr_event == NULL)
 			continue;
 		NWS_LOCK(nwsp);
 		snwsp = &snws_array[counter];
 		snwsp->snws_version = sizeof(*snwsp);
 
 		/*
 		 * For now, we equate workstream IDs and CPU IDs in the
 		 * kernel, but expose them independently to userspace in case
 		 * that assumption changes in the future.
 		 */
 		snwsp->snws_wsid = cpuid;
 		snwsp->snws_cpu = cpuid;
 		if (nwsp->nws_intr_event != NULL)
 			snwsp->snws_flags |= NETISR_SNWS_FLAGS_INTR;
 		NWS_UNLOCK(nwsp);
 		counter++;
 	}
 	NETISR_RUNLOCK(&tracker);
 	KASSERT(counter <= MAXCPU,
 	    ("sysctl_netisr_workstream: counter too big (%d)", counter));
 	error = SYSCTL_OUT(req, snws_array, sizeof(*snws_array) * counter);
 	free(snws_array, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_isr, OID_AUTO, workstream,
     CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_workstream,
     "S,sysctl_netisr_workstream",
     "Return list of workstreams implemented by netisr");
 
 /*
  * Sysctl monitoring for netisr: query per-protocol data across all
  * workstreams.
  */
 static int
 sysctl_netisr_work(SYSCTL_HANDLER_ARGS)
 {
 	struct rm_priotracker tracker;
 	struct sysctl_netisr_work *snwp, *snw_array;
 	struct netisr_workstream *nwsp;
 	struct netisr_proto *npp;
 	struct netisr_work *nwp;
 	u_int counter, cpuid, proto;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EINVAL);
 	snw_array = malloc(sizeof(*snw_array) * MAXCPU * NETISR_MAXPROT,
 	    M_TEMP, M_ZERO | M_WAITOK);
 	counter = 0;
 	NETISR_RLOCK(&tracker);
 	CPU_FOREACH(cpuid) {
 		nwsp = DPCPU_ID_PTR(cpuid, nws);
 		if (nwsp->nws_intr_event == NULL)
 			continue;
 		NWS_LOCK(nwsp);
 		for (proto = 0; proto < NETISR_MAXPROT; proto++) {
 			npp = &netisr_proto[proto];
 			if (npp->np_name == NULL)
 				continue;
 			nwp = &nwsp->nws_work[proto];
 			snwp = &snw_array[counter];
 			snwp->snw_version = sizeof(*snwp);
 			snwp->snw_wsid = cpuid;		/* See comment above. */
 			snwp->snw_proto = proto;
 			snwp->snw_len = nwp->nw_len;
 			snwp->snw_watermark = nwp->nw_watermark;
 			snwp->snw_dispatched = nwp->nw_dispatched;
 			snwp->snw_hybrid_dispatched =
 			    nwp->nw_hybrid_dispatched;
 			snwp->snw_qdrops = nwp->nw_qdrops;
 			snwp->snw_queued = nwp->nw_queued;
 			snwp->snw_handled = nwp->nw_handled;
 			counter++;
 		}
 		NWS_UNLOCK(nwsp);
 	}
 	KASSERT(counter <= MAXCPU * NETISR_MAXPROT,
 	    ("sysctl_netisr_work: counter too big (%d)", counter));
 	NETISR_RUNLOCK(&tracker);
 	error = SYSCTL_OUT(req, snw_array, sizeof(*snw_array) * counter);
 	free(snw_array, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_isr, OID_AUTO, work,
     CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_work,
     "S,sysctl_netisr_work",
     "Return list of per-workstream, per-protocol work in netisr");
 
 #ifdef DDB
 DB_SHOW_COMMAND(netisr, db_show_netisr)
 {
 	struct netisr_workstream *nwsp;
 	struct netisr_work *nwp;
 	int first, proto;
 	u_int cpuid;
 
 	db_printf("%3s %6s %5s %5s %5s %8s %8s %8s %8s\n", "CPU", "Proto",
 	    "Len", "WMark", "Max", "Disp", "HDisp", "Drop", "Queue");
 	CPU_FOREACH(cpuid) {
 		nwsp = DPCPU_ID_PTR(cpuid, nws);
 		if (nwsp->nws_intr_event == NULL)
 			continue;
 		first = 1;
 		for (proto = 0; proto < NETISR_MAXPROT; proto++) {
 			if (netisr_proto[proto].np_handler == NULL)
 				continue;
 			nwp = &nwsp->nws_work[proto];
 			if (first) {
 				db_printf("%3d ", cpuid);
 				first = 0;
 			} else
 				db_printf("%3s ", "");
 			db_printf(
 			    "%6s %5d %5d %5d %8ju %8ju %8ju %8ju\n",
 			    netisr_proto[proto].np_name, nwp->nw_len,
 			    nwp->nw_watermark, nwp->nw_qlimit,
 			    nwp->nw_dispatched, nwp->nw_hybrid_dispatched,
 			    nwp->nw_qdrops, nwp->nw_queued);
 		}
 	}
 }
 #endif
Index: head/sys/net80211/ieee80211_hwmp.c
===================================================================
--- head/sys/net80211/ieee80211_hwmp.c	(revision 348253)
+++ head/sys/net80211/ieee80211_hwmp.c	(revision 348254)
@@ -1,2084 +1,2085 @@
 /*- 
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009 The FreeBSD Foundation 
  * All rights reserved. 
  * 
  * This software was developed by Rui Paulo under sponsorship from the
  * FreeBSD Foundation. 
  *  
  * Redistribution and use in source and binary forms, with or without 
  * modification, are permitted provided that the following conditions 
  * are met: 
  * 1. Redistributions of source code must retain the above copyright 
  *    notice, this list of conditions and the following disclaimer. 
  * 2. Redistributions in binary form must reproduce the above copyright 
  *    notice, this list of conditions and the following disclaimer in the 
  *    documentation and/or other materials provided with the distribution. 
  * 
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
  * SUCH DAMAGE. 
  */ 
 #include <sys/cdefs.h>
 #ifdef __FreeBSD__
 __FBSDID("$FreeBSD$");
 #endif
 
 /*
  * IEEE 802.11s Hybrid Wireless Mesh Protocol, HWMP.
  *
  * Based on March 2009, D3.0 802.11s draft spec.
  */
 #include "opt_inet.h"
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/endian.h>
 #include <sys/errno.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_media.h>
 #include <net/if_llc.h>
 #include <net/ethernet.h>
 
 #include <net/bpf.h>
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_action.h>
 #include <net80211/ieee80211_input.h>
 #include <net80211/ieee80211_mesh.h>
 
 static void	hwmp_vattach(struct ieee80211vap *);
 static void	hwmp_vdetach(struct ieee80211vap *);
 static int	hwmp_newstate(struct ieee80211vap *,
 		    enum ieee80211_state, int);
 static int	hwmp_send_action(struct ieee80211vap *,
 		    const uint8_t [IEEE80211_ADDR_LEN],
 		    uint8_t *, size_t);
 static uint8_t * hwmp_add_meshpreq(uint8_t *,
 		    const struct ieee80211_meshpreq_ie *);
 static uint8_t * hwmp_add_meshprep(uint8_t *,
 		    const struct ieee80211_meshprep_ie *);
 static uint8_t * hwmp_add_meshperr(uint8_t *,
 		    const struct ieee80211_meshperr_ie *);
 static uint8_t * hwmp_add_meshrann(uint8_t *,
 		    const struct ieee80211_meshrann_ie *);
 static void	hwmp_rootmode_setup(struct ieee80211vap *);
 static void	hwmp_rootmode_cb(void *);
 static void	hwmp_rootmode_rann_cb(void *);
 static void	hwmp_recv_preq(struct ieee80211vap *, struct ieee80211_node *,
 		    const struct ieee80211_frame *,
 		    const struct ieee80211_meshpreq_ie *);
 static int	hwmp_send_preq(struct ieee80211vap *,
 		    const uint8_t [IEEE80211_ADDR_LEN],
 		    struct ieee80211_meshpreq_ie *,
 		    struct timeval *, struct timeval *);
 static void	hwmp_recv_prep(struct ieee80211vap *, struct ieee80211_node *,
 		    const struct ieee80211_frame *,
 		    const struct ieee80211_meshprep_ie *);
 static int	hwmp_send_prep(struct ieee80211vap *,
 		    const uint8_t [IEEE80211_ADDR_LEN],
 		    struct ieee80211_meshprep_ie *);
 static void	hwmp_recv_perr(struct ieee80211vap *, struct ieee80211_node *,
 		    const struct ieee80211_frame *,
 		    const struct ieee80211_meshperr_ie *);
 static int	hwmp_send_perr(struct ieee80211vap *,
 		    const uint8_t [IEEE80211_ADDR_LEN],
 		    struct ieee80211_meshperr_ie *);
 static void	hwmp_senderror(struct ieee80211vap *,
 		    const uint8_t [IEEE80211_ADDR_LEN],
 		    struct ieee80211_mesh_route *, int);
 static void	hwmp_recv_rann(struct ieee80211vap *, struct ieee80211_node *,
 		   const struct ieee80211_frame *,
 		   const struct ieee80211_meshrann_ie *);
 static int	hwmp_send_rann(struct ieee80211vap *,
 		    const uint8_t [IEEE80211_ADDR_LEN],
 		    struct ieee80211_meshrann_ie *);
 static struct ieee80211_node *
 		hwmp_discover(struct ieee80211vap *,
 		    const uint8_t [IEEE80211_ADDR_LEN], struct mbuf *);
 static void	hwmp_peerdown(struct ieee80211_node *);
 
 static struct timeval ieee80211_hwmp_preqminint = { 0, 100000 };
 static struct timeval ieee80211_hwmp_perrminint = { 0, 100000 };
 
 
 /* NB: the Target Address set in a Proactive PREQ is the broadcast address. */
 static const uint8_t	broadcastaddr[IEEE80211_ADDR_LEN] =
 	{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
 typedef uint32_t ieee80211_hwmp_seq;
 #define	HWMP_SEQ_LT(a, b)	((int32_t)((a)-(b)) < 0)
 #define	HWMP_SEQ_LEQ(a, b)	((int32_t)((a)-(b)) <= 0)
 #define	HWMP_SEQ_EQ(a, b)	((int32_t)((a)-(b)) == 0)
 #define	HWMP_SEQ_GT(a, b)	((int32_t)((a)-(b)) > 0)
 
 #define HWMP_SEQ_MAX(a, b)	(a > b ? a : b)
 
 /*
  * Private extension of ieee80211_mesh_route.
  */
 struct ieee80211_hwmp_route {
 	ieee80211_hwmp_seq	hr_seq;		/* last HWMP seq seen from dst*/
 	ieee80211_hwmp_seq	hr_preqid;	/* last PREQ ID seen from dst */
 	ieee80211_hwmp_seq	hr_origseq;	/* seq. no. on our latest PREQ*/
 	struct timeval		hr_lastpreq;	/* last time we sent a PREQ */
 	struct timeval		hr_lastrootconf; /* last sent PREQ root conf */
 	int			hr_preqretries;	/* number of discoveries */
 	int			hr_lastdiscovery; /* last discovery in ticks */
 };
 struct ieee80211_hwmp_state {
 	ieee80211_hwmp_seq	hs_seq;		/* next seq to be used */
 	ieee80211_hwmp_seq	hs_preqid;	/* next PREQ ID to be used */
 	int			hs_rootmode;	/* proactive HWMP */
 	struct timeval		hs_lastperr;	/* last time we sent a PERR */
 	struct callout		hs_roottimer;
 	uint8_t			hs_maxhops;	/* max hop count */
 };
 
 static SYSCTL_NODE(_net_wlan, OID_AUTO, hwmp, CTLFLAG_RD, 0,
     "IEEE 802.11s HWMP parameters");
 static int	ieee80211_hwmp_targetonly = 0;
 SYSCTL_INT(_net_wlan_hwmp, OID_AUTO, targetonly, CTLFLAG_RW,
     &ieee80211_hwmp_targetonly, 0, "Set TO bit on generated PREQs");
 static int	ieee80211_hwmp_pathtimeout = -1;
 SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, pathlifetime, CTLTYPE_INT | CTLFLAG_RW,
     &ieee80211_hwmp_pathtimeout, 0, ieee80211_sysctl_msecs_ticks, "I",
     "path entry lifetime (ms)");
 static int	ieee80211_hwmp_maxpreq_retries = -1;
 SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, maxpreq_retries, CTLTYPE_INT | CTLFLAG_RW,
     &ieee80211_hwmp_maxpreq_retries, 0, ieee80211_sysctl_msecs_ticks, "I",
     "maximum number of preq retries");
 static int	ieee80211_hwmp_net_diameter_traversaltime = -1;
 SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, net_diameter_traversal_time,
     CTLTYPE_INT | CTLFLAG_RW, &ieee80211_hwmp_net_diameter_traversaltime, 0,
     ieee80211_sysctl_msecs_ticks, "I",
     "estimate travelse time across the MBSS (ms)");
 static int	ieee80211_hwmp_roottimeout = -1;
 SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, roottimeout, CTLTYPE_INT | CTLFLAG_RW,
     &ieee80211_hwmp_roottimeout, 0, ieee80211_sysctl_msecs_ticks, "I",
     "root PREQ timeout (ms)");
 static int	ieee80211_hwmp_rootint = -1;
 SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, rootint, CTLTYPE_INT | CTLFLAG_RW,
     &ieee80211_hwmp_rootint, 0, ieee80211_sysctl_msecs_ticks, "I",
     "root interval (ms)");
 static int	ieee80211_hwmp_rannint = -1;
 SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, rannint, CTLTYPE_INT | CTLFLAG_RW,
     &ieee80211_hwmp_rannint, 0, ieee80211_sysctl_msecs_ticks, "I",
     "root announcement interval (ms)");
 static struct timeval ieee80211_hwmp_rootconfint = { 0, 0 };
 static int	ieee80211_hwmp_rootconfint_internal = -1;
 SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, rootconfint, CTLTYPE_INT | CTLFLAG_RD,
     &ieee80211_hwmp_rootconfint_internal, 0, ieee80211_sysctl_msecs_ticks, "I",
     "root confirmation interval (ms) (read-only)");
 
 #define	IEEE80211_HWMP_DEFAULT_MAXHOPS	31
 
 static	ieee80211_recv_action_func hwmp_recv_action_meshpath;
 
 static struct ieee80211_mesh_proto_path mesh_proto_hwmp = {
 	.mpp_descr	= "HWMP",
 	.mpp_ie		= IEEE80211_MESHCONF_PATH_HWMP,
 	.mpp_discover	= hwmp_discover,
 	.mpp_peerdown	= hwmp_peerdown,
 	.mpp_senderror	= hwmp_senderror,
 	.mpp_vattach	= hwmp_vattach,
 	.mpp_vdetach	= hwmp_vdetach,
 	.mpp_newstate	= hwmp_newstate,
 	.mpp_privlen	= sizeof(struct ieee80211_hwmp_route),
 };
 SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, inact, CTLTYPE_INT | CTLFLAG_RW,
 	&mesh_proto_hwmp.mpp_inact, 0, ieee80211_sysctl_msecs_ticks, "I",
 	"mesh route inactivity timeout (ms)");
 
 
 static void
 ieee80211_hwmp_init(void)
 {
 	/* Default values as per amendment */
 	ieee80211_hwmp_pathtimeout = msecs_to_ticks(5*1000);
 	ieee80211_hwmp_roottimeout = msecs_to_ticks(5*1000);
 	ieee80211_hwmp_rootint = msecs_to_ticks(2*1000);
 	ieee80211_hwmp_rannint = msecs_to_ticks(1*1000);
 	ieee80211_hwmp_rootconfint_internal = msecs_to_ticks(2*1000);
 	ieee80211_hwmp_maxpreq_retries = 3;
 	/*
 	 * (TU): A measurement of time equal to 1024 μs,
 	 * 500 TU is 512 ms.
 	 */
 	ieee80211_hwmp_net_diameter_traversaltime = msecs_to_ticks(512);
 
 	/*
 	 * NB: I dont know how to make SYSCTL_PROC that calls ms to ticks
 	 * and return a struct timeval...
 	 */
 	ieee80211_hwmp_rootconfint.tv_usec =
 	    ieee80211_hwmp_rootconfint_internal * 1000;
 
 	/*
 	 * Register action frame handler.
 	 */
 	ieee80211_recv_action_register(IEEE80211_ACTION_CAT_MESH,
 	    IEEE80211_ACTION_MESH_HWMP, hwmp_recv_action_meshpath);
 
 	/* NB: default is 5 secs per spec */
 	mesh_proto_hwmp.mpp_inact = msecs_to_ticks(5*1000);
 
 	/*
 	 * Register HWMP.
 	 */
 	ieee80211_mesh_register_proto_path(&mesh_proto_hwmp);
 }
 SYSINIT(wlan_hwmp, SI_SUB_DRIVERS, SI_ORDER_SECOND, ieee80211_hwmp_init, NULL);
 
 static void
 hwmp_vattach(struct ieee80211vap *vap)
 {
 	struct ieee80211_hwmp_state *hs;
 
 	KASSERT(vap->iv_opmode == IEEE80211_M_MBSS,
 	    ("not a mesh vap, opmode %d", vap->iv_opmode));
 
 	hs = IEEE80211_MALLOC(sizeof(struct ieee80211_hwmp_state), M_80211_VAP,
 	    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 	if (hs == NULL) {
 		printf("%s: couldn't alloc HWMP state\n", __func__);
 		return;
 	}
 	hs->hs_maxhops = IEEE80211_HWMP_DEFAULT_MAXHOPS;
 	callout_init(&hs->hs_roottimer, 1);
 	vap->iv_hwmp = hs;
 }
 
 static void
 hwmp_vdetach(struct ieee80211vap *vap)
 {
 	struct ieee80211_hwmp_state *hs = vap->iv_hwmp;
 
 	callout_drain(&hs->hs_roottimer);
 	IEEE80211_FREE(vap->iv_hwmp, M_80211_VAP);
 	vap->iv_hwmp = NULL;
 } 
 
 static int
 hwmp_newstate(struct ieee80211vap *vap, enum ieee80211_state ostate, int arg)
 {
 	enum ieee80211_state nstate = vap->iv_state;
 	struct ieee80211_hwmp_state *hs = vap->iv_hwmp;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE, "%s: %s -> %s (%d)\n",
 	    __func__, ieee80211_state_name[ostate],
 	    ieee80211_state_name[nstate], arg);
 
 	if (nstate != IEEE80211_S_RUN && ostate == IEEE80211_S_RUN)
 		callout_drain(&hs->hs_roottimer);
 	if (nstate == IEEE80211_S_RUN)
 		hwmp_rootmode_setup(vap);
 	return 0;
 }
 
 /*
  * Verify the length of an HWMP PREQ and return the number
  * of destinations >= 1, if verification fails -1 is returned.
  */
 static int
 verify_mesh_preq_len(struct ieee80211vap *vap,
     const struct ieee80211_frame *wh, const uint8_t *iefrm)
 {
 	int alloc_sz = -1;
 	int ndest = -1;
 	if (iefrm[2] & IEEE80211_MESHPREQ_FLAGS_AE) {
 		/* Originator External Address  present */
 		alloc_sz =  IEEE80211_MESHPREQ_BASE_SZ_AE;
 		ndest = iefrm[IEEE80211_MESHPREQ_TCNT_OFFSET_AE];
 	} else {
 		/* w/o Originator External Address */
 		alloc_sz =  IEEE80211_MESHPREQ_BASE_SZ;
 		ndest = iefrm[IEEE80211_MESHPREQ_TCNT_OFFSET];
 	}
 	alloc_sz += ndest * IEEE80211_MESHPREQ_TRGT_SZ;
 
 	if(iefrm[1] != (alloc_sz)) {
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_ACTION | IEEE80211_MSG_HWMP,
 		    wh, NULL, "PREQ (AE=%s) with wrong len",
 		    iefrm[2] & IEEE80211_MESHPREQ_FLAGS_AE ? "1" : "0");
 		return (-1);
 	}
 	return ndest;
 }
 
 /*
  * Verify the length of an HWMP PREP and returns 1 on success,
  * otherwise -1.
  */
 static int
 verify_mesh_prep_len(struct ieee80211vap *vap,
     const struct ieee80211_frame *wh, const uint8_t *iefrm)
 {
 	int alloc_sz = -1;
 	if (iefrm[2] & IEEE80211_MESHPREP_FLAGS_AE) {
 		if (iefrm[1] == IEEE80211_MESHPREP_BASE_SZ_AE)
 			alloc_sz = IEEE80211_MESHPREP_BASE_SZ_AE;
 	} else if (iefrm[1] == IEEE80211_MESHPREP_BASE_SZ)
 		alloc_sz = IEEE80211_MESHPREP_BASE_SZ;
 	if(alloc_sz < 0) {
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_ACTION | IEEE80211_MSG_HWMP,
 		    wh, NULL, "PREP (AE=%s) with wrong len",
 		    iefrm[2] & IEEE80211_MESHPREP_FLAGS_AE ? "1" : "0");
 		return (-1);
 	}
 	return (1);
 }
 
 /*
  * Verify the length of an HWMP PERR and return the number
  * of destinations >= 1, if verification fails -1 is returned.
  */
 static int
 verify_mesh_perr_len(struct ieee80211vap *vap,
     const struct ieee80211_frame *wh, const uint8_t *iefrm)
 {
 	int alloc_sz = -1;
 	const uint8_t *iefrm_t = iefrm;
 	uint8_t ndest = iefrm_t[IEEE80211_MESHPERR_NDEST_OFFSET];
 	int i;
 
 	if(ndest > IEEE80211_MESHPERR_MAXDEST) {
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_ACTION | IEEE80211_MSG_HWMP,
 		    wh, NULL, "PERR with wrong number of destionat (>19), %u",
 		    ndest);
 		return (-1);
 	}
 
 	iefrm_t += IEEE80211_MESHPERR_NDEST_OFFSET + 1; /* flag is next field */
 	/* We need to check each destionation flag to know size */
 	for(i = 0; i<ndest; i++) {
 		if ((*iefrm_t) & IEEE80211_MESHPERR_FLAGS_AE)
 			iefrm_t += IEEE80211_MESHPERR_DEST_SZ_AE;
 		else
 			iefrm_t += IEEE80211_MESHPERR_DEST_SZ;
 	}
 
 	alloc_sz = (iefrm_t - iefrm) - 2; /* action + code */
 	if(alloc_sz !=  iefrm[1]) {
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_ACTION | IEEE80211_MSG_HWMP,
 		    wh, NULL, "%s", "PERR with wrong len");
 		return (-1);
 	}
 	return ndest;
 }
 
 static int
 hwmp_recv_action_meshpath(struct ieee80211_node *ni,
 	const struct ieee80211_frame *wh,
 	const uint8_t *frm, const uint8_t *efrm)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_meshpreq_ie *preq;
 	struct ieee80211_meshprep_ie *prep;
 	struct ieee80211_meshperr_ie *perr;
 	struct ieee80211_meshrann_ie rann;
 	const uint8_t *iefrm = frm + 2; /* action + code */
 	const uint8_t *iefrm_t = iefrm; /* temporary pointer */
 	int ndest = -1;
 	int found = 0;
 
 	while (efrm - iefrm > 1) {
 		IEEE80211_VERIFY_LENGTH(efrm - iefrm, iefrm[1] + 2, return 0);
 		switch (*iefrm) {
 		case IEEE80211_ELEMID_MESHPREQ:
 		{
 			int i = 0;
 
 			iefrm_t = iefrm;
 			ndest = verify_mesh_preq_len(vap, wh, iefrm_t);
 			if (ndest < 0) {
 				vap->iv_stats.is_rx_mgtdiscard++;
 				break;
 			}
 			preq = IEEE80211_MALLOC(sizeof(*preq) +
 			    (ndest - 1) * sizeof(*preq->preq_targets),
 			    M_80211_MESH_PREQ,
 			    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 			KASSERT(preq != NULL, ("preq == NULL"));
 
 			preq->preq_ie = *iefrm_t++;
 			preq->preq_len = *iefrm_t++;
 			preq->preq_flags = *iefrm_t++;
 			preq->preq_hopcount = *iefrm_t++;
 			preq->preq_ttl = *iefrm_t++;
 			preq->preq_id = le32dec(iefrm_t); iefrm_t += 4;
 			IEEE80211_ADDR_COPY(preq->preq_origaddr, iefrm_t);
 			iefrm_t += 6;
 			preq->preq_origseq = le32dec(iefrm_t); iefrm_t += 4;
 			/* NB: may have Originator Proxied Address */
 			if (preq->preq_flags & IEEE80211_MESHPREQ_FLAGS_AE)  {
 				IEEE80211_ADDR_COPY(
 				    preq->preq_orig_ext_addr, iefrm_t);
 				iefrm_t += 6;
 			}
 			preq->preq_lifetime = le32dec(iefrm_t); iefrm_t += 4;
 			preq->preq_metric = le32dec(iefrm_t); iefrm_t += 4;
 			preq->preq_tcount = *iefrm_t++;
 			
 			for (i = 0; i < preq->preq_tcount; i++) {
 				preq->preq_targets[i].target_flags = *iefrm_t++;
 				IEEE80211_ADDR_COPY(
 				    preq->preq_targets[i].target_addr, iefrm_t);
 				iefrm_t += 6;
 				preq->preq_targets[i].target_seq =
 				    le32dec(iefrm_t);
 				iefrm_t += 4;
 			}
 
 			hwmp_recv_preq(vap, ni, wh, preq);
 			IEEE80211_FREE(preq, M_80211_MESH_PREQ);
 			found++;
 			break;
 		}
 		case IEEE80211_ELEMID_MESHPREP:
 		{
 			iefrm_t = iefrm;
 			ndest = verify_mesh_prep_len(vap, wh, iefrm_t);
 			if (ndest < 0) {
 				vap->iv_stats.is_rx_mgtdiscard++;
 				break;
 			}
 			prep = IEEE80211_MALLOC(sizeof(*prep),
 			    M_80211_MESH_PREP,
 			    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 			KASSERT(prep != NULL, ("prep == NULL"));
 
 			prep->prep_ie = *iefrm_t++;
 			prep->prep_len = *iefrm_t++;
 			prep->prep_flags = *iefrm_t++;
 			prep->prep_hopcount = *iefrm_t++;
 			prep->prep_ttl = *iefrm_t++;
 			IEEE80211_ADDR_COPY(prep->prep_targetaddr, iefrm_t);
 			iefrm_t += 6;
 			prep->prep_targetseq = le32dec(iefrm_t); iefrm_t += 4;
 			/* NB: May have Target Proxied Address */
 			if (prep->prep_flags & IEEE80211_MESHPREP_FLAGS_AE)  {
 				IEEE80211_ADDR_COPY(
 				    prep->prep_target_ext_addr, iefrm_t);
 				iefrm_t += 6;
 			}
 			prep->prep_lifetime = le32dec(iefrm_t); iefrm_t += 4;
 			prep->prep_metric = le32dec(iefrm_t); iefrm_t += 4;
 			IEEE80211_ADDR_COPY(prep->prep_origaddr, iefrm_t);
 			iefrm_t += 6;
 			prep->prep_origseq = le32dec(iefrm_t); iefrm_t += 4;
 
 			hwmp_recv_prep(vap, ni, wh, prep);
 			IEEE80211_FREE(prep, M_80211_MESH_PREP);
 			found++;
 			break;
 		}
 		case IEEE80211_ELEMID_MESHPERR:
 		{
 			int i = 0;
 
 			iefrm_t = iefrm;
 			ndest = verify_mesh_perr_len(vap, wh, iefrm_t);
 			if (ndest < 0) {
 				vap->iv_stats.is_rx_mgtdiscard++;
 				break;
 			}
 			perr = IEEE80211_MALLOC(sizeof(*perr) +
 			    (ndest - 1) * sizeof(*perr->perr_dests),
 			    M_80211_MESH_PERR,
 			    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 			KASSERT(perr != NULL, ("perr == NULL"));
 
 			perr->perr_ie = *iefrm_t++;
 			perr->perr_len = *iefrm_t++;
 			perr->perr_ttl = *iefrm_t++;
 			perr->perr_ndests = *iefrm_t++;
 
 			for (i = 0; i<perr->perr_ndests; i++) {
 				perr->perr_dests[i].dest_flags = *iefrm_t++;
 				IEEE80211_ADDR_COPY(
 				    perr->perr_dests[i].dest_addr, iefrm_t);
 				iefrm_t += 6;
 				perr->perr_dests[i].dest_seq = le32dec(iefrm_t);
 				iefrm_t += 4;
 				/* NB: May have Target Proxied Address */
 				if (perr->perr_dests[i].dest_flags &
 				    IEEE80211_MESHPERR_FLAGS_AE) {
 					IEEE80211_ADDR_COPY(
 					    perr->perr_dests[i].dest_ext_addr,
 					    iefrm_t);
 					iefrm_t += 6;
 				}
 				perr->perr_dests[i].dest_rcode =
 				    le16dec(iefrm_t);
 				iefrm_t += 2;
 			}
 
 			hwmp_recv_perr(vap, ni, wh, perr);
 			IEEE80211_FREE(perr, M_80211_MESH_PERR);
 			found++;
 			break;
 		}
 		case IEEE80211_ELEMID_MESHRANN:
 		{
 			const struct ieee80211_meshrann_ie *mrann =
 			    (const struct ieee80211_meshrann_ie *) iefrm;
 			if (mrann->rann_len !=
 			    sizeof(struct ieee80211_meshrann_ie) - 2) {
 				IEEE80211_DISCARD(vap,
 				    IEEE80211_MSG_ACTION | IEEE80211_MSG_HWMP,
 				    wh, NULL, "%s", "RAN with wrong len");
 				    vap->iv_stats.is_rx_mgtdiscard++;
 				return 1;
 			}
 			memcpy(&rann, mrann, sizeof(rann));
 			rann.rann_seq = le32dec(&mrann->rann_seq);
 			rann.rann_interval = le32dec(&mrann->rann_interval);
 			rann.rann_metric = le32dec(&mrann->rann_metric);
 			hwmp_recv_rann(vap, ni, wh, &rann);
 			found++;
 			break;
 		}
 		}
 		iefrm += iefrm[1] + 2;
 	}
 	if (!found) {
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_ACTION | IEEE80211_MSG_HWMP,
 		    wh, NULL, "%s", "PATH SEL action without IE");
 		vap->iv_stats.is_rx_mgtdiscard++;
 	}
 	return 0;
 }
 
 static int
 hwmp_send_action(struct ieee80211vap *vap,
     const uint8_t da[IEEE80211_ADDR_LEN],
     uint8_t *ie, size_t len)
 {
 	struct ieee80211_node *ni;
 	struct ieee80211com *ic;
 	struct ieee80211_bpf_params params;
 	struct mbuf *m;
 	uint8_t *frm;
 	int ret;
 
 	if (IEEE80211_IS_MULTICAST(da)) {
 		ni = ieee80211_ref_node(vap->iv_bss);
 #ifdef IEEE80211_DEBUG_REFCNT
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 		"ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n",
 		__func__, __LINE__,
 		ni, ether_sprintf(ni->ni_macaddr),
 		ieee80211_node_refcnt(ni)+1);
 #endif
 		ieee80211_ref_node(ni);
 	}
 	else
 		ni = ieee80211_mesh_find_txnode(vap, da);
 
 	if (vap->iv_state == IEEE80211_S_CAC) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT, ni,
 		    "block %s frame in CAC state", "HWMP action");
 		vap->iv_stats.is_tx_badstate++;
 		return EIO;	/* XXX */
 	}
 
 	KASSERT(ni != NULL, ("null node"));
 	ic = ni->ni_ic;
 
 	m = ieee80211_getmgtframe(&frm,
 	    ic->ic_headroom + sizeof(struct ieee80211_frame),
 	    sizeof(struct ieee80211_action) + len
 	);
 	if (m == NULL) {
 		ieee80211_free_node(ni);
 		vap->iv_stats.is_tx_nobuf++;
 		return ENOMEM;
 	}
 	*frm++ = IEEE80211_ACTION_CAT_MESH;
 	*frm++ = IEEE80211_ACTION_MESH_HWMP;
 	switch (*ie) {
 	case IEEE80211_ELEMID_MESHPREQ:
 		frm = hwmp_add_meshpreq(frm,
 		    (struct ieee80211_meshpreq_ie *)ie);
 		break;
 	case IEEE80211_ELEMID_MESHPREP:
 		frm = hwmp_add_meshprep(frm,
 		    (struct ieee80211_meshprep_ie *)ie);
 		break;
 	case IEEE80211_ELEMID_MESHPERR:
 		frm = hwmp_add_meshperr(frm,
 		    (struct ieee80211_meshperr_ie *)ie);
 		break;
 	case IEEE80211_ELEMID_MESHRANN:
 		frm = hwmp_add_meshrann(frm,
 		    (struct ieee80211_meshrann_ie *)ie);
 		break;
 	}
 
 	m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 	M_PREPEND(m, sizeof(struct ieee80211_frame), M_NOWAIT);
 	if (m == NULL) {
 		ieee80211_free_node(ni);
 		vap->iv_stats.is_tx_nobuf++;
 		return ENOMEM;
 	}
 
 	IEEE80211_TX_LOCK(ic);
 
 	ieee80211_send_setup(ni, m,
 	    IEEE80211_FC0_TYPE_MGT | IEEE80211_FC0_SUBTYPE_ACTION,
 	    IEEE80211_NONQOS_TID, vap->iv_myaddr, da, vap->iv_myaddr);
 
 	m->m_flags |= M_ENCAP;		/* mark encapsulated */
 	IEEE80211_NODE_STAT(ni, tx_mgmt);
 
 	memset(&params, 0, sizeof(params));
 	params.ibp_pri = WME_AC_VO;
 	params.ibp_rate0 = ni->ni_txparms->mgmtrate;
 	if (IEEE80211_IS_MULTICAST(da))
 		params.ibp_try0 = 1;
 	else
 		params.ibp_try0 = ni->ni_txparms->maxretry;
 	params.ibp_power = ni->ni_txpower;
 	ret = ieee80211_raw_output(vap, ni, m, &params);
 	IEEE80211_TX_UNLOCK(ic);
 	return (ret);
 }
 
 #define ADDSHORT(frm, v) do {		\
 	le16enc(frm, v);		\
 	frm += 2;			\
 } while (0)
 #define ADDWORD(frm, v) do {		\
 	le32enc(frm, v);		\
 	frm += 4;			\
 } while (0)
 /*
  * Add a Mesh Path Request IE to a frame.
  */
 #define	PREQ_TFLAGS(n)	preq->preq_targets[n].target_flags
 #define	PREQ_TADDR(n)	preq->preq_targets[n].target_addr
 #define	PREQ_TSEQ(n)	preq->preq_targets[n].target_seq
 static uint8_t *
 hwmp_add_meshpreq(uint8_t *frm, const struct ieee80211_meshpreq_ie *preq)
 {
 	int i;
 
 	*frm++ = IEEE80211_ELEMID_MESHPREQ;
 	*frm++ = preq->preq_len;	/* len already calculated */
 	*frm++ = preq->preq_flags;
 	*frm++ = preq->preq_hopcount;
 	*frm++ = preq->preq_ttl;
 	ADDWORD(frm, preq->preq_id);
 	IEEE80211_ADDR_COPY(frm, preq->preq_origaddr); frm += 6;
 	ADDWORD(frm, preq->preq_origseq);
 	if (preq->preq_flags & IEEE80211_MESHPREQ_FLAGS_AE) {
 		IEEE80211_ADDR_COPY(frm, preq->preq_orig_ext_addr);
 		frm += 6;
 	}
 	ADDWORD(frm, preq->preq_lifetime);
 	ADDWORD(frm, preq->preq_metric);
 	*frm++ = preq->preq_tcount;
 	for (i = 0; i < preq->preq_tcount; i++) {
 		*frm++ = PREQ_TFLAGS(i);
 		IEEE80211_ADDR_COPY(frm, PREQ_TADDR(i));
 		frm += 6;
 		ADDWORD(frm, PREQ_TSEQ(i));
 	}
 	return frm;
 }
 #undef	PREQ_TFLAGS
 #undef	PREQ_TADDR
 #undef	PREQ_TSEQ
 
 /*
  * Add a Mesh Path Reply IE to a frame.
  */
 static uint8_t *
 hwmp_add_meshprep(uint8_t *frm, const struct ieee80211_meshprep_ie *prep)
 {
 	*frm++ = IEEE80211_ELEMID_MESHPREP;
 	*frm++ = prep->prep_len;	/* len already calculated */
 	*frm++ = prep->prep_flags;
 	*frm++ = prep->prep_hopcount;
 	*frm++ = prep->prep_ttl;
 	IEEE80211_ADDR_COPY(frm, prep->prep_targetaddr); frm += 6;
 	ADDWORD(frm, prep->prep_targetseq);
 	if (prep->prep_flags & IEEE80211_MESHPREP_FLAGS_AE) {
 		IEEE80211_ADDR_COPY(frm, prep->prep_target_ext_addr);
 		frm += 6;
 	}
 	ADDWORD(frm, prep->prep_lifetime);
 	ADDWORD(frm, prep->prep_metric);
 	IEEE80211_ADDR_COPY(frm, prep->prep_origaddr); frm += 6;
 	ADDWORD(frm, prep->prep_origseq);
 	return frm;
 }
 
 /*
  * Add a Mesh Path Error IE to a frame.
  */
 #define	PERR_DFLAGS(n)	perr->perr_dests[n].dest_flags
 #define	PERR_DADDR(n)	perr->perr_dests[n].dest_addr
 #define	PERR_DSEQ(n)	perr->perr_dests[n].dest_seq
 #define	PERR_EXTADDR(n)	perr->perr_dests[n].dest_ext_addr
 #define	PERR_DRCODE(n)	perr->perr_dests[n].dest_rcode
 static uint8_t *
 hwmp_add_meshperr(uint8_t *frm, const struct ieee80211_meshperr_ie *perr)
 {
 	int i;
 
 	*frm++ = IEEE80211_ELEMID_MESHPERR;
 	*frm++ = perr->perr_len;	/* len already calculated */
 	*frm++ = perr->perr_ttl;
 	*frm++ = perr->perr_ndests;
 	for (i = 0; i < perr->perr_ndests; i++) {
 		*frm++ = PERR_DFLAGS(i);
 		IEEE80211_ADDR_COPY(frm, PERR_DADDR(i));
 		frm += 6;
 		ADDWORD(frm, PERR_DSEQ(i));
 		if (PERR_DFLAGS(i) & IEEE80211_MESHPERR_FLAGS_AE) {
 			IEEE80211_ADDR_COPY(frm, PERR_EXTADDR(i));
 			frm += 6;
 		}
 		ADDSHORT(frm, PERR_DRCODE(i));
 	}
 	return frm;
 }
 #undef	PERR_DFLAGS
 #undef	PERR_DADDR
 #undef	PERR_DSEQ
 #undef	PERR_EXTADDR
 #undef	PERR_DRCODE
 
 /*
  * Add a Root Annoucement IE to a frame.
  */
 static uint8_t *
 hwmp_add_meshrann(uint8_t *frm, const struct ieee80211_meshrann_ie *rann)
 {
 	*frm++ = IEEE80211_ELEMID_MESHRANN;
 	*frm++ = rann->rann_len;
 	*frm++ = rann->rann_flags;
 	*frm++ = rann->rann_hopcount;
 	*frm++ = rann->rann_ttl;
 	IEEE80211_ADDR_COPY(frm, rann->rann_addr); frm += 6;
 	ADDWORD(frm, rann->rann_seq);
 	ADDWORD(frm, rann->rann_interval);
 	ADDWORD(frm, rann->rann_metric);
 	return frm;
 }
 
 static void
 hwmp_rootmode_setup(struct ieee80211vap *vap)
 {
 	struct ieee80211_hwmp_state *hs = vap->iv_hwmp;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 
 	switch (hs->hs_rootmode) {
 	case IEEE80211_HWMP_ROOTMODE_DISABLED:
 		callout_drain(&hs->hs_roottimer);
 		ms->ms_flags &= ~IEEE80211_MESHFLAGS_ROOT;
 		break;
 	case IEEE80211_HWMP_ROOTMODE_NORMAL:
 	case IEEE80211_HWMP_ROOTMODE_PROACTIVE:
 		callout_reset(&hs->hs_roottimer, ieee80211_hwmp_rootint,
 		    hwmp_rootmode_cb, vap);
 		ms->ms_flags |= IEEE80211_MESHFLAGS_ROOT;
 		break;
 	case IEEE80211_HWMP_ROOTMODE_RANN:
 		callout_reset(&hs->hs_roottimer, ieee80211_hwmp_rannint,
 		    hwmp_rootmode_rann_cb, vap);
 		ms->ms_flags |= IEEE80211_MESHFLAGS_ROOT;
 		break;
 	}
 }
 
 /*
  * Send a broadcast Path Request to find all nodes on the mesh. We are
  * called when the vap is configured as a HWMP root node.
  */
 #define	PREQ_TFLAGS(n)	preq.preq_targets[n].target_flags
 #define	PREQ_TADDR(n)	preq.preq_targets[n].target_addr
 #define	PREQ_TSEQ(n)	preq.preq_targets[n].target_seq
 static void
 hwmp_rootmode_cb(void *arg)
 {
 	struct ieee80211vap *vap = (struct ieee80211vap *)arg;
 	struct ieee80211_hwmp_state *hs = vap->iv_hwmp;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_meshpreq_ie preq;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, vap->iv_bss,
 	    "%s", "send broadcast PREQ");
 
 	preq.preq_flags = 0;
 	if (ms->ms_flags & IEEE80211_MESHFLAGS_GATE)
 		preq.preq_flags |= IEEE80211_MESHPREQ_FLAGS_GATE;
 	if (hs->hs_rootmode == IEEE80211_HWMP_ROOTMODE_PROACTIVE)
 		preq.preq_flags |= IEEE80211_MESHPREQ_FLAGS_PP;
 	preq.preq_hopcount = 0;
 	preq.preq_ttl = ms->ms_ttl;
 	preq.preq_id = ++hs->hs_preqid;
 	IEEE80211_ADDR_COPY(preq.preq_origaddr, vap->iv_myaddr);
 	preq.preq_origseq = ++hs->hs_seq;
 	preq.preq_lifetime = ticks_to_msecs(ieee80211_hwmp_roottimeout);
 	preq.preq_metric = IEEE80211_MESHLMETRIC_INITIALVAL;
 	preq.preq_tcount = 1;
 	IEEE80211_ADDR_COPY(PREQ_TADDR(0), broadcastaddr);
 	PREQ_TFLAGS(0) = IEEE80211_MESHPREQ_TFLAGS_TO |
 	    IEEE80211_MESHPREQ_TFLAGS_USN;
 	PREQ_TSEQ(0) = 0;
 	vap->iv_stats.is_hwmp_rootreqs++;
 	/* NB: we enforce rate check ourself */
 	hwmp_send_preq(vap, broadcastaddr, &preq, NULL, NULL);
 	hwmp_rootmode_setup(vap);
 }
 #undef	PREQ_TFLAGS
 #undef	PREQ_TADDR
 #undef	PREQ_TSEQ
 
 /*
  * Send a Root Annoucement (RANN) to find all the nodes on the mesh. We are
  * called when the vap is configured as a HWMP RANN root node.
  */
 static void
 hwmp_rootmode_rann_cb(void *arg)
 {
 	struct ieee80211vap *vap = (struct ieee80211vap *)arg;
 	struct ieee80211_hwmp_state *hs = vap->iv_hwmp;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_meshrann_ie rann;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, vap->iv_bss,
 	    "%s", "send broadcast RANN");
 
 	rann.rann_flags = 0;
 	if (ms->ms_flags & IEEE80211_MESHFLAGS_GATE)
 		rann.rann_flags |= IEEE80211_MESHFLAGS_GATE;
 	rann.rann_hopcount = 0;
 	rann.rann_ttl = ms->ms_ttl;
 	IEEE80211_ADDR_COPY(rann.rann_addr, vap->iv_myaddr);
 	rann.rann_seq = ++hs->hs_seq;
 	rann.rann_interval = ieee80211_hwmp_rannint;
 	rann.rann_metric = IEEE80211_MESHLMETRIC_INITIALVAL;
 
 	vap->iv_stats.is_hwmp_rootrann++;
 	hwmp_send_rann(vap, broadcastaddr, &rann);
 	hwmp_rootmode_setup(vap);
 }
 
 /*
  * Update forwarding information to TA if metric improves.
  */
 static void
 hwmp_update_transmitter(struct ieee80211vap *vap, struct ieee80211_node *ni,
     const char *hwmp_frame)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rttran = NULL;	/* Transmitter */
 	int metric = 0;
 
 	rttran = ieee80211_mesh_rt_find(vap, ni->ni_macaddr);
 	if (rttran == NULL) {
 		rttran = ieee80211_mesh_rt_add(vap, ni->ni_macaddr);
 		if (rttran == NULL) {
 			IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 			    "unable to add path to transmitter %6D of %s",
 			    ni->ni_macaddr, ":", hwmp_frame);
 			vap->iv_stats.is_mesh_rtaddfailed++;
 			return;
 		}
 	}
 	metric = ms->ms_pmetric->mpm_metric(ni);
 	if (!(rttran->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) ||
 	    rttran->rt_metric > metric)
 	{
 		IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 		    "%s path to transmitter %6D of %s, metric %d:%d",
 		    rttran->rt_flags & IEEE80211_MESHRT_FLAGS_VALID ?
 		    "prefer" : "update", ni->ni_macaddr, ":", hwmp_frame,
 		    rttran->rt_metric, metric);
 		IEEE80211_ADDR_COPY(rttran->rt_nexthop, ni->ni_macaddr);
 		rttran->rt_metric = metric;
 		rttran->rt_nhops  = 1;
 		ieee80211_mesh_rt_update(rttran, ms->ms_ppath->mpp_inact);
 		rttran->rt_flags = IEEE80211_MESHRT_FLAGS_VALID;
 	}
 }
 
 #define	PREQ_TFLAGS(n)	preq->preq_targets[n].target_flags
 #define	PREQ_TADDR(n)	preq->preq_targets[n].target_addr
 #define	PREQ_TSEQ(n)	preq->preq_targets[n].target_seq
 static void
 hwmp_recv_preq(struct ieee80211vap *vap, struct ieee80211_node *ni,
     const struct ieee80211_frame *wh, const struct ieee80211_meshpreq_ie *preq)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rtorig = NULL;
 	struct ieee80211_mesh_route *rtorig_ext = NULL;
 	struct ieee80211_mesh_route *rttarg = NULL;
 	struct ieee80211_hwmp_route *hrorig = NULL;
 	struct ieee80211_hwmp_route *hrtarg = NULL;
 	struct ieee80211_hwmp_state *hs = vap->iv_hwmp;
 	ieee80211_hwmp_seq preqid;	/* last seen preqid for orig */
 	uint32_t metric = 0;
 
 	/*
 	 * Ignore PREQs from us. Could happen because someone forward it
 	 * back to us.
 	 */
 	if (IEEE80211_ADDR_EQ(vap->iv_myaddr, preq->preq_origaddr))
 		return;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 	    "received PREQ, orig %6D, targ(0) %6D", preq->preq_origaddr, ":",
 	    PREQ_TADDR(0), ":");
 
 	/*
 	 * Acceptance criteria: (if the PREQ is not for us or not broadcast,
 	 * or an external mac address not proxied by us),
 	 * AND forwarding is disabled, discard this PREQ.
 	 */
 	rttarg = ieee80211_mesh_rt_find(vap, PREQ_TADDR(0));
 	if (!(ms->ms_flags & IEEE80211_MESHFLAGS_FWD) &&
 	    (!IEEE80211_ADDR_EQ(vap->iv_myaddr, PREQ_TADDR(0)) ||
 	    !IEEE80211_IS_MULTICAST(PREQ_TADDR(0)) ||
 	    (rttarg != NULL &&
 	    rttarg->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY &&
 	    IEEE80211_ADDR_EQ(vap->iv_myaddr, rttarg->rt_mesh_gate)))) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_HWMP,
 		    preq->preq_origaddr, NULL, "%s", "not accepting PREQ");
 		return;
 	}
 	/*
 	 * Acceptance criteria: if unicast addressed 
 	 * AND no valid forwarding for Target of PREQ, discard this PREQ.
 	 */
 	if(rttarg != NULL)
 		hrtarg = IEEE80211_MESH_ROUTE_PRIV(rttarg,
 		    struct ieee80211_hwmp_route);
 	/* Address mode: ucast */
 	if(preq->preq_flags & IEEE80211_MESHPREQ_FLAGS_AM &&
 	    rttarg == NULL &&
 	    !IEEE80211_ADDR_EQ(vap->iv_myaddr, PREQ_TADDR(0))) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_HWMP,
 		    preq->preq_origaddr, NULL,
 		    "unicast addressed PREQ of unknown target %6D",
 		    PREQ_TADDR(0), ":");
 		return;
 	}
 
 	/* PREQ ACCEPTED */
 
 	rtorig = ieee80211_mesh_rt_find(vap, preq->preq_origaddr);
 	if (rtorig == NULL) {
 		rtorig = ieee80211_mesh_rt_add(vap, preq->preq_origaddr);
 		if (rtorig == NULL) {
 			IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 			    "unable to add orig path to %6D",
 			    preq->preq_origaddr, ":");
 			vap->iv_stats.is_mesh_rtaddfailed++;
 			return;
 		}
 		IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 		    "adding originator %6D", preq->preq_origaddr, ":");
 	}
 	hrorig = IEEE80211_MESH_ROUTE_PRIV(rtorig, struct ieee80211_hwmp_route);
 
 	/* record last seen preqid */
 	preqid = hrorig->hr_preqid;
 	hrorig->hr_preqid = HWMP_SEQ_MAX(hrorig->hr_preqid, preq->preq_id);
 
 	/* Data creation and update of forwarding information
 	 * according to Table 11C-8 for originator mesh STA.
 	 */
 	metric = preq->preq_metric + ms->ms_pmetric->mpm_metric(ni);
 	if (HWMP_SEQ_GT(preq->preq_origseq, hrorig->hr_seq) ||
 	    (HWMP_SEQ_EQ(preq->preq_origseq, hrorig->hr_seq) &&
 	    metric < rtorig->rt_metric)) {
 		hrorig->hr_seq = preq->preq_origseq;
 		IEEE80211_ADDR_COPY(rtorig->rt_nexthop, wh->i_addr2);
 		rtorig->rt_metric = metric;
 		rtorig->rt_nhops  = preq->preq_hopcount + 1;
 		ieee80211_mesh_rt_update(rtorig, preq->preq_lifetime);
 		/* Path to orig is valid now.
 		 * NB: we know it can't be Proxy, and if it is GATE
 		 * it will be marked below.
 		 */
 		rtorig->rt_flags = IEEE80211_MESHRT_FLAGS_VALID;
 	} else if ((hrtarg != NULL &&
 	    !HWMP_SEQ_EQ(hrtarg->hr_seq, PREQ_TSEQ(0))) ||
 	    (rtorig->rt_flags & IEEE80211_MESHRT_FLAGS_VALID &&
 	    preqid >= preq->preq_id)) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 		    "discard PREQ from %6D, old seqno %u <= %u,"
 		    " or old preqid %u < %u",
 		    preq->preq_origaddr, ":",
 		    preq->preq_origseq, hrorig->hr_seq,
 		    preq->preq_id, preqid);
 		return;
 	}
 
 	/* Update forwarding information to TA if metric improves. */
 	hwmp_update_transmitter(vap, ni, "PREQ");
 
 	/*
 	 * Check if the PREQ is addressed to us.
 	 * or a Proxy currently gated by us.
 	 */
 	if (IEEE80211_ADDR_EQ(vap->iv_myaddr, PREQ_TADDR(0)) ||
 	    (ms->ms_flags & IEEE80211_MESHFLAGS_GATE &&
 	    rttarg != NULL &&
 	    IEEE80211_ADDR_EQ(vap->iv_myaddr, rttarg->rt_mesh_gate) &&
 	    rttarg->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY &&
 	    rttarg->rt_flags & IEEE80211_MESHRT_FLAGS_VALID)) {
 		struct ieee80211_meshprep_ie prep;
 
 		/*
 		 * When we are the target we shall update our own HWMP seq
 		 * number with max of (current and preq->seq) + 1
 		 */
 		hs->hs_seq = HWMP_SEQ_MAX(hs->hs_seq, PREQ_TSEQ(0)) + 1;
 
 		prep.prep_flags = 0;
 		prep.prep_hopcount = 0;
 		prep.prep_metric = IEEE80211_MESHLMETRIC_INITIALVAL;
 		IEEE80211_ADDR_COPY(prep.prep_targetaddr, vap->iv_myaddr);
 		if (rttarg != NULL && /* if NULL it means we are the target */
 		    rttarg->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY) {
 			IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 			    "reply for proxy %6D", rttarg->rt_dest, ":");
 			prep.prep_flags |= IEEE80211_MESHPREP_FLAGS_AE;
 			IEEE80211_ADDR_COPY(prep.prep_target_ext_addr,
 			    rttarg->rt_dest);
 			/* update proxy seqno to HWMP seqno */
 			rttarg->rt_ext_seq = hs->hs_seq;
 			prep.prep_hopcount = rttarg->rt_nhops;
 			prep.prep_metric = rttarg->rt_metric;
 			IEEE80211_ADDR_COPY(prep.prep_targetaddr, rttarg->rt_mesh_gate);
 		}
 		/*
 		 * Build and send a PREP frame.
 		 */
 		prep.prep_ttl = ms->ms_ttl;
 		prep.prep_targetseq = hs->hs_seq;
 		prep.prep_lifetime = preq->preq_lifetime;
 		IEEE80211_ADDR_COPY(prep.prep_origaddr, preq->preq_origaddr);
 		prep.prep_origseq = preq->preq_origseq;
 
 		IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 		    "reply to %6D", preq->preq_origaddr, ":");
 		hwmp_send_prep(vap, wh->i_addr2, &prep);
 		return;
 	}
 	/* we may update our proxy information for the orig external */
 	else if (preq->preq_flags & IEEE80211_MESHPREQ_FLAGS_AE) {
 		rtorig_ext =
 		    ieee80211_mesh_rt_find(vap, preq->preq_orig_ext_addr);
 		if (rtorig_ext == NULL) {
 			rtorig_ext = ieee80211_mesh_rt_add(vap,
 			    preq->preq_orig_ext_addr);
 			if (rtorig_ext == NULL) {
 				IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 				    "unable to add orig ext proxy to %6D",
 				    preq->preq_orig_ext_addr, ":");
 				vap->iv_stats.is_mesh_rtaddfailed++;
 				return;
 			}
 			IEEE80211_ADDR_COPY(rtorig_ext->rt_mesh_gate,
 			    preq->preq_origaddr);
 		}
 		rtorig_ext->rt_ext_seq = preq->preq_origseq;
 		ieee80211_mesh_rt_update(rtorig_ext, preq->preq_lifetime);
 	}
 	/*
 	 * Proactive PREQ: reply with a proactive PREP to the
 	 * root STA if requested.
 	 */
 	if (IEEE80211_ADDR_EQ(PREQ_TADDR(0), broadcastaddr) &&
 	    (PREQ_TFLAGS(0) & IEEE80211_MESHPREQ_TFLAGS_TO)) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 		    "root mesh station @ %6D", preq->preq_origaddr, ":");
 
 		/* Check if root is a mesh gate, mark it */
 		if (preq->preq_flags & IEEE80211_MESHPREQ_FLAGS_GATE) {
 			struct ieee80211_mesh_gate_route *gr;
 
 			rtorig->rt_flags |= IEEE80211_MESHRT_FLAGS_GATE;
 			gr = ieee80211_mesh_mark_gate(vap, preq->preq_origaddr,
 			    rtorig);
 			gr->gr_lastseq = 0; /* NOT GANN */
 		}
 
 		/*
 		 * Reply with a PREP if we don't have a path to the root
 		 * or if the root sent us a proactive PREQ.
 		 */
 		if ((rtorig->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0 ||
 		    (preq->preq_flags & IEEE80211_MESHPREQ_FLAGS_PP)) {
 			struct ieee80211_meshprep_ie prep;
 
 			prep.prep_flags = 0;
 			prep.prep_hopcount = 0;
 			prep.prep_ttl = ms->ms_ttl;
 			IEEE80211_ADDR_COPY(prep.prep_origaddr,
 			    preq->preq_origaddr);
 			prep.prep_origseq = preq->preq_origseq;
 			prep.prep_lifetime = preq->preq_lifetime;
 			prep.prep_metric = IEEE80211_MESHLMETRIC_INITIALVAL;
 			IEEE80211_ADDR_COPY(prep.prep_targetaddr,
 			    vap->iv_myaddr);
 			prep.prep_targetseq = ++hs->hs_seq;
 			hwmp_send_prep(vap, rtorig->rt_nexthop, &prep);
 		}
 	}
 
 	/*
 	 * Forwarding and Intermediate reply for PREQs with 1 target.
 	 */
 	if ((preq->preq_tcount == 1) && (preq->preq_ttl > 1) &&
 	    (ms->ms_flags & IEEE80211_MESHFLAGS_FWD)) {
 		struct ieee80211_meshpreq_ie ppreq; /* propagated PREQ */
 
 		memcpy(&ppreq, preq, sizeof(ppreq));
 
 		/*
 		 * We have a valid route to this node.
 		 * NB: if target is proxy dont reply.
 		 */
 		if (rttarg != NULL &&
 		    rttarg->rt_flags & IEEE80211_MESHRT_FLAGS_VALID &&
 		    !(rttarg->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY)) {
 			/*
 			 * Check if we can send an intermediate Path Reply,
 			 * i.e., Target Only bit is not set and target is not
 			 * the MAC broadcast address.
 			 */
 			if (!(PREQ_TFLAGS(0) & IEEE80211_MESHPREQ_TFLAGS_TO) &&
 			    !IEEE80211_ADDR_EQ(PREQ_TADDR(0), broadcastaddr)) {
 				struct ieee80211_meshprep_ie prep;
 
 				IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 				    "intermediate reply for PREQ from %6D",
 				    preq->preq_origaddr, ":");
 				prep.prep_flags = 0;
 				prep.prep_hopcount = rttarg->rt_nhops;
 				prep.prep_ttl = ms->ms_ttl;
 				IEEE80211_ADDR_COPY(&prep.prep_targetaddr,
 				    PREQ_TADDR(0));
 				prep.prep_targetseq = hrtarg->hr_seq;
 				prep.prep_lifetime = preq->preq_lifetime;
 				prep.prep_metric =rttarg->rt_metric;
 				IEEE80211_ADDR_COPY(&prep.prep_origaddr,
 				    preq->preq_origaddr);
 				prep.prep_origseq = hrorig->hr_seq;
 				hwmp_send_prep(vap, rtorig->rt_nexthop, &prep);
 
 				/*
 				 * Set TO and unset RF bits because we have
 				 * sent a PREP.
 				 */
 				ppreq.preq_targets[0].target_flags |=
 				    IEEE80211_MESHPREQ_TFLAGS_TO;
 			}
 		}
 
 		IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 		    "forward PREQ from %6D",
 		    preq->preq_origaddr, ":");
 		ppreq.preq_hopcount += 1;
 		ppreq.preq_ttl -= 1;
 		ppreq.preq_metric += ms->ms_pmetric->mpm_metric(ni);
 
 		/* don't do PREQ ratecheck when we propagate */
 		hwmp_send_preq(vap, broadcastaddr, &ppreq, NULL, NULL);
 	}
 }
 #undef	PREQ_TFLAGS
 #undef	PREQ_TADDR
 #undef	PREQ_TSEQ
 
 static int
 hwmp_send_preq(struct ieee80211vap *vap,
     const uint8_t da[IEEE80211_ADDR_LEN],
     struct ieee80211_meshpreq_ie *preq,
     struct timeval *last, struct timeval *minint)
 {
 
 	/*
 	 * Enforce PREQ interval.
 	 * NB: Proactive ROOT PREQs rate is handled by cb task.
 	 */
 	if (last != NULL && minint != NULL) {
 		if (ratecheck(last, minint) == 0)
 			return EALREADY; /* XXX: we should postpone */
 		getmicrouptime(last);
 	}
 
 	/*
 	 * mesh preq action frame format
 	 *     [6] da
 	 *     [6] sa
 	 *     [6] addr3 = sa
 	 *     [1] action
 	 *     [1] category
 	 *     [tlv] mesh path request
 	 */
 	preq->preq_ie = IEEE80211_ELEMID_MESHPREQ;
 	preq->preq_len = (preq->preq_flags & IEEE80211_MESHPREQ_FLAGS_AE ?
 	    IEEE80211_MESHPREQ_BASE_SZ_AE : IEEE80211_MESHPREQ_BASE_SZ) +
 	    preq->preq_tcount * IEEE80211_MESHPREQ_TRGT_SZ;
 	return hwmp_send_action(vap, da, (uint8_t *)preq, preq->preq_len+2);
 }
 
 static void
 hwmp_recv_prep(struct ieee80211vap *vap, struct ieee80211_node *ni,
     const struct ieee80211_frame *wh, const struct ieee80211_meshprep_ie *prep)
 {
 #define	IS_PROXY(rt)	(rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY)
 #define	PROXIED_BY_US(rt)		\
     (IEEE80211_ADDR_EQ(vap->iv_myaddr, rt->rt_mesh_gate))
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_hwmp_state *hs = vap->iv_hwmp;
 	struct ieee80211_mesh_route *rt = NULL;
 	struct ieee80211_mesh_route *rtorig = NULL;
 	struct ieee80211_mesh_route *rtext = NULL;
 	struct ieee80211_hwmp_route *hr;
 	struct ieee80211com *ic = vap->iv_ic;
 	struct mbuf *m, *next;
 	uint32_t metric = 0;
 	const uint8_t *addr;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 	    "received PREP, orig %6D, targ %6D", prep->prep_origaddr, ":",
 	    prep->prep_targetaddr, ":");
 
 	/*
 	 * Acceptance criteria: (If the corresponding PREP was not generated
 	 * by us OR not generated by an external mac that is not proxied by us)
 	 * AND forwarding is disabled, discard this PREP.
 	 */
 	rtorig = ieee80211_mesh_rt_find(vap, prep->prep_origaddr);
 	if ((!IEEE80211_ADDR_EQ(vap->iv_myaddr, prep->prep_origaddr) ||
 	    (rtorig != NULL && IS_PROXY(rtorig) && !PROXIED_BY_US(rtorig))) &&
 	    !(ms->ms_flags & IEEE80211_MESHFLAGS_FWD)){
 		IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 		    "discard PREP, orig(%6D) not proxied or generated by us",
 		    prep->prep_origaddr, ":");
 		return;
 	}
 
 	/* PREP ACCEPTED */
 
 	/*
 	 * If accepted shall create or update the active forwarding information
 	 * it maintains for the target mesh STA of the PREP (according to the
 	 * rules defined in 13.10.8.4). If the conditions for creating or
 	 * updating the forwarding information have not been met in those
 	 * rules, no further steps are applied to the PREP.
 	 */
 	rt = ieee80211_mesh_rt_find(vap, prep->prep_targetaddr);
 	if (rt == NULL) {
 		rt = ieee80211_mesh_rt_add(vap, prep->prep_targetaddr);
 		if (rt == NULL) {
 			IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 			    "unable to add PREP path to %6D",
 			    prep->prep_targetaddr, ":");
 			vap->iv_stats.is_mesh_rtaddfailed++;
 			return;
 		}
 		IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 		    "adding target %6D", prep->prep_targetaddr, ":");
 	}
 	hr = IEEE80211_MESH_ROUTE_PRIV(rt, struct ieee80211_hwmp_route);
 	/* update path metric */
 	metric = prep->prep_metric + ms->ms_pmetric->mpm_metric(ni);
 	if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID)) {
 		if (HWMP_SEQ_LT(prep->prep_targetseq, hr->hr_seq)) {
 			IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 			    "discard PREP from %6D, old seq no %u < %u",
 			    prep->prep_targetaddr, ":",
 			    prep->prep_targetseq, hr->hr_seq);
 			return;
 		} else if (HWMP_SEQ_LEQ(prep->prep_targetseq, hr->hr_seq) &&
 		    metric > rt->rt_metric) {
 			IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 			    "discard PREP from %6D, new metric %u > %u",
 			    prep->prep_targetaddr, ":",
 			    metric, rt->rt_metric);
 			return;
 		}
 	}
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 	    "%s path to %6D, hopcount %d:%d metric %d:%d",
 	    rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID ?
 	    "prefer" : "update",
 	    prep->prep_targetaddr, ":",
 	    rt->rt_nhops, prep->prep_hopcount + 1,
 	    rt->rt_metric, metric);
 
 	hr->hr_seq = prep->prep_targetseq;
 	hr->hr_preqretries = 0;
 	IEEE80211_ADDR_COPY(rt->rt_nexthop, ni->ni_macaddr);
 	rt->rt_metric = metric;
 	rt->rt_nhops = prep->prep_hopcount + 1;
 	ieee80211_mesh_rt_update(rt, prep->prep_lifetime);
 	if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_DISCOVER) {
 		/* discovery complete */
 		rt->rt_flags &= ~IEEE80211_MESHRT_FLAGS_DISCOVER;
 	}
 	rt->rt_flags |= IEEE80211_MESHRT_FLAGS_VALID; /* mark valid */
 
 	/* Update forwarding information to TA if metric improves */
 	hwmp_update_transmitter(vap, ni, "PREP");
 
 	/*
 	 * If it's NOT for us, propagate the PREP
 	 */
 	if (!IEEE80211_ADDR_EQ(vap->iv_myaddr, prep->prep_origaddr) &&
 	    prep->prep_ttl > 1 &&
 	    prep->prep_hopcount < hs->hs_maxhops) {
 		struct ieee80211_meshprep_ie pprep; /* propagated PREP */
 		/*
 		 * NB: We should already have setup the path to orig
 		 * mesh STA when we propagated PREQ to target mesh STA,
 		 * no PREP is generated without a corresponding PREQ.
 		 * XXX: for now just ignore.
 		 */
 		if (rtorig == NULL) {
 			IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 			    "received PREP for an unknown orig(%6D)",
 			    prep->prep_origaddr, ":");
 			return;
 		}
 
 		IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 		    "propagate PREP from %6D",
 		    prep->prep_targetaddr, ":");
 
 		memcpy(&pprep, prep, sizeof(pprep));
 		pprep.prep_hopcount += 1;
 		pprep.prep_ttl -= 1;
 		pprep.prep_metric += ms->ms_pmetric->mpm_metric(ni);
 		hwmp_send_prep(vap, rtorig->rt_nexthop, &pprep);
 
 		/* precursor list for the Target Mesh STA Address is updated */
 	}
 
 	/*
 	 * Check if we received a PREP w/ AE and store target external address.
 	 * We may store target external address if recevied PREP w/ AE
 	 * and we are not final destination
 	 */
 	if (prep->prep_flags & IEEE80211_MESHPREP_FLAGS_AE) {
 		rtext = ieee80211_mesh_rt_find(vap,
 			prep->prep_target_ext_addr);
 		if (rtext == NULL) {
 			rtext = ieee80211_mesh_rt_add(vap,
 				prep->prep_target_ext_addr);
 			if (rtext == NULL) {
 				IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 				    "unable to add PREP path to proxy %6D",
 				    prep->prep_targetaddr, ":");
 				vap->iv_stats.is_mesh_rtaddfailed++;
 				return;
 			}
 		}
 		IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 		    "%s path to %6D, hopcount %d:%d metric %d:%d",
 		    rtext->rt_flags & IEEE80211_MESHRT_FLAGS_VALID ?
 		    "prefer" : "update",
 		    prep->prep_target_ext_addr, ":",
 		    rtext->rt_nhops, prep->prep_hopcount + 1,
 		    rtext->rt_metric, metric);
 
 		rtext->rt_flags = IEEE80211_MESHRT_FLAGS_PROXY |
 			IEEE80211_MESHRT_FLAGS_VALID;
 		IEEE80211_ADDR_COPY(rtext->rt_dest,
 		    prep->prep_target_ext_addr);
 		IEEE80211_ADDR_COPY(rtext->rt_mesh_gate,
 		    prep->prep_targetaddr);
 		IEEE80211_ADDR_COPY(rtext->rt_nexthop, wh->i_addr2);
 		rtext->rt_metric = metric;
 		rtext->rt_lifetime = prep->prep_lifetime;
 		rtext->rt_nhops = prep->prep_hopcount + 1;
 		rtext->rt_ext_seq = prep->prep_origseq; /* new proxy seq */
 		/*
 		 * XXX: proxy entries have no HWMP priv data,
 		 * nullify them to be sure?
 		 */
 	}
 	/*
 	 * Check for frames queued awaiting path discovery.
 	 * XXX probably can tell exactly and avoid remove call
 	 * NB: hash may have false matches, if so they will get
 	 *     stuck back on the stageq because there won't be
 	 *     a path.
 	 */
 	addr = prep->prep_flags & IEEE80211_MESHPREP_FLAGS_AE ?
 	    prep->prep_target_ext_addr : prep->prep_targetaddr;
 	m = ieee80211_ageq_remove(&ic->ic_stageq,
 	    (struct ieee80211_node *)(uintptr_t)
 	    ieee80211_mac_hash(ic, addr)); /* either dest or ext_dest */
 
 	/*
 	 * All frames in the stageq here should be non-M_ENCAP; or things
 	 * will get very unhappy.
 	 */
 	for (; m != NULL; m = next) {
 		next = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 		    "flush queued frame %p len %d", m, m->m_pkthdr.len);
 		/*
 		 * If the mbuf has M_ENCAP set, ensure we free it.
 		 * Note that after if_transmit() is called, m is invalid.
 		 */
 		(void) ieee80211_vap_xmitpkt(vap, m);
 	}
 #undef	IS_PROXY
 #undef	PROXIED_BY_US
 }
 
 static int
 hwmp_send_prep(struct ieee80211vap *vap,
     const uint8_t da[IEEE80211_ADDR_LEN],
     struct ieee80211_meshprep_ie *prep)
 {
 	/* NB: there's no PREP minimum interval. */
 
 	/*
 	 * mesh prep action frame format
 	 *     [6] da
 	 *     [6] sa
 	 *     [6] addr3 = sa
 	 *     [1] action
 	 *     [1] category
 	 *     [tlv] mesh path reply
 	 */
 	prep->prep_ie = IEEE80211_ELEMID_MESHPREP;
 	prep->prep_len = prep->prep_flags & IEEE80211_MESHPREP_FLAGS_AE ?
 	    IEEE80211_MESHPREP_BASE_SZ_AE : IEEE80211_MESHPREP_BASE_SZ;
 	return hwmp_send_action(vap, da, (uint8_t *)prep, prep->prep_len + 2);
 }
 
 #define	PERR_DFLAGS(n)	perr.perr_dests[n].dest_flags
 #define	PERR_DADDR(n)	perr.perr_dests[n].dest_addr
 #define	PERR_DSEQ(n)	perr.perr_dests[n].dest_seq
 #define	PERR_DRCODE(n)	perr.perr_dests[n].dest_rcode
 static void
 hwmp_peerdown(struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_meshperr_ie perr;
 	struct ieee80211_mesh_route *rt;
 	struct ieee80211_hwmp_route *hr;
 
 	rt = ieee80211_mesh_rt_find(vap, ni->ni_macaddr);
 	if (rt == NULL)
 		return;
 	hr = IEEE80211_MESH_ROUTE_PRIV(rt, struct ieee80211_hwmp_route);
 	IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 	    "%s", "delete route entry");
 	perr.perr_ttl = ms->ms_ttl;
 	perr.perr_ndests = 1;
 	PERR_DFLAGS(0) = 0;
 	if (hr->hr_seq == 0)
 		PERR_DFLAGS(0) |= IEEE80211_MESHPERR_DFLAGS_USN;
 	PERR_DFLAGS(0) |= IEEE80211_MESHPERR_DFLAGS_RC;
 	IEEE80211_ADDR_COPY(PERR_DADDR(0), rt->rt_dest);
 	PERR_DSEQ(0) = ++hr->hr_seq;
 	PERR_DRCODE(0) = IEEE80211_REASON_MESH_PERR_DEST_UNREACH;
 	/* NB: flush everything passing through peer */
 	ieee80211_mesh_rt_flush_peer(vap, ni->ni_macaddr);
 	hwmp_send_perr(vap, broadcastaddr, &perr);
 }
 #undef	PERR_DFLAGS
 #undef	PERR_DADDR
 #undef	PERR_DSEQ
 #undef	PERR_DRCODE
 
 #define	PERR_DFLAGS(n)		perr->perr_dests[n].dest_flags
 #define	PERR_DADDR(n)		perr->perr_dests[n].dest_addr
 #define	PERR_DSEQ(n)		perr->perr_dests[n].dest_seq
 #define	PERR_DEXTADDR(n)	perr->perr_dests[n].dest_ext_addr
 static void
 hwmp_recv_perr(struct ieee80211vap *vap, struct ieee80211_node *ni,
     const struct ieee80211_frame *wh, const struct ieee80211_meshperr_ie *perr)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt = NULL;
 	struct ieee80211_mesh_route *rt_ext = NULL;
 	struct ieee80211_hwmp_route *hr;
 	struct ieee80211_meshperr_ie *pperr = NULL;
 	int i, j = 0, forward = 0;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 	    "received PERR from %6D", wh->i_addr2, ":");
 
 	/*
 	 * if forwarding is true, prepare pperr
 	 */
 	if (ms->ms_flags & IEEE80211_MESHFLAGS_FWD) {
 		forward = 1;
 		pperr = IEEE80211_MALLOC(sizeof(*perr) + 31*sizeof(*perr->perr_dests),
 		    M_80211_MESH_PERR, IEEE80211_M_NOWAIT); /* XXX: magic number, 32 err dests */
 	}
 
 	/*
 	 * Acceptance criteria: check if we have forwarding information
 	 * stored about destination, and that nexthop == TA of this PERR.
 	 * NB: we also build a new PERR to propagate in case we should forward.
 	 */
 	for (i = 0; i < perr->perr_ndests; i++) {
 		rt = ieee80211_mesh_rt_find(vap, PERR_DADDR(i));
 		if (rt == NULL)
 			continue;
 		if (!IEEE80211_ADDR_EQ(rt->rt_nexthop, wh->i_addr2))
 			continue;
 
 		/* found and accepted a PERR ndest element, process it... */
 		if (forward)
 			memcpy(&pperr->perr_dests[j], &perr->perr_dests[i],
 			    sizeof(*perr->perr_dests));
 		hr = IEEE80211_MESH_ROUTE_PRIV(rt, struct ieee80211_hwmp_route);
 		switch(PERR_DFLAGS(i)) {
 		case (IEEE80211_REASON_MESH_PERR_NO_FI):
 			if (PERR_DSEQ(i) == 0) {
 				hr->hr_seq++;
 				if (forward) {
 					pperr->perr_dests[j].dest_seq =
 					    hr->hr_seq;
 				}
 			} else {
 				hr->hr_seq = PERR_DSEQ(i);
 			}
 			rt->rt_flags &= ~IEEE80211_MESHRT_FLAGS_VALID;
 			j++;
 			break;
 		case (IEEE80211_REASON_MESH_PERR_DEST_UNREACH):
 			if(HWMP_SEQ_GT(PERR_DSEQ(i), hr->hr_seq)) {
 				hr->hr_seq = PERR_DSEQ(i);
 				rt->rt_flags &= ~IEEE80211_MESHRT_FLAGS_VALID;
 				j++;
 			}
 			break;
 		case (IEEE80211_REASON_MESH_PERR_NO_PROXY):
 			rt_ext = ieee80211_mesh_rt_find(vap, PERR_DEXTADDR(i));
 			if (rt_ext != NULL) {
 				rt_ext->rt_flags &=
 				    ~IEEE80211_MESHRT_FLAGS_VALID;
 				j++;
 			}
 			break;
 		default:
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_HWMP, wh, NULL,
 			    "PERR, unknown reason code %u\n", PERR_DFLAGS(i));
 			goto done; /* XXX: stats?? */
 		}
 		ieee80211_mesh_rt_flush_peer(vap, PERR_DADDR(i));
 		KASSERT(j < 32, ("PERR, error ndest >= 32 (%u)", j));
 	}
 	if (j == 0) {
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_HWMP, wh, NULL, "%s",
 		    "PERR not accepted");
 		goto done; /* XXX: stats?? */
 	}
 
 	/*
 	 * Propagate the PERR if we previously found it on our routing table.
 	 */
 	if (forward && perr->perr_ttl > 1) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni,
 		    "propagate PERR from %6D", wh->i_addr2, ":");
 		pperr->perr_ndests = j;
 		pperr->perr_ttl--;
 		hwmp_send_perr(vap, broadcastaddr, pperr);
 	}
 done:
 	if (pperr != NULL)
 		IEEE80211_FREE(pperr, M_80211_MESH_PERR);
 }
 #undef	PERR_DFLAGS
 #undef	PERR_DADDR
 #undef	PERR_DSEQ
 #undef	PERR_DEXTADDR
 
 static int
 hwmp_send_perr(struct ieee80211vap *vap,
     const uint8_t da[IEEE80211_ADDR_LEN],
     struct ieee80211_meshperr_ie *perr)
 {
 	struct ieee80211_hwmp_state *hs = vap->iv_hwmp;
 	int i;
 	uint8_t length = 0;
 
 	/*
 	 * Enforce PERR interval.
 	 */
 	if (ratecheck(&hs->hs_lastperr, &ieee80211_hwmp_perrminint) == 0)
 		return EALREADY;
 	getmicrouptime(&hs->hs_lastperr);
 
 	/*
 	 * mesh perr action frame format
 	 *     [6] da
 	 *     [6] sa
 	 *     [6] addr3 = sa
 	 *     [1] action
 	 *     [1] category
 	 *     [tlv] mesh path error
 	 */
 	perr->perr_ie = IEEE80211_ELEMID_MESHPERR;
 	length = IEEE80211_MESHPERR_BASE_SZ;
 	for (i = 0; i<perr->perr_ndests; i++) {
 		if (perr->perr_dests[i].dest_flags &
 		    IEEE80211_MESHPERR_FLAGS_AE) {
 			length += IEEE80211_MESHPERR_DEST_SZ_AE;
 			continue ;
 		}
 		length += IEEE80211_MESHPERR_DEST_SZ;
 	}
 	perr->perr_len =length;
 	return hwmp_send_action(vap, da, (uint8_t *)perr, perr->perr_len+2);
 }
 
 /*
  * Called from the rest of the net80211 code (mesh code for example).
  * NB: IEEE80211_REASON_MESH_PERR_DEST_UNREACH can be trigger by the fact that
  * a mesh STA is unable to forward an MSDU/MMPDU to a next-hop mesh STA.
  */
 #define	PERR_DFLAGS(n)		perr.perr_dests[n].dest_flags
 #define	PERR_DADDR(n)		perr.perr_dests[n].dest_addr
 #define	PERR_DSEQ(n)		perr.perr_dests[n].dest_seq
 #define	PERR_DEXTADDR(n)	perr.perr_dests[n].dest_ext_addr
 #define	PERR_DRCODE(n)		perr.perr_dests[n].dest_rcode
 static void
 hwmp_senderror(struct ieee80211vap *vap,
     const uint8_t addr[IEEE80211_ADDR_LEN],
     struct ieee80211_mesh_route *rt, int rcode)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_hwmp_route *hr = NULL;
 	struct ieee80211_meshperr_ie perr;
 
 	if (rt != NULL)
 		hr = IEEE80211_MESH_ROUTE_PRIV(rt,
 		    struct ieee80211_hwmp_route);
 
 	perr.perr_ndests = 1;
 	perr.perr_ttl = ms->ms_ttl;
 	PERR_DFLAGS(0) = 0;
 	PERR_DRCODE(0) = rcode;
 
 	switch (rcode) {
 	case IEEE80211_REASON_MESH_PERR_NO_FI:
 		IEEE80211_ADDR_COPY(PERR_DADDR(0), addr);
 		PERR_DSEQ(0) = 0; /* reserved */
 		break;
 	case IEEE80211_REASON_MESH_PERR_NO_PROXY:
 		KASSERT(rt != NULL, ("no proxy info for sending PERR"));
 		KASSERT(rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY,
 		    ("route is not marked proxy"));
 		PERR_DFLAGS(0) |= IEEE80211_MESHPERR_FLAGS_AE;
 		IEEE80211_ADDR_COPY(PERR_DADDR(0), vap->iv_myaddr);
 		PERR_DSEQ(0) = rt->rt_ext_seq;
 		IEEE80211_ADDR_COPY(PERR_DEXTADDR(0), addr);
 		break;
 	case IEEE80211_REASON_MESH_PERR_DEST_UNREACH:
 		KASSERT(rt != NULL, ("no route info for sending PERR"));
 		IEEE80211_ADDR_COPY(PERR_DADDR(0), addr);
 		PERR_DSEQ(0) = hr->hr_seq;
 		break;
 	default:
 		KASSERT(0, ("unknown reason code for HWMP PERR (%u)", rcode));
 	}
 	hwmp_send_perr(vap, broadcastaddr, &perr);
 }
 #undef	PERR_DFLAGS
 #undef	PEER_DADDR
 #undef	PERR_DSEQ
 #undef	PERR_DEXTADDR
 #undef	PERR_DRCODE
 
 static void
 hwmp_recv_rann(struct ieee80211vap *vap, struct ieee80211_node *ni,
     const struct ieee80211_frame *wh, const struct ieee80211_meshrann_ie *rann)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_hwmp_state *hs = vap->iv_hwmp;
 	struct ieee80211_mesh_route *rt = NULL;
 	struct ieee80211_hwmp_route *hr;
 	struct ieee80211_meshpreq_ie preq;
 	struct ieee80211_meshrann_ie prann;
 
 	if (IEEE80211_ADDR_EQ(rann->rann_addr, vap->iv_myaddr))
 		return;
 
 	rt = ieee80211_mesh_rt_find(vap, rann->rann_addr);
 	if (rt != NULL && rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) {
 		hr = IEEE80211_MESH_ROUTE_PRIV(rt, struct ieee80211_hwmp_route);
 
 		/* Acceptance criteria: if RANN.seq < stored seq, discard RANN */
 		if (HWMP_SEQ_LT(rann->rann_seq, hr->hr_seq)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_HWMP, wh, NULL,
 			"RANN seq %u < %u", rann->rann_seq, hr->hr_seq);
 			return;
 		}
 
 		/* Acceptance criteria: if RANN.seq == stored seq AND
 		* RANN.metric > stored metric, discard RANN */
 		if (HWMP_SEQ_EQ(rann->rann_seq, hr->hr_seq) &&
 		rann->rann_metric > rt->rt_metric) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_HWMP, wh, NULL,
 			"RANN metric %u > %u", rann->rann_metric, rt->rt_metric);
 			return;
 		}
 	}
 
 	/* RANN ACCEPTED */
 
 	ieee80211_hwmp_rannint = rann->rann_interval; /* XXX: mtx lock? */
 
 	if (rt == NULL) {
 		rt = ieee80211_mesh_rt_add(vap, rann->rann_addr);
 		if (rt == NULL) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_HWMP, wh, NULL,
 			    "unable to add mac for RANN root %6D",
 			    rann->rann_addr, ":");
 			    vap->iv_stats.is_mesh_rtaddfailed++;
 			return;
 		}
 	}
 	hr = IEEE80211_MESH_ROUTE_PRIV(rt, struct ieee80211_hwmp_route);
 	/* Check if root is a mesh gate, mark it */
 	if (rann->rann_flags & IEEE80211_MESHRANN_FLAGS_GATE) {
 		struct ieee80211_mesh_gate_route *gr;
 
 		rt->rt_flags |= IEEE80211_MESHRT_FLAGS_GATE;
 		gr = ieee80211_mesh_mark_gate(vap, rann->rann_addr,
 			rt);
 		gr->gr_lastseq = 0; /* NOT GANN */
 	}
 	/* discovery timeout */
 	ieee80211_mesh_rt_update(rt,
 	    ticks_to_msecs(ieee80211_hwmp_roottimeout));
 
 	preq.preq_flags = IEEE80211_MESHPREQ_FLAGS_AM;
 	preq.preq_hopcount = 0;
 	preq.preq_ttl = ms->ms_ttl;
 	preq.preq_id = 0; /* reserved */
 	IEEE80211_ADDR_COPY(preq.preq_origaddr, vap->iv_myaddr);
 	preq.preq_origseq = ++hs->hs_seq;
 	preq.preq_lifetime = ieee80211_hwmp_roottimeout;
 	preq.preq_metric = IEEE80211_MESHLMETRIC_INITIALVAL;
 	preq.preq_tcount = 1;
 	preq.preq_targets[0].target_flags = IEEE80211_MESHPREQ_TFLAGS_TO;
 	/* NB: IEEE80211_MESHPREQ_TFLAGS_USN = 0 implicitly implied */
 	IEEE80211_ADDR_COPY(preq.preq_targets[0].target_addr, rann->rann_addr);
 	preq.preq_targets[0].target_seq = rann->rann_seq;
 	/* XXX: if rootconfint have not passed, we built this preq in vain */
 	hwmp_send_preq(vap, wh->i_addr2, &preq, &hr->hr_lastrootconf,
 	    &ieee80211_hwmp_rootconfint);
 
 	/* propagate a RANN */
 	if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID &&
 	    rann->rann_ttl > 1 &&
 	    ms->ms_flags & IEEE80211_MESHFLAGS_FWD) {
 		hr->hr_seq = rann->rann_seq;
 		memcpy(&prann, rann, sizeof(prann));
 		prann.rann_hopcount += 1;
 		prann.rann_ttl -= 1;
 		prann.rann_metric += ms->ms_pmetric->mpm_metric(ni);
 		hwmp_send_rann(vap, broadcastaddr, &prann);
 	}
 }
 
 static int
 hwmp_send_rann(struct ieee80211vap *vap,
     const uint8_t da[IEEE80211_ADDR_LEN],
     struct ieee80211_meshrann_ie *rann)
 {
 	/*
 	 * mesh rann action frame format
 	 *     [6] da
 	 *     [6] sa
 	 *     [6] addr3 = sa
 	 *     [1] action
 	 *     [1] category
 	 *     [tlv] root annoucement
 	 */
 	rann->rann_ie = IEEE80211_ELEMID_MESHRANN;
 	rann->rann_len = IEEE80211_MESHRANN_BASE_SZ;
 	return hwmp_send_action(vap, da, (uint8_t *)rann, rann->rann_len + 2);
 }
 
 #define	PREQ_TFLAGS(n)	preq.preq_targets[n].target_flags
 #define	PREQ_TADDR(n)	preq.preq_targets[n].target_addr
 #define	PREQ_TSEQ(n)	preq.preq_targets[n].target_seq
 static void
 hwmp_rediscover_cb(void *arg)
 {
 	struct ieee80211_mesh_route *rt = arg;
 	struct ieee80211vap *vap = rt->rt_vap;
 	struct ieee80211_hwmp_state *hs = vap->iv_hwmp;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_hwmp_route *hr;
 	struct ieee80211_meshpreq_ie preq; /* Optimize: storing first preq? */
 
 	if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID))
 		return ; /* nothing to do */
 
 	hr = IEEE80211_MESH_ROUTE_PRIV(rt, struct ieee80211_hwmp_route);
 	if (hr->hr_preqretries >=
 		ieee80211_hwmp_maxpreq_retries) {
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_ANY,
 			rt->rt_dest, "%s",
 			"max number of discovery, send queued frames to GATE");
 		ieee80211_mesh_forward_to_gates(vap, rt);
 		vap->iv_stats.is_mesh_fwd_nopath++;
 		return ; /* XXX: flush queue? */
 	}
 
 	hr->hr_preqretries++;
 
 
 	IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP, rt->rt_dest,
 	    "start path rediscovery , target seq %u", hr->hr_seq);
 	/*
 	 * Try to discover the path for this node.
 	 * Group addressed PREQ Case A
 	 */
 	preq.preq_flags = 0;
 	preq.preq_hopcount = 0;
 	preq.preq_ttl = ms->ms_ttl;
 	preq.preq_id = ++hs->hs_preqid;
 	IEEE80211_ADDR_COPY(preq.preq_origaddr, vap->iv_myaddr);
 	preq.preq_origseq = hr->hr_origseq;
 	preq.preq_lifetime = ticks_to_msecs(ieee80211_hwmp_pathtimeout);
 	preq.preq_metric = IEEE80211_MESHLMETRIC_INITIALVAL;
 	preq.preq_tcount = 1;
 	IEEE80211_ADDR_COPY(PREQ_TADDR(0), rt->rt_dest);
 	PREQ_TFLAGS(0) = 0;
 	if (ieee80211_hwmp_targetonly)
 		PREQ_TFLAGS(0) |= IEEE80211_MESHPREQ_TFLAGS_TO;
 	PREQ_TFLAGS(0) |= IEEE80211_MESHPREQ_TFLAGS_USN;
 	PREQ_TSEQ(0) = 0; /* RESERVED when USN flag is set */
 	/* XXX check return value */
 	hwmp_send_preq(vap, broadcastaddr, &preq, &hr->hr_lastpreq,
 	    &ieee80211_hwmp_preqminint);
 	callout_reset(&rt->rt_discovery,
 		ieee80211_hwmp_net_diameter_traversaltime * 2,
 		hwmp_rediscover_cb, rt);
 }
 
 static struct ieee80211_node *
 hwmp_discover(struct ieee80211vap *vap,
     const uint8_t dest[IEEE80211_ADDR_LEN], struct mbuf *m)
 {
 	struct ieee80211_hwmp_state *hs = vap->iv_hwmp;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt = NULL;
 	struct ieee80211_hwmp_route *hr;
 	struct ieee80211_meshpreq_ie preq;
 	struct ieee80211_node *ni;
 	int sendpreq = 0;
 
 	KASSERT(vap->iv_opmode == IEEE80211_M_MBSS,
 	    ("not a mesh vap, opmode %d", vap->iv_opmode));
 
 	KASSERT(!IEEE80211_ADDR_EQ(vap->iv_myaddr, dest),
 	    ("%s: discovering self!", __func__));
 
 	ni = NULL;
 	if (!IEEE80211_IS_MULTICAST(dest)) {
 		rt = ieee80211_mesh_rt_find(vap, dest);
 		if (rt == NULL) {
 			rt = ieee80211_mesh_rt_add(vap, dest);
 			if (rt == NULL) {
 				IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP,
 				    ni, "unable to add discovery path to %6D",
 				    dest, ":");
 				vap->iv_stats.is_mesh_rtaddfailed++;
 				goto done;
 			}
 		}
 		hr = IEEE80211_MESH_ROUTE_PRIV(rt,
 		    struct ieee80211_hwmp_route);
 		if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_DISCOVER) {
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP, dest,
 			    "%s", "already discovering queue frame until path found");
 			sendpreq = 1;
 			goto done;
 		}
 		if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0) {
 			if (hr->hr_lastdiscovery != 0 &&
 			    (ticks - hr->hr_lastdiscovery <
 			    (ieee80211_hwmp_net_diameter_traversaltime * 2))) {
 				IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			            dest, NULL, "%s",
 				    "too frequent discovery requeust");
 				sendpreq = 1;
 				goto done;
 			}
 			hr->hr_lastdiscovery = ticks;
 			if (hr->hr_preqretries >=
 			    ieee80211_hwmp_maxpreq_retries) {
 				IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			            dest, NULL, "%s",
 				    "no valid path , max number of discovery");
 				vap->iv_stats.is_mesh_fwd_nopath++;
 				goto done;
 			}
 			rt->rt_flags = IEEE80211_MESHRT_FLAGS_DISCOVER;
 			hr->hr_preqretries++;
 			if (hr->hr_origseq == 0)
 				hr->hr_origseq = ++hs->hs_seq;
 			rt->rt_metric = IEEE80211_MESHLMETRIC_INITIALVAL;
 			sendpreq = 1;
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP, dest,
 			    "start path discovery (src %s), target seq %u",
 			    m == NULL ? "<none>" : ether_sprintf(
 			    mtod(m, struct ether_header *)->ether_shost),
 			    hr->hr_seq);
 			/*
 			 * Try to discover the path for this node.
 			 * Group addressed PREQ Case A
 			 */
 			preq.preq_flags = 0;
 			preq.preq_hopcount = 0;
 			preq.preq_ttl = ms->ms_ttl;
 			preq.preq_id = ++hs->hs_preqid;
 			IEEE80211_ADDR_COPY(preq.preq_origaddr, vap->iv_myaddr);
 			preq.preq_origseq = hr->hr_origseq;
 			preq.preq_lifetime =
 			    ticks_to_msecs(ieee80211_hwmp_pathtimeout);
 			preq.preq_metric = IEEE80211_MESHLMETRIC_INITIALVAL;
 			preq.preq_tcount = 1;
 			IEEE80211_ADDR_COPY(PREQ_TADDR(0), dest);
 			PREQ_TFLAGS(0) = 0;
 			if (ieee80211_hwmp_targetonly)
 				PREQ_TFLAGS(0) |= IEEE80211_MESHPREQ_TFLAGS_TO;
 			PREQ_TFLAGS(0) |= IEEE80211_MESHPREQ_TFLAGS_USN;
 			PREQ_TSEQ(0) = 0; /* RESERVED when USN flag is set */
 			/* XXX check return value */
 			hwmp_send_preq(vap, broadcastaddr, &preq,
 			    &hr->hr_lastpreq, &ieee80211_hwmp_preqminint);
 			callout_reset(&rt->rt_discovery,
 			    ieee80211_hwmp_net_diameter_traversaltime * 2,
 			    hwmp_rediscover_cb, rt);
 		}
 		if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID)
 			ni = ieee80211_find_txnode(vap, rt->rt_nexthop);
 	} else {
 		ni = ieee80211_find_txnode(vap, dest);
 		/* NB: if null then we leak mbuf */
 		KASSERT(ni != NULL, ("leak mcast frame"));
 		return ni;
 	}
 done:
 	if (ni == NULL && m != NULL) {
 		if (sendpreq) {
 			struct ieee80211com *ic = vap->iv_ic;
 			/*
 			 * Queue packet for transmit when path discovery
 			 * completes.  If discovery never completes the
 			 * frame will be flushed by way of the aging timer.
 			 */
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP, dest,
 			    "%s", "queue frame until path found");
+			MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 			m->m_pkthdr.rcvif = (void *)(uintptr_t)
 			    ieee80211_mac_hash(ic, dest);
 			/* XXX age chosen randomly */
 			ieee80211_ageq_append(&ic->ic_stageq, m,
 			    IEEE80211_INACT_WAIT);
 		} else {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_HWMP,
 			    dest, NULL, "%s", "no valid path to this node");
 			m_freem(m);
 		}
 	}
 	return ni;
 }
 #undef	PREQ_TFLAGS
 #undef	PREQ_TADDR
 #undef	PREQ_TSEQ
 
 static int
 hwmp_ioctl_get80211(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211_hwmp_state *hs = vap->iv_hwmp;
 	int error;
 
 	if (vap->iv_opmode != IEEE80211_M_MBSS)
 		return ENOSYS;
 	error = 0;
 	switch (ireq->i_type) {
 	case IEEE80211_IOC_HWMP_ROOTMODE:
 		ireq->i_val = hs->hs_rootmode;
 		break;
 	case IEEE80211_IOC_HWMP_MAXHOPS:
 		ireq->i_val = hs->hs_maxhops;
 		break;
 	default:
 		return ENOSYS;
 	}
 	return error;
 }
 IEEE80211_IOCTL_GET(hwmp, hwmp_ioctl_get80211);
 
 static int
 hwmp_ioctl_set80211(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211_hwmp_state *hs = vap->iv_hwmp;
 	int error;
 
 	if (vap->iv_opmode != IEEE80211_M_MBSS)
 		return ENOSYS;
 	error = 0;
 	switch (ireq->i_type) {
 	case IEEE80211_IOC_HWMP_ROOTMODE:
 		if (ireq->i_val < 0 || ireq->i_val > 3)
 			return EINVAL;
 		hs->hs_rootmode = ireq->i_val;
 		hwmp_rootmode_setup(vap);
 		break;
 	case IEEE80211_IOC_HWMP_MAXHOPS:
 		if (ireq->i_val <= 0 || ireq->i_val > 255)
 			return EINVAL;
 		hs->hs_maxhops = ireq->i_val;
 		break;
 	default:
 		return ENOSYS;
 	}
 	return error;
 }
 IEEE80211_IOCTL_SET(hwmp, hwmp_ioctl_set80211);
Index: head/sys/net80211/ieee80211_mesh.c
===================================================================
--- head/sys/net80211/ieee80211_mesh.c	(revision 348253)
+++ head/sys/net80211/ieee80211_mesh.c	(revision 348254)
@@ -1,3608 +1,3609 @@
 /*- 
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009 The FreeBSD Foundation 
  * All rights reserved. 
  * 
  * This software was developed by Rui Paulo under sponsorship from the
  * FreeBSD Foundation. 
  *  
  * Redistribution and use in source and binary forms, with or without 
  * modification, are permitted provided that the following conditions 
  * are met: 
  * 1. Redistributions of source code must retain the above copyright 
  *    notice, this list of conditions and the following disclaimer. 
  * 2. Redistributions in binary form must reproduce the above copyright 
  *    notice, this list of conditions and the following disclaimer in the 
  *    documentation and/or other materials provided with the distribution. 
  * 
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
  * SUCH DAMAGE. 
  */ 
 #include <sys/cdefs.h>
 #ifdef __FreeBSD__
 __FBSDID("$FreeBSD$");
 #endif
 
 /*
  * IEEE 802.11s Mesh Point (MBSS) support.
  *
  * Based on March 2009, D3.0 802.11s draft spec.
  */
 #include "opt_inet.h"
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h> 
 #include <sys/mbuf.h>   
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/endian.h>
 #include <sys/errno.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 
 #include <net/bpf.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/if_llc.h>
 #include <net/ethernet.h>
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_action.h>
 #ifdef IEEE80211_SUPPORT_SUPERG
 #include <net80211/ieee80211_superg.h>
 #endif
 #include <net80211/ieee80211_input.h>
 #include <net80211/ieee80211_mesh.h>
 
 static void	mesh_rt_flush_invalid(struct ieee80211vap *);
 static int	mesh_select_proto_path(struct ieee80211vap *, const char *);
 static int	mesh_select_proto_metric(struct ieee80211vap *, const char *);
 static void	mesh_vattach(struct ieee80211vap *);
 static int	mesh_newstate(struct ieee80211vap *, enum ieee80211_state, int);
 static void	mesh_rt_cleanup_cb(void *);
 static void	mesh_gatemode_setup(struct ieee80211vap *);
 static void	mesh_gatemode_cb(void *);
 static void	mesh_linkchange(struct ieee80211_node *,
 		    enum ieee80211_mesh_mlstate);
 static void	mesh_checkid(void *, struct ieee80211_node *);
 static uint32_t	mesh_generateid(struct ieee80211vap *);
 static int	mesh_checkpseq(struct ieee80211vap *,
 		    const uint8_t [IEEE80211_ADDR_LEN], uint32_t);
 static void	mesh_transmit_to_gate(struct ieee80211vap *, struct mbuf *,
 		    struct ieee80211_mesh_route *);
 static void	mesh_forward(struct ieee80211vap *, struct mbuf *,
 		    const struct ieee80211_meshcntl *);
 static int	mesh_input(struct ieee80211_node *, struct mbuf *,
 		    const struct ieee80211_rx_stats *rxs, int, int);
 static void	mesh_recv_mgmt(struct ieee80211_node *, struct mbuf *, int,
 		    const struct ieee80211_rx_stats *rxs, int, int);
 static void	mesh_recv_ctl(struct ieee80211_node *, struct mbuf *, int);
 static void	mesh_peer_timeout_setup(struct ieee80211_node *);
 static void	mesh_peer_timeout_backoff(struct ieee80211_node *);
 static void	mesh_peer_timeout_cb(void *);
 static __inline void
 		mesh_peer_timeout_stop(struct ieee80211_node *);
 static int	mesh_verify_meshid(struct ieee80211vap *, const uint8_t *);
 static int	mesh_verify_meshconf(struct ieee80211vap *, const uint8_t *);
 static int	mesh_verify_meshpeer(struct ieee80211vap *, uint8_t,
     		    const uint8_t *);
 uint32_t	mesh_airtime_calc(struct ieee80211_node *);
 
 /*
  * Timeout values come from the specification and are in milliseconds.
  */
 static SYSCTL_NODE(_net_wlan, OID_AUTO, mesh, CTLFLAG_RD, 0,
     "IEEE 802.11s parameters");
 static int	ieee80211_mesh_gateint = -1;
 SYSCTL_PROC(_net_wlan_mesh, OID_AUTO, gateint, CTLTYPE_INT | CTLFLAG_RW,
     &ieee80211_mesh_gateint, 0, ieee80211_sysctl_msecs_ticks, "I",
     "mesh gate interval (ms)");
 static int ieee80211_mesh_retrytimeout = -1;
 SYSCTL_PROC(_net_wlan_mesh, OID_AUTO, retrytimeout, CTLTYPE_INT | CTLFLAG_RW,
     &ieee80211_mesh_retrytimeout, 0, ieee80211_sysctl_msecs_ticks, "I",
     "Retry timeout (msec)");
 static int ieee80211_mesh_holdingtimeout = -1;
 
 SYSCTL_PROC(_net_wlan_mesh, OID_AUTO, holdingtimeout, CTLTYPE_INT | CTLFLAG_RW,
     &ieee80211_mesh_holdingtimeout, 0, ieee80211_sysctl_msecs_ticks, "I",
     "Holding state timeout (msec)");
 static int ieee80211_mesh_confirmtimeout = -1;
 SYSCTL_PROC(_net_wlan_mesh, OID_AUTO, confirmtimeout, CTLTYPE_INT | CTLFLAG_RW,
     &ieee80211_mesh_confirmtimeout, 0, ieee80211_sysctl_msecs_ticks, "I",
     "Confirm state timeout (msec)");
 static int ieee80211_mesh_backofftimeout = -1;
 SYSCTL_PROC(_net_wlan_mesh, OID_AUTO, backofftimeout, CTLTYPE_INT | CTLFLAG_RW,
     &ieee80211_mesh_backofftimeout, 0, ieee80211_sysctl_msecs_ticks, "I",
     "Backoff timeout (msec). This is to throutles peering forever when "
     "not receiving answer or is rejected by a neighbor");
 static int ieee80211_mesh_maxretries = 2;
 SYSCTL_INT(_net_wlan_mesh, OID_AUTO, maxretries, CTLFLAG_RW,
     &ieee80211_mesh_maxretries, 0,
     "Maximum retries during peer link establishment");
 static int ieee80211_mesh_maxholding = 2;
 SYSCTL_INT(_net_wlan_mesh, OID_AUTO, maxholding, CTLFLAG_RW,
     &ieee80211_mesh_maxholding, 0,
     "Maximum times we are allowed to transition to HOLDING state before "
     "backinoff during peer link establishment");
 
 static const uint8_t broadcastaddr[IEEE80211_ADDR_LEN] =
 	{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
 static	ieee80211_recv_action_func mesh_recv_action_meshpeering_open;
 static	ieee80211_recv_action_func mesh_recv_action_meshpeering_confirm;
 static	ieee80211_recv_action_func mesh_recv_action_meshpeering_close;
 static	ieee80211_recv_action_func mesh_recv_action_meshlmetric;
 static	ieee80211_recv_action_func mesh_recv_action_meshgate;
 
 static	ieee80211_send_action_func mesh_send_action_meshpeering_open;
 static	ieee80211_send_action_func mesh_send_action_meshpeering_confirm;
 static	ieee80211_send_action_func mesh_send_action_meshpeering_close;
 static	ieee80211_send_action_func mesh_send_action_meshlmetric;
 static	ieee80211_send_action_func mesh_send_action_meshgate;
 
 static const struct ieee80211_mesh_proto_metric mesh_metric_airtime = {
 	.mpm_descr	= "AIRTIME",
 	.mpm_ie		= IEEE80211_MESHCONF_METRIC_AIRTIME,
 	.mpm_metric	= mesh_airtime_calc,
 };
 
 static struct ieee80211_mesh_proto_path		mesh_proto_paths[4];
 static struct ieee80211_mesh_proto_metric	mesh_proto_metrics[4];
 
 MALLOC_DEFINE(M_80211_MESH_PREQ, "80211preq", "802.11 MESH Path Request frame");
 MALLOC_DEFINE(M_80211_MESH_PREP, "80211prep", "802.11 MESH Path Reply frame");
 MALLOC_DEFINE(M_80211_MESH_PERR, "80211perr", "802.11 MESH Path Error frame");
 
 /* The longer one of the lifetime should be stored as new lifetime */
 #define MESH_ROUTE_LIFETIME_MAX(a, b)	(a > b ? a : b)
 
 MALLOC_DEFINE(M_80211_MESH_RT, "80211mesh_rt", "802.11s routing table");
 MALLOC_DEFINE(M_80211_MESH_GT_RT, "80211mesh_gt", "802.11s known gates table");
 
 /*
  * Helper functions to manipulate the Mesh routing table.
  */
 
 static struct ieee80211_mesh_route *
 mesh_rt_find_locked(struct ieee80211_mesh_state *ms,
     const uint8_t dest[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_route *rt;
 
 	MESH_RT_LOCK_ASSERT(ms);
 
 	TAILQ_FOREACH(rt, &ms->ms_routes, rt_next) {
 		if (IEEE80211_ADDR_EQ(dest, rt->rt_dest))
 			return rt;
 	}
 	return NULL;
 }
 
 static struct ieee80211_mesh_route *
 mesh_rt_add_locked(struct ieee80211vap *vap,
     const uint8_t dest[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt;
 
 	KASSERT(!IEEE80211_ADDR_EQ(broadcastaddr, dest),
 	    ("%s: adding broadcast to the routing table", __func__));
 
 	MESH_RT_LOCK_ASSERT(ms);
 
 	rt = IEEE80211_MALLOC(ALIGN(sizeof(struct ieee80211_mesh_route)) +
 	    ms->ms_ppath->mpp_privlen, M_80211_MESH_RT,
 	    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 	if (rt != NULL) {
 		rt->rt_vap = vap;
 		IEEE80211_ADDR_COPY(rt->rt_dest, dest);
 		rt->rt_priv = (void *)ALIGN(&rt[1]);
 		MESH_RT_ENTRY_LOCK_INIT(rt, "MBSS_RT");
 		callout_init(&rt->rt_discovery, 1);
 		rt->rt_updtime = ticks;	/* create time */
 		TAILQ_INSERT_TAIL(&ms->ms_routes, rt, rt_next);
 	}
 	return rt;
 }
 
 struct ieee80211_mesh_route *
 ieee80211_mesh_rt_find(struct ieee80211vap *vap,
     const uint8_t dest[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt;
 
 	MESH_RT_LOCK(ms);
 	rt = mesh_rt_find_locked(ms, dest);
 	MESH_RT_UNLOCK(ms);
 	return rt;
 }
 
 struct ieee80211_mesh_route *
 ieee80211_mesh_rt_add(struct ieee80211vap *vap,
     const uint8_t dest[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt;
 
 	KASSERT(ieee80211_mesh_rt_find(vap, dest) == NULL,
 	    ("%s: duplicate entry in the routing table", __func__));
 	KASSERT(!IEEE80211_ADDR_EQ(vap->iv_myaddr, dest),
 	    ("%s: adding self to the routing table", __func__));
 
 	MESH_RT_LOCK(ms);
 	rt = mesh_rt_add_locked(vap, dest);
 	MESH_RT_UNLOCK(ms);
 	return rt;
 }
 
 /*
  * Update the route lifetime and returns the updated lifetime.
  * If new_lifetime is zero and route is timedout it will be invalidated.
  * new_lifetime is in msec
  */
 int
 ieee80211_mesh_rt_update(struct ieee80211_mesh_route *rt, int new_lifetime)
 {
 	int timesince, now;
 	uint32_t lifetime = 0;
 
 	KASSERT(rt != NULL, ("route is NULL"));
 
 	now = ticks;
 	MESH_RT_ENTRY_LOCK(rt);
 
 	/* dont clobber a proxy entry gated by us */
 	if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY && rt->rt_nhops == 0) {
 		MESH_RT_ENTRY_UNLOCK(rt);
 		return rt->rt_lifetime;
 	}
 
 	timesince = ticks_to_msecs(now - rt->rt_updtime);
 	rt->rt_updtime = now;
 	if (timesince >= rt->rt_lifetime) {
 		if (new_lifetime != 0) {
 			rt->rt_lifetime = new_lifetime;
 		}
 		else {
 			rt->rt_flags &= ~IEEE80211_MESHRT_FLAGS_VALID;
 			rt->rt_lifetime = 0;
 		}
 	} else {
 		/* update what is left of lifetime */
 		rt->rt_lifetime = rt->rt_lifetime - timesince;
 		rt->rt_lifetime  = MESH_ROUTE_LIFETIME_MAX(
 			new_lifetime, rt->rt_lifetime);
 	}
 	lifetime = rt->rt_lifetime;
 	MESH_RT_ENTRY_UNLOCK(rt);
 
 	return lifetime;
 }
 
 /*
  * Add a proxy route (as needed) for the specified destination.
  */
 void
 ieee80211_mesh_proxy_check(struct ieee80211vap *vap,
     const uint8_t dest[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt;
 
 	MESH_RT_LOCK(ms);
 	rt = mesh_rt_find_locked(ms, dest);
 	if (rt == NULL) {
 		rt = mesh_rt_add_locked(vap, dest);
 		if (rt == NULL) {
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, dest,
 			    "%s", "unable to add proxy entry");
 			vap->iv_stats.is_mesh_rtaddfailed++;
 		} else {
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, dest,
 			    "%s", "add proxy entry");
 			IEEE80211_ADDR_COPY(rt->rt_mesh_gate, vap->iv_myaddr);
 			IEEE80211_ADDR_COPY(rt->rt_nexthop, vap->iv_myaddr);
 			rt->rt_flags |= IEEE80211_MESHRT_FLAGS_VALID
 				     |  IEEE80211_MESHRT_FLAGS_PROXY;
 		}
 	} else if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0) {
 		KASSERT(rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY,
 		    ("no proxy flag for poxy entry"));
 		struct ieee80211com *ic = vap->iv_ic;
 		/*
 		 * Fix existing entry created by received frames from
 		 * stations that have some memory of dest.  We also
 		 * flush any frames held on the staging queue; delivering
 		 * them is too much trouble right now.
 		 */
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, dest,
 		    "%s", "fix proxy entry");
 		IEEE80211_ADDR_COPY(rt->rt_nexthop, vap->iv_myaddr);
 		rt->rt_flags |= IEEE80211_MESHRT_FLAGS_VALID
 			     |  IEEE80211_MESHRT_FLAGS_PROXY;
 		/* XXX belongs in hwmp */
 		ieee80211_ageq_drain_node(&ic->ic_stageq,
 		   (void *)(uintptr_t) ieee80211_mac_hash(ic, dest));
 		/* XXX stat? */
 	}
 	MESH_RT_UNLOCK(ms);
 }
 
 static __inline void
 mesh_rt_del(struct ieee80211_mesh_state *ms, struct ieee80211_mesh_route *rt)
 {
 	TAILQ_REMOVE(&ms->ms_routes, rt, rt_next);
 	/*
 	 * Grab the lock before destroying it, to be sure no one else
 	 * is holding the route.
 	 */
 	MESH_RT_ENTRY_LOCK(rt);
 	callout_drain(&rt->rt_discovery);
 	MESH_RT_ENTRY_LOCK_DESTROY(rt);
 	IEEE80211_FREE(rt, M_80211_MESH_RT);
 }
 
 void
 ieee80211_mesh_rt_del(struct ieee80211vap *vap,
     const uint8_t dest[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt, *next;
 
 	MESH_RT_LOCK(ms);
 	TAILQ_FOREACH_SAFE(rt, &ms->ms_routes, rt_next, next) {
 		if (IEEE80211_ADDR_EQ(rt->rt_dest, dest)) {
 			if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY) {
 				ms->ms_ppath->mpp_senderror(vap, dest, rt,
 				    IEEE80211_REASON_MESH_PERR_NO_PROXY);
 			} else {
 				ms->ms_ppath->mpp_senderror(vap, dest, rt,
 				    IEEE80211_REASON_MESH_PERR_DEST_UNREACH);
 			}
 			mesh_rt_del(ms, rt);
 			MESH_RT_UNLOCK(ms);
 			return;
 		}
 	}
 	MESH_RT_UNLOCK(ms);
 }
 
 void
 ieee80211_mesh_rt_flush(struct ieee80211vap *vap)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt, *next;
 
 	if (ms == NULL)
 		return;
 	MESH_RT_LOCK(ms);
 	TAILQ_FOREACH_SAFE(rt, &ms->ms_routes, rt_next, next)
 		mesh_rt_del(ms, rt);
 	MESH_RT_UNLOCK(ms);
 }
 
 void
 ieee80211_mesh_rt_flush_peer(struct ieee80211vap *vap,
     const uint8_t peer[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt, *next;
 
 	MESH_RT_LOCK(ms);
 	TAILQ_FOREACH_SAFE(rt, &ms->ms_routes, rt_next, next) {
 		if (IEEE80211_ADDR_EQ(rt->rt_nexthop, peer))
 			mesh_rt_del(ms, rt);
 	}
 	MESH_RT_UNLOCK(ms);
 }
 
 /*
  * Flush expired routing entries, i.e. those in invalid state for
  * some time.
  */
 static void
 mesh_rt_flush_invalid(struct ieee80211vap *vap)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt, *next;
 
 	if (ms == NULL)
 		return;
 	MESH_RT_LOCK(ms);
 	TAILQ_FOREACH_SAFE(rt, &ms->ms_routes, rt_next, next) {
 		/* Discover paths will be deleted by their own callout */
 		if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_DISCOVER)
 			continue;
 		ieee80211_mesh_rt_update(rt, 0);
 		if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0)
 			mesh_rt_del(ms, rt);
 	}
 	MESH_RT_UNLOCK(ms);
 }
 
 int
 ieee80211_mesh_register_proto_path(const struct ieee80211_mesh_proto_path *mpp)
 {
 	int i, firstempty = -1;
 
 	for (i = 0; i < nitems(mesh_proto_paths); i++) {
 		if (strncmp(mpp->mpp_descr, mesh_proto_paths[i].mpp_descr,
 		    IEEE80211_MESH_PROTO_DSZ) == 0)
 			return EEXIST;
 		if (!mesh_proto_paths[i].mpp_active && firstempty == -1)
 			firstempty = i;
 	}
 	if (firstempty < 0)
 		return ENOSPC;
 	memcpy(&mesh_proto_paths[firstempty], mpp, sizeof(*mpp));
 	mesh_proto_paths[firstempty].mpp_active = 1;
 	return 0;
 }
 
 int
 ieee80211_mesh_register_proto_metric(const struct
     ieee80211_mesh_proto_metric *mpm)
 {
 	int i, firstempty = -1;
 
 	for (i = 0; i < nitems(mesh_proto_metrics); i++) {
 		if (strncmp(mpm->mpm_descr, mesh_proto_metrics[i].mpm_descr,
 		    IEEE80211_MESH_PROTO_DSZ) == 0)
 			return EEXIST;
 		if (!mesh_proto_metrics[i].mpm_active && firstempty == -1)
 			firstempty = i;
 	}
 	if (firstempty < 0)
 		return ENOSPC;
 	memcpy(&mesh_proto_metrics[firstempty], mpm, sizeof(*mpm));
 	mesh_proto_metrics[firstempty].mpm_active = 1;
 	return 0;
 }
 
 static int
 mesh_select_proto_path(struct ieee80211vap *vap, const char *name)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	int i;
 
 	for (i = 0; i < nitems(mesh_proto_paths); i++) {
 		if (strcasecmp(mesh_proto_paths[i].mpp_descr, name) == 0) {
 			ms->ms_ppath = &mesh_proto_paths[i];
 			return 0;
 		}
 	}
 	return ENOENT;
 }
 
 static int
 mesh_select_proto_metric(struct ieee80211vap *vap, const char *name)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	int i;
 
 	for (i = 0; i < nitems(mesh_proto_metrics); i++) {
 		if (strcasecmp(mesh_proto_metrics[i].mpm_descr, name) == 0) {
 			ms->ms_pmetric = &mesh_proto_metrics[i];
 			return 0;
 		}
 	}
 	return ENOENT;
 }
 
 static void
 mesh_gatemode_setup(struct ieee80211vap *vap)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 
 	/*
 	 * NB: When a mesh gate is running as a ROOT it shall
 	 * not send out periodic GANNs but instead mark the
 	 * mesh gate flag for the corresponding proactive PREQ
 	 * and RANN frames.
 	 */
 	if (ms->ms_flags & IEEE80211_MESHFLAGS_ROOT ||
 	    (ms->ms_flags & IEEE80211_MESHFLAGS_GATE) == 0) {
 		callout_drain(&ms->ms_gatetimer);
 		return ;
 	}
 	callout_reset(&ms->ms_gatetimer, ieee80211_mesh_gateint,
 	    mesh_gatemode_cb, vap);
 }
 
 static void
 mesh_gatemode_cb(void *arg)
 {
 	struct ieee80211vap *vap = (struct ieee80211vap *)arg;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_meshgann_ie gann;
 
 	gann.gann_flags = 0; /* Reserved */
 	gann.gann_hopcount = 0;
 	gann.gann_ttl = ms->ms_ttl;
 	IEEE80211_ADDR_COPY(gann.gann_addr, vap->iv_myaddr);
 	gann.gann_seq = ms->ms_gateseq++;
 	gann.gann_interval = ieee80211_mesh_gateint;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_MESH, vap->iv_bss,
 	    "send broadcast GANN (seq %u)", gann.gann_seq);
 
 	ieee80211_send_action(vap->iv_bss, IEEE80211_ACTION_CAT_MESH,
 	    IEEE80211_ACTION_MESH_GANN, &gann);
 	mesh_gatemode_setup(vap);
 }
 
 static void
 ieee80211_mesh_init(void)
 {
 
 	memset(mesh_proto_paths, 0, sizeof(mesh_proto_paths));
 	memset(mesh_proto_metrics, 0, sizeof(mesh_proto_metrics));
 
 	/*
 	 * Setup mesh parameters that depends on the clock frequency.
 	 */
 	ieee80211_mesh_gateint = msecs_to_ticks(10000);
 	ieee80211_mesh_retrytimeout = msecs_to_ticks(40);
 	ieee80211_mesh_holdingtimeout = msecs_to_ticks(40);
 	ieee80211_mesh_confirmtimeout = msecs_to_ticks(40);
 	ieee80211_mesh_backofftimeout = msecs_to_ticks(5000);
 
 	/*
 	 * Register action frame handlers.
 	 */
 	ieee80211_recv_action_register(IEEE80211_ACTION_CAT_SELF_PROT,
 	    IEEE80211_ACTION_MESHPEERING_OPEN,
 	    mesh_recv_action_meshpeering_open);
 	ieee80211_recv_action_register(IEEE80211_ACTION_CAT_SELF_PROT,
 	    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 	    mesh_recv_action_meshpeering_confirm);
 	ieee80211_recv_action_register(IEEE80211_ACTION_CAT_SELF_PROT,
 	    IEEE80211_ACTION_MESHPEERING_CLOSE,
 	    mesh_recv_action_meshpeering_close);
 	ieee80211_recv_action_register(IEEE80211_ACTION_CAT_MESH,
 	    IEEE80211_ACTION_MESH_LMETRIC, mesh_recv_action_meshlmetric);
 	ieee80211_recv_action_register(IEEE80211_ACTION_CAT_MESH,
 	    IEEE80211_ACTION_MESH_GANN, mesh_recv_action_meshgate);
 
 	ieee80211_send_action_register(IEEE80211_ACTION_CAT_SELF_PROT,
 	    IEEE80211_ACTION_MESHPEERING_OPEN,
 	    mesh_send_action_meshpeering_open);
 	ieee80211_send_action_register(IEEE80211_ACTION_CAT_SELF_PROT,
 	    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 	    mesh_send_action_meshpeering_confirm);
 	ieee80211_send_action_register(IEEE80211_ACTION_CAT_SELF_PROT,
 	    IEEE80211_ACTION_MESHPEERING_CLOSE,
 	    mesh_send_action_meshpeering_close);
 	ieee80211_send_action_register(IEEE80211_ACTION_CAT_MESH,
 	    IEEE80211_ACTION_MESH_LMETRIC,
 	    mesh_send_action_meshlmetric);
 	ieee80211_send_action_register(IEEE80211_ACTION_CAT_MESH,
 	    IEEE80211_ACTION_MESH_GANN,
 	    mesh_send_action_meshgate);
 
 	/*
 	 * Register Airtime Link Metric.
 	 */
 	ieee80211_mesh_register_proto_metric(&mesh_metric_airtime);
 
 }
 SYSINIT(wlan_mesh, SI_SUB_DRIVERS, SI_ORDER_FIRST, ieee80211_mesh_init, NULL);
 
 void
 ieee80211_mesh_attach(struct ieee80211com *ic)
 {
 	ic->ic_vattach[IEEE80211_M_MBSS] = mesh_vattach;
 }
 
 void
 ieee80211_mesh_detach(struct ieee80211com *ic)
 {
 }
 
 static void
 mesh_vdetach_peers(void *arg, struct ieee80211_node *ni)
 {
 	struct ieee80211com *ic = ni->ni_ic;
 	uint16_t args[3];
 
 	if (ni->ni_mlstate == IEEE80211_NODE_MESH_ESTABLISHED) {
 		args[0] = ni->ni_mlpid;
 		args[1] = ni->ni_mllid;
 		args[2] = IEEE80211_REASON_PEER_LINK_CANCELED;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CLOSE,
 		    args);
 	}
 	callout_drain(&ni->ni_mltimer);
 	/* XXX belongs in hwmp */
 	ieee80211_ageq_drain_node(&ic->ic_stageq,
 	   (void *)(uintptr_t) ieee80211_mac_hash(ic, ni->ni_macaddr));
 }
 
 static void
 mesh_vdetach(struct ieee80211vap *vap)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 
 	callout_drain(&ms->ms_cleantimer);
 	ieee80211_iterate_nodes(&vap->iv_ic->ic_sta, mesh_vdetach_peers,
 	    NULL);
 	ieee80211_mesh_rt_flush(vap);
 	MESH_RT_LOCK_DESTROY(ms);
 	ms->ms_ppath->mpp_vdetach(vap);
 	IEEE80211_FREE(vap->iv_mesh, M_80211_VAP);
 	vap->iv_mesh = NULL;
 }
 
 static void
 mesh_vattach(struct ieee80211vap *vap)
 {
 	struct ieee80211_mesh_state *ms;
 	vap->iv_newstate = mesh_newstate;
 	vap->iv_input = mesh_input;
 	vap->iv_opdetach = mesh_vdetach;
 	vap->iv_recv_mgmt = mesh_recv_mgmt;
 	vap->iv_recv_ctl = mesh_recv_ctl;
 	ms = IEEE80211_MALLOC(sizeof(struct ieee80211_mesh_state), M_80211_VAP,
 	    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 	if (ms == NULL) {
 		printf("%s: couldn't alloc MBSS state\n", __func__);
 		return;
 	}
 	vap->iv_mesh = ms;
 	ms->ms_seq = 0;
 	ms->ms_flags = (IEEE80211_MESHFLAGS_AP | IEEE80211_MESHFLAGS_FWD);
 	ms->ms_ttl = IEEE80211_MESH_DEFAULT_TTL;
 	TAILQ_INIT(&ms->ms_known_gates);
 	TAILQ_INIT(&ms->ms_routes);
 	MESH_RT_LOCK_INIT(ms, "MBSS");
 	callout_init(&ms->ms_cleantimer, 1);
 	callout_init(&ms->ms_gatetimer, 1);
 	ms->ms_gateseq = 0;
 	mesh_select_proto_metric(vap, "AIRTIME");
 	KASSERT(ms->ms_pmetric, ("ms_pmetric == NULL"));
 	mesh_select_proto_path(vap, "HWMP");
 	KASSERT(ms->ms_ppath, ("ms_ppath == NULL"));
 	ms->ms_ppath->mpp_vattach(vap);
 }
 
 /*
  * IEEE80211_M_MBSS vap state machine handler.
  */
 static int
 mesh_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_node *ni;
 	enum ieee80211_state ostate;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	ostate = vap->iv_state;
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE, "%s: %s -> %s (%d)\n",
 	    __func__, ieee80211_state_name[ostate],
 	    ieee80211_state_name[nstate], arg);
 	vap->iv_state = nstate;		/* state transition */
 	if (ostate != IEEE80211_S_SCAN)
 		ieee80211_cancel_scan(vap);	/* background scan */
 	ni = vap->iv_bss;			/* NB: no reference held */
 	if (nstate != IEEE80211_S_RUN && ostate == IEEE80211_S_RUN) {
 		callout_drain(&ms->ms_cleantimer);
 		callout_drain(&ms->ms_gatetimer);
 	}
 	switch (nstate) {
 	case IEEE80211_S_INIT:
 		switch (ostate) {
 		case IEEE80211_S_SCAN:
 			ieee80211_cancel_scan(vap);
 			break;
 		case IEEE80211_S_CAC:
 			ieee80211_dfs_cac_stop(vap);
 			break;
 		case IEEE80211_S_RUN:
 			ieee80211_iterate_nodes(&ic->ic_sta,
 			    mesh_vdetach_peers, NULL);
 			break;
 		default:
 			break;
 		}
 		if (ostate != IEEE80211_S_INIT) {
 			/* NB: optimize INIT -> INIT case */
 			ieee80211_reset_bss(vap);
 			ieee80211_mesh_rt_flush(vap);
 		}
 		break;
 	case IEEE80211_S_SCAN:
 		switch (ostate) {
 		case IEEE80211_S_INIT:
 			if (vap->iv_des_chan != IEEE80211_CHAN_ANYC &&
 			    !IEEE80211_IS_CHAN_RADAR(vap->iv_des_chan) &&
 			    ms->ms_idlen != 0) {
 				/*
 				 * Already have a channel and a mesh ID; bypass
 				 * the scan and startup immediately.
 				 */
 				ieee80211_create_ibss(vap, vap->iv_des_chan);
 				break;
 			}
 			/*
 			 * Initiate a scan.  We can come here as a result
 			 * of an IEEE80211_IOC_SCAN_REQ too in which case
 			 * the vap will be marked with IEEE80211_FEXT_SCANREQ
 			 * and the scan request parameters will be present
 			 * in iv_scanreq.  Otherwise we do the default.
 			*/
 			if (vap->iv_flags_ext & IEEE80211_FEXT_SCANREQ) {
 				ieee80211_check_scan(vap,
 				    vap->iv_scanreq_flags,
 				    vap->iv_scanreq_duration,
 				    vap->iv_scanreq_mindwell,
 				    vap->iv_scanreq_maxdwell,
 				    vap->iv_scanreq_nssid, vap->iv_scanreq_ssid);
 				vap->iv_flags_ext &= ~IEEE80211_FEXT_SCANREQ;
 			} else
 				ieee80211_check_scan_current(vap);
 			break;
 		default:
 			break;
 		}
 		break;
 	case IEEE80211_S_CAC:
 		/*
 		 * Start CAC on a DFS channel.  We come here when starting
 		 * a bss on a DFS channel (see ieee80211_create_ibss).
 		 */
 		ieee80211_dfs_cac_start(vap);
 		break;
 	case IEEE80211_S_RUN:
 		switch (ostate) {
 		case IEEE80211_S_INIT:
 			/*
 			 * Already have a channel; bypass the
 			 * scan and startup immediately.
 			 * Note that ieee80211_create_ibss will call
 			 * back to do a RUN->RUN state change.
 			 */
 			ieee80211_create_ibss(vap,
 			    ieee80211_ht_adjust_channel(ic,
 				ic->ic_curchan, vap->iv_flags_ht));
 			/* NB: iv_bss is changed on return */
 			break;
 		case IEEE80211_S_CAC:
 			/*
 			 * NB: This is the normal state change when CAC
 			 * expires and no radar was detected; no need to
 			 * clear the CAC timer as it's already expired.
 			 */
 			/* fall thru... */
 		case IEEE80211_S_CSA:
 #if 0
 			/*
 			 * Shorten inactivity timer of associated stations
 			 * to weed out sta's that don't follow a CSA.
 			 */
 			ieee80211_iterate_nodes(&ic->ic_sta, sta_csa, vap);
 #endif
 			/*
 			 * Update bss node channel to reflect where
 			 * we landed after CSA.
 			 */
 			ieee80211_node_set_chan(ni,
 			    ieee80211_ht_adjust_channel(ic, ic->ic_curchan,
 				ieee80211_htchanflags(ni->ni_chan)));
 			/* XXX bypass debug msgs */
 			break;
 		case IEEE80211_S_SCAN:
 		case IEEE80211_S_RUN:
 #ifdef IEEE80211_DEBUG
 			if (ieee80211_msg_debug(vap)) {
 				ieee80211_note(vap,
 				    "synchronized with %s meshid ",
 				    ether_sprintf(ni->ni_meshid));
 				ieee80211_print_essid(ni->ni_meshid,
 				    ni->ni_meshidlen);
 				/* XXX MCS/HT */
 				printf(" channel %d\n",
 				    ieee80211_chan2ieee(ic, ic->ic_curchan));
 			}
 #endif
 			break;
 		default:
 			break;
 		}
 		ieee80211_node_authorize(ni);
 		callout_reset(&ms->ms_cleantimer, ms->ms_ppath->mpp_inact,
                     mesh_rt_cleanup_cb, vap);
 		mesh_gatemode_setup(vap);
 		break;
 	default:
 		break;
 	}
 	/* NB: ostate not nstate */
 	ms->ms_ppath->mpp_newstate(vap, ostate, arg);
 	return 0;
 }
 
 static void
 mesh_rt_cleanup_cb(void *arg)
 {
 	struct ieee80211vap *vap = arg;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 
 	mesh_rt_flush_invalid(vap);
 	callout_reset(&ms->ms_cleantimer, ms->ms_ppath->mpp_inact,
 	    mesh_rt_cleanup_cb, vap);
 }
 
 /*
  * Mark a mesh STA as gate and return a pointer to it.
  * If this is first time, we create a new gate route.
  * Always update the path route to this mesh gate.
  */
 struct ieee80211_mesh_gate_route *
 ieee80211_mesh_mark_gate(struct ieee80211vap *vap, const uint8_t *addr,
     struct ieee80211_mesh_route *rt)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_gate_route *gr = NULL, *next;
 	int found = 0;
 
 	MESH_RT_LOCK(ms);
 	TAILQ_FOREACH_SAFE(gr, &ms->ms_known_gates, gr_next, next) {
 		if (IEEE80211_ADDR_EQ(gr->gr_addr, addr)) {
 			found = 1;
 			break;
 		}
 	}
 
 	if (!found) {
 		/* New mesh gate add it to known table. */
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, addr,
 		    "%s", "stored new gate information from pro-PREQ.");
 		gr = IEEE80211_MALLOC(ALIGN(sizeof(struct ieee80211_mesh_gate_route)),
 		    M_80211_MESH_GT_RT,
 		    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 		IEEE80211_ADDR_COPY(gr->gr_addr, addr);
 		TAILQ_INSERT_TAIL(&ms->ms_known_gates, gr, gr_next);
 	}
 	gr->gr_route = rt;
 	/* TODO: link from path route to gate route */
 	MESH_RT_UNLOCK(ms);
 
 	return gr;
 }
 
 
 /*
  * Helper function to note the Mesh Peer Link FSM change.
  */
 static void
 mesh_linkchange(struct ieee80211_node *ni, enum ieee80211_mesh_mlstate state)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 #ifdef IEEE80211_DEBUG
 	static const char *meshlinkstates[] = {
 		[IEEE80211_NODE_MESH_IDLE]		= "IDLE",
 		[IEEE80211_NODE_MESH_OPENSNT]		= "OPEN SENT",
 		[IEEE80211_NODE_MESH_OPENRCV]		= "OPEN RECEIVED",
 		[IEEE80211_NODE_MESH_CONFIRMRCV]	= "CONFIRM RECEIVED",
 		[IEEE80211_NODE_MESH_ESTABLISHED]	= "ESTABLISHED",
 		[IEEE80211_NODE_MESH_HOLDING]		= "HOLDING"
 	};
 #endif
 	IEEE80211_NOTE(vap, IEEE80211_MSG_MESH,
 	    ni, "peer link: %s -> %s",
 	    meshlinkstates[ni->ni_mlstate], meshlinkstates[state]);
 
 	/* track neighbor count */
 	if (state == IEEE80211_NODE_MESH_ESTABLISHED &&
 	    ni->ni_mlstate != IEEE80211_NODE_MESH_ESTABLISHED) {
 		KASSERT(ms->ms_neighbors < 65535, ("neighbor count overflow"));
 		ms->ms_neighbors++;
 		ieee80211_beacon_notify(vap, IEEE80211_BEACON_MESHCONF);
 	} else if (ni->ni_mlstate == IEEE80211_NODE_MESH_ESTABLISHED &&
 	    state != IEEE80211_NODE_MESH_ESTABLISHED) {
 		KASSERT(ms->ms_neighbors > 0, ("neighbor count 0"));
 		ms->ms_neighbors--;
 		ieee80211_beacon_notify(vap, IEEE80211_BEACON_MESHCONF);
 	}
 	ni->ni_mlstate = state;
 	switch (state) {
 	case IEEE80211_NODE_MESH_HOLDING:
 		ms->ms_ppath->mpp_peerdown(ni);
 		break;
 	case IEEE80211_NODE_MESH_ESTABLISHED:
 		ieee80211_mesh_discover(vap, ni->ni_macaddr, NULL);
 		break;
 	default:
 		break;
 	}
 }
 
 /*
  * Helper function to generate a unique local ID required for mesh
  * peer establishment.
  */
 static void
 mesh_checkid(void *arg, struct ieee80211_node *ni)
 {
 	uint16_t *r = arg;
 	
 	if (*r == ni->ni_mllid)
 		*(uint16_t *)arg = 0;
 }
 
 static uint32_t
 mesh_generateid(struct ieee80211vap *vap)
 {
 	int maxiter = 4;
 	uint16_t r;
 
 	do {
 		get_random_bytes(&r, 2);
 		ieee80211_iterate_nodes(&vap->iv_ic->ic_sta, mesh_checkid, &r);
 		maxiter--;
 	} while (r == 0 && maxiter > 0);
 	return r;
 }
 
 /*
  * Verifies if we already received this packet by checking its
  * sequence number.
  * Returns 0 if the frame is to be accepted, 1 otherwise.
  */
 static int
 mesh_checkpseq(struct ieee80211vap *vap,
     const uint8_t source[IEEE80211_ADDR_LEN], uint32_t seq)
 {
 	struct ieee80211_mesh_route *rt;
 
 	rt = ieee80211_mesh_rt_find(vap, source);
 	if (rt == NULL) {
 		rt = ieee80211_mesh_rt_add(vap, source);
 		if (rt == NULL) {
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, source,
 			    "%s", "add mcast route failed");
 			vap->iv_stats.is_mesh_rtaddfailed++;
 			return 1;
 		}
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, source,
 		    "add mcast route, mesh seqno %d", seq);
 		rt->rt_lastmseq = seq;
 		return 0;
 	}
 	if (IEEE80211_MESH_SEQ_GEQ(rt->rt_lastmseq, seq)) {
 		return 1;
 	} else {
 		rt->rt_lastmseq = seq;
 		return 0;
 	}
 }
 
 /*
  * Iterate the routing table and locate the next hop.
  */
 struct ieee80211_node *
 ieee80211_mesh_find_txnode(struct ieee80211vap *vap,
     const uint8_t dest[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_route *rt;
 
 	rt = ieee80211_mesh_rt_find(vap, dest);
 	if (rt == NULL)
 		return NULL;
 	if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0) {
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, dest,
 		    "%s: !valid, flags 0x%x", __func__, rt->rt_flags);
 		/* XXX stat */
 		return NULL;
 	}
 	if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY) {
 		rt = ieee80211_mesh_rt_find(vap, rt->rt_mesh_gate);
 		if (rt == NULL) return NULL;
 		if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0) {
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, dest,
 			    "%s: meshgate !valid, flags 0x%x", __func__,
 			    rt->rt_flags);
 			/* XXX stat */
 			return NULL;
 		}
 	}
 	return ieee80211_find_txnode(vap, rt->rt_nexthop);
 }
 
 static void
 mesh_transmit_to_gate(struct ieee80211vap *vap, struct mbuf *m,
     struct ieee80211_mesh_route *rt_gate)
 {
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211_node *ni;
 
 	IEEE80211_TX_UNLOCK_ASSERT(vap->iv_ic);
 
 	ni = ieee80211_mesh_find_txnode(vap, rt_gate->rt_dest);
 	if (ni == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return;
 	}
 
 	/*
 	 * Send through the VAP packet transmit path.
 	 * This consumes the node ref grabbed above and
 	 * the mbuf, regardless of whether there's a problem
 	 * or not.
 	 */
 	(void) ieee80211_vap_pkt_send_dest(vap, m, ni);
 }
 
 /*
  * Forward the queued frames to known valid mesh gates.
  * Assume destination to be outside the MBSS (i.e. proxy entry),
  * If no valid mesh gates are known silently discard queued frames.
  * After transmitting frames to all known valid mesh gates, this route
  * will be marked invalid, and a new path discovery will happen in the hopes
  * that (at least) one of the mesh gates have a new proxy entry for us to use.
  */
 void
 ieee80211_mesh_forward_to_gates(struct ieee80211vap *vap,
     struct ieee80211_mesh_route *rt_dest)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt_gate;
 	struct ieee80211_mesh_gate_route *gr = NULL, *gr_next;
 	struct mbuf *m, *mcopy, *next;
 
 	IEEE80211_TX_UNLOCK_ASSERT(ic);
 
 	KASSERT( rt_dest->rt_flags == IEEE80211_MESHRT_FLAGS_DISCOVER,
 	    ("Route is not marked with IEEE80211_MESHRT_FLAGS_DISCOVER"));
 
 	/* XXX: send to more than one valid mash gate */
 	MESH_RT_LOCK(ms);
 
 	m = ieee80211_ageq_remove(&ic->ic_stageq,
 	    (struct ieee80211_node *)(uintptr_t)
 	    ieee80211_mac_hash(ic, rt_dest->rt_dest));
 
 	TAILQ_FOREACH_SAFE(gr, &ms->ms_known_gates, gr_next, gr_next) {
 		rt_gate = gr->gr_route;
 		if (rt_gate == NULL) {
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP,
 				rt_dest->rt_dest,
 				"mesh gate with no path %6D",
 				gr->gr_addr, ":");
 			continue;
 		}
 		if ((rt_gate->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0)
 			continue;
 		KASSERT(rt_gate->rt_flags & IEEE80211_MESHRT_FLAGS_GATE,
 		    ("route not marked as a mesh gate"));
 		KASSERT((rt_gate->rt_flags &
 			IEEE80211_MESHRT_FLAGS_PROXY) == 0,
 			("found mesh gate that is also marked porxy"));
 		/*
 		 * convert route to a proxy route gated by the current
 		 * mesh gate, this is needed so encap can built data
 		 * frame with correct address.
 		 */
 		rt_dest->rt_flags = IEEE80211_MESHRT_FLAGS_PROXY |
 			IEEE80211_MESHRT_FLAGS_VALID;
 		rt_dest->rt_ext_seq = 1; /* random value */
 		IEEE80211_ADDR_COPY(rt_dest->rt_mesh_gate, rt_gate->rt_dest);
 		IEEE80211_ADDR_COPY(rt_dest->rt_nexthop, rt_gate->rt_nexthop);
 		rt_dest->rt_metric = rt_gate->rt_metric;
 		rt_dest->rt_nhops = rt_gate->rt_nhops;
 		ieee80211_mesh_rt_update(rt_dest, ms->ms_ppath->mpp_inact);
 		MESH_RT_UNLOCK(ms);
 		/* XXX: lock?? */
 		mcopy = m_dup(m, M_NOWAIT);
 		for (; mcopy != NULL; mcopy = next) {
 			next = mcopy->m_nextpkt;
 			mcopy->m_nextpkt = NULL;
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP,
 			    rt_dest->rt_dest,
 			    "flush queued frame %p len %d", mcopy,
 			    mcopy->m_pkthdr.len);
 			mesh_transmit_to_gate(vap, mcopy, rt_gate);
 		}
 		MESH_RT_LOCK(ms);
 	}
 	rt_dest->rt_flags = 0; /* Mark invalid */
 	m_freem(m);
 	MESH_RT_UNLOCK(ms);
 }
 
 /*
  * Forward the specified frame.
  * Decrement the TTL and set TA to our MAC address.
  */
 static void
 mesh_forward(struct ieee80211vap *vap, struct mbuf *m,
     const struct ieee80211_meshcntl *mc)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ifnet *ifp = vap->iv_ifp;
 	const struct ieee80211_frame *wh =
 	    mtod(m, const struct ieee80211_frame *);
 	struct mbuf *mcopy;
 	struct ieee80211_meshcntl *mccopy;
 	struct ieee80211_frame *whcopy;
 	struct ieee80211_node *ni;
 	int err;
 
 	/* This is called from the RX path - don't hold this lock */
 	IEEE80211_TX_UNLOCK_ASSERT(ic);
 
 	/*
 	 * mesh ttl of 1 means we are the last one receiving it,
 	 * according to amendment we decrement and then check if
 	 * 0, if so we dont forward.
 	 */
 	if (mc->mc_ttl < 1) {
 		IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_MESH, wh,
 		    "%s", "frame not fwd'd, ttl 1");
 		vap->iv_stats.is_mesh_fwd_ttl++;
 		return;
 	}
 	if (!(ms->ms_flags & IEEE80211_MESHFLAGS_FWD)) {
 		IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_MESH, wh,
 		    "%s", "frame not fwd'd, fwding disabled");
 		vap->iv_stats.is_mesh_fwd_disabled++;
 		return;
 	}
 	mcopy = m_dup(m, M_NOWAIT);
 	if (mcopy == NULL) {
 		IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_MESH, wh,
 		    "%s", "frame not fwd'd, cannot dup");
 		vap->iv_stats.is_mesh_fwd_nobuf++;
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return;
 	}
 	mcopy = m_pullup(mcopy, ieee80211_hdrspace(ic, wh) +
 	    sizeof(struct ieee80211_meshcntl));
 	if (mcopy == NULL) {
 		IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_MESH, wh,
 		    "%s", "frame not fwd'd, too short");
 		vap->iv_stats.is_mesh_fwd_tooshort++;
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(mcopy);
 		return;
 	}
 	whcopy = mtod(mcopy, struct ieee80211_frame *);
 	mccopy = (struct ieee80211_meshcntl *)
 	    (mtod(mcopy, uint8_t *) + ieee80211_hdrspace(ic, wh));
 	/* XXX clear other bits? */
 	whcopy->i_fc[1] &= ~IEEE80211_FC1_RETRY;
 	IEEE80211_ADDR_COPY(whcopy->i_addr2, vap->iv_myaddr);
 	if (IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 		ni = ieee80211_ref_node(vap->iv_bss);
 		mcopy->m_flags |= M_MCAST;
 	} else {
 		ni = ieee80211_mesh_find_txnode(vap, whcopy->i_addr3);
 		if (ni == NULL) {
 			/*
 			 * [Optional] any of the following three actions:
 			 * o silently discard
 			 * o trigger a path discovery
 			 * o inform TA that meshDA is unknown.
 			 */
 			IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_MESH, wh,
 			    "%s", "frame not fwd'd, no path");
 			ms->ms_ppath->mpp_senderror(vap, whcopy->i_addr3, NULL,
 			    IEEE80211_REASON_MESH_PERR_NO_FI);
 			vap->iv_stats.is_mesh_fwd_nopath++;
 			m_freem(mcopy);
 			return;
 		}
 		IEEE80211_ADDR_COPY(whcopy->i_addr1, ni->ni_macaddr);
 	}
 	KASSERT(mccopy->mc_ttl > 0, ("%s called with wrong ttl", __func__));
 	mccopy->mc_ttl--;
 
 	/* XXX calculate priority so drivers can find the tx queue */
 	M_WME_SETAC(mcopy, WME_AC_BE);
 
 	/* XXX do we know m_nextpkt is NULL? */
+	MPASS((mcopy->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	mcopy->m_pkthdr.rcvif = (void *) ni;
 
 	/*
 	 * XXX this bypasses all of the VAP TX handling; it passes frames
 	 * directly to the parent interface.
 	 *
 	 * Because of this, there's no TX lock being held as there's no
 	 * encaps state being used.
 	 *
 	 * Doing a direct parent transmit may not be the correct thing
 	 * to do here; we'll have to re-think this soon.
 	 */
 	IEEE80211_TX_LOCK(ic);
 	err = ieee80211_parent_xmitpkt(ic, mcopy);
 	IEEE80211_TX_UNLOCK(ic);
 	if (!err)
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 }
 
 static struct mbuf *
 mesh_decap(struct ieee80211vap *vap, struct mbuf *m, int hdrlen, int meshdrlen)
 {
 #define	WHDIR(wh)	((wh)->i_fc[1] & IEEE80211_FC1_DIR_MASK)
 #define	MC01(mc)	((const struct ieee80211_meshcntl_ae01 *)mc)
 	uint8_t b[sizeof(struct ieee80211_qosframe_addr4) +
 		  sizeof(struct ieee80211_meshcntl_ae10)];
 	const struct ieee80211_qosframe_addr4 *wh;
 	const struct ieee80211_meshcntl_ae10 *mc;
 	struct ether_header *eh;
 	struct llc *llc;
 	int ae;
 
 	if (m->m_len < hdrlen + sizeof(*llc) &&
 	    (m = m_pullup(m, hdrlen + sizeof(*llc))) == NULL) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_ANY,
 		    "discard data frame: %s", "m_pullup failed");
 		vap->iv_stats.is_rx_tooshort++;
 		return NULL;
 	}
 	memcpy(b, mtod(m, caddr_t), hdrlen);
 	wh = (const struct ieee80211_qosframe_addr4 *)&b[0];
 	mc = (const struct ieee80211_meshcntl_ae10 *)&b[hdrlen - meshdrlen];
 	KASSERT(WHDIR(wh) == IEEE80211_FC1_DIR_FROMDS ||
 		WHDIR(wh) == IEEE80211_FC1_DIR_DSTODS,
 	    ("bogus dir, fc 0x%x:0x%x", wh->i_fc[0], wh->i_fc[1]));
 
 	llc = (struct llc *)(mtod(m, caddr_t) + hdrlen);
 	if (llc->llc_dsap == LLC_SNAP_LSAP && llc->llc_ssap == LLC_SNAP_LSAP &&
 	    llc->llc_control == LLC_UI && llc->llc_snap.org_code[0] == 0 &&
 	    llc->llc_snap.org_code[1] == 0 && llc->llc_snap.org_code[2] == 0 &&
 	    /* NB: preserve AppleTalk frames that have a native SNAP hdr */
 	    !(llc->llc_snap.ether_type == htons(ETHERTYPE_AARP) ||
 	      llc->llc_snap.ether_type == htons(ETHERTYPE_IPX))) {
 		m_adj(m, hdrlen + sizeof(struct llc) - sizeof(*eh));
 		llc = NULL;
 	} else {
 		m_adj(m, hdrlen - sizeof(*eh));
 	}
 	eh = mtod(m, struct ether_header *);
 	ae = mc->mc_flags & IEEE80211_MESH_AE_MASK;
 	if (WHDIR(wh) == IEEE80211_FC1_DIR_FROMDS) {
 		IEEE80211_ADDR_COPY(eh->ether_dhost, wh->i_addr1);
 		if (ae == IEEE80211_MESH_AE_00) {
 			IEEE80211_ADDR_COPY(eh->ether_shost, wh->i_addr3);
 		} else if (ae == IEEE80211_MESH_AE_01) {
 			IEEE80211_ADDR_COPY(eh->ether_shost,
 			    MC01(mc)->mc_addr4);
 		} else {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 			    (const struct ieee80211_frame *)wh, NULL,
 			    "bad AE %d", ae);
 			vap->iv_stats.is_mesh_badae++;
 			m_freem(m);
 			return NULL;
 		}
 	} else {
 		if (ae == IEEE80211_MESH_AE_00) {
 			IEEE80211_ADDR_COPY(eh->ether_dhost, wh->i_addr3);
 			IEEE80211_ADDR_COPY(eh->ether_shost, wh->i_addr4);
 		} else if (ae == IEEE80211_MESH_AE_10) {
 			IEEE80211_ADDR_COPY(eh->ether_dhost, mc->mc_addr5);
 			IEEE80211_ADDR_COPY(eh->ether_shost, mc->mc_addr6);
 		} else {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 			    (const struct ieee80211_frame *)wh, NULL,
 			    "bad AE %d", ae);
 			vap->iv_stats.is_mesh_badae++;
 			m_freem(m);
 			return NULL;
 		}
 	}
 #ifndef __NO_STRICT_ALIGNMENT
 	if (!ALIGNED_POINTER(mtod(m, caddr_t) + sizeof(*eh), uint32_t)) {
 		m = ieee80211_realign(vap, m, sizeof(*eh));
 		if (m == NULL)
 			return NULL;
 	}
 #endif /* !__NO_STRICT_ALIGNMENT */
 	if (llc != NULL) {
 		eh = mtod(m, struct ether_header *);
 		eh->ether_type = htons(m->m_pkthdr.len - sizeof(*eh));
 	}
 	return m;
 #undef	WDIR
 #undef	MC01
 }
 
 /*
  * Return non-zero if the unicast mesh data frame should be processed
  * locally.  Frames that are not proxy'd have our address, otherwise
  * we need to consult the routing table to look for a proxy entry.
  */
 static __inline int
 mesh_isucastforme(struct ieee80211vap *vap, const struct ieee80211_frame *wh,
     const struct ieee80211_meshcntl *mc)
 {
 	int ae = mc->mc_flags & 3;
 
 	KASSERT((wh->i_fc[1] & IEEE80211_FC1_DIR_MASK) == IEEE80211_FC1_DIR_DSTODS,
 	    ("bad dir 0x%x:0x%x", wh->i_fc[0], wh->i_fc[1]));
 	KASSERT(ae == IEEE80211_MESH_AE_00 || ae == IEEE80211_MESH_AE_10,
 	    ("bad AE %d", ae));
 	if (ae == IEEE80211_MESH_AE_10) {	/* ucast w/ proxy */
 		const struct ieee80211_meshcntl_ae10 *mc10 =
 		    (const struct ieee80211_meshcntl_ae10 *) mc;
 		struct ieee80211_mesh_route *rt =
 		    ieee80211_mesh_rt_find(vap, mc10->mc_addr5);
 		/* check for proxy route to ourself */
 		return (rt != NULL &&
 		    (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY));
 	} else					/* ucast w/o proxy */
 		return IEEE80211_ADDR_EQ(wh->i_addr3, vap->iv_myaddr);
 }
 
 /*
  * Verifies transmitter, updates lifetime, precursor list and forwards data.
  * > 0 means we have forwarded data and no need to process locally
  * == 0 means we want to process locally (and we may have forwarded data
  * < 0 means there was an error and data should be discarded
  */
 static int
 mesh_recv_indiv_data_to_fwrd(struct ieee80211vap *vap, struct mbuf *m,
     struct ieee80211_frame *wh, const struct ieee80211_meshcntl *mc)
 {
 	struct ieee80211_qosframe_addr4 *qwh;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt_meshda, *rt_meshsa;
 
 	/* This is called from the RX path - don't hold this lock */
 	IEEE80211_TX_UNLOCK_ASSERT(vap->iv_ic);
 
 	qwh = (struct ieee80211_qosframe_addr4 *)wh;
 
 	/*
 	 * TODO:
 	 * o verify addr2 is  a legitimate transmitter
 	 * o lifetime of precursor of addr3 (addr2) is max(init, curr)
 	 * o lifetime of precursor of addr4 (nexthop) is max(init, curr)
 	 */
 
 	/* set lifetime of addr3 (meshDA) to initial value */
 	rt_meshda = ieee80211_mesh_rt_find(vap, qwh->i_addr3);
 	if (rt_meshda == NULL) {
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, qwh->i_addr2,
 		    "no route to meshDA(%6D)", qwh->i_addr3, ":");
 		/*
 		 * [Optional] any of the following three actions:
 		 * o silently discard 				[X]
 		 * o trigger a path discovery			[ ]
 		 * o inform TA that meshDA is unknown.		[ ]
 		 */
 		/* XXX: stats */
 		return (-1);
 	}
 
 	ieee80211_mesh_rt_update(rt_meshda, ticks_to_msecs(
 	    ms->ms_ppath->mpp_inact));
 
 	/* set lifetime of addr4 (meshSA) to initial value */
 	rt_meshsa = ieee80211_mesh_rt_find(vap, qwh->i_addr4);
 	KASSERT(rt_meshsa != NULL, ("no route"));
 	ieee80211_mesh_rt_update(rt_meshsa, ticks_to_msecs(
 	    ms->ms_ppath->mpp_inact));
 
 	mesh_forward(vap, m, mc);
 	return (1); /* dont process locally */
 }
 
 /*
  * Verifies transmitter, updates lifetime, precursor list and process data
  * locally, if data is proxy with AE = 10 it could mean data should go
  * on another mesh path or data should be forwarded to the DS.
  *
  * > 0 means we have forwarded data and no need to process locally
  * == 0 means we want to process locally (and we may have forwarded data
  * < 0 means there was an error and data should be discarded
  */
 static int
 mesh_recv_indiv_data_to_me(struct ieee80211vap *vap, struct mbuf *m,
     struct ieee80211_frame *wh, const struct ieee80211_meshcntl *mc)
 {
 	struct ieee80211_qosframe_addr4 *qwh;
 	const struct ieee80211_meshcntl_ae10 *mc10;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt;
 	int ae;
 
 	/* This is called from the RX path - don't hold this lock */
 	IEEE80211_TX_UNLOCK_ASSERT(vap->iv_ic);
 
 	qwh = (struct ieee80211_qosframe_addr4 *)wh;
 	mc10 = (const struct ieee80211_meshcntl_ae10 *)mc;
 
 	/*
 	 * TODO:
 	 * o verify addr2 is  a legitimate transmitter
 	 * o lifetime of precursor entry is max(init, curr)
 	 */
 
 	/* set lifetime of addr4 (meshSA) to initial value */
 	rt = ieee80211_mesh_rt_find(vap, qwh->i_addr4);
 	KASSERT(rt != NULL, ("no route"));
 	ieee80211_mesh_rt_update(rt, ticks_to_msecs(ms->ms_ppath->mpp_inact));
 	rt = NULL;
 
 	ae = mc10->mc_flags & IEEE80211_MESH_AE_MASK;
 	KASSERT(ae == IEEE80211_MESH_AE_00 ||
 	    ae == IEEE80211_MESH_AE_10, ("bad AE %d", ae));
 	if (ae == IEEE80211_MESH_AE_10) {
 		if (IEEE80211_ADDR_EQ(mc10->mc_addr5, qwh->i_addr3)) {
 			return (0); /* process locally */
 		}
 
 		rt =  ieee80211_mesh_rt_find(vap, mc10->mc_addr5);
 		if (rt != NULL &&
 		    (rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) &&
 		    (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY) == 0) {
 			/*
 			 * Forward on another mesh-path, according to
 			 * amendment as specified in 9.32.4.1
 			 */
 			IEEE80211_ADDR_COPY(qwh->i_addr3, mc10->mc_addr5);
 			mesh_forward(vap, m,
 			    (const struct ieee80211_meshcntl *)mc10);
 			return (1); /* dont process locally */
 		}
 		/*
 		 * All other cases: forward of MSDUs from the MBSS to DS indiv.
 		 * addressed according to 13.11.3.2.
 		 */
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_OUTPUT, qwh->i_addr2,
 		    "forward frame to DS, SA(%6D) DA(%6D)",
 		    mc10->mc_addr6, ":", mc10->mc_addr5, ":");
 	}
 	return (0); /* process locally */
 }
 
 /*
  * Try to forward the group addressed data on to other mesh STAs, and
  * also to the DS.
  *
  * > 0 means we have forwarded data and no need to process locally
  * == 0 means we want to process locally (and we may have forwarded data
  * < 0 means there was an error and data should be discarded
  */
 static int
 mesh_recv_group_data(struct ieee80211vap *vap, struct mbuf *m,
     struct ieee80211_frame *wh, const struct ieee80211_meshcntl *mc)
 {
 #define	MC01(mc)	((const struct ieee80211_meshcntl_ae01 *)mc)
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 
 	/* This is called from the RX path - don't hold this lock */
 	IEEE80211_TX_UNLOCK_ASSERT(vap->iv_ic);
 
 	mesh_forward(vap, m, mc);
 
 	if(mc->mc_ttl > 0) {
 		if (mc->mc_flags & IEEE80211_MESH_AE_01) {
 			/*
 			 * Forward of MSDUs from the MBSS to DS group addressed
 			 * (according to 13.11.3.2)
 			 * This happens by delivering the packet, and a bridge
 			 * will sent it on another port member.
 			 */
 			if (ms->ms_flags & IEEE80211_MESHFLAGS_GATE &&
 			    ms->ms_flags & IEEE80211_MESHFLAGS_FWD) {
 				IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH,
 				    MC01(mc)->mc_addr4, "%s",
 				    "forward from MBSS to the DS");
 			}
 		}
 	}
 	return (0); /* process locally */
 #undef	MC01
 }
 
 static int
 mesh_input(struct ieee80211_node *ni, struct mbuf *m,
     const struct ieee80211_rx_stats *rxs, int rssi, int nf)
 {
 #define	HAS_SEQ(type)	((type & 0x4) == 0)
 #define	MC01(mc)	((const struct ieee80211_meshcntl_ae01 *)mc)
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211_frame *wh;
 	const struct ieee80211_meshcntl *mc;
 	int hdrspace, meshdrlen, need_tap, error;
 	uint8_t dir, type, subtype, ae;
 	uint32_t seq;
 	const uint8_t *addr;
 	uint8_t qos[2];
 
 	KASSERT(ni != NULL, ("null node"));
 	ni->ni_inact = ni->ni_inact_reload;
 
 	need_tap = 1;			/* mbuf need to be tapped. */
 	type = -1;			/* undefined */
 
 	/* This is called from the RX path - don't hold this lock */
 	IEEE80211_TX_UNLOCK_ASSERT(ic);
 
 	if (m->m_pkthdr.len < sizeof(struct ieee80211_frame_min)) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 		    ni->ni_macaddr, NULL,
 		    "too short (1): len %u", m->m_pkthdr.len);
 		vap->iv_stats.is_rx_tooshort++;
 		goto out;
 	}
 	/*
 	 * Bit of a cheat here, we use a pointer for a 3-address
 	 * frame format but don't reference fields past outside
 	 * ieee80211_frame_min w/o first validating the data is
 	 * present.
 	*/
 	wh = mtod(m, struct ieee80211_frame *);
 
 	if ((wh->i_fc[0] & IEEE80211_FC0_VERSION_MASK) !=
 	    IEEE80211_FC0_VERSION_0) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 		    ni->ni_macaddr, NULL, "wrong version %x", wh->i_fc[0]);
 		vap->iv_stats.is_rx_badversion++;
 		goto err;
 	}
 	dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK;
 	type = wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK;
 	subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK;
 	if ((ic->ic_flags & IEEE80211_F_SCAN) == 0) {
 		IEEE80211_RSSI_LPF(ni->ni_avgrssi, rssi);
 		ni->ni_noise = nf;
 		if (HAS_SEQ(type)) {
 			uint8_t tid = ieee80211_gettid(wh);
 
 			if (IEEE80211_QOS_HAS_SEQ(wh) &&
 			    TID_TO_WME_AC(tid) >= WME_AC_VI)
 				ic->ic_wme.wme_hipri_traffic++;
 			if (! ieee80211_check_rxseq(ni, wh, wh->i_addr1, rxs))
 				goto out;
 		}
 	}
 #ifdef IEEE80211_DEBUG
 	/*
 	 * It's easier, but too expensive, to simulate different mesh
 	 * topologies by consulting the ACL policy very early, so do this
 	 * only under DEBUG.
 	 *
 	 * NB: this check is also done upon peering link initiation.
 	 */
 	if (vap->iv_acl != NULL && !vap->iv_acl->iac_check(vap, wh)) {
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ACL,
 		    wh, NULL, "%s", "disallowed by ACL");
 		vap->iv_stats.is_rx_acl++;
 		goto out;
 	}
 #endif
 	switch (type) {
 	case IEEE80211_FC0_TYPE_DATA:
 		if (ni == vap->iv_bss)
 			goto out;
 		if (ni->ni_mlstate != IEEE80211_NODE_MESH_ESTABLISHED) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_MESH,
 			    ni->ni_macaddr, NULL,
 			    "peer link not yet established (%d)",
 			    ni->ni_mlstate);
 			vap->iv_stats.is_mesh_nolink++;
 			goto out;
 		}
 		if (dir != IEEE80211_FC1_DIR_FROMDS &&
 		    dir != IEEE80211_FC1_DIR_DSTODS) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "data", "incorrect dir 0x%x", dir);
 			vap->iv_stats.is_rx_wrongdir++;
 			goto err;
 		}
 
 		/* All Mesh data frames are QoS subtype */
 		if (!HAS_SEQ(type)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "data", "incorrect subtype 0x%x", subtype);
 			vap->iv_stats.is_rx_badsubtype++;
 			goto err;
 		}
 
 		/*
 		 * Next up, any fragmentation.
 		 * XXX: we defrag before we even try to forward,
 		 * Mesh Control field is not present in sub-sequent
 		 * fragmented frames. This is in contrast to Draft 4.0.
 		 */
 		hdrspace = ieee80211_hdrspace(ic, wh);
 		if (!IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 			m = ieee80211_defrag(ni, m, hdrspace);
 			if (m == NULL) {
 				/* Fragment dropped or frame not complete yet */
 				goto out;
 			}
 		}
 		wh = mtod(m, struct ieee80211_frame *); /* NB: after defrag */
 
 		/*
 		 * Now we have a complete Mesh Data frame.
 		 */
 
 		/*
 		 * Only fromDStoDS data frames use 4 address qos frames
 		 * as specified in amendment. Otherwise addr4 is located
 		 * in the Mesh Control field and a 3 address qos frame
 		 * is used.
 		 */
 		*(uint16_t *)qos = *(uint16_t *)ieee80211_getqos(wh);
 
 		/*
 		 * NB: The mesh STA sets the Mesh Control Present
 		 * subfield to 1 in the Mesh Data frame containing
 		 * an unfragmented MSDU, an A-MSDU, or the first
 		 * fragment of an MSDU.
 		 * After defrag it should always be present.
 		 */
 		if (!(qos[1] & IEEE80211_QOS_MC)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_MESH,
 			    ni->ni_macaddr, NULL,
 			    "%s", "Mesh control field not present");
 			vap->iv_stats.is_rx_elem_missing++; /* XXX: kinda */
 			goto err;
 		}
 
 		/* pull up enough to get to the mesh control */
 		if (m->m_len < hdrspace + sizeof(struct ieee80211_meshcntl) &&
 		    (m = m_pullup(m, hdrspace +
 		        sizeof(struct ieee80211_meshcntl))) == NULL) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			    ni->ni_macaddr, NULL,
 			    "data too short: expecting %u", hdrspace);
 			vap->iv_stats.is_rx_tooshort++;
 			goto out;		/* XXX */
 		}
 		/*
 		 * Now calculate the full extent of the headers. Note
 		 * mesh_decap will pull up anything we didn't get
 		 * above when it strips the 802.11 headers.
 		 */
 		mc = (const struct ieee80211_meshcntl *)
 		    (mtod(m, const uint8_t *) + hdrspace);
 		ae = mc->mc_flags & IEEE80211_MESH_AE_MASK;
 		meshdrlen = sizeof(struct ieee80211_meshcntl) +
 		    ae * IEEE80211_ADDR_LEN;
 		hdrspace += meshdrlen;
 
 		/* pull complete hdrspace = ieee80211_hdrspace + meshcontrol */
 		if ((meshdrlen > sizeof(struct ieee80211_meshcntl)) &&
 		    (m->m_len < hdrspace) &&
 		    ((m = m_pullup(m, hdrspace)) == NULL)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			    ni->ni_macaddr, NULL,
 			    "data too short: expecting %u", hdrspace);
 			vap->iv_stats.is_rx_tooshort++;
 			goto out;		/* XXX */
 		}
 		/* XXX: are we sure there is no reallocating after m_pullup? */
 
 		seq = le32dec(mc->mc_seq);
 		if (IEEE80211_IS_MULTICAST(wh->i_addr1))
 			addr = wh->i_addr3;
 		else if (ae == IEEE80211_MESH_AE_01)
 			addr = MC01(mc)->mc_addr4;
 		else
 			addr = ((struct ieee80211_qosframe_addr4 *)wh)->i_addr4;
 		if (IEEE80211_ADDR_EQ(vap->iv_myaddr, addr)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    addr, "data", "%s", "not to me");
 			vap->iv_stats.is_rx_wrongbss++;	/* XXX kinda */
 			goto out;
 		}
 		if (mesh_checkpseq(vap, addr, seq) != 0) {
 			vap->iv_stats.is_rx_dup++;
 			goto out;
 		}
 
 		/* This code "routes" the frame to the right control path */
 		if (!IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 			if (IEEE80211_ADDR_EQ(vap->iv_myaddr, wh->i_addr3))
 				error =
 				    mesh_recv_indiv_data_to_me(vap, m, wh, mc);
 			else if (IEEE80211_IS_MULTICAST(wh->i_addr3))
 				error = mesh_recv_group_data(vap, m, wh, mc);
 			else
 				error = mesh_recv_indiv_data_to_fwrd(vap, m,
 				    wh, mc);
 		} else
 			error = mesh_recv_group_data(vap, m, wh, mc);
 		if (error < 0)
 			goto err;
 		else if (error > 0)
 			goto out;
 
 		if (ieee80211_radiotap_active_vap(vap))
 			ieee80211_radiotap_rx(vap, m);
 		need_tap = 0;
 
 		/*
 		 * Finally, strip the 802.11 header.
 		 */
 		m = mesh_decap(vap, m, hdrspace, meshdrlen);
 		if (m == NULL) {
 			/* XXX mask bit to check for both */
 			/* don't count Null data frames as errors */
 			if (subtype == IEEE80211_FC0_SUBTYPE_NODATA ||
 			    subtype == IEEE80211_FC0_SUBTYPE_QOS_NULL)
 				goto out;
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    ni->ni_macaddr, "data", "%s", "decap error");
 			vap->iv_stats.is_rx_decap++;
 			IEEE80211_NODE_STAT(ni, rx_decap);
 			goto err;
 		}
 		if (qos[0] & IEEE80211_QOS_AMSDU) {
 			m = ieee80211_decap_amsdu(ni, m);
 			if (m == NULL)
 				return IEEE80211_FC0_TYPE_DATA;
 		}
 		ieee80211_deliver_data(vap, ni, m);
 		return type;
 	case IEEE80211_FC0_TYPE_MGT:
 		vap->iv_stats.is_rx_mgmt++;
 		IEEE80211_NODE_STAT(ni, rx_mgmt);
 		if (dir != IEEE80211_FC1_DIR_NODS) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "mgt", "incorrect dir 0x%x", dir);
 			vap->iv_stats.is_rx_wrongdir++;
 			goto err;
 		}
 		if (m->m_pkthdr.len < sizeof(struct ieee80211_frame)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			    ni->ni_macaddr, "mgt", "too short: len %u",
 			    m->m_pkthdr.len);
 			vap->iv_stats.is_rx_tooshort++;
 			goto out;
 		}
 #ifdef IEEE80211_DEBUG
 		if ((ieee80211_msg_debug(vap) && 
 		    (vap->iv_ic->ic_flags & IEEE80211_F_SCAN)) ||
 		    ieee80211_msg_dumppkts(vap)) {
 			if_printf(ifp, "received %s from %s rssi %d\n",
 			    ieee80211_mgt_subtype_name(subtype),
 			    ether_sprintf(wh->i_addr2), rssi);
 		}
 #endif
 		if (wh->i_fc[1] & IEEE80211_FC1_PROTECTED) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "WEP set but not permitted");
 			vap->iv_stats.is_rx_mgtdiscard++; /* XXX */
 			goto out;
 		}
 		vap->iv_recv_mgmt(ni, m, subtype, rxs, rssi, nf);
 		goto out;
 	case IEEE80211_FC0_TYPE_CTL:
 		vap->iv_stats.is_rx_ctl++;
 		IEEE80211_NODE_STAT(ni, rx_ctrl);
 		goto out;
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    wh, "bad", "frame type 0x%x", type);
 		/* should not come here */
 		break;
 	}
 err:
 	if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 out:
 	if (m != NULL) {
 		if (need_tap && ieee80211_radiotap_active_vap(vap))
 			ieee80211_radiotap_rx(vap, m);
 		m_freem(m);
 	}
 	return type;
 #undef	HAS_SEQ
 #undef	MC01
 }
 
 static void
 mesh_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0, int subtype,
     const struct ieee80211_rx_stats *rxs, int rssi, int nf)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_channel *rxchan = ic->ic_curchan;
 	struct ieee80211_frame *wh;
 	struct ieee80211_mesh_route *rt;
 	uint8_t *frm, *efrm;
 
 	wh = mtod(m0, struct ieee80211_frame *);
 	frm = (uint8_t *)&wh[1];
 	efrm = mtod(m0, uint8_t *) + m0->m_len;
 	switch (subtype) {
 	case IEEE80211_FC0_SUBTYPE_PROBE_RESP:
 	case IEEE80211_FC0_SUBTYPE_BEACON:
 	{
 		struct ieee80211_scanparams scan;
 		struct ieee80211_channel *c;
 		/*
 		 * We process beacon/probe response
 		 * frames to discover neighbors.
 		 */
 		if (rxs != NULL) {
 			c = ieee80211_lookup_channel_rxstatus(vap, rxs);
 			if (c != NULL)
 				rxchan = c;
 		}
 		if (ieee80211_parse_beacon(ni, m0, rxchan, &scan) != 0)
 			return;
 		/*
 		 * Count frame now that we know it's to be processed.
 		 */
 		if (subtype == IEEE80211_FC0_SUBTYPE_BEACON) {
 			vap->iv_stats.is_rx_beacon++;	/* XXX remove */
 			IEEE80211_NODE_STAT(ni, rx_beacons);
 		} else
 			IEEE80211_NODE_STAT(ni, rx_proberesp);
 		/*
 		 * If scanning, just pass information to the scan module.
 		 */
 		if (ic->ic_flags & IEEE80211_F_SCAN) {
 			if (ic->ic_flags_ext & IEEE80211_FEXT_PROBECHAN) {
 				/*
 				 * Actively scanning a channel marked passive;
 				 * send a probe request now that we know there
 				 * is 802.11 traffic present.
 				 *
 				 * XXX check if the beacon we recv'd gives
 				 * us what we need and suppress the probe req
 				 */
 				ieee80211_probe_curchan(vap, 1);
 				ic->ic_flags_ext &= ~IEEE80211_FEXT_PROBECHAN;
 			}
 			ieee80211_add_scan(vap, rxchan, &scan, wh,
 			    subtype, rssi, nf);
 			return;
 		}
 
 		/* The rest of this code assumes we are running */
 		if (vap->iv_state != IEEE80211_S_RUN)
 			return;
 		/*
 		 * Ignore non-mesh STAs.
 		 */
 		if ((scan.capinfo &
 		     (IEEE80211_CAPINFO_ESS|IEEE80211_CAPINFO_IBSS)) ||
 		    scan.meshid == NULL || scan.meshconf == NULL) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "beacon", "%s", "not a mesh sta");
 			vap->iv_stats.is_mesh_wrongmesh++;
 			return;
 		}
 		/*
 		 * Ignore STAs for other mesh networks.
 		 */
 		if (memcmp(scan.meshid+2, ms->ms_id, ms->ms_idlen) != 0 ||
 		    mesh_verify_meshconf(vap, scan.meshconf)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "beacon", "%s", "not for our mesh");
 			vap->iv_stats.is_mesh_wrongmesh++;
 			return;
 		}
 		/*
 		 * Peer only based on the current ACL policy.
 		 */
 		if (vap->iv_acl != NULL && !vap->iv_acl->iac_check(vap, wh)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_ACL,
 			    wh, NULL, "%s", "disallowed by ACL");
 			vap->iv_stats.is_rx_acl++;
 			return;
 		}
 		/*
 		 * Do neighbor discovery.
 		 */
 		if (!IEEE80211_ADDR_EQ(wh->i_addr2, ni->ni_macaddr)) {
 			/*
 			 * Create a new entry in the neighbor table.
 			 */
 			ni = ieee80211_add_neighbor(vap, wh, &scan);
 		}
 		/*
 		 * Automatically peer with discovered nodes if possible.
 		 */
 		if (ni != vap->iv_bss &&
 		    (ms->ms_flags & IEEE80211_MESHFLAGS_AP)) {
 			switch (ni->ni_mlstate) {
 			case IEEE80211_NODE_MESH_IDLE:
 			{
 				uint16_t args[1];
 
 				/* Wait for backoff callout to reset counter */
 				if (ni->ni_mlhcnt >= ieee80211_mesh_maxholding)
 					return;
 
 				ni->ni_mlpid = mesh_generateid(vap);
 				if (ni->ni_mlpid == 0)
 					return;
 				mesh_linkchange(ni, IEEE80211_NODE_MESH_OPENSNT);
 				args[0] = ni->ni_mlpid;
 				ieee80211_send_action(ni,
 				IEEE80211_ACTION_CAT_SELF_PROT,
 				IEEE80211_ACTION_MESHPEERING_OPEN, args);
 				ni->ni_mlrcnt = 0;
 				mesh_peer_timeout_setup(ni);
 				break;
 			}
 			case IEEE80211_NODE_MESH_ESTABLISHED:
 			{
 				/*
 				 * Valid beacon from a peer mesh STA
 				 * bump TA lifetime
 				 */
 				rt = ieee80211_mesh_rt_find(vap, wh->i_addr2);
 				if(rt != NULL) {
 					ieee80211_mesh_rt_update(rt,
 					    ticks_to_msecs(
 					    ms->ms_ppath->mpp_inact));
 				}
 				break;
 			}
 			default:
 				break; /* ignore */
 			}
 		}
 		break;
 	}
 	case IEEE80211_FC0_SUBTYPE_PROBE_REQ:
 	{
 		uint8_t *ssid, *meshid, *rates, *xrates;
 
 		if (vap->iv_state != IEEE80211_S_RUN) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "wrong state %s",
 			    ieee80211_state_name[vap->iv_state]);
 			vap->iv_stats.is_rx_mgtdiscard++;
 			return;
 		}
 		if (IEEE80211_IS_MULTICAST(wh->i_addr2)) {
 			/* frame must be directed */
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "not unicast");
 			vap->iv_stats.is_rx_mgtdiscard++;	/* XXX stat */
 			return;
 		}
 		/*
 		 * prreq frame format
 		 *      [tlv] ssid
 		 *      [tlv] supported rates
 		 *      [tlv] extended supported rates
 		 *	[tlv] mesh id
 		 */
 		ssid = meshid = rates = xrates = NULL;
 		while (efrm - frm > 1) {
 			IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1] + 2, return);
 			switch (*frm) {
 			case IEEE80211_ELEMID_SSID:
 				ssid = frm;
 				break;
 			case IEEE80211_ELEMID_RATES:
 				rates = frm;
 				break;
 			case IEEE80211_ELEMID_XRATES:
 				xrates = frm;
 				break;
 			case IEEE80211_ELEMID_MESHID:
 				meshid = frm;
 				break;
 			}
 			frm += frm[1] + 2;
 		}
 		IEEE80211_VERIFY_ELEMENT(ssid, IEEE80211_NWID_LEN, return);
 		IEEE80211_VERIFY_ELEMENT(rates, IEEE80211_RATE_MAXSIZE, return);
 		if (xrates != NULL)
 			IEEE80211_VERIFY_ELEMENT(xrates,
 			    IEEE80211_RATE_MAXSIZE - rates[1], return);
 		if (meshid != NULL) {
 			IEEE80211_VERIFY_ELEMENT(meshid,
 			    IEEE80211_MESHID_LEN, return);
 			/* NB: meshid, not ssid */
 			IEEE80211_VERIFY_SSID(vap->iv_bss, meshid, return);
 		}
 
 		/* XXX find a better class or define it's own */
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_INPUT, wh->i_addr2,
 		    "%s", "recv probe req");
 		/*
 		 * Some legacy 11b clients cannot hack a complete
 		 * probe response frame.  When the request includes
 		 * only a bare-bones rate set, communicate this to
 		 * the transmit side.
 		 */
 		ieee80211_send_proberesp(vap, wh->i_addr2, 0);
 		break;
 	}
 
 	case IEEE80211_FC0_SUBTYPE_ACTION:
 	case IEEE80211_FC0_SUBTYPE_ACTION_NOACK:
 		if (ni == vap->iv_bss) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "unknown node");
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else if (!IEEE80211_ADDR_EQ(vap->iv_myaddr, wh->i_addr1) &&
 		    !IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "not for us");
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else if (vap->iv_state != IEEE80211_S_RUN) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "wrong state %s",
 			    ieee80211_state_name[vap->iv_state]);
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else {
 			if (ieee80211_parse_action(ni, m0) == 0)
 				(void)ic->ic_recv_action(ni, wh, frm, efrm);
 		}
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_ASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_ASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_TIMING_ADV:
 	case IEEE80211_FC0_SUBTYPE_ATIM:
 	case IEEE80211_FC0_SUBTYPE_DISASSOC:
 	case IEEE80211_FC0_SUBTYPE_AUTH:
 	case IEEE80211_FC0_SUBTYPE_DEAUTH:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 		    wh, NULL, "%s", "not handled");
 		vap->iv_stats.is_rx_mgtdiscard++;
 		break;
 
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    wh, "mgt", "subtype 0x%x not handled", subtype);
 		vap->iv_stats.is_rx_badsubtype++;
 		break;
 	}
 }
 
 static void
 mesh_recv_ctl(struct ieee80211_node *ni, struct mbuf *m, int subtype)
 {
 
 	switch (subtype) {
 	case IEEE80211_FC0_SUBTYPE_BAR:
 		ieee80211_recv_bar(ni, m);
 		break;
 	}
 }
 
 /*
  * Parse meshpeering action ie's for MPM frames
  */
 static const struct ieee80211_meshpeer_ie *
 mesh_parse_meshpeering_action(struct ieee80211_node *ni,
 	const struct ieee80211_frame *wh,	/* XXX for VERIFY_LENGTH */
 	const uint8_t *frm, const uint8_t *efrm,
 	struct ieee80211_meshpeer_ie *mp, uint8_t subtype)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	const struct ieee80211_meshpeer_ie *mpie;
 	uint16_t args[3];
 	const uint8_t *meshid, *meshconf;
 	uint8_t sendclose = 0; /* 1 = MPM frame rejected, close will be sent */
 
 	meshid = meshconf = NULL;
 	while (efrm - frm > 1) {
 		IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1] + 2, return NULL);
 		switch (*frm) {
 		case IEEE80211_ELEMID_MESHID:
 			meshid = frm;
 			break;
 		case IEEE80211_ELEMID_MESHCONF:
 			meshconf = frm;
 			break;
 		case IEEE80211_ELEMID_MESHPEER:
 			mpie = (const struct ieee80211_meshpeer_ie *) frm;
 			memset(mp, 0, sizeof(*mp));
 			mp->peer_len = mpie->peer_len;
 			mp->peer_proto = le16dec(&mpie->peer_proto);
 			mp->peer_llinkid = le16dec(&mpie->peer_llinkid);
 			switch (subtype) {
 			case IEEE80211_ACTION_MESHPEERING_CONFIRM:
 				mp->peer_linkid =
 				    le16dec(&mpie->peer_linkid);
 				break;
 			case IEEE80211_ACTION_MESHPEERING_CLOSE:
 				/* NB: peer link ID is optional */
 				if (mpie->peer_len ==
 				    (IEEE80211_MPM_BASE_SZ + 2)) {
 					mp->peer_linkid = 0;
 					mp->peer_rcode =
 					    le16dec(&mpie->peer_linkid);
 				} else {
 					mp->peer_linkid =
 					    le16dec(&mpie->peer_linkid);
 					mp->peer_rcode =
 					    le16dec(&mpie->peer_rcode);
 				}
 				break;
 			}
 			break;
 		}
 		frm += frm[1] + 2;
 	}
 
 	/*
 	 * Verify the contents of the frame.
 	 * If it fails validation, close the peer link.
 	 */
 	if (mesh_verify_meshpeer(vap, subtype, (const uint8_t *)mp)) {
 		sendclose = 1;
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 		    wh, NULL, "%s", "MPM validation failed");
 	}
 
 	/* If meshid is not the same reject any frames type. */
 	if (sendclose == 0 && mesh_verify_meshid(vap, meshid)) {
 		sendclose = 1;
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 		    wh, NULL, "%s", "not for our mesh");
 		if (subtype == IEEE80211_ACTION_MESHPEERING_CLOSE) {
 			/*
 			 * Standard not clear about this, if we dont ignore
 			 * there will be an endless loop between nodes sending
 			 * CLOSE frames between each other with wrong meshid.
 			 * Discard and timers will bring FSM to IDLE state.
 			 */
 			return NULL;
 		}
 	}
 	
 	/*
 	 * Close frames are accepted if meshid is the same.
 	 * Verify the other two types.
 	 */
 	if (sendclose == 0 && subtype != IEEE80211_ACTION_MESHPEERING_CLOSE &&
 	    mesh_verify_meshconf(vap, meshconf)) {
 		sendclose = 1;
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 		    wh, NULL, "%s", "configuration missmatch");
 	}
 
 	if (sendclose) {
 		vap->iv_stats.is_rx_mgtdiscard++;
 		switch (ni->ni_mlstate) {
 		case IEEE80211_NODE_MESH_IDLE:
 		case IEEE80211_NODE_MESH_ESTABLISHED:
 		case IEEE80211_NODE_MESH_HOLDING:
 			/* ignore */
 			break;
 		case IEEE80211_NODE_MESH_OPENSNT:
 		case IEEE80211_NODE_MESH_OPENRCV:
 		case IEEE80211_NODE_MESH_CONFIRMRCV:
 			args[0] = ni->ni_mlpid;
 			args[1] = ni->ni_mllid;
 			/* Reason codes for rejection */
 			switch (subtype) {
 			case IEEE80211_ACTION_MESHPEERING_OPEN:
 				args[2] = IEEE80211_REASON_MESH_CPVIOLATION;
 				break;
 			case IEEE80211_ACTION_MESHPEERING_CONFIRM:
 				args[2] = IEEE80211_REASON_MESH_INCONS_PARAMS;
 				break;
 			}
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_CLOSE,
 			    args);
 			mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING);
 			mesh_peer_timeout_setup(ni);
 			break;
 		}
 		return NULL;
 	}
 	
 	return (const struct ieee80211_meshpeer_ie *) mp;
 }
 
 static int
 mesh_recv_action_meshpeering_open(struct ieee80211_node *ni,
 	const struct ieee80211_frame *wh,
 	const uint8_t *frm, const uint8_t *efrm)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_meshpeer_ie ie;
 	const struct ieee80211_meshpeer_ie *meshpeer;
 	uint16_t args[3];
 
 	/* +2+2 for action + code + capabilites */
 	meshpeer = mesh_parse_meshpeering_action(ni, wh, frm+2+2, efrm, &ie,
 	    IEEE80211_ACTION_MESHPEERING_OPEN);
 	if (meshpeer == NULL) {
 		return 0;
 	}
 
 	/* XXX move up */
 	IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni,
 	    "recv PEER OPEN, lid 0x%x", meshpeer->peer_llinkid);
 
 	switch (ni->ni_mlstate) {
 	case IEEE80211_NODE_MESH_IDLE:
 		/* Reject open request if reached our maximum neighbor count */
 		if (ms->ms_neighbors >= IEEE80211_MESH_MAX_NEIGHBORS) {
 			args[0] = meshpeer->peer_llinkid;
 			args[1] = 0;
 			args[2] = IEEE80211_REASON_MESH_MAX_PEERS;
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_CLOSE,
 			    args);
 			/* stay in IDLE state */
 			return (0);
 		}
 		/* Open frame accepted */
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_OPENRCV);
 		ni->ni_mllid = meshpeer->peer_llinkid;
 		ni->ni_mlpid = mesh_generateid(vap);
 		if (ni->ni_mlpid == 0)
 			return 0;		/* XXX */
 		args[0] = ni->ni_mlpid;
 		/* Announce we're open too... */
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_OPEN, args);
 		/* ...and confirm the link. */
 		args[0] = ni->ni_mlpid;
 		args[1] = ni->ni_mllid;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 		    args);
 		mesh_peer_timeout_setup(ni);
 		break;
 	case IEEE80211_NODE_MESH_OPENRCV:
 		/* Wrong Link ID */
 		if (ni->ni_mllid != meshpeer->peer_llinkid) {
 			args[0] = ni->ni_mllid;
 			args[1] = ni->ni_mlpid;
 			args[2] = IEEE80211_REASON_PEER_LINK_CANCELED;
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_CLOSE,
 			    args);
 			mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING);
 			mesh_peer_timeout_setup(ni);
 			break;
 		}
 		/* Duplicate open, confirm again. */
 		args[0] = ni->ni_mlpid;
 		args[1] = ni->ni_mllid;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 		    args);
 		break;
 	case IEEE80211_NODE_MESH_OPENSNT:
 		ni->ni_mllid = meshpeer->peer_llinkid;
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_OPENRCV);
 		args[0] = ni->ni_mlpid;
 		args[1] = ni->ni_mllid;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 		    args);
 		/* NB: don't setup/clear any timeout */
 		break;
 	case IEEE80211_NODE_MESH_CONFIRMRCV:
 		if (ni->ni_mlpid != meshpeer->peer_linkid ||
 		    ni->ni_mllid != meshpeer->peer_llinkid) {
 			args[0] = ni->ni_mlpid;
 			args[1] = ni->ni_mllid;
 			args[2] = IEEE80211_REASON_PEER_LINK_CANCELED;
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_CLOSE,
 			    args);
 			mesh_linkchange(ni,
 			    IEEE80211_NODE_MESH_HOLDING);
 			mesh_peer_timeout_setup(ni);
 			break;
 		}
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_ESTABLISHED);
 		ni->ni_mllid = meshpeer->peer_llinkid;
 		args[0] = ni->ni_mlpid;
 		args[1] = ni->ni_mllid;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 		    args);
 		mesh_peer_timeout_stop(ni);
 		break;
 	case IEEE80211_NODE_MESH_ESTABLISHED:
 		if (ni->ni_mllid != meshpeer->peer_llinkid) {
 			args[0] = ni->ni_mllid;
 			args[1] = ni->ni_mlpid;
 			args[2] = IEEE80211_REASON_PEER_LINK_CANCELED;
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_CLOSE,
 			    args);
 			mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING);
 			mesh_peer_timeout_setup(ni);
 			break;
 		}
 		args[0] = ni->ni_mlpid;
 		args[1] = ni->ni_mllid;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 		    args);
 		break;
 	case IEEE80211_NODE_MESH_HOLDING:
 		args[0] = ni->ni_mlpid;
 		args[1] = meshpeer->peer_llinkid;
 		/* Standard not clear about what the reaason code should be */
 		args[2] = IEEE80211_REASON_PEER_LINK_CANCELED;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CLOSE,
 		    args);
 		break;
 	}
 	return 0;
 }
 
 static int
 mesh_recv_action_meshpeering_confirm(struct ieee80211_node *ni,
 	const struct ieee80211_frame *wh,
 	const uint8_t *frm, const uint8_t *efrm)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_meshpeer_ie ie;
 	const struct ieee80211_meshpeer_ie *meshpeer;
 	uint16_t args[3];
 
 	/* +2+2+2+2 for action + code + capabilites + status code + AID */
 	meshpeer = mesh_parse_meshpeering_action(ni, wh, frm+2+2+2+2, efrm, &ie,
 	    IEEE80211_ACTION_MESHPEERING_CONFIRM);
 	if (meshpeer == NULL) {
 		return 0;
 	}
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni,
 	    "recv PEER CONFIRM, local id 0x%x, peer id 0x%x",
 	    meshpeer->peer_llinkid, meshpeer->peer_linkid);
 
 	switch (ni->ni_mlstate) {
 	case IEEE80211_NODE_MESH_OPENRCV:
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_ESTABLISHED);
 		mesh_peer_timeout_stop(ni);
 		break;
 	case IEEE80211_NODE_MESH_OPENSNT:
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_CONFIRMRCV);
 		mesh_peer_timeout_setup(ni);
 		break;
 	case IEEE80211_NODE_MESH_HOLDING:
 		args[0] = ni->ni_mlpid;
 		args[1] = meshpeer->peer_llinkid;
 		/* Standard not clear about what the reaason code should be */
 		args[2] = IEEE80211_REASON_PEER_LINK_CANCELED;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CLOSE,
 		    args);
 		break;
 	case IEEE80211_NODE_MESH_CONFIRMRCV:
 		if (ni->ni_mllid != meshpeer->peer_llinkid) {
 			args[0] = ni->ni_mlpid;
 			args[1] = ni->ni_mllid;
 			args[2] = IEEE80211_REASON_PEER_LINK_CANCELED;
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_CLOSE,
 			    args);
 			mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING);
 			mesh_peer_timeout_setup(ni);
 		}
 		break;
 	default:
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 		    wh, NULL, "received confirm in invalid state %d",
 		    ni->ni_mlstate);
 		vap->iv_stats.is_rx_mgtdiscard++;
 		break;
 	}
 	return 0;
 }
 
 static int
 mesh_recv_action_meshpeering_close(struct ieee80211_node *ni,
 	const struct ieee80211_frame *wh,
 	const uint8_t *frm, const uint8_t *efrm)
 {
 	struct ieee80211_meshpeer_ie ie;
 	const struct ieee80211_meshpeer_ie *meshpeer;
 	uint16_t args[3];
 
 	/* +2 for action + code */
 	meshpeer = mesh_parse_meshpeering_action(ni, wh, frm+2, efrm, &ie,
 	    IEEE80211_ACTION_MESHPEERING_CLOSE);
 	if (meshpeer == NULL) {
 		return 0;
 	}
 
 	/*
 	 * XXX: check reason code, for example we could receive
 	 * IEEE80211_REASON_MESH_MAX_PEERS then we should not attempt
 	 * to peer again.
 	 */
 
 	IEEE80211_NOTE(ni->ni_vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 	    ni, "%s", "recv PEER CLOSE");
 
 	switch (ni->ni_mlstate) {
 	case IEEE80211_NODE_MESH_IDLE:
 		/* ignore */
 		break;
 	case IEEE80211_NODE_MESH_OPENRCV:
 	case IEEE80211_NODE_MESH_OPENSNT:
 	case IEEE80211_NODE_MESH_CONFIRMRCV:
 	case IEEE80211_NODE_MESH_ESTABLISHED:
 		args[0] = ni->ni_mlpid;
 		args[1] = ni->ni_mllid;
 		args[2] = IEEE80211_REASON_MESH_CLOSE_RCVD;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CLOSE,
 		    args);
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING);
 		mesh_peer_timeout_setup(ni);
 		break;
 	case IEEE80211_NODE_MESH_HOLDING:
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_IDLE);
 		mesh_peer_timeout_stop(ni);
 		break;
 	}
 	return 0;
 }
 
 /*
  * Link Metric handling.
  */
 static int
 mesh_recv_action_meshlmetric(struct ieee80211_node *ni,
 	const struct ieee80211_frame *wh,
 	const uint8_t *frm, const uint8_t *efrm)
 {
 	const struct ieee80211_meshlmetric_ie *ie =
 	    (const struct ieee80211_meshlmetric_ie *)
 	    (frm+2); /* action + code */
 	struct ieee80211_meshlmetric_ie lm_rep;
 	
 	if (ie->lm_flags & IEEE80211_MESH_LMETRIC_FLAGS_REQ) {
 		lm_rep.lm_flags = 0;
 		lm_rep.lm_metric = mesh_airtime_calc(ni);
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_MESH,
 		    IEEE80211_ACTION_MESH_LMETRIC,
 		    &lm_rep);
 	}
 	/* XXX: else do nothing for now */
 	return 0;
 }
 
 /*
  * Parse meshgate action ie's for GANN frames.
  * Returns -1 if parsing fails, otherwise 0.
  */
 static int
 mesh_parse_meshgate_action(struct ieee80211_node *ni,
     const struct ieee80211_frame *wh,	/* XXX for VERIFY_LENGTH */
     struct ieee80211_meshgann_ie *ie, const uint8_t *frm, const uint8_t *efrm)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	const struct ieee80211_meshgann_ie *gannie;
 
 	while (efrm - frm > 1) {
 		IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1] + 2, return -1);
 		switch (*frm) {
 		case IEEE80211_ELEMID_MESHGANN:
 			gannie = (const struct ieee80211_meshgann_ie *) frm;
 			memset(ie, 0, sizeof(*ie));
 			ie->gann_ie = gannie->gann_ie;
 			ie->gann_len = gannie->gann_len;
 			ie->gann_flags = gannie->gann_flags;
 			ie->gann_hopcount = gannie->gann_hopcount;
 			ie->gann_ttl = gannie->gann_ttl;
 			IEEE80211_ADDR_COPY(ie->gann_addr, gannie->gann_addr);
 			ie->gann_seq = le32dec(&gannie->gann_seq);
 			ie->gann_interval = le16dec(&gannie->gann_interval);
 			break;
 		}
 		frm += frm[1] + 2;
 	}
 
 	return 0;
 }
 
 /*
  * Mesh Gate Announcement handling.
  */
 static int
 mesh_recv_action_meshgate(struct ieee80211_node *ni,
 	const struct ieee80211_frame *wh,
 	const uint8_t *frm, const uint8_t *efrm)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_gate_route *gr, *next;
 	struct ieee80211_mesh_route *rt_gate;
 	struct ieee80211_meshgann_ie pgann;
 	struct ieee80211_meshgann_ie ie;
 	int found = 0;
 
 	/* +2 for action + code */
 	if (mesh_parse_meshgate_action(ni, wh, &ie, frm+2, efrm) != 0) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_MESH,
 		    ni->ni_macaddr, NULL, "%s",
 		    "GANN parsing failed");
 		vap->iv_stats.is_rx_mgtdiscard++;
 		return (0);
 	}
 
 	if (IEEE80211_ADDR_EQ(vap->iv_myaddr, ie.gann_addr))
 		return 0;
 
 	IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, ni->ni_macaddr,
 	    "received GANN, meshgate: %6D (seq %u)", ie.gann_addr, ":",
 	    ie.gann_seq);
 
 	if (ms == NULL)
 		return (0);
 	MESH_RT_LOCK(ms);
 	TAILQ_FOREACH_SAFE(gr, &ms->ms_known_gates, gr_next, next) {
 		if (!IEEE80211_ADDR_EQ(gr->gr_addr, ie.gann_addr))
 			continue;
 		if (ie.gann_seq <= gr->gr_lastseq) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_MESH,
 			    ni->ni_macaddr, NULL,
 			    "GANN old seqno %u <= %u",
 			    ie.gann_seq, gr->gr_lastseq);
 			MESH_RT_UNLOCK(ms);
 			return (0);
 		}
 		/* corresponding mesh gate found & GANN accepted */
 		found = 1;
 		break;
 
 	}
 	if (found == 0) {
 		/* this GANN is from a new mesh Gate add it to known table. */
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, ie.gann_addr,
 		    "stored new GANN information, seq %u.", ie.gann_seq);
 		gr = IEEE80211_MALLOC(ALIGN(sizeof(struct ieee80211_mesh_gate_route)),
 		    M_80211_MESH_GT_RT,
 		    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 		IEEE80211_ADDR_COPY(gr->gr_addr, ie.gann_addr);
 		TAILQ_INSERT_TAIL(&ms->ms_known_gates, gr, gr_next);
 	}
 	gr->gr_lastseq = ie.gann_seq;
 
 	/* check if we have a path to this gate */
 	rt_gate = mesh_rt_find_locked(ms, gr->gr_addr);
 	if (rt_gate != NULL &&
 	    rt_gate->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) {
 		gr->gr_route = rt_gate;
 		rt_gate->rt_flags |= IEEE80211_MESHRT_FLAGS_GATE;
 	}
 
 	MESH_RT_UNLOCK(ms);
 
 	/* popagate only if decremented ttl >= 1 && forwarding is enabled */
 	if ((ie.gann_ttl - 1) < 1 && !(ms->ms_flags & IEEE80211_MESHFLAGS_FWD))
 		return 0;
 	pgann.gann_flags = ie.gann_flags; /* Reserved */
 	pgann.gann_hopcount = ie.gann_hopcount + 1;
 	pgann.gann_ttl = ie.gann_ttl - 1;
 	IEEE80211_ADDR_COPY(pgann.gann_addr, ie.gann_addr);
 	pgann.gann_seq = ie.gann_seq;
 	pgann.gann_interval = ie.gann_interval;
 
 	IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, ie.gann_addr,
 	    "%s", "propagate GANN");
 
 	ieee80211_send_action(vap->iv_bss, IEEE80211_ACTION_CAT_MESH,
 	    IEEE80211_ACTION_MESH_GANN, &pgann);
 
 	return 0;
 }
 
 static int
 mesh_send_action(struct ieee80211_node *ni,
     const uint8_t sa[IEEE80211_ADDR_LEN],
     const uint8_t da[IEEE80211_ADDR_LEN],
     struct mbuf *m)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_bpf_params params;
 	int ret;
 
 	KASSERT(ni != NULL, ("null node"));
 
 	if (vap->iv_state == IEEE80211_S_CAC) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT, ni,
 		    "block %s frame in CAC state", "Mesh action");
 		vap->iv_stats.is_tx_badstate++;
 		ieee80211_free_node(ni);
 		m_freem(m);
 		return EIO;		/* XXX */
 	}
 
 	M_PREPEND(m, sizeof(struct ieee80211_frame), M_NOWAIT);
 	if (m == NULL) {
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 
 	IEEE80211_TX_LOCK(ic);
 	ieee80211_send_setup(ni, m,
 	     IEEE80211_FC0_TYPE_MGT | IEEE80211_FC0_SUBTYPE_ACTION,
 	     IEEE80211_NONQOS_TID, sa, da, sa);
 	m->m_flags |= M_ENCAP;		/* mark encapsulated */
 
 	memset(&params, 0, sizeof(params));
 	params.ibp_pri = WME_AC_VO;
 	params.ibp_rate0 = ni->ni_txparms->mgmtrate;
 	if (IEEE80211_IS_MULTICAST(da))
 		params.ibp_try0 = 1;
 	else
 		params.ibp_try0 = ni->ni_txparms->maxretry;
 	params.ibp_power = ni->ni_txpower;
 
 	IEEE80211_NODE_STAT(ni, tx_mgmt);
 
 	ret = ieee80211_raw_output(vap, ni, m, &params);
 	IEEE80211_TX_UNLOCK(ic);
 	return (ret);
 }
 
 #define	ADDSHORT(frm, v) do {			\
 	frm[0] = (v) & 0xff;			\
 	frm[1] = (v) >> 8;			\
 	frm += 2;				\
 } while (0)
 #define	ADDWORD(frm, v) do {			\
 	frm[0] = (v) & 0xff;			\
 	frm[1] = ((v) >> 8) & 0xff;		\
 	frm[2] = ((v) >> 16) & 0xff;		\
 	frm[3] = ((v) >> 24) & 0xff;		\
 	frm += 4;				\
 } while (0)
 
 static int
 mesh_send_action_meshpeering_open(struct ieee80211_node *ni,
 	int category, int action, void *args0)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	uint16_t *args = args0;
 	const struct ieee80211_rateset *rs;
 	struct mbuf *m;
 	uint8_t *frm;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni,
 	    "send PEER OPEN action: localid 0x%x", args[0]);
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 	    "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__,
 	    ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1);
 	ieee80211_ref_node(ni);
 
 	m = ieee80211_getmgtframe(&frm,
 	    ic->ic_headroom + sizeof(struct ieee80211_frame),
 	    sizeof(uint16_t)	/* action+category */
 	    + sizeof(uint16_t)	/* capabilites */
 	    + 2 + IEEE80211_RATE_SIZE
 	    + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE)
 	    + 2 + IEEE80211_MESHID_LEN
 	    + sizeof(struct ieee80211_meshconf_ie)
 	    + sizeof(struct ieee80211_meshpeer_ie)
 	);
 	if (m != NULL) {
 		/*
 		 * mesh peer open action frame format:
 		 *   [1] category
 		 *   [1] action
 		 *   [2] capabilities
 		 *   [tlv] rates
 		 *   [tlv] xrates
 		 *   [tlv] mesh id
 		 *   [tlv] mesh conf
 		 *   [tlv] mesh peer link mgmt
 		 */
 		*frm++ = category;
 		*frm++ = action;
 		ADDSHORT(frm, ieee80211_getcapinfo(vap, ni->ni_chan));
 		rs = ieee80211_get_suprates(ic, ic->ic_curchan);
 		frm = ieee80211_add_rates(frm, rs);
 		frm = ieee80211_add_xrates(frm, rs);
 		frm = ieee80211_add_meshid(frm, vap);
 		frm = ieee80211_add_meshconf(frm, vap);
 		frm = ieee80211_add_meshpeer(frm, IEEE80211_ACTION_MESHPEERING_OPEN,
 		    args[0], 0, 0);
 		m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 		return mesh_send_action(ni, vap->iv_myaddr, ni->ni_macaddr, m);
 	} else {
 		vap->iv_stats.is_tx_nobuf++;
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 }
 
 static int
 mesh_send_action_meshpeering_confirm(struct ieee80211_node *ni,
 	int category, int action, void *args0)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	uint16_t *args = args0;
 	const struct ieee80211_rateset *rs;
 	struct mbuf *m;
 	uint8_t *frm;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni,
 	    "send PEER CONFIRM action: localid 0x%x, peerid 0x%x",
 	    args[0], args[1]);
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 	    "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__,
 	    ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1);
 	ieee80211_ref_node(ni);
 
 	m = ieee80211_getmgtframe(&frm,
 	    ic->ic_headroom + sizeof(struct ieee80211_frame),
 	    sizeof(uint16_t)	/* action+category */
 	    + sizeof(uint16_t)	/* capabilites */
 	    + sizeof(uint16_t)	/* status code */
 	    + sizeof(uint16_t)	/* AID */
 	    + 2 + IEEE80211_RATE_SIZE
 	    + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE)
 	    + 2 + IEEE80211_MESHID_LEN
 	    + sizeof(struct ieee80211_meshconf_ie)
 	    + sizeof(struct ieee80211_meshpeer_ie)
 	);
 	if (m != NULL) {
 		/*
 		 * mesh peer confirm action frame format:
 		 *   [1] category
 		 *   [1] action
 		 *   [2] capabilities
 		 *   [2] status code
 		 *   [2] association id (peer ID)
 		 *   [tlv] rates
 		 *   [tlv] xrates
 		 *   [tlv] mesh id
 		 *   [tlv] mesh conf
 		 *   [tlv] mesh peer link mgmt
 		 */
 		*frm++ = category;
 		*frm++ = action;
 		ADDSHORT(frm, ieee80211_getcapinfo(vap, ni->ni_chan));
 		ADDSHORT(frm, 0);		/* status code */
 		ADDSHORT(frm, args[1]);		/* AID */
 		rs = ieee80211_get_suprates(ic, ic->ic_curchan);
 		frm = ieee80211_add_rates(frm, rs);
 		frm = ieee80211_add_xrates(frm, rs);
 		frm = ieee80211_add_meshid(frm, vap);
 		frm = ieee80211_add_meshconf(frm, vap);
 		frm = ieee80211_add_meshpeer(frm,
 		    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 		    args[0], args[1], 0);
 		m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 		return mesh_send_action(ni, vap->iv_myaddr, ni->ni_macaddr, m);
 	} else {
 		vap->iv_stats.is_tx_nobuf++;
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 }
 
 static int
 mesh_send_action_meshpeering_close(struct ieee80211_node *ni,
 	int category, int action, void *args0)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	uint16_t *args = args0;
 	struct mbuf *m;
 	uint8_t *frm;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni,
 	    "send PEER CLOSE action: localid 0x%x, peerid 0x%x reason %d (%s)",
 	    args[0], args[1], args[2], ieee80211_reason_to_string(args[2]));
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 	    "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__,
 	    ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1);
 	ieee80211_ref_node(ni);
 
 	m = ieee80211_getmgtframe(&frm,
 	    ic->ic_headroom + sizeof(struct ieee80211_frame),
 	    sizeof(uint16_t)	/* action+category */
 	    + sizeof(uint16_t)	/* reason code */
 	    + 2 + IEEE80211_MESHID_LEN
 	    + sizeof(struct ieee80211_meshpeer_ie)
 	);
 	if (m != NULL) {
 		/*
 		 * mesh peer close action frame format:
 		 *   [1] category
 		 *   [1] action
 		 *   [tlv] mesh id
 		 *   [tlv] mesh peer link mgmt
 		 */
 		*frm++ = category;
 		*frm++ = action;
 		frm = ieee80211_add_meshid(frm, vap);
 		frm = ieee80211_add_meshpeer(frm,
 		    IEEE80211_ACTION_MESHPEERING_CLOSE,
 		    args[0], args[1], args[2]);
 		m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 		return mesh_send_action(ni, vap->iv_myaddr, ni->ni_macaddr, m);
 	} else {
 		vap->iv_stats.is_tx_nobuf++;
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 }
 
 static int
 mesh_send_action_meshlmetric(struct ieee80211_node *ni,
 	int category, int action, void *arg0)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_meshlmetric_ie *ie = arg0;
 	struct mbuf *m;
 	uint8_t *frm;
 
 	if (ie->lm_flags & IEEE80211_MESH_LMETRIC_FLAGS_REQ) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 		    ni, "%s", "send LINK METRIC REQUEST action");
 	} else {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 		    ni, "send LINK METRIC REPLY action: metric 0x%x",
 		    ie->lm_metric);
 	}
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 	    "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__,
 	    ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1);
 	ieee80211_ref_node(ni);
 
 	m = ieee80211_getmgtframe(&frm,
 	    ic->ic_headroom + sizeof(struct ieee80211_frame),
 	    sizeof(uint16_t) +	/* action+category */
 	    sizeof(struct ieee80211_meshlmetric_ie)
 	);
 	if (m != NULL) {
 		/*
 		 * mesh link metric
 		 *   [1] category
 		 *   [1] action
 		 *   [tlv] mesh link metric
 		 */
 		*frm++ = category;
 		*frm++ = action;
 		frm = ieee80211_add_meshlmetric(frm,
 		    ie->lm_flags, ie->lm_metric);
 		m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 		return mesh_send_action(ni, vap->iv_myaddr, ni->ni_macaddr, m);
 	} else {
 		vap->iv_stats.is_tx_nobuf++;
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 }
 
 static int
 mesh_send_action_meshgate(struct ieee80211_node *ni,
 	int category, int action, void *arg0)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_meshgann_ie *ie = arg0;
 	struct mbuf *m;
 	uint8_t *frm;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 	    "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__,
 	    ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1);
 	ieee80211_ref_node(ni);
 
 	m = ieee80211_getmgtframe(&frm,
 	    ic->ic_headroom + sizeof(struct ieee80211_frame),
 	    sizeof(uint16_t) +	/* action+category */
 	    IEEE80211_MESHGANN_BASE_SZ
 	);
 	if (m != NULL) {
 		/*
 		 * mesh link metric
 		 *   [1] category
 		 *   [1] action
 		 *   [tlv] mesh gate annoucement
 		 */
 		*frm++ = category;
 		*frm++ = action;
 		frm = ieee80211_add_meshgate(frm, ie);
 		m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 		return mesh_send_action(ni, vap->iv_myaddr, broadcastaddr, m);
 	} else {
 		vap->iv_stats.is_tx_nobuf++;
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 }
 
 static void
 mesh_peer_timeout_setup(struct ieee80211_node *ni)
 {
 	switch (ni->ni_mlstate) {
 	case IEEE80211_NODE_MESH_HOLDING:
 		ni->ni_mltval = ieee80211_mesh_holdingtimeout;
 		break;
 	case IEEE80211_NODE_MESH_CONFIRMRCV:
 		ni->ni_mltval = ieee80211_mesh_confirmtimeout;
 		break;
 	case IEEE80211_NODE_MESH_IDLE:
 		ni->ni_mltval = 0;
 		break;
 	default:
 		ni->ni_mltval = ieee80211_mesh_retrytimeout;
 		break;
 	}
 	if (ni->ni_mltval)
 		callout_reset(&ni->ni_mltimer, ni->ni_mltval,
 		    mesh_peer_timeout_cb, ni);
 }
 
 /*
  * Same as above but backoffs timer statisically 50%.
  */
 static void
 mesh_peer_timeout_backoff(struct ieee80211_node *ni)
 {
 	uint32_t r;
 	
 	r = arc4random();
 	ni->ni_mltval += r % ni->ni_mltval;
 	callout_reset(&ni->ni_mltimer, ni->ni_mltval, mesh_peer_timeout_cb,
 	    ni);
 }
 
 static __inline void
 mesh_peer_timeout_stop(struct ieee80211_node *ni)
 {
 	callout_drain(&ni->ni_mltimer);
 }
 
 static void
 mesh_peer_backoff_cb(void *arg)
 {
 	struct ieee80211_node *ni = (struct ieee80211_node *)arg;
 
 	/* After backoff timeout, try to peer automatically again. */
 	ni->ni_mlhcnt = 0;
 }
 
 /*
  * Mesh Peer Link Management FSM timeout handling.
  */
 static void
 mesh_peer_timeout_cb(void *arg)
 {
 	struct ieee80211_node *ni = (struct ieee80211_node *)arg;
 	uint16_t args[3];
 
 	IEEE80211_NOTE(ni->ni_vap, IEEE80211_MSG_MESH,
 	    ni, "mesh link timeout, state %d, retry counter %d",
 	    ni->ni_mlstate, ni->ni_mlrcnt);
 	
 	switch (ni->ni_mlstate) {
 	case IEEE80211_NODE_MESH_IDLE:
 	case IEEE80211_NODE_MESH_ESTABLISHED:
 		break;
 	case IEEE80211_NODE_MESH_OPENSNT:
 	case IEEE80211_NODE_MESH_OPENRCV:
 		if (ni->ni_mlrcnt == ieee80211_mesh_maxretries) {
 			args[0] = ni->ni_mlpid;
 			args[2] = IEEE80211_REASON_MESH_MAX_RETRIES;
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_CLOSE, args);
 			ni->ni_mlrcnt = 0;
 			mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING);
 			mesh_peer_timeout_setup(ni);
 		} else {
 			args[0] = ni->ni_mlpid;
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_OPEN, args);
 			ni->ni_mlrcnt++;
 			mesh_peer_timeout_backoff(ni);
 		}
 		break;
 	case IEEE80211_NODE_MESH_CONFIRMRCV:
 		args[0] = ni->ni_mlpid;
 		args[2] = IEEE80211_REASON_MESH_CONFIRM_TIMEOUT;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CLOSE, args);
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING);
 		mesh_peer_timeout_setup(ni);
 		break;
 	case IEEE80211_NODE_MESH_HOLDING:
 		ni->ni_mlhcnt++;
 		if (ni->ni_mlhcnt >= ieee80211_mesh_maxholding)
 			callout_reset(&ni->ni_mlhtimer,
 			    ieee80211_mesh_backofftimeout,
 			    mesh_peer_backoff_cb, ni);
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_IDLE);
 		break;
 	}
 }
 
 static int
 mesh_verify_meshid(struct ieee80211vap *vap, const uint8_t *ie)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 
 	if (ie == NULL || ie[1] != ms->ms_idlen)
 		return 1;
 	return memcmp(ms->ms_id, ie + 2, ms->ms_idlen);
 }
 
 /*
  * Check if we are using the same algorithms for this mesh.
  */
 static int
 mesh_verify_meshconf(struct ieee80211vap *vap, const uint8_t *ie)
 {
 	const struct ieee80211_meshconf_ie *meshconf =
 	    (const struct ieee80211_meshconf_ie *) ie;
 	const struct ieee80211_mesh_state *ms = vap->iv_mesh;
 
 	if (meshconf == NULL)
 		return 1;
 	if (meshconf->conf_pselid != ms->ms_ppath->mpp_ie) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH,
 		    "unknown path selection algorithm: 0x%x\n",
 		    meshconf->conf_pselid);
 		return 1;
 	}
 	if (meshconf->conf_pmetid != ms->ms_pmetric->mpm_ie) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH,
 		    "unknown path metric algorithm: 0x%x\n",
 		    meshconf->conf_pmetid);
 		return 1;
 	}
 	if (meshconf->conf_ccid != 0) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH,
 		    "unknown congestion control algorithm: 0x%x\n",
 		    meshconf->conf_ccid);
 		return 1;
 	}
 	if (meshconf->conf_syncid != IEEE80211_MESHCONF_SYNC_NEIGHOFF) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH,
 		    "unknown sync algorithm: 0x%x\n",
 		    meshconf->conf_syncid);
 		return 1;
 	}
 	if (meshconf->conf_authid != 0) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH,
 		    "unknown auth auth algorithm: 0x%x\n",
 		    meshconf->conf_pselid);
 		return 1;
 	}
 	/* Not accepting peers */
 	if (!(meshconf->conf_cap & IEEE80211_MESHCONF_CAP_AP)) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH,
 		    "not accepting peers: 0x%x\n", meshconf->conf_cap);
 		return 1;
 	}
 	return 0;
 }
 
 static int
 mesh_verify_meshpeer(struct ieee80211vap *vap, uint8_t subtype,
     const uint8_t *ie)
 {
 	const struct ieee80211_meshpeer_ie *meshpeer =
 	    (const struct ieee80211_meshpeer_ie *) ie;
 
 	if (meshpeer == NULL ||
 	    meshpeer->peer_len < IEEE80211_MPM_BASE_SZ ||
 	    meshpeer->peer_len > IEEE80211_MPM_MAX_SZ)
 		return 1;
 	if (meshpeer->peer_proto != IEEE80211_MPPID_MPM) {
 		IEEE80211_DPRINTF(vap,
 		    IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 		    "Only MPM protocol is supported (proto: 0x%02X)",
 		    meshpeer->peer_proto);
 		return 1;
 	}
 	switch (subtype) {
 	case IEEE80211_ACTION_MESHPEERING_OPEN:
 		if (meshpeer->peer_len != IEEE80211_MPM_BASE_SZ)
 			return 1;
 		break;
 	case IEEE80211_ACTION_MESHPEERING_CONFIRM:
 		if (meshpeer->peer_len != IEEE80211_MPM_BASE_SZ + 2)
 			return 1;
 		break;
 	case IEEE80211_ACTION_MESHPEERING_CLOSE:
 		if (meshpeer->peer_len < IEEE80211_MPM_BASE_SZ + 2)
 			return 1;
 		if (meshpeer->peer_len == (IEEE80211_MPM_BASE_SZ + 2) &&
 		    meshpeer->peer_linkid != 0)
 			return 1;
 		if (meshpeer->peer_rcode == 0)
 			return 1;
 		break;
 	}
 	return 0;
 }
 
 /*
  * Add a Mesh ID IE to a frame.
  */
 uint8_t *
 ieee80211_add_meshid(uint8_t *frm, struct ieee80211vap *vap)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 
 	KASSERT(vap->iv_opmode == IEEE80211_M_MBSS, ("not a mbss vap"));
 
 	*frm++ = IEEE80211_ELEMID_MESHID;
 	*frm++ = ms->ms_idlen;
 	memcpy(frm, ms->ms_id, ms->ms_idlen);
 	return frm + ms->ms_idlen;
 }
 
 /*
  * Add a Mesh Configuration IE to a frame.
  * For now just use HWMP routing, Airtime link metric, Null Congestion
  * Signaling, Null Sync Protocol and Null Authentication.
  */
 uint8_t *
 ieee80211_add_meshconf(uint8_t *frm, struct ieee80211vap *vap)
 {
 	const struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	uint16_t caps;
 
 	KASSERT(vap->iv_opmode == IEEE80211_M_MBSS, ("not a MBSS vap"));
 
 	*frm++ = IEEE80211_ELEMID_MESHCONF;
 	*frm++ = IEEE80211_MESH_CONF_SZ;
 	*frm++ = ms->ms_ppath->mpp_ie;		/* path selection */
 	*frm++ = ms->ms_pmetric->mpm_ie;	/* link metric */
 	*frm++ = IEEE80211_MESHCONF_CC_DISABLED;
 	*frm++ = IEEE80211_MESHCONF_SYNC_NEIGHOFF;
 	*frm++ = IEEE80211_MESHCONF_AUTH_DISABLED;
 	/* NB: set the number of neighbors before the rest */
 	*frm = (ms->ms_neighbors > IEEE80211_MESH_MAX_NEIGHBORS ?
 	    IEEE80211_MESH_MAX_NEIGHBORS : ms->ms_neighbors) << 1;
 	if (ms->ms_flags & IEEE80211_MESHFLAGS_GATE)
 		*frm |= IEEE80211_MESHCONF_FORM_GATE;
 	frm += 1;
 	caps = 0;
 	if (ms->ms_flags & IEEE80211_MESHFLAGS_AP)
 		caps |= IEEE80211_MESHCONF_CAP_AP;
 	if (ms->ms_flags & IEEE80211_MESHFLAGS_FWD)
 		caps |= IEEE80211_MESHCONF_CAP_FWRD;
 	*frm++ = caps;
 	return frm;
 }
 
 /*
  * Add a Mesh Peer Management IE to a frame.
  */
 uint8_t *
 ieee80211_add_meshpeer(uint8_t *frm, uint8_t subtype, uint16_t localid,
     uint16_t peerid, uint16_t reason)
 {
 
 	KASSERT(localid != 0, ("localid == 0"));
 
 	*frm++ = IEEE80211_ELEMID_MESHPEER;
 	switch (subtype) {
 	case IEEE80211_ACTION_MESHPEERING_OPEN:
 		*frm++ = IEEE80211_MPM_BASE_SZ;		/* length */
 		ADDSHORT(frm, IEEE80211_MPPID_MPM);	/* proto */
 		ADDSHORT(frm, localid);			/* local ID */
 		break;
 	case IEEE80211_ACTION_MESHPEERING_CONFIRM:
 		KASSERT(peerid != 0, ("sending peer confirm without peer id"));
 		*frm++ = IEEE80211_MPM_BASE_SZ + 2;	/* length */
 		ADDSHORT(frm, IEEE80211_MPPID_MPM);	/* proto */
 		ADDSHORT(frm, localid);			/* local ID */
 		ADDSHORT(frm, peerid);			/* peer ID */
 		break;
 	case IEEE80211_ACTION_MESHPEERING_CLOSE:
 		if (peerid)
 			*frm++ = IEEE80211_MPM_MAX_SZ;	/* length */
 		else
 			*frm++ = IEEE80211_MPM_BASE_SZ + 2; /* length */
 		ADDSHORT(frm, IEEE80211_MPPID_MPM);	/* proto */
 		ADDSHORT(frm, localid);	/* local ID */
 		if (peerid)
 			ADDSHORT(frm, peerid);	/* peer ID */
 		ADDSHORT(frm, reason);
 		break;
 	}
 	return frm;
 }
 
 /*
  * Compute an Airtime Link Metric for the link with this node.
  *
  * Based on Draft 3.0 spec (11B.10, p.149).
  */
 /*
  * Max 802.11s overhead.
  */
 #define IEEE80211_MESH_MAXOVERHEAD \
 	(sizeof(struct ieee80211_qosframe_addr4) \
 	 + sizeof(struct ieee80211_meshcntl_ae10) \
 	+ sizeof(struct llc) \
 	+ IEEE80211_ADDR_LEN \
 	+ IEEE80211_WEP_IVLEN \
 	+ IEEE80211_WEP_KIDLEN \
 	+ IEEE80211_WEP_CRCLEN \
 	+ IEEE80211_WEP_MICLEN \
 	+ IEEE80211_CRC_LEN)
 uint32_t
 mesh_airtime_calc(struct ieee80211_node *ni)
 {
 #define M_BITS 8
 #define S_FACTOR (2 * M_BITS)
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ifnet *ifp = ni->ni_vap->iv_ifp;
 	const static int nbits = 8192 << M_BITS;
 	uint32_t overhead, rate, errrate;
 	uint64_t res;
 
 	/* Time to transmit a frame */
 	rate = ni->ni_txrate;
 	overhead = ieee80211_compute_duration(ic->ic_rt,
 	    ifp->if_mtu + IEEE80211_MESH_MAXOVERHEAD, rate, 0) << M_BITS;
 	/* Error rate in percentage */
 	/* XXX assuming small failures are ok */
 	errrate = (((ifp->if_get_counter(ifp, IFCOUNTER_OERRORS) +
 	    ifp->if_get_counter(ifp, IFCOUNTER_IERRORS)) / 100) << M_BITS)
 	    / 100;
 	res = (overhead + (nbits / rate)) *
 	    ((1 << S_FACTOR) / ((1 << M_BITS) - errrate));
 
 	return (uint32_t)(res >> S_FACTOR);
 #undef M_BITS
 #undef S_FACTOR
 }
 
 /*
  * Add a Mesh Link Metric report IE to a frame.
  */
 uint8_t *
 ieee80211_add_meshlmetric(uint8_t *frm, uint8_t flags, uint32_t metric)
 {
 	*frm++ = IEEE80211_ELEMID_MESHLINK;
 	*frm++ = 5;
 	*frm++ = flags;
 	ADDWORD(frm, metric);
 	return frm;
 }
 
 /*
  * Add a Mesh Gate Announcement IE to a frame.
  */
 uint8_t *
 ieee80211_add_meshgate(uint8_t *frm, struct ieee80211_meshgann_ie *ie)
 {
 	*frm++ = IEEE80211_ELEMID_MESHGANN; /* ie */
 	*frm++ = IEEE80211_MESHGANN_BASE_SZ; /* len */
 	*frm++ = ie->gann_flags;
 	*frm++ = ie->gann_hopcount;
 	*frm++ = ie->gann_ttl;
 	IEEE80211_ADDR_COPY(frm, ie->gann_addr);
 	frm += 6;
 	ADDWORD(frm, ie->gann_seq);
 	ADDSHORT(frm, ie->gann_interval);
 	return frm;
 }
 #undef ADDSHORT
 #undef ADDWORD
 
 /*
  * Initialize any mesh-specific node state.
  */
 void
 ieee80211_mesh_node_init(struct ieee80211vap *vap, struct ieee80211_node *ni)
 {
 	ni->ni_flags |= IEEE80211_NODE_QOS;
 	callout_init(&ni->ni_mltimer, 1);
 	callout_init(&ni->ni_mlhtimer, 1);
 }
 
 /*
  * Cleanup any mesh-specific node state.
  */
 void
 ieee80211_mesh_node_cleanup(struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 
 	callout_drain(&ni->ni_mltimer);
 	callout_drain(&ni->ni_mlhtimer);
 	/* NB: short-circuit callbacks after mesh_vdetach */
 	if (vap->iv_mesh != NULL)
 		ms->ms_ppath->mpp_peerdown(ni);
 }
 
 void
 ieee80211_parse_meshid(struct ieee80211_node *ni, const uint8_t *ie)
 {
 	ni->ni_meshidlen = ie[1];
 	memcpy(ni->ni_meshid, ie + 2, ie[1]);
 }
 
 /*
  * Setup mesh-specific node state on neighbor discovery.
  */
 void
 ieee80211_mesh_init_neighbor(struct ieee80211_node *ni,
 	const struct ieee80211_frame *wh,
 	const struct ieee80211_scanparams *sp)
 {
 	ieee80211_parse_meshid(ni, sp->meshid);
 }
 
 void
 ieee80211_mesh_update_beacon(struct ieee80211vap *vap,
 	struct ieee80211_beacon_offsets *bo)
 {
 	KASSERT(vap->iv_opmode == IEEE80211_M_MBSS, ("not a MBSS vap"));
 
 	if (isset(bo->bo_flags, IEEE80211_BEACON_MESHCONF)) {
 		(void)ieee80211_add_meshconf(bo->bo_meshconf, vap);
 		clrbit(bo->bo_flags, IEEE80211_BEACON_MESHCONF);
 	}
 }
 
 static int
 mesh_ioctl_get80211(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	uint8_t tmpmeshid[IEEE80211_NWID_LEN];
 	struct ieee80211_mesh_route *rt;
 	struct ieee80211req_mesh_route *imr;
 	size_t len, off;
 	uint8_t *p;
 	int error;
 
 	if (vap->iv_opmode != IEEE80211_M_MBSS)
 		return ENOSYS;
 
 	error = 0;
 	switch (ireq->i_type) {
 	case IEEE80211_IOC_MESH_ID:
 		ireq->i_len = ms->ms_idlen;
 		memcpy(tmpmeshid, ms->ms_id, ireq->i_len);
 		error = copyout(tmpmeshid, ireq->i_data, ireq->i_len);
 		break;
 	case IEEE80211_IOC_MESH_AP:
 		ireq->i_val = (ms->ms_flags & IEEE80211_MESHFLAGS_AP) != 0;
 		break;
 	case IEEE80211_IOC_MESH_FWRD:
 		ireq->i_val = (ms->ms_flags & IEEE80211_MESHFLAGS_FWD) != 0;
 		break;
 	case IEEE80211_IOC_MESH_GATE:
 		ireq->i_val = (ms->ms_flags & IEEE80211_MESHFLAGS_GATE) != 0;
 		break;
 	case IEEE80211_IOC_MESH_TTL:
 		ireq->i_val = ms->ms_ttl;
 		break;
 	case IEEE80211_IOC_MESH_RTCMD:
 		switch (ireq->i_val) {
 		case IEEE80211_MESH_RTCMD_LIST:
 			len = 0;
 			MESH_RT_LOCK(ms);
 			TAILQ_FOREACH(rt, &ms->ms_routes, rt_next) {
 				len += sizeof(*imr);
 			}
 			MESH_RT_UNLOCK(ms);
 			if (len > ireq->i_len || ireq->i_len < sizeof(*imr)) {
 				ireq->i_len = len;
 				return ENOMEM;
 			}
 			ireq->i_len = len;
 			/* XXX M_WAIT? */
 			p = IEEE80211_MALLOC(len, M_TEMP,
 			    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 			if (p == NULL)
 				return ENOMEM;
 			off = 0;
 			MESH_RT_LOCK(ms);
 			TAILQ_FOREACH(rt, &ms->ms_routes, rt_next) {
 				if (off >= len)
 					break;
 				imr = (struct ieee80211req_mesh_route *)
 				    (p + off);
 				IEEE80211_ADDR_COPY(imr->imr_dest,
 				    rt->rt_dest);
 				IEEE80211_ADDR_COPY(imr->imr_nexthop,
 				    rt->rt_nexthop);
 				imr->imr_metric = rt->rt_metric;
 				imr->imr_nhops = rt->rt_nhops;
 				imr->imr_lifetime =
 				    ieee80211_mesh_rt_update(rt, 0);
 				imr->imr_lastmseq = rt->rt_lastmseq;
 				imr->imr_flags = rt->rt_flags; /* last */
 				off += sizeof(*imr);
 			}
 			MESH_RT_UNLOCK(ms);
 			error = copyout(p, (uint8_t *)ireq->i_data,
 			    ireq->i_len);
 			IEEE80211_FREE(p, M_TEMP);
 			break;
 		case IEEE80211_MESH_RTCMD_FLUSH:
 		case IEEE80211_MESH_RTCMD_ADD:
 		case IEEE80211_MESH_RTCMD_DELETE:
 			return EINVAL;
 		default:
 			return ENOSYS;
 		}
 		break;
 	case IEEE80211_IOC_MESH_PR_METRIC:
 		len = strlen(ms->ms_pmetric->mpm_descr);
 		if (ireq->i_len < len)
 			return EINVAL;
 		ireq->i_len = len;
 		error = copyout(ms->ms_pmetric->mpm_descr,
 		    (uint8_t *)ireq->i_data, len);
 		break;
 	case IEEE80211_IOC_MESH_PR_PATH:
 		len = strlen(ms->ms_ppath->mpp_descr);
 		if (ireq->i_len < len)
 			return EINVAL;
 		ireq->i_len = len;
 		error = copyout(ms->ms_ppath->mpp_descr,
 		    (uint8_t *)ireq->i_data, len);
 		break;
 	default:
 		return ENOSYS;
 	}
 
 	return error;
 }
 IEEE80211_IOCTL_GET(mesh, mesh_ioctl_get80211);
 
 static int
 mesh_ioctl_set80211(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	uint8_t tmpmeshid[IEEE80211_NWID_LEN];
 	uint8_t tmpaddr[IEEE80211_ADDR_LEN];
 	char tmpproto[IEEE80211_MESH_PROTO_DSZ];
 	int error;
 
 	if (vap->iv_opmode != IEEE80211_M_MBSS)
 		return ENOSYS;
 
 	error = 0;
 	switch (ireq->i_type) {
 	case IEEE80211_IOC_MESH_ID:
 		if (ireq->i_val != 0 || ireq->i_len > IEEE80211_MESHID_LEN)
 			return EINVAL;
 		error = copyin(ireq->i_data, tmpmeshid, ireq->i_len);
 		if (error != 0)
 			break;
 		memset(ms->ms_id, 0, IEEE80211_NWID_LEN);
 		ms->ms_idlen = ireq->i_len;
 		memcpy(ms->ms_id, tmpmeshid, ireq->i_len);
 		error = ENETRESET;
 		break;
 	case IEEE80211_IOC_MESH_AP:
 		if (ireq->i_val)
 			ms->ms_flags |= IEEE80211_MESHFLAGS_AP;
 		else
 			ms->ms_flags &= ~IEEE80211_MESHFLAGS_AP;
 		error = ENETRESET;
 		break;
 	case IEEE80211_IOC_MESH_FWRD:
 		if (ireq->i_val)
 			ms->ms_flags |= IEEE80211_MESHFLAGS_FWD;
 		else
 			ms->ms_flags &= ~IEEE80211_MESHFLAGS_FWD;
 		mesh_gatemode_setup(vap);
 		break;
 	case IEEE80211_IOC_MESH_GATE:
 		if (ireq->i_val)
 			ms->ms_flags |= IEEE80211_MESHFLAGS_GATE;
 		else
 			ms->ms_flags &= ~IEEE80211_MESHFLAGS_GATE;
 		break;
 	case IEEE80211_IOC_MESH_TTL:
 		ms->ms_ttl = (uint8_t) ireq->i_val;
 		break;
 	case IEEE80211_IOC_MESH_RTCMD:
 		switch (ireq->i_val) {
 		case IEEE80211_MESH_RTCMD_LIST:
 			return EINVAL;
 		case IEEE80211_MESH_RTCMD_FLUSH:
 			ieee80211_mesh_rt_flush(vap);
 			break;
 		case IEEE80211_MESH_RTCMD_ADD:
 			if (IEEE80211_ADDR_EQ(vap->iv_myaddr, ireq->i_data) ||
 			    IEEE80211_ADDR_EQ(broadcastaddr, ireq->i_data))
 				return EINVAL;
 			error = copyin(ireq->i_data, &tmpaddr,
 			    IEEE80211_ADDR_LEN);
 			if (error == 0)
 				ieee80211_mesh_discover(vap, tmpaddr, NULL);
 			break;
 		case IEEE80211_MESH_RTCMD_DELETE:
 			ieee80211_mesh_rt_del(vap, ireq->i_data);
 			break;
 		default:
 			return ENOSYS;
 		}
 		break;
 	case IEEE80211_IOC_MESH_PR_METRIC:
 		error = copyin(ireq->i_data, tmpproto, sizeof(tmpproto));
 		if (error == 0) {
 			error = mesh_select_proto_metric(vap, tmpproto);
 			if (error == 0)
 				error = ENETRESET;
 		}
 		break;
 	case IEEE80211_IOC_MESH_PR_PATH:
 		error = copyin(ireq->i_data, tmpproto, sizeof(tmpproto));
 		if (error == 0) {
 			error = mesh_select_proto_path(vap, tmpproto);
 			if (error == 0)
 				error = ENETRESET;
 		}
 		break;
 	default:
 		return ENOSYS;
 	}
 	return error;
 }
 IEEE80211_IOCTL_SET(mesh, mesh_ioctl_set80211);
Index: head/sys/net80211/ieee80211_output.c
===================================================================
--- head/sys/net80211/ieee80211_output.c	(revision 348253)
+++ head/sys/net80211/ieee80211_output.c	(revision 348254)
@@ -1,4012 +1,4014 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2001 Atsushi Onoe
  * Copyright (c) 2002-2009 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h> 
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>   
 #include <sys/endian.h>
 
 #include <sys/socket.h>
  
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_llc.h>
 #include <net/if_media.h>
 #include <net/if_vlan_var.h>
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_regdomain.h>
 #ifdef IEEE80211_SUPPORT_SUPERG
 #include <net80211/ieee80211_superg.h>
 #endif
 #ifdef IEEE80211_SUPPORT_TDMA
 #include <net80211/ieee80211_tdma.h>
 #endif
 #include <net80211/ieee80211_wds.h>
 #include <net80211/ieee80211_mesh.h>
 #include <net80211/ieee80211_vht.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h> 
 #endif
 
 #ifdef INET
 #include <netinet/if_ether.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #endif
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 #define	ETHER_HEADER_COPY(dst, src) \
 	memcpy(dst, src, sizeof(struct ether_header))
 
 static int ieee80211_fragment(struct ieee80211vap *, struct mbuf *,
 	u_int hdrsize, u_int ciphdrsize, u_int mtu);
 static	void ieee80211_tx_mgt_cb(struct ieee80211_node *, void *, int);
 
 #ifdef IEEE80211_DEBUG
 /*
  * Decide if an outbound management frame should be
  * printed when debugging is enabled.  This filters some
  * of the less interesting frames that come frequently
  * (e.g. beacons).
  */
 static __inline int
 doprint(struct ieee80211vap *vap, int subtype)
 {
 	switch (subtype) {
 	case IEEE80211_FC0_SUBTYPE_PROBE_RESP:
 		return (vap->iv_opmode == IEEE80211_M_IBSS);
 	}
 	return 1;
 }
 #endif
 
 /*
  * Transmit a frame to the given destination on the given VAP.
  *
  * It's up to the caller to figure out the details of who this
  * is going to and resolving the node.
  *
  * This routine takes care of queuing it for power save,
  * A-MPDU state stuff, fast-frames state stuff, encapsulation
  * if required, then passing it up to the driver layer.
  *
  * This routine (for now) consumes the mbuf and frees the node
  * reference; it ideally will return a TX status which reflects
  * whether the mbuf was consumed or not, so the caller can
  * free the mbuf (if appropriate) and the node reference (again,
  * if appropriate.)
  */
 int
 ieee80211_vap_pkt_send_dest(struct ieee80211vap *vap, struct mbuf *m,
     struct ieee80211_node *ni)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 	int mcast;
 
 	if ((ni->ni_flags & IEEE80211_NODE_PWR_MGT) &&
 	    (m->m_flags & M_PWR_SAV) == 0) {
 		/*
 		 * Station in power save mode; pass the frame
 		 * to the 802.11 layer and continue.  We'll get
 		 * the frame back when the time is right.
 		 * XXX lose WDS vap linkage?
 		 */
 		if (ieee80211_pwrsave(ni, m) != 0)
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		ieee80211_free_node(ni);
 
 		/*
 		 * We queued it fine, so tell the upper layer
 		 * that we consumed it.
 		 */
 		return (0);
 	}
 	/* calculate priority so drivers can find the tx queue */
 	if (ieee80211_classify(ni, m)) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_OUTPUT,
 		    ni->ni_macaddr, NULL,
 		    "%s", "classification failure");
 		vap->iv_stats.is_tx_classify++;
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		ieee80211_free_node(ni);
 
 		/* XXX better status? */
 		return (0);
 	}
 	/*
 	 * Stash the node pointer.  Note that we do this after
 	 * any call to ieee80211_dwds_mcast because that code
 	 * uses any existing value for rcvif to identify the
 	 * interface it (might have been) received on.
 	 */
+	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	m->m_pkthdr.rcvif = (void *)ni;
 	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1: 0;
 
 	BPF_MTAP(ifp, m);		/* 802.3 tx */
 
 	/*
 	 * Check if A-MPDU tx aggregation is setup or if we
 	 * should try to enable it.  The sta must be associated
 	 * with HT and A-MPDU enabled for use.  When the policy
 	 * routine decides we should enable A-MPDU we issue an
 	 * ADDBA request and wait for a reply.  The frame being
 	 * encapsulated will go out w/o using A-MPDU, or possibly
 	 * it might be collected by the driver and held/retransmit.
 	 * The default ic_ampdu_enable routine handles staggering
 	 * ADDBA requests in case the receiver NAK's us or we are
 	 * otherwise unable to establish a BA stream.
 	 *
 	 * Don't treat group-addressed frames as candidates for aggregation;
 	 * net80211 doesn't support 802.11aa-2012 and so group addressed
 	 * frames will always have sequence numbers allocated from the NON_QOS
 	 * TID.
 	 */
 	if ((ni->ni_flags & IEEE80211_NODE_AMPDU_TX) &&
 	    (vap->iv_flags_ht & IEEE80211_FHT_AMPDU_TX)) {
 		if ((m->m_flags & M_EAPOL) == 0 && (! mcast)) {
 			int tid = WME_AC_TO_TID(M_WME_GETAC(m));
 			struct ieee80211_tx_ampdu *tap = &ni->ni_tx_ampdu[tid];
 
 			ieee80211_txampdu_count_packet(tap);
 			if (IEEE80211_AMPDU_RUNNING(tap)) {
 				/*
 				 * Operational, mark frame for aggregation.
 				 *
 				 * XXX do tx aggregation here
 				 */
 				m->m_flags |= M_AMPDU_MPDU;
 			} else if (!IEEE80211_AMPDU_REQUESTED(tap) &&
 			    ic->ic_ampdu_enable(ni, tap)) {
 				/*
 				 * Not negotiated yet, request service.
 				 */
 				ieee80211_ampdu_request(ni, tap);
 				/* XXX hold frame for reply? */
 			}
 		}
 	}
 
 #ifdef IEEE80211_SUPPORT_SUPERG
 	/*
 	 * Check for AMSDU/FF; queue for aggregation
 	 *
 	 * Note: we don't bother trying to do fast frames or
 	 * A-MSDU encapsulation for 802.3 drivers.  Now, we
 	 * likely could do it for FF (because it's a magic
 	 * atheros tunnel LLC type) but I don't think we're going
 	 * to really need to.  For A-MSDU we'd have to set the
 	 * A-MSDU QoS bit in the wifi header, so we just plain
 	 * can't do it.
 	 *
 	 * Strictly speaking, we could actually /do/ A-MSDU / FF
 	 * with A-MPDU together which for certain circumstances
 	 * is beneficial (eg A-MSDU of TCK ACKs.)  However,
 	 * I'll ignore that for now so existing behaviour is maintained.
 	 * Later on it would be good to make "amsdu + ampdu" configurable.
 	 */
 	else if (__predict_true((vap->iv_caps & IEEE80211_C_8023ENCAP) == 0)) {
 		if ((! mcast) && ieee80211_amsdu_tx_ok(ni)) {
 			m = ieee80211_amsdu_check(ni, m);
 			if (m == NULL) {
 				/* NB: any ni ref held on stageq */
 				IEEE80211_DPRINTF(vap, IEEE80211_MSG_SUPERG,
 				    "%s: amsdu_check queued frame\n",
 				    __func__);
 				return (0);
 			}
 		} else if ((! mcast) && IEEE80211_ATH_CAP(vap, ni,
 		    IEEE80211_NODE_FF)) {
 			m = ieee80211_ff_check(ni, m);
 			if (m == NULL) {
 				/* NB: any ni ref held on stageq */
 				IEEE80211_DPRINTF(vap, IEEE80211_MSG_SUPERG,
 				    "%s: ff_check queued frame\n",
 				    __func__);
 				return (0);
 			}
 		}
 	}
 #endif /* IEEE80211_SUPPORT_SUPERG */
 
 	/*
 	 * Grab the TX lock - serialise the TX process from this
 	 * point (where TX state is being checked/modified)
 	 * through to driver queue.
 	 */
 	IEEE80211_TX_LOCK(ic);
 
 	/*
 	 * XXX make the encap and transmit code a separate function
 	 * so things like the FF (and later A-MSDU) path can just call
 	 * it for flushed frames.
 	 */
 	if (__predict_true((vap->iv_caps & IEEE80211_C_8023ENCAP) == 0)) {
 		/*
 		 * Encapsulate the packet in prep for transmission.
 		 */
 		m = ieee80211_encap(vap, ni, m);
 		if (m == NULL) {
 			/* NB: stat+msg handled in ieee80211_encap */
 			IEEE80211_TX_UNLOCK(ic);
 			ieee80211_free_node(ni);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			return (ENOBUFS);
 		}
 	}
 	(void) ieee80211_parent_xmitpkt(ic, m);
 
 	/*
 	 * Unlock at this point - no need to hold it across
 	 * ieee80211_free_node() (ie, the comlock)
 	 */
 	IEEE80211_TX_UNLOCK(ic);
 	ic->ic_lastdata = ticks;
 
 	return (0);
 }
 
 
 
 /*
  * Send the given mbuf through the given vap.
  *
  * This consumes the mbuf regardless of whether the transmit
  * was successful or not.
  *
  * This does none of the initial checks that ieee80211_start()
  * does (eg CAC timeout, interface wakeup) - the caller must
  * do this first.
  */
 static int
 ieee80211_start_pkt(struct ieee80211vap *vap, struct mbuf *m)
 {
 #define	IS_DWDS(vap) \
 	(vap->iv_opmode == IEEE80211_M_WDS && \
 	 (vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY) == 0)
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211_node *ni;
 	struct ether_header *eh;
 
 	/*
 	 * Cancel any background scan.
 	 */
 	if (ic->ic_flags & IEEE80211_F_SCAN)
 		ieee80211_cancel_anyscan(vap);
 	/* 
 	 * Find the node for the destination so we can do
 	 * things like power save and fast frames aggregation.
 	 *
 	 * NB: past this point various code assumes the first
 	 *     mbuf has the 802.3 header present (and contiguous).
 	 */
 	ni = NULL;
 	if (m->m_len < sizeof(struct ether_header) &&
 	   (m = m_pullup(m, sizeof(struct ether_header))) == NULL) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT,
 		    "discard frame, %s\n", "m_pullup failed");
 		vap->iv_stats.is_tx_nobuf++;	/* XXX */
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENOBUFS);
 	}
 	eh = mtod(m, struct ether_header *);
 	if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 		if (IS_DWDS(vap)) {
 			/*
 			 * Only unicast frames from the above go out
 			 * DWDS vaps; multicast frames are handled by
 			 * dispatching the frame as it comes through
 			 * the AP vap (see below).
 			 */
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_WDS,
 			    eh->ether_dhost, "mcast", "%s", "on DWDS");
 			vap->iv_stats.is_dwds_mcast++;
 			m_freem(m);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			/* XXX better status? */
 			return (ENOBUFS);
 		}
 		if (vap->iv_opmode == IEEE80211_M_HOSTAP) {
 			/*
 			 * Spam DWDS vap's w/ multicast traffic.
 			 */
 			/* XXX only if dwds in use? */
 			ieee80211_dwds_mcast(vap, m);
 		}
 	}
 #ifdef IEEE80211_SUPPORT_MESH
 	if (vap->iv_opmode != IEEE80211_M_MBSS) {
 #endif
 		ni = ieee80211_find_txnode(vap, eh->ether_dhost);
 		if (ni == NULL) {
 			/* NB: ieee80211_find_txnode does stat+msg */
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			m_freem(m);
 			/* XXX better status? */
 			return (ENOBUFS);
 		}
 		if (ni->ni_associd == 0 &&
 		    (ni->ni_flags & IEEE80211_NODE_ASSOCID)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_OUTPUT,
 			    eh->ether_dhost, NULL,
 			    "sta not associated (type 0x%04x)",
 			    htons(eh->ether_type));
 			vap->iv_stats.is_tx_notassoc++;
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			m_freem(m);
 			ieee80211_free_node(ni);
 			/* XXX better status? */
 			return (ENOBUFS);
 		}
 #ifdef IEEE80211_SUPPORT_MESH
 	} else {
 		if (!IEEE80211_ADDR_EQ(eh->ether_shost, vap->iv_myaddr)) {
 			/*
 			 * Proxy station only if configured.
 			 */
 			if (!ieee80211_mesh_isproxyena(vap)) {
 				IEEE80211_DISCARD_MAC(vap,
 				    IEEE80211_MSG_OUTPUT |
 				    IEEE80211_MSG_MESH,
 				    eh->ether_dhost, NULL,
 				    "%s", "proxy not enabled");
 				vap->iv_stats.is_mesh_notproxy++;
 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 				m_freem(m);
 				/* XXX better status? */
 				return (ENOBUFS);
 			}
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT,
 			    "forward frame from DS SA(%6D), DA(%6D)\n",
 			    eh->ether_shost, ":",
 			    eh->ether_dhost, ":");
 			ieee80211_mesh_proxy_check(vap, eh->ether_shost);
 		}
 		ni = ieee80211_mesh_discover(vap, eh->ether_dhost, m);
 		if (ni == NULL) {
 			/*
 			 * NB: ieee80211_mesh_discover holds/disposes
 			 * frame (e.g. queueing on path discovery).
 			 */
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			/* XXX better status? */
 			return (ENOBUFS);
 		}
 	}
 #endif
 
 	/*
 	 * We've resolved the sender, so attempt to transmit it.
 	 */
 
 	if (vap->iv_state == IEEE80211_S_SLEEP) {
 		/*
 		 * In power save; queue frame and then  wakeup device
 		 * for transmit.
 		 */
 		ic->ic_lastdata = ticks;
 		if (ieee80211_pwrsave(ni, m) != 0)
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		ieee80211_free_node(ni);
 		ieee80211_new_state(vap, IEEE80211_S_RUN, 0);
 		return (0);
 	}
 
 	if (ieee80211_vap_pkt_send_dest(vap, m, ni) != 0)
 		return (ENOBUFS);
 	return (0);
 #undef	IS_DWDS
 }
 
 /*
  * Start method for vap's.  All packets from the stack come
  * through here.  We handle common processing of the packets
  * before dispatching them to the underlying device.
  *
  * if_transmit() requires that the mbuf be consumed by this call
  * regardless of the return condition.
  */
 int
 ieee80211_vap_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ieee80211vap *vap = ifp->if_softc;
 	struct ieee80211com *ic = vap->iv_ic;
 
 	/*
 	 * No data frames go out unless we're running.
 	 * Note in particular this covers CAC and CSA
 	 * states (though maybe we should check muting
 	 * for CSA).
 	 */
 	if (vap->iv_state != IEEE80211_S_RUN &&
 	    vap->iv_state != IEEE80211_S_SLEEP) {
 		IEEE80211_LOCK(ic);
 		/* re-check under the com lock to avoid races */
 		if (vap->iv_state != IEEE80211_S_RUN &&
 		    vap->iv_state != IEEE80211_S_SLEEP) {
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT,
 			    "%s: ignore queue, in %s state\n",
 			    __func__, ieee80211_state_name[vap->iv_state]);
 			vap->iv_stats.is_tx_badstate++;
 			IEEE80211_UNLOCK(ic);
 			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 			m_freem(m);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			return (ENETDOWN);
 		}
 		IEEE80211_UNLOCK(ic);
 	}
 
 	/*
 	 * Sanitize mbuf flags for net80211 use.  We cannot
 	 * clear M_PWR_SAV or M_MORE_DATA because these may
 	 * be set for frames that are re-submitted from the
 	 * power save queue.
 	 *
 	 * NB: This must be done before ieee80211_classify as
 	 *     it marks EAPOL in frames with M_EAPOL.
 	 */
 	m->m_flags &= ~(M_80211_TX - M_PWR_SAV - M_MORE_DATA);
 
 	/*
 	 * Bump to the packet transmission path.
 	 * The mbuf will be consumed here.
 	 */
 	return (ieee80211_start_pkt(vap, m));
 }
 
 void
 ieee80211_vap_qflush(struct ifnet *ifp)
 {
 
 	/* Empty for now */
 }
 
 /*
  * 802.11 raw output routine.
  *
  * XXX TODO: this (and other send routines) should correctly
  * XXX keep the pwr mgmt bit set if it decides to call into the
  * XXX driver to send a frame whilst the state is SLEEP.
  *
  * Otherwise the peer may decide that we're awake and flood us
  * with traffic we are still too asleep to receive!
  */
 int
 ieee80211_raw_output(struct ieee80211vap *vap, struct ieee80211_node *ni,
     struct mbuf *m, const struct ieee80211_bpf_params *params)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	int error;
 
 	/*
 	 * Set node - the caller has taken a reference, so ensure
 	 * that the mbuf has the same node value that
 	 * it would if it were going via the normal path.
 	 */
+	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	m->m_pkthdr.rcvif = (void *)ni;
 
 	/*
 	 * Attempt to add bpf transmit parameters.
 	 *
 	 * For now it's ok to fail; the raw_xmit api still takes
 	 * them as an option.
 	 *
 	 * Later on when ic_raw_xmit() has params removed,
 	 * they'll have to be added - so fail the transmit if
 	 * they can't be.
 	 */
 	if (params)
 		(void) ieee80211_add_xmit_params(m, params);
 
 	error = ic->ic_raw_xmit(ni, m, params);
 	if (error) {
 		if_inc_counter(vap->iv_ifp, IFCOUNTER_OERRORS, 1);
 		ieee80211_free_node(ni);
 	}
 	return (error);
 }
 
 static int
 ieee80211_validate_frame(struct mbuf *m,
     const struct ieee80211_bpf_params *params)
 {
 	struct ieee80211_frame *wh;
 	int type;
 
 	if (m->m_pkthdr.len < sizeof(struct ieee80211_frame_ack))
 		return (EINVAL);
 
 	wh = mtod(m, struct ieee80211_frame *);
 	if ((wh->i_fc[0] & IEEE80211_FC0_VERSION_MASK) !=
 	    IEEE80211_FC0_VERSION_0)
 		return (EINVAL);
 
 	type = wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK;
 	if (type != IEEE80211_FC0_TYPE_DATA) {
 		if ((wh->i_fc[1] & IEEE80211_FC1_DIR_MASK) !=
 		    IEEE80211_FC1_DIR_NODS)
 			return (EINVAL);
 
 		if (type != IEEE80211_FC0_TYPE_MGT &&
 		    (wh->i_fc[1] & IEEE80211_FC1_MORE_FRAG) != 0)
 			return (EINVAL);
 
 		/* XXX skip other field checks? */
 	}
 
 	if ((params && (params->ibp_flags & IEEE80211_BPF_CRYPTO) != 0) ||
 	    (wh->i_fc[1] & IEEE80211_FC1_PROTECTED) != 0) {
 		int subtype;
 
 		subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK;
 
 		/*
 		 * See IEEE Std 802.11-2012,
 		 * 8.2.4.1.9 'Protected Frame field'
 		 */
 		/* XXX no support for robust management frames yet. */
 		if (!(type == IEEE80211_FC0_TYPE_DATA ||
 		    (type == IEEE80211_FC0_TYPE_MGT &&
 		     subtype == IEEE80211_FC0_SUBTYPE_AUTH)))
 			return (EINVAL);
 
 		wh->i_fc[1] |= IEEE80211_FC1_PROTECTED;
 	}
 
 	if (m->m_pkthdr.len < ieee80211_anyhdrsize(wh))
 		return (EINVAL);
 
 	return (0);
 }
 
 static int
 ieee80211_validate_rate(struct ieee80211_node *ni, uint8_t rate)
 {
 	struct ieee80211com *ic = ni->ni_ic;
 
 	if (IEEE80211_IS_HT_RATE(rate)) {
 		if ((ic->ic_htcaps & IEEE80211_HTC_HT) == 0)
 			return (EINVAL);
 
 		rate = IEEE80211_RV(rate);
 		if (rate <= 31) {
 			if (rate > ic->ic_txstream * 8 - 1)
 				return (EINVAL);
 
 			return (0);
 		}
 
 		if (rate == 32) {
 			if ((ic->ic_htcaps & IEEE80211_HTC_TXMCS32) == 0)
 				return (EINVAL);
 
 			return (0);
 		}
 
 		if ((ic->ic_htcaps & IEEE80211_HTC_TXUNEQUAL) == 0)
 			return (EINVAL);
 
 		switch (ic->ic_txstream) {
 		case 0:
 		case 1:
 			return (EINVAL);
 		case 2:
 			if (rate > 38)
 				return (EINVAL);
 
 			return (0);
 		case 3:
 			if (rate > 52)
 				return (EINVAL);
 
 			return (0);
 		case 4:
 		default:
 			if (rate > 76)
 				return (EINVAL);
 
 			return (0);
 		}
 	}
 
 	if (!ieee80211_isratevalid(ic->ic_rt, rate))
 		return (EINVAL);
 
 	return (0);
 }
 
 static int
 ieee80211_sanitize_rates(struct ieee80211_node *ni, struct mbuf *m,
     const struct ieee80211_bpf_params *params)
 {
 	int error;
 
 	if (!params)
 		return (0);	/* nothing to do */
 
 	/* NB: most drivers assume that ibp_rate0 is set (!= 0). */
 	if (params->ibp_rate0 != 0) {
 		error = ieee80211_validate_rate(ni, params->ibp_rate0);
 		if (error != 0)
 			return (error);
 	} else {
 		/* XXX pre-setup some default (e.g., mgmt / mcast) rate */
 		/* XXX __DECONST? */
 		(void) m;
 	}
 
 	if (params->ibp_rate1 != 0 &&
 	    (error = ieee80211_validate_rate(ni, params->ibp_rate1)) != 0)
 		return (error);
 
 	if (params->ibp_rate2 != 0 &&
 	    (error = ieee80211_validate_rate(ni, params->ibp_rate2)) != 0)
 		return (error);
 
 	if (params->ibp_rate3 != 0 &&
 	    (error = ieee80211_validate_rate(ni, params->ibp_rate3)) != 0)
 		return (error);
 
 	return (0);
 }
 
 /*
  * 802.11 output routine. This is (currently) used only to
  * connect bpf write calls to the 802.11 layer for injecting
  * raw 802.11 frames.
  */
 int
 ieee80211_output(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
 #define senderr(e) do { error = (e); goto bad;} while (0)
 	const struct ieee80211_bpf_params *params = NULL;
 	struct ieee80211_node *ni = NULL;
 	struct ieee80211vap *vap;
 	struct ieee80211_frame *wh;
 	struct ieee80211com *ic = NULL;
 	int error;
 	int ret;
 
 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE) {
 		/*
 		 * Short-circuit requests if the vap is marked OACTIVE
 		 * as this can happen because a packet came down through
 		 * ieee80211_start before the vap entered RUN state in
 		 * which case it's ok to just drop the frame.  This
 		 * should not be necessary but callers of if_output don't
 		 * check OACTIVE.
 		 */
 		senderr(ENETDOWN);
 	}
 	vap = ifp->if_softc;
 	ic = vap->iv_ic;
 	/*
 	 * Hand to the 802.3 code if not tagged as
 	 * a raw 802.11 frame.
 	 */
 	if (dst->sa_family != AF_IEEE80211)
 		return vap->iv_output(ifp, m, dst, ro);
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error)
 		senderr(error);
 #endif
 	if (ifp->if_flags & IFF_MONITOR)
 		senderr(ENETDOWN);
 	if (!IFNET_IS_UP_RUNNING(ifp))
 		senderr(ENETDOWN);
 	if (vap->iv_state == IEEE80211_S_CAC) {
 		IEEE80211_DPRINTF(vap,
 		    IEEE80211_MSG_OUTPUT | IEEE80211_MSG_DOTH,
 		    "block %s frame in CAC state\n", "raw data");
 		vap->iv_stats.is_tx_badstate++;
 		senderr(EIO);		/* XXX */
 	} else if (vap->iv_state == IEEE80211_S_SCAN)
 		senderr(EIO);
 	/* XXX bypass bridge, pfil, carp, etc. */
 
 	/*
 	 * NB: DLT_IEEE802_11_RADIO identifies the parameters are
 	 * present by setting the sa_len field of the sockaddr (yes,
 	 * this is a hack).
 	 * NB: we assume sa_data is suitably aligned to cast.
 	 */
 	if (dst->sa_len != 0)
 		params = (const struct ieee80211_bpf_params *)dst->sa_data;
 
 	error = ieee80211_validate_frame(m, params);
 	if (error != 0)
 		senderr(error);
 
 	wh = mtod(m, struct ieee80211_frame *);
 
 	/* locate destination node */
 	switch (wh->i_fc[1] & IEEE80211_FC1_DIR_MASK) {
 	case IEEE80211_FC1_DIR_NODS:
 	case IEEE80211_FC1_DIR_FROMDS:
 		ni = ieee80211_find_txnode(vap, wh->i_addr1);
 		break;
 	case IEEE80211_FC1_DIR_TODS:
 	case IEEE80211_FC1_DIR_DSTODS:
 		ni = ieee80211_find_txnode(vap, wh->i_addr3);
 		break;
 	default:
 		senderr(EDOOFUS);
 	}
 	if (ni == NULL) {
 		/*
 		 * Permit packets w/ bpf params through regardless
 		 * (see below about sa_len).
 		 */
 		if (dst->sa_len == 0)
 			senderr(EHOSTUNREACH);
 		ni = ieee80211_ref_node(vap->iv_bss);
 	}
 
 	/*
 	 * Sanitize mbuf for net80211 flags leaked from above.
 	 *
 	 * NB: This must be done before ieee80211_classify as
 	 *     it marks EAPOL in frames with M_EAPOL.
 	 */
 	m->m_flags &= ~M_80211_TX;
 	m->m_flags |= M_ENCAP;		/* mark encapsulated */
 
 	if (IEEE80211_IS_DATA(wh)) {
 		/* calculate priority so drivers can find the tx queue */
 		if (ieee80211_classify(ni, m))
 			senderr(EIO);		/* XXX */
 
 		/* NB: ieee80211_encap does not include 802.11 header */
 		IEEE80211_NODE_STAT_ADD(ni, tx_bytes,
 		    m->m_pkthdr.len - ieee80211_hdrsize(wh));
 	} else
 		M_WME_SETAC(m, WME_AC_BE);
 
 	error = ieee80211_sanitize_rates(ni, m, params);
 	if (error != 0)
 		senderr(error);
 
 	IEEE80211_NODE_STAT(ni, tx_data);
 	if (IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 		IEEE80211_NODE_STAT(ni, tx_mcast);
 		m->m_flags |= M_MCAST;
 	} else
 		IEEE80211_NODE_STAT(ni, tx_ucast);
 
 	IEEE80211_TX_LOCK(ic);
 	ret = ieee80211_raw_output(vap, ni, m, params);
 	IEEE80211_TX_UNLOCK(ic);
 	return (ret);
 bad:
 	if (m != NULL)
 		m_freem(m);
 	if (ni != NULL)
 		ieee80211_free_node(ni);
 	if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	return error;
 #undef senderr
 }
 
 /*
  * Set the direction field and address fields of an outgoing
  * frame.  Note this should be called early on in constructing
  * a frame as it sets i_fc[1]; other bits can then be or'd in.
  */
 void
 ieee80211_send_setup(
 	struct ieee80211_node *ni,
 	struct mbuf *m,
 	int type, int tid,
 	const uint8_t sa[IEEE80211_ADDR_LEN],
 	const uint8_t da[IEEE80211_ADDR_LEN],
 	const uint8_t bssid[IEEE80211_ADDR_LEN])
 {
 #define	WH4(wh)	((struct ieee80211_frame_addr4 *)wh)
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_tx_ampdu *tap;
 	struct ieee80211_frame *wh = mtod(m, struct ieee80211_frame *);
 	ieee80211_seq seqno;
 
 	IEEE80211_TX_LOCK_ASSERT(ni->ni_ic);
 
 	wh->i_fc[0] = IEEE80211_FC0_VERSION_0 | type;
 	if ((type & IEEE80211_FC0_TYPE_MASK) == IEEE80211_FC0_TYPE_DATA) {
 		switch (vap->iv_opmode) {
 		case IEEE80211_M_STA:
 			wh->i_fc[1] = IEEE80211_FC1_DIR_TODS;
 			IEEE80211_ADDR_COPY(wh->i_addr1, bssid);
 			IEEE80211_ADDR_COPY(wh->i_addr2, sa);
 			IEEE80211_ADDR_COPY(wh->i_addr3, da);
 			break;
 		case IEEE80211_M_IBSS:
 		case IEEE80211_M_AHDEMO:
 			wh->i_fc[1] = IEEE80211_FC1_DIR_NODS;
 			IEEE80211_ADDR_COPY(wh->i_addr1, da);
 			IEEE80211_ADDR_COPY(wh->i_addr2, sa);
 			IEEE80211_ADDR_COPY(wh->i_addr3, bssid);
 			break;
 		case IEEE80211_M_HOSTAP:
 			wh->i_fc[1] = IEEE80211_FC1_DIR_FROMDS;
 			IEEE80211_ADDR_COPY(wh->i_addr1, da);
 			IEEE80211_ADDR_COPY(wh->i_addr2, bssid);
 			IEEE80211_ADDR_COPY(wh->i_addr3, sa);
 			break;
 		case IEEE80211_M_WDS:
 			wh->i_fc[1] = IEEE80211_FC1_DIR_DSTODS;
 			IEEE80211_ADDR_COPY(wh->i_addr1, da);
 			IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr);
 			IEEE80211_ADDR_COPY(wh->i_addr3, da);
 			IEEE80211_ADDR_COPY(WH4(wh)->i_addr4, sa);
 			break;
 		case IEEE80211_M_MBSS:
 #ifdef IEEE80211_SUPPORT_MESH
 			if (IEEE80211_IS_MULTICAST(da)) {
 				wh->i_fc[1] = IEEE80211_FC1_DIR_FROMDS;
 				/* XXX next hop */
 				IEEE80211_ADDR_COPY(wh->i_addr1, da);
 				IEEE80211_ADDR_COPY(wh->i_addr2,
 				    vap->iv_myaddr);
 			} else {
 				wh->i_fc[1] = IEEE80211_FC1_DIR_DSTODS;
 				IEEE80211_ADDR_COPY(wh->i_addr1, da);
 				IEEE80211_ADDR_COPY(wh->i_addr2,
 				    vap->iv_myaddr);
 				IEEE80211_ADDR_COPY(wh->i_addr3, da);
 				IEEE80211_ADDR_COPY(WH4(wh)->i_addr4, sa);
 			}
 #endif
 			break;
 		case IEEE80211_M_MONITOR:	/* NB: to quiet compiler */
 			break;
 		}
 	} else {
 		wh->i_fc[1] = IEEE80211_FC1_DIR_NODS;
 		IEEE80211_ADDR_COPY(wh->i_addr1, da);
 		IEEE80211_ADDR_COPY(wh->i_addr2, sa);
 #ifdef IEEE80211_SUPPORT_MESH
 		if (vap->iv_opmode == IEEE80211_M_MBSS)
 			IEEE80211_ADDR_COPY(wh->i_addr3, sa);
 		else
 #endif
 			IEEE80211_ADDR_COPY(wh->i_addr3, bssid);
 	}
 	*(uint16_t *)&wh->i_dur[0] = 0;
 
 	/*
 	 * XXX TODO: this is what the TX lock is for.
 	 * Here we're incrementing sequence numbers, and they
 	 * need to be in lock-step with what the driver is doing
 	 * both in TX ordering and crypto encap (IV increment.)
 	 *
 	 * If the driver does seqno itself, then we can skip
 	 * assigning sequence numbers here, and we can avoid
 	 * requiring the TX lock.
 	 */
 	tap = &ni->ni_tx_ampdu[tid];
 	if (tid != IEEE80211_NONQOS_TID && IEEE80211_AMPDU_RUNNING(tap)) {
 		m->m_flags |= M_AMPDU_MPDU;
 
 		/* NB: zero out i_seq field (for s/w encryption etc) */
 		*(uint16_t *)&wh->i_seq[0] = 0;
 	} else {
 		if (IEEE80211_HAS_SEQ(type & IEEE80211_FC0_TYPE_MASK,
 				      type & IEEE80211_FC0_SUBTYPE_MASK))
 			/*
 			 * 802.11-2012 9.3.2.10 - QoS multicast frames
 			 * come out of a different seqno space.
 			 */
 			if (IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 				seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++;
 			} else {
 				seqno = ni->ni_txseqs[tid]++;
 			}
 		else
 			seqno = 0;
 
 		*(uint16_t *)&wh->i_seq[0] =
 		    htole16(seqno << IEEE80211_SEQ_SEQ_SHIFT);
 		M_SEQNO_SET(m, seqno);
 	}
 
 	if (IEEE80211_IS_MULTICAST(wh->i_addr1))
 		m->m_flags |= M_MCAST;
 #undef WH4
 }
 
 /*
  * Send a management frame to the specified node.  The node pointer
  * must have a reference as the pointer will be passed to the driver
  * and potentially held for a long time.  If the frame is successfully
  * dispatched to the driver, then it is responsible for freeing the
  * reference (and potentially free'ing up any associated storage);
  * otherwise deal with reclaiming any reference (on error).
  */
 int
 ieee80211_mgmt_output(struct ieee80211_node *ni, struct mbuf *m, int type,
 	struct ieee80211_bpf_params *params)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_frame *wh;
 	int ret;
 
 	KASSERT(ni != NULL, ("null node"));
 
 	if (vap->iv_state == IEEE80211_S_CAC) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT | IEEE80211_MSG_DOTH,
 		    ni, "block %s frame in CAC state",
 			ieee80211_mgt_subtype_name(type));
 		vap->iv_stats.is_tx_badstate++;
 		ieee80211_free_node(ni);
 		m_freem(m);
 		return EIO;		/* XXX */
 	}
 
 	M_PREPEND(m, sizeof(struct ieee80211_frame), M_NOWAIT);
 	if (m == NULL) {
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 
 	IEEE80211_TX_LOCK(ic);
 
 	wh = mtod(m, struct ieee80211_frame *);
 	ieee80211_send_setup(ni, m,
 	     IEEE80211_FC0_TYPE_MGT | type, IEEE80211_NONQOS_TID,
 	     vap->iv_myaddr, ni->ni_macaddr, ni->ni_bssid);
 	if (params->ibp_flags & IEEE80211_BPF_CRYPTO) {
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_AUTH, wh->i_addr1,
 		    "encrypting frame (%s)", __func__);
 		wh->i_fc[1] |= IEEE80211_FC1_PROTECTED;
 	}
 	m->m_flags |= M_ENCAP;		/* mark encapsulated */
 
 	KASSERT(type != IEEE80211_FC0_SUBTYPE_PROBE_RESP, ("probe response?"));
 	M_WME_SETAC(m, params->ibp_pri);
 
 #ifdef IEEE80211_DEBUG
 	/* avoid printing too many frames */
 	if ((ieee80211_msg_debug(vap) && doprint(vap, type)) ||
 	    ieee80211_msg_dumppkts(vap)) {
 		printf("[%s] send %s on channel %u\n",
 		    ether_sprintf(wh->i_addr1),
 		    ieee80211_mgt_subtype_name(type),
 		    ieee80211_chan2ieee(ic, ic->ic_curchan));
 	}
 #endif
 	IEEE80211_NODE_STAT(ni, tx_mgmt);
 
 	ret = ieee80211_raw_output(vap, ni, m, params);
 	IEEE80211_TX_UNLOCK(ic);
 	return (ret);
 }
 
 static void
 ieee80211_nulldata_transmitted(struct ieee80211_node *ni, void *arg,
     int status)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 
 	wakeup(vap);
 }
 
 /*
  * Send a null data frame to the specified node.  If the station
  * is setup for QoS then a QoS Null Data frame is constructed.
  * If this is a WDS station then a 4-address frame is constructed.
  *
  * NB: the caller is assumed to have setup a node reference
  *     for use; this is necessary to deal with a race condition
  *     when probing for inactive stations.  Like ieee80211_mgmt_output
  *     we must cleanup any node reference on error;  however we
  *     can safely just unref it as we know it will never be the
  *     last reference to the node.
  */
 int
 ieee80211_send_nulldata(struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct mbuf *m;
 	struct ieee80211_frame *wh;
 	int hdrlen;
 	uint8_t *frm;
 	int ret;
 
 	if (vap->iv_state == IEEE80211_S_CAC) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT | IEEE80211_MSG_DOTH,
 		    ni, "block %s frame in CAC state", "null data");
 		ieee80211_unref_node(&ni);
 		vap->iv_stats.is_tx_badstate++;
 		return EIO;		/* XXX */
 	}
 
 	if (ni->ni_flags & (IEEE80211_NODE_QOS|IEEE80211_NODE_HT))
 		hdrlen = sizeof(struct ieee80211_qosframe);
 	else
 		hdrlen = sizeof(struct ieee80211_frame);
 	/* NB: only WDS vap's get 4-address frames */
 	if (vap->iv_opmode == IEEE80211_M_WDS)
 		hdrlen += IEEE80211_ADDR_LEN;
 	if (ic->ic_flags & IEEE80211_F_DATAPAD)
 		hdrlen = roundup(hdrlen, sizeof(uint32_t));
 
 	m = ieee80211_getmgtframe(&frm, ic->ic_headroom + hdrlen, 0);
 	if (m == NULL) {
 		/* XXX debug msg */
 		ieee80211_unref_node(&ni);
 		vap->iv_stats.is_tx_nobuf++;
 		return ENOMEM;
 	}
 	KASSERT(M_LEADINGSPACE(m) >= hdrlen,
 	    ("leading space %zd", M_LEADINGSPACE(m)));
 	M_PREPEND(m, hdrlen, M_NOWAIT);
 	if (m == NULL) {
 		/* NB: cannot happen */
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 
 	IEEE80211_TX_LOCK(ic);
 
 	wh = mtod(m, struct ieee80211_frame *);		/* NB: a little lie */
 	if (ni->ni_flags & IEEE80211_NODE_QOS) {
 		const int tid = WME_AC_TO_TID(WME_AC_BE);
 		uint8_t *qos;
 
 		ieee80211_send_setup(ni, m,
 		    IEEE80211_FC0_TYPE_DATA | IEEE80211_FC0_SUBTYPE_QOS_NULL,
 		    tid, vap->iv_myaddr, ni->ni_macaddr, ni->ni_bssid);
 
 		if (vap->iv_opmode == IEEE80211_M_WDS)
 			qos = ((struct ieee80211_qosframe_addr4 *) wh)->i_qos;
 		else
 			qos = ((struct ieee80211_qosframe *) wh)->i_qos;
 		qos[0] = tid & IEEE80211_QOS_TID;
 		if (ic->ic_wme.wme_wmeChanParams.cap_wmeParams[WME_AC_BE].wmep_noackPolicy)
 			qos[0] |= IEEE80211_QOS_ACKPOLICY_NOACK;
 		qos[1] = 0;
 	} else {
 		ieee80211_send_setup(ni, m,
 		    IEEE80211_FC0_TYPE_DATA | IEEE80211_FC0_SUBTYPE_NODATA,
 		    IEEE80211_NONQOS_TID,
 		    vap->iv_myaddr, ni->ni_macaddr, ni->ni_bssid);
 	}
 	if (vap->iv_opmode != IEEE80211_M_WDS) {
 		/* NB: power management bit is never sent by an AP */
 		if ((ni->ni_flags & IEEE80211_NODE_PWR_MGT) &&
 		    vap->iv_opmode != IEEE80211_M_HOSTAP)
 			wh->i_fc[1] |= IEEE80211_FC1_PWR_MGT;
 	}
 	if ((ic->ic_flags & IEEE80211_F_SCAN) &&
 	    (ni->ni_flags & IEEE80211_NODE_PWR_MGT)) {
 		ieee80211_add_callback(m, ieee80211_nulldata_transmitted,
 		    NULL);
 	}
 	m->m_len = m->m_pkthdr.len = hdrlen;
 	m->m_flags |= M_ENCAP;		/* mark encapsulated */
 
 	M_WME_SETAC(m, WME_AC_BE);
 
 	IEEE80211_NODE_STAT(ni, tx_data);
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_DEBUG | IEEE80211_MSG_DUMPPKTS, ni,
 	    "send %snull data frame on channel %u, pwr mgt %s",
 	    ni->ni_flags & IEEE80211_NODE_QOS ? "QoS " : "",
 	    ieee80211_chan2ieee(ic, ic->ic_curchan),
 	    wh->i_fc[1] & IEEE80211_FC1_PWR_MGT ? "ena" : "dis");
 
 	ret = ieee80211_raw_output(vap, ni, m, NULL);
 	IEEE80211_TX_UNLOCK(ic);
 	return (ret);
 }
 
 /* 
  * Assign priority to a frame based on any vlan tag assigned
  * to the station and/or any Diffserv setting in an IP header.
  * Finally, if an ACM policy is setup (in station mode) it's
  * applied.
  */
 int
 ieee80211_classify(struct ieee80211_node *ni, struct mbuf *m)
 {
 	const struct ether_header *eh = NULL;
 	uint16_t ether_type;
 	int v_wme_ac, d_wme_ac, ac;
 
 	if (__predict_false(m->m_flags & M_ENCAP)) {
 		struct ieee80211_frame *wh = mtod(m, struct ieee80211_frame *);
 		struct llc *llc;
 		int hdrlen, subtype;
 
 		subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK;
 		if (subtype & IEEE80211_FC0_SUBTYPE_NODATA) {
 			ac = WME_AC_BE;
 			goto done;
 		}
 
 		hdrlen = ieee80211_hdrsize(wh);
 		if (m->m_pkthdr.len < hdrlen + sizeof(*llc))
 			return 1;
 
 		llc = (struct llc *)mtodo(m, hdrlen);
 		if (llc->llc_dsap != LLC_SNAP_LSAP ||
 		    llc->llc_ssap != LLC_SNAP_LSAP ||
 		    llc->llc_control != LLC_UI ||
 		    llc->llc_snap.org_code[0] != 0 ||
 		    llc->llc_snap.org_code[1] != 0 ||
 		    llc->llc_snap.org_code[2] != 0)
 			return 1;
 
 		ether_type = llc->llc_snap.ether_type;
 	} else {
 		eh = mtod(m, struct ether_header *);
 		ether_type = eh->ether_type;
 	}
 
 	/*
 	 * Always promote PAE/EAPOL frames to high priority.
 	 */
 	if (ether_type == htons(ETHERTYPE_PAE)) {
 		/* NB: mark so others don't need to check header */
 		m->m_flags |= M_EAPOL;
 		ac = WME_AC_VO;
 		goto done;
 	}
 	/*
 	 * Non-qos traffic goes to BE.
 	 */
 	if ((ni->ni_flags & IEEE80211_NODE_QOS) == 0) {
 		ac = WME_AC_BE;
 		goto done;
 	}
 
 	/* 
 	 * If node has a vlan tag then all traffic
 	 * to it must have a matching tag.
 	 */
 	v_wme_ac = 0;
 	if (ni->ni_vlan != 0) {
 		 if ((m->m_flags & M_VLANTAG) == 0) {
 			IEEE80211_NODE_STAT(ni, tx_novlantag);
 			return 1;
 		}
 		if (EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) !=
 		    EVL_VLANOFTAG(ni->ni_vlan)) {
 			IEEE80211_NODE_STAT(ni, tx_vlanmismatch);
 			return 1;
 		}
 		/* map vlan priority to AC */
 		v_wme_ac = TID_TO_WME_AC(EVL_PRIOFTAG(ni->ni_vlan));
 	}
 
 	/* XXX m_copydata may be too slow for fast path */
 #ifdef INET
 	if (eh && eh->ether_type == htons(ETHERTYPE_IP)) {
 		uint8_t tos;
 		/*
 		 * IP frame, map the DSCP bits from the TOS field.
 		 */
 		/* NB: ip header may not be in first mbuf */
 		m_copydata(m, sizeof(struct ether_header) +
 		    offsetof(struct ip, ip_tos), sizeof(tos), &tos);
 		tos >>= 5;		/* NB: ECN + low 3 bits of DSCP */
 		d_wme_ac = TID_TO_WME_AC(tos);
 	} else {
 #endif /* INET */
 #ifdef INET6
 	if (eh && eh->ether_type == htons(ETHERTYPE_IPV6)) {
 		uint32_t flow;
 		uint8_t tos;
 		/*
 		 * IPv6 frame, map the DSCP bits from the traffic class field.
 		 */
 		m_copydata(m, sizeof(struct ether_header) +
 		    offsetof(struct ip6_hdr, ip6_flow), sizeof(flow),
 		    (caddr_t) &flow);
 		tos = (uint8_t)(ntohl(flow) >> 20);
 		tos >>= 5;		/* NB: ECN + low 3 bits of DSCP */
 		d_wme_ac = TID_TO_WME_AC(tos);
 	} else {
 #endif /* INET6 */
 		d_wme_ac = WME_AC_BE;
 #ifdef INET6
 	}
 #endif
 #ifdef INET
 	}
 #endif
 	/*
 	 * Use highest priority AC.
 	 */
 	if (v_wme_ac > d_wme_ac)
 		ac = v_wme_ac;
 	else
 		ac = d_wme_ac;
 
 	/*
 	 * Apply ACM policy.
 	 */
 	if (ni->ni_vap->iv_opmode == IEEE80211_M_STA) {
 		static const int acmap[4] = {
 			WME_AC_BK,	/* WME_AC_BE */
 			WME_AC_BK,	/* WME_AC_BK */
 			WME_AC_BE,	/* WME_AC_VI */
 			WME_AC_VI,	/* WME_AC_VO */
 		};
 		struct ieee80211com *ic = ni->ni_ic;
 
 		while (ac != WME_AC_BK &&
 		    ic->ic_wme.wme_wmeBssChanParams.cap_wmeParams[ac].wmep_acm)
 			ac = acmap[ac];
 	}
 done:
 	M_WME_SETAC(m, ac);
 	return 0;
 }
 
 /*
  * Insure there is sufficient contiguous space to encapsulate the
  * 802.11 data frame.  If room isn't already there, arrange for it.
  * Drivers and cipher modules assume we have done the necessary work
  * and fail rudely if they don't find the space they need.
  */
 struct mbuf *
 ieee80211_mbuf_adjust(struct ieee80211vap *vap, int hdrsize,
 	struct ieee80211_key *key, struct mbuf *m)
 {
 #define	TO_BE_RECLAIMED	(sizeof(struct ether_header) - sizeof(struct llc))
 	int needed_space = vap->iv_ic->ic_headroom + hdrsize;
 
 	if (key != NULL) {
 		/* XXX belongs in crypto code? */
 		needed_space += key->wk_cipher->ic_header;
 		/* XXX frags */
 		/*
 		 * When crypto is being done in the host we must insure
 		 * the data are writable for the cipher routines; clone
 		 * a writable mbuf chain.
 		 * XXX handle SWMIC specially
 		 */
 		if (key->wk_flags & (IEEE80211_KEY_SWENCRYPT|IEEE80211_KEY_SWENMIC)) {
 			m = m_unshare(m, M_NOWAIT);
 			if (m == NULL) {
 				IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT,
 				    "%s: cannot get writable mbuf\n", __func__);
 				vap->iv_stats.is_tx_nobuf++; /* XXX new stat */
 				return NULL;
 			}
 		}
 	}
 	/*
 	 * We know we are called just before stripping an Ethernet
 	 * header and prepending an LLC header.  This means we know
 	 * there will be
 	 *	sizeof(struct ether_header) - sizeof(struct llc)
 	 * bytes recovered to which we need additional space for the
 	 * 802.11 header and any crypto header.
 	 */
 	/* XXX check trailing space and copy instead? */
 	if (M_LEADINGSPACE(m) < needed_space - TO_BE_RECLAIMED) {
 		struct mbuf *n = m_gethdr(M_NOWAIT, m->m_type);
 		if (n == NULL) {
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT,
 			    "%s: cannot expand storage\n", __func__);
 			vap->iv_stats.is_tx_nobuf++;
 			m_freem(m);
 			return NULL;
 		}
 		KASSERT(needed_space <= MHLEN,
 		    ("not enough room, need %u got %d\n", needed_space, MHLEN));
 		/*
 		 * Setup new mbuf to have leading space to prepend the
 		 * 802.11 header and any crypto header bits that are
 		 * required (the latter are added when the driver calls
 		 * back to ieee80211_crypto_encap to do crypto encapsulation).
 		 */
 		/* NB: must be first 'cuz it clobbers m_data */
 		m_move_pkthdr(n, m);
 		n->m_len = 0;			/* NB: m_gethdr does not set */
 		n->m_data += needed_space;
 		/*
 		 * Pull up Ethernet header to create the expected layout.
 		 * We could use m_pullup but that's overkill (i.e. we don't
 		 * need the actual data) and it cannot fail so do it inline
 		 * for speed.
 		 */
 		/* NB: struct ether_header is known to be contiguous */
 		n->m_len += sizeof(struct ether_header);
 		m->m_len -= sizeof(struct ether_header);
 		m->m_data += sizeof(struct ether_header);
 		/*
 		 * Replace the head of the chain.
 		 */
 		n->m_next = m;
 		m = n;
 	}
 	return m;
 #undef TO_BE_RECLAIMED
 }
 
 /*
  * Return the transmit key to use in sending a unicast frame.
  * If a unicast key is set we use that.  When no unicast key is set
  * we fall back to the default transmit key.
  */ 
 static __inline struct ieee80211_key *
 ieee80211_crypto_getucastkey(struct ieee80211vap *vap,
 	struct ieee80211_node *ni)
 {
 	if (IEEE80211_KEY_UNDEFINED(&ni->ni_ucastkey)) {
 		if (vap->iv_def_txkey == IEEE80211_KEYIX_NONE ||
 		    IEEE80211_KEY_UNDEFINED(&vap->iv_nw_keys[vap->iv_def_txkey]))
 			return NULL;
 		return &vap->iv_nw_keys[vap->iv_def_txkey];
 	} else {
 		return &ni->ni_ucastkey;
 	}
 }
 
 /*
  * Return the transmit key to use in sending a multicast frame.
  * Multicast traffic always uses the group key which is installed as
  * the default tx key.
  */ 
 static __inline struct ieee80211_key *
 ieee80211_crypto_getmcastkey(struct ieee80211vap *vap,
 	struct ieee80211_node *ni)
 {
 	if (vap->iv_def_txkey == IEEE80211_KEYIX_NONE ||
 	    IEEE80211_KEY_UNDEFINED(&vap->iv_nw_keys[vap->iv_def_txkey]))
 		return NULL;
 	return &vap->iv_nw_keys[vap->iv_def_txkey];
 }
 
 /*
  * Encapsulate an outbound data frame.  The mbuf chain is updated.
  * If an error is encountered NULL is returned.  The caller is required
  * to provide a node reference and pullup the ethernet header in the
  * first mbuf.
  *
  * NB: Packet is assumed to be processed by ieee80211_classify which
  *     marked EAPOL frames w/ M_EAPOL.
  */
 struct mbuf *
 ieee80211_encap(struct ieee80211vap *vap, struct ieee80211_node *ni,
     struct mbuf *m)
 {
 #define	WH4(wh)	((struct ieee80211_frame_addr4 *)(wh))
 #define MC01(mc)	((struct ieee80211_meshcntl_ae01 *)mc)
 	struct ieee80211com *ic = ni->ni_ic;
 #ifdef IEEE80211_SUPPORT_MESH
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_meshcntl_ae10 *mc;
 	struct ieee80211_mesh_route *rt = NULL;
 	int dir = -1;
 #endif
 	struct ether_header eh;
 	struct ieee80211_frame *wh;
 	struct ieee80211_key *key;
 	struct llc *llc;
 	int hdrsize, hdrspace, datalen, addqos, txfrag, is4addr, is_mcast;
 	ieee80211_seq seqno;
 	int meshhdrsize, meshae;
 	uint8_t *qos;
 	int is_amsdu = 0;
 	
 	IEEE80211_TX_LOCK_ASSERT(ic);
 
 	is_mcast = !! (m->m_flags & (M_MCAST | M_BCAST));
 
 	/*
 	 * Copy existing Ethernet header to a safe place.  The
 	 * rest of the code assumes it's ok to strip it when
 	 * reorganizing state for the final encapsulation.
 	 */
 	KASSERT(m->m_len >= sizeof(eh), ("no ethernet header!"));
 	ETHER_HEADER_COPY(&eh, mtod(m, caddr_t));
 
 	/*
 	 * Insure space for additional headers.  First identify
 	 * transmit key to use in calculating any buffer adjustments
 	 * required.  This is also used below to do privacy
 	 * encapsulation work.  Then calculate the 802.11 header
 	 * size and any padding required by the driver.
 	 *
 	 * Note key may be NULL if we fall back to the default
 	 * transmit key and that is not set.  In that case the
 	 * buffer may not be expanded as needed by the cipher
 	 * routines, but they will/should discard it.
 	 */
 	if (vap->iv_flags & IEEE80211_F_PRIVACY) {
 		if (vap->iv_opmode == IEEE80211_M_STA ||
 		    !IEEE80211_IS_MULTICAST(eh.ether_dhost) ||
 		    (vap->iv_opmode == IEEE80211_M_WDS &&
 		     (vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY)))
 			key = ieee80211_crypto_getucastkey(vap, ni);
 		else
 			key = ieee80211_crypto_getmcastkey(vap, ni);
 		if (key == NULL && (m->m_flags & M_EAPOL) == 0) {
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_CRYPTO,
 			    eh.ether_dhost,
 			    "no default transmit key (%s) deftxkey %u",
 			    __func__, vap->iv_def_txkey);
 			vap->iv_stats.is_tx_nodefkey++;
 			goto bad;
 		}
 	} else
 		key = NULL;
 	/*
 	 * XXX Some ap's don't handle QoS-encapsulated EAPOL
 	 * frames so suppress use.  This may be an issue if other
 	 * ap's require all data frames to be QoS-encapsulated
 	 * once negotiated in which case we'll need to make this
 	 * configurable.
 	 *
 	 * Don't send multicast QoS frames.
 	 * Technically multicast frames can be QoS if all stations in the
 	 * BSS are also QoS.
 	 *
 	 * NB: mesh data frames are QoS, including multicast frames.
 	 */
 	addqos =
 	    (((is_mcast == 0) && (ni->ni_flags &
 	     (IEEE80211_NODE_QOS|IEEE80211_NODE_HT))) ||
 	    (vap->iv_opmode == IEEE80211_M_MBSS)) &&
 	    (m->m_flags & M_EAPOL) == 0;
 
 	if (addqos)
 		hdrsize = sizeof(struct ieee80211_qosframe);
 	else
 		hdrsize = sizeof(struct ieee80211_frame);
 #ifdef IEEE80211_SUPPORT_MESH
 	if (vap->iv_opmode == IEEE80211_M_MBSS) {
 		/*
 		 * Mesh data frames are encapsulated according to the
 		 * rules of Section 11B.8.5 (p.139 of D3.0 spec).
 		 * o Group Addressed data (aka multicast) originating
 		 *   at the local sta are sent w/ 3-address format and
 		 *   address extension mode 00
 		 * o Individually Addressed data (aka unicast) originating
 		 *   at the local sta are sent w/ 4-address format and
 		 *   address extension mode 00
 		 * o Group Addressed data forwarded from a non-mesh sta are
 		 *   sent w/ 3-address format and address extension mode 01
 		 * o Individually Address data from another sta are sent
 		 *   w/ 4-address format and address extension mode 10
 		 */
 		is4addr = 0;		/* NB: don't use, disable */
 		if (!IEEE80211_IS_MULTICAST(eh.ether_dhost)) {
 			rt = ieee80211_mesh_rt_find(vap, eh.ether_dhost);
 			KASSERT(rt != NULL, ("route is NULL"));
 			dir = IEEE80211_FC1_DIR_DSTODS;
 			hdrsize += IEEE80211_ADDR_LEN;
 			if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY) {
 				if (IEEE80211_ADDR_EQ(rt->rt_mesh_gate,
 				    vap->iv_myaddr)) {
 					IEEE80211_NOTE_MAC(vap,
 					    IEEE80211_MSG_MESH,
 					    eh.ether_dhost,
 					    "%s", "trying to send to ourself");
 					goto bad;
 				}
 				meshae = IEEE80211_MESH_AE_10;
 				meshhdrsize =
 				    sizeof(struct ieee80211_meshcntl_ae10);
 			} else {
 				meshae = IEEE80211_MESH_AE_00;
 				meshhdrsize =
 				    sizeof(struct ieee80211_meshcntl);
 			}
 		} else {
 			dir = IEEE80211_FC1_DIR_FROMDS;
 			if (!IEEE80211_ADDR_EQ(eh.ether_shost, vap->iv_myaddr)) {
 				/* proxy group */
 				meshae = IEEE80211_MESH_AE_01;
 				meshhdrsize =
 				    sizeof(struct ieee80211_meshcntl_ae01);
 			} else {
 				/* group */
 				meshae = IEEE80211_MESH_AE_00;
 				meshhdrsize = sizeof(struct ieee80211_meshcntl);
 			}
 		}
 	} else {
 #endif
 		/*
 		 * 4-address frames need to be generated for:
 		 * o packets sent through a WDS vap (IEEE80211_M_WDS)
 		 * o packets sent through a vap marked for relaying
 		 *   (e.g. a station operating with dynamic WDS)
 		 */
 		is4addr = vap->iv_opmode == IEEE80211_M_WDS ||
 		    ((vap->iv_flags_ext & IEEE80211_FEXT_4ADDR) &&
 		     !IEEE80211_ADDR_EQ(eh.ether_shost, vap->iv_myaddr));
 		if (is4addr)
 			hdrsize += IEEE80211_ADDR_LEN;
 		meshhdrsize = meshae = 0;
 #ifdef IEEE80211_SUPPORT_MESH
 	}
 #endif
 	/*
 	 * Honor driver DATAPAD requirement.
 	 */
 	if (ic->ic_flags & IEEE80211_F_DATAPAD)
 		hdrspace = roundup(hdrsize, sizeof(uint32_t));
 	else
 		hdrspace = hdrsize;
 
 	if (__predict_true((m->m_flags & M_FF) == 0)) {
 		/*
 		 * Normal frame.
 		 */
 		m = ieee80211_mbuf_adjust(vap, hdrspace + meshhdrsize, key, m);
 		if (m == NULL) {
 			/* NB: ieee80211_mbuf_adjust handles msgs+statistics */
 			goto bad;
 		}
 		/* NB: this could be optimized 'cuz of ieee80211_mbuf_adjust */
 		m_adj(m, sizeof(struct ether_header) - sizeof(struct llc));
 		llc = mtod(m, struct llc *);
 		llc->llc_dsap = llc->llc_ssap = LLC_SNAP_LSAP;
 		llc->llc_control = LLC_UI;
 		llc->llc_snap.org_code[0] = 0;
 		llc->llc_snap.org_code[1] = 0;
 		llc->llc_snap.org_code[2] = 0;
 		llc->llc_snap.ether_type = eh.ether_type;
 	} else {
 #ifdef IEEE80211_SUPPORT_SUPERG
 		/*
 		 * Aggregated frame.  Check if it's for AMSDU or FF.
 		 *
 		 * XXX TODO: IEEE80211_NODE_AMSDU* isn't implemented
 		 * anywhere for some reason.  But, since 11n requires
 		 * AMSDU RX, we can just assume "11n" == "AMSDU".
 		 */
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SUPERG, "%s: called; M_FF\n", __func__);
 		if (ieee80211_amsdu_tx_ok(ni)) {
 			m = ieee80211_amsdu_encap(vap, m, hdrspace + meshhdrsize, key);
 			is_amsdu = 1;
 		} else {
 			m = ieee80211_ff_encap(vap, m, hdrspace + meshhdrsize, key);
 		}
 		if (m == NULL)
 #endif
 			goto bad;
 	}
 	datalen = m->m_pkthdr.len;		/* NB: w/o 802.11 header */
 
 	M_PREPEND(m, hdrspace + meshhdrsize, M_NOWAIT);
 	if (m == NULL) {
 		vap->iv_stats.is_tx_nobuf++;
 		goto bad;
 	}
 	wh = mtod(m, struct ieee80211_frame *);
 	wh->i_fc[0] = IEEE80211_FC0_VERSION_0 | IEEE80211_FC0_TYPE_DATA;
 	*(uint16_t *)wh->i_dur = 0;
 	qos = NULL;	/* NB: quiet compiler */
 	if (is4addr) {
 		wh->i_fc[1] = IEEE80211_FC1_DIR_DSTODS;
 		IEEE80211_ADDR_COPY(wh->i_addr1, ni->ni_macaddr);
 		IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr);
 		IEEE80211_ADDR_COPY(wh->i_addr3, eh.ether_dhost);
 		IEEE80211_ADDR_COPY(WH4(wh)->i_addr4, eh.ether_shost);
 	} else switch (vap->iv_opmode) {
 	case IEEE80211_M_STA:
 		wh->i_fc[1] = IEEE80211_FC1_DIR_TODS;
 		IEEE80211_ADDR_COPY(wh->i_addr1, ni->ni_bssid);
 		IEEE80211_ADDR_COPY(wh->i_addr2, eh.ether_shost);
 		IEEE80211_ADDR_COPY(wh->i_addr3, eh.ether_dhost);
 		break;
 	case IEEE80211_M_IBSS:
 	case IEEE80211_M_AHDEMO:
 		wh->i_fc[1] = IEEE80211_FC1_DIR_NODS;
 		IEEE80211_ADDR_COPY(wh->i_addr1, eh.ether_dhost);
 		IEEE80211_ADDR_COPY(wh->i_addr2, eh.ether_shost);
 		/*
 		 * NB: always use the bssid from iv_bss as the
 		 *     neighbor's may be stale after an ibss merge
 		 */
 		IEEE80211_ADDR_COPY(wh->i_addr3, vap->iv_bss->ni_bssid);
 		break;
 	case IEEE80211_M_HOSTAP:
 		wh->i_fc[1] = IEEE80211_FC1_DIR_FROMDS;
 		IEEE80211_ADDR_COPY(wh->i_addr1, eh.ether_dhost);
 		IEEE80211_ADDR_COPY(wh->i_addr2, ni->ni_bssid);
 		IEEE80211_ADDR_COPY(wh->i_addr3, eh.ether_shost);
 		break;
 #ifdef IEEE80211_SUPPORT_MESH
 	case IEEE80211_M_MBSS:
 		/* NB: offset by hdrspace to deal with DATAPAD */
 		mc = (struct ieee80211_meshcntl_ae10 *)
 		     (mtod(m, uint8_t *) + hdrspace);
 		wh->i_fc[1] = dir;
 		switch (meshae) {
 		case IEEE80211_MESH_AE_00:	/* no proxy */
 			mc->mc_flags = 0;
 			if (dir == IEEE80211_FC1_DIR_DSTODS) { /* ucast */
 				IEEE80211_ADDR_COPY(wh->i_addr1,
 				    ni->ni_macaddr);
 				IEEE80211_ADDR_COPY(wh->i_addr2,
 				    vap->iv_myaddr);
 				IEEE80211_ADDR_COPY(wh->i_addr3,
 				    eh.ether_dhost);
 				IEEE80211_ADDR_COPY(WH4(wh)->i_addr4,
 				    eh.ether_shost);
 				qos =((struct ieee80211_qosframe_addr4 *)
 				    wh)->i_qos;
 			} else if (dir == IEEE80211_FC1_DIR_FROMDS) {
 				 /* mcast */
 				IEEE80211_ADDR_COPY(wh->i_addr1,
 				    eh.ether_dhost);
 				IEEE80211_ADDR_COPY(wh->i_addr2,
 				    vap->iv_myaddr);
 				IEEE80211_ADDR_COPY(wh->i_addr3,
 				    eh.ether_shost);
 				qos = ((struct ieee80211_qosframe *)
 				    wh)->i_qos;
 			}
 			break;
 		case IEEE80211_MESH_AE_01:	/* mcast, proxy */
 			wh->i_fc[1] = IEEE80211_FC1_DIR_FROMDS;
 			IEEE80211_ADDR_COPY(wh->i_addr1, eh.ether_dhost);
 			IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr);
 			IEEE80211_ADDR_COPY(wh->i_addr3, vap->iv_myaddr);
 			mc->mc_flags = 1;
 			IEEE80211_ADDR_COPY(MC01(mc)->mc_addr4,
 			    eh.ether_shost);
 			qos = ((struct ieee80211_qosframe *) wh)->i_qos;
 			break;
 		case IEEE80211_MESH_AE_10:	/* ucast, proxy */
 			KASSERT(rt != NULL, ("route is NULL"));
 			IEEE80211_ADDR_COPY(wh->i_addr1, rt->rt_nexthop);
 			IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr);
 			IEEE80211_ADDR_COPY(wh->i_addr3, rt->rt_mesh_gate);
 			IEEE80211_ADDR_COPY(WH4(wh)->i_addr4, vap->iv_myaddr);
 			mc->mc_flags = IEEE80211_MESH_AE_10;
 			IEEE80211_ADDR_COPY(mc->mc_addr5, eh.ether_dhost);
 			IEEE80211_ADDR_COPY(mc->mc_addr6, eh.ether_shost);
 			qos = ((struct ieee80211_qosframe_addr4 *) wh)->i_qos;
 			break;
 		default:
 			KASSERT(0, ("meshae %d", meshae));
 			break;
 		}
 		mc->mc_ttl = ms->ms_ttl;
 		ms->ms_seq++;
 		le32enc(mc->mc_seq, ms->ms_seq);
 		break;
 #endif
 	case IEEE80211_M_WDS:		/* NB: is4addr should always be true */
 	default:
 		goto bad;
 	}
 	if (m->m_flags & M_MORE_DATA)
 		wh->i_fc[1] |= IEEE80211_FC1_MORE_DATA;
 	if (addqos) {
 		int ac, tid;
 
 		if (is4addr) {
 			qos = ((struct ieee80211_qosframe_addr4 *) wh)->i_qos;
 		/* NB: mesh case handled earlier */
 		} else if (vap->iv_opmode != IEEE80211_M_MBSS)
 			qos = ((struct ieee80211_qosframe *) wh)->i_qos;
 		ac = M_WME_GETAC(m);
 		/* map from access class/queue to 11e header priorty value */
 		tid = WME_AC_TO_TID(ac);
 		qos[0] = tid & IEEE80211_QOS_TID;
 		if (ic->ic_wme.wme_wmeChanParams.cap_wmeParams[ac].wmep_noackPolicy)
 			qos[0] |= IEEE80211_QOS_ACKPOLICY_NOACK;
 #ifdef IEEE80211_SUPPORT_MESH
 		if (vap->iv_opmode == IEEE80211_M_MBSS)
 			qos[1] = IEEE80211_QOS_MC;
 		else
 #endif
 			qos[1] = 0;
 		wh->i_fc[0] |= IEEE80211_FC0_SUBTYPE_QOS;
 
 		/*
 		 * If this is an A-MSDU then ensure we set the
 		 * relevant field.
 		 */
 		if (is_amsdu)
 			qos[0] |= IEEE80211_QOS_AMSDU;
 
 		/*
 		 * XXX TODO TX lock is needed for atomic updates of sequence
 		 * numbers.  If the driver does it, then don't do it here;
 		 * and we don't need the TX lock held.
 		 */
 		if ((m->m_flags & M_AMPDU_MPDU) == 0) {
 			/*
 			 * 802.11-2012 9.3.2.10 -
 			 *
 			 * If this is a multicast frame then we need
 			 * to ensure that the sequence number comes from
 			 * a separate seqno space and not the TID space.
 			 *
 			 * Otherwise multicast frames may actually cause
 			 * holes in the TX blockack window space and
 			 * upset various things.
 			 */
 			if (IEEE80211_IS_MULTICAST(wh->i_addr1))
 				seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++;
 			else
 				seqno = ni->ni_txseqs[tid]++;
 
 			/*
 			 * NB: don't assign a sequence # to potential
 			 * aggregates; we expect this happens at the
 			 * point the frame comes off any aggregation q
 			 * as otherwise we may introduce holes in the
 			 * BA sequence space and/or make window accouting
 			 * more difficult.
 			 *
 			 * XXX may want to control this with a driver
 			 * capability; this may also change when we pull
 			 * aggregation up into net80211
 			 */
 			*(uint16_t *)wh->i_seq =
 			    htole16(seqno << IEEE80211_SEQ_SEQ_SHIFT);
 			M_SEQNO_SET(m, seqno);
 		} else {
 			/* NB: zero out i_seq field (for s/w encryption etc) */
 			*(uint16_t *)wh->i_seq = 0;
 		}
 	} else {
 		/*
 		 * XXX TODO TX lock is needed for atomic updates of sequence
 		 * numbers.  If the driver does it, then don't do it here;
 		 * and we don't need the TX lock held.
 		 */
 		seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++;
 		*(uint16_t *)wh->i_seq =
 		    htole16(seqno << IEEE80211_SEQ_SEQ_SHIFT);
 		M_SEQNO_SET(m, seqno);
 
 		/*
 		 * XXX TODO: we shouldn't allow EAPOL, etc that would
 		 * be forced to be non-QoS traffic to be A-MSDU encapsulated.
 		 */
 		if (is_amsdu)
 			printf("%s: XXX ERROR: is_amsdu set; not QoS!\n",
 			    __func__);
 	}
 
 	/*
 	 * Check if xmit fragmentation is required.
 	 *
 	 * If the hardware does fragmentation offload, then don't bother
 	 * doing it here.
 	 */
 	if (IEEE80211_CONF_FRAG_OFFLOAD(ic))
 		txfrag = 0;
 	else
 		txfrag = (m->m_pkthdr.len > vap->iv_fragthreshold &&
 		    !IEEE80211_IS_MULTICAST(wh->i_addr1) &&
 		    (vap->iv_caps & IEEE80211_C_TXFRAG) &&
 		    (m->m_flags & (M_FF | M_AMPDU_MPDU)) == 0);
 
 	if (key != NULL) {
 		/*
 		 * IEEE 802.1X: send EAPOL frames always in the clear.
 		 * WPA/WPA2: encrypt EAPOL keys when pairwise keys are set.
 		 */
 		if ((m->m_flags & M_EAPOL) == 0 ||
 		    ((vap->iv_flags & IEEE80211_F_WPA) &&
 		     (vap->iv_opmode == IEEE80211_M_STA ?
 		      !IEEE80211_KEY_UNDEFINED(key) :
 		      !IEEE80211_KEY_UNDEFINED(&ni->ni_ucastkey)))) {
 			wh->i_fc[1] |= IEEE80211_FC1_PROTECTED;
 			if (!ieee80211_crypto_enmic(vap, key, m, txfrag)) {
 				IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_OUTPUT,
 				    eh.ether_dhost,
 				    "%s", "enmic failed, discard frame");
 				vap->iv_stats.is_crypto_enmicfail++;
 				goto bad;
 			}
 		}
 	}
 	if (txfrag && !ieee80211_fragment(vap, m, hdrsize,
 	    key != NULL ? key->wk_cipher->ic_header : 0, vap->iv_fragthreshold))
 		goto bad;
 
 	m->m_flags |= M_ENCAP;		/* mark encapsulated */
 
 	IEEE80211_NODE_STAT(ni, tx_data);
 	if (IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 		IEEE80211_NODE_STAT(ni, tx_mcast);
 		m->m_flags |= M_MCAST;
 	} else
 		IEEE80211_NODE_STAT(ni, tx_ucast);
 	IEEE80211_NODE_STAT_ADD(ni, tx_bytes, datalen);
 
 	return m;
 bad:
 	if (m != NULL)
 		m_freem(m);
 	return NULL;
 #undef WH4
 #undef MC01
 }
 
 void
 ieee80211_free_mbuf(struct mbuf *m)
 {
 	struct mbuf *next;
 
 	if (m == NULL)
 		return;
 
 	do {
 		next = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		m_freem(m);
 	} while ((m = next) != NULL);
 }
 
 /*
  * Fragment the frame according to the specified mtu.
  * The size of the 802.11 header (w/o padding) is provided
  * so we don't need to recalculate it.  We create a new
  * mbuf for each fragment and chain it through m_nextpkt;
  * we might be able to optimize this by reusing the original
  * packet's mbufs but that is significantly more complicated.
  */
 static int
 ieee80211_fragment(struct ieee80211vap *vap, struct mbuf *m0,
 	u_int hdrsize, u_int ciphdrsize, u_int mtu)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_frame *wh, *whf;
 	struct mbuf *m, *prev;
 	u_int totalhdrsize, fragno, fragsize, off, remainder, payload;
 	u_int hdrspace;
 
 	KASSERT(m0->m_nextpkt == NULL, ("mbuf already chained?"));
 	KASSERT(m0->m_pkthdr.len > mtu,
 		("pktlen %u mtu %u", m0->m_pkthdr.len, mtu));
 
 	/*
 	 * Honor driver DATAPAD requirement.
 	 */
 	if (ic->ic_flags & IEEE80211_F_DATAPAD)
 		hdrspace = roundup(hdrsize, sizeof(uint32_t));
 	else
 		hdrspace = hdrsize;
 
 	wh = mtod(m0, struct ieee80211_frame *);
 	/* NB: mark the first frag; it will be propagated below */
 	wh->i_fc[1] |= IEEE80211_FC1_MORE_FRAG;
 	totalhdrsize = hdrspace + ciphdrsize;
 	fragno = 1;
 	off = mtu - ciphdrsize;
 	remainder = m0->m_pkthdr.len - off;
 	prev = m0;
 	do {
 		fragsize = MIN(totalhdrsize + remainder, mtu);
 		m = m_get2(fragsize, M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (m == NULL)
 			goto bad;
 		/* leave room to prepend any cipher header */
 		m_align(m, fragsize - ciphdrsize);
 
 		/*
 		 * Form the header in the fragment.  Note that since
 		 * we mark the first fragment with the MORE_FRAG bit
 		 * it automatically is propagated to each fragment; we
 		 * need only clear it on the last fragment (done below).
 		 * NB: frag 1+ dont have Mesh Control field present.
 		 */
 		whf = mtod(m, struct ieee80211_frame *);
 		memcpy(whf, wh, hdrsize);
 #ifdef IEEE80211_SUPPORT_MESH
 		if (vap->iv_opmode == IEEE80211_M_MBSS)
 			ieee80211_getqos(wh)[1] &= ~IEEE80211_QOS_MC;
 #endif
 		*(uint16_t *)&whf->i_seq[0] |= htole16(
 			(fragno & IEEE80211_SEQ_FRAG_MASK) <<
 				IEEE80211_SEQ_FRAG_SHIFT);
 		fragno++;
 
 		payload = fragsize - totalhdrsize;
 		/* NB: destination is known to be contiguous */
 
 		m_copydata(m0, off, payload, mtod(m, uint8_t *) + hdrspace);
 		m->m_len = hdrspace + payload;
 		m->m_pkthdr.len = hdrspace + payload;
 		m->m_flags |= M_FRAG;
 
 		/* chain up the fragment */
 		prev->m_nextpkt = m;
 		prev = m;
 
 		/* deduct fragment just formed */
 		remainder -= payload;
 		off += payload;
 	} while (remainder != 0);
 
 	/* set the last fragment */
 	m->m_flags |= M_LASTFRAG;
 	whf->i_fc[1] &= ~IEEE80211_FC1_MORE_FRAG;
 
 	/* strip first mbuf now that everything has been copied */
 	m_adj(m0, -(m0->m_pkthdr.len - (mtu - ciphdrsize)));
 	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
 
 	vap->iv_stats.is_tx_fragframes++;
 	vap->iv_stats.is_tx_frags += fragno-1;
 
 	return 1;
 bad:
 	/* reclaim fragments but leave original frame for caller to free */
 	ieee80211_free_mbuf(m0->m_nextpkt);
 	m0->m_nextpkt = NULL;
 	return 0;
 }
 
 /*
  * Add a supported rates element id to a frame.
  */
 uint8_t *
 ieee80211_add_rates(uint8_t *frm, const struct ieee80211_rateset *rs)
 {
 	int nrates;
 
 	*frm++ = IEEE80211_ELEMID_RATES;
 	nrates = rs->rs_nrates;
 	if (nrates > IEEE80211_RATE_SIZE)
 		nrates = IEEE80211_RATE_SIZE;
 	*frm++ = nrates;
 	memcpy(frm, rs->rs_rates, nrates);
 	return frm + nrates;
 }
 
 /*
  * Add an extended supported rates element id to a frame.
  */
 uint8_t *
 ieee80211_add_xrates(uint8_t *frm, const struct ieee80211_rateset *rs)
 {
 	/*
 	 * Add an extended supported rates element if operating in 11g mode.
 	 */
 	if (rs->rs_nrates > IEEE80211_RATE_SIZE) {
 		int nrates = rs->rs_nrates - IEEE80211_RATE_SIZE;
 		*frm++ = IEEE80211_ELEMID_XRATES;
 		*frm++ = nrates;
 		memcpy(frm, rs->rs_rates + IEEE80211_RATE_SIZE, nrates);
 		frm += nrates;
 	}
 	return frm;
 }
 
 /* 
  * Add an ssid element to a frame.
  */
 uint8_t *
 ieee80211_add_ssid(uint8_t *frm, const uint8_t *ssid, u_int len)
 {
 	*frm++ = IEEE80211_ELEMID_SSID;
 	*frm++ = len;
 	memcpy(frm, ssid, len);
 	return frm + len;
 }
 
 /*
  * Add an erp element to a frame.
  */
 static uint8_t *
 ieee80211_add_erp(uint8_t *frm, struct ieee80211com *ic)
 {
 	uint8_t erp;
 
 	*frm++ = IEEE80211_ELEMID_ERP;
 	*frm++ = 1;
 	erp = 0;
 	if (ic->ic_nonerpsta != 0)
 		erp |= IEEE80211_ERP_NON_ERP_PRESENT;
 	if (ic->ic_flags & IEEE80211_F_USEPROT)
 		erp |= IEEE80211_ERP_USE_PROTECTION;
 	if (ic->ic_flags & IEEE80211_F_USEBARKER)
 		erp |= IEEE80211_ERP_LONG_PREAMBLE;
 	*frm++ = erp;
 	return frm;
 }
 
 /*
  * Add a CFParams element to a frame.
  */
 static uint8_t *
 ieee80211_add_cfparms(uint8_t *frm, struct ieee80211com *ic)
 {
 #define	ADDSHORT(frm, v) do {	\
 	le16enc(frm, v);	\
 	frm += 2;		\
 } while (0)
 	*frm++ = IEEE80211_ELEMID_CFPARMS;
 	*frm++ = 6;
 	*frm++ = 0;		/* CFP count */
 	*frm++ = 2;		/* CFP period */
 	ADDSHORT(frm, 0);	/* CFP MaxDuration (TU) */
 	ADDSHORT(frm, 0);	/* CFP CurRemaining (TU) */
 	return frm;
 #undef ADDSHORT
 }
 
 static __inline uint8_t *
 add_appie(uint8_t *frm, const struct ieee80211_appie *ie)
 {
 	memcpy(frm, ie->ie_data, ie->ie_len);
 	return frm + ie->ie_len;
 }
 
 static __inline uint8_t *
 add_ie(uint8_t *frm, const uint8_t *ie)
 {
 	memcpy(frm, ie, 2 + ie[1]);
 	return frm + 2 + ie[1];
 }
 
 #define	WME_OUI_BYTES		0x00, 0x50, 0xf2
 /*
  * Add a WME information element to a frame.
  */
 uint8_t *
 ieee80211_add_wme_info(uint8_t *frm, struct ieee80211_wme_state *wme)
 {
 	static const struct ieee80211_wme_info info = {
 		.wme_id		= IEEE80211_ELEMID_VENDOR,
 		.wme_len	= sizeof(struct ieee80211_wme_info) - 2,
 		.wme_oui	= { WME_OUI_BYTES },
 		.wme_type	= WME_OUI_TYPE,
 		.wme_subtype	= WME_INFO_OUI_SUBTYPE,
 		.wme_version	= WME_VERSION,
 		.wme_info	= 0,
 	};
 	memcpy(frm, &info, sizeof(info));
 	return frm + sizeof(info); 
 }
 
 /*
  * Add a WME parameters element to a frame.
  */
 static uint8_t *
 ieee80211_add_wme_param(uint8_t *frm, struct ieee80211_wme_state *wme)
 {
 #define	SM(_v, _f)	(((_v) << _f##_S) & _f)
 #define	ADDSHORT(frm, v) do {	\
 	le16enc(frm, v);	\
 	frm += 2;		\
 } while (0)
 	/* NB: this works 'cuz a param has an info at the front */
 	static const struct ieee80211_wme_info param = {
 		.wme_id		= IEEE80211_ELEMID_VENDOR,
 		.wme_len	= sizeof(struct ieee80211_wme_param) - 2,
 		.wme_oui	= { WME_OUI_BYTES },
 		.wme_type	= WME_OUI_TYPE,
 		.wme_subtype	= WME_PARAM_OUI_SUBTYPE,
 		.wme_version	= WME_VERSION,
 	};
 	int i;
 
 	memcpy(frm, &param, sizeof(param));
 	frm += __offsetof(struct ieee80211_wme_info, wme_info);
 	*frm++ = wme->wme_bssChanParams.cap_info;	/* AC info */
 	*frm++ = 0;					/* reserved field */
 	for (i = 0; i < WME_NUM_AC; i++) {
 		const struct wmeParams *ac =
 		       &wme->wme_bssChanParams.cap_wmeParams[i];
 		*frm++ = SM(i, WME_PARAM_ACI)
 		       | SM(ac->wmep_acm, WME_PARAM_ACM)
 		       | SM(ac->wmep_aifsn, WME_PARAM_AIFSN)
 		       ;
 		*frm++ = SM(ac->wmep_logcwmax, WME_PARAM_LOGCWMAX)
 		       | SM(ac->wmep_logcwmin, WME_PARAM_LOGCWMIN)
 		       ;
 		ADDSHORT(frm, ac->wmep_txopLimit);
 	}
 	return frm;
 #undef SM
 #undef ADDSHORT
 }
 #undef WME_OUI_BYTES
 
 /*
  * Add an 11h Power Constraint element to a frame.
  */
 static uint8_t *
 ieee80211_add_powerconstraint(uint8_t *frm, struct ieee80211vap *vap)
 {
 	const struct ieee80211_channel *c = vap->iv_bss->ni_chan;
 	/* XXX per-vap tx power limit? */
 	int8_t limit = vap->iv_ic->ic_txpowlimit / 2;
 
 	frm[0] = IEEE80211_ELEMID_PWRCNSTR;
 	frm[1] = 1;
 	frm[2] = c->ic_maxregpower > limit ?  c->ic_maxregpower - limit : 0;
 	return frm + 3;
 }
 
 /*
  * Add an 11h Power Capability element to a frame.
  */
 static uint8_t *
 ieee80211_add_powercapability(uint8_t *frm, const struct ieee80211_channel *c)
 {
 	frm[0] = IEEE80211_ELEMID_PWRCAP;
 	frm[1] = 2;
 	frm[2] = c->ic_minpower;
 	frm[3] = c->ic_maxpower;
 	return frm + 4;
 }
 
 /*
  * Add an 11h Supported Channels element to a frame.
  */
 static uint8_t *
 ieee80211_add_supportedchannels(uint8_t *frm, struct ieee80211com *ic)
 {
 	static const int ielen = 26;
 
 	frm[0] = IEEE80211_ELEMID_SUPPCHAN;
 	frm[1] = ielen;
 	/* XXX not correct */
 	memcpy(frm+2, ic->ic_chan_avail, ielen);
 	return frm + 2 + ielen;
 }
 
 /*
  * Add an 11h Quiet time element to a frame.
  */
 static uint8_t *
 ieee80211_add_quiet(uint8_t *frm, struct ieee80211vap *vap, int update)
 {
 	struct ieee80211_quiet_ie *quiet = (struct ieee80211_quiet_ie *) frm;
 
 	quiet->quiet_ie = IEEE80211_ELEMID_QUIET;
 	quiet->len = 6;
 
 	/*
 	 * Only update every beacon interval - otherwise probe responses
 	 * would update the quiet count value.
 	 */
 	if (update) {
 		if (vap->iv_quiet_count_value == 1)
 			vap->iv_quiet_count_value = vap->iv_quiet_count;
 		else if (vap->iv_quiet_count_value > 1)
 			vap->iv_quiet_count_value--;
 	}
 
 	if (vap->iv_quiet_count_value == 0) {
 		/* value 0 is reserved as per 802.11h standerd */
 		vap->iv_quiet_count_value = 1;
 	}
 
 	quiet->tbttcount = vap->iv_quiet_count_value;
 	quiet->period = vap->iv_quiet_period;
 	quiet->duration = htole16(vap->iv_quiet_duration);
 	quiet->offset = htole16(vap->iv_quiet_offset);
 	return frm + sizeof(*quiet);
 }
 
 /*
  * Add an 11h Channel Switch Announcement element to a frame.
  * Note that we use the per-vap CSA count to adjust the global
  * counter so we can use this routine to form probe response
  * frames and get the current count.
  */
 static uint8_t *
 ieee80211_add_csa(uint8_t *frm, struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_csa_ie *csa = (struct ieee80211_csa_ie *) frm;
 
 	csa->csa_ie = IEEE80211_ELEMID_CSA;
 	csa->csa_len = 3;
 	csa->csa_mode = 1;		/* XXX force quiet on channel */
 	csa->csa_newchan = ieee80211_chan2ieee(ic, ic->ic_csa_newchan);
 	csa->csa_count = ic->ic_csa_count - vap->iv_csa_count;
 	return frm + sizeof(*csa);
 }
 
 /*
  * Add an 11h country information element to a frame.
  */
 static uint8_t *
 ieee80211_add_countryie(uint8_t *frm, struct ieee80211com *ic)
 {
 
 	if (ic->ic_countryie == NULL ||
 	    ic->ic_countryie_chan != ic->ic_bsschan) {
 		/*
 		 * Handle lazy construction of ie.  This is done on
 		 * first use and after a channel change that requires
 		 * re-calculation.
 		 */
 		if (ic->ic_countryie != NULL)
 			IEEE80211_FREE(ic->ic_countryie, M_80211_NODE_IE);
 		ic->ic_countryie = ieee80211_alloc_countryie(ic);
 		if (ic->ic_countryie == NULL)
 			return frm;
 		ic->ic_countryie_chan = ic->ic_bsschan;
 	}
 	return add_appie(frm, ic->ic_countryie);
 }
 
 uint8_t *
 ieee80211_add_wpa(uint8_t *frm, const struct ieee80211vap *vap)
 {
 	if (vap->iv_flags & IEEE80211_F_WPA1 && vap->iv_wpa_ie != NULL)
 		return (add_ie(frm, vap->iv_wpa_ie));
 	else {
 		/* XXX else complain? */
 		return (frm);
 	}
 }
 
 uint8_t *
 ieee80211_add_rsn(uint8_t *frm, const struct ieee80211vap *vap)
 {
 	if (vap->iv_flags & IEEE80211_F_WPA2 && vap->iv_rsn_ie != NULL)
 		return (add_ie(frm, vap->iv_rsn_ie));
 	else {
 		/* XXX else complain? */
 		return (frm);
 	}
 }
 
 uint8_t *
 ieee80211_add_qos(uint8_t *frm, const struct ieee80211_node *ni)
 {
 	if (ni->ni_flags & IEEE80211_NODE_QOS) {
 		*frm++ = IEEE80211_ELEMID_QOS;
 		*frm++ = 1;
 		*frm++ = 0;
 	}
 
 	return (frm);
 }
 
 /*
  * Send a probe request frame with the specified ssid
  * and any optional information element data.
  */
 int
 ieee80211_send_probereq(struct ieee80211_node *ni,
 	const uint8_t sa[IEEE80211_ADDR_LEN],
 	const uint8_t da[IEEE80211_ADDR_LEN],
 	const uint8_t bssid[IEEE80211_ADDR_LEN],
 	const uint8_t *ssid, size_t ssidlen)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_node *bss;
 	const struct ieee80211_txparam *tp;
 	struct ieee80211_bpf_params params;
 	const struct ieee80211_rateset *rs;
 	struct mbuf *m;
 	uint8_t *frm;
 	int ret;
 
 	bss = ieee80211_ref_node(vap->iv_bss);
 
 	if (vap->iv_state == IEEE80211_S_CAC) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT, ni,
 		    "block %s frame in CAC state", "probe request");
 		vap->iv_stats.is_tx_badstate++;
 		ieee80211_free_node(bss);
 		return EIO;		/* XXX */
 	}
 
 	/*
 	 * Hold a reference on the node so it doesn't go away until after
 	 * the xmit is complete all the way in the driver.  On error we
 	 * will remove our reference.
 	 */
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 		"ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n",
 		__func__, __LINE__,
 		ni, ether_sprintf(ni->ni_macaddr),
 		ieee80211_node_refcnt(ni)+1);
 	ieee80211_ref_node(ni);
 
 	/*
 	 * prreq frame format
 	 *	[tlv] ssid
 	 *	[tlv] supported rates
 	 *	[tlv] RSN (optional)
 	 *	[tlv] extended supported rates
 	 *	[tlv] HT cap (optional)
 	 *	[tlv] VHT cap (optional)
 	 *	[tlv] WPA (optional)
 	 *	[tlv] user-specified ie's
 	 */
 	m = ieee80211_getmgtframe(&frm,
 		 ic->ic_headroom + sizeof(struct ieee80211_frame),
 	       	 2 + IEEE80211_NWID_LEN
 	       + 2 + IEEE80211_RATE_SIZE
 	       + sizeof(struct ieee80211_ie_htcap)
 	       + sizeof(struct ieee80211_ie_vhtcap)
 	       + sizeof(struct ieee80211_ie_htinfo)	/* XXX not needed? */
 	       + sizeof(struct ieee80211_ie_wpa)
 	       + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE)
 	       + sizeof(struct ieee80211_ie_wpa)
 	       + (vap->iv_appie_probereq != NULL ?
 		   vap->iv_appie_probereq->ie_len : 0)
 	);
 	if (m == NULL) {
 		vap->iv_stats.is_tx_nobuf++;
 		ieee80211_free_node(ni);
 		ieee80211_free_node(bss);
 		return ENOMEM;
 	}
 
 	frm = ieee80211_add_ssid(frm, ssid, ssidlen);
 	rs = ieee80211_get_suprates(ic, ic->ic_curchan);
 	frm = ieee80211_add_rates(frm, rs);
 	frm = ieee80211_add_rsn(frm, vap);
 	frm = ieee80211_add_xrates(frm, rs);
 
 	/*
 	 * Note: we can't use bss; we don't have one yet.
 	 *
 	 * So, we should announce our capabilities
 	 * in this channel mode (2g/5g), not the
 	 * channel details itself.
 	 */
 	if ((vap->iv_opmode == IEEE80211_M_IBSS) &&
 	    (vap->iv_flags_ht & IEEE80211_FHT_HT)) {
 		struct ieee80211_channel *c;
 
 		/*
 		 * Get the HT channel that we should try upgrading to.
 		 * If we can do 40MHz then this'll upgrade it appropriately.
 		 */
 		c = ieee80211_ht_adjust_channel(ic, ic->ic_curchan,
 		    vap->iv_flags_ht);
 		frm = ieee80211_add_htcap_ch(frm, vap, c);
 	}
 
 	/*
 	 * XXX TODO: need to figure out what/how to update the
 	 * VHT channel.
 	 */
 #if 0
 	(vap->iv_flags_vht & IEEE80211_FVHT_VHT) {
 		struct ieee80211_channel *c;
 
 		c = ieee80211_ht_adjust_channel(ic, ic->ic_curchan,
 		    vap->iv_flags_ht);
 		c = ieee80211_vht_adjust_channel(ic, c, vap->iv_flags_vht);
 		frm = ieee80211_add_vhtcap_ch(frm, vap, c);
 	}
 #endif
 
 	frm = ieee80211_add_wpa(frm, vap);
 	if (vap->iv_appie_probereq != NULL)
 		frm = add_appie(frm, vap->iv_appie_probereq);
 	m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 
 	KASSERT(M_LEADINGSPACE(m) >= sizeof(struct ieee80211_frame),
 	    ("leading space %zd", M_LEADINGSPACE(m)));
 	M_PREPEND(m, sizeof(struct ieee80211_frame), M_NOWAIT);
 	if (m == NULL) {
 		/* NB: cannot happen */
 		ieee80211_free_node(ni);
 		ieee80211_free_node(bss);
 		return ENOMEM;
 	}
 
 	IEEE80211_TX_LOCK(ic);
 	ieee80211_send_setup(ni, m,
 	     IEEE80211_FC0_TYPE_MGT | IEEE80211_FC0_SUBTYPE_PROBE_REQ,
 	     IEEE80211_NONQOS_TID, sa, da, bssid);
 	/* XXX power management? */
 	m->m_flags |= M_ENCAP;		/* mark encapsulated */
 
 	M_WME_SETAC(m, WME_AC_BE);
 
 	IEEE80211_NODE_STAT(ni, tx_probereq);
 	IEEE80211_NODE_STAT(ni, tx_mgmt);
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_DEBUG | IEEE80211_MSG_DUMPPKTS,
 	    "send probe req on channel %u bssid %s sa %6D da %6D ssid \"%.*s\"\n",
 	    ieee80211_chan2ieee(ic, ic->ic_curchan),
 	    ether_sprintf(bssid),
 	    sa, ":",
 	    da, ":",
 	    ssidlen, ssid);
 
 	memset(&params, 0, sizeof(params));
 	params.ibp_pri = M_WME_GETAC(m);
 	tp = &vap->iv_txparms[ieee80211_chan2mode(ic->ic_curchan)];
 	params.ibp_rate0 = tp->mgmtrate;
 	if (IEEE80211_IS_MULTICAST(da)) {
 		params.ibp_flags |= IEEE80211_BPF_NOACK;
 		params.ibp_try0 = 1;
 	} else
 		params.ibp_try0 = tp->maxretry;
 	params.ibp_power = ni->ni_txpower;
 	ret = ieee80211_raw_output(vap, ni, m, &params);
 	IEEE80211_TX_UNLOCK(ic);
 	ieee80211_free_node(bss);
 	return (ret);
 }
 
 /*
  * Calculate capability information for mgt frames.
  */
 uint16_t
 ieee80211_getcapinfo(struct ieee80211vap *vap, struct ieee80211_channel *chan)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	uint16_t capinfo;
 
 	KASSERT(vap->iv_opmode != IEEE80211_M_STA, ("station mode"));
 
 	if (vap->iv_opmode == IEEE80211_M_HOSTAP)
 		capinfo = IEEE80211_CAPINFO_ESS;
 	else if (vap->iv_opmode == IEEE80211_M_IBSS)
 		capinfo = IEEE80211_CAPINFO_IBSS;
 	else
 		capinfo = 0;
 	if (vap->iv_flags & IEEE80211_F_PRIVACY)
 		capinfo |= IEEE80211_CAPINFO_PRIVACY;
 	if ((ic->ic_flags & IEEE80211_F_SHPREAMBLE) &&
 	    IEEE80211_IS_CHAN_2GHZ(chan))
 		capinfo |= IEEE80211_CAPINFO_SHORT_PREAMBLE;
 	if (ic->ic_flags & IEEE80211_F_SHSLOT)
 		capinfo |= IEEE80211_CAPINFO_SHORT_SLOTTIME;
 	if (IEEE80211_IS_CHAN_5GHZ(chan) && (vap->iv_flags & IEEE80211_F_DOTH))
 		capinfo |= IEEE80211_CAPINFO_SPECTRUM_MGMT;
 	return capinfo;
 }
 
 /*
  * Send a management frame.  The node is for the destination (or ic_bss
  * when in station mode).  Nodes other than ic_bss have their reference
  * count bumped to reflect our use for an indeterminant time.
  */
 int
 ieee80211_send_mgmt(struct ieee80211_node *ni, int type, int arg)
 {
 #define	HTFLAGS (IEEE80211_NODE_HT | IEEE80211_NODE_HTCOMPAT)
 #define	senderr(_x, _v)	do { vap->iv_stats._v++; ret = _x; goto bad; } while (0)
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_node *bss = vap->iv_bss;
 	struct ieee80211_bpf_params params;
 	struct mbuf *m;
 	uint8_t *frm;
 	uint16_t capinfo;
 	int has_challenge, is_shared_key, ret, status;
 
 	KASSERT(ni != NULL, ("null node"));
 
 	/*
 	 * Hold a reference on the node so it doesn't go away until after
 	 * the xmit is complete all the way in the driver.  On error we
 	 * will remove our reference.
 	 */
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 		"ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n",
 		__func__, __LINE__,
 		ni, ether_sprintf(ni->ni_macaddr),
 		ieee80211_node_refcnt(ni)+1);
 	ieee80211_ref_node(ni);
 
 	memset(&params, 0, sizeof(params));
 	switch (type) {
 
 	case IEEE80211_FC0_SUBTYPE_AUTH:
 		status = arg >> 16;
 		arg &= 0xffff;
 		has_challenge = ((arg == IEEE80211_AUTH_SHARED_CHALLENGE ||
 		    arg == IEEE80211_AUTH_SHARED_RESPONSE) &&
 		    ni->ni_challenge != NULL);
 
 		/*
 		 * Deduce whether we're doing open authentication or
 		 * shared key authentication.  We do the latter if
 		 * we're in the middle of a shared key authentication
 		 * handshake or if we're initiating an authentication
 		 * request and configured to use shared key.
 		 */
 		is_shared_key = has_challenge ||
 		     arg >= IEEE80211_AUTH_SHARED_RESPONSE ||
 		     (arg == IEEE80211_AUTH_SHARED_REQUEST &&
 		      bss->ni_authmode == IEEE80211_AUTH_SHARED);
 
 		m = ieee80211_getmgtframe(&frm,
 			  ic->ic_headroom + sizeof(struct ieee80211_frame),
 			  3 * sizeof(uint16_t)
 			+ (has_challenge && status == IEEE80211_STATUS_SUCCESS ?
 				sizeof(uint16_t)+IEEE80211_CHALLENGE_LEN : 0)
 		);
 		if (m == NULL)
 			senderr(ENOMEM, is_tx_nobuf);
 
 		((uint16_t *)frm)[0] =
 		    (is_shared_key) ? htole16(IEEE80211_AUTH_ALG_SHARED)
 		                    : htole16(IEEE80211_AUTH_ALG_OPEN);
 		((uint16_t *)frm)[1] = htole16(arg);	/* sequence number */
 		((uint16_t *)frm)[2] = htole16(status);/* status */
 
 		if (has_challenge && status == IEEE80211_STATUS_SUCCESS) {
 			((uint16_t *)frm)[3] =
 			    htole16((IEEE80211_CHALLENGE_LEN << 8) |
 			    IEEE80211_ELEMID_CHALLENGE);
 			memcpy(&((uint16_t *)frm)[4], ni->ni_challenge,
 			    IEEE80211_CHALLENGE_LEN);
 			m->m_pkthdr.len = m->m_len =
 				4 * sizeof(uint16_t) + IEEE80211_CHALLENGE_LEN;
 			if (arg == IEEE80211_AUTH_SHARED_RESPONSE) {
 				IEEE80211_NOTE(vap, IEEE80211_MSG_AUTH, ni,
 				    "request encrypt frame (%s)", __func__);
 				/* mark frame for encryption */
 				params.ibp_flags |= IEEE80211_BPF_CRYPTO;
 			}
 		} else
 			m->m_pkthdr.len = m->m_len = 3 * sizeof(uint16_t);
 
 		/* XXX not right for shared key */
 		if (status == IEEE80211_STATUS_SUCCESS)
 			IEEE80211_NODE_STAT(ni, tx_auth);
 		else
 			IEEE80211_NODE_STAT(ni, tx_auth_fail);
 
 		if (vap->iv_opmode == IEEE80211_M_STA)
 			ieee80211_add_callback(m, ieee80211_tx_mgt_cb,
 				(void *) vap->iv_state);
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_DEAUTH:
 		IEEE80211_NOTE(vap, IEEE80211_MSG_AUTH, ni,
 		    "send station deauthenticate (reason: %d (%s))", arg,
 		    ieee80211_reason_to_string(arg));
 		m = ieee80211_getmgtframe(&frm,
 			ic->ic_headroom + sizeof(struct ieee80211_frame),
 			sizeof(uint16_t));
 		if (m == NULL)
 			senderr(ENOMEM, is_tx_nobuf);
 		*(uint16_t *)frm = htole16(arg);	/* reason */
 		m->m_pkthdr.len = m->m_len = sizeof(uint16_t);
 
 		IEEE80211_NODE_STAT(ni, tx_deauth);
 		IEEE80211_NODE_STAT_SET(ni, tx_deauth_code, arg);
 
 		ieee80211_node_unauthorize(ni);		/* port closed */
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_ASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_REQ:
 		/*
 		 * asreq frame format
 		 *	[2] capability information
 		 *	[2] listen interval
 		 *	[6*] current AP address (reassoc only)
 		 *	[tlv] ssid
 		 *	[tlv] supported rates
 		 *	[tlv] extended supported rates
 		 *	[4] power capability (optional)
 		 *	[28] supported channels (optional)
 		 *	[tlv] HT capabilities
 		 *	[tlv] VHT capabilities
 		 *	[tlv] WME (optional)
 		 *	[tlv] Vendor OUI HT capabilities (optional)
 		 *	[tlv] Atheros capabilities (if negotiated)
 		 *	[tlv] AppIE's (optional)
 		 */
 		m = ieee80211_getmgtframe(&frm,
 			 ic->ic_headroom + sizeof(struct ieee80211_frame),
 			 sizeof(uint16_t)
 		       + sizeof(uint16_t)
 		       + IEEE80211_ADDR_LEN
 		       + 2 + IEEE80211_NWID_LEN
 		       + 2 + IEEE80211_RATE_SIZE
 		       + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE)
 		       + 4
 		       + 2 + 26
 		       + sizeof(struct ieee80211_wme_info)
 		       + sizeof(struct ieee80211_ie_htcap)
 		       + sizeof(struct ieee80211_ie_vhtcap)
 		       + 4 + sizeof(struct ieee80211_ie_htcap)
 #ifdef IEEE80211_SUPPORT_SUPERG
 		       + sizeof(struct ieee80211_ath_ie)
 #endif
 		       + (vap->iv_appie_wpa != NULL ?
 				vap->iv_appie_wpa->ie_len : 0)
 		       + (vap->iv_appie_assocreq != NULL ?
 				vap->iv_appie_assocreq->ie_len : 0)
 		);
 		if (m == NULL)
 			senderr(ENOMEM, is_tx_nobuf);
 
 		KASSERT(vap->iv_opmode == IEEE80211_M_STA,
 		    ("wrong mode %u", vap->iv_opmode));
 		capinfo = IEEE80211_CAPINFO_ESS;
 		if (vap->iv_flags & IEEE80211_F_PRIVACY)
 			capinfo |= IEEE80211_CAPINFO_PRIVACY;
 		/*
 		 * NB: Some 11a AP's reject the request when
 		 *     short preamble is set.
 		 */
 		if ((ic->ic_flags & IEEE80211_F_SHPREAMBLE) &&
 		    IEEE80211_IS_CHAN_2GHZ(ic->ic_curchan))
 			capinfo |= IEEE80211_CAPINFO_SHORT_PREAMBLE;
 		if (IEEE80211_IS_CHAN_ANYG(ic->ic_curchan) &&
 		    (ic->ic_caps & IEEE80211_C_SHSLOT))
 			capinfo |= IEEE80211_CAPINFO_SHORT_SLOTTIME;
 		if ((ni->ni_capinfo & IEEE80211_CAPINFO_SPECTRUM_MGMT) &&
 		    (vap->iv_flags & IEEE80211_F_DOTH))
 			capinfo |= IEEE80211_CAPINFO_SPECTRUM_MGMT;
 		*(uint16_t *)frm = htole16(capinfo);
 		frm += 2;
 
 		KASSERT(bss->ni_intval != 0, ("beacon interval is zero!"));
 		*(uint16_t *)frm = htole16(howmany(ic->ic_lintval,
 						    bss->ni_intval));
 		frm += 2;
 
 		if (type == IEEE80211_FC0_SUBTYPE_REASSOC_REQ) {
 			IEEE80211_ADDR_COPY(frm, bss->ni_bssid);
 			frm += IEEE80211_ADDR_LEN;
 		}
 
 		frm = ieee80211_add_ssid(frm, ni->ni_essid, ni->ni_esslen);
 		frm = ieee80211_add_rates(frm, &ni->ni_rates);
 		frm = ieee80211_add_rsn(frm, vap);
 		frm = ieee80211_add_xrates(frm, &ni->ni_rates);
 		if (capinfo & IEEE80211_CAPINFO_SPECTRUM_MGMT) {
 			frm = ieee80211_add_powercapability(frm,
 			    ic->ic_curchan);
 			frm = ieee80211_add_supportedchannels(frm, ic);
 		}
 
 		/*
 		 * Check the channel - we may be using an 11n NIC with an
 		 * 11n capable station, but we're configured to be an 11b
 		 * channel.
 		 */
 		if ((vap->iv_flags_ht & IEEE80211_FHT_HT) &&
 		    IEEE80211_IS_CHAN_HT(ni->ni_chan) &&
 		    ni->ni_ies.htcap_ie != NULL &&
 		    ni->ni_ies.htcap_ie[0] == IEEE80211_ELEMID_HTCAP) {
 			frm = ieee80211_add_htcap(frm, ni);
 		}
 
 		if ((vap->iv_flags_vht & IEEE80211_FVHT_VHT) &&
 		    IEEE80211_IS_CHAN_VHT(ni->ni_chan) &&
 		    ni->ni_ies.vhtcap_ie != NULL &&
 		    ni->ni_ies.vhtcap_ie[0] == IEEE80211_ELEMID_VHT_CAP) {
 			frm = ieee80211_add_vhtcap(frm, ni);
 		}
 
 		frm = ieee80211_add_wpa(frm, vap);
 		if ((ic->ic_flags & IEEE80211_F_WME) &&
 		    ni->ni_ies.wme_ie != NULL)
 			frm = ieee80211_add_wme_info(frm, &ic->ic_wme);
 
 		/*
 		 * Same deal - only send HT info if we're on an 11n
 		 * capable channel.
 		 */
 		if ((vap->iv_flags_ht & IEEE80211_FHT_HT) &&
 		    IEEE80211_IS_CHAN_HT(ni->ni_chan) &&
 		    ni->ni_ies.htcap_ie != NULL &&
 		    ni->ni_ies.htcap_ie[0] == IEEE80211_ELEMID_VENDOR) {
 			frm = ieee80211_add_htcap_vendor(frm, ni);
 		}
 #ifdef IEEE80211_SUPPORT_SUPERG
 		if (IEEE80211_ATH_CAP(vap, ni, IEEE80211_F_ATHEROS)) {
 			frm = ieee80211_add_ath(frm, 
 				IEEE80211_ATH_CAP(vap, ni, IEEE80211_F_ATHEROS),
 				((vap->iv_flags & IEEE80211_F_WPA) == 0 &&
 				 ni->ni_authmode != IEEE80211_AUTH_8021X) ?
 				vap->iv_def_txkey : IEEE80211_KEYIX_NONE);
 		}
 #endif /* IEEE80211_SUPPORT_SUPERG */
 		if (vap->iv_appie_assocreq != NULL)
 			frm = add_appie(frm, vap->iv_appie_assocreq);
 		m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 
 		ieee80211_add_callback(m, ieee80211_tx_mgt_cb,
 			(void *) vap->iv_state);
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_ASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_RESP:
 		/*
 		 * asresp frame format
 		 *	[2] capability information
 		 *	[2] status
 		 *	[2] association ID
 		 *	[tlv] supported rates
 		 *	[tlv] extended supported rates
 		 *	[tlv] HT capabilities (standard, if STA enabled)
 		 *	[tlv] HT information (standard, if STA enabled)
 		 *	[tlv] VHT capabilities (standard, if STA enabled)
 		 *	[tlv] VHT information (standard, if STA enabled)
 		 *	[tlv] WME (if configured and STA enabled)
 		 *	[tlv] HT capabilities (vendor OUI, if STA enabled)
 		 *	[tlv] HT information (vendor OUI, if STA enabled)
 		 *	[tlv] Atheros capabilities (if STA enabled)
 		 *	[tlv] AppIE's (optional)
 		 */
 		m = ieee80211_getmgtframe(&frm,
 			 ic->ic_headroom + sizeof(struct ieee80211_frame),
 			 sizeof(uint16_t)
 		       + sizeof(uint16_t)
 		       + sizeof(uint16_t)
 		       + 2 + IEEE80211_RATE_SIZE
 		       + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE)
 		       + sizeof(struct ieee80211_ie_htcap) + 4
 		       + sizeof(struct ieee80211_ie_htinfo) + 4
 		       + sizeof(struct ieee80211_ie_vhtcap)
 		       + sizeof(struct ieee80211_ie_vht_operation)
 		       + sizeof(struct ieee80211_wme_param)
 #ifdef IEEE80211_SUPPORT_SUPERG
 		       + sizeof(struct ieee80211_ath_ie)
 #endif
 		       + (vap->iv_appie_assocresp != NULL ?
 				vap->iv_appie_assocresp->ie_len : 0)
 		);
 		if (m == NULL)
 			senderr(ENOMEM, is_tx_nobuf);
 
 		capinfo = ieee80211_getcapinfo(vap, bss->ni_chan);
 		*(uint16_t *)frm = htole16(capinfo);
 		frm += 2;
 
 		*(uint16_t *)frm = htole16(arg);	/* status */
 		frm += 2;
 
 		if (arg == IEEE80211_STATUS_SUCCESS) {
 			*(uint16_t *)frm = htole16(ni->ni_associd);
 			IEEE80211_NODE_STAT(ni, tx_assoc);
 		} else
 			IEEE80211_NODE_STAT(ni, tx_assoc_fail);
 		frm += 2;
 
 		frm = ieee80211_add_rates(frm, &ni->ni_rates);
 		frm = ieee80211_add_xrates(frm, &ni->ni_rates);
 		/* NB: respond according to what we received */
 		if ((ni->ni_flags & HTFLAGS) == IEEE80211_NODE_HT) {
 			frm = ieee80211_add_htcap(frm, ni);
 			frm = ieee80211_add_htinfo(frm, ni);
 		}
 		if ((vap->iv_flags & IEEE80211_F_WME) &&
 		    ni->ni_ies.wme_ie != NULL)
 			frm = ieee80211_add_wme_param(frm, &ic->ic_wme);
 		if ((ni->ni_flags & HTFLAGS) == HTFLAGS) {
 			frm = ieee80211_add_htcap_vendor(frm, ni);
 			frm = ieee80211_add_htinfo_vendor(frm, ni);
 		}
 		if (ni->ni_flags & IEEE80211_NODE_VHT) {
 			frm = ieee80211_add_vhtcap(frm, ni);
 			frm = ieee80211_add_vhtinfo(frm, ni);
 		}
 #ifdef IEEE80211_SUPPORT_SUPERG
 		if (IEEE80211_ATH_CAP(vap, ni, IEEE80211_F_ATHEROS))
 			frm = ieee80211_add_ath(frm, 
 				IEEE80211_ATH_CAP(vap, ni, IEEE80211_F_ATHEROS),
 				((vap->iv_flags & IEEE80211_F_WPA) == 0 &&
 				 ni->ni_authmode != IEEE80211_AUTH_8021X) ?
 				vap->iv_def_txkey : IEEE80211_KEYIX_NONE);
 #endif /* IEEE80211_SUPPORT_SUPERG */
 		if (vap->iv_appie_assocresp != NULL)
 			frm = add_appie(frm, vap->iv_appie_assocresp);
 		m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_DISASSOC:
 		IEEE80211_NOTE(vap, IEEE80211_MSG_ASSOC, ni,
 		    "send station disassociate (reason: %d (%s))", arg,
 		    ieee80211_reason_to_string(arg));
 		m = ieee80211_getmgtframe(&frm,
 			ic->ic_headroom + sizeof(struct ieee80211_frame),
 			sizeof(uint16_t));
 		if (m == NULL)
 			senderr(ENOMEM, is_tx_nobuf);
 		*(uint16_t *)frm = htole16(arg);	/* reason */
 		m->m_pkthdr.len = m->m_len = sizeof(uint16_t);
 
 		IEEE80211_NODE_STAT(ni, tx_disassoc);
 		IEEE80211_NODE_STAT_SET(ni, tx_disassoc_code, arg);
 		break;
 
 	default:
 		IEEE80211_NOTE(vap, IEEE80211_MSG_ANY, ni,
 		    "invalid mgmt frame type %u", type);
 		senderr(EINVAL, is_tx_unknownmgt);
 		/* NOTREACHED */
 	}
 
 	/* NB: force non-ProbeResp frames to the highest queue */
 	params.ibp_pri = WME_AC_VO;
 	params.ibp_rate0 = bss->ni_txparms->mgmtrate;
 	/* NB: we know all frames are unicast */
 	params.ibp_try0 = bss->ni_txparms->maxretry;
 	params.ibp_power = bss->ni_txpower;
 	return ieee80211_mgmt_output(ni, m, type, &params);
 bad:
 	ieee80211_free_node(ni);
 	return ret;
 #undef senderr
 #undef HTFLAGS
 }
 
 /*
  * Return an mbuf with a probe response frame in it.
  * Space is left to prepend and 802.11 header at the
  * front but it's left to the caller to fill in.
  */
 struct mbuf *
 ieee80211_alloc_proberesp(struct ieee80211_node *bss, int legacy)
 {
 	struct ieee80211vap *vap = bss->ni_vap;
 	struct ieee80211com *ic = bss->ni_ic;
 	const struct ieee80211_rateset *rs;
 	struct mbuf *m;
 	uint16_t capinfo;
 	uint8_t *frm;
 
 	/*
 	 * probe response frame format
 	 *	[8] time stamp
 	 *	[2] beacon interval
 	 *	[2] cabability information
 	 *	[tlv] ssid
 	 *	[tlv] supported rates
 	 *	[tlv] parameter set (FH/DS)
 	 *	[tlv] parameter set (IBSS)
 	 *	[tlv] country (optional)
 	 *	[3] power control (optional)
 	 *	[5] channel switch announcement (CSA) (optional)
 	 *	[tlv] extended rate phy (ERP)
 	 *	[tlv] extended supported rates
 	 *	[tlv] RSN (optional)
 	 *	[tlv] HT capabilities
 	 *	[tlv] HT information
 	 *	[tlv] VHT capabilities
 	 *	[tlv] VHT information
 	 *	[tlv] WPA (optional)
 	 *	[tlv] WME (optional)
 	 *	[tlv] Vendor OUI HT capabilities (optional)
 	 *	[tlv] Vendor OUI HT information (optional)
 	 *	[tlv] Atheros capabilities
 	 *	[tlv] AppIE's (optional)
 	 *	[tlv] Mesh ID (MBSS)
 	 *	[tlv] Mesh Conf (MBSS)
 	 */
 	m = ieee80211_getmgtframe(&frm,
 		 ic->ic_headroom + sizeof(struct ieee80211_frame),
 		 8
 	       + sizeof(uint16_t)
 	       + sizeof(uint16_t)
 	       + 2 + IEEE80211_NWID_LEN
 	       + 2 + IEEE80211_RATE_SIZE
 	       + 7	/* max(7,3) */
 	       + IEEE80211_COUNTRY_MAX_SIZE
 	       + 3
 	       + sizeof(struct ieee80211_csa_ie)
 	       + sizeof(struct ieee80211_quiet_ie)
 	       + 3
 	       + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE)
 	       + sizeof(struct ieee80211_ie_wpa)
 	       + sizeof(struct ieee80211_ie_htcap)
 	       + sizeof(struct ieee80211_ie_htinfo)
 	       + sizeof(struct ieee80211_ie_wpa)
 	       + sizeof(struct ieee80211_wme_param)
 	       + 4 + sizeof(struct ieee80211_ie_htcap)
 	       + 4 + sizeof(struct ieee80211_ie_htinfo)
 	       +  sizeof(struct ieee80211_ie_vhtcap)
 	       +  sizeof(struct ieee80211_ie_vht_operation)
 #ifdef IEEE80211_SUPPORT_SUPERG
 	       + sizeof(struct ieee80211_ath_ie)
 #endif
 #ifdef IEEE80211_SUPPORT_MESH
 	       + 2 + IEEE80211_MESHID_LEN
 	       + sizeof(struct ieee80211_meshconf_ie)
 #endif
 	       + (vap->iv_appie_proberesp != NULL ?
 			vap->iv_appie_proberesp->ie_len : 0)
 	);
 	if (m == NULL) {
 		vap->iv_stats.is_tx_nobuf++;
 		return NULL;
 	}
 
 	memset(frm, 0, 8);	/* timestamp should be filled later */
 	frm += 8;
 	*(uint16_t *)frm = htole16(bss->ni_intval);
 	frm += 2;
 	capinfo = ieee80211_getcapinfo(vap, bss->ni_chan);
 	*(uint16_t *)frm = htole16(capinfo);
 	frm += 2;
 
 	frm = ieee80211_add_ssid(frm, bss->ni_essid, bss->ni_esslen);
 	rs = ieee80211_get_suprates(ic, bss->ni_chan);
 	frm = ieee80211_add_rates(frm, rs);
 
 	if (IEEE80211_IS_CHAN_FHSS(bss->ni_chan)) {
 		*frm++ = IEEE80211_ELEMID_FHPARMS;
 		*frm++ = 5;
 		*frm++ = bss->ni_fhdwell & 0x00ff;
 		*frm++ = (bss->ni_fhdwell >> 8) & 0x00ff;
 		*frm++ = IEEE80211_FH_CHANSET(
 		    ieee80211_chan2ieee(ic, bss->ni_chan));
 		*frm++ = IEEE80211_FH_CHANPAT(
 		    ieee80211_chan2ieee(ic, bss->ni_chan));
 		*frm++ = bss->ni_fhindex;
 	} else {
 		*frm++ = IEEE80211_ELEMID_DSPARMS;
 		*frm++ = 1;
 		*frm++ = ieee80211_chan2ieee(ic, bss->ni_chan);
 	}
 
 	if (vap->iv_opmode == IEEE80211_M_IBSS) {
 		*frm++ = IEEE80211_ELEMID_IBSSPARMS;
 		*frm++ = 2;
 		*frm++ = 0; *frm++ = 0;		/* TODO: ATIM window */
 	}
 	if ((vap->iv_flags & IEEE80211_F_DOTH) ||
 	    (vap->iv_flags_ext & IEEE80211_FEXT_DOTD))
 		frm = ieee80211_add_countryie(frm, ic);
 	if (vap->iv_flags & IEEE80211_F_DOTH) {
 		if (IEEE80211_IS_CHAN_5GHZ(bss->ni_chan))
 			frm = ieee80211_add_powerconstraint(frm, vap);
 		if (ic->ic_flags & IEEE80211_F_CSAPENDING)
 			frm = ieee80211_add_csa(frm, vap);
 	}
 	if (vap->iv_flags & IEEE80211_F_DOTH) {
 		if (IEEE80211_IS_CHAN_DFS(ic->ic_bsschan) &&
 		    (vap->iv_flags_ext & IEEE80211_FEXT_DFS)) {
 			if (vap->iv_quiet)
 				frm = ieee80211_add_quiet(frm, vap, 0);
 		}
 	}
 	if (IEEE80211_IS_CHAN_ANYG(bss->ni_chan))
 		frm = ieee80211_add_erp(frm, ic);
 	frm = ieee80211_add_xrates(frm, rs);
 	frm = ieee80211_add_rsn(frm, vap);
 	/*
 	 * NB: legacy 11b clients do not get certain ie's.
 	 *     The caller identifies such clients by passing
 	 *     a token in legacy to us.  Could expand this to be
 	 *     any legacy client for stuff like HT ie's.
 	 */
 	if (IEEE80211_IS_CHAN_HT(bss->ni_chan) &&
 	    legacy != IEEE80211_SEND_LEGACY_11B) {
 		frm = ieee80211_add_htcap(frm, bss);
 		frm = ieee80211_add_htinfo(frm, bss);
 	}
 	if (IEEE80211_IS_CHAN_VHT(bss->ni_chan) &&
 	    legacy != IEEE80211_SEND_LEGACY_11B) {
 		frm = ieee80211_add_vhtcap(frm, bss);
 		frm = ieee80211_add_vhtinfo(frm, bss);
 	}
 	frm = ieee80211_add_wpa(frm, vap);
 	if (vap->iv_flags & IEEE80211_F_WME)
 		frm = ieee80211_add_wme_param(frm, &ic->ic_wme);
 	if (IEEE80211_IS_CHAN_HT(bss->ni_chan) &&
 	    (vap->iv_flags_ht & IEEE80211_FHT_HTCOMPAT) &&
 	    legacy != IEEE80211_SEND_LEGACY_11B) {
 		frm = ieee80211_add_htcap_vendor(frm, bss);
 		frm = ieee80211_add_htinfo_vendor(frm, bss);
 	}
 #ifdef IEEE80211_SUPPORT_SUPERG
 	if ((vap->iv_flags & IEEE80211_F_ATHEROS) &&
 	    legacy != IEEE80211_SEND_LEGACY_11B)
 		frm = ieee80211_add_athcaps(frm, bss);
 #endif
 	if (vap->iv_appie_proberesp != NULL)
 		frm = add_appie(frm, vap->iv_appie_proberesp);
 #ifdef IEEE80211_SUPPORT_MESH
 	if (vap->iv_opmode == IEEE80211_M_MBSS) {
 		frm = ieee80211_add_meshid(frm, vap);
 		frm = ieee80211_add_meshconf(frm, vap);
 	}
 #endif
 	m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 
 	return m;
 }
 
 /*
  * Send a probe response frame to the specified mac address.
  * This does not go through the normal mgt frame api so we
  * can specify the destination address and re-use the bss node
  * for the sta reference.
  */
 int
 ieee80211_send_proberesp(struct ieee80211vap *vap,
 	const uint8_t da[IEEE80211_ADDR_LEN], int legacy)
 {
 	struct ieee80211_node *bss = vap->iv_bss;
 	struct ieee80211com *ic = vap->iv_ic;
 	struct mbuf *m;
 	int ret;
 
 	if (vap->iv_state == IEEE80211_S_CAC) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT, bss,
 		    "block %s frame in CAC state", "probe response");
 		vap->iv_stats.is_tx_badstate++;
 		return EIO;		/* XXX */
 	}
 
 	/*
 	 * Hold a reference on the node so it doesn't go away until after
 	 * the xmit is complete all the way in the driver.  On error we
 	 * will remove our reference.
 	 */
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 	    "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n",
 	    __func__, __LINE__, bss, ether_sprintf(bss->ni_macaddr),
 	    ieee80211_node_refcnt(bss)+1);
 	ieee80211_ref_node(bss);
 
 	m = ieee80211_alloc_proberesp(bss, legacy);
 	if (m == NULL) {
 		ieee80211_free_node(bss);
 		return ENOMEM;
 	}
 
 	M_PREPEND(m, sizeof(struct ieee80211_frame), M_NOWAIT);
 	KASSERT(m != NULL, ("no room for header"));
 
 	IEEE80211_TX_LOCK(ic);
 	ieee80211_send_setup(bss, m,
 	     IEEE80211_FC0_TYPE_MGT | IEEE80211_FC0_SUBTYPE_PROBE_RESP,
 	     IEEE80211_NONQOS_TID, vap->iv_myaddr, da, bss->ni_bssid);
 	/* XXX power management? */
 	m->m_flags |= M_ENCAP;		/* mark encapsulated */
 
 	M_WME_SETAC(m, WME_AC_BE);
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_DEBUG | IEEE80211_MSG_DUMPPKTS,
 	    "send probe resp on channel %u to %s%s\n",
 	    ieee80211_chan2ieee(ic, ic->ic_curchan), ether_sprintf(da),
 	    legacy ? " <legacy>" : "");
 	IEEE80211_NODE_STAT(bss, tx_mgmt);
 
 	ret = ieee80211_raw_output(vap, bss, m, NULL);
 	IEEE80211_TX_UNLOCK(ic);
 	return (ret);
 }
 
 /*
  * Allocate and build a RTS (Request To Send) control frame.
  */
 struct mbuf *
 ieee80211_alloc_rts(struct ieee80211com *ic,
 	const uint8_t ra[IEEE80211_ADDR_LEN],
 	const uint8_t ta[IEEE80211_ADDR_LEN],
 	uint16_t dur)
 {
 	struct ieee80211_frame_rts *rts;
 	struct mbuf *m;
 
 	/* XXX honor ic_headroom */
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m != NULL) {
 		rts = mtod(m, struct ieee80211_frame_rts *);
 		rts->i_fc[0] = IEEE80211_FC0_VERSION_0 |
 			IEEE80211_FC0_TYPE_CTL | IEEE80211_FC0_SUBTYPE_RTS;
 		rts->i_fc[1] = IEEE80211_FC1_DIR_NODS;
 		*(u_int16_t *)rts->i_dur = htole16(dur);
 		IEEE80211_ADDR_COPY(rts->i_ra, ra);
 		IEEE80211_ADDR_COPY(rts->i_ta, ta);
 
 		m->m_pkthdr.len = m->m_len = sizeof(struct ieee80211_frame_rts);
 	}
 	return m;
 }
 
 /*
  * Allocate and build a CTS (Clear To Send) control frame.
  */
 struct mbuf *
 ieee80211_alloc_cts(struct ieee80211com *ic,
 	const uint8_t ra[IEEE80211_ADDR_LEN], uint16_t dur)
 {
 	struct ieee80211_frame_cts *cts;
 	struct mbuf *m;
 
 	/* XXX honor ic_headroom */
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m != NULL) {
 		cts = mtod(m, struct ieee80211_frame_cts *);
 		cts->i_fc[0] = IEEE80211_FC0_VERSION_0 |
 			IEEE80211_FC0_TYPE_CTL | IEEE80211_FC0_SUBTYPE_CTS;
 		cts->i_fc[1] = IEEE80211_FC1_DIR_NODS;
 		*(u_int16_t *)cts->i_dur = htole16(dur);
 		IEEE80211_ADDR_COPY(cts->i_ra, ra);
 
 		m->m_pkthdr.len = m->m_len = sizeof(struct ieee80211_frame_cts);
 	}
 	return m;
 }
 
 /*
  * Wrapper for CTS/RTS frame allocation.
  */
 struct mbuf *
 ieee80211_alloc_prot(struct ieee80211_node *ni, const struct mbuf *m,
     uint8_t rate, int prot)
 {
 	struct ieee80211com *ic = ni->ni_ic;
 	const struct ieee80211_frame *wh;
 	struct mbuf *mprot;
 	uint16_t dur;
 	int pktlen, isshort;
 
 	KASSERT(prot == IEEE80211_PROT_RTSCTS ||
 	    prot == IEEE80211_PROT_CTSONLY,
 	    ("wrong protection type %d", prot));
 
 	wh = mtod(m, const struct ieee80211_frame *);
 	pktlen = m->m_pkthdr.len + IEEE80211_CRC_LEN;
 	isshort = (ic->ic_flags & IEEE80211_F_SHPREAMBLE) != 0;
 	dur = ieee80211_compute_duration(ic->ic_rt, pktlen, rate, isshort)
 	    + ieee80211_ack_duration(ic->ic_rt, rate, isshort);
 
 	if (prot == IEEE80211_PROT_RTSCTS) {
 		/* NB: CTS is the same size as an ACK */
 		dur += ieee80211_ack_duration(ic->ic_rt, rate, isshort);
 		mprot = ieee80211_alloc_rts(ic, wh->i_addr1, wh->i_addr2, dur);
 	} else
 		mprot = ieee80211_alloc_cts(ic, ni->ni_vap->iv_myaddr, dur);
 
 	return (mprot);
 }
 
 static void
 ieee80211_tx_mgt_timeout(void *arg)
 {
 	struct ieee80211vap *vap = arg;
 
 	IEEE80211_LOCK(vap->iv_ic);
 	if (vap->iv_state != IEEE80211_S_INIT &&
 	    (vap->iv_ic->ic_flags & IEEE80211_F_SCAN) == 0) {
 		/*
 		 * NB: it's safe to specify a timeout as the reason here;
 		 *     it'll only be used in the right state.
 		 */
 		ieee80211_new_state_locked(vap, IEEE80211_S_SCAN,
 			IEEE80211_SCAN_FAIL_TIMEOUT);
 	}
 	IEEE80211_UNLOCK(vap->iv_ic);
 }
 
 /*
  * This is the callback set on net80211-sourced transmitted
  * authentication request frames.
  *
  * This does a couple of things:
  *
  * + If the frame transmitted was a success, it schedules a future
  *   event which will transition the interface to scan.
  *   If a state transition _then_ occurs before that event occurs,
  *   said state transition will cancel this callout.
  *
  * + If the frame transmit was a failure, it immediately schedules
  *   the transition back to scan.
  */
 static void
 ieee80211_tx_mgt_cb(struct ieee80211_node *ni, void *arg, int status)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	enum ieee80211_state ostate = (enum ieee80211_state) arg;
 
 	/*
 	 * Frame transmit completed; arrange timer callback.  If
 	 * transmit was successfully we wait for response.  Otherwise
 	 * we arrange an immediate callback instead of doing the
 	 * callback directly since we don't know what state the driver
 	 * is in (e.g. what locks it is holding).  This work should
 	 * not be too time-critical and not happen too often so the
 	 * added overhead is acceptable.
 	 *
 	 * XXX what happens if !acked but response shows up before callback?
 	 */
 	if (vap->iv_state == ostate) {
 		callout_reset(&vap->iv_mgtsend,
 			status == 0 ? IEEE80211_TRANS_WAIT*hz : 0,
 			ieee80211_tx_mgt_timeout, vap);
 	}
 }
 
 static void
 ieee80211_beacon_construct(struct mbuf *m, uint8_t *frm,
 	struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_beacon_offsets *bo = &vap->iv_bcn_off;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_rateset *rs = &ni->ni_rates;
 	uint16_t capinfo;
 
 	/*
 	 * beacon frame format
 	 *
 	 * TODO: update to 802.11-2012; a lot of stuff has changed;
 	 * vendor extensions should be at the end, etc.
 	 *
 	 *	[8] time stamp
 	 *	[2] beacon interval
 	 *	[2] cabability information
 	 *	[tlv] ssid
 	 *	[tlv] supported rates
 	 *	[3] parameter set (DS)
 	 *	[8] CF parameter set (optional)
 	 *	[tlv] parameter set (IBSS/TIM)
 	 *	[tlv] country (optional)
 	 *	[3] power control (optional)
 	 *	[5] channel switch announcement (CSA) (optional)
 	 * XXX TODO: Quiet
 	 * XXX TODO: IBSS DFS
 	 * XXX TODO: TPC report
 	 *	[tlv] extended rate phy (ERP)
 	 *	[tlv] extended supported rates
 	 *	[tlv] RSN parameters
 	 * XXX TODO: BSSLOAD
 	 * (XXX EDCA parameter set, QoS capability?)
 	 * XXX TODO: AP channel report
 	 *
 	 *	[tlv] HT capabilities
 	 *	[tlv] HT information
 	 *	XXX TODO: 20/40 BSS coexistence
 	 * Mesh:
 	 * XXX TODO: Meshid
 	 * XXX TODO: mesh config
 	 * XXX TODO: mesh awake window
 	 * XXX TODO: beacon timing (mesh, etc)
 	 * XXX TODO: MCCAOP Advertisement Overview
 	 * XXX TODO: MCCAOP Advertisement
 	 * XXX TODO: Mesh channel switch parameters
 	 * VHT:
 	 * XXX TODO: VHT capabilities
 	 * XXX TODO: VHT operation
 	 * XXX TODO: VHT transmit power envelope
 	 * XXX TODO: channel switch wrapper element
 	 * XXX TODO: extended BSS load element
 	 *
 	 * XXX Vendor-specific OIDs (e.g. Atheros)
 	 *	[tlv] WPA parameters
 	 *	[tlv] WME parameters
 	 *	[tlv] Vendor OUI HT capabilities (optional)
 	 *	[tlv] Vendor OUI HT information (optional)
 	 *	[tlv] Atheros capabilities (optional)
 	 *	[tlv] TDMA parameters (optional)
 	 *	[tlv] Mesh ID (MBSS)
 	 *	[tlv] Mesh Conf (MBSS)
 	 *	[tlv] application data (optional)
 	 */
 
 	memset(bo, 0, sizeof(*bo));
 
 	memset(frm, 0, 8);	/* XXX timestamp is set by hardware/driver */
 	frm += 8;
 	*(uint16_t *)frm = htole16(ni->ni_intval);
 	frm += 2;
 	capinfo = ieee80211_getcapinfo(vap, ni->ni_chan);
 	bo->bo_caps = (uint16_t *)frm;
 	*(uint16_t *)frm = htole16(capinfo);
 	frm += 2;
 	*frm++ = IEEE80211_ELEMID_SSID;
 	if ((vap->iv_flags & IEEE80211_F_HIDESSID) == 0) {
 		*frm++ = ni->ni_esslen;
 		memcpy(frm, ni->ni_essid, ni->ni_esslen);
 		frm += ni->ni_esslen;
 	} else
 		*frm++ = 0;
 	frm = ieee80211_add_rates(frm, rs);
 	if (!IEEE80211_IS_CHAN_FHSS(ni->ni_chan)) {
 		*frm++ = IEEE80211_ELEMID_DSPARMS;
 		*frm++ = 1;
 		*frm++ = ieee80211_chan2ieee(ic, ni->ni_chan);
 	}
 	if (ic->ic_flags & IEEE80211_F_PCF) {
 		bo->bo_cfp = frm;
 		frm = ieee80211_add_cfparms(frm, ic);
 	}
 	bo->bo_tim = frm;
 	if (vap->iv_opmode == IEEE80211_M_IBSS) {
 		*frm++ = IEEE80211_ELEMID_IBSSPARMS;
 		*frm++ = 2;
 		*frm++ = 0; *frm++ = 0;		/* TODO: ATIM window */
 		bo->bo_tim_len = 0;
 	} else if (vap->iv_opmode == IEEE80211_M_HOSTAP ||
 	    vap->iv_opmode == IEEE80211_M_MBSS) {
 		/* TIM IE is the same for Mesh and Hostap */
 		struct ieee80211_tim_ie *tie = (struct ieee80211_tim_ie *) frm;
 
 		tie->tim_ie = IEEE80211_ELEMID_TIM;
 		tie->tim_len = 4;	/* length */
 		tie->tim_count = 0;	/* DTIM count */ 
 		tie->tim_period = vap->iv_dtim_period;	/* DTIM period */
 		tie->tim_bitctl = 0;	/* bitmap control */
 		tie->tim_bitmap[0] = 0;	/* Partial Virtual Bitmap */
 		frm += sizeof(struct ieee80211_tim_ie);
 		bo->bo_tim_len = 1;
 	}
 	bo->bo_tim_trailer = frm;
 	if ((vap->iv_flags & IEEE80211_F_DOTH) ||
 	    (vap->iv_flags_ext & IEEE80211_FEXT_DOTD))
 		frm = ieee80211_add_countryie(frm, ic);
 	if (vap->iv_flags & IEEE80211_F_DOTH) {
 		if (IEEE80211_IS_CHAN_5GHZ(ni->ni_chan))
 			frm = ieee80211_add_powerconstraint(frm, vap);
 		bo->bo_csa = frm;
 		if (ic->ic_flags & IEEE80211_F_CSAPENDING)
 			frm = ieee80211_add_csa(frm, vap);	
 	} else
 		bo->bo_csa = frm;
 
 	bo->bo_quiet = NULL;
 	if (vap->iv_flags & IEEE80211_F_DOTH) {
 		if (IEEE80211_IS_CHAN_DFS(ic->ic_bsschan) &&
 		    (vap->iv_flags_ext & IEEE80211_FEXT_DFS) &&
 		    (vap->iv_quiet == 1)) {
 			/*
 			 * We only insert the quiet IE offset if
 			 * the quiet IE is enabled.  Otherwise don't
 			 * put it here or we'll just overwrite
 			 * some other beacon contents.
 			 */
 			if (vap->iv_quiet) {
 				bo->bo_quiet = frm;
 				frm = ieee80211_add_quiet(frm,vap, 0);
 			}
 		}
 	}
 
 	if (IEEE80211_IS_CHAN_ANYG(ni->ni_chan)) {
 		bo->bo_erp = frm;
 		frm = ieee80211_add_erp(frm, ic);
 	}
 	frm = ieee80211_add_xrates(frm, rs);
 	frm = ieee80211_add_rsn(frm, vap);
 	if (IEEE80211_IS_CHAN_HT(ni->ni_chan)) {
 		frm = ieee80211_add_htcap(frm, ni);
 		bo->bo_htinfo = frm;
 		frm = ieee80211_add_htinfo(frm, ni);
 	}
 
 	if (IEEE80211_IS_CHAN_VHT(ni->ni_chan)) {
 		frm = ieee80211_add_vhtcap(frm, ni);
 		bo->bo_vhtinfo = frm;
 		frm = ieee80211_add_vhtinfo(frm, ni);
 		/* Transmit power envelope */
 		/* Channel switch wrapper element */
 		/* Extended bss load element */
 	}
 
 	frm = ieee80211_add_wpa(frm, vap);
 	if (vap->iv_flags & IEEE80211_F_WME) {
 		bo->bo_wme = frm;
 		frm = ieee80211_add_wme_param(frm, &ic->ic_wme);
 	}
 	if (IEEE80211_IS_CHAN_HT(ni->ni_chan) &&
 	    (vap->iv_flags_ht & IEEE80211_FHT_HTCOMPAT)) {
 		frm = ieee80211_add_htcap_vendor(frm, ni);
 		frm = ieee80211_add_htinfo_vendor(frm, ni);
 	}
 
 #ifdef IEEE80211_SUPPORT_SUPERG
 	if (vap->iv_flags & IEEE80211_F_ATHEROS) {
 		bo->bo_ath = frm;
 		frm = ieee80211_add_athcaps(frm, ni);
 	}
 #endif
 #ifdef IEEE80211_SUPPORT_TDMA
 	if (vap->iv_caps & IEEE80211_C_TDMA) {
 		bo->bo_tdma = frm;
 		frm = ieee80211_add_tdma(frm, vap);
 	}
 #endif
 	if (vap->iv_appie_beacon != NULL) {
 		bo->bo_appie = frm;
 		bo->bo_appie_len = vap->iv_appie_beacon->ie_len;
 		frm = add_appie(frm, vap->iv_appie_beacon);
 	}
 
 	/* XXX TODO: move meshid/meshconf up to before vendor extensions? */
 #ifdef IEEE80211_SUPPORT_MESH
 	if (vap->iv_opmode == IEEE80211_M_MBSS) {
 		frm = ieee80211_add_meshid(frm, vap);
 		bo->bo_meshconf = frm;
 		frm = ieee80211_add_meshconf(frm, vap);
 	}
 #endif
 	bo->bo_tim_trailer_len = frm - bo->bo_tim_trailer;
 	bo->bo_csa_trailer_len = frm - bo->bo_csa;
 	m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 }
 
 /*
  * Allocate a beacon frame and fillin the appropriate bits.
  */
 struct mbuf *
 ieee80211_beacon_alloc(struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211_frame *wh;
 	struct mbuf *m;
 	int pktlen;
 	uint8_t *frm;
 
 	/*
 	 * Update the "We're putting the quiet IE in the beacon" state.
 	 */
 	if (vap->iv_quiet == 1)
 		vap->iv_flags_ext |= IEEE80211_FEXT_QUIET_IE;
 	else if (vap->iv_quiet == 0)
 		vap->iv_flags_ext &= ~IEEE80211_FEXT_QUIET_IE;
 
 	/*
 	 * beacon frame format
 	 *
 	 * Note: This needs updating for 802.11-2012.
 	 *
 	 *	[8] time stamp
 	 *	[2] beacon interval
 	 *	[2] cabability information
 	 *	[tlv] ssid
 	 *	[tlv] supported rates
 	 *	[3] parameter set (DS)
 	 *	[8] CF parameter set (optional)
 	 *	[tlv] parameter set (IBSS/TIM)
 	 *	[tlv] country (optional)
 	 *	[3] power control (optional)
 	 *	[5] channel switch announcement (CSA) (optional)
 	 *	[tlv] extended rate phy (ERP)
 	 *	[tlv] extended supported rates
 	 *	[tlv] RSN parameters
 	 *	[tlv] HT capabilities
 	 *	[tlv] HT information
 	 *	[tlv] VHT capabilities
 	 *	[tlv] VHT operation
 	 *	[tlv] Vendor OUI HT capabilities (optional)
 	 *	[tlv] Vendor OUI HT information (optional)
 	 * XXX Vendor-specific OIDs (e.g. Atheros)
 	 *	[tlv] WPA parameters
 	 *	[tlv] WME parameters
 	 *	[tlv] TDMA parameters (optional)
 	 *	[tlv] Mesh ID (MBSS)
 	 *	[tlv] Mesh Conf (MBSS)
 	 *	[tlv] application data (optional)
 	 * NB: we allocate the max space required for the TIM bitmap.
 	 * XXX how big is this?
 	 */
 	pktlen =   8					/* time stamp */
 		 + sizeof(uint16_t)			/* beacon interval */
 		 + sizeof(uint16_t)			/* capabilities */
 		 + 2 + ni->ni_esslen			/* ssid */
 	         + 2 + IEEE80211_RATE_SIZE		/* supported rates */
 	         + 2 + 1				/* DS parameters */
 		 + 2 + 6				/* CF parameters */
 		 + 2 + 4 + vap->iv_tim_len		/* DTIM/IBSSPARMS */
 		 + IEEE80211_COUNTRY_MAX_SIZE		/* country */
 		 + 2 + 1				/* power control */
 		 + sizeof(struct ieee80211_csa_ie)	/* CSA */
 		 + sizeof(struct ieee80211_quiet_ie)	/* Quiet */
 		 + 2 + 1				/* ERP */
 	         + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE)
 		 + (vap->iv_caps & IEEE80211_C_WPA ?	/* WPA 1+2 */
 			2*sizeof(struct ieee80211_ie_wpa) : 0)
 		 /* XXX conditional? */
 		 + 4+2*sizeof(struct ieee80211_ie_htcap)/* HT caps */
 		 + 4+2*sizeof(struct ieee80211_ie_htinfo)/* HT info */
 		 + sizeof(struct ieee80211_ie_vhtcap)/* VHT caps */
 		 + sizeof(struct ieee80211_ie_vht_operation)/* VHT info */
 		 + (vap->iv_caps & IEEE80211_C_WME ?	/* WME */
 			sizeof(struct ieee80211_wme_param) : 0)
 #ifdef IEEE80211_SUPPORT_SUPERG
 		 + sizeof(struct ieee80211_ath_ie)	/* ATH */
 #endif
 #ifdef IEEE80211_SUPPORT_TDMA
 		 + (vap->iv_caps & IEEE80211_C_TDMA ?	/* TDMA */
 			sizeof(struct ieee80211_tdma_param) : 0)
 #endif
 #ifdef IEEE80211_SUPPORT_MESH
 		 + 2 + ni->ni_meshidlen
 		 + sizeof(struct ieee80211_meshconf_ie)
 #endif
 		 + IEEE80211_MAX_APPIE
 		 ;
 	m = ieee80211_getmgtframe(&frm,
 		ic->ic_headroom + sizeof(struct ieee80211_frame), pktlen);
 	if (m == NULL) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_ANY,
 			"%s: cannot get buf; size %u\n", __func__, pktlen);
 		vap->iv_stats.is_tx_nobuf++;
 		return NULL;
 	}
 	ieee80211_beacon_construct(m, frm, ni);
 
 	M_PREPEND(m, sizeof(struct ieee80211_frame), M_NOWAIT);
 	KASSERT(m != NULL, ("no space for 802.11 header?"));
 	wh = mtod(m, struct ieee80211_frame *);
 	wh->i_fc[0] = IEEE80211_FC0_VERSION_0 | IEEE80211_FC0_TYPE_MGT |
 	    IEEE80211_FC0_SUBTYPE_BEACON;
 	wh->i_fc[1] = IEEE80211_FC1_DIR_NODS;
 	*(uint16_t *)wh->i_dur = 0;
 	IEEE80211_ADDR_COPY(wh->i_addr1, ifp->if_broadcastaddr);
 	IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr);
 	IEEE80211_ADDR_COPY(wh->i_addr3, ni->ni_bssid);
 	*(uint16_t *)wh->i_seq = 0;
 
 	return m;
 }
 
 /*
  * Update the dynamic parts of a beacon frame based on the current state.
  */
 int
 ieee80211_beacon_update(struct ieee80211_node *ni, struct mbuf *m, int mcast)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_beacon_offsets *bo = &vap->iv_bcn_off;
 	struct ieee80211com *ic = ni->ni_ic;
 	int len_changed = 0;
 	uint16_t capinfo;
 	struct ieee80211_frame *wh;
 	ieee80211_seq seqno;
 
 	IEEE80211_LOCK(ic);
 	/*
 	 * Handle 11h channel change when we've reached the count.
 	 * We must recalculate the beacon frame contents to account
 	 * for the new channel.  Note we do this only for the first
 	 * vap that reaches this point; subsequent vaps just update
 	 * their beacon state to reflect the recalculated channel.
 	 */
 	if (isset(bo->bo_flags, IEEE80211_BEACON_CSA) &&
 	    vap->iv_csa_count == ic->ic_csa_count) {
 		vap->iv_csa_count = 0;
 		/*
 		 * Effect channel change before reconstructing the beacon
 		 * frame contents as many places reference ni_chan.
 		 */
 		if (ic->ic_csa_newchan != NULL)
 			ieee80211_csa_completeswitch(ic);
 		/*
 		 * NB: ieee80211_beacon_construct clears all pending
 		 * updates in bo_flags so we don't need to explicitly
 		 * clear IEEE80211_BEACON_CSA.
 		 */
 		ieee80211_beacon_construct(m,
 		    mtod(m, uint8_t*) + sizeof(struct ieee80211_frame), ni);
 
 		/* XXX do WME aggressive mode processing? */
 		IEEE80211_UNLOCK(ic);
 		return 1;		/* just assume length changed */
 	}
 
 	/*
 	 * Handle the quiet time element being added and removed.
 	 * Again, for now we just cheat and reconstruct the whole
 	 * beacon - that way the gap is provided as appropriate.
 	 *
 	 * So, track whether we have already added the IE versus
 	 * whether we want to be adding the IE.
 	 */
 	if ((vap->iv_flags_ext & IEEE80211_FEXT_QUIET_IE) &&
 	    (vap->iv_quiet == 0)) {
 		/*
 		 * Quiet time beacon IE enabled, but it's disabled;
 		 * recalc
 		 */
 		vap->iv_flags_ext &= ~IEEE80211_FEXT_QUIET_IE;
 		ieee80211_beacon_construct(m,
 		    mtod(m, uint8_t*) + sizeof(struct ieee80211_frame), ni);
 		/* XXX do WME aggressive mode processing? */
 		IEEE80211_UNLOCK(ic);
 		return 1;		/* just assume length changed */
 	}
 
 	if (((vap->iv_flags_ext & IEEE80211_FEXT_QUIET_IE) == 0) &&
 	    (vap->iv_quiet == 1)) {
 		/*
 		 * Quiet time beacon IE disabled, but it's now enabled;
 		 * recalc
 		 */
 		vap->iv_flags_ext |= IEEE80211_FEXT_QUIET_IE;
 		ieee80211_beacon_construct(m,
 		    mtod(m, uint8_t*) + sizeof(struct ieee80211_frame), ni);
 		/* XXX do WME aggressive mode processing? */
 		IEEE80211_UNLOCK(ic);
 		return 1;		/* just assume length changed */
 	}
 
 	wh = mtod(m, struct ieee80211_frame *);
 
 	/*
 	 * XXX TODO Strictly speaking this should be incremented with the TX
 	 * lock held so as to serialise access to the non-qos TID sequence
 	 * number space.
 	 *
 	 * If the driver identifies it does its own TX seqno management then
 	 * we can skip this (and still not do the TX seqno.)
 	 */
 	seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++;
 	*(uint16_t *)&wh->i_seq[0] =
 		htole16(seqno << IEEE80211_SEQ_SEQ_SHIFT);
 	M_SEQNO_SET(m, seqno);
 
 	/* XXX faster to recalculate entirely or just changes? */
 	capinfo = ieee80211_getcapinfo(vap, ni->ni_chan);
 	*bo->bo_caps = htole16(capinfo);
 
 	if (vap->iv_flags & IEEE80211_F_WME) {
 		struct ieee80211_wme_state *wme = &ic->ic_wme;
 
 		/*
 		 * Check for aggressive mode change.  When there is
 		 * significant high priority traffic in the BSS
 		 * throttle back BE traffic by using conservative
 		 * parameters.  Otherwise BE uses aggressive params
 		 * to optimize performance of legacy/non-QoS traffic.
 		 */
 		if (wme->wme_flags & WME_F_AGGRMODE) {
 			if (wme->wme_hipri_traffic >
 			    wme->wme_hipri_switch_thresh) {
 				IEEE80211_DPRINTF(vap, IEEE80211_MSG_WME,
 				    "%s: traffic %u, disable aggressive mode\n",
 				    __func__, wme->wme_hipri_traffic);
 				wme->wme_flags &= ~WME_F_AGGRMODE;
 				ieee80211_wme_updateparams_locked(vap);
 				wme->wme_hipri_traffic =
 					wme->wme_hipri_switch_hysteresis;
 			} else
 				wme->wme_hipri_traffic = 0;
 		} else {
 			if (wme->wme_hipri_traffic <=
 			    wme->wme_hipri_switch_thresh) {
 				IEEE80211_DPRINTF(vap, IEEE80211_MSG_WME,
 				    "%s: traffic %u, enable aggressive mode\n",
 				    __func__, wme->wme_hipri_traffic);
 				wme->wme_flags |= WME_F_AGGRMODE;
 				ieee80211_wme_updateparams_locked(vap);
 				wme->wme_hipri_traffic = 0;
 			} else
 				wme->wme_hipri_traffic =
 					wme->wme_hipri_switch_hysteresis;
 		}
 		if (isset(bo->bo_flags, IEEE80211_BEACON_WME)) {
 			(void) ieee80211_add_wme_param(bo->bo_wme, wme);
 			clrbit(bo->bo_flags, IEEE80211_BEACON_WME);
 		}
 	}
 
 	if (isset(bo->bo_flags,  IEEE80211_BEACON_HTINFO)) {
 		ieee80211_ht_update_beacon(vap, bo);
 		clrbit(bo->bo_flags, IEEE80211_BEACON_HTINFO);
 	}
 #ifdef IEEE80211_SUPPORT_TDMA
 	if (vap->iv_caps & IEEE80211_C_TDMA) {
 		/*
 		 * NB: the beacon is potentially updated every TBTT.
 		 */
 		ieee80211_tdma_update_beacon(vap, bo);
 	}
 #endif
 #ifdef IEEE80211_SUPPORT_MESH
 	if (vap->iv_opmode == IEEE80211_M_MBSS)
 		ieee80211_mesh_update_beacon(vap, bo);
 #endif
 
 	if (vap->iv_opmode == IEEE80211_M_HOSTAP ||
 	    vap->iv_opmode == IEEE80211_M_MBSS) {	/* NB: no IBSS support*/
 		struct ieee80211_tim_ie *tie =
 			(struct ieee80211_tim_ie *) bo->bo_tim;
 		if (isset(bo->bo_flags, IEEE80211_BEACON_TIM)) {
 			u_int timlen, timoff, i;
 			/* 
 			 * ATIM/DTIM needs updating.  If it fits in the
 			 * current space allocated then just copy in the
 			 * new bits.  Otherwise we need to move any trailing
 			 * data to make room.  Note that we know there is
 			 * contiguous space because ieee80211_beacon_allocate
 			 * insures there is space in the mbuf to write a
 			 * maximal-size virtual bitmap (based on iv_max_aid).
 			 */
 			/*
 			 * Calculate the bitmap size and offset, copy any
 			 * trailer out of the way, and then copy in the
 			 * new bitmap and update the information element.
 			 * Note that the tim bitmap must contain at least
 			 * one byte and any offset must be even.
 			 */
 			if (vap->iv_ps_pending != 0) {
 				timoff = 128;		/* impossibly large */
 				for (i = 0; i < vap->iv_tim_len; i++)
 					if (vap->iv_tim_bitmap[i]) {
 						timoff = i &~ 1;
 						break;
 					}
 				KASSERT(timoff != 128, ("tim bitmap empty!"));
 				for (i = vap->iv_tim_len-1; i >= timoff; i--)
 					if (vap->iv_tim_bitmap[i])
 						break;
 				timlen = 1 + (i - timoff);
 			} else {
 				timoff = 0;
 				timlen = 1;
 			}
 
 			/*
 			 * TODO: validate this!
 			 */
 			if (timlen != bo->bo_tim_len) {
 				/* copy up/down trailer */
 				int adjust = tie->tim_bitmap+timlen
 					   - bo->bo_tim_trailer;
 				ovbcopy(bo->bo_tim_trailer,
 				    bo->bo_tim_trailer+adjust,
 				    bo->bo_tim_trailer_len);
 				bo->bo_tim_trailer += adjust;
 				bo->bo_erp += adjust;
 				bo->bo_htinfo += adjust;
 				bo->bo_vhtinfo += adjust;
 #ifdef IEEE80211_SUPPORT_SUPERG
 				bo->bo_ath += adjust;
 #endif
 #ifdef IEEE80211_SUPPORT_TDMA
 				bo->bo_tdma += adjust;
 #endif
 #ifdef IEEE80211_SUPPORT_MESH
 				bo->bo_meshconf += adjust;
 #endif
 				bo->bo_appie += adjust;
 				bo->bo_wme += adjust;
 				bo->bo_csa += adjust;
 				bo->bo_quiet += adjust;
 				bo->bo_tim_len = timlen;
 
 				/* update information element */
 				tie->tim_len = 3 + timlen;
 				tie->tim_bitctl = timoff;
 				len_changed = 1;
 			}
 			memcpy(tie->tim_bitmap, vap->iv_tim_bitmap + timoff,
 				bo->bo_tim_len);
 
 			clrbit(bo->bo_flags, IEEE80211_BEACON_TIM);
 
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_POWER,
 				"%s: TIM updated, pending %u, off %u, len %u\n",
 				__func__, vap->iv_ps_pending, timoff, timlen);
 		}
 		/* count down DTIM period */
 		if (tie->tim_count == 0)
 			tie->tim_count = tie->tim_period - 1;
 		else
 			tie->tim_count--;
 		/* update state for buffered multicast frames on DTIM */
 		if (mcast && tie->tim_count == 0)
 			tie->tim_bitctl |= 1;
 		else
 			tie->tim_bitctl &= ~1;
 		if (isset(bo->bo_flags, IEEE80211_BEACON_CSA)) {
 			struct ieee80211_csa_ie *csa =
 			    (struct ieee80211_csa_ie *) bo->bo_csa;
 
 			/*
 			 * Insert or update CSA ie.  If we're just starting
 			 * to count down to the channel switch then we need
 			 * to insert the CSA ie.  Otherwise we just need to
 			 * drop the count.  The actual change happens above
 			 * when the vap's count reaches the target count.
 			 */
 			if (vap->iv_csa_count == 0) {
 				memmove(&csa[1], csa, bo->bo_csa_trailer_len);
 				bo->bo_erp += sizeof(*csa);
 				bo->bo_htinfo += sizeof(*csa);
 				bo->bo_vhtinfo += sizeof(*csa);
 				bo->bo_wme += sizeof(*csa);
 #ifdef IEEE80211_SUPPORT_SUPERG
 				bo->bo_ath += sizeof(*csa);
 #endif
 #ifdef IEEE80211_SUPPORT_TDMA
 				bo->bo_tdma += sizeof(*csa);
 #endif
 #ifdef IEEE80211_SUPPORT_MESH
 				bo->bo_meshconf += sizeof(*csa);
 #endif
 				bo->bo_appie += sizeof(*csa);
 				bo->bo_csa_trailer_len += sizeof(*csa);
 				bo->bo_quiet += sizeof(*csa);
 				bo->bo_tim_trailer_len += sizeof(*csa);
 				m->m_len += sizeof(*csa);
 				m->m_pkthdr.len += sizeof(*csa);
 
 				ieee80211_add_csa(bo->bo_csa, vap);
 			} else
 				csa->csa_count--;
 			vap->iv_csa_count++;
 			/* NB: don't clear IEEE80211_BEACON_CSA */
 		}
 
 		/*
 		 * Only add the quiet time IE if we've enabled it
 		 * as appropriate.
 		 */
 		if (IEEE80211_IS_CHAN_DFS(ic->ic_bsschan) &&
 		    (vap->iv_flags_ext & IEEE80211_FEXT_DFS)) {
 			if (vap->iv_quiet &&
 			    (vap->iv_flags_ext & IEEE80211_FEXT_QUIET_IE)) {
 				ieee80211_add_quiet(bo->bo_quiet, vap, 1);
 			}
 		}
 		if (isset(bo->bo_flags, IEEE80211_BEACON_ERP)) {
 			/*
 			 * ERP element needs updating.
 			 */
 			(void) ieee80211_add_erp(bo->bo_erp, ic);
 			clrbit(bo->bo_flags, IEEE80211_BEACON_ERP);
 		}
 #ifdef IEEE80211_SUPPORT_SUPERG
 		if (isset(bo->bo_flags,  IEEE80211_BEACON_ATH)) {
 			ieee80211_add_athcaps(bo->bo_ath, ni);
 			clrbit(bo->bo_flags, IEEE80211_BEACON_ATH);
 		}
 #endif
 	}
 	if (isset(bo->bo_flags, IEEE80211_BEACON_APPIE)) {
 		const struct ieee80211_appie *aie = vap->iv_appie_beacon;
 		int aielen;
 		uint8_t *frm;
 
 		aielen = 0;
 		if (aie != NULL)
 			aielen += aie->ie_len;
 		if (aielen != bo->bo_appie_len) {
 			/* copy up/down trailer */
 			int adjust = aielen - bo->bo_appie_len;
 			ovbcopy(bo->bo_tim_trailer, bo->bo_tim_trailer+adjust,
 				bo->bo_tim_trailer_len);
 			bo->bo_tim_trailer += adjust;
 			bo->bo_appie += adjust;
 			bo->bo_appie_len = aielen;
 
 			len_changed = 1;
 		}
 		frm = bo->bo_appie;
 		if (aie != NULL)
 			frm  = add_appie(frm, aie);
 		clrbit(bo->bo_flags, IEEE80211_BEACON_APPIE);
 	}
 	IEEE80211_UNLOCK(ic);
 
 	return len_changed;
 }
 
 /*
  * Do Ethernet-LLC encapsulation for each payload in a fast frame
  * tunnel encapsulation.  The frame is assumed to have an Ethernet
  * header at the front that must be stripped before prepending the
  * LLC followed by the Ethernet header passed in (with an Ethernet
  * type that specifies the payload size).
  */
 struct mbuf *
 ieee80211_ff_encap1(struct ieee80211vap *vap, struct mbuf *m,
 	const struct ether_header *eh)
 {
 	struct llc *llc;
 	uint16_t payload;
 
 	/* XXX optimize by combining m_adj+M_PREPEND */
 	m_adj(m, sizeof(struct ether_header) - sizeof(struct llc));
 	llc = mtod(m, struct llc *);
 	llc->llc_dsap = llc->llc_ssap = LLC_SNAP_LSAP;
 	llc->llc_control = LLC_UI;
 	llc->llc_snap.org_code[0] = 0;
 	llc->llc_snap.org_code[1] = 0;
 	llc->llc_snap.org_code[2] = 0;
 	llc->llc_snap.ether_type = eh->ether_type;
 	payload = m->m_pkthdr.len;		/* NB: w/o Ethernet header */
 
 	M_PREPEND(m, sizeof(struct ether_header), M_NOWAIT);
 	if (m == NULL) {		/* XXX cannot happen */
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SUPERG,
 			"%s: no space for ether_header\n", __func__);
 		vap->iv_stats.is_tx_nobuf++;
 		return NULL;
 	}
 	ETHER_HEADER_COPY(mtod(m, void *), eh);
 	mtod(m, struct ether_header *)->ether_type = htons(payload);
 	return m;
 }
 
 /*
  * Complete an mbuf transmission.
  *
  * For now, this simply processes a completed frame after the
  * driver has completed it's transmission and/or retransmission.
  * It assumes the frame is an 802.11 encapsulated frame.
  *
  * Later on it will grow to become the exit path for a given frame
  * from the driver and, depending upon how it's been encapsulated
  * and already transmitted, it may end up doing A-MPDU retransmission,
  * power save requeuing, etc.
  *
  * In order for the above to work, the driver entry point to this
  * must not hold any driver locks.  Thus, the driver needs to delay
  * any actual mbuf completion until it can release said locks.
  *
  * This frees the mbuf and if the mbuf has a node reference,
  * the node reference will be freed.
  */
 void
 ieee80211_tx_complete(struct ieee80211_node *ni, struct mbuf *m, int status)
 {
 
 	if (ni != NULL) {
 		struct ifnet *ifp = ni->ni_vap->iv_ifp;
 
 		if (status == 0) {
 			if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
 			if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 			if (m->m_flags & M_MCAST)
 				if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 		} else
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		if (m->m_flags & M_TXCB)
 			ieee80211_process_callback(ni, m, status);
 		ieee80211_free_node(ni);
 	}
 	m_freem(m);
 }
Index: head/sys/net80211/ieee80211_wds.c
===================================================================
--- head/sys/net80211/ieee80211_wds.c	(revision 348253)
+++ head/sys/net80211/ieee80211_wds.c	(revision 348254)
@@ -1,798 +1,800 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2007-2008 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #ifdef __FreeBSD__
 __FBSDID("$FreeBSD$");
 #endif
 
 /*
  * IEEE 802.11 WDS mode support.
  */
 #include "opt_inet.h"
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h> 
 #include <sys/mbuf.h>   
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/endian.h>
 #include <sys/errno.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/if_llc.h>
 #include <net/ethernet.h>
 
 #include <net/bpf.h>
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_wds.h>
 #include <net80211/ieee80211_input.h>
 #ifdef IEEE80211_SUPPORT_SUPERG
 #include <net80211/ieee80211_superg.h>
 #endif
 
 static void wds_vattach(struct ieee80211vap *);
 static int wds_newstate(struct ieee80211vap *, enum ieee80211_state, int);
 static	int wds_input(struct ieee80211_node *ni, struct mbuf *m,
 	    const struct ieee80211_rx_stats *rxs, int, int);
 static void wds_recv_mgmt(struct ieee80211_node *, struct mbuf *, int subtype,
 	const struct ieee80211_rx_stats *, int, int);
 
 void
 ieee80211_wds_attach(struct ieee80211com *ic)
 {
 	ic->ic_vattach[IEEE80211_M_WDS] = wds_vattach;
 }
 
 void
 ieee80211_wds_detach(struct ieee80211com *ic)
 {
 }
 
 static void
 wds_vdetach(struct ieee80211vap *vap)
 {
 	if (vap->iv_bss != NULL) {
 		/* XXX locking? */
 		if (vap->iv_bss->ni_wdsvap == vap)
 			vap->iv_bss->ni_wdsvap = NULL;
 	}
 }
 
 static void
 wds_vattach(struct ieee80211vap *vap)
 {
 	vap->iv_newstate = wds_newstate;
 	vap->iv_input = wds_input;
 	vap->iv_recv_mgmt = wds_recv_mgmt;
 	vap->iv_opdetach = wds_vdetach;
 }
 
 static void
 wds_flush(struct ieee80211_node *ni)
 {
 	struct ieee80211com *ic = ni->ni_ic;
 	struct mbuf *m, *next;
 	int8_t rssi, nf;
 
 	m = ieee80211_ageq_remove(&ic->ic_stageq,
 	    (void *)(uintptr_t) ieee80211_mac_hash(ic, ni->ni_macaddr));
 	if (m == NULL)
 		return;
 
 	IEEE80211_NOTE(ni->ni_vap, IEEE80211_MSG_WDS, ni,
 	    "%s", "flush wds queue");
 	ic->ic_node_getsignal(ni, &rssi, &nf);
 	for (; m != NULL; m = next) {
 		next = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		ieee80211_input(ni, m, rssi, nf);
 	}
 }
 
 static int
 ieee80211_create_wds(struct ieee80211vap *vap, struct ieee80211_channel *chan)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_node_table *nt = &ic->ic_sta;
 	struct ieee80211_node *ni, *obss;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_WDS,
 	     "%s: creating link to %s on channel %u\n", __func__,
 	     ether_sprintf(vap->iv_des_bssid), ieee80211_chan2ieee(ic, chan));
 
 	/* NB: vap create must specify the bssid for the link */
 	KASSERT(vap->iv_flags & IEEE80211_F_DESBSSID, ("no bssid"));
 	/* NB: we should only be called on RUN transition */
 	KASSERT(vap->iv_state == IEEE80211_S_RUN, ("!RUN state"));
 
 	if ((vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY) == 0) {
 		/*
 		 * Dynamic/non-legacy WDS.  Reference the associated
 		 * station specified by the desired bssid setup at vap
 		 * create.  Point ni_wdsvap at the WDS vap so 4-address
 		 * frames received through the associated AP vap will
 		 * be dispatched upward (e.g. to a bridge) as though
 		 * they arrived on the WDS vap.
 		 */
 		IEEE80211_NODE_LOCK(nt);
 		obss = NULL;
 		ni = ieee80211_find_node_locked(&ic->ic_sta, vap->iv_des_bssid);
 		if (ni == NULL) {
 			/*
 			 * Node went away before we could hookup.  This
 			 * should be ok; no traffic will flow and a leave
 			 * event will be dispatched that should cause
 			 * the vap to be destroyed.
 			 */
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_WDS,
 			    "%s: station %s went away\n",
 			    __func__, ether_sprintf(vap->iv_des_bssid));
 			/* XXX stat? */
 		} else if (ni->ni_wdsvap != NULL) {
 			/*
 			 * Node already setup with a WDS vap; we cannot
 			 * allow multiple references so disallow.  If
 			 * ni_wdsvap points at us that's ok; we should
 			 * do nothing anyway.
 			 */
 			/* XXX printf instead? */
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_WDS,
 			    "%s: station %s in use with %s\n",
 			    __func__, ether_sprintf(vap->iv_des_bssid),
 			    ni->ni_wdsvap->iv_ifp->if_xname);
 			/* XXX stat? */
 		} else {
 			/*
 			 * Committed to new node, setup state.
 			 */
 			obss = vap->iv_bss;
 			vap->iv_bss = ni;
 			ni->ni_wdsvap = vap;
 		}
 		IEEE80211_NODE_UNLOCK(nt);
 		if (obss != NULL) {
 			/* NB: deferred to avoid recursive lock */
 			ieee80211_free_node(obss);
 		}
 	} else {
 		/*
 		 * Legacy WDS vap setup.
 		 */
 		/*
 		 * The far end does not associate so we just create
 		 * create a new node and install it as the vap's
 		 * bss node.  We must simulate an association and
 		 * authorize the port for traffic to flow.
 		 * XXX check if node already in sta table?
 		 */
 		ni = ieee80211_node_create_wds(vap, vap->iv_des_bssid, chan);
 		if (ni != NULL) {
 			obss = vap->iv_bss;
 			vap->iv_bss = ieee80211_ref_node(ni);
 			ni->ni_flags |= IEEE80211_NODE_AREF;
 			if (obss != NULL)
 				ieee80211_free_node(obss);
 			/* give driver a chance to setup state like ni_txrate */
 			if (ic->ic_newassoc != NULL)
 				ic->ic_newassoc(ni, 1);
 			/* tell the authenticator about new station */
 			if (vap->iv_auth->ia_node_join != NULL)
 				vap->iv_auth->ia_node_join(ni);
 			if (ni->ni_authmode != IEEE80211_AUTH_8021X)
 				ieee80211_node_authorize(ni);
 
 			ieee80211_notify_node_join(ni, 1 /*newassoc*/);
 			/* XXX inject l2uf frame */
 		}
 	}
 
 	/*
 	 * Flush any pending frames now that were setup.
 	 */
 	if (ni != NULL)
 		wds_flush(ni);
 	return (ni == NULL ? ENOENT : 0);
 }
 
 /*
  * Propagate multicast frames of an ap vap to all DWDS links.
  * The caller is assumed to have verified this frame is multicast.
  */
 void
 ieee80211_dwds_mcast(struct ieee80211vap *vap0, struct mbuf *m)
 {
 	struct ieee80211com *ic = vap0->iv_ic;
 	const struct ether_header *eh = mtod(m, const struct ether_header *);
 	struct ieee80211_node *ni;
 	struct ieee80211vap *vap;
 	struct ifnet *ifp;
 	struct mbuf *mcopy;
 	int err;
 
 	KASSERT(ETHER_IS_MULTICAST(eh->ether_dhost),
 	    ("%s not mcast", ether_sprintf(eh->ether_dhost)));
 
 	/* XXX locking */
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) {
 		/* only DWDS vaps are interesting */
 		if (vap->iv_opmode != IEEE80211_M_WDS ||
 		    (vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY))
 			continue;
 		/* if it came in this interface, don't send it back out */
 		ifp = vap->iv_ifp;
 		if (ifp == m->m_pkthdr.rcvif)
 			continue;
 		/*
 		 * Duplicate the frame and send it.
 		 */
 		mcopy = m_copypacket(m, M_NOWAIT);
 		if (mcopy == NULL) {
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			/* XXX stat + msg */
 			continue;
 		}
 		ni = ieee80211_find_txnode(vap, eh->ether_dhost);
 		if (ni == NULL) {
 			/* NB: ieee80211_find_txnode does stat+msg */
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			m_freem(mcopy);
 			continue;
 		}
 		/* calculate priority so drivers can find the tx queue */
 		if (ieee80211_classify(ni, mcopy)) {
 			IEEE80211_DISCARD_MAC(vap,
 			    IEEE80211_MSG_OUTPUT | IEEE80211_MSG_WDS,
 			    eh->ether_dhost, NULL,
 			    "%s", "classification failure");
 			vap->iv_stats.is_tx_classify++;
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			m_freem(mcopy);
 			ieee80211_free_node(ni);
 			continue;
 		}
 
 		BPF_MTAP(ifp, m);		/* 802.3 tx */
 
 		/*
 		 * Encapsulate the packet in prep for transmission.
 		 */
 		IEEE80211_TX_LOCK(ic);
 		mcopy = ieee80211_encap(vap, ni, mcopy);
 		if (mcopy == NULL) {
 			/* NB: stat+msg handled in ieee80211_encap */
 			IEEE80211_TX_UNLOCK(ic);
 			ieee80211_free_node(ni);
 			continue;
 		}
 		mcopy->m_flags |= M_MCAST;
+		MPASS((mcopy->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 		mcopy->m_pkthdr.rcvif = (void *) ni;
 
 		err = ieee80211_parent_xmitpkt(ic, mcopy);
 		IEEE80211_TX_UNLOCK(ic);
 		if (!err) {
 			if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
 			    m->m_pkthdr.len);
 		}
 	}
 }
 
 /*
  * Handle DWDS discovery on receipt of a 4-address frame in
  * ap mode.  Queue the frame and post an event for someone
  * to plumb the necessary WDS vap for this station.  Frames
  * received prior to the vap set running will then be reprocessed
  * as if they were just received.
  */
 void
 ieee80211_dwds_discover(struct ieee80211_node *ni, struct mbuf *m)
 {
 	struct ieee80211com *ic = ni->ni_ic;
 
 	/*
 	 * Save the frame with an aging interval 4 times
 	 * the listen interval specified by the station. 
 	 * Frames that sit around too long are reclaimed
 	 * using this information.
 	 * XXX handle overflow?
 	 * XXX per/vap beacon interval?
 	 */
+	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	m->m_pkthdr.rcvif = (void *)(uintptr_t)
 	    ieee80211_mac_hash(ic, ni->ni_macaddr);
 	(void) ieee80211_ageq_append(&ic->ic_stageq, m,
 	    ((ni->ni_intval * ic->ic_lintval) << 2) / 1024);
 	ieee80211_notify_wds_discover(ni);
 }
 
 /*
  * IEEE80211_M_WDS vap state machine handler.
  */
 static int
 wds_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	enum ieee80211_state ostate;
 	int error;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	ostate = vap->iv_state;
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE, "%s: %s -> %s\n", __func__,
 		ieee80211_state_name[ostate], ieee80211_state_name[nstate]);
 	vap->iv_state = nstate;			/* state transition */
 	callout_stop(&vap->iv_mgtsend);		/* XXX callout_drain */
 	if (ostate != IEEE80211_S_SCAN)
 		ieee80211_cancel_scan(vap);	/* background scan */
 	error = 0;
 	switch (nstate) {
 	case IEEE80211_S_INIT:
 		switch (ostate) {
 		case IEEE80211_S_SCAN:
 			ieee80211_cancel_scan(vap);
 			break;
 		default:
 			break;
 		}
 		if (ostate != IEEE80211_S_INIT) {
 			/* NB: optimize INIT -> INIT case */
 			ieee80211_reset_bss(vap);
 		}
 		break;
 	case IEEE80211_S_SCAN:
 		switch (ostate) {
 		case IEEE80211_S_INIT:
 			ieee80211_check_scan_current(vap);
 			break;
 		default:
 			break;
 		}
 		break;
 	case IEEE80211_S_RUN:
 		if (ostate == IEEE80211_S_INIT) {
 			/*
 			 * Already have a channel; bypass the scan
 			 * and startup immediately.
 			 */
 			error = ieee80211_create_wds(vap, ic->ic_curchan);
 		}
 		break;
 	default:
 		break;
 	}
 	return error;
 }
 
 /*
  * Process a received frame.  The node associated with the sender
  * should be supplied.  If nothing was found in the node table then
  * the caller is assumed to supply a reference to iv_bss instead.
  * The RSSI and a timestamp are also supplied.  The RSSI data is used
  * during AP scanning to select a AP to associate with; it can have
  * any units so long as values have consistent units and higher values
  * mean ``better signal''.  The receive timestamp is currently not used
  * by the 802.11 layer.
  */
 static int
 wds_input(struct ieee80211_node *ni, struct mbuf *m,
     const struct ieee80211_rx_stats *rxs, int rssi, int nf)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211_frame *wh;
 	struct ieee80211_key *key;
 	struct ether_header *eh;
 	int hdrspace, need_tap = 1;	/* mbuf need to be tapped. */
 	uint8_t dir, type, subtype, qos;
 	int is_hw_decrypted = 0;
 	int has_decrypted = 0;
 
 	/*
 	 * Some devices do hardware decryption all the way through
 	 * to pretending the frame wasn't encrypted in the first place.
 	 * So, tag it appropriately so it isn't discarded inappropriately.
 	 */
 	if ((rxs != NULL) && (rxs->c_pktflags & IEEE80211_RX_F_DECRYPTED))
 		is_hw_decrypted = 1;
 
 	if (m->m_flags & M_AMPDU_MPDU) {
 		/*
 		 * Fastpath for A-MPDU reorder q resubmission.  Frames
 		 * w/ M_AMPDU_MPDU marked have already passed through
 		 * here but were received out of order and been held on
 		 * the reorder queue.  When resubmitted they are marked
 		 * with the M_AMPDU_MPDU flag and we can bypass most of
 		 * the normal processing.
 		 */
 		wh = mtod(m, struct ieee80211_frame *);
 		type = IEEE80211_FC0_TYPE_DATA;
 		dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK;
 		subtype = IEEE80211_FC0_SUBTYPE_QOS;
 		hdrspace = ieee80211_hdrspace(ic, wh);	/* XXX optimize? */
 		goto resubmit_ampdu;
 	}
 
 	KASSERT(ni != NULL, ("null node"));
 
 	type = -1;			/* undefined */
 
 	if (m->m_pkthdr.len < sizeof(struct ieee80211_frame_min)) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 		    ni->ni_macaddr, NULL,
 		    "too short (1): len %u", m->m_pkthdr.len);
 		vap->iv_stats.is_rx_tooshort++;
 		goto out;
 	}
 	/*
 	 * Bit of a cheat here, we use a pointer for a 3-address
 	 * frame format but don't reference fields past outside
 	 * ieee80211_frame_min w/o first validating the data is
 	 * present.
 	 */
 	wh = mtod(m, struct ieee80211_frame *);
 
 	if (!IEEE80211_IS_MULTICAST(wh->i_addr1))
 		ni->ni_inact = ni->ni_inact_reload;
 
 	if ((wh->i_fc[0] & IEEE80211_FC0_VERSION_MASK) !=
 	    IEEE80211_FC0_VERSION_0) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 		    ni->ni_macaddr, NULL, "wrong version, fc %02x:%02x",
 		    wh->i_fc[0], wh->i_fc[1]);
 		vap->iv_stats.is_rx_badversion++;
 		goto err;
 	}
 
 	dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK;
 	type = wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK;
 	subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK;
 
 	/* NB: WDS vap's do not scan */
 	if (m->m_pkthdr.len < sizeof(struct ieee80211_frame_addr4)) {
 		IEEE80211_DISCARD_MAC(vap,
 		    IEEE80211_MSG_ANY, ni->ni_macaddr, NULL,
 		    "too short (3): len %u", m->m_pkthdr.len);
 		vap->iv_stats.is_rx_tooshort++;
 		goto out;
 	}
 	/* NB: the TA is implicitly verified by finding the wds peer node */
 	if (!IEEE80211_ADDR_EQ(wh->i_addr1, vap->iv_myaddr) &&
 	    !IEEE80211_ADDR_EQ(wh->i_addr1, ifp->if_broadcastaddr)) {
 		/* not interested in */
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 		    wh->i_addr1, NULL, "%s", "not to bss");
 		vap->iv_stats.is_rx_wrongbss++;
 		goto out;
 	}
 	IEEE80211_RSSI_LPF(ni->ni_avgrssi, rssi);
 	ni->ni_noise = nf;
 	if (IEEE80211_HAS_SEQ(type, subtype)) {
 		uint8_t tid = ieee80211_gettid(wh);
 		if (IEEE80211_QOS_HAS_SEQ(wh) &&
 		    TID_TO_WME_AC(tid) >= WME_AC_VI)
 			ic->ic_wme.wme_hipri_traffic++;
 		if (! ieee80211_check_rxseq(ni, wh, wh->i_addr1, rxs))
 			goto out;
 	}
 	switch (type) {
 	case IEEE80211_FC0_TYPE_DATA:
 		hdrspace = ieee80211_hdrspace(ic, wh);
 		if (m->m_len < hdrspace &&
 		    (m = m_pullup(m, hdrspace)) == NULL) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			    ni->ni_macaddr, NULL,
 			    "data too short: expecting %u", hdrspace);
 			vap->iv_stats.is_rx_tooshort++;
 			goto out;		/* XXX */
 		}
 		if (dir != IEEE80211_FC1_DIR_DSTODS) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "data", "incorrect dir 0x%x", dir);
 			vap->iv_stats.is_rx_wrongdir++;
 			goto out;
 		}
 		/*
 		 * Only legacy WDS traffic should take this path.
 		 */
 		if ((vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY) == 0) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "data", "%s", "not legacy wds");
 			vap->iv_stats.is_rx_wrongdir++;/*XXX*/
 			goto out;
 		}
 		/*
 		 * Handle A-MPDU re-ordering.  If the frame is to be
 		 * processed directly then ieee80211_ampdu_reorder
 		 * will return 0; otherwise it has consumed the mbuf
 		 * and we should do nothing more with it.
 		 */
 		if ((m->m_flags & M_AMPDU) &&
 		    ieee80211_ampdu_reorder(ni, m, rxs) != 0) {
 			m = NULL;
 			goto out;
 		}
 	resubmit_ampdu:
 
 		/*
 		 * Handle privacy requirements.  Note that we
 		 * must not be preempted from here until after
 		 * we (potentially) call ieee80211_crypto_demic;
 		 * otherwise we may violate assumptions in the
 		 * crypto cipher modules used to do delayed update
 		 * of replay sequence numbers.
 		 */
 		if (is_hw_decrypted || wh->i_fc[1] & IEEE80211_FC1_PROTECTED) {
 			if ((vap->iv_flags & IEEE80211_F_PRIVACY) == 0) {
 				/*
 				 * Discard encrypted frames when privacy is off.
 				 */
 				IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 				    wh, "WEP", "%s", "PRIVACY off");
 				vap->iv_stats.is_rx_noprivacy++;
 				IEEE80211_NODE_STAT(ni, rx_noprivacy);
 				goto out;
 			}
 			if (ieee80211_crypto_decap(ni, m, hdrspace, &key) == 0) {
 				/* NB: stats+msgs handled in crypto_decap */
 				IEEE80211_NODE_STAT(ni, rx_wepfail);
 				goto out;
 			}
 			wh = mtod(m, struct ieee80211_frame *);
 			wh->i_fc[1] &= ~IEEE80211_FC1_PROTECTED;
 			has_decrypted = 1;
 		} else {
 			/* XXX M_WEP and IEEE80211_F_PRIVACY */
 			key = NULL;
 		}
 
 		/*
 		 * Save QoS bits for use below--before we strip the header.
 		 */
 		if (subtype == IEEE80211_FC0_SUBTYPE_QOS)
 			qos = ieee80211_getqos(wh)[0];
 		else
 			qos = 0;
 
 		/*
 		 * Next up, any fragmentation.
 		 */
 		if (!IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 			m = ieee80211_defrag(ni, m, hdrspace);
 			if (m == NULL) {
 				/* Fragment dropped or frame not complete yet */
 				goto out;
 			}
 		}
 		wh = NULL;		/* no longer valid, catch any uses */
 
 		/*
 		 * Next strip any MSDU crypto bits.
 		 */
 		if (!ieee80211_crypto_demic(vap, key, m, 0)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    ni->ni_macaddr, "data", "%s", "demic error");
 			vap->iv_stats.is_rx_demicfail++;
 			IEEE80211_NODE_STAT(ni, rx_demicfail);
 			goto out;
 		}
 
 		/* copy to listener after decrypt */
 		if (ieee80211_radiotap_active_vap(vap))
 			ieee80211_radiotap_rx(vap, m);
 		need_tap = 0;
 
 		/*
 		 * Finally, strip the 802.11 header.
 		 */
 		m = ieee80211_decap(vap, m, hdrspace);
 		if (m == NULL) {
 			/* XXX mask bit to check for both */
 			/* don't count Null data frames as errors */
 			if (subtype == IEEE80211_FC0_SUBTYPE_NODATA ||
 			    subtype == IEEE80211_FC0_SUBTYPE_QOS_NULL)
 				goto out;
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    ni->ni_macaddr, "data", "%s", "decap error");
 			vap->iv_stats.is_rx_decap++;
 			IEEE80211_NODE_STAT(ni, rx_decap);
 			goto err;
 		}
 		eh = mtod(m, struct ether_header *);
 		if (!ieee80211_node_is_authorized(ni)) {
 			/*
 			 * Deny any non-PAE frames received prior to
 			 * authorization.  For open/shared-key
 			 * authentication the port is mark authorized
 			 * after authentication completes.  For 802.1x
 			 * the port is not marked authorized by the
 			 * authenticator until the handshake has completed.
 			 */
 			if (eh->ether_type != htons(ETHERTYPE_PAE)) {
 				IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 				    eh->ether_shost, "data",
 				    "unauthorized port: ether type 0x%x len %u",
 				    eh->ether_type, m->m_pkthdr.len);
 				vap->iv_stats.is_rx_unauth++;
 				IEEE80211_NODE_STAT(ni, rx_unauth);
 				goto err;
 			}
 		} else {
 			/*
 			 * When denying unencrypted frames, discard
 			 * any non-PAE frames received without encryption.
 			 */
 			if ((vap->iv_flags & IEEE80211_F_DROPUNENC) &&
 			    ((has_decrypted == 0) && (m->m_flags & M_WEP) == 0) &&
 			    (is_hw_decrypted == 0) &&
 			    eh->ether_type != htons(ETHERTYPE_PAE)) {
 				/*
 				 * Drop unencrypted frames.
 				 */
 				vap->iv_stats.is_rx_unencrypted++;
 				IEEE80211_NODE_STAT(ni, rx_unencrypted);
 				goto out;
 			}
 		}
 		/* XXX require HT? */
 		if (qos & IEEE80211_QOS_AMSDU) {
 			m = ieee80211_decap_amsdu(ni, m);
 			if (m == NULL)
 				return IEEE80211_FC0_TYPE_DATA;
 		} else {
 #ifdef IEEE80211_SUPPORT_SUPERG
 			m = ieee80211_decap_fastframe(vap, ni, m);
 			if (m == NULL)
 				return IEEE80211_FC0_TYPE_DATA;
 #endif
 		}
 		ieee80211_deliver_data(vap, ni, m);
 		return IEEE80211_FC0_TYPE_DATA;
 
 	case IEEE80211_FC0_TYPE_MGT:
 		vap->iv_stats.is_rx_mgmt++;
 		IEEE80211_NODE_STAT(ni, rx_mgmt);
 		if (dir != IEEE80211_FC1_DIR_NODS) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "data", "incorrect dir 0x%x", dir);
 			vap->iv_stats.is_rx_wrongdir++;
 			goto err;
 		}
 		if (m->m_pkthdr.len < sizeof(struct ieee80211_frame)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			    ni->ni_macaddr, "mgt", "too short: len %u",
 			    m->m_pkthdr.len);
 			vap->iv_stats.is_rx_tooshort++;
 			goto out;
 		}
 #ifdef IEEE80211_DEBUG
 		if (ieee80211_msg_debug(vap) || ieee80211_msg_dumppkts(vap)) {
 			if_printf(ifp, "received %s from %s rssi %d\n",
 			    ieee80211_mgt_subtype_name(subtype),
 			    ether_sprintf(wh->i_addr2), rssi);
 		}
 #endif
 		if (wh->i_fc[1] & IEEE80211_FC1_PROTECTED) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "WEP set but not permitted");
 			vap->iv_stats.is_rx_mgtdiscard++; /* XXX */
 			goto out;
 		}
 		vap->iv_recv_mgmt(ni, m, subtype, rxs, rssi, nf);
 		goto out;
 
 	case IEEE80211_FC0_TYPE_CTL:
 		vap->iv_stats.is_rx_ctl++;
 		IEEE80211_NODE_STAT(ni, rx_ctrl);
 		goto out;
 
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    wh, "bad", "frame type 0x%x", type);
 		/* should not come here */
 		break;
 	}
 err:
 	if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 out:
 	if (m != NULL) {
 		if (need_tap && ieee80211_radiotap_active_vap(vap))
 			ieee80211_radiotap_rx(vap, m);
 		m_freem(m);
 	}
 	return type;
 }
 
 static void
 wds_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0, int subtype,
     const struct ieee80211_rx_stats *rxs, int rssi, int nf)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_frame *wh;
 	u_int8_t *frm, *efrm;
 
 	wh = mtod(m0, struct ieee80211_frame *);
 	frm = (u_int8_t *)&wh[1];
 	efrm = mtod(m0, u_int8_t *) + m0->m_len;
 	switch (subtype) {
 	case IEEE80211_FC0_SUBTYPE_ACTION:
 	case IEEE80211_FC0_SUBTYPE_ACTION_NOACK:
 		if (ni == vap->iv_bss) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "unknown node");
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else if (!IEEE80211_ADDR_EQ(vap->iv_myaddr, wh->i_addr1)) {
 			/* NB: not interested in multicast frames. */
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "not for us");
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else if (vap->iv_state != IEEE80211_S_RUN) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "wrong state %s",
 			    ieee80211_state_name[vap->iv_state]);
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else {
 			if (ieee80211_parse_action(ni, m0) == 0)
 				(void)ic->ic_recv_action(ni, wh, frm, efrm);
 		}
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_ASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_ASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_PROBE_REQ:
 	case IEEE80211_FC0_SUBTYPE_PROBE_RESP:
 	case IEEE80211_FC0_SUBTYPE_TIMING_ADV:
 	case IEEE80211_FC0_SUBTYPE_BEACON:
 	case IEEE80211_FC0_SUBTYPE_ATIM:
 	case IEEE80211_FC0_SUBTYPE_DISASSOC:
 	case IEEE80211_FC0_SUBTYPE_AUTH:
 	case IEEE80211_FC0_SUBTYPE_DEAUTH:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 		    wh, NULL, "%s", "not handled");
 		vap->iv_stats.is_rx_mgtdiscard++;
 		break;
 
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    wh, "mgt", "subtype 0x%x not handled", subtype);
 		vap->iv_stats.is_rx_badsubtype++;
 		break;
 	}
 }
Index: head/sys/netinet/in_pcb.c
===================================================================
--- head/sys/netinet/in_pcb.c	(revision 348253)
+++ head/sys/netinet/in_pcb.c	(revision 348254)
@@ -1,3434 +1,3425 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1991, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2007-2009 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_ipsec.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 #include "opt_pcbgroup.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/callout.h>
 #include <sys/eventhandler.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_llatbl.h>
 #include <net/route.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp_var.h>
 #ifdef TCPHPTS
 #include <netinet/tcp_hpts.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #endif
 #ifdef INET
 #include <netinet/in_var.h>
 #endif
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #endif /* INET6 */
 
 #include <netipsec/ipsec_support.h>
 
 #include <security/mac/mac_framework.h>
 
 #define	INPCBLBGROUP_SIZMIN	8
 #define	INPCBLBGROUP_SIZMAX	256
 
 static struct callout	ipport_tick_callout;
 
 /*
  * These configure the range of local port addresses assigned to
  * "unspecified" outgoing connections/packets/whatever.
  */
 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */
 
 /*
  * Reserved ports accessible only to root. There are significant
  * security considerations that must be accounted for when changing these,
  * but the security benefits can be great. Please be careful.
  */
 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
 VNET_DEFINE(int, ipport_reservedlow);
 
 /* Variables dealing with random ephemeral port allocation. */
 VNET_DEFINE(int, ipport_randomized) = 1;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_randomcps) = 10;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_randomtime) = 45;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_stoprandom);		/* toggled by ipport_tick */
 VNET_DEFINE(int, ipport_tcpallocs);
 VNET_DEFINE_STATIC(int, ipport_tcplastcount);
 
 #define	V_ipport_tcplastcount		VNET(ipport_tcplastcount)
 
 static void	in_pcbremlists(struct inpcb *inp);
 #ifdef INET
 static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
 			    struct in_addr faddr, u_int fport_arg,
 			    struct in_addr laddr, u_int lport_arg,
 			    int lookupflags, struct ifnet *ifp);
 
 #define RANGECHK(var, min, max) \
 	if ((var) < (min)) { (var) = (min); } \
 	else if ((var) > (max)) { (var) = (max); }
 
 static int
 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 
 	error = sysctl_handle_int(oidp, arg1, arg2, req);
 	if (error == 0) {
 		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
 		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
 		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
 		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
 		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
 		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
 	}
 	return (error);
 }
 
 #undef RANGECHK
 
 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0,
     "IP Ports");
 
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
 	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
 	&VNET_NAME(ipport_reservedhigh), 0, "");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
 	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
 	"allocations before switching to a sequental one");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomtime), 0,
 	"Minimum time to keep sequental port "
 	"allocation before switching to a random one");
 #endif /* INET */
 
 /*
  * in_pcb.c: manage the Protocol Control Blocks.
  *
  * NOTE: It is assumed that most of these functions will be called with
  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
  * functions often modify hash chains or addresses in pcbs.
  */
 
 static struct inpcblbgroup *
 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
     uint16_t port, const union in_dependaddr *addr, int size)
 {
 	struct inpcblbgroup *grp;
 	size_t bytes;
 
 	bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
 	grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
 	if (!grp)
 		return (NULL);
 	grp->il_vflag = vflag;
 	grp->il_lport = port;
 	grp->il_dependladdr = *addr;
 	grp->il_inpsiz = size;
 	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
 	return (grp);
 }
 
 static void
 in_pcblbgroup_free_deferred(epoch_context_t ctx)
 {
 	struct inpcblbgroup *grp;
 
 	grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
 	free(grp, M_PCB);
 }
 
 static void
 in_pcblbgroup_free(struct inpcblbgroup *grp)
 {
 
 	CK_LIST_REMOVE(grp, il_list);
 	epoch_call(net_epoch_preempt, &grp->il_epoch_ctx,
 	    in_pcblbgroup_free_deferred);
 }
 
 static struct inpcblbgroup *
 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
     struct inpcblbgroup *old_grp, int size)
 {
 	struct inpcblbgroup *grp;
 	int i;
 
 	grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
 	    old_grp->il_lport, &old_grp->il_dependladdr, size);
 	if (grp == NULL)
 		return (NULL);
 
 	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
 	    ("invalid new local group size %d and old local group count %d",
 	     grp->il_inpsiz, old_grp->il_inpcnt));
 
 	for (i = 0; i < old_grp->il_inpcnt; ++i)
 		grp->il_inp[i] = old_grp->il_inp[i];
 	grp->il_inpcnt = old_grp->il_inpcnt;
 	in_pcblbgroup_free(old_grp);
 	return (grp);
 }
 
 /*
  * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
  * and shrink group if possible.
  */
 static void
 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
     int i)
 {
 	struct inpcblbgroup *grp, *new_grp;
 
 	grp = *grpp;
 	for (; i + 1 < grp->il_inpcnt; ++i)
 		grp->il_inp[i] = grp->il_inp[i + 1];
 	grp->il_inpcnt--;
 
 	if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
 	    grp->il_inpcnt <= grp->il_inpsiz / 4) {
 		/* Shrink this group. */
 		new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
 		if (new_grp != NULL)
 			*grpp = new_grp;
 	}
 }
 
 /*
  * Add PCB to load balance group for SO_REUSEPORT_LB option.
  */
 static int
 in_pcbinslbgrouphash(struct inpcb *inp)
 {
 	const static struct timeval interval = { 60, 0 };
 	static struct timeval lastprint;
 	struct inpcbinfo *pcbinfo;
 	struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	uint32_t idx;
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	/*
 	 * Don't allow jailed socket to join local group.
 	 */
 	if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred))
 		return (0);
 
 #ifdef INET6
 	/*
 	 * Don't allow IPv4 mapped INET6 wild socket.
 	 */
 	if ((inp->inp_vflag & INP_IPV4) &&
 	    inp->inp_laddr.s_addr == INADDR_ANY &&
 	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
 		return (0);
 	}
 #endif
 
 	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
 	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 		if (grp->il_vflag == inp->inp_vflag &&
 		    grp->il_lport == inp->inp_lport &&
 		    memcmp(&grp->il_dependladdr,
 		    &inp->inp_inc.inc_ie.ie_dependladdr,
 		    sizeof(grp->il_dependladdr)) == 0)
 			break;
 	}
 	if (grp == NULL) {
 		/* Create new load balance group. */
 		grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
 		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
 		    INPCBLBGROUP_SIZMIN);
 		if (grp == NULL)
 			return (ENOBUFS);
 	} else if (grp->il_inpcnt == grp->il_inpsiz) {
 		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
 			if (ratecheck(&lastprint, &interval))
 				printf("lb group port %d, limit reached\n",
 				    ntohs(grp->il_lport));
 			return (0);
 		}
 
 		/* Expand this local group. */
 		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
 		if (grp == NULL)
 			return (ENOBUFS);
 	}
 
 	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
 	    ("invalid local group size %d and count %d", grp->il_inpsiz,
 	    grp->il_inpcnt));
 
 	grp->il_inp[grp->il_inpcnt] = inp;
 	grp->il_inpcnt++;
 	return (0);
 }
 
 /*
  * Remove PCB from load balance group.
  */
 static void
 in_pcbremlbgrouphash(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	int i;
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	hdr = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 		for (i = 0; i < grp->il_inpcnt; ++i) {
 			if (grp->il_inp[i] != inp)
 				continue;
 
 			if (grp->il_inpcnt == 1) {
 				/* We are the last, free this local group. */
 				in_pcblbgroup_free(grp);
 			} else {
 				/* Pull up inpcbs, shrink group if possible. */
 				in_pcblbgroup_reorder(hdr, &grp, i);
 			}
 			return;
 		}
 	}
 }
 
 /*
  * Different protocols initialize their inpcbs differently - giving
  * different name to the lock.  But they all are disposed the same.
  */
 static void
 inpcb_fini(void *mem, int size)
 {
 	struct inpcb *inp = mem;
 
 	INP_LOCK_DESTROY(inp);
 }
 
 /*
  * Initialize an inpcbinfo -- we should be able to reduce the number of
  * arguments in time.
  */
 void
 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
     struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
     char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields)
 {
 
 	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
 
 	INP_INFO_LOCK_INIT(pcbinfo, name);
 	INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");	/* XXXRW: argument? */
 	INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
 #ifdef VIMAGE
 	pcbinfo->ipi_vnet = curvnet;
 #endif
 	pcbinfo->ipi_listhead = listhead;
 	CK_LIST_INIT(pcbinfo->ipi_listhead);
 	pcbinfo->ipi_count = 0;
 	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
 	    &pcbinfo->ipi_hashmask);
 	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_porthashmask);
 	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_lbgrouphashmask);
 #ifdef PCBGROUP
 	in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
 #endif
 	pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
 	    NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0);
 	uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
 	uma_zone_set_warning(pcbinfo->ipi_zone,
 	    "kern.ipc.maxsockets limit reached");
 }
 
 /*
  * Destroy an inpcbinfo.
  */
 void
 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
 {
 
 	KASSERT(pcbinfo->ipi_count == 0,
 	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
 
 	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
 	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
 	    pcbinfo->ipi_porthashmask);
 	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
 	    pcbinfo->ipi_lbgrouphashmask);
 #ifdef PCBGROUP
 	in_pcbgroup_destroy(pcbinfo);
 #endif
 	uma_zdestroy(pcbinfo->ipi_zone);
 	INP_LIST_LOCK_DESTROY(pcbinfo);
 	INP_HASH_LOCK_DESTROY(pcbinfo);
 	INP_INFO_LOCK_DESTROY(pcbinfo);
 }
 
 /*
  * Allocate a PCB and associate it with the socket.
  * On success return with the PCB locked.
  */
 int
 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
 {
 	struct inpcb *inp;
 	int error;
 
 #ifdef INVARIANTS
 	if (pcbinfo == &V_tcbinfo) {
 		INP_INFO_RLOCK_ASSERT(pcbinfo);
 	} else {
 		INP_INFO_WLOCK_ASSERT(pcbinfo);
 	}
 #endif
 
 	error = 0;
 	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
 	if (inp == NULL)
 		return (ENOBUFS);
 	bzero(&inp->inp_start_zero, inp_zero_size);
 #ifdef NUMA
 	inp->inp_numa_domain = M_NODOM;
 #endif
 	inp->inp_pcbinfo = pcbinfo;
 	inp->inp_socket = so;
 	inp->inp_cred = crhold(so->so_cred);
 	inp->inp_inc.inc_fibnum = so->so_fibnum;
 #ifdef MAC
 	error = mac_inpcb_init(inp, M_NOWAIT);
 	if (error != 0)
 		goto out;
 	mac_inpcb_create(so, inp);
 #endif
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	error = ipsec_init_pcbpolicy(inp);
 	if (error != 0) {
 #ifdef MAC
 		mac_inpcb_destroy(inp);
 #endif
 		goto out;
 	}
 #endif /*IPSEC*/
 #ifdef INET6
 	if (INP_SOCKAF(so) == AF_INET6) {
 		inp->inp_vflag |= INP_IPV6PROTO;
 		if (V_ip6_v6only)
 			inp->inp_flags |= IN6P_IPV6_V6ONLY;
 	}
 #endif
 	INP_WLOCK(inp);
 	INP_LIST_WLOCK(pcbinfo);
 	CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
 	pcbinfo->ipi_count++;
 	so->so_pcb = (caddr_t)inp;
 #ifdef INET6
 	if (V_ip6_auto_flowlabel)
 		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
 #endif
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	refcount_init(&inp->inp_refcount, 1);	/* Reference from inpcbinfo */
 
 	/*
 	 * Routes in inpcb's can cache L2 as well; they are guaranteed
 	 * to be cleaned up.
 	 */
 	inp->inp_route.ro_flags = RT_LLE_CACHE;
 	INP_LIST_WUNLOCK(pcbinfo);
 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
 out:
 	if (error != 0) {
 		crfree(inp->inp_cred);
 		uma_zfree(pcbinfo->ipi_zone, inp);
 	}
 #endif
 	return (error);
 }
 
 #ifdef INET
 int
 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 	int anonport, error;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 	anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0;
 	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
 	    &inp->inp_lport, cred);
 	if (error)
 		return (error);
 	if (in_pcbinshash(inp) != 0) {
 		inp->inp_laddr.s_addr = INADDR_ANY;
 		inp->inp_lport = 0;
 		return (EAGAIN);
 	}
 	if (anonport)
 		inp->inp_flags |= INP_ANONPORT;
 	return (0);
 }
 #endif
 
 /*
  * Select a local port (number) to use.
  */
 #if defined(INET) || defined(INET6)
 int
 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
     struct ucred *cred, int lookupflags)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcb *tmpinp;
 	unsigned short *lastport;
 	int count, dorandom, error;
 	u_short aux, first, last, lport;
 #ifdef INET
 	struct in_addr laddr;
 #endif
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	/*
 	 * Because no actual state changes occur here, a global write lock on
 	 * the pcbinfo isn't required.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if (inp->inp_flags & INP_HIGHPORT) {
 		first = V_ipport_hifirstauto;	/* sysctl */
 		last  = V_ipport_hilastauto;
 		lastport = &pcbinfo->ipi_lasthi;
 	} else if (inp->inp_flags & INP_LOWPORT) {
 		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
 		if (error)
 			return (error);
 		first = V_ipport_lowfirstauto;	/* 1023 */
 		last  = V_ipport_lowlastauto;	/* 600 */
 		lastport = &pcbinfo->ipi_lastlow;
 	} else {
 		first = V_ipport_firstauto;	/* sysctl */
 		last  = V_ipport_lastauto;
 		lastport = &pcbinfo->ipi_lastport;
 	}
 	/*
 	 * For UDP(-Lite), use random port allocation as long as the user
 	 * allows it.  For TCP (and as of yet unknown) connections,
 	 * use random port allocation only if the user allows it AND
 	 * ipport_tick() allows it.
 	 */
 	if (V_ipport_randomized &&
 		(!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
 		pcbinfo == &V_ulitecbinfo))
 		dorandom = 1;
 	else
 		dorandom = 0;
 	/*
 	 * It makes no sense to do random port allocation if
 	 * we have the only port available.
 	 */
 	if (first == last)
 		dorandom = 0;
 	/* Make sure to not include UDP(-Lite) packets in the count. */
 	if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo)
 		V_ipport_tcpallocs++;
 	/*
 	 * Instead of having two loops further down counting up or down
 	 * make sure that first is always <= last and go with only one
 	 * code path implementing all logic.
 	 */
 	if (first > last) {
 		aux = first;
 		first = last;
 		last = aux;
 	}
 
 #ifdef INET
 	/* Make the compiler happy. */
 	laddr.s_addr = 0;
 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
 		KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p",
 		    __func__, inp));
 		laddr = *laddrp;
 	}
 #endif
 	tmpinp = NULL;	/* Make compiler happy. */
 	lport = *lportp;
 
 	if (dorandom)
 		*lastport = first + (arc4random() % (last - first));
 
 	count = last - first;
 
 	do {
 		if (count-- < 0)	/* completely used? */
 			return (EADDRNOTAVAIL);
 		++*lastport;
 		if (*lastport < first || *lastport > last)
 			*lastport = first;
 		lport = htons(*lastport);
 
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV6) != 0)
 			tmpinp = in6_pcblookup_local(pcbinfo,
 			    &inp->in6p_laddr, lport, lookupflags, cred);
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 			tmpinp = in_pcblookup_local(pcbinfo, laddr,
 			    lport, lookupflags, cred);
 #endif
 	} while (tmpinp != NULL);
 
 #ifdef INET
 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4)
 		laddrp->s_addr = laddr.s_addr;
 #endif
 	*lportp = lport;
 
 	return (0);
 }
 
 /*
  * Return cached socket options.
  */
 int
 inp_so_options(const struct inpcb *inp)
 {
 	int so_options;
 
 	so_options = 0;
 
 	if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
 		so_options |= SO_REUSEPORT_LB;
 	if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
 		so_options |= SO_REUSEPORT;
 	if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
 		so_options |= SO_REUSEADDR;
 	return (so_options);
 }
 #endif /* INET || INET6 */
 
 /*
  * Check if a new BINDMULTI socket is allowed to be created.
  *
  * ni points to the new inp.
  * oi points to the exisitng inp.
  *
  * This checks whether the existing inp also has BINDMULTI and
  * whether the credentials match.
  */
 int
 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
 {
 	/* Check permissions match */
 	if ((ni->inp_flags2 & INP_BINDMULTI) &&
 	    (ni->inp_cred->cr_uid !=
 	    oi->inp_cred->cr_uid))
 		return (0);
 
 	/* Check the existing inp has BINDMULTI set */
 	if ((ni->inp_flags2 & INP_BINDMULTI) &&
 	    ((oi->inp_flags2 & INP_BINDMULTI) == 0))
 		return (0);
 
 	/*
 	 * We're okay - either INP_BINDMULTI isn't set on ni, or
 	 * it is and it matches the checks.
 	 */
 	return (1);
 }
 
 #ifdef INET
 /*
  * Set up a bind operation on a PCB, performing port allocation
  * as required, but do not actually modify the PCB. Callers can
  * either complete the bind by setting inp_laddr/inp_lport and
  * calling in_pcbinshash(), or they can just use the resulting
  * port and address to authorise the sending of a once-off packet.
  *
  * On error, the values of *laddrp and *lportp are not changed.
  */
 int
 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
     u_short *lportp, struct ucred *cred)
 {
 	struct socket *so = inp->inp_socket;
 	struct sockaddr_in *sin;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct in_addr laddr;
 	u_short lport = 0;
 	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
 	int error;
 
 	/*
 	 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
 	 * so that we don't have to add to the (already messy) code below.
 	 */
 	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
 
 	/*
 	 * No state changes, so read locks are sufficient here.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if (CK_STAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
 		return (EADDRNOTAVAIL);
 	laddr.s_addr = *laddrp;
 	if (nam != NULL && laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
 		lookupflags = INPLOOKUP_WILDCARD;
 	if (nam == NULL) {
 		if ((error = prison_local_ip4(cred, &laddr)) != 0)
 			return (error);
 	} else {
 		sin = (struct sockaddr_in *)nam;
 		if (nam->sa_len != sizeof (*sin))
 			return (EINVAL);
 #ifdef notdef
 		/*
 		 * We should check the family, but old programs
 		 * incorrectly fail to initialize it.
 		 */
 		if (sin->sin_family != AF_INET)
 			return (EAFNOSUPPORT);
 #endif
 		error = prison_local_ip4(cred, &sin->sin_addr);
 		if (error)
 			return (error);
 		if (sin->sin_port != *lportp) {
 			/* Don't allow the port to change. */
 			if (*lportp != 0)
 				return (EINVAL);
 			lport = sin->sin_port;
 		}
 		/* NB: lport is left as 0 if the port isn't being changed. */
 		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
 			/*
 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
 			 * allow complete duplication of binding if
 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
 			 * and a multicast address is bound on both
 			 * new and duplicated sockets.
 			 */
 			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
 			/*
 			 * XXX: How to deal with SO_REUSEPORT_LB here?
 			 * Treat same as SO_REUSEPORT for now.
 			 */
 			if ((so->so_options &
 			    (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
 				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
 		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
 			sin->sin_port = 0;		/* yech... */
 			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
 			/*
 			 * Is the address a local IP address?
 			 * If INP_BINDANY is set, then the socket may be bound
 			 * to any endpoint address, local or not.
 			 */
 			if ((inp->inp_flags & INP_BINDANY) == 0 &&
 			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
 				return (EADDRNOTAVAIL);
 		}
 		laddr = sin->sin_addr;
 		if (lport) {
 			struct inpcb *t;
 			struct tcptw *tw;
 
 			/* GROSS */
 			if (ntohs(lport) <= V_ipport_reservedhigh &&
 			    ntohs(lport) >= V_ipport_reservedlow &&
 			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
 				return (EACCES);
 			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
 			    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
 				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 				    lport, INPLOOKUP_WILDCARD, cred);
 	/*
 	 * XXX
 	 * This entire block sorely needs a rewrite.
 	 */
 				if (t &&
 				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
 				    (so->so_type != SOCK_STREAM ||
 				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
 				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
 				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
 				     (t->inp_flags2 & INP_REUSEPORT) ||
 				     (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
 				    (inp->inp_cred->cr_uid !=
 				     t->inp_cred->cr_uid))
 					return (EADDRINUSE);
 
 				/*
 				 * If the socket is a BINDMULTI socket, then
 				 * the credentials need to match and the
 				 * original socket also has to have been bound
 				 * with BINDMULTI.
 				 */
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 			    lport, lookupflags, cred);
 			if (t && (t->inp_flags & INP_TIMEWAIT)) {
 				/*
 				 * XXXRW: If an incpb has had its timewait
 				 * state recycled, we treat the address as
 				 * being in use (for now).  This is better
 				 * than a panic, but not desirable.
 				 */
 				tw = intotw(t);
 				if (tw == NULL ||
 				    ((reuseport & tw->tw_so_options) == 0 &&
 					(reuseport_lb &
 				            tw->tw_so_options) == 0)) {
 					return (EADDRINUSE);
 				}
 			} else if (t &&
 				   ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 				   (reuseport & inp_so_options(t)) == 0 &&
 				   (reuseport_lb & inp_so_options(t)) == 0) {
 #ifdef INET6
 				if (ntohl(sin->sin_addr.s_addr) !=
 				    INADDR_ANY ||
 				    ntohl(t->inp_laddr.s_addr) !=
 				    INADDR_ANY ||
 				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
 				    (t->inp_vflag & INP_IPV6PROTO) == 0)
 #endif
 						return (EADDRINUSE);
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 		}
 	}
 	if (*lportp != 0)
 		lport = *lportp;
 	if (lport == 0) {
 		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
 		if (error != 0)
 			return (error);
 
 	}
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	return (0);
 }
 
 /*
  * Connect from a socket to a specified address.
  * Both address and port must be specified in argument sin.
  * If don't have a local address for this socket yet,
  * then pick one.
  */
 int
 in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
     struct ucred *cred, struct mbuf *m)
 {
 	u_short lport, fport;
 	in_addr_t laddr, faddr;
 	int anonport, error;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	lport = inp->inp_lport;
 	laddr = inp->inp_laddr.s_addr;
 	anonport = (lport == 0);
 	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
 	    NULL, cred);
 	if (error)
 		return (error);
 
 	/* Do the initial binding of the local address if required. */
 	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
 		inp->inp_lport = lport;
 		inp->inp_laddr.s_addr = laddr;
 		if (in_pcbinshash(inp) != 0) {
 			inp->inp_laddr.s_addr = INADDR_ANY;
 			inp->inp_lport = 0;
 			return (EAGAIN);
 		}
 	}
 
 	/* Commit the remaining changes. */
 	inp->inp_lport = lport;
 	inp->inp_laddr.s_addr = laddr;
 	inp->inp_faddr.s_addr = faddr;
 	inp->inp_fport = fport;
 	in_pcbrehash_mbuf(inp, m);
 
 	if (anonport)
 		inp->inp_flags |= INP_ANONPORT;
 	return (0);
 }
 
 int
 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 
 	return (in_pcbconnect_mbuf(inp, nam, cred, NULL));
 }
 
 /*
  * Do proper source address selection on an unbound socket in case
  * of connect. Take jails into account as well.
  */
 int
 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
     struct ucred *cred)
 {
 	struct ifaddr *ifa;
 	struct sockaddr *sa;
 	struct sockaddr_in *sin;
 	struct route sro;
 	struct epoch_tracker et;
 	int error;
 
 	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
 	/*
 	 * Bypass source address selection and use the primary jail IP
 	 * if requested.
 	 */
 	if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
 		return (0);
 
 	error = 0;
 	bzero(&sro, sizeof(sro));
 
 	sin = (struct sockaddr_in *)&sro.ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(struct sockaddr_in);
 	sin->sin_addr.s_addr = faddr->s_addr;
 
 	/*
 	 * If route is known our src addr is taken from the i/f,
 	 * else punt.
 	 *
 	 * Find out route to destination.
 	 */
 	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
 		in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);
 
 	/*
 	 * If we found a route, use the address corresponding to
 	 * the outgoing interface.
 	 * 
 	 * Otherwise assume faddr is reachable on a directly connected
 	 * network and try to find a corresponding interface to take
 	 * the source address from.
 	 */
 	NET_EPOCH_ENTER(et);
 	if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
 		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
 					inp->inp_socket->so_fibnum));
 		if (ia == NULL) {
 			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
 						inp->inp_socket->so_fibnum));
 
 		}
 		if (ia == NULL) {
 			error = ENETUNREACH;
 			goto done;
 		}
 
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		ifp = ia->ia_ifp;
 		ia = NULL;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * If the outgoing interface on the route found is not
 	 * a loopback interface, use the address from that interface.
 	 * In case of jails do those three steps:
 	 * 1. check if the interface address belongs to the jail. If so use it.
 	 * 2. check if we have any address on the outgoing interface
 	 *    belonging to this jail. If so use it.
 	 * 3. as a last resort return the 'default' jail address.
 	 */
 	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
 		/* If not jailed, use the default returned. */
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed. */
 		/* 1. Check if the iface address belongs to the jail. */
 		sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
 		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/*
 		 * 2. Check if we have any address on the outgoing interface
 		 *    belonging to this jail.
 		 */
 		ia = NULL;
 		ifp = sro.ro_rt->rt_ifp;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * The outgoing interface is marked with 'loopback net', so a route
 	 * to ourselves is here.
 	 * Try to find the interface of the destination address and then
 	 * take the address from there. That interface is not necessarily
 	 * a loopback interface.
 	 * In case of jails, check that it is an address of the jail
 	 * and if we cannot find, fall back to the 'default' jail address.
 	 */
 	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
 		struct sockaddr_in sain;
 		struct in_ifaddr *ia;
 
 		bzero(&sain, sizeof(struct sockaddr_in));
 		sain.sin_family = AF_INET;
 		sain.sin_len = sizeof(struct sockaddr_in);
 		sain.sin_addr.s_addr = faddr->s_addr;
 
 		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain),
 					inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0,
 						inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));
 
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			if (ia == NULL) {
 				error = ENETUNREACH;
 				goto done;
 			}
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed. */
 		if (ia != NULL) {
 			struct ifnet *ifp;
 
 			ifp = ia->ia_ifp;
 			ia = NULL;
 			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 				sa = ifa->ifa_addr;
 				if (sa->sa_family != AF_INET)
 					continue;
 				sin = (struct sockaddr_in *)sa;
 				if (prison_check_ip4(cred,
 				    &sin->sin_addr) == 0) {
 					ia = (struct in_ifaddr *)ifa;
 					break;
 				}
 			}
 			if (ia != NULL) {
 				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 				goto done;
 			}
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 done:
 	NET_EPOCH_EXIT(et);
 	if (sro.ro_rt != NULL)
 		RTFREE(sro.ro_rt);
 	return (error);
 }
 
 /*
  * Set up for a connect from a socket to the specified address.
  * On entry, *laddrp and *lportp should contain the current local
  * address and port for the PCB; these are updated to the values
  * that should be placed in inp_laddr and inp_lport to complete
  * the connect.
  *
  * On success, *faddrp and *fportp will be set to the remote address
  * and port. These are not updated in the error case.
  *
  * If the operation fails because the connection already exists,
  * *oinpp will be set to the PCB of that connection so that the
  * caller can decide to override it. In all other cases, *oinpp
  * is set to NULL.
  */
 int
 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
     struct inpcb **oinpp, struct ucred *cred)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 	struct in_ifaddr *ia;
 	struct inpcb *oinp;
 	struct in_addr laddr, faddr;
 	u_short lport, fport;
 	int error;
 
 	/*
 	 * Because a global state change doesn't actually occur here, a read
 	 * lock is sufficient.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (oinpp != NULL)
 		*oinpp = NULL;
 	if (nam->sa_len != sizeof (*sin))
 		return (EINVAL);
 	if (sin->sin_family != AF_INET)
 		return (EAFNOSUPPORT);
 	if (sin->sin_port == 0)
 		return (EADDRNOTAVAIL);
 	laddr.s_addr = *laddrp;
 	lport = *lportp;
 	faddr = sin->sin_addr;
 	fport = sin->sin_port;
 
 	if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
 		/*
 		 * If the destination address is INADDR_ANY,
 		 * use the primary local address.
 		 * If the supplied address is INADDR_BROADCAST,
 		 * and the primary interface supports broadcast,
 		 * choose the broadcast address for that interface.
 		 */
 		if (faddr.s_addr == INADDR_ANY) {
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			faddr =
 			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			if (cred != NULL &&
 			    (error = prison_get_ip4(cred, &faddr)) != 0)
 				return (error);
 		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
 			    IFF_BROADCAST)
 				faddr = satosin(&CK_STAILQ_FIRST(
 				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 		}
 	}
 	if (laddr.s_addr == INADDR_ANY) {
 		error = in_pcbladdr(inp, &faddr, &laddr, cred);
 		/*
 		 * If the destination address is multicast and an outgoing
 		 * interface has been set as a multicast option, prefer the
 		 * address of that interface as our source address.
 		 */
 		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
 		    inp->inp_moptions != NULL) {
 			struct ip_moptions *imo;
 			struct ifnet *ifp;
 
 			imo = inp->inp_moptions;
 			if (imo->imo_multicast_ifp != NULL) {
 				ifp = imo->imo_multicast_ifp;
 				IN_IFADDR_RLOCK(&in_ifa_tracker);
 				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 					if ((ia->ia_ifp == ifp) &&
 					    (cred == NULL ||
 					    prison_check_ip4(cred,
 					    &ia->ia_addr.sin_addr) == 0))
 						break;
 				}
 				if (ia == NULL)
 					error = EADDRNOTAVAIL;
 				else {
 					laddr = ia->ia_addr.sin_addr;
 					error = 0;
 				}
 				IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			}
 		}
 		if (error)
 			return (error);
 	}
 	oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport,
 	    laddr, lport, 0, NULL);
 	if (oinp != NULL) {
 		if (oinpp != NULL)
 			*oinpp = oinp;
 		return (EADDRINUSE);
 	}
 	if (lport == 0) {
 		error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
 		    cred);
 		if (error)
 			return (error);
 	}
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	*faddrp = faddr.s_addr;
 	*fportp = fport;
 	return (0);
 }
 
 void
 in_pcbdisconnect(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	inp->inp_faddr.s_addr = INADDR_ANY;
 	inp->inp_fport = 0;
 	in_pcbrehash(inp);
 }
 #endif /* INET */
 
 /*
  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
  * For most protocols, this will be invoked immediately prior to calling
  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
  * socket, in which case in_pcbfree() is deferred.
  */
 void
 in_pcbdetach(struct inpcb *inp)
 {
 
 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
 
 #ifdef RATELIMIT
 	if (inp->inp_snd_tag != NULL)
 		in_pcbdetach_txrtlmt(inp);
 #endif
 	inp->inp_socket->so_pcb = NULL;
 	inp->inp_socket = NULL;
 }
 
 /*
  * in_pcbref() bumps the reference count on an inpcb in order to maintain
  * stability of an inpcb pointer despite the inpcb lock being released.  This
  * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
  * but where the inpcb lock may already held, or when acquiring a reference
  * via a pcbgroup.
  *
  * in_pcbref() should be used only to provide brief memory stability, and
  * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
  * garbage collect the inpcb if it has been in_pcbfree()'d from another
  * context.  Until in_pcbrele() has returned that the inpcb is still valid,
  * lock and rele are the *only* safe operations that may be performed on the
  * inpcb.
  *
  * While the inpcb will not be freed, releasing the inpcb lock means that the
  * connection's state may change, so the caller should be careful to
  * revalidate any cached state on reacquiring the lock.  Drop the reference
  * using in_pcbrele().
  */
 void
 in_pcbref(struct inpcb *inp)
 {
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	refcount_acquire(&inp->inp_refcount);
 }
 
 /*
  * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
  * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
  * return a flag indicating whether or not the inpcb remains valid.  If it is
  * valid, we return with the inpcb lock held.
  *
  * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
  * reference on an inpcb.  Historically more work was done here (actually, in
  * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
  * need for the pcbinfo lock in in_pcbrele().  Deferring the free is entirely
  * about memory stability (and continued use of the write lock).
  */
 int
 in_pcbrele_rlocked(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	INP_RLOCK_ASSERT(inp);
 
 	if (refcount_release(&inp->inp_refcount) == 0) {
 		/*
 		 * If the inpcb has been freed, let the caller know, even if
 		 * this isn't the last reference.
 		 */
 		if (inp->inp_flags2 & INP_FREED) {
 			INP_RUNLOCK(inp);
 			return (1);
 		}
 		return (0);
 	}
 	
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 #ifdef TCPHPTS
 	if (inp->inp_in_hpts || inp->inp_in_input) {
 		struct tcp_hpts_entry *hpts;
 		/*
 		 * We should not be on the hpts at 
 		 * this point in any form. we must
 		 * get the lock to be sure.
 		 */
 		hpts = tcp_hpts_lock(inp);
 		if (inp->inp_in_hpts)
 			panic("Hpts:%p inp:%p at free still on hpts",
 			      hpts, inp);
 		mtx_unlock(&hpts->p_mtx);
 		hpts = tcp_input_lock(inp);
 		if (inp->inp_in_input) 
 			panic("Hpts:%p inp:%p at free still on input hpts",
 			      hpts, inp);
 		mtx_unlock(&hpts->p_mtx);
 	}
 #endif
 	INP_RUNLOCK(inp);
 	pcbinfo = inp->inp_pcbinfo;
 	uma_zfree(pcbinfo->ipi_zone, inp);
 	return (1);
 }
 
 int
 in_pcbrele_wlocked(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	INP_WLOCK_ASSERT(inp);
 
 	if (refcount_release(&inp->inp_refcount) == 0) {
 		/*
 		 * If the inpcb has been freed, let the caller know, even if
 		 * this isn't the last reference.
 		 */
 		if (inp->inp_flags2 & INP_FREED) {
 			INP_WUNLOCK(inp);
 			return (1);
 		}
 		return (0);
 	}
 
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 #ifdef TCPHPTS
 	if (inp->inp_in_hpts || inp->inp_in_input) {
 		struct tcp_hpts_entry *hpts;
 		/*
 		 * We should not be on the hpts at 
 		 * this point in any form. we must
 		 * get the lock to be sure.
 		 */
 		hpts = tcp_hpts_lock(inp);
 		if (inp->inp_in_hpts)
 			panic("Hpts:%p inp:%p at free still on hpts",
 			      hpts, inp);
 		mtx_unlock(&hpts->p_mtx);
 		hpts = tcp_input_lock(inp);
 		if (inp->inp_in_input) 
 			panic("Hpts:%p inp:%p at free still on input hpts",
 			      hpts, inp);
 		mtx_unlock(&hpts->p_mtx);
 	}
 #endif
 	INP_WUNLOCK(inp);
 	pcbinfo = inp->inp_pcbinfo;
 	uma_zfree(pcbinfo->ipi_zone, inp);
 	return (1);
 }
 
 /*
  * Temporary wrapper.
  */
 int
 in_pcbrele(struct inpcb *inp)
 {
 
 	return (in_pcbrele_wlocked(inp));
 }
 
 void
 in_pcblist_rele_rlocked(epoch_context_t ctx)
 {
 	struct in_pcblist *il;
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	int i, n;
 
 	il = __containerof(ctx, struct in_pcblist, il_epoch_ctx);
 	pcbinfo = il->il_pcbinfo;
 	n = il->il_count;
 	INP_INFO_WLOCK(pcbinfo);
 	for (i = 0; i < n; i++) {
 		inp = il->il_inp_list[i];
 		INP_RLOCK(inp);
 		if (!in_pcbrele_rlocked(inp))
 			INP_RUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 	free(il, M_TEMP);
 }
 
 static void
 inpcbport_free(epoch_context_t ctx)
 {
 	struct inpcbport *phd;
 
 	phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx);
 	free(phd, M_PCB);
 }
 
 static void
 in_pcbfree_deferred(epoch_context_t ctx)
 {
 	struct inpcb *inp;
 	int released __unused;
 
 	inp = __containerof(ctx, struct inpcb, inp_epoch_ctx);
 
 	INP_WLOCK(inp);
 	CURVNET_SET(inp->inp_vnet);
 #ifdef INET
 	struct ip_moptions *imo = inp->inp_moptions;
 	inp->inp_moptions = NULL;
 #endif
 	/* XXXRW: Do as much as possible here. */
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	if (inp->inp_sp != NULL)
 		ipsec_delete_pcbpolicy(inp);
 #endif
 #ifdef INET6
 	struct ip6_moptions *im6o = NULL;
 	if (inp->inp_vflag & INP_IPV6PROTO) {
 		ip6_freepcbopts(inp->in6p_outputopts);
 		im6o = inp->in6p_moptions;
 		inp->in6p_moptions = NULL;
 	}
 #endif
 	if (inp->inp_options)
 		(void)m_free(inp->inp_options);
 	inp->inp_vflag = 0;
 	crfree(inp->inp_cred);
 #ifdef MAC
 	mac_inpcb_destroy(inp);
 #endif
 	released = in_pcbrele_wlocked(inp);
 	MPASS(released);
 #ifdef INET6
 	ip6_freemoptions(im6o);
 #endif
 #ifdef INET
 	inp_freemoptions(imo);
 #endif	
 	CURVNET_RESTORE();
 }
 
 /*
  * Unconditionally schedule an inpcb to be freed by decrementing its
  * reference count, which should occur only after the inpcb has been detached
  * from its socket.  If another thread holds a temporary reference (acquired
  * using in_pcbref()) then the free is deferred until that reference is
  * released using in_pcbrele(), but the inpcb is still unlocked.  Almost all
  * work, including removal from global lists, is done in this context, where
  * the pcbinfo lock is held.
  */
 void
 in_pcbfree(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 	KASSERT((inp->inp_flags2 & INP_FREED) == 0,
 	    ("%s: called twice for pcb %p", __func__, inp));
 	if (inp->inp_flags2 & INP_FREED) {
 		INP_WUNLOCK(inp);
 		return;
 	}
 
 #ifdef INVARIANTS
 	if (pcbinfo == &V_tcbinfo) {
 		INP_INFO_LOCK_ASSERT(pcbinfo);
 	} else {
 		INP_INFO_WLOCK_ASSERT(pcbinfo);
 	}
 #endif
 	INP_WLOCK_ASSERT(inp);
 	INP_LIST_WLOCK(pcbinfo);
 	in_pcbremlists(inp);
 	INP_LIST_WUNLOCK(pcbinfo);
 	RO_INVALIDATE_CACHE(&inp->inp_route);
 	/* mark as destruction in progress */
 	inp->inp_flags2 |= INP_FREED;
 	INP_WUNLOCK(inp);
 	epoch_call(net_epoch_preempt, &inp->inp_epoch_ctx, in_pcbfree_deferred);
 }
 
 /*
  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
  * port reservation, and preventing it from being returned by inpcb lookups.
  *
  * It is used by TCP to mark an inpcb as unused and avoid future packet
  * delivery or event notification when a socket remains open but TCP has
  * closed.  This might occur as a result of a shutdown()-initiated TCP close
  * or a RST on the wire, and allows the port binding to be reused while still
  * maintaining the invariant that so_pcb always points to a valid inpcb until
  * in_pcbdetach().
  *
  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
  * in_pcbnotifyall() and in_pcbpurgeif0()?
  */
 void
 in_pcbdrop(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 #ifdef INVARIANTS
 	if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
 		MPASS(inp->inp_refcount > 1);
 #endif
 
 	/*
 	 * XXXRW: Possibly we should protect the setting of INP_DROPPED with
 	 * the hash lock...?
 	 */
 	inp->inp_flags |= INP_DROPPED;
 	if (inp->inp_flags & INP_INHASHLIST) {
 		struct inpcbport *phd = inp->inp_phd;
 
 		INP_HASH_WLOCK(inp->inp_pcbinfo);
 		in_pcbremlbgrouphash(inp);
 		CK_LIST_REMOVE(inp, inp_hash);
 		CK_LIST_REMOVE(inp, inp_portlist);
 		if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
 			CK_LIST_REMOVE(phd, phd_hash);
 			epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free);
 		}
 		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
 		inp->inp_flags &= ~INP_INHASHLIST;
 #ifdef PCBGROUP
 		in_pcbgroup_remove(inp);
 #endif
 	}
 }
 
 #ifdef INET
 /*
  * Common routines to return the socket addresses associated with inpcbs.
  */
 struct sockaddr *
 in_sockaddr(in_port_t port, struct in_addr *addr_p)
 {
 	struct sockaddr_in *sin;
 
 	sin = malloc(sizeof *sin, M_SONAME,
 		M_WAITOK | M_ZERO);
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 	sin->sin_addr = *addr_p;
 	sin->sin_port = port;
 
 	return (struct sockaddr *)sin;
 }
 
 int
 in_getsockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_lport;
 	addr = inp->inp_laddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 int
 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_fport;
 	addr = inp->inp_faddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 void
 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
     struct inpcb *(*notify)(struct inpcb *, int))
 {
 	struct inpcb *inp, *inp_temp;
 
 	INP_INFO_WLOCK(pcbinfo);
 	CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
 		INP_WLOCK(inp);
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV4) == 0) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 #endif
 		if (inp->inp_faddr.s_addr != faddr.s_addr ||
 		    inp->inp_socket == NULL) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 		if ((*notify)(inp, errno))
 			INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 void
 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
 {
 	struct inpcb *inp;
 	struct ip_moptions *imo;
 	int i, gap;
 
 	INP_INFO_WLOCK(pcbinfo);
 	CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
 		INP_WLOCK(inp);
 		imo = inp->inp_moptions;
 		if ((inp->inp_vflag & INP_IPV4) &&
 		    imo != NULL) {
 			/*
 			 * Unselect the outgoing interface if it is being
 			 * detached.
 			 */
 			if (imo->imo_multicast_ifp == ifp)
 				imo->imo_multicast_ifp = NULL;
 
 			/*
 			 * Drop multicast group membership if we joined
 			 * through the interface being detached.
 			 *
 			 * XXX This can all be deferred to an epoch_call
 			 */
 			for (i = 0, gap = 0; i < imo->imo_num_memberships;
 			    i++) {
 				if (imo->imo_membership[i]->inm_ifp == ifp) {
 					IN_MULTI_LOCK_ASSERT();
 					in_leavegroup_locked(imo->imo_membership[i], NULL);
 					gap++;
 				} else if (gap != 0)
 					imo->imo_membership[i - gap] =
 					    imo->imo_membership[i];
 			}
 			imo->imo_num_memberships -= gap;
 		}
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 /*
  * Lookup a PCB based on the local address and port.  Caller must hold the
  * hash lock.  No inpcb locks or references are acquired.
  */
 #define INP_LOOKUP_MAPPED_PCB_COST	3
 struct inpcb *
 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
     u_short lport, int lookupflags, struct ucred *cred)
 {
 	struct inpcb *inp;
 #ifdef INET6
 	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
 #else
 	int matchwild = 3;
 #endif
 	int wildcard;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
 		struct inpcbhead *head;
 		/*
 		 * Look for an unconnected (wildcard foreign addr) PCB that
 		 * matches the local address and port we're looking for.
 		 */
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
 			    inp->inp_laddr.s_addr == laddr.s_addr &&
 			    inp->inp_lport == lport) {
 				/*
 				 * Found?
 				 */
 				if (cred == NULL ||
 				    prison_equal_ip4(cred->cr_prison,
 					inp->inp_cred->cr_prison))
 					return (inp);
 			}
 		}
 		/*
 		 * Not found.
 		 */
 		return (NULL);
 	} else {
 		struct inpcbporthead *porthash;
 		struct inpcbport *phd;
 		struct inpcb *match = NULL;
 		/*
 		 * Best fit PCB lookup.
 		 *
 		 * First see if this local port is in use by looking on the
 		 * port hash list.
 		 */
 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
 		    pcbinfo->ipi_porthashmask)];
 		CK_LIST_FOREACH(phd, porthash, phd_hash) {
 			if (phd->phd_port == lport)
 				break;
 		}
 		if (phd != NULL) {
 			/*
 			 * Port is in use by one or more PCBs. Look for best
 			 * fit.
 			 */
 			CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
 				wildcard = 0;
 				if (cred != NULL &&
 				    !prison_equal_ip4(inp->inp_cred->cr_prison,
 					cred->cr_prison))
 					continue;
 #ifdef INET6
 				/* XXX inp locking */
 				if ((inp->inp_vflag & INP_IPV4) == 0)
 					continue;
 				/*
 				 * We never select the PCB that has
 				 * INP_IPV6 flag and is bound to :: if
 				 * we have another PCB which is bound
 				 * to 0.0.0.0.  If a PCB has the
 				 * INP_IPV6 flag, then we set its cost
 				 * higher than IPv4 only PCBs.
 				 *
 				 * Note that the case only happens
 				 * when a socket is bound to ::, under
 				 * the condition that the use of the
 				 * mapped address is allowed.
 				 */
 				if ((inp->inp_vflag & INP_IPV6) != 0)
 					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
 #endif
 				if (inp->inp_faddr.s_addr != INADDR_ANY)
 					wildcard++;
 				if (inp->inp_laddr.s_addr != INADDR_ANY) {
 					if (laddr.s_addr == INADDR_ANY)
 						wildcard++;
 					else if (inp->inp_laddr.s_addr != laddr.s_addr)
 						continue;
 				} else {
 					if (laddr.s_addr != INADDR_ANY)
 						wildcard++;
 				}
 				if (wildcard < matchwild) {
 					match = inp;
 					matchwild = wildcard;
 					if (matchwild == 0)
 						break;
 				}
 			}
 		}
 		return (match);
 	}
 }
 #undef INP_LOOKUP_MAPPED_PCB_COST
 
 static struct inpcb *
 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
     const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
     uint16_t fport, int lookupflags)
 {
 	struct inpcb *local_wild;
 	const struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	uint32_t idx;
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	hdr = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
 
 	/*
 	 * Order of socket selection:
 	 * 1. non-wild.
 	 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
 	 *
 	 * NOTE:
 	 * - Load balanced group does not contain jailed sockets
 	 * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
 	 */
 	local_wild = NULL;
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 #ifdef INET6
 		if (!(grp->il_vflag & INP_IPV4))
 			continue;
 #endif
 		if (grp->il_lport != lport)
 			continue;
 
 		idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) %
 		    grp->il_inpcnt;
 		if (grp->il_laddr.s_addr == laddr->s_addr)
 			return (grp->il_inp[idx]);
 		if (grp->il_laddr.s_addr == INADDR_ANY &&
 		    (lookupflags & INPLOOKUP_WILDCARD) != 0)
 			local_wild = grp->il_inp[idx];
 	}
 	return (local_wild);
 }
 
 #ifdef PCBGROUP
 /*
  * Lookup PCB in hash list, using pcbgroup tables.
  */
 static struct inpcb *
 in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
     struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
     u_int lport_arg, int lookupflags, struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 	bool locked;
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	INP_GROUP_LOCK(pcbgroup);
 	head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
 	    pcbgroup->ipg_hashmask)];
 	CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_laddr.s_addr == laddr.s_addr &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP4))
 				goto found;
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL) {
 		inp = tmpinp;
 		goto found;
 	}
 
 #ifdef	RSS
 	/*
 	 * For incoming connections, we may wish to do a wildcard
 	 * match for an RSS-local socket.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		struct inpcbhead *head;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 
 		head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
 		    lport, 0, pcbgroup->ipg_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 #ifdef INET6
 		if (inp == NULL)
 			inp = local_wild_mapped;
 #endif
 		if (inp != NULL)
 			goto found;
 	}
 #endif
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		struct inpcbhead *head;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 		head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_wildmask)];
 		CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 #ifdef INET6
 		if (inp == NULL)
 			inp = local_wild_mapped;
 #endif
 		if (inp != NULL)
 			goto found;
 	} /* if (lookupflags & INPLOOKUP_WILDCARD) */
 	INP_GROUP_UNLOCK(pcbgroup);
 	return (NULL);
 
 found:
 	if (lookupflags & INPLOOKUP_WLOCKPCB)
 		locked = INP_TRY_WLOCK(inp);
 	else if (lookupflags & INPLOOKUP_RLOCKPCB)
 		locked = INP_TRY_RLOCK(inp);
 	else
 		panic("%s: locking bug", __func__);
 	if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) {
 		if (lookupflags & INPLOOKUP_WLOCKPCB)
 			INP_WUNLOCK(inp);
 		else
 			INP_RUNLOCK(inp);
 		return (NULL);
 	} else if (!locked)
 		in_pcbref(inp);
 	INP_GROUP_UNLOCK(pcbgroup);
 	if (!locked) {
 		if (lookupflags & INPLOOKUP_WLOCKPCB) {
 			INP_WLOCK(inp);
 			if (in_pcbrele_wlocked(inp))
 				return (NULL);
 		} else {
 			INP_RLOCK(inp);
 			if (in_pcbrele_rlocked(inp))
 				return (NULL);
 		}
 	}
 #ifdef INVARIANTS
 	if (lookupflags & INPLOOKUP_WLOCKPCB)
 		INP_WLOCK_ASSERT(inp);
 	else
 		INP_RLOCK_ASSERT(inp);
 #endif
 	return (inp);
 }
 #endif /* PCBGROUP */
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
  * that the caller has locked the hash list, and will not perform any further
  * locking or reference operations on either the hash list or the connection.
  */
 static struct inpcb *
 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
     struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 
 #ifdef INVARIANTS
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	if (!mtx_owned(&pcbinfo->ipi_hash_lock))
 		MPASS(in_epoch_verbose(net_epoch_preempt, 1));
 #endif
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
 	    pcbinfo->ipi_hashmask)];
 	CK_LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_laddr.s_addr == laddr.s_addr &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP4))
 				return (inp);
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL)
 		return (tmpinp);
 
 	/*
 	 * Then look in lb group (for wildcard match).
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
 		    fport, lookupflags);
 		if (inp != NULL)
 			return (inp);
 	}
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					return (inp);
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 		if (jail_wild != NULL)
 			return (jail_wild);
 		if (local_exact != NULL)
 			return (local_exact);
 		if (local_wild != NULL)
 			return (local_wild);
 #ifdef INET6
 		if (local_wild_mapped != NULL)
 			return (local_wild_mapped);
 #endif
 	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 
 	return (NULL);
 }
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
  * hash list lock, and will return the inpcb locked (i.e., requires
  * INPLOOKUP_LOCKPCB).
  */
 static struct inpcb *
 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp)
 {
 	struct inpcb *inp;
 
 	INP_HASH_RLOCK(pcbinfo);
 	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
 	    (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
 	if (inp != NULL) {
 		if (lookupflags & INPLOOKUP_WLOCKPCB) {
 			INP_WLOCK(inp);
 			if (__predict_false(inp->inp_flags2 & INP_FREED)) {
 				INP_WUNLOCK(inp);
 				inp = NULL;
 			}
 		} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
 			INP_RLOCK(inp);
 			if (__predict_false(inp->inp_flags2 & INP_FREED)) {
 				INP_RUNLOCK(inp);
 				inp = NULL;
 			}
 		} else
 			panic("%s: locking bug", __func__);
 #ifdef INVARIANTS
 		if (inp != NULL) {
 			if (lookupflags & INPLOOKUP_WLOCKPCB)
 				INP_WLOCK_ASSERT(inp);
 			else
 				INP_RLOCK_ASSERT(inp);
 		}
 #endif
 	}
 	INP_HASH_RUNLOCK(pcbinfo);
 	return (inp);
 }
 
 /*
  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
  * from which a pre-calculated hash value may be extracted.
  *
  * Possibly more of this logic should be in in_pcbgroup.c.
  */
 struct inpcb *
 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
 {
 #if defined(PCBGROUP) && !defined(RSS)
 	struct inpcbgroup *pcbgroup;
 #endif
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 	/*
 	 * When not using RSS, use connection groups in preference to the
 	 * reservation table when looking up 4-tuples.  When using RSS, just
 	 * use the reservation table, due to the cost of the Toeplitz hash
 	 * in software.
 	 *
 	 * XXXRW: This policy belongs in the pcbgroup code, as in principle
 	 * we could be doing RSS with a non-Toeplitz hash that is affordable
 	 * in software.
 	 */
 #if defined(PCBGROUP) && !defined(RSS)
 	if (in_pcbgroup_enabled(pcbinfo)) {
 		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 	}
 #endif
 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
 
 struct inpcb *
 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, struct mbuf *m)
 {
 #ifdef PCBGROUP
 	struct inpcbgroup *pcbgroup;
 #endif
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 #ifdef PCBGROUP
 	/*
 	 * If we can use a hardware-generated hash to look up the connection
 	 * group, use that connection group to find the inpcb.  Otherwise
 	 * fall back on a software hash -- or the reservation table if we're
 	 * using RSS.
 	 *
 	 * XXXRW: As above, that policy belongs in the pcbgroup code.
 	 */
 	if (in_pcbgroup_enabled(pcbinfo) &&
 	    !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
 		pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
 		    m->m_pkthdr.flowid);
 		if (pcbgroup != NULL)
 			return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
 			    fport, laddr, lport, lookupflags, ifp));
 #ifndef RSS
 		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 #endif
 	}
 #endif
 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
 #endif /* INET */
 
 /*
  * Insert PCB onto various hash lists.
  */
 static int
 in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
 {
 	struct inpcbhead *pcbhash;
 	struct inpcbporthead *pcbporthash;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbport *phd;
 	u_int32_t hashkey_faddr;
 	int so_options;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
 	    ("in_pcbinshash: INP_INHASHLIST"));
 
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
 	else
 #endif
 	hashkey_faddr = inp->inp_faddr.s_addr;
 
 	pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
 		 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 
 	pcbporthash = &pcbinfo->ipi_porthashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
 
 	/*
 	 * Add entry to load balance group.
 	 * Only do this if SO_REUSEPORT_LB is set.
 	 */
 	so_options = inp_so_options(inp);
 	if (so_options & SO_REUSEPORT_LB) {
 		int ret = in_pcbinslbgrouphash(inp);
 		if (ret) {
 			/* pcb lb group malloc fail (ret=ENOBUFS). */
 			return (ret);
 		}
 	}
 
 	/*
 	 * Go through port list and look for a head for this lport.
 	 */
 	CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
 		if (phd->phd_port == inp->inp_lport)
 			break;
 	}
 	/*
 	 * If none exists, malloc one and tack it on.
 	 */
 	if (phd == NULL) {
 		phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
 		if (phd == NULL) {
 			return (ENOBUFS); /* XXX */
 		}
 		bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context));
 		phd->phd_port = inp->inp_lport;
 		CK_LIST_INIT(&phd->phd_pcblist);
 		CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
 	}
 	inp->inp_phd = phd;
 	CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
 	CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
 	inp->inp_flags |= INP_INHASHLIST;
 #ifdef PCBGROUP
 	if (do_pcbgroup_update)
 		in_pcbgroup_update(inp);
 #endif
 	return (0);
 }
 
 /*
  * For now, there are two public interfaces to insert an inpcb into the hash
  * lists -- one that does update pcbgroups, and one that doesn't.  The latter
  * is used only in the TCP syncache, where in_pcbinshash is called before the
  * full 4-tuple is set for the inpcb, and we don't want to install in the
  * pcbgroup until later.
  *
  * XXXRW: This seems like a misfeature.  in_pcbinshash should always update
  * connection groups, and partially initialised inpcbs should not be exposed
  * to either reservation hash tables or pcbgroups.
  */
 int
 in_pcbinshash(struct inpcb *inp)
 {
 
 	return (in_pcbinshash_internal(inp, 1));
 }
 
 int
 in_pcbinshash_nopcbgroup(struct inpcb *inp)
 {
 
 	return (in_pcbinshash_internal(inp, 0));
 }
 
 /*
  * Move PCB to the proper hash bucket when { faddr, fport } have  been
  * changed. NOTE: This does not handle the case of the lport changing (the
  * hashed port list would have to be updated as well), so the lport must
  * not change after in_pcbinshash() has been called.
  */
 void
 in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbhead *head;
 	u_int32_t hashkey_faddr;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT(inp->inp_flags & INP_INHASHLIST,
 	    ("in_pcbrehash: !INP_INHASHLIST"));
 
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
 	else
 #endif
 	hashkey_faddr = inp->inp_faddr.s_addr;
 
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
 		inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 
 	CK_LIST_REMOVE(inp, inp_hash);
 	CK_LIST_INSERT_HEAD(head, inp, inp_hash);
 
 #ifdef PCBGROUP
 	if (m != NULL)
 		in_pcbgroup_update_mbuf(inp, m);
 	else
 		in_pcbgroup_update(inp);
 #endif
 }
 
 void
 in_pcbrehash(struct inpcb *inp)
 {
 
 	in_pcbrehash_mbuf(inp, NULL);
 }
 
 /*
  * Remove PCB from various lists.
  */
 static void
 in_pcbremlists(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 
 #ifdef INVARIANTS
 	if (pcbinfo == &V_tcbinfo) {
 		INP_INFO_RLOCK_ASSERT(pcbinfo);
 	} else {
 		INP_INFO_WLOCK_ASSERT(pcbinfo);
 	}
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	INP_LIST_WLOCK_ASSERT(pcbinfo);
 
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	if (inp->inp_flags & INP_INHASHLIST) {
 		struct inpcbport *phd = inp->inp_phd;
 
 		INP_HASH_WLOCK(pcbinfo);
 
 		/* XXX: Only do if SO_REUSEPORT_LB set? */
 		in_pcbremlbgrouphash(inp);
 
 		CK_LIST_REMOVE(inp, inp_hash);
 		CK_LIST_REMOVE(inp, inp_portlist);
 		if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
 			CK_LIST_REMOVE(phd, phd_hash);
 			epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free);
 		}
 		INP_HASH_WUNLOCK(pcbinfo);
 		inp->inp_flags &= ~INP_INHASHLIST;
 	}
 	CK_LIST_REMOVE(inp, inp_list);
 	pcbinfo->ipi_count--;
 #ifdef PCBGROUP
 	in_pcbgroup_remove(inp);
 #endif
 }
 
 /*
  * Check for alternatives when higher level complains
  * about service problems.  For now, invalidate cached
  * routing information.  If the route was created dynamically
  * (by a redirect), time to try a default gateway again.
  */
 void
 in_losing(struct inpcb *inp)
 {
 
 	RO_INVALIDATE_CACHE(&inp->inp_route);
 	return;
 }
 
 /*
  * A set label operation has occurred at the socket layer, propagate the
  * label change into the in_pcb for the socket.
  */
 void
 in_pcbsosetlabel(struct socket *so)
 {
 #ifdef MAC
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
 
 	INP_WLOCK(inp);
 	SOCK_LOCK(so);
 	mac_inpcb_sosetlabel(so, inp);
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(inp);
 #endif
 }
 
 /*
  * ipport_tick runs once per second, determining if random port allocation
  * should be continued.  If more than ipport_randomcps ports have been
  * allocated in the last second, then we return to sequential port
  * allocation. We return to random allocation only once we drop below
  * ipport_randomcps for at least ipport_randomtime seconds.
  */
 static void
 ipport_tick(void *xtp)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);	/* XXX appease INVARIANTS here */
 		if (V_ipport_tcpallocs <=
 		    V_ipport_tcplastcount + V_ipport_randomcps) {
 			if (V_ipport_stoprandom > 0)
 				V_ipport_stoprandom--;
 		} else
 			V_ipport_stoprandom = V_ipport_randomtime;
 		V_ipport_tcplastcount = V_ipport_tcpallocs;
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
 }
 
 static void
 ip_fini(void *xtp)
 {
 
 	callout_stop(&ipport_tick_callout);
 }
 
 /* 
  * The ipport_callout should start running at about the time we attach the
  * inet or inet6 domains.
  */
 static void
 ipport_tick_init(const void *unused __unused)
 {
 
 	/* Start ipport_tick. */
 	callout_init(&ipport_tick_callout, 1);
 	callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
 }
 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, 
     ipport_tick_init, NULL);
 
 void
 inp_wlock(struct inpcb *inp)
 {
 
 	INP_WLOCK(inp);
 }
 
 void
 inp_wunlock(struct inpcb *inp)
 {
 
 	INP_WUNLOCK(inp);
 }
 
 void
 inp_rlock(struct inpcb *inp)
 {
 
 	INP_RLOCK(inp);
 }
 
 void
 inp_runlock(struct inpcb *inp)
 {
 
 	INP_RUNLOCK(inp);
 }
 
 #ifdef INVARIANT_SUPPORT
 void
 inp_lock_assert(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 }
 
 void
 inp_unlock_assert(struct inpcb *inp)
 {
 
 	INP_UNLOCK_ASSERT(inp);
 }
 #endif
 
 void
 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
 {
 	struct inpcb *inp;
 
 	INP_INFO_WLOCK(&V_tcbinfo);
 	CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
 		INP_WLOCK(inp);
 		func(inp, arg);
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 }
 
 struct socket *
 inp_inpcbtosocket(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return (inp->inp_socket);
 }
 
 struct tcpcb *
 inp_inpcbtotcpcb(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return ((struct tcpcb *)inp->inp_ppcb);
 }
 
 int
 inp_ip_tos_get(const struct inpcb *inp)
 {
 
 	return (inp->inp_ip_tos);
 }
 
 void
 inp_ip_tos_set(struct inpcb *inp, int val)
 {
 
 	inp->inp_ip_tos = val;
 }
 
 void
 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
     uint32_t *faddr, uint16_t *fp)
 {
 
 	INP_LOCK_ASSERT(inp);
 	*laddr = inp->inp_laddr.s_addr;
 	*faddr = inp->inp_faddr.s_addr;
 	*lp = inp->inp_lport;
 	*fp = inp->inp_fport;
 }
 
 struct inpcb *
 so_sotoinpcb(struct socket *so)
 {
 
 	return (sotoinpcb(so));
 }
 
 struct tcpcb *
 so_sototcpcb(struct socket *so)
 {
 
 	return (sototcpcb(so));
 }
 
 /*
  * Create an external-format (``xinpcb'') structure using the information in
  * the kernel-format in_pcb structure pointed to by inp.  This is done to
  * reduce the spew of irrelevant information over this interface, to isolate
  * user code from changes in the kernel structure, and potentially to provide
  * information-hiding if we decide that some of this information should be
  * hidden from users.
  */
 void
 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
 {
 
 	bzero(xi, sizeof(*xi));
 	xi->xi_len = sizeof(struct xinpcb);
 	if (inp->inp_socket)
 		sotoxsocket(inp->inp_socket, &xi->xi_socket);
 	bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
 	xi->inp_gencnt = inp->inp_gencnt;
 	xi->inp_ppcb = (uintptr_t)inp->inp_ppcb;
 	xi->inp_flow = inp->inp_flow;
 	xi->inp_flowid = inp->inp_flowid;
 	xi->inp_flowtype = inp->inp_flowtype;
 	xi->inp_flags = inp->inp_flags;
 	xi->inp_flags2 = inp->inp_flags2;
 	xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket;
 	xi->in6p_cksum = inp->in6p_cksum;
 	xi->in6p_hops = inp->in6p_hops;
 	xi->inp_ip_tos = inp->inp_ip_tos;
 	xi->inp_vflag = inp->inp_vflag;
 	xi->inp_ip_ttl = inp->inp_ip_ttl;
 	xi->inp_ip_p = inp->inp_ip_p;
 	xi->inp_ip_minttl = inp->inp_ip_minttl;
 }
 
 #ifdef DDB
 static void
 db_print_indent(int indent)
 {
 	int i;
 
 	for (i = 0; i < indent; i++)
 		db_printf(" ");
 }
 
 static void
 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
 {
 	char faddr_str[48], laddr_str[48];
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inc);
 
 	indent += 2;
 
 #ifdef INET6
 	if (inc->inc_flags & INC_ISIPV6) {
 		/* IPv6. */
 		ip6_sprintf(laddr_str, &inc->inc6_laddr);
 		ip6_sprintf(faddr_str, &inc->inc6_faddr);
 	} else
 #endif
 	{
 		/* IPv4. */
 		inet_ntoa_r(inc->inc_laddr, laddr_str);
 		inet_ntoa_r(inc->inc_faddr, faddr_str);
 	}
 	db_print_indent(indent);
 	db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
 	    ntohs(inc->inc_lport));
 	db_print_indent(indent);
 	db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
 	    ntohs(inc->inc_fport));
 }
 
 static void
 db_print_inpflags(int inp_flags)
 {
 	int comma;
 
 	comma = 0;
 	if (inp_flags & INP_RECVOPTS) {
 		db_printf("%sINP_RECVOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVRETOPTS) {
 		db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVDSTADDR) {
 		db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_ORIGDSTADDR) {
 		db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HDRINCL) {
 		db_printf("%sINP_HDRINCL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HIGHPORT) {
 		db_printf("%sINP_HIGHPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_LOWPORT) {
 		db_printf("%sINP_LOWPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_ANONPORT) {
 		db_printf("%sINP_ANONPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVIF) {
 		db_printf("%sINP_RECVIF", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_MTUDISC) {
 		db_printf("%sINP_MTUDISC", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTTL) {
 		db_printf("%sINP_RECVTTL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_DONTFRAG) {
 		db_printf("%sINP_DONTFRAG", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTOS) {
 		db_printf("%sINP_RECVTOS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_IPV6_V6ONLY) {
 		db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_PKTINFO) {
 		db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPLIMIT) {
 		db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPOPTS) {
 		db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_DSTOPTS) {
 		db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDR) {
 		db_printf("%sIN6P_RTHDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDRDSTOPTS) {
 		db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_TCLASS) {
 		db_printf("%sIN6P_TCLASS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_AUTOFLOWLABEL) {
 		db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_TIMEWAIT) {
 		db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_ONESBCAST) {
 		db_printf("%sINP_ONESBCAST", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_DROPPED) {
 		db_printf("%sINP_DROPPED", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_SOCKREF) {
 		db_printf("%sINP_SOCKREF", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & IN6P_RFC2292) {
 		db_printf("%sIN6P_RFC2292", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_MTU) {
 		db_printf("IN6P_MTU%s", comma ? ", " : "");
 		comma = 1;
 	}
 }
 
 static void
 db_print_inpvflag(u_char inp_vflag)
 {
 	int comma;
 
 	comma = 0;
 	if (inp_vflag & INP_IPV4) {
 		db_printf("%sINP_IPV4", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_vflag & INP_IPV6) {
 		db_printf("%sINP_IPV6", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_vflag & INP_IPV6PROTO) {
 		db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
 		comma  = 1;
 	}
 }
 
 static void
 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
 {
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inp);
 
 	indent += 2;
 
 	db_print_indent(indent);
 	db_printf("inp_flow: 0x%x\n", inp->inp_flow);
 
 	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
 
 	db_print_indent(indent);
 	db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
 	    inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
 
 	db_print_indent(indent);
 	db_printf("inp_label: %p   inp_flags: 0x%x (",
 	   inp->inp_label, inp->inp_flags);
 	db_print_inpflags(inp->inp_flags);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
 	    inp->inp_vflag);
 	db_print_inpvflag(inp->inp_vflag);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
 	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
 
 	db_print_indent(indent);
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6) {
 		db_printf("in6p_options: %p   in6p_outputopts: %p   "
 		    "in6p_moptions: %p\n", inp->in6p_options,
 		    inp->in6p_outputopts, inp->in6p_moptions);
 		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
 		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
 		    inp->in6p_hops);
 	} else
 #endif
 	{
 		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
 		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
 		    inp->inp_options, inp->inp_moptions);
 	}
 
 	db_print_indent(indent);
 	db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
 	    (uintmax_t)inp->inp_gencnt);
 }
 
 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
 {
 	struct inpcb *inp;
 
 	if (!have_addr) {
 		db_printf("usage: show inpcb <addr>\n");
 		return;
 	}
 	inp = (struct inpcb *)addr;
 
 	db_print_inpcb(inp, "inpcb", 0);
 }
 #endif /* DDB */
 
 #ifdef RATELIMIT
 /*
  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
  * if any.
  */
 int
 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
 {
 	union if_snd_tag_modify_params params = {
 		.rate_limit.max_rate = max_pacing_rate,
 	};
 	struct m_snd_tag *mst;
 	struct ifnet *ifp;
 	int error;
 
 	mst = inp->inp_snd_tag;
 	if (mst == NULL)
 		return (EINVAL);
 
 	ifp = mst->ifp;
 	if (ifp == NULL)
 		return (EINVAL);
 
 	if (ifp->if_snd_tag_modify == NULL) {
 		error = EOPNOTSUPP;
 	} else {
 		error = ifp->if_snd_tag_modify(mst, &params);
 	}
 	return (error);
 }
 
 /*
  * Query existing TX rate limit based on the existing
  * "inp->inp_snd_tag", if any.
  */
 int
 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
 {
 	union if_snd_tag_query_params params = { };
 	struct m_snd_tag *mst;
 	struct ifnet *ifp;
 	int error;
 
 	mst = inp->inp_snd_tag;
 	if (mst == NULL)
 		return (EINVAL);
 
 	ifp = mst->ifp;
 	if (ifp == NULL)
 		return (EINVAL);
 
 	if (ifp->if_snd_tag_query == NULL) {
 		error = EOPNOTSUPP;
 	} else {
 		error = ifp->if_snd_tag_query(mst, &params);
 		if (error == 0 &&  p_max_pacing_rate != NULL)
 			*p_max_pacing_rate = params.rate_limit.max_rate;
 	}
 	return (error);
 }
 
 /*
  * Query existing TX queue level based on the existing
  * "inp->inp_snd_tag", if any.
  */
 int
 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
 {
 	union if_snd_tag_query_params params = { };
 	struct m_snd_tag *mst;
 	struct ifnet *ifp;
 	int error;
 
 	mst = inp->inp_snd_tag;
 	if (mst == NULL)
 		return (EINVAL);
 
 	ifp = mst->ifp;
 	if (ifp == NULL)
 		return (EINVAL);
 
 	if (ifp->if_snd_tag_query == NULL)
 		return (EOPNOTSUPP);
 
 	error = ifp->if_snd_tag_query(mst, &params);
 	if (error == 0 &&  p_txqueue_level != NULL)
 		*p_txqueue_level = params.rate_limit.queue_level;
 	return (error);
 }
 
 /*
  * Allocate a new TX rate limit send tag from the network interface
  * given by the "ifp" argument and save it in "inp->inp_snd_tag":
  */
 int
 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
     uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
 {
 	union if_snd_tag_alloc_params params = {
 		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
 		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
 		.rate_limit.hdr.flowid = flowid,
 		.rate_limit.hdr.flowtype = flowtype,
 		.rate_limit.max_rate = max_pacing_rate,
 	};
 	int error;
 
 	INP_WLOCK_ASSERT(inp);
 
 	if (inp->inp_snd_tag != NULL)
 		return (EINVAL);
 
 	if (ifp->if_snd_tag_alloc == NULL) {
 		error = EOPNOTSUPP;
 	} else {
 		error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
-
-		/*
-		 * At success increment the refcount on
-		 * the send tag's network interface:
-		 */
-		if (error == 0)
-			if_ref(inp->inp_snd_tag->ifp);
 	}
 	return (error);
 }
 
 /*
  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
  * if any:
  */
 void
 in_pcbdetach_txrtlmt(struct inpcb *inp)
 {
 	struct m_snd_tag *mst;
-	struct ifnet *ifp;
 
 	INP_WLOCK_ASSERT(inp);
 
 	mst = inp->inp_snd_tag;
 	inp->inp_snd_tag = NULL;
 
 	if (mst == NULL)
 		return;
 
-	ifp = mst->ifp;
-	if (ifp == NULL)
-		return;
-
-	/*
-	 * If the device was detached while we still had reference(s)
-	 * on the ifp, we assume if_snd_tag_free() was replaced with
-	 * stubs.
-	 */
-	ifp->if_snd_tag_free(mst);
-
-	/* release reference count on network interface */
-	if_rele(ifp);
+	m_snd_tag_rele(mst);
 }
 
 /*
  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
  * is set in the fast path and will attach/detach/modify the TX rate
  * limit send tag based on the socket's so_max_pacing_rate value.
  */
 void
 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
 {
 	struct socket *socket;
 	uint32_t max_pacing_rate;
 	bool did_upgrade;
 	int error;
 
 	if (inp == NULL)
 		return;
 
 	socket = inp->inp_socket;
 	if (socket == NULL)
 		return;
 
 	if (!INP_WLOCKED(inp)) {
 		/*
 		 * NOTE: If the write locking fails, we need to bail
 		 * out and use the non-ratelimited ring for the
 		 * transmit until there is a new chance to get the
 		 * write lock.
 		 */
 		if (!INP_TRY_UPGRADE(inp))
 			return;
 		did_upgrade = 1;
 	} else {
 		did_upgrade = 0;
 	}
 
 	/*
 	 * NOTE: The so_max_pacing_rate value is read unlocked,
 	 * because atomic updates are not required since the variable
 	 * is checked at every mbuf we send. It is assumed that the
 	 * variable read itself will be atomic.
 	 */
 	max_pacing_rate = socket->so_max_pacing_rate;
+
+	/*
+	 * If the existing send tag is for the wrong interface due to
+	 * a route change, first drop the existing tag.  Set the
+	 * CHANGED flag so that we will keep trying to allocate a new
+	 * tag if we fail to allocate one this time.
+	 */
+	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
+		in_pcbdetach_txrtlmt(inp);
+		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
+	}
 
 	/*
 	 * NOTE: When attaching to a network interface a reference is
 	 * made to ensure the network interface doesn't go away until
 	 * all ratelimit connections are gone. The network interface
 	 * pointers compared below represent valid network interfaces,
 	 * except when comparing towards NULL.
 	 */
 	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
 		error = 0;
 	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
 		if (inp->inp_snd_tag != NULL)
 			in_pcbdetach_txrtlmt(inp);
 		error = 0;
 	} else if (inp->inp_snd_tag == NULL) {
 		/*
 		 * In order to utilize packet pacing with RSS, we need
 		 * to wait until there is a valid RSS hash before we
 		 * can proceed:
 		 */
 		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
 			error = EAGAIN;
 		} else {
 			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
 			    mb->m_pkthdr.flowid, max_pacing_rate);
 		}
 	} else {
 		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
 	}
 	if (error == 0 || error == EOPNOTSUPP)
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	if (did_upgrade)
 		INP_DOWNGRADE(inp);
 }
 
 /*
  * Track route changes for TX rate limiting.
  */
 void
 in_pcboutput_eagain(struct inpcb *inp)
 {
 	bool did_upgrade;
 
 	if (inp == NULL)
 		return;
 
 	if (inp->inp_snd_tag == NULL)
 		return;
 
 	if (!INP_WLOCKED(inp)) {
 		/*
 		 * NOTE: If the write locking fails, we need to bail
 		 * out and use the non-ratelimited ring for the
 		 * transmit until there is a new chance to get the
 		 * write lock.
 		 */
 		if (!INP_TRY_UPGRADE(inp))
 			return;
 		did_upgrade = 1;
 	} else {
 		did_upgrade = 0;
 	}
 
 	/* detach rate limiting */
 	in_pcbdetach_txrtlmt(inp);
 
 	/* make sure new mbuf send tag allocation is made */
 	inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 
 	if (did_upgrade)
 		INP_DOWNGRADE(inp);
 }
 #endif /* RATELIMIT */
Index: head/sys/netinet/ip_output.c
===================================================================
--- head/sys/netinet/ip_output.c	(revision 348253)
+++ head/sys/netinet/ip_output.c	(revision 348254)
@@ -1,1494 +1,1507 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_ratelimit.h"
 #include "opt_ipsec.h"
 #include "opt_mbuf_stress_test.h"
 #include "opt_mpath.h"
 #include "opt_route.h"
 #include "opt_sctp.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/ucred.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_llatbl.h>
 #include <net/netisr.h>
 #include <net/pfil.h>
 #include <net/route.h>
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_rss.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #ifdef SCTP
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #endif
 
 #include <netipsec/ipsec_support.h>
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifdef MBUF_STRESS_TEST
 static int mbuf_frag_size = 0;
 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
 	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
 #endif
 
 static void	ip_mloopback(struct ifnet *, const struct mbuf *, int);
 
 
 extern int in_mcast_loop;
 extern	struct protosw inetsw[];
 
 static inline int
 ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, struct inpcb *inp,
     struct sockaddr_in *dst, int *fibnum, int *error)
 {
 	struct m_tag *fwd_tag = NULL;
 	struct mbuf *m;
 	struct in_addr odst;
 	struct ip *ip;
 
 	m = *mp;
 	ip = mtod(m, struct ip *);
 
 	/* Run through list of hooks for output packets. */
 	odst.s_addr = ip->ip_dst.s_addr;
 	switch (pfil_run_hooks(V_inet_pfil_head, mp, ifp, PFIL_OUT, inp)) {
 	case PFIL_DROPPED:
 		*error = EPERM;
 		/* FALLTHROUGH */
 	case PFIL_CONSUMED:
 		return 1; /* Finished */
 	case PFIL_PASS:
 		*error = 0;
 	}
 	m = *mp;
 	ip = mtod(m, struct ip *);
 
 	/* See if destination IP address was changed by packet filter. */
 	if (odst.s_addr != ip->ip_dst.s_addr) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip_input(). */
 		if (in_localip(ip->ip_dst)) {
 			m->m_flags |= M_FASTFWD_OURS;
 			if (m->m_pkthdr.rcvif == NULL)
 				m->m_pkthdr.rcvif = V_loif;
 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 				m->m_pkthdr.csum_flags |=
 					CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 			m->m_pkthdr.csum_flags |=
 				CSUM_IP_CHECKED | CSUM_IP_VALID;
 #ifdef SCTP
 			if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 			*error = netisr_queue(NETISR_IP, m);
 			return 1; /* Finished */
 		}
 
 		bzero(dst, sizeof(*dst));
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = ip->ip_dst;
 
 		return -1; /* Reloop */
 	}
 	/* See if fib was changed by packet filter. */
 	if ((*fibnum) != M_GETFIB(m)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		*fibnum = M_GETFIB(m);
 		return -1; /* Reloop for FIB change */
 	}
 
 	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		if (m->m_pkthdr.rcvif == NULL)
 			m->m_pkthdr.rcvif = V_loif;
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 			m->m_pkthdr.csum_flags |=
 				CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 #ifdef SCTP
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 		m->m_pkthdr.csum_flags |=
 			CSUM_IP_CHECKED | CSUM_IP_VALID;
 
 		*error = netisr_queue(NETISR_IP, m);
 		return 1; /* Finished */
 	}
 	/* Or forward to some other address? */
 	if ((m->m_flags & M_IP_NEXTHOP) &&
 	    ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
 		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m->m_flags &= ~M_IP_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 
 		return -1; /* Reloop for CHANGE of dst */
 	}
 
 	return 0;
 }
 
+static int
+ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m,
+    const struct sockaddr_in *gw, struct route *ro)
+{
+	struct m_snd_tag *mst;
+	int error;
+
+	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
+	mst = NULL;
+
+#ifdef RATELIMIT
+	if (inp != NULL) {
+		if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
+		    (inp->inp_snd_tag != NULL &&
+		    inp->inp_snd_tag->ifp != ifp))
+			in_pcboutput_txrtlmt(inp, ifp, m);
+
+		if (inp->inp_snd_tag != NULL)
+			mst = inp->inp_snd_tag;
+	}
+#endif
+	if (mst != NULL) {
+		KASSERT(m->m_pkthdr.rcvif == NULL,
+		    ("trying to add a send tag to a forwarded packet"));
+		if (mst->ifp != ifp) {
+			error = EAGAIN;
+			goto done;
+		}
+
+		/* stamp send tag on mbuf */
+		m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
+		m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
+	}
+
+	error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro);
+
+done:
+	/* Check for route change invalidating send tags. */
+#ifdef RATELIMIT
+	if (error == EAGAIN)
+		in_pcboutput_eagain(inp);
+#endif
+	return (error);
+}
+
 /*
  * IP output.  The packet in mbuf chain m contains a skeletal IP
  * header (with len, off, ttl, proto, tos, src, dst).
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
  * If route ro is present and has ro_rt initialized, route lookup would be
  * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
  * then result of route lookup is stored in ro->ro_rt.
  *
  * In the IP forwarding case, the packet will arrive with options already
  * inserted, so must have a NULL opt pointer.
  */
 int
 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
     struct ip_moptions *imo, struct inpcb *inp)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct epoch_tracker et;
 	struct ip *ip;
 	struct ifnet *ifp = NULL;	/* keep compiler happy */
 	struct mbuf *m0;
 	int hlen = sizeof (struct ip);
 	int mtu;
 	int error = 0;
 	struct sockaddr_in *dst, sin;
 	const struct sockaddr_in *gw;
 	struct in_ifaddr *ia;
 	struct in_addr src;
 	int isbroadcast;
 	uint16_t ip_len, ip_off;
 	uint32_t fibnum;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	int no_route_but_check_spd = 0;
 #endif
 	M_ASSERTPKTHDR(m);
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		M_SETFIB(m, inp->inp_inc.inc_fibnum);
 		if ((flags & IP_NODEFAULTFLOWID) == 0) {
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 		}
 #ifdef NUMA
 		m->m_pkthdr.numa_domain = inp->inp_numa_domain;
 #endif
 	}
 
 	if (opt) {
 		int len = 0;
 		m = ip_insertoptions(m, opt, &len);
 		if (len != 0)
 			hlen = len; /* ip->ip_hl is updated above */
 	}
 	ip = mtod(m, struct ip *);
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = hlen >> 2;
 		ip_fillid(ip);
 	} else {
 		/* Header already set, fetch hlen from there */
 		hlen = ip->ip_hl << 2;
 	}
 	if ((flags & IP_FORWARDING) == 0)
 		IPSTAT_INC(ips_localout);
 
 	/*
 	 * dst/gw handling:
 	 *
 	 * gw is readonly but can point either to dst OR rt_gateway,
 	 * therefore we need restore gw if we're redoing lookup.
 	 */
 	fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
 	if (ro != NULL)
 		dst = (struct sockaddr_in *)&ro->ro_dst;
 	else
 		dst = &sin;
 	if (ro == NULL || ro->ro_rt == NULL) {
 		bzero(dst, sizeof(*dst));
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = ip->ip_dst;
 	}
 	gw = dst;
 	NET_EPOCH_ENTER(et);
 again:
 	/*
 	 * Validate route against routing table additions;
 	 * a better/more specific route might have been added.
 	 */
 	if (inp != NULL && ro != NULL && ro->ro_rt != NULL)
 		RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum);
 	/*
 	 * If there is a cached route,
 	 * check that it is to the same destination
 	 * and is still up.  If not, free it and try again.
 	 * The address family should also be checked in case of sharing the
 	 * cache with IPv6.
 	 * Also check whether routing cache needs invalidation.
 	 */
 	if (ro != NULL && ro->ro_rt != NULL &&
 	    ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
 	    ro->ro_rt->rt_ifp == NULL || !RT_LINK_IS_UP(ro->ro_rt->rt_ifp) ||
 	    dst->sin_family != AF_INET ||
 	    dst->sin_addr.s_addr != ip->ip_dst.s_addr))
 		RO_INVALIDATE_CACHE(ro);
 	ia = NULL;
 	/*
 	 * If routing to interface only, short circuit routing lookup.
 	 * The use of an all-ones broadcast address implies this; an
 	 * interface is specified by the broadcast address of an interface,
 	 * or the destination address of a ptp interface.
 	 */
 	if (flags & IP_SENDONES) {
 		if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst),
 						      M_GETFIB(m)))) == NULL &&
 		    (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
 						    M_GETFIB(m)))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		ip->ip_dst.s_addr = INADDR_BROADCAST;
 		dst->sin_addr = ip->ip_dst;
 		ifp = ia->ia_ifp;
 		mtu = ifp->if_mtu;
 		ip->ip_ttl = 1;
 		isbroadcast = 1;
 		src = IA_SIN(ia)->sin_addr;
 	} else if (flags & IP_ROUTETOIF) {
 		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
 						    M_GETFIB(m)))) == NULL &&
 		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0,
 						M_GETFIB(m)))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		ifp = ia->ia_ifp;
 		mtu = ifp->if_mtu;
 		ip->ip_ttl = 1;
 		isbroadcast = ifp->if_flags & IFF_BROADCAST ?
 		    in_ifaddr_broadcast(dst->sin_addr, ia) : 0;
 		src = IA_SIN(ia)->sin_addr;
 	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
 	    imo != NULL && imo->imo_multicast_ifp != NULL) {
 		/*
 		 * Bypass the normal routing lookup for multicast
 		 * packets if the interface is specified.
 		 */
 		ifp = imo->imo_multicast_ifp;
 		mtu = ifp->if_mtu;
 		IFP_TO_IA(ifp, ia, &in_ifa_tracker);
 		isbroadcast = 0;	/* fool gcc */
 		/* Interface may have no addresses. */
 		if (ia != NULL)
 			src = IA_SIN(ia)->sin_addr;
 		else
 			src.s_addr = INADDR_ANY;
 	} else if (ro != NULL) {
 		if (ro->ro_rt == NULL) {
 			/*
 			 * We want to do any cloning requested by the link
 			 * layer, as this is probably required in all cases
 			 * for correct operation (as it is for ARP).
 			 */
 #ifdef RADIX_MPATH
 			rtalloc_mpath_fib(ro,
 			    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
 			    fibnum);
 #else
 			in_rtalloc_ign(ro, 0, fibnum);
 #endif
 			if (ro->ro_rt == NULL ||
 			    (ro->ro_rt->rt_flags & RTF_UP) == 0 ||
 			    ro->ro_rt->rt_ifp == NULL ||
 			    !RT_LINK_IS_UP(ro->ro_rt->rt_ifp)) {
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 				/*
 				 * There is no route for this packet, but it is
 				 * possible that a matching SPD entry exists.
 				 */
 				no_route_but_check_spd = 1;
 				mtu = 0; /* Silence GCC warning. */
 				goto sendit;
 #endif
 				IPSTAT_INC(ips_noroute);
 				error = EHOSTUNREACH;
 				goto bad;
 			}
 		}
 		ia = ifatoia(ro->ro_rt->rt_ifa);
 		ifp = ro->ro_rt->rt_ifp;
 		counter_u64_add(ro->ro_rt->rt_pksent, 1);
 		rt_update_ro_flags(ro);
 		if (ro->ro_rt->rt_flags & RTF_GATEWAY)
 			gw = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
 		if (ro->ro_rt->rt_flags & RTF_HOST)
 			isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
 		else if (ifp->if_flags & IFF_BROADCAST)
 			isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia);
 		else
 			isbroadcast = 0;
 		if (ro->ro_rt->rt_flags & RTF_HOST)
 			mtu = ro->ro_rt->rt_mtu;
 		else
 			mtu = ifp->if_mtu;
 		src = IA_SIN(ia)->sin_addr;
 	} else {
 		struct nhop4_extended nh;
 
 		bzero(&nh, sizeof(nh));
 		if (fib4_lookup_nh_ext(M_GETFIB(m), ip->ip_dst, 0, 0, &nh) !=
 		    0) {
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 			/*
 			 * There is no route for this packet, but it is
 			 * possible that a matching SPD entry exists.
 			 */
 			no_route_but_check_spd = 1;
 			mtu = 0; /* Silence GCC warning. */
 			goto sendit;
 #endif
 			IPSTAT_INC(ips_noroute);
 			error = EHOSTUNREACH;
 			goto bad;
 		}
 		ifp = nh.nh_ifp;
 		mtu = nh.nh_mtu;
 		/*
 		 * We are rewriting here dst to be gw actually, contradicting
 		 * comment at the beginning of the function. However, in this
 		 * case we are always dealing with on stack dst.
 		 * In case if pfil(9) sends us back to beginning of the
 		 * function, the dst would be rewritten by ip_output_pfil().
 		 */
 		MPASS(dst == &sin);
 		dst->sin_addr = nh.nh_addr;
 		ia = nh.nh_ia;
 		src = nh.nh_src;
 		isbroadcast = (((nh.nh_flags & (NHF_HOST | NHF_BROADCAST)) ==
 		    (NHF_HOST | NHF_BROADCAST)) ||
 		    ((ifp->if_flags & IFF_BROADCAST) &&
 		    in_ifaddr_broadcast(dst->sin_addr, ia)));
 	}
 
 	/* Catch a possible divide by zero later. */
 	KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (rt_flags=0x%08x) ifp=%p",
 	    __func__, mtu, ro,
 	    (ro != NULL && ro->ro_rt != NULL) ? ro->ro_rt->rt_flags : 0, ifp));
 
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 		m->m_flags |= M_MCAST;
 		/*
 		 * IP destination address is multicast.  Make sure "gw"
 		 * still points to the address in "ro".  (It may have been
 		 * changed to point to a gateway address, above.)
 		 */
 		gw = dst;
 		/*
 		 * See if the caller provided any multicast options
 		 */
 		if (imo != NULL) {
 			ip->ip_ttl = imo->imo_multicast_ttl;
 			if (imo->imo_multicast_vif != -1)
 				ip->ip_src.s_addr =
 				    ip_mcast_src ?
 				    ip_mcast_src(imo->imo_multicast_vif) :
 				    INADDR_ANY;
 		} else
 			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
 		/*
 		 * Confirm that the outgoing interface supports multicast.
 		 */
 		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
 			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 				IPSTAT_INC(ips_noroute);
 				error = ENETUNREACH;
 				goto bad;
 			}
 		}
 		/*
 		 * If source address not specified yet, use address
 		 * of outgoing interface.
 		 */
 		if (ip->ip_src.s_addr == INADDR_ANY)
 			ip->ip_src = src;
 
 		if ((imo == NULL && in_mcast_loop) ||
 		    (imo && imo->imo_multicast_loop)) {
 			/*
 			 * Loop back multicast datagram if not expressly
 			 * forbidden to do so, even if we are not a member
 			 * of the group; ip_input() will filter it later,
 			 * thus deferring a hash lookup and mutex acquisition
 			 * at the expense of a cheap copy using m_copym().
 			 */
 			ip_mloopback(ifp, m, hlen);
 		} else {
 			/*
 			 * If we are acting as a multicast router, perform
 			 * multicast forwarding as if the packet had just
 			 * arrived on the interface to which we are about
 			 * to send.  The multicast forwarding function
 			 * recursively calls this function, using the
 			 * IP_FORWARDING flag to prevent infinite recursion.
 			 *
 			 * Multicasts that are looped back by ip_mloopback(),
 			 * above, will be forwarded by the ip_input() routine,
 			 * if necessary.
 			 */
 			if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
 				/*
 				 * If rsvp daemon is not running, do not
 				 * set ip_moptions. This ensures that the packet
 				 * is multicast and not just sent down one link
 				 * as prescribed by rsvpd.
 				 */
 				if (!V_rsvp_on)
 					imo = NULL;
 				if (ip_mforward &&
 				    ip_mforward(ip, ifp, m, imo) != 0) {
 					m_freem(m);
 					goto done;
 				}
 			}
 		}
 
 		/*
 		 * Multicasts with a time-to-live of zero may be looped-
 		 * back, above, but must not be transmitted on a network.
 		 * Also, multicasts addressed to the loopback interface
 		 * are not sent -- the above call to ip_mloopback() will
 		 * loop back a copy. ip_input() will drop the copy if
 		 * this host does not belong to the destination group on
 		 * the loopback interface.
 		 */
 		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
 			m_freem(m);
 			goto done;
 		}
 
 		goto sendit;
 	}
 
 	/*
 	 * If the source address is not specified yet, use the address
 	 * of the outoing interface.
 	 */
 	if (ip->ip_src.s_addr == INADDR_ANY)
 		ip->ip_src = src;
 
 	/*
 	 * Look for broadcast address and
 	 * verify user is allowed to send
 	 * such a packet.
 	 */
 	if (isbroadcast) {
 		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 		if ((flags & IP_ALLOWBROADCAST) == 0) {
 			error = EACCES;
 			goto bad;
 		}
 		/* don't allow broadcast messages to be fragmented */
 		if (ip_len > mtu) {
 			error = EMSGSIZE;
 			goto bad;
 		}
 		m->m_flags |= M_BCAST;
 	} else {
 		m->m_flags &= ~M_BCAST;
 	}
 
 sendit:
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	if (IPSEC_ENABLED(ipv4)) {
 		if ((error = IPSEC_OUTPUT(ipv4, m, inp)) != 0) {
 			if (error == EINPROGRESS)
 				error = 0;
 			goto done;
 		}
 	}
 	/*
 	 * Check if there was a route for this packet; return error if not.
 	 */
 	if (no_route_but_check_spd) {
 		IPSTAT_INC(ips_noroute);
 		error = EHOSTUNREACH;
 		goto bad;
 	}
 	/* Update variables that are affected by ipsec4_output(). */
 	ip = mtod(m, struct ip *);
 	hlen = ip->ip_hl << 2;
 #endif /* IPSEC */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (PFIL_HOOKED_OUT(V_inet_pfil_head)) {
 		switch (ip_output_pfil(&m, ifp, inp, dst, &fibnum, &error)) {
 		case 1: /* Finished */
 			goto done;
 
 		case 0: /* Continue normally */
 			ip = mtod(m, struct ip *);
 			break;
 
 		case -1: /* Need to try again */
 			/* Reset everything for a new round */
 			if (ro != NULL) {
 				RO_RTFREE(ro);
 				ro->ro_prepend = NULL;
 			}
 			gw = dst;
 			ip = mtod(m, struct ip *);
 			goto again;
 
 		}
 	}
 
 	/* IN_LOOPBACK must not appear on the wire - RFC1122. */
 	if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) ||
 	    IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) {
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
 			IPSTAT_INC(ips_badaddr);
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 	}
 
 	m->m_pkthdr.csum_flags |= CSUM_IP;
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
 		in_delayed_cksum(m);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #ifdef SCTP
 	if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
 		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 
 	/*
 	 * If small enough for interface, or the interface will take
 	 * care of the fragmentation for us, we can just send directly.
 	 */
 	if (ip_len <= mtu ||
 	    (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
 		ip->ip_sum = 0;
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
 			ip->ip_sum = in_cksum(m, hlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 
 		/*
 		 * Record statistics for this interface address.
 		 * With CSUM_TSO the byte/packet count will be slightly
 		 * incorrect because we count the IP+TCP headers only
 		 * once instead of for every generated packet.
 		 */
 		if (!(flags & IP_FORWARDING) && ia) {
 			if (m->m_pkthdr.csum_flags & CSUM_TSO)
 				counter_u64_add(ia->ia_ifa.ifa_opackets,
 				    m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
 			else
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 
 			counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len);
 		}
 #ifdef MBUF_STRESS_TEST
 		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
 			m = m_fragment(m, M_NOWAIT, mbuf_frag_size);
 #endif
 		/*
 		 * Reset layer specific mbuf flags
 		 * to avoid confusing lower layers.
 		 */
 		m_clrprotoflags(m);
 		IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
-#ifdef RATELIMIT
-		if (inp != NULL) {
-			if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
-				in_pcboutput_txrtlmt(inp, ifp, m);
-			/* stamp send tag on mbuf */
-			m->m_pkthdr.snd_tag = inp->inp_snd_tag;
-		} else {
-			m->m_pkthdr.snd_tag = NULL;
-		}
-#endif
-		error = (*ifp->if_output)(ifp, m,
-		    (const struct sockaddr *)gw, ro);
-#ifdef RATELIMIT
-		/* check for route change */
-		if (error == EAGAIN)
-			in_pcboutput_eagain(inp);
-#endif
+		error = ip_output_send(inp, ifp, m, gw, ro);
 		goto done;
 	}
 
 	/* Balk when DF bit is set or the interface didn't support TSO. */
 	if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
 		error = EMSGSIZE;
 		IPSTAT_INC(ips_cantfrag);
 		goto bad;
 	}
 
 	/*
 	 * Too large for interface; fragment if possible. If successful,
 	 * on return, m will point to a list of packets to be sent.
 	 */
 	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
 	if (error)
 		goto bad;
 	for (; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
 		if (error == 0) {
 			/* Record statistics for this interface address. */
 			if (ia != NULL) {
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_obytes,
 				    m->m_pkthdr.len);
 			}
 			/*
 			 * Reset layer specific mbuf flags
 			 * to avoid confusing upper layers.
 			 */
 			m_clrprotoflags(m);
 
 			IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
 			    mtod(m, struct ip *), NULL);
-#ifdef RATELIMIT
-			if (inp != NULL) {
-				if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
-					in_pcboutput_txrtlmt(inp, ifp, m);
-				/* stamp send tag on mbuf */
-				m->m_pkthdr.snd_tag = inp->inp_snd_tag;
-			} else {
-				m->m_pkthdr.snd_tag = NULL;
-			}
-#endif
-			error = (*ifp->if_output)(ifp, m,
-			    (const struct sockaddr *)gw, ro);
-#ifdef RATELIMIT
-			/* check for route change */
-			if (error == EAGAIN)
-				in_pcboutput_eagain(inp);
-#endif
+			error = ip_output_send(inp, ifp, m, gw, ro);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		IPSTAT_INC(ips_fragmented);
 
 done:
 	NET_EPOCH_EXIT(et);
 	return (error);
  bad:
 	m_freem(m);
 	goto done;
 }
 
 /*
  * Create a chain of fragments which fit the given mtu. m_frag points to the
  * mbuf to be fragmented; on return it points to the chain with the fragments.
  * Return 0 if no error. If error, m_frag may contain a partially built
  * chain of fragments that should be freed by the caller.
  *
  * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
  */
 int
 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
     u_long if_hwassist_flags)
 {
 	int error = 0;
 	int hlen = ip->ip_hl << 2;
 	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
 	int off;
 	struct mbuf *m0 = *m_frag;	/* the original packet		*/
 	int firstlen;
 	struct mbuf **mnext;
 	int nfrags;
 	uint16_t ip_len, ip_off;
 
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	if (ip_off & IP_DF) {	/* Fragmentation not allowed */
 		IPSTAT_INC(ips_cantfrag);
 		return EMSGSIZE;
 	}
 
 	/*
 	 * Must be able to put at least 8 bytes per fragment.
 	 */
 	if (len < 8)
 		return EMSGSIZE;
 
 	/*
 	 * If the interface will not calculate checksums on
 	 * fragmented packets, then do it here.
 	 */
 	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		in_delayed_cksum(m0);
 		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #ifdef SCTP
 	if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
 		sctp_delayed_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 	if (len > PAGE_SIZE) {
 		/*
 		 * Fragment large datagrams such that each segment
 		 * contains a multiple of PAGE_SIZE amount of data,
 		 * plus headers. This enables a receiver to perform
 		 * page-flipping zero-copy optimizations.
 		 *
 		 * XXX When does this help given that sender and receiver
 		 * could have different page sizes, and also mtu could
 		 * be less than the receiver's page size ?
 		 */
 		int newlen;
 
 		off = MIN(mtu, m0->m_pkthdr.len);
 
 		/*
 		 * firstlen (off - hlen) must be aligned on an
 		 * 8-byte boundary
 		 */
 		if (off < hlen)
 			goto smart_frag_failure;
 		off = ((off - hlen) & ~7) + hlen;
 		newlen = (~PAGE_MASK) & mtu;
 		if ((newlen + sizeof (struct ip)) > mtu) {
 			/* we failed, go back the default */
 smart_frag_failure:
 			newlen = len;
 			off = hlen + len;
 		}
 		len = newlen;
 
 	} else {
 		off = hlen + len;
 	}
 
 	firstlen = off - hlen;
 	mnext = &m0->m_nextpkt;		/* pointer to next packet */
 
 	/*
 	 * Loop through length of segment after first fragment,
 	 * make new header and copy data of each part and link onto chain.
 	 * Here, m0 is the original packet, m is the fragment being created.
 	 * The fragments are linked off the m_nextpkt of the original
 	 * packet, which after processing serves as the first fragment.
 	 */
 	for (nfrags = 1; off < ip_len; off += len, nfrags++) {
 		struct ip *mhip;	/* ip header on the fragment */
 		struct mbuf *m;
 		int mhlen = sizeof (struct ip);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		/*
 		 * Make sure the complete packet header gets copied
 		 * from the originating mbuf to the newly created
 		 * mbuf. This also ensures that existing firewall
 		 * classification(s), VLAN tags and so on get copied
 		 * to the resulting fragmented packet(s):
 		 */
 		if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
 			m_free(m);
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		/*
 		 * In the first mbuf, leave room for the link header, then
 		 * copy the original IP header including options. The payload
 		 * goes into an additional mbuf chain returned by m_copym().
 		 */
 		m->m_data += max_linkhdr;
 		mhip = mtod(m, struct ip *);
 		*mhip = *ip;
 		if (hlen > sizeof (struct ip)) {
 			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
 			mhip->ip_v = IPVERSION;
 			mhip->ip_hl = mhlen >> 2;
 		}
 		m->m_len = mhlen;
 		/* XXX do we need to add ip_off below ? */
 		mhip->ip_off = ((off - hlen) >> 3) + ip_off;
 		if (off + len >= ip_len)
 			len = ip_len - off;
 		else
 			mhip->ip_off |= IP_MF;
 		mhip->ip_len = htons((u_short)(len + mhlen));
 		m->m_next = m_copym(m0, off, len, M_NOWAIT);
 		if (m->m_next == NULL) {	/* copy failed */
 			m_free(m);
 			error = ENOBUFS;	/* ??? */
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		m->m_pkthdr.len = mhlen + len;
 #ifdef MAC
 		mac_netinet_fragment(m0, m);
 #endif
 		mhip->ip_off = htons(mhip->ip_off);
 		mhip->ip_sum = 0;
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 			mhip->ip_sum = in_cksum(m, mhlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 		*mnext = m;
 		mnext = &m->m_nextpkt;
 	}
 	IPSTAT_ADD(ips_ofragments, nfrags);
 
 	/*
 	 * Update first fragment by trimming what's been copied out
 	 * and updating header.
 	 */
 	m_adj(m0, hlen + firstlen - ip_len);
 	m0->m_pkthdr.len = hlen + firstlen;
 	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
 	ip->ip_off = htons(ip_off | IP_MF);
 	ip->ip_sum = 0;
 	if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 		ip->ip_sum = in_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_IP;
 	}
 
 done:
 	*m_frag = m0;
 	return error;
 }
 
 void
 in_delayed_cksum(struct mbuf *m)
 {
 	struct ip *ip;
 	struct udphdr *uh;
 	uint16_t cklen, csum, offset;
 
 	ip = mtod(m, struct ip *);
 	offset = ip->ip_hl << 2 ;
 
 	if (m->m_pkthdr.csum_flags & CSUM_UDP) {
 		/* if udp header is not in the first mbuf copy udplen */
 		if (offset + sizeof(struct udphdr) > m->m_len) {
 			m_copydata(m, offset + offsetof(struct udphdr,
 			    uh_ulen), sizeof(cklen), (caddr_t)&cklen);
 			cklen = ntohs(cklen);
 		} else {
 			uh = (struct udphdr *)mtodo(m, offset);
 			cklen = ntohs(uh->uh_ulen);
 		}
 		csum = in_cksum_skip(m, cklen + offset, offset);
 		if (csum == 0)
 			csum = 0xffff;
 	} else {
 		cklen = ntohs(ip->ip_len);
 		csum = in_cksum_skip(m, cklen, offset);
 	}
 	offset += m->m_pkthdr.csum_data;	/* checksum offset */
 
 	if (offset + sizeof(csum) > m->m_len)
 		m_copyback(m, offset, sizeof(csum), (caddr_t)&csum);
 	else
 		*(u_short *)mtodo(m, offset) = csum;
 }
 
 /*
  * IP socket option processing.
  */
 int
 ip_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct	inpcb *inp = sotoinpcb(so);
 	int	error, optval;
 #ifdef	RSS
 	uint32_t rss_bucket;
 	int retval;
 #endif
 
 	error = optval = 0;
 	if (sopt->sopt_level != IPPROTO_IP) {
 		error = EINVAL;
 
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_dir == SOPT_SET) {
 			switch (sopt->sopt_name) {
 			case SO_REUSEADDR:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEADDR) != 0)
 					inp->inp_flags2 |= INP_REUSEADDR;
 				else
 					inp->inp_flags2 &= ~INP_REUSEADDR;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT_LB:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT_LB) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT_LB;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT_LB;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_SETFIB:
 				INP_WLOCK(inp);
 				inp->inp_inc.inc_fibnum = so->so_fibnum;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_MAX_PACING_RATE:
 #ifdef RATELIMIT
 				INP_WLOCK(inp);
 				inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 				INP_WUNLOCK(inp);
 				error = 0;
 #else
 				error = EOPNOTSUPP;
 #endif
 				break;
 			default:
 				break;
 			}
 		}
 		return (error);
 	}
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 #ifdef notyet
 		case IP_RETOPTS:
 #endif
 		{
 			struct mbuf *m;
 			if (sopt->sopt_valsize > MLEN) {
 				error = EMSGSIZE;
 				break;
 			}
 			m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
 			if (m == NULL) {
 				error = ENOBUFS;
 				break;
 			}
 			m->m_len = sopt->sopt_valsize;
 			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
 					    m->m_len);
 			if (error) {
 				m_free(m);
 				break;
 			}
 			INP_WLOCK(inp);
 			error = ip_pcbopts(inp, sopt->sopt_name, m);
 			INP_WUNLOCK(inp);
 			return (error);
 		}
 
 		case IP_BINDANY:
 			if (sopt->sopt_td != NULL) {
 				error = priv_check(sopt->sopt_td,
 				    PRIV_NETINET_BINDANY);
 				if (error)
 					break;
 			}
 			/* FALLTHROUGH */
 		case IP_BINDMULTI:
 #ifdef	RSS
 		case IP_RSS_LISTEN_BUCKET:
 #endif
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_ORIGDSTADDR:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_RECVTOS:
 		case IP_RECVFLOWID:
 #ifdef	RSS
 		case IP_RECVRSSBUCKETID:
 #endif
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			switch (sopt->sopt_name) {
 			case IP_TOS:
 				inp->inp_ip_tos = optval;
 				break;
 
 			case IP_TTL:
 				inp->inp_ip_ttl = optval;
 				break;
 
 			case IP_MINTTL:
 				if (optval >= 0 && optval <= MAXTTL)
 					inp->inp_ip_minttl = optval;
 				else
 					error = EINVAL;
 				break;
 
 #define	OPTSET(bit) do {						\
 	INP_WLOCK(inp);							\
 	if (optval)							\
 		inp->inp_flags |= bit;					\
 	else								\
 		inp->inp_flags &= ~bit;					\
 	INP_WUNLOCK(inp);						\
 } while (0)
 
 #define	OPTSET2(bit, val) do {						\
 	INP_WLOCK(inp);							\
 	if (val)							\
 		inp->inp_flags2 |= bit;					\
 	else								\
 		inp->inp_flags2 &= ~bit;				\
 	INP_WUNLOCK(inp);						\
 } while (0)
 
 			case IP_RECVOPTS:
 				OPTSET(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				OPTSET(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				OPTSET(INP_RECVDSTADDR);
 				break;
 
 			case IP_ORIGDSTADDR:
 				OPTSET2(INP_ORIGDSTADDR, optval);
 				break;
 
 			case IP_RECVTTL:
 				OPTSET(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				OPTSET(INP_RECVIF);
 				break;
 
 			case IP_ONESBCAST:
 				OPTSET(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				OPTSET(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				OPTSET(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				OPTSET(INP_RECVTOS);
 				break;
 			case IP_BINDMULTI:
 				OPTSET2(INP_BINDMULTI, optval);
 				break;
 			case IP_RECVFLOWID:
 				OPTSET2(INP_RECVFLOWID, optval);
 				break;
 #ifdef	RSS
 			case IP_RSS_LISTEN_BUCKET:
 				if ((optval >= 0) &&
 				    (optval < rss_getnumbuckets())) {
 					inp->inp_rss_listen_bucket = optval;
 					OPTSET2(INP_RSS_BUCKET_SET, 1);
 				} else {
 					error = EINVAL;
 				}
 				break;
 			case IP_RECVRSSBUCKETID:
 				OPTSET2(INP_RECVRSSBUCKETID, optval);
 				break;
 #endif
 			}
 			break;
 #undef OPTSET
 #undef OPTSET2
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_ADD_MEMBERSHIP:
 		case IP_DROP_MEMBERSHIP:
 		case IP_ADD_SOURCE_MEMBERSHIP:
 		case IP_DROP_SOURCE_MEMBERSHIP:
 		case IP_BLOCK_SOURCE:
 		case IP_UNBLOCK_SOURCE:
 		case IP_MSFILTER:
 		case MCAST_JOIN_GROUP:
 		case MCAST_LEAVE_GROUP:
 		case MCAST_JOIN_SOURCE_GROUP:
 		case MCAST_LEAVE_SOURCE_GROUP:
 		case MCAST_BLOCK_SOURCE:
 		case MCAST_UNBLOCK_SOURCE:
 			error = inp_setmoptions(inp, sopt);
 			break;
 
 		case IP_PORTRANGE:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			INP_WLOCK(inp);
 			switch (optval) {
 			case IP_PORTRANGE_DEFAULT:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				break;
 
 			case IP_PORTRANGE_HIGH:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags |= INP_HIGHPORT;
 				break;
 
 			case IP_PORTRANGE_LOW:
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				inp->inp_flags |= INP_LOWPORT;
 				break;
 
 			default:
 				error = EINVAL;
 				break;
 			}
 			INP_WUNLOCK(inp);
 			break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 		case IP_IPSEC_POLICY:
 			if (IPSEC_ENABLED(ipv4)) {
 				error = IPSEC_PCBCTL(ipv4, inp, sopt);
 				break;
 			}
 			/* FALLTHROUGH */
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 		case IP_RETOPTS:
 			INP_RLOCK(inp);
 			if (inp->inp_options) {
 				struct mbuf *options;
 
 				options = m_copym(inp->inp_options, 0,
 				    M_COPYALL, M_NOWAIT);
 				INP_RUNLOCK(inp);
 				if (options != NULL) {
 					error = sooptcopyout(sopt,
 							     mtod(options, char *),
 							     options->m_len);
 					m_freem(options);
 				} else
 					error = ENOMEM;
 			} else {
 				INP_RUNLOCK(inp);
 				sopt->sopt_valsize = 0;
 			}
 			break;
 
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_ORIGDSTADDR:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_PORTRANGE:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_BINDANY:
 		case IP_RECVTOS:
 		case IP_BINDMULTI:
 		case IP_FLOWID:
 		case IP_FLOWTYPE:
 		case IP_RECVFLOWID:
 #ifdef	RSS
 		case IP_RSSBUCKETID:
 		case IP_RECVRSSBUCKETID:
 #endif
 			switch (sopt->sopt_name) {
 
 			case IP_TOS:
 				optval = inp->inp_ip_tos;
 				break;
 
 			case IP_TTL:
 				optval = inp->inp_ip_ttl;
 				break;
 
 			case IP_MINTTL:
 				optval = inp->inp_ip_minttl;
 				break;
 
 #define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
 #define	OPTBIT2(bit)	(inp->inp_flags2 & bit ? 1 : 0)
 
 			case IP_RECVOPTS:
 				optval = OPTBIT(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				optval = OPTBIT(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				optval = OPTBIT(INP_RECVDSTADDR);
 				break;
 
 			case IP_ORIGDSTADDR:
 				optval = OPTBIT2(INP_ORIGDSTADDR);
 				break;
 
 			case IP_RECVTTL:
 				optval = OPTBIT(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				optval = OPTBIT(INP_RECVIF);
 				break;
 
 			case IP_PORTRANGE:
 				if (inp->inp_flags & INP_HIGHPORT)
 					optval = IP_PORTRANGE_HIGH;
 				else if (inp->inp_flags & INP_LOWPORT)
 					optval = IP_PORTRANGE_LOW;
 				else
 					optval = 0;
 				break;
 
 			case IP_ONESBCAST:
 				optval = OPTBIT(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				optval = OPTBIT(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				optval = OPTBIT(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				optval = OPTBIT(INP_RECVTOS);
 				break;
 			case IP_FLOWID:
 				optval = inp->inp_flowid;
 				break;
 			case IP_FLOWTYPE:
 				optval = inp->inp_flowtype;
 				break;
 			case IP_RECVFLOWID:
 				optval = OPTBIT2(INP_RECVFLOWID);
 				break;
 #ifdef	RSS
 			case IP_RSSBUCKETID:
 				retval = rss_hash2bucket(inp->inp_flowid,
 				    inp->inp_flowtype,
 				    &rss_bucket);
 				if (retval == 0)
 					optval = rss_bucket;
 				else
 					error = EINVAL;
 				break;
 			case IP_RECVRSSBUCKETID:
 				optval = OPTBIT2(INP_RECVRSSBUCKETID);
 				break;
 #endif
 			case IP_BINDMULTI:
 				optval = OPTBIT2(INP_BINDMULTI);
 				break;
 			}
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_MSFILTER:
 			error = inp_getmoptions(inp, sopt);
 			break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 		case IP_IPSEC_POLICY:
 			if (IPSEC_ENABLED(ipv4)) {
 				error = IPSEC_PCBCTL(ipv4, inp, sopt);
 				break;
 			}
 			/* FALLTHROUGH */
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 
 /*
  * Routine called from ip_output() to loop back a copy of an IP multicast
  * packet to the input queue of a specified interface.  Note that this
  * calls the output routine of the loopback "driver", but with an interface
  * pointer that might NOT be a loopback interface -- evil, but easier than
  * replicating that code here.
  */
 static void
 ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen)
 {
 	struct ip *ip;
 	struct mbuf *copym;
 
 	/*
 	 * Make a deep copy of the packet because we're going to
 	 * modify the pack in order to generate checksums.
 	 */
 	copym = m_dup(m, M_NOWAIT);
 	if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen))
 		copym = m_pullup(copym, hlen);
 	if (copym != NULL) {
 		/* If needed, compute the checksum and mark it as valid. */
 		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 			in_delayed_cksum(copym);
 			copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 			copym->m_pkthdr.csum_flags |=
 			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 			copym->m_pkthdr.csum_data = 0xffff;
 		}
 		/*
 		 * We don't bother to fragment if the IP length is greater
 		 * than the interface's MTU.  Can this possibly matter?
 		 */
 		ip = mtod(copym, struct ip *);
 		ip->ip_sum = 0;
 		ip->ip_sum = in_cksum(copym, hlen);
 		if_simloop(ifp, copym, AF_INET, 0);
 	}
 }
Index: head/sys/netinet6/ip6_output.c
===================================================================
--- head/sys/netinet6/ip6_output.c	(revision 348253)
+++ head/sys/netinet6/ip6_output.c	(revision 348254)
@@ -1,3160 +1,3173 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 #include "opt_ipsec.h"
 #include "opt_sctp.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syslog.h>
 #include <sys/ucred.h>
 
 #include <machine/in_cksum.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_llatbl.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/pfil.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/in6_rss.h>
 
 #include <netipsec/ipsec_support.h>
 #ifdef SCTP
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #endif
 
 #include <netinet6/ip6protosw.h>
 #include <netinet6/scope6_var.h>
 
 extern int in6_mcast_loop;
 
 struct ip6_exthdrs {
 	struct mbuf *ip6e_ip6;
 	struct mbuf *ip6e_hbh;
 	struct mbuf *ip6e_dest1;
 	struct mbuf *ip6e_rthdr;
 	struct mbuf *ip6e_dest2;
 };
 
 static MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options");
 
 static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **,
 			   struct ucred *, int);
 static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *,
 	struct socket *, struct sockopt *);
 static int ip6_getpcbopt(struct inpcb *, int, struct sockopt *);
 static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *,
 	struct ucred *, int, int, int);
 
 static int ip6_copyexthdr(struct mbuf **, caddr_t, int);
 static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
 	struct ip6_frag **);
 static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
 static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
 static int ip6_getpmtu(struct route_in6 *, int,
 	struct ifnet *, const struct in6_addr *, u_long *, int *, u_int,
 	u_int);
 static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long,
 	u_long *, int *, u_int);
 static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *);
 static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
 
 
 /*
  * Make an extension header from option data.  hp is the source, and
  * mp is the destination.
  */
 #define MAKE_EXTHDR(hp, mp)						\
     do {								\
 	if (hp) {							\
 		struct ip6_ext *eh = (struct ip6_ext *)(hp);		\
 		error = ip6_copyexthdr((mp), (caddr_t)(hp),		\
 		    ((eh)->ip6e_len + 1) << 3);				\
 		if (error)						\
 			goto freehdrs;					\
 	}								\
     } while (/*CONSTCOND*/ 0)
 
 /*
  * Form a chain of extension headers.
  * m is the extension header mbuf
  * mp is the previous mbuf in the chain
  * p is the next header
  * i is the type of option.
  */
 #define MAKE_CHAIN(m, mp, p, i)\
     do {\
 	if (m) {\
 		if (!hdrsplit) \
 			panic("assumption failed: hdr not split"); \
 		*mtod((m), u_char *) = *(p);\
 		*(p) = (i);\
 		p = mtod((m), u_char *);\
 		(m)->m_next = (mp)->m_next;\
 		(mp)->m_next = (m);\
 		(mp) = (m);\
 	}\
     } while (/*CONSTCOND*/ 0)
 
 void
 in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset)
 {
 	u_short csum;
 
 	csum = in_cksum_skip(m, offset + plen, offset);
 	if (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6 && csum == 0)
 		csum = 0xffff;
 	offset += m->m_pkthdr.csum_data;	/* checksum offset */
 
 	if (offset + sizeof(csum) > m->m_len)
 		m_copyback(m, offset, sizeof(csum), (caddr_t)&csum);
 	else
 		*(u_short *)mtodo(m, offset) = csum;
 }
 
 int
 ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto,
     int fraglen , uint32_t id)
 {
 	struct mbuf *m, **mnext, *m_frgpart;
 	struct ip6_hdr *ip6, *mhip6;
 	struct ip6_frag *ip6f;
 	int off;
 	int error;
 	int tlen = m0->m_pkthdr.len;
 
 	KASSERT((fraglen % 8 == 0), ("Fragment length must be a multiple of 8"));
 
 	m = m0;
 	ip6 = mtod(m, struct ip6_hdr *);
 	mnext = &m->m_nextpkt;
 
 	for (off = hlen; off < tlen; off += fraglen) {
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (!m) {
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 
 		/*
 		 * Make sure the complete packet header gets copied
 		 * from the originating mbuf to the newly created
 		 * mbuf. This also ensures that existing firewall
 		 * classification(s), VLAN tags and so on get copied
 		 * to the resulting fragmented packet(s):
 		 */
 		if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
 			m_free(m);
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 
 		*mnext = m;
 		mnext = &m->m_nextpkt;
 		m->m_data += max_linkhdr;
 		mhip6 = mtod(m, struct ip6_hdr *);
 		*mhip6 = *ip6;
 		m->m_len = sizeof(*mhip6);
 		error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
 		if (error) {
 			IP6STAT_INC(ip6s_odropped);
 			return (error);
 		}
 		ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7));
 		if (off + fraglen >= tlen)
 			fraglen = tlen - off;
 		else
 			ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
 		mhip6->ip6_plen = htons((u_short)(fraglen + hlen +
 		    sizeof(*ip6f) - sizeof(struct ip6_hdr)));
 		if ((m_frgpart = m_copym(m0, off, fraglen, M_NOWAIT)) == NULL) {
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 		m_cat(m, m_frgpart);
 		m->m_pkthdr.len = fraglen + hlen + sizeof(*ip6f);
 		ip6f->ip6f_reserved = 0;
 		ip6f->ip6f_ident = id;
 		ip6f->ip6f_nxt = nextproto;
 		IP6STAT_INC(ip6s_ofragments);
 		in6_ifstat_inc(ifp, ifs6_out_fragcreat);
 	}
 
 	return (0);
 }
 
+static int
+ip6_output_send(struct inpcb *inp, struct ifnet *ifp, struct ifnet *origifp,
+    struct mbuf *m, struct sockaddr_in6 *dst, struct route_in6 *ro)
+{
+	struct m_snd_tag *mst;
+	int error;
+
+	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
+	mst = NULL;
+
+#ifdef RATELIMIT
+	if (inp != NULL) {
+		if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
+		    (inp->inp_snd_tag != NULL &&
+		    inp->inp_snd_tag->ifp != ifp))
+			in_pcboutput_txrtlmt(inp, ifp, m);
+
+		if (inp->inp_snd_tag != NULL)
+			mst = inp->inp_snd_tag;
+	}
+#endif
+	if (mst != NULL) {
+		KASSERT(m->m_pkthdr.rcvif == NULL,
+		    ("trying to add a send tag to a forwarded packet"));
+		if (mst->ifp != ifp) {
+			error = EAGAIN;
+			goto done;
+		}
+
+		/* stamp send tag on mbuf */
+		m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
+		m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
+	}
+
+	error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro);
+
+done:
+	/* Check for route change invalidating send tags. */
+#ifdef RATELIMIT
+	if (error == EAGAIN)
+		in_pcboutput_eagain(inp);
+#endif
+	return (error);
+}
+
 /*
  * IP6 output. The packet in mbuf chain m contains a skeletal IP6
  * header (with pri, len, nxt, hlim, src, dst).
  * This function may modify ver and hlim only.
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
  * If route_in6 ro is present and has ro_rt initialized, route lookup would be
  * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
  * then result of route lookup is stored in ro->ro_rt.
  *
  * type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and
  * nd_ifinfo.linkmtu is u_int32_t.  so we use u_long to hold largest one,
  * which is rt_mtu.
  *
  * ifpp - XXX: just for statistics
  */
 /*
  * XXX TODO: no flowid is assigned for outbound flows?
  */
 int
 ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
     struct route_in6 *ro, int flags, struct ip6_moptions *im6o,
     struct ifnet **ifpp, struct inpcb *inp)
 {
 	struct ip6_hdr *ip6;
 	struct ifnet *ifp, *origifp;
 	struct mbuf *m = m0;
 	struct mbuf *mprev = NULL;
 	int hlen, tlen, len;
 	struct route_in6 ip6route;
 	struct rtentry *rt = NULL;
 	struct sockaddr_in6 *dst, src_sa, dst_sa;
 	struct in6_addr odst;
 	int error = 0;
 	struct in6_ifaddr *ia = NULL;
 	u_long mtu;
 	int alwaysfrag, dontfrag;
 	u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
 	struct ip6_exthdrs exthdrs;
 	struct in6_addr src0, dst0;
 	u_int32_t zone;
 	struct route_in6 *ro_pmtu = NULL;
 	int hdrsplit = 0;
 	int sw_csum, tso;
 	int needfiblookup;
 	uint32_t fibnum;
 	struct m_tag *fwd_tag = NULL;
 	uint32_t id;
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		M_SETFIB(m, inp->inp_inc.inc_fibnum);
 		if ((flags & IP_NODEFAULTFLOWID) == 0) {
 			/* unconditionally set flowid */
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 		}
 #ifdef NUMA
 		m->m_pkthdr.numa_domain = inp->inp_numa_domain;
 #endif
 	}
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/*
 	 * IPSec checking which handles several cases.
 	 * FAST IPSEC: We re-injected the packet.
 	 * XXX: need scope argument.
 	 */
 	if (IPSEC_ENABLED(ipv6)) {
 		if ((error = IPSEC_OUTPUT(ipv6, m, inp)) != 0) {
 			if (error == EINPROGRESS)
 				error = 0;
 			goto done;
 		}
 	}
 #endif /* IPSEC */
 
 	bzero(&exthdrs, sizeof(exthdrs));
 	if (opt) {
 		/* Hop-by-Hop options header */
 		MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh);
 		/* Destination options header(1st part) */
 		if (opt->ip6po_rthdr) {
 			/*
 			 * Destination options header(1st part)
 			 * This only makes sense with a routing header.
 			 * See Section 9.2 of RFC 3542.
 			 * Disabling this part just for MIP6 convenience is
 			 * a bad idea.  We need to think carefully about a
 			 * way to make the advanced API coexist with MIP6
 			 * options, which might automatically be inserted in
 			 * the kernel.
 			 */
 			MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1);
 		}
 		/* Routing header */
 		MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr);
 		/* Destination options header(2nd part) */
 		MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2);
 	}
 
 	/*
 	 * Calculate the total length of the extension header chain.
 	 * Keep the length of the unfragmentable part for fragmentation.
 	 */
 	optlen = 0;
 	if (exthdrs.ip6e_hbh)
 		optlen += exthdrs.ip6e_hbh->m_len;
 	if (exthdrs.ip6e_dest1)
 		optlen += exthdrs.ip6e_dest1->m_len;
 	if (exthdrs.ip6e_rthdr)
 		optlen += exthdrs.ip6e_rthdr->m_len;
 	unfragpartlen = optlen + sizeof(struct ip6_hdr);
 
 	/* NOTE: we don't add AH/ESP length here (done in ip6_ipsec_output) */
 	if (exthdrs.ip6e_dest2)
 		optlen += exthdrs.ip6e_dest2->m_len;
 
 	/*
 	 * If there is at least one extension header,
 	 * separate IP6 header from the payload.
 	 */
 	if (optlen && !hdrsplit) {
 		if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
 			m = NULL;
 			goto freehdrs;
 		}
 		m = exthdrs.ip6e_ip6;
 		hdrsplit++;
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/* adjust mbuf packet header length */
 	m->m_pkthdr.len += optlen;
 	plen = m->m_pkthdr.len - sizeof(*ip6);
 
 	/* If this is a jumbo payload, insert a jumbo payload option. */
 	if (plen > IPV6_MAXPACKET) {
 		if (!hdrsplit) {
 			if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
 				m = NULL;
 				goto freehdrs;
 			}
 			m = exthdrs.ip6e_ip6;
 			hdrsplit++;
 		}
 		/* adjust pointer */
 		ip6 = mtod(m, struct ip6_hdr *);
 		if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0)
 			goto freehdrs;
 		ip6->ip6_plen = 0;
 	} else
 		ip6->ip6_plen = htons(plen);
 
 	/*
 	 * Concatenate headers and fill in next header fields.
 	 * Here we have, on "m"
 	 *	IPv6 payload
 	 * and we insert headers accordingly.  Finally, we should be getting:
 	 *	IPv6 hbh dest1 rthdr ah* [esp* dest2 payload]
 	 *
 	 * during the header composing process, "m" points to IPv6 header.
 	 * "mprev" points to an extension header prior to esp.
 	 */
 	u_char *nexthdrp = &ip6->ip6_nxt;
 	mprev = m;
 
 	/*
 	 * we treat dest2 specially.  this makes IPsec processing
 	 * much easier.  the goal here is to make mprev point the
 	 * mbuf prior to dest2.
 	 *
 	 * result: IPv6 dest2 payload
 	 * m and mprev will point to IPv6 header.
 	 */
 	if (exthdrs.ip6e_dest2) {
 		if (!hdrsplit)
 			panic("assumption failed: hdr not split");
 		exthdrs.ip6e_dest2->m_next = m->m_next;
 		m->m_next = exthdrs.ip6e_dest2;
 		*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
 		ip6->ip6_nxt = IPPROTO_DSTOPTS;
 	}
 
 	/*
 	 * result: IPv6 hbh dest1 rthdr dest2 payload
 	 * m will point to IPv6 header.  mprev will point to the
 	 * extension header prior to dest2 (rthdr in the above case).
 	 */
 	MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
 	MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
 		   IPPROTO_DSTOPTS);
 	MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
 		   IPPROTO_ROUTING);
 
 	/*
 	 * If there is a routing header, discard the packet.
 	 */
 	if (exthdrs.ip6e_rthdr) {
 		 error = EINVAL;
 		 goto bad;
 	}
 
 	/* Source address validation */
 	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
 	    (flags & IPV6_UNSPECSRC) == 0) {
 		error = EOPNOTSUPP;
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
 		error = EOPNOTSUPP;
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 
 	IP6STAT_INC(ip6s_localout);
 
 	/*
 	 * Route packet.
 	 */
 	if (ro == NULL) {
 		ro = &ip6route;
 		bzero((caddr_t)ro, sizeof(*ro));
 	}
 	ro_pmtu = ro;
 	if (opt && opt->ip6po_rthdr)
 		ro = &opt->ip6po_route;
 	dst = (struct sockaddr_in6 *)&ro->ro_dst;
 	fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
 again:
 	/*
 	 * if specified, try to fill in the traffic class field.
 	 * do not override if a non-zero value is already set.
 	 * we check the diffserv field and the ecn field separately.
 	 */
 	if (opt && opt->ip6po_tclass >= 0) {
 		int mask = 0;
 
 		if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0)
 			mask |= 0xfc;
 		if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0)
 			mask |= 0x03;
 		if (mask != 0)
 			ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
 	}
 
 	/* fill in or override the hop limit field, if necessary. */
 	if (opt && opt->ip6po_hlim != -1)
 		ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
 	else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (im6o != NULL)
 			ip6->ip6_hlim = im6o->im6o_multicast_hlim;
 		else
 			ip6->ip6_hlim = V_ip6_defmcasthlim;
 	}
 	/*
 	 * Validate route against routing table additions;
 	 * a better/more specific route might have been added.
 	 * Make sure address family is set in route.
 	 */
 	if (inp) {
 		ro->ro_dst.sin6_family = AF_INET6;
 		RT_VALIDATE((struct route *)ro, &inp->inp_rt_cookie, fibnum);
 	}
 	if (ro->ro_rt && fwd_tag == NULL && (ro->ro_rt->rt_flags & RTF_UP) &&
 	    ro->ro_dst.sin6_family == AF_INET6 &&
 	    IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) {
 		rt = ro->ro_rt;
 		ifp = ro->ro_rt->rt_ifp;
 	} else {
 		if (ro->ro_lle)
 			LLE_FREE(ro->ro_lle);	/* zeros ro_lle */
 		ro->ro_lle = NULL;
 		if (fwd_tag == NULL) {
 			bzero(&dst_sa, sizeof(dst_sa));
 			dst_sa.sin6_family = AF_INET6;
 			dst_sa.sin6_len = sizeof(dst_sa);
 			dst_sa.sin6_addr = ip6->ip6_dst;
 		}
 		error = in6_selectroute_fib(&dst_sa, opt, im6o, ro, &ifp,
 		    &rt, fibnum);
 		if (error != 0) {
 			if (ifp != NULL)
 				in6_ifstat_inc(ifp, ifs6_out_discard);
 			goto bad;
 		}
 	}
 	if (rt == NULL) {
 		/*
 		 * If in6_selectroute() does not return a route entry,
 		 * dst may not have been updated.
 		 */
 		*dst = dst_sa;	/* XXX */
 	}
 
 	/*
 	 * then rt (for unicast) and ifp must be non-NULL valid values.
 	 */
 	if ((flags & IPV6_FORWARDING) == 0) {
 		/* XXX: the FORWARDING flag can be set for mrouting. */
 		in6_ifstat_inc(ifp, ifs6_out_request);
 	}
 	if (rt != NULL) {
 		ia = (struct in6_ifaddr *)(rt->rt_ifa);
 		counter_u64_add(rt->rt_pksent, 1);
 	}
 
 	/* Setup data structures for scope ID checks. */
 	src0 = ip6->ip6_src;
 	bzero(&src_sa, sizeof(src_sa));
 	src_sa.sin6_family = AF_INET6;
 	src_sa.sin6_len = sizeof(src_sa);
 	src_sa.sin6_addr = ip6->ip6_src;
 
 	dst0 = ip6->ip6_dst;
 	/* re-initialize to be sure */
 	bzero(&dst_sa, sizeof(dst_sa));
 	dst_sa.sin6_family = AF_INET6;
 	dst_sa.sin6_len = sizeof(dst_sa);
 	dst_sa.sin6_addr = ip6->ip6_dst;
 
 	/* Check for valid scope ID. */
 	if (in6_setscope(&src0, ifp, &zone) == 0 &&
 	    sa6_recoverscope(&src_sa) == 0 && zone == src_sa.sin6_scope_id &&
 	    in6_setscope(&dst0, ifp, &zone) == 0 &&
 	    sa6_recoverscope(&dst_sa) == 0 && zone == dst_sa.sin6_scope_id) {
 		/*
 		 * The outgoing interface is in the zone of the source
 		 * and destination addresses.
 		 *
 		 * Because the loopback interface cannot receive
 		 * packets with a different scope ID than its own,
 		 * there is a trick is to pretend the outgoing packet
 		 * was received by the real network interface, by
 		 * setting "origifp" different from "ifp". This is
 		 * only allowed when "ifp" is a loopback network
 		 * interface. Refer to code in nd6_output_ifp() for
 		 * more details.
 		 */
 		origifp = ifp;
 	
 		/*
 		 * We should use ia_ifp to support the case of sending
 		 * packets to an address of our own.
 		 */
 		if (ia != NULL && ia->ia_ifp)
 			ifp = ia->ia_ifp;
 
 	} else if ((ifp->if_flags & IFF_LOOPBACK) == 0 ||
 	    sa6_recoverscope(&src_sa) != 0 ||
 	    sa6_recoverscope(&dst_sa) != 0 ||
 	    dst_sa.sin6_scope_id == 0 ||
 	    (src_sa.sin6_scope_id != 0 &&
 	    src_sa.sin6_scope_id != dst_sa.sin6_scope_id) ||
 	    (origifp = ifnet_byindex(dst_sa.sin6_scope_id)) == NULL) {
 		/*
 		 * If the destination network interface is not a
 		 * loopback interface, or the destination network
 		 * address has no scope ID, or the source address has
 		 * a scope ID set which is different from the
 		 * destination address one, or there is no network
 		 * interface representing this scope ID, the address
 		 * pair is considered invalid.
 		 */
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(ifp, ifs6_out_discard);
 		if (error == 0)
 			error = EHOSTUNREACH; /* XXX */
 		goto bad;
 	}
 
 	/* All scope ID checks are successful. */
 
 	if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (opt && opt->ip6po_nextroute.ro_rt) {
 			/*
 			 * The nexthop is explicitly specified by the
 			 * application.  We assume the next hop is an IPv6
 			 * address.
 			 */
 			dst = (struct sockaddr_in6 *)opt->ip6po_nexthop;
 		}
 		else if ((rt->rt_flags & RTF_GATEWAY))
 			dst = (struct sockaddr_in6 *)rt->rt_gateway;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
 	} else {
 		m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
 		in6_ifstat_inc(ifp, ifs6_out_mcast);
 		/*
 		 * Confirm that the outgoing interface supports multicast.
 		 */
 		if (!(ifp->if_flags & IFF_MULTICAST)) {
 			IP6STAT_INC(ip6s_noroute);
 			in6_ifstat_inc(ifp, ifs6_out_discard);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		if ((im6o == NULL && in6_mcast_loop) ||
 		    (im6o && im6o->im6o_multicast_loop)) {
 			/*
 			 * Loop back multicast datagram if not expressly
 			 * forbidden to do so, even if we have not joined
 			 * the address; protocols will filter it later,
 			 * thus deferring a hash lookup and lock acquisition
 			 * at the expense of an m_copym().
 			 */
 			ip6_mloopback(ifp, m);
 		} else {
 			/*
 			 * If we are acting as a multicast router, perform
 			 * multicast forwarding as if the packet had just
 			 * arrived on the interface to which we are about
 			 * to send.  The multicast forwarding function
 			 * recursively calls this function, using the
 			 * IPV6_FORWARDING flag to prevent infinite recursion.
 			 *
 			 * Multicasts that are looped back by ip6_mloopback(),
 			 * above, will be forwarded by the ip6_input() routine,
 			 * if necessary.
 			 */
 			if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
 				/*
 				 * XXX: ip6_mforward expects that rcvif is NULL
 				 * when it is called from the originating path.
 				 * However, it may not always be the case.
 				 */
 				m->m_pkthdr.rcvif = NULL;
 				if (ip6_mforward(ip6, ifp, m) != 0) {
 					m_freem(m);
 					goto done;
 				}
 			}
 		}
 		/*
 		 * Multicasts with a hoplimit of zero may be looped back,
 		 * above, but must not be transmitted on a network.
 		 * Also, multicasts addressed to the loopback interface
 		 * are not sent -- the above call to ip6_mloopback() will
 		 * loop back a copy if this host actually belongs to the
 		 * destination group on the loopback interface.
 		 */
 		if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
 		    IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
 			m_freem(m);
 			goto done;
 		}
 	}
 
 	/*
 	 * Fill the outgoing inteface to tell the upper layer
 	 * to increment per-interface statistics.
 	 */
 	if (ifpp)
 		*ifpp = ifp;
 
 	/* Determine path MTU. */
 	if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst,
 		    &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0)
 		goto bad;
 
 	/*
 	 * The caller of this function may specify to use the minimum MTU
 	 * in some cases.
 	 * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
 	 * setting.  The logic is a bit complicated; by default, unicast
 	 * packets will follow path MTU while multicast packets will be sent at
 	 * the minimum MTU.  If IP6PO_MINMTU_ALL is specified, all packets
 	 * including unicast ones will be sent at the minimum MTU.  Multicast
 	 * packets will always be sent at the minimum MTU unless
 	 * IP6PO_MINMTU_DISABLE is explicitly specified.
 	 * See RFC 3542 for more details.
 	 */
 	if (mtu > IPV6_MMTU) {
 		if ((flags & IPV6_MINMTU))
 			mtu = IPV6_MMTU;
 		else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
 			mtu = IPV6_MMTU;
 		else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
 			 (opt == NULL ||
 			  opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
 			mtu = IPV6_MMTU;
 		}
 	}
 
 	/*
 	 * clear embedded scope identifiers if necessary.
 	 * in6_clearscope will touch the addresses only when necessary.
 	 */
 	in6_clearscope(&ip6->ip6_src);
 	in6_clearscope(&ip6->ip6_dst);
 
 	/*
 	 * If the outgoing packet contains a hop-by-hop options header,
 	 * it must be examined and processed even by the source node.
 	 * (RFC 2460, section 4.)
 	 */
 	if (exthdrs.ip6e_hbh) {
 		struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *);
 		u_int32_t dummy; /* XXX unused */
 		u_int32_t plen = 0; /* XXX: ip6_process will check the value */
 
 #ifdef DIAGNOSTIC
 		if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len)
 			panic("ip6e_hbh is not contiguous");
 #endif
 		/*
 		 *  XXX: if we have to send an ICMPv6 error to the sender,
 		 *       we need the M_LOOP flag since icmp6_error() expects
 		 *       the IPv6 and the hop-by-hop options header are
 		 *       contiguous unless the flag is set.
 		 */
 		m->m_flags |= M_LOOP;
 		m->m_pkthdr.rcvif = ifp;
 		if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1),
 		    ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh),
 		    &dummy, &plen) < 0) {
 			/* m was already freed at this point */
 			error = EINVAL;/* better error? */
 			goto done;
 		}
 		m->m_flags &= ~M_LOOP; /* XXX */
 		m->m_pkthdr.rcvif = NULL;
 	}
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED_OUT(V_inet6_pfil_head))
 		goto passout;
 
 	odst = ip6->ip6_dst;
 	/* Run through list of hooks for output packets. */
 	switch (pfil_run_hooks(V_inet6_pfil_head, &m, ifp, PFIL_OUT, inp)) {
 	case PFIL_PASS:
 		ip6 = mtod(m, struct ip6_hdr *);
 		break;
 	case PFIL_DROPPED:
 		error = EPERM;
 		/* FALLTHROUGH */
 	case PFIL_CONSUMED:
 		goto done;
 	}
 
 	needfiblookup = 0;
 	/* See if destination IP address was changed by packet filter. */
 	if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip6_input(). */
 		if (in6_localip(&ip6->ip6_dst)) {
 			m->m_flags |= M_FASTFWD_OURS;
 			if (m->m_pkthdr.rcvif == NULL)
 				m->m_pkthdr.rcvif = V_loif;
 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 				m->m_pkthdr.csum_flags |=
 				    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 #ifdef SCTP
 			if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
 				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 			error = netisr_queue(NETISR_IPV6, m);
 			goto done;
 		} else {
 			RO_INVALIDATE_CACHE(ro);
 			needfiblookup = 1; /* Redo the routing table lookup. */
 		}
 	}
 	/* See if fib was changed by packet filter. */
 	if (fibnum != M_GETFIB(m)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		fibnum = M_GETFIB(m);
 		RO_INVALIDATE_CACHE(ro);
 		needfiblookup = 1;
 	}
 	if (needfiblookup)
 		goto again;
 
 	/* See if local, if yes, send it to netisr. */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		if (m->m_pkthdr.rcvif == NULL)
 			m->m_pkthdr.rcvif = V_loif;
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 			m->m_pkthdr.csum_flags |=
 			    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 #ifdef SCTP
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
 			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 		error = netisr_queue(NETISR_IPV6, m);
 		goto done;
 	}
 	/* Or forward to some other address? */
 	if ((m->m_flags & M_IP6_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		dst = (struct sockaddr_in6 *)&ro->ro_dst;
 		bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m->m_flags &= ~M_IP6_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 		goto again;
 	}
 
 passout:
 	/*
 	 * Send the packet to the outgoing interface.
 	 * If necessary, do IPv6 fragmentation before sending.
 	 *
 	 * the logic here is rather complex:
 	 * 1: normal case (dontfrag == 0, alwaysfrag == 0)
 	 * 1-a:	send as is if tlen <= path mtu
 	 * 1-b:	fragment if tlen > path mtu
 	 *
 	 * 2: if user asks us not to fragment (dontfrag == 1)
 	 * 2-a:	send as is if tlen <= interface mtu
 	 * 2-b:	error if tlen > interface mtu
 	 *
 	 * 3: if we always need to attach fragment header (alwaysfrag == 1)
 	 *	always fragment
 	 *
 	 * 4: if dontfrag == 1 && alwaysfrag == 1
 	 *	error, as we cannot handle this conflicting request
 	 */
 	sw_csum = m->m_pkthdr.csum_flags;
 	if (!hdrsplit) {
 		tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0;
 		sw_csum &= ~ifp->if_hwassist;
 	} else
 		tso = 0;
 	/*
 	 * If we added extension headers, we will not do TSO and calculate the
 	 * checksums ourselves for now.
 	 * XXX-BZ  Need a framework to know when the NIC can handle it, even
 	 * with ext. hdrs.
 	 */
 	if (sw_csum & CSUM_DELAY_DATA_IPV6) {
 		sw_csum &= ~CSUM_DELAY_DATA_IPV6;
 		in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr));
 	}
 #ifdef SCTP
 	if (sw_csum & CSUM_SCTP_IPV6) {
 		sw_csum &= ~CSUM_SCTP_IPV6;
 		sctp_delayed_cksum(m, sizeof(struct ip6_hdr));
 	}
 #endif
 	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
 	tlen = m->m_pkthdr.len;
 
 	if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso)
 		dontfrag = 1;
 	else
 		dontfrag = 0;
 	if (dontfrag && alwaysfrag) {	/* case 4 */
 		/* conflicting request - can't transmit */
 		error = EMSGSIZE;
 		goto bad;
 	}
 	if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) {	/* case 2-b */
 		/*
 		 * Even if the DONTFRAG option is specified, we cannot send the
 		 * packet when the data length is larger than the MTU of the
 		 * outgoing interface.
 		 * Notify the error by sending IPV6_PATHMTU ancillary data if
 		 * application wanted to know the MTU value. Also return an
 		 * error code (this is not described in the API spec).
 		 */
 		if (inp != NULL)
 			ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu);
 		error = EMSGSIZE;
 		goto bad;
 	}
 
 	/*
 	 * transmit packet without fragmentation
 	 */
 	if (dontfrag || (!alwaysfrag && tlen <= mtu)) {	/* case 1-a and 2-a */
 		struct in6_ifaddr *ia6;
 
 		ip6 = mtod(m, struct ip6_hdr *);
 		ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
 		if (ia6) {
 			/* Record statistics for this interface address. */
 			counter_u64_add(ia6->ia_ifa.ifa_opackets, 1);
 			counter_u64_add(ia6->ia_ifa.ifa_obytes,
 			    m->m_pkthdr.len);
 			ifa_free(&ia6->ia_ifa);
 		}
-#ifdef RATELIMIT
-		if (inp != NULL) {
-			if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
-				in_pcboutput_txrtlmt(inp, ifp, m);
-			/* stamp send tag on mbuf */
-			m->m_pkthdr.snd_tag = inp->inp_snd_tag;
-		} else {
-			m->m_pkthdr.snd_tag = NULL;
-		}
-#endif
-		error = nd6_output_ifp(ifp, origifp, m, dst,
-		    (struct route *)ro);
-#ifdef RATELIMIT
-		/* check for route change */
-		if (error == EAGAIN)
-			in_pcboutput_eagain(inp);
-#endif
+		error = ip6_output_send(inp, ifp, origifp, m, dst, ro);
 		goto done;
 	}
 
 	/*
 	 * try to fragment the packet.  case 1-b and 3
 	 */
 	if (mtu < IPV6_MMTU) {
 		/* path MTU cannot be less than IPV6_MMTU */
 		error = EMSGSIZE;
 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
 		goto bad;
 	} else if (ip6->ip6_plen == 0) {
 		/* jumbo payload cannot be fragmented */
 		error = EMSGSIZE;
 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
 		goto bad;
 	} else {
 		u_char nextproto;
 
 		/*
 		 * Too large for the destination or interface;
 		 * fragment if possible.
 		 * Must be able to put at least 8 bytes per fragment.
 		 */
 		hlen = unfragpartlen;
 		if (mtu > IPV6_MAXPACKET)
 			mtu = IPV6_MAXPACKET;
 
 		len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7;
 		if (len < 8) {
 			error = EMSGSIZE;
 			in6_ifstat_inc(ifp, ifs6_out_fragfail);
 			goto bad;
 		}
 
 		/*
 		 * If the interface will not calculate checksums on
 		 * fragmented packets, then do it here.
 		 * XXX-BZ handle the hw offloading case.  Need flags.
 		 */
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 			in6_delayed_cksum(m, plen, hlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
 		}
 #ifdef SCTP
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
 			sctp_delayed_cksum(m, hlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
 		}
 #endif
 		/*
 		 * Change the next header field of the last header in the
 		 * unfragmentable part.
 		 */
 		if (exthdrs.ip6e_rthdr) {
 			nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
 			*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
 		} else if (exthdrs.ip6e_dest1) {
 			nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
 			*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
 		} else if (exthdrs.ip6e_hbh) {
 			nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
 			*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
 		} else {
 			nextproto = ip6->ip6_nxt;
 			ip6->ip6_nxt = IPPROTO_FRAGMENT;
 		}
 
 		/*
 		 * Loop through length of segment after first fragment,
 		 * make new header and copy data of each part and link onto
 		 * chain.
 		 */
 		m0 = m;
 		id = htonl(ip6_randomid());
 		if ((error = ip6_fragment(ifp, m, hlen, nextproto, len, id)))
 			goto sendorfree;
 
 		in6_ifstat_inc(ifp, ifs6_out_fragok);
 	}
 
 	/*
 	 * Remove leading garbages.
 	 */
 sendorfree:
 	m = m0->m_nextpkt;
 	m0->m_nextpkt = 0;
 	m_freem(m0);
 	for (; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
 		if (error == 0) {
 			/* Record statistics for this interface address. */
 			if (ia) {
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_obytes,
 				    m->m_pkthdr.len);
 			}
-#ifdef RATELIMIT
-			if (inp != NULL) {
-				if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
-					in_pcboutput_txrtlmt(inp, ifp, m);
-				/* stamp send tag on mbuf */
-				m->m_pkthdr.snd_tag = inp->inp_snd_tag;
-			} else {
-				m->m_pkthdr.snd_tag = NULL;
-			}
-#endif
-			error = nd6_output_ifp(ifp, origifp, m, dst,
-			    (struct route *)ro);
-#ifdef RATELIMIT
-			/* check for route change */
-			if (error == EAGAIN)
-				in_pcboutput_eagain(inp);
-#endif
+			error = ip6_output_send(inp, ifp, origifp, m, dst, ro);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		IP6STAT_INC(ip6s_fragmented);
 
 done:
 	if (ro == &ip6route)
 		RO_RTFREE(ro);
 	return (error);
 
 freehdrs:
 	m_freem(exthdrs.ip6e_hbh);	/* m_freem will check if mbuf is 0 */
 	m_freem(exthdrs.ip6e_dest1);
 	m_freem(exthdrs.ip6e_rthdr);
 	m_freem(exthdrs.ip6e_dest2);
 	/* FALLTHROUGH */
 bad:
 	if (m)
 		m_freem(m);
 	goto done;
 }
 
 static int
 ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen)
 {
 	struct mbuf *m;
 
 	if (hlen > MCLBYTES)
 		return (ENOBUFS); /* XXX */
 
 	if (hlen > MLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, 0);
 	else
 		m = m_get(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOBUFS);
 	m->m_len = hlen;
 	if (hdr)
 		bcopy(hdr, mtod(m, caddr_t), hlen);
 
 	*mp = m;
 	return (0);
 }
 
 /*
  * Insert jumbo payload option.
  */
 static int
 ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
 {
 	struct mbuf *mopt;
 	u_char *optbuf;
 	u_int32_t v;
 
 #define JUMBOOPTLEN	8	/* length of jumbo payload option and padding */
 
 	/*
 	 * If there is no hop-by-hop options header, allocate new one.
 	 * If there is one but it doesn't have enough space to store the
 	 * jumbo payload option, allocate a cluster to store the whole options.
 	 * Otherwise, use it to store the options.
 	 */
 	if (exthdrs->ip6e_hbh == NULL) {
 		mopt = m_get(M_NOWAIT, MT_DATA);
 		if (mopt == NULL)
 			return (ENOBUFS);
 		mopt->m_len = JUMBOOPTLEN;
 		optbuf = mtod(mopt, u_char *);
 		optbuf[1] = 0;	/* = ((JUMBOOPTLEN) >> 3) - 1 */
 		exthdrs->ip6e_hbh = mopt;
 	} else {
 		struct ip6_hbh *hbh;
 
 		mopt = exthdrs->ip6e_hbh;
 		if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) {
 			/*
 			 * XXX assumption:
 			 * - exthdrs->ip6e_hbh is not referenced from places
 			 *   other than exthdrs.
 			 * - exthdrs->ip6e_hbh is not an mbuf chain.
 			 */
 			int oldoptlen = mopt->m_len;
 			struct mbuf *n;
 
 			/*
 			 * XXX: give up if the whole (new) hbh header does
 			 * not fit even in an mbuf cluster.
 			 */
 			if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
 				return (ENOBUFS);
 
 			/*
 			 * As a consequence, we must always prepare a cluster
 			 * at this point.
 			 */
 			n = m_getcl(M_NOWAIT, MT_DATA, 0);
 			if (n == NULL)
 				return (ENOBUFS);
 			n->m_len = oldoptlen + JUMBOOPTLEN;
 			bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t),
 			    oldoptlen);
 			optbuf = mtod(n, caddr_t) + oldoptlen;
 			m_freem(mopt);
 			mopt = exthdrs->ip6e_hbh = n;
 		} else {
 			optbuf = mtod(mopt, u_char *) + mopt->m_len;
 			mopt->m_len += JUMBOOPTLEN;
 		}
 		optbuf[0] = IP6OPT_PADN;
 		optbuf[1] = 1;
 
 		/*
 		 * Adjust the header length according to the pad and
 		 * the jumbo payload option.
 		 */
 		hbh = mtod(mopt, struct ip6_hbh *);
 		hbh->ip6h_len += (JUMBOOPTLEN >> 3);
 	}
 
 	/* fill in the option. */
 	optbuf[2] = IP6OPT_JUMBO;
 	optbuf[3] = 4;
 	v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
 	bcopy(&v, &optbuf[4], sizeof(u_int32_t));
 
 	/* finally, adjust the packet header length */
 	exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
 
 	return (0);
 #undef JUMBOOPTLEN
 }
 
 /*
  * Insert fragment header and copy unfragmentable header portions.
  */
 static int
 ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
     struct ip6_frag **frghdrp)
 {
 	struct mbuf *n, *mlast;
 
 	if (hlen > sizeof(struct ip6_hdr)) {
 		n = m_copym(m0, sizeof(struct ip6_hdr),
 		    hlen - sizeof(struct ip6_hdr), M_NOWAIT);
 		if (n == NULL)
 			return (ENOBUFS);
 		m->m_next = n;
 	} else
 		n = m;
 
 	/* Search for the last mbuf of unfragmentable part. */
 	for (mlast = n; mlast->m_next; mlast = mlast->m_next)
 		;
 
 	if (M_WRITABLE(mlast) &&
 	    M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) {
 		/* use the trailing space of the last mbuf for the fragment hdr */
 		*frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) +
 		    mlast->m_len);
 		mlast->m_len += sizeof(struct ip6_frag);
 		m->m_pkthdr.len += sizeof(struct ip6_frag);
 	} else {
 		/* allocate a new mbuf for the fragment header */
 		struct mbuf *mfrg;
 
 		mfrg = m_get(M_NOWAIT, MT_DATA);
 		if (mfrg == NULL)
 			return (ENOBUFS);
 		mfrg->m_len = sizeof(struct ip6_frag);
 		*frghdrp = mtod(mfrg, struct ip6_frag *);
 		mlast->m_next = mfrg;
 	}
 
 	return (0);
 }
 
 /*
  * Calculates IPv6 path mtu for destination @dst.
  * Resulting MTU is stored in @mtup.
  *
  * Returns 0 on success.
  */
 static int
 ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup)
 {
 	struct nhop6_extended nh6;
 	struct in6_addr kdst;
 	uint32_t scopeid;
 	struct ifnet *ifp;
 	u_long mtu;
 	int error;
 
 	in6_splitscope(dst, &kdst, &scopeid);
 	if (fib6_lookup_nh_ext(fibnum, &kdst, scopeid, NHR_REF, 0, &nh6) != 0)
 		return (EHOSTUNREACH);
 
 	ifp = nh6.nh_ifp;
 	mtu = nh6.nh_mtu;
 
 	error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL, 0);
 	fib6_free_nh_ext(fibnum, &nh6);
 
 	return (error);
 }
 
 /*
  * Calculates IPv6 path MTU for @dst based on transmit @ifp,
  * and cached data in @ro_pmtu.
  * MTU from (successful) route lookup is saved (along with dst)
  * inside @ro_pmtu to avoid subsequent route lookups after packet
  * filter processing.
  *
  * Stores mtu and always-frag value into @mtup and @alwaysfragp.
  * Returns 0 on success.
  */
 static int
 ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup,
     struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup,
     int *alwaysfragp, u_int fibnum, u_int proto)
 {
 	struct nhop6_basic nh6;
 	struct in6_addr kdst;
 	uint32_t scopeid;
 	struct sockaddr_in6 *sa6_dst;
 	u_long mtu;
 
 	mtu = 0;
 	if (do_lookup) {
 
 		/*
 		 * Here ro_pmtu has final destination address, while
 		 * ro might represent immediate destination.
 		 * Use ro_pmtu destination since mtu might differ.
 		 */
 		sa6_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst;
 		if (!IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))
 			ro_pmtu->ro_mtu = 0;
 
 		if (ro_pmtu->ro_mtu == 0) {
 			bzero(sa6_dst, sizeof(*sa6_dst));
 			sa6_dst->sin6_family = AF_INET6;
 			sa6_dst->sin6_len = sizeof(struct sockaddr_in6);
 			sa6_dst->sin6_addr = *dst;
 
 			in6_splitscope(dst, &kdst, &scopeid);
 			if (fib6_lookup_nh_basic(fibnum, &kdst, scopeid, 0, 0,
 			    &nh6) == 0)
 				ro_pmtu->ro_mtu = nh6.nh_mtu;
 		}
 
 		mtu = ro_pmtu->ro_mtu;
 	}
 
 	if (ro_pmtu->ro_rt)
 		mtu = ro_pmtu->ro_rt->rt_mtu;
 
 	return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto));
 }
 
 /*
  * Calculate MTU based on transmit @ifp, route mtu @rt_mtu and
  * hostcache data for @dst.
  * Stores mtu and always-frag value into @mtup and @alwaysfragp.
  *
  * Returns 0 on success.
  */
 static int
 ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu,
     u_long *mtup, int *alwaysfragp, u_int proto)
 {
 	u_long mtu = 0;
 	int alwaysfrag = 0;
 	int error = 0;
 
 	if (rt_mtu > 0) {
 		u_int32_t ifmtu;
 		struct in_conninfo inc;
 
 		bzero(&inc, sizeof(inc));
 		inc.inc_flags |= INC_ISIPV6;
 		inc.inc6_faddr = *dst;
 
 		ifmtu = IN6_LINKMTU(ifp);
 
 		/* TCP is known to react to pmtu changes so skip hc */
 		if (proto != IPPROTO_TCP)
 			mtu = tcp_hc_getmtu(&inc);
 
 		if (mtu)
 			mtu = min(mtu, rt_mtu);
 		else
 			mtu = rt_mtu;
 		if (mtu == 0)
 			mtu = ifmtu;
 		else if (mtu < IPV6_MMTU) {
 			/*
 			 * RFC2460 section 5, last paragraph:
 			 * if we record ICMPv6 too big message with
 			 * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU
 			 * or smaller, with framgent header attached.
 			 * (fragment header is needed regardless from the
 			 * packet size, for translators to identify packets)
 			 */
 			alwaysfrag = 1;
 			mtu = IPV6_MMTU;
 		}
 	} else if (ifp) {
 		mtu = IN6_LINKMTU(ifp);
 	} else
 		error = EHOSTUNREACH; /* XXX */
 
 	*mtup = mtu;
 	if (alwaysfragp)
 		*alwaysfragp = alwaysfrag;
 	return (error);
 }
 
 /*
  * IP6 socket option processing.
  */
 int
 ip6_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int optdatalen, uproto;
 	void *optdata;
 	struct inpcb *in6p = sotoinpcb(so);
 	int error, optval;
 	int level, op, optname;
 	int optlen;
 	struct thread *td;
 #ifdef	RSS
 	uint32_t rss_bucket;
 	int retval;
 #endif
 
 /*
  * Don't use more than a quarter of mbuf clusters.  N.B.:
  * nmbclusters is an int, but nmbclusters * MCLBYTES may overflow
  * on LP64 architectures, so cast to u_long to avoid undefined
  * behavior.  ILP32 architectures cannot have nmbclusters
  * large enough to overflow for other reasons.
  */
 #define IPV6_PKTOPTIONS_MBUF_LIMIT	((u_long)nmbclusters * MCLBYTES / 4)
 
 	level = sopt->sopt_level;
 	op = sopt->sopt_dir;
 	optname = sopt->sopt_name;
 	optlen = sopt->sopt_valsize;
 	td = sopt->sopt_td;
 	error = 0;
 	optval = 0;
 	uproto = (int)so->so_proto->pr_protocol;
 
 	if (level != IPPROTO_IPV6) {
 		error = EINVAL;
 
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_dir == SOPT_SET) {
 			switch (sopt->sopt_name) {
 			case SO_REUSEADDR:
 				INP_WLOCK(in6p);
 				if ((so->so_options & SO_REUSEADDR) != 0)
 					in6p->inp_flags2 |= INP_REUSEADDR;
 				else
 					in6p->inp_flags2 &= ~INP_REUSEADDR;
 				INP_WUNLOCK(in6p);
 				error = 0;
 				break;
 			case SO_REUSEPORT:
 				INP_WLOCK(in6p);
 				if ((so->so_options & SO_REUSEPORT) != 0)
 					in6p->inp_flags2 |= INP_REUSEPORT;
 				else
 					in6p->inp_flags2 &= ~INP_REUSEPORT;
 				INP_WUNLOCK(in6p);
 				error = 0;
 				break;
 			case SO_REUSEPORT_LB:
 				INP_WLOCK(in6p);
 				if ((so->so_options & SO_REUSEPORT_LB) != 0)
 					in6p->inp_flags2 |= INP_REUSEPORT_LB;
 				else
 					in6p->inp_flags2 &= ~INP_REUSEPORT_LB;
 				INP_WUNLOCK(in6p);
 				error = 0;
 				break;
 			case SO_SETFIB:
 				INP_WLOCK(in6p);
 				in6p->inp_inc.inc_fibnum = so->so_fibnum;
 				INP_WUNLOCK(in6p);
 				error = 0;
 				break;
 			case SO_MAX_PACING_RATE:
 #ifdef RATELIMIT
 				INP_WLOCK(in6p);
 				in6p->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 				INP_WUNLOCK(in6p);
 				error = 0;
 #else
 				error = EOPNOTSUPP;
 #endif
 				break;
 			default:
 				break;
 			}
 		}
 	} else {		/* level == IPPROTO_IPV6 */
 		switch (op) {
 
 		case SOPT_SET:
 			switch (optname) {
 			case IPV6_2292PKTOPTIONS:
 #ifdef IPV6_PKTOPTIONS
 			case IPV6_PKTOPTIONS:
 #endif
 			{
 				struct mbuf *m;
 
 				if (optlen > IPV6_PKTOPTIONS_MBUF_LIMIT) {
 					printf("ip6_ctloutput: mbuf limit hit\n");
 					error = ENOBUFS;
 					break;
 				}
 
 				error = soopt_getm(sopt, &m); /* XXX */
 				if (error != 0)
 					break;
 				error = soopt_mcopyin(sopt, m); /* XXX */
 				if (error != 0)
 					break;
 				error = ip6_pcbopts(&in6p->in6p_outputopts,
 						    m, so, sopt);
 				m_freem(m); /* XXX */
 				break;
 			}
 
 			/*
 			 * Use of some Hop-by-Hop options or some
 			 * Destination options, might require special
 			 * privilege.  That is, normal applications
 			 * (without special privilege) might be forbidden
 			 * from setting certain options in outgoing packets,
 			 * and might never see certain options in received
 			 * packets. [RFC 2292 Section 6]
 			 * KAME specific note:
 			 *  KAME prevents non-privileged users from sending or
 			 *  receiving ANY hbh/dst options in order to avoid
 			 *  overhead of parsing options in the kernel.
 			 */
 			case IPV6_RECVHOPOPTS:
 			case IPV6_RECVDSTOPTS:
 			case IPV6_RECVRTHDRDSTOPTS:
 				if (td != NULL) {
 					error = priv_check(td,
 					    PRIV_NETINET_SETHDROPTS);
 					if (error)
 						break;
 				}
 				/* FALLTHROUGH */
 			case IPV6_UNICAST_HOPS:
 			case IPV6_HOPLIMIT:
 
 			case IPV6_RECVPKTINFO:
 			case IPV6_RECVHOPLIMIT:
 			case IPV6_RECVRTHDR:
 			case IPV6_RECVPATHMTU:
 			case IPV6_RECVTCLASS:
 			case IPV6_RECVFLOWID:
 #ifdef	RSS
 			case IPV6_RECVRSSBUCKETID:
 #endif
 			case IPV6_V6ONLY:
 			case IPV6_AUTOFLOWLABEL:
 			case IPV6_ORIGDSTADDR:
 			case IPV6_BINDANY:
 			case IPV6_BINDMULTI:
 #ifdef	RSS
 			case IPV6_RSS_LISTEN_BUCKET:
 #endif
 				if (optname == IPV6_BINDANY && td != NULL) {
 					error = priv_check(td,
 					    PRIV_NETINET_BINDANY);
 					if (error)
 						break;
 				}
 
 				if (optlen != sizeof(int)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				switch (optname) {
 
 				case IPV6_UNICAST_HOPS:
 					if (optval < -1 || optval >= 256)
 						error = EINVAL;
 					else {
 						/* -1 = kernel default */
 						in6p->in6p_hops = optval;
 						if ((in6p->inp_vflag &
 						     INP_IPV4) != 0)
 							in6p->inp_ip_ttl = optval;
 					}
 					break;
 #define OPTSET(bit) \
 do { \
 	INP_WLOCK(in6p); \
 	if (optval) \
 		in6p->inp_flags |= (bit); \
 	else \
 		in6p->inp_flags &= ~(bit); \
 	INP_WUNLOCK(in6p); \
 } while (/*CONSTCOND*/ 0)
 #define OPTSET2292(bit) \
 do { \
 	INP_WLOCK(in6p); \
 	in6p->inp_flags |= IN6P_RFC2292; \
 	if (optval) \
 		in6p->inp_flags |= (bit); \
 	else \
 		in6p->inp_flags &= ~(bit); \
 	INP_WUNLOCK(in6p); \
 } while (/*CONSTCOND*/ 0)
 #define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0)
 
 #define OPTSET2_N(bit, val) do {					\
 	if (val)							\
 		in6p->inp_flags2 |= bit;				\
 	else								\
 		in6p->inp_flags2 &= ~bit;				\
 } while (0)
 #define OPTSET2(bit, val) do {						\
 	INP_WLOCK(in6p);						\
 	OPTSET2_N(bit, val);						\
 	INP_WUNLOCK(in6p);						\
 } while (0)
 #define OPTBIT2(bit) (in6p->inp_flags2 & (bit) ? 1 : 0)
 #define OPTSET2292_EXCLUSIVE(bit)					\
 do {									\
 	INP_WLOCK(in6p);						\
 	if (OPTBIT(IN6P_RFC2292)) {					\
 		error = EINVAL;						\
 	} else {							\
 		if (optval)						\
 			in6p->inp_flags |= (bit);			\
 		else							\
 			in6p->inp_flags &= ~(bit);			\
 	}								\
 	INP_WUNLOCK(in6p);						\
 } while (/*CONSTCOND*/ 0)
 
 				case IPV6_RECVPKTINFO:
 					OPTSET2292_EXCLUSIVE(IN6P_PKTINFO);
 					break;
 
 				case IPV6_HOPLIMIT:
 				{
 					struct ip6_pktopts **optp;
 
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					INP_WLOCK(in6p);
 					if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 						INP_WUNLOCK(in6p);
 						return (ECONNRESET);
 					}
 					optp = &in6p->in6p_outputopts;
 					error = ip6_pcbopt(IPV6_HOPLIMIT,
 					    (u_char *)&optval, sizeof(optval),
 					    optp, (td != NULL) ? td->td_ucred :
 					    NULL, uproto);
 					INP_WUNLOCK(in6p);
 					break;
 				}
 
 				case IPV6_RECVHOPLIMIT:
 					OPTSET2292_EXCLUSIVE(IN6P_HOPLIMIT);
 					break;
 
 				case IPV6_RECVHOPOPTS:
 					OPTSET2292_EXCLUSIVE(IN6P_HOPOPTS);
 					break;
 
 				case IPV6_RECVDSTOPTS:
 					OPTSET2292_EXCLUSIVE(IN6P_DSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDRDSTOPTS:
 					OPTSET2292_EXCLUSIVE(IN6P_RTHDRDSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDR:
 					OPTSET2292_EXCLUSIVE(IN6P_RTHDR);
 					break;
 
 				case IPV6_RECVPATHMTU:
 					/*
 					 * We ignore this option for TCP
 					 * sockets.
 					 * (RFC3542 leaves this case
 					 * unspecified.)
 					 */
 					if (uproto != IPPROTO_TCP)
 						OPTSET(IN6P_MTU);
 					break;
 
 				case IPV6_RECVFLOWID:
 					OPTSET2(INP_RECVFLOWID, optval);
 					break;
 
 #ifdef	RSS
 				case IPV6_RECVRSSBUCKETID:
 					OPTSET2(INP_RECVRSSBUCKETID, optval);
 					break;
 #endif
 
 				case IPV6_V6ONLY:
 					/*
 					 * make setsockopt(IPV6_V6ONLY)
 					 * available only prior to bind(2).
 					 * see ipng mailing list, Jun 22 2001.
 					 */
 					if (in6p->inp_lport ||
 					    !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_IPV6_V6ONLY);
 					if (optval)
 						in6p->inp_vflag &= ~INP_IPV4;
 					else
 						in6p->inp_vflag |= INP_IPV4;
 					break;
 				case IPV6_RECVTCLASS:
 					/* cannot mix with RFC2292 XXX */
 					OPTSET2292_EXCLUSIVE(IN6P_TCLASS);
 					break;
 				case IPV6_AUTOFLOWLABEL:
 					OPTSET(IN6P_AUTOFLOWLABEL);
 					break;
 
 				case IPV6_ORIGDSTADDR:
 					OPTSET2(INP_ORIGDSTADDR, optval);
 					break;
 				case IPV6_BINDANY:
 					OPTSET(INP_BINDANY);
 					break;
 
 				case IPV6_BINDMULTI:
 					OPTSET2(INP_BINDMULTI, optval);
 					break;
 #ifdef	RSS
 				case IPV6_RSS_LISTEN_BUCKET:
 					if ((optval >= 0) &&
 					    (optval < rss_getnumbuckets())) {
 						INP_WLOCK(in6p);
 						in6p->inp_rss_listen_bucket = optval;
 						OPTSET2_N(INP_RSS_BUCKET_SET, 1);
 						INP_WUNLOCK(in6p);
 					} else {
 						error = EINVAL;
 					}
 					break;
 #endif
 				}
 				break;
 
 			case IPV6_TCLASS:
 			case IPV6_DONTFRAG:
 			case IPV6_USE_MIN_MTU:
 			case IPV6_PREFER_TEMPADDR:
 				if (optlen != sizeof(optval)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				{
 					struct ip6_pktopts **optp;
 					INP_WLOCK(in6p);
 					if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 						INP_WUNLOCK(in6p);
 						return (ECONNRESET);
 					}
 					optp = &in6p->in6p_outputopts;
 					error = ip6_pcbopt(optname,
 					    (u_char *)&optval, sizeof(optval),
 					    optp, (td != NULL) ? td->td_ucred :
 					    NULL, uproto);
 					INP_WUNLOCK(in6p);
 					break;
 				}
 
 			case IPV6_2292PKTINFO:
 			case IPV6_2292HOPLIMIT:
 			case IPV6_2292HOPOPTS:
 			case IPV6_2292DSTOPTS:
 			case IPV6_2292RTHDR:
 				/* RFC 2292 */
 				if (optlen != sizeof(int)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				switch (optname) {
 				case IPV6_2292PKTINFO:
 					OPTSET2292(IN6P_PKTINFO);
 					break;
 				case IPV6_2292HOPLIMIT:
 					OPTSET2292(IN6P_HOPLIMIT);
 					break;
 				case IPV6_2292HOPOPTS:
 					/*
 					 * Check super-user privilege.
 					 * See comments for IPV6_RECVHOPOPTS.
 					 */
 					if (td != NULL) {
 						error = priv_check(td,
 						    PRIV_NETINET_SETHDROPTS);
 						if (error)
 							return (error);
 					}
 					OPTSET2292(IN6P_HOPOPTS);
 					break;
 				case IPV6_2292DSTOPTS:
 					if (td != NULL) {
 						error = priv_check(td,
 						    PRIV_NETINET_SETHDROPTS);
 						if (error)
 							return (error);
 					}
 					OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
 					break;
 				case IPV6_2292RTHDR:
 					OPTSET2292(IN6P_RTHDR);
 					break;
 				}
 				break;
 			case IPV6_PKTINFO:
 			case IPV6_HOPOPTS:
 			case IPV6_RTHDR:
 			case IPV6_DSTOPTS:
 			case IPV6_RTHDRDSTOPTS:
 			case IPV6_NEXTHOP:
 			{
 				/* new advanced API (RFC3542) */
 				u_char *optbuf;
 				u_char optbuf_storage[MCLBYTES];
 				int optlen;
 				struct ip6_pktopts **optp;
 
 				/* cannot mix with RFC2292 */
 				if (OPTBIT(IN6P_RFC2292)) {
 					error = EINVAL;
 					break;
 				}
 
 				/*
 				 * We only ensure valsize is not too large
 				 * here.  Further validation will be done
 				 * later.
 				 */
 				error = sooptcopyin(sopt, optbuf_storage,
 				    sizeof(optbuf_storage), 0);
 				if (error)
 					break;
 				optlen = sopt->sopt_valsize;
 				optbuf = optbuf_storage;
 				INP_WLOCK(in6p);
 				if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 					INP_WUNLOCK(in6p);
 					return (ECONNRESET);
 				}
 				optp = &in6p->in6p_outputopts;
 				error = ip6_pcbopt(optname, optbuf, optlen,
 				    optp, (td != NULL) ? td->td_ucred : NULL,
 				    uproto);
 				INP_WUNLOCK(in6p);
 				break;
 			}
 #undef OPTSET
 
 			case IPV6_MULTICAST_IF:
 			case IPV6_MULTICAST_HOPS:
 			case IPV6_MULTICAST_LOOP:
 			case IPV6_JOIN_GROUP:
 			case IPV6_LEAVE_GROUP:
 			case IPV6_MSFILTER:
 			case MCAST_BLOCK_SOURCE:
 			case MCAST_UNBLOCK_SOURCE:
 			case MCAST_JOIN_GROUP:
 			case MCAST_LEAVE_GROUP:
 			case MCAST_JOIN_SOURCE_GROUP:
 			case MCAST_LEAVE_SOURCE_GROUP:
 				error = ip6_setmoptions(in6p, sopt);
 				break;
 
 			case IPV6_PORTRANGE:
 				error = sooptcopyin(sopt, &optval,
 				    sizeof optval, sizeof optval);
 				if (error)
 					break;
 
 				INP_WLOCK(in6p);
 				switch (optval) {
 				case IPV6_PORTRANGE_DEFAULT:
 					in6p->inp_flags &= ~(INP_LOWPORT);
 					in6p->inp_flags &= ~(INP_HIGHPORT);
 					break;
 
 				case IPV6_PORTRANGE_HIGH:
 					in6p->inp_flags &= ~(INP_LOWPORT);
 					in6p->inp_flags |= INP_HIGHPORT;
 					break;
 
 				case IPV6_PORTRANGE_LOW:
 					in6p->inp_flags &= ~(INP_HIGHPORT);
 					in6p->inp_flags |= INP_LOWPORT;
 					break;
 
 				default:
 					error = EINVAL;
 					break;
 				}
 				INP_WUNLOCK(in6p);
 				break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 			case IPV6_IPSEC_POLICY:
 				if (IPSEC_ENABLED(ipv6)) {
 					error = IPSEC_PCBCTL(ipv6, in6p, sopt);
 					break;
 				}
 				/* FALLTHROUGH */
 #endif /* IPSEC */
 
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			break;
 
 		case SOPT_GET:
 			switch (optname) {
 
 			case IPV6_2292PKTOPTIONS:
 #ifdef IPV6_PKTOPTIONS
 			case IPV6_PKTOPTIONS:
 #endif
 				/*
 				 * RFC3542 (effectively) deprecated the
 				 * semantics of the 2292-style pktoptions.
 				 * Since it was not reliable in nature (i.e.,
 				 * applications had to expect the lack of some
 				 * information after all), it would make sense
 				 * to simplify this part by always returning
 				 * empty data.
 				 */
 				sopt->sopt_valsize = 0;
 				break;
 
 			case IPV6_RECVHOPOPTS:
 			case IPV6_RECVDSTOPTS:
 			case IPV6_RECVRTHDRDSTOPTS:
 			case IPV6_UNICAST_HOPS:
 			case IPV6_RECVPKTINFO:
 			case IPV6_RECVHOPLIMIT:
 			case IPV6_RECVRTHDR:
 			case IPV6_RECVPATHMTU:
 
 			case IPV6_V6ONLY:
 			case IPV6_PORTRANGE:
 			case IPV6_RECVTCLASS:
 			case IPV6_AUTOFLOWLABEL:
 			case IPV6_BINDANY:
 			case IPV6_FLOWID:
 			case IPV6_FLOWTYPE:
 			case IPV6_RECVFLOWID:
 #ifdef	RSS
 			case IPV6_RSSBUCKETID:
 			case IPV6_RECVRSSBUCKETID:
 #endif
 			case IPV6_BINDMULTI:
 				switch (optname) {
 
 				case IPV6_RECVHOPOPTS:
 					optval = OPTBIT(IN6P_HOPOPTS);
 					break;
 
 				case IPV6_RECVDSTOPTS:
 					optval = OPTBIT(IN6P_DSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDRDSTOPTS:
 					optval = OPTBIT(IN6P_RTHDRDSTOPTS);
 					break;
 
 				case IPV6_UNICAST_HOPS:
 					optval = in6p->in6p_hops;
 					break;
 
 				case IPV6_RECVPKTINFO:
 					optval = OPTBIT(IN6P_PKTINFO);
 					break;
 
 				case IPV6_RECVHOPLIMIT:
 					optval = OPTBIT(IN6P_HOPLIMIT);
 					break;
 
 				case IPV6_RECVRTHDR:
 					optval = OPTBIT(IN6P_RTHDR);
 					break;
 
 				case IPV6_RECVPATHMTU:
 					optval = OPTBIT(IN6P_MTU);
 					break;
 
 				case IPV6_V6ONLY:
 					optval = OPTBIT(IN6P_IPV6_V6ONLY);
 					break;
 
 				case IPV6_PORTRANGE:
 				    {
 					int flags;
 					flags = in6p->inp_flags;
 					if (flags & INP_HIGHPORT)
 						optval = IPV6_PORTRANGE_HIGH;
 					else if (flags & INP_LOWPORT)
 						optval = IPV6_PORTRANGE_LOW;
 					else
 						optval = 0;
 					break;
 				    }
 				case IPV6_RECVTCLASS:
 					optval = OPTBIT(IN6P_TCLASS);
 					break;
 
 				case IPV6_AUTOFLOWLABEL:
 					optval = OPTBIT(IN6P_AUTOFLOWLABEL);
 					break;
 
 				case IPV6_ORIGDSTADDR:
 					optval = OPTBIT2(INP_ORIGDSTADDR);
 					break;
 
 				case IPV6_BINDANY:
 					optval = OPTBIT(INP_BINDANY);
 					break;
 
 				case IPV6_FLOWID:
 					optval = in6p->inp_flowid;
 					break;
 
 				case IPV6_FLOWTYPE:
 					optval = in6p->inp_flowtype;
 					break;
 
 				case IPV6_RECVFLOWID:
 					optval = OPTBIT2(INP_RECVFLOWID);
 					break;
 #ifdef	RSS
 				case IPV6_RSSBUCKETID:
 					retval =
 					    rss_hash2bucket(in6p->inp_flowid,
 					    in6p->inp_flowtype,
 					    &rss_bucket);
 					if (retval == 0)
 						optval = rss_bucket;
 					else
 						error = EINVAL;
 					break;
 
 				case IPV6_RECVRSSBUCKETID:
 					optval = OPTBIT2(INP_RECVRSSBUCKETID);
 					break;
 #endif
 
 				case IPV6_BINDMULTI:
 					optval = OPTBIT2(INP_BINDMULTI);
 					break;
 
 				}
 				if (error)
 					break;
 				error = sooptcopyout(sopt, &optval,
 					sizeof optval);
 				break;
 
 			case IPV6_PATHMTU:
 			{
 				u_long pmtu = 0;
 				struct ip6_mtuinfo mtuinfo;
 				struct in6_addr addr;
 
 				if (!(so->so_state & SS_ISCONNECTED))
 					return (ENOTCONN);
 				/*
 				 * XXX: we dot not consider the case of source
 				 * routing, or optional information to specify
 				 * the outgoing interface.
 				 * Copy faddr out of in6p to avoid holding lock
 				 * on inp during route lookup.
 				 */
 				INP_RLOCK(in6p);
 				bcopy(&in6p->in6p_faddr, &addr, sizeof(addr));
 				INP_RUNLOCK(in6p);
 				error = ip6_getpmtu_ctl(so->so_fibnum,
 				    &addr, &pmtu);
 				if (error)
 					break;
 				if (pmtu > IPV6_MAXPACKET)
 					pmtu = IPV6_MAXPACKET;
 
 				bzero(&mtuinfo, sizeof(mtuinfo));
 				mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
 				optdata = (void *)&mtuinfo;
 				optdatalen = sizeof(mtuinfo);
 				error = sooptcopyout(sopt, optdata,
 				    optdatalen);
 				break;
 			}
 
 			case IPV6_2292PKTINFO:
 			case IPV6_2292HOPLIMIT:
 			case IPV6_2292HOPOPTS:
 			case IPV6_2292RTHDR:
 			case IPV6_2292DSTOPTS:
 				switch (optname) {
 				case IPV6_2292PKTINFO:
 					optval = OPTBIT(IN6P_PKTINFO);
 					break;
 				case IPV6_2292HOPLIMIT:
 					optval = OPTBIT(IN6P_HOPLIMIT);
 					break;
 				case IPV6_2292HOPOPTS:
 					optval = OPTBIT(IN6P_HOPOPTS);
 					break;
 				case IPV6_2292RTHDR:
 					optval = OPTBIT(IN6P_RTHDR);
 					break;
 				case IPV6_2292DSTOPTS:
 					optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
 					break;
 				}
 				error = sooptcopyout(sopt, &optval,
 				    sizeof optval);
 				break;
 			case IPV6_PKTINFO:
 			case IPV6_HOPOPTS:
 			case IPV6_RTHDR:
 			case IPV6_DSTOPTS:
 			case IPV6_RTHDRDSTOPTS:
 			case IPV6_NEXTHOP:
 			case IPV6_TCLASS:
 			case IPV6_DONTFRAG:
 			case IPV6_USE_MIN_MTU:
 			case IPV6_PREFER_TEMPADDR:
 				error = ip6_getpcbopt(in6p, optname, sopt);
 				break;
 
 			case IPV6_MULTICAST_IF:
 			case IPV6_MULTICAST_HOPS:
 			case IPV6_MULTICAST_LOOP:
 			case IPV6_MSFILTER:
 				error = ip6_getmoptions(in6p, sopt);
 				break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 			case IPV6_IPSEC_POLICY:
 				if (IPSEC_ENABLED(ipv6)) {
 					error = IPSEC_PCBCTL(ipv6, in6p, sopt);
 					break;
 				}
 				/* FALLTHROUGH */
 #endif /* IPSEC */
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			break;
 		}
 	}
 	return (error);
 }
 
 int
 ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int error = 0, optval, optlen;
 	const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
 	struct inpcb *in6p = sotoinpcb(so);
 	int level, op, optname;
 
 	level = sopt->sopt_level;
 	op = sopt->sopt_dir;
 	optname = sopt->sopt_name;
 	optlen = sopt->sopt_valsize;
 
 	if (level != IPPROTO_IPV6) {
 		return (EINVAL);
 	}
 
 	switch (optname) {
 	case IPV6_CHECKSUM:
 		/*
 		 * For ICMPv6 sockets, no modification allowed for checksum
 		 * offset, permit "no change" values to help existing apps.
 		 *
 		 * RFC3542 says: "An attempt to set IPV6_CHECKSUM
 		 * for an ICMPv6 socket will fail."
 		 * The current behavior does not meet RFC3542.
 		 */
 		switch (op) {
 		case SOPT_SET:
 			if (optlen != sizeof(int)) {
 				error = EINVAL;
 				break;
 			}
 			error = sooptcopyin(sopt, &optval, sizeof(optval),
 					    sizeof(optval));
 			if (error)
 				break;
 			if (optval < -1 || (optval % 2) != 0) {
 				/*
 				 * The API assumes non-negative even offset
 				 * values or -1 as a special value.
 				 */
 				error = EINVAL;
 			} else if (so->so_proto->pr_protocol ==
 			    IPPROTO_ICMPV6) {
 				if (optval != icmp6off)
 					error = EINVAL;
 			} else
 				in6p->in6p_cksum = optval;
 			break;
 
 		case SOPT_GET:
 			if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
 				optval = icmp6off;
 			else
 				optval = in6p->in6p_cksum;
 
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 
 	default:
 		error = ENOPROTOOPT;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Set up IP6 options in pcb for insertion in output packets or
  * specifying behavior of outgoing packets.
  */
 static int
 ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m,
     struct socket *so, struct sockopt *sopt)
 {
 	struct ip6_pktopts *opt = *pktopt;
 	int error = 0;
 	struct thread *td = sopt->sopt_td;
 
 	/* turn off any old options. */
 	if (opt) {
 #ifdef DIAGNOSTIC
 		if (opt->ip6po_pktinfo || opt->ip6po_nexthop ||
 		    opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 ||
 		    opt->ip6po_rhinfo.ip6po_rhi_rthdr)
 			printf("ip6_pcbopts: all specified options are cleared.\n");
 #endif
 		ip6_clearpktopts(opt, -1);
 	} else
 		opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK);
 	*pktopt = NULL;
 
 	if (!m || m->m_len == 0) {
 		/*
 		 * Only turning off any previous options, regardless of
 		 * whether the opt is just created or given.
 		 */
 		free(opt, M_IP6OPT);
 		return (0);
 	}
 
 	/*  set options specified by user. */
 	if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ?
 	    td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) {
 		ip6_clearpktopts(opt, -1); /* XXX: discard all options */
 		free(opt, M_IP6OPT);
 		return (error);
 	}
 	*pktopt = opt;
 	return (0);
 }
 
 /*
  * initialize ip6_pktopts.  beware that there are non-zero default values in
  * the struct.
  */
 void
 ip6_initpktopts(struct ip6_pktopts *opt)
 {
 
 	bzero(opt, sizeof(*opt));
 	opt->ip6po_hlim = -1;	/* -1 means default hop limit */
 	opt->ip6po_tclass = -1;	/* -1 means default traffic class */
 	opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
 	opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
 }
 
 static int
 ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
     struct ucred *cred, int uproto)
 {
 	struct ip6_pktopts *opt;
 
 	if (*pktopt == NULL) {
 		*pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT,
 		    M_NOWAIT);
 		if (*pktopt == NULL)
 			return (ENOBUFS);
 		ip6_initpktopts(*pktopt);
 	}
 	opt = *pktopt;
 
 	return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto));
 }
 
 #define GET_PKTOPT_VAR(field, lenexpr) do {					\
 	if (pktopt && pktopt->field) {						\
 		INP_RUNLOCK(in6p);						\
 		optdata = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK);		\
 		malloc_optdata = true;						\
 		INP_RLOCK(in6p);						\
 		if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {		\
 			INP_RUNLOCK(in6p);					\
 			free(optdata, M_TEMP);					\
 			return (ECONNRESET);					\
 		}								\
 		pktopt = in6p->in6p_outputopts;					\
 		if (pktopt && pktopt->field) {					\
 			optdatalen = min(lenexpr, sopt->sopt_valsize);		\
 			bcopy(&pktopt->field, optdata, optdatalen);		\
 		} else {							\
 			free(optdata, M_TEMP);					\
 			optdata = NULL;						\
 			malloc_optdata = false;					\
 		}								\
 	}									\
 } while(0)
 
 #define GET_PKTOPT_EXT_HDR(field) GET_PKTOPT_VAR(field,				\
 	(((struct ip6_ext *)pktopt->field)->ip6e_len + 1) << 3)
 
 #define GET_PKTOPT_SOCKADDR(field) GET_PKTOPT_VAR(field,			\
 	pktopt->field->sa_len)
 
 static int
 ip6_getpcbopt(struct inpcb *in6p, int optname, struct sockopt *sopt)
 {
 	void *optdata = NULL;
 	bool malloc_optdata = false;
 	int optdatalen = 0;
 	int error = 0;
 	struct in6_pktinfo null_pktinfo;
 	int deftclass = 0, on;
 	int defminmtu = IP6PO_MINMTU_MCASTONLY;
 	int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
 	struct ip6_pktopts *pktopt;
 
 	INP_RLOCK(in6p);
 	pktopt = in6p->in6p_outputopts;
 
 	switch (optname) {
 	case IPV6_PKTINFO:
 		optdata = (void *)&null_pktinfo;
 		if (pktopt && pktopt->ip6po_pktinfo) {
 			bcopy(pktopt->ip6po_pktinfo, &null_pktinfo,
 			    sizeof(null_pktinfo));
 			in6_clearscope(&null_pktinfo.ipi6_addr);
 		} else {
 			/* XXX: we don't have to do this every time... */
 			bzero(&null_pktinfo, sizeof(null_pktinfo));
 		}
 		optdatalen = sizeof(struct in6_pktinfo);
 		break;
 	case IPV6_TCLASS:
 		if (pktopt && pktopt->ip6po_tclass >= 0)
 			deftclass = pktopt->ip6po_tclass;
 		optdata = (void *)&deftclass;
 		optdatalen = sizeof(int);
 		break;
 	case IPV6_HOPOPTS:
 		GET_PKTOPT_EXT_HDR(ip6po_hbh);
 		break;
 	case IPV6_RTHDR:
 		GET_PKTOPT_EXT_HDR(ip6po_rthdr);
 		break;
 	case IPV6_RTHDRDSTOPTS:
 		GET_PKTOPT_EXT_HDR(ip6po_dest1);
 		break;
 	case IPV6_DSTOPTS:
 		GET_PKTOPT_EXT_HDR(ip6po_dest2);
 		break;
 	case IPV6_NEXTHOP:
 		GET_PKTOPT_SOCKADDR(ip6po_nexthop);
 		break;
 	case IPV6_USE_MIN_MTU:
 		if (pktopt)
 			defminmtu = pktopt->ip6po_minmtu;
 		optdata = (void *)&defminmtu;
 		optdatalen = sizeof(int);
 		break;
 	case IPV6_DONTFRAG:
 		if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
 			on = 1;
 		else
 			on = 0;
 		optdata = (void *)&on;
 		optdatalen = sizeof(on);
 		break;
 	case IPV6_PREFER_TEMPADDR:
 		if (pktopt)
 			defpreftemp = pktopt->ip6po_prefer_tempaddr;
 		optdata = (void *)&defpreftemp;
 		optdatalen = sizeof(int);
 		break;
 	default:		/* should not happen */
 #ifdef DIAGNOSTIC
 		panic("ip6_getpcbopt: unexpected option\n");
 #endif
 		INP_RUNLOCK(in6p);
 		return (ENOPROTOOPT);
 	}
 	INP_RUNLOCK(in6p);
 
 	error = sooptcopyout(sopt, optdata, optdatalen);
 	if (malloc_optdata)
 		free(optdata, M_TEMP);
 
 	return (error);
 }
 
 void
 ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
 {
 	if (pktopt == NULL)
 		return;
 
 	if (optname == -1 || optname == IPV6_PKTINFO) {
 		if (pktopt->ip6po_pktinfo)
 			free(pktopt->ip6po_pktinfo, M_IP6OPT);
 		pktopt->ip6po_pktinfo = NULL;
 	}
 	if (optname == -1 || optname == IPV6_HOPLIMIT)
 		pktopt->ip6po_hlim = -1;
 	if (optname == -1 || optname == IPV6_TCLASS)
 		pktopt->ip6po_tclass = -1;
 	if (optname == -1 || optname == IPV6_NEXTHOP) {
 		if (pktopt->ip6po_nextroute.ro_rt) {
 			RTFREE(pktopt->ip6po_nextroute.ro_rt);
 			pktopt->ip6po_nextroute.ro_rt = NULL;
 		}
 		if (pktopt->ip6po_nexthop)
 			free(pktopt->ip6po_nexthop, M_IP6OPT);
 		pktopt->ip6po_nexthop = NULL;
 	}
 	if (optname == -1 || optname == IPV6_HOPOPTS) {
 		if (pktopt->ip6po_hbh)
 			free(pktopt->ip6po_hbh, M_IP6OPT);
 		pktopt->ip6po_hbh = NULL;
 	}
 	if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) {
 		if (pktopt->ip6po_dest1)
 			free(pktopt->ip6po_dest1, M_IP6OPT);
 		pktopt->ip6po_dest1 = NULL;
 	}
 	if (optname == -1 || optname == IPV6_RTHDR) {
 		if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
 			free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
 		pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
 		if (pktopt->ip6po_route.ro_rt) {
 			RTFREE(pktopt->ip6po_route.ro_rt);
 			pktopt->ip6po_route.ro_rt = NULL;
 		}
 	}
 	if (optname == -1 || optname == IPV6_DSTOPTS) {
 		if (pktopt->ip6po_dest2)
 			free(pktopt->ip6po_dest2, M_IP6OPT);
 		pktopt->ip6po_dest2 = NULL;
 	}
 }
 
 #define PKTOPT_EXTHDRCPY(type) \
 do {\
 	if (src->type) {\
 		int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
 		dst->type = malloc(hlen, M_IP6OPT, canwait);\
 		if (dst->type == NULL)\
 			goto bad;\
 		bcopy(src->type, dst->type, hlen);\
 	}\
 } while (/*CONSTCOND*/ 0)
 
 static int
 copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
 {
 	if (dst == NULL || src == NULL)  {
 		printf("ip6_clearpktopts: invalid argument\n");
 		return (EINVAL);
 	}
 
 	dst->ip6po_hlim = src->ip6po_hlim;
 	dst->ip6po_tclass = src->ip6po_tclass;
 	dst->ip6po_flags = src->ip6po_flags;
 	dst->ip6po_minmtu = src->ip6po_minmtu;
 	dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr;
 	if (src->ip6po_pktinfo) {
 		dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
 		    M_IP6OPT, canwait);
 		if (dst->ip6po_pktinfo == NULL)
 			goto bad;
 		*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
 	}
 	if (src->ip6po_nexthop) {
 		dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
 		    M_IP6OPT, canwait);
 		if (dst->ip6po_nexthop == NULL)
 			goto bad;
 		bcopy(src->ip6po_nexthop, dst->ip6po_nexthop,
 		    src->ip6po_nexthop->sa_len);
 	}
 	PKTOPT_EXTHDRCPY(ip6po_hbh);
 	PKTOPT_EXTHDRCPY(ip6po_dest1);
 	PKTOPT_EXTHDRCPY(ip6po_dest2);
 	PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */
 	return (0);
 
   bad:
 	ip6_clearpktopts(dst, -1);
 	return (ENOBUFS);
 }
 #undef PKTOPT_EXTHDRCPY
 
 struct ip6_pktopts *
 ip6_copypktopts(struct ip6_pktopts *src, int canwait)
 {
 	int error;
 	struct ip6_pktopts *dst;
 
 	dst = malloc(sizeof(*dst), M_IP6OPT, canwait);
 	if (dst == NULL)
 		return (NULL);
 	ip6_initpktopts(dst);
 
 	if ((error = copypktopts(dst, src, canwait)) != 0) {
 		free(dst, M_IP6OPT);
 		return (NULL);
 	}
 
 	return (dst);
 }
 
 void
 ip6_freepcbopts(struct ip6_pktopts *pktopt)
 {
 	if (pktopt == NULL)
 		return;
 
 	ip6_clearpktopts(pktopt, -1);
 
 	free(pktopt, M_IP6OPT);
 }
 
 /*
  * Set IPv6 outgoing packet options based on advanced API.
  */
 int
 ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
     struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto)
 {
 	struct cmsghdr *cm = NULL;
 
 	if (control == NULL || opt == NULL)
 		return (EINVAL);
 
 	ip6_initpktopts(opt);
 	if (stickyopt) {
 		int error;
 
 		/*
 		 * If stickyopt is provided, make a local copy of the options
 		 * for this particular packet, then override them by ancillary
 		 * objects.
 		 * XXX: copypktopts() does not copy the cached route to a next
 		 * hop (if any).  This is not very good in terms of efficiency,
 		 * but we can allow this since this option should be rarely
 		 * used.
 		 */
 		if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
 			return (error);
 	}
 
 	/*
 	 * XXX: Currently, we assume all the optional information is stored
 	 * in a single mbuf.
 	 */
 	if (control->m_next)
 		return (EINVAL);
 
 	for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len),
 	    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
 		int error;
 
 		if (control->m_len < CMSG_LEN(0))
 			return (EINVAL);
 
 		cm = mtod(control, struct cmsghdr *);
 		if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len)
 			return (EINVAL);
 		if (cm->cmsg_level != IPPROTO_IPV6)
 			continue;
 
 		error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
 		    cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
 		if (error)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Set a particular packet option, as a sticky option or an ancillary data
  * item.  "len" can be 0 only when it's a sticky option.
  * We have 4 cases of combination of "sticky" and "cmsg":
  * "sticky=0, cmsg=0": impossible
  * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
  * "sticky=1, cmsg=0": RFC3542 socket option
  * "sticky=1, cmsg=1": RFC2292 socket option
  */
 static int
 ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
     struct ucred *cred, int sticky, int cmsg, int uproto)
 {
 	int minmtupolicy, preftemp;
 	int error;
 
 	if (!sticky && !cmsg) {
 #ifdef DIAGNOSTIC
 		printf("ip6_setpktopt: impossible case\n");
 #endif
 		return (EINVAL);
 	}
 
 	/*
 	 * IPV6_2292xxx is for backward compatibility to RFC2292, and should
 	 * not be specified in the context of RFC3542.  Conversely,
 	 * RFC3542 types should not be specified in the context of RFC2292.
 	 */
 	if (!cmsg) {
 		switch (optname) {
 		case IPV6_2292PKTINFO:
 		case IPV6_2292HOPLIMIT:
 		case IPV6_2292NEXTHOP:
 		case IPV6_2292HOPOPTS:
 		case IPV6_2292DSTOPTS:
 		case IPV6_2292RTHDR:
 		case IPV6_2292PKTOPTIONS:
 			return (ENOPROTOOPT);
 		}
 	}
 	if (sticky && cmsg) {
 		switch (optname) {
 		case IPV6_PKTINFO:
 		case IPV6_HOPLIMIT:
 		case IPV6_NEXTHOP:
 		case IPV6_HOPOPTS:
 		case IPV6_DSTOPTS:
 		case IPV6_RTHDRDSTOPTS:
 		case IPV6_RTHDR:
 		case IPV6_USE_MIN_MTU:
 		case IPV6_DONTFRAG:
 		case IPV6_TCLASS:
 		case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */
 			return (ENOPROTOOPT);
 		}
 	}
 
 	switch (optname) {
 	case IPV6_2292PKTINFO:
 	case IPV6_PKTINFO:
 	{
 		struct ifnet *ifp = NULL;
 		struct in6_pktinfo *pktinfo;
 
 		if (len != sizeof(struct in6_pktinfo))
 			return (EINVAL);
 
 		pktinfo = (struct in6_pktinfo *)buf;
 
 		/*
 		 * An application can clear any sticky IPV6_PKTINFO option by
 		 * doing a "regular" setsockopt with ipi6_addr being
 		 * in6addr_any and ipi6_ifindex being zero.
 		 * [RFC 3542, Section 6]
 		 */
 		if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
 		    pktinfo->ipi6_ifindex == 0 &&
 		    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			ip6_clearpktopts(opt, optname);
 			break;
 		}
 
 		if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
 		    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			return (EINVAL);
 		}
 		if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr))
 			return (EINVAL);
 		/* validate the interface index if specified. */
 		if (pktinfo->ipi6_ifindex > V_if_index)
 			 return (ENXIO);
 		if (pktinfo->ipi6_ifindex) {
 			ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
 			if (ifp == NULL)
 				return (ENXIO);
 		}
 		if (ifp != NULL && (ifp->if_afdata[AF_INET6] == NULL ||
 		    (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0))
 			return (ENETDOWN);
 
 		if (ifp != NULL &&
 		    !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			struct in6_ifaddr *ia;
 
 			in6_setscope(&pktinfo->ipi6_addr, ifp, NULL);
 			ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr);
 			if (ia == NULL)
 				return (EADDRNOTAVAIL);
 			ifa_free(&ia->ia_ifa);
 		}
 		/*
 		 * We store the address anyway, and let in6_selectsrc()
 		 * validate the specified address.  This is because ipi6_addr
 		 * may not have enough information about its scope zone, and
 		 * we may need additional information (such as outgoing
 		 * interface or the scope zone of a destination address) to
 		 * disambiguate the scope.
 		 * XXX: the delay of the validation may confuse the
 		 * application when it is used as a sticky option.
 		 */
 		if (opt->ip6po_pktinfo == NULL) {
 			opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
 			    M_IP6OPT, M_NOWAIT);
 			if (opt->ip6po_pktinfo == NULL)
 				return (ENOBUFS);
 		}
 		bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
 		break;
 	}
 
 	case IPV6_2292HOPLIMIT:
 	case IPV6_HOPLIMIT:
 	{
 		int *hlimp;
 
 		/*
 		 * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
 		 * to simplify the ordering among hoplimit options.
 		 */
 		if (optname == IPV6_HOPLIMIT && sticky)
 			return (ENOPROTOOPT);
 
 		if (len != sizeof(int))
 			return (EINVAL);
 		hlimp = (int *)buf;
 		if (*hlimp < -1 || *hlimp > 255)
 			return (EINVAL);
 
 		opt->ip6po_hlim = *hlimp;
 		break;
 	}
 
 	case IPV6_TCLASS:
 	{
 		int tclass;
 
 		if (len != sizeof(int))
 			return (EINVAL);
 		tclass = *(int *)buf;
 		if (tclass < -1 || tclass > 255)
 			return (EINVAL);
 
 		opt->ip6po_tclass = tclass;
 		break;
 	}
 
 	case IPV6_2292NEXTHOP:
 	case IPV6_NEXTHOP:
 		if (cred != NULL) {
 			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {	/* just remove the option */
 			ip6_clearpktopts(opt, IPV6_NEXTHOP);
 			break;
 		}
 
 		/* check if cmsg_len is large enough for sa_len */
 		if (len < sizeof(struct sockaddr) || len < *buf)
 			return (EINVAL);
 
 		switch (((struct sockaddr *)buf)->sa_family) {
 		case AF_INET6:
 		{
 			struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
 			int error;
 
 			if (sa6->sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 
 			if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
 			    IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
 				return (EINVAL);
 			}
 			if ((error = sa6_embedscope(sa6, V_ip6_use_defzone))
 			    != 0) {
 				return (error);
 			}
 			break;
 		}
 		case AF_LINK:	/* should eventually be supported */
 		default:
 			return (EAFNOSUPPORT);
 		}
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, IPV6_NEXTHOP);
 		opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_nexthop == NULL)
 			return (ENOBUFS);
 		bcopy(buf, opt->ip6po_nexthop, *buf);
 		break;
 
 	case IPV6_2292HOPOPTS:
 	case IPV6_HOPOPTS:
 	{
 		struct ip6_hbh *hbh;
 		int hbhlen;
 
 		/*
 		 * XXX: We don't allow a non-privileged user to set ANY HbH
 		 * options, since per-option restriction has too much
 		 * overhead.
 		 */
 		if (cred != NULL) {
 			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, IPV6_HOPOPTS);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_hbh))
 			return (EINVAL);
 		hbh = (struct ip6_hbh *)buf;
 		hbhlen = (hbh->ip6h_len + 1) << 3;
 		if (len != hbhlen)
 			return (EINVAL);
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, IPV6_HOPOPTS);
 		opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_hbh == NULL)
 			return (ENOBUFS);
 		bcopy(hbh, opt->ip6po_hbh, hbhlen);
 
 		break;
 	}
 
 	case IPV6_2292DSTOPTS:
 	case IPV6_DSTOPTS:
 	case IPV6_RTHDRDSTOPTS:
 	{
 		struct ip6_dest *dest, **newdest = NULL;
 		int destlen;
 
 		if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */
 			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, optname);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_dest))
 			return (EINVAL);
 		dest = (struct ip6_dest *)buf;
 		destlen = (dest->ip6d_len + 1) << 3;
 		if (len != destlen)
 			return (EINVAL);
 
 		/*
 		 * Determine the position that the destination options header
 		 * should be inserted; before or after the routing header.
 		 */
 		switch (optname) {
 		case IPV6_2292DSTOPTS:
 			/*
 			 * The old advacned API is ambiguous on this point.
 			 * Our approach is to determine the position based
 			 * according to the existence of a routing header.
 			 * Note, however, that this depends on the order of the
 			 * extension headers in the ancillary data; the 1st
 			 * part of the destination options header must appear
 			 * before the routing header in the ancillary data,
 			 * too.
 			 * RFC3542 solved the ambiguity by introducing
 			 * separate ancillary data or option types.
 			 */
 			if (opt->ip6po_rthdr == NULL)
 				newdest = &opt->ip6po_dest1;
 			else
 				newdest = &opt->ip6po_dest2;
 			break;
 		case IPV6_RTHDRDSTOPTS:
 			newdest = &opt->ip6po_dest1;
 			break;
 		case IPV6_DSTOPTS:
 			newdest = &opt->ip6po_dest2;
 			break;
 		}
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, optname);
 		*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
 		if (*newdest == NULL)
 			return (ENOBUFS);
 		bcopy(dest, *newdest, destlen);
 
 		break;
 	}
 
 	case IPV6_2292RTHDR:
 	case IPV6_RTHDR:
 	{
 		struct ip6_rthdr *rth;
 		int rthlen;
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, IPV6_RTHDR);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_rthdr))
 			return (EINVAL);
 		rth = (struct ip6_rthdr *)buf;
 		rthlen = (rth->ip6r_len + 1) << 3;
 		if (len != rthlen)
 			return (EINVAL);
 
 		switch (rth->ip6r_type) {
 		case IPV6_RTHDR_TYPE_0:
 			if (rth->ip6r_len == 0)	/* must contain one addr */
 				return (EINVAL);
 			if (rth->ip6r_len % 2) /* length must be even */
 				return (EINVAL);
 			if (rth->ip6r_len / 2 != rth->ip6r_segleft)
 				return (EINVAL);
 			break;
 		default:
 			return (EINVAL);	/* not supported */
 		}
 
 		/* turn off the previous option */
 		ip6_clearpktopts(opt, IPV6_RTHDR);
 		opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_rthdr == NULL)
 			return (ENOBUFS);
 		bcopy(rth, opt->ip6po_rthdr, rthlen);
 
 		break;
 	}
 
 	case IPV6_USE_MIN_MTU:
 		if (len != sizeof(int))
 			return (EINVAL);
 		minmtupolicy = *(int *)buf;
 		if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
 		    minmtupolicy != IP6PO_MINMTU_DISABLE &&
 		    minmtupolicy != IP6PO_MINMTU_ALL) {
 			return (EINVAL);
 		}
 		opt->ip6po_minmtu = minmtupolicy;
 		break;
 
 	case IPV6_DONTFRAG:
 		if (len != sizeof(int))
 			return (EINVAL);
 
 		if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
 			/*
 			 * we ignore this option for TCP sockets.
 			 * (RFC3542 leaves this case unspecified.)
 			 */
 			opt->ip6po_flags &= ~IP6PO_DONTFRAG;
 		} else
 			opt->ip6po_flags |= IP6PO_DONTFRAG;
 		break;
 
 	case IPV6_PREFER_TEMPADDR:
 		if (len != sizeof(int))
 			return (EINVAL);
 		preftemp = *(int *)buf;
 		if (preftemp != IP6PO_TEMPADDR_SYSTEM &&
 		    preftemp != IP6PO_TEMPADDR_NOTPREFER &&
 		    preftemp != IP6PO_TEMPADDR_PREFER) {
 			return (EINVAL);
 		}
 		opt->ip6po_prefer_tempaddr = preftemp;
 		break;
 
 	default:
 		return (ENOPROTOOPT);
 	} /* end of switch */
 
 	return (0);
 }
 
 /*
  * Routine called from ip6_output() to loop back a copy of an IP6 multicast
  * packet to the input queue of a specified interface.  Note that this
  * calls the output routine of the loopback "driver", but with an interface
  * pointer that might NOT be &loif -- easier than replicating that code here.
  */
 void
 ip6_mloopback(struct ifnet *ifp, struct mbuf *m)
 {
 	struct mbuf *copym;
 	struct ip6_hdr *ip6;
 
 	copym = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 	if (copym == NULL)
 		return;
 
 	/*
 	 * Make sure to deep-copy IPv6 header portion in case the data
 	 * is in an mbuf cluster, so that we can safely override the IPv6
 	 * header portion later.
 	 */
 	if (!M_WRITABLE(copym) ||
 	    copym->m_len < sizeof(struct ip6_hdr)) {
 		copym = m_pullup(copym, sizeof(struct ip6_hdr));
 		if (copym == NULL)
 			return;
 	}
 	ip6 = mtod(copym, struct ip6_hdr *);
 	/*
 	 * clear embedded scope identifiers if necessary.
 	 * in6_clearscope will touch the addresses only when necessary.
 	 */
 	in6_clearscope(&ip6->ip6_src);
 	in6_clearscope(&ip6->ip6_dst);
 	if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 		copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 |
 		    CSUM_PSEUDO_HDR;
 		copym->m_pkthdr.csum_data = 0xffff;
 	}
 	if_simloop(ifp, copym, AF_INET6, 0);
 }
 
 /*
  * Chop IPv6 header off from the payload.
  */
 static int
 ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
 {
 	struct mbuf *mh;
 	struct ip6_hdr *ip6;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (m->m_len > sizeof(*ip6)) {
 		mh = m_gethdr(M_NOWAIT, MT_DATA);
 		if (mh == NULL) {
 			m_freem(m);
 			return ENOBUFS;
 		}
 		m_move_pkthdr(mh, m);
 		M_ALIGN(mh, sizeof(*ip6));
 		m->m_len -= sizeof(*ip6);
 		m->m_data += sizeof(*ip6);
 		mh->m_next = m;
 		m = mh;
 		m->m_len = sizeof(*ip6);
 		bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6));
 	}
 	exthdrs->ip6e_ip6 = m;
 	return 0;
 }
 
 /*
  * Compute IPv6 extension header length.
  */
 int
 ip6_optlen(struct inpcb *in6p)
 {
 	int len;
 
 	if (!in6p->in6p_outputopts)
 		return 0;
 
 	len = 0;
 #define elen(x) \
     (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
 
 	len += elen(in6p->in6p_outputopts->ip6po_hbh);
 	if (in6p->in6p_outputopts->ip6po_rthdr)
 		/* dest1 is valid with rthdr only */
 		len += elen(in6p->in6p_outputopts->ip6po_dest1);
 	len += elen(in6p->in6p_outputopts->ip6po_rthdr);
 	len += elen(in6p->in6p_outputopts->ip6po_dest2);
 	return len;
 #undef elen
 }
Index: head/sys/netpfil/ipfw/ip_fw2.c
===================================================================
--- head/sys/netpfil/ipfw/ip_fw2.c	(revision 348253)
+++ head/sys/netpfil/ipfw/ip_fw2.c	(revision 348254)
@@ -1,3491 +1,3491 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * The FreeBSD IP packet firewall, main file
  */
 
 #include "opt_ipfw.h"
 #include "opt_ipdivert.h"
 #include "opt_inet.h"
 #ifndef INET
 #error "IPFIREWALL requires INET"
 #endif /* INET */
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/condvar.h>
 #include <sys/counter.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/jail.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/ucred.h>
 #include <net/ethernet.h> /* for ETHERTYPE_IP */
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netpfil/pf/pf_mtag.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_carp.h>
 #include <netinet/pim.h>
 #include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #include <netinet/sctp_header.h>
 
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet/in_fib.h>
 #ifdef INET6
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/ip6_var.h>
 #endif
 
 #include <net/if_gre.h> /* for struct grehdr */
 
 #include <netpfil/ipfw/ip_fw_private.h>
 
 #include <machine/in_cksum.h>	/* XXX for in_cksum */
 
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
 
 /*
  * static variables followed by global ones.
  * All ipfw global variables are here.
  */
 
 VNET_DEFINE_STATIC(int, fw_deny_unknown_exthdrs);
 #define	V_fw_deny_unknown_exthdrs	VNET(fw_deny_unknown_exthdrs)
 
 VNET_DEFINE_STATIC(int, fw_permit_single_frag6) = 1;
 #define	V_fw_permit_single_frag6	VNET(fw_permit_single_frag6)
 
 #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
 static int default_to_accept = 1;
 #else
 static int default_to_accept;
 #endif
 
 VNET_DEFINE(int, autoinc_step);
 VNET_DEFINE(int, fw_one_pass) = 1;
 
 VNET_DEFINE(unsigned int, fw_tables_max);
 VNET_DEFINE(unsigned int, fw_tables_sets) = 0;	/* Don't use set-aware tables */
 /* Use 128 tables by default */
 static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT;
 
 #ifndef LINEAR_SKIPTO
 static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards);
 #define	JUMP(ch, f, num, targ, back)	jump_fast(ch, f, num, targ, back)
 #else
 static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards);
 #define	JUMP(ch, f, num, targ, back)	jump_linear(ch, f, num, targ, back)
 #endif
 
 /*
  * Each rule belongs to one of 32 different sets (0..31).
  * The variable set_disable contains one bit per set.
  * If the bit is set, all rules in the corresponding set
  * are disabled. Set RESVD_SET(31) is reserved for the default rule
  * and rules that are not deleted by the flush command,
  * and CANNOT be disabled.
  * Rules in set RESVD_SET can only be deleted individually.
  */
 VNET_DEFINE(u_int32_t, set_disable);
 #define	V_set_disable			VNET(set_disable)
 
 VNET_DEFINE(int, fw_verbose);
 /* counter for ipfw_log(NULL...) */
 VNET_DEFINE(u_int64_t, norule_counter);
 VNET_DEFINE(int, verbose_limit);
 
 /* layer3_chain contains the list of rules for layer 3 */
 VNET_DEFINE(struct ip_fw_chain, layer3_chain);
 
 /* ipfw_vnet_ready controls when we are open for business */
 VNET_DEFINE(int, ipfw_vnet_ready) = 0;
 
 VNET_DEFINE(int, ipfw_nat_ready) = 0;
 
 ipfw_nat_t *ipfw_nat_ptr = NULL;
 struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
 ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
 ipfw_nat_cfg_t *ipfw_nat_del_ptr;
 ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
 ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
 
 #ifdef SYSCTL_NODE
 uint32_t dummy_def = IPFW_DEFAULT_RULE;
 static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS);
 static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS);
 
 SYSBEGIN(f3)
 
 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
     CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
     "Only do a single pass through ipfw when using dummynet(4)");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
     "Rule number auto-increment step");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose,
     CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0,
     "Log matches to ipfw rules");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
     "Set upper limit of matches of ipfw rules logged");
 SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
     &dummy_def, 0,
     "The default/max possible rule number.");
 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_max,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU",
     "Maximum number of concurrently used tables");
 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
     0, 0, sysctl_ipfw_tables_sets, "IU",
     "Use per-set namespace for tables");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
     &default_to_accept, 0,
     "Make the default rule accept all packets.");
 TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables);
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count,
     CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0,
     "Number of static rules");
 
 #ifdef INET6
 SYSCTL_DECL(_net_inet6_ip6);
 SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
 SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
     CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
     &VNET_NAME(fw_deny_unknown_exthdrs), 0,
     "Deny packets with unknown IPv6 Extension Headers");
 SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6,
     CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
     &VNET_NAME(fw_permit_single_frag6), 0,
     "Permit single packet IPv6 fragments");
 #endif /* INET6 */
 
 SYSEND
 
 #endif /* SYSCTL_NODE */
 
 
 /*
  * Some macros used in the various matching options.
  * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
  * Other macros just cast void * into the appropriate type
  */
 #define	L3HDR(T, ip)	((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
 #define	TCP(p)		((struct tcphdr *)(p))
 #define	SCTP(p)		((struct sctphdr *)(p))
 #define	UDP(p)		((struct udphdr *)(p))
 #define	ICMP(p)		((struct icmphdr *)(p))
 #define	ICMP6(p)	((struct icmp6_hdr *)(p))
 
 static __inline int
 icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd)
 {
 	int type = icmp->icmp_type;
 
 	return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) );
 }
 
 #define TT	( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
     (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )
 
 static int
 is_icmp_query(struct icmphdr *icmp)
 {
 	int type = icmp->icmp_type;
 
 	return (type <= ICMP_MAXTYPE && (TT & (1<<type)) );
 }
 #undef TT
 
 /*
  * The following checks use two arrays of 8 or 16 bits to store the
  * bits that we want set or clear, respectively. They are in the
  * low and high half of cmd->arg1 or cmd->d[0].
  *
  * We scan options and store the bits we find set. We succeed if
  *
  *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
  *
  * The code is sometimes optimized not to store additional variables.
  */
 
 static int
 flags_match(ipfw_insn *cmd, u_int8_t bits)
 {
 	u_char want_clear;
 	bits = ~bits;
 
 	if ( ((cmd->arg1 & 0xff) & bits) != 0)
 		return 0; /* some bits we want set were clear */
 	want_clear = (cmd->arg1 >> 8) & 0xff;
 	if ( (want_clear & bits) != want_clear)
 		return 0; /* some bits we want clear were set */
 	return 1;
 }
 
 static int
 ipopts_match(struct ip *ip, ipfw_insn *cmd)
 {
 	int optlen, bits = 0;
 	u_char *cp = (u_char *)(ip + 1);
 	int x = (ip->ip_hl << 2) - sizeof (struct ip);
 
 	for (; x > 0; x -= optlen, cp += optlen) {
 		int opt = cp[IPOPT_OPTVAL];
 
 		if (opt == IPOPT_EOL)
 			break;
 		if (opt == IPOPT_NOP)
 			optlen = 1;
 		else {
 			optlen = cp[IPOPT_OLEN];
 			if (optlen <= 0 || optlen > x)
 				return 0; /* invalid or truncated */
 		}
 		switch (opt) {
 
 		default:
 			break;
 
 		case IPOPT_LSRR:
 			bits |= IP_FW_IPOPT_LSRR;
 			break;
 
 		case IPOPT_SSRR:
 			bits |= IP_FW_IPOPT_SSRR;
 			break;
 
 		case IPOPT_RR:
 			bits |= IP_FW_IPOPT_RR;
 			break;
 
 		case IPOPT_TS:
 			bits |= IP_FW_IPOPT_TS;
 			break;
 		}
 	}
 	return (flags_match(cmd, bits));
 }
 
 static int
 tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd)
 {
 	int optlen, bits = 0;
 	u_char *cp = (u_char *)(tcp + 1);
 	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
 
 	for (; x > 0; x -= optlen, cp += optlen) {
 		int opt = cp[0];
 		if (opt == TCPOPT_EOL)
 			break;
 		if (opt == TCPOPT_NOP)
 			optlen = 1;
 		else {
 			optlen = cp[1];
 			if (optlen <= 0)
 				break;
 		}
 
 		switch (opt) {
 
 		default:
 			break;
 
 		case TCPOPT_MAXSEG:
 			bits |= IP_FW_TCPOPT_MSS;
 			break;
 
 		case TCPOPT_WINDOW:
 			bits |= IP_FW_TCPOPT_WINDOW;
 			break;
 
 		case TCPOPT_SACK_PERMITTED:
 		case TCPOPT_SACK:
 			bits |= IP_FW_TCPOPT_SACK;
 			break;
 
 		case TCPOPT_TIMESTAMP:
 			bits |= IP_FW_TCPOPT_TS;
 			break;
 
 		}
 	}
 	return (flags_match(cmd, bits));
 }
 
 static int
 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain,
     uint32_t *tablearg)
 {
 
 	if (ifp == NULL)	/* no iface with this packet, match fails */
 		return (0);
 
 	/* Check by name or by IP address */
 	if (cmd->name[0] != '\0') { /* match by name */
 		if (cmd->name[0] == '\1') /* use tablearg to match */
 			return ipfw_lookup_table(chain, cmd->p.kidx, 0,
 			    &ifp->if_index, tablearg);
 		/* Check name */
 		if (cmd->p.glob) {
 			if (fnmatch(cmd->name, ifp->if_xname, 0) == 0)
 				return(1);
 		} else {
 			if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
 				return(1);
 		}
 	} else {
 #if !defined(USERSPACE) && defined(__FreeBSD__)	/* and OSX too ? */
 		struct ifaddr *ia;
 
 		if_addr_rlock(ifp);
 		CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
 			if (ia->ifa_addr->sa_family != AF_INET)
 				continue;
 			if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
 			    (ia->ifa_addr))->sin_addr.s_addr) {
 				if_addr_runlock(ifp);
 				return(1);	/* match */
 			}
 		}
 		if_addr_runlock(ifp);
 #endif /* __FreeBSD__ */
 	}
 	return(0);	/* no match, fail ... */
 }
 
 /*
  * The verify_path function checks if a route to the src exists and
  * if it is reachable via ifp (when provided).
  * 
  * The 'verrevpath' option checks that the interface that an IP packet
  * arrives on is the same interface that traffic destined for the
  * packet's source address would be routed out of.
  * The 'versrcreach' option just checks that the source address is
  * reachable via any route (except default) in the routing table.
  * These two are a measure to block forged packets. This is also
  * commonly known as "anti-spoofing" or Unicast Reverse Path
  * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs
  * is purposely reminiscent of the Cisco IOS command,
  *
  *   ip verify unicast reverse-path
  *   ip verify unicast source reachable-via any
  *
  * which implements the same functionality. But note that the syntax
  * is misleading, and the check may be performed on all IP packets
  * whether unicast, multicast, or broadcast.
  */
 static int
 verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
 {
 #if defined(USERSPACE) || !defined(__FreeBSD__)
 	return 0;
 #else
 	struct nhop4_basic nh4;
 
 	if (fib4_lookup_nh_basic(fib, src, NHR_IFAIF, 0, &nh4) != 0)
 		return (0);
 
 	/*
 	 * If ifp is provided, check for equality with rtentry.
 	 * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
 	 * in order to pass packets injected back by if_simloop():
 	 * routing entry (via lo0) for our own address
 	 * may exist, so we need to handle routing assymetry.
 	 */
 	if (ifp != NULL && ifp != nh4.nh_ifp)
 		return (0);
 
 	/* if no ifp provided, check if rtentry is not default route */
 	if (ifp == NULL && (nh4.nh_flags & NHF_DEFAULT) != 0)
 		return (0);
 
 	/* or if this is a blackhole/reject route */
 	if (ifp == NULL && (nh4.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0)
 		return (0);
 
 	/* found valid route */
 	return 1;
 #endif /* __FreeBSD__ */
 }
 
 /*
  * Generate an SCTP packet containing an ABORT chunk. The verification tag
  * is given by vtag. The T-bit is set in the ABORT chunk if and only if
  * reflected is not 0.
  */
 
 static struct mbuf *
 ipfw_send_abort(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t vtag,
     int reflected)
 {
 	struct mbuf *m;
 	struct ip *ip;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 	struct sctphdr *sctp;
 	struct sctp_chunkhdr *chunk;
 	u_int16_t hlen, plen, tlen;
 
 	MGETHDR(m, M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 	M_SETFIB(m, id->fib);
 #ifdef MAC
 	if (replyto != NULL)
 		mac_netinet_firewall_reply(replyto, m);
 	else
 		mac_netinet_firewall_send(m);
 #else
 	(void)replyto;		/* don't warn about unused arg */
 #endif
 
 	switch (id->addr_type) {
 	case 4:
 		hlen = sizeof(struct ip);
 		break;
 #ifdef INET6
 	case 6:
 		hlen = sizeof(struct ip6_hdr);
 		break;
 #endif
 	default:
 		/* XXX: log me?!? */
 		FREE_PKT(m);
 		return (NULL);
 	}
 	plen = sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
 	tlen = hlen + plen;
 	m->m_data += max_linkhdr;
 	m->m_flags |= M_SKIP_FIREWALL;
 	m->m_pkthdr.len = m->m_len = tlen;
 	m->m_pkthdr.rcvif = NULL;
 	bzero(m->m_data, tlen);
 
 	switch (id->addr_type) {
 	case 4:
 		ip = mtod(m, struct ip *);
 
 		ip->ip_v = 4;
 		ip->ip_hl = sizeof(struct ip) >> 2;
 		ip->ip_tos = IPTOS_LOWDELAY;
 		ip->ip_len = htons(tlen);
 		ip->ip_id = htons(0);
 		ip->ip_off = htons(0);
 		ip->ip_ttl = V_ip_defttl;
 		ip->ip_p = IPPROTO_SCTP;
 		ip->ip_sum = 0;
 		ip->ip_src.s_addr = htonl(id->dst_ip);
 		ip->ip_dst.s_addr = htonl(id->src_ip);
 
 		sctp = (struct sctphdr *)(ip + 1);
 		break;
 #ifdef INET6
 	case 6:
 		ip6 = mtod(m, struct ip6_hdr *);
 
 		ip6->ip6_vfc = IPV6_VERSION;
 		ip6->ip6_plen = htons(plen);
 		ip6->ip6_nxt = IPPROTO_SCTP;
 		ip6->ip6_hlim = IPV6_DEFHLIM;
 		ip6->ip6_src = id->dst_ip6;
 		ip6->ip6_dst = id->src_ip6;
 
 		sctp = (struct sctphdr *)(ip6 + 1);
 		break;
 #endif
 	}
 
 	sctp->src_port = htons(id->dst_port);
 	sctp->dest_port = htons(id->src_port);
 	sctp->v_tag = htonl(vtag);
 	sctp->checksum = htonl(0);
 
 	chunk = (struct sctp_chunkhdr *)(sctp + 1);
 	chunk->chunk_type = SCTP_ABORT_ASSOCIATION;
 	chunk->chunk_flags = 0;
 	if (reflected != 0) {
 		chunk->chunk_flags |= SCTP_HAD_NO_TCB;
 	}
 	chunk->chunk_length = htons(sizeof(struct sctp_chunkhdr));
 
 	sctp->checksum = sctp_calculate_cksum(m, hlen);
 
 	return (m);
 }
 
 /*
  * Generate a TCP packet, containing either a RST or a keepalive.
  * When flags & TH_RST, we are sending a RST packet, because of a
  * "reset" action matched the packet.
  * Otherwise we are sending a keepalive, and flags & TH_
  * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
  * so that MAC can label the reply appropriately.
  */
 struct mbuf *
 ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
     u_int32_t ack, int flags)
 {
 	struct mbuf *m = NULL;		/* stupid compiler */
 	struct ip *h = NULL;		/* stupid compiler */
 #ifdef INET6
 	struct ip6_hdr *h6 = NULL;
 #endif
 	struct tcphdr *th = NULL;
 	int len, dir;
 
 	MGETHDR(m, M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 	M_SETFIB(m, id->fib);
 #ifdef MAC
 	if (replyto != NULL)
 		mac_netinet_firewall_reply(replyto, m);
 	else
 		mac_netinet_firewall_send(m);
 #else
 	(void)replyto;		/* don't warn about unused arg */
 #endif
 
 	switch (id->addr_type) {
 	case 4:
 		len = sizeof(struct ip) + sizeof(struct tcphdr);
 		break;
 #ifdef INET6
 	case 6:
 		len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 		break;
 #endif
 	default:
 		/* XXX: log me?!? */
 		FREE_PKT(m);
 		return (NULL);
 	}
 	dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN);
 
 	m->m_data += max_linkhdr;
 	m->m_flags |= M_SKIP_FIREWALL;
 	m->m_pkthdr.len = m->m_len = len;
 	m->m_pkthdr.rcvif = NULL;
 	bzero(m->m_data, len);
 
 	switch (id->addr_type) {
 	case 4:
 		h = mtod(m, struct ip *);
 
 		/* prepare for checksum */
 		h->ip_p = IPPROTO_TCP;
 		h->ip_len = htons(sizeof(struct tcphdr));
 		if (dir) {
 			h->ip_src.s_addr = htonl(id->src_ip);
 			h->ip_dst.s_addr = htonl(id->dst_ip);
 		} else {
 			h->ip_src.s_addr = htonl(id->dst_ip);
 			h->ip_dst.s_addr = htonl(id->src_ip);
 		}
 
 		th = (struct tcphdr *)(h + 1);
 		break;
 #ifdef INET6
 	case 6:
 		h6 = mtod(m, struct ip6_hdr *);
 
 		/* prepare for checksum */
 		h6->ip6_nxt = IPPROTO_TCP;
 		h6->ip6_plen = htons(sizeof(struct tcphdr));
 		if (dir) {
 			h6->ip6_src = id->src_ip6;
 			h6->ip6_dst = id->dst_ip6;
 		} else {
 			h6->ip6_src = id->dst_ip6;
 			h6->ip6_dst = id->src_ip6;
 		}
 
 		th = (struct tcphdr *)(h6 + 1);
 		break;
 #endif
 	}
 
 	if (dir) {
 		th->th_sport = htons(id->src_port);
 		th->th_dport = htons(id->dst_port);
 	} else {
 		th->th_sport = htons(id->dst_port);
 		th->th_dport = htons(id->src_port);
 	}
 	th->th_off = sizeof(struct tcphdr) >> 2;
 
 	if (flags & TH_RST) {
 		if (flags & TH_ACK) {
 			th->th_seq = htonl(ack);
 			th->th_flags = TH_RST;
 		} else {
 			if (flags & TH_SYN)
 				seq++;
 			th->th_ack = htonl(seq);
 			th->th_flags = TH_RST | TH_ACK;
 		}
 	} else {
 		/*
 		 * Keepalive - use caller provided sequence numbers
 		 */
 		th->th_seq = htonl(seq);
 		th->th_ack = htonl(ack);
 		th->th_flags = TH_ACK;
 	}
 
 	switch (id->addr_type) {
 	case 4:
 		th->th_sum = in_cksum(m, len);
 
 		/* finish the ip header */
 		h->ip_v = 4;
 		h->ip_hl = sizeof(*h) >> 2;
 		h->ip_tos = IPTOS_LOWDELAY;
 		h->ip_off = htons(0);
 		h->ip_len = htons(len);
 		h->ip_ttl = V_ip_defttl;
 		h->ip_sum = 0;
 		break;
 #ifdef INET6
 	case 6:
 		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6),
 		    sizeof(struct tcphdr));
 
 		/* finish the ip6 header */
 		h6->ip6_vfc |= IPV6_VERSION;
 		h6->ip6_hlim = IPV6_DEFHLIM;
 		break;
 #endif
 	}
 
 	return (m);
 }
 
 #ifdef INET6
 /*
  * ipv6 specific rules here...
  */
 static __inline int
 icmp6type_match (int type, ipfw_insn_u32 *cmd)
 {
 	return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) );
 }
 
 static int
 flow6id_match( int curr_flow, ipfw_insn_u32 *cmd )
 {
 	int i;
 	for (i=0; i <= cmd->o.arg1; ++i )
 		if (curr_flow == cmd->d[i] )
 			return 1;
 	return 0;
 }
 
 /* support for IP6_*_ME opcodes */
 static const struct in6_addr lla_mask = {{{
 	0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 }}};
 
 static int
 ipfw_localip6(struct in6_addr *in6)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 
 	if (IN6_IS_ADDR_MULTICAST(in6))
 		return (0);
 
 	if (!IN6_IS_ADDR_LINKLOCAL(in6))
 		return (in6_localip(in6));
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
 		if (!IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr))
 			continue;
 		if (IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr,
 		    in6, &lla_mask)) {
 			IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 			return (1);
 		}
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 	return (0);
 }
 
 static int
 verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib)
 {
 	struct nhop6_basic nh6;
 
 	if (IN6_IS_SCOPE_LINKLOCAL(src))
 		return (1);
 
 	if (fib6_lookup_nh_basic(fib, src, 0, NHR_IFAIF, 0, &nh6) != 0)
 		return (0);
 
 	/* If ifp is provided, check for equality with route table. */
 	if (ifp != NULL && ifp != nh6.nh_ifp)
 		return (0);
 
 	/* if no ifp provided, check if rtentry is not default route */
 	if (ifp == NULL && (nh6.nh_flags & NHF_DEFAULT) != 0)
 		return (0);
 
 	/* or if this is a blackhole/reject route */
 	if (ifp == NULL && (nh6.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0)
 		return (0);
 
 	/* found valid route */
 	return 1;
 }
 
 static int
 is_icmp6_query(int icmp6_type)
 {
 	if ((icmp6_type <= ICMP6_MAXTYPE) &&
 	    (icmp6_type == ICMP6_ECHO_REQUEST ||
 	    icmp6_type == ICMP6_MEMBERSHIP_QUERY ||
 	    icmp6_type == ICMP6_WRUREQUEST ||
 	    icmp6_type == ICMP6_FQDN_QUERY ||
 	    icmp6_type == ICMP6_NI_QUERY))
 		return (1);
 
 	return (0);
 }
 
 static int
 map_icmp_unreach(int code)
 {
 
 	/* RFC 7915 p4.2 */
 	switch (code) {
 	case ICMP_UNREACH_NET:
 	case ICMP_UNREACH_HOST:
 	case ICMP_UNREACH_SRCFAIL:
 	case ICMP_UNREACH_NET_UNKNOWN:
 	case ICMP_UNREACH_HOST_UNKNOWN:
 	case ICMP_UNREACH_TOSNET:
 	case ICMP_UNREACH_TOSHOST:
 		return (ICMP6_DST_UNREACH_NOROUTE);
 	case ICMP_UNREACH_PORT:
 		return (ICMP6_DST_UNREACH_NOPORT);
 	default:
 		/*
 		 * Map the rest of codes into admit prohibited.
 		 * XXX: unreach proto should be mapped into ICMPv6
 		 * parameter problem, but we use only unreach type.
 		 */
 		return (ICMP6_DST_UNREACH_ADMIN);
 	}
 }
 
 static void
 send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
 {
 	struct mbuf *m;
 
 	m = args->m;
 	if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) {
 		struct tcphdr *tcp;
 		tcp = (struct tcphdr *)((char *)ip6 + hlen);
 
 		if ((tcp->th_flags & TH_RST) == 0) {
 			struct mbuf *m0;
 			m0 = ipfw_send_pkt(args->m, &(args->f_id),
 			    ntohl(tcp->th_seq), ntohl(tcp->th_ack),
 			    tcp->th_flags | TH_RST);
 			if (m0 != NULL)
 				ip6_output(m0, NULL, NULL, 0, NULL, NULL,
 				    NULL);
 		}
 		FREE_PKT(m);
 	} else if (code == ICMP6_UNREACH_ABORT &&
 	    args->f_id.proto == IPPROTO_SCTP) {
 		struct mbuf *m0;
 		struct sctphdr *sctp;
 		u_int32_t v_tag;
 		int reflected;
 
 		sctp = (struct sctphdr *)((char *)ip6 + hlen);
 		reflected = 1;
 		v_tag = ntohl(sctp->v_tag);
 		/* Investigate the first chunk header if available */
 		if (m->m_len >= hlen + sizeof(struct sctphdr) +
 		    sizeof(struct sctp_chunkhdr)) {
 			struct sctp_chunkhdr *chunk;
 
 			chunk = (struct sctp_chunkhdr *)(sctp + 1);
 			switch (chunk->chunk_type) {
 			case SCTP_INITIATION:
 				/*
 				 * Packets containing an INIT chunk MUST have
 				 * a zero v-tag.
 				 */
 				if (v_tag != 0) {
 					v_tag = 0;
 					break;
 				}
 				/* INIT chunk MUST NOT be bundled */
 				if (m->m_pkthdr.len >
 				    hlen + sizeof(struct sctphdr) +
 				    ntohs(chunk->chunk_length) + 3) {
 					break;
 				}
 				/* Use the initiate tag if available */
 				if ((m->m_len >= hlen + sizeof(struct sctphdr) +
 				    sizeof(struct sctp_chunkhdr) +
 				    offsetof(struct sctp_init, a_rwnd))) {
 					struct sctp_init *init;
 
 					init = (struct sctp_init *)(chunk + 1);
 					v_tag = ntohl(init->initiate_tag);
 					reflected = 0;
 				}
 				break;
 			case SCTP_ABORT_ASSOCIATION:
 				/*
 				 * If the packet contains an ABORT chunk, don't
 				 * reply.
 				 * XXX: We should search through all chunks,
 				 *      but don't do to avoid attacks.
 				 */
 				v_tag = 0;
 				break;
 			}
 		}
 		if (v_tag == 0) {
 			m0 = NULL;
 		} else {
 			m0 = ipfw_send_abort(args->m, &(args->f_id), v_tag,
 			    reflected);
 		}
 		if (m0 != NULL)
 			ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
 		FREE_PKT(m);
 	} else if (code != ICMP6_UNREACH_RST && code != ICMP6_UNREACH_ABORT) {
 		/* Send an ICMPv6 unreach. */
 #if 0
 		/*
 		 * Unlike above, the mbufs need to line up with the ip6 hdr,
 		 * as the contents are read. We need to m_adj() the
 		 * needed amount.
 		 * The mbuf will however be thrown away so we can adjust it.
 		 * Remember we did an m_pullup on it already so we
 		 * can make some assumptions about contiguousness.
 		 */
 		if (args->L3offset)
 			m_adj(m, args->L3offset);
 #endif
 		icmp6_error(m, ICMP6_DST_UNREACH, code, 0);
 	} else
 		FREE_PKT(m);
 
 	args->m = NULL;
 }
 
 #endif /* INET6 */
 
 
 /*
  * sends a reject message, consuming the mbuf passed as an argument.
  */
 static void
 send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip)
 {
 
 #if 0
 	/* XXX When ip is not guaranteed to be at mtod() we will
 	 * need to account for this */
 	 * The mbuf will however be thrown away so we can adjust it.
 	 * Remember we did an m_pullup on it already so we
 	 * can make some assumptions about contiguousness.
 	 */
 	if (args->L3offset)
 		m_adj(m, args->L3offset);
 #endif
 	if (code != ICMP_REJECT_RST && code != ICMP_REJECT_ABORT) {
 		/* Send an ICMP unreach */
 		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
 	} else if (code == ICMP_REJECT_RST && args->f_id.proto == IPPROTO_TCP) {
 		struct tcphdr *const tcp =
 		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
 		if ( (tcp->th_flags & TH_RST) == 0) {
 			struct mbuf *m;
 			m = ipfw_send_pkt(args->m, &(args->f_id),
 				ntohl(tcp->th_seq), ntohl(tcp->th_ack),
 				tcp->th_flags | TH_RST);
 			if (m != NULL)
 				ip_output(m, NULL, NULL, 0, NULL, NULL);
 		}
 		FREE_PKT(args->m);
 	} else if (code == ICMP_REJECT_ABORT &&
 	    args->f_id.proto == IPPROTO_SCTP) {
 		struct mbuf *m;
 		struct sctphdr *sctp;
 		struct sctp_chunkhdr *chunk;
 		struct sctp_init *init;
 		u_int32_t v_tag;
 		int reflected;
 
 		sctp = L3HDR(struct sctphdr, mtod(args->m, struct ip *));
 		reflected = 1;
 		v_tag = ntohl(sctp->v_tag);
 		if (iplen >= (ip->ip_hl << 2) + sizeof(struct sctphdr) +
 		    sizeof(struct sctp_chunkhdr)) {
 			/* Look at the first chunk header if available */
 			chunk = (struct sctp_chunkhdr *)(sctp + 1);
 			switch (chunk->chunk_type) {
 			case SCTP_INITIATION:
 				/*
 				 * Packets containing an INIT chunk MUST have
 				 * a zero v-tag.
 				 */
 				if (v_tag != 0) {
 					v_tag = 0;
 					break;
 				}
 				/* INIT chunk MUST NOT be bundled */
 				if (iplen >
 				    (ip->ip_hl << 2) + sizeof(struct sctphdr) +
 				    ntohs(chunk->chunk_length) + 3) {
 					break;
 				}
 				/* Use the initiate tag if available */
 				if ((iplen >= (ip->ip_hl << 2) +
 				    sizeof(struct sctphdr) +
 				    sizeof(struct sctp_chunkhdr) +
 				    offsetof(struct sctp_init, a_rwnd))) {
 					init = (struct sctp_init *)(chunk + 1);
 					v_tag = ntohl(init->initiate_tag);
 					reflected = 0;
 				}
 				break;
 			case SCTP_ABORT_ASSOCIATION:
 				/*
 				 * If the packet contains an ABORT chunk, don't
 				 * reply.
 				 * XXX: We should search through all chunks,
 				 * but don't do to avoid attacks.
 				 */
 				v_tag = 0;
 				break;
 			}
 		}
 		if (v_tag == 0) {
 			m = NULL;
 		} else {
 			m = ipfw_send_abort(args->m, &(args->f_id), v_tag,
 			    reflected);
 		}
 		if (m != NULL)
 			ip_output(m, NULL, NULL, 0, NULL, NULL);
 		FREE_PKT(args->m);
 	} else
 		FREE_PKT(args->m);
 	args->m = NULL;
 }
 
 /*
  * Support for uid/gid/jail lookup. These tests are expensive
  * (because we may need to look into the list of active sockets)
  * so we cache the results. ugid_lookupp is 0 if we have not
  * yet done a lookup, 1 if we succeeded, and -1 if we tried
  * and failed. The function always returns the match value.
  * We could actually spare the variable and use *uc, setting
  * it to '(void *)check_uidgid if we have no info, NULL if
  * we tried and failed, or any other value if successful.
  */
 static int
 check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp,
     struct ucred **uc)
 {
 #if defined(USERSPACE)
 	return 0;	// not supported in userspace
 #else
 #ifndef __FreeBSD__
 	/* XXX */
 	return cred_check(insn, proto, oif,
 	    dst_ip, dst_port, src_ip, src_port,
 	    (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb);
 #else  /* FreeBSD */
 	struct in_addr src_ip, dst_ip;
 	struct inpcbinfo *pi;
 	struct ipfw_flow_id *id;
 	struct inpcb *pcb, *inp;
 	int lookupflags;
 	int match;
 
 	id = &args->f_id;
 	inp = args->inp;
 
 	/*
 	 * Check to see if the UDP or TCP stack supplied us with
 	 * the PCB. If so, rather then holding a lock and looking
 	 * up the PCB, we can use the one that was supplied.
 	 */
 	if (inp && *ugid_lookupp == 0) {
 		INP_LOCK_ASSERT(inp);
 		if (inp->inp_socket != NULL) {
 			*uc = crhold(inp->inp_cred);
 			*ugid_lookupp = 1;
 		} else
 			*ugid_lookupp = -1;
 	}
 	/*
 	 * If we have already been here and the packet has no
 	 * PCB entry associated with it, then we can safely
 	 * assume that this is a no match.
 	 */
 	if (*ugid_lookupp == -1)
 		return (0);
 	if (id->proto == IPPROTO_TCP) {
 		lookupflags = 0;
 		pi = &V_tcbinfo;
 	} else if (id->proto == IPPROTO_UDP) {
 		lookupflags = INPLOOKUP_WILDCARD;
 		pi = &V_udbinfo;
 	} else if (id->proto == IPPROTO_UDPLITE) {
 		lookupflags = INPLOOKUP_WILDCARD;
 		pi = &V_ulitecbinfo;
 	} else
 		return 0;
 	lookupflags |= INPLOOKUP_RLOCKPCB;
 	match = 0;
 	if (*ugid_lookupp == 0) {
 		if (id->addr_type == 6) {
 #ifdef INET6
 			if (args->flags & IPFW_ARGS_IN)
 				pcb = in6_pcblookup_mbuf(pi,
 				    &id->src_ip6, htons(id->src_port),
 				    &id->dst_ip6, htons(id->dst_port),
 				    lookupflags, NULL, args->m);
 			else
 				pcb = in6_pcblookup_mbuf(pi,
 				    &id->dst_ip6, htons(id->dst_port),
 				    &id->src_ip6, htons(id->src_port),
 				    lookupflags, args->ifp, args->m);
 #else
 			*ugid_lookupp = -1;
 			return (0);
 #endif
 		} else {
 			src_ip.s_addr = htonl(id->src_ip);
 			dst_ip.s_addr = htonl(id->dst_ip);
 			if (args->flags & IPFW_ARGS_IN)
 				pcb = in_pcblookup_mbuf(pi,
 				    src_ip, htons(id->src_port),
 				    dst_ip, htons(id->dst_port),
 				    lookupflags, NULL, args->m);
 			else
 				pcb = in_pcblookup_mbuf(pi,
 				    dst_ip, htons(id->dst_port),
 				    src_ip, htons(id->src_port),
 				    lookupflags, args->ifp, args->m);
 		}
 		if (pcb != NULL) {
 			INP_RLOCK_ASSERT(pcb);
 			*uc = crhold(pcb->inp_cred);
 			*ugid_lookupp = 1;
 			INP_RUNLOCK(pcb);
 		}
 		if (*ugid_lookupp == 0) {
 			/*
 			 * We tried and failed, set the variable to -1
 			 * so we will not try again on this packet.
 			 */
 			*ugid_lookupp = -1;
 			return (0);
 		}
 	}
 	if (insn->o.opcode == O_UID)
 		match = ((*uc)->cr_uid == (uid_t)insn->d[0]);
 	else if (insn->o.opcode == O_GID)
 		match = groupmember((gid_t)insn->d[0], *uc);
 	else if (insn->o.opcode == O_JAIL)
 		match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
 	return (match);
 #endif /* __FreeBSD__ */
 #endif /* not supported in userspace */
 }
 
 /*
  * Helper function to set args with info on the rule after the matching
  * one. slot is precise, whereas we guess rule_id as they are
  * assigned sequentially.
  */
 static inline void
 set_match(struct ip_fw_args *args, int slot,
 	struct ip_fw_chain *chain)
 {
 	args->rule.chain_id = chain->id;
 	args->rule.slot = slot + 1; /* we use 0 as a marker */
 	args->rule.rule_id = 1 + chain->map[slot]->id;
 	args->rule.rulenum = chain->map[slot]->rulenum;
 	args->flags |= IPFW_ARGS_REF;
 }
 
 #ifndef LINEAR_SKIPTO
 /*
  * Helper function to enable cached rule lookups using
  * cached_id and cached_pos fields in ipfw rule.
  */
 static int
 jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards)
 {
 	int f_pos;
 
 	/* If possible use cached f_pos (in f->cached_pos),
 	 * whose version is written in f->cached_id
 	 * (horrible hacks to avoid changing the ABI).
 	 */
 	if (num != IP_FW_TARG && f->cached_id == chain->id)
 		f_pos = f->cached_pos;
 	else {
 		int i = IP_FW_ARG_TABLEARG(chain, num, skipto);
 		/* make sure we do not jump backward */
 		if (jump_backwards == 0 && i <= f->rulenum)
 			i = f->rulenum + 1;
 		if (chain->idxmap != NULL)
 			f_pos = chain->idxmap[i];
 		else
 			f_pos = ipfw_find_rule(chain, i, 0);
 		/* update the cache */
 		if (num != IP_FW_TARG) {
 			f->cached_id = chain->id;
 			f->cached_pos = f_pos;
 		}
 	}
 
 	return (f_pos);
 }
 #else
 /*
  * Helper function to enable real fast rule lookups.
  */
 static int
 jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards)
 {
 	int f_pos;
 
 	num = IP_FW_ARG_TABLEARG(chain, num, skipto);
 	/* make sure we do not jump backward */
 	if (jump_backwards == 0 && num <= f->rulenum)
 		num = f->rulenum + 1;
 	f_pos = chain->idxmap[num];
 
 	return (f_pos);
 }
 #endif
 
 #define	TARG(k, f)	IP_FW_ARG_TABLEARG(chain, k, f)
 /*
  * The main check routine for the firewall.
  *
  * All arguments are in args so we can modify them and return them
  * back to the caller.
  *
  * Parameters:
  *
  *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
  *		Starts with the IP header.
  *	args->L3offset	Number of bytes bypassed if we came from L2.
  *			e.g. often sizeof(eh)  ** NOTYET **
  *	args->ifp	Incoming or outgoing interface.
  *	args->divert_rule (in/out)
  *		Skip up to the first rule past this rule number;
  *		upon return, non-zero port number for divert or tee.
  *
  *	args->rule	Pointer to the last matching rule (in/out)
  *	args->next_hop	Socket we are forwarding to (out).
  *	args->next_hop6	IPv6 next hop we are forwarding to (out).
  *	args->f_id	Addresses grabbed from the packet (out)
  * 	args->rule.info	a cookie depending on rule action
  *
  * Return value:
  *
  *	IP_FW_PASS	the packet must be accepted
  *	IP_FW_DENY	the packet must be dropped
  *	IP_FW_DIVERT	divert packet, port in m_tag
  *	IP_FW_TEE	tee packet, port in m_tag
  *	IP_FW_DUMMYNET	to dummynet, pipe in args->cookie
  *	IP_FW_NETGRAPH	into netgraph, cookie args->cookie
  *		args->rule contains the matching rule,
  *		args->rule.info has additional information.
  *
  */
 int
 ipfw_chk(struct ip_fw_args *args)
 {
 
 	/*
 	 * Local variables holding state while processing a packet:
 	 *
 	 * IMPORTANT NOTE: to speed up the processing of rules, there
 	 * are some assumption on the values of the variables, which
 	 * are documented here. Should you change them, please check
 	 * the implementation of the various instructions to make sure
 	 * that they still work.
 	 *
 	 * m | args->m	Pointer to the mbuf, as received from the caller.
 	 *	It may change if ipfw_chk() does an m_pullup, or if it
 	 *	consumes the packet because it calls send_reject().
 	 *	XXX This has to change, so that ipfw_chk() never modifies
 	 *	or consumes the buffer.
 	 *	OR
 	 * args->mem	Pointer to contigous memory chunk.
 	 * ip	Is the beginning of the ip(4 or 6) header.
 	 * eh	Ethernet header in case if input is Layer2.
 	 */
 	struct mbuf *m;
 	struct ip *ip;
 	struct ether_header *eh;
 
 	/*
 	 * For rules which contain uid/gid or jail constraints, cache
 	 * a copy of the users credentials after the pcb lookup has been
 	 * executed. This will speed up the processing of rules with
 	 * these types of constraints, as well as decrease contention
 	 * on pcb related locks.
 	 */
 #ifndef __FreeBSD__
 	struct bsd_ucred ucred_cache;
 #else
 	struct ucred *ucred_cache = NULL;
 #endif
 	int ucred_lookup = 0;
 	int f_pos = 0;		/* index of current rule in the array */
 	int retval = 0;
 	struct ifnet *oif, *iif;
 
 	/*
 	 * hlen	The length of the IP header.
 	 */
 	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */
 
 	/*
 	 * offset	The offset of a fragment. offset != 0 means that
 	 *	we have a fragment at this offset of an IPv4 packet.
 	 *	offset == 0 means that (if this is an IPv4 packet)
 	 *	this is the first or only fragment.
 	 *	For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header
 	 *	or there is a single packet fragment (fragment header added
 	 *	without needed).  We will treat a single packet fragment as if
 	 *	there was no fragment header (or log/block depending on the
 	 *	V_fw_permit_single_frag6 sysctl setting).
 	 */
 	u_short offset = 0;
 	u_short ip6f_mf = 0;
 
 	/*
 	 * Local copies of addresses. They are only valid if we have
 	 * an IP packet.
 	 *
 	 * proto	The protocol. Set to 0 for non-ip packets,
 	 *	or to the protocol read from the packet otherwise.
 	 *	proto != 0 means that we have an IPv4 packet.
 	 *
 	 * src_port, dst_port	port numbers, in HOST format. Only
 	 *	valid for TCP and UDP packets.
 	 *
 	 * src_ip, dst_ip	ip addresses, in NETWORK format.
 	 *	Only valid for IPv4 packets.
 	 */
 	uint8_t proto;
 	uint16_t src_port, dst_port;		/* NOTE: host format	*/
 	struct in_addr src_ip, dst_ip;		/* NOTE: network format	*/
 	int iplen = 0;
 	int pktlen;
 
 	struct ipfw_dyn_info dyn_info;
 	struct ip_fw *q = NULL;
 	struct ip_fw_chain *chain = &V_layer3_chain;
 
 	/*
 	 * We store in ulp a pointer to the upper layer protocol header.
 	 * In the ipv4 case this is easy to determine from the header,
 	 * but for ipv6 we might have some additional headers in the middle.
 	 * ulp is NULL if not found.
 	 */
 	void *ulp = NULL;		/* upper layer protocol pointer. */
 
 	/* XXX ipv6 variables */
 	int is_ipv6 = 0;
 	uint8_t	icmp6_type = 0;
 	uint16_t ext_hd = 0;	/* bits vector for extension header filtering */
 	/* end of ipv6 variables */
 
 	int is_ipv4 = 0;
 
 	int done = 0;		/* flag to exit the outer loop */
 	IPFW_RLOCK_TRACKER;
 	bool mem;
 
 	if ((mem = (args->flags & IPFW_ARGS_LENMASK))) {
 		if (args->flags & IPFW_ARGS_ETHER) {
 			eh = (struct ether_header *)args->mem;
 			if (eh->ether_type == htons(ETHERTYPE_VLAN))
 				ip = (struct ip *)
 				    ((struct ether_vlan_header *)eh + 1);
 			else
 				ip = (struct ip *)(eh + 1);
 		} else {
 			eh = NULL;
 			ip = (struct ip *)args->mem;
 		}
 		pktlen = IPFW_ARGS_LENGTH(args->flags);
 		args->f_id.fib = args->ifp->if_fib;	/* best guess */
 	} else {
 		m = args->m;
 		if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready))
 			return (IP_FW_PASS);	/* accept */
 		if (args->flags & IPFW_ARGS_ETHER) {
 	                /* We need some amount of data to be contiguous. */
 			if (m->m_len < min(m->m_pkthdr.len, max_protohdr) &&
 			    (args->m = m = m_pullup(m, min(m->m_pkthdr.len,
 			    max_protohdr))) == NULL)
 				goto pullup_failed;
 			eh = mtod(m, struct ether_header *);
 			ip = (struct ip *)(eh + 1);
 		} else {
 			eh = NULL;
 			ip = mtod(m, struct ip *);
 		}
 		pktlen = m->m_pkthdr.len;
 		args->f_id.fib = M_GETFIB(m); /* mbuf not altered */
 	}
 
 	dst_ip.s_addr = 0;		/* make sure it is initialized */
 	src_ip.s_addr = 0;		/* make sure it is initialized */
 	src_port = dst_port = 0;
 
 	DYN_INFO_INIT(&dyn_info);
 /*
  * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
  * then it sets p to point at the offset "len" in the mbuf. WARNING: the
  * pointer might become stale after other pullups (but we never use it
  * this way).
  */
 #define PULLUP_TO(_len, p, T)	PULLUP_LEN(_len, p, sizeof(T))
 #define	EHLEN	(eh != NULL ? ((char *)ip - (char *)eh) : 0)
 #define PULLUP_LEN(_len, p, T)					\
 do {								\
 	int x = (_len) + T + EHLEN;				\
 	if (mem) {						\
 		MPASS(pktlen >= x);				\
 		p = (char *)args->mem + (_len) + EHLEN;		\
 	} else {						\
 		if (__predict_false((m)->m_len < x)) {		\
 			args->m = m = m_pullup(m, x);		\
 			if (m == NULL)				\
 				goto pullup_failed;		\
 		}						\
 		p = mtod(m, char *) + (_len) + EHLEN;		\
 	}							\
 } while (0)
 /*
  * In case pointers got stale after pullups, update them.
  */
 #define	UPDATE_POINTERS()					\
 do {								\
 	if (!mem) {						\
 		if (eh != NULL) {				\
 			eh = mtod(m, struct ether_header *);	\
 			ip = (struct ip *)(eh + 1);		\
 		} else						\
 			ip = mtod(m, struct ip *);		\
 		args->m = m;					\
 	}							\
 } while (0)
 
 	/* Identify IP packets and fill up variables. */
 	if (pktlen >= sizeof(struct ip6_hdr) &&
 	    (eh == NULL || eh->ether_type == htons(ETHERTYPE_IPV6)) &&
 	    ip->ip_v == 6) {
 		struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
 
 		is_ipv6 = 1;
 		args->flags |= IPFW_ARGS_IP6;
 		hlen = sizeof(struct ip6_hdr);
 		proto = ip6->ip6_nxt;
 		/* Search extension headers to find upper layer protocols */
 		while (ulp == NULL && offset == 0) {
 			switch (proto) {
 			case IPPROTO_ICMPV6:
 				PULLUP_TO(hlen, ulp, struct icmp6_hdr);
 				icmp6_type = ICMP6(ulp)->icmp6_type;
 				break;
 
 			case IPPROTO_TCP:
 				PULLUP_TO(hlen, ulp, struct tcphdr);
 				dst_port = TCP(ulp)->th_dport;
 				src_port = TCP(ulp)->th_sport;
 				/* save flags for dynamic rules */
 				args->f_id._flags = TCP(ulp)->th_flags;
 				break;
 
 			case IPPROTO_SCTP:
 				if (pktlen >= hlen + sizeof(struct sctphdr) +
 				    sizeof(struct sctp_chunkhdr) +
 				    offsetof(struct sctp_init, a_rwnd))
 					PULLUP_LEN(hlen, ulp,
 					    sizeof(struct sctphdr) +
 					    sizeof(struct sctp_chunkhdr) +
 					    offsetof(struct sctp_init, a_rwnd));
 				else if (pktlen >= hlen + sizeof(struct sctphdr))
 					PULLUP_LEN(hlen, ulp, pktlen - hlen);
 				else
 					PULLUP_LEN(hlen, ulp,
 					    sizeof(struct sctphdr));
 				src_port = SCTP(ulp)->src_port;
 				dst_port = SCTP(ulp)->dest_port;
 				break;
 
 			case IPPROTO_UDP:
 			case IPPROTO_UDPLITE:
 				PULLUP_TO(hlen, ulp, struct udphdr);
 				dst_port = UDP(ulp)->uh_dport;
 				src_port = UDP(ulp)->uh_sport;
 				break;
 
 			case IPPROTO_HOPOPTS:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_hbh);
 				ext_hd |= EXT_HOPOPTS;
 				hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
 				proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_ROUTING:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_rthdr);
 				switch (((struct ip6_rthdr *)ulp)->ip6r_type) {
 				case 0:
 					ext_hd |= EXT_RTHDR0;
 					break;
 				case 2:
 					ext_hd |= EXT_RTHDR2;
 					break;
 				default:
 					if (V_fw_verbose)
 						printf("IPFW2: IPV6 - Unknown "
 						    "Routing Header type(%d)\n",
 						    ((struct ip6_rthdr *)
 						    ulp)->ip6r_type);
 					if (V_fw_deny_unknown_exthdrs)
 					    return (IP_FW_DENY);
 					break;
 				}
 				ext_hd |= EXT_ROUTING;
 				hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
 				proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_FRAGMENT:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_frag);
 				ext_hd |= EXT_FRAGMENT;
 				hlen += sizeof (struct ip6_frag);
 				proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
 				offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
 					IP6F_OFF_MASK;
 				ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg &
 					IP6F_MORE_FRAG;
 				if (V_fw_permit_single_frag6 == 0 &&
 				    offset == 0 && ip6f_mf == 0) {
 					if (V_fw_verbose)
 						printf("IPFW2: IPV6 - Invalid "
 						    "Fragment Header\n");
 					if (V_fw_deny_unknown_exthdrs)
 					    return (IP_FW_DENY);
 					break;
 				}
 				args->f_id.extra =
 				    ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
 				ulp = NULL;
 				break;
 
 			case IPPROTO_DSTOPTS:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_hbh);
 				ext_hd |= EXT_DSTOPTS;
 				hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
 				proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_AH:	/* RFC 2402 */
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				ext_hd |= EXT_AH;
 				hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
 				proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_ESP:	/* RFC 2406 */
 				PULLUP_TO(hlen, ulp, uint32_t);	/* SPI, Seq# */
 				/* Anything past Seq# is variable length and
 				 * data past this ext. header is encrypted. */
 				ext_hd |= EXT_ESP;
 				break;
 
 			case IPPROTO_NONE:	/* RFC 2460 */
 				/*
 				 * Packet ends here, and IPv6 header has
 				 * already been pulled up. If ip6e_len!=0
 				 * then octets must be ignored.
 				 */
 				ulp = ip; /* non-NULL to get out of loop. */
 				break;
 
 			case IPPROTO_OSPFIGP:
 				/* XXX OSPF header check? */
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				break;
 
 			case IPPROTO_PIM:
 				/* XXX PIM header check? */
 				PULLUP_TO(hlen, ulp, struct pim);
 				break;
 
 			case IPPROTO_GRE:	/* RFC 1701 */
 				/* XXX GRE header check? */
 				PULLUP_TO(hlen, ulp, struct grehdr);
 				break;
 
 			case IPPROTO_CARP:
 				PULLUP_TO(hlen, ulp, offsetof(
 				    struct carp_header, carp_counter));
 				if (CARP_ADVERTISEMENT !=
 				    ((struct carp_header *)ulp)->carp_type)
 					return (IP_FW_DENY);
 				break;
 
 			case IPPROTO_IPV6:	/* RFC 2893 */
 				PULLUP_TO(hlen, ulp, struct ip6_hdr);
 				break;
 
 			case IPPROTO_IPV4:	/* RFC 2893 */
 				PULLUP_TO(hlen, ulp, struct ip);
 				break;
 
 			default:
 				if (V_fw_verbose)
 					printf("IPFW2: IPV6 - Unknown "
 					    "Extension Header(%d), ext_hd=%x\n",
 					     proto, ext_hd);
 				if (V_fw_deny_unknown_exthdrs)
 				    return (IP_FW_DENY);
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				break;
 			} /*switch */
 		}
 		UPDATE_POINTERS();
 		ip6 = (struct ip6_hdr *)ip;
 		args->f_id.addr_type = 6;
 		args->f_id.src_ip6 = ip6->ip6_src;
 		args->f_id.dst_ip6 = ip6->ip6_dst;
 		args->f_id.flow_id6 = ntohl(ip6->ip6_flow);
 		iplen = ntohs(ip6->ip6_plen) + sizeof(*ip6);
 	} else if (pktlen >= sizeof(struct ip) &&
 	    (eh == NULL || eh->ether_type == htons(ETHERTYPE_IP)) &&
 	    ip->ip_v == 4) {
 		is_ipv4 = 1;
 		args->flags |= IPFW_ARGS_IP4;
 		hlen = ip->ip_hl << 2;
 		/*
 		 * Collect parameters into local variables for faster
 		 * matching.
 		 */
 		proto = ip->ip_p;
 		src_ip = ip->ip_src;
 		dst_ip = ip->ip_dst;
 		offset = ntohs(ip->ip_off) & IP_OFFMASK;
 		iplen = ntohs(ip->ip_len);
 
 		if (offset == 0) {
 			switch (proto) {
 			case IPPROTO_TCP:
 				PULLUP_TO(hlen, ulp, struct tcphdr);
 				dst_port = TCP(ulp)->th_dport;
 				src_port = TCP(ulp)->th_sport;
 				/* save flags for dynamic rules */
 				args->f_id._flags = TCP(ulp)->th_flags;
 				break;
 
 			case IPPROTO_SCTP:
 				if (pktlen >= hlen + sizeof(struct sctphdr) +
 				    sizeof(struct sctp_chunkhdr) +
 				    offsetof(struct sctp_init, a_rwnd))
 					PULLUP_LEN(hlen, ulp,
 					    sizeof(struct sctphdr) +
 					    sizeof(struct sctp_chunkhdr) +
 					    offsetof(struct sctp_init, a_rwnd));
 				else if (pktlen >= hlen + sizeof(struct sctphdr))
 					PULLUP_LEN(hlen, ulp, pktlen - hlen);
 				else
 					PULLUP_LEN(hlen, ulp,
 					    sizeof(struct sctphdr));
 				src_port = SCTP(ulp)->src_port;
 				dst_port = SCTP(ulp)->dest_port;
 				break;
 
 			case IPPROTO_UDP:
 			case IPPROTO_UDPLITE:
 				PULLUP_TO(hlen, ulp, struct udphdr);
 				dst_port = UDP(ulp)->uh_dport;
 				src_port = UDP(ulp)->uh_sport;
 				break;
 
 			case IPPROTO_ICMP:
 				PULLUP_TO(hlen, ulp, struct icmphdr);
 				//args->f_id.flags = ICMP(ulp)->icmp_type;
 				break;
 
 			default:
 				break;
 			}
 		}
 
 		UPDATE_POINTERS();
 		args->f_id.addr_type = 4;
 		args->f_id.src_ip = ntohl(src_ip.s_addr);
 		args->f_id.dst_ip = ntohl(dst_ip.s_addr);
 	} else {
 		proto = 0;
 		dst_ip.s_addr = src_ip.s_addr = 0;
 
 		args->f_id.addr_type = 1; /* XXX */
 	}
 #undef PULLUP_TO
 	pktlen = iplen < pktlen ? iplen: pktlen;
 
 	/* Properly initialize the rest of f_id */
 	args->f_id.proto = proto;
 	args->f_id.src_port = src_port = ntohs(src_port);
 	args->f_id.dst_port = dst_port = ntohs(dst_port);
 
 	IPFW_PF_RLOCK(chain);
 	if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */
 		IPFW_PF_RUNLOCK(chain);
 		return (IP_FW_PASS);	/* accept */
 	}
 	if (args->flags & IPFW_ARGS_REF) {
 		/*
 		 * Packet has already been tagged as a result of a previous
 		 * match on rule args->rule aka args->rule_id (PIPE, QUEUE,
 		 * REASS, NETGRAPH, DIVERT/TEE...)
 		 * Validate the slot and continue from the next one
 		 * if still present, otherwise do a lookup.
 		 */
 		f_pos = (args->rule.chain_id == chain->id) ?
 		    args->rule.slot :
 		    ipfw_find_rule(chain, args->rule.rulenum,
 			args->rule.rule_id);
 	} else {
 		f_pos = 0;
 	}
 
 	if (args->flags & IPFW_ARGS_IN) {
 		iif = args->ifp;
 		oif = NULL;
 	} else {
 		MPASS(args->flags & IPFW_ARGS_OUT);
-		iif = mem ? NULL : m->m_pkthdr.rcvif;
+		iif = mem ? NULL : m_rcvif(m);
 		oif = args->ifp;
 	}
 
 	/*
 	 * Now scan the rules, and parse microinstructions for each rule.
 	 * We have two nested loops and an inner switch. Sometimes we
 	 * need to break out of one or both loops, or re-enter one of
 	 * the loops with updated variables. Loop variables are:
 	 *
 	 *	f_pos (outer loop) points to the current rule.
 	 *		On output it points to the matching rule.
 	 *	done (outer loop) is used as a flag to break the loop.
 	 *	l (inner loop)	residual length of current rule.
 	 *		cmd points to the current microinstruction.
 	 *
 	 * We break the inner loop by setting l=0 and possibly
 	 * cmdlen=0 if we don't want to advance cmd.
 	 * We break the outer loop by setting done=1
 	 * We can restart the inner loop by setting l>0 and f_pos, f, cmd
 	 * as needed.
 	 */
 	for (; f_pos < chain->n_rules; f_pos++) {
 		ipfw_insn *cmd;
 		uint32_t tablearg = 0;
 		int l, cmdlen, skip_or; /* skip rest of OR block */
 		struct ip_fw *f;
 
 		f = chain->map[f_pos];
 		if (V_set_disable & (1 << f->set) )
 			continue;
 
 		skip_or = 0;
 		for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
 		    l -= cmdlen, cmd += cmdlen) {
 			int match;
 
 			/*
 			 * check_body is a jump target used when we find a
 			 * CHECK_STATE, and need to jump to the body of
 			 * the target rule.
 			 */
 
 /* check_body: */
 			cmdlen = F_LEN(cmd);
 			/*
 			 * An OR block (insn_1 || .. || insn_n) has the
 			 * F_OR bit set in all but the last instruction.
 			 * The first match will set "skip_or", and cause
 			 * the following instructions to be skipped until
 			 * past the one with the F_OR bit clear.
 			 */
 			if (skip_or) {		/* skip this instruction */
 				if ((cmd->len & F_OR) == 0)
 					skip_or = 0;	/* next one is good */
 				continue;
 			}
 			match = 0; /* set to 1 if we succeed */
 
 			switch (cmd->opcode) {
 			/*
 			 * The first set of opcodes compares the packet's
 			 * fields with some pattern, setting 'match' if a
 			 * match is found. At the end of the loop there is
 			 * logic to deal with F_NOT and F_OR flags associated
 			 * with the opcode.
 			 */
 			case O_NOP:
 				match = 1;
 				break;
 
 			case O_FORWARD_MAC:
 				printf("ipfw: opcode %d unimplemented\n",
 				    cmd->opcode);
 				break;
 
 			case O_GID:
 			case O_UID:
 			case O_JAIL:
 				/*
 				 * We only check offset == 0 && proto != 0,
 				 * as this ensures that we have a
 				 * packet with the ports info.
 				 */
 				if (offset != 0)
 					break;
 				if (proto == IPPROTO_TCP ||
 				    proto == IPPROTO_UDP ||
 				    proto == IPPROTO_UDPLITE)
 					match = check_uidgid(
 						    (ipfw_insn_u32 *)cmd,
 						    args, &ucred_lookup,
 #ifdef __FreeBSD__
 						    &ucred_cache);
 #else
 						    (void *)&ucred_cache);
 #endif
 				break;
 
 			case O_RECV:
 				match = iface_match(iif, (ipfw_insn_if *)cmd,
 				    chain, &tablearg);
 				break;
 
 			case O_XMIT:
 				match = iface_match(oif, (ipfw_insn_if *)cmd,
 				    chain, &tablearg);
 				break;
 
 			case O_VIA:
 				match = iface_match(args->ifp,
 				    (ipfw_insn_if *)cmd, chain, &tablearg);
 				break;
 
 			case O_MACADDR2:
 				if (args->flags & IPFW_ARGS_ETHER) {
 					u_int32_t *want = (u_int32_t *)
 						((ipfw_insn_mac *)cmd)->addr;
 					u_int32_t *mask = (u_int32_t *)
 						((ipfw_insn_mac *)cmd)->mask;
 					u_int32_t *hdr = (u_int32_t *)eh;
 
 					match =
 					    ( want[0] == (hdr[0] & mask[0]) &&
 					      want[1] == (hdr[1] & mask[1]) &&
 					      want[2] == (hdr[2] & mask[2]) );
 				}
 				break;
 
 			case O_MAC_TYPE:
 				if (args->flags & IPFW_ARGS_ETHER) {
 					u_int16_t *p =
 					    ((ipfw_insn_u16 *)cmd)->ports;
 					int i;
 
 					for (i = cmdlen - 1; !match && i>0;
 					    i--, p += 2)
 						match =
 						    (ntohs(eh->ether_type) >=
 						    p[0] &&
 						    ntohs(eh->ether_type) <=
 						    p[1]);
 				}
 				break;
 
 			case O_FRAG:
 				match = (offset != 0);
 				break;
 
 			case O_IN:	/* "out" is "not in" */
 				match = (oif == NULL);
 				break;
 
 			case O_LAYER2:
 				match = (args->flags & IPFW_ARGS_ETHER);
 				break;
 
 			case O_DIVERTED:
 				if ((args->flags & IPFW_ARGS_REF) == 0)
 					break;
 				/*
 				 * For diverted packets, args->rule.info
 				 * contains the divert port (in host format)
 				 * reason and direction.
 				 */
 				match = ((args->rule.info & IPFW_IS_MASK) ==
 				    IPFW_IS_DIVERT) && (
 				    ((args->rule.info & IPFW_INFO_IN) ?
 					1: 2) & cmd->arg1);
 				break;
 
 			case O_PROTO:
 				/*
 				 * We do not allow an arg of 0 so the
 				 * check of "proto" only suffices.
 				 */
 				match = (proto == cmd->arg1);
 				break;
 
 			case O_IP_SRC:
 				match = is_ipv4 &&
 				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
 				    src_ip.s_addr);
 				break;
 
 			case O_IP_DST_LOOKUP:
 			{
 				void *pkey;
 				uint32_t vidx, key;
 				uint16_t keylen;
 
 				if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) {
 					/* Determine lookup key type */
 					vidx = ((ipfw_insn_u32 *)cmd)->d[1];
 					if (vidx != 4 /* uid */ &&
 					    vidx != 5 /* jail */ &&
 					    is_ipv6 == 0 && is_ipv4 == 0)
 						break;
 					/* Determine key length */
 					if (vidx == 0 /* dst-ip */ ||
 					    vidx == 1 /* src-ip */)
 						keylen = is_ipv6 ?
 						    sizeof(struct in6_addr):
 						    sizeof(in_addr_t);
 					else {
 						keylen = sizeof(key);
 						pkey = &key;
 					}
 					if (vidx == 0 /* dst-ip */)
 						pkey = is_ipv4 ? (void *)&dst_ip:
 						    (void *)&args->f_id.dst_ip6;
 					else if (vidx == 1 /* src-ip */)
 						pkey = is_ipv4 ? (void *)&src_ip:
 						    (void *)&args->f_id.src_ip6;
 					else if (vidx == 6 /* dscp */) {
 						if (is_ipv4)
 							key = ip->ip_tos >> 2;
 						else {
 							key = args->f_id.flow_id6;
 							key = (key & 0x0f) << 2 |
 							    (key & 0xf000) >> 14;
 						}
 						key &= 0x3f;
 					} else if (vidx == 2 /* dst-port */ ||
 					    vidx == 3 /* src-port */) {
 						/* Skip fragments */
 						if (offset != 0)
 							break;
 						/* Skip proto without ports */
 						if (proto != IPPROTO_TCP &&
 						    proto != IPPROTO_UDP &&
 						    proto != IPPROTO_UDPLITE &&
 						    proto != IPPROTO_SCTP)
 							break;
 						if (vidx == 2 /* dst-port */)
 							key = dst_port;
 						else
 							key = src_port;
 					}
 #ifndef USERSPACE
 					else if (vidx == 4 /* uid */ ||
 					    vidx == 5 /* jail */) {
 						check_uidgid(
 						    (ipfw_insn_u32 *)cmd,
 						    args, &ucred_lookup,
 #ifdef __FreeBSD__
 						    &ucred_cache);
 						if (vidx == 4 /* uid */)
 							key = ucred_cache->cr_uid;
 						else if (vidx == 5 /* jail */)
 							key = ucred_cache->cr_prison->pr_id;
 #else /* !__FreeBSD__ */
 						    (void *)&ucred_cache);
 						if (vidx == 4 /* uid */)
 							key = ucred_cache.uid;
 						else if (vidx == 5 /* jail */)
 							key = ucred_cache.xid;
 #endif /* !__FreeBSD__ */
 					}
 #endif /* !USERSPACE */
 					else
 						break;
 					match = ipfw_lookup_table(chain,
 					    cmd->arg1, keylen, pkey, &vidx);
 					if (!match)
 						break;
 					tablearg = vidx;
 					break;
 				}
 				/* cmdlen =< F_INSN_SIZE(ipfw_insn_u32) */
 				/* FALLTHROUGH */
 			}
 			case O_IP_SRC_LOOKUP:
 			{
 				void *pkey;
 				uint32_t vidx;
 				uint16_t keylen;
 
 				if (is_ipv4) {
 					keylen = sizeof(in_addr_t);
 					if (cmd->opcode == O_IP_DST_LOOKUP)
 						pkey = &dst_ip;
 					else
 						pkey = &src_ip;
 				} else if (is_ipv6) {
 					keylen = sizeof(struct in6_addr);
 					if (cmd->opcode == O_IP_DST_LOOKUP)
 						pkey = &args->f_id.dst_ip6;
 					else
 						pkey = &args->f_id.src_ip6;
 				} else
 					break;
 				match = ipfw_lookup_table(chain, cmd->arg1,
 				    keylen, pkey, &vidx);
 				if (!match)
 					break;
 				if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) {
 					match = ((ipfw_insn_u32 *)cmd)->d[0] ==
 					    TARG_VAL(chain, vidx, tag);
 					if (!match)
 						break;
 				}
 				tablearg = vidx;
 				break;
 			}
 
 			case O_IP_FLOW_LOOKUP:
 				{
 					uint32_t v = 0;
 					match = ipfw_lookup_table(chain,
 					    cmd->arg1, 0, &args->f_id, &v);
 					if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
 						match = ((ipfw_insn_u32 *)cmd)->d[0] ==
 						    TARG_VAL(chain, v, tag);
 					if (match)
 						tablearg = v;
 				}
 				break;
 			case O_IP_SRC_MASK:
 			case O_IP_DST_MASK:
 				if (is_ipv4) {
 				    uint32_t a =
 					(cmd->opcode == O_IP_DST_MASK) ?
 					    dst_ip.s_addr : src_ip.s_addr;
 				    uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
 				    int i = cmdlen-1;
 
 				    for (; !match && i>0; i-= 2, p+= 2)
 					match = (p[0] == (a & p[1]));
 				}
 				break;
 
 			case O_IP_SRC_ME:
 				if (is_ipv4) {
 					match = in_localip(src_ip);
 					break;
 				}
 #ifdef INET6
 				/* FALLTHROUGH */
 			case O_IP6_SRC_ME:
 				match = is_ipv6 &&
 				    ipfw_localip6(&args->f_id.src_ip6);
 #endif
 				break;
 
 			case O_IP_DST_SET:
 			case O_IP_SRC_SET:
 				if (is_ipv4) {
 					u_int32_t *d = (u_int32_t *)(cmd+1);
 					u_int32_t addr =
 					    cmd->opcode == O_IP_DST_SET ?
 						args->f_id.dst_ip :
 						args->f_id.src_ip;
 
 					    if (addr < d[0])
 						    break;
 					    addr -= d[0]; /* subtract base */
 					    match = (addr < cmd->arg1) &&
 						( d[ 1 + (addr>>5)] &
 						  (1<<(addr & 0x1f)) );
 				}
 				break;
 
 			case O_IP_DST:
 				match = is_ipv4 &&
 				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
 				    dst_ip.s_addr);
 				break;
 
 			case O_IP_DST_ME:
 				if (is_ipv4) {
 					match = in_localip(dst_ip);
 					break;
 				}
 #ifdef INET6
 				/* FALLTHROUGH */
 			case O_IP6_DST_ME:
 				match = is_ipv6 &&
 				    ipfw_localip6(&args->f_id.dst_ip6);
 #endif
 				break;
 
 
 			case O_IP_SRCPORT:
 			case O_IP_DSTPORT:
 				/*
 				 * offset == 0 && proto != 0 is enough
 				 * to guarantee that we have a
 				 * packet with port info.
 				 */
 				if ((proto == IPPROTO_UDP ||
 				    proto == IPPROTO_UDPLITE ||
 				    proto == IPPROTO_TCP ||
 				    proto == IPPROTO_SCTP) && offset == 0) {
 					u_int16_t x =
 					    (cmd->opcode == O_IP_SRCPORT) ?
 						src_port : dst_port ;
 					u_int16_t *p =
 					    ((ipfw_insn_u16 *)cmd)->ports;
 					int i;
 
 					for (i = cmdlen - 1; !match && i>0;
 					    i--, p += 2)
 						match = (x>=p[0] && x<=p[1]);
 				}
 				break;
 
 			case O_ICMPTYPE:
 				match = (offset == 0 && proto==IPPROTO_ICMP &&
 				    icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) );
 				break;
 
 #ifdef INET6
 			case O_ICMP6TYPE:
 				match = is_ipv6 && offset == 0 &&
 				    proto==IPPROTO_ICMPV6 &&
 				    icmp6type_match(
 					ICMP6(ulp)->icmp6_type,
 					(ipfw_insn_u32 *)cmd);
 				break;
 #endif /* INET6 */
 
 			case O_IPOPT:
 				match = (is_ipv4 &&
 				    ipopts_match(ip, cmd) );
 				break;
 
 			case O_IPVER:
 				match = (is_ipv4 &&
 				    cmd->arg1 == ip->ip_v);
 				break;
 
 			case O_IPID:
 			case O_IPTTL:
 				if (!is_ipv4)
 					break;
 			case O_IPLEN:
 				{	/* only for IP packets */
 				    uint16_t x;
 				    uint16_t *p;
 				    int i;
 
 				    if (cmd->opcode == O_IPLEN)
 					x = iplen;
 				    else if (cmd->opcode == O_IPTTL)
 					x = ip->ip_ttl;
 				    else /* must be IPID */
 					x = ntohs(ip->ip_id);
 				    if (cmdlen == 1) {
 					match = (cmd->arg1 == x);
 					break;
 				    }
 				    /* otherwise we have ranges */
 				    p = ((ipfw_insn_u16 *)cmd)->ports;
 				    i = cmdlen - 1;
 				    for (; !match && i>0; i--, p += 2)
 					match = (x >= p[0] && x <= p[1]);
 				}
 				break;
 
 			case O_IPPRECEDENCE:
 				match = (is_ipv4 &&
 				    (cmd->arg1 == (ip->ip_tos & 0xe0)) );
 				break;
 
 			case O_IPTOS:
 				match = (is_ipv4 &&
 				    flags_match(cmd, ip->ip_tos));
 				break;
 
 			case O_DSCP:
 			    {
 				uint32_t *p;
 				uint16_t x;
 
 				p = ((ipfw_insn_u32 *)cmd)->d;
 
 				if (is_ipv4)
 					x = ip->ip_tos >> 2;
 				else if (is_ipv6) {
 					uint8_t *v;
 					v = &((struct ip6_hdr *)ip)->ip6_vfc;
 					x = (*v & 0x0F) << 2;
 					v++;
 					x |= *v >> 6;
 				} else
 					break;
 
 				/* DSCP bitmask is stored as low_u32 high_u32 */
 				if (x >= 32)
 					match = *(p + 1) & (1 << (x - 32));
 				else
 					match = *p & (1 << x);
 			    }
 				break;
 
 			case O_TCPDATALEN:
 				if (proto == IPPROTO_TCP && offset == 0) {
 				    struct tcphdr *tcp;
 				    uint16_t x;
 				    uint16_t *p;
 				    int i;
 #ifdef INET6
 				    if (is_ipv6) {
 					    struct ip6_hdr *ip6;
 
 					    ip6 = (struct ip6_hdr *)ip;
 					    if (ip6->ip6_plen == 0) {
 						    /*
 						     * Jumbo payload is not
 						     * supported by this
 						     * opcode.
 						     */
 						    break;
 					    }
 					    x = iplen - hlen;
 				    } else
 #endif /* INET6 */
 					    x = iplen - (ip->ip_hl << 2);
 				    tcp = TCP(ulp);
 				    x -= tcp->th_off << 2;
 				    if (cmdlen == 1) {
 					match = (cmd->arg1 == x);
 					break;
 				    }
 				    /* otherwise we have ranges */
 				    p = ((ipfw_insn_u16 *)cmd)->ports;
 				    i = cmdlen - 1;
 				    for (; !match && i>0; i--, p += 2)
 					match = (x >= p[0] && x <= p[1]);
 				}
 				break;
 
 			case O_TCPFLAGS:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    flags_match(cmd, TCP(ulp)->th_flags));
 				break;
 
 			case O_TCPOPTS:
 				if (proto == IPPROTO_TCP && offset == 0 && ulp){
 					PULLUP_LEN(hlen, ulp,
 					    (TCP(ulp)->th_off << 2));
 					match = tcpopts_match(TCP(ulp), cmd);
 				}
 				break;
 
 			case O_TCPSEQ:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
 					TCP(ulp)->th_seq);
 				break;
 
 			case O_TCPACK:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
 					TCP(ulp)->th_ack);
 				break;
 
 			case O_TCPWIN:
 				if (proto == IPPROTO_TCP && offset == 0) {
 				    uint16_t x;
 				    uint16_t *p;
 				    int i;
 
 				    x = ntohs(TCP(ulp)->th_win);
 				    if (cmdlen == 1) {
 					match = (cmd->arg1 == x);
 					break;
 				    }
 				    /* Otherwise we have ranges. */
 				    p = ((ipfw_insn_u16 *)cmd)->ports;
 				    i = cmdlen - 1;
 				    for (; !match && i > 0; i--, p += 2)
 					match = (x >= p[0] && x <= p[1]);
 				}
 				break;
 
 			case O_ESTAB:
 				/* reject packets which have SYN only */
 				/* XXX should i also check for TH_ACK ? */
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    (TCP(ulp)->th_flags &
 				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
 				break;
 
 			case O_ALTQ: {
 				struct pf_mtag *at;
 				struct m_tag *mtag;
 				ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
 
 				/*
 				 * ALTQ uses mbuf tags from another
 				 * packet filtering system - pf(4).
 				 * We allocate a tag in its format
 				 * and fill it in, pretending to be pf(4).
 				 */
 				match = 1;
 				at = pf_find_mtag(m);
 				if (at != NULL && at->qid != 0)
 					break;
 				mtag = m_tag_get(PACKET_TAG_PF,
 				    sizeof(struct pf_mtag), M_NOWAIT | M_ZERO);
 				if (mtag == NULL) {
 					/*
 					 * Let the packet fall back to the
 					 * default ALTQ.
 					 */
 					break;
 				}
 				m_tag_prepend(m, mtag);
 				at = (struct pf_mtag *)(mtag + 1);
 				at->qid = altq->qid;
 				at->hdr = ip;
 				break;
 			}
 
 			case O_LOG:
 				ipfw_log(chain, f, hlen, args,
 				    offset | ip6f_mf, tablearg, ip);
 				match = 1;
 				break;
 
 			case O_PROB:
 				match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
 				break;
 
 			case O_VERREVPATH:
 				/* Outgoing packets automatically pass/match */
 				match = (args->flags & IPFW_ARGS_OUT ||
 				    (
 #ifdef INET6
 				    is_ipv6 ?
 					verify_path6(&(args->f_id.src_ip6),
 					    iif, args->f_id.fib) :
 #endif
 				    verify_path(src_ip, iif, args->f_id.fib)));
 				break;
 
 			case O_VERSRCREACH:
 				/* Outgoing packets automatically pass/match */
 				match = (hlen > 0 && ((oif != NULL) || (
 #ifdef INET6
 				    is_ipv6 ?
 				        verify_path6(&(args->f_id.src_ip6),
 				            NULL, args->f_id.fib) :
 #endif
 				    verify_path(src_ip, NULL, args->f_id.fib))));
 				break;
 
 			case O_ANTISPOOF:
 				/* Outgoing packets automatically pass/match */
 				if (oif == NULL && hlen > 0 &&
 				    (  (is_ipv4 && in_localaddr(src_ip))
 #ifdef INET6
 				    || (is_ipv6 &&
 				        in6_localaddr(&(args->f_id.src_ip6)))
 #endif
 				    ))
 					match =
 #ifdef INET6
 					    is_ipv6 ? verify_path6(
 					        &(args->f_id.src_ip6), iif,
 						args->f_id.fib) :
 #endif
 					    verify_path(src_ip, iif,
 					        args->f_id.fib);
 				else
 					match = 1;
 				break;
 
 			case O_IPSEC:
 				match = (m_tag_find(m,
 				    PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
 				/* otherwise no match */
 				break;
 
 #ifdef INET6
 			case O_IP6_SRC:
 				match = is_ipv6 &&
 				    IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6,
 				    &((ipfw_insn_ip6 *)cmd)->addr6);
 				break;
 
 			case O_IP6_DST:
 				match = is_ipv6 &&
 				IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6,
 				    &((ipfw_insn_ip6 *)cmd)->addr6);
 				break;
 			case O_IP6_SRC_MASK:
 			case O_IP6_DST_MASK:
 				if (is_ipv6) {
 					int i = cmdlen - 1;
 					struct in6_addr p;
 					struct in6_addr *d =
 					    &((ipfw_insn_ip6 *)cmd)->addr6;
 
 					for (; !match && i > 0; d += 2,
 					    i -= F_INSN_SIZE(struct in6_addr)
 					    * 2) {
 						p = (cmd->opcode ==
 						    O_IP6_SRC_MASK) ?
 						    args->f_id.src_ip6:
 						    args->f_id.dst_ip6;
 						APPLY_MASK(&p, &d[1]);
 						match =
 						    IN6_ARE_ADDR_EQUAL(&d[0],
 						    &p);
 					}
 				}
 				break;
 
 			case O_FLOW6ID:
 				match = is_ipv6 &&
 				    flow6id_match(args->f_id.flow_id6,
 				    (ipfw_insn_u32 *) cmd);
 				break;
 
 			case O_EXT_HDR:
 				match = is_ipv6 &&
 				    (ext_hd & ((ipfw_insn *) cmd)->arg1);
 				break;
 
 			case O_IP6:
 				match = is_ipv6;
 				break;
 #endif
 
 			case O_IP4:
 				match = is_ipv4;
 				break;
 
 			case O_TAG: {
 				struct m_tag *mtag;
 				uint32_t tag = TARG(cmd->arg1, tag);
 
 				/* Packet is already tagged with this tag? */
 				mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL);
 
 				/* We have `untag' action when F_NOT flag is
 				 * present. And we must remove this mtag from
 				 * mbuf and reset `match' to zero (`match' will
 				 * be inversed later).
 				 * Otherwise we should allocate new mtag and
 				 * push it into mbuf.
 				 */
 				if (cmd->len & F_NOT) { /* `untag' action */
 					if (mtag != NULL)
 						m_tag_delete(m, mtag);
 					match = 0;
 				} else {
 					if (mtag == NULL) {
 						mtag = m_tag_alloc( MTAG_IPFW,
 						    tag, 0, M_NOWAIT);
 						if (mtag != NULL)
 							m_tag_prepend(m, mtag);
 					}
 					match = 1;
 				}
 				break;
 			}
 
 			case O_FIB: /* try match the specified fib */
 				if (args->f_id.fib == cmd->arg1)
 					match = 1;
 				break;
 
 			case O_SOCKARG:	{
 #ifndef USERSPACE	/* not supported in userspace */
 				struct inpcb *inp = args->inp;
 				struct inpcbinfo *pi;
 				
 				if (is_ipv6) /* XXX can we remove this ? */
 					break;
 
 				if (proto == IPPROTO_TCP)
 					pi = &V_tcbinfo;
 				else if (proto == IPPROTO_UDP)
 					pi = &V_udbinfo;
 				else if (proto == IPPROTO_UDPLITE)
 					pi = &V_ulitecbinfo;
 				else
 					break;
 
 				/*
 				 * XXXRW: so_user_cookie should almost
 				 * certainly be inp_user_cookie?
 				 */
 
 				/* For incoming packet, lookup up the 
 				inpcb using the src/dest ip/port tuple */
 				if (inp == NULL) {
 					inp = in_pcblookup(pi, 
 						src_ip, htons(src_port),
 						dst_ip, htons(dst_port),
 						INPLOOKUP_RLOCKPCB, NULL);
 					if (inp != NULL) {
 						tablearg =
 						    inp->inp_socket->so_user_cookie;
 						if (tablearg)
 							match = 1;
 						INP_RUNLOCK(inp);
 					}
 				} else {
 					if (inp->inp_socket) {
 						tablearg =
 						    inp->inp_socket->so_user_cookie;
 						if (tablearg)
 							match = 1;
 					}
 				}
 #endif /* !USERSPACE */
 				break;
 			}
 
 			case O_TAGGED: {
 				struct m_tag *mtag;
 				uint32_t tag = TARG(cmd->arg1, tag);
 
 				if (cmdlen == 1) {
 					match = m_tag_locate(m, MTAG_IPFW,
 					    tag, NULL) != NULL;
 					break;
 				}
 
 				/* we have ranges */
 				for (mtag = m_tag_first(m);
 				    mtag != NULL && !match;
 				    mtag = m_tag_next(m, mtag)) {
 					uint16_t *p;
 					int i;
 
 					if (mtag->m_tag_cookie != MTAG_IPFW)
 						continue;
 
 					p = ((ipfw_insn_u16 *)cmd)->ports;
 					i = cmdlen - 1;
 					for(; !match && i > 0; i--, p += 2)
 						match =
 						    mtag->m_tag_id >= p[0] &&
 						    mtag->m_tag_id <= p[1];
 				}
 				break;
 			}
 				
 			/*
 			 * The second set of opcodes represents 'actions',
 			 * i.e. the terminal part of a rule once the packet
 			 * matches all previous patterns.
 			 * Typically there is only one action for each rule,
 			 * and the opcode is stored at the end of the rule
 			 * (but there are exceptions -- see below).
 			 *
 			 * In general, here we set retval and terminate the
 			 * outer loop (would be a 'break 3' in some language,
 			 * but we need to set l=0, done=1)
 			 *
 			 * Exceptions:
 			 * O_COUNT and O_SKIPTO actions:
 			 *   instead of terminating, we jump to the next rule
 			 *   (setting l=0), or to the SKIPTO target (setting
 			 *   f/f_len, cmd and l as needed), respectively.
 			 *
 			 * O_TAG, O_LOG and O_ALTQ action parameters:
 			 *   perform some action and set match = 1;
 			 *
 			 * O_LIMIT and O_KEEP_STATE: these opcodes are
 			 *   not real 'actions', and are stored right
 			 *   before the 'action' part of the rule (one
 			 *   exception is O_SKIP_ACTION which could be
 			 *   between these opcodes and 'action' one).
 			 *   These opcodes try to install an entry in the
 			 *   state tables; if successful, we continue with
 			 *   the next opcode (match=1; break;), otherwise
 			 *   the packet must be dropped (set retval,
 			 *   break loops with l=0, done=1)
 			 *
 			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
 			 *   cause a lookup of the state table, and a jump
 			 *   to the 'action' part of the parent rule
 			 *   if an entry is found, or
 			 *   (CHECK_STATE only) a jump to the next rule if
 			 *   the entry is not found.
 			 *   The result of the lookup is cached so that
 			 *   further instances of these opcodes become NOPs.
 			 *   The jump to the next rule is done by setting
 			 *   l=0, cmdlen=0.
 			 *
 			 * O_SKIP_ACTION: this opcode is not a real 'action'
 			 *  either, and is stored right before the 'action'
 			 *  part of the rule, right after the O_KEEP_STATE
 			 *  opcode. It causes match failure so the real
 			 *  'action' could be executed only if the rule
 			 *  is checked via dynamic rule from the state
 			 *  table, as in such case execution starts
 			 *  from the true 'action' opcode directly.
 			 *   
 			 */
 			case O_LIMIT:
 			case O_KEEP_STATE:
 				if (ipfw_dyn_install_state(chain, f,
 				    (ipfw_insn_limit *)cmd, args, ulp,
 				    pktlen, &dyn_info, tablearg)) {
 					/* error or limit violation */
 					retval = IP_FW_DENY;
 					l = 0;	/* exit inner loop */
 					done = 1; /* exit outer loop */
 				}
 				match = 1;
 				break;
 
 			case O_PROBE_STATE:
 			case O_CHECK_STATE:
 				/*
 				 * dynamic rules are checked at the first
 				 * keep-state or check-state occurrence,
 				 * with the result being stored in dyn_info.
 				 * The compiler introduces a PROBE_STATE
 				 * instruction for us when we have a
 				 * KEEP_STATE (because PROBE_STATE needs
 				 * to be run first).
 				 */
 				if (DYN_LOOKUP_NEEDED(&dyn_info, cmd) &&
 				    (q = ipfw_dyn_lookup_state(args, ulp,
 				    pktlen, cmd, &dyn_info)) != NULL) {
 					/*
 					 * Found dynamic entry, jump to the
 					 * 'action' part of the parent rule
 					 * by setting f, cmd, l and clearing
 					 * cmdlen.
 					 */
 					f = q;
 					f_pos = dyn_info.f_pos;
 					cmd = ACTION_PTR(f);
 					l = f->cmd_len - f->act_ofs;
 					cmdlen = 0;
 					match = 1;
 					break;
 				}
 				/*
 				 * Dynamic entry not found. If CHECK_STATE,
 				 * skip to next rule, if PROBE_STATE just
 				 * ignore and continue with next opcode.
 				 */
 				if (cmd->opcode == O_CHECK_STATE)
 					l = 0;	/* exit inner loop */
 				match = 1;
 				break;
 
 			case O_SKIP_ACTION:
 				match = 0;	/* skip to the next rule */
 				l = 0;		/* exit inner loop */
 				break;
 
 			case O_ACCEPT:
 				retval = 0;	/* accept */
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				break;
 
 			case O_PIPE:
 			case O_QUEUE:
 				set_match(args, f_pos, chain);
 				args->rule.info = TARG(cmd->arg1, pipe);
 				if (cmd->opcode == O_PIPE)
 					args->rule.info |= IPFW_IS_PIPE;
 				if (V_fw_one_pass)
 					args->rule.info |= IPFW_ONEPASS;
 				retval = IP_FW_DUMMYNET;
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
 				break;
 
 			case O_DIVERT:
 			case O_TEE:
 				if (args->flags & IPFW_ARGS_ETHER)
 					break;	/* not on layer 2 */
 				/* otherwise this is terminal */
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				retval = (cmd->opcode == O_DIVERT) ?
 					IP_FW_DIVERT : IP_FW_TEE;
 				set_match(args, f_pos, chain);
 				args->rule.info = TARG(cmd->arg1, divert);
 				break;
 
 			case O_COUNT:
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				l = 0;		/* exit inner loop */
 				break;
 
 			case O_SKIPTO:
 			    IPFW_INC_RULE_COUNTER(f, pktlen);
 			    f_pos = JUMP(chain, f, cmd->arg1, tablearg, 0);
 			    /*
 			     * Skip disabled rules, and re-enter
 			     * the inner loop with the correct
 			     * f_pos, f, l and cmd.
 			     * Also clear cmdlen and skip_or
 			     */
 			    for (; f_pos < chain->n_rules - 1 &&
 				    (V_set_disable &
 				     (1 << chain->map[f_pos]->set));
 				    f_pos++)
 				;
 			    /* Re-enter the inner loop at the skipto rule. */
 			    f = chain->map[f_pos];
 			    l = f->cmd_len;
 			    cmd = f->cmd;
 			    match = 1;
 			    cmdlen = 0;
 			    skip_or = 0;
 			    continue;
 			    break;	/* not reached */
 
 			case O_CALLRETURN: {
 				/*
 				 * Implementation of `subroutine' call/return,
 				 * in the stack carried in an mbuf tag. This
 				 * is different from `skipto' in that any call
 				 * address is possible (`skipto' must prevent
 				 * backward jumps to avoid endless loops).
 				 * We have `return' action when F_NOT flag is
 				 * present. The `m_tag_id' field is used as
 				 * stack pointer.
 				 */
 				struct m_tag *mtag;
 				uint16_t jmpto, *stack;
 
 #define	IS_CALL		((cmd->len & F_NOT) == 0)
 #define	IS_RETURN	((cmd->len & F_NOT) != 0)
 				/*
 				 * Hand-rolled version of m_tag_locate() with
 				 * wildcard `type'.
 				 * If not already tagged, allocate new tag.
 				 */
 				mtag = m_tag_first(m);
 				while (mtag != NULL) {
 					if (mtag->m_tag_cookie ==
 					    MTAG_IPFW_CALL)
 						break;
 					mtag = m_tag_next(m, mtag);
 				}
 				if (mtag == NULL && IS_CALL) {
 					mtag = m_tag_alloc(MTAG_IPFW_CALL, 0,
 					    IPFW_CALLSTACK_SIZE *
 					    sizeof(uint16_t), M_NOWAIT);
 					if (mtag != NULL)
 						m_tag_prepend(m, mtag);
 				}
 
 				/*
 				 * On error both `call' and `return' just
 				 * continue with next rule.
 				 */
 				if (IS_RETURN && (mtag == NULL ||
 				    mtag->m_tag_id == 0)) {
 					l = 0;		/* exit inner loop */
 					break;
 				}
 				if (IS_CALL && (mtag == NULL ||
 				    mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) {
 					printf("ipfw: call stack error, "
 					    "go to next rule\n");
 					l = 0;		/* exit inner loop */
 					break;
 				}
 
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				stack = (uint16_t *)(mtag + 1);
 
 				/*
 				 * The `call' action may use cached f_pos
 				 * (in f->next_rule), whose version is written
 				 * in f->next_rule.
 				 * The `return' action, however, doesn't have
 				 * fixed jump address in cmd->arg1 and can't use
 				 * cache.
 				 */
 				if (IS_CALL) {
 					stack[mtag->m_tag_id] = f->rulenum;
 					mtag->m_tag_id++;
 			    		f_pos = JUMP(chain, f, cmd->arg1,
 					    tablearg, 1);
 				} else {	/* `return' action */
 					mtag->m_tag_id--;
 					jmpto = stack[mtag->m_tag_id] + 1;
 					f_pos = ipfw_find_rule(chain, jmpto, 0);
 				}
 
 				/*
 				 * Skip disabled rules, and re-enter
 				 * the inner loop with the correct
 				 * f_pos, f, l and cmd.
 				 * Also clear cmdlen and skip_or
 				 */
 				for (; f_pos < chain->n_rules - 1 &&
 				    (V_set_disable &
 				    (1 << chain->map[f_pos]->set)); f_pos++)
 					;
 				/* Re-enter the inner loop at the dest rule. */
 				f = chain->map[f_pos];
 				l = f->cmd_len;
 				cmd = f->cmd;
 				cmdlen = 0;
 				skip_or = 0;
 				continue;
 				break;	/* NOTREACHED */
 			}
 #undef IS_CALL
 #undef IS_RETURN
 
 			case O_REJECT:
 				/*
 				 * Drop the packet and send a reject notice
 				 * if the packet is not ICMP (or is an ICMP
 				 * query), and it is not multicast/broadcast.
 				 */
 				if (hlen > 0 && is_ipv4 && offset == 0 &&
 				    (proto != IPPROTO_ICMP ||
 				     is_icmp_query(ICMP(ulp))) &&
 				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
 				    !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
 					send_reject(args, cmd->arg1, iplen, ip);
 					m = args->m;
 				}
 				/* FALLTHROUGH */
 #ifdef INET6
 			case O_UNREACH6:
 				if (hlen > 0 && is_ipv6 &&
 				    ((offset & IP6F_OFF_MASK) == 0) &&
 				    (proto != IPPROTO_ICMPV6 ||
 				     (is_icmp6_query(icmp6_type) == 1)) &&
 				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
 				    !IN6_IS_ADDR_MULTICAST(
 					&args->f_id.dst_ip6)) {
 					send_reject6(args,
 					    cmd->opcode == O_REJECT ?
 					    map_icmp_unreach(cmd->arg1):
 					    cmd->arg1, hlen,
 					    (struct ip6_hdr *)ip);
 					m = args->m;
 				}
 				/* FALLTHROUGH */
 #endif
 			case O_DENY:
 				retval = IP_FW_DENY;
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				break;
 
 			case O_FORWARD_IP:
 				if (args->flags & IPFW_ARGS_ETHER)
 					break;	/* not valid on layer2 pkts */
 				if (q != f ||
 				    dyn_info.direction == MATCH_FORWARD) {
 				    struct sockaddr_in *sa;
 
 				    sa = &(((ipfw_insn_sa *)cmd)->sa);
 				    if (sa->sin_addr.s_addr == INADDR_ANY) {
 #ifdef INET6
 					/*
 					 * We use O_FORWARD_IP opcode for
 					 * fwd rule with tablearg, but tables
 					 * now support IPv6 addresses. And
 					 * when we are inspecting IPv6 packet,
 					 * we can use nh6 field from
 					 * table_value as next_hop6 address.
 					 */
 					if (is_ipv6) {
 						struct ip_fw_nh6 *nh6;
 
 						args->flags |= IPFW_ARGS_NH6;
 						nh6 = &args->hopstore6;
 						nh6->sin6_addr = TARG_VAL(
 						    chain, tablearg, nh6);
 						nh6->sin6_port = sa->sin_port;
 						nh6->sin6_scope_id = TARG_VAL(
 						    chain, tablearg, zoneid);
 					} else
 #endif
 					{
 						args->flags |= IPFW_ARGS_NH4;
 						args->hopstore.sin_port =
 						    sa->sin_port;
 						sa = &args->hopstore;
 						sa->sin_family = AF_INET;
 						sa->sin_len = sizeof(*sa);
 						sa->sin_addr.s_addr = htonl(
 						    TARG_VAL(chain, tablearg,
 						    nh4));
 					}
 				    } else {
 					    args->flags |= IPFW_ARGS_NH4PTR;
 					    args->next_hop = sa;
 				    }
 				}
 				retval = IP_FW_PASS;
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
 				break;
 
 #ifdef INET6
 			case O_FORWARD_IP6:
 				if (args->flags & IPFW_ARGS_ETHER)
 					break;	/* not valid on layer2 pkts */
 				if (q != f ||
 				    dyn_info.direction == MATCH_FORWARD) {
 					struct sockaddr_in6 *sin6;
 
 					sin6 = &(((ipfw_insn_sa6 *)cmd)->sa);
 					args->flags |= IPFW_ARGS_NH6PTR;
 					args->next_hop6 = sin6;
 				}
 				retval = IP_FW_PASS;
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				break;
 #endif
 
 			case O_NETGRAPH:
 			case O_NGTEE:
 				set_match(args, f_pos, chain);
 				args->rule.info = TARG(cmd->arg1, netgraph);
 				if (V_fw_one_pass)
 					args->rule.info |= IPFW_ONEPASS;
 				retval = (cmd->opcode == O_NETGRAPH) ?
 				    IP_FW_NETGRAPH : IP_FW_NGTEE;
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
 				break;
 
 			case O_SETFIB: {
 				uint32_t fib;
 
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				fib = TARG(cmd->arg1, fib) & 0x7FFF;
 				if (fib >= rt_numfibs)
 					fib = 0;
 				M_SETFIB(m, fib);
 				args->f_id.fib = fib; /* XXX */
 				l = 0;		/* exit inner loop */
 				break;
 		        }
 
 			case O_SETDSCP: {
 				uint16_t code;
 
 				code = TARG(cmd->arg1, dscp) & 0x3F;
 				l = 0;		/* exit inner loop */
 				if (is_ipv4) {
 					uint16_t old;
 
 					old = *(uint16_t *)ip;
 					ip->ip_tos = (code << 2) |
 					    (ip->ip_tos & 0x03);
 					ip->ip_sum = cksum_adjust(ip->ip_sum,
 					    old, *(uint16_t *)ip);
 				} else if (is_ipv6) {
 					uint8_t *v;
 
 					v = &((struct ip6_hdr *)ip)->ip6_vfc;
 					*v = (*v & 0xF0) | (code >> 2);
 					v++;
 					*v = (*v & 0x3F) | ((code & 0x03) << 6);
 				} else
 					break;
 
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				break;
 			}
 
 			case O_NAT:
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
 				/*
 				 * Ensure that we do not invoke NAT handler for
 				 * non IPv4 packets. Libalias expects only IPv4.
 				 */
 				if (!is_ipv4 || !IPFW_NAT_LOADED) {
 				    retval = IP_FW_DENY;
 				    break;
 				}
 
 				struct cfg_nat *t;
 				int nat_id;
 
 				args->rule.info = 0;
 				set_match(args, f_pos, chain);
 				/* Check if this is 'global' nat rule */
 				if (cmd->arg1 == IP_FW_NAT44_GLOBAL) {
 					retval = ipfw_nat_ptr(args, NULL, m);
 					break;
 				}
 				t = ((ipfw_insn_nat *)cmd)->nat;
 				if (t == NULL) {
 					nat_id = TARG(cmd->arg1, nat);
 					t = (*lookup_nat_ptr)(&chain->nat, nat_id);
 
 					if (t == NULL) {
 					    retval = IP_FW_DENY;
 					    break;
 					}
 					if (cmd->arg1 != IP_FW_TARG)
 					    ((ipfw_insn_nat *)cmd)->nat = t;
 				}
 				retval = ipfw_nat_ptr(args, t, m);
 				break;
 
 			case O_REASS: {
 				int ip_off;
 
 				l = 0;	/* in any case exit inner loop */
 				if (is_ipv6) /* IPv6 is not supported yet */
 					break;
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				ip_off = ntohs(ip->ip_off);
 
 				/* if not fragmented, go to next rule */
 				if ((ip_off & (IP_MF | IP_OFFMASK)) == 0)
 				    break;
 
 				args->m = m = ip_reass(m);
 
 				/*
 				 * do IP header checksum fixup.
 				 */
 				if (m == NULL) { /* fragment got swallowed */
 				    retval = IP_FW_DENY;
 				} else { /* good, packet complete */
 				    int hlen;
 
 				    ip = mtod(m, struct ip *);
 				    hlen = ip->ip_hl << 2;
 				    ip->ip_sum = 0;
 				    if (hlen == sizeof(struct ip))
 					ip->ip_sum = in_cksum_hdr(ip);
 				    else
 					ip->ip_sum = in_cksum(m, hlen);
 				    retval = IP_FW_REASS;
 				    args->rule.info = 0;
 				    set_match(args, f_pos, chain);
 				}
 				done = 1;	/* exit outer loop */
 				break;
 			}
 			case O_EXTERNAL_ACTION:
 				l = 0; /* in any case exit inner loop */
 				retval = ipfw_run_eaction(chain, args,
 				    cmd, &done);
 				/*
 				 * If both @retval and @done are zero,
 				 * consider this as rule matching and
 				 * update counters.
 				 */
 				if (retval == 0 && done == 0) {
 					IPFW_INC_RULE_COUNTER(f, pktlen);
 					/*
 					 * Reset the result of the last
 					 * dynamic state lookup.
 					 * External action can change
 					 * @args content, and it may be
 					 * used for new state lookup later.
 					 */
 					DYN_INFO_INIT(&dyn_info);
 				}
 				break;
 
 			default:
 				panic("-- unknown opcode %d\n", cmd->opcode);
 			} /* end of switch() on opcodes */
 			/*
 			 * if we get here with l=0, then match is irrelevant.
 			 */
 
 			if (cmd->len & F_NOT)
 				match = !match;
 
 			if (match) {
 				if (cmd->len & F_OR)
 					skip_or = 1;
 			} else {
 				if (!(cmd->len & F_OR)) /* not an OR block, */
 					break;		/* try next rule    */
 			}
 
 		}	/* end of inner loop, scan opcodes */
 #undef PULLUP_LEN
 
 		if (done)
 			break;
 
 /* next_rule:; */	/* try next rule		*/
 
 	}		/* end of outer for, scan rules */
 
 	if (done) {
 		struct ip_fw *rule = chain->map[f_pos];
 		/* Update statistics */
 		IPFW_INC_RULE_COUNTER(rule, pktlen);
 	} else {
 		retval = IP_FW_DENY;
 		printf("ipfw: ouch!, skip past end of rules, denying packet\n");
 	}
 	IPFW_PF_RUNLOCK(chain);
 #ifdef __FreeBSD__
 	if (ucred_cache != NULL)
 		crfree(ucred_cache);
 #endif
 	return (retval);
 
 pullup_failed:
 	if (V_fw_verbose)
 		printf("ipfw: pullup failed\n");
 	return (IP_FW_DENY);
 }
 
 /*
  * Set maximum number of tables that can be used in given VNET ipfw instance.
  */
 #ifdef SYSCTL_NODE
 static int
 sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	unsigned int ntables;
 
 	ntables = V_fw_tables_max;
 
 	error = sysctl_handle_int(oidp, &ntables, 0, req);
 	/* Read operation or some error */
 	if ((error != 0) || (req->newptr == NULL))
 		return (error);
 
 	return (ipfw_resize_tables(&V_layer3_chain, ntables));
 }
 
 /*
  * Switches table namespace between global and per-set.
  */
 static int
 sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	unsigned int sets;
 
 	sets = V_fw_tables_sets;
 
 	error = sysctl_handle_int(oidp, &sets, 0, req);
 	/* Read operation or some error */
 	if ((error != 0) || (req->newptr == NULL))
 		return (error);
 
 	return (ipfw_switch_tables_namespace(&V_layer3_chain, sets));
 }
 #endif
 
 /*
  * Module and VNET glue
  */
 
 /*
  * Stuff that must be initialised only on boot or module load
  */
 static int
 ipfw_init(void)
 {
 	int error = 0;
 
 	/*
  	 * Only print out this stuff the first time around,
 	 * when called from the sysinit code.
 	 */
 	printf("ipfw2 "
 #ifdef INET6
 		"(+ipv6) "
 #endif
 		"initialized, divert %s, nat %s, "
 		"default to %s, logging ",
 #ifdef IPDIVERT
 		"enabled",
 #else
 		"loadable",
 #endif
 #ifdef IPFIREWALL_NAT
 		"enabled",
 #else
 		"loadable",
 #endif
 		default_to_accept ? "accept" : "deny");
 
 	/*
 	 * Note: V_xxx variables can be accessed here but the vnet specific
 	 * initializer may not have been called yet for the VIMAGE case.
 	 * Tuneables will have been processed. We will print out values for
 	 * the default vnet. 
 	 * XXX This should all be rationalized AFTER 8.0
 	 */
 	if (V_fw_verbose == 0)
 		printf("disabled\n");
 	else if (V_verbose_limit == 0)
 		printf("unlimited\n");
 	else
 		printf("limited to %d packets/entry by default\n",
 		    V_verbose_limit);
 
 	/* Check user-supplied table count for validness */
 	if (default_fw_tables > IPFW_TABLES_MAX)
 	  default_fw_tables = IPFW_TABLES_MAX;
 
 	ipfw_init_sopt_handler();
 	ipfw_init_obj_rewriter();
 	ipfw_iface_init();
 	return (error);
 }
 
 /*
  * Called for the removal of the last instance only on module unload.
  */
 static void
 ipfw_destroy(void)
 {
 
 	ipfw_iface_destroy();
 	ipfw_destroy_sopt_handler();
 	ipfw_destroy_obj_rewriter();
 	printf("IP firewall unloaded\n");
 }
 
 /*
  * Stuff that must be initialized for every instance
  * (including the first of course).
  */
 static int
 vnet_ipfw_init(const void *unused)
 {
 	int error, first;
 	struct ip_fw *rule = NULL;
 	struct ip_fw_chain *chain;
 
 	chain = &V_layer3_chain;
 
 	first = IS_DEFAULT_VNET(curvnet) ? 1 : 0;
 
 	/* First set up some values that are compile time options */
 	V_autoinc_step = 100;	/* bounded to 1..1000 in add_rule() */
 	V_fw_deny_unknown_exthdrs = 1;
 #ifdef IPFIREWALL_VERBOSE
 	V_fw_verbose = 1;
 #endif
 #ifdef IPFIREWALL_VERBOSE_LIMIT
 	V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
 #endif
 #ifdef IPFIREWALL_NAT
 	LIST_INIT(&chain->nat);
 #endif
 
 	/* Init shared services hash table */
 	ipfw_init_srv(chain);
 
 	ipfw_init_counters();
 	/* Set initial number of tables */
 	V_fw_tables_max = default_fw_tables;
 	error = ipfw_init_tables(chain, first);
 	if (error) {
 		printf("ipfw2: setting up tables failed\n");
 		free(chain->map, M_IPFW);
 		free(rule, M_IPFW);
 		return (ENOSPC);
 	}
 
 	IPFW_LOCK_INIT(chain);
 
 	/* fill and insert the default rule */
 	rule = ipfw_alloc_rule(chain, sizeof(struct ip_fw));
 	rule->cmd_len = 1;
 	rule->cmd[0].len = 1;
 	rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
 	chain->default_rule = rule;
 	ipfw_add_protected_rule(chain, rule, 0);
 
 	ipfw_dyn_init(chain);
 	ipfw_eaction_init(chain, first);
 #ifdef LINEAR_SKIPTO
 	ipfw_init_skipto_cache(chain);
 #endif
 	ipfw_bpf_init(first);
 
 	/* First set up some values that are compile time options */
 	V_ipfw_vnet_ready = 1;		/* Open for business */
 
 	/*
 	 * Hook the sockopt handler and pfil hooks for ipv4 and ipv6.
 	 * Even if the latter two fail we still keep the module alive
 	 * because the sockopt and layer2 paths are still useful.
 	 * ipfw[6]_hook return 0 on success, ENOENT on failure,
 	 * so we can ignore the exact return value and just set a flag.
 	 *
 	 * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so
 	 * changes in the underlying (per-vnet) variables trigger
 	 * immediate hook()/unhook() calls.
 	 * In layer2 we have the same behaviour, except that V_ether_ipfw
 	 * is checked on each packet because there are no pfil hooks.
 	 */
 	V_ip_fw_ctl_ptr = ipfw_ctl3;
 	error = ipfw_attach_hooks();
 	return (error);
 }
 
 /*
  * Called for the removal of each instance.
  */
 static int
 vnet_ipfw_uninit(const void *unused)
 {
 	struct ip_fw *reap;
 	struct ip_fw_chain *chain = &V_layer3_chain;
 	int i, last;
 
 	V_ipfw_vnet_ready = 0; /* tell new callers to go away */
 	/*
 	 * disconnect from ipv4, ipv6, layer2 and sockopt.
 	 * Then grab, release and grab again the WLOCK so we make
 	 * sure the update is propagated and nobody will be in.
 	 */
 	ipfw_detach_hooks();
 	V_ip_fw_ctl_ptr = NULL;
 
 	last = IS_DEFAULT_VNET(curvnet) ? 1 : 0;
 
 	IPFW_UH_WLOCK(chain);
 	IPFW_UH_WUNLOCK(chain);
 
 	ipfw_dyn_uninit(0);	/* run the callout_drain */
 
 	IPFW_UH_WLOCK(chain);
 
 	reap = NULL;
 	IPFW_WLOCK(chain);
 	for (i = 0; i < chain->n_rules; i++)
 		ipfw_reap_add(chain, &reap, chain->map[i]);
 	free(chain->map, M_IPFW);
 #ifdef LINEAR_SKIPTO
 	ipfw_destroy_skipto_cache(chain);
 #endif
 	IPFW_WUNLOCK(chain);
 	IPFW_UH_WUNLOCK(chain);
 	ipfw_destroy_tables(chain, last);
 	ipfw_eaction_uninit(chain, last);
 	if (reap != NULL)
 		ipfw_reap_rules(reap);
 	vnet_ipfw_iface_destroy(chain);
 	ipfw_destroy_srv(chain);
 	IPFW_LOCK_DESTROY(chain);
 	ipfw_dyn_uninit(1);	/* free the remaining parts */
 	ipfw_destroy_counters();
 	ipfw_bpf_uninit(last);
 	return (0);
 }
 
 /*
  * Module event handler.
  * In general we have the choice of handling most of these events by the
  * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to
  * use the SYSINIT handlers as they are more capable of expressing the
  * flow of control during module and vnet operations, so this is just
  * a skeleton. Note there is no SYSINIT equivalent of the module
  * SHUTDOWN handler, but we don't have anything to do in that case anyhow.
  */
 static int
 ipfw_modevent(module_t mod, int type, void *unused)
 {
 	int err = 0;
 
 	switch (type) {
 	case MOD_LOAD:
 		/* Called once at module load or
 	 	 * system boot if compiled in. */
 		break;
 	case MOD_QUIESCE:
 		/* Called before unload. May veto unloading. */
 		break;
 	case MOD_UNLOAD:
 		/* Called during unload. */
 		break;
 	case MOD_SHUTDOWN:
 		/* Called during system shutdown. */
 		break;
 	default:
 		err = EOPNOTSUPP;
 		break;
 	}
 	return err;
 }
 
 static moduledata_t ipfwmod = {
 	"ipfw",
 	ipfw_modevent,
 	0
 };
 
 /* Define startup order. */
 #define	IPFW_SI_SUB_FIREWALL	SI_SUB_PROTO_FIREWALL
 #define	IPFW_MODEVENT_ORDER	(SI_ORDER_ANY - 255) /* On boot slot in here. */
 #define	IPFW_MODULE_ORDER	(IPFW_MODEVENT_ORDER + 1) /* A little later. */
 #define	IPFW_VNET_ORDER		(IPFW_MODEVENT_ORDER + 2) /* Later still. */
 
 DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER);
 FEATURE(ipfw_ctl3, "ipfw new sockopt calls");
 MODULE_VERSION(ipfw, 3);
 /* should declare some dependencies here */
 
 /*
  * Starting up. Done in order after ipfwmod() has been called.
  * VNET_SYSINIT is also called for each existing vnet and each new vnet.
  */
 SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
 	    ipfw_init, NULL);
 VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
 	    vnet_ipfw_init, NULL);
  
 /*
  * Closing up shop. These are done in REVERSE ORDER, but still
  * after ipfwmod() has been called. Not called on reboot.
  * VNET_SYSUNINIT is also called for each exiting vnet as it exits.
  * or when the module is unloaded.
  */
 SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
 	    ipfw_destroy, NULL);
 VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
 	    vnet_ipfw_uninit, NULL);
 /* end of file */
Index: head/sys/sys/mbuf.h
===================================================================
--- head/sys/sys/mbuf.h	(revision 348253)
+++ head/sys/sys/mbuf.h	(revision 348254)
@@ -1,1364 +1,1399 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)mbuf.h	8.5 (Berkeley) 2/19/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_MBUF_H_
 #define	_SYS_MBUF_H_
 
 /* XXX: These includes suck. Sorry! */
 #include <sys/queue.h>
 #ifdef _KERNEL
 #include <sys/systm.h>
+#include <sys/refcount.h>
 #include <vm/uma.h>
 #ifdef WITNESS
 #include <sys/lock.h>
 #endif
 #endif
 
 #ifdef _KERNEL
 #include <sys/sdt.h>
 
 #define	MBUF_PROBE1(probe, arg0)					\
 	SDT_PROBE1(sdt, , , probe, arg0)
 #define	MBUF_PROBE2(probe, arg0, arg1)					\
 	SDT_PROBE2(sdt, , , probe, arg0, arg1)
 #define	MBUF_PROBE3(probe, arg0, arg1, arg2)				\
 	SDT_PROBE3(sdt, , , probe, arg0, arg1, arg2)
 #define	MBUF_PROBE4(probe, arg0, arg1, arg2, arg3)			\
 	SDT_PROBE4(sdt, , , probe, arg0, arg1, arg2, arg3)
 #define	MBUF_PROBE5(probe, arg0, arg1, arg2, arg3, arg4)		\
 	SDT_PROBE5(sdt, , , probe, arg0, arg1, arg2, arg3, arg4)
 
 SDT_PROBE_DECLARE(sdt, , , m__init);
 SDT_PROBE_DECLARE(sdt, , , m__gethdr);
 SDT_PROBE_DECLARE(sdt, , , m__get);
 SDT_PROBE_DECLARE(sdt, , , m__getcl);
 SDT_PROBE_DECLARE(sdt, , , m__clget);
 SDT_PROBE_DECLARE(sdt, , , m__cljget);
 SDT_PROBE_DECLARE(sdt, , , m__cljset);
 SDT_PROBE_DECLARE(sdt, , , m__free);
 SDT_PROBE_DECLARE(sdt, , , m__freem);
 
 #endif /* _KERNEL */
 
 /*
  * Mbufs are of a single size, MSIZE (sys/param.h), which includes overhead.
  * An mbuf may add a single "mbuf cluster" of size MCLBYTES (also in
  * sys/param.h), which has no additional overhead and is used instead of the
  * internal data area; this is done when at least MINCLSIZE of data must be
  * stored.  Additionally, it is possible to allocate a separate buffer
  * externally and attach it to the mbuf in a way similar to that of mbuf
  * clusters.
  *
  * NB: These calculation do not take actual compiler-induced alignment and
  * padding inside the complete struct mbuf into account.  Appropriate
  * attention is required when changing members of struct mbuf.
  *
  * MLEN is data length in a normal mbuf.
  * MHLEN is data length in an mbuf with pktheader.
  * MINCLSIZE is a smallest amount of data that should be put into cluster.
  *
  * Compile-time assertions in uipc_mbuf.c test these values to ensure that
  * they are sensible.
  */
 struct mbuf;
 #define	MHSIZE		offsetof(struct mbuf, m_dat)
 #define	MPKTHSIZE	offsetof(struct mbuf, m_pktdat)
 #define	MLEN		((int)(MSIZE - MHSIZE))
 #define	MHLEN		((int)(MSIZE - MPKTHSIZE))
 #define	MINCLSIZE	(MHLEN + 1)
 #define	M_NODOM		255
 
 #ifdef _KERNEL
 /*-
  * Macro for type conversion: convert mbuf pointer to data pointer of correct
  * type:
  *
  * mtod(m, t)	-- Convert mbuf pointer to data pointer of correct type.
  * mtodo(m, o) -- Same as above but with offset 'o' into data.
  */
 #define	mtod(m, t)	((t)((m)->m_data))
 #define	mtodo(m, o)	((void *)(((m)->m_data) + (o)))
 
 /*
  * Argument structure passed to UMA routines during mbuf and packet
  * allocations.
  */
 struct mb_args {
 	int	flags;	/* Flags for mbuf being allocated */
 	short	type;	/* Type of mbuf being allocated */
 };
 #endif /* _KERNEL */
 
 /*
  * Packet tag structure (see below for details).
  */
 struct m_tag {
 	SLIST_ENTRY(m_tag)	m_tag_link;	/* List of packet tags */
 	u_int16_t		m_tag_id;	/* Tag ID */
 	u_int16_t		m_tag_len;	/* Length of data */
 	u_int32_t		m_tag_cookie;	/* ABI/Module ID */
 	void			(*m_tag_free)(struct m_tag *);
 };
 
 /*
  * Static network interface owned tag.
  * Allocated through ifp->if_snd_tag_alloc().
  */
 struct m_snd_tag {
 	struct ifnet *ifp;		/* network interface tag belongs to */
+	volatile u_int refcount;
 };
 
 /*
  * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set.
  * Size ILP32: 48
  *	 LP64: 56
  * Compile-time assertions in uipc_mbuf.c test these values to ensure that
  * they are correct.
  */
 struct pkthdr {
 	union {
 		struct m_snd_tag *snd_tag;	/* send tag, if any */
 		struct ifnet	*rcvif;		/* rcv interface */
 	};
 	SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */
 	int32_t		 len;		/* total packet length */
 
 	/* Layer crossing persistent information. */
 	uint32_t	 flowid;	/* packet's 4-tuple system */
 	uint32_t	 csum_flags;	/* checksum and offload features */
 	uint16_t	 fibnum;	/* this packet should use this fib */
 	uint8_t		 numa_domain;	/* NUMA domain of recvd pkt */
 	uint8_t		 rsstype;	/* hash type */
 	union {
 		uint64_t	rcv_tstmp;	/* timestamp in ns */
 		struct {
 			uint8_t		 l2hlen;	/* layer 2 hdr len */
 			uint8_t		 l3hlen;	/* layer 3 hdr len */
 			uint8_t		 l4hlen;	/* layer 4 hdr len */
 			uint8_t		 l5hlen;	/* layer 5 hdr len */
 			uint32_t	 spare;
 		};
 	};
 	union {
 		uint8_t  eight[8];
 		uint16_t sixteen[4];
 		uint32_t thirtytwo[2];
 		uint64_t sixtyfour[1];
 		uintptr_t unintptr[1];
 		void	*ptr;
 	} PH_per;
 
 	/* Layer specific non-persistent local storage for reassembly, etc. */
 	union {
 		uint8_t  eight[8];
 		uint16_t sixteen[4];
 		uint32_t thirtytwo[2];
 		uint64_t sixtyfour[1];
 		uintptr_t unintptr[1];
 		void 	*ptr;
 	} PH_loc;
 };
 #define	ether_vtag	PH_per.sixteen[0]
 #define	PH_vt		PH_per
 #define	vt_nrecs	sixteen[0]
 #define	tso_segsz	PH_per.sixteen[1]
 #define	lro_nsegs	tso_segsz
 #define	csum_phsum	PH_per.sixteen[2]
 #define	csum_data	PH_per.thirtytwo[1]
 #define pace_thoff	PH_loc.sixteen[0]
 #define pace_tlen	PH_loc.sixteen[1]
 #define pace_drphdrlen	PH_loc.sixteen[2]
 #define pace_tos	PH_loc.eight[6]
 #define pace_lock	PH_loc.eight[7]
 
 /*
  * Description of external storage mapped into mbuf; valid only if M_EXT is
  * set.
  * Size ILP32: 28
  *	 LP64: 48
  * Compile-time assertions in uipc_mbuf.c test these values to ensure that
  * they are correct.
  */
 typedef	void m_ext_free_t(struct mbuf *);
 struct m_ext {
 	union {
 		/*
 		 * If EXT_FLAG_EMBREF is set, then we use refcount in the
 		 * mbuf, the 'ext_count' member.  Otherwise, we have a
 		 * shadow copy and we use pointer 'ext_cnt'.  The original
 		 * mbuf is responsible to carry the pointer to free routine
 		 * and its arguments.  They aren't copied into shadows in
 		 * mb_dupcl() to avoid dereferencing next cachelines.
 		 */
 		volatile u_int	 ext_count;
 		volatile u_int	*ext_cnt;
 	};
 	char		*ext_buf;	/* start of buffer */
 	uint32_t	 ext_size;	/* size of buffer, for ext_free */
 	uint32_t	 ext_type:8,	/* type of external storage */
 			 ext_flags:24;	/* external storage mbuf flags */
 	/*
 	 * Fields below store the free context for the external storage.
 	 * They are valid only in the refcount carrying mbuf, the one with
 	 * EXT_FLAG_EMBREF flag, with exclusion for EXT_EXTREF type, where
 	 * the free context is copied into all mbufs that use same external
 	 * storage.
 	 */
 #define	m_ext_copylen	offsetof(struct m_ext, ext_free)
 	m_ext_free_t	*ext_free;	/* free routine if not the usual */
 	void		*ext_arg1;	/* optional argument pointer */
 	void		*ext_arg2;	/* optional argument pointer */
 };
 
 /*
  * The core of the mbuf object along with some shortcut defines for practical
  * purposes.
  */
 struct mbuf {
 	/*
 	 * Header present at the beginning of every mbuf.
 	 * Size ILP32: 24
 	 *      LP64: 32
 	 * Compile-time assertions in uipc_mbuf.c test these values to ensure
 	 * that they are correct.
 	 */
 	union {	/* next buffer in chain */
 		struct mbuf		*m_next;
 		SLIST_ENTRY(mbuf)	m_slist;
 		STAILQ_ENTRY(mbuf)	m_stailq;
 	};
 	union {	/* next chain in queue/record */
 		struct mbuf		*m_nextpkt;
 		SLIST_ENTRY(mbuf)	m_slistpkt;
 		STAILQ_ENTRY(mbuf)	m_stailqpkt;
 	};
 	caddr_t		 m_data;	/* location of data */
 	int32_t		 m_len;		/* amount of data in this mbuf */
 	uint32_t	 m_type:8,	/* type of data in this mbuf */
 			 m_flags:24;	/* flags; see below */
 #if !defined(__LP64__)
 	uint32_t	 m_pad;		/* pad for 64bit alignment */
 #endif
 
 	/*
 	 * A set of optional headers (packet header, external storage header)
 	 * and internal data storage.  Historically, these arrays were sized
 	 * to MHLEN (space left after a packet header) and MLEN (space left
 	 * after only a regular mbuf header); they are now variable size in
 	 * order to support future work on variable-size mbufs.
 	 */
 	union {
 		struct {
 			struct pkthdr	m_pkthdr;	/* M_PKTHDR set */
 			union {
 				struct m_ext	m_ext;	/* M_EXT set */
 				char		m_pktdat[0];
 			};
 		};
 		char	m_dat[0];			/* !M_PKTHDR, !M_EXT */
 	};
 };
 
 /*
  * mbuf flags of global significance and layer crossing.
  * Those of only protocol/layer specific significance are to be mapped
  * to M_PROTO[1-12] and cleared at layer handoff boundaries.
  * NB: Limited to the lower 24 bits.
  */
 #define	M_EXT		0x00000001 /* has associated external storage */
 #define	M_PKTHDR	0x00000002 /* start of record */
 #define	M_EOR		0x00000004 /* end of record */
 #define	M_RDONLY	0x00000008 /* associated data is marked read-only */
 #define	M_BCAST		0x00000010 /* send/received as link-level broadcast */
 #define	M_MCAST		0x00000020 /* send/received as link-level multicast */
 #define	M_PROMISC	0x00000040 /* packet was not for us */
 #define	M_VLANTAG	0x00000080 /* ether_vtag is valid */
 #define	M_NOMAP		0x00000100 /* mbuf data is unmapped (soon from Drew) */
 #define	M_NOFREE	0x00000200 /* do not free mbuf, embedded in cluster */
 #define	M_TSTMP		0x00000400 /* rcv_tstmp field is valid */
 #define	M_TSTMP_HPREC	0x00000800 /* rcv_tstmp is high-prec, typically
 				      hw-stamped on port (useful for IEEE 1588
 				      and 802.1AS) */
 
 #define	M_PROTO1	0x00001000 /* protocol-specific */
 #define	M_PROTO2	0x00002000 /* protocol-specific */
 #define	M_PROTO3	0x00004000 /* protocol-specific */
 #define	M_PROTO4	0x00008000 /* protocol-specific */
 #define	M_PROTO5	0x00010000 /* protocol-specific */
 #define	M_PROTO6	0x00020000 /* protocol-specific */
 #define	M_PROTO7	0x00040000 /* protocol-specific */
 #define	M_PROTO8	0x00080000 /* protocol-specific */
 #define	M_PROTO9	0x00100000 /* protocol-specific */
 #define	M_PROTO10	0x00200000 /* protocol-specific */
 #define	M_PROTO11	0x00400000 /* protocol-specific */
 #define	M_PROTO12	0x00800000 /* protocol-specific */
 
 #define MB_DTOR_SKIP	0x1	/* don't pollute the cache by touching a freed mbuf */
 
 /*
  * Flags to purge when crossing layers.
  */
 #define	M_PROTOFLAGS \
     (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\
      M_PROTO9|M_PROTO10|M_PROTO11|M_PROTO12)
 
 /*
  * Flags preserved when copying m_pkthdr.
  */
 #define M_COPYFLAGS \
     (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG|M_TSTMP| \
      M_TSTMP_HPREC|M_PROTOFLAGS)
 
 /*
  * Mbuf flag description for use with printf(9) %b identifier.
  */
 #define	M_FLAG_BITS \
     "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \
     "\7M_PROMISC\10M_VLANTAG\13M_TSTMP\14M_TSTMP_HPREC"
 #define	M_FLAG_PROTOBITS \
     "\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \
     "\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \
     "\27M_PROTO11\30M_PROTO12"
 #define	M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS)
 
 /*
  * Network interface cards are able to hash protocol fields (such as IPv4
  * addresses and TCP port numbers) classify packets into flows.  These flows
  * can then be used to maintain ordering while delivering packets to the OS
  * via parallel input queues, as well as to provide a stateless affinity
  * model.  NIC drivers can pass up the hash via m->m_pkthdr.flowid, and set
  * m_flag fields to indicate how the hash should be interpreted by the
  * network stack.
  *
  * Most NICs support RSS, which provides ordering and explicit affinity, and
  * use the hash m_flag bits to indicate what header fields were covered by
  * the hash.  M_HASHTYPE_OPAQUE and M_HASHTYPE_OPAQUE_HASH can be set by non-
  * RSS cards or configurations that provide an opaque flow identifier, allowing
  * for ordering and distribution without explicit affinity.  Additionally,
  * M_HASHTYPE_OPAQUE_HASH indicates that the flow identifier has hash
  * properties.
  *
  * The meaning of the IPV6_EX suffix:
  * "o  Home address from the home address option in the IPv6 destination
  *     options header.  If the extension header is not present, use the Source
  *     IPv6 Address.
  *  o  IPv6 address that is contained in the Routing-Header-Type-2 from the
  *     associated extension header.  If the extension header is not present,
  *     use the Destination IPv6 Address."
  * Quoted from:
  * https://docs.microsoft.com/en-us/windows-hardware/drivers/network/rss-hashing-types#ndishashipv6ex
  */
 #define	M_HASHTYPE_HASHPROP		0x80	/* has hash properties */
 #define	M_HASHTYPE_HASH(t)		(M_HASHTYPE_HASHPROP | (t))
 /* Microsoft RSS standard hash types */
 #define	M_HASHTYPE_NONE			0
 #define	M_HASHTYPE_RSS_IPV4		M_HASHTYPE_HASH(1) /* IPv4 2-tuple */
 #define	M_HASHTYPE_RSS_TCP_IPV4		M_HASHTYPE_HASH(2) /* TCPv4 4-tuple */
 #define	M_HASHTYPE_RSS_IPV6		M_HASHTYPE_HASH(3) /* IPv6 2-tuple */
 #define	M_HASHTYPE_RSS_TCP_IPV6		M_HASHTYPE_HASH(4) /* TCPv6 4-tuple */
 #define	M_HASHTYPE_RSS_IPV6_EX		M_HASHTYPE_HASH(5) /* IPv6 2-tuple +
 							    * ext hdrs */
 #define	M_HASHTYPE_RSS_TCP_IPV6_EX	M_HASHTYPE_HASH(6) /* TCPv6 4-tuple +
 							    * ext hdrs */
 #define	M_HASHTYPE_RSS_UDP_IPV4		M_HASHTYPE_HASH(7) /* IPv4 UDP 4-tuple*/
 #define	M_HASHTYPE_RSS_UDP_IPV6		M_HASHTYPE_HASH(9) /* IPv6 UDP 4-tuple*/
 #define	M_HASHTYPE_RSS_UDP_IPV6_EX	M_HASHTYPE_HASH(10)/* IPv6 UDP 4-tuple +
 							    * ext hdrs */
 
 #define	M_HASHTYPE_OPAQUE		63	/* ordering, not affinity */
 #define	M_HASHTYPE_OPAQUE_HASH		M_HASHTYPE_HASH(M_HASHTYPE_OPAQUE)
 						/* ordering+hash, not affinity*/
 
 #define	M_HASHTYPE_CLEAR(m)	((m)->m_pkthdr.rsstype = 0)
 #define	M_HASHTYPE_GET(m)	((m)->m_pkthdr.rsstype)
 #define	M_HASHTYPE_SET(m, v)	((m)->m_pkthdr.rsstype = (v))
 #define	M_HASHTYPE_TEST(m, v)	(M_HASHTYPE_GET(m) == (v))
 #define	M_HASHTYPE_ISHASH(m)	(M_HASHTYPE_GET(m) & M_HASHTYPE_HASHPROP)
 
 /*
  * External mbuf storage buffer types.
  */
 #define	EXT_CLUSTER	1	/* mbuf cluster */
 #define	EXT_SFBUF	2	/* sendfile(2)'s sf_buf */
 #define	EXT_JUMBOP	3	/* jumbo cluster page sized */
 #define	EXT_JUMBO9	4	/* jumbo cluster 9216 bytes */
 #define	EXT_JUMBO16	5	/* jumbo cluster 16184 bytes */
 #define	EXT_PACKET	6	/* mbuf+cluster from packet zone */
 #define	EXT_MBUF	7	/* external mbuf reference */
 #define	EXT_RXRING	8	/* data in NIC receive ring */
 
 #define	EXT_VENDOR1	224	/* for vendor-internal use */
 #define	EXT_VENDOR2	225	/* for vendor-internal use */
 #define	EXT_VENDOR3	226	/* for vendor-internal use */
 #define	EXT_VENDOR4	227	/* for vendor-internal use */
 
 #define	EXT_EXP1	244	/* for experimental use */
 #define	EXT_EXP2	245	/* for experimental use */
 #define	EXT_EXP3	246	/* for experimental use */
 #define	EXT_EXP4	247	/* for experimental use */
 
 #define	EXT_NET_DRV	252	/* custom ext_buf provided by net driver(s) */
 #define	EXT_MOD_TYPE	253	/* custom module's ext_buf type */
 #define	EXT_DISPOSABLE	254	/* can throw this buffer away w/page flipping */
 #define	EXT_EXTREF	255	/* has externally maintained ext_cnt ptr */
 
 /*
  * Flags for external mbuf buffer types.
  * NB: limited to the lower 24 bits.
  */
 #define	EXT_FLAG_EMBREF		0x000001	/* embedded ext_count */
 #define	EXT_FLAG_EXTREF		0x000002	/* external ext_cnt, notyet */
 
 #define	EXT_FLAG_NOFREE		0x000010	/* don't free mbuf to pool, notyet */
 
 #define	EXT_FLAG_VENDOR1	0x010000	/* These flags are vendor */
 #define	EXT_FLAG_VENDOR2	0x020000	/* or submodule specific, */
 #define	EXT_FLAG_VENDOR3	0x040000	/* not used by mbuf code. */
 #define	EXT_FLAG_VENDOR4	0x080000	/* Set/read by submodule. */
 
 #define	EXT_FLAG_EXP1		0x100000	/* for experimental use */
 #define	EXT_FLAG_EXP2		0x200000	/* for experimental use */
 #define	EXT_FLAG_EXP3		0x400000	/* for experimental use */
 #define	EXT_FLAG_EXP4		0x800000	/* for experimental use */
 
 /*
  * EXT flag description for use with printf(9) %b identifier.
  */
 #define	EXT_FLAG_BITS \
     "\20\1EXT_FLAG_EMBREF\2EXT_FLAG_EXTREF\5EXT_FLAG_NOFREE" \
     "\21EXT_FLAG_VENDOR1\22EXT_FLAG_VENDOR2\23EXT_FLAG_VENDOR3" \
     "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \
     "\30EXT_FLAG_EXP4"
 
 /*
  * Flags indicating checksum, segmentation and other offload work to be
  * done, or already done, by hardware or lower layers.  It is split into
  * separate inbound and outbound flags.
  *
  * Outbound flags that are set by upper protocol layers requesting lower
  * layers, or ideally the hardware, to perform these offloading tasks.
  * For outbound packets this field and its flags can be directly tested
  * against ifnet if_hwassist.
  */
 #define	CSUM_IP			0x00000001	/* IP header checksum offload */
 #define	CSUM_IP_UDP		0x00000002	/* UDP checksum offload */
 #define	CSUM_IP_TCP		0x00000004	/* TCP checksum offload */
 #define	CSUM_IP_SCTP		0x00000008	/* SCTP checksum offload */
 #define	CSUM_IP_TSO		0x00000010	/* TCP segmentation offload */
 #define	CSUM_IP_ISCSI		0x00000020	/* iSCSI checksum offload */
 
 #define	CSUM_IP6_UDP		0x00000200	/* UDP checksum offload */
 #define	CSUM_IP6_TCP		0x00000400	/* TCP checksum offload */
 #define	CSUM_IP6_SCTP		0x00000800	/* SCTP checksum offload */
 #define	CSUM_IP6_TSO		0x00001000	/* TCP segmentation offload */
 #define	CSUM_IP6_ISCSI		0x00002000	/* iSCSI checksum offload */
 
 /* Inbound checksum support where the checksum was verified by hardware. */
 #define	CSUM_L3_CALC		0x01000000	/* calculated layer 3 csum */
 #define	CSUM_L3_VALID		0x02000000	/* checksum is correct */
 #define	CSUM_L4_CALC		0x04000000	/* calculated layer 4 csum */
 #define	CSUM_L4_VALID		0x08000000	/* checksum is correct */
 #define	CSUM_L5_CALC		0x10000000	/* calculated layer 5 csum */
 #define	CSUM_L5_VALID		0x20000000	/* checksum is correct */
 #define	CSUM_COALESCED		0x40000000	/* contains merged segments */
 
+#define	CSUM_SND_TAG		0x80000000	/* Packet header has send tag */
+
 /*
  * CSUM flag description for use with printf(9) %b identifier.
  */
 #define	CSUM_BITS \
     "\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \
     "\6CSUM_IP_ISCSI" \
     "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \
     "\16CSUM_IP6_ISCSI" \
     "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \
-    "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED"
+    "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG"
 
 /* CSUM flags compatibility mappings. */
 #define	CSUM_IP_CHECKED		CSUM_L3_CALC
 #define	CSUM_IP_VALID		CSUM_L3_VALID
 #define	CSUM_DATA_VALID		CSUM_L4_VALID
 #define	CSUM_PSEUDO_HDR		CSUM_L4_CALC
 #define	CSUM_SCTP_VALID		CSUM_L4_VALID
 #define	CSUM_DELAY_DATA		(CSUM_TCP|CSUM_UDP)
 #define	CSUM_DELAY_IP		CSUM_IP		/* Only v4, no v6 IP hdr csum */
 #define	CSUM_DELAY_DATA_IPV6	(CSUM_TCP_IPV6|CSUM_UDP_IPV6)
 #define	CSUM_DATA_VALID_IPV6	CSUM_DATA_VALID
 #define	CSUM_TCP		CSUM_IP_TCP
 #define	CSUM_UDP		CSUM_IP_UDP
 #define	CSUM_SCTP		CSUM_IP_SCTP
 #define	CSUM_TSO		(CSUM_IP_TSO|CSUM_IP6_TSO)
 #define	CSUM_UDP_IPV6		CSUM_IP6_UDP
 #define	CSUM_TCP_IPV6		CSUM_IP6_TCP
 #define	CSUM_SCTP_IPV6		CSUM_IP6_SCTP
 
 /*
  * mbuf types describing the content of the mbuf (including external storage).
  */
 #define	MT_NOTMBUF	0	/* USED INTERNALLY ONLY! Object is not mbuf */
 #define	MT_DATA		1	/* dynamic (data) allocation */
 #define	MT_HEADER	MT_DATA	/* packet header, use M_PKTHDR instead */
 
 #define	MT_VENDOR1	4	/* for vendor-internal use */
 #define	MT_VENDOR2	5	/* for vendor-internal use */
 #define	MT_VENDOR3	6	/* for vendor-internal use */
 #define	MT_VENDOR4	7	/* for vendor-internal use */
 
 #define	MT_SONAME	8	/* socket name */
 
 #define	MT_EXP1		9	/* for experimental use */
 #define	MT_EXP2		10	/* for experimental use */
 #define	MT_EXP3		11	/* for experimental use */
 #define	MT_EXP4		12	/* for experimental use */
 
 #define	MT_CONTROL	14	/* extra-data protocol message */
 #define	MT_EXTCONTROL	15	/* control message with externalized contents */
 #define	MT_OOBDATA	16	/* expedited data  */
 
 #define	MT_NOINIT	255	/* Not a type but a flag to allocate
 				   a non-initialized mbuf */
 
 /*
  * String names of mbuf-related UMA(9) and malloc(9) types.  Exposed to
  * !_KERNEL so that monitoring tools can look up the zones with
  * libmemstat(3).
  */
 #define	MBUF_MEM_NAME		"mbuf"
 #define	MBUF_CLUSTER_MEM_NAME	"mbuf_cluster"
 #define	MBUF_PACKET_MEM_NAME	"mbuf_packet"
 #define	MBUF_JUMBOP_MEM_NAME	"mbuf_jumbo_page"
 #define	MBUF_JUMBO9_MEM_NAME	"mbuf_jumbo_9k"
 #define	MBUF_JUMBO16_MEM_NAME	"mbuf_jumbo_16k"
 #define	MBUF_TAG_MEM_NAME	"mbuf_tag"
 #define	MBUF_EXTREFCNT_MEM_NAME	"mbuf_ext_refcnt"
 
 #ifdef _KERNEL
 
 #ifdef WITNESS
 #define	MBUF_CHECKSLEEP(how) do {					\
 	if (how == M_WAITOK)						\
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,		\
 		    "Sleeping in \"%s\"", __func__);			\
 } while (0)
 #else
 #define	MBUF_CHECKSLEEP(how)
 #endif
 
 /*
  * Network buffer allocation API
  *
  * The rest of it is defined in kern/kern_mbuf.c
  */
 extern uma_zone_t	zone_mbuf;
 extern uma_zone_t	zone_clust;
 extern uma_zone_t	zone_pack;
 extern uma_zone_t	zone_jumbop;
 extern uma_zone_t	zone_jumbo9;
 extern uma_zone_t	zone_jumbo16;
 
 void		 mb_dupcl(struct mbuf *, struct mbuf *);
 void		 mb_free_ext(struct mbuf *);
 void		 m_adj(struct mbuf *, int);
 int		 m_apply(struct mbuf *, int, int,
 		    int (*)(void *, void *, u_int), void *);
 int		 m_append(struct mbuf *, int, c_caddr_t);
 void		 m_cat(struct mbuf *, struct mbuf *);
 void		 m_catpkt(struct mbuf *, struct mbuf *);
 int		 m_clget(struct mbuf *m, int how);
 void 		*m_cljget(struct mbuf *m, int how, int size);
 struct mbuf	*m_collapse(struct mbuf *, int, int);
 void		 m_copyback(struct mbuf *, int, int, c_caddr_t);
 void		 m_copydata(const struct mbuf *, int, int, caddr_t);
 struct mbuf	*m_copym(struct mbuf *, int, int, int);
 struct mbuf	*m_copypacket(struct mbuf *, int);
 void		 m_copy_pkthdr(struct mbuf *, struct mbuf *);
 struct mbuf	*m_copyup(struct mbuf *, int, int);
 struct mbuf	*m_defrag(struct mbuf *, int);
 void		 m_demote_pkthdr(struct mbuf *);
 void		 m_demote(struct mbuf *, int, int);
 struct mbuf	*m_devget(char *, int, int, struct ifnet *,
 		    void (*)(char *, caddr_t, u_int));
 void		 m_dispose_extcontrolm(struct mbuf *m);
 struct mbuf	*m_dup(const struct mbuf *, int);
 int		 m_dup_pkthdr(struct mbuf *, const struct mbuf *, int);
 void		 m_extadd(struct mbuf *, char *, u_int, m_ext_free_t,
 		    void *, void *, int, int);
 u_int		 m_fixhdr(struct mbuf *);
 struct mbuf	*m_fragment(struct mbuf *, int, int);
 void		 m_freem(struct mbuf *);
 struct mbuf	*m_get2(int, int, short, int);
 struct mbuf	*m_getjcl(int, short, int, int);
 struct mbuf	*m_getm2(struct mbuf *, int, int, short, int);
 struct mbuf	*m_getptr(struct mbuf *, int, int *);
 u_int		 m_length(struct mbuf *, struct mbuf **);
 int		 m_mbuftouio(struct uio *, const struct mbuf *, int);
 void		 m_move_pkthdr(struct mbuf *, struct mbuf *);
 int		 m_pkthdr_init(struct mbuf *, int);
 struct mbuf	*m_prepend(struct mbuf *, int, int);
 void		 m_print(const struct mbuf *, int);
 struct mbuf	*m_pulldown(struct mbuf *, int, int, int *);
 struct mbuf	*m_pullup(struct mbuf *, int);
 int		 m_sanity(struct mbuf *, int);
 struct mbuf	*m_split(struct mbuf *, int, int);
 struct mbuf	*m_uiotombuf(struct uio *, int, int, int, int);
 struct mbuf	*m_unshare(struct mbuf *, int);
+void		 m_snd_tag_init(struct m_snd_tag *, struct ifnet *);
+void		 m_snd_tag_destroy(struct m_snd_tag *);
 
 static __inline int
 m_gettype(int size)
 {
 	int type;
 
 	switch (size) {
 	case MSIZE:
 		type = EXT_MBUF;
 		break;
 	case MCLBYTES:
 		type = EXT_CLUSTER;
 		break;
 #if MJUMPAGESIZE != MCLBYTES
 	case MJUMPAGESIZE:
 		type = EXT_JUMBOP;
 		break;
 #endif
 	case MJUM9BYTES:
 		type = EXT_JUMBO9;
 		break;
 	case MJUM16BYTES:
 		type = EXT_JUMBO16;
 		break;
 	default:
 		panic("%s: invalid cluster size %d", __func__, size);
 	}
 
 	return (type);
 }
 
 /*
  * Associated an external reference counted buffer with an mbuf.
  */
 static __inline void
 m_extaddref(struct mbuf *m, char *buf, u_int size, u_int *ref_cnt,
     m_ext_free_t freef, void *arg1, void *arg2)
 {
 
 	KASSERT(ref_cnt != NULL, ("%s: ref_cnt not provided", __func__));
 
 	atomic_add_int(ref_cnt, 1);
 	m->m_flags |= M_EXT;
 	m->m_ext.ext_buf = buf;
 	m->m_ext.ext_cnt = ref_cnt;
 	m->m_data = m->m_ext.ext_buf;
 	m->m_ext.ext_size = size;
 	m->m_ext.ext_free = freef;
 	m->m_ext.ext_arg1 = arg1;
 	m->m_ext.ext_arg2 = arg2;
 	m->m_ext.ext_type = EXT_EXTREF;
 	m->m_ext.ext_flags = 0;
 }
 
 static __inline uma_zone_t
 m_getzone(int size)
 {
 	uma_zone_t zone;
 
 	switch (size) {
 	case MCLBYTES:
 		zone = zone_clust;
 		break;
 #if MJUMPAGESIZE != MCLBYTES
 	case MJUMPAGESIZE:
 		zone = zone_jumbop;
 		break;
 #endif
 	case MJUM9BYTES:
 		zone = zone_jumbo9;
 		break;
 	case MJUM16BYTES:
 		zone = zone_jumbo16;
 		break;
 	default:
 		panic("%s: invalid cluster size %d", __func__, size);
 	}
 
 	return (zone);
 }
 
 /*
  * Initialize an mbuf with linear storage.
  *
  * Inline because the consumer text overhead will be roughly the same to
  * initialize or call a function with this many parameters and M_PKTHDR
  * should go away with constant propagation for !MGETHDR.
  */
 static __inline int
 m_init(struct mbuf *m, int how, short type, int flags)
 {
 	int error;
 
 	m->m_next = NULL;
 	m->m_nextpkt = NULL;
 	m->m_data = m->m_dat;
 	m->m_len = 0;
 	m->m_flags = flags;
 	m->m_type = type;
 	if (flags & M_PKTHDR)
 		error = m_pkthdr_init(m, how);
 	else
 		error = 0;
 
 	MBUF_PROBE5(m__init, m, how, type, flags, error);
 	return (error);
 }
 
 static __inline struct mbuf *
 m_get(int how, short type)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = 0;
 	args.type = type;
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	MBUF_PROBE3(m__get, how, type, m);
 	return (m);
 }
 
 static __inline struct mbuf *
 m_gethdr(int how, short type)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = M_PKTHDR;
 	args.type = type;
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	MBUF_PROBE3(m__gethdr, how, type, m);
 	return (m);
 }
 
 static __inline struct mbuf *
 m_getcl(int how, short type, int flags)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = flags;
 	args.type = type;
 	m = uma_zalloc_arg(zone_pack, &args, how);
 	MBUF_PROBE4(m__getcl, how, type, flags, m);
 	return (m);
 }
 
 /*
  * XXX: m_cljset() is a dangerous API.  One must attach only a new,
  * unreferenced cluster to an mbuf(9).  It is not possible to assert
  * that, so care can be taken only by users of the API.
  */
 static __inline void
 m_cljset(struct mbuf *m, void *cl, int type)
 {
 	int size;
 
 	switch (type) {
 	case EXT_CLUSTER:
 		size = MCLBYTES;
 		break;
 #if MJUMPAGESIZE != MCLBYTES
 	case EXT_JUMBOP:
 		size = MJUMPAGESIZE;
 		break;
 #endif
 	case EXT_JUMBO9:
 		size = MJUM9BYTES;
 		break;
 	case EXT_JUMBO16:
 		size = MJUM16BYTES;
 		break;
 	default:
 		panic("%s: unknown cluster type %d", __func__, type);
 		break;
 	}
 
 	m->m_data = m->m_ext.ext_buf = cl;
 	m->m_ext.ext_free = m->m_ext.ext_arg1 = m->m_ext.ext_arg2 = NULL;
 	m->m_ext.ext_size = size;
 	m->m_ext.ext_type = type;
 	m->m_ext.ext_flags = EXT_FLAG_EMBREF;
 	m->m_ext.ext_count = 1;
 	m->m_flags |= M_EXT;
 	MBUF_PROBE3(m__cljset, m, cl, type);
 }
 
 static __inline void
 m_chtype(struct mbuf *m, short new_type)
 {
 
 	m->m_type = new_type;
 }
 
 static __inline void
 m_clrprotoflags(struct mbuf *m)
 {
 
 	while (m) {
 		m->m_flags &= ~M_PROTOFLAGS;
 		m = m->m_next;
 	}
 }
 
 static __inline struct mbuf *
 m_last(struct mbuf *m)
 {
 
 	while (m->m_next)
 		m = m->m_next;
 	return (m);
 }
 
 static inline u_int
 m_extrefcnt(struct mbuf *m)
 {
 
 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT missing", __func__));
 
 	return ((m->m_ext.ext_flags & EXT_FLAG_EMBREF) ? m->m_ext.ext_count :
 	    *m->m_ext.ext_cnt);
 }
 
 /*
  * mbuf, cluster, and external object allocation macros (for compatibility
  * purposes).
  */
 #define	M_MOVE_PKTHDR(to, from)	m_move_pkthdr((to), (from))
 #define	MGET(m, how, type)	((m) = m_get((how), (type)))
 #define	MGETHDR(m, how, type)	((m) = m_gethdr((how), (type)))
 #define	MCLGET(m, how)		m_clget((m), (how))
 #define	MEXTADD(m, buf, size, free, arg1, arg2, flags, type)		\
     m_extadd((m), (char *)(buf), (size), (free), (arg1), (arg2),	\
     (flags), (type))
 #define	m_getm(m, len, how, type)					\
     m_getm2((m), (len), (how), (type), M_PKTHDR)
 
 /*
  * Evaluate TRUE if it's safe to write to the mbuf m's data region (this can
  * be both the local data payload, or an external buffer area, depending on
  * whether M_EXT is set).
  */
 #define	M_WRITABLE(m)	(!((m)->m_flags & M_RDONLY) &&			\
 			 (!(((m)->m_flags & M_EXT)) ||			\
 			 (m_extrefcnt(m) == 1)))
 
 /* Check if the supplied mbuf has a packet header, or else panic. */
 #define	M_ASSERTPKTHDR(m)						\
 	KASSERT((m) != NULL && (m)->m_flags & M_PKTHDR,			\
 	    ("%s: no mbuf packet header!", __func__))
 
 /*
  * Ensure that the supplied mbuf is a valid, non-free mbuf.
  *
  * XXX: Broken at the moment.  Need some UMA magic to make it work again.
  */
 #define	M_ASSERTVALID(m)						\
 	KASSERT((((struct mbuf *)m)->m_flags & 0) == 0,			\
 	    ("%s: attempted use of a free mbuf!", __func__))
 
 /*
  * Return the address of the start of the buffer associated with an mbuf,
  * handling external storage, packet-header mbufs, and regular data mbufs.
  */
 #define	M_START(m)							\
 	(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf :			\
 	 ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] :		\
 	 &(m)->m_dat[0])
 
 /*
  * Return the size of the buffer associated with an mbuf, handling external
  * storage, packet-header mbufs, and regular data mbufs.
  */
 #define	M_SIZE(m)							\
 	(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size :			\
 	 ((m)->m_flags & M_PKTHDR) ? MHLEN :				\
 	 MLEN)
 
 /*
  * Set the m_data pointer of a newly allocated mbuf to place an object of the
  * specified size at the end of the mbuf, longword aligned.
  *
  * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
  * separate macros, each asserting that it was called at the proper moment.
  * This required callers to themselves test the storage type and call the
  * right one.  Rather than require callers to be aware of those layout
  * decisions, we centralize here.
  */
 static __inline void
 m_align(struct mbuf *m, int len)
 {
 #ifdef INVARIANTS
 	const char *msg = "%s: not a virgin mbuf";
 #endif
 	int adjust;
 
 	KASSERT(m->m_data == M_START(m), (msg, __func__));
 
 	adjust = M_SIZE(m) - len;
 	m->m_data += adjust &~ (sizeof(long)-1);
 }
 
 #define	M_ALIGN(m, len)		m_align(m, len)
 #define	MH_ALIGN(m, len)	m_align(m, len)
 #define	MEXT_ALIGN(m, len)	m_align(m, len)
 
 /*
  * Compute the amount of space available before the current start of data in
  * an mbuf.
  *
  * The M_WRITABLE() is a temporary, conservative safety measure: the burden
  * of checking writability of the mbuf data area rests solely with the caller.
  *
  * NB: In previous versions, M_LEADINGSPACE() would only check M_WRITABLE()
  * for mbufs with external storage.  We now allow mbuf-embedded data to be
  * read-only as well.
  */
 #define	M_LEADINGSPACE(m)						\
 	(M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0)
 
 /*
  * Compute the amount of space available after the end of data in an mbuf.
  *
  * The M_WRITABLE() is a temporary, conservative safety measure: the burden
  * of checking writability of the mbuf data area rests solely with the caller.
  *
  * NB: In previous versions, M_TRAILINGSPACE() would only check M_WRITABLE()
  * for mbufs with external storage.  We now allow mbuf-embedded data to be
  * read-only as well.
  */
 #define	M_TRAILINGSPACE(m)						\
 	(M_WRITABLE(m) ?						\
 	    ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0)
 
 /*
  * Arrange to prepend space of size plen to mbuf m.  If a new mbuf must be
  * allocated, how specifies whether to wait.  If the allocation fails, the
  * original mbuf chain is freed and m is set to NULL.
  */
 #define	M_PREPEND(m, plen, how) do {					\
 	struct mbuf **_mmp = &(m);					\
 	struct mbuf *_mm = *_mmp;					\
 	int _mplen = (plen);						\
 	int __mhow = (how);						\
 									\
 	MBUF_CHECKSLEEP(how);						\
 	if (M_LEADINGSPACE(_mm) >= _mplen) {				\
 		_mm->m_data -= _mplen;					\
 		_mm->m_len += _mplen;					\
 	} else								\
 		_mm = m_prepend(_mm, _mplen, __mhow);			\
 	if (_mm != NULL && _mm->m_flags & M_PKTHDR)			\
 		_mm->m_pkthdr.len += _mplen;				\
 	*_mmp = _mm;							\
 } while (0)
 
 /*
  * Change mbuf to new type.  This is a relatively expensive operation and
  * should be avoided.
  */
 #define	MCHTYPE(m, t)	m_chtype((m), (t))
 
+/* Return the rcvif of a packet header. */
+static __inline struct ifnet *
+m_rcvif(struct mbuf *m)
+{
+
+	M_ASSERTPKTHDR(m);
+	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG)
+		return (NULL);
+	return (m->m_pkthdr.rcvif);
+}
+
 /* Length to m_copy to copy all. */
 #define	M_COPYALL	1000000000
 
 extern int		max_datalen;	/* MHLEN - max_hdr */
 extern int		max_hdr;	/* Largest link + protocol header */
 extern int		max_linkhdr;	/* Largest link-level header */
 extern int		max_protohdr;	/* Largest protocol header */
 extern int		nmbclusters;	/* Maximum number of clusters */
 
 /*-
  * Network packets may have annotations attached by affixing a list of
  * "packet tags" to the pkthdr structure.  Packet tags are dynamically
  * allocated semi-opaque data structures that have a fixed header
  * (struct m_tag) that specifies the size of the memory block and a
  * <cookie,type> pair that identifies it.  The cookie is a 32-bit unique
  * unsigned value used to identify a module or ABI.  By convention this value
  * is chosen as the date+time that the module is created, expressed as the
  * number of seconds since the epoch (e.g., using date -u +'%s').  The type
  * value is an ABI/module-specific value that identifies a particular
  * annotation and is private to the module.  For compatibility with systems
  * like OpenBSD that define packet tags w/o an ABI/module cookie, the value
  * PACKET_ABI_COMPAT is used to implement m_tag_get and m_tag_find
  * compatibility shim functions and several tag types are defined below.
  * Users that do not require compatibility should use a private cookie value
  * so that packet tag-related definitions can be maintained privately.
  *
  * Note that the packet tag returned by m_tag_alloc has the default memory
  * alignment implemented by malloc.  To reference private data one can use a
  * construct like:
  *
  *	struct m_tag *mtag = m_tag_alloc(...);
  *	struct foo *p = (struct foo *)(mtag+1);
  *
  * if the alignment of struct m_tag is sufficient for referencing members of
  * struct foo.  Otherwise it is necessary to embed struct m_tag within the
  * private data structure to insure proper alignment; e.g.,
  *
  *	struct foo {
  *		struct m_tag	tag;
  *		...
  *	};
  *	struct foo *p = (struct foo *) m_tag_alloc(...);
  *	struct m_tag *mtag = &p->tag;
  */
 
 /*
  * Persistent tags stay with an mbuf until the mbuf is reclaimed.  Otherwise
  * tags are expected to ``vanish'' when they pass through a network
  * interface.  For most interfaces this happens normally as the tags are
  * reclaimed when the mbuf is free'd.  However in some special cases
  * reclaiming must be done manually.  An example is packets that pass through
  * the loopback interface.  Also, one must be careful to do this when
  * ``turning around'' packets (e.g., icmp_reflect).
  *
  * To mark a tag persistent bit-or this flag in when defining the tag id.
  * The tag will then be treated as described above.
  */
 #define	MTAG_PERSISTENT				0x800
 
 #define	PACKET_TAG_NONE				0  /* Nadda */
 
 /* Packet tags for use with PACKET_ABI_COMPAT. */
 #define	PACKET_TAG_IPSEC_IN_DONE		1  /* IPsec applied, in */
 #define	PACKET_TAG_IPSEC_OUT_DONE		2  /* IPsec applied, out */
 #define	PACKET_TAG_IPSEC_IN_CRYPTO_DONE		3  /* NIC IPsec crypto done */
 #define	PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED	4  /* NIC IPsec crypto req'ed */
 #define	PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO	5  /* NIC notifies IPsec */
 #define	PACKET_TAG_IPSEC_PENDING_TDB		6  /* Reminder to do IPsec */
 #define	PACKET_TAG_BRIDGE			7  /* Bridge processing done */
 #define	PACKET_TAG_GIF				8  /* GIF processing done */
 #define	PACKET_TAG_GRE				9  /* GRE processing done */
 #define	PACKET_TAG_IN_PACKET_CHECKSUM		10 /* NIC checksumming done */
 #define	PACKET_TAG_ENCAP			11 /* Encap.  processing */
 #define	PACKET_TAG_IPSEC_SOCKET			12 /* IPSEC socket ref */
 #define	PACKET_TAG_IPSEC_HISTORY		13 /* IPSEC history */
 #define	PACKET_TAG_IPV6_INPUT			14 /* IPV6 input processing */
 #define	PACKET_TAG_DUMMYNET			15 /* dummynet info */
 #define	PACKET_TAG_DIVERT			17 /* divert info */
 #define	PACKET_TAG_IPFORWARD			18 /* ipforward info */
 #define	PACKET_TAG_MACLABEL	(19 | MTAG_PERSISTENT) /* MAC label */
 #define	PACKET_TAG_PF		(21 | MTAG_PERSISTENT) /* PF/ALTQ information */
 #define	PACKET_TAG_RTSOCKFAM			25 /* rtsock sa family */
 #define	PACKET_TAG_IPOPTIONS			27 /* Saved IP options */
 #define	PACKET_TAG_CARP				28 /* CARP info */
 #define	PACKET_TAG_IPSEC_NAT_T_PORTS		29 /* two uint16_t */
 #define	PACKET_TAG_ND_OUTGOING			30 /* ND outgoing */
 
 /* Specific cookies and tags. */
 
 /* Packet tag routines. */
 struct m_tag	*m_tag_alloc(u_int32_t, int, int, int);
 void		 m_tag_delete(struct mbuf *, struct m_tag *);
 void		 m_tag_delete_chain(struct mbuf *, struct m_tag *);
 void		 m_tag_free_default(struct m_tag *);
 struct m_tag	*m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *);
 struct m_tag	*m_tag_copy(struct m_tag *, int);
 int		 m_tag_copy_chain(struct mbuf *, const struct mbuf *, int);
 void		 m_tag_delete_nonpersistent(struct mbuf *);
 
 /*
  * Initialize the list of tags associated with an mbuf.
  */
 static __inline void
 m_tag_init(struct mbuf *m)
 {
 
 	SLIST_INIT(&m->m_pkthdr.tags);
 }
 
 /*
  * Set up the contents of a tag.  Note that this does not fill in the free
  * method; the caller is expected to do that.
  *
  * XXX probably should be called m_tag_init, but that was already taken.
  */
 static __inline void
 m_tag_setup(struct m_tag *t, u_int32_t cookie, int type, int len)
 {
 
 	t->m_tag_id = type;
 	t->m_tag_len = len;
 	t->m_tag_cookie = cookie;
 }
 
 /*
  * Reclaim resources associated with a tag.
  */
 static __inline void
 m_tag_free(struct m_tag *t)
 {
 
 	(*t->m_tag_free)(t);
 }
 
 /*
  * Return the first tag associated with an mbuf.
  */
 static __inline struct m_tag *
 m_tag_first(struct mbuf *m)
 {
 
 	return (SLIST_FIRST(&m->m_pkthdr.tags));
 }
 
 /*
  * Return the next tag in the list of tags associated with an mbuf.
  */
 static __inline struct m_tag *
 m_tag_next(struct mbuf *m __unused, struct m_tag *t)
 {
 
 	return (SLIST_NEXT(t, m_tag_link));
 }
 
 /*
  * Prepend a tag to the list of tags associated with an mbuf.
  */
 static __inline void
 m_tag_prepend(struct mbuf *m, struct m_tag *t)
 {
 
 	SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link);
 }
 
 /*
  * Unlink a tag from the list of tags associated with an mbuf.
  */
 static __inline void
 m_tag_unlink(struct mbuf *m, struct m_tag *t)
 {
 
 	SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link);
 }
 
 /* These are for OpenBSD compatibility. */
 #define	MTAG_ABI_COMPAT		0		/* compatibility ABI */
 
 static __inline struct m_tag *
 m_tag_get(int type, int length, int wait)
 {
 	return (m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait));
 }
 
 static __inline struct m_tag *
 m_tag_find(struct mbuf *m, int type, struct m_tag *start)
 {
 	return (SLIST_EMPTY(&m->m_pkthdr.tags) ? (struct m_tag *)NULL :
 	    m_tag_locate(m, MTAG_ABI_COMPAT, type, start));
 }
 
+static inline struct m_snd_tag *
+m_snd_tag_ref(struct m_snd_tag *mst)
+{
+
+	refcount_acquire(&mst->refcount);
+	return (mst);
+}
+
+static inline void
+m_snd_tag_rele(struct m_snd_tag *mst)
+{
+
+	if (refcount_release(&mst->refcount))
+		m_snd_tag_destroy(mst);
+}
+
 static __inline struct mbuf *
 m_free(struct mbuf *m)
 {
 	struct mbuf *n = m->m_next;
 
 	MBUF_PROBE1(m__free, m);
 	if ((m->m_flags & (M_PKTHDR|M_NOFREE)) == (M_PKTHDR|M_NOFREE))
 		m_tag_delete_chain(m, NULL);
+	if (m->m_flags & M_PKTHDR && m->m_pkthdr.csum_flags & CSUM_SND_TAG)
+		m_snd_tag_rele(m->m_pkthdr.snd_tag);
 	if (m->m_flags & M_EXT)
 		mb_free_ext(m);
 	else if ((m->m_flags & M_NOFREE) == 0)
 		uma_zfree(zone_mbuf, m);
 	return (n);
 }
 
 static __inline int
 rt_m_getfib(struct mbuf *m)
 {
 	KASSERT(m->m_flags & M_PKTHDR , ("Attempt to get FIB from non header mbuf."));
 	return (m->m_pkthdr.fibnum);
 }
 
 #define M_GETFIB(_m)   rt_m_getfib(_m)
 
 #define M_SETFIB(_m, _fib) do {						\
         KASSERT((_m)->m_flags & M_PKTHDR, ("Attempt to set FIB on non header mbuf."));	\
 	((_m)->m_pkthdr.fibnum) = (_fib);				\
 } while (0)
 
 /* flags passed as first argument for "m_ether_tcpip_hash()" */
 #define	MBUF_HASHFLAG_L2	(1 << 2)
 #define	MBUF_HASHFLAG_L3	(1 << 3)
 #define	MBUF_HASHFLAG_L4	(1 << 4)
 
 /* mbuf hashing helper routines */
 uint32_t	m_ether_tcpip_hash_init(void);
 uint32_t	m_ether_tcpip_hash(const uint32_t, const struct mbuf *, const uint32_t);
 
 #ifdef MBUF_PROFILING
  void m_profile(struct mbuf *m);
  #define M_PROFILE(m) m_profile(m)
 #else
  #define M_PROFILE(m)
 #endif
 
 struct mbufq {
 	STAILQ_HEAD(, mbuf)	mq_head;
 	int			mq_len;
 	int			mq_maxlen;
 };
 
 static inline void
 mbufq_init(struct mbufq *mq, int maxlen)
 {
 
 	STAILQ_INIT(&mq->mq_head);
 	mq->mq_maxlen = maxlen;
 	mq->mq_len = 0;
 }
 
 static inline struct mbuf *
 mbufq_flush(struct mbufq *mq)
 {
 	struct mbuf *m;
 
 	m = STAILQ_FIRST(&mq->mq_head);
 	STAILQ_INIT(&mq->mq_head);
 	mq->mq_len = 0;
 	return (m);
 }
 
 static inline void
 mbufq_drain(struct mbufq *mq)
 {
 	struct mbuf *m, *n;
 
 	n = mbufq_flush(mq);
 	while ((m = n) != NULL) {
 		n = STAILQ_NEXT(m, m_stailqpkt);
 		m_freem(m);
 	}
 }
 
 static inline struct mbuf *
 mbufq_first(const struct mbufq *mq)
 {
 
 	return (STAILQ_FIRST(&mq->mq_head));
 }
 
 static inline struct mbuf *
 mbufq_last(const struct mbufq *mq)
 {
 
 	return (STAILQ_LAST(&mq->mq_head, mbuf, m_stailqpkt));
 }
 
 static inline int
 mbufq_full(const struct mbufq *mq)
 {
 
 	return (mq->mq_len >= mq->mq_maxlen);
 }
 
 static inline int
 mbufq_len(const struct mbufq *mq)
 {
 
 	return (mq->mq_len);
 }
 
 static inline int
 mbufq_enqueue(struct mbufq *mq, struct mbuf *m)
 {
 
 	if (mbufq_full(mq))
 		return (ENOBUFS);
 	STAILQ_INSERT_TAIL(&mq->mq_head, m, m_stailqpkt);
 	mq->mq_len++;
 	return (0);
 }
 
 static inline struct mbuf *
 mbufq_dequeue(struct mbufq *mq)
 {
 	struct mbuf *m;
 
 	m = STAILQ_FIRST(&mq->mq_head);
 	if (m) {
 		STAILQ_REMOVE_HEAD(&mq->mq_head, m_stailqpkt);
 		m->m_nextpkt = NULL;
 		mq->mq_len--;
 	}
 	return (m);
 }
 
 static inline void
 mbufq_prepend(struct mbufq *mq, struct mbuf *m)
 {
 
 	STAILQ_INSERT_HEAD(&mq->mq_head, m, m_stailqpkt);
 	mq->mq_len++;
 }
 
 /*
  * Note: this doesn't enforce the maximum list size for dst.
  */
 static inline void
 mbufq_concat(struct mbufq *mq_dst, struct mbufq *mq_src)
 {
 
 	mq_dst->mq_len += mq_src->mq_len;
 	STAILQ_CONCAT(&mq_dst->mq_head, &mq_src->mq_head);
 	mq_src->mq_len = 0;
 }
 
 #ifdef _SYS_TIMESPEC_H_
 static inline void
 mbuf_tstmp2timespec(struct mbuf *m, struct timespec *ts)
 {
 
 	KASSERT((m->m_flags & M_PKTHDR) != 0, ("mbuf %p no M_PKTHDR", m));
 	KASSERT((m->m_flags & M_TSTMP) != 0, ("mbuf %p no M_TSTMP", m));
 	ts->tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
 	ts->tv_nsec = m->m_pkthdr.rcv_tstmp % 1000000000;
 }
 #endif
 
 #ifdef NETDUMP
 /* Invoked from the netdump client code. */
 void	netdump_mbuf_drain(void);
 void	netdump_mbuf_dump(void);
 void	netdump_mbuf_reinit(int nmbuf, int nclust, int clsize);
 #endif
 
 #endif /* _KERNEL */
 #endif /* !_SYS_MBUF_H_ */