diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h
index 6b25dff7081a..4762b46f3799 100644
--- a/sys/dev/cxgbe/adapter.h
+++ b/sys/dev/cxgbe/adapter.h
@@ -1,1491 +1,1487 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 #ifndef __T4_ADAPTER_H__
 #define __T4_ADAPTER_H__
 
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/counter.h>
 #include <sys/rman.h>
 #include <sys/types.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/vmem.h>
 #include <vm/uma.h>
 
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pcireg.h>
 #include <machine/bus.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/pfil.h>
 #include <netinet/in.h>
 #include <netinet/tcp_lro.h>
 
 #include "offload.h"
 #include "t4_ioctl.h"
 #include "common/t4_msg.h"
 #include "firmware/t4fw_interface.h"
 
 /* Trace class used by the driver; it is an alias for KTR_SPARE3. */
 #define KTR_CXGBE	KTR_SPARE3
 /* Declaration of the driver's malloc type; the definition is not in this file. */
 MALLOC_DECLARE(M_CXGBE);
 /*
  * Panics with a message naming `s' together with the file and line of the
  * code path that has not been implemented.
  */
 #define CXGBE_UNIMPLEMENTED(s) \
     panic("%s (%s, line %d) not implemented yet.", s, __FILE__, __LINE__)
 
 /*
  * Same as LIST_HEAD from queue.h.  This is to avoid conflict with LinuxKPI's
  * LIST_HEAD when building iw_cxgbe.
  */
 #define	CXGBE_LIST_HEAD(name, type)					\
 struct name {								\
 	struct type *lh_first;	/* first element */			\
 }
 
 /*
  * Compatibility shims: when SYSCTL_ADD_UQUAD is not available, map the
  * 64-bit sysctl names used by the driver onto their QUAD equivalents.
  */
 #ifndef SYSCTL_ADD_UQUAD
 #define SYSCTL_ADD_UQUAD SYSCTL_ADD_QUAD
 #define sysctl_handle_64 sysctl_handle_quad
 #define CTLTYPE_U64 CTLTYPE_QUAD
 #endif
 
 /* Declaration of the _hw_cxgbe sysctl node. */
 SYSCTL_DECL(_hw_cxgbe);
 
 /* Forward declaration; the full definition of struct adapter appears below. */
 struct adapter;
 typedef struct adapter adapter_t;
 
 /*
  * Compile-time sizing constants for SGE ingress/egress queues, rx buffer
  * zones, and tx work requests.
  */
 enum {
 	/*
 	 * All ingress queues use this entry size.  Note that the firmware event
 	 * queue and any iq expecting CPL_RX_PKT in the descriptor needs this to
 	 * be at least 64.
 	 */
 	IQ_ESIZE = 64,
 
 	/* Default queue sizes for all kinds of ingress queues */
 	FW_IQ_QSIZE = 256,
 	RX_IQ_QSIZE = 1024,
 
 	/* All egress queues use this entry size */
 	EQ_ESIZE = 64,
 
 	/* Default queue sizes for all kinds of egress queues */
 	CTRL_EQ_QSIZE = 1024,
 	TX_EQ_QSIZE = 1024,
 
 	/*
 	 * Number of rx buffer zones.  The page-sized jumbo zone only counts
 	 * as a separate zone when MJUMPAGESIZE differs from MCLBYTES.
 	 */
 #if MJUMPAGESIZE != MCLBYTES
 	SW_ZONE_SIZES = 4,	/* cluster, jumbop, jumbo9k, jumbo16k */
 #else
 	SW_ZONE_SIZES = 3,	/* cluster, jumbo9k, jumbo16k */
 #endif
 	CL_METADATA_SIZE = CACHE_LINE_SIZE,
 
 	SGE_MAX_WR_NDESC = SGE_MAX_WR_LEN / EQ_ESIZE, /* max WR size in desc */
 	/* Scatter/gather segment limits for the different kinds of tx WR. */
 	TX_SGL_SEGS = 39,
 	TX_SGL_SEGS_TSO = 38,
 	TX_SGL_SEGS_VM = 38,
 	TX_SGL_SEGS_VM_TSO = 37,
 	TX_SGL_SEGS_EO_TSO = 30,	/* XXX: lower for IPv6. */
 	TX_SGL_SEGS_VXLAN_TSO = 37,
 	/* Max WR size expressed in 8-byte units. */
 	TX_WR_FLITS = SGE_MAX_WR_LEN / 8
 };
 
 /* Interrupt delivery types; one bit each so they can be combined in a mask. */
 enum {
 	/* adapter intr_type */
 	INTR_INTX	= (1 << 0),
 	INTR_MSI 	= (1 << 1),
 	INTR_MSIX	= (1 << 2)
 };
 
 /*
  * Bit flags naming individual MAC settings (MTU, promiscuous, all-multicast,
  * VLAN extraction, unicast address, multicast addresses).  XGMAC_ALL is a
  * mask that covers every flag defined here.
  * NOTE(review): the consumer of these flags is not visible in this file.
  */
 enum {
 	XGMAC_MTU	= (1 << 0),
 	XGMAC_PROMISC	= (1 << 1),
 	XGMAC_ALLMULTI	= (1 << 2),
 	XGMAC_VLANEX	= (1 << 3),
 	XGMAC_UCADDR	= (1 << 4),
 	XGMAC_MCADDRS	= (1 << 5),
 
 	XGMAC_ALL	= 0xffff
 };
 
 /*
  * Flags for the synchronized-operation entry and exit routines.  LOCK_HELD
  * deliberately has the same value as HOLD_LOCK.
  */
 enum {
 	/* flags understood by begin_synchronized_op */
 	HOLD_LOCK	= (1 << 0),
 	SLEEP_OK	= (1 << 1),
 	INTR_OK		= (1 << 2),
 
 	/* flags understood by end_synchronized_op */
 	LOCK_HELD	= HOLD_LOCK,
 };
 
 /*
  * Flag bits for several different objects, grouped by the object whose
  * flags field they apply to.  Bit values are reused across groups (e.g.
  * FULL_INIT_DONE and DOOMED are both bit 0), so a flag is only meaningful
  * when tested against the flags field of its own group.
  */
 enum {
 	/* adapter flags */
 	FULL_INIT_DONE	= (1 << 0),
 	FW_OK		= (1 << 1),
 	CHK_MBOX_ACCESS	= (1 << 2),
 	MASTER_PF	= (1 << 3),
 	ADAP_SYSCTL_CTX	= (1 << 4),
 	ADAP_ERR	= (1 << 5),
 	BUF_PACKING_OK	= (1 << 6),
 	IS_VF		= (1 << 7),
 	KERN_TLS_ON	= (1 << 8),	/* HW is configured for KERN_TLS */
 	CXGBE_BUSY	= (1 << 9),
 	HW_OFF_LIMITS	= (1 << 10),	/* off limits to all except reset_thread */
 
 	/* port flags */
 	HAS_TRACEQ	= (1 << 3),
 	FIXED_IFMEDIA	= (1 << 4),	/* ifmedia list doesn't change. */
 
 	/* VI flags */
 	DOOMED		= (1 << 0),
 	VI_INIT_DONE	= (1 << 1),
 	VI_SYSCTL_CTX	= (1 << 2),
 	TX_USES_VM_WR 	= (1 << 3),
 	VI_SKIP_STATS 	= (1 << 4),
 
 	/* adapter debug_flags */
 	DF_DUMP_MBOX		= (1 << 0),	/* Log all mbox cmd/rpl. */
 	DF_LOAD_FW_ANYTIME	= (1 << 1),	/* Allow LOAD_FW after init */
 	DF_DISABLE_TCB_CACHE	= (1 << 2),	/* Disable TCB cache (T6+) */
 	DF_DISABLE_CFG_RETRY	= (1 << 3),	/* Disable fallback config */
 	DF_VERBOSE_SLOWINTR	= (1 << 4),	/* Chatty slow intr handler */
 };
 
 /*
  * Helpers for the DOOMED bit in a VI's flags and the CXGBE_BUSY bit in an
  * adapter's flags.  These are plain (non-atomic) read/modify/write
  * operations on the flags field; any locking is up to the caller.
  */
 #define IS_DOOMED(vi)	((vi)->flags & DOOMED)
 #define SET_DOOMED(vi)	do {(vi)->flags |= DOOMED;} while (0)
 #define IS_BUSY(sc)	((sc)->flags & CXGBE_BUSY)
 #define SET_BUSY(sc)	do {(sc)->flags |= CXGBE_BUSY;} while (0)
 #define CLR_BUSY(sc)	do {(sc)->flags &= ~CXGBE_BUSY;} while (0)
 
 /*
  * Per-VI state.  A VI points back at its port (pi) and adapter, and owns
  * contiguous slices of the adapter-wide queue arrays: each first_xxx/nxxx
  * pair gives the starting index and count that the for_each_*q() iterators
  * walk.
  */
 struct vi_info {
 	device_t dev;
 	struct port_info *pi;
 	struct adapter *adapter;
 
 	struct ifnet *ifp;
 	struct pfil_head *pfil;
 
 	unsigned long flags;	/* VI flags (DOOMED, VI_INIT_DONE, ...) */
 	int if_flags;
 
 	uint16_t *rss, *nm_rss;
 	uint16_t viid;		/* opaque VI identifier */
 	uint16_t smt_idx;
 	uint16_t vin;
 	uint8_t vfvld;
 	int16_t  xact_addr_filt;/* index of exact MAC address filter */
 	uint16_t rss_size;	/* size of VI's RSS table slice */
 	uint16_t rss_base;	/* start of VI's RSS table slice */
 	int hashen;
 
 	int nintr;
 	int first_intr;
 
 	/* These need to be int as they are used in sysctl */
 	int ntxq;		/* # of tx queues */
 	int first_txq;		/* index of first tx queue */
 	int rsrv_noflowq; 	/* Reserve queue 0 for non-flowid packets */
 	int nrxq;		/* # of rx queues */
 	int first_rxq;		/* index of first rx queue */
 	int nofldtxq;		/* # of offload tx queues */
 	int first_ofld_txq;	/* index of first offload tx queue */
 	int nofldrxq;		/* # of offload rx queues */
 	int first_ofld_rxq;	/* index of first offload rx queue */
 	int nnmtxq;		/* # of netmap tx queues */
 	int first_nm_txq;	/* index of first netmap tx queue */
 	int nnmrxq;		/* # of netmap rx queues */
 	int first_nm_rxq;	/* index of first netmap rx queue */
 	int tmr_idx;
 	int ofld_tmr_idx;
 	int pktc_idx;
 	int ofld_pktc_idx;
 	int qsize_rxq;
 	int qsize_txq;
 
 	struct timeval last_refreshed;
 	struct fw_vi_stats_vf stats;
 	struct mtx tick_mtx;
 	struct callout tick;
 
 	struct sysctl_ctx_list ctx;
 	struct sysctl_oid *rxq_oid;
 	struct sysctl_oid *txq_oid;
 	struct sysctl_oid *nm_rxq_oid;
 	struct sysctl_oid *nm_txq_oid;
 	struct sysctl_oid *ofld_rxq_oid;
 	struct sysctl_oid *ofld_txq_oid;
 
 	uint8_t hw_addr[ETHER_ADDR_LEN]; /* factory MAC address, won't change */
 };
 
 /* Parameters of the per-channel tx rate limiter (see tx_sched_params). */
 struct tx_ch_rl_params {
 	enum fw_sched_params_rate ratemode;	/* %port (REL) or kbps (ABS) */
 	uint32_t maxrate;
 };
 
 /*
  * CLRL state: lifecycle of a class rate limiter entry
  * (tx_cl_rl_params.state), from unset to configured in hardware.
  */
 enum clrl_state {
 	CS_UNINITIALIZED = 0,
 	CS_PARAMS_SET,			/* sw parameters have been set. */
 	CS_HW_UPDATE_REQUESTED,		/* async HW update requested. */
 	CS_HW_UPDATE_IN_PROGRESS,	/* sync hw update in progress. */
 	CS_HW_CONFIGURED		/* configured in the hardware. */
 };
 
 /* CLRL flags (presumably stored in tx_cl_rl_params.flags -- confirm). */
 enum {
 	CF_USER		= (1 << 0),	/* was configured by driver ioctl. */
 };
 
 /* One class rate limiter entry: its state, reference count, and settings. */
 struct tx_cl_rl_params {
 	enum clrl_state state;
 	int refcount;
 	uint8_t flags;
 	enum fw_sched_params_rate ratemode;	/* %port REL or ABS value */
 	enum fw_sched_params_unit rateunit;	/* kbps or pps (when ABS) */
 	enum fw_sched_params_mode mode;		/* aggr or per-flow */
 	uint32_t maxrate;
 	uint16_t pktsize;
 	uint16_t burstsize;
 };
 
 /*
  * Tx scheduler parameters for a channel/port.  cl_rl is a flexible array
  * member, so the structure must be allocated with room for however many
  * class rate limiter entries are needed; that count is not stored here.
  */
 struct tx_sched_params {
 	/* Channel Rate Limiter */
 	struct tx_ch_rl_params ch_rl;
 
 	/* Class WRR */
 	/* XXX */
 
 	/* Class Rate Limiter (including the default pktsize and burstsize). */
 	int pktsize;
 	int burstsize;
 	struct tx_cl_rl_params cl_rl[];
 };
 
 /*
  * Per-port state.  vi points at an array of nvi VIs (walked by
  * for_each_vi()); vi[0] is the port's main VI (see IS_MAIN_VI).  pi_lock is
  * the mutex taken by PORT_LOCK()/PORT_UNLOCK().
  */
 struct port_info {
 	device_t dev;
 	struct adapter *adapter;
 
 	struct vi_info *vi;
 	int nvi;
 	int up_vis;
 	int uld_vis;
 	bool vxlan_tcam_entry;
 
 	struct tx_sched_params *sched_params;
 
 	struct mtx pi_lock;
 	char lockname[16];
 	unsigned long flags;	/* port flags (HAS_TRACEQ, FIXED_IFMEDIA) */
 
 	uint8_t  lport;		/* associated offload logical port */
 	int8_t   mdio_addr;
 	uint8_t  port_type;
 	uint8_t  mod_type;
 	uint8_t  port_id;
 	uint8_t  tx_chan;
 	uint8_t  mps_bg_map;	/* rx MPS buffer group bitmap */
 	uint8_t  rx_e_chan_map;	/* rx TP e-channel bitmap */
 	uint8_t  rx_c_chan;	/* rx TP c-channel */
 
 	struct link_config link_cfg;
 	struct ifmedia media;
 
  	struct port_stats stats;
 	u_int tnl_cong_drops;
 	u_int tx_parse_error;
 	int fcs_reg;
 	uint64_t fcs_base;
 };
 
 /* True if vi is the first entry (index 0) of its port's VI array. */
 #define	IS_MAIN_VI(vi)		((vi) == &((vi)->pi->vi[0]))
 
 /*
  * Bookkeeping for an rx cluster: the UMA zone, the cluster's address, and a
  * reference count.
  * NOTE(review): presumably stored inside the cluster itself at fl_sdesc.moff
  * and bounded by CL_METADATA_SIZE -- confirm against the rx path.
  */
 struct cluster_metadata {
 	uma_zone_t zone;
 	caddr_t cl;
 	u_int refcount;
 };
 
 /* Software descriptor for one freelist buffer (see sge_fl.sdesc). */
 struct fl_sdesc {
 	caddr_t cl;
 	uint16_t nmbuf;	/* # of driver originated mbufs with ref on cluster */
 	int16_t moff;	/* offset of metadata from cl */
 	uint8_t zidx;
 };
 
 /* One tx descriptor: eight 64-bit big-endian flits. */
 struct tx_desc {
 	__be64 flit[8];
 };
 
 /* Software state kept alongside a tx work request (see sge_txq.sdesc). */
 struct tx_sdesc {
 	struct mbuf *m;		/* m_nextpkt linked chain of frames */
 	uint8_t desc_used;	/* # of hardware descriptors used by the WR */
 };
 
 
 /*
  * Layout of one ingress queue entry: an RSS header first, the response
  * control structure last, and the CPL payload filling the space in between.
  * IQ_PAD sizes the payload so that the whole entry is exactly IQ_ESIZE
  * bytes, which the CTASSERT below enforces at compile time.
  */
 #define IQ_PAD (IQ_ESIZE - sizeof(struct rsp_ctrl) - sizeof(struct rss_header))
 struct iq_desc {
 	struct rss_header rss;
 	uint8_t cpl[IQ_PAD];
 	struct rsp_ctrl rsp;
 };
 #undef IQ_PAD
 CTASSERT(sizeof(struct iq_desc) == IQ_ESIZE);
 
 /*
  * Three independent groups of constants: iq flags are bit flags, whereas
  * the iq state and netmap values are small integers, not bits.
  */
 enum {
 	/* iq flags */
 	IQ_SW_ALLOCATED	= (1 << 0),	/* sw resources allocated */
 	IQ_HAS_FL	= (1 << 1),	/* iq associated with a freelist */
 	IQ_RX_TIMESTAMP	= (1 << 2),	/* provide the SGE rx timestamp */
 	IQ_LRO_ENABLED	= (1 << 3),	/* iq is an eth rxq with LRO enabled */
 	IQ_ADJ_CREDIT	= (1 << 4),	/* hw is off by 1 credit for this iq */
 	IQ_HW_ALLOCATED	= (1 << 5),	/* fw/hw resources allocated */
 
 	/* iq state */
 	IQS_DISABLED	= 0,
 	IQS_BUSY	= 1,
 	IQS_IDLE	= 2,
 
 	/* netmap related flags */
 	NM_OFF	= 0,
 	NM_ON	= 1,
 	NM_BUSY	= 2,
 };
 
 /*
  * CPL cookie values, one per consumer.  All eight values (0-7) are in use,
  * so no new cookie can be added without raising NUM_CPL_COOKIES, which the
  * comment on it forbids.
  */
 enum {
 	CPL_COOKIE_RESERVED = 0,
 	CPL_COOKIE_FILTER,
 	CPL_COOKIE_DDP0,
 	CPL_COOKIE_DDP1,
 	CPL_COOKIE_TOM,
 	CPL_COOKIE_HASHFILTER,
 	CPL_COOKIE_ETHOFLD,
 	CPL_COOKIE_KERN_TLS,
 
 	NUM_CPL_COOKIES = 8	/* Limited by M_COOKIE.  Do not increase. */
 };
 
 /*
  * Callback signatures.  A cpl_handler_t receives the ingress queue, the RSS
  * header of the message, and an mbuf; an an_handler_t receives the ingress
  * queue and a response control structure; a fw_msg_handler_t receives the
  * adapter and a pointer to big-endian 64-bit words.  All return int.
  */
 struct sge_iq;
 struct rss_header;
 typedef int (*cpl_handler_t)(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 typedef int (*an_handler_t)(struct sge_iq *, const struct rsp_ctrl *);
 typedef int (*fw_msg_handler_t)(struct adapter *, const __be64 *);
 
 /*
  * Ingress Queue: T4 is producer, driver is consumer.
  *
  * flags holds IQ_xxx bits and state holds one of the IQS_xxx values.  The
  * descriptor ring is an array of struct iq_desc.
  */
 struct sge_iq {
 	uint32_t flags;
 	volatile int state;
 	struct adapter *adapter;
 	struct iq_desc  *desc;	/* KVA of descriptor ring */
 	int8_t   intr_pktc_idx;	/* packet count threshold index */
 	uint8_t  gen;		/* generation bit */
 	uint8_t  intr_params;	/* interrupt holdoff parameters */
 	int8_t   cong;		/* congestion settings */
 	uint16_t qsize;		/* size (# of entries) of the queue */
 	uint16_t sidx;		/* index of the entry with the status page */
 	uint16_t cidx;		/* consumer index */
 	uint16_t cntxt_id;	/* SGE context id for the iq */
 	uint16_t abs_id;	/* absolute SGE id for the iq */
 	int16_t intr_idx;	/* interrupt used by the queue */
 
 	STAILQ_ENTRY(sge_iq) link;
 
 	bus_dma_tag_t desc_tag;
 	bus_dmamap_t desc_map;
 	bus_addr_t ba;		/* bus address of descriptor ring */
 };
 
 /*
  * Egress queue constants.  The eq type values are small integers stored in
  * sge_eq.type; the eq flags are bit flags.  Note that bit 2 is not assigned
  * to any eq flag.
  */
 enum {
 	/* eq type */
 	EQ_CTRL		= 1,
 	EQ_ETH		= 2,
 	EQ_OFLD		= 3,
 
 	/* eq flags */
 	EQ_SW_ALLOCATED	= (1 << 0),	/* sw resources allocated */
 	EQ_HW_ALLOCATED	= (1 << 1),	/* hw/fw resources allocated */
 	EQ_ENABLED	= (1 << 3),	/* open for business */
 	EQ_QFLUSH	= (1 << 4),	/* if_qflush in progress */
 };
 
 /*
  * Doorbell mechanisms.  Listed in order of preference.  Update t4_sysctls
  * too if you change these.
  * NOTE(review): presumably used as bit positions in the `doorbells' fields
  * -- confirm against the users.
  */
 enum {DOORBELL_UDB, DOORBELL_WCWR, DOORBELL_UDBWC, DOORBELL_KDB};
 
 /*
  * Egress Queue: driver is producer, T4 is consumer.
  *
  * Note: A free list is an egress queue (driver produces the buffers and T4
  * consumes them) but it's special enough to have its own struct (see sge_fl).
  *
  * This structure is embedded as the first member of sge_txq and sge_wrq.
  * eq_lock is the mutex taken by EQ_LOCK()/TXQ_LOCK().
  */
 struct sge_eq {
 	unsigned int flags;	/* MUST be first */
 	unsigned int cntxt_id;	/* SGE context id for the eq */
 	unsigned int abs_id;	/* absolute SGE id for the eq */
 	uint8_t type;		/* EQ_CTRL/EQ_ETH/EQ_OFLD */
 	uint8_t doorbells;
 	uint8_t tx_chan;	/* tx channel used by the eq */
 	struct mtx eq_lock;
 
 	struct tx_desc *desc;	/* KVA of descriptor ring */
 	volatile uint32_t *udb;	/* KVA of doorbell (lies within BAR2) */
 	u_int udb_qid;		/* relative qid within the doorbell page */
 	uint16_t sidx;		/* index of the entry with the status page */
 	uint16_t cidx;		/* consumer idx (desc idx) */
 	uint16_t pidx;		/* producer idx (desc idx) */
 	uint16_t equeqidx;	/* EQUEQ last requested at this pidx */
 	uint16_t dbidx;		/* pidx of the most recent doorbell */
 	uint16_t iqid;		/* cached iq->cntxt_id (see iq below) */
 	volatile u_int equiq;	/* EQUIQ outstanding */
 	struct sge_iq *iq;	/* iq that receives egr_update for the eq */
 
 	bus_dma_tag_t desc_tag;
 	bus_dmamap_t desc_map;
 	bus_addr_t ba;		/* bus address of descriptor ring */
 	char lockname[16];
 };
 
 /*
  * Description of one rx buffer zone; struct sge holds SW_ZONE_SIZES of
  * these.  Each zone has two usable hardware buffer sizes: size1 is the full
  * cluster and size2 leaves some spare room in it.
  */
 struct rx_buf_info {
 	uma_zone_t zone;	/* zone that this cluster comes from */
 	uint16_t size1;		/* same as size of cluster: 2K/4K/9K/16K.
 				 * hwsize[hwidx1] = size1.  No spare. */
 	uint16_t size2;		/* hwsize[hwidx2] = size2.
 				 * spare in cluster = size1 - size2. */
 	int8_t hwidx1;		/* SGE bufsize idx for size1 */
 	int8_t hwidx2;		/* SGE bufsize idx for size2 */
 	uint8_t type;		/* EXT_xxx type of the cluster */
 };
 
 /*
  * Memory window constants: the number of windows, and the base and
  * aperture (in bytes) of each.  Window 2 has separate values for T4 and T5.
  */
 enum {
 	NUM_MEMWIN = 3,
 
 	MEMWIN0_APERTURE = 2048,
 	MEMWIN0_BASE     = 0x1b800,
 
 	MEMWIN1_APERTURE = 32768,
 	MEMWIN1_BASE     = 0x28000,
 
 	MEMWIN2_APERTURE_T4 = 65536,
 	MEMWIN2_BASE_T4     = 0x30000,
 
 	MEMWIN2_APERTURE_T5 = 128 * 1024,
 	MEMWIN2_BASE_T5     = 0x60000,
 };
 
 /*
  * State of one memory window (struct adapter has NUM_MEMWIN of them).  The
  * lock is cache-line aligned; only mw_curpos changes after setup.
  */
 struct memwin {
 	struct rwlock mw_lock __aligned(CACHE_LINE_SIZE);
 	uint32_t mw_base;	/* constant after setup_memwin */
 	uint32_t mw_aperture;	/* ditto */
 	uint32_t mw_curpos;	/* protected by mw_lock */
 };
 
 /* Bit flags for sge_fl.flags. */
 enum {
 	FL_STARVING	= (1 << 0), /* on the adapter's list of starving fl's */
 	FL_DOOMED	= (1 << 1), /* about to be destroyed */
 	FL_BUF_PACKING	= (1 << 2), /* buffer packing enabled */
 	FL_BUF_RESUME	= (1 << 3), /* resume from the middle of the frame */
 };
 
 /*
  * Freelist low-water checks.  Both compare the distance (modulo the ring
  * size) between the last doorbell position and the consumer index against
  * fl->lowat: FL_RUNNING_LOW is true at or below lowat, FL_NOT_RUNNING_LOW
  * is true at or above twice lowat.  dbidx and sidx are multiplied by 8
  * before being compared with cidx.
  * NOTE(review): the factor of 8 presumably converts hw descriptor indices
  * into buffer indices (8 buffers per descriptor) -- confirm.
  *
  * `fl' may be any pointer expression; it is evaluated more than once.
  */
 #define FL_RUNNING_LOW(fl) \
     (IDXDIFF((fl)->dbidx * 8, (fl)->cidx, (fl)->sidx * 8) <= (fl)->lowat)
 #define FL_NOT_RUNNING_LOW(fl) \
     (IDXDIFF((fl)->dbidx * 8, (fl)->cidx, (fl)->sidx * 8) >= 2 * (fl)->lowat)
 
 /*
  * Free list: a ring of rx buffer addresses handed to the hardware, plus the
  * parallel software descriptor ring (sdesc).  fl_lock is the mutex taken by
  * FL_LOCK().  flags holds FL_xxx bits.
  */
 struct sge_fl {
 	struct mtx fl_lock;
 	__be64 *desc;		/* KVA of descriptor ring, ptr to addresses */
 	struct fl_sdesc *sdesc;	/* KVA of software descriptor ring */
 	uint16_t zidx;		/* refill zone idx */
 	uint16_t safe_zidx;
 	uint16_t lowat;		/* # of buffers <= this means fl needs help */
 	int flags;
 	uint16_t buf_boundary;
 
 	/* The 16b idx all deal with hw descriptors */
 	uint16_t dbidx;		/* hw pidx after last doorbell */
 	uint16_t sidx;		/* index of status page */
 	volatile uint16_t hw_cidx;
 
 	/* The 32b idx are all buffer idx, not hardware descriptor idx */
 	uint32_t cidx;		/* consumer index */
 	uint32_t pidx;		/* producer index */
 
 	uint32_t dbval;
 	u_int rx_offset;	/* offset in fl buf (when buffer packing) */
 	volatile uint32_t *udb;
 
 	uint64_t cl_allocated;	/* # of clusters allocated */
 	uint64_t cl_recycled;	/* # of clusters recycled */
 	uint64_t cl_fast_recycled; /* # of clusters recycled (fast) */
 
 	/* These 3 are valid when FL_BUF_RESUME is set, stale otherwise. */
 	struct mbuf *m0;
 	struct mbuf **pnext;
 	u_int remaining;
 
 	uint16_t qsize;		/* # of hw descriptors (status page included) */
 	uint16_t cntxt_id;	/* SGE context id for the freelist */
 	TAILQ_ENTRY(sge_fl) link; /* All starving freelists */
 	bus_dma_tag_t desc_tag;
 	bus_dmamap_t desc_map;
 	char lockname[16];
 	bus_addr_t ba;		/* bus address of descriptor ring */
 };
 
 struct mp_ring;
 
 /*
  * State of a coalesced ("txpkts") tx work request that is being built up:
  * running totals for the packets added so far and the mbufs themselves
  * (room for 15).  Embedded in sge_txq as txp.
  */
 struct txpkts {
 	uint8_t wr_type;	/* type 0 or type 1 */
 	uint8_t npkt;		/* # of packets in this work request */
 	uint8_t len16;		/* # of 16B pieces used by this work request */
 	uint8_t score;
 	uint8_t max_npkt;	/* maximum number of packets allowed */
 	uint16_t plen;		/* total payload (sum of all packets) */
 
 	/* straight from fw_eth_tx_pkts_vm_wr. */
 	__u8   ethmacdst[6];
 	__u8   ethmacsrc[6];
 	__be16 ethtype;
 	__be16 vlantci;
 
 	struct mbuf *mb[15];
 };
 
 /*
  * txq: SGE egress queue + what's needed for Ethernet NIC
  *
  * The embedded eq must stay the first member.  The structure is cache-line
  * aligned and ends with scratch space large enough for one maximum-size
  * work request.
  */
 struct sge_txq {
 	struct sge_eq eq;	/* MUST be first */
 
 	struct ifnet *ifp;	/* the interface this txq belongs to */
 	struct mp_ring *r;	/* tx software ring */
 	struct tx_sdesc *sdesc;	/* KVA of software descriptor ring */
 	struct sglist *gl;
 	__be32 cpl_ctrl0;	/* for convenience */
 	int tc_idx;		/* traffic class */
 	uint64_t last_tx;	/* cycle count when eth_tx was last called */
 	struct txpkts txp;
 
 	struct task tx_reclaim_task;
 	/* stats for common events first */
 
 	uint64_t txcsum;	/* # of times hardware assisted with checksum */
 	uint64_t tso_wrs;	/* # of TSO work requests */
 	uint64_t vlan_insertion;/* # of times VLAN tag was inserted */
 	uint64_t imm_wrs;	/* # of work requests with immediate data */
 	uint64_t sgl_wrs;	/* # of work requests with direct SGL */
 	uint64_t txpkt_wrs;	/* # of txpkt work requests (not coalesced) */
 	uint64_t txpkts0_wrs;	/* # of type0 coalesced tx work requests */
 	uint64_t txpkts1_wrs;	/* # of type1 coalesced tx work requests */
 	uint64_t txpkts0_pkts;	/* # of frames in type0 coalesced tx WRs */
 	uint64_t txpkts1_pkts;	/* # of frames in type1 coalesced tx WRs */
 	uint64_t txpkts_flush;	/* # of times txp had to be sent by tx_update */
 	uint64_t raw_wrs;	/* # of raw work requests (alloc_wr_mbuf) */
 	uint64_t vxlan_tso_wrs;	/* # of VXLAN TSO work requests */
 	uint64_t vxlan_txcsum;
 
 	/* KERN_TLS counters. */
 	uint64_t kern_tls_records;
 	uint64_t kern_tls_short;
 	uint64_t kern_tls_partial;
 	uint64_t kern_tls_full;
 	uint64_t kern_tls_octets;
 	uint64_t kern_tls_waste;
 	uint64_t kern_tls_options;
 	uint64_t kern_tls_header;
 	uint64_t kern_tls_fin;
 	uint64_t kern_tls_fin_short;
 	uint64_t kern_tls_cbc;
 	uint64_t kern_tls_gcm;
 
 	/* stats for not-that-common events */
 
 	/* Optional scratch space for constructing work requests. */
 	uint8_t ss[SGE_MAX_WR_LEN] __aligned(16);
 } __aligned(CACHE_LINE_SIZE);
 
 /*
  * rxq: SGE ingress queue + SGE free list + miscellaneous items
  *
  * iq must be the first member and fl must follow it directly.  iq_to_rxq()
  * recovers the rxq from a pointer to its iq.
  */
 struct sge_rxq {
 	struct sge_iq iq;	/* MUST be first */
 	struct sge_fl fl;	/* MUST follow iq */
 
 	struct ifnet *ifp;	/* the interface this rxq belongs to */
 	struct lro_ctrl lro;	/* LRO state */
 
 	/* stats for common events first */
 
 	uint64_t rxcsum;	/* # of times hardware assisted with checksum */
 	uint64_t vlan_extraction;/* # of times VLAN tag was extracted */
 	uint64_t vxlan_rxcsum;
 
 	/* stats for not-that-common events */
 
 } __aligned(CACHE_LINE_SIZE);
 
 /*
  * Map an ingress queue back to the NIC rx queue that embeds it as its `iq'
  * member.  The caller must pass an iq that really is part of a sge_rxq.
  */
 static inline struct sge_rxq *
 iq_to_rxq(struct sge_iq *iq)
 {
 	struct sge_rxq *rxq;

 	rxq = __containerof(iq, struct sge_rxq, iq);

 	return (rxq);
 }
 
 /*
  * ofld_rxq: SGE ingress queue + SGE free list + miscellaneous items
  *
  * Same iq/fl layout requirement as sge_rxq; iq_to_ofld_rxq() recovers the
  * queue from its iq.  The remaining members are iSCSI and TOE TLS receive
  * statistics.
  */
 struct sge_ofld_rxq {
 	struct sge_iq iq;	/* MUST be first */
 	struct sge_fl fl;	/* MUST follow iq */
 	counter_u64_t rx_iscsi_ddp_setup_ok;
 	counter_u64_t rx_iscsi_ddp_setup_error;
 	uint64_t rx_iscsi_ddp_pdus;
 	uint64_t rx_iscsi_ddp_octets;
 	uint64_t rx_iscsi_fl_pdus;
 	uint64_t rx_iscsi_fl_octets;
 	uint64_t rx_iscsi_padding_errors;
 	uint64_t rx_iscsi_header_digest_errors;
 	uint64_t rx_iscsi_data_digest_errors;
 	u_long	rx_toe_tls_records;
 	u_long	rx_toe_tls_octets;
 } __aligned(CACHE_LINE_SIZE);
 
 /*
  * Map an ingress queue back to the offload rx queue that embeds it as its
  * `iq' member.  The caller must pass an iq that really is part of a
  * sge_ofld_rxq.
  */
 static inline struct sge_ofld_rxq *
 iq_to_ofld_rxq(struct sge_iq *iq)
 {
 	struct sge_ofld_rxq *ofld_rxq;

 	ofld_rxq = __containerof(iq, struct sge_ofld_rxq, iq);

 	return (ofld_rxq);
 }
 
 /*
  * A queued work request: list linkage, the wrq it is destined for, its
  * length, and the work request itself in a 16-byte aligned flexible array.
  */
 struct wrqe {
 	STAILQ_ENTRY(wrqe) link;
 	struct sge_wrq *wrq;
 	int wr_len;
 	char wr[] __aligned(16);
 };
 
 /*
  * Entry on a wrq's incomplete_wrs list.
  * NOTE(review): ndesc and pidx presumably record how many descriptors were
  * reserved and at which producer index -- confirm against the wrq code.
  */
 struct wrq_cookie {
 	TAILQ_ENTRY(wrq_cookie) link;
 	int ndesc;
 	int pidx;
 };
 
 /*
  * wrq: SGE egress queue that is given prebuilt work requests.  Control queues
  * are of this type.
  *
  * The embedded eq must stay the first member.  The structure is cache-line
  * aligned.
  */
 struct sge_wrq {
 	struct sge_eq eq;	/* MUST be first */
 
 	struct adapter *adapter;
 	struct task wrq_tx_task;
 
 	/* Tx desc reserved but WR not "committed" yet. */
 	TAILQ_HEAD(wrq_incomplete_wrs , wrq_cookie) incomplete_wrs;
 
 	/* List of WRs ready to go out as soon as descriptors are available. */
 	STAILQ_HEAD(, wrqe) wr_list;
 	u_int nwr_pending;
 	u_int ndesc_needed;
 
 	/* stats for common events first */
 
 	uint64_t tx_wrs_direct;	/* # of WRs written directly to desc ring. */
 	uint64_t tx_wrs_ss;	/* # of WRs copied from scratch space. */
 	uint64_t tx_wrs_copied;	/* # of WRs queued and copied to desc ring. */
 
 	/* stats for not-that-common events */
 
 	/*
 	 * Scratch space for work requests that wrap around after reaching the
 	 * status page, and some information about the last WR that used it.
 	 */
 	uint16_t ss_pidx;
 	uint16_t ss_len;
 	uint8_t ss[SGE_MAX_WR_LEN];
 
 } __aligned(CACHE_LINE_SIZE);
 
 /*
  * ofld_txq: SGE egress queue + miscellaneous items
  *
  * A work request queue followed by iSCSI and TOE TLS transmit counters.
  */
 struct sge_ofld_txq {
 	struct sge_wrq wrq;
 	counter_u64_t tx_iscsi_pdus;
 	counter_u64_t tx_iscsi_octets;
 	counter_u64_t tx_iscsi_iso_wrs;
 	counter_u64_t tx_toe_tls_records;
 	counter_u64_t tx_toe_tls_octets;
 } __aligned(CACHE_LINE_SIZE);
 
 /* All-ones 16-bit value used to mark a netmap rxq context id as invalid. */
 #define INVALID_NM_RXQ_CNTXT_ID ((uint16_t)(-1))
 /*
  * Netmap rx queue.  Members are grouped by who uses them, and each group
  * starts on its own cache line (see the __aligned markers): the driver's
  * rx ithread, netmap rxsync, and then fl_cidx plus everything that is not
  * in the rx fast path.
  */
 struct sge_nm_rxq {
 	/* Items used by the driver rx ithread are in this cacheline. */
 	volatile int nm_state __aligned(CACHE_LINE_SIZE);	/* NM_OFF, NM_ON, or NM_BUSY */
 	u_int nid;		/* netmap ring # for this queue */
 	struct vi_info *vi;
 
 	struct iq_desc *iq_desc;
 	uint16_t iq_abs_id;
 	uint16_t iq_cntxt_id;
 	uint16_t iq_cidx;
 	uint16_t iq_sidx;
 	uint8_t iq_gen;
 	uint32_t fl_sidx;
 
 	/* Items used by netmap rxsync are in this cacheline. */
 	__be64  *fl_desc __aligned(CACHE_LINE_SIZE);
 	uint16_t fl_cntxt_id;
 	uint32_t fl_pidx;
 	uint32_t fl_sidx2;	/* copy of fl_sidx */
 	uint32_t fl_db_val;
 	u_int fl_db_saved;
 	u_int fl_db_threshold;	/* in descriptors */
 	u_int fl_hwidx:4;
 
 	/*
 	 * fl_cidx is used by both the ithread and rxsync, the rest are not used
 	 * in the rx fast path.
 	 */
 	uint32_t fl_cidx __aligned(CACHE_LINE_SIZE);
 
 	bus_dma_tag_t iq_desc_tag;
 	bus_dmamap_t iq_desc_map;
 	bus_addr_t iq_ba;
 	int intr_idx;
 
 	bus_dma_tag_t fl_desc_tag;
 	bus_dmamap_t fl_desc_map;
 	bus_addr_t fl_ba;
 };
 
 /* All-ones u_int value used to mark a netmap txq context id as invalid. */
 #define INVALID_NM_TXQ_CNTXT_ID ((u_int)(-1))
 /*
  * Netmap tx queue: descriptor ring, ring indices, doorbell information, and
  * (after the marker comment) the rarely used DMA bookkeeping.
  */
 struct sge_nm_txq {
 	struct tx_desc *desc;
 	uint16_t cidx;
 	uint16_t pidx;
 	uint16_t sidx;
 	uint16_t equiqidx;	/* EQUIQ last requested at this pidx */
 	uint16_t equeqidx;	/* EQUEQ last requested at this pidx */
 	uint16_t dbidx;		/* pidx of the most recent doorbell */
 	uint8_t doorbells;
 	volatile uint32_t *udb;
 	u_int udb_qid;
 	u_int cntxt_id;
 	__be32 cpl_ctrl0;	/* for convenience */
 	__be32 op_pkd;		/* ditto */
 	u_int nid;		/* netmap ring # for this queue */
 
 	/* infrequently used items after this */
 
 	bus_dma_tag_t desc_tag;
 	bus_dmamap_t desc_map;
 	bus_addr_t ba;
 	int iqidx;
 } __aligned(CACHE_LINE_SIZE);
 
 /*
  * Adapter-wide SGE state: queue totals, the firmware event queue, the
  * per-type queue arrays that each VI's first_xxx/nxxx fields index into,
  * the context-id lookup tables, and the rx buffer zone descriptions.
  */
 struct sge {
 	int nrxq;	/* total # of Ethernet rx queues */
 	int ntxq;	/* total # of Ethernet tx queues */
 	int nofldrxq;	/* total # of TOE rx queues */
 	int nofldtxq;	/* total # of TOE tx queues */
 	int nnmrxq;	/* total # of netmap rx queues */
 	int nnmtxq;	/* total # of netmap tx queues */
 	int niq;	/* total # of ingress queues */
 	int neq;	/* total # of egress queues */
 
 	struct sge_iq fwq;	/* Firmware event queue */
 	struct sge_wrq *ctrlq;	/* Control queues */
 	struct sge_txq *txq;	/* NIC tx queues */
 	struct sge_rxq *rxq;	/* NIC rx queues */
 	struct sge_ofld_txq *ofld_txq;	/* TOE tx queues */
 	struct sge_ofld_rxq *ofld_rxq;	/* TOE rx queues */
 	struct sge_nm_txq *nm_txq;	/* netmap tx queues */
 	struct sge_nm_rxq *nm_rxq;	/* netmap rx queues */
 
 	uint16_t iq_start;	/* first cntxt_id */
 	uint16_t iq_base;	/* first abs_id */
 	int eq_start;		/* first cntxt_id */
 	int eq_base;		/* first abs_id */
 	int iqmap_sz;
 	int eqmap_sz;
 	struct sge_iq **iqmap;	/* iq->cntxt_id to iq mapping */
 	struct sge_eq **eqmap;	/* eq->cntxt_id to eq mapping */
 
 	int8_t safe_zidx;
 	struct rx_buf_info rx_buf_info[SW_ZONE_SIZES];
 };
 
 /*
  * Set of name strings for one device family; struct adapter points at one
  * through its `names' member.
  */
 struct devnames {
 	const char *nexus_name;
 	const char *ifnet_name;
 	const char *vi_ifnet_name;
 	const char *pf03_drv_name;
 	const char *vf_nexus_name;
 	const char *vf_ifnet_name;
 };
 
 struct clip_entry;
 
 /*
  * Per-adapter softc: bus resources, interrupt vectors, SGE state, ports,
  * upper-layer driver hooks, capabilities, locks, and reset bookkeeping.
  *
  * flags holds the "adapter flags" bits and debug_flags the DF_xxx bits.
  * sc_lock is the mutex taken by ADAPTER_LOCK().
  */
 struct adapter {
 	SLIST_ENTRY(adapter) link;
 	device_t dev;
 	struct cdev *cdev;
 	const struct devnames *names;
 
 	/* PCIe register resources */
 	int regs_rid;
 	struct resource *regs_res;
 	int msix_rid;
 	struct resource *msix_res;
 	bus_space_handle_t bh;	/* used with bt by t4_read_reg/t4_write_reg */
 	bus_space_tag_t bt;
 	bus_size_t mmio_len;
 	int udbs_rid;
 	struct resource *udbs_res;
 	volatile uint8_t *udbs_base;
 
 	unsigned int pf;
 	unsigned int mbox;
 	unsigned int vpd_busy;
 	unsigned int vpd_flag;
 
 	/* Interrupt information */
 	int intr_type;		/* INTR_INTX, INTR_MSI, or INTR_MSIX */
 	int intr_count;
 	struct irq {
 		struct resource *res;
 		int rid;
 		void *tag;
 		struct sge_rxq *rxq;
 		struct sge_nm_rxq *nm_rxq;
 	} __aligned(CACHE_LINE_SIZE) *irq;
 	int sge_gts_reg;
 	int sge_kdoorbell_reg;
 
 	bus_dma_tag_t dmat;	/* Parent DMA tag */
 
 	struct sge sge;
 	int lro_timeout;
 	int sc_do_rxcopy;
 
 	int vxlan_port;
 	u_int vxlan_refcount;
 	int rawf_base;
 	int nrawf;
 
 	struct taskqueue *tq[MAX_NCHAN];	/* General purpose taskqueues */
 	struct task async_event_task;
 	struct port_info *port[MAX_NPORTS];
 	uint8_t chan_map[MAX_NCHAN];		/* channel -> port */
 
 	CXGBE_LIST_HEAD(, clip_entry) *clip_table;
 	TAILQ_HEAD(, clip_entry) clip_pending;	/* these need hw update. */
 	u_long clip_mask;
 	int clip_gen;
 	struct timeout_task clip_task;
 
 	void *tom_softc;	/* (struct tom_data *) */
 	struct tom_tunables tt;
 	struct t4_offload_policy *policy;
 	struct rwlock policy_lock;
 
 	void *iwarp_softc;	/* (struct c4iw_dev *) */
 	struct iw_tunables iwt;
 	void *iscsi_ulp_softc;	/* (struct cxgbei_data *) */
 	void *ccr_softc;	/* (struct ccr_softc *) */
 	struct l2t_data *l2t;	/* L2 table */
 	struct smt_data *smt;	/* Source MAC Table */
 	struct tid_info tids;
 	vmem_t *key_map;
 	struct tls_tunables tlst;
 
 	uint8_t doorbells;
 	int offload_map;	/* ports with IFCAP_TOE enabled */
 	int active_ulds;	/* ULDs activated on this adapter */
 	int flags;
 	int debug_flags;
 
 	char ifp_lockname[16];
 	struct mtx ifp_lock;
 	struct ifnet *ifp;	/* tracer ifp */
 	struct ifmedia media;
 	int traceq;		/* iq used by all tracers, -1 if none */
 	int tracer_valid;	/* bitmap of valid tracers */
 	int tracer_enabled;	/* bitmap of enabled tracers */
 
 	char fw_version[16];
 	char tp_version[16];
 	char er_version[16];
 	char bs_version[16];
 	char cfg_file[32];
 	u_int cfcsum;
 	struct adapter_params params;
 	const struct chip_params *chip_params;
 	struct t4_virt_res vres;
 
 	/* Capability words, one per functional area. */
 	uint16_t nbmcaps;
 	uint16_t linkcaps;
 	uint16_t switchcaps;
 	uint16_t niccaps;
 	uint16_t toecaps;
 	uint16_t rdmacaps;
 	uint16_t cryptocaps;
 	uint16_t iscsicaps;
 	uint16_t fcoecaps;
 
 	struct sysctl_ctx_list ctx;
 	struct sysctl_oid *ctrlq_oid;
 	struct sysctl_oid *fwq_oid;
 
 	struct mtx sc_lock;
 	char lockname[16];
 
 	/* Starving free lists */
 	struct mtx sfl_lock;	/* same cache-line as sc_lock? but that's ok */
 	TAILQ_HEAD(, sge_fl) sfl;
 	struct callout sfl_callout;
 
 	/*
 	 * Driver code that can run when the adapter is suspended must use this
 	 * lock or a synchronized_op and check for HW_OFF_LIMITS before
 	 * accessing hardware.
 	 *
 	 * XXX: could be changed to rwlock.  wlock in suspend/resume and for
 	 * indirect register access, rlock everywhere else.
 	 */
 	struct mtx reg_lock;
 
 	struct memwin memwin[NUM_MEMWIN];	/* memory windows */
 
 	struct mtx tc_lock;
 	struct task tc_task;
 
 	struct task reset_task;
 	/* Thread allowed to touch the hardware while HW_OFF_LIMITS is set. */
 	const void *reset_thread;
 	int num_resets;
 	int incarnation;
 
 	/* last_op_thr is compared with curthread by ASSERT_SYNCHRONIZED_OP. */
 	const char *last_op;
 	const void *last_op_thr;
 	int last_op_flags;
 
 	int swintr;
 	int sensor_resets;
 
 	struct callout ktls_tick;
 };
 
 /* Lock, unlock, and ownership assertions for the adapter mutex (sc_lock). */
 #define ADAPTER_LOCK(sc)		mtx_lock(&(sc)->sc_lock)
 #define ADAPTER_UNLOCK(sc)		mtx_unlock(&(sc)->sc_lock)
 #define ADAPTER_LOCK_ASSERT_OWNED(sc)	mtx_assert(&(sc)->sc_lock, MA_OWNED)
 #define ADAPTER_LOCK_ASSERT_NOTOWNED(sc) mtx_assert(&(sc)->sc_lock, MA_NOTOWNED)
 
 /*
  * Assert that the caller is inside a synchronized operation on `sc': the
  * adapter must be marked busy, and the current thread must either own
  * sc_lock or be the thread recorded in last_op_thr.  `sc' may be any
  * pointer expression; it is evaluated more than once.
  */
 #define ASSERT_SYNCHRONIZED_OP(sc)	\
     KASSERT(IS_BUSY(sc) && \
 	(mtx_owned(&(sc)->sc_lock) || (sc)->last_op_thr == curthread), \
 	("%s: operation not synchronized.", __func__))
 
 /* Lock, unlock, and ownership assertions for a port's mutex (pi_lock). */
 #define PORT_LOCK(pi)			mtx_lock(&(pi)->pi_lock)
 #define PORT_UNLOCK(pi)			mtx_unlock(&(pi)->pi_lock)
 #define PORT_LOCK_ASSERT_OWNED(pi)	mtx_assert(&(pi)->pi_lock, MA_OWNED)
 #define PORT_LOCK_ASSERT_NOTOWNED(pi)	mtx_assert(&(pi)->pi_lock, MA_NOTOWNED)
 
 /* Same set of operations, plus trylock, for a free list's mutex (fl_lock). */
 #define FL_LOCK(fl)			mtx_lock(&(fl)->fl_lock)
 #define FL_TRYLOCK(fl)			mtx_trylock(&(fl)->fl_lock)
 #define FL_UNLOCK(fl)			mtx_unlock(&(fl)->fl_lock)
 #define FL_LOCK_ASSERT_OWNED(fl)	mtx_assert(&(fl)->fl_lock, MA_OWNED)
 #define FL_LOCK_ASSERT_NOTOWNED(fl)	mtx_assert(&(fl)->fl_lock, MA_NOTOWNED)
 
 /* Convenience wrappers that operate on the free list embedded in an rxq. */
 #define RXQ_FL_LOCK(rxq)		FL_LOCK(&(rxq)->fl)
 #define RXQ_FL_UNLOCK(rxq)		FL_UNLOCK(&(rxq)->fl)
 #define RXQ_FL_LOCK_ASSERT_OWNED(rxq)	FL_LOCK_ASSERT_OWNED(&(rxq)->fl)
 #define RXQ_FL_LOCK_ASSERT_NOTOWNED(rxq) FL_LOCK_ASSERT_NOTOWNED(&(rxq)->fl)
 
 /* Same set of operations for an egress queue's mutex (eq_lock). */
 #define EQ_LOCK(eq)			mtx_lock(&(eq)->eq_lock)
 #define EQ_TRYLOCK(eq)			mtx_trylock(&(eq)->eq_lock)
 #define EQ_UNLOCK(eq)			mtx_unlock(&(eq)->eq_lock)
 #define EQ_LOCK_ASSERT_OWNED(eq)	mtx_assert(&(eq)->eq_lock, MA_OWNED)
 #define EQ_LOCK_ASSERT_NOTOWNED(eq)	mtx_assert(&(eq)->eq_lock, MA_NOTOWNED)
 
 /* Convenience wrappers that operate on the egress queue embedded in a txq. */
 #define TXQ_LOCK(txq)			EQ_LOCK(&(txq)->eq)
 #define TXQ_TRYLOCK(txq)		EQ_TRYLOCK(&(txq)->eq)
 #define TXQ_UNLOCK(txq)			EQ_UNLOCK(&(txq)->eq)
 #define TXQ_LOCK_ASSERT_OWNED(txq)	EQ_LOCK_ASSERT_OWNED(&(txq)->eq)
 #define TXQ_LOCK_ASSERT_NOTOWNED(txq)	EQ_LOCK_ASSERT_NOTOWNED(&(txq)->eq)
 
 /*
  * Iterators.  Each for_each_<kind>q(vi, iter, q) expands to a for-loop
  * header that walks the VI's slice of the adapter-wide queue array of that
  * kind: q starts at index vi->first_<kind>q and the loop runs
  * vi->n<kind>q times, with iter counting from 0.  for_each_vi() walks the
  * nvi VIs of a port in the same way.
  *
  * All arguments are parenthesized in the expansion, so they may be
  * arbitrary expressions; they are evaluated more than once, so they must
  * be free of side effects.  iter and q must be assignable lvalues.
  */
 #define for_each_txq(vi, iter, q) \
 	for ((q) = &(vi)->adapter->sge.txq[(vi)->first_txq], (iter) = 0; \
 	    (iter) < (vi)->ntxq; ++(iter), ++(q))
 #define for_each_rxq(vi, iter, q) \
 	for ((q) = &(vi)->adapter->sge.rxq[(vi)->first_rxq], (iter) = 0; \
 	    (iter) < (vi)->nrxq; ++(iter), ++(q))
 #define for_each_ofld_txq(vi, iter, q) \
 	for ((q) = &(vi)->adapter->sge.ofld_txq[(vi)->first_ofld_txq], \
 	    (iter) = 0; (iter) < (vi)->nofldtxq; ++(iter), ++(q))
 #define for_each_ofld_rxq(vi, iter, q) \
 	for ((q) = &(vi)->adapter->sge.ofld_rxq[(vi)->first_ofld_rxq], \
 	    (iter) = 0; (iter) < (vi)->nofldrxq; ++(iter), ++(q))
 #define for_each_nm_txq(vi, iter, q) \
 	for ((q) = &(vi)->adapter->sge.nm_txq[(vi)->first_nm_txq], \
 	    (iter) = 0; (iter) < (vi)->nnmtxq; ++(iter), ++(q))
 #define for_each_nm_rxq(vi, iter, q) \
 	for ((q) = &(vi)->adapter->sge.nm_rxq[(vi)->first_nm_rxq], \
 	    (iter) = 0; (iter) < (vi)->nnmrxq; ++(iter), ++(q))
 #define for_each_vi(_pi, _iter, _vi) \
 	for ((_vi) = (_pi)->vi, (_iter) = 0; (_iter) < (_pi)->nvi; \
 	     ++(_iter), ++(_vi))
 
 /*
  * Ring index arithmetic.
  *
  * IDXINCR(idx, incr, wrap) advances the lvalue idx by incr on a ring of
  * wrap entries: if fewer than incr entries remain before wrap, the index
  * wraps around to incr - (wrap - idx).
  *
  * IDXDIFF(head, tail, wrap) yields the forward distance from tail to head
  * on a ring of wrap entries.
  *
  * Arguments are parenthesized in the expansion so that expressions such as
  * `1 << n' are safe to pass, but they are evaluated more than once and so
  * must be free of side effects.
  */
 #define IDXINCR(idx, incr, wrap) do { \
 	(idx) = (wrap) - (idx) > (incr) ? (idx) + (incr) : \
 	    (incr) - ((wrap) - (idx)); \
 } while (0)
 #define IDXDIFF(head, tail, wrap) \
 	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
 
 /*
  * Number of interrupts needed in addition to the per-queue ones.
  * One for errors, one for firmware events
  */
 #define T4_EXTRA_INTR 2
 
 /* Same for a VF, which only needs the one for firmware events */
 #define T4VF_EXTRA_INTR 1
 
 static inline int
 forwarding_intr_to_fwq(struct adapter *sc)
 {
 
 	return (sc->intr_count == 1);
 }
 
 /* Works reliably inside a sync_op or with reg_lock held. */
 static inline bool
 hw_off_limits(struct adapter *sc)
 {
 	return (__predict_false(sc->flags & HW_OFF_LIMITS));
 }
 
 static inline uint32_t
 t4_read_reg(struct adapter *sc, uint32_t reg)
 {
 	if (hw_off_limits(sc))
 		MPASS(curthread == sc->reset_thread);
 	return bus_space_read_4(sc->bt, sc->bh, reg);
 }
 
 static inline void
 t4_write_reg(struct adapter *sc, uint32_t reg, uint32_t val)
 {
 	if (hw_off_limits(sc))
 		MPASS(curthread == sc->reset_thread);
 	bus_space_write_4(sc->bt, sc->bh, reg, val);
 }
 
 static inline uint64_t
 t4_read_reg64(struct adapter *sc, uint32_t reg)
 {
 	if (hw_off_limits(sc))
 		MPASS(curthread == sc->reset_thread);
 #ifdef __LP64__
 	return bus_space_read_8(sc->bt, sc->bh, reg);
 #else
 	return (uint64_t)bus_space_read_4(sc->bt, sc->bh, reg) +
 	    ((uint64_t)bus_space_read_4(sc->bt, sc->bh, reg + 4) << 32);
 
 #endif
 }
 
 static inline void
 t4_write_reg64(struct adapter *sc, uint32_t reg, uint64_t val)
 {
 	if (hw_off_limits(sc))
 		MPASS(curthread == sc->reset_thread);
 #ifdef __LP64__
 	bus_space_write_8(sc->bt, sc->bh, reg, val);
 #else
 	bus_space_write_4(sc->bt, sc->bh, reg, val);
 	bus_space_write_4(sc->bt, sc->bh, reg + 4, val>> 32);
 #endif
 }
 
 static inline void
 t4_os_pci_read_cfg1(struct adapter *sc, int reg, uint8_t *val)
 {
 	if (hw_off_limits(sc))
 		MPASS(curthread == sc->reset_thread);
 	*val = pci_read_config(sc->dev, reg, 1);
 }
 
 static inline void
 t4_os_pci_write_cfg1(struct adapter *sc, int reg, uint8_t val)
 {
 	if (hw_off_limits(sc))
 		MPASS(curthread == sc->reset_thread);
 	pci_write_config(sc->dev, reg, val, 1);
 }
 
 static inline void
 t4_os_pci_read_cfg2(struct adapter *sc, int reg, uint16_t *val)
 {
 
 	if (hw_off_limits(sc))
 		MPASS(curthread == sc->reset_thread);
 	*val = pci_read_config(sc->dev, reg, 2);
 }
 
 static inline void
 t4_os_pci_write_cfg2(struct adapter *sc, int reg, uint16_t val)
 {
 	if (hw_off_limits(sc))
 		MPASS(curthread == sc->reset_thread);
 	pci_write_config(sc->dev, reg, val, 2);
 }
 
 static inline void
 t4_os_pci_read_cfg4(struct adapter *sc, int reg, uint32_t *val)
 {
 	if (hw_off_limits(sc))
 		MPASS(curthread == sc->reset_thread);
 	*val = pci_read_config(sc->dev, reg, 4);
 }
 
 static inline void
 t4_os_pci_write_cfg4(struct adapter *sc, int reg, uint32_t val)
 {
 	if (hw_off_limits(sc))
 		MPASS(curthread == sc->reset_thread);
 	pci_write_config(sc->dev, reg, val, 4);
 }
 
 static inline struct port_info *
 adap2pinfo(struct adapter *sc, int idx)
 {
 
 	return (sc->port[idx]);
 }
 
 static inline void
 t4_os_set_hw_addr(struct port_info *pi, uint8_t hw_addr[])
 {
 
 	bcopy(hw_addr, pi->vi[0].hw_addr, ETHER_ADDR_LEN);
 }
 
 static inline int
 tx_resume_threshold(struct sge_eq *eq)
 {
 
 	/* not quite the same as qsize / 4, but this will do. */
 	return (eq->sidx / 4);
 }
 
 static inline int
 t4_use_ldst(struct adapter *sc)
 {
 
 #ifdef notyet
 	return (sc->flags & FW_OK || !sc->use_bd);
 #else
 	return (0);
 #endif
 }
 
 static inline void
 CH_DUMP_MBOX(struct adapter *sc, int mbox, const int reg,
     const char *msg, const __be64 *const p, const bool err)
 {
 
 	if (!(sc->debug_flags & DF_DUMP_MBOX) && !err)
 		return;
 	if (p != NULL) {
 		log(err ? LOG_ERR : LOG_DEBUG,
 		    "%s: mbox %u %s %016llx %016llx %016llx %016llx "
 		    "%016llx %016llx %016llx %016llx\n",
 		    device_get_nameunit(sc->dev), mbox, msg,
 		    (long long)be64_to_cpu(p[0]), (long long)be64_to_cpu(p[1]),
 		    (long long)be64_to_cpu(p[2]), (long long)be64_to_cpu(p[3]),
 		    (long long)be64_to_cpu(p[4]), (long long)be64_to_cpu(p[5]),
 		    (long long)be64_to_cpu(p[6]), (long long)be64_to_cpu(p[7]));
 	} else {
 		log(err ? LOG_ERR : LOG_DEBUG,
 		    "%s: mbox %u %s %016llx %016llx %016llx %016llx "
 		    "%016llx %016llx %016llx %016llx\n",
 		    device_get_nameunit(sc->dev), mbox, msg,
 		    (long long)t4_read_reg64(sc, reg),
 		    (long long)t4_read_reg64(sc, reg + 8),
 		    (long long)t4_read_reg64(sc, reg + 16),
 		    (long long)t4_read_reg64(sc, reg + 24),
 		    (long long)t4_read_reg64(sc, reg + 32),
 		    (long long)t4_read_reg64(sc, reg + 40),
 		    (long long)t4_read_reg64(sc, reg + 48),
 		    (long long)t4_read_reg64(sc, reg + 56));
 	}
 }
 
 /* t4_main.c */
 extern int t4_ntxq;
 extern int t4_nrxq;
 extern int t4_intr_types;
 extern int t4_tmr_idx;
 extern int t4_pktc_idx;
 extern unsigned int t4_qsize_rxq;
 extern unsigned int t4_qsize_txq;
 extern device_method_t cxgbe_methods[];
 
 int t4_os_find_pci_capability(struct adapter *, int);
 int t4_os_pci_save_state(struct adapter *);
 int t4_os_pci_restore_state(struct adapter *);
 void t4_os_portmod_changed(struct port_info *);
 void t4_os_link_changed(struct port_info *);
 void t4_iterate(void (*)(struct adapter *, void *), void *);
 void t4_init_devnames(struct adapter *);
 void t4_add_adapter(struct adapter *);
 int t4_detach_common(device_t);
 int t4_map_bars_0_and_4(struct adapter *);
 int t4_map_bar_2(struct adapter *);
 int t4_setup_intr_handlers(struct adapter *);
 void t4_sysctls(struct adapter *);
 int begin_synchronized_op(struct adapter *, struct vi_info *, int, char *);
 void doom_vi(struct adapter *, struct vi_info *);
 void end_synchronized_op(struct adapter *, int);
 int update_mac_settings(struct ifnet *, int);
 int adapter_init(struct adapter *);
 int vi_init(struct vi_info *);
 void vi_sysctls(struct vi_info *);
 int rw_via_memwin(struct adapter *, int, uint32_t, uint32_t *, int, int);
 int alloc_atid(struct adapter *, void *);
 void *lookup_atid(struct adapter *, int);
 void free_atid(struct adapter *, int);
 void release_tid(struct adapter *, int, struct sge_wrq *);
 int cxgbe_media_change(struct ifnet *);
 void cxgbe_media_status(struct ifnet *, struct ifmediareq *);
 bool t4_os_dump_cimla(struct adapter *, int, bool);
 void t4_os_dump_devlog(struct adapter *);
 
 #ifdef KERN_TLS
 /* t4_kern_tls.c */
 int cxgbe_tls_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *,
     struct m_snd_tag **);
-void cxgbe_tls_tag_free(struct m_snd_tag *);
 void t6_ktls_modload(void);
 void t6_ktls_modunload(void);
 int t6_ktls_try(struct ifnet *, struct socket *, struct ktls_session *);
 int t6_ktls_parse_pkt(struct mbuf *, int *, int *);
 int t6_ktls_write_wr(struct sge_txq *, void *, struct mbuf *, u_int, u_int);
 #endif
 
 /* t4_keyctx.c */
 struct auth_hash;
 union authctx;
 #ifdef KERN_TLS
 struct ktls_session;
 struct tls_key_req;
 struct tls_keyctx;
 #endif
 
 void t4_aes_getdeckey(void *, const void *, unsigned int);
 void t4_copy_partial_hash(int, union authctx *, void *);
 void t4_init_gmac_hash(const char *, int, char *);
 void t4_init_hmac_digest(const struct auth_hash *, u_int, const char *, int,
     char *);
 #ifdef KERN_TLS
 u_int t4_tls_key_info_size(const struct ktls_session *);
 int t4_tls_proto_ver(const struct ktls_session *);
 int t4_tls_cipher_mode(const struct ktls_session *);
 int t4_tls_auth_mode(const struct ktls_session *);
 int t4_tls_hmac_ctrl(const struct ktls_session *);
 void t4_tls_key_ctx(const struct ktls_session *, int, struct tls_keyctx *);
 int t4_alloc_tls_keyid(struct adapter *);
 void t4_free_tls_keyid(struct adapter *, int);
 void t4_write_tlskey_wr(const struct ktls_session *, int, int, int, int,
     struct tls_key_req *);
 #endif
 
 #ifdef DEV_NETMAP
 /* t4_netmap.c */
 struct sge_nm_rxq;
 void cxgbe_nm_attach(struct vi_info *);
 void cxgbe_nm_detach(struct vi_info *);
 void service_nm_rxq(struct sge_nm_rxq *);
 int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int);
 int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *);
 int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int);
 int free_nm_txq(struct vi_info *, struct sge_nm_txq *);
 #endif
 
 /* t4_sge.c */
 void t4_sge_modload(void);
 void t4_sge_modunload(void);
 uint64_t t4_sge_extfree_refs(void);
 void t4_tweak_chip_settings(struct adapter *);
 int t4_verify_chip_settings(struct adapter *);
 void t4_init_rx_buf_info(struct adapter *);
 int t4_create_dma_tag(struct adapter *);
 void t4_sge_sysctls(struct adapter *, struct sysctl_ctx_list *,
     struct sysctl_oid_list *);
 int t4_destroy_dma_tag(struct adapter *);
 int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
     bus_addr_t *, void **);
 int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
     void *);
 void free_fl_buffers(struct adapter *, struct sge_fl *);
 int t4_setup_adapter_queues(struct adapter *);
 int t4_teardown_adapter_queues(struct adapter *);
 int t4_setup_vi_queues(struct vi_info *);
 int t4_teardown_vi_queues(struct vi_info *);
 void t4_intr_all(void *);
 void t4_intr(void *);
 #ifdef DEV_NETMAP
 void t4_nm_intr(void *);
 void t4_vi_intr(void *);
 #endif
 void t4_intr_err(void *);
 void t4_intr_evt(void *);
 void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *);
 void t4_update_fl_bufsize(struct ifnet *);
 struct mbuf *alloc_wr_mbuf(int, int);
 int parse_pkt(struct mbuf **, bool);
 void *start_wrq_wr(struct sge_wrq *, int, struct wrq_cookie *);
 void commit_wrq_wr(struct sge_wrq *, void *, struct wrq_cookie *);
 int tnl_cong(struct port_info *, int);
 void t4_register_an_handler(an_handler_t);
 void t4_register_fw_msg_handler(int, fw_msg_handler_t);
 void t4_register_cpl_handler(int, cpl_handler_t);
 void t4_register_shared_cpl_handler(int, cpl_handler_t, int);
 #ifdef RATELIMIT
 int ethofld_transmit(struct ifnet *, struct mbuf *);
 void send_etid_flush_wr(struct cxgbe_rate_tag *);
 #endif
 
 /* t4_tracer.c */
 struct t4_tracer;
 void t4_tracer_modload(void);
 void t4_tracer_modunload(void);
 void t4_tracer_port_detach(struct adapter *);
 int t4_get_tracer(struct adapter *, struct t4_tracer *);
 int t4_set_tracer(struct adapter *, struct t4_tracer *);
 int t4_trace_pkt(struct sge_iq *, const struct rss_header *, struct mbuf *);
 int t5_trace_pkt(struct sge_iq *, const struct rss_header *, struct mbuf *);
 
 /* t4_sched.c */
 int t4_set_sched_class(struct adapter *, struct t4_sched_params *);
 int t4_set_sched_queue(struct adapter *, struct t4_sched_queue *);
 int t4_init_tx_sched(struct adapter *);
 int t4_free_tx_sched(struct adapter *);
 void t4_update_tx_sched(struct adapter *);
 int t4_reserve_cl_rl_kbps(struct adapter *, int, u_int, int *);
 void t4_release_cl_rl(struct adapter *, int, int);
 int sysctl_tc(SYSCTL_HANDLER_ARGS);
 int sysctl_tc_params(SYSCTL_HANDLER_ARGS);
 #ifdef RATELIMIT
 void t4_init_etid_table(struct adapter *);
 void t4_free_etid_table(struct adapter *);
 struct cxgbe_rate_tag *lookup_etid(struct adapter *, int);
 int cxgbe_rate_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *,
     struct m_snd_tag **);
-int cxgbe_rate_tag_modify(struct m_snd_tag *, union if_snd_tag_modify_params *);
-int cxgbe_rate_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *);
-void cxgbe_rate_tag_free(struct m_snd_tag *);
 void cxgbe_rate_tag_free_locked(struct cxgbe_rate_tag *);
 void cxgbe_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *);
 #endif
 
 /* t4_filter.c */
 int get_filter_mode(struct adapter *, uint32_t *);
 int set_filter_mode(struct adapter *, uint32_t);
 int set_filter_mask(struct adapter *, uint32_t);
 int get_filter(struct adapter *, struct t4_filter *);
 int set_filter(struct adapter *, struct t4_filter *);
 int del_filter(struct adapter *, struct t4_filter *);
 int t4_filter_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *);
 int t4_hashfilter_ao_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *);
 int t4_hashfilter_tcb_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *);
 int t4_del_hashfilter_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *);
 void free_hftid_hash(struct tid_info *);
 
 static inline struct wrqe *
 alloc_wrqe(int wr_len, struct sge_wrq *wrq)
 {
 	int len = offsetof(struct wrqe, wr) + wr_len;
 	struct wrqe *wr;
 
 	wr = malloc(len, M_CXGBE, M_NOWAIT);
 	if (__predict_false(wr == NULL))
 		return (NULL);
 	wr->wr_len = wr_len;
 	wr->wrq = wrq;
 	return (wr);
 }
 
 static inline void *
 wrtod(struct wrqe *wr)
 {
 	return (&wr->wr[0]);
 }
 
 static inline void
 free_wrqe(struct wrqe *wr)
 {
 	free(wr, M_CXGBE);
 }
 
 static inline void
 t4_wrq_tx(struct adapter *sc, struct wrqe *wr)
 {
 	struct sge_wrq *wrq = wr->wrq;
 
 	TXQ_LOCK(wrq);
 	t4_wrq_tx_locked(sc, wrq, wr);
 	TXQ_UNLOCK(wrq);
 }
 
 static inline int
 read_via_memwin(struct adapter *sc, int idx, uint32_t addr, uint32_t *val,
     int len)
 {
 
 	return (rw_via_memwin(sc, idx, addr, val, len, 0));
 }
 
 static inline int
 write_via_memwin(struct adapter *sc, int idx, uint32_t addr,
     const uint32_t *val, int len)
 {
 
 	return (rw_via_memwin(sc, idx, addr, (void *)(uintptr_t)val, len, 1));
 }
 
 /* Number of len16 -> number of descriptors */
 static inline int
 tx_len16_to_desc(int len16)
 {
 
 	return (howmany(len16, EQ_ESIZE / 16));
 }
 #endif
diff --git a/sys/dev/cxgbe/crypto/t4_kern_tls.c b/sys/dev/cxgbe/crypto/t4_kern_tls.c
index a20c3045b5b3..f8d5e54cc3b5 100644
--- a/sys/dev/cxgbe/crypto/t4_kern_tls.c
+++ b/sys/dev/cxgbe/crypto/t4_kern_tls.c
@@ -1,2151 +1,2157 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2018-2019 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/ktr.h>
 #include <sys/ktls.h>
 #include <sys/sglist.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockbuf.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp_var.h>
 #include <opencrypto/cryptodev.h>
 #include <opencrypto/xform.h>
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_tcb.h"
 #include "t4_l2t.h"
 #include "t4_clip.h"
 #include "t4_mp_ring.h"
 #include "crypto/t4_crypto.h"
 
 #if defined(INET) || defined(INET6)
 
 #define TLS_HEADER_LENGTH		5
 
 struct tls_scmd {
 	__be32 seqno_numivs;
 	__be32 ivgen_hdrlen;
 };
 
 struct tlspcb {
 	struct m_snd_tag com;
 	struct vi_info *vi;	/* virtual interface */
 	struct adapter *sc;
 	struct l2t_entry *l2te;	/* L2 table entry used by this connection */
 	int tid;		/* Connection identifier */
 
 	int tx_key_addr;
 	bool inline_key;
 	bool using_timestamps;
 	unsigned char enc_mode;
 
 	struct tls_scmd scmd0;
 	struct tls_scmd scmd0_short;
 
 	unsigned int tx_key_info_size;
 
 	uint32_t prev_seq;
 	uint32_t prev_ack;
 	uint32_t prev_tsecr;
 	uint16_t prev_win;
 	uint16_t prev_mss;
 
 	/* Only used outside of setup and teardown when using inline keys. */
 	struct tls_keyctx keyctx;
 
 	/* Fields only used during setup and teardown. */
 	struct inpcb *inp;	/* backpointer to host stack's PCB */
 	struct sge_txq *txq;
 	struct sge_wrq *ctrlq;
 	struct clip_entry *ce;	/* CLIP table entry used by this tid */
 
 	bool open_pending;
 };
 
+static void cxgbe_tls_tag_free(struct m_snd_tag *mst);
 static int ktls_setup_keys(struct tlspcb *tlsp,
     const struct ktls_session *tls, struct sge_txq *txq);
 
+static const struct if_snd_tag_sw cxgbe_tls_tag_sw = { /* see alloc_tlspcb() */
+	.snd_tag_free = cxgbe_tls_tag_free,	/* static, declared above */
+	.type = IF_SND_TAG_TYPE_TLS		/* tags built here are TLS tags */
+};
+
 static inline struct tlspcb *
 mst_to_tls(struct m_snd_tag *t)
 {
 	return (__containerof(t, struct tlspcb, com));
 }
 
 static struct tlspcb *
 alloc_tlspcb(struct ifnet *ifp, struct vi_info *vi, int flags)
 {
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct tlspcb *tlsp;
 
 	tlsp = malloc(sizeof(*tlsp), M_CXGBE, M_ZERO | flags);
 	if (tlsp == NULL)
 		return (NULL);
 
-	m_snd_tag_init(&tlsp->com, ifp, IF_SND_TAG_TYPE_TLS);
+	m_snd_tag_init(&tlsp->com, ifp, &cxgbe_tls_tag_sw);
 	tlsp->vi = vi;
 	tlsp->sc = sc;
 	tlsp->ctrlq = &sc->sge.ctrlq[pi->port_id];
 	tlsp->tid = -1;
 	tlsp->tx_key_addr = -1;
 
 	return (tlsp);
 }
 
 static int
 ktls_act_open_cpl_size(bool isipv6)
 {
 
 	if (isipv6)
 		return (sizeof(struct cpl_t6_act_open_req6));
 	else
 		return (sizeof(struct cpl_t6_act_open_req));
 }
 
 static void
 mk_ktls_act_open_req(struct adapter *sc, struct vi_info *vi, struct inpcb *inp,
     struct tlspcb *tlsp, int atid, void *dst)
 {
 	struct tcpcb *tp = intotcpcb(inp);
 	struct cpl_t6_act_open_req *cpl6;
 	struct cpl_act_open_req *cpl;
 	uint64_t options;
 	int qid_atid;
 
 	cpl6 = dst;
 	cpl = (struct cpl_act_open_req *)cpl6;
 	INIT_TP_WR(cpl6, 0);
 	qid_atid = V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) |
 	    V_TID_COOKIE(CPL_COOKIE_KERN_TLS);
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ,
 		qid_atid));
 	inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port,
 	    &cpl->peer_ip, &cpl->peer_port);
 
 	options = F_TCAM_BYPASS | V_ULP_MODE(ULP_MODE_NONE);
 	options |= V_SMAC_SEL(vi->smt_idx) | V_TX_CHAN(vi->pi->tx_chan);
 	options |= F_NON_OFFLOAD;
 	cpl->opt0 = htobe64(options);
 
 	options = V_TX_QUEUE(sc->params.tp.tx_modq[vi->pi->tx_chan]);
 	if (tp->t_flags & TF_REQ_TSTMP)
 		options |= F_TSTAMPS_EN;
 	cpl->opt2 = htobe32(options);
 }
 
 static void
 mk_ktls_act_open_req6(struct adapter *sc, struct vi_info *vi,
     struct inpcb *inp, struct tlspcb *tlsp, int atid, void *dst)
 {
 	struct tcpcb *tp = intotcpcb(inp);
 	struct cpl_t6_act_open_req6 *cpl6;
 	struct cpl_act_open_req6 *cpl;
 	uint64_t options;
 	int qid_atid;
 
 	cpl6 = dst;
 	cpl = (struct cpl_act_open_req6 *)cpl6;
 	INIT_TP_WR(cpl6, 0);
 	qid_atid = V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) |
 	    V_TID_COOKIE(CPL_COOKIE_KERN_TLS);
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6,
 		qid_atid));
 	cpl->local_port = inp->inp_lport;
 	cpl->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
 	cpl->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
 	cpl->peer_port = inp->inp_fport;
 	cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0];
 	cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8];
 
 	options = F_TCAM_BYPASS | V_ULP_MODE(ULP_MODE_NONE);
 	options |= V_SMAC_SEL(vi->smt_idx) | V_TX_CHAN(vi->pi->tx_chan);
 	options |= F_NON_OFFLOAD;
 	cpl->opt0 = htobe64(options);
 
 	options = V_TX_QUEUE(sc->params.tp.tx_modq[vi->pi->tx_chan]);
 	if (tp->t_flags & TF_REQ_TSTMP)
 		options |= F_TSTAMPS_EN;
 	cpl->opt2 = htobe32(options);
 }
 
 static int
 send_ktls_act_open_req(struct adapter *sc, struct vi_info *vi,
     struct inpcb *inp, struct tlspcb *tlsp, int atid)
 {
 	struct wrqe *wr;
 	bool isipv6;
 
 	isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 	if (isipv6) {
 		tlsp->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true);
 		if (tlsp->ce == NULL)
 			return (ENOENT);
 	}
 
 	wr = alloc_wrqe(ktls_act_open_cpl_size(isipv6), tlsp->ctrlq);
 	if (wr == NULL) {
 		CTR2(KTR_CXGBE, "%s: atid %d failed to alloc WR", __func__,
 		    atid);
 		return (ENOMEM);
 	}
 
 	if (isipv6)
 		mk_ktls_act_open_req6(sc, vi, inp, tlsp, atid, wrtod(wr));
 	else
 		mk_ktls_act_open_req(sc, vi, inp, tlsp, atid, wrtod(wr));
 
 	tlsp->open_pending = true;
 	t4_wrq_tx(sc, wr);
 	return (0);
 }
 
 static int
 ktls_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1);
 	u_int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status)));
 	u_int status = G_AOPEN_STATUS(be32toh(cpl->atid_status));
 	struct tlspcb *tlsp = lookup_atid(sc, atid);
 	struct inpcb *inp = tlsp->inp;
 
 	CTR3(KTR_CXGBE, "%s: atid %d status %d", __func__, atid, status);
 	free_atid(sc, atid);
 	if (status == 0)
 		tlsp->tid = GET_TID(cpl);
 
 	INP_WLOCK(inp);
 	tlsp->open_pending = false;
 	wakeup(tlsp);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 /* SET_TCB_FIELD sent as a ULP command looks like this */
 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
 
 _Static_assert((LEN__SET_TCB_FIELD_ULP + sizeof(struct ulptx_idata)) % 16 == 0,
     "CPL_SET_TCB_FIELD ULP command not 16-byte aligned");
 
 static void
 write_set_tcb_field_ulp(struct tlspcb *tlsp, void *dst, struct sge_txq *txq,
     uint16_t word, uint64_t mask, uint64_t val)
 {
 	struct ulp_txpkt *txpkt;
 	struct ulptx_idata *idata;
 	struct cpl_set_tcb_field_core *cpl;
 
 	/* ULP_TXPKT */
 	txpkt = dst;
 	txpkt->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
 	    V_ULP_TXPKT_DATAMODIFY(0) |
 	    V_ULP_TXPKT_CHANNELID(tlsp->vi->pi->port_id) | V_ULP_TXPKT_DEST(0) |
 	    V_ULP_TXPKT_FID(txq->eq.cntxt_id) | V_ULP_TXPKT_RO(1));
 	txpkt->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
 
 	/* ULPTX_IDATA sub-command */
 	idata = (struct ulptx_idata *)(txpkt + 1);
 	idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	idata->len = htobe32(sizeof(*cpl));
 
 	/* CPL_SET_TCB_FIELD */
 	cpl = (struct cpl_set_tcb_field_core *)(idata + 1);
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tlsp->tid));
 	cpl->reply_ctrl = htobe16(F_NO_REPLY);
 	cpl->word_cookie = htobe16(V_WORD(word));
 	cpl->mask = htobe64(mask);
 	cpl->val = htobe64(val);
 
 	/* ULPTX_NOOP */
 	idata = (struct ulptx_idata *)(cpl + 1);
 	idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 	idata->len = htobe32(0);
 }
 
 static int
 ktls_set_tcb_fields(struct tlspcb *tlsp, struct tcpcb *tp, struct sge_txq *txq)
 {
 	struct fw_ulptx_wr *wr;
 	struct mbuf *m;
 	char *dst;
 	void *items[1];
 	int error, len;
 
 	len = sizeof(*wr) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 	if (tp->t_flags & TF_REQ_TSTMP)
 		len += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 	m = alloc_wr_mbuf(len, M_NOWAIT);
 	if (m == NULL) {
 		CTR2(KTR_CXGBE, "%s: tid %d failed to alloc WR mbuf", __func__,
 		    tlsp->tid);
 		return (ENOMEM);
 	}
 	m->m_pkthdr.snd_tag = m_snd_tag_ref(&tlsp->com);
 	m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 
 	/* FW_ULPTX_WR */
 	wr = mtod(m, void *);
 	wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR));
 	wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA |
 	    V_FW_WR_LEN16(len / 16));
 	wr->cookie = 0;
 	dst = (char *)(wr + 1);
 
         /* Clear TF_NON_OFFLOAD and set TF_CORE_BYPASS */
 	write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_T_FLAGS,
 	    V_TCB_T_FLAGS(V_TF_CORE_BYPASS(1) | V_TF_NON_OFFLOAD(1)),
 	    V_TCB_T_FLAGS(V_TF_CORE_BYPASS(1)));
 	dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	/* Clear the SND_UNA_RAW, SND_NXT_RAW, and SND_MAX_RAW offsets. */
 	write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_SND_UNA_RAW,
 	    V_TCB_SND_NXT_RAW(M_TCB_SND_NXT_RAW) |
 	    V_TCB_SND_UNA_RAW(M_TCB_SND_UNA_RAW),
 	    V_TCB_SND_NXT_RAW(0) | V_TCB_SND_UNA_RAW(0));
 	dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_SND_MAX_RAW,
 	    V_TCB_SND_MAX_RAW(M_TCB_SND_MAX_RAW), V_TCB_SND_MAX_RAW(0));
 	dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	if (tp->t_flags & TF_REQ_TSTMP) {
 		write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_TIMESTAMP_OFFSET,
 		    V_TCB_TIMESTAMP_OFFSET(M_TCB_TIMESTAMP_OFFSET),
 		    V_TCB_TIMESTAMP_OFFSET(tp->ts_offset >> 28));
 		dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 	}
 
 	KASSERT(dst - (char *)wr == len, ("%s: length mismatch", __func__));
 
 	items[0] = m;
 	error = mp_ring_enqueue(txq->r, items, 1, 1);
 	if (error)
 		m_free(m);
 	return (error);
 }
 
 int
 cxgbe_tls_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
     struct m_snd_tag **pt)
 {
 	const struct ktls_session *tls;
 	struct tlspcb *tlsp;
 	struct adapter *sc;
 	struct vi_info *vi;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct sge_txq *txq;
 	int atid, error, explicit_iv_size, keyid, mac_first;
 
 	tls = params->tls.tls;
 
 	/* Only TLS 1.1 and TLS 1.2 are currently supported. */
 	if (tls->params.tls_vmajor != TLS_MAJOR_VER_ONE ||
 	    tls->params.tls_vminor < TLS_MINOR_VER_ONE ||
 	    tls->params.tls_vminor > TLS_MINOR_VER_TWO)
 		return (EPROTONOSUPPORT);
 
 	/* Sanity check values in *tls. */
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		/* XXX: Explicitly ignore any provided IV. */
 		switch (tls->params.cipher_key_len) {
 		case 128 / 8:
 		case 192 / 8:
 		case 256 / 8:
 			break;
 		default:
 			return (EINVAL);
 		}
 		switch (tls->params.auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 		case CRYPTO_SHA2_256_HMAC:
 		case CRYPTO_SHA2_384_HMAC:
 			break;
 		default:
 			return (EPROTONOSUPPORT);
 		}
 		explicit_iv_size = AES_BLOCK_LEN;
 		mac_first = 1;
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		if (tls->params.iv_len != SALT_SIZE)
 			return (EINVAL);
 		switch (tls->params.cipher_key_len) {
 		case 128 / 8:
 		case 192 / 8:
 		case 256 / 8:
 			break;
 		default:
 			return (EINVAL);
 		}
 		explicit_iv_size = 8;
 		mac_first = 0;
 		break;
 	default:
 		return (EPROTONOSUPPORT);
 	}
 
 	vi = ifp->if_softc;
 	sc = vi->adapter;
 
 	tlsp = alloc_tlspcb(ifp, vi, M_WAITOK);
 
 	atid = alloc_atid(sc, tlsp);
 	if (atid < 0) {
 		error = ENOMEM;
 		goto failed;
 	}
 
 	if (sc->tlst.inline_keys)
 		keyid = -1;
 	else
 		keyid = t4_alloc_tls_keyid(sc);
 	if (keyid < 0) {
 		CTR2(KTR_CXGBE, "%s: atid %d using immediate key ctx", __func__,
 		    atid);
 		tlsp->inline_key = true;
 	} else {
 		tlsp->tx_key_addr = keyid;
 		CTR3(KTR_CXGBE, "%s: atid %d allocated TX key addr %#x",
 		    __func__,
 		    atid, tlsp->tx_key_addr);
 	}
 
 	inp = params->tls.inp;
 	INP_RLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_RUNLOCK(inp);
 		error = ECONNRESET;
 		goto failed;
 	}
 	tlsp->inp = inp;
 
 	tp = inp->inp_ppcb;
 	if (tp->t_flags & TF_REQ_TSTMP) {
 		tlsp->using_timestamps = true;
 		if ((tp->ts_offset & 0xfffffff) != 0) {
 			INP_RUNLOCK(inp);
 			error = EINVAL;
 			goto failed;
 		}
 	} else
 		tlsp->using_timestamps = false;
 
 	error = send_ktls_act_open_req(sc, vi, inp, tlsp, atid);
 	if (error) {
 		INP_RUNLOCK(inp);
 		goto failed;
 	}
 
 	/* Wait for reply to active open. */
 	CTR2(KTR_CXGBE, "%s: atid %d sent CPL_ACT_OPEN_REQ", __func__,
 	    atid);
 	while (tlsp->open_pending) {
 		/*
 		 * XXX: PCATCH?  We would then have to discard the PCB
 		 * when the completion CPL arrived.
 		 */
 		error = rw_sleep(tlsp, &inp->inp_lock, 0, "t6tlsop", 0);
 	}
 
 	atid = -1;
 	if (tlsp->tid < 0) {
 		INP_RUNLOCK(inp);
 		error = ENOMEM;
 		goto failed;
 	}
 
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_RUNLOCK(inp);
 		error = ECONNRESET;
 		goto failed;
 	}
 
 	txq = &sc->sge.txq[vi->first_txq];
 	if (inp->inp_flowtype != M_HASHTYPE_NONE)
 		txq += ((inp->inp_flowid % (vi->ntxq - vi->rsrv_noflowq)) +
 		    vi->rsrv_noflowq);
 	tlsp->txq = txq;
 
 	error = ktls_set_tcb_fields(tlsp, tp, txq);
 	INP_RUNLOCK(inp);
 	if (error)
 		goto failed;
 
 	error = ktls_setup_keys(tlsp, tls, txq);
 	if (error)
 		goto failed;
 
 	tlsp->enc_mode = t4_tls_cipher_mode(tls);
 	tlsp->tx_key_info_size = t4_tls_key_info_size(tls);
 
 	/* The SCMD fields used when encrypting a full TLS record. */
 	tlsp->scmd0.seqno_numivs = htobe32(V_SCMD_SEQ_NO_CTRL(3) |
 	    V_SCMD_PROTO_VERSION(t4_tls_proto_ver(tls)) |
 	    V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) |
 	    V_SCMD_CIPH_AUTH_SEQ_CTRL((mac_first == 0)) |
 	    V_SCMD_CIPH_MODE(tlsp->enc_mode) |
 	    V_SCMD_AUTH_MODE(t4_tls_auth_mode(tls)) |
 	    V_SCMD_HMAC_CTRL(t4_tls_hmac_ctrl(tls)) |
 	    V_SCMD_IV_SIZE(explicit_iv_size / 2) | V_SCMD_NUM_IVS(1));
 
 	tlsp->scmd0.ivgen_hdrlen = V_SCMD_IV_GEN_CTRL(0) |
 	    V_SCMD_TLS_FRAG_ENABLE(0);
 	if (tlsp->inline_key)
 		tlsp->scmd0.ivgen_hdrlen |= V_SCMD_KEY_CTX_INLINE(1);
 	tlsp->scmd0.ivgen_hdrlen = htobe32(tlsp->scmd0.ivgen_hdrlen);
 
 	/*
 	 * The SCMD fields used when encrypting a partial TLS record
 	 * (no trailer and possibly a truncated payload).
 	 */
 	tlsp->scmd0_short.seqno_numivs = V_SCMD_SEQ_NO_CTRL(0) |
 	    V_SCMD_PROTO_VERSION(SCMD_PROTO_VERSION_GENERIC) |
 	    V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) |
 	    V_SCMD_CIPH_AUTH_SEQ_CTRL((mac_first == 0)) |
 	    V_SCMD_AUTH_MODE(SCMD_AUTH_MODE_NOP) |
 	    V_SCMD_HMAC_CTRL(SCMD_HMAC_CTRL_NOP) |
 	    V_SCMD_IV_SIZE(AES_BLOCK_LEN / 2) | V_SCMD_NUM_IVS(0);
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM)
 		tlsp->scmd0_short.seqno_numivs |=
 		    V_SCMD_CIPH_MODE(SCMD_CIPH_MODE_AES_CTR);
 	else
 		tlsp->scmd0_short.seqno_numivs |=
 		    V_SCMD_CIPH_MODE(tlsp->enc_mode);
 	tlsp->scmd0_short.seqno_numivs =
 	    htobe32(tlsp->scmd0_short.seqno_numivs);
 
 	tlsp->scmd0_short.ivgen_hdrlen = V_SCMD_IV_GEN_CTRL(0) |
 	    V_SCMD_TLS_FRAG_ENABLE(0) |
 	    V_SCMD_AADIVDROP(1);
 	if (tlsp->inline_key)
 		tlsp->scmd0_short.ivgen_hdrlen |= V_SCMD_KEY_CTX_INLINE(1);
 
 	TXQ_LOCK(txq);
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM)
 		txq->kern_tls_gcm++;
 	else
 		txq->kern_tls_cbc++;
 	TXQ_UNLOCK(txq);
 	*pt = &tlsp->com;
 	return (0);
 
 failed:
 	if (atid >= 0)
 		free_atid(sc, atid);
 	m_snd_tag_rele(&tlsp->com);
 	return (error);
 }
 
 static int
 ktls_setup_keys(struct tlspcb *tlsp, const struct ktls_session *tls,
     struct sge_txq *txq)
 {
 	struct tls_key_req *kwr;
 	struct tls_keyctx *kctx;
 	void *items[1];
 	struct mbuf *m;
 	int error;
 
 	/*
 	 * Store the salt and keys in the key context.  For
 	 * connections with an inline key, this key context is passed
 	 * as immediate data in each work request.  For connections
 	 * storing the key in DDR, a work request is used to store a
 	 * copy of the key context in DDR.
 	 */
 	t4_tls_key_ctx(tls, KTLS_TX, &tlsp->keyctx);
 	if (tlsp->inline_key)
 		return (0);
 
 	/* Populate key work request. */
         m = alloc_wr_mbuf(TLS_KEY_WR_SZ, M_NOWAIT);
 	if (m == NULL) {
 		CTR2(KTR_CXGBE, "%s: tid %d failed to alloc WR mbuf", __func__,
 		    tlsp->tid);
 		return (ENOMEM);
 	}
 	m->m_pkthdr.snd_tag = m_snd_tag_ref(&tlsp->com);
 	m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 	kwr = mtod(m, void *);
 	memset(kwr, 0, TLS_KEY_WR_SZ);
 
 	t4_write_tlskey_wr(tls, KTLS_TX, tlsp->tid, 0, tlsp->tx_key_addr, kwr);
 	kctx = (struct tls_keyctx *)(kwr + 1);
 	memcpy(kctx, &tlsp->keyctx, sizeof(*kctx));
 
 	/*
 	 * Place the key work request in the transmit queue.  It
 	 * should be sent to the NIC before any TLS packets using this
 	 * session.
 	 */
 	items[0] = m;
 	error = mp_ring_enqueue(txq->r, items, 1, 1);
 	if (error)
 		m_free(m);
 	else
 		CTR2(KTR_CXGBE, "%s: tid %d sent key WR", __func__, tlsp->tid);
 	return (error);
 }
 
 /*
  * Size in bytes of the fixed portion of a TLS work request: every
  * header up to and including CPL_TX_DATA, plus either the inline key
  * context or the commands used to read the key from memory.
  */
 static u_int
 ktls_base_wr_size(struct tlspcb *tlsp)
 {
 	u_int len;
 
 	/* Headers that lead every TLS work request. */
 	len = sizeof(struct fw_ulptx_wr);
 	len += sizeof(struct ulp_txpkt);
 	len += sizeof(struct ulptx_idata);
 	len += sizeof(struct cpl_tx_sec_pdu);
 
 	if (!tlsp->inline_key) {
 		/* Memory read of the key plus a second ULPTX_IDATA. */
 		len += sizeof(struct ulptx_sc_memrd);
 		len += sizeof(struct ulptx_idata);
 	} else {
 		/* Key context is carried in the work request itself. */
 		len += tlsp->tx_key_info_size;
 	}
 
 	len += sizeof(struct cpl_tx_data);
 	return (len);
 }
 
 /*
  * Compute how many bytes of TCP payload to send for the TLS record
  * referenced by 'm_tls'.  A request that reaches the end of the record
  * is sent in full; otherwise the length is trimmed so that no part of
  * the trailer is sent and, for AES-CBC, so that the ciphertext ends on
  * an AES block boundary.
  */
 static u_int
 ktls_tcp_payload_length(struct tlspcb *tlsp, struct mbuf *m_tls)
 {
 	struct tls_record_layer *rec;
 	u_int body_end, record_len, send_len;
 
 	M_ASSERTEXTPG(m_tls);
 	rec = (void *)m_tls->m_epg_hdr;
 	record_len = TLS_HEADER_LENGTH + ntohs(rec->tls_length);
 
 	/* End of the range of the TLS record the mbuf asks to send. */
 	send_len = mtod(m_tls, vm_offset_t) + m_tls->m_len;
 
 	/* Always send complete records. */
 	if (send_len == record_len)
 		return (send_len);
 
 	/*
 	 * There is currently no way to send a partial trailer, so a
 	 * request that ends inside the trailer is cut back to the
 	 * start of the trailer.
 	 */
 	body_end = record_len - m_tls->m_epg_trllen;
 	if (send_len > body_end)
 		send_len = body_end;
 
 	/* For AES-CBC, round the ciphertext down to whole blocks. */
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_CBC &&
 	    send_len > TLS_HEADER_LENGTH) {
 		send_len = TLS_HEADER_LENGTH +
 		    rounddown(send_len - TLS_HEADER_LENGTH, AES_BLOCK_LEN);
 	}
 
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %d short TLS record (%u vs %u)",
 	    __func__, tlsp->tid, send_len, record_len);
 #endif
 	return (send_len);
 }
 
 /*
  * For a "short" TLS record, determine the offset into the TLS record
  * payload to send.  This offset does not include the TLS header, but
  * a non-zero offset implies that a header will not be sent.
  */
 static u_int
 ktls_payload_offset(struct tlspcb *tlsp, struct mbuf *m_tls)
 {
 	struct tls_record_layer *hdr;
 	u_int offset, plen;
 #ifdef INVARIANTS
 	u_int mlen;
 #endif
 
 	M_ASSERTEXTPG(m_tls);
 	hdr = (void *)m_tls->m_epg_hdr;
 	plen = ntohs(hdr->tls_length);
 #ifdef INVARIANTS
 	mlen = mtod(m_tls, vm_offset_t) + m_tls->m_len;
 	MPASS(mlen < TLS_HEADER_LENGTH + plen);
 #endif
 	if (mtod(m_tls, vm_offset_t) <= m_tls->m_epg_hdrlen)
 		return (0);
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) {
 		/*
 		 * Always send something.  This function is only called
 		 * if we aren't sending the tag at all, but if the
 		 * request starts in the tag then we are in an odd
 		 * state where would effectively send nothing.  Cap
 		 * the offset at the last byte of the record payload
 		 * to send the last cipher block.
 		 */
 		offset = min(mtod(m_tls, vm_offset_t) - m_tls->m_epg_hdrlen,
 		    (plen - TLS_HEADER_LENGTH - m_tls->m_epg_trllen) - 1);
 		return (rounddown(offset, AES_BLOCK_LEN));
 	}
 	return (0);
 }
 
 /*
  * Bytes needed for a ULPTX scatter/gather list describing 'nsegs'
  * segments.  struct ulptx_sgl already accounts for the first segment;
  * each remaining pair of segments takes three 8-byte flits, and a
  * final unpaired segment takes two.
  */
 static u_int
 ktls_sgl_size(u_int nsegs)
 {
 	u_int extra, len;
 
 	/* Segments beyond the one embedded in ulptx_sgl. */
 	extra = nsegs - 1;
 
 	len = sizeof(struct ulptx_sgl);
 	len += 8 * ((3 * extra) / 2 + (extra & 1));
 	return (len);
 }
 
 /*
  * Compute the length in bytes of the work request needed to send the
  * TLS record in 'm_tls' (with headers taken from 'm') and store the
  * number of payload DSGL segments in '*nsegsp'.  A request covering
  * only the TLS header is sized as a tunnelled packet carrying
  * immediate data and reports a placeholder segment count of 1.
  */
 static int
 ktls_wr_len(struct tlspcb *tlsp, struct mbuf *m, struct mbuf *m_tls,
     int *nsegsp)
 {
 	struct tls_record_layer *rec;
 	u_int first, imm_bytes, offset, send_end, tcp_len, wr_len;
 
 	M_ASSERTEXTPG(m_tls);
 
 	/* How much of the TLS record is going out on the wire. */
 	tcp_len = ktls_tcp_payload_length(tlsp, m_tls);
 	if (tcp_len <= m_tls->m_epg_hdrlen) {
 		/*
 		 * Only the TLS header is wanted: it is sent as a
 		 * tunnelled packet with immediate data.
 		 */
 		wr_len = sizeof(struct fw_eth_tx_pkt_wr) +
 		    sizeof(struct cpl_tx_pkt_core) +
 		    roundup2(m->m_len + m_tls->m_len, 16);
 		if (wr_len > SGE_MAX_WR_LEN) {
 			CTR3(KTR_CXGBE,
 		    "%s: tid %d TLS header-only packet too long (len %d)",
 			    __func__, tlsp->tid, m->m_len + m_tls->m_len);
 		}
 
 		/* This should always be the last TLS record in a chain. */
 		MPASS(m_tls->m_next == NULL);
 
 		/*
 		 * XXX: Set a bogus 'nsegs' value to avoid tripping an
 		 * assertion in mbuf_nsegs() in t4_sge.c.
 		 */
 		*nsegsp = 1;
 		return (wr_len);
 	}
 
 	/*
 	 * 'send_end' starts as the end of the record excluding the
 	 * trailer; a short record stops earlier and may also begin at
 	 * a non-zero payload offset.
 	 */
 	rec = (void *)m_tls->m_epg_hdr;
 	send_end = TLS_HEADER_LENGTH + ntohs(rec->tls_length) -
 	    m_tls->m_epg_trllen;
 	offset = 0;
 	if (tcp_len < send_end) {
 		send_end = tcp_len;
 		offset = ktls_payload_offset(tlsp, m_tls);
 	}
 
 	/* Fixed headers common to every TLS work request. */
 	wr_len = ktls_base_wr_size(tlsp);
 
 	/*
 	 * Full records and short records with an offset of 0 include
 	 * the TLS header as immediate data.  Short records include a
 	 * raw AES IV as immediate data.
 	 */
 	imm_bytes = (offset == 0) ? m_tls->m_epg_hdrlen : 0;
 	if (send_end == tcp_len)
 		imm_bytes += AES_BLOCK_LEN;
 	wr_len += roundup2(imm_bytes, 16);
 
 	/* The TLS record payload is described by a DSGL. */
 	first = m_tls->m_epg_hdrlen + offset;
 	*nsegsp = sglist_count_mbuf_epg(m_tls, first, send_end - first);
 	wr_len += ktls_sgl_size(*nsegsp);
 
 	return (roundup2(wr_len, 16));
 }
 
 /*
  * Report whether the TCP header carries any option other than NOP or
  * timestamp, i.e. whether a dedicated options-only packet is needed.
  * Scanning stops at an end-of-list option or at the first malformed
  * option length.  Returns 1 if such an option is found, else 0.
  */
 static int
 ktls_has_tcp_options(struct tcphdr *tcp)
 {
 	u_char *optp;
 	int kind, left, len;
 
 	optp = (u_char *)(tcp + 1);
 	left = tcp->th_off * 4 - sizeof(struct tcphdr);
 	while (left > 0) {
 		kind = optp[0];
 		if (kind == TCPOPT_EOL)
 			break;
 
 		if (kind == TCPOPT_NOP) {
 			/* NOP is a single byte with no length field. */
 			len = 1;
 		} else {
 			if (left < 2)
 				break;
 			len = optp[1];
 			if (len < 2 || len > left)
 				break;
 		}
 
 		/* Anything but NOP and timestamps needs its own packet. */
 		if (kind != TCPOPT_NOP && kind != TCPOPT_TIMESTAMP)
 			return (1);
 
 		left -= len;
 		optp += len;
 	}
 	return (0);
 }
 
 /*
  * Locate the TCP timestamp option in a TCP header.  Returns a pointer
  * to the option's data (just past its kind and length bytes), or NULL
  * if no timestamp option of the expected length is found before the
  * options end or become malformed.
  */
 static void *
 ktls_find_tcp_timestamps(struct tcphdr *tcp)
 {
 	u_char *optp;
 	int kind, left, len;
 
 	optp = (u_char *)(tcp + 1);
 	left = tcp->th_off * 4 - sizeof(struct tcphdr);
 	while (left > 0) {
 		kind = optp[0];
 		if (kind == TCPOPT_EOL)
 			break;
 
 		if (kind == TCPOPT_NOP) {
 			/* NOP is a single byte with no length field. */
 			len = 1;
 		} else {
 			if (left < 2)
 				break;
 			len = optp[1];
 			if (len < 2 || len > left)
 				break;
 		}
 
 		if (kind == TCPOPT_TIMESTAMP && len == TCPOLEN_TIMESTAMP)
 			return (optp + 2);
 
 		left -= len;
 		optp += len;
 	}
 	return (NULL);
 }
 
 /*
  * Validate the header mbuf 'm' of a NIC TLS transmit request and
  * compute how much descriptor space the request may need.
  *
  * On success, the l2hlen/l3hlen/l4hlen fields of m->m_pkthdr are
  * filled in, '*nsegsp' holds the segment count computed for the first
  * TLS record in the chain, '*len16p' holds the total reserved length
  * in 16-byte units, and 0 is returned.
  *
  * Returns EINVAL if the headers are too short, are not TCP over
  * IPv4/IPv6, are followed by TCP payload in the header mbuf, or if an
  * options-only packet would be too long.  Returns EFBIG if any TLS
  * record needs a work request larger than SGE_MAX_WR_LEN or more than
  * TX_SGL_SEGS segments.
  */
 int
 t6_ktls_parse_pkt(struct mbuf *m, int *nsegsp, int *len16p)
 {
 	struct tlspcb *tlsp;
 	struct ether_header *eh;
 	struct ip *ip;
 	struct ip6_hdr *ip6;
 	struct tcphdr *tcp;
 	struct mbuf *m_tls;
 	int nsegs;
 	u_int wr_len, tot_len;
 
 	/*
 	 * Locate headers in initial mbuf.
 	 *
 	 * XXX: This assumes all of the headers are in the initial mbuf.
 	 * Could perhaps use m_advance() like parse_pkt() if that turns
 	 * out to not be true.
 	 */
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.snd_tag != NULL);
 	tlsp = mst_to_tls(m->m_pkthdr.snd_tag);
 
 	/* Need more than an Ethernet header plus a minimal IPv4 header. */
 	if (m->m_len <= sizeof(*eh) + sizeof(*ip)) {
 		CTR2(KTR_CXGBE, "%s: tid %d header mbuf too short", __func__,
 		    tlsp->tid);
 		return (EINVAL);
 	}
 	eh = mtod(m, struct ether_header *);
 	if (ntohs(eh->ether_type) != ETHERTYPE_IP &&
 	    ntohs(eh->ether_type) != ETHERTYPE_IPV6) {
 		CTR2(KTR_CXGBE, "%s: tid %d mbuf not ETHERTYPE_IP{,V6}",
 		    __func__, tlsp->tid);
 		return (EINVAL);
 	}
 	m->m_pkthdr.l2hlen = sizeof(*eh);
 
 	/* XXX: Reject unsupported IP options? */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (struct ip *)(eh + 1);
 		if (ip->ip_p != IPPROTO_TCP) {
 			CTR2(KTR_CXGBE, "%s: tid %d mbuf not IPPROTO_TCP",
 			    __func__, tlsp->tid);
 			return (EINVAL);
 		}
 		m->m_pkthdr.l3hlen = ip->ip_hl * 4;
 	} else {
 		/* Only a TCP header directly after the fixed IPv6 header. */
 		ip6 = (struct ip6_hdr *)(eh + 1);
 		if (ip6->ip6_nxt != IPPROTO_TCP) {
 			CTR3(KTR_CXGBE, "%s: tid %d mbuf not IPPROTO_TCP (%u)",
 			    __func__, tlsp->tid, ip6->ip6_nxt);
 			return (EINVAL);
 		}
 		m->m_pkthdr.l3hlen = sizeof(struct ip6_hdr);
 	}
 	/* The base TCP header must be present before it is examined. */
 	if (m->m_len < m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen +
 	    sizeof(*tcp)) {
 		CTR2(KTR_CXGBE, "%s: tid %d header mbuf too short (2)",
 		    __func__, tlsp->tid);
 		return (EINVAL);
 	}
 	tcp = (struct tcphdr *)((char *)(eh + 1) + m->m_pkthdr.l3hlen);
 	m->m_pkthdr.l4hlen = tcp->th_off * 4;
 
 	/* Bail if there is TCP payload before the TLS record. */
 	if (m->m_len != m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen +
 	    m->m_pkthdr.l4hlen) {
 		CTR6(KTR_CXGBE,
 		    "%s: tid %d header mbuf bad length (%d + %d + %d != %d)",
 		    __func__, tlsp->tid, m->m_pkthdr.l2hlen,
 		    m->m_pkthdr.l3hlen, m->m_pkthdr.l4hlen, m->m_len);
 		return (EINVAL);
 	}
 
 	/* Assume all headers are in 'm' for now. */
 	MPASS(m->m_next != NULL);
 	MPASS(m->m_next->m_flags & M_EXTPG);
 
 	/* Running total of descriptor bytes reserved for this request. */
 	tot_len = 0;
 
 	/*
 	 * Each of the remaining mbufs in the chain should reference a
 	 * TLS record.
 	 */
 	*nsegsp = 0;
 	for (m_tls = m->m_next; m_tls != NULL; m_tls = m_tls->m_next) {
 		MPASS(m_tls->m_flags & M_EXTPG);
 
 		wr_len = ktls_wr_len(tlsp, m, m_tls, &nsegs);
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: tid %d wr_len %d nsegs %d", __func__,
 		    tlsp->tid, wr_len, nsegs);
 #endif
 		if (wr_len > SGE_MAX_WR_LEN || nsegs > TX_SGL_SEGS)
 			return (EFBIG);
 		/* Each work request is accounted in whole descriptors. */
 		tot_len += roundup2(wr_len, EQ_ESIZE);
 
 		/*
 		 * Store 'nsegs' for the first TLS record in the
 		 * header mbuf's metadata.
 		 */
 		if (*nsegsp == 0)
 			*nsegsp = nsegs;
 	}
 
 	MPASS(tot_len != 0);
 
 	/*
 	 * See if we have any TCP options or a FIN requiring a
 	 * dedicated packet.
 	 */
 	if ((tcp->th_flags & TH_FIN) != 0 || ktls_has_tcp_options(tcp)) {
 		wr_len = sizeof(struct fw_eth_tx_pkt_wr) +
 		    sizeof(struct cpl_tx_pkt_core) + roundup2(m->m_len, 16);
 		if (wr_len > SGE_MAX_WR_LEN) {
 			CTR3(KTR_CXGBE,
 			    "%s: tid %d options-only packet too long (len %d)",
 			    __func__, tlsp->tid, m->m_len);
 			return (EINVAL);
 		}
 		tot_len += roundup2(wr_len, EQ_ESIZE);
 	}
 
 	/* Include room for a TP work request to program an L2T entry. */
 	tot_len += EQ_ESIZE;
 
 	/*
 	 * Include room for a ULPTX work request including up to 5
 	 * CPL_SET_TCB_FIELD commands before the first TLS work
 	 * request.
 	 */
 	wr_len = sizeof(struct fw_ulptx_wr) +
 	    5 * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	/*
 	 * If timestamps are present, reserve 1 more command for
 	 * setting the echoed timestamp.
 	 */
 	if (tlsp->using_timestamps)
 		wr_len += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	tot_len += roundup2(wr_len, EQ_ESIZE);
 
 	/* Report the reservation in 16-byte units. */
 	*len16p = tot_len / 16;
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %d len16 %d nsegs %d", __func__,
 	    tlsp->tid, *len16p, *nsegsp);
 #endif
 	return (0);
 }
 
 /*
  * Encode the scatter/gather list 'gl' as a ULPTX DSGL starting at
  * 'to', which must be 16-byte aligned.  If the encoded SGL would end
  * on an address that is not 16-byte aligned, one zero-filled flit is
  * appended as padding.
  */
 static void
 write_gl_to_buf(struct sglist *gl, caddr_t to)
 {
 	struct sglist_seg *cur;
 	struct ulptx_sgl *sgl;
 	__be64 *flitp;
 	int idx, nflits, nsegs;
 
 	KASSERT(((uintptr_t)to & 0xf) == 0,
 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, to));
 
 	nsegs = gl->sg_nseg;
 	MPASS(nsegs > 0);
 
 	/*
 	 * Two flits hold the command and the first segment; the
 	 * remaining segments take three flits per pair, or two for a
 	 * trailing unpaired segment.
 	 */
 	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
 
 	sgl = (void *)to;
 	sgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 
 	/* The first segment lives in the fixed part of the SGL. */
 	cur = &gl->sg_segs[0];
 	sgl->len0 = htobe32(cur->ss_len);
 	sgl->addr0 = htobe64(cur->ss_paddr);
 
 	/* Remaining segments are packed two per sge[] entry. */
 	for (idx = 0; idx < nsegs - 1; idx++) {
 		cur = &gl->sg_segs[idx + 1];
 		sgl->sge[idx / 2].len[idx & 1] = htobe32(cur->ss_len);
 		sgl->sge[idx / 2].addr[idx & 1] = htobe64(cur->ss_paddr);
 	}
 
 	/* Zero the unused length slot of a half-filled final pair. */
 	if (idx & 1)
 		sgl->sge[idx / 2].len[1] = htobe32(0);
 
 	flitp = (__be64 *)to + nflits;
 	if (nflits & 1) {
 		MPASS(((uintptr_t)flitp) & 0xf);
 		*flitp++ = 0;
 	}
 
 	MPASS((((uintptr_t)flitp) & 0xf) == 0);
 }
 
 /*
  * Copy 'len' bytes from 'from' into the descriptor ring of 'eq' at
  * '*to', continuing at the start of the ring if the copy runs past
  * its end.  '*to' is advanced to the byte after the copied data and
  * is reset to the start of the ring if it lands exactly on the end.
  */
 static inline void
 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
 {
 	int head, tail;
 
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	if (!__predict_true((uintptr_t)(*to) + len <=
 	    (uintptr_t)&eq->desc[eq->sidx])) {
 		/* Split the copy at the end of the ring. */
 		head = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to);
 		tail = len - head;
 
 		bcopy(from, *to, head);
 		bcopy(from + head, (void *)eq->desc, tail);
 		(*to) = (caddr_t)eq->desc + tail;
 		return;
 	}
 
 	/* Common case: the data fits without wrapping. */
 	bcopy(from, *to, len);
 	(*to) += len;
 	if ((uintptr_t)(*to) == (uintptr_t)&eq->desc[eq->sidx])
 		(*to) = (caddr_t)eq->desc;
 }
 
 /*
  * Write a FW_ETH_TX_PKT_WR at 'dst' that sends only the headers held
  * in 'm' as immediate data, with PUSH and FIN cleared in the TCP
  * header.  The IP length field is rewritten to describe this
  * headers-only packet.
  *
  * 'available' is the number of free descriptors (only asserted
  * against) and 'pidx' selects the software descriptor to update.  No
  * mbuf is recorded in that software descriptor.  Returns the number
  * of descriptors used.
  */
 static int
 ktls_write_tcp_options(struct sge_txq *txq, void *dst, struct mbuf *m,
     u_int available, u_int pidx)
 {
 	struct tx_sdesc *txsd;
 	struct fw_eth_tx_pkt_wr *wr;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen;
 	struct ether_header *eh;
 	struct ip *ip, newip;
 	struct ip6_hdr *ip6, newip6;
 	struct tcphdr *tcp, newtcp;
 	caddr_t out;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m);
 
 	wr = dst;
 	/* The packet is exactly the headers in the first mbuf. */
 	pktlen = m->m_len;
 	ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen;
 	len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16);
 	ndesc = tx_len16_to_desc(len16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	cpl = (void *)(wr + 1);
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 
 	out = (void *)(cpl + 1);
 
 	/* Copy over Ethernet header. */
 	eh = mtod(m, struct ether_header *);
 	copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen);
 
 	/* Fixup length in IP header and copy out. */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip = *ip;
 		/* ip_len covers the IP header and everything after it. */
 		newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen);
 		copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip));
 		if (m->m_pkthdr.l3hlen > sizeof(*ip))
 			copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out,
 			    m->m_pkthdr.l3hlen - sizeof(*ip));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	} else {
 		ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip6 = *ip6;
 		/*
 		 * Unlike ip_len, the IPv6 payload length does not
 		 * include the fixed IPv6 header itself.
 		 */
 		newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen -
 		    sizeof(*ip6));
 		copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6));
 		MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	}
 	cpl->ctrl1 = htobe64(ctrl1);
 	txq->txcsum++;
 
 	/* Clear PUSH and FIN in the TCP header if present. */
 	tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen);
 	newtcp = *tcp;
 	newtcp.th_flags &= ~(TH_PUSH | TH_FIN);
 	copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp));
 
 	/* Copy rest of packet. */
 	copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, pktlen -
 	    (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp)));
 	txq->imm_wrs++;
 
 	txq->txpkt_wrs++;
 
 	txq->kern_tls_options++;
 
 	/* No mbuf is attached to this work request's descriptor. */
 	txsd = &txq->sdesc[pidx];
 	txsd->m = NULL;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write a FW_ETH_TX_PKT_WR at 'dst' that sends the headers in 'm'
  * followed by the requested part of the TLS header of 'm_tls', all as
  * immediate data.  The IP length field is rewritten for the resulting
  * packet, and the TCP sequence number is set to 'tcp_seqno' plus the
  * mbuf's offset into the TLS record.
  *
  * 'available' is the number of free descriptors (only asserted
  * against) and 'pidx' selects the software descriptor, which records
  * 'm'.  Returns the number of descriptors used.
  */
 static int
 ktls_write_tunnel_packet(struct sge_txq *txq, void *dst, struct mbuf *m,
     struct mbuf *m_tls, u_int available, tcp_seq tcp_seqno, u_int pidx)
 {
 	struct tx_sdesc *txsd;
 	struct fw_eth_tx_pkt_wr *wr;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen;
 	struct ether_header *eh;
 	struct ip *ip, newip;
 	struct ip6_hdr *ip6, newip6;
 	struct tcphdr *tcp, newtcp;
 	caddr_t out;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m);
 
 	/* Locate the template TLS header. */
 	M_ASSERTEXTPG(m_tls);
 
 	/* This should always be the last TLS record in a chain. */
 	MPASS(m_tls->m_next == NULL);
 
 	wr = dst;
 	/* Headers from 'm' plus the requested TLS header bytes. */
 	pktlen = m->m_len + m_tls->m_len;
 	ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen;
 	len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16);
 	ndesc = tx_len16_to_desc(len16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	cpl = (void *)(wr + 1);
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 
 	out = (void *)(cpl + 1);
 
 	/* Copy over Ethernet header. */
 	eh = mtod(m, struct ether_header *);
 	copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen);
 
 	/* Fixup length in IP header and copy out. */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip = *ip;
 		/* ip_len covers the IP header and everything after it. */
 		newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen);
 		copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip));
 		if (m->m_pkthdr.l3hlen > sizeof(*ip))
 			copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out,
 			    m->m_pkthdr.l3hlen - sizeof(*ip));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	} else {
 		ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip6 = *ip6;
 		/*
 		 * Unlike ip_len, the IPv6 payload length does not
 		 * include the fixed IPv6 header itself.
 		 */
 		newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen -
 		    sizeof(*ip6));
 		copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6));
 		MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	}
 	cpl->ctrl1 = htobe64(ctrl1);
 	txq->txcsum++;
 
 	/* Set sequence number in TCP header. */
 	tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen);
 	newtcp = *tcp;
 	newtcp.th_seq = htonl(tcp_seqno + mtod(m_tls, vm_offset_t));
 	copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp));
 
 	/* Copy rest of TCP header. */
 	copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, m->m_len -
 	    (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp)));
 
 	/* Copy the subset of the TLS header requested. */
 	copy_to_txd(&txq->eq, (char *)m_tls->m_epg_hdr +
 	    mtod(m_tls, vm_offset_t), &out, m_tls->m_len);
 	txq->imm_wrs++;
 
 	txq->txpkt_wrs++;
 
 	txq->kern_tls_header++;
 
 	/* Record the mbuf chain against this work request's descriptor. */
 	txsd = &txq->sdesc[pidx];
 	txsd->m = m;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Compile-time layout checks: a CPL_SET_TCB_FIELD message must fit
  * within one TX descriptor (EQ_ESIZE bytes), and the SND_UNA_RAW and
  * SND_NXT_RAW TCB fields must have the same word index.
  */
 _Static_assert(sizeof(struct cpl_set_tcb_field) <= EQ_ESIZE,
     "CPL_SET_TCB_FIELD must be smaller than a single TX descriptor");
 _Static_assert(W_TCB_SND_UNA_RAW == W_TCB_SND_NXT_RAW,
     "SND_NXT_RAW and SND_UNA_RAW are in different words");
 
 static int
 ktls_write_tls_wr(struct tlspcb *tlsp, struct sge_txq *txq,
     void *dst, struct mbuf *m, struct tcphdr *tcp, struct mbuf *m_tls,
     u_int nsegs, u_int available, tcp_seq tcp_seqno, uint32_t *tsopt,
     u_int pidx, bool set_l2t_idx)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct fw_ulptx_wr *wr;
 	struct ulp_txpkt *txpkt;
 	struct ulptx_sc_memrd *memrd;
 	struct ulptx_idata *idata;
 	struct cpl_tx_sec_pdu *sec_pdu;
 	struct cpl_tx_data *tx_data;
 	struct tls_record_layer *hdr;
 	char *iv, *out;
 	u_int aad_start, aad_stop;
 	u_int auth_start, auth_stop, auth_insert;
 	u_int cipher_start, cipher_stop, iv_offset;
 	u_int imm_len, mss, ndesc, offset, plen, tlen, twr_len, wr_len;
 	u_int fields, tx_max_offset, tx_max;
 	bool first_wr, last_wr, using_scratch;
 
 	ndesc = 0;
 	MPASS(tlsp->txq == txq);
 
 	first_wr = (tlsp->prev_seq == 0 && tlsp->prev_ack == 0 &&
 	    tlsp->prev_win == 0);
 
 	/*
 	 * Use the per-txq scratch pad if near the end of the ring to
 	 * simplify handling of wrap-around.  This uses a simple but
 	 * not quite perfect test of using the scratch buffer if we
 	 * can't fit a maximal work request in without wrapping.
 	 */
 	using_scratch = (eq->sidx - pidx < SGE_MAX_WR_LEN / EQ_ESIZE);
 
 	/* Locate the TLS header. */
 	M_ASSERTEXTPG(m_tls);
 	hdr = (void *)m_tls->m_epg_hdr;
 	plen = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - m_tls->m_epg_trllen;
 
 	/* Determine how much of the TLS record to send. */
 	tlen = ktls_tcp_payload_length(tlsp, m_tls);
 	if (tlen <= m_tls->m_epg_hdrlen) {
 		/*
 		 * For requests that only want to send the TLS header,
 		 * send a tunnelled packet as immediate data.
 		 */
 #ifdef VERBOSE_TRACES
 		CTR3(KTR_CXGBE, "%s: tid %d header-only TLS record %u",
 		    __func__, tlsp->tid, (u_int)m_tls->m_epg_seqno);
 #endif
 		return (ktls_write_tunnel_packet(txq, dst, m, m_tls, available,
 		    tcp_seqno, pidx));
 	}
 	if (tlen < plen) {
 		plen = tlen;
 		offset = ktls_payload_offset(tlsp, m_tls);
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: tid %d short TLS record %u with offset %u",
 		    __func__, tlsp->tid, (u_int)m_tls->m_epg_seqno, offset);
 #endif
 		if (m_tls->m_next == NULL && (tcp->th_flags & TH_FIN) != 0) {
 			txq->kern_tls_fin_short++;
 #ifdef INVARIANTS
 			panic("%s: FIN on short TLS record", __func__);
 #endif
 		}
 	} else
 		offset = 0;
 
 	/*
 	 * This is the last work request for a given TLS mbuf chain if
 	 * it is the last mbuf in the chain and FIN is not set.  If
 	 * FIN is set, then ktls_write_tcp_fin() will write out the
 	 * last work request.
 	 */
 	last_wr = m_tls->m_next == NULL && (tcp->th_flags & TH_FIN) == 0;
 
 	/*
 	 * The host stack may ask us to not send part of the start of
 	 * a TLS record.  (For example, the stack might have
 	 * previously sent a "short" TLS record and might later send
 	 * down an mbuf that requests to send the remainder of the TLS
 	 * record.)  The crypto engine must process a TLS record from
 	 * the beginning if computing a GCM tag or HMAC, so we always
 	 * send the TLS record from the beginning as input to the
 	 * crypto engine and via CPL_TX_DATA to TP.  However, TP will
 	 * drop individual packets after they have been chopped up
 	 * into MSS-sized chunks if the entire sequence range of those
 	 * packets is less than SND_UNA.  SND_UNA is computed as
 	 * TX_MAX - SND_UNA_RAW.  Thus, use the offset stored in
 	 * m_data to set TX_MAX to the first byte in the TCP sequence
 	 * space the host actually wants us to send and set
 	 * SND_UNA_RAW to 0.
 	 *
 	 * If the host sends us back to back requests that span the
 	 * trailer of a single TLS record (first request ends "in" the
 	 * trailer and second request starts at the next byte but
 	 * still "in" the trailer), the initial bytes of the trailer
 	 * that the first request drops will not be retransmitted.  If
 	 * the host uses the same requests when retransmitting the
 	 * connection will hang.  To handle this, always transmit the
 	 * full trailer for a request that begins "in" the trailer
 	 * (the second request in the example above).  This should
 	 * also help to avoid retransmits for the common case.
 	 *
 	 * A similar condition exists when using CBC for back to back
 	 * requests that span a single AES block.  The first request
 	 * will be truncated to end at the end of the previous AES
 	 * block.  To handle this, always begin transmission at the
 	 * start of the current AES block.
 	 */
 	tx_max_offset = mtod(m_tls, vm_offset_t);
 	if (tx_max_offset > TLS_HEADER_LENGTH + ntohs(hdr->tls_length) -
 	    m_tls->m_epg_trllen) {
 		/* Always send the full trailer. */
 		tx_max_offset = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) -
 		    m_tls->m_epg_trllen;
 	}
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_CBC &&
 	    tx_max_offset > TLS_HEADER_LENGTH) {
 		/* Always send all of the first AES block. */
 		tx_max_offset = TLS_HEADER_LENGTH +
 		    rounddown(tx_max_offset - TLS_HEADER_LENGTH,
 		    AES_BLOCK_LEN);
 	}
 	tx_max = tcp_seqno + tx_max_offset;
 
 	/*
 	 * Update TCB fields.  Reserve space for the FW_ULPTX_WR header
 	 * but don't populate it until we know how many field updates
 	 * are required.
 	 */
 	if (using_scratch)
 		wr = (void *)txq->ss;
 	else
 		wr = dst;
 	out = (void *)(wr + 1);
 	fields = 0;
 	if (set_l2t_idx) {
 		KASSERT(nsegs != 0,
 		    ("trying to set L2T_IX for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR3(KTR_CXGBE, "%s: tid %d set L2T_IX to %d", __func__,
 		    tlsp->tid, tlsp->l2te->idx);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_L2T_IX,
 		    V_TCB_L2T_IX(M_TCB_L2T_IX), V_TCB_L2T_IX(tlsp->l2te->idx));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 	}
 	if (tsopt != NULL && tlsp->prev_tsecr != ntohl(tsopt[1])) {
 		KASSERT(nsegs != 0,
 		    ("trying to set T_RTSEQ_RECENT for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d wrote updated T_RTSEQ_RECENT",
 		    __func__, tlsp->tid);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_T_RTSEQ_RECENT,
 		    V_TCB_T_RTSEQ_RECENT(M_TCB_T_RTSEQ_RECENT),
 		    V_TCB_T_RTSEQ_RECENT(ntohl(tsopt[1])));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 
 		tlsp->prev_tsecr = ntohl(tsopt[1]);
 	}
 
 	if (first_wr || tlsp->prev_seq != tx_max) {
 		KASSERT(nsegs != 0,
 		    ("trying to set TX_MAX for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE,
 		    "%s: tid %d setting TX_MAX to %u (tcp_seqno %u)",
 		    __func__, tlsp->tid, tx_max, tcp_seqno);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_TX_MAX,
 		    V_TCB_TX_MAX(M_TCB_TX_MAX), V_TCB_TX_MAX(tx_max));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 	}
 
 	/*
 	 * If there is data to drop at the beginning of this TLS
 	 * record or if this is a retransmit,
 	 * reset SND_UNA_RAW to 0 so that SND_UNA == TX_MAX.
 	 */
 	if (tlsp->prev_seq != tx_max || mtod(m_tls, vm_offset_t) != 0) {
 		KASSERT(nsegs != 0,
 		    ("trying to clear SND_UNA_RAW for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d clearing SND_UNA_RAW", __func__,
 		    tlsp->tid);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_SND_UNA_RAW,
 		    V_TCB_SND_UNA_RAW(M_TCB_SND_UNA_RAW),
 		    V_TCB_SND_UNA_RAW(0));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 	}
 
 	/*
 	 * Store the expected sequence number of the next byte after
 	 * this record.
 	 */
 	tlsp->prev_seq = tcp_seqno + tlen;
 
 	if (first_wr || tlsp->prev_ack != ntohl(tcp->th_ack)) {
 		KASSERT(nsegs != 0,
 		    ("trying to set RCV_NXT for subsequent TLS WR"));
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_RCV_NXT,
 		    V_TCB_RCV_NXT(M_TCB_RCV_NXT),
 		    V_TCB_RCV_NXT(ntohl(tcp->th_ack)));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 
 		tlsp->prev_ack = ntohl(tcp->th_ack);
 	}
 
 	if (first_wr || tlsp->prev_win != ntohs(tcp->th_win)) {
 		KASSERT(nsegs != 0,
 		    ("trying to set RCV_WND for subsequent TLS WR"));
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_RCV_WND,
 		    V_TCB_RCV_WND(M_TCB_RCV_WND),
 		    V_TCB_RCV_WND(ntohs(tcp->th_win)));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 
 		tlsp->prev_win = ntohs(tcp->th_win);
 	}
 
 	/* Recalculate 'nsegs' if cached value is not available. */
 	if (nsegs == 0)
 		nsegs = sglist_count_mbuf_epg(m_tls, m_tls->m_epg_hdrlen +
 		    offset, plen - (m_tls->m_epg_hdrlen + offset));
 
 	/* Calculate the size of the TLS work request. */
 	twr_len = ktls_base_wr_size(tlsp);
 
 	imm_len = 0;
 	if (offset == 0)
 		imm_len += m_tls->m_epg_hdrlen;
 	if (plen == tlen)
 		imm_len += AES_BLOCK_LEN;
 	twr_len += roundup2(imm_len, 16);
 	twr_len += ktls_sgl_size(nsegs);
 
 	/*
 	 * If any field updates were required, determine if they can
 	 * be included in the TLS work request.  If not, use the
 	 * FW_ULPTX_WR work request header at 'wr' as a dedicated work
 	 * request for the field updates and start a new work request
 	 * for the TLS work request afterward.
 	 */
 	if (fields != 0) {
 		wr_len = fields * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		if (twr_len + wr_len <= SGE_MAX_WR_LEN &&
 		    tlsp->sc->tlst.combo_wrs) {
 			wr_len += twr_len;
 			txpkt = (void *)out;
 		} else {
 			wr_len += sizeof(*wr);
 			wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR));
 			wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA |
 			    V_FW_WR_LEN16(wr_len / 16));
 			wr->cookie = 0;
 
 			/*
 			 * If we were using scratch space, copy the
 			 * field updates work request to the ring.
 			 */
 			if (using_scratch) {
 				out = dst;
 				copy_to_txd(eq, txq->ss, &out, wr_len);
 			}
 
 			ndesc = howmany(wr_len, EQ_ESIZE);
 			MPASS(ndesc <= available);
 
 			txq->raw_wrs++;
 			txsd = &txq->sdesc[pidx];
 			txsd->m = NULL;
 			txsd->desc_used = ndesc;
 			IDXINCR(pidx, ndesc, eq->sidx);
 			dst = &eq->desc[pidx];
 
 			/*
 			 * Determine if we should use scratch space
 			 * for the TLS work request based on the
 			 * available space after advancing pidx for
 			 * the field updates work request.
 			 */
 			wr_len = twr_len;
 			using_scratch = (eq->sidx - pidx <
 			    howmany(wr_len, EQ_ESIZE));
 			if (using_scratch)
 				wr = (void *)txq->ss;
 			else
 				wr = dst;
 			txpkt = (void *)(wr + 1);
 		}
 	} else {
 		wr_len = twr_len;
 		txpkt = (void *)out;
 	}
 
 	wr_len = roundup2(wr_len, 16);
 	MPASS(ndesc + howmany(wr_len, EQ_ESIZE) <= available);
 
 	/* FW_ULPTX_WR */
 	wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR));
 	wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA |
 	    V_FW_WR_LEN16(wr_len / 16));
 	wr->cookie = 0;
 
 	/* ULP_TXPKT */
 	txpkt->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
 	    V_ULP_TXPKT_DATAMODIFY(0) |
 	    V_ULP_TXPKT_CHANNELID(tlsp->vi->pi->port_id) | V_ULP_TXPKT_DEST(0) |
 	    V_ULP_TXPKT_FID(txq->eq.cntxt_id) | V_ULP_TXPKT_RO(1));
 	txpkt->len = htobe32(howmany(twr_len - sizeof(*wr), 16));
 
 	/* ULPTX_IDATA sub-command */
 	idata = (void *)(txpkt + 1);
 	idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
 	    V_ULP_TX_SC_MORE(1));
 	idata->len = sizeof(struct cpl_tx_sec_pdu);
 
 	/*
 	 * The key context, CPL_TX_DATA, and immediate data are part
 	 * of this ULPTX_IDATA when using an inline key.  When reading
 	 * the key from memory, the CPL_TX_DATA and immediate data are
 	 * part of a separate ULPTX_IDATA.
 	 */
 	if (tlsp->inline_key)
 		idata->len += tlsp->tx_key_info_size +
 		    sizeof(struct cpl_tx_data) + imm_len;
 	idata->len = htobe32(idata->len);
 
 	/* CPL_TX_SEC_PDU */
 	sec_pdu = (void *)(idata + 1);
 
 	/*
 	 * For short records, AAD is counted as header data in SCMD0,
 	 * the IV is next followed by a cipher region for the payload.
 	 */
 	if (plen == tlen) {
 		aad_start = 0;
 		aad_stop = 0;
 		iv_offset = 1;
 		auth_start = 0;
 		auth_stop = 0;
 		auth_insert = 0;
 		cipher_start = AES_BLOCK_LEN + 1;
 		cipher_stop = 0;
 
 		sec_pdu->pldlen = htobe32(16 + plen -
 		    (m_tls->m_epg_hdrlen + offset));
 
 		/* These two flits are actually a CPL_TLS_TX_SCMD_FMT. */
 		sec_pdu->seqno_numivs = tlsp->scmd0_short.seqno_numivs;
 		sec_pdu->ivgen_hdrlen = htobe32(
 		    tlsp->scmd0_short.ivgen_hdrlen |
 		    V_SCMD_HDR_LEN(offset == 0 ? m_tls->m_epg_hdrlen : 0));
 
 		txq->kern_tls_short++;
 	} else {
 		/*
 		 * AAD is TLS header.  IV is after AAD.  The cipher region
 		 * starts after the IV.  See comments in ccr_authenc() and
 		 * ccr_gmac() in t4_crypto.c regarding cipher and auth
 		 * start/stop values.
 		 */
 		aad_start = 1;
 		aad_stop = TLS_HEADER_LENGTH;
 		iv_offset = TLS_HEADER_LENGTH + 1;
 		cipher_start = m_tls->m_epg_hdrlen + 1;
 		if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) {
 			cipher_stop = 0;
 			auth_start = cipher_start;
 			auth_stop = 0;
 			auth_insert = 0;
 		} else {
 			cipher_stop = 0;
 			auth_start = cipher_start;
 			auth_stop = 0;
 			auth_insert = 0;
 		}
 
 		sec_pdu->pldlen = htobe32(plen);
 
 		/* These two flits are actually a CPL_TLS_TX_SCMD_FMT. */
 		sec_pdu->seqno_numivs = tlsp->scmd0.seqno_numivs;
 		sec_pdu->ivgen_hdrlen = tlsp->scmd0.ivgen_hdrlen;
 
 		if (mtod(m_tls, vm_offset_t) == 0)
 			txq->kern_tls_full++;
 		else
 			txq->kern_tls_partial++;
 	}
 	sec_pdu->op_ivinsrtofst = htobe32(
 	    V_CPL_TX_SEC_PDU_OPCODE(CPL_TX_SEC_PDU) |
 	    V_CPL_TX_SEC_PDU_CPLLEN(2) | V_CPL_TX_SEC_PDU_PLACEHOLDER(0) |
 	    V_CPL_TX_SEC_PDU_IVINSRTOFST(iv_offset));
 	sec_pdu->aadstart_cipherstop_hi = htobe32(
 	    V_CPL_TX_SEC_PDU_AADSTART(aad_start) |
 	    V_CPL_TX_SEC_PDU_AADSTOP(aad_stop) |
 	    V_CPL_TX_SEC_PDU_CIPHERSTART(cipher_start) |
 	    V_CPL_TX_SEC_PDU_CIPHERSTOP_HI(cipher_stop >> 4));
 	sec_pdu->cipherstop_lo_authinsert = htobe32(
 	    V_CPL_TX_SEC_PDU_CIPHERSTOP_LO(cipher_stop & 0xf) |
 	    V_CPL_TX_SEC_PDU_AUTHSTART(auth_start) |
 	    V_CPL_TX_SEC_PDU_AUTHSTOP(auth_stop) |
 	    V_CPL_TX_SEC_PDU_AUTHINSERT(auth_insert));
 
 	sec_pdu->scmd1 = htobe64(m_tls->m_epg_seqno);
 
 	/* Key context */
 	out = (void *)(sec_pdu + 1);
 	if (tlsp->inline_key) {
 		memcpy(out, &tlsp->keyctx, tlsp->tx_key_info_size);
 		out += tlsp->tx_key_info_size;
 	} else {
 		/* ULPTX_SC_MEMRD to read key context. */
 		memrd = (void *)out;
 		memrd->cmd_to_len = htobe32(V_ULPTX_CMD(ULP_TX_SC_MEMRD) |
 		    V_ULP_TX_SC_MORE(1) |
 		    V_ULPTX_LEN16(tlsp->tx_key_info_size >> 4));
 		memrd->addr = htobe32(tlsp->tx_key_addr >> 5);
 
 		/* ULPTX_IDATA for CPL_TX_DATA and TLS header. */
 		idata = (void *)(memrd + 1);
 		idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
 		    V_ULP_TX_SC_MORE(1));
 		idata->len = htobe32(sizeof(struct cpl_tx_data) + imm_len);
 
 		out = (void *)(idata + 1);
 	}
 
 	/* CPL_TX_DATA */
 	tx_data = (void *)out;
 	OPCODE_TID(tx_data) = htonl(MK_OPCODE_TID(CPL_TX_DATA, tlsp->tid));
 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
 		mss = m->m_pkthdr.tso_segsz;
 		tlsp->prev_mss = mss;
 	} else if (tlsp->prev_mss != 0)
 		mss = tlsp->prev_mss;
 	else
 		mss = tlsp->vi->ifp->if_mtu -
 		    (m->m_pkthdr.l3hlen + m->m_pkthdr.l4hlen);
 	if (offset == 0) {
 		tx_data->len = htobe32(V_TX_DATA_MSS(mss) | V_TX_LENGTH(tlen));
 		tx_data->rsvd = htobe32(tcp_seqno);
 	} else {
 		tx_data->len = htobe32(V_TX_DATA_MSS(mss) |
 		    V_TX_LENGTH(tlen - (m_tls->m_epg_hdrlen + offset)));
 		tx_data->rsvd = htobe32(tcp_seqno + m_tls->m_epg_hdrlen + offset);
 	}
 	tx_data->flags = htobe32(F_TX_BYPASS);
 	if (last_wr && tcp->th_flags & TH_PUSH)
 		tx_data->flags |= htobe32(F_TX_PUSH | F_TX_SHOVE);
 
 	/* Populate the TLS header */
 	out = (void *)(tx_data + 1);
 	if (offset == 0) {
 		memcpy(out, m_tls->m_epg_hdr, m_tls->m_epg_hdrlen);
 		out += m_tls->m_epg_hdrlen;
 	}
 
 	/* AES IV for a short record. */
 	if (plen == tlen) {
 		iv = out;
 		if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) {
 			memcpy(iv, tlsp->keyctx.u.txhdr.txsalt, SALT_SIZE);
 			memcpy(iv + 4, hdr + 1, 8);
 			*(uint32_t *)(iv + 12) = htobe32(2 +
 			    offset / AES_BLOCK_LEN);
 		} else
 			memcpy(iv, hdr + 1, AES_BLOCK_LEN);
 		out += AES_BLOCK_LEN;
 	}
 
 	if (imm_len % 16 != 0) {
 		/* Zero pad to an 8-byte boundary. */
 		memset(out, 0, 8 - (imm_len % 8));
 		out += 8 - (imm_len % 8);
 
 		/*
 		 * Insert a ULP_TX_SC_NOOP if needed so the SGL is
 		 * 16-byte aligned.
 		 */
 		if (imm_len % 16 <= 8) {
 			idata = (void *)out;
 			idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 			idata->len = htobe32(0);
 			out = (void *)(idata + 1);
 		}
 	}
 
 	/* SGL for record payload */
 	sglist_reset(txq->gl);
 	if (sglist_append_mbuf_epg(txq->gl, m_tls, m_tls->m_epg_hdrlen + offset,
 	    plen - (m_tls->m_epg_hdrlen + offset)) != 0) {
 #ifdef INVARIANTS
 		panic("%s: failed to append sglist", __func__);
 #endif
 	}
 	write_gl_to_buf(txq->gl, out);
 
 	if (using_scratch) {
 		out = dst;
 		copy_to_txd(eq, txq->ss, &out, wr_len);
 	}
 
 	ndesc += howmany(wr_len, EQ_ESIZE);
 	MPASS(ndesc <= available);
 
 	txq->kern_tls_records++;
 	txq->kern_tls_octets += tlen - mtod(m_tls, vm_offset_t);
 	if (mtod(m_tls, vm_offset_t) != 0) {
 		if (offset == 0)
 			txq->kern_tls_waste += mtod(m_tls, vm_offset_t);
 		else
 			txq->kern_tls_waste += mtod(m_tls, vm_offset_t) -
 			    (m_tls->m_epg_hdrlen + offset);
 	}
 
 	txsd = &txq->sdesc[pidx];
 	if (last_wr)
 		txsd->m = m;
 	else
 		txsd->m = NULL;
 	txsd->desc_used = howmany(wr_len, EQ_ESIZE);
 
 	return (ndesc);
 }
 
 /*
  * Write a FW_ETH_TX_PKT_WR that carries the contents of the header mbuf
  * 'm' as immediate data, with the IP length field recomputed for this
  * packet and the TCP sequence number replaced by 'tcp_seqno'.  The rest of
  * the TCP header (including its flags) is copied unchanged.  The caller
  * uses this to emit the FIN for a chain once its TLS records are queued.
  *
  * 'dst' is where the work request starts, 'available' is the number of
  * descriptors the caller still has (only checked by assertion), and 'pidx'
  * selects the software descriptor that records 'm'.
  *
  * Returns the number of hardware descriptors consumed.
  */
 static int
 ktls_write_tcp_fin(struct sge_txq *txq, void *dst, struct mbuf *m,
     u_int available, tcp_seq tcp_seqno, u_int pidx)
 {
 	struct tx_sdesc *txsd;
 	struct fw_eth_tx_pkt_wr *wr;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen;
 	struct ether_header *eh;
 	struct ip *ip, newip;
 	struct ip6_hdr *ip6, newip6;
 	struct tcphdr *tcp, newtcp;
 	caddr_t out;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m);
 
 	/* Only the first mbuf (the headers) is sent; it is all immediate. */
 	wr = dst;
 	pktlen = m->m_len;
 	ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen;
 	len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16);
 	ndesc = tx_len16_to_desc(len16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	cpl = (void *)(wr + 1);
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 
 	out = (void *)(cpl + 1);
 
 	/* Copy over Ethernet header. */
 	eh = mtod(m, struct ether_header *);
 	copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen);
 
 	/* Fixup length in IP header and copy out. */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip = *ip;
 		/* IPv4 total length covers the IP header and all after it. */
 		newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen);
 		copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip));
 		/* Copy any IPv4 options that follow the fixed header. */
 		if (m->m_pkthdr.l3hlen > sizeof(*ip))
 			copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out,
 			    m->m_pkthdr.l3hlen - sizeof(*ip));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	} else {
 		ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip6 = *ip6;
 		/*
 		 * Unlike ip_len, the IPv6 payload length counts only what
 		 * follows the fixed IPv6 header, so the header itself must
 		 * be excluded.
 		 */
 		newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen -
 		    sizeof(*ip6));
 		copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6));
 		MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	}
 	cpl->ctrl1 = htobe64(ctrl1);
 	txq->txcsum++;
 
 	/* Set sequence number in TCP header. */
 	tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen);
 	newtcp = *tcp;
 	newtcp.th_seq = htonl(tcp_seqno);
 	copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp));
 
 	/* Copy rest of packet. */
 	copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, m->m_len -
 	    (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp)));
 	txq->imm_wrs++;
 
 	txq->txpkt_wrs++;
 
 	txq->kern_tls_fin++;
 
 	/* Record the mbuf against this work request's software descriptor. */
 	txsd = &txq->sdesc[pidx];
 	txsd->m = m;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write the work requests for one KTLS packet chain.  'm' is the header
  * mbuf (Ethernet, IP and TCP headers, with a send tag attached) and every
  * following mbuf is an M_EXTPG TLS record.  In order, this may write:
  *
  *  - a TCP options packet, when the header has options and no FIN;
  *  - a work request from t4_l2t_alloc_tls() when the cached L2T entry is
  *    missing or no longer matches the VLAN tag / destination MAC;
  *  - one work request per TLS record;
  *  - a FIN packet, when the TCP header has TH_FIN set.
  *
  * 'dst' is the first free descriptor, 'nsegs' applies to the first TLS
  * record only, and 'available' is the number of descriptors that may be
  * used.  Returns the total number of descriptors consumed.
  */
 int
 t6_ktls_write_wr(struct sge_txq *txq, void *dst, struct mbuf *m, u_int nsegs,
     u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct tlspcb *tlsp;
 	struct tcphdr *tcp;
 	struct mbuf *m_tls;
 	struct ether_header *eh;
 	tcp_seq tcp_seqno;
 	u_int ndesc, pidx, totdesc;
 	uint16_t vlan_tag;
 	bool has_fin, set_l2t_idx;
 	void *tsopt;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.snd_tag != NULL);
 	tlsp = mst_to_tls(m->m_pkthdr.snd_tag);
 
 	totdesc = 0;
 	eh = mtod(m, struct ether_header *);
 	tcp = (struct tcphdr *)((char *)eh + m->m_pkthdr.l2hlen +
 	    m->m_pkthdr.l3hlen);
 	/* Work on a local copy of the producer index. */
 	pidx = eq->pidx;
 	has_fin = (tcp->th_flags & TH_FIN) != 0;
 
 	/*
 	 * If this TLS record has a FIN, then we will send any
 	 * requested options as part of the FIN packet.
 	 */
 	if (!has_fin && ktls_has_tcp_options(tcp)) {
 		ndesc = ktls_write_tcp_options(txq, dst, m, available, pidx);
 		totdesc += ndesc;
 		IDXINCR(pidx, ndesc, eq->sidx);
 		dst = &eq->desc[pidx];
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d wrote TCP options packet", __func__,
 		    tlsp->tid);
 #endif
 	}
 
 	/*
 	 * Allocate a new L2T entry if necessary.  This may write out
 	 * a work request to the txq.
 	 */
 	if (m->m_flags & M_VLANTAG)
 		vlan_tag = m->m_pkthdr.ether_vtag;
 	else
 		vlan_tag = 0xfff;
 	set_l2t_idx = false;
 	if (tlsp->l2te == NULL || tlsp->l2te->vlan != vlan_tag ||
 	    memcmp(tlsp->l2te->dmac, eh->ether_dhost, ETHER_ADDR_LEN) != 0) {
 		set_l2t_idx = true;
 		if (tlsp->l2te)
 			t4_l2t_release(tlsp->l2te);
 		tlsp->l2te = t4_l2t_alloc_tls(tlsp->sc, txq, dst, &ndesc,
 		    vlan_tag, tlsp->vi->pi->lport, eh->ether_dhost);
 		if (tlsp->l2te == NULL)
 			CXGBE_UNIMPLEMENTED("failed to allocate TLS L2TE");
 		if (ndesc != 0) {
 			MPASS(ndesc <= available - totdesc);
 
 			txq->raw_wrs++;
 			txsd = &txq->sdesc[pidx];
 			txsd->m = NULL;
 			txsd->desc_used = ndesc;
 			totdesc += ndesc;
 			IDXINCR(pidx, ndesc, eq->sidx);
 			dst = &eq->desc[pidx];
 		}
 	}
 
 	/*
 	 * Iterate over each TLS record constructing a work request
 	 * for that record.
 	 */
 	for (m_tls = m->m_next; m_tls != NULL; m_tls = m_tls->m_next) {
 		MPASS(m_tls->m_flags & M_EXTPG);
 
 		/*
 		 * Determine the initial TCP sequence number for this
 		 * record.
 		 */
 		tsopt = NULL;
 		if (m_tls == m->m_next) {
 			tcp_seqno = ntohl(tcp->th_seq) -
 			    mtod(m_tls, vm_offset_t);
 			if (tlsp->using_timestamps)
 				tsopt = ktls_find_tcp_timestamps(tcp);
 		} else {
 			MPASS(mtod(m_tls, vm_offset_t) == 0);
 			tcp_seqno = tlsp->prev_seq;
 		}
 
 		ndesc = ktls_write_tls_wr(tlsp, txq, dst, m, tcp, m_tls,
 		    nsegs, available - totdesc, tcp_seqno, tsopt, pidx,
 		    set_l2t_idx);
 		totdesc += ndesc;
 		IDXINCR(pidx, ndesc, eq->sidx);
 		dst = &eq->desc[pidx];
 
 		/*
 		 * The value of nsegs from the header mbuf's metadata
 		 * is only valid for the first TLS record.
 		 */
 		nsegs = 0;
 
 		/* Only need to set the L2T index once. */
 		set_l2t_idx = false;
 	}
 
 	if (has_fin) {
 		/*
 		 * If the TCP header for this chain has FIN sent, then
 		 * explicitly send a packet that has FIN set.  This
 		 * will also have PUSH set if requested.  This assumes
 		 * we sent at least one TLS record work request and
 		 * uses the TCP sequence number after that request as
 		 * the sequence number for the FIN packet.
 		 *
 		 * Pass only the descriptors that are still unused so the
 		 * callee's space assertion is checked against what is
 		 * really left, as is done for the TLS records above.
 		 */
 		ndesc = ktls_write_tcp_fin(txq, dst, m, available - totdesc,
 		    tlsp->prev_seq, pidx);
 		totdesc += ndesc;
 	}
 
 	MPASS(totdesc <= available);
 	return (totdesc);
 }
 
 /*
  * Tear down a TLS send tag: drop the L2T entry, release the tid, release
  * the CLIP entry, free the TX key id, and finally free the tlspcb itself.
  * Each resource is released only if the tag actually holds it.
  */
-void
+static void
 cxgbe_tls_tag_free(struct m_snd_tag *mst)
 {
 	struct adapter *sc;
 	struct tlspcb *tlsp;
 
 	tlsp = mst_to_tls(mst);
 	sc = tlsp->sc;
 
 	CTR2(KTR_CXGBE, "%s: tid %d", __func__, tlsp->tid);
 
 	if (tlsp->l2te)
 		t4_l2t_release(tlsp->l2te);
 	/* NOTE(review): a negative tid presumably means "never allocated". */
 	if (tlsp->tid >= 0)
 		release_tid(sc, tlsp->tid, tlsp->ctrlq);
 	if (tlsp->ce)
 		t4_release_clip_entry(sc, tlsp->ce);
 	/* NOTE(review): likewise a negative tx_key_addr looks like "no key". */
 	if (tlsp->tx_key_addr >= 0)
 		t4_free_tls_keyid(sc, tlsp->tx_key_addr);
 
 	/*
 	 * zfree() rather than free(): presumably so the key context kept in
 	 * the tlspcb is scrubbed before the memory is returned -- confirm.
 	 */
 	zfree(tlsp, M_CXGBE);
 }
 
 /*
  * Module load hook: install ktls_act_open_rpl() as the shared
  * CPL_ACT_OPEN_RPL handler for the CPL_COOKIE_KERN_TLS cookie.
  */
 void
 t6_ktls_modload(void)
 {
 
 	t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, ktls_act_open_rpl,
 	    CPL_COOKIE_KERN_TLS);
 }
 
 /*
  * Module unload hook: undo t6_ktls_modload() by registering a NULL
  * handler for CPL_ACT_OPEN_RPL under the CPL_COOKIE_KERN_TLS cookie.
  */
 void
 t6_ktls_modunload(void)
 {
 
 	t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, NULL,
 	    CPL_COOKIE_KERN_TLS);
 }
 
 #else
 
 /*
  * Stub for the #else side of this file's conditional (the matching #if is
  * outside this view; presumably the build without kernel TLS -- confirm).
  * TLS send tags cannot be allocated here, so always fail with ENXIO.
  */
 int
 cxgbe_tls_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
     struct m_snd_tag **pt)
 {
 	return (ENXIO);
 }
 
 /* Stub for the #else build: no packet can be parsed, always EINVAL. */
 int
 t6_ktls_parse_pkt(struct mbuf *m, int *nsegsp, int *len16p)
 {
 	return (EINVAL);
 }
 
 /*
  * Stub for the #else build.  The stub tag allocator always fails, so no
  * mbuf should ever carry a TLS tag that leads here; panic if one does.
  */
 int
 t6_ktls_write_wr(struct sge_txq *txq, void *dst, struct mbuf *m, u_int nsegs,
     u_int available)
 {
 	panic("can't happen");
 }
 
 /*
  * Stub for the #else build.  No TLS tag is ever handed out by the stub
  * allocator, so there is nothing that could be freed; panic if called.
  */
 void
 cxgbe_tls_tag_free(struct m_snd_tag *mst)
 {
 	panic("can't happen");
 }
 
 /* Stub for the #else build: nothing to register at module load. */
 void
 t6_ktls_modload(void)
 {
 }
 
 /* Stub for the #else build: nothing to unregister at module unload. */
 void
 t6_ktls_modunload(void)
 {
 }
 
 #endif
diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c
index af24977ec29b..f728ddf5b212 100644
--- a/sys/dev/cxgbe/t4_main.c
+++ b/sys/dev/cxgbe/t4_main.c
@@ -1,13131 +1,13073 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/priv.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/eventhandler.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/taskqueue.h>
 #include <sys/pciio.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pci_private.h>
 #include <sys/firmware.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_dl.h>
 #include <net/if_vlan_var.h>
 #ifdef RSS
 #include <net/rss_config.h>
 #endif
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #ifdef KERN_TLS
 #include <netinet/tcp_seq.h>
 #endif
 #if defined(__i386__) || defined(__amd64__)
 #include <machine/md_var.h>
 #include <machine/cputypes.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #endif
 #ifdef DDB
 #include <ddb/ddb.h>
 #include <ddb/db_lex.h>
 #endif
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "cudbg/cudbg.h"
 #include "t4_clip.h"
 #include "t4_ioctl.h"
 #include "t4_l2t.h"
 #include "t4_mp_ring.h"
 #include "t4_if.h"
 #include "t4_smt.h"
 
 /*
  * T4 bus driver interface: device, bus and t4-specific methods for the
  * "t4nex" driver, whose softc is a struct adapter.
  */
 static int t4_probe(device_t);
 static int t4_attach(device_t);
 static int t4_detach(device_t);
 static int t4_child_location(device_t, device_t, struct sbuf *);
 static int t4_ready(device_t);
 static int t4_read_port_device(device_t, int, device_t *);
 static int t4_suspend(device_t);
 static int t4_resume(device_t);
 static int t4_reset_prepare(device_t, device_t);
 static int t4_reset_post(device_t, device_t);
 static device_method_t t4_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		t4_probe),
 	DEVMETHOD(device_attach,	t4_attach),
 	DEVMETHOD(device_detach,	t4_detach),
 	DEVMETHOD(device_suspend,	t4_suspend),
 	DEVMETHOD(device_resume,	t4_resume),
 
 	/* Bus interface */
 	DEVMETHOD(bus_child_location,	t4_child_location),
 	DEVMETHOD(bus_reset_prepare, 	t4_reset_prepare),
 	DEVMETHOD(bus_reset_post, 	t4_reset_post),
 
 	/* t4 interface */
 	DEVMETHOD(t4_is_main_ready,	t4_ready),
 	DEVMETHOD(t4_read_port_device,	t4_read_port_device),
 
 	DEVMETHOD_END
 };
 static driver_t t4_driver = {
 	"t4nex",
 	t4_methods,
 	sizeof(struct adapter)
 };
 
 
 /* T4 port (cxgbe) interface */
 static int cxgbe_probe(device_t);
 static int cxgbe_attach(device_t);
 static int cxgbe_detach(device_t);
 device_method_t cxgbe_methods[] = {
 	DEVMETHOD(device_probe,		cxgbe_probe),
 	DEVMETHOD(device_attach,	cxgbe_attach),
 	DEVMETHOD(device_detach,	cxgbe_detach),
 	{ 0, 0 }
 };
 static driver_t cxgbe_driver = {
 	"cxgbe",
 	cxgbe_methods,
 	sizeof(struct port_info)
 };
 
 /* T4 VI (vcxgbe) interface */
 static int vcxgbe_probe(device_t);
 static int vcxgbe_attach(device_t);
 static int vcxgbe_detach(device_t);
 static device_method_t vcxgbe_methods[] = {
 	DEVMETHOD(device_probe,		vcxgbe_probe),
 	DEVMETHOD(device_attach,	vcxgbe_attach),
 	DEVMETHOD(device_detach,	vcxgbe_detach),
 	{ 0, 0 }
 };
 static driver_t vcxgbe_driver = {
 	"vcxgbe",
 	vcxgbe_methods,
 	sizeof(struct vi_info)
 };
 
 static d_ioctl_t t4_ioctl;
 
 /* Character device switch named "t4nex"; ioctl is the only entry point. */
 static struct cdevsw t4_cdevsw = {
        .d_version = D_VERSION,
        .d_ioctl = t4_ioctl,
        .d_name = "t4nex",
 };
 
 /*
  * T5 bus driver interface.  Only the probe routine is T5-specific; every
  * other method is the T4 implementation.  The softc is a struct adapter.
  */
 static int t5_probe(device_t);
 static device_method_t t5_methods[] = {
 	DEVMETHOD(device_probe,		t5_probe),
 	DEVMETHOD(device_attach,	t4_attach),
 	DEVMETHOD(device_detach,	t4_detach),
 	DEVMETHOD(device_suspend,	t4_suspend),
 	DEVMETHOD(device_resume,	t4_resume),
 
 	DEVMETHOD(bus_child_location,	t4_child_location),
 	DEVMETHOD(bus_reset_prepare, 	t4_reset_prepare),
 	DEVMETHOD(bus_reset_post, 	t4_reset_post),
 
 	DEVMETHOD(t4_is_main_ready,	t4_ready),
 	DEVMETHOD(t4_read_port_device,	t4_read_port_device),
 
 	DEVMETHOD_END
 };
 static driver_t t5_driver = {
 	"t5nex",
 	t5_methods,
 	sizeof(struct adapter)
 };
 
 
 /* T5 port (cxl) interface: reuses cxgbe_methods under the name "cxl". */
 static driver_t cxl_driver = {
 	"cxl",
 	cxgbe_methods,
 	sizeof(struct port_info)
 };
 
 /* T5 VI (vcxl) interface: reuses vcxgbe_methods under the name "vcxl". */
 static driver_t vcxl_driver = {
 	"vcxl",
 	vcxgbe_methods,
 	sizeof(struct vi_info)
 };
 
 /*
  * T6 bus driver interface.  Only the probe routine is T6-specific; every
  * other method is the T4 implementation.  The softc is a struct adapter.
  */
 static int t6_probe(device_t);
 static device_method_t t6_methods[] = {
 	DEVMETHOD(device_probe,		t6_probe),
 	DEVMETHOD(device_attach,	t4_attach),
 	DEVMETHOD(device_detach,	t4_detach),
 	DEVMETHOD(device_suspend,	t4_suspend),
 	DEVMETHOD(device_resume,	t4_resume),
 
 	DEVMETHOD(bus_child_location,	t4_child_location),
 	DEVMETHOD(bus_reset_prepare, 	t4_reset_prepare),
 	DEVMETHOD(bus_reset_post, 	t4_reset_post),
 
 	DEVMETHOD(t4_is_main_ready,	t4_ready),
 	DEVMETHOD(t4_read_port_device,	t4_read_port_device),
 
 	DEVMETHOD_END
 };
 static driver_t t6_driver = {
 	"t6nex",
 	t6_methods,
 	sizeof(struct adapter)
 };
 
 
 /* T6 port (cc) interface: reuses cxgbe_methods under the name "cc". */
 static driver_t cc_driver = {
 	"cc",
 	cxgbe_methods,
 	sizeof(struct port_info)
 };
 
 /* T6 VI (vcc) interface: reuses vcxgbe_methods under the name "vcc". */
 static driver_t vcc_driver = {
 	"vcc",
 	vcxgbe_methods,
 	sizeof(struct vi_info)
 };
 
 /* ifnet interface */
 static void cxgbe_init(void *);
 static int cxgbe_ioctl(struct ifnet *, unsigned long, caddr_t);
 static int cxgbe_transmit(struct ifnet *, struct mbuf *);
 static void cxgbe_qflush(struct ifnet *);
 #if defined(KERN_TLS) || defined(RATELIMIT)
 static int cxgbe_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *,
     struct m_snd_tag **);
-static int cxgbe_snd_tag_modify(struct m_snd_tag *,
-    union if_snd_tag_modify_params *);
-static int cxgbe_snd_tag_query(struct m_snd_tag *,
-    union if_snd_tag_query_params *);
-static void cxgbe_snd_tag_free(struct m_snd_tag *);
 #endif
 
 MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4/T5 Ethernet driver and services");
 
 /*
  * Correct lock order when you need to acquire multiple locks is t4_list_lock,
  * then ADAPTER_LOCK, then t4_uld_list_lock.
  */
 static struct sx t4_list_lock;
 SLIST_HEAD(, adapter) t4_list;
 #ifdef TCP_OFFLOAD
 static struct sx t4_uld_list_lock;
 SLIST_HEAD(, uld_info) t4_uld_list;
 #endif
 
 /*
  * Tunables.  See tweak_tunables() too.
  *
  * Each tunable is set to a default value here if it's known at compile-time.
  * Otherwise it is set to -n as an indication to tweak_tunables() that it should
  * provide a reasonable default (up to n) when the driver is loaded.
  *
  * Tunables applicable to both T4 and T5 are under hw.cxgbe.  Those specific to
  * T5 are under hw.cxl.
  */
 SYSCTL_NODE(_hw, OID_AUTO, cxgbe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "cxgbe(4) parameters");
 SYSCTL_NODE(_hw, OID_AUTO, cxl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "cxgbe(4) T5+ parameters");
 SYSCTL_NODE(_hw_cxgbe, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "cxgbe(4) TOE parameters");
 
 /*
  * Number of queues for tx and rx, NIC and offload.
  */
 #define NTXQ 16
 int t4_ntxq = -NTXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, ntxq, CTLFLAG_RDTUN, &t4_ntxq, 0,
     "Number of TX queues per port");
 TUNABLE_INT("hw.cxgbe.ntxq10g", &t4_ntxq);	/* Old name, undocumented */
 
 #define NRXQ 8
 int t4_nrxq = -NRXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nrxq, CTLFLAG_RDTUN, &t4_nrxq, 0,
     "Number of RX queues per port");
 TUNABLE_INT("hw.cxgbe.nrxq10g", &t4_nrxq);	/* Old name, undocumented */
 
 #define NTXQ_VI 1
 static int t4_ntxq_vi = -NTXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, ntxq_vi, CTLFLAG_RDTUN, &t4_ntxq_vi, 0,
     "Number of TX queues per VI");
 
 #define NRXQ_VI 1
 static int t4_nrxq_vi = -NRXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nrxq_vi, CTLFLAG_RDTUN, &t4_nrxq_vi, 0,
     "Number of RX queues per VI");
 
 /*
  * When non-zero, TX queue 0 of each VI is reserved for packets that carry
  * no flowid.  Disabled by default.
  */
 static int t4_rsrv_noflowq = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, rsrv_noflowq, CTLFLAG_RDTUN, &t4_rsrv_noflowq,
     0, "Reserve TX queue 0 of each VI for non-flowid packets");
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 #define NOFLDTXQ 8
 static int t4_nofldtxq = -NOFLDTXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq, CTLFLAG_RDTUN, &t4_nofldtxq, 0,
     "Number of offload TX queues per port");
 
 #define NOFLDRXQ 2
 static int t4_nofldrxq = -NOFLDRXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq, CTLFLAG_RDTUN, &t4_nofldrxq, 0,
     "Number of offload RX queues per port");
 
 #define NOFLDTXQ_VI 1
 static int t4_nofldtxq_vi = -NOFLDTXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq_vi, CTLFLAG_RDTUN, &t4_nofldtxq_vi, 0,
     "Number of offload TX queues per VI");
 
 #define NOFLDRXQ_VI 1
 static int t4_nofldrxq_vi = -NOFLDRXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq_vi, CTLFLAG_RDTUN, &t4_nofldrxq_vi, 0,
     "Number of offload RX queues per VI");
 
 #define TMR_IDX_OFLD 1
 int t4_tmr_idx_ofld = TMR_IDX_OFLD;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_timer_idx_ofld, CTLFLAG_RDTUN,
     &t4_tmr_idx_ofld, 0, "Holdoff timer index for offload queues");
 
 #define PKTC_IDX_OFLD (-1)
 int t4_pktc_idx_ofld = PKTC_IDX_OFLD;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_pktc_idx_ofld, CTLFLAG_RDTUN,
     &t4_pktc_idx_ofld, 0, "holdoff packet counter index for offload queues");
 
 /* 0 means chip/fw default, non-zero number is value in microseconds */
 static u_long t4_toe_keepalive_idle = 0;
 SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, keepalive_idle, CTLFLAG_RDTUN,
     &t4_toe_keepalive_idle, 0, "TOE keepalive idle timer (us)");
 
 /* 0 means chip/fw default, non-zero number is value in microseconds */
 static u_long t4_toe_keepalive_interval = 0;
 SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, keepalive_interval, CTLFLAG_RDTUN,
     &t4_toe_keepalive_interval, 0, "TOE keepalive interval timer (us)");
 
 /* 0 means chip/fw default, non-zero number is # of keepalives before abort */
 static int t4_toe_keepalive_count = 0;
 SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, keepalive_count, CTLFLAG_RDTUN,
     &t4_toe_keepalive_count, 0, "Number of TOE keepalive probes before abort");
 
 /* 0 means chip/fw default, non-zero number is value in microseconds */
 static u_long t4_toe_rexmt_min = 0;
 SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, rexmt_min, CTLFLAG_RDTUN,
     &t4_toe_rexmt_min, 0, "Minimum TOE retransmit interval (us)");
 
 /* 0 means chip/fw default, non-zero number is value in microseconds */
 static u_long t4_toe_rexmt_max = 0;
 SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, rexmt_max, CTLFLAG_RDTUN,
     &t4_toe_rexmt_max, 0, "Maximum TOE retransmit interval (us)");
 
 /* 0 means chip/fw default, non-zero number is # of rexmt before abort */
 static int t4_toe_rexmt_count = 0;
 SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, rexmt_count, CTLFLAG_RDTUN,
     &t4_toe_rexmt_count, 0, "Number of TOE retransmissions before abort");
 
 /* -1 means chip/fw default, other values are raw backoff values to use */
 static int t4_toe_rexmt_backoff[16] = {
 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
 };
 SYSCTL_NODE(_hw_cxgbe_toe, OID_AUTO, rexmt_backoff,
     CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "cxgbe(4) TOE retransmit backoff values");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 0, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[0], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 1, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[1], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 2, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[2], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 3, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[3], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 4, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[4], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 5, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[5], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 6, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[6], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 7, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[7], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 8, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[8], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 9, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[9], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 10, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[10], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 11, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[11], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 12, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[12], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 13, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[13], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 14, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[14], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 15, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[15], 0, "");
 
 /* Seconds before a TLS socket is downgraded to plain TOE; default is 5. */
 static int t4_toe_tls_rx_timeout = 5;
 SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, tls_rx_timeout, CTLFLAG_RDTUN,
     &t4_toe_tls_rx_timeout, 0,
     "Timeout in seconds to downgrade TLS sockets to plain TOE");
 #endif
 
 #ifdef DEV_NETMAP
 #define NN_MAIN_VI	(1 << 0)	/* Native netmap on the main VI */
 #define NN_EXTRA_VI	(1 << 1)	/* Native netmap on the extra VI(s) */
 static int t4_native_netmap = NN_EXTRA_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, native_netmap, CTLFLAG_RDTUN, &t4_native_netmap,
     0, "Native netmap support.  bit 0 = main VI, bit 1 = extra VIs");
 
 #define NNMTXQ 8
 static int t4_nnmtxq = -NNMTXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmtxq, CTLFLAG_RDTUN, &t4_nnmtxq, 0,
     "Number of netmap TX queues");
 
 #define NNMRXQ 8
 static int t4_nnmrxq = -NNMRXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmrxq, CTLFLAG_RDTUN, &t4_nnmrxq, 0,
     "Number of netmap RX queues");
 
 #define NNMTXQ_VI 2
 static int t4_nnmtxq_vi = -NNMTXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmtxq_vi, CTLFLAG_RDTUN, &t4_nnmtxq_vi, 0,
     "Number of netmap TX queues per VI");
 
 #define NNMRXQ_VI 2
 static int t4_nnmrxq_vi = -NNMRXQ_VI;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmrxq_vi, CTLFLAG_RDTUN, &t4_nnmrxq_vi, 0,
     "Number of netmap RX queues per VI");
 #endif
 
 /*
  * Holdoff parameters for ports.
  */
 #define TMR_IDX 1
 int t4_tmr_idx = TMR_IDX;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_timer_idx, CTLFLAG_RDTUN, &t4_tmr_idx,
     0, "Holdoff timer index");
 TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_10G", &t4_tmr_idx);	/* Old name */
 
 #define PKTC_IDX (-1)
 int t4_pktc_idx = PKTC_IDX;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_pktc_idx, CTLFLAG_RDTUN, &t4_pktc_idx,
     0, "Holdoff packet counter index");
 TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_10G", &t4_pktc_idx);	/* Old name */
 
 /*
  * Size (# of entries) of each tx and rx queue.
  */
 unsigned int t4_qsize_txq = TX_EQ_QSIZE;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, qsize_txq, CTLFLAG_RDTUN, &t4_qsize_txq, 0,
     "Number of descriptors in each TX queue");
 
 unsigned int t4_qsize_rxq = RX_IQ_QSIZE;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, qsize_rxq, CTLFLAG_RDTUN, &t4_qsize_rxq, 0,
     "Number of descriptors in each RX queue");
 
 /*
  * Interrupt types allowed (bits 0, 1, 2 = INTx, MSI, MSI-X respectively).
  */
 int t4_intr_types = INTR_MSIX | INTR_MSI | INTR_INTX;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, interrupt_types, CTLFLAG_RDTUN, &t4_intr_types,
     0, "Interrupt types allowed (bit 0 = INTx, 1 = MSI, 2 = MSI-X)");
 
 /*
  * Configuration file.  All the _CF names here are special.
  */
 #define DEFAULT_CF	"default"
 #define BUILTIN_CF	"built-in"
 #define FLASH_CF	"flash"
 #define UWIRE_CF	"uwire"
 #define FPGA_CF		"fpga"
 static char t4_cfg_file[32] = DEFAULT_CF;
 SYSCTL_STRING(_hw_cxgbe, OID_AUTO, config_file, CTLFLAG_RDTUN, t4_cfg_file,
     sizeof(t4_cfg_file), "Firmware configuration file");
 
 /*
  * PAUSE settings (bit 0, 1, 2 = rx_pause, tx_pause, pause_autoneg respectively).
  * rx_pause = 1 to heed incoming PAUSE frames, 0 to ignore them.
  * tx_pause = 1 to emit PAUSE frames when the rx FIFO reaches its high water
  *            mark or when signalled to do so, 0 to never emit PAUSE.
  * pause_autoneg = 1 means PAUSE will be negotiated if possible and the
  *                 negotiated settings will override rx_pause/tx_pause.
  *                 Otherwise rx_pause/tx_pause are applied forcibly.
  */
 static int t4_pause_settings = PAUSE_RX | PAUSE_TX | PAUSE_AUTONEG;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, pause_settings, CTLFLAG_RDTUN,
     &t4_pause_settings, 0,
     "PAUSE settings (bit 0 = rx_pause, 1 = tx_pause, 2 = pause_autoneg)");
 
 /*
  * Forward Error Correction settings (bit 0, 1 = RS, BASER respectively).
  * -1 to run with the firmware default.  Same as FEC_AUTO (bit 5)
  *  0 to disable FEC.
  */
 static int t4_fec = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fec, CTLFLAG_RDTUN, &t4_fec, 0,
     "Forward Error Correction (bit 0 = RS, bit 1 = BASER_RS)");
 
 /*
  * Link autonegotiation.
  * -1 to run with the firmware default.
  *  0 to disable.
  *  1 to enable.
  */
 static int t4_autoneg = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, autoneg, CTLFLAG_RDTUN, &t4_autoneg, 0,
     "Link autonegotiation");
 
 /*
  * Firmware auto-install by driver during attach (0, 1, 2 = prohibited, allowed,
  * encouraged respectively).  '-n' is the same as 'n' except the firmware
  * version used in the checks is read from the firmware bundled with the driver.
  */
 static int t4_fw_install = 1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fw_install, CTLFLAG_RDTUN, &t4_fw_install, 0,
     "Firmware auto-install (0 = prohibited, 1 = allowed, 2 = encouraged)");
 
 /*
  * ASIC features that will be used.  Disable the ones you don't want so that the
  * chip resources aren't wasted on features that will not be used.
  *
  * One tunable per capability class, all boot-time only (CTLFLAG_RDTUN).  The
  * switch and NIC defaults are ORs of FW_CAPS_CONFIG_* bits; presumably the
  * other classes are masks of the same kind.  How a value of -1 is interpreted
  * is not visible in this part of the file -- TODO confirm where it is resolved.
  */
 static int t4_nbmcaps_allowed = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nbmcaps_allowed, CTLFLAG_RDTUN,
     &t4_nbmcaps_allowed, 0, "Default NBM capabilities");
 
 static int t4_linkcaps_allowed = 0;	/* No DCBX, PPP, etc. by default */
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, linkcaps_allowed, CTLFLAG_RDTUN,
     &t4_linkcaps_allowed, 0, "Default link capabilities");
 
 /* Both ingress and egress switching are allowed by default. */
 static int t4_switchcaps_allowed = FW_CAPS_CONFIG_SWITCH_INGRESS |
     FW_CAPS_CONFIG_SWITCH_EGRESS;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, switchcaps_allowed, CTLFLAG_RDTUN,
     &t4_switchcaps_allowed, 0, "Default switch capabilities");
 
 /*
  * NIC and hash filter are always in the default mask; ETHOFLD is added only
  * when the kernel is built with RATELIMIT.
  */
 #ifdef RATELIMIT
 static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC |
 	FW_CAPS_CONFIG_NIC_HASHFILTER | FW_CAPS_CONFIG_NIC_ETHOFLD;
 #else
 static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC |
 	FW_CAPS_CONFIG_NIC_HASHFILTER;
 #endif
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, niccaps_allowed, CTLFLAG_RDTUN,
     &t4_niccaps_allowed, 0, "Default NIC capabilities");
 
 /* The next four default to -1; see the note at the top of this group. */
 static int t4_toecaps_allowed = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, toecaps_allowed, CTLFLAG_RDTUN,
     &t4_toecaps_allowed, 0, "Default TCP offload capabilities");
 
 static int t4_rdmacaps_allowed = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, rdmacaps_allowed, CTLFLAG_RDTUN,
     &t4_rdmacaps_allowed, 0, "Default RDMA capabilities");
 
 static int t4_cryptocaps_allowed = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, cryptocaps_allowed, CTLFLAG_RDTUN,
     &t4_cryptocaps_allowed, 0, "Default crypto capabilities");
 
 static int t4_iscsicaps_allowed = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, iscsicaps_allowed, CTLFLAG_RDTUN,
     &t4_iscsicaps_allowed, 0, "Default iSCSI capabilities");
 
 /* FCoE is masked out entirely by default. */
 static int t4_fcoecaps_allowed = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fcoecaps_allowed, CTLFLAG_RDTUN,
     &t4_fcoecaps_allowed, 0, "Default FCoE capabilities");
 
 /*
  * NOTE(review): this one is registered under _hw_cxl, unlike its neighbours
  * which hang off _hw_cxgbe.  Presumably deliberate because the knob is
  * T5-specific -- confirm before "fixing" the parent node.
  */
 static int t5_write_combine = 0;
 SYSCTL_INT(_hw_cxl, OID_AUTO, write_combine, CTLFLAG_RDTUN, &t5_write_combine,
     0, "Use WC instead of UC for BAR2");
 
 /*
  * Number of VIs per port.  t4_attach() sizes each port's vi[] array with this
  * value.
  */
 static int t4_num_vis = 1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, num_vis, CTLFLAG_RDTUN, &t4_num_vis, 0,
     "Number of VIs per port");
 
 /*
  * PCIe Relaxed Ordering.
  * -1: driver should figure out a good value.
  * 0: disable RO.
  * 1: enable RO.
  * 2: leave RO alone.
  *
  * t4_attach() only rewrites the device control register for the values 0 and
  * 1; with any other value it leaves the RO enable bit as found.  Where -1 is
  * resolved to a concrete choice is not visible in this part of the file.
  */
 static int pcie_relaxed_ordering = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, pcie_relaxed_ordering, CTLFLAG_RDTUN,
     &pcie_relaxed_ordering, 0,
     "PCIe Relaxed Ordering: 0 = disable, 1 = enable, 2 = leave alone");
 
 /*
  * The three knobs below are CTLFLAG_RWTUN: they can be set as tunables at
  * boot and also changed through sysctl at runtime.
  */
 static int t4_panic_on_fatal_err = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, panic_on_fatal_err, CTLFLAG_RWTUN,
     &t4_panic_on_fatal_err, 0, "panic on fatal errors");
 
 static int t4_reset_on_fatal_err = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, reset_on_fatal_err, CTLFLAG_RWTUN,
     &t4_reset_on_fatal_err, 0, "reset adapter on fatal errors");
 
 static int t4_tx_vm_wr = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_vm_wr, CTLFLAG_RWTUN, &t4_tx_vm_wr, 0,
     "Use VM work requests to transmit packets.");
 
 /*
  * Set to non-zero to enable the attack filter.  A packet that matches any of
  * these conditions will get dropped on ingress:
  * 1) IP && source address == destination address.
  * 2) TCP/IP && source address is not a unicast address.
  * 3) TCP/IP && destination address is not a unicast address.
  * 4) IP && source address is loopback (127.x.y.z).
  * 5) IP && destination address is loopback (127.x.y.z).
  * 6) IPv6 && source address == destination address.
  * 7) IPv6 && source address is not a unicast address.
  * 8) IPv6 && source address is loopback (::1/128).
  * 9) IPv6 && destination address is loopback (::1/128).
  * 10) IPv6 && source address is unspecified (::/128).
  * 11) IPv6 && destination address is unspecified (::/128).
  * 12) TCP/IPv6 && source address is multicast (ff00::/8).
  * 13) TCP/IPv6 && destination address is multicast (ff00::/8).
  *
  * Off by default; boot-time tunable only (CTLFLAG_RDTUN).
  */
 static int t4_attack_filter = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, attack_filter, CTLFLAG_RDTUN,
     &t4_attack_filter, 0, "Drop suspicious traffic");
 
 /*
  * Ingress drop policies, all boot-time tunables.  Only the Layer 2 error drop
  * is enabled by default; fragments and L3/L4 errors are let through unless
  * the corresponding knob is set.
  */
 static int t4_drop_ip_fragments = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, drop_ip_fragments, CTLFLAG_RDTUN,
     &t4_drop_ip_fragments, 0, "Drop IP fragments");
 
 static int t4_drop_pkts_with_l2_errors = 1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, drop_pkts_with_l2_errors, CTLFLAG_RDTUN,
     &t4_drop_pkts_with_l2_errors, 0,
     "Drop all frames with Layer 2 length or checksum errors");
 
 static int t4_drop_pkts_with_l3_errors = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, drop_pkts_with_l3_errors, CTLFLAG_RDTUN,
     &t4_drop_pkts_with_l3_errors, 0,
     "Drop all frames with IP version, length, or checksum errors");
 
 static int t4_drop_pkts_with_l4_errors = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, drop_pkts_with_l4_errors, CTLFLAG_RDTUN,
     &t4_drop_pkts_with_l4_errors, 0,
     "Drop all frames with Layer 4 length, checksum, or other errors");
 
 #ifdef TCP_OFFLOAD
 /*
  * TOE tunables.  Only compiled in when the kernel has TCP_OFFLOAD.
  */
 static int t4_cop_managed_offloading = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, cop_managed_offloading, CTLFLAG_RDTUN,
     &t4_cop_managed_offloading, 0,
     "COP (Connection Offload Policy) controls all TOE offload");
 #endif
 
 #ifdef KERN_TLS
 /*
  * This enables KERN_TLS for all adapters if set.
  */
 static int t4_kern_tls = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, kern_tls, CTLFLAG_RDTUN, &t4_kern_tls, 0,
     "Enable KERN_TLS mode for all supported adapters");
 
 /* Parent node for the KERN_TLS knobs below (hw.cxgbe.tls.*). */
 SYSCTL_NODE(_hw_cxgbe, OID_AUTO, tls, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "cxgbe(4) KERN_TLS parameters");
 
 /* 0 (default): try card memory for keys; 1: always send keys inline. */
 static int t4_tls_inline_keys = 0;
 SYSCTL_INT(_hw_cxgbe_tls, OID_AUTO, inline_keys, CTLFLAG_RDTUN,
     &t4_tls_inline_keys, 0,
     "Always pass TLS keys in work requests (1) or attempt to store TLS keys "
     "in card memory.");
 
 /* Off by default. */
 static int t4_tls_combo_wrs = 0;
 SYSCTL_INT(_hw_cxgbe_tls, OID_AUTO, combo_wrs, CTLFLAG_RDTUN, &t4_tls_combo_wrs,
     0, "Attempt to combine TCB field updates with TLS record work requests.");
 #endif
 
 /*
  * Functions used by VIs to obtain unique MAC addresses for each VI.
  *
  * NOTE(review): presumably indexed by the VI's position within its port, so
  * the number of entries bounds how many VIs can get distinct addresses --
  * confirm against the code that consumes this table.
  */
 static int vi_mac_funcs[] = {
 	FW_VI_FUNC_ETH,
 	FW_VI_FUNC_OFLD,
 	FW_VI_FUNC_IWARP,
 	FW_VI_FUNC_OPENISCSI,
 	FW_VI_FUNC_OPENFCOE,
 	FW_VI_FUNC_FOISCSI,
 	FW_VI_FUNC_FOFCOE,
 };
 
 /*
  * Interrupt type and queue counts chosen for the adapter.  t4_attach() has
  * cfg_itype_and_nqueues() fill one of these in and then uses it to size the
  * SGE queue arrays and to hand out queue ranges to every VI.
  *
  * The first group of per-port counts applies to the main VI (vi[0]) of each
  * port; the *_vi group applies to each extra VI.
  */
 struct intrs_and_queues {
 	uint16_t intr_type;	/* INTx, MSI, or MSI-X */
 	uint16_t num_vis;	/* number of VIs for each port */
 	uint16_t nirq;		/* Total # of vectors */
 	uint16_t ntxq;		/* # of NIC txq's for each port */
 	uint16_t nrxq;		/* # of NIC rxq's for each port */
 	uint16_t nofldtxq;	/* # of TOE/ETHOFLD txq's for each port */
 	uint16_t nofldrxq;	/* # of TOE rxq's for each port */
 	uint16_t nnmtxq;	/* # of netmap txq's */
 	uint16_t nnmrxq;	/* # of netmap rxq's */
 
 	/* The vcxgbe/vcxl interfaces use these and not the ones above. */
 	uint16_t ntxq_vi;	/* # of NIC txq's */
 	uint16_t nrxq_vi;	/* # of NIC rxq's */
 	uint16_t nofldtxq_vi;	/* # of TOE txq's */
 	uint16_t nofldrxq_vi;	/* # of TOE rxq's */
 	uint16_t nnmtxq_vi;	/* # of netmap txq's */
 	uint16_t nnmrxq_vi;	/* # of netmap rxq's */
 };
 
 static void setup_memwin(struct adapter *);
 static void position_memwin(struct adapter *, int, uint32_t);
 static int validate_mem_range(struct adapter *, uint32_t, uint32_t);
 static int fwmtype_to_hwmtype(int);
 static int validate_mt_off_len(struct adapter *, int, uint32_t, uint32_t,
     uint32_t *);
 static int fixup_devlog_params(struct adapter *);
 static int cfg_itype_and_nqueues(struct adapter *, struct intrs_and_queues *);
 static int contact_firmware(struct adapter *);
 static int partition_resources(struct adapter *);
 static int get_params__pre_init(struct adapter *);
 static int set_params__pre_init(struct adapter *);
 static int get_params__post_init(struct adapter *);
 static int set_params__post_init(struct adapter *);
 static void t4_set_desc(struct adapter *);
 static bool fixed_ifmedia(struct port_info *);
 static void build_medialist(struct port_info *);
 static void init_link_config(struct port_info *);
 static int fixup_link_config(struct port_info *);
 static int apply_link_config(struct port_info *);
 static int cxgbe_init_synchronized(struct vi_info *);
 static int cxgbe_uninit_synchronized(struct vi_info *);
 static int adapter_full_init(struct adapter *);
 static void adapter_full_uninit(struct adapter *);
 static int vi_full_init(struct vi_info *);
 static void vi_full_uninit(struct vi_info *);
 static int alloc_extra_vi(struct adapter *, struct port_info *, struct vi_info *);
 static void quiesce_txq(struct sge_txq *);
 static void quiesce_wrq(struct sge_wrq *);
 static void quiesce_iq_fl(struct adapter *, struct sge_iq *, struct sge_fl *);
 static void quiesce_vi(struct vi_info *);
 static int t4_alloc_irq(struct adapter *, struct irq *, int rid,
     driver_intr_t *, void *, char *);
 static int t4_free_irq(struct adapter *, struct irq *);
 static void t4_init_atid_table(struct adapter *);
 static void t4_free_atid_table(struct adapter *);
 static void get_regs(struct adapter *, struct t4_regdump *, uint8_t *);
 static void vi_refresh_stats(struct vi_info *);
 static void cxgbe_refresh_stats(struct vi_info *);
 static void cxgbe_tick(void *);
 static void vi_tick(void *);
 static void cxgbe_sysctls(struct port_info *);
 static int sysctl_int_array(SYSCTL_HANDLER_ARGS);
 static int sysctl_bitfield_8b(SYSCTL_HANDLER_ARGS);
 static int sysctl_bitfield_16b(SYSCTL_HANDLER_ARGS);
 static int sysctl_btphy(SYSCTL_HANDLER_ARGS);
 static int sysctl_noflowq(SYSCTL_HANDLER_ARGS);
 static int sysctl_tx_vm_wr(SYSCTL_HANDLER_ARGS);
 static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS);
 static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS);
 static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS);
 static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS);
 static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS);
 static int sysctl_fec(SYSCTL_HANDLER_ARGS);
 static int sysctl_module_fec(SYSCTL_HANDLER_ARGS);
 static int sysctl_autoneg(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS);
 static int sysctl_temperature(SYSCTL_HANDLER_ARGS);
 static int sysctl_vdd(SYSCTL_HANDLER_ARGS);
 static int sysctl_reset_sensor(SYSCTL_HANDLER_ARGS);
 static int sysctl_loadavg(SYSCTL_HANDLER_ARGS);
 static int sysctl_cctrl(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS);
 static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_tid_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_devlog(SYSCTL_HANDLER_ARGS);
 static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS);
 static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS);
 static int sysctl_meminfo(SYSCTL_HANDLER_ARGS);
 static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS);
 static int sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS);
 static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS);
 static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_tids(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_tnl_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS);
 static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_cpus(SYSCTL_HANDLER_ARGS);
 static int sysctl_reset(SYSCTL_HANDLER_ARGS);
 #ifdef TCP_OFFLOAD
 static int sysctl_tls(SYSCTL_HANDLER_ARGS);
 static int sysctl_tls_rx_ports(SYSCTL_HANDLER_ARGS);
 static int sysctl_tls_rx_timeout(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_tick(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_timer(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_shift_cnt(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_backoff(SYSCTL_HANDLER_ARGS);
 static int sysctl_holdoff_tmr_idx_ofld(SYSCTL_HANDLER_ARGS);
 static int sysctl_holdoff_pktc_idx_ofld(SYSCTL_HANDLER_ARGS);
 #endif
 static int get_sge_context(struct adapter *, struct t4_sge_context *);
 static int load_fw(struct adapter *, struct t4_data *);
 static int load_cfg(struct adapter *, struct t4_data *);
 static int load_boot(struct adapter *, struct t4_bootrom *);
 static int load_bootcfg(struct adapter *, struct t4_data *);
 static int cudbg_dump(struct adapter *, struct t4_cudbg_dump *);
 static void free_offload_policy(struct t4_offload_policy *);
 static int set_offload_policy(struct adapter *, struct t4_offload_policy *);
 static int read_card_mem(struct adapter *, int, struct t4_mem_range *);
 static int read_i2c(struct adapter *, struct t4_i2c_data *);
 static int clear_stats(struct adapter *, u_int);
 static int hold_clip_addr(struct adapter *, struct t4_clip_addr *);
 static int release_clip_addr(struct adapter *, struct t4_clip_addr *);
 #ifdef TCP_OFFLOAD
 static int toe_capability(struct vi_info *, bool);
 static void t4_async_event(void *, int);
 #endif
 #ifdef KERN_TLS
 static int ktls_capability(struct adapter *, bool);
 #endif
 static int mod_event(module_t, int, void *);
 static int notify_siblings(device_t, int);
 static uint64_t vi_get_counter(struct ifnet *, ift_counter);
 static uint64_t cxgbe_get_counter(struct ifnet *, ift_counter);
 static void enable_vxlan_rx(struct adapter *);
 static void reset_adapter(void *, int);
 
 /*
  * PCI device id -> human readable description, one table per chip
  * generation.  t4_probe(), t5_probe() and t6_probe() scan the matching table
  * linearly and use the description as the device description on a hit.
  */
 struct {
 	uint16_t device;	/* PCI device id */
 	char *desc;		/* description handed to device_set_desc() */
 } t4_pciids[] = {
 	{0xa000, "Chelsio Terminator 4 FPGA"},
 	{0x4400, "Chelsio T440-dbg"},
 	{0x4401, "Chelsio T420-CR"},
 	{0x4402, "Chelsio T422-CR"},
 	{0x4403, "Chelsio T440-CR"},
 	{0x4404, "Chelsio T420-BCH"},
 	{0x4405, "Chelsio T440-BCH"},
 	{0x4406, "Chelsio T440-CH"},
 	{0x4407, "Chelsio T420-SO"},
 	{0x4408, "Chelsio T420-CX"},
 	{0x4409, "Chelsio T420-BT"},
 	{0x440a, "Chelsio T404-BT"},
 	{0x440e, "Chelsio T440-LP-CR"},
 }, t5_pciids[] = {
 	{0xb000, "Chelsio Terminator 5 FPGA"},
 	{0x5400, "Chelsio T580-dbg"},
 	{0x5401,  "Chelsio T520-CR"},		/* 2 x 10G */
 	{0x5402,  "Chelsio T522-CR"},		/* 2 x 10G, 2 X 1G */
 	{0x5403,  "Chelsio T540-CR"},		/* 4 x 10G */
 	{0x5407,  "Chelsio T520-SO"},		/* 2 x 10G, nomem */
 	{0x5409,  "Chelsio T520-BT"},		/* 2 x 10GBaseT */
 	{0x540a,  "Chelsio T504-BT"},		/* 4 x 1G */
 	{0x540d,  "Chelsio T580-CR"},		/* 2 x 40G */
 	{0x540e,  "Chelsio T540-LP-CR"},	/* 4 x 10G */
 	{0x5410,  "Chelsio T580-LP-CR"},	/* 2 x 40G */
 	{0x5411,  "Chelsio T520-LL-CR"},	/* 2 x 10G */
 	{0x5412,  "Chelsio T560-CR"},		/* 1 x 40G, 2 x 10G */
 	{0x5414,  "Chelsio T580-LP-SO-CR"},	/* 2 x 40G, nomem */
 	{0x5415,  "Chelsio T502-BT"},		/* 2 x 1G */
 	{0x5418,  "Chelsio T540-BT"},		/* 4 x 10GBaseT */
 	{0x5419,  "Chelsio T540-LP-BT"},	/* 4 x 10GBaseT */
 	{0x541a,  "Chelsio T540-SO-BT"},	/* 4 x 10GBaseT, nomem */
 	{0x541b,  "Chelsio T540-SO-CR"},	/* 4 x 10G, nomem */
 
 	/* Custom */
 	{0x5483, "Custom T540-CR"},
 	{0x5484, "Custom T540-BT"},
 }, t6_pciids[] = {
 	{0xc006, "Chelsio Terminator 6 FPGA"},	/* T6 PE10K6 FPGA (PF0) */
 	{0x6400, "Chelsio T6-DBG-25"},		/* 2 x 10/25G, debug */
 	{0x6401, "Chelsio T6225-CR"},		/* 2 x 10/25G */
 	{0x6402, "Chelsio T6225-SO-CR"},	/* 2 x 10/25G, nomem */
 	{0x6403, "Chelsio T6425-CR"},		/* 4 x 10/25G */
 	{0x6404, "Chelsio T6425-SO-CR"},	/* 4 x 10/25G, nomem */
 	{0x6405, "Chelsio T6225-OCP-SO"},	/* 2 x 10/25G, nomem */
 	{0x6406, "Chelsio T62100-OCP-SO"},	/* 2 x 40/50/100G, nomem */
 	{0x6407, "Chelsio T62100-LP-CR"},	/* 2 x 40/50/100G */
 	{0x6408, "Chelsio T62100-SO-CR"},	/* 2 x 40/50/100G, nomem */
 	{0x6409, "Chelsio T6210-BT"},		/* 2 x 10GBASE-T */
 	{0x640d, "Chelsio T62100-CR"},		/* 2 x 40/50/100G */
 	{0x6410, "Chelsio T6-DBG-100"},		/* 2 x 40/50/100G, debug */
 	{0x6411, "Chelsio T6225-LL-CR"},	/* 2 x 10/25G */
 	{0x6414, "Chelsio T61100-OCP-SO"},	/* 1 x 40/50/100G, nomem */
 	{0x6415, "Chelsio T6201-BT"},		/* 2 x 1000BASE-T */
 
 	/* Custom */
 	{0x6480, "Custom T6225-CR"},
 	{0x6481, "Custom T62100-CR"},
 	{0x6482, "Custom T6225-CR"},
 	{0x6483, "Custom T62100-CR"},
 	{0x6484, "Custom T64100-CR"},
 	{0x6485, "Custom T6240-SO"},
 	{0x6486, "Custom T6225-SO-CR"},
 	{0x6487, "Custom T6225-CR"},
 };
 
 #ifdef TCP_OFFLOAD
 /*
  * service_iq_fl() has an iq and needs the fl.  Offset of fl from the iq should
  * be exactly the same for both rxq and ofld_rxq.
  *
  * Enforced at compile time by requiring iq and fl to sit at identical offsets
  * in both structures.
  */
 CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq));
 CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl));
 #endif
 /* struct cluster_metadata must fit within CL_METADATA_SIZE bytes. */
 CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE);
 
 /*
  * Probe method for T4 adapters.
  *
  * Claims the device with BUS_PROBE_DEFAULT when it is a Chelsio device whose
  * id appears in t4_pciids[], and sets its description from the table.  The
  * T4 FPGA (id 0xa000) is claimed on PCI function 0 only.  Everything else is
  * rejected with ENXIO.
  */
 static int
 t4_probe(device_t dev)
 {
 	uint16_t vendor = pci_get_vendor(dev);
 	uint16_t devid = pci_get_device(dev);
 	uint8_t func = pci_get_function(dev);
 	int idx;
 
 	if (vendor != PCI_VENDOR_ID_CHELSIO)
 		return (ENXIO);
 
 	/* The FPGA is only usable through its PF0. */
 	if (func != 0 && devid == 0xa000)
 		return (ENXIO);
 
 	for (idx = 0; idx < nitems(t4_pciids); idx++) {
 		if (t4_pciids[idx].device != devid)
 			continue;
 		device_set_desc(dev, t4_pciids[idx].desc);
 		return (BUS_PROBE_DEFAULT);
 	}
 
 	/* Chelsio device, but not one of ours. */
 	return (ENXIO);
 }
 
 /*
  * Probe method for T5 adapters.
  *
  * Claims the device with BUS_PROBE_DEFAULT when it is a Chelsio device whose
  * id appears in t5_pciids[], and sets its description from the table.  The
  * T5 FPGA (id 0xb000) is claimed on PCI function 0 only.  Everything else is
  * rejected with ENXIO.
  */
 static int
 t5_probe(device_t dev)
 {
 	uint16_t vendor = pci_get_vendor(dev);
 	uint16_t devid = pci_get_device(dev);
 	uint8_t func = pci_get_function(dev);
 	int idx;
 
 	if (vendor != PCI_VENDOR_ID_CHELSIO)
 		return (ENXIO);
 
 	/* The FPGA is only usable through its PF0. */
 	if (func != 0 && devid == 0xb000)
 		return (ENXIO);
 
 	for (idx = 0; idx < nitems(t5_pciids); idx++) {
 		if (t5_pciids[idx].device != devid)
 			continue;
 		device_set_desc(dev, t5_pciids[idx].desc);
 		return (BUS_PROBE_DEFAULT);
 	}
 
 	/* Chelsio device, but not one of ours. */
 	return (ENXIO);
 }
 
 /*
  * Probe method for T6 adapters.
  *
  * Claims the device with BUS_PROBE_DEFAULT when it is a Chelsio device whose
  * id appears in t6_pciids[], and sets its description from the table.
  * Everything else is rejected with ENXIO.  Unlike the T4/T5 probes there is
  * no PCI function check here.
  */
 static int
 t6_probe(device_t dev)
 {
 	uint16_t vendor = pci_get_vendor(dev);
 	uint16_t devid = pci_get_device(dev);
 	int idx;
 
 	if (vendor != PCI_VENDOR_ID_CHELSIO)
 		return (ENXIO);
 
 	for (idx = 0; idx < nitems(t6_pciids); idx++) {
 		if (t6_pciids[idx].device != devid)
 			continue;
 		device_set_desc(dev, t6_pciids[idx].desc);
 		return (BUS_PROBE_DEFAULT);
 	}
 
 	/* Chelsio device, but not one of ours. */
 	return (ENXIO);
 }
 
 /*
  * Clear the No Snoop and Relaxed Ordering enables on the PCIe root port above
  * a T5 device.
  *
  * T5 chips do not echo those two attributes correctly when they reply to a
  * TLP from a Root Port, so the attributes are switched off at the Root Port
  * itself.  Be aware that every device below that root port is affected, not
  * just the adapter.  If no root port can be found a message is printed and
  * nothing is changed.
  */
 static void
 t5_attribute_workaround(device_t dev)
 {
 	const uint32_t attrs = PCIEM_CTL_RELAXED_ORD_ENABLE |
 	    PCIEM_CTL_NOSNOOP_ENABLE;
 	device_t rp;
 	uint32_t prev;
 
 	rp = pci_find_pcie_root_port(dev);
 	if (rp == NULL) {
 		device_printf(dev, "Unable to find parent root port\n");
 		return;
 	}
 
 	/* Clear both enables in the root port's device control register. */
 	prev = pcie_adjust_config(rp, PCIER_DEVICE_CTL, attrs, 0, 2);
 
 	/* Stay quiet unless at least one of the bits was set beforehand. */
 	if ((prev & attrs) == 0)
 		return;
 	device_printf(dev, "Disabled No Snoop/Relaxed Ordering on %s\n",
 	    device_get_nameunit(rp));
 }
 
 /*
  * Driver/interface names for each chip generation.  t4_init_devnames()
  * indexes this table with (chip_id - CHELSIO_T4), so the entries must stay in
  * generation order: T4, T5, T6.
  */
 static const struct devnames devnames[] = {
 	{
 		/* T4 */
 		.nexus_name = "t4nex",
 		.ifnet_name = "cxgbe",
 		.vi_ifnet_name = "vcxgbe",
 		.pf03_drv_name = "t4iov",
 		.vf_nexus_name = "t4vf",
 		.vf_ifnet_name = "cxgbev"
 	}, {
 		/* T5 */
 		.nexus_name = "t5nex",
 		.ifnet_name = "cxl",
 		.vi_ifnet_name = "vcxl",
 		.pf03_drv_name = "t5iov",
 		.vf_nexus_name = "t5vf",
 		.vf_ifnet_name = "cxlv"
 	}, {
 		/* T6 */
 		.nexus_name = "t6nex",
 		.ifnet_name = "cc",
 		.vi_ifnet_name = "vcc",
 		.pf03_drv_name = "t6iov",
 		.vf_nexus_name = "t6vf",
 		.vf_ifnet_name = "ccv"
 	}
 };
 
 /*
  * Point sc->names at the devnames[] entry for this adapter's chip.
  *
  * If the chip id has no entry in the table, a message is printed and
  * sc->names is set to NULL; callers must check for that.
  */
 void
 t4_init_devnames(struct adapter *sc)
 {
 	int id = chip_id(sc);
 
 	/* Reject anything older than T4 or beyond the end of the table. */
 	if (id < CHELSIO_T4 || id - CHELSIO_T4 >= nitems(devnames)) {
 		device_printf(sc->dev, "chip id %d is not supported.\n", id);
 		sc->names = NULL;
 		return;
 	}
 
 	sc->names = &devnames[id - CHELSIO_T4];
 }
 
 /*
  * Look up a wired unit number for a port's ifnet device.
  *
  * Walks the resource entries for the adapter's ifnet driver name that are
  * "at" this nexus and returns the unit of the first one whose "port" value
  * equals pi->port_id.  Returns -1 when no entry matches.
  */
 static int
 t4_ifnet_unit(struct adapter *sc, struct port_info *pi)
 {
 	const char *nexus = device_get_nameunit(sc->dev);
 	const char *ifname = sc->names->ifnet_name;
 	long port;
 	int cursor = 0, unit;
 
 	for (;;) {
 		/* cursor carries the search position between calls. */
 		if (resource_find_dev(&cursor, ifname, &unit, "at", nexus) != 0)
 			break;
 		if (resource_long_value(ifname, unit, "port", &port) != 0)
 			continue;
 		if (port == pi->port_id)
 			return (unit);
 	}
 
 	return (-1);
 }
 
 /*
  * Attach method for the nexus device.
  *
  * In order: applies PCIe settings, initializes the softc's locks and tasks,
  * maps BARs 0 and 4, prepares the adapter, creates the nexus character
  * device, contacts the firmware and retrieves/sets parameters, allocates a
  * port_info (and child device) for every port, works out the interrupt type
  * and queue counts, allocates the SGE queue arrays, assigns queue ranges to
  * every VI, sets up interrupt handlers and finally attaches the children.
  *
  * Returns 0 on success.  It also returns 0 when something failed after the
  * character device was created: the adapter is then left attached in
  * "recovery mode".  Any other failure is undone with t4_detach_common() and
  * the error is returned.
  */
 static int
 t4_attach(device_t dev)
 {
 	struct adapter *sc;
 	int rc = 0, i, j, rqidx, tqidx, nports;
 	struct make_dev_args mda;
 	struct intrs_and_queues iaq;
 	struct sge *s;
 	uint32_t *buf;
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	int ofld_tqidx;
 #endif
 #ifdef TCP_OFFLOAD
 	int ofld_rqidx;
 #endif
 #ifdef DEV_NETMAP
 	int nm_rqidx, nm_tqidx;
 #endif
 	int num_vis;
 
 	sc = device_get_softc(dev);
 	sc->dev = dev;
 	TUNABLE_INT_FETCH("hw.cxgbe.dflags", &sc->debug_flags);
 
 	/* Device ids 0x54xx are T5 parts. */
 	if ((pci_get_device(dev) & 0xff00) == 0x5400)
 		t5_attribute_workaround(dev);
 	pci_enable_busmaster(dev);
 	if (pci_find_cap(dev, PCIY_EXPRESS, &i) == 0) {
 		uint32_t v;
 
 		pci_set_max_read_req(dev, 4096);
 		v = pci_read_config(dev, i + PCIER_DEVICE_CTL, 2);
 		sc->params.pci.mps = 128 << ((v & PCIEM_CTL_MAX_PAYLOAD) >> 5);
 		/*
 		 * Only the explicit 0 (disable) and 1 (enable) settings touch
 		 * the Relaxed Ordering bit; anything else leaves it alone.
 		 */
 		if (pcie_relaxed_ordering == 0 &&
 		    (v & PCIEM_CTL_RELAXED_ORD_ENABLE) != 0) {
 			v &= ~PCIEM_CTL_RELAXED_ORD_ENABLE;
 			pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2);
 		} else if (pcie_relaxed_ordering == 1 &&
 		    (v & PCIEM_CTL_RELAXED_ORD_ENABLE) == 0) {
 			v |= PCIEM_CTL_RELAXED_ORD_ENABLE;
 			pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2);
 		}
 	}
 
 	sc->sge_gts_reg = MYPF_REG(A_SGE_PF_GTS);
 	sc->sge_kdoorbell_reg = MYPF_REG(A_SGE_PF_KDOORBELL);
 	sc->traceq = -1;
 	/*
 	 * Format the lock name before handing it to mtx_init(), as is done
 	 * for sc->lockname and pi->lockname.  No lock type is passed, so the
 	 * name is also what the lock gets classified by and it must be
 	 * final at initialization time.
 	 */
 	snprintf(sc->ifp_lockname, sizeof(sc->ifp_lockname), "%s tracer",
 	    device_get_nameunit(dev));
 	mtx_init(&sc->ifp_lock, sc->ifp_lockname, 0, MTX_DEF);
 
 	snprintf(sc->lockname, sizeof(sc->lockname), "%s",
 	    device_get_nameunit(dev));
 	mtx_init(&sc->sc_lock, sc->lockname, 0, MTX_DEF);
 	t4_add_adapter(sc);
 
 	mtx_init(&sc->sfl_lock, "starving freelists", 0, MTX_DEF);
 	TAILQ_INIT(&sc->sfl);
 	callout_init_mtx(&sc->sfl_callout, &sc->sfl_lock, 0);
 
 	mtx_init(&sc->reg_lock, "indirect register access", 0, MTX_DEF);
 
 	sc->policy = NULL;
 	rw_init(&sc->policy_lock, "connection offload policy");
 
 	callout_init(&sc->ktls_tick, 1);
 
 #ifdef TCP_OFFLOAD
 	TASK_INIT(&sc->async_event_task, 0, t4_async_event, sc);
 #endif
 
 	refcount_init(&sc->vxlan_refcount, 0);
 
 	TASK_INIT(&sc->reset_task, 0, reset_adapter, sc);
 
 	sc->ctrlq_oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(sc->dev),
 	    SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO, "ctrlq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "control queues");
 	sc->fwq_oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(sc->dev),
 	    SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO, "fwq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "firmware event queue");
 
 	rc = t4_map_bars_0_and_4(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	memset(sc->chan_map, 0xff, sizeof(sc->chan_map));
 
 	/* Prepare the adapter for operation. */
 	buf = malloc(PAGE_SIZE, M_CXGBE, M_ZERO | M_WAITOK);
 	rc = -t4_prep_adapter(sc, buf);
 	free(buf, M_CXGBE);
 	if (rc != 0) {
 		device_printf(dev, "failed to prepare adapter: %d.\n", rc);
 		goto done;
 	}
 
 	/*
 	 * This is the real PF# to which we're attaching.  Works from within PCI
 	 * passthrough environments too, where pci_get_function() could return a
 	 * different PF# depending on the passthrough configuration.  We need to
 	 * use the real PF# in all our communication with the firmware.
 	 */
 	j = t4_read_reg(sc, A_PL_WHOAMI);
 	sc->pf = chip_id(sc) <= CHELSIO_T5 ? G_SOURCEPF(j) : G_T6_SOURCEPF(j);
 	sc->mbox = sc->pf;
 
 	t4_init_devnames(sc);
 	if (sc->names == NULL) {
 		rc = ENOTSUP;
 		goto done; /* error message displayed already */
 	}
 
 	/*
 	 * Do this really early, with the memory windows set up even before the
 	 * character device.  The userland tool's register i/o and mem read
 	 * will work even in "recovery mode".
 	 */
 	setup_memwin(sc);
 	if (t4_init_devlog_params(sc, 0) == 0)
 		fixup_devlog_params(sc);
 	make_dev_args_init(&mda);
 	mda.mda_devsw = &t4_cdevsw;
 	mda.mda_uid = UID_ROOT;
 	mda.mda_gid = GID_WHEEL;
 	mda.mda_mode = 0600;
 	mda.mda_si_drv1 = sc;
 	rc = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
 	if (rc != 0)
 		device_printf(dev, "failed to create nexus char device: %d.\n",
 		    rc);
 
 	/* Go no further if recovery mode has been requested. */
 	if (TUNABLE_INT_FETCH("hw.cxgbe.sos", &i) && i != 0) {
 		device_printf(dev, "recovery mode.\n");
 		goto done;
 	}
 
 #if defined(__i386__)
 	if ((cpu_feature & CPUID_CX8) == 0) {
 		device_printf(dev, "64 bit atomics not available.\n");
 		rc = ENOTSUP;
 		goto done;
 	}
 #endif
 
 	/* Contact the firmware and try to become the master driver. */
 	rc = contact_firmware(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 	MPASS(sc->flags & FW_OK);
 
 	rc = get_params__pre_init(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	if (sc->flags & MASTER_PF) {
 		rc = partition_resources(sc);
 		if (rc != 0)
 			goto done; /* error message displayed already */
 		t4_intr_clear(sc);
 	}
 
 	rc = get_params__post_init(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	rc = set_params__post_init(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	rc = t4_map_bar_2(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	rc = t4_create_dma_tag(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	/*
 	 * First pass over all the ports - allocate VIs and initialize some
 	 * basic parameters like mac address, port type, etc.
 	 */
 	for_each_port(sc, i) {
 		struct port_info *pi;
 
 		pi = malloc(sizeof(*pi), M_CXGBE, M_ZERO | M_WAITOK);
 		sc->port[i] = pi;
 
 		/* These must be set before t4_port_init */
 		pi->adapter = sc;
 		pi->port_id = i;
 		/*
 		 * XXX: vi[0] is special so we can't delay this allocation until
 		 * pi->nvi's final value is known.
 		 */
 		pi->vi = malloc(sizeof(struct vi_info) * t4_num_vis, M_CXGBE,
 		    M_ZERO | M_WAITOK);
 
 		/*
 		 * Allocate the "main" VI and initialize parameters
 		 * like mac addr.
 		 */
 		rc = -t4_port_init(sc, sc->mbox, sc->pf, 0, i);
 		if (rc != 0) {
 			device_printf(dev, "unable to initialize port %d: %d\n",
 			    i, rc);
 			free(pi->vi, M_CXGBE);
 			free(pi, M_CXGBE);
 			sc->port[i] = NULL;
 			goto done;
 		}
 
 		snprintf(pi->lockname, sizeof(pi->lockname), "%sp%d",
 		    device_get_nameunit(dev), i);
 		mtx_init(&pi->pi_lock, pi->lockname, 0, MTX_DEF);
 		sc->chan_map[pi->tx_chan] = i;
 
 		/*
 		 * The MPS counter for FCS errors doesn't work correctly on the
 		 * T6 so we use the MAC counter here.  Which MAC is in use
 		 * depends on the link settings which will be known when the
 		 * link comes up.
 		 */
 		if (is_t6(sc)) {
 			pi->fcs_reg = -1;
 		} else if (is_t4(sc)) {
 			pi->fcs_reg = PORT_REG(pi->tx_chan,
 			    A_MPS_PORT_STAT_RX_PORT_CRC_ERROR_L);
 		} else {
 			pi->fcs_reg = T5_PORT_REG(pi->tx_chan,
 			    A_MPS_PORT_STAT_RX_PORT_CRC_ERROR_L);
 		}
 		pi->fcs_base = 0;
 
 		/* All VIs on this port share this media. */
 		ifmedia_init(&pi->media, IFM_IMASK, cxgbe_media_change,
 		    cxgbe_media_status);
 
 		PORT_LOCK(pi);
 		init_link_config(pi);
 		fixup_link_config(pi);
 		build_medialist(pi);
 		if (fixed_ifmedia(pi))
 			pi->flags |= FIXED_IFMEDIA;
 		PORT_UNLOCK(pi);
 
 		pi->dev = device_add_child(dev, sc->names->ifnet_name,
 		    t4_ifnet_unit(sc, pi));
 		if (pi->dev == NULL) {
 			device_printf(dev,
 			    "failed to add device for port %d.\n", i);
 			rc = ENXIO;
 			goto done;
 		}
 		pi->vi[0].dev = pi->dev;
 		device_set_softc(pi->dev, pi);
 	}
 
 	/*
 	 * Interrupt type, # of interrupts, # of rx/tx queues, etc.
 	 */
 	nports = sc->params.nports;
 	rc = cfg_itype_and_nqueues(sc, &iaq);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	num_vis = iaq.num_vis;
 	sc->intr_type = iaq.intr_type;
 	sc->intr_count = iaq.nirq;
 
 	/*
 	 * Adapter-wide totals: per-port counts for the main VIs plus the *_vi
 	 * counts for each of the (num_vis - 1) extra VIs on every port.
 	 */
 	s = &sc->sge;
 	s->nrxq = nports * iaq.nrxq;
 	s->ntxq = nports * iaq.ntxq;
 	if (num_vis > 1) {
 		s->nrxq += nports * (num_vis - 1) * iaq.nrxq_vi;
 		s->ntxq += nports * (num_vis - 1) * iaq.ntxq_vi;
 	}
 	s->neq = s->ntxq + s->nrxq;	/* the free list in an rxq is an eq */
 	s->neq += nports;		/* ctrl queues: 1 per port */
 	s->niq = s->nrxq + 1;		/* 1 extra for firmware event queue */
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	if (is_offload(sc) || is_ethoffload(sc)) {
 		s->nofldtxq = nports * iaq.nofldtxq;
 		if (num_vis > 1)
 			s->nofldtxq += nports * (num_vis - 1) * iaq.nofldtxq_vi;
 		s->neq += s->nofldtxq;
 
 		s->ofld_txq = malloc(s->nofldtxq * sizeof(struct sge_ofld_txq),
 		    M_CXGBE, M_ZERO | M_WAITOK);
 	}
 #endif
 #ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 		s->nofldrxq = nports * iaq.nofldrxq;
 		if (num_vis > 1)
 			s->nofldrxq += nports * (num_vis - 1) * iaq.nofldrxq_vi;
 		s->neq += s->nofldrxq;	/* free list */
 		s->niq += s->nofldrxq;
 
 		s->ofld_rxq = malloc(s->nofldrxq * sizeof(struct sge_ofld_rxq),
 		    M_CXGBE, M_ZERO | M_WAITOK);
 	}
 #endif
 #ifdef DEV_NETMAP
 	s->nnmrxq = 0;
 	s->nnmtxq = 0;
 	if (t4_native_netmap & NN_MAIN_VI) {
 		s->nnmrxq += nports * iaq.nnmrxq;
 		s->nnmtxq += nports * iaq.nnmtxq;
 	}
 	if (num_vis > 1 && t4_native_netmap & NN_EXTRA_VI) {
 		s->nnmrxq += nports * (num_vis - 1) * iaq.nnmrxq_vi;
 		s->nnmtxq += nports * (num_vis - 1) * iaq.nnmtxq_vi;
 	}
 	s->neq += s->nnmtxq + s->nnmrxq;
 	s->niq += s->nnmrxq;
 
 	s->nm_rxq = malloc(s->nnmrxq * sizeof(struct sge_nm_rxq),
 	    M_CXGBE, M_ZERO | M_WAITOK);
 	s->nm_txq = malloc(s->nnmtxq * sizeof(struct sge_nm_txq),
 	    M_CXGBE, M_ZERO | M_WAITOK);
 #endif
 	MPASS(s->niq <= s->iqmap_sz);
 	MPASS(s->neq <= s->eqmap_sz);
 
 	s->ctrlq = malloc(nports * sizeof(struct sge_wrq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->rxq = malloc(s->nrxq * sizeof(struct sge_rxq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->txq = malloc(s->ntxq * sizeof(struct sge_txq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->iqmap = malloc(s->iqmap_sz * sizeof(struct sge_iq *), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	s->eqmap = malloc(s->eqmap_sz * sizeof(struct sge_eq *), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	sc->irq = malloc(sc->intr_count * sizeof(struct irq), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	t4_init_l2t(sc, M_WAITOK);
 	t4_init_smt(sc, M_WAITOK);
 	t4_init_tx_sched(sc);
 	t4_init_atid_table(sc);
 #ifdef RATELIMIT
 	t4_init_etid_table(sc);
 #endif
 #ifdef INET6
 	t4_init_clip_table(sc);
 #endif
 	if (sc->vres.key.size != 0)
 		sc->key_map = vmem_create("T4TLS key map", sc->vres.key.start,
 		    sc->vres.key.size, 32, 0, M_FIRSTFIT | M_WAITOK);
 
 	/*
 	 * Second pass over the ports.  This time we know the number of rx and
 	 * tx queues that each port should get.
 	 */
 	rqidx = tqidx = 0;
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	ofld_tqidx = 0;
 #endif
 #ifdef TCP_OFFLOAD
 	ofld_rqidx = 0;
 #endif
 #ifdef DEV_NETMAP
 	nm_rqidx = nm_tqidx = 0;
 #endif
 	for_each_port(sc, i) {
 		struct port_info *pi = sc->port[i];
 		struct vi_info *vi;
 
 		if (pi == NULL)
 			continue;
 
 		pi->nvi = num_vis;
 		for_each_vi(pi, j, vi) {
 			vi->pi = pi;
 			vi->adapter = sc;
 			vi->first_intr = -1;
 			vi->qsize_rxq = t4_qsize_rxq;
 			vi->qsize_txq = t4_qsize_txq;
 
 			/* Main VI (j == 0) and extra VIs use different counts. */
 			vi->first_rxq = rqidx;
 			vi->first_txq = tqidx;
 			vi->tmr_idx = t4_tmr_idx;
 			vi->pktc_idx = t4_pktc_idx;
 			vi->nrxq = j == 0 ? iaq.nrxq : iaq.nrxq_vi;
 			vi->ntxq = j == 0 ? iaq.ntxq : iaq.ntxq_vi;
 
 			rqidx += vi->nrxq;
 			tqidx += vi->ntxq;
 
 			if (j == 0 && vi->ntxq > 1)
 				vi->rsrv_noflowq = t4_rsrv_noflowq ? 1 : 0;
 			else
 				vi->rsrv_noflowq = 0;
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 			vi->first_ofld_txq = ofld_tqidx;
 			vi->nofldtxq = j == 0 ? iaq.nofldtxq : iaq.nofldtxq_vi;
 			ofld_tqidx += vi->nofldtxq;
 #endif
 #ifdef TCP_OFFLOAD
 			vi->ofld_tmr_idx = t4_tmr_idx_ofld;
 			vi->ofld_pktc_idx = t4_pktc_idx_ofld;
 			vi->first_ofld_rxq = ofld_rqidx;
 			vi->nofldrxq = j == 0 ? iaq.nofldrxq : iaq.nofldrxq_vi;
 
 			ofld_rqidx += vi->nofldrxq;
 #endif
 #ifdef DEV_NETMAP
 			vi->first_nm_rxq = nm_rqidx;
 			vi->first_nm_txq = nm_tqidx;
 			if (j == 0) {
 				vi->nnmrxq = iaq.nnmrxq;
 				vi->nnmtxq = iaq.nnmtxq;
 			} else {
 				vi->nnmrxq = iaq.nnmrxq_vi;
 				vi->nnmtxq = iaq.nnmtxq_vi;
 			}
 			nm_rqidx += vi->nnmrxq;
 			nm_tqidx += vi->nnmtxq;
 #endif
 		}
 	}
 
 	rc = t4_setup_intr_handlers(sc);
 	if (rc != 0) {
 		device_printf(dev,
 		    "failed to setup interrupt handlers: %d\n", rc);
 		goto done;
 	}
 
 	rc = bus_generic_probe(dev);
 	if (rc != 0) {
 		device_printf(dev, "failed to probe child drivers: %d\n", rc);
 		goto done;
 	}
 
 	/*
 	 * Ensure thread-safe mailbox access (in debug builds).
 	 *
 	 * So far this was the only thread accessing the mailbox but various
 	 * ifnets and sysctls are about to be created and their handlers/ioctls
 	 * will access the mailbox from different threads.
 	 */
 	sc->flags |= CHK_MBOX_ACCESS;
 
 	rc = bus_generic_attach(dev);
 	if (rc != 0) {
 		device_printf(dev,
 		    "failed to attach all child ports: %d\n", rc);
 		goto done;
 	}
 
 	device_printf(dev,
 	    "PCIe gen%d x%d, %d ports, %d %s interrupt%s, %d eq, %d iq\n",
 	    sc->params.pci.speed, sc->params.pci.width, sc->params.nports,
 	    sc->intr_count, sc->intr_type == INTR_MSIX ? "MSI-X" :
 	    (sc->intr_type == INTR_MSI ? "MSI" : "INTx"),
 	    sc->intr_count > 1 ? "s" : "", sc->sge.neq, sc->sge.niq);
 
 	t4_set_desc(sc);
 
 	notify_siblings(dev, 0);
 
 done:
 	if (rc != 0 && sc->cdev) {
 		/* cdev was created and so cxgbetool works; recover that way. */
 		device_printf(dev,
 		    "error during attach, adapter is now in recovery mode.\n");
 		rc = 0;
 	}
 
 	/* Without a cdev there is no way to recover: tear everything down. */
 	if (rc != 0)
 		t4_detach_common(dev);
 	else
 		t4_sysctls(sc);
 
 	return (rc);
 }
 
 /*
  * Describe where child device 'dev' sits on the adapter 'bus'.  If dev is the
  * device of one of the adapter's ports, "port=<port_id>" is appended to sb;
  * otherwise sb is left untouched.  Always returns 0.
  */
 static int
 t4_child_location(device_t bus, device_t dev, struct sbuf *sb)
 {
 	struct adapter *sc = device_get_softc(bus);
 	int idx;
 
 	for_each_port(sc, idx) {
 		struct port_info *port = sc->port[idx];
 
 		/* Skip empty slots and ports that belong to other devices. */
 		if (port == NULL || port->dev != dev)
 			continue;
 
 		sbuf_printf(sb, "port=%d", port->port_id);
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * Report whether the adapter's firmware is usable: returns 0 if FW_OK is set
  * in the softc flags and ENXIO otherwise.
  */
 static int
 t4_ready(device_t dev)
 {
 	const struct adapter *sc = device_get_softc(dev);
 
 	return ((sc->flags & FW_OK) != 0 ? 0 : ENXIO);
 }
 
 /*
  * Look up the device_t of port number 'port' on the adapter 'dev' and store
  * it in *child.
  *
  * Returns 0 on success, EINVAL if port is outside [0, MAX_NPORTS), and ENXIO
  * if that port does not exist or has no device.  *child is written only on
  * success.
  */
 static int
 t4_read_port_device(device_t dev, int port, device_t *child)
 {
 	struct adapter *sc = device_get_softc(dev);
 	struct port_info *pi;
 
 	if (port < 0 || port >= MAX_NPORTS)
 		return (EINVAL);
 
 	pi = sc->port[port];
 	if (pi != NULL && pi->dev != NULL) {
 		*child = pi->dev;
 		return (0);
 	}
 
 	return (ENXIO);
 }
 
 /*
  * Tell the attached PCI functions that share this device's domain/bus/slot
  * that this function is attaching (detaching == 0) or detaching
  * (detaching != 0).
  *
  * Attach notifications are best effort: their result is ignored.  A detach
  * notification that fails stops the walk and its error is returned.  Returns
  * 0 otherwise.
  */
 static int
 notify_siblings(device_t dev, int detaching)
 {
 	device_t peer;
 	int func, rc;
 
 	for (func = 0; func < PCI_FUNCMAX; func++) {
 		/* Do not notify ourselves. */
 		if (func == pci_get_function(dev))
 			continue;
 
 		peer = pci_find_dbsf(pci_get_domain(dev), pci_get_bus(dev),
 		    pci_get_slot(dev), func);
 		if (peer == NULL || !device_is_attached(peer))
 			continue;
 
 		if (!detaching) {
 			(void)T4_ATTACH_CHILD(peer);
 			continue;
 		}
 
 		rc = T4_DETACH_CHILD(peer);
 		if (rc != 0)
 			return (rc);
 	}
 
 	return (0);
 }
 
 /*
  * Device detach method.  Sibling PCI functions are notified first; if any of
  * them refuses, the error is logged and returned and nothing is torn down.
  * Otherwise the result of t4_detach_common() is returned.
  *
  * Idempotent (per the original note on this function).
  */
 static int
 t4_detach(device_t dev)
 {
 	int error;
 
 	error = notify_siblings(dev, 1);
 	if (error == 0)
 		return (t4_detach_common(dev));
 
 	device_printf(dev,
 	    "failed to detach sibling devices: %d\n", error);
 	return (error);
 }
 
 /*
  * Tear down the adapter whose device is 'dev': the character device,
  * interrupts, ports and their child devices, bus resources, lookup tables,
  * queue arrays, callouts, and locks.  The softc is zeroed at the end.
  *
  * Called from t4_detach() and from the attach error path.
  *
  * Returns 0 on success.  If the child devices cannot be detached the error
  * from bus_generic_detach() is returned and teardown stops at that point.
  */
 int
 t4_detach_common(device_t dev)
 {
 	struct adapter *sc;
 	struct port_info *pi;
 	int i, rc;
 
 	sc = device_get_softc(dev);
 
 	/* Destroy the character device, if one was created. */
 	if (sc->cdev) {
 		destroy_dev(sc->cdev);
 		sc->cdev = NULL;
 	}
 
 	/* Take this adapter off the global adapter list. */
 	sx_xlock(&t4_list_lock);
 	SLIST_REMOVE(&t4_list, sc, adapter, link);
 	sx_xunlock(&t4_list_lock);
 
 	/* Disable interrupts if the adapter was fully initialized (PF only). */
 	sc->flags &= ~CHK_MBOX_ACCESS;
 	if (sc->flags & FULL_INIT_DONE) {
 		if (!(sc->flags & IS_VF))
 			t4_intr_disable(sc);
 	}
 
 	/* Children are detached only if this device finished attaching. */
 	if (device_is_attached(dev)) {
 		rc = bus_generic_detach(dev);
 		if (rc) {
 			device_printf(dev,
 			    "failed to detach child devices: %d\n", rc);
 			return (rc);
 		}
 	}
 
 #ifdef TCP_OFFLOAD
 	/* Wait for a queued async event task, if any, to finish. */
 	taskqueue_drain(taskqueue_thread, &sc->async_event_task);
 #endif
 
 	for (i = 0; i < sc->intr_count; i++)
 		t4_free_irq(sc, &sc->irq[i]);
 
 	/* Only a PF (not a VF) with FW_OK set frees the tx scheduler state. */
 	if ((sc->flags & (IS_VF | FW_OK)) == FW_OK)
 		t4_free_tx_sched(sc);
 
 	/*
 	 * Release every port: the first VI (presumably freed in the firmware by
 	 * t4_free_vi), the port's child device, its lock, and its memory.
 	 */
 	for (i = 0; i < MAX_NPORTS; i++) {
 		pi = sc->port[i];
 		if (pi) {
 			t4_free_vi(sc, sc->mbox, sc->pf, 0, pi->vi[0].viid);
 			if (pi->dev)
 				device_delete_child(dev, pi->dev);
 
 			mtx_destroy(&pi->pi_lock);
 			free(pi->vi, M_CXGBE);
 			free(pi, M_CXGBE);
 		}
 	}
 
 	device_delete_children(dev);
 	adapter_full_uninit(sc);
 
 	/* Same PF-with-firmware condition as above. */
 	if ((sc->flags & (IS_VF | FW_OK)) == FW_OK)
 		t4_fw_bye(sc, sc->mbox);
 
 	if (sc->intr_type == INTR_MSI || sc->intr_type == INTR_MSIX)
 		pci_release_msi(dev);
 
 	/* Release whichever memory resources were allocated. */
 	if (sc->regs_res)
 		bus_release_resource(dev, SYS_RES_MEMORY, sc->regs_rid,
 		    sc->regs_res);
 
 	if (sc->udbs_res)
 		bus_release_resource(dev, SYS_RES_MEMORY, sc->udbs_rid,
 		    sc->udbs_res);
 
 	if (sc->msix_res)
 		bus_release_resource(dev, SYS_RES_MEMORY, sc->msix_rid,
 		    sc->msix_res);
 
 	/* Free the lookup tables. */
 	if (sc->l2t)
 		t4_free_l2t(sc->l2t);
 	if (sc->smt)
 		t4_free_smt(sc->smt);
 	t4_free_atid_table(sc);
 #ifdef RATELIMIT
 	t4_free_etid_table(sc);
 #endif
 	if (sc->key_map)
 		vmem_destroy(sc->key_map);
 #ifdef INET6
 	t4_destroy_clip_table(sc);
 #endif
 
 	/* Free the queue arrays and the remaining tid tables. */
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	free(sc->sge.ofld_txq, M_CXGBE);
 #endif
 #ifdef TCP_OFFLOAD
 	free(sc->sge.ofld_rxq, M_CXGBE);
 #endif
 #ifdef DEV_NETMAP
 	free(sc->sge.nm_rxq, M_CXGBE);
 	free(sc->sge.nm_txq, M_CXGBE);
 #endif
 	free(sc->irq, M_CXGBE);
 	free(sc->sge.rxq, M_CXGBE);
 	free(sc->sge.txq, M_CXGBE);
 	free(sc->sge.ctrlq, M_CXGBE);
 	free(sc->sge.iqmap, M_CXGBE);
 	free(sc->sge.eqmap, M_CXGBE);
 	free(sc->tids.ftid_tab, M_CXGBE);
 	free(sc->tids.hpftid_tab, M_CXGBE);
 	free_hftid_hash(&sc->tids);
 	free(sc->tids.tid_tab, M_CXGBE);
 	free(sc->tt.tls_rx_ports, M_CXGBE);
 	t4_destroy_dma_tag(sc);
 
 	/* Stop the callouts before the locks below are destroyed. */
 	callout_drain(&sc->ktls_tick);
 	callout_drain(&sc->sfl_callout);
 
 	/*
 	 * These locks are destroyed only if they were initialized; presumably
 	 * attach can fail before reaching them.
 	 */
 	if (mtx_initialized(&sc->tids.ftid_lock)) {
 		mtx_destroy(&sc->tids.ftid_lock);
 		cv_destroy(&sc->tids.ftid_cv);
 	}
 	if (mtx_initialized(&sc->tids.atid_lock))
 		mtx_destroy(&sc->tids.atid_lock);
 	if (mtx_initialized(&sc->ifp_lock))
 		mtx_destroy(&sc->ifp_lock);
 
 	if (rw_initialized(&sc->policy_lock)) {
 		rw_destroy(&sc->policy_lock);
 #ifdef TCP_OFFLOAD
 		if (sc->policy != NULL)
 			free_offload_policy(sc->policy);
 #endif
 	}
 
 	for (i = 0; i < NUM_MEMWIN; i++) {
 		struct memwin *mw = &sc->memwin[i];
 
 		if (rw_initialized(&mw->mw_lock))
 			rw_destroy(&mw->mw_lock);
 	}
 
 	/* These three are destroyed unconditionally. */
 	mtx_destroy(&sc->sfl_lock);
 	mtx_destroy(&sc->reg_lock);
 	mtx_destroy(&sc->sc_lock);
 
 	/* Leave the softc all-zero. */
 	bzero(sc, sizeof(*sc));
 
 	return (0);
 }
 
 /*
  * Decide whether the adapter can be reset or suspended right now.
  *
  * The answer is false if any VI's ifnet has TOE, TXTLS, netmap, or tx rate
  * limiting enabled, or if any of the tid/stid/atid/ftid/hpftid/etid in-use
  * counters is above zero.  Must be called within a synchronized op, on a PF
  * (both are asserted).
  */
 static inline bool
 ok_to_reset(struct adapter *sc)
 {
 	const int busy_caps = IFCAP_TOE | IFCAP_TXTLS | IFCAP_NETMAP |
 	    IFCAP_TXRTLMT;
 	struct tid_info *tids = &sc->tids;
 	struct port_info *pi;
 	struct vi_info *vi;
 	int p, v;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	MPASS(!(sc->flags & IS_VF));
 
 	/* Any interface with one of these capabilities enabled blocks us. */
 	for_each_port(sc, p) {
 		pi = sc->port[p];
 		for_each_vi(pi, v, vi) {
 			if ((vi->ifp->if_capenable & busy_caps) != 0)
 				return (false);
 		}
 	}
 
 	/* So does any tid of any kind that is still in use. */
 	if (atomic_load_int(&tids->tids_in_use) > 0 ||
 	    atomic_load_int(&tids->stids_in_use) > 0 ||
 	    atomic_load_int(&tids->atids_in_use) > 0 ||
 	    atomic_load_int(&tids->ftids_in_use) > 0 ||
 	    atomic_load_int(&tids->hpftids_in_use) > 0 ||
 	    atomic_load_int(&tids->etids_in_use) > 0)
 		return (false);
 
 	return (true);
 }
 
 /*
  * Device suspend method.
  *
  * If ok_to_reset() reports that the adapter is in use the request is refused.
  * Otherwise the adapter is shut down, every tx/rx/offload/control queue and
  * the firmware event queue is marked as no longer allocated in hardware and
  * quiesced, and the adapter is flagged HW_OFF_LIMITS with FW_OK and MASTER_PF
  * cleared.
  *
  * Returns 0 on success, ENXIO if the synchronized op could not be started,
  * and EBUSY if it is not safe to suspend.
  */
 static int
 t4_suspend(device_t dev)
 {
 	struct adapter *sc = device_get_softc(dev);
 	struct port_info *pi;
 	struct vi_info *vi;
 	struct ifnet *ifp;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 	struct sge_wrq *wrq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	struct sge_ofld_txq *ofld_txq;
 #endif
 	int rc, i, j, k;
 
 	CH_ALERT(sc, "suspend requested\n");
 
 	/* Any failure to start the synchronized op is reported as ENXIO. */
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4sus");
 	if (rc != 0)
 		return (ENXIO);
 
 	/* XXX: Can the kernel call suspend repeatedly without resume? */
 	MPASS(!hw_off_limits(sc));
 
 	if (!ok_to_reset(sc)) {
 		/* XXX: should list what resource is preventing suspend. */
 		CH_ERR(sc, "not safe to suspend.\n");
 		rc = EBUSY;
 		goto done;
 	}
 
 	/* No more DMA or interrupts. */
 	t4_shutdown_adapter(sc);
 
 	/* Quiesce all activity. */
 	for_each_port(sc, i) {
 		pi = sc->port[i];
 		pi->vxlan_tcam_entry = false;
 
 		PORT_LOCK(pi);
 		if (pi->up_vis > 0) {
 			/*
 			 * t4_shutdown_adapter has already shut down all the
 			 * PHYs but it also disables interrupts and DMA so there
 			 * won't be a link interrupt.  So we update the state
 			 * manually and inform the kernel.
 			 */
 			pi->link_cfg.link_ok = false;
 			t4_os_link_changed(pi);
 		}
 		PORT_UNLOCK(pi);
 
 		for_each_vi(pi, j, vi) {
 			vi->xact_addr_filt = -1;
 			/* VIs that were never initialized have no queues to mark. */
 			if (!(vi->flags & VI_INIT_DONE))
 				continue;
 
 			ifp = vi->ifp;
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 				/*
 				 * Stop the tick under its mutex, then drain it
 				 * with the mutex released.
 				 */
 				mtx_lock(&vi->tick_mtx);
 				vi->flags |= VI_SKIP_STATS;
 				callout_stop(&vi->tick);
 				mtx_unlock(&vi->tick_mtx);
 				callout_drain(&vi->tick);
 			}
 
 			/*
 			 * Note that the HW is not available.
 			 */
 			for_each_txq(vi, k, txq) {
 				TXQ_LOCK(txq);
 				txq->eq.flags &= ~(EQ_ENABLED | EQ_HW_ALLOCATED);
 				TXQ_UNLOCK(txq);
 			}
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 			for_each_ofld_txq(vi, k, ofld_txq) {
 				ofld_txq->wrq.eq.flags &= ~EQ_HW_ALLOCATED;
 			}
 #endif
 			for_each_rxq(vi, k, rxq) {
 				rxq->iq.flags &= ~IQ_HW_ALLOCATED;
 			}
 #if defined(TCP_OFFLOAD)
 			for_each_ofld_rxq(vi, k, ofld_rxq) {
 				ofld_rxq->iq.flags &= ~IQ_HW_ALLOCATED;
 			}
 #endif
 
 			quiesce_vi(vi);
 		}
 
 		/* ctrlq is indexed by port number here. */
 		if (sc->flags & FULL_INIT_DONE) {
 			/* Control queue */
 			wrq = &sc->sge.ctrlq[i];
 			wrq->eq.flags &= ~EQ_HW_ALLOCATED;
 			quiesce_wrq(wrq);
 		}
 	}
 	if (sc->flags & FULL_INIT_DONE) {
 		/* Firmware event queue */
 		sc->sge.fwq.flags &= ~IQ_HW_ALLOCATED;
 		quiesce_iq_fl(sc, &sc->sge.fwq, NULL);
 	}
 
 	/* Mark the adapter totally off limits. */
 	mtx_lock(&sc->reg_lock);
 	sc->flags |= HW_OFF_LIMITS;
 	sc->flags &= ~(FW_OK | MASTER_PF);
 	sc->reset_thread = NULL;
 	mtx_unlock(&sc->reg_lock);
 
 	sc->num_resets++;
 	CH_ALERT(sc, "suspend completed.\n");
 done:
 	/* Reached on both the success path and the EBUSY path. */
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 /*
  * Snapshot of adapter state taken by save_caps_and_params() and checked by
  * compare_caps_and_params().  Each member is a copy of the softc member of
  * the same name.
  */
 struct adapter_pre_reset_state {
 	u_int flags;
 	/* Capability words. */
 	uint16_t nbmcaps;
 	uint16_t linkcaps;
 	uint16_t switchcaps;
 	uint16_t niccaps;
 	uint16_t toecaps;
 	uint16_t rdmacaps;
 	uint16_t cryptocaps;
 	uint16_t iscsicaps;
 	uint16_t fcoecaps;
 
 	/* Firmware config file: checksum and name. */
 	u_int cfcsum;
 	char cfg_file[32];
 
 	/* Whole-structure copies; only selected members are compared later. */
 	struct adapter_params params;
 	struct t4_virt_res vres;
 	struct tid_info tids;
 	struct sge sge;
 
 	int rawf_base;
 	int nrawf;
 
 };
 
 /*
  * Copy the adapter's flags, capability words, config file identity, and the
  * params/vres/tids/sge structures plus the rawf range into *o so that they
  * can be compared after the hardware has been re-initialized.  Must be called
  * within a synchronized op (asserted).
  */
 static void
 save_caps_and_params(struct adapter *sc, struct adapter_pre_reset_state *o)
 {
 /* Copy one same-named member from the softc into the snapshot. */
 #define SNAPSHOT(field)	(o->field = sc->field)
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	SNAPSHOT(flags);
 
 	/* Capabilities */
 	SNAPSHOT(nbmcaps);
 	SNAPSHOT(linkcaps);
 	SNAPSHOT(switchcaps);
 	SNAPSHOT(niccaps);
 	SNAPSHOT(toecaps);
 	SNAPSHOT(rdmacaps);
 	SNAPSHOT(cryptocaps);
 	SNAPSHOT(iscsicaps);
 	SNAPSHOT(fcoecaps);
 
 	/* Firmware config file; the name is an array and has to be copied. */
 	SNAPSHOT(cfcsum);
 	MPASS(sizeof(o->cfg_file) == sizeof(sc->cfg_file));
 	memcpy(o->cfg_file, sc->cfg_file, sizeof(o->cfg_file));
 
 	/* Whole-structure copies. */
 	SNAPSHOT(params);
 	SNAPSHOT(vres);
 	SNAPSHOT(tids);
 	SNAPSHOT(sge);
 
 	SNAPSHOT(rawf_base);
 	SNAPSHOT(nrawf);
 
 #undef SNAPSHOT
 }
 
 /*
  * Compare the adapter's current capabilities, firmware config file checksum,
  * and selected parameters with the snapshot in *o.  Every mismatch is logged
  * with CH_ERR and the comparison carries on, so all differences are reported.
  *
  * Returns 0 if everything matches and EINVAL if anything differs.  Must be
  * called within a synchronized op (asserted).
  */
 static int
 compare_caps_and_params(struct adapter *sc, struct adapter_pre_reset_state *o)
 {
 	int rc = 0;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	/* Capabilities */
 	/* COMPARE_CAPS(x) checks the x##caps member, e.g. nic -> niccaps. */
 #define COMPARE_CAPS(c) do { \
 	if (o->c##caps != sc->c##caps) { \
 		CH_ERR(sc, "%scaps 0x%04x -> 0x%04x.\n", #c, o->c##caps, \
 		    sc->c##caps); \
 		rc = EINVAL; \
 	} \
 } while (0)
 	COMPARE_CAPS(nbm);
 	COMPARE_CAPS(link);
 	COMPARE_CAPS(switch);
 	COMPARE_CAPS(nic);
 	COMPARE_CAPS(toe);
 	COMPARE_CAPS(rdma);
 	COMPARE_CAPS(crypto);
 	COMPARE_CAPS(iscsi);
 	COMPARE_CAPS(fcoe);
 #undef COMPARE_CAPS
 
 	/* Firmware config file */
 	if (o->cfcsum != sc->cfcsum) {
 		CH_ERR(sc, "config file %s (0x%x) -> %s (0x%x)\n", o->cfg_file,
 		    o->cfcsum, sc->cfg_file, sc->cfcsum);
 		rc = EINVAL;
 	}
 
 	/*
 	 * COMPARE_PARAM(p, name) checks member path p in both structures and
 	 * uses 'name' only as the label in the log message.
 	 */
 #define COMPARE_PARAM(p, name) do { \
 	if (o->p != sc->p) { \
 		CH_ERR(sc, #name " %d -> %d\n", o->p, sc->p); \
 		rc = EINVAL; \
 	} \
 } while (0)
 	COMPARE_PARAM(sge.iq_start, iq_start);
 	COMPARE_PARAM(sge.eq_start, eq_start);
 	COMPARE_PARAM(tids.ftid_base, ftid_base);
 	COMPARE_PARAM(tids.ftid_end, ftid_end);
 	COMPARE_PARAM(tids.nftids, nftids);
 	COMPARE_PARAM(vres.l2t.start, l2t_start);
 	COMPARE_PARAM(vres.l2t.size, l2t_size);
 	COMPARE_PARAM(sge.iqmap_sz, iqmap_sz);
 	COMPARE_PARAM(sge.eqmap_sz, eqmap_sz);
 	COMPARE_PARAM(tids.tid_base, tid_base);
 	COMPARE_PARAM(tids.hpftid_base, hpftid_base);
 	COMPARE_PARAM(tids.hpftid_end, hpftid_end);
 	COMPARE_PARAM(tids.nhpftids, nhpftids);
 	COMPARE_PARAM(rawf_base, rawf_base);
 	COMPARE_PARAM(nrawf, nrawf);
 	COMPARE_PARAM(params.mps_bg_map, mps_bg_map);
 	COMPARE_PARAM(params.filter2_wr_support, filter2_wr_support);
 	COMPARE_PARAM(params.ulptx_memwrite_dsgl, ulptx_memwrite_dsgl);
 	COMPARE_PARAM(params.fr_nsmr_tpte_wr_support, fr_nsmr_tpte_wr_support);
 	COMPARE_PARAM(params.max_pkts_per_eth_tx_pkts_wr, max_pkts_per_eth_tx_pkts_wr);
 	COMPARE_PARAM(tids.ntids, ntids);
 	COMPARE_PARAM(tids.etid_base, etid_base);
 	COMPARE_PARAM(tids.etid_end, etid_end);
 	COMPARE_PARAM(tids.netids, netids);
 	COMPARE_PARAM(params.eo_wr_cred, eo_wr_cred);
 	COMPARE_PARAM(params.ethoffload, ethoffload);
 	COMPARE_PARAM(tids.natids, natids);
 	COMPARE_PARAM(tids.stid_base, stid_base);
 	COMPARE_PARAM(vres.ddp.start, ddp_start);
 	COMPARE_PARAM(vres.ddp.size, ddp_size);
 	COMPARE_PARAM(params.ofldq_wr_cred, ofldq_wr_cred);
 	COMPARE_PARAM(vres.stag.start, stag_start);
 	COMPARE_PARAM(vres.stag.size, stag_size);
 	COMPARE_PARAM(vres.rq.start, rq_start);
 	COMPARE_PARAM(vres.rq.size, rq_size);
 	COMPARE_PARAM(vres.pbl.start, pbl_start);
 	COMPARE_PARAM(vres.pbl.size, pbl_size);
 	COMPARE_PARAM(vres.qp.start, qp_start);
 	COMPARE_PARAM(vres.qp.size, qp_size);
 	COMPARE_PARAM(vres.cq.start, cq_start);
 	COMPARE_PARAM(vres.cq.size, cq_size);
 	COMPARE_PARAM(vres.ocq.start, ocq_start);
 	COMPARE_PARAM(vres.ocq.size, ocq_size);
 	COMPARE_PARAM(vres.srq.start, srq_start);
 	COMPARE_PARAM(vres.srq.size, srq_size);
 	COMPARE_PARAM(params.max_ordird_qp, max_ordird_qp);
 	COMPARE_PARAM(params.max_ird_adapter, max_ird_adapter);
 	COMPARE_PARAM(vres.iscsi.start, iscsi_start);
 	COMPARE_PARAM(vres.iscsi.size, iscsi_size);
 	COMPARE_PARAM(vres.key.start, key_start);
 	COMPARE_PARAM(vres.key.size, key_size);
 #undef COMPARE_PARAM
 
 	return (rc);
 }
 
 /*
  * Device resume method: bring an adapter that is marked HW_OFF_LIMITS (the
  * state t4_suspend() leaves it in) back into service.
  *
  * After checking that registers are readable it restores the memory windows,
  * re-establishes contact with the firmware, and verifies with
  * compare_caps_and_params() that capabilities and parameters are what they
  * were before.  Ports and extra VIs are then re-initialized and, if the
  * adapter had been fully initialized earlier, so are the adapter, the
  * initialized VIs, and the MAC settings of running interfaces.
  *
  * Returns 0 on success (including the "hw.cxgbe.sos" recovery-mode early
  * exit), ENXIO if the synchronized op could not be started or registers are
  * unreadable, or the error of whichever step failed.
  */
 static int
 t4_resume(device_t dev)
 {
 	struct adapter *sc = device_get_softc(dev);
 	struct adapter_pre_reset_state *old_state = NULL;
 	struct port_info *pi;
 	struct vi_info *vi;
 	struct ifnet *ifp;
 	struct sge_txq *txq;
 	int rc, i, j, k;
 
 	CH_ALERT(sc, "resume requested.\n");
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4res");
 	if (rc != 0)
 		return (ENXIO);
 	/* These are the conditions t4_suspend() establishes before it returns. */
 	MPASS(hw_off_limits(sc));
 	MPASS((sc->flags & FW_OK) == 0);
 	MPASS((sc->flags & MASTER_PF) == 0);
 	MPASS(sc->reset_thread == NULL);
 	sc->reset_thread = curthread;
 
 	/* Register access is expected to work by the time we're here. */
 	if (t4_read_reg(sc, A_PL_WHOAMI) == 0xffffffff) {
 		CH_ERR(sc, "%s: can't read device registers\n", __func__);
 		rc = ENXIO;
 		goto done;
 	}
 
 	/* Restore memory window. */
 	setup_memwin(sc);
 
 	/* Go no further if recovery mode has been requested. */
 	if (TUNABLE_INT_FETCH("hw.cxgbe.sos", &i) && i != 0) {
 		CH_ALERT(sc, "recovery mode on resume.\n");
 		rc = 0;
 		mtx_lock(&sc->reg_lock);
 		sc->flags &= ~HW_OFF_LIMITS;
 		mtx_unlock(&sc->reg_lock);
 		goto done;
 	}
 
 	/*
 	 * Snapshot the current values so that they can be compared with what
 	 * is read back after the firmware has been contacted again.
 	 */
 	old_state = malloc(sizeof(*old_state), M_CXGBE, M_ZERO | M_WAITOK);
 	save_caps_and_params(sc, old_state);
 
 	/* Reestablish contact with firmware and become the primary PF. */
 	rc = contact_firmware(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 	MPASS(sc->flags & FW_OK);
 
 	if (sc->flags & MASTER_PF) {
 		rc = partition_resources(sc);
 		if (rc != 0)
 			goto done; /* error message displayed already */
 		t4_intr_clear(sc);
 	}
 
 	rc = get_params__post_init(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	rc = set_params__post_init(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	rc = compare_caps_and_params(sc, old_state);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
 	/* Re-initialize every port and re-allocate its extra (non-main) VIs. */
 	for_each_port(sc, i) {
 		pi = sc->port[i];
 		MPASS(pi != NULL);
 		MPASS(pi->vi != NULL);
 		MPASS(pi->vi[0].dev == pi->dev);
 
 		/* Negated: t4_port_init presumably returns a negative errno. */
 		rc = -t4_port_init(sc, sc->mbox, sc->pf, 0, i);
 		if (rc != 0) {
 			CH_ERR(sc,
 			    "failed to re-initialize port %d: %d\n", i, rc);
 			goto done;
 		}
 		MPASS(sc->chan_map[pi->tx_chan] == i);
 
 		PORT_LOCK(pi);
 		fixup_link_config(pi);
 		build_medialist(pi);
 		PORT_UNLOCK(pi);
 		for_each_vi(pi, j, vi) {
 			if (IS_MAIN_VI(vi))
 				continue;
 			rc = alloc_extra_vi(sc, pi, vi);
 			if (rc != 0) {
 				CH_ERR(vi,
 				    "failed to re-allocate extra VI: %d\n", rc);
 				goto done;
 			}
 		}
 	}
 
 	/*
 	 * Interrupts and queues are about to be enabled and other threads will
 	 * want to access the hardware too.  It is safe to do so.  Note that
 	 * this thread is still in the middle of a synchronized_op.
 	 */
 	mtx_lock(&sc->reg_lock);
 	sc->flags &= ~HW_OFF_LIMITS;
 	mtx_unlock(&sc->reg_lock);
 
 	/* Nothing more to redo if the adapter was never fully initialized. */
 	if (sc->flags & FULL_INIT_DONE) {
 		rc = adapter_full_init(sc);
 		if (rc != 0) {
 			CH_ERR(sc, "failed to re-initialize adapter: %d\n", rc);
 			goto done;
 		}
 
 		if (sc->vxlan_refcount > 0)
 			enable_vxlan_rx(sc);
 
 		/*
 		 * First pass: bring back every initialized VI and, for running
 		 * interfaces, everything except the multicast addresses.
 		 */
 		for_each_port(sc, i) {
 			pi = sc->port[i];
 			for_each_vi(pi, j, vi) {
 				if (!(vi->flags & VI_INIT_DONE))
 					continue;
 				rc = vi_full_init(vi);
 				if (rc != 0) {
 					CH_ERR(vi, "failed to re-initialize "
 					    "interface: %d\n", rc);
 					goto done;
 				}
 
 				ifp = vi->ifp;
 				if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
 					continue;
 				/*
 				 * Note that we do not setup multicast addresses
 				 * in the first pass.  This ensures that the
 				 * unicast DMACs for all VIs on all ports get an
 				 * MPS TCAM entry.
 				 */
 				rc = update_mac_settings(ifp, XGMAC_ALL &
 				    ~XGMAC_MCADDRS);
 				if (rc != 0) {
 					CH_ERR(vi, "failed to re-configure MAC: %d\n", rc);
 					goto done;
 				}
 				rc = -t4_enable_vi(sc, sc->mbox, vi->viid, true,
 				    true);
 				if (rc != 0) {
 					CH_ERR(vi, "failed to re-enable VI: %d\n", rc);
 					goto done;
 				}
 				/* Undo the EQ_ENABLED clearing done in suspend. */
 				for_each_txq(vi, k, txq) {
 					TXQ_LOCK(txq);
 					txq->eq.flags |= EQ_ENABLED;
 					TXQ_UNLOCK(txq);
 				}
 				/* Restart the tick that suspend stopped. */
 				mtx_lock(&vi->tick_mtx);
 				vi->flags &= ~VI_SKIP_STATS;
 				callout_schedule(&vi->tick, hz);
 				mtx_unlock(&vi->tick_mtx);
 			}
 			/* Refresh and re-apply the link configuration. */
 			PORT_LOCK(pi);
 			if (pi->up_vis > 0) {
 				t4_update_port_info(pi);
 				fixup_link_config(pi);
 				build_medialist(pi);
 				apply_link_config(pi);
 				if (pi->link_cfg.link_ok)
 					t4_os_link_changed(pi);
 			}
 			PORT_UNLOCK(pi);
 		}
 
 		/* Now reprogram the L2 multicast addresses. */
 		for_each_port(sc, i) {
 			pi = sc->port[i];
 			for_each_vi(pi, j, vi) {
 				if (!(vi->flags & VI_INIT_DONE))
 					continue;
 				ifp = vi->ifp;
 				if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
 					continue;
 				rc = update_mac_settings(ifp, XGMAC_MCADDRS);
 				if (rc != 0) {
 					CH_ERR(vi, "failed to re-configure MCAST MACs: %d\n", rc);
 					rc = 0;	/* carry on */
 				}
 			}
 		}
 	}
 done:
 	/* Common exit for success and failure. */
 	if (rc == 0) {
 		sc->incarnation++;
 		CH_ALERT(sc, "resume completed.\n");
 	}
 	end_synchronized_op(sc, 0);
 	/* old_state is still NULL if we bailed out before the snapshot. */
 	free(old_state, M_CXGBE);
 	return (rc);
 }
 
 /*
  * Called before a reset.  Only logs the event; 'child' is unused.  Always
  * returns 0.
  */
 static int
 t4_reset_prepare(device_t dev, device_t child)
 {
 	struct adapter *sc;
 
 	sc = device_get_softc(dev);
 	CH_ALERT(sc, "reset_prepare.\n");
 
 	return (0);
 }
 
 /*
  * Called after a reset.  Only logs the event; 'child' is unused.  Always
  * returns 0.
  */
 static int
 t4_reset_post(device_t dev, device_t child)
 {
 	struct adapter *sc;
 
 	sc = device_get_softc(dev);
 	CH_ALERT(sc, "reset_post.\n");
 
 	return (0);
 }
 
 static void
 reset_adapter(void *arg, int pending)
 {
 	struct adapter *sc = arg;
 	int rc;
 
 	CH_ALERT(sc, "reset requested.\n");
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4rst1");
 	if (rc != 0)
 		return;
 
 	if (hw_off_limits(sc)) {
 		CH_ERR(sc, "adapter is suspended, use resume (not reset).\n");
 		rc = ENXIO;
 		goto done;
 	}
 
 	if (!ok_to_reset(sc)) {
 		/* XXX: should list what resource is preventing reset. */
 		CH_ERR(sc, "not safe to reset.\n");
 		rc = EBUSY;
 		goto done;
 	}
 
 done:
 	end_synchronized_op(sc, 0);
 	if (rc != 0)
 		return;	/* Error logged already. */
 
 	mtx_lock(&Giant);
 	rc = BUS_RESET_CHILD(device_get_parent(sc->dev), sc->dev, 0);
 	mtx_unlock(&Giant);
 	if (rc != 0)
 		CH_ERR(sc, "bus_reset_child failed: %d.\n", rc);
 	else
 		CH_ALERT(sc, "bus_reset_child succeeded.\n");
 }
 
 /*
  * Probe method for a port device.  Sets the device description to
  * "port <port_id>" and returns BUS_PROBE_DEFAULT.
  */
 static int
 cxgbe_probe(device_t dev)
 {
 	struct port_info *pi = device_get_softc(dev);
 	char desc[128];
 
 	snprintf(desc, sizeof(desc), "port %d", pi->port_id);
 	device_set_desc_copy(dev, desc);
 
 	return (BUS_PROBE_DEFAULT);
 }
 
 /*
  * T4_CAP is the base set of interface capabilities that cxgbe_vi_attach()
  * advertises in if_capabilities.  T4_CAP_ENABLE is the subset it turns on in
  * if_capenable, currently all of them.
  */
 #define T4_CAP (IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | \
     IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_JUMBO_MTU | IFCAP_LRO | \
     IFCAP_VLAN_HWTSO | IFCAP_LINKSTATE | IFCAP_HWCSUM_IPV6 | IFCAP_HWSTATS | \
     IFCAP_HWRXTSTMP | IFCAP_MEXTPG)
 #define T4_CAP_ENABLE (T4_CAP)
 
 /*
  * Attach one virtual interface: create its sysctl nodes, set up its tick
  * callout, allocate and configure an ifnet (methods, capabilities, TSO
  * limits), attach it as an Ethernet interface, print a summary of its queue
  * counts, and register a pfil head for it.
  *
  * Returns 0 on success and ENOMEM if the ifnet cannot be allocated.
  */
 static int
 cxgbe_vi_attach(device_t dev, struct vi_info *vi)
 {
 	struct ifnet *ifp;
 	struct sbuf *sb;
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid_list *children;
 	struct pfil_head_args pa;
 	struct adapter *sc = vi->adapter;
 
 	/* One sysctl node per queue type, under the VI device's sysctl tree. */
 	ctx = device_get_sysctl_ctx(vi->dev);
 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(vi->dev));
 	vi->rxq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "rxq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "NIC rx queues");
 	vi->txq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "txq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "NIC tx queues");
 #ifdef DEV_NETMAP
 	vi->nm_rxq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "nm_rxq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "netmap rx queues");
 	vi->nm_txq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "nm_txq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "netmap tx queues");
 #endif
 #ifdef TCP_OFFLOAD
 	vi->ofld_rxq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "ofld_rxq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE rx queues");
 #endif
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	vi->ofld_txq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "ofld_txq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE/ETHOFLD tx queues");
 #endif
 
 	vi->xact_addr_filt = -1;
 	mtx_init(&vi->tick_mtx, "vi tick", NULL, MTX_DEF);
 	callout_init_mtx(&vi->tick, &vi->tick_mtx, 0);
 	/* VFs always use VM work requests; PFs only if t4_tx_vm_wr is set. */
 	if (sc->flags & IS_VF || t4_tx_vm_wr != 0)
 		vi->flags |= TX_USES_VM_WR;
 
 	/* Allocate an ifnet and set it up */
 	ifp = if_alloc_dev(IFT_ETHER, dev);
 	if (ifp == NULL) {
 		device_printf(dev, "Cannot allocate ifnet\n");
 		return (ENOMEM);
 	}
 	vi->ifp = ifp;
 	ifp->if_softc = vi;
 
 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 
 	ifp->if_init = cxgbe_init;
 	ifp->if_ioctl = cxgbe_ioctl;
 	ifp->if_transmit = cxgbe_transmit;
 	ifp->if_qflush = cxgbe_qflush;
 	/* Per-VI counters when the port has several VIs or this is a VF. */
 	if (vi->pi->nvi > 1 || sc->flags & IS_VF)
 		ifp->if_get_counter = vi_get_counter;
 	else
 		ifp->if_get_counter = cxgbe_get_counter;
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	ifp->if_snd_tag_alloc = cxgbe_snd_tag_alloc;
-	ifp->if_snd_tag_modify = cxgbe_snd_tag_modify;
-	ifp->if_snd_tag_query = cxgbe_snd_tag_query;
-	ifp->if_snd_tag_free = cxgbe_snd_tag_free;
 #endif
 #ifdef RATELIMIT
 	ifp->if_ratelimit_query = cxgbe_ratelimit_query;
 #endif
 
 	ifp->if_capabilities = T4_CAP;
 	ifp->if_capenable = T4_CAP_ENABLE;
 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO |
 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6;
 	/* VXLAN checksum/TSO offload is offered on T6 and later chips only. */
 	if (chip_id(sc) >= CHELSIO_T6) {
 		ifp->if_capabilities |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO;
 		ifp->if_capenable |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO;
 		ifp->if_hwassist |= CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP |
 		    CSUM_INNER_IP6_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP |
 		    CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN;
 	}
 
 #ifdef TCP_OFFLOAD
 	/* TOE is advertised but not enabled here. */
 	if (vi->nofldrxq != 0)
 		ifp->if_capabilities |= IFCAP_TOE;
 #endif
 #ifdef RATELIMIT
 	if (is_ethoffload(sc) && vi->nofldtxq != 0) {
 		ifp->if_capabilities |= IFCAP_TXRTLMT;
 		ifp->if_capenable |= IFCAP_TXRTLMT;
 	}
 #endif
 
 	/* TSO limits; the segment count depends on the kind of work request. */
 	ifp->if_hw_tsomax = IP_MAXPACKET;
 	if (vi->flags & TX_USES_VM_WR)
 		ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_VM_TSO;
 	else
 		ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO;
 #ifdef RATELIMIT
 	if (is_ethoffload(sc) && vi->nofldtxq != 0)
 		ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_EO_TSO;
 #endif
 	ifp->if_hw_tsomaxsegsize = 65536;
 #ifdef KERN_TLS
 	/* TXTLS is enabled by default only if KERN_TLS_ON is set. */
 	if (is_ktls(sc)) {
 		ifp->if_capabilities |= IFCAP_TXTLS;
 		if (sc->flags & KERN_TLS_ON)
 			ifp->if_capenable |= IFCAP_TXTLS;
 	}
 #endif
 
 	ether_ifattach(ifp, vi->hw_addr);
 #ifdef DEV_NETMAP
 	if (vi->nnmrxq != 0)
 		cxgbe_nm_attach(vi);
 #endif
 	/* Build and print a one-line summary of this VI's queue counts. */
 	sb = sbuf_new_auto();
 	sbuf_printf(sb, "%d txq, %d rxq (NIC)", vi->ntxq, vi->nrxq);
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	switch (ifp->if_capabilities & (IFCAP_TOE | IFCAP_TXRTLMT)) {
 	case IFCAP_TOE:
 		sbuf_printf(sb, "; %d txq (TOE)", vi->nofldtxq);
 		break;
 	case IFCAP_TOE | IFCAP_TXRTLMT:
 		sbuf_printf(sb, "; %d txq (TOE/ETHOFLD)", vi->nofldtxq);
 		break;
 	case IFCAP_TXRTLMT:
 		sbuf_printf(sb, "; %d txq (ETHOFLD)", vi->nofldtxq);
 		break;
 	}
 #endif
 #ifdef TCP_OFFLOAD
 	if (ifp->if_capabilities & IFCAP_TOE)
 		sbuf_printf(sb, ", %d rxq (TOE)", vi->nofldrxq);
 #endif
 #ifdef DEV_NETMAP
 	if (ifp->if_capabilities & IFCAP_NETMAP)
 		sbuf_printf(sb, "; %d txq, %d rxq (netmap)",
 		    vi->nnmtxq, vi->nnmrxq);
 #endif
 	sbuf_finish(sb);
 	device_printf(dev, "%s\n", sbuf_data(sb));
 	sbuf_delete(sb);
 
 	vi_sysctls(vi);
 
 	/* Register an inbound Ethernet pfil head named after the interface. */
 	pa.pa_version = PFIL_VERSION;
 	pa.pa_flags = PFIL_IN;
 	pa.pa_type = PFIL_TYPE_ETHERNET;
 	pa.pa_headname = ifp->if_xname;
 	vi->pfil = pfil_head_register(&pa);
 
 	return (0);
 }
 
 /*
  * Attach method for a port device.  The port's first VI is attached on the
  * port device itself; every other VI gets a child device with the VI as its
  * softc.  A VI whose child device cannot be added is logged and skipped.
  *
  * Returns the error from cxgbe_vi_attach() if the first VI fails to attach
  * and 0 otherwise.
  */
 static int
 cxgbe_attach(device_t dev)
 {
 	struct port_info *pi = device_get_softc(dev);
 	struct adapter *sc = pi->adapter;
 	struct vi_info *vi;
 	int idx, error;
 
 	error = cxgbe_vi_attach(dev, &pi->vi[0]);
 	if (error != 0)
 		return (error);
 
 	for_each_vi(pi, idx, vi) {
 		/* The first VI was handled above. */
 		if (idx == 0)
 			continue;
 
 		vi->dev = device_add_child(dev, sc->names->vi_ifnet_name, -1);
 		if (vi->dev != NULL)
 			device_set_softc(vi->dev, vi);
 		else
 			device_printf(dev, "failed to add VI %d\n", idx);
 	}
 
 	cxgbe_sysctls(pi);
 
 	/* Attach the child devices added above; the result is not checked. */
 	bus_generic_attach(dev);
 
 	return (0);
 }
 
 /*
  * Detach one virtual interface: unregister its pfil head, detach the ifnet
  * from the Ethernet layer, detach netmap if the interface has that
  * capability, bring the interface down, drain its tick callout, release its
  * resources via vi_full_uninit(), and free the ifnet.
  */
 static void
 cxgbe_vi_detach(struct vi_info *vi)
 {
 	struct ifnet *ifp = vi->ifp;
 
 	if (vi->pfil != NULL) {
 		pfil_head_unregister(vi->pfil);
 		vi->pfil = NULL;
 	}
 
 	ether_ifdetach(ifp);
 
 	/* Let detach proceed even if these fail. */
 #ifdef DEV_NETMAP
 	if (ifp->if_capabilities & IFCAP_NETMAP)
 		cxgbe_nm_detach(vi);
 #endif
 	cxgbe_uninit_synchronized(vi);
 	callout_drain(&vi->tick);
 	vi_full_uninit(vi);
 
 	/* Clear the pointer so that nothing uses the freed ifnet. */
 	if_free(vi->ifp);
 	vi->ifp = NULL;
 }
 
 /*
  * Detach method for a port device.  The extra VIs (child devices) are
  * detached and deleted first, then the port's first VI, the tracer state if
  * this port has the trace queue, and the media list.
  *
  * Returns 0 on success or the error from bus_generic_detach() if the child
  * devices cannot be detached.
  */
 static int
 cxgbe_detach(device_t dev)
 {
 	struct port_info *pi = device_get_softc(dev);
 	struct adapter *sc = pi->adapter;
 	int rc;
 
 	/* Detach the extra VIs first. */
 	rc = bus_generic_detach(dev);
 	if (rc)
 		return (rc);
 	device_delete_children(dev);
 
 	/*
 	 * NOTE(review): doom_vi presumably begins the synchronized op that is
 	 * ended at the bottom of this function -- confirm against doom_vi.
 	 */
 	doom_vi(sc, &pi->vi[0]);
 
 	if (pi->flags & HAS_TRACEQ) {
 		sc->traceq = -1;	/* cloner should not create ifnet */
 		t4_tracer_port_detach(sc);
 	}
 
 	cxgbe_vi_detach(&pi->vi[0]);
 	ifmedia_removeall(&pi->media);
 
 	end_synchronized_op(sc, 0);
 
 	return (0);
 }
 
 /*
  * if_init handler for a VI ('arg' is the struct vi_info).  Runs
  * cxgbe_init_synchronized() inside a synchronized op and does nothing if the
  * op cannot be started.
  */
 static void
 cxgbe_init(void *arg)
 {
 	struct vi_info *vi = arg;
 	struct adapter *sc = vi->adapter;
 	int rc;
 
 	rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4init");
 	if (rc == 0) {
 		cxgbe_init_synchronized(vi);
 		end_synchronized_op(sc, 0);
 	}
 }
 
 static int
 cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data)
 {
 	int rc = 0, mtu, flags;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct ifreq *ifr = (struct ifreq *)data;
 	uint32_t mask;
 
 	switch (cmd) {
 	case SIOCSIFMTU:
 		mtu = ifr->ifr_mtu;
 		if (mtu < ETHERMIN || mtu > MAX_MTU)
 			return (EINVAL);
 
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4mtu");
 		if (rc)
 			return (rc);
 		ifp->if_mtu = mtu;
 		if (vi->flags & VI_INIT_DONE) {
 			t4_update_fl_bufsize(ifp);
 			if (!hw_off_limits(sc) &&
 			    ifp->if_drv_flags & IFF_DRV_RUNNING)
 				rc = update_mac_settings(ifp, XGMAC_MTU);
 		}
 		end_synchronized_op(sc, 0);
 		break;
 
 	case SIOCSIFFLAGS:
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4flg");
 		if (rc)
 			return (rc);
 
 		if (hw_off_limits(sc)) {
 			rc = ENXIO;
 			goto fail;
 		}
 
 		if (ifp->if_flags & IFF_UP) {
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 				flags = vi->if_flags;
 				if ((ifp->if_flags ^ flags) &
 				    (IFF_PROMISC | IFF_ALLMULTI)) {
 					rc = update_mac_settings(ifp,
 					    XGMAC_PROMISC | XGMAC_ALLMULTI);
 				}
 			} else {
 				rc = cxgbe_init_synchronized(vi);
 			}
 			vi->if_flags = ifp->if_flags;
 		} else if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 			rc = cxgbe_uninit_synchronized(vi);
 		}
 		end_synchronized_op(sc, 0);
 		break;
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4multi");
 		if (rc)
 			return (rc);
 		if (!hw_off_limits(sc) && ifp->if_drv_flags & IFF_DRV_RUNNING)
 			rc = update_mac_settings(ifp, XGMAC_MCADDRS);
 		end_synchronized_op(sc, 0);
 		break;
 
 	case SIOCSIFCAP:
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4cap");
 		if (rc)
 			return (rc);
 
 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
 		if (mask & IFCAP_TXCSUM) {
 			ifp->if_capenable ^= IFCAP_TXCSUM;
 			ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP);
 
 			if (IFCAP_TSO4 & ifp->if_capenable &&
 			    !(IFCAP_TXCSUM & ifp->if_capenable)) {
 				mask &= ~IFCAP_TSO4;
 				ifp->if_capenable &= ~IFCAP_TSO4;
 				if_printf(ifp,
 				    "tso4 disabled due to -txcsum.\n");
 			}
 		}
 		if (mask & IFCAP_TXCSUM_IPV6) {
 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
 			ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6);
 
 			if (IFCAP_TSO6 & ifp->if_capenable &&
 			    !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) {
 				mask &= ~IFCAP_TSO6;
 				ifp->if_capenable &= ~IFCAP_TSO6;
 				if_printf(ifp,
 				    "tso6 disabled due to -txcsum6.\n");
 			}
 		}
 		if (mask & IFCAP_RXCSUM)
 			ifp->if_capenable ^= IFCAP_RXCSUM;
 		if (mask & IFCAP_RXCSUM_IPV6)
 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
 
 		/*
 		 * Note that we leave CSUM_TSO alone (it is always set).  The
 		 * kernel takes both IFCAP_TSOx and CSUM_TSO into account before
 		 * sending a TSO request our way, so it's sufficient to toggle
 		 * IFCAP_TSOx only.
 		 */
 		if (mask & IFCAP_TSO4) {
 			if (!(IFCAP_TSO4 & ifp->if_capenable) &&
 			    !(IFCAP_TXCSUM & ifp->if_capenable)) {
 				if_printf(ifp, "enable txcsum first.\n");
 				rc = EAGAIN;
 				goto fail;
 			}
 			ifp->if_capenable ^= IFCAP_TSO4;
 		}
 		if (mask & IFCAP_TSO6) {
 			if (!(IFCAP_TSO6 & ifp->if_capenable) &&
 			    !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) {
 				if_printf(ifp, "enable txcsum6 first.\n");
 				rc = EAGAIN;
 				goto fail;
 			}
 			ifp->if_capenable ^= IFCAP_TSO6;
 		}
 		if (mask & IFCAP_LRO) {
 #if defined(INET) || defined(INET6)
 			int i;
 			struct sge_rxq *rxq;
 
 			ifp->if_capenable ^= IFCAP_LRO;
 			for_each_rxq(vi, i, rxq) {
 				if (ifp->if_capenable & IFCAP_LRO)
 					rxq->iq.flags |= IQ_LRO_ENABLED;
 				else
 					rxq->iq.flags &= ~IQ_LRO_ENABLED;
 			}
 #endif
 		}
 #ifdef TCP_OFFLOAD
 		if (mask & IFCAP_TOE) {
 			int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE;
 
 			rc = toe_capability(vi, enable);
 			if (rc != 0)
 				goto fail;
 
 			ifp->if_capenable ^= mask;
 		}
 #endif
 		if (mask & IFCAP_VLAN_HWTAGGING) {
 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 				rc = update_mac_settings(ifp, XGMAC_VLANEX);
 		}
 		if (mask & IFCAP_VLAN_MTU) {
 			ifp->if_capenable ^= IFCAP_VLAN_MTU;
 
 			/* Need to find out how to disable auto-mtu-inflation */
 		}
 		if (mask & IFCAP_VLAN_HWTSO)
 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
 		if (mask & IFCAP_VLAN_HWCSUM)
 			ifp->if_capenable ^= IFCAP_VLAN_HWCSUM;
 #ifdef RATELIMIT
 		if (mask & IFCAP_TXRTLMT)
 			ifp->if_capenable ^= IFCAP_TXRTLMT;
 #endif
 		if (mask & IFCAP_HWRXTSTMP) {
 			int i;
 			struct sge_rxq *rxq;
 
 			ifp->if_capenable ^= IFCAP_HWRXTSTMP;
 			for_each_rxq(vi, i, rxq) {
 				if (ifp->if_capenable & IFCAP_HWRXTSTMP)
 					rxq->iq.flags |= IQ_RX_TIMESTAMP;
 				else
 					rxq->iq.flags &= ~IQ_RX_TIMESTAMP;
 			}
 		}
 		if (mask & IFCAP_MEXTPG)
 			ifp->if_capenable ^= IFCAP_MEXTPG;
 
 #ifdef KERN_TLS
 		if (mask & IFCAP_TXTLS) {
 			int enable = (ifp->if_capenable ^ mask) & IFCAP_TXTLS;
 
 			rc = ktls_capability(sc, enable);
 			if (rc != 0)
 				goto fail;
 
 			ifp->if_capenable ^= (mask & IFCAP_TXTLS);
 		}
 #endif
 		if (mask & IFCAP_VXLAN_HWCSUM) {
 			ifp->if_capenable ^= IFCAP_VXLAN_HWCSUM;
 			ifp->if_hwassist ^= CSUM_INNER_IP6_UDP |
 			    CSUM_INNER_IP6_TCP | CSUM_INNER_IP |
 			    CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP;
 		}
 		if (mask & IFCAP_VXLAN_HWTSO) {
 			ifp->if_capenable ^= IFCAP_VXLAN_HWTSO;
 			ifp->if_hwassist ^= CSUM_INNER_IP6_TSO |
 			    CSUM_INNER_IP_TSO;
 		}
 
 #ifdef VLAN_CAPABILITIES
 		VLAN_CAPABILITIES(ifp);
 #endif
 fail:
 		end_synchronized_op(sc, 0);
 		break;
 
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 	case SIOCGIFXMEDIA:
 		ifmedia_ioctl(ifp, ifr, &pi->media, cmd);
 		break;
 
 	case SIOCGI2C: {
 		struct ifi2creq i2c;
 
 		rc = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
 		if (rc != 0)
 			break;
 		if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
 			rc = EPERM;
 			break;
 		}
 		if (i2c.len > sizeof(i2c.data)) {
 			rc = EINVAL;
 			break;
 		}
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4i2c");
 		if (rc)
 			return (rc);
 		if (hw_off_limits(sc))
 			rc = ENXIO;
 		else
 			rc = -t4_i2c_rd(sc, sc->mbox, pi->port_id, i2c.dev_addr,
 			    i2c.offset, i2c.len, &i2c.data[0]);
 		end_synchronized_op(sc, 0);
 		if (rc == 0)
 			rc = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c));
 		break;
 	}
 
 	default:
 		rc = ether_ioctl(ifp, cmd, data);
 	}
 
 	return (rc);
 }
 
 /*
  * Queue a single packet (m) for transmission on the VI behind ifp.
  *
  * The packet is dropped with ENETDOWN if the port's link is not up.  Otherwise
  * it is run through parse_pkt() and then either handed to ethofld_transmit()
  * (RATELIMIT kernels, packets carrying a rate-limit send tag) or enqueued on
  * the mp_ring of one of the VI's tx queues.  Returns 0 on success or the error
  * from parse_pkt()/mp_ring_enqueue().  The mbuf is freed on the link-down and
  * enqueue-failure paths here; parse_pkt() is expected to have freed it when it
  * fails (asserted below).
  */
 static int
 cxgbe_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc;
 	struct sge_txq *txq;
 	void *items[1];
 	int rc;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_nextpkt == NULL);	/* not quite ready for this yet */
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	/* A send tag attached to the packet must belong to this ifnet. */
 	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG)
 		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
 #endif
 
 	if (__predict_false(pi->link_cfg.link_ok == false)) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	rc = parse_pkt(&m, vi->flags & TX_USES_VM_WR);
 	if (__predict_false(rc != 0)) {
 		MPASS(m == NULL);			/* was freed already */
 		atomic_add_int(&pi->tx_parse_error, 1);	/* rare, atomic is ok */
 		return (rc);
 	}
 #ifdef RATELIMIT
 	/* Rate-limited traffic bypasses the regular tx queues. */
 	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) {
-		if (m->m_pkthdr.snd_tag->type == IF_SND_TAG_TYPE_RATE_LIMIT)
+		if (m->m_pkthdr.snd_tag->sw->type == IF_SND_TAG_TYPE_RATE_LIMIT)
 			return (ethofld_transmit(ifp, m));
 	}
 #endif
 
 	/*
 	 * Select a txq.  Packets without a hash type stay on the VI's first
 	 * txq; hashed packets are spread by flowid over the queues that follow
 	 * the first rsrv_noflowq ones.
 	 */
 	sc = vi->adapter;
 	txq = &sc->sge.txq[vi->first_txq];
 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
 		txq += ((m->m_pkthdr.flowid % (vi->ntxq - vi->rsrv_noflowq)) +
 		    vi->rsrv_noflowq);
 
 	items[0] = m;
 	rc = mp_ring_enqueue(txq->r, items, 1, 256);
 	if (__predict_false(rc != 0))
 		m_freem(m);
 
 	return (rc);
 }
 
 /*
  * Flush the VI's tx queues and then the ifnet's own queue.
  *
  * For each txq, EQ_QFLUSH is set (under the txq lock), the function waits
  * until the queue's mp_ring reports idle, and the flag is cleared again.
  * This routine sleeps (pause()) while waiting.
  */
 static void
 cxgbe_qflush(struct ifnet *ifp)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct sge_txq *txq;
 	int i;
 
 	/* queues do not exist if !VI_INIT_DONE. */
 	if (vi->flags & VI_INIT_DONE) {
 		for_each_txq(vi, i, txq) {
 			TXQ_LOCK(txq);
 			txq->eq.flags |= EQ_QFLUSH;
 			TXQ_UNLOCK(txq);
 			/* Poll, one tick at a time, until the ring is idle. */
 			while (!mp_ring_is_idle(txq->r)) {
 				mp_ring_check_drainage(txq->r, 4096);
 				pause("qflush", 1);
 			}
 			TXQ_LOCK(txq);
 			txq->eq.flags &= ~EQ_QFLUSH;
 			TXQ_UNLOCK(txq);
 		}
 	}
 	if_qflush(ifp);
 }
 
 /*
  * Report interface counter c for a VI using the per-VI statistics kept in
  * vi->stats.  Counters that are not derived from those statistics are
  * delegated to if_get_counter_default().
  */
 static uint64_t
 vi_get_counter(struct ifnet *ifp, ift_counter c)
 {
 	struct vi_info *vi = ifp->if_softc;
 	const struct fw_vi_stats_vf *st = &vi->stats;
 	struct sge_txq *txq;
 	uint64_t total;
 	int qidx;
 
 	/* vi_refresh_stats() is called with the tick mutex held. */
 	mtx_lock(&vi->tick_mtx);
 	vi_refresh_stats(vi);
 	mtx_unlock(&vi->tick_mtx);
 
 	switch (c) {
 	case IFCOUNTER_IPACKETS:
 		total = st->rx_bcast_frames + st->rx_mcast_frames +
 		    st->rx_ucast_frames;
 		break;
 	case IFCOUNTER_IERRORS:
 		total = st->rx_err_frames;
 		break;
 	case IFCOUNTER_OPACKETS:
 		total = st->tx_bcast_frames + st->tx_mcast_frames +
 		    st->tx_ucast_frames + st->tx_offload_frames;
 		break;
 	case IFCOUNTER_OERRORS:
 		total = st->tx_drop_frames;
 		break;
 	case IFCOUNTER_IBYTES:
 		total = st->rx_bcast_bytes + st->rx_mcast_bytes +
 		    st->rx_ucast_bytes;
 		break;
 	case IFCOUNTER_OBYTES:
 		total = st->tx_bcast_bytes + st->tx_mcast_bytes +
 		    st->tx_ucast_bytes + st->tx_offload_bytes;
 		break;
 	case IFCOUNTER_IMCASTS:
 		total = st->rx_mcast_frames;
 		break;
 	case IFCOUNTER_OMCASTS:
 		total = st->tx_mcast_frames;
 		break;
 	case IFCOUNTER_OQDROPS:
 		/* Sum of the mp_ring drop counters; queues need VI_INIT_DONE. */
 		total = 0;
 		if (vi->flags & VI_INIT_DONE) {
 			for_each_txq(vi, qidx, txq)
 				total += counter_u64_fetch(txq->r->dropped);
 		}
 		break;
 	default:
 		total = if_get_counter_default(ifp, c);
 		break;
 	}
 
 	return (total);
 }
 
 /*
  * Report interface counter c using the port-wide statistics kept in
  * pi->stats.  Counters that are not derived from those statistics are
  * delegated to if_get_counter_default().
  */
 static uint64_t
 cxgbe_get_counter(struct ifnet *ifp, ift_counter c)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	const struct port_stats *ps = &pi->stats;
 	struct sge_txq *txq;
 	uint64_t total;
 	int qidx;
 
 	/* cxgbe_refresh_stats() is called with the tick mutex held. */
 	mtx_lock(&vi->tick_mtx);
 	cxgbe_refresh_stats(vi);
 	mtx_unlock(&vi->tick_mtx);
 
 	switch (c) {
 	case IFCOUNTER_IPACKETS:
 		total = ps->rx_frames;
 		break;
 	case IFCOUNTER_IERRORS:
 		total = ps->rx_jabber + ps->rx_runt + ps->rx_too_long +
 		    ps->rx_fcs_err + ps->rx_len_err;
 		break;
 	case IFCOUNTER_OPACKETS:
 		total = ps->tx_frames;
 		break;
 	case IFCOUNTER_OERRORS:
 		total = ps->tx_error_frames;
 		break;
 	case IFCOUNTER_IBYTES:
 		total = ps->rx_octets;
 		break;
 	case IFCOUNTER_OBYTES:
 		total = ps->tx_octets;
 		break;
 	case IFCOUNTER_IMCASTS:
 		total = ps->rx_mcast_frames;
 		break;
 	case IFCOUNTER_OMCASTS:
 		total = ps->tx_mcast_frames;
 		break;
 	case IFCOUNTER_IQDROPS:
 		total = ps->rx_ovflow0 + ps->rx_ovflow1 + ps->rx_ovflow2 +
 		    ps->rx_ovflow3 + ps->rx_trunc0 + ps->rx_trunc1 +
 		    ps->rx_trunc2 + ps->rx_trunc3 + pi->tnl_cong_drops;
 		break;
 	case IFCOUNTER_OQDROPS:
 		/* Port drops plus the mp_ring drops of this VI's tx queues. */
 		total = ps->tx_drop;
 		if (vi->flags & VI_INIT_DONE) {
 			for_each_txq(vi, qidx, txq)
 				total += counter_u64_fetch(txq->r->dropped);
 		}
 		break;
 	default:
 		total = if_get_counter_default(ifp, c);
 		break;
 	}
 
 	return (total);
 }
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 /*
  * Allocate a send tag of the type named in params->hdr.type by dispatching to
  * the rate-limit or TLS allocator compiled into this kernel.  Returns the
  * allocator's result, or EOPNOTSUPP for any other tag type.
  */
 static int
 cxgbe_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
     struct m_snd_tag **pt)
 {
 
 	switch (params->hdr.type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
 		return (cxgbe_rate_tag_alloc(ifp, params, pt));
 #endif
 #ifdef KERN_TLS
 	case IF_SND_TAG_TYPE_TLS:
 		return (cxgbe_tls_tag_alloc(ifp, params, pt));
 #endif
 	default:
 		return (EOPNOTSUPP);
 	}
 }
-
-static int
-cxgbe_snd_tag_modify(struct m_snd_tag *mst,
-    union if_snd_tag_modify_params *params)
-{
-
-	switch (mst->type) {
-#ifdef RATELIMIT
-	case IF_SND_TAG_TYPE_RATE_LIMIT:
-		return (cxgbe_rate_tag_modify(mst, params));
-#endif
-	default:
-		return (EOPNOTSUPP);
-	}
-}
-
-static int
-cxgbe_snd_tag_query(struct m_snd_tag *mst,
-    union if_snd_tag_query_params *params)
-{
-
-	switch (mst->type) {
-#ifdef RATELIMIT
-	case IF_SND_TAG_TYPE_RATE_LIMIT:
-		return (cxgbe_rate_tag_query(mst, params));
-#endif
-	default:
-		return (EOPNOTSUPP);
-	}
-}
-
-static void
-cxgbe_snd_tag_free(struct m_snd_tag *mst)
-{
-
-	switch (mst->type) {
-#ifdef RATELIMIT
-	case IF_SND_TAG_TYPE_RATE_LIMIT:
-		cxgbe_rate_tag_free(mst);
-		return;
-#endif
-#ifdef KERN_TLS
-	case IF_SND_TAG_TYPE_TLS:
-		cxgbe_tls_tag_free(mst);
-		return;
-#endif
-	default:
-		panic("shouldn't get here");
-	}
-}
 #endif
 
 /*
  * The kernel picks a media from the list we had provided but we still validate
  * the request.
  *
  * Translates the selected ifmedia word into the port's requested link settings
  * (autonegotiation, speed, pause flags) and applies them if pi->up_vis is
  * non-zero.  Returns 0 on success, ENOTSUP if autoselect was requested but the
  * port capabilities lack FW_PORT_CAP32_ANEG, or the error from
  * begin_synchronized_op()/apply_link_config().
  */
 int
 cxgbe_media_change(struct ifnet *ifp)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct ifmedia *ifm = &pi->media;
 	struct link_config *lc = &pi->link_cfg;
 	struct adapter *sc = pi->adapter;
 	int rc;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mec");
 	if (rc != 0)
 		return (rc);
 	PORT_LOCK(pi);
 	if (IFM_SUBTYPE(ifm->ifm_media) == IFM_AUTO) {
 		/* ifconfig .. media autoselect */
 		if (!(lc->pcaps & FW_PORT_CAP32_ANEG)) {
 			rc = ENOTSUP; /* AN not supported by transceiver */
 			goto done;
 		}
 		lc->requested_aneg = AUTONEG_ENABLE;
 		lc->requested_speed = 0;
 		lc->requested_fc |= PAUSE_AUTONEG;
 	} else {
 		/* Fixed media: speed and pause come from the media word. */
 		lc->requested_aneg = AUTONEG_DISABLE;
 		lc->requested_speed =
 		    ifmedia_baudrate(ifm->ifm_media) / 1000000;
 		lc->requested_fc = 0;
 		if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_RXPAUSE)
 			lc->requested_fc |= PAUSE_RX;
 		if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_TXPAUSE)
 			lc->requested_fc |= PAUSE_TX;
 	}
 	/* The request is only recorded, not applied, when up_vis is 0. */
 	if (pi->up_vis > 0) {
 		fixup_link_config(pi);
 		rc = apply_link_config(pi);
 	}
 done:
 	PORT_UNLOCK(pi);
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 /*
  * Base media word (without ETHER, pause, link active, etc.) for the port at the
  * given speed.
  *
  * speed must be exactly one FW_PORT_CAP32_SPEED_* bit (asserted).  The result
  * depends on pi->port_type and, for ports with a pluggable transceiver, on
  * pi->mod_type.  Returns IFM_NONE when the port type or module type is "none"
  * and IFM_UNKNOWN for any combination not listed below.
  */
 static int
 port_mword(struct port_info *pi, uint32_t speed)
 {
 
 	MPASS(speed & M_FW_PORT_CAP32_SPEED);
 	MPASS(powerof2(speed));
 
 	switch(pi->port_type) {
 	case FW_PORT_TYPE_BT_SGMII:
 	case FW_PORT_TYPE_BT_XFI:
 	case FW_PORT_TYPE_BT_XAUI:
 		/* BaseT */
 		switch (speed) {
 		case FW_PORT_CAP32_SPEED_100M:
 			return (IFM_100_T);
 		case FW_PORT_CAP32_SPEED_1G:
 			return (IFM_1000_T);
 		case FW_PORT_CAP32_SPEED_10G:
 			return (IFM_10G_T);
 		}
 		break;
 	case FW_PORT_TYPE_KX4:
 		if (speed == FW_PORT_CAP32_SPEED_10G)
 			return (IFM_10G_KX4);
 		break;
 	case FW_PORT_TYPE_CX4:
 		if (speed == FW_PORT_CAP32_SPEED_10G)
 			return (IFM_10G_CX4);
 		break;
 	case FW_PORT_TYPE_KX:
 		if (speed == FW_PORT_CAP32_SPEED_1G)
 			return (IFM_1000_KX);
 		break;
 	case FW_PORT_TYPE_KR:
 	case FW_PORT_TYPE_BP_AP:
 	case FW_PORT_TYPE_BP4_AP:
 	case FW_PORT_TYPE_BP40_BA:
 	case FW_PORT_TYPE_KR4_100G:
 	case FW_PORT_TYPE_KR_SFP28:
 	case FW_PORT_TYPE_KR_XLAUI:
 		switch (speed) {
 		case FW_PORT_CAP32_SPEED_1G:
 			return (IFM_1000_KX);
 		case FW_PORT_CAP32_SPEED_10G:
 			return (IFM_10G_KR);
 		case FW_PORT_CAP32_SPEED_25G:
 			return (IFM_25G_KR);
 		case FW_PORT_CAP32_SPEED_40G:
 			return (IFM_40G_KR4);
 		case FW_PORT_CAP32_SPEED_50G:
 			return (IFM_50G_KR2);
 		case FW_PORT_CAP32_SPEED_100G:
 			return (IFM_100G_KR4);
 		}
 		break;
 	case FW_PORT_TYPE_FIBER_XFI:
 	case FW_PORT_TYPE_FIBER_XAUI:
 	case FW_PORT_TYPE_SFP:
 	case FW_PORT_TYPE_QSFP_10G:
 	case FW_PORT_TYPE_QSA:
 	case FW_PORT_TYPE_QSFP:
 	case FW_PORT_TYPE_CR4_QSFP:
 	case FW_PORT_TYPE_CR_QSFP:
 	case FW_PORT_TYPE_CR2_QSFP:
 	case FW_PORT_TYPE_SFP28:
 		/* Pluggable transceiver */
 		switch (pi->mod_type) {
 		case FW_PORT_MOD_TYPE_LR:
 			switch (speed) {
 			case FW_PORT_CAP32_SPEED_1G:
 				return (IFM_1000_LX);
 			case FW_PORT_CAP32_SPEED_10G:
 				return (IFM_10G_LR);
 			case FW_PORT_CAP32_SPEED_25G:
 				return (IFM_25G_LR);
 			case FW_PORT_CAP32_SPEED_40G:
 				return (IFM_40G_LR4);
 			case FW_PORT_CAP32_SPEED_50G:
 				return (IFM_50G_LR2);
 			case FW_PORT_CAP32_SPEED_100G:
 				return (IFM_100G_LR4);
 			}
 			break;
 		case FW_PORT_MOD_TYPE_SR:
 			switch (speed) {
 			case FW_PORT_CAP32_SPEED_1G:
 				return (IFM_1000_SX);
 			case FW_PORT_CAP32_SPEED_10G:
 				return (IFM_10G_SR);
 			case FW_PORT_CAP32_SPEED_25G:
 				return (IFM_25G_SR);
 			case FW_PORT_CAP32_SPEED_40G:
 				return (IFM_40G_SR4);
 			case FW_PORT_CAP32_SPEED_50G:
 				return (IFM_50G_SR2);
 			case FW_PORT_CAP32_SPEED_100G:
 				return (IFM_100G_SR4);
 			}
 			break;
 		case FW_PORT_MOD_TYPE_ER:
 			if (speed == FW_PORT_CAP32_SPEED_10G)
 				return (IFM_10G_ER);
 			break;
 		case FW_PORT_MOD_TYPE_TWINAX_PASSIVE:
 		case FW_PORT_MOD_TYPE_TWINAX_ACTIVE:
 			switch (speed) {
 			case FW_PORT_CAP32_SPEED_1G:
 				return (IFM_1000_CX);
 			case FW_PORT_CAP32_SPEED_10G:
 				return (IFM_10G_TWINAX);
 			case FW_PORT_CAP32_SPEED_25G:
 				return (IFM_25G_CR);
 			case FW_PORT_CAP32_SPEED_40G:
 				return (IFM_40G_CR4);
 			case FW_PORT_CAP32_SPEED_50G:
 				return (IFM_50G_CR2);
 			case FW_PORT_CAP32_SPEED_100G:
 				return (IFM_100G_CR4);
 			}
 			break;
 		case FW_PORT_MOD_TYPE_LRM:
 			if (speed == FW_PORT_CAP32_SPEED_10G)
 				return (IFM_10G_LRM);
 			break;
 		case FW_PORT_MOD_TYPE_NA:
 			MPASS(0);	/* Not pluggable? */
 			/* fall through */
 		case FW_PORT_MOD_TYPE_ERROR:
 		case FW_PORT_MOD_TYPE_UNKNOWN:
 		case FW_PORT_MOD_TYPE_NOTSUPPORTED:
 			break;
 		case FW_PORT_MOD_TYPE_NONE:
 			return (IFM_NONE);
 		}
 		break;
 	case FW_PORT_TYPE_NONE:
 		return (IFM_NONE);
 	}
 
 	/* No mapping for this port type / module type / speed combination. */
 	return (IFM_UNKNOWN);
 }
 
 /*
  * Fill in ifmr with the current media status of the port behind ifp.
  *
  * ifm_status is always marked valid (IFM_AVALID).  IFM_ACTIVE and ifm_active
  * (full-duplex Ethernet, pause options from lc->fc, and the media word for the
  * current link speed) are filled in only when the link is up.  Nothing is
  * written to ifmr if the synchronized op cannot be started.
  */
 void
 cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 
 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4med") != 0)
 		return;
 	PORT_LOCK(pi);
 
 	if (pi->up_vis == 0) {
 		/*
 		 * If all the interfaces are administratively down the firmware
 		 * does not report transceiver changes.  Refresh port info here
 		 * so that ifconfig displays accurate ifmedia at all times.
 		 * This is the only reason we have a synchronized op in this
 		 * function.  Just PORT_LOCK would have been enough otherwise.
 		 */
 		t4_update_port_info(pi);
 		build_medialist(pi);
 	}
 
 	/* ifm_status */
 	ifmr->ifm_status = IFM_AVALID;
 	if (lc->link_ok == false)
 		goto done;
 	ifmr->ifm_status |= IFM_ACTIVE;
 
 	/* ifm_active */
 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
 	ifmr->ifm_active &= ~(IFM_ETH_TXPAUSE | IFM_ETH_RXPAUSE);
 	if (lc->fc & PAUSE_RX)
 		ifmr->ifm_active |= IFM_ETH_RXPAUSE;
 	if (lc->fc & PAUSE_TX)
 		ifmr->ifm_active |= IFM_ETH_TXPAUSE;
 	ifmr->ifm_active |= port_mword(pi, speed_to_fwcap(lc->speed));
 done:
 	PORT_UNLOCK(pi);
 	end_synchronized_op(sc, 0);
 }
 
 /*
  * Probe routine for an extra VI device: sets a description of the form
  * "port <port_id> vi <index of the VI within its port>" and claims the
  * device with BUS_PROBE_DEFAULT.
  */
 static int
 vcxgbe_probe(device_t dev)
 {
 	struct vi_info *vi = device_get_softc(dev);
 	struct port_info *pi = vi->pi;
 	char desc[128];
 
 	snprintf(desc, sizeof(desc), "port %d vi %td", pi->port_id,
 	    vi - pi->vi);
 	device_set_desc_copy(dev, desc);
 
 	return (BUS_PROBE_DEFAULT);
 }
 
 /*
  * Allocate an extra (non-primary) virtual interface on port pi and record its
  * id and RSS information in vi.
  *
  * Must be called from within a synchronized operation (asserted).  On success
  * vi->viid holds the id returned by t4_alloc_vi_func() and vi->rss_base is
  * either the value reported by the firmware's RSSINFO parameter or 0xffff
  * when the VI has no slice of the RSS table or the query failed.  Returns 0
  * on success or a positive errno if the VI could not be allocated.
  */
 static int
 alloc_extra_vi(struct adapter *sc, struct port_info *pi, struct vi_info *vi)
 {
 	int func, index, rc;
 	uint32_t param, val;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	index = vi - pi->vi;
 	MPASS(index > 0);	/* This function deals with _extra_ VIs only */
 	KASSERT(index < nitems(vi_mac_funcs),
 	    ("%s: VI %s doesn't have a MAC func", __func__,
 	    device_get_nameunit(vi->dev)));
 	func = vi_mac_funcs[index];
 	rc = t4_alloc_vi_func(sc, sc->mbox, pi->tx_chan, sc->pf, 0, 1,
 	    vi->hw_addr, &vi->rss_size, &vi->vfvld, &vi->vin, func, 0);
 	if (rc < 0) {
 		/* Note the trailing space: the two literals are concatenated. */
 		CH_ERR(vi, "failed to allocate virtual interface %d "
 		    "for port %d: %d\n", index, pi->port_id, -rc);
 		return (-rc);
 	}
 	vi->viid = rc;
 
 	if (vi->rss_size == 1) {
 		/*
 		 * This VI didn't get a slice of the RSS table.  Reduce the
 		 * number of VIs being created (hw.cxgbe.num_vis) or modify the
 		 * configuration file (nvi, rssnvi for this PF) if this is a
 		 * problem.
 		 */
 		device_printf(vi->dev, "RSS table not available.\n");
 		vi->rss_base = 0xffff;
 
 		return (0);
 	}
 
 	/* Ask the firmware where this VI's RSS slice starts. */
 	param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_RSSINFO) |
 	    V_FW_PARAMS_PARAM_YZ(vi->viid);
 	rc = t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 	if (rc)
 		vi->rss_base = 0xffff;	/* not fatal; base is just unknown */
 	else {
 		/* Upper 16 bits: slice size, lower 16 bits: slice base. */
 		MPASS((val >> 16) == vi->rss_size);
 		vi->rss_base = val & 0xffff;
 	}
 
 	return (0);
 }
 
 /*
  * Attach routine for an extra VI device: allocates the VI in hardware inside
  * a synchronized operation and then performs the common VI attach.  The
  * hardware VI is freed again if the common attach fails.  Returns 0 or an
  * errno.
  */
 static int
 vcxgbe_attach(device_t dev)
 {
 	struct vi_info *vi = device_get_softc(dev);
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	int error;
 
 	error = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4via");
 	if (error != 0)
 		return (error);
 	error = alloc_extra_vi(sc, pi, vi);
 	end_synchronized_op(sc, 0);
 	if (error != 0)
 		return (error);
 
 	error = cxgbe_vi_attach(dev, vi);
 	if (error != 0)
 		t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid);
 
 	return (error);
 }
 
 /*
  * Detach routine for an extra VI device: dooms the VI, runs the common VI
  * detach, and frees the VI in hardware.  Always returns 0.
  */
 static int
 vcxgbe_detach(device_t dev)
 {
 	struct vi_info *vi = device_get_softc(dev);
 	struct adapter *sc = vi->adapter;
 
 	/*
 	 * NOTE(review): doom_vi() presumably leaves a synchronized op open;
 	 * it is ended below once the VI has been torn down.
 	 */
 	doom_vi(sc, vi);
 
 	cxgbe_vi_detach(vi);
 	t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid);
 
 	end_synchronized_op(sc, 0);
 
 	return (0);
 }
 
 /* Callout that t4_fatal_err() arms to run delayed_panic(). */
 static struct callout fatal_callout;
 /* Taskqueue on which t4_fatal_err() enqueues an adapter's reset_task. */
 static struct taskqueue *reset_tq;
 
 static void
 delayed_panic(void *arg)
 {
 	struct adapter *sc = arg;
 
 	panic("%s: panic on fatal error", device_get_nameunit(sc->dev));
 }
 
 /*
  * Handle a fatal adapter error: shut the adapter down, log the event, and mark
  * the adapter with ADAP_ERR.
  *
  * fw_error selects how ADAP_ERR is set: when true the flag is set without
  * taking the adapter lock (a synchronized op is asserted if CHK_MBOX_ACCESS
  * is set); otherwise it is set under ADAPTER_LOCK.  Afterwards, depending on
  * the t4_panic_on_fatal_err / t4_reset_on_fatal_err settings, a panic is
  * scheduled for 30 seconds later or the adapter's reset task is enqueued.
  */
 void
 t4_fatal_err(struct adapter *sc, bool fw_error)
 {
 
 	t4_shutdown_adapter(sc);
 	log(LOG_ALERT, "%s: encountered fatal error, adapter stopped.\n",
 	    device_get_nameunit(sc->dev));
 	if (fw_error) {
 		if (sc->flags & CHK_MBOX_ACCESS)
 			ASSERT_SYNCHRONIZED_OP(sc);
 		sc->flags |= ADAP_ERR;
 	} else {
 		ADAPTER_LOCK(sc);
 		sc->flags |= ADAP_ERR;
 		ADAPTER_UNLOCK(sc);
 	}
 #ifdef TCP_OFFLOAD
 	taskqueue_enqueue(taskqueue_thread, &sc->async_event_task);
 #endif
 
 	/* Panic takes precedence over reset when both are enabled. */
 	if (t4_panic_on_fatal_err) {
 		CH_ALERT(sc, "panicking on fatal error (after 30s).\n");
 		callout_reset(&fatal_callout, hz * 30, delayed_panic, sc);
 	} else if (t4_reset_on_fatal_err) {
 		CH_ALERT(sc, "resetting on fatal error.\n");
 		taskqueue_enqueue(reset_tq, &sc->reset_task);
 	}
 }
 
 /*
  * Insert sc at the head of the global adapter list (t4_list), holding
  * t4_list_lock exclusively for the duration of the insertion.
  */
 void
 t4_add_adapter(struct adapter *sc)
 {
 	sx_xlock(&t4_list_lock);
 	SLIST_INSERT_HEAD(&t4_list, sc, link);
 	sx_xunlock(&t4_list_lock);
 }
 
 /*
  * Map PCI BAR0 (device registers) and BAR4 (MSI-X) as active memory resources.
  *
  * On success the register bus tag/handle and mmio length are recorded in sc
  * and the DOORBELL_KDB bit is set in sc->doorbells.  Returns 0 on success or
  * ENXIO if either BAR cannot be mapped.  Resources allocated before a failure
  * are not released here.
  */
 int
 t4_map_bars_0_and_4(struct adapter *sc)
 {
 	sc->regs_rid = PCIR_BAR(0);
 	sc->regs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
 	    &sc->regs_rid, RF_ACTIVE);
 	if (sc->regs_res == NULL) {
 		device_printf(sc->dev, "cannot map registers.\n");
 		return (ENXIO);
 	}
 	sc->bt = rman_get_bustag(sc->regs_res);
 	sc->bh = rman_get_bushandle(sc->regs_res);
 	sc->mmio_len = rman_get_size(sc->regs_res);
 	setbit(&sc->doorbells, DOORBELL_KDB);
 
 	sc->msix_rid = PCIR_BAR(4);
 	sc->msix_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
 	    &sc->msix_rid, RF_ACTIVE);
 	if (sc->msix_res == NULL) {
 		device_printf(sc->dev, "cannot map MSI-X BAR.\n");
 		return (ENXIO);
 	}
 
 	return (0);
 }
 
 /*
  * Map PCI BAR2 (userspace doorbells) and record which doorbell mechanisms are
  * usable in sc->doorbells.
  *
  * Nothing is mapped on a T4 with RDMA disabled.  On T5 and later DOORBELL_UDB
  * is enabled; on i386/amd64 with t5_write_combine set, the BAR is switched to
  * write-combining and, if that succeeds, DOORBELL_UDB is replaced by
  * DOORBELL_WCWR and DOORBELL_UDBWC.  Returns 0 on success (including the
  * nothing-to-map case) or ENXIO if the BAR cannot be mapped.
  */
 int
 t4_map_bar_2(struct adapter *sc)
 {
 
 	/*
 	 * T4: only iWARP driver uses the userspace doorbells.  There is no need
 	 * to map it if RDMA is disabled.
 	 */
 	if (is_t4(sc) && sc->rdmacaps == 0)
 		return (0);
 
 	sc->udbs_rid = PCIR_BAR(2);
 	sc->udbs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
 	    &sc->udbs_rid, RF_ACTIVE);
 	if (sc->udbs_res == NULL) {
 		device_printf(sc->dev, "cannot map doorbell BAR.\n");
 		return (ENXIO);
 	}
 	sc->udbs_base = rman_get_virtual(sc->udbs_res);
 
 	if (chip_id(sc) >= CHELSIO_T5) {
 		setbit(&sc->doorbells, DOORBELL_UDB);
 #if defined(__i386__) || defined(__amd64__)
 		if (t5_write_combine) {
 			int rc, mode;
 
 			/*
 			 * Enable write combining on BAR2.  This is the
 			 * userspace doorbell BAR and is split into 128B
 			 * (UDBS_SEG_SIZE) doorbell regions, each associated
 			 * with an egress queue.  The first 64B has the doorbell
 			 * and the second 64B can be used to submit a tx work
 			 * request with an implicit doorbell.
 			 */
 
 			rc = pmap_change_attr((vm_offset_t)sc->udbs_base,
 			    rman_get_size(sc->udbs_res), PAT_WRITE_COMBINING);
 			if (rc == 0) {
 				clrbit(&sc->doorbells, DOORBELL_UDB);
 				setbit(&sc->doorbells, DOORBELL_WCWR);
 				setbit(&sc->doorbells, DOORBELL_UDBWC);
 			} else {
 				device_printf(sc->dev,
 				    "couldn't enable write combining: %d\n",
 				    rc);
 			}
 
 			/* Written whether or not write combining was enabled. */
 			mode = is_t5(sc) ? V_STATMODE(0) : V_T6_STATMODE(0);
 			t4_write_reg(sc, A_SGE_STAT_CFG,
 			    V_STATSOURCE_T5(7) | mode);
 		}
 #endif
 	}
 	/* Tell iWARP whether write-combined doorbells are in use. */
 	sc->iwt.wc_en = isset(&sc->doorbells, DOORBELL_UDBWC) ? 1 : 0;
 
 	return (0);
 }
 
 /* Initial base and aperture of one memory window; see setup_memwin(). */
 struct memwin_init {
 	uint32_t base;
 	uint32_t aperture;
 };
 
 /* Memory window layout used when is_t4() is true. */
 static const struct memwin_init t4_memwin[NUM_MEMWIN] = {
 	{ MEMWIN0_BASE, MEMWIN0_APERTURE },
 	{ MEMWIN1_BASE, MEMWIN1_APERTURE },
 	{ MEMWIN2_BASE_T4, MEMWIN2_APERTURE_T4 }
 };
 
 /* Memory window layout used for all other chips (window 2 differs). */
 static const struct memwin_init t5_memwin[NUM_MEMWIN] = {
 	{ MEMWIN0_BASE, MEMWIN0_APERTURE },
 	{ MEMWIN1_BASE, MEMWIN1_APERTURE },
 	{ MEMWIN2_BASE_T5, MEMWIN2_APERTURE_T5 },
 };
 
 /*
  * Program the adapter's memory window decoders.
  *
  * The software state of each window (lock, base, aperture, position) is
  * initialized only the first time, i.e. while its lock is still
  * uninitialized; the hardware registers are written on every call, and each
  * window is repositioned at its recorded mw_curpos.
  */
 static void
 setup_memwin(struct adapter *sc)
 {
 	const struct memwin_init *mw_init;
 	struct memwin *mw;
 	int i;
 	uint32_t bar0;
 
 	if (is_t4(sc)) {
 		/*
 		 * Read low 32b of bar0 indirectly via the hardware backdoor
 		 * mechanism.  Works from within PCI passthrough environments
 		 * too, where rman_get_start() can return a different value.  We
 		 * need to program the T4 memory window decoders with the actual
 		 * addresses that will be coming across the PCIe link.
 		 */
 		bar0 = t4_hw_pci_read_cfg4(sc, PCIR_BAR(0));
 		bar0 &= (uint32_t) PCIM_BAR_MEM_BASE;
 
 		mw_init = &t4_memwin[0];
 	} else {
 		/* T5+ use the relative offset inside the PCIe BAR */
 		bar0 = 0;
 
 		mw_init = &t5_memwin[0];
 	}
 
 	for (i = 0, mw = &sc->memwin[0]; i < NUM_MEMWIN; i++, mw_init++, mw++) {
 		if (!rw_initialized(&mw->mw_lock)) {
 			rw_init(&mw->mw_lock, "memory window access");
 			mw->mw_base = mw_init->base;
 			mw->mw_aperture = mw_init->aperture;
 			mw->mw_curpos = 0;
 		}
 		/* The window size is encoded as log2(aperture) - 10. */
 		t4_write_reg(sc,
 		    PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, i),
 		    (mw->mw_base + bar0) | V_BIR(0) |
 		    V_WINDOW(ilog2(mw->mw_aperture) - 10));
 		/* position_memwin() requires the window's write lock. */
 		rw_wlock(&mw->mw_lock);
 		position_memwin(sc, i, mw->mw_curpos);
 		rw_wunlock(&mw->mw_lock);
 	}
 
 	/* flush */
 	t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, 2));
 }
 
 /*
  * Positions the memory window at the given address in the card's address space.
  * There are some alignment requirements and the actual position may be at an
  * address prior to the requested address.  mw->mw_curpos always has the actual
  * position of the window.
  *
  * The caller must hold the window's mw_lock exclusively (asserted).  idx must
  * be a valid window index in [0, NUM_MEMWIN).
  */
 static void
 position_memwin(struct adapter *sc, int idx, uint32_t addr)
 {
 	struct memwin *mw;
 	uint32_t pf;
 	uint32_t reg;
 
 	MPASS(idx >= 0 && idx < NUM_MEMWIN);
 	mw = &sc->memwin[idx];
 	rw_assert(&mw->mw_lock, RA_WLOCKED);
 
 	if (is_t4(sc)) {
 		pf = 0;
 		mw->mw_curpos = addr & ~0xf;	/* start must be 16B aligned */
 	} else {
 		/* Non-T4 chips also encode the PF number in the register. */
 		pf = V_PFNUM(sc->pf);
 		mw->mw_curpos = addr & ~0x7f;	/* start must be 128B aligned */
 	}
 	reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, idx);
 	t4_write_reg(sc, reg, mw->mw_curpos | pf);
 	t4_read_reg(sc, reg);	/* flush */
 }
 
 /*
  * Read or write len bytes of card memory at addr through memory window idx.
  *
  * rw == 0 reads from the card into val; any other value writes val to the
  * card.  Data is transferred in 32-bit units and converted with
  * le32toh()/htole32().  addr and len must be multiples of 4 and len must be
  * positive, otherwise EINVAL is returned.  Returns 0 on success.
  *
  * The window is held read-locked during the transfer and is repositioned
  * (under the write lock) whenever addr falls outside its current span.
  */
 int
 rw_via_memwin(struct adapter *sc, int idx, uint32_t addr, uint32_t *val,
     int len, int rw)
 {
 	struct memwin *mw;
 	uint32_t mw_end, v;
 
 	MPASS(idx >= 0 && idx < NUM_MEMWIN);
 
 	/* Memory can only be accessed in naturally aligned 4 byte units */
 	if (addr & 3 || len & 3 || len <= 0)
 		return (EINVAL);
 
 	mw = &sc->memwin[idx];
 	while (len > 0) {
 		rw_rlock(&mw->mw_lock);
 		mw_end = mw->mw_curpos + mw->mw_aperture;
 		if (addr >= mw_end || addr < mw->mw_curpos) {
 			/* Will need to reposition the window */
 			if (!rw_try_upgrade(&mw->mw_lock)) {
 				/* Upgrade failed: drop and retake as writer. */
 				rw_runlock(&mw->mw_lock);
 				rw_wlock(&mw->mw_lock);
 			}
 			rw_assert(&mw->mw_lock, RA_WLOCKED);
 			position_memwin(sc, idx, addr);
 			rw_downgrade(&mw->mw_lock);
 			mw_end = mw->mw_curpos + mw->mw_aperture;
 		}
 		rw_assert(&mw->mw_lock, RA_RLOCKED);
 		/* Transfer as much as the window covers from its position. */
 		while (addr < mw_end && len > 0) {
 			if (rw == 0) {
 				v = t4_read_reg(sc, mw->mw_base + addr -
 				    mw->mw_curpos);
 				*val++ = le32toh(v);
 			} else {
 				v = *val++;
 				t4_write_reg(sc, mw->mw_base + addr -
 				    mw->mw_curpos, htole32(v));
 			}
 			addr += 4;
 			len -= 4;
 		}
 		rw_runlock(&mw->mw_lock);
 	}
 
 	return (0);
 }
 
 static void
 t4_init_atid_table(struct adapter *sc)
 {
 	struct tid_info *t;
 	int i;
 
 	t = &sc->tids;
 	if (t->natids == 0)
 		return;
 
 	MPASS(t->atid_tab == NULL);
 
 	t->atid_tab = malloc(t->natids * sizeof(*t->atid_tab), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 	mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF);
 	t->afree = t->atid_tab;
 	t->atids_in_use = 0;
 	for (i = 1; i < t->natids; i++)
 		t->atid_tab[i - 1].next = &t->atid_tab[i];
 	t->atid_tab[t->natids - 1].next = NULL;
 }
 
 /*
  * Tear down the atid table: destroy its lock if it was initialized and free
  * the table.  No atids may be in use (asserted).
  */
 static void
 t4_free_atid_table(struct adapter *sc)
 {
 	struct tid_info *t = &sc->tids;
 
 	KASSERT(t->atids_in_use == 0,
 	    ("%s: %d atids still in use.", __func__, t->atids_in_use));
 
 	if (mtx_initialized(&t->atid_lock))
 		mtx_destroy(&t->atid_lock);
 
 	free(t->atid_tab, M_CXGBE);
 	t->atid_tab = NULL;
 }
 
 int
 alloc_atid(struct adapter *sc, void *ctx)
 {
 	struct tid_info *t = &sc->tids;
 	int atid = -1;
 
 	mtx_lock(&t->atid_lock);
 	if (t->afree) {
 		union aopen_entry *p = t->afree;
 
 		atid = p - t->atid_tab;
 		MPASS(atid <= M_TID_TID);
 		t->afree = p->next;
 		p->data = ctx;
 		t->atids_in_use++;
 	}
 	mtx_unlock(&t->atid_lock);
 	return (atid);
 }
 
 /*
  * Return the context pointer stored for atid.  The index is not validated
  * and no lock is taken here.
  */
 void *
 lookup_atid(struct adapter *sc, int atid)
 {
 
 	return (sc->tids.atid_tab[atid].data);
 }
 
 /*
  * Return atid's entry to the head of the free list.  The index is not
  * validated here.
  */
 void
 free_atid(struct adapter *sc, int atid)
 {
 	struct tid_info *t = &sc->tids;
 	union aopen_entry *entry;
 
 	entry = &t->atid_tab[atid];
 
 	mtx_lock(&t->atid_lock);
 	entry->next = t->afree;
 	t->afree = entry;
 	t->atids_in_use--;
 	mtx_unlock(&t->atid_lock);
 }
 
 /*
  * Placeholder for deferring a tid release; release_tid() calls it when it
  * cannot allocate a work request.  Deferred release is not implemented: the
  * body only invokes CXGBE_UNIMPLEMENTED and does not use sc or tid.
  */
 static void
 queue_tid_release(struct adapter *sc, int tid)
 {
 
 	CXGBE_UNIMPLEMENTED("deferred tid release");
 }
 
 /*
  * Release tid by sending a CPL_TID_RELEASE work request on ctrlq.  If the
  * work request cannot be allocated the release is handed to
  * queue_tid_release() instead.
  */
 void
 release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq)
 {
 	struct cpl_tid_release *req;
 	struct wrqe *wr;
 
 	wr = alloc_wrqe(sizeof(*req), ctrlq);
 	if (wr != NULL) {
 		req = wrtod(wr);
 		INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid);
 		t4_wrq_tx(sc, wr);
 	} else {
 		queue_tid_release(sc, tid);	/* defer */
 	}
 }
 
 /*
  * qsort() comparator that orders t4_range entries by ascending start address.
  *
  * The starts are compared explicitly instead of being subtracted: the
  * difference of two start values that are far apart does not fit in the int
  * return value and could come out with the wrong sign, which would make the
  * sort order (and the range merging that relies on it) incorrect.
  *
  * Returns a negative value, zero, or a positive value if a's start is less
  * than, equal to, or greater than b's start.
  */
 static int
 t4_range_cmp(const void *a, const void *b)
 {
 	const struct t4_range *ra = a;
 	const struct t4_range *rb = b;
 
 	return ((ra->start > rb->start) - (ra->start < rb->start));
 }
 
 /*
  * Verify that the memory range specified by the addr/len pair is valid within
  * the card's address space.
  */
 static int
 validate_mem_range(struct adapter *sc, uint32_t addr, uint32_t len)
 {
 	struct t4_range mem_ranges[4], *r, *next;
 	uint32_t em, addr_len;
 	int i, n, remaining;
 
 	/* Memory can only be accessed in naturally aligned 4 byte units */
 	if (addr & 3 || len & 3 || len == 0)
 		return (EINVAL);
 
 	/* Enabled memories */
 	em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
 
 	r = &mem_ranges[0];
 	n = 0;
 	bzero(r, sizeof(mem_ranges));
 	if (em & F_EDRAM0_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR);
 		r->size = G_EDRAM0_SIZE(addr_len) << 20;
 		if (r->size > 0) {
 			r->start = G_EDRAM0_BASE(addr_len) << 20;
 			if (addr >= r->start &&
 			    addr + len <= r->start + r->size)
 				return (0);
 			r++;
 			n++;
 		}
 	}
 	if (em & F_EDRAM1_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR);
 		r->size = G_EDRAM1_SIZE(addr_len) << 20;
 		if (r->size > 0) {
 			r->start = G_EDRAM1_BASE(addr_len) << 20;
 			if (addr >= r->start &&
 			    addr + len <= r->start + r->size)
 				return (0);
 			r++;
 			n++;
 		}
 	}
 	if (em & F_EXT_MEM_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR);
 		r->size = G_EXT_MEM_SIZE(addr_len) << 20;
 		if (r->size > 0) {
 			r->start = G_EXT_MEM_BASE(addr_len) << 20;
 			if (addr >= r->start &&
 			    addr + len <= r->start + r->size)
 				return (0);
 			r++;
 			n++;
 		}
 	}
 	if (is_t5(sc) && em & F_EXT_MEM1_ENABLE) {
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR);
 		r->size = G_EXT_MEM1_SIZE(addr_len) << 20;
 		if (r->size > 0) {
 			r->start = G_EXT_MEM1_BASE(addr_len) << 20;
 			if (addr >= r->start &&
 			    addr + len <= r->start + r->size)
 				return (0);
 			r++;
 			n++;
 		}
 	}
 	MPASS(n <= nitems(mem_ranges));
 
 	if (n > 1) {
 		/* Sort and merge the ranges. */
 		qsort(mem_ranges, n, sizeof(struct t4_range), t4_range_cmp);
 
 		/* Start from index 0 and examine the next n - 1 entries. */
 		r = &mem_ranges[0];
 		for (remaining = n - 1; remaining > 0; remaining--, r++) {
 
 			MPASS(r->size > 0);	/* r is a valid entry. */
 			next = r + 1;
 			MPASS(next->size > 0);	/* and so is the next one. */
 
 			while (r->start + r->size >= next->start) {
 				/* Merge the next one into the current entry. */
 				r->size = max(r->start + r->size,
 				    next->start + next->size) - r->start;
 				n--;	/* One fewer entry in total. */
 				if (--remaining == 0)
 					goto done;	/* short circuit */
 				next++;
 			}
 			if (next != r + 1) {
 				/*
 				 * Some entries were merged into r and next
 				 * points to the first valid entry that couldn't
 				 * be merged.
 				 */
 				MPASS(next->size > 0);	/* must be valid */
 				memcpy(r + 1, next, remaining * sizeof(*r));
 #ifdef INVARIANTS
 				/*
 				 * This so that the foo->size assertion in the
 				 * next iteration of the loop do the right
 				 * thing for entries that were pulled up and are
 				 * no longer valid.
 				 */
 				MPASS(n < nitems(mem_ranges));
 				bzero(&mem_ranges[n], (nitems(mem_ranges) - n) *
 				    sizeof(struct t4_range));
 #endif
 			}
 		}
 done:
 		/* Done merging the ranges. */
 		MPASS(n > 0);
 		r = &mem_ranges[0];
 		for (i = 0; i < n; i++, r++) {
 			if (addr >= r->start &&
 			    addr + len <= r->start + r->size)
 				return (0);
 		}
 	}
 
 	return (EFAULT);
 }
 
 /*
  * Translate a firmware memory type (FW_MEMTYPE_*) into the driver's MEM_*
  * identifier.  Panics if mtype is not one of the four known firmware types.
  */
 static int
 fwmtype_to_hwmtype(int mtype)
 {
 
 	if (mtype == FW_MEMTYPE_EDC0)
 		return (MEM_EDC0);
 	if (mtype == FW_MEMTYPE_EDC1)
 		return (MEM_EDC1);
 	if (mtype == FW_MEMTYPE_EXTMEM)
 		return (MEM_MC0);
 	if (mtype == FW_MEMTYPE_EXTMEM1)
 		return (MEM_MC1);
 
 	panic("%s: cannot translate fw mtype %d.", __func__, mtype);
 }
 
 /*
  * Verify that the memory range specified by the memtype/offset/len pair is
  * valid and lies entirely within the memtype specified.  The global address of
  * the start of the range is returned in addr.
  *
  * mtype is a firmware memory type (FW_MEMTYPE_*); fwmtype_to_hwmtype() panics
  * on an unknown value.  Returns 0 if the range is valid, EINVAL if off/len
  * are not multiples of 4, len is 0, or the memory is not enabled, and
  * otherwise whatever validate_mem_range() returns.  *addr is written before
  * the final range check, so it is set even when that check fails.
  *
  * NOTE(review): the final check is delegated to validate_mem_range(), which
  * accepts an address range inside any enabled memory; confirm that this is
  * sufficient for the "entirely within the memtype specified" guarantee.
  */
 static int
 validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, uint32_t len,
     uint32_t *addr)
 {
 	uint32_t em, addr_len, maddr;
 
 	/* Memory can only be accessed in naturally aligned 4 byte units */
 	if (off & 3 || len & 3 || len == 0)
 		return (EINVAL);
 
 	/* Find the base address (maddr) of the requested memory. */
 	em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
 	switch (fwmtype_to_hwmtype(mtype)) {
 	case MEM_EDC0:
 		if (!(em & F_EDRAM0_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR);
 		maddr = G_EDRAM0_BASE(addr_len) << 20;
 		break;
 	case MEM_EDC1:
 		if (!(em & F_EDRAM1_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR);
 		maddr = G_EDRAM1_BASE(addr_len) << 20;
 		break;
 	case MEM_MC:
 		if (!(em & F_EXT_MEM_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR);
 		maddr = G_EXT_MEM_BASE(addr_len) << 20;
 		break;
 	case MEM_MC1:
 		/* The second external memory is only considered on T5. */
 		if (!is_t5(sc) || !(em & F_EXT_MEM1_ENABLE))
 			return (EINVAL);
 		addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR);
 		maddr = G_EXT_MEM1_BASE(addr_len) << 20;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	*addr = maddr + off;	/* global address */
 	return (validate_mem_range(sc, *addr, len));
 }
 
 /*
  * Validate the device log's memtype/start/size and store the corresponding
  * global address in sc->params.devlog.addr.  Returns the result of
  * validate_mt_off_len().
  */
 static int
 fixup_devlog_params(struct adapter *sc)
 {
 	struct devlog_params *dp = &sc->params.devlog;
 
 	return (validate_mt_off_len(sc, dp->memtype, dp->start, dp->size,
 	    &dp->addr));
 }
 
 /*
  * Recompute iaq->nirq from the queue counts in iaq: T4_EXTRA_INTR, plus for
  * every port the rx interrupts of its main VI and of each of its
  * (num_vis - 1) extra VIs.  NIC and netmap rx queues contribute the larger of
  * their two counts; offload rx queues are added on top.
  */
 static void
 update_nirq(struct intrs_and_queues *iaq, int nports)
 {
 	int main_vi, extra_vi;
 
 	main_vi = max(iaq->nrxq, iaq->nnmrxq) + iaq->nofldrxq;
 	extra_vi = max(iaq->nrxq_vi, iaq->nnmrxq_vi) + iaq->nofldrxq_vi;
 
 	iaq->nirq = T4_EXTRA_INTR + nports * main_vi +
 	    nports * (iaq->num_vis - 1) * extra_vi;
 }
 
 /*
  * Adjust requirements to fit the number of interrupts available.
  *
  * iaq is zeroed and filled in from the t4_* tunables for interrupt type itype,
  * then scaled back in stages until the vector count computed by update_nirq()
  * fits in navail (and is a power of 2 when itype is INTR_MSI):
  *   1. reduce the number of VIs per port, one at a time, down to 1;
  *   2. zero out the queue counts of the extra VIs;
  *   3. repeatedly lower nrxq to the next smaller power of 2 and halve
  *      nofldrxq;
  *   4. give up and run with a single vector and a minimal set of queues.
  *
  * A message is logged whenever the requested configuration had to be reduced.
  */
 static void
 calculate_iaq(struct adapter *sc, struct intrs_and_queues *iaq, int itype,
     int navail)
 {
 	int old_nirq;
 	const int nports = sc->params.nports;
 
 	MPASS(nports > 0);
 	MPASS(navail > 0);
 
 	/* Start with everything that was asked for via the tunables. */
 	bzero(iaq, sizeof(*iaq));
 	iaq->intr_type = itype;
 	iaq->num_vis = t4_num_vis;
 	iaq->ntxq = t4_ntxq;
 	iaq->ntxq_vi = t4_ntxq_vi;
 	iaq->nrxq = t4_nrxq;
 	iaq->nrxq_vi = t4_nrxq_vi;
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	if (is_offload(sc) || is_ethoffload(sc)) {
 		iaq->nofldtxq = t4_nofldtxq;
 		iaq->nofldtxq_vi = t4_nofldtxq_vi;
 	}
 #endif
 #ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 		iaq->nofldrxq = t4_nofldrxq;
 		iaq->nofldrxq_vi = t4_nofldrxq_vi;
 	}
 #endif
 #ifdef DEV_NETMAP
 	if (t4_native_netmap & NN_MAIN_VI) {
 		iaq->nnmtxq = t4_nnmtxq;
 		iaq->nnmrxq = t4_nnmrxq;
 	}
 	if (t4_native_netmap & NN_EXTRA_VI) {
 		iaq->nnmtxq_vi = t4_nnmtxq_vi;
 		iaq->nnmrxq_vi = t4_nnmrxq_vi;
 	}
 #endif
 
 	update_nirq(iaq, nports);
 	if (iaq->nirq <= navail &&
 	    (itype != INTR_MSI || powerof2(iaq->nirq))) {
 		/*
 		 * This is the normal case -- there are enough interrupts for
 		 * everything.
 		 */
 		goto done;
 	}
 
 	/*
 	 * If extra VIs have been configured try reducing their count and see if
 	 * that works.
 	 */
 	while (iaq->num_vis > 1) {
 		iaq->num_vis--;
 		update_nirq(iaq, nports);
 		if (iaq->nirq <= navail &&
 		    (itype != INTR_MSI || powerof2(iaq->nirq))) {
 			device_printf(sc->dev, "virtual interfaces per port "
 			    "reduced to %d from %d.  nrxq=%u, nofldrxq=%u, "
 			    "nrxq_vi=%u nofldrxq_vi=%u, nnmrxq_vi=%u.  "
 			    "itype %d, navail %u, nirq %d.\n",
 			    iaq->num_vis, t4_num_vis, iaq->nrxq, iaq->nofldrxq,
 			    iaq->nrxq_vi, iaq->nofldrxq_vi, iaq->nnmrxq_vi,
 			    itype, navail, iaq->nirq);
 			goto done;
 		}
 	}
 
 	/*
 	 * Extra VIs will not be created.  Log a message if they were requested.
 	 * The message is printed after the _vi counts have been cleared, so
 	 * the _vi values it reports are 0.
 	 */
 	MPASS(iaq->num_vis == 1);
 	iaq->ntxq_vi = iaq->nrxq_vi = 0;
 	iaq->nofldtxq_vi = iaq->nofldrxq_vi = 0;
 	iaq->nnmtxq_vi = iaq->nnmrxq_vi = 0;
 	if (iaq->num_vis != t4_num_vis) {
 		device_printf(sc->dev, "extra virtual interfaces disabled.  "
 		    "nrxq=%u, nofldrxq=%u, nrxq_vi=%u nofldrxq_vi=%u, "
 		    "nnmrxq_vi=%u.  itype %d, navail %u, nirq %d.\n",
 		    iaq->nrxq, iaq->nofldrxq, iaq->nrxq_vi, iaq->nofldrxq_vi,
 		    iaq->nnmrxq_vi, itype, navail, iaq->nirq);
 	}
 
 	/*
 	 * Keep reducing the number of NIC rx queues to the next lower power of
 	 * 2 (for even RSS distribution) and halving the TOE rx queues and see
 	 * if that works.
 	 */
 	do {
 		if (iaq->nrxq > 1) {
 			do {
 				iaq->nrxq--;
 			} while (!powerof2(iaq->nrxq));
 			/* Never more netmap rx queues than NIC rx queues. */
 			if (iaq->nnmrxq > iaq->nrxq)
 				iaq->nnmrxq = iaq->nrxq;
 		}
 		if (iaq->nofldrxq > 1)
 			iaq->nofldrxq >>= 1;
 
 		old_nirq = iaq->nirq;
 		update_nirq(iaq, nports);
 		if (iaq->nirq <= navail &&
 		    (itype != INTR_MSI || powerof2(iaq->nirq))) {
 			device_printf(sc->dev, "running with reduced number of "
 			    "rx queues because of shortage of interrupts.  "
 			    "nrxq=%u, nofldrxq=%u.  "
 			    "itype %d, navail %u, nirq %d.\n", iaq->nrxq,
 			    iaq->nofldrxq, itype, navail, iaq->nirq);
 			goto done;
 		}
 		/* Stop once a pass no longer changes the vector count. */
 	} while (old_nirq != iaq->nirq);
 
 	/* One interrupt for everything.  Ugh. */
 	device_printf(sc->dev, "running with minimal number of queues.  "
 	    "itype %d, navail %u.\n", itype, navail);
 	iaq->nirq = 1;
 	iaq->nrxq = 1;
 	iaq->ntxq = 1;
 	if (iaq->nofldrxq > 0) {
 		iaq->nofldrxq = 1;
 		iaq->nofldtxq = 1;
 	}
 	iaq->nnmtxq = 0;
 	iaq->nnmrxq = 0;
 done:
 	/* Sanity checks on whatever configuration was settled on. */
 	MPASS(iaq->num_vis > 0);
 	if (iaq->num_vis > 1) {
 		MPASS(iaq->nrxq_vi > 0);
 		MPASS(iaq->ntxq_vi > 0);
 	}
 	MPASS(iaq->nirq > 0);
 	MPASS(iaq->nrxq > 0);
 	MPASS(iaq->ntxq > 0);
 	if (itype == INTR_MSI) {
 		MPASS(powerof2(iaq->nirq));
 	}
 }
 
 /*
  * Choose an interrupt type and allocate its vectors.
  *
  * Interrupt types are tried in the order MSI-X, MSI, INTx, skipping any that
  * t4_intr_types does not allow or for which no vectors are available.  For
  * each candidate the queue/vector requirements are worked out by
  * calculate_iaq() and, for MSI-X and MSI, the vectors are requested from the
  * PCI layer.  If fewer vectors are granted than were asked for they are
  * released and the requirements are recalculated for the smaller count.
  *
  * Returns 0 with *iaq describing the configuration that was set up, or ENXIO
  * if none of the allowed interrupt types could be used.
  */
 static int
 cfg_itype_and_nqueues(struct adapter *sc, struct intrs_and_queues *iaq)
 {
 	int rc, itype, navail, nalloc;
 
 	for (itype = INTR_MSIX; itype; itype >>= 1) {
 
 		if ((itype & t4_intr_types) == 0)
 			continue;	/* not allowed */
 
 		if (itype == INTR_MSIX)
 			navail = pci_msix_count(sc->dev);
 		else if (itype == INTR_MSI)
 			navail = pci_msi_count(sc->dev);
 		else
 			navail = 1;
 restart:
 		if (navail == 0)
 			continue;
 
 		calculate_iaq(sc, iaq, itype, navail);
 		nalloc = iaq->nirq;
 		rc = 0;
 		/* Nothing is allocated here for INTx; rc and nalloc stand. */
 		if (itype == INTR_MSIX)
 			rc = pci_alloc_msix(sc->dev, &nalloc);
 		else if (itype == INTR_MSI)
 			rc = pci_alloc_msi(sc->dev, &nalloc);
 
 		if (rc == 0 && nalloc > 0) {
 			if (nalloc == iaq->nirq)
 				return (0);
 
 			/*
 			 * Didn't get the number requested.  Use whatever number
 			 * the kernel is willing to allocate.
 			 */
 			device_printf(sc->dev, "fewer vectors than requested, "
 			    "type=%d, req=%d, rcvd=%d; will downshift req.\n",
 			    itype, iaq->nirq, nalloc);
 			pci_release_msi(sc->dev);
 			navail = nalloc;
 			goto restart;
 		}
 
 		/* The error code goes first, then the interrupt type. */
 		device_printf(sc->dev,
 		    "failed to allocate vectors:%d, type=%d, req=%d, rcvd=%d\n",
 		    rc, itype, iaq->nirq, nalloc);
 	}
 
 	device_printf(sc->dev,
 	    "failed to find a usable interrupt type.  "
 	    "allowed=%d, msi-x=%d, msi=%d, intx=1\n", t4_intr_types,
 	    pci_msix_count(sc->dev), pci_msi_count(sc->dev));
 
 	return (ENXIO);
 }
 
 /*
  * FW_VERSION(chip) combines the <chip>FW_VERSION_{MAJOR,MINOR,MICRO,BUILD}
  * constants into a single value using the V_FW_HDR_FW_VER_* macros.
  * FW_INTFVER(chip, intf) expands to the <chip>FW_HDR_INTFVER_<intf> constant.
  */
 #define FW_VERSION(chip) ( \
     V_FW_HDR_FW_VER_MAJOR(chip##FW_VERSION_MAJOR) | \
     V_FW_HDR_FW_VER_MINOR(chip##FW_VERSION_MINOR) | \
     V_FW_HDR_FW_VER_MICRO(chip##FW_VERSION_MICRO) | \
     V_FW_HDR_FW_VER_BUILD(chip##FW_VERSION_BUILD))
 #define FW_INTFVER(chip, intf) (chip##FW_HDR_INTFVER_##intf)
 
 /*
  * Just enough of fw_hdr to cover all version info.
  *
  * Code in this file casts a struct fw_hdr pointer to a struct fw_h pointer and
  * copies the start of a firmware image into a struct fw_h, so the fields here
  * must stay at the same offsets as their namesakes in struct fw_hdr.
  */
 struct fw_h {
 	__u8	ver;
 	__u8	chip;
 	__be16	len512;
 	__be32	fw_ver;
 	__be32	tp_microcode_ver;
 	__u8	intfver_nic;
 	__u8	intfver_vnic;
 	__u8	intfver_ofld;
 	__u8	intfver_ri;
 	__u8	intfver_iscsipdu;
 	__u8	intfver_iscsi;
 	__u8	intfver_fcoepdu;
 	__u8	intfver_fcoe;
 };
 /* Spot check a couple of fields. */
 CTASSERT(offsetof(struct fw_h, fw_ver) == offsetof(struct fw_hdr, fw_ver));
 CTASSERT(offsetof(struct fw_h, intfver_nic) == offsetof(struct fw_hdr, intfver_nic));
 CTASSERT(offsetof(struct fw_h, intfver_fcoe) == offsetof(struct fw_hdr, intfver_fcoe));
 
 /*
  * Per-chip firmware information, looked up by chip id via find_fw_info().
  *
  * kld_name is the name passed to firmware_get() for the default configuration
  * (and the prefix of the names of the other configuration profile modules),
  * fw_mod_name is the name passed to firmware_get() for the firmware image, and
  * fw_h holds the version and interface versions of the firmware that the
  * driver was compiled with (fw_ver is stored big-endian).
  */
 struct fw_info {
 	uint8_t chip;
 	char *kld_name;
 	char *fw_mod_name;
 	struct fw_h fw_h;
 } fw_info[] = {
 	{
 		.chip = CHELSIO_T4,
 		.kld_name = "t4fw_cfg",
 		.fw_mod_name = "t4fw",
 		.fw_h = {
 			.chip = FW_HDR_CHIP_T4,
 			.fw_ver = htobe32(FW_VERSION(T4)),
 			.intfver_nic = FW_INTFVER(T4, NIC),
 			.intfver_vnic = FW_INTFVER(T4, VNIC),
 			.intfver_ofld = FW_INTFVER(T4, OFLD),
 			.intfver_ri = FW_INTFVER(T4, RI),
 			.intfver_iscsipdu = FW_INTFVER(T4, ISCSIPDU),
 			.intfver_iscsi = FW_INTFVER(T4, ISCSI),
 			.intfver_fcoepdu = FW_INTFVER(T4, FCOEPDU),
 			.intfver_fcoe = FW_INTFVER(T4, FCOE),
 		},
 	}, {
 		.chip = CHELSIO_T5,
 		.kld_name = "t5fw_cfg",
 		.fw_mod_name = "t5fw",
 		.fw_h = {
 			.chip = FW_HDR_CHIP_T5,
 			.fw_ver = htobe32(FW_VERSION(T5)),
 			.intfver_nic = FW_INTFVER(T5, NIC),
 			.intfver_vnic = FW_INTFVER(T5, VNIC),
 			.intfver_ofld = FW_INTFVER(T5, OFLD),
 			.intfver_ri = FW_INTFVER(T5, RI),
 			.intfver_iscsipdu = FW_INTFVER(T5, ISCSIPDU),
 			.intfver_iscsi = FW_INTFVER(T5, ISCSI),
 			.intfver_fcoepdu = FW_INTFVER(T5, FCOEPDU),
 			.intfver_fcoe = FW_INTFVER(T5, FCOE),
 		},
 	}, {
 		.chip = CHELSIO_T6,
 		.kld_name = "t6fw_cfg",
 		.fw_mod_name = "t6fw",
 		.fw_h = {
 			.chip = FW_HDR_CHIP_T6,
 			.fw_ver = htobe32(FW_VERSION(T6)),
 			.intfver_nic = FW_INTFVER(T6, NIC),
 			.intfver_vnic = FW_INTFVER(T6, VNIC),
 			.intfver_ofld = FW_INTFVER(T6, OFLD),
 			.intfver_ri = FW_INTFVER(T6, RI),
 			.intfver_iscsipdu = FW_INTFVER(T6, ISCSIPDU),
 			.intfver_iscsi = FW_INTFVER(T6, ISCSI),
 			.intfver_fcoepdu = FW_INTFVER(T6, FCOEPDU),
 			.intfver_fcoe = FW_INTFVER(T6, FCOE),
 		},
 	}
 };
 
 static struct fw_info *
 find_fw_info(int chip)
 {
 	int i;
 
 	for (i = 0; i < nitems(fw_info); i++) {
 		if (fw_info[i].chip == chip)
 			return (&fw_info[i]);
 	}
 	return (NULL);
 }
 
 /*
  * Are the two firmwares described by hdr1 and hdr2 API compatible?
  *
  * Returns 1 if both are for the same chip and either have the exact same
  * firmware version or agree on every interface version, 0 otherwise.
  */
 static int
 fw_compatible(const struct fw_h *hdr1, const struct fw_h *hdr2)
 {
 
 	/* Nothing matches across different chips. */
 	if (hdr1->chip != hdr2->chip)
 		return (0);
 
 	/* Exact same firmware version, no need to look any further. */
 	if (hdr1->fw_ver == hdr2->fw_ver)
 		return (1);
 
 	/*
 	 * XXX: Is this too conservative?  Perhaps I should limit this to the
 	 * features that are supported in the driver.
 	 */
 #define DIFF_INTF(x) (hdr1->intfver_##x != hdr2->intfver_##x)
 	if (DIFF_INTF(nic) || DIFF_INTF(vnic) || DIFF_INTF(ofld) ||
 	    DIFF_INTF(ri) || DIFF_INTF(iscsipdu) || DIFF_INTF(iscsi) ||
 	    DIFF_INTF(fcoepdu) || DIFF_INTF(fcoe))
 		return (0);
 #undef DIFF_INTF
 
 	return (1);
 }
 
 /*
  * Get references on the firmware(9) images for this chip: the default
  * configuration (always) and the firmware itself (only if fw is not NULL).
  *
  * *dcfg, and *fw when requested, are NULL on return unless the corresponding
  * image was obtained.  Returns EINVAL if the chip has no fw_info entry, ENOENT
  * if the default configuration could not be obtained, and 0 otherwise.  Note
  * that 0 is returned even if the firmware image itself could not be obtained,
  * so callers that asked for it must check *fw as well.
  */
 static int
 load_fw_module(struct adapter *sc, const struct firmware **dcfg,
     const struct firmware **fw)
 {
 	struct fw_info *info;
 
 	/* Outputs start out NULL so that every early return is consistent. */
 	*dcfg = NULL;
 	if (fw != NULL)
 		*fw = NULL;
 
 	info = find_fw_info(chip_id(sc));
 	if (info == NULL) {
 		device_printf(sc->dev,
 		    "unable to look up firmware information for chip %d.\n",
 		    chip_id(sc));
 		return (EINVAL);
 	}
 
 	*dcfg = firmware_get(info->kld_name);
 	if (*dcfg == NULL)
 		return (ENOENT);
 
 	if (fw != NULL)
 		*fw = firmware_get(info->fw_mod_name);
 
 	return (0);
 }
 
 /*
  * Drop the references obtained by load_fw_module().  Either pointer may be
  * NULL.  The firmware image is released before the default configuration.
  */
 static void
 unload_fw_module(struct adapter *sc, const struct firmware *dcfg,
     const struct firmware *fw)
 {
 	const struct firmware *refs[] = { fw, dcfg };	/* release order */
 	int i;
 
 	for (i = 0; i < nitems(refs); i++) {
 		if (refs[i] != NULL)
 			firmware_put(refs[i], FIRMWARE_UNLOAD);
 	}
 }
 
 /*
  * Decide whether the firmware on the card needs to be replaced with the one
  * in the firmware KLD and install it if so.
  *
  * card_fw is the header of the firmware on the card; it is overwritten with
  * the header of the new firmware after a successful install.  drv_fw is the
  * firmware header that the driver was compiled with.  A non-NULL reason skips
  * the checks and goes straight to the install; it is also the text used in
  * the log messages.  *already is incremented every time an install is called
  * for but only the first such call goes any further.
  *
  * t4_fw_install (hw.cxgbe.fw_install in the log message) sets the policy.  A
  * magnitude of 0 prohibits installs and a magnitude of 2 also installs when
  * the versions are merely different.  A negative value makes the checks use
  * the header of the firmware in the KLD, if it can be loaded, instead of the
  * compiled-in one.
  *
  * Return values:
  * 0 means no firmware install attempted.
  * ERESTART means a firmware install was attempted and was successful.
  * +ve errno means a firmware install was attempted but failed.
  */
 static int
 install_kld_firmware(struct adapter *sc, struct fw_h *card_fw,
     const struct fw_h *drv_fw, const char *reason, int *already)
 {
 	const struct firmware *cfg, *fw;
 	const uint32_t c = be32toh(card_fw->fw_ver);	/* version on card */
 	uint32_t d, k;	/* version expected by driver, version in the KLD */
 	int rc, fw_install;
 	struct fw_h bundled_fw;
 	bool load_attempted;
 
 	cfg = fw = NULL;
 	load_attempted = false;
 	fw_install = t4_fw_install < 0 ? -t4_fw_install : t4_fw_install;
 
 	memcpy(&bundled_fw, drv_fw, sizeof(bundled_fw));
 	if (t4_fw_install < 0) {
 		rc = load_fw_module(sc, &cfg, &fw);
 		if (rc != 0 || fw == NULL) {
 			device_printf(sc->dev,
 			    "failed to load firmware module: %d. cfg %p, fw %p;"
 			    " will use compiled-in firmware version for "
 			    "hw.cxgbe.fw_install checks.\n",
 			    rc, cfg, fw);
 		} else {
 			memcpy(&bundled_fw, fw->data, sizeof(bundled_fw));
 		}
 		load_attempted = true;
 	}
 	d = be32toh(bundled_fw.fw_ver);
 
 	/* The caller has already decided that an install is needed. */
 	if (reason != NULL)
 		goto install;
 
 	if ((sc->flags & FW_OK) == 0) {
 
 		/*
 		 * The firmware has not been contacted yet.  Only an all-ones
 		 * version on the card calls for an install at this point.
 		 */
 		if (c == 0xffffffff) {
 			reason = "missing";
 			goto install;
 		}
 
 		rc = 0;
 		goto done;
 	}
 
 	if (!fw_compatible(card_fw, &bundled_fw)) {
 		reason = "incompatible or unusable";
 		goto install;
 	}
 
 	if (d > c) {
 		reason = "older than the version bundled with this driver";
 		goto install;
 	}
 
 	if (fw_install == 2 && d != c) {
 		reason = "different than the version bundled with this driver";
 		goto install;
 	}
 
 	/* No reason to do anything to the firmware already on the card. */
 	rc = 0;
 	goto done;
 
 install:
 	rc = 0;
 	/* Only the first request for an install is acted on. */
 	if ((*already)++)
 		goto done;
 
 	if (fw_install == 0) {
 		device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, "
 		    "but the driver is prohibited from installing a firmware "
 		    "on the card.\n",
 		    G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c),
 		    G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason);
 
 		goto done;
 	}
 
 	/*
 	 * We'll attempt to install a firmware.  Load the module first (if it
 	 * hasn't been loaded already).
 	 */
 	if (!load_attempted) {
 		rc = load_fw_module(sc, &cfg, &fw);
 		if (rc != 0 || fw == NULL) {
 			device_printf(sc->dev,
 			    "failed to load firmware module: %d. cfg %p, fw %p\n",
 			    rc, cfg, fw);
 			/* carry on */
 		}
 	}
 	if (fw == NULL) {
 		device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, "
 		    "but the driver cannot take corrective action because it "
 		    "is unable to load the firmware module.\n",
 		    G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c),
 		    G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason);
 		/* This is an error only if the card's firmware isn't usable. */
 		rc = sc->flags & FW_OK ? 0 : ENOENT;
 		goto done;
 	}
 	k = be32toh(((const struct fw_hdr *)fw->data)->fw_ver);
 	if (k != d) {
 		/* d was taken from the KLD itself if t4_fw_install is < 0. */
 		MPASS(t4_fw_install > 0);
 		device_printf(sc->dev,
 		    "firmware in KLD (%u.%u.%u.%u) is not what the driver was "
 		    "expecting (%u.%u.%u.%u) and will not be used.\n",
 		    G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k),
 		    G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k),
 		    G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d),
 		    G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d));
 		rc = sc->flags & FW_OK ? 0 : EINVAL;
 		goto done;
 	}
 
 	device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, "
 	    "installing firmware %u.%u.%u.%u on card.\n",
 	    G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c),
 	    G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason,
 	    G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d),
 	    G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d));
 
 	rc = -t4_fw_upgrade(sc, sc->mbox, fw->data, fw->datasize, 0);
 	if (rc != 0) {
 		device_printf(sc->dev, "failed to install firmware: %d\n", rc);
 	} else {
 		/* Installed successfully, update the cached header too. */
 		rc = ERESTART;
 		memcpy(card_fw, fw->data, sizeof(*card_fw));
 	}
 done:
 	/* Both references may be NULL here; unload_fw_module copes. */
 	unload_fw_module(sc, cfg, fw);
 
 	return (rc);
 }
 
 /*
  * Establish contact with the firmware and attempt to become the master driver.
  *
  * A firmware will be installed to the card if needed (if the driver is allowed
  * to do so).
  *
  * On success FW_OK is set in sc->flags, and MASTER_PF too if this PF is the
  * master.  If some other PF is the master and has already initialized the
  * device, sc->cfg_file is set to "pf<n>" (n being the master PF) and
  * sc->cfcsum to 0.  Returns 0 on success.  On failure a non-zero value is
  * returned and FW_OK is cleared again (after a FW_BYE) if it had been set.
  */
 static int
 contact_firmware(struct adapter *sc)
 {
 	int rc, already = 0;
 	enum dev_state state;
 	struct fw_info *fw_info;
 	struct fw_hdr *card_fw;		/* fw on the card */
 	const struct fw_h *drv_fw;
 
 	fw_info = find_fw_info(chip_id(sc));
 	if (fw_info == NULL) {
 		device_printf(sc->dev,
 		    "unable to look up firmware information for chip %d.\n",
 		    chip_id(sc));
 		return (EINVAL);
 	}
 	drv_fw = &fw_info->fw_h;
 
 	/* Read the header of the firmware on the card */
 	card_fw = malloc(sizeof(*card_fw), M_CXGBE, M_ZERO | M_WAITOK);
 restart:
 	rc = -t4_get_fw_hdr(sc, card_fw);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "unable to read firmware header from card's flash: %d\n",
 		    rc);
 		goto done;
 	}
 
 	/*
 	 * First pass, before the HELLO.  If a firmware was installed, start
 	 * over and read the header from the card again.
 	 */
 	rc = install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw, NULL,
 	    &already);
 	if (rc == ERESTART)
 		goto restart;
 	if (rc != 0)
 		goto done;
 
 	/*
 	 * NOTE(review): if t4_fw_hello returns a non-negative value with state
 	 * set to DEV_STATE_ERR then rc ends up <= 0 after the negation below
 	 * and is returned as is -- confirm that callers cope with that.
 	 */
 	rc = t4_fw_hello(sc, sc->mbox, sc->mbox, MASTER_MAY, &state);
 	if (rc < 0 || state == DEV_STATE_ERR) {
 		rc = -rc;
 		device_printf(sc->dev,
 		    "failed to connect to the firmware: %d, %d.  "
 		    "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW));
 #if 0
 		if (install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw,
 		    "not responding properly to HELLO", &already) == ERESTART)
 			goto restart;
 #endif
 		goto done;
 	}
 	MPASS(be32toh(card_fw->flags) & FW_HDR_FLAGS_RESET_HALT);
 	sc->flags |= FW_OK;	/* The firmware responded to the FW_HELLO. */
 
 	/* rc is compared against this PF to see if it is the master. */
 	if (rc == sc->pf) {
 		sc->flags |= MASTER_PF;
 		/*
 		 * Second pass, with FW_OK set this time.  A successful install
 		 * here does not restart the whole sequence.
 		 */
 		rc = install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw,
 		    NULL, &already);
 		if (rc == ERESTART)
 			rc = 0;
 		else if (rc != 0)
 			goto done;
 	} else if (state == DEV_STATE_UNINIT) {
 		/*
 		 * We didn't get to be the master so we definitely won't be
 		 * configuring the chip.  It's a bug if someone else hasn't
 		 * configured it already.
 		 */
 		device_printf(sc->dev, "couldn't be master(%d), "
 		    "device not already initialized either(%d).  "
 		    "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW));
 		rc = EPROTO;
 		goto done;
 	} else {
 		/*
 		 * Some other PF is the master and has configured the chip.
 		 * This is allowed but untested.
 		 */
 		device_printf(sc->dev, "PF%d is master, device state %d.  "
 		    "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW));
 		snprintf(sc->cfg_file, sizeof(sc->cfg_file), "pf%d", rc);
 		sc->cfcsum = 0;
 		rc = 0;
 	}
 done:
 	/* Undo the HELLO if something failed after the firmware responded. */
 	if (rc != 0 && sc->flags & FW_OK) {
 		t4_fw_bye(sc, sc->mbox);
 		sc->flags &= ~FW_OK;
 	}
 	free(card_fw, M_CXGBE);
 	return (rc);
 }
 
 /*
  * Write the configuration profile named by cfg_file to card memory at the
  * given firmware memtype and offset.
  *
  * If cfg_file is DEFAULT_CF it is first rewritten in place to UWIRE_CF for PCI
  * device id 0x440a and to FPGA_CF for FPGAs.  A profile that is still
  * DEFAULT_CF after that is taken from the default configuration KLD, anything
  * else from the module named "<kld_name>_<cfg_file>".  The length is rounded
  * down to a multiple of 4, must not exceed FLASH_CFG_MAX_SIZE, and the
  * destination must pass validate_mt_off_len() before the data is written via
  * memory window 2.
  *
  * Returns 0 on success, ENOENT if the module with the profile isn't
  * available, EINVAL for everything else that goes wrong.
  */
 static int
 copy_cfg_file_to_card(struct adapter *sc, char *cfg_file,
     uint32_t mtype, uint32_t moff)
 {
 	const struct firmware *default_kld, *profile_kld;
 	const uint32_t *payload;
 	uint32_t nbytes, card_addr;
 	int rc;
 
 	profile_kld = NULL;
 	/* Failure is detected later, and only if the default is needed. */
 	load_fw_module(sc, &default_kld, NULL);
 
 	/* Card specific interpretation of "default". */
 	if (strncmp(cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) {
 		if (pci_get_device(sc->dev) == 0x440a)
 			snprintf(cfg_file, sizeof(t4_cfg_file), UWIRE_CF);
 		if (is_fpga(sc))
 			snprintf(cfg_file, sizeof(t4_cfg_file), FPGA_CF);
 	}
 
 	if (strncmp(cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) != 0) {
 		struct fw_info *info;
 		char modname[32];
 
 		/* A named profile lives in a module of its own. */
 		info = find_fw_info(chip_id(sc));
 		if (info == NULL) {
 			device_printf(sc->dev,
 			    "unable to look up firmware information for chip %d.\n",
 			    chip_id(sc));
 			rc = EINVAL;
 			goto done;
 		}
 		snprintf(modname, sizeof(modname), "%s_%s", info->kld_name,
 		    cfg_file);
 
 		profile_kld = firmware_get(modname);
 		if (profile_kld == NULL) {
 			device_printf(sc->dev,
 			    "unable to load module \"%s\" for configuration "
 			    "profile \"%s\".\n", modname, cfg_file);
 			rc = ENOENT;
 			goto done;
 		}
 		payload = profile_kld->data;
 		nbytes = profile_kld->datasize & ~3;
 	} else {
 		/* The default profile comes with the firmware module. */
 		if (default_kld == NULL) {
 			device_printf(sc->dev,
 			    "KLD with default config is not available.\n");
 			rc = ENOENT;
 			goto done;
 		}
 		payload = default_kld->data;
 		nbytes = default_kld->datasize & ~3;
 	}
 
 	if (nbytes > FLASH_CFG_MAX_SIZE) {
 		device_printf(sc->dev,
 		    "config file too long (%d, max allowed is %d).\n",
 		    nbytes, FLASH_CFG_MAX_SIZE);
 		rc = EINVAL;
 		goto done;
 	}
 
 	rc = validate_mt_off_len(sc, mtype, moff, nbytes, &card_addr);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "%s: addr (%d/0x%x) or len %d is not valid: %d.\n",
 		    __func__, mtype, moff, nbytes, rc);
 		rc = EINVAL;
 		goto done;
 	}
 	write_via_memwin(sc, 2, card_addr, payload, nbytes);
 done:
 	if (profile_kld != NULL)
 		firmware_put(profile_kld, FIRMWARE_UNLOAD);
 	unload_fw_module(sc, default_kld, NULL);
 
 	return (rc);
 }
 
 /*
  * Capabilities that the driver is willing to use, one mask per capability
  * field of struct fw_caps_config_cmd.  The values are kept in host byte order
  * and converted with htobe16() when they are applied to the command.
  */
 struct caps_allowed {
 	uint16_t nbmcaps;
 	uint16_t linkcaps;
 	uint16_t switchcaps;
 	uint16_t niccaps;
 	uint16_t toecaps;
 	uint16_t rdmacaps;
 	uint16_t cryptocaps;
 	uint16_t iscsicaps;
 	uint16_t fcoecaps;
 };
 
 /*
  * Build the identifier of a firmware parameter for use with t4_query_params()
  * and t4_set_params(): FW_PARAM_DEV(x) for the device-wide parameter
  * FW_PARAMS_PARAM_DEV_x, FW_PARAM_PFVF(x) for the per-PF/VF parameter
  * FW_PARAMS_PARAM_PFVF_x.
  */
 #define FW_PARAM_DEV(param) \
 	(V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | \
 	 V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_##param))
 #define FW_PARAM_PFVF(param) \
 	(V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_PFVF) | \
 	 V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_PFVF_##param))
 
 /*
  * Provide a configuration profile to the firmware and have it initialize the
  * chip accordingly.  This may involve uploading a configuration file to the
  * card.
  *
  * cfg_file selects the profile: BUILTIN_CF sends the command without a config
  * file location, FLASH_CF points the firmware at the config file in flash, and
  * anything else is uploaded to wherever the firmware asks for it (see
  * copy_cfg_file_to_card).  The capabilities reported by the firmware are
  * limited to those in caps_allowed before being written back.
  *
  * On success the profile name and its checksum are recorded in sc->cfg_file
  * and sc->cfcsum and 0 is returned.  Otherwise the return value is the negated
  * return value of the t4_* call that failed, or the error returned by
  * copy_cfg_file_to_card().
  */
 static int
 apply_cfg_and_initialize(struct adapter *sc, char *cfg_file,
     const struct caps_allowed *caps_allowed)
 {
 	int rc;
 	struct fw_caps_config_cmd caps;
 	uint32_t mtype, moff, finicsum, cfcsum, param, val;
 
 	rc = -t4_fw_reset(sc, sc->mbox, F_PIORSTMODE | F_PIORST);
 	if (rc != 0) {
 		device_printf(sc->dev, "firmware reset failed: %d.\n", rc);
 		return (rc);
 	}
 
 	/* First a READ of the capabilities, with the config file's location. */
 	bzero(&caps, sizeof(caps));
 	caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_READ);
 	if (strncmp(cfg_file, BUILTIN_CF, sizeof(t4_cfg_file)) == 0) {
 		/* No config file: CFVALID is not set in the command. */
 		mtype = 0;
 		moff = 0;
 		caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps));
 	} else if (strncmp(cfg_file, FLASH_CF, sizeof(t4_cfg_file)) == 0) {
 		mtype = FW_MEMTYPE_FLASH;
 		moff = t4_flash_cfg_addr(sc);
 		caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID |
 		    V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) |
 		    V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) |
 		    FW_LEN16(caps));
 	} else {
 		/*
 		 * Ask the firmware where it wants us to upload the config file.
 		 */
 		param = FW_PARAM_DEV(CF);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		if (rc != 0) {
 			/* No support for config file?  Shouldn't happen. */
 			device_printf(sc->dev,
 			    "failed to query config file location: %d.\n", rc);
 			goto done;
 		}
 		mtype = G_FW_PARAMS_PARAM_Y(val);
 		moff = G_FW_PARAMS_PARAM_Z(val) << 16;
 		caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID |
 		    V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) |
 		    V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) |
 		    FW_LEN16(caps));
 
 		rc = copy_cfg_file_to_card(sc, cfg_file, mtype, moff);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to upload config file to card: %d.\n", rc);
 			goto done;
 		}
 	}
 	/* The reply is written over the request in caps. */
 	rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps);
 	if (rc != 0) {
 		device_printf(sc->dev, "failed to pre-process config file: %d "
 		    "(mtype %d, moff 0x%x).\n", rc, mtype, moff);
 		goto done;
 	}
 
 	/* A checksum mismatch is logged but is not treated as an error. */
 	finicsum = be32toh(caps.finicsum);
 	cfcsum = be32toh(caps.cfcsum);	/* actual */
 	if (finicsum != cfcsum) {
 		device_printf(sc->dev,
 		    "WARNING: config file checksum mismatch: %08x %08x\n",
 		    finicsum, cfcsum);
 	}
 	sc->cfcsum = cfcsum;
 	snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", cfg_file);
 
 	/*
 	 * Let the firmware know what features will (not) be used so it can tune
 	 * things accordingly.
 	 */
 #define LIMIT_CAPS(x) do { \
 	caps.x##caps &= htobe16(caps_allowed->x##caps); \
 } while (0)
 	LIMIT_CAPS(nbm);
 	LIMIT_CAPS(link);
 	LIMIT_CAPS(switch);
 	LIMIT_CAPS(nic);
 	LIMIT_CAPS(toe);
 	LIMIT_CAPS(rdma);
 	LIMIT_CAPS(crypto);
 	LIMIT_CAPS(iscsi);
 	LIMIT_CAPS(fcoe);
 #undef LIMIT_CAPS
 	if (caps.niccaps & htobe16(FW_CAPS_CONFIG_NIC_HASHFILTER)) {
 		/*
 		 * TOE and hashfilters are mutually exclusive.  It is a config
 		 * file or firmware bug if both are reported as available.  Try
 		 * to cope with the situation in non-debug builds by disabling
 		 * TOE.
 		 */
 		MPASS(caps.toecaps == 0);
 
 		caps.toecaps = 0;
 		caps.rdmacaps = 0;
 		caps.iscsicaps = 0;
 	}
 
 	/* Now WRITE the (possibly reduced) capabilities back. */
 	caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_WRITE);
 	caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps));
 	rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), NULL);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to process config file: %d.\n", rc);
 		goto done;
 	}
 
 	t4_tweak_chip_settings(sc);
 	/* The return value of set_params__pre_init is not checked here. */
 	set_params__pre_init(sc);
 
 	/* get basic stuff going */
 	rc = -t4_fw_initialize(sc, sc->mbox);
 	if (rc != 0) {
 		device_printf(sc->dev, "fw_initialize failed: %d.\n", rc);
 		goto done;
 	}
 done:
 	return (rc);
 }
 
 /*
  * Partition chip resources for use between various PFs, VFs, etc.
  *
  * The profile named by t4_cfg_file is applied with the capabilities allowed
  * by the t4_*caps_allowed tunables.  If that fails, and the retry hasn't been
  * disabled via DF_DISABLE_CFG_RETRY in sc->debug_flags, one more attempt is
  * made with the BUILTIN_CF profile and only the switch capabilities and
  * FW_CAPS_CONFIG_NIC allowed.
  *
  * Returns the result of the last apply_cfg_and_initialize() call.
  */
 static int
 partition_resources(struct adapter *sc)
 {
 	struct caps_allowed caps_allowed;
 	char cfg_file[sizeof(t4_cfg_file)];
 	bool may_fall_back;
 	int rc;
 
 	/* Only the master driver gets to configure the chip resources. */
 	MPASS(sc->flags & MASTER_PF);
 
 #define COPY_CAPS(x) do { \
 	caps_allowed.x##caps = t4_##x##caps_allowed; \
 } while (0)
 	bzero(&caps_allowed, sizeof(caps_allowed));
 	COPY_CAPS(nbm);
 	COPY_CAPS(link);
 	COPY_CAPS(switch);
 	COPY_CAPS(nic);
 	COPY_CAPS(toe);
 	COPY_CAPS(rdma);
 	COPY_CAPS(crypto);
 	COPY_CAPS(iscsi);
 	COPY_CAPS(fcoe);
 	may_fall_back = (sc->debug_flags & DF_DISABLE_CFG_RETRY) == 0;
 	snprintf(cfg_file, sizeof(cfg_file), "%s", t4_cfg_file);
 
 	for (;;) {
 		rc = apply_cfg_and_initialize(sc, cfg_file, &caps_allowed);
 		if (rc == 0 || !may_fall_back)
 			break;
 
 		device_printf(sc->dev,
 		    "failed (%d) to configure card with \"%s\" profile, "
 		    "will fall back to a basic configuration and retry.\n",
 		    rc, cfg_file);
 
 		/* Second and last attempt: built-in config, bare minimum caps. */
 		snprintf(cfg_file, sizeof(cfg_file), "%s", BUILTIN_CF);
 		bzero(&caps_allowed, sizeof(caps_allowed));
 		COPY_CAPS(switch);
 		caps_allowed.niccaps = FW_CAPS_CONFIG_NIC;
 		may_fall_back = false;
 	}
 #undef COPY_CAPS
 
 	return (rc);
 }
 
 /*
  * Retrieve parameters that are needed (or nice to have) very early.
  *
  * Fills in the printable fw/bs/tp/er version strings in sc, the port vector,
  * the number of ports, and the core clock, and then the device log
  * parameters.  Returns the (negated) t4_query_params() error if the PORTVEC
  * and CCLK query fails and 0 otherwise; problems with the device log are
  * logged but not reported to the caller.
  */
 static int
 get_params__pre_init(struct adapter *sc)
 {
 	uint32_t param[2], val[2];
 	int rc;
 
 	t4_get_version_info(sc);
 
 	/* All four versions are formatted the same way. */
 #define FMT_VERSION(str, vers) do { \
 	snprintf(sc->str, sizeof(sc->str), "%u.%u.%u.%u", \
 	    G_FW_HDR_FW_VER_MAJOR(sc->params.vers), \
 	    G_FW_HDR_FW_VER_MINOR(sc->params.vers), \
 	    G_FW_HDR_FW_VER_MICRO(sc->params.vers), \
 	    G_FW_HDR_FW_VER_BUILD(sc->params.vers)); \
 } while (0)
 	FMT_VERSION(fw_version, fw_vers);
 	FMT_VERSION(bs_version, bs_vers);
 	FMT_VERSION(tp_version, tp_vers);
 	FMT_VERSION(er_version, er_vers);
 #undef FMT_VERSION
 
 	param[0] = FW_PARAM_DEV(PORTVEC);
 	param[1] = FW_PARAM_DEV(CCLK);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to query parameters (pre_init): %d.\n", rc);
 		return (rc);
 	}
 
 	sc->params.portvec = val[0];
 	sc->params.nports = bitcount32(val[0]);
 	sc->params.vpd.cclk = val[1];
 
 	/* Read device log parameters. */
 	rc = -t4_init_devlog_params(sc, 1);
 	if (rc != 0) {
 		/* devlog isn't critical for device operation */
 		device_printf(sc->dev,
 		    "failed to get devlog parameters: %d.\n", rc);
 		return (0);
 	}
 	fixup_devlog_params(sc);
 
 	return (0);
 }
 
 /*
  * Any params that need to be set before FW_INITIALIZE.
  *
  * On T6 and later an attempt is made to enable the high priority filter
  * region; a failure there is only logged.  Opaque VIIDs are then requested on
  * all chips and sc->params.viid_smt_extn_support records whether the firmware
  * went along with it.  The return value is that of this last request.
  */
 static int
 set_params__pre_init(struct adapter *sc)
 {
 	uint32_t param, val;
 	int rc;
 
 	rc = 0;
 	if (chip_id(sc) >= CHELSIO_T6) {
 		param = FW_PARAM_DEV(HPFILTER_REGION_SUPPORT);
 		val = 1;
 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		if (rc == FW_EINVAL &&
 		    sc->params.fw_vers < FW_VERSION32(1, 20, 1, 0)) {
 			/* firmwares < 1.20.1.0 do not have this param. */
 			rc = 0;
 		} else if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to enable high priority filters :%d.\n",
 			    rc);
 		}
 	}
 
 	/* Enable opaque VIIDs with firmwares that support it. */
 	param = FW_PARAM_DEV(OPAQUE_VIID_SMT_EXTN);
 	val = 1;
 	rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 	sc->params.viid_smt_extn_support = (rc == 0 && val == 1);
 
 	return (rc);
 }
 
 /*
  * Retrieve various parameters that are of interest to the driver.  The device
  * has been initialized by the firmware at this point.
  */
 static int
 get_params__post_init(struct adapter *sc)
 {
 	int rc;
 	uint32_t param[7], val[7];
 	struct fw_caps_config_cmd caps;
 
 	param[0] = FW_PARAM_PFVF(IQFLINT_START);
 	param[1] = FW_PARAM_PFVF(EQ_START);
 	param[2] = FW_PARAM_PFVF(FILTER_START);
 	param[3] = FW_PARAM_PFVF(FILTER_END);
 	param[4] = FW_PARAM_PFVF(L2T_START);
 	param[5] = FW_PARAM_PFVF(L2T_END);
 	param[6] = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) |
 	    V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_VDD);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 7, param, val);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to query parameters (post_init): %d.\n", rc);
 		return (rc);
 	}
 
 	sc->sge.iq_start = val[0];
 	sc->sge.eq_start = val[1];
 	if ((int)val[3] > (int)val[2]) {
 		sc->tids.ftid_base = val[2];
 		sc->tids.ftid_end = val[3];
 		sc->tids.nftids = val[3] - val[2] + 1;
 	}
 	sc->vres.l2t.start = val[4];
 	sc->vres.l2t.size = val[5] - val[4] + 1;
 	KASSERT(sc->vres.l2t.size <= L2T_SIZE,
 	    ("%s: L2 table size (%u) larger than expected (%u)",
 	    __func__, sc->vres.l2t.size, L2T_SIZE));
 	sc->params.core_vdd = val[6];
 
 	param[0] = FW_PARAM_PFVF(IQFLINT_END);
 	param[1] = FW_PARAM_PFVF(EQ_END);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to query parameters (post_init2): %d.\n", rc);
 		return (rc);
 	}
 	MPASS((int)val[0] >= sc->sge.iq_start);
 	sc->sge.iqmap_sz = val[0] - sc->sge.iq_start + 1;
 	MPASS((int)val[1] >= sc->sge.eq_start);
 	sc->sge.eqmap_sz = val[1] - sc->sge.eq_start + 1;
 
 	if (chip_id(sc) >= CHELSIO_T6) {
 
 		sc->tids.tid_base = t4_read_reg(sc,
 		    A_LE_DB_ACTIVE_TABLE_START_INDEX);
 
 		param[0] = FW_PARAM_PFVF(HPFILTER_START);
 		param[1] = FW_PARAM_PFVF(HPFILTER_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			   "failed to query hpfilter parameters: %d.\n", rc);
 			return (rc);
 		}
 		if ((int)val[1] > (int)val[0]) {
 			sc->tids.hpftid_base = val[0];
 			sc->tids.hpftid_end = val[1];
 			sc->tids.nhpftids = val[1] - val[0] + 1;
 
 			/*
 			 * These should go off if the layout changes and the
 			 * driver needs to catch up.
 			 */
 			MPASS(sc->tids.hpftid_base == 0);
 			MPASS(sc->tids.tid_base == sc->tids.nhpftids);
 		}
 
 		param[0] = FW_PARAM_PFVF(RAWF_START);
 		param[1] = FW_PARAM_PFVF(RAWF_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			   "failed to query rawf parameters: %d.\n", rc);
 			return (rc);
 		}
 		if ((int)val[1] > (int)val[0]) {
 			sc->rawf_base = val[0];
 			sc->nrawf = val[1] - val[0] + 1;
 		}
 	}
 
 	/*
 	 * MPSBGMAP is queried separately because only recent firmwares support
 	 * it as a parameter and we don't want the compound query above to fail
 	 * on older firmwares.
 	 */
 	param[0] = FW_PARAM_DEV(MPSBGMAP);
 	val[0] = 0;
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 	if (rc == 0)
 		sc->params.mps_bg_map = val[0];
 	else
 		sc->params.mps_bg_map = 0;
 
 	/*
 	 * Determine whether the firmware supports the filter2 work request.
 	 * This is queried separately for the same reason as MPSBGMAP above.
 	 */
 	param[0] = FW_PARAM_DEV(FILTER2_WR);
 	val[0] = 0;
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 	if (rc == 0)
 		sc->params.filter2_wr_support = val[0] != 0;
 	else
 		sc->params.filter2_wr_support = 0;
 
 	/*
 	 * Find out whether we're allowed to use the ULPTX MEMWRITE DSGL.
 	 * This is queried separately for the same reason as other params above.
 	 */
 	param[0] = FW_PARAM_DEV(ULPTX_MEMWRITE_DSGL);
 	val[0] = 0;
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 	if (rc == 0)
 		sc->params.ulptx_memwrite_dsgl = val[0] != 0;
 	else
 		sc->params.ulptx_memwrite_dsgl = false;
 
 	/* FW_RI_FR_NSMR_TPTE_WR support */
 	param[0] = FW_PARAM_DEV(RI_FR_NSMR_TPTE_WR);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 	if (rc == 0)
 		sc->params.fr_nsmr_tpte_wr_support = val[0] != 0;
 	else
 		sc->params.fr_nsmr_tpte_wr_support = false;
 
 	/* Support for 512 SGL entries per FR MR. */
 	param[0] = FW_PARAM_DEV(DEV_512SGL_MR);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 	if (rc == 0)
 		sc->params.dev_512sgl_mr = val[0] != 0;
 	else
 		sc->params.dev_512sgl_mr = false;
 
 	param[0] = FW_PARAM_PFVF(MAX_PKTS_PER_ETH_TX_PKTS_WR);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 	if (rc == 0)
 		sc->params.max_pkts_per_eth_tx_pkts_wr = val[0];
 	else
 		sc->params.max_pkts_per_eth_tx_pkts_wr = 15;
 
 	param[0] = FW_PARAM_DEV(NUM_TM_CLASS);
 	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 	if (rc == 0) {
 		MPASS(val[0] > 0 && val[0] < 256);	/* nsched_cls is 8b */
 		sc->params.nsched_cls = val[0];
 	} else
 		sc->params.nsched_cls = sc->chip_params->nsched_cls;
 
 	/* get capabilities */
 	bzero(&caps, sizeof(caps));
 	caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_READ);
 	caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps));
 	rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to get card capabilities: %d.\n", rc);
 		return (rc);
 	}
 
 #define READ_CAPS(x) do { \
 	sc->x = htobe16(caps.x); \
 } while (0)
 	READ_CAPS(nbmcaps);
 	READ_CAPS(linkcaps);
 	READ_CAPS(switchcaps);
 	READ_CAPS(niccaps);
 	READ_CAPS(toecaps);
 	READ_CAPS(rdmacaps);
 	READ_CAPS(cryptocaps);
 	READ_CAPS(iscsicaps);
 	READ_CAPS(fcoecaps);
 
 	if (sc->niccaps & FW_CAPS_CONFIG_NIC_HASHFILTER) {
 		MPASS(chip_id(sc) > CHELSIO_T4);
 		MPASS(sc->toecaps == 0);
 		sc->toecaps = 0;
 
 		param[0] = FW_PARAM_DEV(NTID);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query HASHFILTER parameters: %d.\n", rc);
 			return (rc);
 		}
 		sc->tids.ntids = val[0];
 		if (sc->params.fw_vers < FW_VERSION32(1, 20, 5, 0)) {
 			MPASS(sc->tids.ntids >= sc->tids.nhpftids);
 			sc->tids.ntids -= sc->tids.nhpftids;
 		}
 		sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS);
 		sc->params.hash_filter = 1;
 	}
 	if (sc->niccaps & FW_CAPS_CONFIG_NIC_ETHOFLD) {
 		param[0] = FW_PARAM_PFVF(ETHOFLD_START);
 		param[1] = FW_PARAM_PFVF(ETHOFLD_END);
 		param[2] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 3, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query NIC parameters: %d.\n", rc);
 			return (rc);
 		}
 		if ((int)val[1] > (int)val[0]) {
 			sc->tids.etid_base = val[0];
 			sc->tids.etid_end = val[1];
 			sc->tids.netids = val[1] - val[0] + 1;
 			sc->params.eo_wr_cred = val[2];
 			sc->params.ethoffload = 1;
 		}
 	}
 	if (sc->toecaps) {
 		/* query offload-related parameters */
 		param[0] = FW_PARAM_DEV(NTID);
 		param[1] = FW_PARAM_PFVF(SERVER_START);
 		param[2] = FW_PARAM_PFVF(SERVER_END);
 		param[3] = FW_PARAM_PFVF(TDDP_START);
 		param[4] = FW_PARAM_PFVF(TDDP_END);
 		param[5] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query TOE parameters: %d.\n", rc);
 			return (rc);
 		}
 		sc->tids.ntids = val[0];
 		if (sc->params.fw_vers < FW_VERSION32(1, 20, 5, 0)) {
 			MPASS(sc->tids.ntids >= sc->tids.nhpftids);
 			sc->tids.ntids -= sc->tids.nhpftids;
 		}
 		sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS);
 		if ((int)val[2] > (int)val[1]) {
 			sc->tids.stid_base = val[1];
 			sc->tids.nstids = val[2] - val[1] + 1;
 		}
 		sc->vres.ddp.start = val[3];
 		sc->vres.ddp.size = val[4] - val[3] + 1;
 		sc->params.ofldq_wr_cred = val[5];
 		sc->params.offload = 1;
 	} else {
 		/*
 		 * The firmware attempts memfree TOE configuration for -SO cards
 		 * and will report toecaps=0 if it runs out of resources (this
 		 * depends on the config file).  It may not report 0 for other
 		 * capabilities dependent on the TOE in this case.  Set them to
 		 * 0 here so that the driver doesn't bother tracking resources
 		 * that will never be used.
 		 */
 		sc->iscsicaps = 0;
 		sc->rdmacaps = 0;
 	}
 	if (sc->rdmacaps) {
 		param[0] = FW_PARAM_PFVF(STAG_START);
 		param[1] = FW_PARAM_PFVF(STAG_END);
 		param[2] = FW_PARAM_PFVF(RQ_START);
 		param[3] = FW_PARAM_PFVF(RQ_END);
 		param[4] = FW_PARAM_PFVF(PBL_START);
 		param[5] = FW_PARAM_PFVF(PBL_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query RDMA parameters(1): %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.stag.start = val[0];
 		sc->vres.stag.size = val[1] - val[0] + 1;
 		sc->vres.rq.start = val[2];
 		sc->vres.rq.size = val[3] - val[2] + 1;
 		sc->vres.pbl.start = val[4];
 		sc->vres.pbl.size = val[5] - val[4] + 1;
 
 		param[0] = FW_PARAM_PFVF(SQRQ_START);
 		param[1] = FW_PARAM_PFVF(SQRQ_END);
 		param[2] = FW_PARAM_PFVF(CQ_START);
 		param[3] = FW_PARAM_PFVF(CQ_END);
 		param[4] = FW_PARAM_PFVF(OCQ_START);
 		param[5] = FW_PARAM_PFVF(OCQ_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query RDMA parameters(2): %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.qp.start = val[0];
 		sc->vres.qp.size = val[1] - val[0] + 1;
 		sc->vres.cq.start = val[2];
 		sc->vres.cq.size = val[3] - val[2] + 1;
 		sc->vres.ocq.start = val[4];
 		sc->vres.ocq.size = val[5] - val[4] + 1;
 
 		param[0] = FW_PARAM_PFVF(SRQ_START);
 		param[1] = FW_PARAM_PFVF(SRQ_END);
 		param[2] = FW_PARAM_DEV(MAXORDIRD_QP);
 		param[3] = FW_PARAM_DEV(MAXIRD_ADAPTER);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 4, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query RDMA parameters(3): %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.srq.start = val[0];
 		sc->vres.srq.size = val[1] - val[0] + 1;
 		sc->params.max_ordird_qp = val[2];
 		sc->params.max_ird_adapter = val[3];
 	}
 	if (sc->iscsicaps) {
 		param[0] = FW_PARAM_PFVF(ISCSI_START);
 		param[1] = FW_PARAM_PFVF(ISCSI_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query iSCSI parameters: %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.iscsi.start = val[0];
 		sc->vres.iscsi.size = val[1] - val[0] + 1;
 	}
 	if (sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS) {
 		param[0] = FW_PARAM_PFVF(TLS_START);
 		param[1] = FW_PARAM_PFVF(TLS_END);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to query TLS parameters: %d.\n", rc);
 			return (rc);
 		}
 		sc->vres.key.start = val[0];
 		sc->vres.key.size = val[1] - val[0] + 1;
 	}
 
 	/*
 	 * We've got the params we wanted to query directly from the firmware.
 	 * Grab some others via other means.
 	 */
 	t4_init_sge_params(sc);
 	t4_init_tp_params(sc);
 	t4_read_mtu_tbl(sc, sc->params.mtus, NULL);
 	t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd);
 
 	rc = t4_verify_chip_settings(sc);
 	if (rc != 0)
 		return (rc);
 	t4_init_rx_buf_info(sc);
 
 	return (rc);
 }
 
 #ifdef KERN_TLS
 /*
  * Callout handler.  While KERN_TLS_ON is set in sc->flags it writes the
  * current tcp_ts_getticks() value into the TP_SYNC_TIME_HI/LO registers, and
  * it always reschedules itself to run again after SBT_1MS.
  */
 static void
 ktls_tick(void *arg)
 {
 	struct adapter *sc = arg;
 
 	if ((sc->flags & KERN_TLS_ON) != 0) {
 		const uint32_t now = tcp_ts_getticks();
 
 		/* Bits 31:1 go to HI; bit 0 lands in the top bit of LO. */
 		t4_write_reg(sc, A_TP_SYNC_TIME_HI, now >> 1);
 		t4_write_reg(sc, A_TP_SYNC_TIME_LO, now << 31);
 	}
 
 	/* Re-arm regardless of whether the registers were updated. */
 	callout_schedule_sbt(&sc->ktls_tick, SBT_1MS, 0, C_HARDCLOCK);
 }
 
 /*
  * Asks the firmware to enable or disable NIC TLS (the DEV_KTLS_HW parameter)
  * and mirrors the outcome in the KERN_TLS_ON flag.  Returns 0 on success or
  * the (positive) error from t4_set_params, in which case sc->flags is left
  * untouched.
  */
 static int
 t4_config_kern_tls(struct adapter *sc, bool enable)
 {
 	uint32_t param;
 	int rc;
 
 	param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_KTLS_HW) |
 	    V_FW_PARAMS_PARAM_Y(enable ? 1 : 0) |
 	    V_FW_PARAMS_PARAM_Z(FW_PARAMS_PARAM_DEV_KTLS_HW_USER_ENABLE);
 
 	/* The same word is used as both the parameter id and its value. */
 	rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &param);
 	if (rc == 0) {
 		if (enable)
 			sc->flags |= KERN_TLS_ON;
 		else
 			sc->flags &= ~KERN_TLS_ON;
 		return (rc);
 	}
 
 	CH_ERR(sc, "failed to %s NIC TLS: %d\n",
 	    enable ?  "enable" : "disable", rc);
 	return (rc);
 }
 #endif
 
 /*
  * Applies driver-selected settings to the firmware and the chip:
  *  - requests encapsulated CPLs and 32b port capabilities from the firmware,
  *  - programs the MASKFILTER field of TP_RSS_CONFIG_TNL,
  *  - programs the drop-on-error policy in TP_ERR_CONFIG from the t4_drop_* /
  *    t4_attack_filter tunables,
  *  - (TCP_OFFLOAD) overrides TOE timers with any tunables that were set,
  *  - (KERN_TLS) configures TP_FRAG_CONFIG and the TLS tunables.
  *
  * Always returns 0: the results of the individual t4_set_params calls and of
  * t4_config_kern_tls are not propagated to the caller.
  */
 static int
 set_params__post_init(struct adapter *sc)
 {
 	uint32_t mask, param, val;
 #ifdef TCP_OFFLOAD
 	int i, v, shift;
 #endif
 
 	/* ask for encapsulated CPLs */
 	param = FW_PARAM_PFVF(CPLFW4MSG_ENCAP);
 	val = 1;
 	/* Best effort; the result is deliberately discarded. */
 	(void)t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 
 	/* Enable 32b port caps if the firmware supports it. */
 	param = FW_PARAM_PFVF(PORT_CAPS32);
 	val = 1;
 	if (t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val) == 0)
 		sc->params.port_caps32 = 1;
 
 	/* Let filter + maskhash steer to a part of the VI's RSS region. */
 	/*
 	 * val is 2^(MASKSIZE - 1), so (val - 1) below is a mask with the low
 	 * (MASKSIZE - 1) bits set.
 	 * NOTE(review): assumes MASKSIZE read from the register is >= 1;
 	 * confirm, since a value of 0 would make the shift count negative.
 	 */
 	val = 1 << (G_MASKSIZE(t4_read_reg(sc, A_TP_RSS_CONFIG_TNL)) - 1);
 	t4_set_reg_field(sc, A_TP_RSS_CONFIG_TNL, V_MASKFILTER(M_MASKFILTER),
 	    V_MASKFILTER(val - 1));
 
 	/*
 	 * mask names every drop-on-error bit that is managed here; val
 	 * accumulates only the ones selected by the tunables below.
 	 */
 	mask = F_DROPERRORANY | F_DROPERRORMAC | F_DROPERRORIPVER |
 	    F_DROPERRORFRAG | F_DROPERRORATTACK | F_DROPERRORETHHDRLEN |
 	    F_DROPERRORIPHDRLEN | F_DROPERRORTCPHDRLEN | F_DROPERRORPKTLEN |
 	    F_DROPERRORTCPOPT | F_DROPERRORCSUMIP | F_DROPERRORCSUM;
 	val = 0;
 	/* The attack filter is only enabled on chips older than T6. */
 	if (chip_id(sc) < CHELSIO_T6 && t4_attack_filter != 0) {
 		t4_set_reg_field(sc, A_TP_GLOBAL_CONFIG, F_ATTACKFILTERENABLE,
 		    F_ATTACKFILTERENABLE);
 		val |= F_DROPERRORATTACK;
 	}
 	if (t4_drop_ip_fragments != 0) {
 		t4_set_reg_field(sc, A_TP_GLOBAL_CONFIG, F_FRAGMENTDROP,
 		    F_FRAGMENTDROP);
 		val |= F_DROPERRORFRAG;
 	}
 	if (t4_drop_pkts_with_l2_errors != 0)
 		val |= F_DROPERRORMAC | F_DROPERRORETHHDRLEN;
 	if (t4_drop_pkts_with_l3_errors != 0) {
 		val |= F_DROPERRORIPVER | F_DROPERRORIPHDRLEN |
 		    F_DROPERRORCSUMIP;
 	}
 	if (t4_drop_pkts_with_l4_errors != 0) {
 		val |= F_DROPERRORTCPHDRLEN | F_DROPERRORPKTLEN |
 		    F_DROPERRORTCPOPT | F_DROPERRORCSUM;
 	}
 	t4_set_reg_field(sc, A_TP_ERR_CONFIG, mask, val);
 
 #ifdef TCP_OFFLOAD
 	/*
 	 * Override the TOE timers with user provided tunables.  This is not the
 	 * recommended way to change the timers (the firmware config file is) so
 	 * these tunables are not documented.
 	 *
 	 * All the timer tunables are in microseconds.
 	 */
 	/* A tunable left at 0 means "do not override" for each timer below. */
 	if (t4_toe_keepalive_idle != 0) {
 		v = us_to_tcp_ticks(sc, t4_toe_keepalive_idle);
 		v &= M_KEEPALIVEIDLE;
 		t4_set_reg_field(sc, A_TP_KEEP_IDLE,
 		    V_KEEPALIVEIDLE(M_KEEPALIVEIDLE), V_KEEPALIVEIDLE(v));
 	}
 	if (t4_toe_keepalive_interval != 0) {
 		v = us_to_tcp_ticks(sc, t4_toe_keepalive_interval);
 		v &= M_KEEPALIVEINTVL;
 		t4_set_reg_field(sc, A_TP_KEEP_INTVL,
 		    V_KEEPALIVEINTVL(M_KEEPALIVEINTVL), V_KEEPALIVEINTVL(v));
 	}
 	if (t4_toe_keepalive_count != 0) {
 		/* A count, not a time: used as is (MAXR1 is forced to 1). */
 		v = t4_toe_keepalive_count & M_KEEPALIVEMAXR2;
 		t4_set_reg_field(sc, A_TP_SHIFT_CNT,
 		    V_KEEPALIVEMAXR1(M_KEEPALIVEMAXR1) |
 		    V_KEEPALIVEMAXR2(M_KEEPALIVEMAXR2),
 		    V_KEEPALIVEMAXR1(1) | V_KEEPALIVEMAXR2(v));
 	}
 	if (t4_toe_rexmt_min != 0) {
 		v = us_to_tcp_ticks(sc, t4_toe_rexmt_min);
 		v &= M_RXTMIN;
 		t4_set_reg_field(sc, A_TP_RXT_MIN,
 		    V_RXTMIN(M_RXTMIN), V_RXTMIN(v));
 	}
 	if (t4_toe_rexmt_max != 0) {
 		v = us_to_tcp_ticks(sc, t4_toe_rexmt_max);
 		v &= M_RXTMAX;
 		t4_set_reg_field(sc, A_TP_RXT_MAX,
 		    V_RXTMAX(M_RXTMAX), V_RXTMAX(v));
 	}
 	if (t4_toe_rexmt_count != 0) {
 		/* A count, not a time: used as is (MAXR1 is forced to 1). */
 		v = t4_toe_rexmt_count & M_RXTSHIFTMAXR2;
 		t4_set_reg_field(sc, A_TP_SHIFT_CNT,
 		    V_RXTSHIFTMAXR1(M_RXTSHIFTMAXR1) |
 		    V_RXTSHIFTMAXR2(M_RXTSHIFTMAXR2),
 		    V_RXTSHIFTMAXR1(1) | V_RXTSHIFTMAXR2(v));
 	}
 	/*
 	 * Backoff entries are packed four to a 32-bit register, one per byte:
 	 * (i & ~3) is the byte offset of the register that holds entry i and
 	 * (i & 3) << 3 is the bit offset of the entry within it.  -1 means the
 	 * entry was not set by the user.
 	 */
 	for (i = 0; i < nitems(t4_toe_rexmt_backoff); i++) {
 		if (t4_toe_rexmt_backoff[i] != -1) {
 			v = t4_toe_rexmt_backoff[i] & M_TIMERBACKOFFINDEX0;
 			shift = (i & 3) << 3;
 			t4_set_reg_field(sc, A_TP_TCP_BACKOFF_REG0 + (i & ~3),
 			    M_TIMERBACKOFFINDEX0 << shift, v << shift);
 		}
 	}
 #endif
 
 #ifdef KERN_TLS
 	if (sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS &&
 	    sc->toecaps & FW_CAPS_CONFIG_TOE) {
 		/*
 		 * Limit TOE connections to 2 reassembly "islands".  This is
 		 * required for TOE TLS connections to downgrade to plain TOE
 		 * connections if an unsupported TLS version or ciphersuite is
 		 * used.
 		 */
 		t4_tp_wr_bits_indirect(sc, A_TP_FRAG_CONFIG,
 		    V_PASSMODE(M_PASSMODE), V_PASSMODE(2));
 		if (is_ktls(sc)) {
 			sc->tlst.inline_keys = t4_tls_inline_keys;
 			sc->tlst.combo_wrs = t4_tls_combo_wrs;
 			/* A failure here is reported by the callee only. */
 			if (t4_kern_tls != 0)
 				t4_config_kern_tls(sc, true);
 		}
 	}
 #endif
 	return (0);
 }
 
 #undef FW_PARAM_PFVF
 #undef FW_PARAM_DEV
 
 /*
  * Sets the device description to "Chelsio " followed by the id string from
  * the adapter's VPD parameters.
  */
 static void
 t4_set_desc(struct adapter *sc)
 {
 	char desc[128];
 
 	snprintf(desc, sizeof(desc), "Chelsio %s", sc->params.vpd.id);
 	device_set_desc_copy(sc->dev, desc);
 }
 
 static inline void
 ifmedia_add4(struct ifmedia *ifm, int m)
 {
 
 	ifmedia_add(ifm, m, 0, NULL);
 	ifmedia_add(ifm, m | IFM_ETH_TXPAUSE, 0, NULL);
 	ifmedia_add(ifm, m | IFM_ETH_RXPAUSE, 0, NULL);
 	ifmedia_add(ifm, m | IFM_ETH_TXPAUSE | IFM_ETH_RXPAUSE, 0, NULL);
 }
 
 /*
  * Picks the selected media for the port from the requested link settings.
  * The selected media is not quite the same as the active media: ifconfig
  * shows "media: Ethernet selected (active)" when the two differ and
  * "media: Ethernet selected" when they are the same.
  *
  * The port lock must be held.
  */
 static void
 set_current_media(struct port_info *pi)
 {
 	struct ifmedia *ifm = &pi->media;
 	struct link_config *lc = &pi->link_cfg;
 	u_int mbps;
 	int media;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	/* A current selection of IFM_NONE is never replaced here. */
 	if (ifm->ifm_cur != NULL &&
 	    IFM_SUBTYPE(ifm->ifm_cur->ifm_media) == IFM_NONE)
 		return;
 
 	/* Autoselect if autoneg wasn't disabled and the port can do it. */
 	if (lc->requested_aneg != AUTONEG_DISABLE &&
 	    (lc->pcaps & FW_PORT_CAP32_ANEG) != 0) {
 		ifmedia_set(ifm, IFM_ETHER | IFM_AUTO);
 		return;
 	}
 
 	/* Fixed media: full duplex plus whatever pause was requested. */
 	media = IFM_ETHER | IFM_FDX;
 	if ((lc->requested_fc & PAUSE_TX) != 0)
 		media |= IFM_ETH_TXPAUSE;
 	if ((lc->requested_fc & PAUSE_RX) != 0)
 		media |= IFM_ETH_RXPAUSE;
 
 	/* No requested speed means the port's top speed. */
 	if (lc->requested_speed != 0)
 		mbps = lc->requested_speed;
 	else
 		mbps = port_top_speed(pi) * 1000;	/* Gbps -> Mbps */
 
 	media |= port_mword(pi, speed_to_fwcap(mbps));
 	ifmedia_set(ifm, media);
 }
 
 /*
  * Returns true if the ifmedia list for the port cannot change.
  */
 static bool
 fixed_ifmedia(struct port_info *pi)
 {
 
 	return (pi->port_type == FW_PORT_TYPE_BT_SGMII ||
 	    pi->port_type == FW_PORT_TYPE_BT_XFI ||
 	    pi->port_type == FW_PORT_TYPE_BT_XAUI ||
 	    pi->port_type == FW_PORT_TYPE_KX4 ||
 	    pi->port_type == FW_PORT_TYPE_KX ||
 	    pi->port_type == FW_PORT_TYPE_KR ||
 	    pi->port_type == FW_PORT_TYPE_BP_AP ||
 	    pi->port_type == FW_PORT_TYPE_BP4_AP ||
 	    pi->port_type == FW_PORT_TYPE_BP40_BA ||
 	    pi->port_type == FW_PORT_TYPE_KR4_100G ||
 	    pi->port_type == FW_PORT_TYPE_KR_SFP28 ||
 	    pi->port_type == FW_PORT_TYPE_KR_XLAUI);
 }
 
 /*
  * Rebuilds the port's ifmedia list from the speeds advertised in the port
  * capabilities (lc->pcaps) and then selects the current media.
  *
  * Does nothing if the port is flagged FIXED_IFMEDIA.  If no speed is
  * advertised, or port_mword() reports IFM_NONE for a speed, the list ends up
  * with a single IFM_ETHER | IFM_NONE entry which is also made current.
  *
  * The port lock must be held.
  */
 static void
 build_medialist(struct port_info *pi)
 {
 	uint32_t ss, speed;
 	int unknown, mword, bit;
 	struct link_config *lc;
 	struct ifmedia *ifm;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	if (pi->flags & FIXED_IFMEDIA)
 		return;
 
 	/*
 	 * Rebuild the ifmedia list.
 	 */
 	ifm = &pi->media;
 	ifmedia_removeall(ifm);
 	lc = &pi->link_cfg;
 	ss = G_FW_PORT_CAP32_SPEED(lc->pcaps); /* Supported Speeds */
 	if (__predict_false(ss == 0)) {	/* not supposed to happen. */
 		MPASS(ss != 0);
 		/* Also the target of the goto in the speed loop below. */
 no_media:
 		MPASS(LIST_EMPTY(&ifm->ifm_list));
 		ifmedia_add(ifm, IFM_ETHER | IFM_NONE, 0, NULL);
 		ifmedia_set(ifm, IFM_ETHER | IFM_NONE);
 		return;
 	}
 
 	unknown = 0;
 	/* Walk the speed bits from the lowest one up to the highest one set. */
 	for (bit = S_FW_PORT_CAP32_SPEED; bit < fls(ss); bit++) {
 		speed = 1 << bit;
 		MPASS(speed & M_FW_PORT_CAP32_SPEED);
 		if (ss & speed) {
 			mword = port_mword(pi, speed);
 			if (mword == IFM_NONE) {
 				/*
 				 * NOTE(review): no_media asserts that the list
 				 * is empty, so this presumably can only be hit
 				 * before any entry was added -- confirm.
 				 */
 				goto no_media;
 			} else if (mword == IFM_UNKNOWN)
 				unknown++;
 			else
 				ifmedia_add4(ifm, IFM_ETHER | IFM_FDX | mword);
 		}
 	}
 	if (unknown > 0) /* Add one unknown for all unknown media types. */
 		ifmedia_add4(ifm, IFM_ETHER | IFM_FDX | IFM_UNKNOWN);
 	if (lc->pcaps & FW_PORT_CAP32_ANEG)
 		ifmedia_add(ifm, IFM_ETHER | IFM_AUTO, 0, NULL);
 
 	set_current_media(pi);
 }
 
 /*
  * Seeds the requested_* fields of the port's link config from the driver
  * tunables (t4_autoneg, t4_pause_settings, t4_fec).  The requested speed
  * always starts out as 0.  The port lock must be held.
  */
 static void
 init_link_config(struct port_info *pi)
 {
 	struct link_config *lc = &pi->link_cfg;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	lc->requested_speed = 0;
 
 	/* 0 = off, 1 = on, anything else = let the driver decide. */
 	switch (t4_autoneg) {
 	case 0:
 		lc->requested_aneg = AUTONEG_DISABLE;
 		break;
 	case 1:
 		lc->requested_aneg = AUTONEG_ENABLE;
 		break;
 	default:
 		lc->requested_aneg = AUTONEG_AUTO;
 		break;
 	}
 
 	/* Only the known pause bits are carried over. */
 	lc->requested_fc = t4_pause_settings & (PAUSE_TX | PAUSE_RX |
 	    PAUSE_AUTONEG);
 
 	if ((t4_fec & FEC_AUTO) != 0) {
 		/* This includes a tunable value of -1. */
 		lc->requested_fec = FEC_AUTO;
 	} else if (t4_fec == 0) {
 		lc->requested_fec = FEC_NONE;
 	} else {
 		/* Keep the recognized bits; fall back to auto if none. */
 		lc->requested_fec = t4_fec &
 		    (FEC_RS | FEC_BASER_RS | FEC_NONE | FEC_MODULE);
 		if (lc->requested_fec == 0)
 			lc->requested_fec = FEC_AUTO;
 	}
 }
 
 /*
  * Makes sure that all requested settings comply with what's supported by the
  * port.  Returns the number of settings that were invalid and had to be fixed.
  */
 static int
 fixup_link_config(struct port_info *pi)
 {
 	int n = 0;
 	struct link_config *lc = &pi->link_cfg;
 	uint32_t fwspeed;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	/* Speed (when not autonegotiating) */
 	if (lc->requested_speed != 0) {
 		fwspeed = speed_to_fwcap(lc->requested_speed);
 		if ((fwspeed & lc->pcaps) == 0) {
 			n++;
 			lc->requested_speed = 0;
 		}
 	}
 
 	/* Link autonegotiation */
 	MPASS(lc->requested_aneg == AUTONEG_ENABLE ||
 	    lc->requested_aneg == AUTONEG_DISABLE ||
 	    lc->requested_aneg == AUTONEG_AUTO);
 	if (lc->requested_aneg == AUTONEG_ENABLE &&
 	    !(lc->pcaps & FW_PORT_CAP32_ANEG)) {
 		n++;
 		lc->requested_aneg = AUTONEG_AUTO;
 	}
 
 	/* Flow control */
 	MPASS((lc->requested_fc & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)) == 0);
 	if (lc->requested_fc & PAUSE_TX &&
 	    !(lc->pcaps & FW_PORT_CAP32_FC_TX)) {
 		n++;
 		lc->requested_fc &= ~PAUSE_TX;
 	}
 	if (lc->requested_fc & PAUSE_RX &&
 	    !(lc->pcaps & FW_PORT_CAP32_FC_RX)) {
 		n++;
 		lc->requested_fc &= ~PAUSE_RX;
 	}
 	if (!(lc->requested_fc & PAUSE_AUTONEG) &&
 	    !(lc->pcaps & FW_PORT_CAP32_FORCE_PAUSE)) {
 		n++;
 		lc->requested_fc |= PAUSE_AUTONEG;
 	}
 
 	/* FEC */
 	if ((lc->requested_fec & FEC_RS &&
 	    !(lc->pcaps & FW_PORT_CAP32_FEC_RS)) ||
 	    (lc->requested_fec & FEC_BASER_RS &&
 	    !(lc->pcaps & FW_PORT_CAP32_FEC_BASER_RS))) {
 		n++;
 		lc->requested_fec = FEC_AUTO;
 	}
 
 	return (n);
 }
 
 /*
  * Pushes the requested L1 settings to the hardware via t4_link_l1cfg.  The
  * settings are expected to be valid already (this is asserted when
  * INVARIANTS is defined).  Returns 0 or the error from t4_link_l1cfg.
  */
 static int
 apply_link_config(struct port_info *pi)
 {
 	struct link_config *lc = &pi->link_cfg;
 	struct adapter *sc = pi->adapter;
 	int rc;
 
 #ifdef INVARIANTS
 	ASSERT_SYNCHRONIZED_OP(sc);
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	if (lc->requested_aneg == AUTONEG_ENABLE)
 		MPASS(lc->pcaps & FW_PORT_CAP32_ANEG);
 	if (!(lc->requested_fc & PAUSE_AUTONEG))
 		MPASS(lc->pcaps & FW_PORT_CAP32_FORCE_PAUSE);
 	if (lc->requested_fc & PAUSE_TX)
 		MPASS(lc->pcaps & FW_PORT_CAP32_FC_TX);
 	if (lc->requested_fc & PAUSE_RX)
 		MPASS(lc->pcaps & FW_PORT_CAP32_FC_RX);
 	if (lc->requested_fec & FEC_RS)
 		MPASS(lc->pcaps & FW_PORT_CAP32_FEC_RS);
 	if (lc->requested_fec & FEC_BASER_RS)
 		MPASS(lc->pcaps & FW_PORT_CAP32_FEC_BASER_RS);
 #endif
 	rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc);
 	if (rc == 0) {
 		/*
 		 * An L1_CFG will almost always result in a link-change event
 		 * if the link is up, and the driver refreshes the actual
 		 * fec/fc/etc. when that notification is processed.  If the
 		 * link is down the actual settings are meaningless.
 		 *
 		 * The update here covers the case where a change in the L1
 		 * settings does not result in a notification.
 		 */
 		if (lc->link_ok && (lc->requested_fc & PAUSE_AUTONEG) == 0)
 			lc->fc = lc->requested_fc & (PAUSE_TX | PAUSE_RX);
 		return (rc);
 	}
 
 	/* Don't complain if the VF driver gets back an EPERM. */
 	if ((sc->flags & IS_VF) == 0 || rc != FW_EPERM)
 		device_printf(pi->dev, "l1cfg failed: %d\n", rc);
 
 	return (rc);
 }
 
 /* Number of multicast addresses handed to t4_alloc_mac_filt in one call. */
 #define FW_MAC_EXACT_CHUNK	7
 /*
  * State shared between update_mac_settings() and its add_maddr() callback
  * while the interface's multicast addresses are being walked.
  */
 struct mcaddr_ctx {
 	struct ifnet *ifp;	/* interface whose addresses are walked */
 	/* addresses collected so far but not yet given to the hardware */
 	const uint8_t *mcaddr[FW_MAC_EXACT_CHUNK];
 	uint64_t hash;		/* updated by t4_alloc_mac_filt */
 	int i;			/* number of valid entries in mcaddr[] */
 	/* passed to t4_alloc_mac_filt; starts at 1, 0 after the first success */
 	int del;
 	int rc;			/* last t4_alloc_mac_filt result, < 0 on error */
 };
 
 /*
  * if_foreach_llmaddr() callback.  Collects the link-level multicast address
  * in the context and, once FW_MAC_EXACT_CHUNK of them are pending, hands the
  * batch to t4_alloc_mac_filt.  Returns 1 if the address was accepted, or 0 if
  * an earlier batch already failed or this batch fails (ctx->rc is then
  * negative).
  */
 static u_int
 add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
 {
 	struct mcaddr_ctx *ctx = arg;
 	struct vi_info *vi;
 	struct adapter *sc;
 	int k;
 
 	/* Nothing more to do after a failure. */
 	if (ctx->rc < 0)
 		return (0);
 
 	vi = ctx->ifp->if_softc;
 	sc = vi->pi->adapter;
 
 	ctx->mcaddr[ctx->i] = LLADDR(sdl);
 	MPASS(ETHER_IS_MULTICAST(ctx->mcaddr[ctx->i]));
 	ctx->i++;
 
 	/* Keep collecting until a full chunk is pending. */
 	if (ctx->i != FW_MAC_EXACT_CHUNK)
 		return (1);
 
 	ctx->rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid, ctx->del,
 	    ctx->i, ctx->mcaddr, NULL, &ctx->hash, 0);
 	if (ctx->rc >= 0) {
 		/* Batch accepted: start a fresh one, no delete next time. */
 		ctx->del = 0;
 		ctx->i = 0;
 		return (1);
 	}
 
 	/* Report every address that was in the failed batch. */
 	for (k = 0; k < ctx->i; k++) {
 		if_printf(ctx->ifp,
 		    "failed to add mc address"
 		    " %02x:%02x:%02x:"
 		    "%02x:%02x:%02x rc=%d\n",
 		    ctx->mcaddr[k][0], ctx->mcaddr[k][1],
 		    ctx->mcaddr[k][2], ctx->mcaddr[k][3],
 		    ctx->mcaddr[k][4], ctx->mcaddr[k][5],
 		    -ctx->rc);
 	}
 	return (0);
 }
 
 /*
  * Program the port's XGMAC based on parameters in ifnet.  The caller also
  * indicates which parameters should be programmed (the rest are left alone).
  *
  * flags is a non-zero combination of XGMAC_* bits: MTU, PROMISC, ALLMULTI and
  * VLANEX are applied with a single t4_set_rxmode call, UCADDR reprograms the
  * unicast address, and MCADDRS reloads the multicast filters and hash.
  * Independently of flags, the VXLAN TCAM entry is (re)installed for a main VI
  * when one is wanted and not currently present.
  *
  * Returns 0 on success or a positive error code.  Must be called from within
  * a synchronized operation.
  */
 int
 update_mac_settings(struct ifnet *ifp, int flags)
 {
 	int rc = 0;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	/* -1 is passed through for any setting that was not asked for. */
 	int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1;
 	uint8_t match_all_mac[ETHER_ADDR_LEN] = {0};
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	KASSERT(flags, ("%s: not told what to update.", __func__));
 
 	if (flags & XGMAC_MTU)
 		mtu = ifp->if_mtu;
 
 	if (flags & XGMAC_PROMISC)
 		promisc = ifp->if_flags & IFF_PROMISC ? 1 : 0;
 
 	if (flags & XGMAC_ALLMULTI)
 		allmulti = ifp->if_flags & IFF_ALLMULTI ? 1 : 0;
 
 	if (flags & XGMAC_VLANEX)
 		vlanex = ifp->if_capenable & IFCAP_VLAN_HWTAGGING ? 1 : 0;
 
 	if (flags & (XGMAC_MTU|XGMAC_PROMISC|XGMAC_ALLMULTI|XGMAC_VLANEX)) {
 		rc = -t4_set_rxmode(sc, sc->mbox, vi->viid, mtu, promisc,
 		    allmulti, 1, vlanex, false);
 		if (rc) {
 			if_printf(ifp, "set_rxmode (%x) failed: %d\n", flags,
 			    rc);
 			return (rc);
 		}
 	}
 
 	if (flags & XGMAC_UCADDR) {
 		uint8_t ucaddr[ETHER_ADDR_LEN];
 
 		bcopy(IF_LLADDR(ifp), ucaddr, sizeof(ucaddr));
 		/* A non-negative return value is the filter index in use. */
 		rc = t4_change_mac(sc, sc->mbox, vi->viid, vi->xact_addr_filt,
 		    ucaddr, true, &vi->smt_idx);
 		if (rc < 0) {
 			rc = -rc;
 			if_printf(ifp, "change_mac failed: %d\n", rc);
 			return (rc);
 		} else {
 			vi->xact_addr_filt = rc;
 			rc = 0;
 		}
 	}
 
 	if (flags & XGMAC_MCADDRS) {
 		struct epoch_tracker et;
 		struct mcaddr_ctx ctx;
 		int j;
 
 		ctx.ifp = ifp;
 		ctx.hash = 0;
 		ctx.i = 0;
 		ctx.del = 1;
 		ctx.rc = 0;
 		/*
 		 * Unlike other drivers, we accumulate list of pointers into
 		 * interface address lists and we need to keep it safe even
 		 * after if_foreach_llmaddr() returns, thus we must enter the
 		 * network epoch.
 		 */
 		NET_EPOCH_ENTER(et);
 		if_foreach_llmaddr(ifp, add_maddr, &ctx);
 		/* add_maddr has already logged the addresses that failed. */
 		if (ctx.rc < 0) {
 			NET_EPOCH_EXIT(et);
 			rc = -ctx.rc;
 			return (rc);
 		}
 		/* Flush the last, partially filled, batch of addresses. */
 		if (ctx.i > 0) {
 			rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid,
 			    ctx.del, ctx.i, ctx.mcaddr, NULL, &ctx.hash, 0);
 			NET_EPOCH_EXIT(et);
 			if (rc < 0) {
 				rc = -rc;
 				/*
 				 * NOTE(review): ctx.mcaddr[] is dereferenced
 				 * here after the epoch has been exited --
 				 * confirm that this is safe.
 				 */
 				for (j = 0; j < ctx.i; j++) {
 					if_printf(ifp,
 					    "failed to add mcast address"
 					    " %02x:%02x:%02x:"
 					    "%02x:%02x:%02x rc=%d\n",
 					    ctx.mcaddr[j][0], ctx.mcaddr[j][1],
 					    ctx.mcaddr[j][2], ctx.mcaddr[j][3],
 					    ctx.mcaddr[j][4], ctx.mcaddr[j][5],
 					    rc);
 				}
 				return (rc);
 			}
 			ctx.del = 0;
 		} else
 			NET_EPOCH_EXIT(et);
 
 		/*
 		 * A failure here is logged but does not end the function; rc
 		 * is carried forward and may be overwritten by the VXLAN code
 		 * below.
 		 */
 		rc = -t4_set_addr_hash(sc, sc->mbox, vi->viid, 0, ctx.hash, 0);
 		if (rc != 0)
 			if_printf(ifp, "failed to set mcast address hash: %d\n",
 			    rc);
 		/* del == 0 means at least one batch was programmed above. */
 		if (ctx.del == 0) {
 			/* We clobbered the VXLAN entry if there was one. */
 			pi->vxlan_tcam_entry = false;
 		}
 	}
 
 	if (IS_MAIN_VI(vi) && sc->vxlan_refcount > 0 &&
 	    pi->vxlan_tcam_entry == false) {
 		/* Each port uses the raw filter at rawf_base + port_id. */
 		rc = t4_alloc_raw_mac_filt(sc, vi->viid, match_all_mac,
 		    match_all_mac, sc->rawf_base + pi->port_id, 1, pi->port_id,
 		    true);
 		if (rc < 0) {
 			rc = -rc;
 			if_printf(ifp, "failed to add VXLAN TCAM entry: %d.\n",
 			    rc);
 		} else {
 			MPASS(rc == sc->rawf_base + pi->port_id);
 			rc = 0;
 			pi->vxlan_tcam_entry = true;
 		}
 	}
 
 	return (rc);
 }
 
 /*
  * Marks the adapter busy on behalf of the caller so that it has exclusive
  * use of it until end_synchronized_op is called.
  *
  * sc     the adapter.
  * vi     optional; if not NULL and the VI is doomed the call fails.
  * flags  SLEEP_OK:  wait for the adapter if it is busy (instead of failing).
  *        INTR_OK:   the wait may be interrupted (PCATCH).
  *        HOLD_LOCK: return with the adapter lock held on success.
  * wmesg  wait message for mtx_sleep; also recorded as the last operation when
  *        INVARIANTS is defined.
  *
  * Returns 0 on success (the adapter is now marked busy), ENXIO if the VI is
  * doomed, EBUSY if the adapter is busy and SLEEP_OK was not given, or EINTR
  * if the sleep returned an error.  The adapter lock is never held on return
  * from a failed call.
  *
  * {begin|end}_synchronized_op must be called from the same thread.
  */
 int
 begin_synchronized_op(struct adapter *sc, struct vi_info *vi, int flags,
     char *wmesg)
 {
 	int rc, pri;
 
 #ifdef WITNESS
 	/* the caller thinks it's ok to sleep, but is it really? */
 	if (flags & SLEEP_OK)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "begin_synchronized_op");
 #endif
 
 	/*
 	 * Only callers that asked for it get an interruptible sleep.  (This
 	 * has to test the caller's flags; testing the INTR_OK constant by
 	 * itself is always true.)
 	 */
 	if (flags & INTR_OK)
 		pri = PCATCH;
 	else
 		pri = 0;
 
 	ADAPTER_LOCK(sc);
 	for (;;) {
 
 		if (vi && IS_DOOMED(vi)) {
 			rc = ENXIO;
 			goto done;
 		}
 
 		if (!IS_BUSY(sc)) {
 			rc = 0;
 			break;
 		}
 
 		if (!(flags & SLEEP_OK)) {
 			rc = EBUSY;
 			goto done;
 		}
 
 		/* Woken up by end_synchronized_op or doom_vi; recheck all. */
 		if (mtx_sleep(&sc->flags, &sc->sc_lock, pri, wmesg, 0)) {
 			rc = EINTR;
 			goto done;
 		}
 	}
 
 	KASSERT(!IS_BUSY(sc), ("%s: controller busy.", __func__));
 	SET_BUSY(sc);
 #ifdef INVARIANTS
 	sc->last_op = wmesg;
 	sc->last_op_thr = curthread;
 	sc->last_op_flags = flags;
 #endif
 
 done:
 	/* The lock is kept only for a successful HOLD_LOCK request. */
 	if (!(flags & HOLD_LOCK) || rc)
 		ADAPTER_UNLOCK(sc);
 
 	return (rc);
 }
 
 /*
  * Tell if_ioctl and if_init that the VI is going away.  This is a special
  * variant of begin_synchronized_op: it marks the VI doomed, waits (without
  * being interruptible) until the adapter is no longer busy, and then marks
  * the adapter busy itself.  It must be paired with a call to
  * end_synchronized_op.
  */
 void
 doom_vi(struct adapter *sc, struct vi_info *vi)
 {
 
 	ADAPTER_LOCK(sc);
 	SET_DOOMED(vi);
 
 	/* Let any sleepers notice that the VI is doomed. */
 	wakeup(&sc->flags);
 
 	for (;;) {
 		if (!IS_BUSY(sc))
 			break;
 		mtx_sleep(&sc->flags, &sc->sc_lock, 0, "t4detach", 0);
 	}
 
 	SET_BUSY(sc);
 #ifdef INVARIANTS
 	sc->last_op = "t4detach";
 	sc->last_op_thr = curthread;
 	sc->last_op_flags = 0;
 #endif
 	ADAPTER_UNLOCK(sc);
 }
 
 /*
  * Ends a synchronized operation: clears the adapter's busy state and wakes
  * up anyone waiting for it.  With LOCK_HELD in flags the caller must already
  * own the adapter lock, otherwise it is acquired here.  The lock is always
  * released before returning.
  *
  * {begin|end}_synchronized_op must be called from the same thread.
  */
 void
 end_synchronized_op(struct adapter *sc, int flags)
 {
 
 	if ((flags & LOCK_HELD) == 0) {
 		ADAPTER_LOCK(sc);
 	} else {
 		ADAPTER_LOCK_ASSERT_OWNED(sc);
 	}
 
 	KASSERT(IS_BUSY(sc), ("%s: controller not busy.", __func__));
 	CLR_BUSY(sc);
 	wakeup(&sc->flags);
 
 	ADAPTER_UNLOCK(sc);
 }
 
 /*
  * Brings the VI up.  Does nothing if the ifnet is already running.
  * Otherwise it completes adapter and VI initialization if they haven't been
  * done yet, programs the MAC settings, sets up the link (only when no other
  * VI on the port is up), enables the VI and its tx queues, and starts the
  * VI's tick callout.
  *
  * Returns 0 on success or an error code.  A failure in or after
  * update_mac_settings is followed by a call to cxgbe_uninit_synchronized.
  * Must be called from within a synchronized operation.
  */
 static int
 cxgbe_init_synchronized(struct vi_info *vi)
 {
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	int rc = 0, i;
 	struct sge_txq *txq;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 		return (0);	/* already running */
 
 	if (!(sc->flags & FULL_INIT_DONE) && ((rc = adapter_init(sc)) != 0))
 		return (rc);	/* error message displayed already */
 
 	if (!(vi->flags & VI_INIT_DONE) && ((rc = vi_init(vi)) != 0))
 		return (rc); /* error message displayed already */
 
 	rc = update_mac_settings(ifp, XGMAC_ALL);
 	if (rc)
 		goto done;	/* error message displayed already */
 
 	PORT_LOCK(pi);
 	/* The link is configured by the first VI to come up on the port. */
 	if (pi->up_vis == 0) {
 		t4_update_port_info(pi);
 		fixup_link_config(pi);
 		build_medialist(pi);
 		apply_link_config(pi);
 	}
 
 	rc = -t4_enable_vi(sc, sc->mbox, vi->viid, true, true);
 	if (rc != 0) {
 		if_printf(ifp, "enable_vi failed: %d\n", rc);
 		PORT_UNLOCK(pi);
 		goto done;
 	}
 
 	/*
 	 * Can't fail from this point onwards.  Review cxgbe_uninit_synchronized
 	 * if this changes.
 	 */
 
 	for_each_txq(vi, i, txq) {
 		TXQ_LOCK(txq);
 		txq->eq.flags |= EQ_ENABLED;
 		TXQ_UNLOCK(txq);
 	}
 
 	/*
 	 * The first iq of the first port to come up is used for tracing.
 	 */
 	if (sc->traceq < 0 && IS_MAIN_VI(vi)) {
 		sc->traceq = sc->sge.rxq[vi->first_rxq].iq.abs_id;
 		t4_write_reg(sc, is_t4(sc) ?  A_MPS_TRC_RSS_CONTROL :
 		    A_MPS_T5_TRC_RSS_CONTROL, V_RSSCONTROL(pi->tx_chan) |
 		    V_QUEUENUMBER(sc->traceq));
 		pi->flags |= HAS_TRACEQ;
 	}
 
 	/* all ok */
 	pi->up_vis++;
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	if (pi->link_cfg.link_ok)
 		t4_os_link_changed(pi);
 	PORT_UNLOCK(pi);
 
 	/* The tick handler depends on which if_get_counter the ifnet uses. */
 	mtx_lock(&vi->tick_mtx);
 	if (ifp->if_get_counter == vi_get_counter)
 		callout_reset(&vi->tick, hz, vi_tick, vi);
 	else
 		callout_reset(&vi->tick, hz, cxgbe_tick, vi);
 	mtx_unlock(&vi->tick_mtx);
 done:
 	/* Undo whatever was done before the failure. */
 	if (rc != 0)
 		cxgbe_uninit_synchronized(vi);
 
 	return (rc);
 }
 
 /*
  * Takes the VI down: disables it in the hardware, disables its tx queues,
  * stops its tick callout, and clears IFF_DRV_RUNNING.  The link is reported
  * as down when the last running VI on the port goes away.
  *
  * Returns 0 on success (including when there was nothing to do) or the error
  * from t4_enable_vi.  Must be called from within a synchronized operation.
  *
  * Idempotent.
  */
 static int
 cxgbe_uninit_synchronized(struct vi_info *vi)
 {
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	int rc, i;
 	struct sge_txq *txq;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	/* A VI that was never initialized should not be running. */
 	if (!(vi->flags & VI_INIT_DONE)) {
 		if (__predict_false(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			KASSERT(0, ("uninited VI is running"));
 			if_printf(ifp, "uninited VI with running ifnet.  "
 			    "vi->flags 0x%016lx, if_flags 0x%08x, "
 			    "if_drv_flags 0x%08x\n", vi->flags, ifp->if_flags,
 			    ifp->if_drv_flags);
 		}
 		return (0);
 	}
 
 	/*
 	 * Disable the VI so that all its data in either direction is discarded
 	 * by the MPS.  Leave everything else (the queues, interrupts, and 1Hz
 	 * tick) intact as the TP can deliver negative advice or data that it's
 	 * holding in its RAM (for an offloaded connection) even after the VI is
 	 * disabled.
 	 *
 	 * NOTE(review): the tick callout is stopped a few lines below, so the
 	 * mention of the tick being left intact looks out of date -- confirm.
 	 */
 	rc = -t4_enable_vi(sc, sc->mbox, vi->viid, false, false);
 	if (rc) {
 		if_printf(ifp, "disable_vi failed: %d\n", rc);
 		return (rc);
 	}
 
 	for_each_txq(vi, i, txq) {
 		TXQ_LOCK(txq);
 		txq->eq.flags &= ~EQ_ENABLED;
 		TXQ_UNLOCK(txq);
 	}
 
 	mtx_lock(&vi->tick_mtx);
 	callout_stop(&vi->tick);
 	mtx_unlock(&vi->tick_mtx);
 
 	PORT_LOCK(pi);
 	/* Not running: the up_vis count must not be touched again. */
 	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 		PORT_UNLOCK(pi);
 		return (0);
 	}
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	pi->up_vis--;
 	/* Other VIs on this port are still up; leave the link state alone. */
 	if (pi->up_vis > 0) {
 		PORT_UNLOCK(pi);
 		return (0);
 	}
 
 	/* That was the last VI on the port.  Report the link as down. */
 	pi->link_cfg.link_ok = false;
 	pi->link_cfg.speed = 0;
 	pi->link_cfg.link_down_rc = 255;
 	t4_os_link_changed(pi);
 	PORT_UNLOCK(pi);
 
 	return (0);
 }
 
 /*
  * Allocates the adapter's interrupts and attaches a handler to each one.
  *
  * If interrupts are being forwarded to the firmware event queue a single
  * interrupt (t4_intr_all) is set up.  Otherwise the order is: the error
  * interrupt (PF only), the firmware event queue, and then for every VI of
  * every port one interrupt per NIC/netmap rx queue followed (TCP_OFFLOAD) by
  * one per offload rx queue.
  *
  * Returns 0 on success or the error from t4_alloc_irq.
  *
  * It is ok for this function to fail midway and return right away.  t4_detach
  * will walk the entire sc->irq list and clean up whatever is valid.
  */
 int
 t4_setup_intr_handlers(struct adapter *sc)
 {
 	int rc, rid, p, q, v;
 	char s[8];
 	struct irq *irq;
 	struct port_info *pi;
 	struct vi_info *vi;
 	struct sge *sge = &sc->sge;
 	struct sge_rxq *rxq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #ifdef DEV_NETMAP
 	struct sge_nm_rxq *nm_rxq;
 #endif
 #ifdef RSS
 	int nbuckets = rss_getnumbuckets();
 #endif
 
 	/*
 	 * Setup interrupts.
 	 */
 	irq = &sc->irq[0];
 	/* The resource id starts at 0 for INTx and at 1 otherwise. */
 	rid = sc->intr_type == INTR_INTX ? 0 : 1;
 	if (forwarding_intr_to_fwq(sc))
 		return (t4_alloc_irq(sc, irq, rid, t4_intr_all, sc, "all"));
 
 	/* Multiple interrupts. */
 	if (sc->flags & IS_VF)
 		KASSERT(sc->intr_count >= T4VF_EXTRA_INTR + sc->params.nports,
 		    ("%s: too few intr.", __func__));
 	else
 		KASSERT(sc->intr_count >= T4_EXTRA_INTR + sc->params.nports,
 		    ("%s: too few intr.", __func__));
 
 	/* The first one is always error intr on PFs */
 	if (!(sc->flags & IS_VF)) {
 		rc = t4_alloc_irq(sc, irq, rid, t4_intr_err, sc, "err");
 		if (rc != 0)
 			return (rc);
 		irq++;
 		rid++;
 	}
 
 	/* The second one is always the firmware event queue (first on VFs) */
 	rc = t4_alloc_irq(sc, irq, rid, t4_intr_evt, &sge->fwq, "evt");
 	if (rc != 0)
 		return (rc);
 	irq++;
 	rid++;
 
 	for_each_port(sc, p) {
 		pi = sc->port[p];
 		for_each_vi(pi, v, vi) {
 			vi->first_intr = rid - 1;
 
 			if (vi->nnmrxq > 0) {
 				/*
 				 * NIC rxq q and netmap rxq q share interrupt q,
 				 * so max(nrxq, nnmrxq) interrupts are needed.
 				 */
 				int n = max(vi->nrxq, vi->nnmrxq);
 
 				rxq = &sge->rxq[vi->first_rxq];
 #ifdef DEV_NETMAP
 				nm_rxq = &sge->nm_rxq[vi->first_nm_rxq];
 #endif
 				for (q = 0; q < n; q++) {
 					/* name: port, VI ('a' + v), queue */
 					snprintf(s, sizeof(s), "%x%c%x", p,
 					    'a' + v, q);
 					if (q < vi->nrxq)
 						irq->rxq = rxq++;
 #ifdef DEV_NETMAP
 					if (q < vi->nnmrxq)
 						irq->nm_rxq = nm_rxq++;
 
 					if (irq->nm_rxq != NULL &&
 					    irq->rxq == NULL) {
 						/* Netmap rx only */
 						rc = t4_alloc_irq(sc, irq, rid,
 						    t4_nm_intr, irq->nm_rxq, s);
 					}
 					if (irq->nm_rxq != NULL &&
 					    irq->rxq != NULL) {
 						/* NIC and Netmap rx */
 						rc = t4_alloc_irq(sc, irq, rid,
 						    t4_vi_intr, irq, s);
 					}
 #endif
 					if (irq->rxq != NULL &&
 					    irq->nm_rxq == NULL) {
 						/* NIC rx only */
 						rc = t4_alloc_irq(sc, irq, rid,
 						    t4_intr, irq->rxq, s);
 					}
 					if (rc != 0)
 						return (rc);
 #ifdef RSS
 					/* Only NIC queues are bound to a CPU. */
 					if (q < vi->nrxq) {
 						bus_bind_intr(sc->dev, irq->res,
 						    rss_getcpu(q % nbuckets));
 					}
 #endif
 					irq++;
 					rid++;
 					vi->nintr++;
 				}
 			} else {
 				/* No netmap queues: one interrupt per NIC rxq. */
 				for_each_rxq(vi, q, rxq) {
 					snprintf(s, sizeof(s), "%x%c%x", p,
 					    'a' + v, q);
 					rc = t4_alloc_irq(sc, irq, rid,
 					    t4_intr, rxq, s);
 					if (rc != 0)
 						return (rc);
 #ifdef RSS
 					bus_bind_intr(sc->dev, irq->res,
 					    rss_getcpu(q % nbuckets));
 #endif
 					irq++;
 					rid++;
 					vi->nintr++;
 				}
 			}
 #ifdef TCP_OFFLOAD
 			/* Offload queues use an uppercase VI letter. */
 			for_each_ofld_rxq(vi, q, ofld_rxq) {
 				snprintf(s, sizeof(s), "%x%c%x", p, 'A' + v, q);
 				rc = t4_alloc_irq(sc, irq, rid, t4_intr,
 				    ofld_rxq, s);
 				if (rc != 0)
 					return (rc);
 				irq++;
 				rid++;
 				vi->nintr++;
 			}
 #endif
 		}
 	}
 	/* Every interrupt that was counted must have been set up. */
 	MPASS(irq == &sc->irq[sc->intr_count]);
 
 	return (0);
 }
 
 /*
  * Hands the kernel's global RSS key to the adapter.  The key is fetched with
  * rss_getkey, its 32-bit words are written out in reverse order with each
  * word passed through htobe32, and the result goes to t4_write_rss_key.
  *
  * The function body is empty when the kernel is built without RSS.
  */
 static void
 write_global_rss_key(struct adapter *sc)
 {
 #ifdef RSS
 	uint32_t kern_key[RSS_KEYSIZE / sizeof(uint32_t)];
 	uint32_t hw_key[RSS_KEYSIZE / sizeof(uint32_t)];
 	int src, dst;
 
 	CTASSERT(RSS_KEYSIZE == 40);
 
 	rss_getkey((void *)kern_key);
 
 	/* Last word of the kernel's key becomes the first word written. */
 	for (dst = 0, src = nitems(hw_key) - 1; src >= 0; dst++, src--)
 		hw_key[dst] = htobe32(kern_key[src]);
 
 	t4_write_rss_key(sc, hw_key, -1, 1);
 #endif
 }
 
 /*
  * Adapter-wide part of initialization: the adapter's sysctl context, the
  * queues that belong to the adapter rather than to any particular port, and
  * the driver's taskqueues.  Unless the adapter is a VF the global RSS key is
  * written and interrupts are enabled as well.  A sysctl context or taskqueue
  * that already exists is left alone.
  *
  * Returns 0 on success or an error number.  Nothing is rolled back here when
  * a step fails.
  *
  * Idempotent.
  */
 static int
 adapter_full_init(struct adapter *sc)
 {
 	int rc, i;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if ((sc->flags & ADAP_SYSCTL_CTX) == 0) {
 		sysctl_ctx_init(&sc->ctx);
 		sc->flags |= ADAP_SYSCTL_CTX;
 	}
 
 	/* Queues owned by the adapter itself, not by a port. */
 	rc = t4_setup_adapter_queues(sc);
 	if (rc != 0)
 		return (rc);
 
 	for (i = 0; i < nitems(sc->tq); i++) {
 		if (sc->tq[i] == NULL) {
 			sc->tq[i] = taskqueue_create("t4 taskq", M_NOWAIT,
 			    taskqueue_thread_enqueue, &sc->tq[i]);
 			if (sc->tq[i] == NULL) {
 				CH_ERR(sc,
 				    "failed to allocate task queue %d\n", i);
 				return (ENOMEM);
 			}
 			taskqueue_start_threads(&sc->tq[i], 1, PI_NET,
 			    "%s tq%d", device_get_nameunit(sc->dev), i);
 		}
 	}
 
 	if ((sc->flags & IS_VF) == 0) {
 		write_global_rss_key(sc);
 		t4_intr_enable(sc);
 	}
 #ifdef KERN_TLS
 	if (is_ktls(sc)) {
 		callout_reset_sbt(&sc->ktls_tick, SBT_1MS, 0, ktls_tick, sc,
 		    C_HARDCLOCK);
 	}
 #endif
 	return (0);
 }
 
 int
 adapter_init(struct adapter *sc)
 {
 	int rc;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 	KASSERT((sc->flags & FULL_INIT_DONE) == 0,
 	    ("%s: FULL_INIT_DONE already", __func__));
 
 	rc = adapter_full_init(sc);
 	if (rc != 0)
 		adapter_full_uninit(sc);
 	else
 		sc->flags |= FULL_INIT_DONE;
 
 	return (rc);
 }
 
 /*
  * Reverses adapter_full_init: releases the adapter's sysctl context, tears
  * down the adapter's queues, frees the taskqueues, and clears
  * FULL_INIT_DONE.
  *
  * Idempotent.
  */
 static void
 adapter_full_uninit(struct adapter *sc)
 {
 	int i;
 
 	/* The sysctl context must go before the adapter queues are freed. */
 	if ((sc->flags & ADAP_SYSCTL_CTX) != 0) {
 		sysctl_ctx_free(&sc->ctx);
 		sc->flags &= ~ADAP_SYSCTL_CTX;
 	}
 
 	t4_teardown_adapter_queues(sc);
 
 	/* Stops at the first empty slot. */
 	for (i = 0; i < nitems(sc->tq); i++) {
 		if (sc->tq[i] == NULL)
 			break;
 		taskqueue_free(sc->tq[i]);
 		sc->tq[i] = NULL;
 	}
 
 	sc->flags &= ~FULL_INIT_DONE;
 }
 
 #ifdef RSS
 /*
  * The kernel RSS hash types that the translation routines below know how to
  * map to and from hardware hash-enable bits: 2-tuple IPv4/IPv6 and 4-tuple
  * TCP and UDP over IPv4/IPv6.
  */
 #define SUPPORTED_RSS_HASHTYPES (RSS_HASHTYPE_RSS_IPV4 | \
     RSS_HASHTYPE_RSS_TCP_IPV4 | RSS_HASHTYPE_RSS_IPV6 | \
     RSS_HASHTYPE_RSS_TCP_IPV6 | RSS_HASHTYPE_RSS_UDP_IPV4 | \
     RSS_HASHTYPE_RSS_UDP_IPV6)
 
 /*
  * Translates kernel hash types to hardware.
  *
  * The hardware has one 4-tuple enable per IP version and a single UDP
  * enable, so requesting either TCP or UDP hashing for an IP version turns on
  * that version's 4-tuple bit, and requesting UDP hashing for either IP
  * version turns on the UDP bit.
  */
 static int
 hashconfig_to_hashen(int hashconfig)
 {
 	const int any_udp = RSS_HASHTYPE_RSS_UDP_IPV4 |
 	    RSS_HASHTYPE_RSS_UDP_IPV6;
 	const int l4_ipv4 = RSS_HASHTYPE_RSS_TCP_IPV4 |
 	    RSS_HASHTYPE_RSS_UDP_IPV4;
 	const int l4_ipv6 = RSS_HASHTYPE_RSS_TCP_IPV6 |
 	    RSS_HASHTYPE_RSS_UDP_IPV6;
 	int hw = 0;
 
 	/* 2-tuple hashes map one to one. */
 	if ((hashconfig & RSS_HASHTYPE_RSS_IPV4) != 0)
 		hw |= F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN;
 	if ((hashconfig & RSS_HASHTYPE_RSS_IPV6) != 0)
 		hw |= F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN;
 
 	/* 4-tuple enables are shared by TCP and UDP. */
 	if ((hashconfig & l4_ipv4) != 0)
 		hw |= F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN;
 	if ((hashconfig & l4_ipv6) != 0)
 		hw |= F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN;
 
 	/* One UDP enable covers both IP versions. */
 	if ((hashconfig & any_udp) != 0)
 		hw |= F_FW_RSS_VI_CONFIG_CMD_UDPEN;
 
 	return (hw);
 }
 
 /*
  * Translates hardware hash types to kernel.
  *
  * A 4-tuple enable always reports TCP hashing for its IP version, and also
  * UDP hashing for that version when the UDP enable is set.
  */
 static int
 hashen_to_hashconfig(int hashen)
 {
 	const int udp = (hashen & F_FW_RSS_VI_CONFIG_CMD_UDPEN) != 0;
 	int kern = 0;
 
 	if (udp) {
 		/*
 		 * If UDP hashing was enabled it must have been enabled for
 		 * either IPv4 or IPv6 (inclusive or).  Enabling UDP without
 		 * enabling any 4-tuple hash is nonsense configuration.
 		 */
 		MPASS(hashen & (F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN |
 		    F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN));
 	}
 
 	if ((hashen & F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN) != 0) {
 		kern |= RSS_HASHTYPE_RSS_TCP_IPV4;
 		if (udp)
 			kern |= RSS_HASHTYPE_RSS_UDP_IPV4;
 	}
 	if ((hashen & F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN) != 0) {
 		kern |= RSS_HASHTYPE_RSS_TCP_IPV6;
 		if (udp)
 			kern |= RSS_HASHTYPE_RSS_UDP_IPV6;
 	}
 	if ((hashen & F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN) != 0)
 		kern |= RSS_HASHTYPE_RSS_IPV4;
 	if ((hashen & F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN) != 0)
 		kern |= RSS_HASHTYPE_RSS_IPV6;
 
 	return (kern);
 }
 #endif
 
 /*
  * Full initialization of a VI: its sysctl context, its queues, the RSS
  * indirection table (a copy is kept in vi->rss), and the RSS hash
  * configuration.
  *
  * Returns 0 on success or a non-zero error.  Nothing is undone here when a
  * step fails; vi_init calls vi_full_uninit in that case.
  *
  * Idempotent.
  */
 static int
 vi_full_init(struct vi_info *vi)
 {
 	struct adapter *sc = vi->adapter;
 	struct sge_rxq *rxq;
 	int rc, i, j;
 #ifdef RSS
 	int nbuckets = rss_getnumbuckets();
 	int hashconfig = rss_gethashconfig();
 	int extra;
 #endif
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (!(vi->flags & VI_SYSCTL_CTX)) {
 		sysctl_ctx_init(&vi->ctx);
 		vi->flags |= VI_SYSCTL_CTX;
 	}
 
 	/*
 	 * Allocate tx/rx/fl queues for this VI.
 	 */
 	rc = t4_setup_vi_queues(vi);
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * Setup RSS for this VI.  Save a copy of the RSS table for later use.
 	 */
 	if (vi->nrxq > vi->rss_size) {
 		CH_ALERT(vi, "nrxq (%d) > hw RSS table size (%d); "
 		    "some queues will never receive traffic.\n", vi->nrxq,
 		    vi->rss_size);
 	} else if (vi->rss_size % vi->nrxq) {
 		CH_ALERT(vi, "nrxq (%d), hw RSS table size (%d); "
 		    "expect uneven traffic distribution.\n", vi->nrxq,
 		    vi->rss_size);
 	}
 #ifdef RSS
 	if (vi->nrxq != nbuckets) {
 		CH_ALERT(vi, "nrxq (%d) != kernel RSS buckets (%d); "
 		    "performance will be impacted.\n", vi->nrxq, nbuckets);
 	}
 #endif
 	/* The table is allocated only if there isn't one already. */
 	if (vi->rss == NULL)
 		vi->rss = malloc(vi->rss_size * sizeof (*vi->rss), M_CXGBE,
 		    M_ZERO | M_WAITOK);
 
 	/*
 	 * Fill every slot of the table with the absolute id of an rx queue.
 	 * i is advanced inside the loop body.  With RSS the queue for a slot
 	 * comes from the kernel's indirection-to-bucket mapping (modulo
 	 * nrxq); without RSS the VI's rx queues are used in order, over and
 	 * over, until the table is full.
 	 */
 	for (i = 0; i < vi->rss_size;) {
 #ifdef RSS
 		j = rss_get_indirection_to_bucket(i);
 		j %= vi->nrxq;
 		rxq = &sc->sge.rxq[vi->first_rxq + j];
 		vi->rss[i++] = rxq->iq.abs_id;
 #else
 		for_each_rxq(vi, j, rxq) {
 			vi->rss[i++] = rxq->iq.abs_id;
 			if (i == vi->rss_size)
 				break;
 		}
 #endif
 	}
 
 	rc = -t4_config_rss_range(sc, sc->mbox, vi->viid, 0, vi->rss_size,
 	    vi->rss, vi->rss_size);
 	if (rc != 0) {
 		CH_ERR(vi, "rss_config failed: %d\n", rc);
 		return (rc);
 	}
 
 #ifdef RSS
 	vi->hashen = hashconfig_to_hashen(hashconfig);
 
 	/*
 	 * We may have had to enable some hashes even though the global config
 	 * wants them disabled.  This is a potential problem that must be
 	 * reported to the user.
 	 */
 	extra = hashen_to_hashconfig(vi->hashen) ^ hashconfig;
 
 	/*
 	 * If we consider only the supported hash types, then the enabled hashes
 	 * are a superset of the requested hashes.  In other words, there cannot
 	 * be any supported hash that was requested but not enabled, but there
 	 * can be hashes that were not requested but had to be enabled.
 	 */
 	extra &= SUPPORTED_RSS_HASHTYPES;
 	MPASS((extra & hashconfig) == 0);
 
 	if (extra) {
 		CH_ALERT(vi,
 		    "global RSS config (0x%x) cannot be accommodated.\n",
 		    hashconfig);
 	}
 	if (extra & RSS_HASHTYPE_RSS_IPV4)
 		CH_ALERT(vi, "IPv4 2-tuple hashing forced on.\n");
 	if (extra & RSS_HASHTYPE_RSS_TCP_IPV4)
 		CH_ALERT(vi, "TCP/IPv4 4-tuple hashing forced on.\n");
 	if (extra & RSS_HASHTYPE_RSS_IPV6)
 		CH_ALERT(vi, "IPv6 2-tuple hashing forced on.\n");
 	if (extra & RSS_HASHTYPE_RSS_TCP_IPV6)
 		CH_ALERT(vi, "TCP/IPv6 4-tuple hashing forced on.\n");
 	if (extra & RSS_HASHTYPE_RSS_UDP_IPV4)
 		CH_ALERT(vi, "UDP/IPv4 4-tuple hashing forced on.\n");
 	if (extra & RSS_HASHTYPE_RSS_UDP_IPV6)
 		CH_ALERT(vi, "UDP/IPv6 4-tuple hashing forced on.\n");
 #else
 	/* No kernel RSS: enable every hash the hardware offers. */
 	vi->hashen = F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN |
 	    F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN |
 	    F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN |
 	    F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN | F_FW_RSS_VI_CONFIG_CMD_UDPEN;
 #endif
 	/*
 	 * The first entry of the table is passed along with the hash enables;
 	 * going by the error message below it presumably serves as the default
 	 * queue -- confirm against t4_config_vi_rss.
 	 */
 	rc = -t4_config_vi_rss(sc, sc->mbox, vi->viid, vi->hashen, vi->rss[0],
 	    0, 0);
 	if (rc != 0) {
 		CH_ERR(vi, "rss hash/defaultq config failed: %d\n", rc);
 		return (rc);
 	}
 
 	return (0);
 }
 
 int
 vi_init(struct vi_info *vi)
 {
 	int rc;
 
 	ASSERT_SYNCHRONIZED_OP(vi->adapter);
 	KASSERT((vi->flags & VI_INIT_DONE) == 0,
 	    ("%s: VI_INIT_DONE already", __func__));
 
 	rc = vi_full_init(vi);
 	if (rc != 0)
 		vi_full_uninit(vi);
 	else
 		vi->flags |= VI_INIT_DONE;
 
 	return (rc);
 }
 
 /*
  * Idempotent.
  */
 static void
 vi_full_uninit(struct vi_info *vi)
 {
 
 	if (vi->flags & VI_INIT_DONE) {
 		quiesce_vi(vi);
 		free(vi->rss, M_CXGBE);
 		free(vi->nm_rss, M_CXGBE);
 	}
 
 	/* Do this before freeing the VI queues. */
 	if (vi->flags & VI_SYSCTL_CTX) {
 		sysctl_ctx_free(&vi->ctx);
 		vi->flags &= ~VI_SYSCTL_CTX;
 	}
 
 	t4_teardown_vi_queues(vi);
 	vi->flags &= ~VI_INIT_DONE;
 }
 
 /*
  * Waits for a tx queue to go completely idle.  The queue must have its
  * software state allocated and must already be disabled (both are asserted).
  *
  * The mp_ring is drained first.  After that, if the hardware queue exists
  * the function sleeps until the hardware and then the driver have caught up
  * with the producer index; otherwise every outstanding descriptor is
  * reclaimed here and its mbufs are freed.
  */
 static void
 quiesce_txq(struct sge_txq *txq)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
 	struct tx_sdesc *txsd;
 	struct mbuf *m, *nextpkt;
 
 	MPASS(eq->flags & EQ_SW_ALLOCATED);
 	MPASS(!(eq->flags & EQ_ENABLED));
 
 	/* Wait for the mp_ring to empty. */
 	for (;;) {
 		if (mp_ring_is_idle(txq->r))
 			break;
 		mp_ring_check_drainage(txq->r, 4096);
 		pause("rquiesce", 1);
 	}
 	MPASS(txq->txp.npkt == 0);
 
 	if (!(eq->flags & EQ_HW_ALLOCATED)) {
 		/*
 		 * Hardware is unavailable.  Discard all pending tx and reclaim
 		 * descriptors directly.
 		 */
 		TXQ_LOCK(txq);
 		while (eq->cidx != eq->pidx) {
 			txsd = &txq->sdesc[eq->cidx];
 
 			/* Free each packet chained off this descriptor. */
 			m = txsd->m;
 			while (m != NULL) {
 				nextpkt = m->m_nextpkt;
 				m->m_nextpkt = NULL;
 				m_freem(m);
 				m = nextpkt;
 			}
 			IDXINCR(eq->cidx, txsd->desc_used, eq->sidx);
 		}
 		spg->pidx = spg->cidx = htobe16(eq->cidx);
 		TXQ_UNLOCK(txq);
 		return;
 	}
 
 	/*
 	 * Hardware is alive and working normally.  Wait for it to finish and
 	 * then wait for the driver to catch up and reclaim all descriptors.
 	 */
 	while (spg->cidx != htobe16(eq->pidx))
 		pause("equiesce", 1);
 	while (eq->cidx != eq->pidx)
 		pause("dquiesce", 1);
 }
 
 /*
  * Placeholder for quiescing a work request queue.  It currently does
  * nothing; quiesce_vi calls it for each of a VI's offload tx queues.
  */
 static void
 quiesce_wrq(struct sge_wrq *wrq)
 {
 
 	/* XXXTX */
 }
 
 /*
  * Disables an ingress queue and, if one is supplied, its free list.
  *
  * The queue's state is moved from IQS_IDLE to IQS_DISABLED, sleeping a tick
  * at a time until that swap succeeds.  The free list (fl may be NULL) is
  * marked FL_DOOMED with both sc->sfl_lock and the free list's own lock held,
  * and sc->sfl_callout is stopped.  The free list's buffers are released here
  * only when the queue is not marked IQ_HW_ALLOCATED.
  */
 static void
 quiesce_iq_fl(struct adapter *sc, struct sge_iq *iq, struct sge_fl *fl)
 {
 	/* Synchronize with the interrupt handler */
 	for (;;) {
 		if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_DISABLED))
 			break;
 		pause("iqfree", 1);
 	}
 
 	if (fl == NULL)
 		return;
 
 	MPASS(iq->flags & IQ_HAS_FL);
 
 	mtx_lock(&sc->sfl_lock);
 	FL_LOCK(fl);
 	fl->flags |= FL_DOOMED;
 	FL_UNLOCK(fl);
 	callout_stop(&sc->sfl_callout);
 	mtx_unlock(&sc->sfl_lock);
 
 	KASSERT((fl->flags & FL_STARVING) == 0,
 	    ("%s: still starving", __func__));
 
 	/* Release all buffers if hardware is no longer available. */
 	if ((iq->flags & IQ_HW_ALLOCATED) == 0)
 		free_fl_buffers(sc, fl);
 }
 
 /*
  * Wait for all activity on all the queues of the VI to complete.  It is assumed
  * that no new work is being enqueued by the hardware or the driver.  That part
  * should be arranged before calling this function.
  */
 static void
 quiesce_vi(struct vi_info *vi)
 {
 	int i;
 	struct adapter *sc = vi->adapter;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	struct sge_ofld_txq *ofld_txq;
 #endif
 
 	if (!(vi->flags & VI_INIT_DONE))
 		return;
 
 	for_each_txq(vi, i, txq) {
 		quiesce_txq(txq);
 	}
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	for_each_ofld_txq(vi, i, ofld_txq) {
 		quiesce_wrq(&ofld_txq->wrq);
 	}
 #endif
 
 	for_each_rxq(vi, i, rxq) {
 		quiesce_iq_fl(sc, &rxq->iq, &rxq->fl);
 	}
 
 #ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		quiesce_iq_fl(sc, &ofld_rxq->iq, &ofld_rxq->fl);
 	}
 #endif
 }
 
 /*
  * Allocates the IRQ resource for the given rid, installs handler (called
  * with arg) on it, and labels the interrupt with name when name is not NULL.
  *
  * Returns 0 on success, ENOMEM if the resource could not be allocated, or
  * the error from bus_setup_intr.  In the last case irq->res is not released
  * here.
  */
 static int
 t4_alloc_irq(struct adapter *sc, struct irq *irq, int rid,
     driver_intr_t *handler, void *arg, char *name)
 {
 	int error;
 
 	irq->rid = rid;
 	irq->res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &irq->rid,
 	    RF_SHAREABLE | RF_ACTIVE);
 	if (irq->res == NULL) {
 		device_printf(sc->dev,
 		    "failed to allocate IRQ for rid %d, name %s.\n", rid, name);
 		return (ENOMEM);
 	}
 
 	error = bus_setup_intr(sc->dev, irq->res,
 	    INTR_MPSAFE | INTR_TYPE_NET, NULL, handler, arg, &irq->tag);
 	if (error != 0) {
 		device_printf(sc->dev,
 		    "failed to setup interrupt for rid %d, name %s: %d\n",
 		    rid, name, error);
 		return (error);
 	}
 
 	if (name != NULL)
 		bus_describe_intr(sc->dev, irq->res, irq->tag, "%s", name);
 
 	return (0);
 }
 
 static int
 t4_free_irq(struct adapter *sc, struct irq *irq)
 {
 	if (irq->tag)
 		bus_teardown_intr(sc->dev, irq->res, irq->tag);
 	if (irq->res)
 		bus_release_resource(sc->dev, SYS_RES_IRQ, irq->rid, irq->res);
 
 	bzero(irq, sizeof(*irq));
 
 	return (0);
 }
 
 /*
  * Fills in a register dump: regs->version gets the chip id combined with
  * the chip revision shifted left by 10 bits, and regs->len bytes of
  * register contents are read into buf.
  */
 static void
 get_regs(struct adapter *sc, struct t4_regdump *regs, uint8_t *buf)
 {
 
 	regs->version = chip_id(sc) | (chip_rev(sc) << 10);
 	t4_get_regs(sc, buf, regs->len);
 }
 
 /*
  * Indirect register access: a command register that takes an auto-increment
  * flag, a VF id, and a register address, and a data register through which
  * the selected register is then read or written.
  *
  * The V_ macros cast to uint32_t before shifting so that a plain int
  * argument, as in V_PL_AUTOINC(1), is never shifted into the sign bit of a
  * signed int.
  */
 #define	A_PL_INDIR_CMD	0x1f8
 
 #define	S_PL_AUTOINC	31
 #define	M_PL_AUTOINC	0x1U
 #define	V_PL_AUTOINC(x)	((uint32_t)(x) << S_PL_AUTOINC)
 #define	G_PL_AUTOINC(x)	(((x) >> S_PL_AUTOINC) & M_PL_AUTOINC)
 
 #define	S_PL_VFID	20
 #define	M_PL_VFID	0xffU
 #define	V_PL_VFID(x)	((uint32_t)(x) << S_PL_VFID)
 #define	G_PL_VFID(x)	(((x) >> S_PL_VFID) & M_PL_VFID)
 
 #define	S_PL_ADDR	0
 #define	M_PL_ADDR	0xfffffU
 #define	V_PL_ADDR(x)	((uint32_t)(x) << S_PL_ADDR)
 #define	G_PL_ADDR(x)	(((x) >> S_PL_ADDR) & M_PL_ADDR)
 
 #define	A_PL_INDIR_DATA	0x1fc
 
 /*
  * Reads one 64-bit VF statistic as two 32-bit register reads, low half
  * first.  reg is the register holding the low half.
  *
  * A VF reads the two registers directly.  Otherwise the statistic of VF
  * 'vin' is selected through the indirect command register with
  * auto-increment set and both halves are read from the indirect data
  * register; sc->reg_lock must be held in that case (asserted).
  */
 static uint64_t
 read_vf_stat(struct adapter *sc, u_int vin, int reg)
 {
 	uint32_t lo, hi;
 
 	if ((sc->flags & IS_VF) != 0) {
 		lo = t4_read_reg(sc, VF_MPS_REG(reg));
 		hi = t4_read_reg(sc, VF_MPS_REG(reg + 4));
 		return (((uint64_t)hi << 32) | lo);
 	}
 
 	mtx_assert(&sc->reg_lock, MA_OWNED);
 	t4_write_reg(sc, A_PL_INDIR_CMD,
 	    V_PL_AUTOINC(1) | V_PL_VFID(vin) | V_PL_ADDR(VF_MPS_REG(reg)));
 	lo = t4_read_reg(sc, A_PL_INDIR_DATA);
 	hi = t4_read_reg(sc, A_PL_INDIR_DATA);
 
 	return (((uint64_t)hi << 32) | lo);
 }
 
 /*
  * Reads all tx and rx statistics of the VI identified by 'vin' into *stats.
  * Unless the adapter is a VF, sc->reg_lock is held around the reads.
  */
 static void
 t4_get_vi_stats(struct adapter *sc, u_int vin, struct fw_vi_stats_vf *stats)
 {
 	const int need_lock = (sc->flags & IS_VF) == 0;
 
 #define READ_VF_STAT(field, NAME) \
 	(stats->field = read_vf_stat(sc, vin, A_MPS_VF_STAT_##NAME##_L))
 
 	if (need_lock)
 		mtx_lock(&sc->reg_lock);
 
 	READ_VF_STAT(tx_bcast_bytes, TX_VF_BCAST_BYTES);
 	READ_VF_STAT(tx_bcast_frames, TX_VF_BCAST_FRAMES);
 	READ_VF_STAT(tx_mcast_bytes, TX_VF_MCAST_BYTES);
 	READ_VF_STAT(tx_mcast_frames, TX_VF_MCAST_FRAMES);
 	READ_VF_STAT(tx_ucast_bytes, TX_VF_UCAST_BYTES);
 	READ_VF_STAT(tx_ucast_frames, TX_VF_UCAST_FRAMES);
 	READ_VF_STAT(tx_drop_frames, TX_VF_DROP_FRAMES);
 	READ_VF_STAT(tx_offload_bytes, TX_VF_OFFLOAD_BYTES);
 	READ_VF_STAT(tx_offload_frames, TX_VF_OFFLOAD_FRAMES);
 	READ_VF_STAT(rx_bcast_bytes, RX_VF_BCAST_BYTES);
 	READ_VF_STAT(rx_bcast_frames, RX_VF_BCAST_FRAMES);
 	READ_VF_STAT(rx_mcast_bytes, RX_VF_MCAST_BYTES);
 	READ_VF_STAT(rx_mcast_frames, RX_VF_MCAST_FRAMES);
 	READ_VF_STAT(rx_ucast_bytes, RX_VF_UCAST_BYTES);
 	READ_VF_STAT(rx_ucast_frames, RX_VF_UCAST_FRAMES);
 	READ_VF_STAT(rx_err_frames, RX_VF_ERR_FRAMES);
 
 	if (need_lock)
 		mtx_unlock(&sc->reg_lock);
 
 #undef READ_VF_STAT
 }
 
 /*
  * Zeroes the statistics registers of the VI identified by 'vin'.  The first
  * statistic is selected through the indirect command register with
  * auto-increment set, and then one zero is written to the indirect data
  * register for every 32-bit register from TX_VF_BCAST_BYTES_L through
  * RX_VF_ERR_FRAMES_H.
  *
  * No lock is taken here.  NOTE(review): read_vf_stat asserts sc->reg_lock
  * around the same indirect registers; presumably callers hold it here too --
  * confirm.
  */
 static void
 t4_clr_vi_stats(struct adapter *sc, u_int vin)
 {
 	int reg;
 
 	t4_write_reg(sc, A_PL_INDIR_CMD,
 	    V_PL_AUTOINC(1) | V_PL_VFID(vin) |
 	    V_PL_ADDR(VF_MPS_REG(A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L)));
 
 	reg = A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L;
 	while (reg <= A_MPS_VF_STAT_RX_VF_ERR_FRAMES_H) {
 		t4_write_reg(sc, A_PL_INDIR_DATA, 0);
 		reg += 4;
 	}
 }
 
 static void
 vi_refresh_stats(struct vi_info *vi)
 {
 	struct timeval tv;
 	const struct timeval interval = {0, 250000};	/* 250ms */
 
 	mtx_assert(&vi->tick_mtx, MA_OWNED);
 
 	if (!(vi->flags & VI_INIT_DONE) || vi->flags & VI_SKIP_STATS)
 		return;
 
 	getmicrotime(&tv);
 	timevalsub(&tv, &interval);
 	if (timevalcmp(&tv, &vi->last_refreshed, <))
 		return;
 
 	t4_get_vi_stats(vi->adapter, vi->vin, &vi->stats);
 	getmicrotime(&vi->last_refreshed);
 }
 
 static void
 cxgbe_refresh_stats(struct vi_info *vi)
 {
 	u_int i, v, tnl_cong_drops, chan_map;
 	struct timeval tv;
 	const struct timeval interval = {0, 250000};	/* 250ms */
 	struct port_info *pi;
 	struct adapter *sc;
 
 	mtx_assert(&vi->tick_mtx, MA_OWNED);
 
 	if (vi->flags & VI_SKIP_STATS)
 		return;
 
 	getmicrotime(&tv);
 	timevalsub(&tv, &interval);
 	if (timevalcmp(&tv, &vi->last_refreshed, <))
 		return;
 
 	pi = vi->pi;
 	sc = vi->adapter;
 	tnl_cong_drops = 0;
 	t4_get_port_stats(sc, pi->tx_chan, &pi->stats);
 	chan_map = pi->rx_e_chan_map;
 	while (chan_map) {
 		i = ffs(chan_map) - 1;
 		mtx_lock(&sc->reg_lock);
 		t4_read_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v, 1,
 		    A_TP_MIB_TNL_CNG_DROP_0 + i);
 		mtx_unlock(&sc->reg_lock);
 		tnl_cong_drops += v;
 		chan_map &= ~(1 << i);
 	}
 	pi->tnl_cong_drops = tnl_cong_drops;
 	getmicrotime(&vi->last_refreshed);
 }
 
 /*
  * Callout handler for a port's main VI (asserted): refreshes the port's
  * statistics and reschedules itself hz ticks later.  Runs with vi->tick_mtx
  * held.
  */
 static void
 cxgbe_tick(void *arg)
 {
 	struct vi_info *vi;
 
 	vi = arg;
 	MPASS(IS_MAIN_VI(vi));
 	mtx_assert(&vi->tick_mtx, MA_OWNED);
 
 	cxgbe_refresh_stats(vi);
 
 	callout_schedule(&vi->tick, hz);
 }
 
 /*
  * Callout handler for a VI: refreshes the VI's statistics and reschedules
  * itself hz ticks later.  Runs with vi->tick_mtx held.
  */
 static void
 vi_tick(void *arg)
 {
 	struct vi_info *vi;
 
 	vi = arg;
 	mtx_assert(&vi->tick_mtx, MA_OWNED);
 
 	vi_refresh_stats(vi);
 
 	callout_schedule(&vi->tick, hz);
 }
 
 /*
  * Should match fw_caps_config_<foo> enums in t4fw_interface.h
  *
  * One decoder string per capability class, indexed by the number in the
  * comment next to each entry; the SYSCTL_CAP macro in t4_sysctls passes
  * caps_decoder[n] to sysctl_bitfield_16b.  Every string begins with \20
  * (octal for 16) and continues with <bit number><name> pairs in which the
  * bit number is a single octal-escaped byte counting from 1.
  * NOTE(review): this looks like the kernel printf %b format -- confirm
  * against sysctl_bitfield_16b.
  */
 static char *caps_decoder[] = {
 	"\20\001IPMI\002NCSI",				/* 0: NBM */
 	"\20\001PPP\002QFC\003DCBX",			/* 1: link */
 	"\20\001INGRESS\002EGRESS",			/* 2: switch */
 	"\20\001NIC\002VM\003IDS\004UM\005UM_ISGL"	/* 3: NIC */
 	    "\006HASHFILTER\007ETHOFLD",
 	"\20\001TOE",					/* 4: TOE */
 	"\20\001RDDP\002RDMAC",				/* 5: RDMA */
 	"\20\001INITIATOR_PDU\002TARGET_PDU"		/* 6: iSCSI */
 	    "\003INITIATOR_CNXOFLD\004TARGET_CNXOFLD"
 	    "\005INITIATOR_SSNOFLD\006TARGET_SSNOFLD"
 	    "\007T10DIF"
 	    "\010INITIATOR_CMDOFLD\011TARGET_CMDOFLD",
 	"\20\001LOOKASIDE\002TLSKEYS\003IPSEC_INLINE"	/* 7: Crypto */
 	    "\004TLS_HW",
 	"\20\001INITIATOR\002TARGET\003CTRL_OFLD"	/* 8: FCoE */
 		    "\004PO_INITIATOR\005PO_TARGET",
 };
 
 void
 t4_sysctls(struct adapter *sc)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *children, *c0;
 	static char *doorbells = {"\20\1UDB\2WCWR\3UDBWC\4KDB"};
 
 	ctx = device_get_sysctl_ctx(sc->dev);
 
 	/*
 	 * dev.t4nex.X.
 	 */
 	oid = device_get_sysctl_tree(sc->dev);
 	c0 = children = SYSCTL_CHILDREN(oid);
 
 	sc->sc_do_rxcopy = 1;
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "do_rx_copy", CTLFLAG_RW,
 	    &sc->sc_do_rxcopy, 1, "Do RX copy of small frames");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nports", CTLFLAG_RD, NULL,
 	    sc->params.nports, "# of ports");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "doorbells",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, doorbells,
 	    (uintptr_t)&sc->doorbells, sysctl_bitfield_8b, "A",
 	    "available doorbells");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "core_clock", CTLFLAG_RD, NULL,
 	    sc->params.vpd.cclk, "core clock frequency (in KHz)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_timers",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	    sc->params.sge.timer_val, sizeof(sc->params.sge.timer_val),
 	    sysctl_int_array, "A", "interrupt holdoff timer values (us)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pkt_counts",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	    sc->params.sge.counter_val, sizeof(sc->params.sge.counter_val),
 	    sysctl_int_array, "A", "interrupt holdoff packet counter values");
 
 	t4_sge_sysctls(sc, ctx, children);
 
 	sc->lro_timeout = 100;
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lro_timeout", CTLFLAG_RW,
 	    &sc->lro_timeout, 0, "lro inactive-flush timeout (in us)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dflags", CTLFLAG_RW,
 	    &sc->debug_flags, 0, "flags to enable runtime debugging");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "tp_version",
 	    CTLFLAG_RD, sc->tp_version, 0, "TP microcode version");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
 	    CTLFLAG_RD, sc->fw_version, 0, "firmware version");
 
 	if (sc->flags & IS_VF)
 		return;
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "hw_revision", CTLFLAG_RD,
 	    NULL, chip_rev(sc), "chip hardware revision");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "sn",
 	    CTLFLAG_RD, sc->params.vpd.sn, 0, "serial number");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "pn",
 	    CTLFLAG_RD, sc->params.vpd.pn, 0, "part number");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "ec",
 	    CTLFLAG_RD, sc->params.vpd.ec, 0, "engineering change");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "md_version",
 	    CTLFLAG_RD, sc->params.vpd.md, 0, "manufacturing diags version");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "na",
 	    CTLFLAG_RD, sc->params.vpd.na, 0, "network address");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "er_version", CTLFLAG_RD,
 	    sc->er_version, 0, "expansion ROM version");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "bs_version", CTLFLAG_RD,
 	    sc->bs_version, 0, "bootstrap firmware version");
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "scfg_version", CTLFLAG_RD,
 	    NULL, sc->params.scfg_vers, "serial config version");
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "vpd_version", CTLFLAG_RD,
 	    NULL, sc->params.vpd_vers, "VPD version");
 
 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "cf",
 	    CTLFLAG_RD, sc->cfg_file, 0, "configuration file");
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cfcsum", CTLFLAG_RD, NULL,
 	    sc->cfcsum, "config file checksum");
 
 #define SYSCTL_CAP(name, n, text) \
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, #name, \
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, caps_decoder[n], \
 	    (uintptr_t)&sc->name, sysctl_bitfield_16b, "A", \
 	    "available " text " capabilities")
 
 	SYSCTL_CAP(nbmcaps, 0, "NBM");
 	SYSCTL_CAP(linkcaps, 1, "link");
 	SYSCTL_CAP(switchcaps, 2, "switch");
 	SYSCTL_CAP(niccaps, 3, "NIC");
 	SYSCTL_CAP(toecaps, 4, "TCP offload");
 	SYSCTL_CAP(rdmacaps, 5, "RDMA");
 	SYSCTL_CAP(iscsicaps, 6, "iSCSI");
 	SYSCTL_CAP(cryptocaps, 7, "crypto");
 	SYSCTL_CAP(fcoecaps, 8, "FCoE");
 #undef SYSCTL_CAP
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nfilters", CTLFLAG_RD,
 	    NULL, sc->tids.nftids, "number of filters");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_temperature, "I", "chip temperature (in Celsius)");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "reset_sensor",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_reset_sensor, "I", "reset the chip's temperature sensor.");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "loadavg",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_loadavg, "A",
 	    "microprocessor load averages (debug firmwares only)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "core_vdd",
 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_vdd,
 	    "I", "core Vdd (in mV)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "local_cpus",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, LOCAL_CPUS,
 	    sysctl_cpus, "A", "local CPUs");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_cpus",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, INTR_CPUS,
 	    sysctl_cpus, "A", "preferred CPUs for interrupts");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "swintr", CTLFLAG_RW,
 	    &sc->swintr, 0, "software triggered interrupts");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "reset",
 	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, sysctl_reset, "I",
 	    "1 = reset adapter, 0 = zero reset counter");
 
 	/*
 	 * dev.t4nex.X.misc.  Marked CTLFLAG_SKIP to avoid information overload.
 	 */
 	oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "misc",
 	    CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL,
 	    "logs and miscellaneous information");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cctrl",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_cctrl, "A", "congestion control");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp0",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 0 (TP0)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp1",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 1,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 1 (TP1)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ulp",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 2,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 2 (ULP)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge0",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 3,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 3 (SGE0)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge1",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 4,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 4 (SGE1)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ncsi",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 5,
 	    sysctl_cim_ibq_obq, "A", "CIM IBQ 5 (NCSI)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_la",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_cim_la, "A", "CIM logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ma_la",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_cim_ma_la, "A", "CIM MA logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp0",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 	    0 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 0 (ULP0)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp1",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 	    1 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 1 (ULP1)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp2",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 	    2 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 2 (ULP2)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp3",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 	    3 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 3 (ULP3)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 	    4 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 4 (SGE)");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ncsi",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 	    5 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 5 (NCSI)");
 
 	if (chip_id(sc) > CHELSIO_T4) {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge0_rx",
 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 		    6 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A",
 		    "CIM OBQ 6 (SGE0-RX)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge1_rx",
 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 		    7 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A",
 		    "CIM OBQ 7 (SGE1-RX)");
 	}
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_pif_la",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_cim_pif_la, "A", "CIM PIF logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_qcfg",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_cim_qcfg, "A", "CIM queue configuration");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cpl_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_cpl_stats, "A", "CPL statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ddp_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_ddp_stats, "A", "non-TCP DDP statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tid_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_tid_stats, "A", "tid stats");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "devlog",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_devlog, "A", "firmware's device log");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fcoe_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_fcoe_stats, "A", "FCoE statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "hw_sched",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_hw_sched, "A", "hardware scheduler ");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "l2t",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_l2t, "A", "hardware L2 table");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "smt",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_smt, "A", "hardware source MAC table");
 
 #ifdef INET6
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "clip",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_clip, "A", "active CLIP table entries");
 #endif
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "lb_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_lb_stats, "A", "loopback statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "meminfo",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_meminfo, "A", "memory regions");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "mps_tcam",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    chip_id(sc) <= CHELSIO_T5 ? sysctl_mps_tcam : sysctl_mps_tcam_t6,
 	    "A", "MPS TCAM entries");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "path_mtus",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_path_mtus, "A", "path MTUs");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pm_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_pm_stats, "A", "PM statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_rdma_stats, "A", "RDMA statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tcp_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_tcp_stats, "A", "TCP statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tids",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_tids, "A", "TID information");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_err_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_tp_err_stats, "A", "TP error statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tnl_stats",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_tnl_stats, "A", "TP tunnel statistics");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la_mask",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_tp_la_mask, "I", "TP logic analyzer event capture mask");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_tp_la, "A", "TP logic analyzer");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tx_rate",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_tx_rate, "A", "Tx rate");
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ulprx_la",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 	    sysctl_ulprx_la, "A", "ULPRX logic analyzer");
 
 	if (chip_id(sc) >= CHELSIO_T5) {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "wcwr_stats",
 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 		    sysctl_wcwr_stats, "A", "write combined work requests");
 	}
 
 #ifdef KERN_TLS
 	if (is_ktls(sc)) {
 		/*
 		 * dev.t4nex.0.tls.
 		 */
 		oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "tls",
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "KERN_TLS parameters");
 		children = SYSCTL_CHILDREN(oid);
 
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "inline_keys",
 		    CTLFLAG_RW, &sc->tlst.inline_keys, 0, "Always pass TLS "
 		    "keys in work requests (1) or attempt to store TLS keys "
 		    "in card memory.");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "combo_wrs",
 		    CTLFLAG_RW, &sc->tlst.combo_wrs, 0, "Attempt to combine "
 		    "TCB field updates with TLS record work requests.");
 	}
 #endif
 
 #ifdef TCP_OFFLOAD
 	if (is_offload(sc)) {
 		int i;
 		char s[4];
 
 		/*
 		 * dev.t4nex.X.toe.
 		 */
 		oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "toe",
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE parameters");
 		children = SYSCTL_CHILDREN(oid);
 
 		sc->tt.cong_algorithm = -1;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_algorithm",
 		    CTLFLAG_RW, &sc->tt.cong_algorithm, 0, "congestion control "
 		    "(-1 = default, 0 = reno, 1 = tahoe, 2 = newreno, "
 		    "3 = highspeed)");
 
 		sc->tt.sndbuf = -1;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sndbuf", CTLFLAG_RW,
 		    &sc->tt.sndbuf, 0, "hardware send buffer");
 
 		sc->tt.ddp = 0;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp",
 		    CTLFLAG_RW | CTLFLAG_SKIP, &sc->tt.ddp, 0, "");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_zcopy", CTLFLAG_RW,
 		    &sc->tt.ddp, 0, "Enable zero-copy aio_read(2)");
 
 		sc->tt.rx_coalesce = -1;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_coalesce",
 		    CTLFLAG_RW, &sc->tt.rx_coalesce, 0, "receive coalescing");
 
 		sc->tt.tls = 0;
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tls", CTLTYPE_INT |
 		    CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, sysctl_tls, "I",
 		    "Inline TLS allowed");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tls_rx_ports",
 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 		    sysctl_tls_rx_ports, "I",
 		    "TCP ports that use inline TLS+TOE RX");
 
 		sc->tt.tls_rx_timeout = t4_toe_tls_rx_timeout;
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tls_rx_timeout",
 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 		    sysctl_tls_rx_timeout, "I",
 		    "Timeout in seconds to downgrade TLS sockets to plain TOE");
 
 		sc->tt.tx_align = -1;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_align",
 		    CTLFLAG_RW, &sc->tt.tx_align, 0, "chop and align payload");
 
 		sc->tt.tx_zcopy = 0;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_zcopy",
 		    CTLFLAG_RW, &sc->tt.tx_zcopy, 0,
 		    "Enable zero-copy aio_write(2)");
 
 		sc->tt.cop_managed_offloading = !!t4_cop_managed_offloading;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
 		    "cop_managed_offloading", CTLFLAG_RW,
 		    &sc->tt.cop_managed_offloading, 0,
 		    "COP (Connection Offload Policy) controls all TOE offload");
 
 		sc->tt.autorcvbuf_inc = 16 * 1024;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "autorcvbuf_inc",
 		    CTLFLAG_RW, &sc->tt.autorcvbuf_inc, 0,
 		    "autorcvbuf increment");
 
 		sc->tt.update_hc_on_pmtu_change = 1;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
 		    "update_hc_on_pmtu_change", CTLFLAG_RW,
 		    &sc->tt.update_hc_on_pmtu_change, 0,
 		    "Update hostcache entry if the PMTU changes");
 
 		sc->tt.iso = 1;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "iso", CTLFLAG_RW,
 		    &sc->tt.iso, 0, "Enable iSCSI segmentation offload");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timer_tick",
 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 		    sysctl_tp_tick, "A", "TP timer tick (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timestamp_tick",
 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 1,
 		    sysctl_tp_tick, "A", "TCP timestamp tick (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_tick",
 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 2,
 		    sysctl_tp_tick, "A", "DACK tick (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_timer",
 		    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 		    sysctl_tp_dack_timer, "IU", "DACK timer (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_min",
 		    CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 		    A_TP_RXT_MIN, sysctl_tp_timer, "LU",
 		    "Minimum retransmit interval (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_max",
 		    CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 		    A_TP_RXT_MAX, sysctl_tp_timer, "LU",
 		    "Maximum retransmit interval (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_min",
 		    CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 		    A_TP_PERS_MIN, sysctl_tp_timer, "LU",
 		    "Persist timer min (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_max",
 		    CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 		    A_TP_PERS_MAX, sysctl_tp_timer, "LU",
 		    "Persist timer max (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_idle",
 		    CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 		    A_TP_KEEP_IDLE, sysctl_tp_timer, "LU",
 		    "Keepalive idle timer (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_interval",
 		    CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 		    A_TP_KEEP_INTVL, sysctl_tp_timer, "LU",
 		    "Keepalive interval timer (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "initial_srtt",
 		    CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 		    A_TP_INIT_SRTT, sysctl_tp_timer, "LU", "Initial SRTT (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "finwait2_timer",
 		    CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 		    A_TP_FINWAIT2_TIMER, sysctl_tp_timer, "LU",
 		    "FINWAIT2 timer (us)");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "syn_rexmt_count",
 		    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 		    S_SYNSHIFTMAX, sysctl_tp_shift_cnt, "IU",
 		    "Number of SYN retransmissions before abort");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_count",
 		    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 		    S_RXTSHIFTMAXR2, sysctl_tp_shift_cnt, "IU",
 		    "Number of retransmissions before abort");
 
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_count",
 		    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 		    S_KEEPALIVEMAXR2, sysctl_tp_shift_cnt, "IU",
 		    "Number of keepalive probes before abort");
 
 		oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "rexmt_backoff",
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
 		    "TOE retransmit backoffs");
 		children = SYSCTL_CHILDREN(oid);
 		for (i = 0; i < 16; i++) {
 			snprintf(s, sizeof(s), "%u", i);
 			SYSCTL_ADD_PROC(ctx, children, OID_AUTO, s,
 			    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 			    i, sysctl_tp_backoff, "IU",
 			    "TOE retransmit backoff");
 		}
 	}
 #endif
 }
 
 /*
  * Register the per-VI sysctl nodes under the sysctl tree of vi->dev.
  *
  * Queue counts and first-queue indices are exported read-only through
  * pointers to the vi_info fields.  viid and the RSS table base/size are
  * registered with a NULL pointer and the current field value as the constant
  * argument.  Settings that need validation or locking are routed through the
  * sysctl_* handlers, with the vi_info pointer passed as arg1.
  */
 void
 vi_sysctls(struct vi_info *vi)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *children;
 
 	ctx = device_get_sysctl_ctx(vi->dev);
 
 	/*
 	 * dev.v?(cxgbe|cxl).X.
 	 */
 	oid = device_get_sysctl_tree(vi->dev);
 	children = SYSCTL_CHILDREN(oid);
 
 	/* Read-only identification and NIC queue layout of this VI. */
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "viid", CTLFLAG_RD, NULL,
 	    vi->viid, "VI identifer");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nrxq", CTLFLAG_RD,
 	    &vi->nrxq, 0, "# of rx queues");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ntxq", CTLFLAG_RD,
 	    &vi->ntxq, 0, "# of tx queues");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_rxq", CTLFLAG_RD,
 	    &vi->first_rxq, 0, "index of first rx queue");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD,
 	    &vi->first_txq, 0, "index of first tx queue");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rss_base", CTLFLAG_RD, NULL,
 	    vi->rss_base, "start of RSS indirection table");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rss_size", CTLFLAG_RD, NULL,
 	    vi->rss_size, "size of RSS indirection table");
 
 	/* rsrv_noflowq is registered only when IS_MAIN_VI(vi) is true. */
 	if (IS_MAIN_VI(vi)) {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rsrv_noflowq",
 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0,
 		    sysctl_noflowq, "IU",
 		    "Reserve queue 0 for non-flowid packets");
 	}
 
 	/*
 	 * tx_vm_wr: on a VF the flag is asserted to be set and the node is a
 	 * read-only constant 1; otherwise it is writable via sysctl_tx_vm_wr.
 	 */
 	if (vi->adapter->flags & IS_VF) {
 		MPASS(vi->flags & TX_USES_VM_WR);
 		SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_vm_wr", CTLFLAG_RD,
 		    NULL, 1, "use VM work requests for transmit");
 	} else {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tx_vm_wr",
 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0,
 		    sysctl_tx_vm_wr, "I", "use VM work requestes for transmit");
 	}
 
 #ifdef TCP_OFFLOAD
 	/* Offload rx queue nodes exist only if the VI has such queues. */
 	if (vi->nofldrxq != 0) {
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD,
 		    &vi->nofldrxq, 0,
 		    "# of rx queues for offloaded TCP connections");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_rxq",
 		    CTLFLAG_RD, &vi->first_ofld_rxq, 0,
 		    "index of first TOE rx queue");
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx_ofld",
 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0,
 		    sysctl_holdoff_tmr_idx_ofld, "I",
 		    "holdoff timer index for TOE queues");
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx_ofld",
 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0,
 		    sysctl_holdoff_pktc_idx_ofld, "I",
 		    "holdoff packet counter index for TOE queues");
 	}
 #endif
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	/* Offload tx queue nodes exist only if the VI has such queues. */
 	if (vi->nofldtxq != 0) {
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldtxq", CTLFLAG_RD,
 		    &vi->nofldtxq, 0,
 		    "# of tx queues for TOE/ETHOFLD");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_txq",
 		    CTLFLAG_RD, &vi->first_ofld_txq, 0,
 		    "index of first TOE/ETHOFLD tx queue");
 	}
 #endif
 #ifdef DEV_NETMAP
 	/* All four netmap nodes are keyed off the netmap rx queue count. */
 	if (vi->nnmrxq != 0) {
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmrxq", CTLFLAG_RD,
 		    &vi->nnmrxq, 0, "# of netmap rx queues");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmtxq", CTLFLAG_RD,
 		    &vi->nnmtxq, 0, "# of netmap tx queues");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_rxq",
 		    CTLFLAG_RD, &vi->first_nm_rxq, 0,
 		    "index of first netmap rx queue");
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_txq",
 		    CTLFLAG_RD, &vi->first_nm_txq, 0,
 		    "index of first netmap tx queue");
 	}
 #endif
 
 	/* Interrupt holdoff settings for the NIC rx queues. */
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0,
 	    sysctl_holdoff_tmr_idx, "I", "holdoff timer index");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0,
 	    sysctl_holdoff_pktc_idx, "I", "holdoff packet counter index");
 
 	/* Queue sizes; the handlers refuse changes once VI_INIT_DONE is set. */
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_rxq",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0,
 	    sysctl_qsize_rxq, "I", "rx queue size");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_txq",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0,
 	    sysctl_qsize_txq, "I", "tx queue size");
 }
 
 /*
  * Register the per-port sysctl nodes under the sysctl tree of pi->dev.
  *
  * Link related knobs and capability words are registered first.  For a VF
  * (sc->flags & IS_VF) the function returns right after those.  Otherwise it
  * also creates the "tc" subtree (one child node per Tx scheduler class, up to
  * sc->params.nsched_cls) and the "stats" subtree with the port statistics.
  */
 static void
 cxgbe_sysctls(struct port_info *pi)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *children, *children2;
 	struct adapter *sc = pi->adapter;
 	int i;
 	char name[16];
 	/* Bit-name description handed to sysctl_bitfield_8b as arg1. */
 	static char *tc_flags = {"\20\1USER"};
 
 	ctx = device_get_sysctl_ctx(pi->dev);
 
 	/*
 	 * dev.cxgbe.X.
 	 */
 	oid = device_get_sysctl_tree(pi->dev);
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "linkdnrc",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, pi, 0,
 	    sysctl_linkdnrc, "A", "reason why link is down");
 	/*
 	 * BT PHY nodes share sysctl_btphy; arg2 selects the value to read
 	 * (0 = temperature, 1 = firmware version).
 	 */
 	if (pi->port_type == FW_PORT_TYPE_BT_XAUI) {
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature",
 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, pi, 0,
 		    sysctl_btphy, "I", "PHY temperature (in Celsius)");
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fw_version",
 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, pi, 1,
 		    sysctl_btphy, "I", "PHY firmware version");
 	}
 
 	/* Link configuration knobs, all handled with the port_info as arg1. */
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pause_settings",
 	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, pi, 0,
 	    sysctl_pause_settings, "A",
 	    "PAUSE settings (bit 0 = rx_pause, 1 = tx_pause, 2 = pause_autoneg)");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fec",
 	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, pi, 0,
 	    sysctl_fec, "A",
 	    "FECs to use (bit 0 = RS, 1 = FC, 2 = none, 5 = auto, 6 = module)");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "module_fec",
 	    CTLTYPE_STRING | CTLFLAG_MPSAFE, pi, 0, sysctl_module_fec, "A",
 	    "FEC recommended by the cable/transceiver");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "autoneg",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, pi, 0,
 	    sysctl_autoneg, "I",
 	    "autonegotiation (-1 = not supported)");
 
 	/* Capability words exported read-only straight from link_cfg. */
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcaps", CTLFLAG_RD,
 	    &pi->link_cfg.pcaps, 0, "port capabilities");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "acaps", CTLFLAG_RD,
 	    &pi->link_cfg.acaps, 0, "advertised capabilities");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lpacaps", CTLFLAG_RD,
 	    &pi->link_cfg.lpacaps, 0, "link partner advertised capabilities");
 
 	/* Constants: NULL pointer, current value passed as the argument. */
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "max_speed", CTLFLAG_RD, NULL,
 	    port_top_speed(pi), "max speed (in Gbps)");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "mps_bg_map", CTLFLAG_RD, NULL,
 	    pi->mps_bg_map, "MPS buffer group map");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_e_chan_map", CTLFLAG_RD,
 	    NULL, pi->rx_e_chan_map, "TP rx e-channel map");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_c_chan", CTLFLAG_RD, NULL,
 	    pi->rx_c_chan, "TP rx c-channel");
 
 	/* Nothing below this point is registered for a VF. */
 	if (sc->flags & IS_VF)
 		return;
 
 	/*
 	 * dev.(cxgbe|cxl).X.tc.
 	 */
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "tc",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
 	    "Tx scheduler traffic classes (cl_rl)");
 	children2 = SYSCTL_CHILDREN(oid);
 	SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "pktsize",
 	    CTLFLAG_RW, &pi->sched_params->pktsize, 0,
 	    "pktsize for per-flow cl-rl (0 means up to the driver )");
 	SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "burstsize",
 	    CTLFLAG_RW, &pi->sched_params->burstsize, 0,
 	    "burstsize for per-flow cl-rl (0 means up to the driver)");
 	/* One numbered child node under "tc" per scheduling class. */
 	for (i = 0; i < sc->params.nsched_cls; i++) {
 		struct tx_cl_rl_params *tc = &pi->sched_params->cl_rl[i];
 
 		snprintf(name, sizeof(name), "%d", i);
 		children2 = SYSCTL_CHILDREN(SYSCTL_ADD_NODE(ctx,
 		    SYSCTL_CHILDREN(oid), OID_AUTO, name,
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "traffic class"));
 		SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "state",
 		    CTLFLAG_RD, &tc->state, 0, "current state");
 		/* arg1 = bit names, arg2 = address of the 8-bit flags. */
 		SYSCTL_ADD_PROC(ctx, children2, OID_AUTO, "flags",
 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, tc_flags,
 		    (uintptr_t)&tc->flags, sysctl_bitfield_8b, "A", "flags");
 		SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "refcount",
 		    CTLFLAG_RD, &tc->refcount, 0, "references to this class");
 		/* arg2 = port id shifted left by 16, OR'ed with class index. */
 		SYSCTL_ADD_PROC(ctx, children2, OID_AUTO, "params",
 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
 		    (pi->port_id << 16) | i, sysctl_tc_params, "A",
 		    "traffic class parameters");
 	}
 
 	/*
 	 * dev.cxgbe.X.stats.
 	 */
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "stats",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "port statistics");
 	children = SYSCTL_CHILDREN(oid);
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_parse_error", CTLFLAG_RD,
 	    &pi->tx_parse_error, 0,
 	    "# of tx packets with invalid length or # of segments");
 
 /*
  * Statistic read from a 64-bit MPS port register by sysctl_handle_t4_reg64.
  * arg2 is the register address for pi->tx_chan, computed with PORT_REG on T4
  * and T5_PORT_REG on other chips.
  */
 #define T4_REGSTAT(name, stat, desc) \
     SYSCTL_ADD_OID(ctx, children, OID_AUTO, #name, \
         CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, \
 	(is_t4(sc) ? PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_##stat##_L) : \
 	T5_PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_##stat##_L)), \
         sysctl_handle_t4_reg64, "QU", desc)
 
 /* We get these from port_stats and they may be stale by up to 1s */
 #define T4_PORTSTAT(name, desc) \
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, #name, CTLFLAG_RD, \
 	    &pi->stats.name, desc)
 
 	/* Transmit counters. */
 	T4_REGSTAT(tx_octets, TX_PORT_BYTES, "# of octets in good frames");
 	T4_REGSTAT(tx_frames, TX_PORT_FRAMES, "total # of good frames");
 	T4_REGSTAT(tx_bcast_frames, TX_PORT_BCAST, "# of broadcast frames");
 	T4_REGSTAT(tx_mcast_frames, TX_PORT_MCAST, "# of multicast frames");
 	T4_REGSTAT(tx_ucast_frames, TX_PORT_UCAST, "# of unicast frames");
 	T4_REGSTAT(tx_error_frames, TX_PORT_ERROR, "# of error frames");
 	T4_REGSTAT(tx_frames_64, TX_PORT_64B, "# of tx frames in this range");
 	T4_REGSTAT(tx_frames_65_127, TX_PORT_65B_127B, "# of tx frames in this range");
 	T4_REGSTAT(tx_frames_128_255, TX_PORT_128B_255B, "# of tx frames in this range");
 	T4_REGSTAT(tx_frames_256_511, TX_PORT_256B_511B, "# of tx frames in this range");
 	T4_REGSTAT(tx_frames_512_1023, TX_PORT_512B_1023B, "# of tx frames in this range");
 	T4_REGSTAT(tx_frames_1024_1518, TX_PORT_1024B_1518B, "# of tx frames in this range");
 	T4_REGSTAT(tx_frames_1519_max, TX_PORT_1519B_MAX, "# of tx frames in this range");
 	T4_REGSTAT(tx_drop, TX_PORT_DROP, "# of dropped tx frames");
 	T4_REGSTAT(tx_pause, TX_PORT_PAUSE, "# of pause frames transmitted");
 	T4_REGSTAT(tx_ppp0, TX_PORT_PPP0, "# of PPP prio 0 frames transmitted");
 	T4_REGSTAT(tx_ppp1, TX_PORT_PPP1, "# of PPP prio 1 frames transmitted");
 	T4_REGSTAT(tx_ppp2, TX_PORT_PPP2, "# of PPP prio 2 frames transmitted");
 	T4_REGSTAT(tx_ppp3, TX_PORT_PPP3, "# of PPP prio 3 frames transmitted");
 	T4_REGSTAT(tx_ppp4, TX_PORT_PPP4, "# of PPP prio 4 frames transmitted");
 	T4_REGSTAT(tx_ppp5, TX_PORT_PPP5, "# of PPP prio 5 frames transmitted");
 	T4_REGSTAT(tx_ppp6, TX_PORT_PPP6, "# of PPP prio 6 frames transmitted");
 	T4_REGSTAT(tx_ppp7, TX_PORT_PPP7, "# of PPP prio 7 frames transmitted");
 
 	/* Receive counters. */
 	T4_REGSTAT(rx_octets, RX_PORT_BYTES, "# of octets in good frames");
 	T4_REGSTAT(rx_frames, RX_PORT_FRAMES, "total # of good frames");
 	T4_REGSTAT(rx_bcast_frames, RX_PORT_BCAST, "# of broadcast frames");
 	T4_REGSTAT(rx_mcast_frames, RX_PORT_MCAST, "# of multicast frames");
 	T4_REGSTAT(rx_ucast_frames, RX_PORT_UCAST, "# of unicast frames");
 	T4_REGSTAT(rx_too_long, RX_PORT_MTU_ERROR, "# of frames exceeding MTU");
 	T4_REGSTAT(rx_jabber, RX_PORT_MTU_CRC_ERROR, "# of jabber frames");
 	/* On T6 rx_fcs_err comes from pi->stats instead of a register. */
 	if (is_t6(sc)) {
 		T4_PORTSTAT(rx_fcs_err,
 		    "# of frames received with bad FCS since last link up");
 	} else {
 		T4_REGSTAT(rx_fcs_err, RX_PORT_CRC_ERROR,
 		    "# of frames received with bad FCS");
 	}
 	T4_REGSTAT(rx_len_err, RX_PORT_LEN_ERROR, "# of frames received with length error");
 	T4_REGSTAT(rx_symbol_err, RX_PORT_SYM_ERROR, "symbol errors");
 	T4_REGSTAT(rx_runt, RX_PORT_LESS_64B, "# of short frames received");
 	T4_REGSTAT(rx_frames_64, RX_PORT_64B, "# of rx frames in this range");
 	T4_REGSTAT(rx_frames_65_127, RX_PORT_65B_127B, "# of rx frames in this range");
 	T4_REGSTAT(rx_frames_128_255, RX_PORT_128B_255B, "# of rx frames in this range");
 	T4_REGSTAT(rx_frames_256_511, RX_PORT_256B_511B, "# of rx frames in this range");
 	T4_REGSTAT(rx_frames_512_1023, RX_PORT_512B_1023B, "# of rx frames in this range");
 	T4_REGSTAT(rx_frames_1024_1518, RX_PORT_1024B_1518B, "# of rx frames in this range");
 	T4_REGSTAT(rx_frames_1519_max, RX_PORT_1519B_MAX, "# of rx frames in this range");
 	T4_REGSTAT(rx_pause, RX_PORT_PAUSE, "# of pause frames received");
 	T4_REGSTAT(rx_ppp0, RX_PORT_PPP0, "# of PPP prio 0 frames received");
 	T4_REGSTAT(rx_ppp1, RX_PORT_PPP1, "# of PPP prio 1 frames received");
 	T4_REGSTAT(rx_ppp2, RX_PORT_PPP2, "# of PPP prio 2 frames received");
 	T4_REGSTAT(rx_ppp3, RX_PORT_PPP3, "# of PPP prio 3 frames received");
 	T4_REGSTAT(rx_ppp4, RX_PORT_PPP4, "# of PPP prio 4 frames received");
 	T4_REGSTAT(rx_ppp5, RX_PORT_PPP5, "# of PPP prio 5 frames received");
 	T4_REGSTAT(rx_ppp6, RX_PORT_PPP6, "# of PPP prio 6 frames received");
 	T4_REGSTAT(rx_ppp7, RX_PORT_PPP7, "# of PPP prio 7 frames received");
 
 	/* Per buffer-group overflow/truncation counters from pi->stats. */
 	T4_PORTSTAT(rx_ovflow0, "# drops due to buffer-group 0 overflows");
 	T4_PORTSTAT(rx_ovflow1, "# drops due to buffer-group 1 overflows");
 	T4_PORTSTAT(rx_ovflow2, "# drops due to buffer-group 2 overflows");
 	T4_PORTSTAT(rx_ovflow3, "# drops due to buffer-group 3 overflows");
 	T4_PORTSTAT(rx_trunc0, "# of buffer-group 0 truncated packets");
 	T4_PORTSTAT(rx_trunc1, "# of buffer-group 1 truncated packets");
 	T4_PORTSTAT(rx_trunc2, "# of buffer-group 2 truncated packets");
 	T4_PORTSTAT(rx_trunc3, "# of buffer-group 3 truncated packets");
 
 #undef T4_REGSTAT
 #undef T4_PORTSTAT
 }
 
 static int
 sysctl_int_array(SYSCTL_HANDLER_ARGS)
 {
 	int rc, *i, space = 0;
 	struct sbuf sb;
 
 	sbuf_new_for_sysctl(&sb, NULL, 64, req);
 	for (i = arg1; arg2; arg2 -= sizeof(int), i++) {
 		if (space)
 			sbuf_printf(&sb, " ");
 		sbuf_printf(&sb, "%d", *i);
 		space = 1;
 	}
 	rc = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (rc);
 }
 
 static int
 sysctl_bitfield_8b(SYSCTL_HANDLER_ARGS)
 {
 	int rc;
 	struct sbuf *sb;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return(rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb, "%b", *(uint8_t *)(uintptr_t)arg2, (char *)arg1);
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_bitfield_16b(SYSCTL_HANDLER_ARGS)
 {
 	int rc;
 	struct sbuf *sb;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return(rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb, "%b", *(uint16_t *)(uintptr_t)arg2, (char *)arg1);
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_btphy(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	int op = arg2;
 	struct adapter *sc = pi->adapter;
 	u_int v;
 	int rc;
 
 	rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4btt");
 	if (rc)
 		return (rc);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else {
 		/* XXX: magic numbers */
 		rc = -t4_mdio_rd(sc, sc->mbox, pi->mdio_addr, 0x1e,
 		    op ? 0x20 : 0xc820, &v);
 	}
 	end_synchronized_op(sc, 0);
 	if (rc)
 		return (rc);
 	if (op == 0)
 		v /= 256;
 
 	rc = sysctl_handle_int(oidp, &v, 0, req);
 	return (rc);
 }
 
 static int
 sysctl_noflowq(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	int rc, val;
 
 	val = vi->rsrv_noflowq;
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if ((val >= 1) && (vi->ntxq > 1))
 		vi->rsrv_noflowq = 1;
 	else
 		vi->rsrv_noflowq = 0;
 
 	return (rc);
 }
 
 static int
 sysctl_tx_vm_wr(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->adapter;
 	int rc, val, i;
 
 	MPASS(!(sc->flags & IS_VF));
 
 	val = vi->flags & TX_USES_VM_WR ? 1 : 0;
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (val != 0 && val != 1)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4txvm");
 	if (rc)
 		return (rc);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else if (vi->ifp->if_drv_flags & IFF_DRV_RUNNING) {
 		/*
 		 * We don't want parse_pkt to run with one setting (VF or PF)
 		 * and then eth_tx to see a different setting but still use
 		 * stale information calculated by parse_pkt.
 		 */
 		rc = EBUSY;
 	} else {
 		struct port_info *pi = vi->pi;
 		struct sge_txq *txq;
 		uint32_t ctrl0;
 		uint8_t npkt = sc->params.max_pkts_per_eth_tx_pkts_wr;
 
 		if (val) {
 			vi->flags |= TX_USES_VM_WR;
 			vi->ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_VM_TSO;
 			ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
 			    V_TXPKT_INTF(pi->tx_chan));
 			if (!(sc->flags & IS_VF))
 				npkt--;
 		} else {
 			vi->flags &= ~TX_USES_VM_WR;
 			vi->ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO;
 			ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
 			    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
 			    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
 		}
 		for_each_txq(vi, i, txq) {
 			txq->cpl_ctrl0 = ctrl0;
 			txq->txp.max_npkt = npkt;
 		}
 	}
 	end_synchronized_op(sc, LOCK_HELD);
 	return (rc);
 }
 
 static int
 sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->adapter;
 	int idx, rc, i;
 	struct sge_rxq *rxq;
 	uint8_t v;
 
 	idx = vi->tmr_idx;
 
 	rc = sysctl_handle_int(oidp, &idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (idx < 0 || idx >= SGE_NTIMERS)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4tmr");
 	if (rc)
 		return (rc);
 
 	v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(vi->pktc_idx != -1);
 	for_each_rxq(vi, i, rxq) {
 #ifdef atomic_store_rel_8
 		atomic_store_rel_8(&rxq->iq.intr_params, v);
 #else
 		rxq->iq.intr_params = v;
 #endif
 	}
 	vi->tmr_idx = idx;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (0);
 }
 
 static int
 sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->adapter;
 	int idx, rc;
 
 	idx = vi->pktc_idx;
 
 	rc = sysctl_handle_int(oidp, &idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (idx < -1 || idx >= SGE_NCOUNTERS)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4pktc");
 	if (rc)
 		return (rc);
 
 	if (vi->flags & VI_INIT_DONE)
 		rc = EBUSY; /* cannot be changed once the queues are created */
 	else
 		vi->pktc_idx = idx;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (rc);
 }
 
 static int
 sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->adapter;
 	int qsize, rc;
 
 	qsize = vi->qsize_rxq;
 
 	rc = sysctl_handle_int(oidp, &qsize, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (qsize < 128 || (qsize & 7))
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4rxqs");
 	if (rc)
 		return (rc);
 
 	if (vi->flags & VI_INIT_DONE)
 		rc = EBUSY; /* cannot be changed once the queues are created */
 	else
 		vi->qsize_rxq = qsize;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (rc);
 }
 
 static int
 sysctl_qsize_txq(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->adapter;
 	int qsize, rc;
 
 	qsize = vi->qsize_txq;
 
 	rc = sysctl_handle_int(oidp, &qsize, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (qsize < 128 || qsize > 65536)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4txqs");
 	if (rc)
 		return (rc);
 
 	if (vi->flags & VI_INIT_DONE)
 		rc = EBUSY; /* cannot be changed once the queues are created */
 	else
 		vi->qsize_txq = qsize;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (rc);
 }
 
 static int
 sysctl_pause_settings(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 	int rc;
 
 	if (req->newptr == NULL) {
 		struct sbuf *sb;
 		static char *bits = "\20\1RX\2TX\3AUTO";
 
 		rc = sysctl_wire_old_buffer(req, 0);
 		if (rc != 0)
 			return(rc);
 
 		sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 		if (sb == NULL)
 			return (ENOMEM);
 
 		if (lc->link_ok) {
 			sbuf_printf(sb, "%b", (lc->fc & (PAUSE_TX | PAUSE_RX)) |
 			    (lc->requested_fc & PAUSE_AUTONEG), bits);
 		} else {
 			sbuf_printf(sb, "%b", lc->requested_fc & (PAUSE_TX |
 			    PAUSE_RX | PAUSE_AUTONEG), bits);
 		}
 		rc = sbuf_finish(sb);
 		sbuf_delete(sb);
 	} else {
 		char s[2];
 		int n;
 
 		s[0] = '0' + (lc->requested_fc & (PAUSE_TX | PAUSE_RX |
 		    PAUSE_AUTONEG));
 		s[1] = 0;
 
 		rc = sysctl_handle_string(oidp, s, sizeof(s), req);
 		if (rc != 0)
 			return(rc);
 
 		if (s[1] != 0)
 			return (EINVAL);
 		if (s[0] < '0' || s[0] > '9')
 			return (EINVAL);	/* not a number */
 		n = s[0] - '0';
 		if (n & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG))
 			return (EINVAL);	/* some other bit is set too */
 
 		rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK,
 		    "t4PAUSE");
 		if (rc)
 			return (rc);
 		if (!hw_off_limits(sc)) {
 			PORT_LOCK(pi);
 			lc->requested_fc = n;
 			fixup_link_config(pi);
 			if (pi->up_vis > 0)
 				rc = apply_link_config(pi);
 			set_current_media(pi);
 			PORT_UNLOCK(pi);
 		}
 		end_synchronized_op(sc, 0);
 	}
 
 	return (rc);
 }
 
 /*
  * Get/set the FEC setting of the port (arg1 is the port_info).
  *
  * Read: a "%b" bitfield string.  With the link up it shows the FEC bits of
  * lc->fec plus the auto/module bits of the request; with the link down it
  * shows lc->requested_fec.
  *
  * Write: an integer string parsed with strtol (base auto-detected).  A
  * negative value or one with FEC_AUTO set selects FEC_AUTO; a value with any
  * bit outside M_FW_PORT_CAP32_FEC | FEC_MODULE is rejected with EINVAL; 0 or
  * FEC_NONE selects FEC_NONE; anything else must be covered by lc->pcaps or
  * the write fails with ENOTSUP.  If applying the new link configuration
  * fails, the previous request is restored.
  */
 static int
 sysctl_fec(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 	int rc;
 	int8_t old;
 
 	if (req->newptr == NULL) {
 		struct sbuf *sb;
 		static char *bits = "\20\1RS-FEC\2FC-FEC\3NO-FEC\4RSVD2"
 		    "\5RSVD3\6auto\7module";
 
 		rc = sysctl_wire_old_buffer(req, 0);
 		if (rc != 0)
 			return(rc);
 
 		sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 		if (sb == NULL)
 			return (ENOMEM);
 
 		/*
 		 * Display the requested_fec when the link is down -- the actual
 		 * FEC makes sense only when the link is up.
 		 */
 		if (lc->link_ok) {
 			sbuf_printf(sb, "%b", (lc->fec & M_FW_PORT_CAP32_FEC) |
 			    (lc->requested_fec & (FEC_AUTO | FEC_MODULE)),
 			    bits);
 		} else {
 			sbuf_printf(sb, "%b", lc->requested_fec, bits);
 		}
 		rc = sbuf_finish(sb);
 		sbuf_delete(sb);
 	} else {
 		char s[8];
 		int n;
 
 		/* Seed the buffer with the current request (-1 for auto). */
 		snprintf(s, sizeof(s), "%d",
 		    lc->requested_fec == FEC_AUTO ? -1 :
 		    lc->requested_fec & (M_FW_PORT_CAP32_FEC | FEC_MODULE));
 
 		rc = sysctl_handle_string(oidp, s, sizeof(s), req);
 		if (rc != 0)
 			return(rc);
 
 		/* Normalize the input before taking any locks. */
 		n = strtol(&s[0], NULL, 0);
 		if (n < 0 || n & FEC_AUTO)
 			n = FEC_AUTO;
 		else if (n & ~(M_FW_PORT_CAP32_FEC | FEC_MODULE))
 			return (EINVAL);/* some other bit is set too */
 
 		rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK,
 		    "t4fec");
 		if (rc)
 			return (rc);
 		PORT_LOCK(pi);
 		/* Saved so the request can be rolled back below. */
 		old = lc->requested_fec;
 		if (n == FEC_AUTO)
 			lc->requested_fec = FEC_AUTO;
 		else if (n == 0 || n == FEC_NONE)
 			lc->requested_fec = FEC_NONE;
 		else {
 			/* Every requested FEC bit must already be in pcaps. */
 			if ((lc->pcaps |
 			    V_FW_PORT_CAP32_FEC(n & M_FW_PORT_CAP32_FEC)) !=
 			    lc->pcaps) {
 				rc = ENOTSUP;
 				goto done;
 			}
 			lc->requested_fec = n & (M_FW_PORT_CAP32_FEC |
 			    FEC_MODULE);
 		}
 		/*
 		 * The request is recorded even when the hardware is off
 		 * limits; it is only pushed to the port when pi->up_vis > 0.
 		 */
 		if (!hw_off_limits(sc)) {
 			fixup_link_config(pi);
 			if (pi->up_vis > 0) {
 				rc = apply_link_config(pi);
 				if (rc != 0) {
 					/* Undo the request and map FW_EPROTO. */
 					lc->requested_fec = old;
 					if (rc == FW_EPROTO)
 						rc = ENOTSUP;
 				}
 			}
 		}
 done:
 		PORT_UNLOCK(pi);
 		end_synchronized_op(sc, 0);
 	}
 
 	return (rc);
 }
 
 /*
  * Report the FEC hint stored in the port's link_config (lc->fec_hint) as a
  * "%b" bitfield string.  "n/a" is reported when no module is present
  * (FW_PORT_MOD_TYPE_NONE) or the port capabilities do not include FEC.  A
  * hint of 0 is displayed as FEC_NONE.
  *
  * Returns EBUSY if the synchronized operation cannot be started, ENXIO if
  * the hardware is off limits, ENOMEM if the sbuf cannot be allocated, or the
  * result of wiring the buffer / finishing the sbuf.
  */
 static int
 sysctl_module_fec(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 	int rc;
 	int8_t fec;
 	struct sbuf *sb;
 	static char *bits = "\20\1RS-FEC\2FC-FEC\3NO-FEC\4RSVD2\5RSVD3";
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mfec") != 0) {
 		/*
 		 * The synchronized op was never started, so it must not be
 		 * ended here.  Release the sbuf and return directly instead
 		 * of going through the common exit path.
 		 */
 		sbuf_delete(sb);
 		return (EBUSY);
 	}
 	if (hw_off_limits(sc)) {
 		rc = ENXIO;
 		goto done;
 	}
 	PORT_LOCK(pi);
 	if (pi->up_vis == 0) {
 		/*
 		 * If all the interfaces are administratively down the firmware
 		 * does not report transceiver changes.  Refresh port info here.
 		 * This is the only reason we have a synchronized op in this
 		 * function.  Just PORT_LOCK would have been enough otherwise.
 		 */
 		t4_update_port_info(pi);
 	}
 
 	fec = lc->fec_hint;
 	if (pi->mod_type == FW_PORT_MOD_TYPE_NONE ||
 	    !fec_supported(lc->pcaps)) {
 		sbuf_printf(sb, "n/a");
 	} else {
 		if (fec == 0)
 			fec = FEC_NONE;
 		sbuf_printf(sb, "%b", fec & M_FW_PORT_CAP32_FEC, bits);
 	}
 	rc = sbuf_finish(sb);
 	PORT_UNLOCK(pi);
 done:
 	/* Reached only with the synchronized op in progress. */
 	sbuf_delete(sb);
 	end_synchronized_op(sc, 0);
 
 	return (rc);
 }
 
 static int
 sysctl_autoneg(SYSCTL_HANDLER_ARGS)
 {
 	struct port_info *pi = arg1;
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 	int rc, val;
 
 	if (lc->pcaps & FW_PORT_CAP32_ANEG)
 		val = lc->requested_aneg == AUTONEG_DISABLE ? 0 : 1;
 	else
 		val = -1;
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 	if (val == 0)
 		val = AUTONEG_DISABLE;
 	else if (val == 1)
 		val = AUTONEG_ENABLE;
 	else
 		val = AUTONEG_AUTO;
 
 	rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK,
 	    "t4aneg");
 	if (rc)
 		return (rc);
 	PORT_LOCK(pi);
 	if (val == AUTONEG_ENABLE && !(lc->pcaps & FW_PORT_CAP32_ANEG)) {
 		rc = ENOTSUP;
 		goto done;
 	}
 	lc->requested_aneg = val;
 	if (!hw_off_limits(sc)) {
 		fixup_link_config(pi);
 		if (pi->up_vis > 0)
 			rc = apply_link_config(pi);
 		set_current_media(pi);
 	}
 done:
 	PORT_UNLOCK(pi);
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int rc, reg = arg2;
 	uint64_t val;
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else {
 		rc = 0;
 		val = t4_read_reg64(sc, reg);
 	}
 	mtx_unlock(&sc->reg_lock);
 	if (rc == 0)
 		rc = sysctl_handle_64(oidp, &val, 0, req);
 	return (rc);
 }
 
 static int
 sysctl_temperature(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int rc, t;
 	uint32_t param, val;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4temp");
 	if (rc)
 		return (rc);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else {
 		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) |
 		    V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_TMP);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 	}
 	end_synchronized_op(sc, 0);
 	if (rc)
 		return (rc);
 
 	/* unknown is returned as 0 but we display -1 in that case */
 	t = val == 0 ? -1 : val;
 
 	rc = sysctl_handle_int(oidp, &t, 0, req);
 	return (rc);
 }
 
 static int
 sysctl_vdd(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int rc;
 	uint32_t param, val;
 
 	if (sc->params.core_vdd == 0) {
 		rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
 		    "t4vdd");
 		if (rc)
 			return (rc);
 		if (hw_off_limits(sc))
 			rc = ENXIO;
 		else {
 			param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 			    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) |
 			    V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_VDD);
 			rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1,
 			    &param, &val);
 		}
 		end_synchronized_op(sc, 0);
 		if (rc)
 			return (rc);
 		sc->params.core_vdd = val;
 	}
 
 	return (sysctl_handle_int(oidp, &sc->params.core_vdd, 0, req));
 }
 
 static int
 sysctl_reset_sensor(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int rc, v;
 	uint32_t param, val;
 
 	v = sc->sensor_resets;
 	rc = sysctl_handle_int(oidp, &v, 0, req);
 	if (rc != 0 || req->newptr == NULL || v <= 0)
 		return (rc);
 
 	if (sc->params.fw_vers < FW_VERSION32(1, 24, 7, 0) ||
 	    chip_id(sc) < CHELSIO_T5)
 		return (ENOTSUP);
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4srst");
 	if (rc)
 		return (rc);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else {
 		param = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) |
 		    V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_RESET_TMP_SENSOR));
 		val = 1;
 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 	}
 	end_synchronized_op(sc, 0);
 	if (rc == 0)
 		sc->sensor_resets++;
 	return (rc);
 }
 
 static int
 sysctl_loadavg(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	uint32_t param, val;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4lavg");
 	if (rc)
 		return (rc);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else {
 		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_LOAD);
 		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 	}
 	end_synchronized_op(sc, 0);
 	if (rc)
 		return (rc);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	if (val == 0xffffffff) {
 		/* Only debug and custom firmwares report load averages. */
 		sbuf_printf(sb, "not available");
 	} else {
 		sbuf_printf(sb, "%d %d %d", val & 0xff, (val >> 8) & 0xff,
 		    (val >> 16) & 0xff);
 	}
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_cctrl(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 	uint16_t incr[NMTUS][NCCTRL_WIN];
 	static const char *dec_fac[] = {
 		"0.5", "0.5625", "0.625", "0.6875", "0.75", "0.8125", "0.875",
 		"0.9375"
 	};
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else
 		t4_read_cong_tbl(sc, incr);
 	mtx_unlock(&sc->reg_lock);
 	if (rc)
 		goto done;
 
 	for (i = 0; i < NCCTRL_WIN; ++i) {
 		sbuf_printf(sb, "%2d: %4u %4u %4u %4u %4u %4u %4u %4u\n", i,
 		    incr[0][i], incr[1][i], incr[2][i], incr[3][i], incr[4][i],
 		    incr[5][i], incr[6][i], incr[7][i]);
 		sbuf_printf(sb, "%8u %4u %4u %4u %4u %4u %4u %4u %5u %s\n",
 		    incr[8][i], incr[9][i], incr[10][i], incr[11][i],
 		    incr[12][i], incr[13][i], incr[14][i], incr[15][i],
 		    sc->params.a_wnd[i], dec_fac[sc->params.b_wnd[i]]);
 	}
 
 	rc = sbuf_finish(sb);
 done:
 	sbuf_delete(sb);
 	return (rc);
 }
 
 /*
  * Display names of the CIM queues: the inbound queues come first, followed
  * by the outbound queues.  sysctl_cim_ibq_obq() indexes this table with its
  * arg2 and sysctl_cim_qcfg() with its running queue index.
  */
 static const char *qname[CIM_NUM_IBQ + CIM_NUM_OBQ_T5] = {
 	"TP0", "TP1", "ULP", "SGE0", "SGE1", "NC-SI",	/* ibq's */
 	"ULP0", "ULP1", "ULP2", "ULP3", "SGE", "NC-SI",	/* obq's */
 	"SGE0-RX", "SGE1-RX"	/* additional obq's (T5 onwards) */
 };
 
 static int
 sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i, n, qid = arg2;
 	uint32_t *buf, *p;
 	char *qtype;
 	u_int cim_num_obq = sc->chip_params->cim_num_obq;
 
 	KASSERT(qid >= 0 && qid < CIM_NUM_IBQ + cim_num_obq,
 	    ("%s: bad qid %d\n", __func__, qid));
 
 	if (qid < CIM_NUM_IBQ) {
 		/* inbound queue */
 		qtype = "IBQ";
 		n = 4 * CIM_IBQ_SIZE;
 		buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK);
 		mtx_lock(&sc->reg_lock);
 		if (hw_off_limits(sc))
 			rc = -ENXIO;
 		else
 			rc = t4_read_cim_ibq(sc, qid, buf, n);
 		mtx_unlock(&sc->reg_lock);
 	} else {
 		/* outbound queue */
 		qtype = "OBQ";
 		qid -= CIM_NUM_IBQ;
 		n = 4 * cim_num_obq * CIM_OBQ_SIZE;
 		buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK);
 		mtx_lock(&sc->reg_lock);
 		if (hw_off_limits(sc))
 			rc = -ENXIO;
 		else
 			rc = t4_read_cim_obq(sc, qid, buf, n);
 		mtx_unlock(&sc->reg_lock);
 	}
 
 	if (rc < 0) {
 		rc = -rc;
 		goto done;
 	}
 	n = rc * sizeof(uint32_t);	/* rc has # of words actually read */
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		goto done;
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req);
 	if (sb == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 
 	sbuf_printf(sb, "%s%d %s", qtype , qid, qname[arg2]);
 	for (i = 0, p = buf; i < n; i += 16, p += 4)
 		sbuf_printf(sb, "\n%#06x: %08x %08x %08x %08x", i, p[0], p[1],
 		    p[2], p[3]);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 done:
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 /*
  * Formats the CIM LA words in buf into sb, consuming 8 words per entry for
  * as long as a complete entry fits within sc->params.cim_la_size words.
  * sbuf_cim_la() uses this variant for chips older than T6.
  *
  * cfg is the A_UP_UP_DBG_LA_CFG value read by the caller.  When
  * F_UPDBGLACAPTPCONLY is set in it every entry is printed as three short
  * lines; otherwise each entry is one line that includes the LS0 columns
  * announced in the header.
  */
 static void
 sbuf_cim_la4(struct adapter *sc, struct sbuf *sb, uint32_t *buf, uint32_t cfg)
 {
 	uint32_t *p;
 
 	/* Column header; the LS0 columns are omitted in PC-only mode. */
 	sbuf_printf(sb, "Status   Data      PC%s",
 	    cfg & F_UPDBGLACAPTPCONLY ? "" :
 	    "     LS0Stat  LS0Addr             LS0Data");
 
 	for (p = buf; p <= &buf[sc->params.cim_la_size - 8]; p += 8) {
 		if (cfg & F_UPDBGLACAPTPCONLY) {
 			/* Three lines per entry, starting from the last words. */
 			sbuf_printf(sb, "\n  %02x   %08x %08x", p[5] & 0xff,
 			    p[6], p[7]);
 			sbuf_printf(sb, "\n  %02x   %02x%06x %02x%06x",
 			    (p[3] >> 8) & 0xff, p[3] & 0xff, p[4] >> 8,
 			    p[4] & 0xff, p[5] >> 8);
 			sbuf_printf(sb, "\n  %02x   %x%07x %x%07x",
 			    (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4,
 			    p[1] & 0xf, p[2] >> 4);
 		} else {
 			/* One line per entry; fields straddle word boundaries. */
 			sbuf_printf(sb,
 			    "\n  %02x   %x%07x %x%07x %08x %08x "
 			    "%08x%08x%08x%08x",
 			    (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4,
 			    p[1] & 0xf, p[2] >> 4, p[2] & 0xf, p[3], p[4], p[5],
 			    p[6], p[7]);
 		}
 	}
 }
 
 /*
  * Formats the CIM LA words in buf into sb, consuming 10 words per entry for
  * as long as a complete entry fits within sc->params.cim_la_size words.
  * sbuf_cim_la() uses this variant for T6 and later chips.
  *
  * cfg is the A_UP_UP_DBG_LA_CFG value read by the caller.  When
  * F_UPDBGLACAPTPCONLY is set in it every entry is printed as three short
  * lines; otherwise each entry is one line that includes the LS0/LS1 columns
  * announced in the header.
  */
 static void
 sbuf_cim_la6(struct adapter *sc, struct sbuf *sb, uint32_t *buf, uint32_t cfg)
 {
 	uint32_t *p;
 
 	/* Column header; the LS0/LS1 columns are omitted in PC-only mode. */
 	sbuf_printf(sb, "Status   Inst    Data      PC%s",
 	    cfg & F_UPDBGLACAPTPCONLY ? "" :
 	    "     LS0Stat  LS0Addr  LS0Data  LS1Stat  LS1Addr  LS1Data");
 
 	for (p = buf; p <= &buf[sc->params.cim_la_size - 10]; p += 10) {
 		if (cfg & F_UPDBGLACAPTPCONLY) {
 			/* Three lines per entry, starting from the first words. */
 			sbuf_printf(sb, "\n  %02x   %08x %08x %08x",
 			    p[3] & 0xff, p[2], p[1], p[0]);
 			sbuf_printf(sb, "\n  %02x   %02x%06x %02x%06x %02x%06x",
 			    (p[6] >> 8) & 0xff, p[6] & 0xff, p[5] >> 8,
 			    p[5] & 0xff, p[4] >> 8, p[4] & 0xff, p[3] >> 8);
 			sbuf_printf(sb, "\n  %02x   %04x%04x %04x%04x %04x%04x",
 			    (p[9] >> 16) & 0xff, p[9] & 0xffff, p[8] >> 16,
 			    p[8] & 0xffff, p[7] >> 16, p[7] & 0xffff,
 			    p[6] >> 16);
 		} else {
 			/* One line per entry; words 6-9 are split on 16 bits. */
 			sbuf_printf(sb, "\n  %02x   %04x%04x %04x%04x %04x%04x "
 			    "%08x %08x %08x %08x %08x %08x",
 			    (p[9] >> 16) & 0xff,
 			    p[9] & 0xffff, p[8] >> 16,
 			    p[8] & 0xffff, p[7] >> 16,
 			    p[7] & 0xffff, p[6] >> 16,
 			    p[2], p[1], p[0], p[5], p[4], p[3]);
 		}
 	}
 }
 
 static int
 sbuf_cim_la(struct adapter *sc, struct sbuf *sb, int flags)
 {
 	uint32_t cfg, *buf;
 	int rc;
 
 	MPASS(flags == M_WAITOK || flags == M_NOWAIT);
 	buf = malloc(sc->params.cim_la_size * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | flags);
 	if (buf == NULL)
 		return (ENOMEM);
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else {
 		rc = -t4_cim_read(sc, A_UP_UP_DBG_LA_CFG, 1, &cfg);
 		if (rc == 0)
 			rc = -t4_cim_read_la(sc, buf, NULL);
 	}
 	mtx_unlock(&sc->reg_lock);
 	if (rc == 0) {
 		if (chip_id(sc) < CHELSIO_T6)
 			sbuf_cim_la4(sc, sb, buf, cfg);
 		else
 			sbuf_cim_la6(sc, sb, buf, cfg);
 	}
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 sysctl_cim_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	rc = sbuf_cim_la(sc, sb, M_WAITOK);
 	if (rc == 0)
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	return (rc);
 }
 
 /*
  * Writes the CIM LA dump to the system log at LOG_DEBUG.  The dump is built
  * with M_NOWAIT allocations and is silently dropped if any step fails.
  * The arg and verbose parameters are not used, and the function always
  * returns false.
  */
 bool
 t4_os_dump_cimla(struct adapter *sc, int arg, bool verbose)
 {
 	struct sbuf sb;
 
 	if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb)
 		return (false);
 
 	if (sbuf_cim_la(sc, &sb, M_NOWAIT) == 0 && sbuf_finish(&sb) == 0) {
 		log(LOG_DEBUG, "%s: CIM LA dump follows.\n%s",
 		    device_get_nameunit(sc->dev), sbuf_data(&sb));
 	}
 
 	sbuf_delete(&sb);
 	return (false);
 }
 
 static int
 sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	u_int i;
 	struct sbuf *sb;
 	uint32_t *buf, *p;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(2 * CIM_MALA_SIZE * 5 * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else
 		t4_cim_read_ma_la(sc, buf, buf + 5 * CIM_MALA_SIZE);
 	mtx_unlock(&sc->reg_lock);
 	if (rc)
 		goto done;
 
 	p = buf;
 	for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) {
 		sbuf_printf(sb, "\n%02x%08x%08x%08x%08x", p[4], p[3], p[2],
 		    p[1], p[0]);
 	}
 
 	sbuf_printf(sb, "\n\nCnt ID Tag UE       Data       RDY VLD");
 	for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) {
 		sbuf_printf(sb, "\n%3u %2u  %x   %u %08x%08x  %u   %u",
 		    (p[2] >> 10) & 0xff, (p[2] >> 7) & 7,
 		    (p[2] >> 3) & 0xf, (p[2] >> 2) & 1,
 		    (p[1] >> 2) | ((p[2] & 3) << 30),
 		    (p[0] >> 2) | ((p[1] & 3) << 30), (p[0] >> 1) & 1,
 		    p[0] & 1);
 	}
 	rc = sbuf_finish(sb);
 done:
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	u_int i;
 	struct sbuf *sb;
 	uint32_t *buf, *p;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(2 * CIM_PIFLA_SIZE * 6 * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else
 		t4_cim_read_pif_la(sc, buf, buf + 6 * CIM_PIFLA_SIZE, NULL, NULL);
 	mtx_unlock(&sc->reg_lock);
 	if (rc)
 		goto done;
 
 	p = buf;
 	sbuf_printf(sb, "Cntl ID DataBE   Addr                 Data");
 	for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) {
 		sbuf_printf(sb, "\n %02x  %02x  %04x  %08x %08x%08x%08x%08x",
 		    (p[5] >> 22) & 0xff, (p[5] >> 16) & 0x3f, p[5] & 0xffff,
 		    p[4], p[3], p[2], p[1], p[0]);
 	}
 
 	sbuf_printf(sb, "\n\nCntl ID               Data");
 	for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) {
 		sbuf_printf(sb, "\n %02x  %02x %08x%08x%08x%08x",
 		    (p[4] >> 6) & 0xff, p[4] & 0x3f, p[3], p[2], p[1], p[0]);
 	}
 
 	rc = sbuf_finish(sb);
 done:
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 /*
  * Sysctl handler that prints one line per CIM queue (inbound queues first,
  * then outbound) with its base, size, read/write pointers and SOP/EOP
  * counts; the threshold column is printed for inbound queues only.
  *
  * Returns ENXIO if the hardware is off limits, the (negated) error from
  * t4_cim_read(), ENOMEM if the sbuf cannot be allocated, or the result of
  * the sysctl/sbuf layer.
  */
 static int
 sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 	uint16_t base[CIM_NUM_IBQ + CIM_NUM_OBQ_T5];
 	uint16_t size[CIM_NUM_IBQ + CIM_NUM_OBQ_T5];
 	uint16_t thres[CIM_NUM_IBQ];
 	uint32_t obq_wr[2 * CIM_NUM_OBQ_T5], *wr = obq_wr;
 	uint32_t stat[4 * (CIM_NUM_IBQ + CIM_NUM_OBQ_T5)], *p = stat;
 	u_int cim_num_obq, ibq_rdaddr, obq_rdaddr, nq;
 
 	cim_num_obq = sc->chip_params->cim_num_obq;
 	/* T4 uses different register addresses than later chips. */
 	if (is_t4(sc)) {
 		ibq_rdaddr = A_UP_IBQ_0_RDADDR;
 		obq_rdaddr = A_UP_OBQ_0_REALADDR;
 	} else {
 		ibq_rdaddr = A_UP_IBQ_0_SHADOW_RDADDR;
 		obq_rdaddr = A_UP_OBQ_0_SHADOW_REALADDR;
 	}
 	nq = CIM_NUM_IBQ + cim_num_obq;
 
 	/* All hardware reads happen under reg_lock: 4 words of state per queue. */
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else {
 		rc = -t4_cim_read(sc, ibq_rdaddr, 4 * nq, stat);
 		if (rc == 0) {
 			/* 2 words per outbound queue. */
 			rc = -t4_cim_read(sc, obq_rdaddr, 2 * cim_num_obq,
 			    obq_wr);
 			if (rc == 0)
 				t4_read_cimq_cfg(sc, base, size, thres);
 		}
 	}
 	mtx_unlock(&sc->reg_lock);
 	if (rc)
 		return (rc);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb,
 	    "  Queue  Base  Size Thres  RdPtr WrPtr  SOP  EOP Avail");
 
 	/* Inbound queues; p advances through stat[] 4 words at a time. */
 	for (i = 0; i < CIM_NUM_IBQ; i++, p += 4)
 		sbuf_printf(sb, "\n%7s %5x %5u %5u %6x  %4x %4u %4u %5u",
 		    qname[i], base[i], size[i], thres[i], G_IBQRDADDR(p[0]),
 		    G_IBQWRADDR(p[1]), G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]),
 		    G_QUEREMFLITS(p[2]) * 16);
 	/*
 	 * Outbound queues continue from where the loop above left i and p;
 	 * wr advances through obq_wr[] 2 words at a time.
 	 */
 	for ( ; i < nq; i++, p += 4, wr += 2)
 		sbuf_printf(sb, "\n%7s %5x %5u %12x  %4x %4u %4u %5u", qname[i],
 		    base[i], size[i], G_QUERDADDR(p[0]) & 0x3fff,
 		    wr[0] - base[i], G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]),
 		    G_QUEREMFLITS(p[2]) * 16);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_cpl_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_cpl_stats stats;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else
 		t4_tp_get_cpl_stats(sc, &stats, 0);
 	mtx_unlock(&sc->reg_lock);
 	if (rc)
 		goto done;
 
 	if (sc->chip_params->nchan > 2) {
 		sbuf_printf(sb, "                 channel 0  channel 1"
 		    "  channel 2  channel 3");
 		sbuf_printf(sb, "\nCPL requests:   %10u %10u %10u %10u",
 		    stats.req[0], stats.req[1], stats.req[2], stats.req[3]);
 		sbuf_printf(sb, "\nCPL responses:  %10u %10u %10u %10u",
 		    stats.rsp[0], stats.rsp[1], stats.rsp[2], stats.rsp[3]);
 	} else {
 		sbuf_printf(sb, "                 channel 0  channel 1");
 		sbuf_printf(sb, "\nCPL requests:   %10u %10u",
 		    stats.req[0], stats.req[1]);
 		sbuf_printf(sb, "\nCPL responses:  %10u %10u",
 		    stats.rsp[0], stats.rsp[1]);
 	}
 
 	rc = sbuf_finish(sb);
 done:
 	sbuf_delete(sb);
 	return (rc);
 }
 
 static int
 sysctl_ddp_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_usm_stats stats;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return(rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else
 		t4_get_usm_stats(sc, &stats, 1);
 	mtx_unlock(&sc->reg_lock);
 	if (rc == 0) {
 		sbuf_printf(sb, "Frames: %u\n", stats.frames);
 		sbuf_printf(sb, "Octets: %ju\n", stats.octets);
 		sbuf_printf(sb, "Drops:  %u", stats.drops);
 		rc = sbuf_finish(sb);
 	}
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_tid_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_tid_stats stats;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return(rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else
 		t4_tp_get_tid_stats(sc, &stats, 1);
 	mtx_unlock(&sc->reg_lock);
 	if (rc == 0) {
 		sbuf_printf(sb, "Delete:     %u\n", stats.del);
 		sbuf_printf(sb, "Invalidate: %u\n", stats.inv);
 		sbuf_printf(sb, "Active:     %u\n", stats.act);
 		sbuf_printf(sb, "Passive:    %u", stats.pas);
 		rc = sbuf_finish(sb);
 	}
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Display names for device log levels, indexed by FW_DEVLOG_LEVEL_*.
  * sbuf_devlog() bounds-checks the level against nitems() of this table and
  * prints "UNKNOWN" for anything beyond it.
  */
 static const char * const devlog_level_strings[] = {
 	[FW_DEVLOG_LEVEL_EMERG]		= "EMERG",
 	[FW_DEVLOG_LEVEL_CRIT]		= "CRIT",
 	[FW_DEVLOG_LEVEL_ERR]		= "ERR",
 	[FW_DEVLOG_LEVEL_NOTICE]	= "NOTICE",
 	[FW_DEVLOG_LEVEL_INFO]		= "INFO",
 	[FW_DEVLOG_LEVEL_DEBUG]		= "DEBUG"
 };
 
 /*
  * Display names for device log facilities, indexed by FW_DEVLOG_FACILITY_*.
  * sbuf_devlog() bounds-checks the facility against nitems() of this table
  * and prints "UNKNOWN" for anything beyond it.
  */
 static const char * const devlog_facility_strings[] = {
 	[FW_DEVLOG_FACILITY_CORE]	= "CORE",
 	[FW_DEVLOG_FACILITY_CF]		= "CF",
 	[FW_DEVLOG_FACILITY_SCHED]	= "SCHED",
 	[FW_DEVLOG_FACILITY_TIMER]	= "TIMER",
 	[FW_DEVLOG_FACILITY_RES]	= "RES",
 	[FW_DEVLOG_FACILITY_HW]		= "HW",
 	[FW_DEVLOG_FACILITY_FLR]	= "FLR",
 	[FW_DEVLOG_FACILITY_DMAQ]	= "DMAQ",
 	[FW_DEVLOG_FACILITY_PHY]	= "PHY",
 	[FW_DEVLOG_FACILITY_MAC]	= "MAC",
 	[FW_DEVLOG_FACILITY_PORT]	= "PORT",
 	[FW_DEVLOG_FACILITY_VI]		= "VI",
 	[FW_DEVLOG_FACILITY_FILTER]	= "FILTER",
 	[FW_DEVLOG_FACILITY_ACL]	= "ACL",
 	[FW_DEVLOG_FACILITY_TM]		= "TM",
 	[FW_DEVLOG_FACILITY_QFC]	= "QFC",
 	[FW_DEVLOG_FACILITY_DCB]	= "DCB",
 	[FW_DEVLOG_FACILITY_ETH]	= "ETH",
 	[FW_DEVLOG_FACILITY_OFLD]	= "OFLD",
 	[FW_DEVLOG_FACILITY_RI]		= "RI",
 	[FW_DEVLOG_FACILITY_ISCSI]	= "ISCSI",
 	[FW_DEVLOG_FACILITY_FCOE]	= "FCOE",
 	[FW_DEVLOG_FACILITY_FOISCSI]	= "FOISCSI",
 	[FW_DEVLOG_FACILITY_FOFCOE]	= "FOFCOE",
 	[FW_DEVLOG_FACILITY_CHNET]	= "CHNET",
 };
 
 /*
  * Reads the device log (sc->params.devlog) through memory window 1 and
  * formats it into sb, starting from the entry with the smallest timestamp
  * and wrapping around the buffer.  flags must be exactly M_WAITOK or
  * M_NOWAIT and is used for the temporary buffer allocation.
  *
  * Returns 0 on success (including an empty log, in which case nothing is
  * written to sb), ENXIO if there is no devlog address or the hardware is off
  * limits, ENOMEM if the buffer cannot be allocated, or the error from
  * read_via_memwin().
  */
 static int
 sbuf_devlog(struct adapter *sc, struct sbuf *sb, int flags)
 {
 	int i, j, rc, nentries, first = 0;
 	struct devlog_params *dparams = &sc->params.devlog;
 	struct fw_devlog_e *buf, *e;
 	uint64_t ftstamp = UINT64_MAX;
 
 	if (dparams->addr == 0)
 		return (ENXIO);
 
 	MPASS(flags == M_WAITOK || flags == M_NOWAIT);
 	buf = malloc(dparams->size, M_CXGBE, M_ZERO | flags);
 	if (buf == NULL)
 		return (ENOMEM);
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else
 		rc = read_via_memwin(sc, 1, dparams->addr, (void *)buf,
 		    dparams->size);
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		goto done;
 
 	/*
 	 * First pass: convert each entry from big-endian in place and remember
 	 * the index of the entry with the smallest timestamp.
 	 */
 	nentries = dparams->size / sizeof(struct fw_devlog_e);
 	for (i = 0; i < nentries; i++) {
 		e = &buf[i];
 
 		if (e->timestamp == 0)
 			break;	/* end */
 
 		e->timestamp = be64toh(e->timestamp);
 		e->seqno = be32toh(e->seqno);
 		for (j = 0; j < 8; j++)
 			e->params[j] = be32toh(e->params[j]);
 
 		if (e->timestamp < ftstamp) {
 			ftstamp = e->timestamp;
 			first = i;
 		}
 	}
 
 	if (buf[first].timestamp == 0)
 		goto done;	/* nothing in the log */
 
 	sbuf_printf(sb, "%10s  %15s  %8s  %8s  %s\n",
 	    "Seq#", "Tstamp", "Level", "Facility", "Message");
 
 	/* Second pass: print from 'first', wrapping at nentries. */
 	i = first;
 	do {
 		e = &buf[i];
 		if (e->timestamp == 0)
 			break;	/* end */
 
 		sbuf_printf(sb, "%10d  %15ju  %8s  %8s  ",
 		    e->seqno, e->timestamp,
 		    (e->level < nitems(devlog_level_strings) ?
 			devlog_level_strings[e->level] : "UNKNOWN"),
 		    (e->facility < nitems(devlog_facility_strings) ?
 			devlog_facility_strings[e->facility] : "UNKNOWN"));
 		/*
 		 * NOTE(review): the format string is taken from the entry that
 		 * was just read from device memory, not from a literal.
 		 */
 		sbuf_printf(sb, e->fmt, e->params[0], e->params[1],
 		    e->params[2], e->params[3], e->params[4],
 		    e->params[5], e->params[6], e->params[7]);
 
 		if (++i == nentries)
 			i = 0;
 	} while (i != first);
 done:
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 static int
 sysctl_devlog(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int rc;
 	struct sbuf *sb;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	rc = sbuf_devlog(sc, sb, M_WAITOK);
 	if (rc == 0)
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	return (rc);
 }
 
 /*
  * Writes the device log to the system log at LOG_DEBUG.  The dump is built
  * with M_NOWAIT allocations and is silently dropped if any step fails.
  */
 void
 t4_os_dump_devlog(struct adapter *sc)
 {
 	struct sbuf sb;
 
 	if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb)
 		return;
 
 	if (sbuf_devlog(sc, &sb, M_NOWAIT) == 0 && sbuf_finish(&sb) == 0) {
 		log(LOG_DEBUG, "%s: device log follows.\n%s",
 		    device_get_nameunit(sc->dev), sbuf_data(&sb));
 	}
 
 	sbuf_delete(&sb);
 }
 
 static int
 sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_fcoe_stats stats[MAX_NCHAN];
 	int i, nchan = sc->chip_params->nchan;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else {
 		for (i = 0; i < nchan; i++)
 			t4_get_fcoe_stats(sc, i, &stats[i], 1);
 	}
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	if (nchan > 2) {
 		sbuf_printf(sb, "                   channel 0        channel 1"
 		    "        channel 2        channel 3");
 		sbuf_printf(sb, "\noctetsDDP:  %16ju %16ju %16ju %16ju",
 		    stats[0].octets_ddp, stats[1].octets_ddp,
 		    stats[2].octets_ddp, stats[3].octets_ddp);
 		sbuf_printf(sb, "\nframesDDP:  %16u %16u %16u %16u",
 		    stats[0].frames_ddp, stats[1].frames_ddp,
 		    stats[2].frames_ddp, stats[3].frames_ddp);
 		sbuf_printf(sb, "\nframesDrop: %16u %16u %16u %16u",
 		    stats[0].frames_drop, stats[1].frames_drop,
 		    stats[2].frames_drop, stats[3].frames_drop);
 	} else {
 		sbuf_printf(sb, "                   channel 0        channel 1");
 		sbuf_printf(sb, "\noctetsDDP:  %16ju %16ju",
 		    stats[0].octets_ddp, stats[1].octets_ddp);
 		sbuf_printf(sb, "\nframesDDP:  %16u %16u",
 		    stats[0].frames_ddp, stats[1].frames_ddp);
 		sbuf_printf(sb, "\nframesDrop: %16u %16u",
 		    stats[0].frames_drop, stats[1].frames_drop);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler that prints one line per Tx scheduler (NTX_SCHED of them)
  * with its mode, channel, rate, class IPG and flow IPG.  A value of zero in
  * any of the last three columns is shown as "disabled".
  *
  * Returns ENXIO if the hardware is off limits, ENOMEM if the sbuf cannot be
  * allocated, or the result of the sysctl/sbuf layer.
  */
 static int
 sysctl_hw_sched(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 	unsigned int map, kbps, ipg, mode;
 	unsigned int pace_tab[NTX_SCHED];
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 512, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	/*
 	 * reg_lock stays held for the rest of the function, including the
 	 * formatting and sbuf_finish(), because the loop below reads the
 	 * hardware for every scheduler.
 	 */
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc)) {
 		rc = ENXIO;
 		goto done;
 	}
 
 	map = t4_read_reg(sc, A_TP_TX_MOD_QUEUE_REQ_MAP);
 	mode = G_TIMERMODE(t4_read_reg(sc, A_TP_MOD_CONFIG));
 	t4_read_pace_tbl(sc, pace_tab);
 
 	sbuf_printf(sb, "Scheduler  Mode   Channel  Rate (Kbps)   "
 	    "Class IPG (0.1 ns)   Flow IPG (us)");
 
 	/* map carries 2 bits (the channel) per scheduler; mode 1 bit each. */
 	for (i = 0; i < NTX_SCHED; ++i, map >>= 2) {
 		t4_get_tx_sched(sc, i, &kbps, &ipg, 1);
 		sbuf_printf(sb, "\n    %u      %-5s     %u     ", i,
 		    (mode & (1 << i)) ? "flow" : "class", map & 3);
 		if (kbps)
 			sbuf_printf(sb, "%9u     ", kbps);
 		else
 			sbuf_printf(sb, " disabled     ");
 
 		if (ipg)
 			sbuf_printf(sb, "%13u        ", ipg);
 		else
 			sbuf_printf(sb, "     disabled        ");
 
 		if (pace_tab[i])
 			sbuf_printf(sb, "%10u", pace_tab[i]);
 		else
 			sbuf_printf(sb, "  disabled");
 	}
 	rc = sbuf_finish(sb);
 done:
 	mtx_unlock(&sc->reg_lock);
 	sbuf_delete(sb);
 	return (rc);
 }
 
 /*
  * Sysctl handler that prints the loopback port statistics returned by
  * t4_get_lb_stats(), two loopback ports side by side per table, for every
  * pair of channels on the chip.
  *
  * Returns 0 on success, ENXIO if the hardware is off limits (even if some
  * pairs were already formatted), ENOMEM if the sbuf cannot be allocated, or
  * the error reported by the sysctl/sbuf layer.
  */
 static int
 sysctl_lb_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i, j;
 	uint64_t *p0, *p1;
 	struct lb_port_stats s[2];
 	static const char *stat_name[] = {
 		"OctetsOK:", "FramesOK:", "BcastFrames:", "McastFrames:",
 		"UcastFrames:", "ErrorFrames:", "Frames64:", "Frames65To127:",
 		"Frames128To255:", "Frames256To511:", "Frames512To1023:",
 		"Frames1024To1518:", "Frames1519ToMax:", "FramesDropped:",
 		"BG0FramesDropped:", "BG1FramesDropped:", "BG2FramesDropped:",
 		"BG3FramesDropped:", "BG0FramesTrunc:", "BG1FramesTrunc:",
 		"BG2FramesTrunc:", "BG3FramesTrunc:"
 	};
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	memset(s, 0, sizeof(s));
 
 	for (i = 0; i < sc->chip_params->nchan; i += 2) {
 		/* reg_lock is taken per pair, not across the whole dump. */
 		mtx_lock(&sc->reg_lock);
 		if (hw_off_limits(sc))
 			rc = ENXIO;
 		else {
 			t4_get_lb_stats(sc, i, &s[0]);
 			t4_get_lb_stats(sc, i + 1, &s[1]);
 		}
 		mtx_unlock(&sc->reg_lock);
 		if (rc != 0)
 			break;
 
 		/*
 		 * The counters are walked as consecutive uint64_t values
 		 * starting at 'octets', one per stat_name[] entry.
 		 */
 		p0 = &s[0].octets;
 		p1 = &s[1].octets;
 		sbuf_printf(sb, "%s                       Loopback %u"
 		    "           Loopback %u", i == 0 ? "" : "\n", i, i + 1);
 
 		for (j = 0; j < nitems(stat_name); j++)
 			sbuf_printf(sb, "\n%-17s %20ju %20ju", stat_name[j],
 			    *p0++, *p1++);
 	}
 
 	/*
 	 * Do not let sbuf_finish() overwrite an ENXIO from the loop above;
 	 * the caller must see that the hardware could not be read.
 	 */
 	if (rc == 0)
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_linkdnrc(SYSCTL_HANDLER_ARGS)
 {
 	int rc = 0;
 	struct port_info *pi = arg1;
 	struct link_config *lc = &pi->link_cfg;
 	struct sbuf *sb;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return(rc);
 	sb = sbuf_new_for_sysctl(NULL, NULL, 64, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	if (lc->link_ok || lc->link_down_rc == 255)
 		sbuf_printf(sb, "n/a");
 	else
 		sbuf_printf(sb, "%s", t4_link_down_rc_str(lc->link_down_rc));
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Describes one memory range for sysctl_meminfo(): its first address, its
  * limit (0 means "not known yet"), and an index into a name table.
  */
 struct mem_desc {
 	unsigned int base;
 	unsigned int limit;
 	unsigned int idx;
 };
 
 /*
  * qsort() comparator that orders mem_desc entries by ascending base.
  *
  * Returns a negative, zero, or positive value when a's base is less than,
  * equal to, or greater than b's.  The bases are compared rather than
  * subtracted: an unsigned difference converted to int has the wrong sign
  * whenever the two bases are 2^31 or more apart.
  */
 static int
 mem_desc_cmp(const void *a, const void *b)
 {
 	const struct mem_desc *ma = a;
 	const struct mem_desc *mb = b;
 
 	return ((ma->base > mb->base) - (ma->base < mb->base));
 }
 
 /*
  * Appends one line to sb describing the region [from, to] under the given
  * name, along with its size in brackets.  Nothing is printed when from
  * equals to or when the computed size wraps around to 0.
  */
 static void
 mem_region_show(struct sbuf *sb, const char *name, unsigned int from,
     unsigned int to)
 {
 	const unsigned int size = to - from + 1;
 
 	if (from == to || size == 0)
 		return;
 
 	/* XXX: need humanize_number(3) in libkern for a more readable 'size' */
 	sbuf_printf(sb, "%-15s %#x-%#x [%u]\n", name, from, to, size);
 }
 
 static int
 sysctl_meminfo(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i, n;
 	uint32_t lo, hi, used, alloc;
 	static const char *memory[] = {
 		"EDC0:", "EDC1:", "MC:", "MC0:", "MC1:", "HMA:"
 	};
 	static const char *region[] = {
 		"DBQ contexts:", "IMSG contexts:", "FLM cache:", "TCBs:",
 		"Pstructs:", "Timers:", "Rx FL:", "Tx FL:", "Pstruct FL:",
 		"Tx payload:", "Rx payload:", "LE hash:", "iSCSI region:",
 		"TDDP region:", "TPT region:", "STAG region:", "RQ region:",
 		"RQUDP region:", "PBL region:", "TXPBL region:",
 		"DBVFIFO region:", "ULPRX state:", "ULPTX state:",
 		"On-chip queues:", "TLS keys:",
 	};
 	struct mem_desc avail[4];
 	struct mem_desc mem[nitems(region) + 3];	/* up to 3 holes */
 	struct mem_desc *md = mem;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	for (i = 0; i < nitems(mem); i++) {
 		mem[i].limit = 0;
 		mem[i].idx = i;
 	}
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc)) {
 		rc = ENXIO;
 		goto done;
 	}
 
 	/* Find and sort the populated memory ranges */
 	i = 0;
 	lo = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
 	if (lo & F_EDRAM0_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EDRAM0_BAR);
 		avail[i].base = G_EDRAM0_BASE(hi) << 20;
 		avail[i].limit = avail[i].base + (G_EDRAM0_SIZE(hi) << 20);
 		avail[i].idx = 0;
 		i++;
 	}
 	if (lo & F_EDRAM1_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EDRAM1_BAR);
 		avail[i].base = G_EDRAM1_BASE(hi) << 20;
 		avail[i].limit = avail[i].base + (G_EDRAM1_SIZE(hi) << 20);
 		avail[i].idx = 1;
 		i++;
 	}
 	if (lo & F_EXT_MEM_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR);
 		avail[i].base = G_EXT_MEM_BASE(hi) << 20;
 		avail[i].limit = avail[i].base + (G_EXT_MEM_SIZE(hi) << 20);
 		avail[i].idx = is_t5(sc) ? 3 : 2;	/* Call it MC0 for T5 */
 		i++;
 	}
 	if (is_t5(sc) && lo & F_EXT_MEM1_ENABLE) {
 		hi = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR);
 		avail[i].base = G_EXT_MEM1_BASE(hi) << 20;
 		avail[i].limit = avail[i].base + (G_EXT_MEM1_SIZE(hi) << 20);
 		avail[i].idx = 4;
 		i++;
 	}
 	if (is_t6(sc) && lo & F_HMA_MUX) {
 		hi = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR);
 		avail[i].base = G_EXT_MEM1_BASE(hi) << 20;
 		avail[i].limit = avail[i].base + (G_EXT_MEM1_SIZE(hi) << 20);
 		avail[i].idx = 5;
 		i++;
 	}
 	MPASS(i <= nitems(avail));
 	if (!i)                                    /* no memory available */
 		goto done;
 	qsort(avail, i, sizeof(struct mem_desc), mem_desc_cmp);
 
 	(md++)->base = t4_read_reg(sc, A_SGE_DBQ_CTXT_BADDR);
 	(md++)->base = t4_read_reg(sc, A_SGE_IMSG_CTXT_BADDR);
 	(md++)->base = t4_read_reg(sc, A_SGE_FLM_CACHE_BADDR);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_TCB_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_TIMER_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_RX_FLST_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_TX_FLST_BASE);
 	(md++)->base = t4_read_reg(sc, A_TP_CMM_MM_PS_FLST_BASE);
 
 	/* the next few have explicit upper bounds */
 	md->base = t4_read_reg(sc, A_TP_PMM_TX_BASE);
 	md->limit = md->base - 1 +
 		    t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE) *
 		    G_PMTXMAXPAGE(t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE));
 	md++;
 
 	md->base = t4_read_reg(sc, A_TP_PMM_RX_BASE);
 	md->limit = md->base - 1 +
 		    t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) *
 		    G_PMRXMAXPAGE(t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE));
 	md++;
 
 	if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) {
 		if (chip_id(sc) <= CHELSIO_T5)
 			md->base = t4_read_reg(sc, A_LE_DB_HASH_TID_BASE);
 		else
 			md->base = t4_read_reg(sc, A_LE_DB_HASH_TBL_BASE_ADDR);
 		md->limit = 0;
 	} else {
 		md->base = 0;
 		md->idx = nitems(region);  /* hide it */
 	}
 	md++;
 
 #define ulp_region(reg) \
 	md->base = t4_read_reg(sc, A_ULP_ ## reg ## _LLIMIT);\
 	(md++)->limit = t4_read_reg(sc, A_ULP_ ## reg ## _ULIMIT)
 
 	ulp_region(RX_ISCSI);
 	ulp_region(RX_TDDP);
 	ulp_region(TX_TPT);
 	ulp_region(RX_STAG);
 	ulp_region(RX_RQ);
 	ulp_region(RX_RQUDP);
 	ulp_region(RX_PBL);
 	ulp_region(TX_PBL);
 #undef ulp_region
 
 	md->base = 0;
 	if (is_t4(sc))
 		md->idx = nitems(region);
 	else {
 		uint32_t size = 0;
 		uint32_t sge_ctrl = t4_read_reg(sc, A_SGE_CONTROL2);
 		uint32_t fifo_size = t4_read_reg(sc, A_SGE_DBVFIFO_SIZE);
 
 		if (is_t5(sc)) {
 			if (sge_ctrl & F_VFIFO_ENABLE)
 				size = fifo_size << 2;
 		} else
 			size = G_T6_DBVFIFO_SIZE(fifo_size) << 6;
 
 		if (size) {
 			md->base = t4_read_reg(sc, A_SGE_DBVFIFO_BADDR);
 			md->limit = md->base + size - 1;
 		} else
 			md->idx = nitems(region);
 	}
 	md++;
 
 	md->base = t4_read_reg(sc, A_ULP_RX_CTX_BASE);
 	md->limit = 0;
 	md++;
 	md->base = t4_read_reg(sc, A_ULP_TX_ERR_TABLE_BASE);
 	md->limit = 0;
 	md++;
 
 	md->base = sc->vres.ocq.start;
 	if (sc->vres.ocq.size)
 		md->limit = md->base + sc->vres.ocq.size - 1;
 	else
 		md->idx = nitems(region);  /* hide it */
 	md++;
 
 	md->base = sc->vres.key.start;
 	if (sc->vres.key.size)
 		md->limit = md->base + sc->vres.key.size - 1;
 	else
 		md->idx = nitems(region);  /* hide it */
 	md++;
 
 	/* add any address-space holes, there can be up to 3 */
 	for (n = 0; n < i - 1; n++)
 		if (avail[n].limit < avail[n + 1].base)
 			(md++)->base = avail[n].limit;
 	if (avail[n].limit)
 		(md++)->base = avail[n].limit;
 
 	n = md - mem;
 	qsort(mem, n, sizeof(struct mem_desc), mem_desc_cmp);
 
 	for (lo = 0; lo < i; lo++)
 		mem_region_show(sb, memory[avail[lo].idx], avail[lo].base,
 				avail[lo].limit - 1);
 
 	sbuf_printf(sb, "\n");
 	for (i = 0; i < n; i++) {
 		if (mem[i].idx >= nitems(region))
 			continue;                        /* skip holes */
 		if (!mem[i].limit)
 			mem[i].limit = i < n - 1 ? mem[i + 1].base - 1 : ~0;
 		mem_region_show(sb, region[mem[i].idx], mem[i].base,
 				mem[i].limit);
 	}
 
 	sbuf_printf(sb, "\n");
 	lo = t4_read_reg(sc, A_CIM_SDRAM_BASE_ADDR);
 	hi = t4_read_reg(sc, A_CIM_SDRAM_ADDR_SIZE) + lo - 1;
 	mem_region_show(sb, "uP RAM:", lo, hi);
 
 	lo = t4_read_reg(sc, A_CIM_EXTMEM2_BASE_ADDR);
 	hi = t4_read_reg(sc, A_CIM_EXTMEM2_ADDR_SIZE) + lo - 1;
 	mem_region_show(sb, "uP Extmem2:", lo, hi);
 
 	lo = t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE);
 	sbuf_printf(sb, "\n%u Rx pages of size %uKiB for %u channels\n",
 		   G_PMRXMAXPAGE(lo),
 		   t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) >> 10,
 		   (lo & F_PMRXNUMCHN) ? 2 : 1);
 
 	lo = t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE);
 	hi = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
 	sbuf_printf(sb, "%u Tx pages of size %u%ciB for %u channels\n",
 		   G_PMTXMAXPAGE(lo),
 		   hi >= (1 << 20) ? (hi >> 20) : (hi >> 10),
 		   hi >= (1 << 20) ? 'M' : 'K', 1 << G_PMTXNUMCHN(lo));
 	sbuf_printf(sb, "%u p-structs\n",
 		   t4_read_reg(sc, A_TP_CMM_MM_MAX_PSTRUCT));
 
 	for (i = 0; i < 4; i++) {
 		if (chip_id(sc) > CHELSIO_T5)
 			lo = t4_read_reg(sc, A_MPS_RX_MAC_BG_PG_CNT0 + i * 4);
 		else
 			lo = t4_read_reg(sc, A_MPS_RX_PG_RSV0 + i * 4);
 		if (is_t5(sc)) {
 			used = G_T5_USED(lo);
 			alloc = G_T5_ALLOC(lo);
 		} else {
 			used = G_USED(lo);
 			alloc = G_ALLOC(lo);
 		}
 		/* For T6 these are MAC buffer groups */
 		sbuf_printf(sb, "\nPort %d using %u pages out of %u allocated",
 		    i, used, alloc);
 	}
 	for (i = 0; i < sc->chip_params->nchan; i++) {
 		if (chip_id(sc) > CHELSIO_T5)
 			lo = t4_read_reg(sc, A_MPS_RX_LPBK_BG_PG_CNT0 + i * 4);
 		else
 			lo = t4_read_reg(sc, A_MPS_RX_PG_RSV4 + i * 4);
 		if (is_t5(sc)) {
 			used = G_T5_USED(lo);
 			alloc = G_T5_ALLOC(lo);
 		} else {
 			used = G_USED(lo);
 			alloc = G_ALLOC(lo);
 		}
 		/* For T6 these are MAC buffer groups */
 		sbuf_printf(sb,
 		    "\nLoopback %d using %u pages out of %u allocated",
 		    i, used, alloc);
 	}
 done:
 	mtx_unlock(&sc->reg_lock);
 	if (rc == 0)
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 	return (rc);
 }
 
 /*
  * Turn one TCAM X/Y pair into a mask and an address.  The mask is the
  * bitwise OR of x and y.  The address is taken from y's big-endian image:
  * the ETHER_ADDR_LEN bytes that start two bytes into it are copied to addr.
  */
 static inline void
 tcamxy2valmask(uint64_t x, uint64_t y, uint8_t *addr, uint64_t *mask)
 {
 	uint64_t y_be;
 
 	y_be = htobe64(y);
 	*mask = x | y;
 	/* Skip the two leading (most significant) bytes of the image. */
 	memcpy(addr, (const char *)&y_be + 2, ETHER_ADDR_LEN);
 }
 
 /*
  * Sysctl handler that dumps the MPS classification TCAM on chips up to and
  * including T5 (asserted via MPASS).  One row is printed for every TCAM
  * index whose X and Y words have no bit in common.
  *
  * Returns 0 on success, ENOMEM if the sbuf cannot be allocated, ENXIO if the
  * hardware is off limits, or the error from sysctl_wire_old_buffer(),
  * begin_synchronized_op(), the mailbox command, or sbuf_finish().
  */
 static int
 sysctl_mps_tcam(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 
 	MPASS(chip_id(sc) <= CHELSIO_T5);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb,
 	    "Idx  Ethernet address     Mask     Vld Ports PF"
 	    "  VF              Replication             P0 P1 P2 P3  ML");
 	for (i = 0; i < sc->chip_params->mps_tcam_size; i++) {
 		uint64_t tcamx, tcamy, mask;
 		uint32_t cls_lo, cls_hi;
 		uint8_t addr[ETHER_ADDR_LEN];
 
 		/* Registers are only touched under reg_lock and when usable. */
 		mtx_lock(&sc->reg_lock);
 		if (hw_off_limits(sc))
 			rc = ENXIO;
 		else {
 			tcamy = t4_read_reg64(sc, MPS_CLS_TCAM_Y_L(i));
 			tcamx = t4_read_reg64(sc, MPS_CLS_TCAM_X_L(i));
 		}
 		mtx_unlock(&sc->reg_lock);
 		if (rc != 0)
 			break;
 		/* Skip entries with overlapping X/Y bits (presumably unused). */
 		if (tcamx & tcamy)
 			continue;
 		tcamxy2valmask(tcamx, tcamy, addr, &mask);
 		mtx_lock(&sc->reg_lock);
 		if (hw_off_limits(sc))
 			rc = ENXIO;
 		else {
 			cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i));
 			cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i));
 		}
 		mtx_unlock(&sc->reg_lock);
 		if (rc != 0)
 			break;
 		sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x %012jx"
 			   "  %c   %#x%4u%4d", i, addr[0], addr[1], addr[2],
 			   addr[3], addr[4], addr[5], (uintmax_t)mask,
 			   (cls_lo & F_SRAM_VLD) ? 'Y' : 'N',
 			   G_PORTMAP(cls_hi), G_PF(cls_lo),
 			   (cls_lo & F_VF_VALID) ? G_VF(cls_lo) : -1);
 
 		if (cls_lo & F_REPLICATE) {
 			struct fw_ldst_cmd ldst_cmd;
 
 			/*
 			 * Build a FW_LDST_CMD read of the MPS address space
 			 * for this index's replication vector.
 			 */
 			memset(&ldst_cmd, 0, sizeof(ldst_cmd));
 			ldst_cmd.op_to_addrspace =
 			    htobe32(V_FW_CMD_OP(FW_LDST_CMD) |
 				F_FW_CMD_REQUEST | F_FW_CMD_READ |
 				V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS));
 			ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd));
 			ldst_cmd.u.mps.rplc.fid_idx =
 			    htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) |
 				V_FW_LDST_CMD_IDX(i));
 
 			rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
 			    "t4mps");
 			if (rc)
 				break;
 			if (hw_off_limits(sc))
 				rc = ENXIO;
 			else
 				rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd,
 				    sizeof(ldst_cmd), &ldst_cmd);
 			end_synchronized_op(sc, 0);
 			if (rc != 0)
 				break;
 			else {
 				/* The reply overwrote ldst_cmd; print it. */
 				sbuf_printf(sb, " %08x %08x %08x %08x",
 				    be32toh(ldst_cmd.u.mps.rplc.rplc127_96),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc95_64),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc63_32),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc31_0));
 			}
 		} else
 			/* Blank filler as wide as the four " %08x" columns. */
 			sbuf_printf(sb, "%36s", "");
 
 		sbuf_printf(sb, "%4u%3u%3u%3u %#3x", G_SRAM_PRIO0(cls_lo),
 		    G_SRAM_PRIO1(cls_lo), G_SRAM_PRIO2(cls_lo),
 		    G_SRAM_PRIO3(cls_lo), (cls_lo >> S_MULTILISTEN0) & 0xf);
 	}
 
 	/* Keep an earlier error; the sbuf is finished either way. */
 	if (rc)
 		(void) sbuf_finish(sb);
 	else
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc, i;
 
 	MPASS(chip_id(sc) > CHELSIO_T5);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb, "Idx  Ethernet address     Mask       VNI   Mask"
 	    "   IVLAN Vld DIP_Hit   Lookup  Port Vld Ports PF  VF"
 	    "                           Replication"
 	    "                                    P0 P1 P2 P3  ML\n");
 
 	for (i = 0; i < sc->chip_params->mps_tcam_size; i++) {
 		uint8_t dip_hit, vlan_vld, lookup_type, port_num;
 		uint16_t ivlan;
 		uint64_t tcamx, tcamy, val, mask;
 		uint32_t cls_lo, cls_hi, ctl, data2, vnix, vniy;
 		uint8_t addr[ETHER_ADDR_LEN];
 
 		ctl = V_CTLREQID(1) | V_CTLCMDTYPE(0) | V_CTLXYBITSEL(0);
 		if (i < 256)
 			ctl |= V_CTLTCAMINDEX(i) | V_CTLTCAMSEL(0);
 		else
 			ctl |= V_CTLTCAMINDEX(i - 256) | V_CTLTCAMSEL(1);
 		mtx_lock(&sc->reg_lock);
 		if (hw_off_limits(sc))
 			rc = ENXIO;
 		else {
 			t4_write_reg(sc, A_MPS_CLS_TCAM_DATA2_CTL, ctl);
 			val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1);
 			tcamy = G_DMACH(val) << 32;
 			tcamy |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1);
 			data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1);
 		}
 		mtx_unlock(&sc->reg_lock);
 		if (rc != 0)
 			break;
 
 		lookup_type = G_DATALKPTYPE(data2);
 		port_num = G_DATAPORTNUM(data2);
 		if (lookup_type && lookup_type != M_DATALKPTYPE) {
 			/* Inner header VNI */
 			vniy = ((data2 & F_DATAVIDH2) << 23) |
 				       (G_DATAVIDH1(data2) << 16) | G_VIDL(val);
 			dip_hit = data2 & F_DATADIPHIT;
 			vlan_vld = 0;
 		} else {
 			vniy = 0;
 			dip_hit = 0;
 			vlan_vld = data2 & F_DATAVIDH2;
 			ivlan = G_VIDL(val);
 		}
 
 		ctl |= V_CTLXYBITSEL(1);
 		mtx_lock(&sc->reg_lock);
 		if (hw_off_limits(sc))
 			rc = ENXIO;
 		else {
 			t4_write_reg(sc, A_MPS_CLS_TCAM_DATA2_CTL, ctl);
 			val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1);
 			tcamx = G_DMACH(val) << 32;
 			tcamx |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1);
 			data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1);
 		}
 		mtx_unlock(&sc->reg_lock);
 		if (rc != 0)
 			break;
 
 		if (lookup_type && lookup_type != M_DATALKPTYPE) {
 			/* Inner header VNI mask */
 			vnix = ((data2 & F_DATAVIDH2) << 23) |
 			       (G_DATAVIDH1(data2) << 16) | G_VIDL(val);
 		} else
 			vnix = 0;
 
 		if (tcamx & tcamy)
 			continue;
 		tcamxy2valmask(tcamx, tcamy, addr, &mask);
 
 		mtx_lock(&sc->reg_lock);
 		if (hw_off_limits(sc))
 			rc = ENXIO;
 		else {
 			cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i));
 			cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i));
 		}
 		mtx_unlock(&sc->reg_lock);
 		if (rc != 0)
 			break;
 
 		if (lookup_type && lookup_type != M_DATALKPTYPE) {
 			sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x "
 			    "%012jx %06x %06x    -    -   %3c"
 			    "        I  %4x   %3c   %#x%4u%4d", i, addr[0],
 			    addr[1], addr[2], addr[3], addr[4], addr[5],
 			    (uintmax_t)mask, vniy, vnix, dip_hit ? 'Y' : 'N',
 			    port_num, cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N',
 			    G_PORTMAP(cls_hi), G_T6_PF(cls_lo),
 			    cls_lo & F_T6_VF_VALID ? G_T6_VF(cls_lo) : -1);
 		} else {
 			sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x "
 			    "%012jx    -       -   ", i, addr[0], addr[1],
 			    addr[2], addr[3], addr[4], addr[5],
 			    (uintmax_t)mask);
 
 			if (vlan_vld)
 				sbuf_printf(sb, "%4u   Y     ", ivlan);
 			else
 				sbuf_printf(sb, "  -    N     ");
 
 			sbuf_printf(sb, "-      %3c  %4x   %3c   %#x%4u%4d",
 			    lookup_type ? 'I' : 'O', port_num,
 			    cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N',
 			    G_PORTMAP(cls_hi), G_T6_PF(cls_lo),
 			    cls_lo & F_T6_VF_VALID ? G_T6_VF(cls_lo) : -1);
 		}
 
 
 		if (cls_lo & F_T6_REPLICATE) {
 			struct fw_ldst_cmd ldst_cmd;
 
 			memset(&ldst_cmd, 0, sizeof(ldst_cmd));
 			ldst_cmd.op_to_addrspace =
 			    htobe32(V_FW_CMD_OP(FW_LDST_CMD) |
 				F_FW_CMD_REQUEST | F_FW_CMD_READ |
 				V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS));
 			ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd));
 			ldst_cmd.u.mps.rplc.fid_idx =
 			    htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) |
 				V_FW_LDST_CMD_IDX(i));
 
 			rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
 			    "t6mps");
 			if (rc)
 				break;
 			if (hw_off_limits(sc))
 				rc = ENXIO;
 			else
 				rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd,
 				    sizeof(ldst_cmd), &ldst_cmd);
 			end_synchronized_op(sc, 0);
 			if (rc != 0)
 				break;
 			else {
 				sbuf_printf(sb, " %08x %08x %08x %08x"
 				    " %08x %08x %08x %08x",
 				    be32toh(ldst_cmd.u.mps.rplc.rplc255_224),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc223_192),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc191_160),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc159_128),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc127_96),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc95_64),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc63_32),
 				    be32toh(ldst_cmd.u.mps.rplc.rplc31_0));
 			}
 		} else
 			sbuf_printf(sb, "%72s", "");
 
 		sbuf_printf(sb, "%4u%3u%3u%3u %#x",
 		    G_T6_SRAM_PRIO0(cls_lo), G_T6_SRAM_PRIO1(cls_lo),
 		    G_T6_SRAM_PRIO2(cls_lo), G_T6_SRAM_PRIO3(cls_lo),
 		    (cls_lo >> S_T6_MULTILISTEN0) & 0xf);
 	}
 
 	if (rc)
 		(void) sbuf_finish(sb);
 	else
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler: report the first 16 entries of the table filled in by
  * t4_read_mtu_tbl() as space-separated decimal values on one line.
  *
  * Returns 0 on success, ENXIO if the hardware is off limits, ENOMEM if the
  * sbuf cannot be allocated, or the error from sysctl_wire_old_buffer() or
  * sbuf_finish().
  */
 static int
 sysctl_path_mtus(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	uint16_t mtus[NMTUS];
 	struct sbuf *sb;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	/* Read the table under reg_lock, and only if the hardware is usable. */
 	mtx_lock(&sc->reg_lock);
 	if (!hw_off_limits(sc))
 		t4_read_mtu_tbl(sc, mtus, NULL);
 	else
 		rc = ENXIO;
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb, "%u %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u",
 	    mtus[0], mtus[1], mtus[2], mtus[3],
 	    mtus[4], mtus[5], mtus[6], mtus[7],
 	    mtus[8], mtus[9], mtus[10], mtus[11],
 	    mtus[12], mtus[13], mtus[14], mtus[15]);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler: display the statistics returned by t4_pmtx_get_stats()
  * and t4_pmrx_get_stats().  The first four entries of each direction are
  * always shown.  Chips newer than T5 additionally show entries 4 and 6.
  *
  * Returns 0 on success, ENXIO if the hardware is off limits, ENOMEM if the
  * sbuf cannot be allocated, or the error from sysctl_wire_old_buffer() or
  * sbuf_finish().
  */
 static int
 sysctl_pm_stats(SYSCTL_HANDLER_ARGS)
 {
 	static const char *tx_stats[MAX_PM_NSTATS] = {
 		"Read:", "Write bypass:", "Write mem:", "Bypass + mem:",
 		"Tx FIFO wait", NULL, "Tx latency"
 	};
 	static const char *rx_stats[MAX_PM_NSTATS] = {
 		"Read:", "Write bypass:", "Write mem:", "Flush:",
 		"Rx FIFO wait", NULL, "Rx latency"
 	};
 	struct adapter *sc = arg1;
 	uint64_t tx_cyc[MAX_PM_NSTATS], rx_cyc[MAX_PM_NSTATS];
 	uint32_t tx_cnt[MAX_PM_NSTATS], rx_cnt[MAX_PM_NSTATS];
 	struct sbuf *sb;
 	int rc, i;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	/* Snapshot both directions while holding reg_lock. */
 	mtx_lock(&sc->reg_lock);
 	if (!hw_off_limits(sc)) {
 		t4_pmtx_get_stats(sc, tx_cnt, tx_cyc);
 		t4_pmrx_get_stats(sc, rx_cnt, rx_cyc);
 	} else
 		rc = ENXIO;
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb, "                Tx pcmds             Tx bytes");
 	for (i = 0; i < 4; i++)
 		sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i],
 		    tx_cyc[i]);
 
 	sbuf_printf(sb, "\n                Rx pcmds             Rx bytes");
 	for (i = 0; i < 4; i++)
 		sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i],
 		    rx_cyc[i]);
 
 	/* Nothing more to show on T5 and earlier. */
 	if (chip_id(sc) <= CHELSIO_T5)
 		goto finish;
 
 	/* i is 4 here: the "FIFO wait" entries. */
 	sbuf_printf(sb,
 	    "\n              Total wait      Total occupancy");
 	sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i],
 	    tx_cyc[i]);
 	sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i],
 	    rx_cyc[i]);
 
 	/* Step over the unnamed entry to the "latency" entries. */
 	i += 2;
 	MPASS(i < nitems(tx_stats));
 
 	sbuf_printf(sb,
 	    "\n                   Reads           Total wait");
 	sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i],
 	    tx_cyc[i]);
 	sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i],
 	    rx_cyc[i]);
 
 finish:
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler: print the two RQE deferral counters filled in by
  * t4_tp_get_rdma_stats().
  *
  * Returns 0 on success, ENXIO if the hardware is off limits, ENOMEM if the
  * sbuf cannot be allocated, or the error from sysctl_wire_old_buffer() or
  * sbuf_finish().
  */
 static int
 sysctl_rdma_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct tp_rdma_stats stats;
 	struct sbuf *sb;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	/* Collect the counters under reg_lock if the hardware is usable. */
 	mtx_lock(&sc->reg_lock);
 	if (!hw_off_limits(sc))
 		t4_tp_get_rdma_stats(sc, &stats, 0);
 	else
 		rc = ENXIO;
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb, "NoRQEModDefferals: %u\n", stats.rqe_dfr_mod);
 	sbuf_printf(sb, "NoRQEPktDefferals: %u", stats.rqe_dfr_pkt);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler: print the TCP counters filled in by
  * t4_tp_get_tcp_stats(), with the first set under the "IP" column and the
  * second under the "IPv6" column.
  *
  * Returns 0 on success, ENXIO if the hardware is off limits, ENOMEM if the
  * sbuf cannot be allocated, or the error from sysctl_wire_old_buffer() or
  * sbuf_finish().
  */
 static int
 sysctl_tcp_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct tp_tcp_stats v4, v6;
 	struct sbuf *sb;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	/* Collect both sets under reg_lock if the hardware is usable. */
 	mtx_lock(&sc->reg_lock);
 	if (!hw_off_limits(sc))
 		t4_tp_get_tcp_stats(sc, &v4, &v6, 0);
 	else
 		rc = ENXIO;
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	sbuf_printf(sb,
 	    "                                IP                 IPv6\n");
 	sbuf_printf(sb, "OutRsts:      %20u %20u\n",
 	    v4.tcp_out_rsts,
 	    v6.tcp_out_rsts);
 	sbuf_printf(sb, "InSegs:       %20ju %20ju\n",
 	    v4.tcp_in_segs,
 	    v6.tcp_in_segs);
 	sbuf_printf(sb, "OutSegs:      %20ju %20ju\n",
 	    v4.tcp_out_segs,
 	    v6.tcp_out_segs);
 	sbuf_printf(sb, "RetransSegs:  %20ju %20ju",
 	    v4.tcp_retrans_segs,
 	    v6.tcp_retrans_segs);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler: describe the ATID, HPFTID, TID, STID, FTID and ETID
  * ranges recorded in sc->tids, each with its in-use count, followed by the
  * IPv4 and IPv6 user counts read from the LE_DB_ACT_CNT registers.  Ranges
  * whose size is zero are omitted.
  *
  * Returns 0 on success, ENXIO if the hardware is off limits, ENOMEM if the
  * sbuf cannot be allocated, or the error from sysctl_wire_old_buffer() or
  * sbuf_finish().
  */
 static int
 sysctl_tids(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct tid_info *t = &sc->tids;
 	struct sbuf *sb;
 	uint32_t x, y;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	if (t->natids != 0)
 		sbuf_printf(sb, "ATID range: 0-%u, in use: %u\n", t->natids - 1,
 		    t->atids_in_use);
 
 	if (t->nhpftids != 0)
 		sbuf_printf(sb, "HPFTID range: %u-%u, in use: %u\n",
 		    t->hpftid_base, t->hpftid_end, t->hpftids_in_use);
 
 	if (t->ntids != 0) {
 		bool hashen = false;
 
 		/*
 		 * With F_HASHEN set the TID space is reported as two pieces,
 		 * so fetch the two boundaries (x and y) as well.
 		 */
 		mtx_lock(&sc->reg_lock);
 		if (hw_off_limits(sc)) {
 			rc = ENXIO;
 		} else if ((t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) != 0) {
 			hashen = true;
 			if (chip_id(sc) > CHELSIO_T5) {
 				x = t4_read_reg(sc, A_LE_DB_SRVR_START_INDEX);
 				y = t4_read_reg(sc, A_T6_LE_DB_HASH_TID_BASE);
 			} else {
 				x = t4_read_reg(sc, A_LE_DB_SERVER_INDEX) / 4;
 				y = t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4;
 			}
 		}
 		mtx_unlock(&sc->reg_lock);
 		if (rc != 0)
 			goto done;
 
 		sbuf_printf(sb, "TID range: ");
 		if (!hashen) {
 			sbuf_printf(sb, "%u-%u", t->tid_base, t->tid_base +
 			    t->ntids - 1);
 		} else {
 			if (x != 0)
 				sbuf_printf(sb, "%u-%u, ", t->tid_base, x - 1);
 			sbuf_printf(sb, "%u-%u", y, t->ntids - 1);
 		}
 		sbuf_printf(sb, ", in use: %u\n",
 		    atomic_load_acq_int(&t->tids_in_use));
 	}
 
 	if (t->nstids != 0)
 		sbuf_printf(sb, "STID range: %u-%u, in use: %u\n", t->stid_base,
 		    t->stid_base + t->nstids - 1, t->stids_in_use);
 
 	if (t->nftids != 0)
 		sbuf_printf(sb, "FTID range: %u-%u, in use: %u\n", t->ftid_base,
 		    t->ftid_end, t->ftids_in_use);
 
 	if (t->netids != 0)
 		sbuf_printf(sb, "ETID range: %u-%u, in use: %u\n", t->etid_base,
 		    t->etid_base + t->netids - 1, t->etids_in_use);
 
 	/* x and y are reused for the two usage counters. */
 	mtx_lock(&sc->reg_lock);
 	if (!hw_off_limits(sc)) {
 		x = t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV4);
 		y = t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV6);
 	} else
 		rc = ENXIO;
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		goto done;
 	sbuf_printf(sb, "HW TID usage: %u IP users, %u IPv6 users", x, y);
 done:
 	/* Keep an earlier error; the sbuf is finished either way. */
 	if (rc != 0)
 		(void)sbuf_finish(sb);
 	else
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler: print the per-channel error and drop counters filled in
  * by t4_tp_get_err_stats(), using four columns when the chip has more than
  * two channels and two columns otherwise, followed by the two
  * adapter-wide offload counters.
  *
  * Returns 0 on success, ENXIO if the hardware is off limits, ENOMEM if the
  * sbuf cannot be allocated, or the error from sysctl_wire_old_buffer() or
  * sbuf_finish().
  */
 static int
 sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS)
 {
 /* Expand to the first four / first two elements of one stats array. */
 #define	ERR_STATS_4CH(f)	stats.f[0], stats.f[1], stats.f[2], stats.f[3]
 #define	ERR_STATS_2CH(f)	stats.f[0], stats.f[1]
 	struct adapter *sc = arg1;
 	struct tp_err_stats stats;
 	struct sbuf *sb;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	/* Collect the counters under reg_lock if the hardware is usable. */
 	mtx_lock(&sc->reg_lock);
 	if (!hw_off_limits(sc))
 		t4_tp_get_err_stats(sc, &stats, 0);
 	else
 		rc = ENXIO;
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	if (sc->chip_params->nchan <= 2) {
 		sbuf_printf(sb, "                 channel 0  channel 1\n");
 		sbuf_printf(sb, "macInErrs:      %10u %10u\n",
 		    ERR_STATS_2CH(mac_in_errs));
 		sbuf_printf(sb, "hdrInErrs:      %10u %10u\n",
 		    ERR_STATS_2CH(hdr_in_errs));
 		sbuf_printf(sb, "tcpInErrs:      %10u %10u\n",
 		    ERR_STATS_2CH(tcp_in_errs));
 		sbuf_printf(sb, "tcp6InErrs:     %10u %10u\n",
 		    ERR_STATS_2CH(tcp6_in_errs));
 		sbuf_printf(sb, "tnlCongDrops:   %10u %10u\n",
 		    ERR_STATS_2CH(tnl_cong_drops));
 		sbuf_printf(sb, "tnlTxDrops:     %10u %10u\n",
 		    ERR_STATS_2CH(tnl_tx_drops));
 		sbuf_printf(sb, "ofldVlanDrops:  %10u %10u\n",
 		    ERR_STATS_2CH(ofld_vlan_drops));
 		sbuf_printf(sb, "ofldChanDrops:  %10u %10u\n\n",
 		    ERR_STATS_2CH(ofld_chan_drops));
 	} else {
 		sbuf_printf(sb, "                 channel 0  channel 1"
 		    "  channel 2  channel 3\n");
 		sbuf_printf(sb, "macInErrs:      %10u %10u %10u %10u\n",
 		    ERR_STATS_4CH(mac_in_errs));
 		sbuf_printf(sb, "hdrInErrs:      %10u %10u %10u %10u\n",
 		    ERR_STATS_4CH(hdr_in_errs));
 		sbuf_printf(sb, "tcpInErrs:      %10u %10u %10u %10u\n",
 		    ERR_STATS_4CH(tcp_in_errs));
 		sbuf_printf(sb, "tcp6InErrs:     %10u %10u %10u %10u\n",
 		    ERR_STATS_4CH(tcp6_in_errs));
 		sbuf_printf(sb, "tnlCongDrops:   %10u %10u %10u %10u\n",
 		    ERR_STATS_4CH(tnl_cong_drops));
 		sbuf_printf(sb, "tnlTxDrops:     %10u %10u %10u %10u\n",
 		    ERR_STATS_4CH(tnl_tx_drops));
 		sbuf_printf(sb, "ofldVlanDrops:  %10u %10u %10u %10u\n",
 		    ERR_STATS_4CH(ofld_vlan_drops));
 		sbuf_printf(sb, "ofldChanDrops:  %10u %10u %10u %10u\n\n",
 		    ERR_STATS_4CH(ofld_chan_drops));
 	}
 
 	sbuf_printf(sb, "ofldNoNeigh:    %u\nofldCongDefer:  %u",
 	    stats.ofld_no_neigh, stats.ofld_cong_defer);
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 #undef ERR_STATS_2CH
 #undef ERR_STATS_4CH
 }
 
 /*
  * Sysctl handler: print the per-channel OutPkts and InPkts counters filled
  * in by t4_tp_get_tnl_stats(), using four columns when the chip has more
  * than two channels and two columns otherwise.
  *
  * Returns 0 on success, ENXIO if the hardware is off limits, ENOMEM if the
  * sbuf cannot be allocated, or the error from sysctl_wire_old_buffer() or
  * sbuf_finish().
  */
 static int
 sysctl_tnl_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	struct tp_tnl_stats stats;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else {
 		/*
 		 * reg_lock is a mutex and is held across this call, so pass 0
 		 * as the final argument, exactly like the other TP stats
 		 * handlers in this file do.
 		 * NOTE(review): assumes the final argument is the "may sleep"
 		 * flag; confirm against t4_tp_get_tnl_stats()'s prototype.
 		 */
 		t4_tp_get_tnl_stats(sc, &stats, 0);
 	}
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	if (sc->chip_params->nchan > 2) {
 		sbuf_printf(sb, "           channel 0  channel 1"
 		    "  channel 2  channel 3\n");
 		sbuf_printf(sb, "OutPkts:  %10u %10u %10u %10u\n",
 		    stats.out_pkt[0], stats.out_pkt[1],
 		    stats.out_pkt[2], stats.out_pkt[3]);
 		sbuf_printf(sb, "InPkts:   %10u %10u %10u %10u",
 		    stats.in_pkt[0], stats.in_pkt[1],
 		    stats.in_pkt[2], stats.in_pkt[3]);
 	} else {
 		sbuf_printf(sb, "           channel 0  channel 1\n");
 		sbuf_printf(sb, "OutPkts:  %10u %10u\n",
 		    stats.out_pkt[0], stats.out_pkt[1]);
 		sbuf_printf(sb, "InPkts:   %10u %10u",
 		    stats.in_pkt[0], stats.in_pkt[1]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler for the TP logic analyzer mask.  Reads report the upper
  * 16 bits of the cached la_mask.  Writes accept a value of at most 0xffff,
  * store it shifted into the upper 16 bits of the cache and write it to the
  * matching field of A_TP_DBG_LA_CONFIG.
  *
  * Returns 0 on success, EINVAL for an out-of-range value, ENXIO if the
  * hardware is off limits, or the error from sysctl_handle_int().
  */
 static int
 sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct tp_params *tpp = &sc->params.tp;
 	u_int mask = tpp->la_mask >> 16;
 	int rc;
 
 	rc = sysctl_handle_int(oidp, &mask, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);	/* error, or a plain read */
 	if (mask > 0xffff)
 		return (EINVAL);
 
 	mtx_lock(&sc->reg_lock);
 	if (!hw_off_limits(sc)) {
 		tpp->la_mask = mask << 16;
 		t4_set_reg_field(sc, A_TP_DBG_LA_CONFIG, 0xffff0000U,
 		    tpp->la_mask);
 	} else
 		rc = ENXIO;
 	mtx_unlock(&sc->reg_lock);
 
 	return (rc);
 }
 
 /*
  * Describes one named bit-field inside a 64-bit value, as consumed by
  * field_desc_show().  Tables of these end with an entry whose name is NULL.
  */
 struct field_desc {
 	const char *name;	/* label printed in front of the value */
 	u_int start;		/* position of the field's lowest bit */
 	u_int width;		/* number of bits in the field */
 };
 
 /*
  * Print every field of v described by the NULL-name-terminated table f as
  * "name: value " in decimal.  When appending a field would bring the
  * current line to 79 characters or more, a newline and eight spaces of
  * indentation are emitted first.  The output always ends with a newline.
  */
 static void
 field_desc_show(struct sbuf *sb, uint64_t v, const struct field_desc *f)
 {
 	char buf[32];
 	int line_size = 0;
 
 	for (; f->name != NULL; f++) {
 		const uint64_t mask = (1ULL << f->width) - 1;
 		const uintmax_t field = ((uintmax_t)v >> f->start) & mask;
 		const int len = snprintf(buf, sizeof(buf), "%s: %ju", f->name,
 		    field);
 
 		/* Wrap onto an indented continuation line when needed. */
 		if (line_size + len >= 79) {
 			sbuf_printf(sb, "\n        ");
 			line_size = 8;
 		}
 		sbuf_printf(sb, "%s ", buf);
 		line_size += len + 1;	/* +1 for the trailing space */
 	}
 	sbuf_printf(sb, "\n");
 }
 
 /*
  * Bit-field layout used to decode a 64-bit TP logic analyzer word; each
  * entry is { name, lowest bit, width }.  tp_la_show() uses it for every
  * word, tp_la_show2() for both words of a pair and tp_la_show3() for the
  * first word of a pair.
  */
 static const struct field_desc tp_la0[] = {
 	{ "RcfOpCodeOut", 60, 4 },
 	{ "State", 56, 4 },
 	{ "WcfState", 52, 4 },
 	{ "RcfOpcSrcOut", 50, 2 },
 	{ "CRxError", 49, 1 },
 	{ "ERxError", 48, 1 },
 	{ "SanityFailed", 47, 1 },
 	{ "SpuriousMsg", 46, 1 },
 	{ "FlushInputMsg", 45, 1 },
 	{ "FlushInputCpl", 44, 1 },
 	{ "RssUpBit", 43, 1 },
 	{ "RssFilterHit", 42, 1 },
 	{ "Tid", 32, 10 },
 	{ "InitTcb", 31, 1 },
 	{ "LineNumber", 24, 7 },
 	{ "Emsg", 23, 1 },
 	{ "EdataOut", 22, 1 },
 	{ "Cmsg", 21, 1 },
 	{ "CdataOut", 20, 1 },
 	{ "EreadPdu", 19, 1 },
 	{ "CreadPdu", 18, 1 },
 	{ "TunnelPkt", 17, 1 },
 	{ "RcfPeerFin", 16, 1 },
 	{ "RcfReasonOut", 12, 4 },
 	{ "TxCchannel", 10, 2 },
 	{ "RcfTxChannel", 8, 2 },
 	{ "RxEchannel", 6, 2 },
 	{ "RcfRxChannel", 5, 1 },
 	{ "RcfDataOutSrdy", 4, 1 },
 	{ "RxDvld", 3, 1 },
 	{ "RxOoDvld", 2, 1 },
 	{ "RxCongestion", 1, 1 },
 	{ "TxCongestion", 0, 1 },
 	{ NULL }	/* terminator */
 };
 
 /*
  * Bit-field layout for the second word of a TP logic analyzer pair; each
  * entry is { name, lowest bit, width }.  tp_la_show3() selects it when bit
  * 17 of the first word is clear.
  */
 static const struct field_desc tp_la1[] = {
 	{ "CplCmdIn", 56, 8 },
 	{ "CplCmdOut", 48, 8 },
 	{ "ESynOut", 47, 1 },
 	{ "EAckOut", 46, 1 },
 	{ "EFinOut", 45, 1 },
 	{ "ERstOut", 44, 1 },
 	{ "SynIn", 43, 1 },
 	{ "AckIn", 42, 1 },
 	{ "FinIn", 41, 1 },
 	{ "RstIn", 40, 1 },
 	{ "DataIn", 39, 1 },
 	{ "DataInVld", 38, 1 },
 	{ "PadIn", 37, 1 },
 	{ "RxBufEmpty", 36, 1 },
 	{ "RxDdp", 35, 1 },
 	{ "RxFbCongestion", 34, 1 },
 	{ "TxFbCongestion", 33, 1 },
 	{ "TxPktSumSrdy", 32, 1 },
 	{ "RcfUlpType", 28, 4 },
 	{ "Eread", 27, 1 },
 	{ "Ebypass", 26, 1 },
 	{ "Esave", 25, 1 },
 	{ "Static0", 24, 1 },
 	{ "Cread", 23, 1 },
 	{ "Cbypass", 22, 1 },
 	{ "Csave", 21, 1 },
 	{ "CPktOut", 20, 1 },
 	{ "RxPagePoolFull", 18, 2 },
 	{ "RxLpbkPkt", 17, 1 },
 	{ "TxLpbkPkt", 16, 1 },
 	{ "RxVfValid", 15, 1 },
 	{ "SynLearned", 14, 1 },
 	{ "SetDelEntry", 13, 1 },
 	{ "SetInvEntry", 12, 1 },
 	{ "CpcmdDvld", 11, 1 },
 	{ "CpcmdSave", 10, 1 },
 	{ "RxPstructsFull", 8, 2 },
 	{ "EpcmdDvld", 7, 1 },
 	{ "EpcmdFlush", 6, 1 },
 	{ "EpcmdTrimPrefix", 5, 1 },
 	{ "EpcmdTrimPostfix", 4, 1 },
 	{ "ERssIp4Pkt", 3, 1 },
 	{ "ERssIp6Pkt", 2, 1 },
 	{ "ERssTcpUdpPkt", 1, 1 },
 	{ "ERssFceFipPkt", 0, 1 },
 	{ NULL }	/* terminator */
 };
 
 /*
  * Alternate bit-field layout for the second word of a TP logic analyzer
  * pair; each entry is { name, lowest bit, width }.  tp_la_show3() selects
  * it when bit 17 of the first word is set.  It differs from tp_la1 only in
  * bits 55..44, which hold MpsVfVld/MpsPf/MpsVf here.
  */
 static const struct field_desc tp_la2[] = {
 	{ "CplCmdIn", 56, 8 },
 	{ "MpsVfVld", 55, 1 },
 	{ "MpsPf", 52, 3 },
 	{ "MpsVf", 44, 8 },
 	{ "SynIn", 43, 1 },
 	{ "AckIn", 42, 1 },
 	{ "FinIn", 41, 1 },
 	{ "RstIn", 40, 1 },
 	{ "DataIn", 39, 1 },
 	{ "DataInVld", 38, 1 },
 	{ "PadIn", 37, 1 },
 	{ "RxBufEmpty", 36, 1 },
 	{ "RxDdp", 35, 1 },
 	{ "RxFbCongestion", 34, 1 },
 	{ "TxFbCongestion", 33, 1 },
 	{ "TxPktSumSrdy", 32, 1 },
 	{ "RcfUlpType", 28, 4 },
 	{ "Eread", 27, 1 },
 	{ "Ebypass", 26, 1 },
 	{ "Esave", 25, 1 },
 	{ "Static0", 24, 1 },
 	{ "Cread", 23, 1 },
 	{ "Cbypass", 22, 1 },
 	{ "Csave", 21, 1 },
 	{ "CPktOut", 20, 1 },
 	{ "RxPagePoolFull", 18, 2 },
 	{ "RxLpbkPkt", 17, 1 },
 	{ "TxLpbkPkt", 16, 1 },
 	{ "RxVfValid", 15, 1 },
 	{ "SynLearned", 14, 1 },
 	{ "SetDelEntry", 13, 1 },
 	{ "SetInvEntry", 12, 1 },
 	{ "CpcmdDvld", 11, 1 },
 	{ "CpcmdSave", 10, 1 },
 	{ "RxPstructsFull", 8, 2 },
 	{ "EpcmdDvld", 7, 1 },
 	{ "EpcmdFlush", 6, 1 },
 	{ "EpcmdTrimPrefix", 5, 1 },
 	{ "EpcmdTrimPostfix", 4, 1 },
 	{ "ERssIp4Pkt", 3, 1 },
 	{ "ERssIp6Pkt", 2, 1 },
 	{ "ERssTcpUdpPkt", 1, 1 },
 	{ "ERssFceFipPkt", 0, 1 },
 	{ NULL }	/* terminator */
 };
 
 /*
  * Display routine for single-word logic analyzer entries: decode the word
  * at p with the tp_la0 layout.  idx is not used.
  */
 static void
 tp_la_show(struct sbuf *sb, uint64_t *p, int idx)
 {
 
 	field_desc_show(sb, p[0], tp_la0);
 }
 
 /*
  * Display routine for two-word logic analyzer entries in which both words
  * use the tp_la0 layout.  A blank line separates entries.  The second word
  * of the last entry is left out when it is all ones.
  */
 static void
 tp_la_show2(struct sbuf *sb, uint64_t *p, int idx)
 {
 
 	if (idx != 0)
 		sbuf_printf(sb, "\n");
 	field_desc_show(sb, p[0], tp_la0);
 
 	/* Last entry with an all-ones second word: nothing more to print. */
 	if (idx >= (TPLA_SIZE / 2 - 1) && p[1] == ~0ULL)
 		return;
 	field_desc_show(sb, p[1], tp_la0);
 }
 
 /*
  * Display routine for two-word logic analyzer entries whose second word is
  * decoded with tp_la2 when bit 17 of the first word is set and with tp_la1
  * otherwise.  A blank line separates entries.  The second word of the last
  * entry is left out when it is all ones.
  */
 static void
 tp_la_show3(struct sbuf *sb, uint64_t *p, int idx)
 {
 	const struct field_desc *second;
 
 	if (idx != 0)
 		sbuf_printf(sb, "\n");
 	field_desc_show(sb, p[0], tp_la0);
 
 	/* Last entry with an all-ones second word: nothing more to print. */
 	if (idx >= (TPLA_SIZE / 2 - 1) && p[1] == ~0ULL)
 		return;
 
 	if (p[0] & (1 << 17))
 		second = tp_la2;
 	else
 		second = tp_la1;
 	field_desc_show(sb, p[1], second);
 }
 
 /*
  * Sysctl handler: dump the TP logic analyzer.  The capture is read with
  * t4_tp_read_la() into a TPLA_SIZE-word buffer and the DBGLAMODE field of
  * A_TP_DBG_LA_CONFIG picks the display routine: modes 2 and 3 consume two
  * words per entry, every other mode one.
  *
  * Returns 0 on success, ENXIO if the hardware is off limits, ENOMEM if the
  * sbuf cannot be allocated, or the error from sysctl_wire_old_buffer() or
  * sbuf_finish().
  */
 static int
 sysctl_tp_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	void (*show_func)(struct sbuf *, uint64_t *, int);
 	struct sbuf *sb;
 	uint64_t *buf;
 	u_int i, inc, mode, nentries;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(TPLA_SIZE * sizeof(uint64_t), M_CXGBE, M_ZERO | M_WAITOK);
 
 	/* Capture the data and the current mode under reg_lock. */
 	mtx_lock(&sc->reg_lock);
 	if (!hw_off_limits(sc)) {
 		t4_tp_read_la(sc, buf, NULL);
 		mode = G_DBGLAMODE(t4_read_reg(sc, A_TP_DBG_LA_CONFIG));
 		if (mode == 2) {
 			inc = 2;
 			show_func = tp_la_show2;
 		} else if (mode == 3) {
 			inc = 2;
 			show_func = tp_la_show3;
 		} else {
 			inc = 1;
 			show_func = tp_la_show;
 		}
 	} else
 		rc = ENXIO;
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		goto done;
 
 	nentries = TPLA_SIZE / inc;
 	for (i = 0; i < nentries; i++)
 		(*show_func)(sb, &buf[i * inc], i);
 	rc = sbuf_finish(sb);
 done:
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 /*
  * Sysctl handler: print the per-channel NIC and offload transmit rates
  * filled in by t4_get_chan_txrate(), using four columns when the chip has
  * more than two channels and two columns otherwise.
  *
  * Returns 0 on success, ENXIO if the hardware is off limits, ENOMEM if the
  * sbuf cannot be allocated, or the error from sysctl_wire_old_buffer() or
  * sbuf_finish().
  */
 static int
 sysctl_tx_rate(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	u64 nrate[MAX_NCHAN], orate[MAX_NCHAN];
 	struct sbuf *sb;
 	int rc;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	/* Sample the rates under reg_lock if the hardware is usable. */
 	mtx_lock(&sc->reg_lock);
 	if (!hw_off_limits(sc))
 		t4_get_chan_txrate(sc, nrate, orate);
 	else
 		rc = ENXIO;
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	if (sc->chip_params->nchan <= 2) {
 		sbuf_printf(sb, "              channel 0   channel 1\n");
 		sbuf_printf(sb, "NIC B/s:     %10ju  %10ju\n",
 		    nrate[0], nrate[1]);
 		sbuf_printf(sb, "Offload B/s: %10ju  %10ju",
 		    orate[0], orate[1]);
 	} else {
 		sbuf_printf(sb, "              channel 0   channel 1"
 		    "   channel 2   channel 3\n");
 		sbuf_printf(sb, "NIC B/s:     %10ju  %10ju  %10ju  %10ju\n",
 		    nrate[0], nrate[1], nrate[2], nrate[3]);
 		sbuf_printf(sb, "Offload B/s: %10ju  %10ju  %10ju  %10ju",
 		    orate[0], orate[1], orate[2], orate[3]);
 	}
 
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_ulprx_la(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	uint32_t *buf, *p;
 	int rc, i;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	buf = malloc(ULPRX_LA_SIZE * 8 * sizeof(uint32_t), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else
 		t4_ulprx_read_la(sc, buf);
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		goto done;
 
 	p = buf;
 	sbuf_printf(sb, "      Pcmd        Type   Message"
 	    "                Data");
 	for (i = 0; i < ULPRX_LA_SIZE; i++, p += 8) {
 		sbuf_printf(sb, "\n%08x%08x  %4x  %08x  %08x%08x%08x%08x",
 		    p[1], p[0], p[2], p[3], p[7], p[6], p[5], p[4]);
 	}
 	rc = sbuf_finish(sb);
 done:
 	sbuf_delete(sb);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 /*
  * Sysctl handler (T5 and later, asserted via MPASS): report the
  * A_SGE_STAT_TOTAL and A_SGE_STAT_MATCH counters.  Output is produced only
  * when the STATSOURCE_T5 field of A_SGE_STAT_CFG is 7; the STATMODE field
  * then decides how the second counter is labelled.
  *
  * Returns 0 on success, ENXIO if the hardware is off limits, ENOMEM if the
  * sbuf cannot be allocated, or the error from sysctl_wire_old_buffer() or
  * sbuf_finish().
  */
 static int
 sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sbuf *sb;
 	int rc;
 	uint32_t cfg, s1, s2;
 
 	MPASS(chip_id(sc) >= CHELSIO_T5);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else {
 		cfg = t4_read_reg(sc, A_SGE_STAT_CFG);
 		s1 = t4_read_reg(sc, A_SGE_STAT_TOTAL);
 		s2 = t4_read_reg(sc, A_SGE_STAT_MATCH);
 	}
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	if (G_STATSOURCE_T5(cfg) == 7) {
 		int mode;
 
 		mode = is_t5(sc) ? G_STATMODE(cfg) : G_T6_STATMODE(cfg);
 		/*
 		 * s1 and s2 are unsigned 32-bit counters; print them with %u
 		 * so that values above INT_MAX do not show up as negative.
 		 */
 		if (mode == 0)
 			sbuf_printf(sb, "total %u, incomplete %u", s1, s2);
 		else if (mode == 1)
 			sbuf_printf(sb, "total %u, data overflow %u", s1, s2);
 		else
 			sbuf_printf(sb, "unknown mode %d", mode);
 	}
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_cpus(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	enum cpu_sets op = arg2;
 	cpuset_t cpuset;
 	struct sbuf *sb;
 	int i, rc;
 
 	MPASS(op == LOCAL_CPUS || op == INTR_CPUS);
 
 	CPU_ZERO(&cpuset);
 	rc = bus_get_cpus(sc->dev, op, sizeof(cpuset), &cpuset);
 	if (rc != 0)
 		return (rc);
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	CPU_FOREACH(i)
 		sbuf_printf(sb, "%d ", i);
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 static int
 sysctl_reset(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	u_int val;
 	int rc;
 
 	val = sc->num_resets;
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (val == 0) {
 		/* Zero out the counter that tracks reset. */
 		sc->num_resets = 0;
 		return (0);
 	}
 
 	if (val != 1)
 		return (EINVAL);	/* 0 or 1 are the only legal values */
 
 	if (hw_off_limits(sc))		/* harmless race */
 		return (EALREADY);
 
 	taskqueue_enqueue(reset_tq, &sc->reset_task);
 	return (0);
 }
 
 #ifdef TCP_OFFLOAD
 /*
  * Sysctl handler for sc->tt.tls.  Reads return the current setting.  A write
  * stores the new value (normalized to 0 or 1) and calls
  * t4_update_fl_bufsize() on the ifnet of every VI that has VI_INIT_DONE set.
  *
  * Returns 0 on success, ENOTSUP when enabling without
  * FW_CAPS_CONFIG_TLSKEYS in sc->cryptocaps, ENXIO if the hardware is off
  * limits, or the error from sysctl_handle_int()/begin_synchronized_op().
  */
 static int
 sysctl_tls(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct vi_info *vi;
 	int port, vidx, val, rc;
 
 	val = sc->tt.tls;
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (val != 0 && (sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS) == 0)
 		return (ENOTSUP);
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4stls");
 	if (rc != 0)
 		return (rc);
 
 	if (!hw_off_limits(sc)) {
 		sc->tt.tls = (val != 0) ? 1 : 0;
 		for_each_port(sc, port) {
 			for_each_vi(sc->port[port], vidx, vi) {
 				if (vi->flags & VI_INIT_DONE)
 					t4_update_fl_bufsize(vi->ifp);
 			}
 		}
 	} else {
 		rc = ENXIO;
 	}
 	end_synchronized_op(sc, 0);
 
 	return (rc);
 }
 
 /*
  * Sysctl handler for the sc->tt.tls_rx_ports array (an array of ints).
  *
  * Reads copy out the current array, or a single -1 when the array is empty.
  * Writes replace the array: a single -1 clears it, otherwise every element
  * must be in the range 1..IPPORT_MAX.  The swap of the array pointer and
  * count is done under ADAPTER_LOCK; the old array is freed afterwards.
  *
  * Returns 0 on success, ENXIO if the hardware is off limits, EINVAL for an
  * out-of-range port, or the error from begin_synchronized_op(), SYSCTL_OUT
  * or SYSCTL_IN.
  */
 static int
 sysctl_tls_rx_ports(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int *old_ports, *new_ports;
 	int i, new_count, rc;
 
 	/* Size probe only: report room for at least one entry (the -1). */
 	if (req->newptr == NULL && req->oldptr == NULL)
 		return (SYSCTL_OUT(req, NULL, imax(sc->tt.num_tls_rx_ports, 1) *
 		    sizeof(sc->tt.tls_rx_ports[0])));
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tlsrx");
 	if (rc)
 		return (rc);
 
 	if (hw_off_limits(sc)) {
 		rc = ENXIO;
 		goto done;
 	}
 
 	/* Always copy out the current value first. */
 	if (sc->tt.num_tls_rx_ports == 0) {
 		i = -1;
 		rc = SYSCTL_OUT(req, &i, sizeof(i));
 	} else
 		rc = SYSCTL_OUT(req, sc->tt.tls_rx_ports,
 		    sc->tt.num_tls_rx_ports * sizeof(sc->tt.tls_rx_ports[0]));
 	if (rc == 0 && req->newptr != NULL) {
 		/*
 		 * NOTE(review): a write shorter than one int gives
 		 * new_count == 0, which skips all validation below and
 		 * installs a zero-length list -- confirm this is intended.
 		 */
 		new_count = req->newlen / sizeof(new_ports[0]);
 		new_ports = malloc(new_count * sizeof(new_ports[0]), M_CXGBE,
 		    M_WAITOK);
 		rc = SYSCTL_IN(req, new_ports, new_count *
 		    sizeof(new_ports[0]));
 		if (rc)
 			goto err;
 
 		/* Allow setting to a single '-1' to clear the list. */
 		if (new_count == 1 && new_ports[0] == -1) {
 			ADAPTER_LOCK(sc);
 			old_ports = sc->tt.tls_rx_ports;
 			sc->tt.tls_rx_ports = NULL;
 			sc->tt.num_tls_rx_ports = 0;
 			ADAPTER_UNLOCK(sc);
 			free(old_ports, M_CXGBE);
 			/* new_ports is not installed; it is freed at err. */
 		} else {
 			for (i = 0; i < new_count; i++) {
 				if (new_ports[i] < 1 ||
 				    new_ports[i] > IPPORT_MAX) {
 					rc = EINVAL;
 					goto err;
 				}
 			}
 
 			ADAPTER_LOCK(sc);
 			old_ports = sc->tt.tls_rx_ports;
 			sc->tt.tls_rx_ports = new_ports;
 			sc->tt.num_tls_rx_ports = new_count;
 			ADAPTER_UNLOCK(sc);
 			free(old_ports, M_CXGBE);
 			/* Ownership moved to sc; don't free it below. */
 			new_ports = NULL;
 		}
 	err:
 		free(new_ports, M_CXGBE);
 	}
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 sysctl_tls_rx_timeout(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int v, rc;
 
 	v = sc->tt.tls_rx_timeout;
 	rc = sysctl_handle_int(oidp, &v, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (v < 0)
 		return (EINVAL);
 
 	if (v != 0 && !(sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS))
 		return (ENOTSUP);
 
 	sc->tt.tls_rx_timeout = v;
 
 	return (0);
 
 }
 
 /*
  * Format val / factor as a decimal number into buf (at most len bytes,
  * including the terminating NUL written by snprintf).
  *
  * factor must be nonzero and is expected to be a power of ten; the number of
  * fractional digits is derived from it (6 for 1000000).  The fraction is
  * zero-padded on the left so that, e.g., 1050000 / 1000000 is rendered as
  * "1.05" rather than "1.5", and trailing zeroes are dropped.  No fractional
  * part is printed when val is an exact multiple of factor.
  */
 static void
 unit_conv(char *buf, size_t len, u_int val, u_int factor)
 {
 	u_int whole = val / factor;
 	u_int rem = val % factor;
 	u_int f;
 	int width;
 
 	if (rem == 0) {
 		snprintf(buf, len, "%u", whole);
 		return;
 	}
 
 	/* Number of fractional digits implied by factor. */
 	width = 0;
 	for (f = factor; f >= 10; f /= 10)
 		width++;
 
 	/* Drop trailing zeroes, shrinking the field width to match. */
 	while (rem % 10 == 0) {
 		rem /= 10;
 		width--;
 	}
 
 	snprintf(buf, len, "%u.%0*u", whole, width, rem);
 }
 
 static int
 sysctl_tp_tick(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	char buf[16];
 	u_int res, re;
 	u_int cclk_ps = 1000000000 / sc->params.vpd.cclk;
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		res = (u_int)-1;
 	else
 		res = t4_read_reg(sc, A_TP_TIMER_RESOLUTION);
 	mtx_unlock(&sc->reg_lock);
 	if (res == (u_int)-1)
 		return (ENXIO);
 
 	switch (arg2) {
 	case 0:
 		/* timer_tick */
 		re = G_TIMERRESOLUTION(res);
 		break;
 	case 1:
 		/* TCP timestamp tick */
 		re = G_TIMESTAMPRESOLUTION(res);
 		break;
 	case 2:
 		/* DACK tick */
 		re = G_DELAYEDACKRESOLUTION(res);
 		break;
 	default:
 		return (EDOOFUS);
 	}
 
 	unit_conv(buf, sizeof(buf), (cclk_ps << re), 1000000);
 
 	return (sysctl_handle_string(oidp, buf, sizeof(buf), req));
 }
 
 static int
 sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int rc;
 	u_int dack_tmr, dack_re, v;
 	u_int cclk_ps = 1000000000 / sc->params.vpd.cclk;
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else {
 		rc = 0;
 		dack_re = G_DELAYEDACKRESOLUTION(t4_read_reg(sc,
 		    A_TP_TIMER_RESOLUTION));
 		dack_tmr = t4_read_reg(sc, A_TP_DACK_TIMER);
 	}
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		return (rc);
 
 	v = ((cclk_ps << dack_re) / 1000000) * dack_tmr;
 
 	return (sysctl_handle_int(oidp, &v, 0, req));
 }
 
 static int
 sysctl_tp_timer(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int rc, reg = arg2;
 	u_int tre;
 	u_long tp_tick_us, v;
 	u_int cclk_ps = 1000000000 / sc->params.vpd.cclk;
 
 	MPASS(reg == A_TP_RXT_MIN || reg == A_TP_RXT_MAX ||
 	    reg == A_TP_PERS_MIN  || reg == A_TP_PERS_MAX ||
 	    reg == A_TP_KEEP_IDLE || reg == A_TP_KEEP_INTVL ||
 	    reg == A_TP_INIT_SRTT || reg == A_TP_FINWAIT2_TIMER);
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else {
 		rc = 0;
 		tre = G_TIMERRESOLUTION(t4_read_reg(sc, A_TP_TIMER_RESOLUTION));
 		tp_tick_us = (cclk_ps << tre) / 1000000;
 		if (reg == A_TP_INIT_SRTT)
 			v = tp_tick_us * G_INITSRTT(t4_read_reg(sc, reg));
 		else
 			v = tp_tick_us * t4_read_reg(sc, reg);
 	}
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		return (rc);
 	else
 		return (sysctl_handle_long(oidp, &v, 0, req));
 }
 
 /*
  * All fields in TP_SHIFT_CNT are 4b and the starting location of the field is
  * passed to this function.
  */
 static int
 sysctl_tp_shift_cnt(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int rc, idx = arg2;
 	u_int v;
 
 	MPASS(idx >= 0 && idx <= 24);
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else {
 		rc = 0;
 		v = (t4_read_reg(sc, A_TP_SHIFT_CNT) >> idx) & 0xf;
 	}
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		return (rc);
 	else
 		return (sysctl_handle_int(oidp, &v, 0, req));
 }
 
 static int
 sysctl_tp_backoff(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	int rc, idx = arg2;
 	u_int shift, v, r;
 
 	MPASS(idx >= 0 && idx < 16);
 
 	r = A_TP_TCP_BACKOFF_REG0 + (idx & ~3);
 	shift = (idx & 3) << 3;
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else {
 		rc = 0;
 		v = (t4_read_reg(sc, r) >> shift) & M_TIMERBACKOFFINDEX0;
 	}
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		return (rc);
 	else
 		return (sysctl_handle_int(oidp, &v, 0, req));
 }
 
 static int
 sysctl_holdoff_tmr_idx_ofld(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->adapter;
 	int idx, rc, i;
 	struct sge_ofld_rxq *ofld_rxq;
 	uint8_t v;
 
 	idx = vi->ofld_tmr_idx;
 
 	rc = sysctl_handle_int(oidp, &idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (idx < 0 || idx >= SGE_NTIMERS)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4otmr");
 	if (rc)
 		return (rc);
 
 	v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(vi->ofld_pktc_idx != -1);
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 #ifdef atomic_store_rel_8
 		atomic_store_rel_8(&ofld_rxq->iq.intr_params, v);
 #else
 		ofld_rxq->iq.intr_params = v;
 #endif
 	}
 	vi->ofld_tmr_idx = idx;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (0);
 }
 
 static int
 sysctl_holdoff_pktc_idx_ofld(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->adapter;
 	int idx, rc;
 
 	idx = vi->ofld_pktc_idx;
 
 	rc = sysctl_handle_int(oidp, &idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (idx < -1 || idx >= SGE_NCOUNTERS)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK,
 	    "t4opktc");
 	if (rc)
 		return (rc);
 
 	if (vi->flags & VI_INIT_DONE)
 		rc = EBUSY; /* cannot be changed once the queues are created */
 	else
 		vi->ofld_pktc_idx = idx;
 
 	end_synchronized_op(sc, LOCK_HELD);
 	return (rc);
 }
 #endif
 
 /*
  * Read the SGE context identified by cntxt->cid / cntxt->mem_id into
  * cntxt->data.  The read goes through the firmware when FW_OK is set; if
  * that fails, or was not attempted, the backdoor read is used instead.
  *
  * Returns 0 on success, EINVAL for a bad cid or mem_id, ENXIO if the
  * hardware is off limits, or the error from begin_synchronized_op() or the
  * backdoor read.
  */
 static int
 get_sge_context(struct adapter *sc, struct t4_sge_context *cntxt)
 {
 	int rc;
 
 	if (cntxt->cid > M_CTXTQID)
 		return (EINVAL);
 
 	switch (cntxt->mem_id) {
 	case CTXT_EGRESS:
 	case CTXT_INGRESS:
 	case CTXT_FLM:
 	case CTXT_CNM:
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ctxt");
 	if (rc != 0)
 		return (rc);
 
 	if (hw_off_limits(sc)) {
 		rc = ENXIO;
 	} else {
 		/* Nonzero means "no successful firmware read yet". */
 		rc = 1;
 		if (sc->flags & FW_OK) {
 			rc = -t4_sge_ctxt_rd(sc, sc->mbox, cntxt->cid,
 			    cntxt->mem_id, &cntxt->data[0]);
 		}
 		if (rc != 0) {
 			/*
 			 * Read via firmware failed or wasn't even attempted.
 			 * Read directly via the backdoor.
 			 */
 			rc = -t4_sge_ctxt_rd_bd(sc, cntxt->cid, cntxt->mem_id,
 			    &cntxt->data[0]);
 		}
 	}
 
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 load_fw(struct adapter *sc, struct t4_data *fw)
 {
 	int rc;
 	uint8_t *fw_data;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldfw");
 	if (rc)
 		return (rc);
 
 	if (hw_off_limits(sc)) {
 		rc = ENXIO;
 		goto done;
 	}
 
 	/*
 	 * The firmware, with the sole exception of the memory parity error
 	 * handler, runs from memory and not flash.  It is almost always safe to
 	 * install a new firmware on a running system.  Just set bit 1 in
 	 * hw.cxgbe.dflags or dev.<nexus>.<n>.dflags first.
 	 */
 	if (sc->flags & FULL_INIT_DONE &&
 	    (sc->debug_flags & DF_LOAD_FW_ANYTIME) == 0) {
 		rc = EBUSY;
 		goto done;
 	}
 
 	fw_data = malloc(fw->len, M_CXGBE, M_WAITOK);
 
 	rc = copyin(fw->data, fw_data, fw->len);
 	if (rc == 0)
 		rc = -t4_load_fw(sc, fw_data, fw->len);
 
 	free(fw_data, M_CXGBE);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 load_cfg(struct adapter *sc, struct t4_data *cfg)
 {
 	int rc;
 	uint8_t *cfg_data = NULL;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldcf");
 	if (rc)
 		return (rc);
 
 	if (hw_off_limits(sc)) {
 		rc = ENXIO;
 		goto done;
 	}
 
 	if (cfg->len == 0) {
 		/* clear */
 		rc = -t4_load_cfg(sc, NULL, 0);
 		goto done;
 	}
 
 	cfg_data = malloc(cfg->len, M_CXGBE, M_WAITOK);
 
 	rc = copyin(cfg->data, cfg_data, cfg->len);
 	if (rc == 0)
 		rc = -t4_load_cfg(sc, cfg_data, cfg->len);
 
 	free(cfg_data, M_CXGBE);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 /*
  * Copy br->len bytes in from userspace (br->data) and pass them to
  * t4_load_boot().  A length of zero calls t4_load_boot() with no data
  * ("clear").
  *
  * The target offset is chosen by br->pf_offset: 0 means br->pfidx_addr is a
  * PF index (0..7) whose A_PCIE_PF_EXPROM_OFST register supplies the offset;
  * 1 means br->pfidx_addr itself carries the offset.
  *
  * Returns 0 on success, EFBIG if the image is larger than 1MB, EINVAL for
  * bad pf_offset/pfidx_addr, ENXIO if the hardware is off limits, or the
  * error from begin_synchronized_op(), copyin() or t4_load_boot().
  */
 static int
 load_boot(struct adapter *sc, struct t4_bootrom *br)
 {
 	int rc;
 	uint8_t *br_data = NULL;
 	u_int offset;
 
 	if (br->len > 1024 * 1024)
 		return (EFBIG);
 
 	/* Validate the request before touching the hardware. */
 	if (br->pf_offset == 0) {
 		/* pfidx */
 		if (br->pfidx_addr > 7)
 			return (EINVAL);
 	} else if (br->pf_offset != 1) {
 		return (EINVAL);
 	}
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldbr");
 	if (rc)
 		return (rc);
 
 	if (hw_off_limits(sc)) {
 		rc = ENXIO;
 		goto done;
 	}
 
 	/*
 	 * The register read is done only now that the hardware is known to be
 	 * accessible and the operation is synchronized.
 	 */
 	if (br->pf_offset == 0) {
 		offset = G_OFFSET(t4_read_reg(sc, PF_REG(br->pfidx_addr,
 		    A_PCIE_PF_EXPROM_OFST)));
 	} else {
 		/* offset */
 		offset = G_OFFSET(br->pfidx_addr);
 	}
 
 	if (br->len == 0) {
 		/* clear */
 		rc = -t4_load_boot(sc, NULL, offset, 0);
 		goto done;
 	}
 
 	br_data = malloc(br->len, M_CXGBE, M_WAITOK);
 
 	rc = copyin(br->data, br_data, br->len);
 	if (rc == 0)
 		rc = -t4_load_boot(sc, br_data, offset, br->len);
 
 	free(br_data, M_CXGBE);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 static int
 load_bootcfg(struct adapter *sc, struct t4_data *bc)
 {
 	int rc;
 	uint8_t *bc_data = NULL;
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldcf");
 	if (rc)
 		return (rc);
 
 	if (hw_off_limits(sc)) {
 		rc = ENXIO;
 		goto done;
 	}
 
 	if (bc->len == 0) {
 		/* clear */
 		rc = -t4_load_bootcfg(sc, NULL, 0);
 		goto done;
 	}
 
 	bc_data = malloc(bc->len, M_CXGBE, M_WAITOK);
 
 	rc = copyin(bc->data, bc_data, bc->len);
 	if (rc == 0)
 		rc = -t4_load_bootcfg(sc, bc_data, bc->len);
 
 	free(bc_data, M_CXGBE);
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
 
 /*
  * Collect a cudbg dump into a kernel buffer of dump->len bytes and copy the
  * result out to dump->data.  dump->bitmap selects what is collected and
  * dump->wr_flash sets the use_flash option.  cudbg_collect() is handed a
  * pointer to dump->len and the updated length is what gets copied out.
  *
  * Returns 0 on success, ENOMEM if the buffer or the cudbg handle cannot be
  * allocated, or the error from cudbg_collect() or copyout().
  */
 static int
 cudbg_dump(struct adapter *sc, struct t4_cudbg_dump *dump)
 {
 	int rc;
 	struct cudbg_init *cudbg;
 	void *handle, *buf;
 
 	/* buf is large, don't block if no memory is available */
 	buf = malloc(dump->len, M_CXGBE, M_NOWAIT | M_ZERO);
 	if (buf == NULL)
 		return (ENOMEM);
 
 	handle = cudbg_alloc_handle();
 	if (handle == NULL) {
 		rc = ENOMEM;
 		/*
 		 * NOTE(review): this reaches cudbg_free_handle(NULL) at done;
 		 * presumably that is tolerated -- confirm.
 		 */
 		goto done;
 	}
 
 	cudbg = cudbg_get_init(handle);
 	cudbg->adap = sc;
 	cudbg->print = (cudbg_print_cb)printf;
 
 #ifndef notyet
 	device_printf(sc->dev, "%s: wr_flash %u, len %u, data %p.\n",
 	    __func__, dump->wr_flash, dump->len, dump->data);
 #endif
 
 	if (dump->wr_flash)
 		cudbg->use_flash = 1;
 	MPASS(sizeof(cudbg->dbg_bitmap) == sizeof(dump->bitmap));
 	memcpy(cudbg->dbg_bitmap, dump->bitmap, sizeof(cudbg->dbg_bitmap));
 
 	rc = cudbg_collect(handle, buf, &dump->len);
 	if (rc != 0)
 		goto done;
 
 	rc = copyout(buf, dump->data, dump->len);
 done:
 	cudbg_free_handle(handle);
 	free(buf, M_CXGBE);
 	return (rc);
 }
 
 /*
  * Release an offload policy: the BPF instruction array of each of its
  * op->nrules rules, then the rule array, then the policy itself.  A NULL
  * policy is ignored.
  */
 static void
 free_offload_policy(struct t4_offload_policy *op)
 {
 	int n;
 
 	if (op != NULL) {
 		for (n = 0; n < op->nrules; n++)
 			free(op->rule[n].bpf_prog.bf_insns, M_CXGBE);
 		free(op->rule, M_CXGBE);
 		free(op, M_CXGBE);
 	}
 }
 
 static int
 set_offload_policy(struct adapter *sc, struct t4_offload_policy *uop)
 {
 	int i, rc, len;
 	struct t4_offload_policy *op, *old;
 	struct bpf_program *bf;
 	const struct offload_settings *s;
 	struct offload_rule *r;
 	void *u;
 
 	if (!is_offload(sc))
 		return (ENODEV);
 
 	if (uop->nrules == 0) {
 		/* Delete installed policies. */
 		op = NULL;
 		goto set_policy;
 	} else if (uop->nrules > 256) { /* arbitrary */
 		return (E2BIG);
 	}
 
 	/* Copy userspace offload policy to kernel */
 	op = malloc(sizeof(*op), M_CXGBE, M_ZERO | M_WAITOK);
 	op->nrules = uop->nrules;
 	len = op->nrules * sizeof(struct offload_rule);
 	op->rule = malloc(len, M_CXGBE, M_ZERO | M_WAITOK);
 	rc = copyin(uop->rule, op->rule, len);
 	if (rc) {
 		free(op->rule, M_CXGBE);
 		free(op, M_CXGBE);
 		return (rc);
 	}
 
 	r = &op->rule[0];
 	for (i = 0; i < op->nrules; i++, r++) {
 
 		/* Validate open_type */
 		if (r->open_type != OPEN_TYPE_LISTEN &&
 		    r->open_type != OPEN_TYPE_ACTIVE &&
 		    r->open_type != OPEN_TYPE_PASSIVE &&
 		    r->open_type != OPEN_TYPE_DONTCARE) {
 error:
 			/*
 			 * Rules 0 to i have malloc'd filters that need to be
 			 * freed.  Rules i+1 to nrules have userspace pointers
 			 * and should be left alone.
 			 */
 			op->nrules = i;
 			free_offload_policy(op);
 			return (rc);
 		}
 
 		/* Validate settings */
 		s = &r->settings;
 		if ((s->offload != 0 && s->offload != 1) ||
 		    s->cong_algo < -1 || s->cong_algo > CONG_ALG_HIGHSPEED ||
 		    s->sched_class < -1 ||
 		    s->sched_class >= sc->params.nsched_cls) {
 			rc = EINVAL;
 			goto error;
 		}
 
 		bf = &r->bpf_prog;
 		u = bf->bf_insns;	/* userspace ptr */
 		bf->bf_insns = NULL;
 		if (bf->bf_len == 0) {
 			/* legal, matches everything */
 			continue;
 		}
 		len = bf->bf_len * sizeof(*bf->bf_insns);
 		bf->bf_insns = malloc(len, M_CXGBE, M_ZERO | M_WAITOK);
 		rc = copyin(u, bf->bf_insns, len);
 		if (rc != 0)
 			goto error;
 
 		if (!bpf_validate(bf->bf_insns, bf->bf_len)) {
 			rc = EINVAL;
 			goto error;
 		}
 	}
 set_policy:
 	rw_wlock(&sc->policy_lock);
 	old = sc->policy;
 	sc->policy = op;
 	rw_wunlock(&sc->policy_lock);
 	free_offload_policy(old);
 
 	return (0);
 }
 
 #define MAX_READ_BUF_SIZE (128 * 1024)
 /*
  * Copy mr->len bytes of card memory starting at mr->addr out to the
  * userspace buffer mr->data, reading through memory window 'win' in chunks
  * of at most MAX_READ_BUF_SIZE bytes.  The register lock is taken per chunk
  * and dropped before each copyout().
  *
  * Returns 0 on success, ENXIO if the hardware is (or becomes) off limits,
  * or the error from validate_mem_range() or copyout().
  */
 static int
 read_card_mem(struct adapter *sc, int win, struct t4_mem_range *mr)
 {
 	uint32_t addr, remaining, n;
 	uint32_t *buf;
 	int rc;
 	uint8_t *dst;
 
 	mtx_lock(&sc->reg_lock);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else
 		rc = validate_mem_range(sc, mr->addr, mr->len);
 	mtx_unlock(&sc->reg_lock);
 	if (rc != 0)
 		return (rc);
 
 	buf = malloc(min(mr->len, MAX_READ_BUF_SIZE), M_CXGBE, M_WAITOK);
 	addr = mr->addr;
 	remaining = mr->len;
 	dst = (void *)mr->data;
 
 	while (remaining) {
 		n = min(remaining, MAX_READ_BUF_SIZE);
 		/* The hardware may go away between chunks; recheck each time. */
 		mtx_lock(&sc->reg_lock);
 		if (hw_off_limits(sc))
 			rc = ENXIO;
 		else
 			read_via_memwin(sc, win, addr, buf, n);
 		mtx_unlock(&sc->reg_lock);
 		if (rc != 0)
 			break;
 
 		rc = copyout(buf, dst, n);
 		if (rc != 0)
 			break;
 
 		dst += n;
 		remaining -= n;
 		addr += n;
 	}
 
 	free(buf, M_CXGBE);
 	return (rc);
 }
 #undef MAX_READ_BUF_SIZE
 
 static int
 read_i2c(struct adapter *sc, struct t4_i2c_data *i2cd)
 {
 	int rc;
 
 	if (i2cd->len == 0 || i2cd->port_id >= sc->params.nports)
 		return (EINVAL);
 
 	if (i2cd->len > sizeof(i2cd->data))
 		return (EFBIG);
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4i2crd");
 	if (rc)
 		return (rc);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else
 		rc = -t4_i2c_rd(sc, sc->mbox, i2cd->port_id, i2cd->dev_addr,
 		    i2cd->offset, i2cd->len, &i2cd->data[0]);
 	end_synchronized_op(sc, 0);
 
 	return (rc);
 }
 
 /*
  * Reset the statistics associated with port port_id: the hardware MAC and VI
  * counters (only if the hardware is accessible) and the driver's software
  * counters for every initialized VI on the port, including its rx/tx
  * queues, offload queues (when compiled in) and the port's control queue.
  *
  * Returns 0 on success, EINVAL if port_id is out of range, or EIO if the
  * port has no port_info.
  */
 static int
 clear_stats(struct adapter *sc, u_int port_id)
 {
 	int i, v, chan_map;
 	struct port_info *pi;
 	struct vi_info *vi;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 	struct sge_wrq *wrq;
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	struct sge_ofld_txq *ofld_txq;
 #endif
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 
 	if (port_id >= sc->params.nports)
 		return (EINVAL);
 	pi = sc->port[port_id];
 	if (pi == NULL)
 		return (EIO);
 
 	/* Hardware counters; skipped silently if the hardware is off limits. */
 	mtx_lock(&sc->reg_lock);
 	if (!hw_off_limits(sc)) {
 		/* MAC stats */
 		t4_clr_port_stats(sc, pi->tx_chan);
 		if (is_t6(sc)) {
 			/*
 			 * With a valid fcs_reg the current register value
 			 * becomes the new baseline; otherwise just zero the
 			 * software copy.
 			 */
 			if (pi->fcs_reg != -1)
 				pi->fcs_base = t4_read_reg64(sc, pi->fcs_reg);
 			else
 				pi->stats.rx_fcs_err = 0;
 		}
 		for_each_vi(pi, v, vi) {
 			if (vi->flags & VI_INIT_DONE)
 				t4_clr_vi_stats(sc, vi->vin);
 		}
 		/* Write 0 to the TNL_CNG_DROP MIB of each channel in the map. */
 		chan_map = pi->rx_e_chan_map;
 		v = 0;	/* reuse */
 		while (chan_map) {
 			i = ffs(chan_map) - 1;
 			t4_write_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v,
 			    1, A_TP_MIB_TNL_CNG_DROP_0 + i);
 			chan_map &= ~(1 << i);
 		}
 	}
 	mtx_unlock(&sc->reg_lock);
 	pi->tx_parse_error = 0;
 	pi->tnl_cong_drops = 0;
 
 	/*
 	 * Since this command accepts a port, clear stats for
 	 * all VIs on this port.
 	 */
 	for_each_vi(pi, v, vi) {
 		if (vi->flags & VI_INIT_DONE) {
 
 			/* NIC rx queues and their freelists. */
 			for_each_rxq(vi, i, rxq) {
 #if defined(INET) || defined(INET6)
 				rxq->lro.lro_queued = 0;
 				rxq->lro.lro_flushed = 0;
 #endif
 				rxq->rxcsum = 0;
 				rxq->vlan_extraction = 0;
 				rxq->vxlan_rxcsum = 0;
 
 				rxq->fl.cl_allocated = 0;
 				rxq->fl.cl_recycled = 0;
 				rxq->fl.cl_fast_recycled = 0;
 			}
 
 			/* NIC tx queues, including the mp_ring stats. */
 			for_each_txq(vi, i, txq) {
 				txq->txcsum = 0;
 				txq->tso_wrs = 0;
 				txq->vlan_insertion = 0;
 				txq->imm_wrs = 0;
 				txq->sgl_wrs = 0;
 				txq->txpkt_wrs = 0;
 				txq->txpkts0_wrs = 0;
 				txq->txpkts1_wrs = 0;
 				txq->txpkts0_pkts = 0;
 				txq->txpkts1_pkts = 0;
 				txq->txpkts_flush = 0;
 				txq->raw_wrs = 0;
 				txq->vxlan_tso_wrs = 0;
 				txq->vxlan_txcsum = 0;
 				txq->kern_tls_records = 0;
 				txq->kern_tls_short = 0;
 				txq->kern_tls_partial = 0;
 				txq->kern_tls_full = 0;
 				txq->kern_tls_octets = 0;
 				txq->kern_tls_waste = 0;
 				txq->kern_tls_options = 0;
 				txq->kern_tls_header = 0;
 				txq->kern_tls_fin = 0;
 				txq->kern_tls_fin_short = 0;
 				txq->kern_tls_cbc = 0;
 				txq->kern_tls_gcm = 0;
 				mp_ring_reset_stats(txq->r);
 			}
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 			/* Offload tx queues. */
 			for_each_ofld_txq(vi, i, ofld_txq) {
 				ofld_txq->wrq.tx_wrs_direct = 0;
 				ofld_txq->wrq.tx_wrs_copied = 0;
 				counter_u64_zero(ofld_txq->tx_iscsi_pdus);
 				counter_u64_zero(ofld_txq->tx_iscsi_octets);
 				counter_u64_zero(ofld_txq->tx_iscsi_iso_wrs);
 				counter_u64_zero(ofld_txq->tx_toe_tls_records);
 				counter_u64_zero(ofld_txq->tx_toe_tls_octets);
 			}
 #endif
 #ifdef TCP_OFFLOAD
 			/* Offload rx queues and their freelists. */
 			for_each_ofld_rxq(vi, i, ofld_rxq) {
 				ofld_rxq->fl.cl_allocated = 0;
 				ofld_rxq->fl.cl_recycled = 0;
 				ofld_rxq->fl.cl_fast_recycled = 0;
 				counter_u64_zero(
 				    ofld_rxq->rx_iscsi_ddp_setup_ok);
 				counter_u64_zero(
 				    ofld_rxq->rx_iscsi_ddp_setup_error);
 				ofld_rxq->rx_iscsi_ddp_pdus = 0;
 				ofld_rxq->rx_iscsi_ddp_octets = 0;
 				ofld_rxq->rx_iscsi_fl_pdus = 0;
 				ofld_rxq->rx_iscsi_fl_octets = 0;
 				ofld_rxq->rx_toe_tls_records = 0;
 				ofld_rxq->rx_toe_tls_octets = 0;
 			}
 #endif
 
 			/* The port's control queue is cleared with the main VI. */
 			if (IS_MAIN_VI(vi)) {
 				wrq = &sc->sge.ctrlq[pi->port_id];
 				wrq->tx_wrs_direct = 0;
 				wrq->tx_wrs_copied = 0;
 			}
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Look up the CLIP entry for the IPv6 address in ca->addr via
  * t4_get_clip_entry() (third argument true).
  *
  * Returns 0 if an entry was obtained, EIO if not, or ENOTSUP when the
  * kernel is built without INET6.
  */
 static int
 hold_clip_addr(struct adapter *sc, struct t4_clip_addr *ca)
 {
 #ifdef INET6
 	struct in6_addr addr6;
 
 	bcopy(&ca->addr[0], &addr6.s6_addr[0], sizeof(addr6.s6_addr));
 	return (t4_get_clip_entry(sc, &addr6, true) == NULL ? EIO : 0);
 #else
 	return (ENOTSUP);
 #endif
 }
 
 /*
  * Pass the IPv6 address in ca->addr to t4_release_clip_addr() and return
  * its result.  Returns ENOTSUP when the kernel is built without INET6.
  */
 static int
 release_clip_addr(struct adapter *sc, struct t4_clip_addr *ca)
 {
 #ifdef INET6
 	struct in6_addr addr6;
 	int error;
 
 	bcopy(&ca->addr[0], &addr6.s6_addr[0], sizeof(addr6.s6_addr));
 	error = t4_release_clip_addr(sc, &addr6);
 	return (error);
 #else
 	return (ENOTSUP);
 #endif
 }
 
 /*
  * Return the value pci_find_cap() reports for capability 'cap' on the
  * adapter's device, or 0 if pci_find_cap() fails.
  */
 int
 t4_os_find_pci_capability(struct adapter *sc, int cap)
 {
 	int off;
 
 	if (pci_find_cap(sc->dev, cap, &off) != 0)
 		return (0);
 	return (off);
 }
 
 /*
  * Save the adapter's PCI configuration state with pci_cfg_save().  Always
  * returns 0.
  */
 int
 t4_os_pci_save_state(struct adapter *sc)
 {
 	device_t dev = sc->dev;
 	struct pci_devinfo *dinfo = device_get_ivars(dev);
 
 	pci_cfg_save(dev, dinfo, 0);
 
 	return (0);
 }
 
 /*
  * Restore the adapter's PCI configuration state with pci_cfg_restore().
  * Always returns 0.
  */
 int
 t4_os_pci_restore_state(struct adapter *sc)
 {
 	device_t dev = sc->dev;
 	struct pci_devinfo *dinfo = device_get_ivars(dev);
 
 	pci_cfg_restore(dev, dinfo);
 
 	return (0);
 }
 
 /*
  * Handle a change of the port's transceiver module: rebuild the media list
  * and, if a module is present, fix up and apply the link configuration.
  * That work is skipped if the synchronized op cannot be started.  A message
  * describing the module is always logged on the port's first VI.
  */
 void
 t4_os_portmod_changed(struct port_info *pi)
 {
 	static const char *mod_names[] = {
 		NULL, "LR", "SR", "ER", "TWINAX", "active TWINAX", "LRM"
 	};
 	struct adapter *sc = pi->adapter;
 	struct vi_info *vi = &pi->vi[0];
 	struct ifnet *ifp;
 
 	KASSERT((pi->flags & FIXED_IFMEDIA) == 0,
 	    ("%s: port_type %u", __func__, pi->port_type));
 
 	if (begin_synchronized_op(sc, vi, HOLD_LOCK, "t4mod") == 0) {
 		PORT_LOCK(pi);
 		build_medialist(pi);
 		if (pi->mod_type != FW_PORT_MOD_TYPE_NONE) {
 			fixup_link_config(pi);
 			apply_link_config(pi);
 		}
 		PORT_UNLOCK(pi);
 		end_synchronized_op(sc, LOCK_HELD);
 	}
 
 	ifp = vi->ifp;
 	switch (pi->mod_type) {
 	case FW_PORT_MOD_TYPE_NONE:
 		if_printf(ifp, "transceiver unplugged.\n");
 		break;
 	case FW_PORT_MOD_TYPE_UNKNOWN:
 		if_printf(ifp, "unknown transceiver inserted.\n");
 		break;
 	case FW_PORT_MOD_TYPE_NOTSUPPORTED:
 		if_printf(ifp, "unsupported transceiver inserted.\n");
 		break;
 	default:
 		/* Use the name table when the type indexes into it. */
 		if (pi->mod_type > 0 && pi->mod_type < nitems(mod_names)) {
 			if_printf(ifp, "%dGbps %s transceiver inserted.\n",
 			    port_top_speed(pi), mod_names[pi->mod_type]);
 		} else {
 			if_printf(ifp, "transceiver (type %d) inserted.\n",
 			    pi->mod_type);
 		}
 		break;
 	}
 }
 
 /*
  * Propagate a link state change on a port to all of its VIs' ifnets.  The
  * caller must hold the port lock.
  *
  * On T6 the register used for FCS error accounting depends on the link
  * speed and FEC in use, so it is reselected (and its current value taken as
  * the new baseline) every time the link comes up, and invalidated when the
  * link goes down.
  */
 void
 t4_os_link_changed(struct port_info *pi)
 {
 	struct adapter *sc = pi->adapter;
 	struct link_config *lc = &pi->link_cfg;
 	struct vi_info *vi;
 	struct ifnet *ifp;
 	int n;
 
 	PORT_LOCK_ASSERT_OWNED(pi);
 
 	if (!is_t6(sc)) {
 		MPASS(pi->fcs_reg != -1);
 		MPASS(pi->fcs_base == 0);
 	} else if (!lc->link_ok) {
 		pi->fcs_reg = -1;
 	} else {
 		if (lc->speed < 25000 ||
 		    (lc->speed == 25000 && lc->fec != FEC_RS)) {
 			pi->fcs_reg = T5_PORT_REG(pi->tx_chan,
 			    A_MAC_PORT_MTIP_1G10G_RX_CRCERRORS);
 		} else {
 			pi->fcs_reg = T5_PORT_REG(pi->tx_chan,
 			    A_MAC_PORT_AFRAMECHECKSEQUENCEERRORS);
 		}
 		pi->fcs_base = t4_read_reg64(sc, pi->fcs_reg);
 		pi->stats.rx_fcs_err = 0;
 	}
 
 	for_each_vi(pi, n, vi) {
 		ifp = vi->ifp;
 		if (ifp != NULL) {
 			if (!lc->link_ok) {
 				if_link_state_change(ifp, LINK_STATE_DOWN);
 			} else {
 				ifp->if_baudrate = IF_Mbps(lc->speed);
 				if_link_state_change(ifp, LINK_STATE_UP);
 			}
 		}
 	}
 }
 
 /*
  * Call func(sc, arg) for every adapter on t4_list.  The list lock is held
  * shared for the whole walk, including while func runs.
  */
 void
 t4_iterate(void (*func)(struct adapter *, void *), void *arg)
 {
 	struct adapter *sc;
 
 	sx_slock(&t4_list_lock);
 	SLIST_FOREACH(sc, &t4_list, link) {
 		/*
 		 * func should not make any assumptions about what state sc is
 		 * in - the only guarantee is that sc->sc_lock is a valid lock.
 		 */
 		func(sc, arg);
 	}
 	sx_sunlock(&t4_list_lock);
 }
 
 /*
  * ioctl entry point for the adapter's character device.  The caller must
  * pass priv_check(PRIV_DRIVER).  Register get/set and the register dump are
  * handled inline under the register lock; every other command is dispatched
  * to its helper with 'data' cast to the command's argument structure.
  *
  * Returns 0 on success, the priv_check() error, ENOTTY for an unknown
  * command, or the error produced by the command.
  */
 static int
 t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag,
     struct thread *td)
 {
 	int rc;
 	struct adapter *sc = dev->si_drv1;
 
 	rc = priv_check(td, PRIV_DRIVER);
 	if (rc != 0)
 		return (rc);
 
 	switch (cmd) {
 	case CHELSIO_T4_GETREG: {
 		struct t4_reg *edata = (struct t4_reg *)data;
 
 		/* Must be 4-byte aligned and within the mapped registers. */
 		if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len)
 			return (EFAULT);
 
 		mtx_lock(&sc->reg_lock);
 		if (hw_off_limits(sc))
 			rc = ENXIO;
 		else if (edata->size == 4)
 			edata->val = t4_read_reg(sc, edata->addr);
 		else if (edata->size == 8)
 			edata->val = t4_read_reg64(sc, edata->addr);
 		else
 			rc = EINVAL;
 		mtx_unlock(&sc->reg_lock);
 
 		break;
 	}
 	case CHELSIO_T4_SETREG: {
 		struct t4_reg *edata = (struct t4_reg *)data;
 
 		/* Must be 4-byte aligned and within the mapped registers. */
 		if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len)
 			return (EFAULT);
 
 		mtx_lock(&sc->reg_lock);
 		if (hw_off_limits(sc))
 			rc = ENXIO;
 		else if (edata->size == 4) {
 			/*
 			 * Reject, without writing anything, a value that does
 			 * not fit in the 32-bit register.
 			 */
 			if (edata->val & 0xffffffff00000000)
 				rc = EINVAL;
 			else
 				t4_write_reg(sc, edata->addr,
 				    (uint32_t) edata->val);
 		} else if (edata->size == 8)
 			t4_write_reg64(sc, edata->addr, edata->val);
 		else
 			rc = EINVAL;
 		mtx_unlock(&sc->reg_lock);
 
 		break;
 	}
 	case CHELSIO_T4_REGDUMP: {
 		struct t4_regdump *regs = (struct t4_regdump *)data;
 		int reglen = t4_get_regs_len(sc);
 		uint8_t *buf;
 
 		if (regs->len < reglen) {
 			regs->len = reglen; /* hint to the caller */
 			return (ENOBUFS);
 		}
 
 		regs->len = reglen;
 		buf = malloc(reglen, M_CXGBE, M_WAITOK | M_ZERO);
 		mtx_lock(&sc->reg_lock);
 		if (hw_off_limits(sc))
 			rc = ENXIO;
 		else
 			get_regs(sc, regs, buf);
 		mtx_unlock(&sc->reg_lock);
 		/* Copy out only after the lock has been dropped. */
 		if (rc == 0)
 			rc = copyout(buf, regs->data, reglen);
 		free(buf, M_CXGBE);
 		break;
 	}
 	case CHELSIO_T4_GET_FILTER_MODE:
 		rc = get_filter_mode(sc, (uint32_t *)data);
 		break;
 	case CHELSIO_T4_SET_FILTER_MODE:
 		rc = set_filter_mode(sc, *(uint32_t *)data);
 		break;
 	case CHELSIO_T4_SET_FILTER_MASK:
 		rc = set_filter_mask(sc, *(uint32_t *)data);
 		break;
 	case CHELSIO_T4_GET_FILTER:
 		rc = get_filter(sc, (struct t4_filter *)data);
 		break;
 	case CHELSIO_T4_SET_FILTER:
 		rc = set_filter(sc, (struct t4_filter *)data);
 		break;
 	case CHELSIO_T4_DEL_FILTER:
 		rc = del_filter(sc, (struct t4_filter *)data);
 		break;
 	case CHELSIO_T4_GET_SGE_CONTEXT:
 		rc = get_sge_context(sc, (struct t4_sge_context *)data);
 		break;
 	case CHELSIO_T4_LOAD_FW:
 		rc = load_fw(sc, (struct t4_data *)data);
 		break;
 	case CHELSIO_T4_GET_MEM:
 		rc = read_card_mem(sc, 2, (struct t4_mem_range *)data);
 		break;
 	case CHELSIO_T4_GET_I2C:
 		rc = read_i2c(sc, (struct t4_i2c_data *)data);
 		break;
 	case CHELSIO_T4_CLEAR_STATS:
 		rc = clear_stats(sc, *(uint32_t *)data);
 		break;
 	case CHELSIO_T4_SCHED_CLASS:
 		rc = t4_set_sched_class(sc, (struct t4_sched_params *)data);
 		break;
 	case CHELSIO_T4_SCHED_QUEUE:
 		rc = t4_set_sched_queue(sc, (struct t4_sched_queue *)data);
 		break;
 	case CHELSIO_T4_GET_TRACER:
 		rc = t4_get_tracer(sc, (struct t4_tracer *)data);
 		break;
 	case CHELSIO_T4_SET_TRACER:
 		rc = t4_set_tracer(sc, (struct t4_tracer *)data);
 		break;
 	case CHELSIO_T4_LOAD_CFG:
 		rc = load_cfg(sc, (struct t4_data *)data);
 		break;
 	case CHELSIO_T4_LOAD_BOOT:
 		rc = load_boot(sc, (struct t4_bootrom *)data);
 		break;
 	case CHELSIO_T4_LOAD_BOOTCFG:
 		rc = load_bootcfg(sc, (struct t4_data *)data);
 		break;
 	case CHELSIO_T4_CUDBG_DUMP:
 		rc = cudbg_dump(sc, (struct t4_cudbg_dump *)data);
 		break;
 	case CHELSIO_T4_SET_OFLD_POLICY:
 		rc = set_offload_policy(sc, (struct t4_offload_policy *)data);
 		break;
 	case CHELSIO_T4_HOLD_CLIP_ADDR:
 		rc = hold_clip_addr(sc, (struct t4_clip_addr *)data);
 		break;
 	case CHELSIO_T4_RELEASE_CLIP_ADDR:
 		rc = release_clip_addr(sc, (struct t4_clip_addr *)data);
 		break;
 	default:
 		rc = ENOTTY;
 	}
 
 	return (rc);
 }
 
 #ifdef TCP_OFFLOAD
 /*
  * Enable or disable TOE on a VI.  Must be called from within a synchronized
  * operation.
  *
  * Enabling makes sure the queues of this VI and of the port's first VI
  * exist, activates the TOM ULD if needed (and tries iWARP and iSCSI too),
  * and marks the port in sc->offload_map.  pi->uld_vis counts the VIs of the
  * port that have TOE enabled; the port's bit is cleared only when the count
  * drops to zero on disable.
  *
  * Returns 0 on success, ENODEV if the adapter is not offload capable, ENXIO
  * if the hardware is off limits, EAGAIN if NIC TLS is enabled on some
  * interface, or the error from t4_config_kern_tls(), vi_init() or
  * t4_activate_uld().
  */
 static int
 toe_capability(struct vi_info *vi, bool enable)
 {
 	int rc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (!is_offload(sc))
 		return (ENODEV);
 	if (hw_off_limits(sc))
 		return (ENXIO);
 
 	if (enable) {
 #ifdef KERN_TLS
 		if (sc->flags & KERN_TLS_ON) {
 			int i, j, n;
 			struct port_info *p;
 			struct vi_info *v;
 
 			/*
 			 * Reconfigure hardware for TOE if TXTLS is not enabled
 			 * on any ifnet.
 			 */
 			n = 0;
 			for_each_port(sc, i) {
 				p = sc->port[i];
 				for_each_vi(p, j, v) {
 					if (v->ifp->if_capenable & IFCAP_TXTLS) {
 						CH_WARN(sc,
 						    "%s has NIC TLS enabled.\n",
 						    device_get_nameunit(v->dev));
 						n++;
 					}
 				}
 			}
 			if (n > 0) {
 				CH_WARN(sc, "Disable NIC TLS on all interfaces "
 				    "associated with this adapter before "
 				    "trying to enable TOE.\n");
 				return (EAGAIN);
 			}
 			rc = t4_config_kern_tls(sc, false);
 			if (rc)
 				return (rc);
 		}
 #endif
 		if ((vi->ifp->if_capenable & IFCAP_TOE) != 0) {
 			/* TOE is already enabled. */
 			return (0);
 		}
 
 		/*
 		 * We need the port's queues around so that we're able to send
 		 * and receive CPLs to/from the TOE even if the ifnet for this
 		 * port has never been UP'd administratively.
 		 */
 		if (!(vi->flags & VI_INIT_DONE) && ((rc = vi_init(vi)) != 0))
 			return (rc);
 		if (!(pi->vi[0].flags & VI_INIT_DONE) &&
 		    ((rc = vi_init(&pi->vi[0])) != 0))
 			return (rc);
 
 		if (isset(&sc->offload_map, pi->port_id)) {
 			/* TOE is enabled on another VI of this port. */
 			pi->uld_vis++;
 			return (0);
 		}
 
 		if (!uld_active(sc, ULD_TOM)) {
 			rc = t4_activate_uld(sc, ULD_TOM);
 			if (rc == EAGAIN) {
 				log(LOG_WARNING,
 				    "You must kldload t4_tom.ko before trying "
 				    "to enable TOE on a cxgbe interface.\n");
 			}
 			if (rc != 0)
 				return (rc);
 			KASSERT(sc->tom_softc != NULL,
 			    ("%s: TOM activated but softc NULL", __func__));
 			KASSERT(uld_active(sc, ULD_TOM),
 			    ("%s: TOM activated but flag not set", __func__));
 		}
 
 		/* Activate iWARP and iSCSI too, if the modules are loaded. */
 		if (!uld_active(sc, ULD_IWARP))
 			(void) t4_activate_uld(sc, ULD_IWARP);
 		if (!uld_active(sc, ULD_ISCSI))
 			(void) t4_activate_uld(sc, ULD_ISCSI);
 
 		/* First VI on this port with TOE: mark the port offloaded. */
 		pi->uld_vis++;
 		setbit(&sc->offload_map, pi->port_id);
 	} else {
 		/*
 		 * NOTE(review): the count is decremented before checking that
 		 * TOE was enabled on this VI/port -- presumably callers only
 		 * disable after a successful enable; confirm.
 		 */
 		pi->uld_vis--;
 
 		if (!isset(&sc->offload_map, pi->port_id) || pi->uld_vis > 0)
 			return (0);
 
 		KASSERT(uld_active(sc, ULD_TOM),
 		    ("%s: TOM never initialized?", __func__));
 		clrbit(&sc->offload_map, pi->port_id);
 	}
 
 	return (0);
 }
 
 /*
  * Add an upper layer driver to the global list.
  */
 int
 t4_register_uld(struct uld_info *ui)
 {
 	struct uld_info *cur;
 	int error;
 
 	sx_xlock(&t4_uld_list_lock);
 
 	/* An id may appear on the list only once. */
 	error = 0;
 	SLIST_FOREACH(cur, &t4_uld_list, link) {
 		if (cur->uld_id == ui->uld_id) {
 			error = EEXIST;
 			break;
 		}
 	}
 
 	if (error == 0) {
 		SLIST_INSERT_HEAD(&t4_uld_list, ui, link);
 		ui->refcount = 0;
 	}
 
 	sx_xunlock(&t4_uld_list_lock);
 	return (error);
 }
 
 int
 t4_unregister_uld(struct uld_info *ui)
 {
 	int rc = EINVAL;
 	struct uld_info *u;
 
 	sx_xlock(&t4_uld_list_lock);
 
 	SLIST_FOREACH(u, &t4_uld_list, link) {
 	    if (u == ui) {
 		    if (ui->refcount > 0) {
 			    rc = EBUSY;
 			    goto done;
 		    }
 
 		    SLIST_REMOVE(&t4_uld_list, ui, uld_info, link);
 		    rc = 0;
 		    goto done;
 	    }
 	}
 done:
 	sx_xunlock(&t4_uld_list_lock);
 	return (rc);
 }
 
 int
 t4_activate_uld(struct adapter *sc, int id)
 {
 	int rc;
 	struct uld_info *ui;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (id < 0 || id > ULD_MAX)
 		return (EINVAL);
 	rc = EAGAIN;	/* kldoad the module with this ULD and try again. */
 
 	sx_slock(&t4_uld_list_lock);
 
 	SLIST_FOREACH(ui, &t4_uld_list, link) {
 		if (ui->uld_id == id) {
 			if (!(sc->flags & FULL_INIT_DONE)) {
 				rc = adapter_init(sc);
 				if (rc != 0)
 					break;
 			}
 
 			rc = ui->activate(sc);
 			if (rc == 0) {
 				setbit(&sc->active_ulds, id);
 				ui->refcount++;
 			}
 			break;
 		}
 	}
 
 	sx_sunlock(&t4_uld_list_lock);
 
 	return (rc);
 }
 
 int
 t4_deactivate_uld(struct adapter *sc, int id)
 {
 	int rc;
 	struct uld_info *ui;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (id < 0 || id > ULD_MAX)
 		return (EINVAL);
 	rc = ENXIO;
 
 	sx_slock(&t4_uld_list_lock);
 
 	SLIST_FOREACH(ui, &t4_uld_list, link) {
 		if (ui->uld_id == id) {
 			rc = ui->deactivate(sc);
 			if (rc == 0) {
 				clrbit(&sc->active_ulds, id);
 				ui->refcount--;
 			}
 			break;
 		}
 	}
 
 	sx_sunlock(&t4_uld_list_lock);
 
 	return (rc);
 }
 
 /*
  * Hand an async event to the iWARP ULD, if one is registered.  Runs inside a
  * synchronized op; does nothing if the op cannot be started.
  */
 static void
 t4_async_event(void *arg, int n)
 {
 	struct adapter *sc = arg;
 	struct uld_info *uld;
 
 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4async") != 0)
 		return;
 
 	sx_slock(&t4_uld_list_lock);
 	SLIST_FOREACH(uld, &t4_uld_list, link) {
 		if (uld->uld_id != ULD_IWARP)
 			continue;
 		uld->async_event(sc);
 		break;
 	}
 	sx_sunlock(&t4_uld_list_lock);
 
 	end_synchronized_op(sc, 0);
 }
 
 /*
  * Returns non-zero if the bit for uld_id is set in the adapter's active_ulds.
  */
 int
 uld_active(struct adapter *sc, int uld_id)
 {
 	int active;
 
 	MPASS(uld_id >= 0 && uld_id <= ULD_MAX);
 
 	active = isset(&sc->active_ulds, uld_id);
 	return (active);
 }
 #endif
 
 #ifdef KERN_TLS
 /*
  * Enable or disable NIC TLS for the adapter.  Must be called from within a
  * synchronized op.  Enabling is refused with EAGAIN while any bit is set in
  * the adapter's offload_map.
  */
 static int
 ktls_capability(struct adapter *sc, bool enable)
 {
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (!is_ktls(sc))
 		return (ENODEV);
 	if (hw_off_limits(sc))
 		return (ENXIO);
 
 	/*
 	 * Nothing to do for disable.  If TOE is enabled sometime later
 	 * then toe_capability will reconfigure the hardware.
 	 */
 	if (!enable)
 		return (0);
 
 	if (sc->flags & KERN_TLS_ON)
 		return (0);	/* already on */
 
 	if (sc->offload_map != 0) {
 		CH_WARN(sc,
 		    "Disable TOE on all interfaces associated with "
 		    "this adapter before trying to enable NIC TLS.\n");
 		return (EAGAIN);
 	}
 
 	return (t4_config_kern_tls(sc, true));
 }
 #endif
 
 /*
  * t  = ptr to tunable.
  * nc = number of CPUs.
  * c  = compiled in default for that tunable.
  */
 static void
 calculate_nqueues(int *t, int nc, const int c)
 {
 	int want;
 
 	/* A positive tunable is left exactly as it is. */
 	if (*t > 0)
 		return;
 
 	/* Negative: the magnitude is the request.  Zero: use the default. */
 	if (*t < 0)
 		want = -*t;
 	else
 		want = c;
 
 	/* The result is capped at nc. */
 	*t = min(nc, want);
 }
 
 /*
  * Come up with reasonable defaults for some of the tunables, provided they're
  * not set by the user (in which case we'll use the values as is).
  */
 static void
 tweak_tunables(void)
 {
 	int nc = mp_ncpus;	/* our snapshot of the number of CPUs */
 
 	/*
 	 * Queue counts.  With RSS the main tx/rx queue counts come from the
 	 * number of RSS buckets; otherwise calculate_nqueues derives them from
 	 * the tunable, the CPU count, and the compiled-in default.
 	 */
 	if (t4_ntxq < 1) {
 #ifdef RSS
 		t4_ntxq = rss_getnumbuckets();
 #else
 		calculate_nqueues(&t4_ntxq, nc, NTXQ);
 #endif
 	}
 
 	calculate_nqueues(&t4_ntxq_vi, nc, NTXQ_VI);
 
 	if (t4_nrxq < 1) {
 #ifdef RSS
 		t4_nrxq = rss_getnumbuckets();
 #else
 		calculate_nqueues(&t4_nrxq, nc, NRXQ);
 #endif
 	}
 
 	calculate_nqueues(&t4_nrxq_vi, nc, NRXQ_VI);
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	calculate_nqueues(&t4_nofldtxq, nc, NOFLDTXQ);
 	calculate_nqueues(&t4_nofldtxq_vi, nc, NOFLDTXQ_VI);
 #endif
 #ifdef TCP_OFFLOAD
 	calculate_nqueues(&t4_nofldrxq, nc, NOFLDRXQ);
 	calculate_nqueues(&t4_nofldrxq_vi, nc, NOFLDRXQ_VI);
 #endif
 
 	/*
 	 * Capability masks.  -1 means "not set"; it is replaced by the full
 	 * set when the relevant kernel option is compiled in and by 0
 	 * otherwise.
 	 */
 #if defined(TCP_OFFLOAD) || defined(KERN_TLS)
 	if (t4_toecaps_allowed == -1)
 		t4_toecaps_allowed = FW_CAPS_CONFIG_TOE;
 #else
 	if (t4_toecaps_allowed == -1)
 		t4_toecaps_allowed = 0;
 #endif
 
 #ifdef TCP_OFFLOAD
 	if (t4_rdmacaps_allowed == -1) {
 		t4_rdmacaps_allowed = FW_CAPS_CONFIG_RDMA_RDDP |
 		    FW_CAPS_CONFIG_RDMA_RDMAC;
 	}
 
 	if (t4_iscsicaps_allowed == -1) {
 		t4_iscsicaps_allowed = FW_CAPS_CONFIG_ISCSI_INITIATOR_PDU |
 		    FW_CAPS_CONFIG_ISCSI_TARGET_PDU |
 		    FW_CAPS_CONFIG_ISCSI_T10DIF;
 	}
 
 	/* Out-of-range offload timer/counter indices revert to defaults. */
 	if (t4_tmr_idx_ofld < 0 || t4_tmr_idx_ofld >= SGE_NTIMERS)
 		t4_tmr_idx_ofld = TMR_IDX_OFLD;
 
 	if (t4_pktc_idx_ofld < -1 || t4_pktc_idx_ofld >= SGE_NCOUNTERS)
 		t4_pktc_idx_ofld = PKTC_IDX_OFLD;
 
 	if (t4_toe_tls_rx_timeout < 0)
 		t4_toe_tls_rx_timeout = 0;
 #else
 	if (t4_rdmacaps_allowed == -1)
 		t4_rdmacaps_allowed = 0;
 
 	if (t4_iscsicaps_allowed == -1)
 		t4_iscsicaps_allowed = 0;
 #endif
 
 #ifdef DEV_NETMAP
 	calculate_nqueues(&t4_nnmtxq, nc, NNMTXQ);
 	calculate_nqueues(&t4_nnmrxq, nc, NNMRXQ);
 	calculate_nqueues(&t4_nnmtxq_vi, nc, NNMTXQ_VI);
 	calculate_nqueues(&t4_nnmrxq_vi, nc, NNMRXQ_VI);
 #endif
 
 	/* Out-of-range timer/counter indices revert to defaults. */
 	if (t4_tmr_idx < 0 || t4_tmr_idx >= SGE_NTIMERS)
 		t4_tmr_idx = TMR_IDX;
 
 	if (t4_pktc_idx < -1 || t4_pktc_idx >= SGE_NCOUNTERS)
 		t4_pktc_idx = PKTC_IDX;
 
 	/* Queue sizes are at least 128; the rx size is a multiple of 8. */
 	if (t4_qsize_txq < 128)
 		t4_qsize_txq = 128;
 
 	if (t4_qsize_rxq < 128)
 		t4_qsize_rxq = 128;
 	while (t4_qsize_rxq & 7)
 		t4_qsize_rxq++;
 
 	/* Drop any bits that are not a known interrupt type. */
 	t4_intr_types &= INTR_MSIX | INTR_MSI | INTR_INTX;
 
 	/*
 	 * Number of VIs to create per-port.  The first VI is the "main" regular
 	 * VI for the port.  The rest are additional virtual interfaces on the
 	 * same physical port.  Note that the main VI does not have native
 	 * netmap support but the extra VIs do.
 	 *
 	 * Limit the number of VIs per port to the number of available
 	 * MAC addresses per port.
 	 */
 	if (t4_num_vis < 1)
 		t4_num_vis = 1;
 	if (t4_num_vis > nitems(vi_mac_funcs)) {
 		t4_num_vis = nitems(vi_mac_funcs);
 		printf("cxgbe: number of VIs limited to %d\n", t4_num_vis);
 	}
 
 	/*
 	 * An out-of-range relaxed ordering setting becomes 1, except on
 	 * i386/amd64 with an Intel CPU where it becomes 0.
 	 */
 	if (pcie_relaxed_ordering < 0 || pcie_relaxed_ordering > 2) {
 		pcie_relaxed_ordering = 1;
 #if defined(__i386__) || defined(__amd64__)
 		if (cpu_vendor_id == CPU_VENDOR_INTEL)
 			pcie_relaxed_ordering = 0;
 #endif
 	}
 }
 
 #ifdef DDB
 /*
  * Print the TCB for the given tid from ddb.  The TCB is read through memory
  * window 2; the window's offset register is saved on entry and restored
  * before returning.
  */
 static void
 t4_dump_tcb(struct adapter *sc, int tid)
 {
 	uint32_t base, i, j, off, pf, reg, save, tcb_addr, win_pos;
 
 	/* Remember where window 2 currently points so it can be restored. */
 	reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2);
 	save = t4_read_reg(sc, reg);
 	base = sc->memwin[2].mw_base;
 
 	/* Dump TCB for the tid */
 	tcb_addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE);
 	tcb_addr += tid * TCB_SIZE;
 
 	if (is_t4(sc)) {
 		pf = 0;
 		win_pos = tcb_addr & ~0xf;	/* start must be 16B aligned */
 	} else {
 		pf = V_PFNUM(sc->pf);
 		win_pos = tcb_addr & ~0x7f;	/* start must be 128B aligned */
 	}
 	t4_write_reg(sc, reg, win_pos | pf);
 	t4_read_reg(sc, reg);	/* read back after the write */
 
 	/* 4 rows of 8 32-bit words, starting at the TCB's offset in the window. */
 	off = tcb_addr - win_pos;
 	for (i = 0; i < 4; i++) {
 		uint32_t buf[8];
 		for (j = 0; j < 8; j++, off += 4)
 			buf[j] = htonl(t4_read_reg(sc, base + off));
 
 		db_printf("%08x %08x %08x %08x %08x %08x %08x %08x\n",
 		    buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6],
 		    buf[7]);
 	}
 
 	/* Put the window back where it was. */
 	t4_write_reg(sc, reg, save);
 	t4_read_reg(sc, reg);
 }
 
 /*
  * Print the firmware device log from ddb.  The log is treated as a circular
  * buffer of struct fw_devlog_e: the entry with the smallest timestamp is
  * located first and entries are then printed from there, wrapping around.
  */
 static void
 t4_dump_devlog(struct adapter *sc)
 {
 	struct devlog_params *dparams = &sc->params.devlog;
 	struct fw_devlog_e e;
 	int i, first, j, m, nentries, rc;
 	uint64_t ftstamp = UINT64_MAX;
 
 	if (dparams->start == 0) {
 		db_printf("devlog params not valid\n");
 		return;
 	}
 
 	nentries = dparams->size / sizeof(struct fw_devlog_e);
 	m = fwmtype_to_hwmtype(dparams->memtype);
 
 	/* Find the first entry. */
 	first = -1;
 	for (i = 0; i < nentries && !db_pager_quit; i++) {
 		rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e),
 		    sizeof(e), (void *)&e);
 		if (rc != 0)
 			break;
 
 		/* A zero timestamp ends the scan. */
 		if (e.timestamp == 0)
 			break;
 
 		e.timestamp = be64toh(e.timestamp);
 		if (e.timestamp < ftstamp) {
 			ftstamp = e.timestamp;
 			first = i;
 		}
 	}
 
 	/* Nothing usable was found. */
 	if (first == -1)
 		return;
 
 	/* Print from the first entry until we wrap around to it again. */
 	i = first;
 	do {
 		rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e),
 		    sizeof(e), (void *)&e);
 		if (rc != 0)
 			return;
 
 		if (e.timestamp == 0)
 			return;
 
 		/* Convert the big-endian fields to host order. */
 		e.timestamp = be64toh(e.timestamp);
 		e.seqno = be32toh(e.seqno);
 		for (j = 0; j < 8; j++)
 			e.params[j] = be32toh(e.params[j]);
 
 		db_printf("%10d  %15ju  %8s  %8s  ",
 		    e.seqno, e.timestamp,
 		    (e.level < nitems(devlog_level_strings) ?
 			devlog_level_strings[e.level] : "UNKNOWN"),
 		    (e.facility < nitems(devlog_facility_strings) ?
 			devlog_facility_strings[e.facility] : "UNKNOWN"));
 		/* The format string is the one stored in the log entry itself. */
 		db_printf(e.fmt, e.params[0], e.params[1], e.params[2],
 		    e.params[3], e.params[4], e.params[5], e.params[6],
 		    e.params[7]);
 
 		if (++i == nentries)
 			i = 0;
 	} while (i != first && !db_pager_quit);
 }
 
 /* ddb command table for the "show t4 ..." commands defined below. */
 static struct command_table db_t4_table = LIST_HEAD_INITIALIZER(db_t4_table);
 _DB_SET(_show, t4, NULL, db_show_table, 0, &db_t4_table);
 
 /* ddb: show t4 devlog <nexus> */
 DB_FUNC(devlog, db_show_devlog, db_t4_table, CS_OWN, NULL)
 {
 	device_t dev;
 	bool have_name;
 
 	/* The only argument is an identifier naming the device. */
 	have_name = (db_read_token() == tIDENT);
 	if (have_name)
 		dev = device_lookup_by_name(db_tok_string);
 	db_skip_to_eol();
 
 	if (!have_name) {
 		db_printf("usage: show t4 devlog <nexus>\n");
 		return;
 	}
 	if (dev == NULL) {
 		db_printf("device not found\n");
 		return;
 	}
 
 	t4_dump_devlog(device_get_softc(dev));
 }
 
 DB_FUNC(tcb, db_show_t4tcb, db_t4_table, CS_OWN, NULL)
 {
 	device_t dev;
 	int radix, tid, t;
 	bool valid;
 
 	valid = false;
 	radix = db_radix;
 	db_radix = 10;
 	t = db_read_token();
 	if (t == tIDENT) {
 		dev = device_lookup_by_name(db_tok_string);
 		t = db_read_token();
 		if (t == tNUMBER) {
 			tid = db_tok_number;
 			valid = true;
 		}
 	}	
 	db_radix = radix;
 	db_skip_to_eol();
 	if (!valid) {
 		db_printf("usage: show t4 tcb <nexus> <tid>\n");
 		return;
 	}
 
 	if (dev == NULL) {
 		db_printf("device not found\n");
 		return;
 	}
 	if (tid < 0) {
 		db_printf("invalid tid\n");
 		return;
 	}
 
 	t4_dump_tcb(device_get_softc(dev), tid);
 }
 #endif
 
 /* Tags for the vxlan_start/vxlan_stop event handlers set up in mod_event. */
 static eventhandler_tag vxlan_start_evtag;
 static eventhandler_tag vxlan_stop_evtag;
 
 /* Arguments handed from the vxlan event handlers to the per-adapter work. */
 struct vxlan_evargs {
 	struct ifnet *ifp;	/* ifnet passed to the event handler */
 	uint16_t port;		/* port passed to the event handler */
 };
 
 /*
  * Program the VXLAN port into the MPS and add an all-zeroes raw MAC filter
  * for every port that does not have its VXLAN TCAM entry yet.  Must be called
  * from within a synchronized op.
  */
 static void
 enable_vxlan_rx(struct adapter *sc)
 {
 	uint8_t match_all_mac[ETHER_ADDR_LEN] = {0};
 	struct port_info *pi;
 	int i, rc;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	t4_write_reg(sc, A_MPS_RX_VXLAN_TYPE,
 	    V_VXLAN(sc->vxlan_port) | F_VXLAN_EN);
 
 	for_each_port(sc, i) {
 		pi = sc->port[i];
 		if (pi->vxlan_tcam_entry == true)
 			continue;	/* this port is already set up */
 
 		rc = t4_alloc_raw_mac_filt(sc, pi->vi[0].viid, match_all_mac,
 		    match_all_mac, sc->rawf_base + pi->port_id, 1, pi->port_id,
 		    true);
 		if (rc >= 0) {
 			/* Success returns the index that was requested. */
 			MPASS(rc == sc->rawf_base + pi->port_id);
 			pi->vxlan_tcam_entry = true;
 			continue;
 		}
 
 		rc = -rc;
 		CH_ERR(&pi->vi[0],
 		    "failed to add VXLAN TCAM entry: %d.\n", rc);
 	}
 }
 
 /*
  * Per-adapter handling of a vxlan_start event.  The first start records the
  * port and enables VXLAN rx; later starts for the same port only bump the
  * refcount; a start for a different port is logged and ignored.
  */
 static void
 t4_vxlan_start(struct adapter *sc, void *arg)
 {
 	struct vxlan_evargs *ev = arg;
 
 	if (sc->nrawf == 0 || chip_id(sc) <= CHELSIO_T5)
 		return;
 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxst") != 0)
 		return;
 
 	if (sc->vxlan_refcount != 0) {
 		if (sc->vxlan_port == ev->port) {
 			sc->vxlan_refcount++;
 		} else {
 			CH_ERR(sc, "VXLAN already configured on port  %d; "
 			    "ignoring attempt to configure it on port %d\n",
 			    sc->vxlan_port, ev->port);
 		}
 	} else {
 		sc->vxlan_port = ev->port;
 		sc->vxlan_refcount = 1;
 		if (!hw_off_limits(sc))
 			enable_vxlan_rx(sc);
 	}
 
 	end_synchronized_op(sc, 0);
 }
 
 /*
  * Per-adapter handling of a vxlan_stop event.  Drops one reference for the
  * configured port and turns VXLAN rx off in the hardware when the last
  * reference goes away.
  */
 static void
 t4_vxlan_stop(struct adapter *sc, void *arg)
 {
 	struct vxlan_evargs *ev = arg;
 
 	if (sc->nrawf == 0 || chip_id(sc) <= CHELSIO_T5)
 		return;
 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxsp") != 0)
 		return;
 
 	/*
 	 * VXLANs may have been configured before the driver was loaded so we
 	 * may see more stops than starts.  This is not handled cleanly but at
 	 * least we keep the refcount sane.
 	 */
 	if (sc->vxlan_port == ev->port) {
 		if (sc->vxlan_refcount == 0) {
 			CH_ERR(sc, "VXLAN operation on port %d was stopped earlier; "
 			    "ignoring attempt to stop it again.\n", sc->vxlan_port);
 		} else {
 			sc->vxlan_refcount--;
 			if (sc->vxlan_refcount == 0 && !hw_off_limits(sc)) {
 				t4_set_reg_field(sc, A_MPS_RX_VXLAN_TYPE,
 				    F_VXLAN_EN, 0);
 			}
 		}
 	}
 
 	end_synchronized_op(sc, 0);
 }
 
 /* vxlan_start event handler: run t4_vxlan_start via t4_iterate. */
 static void
 t4_vxlan_start_handler(void *arg __unused, struct ifnet *ifp,
     sa_family_t family, u_int port)
 {
 	struct vxlan_evargs ev = { .ifp = ifp, .port = port };
 
 	MPASS(family == AF_INET || family == AF_INET6);
 
 	t4_iterate(t4_vxlan_start, &ev);
 }
 
 /* vxlan_stop event handler: run t4_vxlan_stop via t4_iterate. */
 static void
 t4_vxlan_stop_handler(void *arg __unused, struct ifnet *ifp, sa_family_t family,
     u_int port)
 {
 	struct vxlan_evargs ev = { .ifp = ifp, .port = port };
 
 	MPASS(family == AF_INET || family == AF_INET6);
 
 	t4_iterate(t4_vxlan_stop, &ev);
 }
 
 
 /* Held exclusively by mod_event around its MOD_LOAD and MOD_UNLOAD work. */
 static struct sx mlu;	/* mod load unload */
 SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload");
 
 /*
  * Module event handler shared by the t4nex, t5nex and t6nex drivers.  The
  * static "loaded" counter makes the one-time setup run on the first MOD_LOAD
  * only and the teardown run when the count returns to zero on MOD_UNLOAD.
  * Returns 0 or EBUSY.
  */
 static int
 mod_event(module_t mod, int cmd, void *arg)
 {
 	int rc = 0;
 	static int loaded = 0;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		sx_xlock(&mlu);
 		/* Only the first load performs the global initialization. */
 		if (loaded++ == 0) {
 			t4_sge_modload();
 			t4_register_shared_cpl_handler(CPL_SET_TCB_RPL,
 			    t4_filter_rpl, CPL_COOKIE_FILTER);
 			t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL,
 			    do_l2t_write_rpl, CPL_COOKIE_FILTER);
 			t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL,
 			    t4_hashfilter_ao_rpl, CPL_COOKIE_HASHFILTER);
 			t4_register_shared_cpl_handler(CPL_SET_TCB_RPL,
 			    t4_hashfilter_tcb_rpl, CPL_COOKIE_HASHFILTER);
 			t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS,
 			    t4_del_hashfilter_rpl, CPL_COOKIE_HASHFILTER);
 			t4_register_cpl_handler(CPL_TRACE_PKT, t4_trace_pkt);
 			t4_register_cpl_handler(CPL_T5_TRACE_PKT, t5_trace_pkt);
 			t4_register_cpl_handler(CPL_SMT_WRITE_RPL,
 			    do_smt_write_rpl);
 			sx_init(&t4_list_lock, "T4/T5 adapters");
 			SLIST_INIT(&t4_list);
 			callout_init(&fatal_callout, 1);
 #ifdef TCP_OFFLOAD
 			sx_init(&t4_uld_list_lock, "T4/T5 ULDs");
 			SLIST_INIT(&t4_uld_list);
 #endif
 #ifdef INET6
 			t4_clip_modload();
 #endif
 #ifdef KERN_TLS
 			t6_ktls_modload();
 #endif
 			t4_tracer_modload();
 			tweak_tunables();
 			vxlan_start_evtag =
 			    EVENTHANDLER_REGISTER(vxlan_start,
 				t4_vxlan_start_handler, NULL,
 				EVENTHANDLER_PRI_ANY);
 			vxlan_stop_evtag =
 			    EVENTHANDLER_REGISTER(vxlan_stop,
 				t4_vxlan_stop_handler, NULL,
 				EVENTHANDLER_PRI_ANY);
 			reset_tq = taskqueue_create("t4_rst_tq", M_WAITOK,
 			    taskqueue_thread_enqueue, &reset_tq);
 			taskqueue_start_threads(&reset_tq, 1, PI_SOFT,
 			    "t4_rst_thr");
 		}
 		sx_xunlock(&mlu);
 		break;
 
 	case MOD_UNLOAD:
 		sx_xlock(&mlu);
 		/* Only the unload that brings the count to zero tears down. */
 		if (--loaded == 0) {
 			int tries;
 
 			/*
 			 * NOTE(review): the two EBUSY exits through done_unload
 			 * below happen after "loaded" was decremented and
 			 * reset_tq was freed, and neither is undone there
 			 * (compare the "undo earlier decrement" path further
 			 * down) -- confirm this is intended.
 			 */
 			taskqueue_free(reset_tq);
 			sx_slock(&t4_list_lock);
 			if (!SLIST_EMPTY(&t4_list)) {
 				rc = EBUSY;
 				sx_sunlock(&t4_list_lock);
 				goto done_unload;
 			}
 #ifdef TCP_OFFLOAD
 			sx_slock(&t4_uld_list_lock);
 			if (!SLIST_EMPTY(&t4_uld_list)) {
 				rc = EBUSY;
 				sx_sunlock(&t4_uld_list_lock);
 				sx_sunlock(&t4_list_lock);
 				goto done_unload;
 			}
 #endif
 			/*
 			 * Wait, up to 5 times for 2 seconds each, for the
 			 * count reported by t4_sge_extfree_refs to reach 0.
 			 */
 			tries = 0;
 			while (tries++ < 5 && t4_sge_extfree_refs() != 0) {
 				uprintf("%ju clusters with custom free routine "
 				    "still is use.\n", t4_sge_extfree_refs());
 				pause("t4unload", 2 * hz);
 			}
 #ifdef TCP_OFFLOAD
 			sx_sunlock(&t4_uld_list_lock);
 #endif
 			sx_sunlock(&t4_list_lock);
 
 			if (t4_sge_extfree_refs() == 0) {
 				/* Undo the MOD_LOAD setup. */
 				EVENTHANDLER_DEREGISTER(vxlan_start,
 				    vxlan_start_evtag);
 				EVENTHANDLER_DEREGISTER(vxlan_stop,
 				    vxlan_stop_evtag);
 				t4_tracer_modunload();
 #ifdef KERN_TLS
 				t6_ktls_modunload();
 #endif
 #ifdef INET6
 				t4_clip_modunload();
 #endif
 #ifdef TCP_OFFLOAD
 				sx_destroy(&t4_uld_list_lock);
 #endif
 				sx_destroy(&t4_list_lock);
 				t4_sge_modunload();
 				loaded = 0;
 			} else {
 				rc = EBUSY;
 				loaded++;	/* undo earlier decrement */
 			}
 		}
 done_unload:
 		sx_xunlock(&mlu);
 		break;
 	}
 
 	return (rc);
 }
 
 /* Device classes for the nexus, port and extra-VI drivers declared below. */
 static devclass_t t4_devclass, t5_devclass, t6_devclass;
 static devclass_t cxgbe_devclass, cxl_devclass, cc_devclass;
 static devclass_t vcxgbe_devclass, vcxl_devclass, vcc_devclass;
 
 /*
  * Nexus drivers attach to pci and share mod_event as their module event
  * handler.  Each depends on the firmware module and, with DEV_NETMAP, on
  * netmap.
  */
 DRIVER_MODULE(t4nex, pci, t4_driver, t4_devclass, mod_event, 0);
 MODULE_VERSION(t4nex, 1);
 MODULE_DEPEND(t4nex, firmware, 1, 1, 1);
 #ifdef DEV_NETMAP
 MODULE_DEPEND(t4nex, netmap, 1, 1, 1);
 #endif /* DEV_NETMAP */
 
 DRIVER_MODULE(t5nex, pci, t5_driver, t5_devclass, mod_event, 0);
 MODULE_VERSION(t5nex, 1);
 MODULE_DEPEND(t5nex, firmware, 1, 1, 1);
 #ifdef DEV_NETMAP
 MODULE_DEPEND(t5nex, netmap, 1, 1, 1);
 #endif /* DEV_NETMAP */
 
 DRIVER_MODULE(t6nex, pci, t6_driver, t6_devclass, mod_event, 0);
 MODULE_VERSION(t6nex, 1);
 MODULE_DEPEND(t6nex, firmware, 1, 1, 1);
 #ifdef DEV_NETMAP
 MODULE_DEPEND(t6nex, netmap, 1, 1, 1);
 #endif /* DEV_NETMAP */
 
 /* cxgbe/cxl/cc attach to the t4nex/t5nex/t6nex nexus respectively. */
 DRIVER_MODULE(cxgbe, t4nex, cxgbe_driver, cxgbe_devclass, 0, 0);
 MODULE_VERSION(cxgbe, 1);
 
 DRIVER_MODULE(cxl, t5nex, cxl_driver, cxl_devclass, 0, 0);
 MODULE_VERSION(cxl, 1);
 
 DRIVER_MODULE(cc, t6nex, cc_driver, cc_devclass, 0, 0);
 MODULE_VERSION(cc, 1);
 
 /* vcxgbe/vcxl/vcc attach to cxgbe/cxl/cc respectively. */
 DRIVER_MODULE(vcxgbe, cxgbe, vcxgbe_driver, vcxgbe_devclass, 0, 0);
 MODULE_VERSION(vcxgbe, 1);
 
 DRIVER_MODULE(vcxl, cxl, vcxl_driver, vcxl_devclass, 0, 0);
 MODULE_VERSION(vcxl, 1);
 
 DRIVER_MODULE(vcc, cc, vcc_driver, vcc_devclass, 0, 0);
 MODULE_VERSION(vcc, 1);
diff --git a/sys/dev/cxgbe/t4_sched.c b/sys/dev/cxgbe/t4_sched.c
index b19e62474bbb..82f8537bda38 100644
--- a/sys/dev/cxgbe/t4_sched.c
+++ b/sys/dev/cxgbe/t4_sched.c
@@ -1,979 +1,991 @@
 /*-
  * Copyright (c) 2017 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 
 #include <sys/types.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/taskqueue.h>
 #include <sys/sysctl.h>
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_msg.h"
 
-
 /*
  * Returns non-zero if val is negative or lies within [lo, hi].  Negative
  * values are accepted on purpose; callers that need a non-negative value
  * check for that separately.
  */
 static int
 in_range(int val, int lo, int hi)
 {
 
 	if (val < 0)
 		return (1);
 
 	return (lo <= val && val <= hi);
 }
 
 static int
 set_sched_class_config(struct adapter *sc, int minmax)
 {
 	int rc;
 
 	if (minmax < 0)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4sscc");
 	if (rc)
 		return (rc);
 	if (hw_off_limits(sc))
 		rc = ENXIO;
 	else
 		rc = -t4_sched_config(sc, FW_SCHED_TYPE_PKTSCHED, minmax, 1);
 	end_synchronized_op(sc, 0);
 
 	return (rc);
 }
 
 /*
  * Validate the scheduling class parameters in p, translate them to their
  * firmware equivalents, and program them via t4_sched_params.  For a CL_RL
  * class the driver's tx_cl_rl_params entry is updated first (and restored if
  * the synchronized op cannot be started).  sleep_ok selects between a
  * sleepable synchronized op and one that holds the lock.
  *
  * Returns 0 on success, EINVAL/ERANGE for bad parameters, ENXIO if there is
  * no port on the channel, EBUSY if the class is in use or being updated,
  * otherwise the error from the synchronized op or the firmware call.
  */
 static int
 set_sched_class_params(struct adapter *sc, struct t4_sched_class_params *p,
     int sleep_ok)
 {
 	int rc, top_speed, fw_level, fw_mode, fw_rateunit, fw_ratemode;
 	struct port_info *pi;
 	struct tx_cl_rl_params *tc, old;
 	bool check_pktsize = false;
 
 	/* Map the requested level to its firmware value. */
 	if (p->level == SCHED_CLASS_LEVEL_CL_RL)
 		fw_level = FW_SCHED_PARAMS_LEVEL_CL_RL;
 	else if (p->level == SCHED_CLASS_LEVEL_CL_WRR)
 		fw_level = FW_SCHED_PARAMS_LEVEL_CL_WRR;
 	else if (p->level == SCHED_CLASS_LEVEL_CH_RL)
 		fw_level = FW_SCHED_PARAMS_LEVEL_CH_RL;
 	else
 		return (EINVAL);
 
 	/* The mode is only meaningful for class rate limiting. */
 	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
 		if (p->mode == SCHED_CLASS_MODE_CLASS)
 			fw_mode = FW_SCHED_PARAMS_MODE_CLASS;
 		else if (p->mode == SCHED_CLASS_MODE_FLOW) {
 			check_pktsize = true;
 			fw_mode = FW_SCHED_PARAMS_MODE_FLOW;
 		} else
 			return (EINVAL);
 	} else
 		fw_mode = 0;
 
 	/* Valid channel must always be provided. */
 	if (p->channel < 0)
 		return (EINVAL);
 	if (!in_range(p->channel, 0, sc->chip_params->nchan - 1))
 		return (ERANGE);
 
 	pi = sc->port[sc->chan_map[p->channel]];
 	if (pi == NULL)
 		return (ENXIO);
 	MPASS(pi->tx_chan == p->channel);
 	top_speed = port_top_speed(pi) * 1000000; /* Gbps -> Kbps */
 
 	if (p->level == SCHED_CLASS_LEVEL_CL_RL ||
 	    p->level == SCHED_CLASS_LEVEL_CH_RL) {
 		/*
 		 * Valid rate (mode, unit and values) must be provided.
 		 */
 
 		/* A negative minrate is treated as 0. */
 		if (p->minrate < 0)
 			p->minrate = 0;
 		if (p->maxrate < 0)
 			return (EINVAL);
 
 		if (p->rateunit == SCHED_CLASS_RATEUNIT_BITS) {
 			fw_rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
 			/* ratemode could be relative (%) or absolute. */
 			if (p->ratemode == SCHED_CLASS_RATEMODE_REL) {
 				fw_ratemode = FW_SCHED_PARAMS_RATE_REL;
 				/* maxrate is % of port bandwidth. */
 				if (!in_range(p->minrate, 0, 100) ||
 				    !in_range(p->maxrate, 0, 100)) {
 					return (ERANGE);
 				}
 			} else if (p->ratemode == SCHED_CLASS_RATEMODE_ABS) {
 				fw_ratemode = FW_SCHED_PARAMS_RATE_ABS;
 				/* maxrate is absolute value in kbps. */
 				if (!in_range(p->minrate, 0, top_speed) ||
 				    !in_range(p->maxrate, 0, top_speed)) {
 					return (ERANGE);
 				}
 			} else
 				return (EINVAL);
 		} else if (p->rateunit == SCHED_CLASS_RATEUNIT_PKTS) {
 			/* maxrate is the absolute value in pps. */
 			check_pktsize = true;
 			fw_rateunit = FW_SCHED_PARAMS_UNIT_PKTRATE;
 		} else
 			return (EINVAL);
 	} else {
 		MPASS(p->level == SCHED_CLASS_LEVEL_CL_WRR);
 
 		/*
 		 * Valid weight must be provided.
 		 */
 		if (p->weight < 0)
 		       return (EINVAL);
 		if (!in_range(p->weight, 1, 99))
 			return (ERANGE);
 
 		fw_rateunit = 0;
 		fw_ratemode = 0;
 	}
 
 	if (p->level == SCHED_CLASS_LEVEL_CL_RL ||
 	    p->level == SCHED_CLASS_LEVEL_CL_WRR) {
 		/*
 		 * Valid scheduling class must be provided.
 		 */
 		if (p->cl < 0)
 			return (EINVAL);
 		if (!in_range(p->cl, 0, sc->params.nsched_cls - 1))
 			return (ERANGE);
 	}
 
 	/* pktsize must be between 64 and the main VI's MTU when it is used. */
 	if (check_pktsize) {
 		if (p->pktsize < 0)
 			return (EINVAL);
 		if (!in_range(p->pktsize, 64, pi->vi[0].ifp->if_mtu))
 			return (ERANGE);
 	}
 
 	/*
 	 * For CL_RL, claim the class under tc_lock before touching the
 	 * hardware.  The previous contents are saved in "old" so that they
 	 * can be restored if the synchronized op fails to start.
 	 */
 	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
 		tc = &pi->sched_params->cl_rl[p->cl];
 		mtx_lock(&sc->tc_lock);
 		if (tc->refcount > 0 || tc->state == CS_HW_UPDATE_IN_PROGRESS)
 			rc = EBUSY;
 		else {
 			old = *tc;
 
 			tc->flags |= CF_USER;
 			tc->state = CS_HW_UPDATE_IN_PROGRESS;
 			tc->ratemode = fw_ratemode;
 			tc->rateunit = fw_rateunit;
 			tc->mode = fw_mode;
 			tc->maxrate = p->maxrate;
 			tc->pktsize = p->pktsize;
 			rc = 0;
 		}
 		mtx_unlock(&sc->tc_lock);
 		if (rc != 0)
 			return (rc);
 	}
 
 	rc = begin_synchronized_op(sc, NULL,
 	    sleep_ok ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4sscp");
 	if (rc != 0) {
 		/* Could not start the op: put the class back as it was. */
 		if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
 			mtx_lock(&sc->tc_lock);
 			MPASS(tc->refcount == 0);
 			MPASS(tc->flags & CF_USER);
 			MPASS(tc->state == CS_HW_UPDATE_IN_PROGRESS);
 			*tc = old;
 			mtx_unlock(&sc->tc_lock);
 		}
 		return (rc);
 	}
 	/* The firmware call is skipped (rc stays 0) if hw is off limits. */
 	if (!hw_off_limits(sc)) {
 		rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED, fw_level,
 		    fw_mode, fw_rateunit, fw_ratemode, p->channel, p->cl,
 		    p->minrate, p->maxrate, p->weight, p->pktsize, 0, sleep_ok);
 	}
 	end_synchronized_op(sc, sleep_ok ? 0 : LOCK_HELD);
 
 	/* Record the outcome in the class's state. */
 	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
 		mtx_lock(&sc->tc_lock);
 		MPASS(tc->refcount == 0);
 		MPASS(tc->flags & CF_USER);
 		MPASS(tc->state == CS_HW_UPDATE_IN_PROGRESS);
 
 		if (rc == 0)
 			tc->state = CS_HW_CONFIGURED;
 		else {
 			/* parameters failed so we don't park at params_set */
 			tc->state = CS_UNINITIALIZED;
 			tc->flags &= ~CF_USER;
 			CH_ERR(pi, "failed to configure traffic class %d: %d.  "
 			    "params: mode %d, rateunit %d, ratemode %d, "
 			    "channel %d, minrate %d, maxrate %d, pktsize %d, "
 			    "burstsize %d\n", p->cl, rc, fw_mode, fw_rateunit,
 			    fw_ratemode, p->channel, p->minrate, p->maxrate,
 			    p->pktsize, 0);
 		}
 		mtx_unlock(&sc->tc_lock);
 	}
 
 	return (rc);
 }
 
 /*
  * Task handler: walk every port's rate-limit classes and program into the
  * hardware each one whose state is CS_HW_UPDATE_REQUESTED.  tc_lock is
  * dropped around the synchronized op and the firmware call and reacquired
  * afterwards.
  */
 static void
 update_tx_sched(void *context, int pending)
 {
 	int i, j, rc;
 	struct port_info *pi;
 	struct tx_cl_rl_params *tc;
 	struct adapter *sc = context;
 	const int n = sc->params.nsched_cls;
 
 	mtx_lock(&sc->tc_lock);
 	for_each_port(sc, i) {
 		pi = sc->port[i];
 		tc = &pi->sched_params->cl_rl[0];
 		for (j = 0; j < n; j++, tc++) {
 			MPASS(mtx_owned(&sc->tc_lock));
 			if (tc->state != CS_HW_UPDATE_REQUESTED)
 				continue;
 			mtx_unlock(&sc->tc_lock);
 
 			if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
 			    "t4utxs") != 0) {
 				/* Retake the lock; the class stays as requested. */
 				mtx_lock(&sc->tc_lock);
 				continue;
 			}
 			rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED,
 			    FW_SCHED_PARAMS_LEVEL_CL_RL, tc->mode, tc->rateunit,
 			    tc->ratemode, pi->tx_chan, j, 0, tc->maxrate, 0,
 			    tc->pktsize, tc->burstsize, 1);
 			end_synchronized_op(sc, 0);
 
 			mtx_lock(&sc->tc_lock);
 			MPASS(tc->state == CS_HW_UPDATE_REQUESTED);
 			if (rc == 0) {
 				tc->state = CS_HW_CONFIGURED;
 				continue;
 			}
 			/* parameters failed so we try to avoid params_set */
 			if (tc->refcount > 0)
 				tc->state = CS_PARAMS_SET;
 			else
 				tc->state = CS_UNINITIALIZED;
 			CH_ERR(pi, "failed to configure traffic class %d: %d.  "
 			    "params: mode %d, rateunit %d, ratemode %d, "
 			    "channel %d, minrate %d, maxrate %d, pktsize %d, "
 			    "burstsize %d\n", j, rc, tc->mode, tc->rateunit,
 			    tc->ratemode, pi->tx_chan, 0, tc->maxrate,
 			    tc->pktsize, tc->burstsize);
 		}
 	}
 	mtx_unlock(&sc->tc_lock);
 }
 
 /*
  * Entry point for scheduling class requests.  Only the packet scheduler type
  * is accepted; the subcommand selects between the config and params
  * handlers.  Anything else is EINVAL.
  */
 int
 t4_set_sched_class(struct adapter *sc, struct t4_sched_params *p)
 {
 
 	if (p->type != SCHED_CLASS_TYPE_PACKET)
 		return (EINVAL);
 
 	switch (p->subcmd) {
 	case SCHED_CLASS_SUBCMD_CONFIG:
 		return (set_sched_class_config(sc, p->u.config.minmax));
 	case SCHED_CLASS_SUBCMD_PARAMS:
 		return (set_sched_class_params(sc, &p->u.params, 1));
 	default:
 		return (EINVAL);
 	}
 }
 
 static int
 bind_txq_to_traffic_class(struct adapter *sc, struct sge_txq *txq, int idx)
 {
 	struct tx_cl_rl_params *tc0, *tc;
 	int rc, old_idx;
 	uint32_t fw_mnem, fw_class;
 
 	if (!(txq->eq.flags & EQ_HW_ALLOCATED))
 		return (ENXIO);
 
 	mtx_lock(&sc->tc_lock);
 	if (txq->tc_idx == -2) {
 		rc = EBUSY;	/* Another bind/unbind in progress already. */
 		goto done;
 	}
 	if (idx == txq->tc_idx) {
 		rc = 0;		/* No change, nothing to do. */
 		goto done;
 	}
 
 	tc0 = &sc->port[txq->eq.tx_chan]->sched_params->cl_rl[0];
 	if (idx != -1) {
 		/*
 		 * Bind to a different class at index idx.
 		 */
 		tc = &tc0[idx];
 		if (tc->state != CS_HW_CONFIGURED) {
 			rc = ENXIO;
 			goto done;
 		} else {
 			/*
 			 * Ok to proceed.  Place a reference on the new class
 			 * while still holding on to the reference on the
 			 * previous class, if any.
 			 */
 			tc->refcount++;
 		}
 	}
 	/* Mark as busy before letting go of the lock. */
 	old_idx = txq->tc_idx;
 	txq->tc_idx = -2;
 	mtx_unlock(&sc->tc_lock);
 
 	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4btxq");
 	if (rc == 0) {
 		fw_mnem = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) |
 		    V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id));
 		fw_class = idx < 0 ? 0xffffffff : idx;
 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_mnem,
 		    &fw_class);
 		end_synchronized_op(sc, 0);
 	}
 
 	mtx_lock(&sc->tc_lock);
 	MPASS(txq->tc_idx == -2);
 	if (rc == 0) {
 		/*
 		 * Unbind, bind, or bind to a different class succeeded.  Remove
 		 * the reference on the old traffic class, if any.
 		 */
 		if (old_idx != -1) {
 			tc = &tc0[old_idx];
 			MPASS(tc->refcount > 0);
 			tc->refcount--;
 		}
 		txq->tc_idx = idx;
 	} else {
 		/*
 		 * Unbind, bind, or bind to a different class failed.  Remove
 		 * the anticipatory reference on the new traffic class, if any.
 		 */
 		if (idx != -1) {
 			tc = &tc0[idx];
 			MPASS(tc->refcount > 0);
 			tc->refcount--;
 		}
 		txq->tc_idx = old_idx;
 	}
 done:
 	MPASS(txq->tc_idx >= -1 && txq->tc_idx < sc->params.nsched_cls);
 	mtx_unlock(&sc->tc_lock);
 	return (rc);
 }
 
 /*
  * Bind one tx queue, or all tx queues, of a port's main VI to the traffic
  * class p->cl.  A negative p->queue selects every tx queue of the VI.
  */
 int
 t4_set_sched_queue(struct adapter *sc, struct t4_sched_queue *p)
 {
 	struct port_info *pi;
 	struct vi_info *vi;
 	struct sge_txq *txq;
 	int i, rc;
 
 	if (p->port >= sc->params.nports)
 		return (EINVAL);
 
 	/*
 	 * XXX: cxgbetool allows the user to specify the physical port only.  So
 	 * we always operate on the main VI.
 	 */
 	pi = sc->port[p->port];
 	vi = &pi->vi[0];
 
 	/* Checking VI_INIT_DONE outside a synch-op is a harmless race here. */
 	if (!(vi->flags & VI_INIT_DONE))
 		return (EAGAIN);
 	MPASS(vi->ntxq > 0);
 
 	if (!in_range(p->queue, 0, vi->ntxq - 1))
 		return (EINVAL);
 	if (!in_range(p->cl, 0, sc->params.nsched_cls - 1))
 		return (EINVAL);
 
 	if (p->queue >= 0) {
 		/* Only the one specified tx queue is changed. */
 		txq = &sc->sge.txq[vi->first_txq + p->queue];
 		return (bind_txq_to_traffic_class(sc, txq, p->cl));
 	}
 
 	/* All tx queues of the interface; stop at the first failure. */
 	for_each_txq(vi, i, txq) {
 		rc = bind_txq_to_traffic_class(sc, txq, p->cl);
 		if (rc != 0)
 			break;
 	}
 
 	return (rc);
 }
 
 int
 t4_init_tx_sched(struct adapter *sc)
 {
 	int i;
 	const int n = sc->params.nsched_cls;
 	struct port_info *pi;
 
 	mtx_init(&sc->tc_lock, "tx_sched lock", NULL, MTX_DEF);
 	TASK_INIT(&sc->tc_task, 0, update_tx_sched, sc);
 	for_each_port(sc, i) {
 		pi = sc->port[i];
 		pi->sched_params = malloc(sizeof(*pi->sched_params) +
 		    n * sizeof(struct tx_cl_rl_params), M_CXGBE, M_ZERO | M_WAITOK);
 	}
 
 	return (0);
 }
 
 /*
  * Tear down the tx scheduler state: drain the update task, free every
  * port's sched_params, and destroy tc_lock if it was initialized.  Always
  * returns 0.
  */
 int
 t4_free_tx_sched(struct adapter *sc)
 {
 	struct port_info *pi;
 	int i;
 
 	taskqueue_drain(taskqueue_thread, &sc->tc_task);
 
 	for_each_port(sc, i) {
 		pi = sc->port[i];
 		if (pi == NULL)
 			continue;
 		free(pi->sched_params, M_CXGBE);
 	}
 
 	if (mtx_initialized(&sc->tc_lock))
 		mtx_destroy(&sc->tc_lock);
 
 	return (0);
 }
 
 /* Queue the adapter's tx scheduler update task on taskqueue_thread. */
 void
 t4_update_tx_sched(struct adapter *sc)
 {
 
 	(void)taskqueue_enqueue(taskqueue_thread, &sc->tc_task);
 }
 
 /*
  * Reserve a rate-limit class on the port for the given maxrate.  An existing
  * class with identical parameters is shared (its refcount is bumped);
  * otherwise an unused class is claimed and a hardware update is requested.
  * The class index is returned in *tc_idx (-1 if none is available).
  *
  * Returns 0 on success or ENOSPC if no class is available.
  */
 int
 t4_reserve_cl_rl_kbps(struct adapter *sc, int port_id, u_int maxrate,
     int *tc_idx)
 {
 	int rc = 0, fa, fa2, i, pktsize, burstsize;
 	bool update;
 	struct tx_cl_rl_params *tc;
 	struct port_info *pi;
 
 	MPASS(port_id >= 0 && port_id < sc->params.nports);
 
 	/*
 	 * pktsize defaults to the main VI's MTU and burstsize to 4 * pktsize
 	 * unless the port's sched_params carry a positive value.
 	 */
 	pi = sc->port[port_id];
 	if (pi->sched_params->pktsize > 0)
 		pktsize = pi->sched_params->pktsize;
 	else
 		pktsize = pi->vi[0].ifp->if_mtu;
 	if (pi->sched_params->burstsize > 0)
 		burstsize = pi->sched_params->burstsize;
 	else
 		burstsize = pktsize * 4;
 	tc = &pi->sched_params->cl_rl[0];
 
 	update = false;
 	fa = fa2 = -1;
 	mtx_lock(&sc->tc_lock);
 	for (i = 0; i < sc->params.nsched_cls; i++, tc++) {
 		/* An exact match is shared rather than duplicated. */
 		if (tc->state >= CS_PARAMS_SET &&
 		    tc->ratemode == FW_SCHED_PARAMS_RATE_ABS &&
 		    tc->rateunit == FW_SCHED_PARAMS_UNIT_BITRATE &&
 		    tc->mode == FW_SCHED_PARAMS_MODE_FLOW &&
 		    tc->maxrate == maxrate && tc->pktsize == pktsize &&
 		    tc->burstsize == burstsize) {
 			tc->refcount++;
 			*tc_idx = i;
 			if (tc->state == CS_PARAMS_SET) {
 				tc->state = CS_HW_UPDATE_REQUESTED;
 				update = true;
 			}
 			goto done;
 		}
 
 		/* Remember fallback candidates while scanning. */
 		if (fa < 0 && tc->state == CS_UNINITIALIZED) {
 			MPASS(tc->refcount == 0);
 			fa = i;		/* first available, never used. */
 		}
 		if (fa2 < 0 && tc->refcount == 0 && !(tc->flags & CF_USER)) {
 			fa2 = i;	/* first available, used previously.  */
 		}
 	}
 	/* Not found */
 	MPASS(i == sc->params.nsched_cls);
 	/* Prefer a never-used class over one that was used previously. */
 	if (fa == -1)
 		fa = fa2;
 	if (fa == -1) {
 		*tc_idx = -1;
 		rc = ENOSPC;
 	} else {
 		MPASS(fa >= 0 && fa < sc->params.nsched_cls);
 		tc = &pi->sched_params->cl_rl[fa];
 		MPASS(!(tc->flags & CF_USER));
 		MPASS(tc->refcount == 0);
 
 		tc->refcount = 1;
 		tc->state = CS_HW_UPDATE_REQUESTED;
 		tc->ratemode = FW_SCHED_PARAMS_RATE_ABS;
 		tc->rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
 		tc->mode = FW_SCHED_PARAMS_MODE_FLOW;
 		tc->maxrate = maxrate;
 		tc->pktsize = pktsize;
 		tc->burstsize = burstsize;
 		*tc_idx = fa;
 		update = true;
 	}
 done:
 	mtx_unlock(&sc->tc_lock);
 	/* Kick the update task outside of tc_lock. */
 	if (update)
 		t4_update_tx_sched(sc);
 	return (rc);
 }
 
 /*
  * Drop one reference on rate-limit class tc_idx of port port_id.  The
  * refcount is decremented under sc->tc_lock; nothing else about the class is
  * changed here.
  */
 void
 t4_release_cl_rl(struct adapter *sc, int port_id, int tc_idx)
 {
 	struct tx_cl_rl_params *cl;
 
 	MPASS(port_id >= 0 && port_id < sc->params.nports);
 	MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls);
 
 	mtx_lock(&sc->tc_lock);
 	cl = sc->port[port_id]->sched_params->cl_rl + tc_idx;
 	MPASS(cl->refcount > 0);
 	--cl->refcount;
 	mtx_unlock(&sc->tc_lock);
 }
 
 int
 sysctl_tc(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct adapter *sc = vi->adapter;
 	struct sge_txq *txq;
 	int qidx = arg2, rc, tc_idx;
 
 	MPASS(qidx >= vi->first_txq && qidx < vi->first_txq + vi->ntxq);
 
 	txq = &sc->sge.txq[qidx];
 	tc_idx = txq->tc_idx;
 	rc = sysctl_handle_int(oidp, &tc_idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (sc->flags & IS_VF)
 		return (EPERM);
 	if (!in_range(tc_idx, 0, sc->params.nsched_cls - 1))
 		return (EINVAL);
 
 	return (bind_txq_to_traffic_class(sc, txq, tc_idx));
 }
 
 /*
  * Sysctl handler that renders one traffic class's parameters as text.
  * arg1 is the adapter; arg2 packs the port id in bits 16 and up and the class
  * index in the low 16 bits.  Returns ENOMEM if the sbuf cannot be created and
  * ENXIO if the class holds an unrecognized rate unit, rate mode or mode.
  */
 int
 sysctl_tc_params(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct tx_cl_rl_params tc;
 	struct sbuf *sb;
 	int i, rc, port_id, mbps, gbps;
 
 	rc = sysctl_wire_old_buffer(req, 0);
 	if (rc != 0)
 		return (rc);
 
 	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	port_id = arg2 >> 16;
 	MPASS(port_id < sc->params.nports);
 	MPASS(sc->port[port_id] != NULL);
 	i = arg2 & 0xffff;
 	MPASS(i < sc->params.nsched_cls);
 
 	/* Snapshot the class under the lock; format from the private copy. */
 	mtx_lock(&sc->tc_lock);
 	tc = sc->port[port_id]->sched_params->cl_rl[i];
 	mtx_unlock(&sc->tc_lock);
 
 	if (tc.state < CS_PARAMS_SET) {
 		sbuf_printf(sb, "uninitialized");
 		goto done;
 	}
 
 	/* First the rate ... */
 	switch (tc.rateunit) {
 	case SCHED_CLASS_RATEUNIT_BITS:
 		switch (tc.ratemode) {
 		case SCHED_CLASS_RATEMODE_REL:
 			/* XXX: top speed or actual link speed? */
 			gbps = port_top_speed(sc->port[port_id]);
 			sbuf_printf(sb, "%u%% of %uGbps", tc.maxrate, gbps);
 			break;
 		case SCHED_CLASS_RATEMODE_ABS:
 			/*
 			 * maxrate is printed as Kbps in the fallback case; use
 			 * Gbps or Mbps instead when it divides evenly.
 			 */
 			mbps = tc.maxrate / 1000;
 			gbps = tc.maxrate / 1000000;
 			if (tc.maxrate == gbps * 1000000)
 				sbuf_printf(sb, "%uGbps", gbps);
 			else if (tc.maxrate == mbps * 1000)
 				sbuf_printf(sb, "%uMbps", mbps);
 			else
 				sbuf_printf(sb, "%uKbps", tc.maxrate);
 			break;
 		default:
 			rc = ENXIO;
 			goto done;
 		}
 		break;
 	case SCHED_CLASS_RATEUNIT_PKTS:
 		sbuf_printf(sb, "%upps", tc.maxrate);
 		break;
 	default:
 		rc = ENXIO;
 		goto done;
 	}
 
 	/* ... then the scheduling mode and its size parameters. */
 	switch (tc.mode) {
 	case SCHED_CLASS_MODE_CLASS:
 		/* Note that pktsize and burstsize are not used in this mode. */
 		sbuf_printf(sb, " aggregate");
 		break;
 	case SCHED_CLASS_MODE_FLOW:
 		sbuf_printf(sb, " per-flow");
 		if (tc.pktsize > 0)
 			sbuf_printf(sb, " pkt-size %u", tc.pktsize);
 		if (tc.burstsize > 0)
 			sbuf_printf(sb, " burst-size %u", tc.burstsize);
 		break;
 	default:
 		rc = ENXIO;
 		goto done;
 	}
 
 done:
 	/* Only finish the sbuf on success; it is deleted on every path. */
 	if (rc == 0)
 		rc = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (rc);
 }
 
 #ifdef RATELIMIT
 void
 t4_init_etid_table(struct adapter *sc)
 {
 	int i;
 	struct tid_info *t;
 
 	if (!is_ethoffload(sc))
 		return;
 
 	t = &sc->tids;
 	MPASS(t->netids > 0);
 
 	mtx_init(&t->etid_lock, "etid lock", NULL, MTX_DEF);
 	t->etid_tab = malloc(sizeof(*t->etid_tab) * t->netids, M_CXGBE,
 			M_ZERO | M_WAITOK);
 	t->efree = t->etid_tab;
 	t->etids_in_use = 0;
 	for (i = 1; i < t->netids; i++)
 		t->etid_tab[i - 1].next = &t->etid_tab[i];
 	t->etid_tab[t->netids - 1].next = NULL;
 }
 
 /*
  * Free the etid table and destroy its lock (if initialized).  Does nothing
  * when is_ethoffload(sc) is false.
  */
 void
 t4_free_etid_table(struct adapter *sc)
 {
 	struct tid_info *tids = &sc->tids;
 
 	if (!is_ethoffload(sc))
 		return;
 	MPASS(tids->netids > 0);
 
 	free(tids->etid_tab, M_CXGBE);
 	tids->etid_tab = NULL;
 
 	if (mtx_initialized(&tids->etid_lock))
 		mtx_destroy(&tids->etid_lock);
 }
 
 /* etid services */
 static int alloc_etid(struct adapter *, struct cxgbe_rate_tag *);
 static void free_etid(struct adapter *, int);
 
 static int
 alloc_etid(struct adapter *sc, struct cxgbe_rate_tag *cst)
 {
 	struct tid_info *t = &sc->tids;
 	int etid = -1;
 
 	mtx_lock(&t->etid_lock);
 	if (t->efree) {
 		union etid_entry *p = t->efree;
 
 		etid = p - t->etid_tab + t->etid_base;
 		t->efree = p->next;
 		p->cst = cst;
 		t->etids_in_use++;
 	}
 	mtx_unlock(&t->etid_lock);
 	return (etid);
 }
 
 /*
  * Return the rate tag stored in the etid table for etid.  No lock is taken
  * and the index is not range checked here.
  */
 struct cxgbe_rate_tag *
 lookup_etid(struct adapter *sc, int etid)
 {
 	struct tid_info *t = &sc->tids;
 
 	/* etids are offset by etid_base; the table itself is 0-based. */
 	return (t->etid_tab[etid - t->etid_base].cst);
 }
 
 /*
  * Push etid's table entry back onto the head of the free list and drop the
  * in-use count.
  */
 static void
 free_etid(struct adapter *sc, int etid)
 {
 	struct tid_info *tids = &sc->tids;
 	union etid_entry *e;
 
 	e = tids->etid_tab + (etid - tids->etid_base);
 
 	mtx_lock(&tids->etid_lock);
 	e->next = tids->efree;
 	tids->efree = e;
 	tids->etids_in_use--;
 	mtx_unlock(&tids->etid_lock);
 }
 
+static int cxgbe_rate_tag_modify(struct m_snd_tag *,
+    union if_snd_tag_modify_params *);
+static int cxgbe_rate_tag_query(struct m_snd_tag *,
+    union if_snd_tag_query_params *);
+static void cxgbe_rate_tag_free(struct m_snd_tag *);
+
+static const struct if_snd_tag_sw cxgbe_rate_tag_sw = {
+	.snd_tag_modify = cxgbe_rate_tag_modify,
+	.snd_tag_query = cxgbe_rate_tag_query,
+	.snd_tag_free = cxgbe_rate_tag_free,
+	.type = IF_SND_TAG_TYPE_RATE_LIMIT
+};
+
 /*
  * Allocate a rate-limit send tag for ifp.
  *
  * Reserves a scheduling class for the requested rate, allocates the tag and
  * an etid for it, and initializes the tag.  On success returns 0 and stores
  * the tag in *pt.  Returns the error from t4_reserve_cl_rl_kbps() if no class
  * could be reserved, or ENOMEM if either the tag or an etid could not be
  * allocated (the class reservation is released in that case).
  */
 int
 cxgbe_rate_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
     struct m_snd_tag **pt)
 {
 	int rc, schedcl;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct cxgbe_rate_tag *cst;
 
 	MPASS(params->hdr.type == IF_SND_TAG_TYPE_RATE_LIMIT);
 
 	/* NOTE(review): "* 8 / 1000" presumably converts bytes/s to kbps. */
 	rc = t4_reserve_cl_rl_kbps(sc, pi->port_id,
 	    (params->rate_limit.max_rate * 8ULL / 1000), &schedcl);
 	if (rc != 0)
 		return (rc);
 	MPASS(schedcl >= 0 && schedcl < sc->params.nsched_cls);
 
 	cst = malloc(sizeof(*cst), M_CXGBE, M_ZERO | M_NOWAIT);
 	if (cst == NULL) {
 		/* Shared unwind; also reached by goto from the etid failure. */
 failed:
 		t4_release_cl_rl(sc, pi->port_id, schedcl);
 		return (ENOMEM);
 	}
 
 	cst->etid = alloc_etid(sc, cst);
 	if (cst->etid < 0) {
 		free(cst, M_CXGBE);
 		goto failed;
 	}
 
 	mtx_init(&cst->lock, "cst_lock", NULL, MTX_DEF);
 	mbufq_init(&cst->pending_tx, INT_MAX);
 	mbufq_init(&cst->pending_fwack, INT_MAX);
-	m_snd_tag_init(&cst->com, ifp, IF_SND_TAG_TYPE_RATE_LIMIT);
+	m_snd_tag_init(&cst->com, ifp, &cxgbe_rate_tag_sw);
 	cst->flags |= EO_FLOWC_PENDING | EO_SND_TAG_REF;
 	cst->adapter = sc;
 	cst->port_id = pi->port_id;
 	cst->schedcl = schedcl;
 	cst->max_rate = params->rate_limit.max_rate;
 	/* The tag starts with all of its tx credits available. */
 	cst->tx_credits = sc->params.eo_wr_cred;
 	cst->tx_total = cst->tx_credits;
 	cst->plen = 0;
 	cst->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
 	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
 	    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
 
 	/*
 	 * Queues will be selected later when the connection flowid is available.
 	 */
 
 	*pt = &cst->com;
 	return (0);
 }
 
 /*
  * Change in parameters, no change in ifp.
  *
  * Reserves a scheduling class for the new rate and, only if that succeeds,
  * releases the tag's old class and switches the tag over.  Returns 0 on
  * success or the error from t4_reserve_cl_rl_kbps(), in which case the tag
  * keeps its previous class and rate.
  */
-int
+static int
 cxgbe_rate_tag_modify(struct m_snd_tag *mst,
     union if_snd_tag_modify_params *params)
 {
 	int rc, schedcl;
 	struct cxgbe_rate_tag *cst = mst_to_crt(mst);
 	struct adapter *sc = cst->adapter;
 
 	/* XXX: is schedcl -1 ok here? */
 	MPASS(cst->schedcl >= 0 && cst->schedcl < sc->params.nsched_cls);
 
 	mtx_lock(&cst->lock);
 	MPASS(cst->flags & EO_SND_TAG_REF);
 	rc = t4_reserve_cl_rl_kbps(sc, cst->port_id,
 	    (params->rate_limit.max_rate * 8ULL / 1000), &schedcl);
 	if (rc != 0) {
 		/* Every exit must drop cst->lock, including this one. */
 		mtx_unlock(&cst->lock);
 		return (rc);
 	}
 	MPASS(schedcl >= 0 && schedcl < sc->params.nsched_cls);
 	/* The new class is held; now it is safe to let go of the old one. */
 	t4_release_cl_rl(sc, cst->port_id, cst->schedcl);
 	cst->schedcl = schedcl;
 	cst->max_rate = params->rate_limit.max_rate;
 	mtx_unlock(&cst->lock);
 
 	return (0);
 }
 
 /*
  * Report the tag's max_rate and an estimate of its queue level.  The queue
  * level is the number of tx credits currently in use, scaled towards
  * IF_SND_QUEUE_LEVEL_MAX; the scale factor is computed with integer division
  * so it is truncated.  The tag's fields are read without taking cst->lock.
  * NOTE(review): divides by cst->tx_total, which is set from
  * sc->params.eo_wr_cred at allocation -- confirm it can never be 0.
  * Always returns 0.
  */
-int
+static int
 cxgbe_rate_tag_query(struct m_snd_tag *mst,
     union if_snd_tag_query_params *params)
 {
 	struct cxgbe_rate_tag *cst = mst_to_crt(mst);
 
 	params->rate_limit.max_rate = cst->max_rate;
 
 /* Note: this macro is not #undef'd and stays defined past this function. */
 #define CST_TO_MST_QLEVEL_SCALE (IF_SND_QUEUE_LEVEL_MAX / cst->tx_total)
 	params->rate_limit.queue_level =
 		(cst->tx_total - cst->tx_credits) * CST_TO_MST_QLEVEL_SCALE;
 
 	return (0);
 }
 
 /*
  * Final teardown of a rate tag.  Must be entered with cst->lock held.  The
  * lock is released and destroyed here and cst itself is freed, so the caller
  * must not touch cst afterwards.
  */
 void
 cxgbe_rate_tag_free_locked(struct cxgbe_rate_tag *cst)
 {
 	struct adapter *sc;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 	MPASS((cst->flags & EO_SND_TAG_REF) == 0);
 	MPASS(cst->tx_credits == cst->tx_total);
 	MPASS(cst->plen == 0);
 	MPASS(mbufq_first(&cst->pending_tx) == NULL);
 	MPASS(mbufq_first(&cst->pending_fwack) == NULL);
 
 	/* Give back the etid and the scheduling class, if the tag has them. */
 	sc = cst->adapter;
 	if (cst->etid >= 0)
 		free_etid(sc, cst->etid);
 	if (cst->schedcl != -1)
 		t4_release_cl_rl(sc, cst->port_id, cst->schedcl);
 
 	mtx_unlock(&cst->lock);
 	mtx_destroy(&cst->lock);
 	free(cst, M_CXGBE);
 }
 
-void
+static void
 cxgbe_rate_tag_free(struct m_snd_tag *mst)
 {
 	struct cxgbe_rate_tag *cst = mst_to_crt(mst);
 
 	mtx_lock(&cst->lock);
 
 	/* The kernel is done with the snd_tag.  Remove its reference. */
 	MPASS(cst->flags & EO_SND_TAG_REF);
 	cst->flags &= ~EO_SND_TAG_REF;
 
 	if (cst->ncompl == 0) {
 		/*
 		 * No fw4_ack in flight.  Free the tag right away if there are
 		 * no outstanding credits.  Request the firmware to return all
 		 * credits for the etid otherwise.
 		 */
 		if (cst->tx_credits == cst->tx_total) {
 			cxgbe_rate_tag_free_locked(cst);
 			return;	/* cst is gone. */
 		}
 		send_etid_flush_wr(cst);
 	}
 	mtx_unlock(&cst->lock);
 }
 
 /*
  * Fill in q with the rate-limiting limits reported for ifp's adapter: no
  * fixed rate table, RT_IS_SELECTABLE, the maximum number of flows, the number
  * of rates and the minimum segment burst.
  */
 void
 cxgbe_ratelimit_query(struct ifnet *ifp, struct if_ratelimit_query_results *q)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct adapter *sc = vi->adapter;
 
 	q->rate_table = NULL;
 	q->flags = RT_IS_SELECTABLE;
 	/*
 	 * Absolute max limits from the firmware configuration.  Practical
 	 * limits depend on the burstsize, pktsize (ifp->if_mtu ultimately) and
 	 * the card's cclk.
 	 */
 	q->max_flows = sc->tids.netids;
 	q->number_of_rates = sc->params.nsched_cls;
 	q->min_segment_burst = 4; /* matches PKTSCHED_BURST in the firmware. */
 
 #if 1
 	/* Both branches currently apply the same cap of 4000 flows. */
 	if (chip_id(sc) < CHELSIO_T6) {
 		/* Based on testing by rrs@ with a T580 at burstsize = 4. */
 		MPASS(q->min_segment_burst == 4);
 		q->max_flows = min(4000, q->max_flows);
 	} else {
 		/* XXX: TBD, carried forward from T5 for now. */
 		q->max_flows = min(4000, q->max_flows);
 	}
 
 	/*
 	 * XXX: tcp_ratelimit.c grabs all available rates on link-up before it
 	 * even knows whether hw pacing will be used or not.  This prevents
 	 * other consumers like SO_MAX_PACING_RATE or those using cxgbetool or
 	 * the private ioctls from using any of the traffic classes.
 	 *
 	 * Underreport the number of rates to tcp_ratelimit so that it doesn't
 	 * hog all of them.  This can be removed if/when tcp_ratelimit switches
 	 * to making its allocations on first-use rather than link-up.  There is
 	 * nothing wrong with one particular consumer reserving all the classes
 	 * but it should do so only if it'll actually use hw rate limiting.
 	 */
 	q->number_of_rates /= 4;
 #endif
 }
 #endif
diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c
index d927d34b616b..8a502907d172 100644
--- a/sys/dev/cxgbe/t4_sge.c
+++ b/sys/dev/cxgbe/t4_sge.c
@@ -1,6912 +1,6912 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 
 #include <sys/types.h>
 #include <sys/eventhandler.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/taskqueue.h>
 #include <sys/time.h>
 #include <sys/sglist.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 #include <sys/socketvar.h>
 #include <sys/counter.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_vlan_var.h>
 #include <net/if_vxlan.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <machine/in_cksum.h>
 #include <machine/md_var.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #ifdef DEV_NETMAP
 #include <machine/bus.h>
 #include <sys/selinfo.h>
 #include <net/if_var.h>
 #include <net/netmap.h>
 #include <dev/netmap/netmap_kern.h>
 #endif
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_msg.h"
 #include "t4_l2t.h"
 #include "t4_mp_ring.h"
 
 #ifdef T4_PKT_TIMESTAMP
 #define RX_COPY_THRESHOLD (MINCLSIZE - 8)
 #else
 #define RX_COPY_THRESHOLD MINCLSIZE
 #endif
 
 /* Internal mbuf flags stored in PH_loc.eight[1]. */
 #define	MC_NOMAP		0x01
 #define	MC_RAW_WR		0x02
 #define	MC_TLS			0x04
 
 /*
  * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
  * 0-7 are valid values.
  */
 static int fl_pktshift = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pktshift, CTLFLAG_RDTUN, &fl_pktshift, 0,
     "payload DMA offset in rx buffer (bytes)");
 
 /*
  * Pad ethernet payload up to this boundary.
  * -1: driver should figure out a good value.
  *  0: disable padding.
  *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
  */
 int fl_pad = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pad, CTLFLAG_RDTUN, &fl_pad, 0,
     "payload pad boundary (bytes)");
 
 /*
  * Status page length.
  * -1: driver should figure out a good value.
  *  64 or 128 are the only other valid values.
  */
 static int spg_len = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, spg_len, CTLFLAG_RDTUN, &spg_len, 0,
     "status page size (bytes)");
 
 /*
  * Congestion drops.
  * -1: no congestion feedback (not recommended).
  *  0: backpressure the channel instead of dropping packets right away.
  *  1: no backpressure, drop packets for the congested queue immediately.
  *
  * Read-only boot-time tunable (CTLFLAG_RDTUN).
  */
 static int cong_drop = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, cong_drop, CTLFLAG_RDTUN, &cong_drop, 0,
     "Congestion control for RX queues (0 = backpressure, 1 = drop)");
 
 /*
  * Deliver multiple frames in the same free list buffer if they fit.
  * -1: let the driver decide whether to enable buffer packing or not.
  *  0: disable buffer packing.
  *  1: enable buffer packing.
  */
 static int buffer_packing = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, buffer_packing, CTLFLAG_RDTUN, &buffer_packing,
     0, "Enable buffer packing");
 
 /*
  * Start next frame in a packed buffer at this boundary.
  * -1: driver should figure out a good value.
  * T4: driver will ignore this and use the same value as fl_pad above.
  * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
  */
 static int fl_pack = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pack, CTLFLAG_RDTUN, &fl_pack, 0,
     "payload pack boundary (bytes)");
 
 /*
  * Largest rx cluster size that the driver is allowed to allocate.
  */
 static int largest_rx_cluster = MJUM16BYTES;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, largest_rx_cluster, CTLFLAG_RDTUN,
     &largest_rx_cluster, 0, "Largest rx cluster (bytes)");
 
 /*
  * Size of cluster allocation that's most likely to succeed.  The driver will
  * fall back to this size if it fails to allocate clusters larger than this.
  */
 static int safest_rx_cluster = PAGE_SIZE;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, safest_rx_cluster, CTLFLAG_RDTUN,
     &safest_rx_cluster, 0, "Safe rx cluster (bytes)");
 
 #ifdef RATELIMIT
 /*
  * Knob to control TCP timestamp rewriting, and the granularity of the tick used
  * for rewriting.  -1 and 0-3 are all valid values.
  * -1: hardware should leave the TCP timestamps alone.
  * 0: 1ms
  * 1: 100us
  * 2: 10us
  * 3: 1us
  */
 static int tsclk = -1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tsclk, CTLFLAG_RDTUN, &tsclk, 0,
     "Control TCP timestamp rewriting when using pacing");
 
 static int eo_max_backlog = 1024 * 1024;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, eo_max_backlog, CTLFLAG_RDTUN, &eo_max_backlog,
     0, "Maximum backlog of ratelimited data per flow");
 #endif
 
 /*
  * The interrupt holdoff timers are multiplied by this value on T6+.
  * 1 and 3-17 (both inclusive) are legal values.
  */
 static int tscale = 1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tscale, CTLFLAG_RDTUN, &tscale, 0,
     "Interrupt holdoff timer scale on T6+");
 
 /*
  * Number of LRO entries in the lro_ctrl structure per rx queue.
  */
 static int lro_entries = TCP_LRO_ENTRIES;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_entries, CTLFLAG_RDTUN, &lro_entries, 0,
     "Number of LRO entries per RX queue");
 
 /*
  * This enables presorting of frames before they're fed into tcp_lro_rx.
  */
 static int lro_mbufs = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0,
     "Enable presorting of LRO frames");
 
 /* Number of mbuf pullups performed; exported read-only via sysctl. */
 static counter_u64_t pullups;
 SYSCTL_COUNTER_U64(_hw_cxgbe, OID_AUTO, pullups, CTLFLAG_RD, &pullups,
     "Number of mbuf pullups performed");
 
 /* Number of mbuf defrags performed; exported read-only via sysctl. */
 static counter_u64_t defrags;
 SYSCTL_COUNTER_U64(_hw_cxgbe, OID_AUTO, defrags, CTLFLAG_RD, &defrags,
     "Number of mbuf defrags performed");
 
 /* Whether tx coalescing is allowed; writable at runtime (CTLFLAG_RWTUN). */
 static int t4_tx_coalesce = 1;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce, CTLFLAG_RWTUN, &t4_tx_coalesce, 0,
     "tx coalescing allowed");
 
 /*
  * The driver will make aggressive attempts at tx coalescing if it sees this
  * many packets eligible for coalescing in quick succession, with no more than
  * the specified gap in between the eth_tx calls that delivered the packets.
  */
 static int t4_tx_coalesce_pkts = 32;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce_pkts, CTLFLAG_RWTUN,
     &t4_tx_coalesce_pkts, 0,
     "# of consecutive packets (1 - 255) that will trigger tx coalescing");
 static int t4_tx_coalesce_gap = 5;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce_gap, CTLFLAG_RWTUN,
     &t4_tx_coalesce_gap, 0, "tx gap (in microseconds)");
 
 static int service_iq(struct sge_iq *, int);
 static int service_iq_fl(struct sge_iq *, int);
 static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
 static int eth_rx(struct adapter *, struct sge_rxq *, const struct iq_desc *,
     u_int);
 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int,
     int, int);
 static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *);
 static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t,
     struct sge_iq *, char *);
 static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *,
     struct sysctl_ctx_list *, struct sysctl_oid *);
 static void free_iq_fl(struct adapter *, struct sge_iq *, struct sge_fl *);
 static void add_iq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
     struct sge_iq *);
 static void add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *,
     struct sysctl_oid *, struct sge_fl *);
 static int alloc_iq_fl_hwq(struct vi_info *, struct sge_iq *, struct sge_fl *);
 static int free_iq_fl_hwq(struct adapter *, struct sge_iq *, struct sge_fl *);
 static int alloc_fwq(struct adapter *);
 static void free_fwq(struct adapter *);
 static int alloc_ctrlq(struct adapter *, int);
 static void free_ctrlq(struct adapter *, int);
 static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int, int);
 static void free_rxq(struct vi_info *, struct sge_rxq *);
 static void add_rxq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
     struct sge_rxq *);
 #ifdef TCP_OFFLOAD
 static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int,
     int);
 static void free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *);
 static void add_ofld_rxq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
     struct sge_ofld_rxq *);
 #endif
 static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
 static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
 #endif
 static int alloc_eq(struct adapter *, struct sge_eq *, struct sysctl_ctx_list *,
     struct sysctl_oid *);
 static void free_eq(struct adapter *, struct sge_eq *);
 static void add_eq_sysctls(struct adapter *, struct sysctl_ctx_list *,
     struct sysctl_oid *, struct sge_eq *);
 static int alloc_eq_hwq(struct adapter *, struct vi_info *, struct sge_eq *);
 static int free_eq_hwq(struct adapter *, struct vi_info *, struct sge_eq *);
 static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *,
     struct sysctl_ctx_list *, struct sysctl_oid *);
 static void free_wrq(struct adapter *, struct sge_wrq *);
 static void add_wrq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
     struct sge_wrq *);
 static int alloc_txq(struct vi_info *, struct sge_txq *, int);
 static void free_txq(struct vi_info *, struct sge_txq *);
 static void add_txq_sysctls(struct vi_info *, struct sysctl_ctx_list *,
     struct sysctl_oid *, struct sge_txq *);
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 static int alloc_ofld_txq(struct vi_info *, struct sge_ofld_txq *, int);
 static void free_ofld_txq(struct vi_info *, struct sge_ofld_txq *);
 static void add_ofld_txq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
     struct sge_ofld_txq *);
 #endif
 static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
 static inline void ring_fl_db(struct adapter *, struct sge_fl *);
 static int refill_fl(struct adapter *, struct sge_fl *, int);
 static void refill_sfl(void *);
 static int find_refill_source(struct adapter *, int, bool);
 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
 
 static inline void get_pkt_gl(struct mbuf *, struct sglist *);
 static inline u_int txpkt_len16(u_int, const u_int);
 static inline u_int txpkt_vm_len16(u_int, const u_int);
 static inline void calculate_mbuf_len16(struct mbuf *, bool);
 static inline u_int txpkts0_len16(u_int);
 static inline u_int txpkts1_len16(void);
 static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int);
 static u_int write_txpkt_wr(struct adapter *, struct sge_txq *, struct mbuf *,
     u_int);
 static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *,
     struct mbuf *);
 static int add_to_txpkts_vf(struct adapter *, struct sge_txq *, struct mbuf *,
     int, bool *);
 static int add_to_txpkts_pf(struct adapter *, struct sge_txq *, struct mbuf *,
     int, bool *);
 static u_int write_txpkts_wr(struct adapter *, struct sge_txq *);
 static u_int write_txpkts_vm_wr(struct adapter *, struct sge_txq *);
 static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
 static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
 static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
 static inline uint16_t read_hw_cidx(struct sge_eq *);
 static inline u_int reclaimable_tx_desc(struct sge_eq *);
 static inline u_int total_available_tx_desc(struct sge_eq *);
 static u_int reclaim_tx_descs(struct sge_txq *, u_int);
 static void tx_reclaim(void *, int);
 static __be64 get_flit(struct sglist_seg *, int, int);
 static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *);
 static void wrq_tx_drain(void *, int);
 static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *);
 
 static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
 #ifdef RATELIMIT
 #if defined(INET) || defined(INET6)
 static inline u_int txpkt_eo_len16(u_int, u_int, u_int);
 #endif
 static int ethofld_fw4_ack(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 #endif
 
 /* Counters; allocated and zeroed in t4_sge_modload(). */
 static counter_u64_t extfree_refs;
 static counter_u64_t extfree_rels;
 
 /*
  * Handler dispatch tables, filled in by the t4_register_*_handler()
  * functions in this file.  t4_fw_msg_handler is indexed by firmware message
  * type and t4_cpl_handler by CPL opcode.  The remaining tables are indexed by
  * CPL cookie and hold the handlers for opcodes that are shared between
  * several consumers.
  */
 an_handler_t t4_an_handler;
 fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES];
 cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS];
 cpl_handler_t set_tcb_rpl_handlers[NUM_CPL_COOKIES];
 cpl_handler_t l2t_write_rpl_handlers[NUM_CPL_COOKIES];
 cpl_handler_t act_open_rpl_handlers[NUM_CPL_COOKIES];
 cpl_handler_t abort_rpl_rss_handlers[NUM_CPL_COOKIES];
 cpl_handler_t fw4_ack_handlers[NUM_CPL_COOKIES];
 
 /*
  * Install (h != NULL) or clear (h == NULL) t4_an_handler.  The pointer is
  * published with a release store.  Replacing one non-NULL handler with
  * another is asserted against.
  */
 void
 t4_register_an_handler(an_handler_t h)
 {
 
 	MPASS(h == NULL || t4_an_handler == NULL);
 
 	atomic_store_rel_ptr((uintptr_t *)&t4_an_handler, (uintptr_t)h);
 }
 
 /*
  * Install (h != NULL) or clear (h == NULL) the handler for firmware message
  * type "type".  The pointer is published with a release store.
  */
 void
 t4_register_fw_msg_handler(int type, fw_msg_handler_t h)
 {
 	fw_msg_handler_t *slot;
 
 	MPASS(type < nitems(t4_fw_msg_handler));
 	slot = &t4_fw_msg_handler[type];
 	MPASS(h == NULL || *slot == NULL);
 	/*
 	 * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL
 	 * handler dispatch table.  Reject any attempt to install a handler for
 	 * this subtype.
 	 */
 	MPASS(type != FW_TYPE_RSSCPL);
 	MPASS(type != FW6_TYPE_RSSCPL);
 
 	atomic_store_rel_ptr((uintptr_t *)slot, (uintptr_t)h);
 }
 
 /*
  * Install (h != NULL) or clear (h == NULL) the handler for CPL opcode
  * "opcode".  The pointer is published with a release store.
  */
 void
 t4_register_cpl_handler(int opcode, cpl_handler_t h)
 {
 	cpl_handler_t *slot;
 
 	MPASS(opcode < nitems(t4_cpl_handler));
 	slot = &t4_cpl_handler[opcode];
 	MPASS(h == NULL || *slot == NULL);
 
 	atomic_store_rel_ptr((uintptr_t *)slot, (uintptr_t)h);
 }
 
 /*
  * CPL_SET_TCB_RPL demultiplexer: work out which cookie the reply belongs to
  * and hand it to the handler registered for that cookie.
  */
 static int
 set_tcb_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
 	struct adapter *sc = iq->adapter;
 	u_int tid;
 	int cookie;
 
 	MPASS(m == NULL);
 
 	/*
 	 * The return code for filter-write is put in the CPL cookie so we have
 	 * to rely on the hardware tid (is_ftid) to determine that this is a
 	 * response to a filter.
 	 */
 	tid = GET_TID(cpl);
 	if (is_hpftid(sc, tid) || is_ftid(sc, tid))
 		cookie = CPL_COOKIE_FILTER;
 	else
 		cookie = G_COOKIE(cpl->cookie);
 
 	MPASS(cookie > CPL_COOKIE_RESERVED);
 	MPASS(cookie < nitems(set_tcb_rpl_handlers));
 
 	return (set_tcb_rpl_handlers[cookie](iq, rss, m));
 }
 
 /*
  * CPL_L2T_WRITE_RPL demultiplexer: replies whose tid has F_SYNC_WR set go to
  * the CPL_COOKIE_TOM handler, all others to the CPL_COOKIE_FILTER handler.
  */
 static int
 l2t_write_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1);
 	unsigned int cookie;
 
 	MPASS(m == NULL);
 
 	if (GET_TID(rpl) & F_SYNC_WR)
 		cookie = CPL_COOKIE_TOM;
 	else
 		cookie = CPL_COOKIE_FILTER;
 
 	return (l2t_write_rpl_handlers[cookie](iq, rss, m));
 }
 
 /*
  * CPL_ACT_OPEN_RPL demultiplexer: the cookie is carried in the atid of the
  * reply and selects the handler to call.
  */
 static int
 act_open_rpl_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1);
 	u_int atid, cookie;
 
 	atid = G_AOPEN_ATID(be32toh(cpl->atid_status));
 	cookie = G_TID_COOKIE(atid);
 
 	MPASS(m == NULL);
 	MPASS(cookie != CPL_COOKIE_RESERVED);
 
 	return (act_open_rpl_handlers[cookie](iq, rss, m));
 }
 
 /*
  * CPL_ABORT_RPL_RSS demultiplexer: uses the CPL_COOKIE_HASHFILTER handler
  * when is_hashfilter(sc) is true and the CPL_COOKIE_TOM handler otherwise.
  */
 static int
 abort_rpl_rss_handler(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	u_int cookie;
 
 	MPASS(m == NULL);
 
 	cookie = is_hashfilter(iq->adapter) ? CPL_COOKIE_HASHFILTER :
 	    CPL_COOKIE_TOM;
 
 	return (abort_rpl_rss_handlers[cookie](iq, rss, m));
 }
 
 /*
  * CPL_FW4_ACK demultiplexer: acks for an etid go to the CPL_COOKIE_ETHOFLD
  * handler and everything else to the CPL_COOKIE_TOM handler.
  */
 static int
 fw4_ack_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	unsigned int tid;
 	u_int cookie;
 
 	tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 
 	MPASS(m == NULL);
 
 	cookie = is_etid(iq->adapter, tid) ? CPL_COOKIE_ETHOFLD :
 	    CPL_COOKIE_TOM;
 
 	return (fw4_ack_handlers[cookie](iq, rss, m));
 }
 
 static void
 t4_init_shared_cpl_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_SET_TCB_RPL, set_tcb_rpl_handler);
 	t4_register_cpl_handler(CPL_L2T_WRITE_RPL, l2t_write_rpl_handler);
 	t4_register_cpl_handler(CPL_ACT_OPEN_RPL, act_open_rpl_handler);
 	t4_register_cpl_handler(CPL_ABORT_RPL_RSS, abort_rpl_rss_handler);
 	t4_register_cpl_handler(CPL_FW4_ACK, fw4_ack_handler);
 }
 
 /*
  * Install (h != NULL) or clear (h == NULL) the per-cookie handler for one of
  * the shared CPL opcodes.  The opcode's demultiplexing handler must already
  * be registered in t4_cpl_handler.  Opcodes that are not shared trip an
  * assertion and are otherwise ignored.
  */
 void
 t4_register_shared_cpl_handler(int opcode, cpl_handler_t h, int cookie)
 {
 	cpl_handler_t *table;
 	uintptr_t *slot;
 
 	MPASS(opcode < nitems(t4_cpl_handler));
 	MPASS(cookie > CPL_COOKIE_RESERVED);
 	MPASS(cookie < NUM_CPL_COOKIES);
 	MPASS(t4_cpl_handler[opcode] != NULL);
 
 	/* Pick the per-cookie table that belongs to this opcode. */
 	switch (opcode) {
 	case CPL_SET_TCB_RPL:
 		table = set_tcb_rpl_handlers;
 		break;
 	case CPL_L2T_WRITE_RPL:
 		table = l2t_write_rpl_handlers;
 		break;
 	case CPL_ACT_OPEN_RPL:
 		table = act_open_rpl_handlers;
 		break;
 	case CPL_ABORT_RPL_RSS:
 		table = abort_rpl_rss_handlers;
 		break;
 	case CPL_FW4_ACK:
 		table = fw4_ack_handlers;
 		break;
 	default:
 		MPASS(0);
 		return;
 	}
 
 	slot = (uintptr_t *)&table[cookie];
 	MPASS(h == NULL || *slot == (uintptr_t)NULL);
 	atomic_store_rel_ptr(slot, (uintptr_t)h);
 }
 
 /*
  * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
  */
 void
 t4_sge_modload(void)
 {
 
 	if (fl_pktshift < 0 || fl_pktshift > 7) {
 		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
 		    " using 0 instead.\n", fl_pktshift);
 		fl_pktshift = 0;
 	}
 
 	if (spg_len != 64 && spg_len != 128) {
 		int len;
 
 #if defined(__i386__) || defined(__amd64__)
 		len = cpu_clflush_line_size > 64 ? 128 : 64;
 #else
 		len = 64;
 #endif
 		if (spg_len != -1) {
 			printf("Invalid hw.cxgbe.spg_len value (%d),"
 			    " using %d instead.\n", spg_len, len);
 		}
 		spg_len = len;
 	}
 
 	if (cong_drop < -1 || cong_drop > 1) {
 		printf("Invalid hw.cxgbe.cong_drop value (%d),"
 		    " using 0 instead.\n", cong_drop);
 		cong_drop = 0;
 	}
 
 	if (tscale != 1 && (tscale < 3 || tscale > 17)) {
 		printf("Invalid hw.cxgbe.tscale value (%d),"
 		    " using 1 instead.\n", tscale);
 		tscale = 1;
 	}
 
 	if (largest_rx_cluster != MCLBYTES &&
 #if MJUMPAGESIZE != MCLBYTES
 	    largest_rx_cluster != MJUMPAGESIZE &&
 #endif
 	    largest_rx_cluster != MJUM9BYTES &&
 	    largest_rx_cluster != MJUM16BYTES) {
 		printf("Invalid hw.cxgbe.largest_rx_cluster value (%d),"
 		    " using %d instead.\n", largest_rx_cluster, MJUM16BYTES);
 		largest_rx_cluster = MJUM16BYTES;
 	}
 
 	if (safest_rx_cluster != MCLBYTES &&
 #if MJUMPAGESIZE != MCLBYTES
 	    safest_rx_cluster != MJUMPAGESIZE &&
 #endif
 	    safest_rx_cluster != MJUM9BYTES &&
 	    safest_rx_cluster != MJUM16BYTES) {
 		printf("Invalid hw.cxgbe.safest_rx_cluster value (%d),"
 		    " using %d instead.\n", safest_rx_cluster, MJUMPAGESIZE);
 		safest_rx_cluster = MJUMPAGESIZE;
 	}
 
 	extfree_refs = counter_u64_alloc(M_WAITOK);
 	extfree_rels = counter_u64_alloc(M_WAITOK);
 	pullups = counter_u64_alloc(M_WAITOK);
 	defrags = counter_u64_alloc(M_WAITOK);
 	counter_u64_zero(extfree_refs);
 	counter_u64_zero(extfree_rels);
 	counter_u64_zero(pullups);
 	counter_u64_zero(defrags);
 
 	t4_init_shared_cpl_handlers();
 	t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg);
 	t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg);
 	t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
 #ifdef RATELIMIT
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, ethofld_fw4_ack,
 	    CPL_COOKIE_ETHOFLD);
 #endif
 	t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
 	t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl);
 }
 
 void
 t4_sge_modunload(void)
 {
 
 	counter_u64_free(extfree_refs);
 	counter_u64_free(extfree_rels);
 	counter_u64_free(pullups);
 	counter_u64_free(defrags);
 }
 
 /*
  * Returns the difference between the extfree_refs and extfree_rels counters,
  * i.e. the number of references taken that have not been released yet.
  */
 uint64_t
 t4_sge_extfree_refs(void)
 {
 	/* The release counter is sampled first, then the reference counter. */
 	const uint64_t released = counter_u64_fetch(extfree_rels);
 	const uint64_t referenced = counter_u64_fetch(extfree_refs);
 
 	return (referenced - released);
 }
 
 /* max 4096 */
 #define MAX_PACK_BOUNDARY 512
 
 /*
  * Program the ingress pad boundary in SGE_CONTROL and, on chips other than
  * T4, the ingress pack boundary in SGE_CONTROL2.  The values come from the
  * hw.cxgbe.fl_pad and hw.cxgbe.fl_pack tunables; an unusable tunable is
  * replaced with a computed default.
  */
 static inline void
 setup_pad_and_pack_boundaries(struct adapter *sc)
 {
 	const int pad_shift = chip_id(sc) > CHELSIO_T5 ?
 	    X_T6_INGPADBOUNDARY_SHIFT : X_INGPADBOUNDARY_SHIFT;
 	const int min_pad = 1 << pad_shift;
 	const int max_pad = 1 << (pad_shift + M_INGPADBOUNDARY);
 	uint32_t mask, val;
 	int pad, pack;
 
 	if (fl_pad >= min_pad && fl_pad <= max_pad && powerof2(fl_pad)) {
 		pad = fl_pad;
 	} else {
 		/*
 		 * A T4 that might use buffer packing gets 64 as its pad/pack
 		 * boundary; everything else gets the smallest boundary the
 		 * chip allows.
 		 */
 		if (is_t4(sc) && buffer_packing)
 			pad = 64;
 		else
 			pad = min_pad;
 
 		/*
 		 * fl_pad = 0 is not an error: a sane value is still written to
 		 * the register and the freelists opt out of padding.  Only a
 		 * positive but invalid request is worth a complaint.
 		 */
 		if (fl_pad > 0) {
 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
 			    " (%d), using %d instead.\n", fl_pad, pad);
 		}
 	}
 	mask = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
 	val = V_INGPADBOUNDARY(ilog2(pad) - pad_shift);
 	t4_set_reg_field(sc, A_SGE_CONTROL, mask, val);
 
 	if (is_t4(sc)) {
 		/* No separate pack boundary on T4; note an ignored request. */
 		if (fl_pack != -1 && fl_pack != pad) {
 			/* Complain but carry on. */
 			device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored,"
 			    " using %d instead.\n", fl_pack, pad);
 		}
 		return;
 	}
 
 	if (fl_pack >= 16 && fl_pack != 32 && fl_pack <= 4096 &&
 	    powerof2(fl_pack)) {
 		pack = fl_pack;
 	} else {
 		/* Default: derived from the PCIe MPS, capped and sanitized. */
 		if (sc->params.pci.mps > MAX_PACK_BOUNDARY)
 			pack = MAX_PACK_BOUNDARY;
 		else
 			pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
 		MPASS(powerof2(pack));
 		if (pack < 16)
 			pack = 16;
 		else if (pack == 32)
 			pack = 64;
 		else if (pack > 4096)
 			pack = 4096;
 
 		/* -1 asks for the default, so it is not reported. */
 		if (fl_pack != -1) {
 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
 			    " (%d), using %d instead.\n", fl_pack, pack);
 		}
 	}
 
 	/* 16 is encoded as 0; every other value as log2(pack) - 5. */
 	mask = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
 	val = pack == 16 ? V_INGPACKBOUNDARY(0) :
 	    V_INGPACKBOUNDARY(ilog2(pack) - 5);
 
 	MPASS(!is_t4(sc));	/* T4 doesn't have SGE_CONTROL2 */
 	t4_set_reg_field(sc, A_SGE_CONTROL2, mask, val);
 }
 
 /*
  * Write the driver's preferred global settings to the chip: SGE control bits,
  * pad/pack boundaries, host page size, freelist buffer sizes, interrupt
  * holdoff packet counts and timers, and the ULP/TP settings used for DDP.
  * Asserts that this function is the master PF (MASTER_PF).
  *
  * adap->params.vpd.cclk must be set up before this is called.
  */
 void
 t4_tweak_chip_settings(struct adapter *sc)
 {
 	int i, reg;
 	uint32_t v, m;
 	/* Desired interrupt holdoff timers, in increasing order. */
 	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
 	/* Largest holdoff time that fits in a timer field, derived from cclk. */
 	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
 	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
 	static int sw_buf_sizes[] = {
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
 #endif
 		MJUM9BYTES,
 		MJUM16BYTES
 	};
 
 	KASSERT(sc->flags & MASTER_PF,
 	    ("%s: trying to change chip settings when not master.", __func__));
 
 	/* Packet shift, RXPKTCPLMODE, and the egress status page size. */
 	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
 	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
 	    V_EGRSTATUSPAGESIZE(spg_len == 128);
 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
 
 	setup_pad_and_pack_boundaries(sc);
 
 	/* The same host page size (PAGE_SHIFT - 10) for all eight PFs. */
 	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
 	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
 
 	/*
 	 * Freelist buffer sizes.  Registers 0 and 1 get fixed values; after
 	 * that each software cluster size takes two consecutive registers:
 	 * the full size, then the size minus CL_METADATA_SIZE.
 	 */
 	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0, 4096);
 	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE1, 65536);
 	reg = A_SGE_FL_BUFFER_SIZE2;
 	for (i = 0; i < nitems(sw_buf_sizes); i++) {
 		MPASS(reg <= A_SGE_FL_BUFFER_SIZE15);
 		t4_write_reg(sc, reg, sw_buf_sizes[i]);
 		reg += 4;
 		MPASS(reg <= A_SGE_FL_BUFFER_SIZE15);
 		t4_write_reg(sc, reg, sw_buf_sizes[i] - CL_METADATA_SIZE);
 		reg += 4;
 	}
 
 	/* Interrupt holdoff packet-count thresholds. */
 	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
 	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
 	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);
 
 	/*
 	 * Fit the holdoff timers under timer_max.  A timer that is too large
 	 * is repeatedly averaged with its predecessor until it fits; the last
 	 * timer is simply clamped to timer_max.
 	 */
 	KASSERT(intr_timer[0] <= timer_max,
 	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
 	    timer_max));
 	for (i = 1; i < nitems(intr_timer); i++) {
 		KASSERT(intr_timer[i] >= intr_timer[i - 1],
 		    ("%s: timers not listed in increasing order (%d)",
 		    __func__, i));
 
 		while (intr_timer[i] > timer_max) {
 			if (i == nitems(intr_timer) - 1) {
 				intr_timer[i] = timer_max;
 				break;
 			}
 			intr_timer[i] += intr_timer[i - 1];
 			intr_timer[i] /= 2;
 		}
 	}
 
 	/* Two timers per register, converted with us_to_core_ticks(). */
 	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
 	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
 	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
 	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
 	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
 	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);
 
 	if (chip_id(sc) >= CHELSIO_T6) {
 		/* tscale of 1 is written as 0, anything else as tscale - 2. */
 		m = V_TSCALE(M_TSCALE);
 		if (tscale == 1)
 			v = 0;
 		else
 			v = V_TSCALE(tscale - 2);
 		t4_set_reg_field(sc, A_SGE_ITP_CONTROL, m, v);
 
 		/* Debug knob: read-modify-write of TP_CMM_CONFIG thresholds. */
 		if (sc->debug_flags & DF_DISABLE_TCB_CACHE) {
 			m = V_RDTHRESHOLD(M_RDTHRESHOLD) | F_WRTHRTHRESHEN |
 			    V_WRTHRTHRESH(M_WRTHRTHRESH);
 			t4_tp_pio_read(sc, &v, 1, A_TP_CMM_CONFIG, 1);
 			v &= ~m;
 			v |= V_RDTHRESHOLD(1) | F_WRTHRTHRESHEN |
 			    V_WRTHRTHRESH(16);
 			t4_tp_pio_write(sc, &v, 1, A_TP_CMM_CONFIG, 1);
 		}
 	}
 
 	/* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */
 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
 	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);
 
 	/*
 	 * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP.  These have been
 	 * chosen with MAXPHYS = 128K in mind.  The largest DDP buffer that we
 	 * may have to deal with is MAXPHYS + 1 page.
 	 */
 	v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4);
 	t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v);
 
 	/* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */
 	m = v = F_TDDPTAGTCB | F_ISCSITAGTCB;
 	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);
 
 	/* The same TP_PARA_REG5 value is checked in t4_verify_chip_settings. */
 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
 	    F_RESETDDPOFFSET;
 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
 	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
 }
 
 /*
  * SGE wants the buffer to be at least 64B and then a multiple of 16.  Its
  * address must be 16B aligned.  If padding is in use the buffer's start and
  * end need to be aligned to the pad boundary as well.  Only the size is
  * checked here (at least 64 and a multiple of the pad boundary, or of 16 when
  * fl_pad is 0); it is up to the buffer allocation code to make sure the start
  * of the buffer is aligned.
  *
  * Returns nonzero if hwsz is acceptable, 0 otherwise.
  */
 static inline int
 hwsz_ok(struct adapter *sc, int hwsz)
 {
 	int align;
 
 	if (fl_pad != 0)
 		align = sc->params.sge.pad_boundary;
 	else
 		align = 16;
 
 	if (hwsz < 64)
 		return (0);
 
 	return ((hwsz & (align - 1)) == 0);
 }
 
 /*
  * Initialize the rx buffer sizes and figure out which zones the buffers will
  * be allocated from.
  *
  * For every software cluster size this fills in sc->sge.rx_buf_info[]: the
  * zone and type for the size, the index of the hardware buffer size that
  * matches it exactly (hwidx1, -1 if none), and the index and size of the
  * hardware buffer size to use with buffer packing (hwidx2/size2, -1/0 if
  * none).  BUF_PACKING_OK is set in sc->flags if any size has a packing
  * candidate, and s->safe_zidx records the entry equal to safest_rx_cluster
  * (-1 if none).
  */
 void
 t4_init_rx_buf_info(struct adapter *sc)
 {
 	struct sge *s = &sc->sge;
 	struct sge_params *sp = &sc->params.sge;
 	int i, j, n;
 	static int sw_buf_sizes[] = {	/* Sorted by size */
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
 #endif
 		MJUM9BYTES,
 		MJUM16BYTES
 	};
 	struct rx_buf_info *rxb;
 
 	s->safe_zidx = -1;
 	rxb = &s->rx_buf_info[0];
 	/* assumes SW_ZONE_SIZES == nitems(sw_buf_sizes) -- TODO confirm */
 	for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) {
 		rxb->size1 = sw_buf_sizes[i];
 		rxb->zone = m_getzone(rxb->size1);
 		rxb->type = m_gettype(rxb->size1);
 		rxb->size2 = 0;
 		rxb->hwidx1 = -1;
 		rxb->hwidx2 = -1;
 		for (j = 0; j < SGE_FLBUF_SIZES; j++) {
 			int hwsize = sp->sge_fl_buffer_size[j];
 
 			/* Ignore hardware sizes the SGE rules don't allow. */
 			if (!hwsz_ok(sc, hwsize))
 				continue;
 
 			/* hwidx for size1 */
 			if (rxb->hwidx1 == -1 && rxb->size1 == hwsize)
 				rxb->hwidx1 = j;
 
 			/* hwidx for size2 (buffer packing) */
 			/* The cluster must hold hwsize plus the metadata. */
 			if (rxb->size1 - CL_METADATA_SIZE < hwsize)
 				continue;
 			/* n: bytes left over in the cluster with this hwsize. */
 			n = rxb->size1 - hwsize - CL_METADATA_SIZE;
 			if (n == 0) {
 				rxb->hwidx2 = j;
 				rxb->size2 = hwsize;
 				break;	/* stop looking */
 			}
 			if (rxb->hwidx2 != -1) {
 				/*
 				 * NOTE(review): the candidate's slack is
 				 * compared against (current size2 - hwsize -
 				 * CL_METADATA_SIZE), not against the current
 				 * choice's own slack (size1 - size2 -
 				 * CL_METADATA_SIZE).  Confirm this is the
 				 * intended "better fit" test.
 				 */
 				if (n < sp->sge_fl_buffer_size[rxb->hwidx2] -
 				    hwsize - CL_METADATA_SIZE) {
 					rxb->hwidx2 = j;
 					rxb->size2 = hwsize;
 				}
 			} else if (n <= 2 * CL_METADATA_SIZE) {
 				/* First candidate: tolerate only a small slack. */
 				rxb->hwidx2 = j;
 				rxb->size2 = hwsize;
 			}
 		}
 		if (rxb->hwidx2 != -1)
 			sc->flags |= BUF_PACKING_OK;
 		if (s->safe_zidx == -1 && rxb->size1 == safest_rx_cluster)
 			s->safe_zidx = i;
 	}
 }
 
 /*
  * Verify some basic SGE settings for the PF and VF driver, and other
  * miscellaneous settings for the PF driver.
  *
  * Every mismatch is reported with device_printf.  Returns 0 if the settings
  * are acceptable, EINVAL otherwise.  The DDP related checks (ULP_RX_TDDP_PSZ,
  * ULP_RX_CTL, TP_PARA_REG5) fail the verification only if
  * sc->vres.ddp.size is nonzero.
  */
 int
 t4_verify_chip_settings(struct adapter *sc)
 {
 	struct sge_params *sp = &sc->params.sge;
 	uint32_t m, v, r;
 	int rc = 0;
 	const uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
 
 	m = F_RXPKTCPLMODE;
 	v = F_RXPKTCPLMODE;
 	r = sp->sge_control;
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	/*
 	 * If this changes then every single use of PAGE_SHIFT in the driver
 	 * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift.
 	 */
 	if (sp->page_shift != PAGE_SHIFT) {
 		/*
 		 * Report the value that was actually checked.  'r' still holds
 		 * the SGE_CONTROL value at this point and must not be printed
 		 * as if it were SGE_HOST_PAGE_SIZE.
 		 */
 		device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE "
 		    "(page shift %d, expected %d)\n", (int)sp->page_shift,
 		    (int)PAGE_SHIFT);
 		rc = EINVAL;
 	}
 
 	/*
 	 * The remaining checks are for the PF driver only.
 	 * NOTE(review): this returns 0 even if one of the checks above set rc
 	 * to EINVAL, so a VF reports the problem but never fails.  Confirm
 	 * that this is intentional before changing it to return rc.
 	 */
 	if (sc->flags & IS_VF)
 		return (0);
 
 	/* Must match what t4_tweak_chip_settings writes for TDDP. */
 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
 	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
 	if (r != v) {
 		device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
 		if (sc->vres.ddp.size != 0)
 			rc = EINVAL;
 	}
 
 	m = v = F_TDDPTAGTCB;
 	r = t4_read_reg(sc, A_ULP_RX_CTL);
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
 		if (sc->vres.ddp.size != 0)
 			rc = EINVAL;
 	}
 
 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
 	    F_RESETDDPOFFSET;
 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
 	r = t4_read_reg(sc, A_TP_PARA_REG5);
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
 		if (sc->vres.ddp.size != 0)
 			rc = EINVAL;
 	}
 
 	return (rc);
 }
 
 int
 t4_create_dma_tag(struct adapter *sc)
 {
 	int rc;
 
 	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
 	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
 	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
 	    NULL, &sc->dmat);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create main DMA tag: %d\n", rc);
 	}
 
 	return (rc);
 }
 
 /*
  * Add the read-only SGE sysctls to 'children': the freelist buffer size
  * list (served by sysctl_bufsizes) and the current values of fl_pktshift,
  * fl_pad, spg_len, cong_drop, and fl_pack.  The integer nodes export the
  * value at the time of this call.
  */
 void
 t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
     struct sysctl_oid_list *children)
 {
 	struct sge_params *params = &sc->params.sge;
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	    sc, 0, sysctl_bufsizes, "A",
 	    "freelist buffer sizes");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift",
 	    CTLFLAG_RD, NULL, params->fl_pktshift,
 	    "payload DMA offset in rx buffer (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad",
 	    CTLFLAG_RD, NULL, params->pad_boundary,
 	    "payload pad boundary (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len",
 	    CTLFLAG_RD, NULL, params->spg_len,
 	    "status page size (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop",
 	    CTLFLAG_RD, NULL, cong_drop,
 	    "congestion drop setting");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack",
 	    CTLFLAG_RD, NULL, params->pack_boundary,
 	    "payload pack boundary (bytes)");
 }
 
 int
 t4_destroy_dma_tag(struct adapter *sc)
 {
 	if (sc->dmat)
 		bus_dma_tag_destroy(sc->dmat);
 
 	return (0);
 }
 
 /*
  * Allocate and initialize the firmware event queue, control queues, and special
  * purpose rx queues owned by the adapter.
  *
  * Returns errno on failure.  Resources allocated up to that point may still be
  * allocated.  Caller is responsible for cleanup in case this function fails.
  */
 int
 t4_setup_adapter_queues(struct adapter *sc)
 {
 	int rc, i;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	/*
 	 * Firmware event queue
 	 */
 	rc = alloc_fwq(sc);
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * That's all for the VF driver.
 	 */
 	if (sc->flags & IS_VF)
 		return (rc);
 
 	/*
 	 * XXX: General purpose rx queues, one per port.
 	 */
 
 	/*
 	 * Control queues, one per port.
 	 */
 	for_each_port(sc, i) {
 		rc = alloc_ctrlq(sc, i);
 		if (rc != 0)
 			return (rc);
 	}
 
 	return (rc);
 }
 
 /*
  * Free the adapter's control queues (PF driver only) and then its firmware
  * event queue.  Idempotent.  Always returns 0.
  */
 int
 t4_teardown_adapter_queues(struct adapter *sc)
 {
 	int port;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	/* The VF driver never allocates control queues. */
 	if ((sc->flags & IS_VF) == 0) {
 		for_each_port(sc, port)
 			free_ctrlq(sc, port);
 	}
 
 	free_fwq(sc);
 
 	return (0);
 }
 
 /*
  * Maximum payload that could arrive with a single iq descriptor.
  *
  * The starting point is the interface MTU plus the Ethernet and VLAN headers
  * and the rx packet shift.  For an offload queue on an adapter with TLS
  * enabled and the TLSKEYS crypto capability, the result is raised to the
  * TP's max_rx_pdu if that is larger.
  */
 static inline int
 max_rx_payload(struct adapter *sc, struct ifnet *ifp, const bool ofld)
 {
 	int maxp;
 
 	/* large enough even when hw VLAN extraction is disabled */
 	maxp = sc->params.sge.fl_pktshift + ETHER_HDR_LEN +
 	    ETHER_VLAN_ENCAP_LEN + ifp->if_mtu;
 
 	if (!ofld || !sc->tt.tls ||
 	    !(sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS))
 		return (maxp);
 
 	if (maxp < sc->params.tp.max_rx_pdu)
 		maxp = sc->params.tp.max_rx_pdu;
 
 	return (maxp);
 }
 
 /*
  * Allocate all the queues that belong to a virtual interface: netmap rx/tx
  * queues (if the interface is netmap capable), NIC rx queues, offload rx
  * queues, NIC tx queues, and offload tx queues, in that order.
  *
  * Returns 0 on success.  On failure everything allocated so far is released
  * with t4_teardown_vi_queues and the errno of the failed allocation is
  * returned.
  */
 int
 t4_setup_vi_queues(struct vi_info *vi)
 {
 	int rc = 0, i, intr_idx;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	struct sge_ofld_txq *ofld_txq;
 #endif
 #ifdef DEV_NETMAP
 	int saved_idx, iqidx;
 	struct sge_nm_rxq *nm_rxq;
 	struct sge_nm_txq *nm_txq;
 #endif
 	struct adapter *sc = vi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	int maxp;
 
 	/* Interrupt vector to start from (when using multiple vectors) */
 	intr_idx = vi->first_intr;
 
 #ifdef DEV_NETMAP
 	saved_idx = intr_idx;
 	if (ifp->if_capabilities & IFCAP_NETMAP) {
 
 		/* netmap is supported with direct interrupts only. */
 		MPASS(!forwarding_intr_to_fwq(sc));
 		MPASS(vi->first_intr >= 0);
 
 		/*
 		 * We don't have buffers to back the netmap rx queues
 		 * right now so we create the queues in a way that
 		 * doesn't set off any congestion signal in the chip.
 		 */
 		for_each_nm_rxq(vi, i, nm_rxq) {
 			rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i);
 			if (rc != 0)
 				goto done;
 			intr_idx++;
 		}
 
 		/* Each netmap tx queue is paired with a netmap rx queue. */
 		for_each_nm_txq(vi, i, nm_txq) {
 			iqidx = vi->first_nm_rxq + (i % vi->nnmrxq);
 			rc = alloc_nm_txq(vi, nm_txq, iqidx, i);
 			if (rc != 0)
 				goto done;
 		}
 	}
 
 	/* Normal rx queues and netmap rx queues share the same interrupts. */
 	intr_idx = saved_idx;
 #endif
 
 	/*
 	 * Allocate rx queues first because a default iqid is required when
 	 * creating a tx queue.
 	 */
 	maxp = max_rx_payload(sc, ifp, false);
 	for_each_rxq(vi, i, rxq) {
 		rc = alloc_rxq(vi, rxq, i, intr_idx, maxp);
 		if (rc != 0)
 			goto done;
 		/* A vector is consumed only when interrupts are direct. */
 		if (!forwarding_intr_to_fwq(sc))
 			intr_idx++;
 	}
 #ifdef DEV_NETMAP
 	/* Skip past the vectors used by the NIC and netmap rx queues. */
 	if (ifp->if_capabilities & IFCAP_NETMAP)
 		intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq);
 #endif
 #ifdef TCP_OFFLOAD
 	/* Offload rx queues may need room for larger payloads. */
 	maxp = max_rx_payload(sc, ifp, true);
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		rc = alloc_ofld_rxq(vi, ofld_rxq, i, intr_idx, maxp);
 		if (rc != 0)
 			goto done;
 		if (!forwarding_intr_to_fwq(sc))
 			intr_idx++;
 	}
 #endif
 
 	/*
 	 * Now the tx queues.
 	 */
 	for_each_txq(vi, i, txq) {
 		rc = alloc_txq(vi, txq, i);
 		if (rc != 0)
 			goto done;
 	}
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	for_each_ofld_txq(vi, i, ofld_txq) {
 		rc = alloc_ofld_txq(vi, ofld_txq, i);
 		if (rc != 0)
 			goto done;
 	}
 #endif
 done:
 	/* Undo any partial setup before reporting the failure. */
 	if (rc)
 		t4_teardown_vi_queues(vi);
 
 	return (rc);
 }
 
 /*
  * Free all the queues that belong to a virtual interface.  This is also the
  * cleanup path of t4_setup_vi_queues.  Idempotent.  Always returns 0.
  */
 int
 t4_teardown_vi_queues(struct vi_info *vi)
 {
 	struct sge_txq *txq;
 	struct sge_rxq *rxq;
 #ifdef DEV_NETMAP
 	struct sge_nm_txq *nm_txq;
 	struct sge_nm_rxq *nm_rxq;
 #endif
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	struct sge_ofld_txq *ofld_txq;
 #endif
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 	int i;
 
 #ifdef DEV_NETMAP
 	/* netmap queues: tx first, then rx. */
 	if ((vi->ifp->if_capabilities & IFCAP_NETMAP) != 0) {
 		for_each_nm_txq(vi, i, nm_txq) {
 			free_nm_txq(vi, nm_txq);
 		}
 		for_each_nm_rxq(vi, i, nm_rxq) {
 			free_nm_rxq(vi, nm_rxq);
 		}
 	}
 #endif
 
 	/*
 	 * The tx queues reference the rx queues (for egress updates, etc.) so
 	 * all of them go away before any rx queue does.
 	 */
 	for_each_txq(vi, i, txq) {
 		free_txq(vi, txq);
 	}
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	for_each_ofld_txq(vi, i, ofld_txq) {
 		free_ofld_txq(vi, ofld_txq);
 	}
 #endif
 
 	/* Nothing refers to the rx queues any more. */
 	for_each_rxq(vi, i, rxq) {
 		free_rxq(vi, rxq);
 	}
 #ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		free_ofld_rxq(vi, ofld_rxq);
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Interrupt handler for the (very unusual) case where the driver has exactly
  * one interrupt.  'arg' is the adapter.
  *
  * a) Deals with errors, if any.
  * b) Services the firmware event queue, which is taking interrupts for all
  *    other queues.
  */
 void
 t4_intr_all(void *arg)
 {
 	struct adapter *sc = arg;
 
 	MPASS(sc->intr_count == 1);
 
 	/* INTx only: write 0 to PCIE_PF_CLI before handling anything. */
 	if (sc->intr_type == INTR_INTX) {
 		t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
 	}
 
 	t4_intr_err(sc);
 	t4_intr_evt(&sc->sge.fwq);
 }
 
 /*
  * Error interrupt handler.  Installed directly when the driver uses multiple
  * interrupts, and called from t4_intr_all when it uses only one.  'arg' is
  * the adapter.  Does nothing if the adapter is already marked ADAP_ERR.
  */
 void
 t4_intr_err(void *arg)
 {
 	struct adapter *sc = arg;
 	const bool verbose = (sc->debug_flags & DF_VERBOSE_SLOWINTR) != 0;
 	uint32_t cause;
 
 	if ((sc->flags & ADAP_ERR) != 0)
 		return;
 
 	/* Count a software interrupt and write the cause back. */
 	cause = t4_read_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE));
 	if ((cause & F_PFSW) != 0) {
 		sc->swintr++;
 		t4_write_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE), cause);
 	}
 
 	t4_slow_intr_handler(sc, verbose);
 }
 
 /*
  * Interrupt handler for iq-only queues ('arg' is the sge_iq).  The firmware
  * event queue is the only such queue right now.
  */
 void
 t4_intr_evt(void *arg)
 {
 	struct sge_iq *const iq = arg;
 
 	/* Service the queue only if it could be moved from IDLE to BUSY. */
 	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY) == 0)
 		return;
 
 	service_iq(iq, 0);
 	(void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
 }
 
 /*
  * Interrupt handler for iq+fl queues ('arg' is the sge_iq).
  */
 void
 t4_intr(void *arg)
 {
 	struct sge_iq *const iq = arg;
 
 	/* Service the queue only if it could be moved from IDLE to BUSY. */
 	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY) == 0)
 		return;
 
 	service_iq_fl(iq, 0);
 	(void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
 }
 
 #ifdef DEV_NETMAP
 /*
  * Interrupt handler for netmap rx queues ('arg' is the sge_nm_rxq).
  */
 void
 t4_nm_intr(void *arg)
 {
 	struct sge_nm_rxq *const nmq = arg;
 
 	/* Service the queue only if it could be moved from ON to BUSY. */
 	if (atomic_cmpset_int(&nmq->nm_state, NM_ON, NM_BUSY) == 0)
 		return;
 
 	service_nm_rxq(nmq);
 	(void) atomic_cmpset_int(&nmq->nm_state, NM_BUSY, NM_ON);
 }
 
 /*
  * Interrupt handler for vectors shared between NIC and netmap rx queues
  * ('arg' is the struct irq).  The netmap queue is handled first, then the
  * NIC queue; both must be present.
  */
 void
 t4_vi_intr(void *arg)
 {
 	struct irq *const vec = arg;
 
 	MPASS(vec->nm_rxq != NULL);
 	t4_nm_intr(vec->nm_rxq);
 
 	MPASS(vec->rxq != NULL);
 	t4_intr(vec->rxq);
 }
 #endif
 
 /*
  * Deals with interrupts on an iq-only (no freelist) queue.
  *
  * iq:     the ingress queue; its state must be IQS_BUSY and it must not have
  *         a freelist.
  * budget: if nonzero, return EINPROGRESS after this many descriptors have
  *         been processed.  If zero, the queue is drained and the hardware
  *         cidx is updated every iq->qsize / 16 descriptors.
  *
  * Returns 0 when the queue has been drained (the interrupt is rearmed with
  * iq->intr_params), or EINPROGRESS when the budget ran out.
  */
 static int
 service_iq(struct sge_iq *iq, int budget)
 {
 	struct sge_iq *q;
 	struct adapter *sc = iq->adapter;
 	struct iq_desc *d = &iq->desc[iq->cidx];
 	int ndescs = 0, limit;
 	int rsp_type;
 	uint32_t lq;
 	/* iq+fl queues that still had work left after their first pass. */
 	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
 
 	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
 	KASSERT((iq->flags & IQ_HAS_FL) == 0,
 	    ("%s: called for iq %p with fl (iq->flags 0x%x)", __func__, iq,
 	    iq->flags));
 	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
 	MPASS((iq->flags & IQ_LRO_ENABLED) == 0);
 
 	limit = budget ? budget : iq->qsize / 16;
 
 	/*
 	 * We always come back and check the descriptor ring for new indirect
 	 * interrupts and other responses after running a single handler.
 	 */
 	for (;;) {
 		/* A descriptor is valid when its generation bit matches. */
 		while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
 
 			/* Read the rest of the descriptor after the gen bit. */
 			rmb();
 
 			rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
 			lq = be32toh(d->rsp.pldbuflen_qid);
 
 			switch (rsp_type) {
 			case X_RSPD_TYPE_FLBUF:
 				panic("%s: data for an iq (%p) with no freelist",
 				    __func__, iq);
 
 				/* NOTREACHED */
 
 			case X_RSPD_TYPE_CPL:
 				KASSERT(d->rss.opcode < NUM_CPL_CMDS,
 				    ("%s: bad opcode %02x.", __func__,
 				    d->rss.opcode));
 				t4_cpl_handler[d->rss.opcode](iq, &d->rss, NULL);
 				break;
 
 			case X_RSPD_TYPE_INTR:
 				/*
 				 * There are 1K interrupt-capable queues (qids 0
 				 * through 1023).  A response type indicating a
 				 * forwarded interrupt with a qid >= 1K is an
 				 * iWARP async notification.
 				 */
 				if (__predict_true(lq >= 1024)) {
 					t4_an_handler(iq, &d->rsp);
 					break;
 				}
 
 				/*
 				 * Forwarded interrupt: service the iq+fl queue
 				 * it is for, unless that queue is not idle.
 				 * A queue that isn't done after its first pass
 				 * is parked on iql (still BUSY).
 				 */
 				q = sc->sge.iqmap[lq - sc->sge.iq_start -
 				    sc->sge.iq_base];
 				if (atomic_cmpset_int(&q->state, IQS_IDLE,
 				    IQS_BUSY)) {
 					if (service_iq_fl(q, q->qsize / 16) == 0) {
 						(void) atomic_cmpset_int(&q->state,
 						    IQS_BUSY, IQS_IDLE);
 					} else {
 						STAILQ_INSERT_TAIL(&iql, q,
 						    link);
 					}
 				}
 				break;
 
 			default:
 				KASSERT(0,
 				    ("%s: illegal response type %d on iq %p",
 				    __func__, rsp_type, iq));
 				/* The message is newline terminated for log(9). */
 				log(LOG_ERR,
 				    "%s: illegal response type %d on iq %p\n",
 				    device_get_nameunit(sc->dev), rsp_type, iq);
 				break;
 			}
 
 			/* Advance; the generation flips when the ring wraps. */
 			d++;
 			if (__predict_false(++iq->cidx == iq->sidx)) {
 				iq->cidx = 0;
 				iq->gen ^= F_RSPD_GEN;
 				d = &iq->desc[0];
 			}
 			/* Return the processed descriptors to the hardware. */
 			if (__predict_false(++ndescs == limit)) {
 				t4_write_reg(sc, sc->sge_gts_reg,
 				    V_CIDXINC(ndescs) |
 				    V_INGRESSQID(iq->cntxt_id) |
 				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
 				ndescs = 0;
 
 				if (budget) {
 					return (EINPROGRESS);
 				}
 			}
 		}
 
 		if (STAILQ_EMPTY(&iql))
 			break;
 
 		/*
 		 * Process the head only, and send it to the back of the list if
 		 * it's still not done.
 		 */
 		q = STAILQ_FIRST(&iql);
 		STAILQ_REMOVE_HEAD(&iql, link);
 		if (service_iq_fl(q, q->qsize / 8) == 0)
 			(void) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
 		else
 			STAILQ_INSERT_TAIL(&iql, q, link);
 	}
 
 	/* Final cidx update; rearms the queue with its own intr_params. */
 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
 
 	return (0);
 }
 
 #if defined(INET) || defined(INET6)
 /*
  * Returns 1 if lro->lro_mbuf_max is nonzero and 0 otherwise.
  * NOTE(review): presumably a nonzero lro_mbuf_max means that mbufs are
  * queued and sorted before they are fed to LRO -- confirm against tcp_lro.
  */
 static inline int
 sort_before_lro(struct lro_ctrl *lro)
 {
 	const int sorting = lro->lro_mbuf_max != 0;
 
 	return (sorting);
 }
 #endif
 
 /*
  * Convert the big-endian value 'lf' (only its low 60 bits are used) to a
  * time by scaling it with 1000000 / sc->params.vpd.cclk.  The division is
  * done first when the multiplication would overflow 64 bits.
  * NOTE(review): the result is in ns if cclk is in kHz -- confirm the unit.
  */
 static inline uint64_t
 last_flit_to_ns(struct adapter *sc, uint64_t lf)
 {
 	/* 60b, not 64b. */
 	const uint64_t ticks = be64toh(lf) & ((1ULL << 60) - 1);
 
 	if (ticks <= UINT64_MAX / 1000000)
 		return (ticks * 1000000 / sc->params.vpd.cclk);
 
 	/* Too large to multiply first; trade precision for range. */
 	return (ticks / sc->params.vpd.cclk * 1000000);
 }
 
 /*
  * Advance the freelist's software cidx to the next rx buffer and reset the
  * offset within the buffer.  fl->hw_cidx is fl->cidx / 8 and is updated only
  * when fl->cidx reaches a multiple of 8; both wrap to 0 when the hardware
  * index reaches fl->sidx.
  */
 static inline void
 move_to_next_rxbuf(struct sge_fl *fl)
 {
 	uint16_t hw_idx;
 
 	fl->rx_offset = 0;
 	fl->cidx++;
 	if (__predict_true((fl->cidx & 7) != 0))
 		return;
 
 	hw_idx = fl->cidx >> 3;
 	if (__predict_false(hw_idx == fl->sidx)) {
 		fl->cidx = 0;
 		hw_idx = 0;
 	}
 	fl->hw_cidx = hw_idx;
 }
 
 /*
  * Deals with interrupts on an iq+fl queue.
  *
  * iq:     the ingress queue; its state must be IQS_BUSY and it must have a
  *         freelist (IQ_HAS_FL).
  * budget: if nonzero, return EINPROGRESS after this many descriptors have
  *         been processed.  If zero, the hardware cidx is updated every
  *         iq->qsize / 16 descriptors and processing continues.
  *
  * Returns 0 when it stops because there is nothing more to process (or a
  * payload could not be obtained); in that case the interrupt is rearmed with
  * iq->intr_params and the freelist is refilled.  Returns EINPROGRESS when the
  * budget ran out.
  */
 static int
 service_iq_fl(struct sge_iq *iq, int budget)
 {
 	struct sge_rxq *rxq = iq_to_rxq(iq);
 	struct sge_fl *fl;
 	struct adapter *sc = iq->adapter;
 	struct iq_desc *d = &iq->desc[iq->cidx];
 	int ndescs, limit;
 	int rsp_type, starved;
 	uint32_t lq;
 	uint16_t fl_hw_cidx;
 	struct mbuf *m0;
 #if defined(INET) || defined(INET6)
 	const struct timeval lro_timeout = {0, sc->lro_timeout};
 	struct lro_ctrl *lro = &rxq->lro;
 #endif
 
 	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
 	MPASS(iq->flags & IQ_HAS_FL);
 
 	ndescs = 0;
 #if defined(INET) || defined(INET6)
 	/*
 	 * A previous call held back one credit (see the end of this function).
 	 * If nothing new has arrived, flush LRO and return that credit now;
 	 * otherwise account for it and carry on.
 	 */
 	if (iq->flags & IQ_ADJ_CREDIT) {
 		MPASS(sort_before_lro(lro));
 		iq->flags &= ~IQ_ADJ_CREDIT;
 		if ((d->rsp.u.type_gen & F_RSPD_GEN) != iq->gen) {
 			tcp_lro_flush_all(lro);
 			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(1) |
 			    V_INGRESSQID((u32)iq->cntxt_id) |
 			    V_SEINTARM(iq->intr_params));
 			return (0);
 		}
 		ndescs = 1;
 	}
 #else
 	MPASS((iq->flags & IQ_ADJ_CREDIT) == 0);
 #endif
 
 	limit = budget ? budget : iq->qsize / 16;
 	fl = &rxq->fl;
 	fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
 	/* A descriptor is valid when its generation bit matches. */
 	while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
 
 		/* Read the rest of the descriptor after the gen bit. */
 		rmb();
 
 		m0 = NULL;
 		rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
 		lq = be32toh(d->rsp.pldbuflen_qid);
 
 		switch (rsp_type) {
 		case X_RSPD_TYPE_FLBUF:
 			/* New buffer: abandon the rest of the current one. */
 			if (lq & F_RSPD_NEWBUF) {
 				if (fl->rx_offset > 0)
 					move_to_next_rxbuf(fl);
 				lq = G_RSPD_LEN(lq);
 			}
 			/* Refill once the hw cidx has moved far enough. */
 			if (IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 4) {
 				FL_LOCK(fl);
 				refill_fl(sc, fl, 64);
 				FL_UNLOCK(fl);
 				fl_hw_cidx = fl->hw_cidx;
 			}
 
 			/* Ethernet frames go to eth_rx, not a CPL handler. */
 			if (d->rss.opcode == CPL_RX_PKT) {
 				if (__predict_true(eth_rx(sc, rxq, d, lq) == 0))
 					break;
 				goto out;
 			}
 			m0 = get_fl_payload(sc, fl, lq);
 			if (__predict_false(m0 == NULL))
 				goto out;
 
 			/* fall through */
 
 		case X_RSPD_TYPE_CPL:
 			KASSERT(d->rss.opcode < NUM_CPL_CMDS,
 			    ("%s: bad opcode %02x.", __func__, d->rss.opcode));
 			t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0);
 			break;
 
 		case X_RSPD_TYPE_INTR:
 
 			/*
 			 * There are 1K interrupt-capable queues (qids 0
 			 * through 1023).  A response type indicating a
 			 * forwarded interrupt with a qid >= 1K is an
 			 * iWARP async notification.  That is the only
 			 * acceptable indirect interrupt on this queue.
 			 */
 			if (__predict_false(lq < 1024)) {
 				panic("%s: indirect interrupt on iq_fl %p "
 				    "with qid %u", __func__, iq, lq);
 			}
 
 			t4_an_handler(iq, &d->rsp);
 			break;
 
 		default:
 			KASSERT(0, ("%s: illegal response type %d on iq %p",
 			    __func__, rsp_type, iq));
 			/* The message is newline terminated for log(9). */
 			log(LOG_ERR, "%s: illegal response type %d on iq %p\n",
 			    device_get_nameunit(sc->dev), rsp_type, iq);
 			break;
 		}
 
 		/* Advance; the generation flips when the ring wraps. */
 		d++;
 		if (__predict_false(++iq->cidx == iq->sidx)) {
 			iq->cidx = 0;
 			iq->gen ^= F_RSPD_GEN;
 			d = &iq->desc[0];
 		}
 		/* Return the processed descriptors to the hardware. */
 		if (__predict_false(++ndescs == limit)) {
 			t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 			    V_INGRESSQID(iq->cntxt_id) |
 			    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
 
 #if defined(INET) || defined(INET6)
 			if (iq->flags & IQ_LRO_ENABLED &&
 			    !sort_before_lro(lro) &&
 			    sc->lro_timeout != 0) {
 				tcp_lro_flush_inactive(lro, &lro_timeout);
 			}
 #endif
 			if (budget)
 				return (EINPROGRESS);
 			ndescs = 0;
 		}
 	}
 out:
 #if defined(INET) || defined(INET6)
 	if (iq->flags & IQ_LRO_ENABLED) {
 		if (ndescs > 0 && lro->lro_mbuf_count > 8) {
 			MPASS(sort_before_lro(lro));
 			/* hold back one credit and don't flush LRO state */
 			iq->flags |= IQ_ADJ_CREDIT;
 			ndescs--;
 		} else {
 			tcp_lro_flush_all(lro);
 		}
 	}
 #endif
 
 	/* Final cidx update; rearms the queue with its own intr_params. */
 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
 
 	/* Top up the freelist; a starving one is handed to add_fl_to_sfl. */
 	FL_LOCK(fl);
 	starved = refill_fl(sc, fl, 64);
 	FL_UNLOCK(fl);
 	if (__predict_false(starved != 0))
 		add_fl_to_sfl(sc, fl);
 
 	return (0);
 }
 
 static inline struct cluster_metadata *
 cl_metadata(struct fl_sdesc *sd)
 {
 
 	return ((void *)(sd->cl + sd->moff));
 }
 
 /*
  * Free routine for mbufs set up with m_extaddref in get_scatter_segment.
  * Returns the cluster recorded in the metadata (ext_arg1) to its zone and
  * counts the release in extfree_rels.
  */
 static void
 rxb_free(struct mbuf *m)
 {
 	struct cluster_metadata *meta;
 
 	meta = m->m_ext.ext_arg1;
 	uma_zfree(meta->zone, meta->cl);
 	counter_u64_add(extfree_rels, 1);
 }
 
 /*
  * Build one mbuf for the next piece of a frame from the freelist's current
  * rx buffer.
  *
  * fr_offset: offset of this piece within the frame; 0 means it is the first
  *            piece and the mbuf gets a packet header.
  * remaining: number of bytes of the frame still to be collected.
  *
  * Returns NULL if an mbuf could not be allocated (the freelist state is not
  * modified in that case).
  *
  * The mbuf returned comes from zone_mbuf and carries the payload in one of
  * these ways
  * a) complete frame inside the mbuf
  * b) m_cljset (for clusters without metadata)
  * c) m_extaddref (cluster with metadata)
  */
 static struct mbuf *
 get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
     int remaining)
 {
 	struct mbuf *m;
 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
 	struct cluster_metadata *clm;
 	int len, blen;
 	caddr_t payload;
 
 	/*
 	 * len is the payload carried by this mbuf; blen is how much of the
 	 * hardware buffer it uses up (payload plus any padding).
 	 */
 	if (fl->flags & FL_BUF_PACKING) {
 		u_int l, pad;
 
 		blen = rxb->size2 - fl->rx_offset;	/* max possible in this buf */
 		len = min(remaining, blen);
 		payload = sd->cl + fl->rx_offset;
 
 		/* Pad up to buf_boundary unless that reaches the buffer's end. */
 		l = fr_offset + len;
 		pad = roundup2(l, fl->buf_boundary) - l;
 		if (fl->rx_offset + len + pad < rxb->size2)
 			blen = len + pad;
 		MPASS(fl->rx_offset + blen <= rxb->size2);
 	} else {
 		MPASS(fl->rx_offset == 0);	/* not packing */
 		blen = rxb->size1;
 		len = min(remaining, blen);
 		payload = sd->cl;
 	}
 
 	if (fr_offset == 0) {
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (__predict_false(m == NULL))
 			return (NULL);
 		/* The first mbuf records the length of the whole frame. */
 		m->m_pkthdr.len = remaining;
 	} else {
 		m = m_get(M_NOWAIT, MT_DATA);
 		if (__predict_false(m == NULL))
 			return (NULL);
 	}
 	m->m_len = len;
 
 	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
 		/* copy data to mbuf */
 		bcopy(payload, mtod(m, caddr_t), len);
 		if (fl->flags & FL_BUF_PACKING) {
 			fl->rx_offset += blen;
 			MPASS(fl->rx_offset <= rxb->size2);
 			if (fl->rx_offset < rxb->size2)
 				return (m);	/* without advancing the cidx */
 		}
 	} else if (fl->flags & FL_BUF_PACKING) {
 		/* Share the cluster; the first user sets up its metadata. */
 		clm = cl_metadata(sd);
 		if (sd->nmbuf++ == 0) {
 			clm->refcount = 1;
 			clm->zone = rxb->zone;
 			clm->cl = sd->cl;
 			counter_u64_add(extfree_refs, 1);
 		}
 		m_extaddref(m, payload, blen, &clm->refcount, rxb_free, clm,
 		    NULL);
 
 		fl->rx_offset += blen;
 		MPASS(fl->rx_offset <= rxb->size2);
 		if (fl->rx_offset < rxb->size2)
 			return (m);	/* without advancing the cidx */
 	} else {
 		/* Hand the whole cluster over to the mbuf. */
 		m_cljset(m, sd->cl, rxb->type);
 		sd->cl = NULL;	/* consumed, not a recycle candidate */
 	}
 
 	/* The current hardware buffer is used up. */
 	move_to_next_rxbuf(fl);
 
 	return (m);
 }
 
 /*
  * Assemble an mbuf chain for a payload of 'plen' bytes from the freelist.
  *
  * Returns the head of the chain (with a packet header), or NULL if an mbuf
  * could not be allocated.  If the failure happens after the first segment,
  * the partial chain and the progress made are saved in the freelist
  * (FL_BUF_RESUME) and the next call with the same plen continues from there.
  */
 static struct mbuf *
 get_fl_payload(struct adapter *sc, struct sge_fl *fl, const u_int plen)
 {
 	struct mbuf *m0, *m, **pnext;
 	u_int remaining;
 
 	/* Pick up where an earlier, interrupted call left off. */
 	if (__predict_false(fl->flags & FL_BUF_RESUME)) {
 		M_ASSERTPKTHDR(fl->m0);
 		MPASS(fl->m0->m_pkthdr.len == plen);
 		MPASS(fl->remaining < plen);
 
 		m0 = fl->m0;
 		pnext = fl->pnext;
 		remaining = fl->remaining;
 		fl->flags &= ~FL_BUF_RESUME;
 		/* Jumps into the loop body below. */
 		goto get_segment;
 	}
 
 	/*
 	 * Payload starts at rx_offset in the current hw buffer.  Its length is
 	 * 'plen' and it may span multiple hw buffers.
 	 */
 
 	m0 = get_scatter_segment(sc, fl, 0, plen);
 	if (m0 == NULL)
 		return (NULL);
 	remaining = plen - m0->m_len;
 	pnext = &m0->m_next;
 	while (remaining > 0) {
 get_segment:
 		/* Every segment after the first starts a fresh hw buffer. */
 		MPASS(fl->rx_offset == 0);
 		m = get_scatter_segment(sc, fl, plen - remaining, remaining);
 		if (__predict_false(m == NULL)) {
 			/* Save the state needed to resume later. */
 			fl->m0 = m0;
 			fl->pnext = pnext;
 			fl->remaining = remaining;
 			fl->flags |= FL_BUF_RESUME;
 			return (NULL);
 		}
 		*pnext = m;
 		pnext = &m->m_next;
 		remaining -= m->m_len;
 	}
 	/* Terminate the chain. */
 	*pnext = NULL;
 
 	M_ASSERTPKTHDR(m0);
 	return (m0);
 }
 
 /*
  * Step over the part of a frame that sits in the freelist's current hw
  * buffer without building an mbuf for it.  'fr_offset' is how far into the
  * frame this segment starts and 'remaining' is how much of the frame is
  * left.  Returns the number of payload bytes stepped over.
  */
 static int
 skip_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
     int remaining)
 {
 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
 	u_int end, padding;
 	int len, blen;
 
 	if ((fl->flags & FL_BUF_PACKING) == 0) {
 		/* One buffer per segment; always done with this buffer. */
 		MPASS(fl->rx_offset == 0);	/* not packing */
 		blen = rxb->size1;
 		len = min(remaining, blen);
 		move_to_next_rxbuf(fl);
 		return (len);
 	}
 
 	/* Packing: at most what is left of this buffer. */
 	blen = rxb->size2 - fl->rx_offset;
 	len = min(remaining, blen);
 
 	/* Include the padding up to the next boundary if it fits. */
 	end = fr_offset + len;
 	padding = roundup2(end, fl->buf_boundary) - end;
 	if (fl->rx_offset + len + padding < rxb->size2)
 		blen = len + padding;
 
 	fl->rx_offset += blen;
 	MPASS(fl->rx_offset <= rxb->size2);
 	if (fl->rx_offset >= rxb->size2)
 		move_to_next_rxbuf(fl);	/* buffer used up, advance the cidx */
 
 	return (len);
 }
 
 /*
  * Discard a 'plen'-byte payload from the freelist, one scatter segment at a
  * time, until all of it has been stepped over.
  */
 static inline void
 skip_fl_payload(struct adapter *sc, struct sge_fl *fl, int plen)
 {
 	int left, done, n;
 
 	for (done = 0, left = plen; left > 0; done += n, left -= n)
 		n = skip_scatter_segment(sc, fl, done, left);
 }
 
 /*
  * Length of the portion of a 'plen'-byte payload that is in the freelist's
  * current hw buffer: plen itself, or what the buffer can hold if smaller.
  */
 static inline int
 get_segment_len(struct adapter *sc, struct sge_fl *fl, int plen)
 {
 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 	struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx];
 	int room;
 
 	if ((fl->flags & FL_BUF_PACKING) == 0)
 		room = rxb->size1;
 	else
 		room = rxb->size2 - fl->rx_offset;
 
 	return (min(plen, room));
 }
 
 /*
  * Process one received Ethernet frame whose payload (plen bytes, including
  * the fl_pktshift bytes in front of the frame) is in rxq's freelist and
  * whose response descriptor is 'd'.
  *
  * Runs the pfil hooks (if any are hooked in), gets an mbuf chain for the
  * payload, fills in the packet header (rcvif, RSS hash, checksum status,
  * VLAN tag, timestamp, NUMA domain) and passes the mbuf to LRO or to the
  * interface's if_input routine.
  *
  * Returns 0 if the frame was dealt with (delivered, queued for LRO, or
  * dropped/consumed by a pfil hook) and ENOMEM if get_fl_payload() could not
  * provide the mbuf chain.
  */
 static int
 eth_rx(struct adapter *sc, struct sge_rxq *rxq, const struct iq_desc *d,
     u_int plen)
 {
 	struct mbuf *m0;
 	struct ifnet *ifp = rxq->ifp;
 	struct sge_fl *fl = &rxq->fl;
 	struct vi_info *vi = ifp->if_softc;
 	const struct cpl_rx_pkt *cpl;
 #if defined(INET) || defined(INET6)
 	struct lro_ctrl *lro = &rxq->lro;
 #endif
 	uint16_t err_vec, tnl_type, tnlhdr_len;
 	/* Indexed by [rss.hash_type][rss.ipv6] from the response descriptor. */
 	static const int sw_hashtype[4][2] = {
 		{M_HASHTYPE_NONE, M_HASHTYPE_NONE},
 		{M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
 		{M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
 		{M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
 	};
 	/* Indexed by [outer frame is IPv6][inner frame is IPv6]. */
 	static const int sw_csum_flags[2][2] = {
 		{
 			/* IP, inner IP */
 			CSUM_ENCAP_VXLAN |
 			    CSUM_L3_CALC | CSUM_L3_VALID |
 			    CSUM_L4_CALC | CSUM_L4_VALID |
 			    CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
 			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
 
 			/* IP, inner IP6 */
 			CSUM_ENCAP_VXLAN |
 			    CSUM_L3_CALC | CSUM_L3_VALID |
 			    CSUM_L4_CALC | CSUM_L4_VALID |
 			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
 		},
 		{
 			/* IP6, inner IP */
 			CSUM_ENCAP_VXLAN |
 			    CSUM_L4_CALC | CSUM_L4_VALID |
 			    CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
 			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
 
 			/* IP6, inner IP6 */
 			CSUM_ENCAP_VXLAN |
 			    CSUM_L4_CALC | CSUM_L4_VALID |
 			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
 		},
 	};
 
 	MPASS(plen > sc->params.sge.fl_pktshift);
 	/*
 	 * The hooks are given a pointer to the frame inside the hw buffer
 	 * (PFIL_MEMPTR), limited to the part that is in the current buffer.
 	 * They are not run when a partially built chain is being resumed.
 	 */
 	if (vi->pfil != NULL && PFIL_HOOKED_IN(vi->pfil) &&
 	    __predict_true((fl->flags & FL_BUF_RESUME) == 0)) {
 		struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 		caddr_t frame;
 		int rc, slen;
 
 		slen = get_segment_len(sc, fl, plen) -
 		    sc->params.sge.fl_pktshift;
 		frame = sd->cl + fl->rx_offset + sc->params.sge.fl_pktshift;
 		CURVNET_SET_QUIET(ifp->if_vnet);
 		rc = pfil_run_hooks(vi->pfil, frame, ifp,
 		    slen | PFIL_MEMPTR | PFIL_IN, NULL);
 		CURVNET_RESTORE();
 		if (rc == PFIL_DROPPED || rc == PFIL_CONSUMED) {
 			/* Nothing to deliver; just release the payload. */
 			skip_fl_payload(sc, fl, plen);
 			return (0);
 		}
 		if (rc == PFIL_REALLOCED) {
 			/* The hook produced an mbuf; use it for delivery. */
 			skip_fl_payload(sc, fl, plen);
 			m0 = pfil_mem2mbuf(frame);
 			goto have_mbuf;
 		}
 	}
 
 	m0 = get_fl_payload(sc, fl, plen);
 	if (__predict_false(m0 == NULL))
 		return (ENOMEM);
 
 	/* Trim the fl_pktshift bytes that precede the frame. */
 	m0->m_pkthdr.len -= sc->params.sge.fl_pktshift;
 	m0->m_len -= sc->params.sge.fl_pktshift;
 	m0->m_data += sc->params.sge.fl_pktshift;
 
 have_mbuf:
 	m0->m_pkthdr.rcvif = ifp;
 	M_HASHTYPE_SET(m0, sw_hashtype[d->rss.hash_type][d->rss.ipv6]);
 	m0->m_pkthdr.flowid = be32toh(d->rss.hash_val);
 
 	/* The CPL is located right after the RSS header in the descriptor. */
 	cpl = (const void *)(&d->rss + 1);
 	if (sc->params.tp.rx_pkt_encap) {
 		/* err_vec also carries the tunnel type and header length. */
 		const uint16_t ev = be16toh(cpl->err_vec);
 
 		err_vec = G_T6_COMPR_RXERR_VEC(ev);
 		tnl_type = G_T6_RX_TNL_TYPE(ev);
 		tnlhdr_len = G_T6_RX_TNLHDR_LEN(ev);
 	} else {
 		err_vec = be16toh(cpl->err_vec);
 		tnl_type = 0;
 		tnlhdr_len = 0;
 	}
 	if (cpl->csum_calc && err_vec == 0) {
 		int ipv6 = !!(cpl->l2info & htobe32(F_RXF_IP6));
 
 		/* checksum(s) calculated and found to be correct. */
 
 		MPASS((cpl->l2info & htobe32(F_RXF_IP)) ^
 		    (cpl->l2info & htobe32(F_RXF_IP6)));
 		m0->m_pkthdr.csum_data = be16toh(cpl->csum);
 		if (tnl_type == 0) {
 			/* Not tunneled.  Flags depend on what is enabled. */
 	    		if (!ipv6 && ifp->if_capenable & IFCAP_RXCSUM) {
 				m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
 				    CSUM_L3_VALID | CSUM_L4_CALC |
 				    CSUM_L4_VALID;
 			} else if (ipv6 && ifp->if_capenable & IFCAP_RXCSUM_IPV6) {
 				m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
 				    CSUM_L4_VALID;
 			}
 			rxq->rxcsum++;
 		} else {
 			MPASS(tnl_type == RX_PKT_TNL_TYPE_VXLAN);
 
 			M_HASHTYPE_SETINNER(m0);
 			if (__predict_false(cpl->ip_frag)) {
 				/*
 				 * csum_data is for the inner frame (which is an
 				 * IP fragment) and is not 0xffff.  There is no
 				 * way to pass the inner csum_data to the stack.
 				 * We don't want the stack to use the inner
 				 * csum_data to validate the outer frame or it
 				 * will get rejected.  So we fix csum_data here
 				 * and let sw do the checksum of inner IP
 				 * fragments.
 				 *
 				 * XXX: Need 32b for csum_data2 in an rx mbuf.
 				 * Maybe stuff it into rcv_tstmp?
 				 */
 				m0->m_pkthdr.csum_data = 0xffff;
 				if (ipv6) {
 					m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
 					    CSUM_L4_VALID;
 				} else {
 					m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
 					    CSUM_L3_VALID | CSUM_L4_CALC |
 					    CSUM_L4_VALID;
 				}
 			} else {
 				int outer_ipv6;
 
 				MPASS(m0->m_pkthdr.csum_data == 0xffff);
 
 				/*
 				 * The outer frame is taken to be IPv6 if the
 				 * tunnel header is long enough to hold an
 				 * Ethernet header and an IPv6 header.
 				 */
 				outer_ipv6 = tnlhdr_len >=
 				    sizeof(struct ether_header) +
 				    sizeof(struct ip6_hdr);
 				m0->m_pkthdr.csum_flags =
 				    sw_csum_flags[outer_ipv6][ipv6];
 			}
 			rxq->vxlan_rxcsum++;
 		}
 	}
 
 	if (cpl->vlan_ex) {
 		/* Pass the VLAN tag reported by the CPL along in the pkthdr. */
 		m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
 		m0->m_flags |= M_VLANTAG;
 		rxq->vlan_extraction++;
 	}
 
 	if (rxq->iq.flags & IQ_RX_TIMESTAMP) {
 		/*
 		 * Fill up rcv_tstmp but do not set M_TSTMP.
 		 * rcv_tstmp is not in the format that the
 		 * kernel expects and we don't want to mislead
 		 * it.  For now this is only for custom code
 		 * that knows how to interpret cxgbe's stamp.
 		 */
 		m0->m_pkthdr.rcv_tstmp =
 		    last_flit_to_ns(sc, d->rsp.u.last_flit);
 #ifdef notyet
 		m0->m_flags |= M_TSTMP;
 #endif
 	}
 
 #ifdef NUMA
 	m0->m_pkthdr.numa_domain = ifp->if_numa_domain;
 #endif
 #if defined(INET) || defined(INET6)
 	/* LRO is attempted only for non-tunneled frames hashed as TCP. */
 	if (rxq->iq.flags & IQ_LRO_ENABLED && tnl_type == 0 &&
 	    (M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV4 ||
 	    M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV6)) {
 		if (sort_before_lro(lro)) {
 			tcp_lro_queue_mbuf(lro, m0);
 			return (0); /* queued for sort, then LRO */
 		}
 		if (tcp_lro_rx(lro, m0, 0) == 0)
 			return (0); /* queued for LRO */
 	}
 #endif
 	/* Not taken by LRO; hand the frame to the interface input routine. */
 	ifp->if_input(ifp, m0);
 
 	return (0);
 }
 
 /*
  * Drain handler for a work request queue ('arg' is the wrq; 'n' is not
  * used).  It must either drain the wrq or leave it to someone who will:
  * the list is drained here only if no WR is still incomplete.
  */
 static void
 wrq_tx_drain(void *arg, int n)
 {
 	struct sge_wrq *wrq = arg;
 	struct sge_eq *eq = &wrq->eq;
 
 	EQ_LOCK(eq);
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
 		if (!STAILQ_EMPTY(&wrq->wr_list))
 			drain_wrq_wr_list(wrq->adapter, wrq);
 	}
 	EQ_UNLOCK(eq);
 }
 
 /*
  * Copy work requests from wrq->wr_list into the descriptor ring of the
  * wrq's egress queue, in order, for as long as the ring has room for the
  * WR at the head of the list.  Each WR that has been copied is removed from
  * the list and freed, and the doorbell is rung for the descriptors written.
  *
  * The caller must hold the EQ lock, there must be no incomplete WRs, and
  * the list must not be empty.
  */
 static void
 drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
 {
 	struct sge_eq *eq = &wrq->eq;
 	u_int available, dbdiff;	/* # of hardware descriptors */
 	u_int n;
 	struct wrqe *wr;
 	struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
 	wr = STAILQ_FIRST(&wrq->wr_list);
 	MPASS(wr != NULL);	/* Must be called with something useful to do */
 	MPASS(eq->pidx == eq->dbidx);
 	dbdiff = 0;
 
 	do {
 		/* Refresh cidx and work out how many descriptors are free. */
 		eq->cidx = read_hw_cidx(eq);
 		if (eq->pidx == eq->cidx)
 			available = eq->sidx - 1;
 		else
 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 
 		MPASS(wr->wrq == wrq);
 		n = howmany(wr->wr_len, EQ_ESIZE);
 		if (available < n)
 			break;	/* no room for this WR; it stays on the list */
 
 		dst = (void *)&eq->desc[eq->pidx];
 		if (__predict_true(eq->sidx - eq->pidx > n)) {
 			/* Won't wrap, won't end exactly at the status page. */
 			bcopy(&wr->wr[0], dst, wr->wr_len);
 			eq->pidx += n;
 		} else {
 			/*
 			 * Copy up to the end of the ring, then the rest (if
 			 * any) to the start of the ring.
 			 */
 			int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;
 
 			bcopy(&wr->wr[0], dst, first_portion);
 			if (wr->wr_len > first_portion) {
 				bcopy(&wr->wr[first_portion], &eq->desc[0],
 				    wr->wr_len - first_portion);
 			}
 			eq->pidx = n - (eq->sidx - eq->pidx);
 		}
 		wrq->tx_wrs_copied++;
 
 		/*
 		 * Running low on descriptors: set the EQUIQ/EQUEQ flags in
 		 * this WR, unless equiq is already set.
 		 */
 		if (available < eq->sidx / 4 &&
 		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 				/*
 				 * XXX: This is not 100% reliable with some
 				 * types of WRs.  But this is a very unusual
 				 * situation for an ofld/ctrl queue anyway.
 				 */
 			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 			    F_FW_WR_EQUEQ);
 		}
 
 		/* Ring the doorbell once 16 or more descriptors are pending. */
 		dbdiff += n;
 		if (dbdiff >= 16) {
 			ring_eq_db(sc, eq, dbdiff);
 			dbdiff = 0;
 		}
 
 		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
 		free_wrqe(wr);
 		MPASS(wrq->nwr_pending > 0);
 		wrq->nwr_pending--;
 		MPASS(wrq->ndesc_needed >= n);
 		wrq->ndesc_needed -= n;
 	} while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);
 
 	/* Doorbell for whatever has been written but not announced yet. */
 	if (dbdiff)
 		ring_eq_db(sc, eq, dbdiff);
 }
 
 /*
  * Queue a work request for transmission on wrq.  Never fails: the WR is
  * appended to the wrq's list and stays there if it cannot be sent right
  * now.  The EQ lock must be held by the caller.
  */
 void
 t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
 {
 #ifdef INVARIANTS
 	struct sge_eq *eq = &wrq->eq;
 #endif
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 	MPASS(wr != NULL);
 	MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
 	MPASS((wr->wr_len & 0x7) == 0);
 
 	/* Account for the WR and put it at the end of the list. */
 	wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);
 	wrq->nwr_pending++;
 	STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
 
 	/*
 	 * If there are incomplete WRs then commit_wrq_wr will drain wr_list
 	 * as well and there is nothing more to do here.
 	 */
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
 		drain_wrq_wr_list(sc, wrq);
 
 		/* Doorbell must have caught up to the pidx. */
 		MPASS(eq->pidx == eq->dbidx);
 	}
 }
 
 /*
  * Recompute the refill source (zidx) of every rx freelist that belongs to
  * the interface, based on the current maximum rx payload.  The NIC rx
  * queues are always updated; the offload rx queues too if TCP_OFFLOAD is
  * compiled in.
  */
 void
 t4_update_fl_bufsize(struct ifnet *ifp)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct adapter *sc = vi->adapter;
 	struct sge_rxq *rxq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 	struct sge_fl *fl;
 	int i, maxp, packing;
 
 	maxp = max_rx_payload(sc, ifp, false);
 	for_each_rxq(vi, i, rxq) {
 		fl = &rxq->fl;
 
 		/* zidx is updated with the freelist locked. */
 		FL_LOCK(fl);
 		packing = fl->flags & FL_BUF_PACKING;
 		fl->zidx = find_refill_source(sc, maxp, packing);
 		FL_UNLOCK(fl);
 	}
 #ifdef TCP_OFFLOAD
 	maxp = max_rx_payload(sc, ifp, true);
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		fl = &ofld_rxq->fl;
 
 		FL_LOCK(fl);
 		packing = fl->flags & FL_BUF_PACKING;
 		fl->zidx = find_refill_source(sc, maxp, packing);
 		FL_UNLOCK(fl);
 	}
 #endif
 }
 
 /*
  * Number of gather list segments recorded for this packet by
  * set_mbuf_nsegs.  The count is kept in the inner_l5hlen field of the
  * packet header and must already have been set (non-zero).
  */
 static inline int
 mbuf_nsegs(struct mbuf *m)
 {
 	int nsegs;
 
 	M_ASSERTPKTHDR(m);
 	KASSERT(m->m_pkthdr.inner_l5hlen > 0,
 	    ("%s: mbuf %p missing information on # of segments.", __func__, m));
 	nsegs = m->m_pkthdr.inner_l5hlen;
 
 	return (nsegs);
 }
 
 /*
  * Record the number of gather list segments for this packet.  The count is
  * stored in the inner_l5hlen field of the packet header, which is where
  * mbuf_nsegs reads it from.
  */
 static inline void
 set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.inner_l5hlen = nsegs;
 }
 
 /*
  * The driver's per-packet flags (MC_*), which are kept in PH_loc.eight[4]
  * of the packet header.
  */
 static inline int
 mbuf_cflags(struct mbuf *m)
 {
 	int cflags;
 
 	M_ASSERTPKTHDR(m);
 	cflags = m->m_pkthdr.PH_loc.eight[4];
 
 	return (cflags);
 }
 
 /*
  * Store the driver's per-packet flags in PH_loc.eight[4] of the packet
  * header, which is where mbuf_cflags reads them from.
  */
 static inline void
 set_mbuf_cflags(struct mbuf *m, uint8_t flags)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[4] = flags;
 }
 
 /*
  * The len16 value recorded for this packet in PH_loc.eight[0].  The range
  * of the value is asserted unless the packet is flagged MC_TLS.
  */
 static inline int
 mbuf_len16(struct mbuf *m)
 {
 	int n;
 
 	M_ASSERTPKTHDR(m);
 	n = m->m_pkthdr.PH_loc.eight[0];
 	if ((mbuf_cflags(m) & MC_TLS) == 0) {
 		MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
 	}
 
 	return (n);
 }
 
 /*
  * Record the len16 value for this packet in PH_loc.eight[0].  The range of
  * the value is asserted unless the packet is flagged MC_TLS.
  */
 static inline void
 set_mbuf_len16(struct mbuf *m, uint8_t len16)
 {
 
 	M_ASSERTPKTHDR(m);
 	if ((mbuf_cflags(m) & MC_TLS) == 0) {
 		MPASS(len16 > 0 && len16 <= SGE_MAX_WR_LEN / 16);
 	}
 	m->m_pkthdr.PH_loc.eight[0] = len16;
 }
 
 #ifdef RATELIMIT
 /*
  * The segment count recorded by set_mbuf_eo_nsegs, which is kept in
  * PH_loc.eight[1] of the packet header.
  */
 static inline int
 mbuf_eo_nsegs(struct mbuf *m)
 {
 	int nsegs;
 
 	M_ASSERTPKTHDR(m);
 	nsegs = m->m_pkthdr.PH_loc.eight[1];
 
 	return (nsegs);
 }
 
 #if defined(INET) || defined(INET6)
 /*
  * Store the segment count in PH_loc.eight[1] of the packet header, which
  * is where mbuf_eo_nsegs reads it from.
  */
 static inline void
 set_mbuf_eo_nsegs(struct mbuf *m, uint8_t nsegs)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[1] = nsegs;
 }
 #endif
 
 /*
  * The len16 value recorded by set_mbuf_eo_len16, which is kept in
  * PH_loc.eight[2] of the packet header.  The value is asserted to be in
  * the range 1 to SGE_MAX_WR_LEN / 16.
  */
 static inline int
 mbuf_eo_len16(struct mbuf *m)
 {
 	int n;
 
 	M_ASSERTPKTHDR(m);
 	n = m->m_pkthdr.PH_loc.eight[2];
 	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
 
 	return (n);
 }
 
 #if defined(INET) || defined(INET6)
 /*
  * Store the len16 value in PH_loc.eight[2] of the packet header, which is
  * where mbuf_eo_len16 reads it from.
  */
 static inline void
 set_mbuf_eo_len16(struct mbuf *m, uint8_t len16)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[2] = len16;
 }
 #endif
 
 /*
  * The tsclk/tsoff value recorded by set_mbuf_eo_tsclk_tsoff, which is kept
  * in PH_loc.eight[3] of the packet header.
  */
 static inline int
 mbuf_eo_tsclk_tsoff(struct mbuf *m)
 {
 	int tsclk_tsoff;
 
 	M_ASSERTPKTHDR(m);
 	tsclk_tsoff = m->m_pkthdr.PH_loc.eight[3];
 
 	return (tsclk_tsoff);
 }
 
 #if defined(INET) || defined(INET6)
 /*
  * Store the tsclk/tsoff value in PH_loc.eight[3] of the packet header,
  * which is where mbuf_eo_tsclk_tsoff reads it from.
  */
 static inline void
 set_mbuf_eo_tsclk_tsoff(struct mbuf *m, uint8_t tsclk_tsoff)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[3] = tsclk_tsoff;
 }
 #endif
 
 /*
  * Non-zero if a send tag is present (mst may be NULL) and its type is
  * IF_SND_TAG_TYPE_RATE_LIMIT.
  */
 static inline int
 needs_eo(struct m_snd_tag *mst)
 {
 
-	return (mst != NULL && mst->type == IF_SND_TAG_TYPE_RATE_LIMIT);
+	return (mst != NULL && mst->sw->type == IF_SND_TAG_TYPE_RATE_LIMIT);
 }
 #endif
 
 /*
  * Allocate a single mbuf (never a chain, so that the work request is easy
  * to construct) that can hold a raw work request of 'len' bytes.  The mbuf
  * is marked MC_RAW_WR and its len16 is set.  Returns NULL if len is more
  * than MCLBYTES or if the allocation fails.
  */
 struct mbuf *
 alloc_wr_mbuf(int len, int how)
 {
 	struct mbuf *m;
 
 	if (len <= MHLEN)
 		m = m_gethdr(how, MT_DATA);
 	else if (len > MCLBYTES)
 		return (NULL);		/* doesn't fit in a single mbuf */
 	else
 		m = m_getcl(how, MT_DATA, M_PKTHDR);
 
 	if (m == NULL)
 		return (NULL);
 
 	m->m_len = len;
 	m->m_pkthdr.len = len;
 	set_mbuf_cflags(m, MC_RAW_WR);
 	set_mbuf_len16(m, howmany(len, 16));
 
 	return (m);
 }
 
 /*
  * True if the packet requests any of the checksum or TSO offloads listed
  * below, outer or inner.
  */
 static inline bool
 needs_hwcsum(struct mbuf *m)
 {
 	const uint32_t wanted = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP |
 	    CSUM_IP_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP |
 	    CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_IP6_UDP |
 	    CSUM_IP6_TCP | CSUM_IP6_TSO | CSUM_INNER_IP6_UDP |
 	    CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO;
 
 	M_ASSERTPKTHDR(m);
 
 	return ((m->m_pkthdr.csum_flags & wanted) != 0);
 }
 
 /* True if the packet requests TSO of any kind (IPv4 or IPv6, outer or inner). */
 static inline bool
 needs_tso(struct mbuf *m)
 {
 	const uint32_t wanted = CSUM_IP_TSO | CSUM_IP6_TSO |
 	    CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO;
 
 	M_ASSERTPKTHDR(m);
 
 	return ((m->m_pkthdr.csum_flags & wanted) != 0);
 }
 
 /* True if the packet is marked CSUM_ENCAP_VXLAN. */
 static inline bool
 needs_vxlan_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return ((m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN) != 0);
 }
 
 static inline bool
 needs_vxlan_tso(struct mbuf *m)
 {
 	const uint32_t csum_flags = CSUM_ENCAP_VXLAN | CSUM_INNER_IP_TSO |
 	    CSUM_INNER_IP6_TSO;
 
 	M_ASSERTPKTHDR(m);
 
 	return ((m->m_pkthdr.csum_flags & csum_flags) != 0 &&
 	    (m->m_pkthdr.csum_flags & csum_flags) != CSUM_ENCAP_VXLAN);
 }
 
 #if defined(INET) || defined(INET6)
 /* True if the packet requests TSO on the inner frame (IPv4 or IPv6). */
 static inline bool
 needs_inner_tcp_csum(struct mbuf *m)
 {
 	const uint32_t wanted = CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO;
 
 	M_ASSERTPKTHDR(m);
 
 	return ((m->m_pkthdr.csum_flags & wanted) != 0);
 }
 #endif
 
 /*
  * True if the packet has any of CSUM_IP, CSUM_IP_TSO, CSUM_INNER_IP or
  * CSUM_INNER_IP_TSO set.
  */
 static inline bool
 needs_l3_csum(struct mbuf *m)
 {
 	const uint32_t wanted = CSUM_IP | CSUM_IP_TSO | CSUM_INNER_IP |
 	    CSUM_INNER_IP_TSO;
 
 	M_ASSERTPKTHDR(m);
 
 	return ((m->m_pkthdr.csum_flags & wanted) != 0);
 }
 
 /*
  * True if the packet has any of the outer TCP checksum or TSO flags set
  * (IPv4 or IPv6).
  */
 static inline bool
 needs_outer_tcp_csum(struct mbuf *m)
 {
 	const uint32_t wanted = CSUM_IP_TCP | CSUM_IP_TSO | CSUM_IP6_TCP |
 	    CSUM_IP6_TSO;
 
 	M_ASSERTPKTHDR(m);
 
 	return ((m->m_pkthdr.csum_flags & wanted) != 0);
 }
 
 #ifdef RATELIMIT
 /*
  * True if the packet has any of the outer UDP, TCP or TSO flags set (IPv4
  * or IPv6).
  */
 static inline bool
 needs_outer_l4_csum(struct mbuf *m)
 {
 	const uint32_t wanted = CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_TSO |
 	    CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_TSO;
 
 	M_ASSERTPKTHDR(m);
 
 	return ((m->m_pkthdr.csum_flags & wanted) != 0);
 }
 
 /* True if the packet has an outer UDP checksum flag set (IPv4 or IPv6). */
 static inline bool
 needs_outer_udp_csum(struct mbuf *m)
 {
 	const uint32_t wanted = CSUM_IP_UDP | CSUM_IP6_UDP;
 
 	M_ASSERTPKTHDR(m);
 
 	return ((m->m_pkthdr.csum_flags & wanted) != 0);
 }
 #endif
 
 /* True if the mbuf is flagged M_VLANTAG. */
 static inline bool
 needs_vlan_insertion(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	return ((m->m_flags & M_VLANTAG) != 0);
 }
 
 /*
  * Move the position (*pm, *poffset) forward by 'len' bytes within an mbuf
  * chain.  On return *pm and *poffset describe the new position, and the
  * return value is a pointer to the data at that position.  'len' must be
  * positive and the chain must be long enough.
  */
 static void *
 m_advance(struct mbuf **pm, int *poffset, int len)
 {
 	struct mbuf *m = *pm;
 	int offset = *poffset;
 	uintptr_t p = 0;
 
 	MPASS(len > 0);
 
 	/* Walk forward until the target lies inside the current mbuf. */
 	while (offset + len >= m->m_len) {
 		len -= m->m_len - offset;
 		m = m->m_next;
 		offset = 0;
 		MPASS(m != NULL);
 	}
 	offset += len;
 	p = mtod(m, uintptr_t) + offset;
 
 	*poffset = offset;
 	*pm = m;
 	return ((void *)p);
 }
 
 /*
  * Count the gather list segments needed for an M_EXTPG mbuf, ignoring its
  * first 'skip' bytes.  The header, each page, and the trailer are looked
  * at in turn.
  *
  * *nextaddr is the physical address right after the previous region.  A
  * region that begins exactly at that address does not add to the count.
  * *nextaddr is updated as regions are processed, so that the caller can
  * carry it over to the next mbuf.
  */
 static inline int
 count_mbuf_ext_pgs(struct mbuf *m, int skip, vm_paddr_t *nextaddr)
 {
 	vm_paddr_t paddr;
 	int i, len, off, pglen, pgoff, seglen, segoff;
 	int nsegs = 0;
 
 	M_ASSERTEXTPG(m);
 	off = mtod(m, vm_offset_t);
 	len = m->m_len;
 	off += skip;
 	len -= skip;
 
 	if (m->m_epg_hdrlen != 0) {
 		if (off >= m->m_epg_hdrlen) {
 			/* The header is skipped in its entirety. */
 			off -= m->m_epg_hdrlen;
 		} else {
 			/* Part of the header, starting at 'off', is included. */
 			seglen = m->m_epg_hdrlen - off;
 			segoff = off;
 			seglen = min(seglen, len);
 			off = 0;
 			len -= seglen;
 			paddr = pmap_kextract(
 			    (vm_offset_t)&m->m_epg_hdr[segoff]);
 			if (*nextaddr != paddr)
 				nsegs++;
 			*nextaddr = paddr + seglen;
 		}
 	}
 	/* Only the first page starts at an offset (m_epg_1st_off). */
 	pgoff = m->m_epg_1st_off;
 	for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
 		pglen = m_epg_pagelen(m, i, pgoff);
 		if (off >= pglen) {
 			/* This page is skipped in its entirety. */
 			off -= pglen;
 			pgoff = 0;
 			continue;
 		}
 		seglen = pglen - off;
 		segoff = pgoff + off;
 		off = 0;
 		seglen = min(seglen, len);
 		len -= seglen;
 		paddr = m->m_epg_pa[i] + segoff;
 		if (*nextaddr != paddr)
 			nsegs++;
 		*nextaddr = paddr + seglen;
 		pgoff = 0;
 	};
 	/* Whatever is left after the pages comes from the trailer. */
 	if (len != 0) {
 		seglen = min(len, m->m_epg_trllen - off);
 		len -= seglen;
 		paddr = pmap_kextract((vm_offset_t)&m->m_epg_trail[off]);
 		if (*nextaddr != paddr)
 			nsegs++;
 		*nextaddr = paddr + seglen;
 	}
 
 	return (nsegs);
 }
 
 
 /*
  * Count the gather list segments needed for the mbuf chain, ignoring its
  * first 'skip' bytes.  MC_NOMAP is set in *cflags if the chain has an
  * M_EXTPG mbuf that contributes to the count.
  *
  * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
  * must have at least one mbuf that's not empty.  It is possible for this
  * routine to return 0 if skip accounts for all the contents of the mbuf chain.
  */
 static inline int
 count_mbuf_nsegs(struct mbuf *m, int skip, uint8_t *cflags)
 {
 	vm_paddr_t nextaddr, paddr;
 	vm_offset_t va;
 	int len, nsegs;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.len > 0);
 	MPASS(m->m_pkthdr.len >= skip);
 
 	nsegs = 0;
 	nextaddr = 0;	/* physical address right after the previous region */
 	for (; m; m = m->m_next) {
 		len = m->m_len;
 		if (__predict_false(len == 0))
 			continue;
 		if (skip >= len) {
 			/* This mbuf is skipped in its entirety. */
 			skip -= len;
 			continue;
 		}
 		if ((m->m_flags & M_EXTPG) != 0) {
 			*cflags |= MC_NOMAP;
 			nsegs += count_mbuf_ext_pgs(m, skip, &nextaddr);
 			skip = 0;
 			continue;
 		}
 		va = mtod(m, vm_offset_t) + skip;
 		len -= skip;
 		skip = 0;
 		paddr = pmap_kextract(va);
 		nsegs += sglist_count((void *)(uintptr_t)va, len);
 		/*
 		 * This mbuf's data starts right where the previous region
 		 * ended, so its first segment is not counted separately.
 		 */
 		if (paddr == nextaddr)
 			nsegs--;
 		nextaddr = pmap_kextract(va + len - 1) + 1;
 	}
 
 	return (nsegs);
 }
 
 /*
  * The maximum number of gather list segments that fit in the work request
  * that will carry this packet.  The limit depends on whether a VM work
  * request is used (vm_wr) and on whether the packet needs TSO or VXLAN TSO.
  */
 static int
 max_nsegs_allowed(struct mbuf *m, bool vm_wr)
 {
 	const bool tso = needs_tso(m);
 
 	if (vm_wr)
 		return (tso ? TX_SGL_SEGS_VM_TSO : TX_SGL_SEGS_VM);
 
 	if (!tso)
 		return (TX_SGL_SEGS);
 
 	return (needs_vxlan_tso(m) ? TX_SGL_SEGS_VXLAN_TSO : TX_SGL_SEGS_TSO);
 }
 
 /* State for ratecheck(), which limits parse_pkt's error logging. */
 static struct timeval txerr_ratecheck = {0};
 /* Interval given to ratecheck() along with the state above: 3 seconds. */
 static const struct timeval txerr_interval = {3, 0};
 
 /*
  * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
  * a) caller can assume it's been freed if this function returns with an error.
  * b) it may get defragged up if the gather list is too long for the hardware.
  *
  * On success (0) the packet header of *mp holds the number of gather list
  * segments, the driver's flags and the len16 of the work request.  If hw
  * checksum, TSO or (with RATELIMIT) ethofld is needed, then the header
  * lengths (l2hlen, l3hlen, l4hlen and their inner_* counterparts for VXLAN)
  * are filled in as well.
  *
  * On failure *mp is set to NULL and one of these is returned: EINVAL (frame
  * shorter than an Ethernet header, or unknown ethertype), EFBIG (too many
  * segments even after a defrag, or the pullup failed), ENOMEM (the defrag
  * failed), or the error from t6_ktls_parse_pkt.
  *
  * vm_wr indicates that a VM work request will be used for the packet.
  */
 int
 parse_pkt(struct mbuf **mp, bool vm_wr)
 {
 	struct mbuf *m0 = *mp, *m;
 	int rc, nsegs, defragged = 0, offset;
 	struct ether_header *eh;
 	void *l3hdr;
 #if defined(INET) || defined(INET6)
 	struct tcphdr *tcp;
 #endif
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	struct m_snd_tag *mst;
 #endif
 	uint16_t eh_type;
 	uint8_t cflags;
 
 	cflags = 0;
 	M_ASSERTPKTHDR(m0);
 	if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
 		rc = EINVAL;
 fail:
 		/* Common error exit: the mbuf is freed, rc has the error. */
 		m_freem(m0);
 		*mp = NULL;
 		return (rc);
 	}
 restart:
 	/*
 	 * First count the number of gather list segments in the payload.
 	 * Defrag the mbuf if nsegs exceeds the hardware limit.
 	 */
 	M_ASSERTPKTHDR(m0);
 	MPASS(m0->m_pkthdr.len > 0);
 	nsegs = count_mbuf_nsegs(m0, 0, &cflags);
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG)
 		mst = m0->m_pkthdr.snd_tag;
 	else
 		mst = NULL;
 #endif
 #ifdef KERN_TLS
 	/* Packets with a TLS send tag are parsed by t6_ktls_parse_pkt. */
-	if (mst != NULL && mst->type == IF_SND_TAG_TYPE_TLS) {
+	if (mst != NULL && mst->sw->type == IF_SND_TAG_TYPE_TLS) {
 		int len16;
 
 		cflags |= MC_TLS;
 		set_mbuf_cflags(m0, cflags);
 		rc = t6_ktls_parse_pkt(m0, &nsegs, &len16);
 		if (rc != 0)
 			goto fail;
 		set_mbuf_nsegs(m0, nsegs);
 		set_mbuf_len16(m0, len16);
 		return (0);
 	}
 #endif
 	if (nsegs > max_nsegs_allowed(m0, vm_wr)) {
 		/* Too many segments.  Defrag is attempted only once. */
 		if (defragged++ > 0) {
 			rc = EFBIG;
 			goto fail;
 		}
 		counter_u64_add(defrags, 1);
 		if ((m = m_defrag(m0, M_NOWAIT)) == NULL) {
 			rc = ENOMEM;
 			goto fail;
 		}
 		*mp = m0 = m;	/* update caller's copy after defrag */
 		goto restart;
 	}
 
 	/*
 	 * A small packet (no more than MHLEN bytes) that is spread over more
 	 * than 2 segments is pulled up, unless it has unmapped (M_EXTPG)
 	 * mbufs.
 	 */
 	if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN &&
 	    !(cflags & MC_NOMAP))) {
 		counter_u64_add(pullups, 1);
 		m0 = m_pullup(m0, m0->m_pkthdr.len);
 		if (m0 == NULL) {
 			/* Should have left well enough alone. */
 			rc = EFBIG;
 			goto fail;
 		}
 		*mp = m0;	/* update caller's copy after pullup */
 		goto restart;
 	}
 	set_mbuf_nsegs(m0, nsegs);
 	set_mbuf_cflags(m0, cflags);
 	calculate_mbuf_len16(m0, vm_wr);
 
 #ifdef RATELIMIT
 	/*
 	 * Ethofld is limited to TCP and UDP for now, and only when L4 hw
 	 * checksumming is enabled.  needs_outer_l4_csum happens to check for
 	 * all the right things.
 	 */
 	if (__predict_false(needs_eo(mst) && !needs_outer_l4_csum(m0))) {
 		/* Drop the send tag; the packet is not sent via ethofld. */
 		m_snd_tag_rele(m0->m_pkthdr.snd_tag);
 		m0->m_pkthdr.snd_tag = NULL;
 		m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
 		mst = NULL;
 	}
 #endif
 
 	/* The headers are examined only if some offload needs them. */
 	if (!needs_hwcsum(m0)
 #ifdef RATELIMIT
    		 && !needs_eo(mst)
 #endif
 	)
 		return (0);
 
 	/* Outer L2 header. */
 	m = m0;
 	eh = mtod(m, struct ether_header *);
 	eh_type = ntohs(eh->ether_type);
 	if (eh_type == ETHERTYPE_VLAN) {
 		struct ether_vlan_header *evh = (void *)eh;
 
 		eh_type = ntohs(evh->evl_proto);
 		m0->m_pkthdr.l2hlen = sizeof(*evh);
 	} else
 		m0->m_pkthdr.l2hlen = sizeof(*eh);
 
 	/* Outer L3 header. */
 	offset = 0;
 	l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
 
 	switch (eh_type) {
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 		m0->m_pkthdr.l3hlen = sizeof(struct ip6_hdr);
 		break;
 #endif
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		struct ip *ip = l3hdr;
 
 		if (needs_vxlan_csum(m0)) {
 			/* Driver will do the outer IP hdr checksum. */
 			ip->ip_sum = 0;
 			if (needs_vxlan_tso(m0)) {
 				/*
 				 * For TSO the checksum is computed with
 				 * ip_len taken as 0 and stored inverted.
 				 */
 				const uint16_t ipl = ip->ip_len;
 
 				ip->ip_len = 0;
 				ip->ip_sum = ~in_cksum_hdr(ip);
 				ip->ip_len = ipl;
 			} else
 				ip->ip_sum = in_cksum_hdr(ip);
 		}
 		m0->m_pkthdr.l3hlen = ip->ip_hl << 2;
 		break;
 	}
 #endif
 	default:
 		if (ratecheck(&txerr_ratecheck, &txerr_interval)) {
 			log(LOG_ERR, "%s: ethertype 0x%04x unknown.  "
 			    "if_cxgbe must be compiled with the same "
 			    "INET/INET6 options as the kernel.\n", __func__,
 			    eh_type);
 		}
 		rc = EINVAL;
 		goto fail;
 	}
 
 	if (needs_vxlan_csum(m0)) {
 		m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
 		m0->m_pkthdr.l5hlen = sizeof(struct vxlan_header);
 
 		/* Inner headers. */
 		eh = m_advance(&m, &offset, m0->m_pkthdr.l3hlen +
 		    sizeof(struct udphdr) + sizeof(struct vxlan_header));
 		eh_type = ntohs(eh->ether_type);
 		if (eh_type == ETHERTYPE_VLAN) {
 			struct ether_vlan_header *evh = (void *)eh;
 
 			eh_type = ntohs(evh->evl_proto);
 			m0->m_pkthdr.inner_l2hlen = sizeof(*evh);
 		} else
 			m0->m_pkthdr.inner_l2hlen = sizeof(*eh);
 		l3hdr = m_advance(&m, &offset, m0->m_pkthdr.inner_l2hlen);
 
 		switch (eh_type) {
 #ifdef INET6
 		case ETHERTYPE_IPV6:
 			m0->m_pkthdr.inner_l3hlen = sizeof(struct ip6_hdr);
 			break;
 #endif
 #ifdef INET
 		case ETHERTYPE_IP:
 		{
 			struct ip *ip = l3hdr;
 
 			m0->m_pkthdr.inner_l3hlen = ip->ip_hl << 2;
 			break;
 		}
 #endif
 		default:
 			if (ratecheck(&txerr_ratecheck, &txerr_interval)) {
 				log(LOG_ERR, "%s: VXLAN hw offload requested "
 				    "with unknown ethertype 0x%04x.  if_cxgbe "
 				    "must be compiled with the same INET/INET6 "
 				    "options as the kernel.\n", __func__,
 				    eh_type);
 			}
 			rc = EINVAL;
 			goto fail;
 		}
 #if defined(INET) || defined(INET6)
 		if (needs_inner_tcp_csum(m0)) {
 			tcp = m_advance(&m, &offset, m0->m_pkthdr.inner_l3hlen);
 			m0->m_pkthdr.inner_l4hlen = tcp->th_off * 4;
 		}
 #endif
 		MPASS((m0->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 		/* Only the inner and VXLAN flags are kept from here on. */
 		m0->m_pkthdr.csum_flags &= CSUM_INNER_IP6_UDP |
 		    CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO | CSUM_INNER_IP |
 		    CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO |
 		    CSUM_ENCAP_VXLAN;
 	}
 
 #if defined(INET) || defined(INET6)
 	if (needs_outer_tcp_csum(m0)) {
 		tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
 		m0->m_pkthdr.l4hlen = tcp->th_off * 4;
 #ifdef RATELIMIT
 		/*
 		 * 0x0101080a right after the TCP header is what two NOPs
 		 * followed by a timestamp option look like.
 		 */
 		if (tsclk >= 0 && *(uint32_t *)(tcp + 1) == ntohl(0x0101080a)) {
 			set_mbuf_eo_tsclk_tsoff(m0,
 			    V_FW_ETH_TX_EO_WR_TSCLK(tsclk) |
 			    V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1));
 		} else
 			set_mbuf_eo_tsclk_tsoff(m0, 0);
 	} else if (needs_outer_udp_csum(m0)) {
 		m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
 #endif
 	}
 #ifdef RATELIMIT
 	if (needs_eo(mst)) {
 		u_int immhdrs;
 
 		/* EO WRs have the headers in the WR and not the GL. */
 		immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen +
 		    m0->m_pkthdr.l4hlen;
 		cflags = 0;
 		nsegs = count_mbuf_nsegs(m0, immhdrs, &cflags);
 		MPASS(cflags == mbuf_cflags(m0));
 		set_mbuf_eo_nsegs(m0, nsegs);
 		set_mbuf_eo_len16(m0,
 		    txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0)));
 	}
 #endif
 #endif
 	MPASS(m0 == *mp);
 	return (0);
 }
 
 /*
  * Get room for a work request of len16 16-byte units on wrq.
  *
  * Returns a pointer to the place where the caller should build the WR, or
  * NULL if a wrqe had to be allocated and the allocation failed.  The cookie
  * describes what was handed out and must be passed to commit_wrq_wr along
  * with the returned pointer:
  * - cookie->pidx == -1: the WR is in a wrqe that was allocated here, because
  *   the wrq has WRs waiting on its list or the ring does not have enough
  *   descriptors available.
  * - otherwise: descriptors were reserved in the ring starting at
  *   cookie->pidx and the cookie is on the wrq's list of incomplete WRs.  The
  *   returned pointer is the descriptor itself, or the wrq's scratch area
  *   (wrq->ss) if the WR would run past the end of the ring.
  */
 void *
 start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie)
 {
 	struct sge_eq *eq = &wrq->eq;
 	struct adapter *sc = wrq->adapter;
 	int ndesc, available;
 	struct wrqe *wr;
 	void *w;
 
 	MPASS(len16 > 0);
 	ndesc = tx_len16_to_desc(len16);
 	MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC);
 
 	EQ_LOCK(eq);
 
 	/* Try to get the WRs that are already waiting out of the way. */
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(sc, wrq);
 
 	if (!STAILQ_EMPTY(&wrq->wr_list)) {
 slowpath:
 		/* The WR is built in a wrqe and queued at commit time. */
 		EQ_UNLOCK(eq);
 		wr = alloc_wrqe(len16 * 16, wrq);
 		if (__predict_false(wr == NULL))
 			return (NULL);
 		cookie->pidx = -1;
 		cookie->ndesc = ndesc;
 		return (&wr->wr);
 	}
 
 	eq->cidx = read_hw_cidx(eq);
 	if (eq->pidx == eq->cidx)
 		available = eq->sidx - 1;
 	else
 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 	if (available < ndesc)
 		goto slowpath;
 
 	/* Reserve the descriptors and note the reservation in the cookie. */
 	cookie->pidx = eq->pidx;
 	cookie->ndesc = ndesc;
 	TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link);
 
 	w = &eq->desc[eq->pidx];
 	IDXINCR(eq->pidx, ndesc, eq->sidx);
 	if (__predict_false(cookie->pidx + ndesc > eq->sidx)) {
 		/*
 		 * The WR would run past the end of the ring.  Hand out the
 		 * scratch area instead; commit_wrq_wr copies it to the ring.
 		 */
 		w = &wrq->ss[0];
 		wrq->ss_pidx = cookie->pidx;
 		wrq->ss_len = len16 * 16;
 	}
 
 	EQ_UNLOCK(eq);
 
 	return (w);
 }
 
 /*
  * Commit a work request that was built in the space obtained from
  * start_wrq_wr.  'w' and 'cookie' are what start_wrq_wr returned and filled
  * in.
  *
  * A WR that was built in a wrqe (cookie->pidx == -1) is handed to
  * t4_wrq_tx.  Otherwise the WR is in the ring already (or is copied there
  * from the scratch area) and the cookie is taken off the list of incomplete
  * WRs.  The doorbell is rung only when the WR is the oldest incomplete one;
  * its descriptors are added to a neighbouring cookie in the other cases.
  */
 void
 commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie)
 {
 	struct sge_eq *eq = &wrq->eq;
 	struct adapter *sc = wrq->adapter;
 	int ndesc, pidx;
 	struct wrq_cookie *prev, *next;
 
 	if (cookie->pidx == -1) {
 		/* Not in the ring; w points into a struct wrqe. */
 		struct wrqe *wr = __containerof(w, struct wrqe, wr);
 
 		t4_wrq_tx(sc, wr);
 		return;
 	}
 
 	if (__predict_false(w == &wrq->ss[0])) {
 		/* Copy from the scratch area: end of the ring, then the start. */
 		int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE;
 
 		MPASS(wrq->ss_len > n);	/* WR had better wrap around. */
 		bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n);
 		bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n);
 		wrq->tx_wrs_ss++;
 	} else
 		wrq->tx_wrs_direct++;
 
 	EQ_LOCK(eq);
 	ndesc = cookie->ndesc;	/* Can be more than SGE_MAX_WR_NDESC here. */
 	pidx = cookie->pidx;
 	MPASS(pidx >= 0 && pidx < eq->sidx);
 	prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link);
 	next = TAILQ_NEXT(cookie, link);
 	if (prev == NULL) {
 		/* This is the oldest incomplete WR. */
 		MPASS(pidx == eq->dbidx);
 		if (next == NULL || ndesc >= 16) {
 			int available;
 			struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
 
 			/*
 			 * Note that the WR via which we'll request tx updates
 			 * is at pidx and not eq->pidx, which has moved on
 			 * already.
 			 */
 			dst = (void *)&eq->desc[pidx];
 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 			if (available < eq->sidx / 4 &&
 			    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 				/*
 				 * XXX: This is not 100% reliable with some
 				 * types of WRs.  But this is a very unusual
 				 * situation for an ofld/ctrl queue anyway.
 				 */
 				dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 				    F_FW_WR_EQUEQ);
 			}
 
 			ring_eq_db(wrq->adapter, eq, ndesc);
 		} else {
 			/* Let the next WR's commit cover these descriptors. */
 			MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc);
 			next->pidx = pidx;
 			next->ndesc += ndesc;
 		}
 	} else {
 		/* An older WR is still incomplete; add to its descriptors. */
 		MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc);
 		prev->ndesc += ndesc;
 	}
 	TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link);
 
 	/* With no incomplete WRs left the waiting WRs can be sent too. */
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(sc, wrq);
 
 #ifdef INVARIANTS
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
 		/* Doorbell must have caught up to the pidx. */
 		MPASS(wrq->eq.pidx == wrq->eq.dbidx);
 	}
 #endif
 	EQ_UNLOCK(eq);
 }
 
 /*
  * Returns 1 if more than an eighth of the egress queue's descriptors are
  * available for tx, and 0 otherwise.  The egress queue is the ring's
  * cookie.
  */
 static u_int
 can_resume_eth_tx(struct mp_ring *r)
 {
 	struct sge_eq *eq = r->cookie;
 
 	if (total_available_tx_desc(eq) > eq->sidx / 8)
 		return (1);
 
 	return (0);
 }
 
 static inline bool
 cannot_use_txpkts(struct mbuf *m)
 {
 	/* maybe put a GL limit too, to avoid silliness? */
 
 	return (needs_tso(m) || (mbuf_cflags(m) & (MC_RAW_WR | MC_TLS)) != 0);
 }
 
 /*
  * Returns 0 if the egress queue is enabled and is not being flushed, and 1
  * in every other case.
  */
 static inline int
 discard_tx(struct sge_eq *eq)
 {
 
 	if ((eq->flags & (EQ_ENABLED | EQ_QFLUSH)) == EQ_ENABLED)
 		return (0);
 
 	return (1);
 }
 
 /*
  * Returns 1 if the opcode of the work request at 'p' is one of those
  * listed below, and 0 for any other opcode.
  */
 static inline int
 wr_can_update_eq(void *p)
 {
 	struct fw_eth_tx_pkts_wr *wr = p;
 	int rc = 0;
 
 	switch (G_FW_WR_OP(be32toh(wr->op_pkd))) {
 	case FW_ULPTX_WR:
 	case FW_ETH_TX_PKT_WR:
 	case FW_ETH_TX_PKTS_WR:
 	case FW_ETH_TX_PKTS2_WR:
 	case FW_ETH_TX_PKT_VM_WR:
 	case FW_ETH_TX_PKTS_VM_WR:
 		rc = 1;
 		break;
 	default:
 		break;
 	}
 
 	return (rc);
 }
 
 /*
  * Decide whether the work request 'wr' should carry the EQUEQ and EQUIQ
  * flags, given that 'avail' descriptors are available in the txq.
  *
  * Both flags are set if packets are waiting in the txq's txpkts or fewer
  * than half the descriptors are available, and equiq could be changed from
  * 0 to 1.  Otherwise only EQUEQ is set, and only if pidx is 32 or more
  * descriptors past equeqidx.  equeqidx is updated whenever a flag is set.
  */
 static inline void
 set_txupdate_flags(struct sge_txq *txq, u_int avail,
     struct fw_eth_tx_pkt_wr *wr)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct txpkts *txp = &txq->txp;
 	bool both = false;
 
 	if (txp->npkt > 0 || avail < eq->sidx / 2)
 		both = atomic_cmpset_int(&eq->equiq, 0, 1) != 0;
 
 	if (both)
 		wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | F_FW_WR_EQUIQ);
 	else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32)
 		wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
 	else
 		return;		/* no flag set, equeqidx stays as it is */
 
 	eq->equeqidx = eq->pidx;
 }
 
 /*
  * tsc_freq is only declared on x86.  record_eth_tx_time uses it there to
  * scale t4_tx_coalesce_gap into cycle counts; on other platforms the gap is
  * treated as 0.
  */
 #if defined(__i386__) || defined(__amd64__)
 extern uint64_t tsc_freq;
 #endif
 
 /*
  * Stamps the tx queue with the current cycle count and returns true if the
  * previous stamp was less than the coalescing gap ago.  The gap is derived
  * from tsc_freq and t4_tx_coalesce_gap on x86 and is 0 elsewhere, in which
  * case the result is always false.
  */
 static inline bool
 record_eth_tx_time(struct sge_txq *txq)
 {
 	const uint64_t cycles = get_cyclecount();
 	const uint64_t last_tx = txq->last_tx;
 #if defined(__i386__) || defined(__amd64__)
 	const uint64_t gap = tsc_freq * t4_tx_coalesce_gap / 1000000;
 #else
 	const uint64_t gap = 0;
 #endif
 	bool recent;
 
 	MPASS(cycles >= last_tx);
 	recent = (cycles - last_tx < gap);
 	txq->last_tx = cycles;
 
 	return (recent);
 }
 
 /*
  * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
  * be consumed.  Return the actual number consumed.  0 indicates a stall.
  *
  * The ring's cookie is the tx queue.  Each mbuf is either coalesced into
  * txq->txp (to be sent later as part of a txpkts work request) or written to
  * the descriptor ring as a work request of its own.  The doorbell is rung in
  * batches as descriptors accumulate.
  *
  * On return *coalescing is true if txq->txp still holds packets that have not
  * been written to the descriptor ring, false otherwise.
  */
 static u_int
 eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing)
 {
 	struct sge_txq *txq = r->cookie;
 	struct ifnet *ifp = txq->ifp;
 	struct sge_eq *eq = &txq->eq;
 	struct txpkts *txp = &txq->txp;
 	struct vi_info *vi = ifp->if_softc;
 	struct adapter *sc = vi->adapter;
 	u_int total, remaining;		/* # of packets */
 	u_int n, avail, dbdiff;		/* # of hardware descriptors */
 	int i, rc;
 	struct mbuf *m0;
 	bool snd, recent_tx;
 	void *wr;	/* start of the last WR written to the ring */
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	recent_tx = record_eth_tx_time(txq);
 
 	remaining = IDXDIFF(pidx, cidx, r->size);
 	if (__predict_false(discard_tx(eq))) {
 		/*
 		 * Tx is being discarded: free the packets held in txp and
 		 * everything on the ring, then report all of it as consumed.
 		 */
 		for (i = 0; i < txp->npkt; i++)
 			m_freem(txp->mb[i]);
 		txp->npkt = 0;
 		while (cidx != pidx) {
 			m0 = r->items[cidx];
 			m_freem(m0);
 			if (++cidx == r->size)
 				cidx = 0;
 		}
 		reclaim_tx_descs(txq, eq->sidx);
 		*coalescing = false;
 		return (remaining);	/* emptied */
 	}
 
 	/* How many hardware descriptors do we have readily available. */
 	if (eq->pidx == eq->cidx)
 		avail = eq->sidx - 1;
 	else
 		avail = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 
 	total = 0;
 	if (remaining == 0) {
 		/*
 		 * Nothing new on the ring.  Jump straight to flushing txp;
 		 * the code at send_txpkts asserts that txp is not empty.
 		 */
 		txp->score = 0;
 		txq->txpkts_flush++;
 		goto send_txpkts;
 	}
 
 	dbdiff = 0;
 	MPASS(remaining > 0);
 	while (remaining > 0) {
 		m0 = r->items[cidx];
 		M_ASSERTPKTHDR(m0);
 		MPASS(m0->m_nextpkt == NULL);
 
 		/* Running low on descriptors, try to reclaim some. */
 		if (avail < 2 * SGE_MAX_WR_NDESC)
 			avail += reclaim_tx_descs(txq, 64);
 
 		if (t4_tx_coalesce == 0 && txp->npkt == 0)
 			goto skip_coalescing;
 		/*
 		 * Update the coalescing score: reset for packets that cannot
 		 * use txpkts, bump (saturating at UINT8_MAX) when the previous
 		 * tx was recent, otherwise start over at 1.
 		 */
 		if (cannot_use_txpkts(m0))
 			txp->score = 0;
 		else if (recent_tx) {
 			if (++txp->score == 0)
 				txp->score = UINT8_MAX;
 		} else
 			txp->score = 1;
 		if (txp->npkt > 0 || remaining > 1 ||
 		    txp->score >= t4_tx_coalesce_pkts ||
 		    atomic_load_int(&txq->eq.equiq) != 0) {
 			if (vi->flags & TX_USES_VM_WR)
 				rc = add_to_txpkts_vf(sc, txq, m0, avail, &snd);
 			else
 				rc = add_to_txpkts_pf(sc, txq, m0, avail, &snd);
 		} else {
 			/*
 			 * Coalescing not attempted.  rc is neither 0 nor
 			 * EAGAIN so control falls through to skip_coalescing.
 			 */
 			snd = false;
 			rc = EINVAL;
 		}
 		if (snd) {
 			/* Write out the packets accumulated in txp. */
 			MPASS(txp->npkt > 0);
 			for (i = 0; i < txp->npkt; i++)
 				ETHER_BPF_MTAP(ifp, txp->mb[i]);
 			if (txp->npkt > 1) {
 				MPASS(avail >= tx_len16_to_desc(txp->len16));
 				if (vi->flags & TX_USES_VM_WR)
 					n = write_txpkts_vm_wr(sc, txq);
 				else
 					n = write_txpkts_wr(sc, txq);
 			} else {
 				MPASS(avail >=
 				    tx_len16_to_desc(mbuf_len16(txp->mb[0])));
 				if (vi->flags & TX_USES_VM_WR)
 					n = write_txpkt_vm_wr(sc, txq,
 					    txp->mb[0]);
 				else
 					n = write_txpkt_wr(sc, txq, txp->mb[0],
 					    avail);
 			}
 			MPASS(n <= SGE_MAX_WR_NDESC);
 			avail -= n;
 			dbdiff += n;
 			wr = &eq->desc[eq->pidx];
 			IDXINCR(eq->pidx, n, eq->sidx);
 			txp->npkt = 0;	/* emptied */
 		}
 		if (rc == 0) {
 			/* m0 was coalesced into txq->txpkts. */
 			goto next_mbuf;
 		}
 		if (rc == EAGAIN) {
 			/*
 			 * m0 is suitable for tx coalescing but could not be
 			 * combined with the existing txq->txpkts, which has now
 			 * been transmitted.  Start a new txpkts with m0.
 			 */
 			MPASS(snd);
 			MPASS(txp->npkt == 0);
 			continue;
 		}
 
 		MPASS(rc != 0 && rc != EAGAIN);
 		MPASS(txp->npkt == 0);
 skip_coalescing:
 		/* m0 gets a work request of its own. */
 		n = tx_len16_to_desc(mbuf_len16(m0));
 		if (__predict_false(avail < n)) {
 			avail += reclaim_tx_descs(txq, min(n, 32));
 			if (avail < n)
 				break;	/* out of descriptors */
 		}
 
 		wr = &eq->desc[eq->pidx];
 		if (mbuf_cflags(m0) & MC_RAW_WR) {
 			n = write_raw_wr(txq, wr, m0, avail);
 #ifdef KERN_TLS
 		} else if (mbuf_cflags(m0) & MC_TLS) {
 			ETHER_BPF_MTAP(ifp, m0);
 			n = t6_ktls_write_wr(txq, wr, m0, mbuf_nsegs(m0),
 			    avail);
 #endif
 		} else {
 			ETHER_BPF_MTAP(ifp, m0);
 			if (vi->flags & TX_USES_VM_WR)
 				n = write_txpkt_vm_wr(sc, txq, m0);
 			else
 				n = write_txpkt_wr(sc, txq, m0, avail);
 		}
 		MPASS(n >= 1 && n <= avail);
 		if (!(mbuf_cflags(m0) & MC_TLS))
 			MPASS(n <= SGE_MAX_WR_NDESC);
 
 		avail -= n;
 		dbdiff += n;
 		IDXINCR(eq->pidx, n, eq->sidx);
 
 		/* Ring the doorbell once enough descriptors have piled up. */
 		if (dbdiff >= 512 / EQ_ESIZE) {	/* X_FETCHBURSTMAX_512B */
 			if (wr_can_update_eq(wr))
 				set_txupdate_flags(txq, avail, wr);
 			ring_eq_db(sc, eq, dbdiff);
 			avail += reclaim_tx_descs(txq, 32);
 			dbdiff = 0;
 		}
 next_mbuf:
 		total++;
 		remaining--;
 		if (__predict_false(++cidx == r->size))
 			cidx = 0;
 	}
 	if (dbdiff != 0) {
 		/* Descriptors were written since the last doorbell. */
 		if (wr_can_update_eq(wr))
 			set_txupdate_flags(txq, avail, wr);
 		ring_eq_db(sc, eq, dbdiff);
 		reclaim_tx_descs(txq, 32);
 	} else if (eq->pidx == eq->cidx && txp->npkt > 0 &&
 	    atomic_load_int(&txq->eq.equiq) == 0) {
 		/*
 		 * If nothing was submitted to the chip for tx (it was coalesced
 		 * into txpkts instead) and there is no tx update outstanding
 		 * then we need to send txpkts now.
 		 */
 send_txpkts:
 		MPASS(txp->npkt > 0);
 		for (i = 0; i < txp->npkt; i++)
 			ETHER_BPF_MTAP(ifp, txp->mb[i]);
 		if (txp->npkt > 1) {
 			MPASS(avail >= tx_len16_to_desc(txp->len16));
 			if (vi->flags & TX_USES_VM_WR)
 				n = write_txpkts_vm_wr(sc, txq);
 			else
 				n = write_txpkts_wr(sc, txq);
 		} else {
 			MPASS(avail >=
 			    tx_len16_to_desc(mbuf_len16(txp->mb[0])));
 			if (vi->flags & TX_USES_VM_WR)
 				n = write_txpkt_vm_wr(sc, txq, txp->mb[0]);
 			else
 				n = write_txpkt_wr(sc, txq, txp->mb[0], avail);
 		}
 		MPASS(n <= SGE_MAX_WR_NDESC);
 		wr = &eq->desc[eq->pidx];
 		IDXINCR(eq->pidx, n, eq->sidx);
 		txp->npkt = 0;	/* emptied */
 
 		MPASS(wr_can_update_eq(wr));
 		set_txupdate_flags(txq, avail - n, wr);
 		ring_eq_db(sc, eq, n);
 		reclaim_tx_descs(txq, 32);
 	}
 	/* Tell the caller whether txp still holds unsent packets. */
 	*coalescing = txp->npkt > 0;
 
 	return (total);
 }
 
 /*
  * Fills in the software state of an ingress queue prior to allocation.
  *
  * tmr_idx selects the interrupt holdoff timer.  A non-negative pktc_idx also
  * enables the packet-count threshold; a negative one leaves counting disabled
  * and parks intr_pktc_idx at the last counter.  qsize is rounded up to a
  * multiple of 16 and sidx is the queue size less the room taken by the status
  * page.  intr_idx and cong are stored as given.
  */
 static inline void
 init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
     int qsize, int intr_idx, int cong)
 {
 
 	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
 	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
 	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
 	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
 	KASSERT(intr_idx >= -1 && intr_idx < sc->intr_count,
 	    ("%s: bad intr_idx %d", __func__, intr_idx));
 
 	/* Identity and initial state. */
 	iq->adapter = sc;
 	iq->flags = 0;
 	iq->state = IQS_DISABLED;
 	iq->intr_idx = intr_idx;
 	iq->cong = cong;
 
 	/* Interrupt holdoff parameters. */
 	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
 	if (pktc_idx >= 0) {
 		iq->intr_params |= F_QINTR_CNT_EN;
 		iq->intr_pktc_idx = pktc_idx;
 	} else {
 		iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
 	}
 
 	/* Queue geometry.  See FW_IQ_CMD/iqsize for the rounding. */
 	iq->qsize = roundup2(qsize, 16);
 	iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE;
 }
 
 /*
  * Fills in the software state of a freelist prior to allocation and
  * initializes its lock (named after 'name').
  *
  * Buffer packing is turned on only if the adapter allows it (BUF_PACKING_OK)
  * and the buffer_packing tunable asks for it.  The refill zone, low-water
  * mark and buffer boundary all depend on whether packing ended up enabled.
  */
 static inline void
 init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
 {
 	struct sge_params *sp = &sc->params.sge;
 
 	fl->qsize = qsize;
 	fl->sidx = qsize - sp->spg_len / EQ_ESIZE;
 	strlcpy(fl->lockname, name, sizeof(fl->lockname));
 	mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
 
 	if (sc->flags & BUF_PACKING_OK) {
 		if (is_t4(sc)) {
 			/* T4: disabled unless 1 */
 			if (buffer_packing == 1)
 				fl->flags |= FL_BUF_PACKING;
 		} else {
 			/* T5+: enabled unless 0 */
 			if (buffer_packing)
 				fl->flags |= FL_BUF_PACKING;
 		}
 	}
 
 	fl->zidx = find_refill_source(sc, maxp, fl->flags & FL_BUF_PACKING);
 	fl->safe_zidx = sc->sge.safe_zidx;
 
 	if (fl->flags & FL_BUF_PACKING) {
 		fl->lowat = roundup2(sp->fl_starve_threshold2, 8);
 		fl->buf_boundary = sp->pack_boundary;
 	} else {
 		fl->lowat = roundup2(sp->fl_starve_threshold, 8);
 		fl->buf_boundary = 16;
 	}
 
 	/* With padding enabled the boundary is at least the pad boundary. */
 	if (fl_pad && fl->buf_boundary < sp->pad_boundary)
 		fl->buf_boundary = sp->pad_boundary;
 }
 
 /*
  * Fills in the software state of an egress queue prior to allocation and
  * initializes its lock (named after 'name').  sidx is the requested queue
  * size less the room taken by the status page.
  */
 static inline void
 init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize,
     uint8_t tx_chan, struct sge_iq *iq, char *name)
 {
 	KASSERT(eqtype >= EQ_CTRL && eqtype <= EQ_OFLD,
 	    ("%s: bad qtype %d", __func__, eqtype));
 
 	/* Lock first, under the caller-supplied name. */
 	strlcpy(eq->lockname, name, sizeof(eq->lockname));
 	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
 
 	eq->type = eqtype;
 	eq->iq = iq;
 	eq->tx_chan = tx_chan;
 	eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
 }
 
 /*
  * Allocates 'len' bytes of zeroed, coherent DMA memory for a descriptor ring
  * as a single segment aligned to 512 bytes.
  *
  * On success returns 0 with *tag, *map, *va and *pa filled in.  On failure an
  * error is logged, free_ring is called on whatever *tag, *map, *pa and *va
  * hold at that point, and the error code is returned.
  */
 int
 alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
     bus_dmamap_t *map, bus_addr_t *pa, void **va)
 {
 	int rc;
 
 	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
 	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
 	if (rc != 0) {
 		CH_ERR(sc, "cannot allocate DMA tag: %d\n", rc);
 		goto fail;
 	}
 
 	rc = bus_dmamem_alloc(*tag, va,
 	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
 	if (rc != 0) {
 		CH_ERR(sc, "cannot allocate DMA memory: %d\n", rc);
 		goto fail;
 	}
 
 	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
 	if (rc != 0) {
 		CH_ERR(sc, "cannot load DMA map: %d\n", rc);
 		goto fail;
 	}
 
 	return (rc);
 fail:
 	free_ring(sc, *tag, *map, *pa, *va);
 
 	return (rc);
 }
 
 /*
  * Releases the DMA resources of a descriptor ring.  Each step is skipped when
  * its handle is zero/NULL: the map is unloaded if pa is set, the memory is
  * freed if va is set, and the tag is destroyed if tag is set.  Always
  * returns 0.
  */
 int
 free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
     bus_addr_t pa, void *va)
 {
 	if (pa) {
 		bus_dmamap_unload(tag, map);
 	}
 	if (va) {
 		bus_dmamem_free(tag, va, map);
 	}
 	if (tag) {
 		bus_dma_tag_destroy(tag);
 	}
 
 	return (0);
 }
 
 /*
  * Allocates the software resources (mainly memory and sysctl nodes) for an
  * ingress queue and an optional freelist.
  *
  * Sets IQ_SW_ALLOCATED and returns 0 on success.
  */
 static int
 alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl,
     struct sysctl_ctx_list *ctx, struct sysctl_oid *oid)
 {
 	int rc;
 	size_t len;
 	struct adapter *sc = vi->adapter;
 
 	MPASS(!(iq->flags & IQ_SW_ALLOCATED));
 
 	len = iq->qsize * IQ_ESIZE;
 	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
 	    (void **)&iq->desc);
 	if (rc != 0)
 		return (rc);
 
 	if (fl) {
 		len = fl->qsize * EQ_ESIZE;
 		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
 		    &fl->ba, (void **)&fl->desc);
 		if (rc) {
 			free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba,
 			    iq->desc);
 			return (rc);
 		}
 
 		/* Allocate space for one software descriptor per buffer. */
 		fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc),
 		    M_CXGBE, M_ZERO | M_WAITOK);
 
 		add_fl_sysctls(sc, ctx, oid, fl);
 		iq->flags |= IQ_HAS_FL;
 	}
 	add_iq_sysctls(ctx, oid, iq);
 	iq->flags |= IQ_SW_ALLOCATED;
 
 	return (0);
 }
 
 /*
  * Frees all software resources (memory and locks) associated with an ingress
  * queue and an optional freelist.
  */
 static void
 free_iq_fl(struct adapter *sc, struct sge_iq *iq, struct sge_fl *fl)
 {
 	MPASS(iq->flags & IQ_SW_ALLOCATED);
 
 	if (fl) {
 		MPASS(iq->flags & IQ_HAS_FL);
 		free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba, fl->desc);
 		free_fl_buffers(sc, fl);
 		free(fl->sdesc, M_CXGBE);
 		mtx_destroy(&fl->fl_lock);
 		bzero(fl, sizeof(*fl));
 	}
 	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
 	bzero(iq, sizeof(*iq));
 }
 
 /*
  * Allocates a hardware ingress queue and an optional freelist that will be
  * associated with it.
  *
  * The queue(s) are created by sending a FW_IQ_CMD to the firmware over the
  * mailbox.  On success the ids returned by the firmware are recorded, the
  * queues are entered into the adapter's iqmap/eqmap, the freelist (if any) is
  * given an initial refill, and the ingress queue's interrupts are enabled.
  * IQ_HW_ALLOCATED is set on success.
  *
  * Returns errno on failure.  Resources allocated up to that point may still be
  * allocated.  Caller is responsible for cleanup in case this function fails.
  */
 static int
 alloc_iq_fl_hwq(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl)
 {
 	int rc, i, cntxt_id;
 	struct fw_iq_cmd c;
 	struct adapter *sc = vi->adapter;
 	__be32 v = 0;
 
 	MPASS (!(iq->flags & IQ_HW_ALLOCATED));
 
 	/* Build the firmware command from scratch. */
 	bzero(&c, sizeof(c));
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
 	    V_FW_IQ_CMD_VFN(0));
 
 	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
 	    FW_LEN16(c));
 
 	/* Special handling for firmware event queue */
 	if (iq == &sc->sge.fwq)
 		v |= F_FW_IQ_CMD_IQASYNCH;
 
 	if (iq->intr_idx < 0) {
 		/* Forwarded interrupts, all headed to fwq */
 		v |= F_FW_IQ_CMD_IQANDST;
 		v |= V_FW_IQ_CMD_IQANDSTINDEX(sc->sge.fwq.cntxt_id);
 	} else {
 		KASSERT(iq->intr_idx < sc->intr_count,
 		    ("%s: invalid direct intr_idx %d", __func__, iq->intr_idx));
 		v |= V_FW_IQ_CMD_IQANDSTINDEX(iq->intr_idx);
 	}
 
 	/* Start with a clean descriptor ring. */
 	bzero(iq->desc, iq->qsize * IQ_ESIZE);
 	c.type_to_iqandstindex = htobe32(v |
 	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
 	    V_FW_IQ_CMD_VIID(vi->viid) |
 	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
 	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(vi->pi->tx_chan) |
 	    F_FW_IQ_CMD_IQGTSMODE |
 	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
 	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
 	c.iqsize = htobe16(iq->qsize);
 	c.iqaddr = htobe64(iq->ba);
 	if (iq->cong >= 0)
 		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
 
 	if (fl) {
 		/* Freelist portion of the command. */
 		bzero(fl->desc, fl->sidx * EQ_ESIZE + sc->params.sge.spg_len);
 		c.iqns_to_fl0congen |=
 		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
 			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
 			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
 			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
 			    0));
 		if (iq->cong >= 0) {
 			c.iqns_to_fl0congen |=
 				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(iq->cong) |
 				    F_FW_IQ_CMD_FL0CONGCIF |
 				    F_FW_IQ_CMD_FL0CONGEN);
 		}
 		/* Fetch burst sizes differ between T4/T5 and later chips. */
 		c.fl0dcaen_to_fl0cidxfthresh =
 		    htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 			X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B_T6) |
 			V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ?
 			X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B));
 		c.fl0size = htobe16(fl->qsize);
 		c.fl0addr = htobe64(fl->ba);
 	}
 
 	/* Send the command; the reply is written back into c. */
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		CH_ERR(sc, "failed to create hw ingress queue: %d\n", rc);
 		return (rc);
 	}
 
 	/* Record the ids assigned by the firmware and reset sw state. */
 	iq->cidx = 0;
 	iq->gen = F_RSPD_GEN;
 	iq->cntxt_id = be16toh(c.iqid);
 	iq->abs_id = be16toh(c.physiqid);
 
 	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
 	if (cntxt_id >= sc->sge.iqmap_sz) {
 		panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
 		    cntxt_id, sc->sge.iqmap_sz - 1);
 	}
 	sc->sge.iqmap[cntxt_id] = iq;
 
 	if (fl) {
 		u_int qid;
 #ifdef INVARIANTS
 		/* No buffer should be attached to the freelist yet. */
 		MPASS(!(fl->flags & FL_BUF_RESUME));
 		for (i = 0; i < fl->sidx * 8; i++)
 			MPASS(fl->sdesc[i].cl == NULL);
 #endif
 		fl->cntxt_id = be16toh(c.fl0id);
 		fl->pidx = fl->cidx = fl->hw_cidx = fl->dbidx = 0;
 		fl->rx_offset = 0;
 		fl->flags &= ~(FL_STARVING | FL_DOOMED);
 
 		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
 		if (cntxt_id >= sc->sge.eqmap_sz) {
 			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
 			    __func__, cntxt_id, sc->sge.eqmap_sz - 1);
 		}
 		sc->sge.eqmap[cntxt_id] = (void *)fl;
 
 		/*
 		 * Work out the doorbell for this freelist.  When user
 		 * doorbells are available, locate the queue's doorbell page
 		 * and, if the queue has a segment of its own within that page,
 		 * point at the segment and use qid 0 in the doorbell value.
 		 */
 		qid = fl->cntxt_id;
 		if (isset(&sc->doorbells, DOORBELL_UDB)) {
 			uint32_t s_qpp = sc->params.sge.eq_s_qpp;
 			uint32_t mask = (1 << s_qpp) - 1;
 			volatile uint8_t *udb;
 
 			udb = sc->udbs_base + UDBS_DB_OFFSET;
 			udb += (qid >> s_qpp) << PAGE_SHIFT;
 			qid &= mask;
 			if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
 				udb += qid << UDBS_SEG_SHIFT;
 				qid = 0;
 			}
 			fl->udb = (volatile void *)udb;
 		}
 		fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db;
 
 		FL_LOCK(fl);
 		/* Enough to make sure the SGE doesn't think it's starved */
 		refill_fl(sc, fl, fl->lowat);
 		FL_UNLOCK(fl);
 	}
 
 	/*
 	 * T5 and later PFs: program the congestion manager context for this
 	 * queue when congestion feedback was requested (cong >= 0).  The
 	 * per-channel bits of iq->cong are spread out one per nibble.
 	 */
 	if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) && iq->cong >= 0) {
 		uint32_t param, val;
 
 		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
 		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
 		if (iq->cong == 0)
 			val = 1 << 19;
 		else {
 			val = 2 << 19;
 			for (i = 0; i < 4; i++) {
 				if (iq->cong & (1 << i))
 					val |= 1 << (i << 2);
 			}
 		}
 
 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		if (rc != 0) {
 			/* report error but carry on */
 			CH_ERR(sc, "failed to set congestion manager context "
 			    "for ingress queue %d: %d\n", iq->cntxt_id, rc);
 		}
 	}
 
 	/* Enable IQ interrupts */
 	atomic_store_rel_int(&iq->state, IQS_IDLE);
 	t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) |
 	    V_INGRESSQID(iq->cntxt_id));
 
 	iq->flags |= IQ_HW_ALLOCATED;
 
 	return (0);
 }
 
 /*
  * Asks the firmware to free a hardware ingress queue and, if fl is not NULL,
  * the freelist attached to it.  Clears IQ_HW_ALLOCATED and returns 0 on
  * success; logs and returns the error otherwise, leaving the flag set.
  */
 static int
 free_iq_fl_hwq(struct adapter *sc, struct sge_iq *iq, struct sge_fl *fl)
 {
 	int rc;
 
 	MPASS(iq->flags & IQ_HW_ALLOCATED);
 
 	/* 0xffff stands in for a freelist that is not present. */
 	rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, FW_IQ_TYPE_FL_INT_CAP,
 	    iq->cntxt_id, fl ? fl->cntxt_id : 0xffff, 0xffff);
 	if (rc == 0) {
 		iq->flags &= ~IQ_HW_ALLOCATED;
 		return (0);
 	}
 
 	CH_ERR(sc, "failed to free iq %p: %d\n", iq, rc);
 	return (rc);
 }
 
 /*
  * Publishes read-only sysctls describing an ingress queue (ring address and
  * size, queue ids, consumer index) under the given node.  Does nothing if
  * either ctx or oid is NULL.
  */
 static void
 add_iq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
     struct sge_iq *iq)
 {
 	struct sysctl_oid_list *kids;
 
 	if (ctx != NULL && oid != NULL) {
 		kids = SYSCTL_CHILDREN(oid);
 
 		SYSCTL_ADD_UAUTO(ctx, kids, OID_AUTO, "ba", CTLFLAG_RD,
 		    &iq->ba, "bus address of descriptor ring");
 		SYSCTL_ADD_INT(ctx, kids, OID_AUTO, "dmalen", CTLFLAG_RD,
 		    NULL, iq->qsize * IQ_ESIZE,
 		    "descriptor ring size in bytes");
 		SYSCTL_ADD_U16(ctx, kids, OID_AUTO, "abs_id", CTLFLAG_RD,
 		    &iq->abs_id, 0, "absolute id of the queue");
 		SYSCTL_ADD_U16(ctx, kids, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 		    &iq->cntxt_id, 0, "SGE context id of the queue");
 		SYSCTL_ADD_U16(ctx, kids, OID_AUTO, "cidx", CTLFLAG_RD,
 		    &iq->cidx, 0, "consumer index");
 	}
 }
 
 static void
 add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
     struct sysctl_oid *oid, struct sge_fl *fl)
 {
 	struct sysctl_oid_list *children;
 
 	if (ctx == NULL || oid == NULL)
 		return;
 
 	children = SYSCTL_CHILDREN(oid);
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "freelist");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
 	    &fl->ba, "bus address of descriptor ring");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    fl->sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &fl->cntxt_id, 0, "SGE context id of the freelist");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL,
 	    fl_pad ? 1 : 0, "padding enabled");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL,
 	    fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
 	    0, "consumer index");
 	if (fl->flags & FL_BUF_PACKING) {
 		SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset",
 		    CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
 	}
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
 	    0, "producer index");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated",
 	    CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled",
 	    CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled",
 	    CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
 }
 
 /*
  * Brings up the firmware event queue in two stages: software resources
  * first, then the hardware queue.  Each stage is skipped if its flag says it
  * has already been done, so the function is idempotent.
  *
  * Returns 0 on success or the error from the stage that failed.
  */
 static int
 alloc_fwq(struct adapter *sc)
 {
 	struct vi_info *vi = &sc->port[0]->vi[0];
 	struct sge_iq *fwq = &sc->sge.fwq;
 	int intr_idx, rc;
 
 	if (!(fwq->flags & IQ_SW_ALLOCATED)) {
 		MPASS(!(fwq->flags & IQ_HW_ALLOCATED));
 
 		/* A PF with more than one vector uses vector 1 for the fwq. */
 		intr_idx = 0;
 		if (!(sc->flags & IS_VF) && sc->intr_count > 1)
 			intr_idx = 1;
 
 		init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE, intr_idx, -1);
 		rc = alloc_iq_fl(vi, fwq, NULL, &sc->ctx, sc->fwq_oid);
 		if (rc != 0) {
 			CH_ERR(sc, "failed to allocate fwq: %d\n", rc);
 			return (rc);
 		}
 		MPASS(fwq->flags & IQ_SW_ALLOCATED);
 	}
 
 	if (!(fwq->flags & IQ_HW_ALLOCATED)) {
 		MPASS(fwq->flags & IQ_SW_ALLOCATED);
 
 		rc = alloc_iq_fl_hwq(vi, fwq, NULL);
 		if (rc != 0) {
 			CH_ERR(sc, "failed to create hw fwq: %d\n", rc);
 			return (rc);
 		}
 		MPASS(fwq->flags & IQ_HW_ALLOCATED);
 	}
 
 	return (0);
 }
 
 /*
  * Tears down the firmware event queue: the hardware queue first, then the
  * software resources.  Each stage runs only if its flag is set, so the
  * function is idempotent.
  */
 static void
 free_fwq(struct adapter *sc)
 {
 	struct sge_iq *fwq = &sc->sge.fwq;
 
 	/* Stage 1: the hardware queue. */
 	if (fwq->flags & IQ_HW_ALLOCATED) {
 		MPASS(fwq->flags & IQ_SW_ALLOCATED);
 		free_iq_fl_hwq(sc, fwq, NULL);
 		MPASS(!(fwq->flags & IQ_HW_ALLOCATED));
 	}
 
 	/* Stage 2: the software state. */
 	if (!(fwq->flags & IQ_SW_ALLOCATED))
 		return;
 	MPASS(!(fwq->flags & IQ_HW_ALLOCATED));
 	free_iq_fl(sc, fwq, NULL);
 	MPASS(!(fwq->flags & IQ_SW_ALLOCATED));
 }
 
 /*
  * Brings up control queue 'idx' in two stages: software resources (sysctl
  * node, lock, work request queue) first, then the hardware egress queue.
  * Each stage is skipped if its flag says it has already been done, so the
  * function is idempotent.
  *
  * Returns 0 on success or the error from the stage that failed.  The sysctl
  * node is removed again if the software stage fails.
  */
 static int
 alloc_ctrlq(struct adapter *sc, int idx)
 {
 	struct sge_wrq *ctrlq = &sc->sge.ctrlq[idx];
 	struct sysctl_oid *oid;
 	char name[16];
 	int rc;
 
 	MPASS(idx < sc->params.nports);
 
 	if (!(ctrlq->eq.flags & EQ_SW_ALLOCATED)) {
 		MPASS(!(ctrlq->eq.flags & EQ_HW_ALLOCATED));
 
 		/* The sysctl node is named after the queue's index. */
 		snprintf(name, sizeof(name), "%d", idx);
 		oid = SYSCTL_ADD_NODE(&sc->ctx, SYSCTL_CHILDREN(sc->ctrlq_oid),
 		    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
 		    "ctrl queue");
 
 		/* The same buffer is then reused for the lock's name. */
 		snprintf(name, sizeof(name), "%s ctrlq%d",
 		    device_get_nameunit(sc->dev), idx);
 		init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE,
 		    sc->port[idx]->tx_chan, &sc->sge.fwq, name);
 
 		rc = alloc_wrq(sc, NULL, ctrlq, &sc->ctx, oid);
 		if (rc != 0) {
 			CH_ERR(sc, "failed to allocate ctrlq%d: %d\n", idx, rc);
 			sysctl_remove_oid(oid, 1, 1);
 			return (rc);
 		}
 		MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED);
 	}
 
 	if (!(ctrlq->eq.flags & EQ_HW_ALLOCATED)) {
 		MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED);
 
 		rc = alloc_eq_hwq(sc, NULL, &ctrlq->eq);
 		if (rc != 0) {
 			CH_ERR(sc, "failed to create hw ctrlq%d: %d\n", idx, rc);
 			return (rc);
 		}
 		MPASS(ctrlq->eq.flags & EQ_HW_ALLOCATED);
 	}
 
 	return (0);
 }
 
 /*
  * Tears down control queue 'idx': the hardware egress queue first, then the
  * software resources.  Each stage runs only if its flag is set, so the
  * function is idempotent.
  */
 static void
 free_ctrlq(struct adapter *sc, int idx)
 {
 	struct sge_wrq *ctrlq = &sc->sge.ctrlq[idx];
 
 	/* Stage 1: the hardware queue. */
 	if (ctrlq->eq.flags & EQ_HW_ALLOCATED) {
 		MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED);
 		free_eq_hwq(sc, NULL, &ctrlq->eq);
 		MPASS(!(ctrlq->eq.flags & EQ_HW_ALLOCATED));
 	}
 
 	/* Stage 2: the software state. */
 	if (!(ctrlq->eq.flags & EQ_SW_ALLOCATED))
 		return;
 	MPASS(!(ctrlq->eq.flags & EQ_HW_ALLOCATED));
 	free_wrq(sc, ctrlq);
 	MPASS(!(ctrlq->eq.flags & EQ_SW_ALLOCATED));
 }
 
 /*
  * Maps the 'drop' setting to the congestion value handed to init_iq:
  *   drop == -1  ->  -1
  *   drop ==  1  ->   0
  *   otherwise   ->  the port's rx_e_chan_map
  */
 int
 tnl_cong(struct port_info *pi, int drop)
 {
 
 	switch (drop) {
 	case -1:
 		return (-1);
 	case 1:
 		return (0);
 	default:
 		return (pi->rx_e_chan_map);
 	}
 }
 
 /*
  * Idempotent.
  *
  * Brings up NIC rx queue 'idx' of the VI in two stages: software resources
  * (LRO state, sysctl nodes, ingress queue and freelist memory) first, then
  * the hardware queues.  Each stage is skipped if the corresponding flag says
  * it has already been done.
  *
  * Returns 0 on success or the error from the stage that failed.  A failure in
  * the software stage undoes the sysctl node and the LRO state set up here.
  */
 static int
 alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int idx, int intr_idx,
     int maxp)
 {
 	int rc;
 	struct adapter *sc = vi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	struct sysctl_oid *oid;
 	char name[16];
 
 	if (!(rxq->iq.flags & IQ_SW_ALLOCATED)) {
 		MPASS(!(rxq->iq.flags & IQ_HW_ALLOCATED));
 #if defined(INET) || defined(INET6)
 		rc = tcp_lro_init_args(&rxq->lro, ifp, lro_entries, lro_mbufs);
 		if (rc != 0)
 			return (rc);
 		MPASS(rxq->lro.ifp == ifp);	/* also indicates LRO init'ed */
 #endif
 		rxq->ifp = ifp;
 
 		/* The sysctl node is named after the queue's index. */
 		snprintf(name, sizeof(name), "%d", idx);
 		oid = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(vi->rxq_oid),
 		    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
 		    "rx queue");
 
 		init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq,
 		    intr_idx, tnl_cong(vi->pi, cong_drop));
 		/* init_iq zeroes iq.flags, so these must come after it. */
 #if defined(INET) || defined(INET6)
 		if (ifp->if_capenable & IFCAP_LRO)
 			rxq->iq.flags |= IQ_LRO_ENABLED;
 #endif
 		if (ifp->if_capenable & IFCAP_HWRXTSTMP)
 			rxq->iq.flags |= IQ_RX_TIMESTAMP;
 		/* The same buffer is reused for the freelist lock's name. */
 		snprintf(name, sizeof(name), "%s rxq%d-fl",
 		    device_get_nameunit(vi->dev), idx);
 		init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name);
 		rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, &vi->ctx, oid);
 		if (rc != 0) {
 			CH_ERR(vi, "failed to allocate rxq%d: %d\n", idx, rc);
 			sysctl_remove_oid(oid, 1, 1);
 #if defined(INET) || defined(INET6)
 			tcp_lro_free(&rxq->lro);
 			rxq->lro.ifp = NULL;
 #endif
 			return (rc);
 		}
 		MPASS(rxq->iq.flags & IQ_SW_ALLOCATED);
 		add_rxq_sysctls(&vi->ctx, oid, rxq);
 	}
 
 	if (!(rxq->iq.flags & IQ_HW_ALLOCATED)) {
 		MPASS(rxq->iq.flags & IQ_SW_ALLOCATED);
 		rc = alloc_iq_fl_hwq(vi, &rxq->iq, &rxq->fl);
 		if (rc != 0) {
 			CH_ERR(vi, "failed to create hw rxq%d: %d\n", idx, rc);
 			return (rc);
 		}
 		MPASS(rxq->iq.flags & IQ_HW_ALLOCATED);
 
 		/*
 		 * Queue 0 establishes iq_base (absolute id minus context id);
 		 * every other queue is checked against it.
 		 */
 		if (idx == 0)
 			sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id;
 		else
 			KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id,
 			    ("iq_base mismatch"));
 		KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF,
 		    ("PF with non-zero iq_base"));
 
 		/*
 		 * The freelist is just barely above the starvation threshold
 		 * right now, fill it up a bit more.
 		 */
 		FL_LOCK(&rxq->fl);
 		refill_fl(sc, &rxq->fl, 128);
 		FL_UNLOCK(&rxq->fl);
 	}
 
 	return (0);
 }
 
 /*
  * Tears down a NIC rx queue: the hardware queues first, then the LRO state
  * and software resources, after which the whole rxq is zeroed.  Each stage
  * runs only if its flag is set, so the function is idempotent.
  */
 static void
 free_rxq(struct vi_info *vi, struct sge_rxq *rxq)
 {
 	/* Stage 1: the hardware queues. */
 	if (rxq->iq.flags & IQ_HW_ALLOCATED) {
 		MPASS(rxq->iq.flags & IQ_SW_ALLOCATED);
 		free_iq_fl_hwq(vi->adapter, &rxq->iq, &rxq->fl);
 		MPASS(!(rxq->iq.flags & IQ_HW_ALLOCATED));
 	}
 
 	/* Stage 2: the software state. */
 	if (!(rxq->iq.flags & IQ_SW_ALLOCATED))
 		return;
 	MPASS(!(rxq->iq.flags & IQ_HW_ALLOCATED));
 #if defined(INET) || defined(INET6)
 	tcp_lro_free(&rxq->lro);
 #endif
 	free_iq_fl(vi->adapter, &rxq->iq, &rxq->fl);
 	MPASS(!(rxq->iq.flags & IQ_SW_ALLOCATED));
 	bzero(rxq, sizeof(*rxq));
 }
 
 /*
  * Publishes the read-only per-rxq statistics (LRO counters when INET/INET6 is
  * configured, plus checksum and VLAN offload counters) under the given node.
  * Does nothing if either ctx or oid is NULL.
  */
 static void
 add_rxq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
     struct sge_rxq *rxq)
 {
 	struct sysctl_oid_list *kids;
 
 	if (ctx != NULL && oid != NULL) {
 		kids = SYSCTL_CHILDREN(oid);
 #if defined(INET) || defined(INET6)
 		SYSCTL_ADD_U64(ctx, kids, OID_AUTO, "lro_queued", CTLFLAG_RD,
 		    &rxq->lro.lro_queued, 0, NULL);
 		SYSCTL_ADD_U64(ctx, kids, OID_AUTO, "lro_flushed", CTLFLAG_RD,
 		    &rxq->lro.lro_flushed, 0, NULL);
 #endif
 		SYSCTL_ADD_UQUAD(ctx, kids, OID_AUTO, "rxcsum", CTLFLAG_RD,
 		    &rxq->rxcsum,
 		    "# of times hardware assisted with checksum");
 		SYSCTL_ADD_UQUAD(ctx, kids, OID_AUTO, "vlan_extraction",
 		    CTLFLAG_RD, &rxq->vlan_extraction,
 		    "# of times hardware extracted 802.1Q tag");
 		SYSCTL_ADD_UQUAD(ctx, kids, OID_AUTO, "vxlan_rxcsum",
 		    CTLFLAG_RD, &rxq->vxlan_rxcsum,
 		    "# of times hardware assisted with inner checksum (VXLAN)");
 	}
 }
 
 #ifdef TCP_OFFLOAD
 /*
  * Idempotent.
  *
  * Brings up offload rx queue 'idx' of the VI in two stages: software
  * resources (sysctl nodes, ingress queue and freelist memory, iSCSI DDP
  * counters) first, then the hardware queues.  Each stage is skipped if the
  * corresponding flag says it has already been done.
  *
  * Returns 0 on success, including when there was nothing left to do, or the
  * error from the stage that failed.  The sysctl node is removed again if the
  * software stage fails.
  */
 static int
 alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq, int idx,
     int intr_idx, int maxp)
 {
 	int rc;
 	struct adapter *sc = vi->adapter;
 	struct sysctl_oid *oid;
 	char name[16];
 
 	if (!(ofld_rxq->iq.flags & IQ_SW_ALLOCATED)) {
 		MPASS(!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED));
 
 		/* The sysctl node is named after the queue's index. */
 		snprintf(name, sizeof(name), "%d", idx);
 		oid = SYSCTL_ADD_NODE(&vi->ctx,
 		    SYSCTL_CHILDREN(vi->ofld_rxq_oid), OID_AUTO, name,
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "offload rx queue");
 
 		init_iq(&ofld_rxq->iq, sc, vi->ofld_tmr_idx, vi->ofld_pktc_idx,
 		    vi->qsize_rxq, intr_idx, 0);
 		/* The same buffer is reused for the freelist lock's name. */
 		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
 		    device_get_nameunit(vi->dev), idx);
 		init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name);
 		rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, &vi->ctx,
 		    oid);
 		if (rc != 0) {
 			CH_ERR(vi, "failed to allocate ofld_rxq%d: %d\n", idx,
 			    rc);
 			sysctl_remove_oid(oid, 1, 1);
 			return (rc);
 		}
 		MPASS(ofld_rxq->iq.flags & IQ_SW_ALLOCATED);
 		ofld_rxq->rx_iscsi_ddp_setup_ok = counter_u64_alloc(M_WAITOK);
 		ofld_rxq->rx_iscsi_ddp_setup_error =
 		    counter_u64_alloc(M_WAITOK);
 		add_ofld_rxq_sysctls(&vi->ctx, oid, ofld_rxq);
 	}
 
 	if (!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED)) {
 		MPASS(ofld_rxq->iq.flags & IQ_SW_ALLOCATED);
 		rc = alloc_iq_fl_hwq(vi, &ofld_rxq->iq, &ofld_rxq->fl);
 		if (rc != 0) {
 			CH_ERR(vi, "failed to create hw ofld_rxq%d: %d\n", idx,
 			    rc);
 			return (rc);
 		}
 		MPASS(ofld_rxq->iq.flags & IQ_HW_ALLOCATED);
 	}
 
 	/*
 	 * Every failure has already returned.  Do not return rc here: it is
 	 * never assigned when both stages were skipped.
 	 */
 	return (0);
 }
 
 /*
  * Tears down an offload rx queue: the hardware queues first, then the
  * software resources and the iSCSI DDP counters, after which the whole
  * ofld_rxq is zeroed.  Each stage runs only if its flag is set, so the
  * function is idempotent.
  */
 static void
 free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq)
 {
 	/* Stage 1: the hardware queues. */
 	if (ofld_rxq->iq.flags & IQ_HW_ALLOCATED) {
 		MPASS(ofld_rxq->iq.flags & IQ_SW_ALLOCATED);
 		free_iq_fl_hwq(vi->adapter, &ofld_rxq->iq, &ofld_rxq->fl);
 		MPASS(!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED));
 	}
 
 	/* Stage 2: the software state. */
 	if (!(ofld_rxq->iq.flags & IQ_SW_ALLOCATED))
 		return;
 	MPASS(!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED));
 	free_iq_fl(vi->adapter, &ofld_rxq->iq, &ofld_rxq->fl);
 	MPASS(!(ofld_rxq->iq.flags & IQ_SW_ALLOCATED));
 	counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_ok);
 	counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_error);
 	bzero(ofld_rxq, sizeof(*ofld_rxq));
 }
 
 /*
  * Publishes the read-only per-queue offload statistics under the given node:
  * the TOE TLS counters directly beneath it and the iSCSI counters in an
  * "iscsi" subtree.  Does nothing if either ctx or oid is NULL.
  */
 static void
 add_ofld_rxq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
     struct sge_ofld_rxq *ofld_rxq)
 {
 	struct sysctl_oid *iscsi_oid;
 	struct sysctl_oid_list *kids;
 
 	if (ctx == NULL || oid == NULL)
 		return;
 
 	/* TOE TLS counters sit directly under the queue's node. */
 	kids = SYSCTL_CHILDREN(oid);
 	SYSCTL_ADD_ULONG(ctx, kids, OID_AUTO,
 	    "rx_toe_tls_records", CTLFLAG_RD, &ofld_rxq->rx_toe_tls_records,
 	    "# of TOE TLS records received");
 	SYSCTL_ADD_ULONG(ctx, kids, OID_AUTO,
 	    "rx_toe_tls_octets", CTLFLAG_RD, &ofld_rxq->rx_toe_tls_octets,
 	    "# of payload octets in received TOE TLS records");
 
 	/* iSCSI counters get a subtree of their own. */
 	iscsi_oid = SYSCTL_ADD_NODE(ctx, kids, OID_AUTO, "iscsi",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE iSCSI statistics");
 	kids = SYSCTL_CHILDREN(iscsi_oid);
 
 	SYSCTL_ADD_COUNTER_U64(ctx, kids, OID_AUTO, "ddp_setup_ok",
 	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_setup_ok,
 	    "# of times DDP buffer was setup successfully.");
 	SYSCTL_ADD_COUNTER_U64(ctx, kids, OID_AUTO, "ddp_setup_error",
 	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_setup_error,
 	    "# of times DDP buffer setup failed.");
 	SYSCTL_ADD_U64(ctx, kids, OID_AUTO, "ddp_octets",
 	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_octets, 0,
 	    "# of octets placed directly");
 	SYSCTL_ADD_U64(ctx, kids, OID_AUTO, "ddp_pdus",
 	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_pdus, 0,
 	    "# of PDUs with data placed directly.");
 	SYSCTL_ADD_U64(ctx, kids, OID_AUTO, "fl_octets",
 	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_fl_octets, 0,
 	    "# of data octets delivered in freelist");
 	SYSCTL_ADD_U64(ctx, kids, OID_AUTO, "fl_pdus",
 	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_fl_pdus, 0,
 	    "# of PDUs with data delivered in freelist");
 	SYSCTL_ADD_U64(ctx, kids, OID_AUTO, "padding_errors",
 	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_padding_errors, 0,
 	    "# of PDUs with invalid padding");
 	SYSCTL_ADD_U64(ctx, kids, OID_AUTO, "header_digest_errors",
 	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_header_digest_errors, 0,
 	    "# of PDUs with invalid header digests");
 	SYSCTL_ADD_U64(ctx, kids, OID_AUTO, "data_digest_errors",
 	    CTLFLAG_RD, &ofld_rxq->rx_iscsi_data_digest_errors, 0,
 	    "# of PDUs with invalid data digests");
 }
 #endif
 
 /*
  * Returns a reasonable automatic cidx flush threshold for a given queue size:
  * the base-2 log of the size rounded up to a power of 2, capped at
  * X_CIDXFLUSHTHRESH_128.
  */
 static u_int
 qsize_to_fthresh(int qsize)
 {
 	u_int fthresh;
 
 	/* Step up to the next power of 2 (no-op if already one). */
 	for (; !powerof2(qsize); qsize++)
 		continue;
 
 	fthresh = ilog2(qsize);
 	if (fthresh <= X_CIDXFLUSHTHRESH_128)
 		return (fthresh);
 
 	return (X_CIDXFLUSHTHRESH_128);
 }
 
 static int
 ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_ctrl_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
 	    V_FW_EQ_CTRL_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
 	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
 	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
 	c.physeqid_pkd = htobe32(0);
 	c.fetchszm_to_iqid =
 	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
 		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
 		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 		X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
 		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_CTRL_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
 		V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		CH_ERR(sc, "failed to create hw ctrlq for tx_chan %d: %d\n",
 		    eq->tx_chan, rc);
 		return (rc);
 	}
 
 	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
 	eq->abs_id = G_FW_EQ_CTRL_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.eqmap_sz)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.eqmap_sz - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 
 static int
 eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_eth_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
 	    V_FW_EQ_ETH_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
 	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
 	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
 	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid));
 	c.fetchszm_to_iqid =
 	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
 		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_ETH_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 		X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
 		V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_ETH_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(vi->dev,
 		    "failed to create Ethernet egress queue: %d\n", rc);
 		return (rc);
 	}
 
 	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
 	eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.eqmap_sz)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.eqmap_sz - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 static int
 ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_ofld_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
 	    V_FW_EQ_OFLD_CMD_VFN(0));
 	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
 	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
 	c.fetchszm_to_iqid =
 		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
 		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
 		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 		X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
 		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_OFLD_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) |
 		V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(vi->dev,
 		    "failed to create egress queue for TCP offload: %d\n", rc);
 		return (rc);
 	}
 
 	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
 	eq->abs_id = G_FW_EQ_OFLD_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.eqmap_sz)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.eqmap_sz - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 #endif
 
 /*
  * SW only.  Allocates the descriptor ring for an egress queue (eq->sidx
  * descriptors plus room for the status page) and, when both 'ctx' and 'oid'
  * are provided, registers the queue's sysctls.  Sets EQ_SW_ALLOCATED on
  * success; returns the error from alloc_ring otherwise.
  */
 static int
 alloc_eq(struct adapter *sc, struct sge_eq *eq, struct sysctl_ctx_list *ctx,
     struct sysctl_oid *oid)
 {
 	const int ndesc = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 	const size_t ring_len = ndesc * EQ_ESIZE;
 	int rc;
 
 	MPASS(!(eq->flags & EQ_SW_ALLOCATED));
 
 	rc = alloc_ring(sc, ring_len, &eq->desc_tag, &eq->desc_map, &eq->ba,
 	    (void **)&eq->desc);
 	if (rc != 0)
 		return (rc);
 
 	if (ctx != NULL && oid != NULL)
 		add_eq_sysctls(sc, ctx, oid, eq);
 	eq->flags |= EQ_SW_ALLOCATED;
 
 	return (0);
 }
 
 /*
  * SW only.  Releases the descriptor ring and the lock of an egress queue and
  * then zeroes the whole structure, which also clears EQ_SW_ALLOCATED.
  */
 static void
 free_eq(struct adapter *sc, struct sge_eq *eq)
 {
 	MPASS(eq->flags & EQ_SW_ALLOCATED);
 	if (eq->type == EQ_ETH) {
 		/* An Ethernet queue is expected to be fully drained here. */
 		MPASS(eq->pidx == eq->cidx);
 	}
 
 	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
 	mtx_destroy(&eq->eq_lock);
 
 	/* Wipe everything, flags included. */
 	bzero(eq, sizeof(*eq));
 }
 
 /*
  * Registers read-only sysctls that describe an egress queue under 'oid'.
  *
  * ba, abs_id, cntxt_id, cidx, and pidx are exported by pointer into 'eq'.
  * dmalen and sidx are registered with a NULL pointer and a value argument
  * computed here from eq->sidx and the status page length.
  */
 static void
 add_eq_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
     struct sysctl_oid *oid, struct sge_eq *eq)
 {
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &eq->ba,
 	    "bus address of descriptor ring");
 	/* Ring size: sidx descriptors followed by the status page. */
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
 	    eq->sidx * EQ_ESIZE + sc->params.sge.spg_len,
 	    "desc ring size in bytes");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD,
 	    &eq->abs_id, 0, "absolute id of the queue");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &eq->cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &eq->cidx,
 	    0, "consumer index");
 	SYSCTL_ADD_U16(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &eq->pidx,
 	    0, "producer index");
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
 	    eq->sidx, "status page index");
 }
 
 /*
  * Creates the hardware egress queue for 'eq'.  The queue's indices are reset,
  * its descriptor ring (including the status page) is zeroed, and the
  * type-specific firmware command is issued.  If any of the user doorbell
  * mechanisms is enabled, the doorbell address for the new context id is
  * worked out as well.  Sets EQ_HW_ALLOCATED and returns 0 on success;
  * otherwise returns the error from the type-specific allocator.
  */
 static int
 alloc_eq_hwq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc;
 
 	MPASS(!(eq->flags & EQ_HW_ALLOCATED));
 
 	/* Start from a clean slate every time the hw queue is (re)created. */
 	eq->iqid = eq->iq->cntxt_id;
 	eq->pidx = eq->cidx = eq->dbidx = 0;
 	/* Note that equeqidx is not used with sge_wrq (OFLD/CTRL) queues. */
 	eq->equeqidx = 0;
 	eq->doorbells = sc->doorbells;
 	bzero(eq->desc, eq->sidx * EQ_ESIZE + sc->params.sge.spg_len);
 
 	switch (eq->type) {
 	case EQ_CTRL:
 		rc = ctrl_eq_alloc(sc, eq);
 		break;
 
 	case EQ_ETH:
 		rc = eth_eq_alloc(sc, vi, eq);
 		break;
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	case EQ_OFLD:
 		rc = ofld_eq_alloc(sc, vi, eq);
 		break;
 #endif
 
 	default:
 		panic("%s: invalid eq type %d.", __func__, eq->type);
 	}
 	if (rc != 0) {
 		CH_ERR(sc, "failed to allocate egress queue(%d): %d\n",
 		    eq->type, rc);
 		return (rc);
 	}
 
 	if (isset(&eq->doorbells, DOORBELL_UDB) ||
 	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
 	    isset(&eq->doorbells, DOORBELL_WCWR)) {
 		uint32_t s_qpp = sc->params.sge.eq_s_qpp;
 		uint32_t mask = (1 << s_qpp) - 1;
 		volatile uint8_t *udb;
 
 		/*
 		 * The upper bits of the context id select the doorbell page
 		 * and the low s_qpp bits identify the queue within that page.
 		 */
 		udb = sc->udbs_base + UDBS_DB_OFFSET;
 		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
 		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
 		/*
 		 * If the id is beyond the number of segments in a page then
 		 * WCWR is turned off for this queue and udb_qid is left as is.
 		 * Otherwise the doorbell pointer is moved to the queue's own
 		 * segment and udb_qid is cleared.
 		 */
 		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
 	    		clrbit(&eq->doorbells, DOORBELL_WCWR);
 		else {
 			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
 			eq->udb_qid = 0;
 		}
 		eq->udb = (volatile void *)udb;
 	}
 
 	eq->flags |= EQ_HW_ALLOCATED;
 	return (0);
 }
 
 /*
  * Asks the firmware to free the hardware egress queue behind 'eq', using the
  * call that matches the queue's type.  EQ_HW_ALLOCATED is cleared only if the
  * firmware call succeeds; on failure the error is logged and returned with
  * the flag still set.
  */
 static int
 free_eq_hwq(struct adapter *sc, struct vi_info *vi __unused, struct sge_eq *eq)
 {
 	int rc;
 
 	MPASS(eq->flags & EQ_HW_ALLOCATED);
 
 	switch (eq->type) {
 	case EQ_CTRL:
 		rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id);
 		break;
 	case EQ_ETH:
 		rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id);
 		break;
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 	case EQ_OFLD:
 		rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id);
 		break;
 #endif
 	default:
 		panic("%s: invalid eq type %d.", __func__, eq->type);
 	}
 
 	if (rc == 0) {
 		eq->flags &= ~EQ_HW_ALLOCATED;
 		return (0);
 	}
 
 	CH_ERR(sc, "failed to free eq (type %d): %d\n", eq->type, rc);
 	return (rc);
 }
 
 /*
  * Software-side setup of a work request queue: allocates the underlying
  * egress queue and then initializes the wrq's task, lists, and counters.
  * Returns the error from alloc_eq if the ring cannot be allocated, 0
  * otherwise.
  */
 static int
 alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq,
     struct sysctl_ctx_list *ctx, struct sysctl_oid *oid)
 {
 	struct sge_eq *eq = &wrq->eq;
 	int rc;
 
 	MPASS(!(eq->flags & EQ_SW_ALLOCATED));
 
 	rc = alloc_eq(sc, eq, ctx, oid);
 	if (rc != 0)
 		return (rc);
 	MPASS(eq->flags & EQ_SW_ALLOCATED);
 
 	/* Nothing from here on can fail. */
 	wrq->adapter = sc;
 	wrq->nwr_pending = 0;
 	wrq->ndesc_needed = 0;
 	TAILQ_INIT(&wrq->incomplete_wrs);
 	STAILQ_INIT(&wrq->wr_list);
 	TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
 	add_wrq_sysctls(ctx, oid, wrq);
 
 	return (0);
 }
 
 /*
  * Frees the egress queue inside a work request queue and then zeroes the
  * whole wrq.  The wrq is expected to have no pending or incomplete work
  * requests at this point.
  */
 static void
 free_wrq(struct adapter *sc, struct sge_wrq *wrq)
 {
 	struct sge_eq *eq = &wrq->eq;
 
 	free_eq(sc, eq);
 
 	/* These look at wrq fields that lie outside the eq freed above. */
 	MPASS(wrq->nwr_pending == 0);
 	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
 	MPASS(STAILQ_EMPTY(&wrq->wr_list));
 
 	bzero(wrq, sizeof(*wrq));
 }
 
 /*
  * Registers the read-only work request counters of 'wrq' under 'oid'.  Does
  * nothing unless both a sysctl context and a parent oid are supplied.
  */
 static void
 add_wrq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
     struct sge_wrq *wrq)
 {
 	struct sysctl_oid_list *kids;
 
 	if (ctx != NULL && oid != NULL) {
 		kids = SYSCTL_CHILDREN(oid);
 
 		SYSCTL_ADD_UQUAD(ctx, kids, OID_AUTO, "tx_wrs_direct",
 		    CTLFLAG_RD, &wrq->tx_wrs_direct,
 		    "# of work requests (direct)");
 		SYSCTL_ADD_UQUAD(ctx, kids, OID_AUTO, "tx_wrs_copied",
 		    CTLFLAG_RD, &wrq->tx_wrs_copied,
 		    "# of work requests (copied)");
 		SYSCTL_ADD_UQUAD(ctx, kids, OID_AUTO, "tx_wrs_sspace",
 		    CTLFLAG_RD, &wrq->tx_wrs_ss,
 		    "# of work requests (copied from scratch space)");
 	}
 }
 
 /*
  * Sets up the Ethernet tx queue with index 'idx' on 'vi'.  Idempotent.
  *
  * The work is split into two stages, each skipped if its flag is already set
  * in the eq: the software stage (EQ_SW_ALLOCATED: sysctl node, eq
  * initialization, mp_ring, descriptor ring, sglist, software descriptors)
  * and the hardware stage (EQ_HW_ALLOCATED: the hw queue itself plus the
  * per-queue tx parameters that are derived from it).
  *
  * Returns 0 on success or the error from the step that failed.  A failure in
  * the software stage removes the sysctl node that was added for the queue.  A
  * failure in the hardware stage returns without touching the software state.
  */
 static int
 alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx)
 {
 	int rc, iqidx;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = vi->adapter;
 	struct sge_eq *eq = &txq->eq;
 	struct txpkts *txp;
 	char name[16];
 	struct sysctl_oid *oid;
 
 	if (!(eq->flags & EQ_SW_ALLOCATED)) {
 		MPASS(!(eq->flags & EQ_HW_ALLOCATED));
 
 		/* The sysctl node for the queue is named after its index. */
 		snprintf(name, sizeof(name), "%d", idx);
 		oid = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(vi->txq_oid),
 		    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
 		    "tx queue");
 
 		/* tx queues are spread across the VI's rx queues round-robin. */
 		iqidx = vi->first_rxq + (idx % vi->nrxq);
 		snprintf(name, sizeof(name), "%s txq%d",
 		    device_get_nameunit(vi->dev), idx);
 		init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan,
 		    &sc->sge.rxq[iqidx].iq, name);
 
 		rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx,
 		    can_resume_eth_tx, M_CXGBE, &eq->eq_lock, M_WAITOK);
 		if (rc != 0) {
 			CH_ERR(vi, "failed to allocate mp_ring for txq%d: %d\n",
 			    idx, rc);
 			/*
 			 * Common exit for software stage failures; the
 			 * alloc_eq failure path below jumps here too.
 			 */
 failed:
 			sysctl_remove_oid(oid, 1, 1);
 			return (rc);
 		}
 
 		rc = alloc_eq(sc, eq, &vi->ctx, oid);
 		if (rc) {
 			CH_ERR(vi, "failed to allocate txq%d: %d\n", idx, rc);
 			mp_ring_free(txq->r);
 			goto failed;
 		}
 		MPASS(eq->flags & EQ_SW_ALLOCATED);
 		/* Can't fail after this point. */
 
 		TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
 		txq->ifp = vi->ifp;
 		txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
 		txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
 		    M_ZERO | M_WAITOK);
 
 		add_txq_sysctls(vi, &vi->ctx, oid, txq);
 	}
 
 	if (!(eq->flags & EQ_HW_ALLOCATED)) {
 		MPASS(eq->flags & EQ_SW_ALLOCATED);
 		rc = alloc_eq_hwq(sc, vi, eq);
 		if (rc != 0) {
 			CH_ERR(vi, "failed to create hw txq%d: %d\n", idx, rc);
 			return (rc);
 		}
 		MPASS(eq->flags & EQ_HW_ALLOCATED);
 		/* Can't fail after this point. */
 
 		/*
 		 * Queue 0 establishes the offset between absolute and context
 		 * ids; every other queue is checked against it.
 		 */
 		if (idx == 0)
 			sc->sge.eq_base = eq->abs_id - eq->cntxt_id;
 		else
 			KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id,
 			    ("eq_base mismatch"));
 		KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF,
 		    ("PF with non-zero eq_base"));
 
 		/*
 		 * Limit on the number of packets coalesced into one txpkts
 		 * work request.  One less when a PF uses VM work requests.
 		 */
 		txp = &txq->txp;
 		MPASS(nitems(txp->mb) >= sc->params.max_pkts_per_eth_tx_pkts_wr);
 		txq->txp.max_npkt = min(nitems(txp->mb),
 		    sc->params.max_pkts_per_eth_tx_pkts_wr);
 		if (vi->flags & TX_USES_VM_WR && !(sc->flags & IS_VF))
 			txq->txp.max_npkt--;
 
 		/*
 		 * Precomputed ctrl0 word for the queue's CPL_TX_PKT_XT.  The
 		 * PF/VF fields are left out when VM work requests are in use.
 		 */
 		if (vi->flags & TX_USES_VM_WR)
 			txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
 			    V_TXPKT_INTF(pi->tx_chan));
 		else
 			txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
 			    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
 			    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));
 
 		/* No traffic class is bound to a freshly created queue. */
 		txq->tc_idx = -1;
 	}
 
 	return (0);
 }
 
 /*
  * Tears down an Ethernet tx queue: the hardware queue first (if it exists)
  * and then the software state (if it exists).  Idempotent.
  */
 static void
 free_txq(struct vi_info *vi, struct sge_txq *txq)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct adapter *sc = vi->adapter;
 
 	if (eq->flags & EQ_HW_ALLOCATED) {
 		MPASS(eq->flags & EQ_SW_ALLOCATED);
 		free_eq_hwq(sc, NULL, eq);
 		MPASS(!(eq->flags & EQ_HW_ALLOCATED));
 	}
 
 	/* Nothing more to do if the software state was never set up. */
 	if (!(eq->flags & EQ_SW_ALLOCATED))
 		return;
 
 	MPASS(!(eq->flags & EQ_HW_ALLOCATED));
 	sglist_free(txq->gl);
 	free(txq->sdesc, M_CXGBE);
 	mp_ring_free(txq->r);
 	free_eq(sc, eq);
 	MPASS(!(eq->flags & EQ_SW_ALLOCATED));
 	bzero(txq, sizeof(*txq));
 }
 
 /*
  * Registers the sysctls of an Ethernet tx queue under 'oid': the mp_ring's own
  * sysctls, the read/write "tc" knob (handled by sysctl_tc, with the queue's
  * index in sc->sge.txq as its argument), and the queue's read-only
  * statistics.  The kern_tls_* statistics are added only when KERN_TLS is
  * compiled in and is_ktls(sc) is true.  Does nothing unless both 'ctx' and
  * 'oid' are supplied.
  */
 static void
 add_txq_sysctls(struct vi_info *vi, struct sysctl_ctx_list *ctx,
     struct sysctl_oid *oid, struct sge_txq *txq)
 {
 	struct adapter *sc;
 	struct sysctl_oid_list *children;
 
 	if (ctx == NULL || oid == NULL)
 		return;
 
 	sc = vi->adapter;
 	children = SYSCTL_CHILDREN(oid);
 
 	mp_ring_sysctls(txq->r, ctx, children);
 
 	/* arg2 is the queue's index within the adapter-wide txq array. */
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tc",
 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, txq - sc->sge.txq,
 	    sysctl_tc, "I", "traffic class (-1 means none)");
 
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
 	    &txq->txcsum, "# of times hardware assisted with checksum");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vlan_insertion", CTLFLAG_RD,
 	    &txq->vlan_insertion, "# of times hardware inserted 802.1Q tag");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
 	    &txq->tso_wrs, "# of TSO work requests");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
 	    &txq->imm_wrs, "# of work requests with immediate data");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
 	    &txq->sgl_wrs, "# of work requests with direct SGL");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
 	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts0_wrs", CTLFLAG_RD,
 	    &txq->txpkts0_wrs, "# of txpkts (type 0) work requests");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts1_wrs", CTLFLAG_RD,
 	    &txq->txpkts1_wrs, "# of txpkts (type 1) work requests");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts0_pkts", CTLFLAG_RD,
 	    &txq->txpkts0_pkts,
 	    "# of frames tx'd using type0 txpkts work requests");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts1_pkts", CTLFLAG_RD,
 	    &txq->txpkts1_pkts,
 	    "# of frames tx'd using type1 txpkts work requests");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts_flush", CTLFLAG_RD,
 	    &txq->txpkts_flush,
 	    "# of times txpkts had to be flushed out by an egress-update");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD,
 	    &txq->raw_wrs, "# of raw work requests (non-packets)");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vxlan_tso_wrs", CTLFLAG_RD,
 	    &txq->vxlan_tso_wrs, "# of VXLAN TSO work requests");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vxlan_txcsum", CTLFLAG_RD,
 	    &txq->vxlan_txcsum,
 	    "# of times hardware assisted with inner checksums (VXLAN)");
 
 #ifdef KERN_TLS
 	/* NIC TLS statistics, only on adapters where is_ktls() is true. */
 	if (is_ktls(sc)) {
 		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_records",
 		    CTLFLAG_RD, &txq->kern_tls_records,
 		    "# of NIC TLS records transmitted");
 		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_short",
 		    CTLFLAG_RD, &txq->kern_tls_short,
 		    "# of short NIC TLS records transmitted");
 		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_partial",
 		    CTLFLAG_RD, &txq->kern_tls_partial,
 		    "# of partial NIC TLS records transmitted");
 		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_full",
 		    CTLFLAG_RD, &txq->kern_tls_full,
 		    "# of full NIC TLS records transmitted");
 		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_octets",
 		    CTLFLAG_RD, &txq->kern_tls_octets,
 		    "# of payload octets in transmitted NIC TLS records");
 		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_waste",
 		    CTLFLAG_RD, &txq->kern_tls_waste,
 		    "# of octets DMAd but not transmitted in NIC TLS records");
 		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_options",
 		    CTLFLAG_RD, &txq->kern_tls_options,
 		    "# of NIC TLS options-only packets transmitted");
 		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_header",
 		    CTLFLAG_RD, &txq->kern_tls_header,
 		    "# of NIC TLS header-only packets transmitted");
 		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_fin",
 		    CTLFLAG_RD, &txq->kern_tls_fin,
 		    "# of NIC TLS FIN-only packets transmitted");
 		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_fin_short",
 		    CTLFLAG_RD, &txq->kern_tls_fin_short,
 		    "# of NIC TLS padded FIN packets on short TLS records");
 		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_cbc",
 		    CTLFLAG_RD, &txq->kern_tls_cbc,
 		    "# of NIC TLS sessions using AES-CBC");
 		SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_gcm",
 		    CTLFLAG_RD, &txq->kern_tls_gcm,
 		    "# of NIC TLS sessions using AES-GCM");
 	}
 #endif
 }
 
 #if defined(TCP_OFFLOAD) || defined(RATELIMIT)
 /*
  * Idempotent.
  */
 static int
 alloc_ofld_txq(struct vi_info *vi, struct sge_ofld_txq *ofld_txq, int idx)
 {
 	struct sysctl_oid *oid;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = vi->adapter;
 	struct sge_eq *eq = &ofld_txq->wrq.eq;
 	int rc, iqidx;
 	char name[16];
 
 	MPASS(idx >= 0);
 	MPASS(idx < vi->nofldtxq);
 
 	if (!(eq->flags & EQ_SW_ALLOCATED)) {
 		snprintf(name, sizeof(name), "%d", idx);
 		oid = SYSCTL_ADD_NODE(&vi->ctx,
 		    SYSCTL_CHILDREN(vi->ofld_txq_oid), OID_AUTO, name,
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "offload tx queue");
 
 		snprintf(name, sizeof(name), "%s ofld_txq%d",
 		    device_get_nameunit(vi->dev), idx);
 		if (vi->nofldrxq > 0) {
 			iqidx = vi->first_ofld_rxq + (idx % vi->nofldrxq);
 			init_eq(sc, eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan,
 			    &sc->sge.ofld_rxq[iqidx].iq, name);
 		} else {
 			iqidx = vi->first_rxq + (idx % vi->nrxq);
 			init_eq(sc, eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan,
 			    &sc->sge.rxq[iqidx].iq, name);
 		}
 
 		rc = alloc_wrq(sc, vi, &ofld_txq->wrq, &vi->ctx, oid);
 		if (rc != 0) {
 			CH_ERR(vi, "failed to allocate ofld_txq%d: %d\n", idx,
 			    rc);
 			sysctl_remove_oid(oid, 1, 1);
 			return (rc);
 		}
 		MPASS(eq->flags & EQ_SW_ALLOCATED);
 		/* Can't fail after this point. */
 
 		ofld_txq->tx_iscsi_pdus = counter_u64_alloc(M_WAITOK);
 		ofld_txq->tx_iscsi_octets = counter_u64_alloc(M_WAITOK);
 		ofld_txq->tx_iscsi_iso_wrs = counter_u64_alloc(M_WAITOK);
 		ofld_txq->tx_toe_tls_records = counter_u64_alloc(M_WAITOK);
 		ofld_txq->tx_toe_tls_octets = counter_u64_alloc(M_WAITOK);
 		add_ofld_txq_sysctls(&vi->ctx, oid, ofld_txq);
 	}
 
 	if (!(eq->flags & EQ_HW_ALLOCATED)) {
 		rc = alloc_eq_hwq(sc, vi, eq);
 		if (rc != 0) {
 			CH_ERR(vi, "failed to create hw ofld_txq%d: %d\n", idx,
 			    rc);
 			return (rc);
 		}
 		MPASS(eq->flags & EQ_HW_ALLOCATED);
 	}
 
 	return (0);
 }
 
 /*
  * Tears down an offload tx queue: the hardware queue first (if it exists)
  * and then the counters and the work request queue (if the software state
  * exists).  Idempotent.
  */
 static void
 free_ofld_txq(struct vi_info *vi, struct sge_ofld_txq *ofld_txq)
 {
 	struct sge_eq *eq = &ofld_txq->wrq.eq;
 	struct adapter *sc = vi->adapter;
 
 	if (eq->flags & EQ_HW_ALLOCATED) {
 		MPASS(eq->flags & EQ_SW_ALLOCATED);
 		free_eq_hwq(sc, NULL, eq);
 		MPASS(!(eq->flags & EQ_HW_ALLOCATED));
 	}
 
 	/* Nothing more to do if the software state was never set up. */
 	if (!(eq->flags & EQ_SW_ALLOCATED))
 		return;
 
 	MPASS(!(eq->flags & EQ_HW_ALLOCATED));
 	counter_u64_free(ofld_txq->tx_iscsi_pdus);
 	counter_u64_free(ofld_txq->tx_iscsi_octets);
 	counter_u64_free(ofld_txq->tx_iscsi_iso_wrs);
 	counter_u64_free(ofld_txq->tx_toe_tls_records);
 	counter_u64_free(ofld_txq->tx_toe_tls_octets);
 	free_wrq(sc, &ofld_txq->wrq);
 	MPASS(!(eq->flags & EQ_SW_ALLOCATED));
 	bzero(ofld_txq, sizeof(*ofld_txq));
 }
 
 /*
  * Registers the read-only iSCSI and TOE TLS transmit counters of an offload
  * tx queue under 'oid'.  Does nothing unless both a sysctl context and a
  * parent oid are supplied.
  */
 static void
 add_ofld_txq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
     struct sge_ofld_txq *ofld_txq)
 {
 	struct sysctl_oid_list *kids;
 
 	if (ctx != NULL && oid != NULL) {
 		kids = SYSCTL_CHILDREN(oid);
 
 		SYSCTL_ADD_COUNTER_U64(ctx, kids, OID_AUTO, "tx_iscsi_pdus",
 		    CTLFLAG_RD, &ofld_txq->tx_iscsi_pdus,
 		    "# of iSCSI PDUs transmitted");
 		SYSCTL_ADD_COUNTER_U64(ctx, kids, OID_AUTO, "tx_iscsi_octets",
 		    CTLFLAG_RD, &ofld_txq->tx_iscsi_octets,
 		    "# of payload octets in transmitted iSCSI PDUs");
 		SYSCTL_ADD_COUNTER_U64(ctx, kids, OID_AUTO, "tx_iscsi_iso_wrs",
 		    CTLFLAG_RD, &ofld_txq->tx_iscsi_iso_wrs,
 		    "# of iSCSI segmentation offload work requests");
 		SYSCTL_ADD_COUNTER_U64(ctx, kids, OID_AUTO,
 		    "tx_toe_tls_records", CTLFLAG_RD,
 		    &ofld_txq->tx_toe_tls_records,
 		    "# of TOE TLS records transmitted");
 		SYSCTL_ADD_COUNTER_U64(ctx, kids, OID_AUTO,
 		    "tx_toe_tls_octets", CTLFLAG_RD,
 		    &ofld_txq->tx_toe_tls_octets,
 		    "# of payload octets in transmitted TOE TLS records");
 	}
 }
 #endif
 
 /*
  * bus_dma load callback for mappings that are expected to occupy exactly one
  * segment.  'arg' points to a bus_addr_t that receives the bus address of the
  * segment, or 0 if the load failed.
  *
  * The segment count is asserted only on the success path.  The other
  * arguments are not meaningful when 'error' is set, so checking nseg first
  * would turn a load failure into an assertion failure instead of letting the
  * caller see the failure.
  */
 static void
 oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 {
 	bus_addr_t *ba = arg;
 
 	if (error != 0) {
 		*ba = 0;
 		return;
 	}
 
 	KASSERT(nseg == 1,
 	    ("%s meant for single segment mappings only.", __func__));
 
 	*ba = segs->ds_addr;
 }
 
 /*
  * Rings the freelist's doorbell to hand over everything between fl->dbidx and
  * the current producer position, and then advances fl->dbidx by the same
  * amount.  fl->pidx is shifted right by 3 to put it in the same units as
  * dbidx and sidx.  The caller must make sure that there is something new to
  * hand over (n > 0 is asserted).
  */
 static inline void
 ring_fl_db(struct adapter *sc, struct sge_fl *fl)
 {
 	uint32_t n, v;
 
 	n = IDXDIFF(fl->pidx >> 3, fl->dbidx, fl->sidx);
 	MPASS(n > 0);
 
 	/* Descriptor writes must be visible before the doorbell write. */
 	wmb();
 	v = fl->dbval | V_PIDX(n);
 	/* Use the queue's user doorbell if it has one, else the register. */
 	if (fl->udb)
 		*fl->udb = htole32(v);
 	else
 		t4_write_reg(sc, sc->sge_kdoorbell_reg, v);
 	IDXINCR(fl->dbidx, n, fl->sidx);
 }
 
 /*
  * Fills up the freelist by allocating up to 'n' buffers.  Buffers that are
  * recycled do not count towards this allocation budget.
  *
  * The freelist lock must be held by the caller.  fl->pidx counts individual
  * buffers; eight of them make up one hardware descriptor, which is the unit
  * used by fl->sidx, fl->dbidx, and the hw cidx.
  *
  * Returns non-zero to indicate that this freelist should be added to the list
  * of starving freelists.
  */
 static int
 refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
 {
 	__be64 *d;
 	struct fl_sdesc *sd;
 	uintptr_t pa;
 	caddr_t cl;
 	struct rx_buf_info *rxb;
 	struct cluster_metadata *clm;
 	uint16_t max_pidx, zidx = fl->zidx;
 	uint16_t hw_cidx = fl->hw_cidx;		/* stable snapshot */
 
 	FL_LOCK_ASSERT_OWNED(fl);
 
 	/*
 	 * We always stop at the beginning of the hardware descriptor that's just
 	 * before the one with the hw cidx.  This is to avoid hw pidx = hw cidx,
 	 * which would mean an empty freelist to the chip.
 	 */
 	max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
 	/* Already as full as it is allowed to be. */
 	if (fl->pidx == max_pidx * 8)
 		return (0);
 
 	d = &fl->desc[fl->pidx];
 	sd = &fl->sdesc[fl->pidx];
 	rxb = &sc->sge.rx_buf_info[zidx];
 
 	while (n > 0) {
 
 		/* Try to reuse the cluster that is already in this slot. */
 		if (sd->cl != NULL) {
 
 			if (sd->nmbuf == 0) {
 				/*
 				 * Fast recycle without involving any atomics on
 				 * the cluster's metadata (if the cluster has
 				 * metadata).  This happens when all frames
 				 * received in the cluster were small enough to
 				 * fit within a single mbuf each.
 				 */
 				fl->cl_fast_recycled++;
 				goto recycled;
 			}
 
 			/*
 			 * Cluster is guaranteed to have metadata.  Clusters
 			 * without metadata always take the fast recycle path
 			 * when they're recycled.
 			 */
 			clm = cl_metadata(sd);
 			MPASS(clm != NULL);
 
 			/* Ours was the last reference; the cluster is reusable. */
 			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
 				fl->cl_recycled++;
 				counter_u64_add(extfree_rels, 1);
 				goto recycled;
 			}
 			sd->cl = NULL;	/* gave up my reference */
 		}
 		MPASS(sd->cl == NULL);
 		cl = uma_zalloc(rxb->zone, M_NOWAIT);
 		if (__predict_false(cl == NULL)) {
 			/*
 			 * Switch to the safe zone, for this and all later
 			 * allocations in this call, and try once more.
 			 */
 			if (zidx != fl->safe_zidx) {
 				zidx = fl->safe_zidx;
 				rxb = &sc->sge.rx_buf_info[zidx];
 				cl = uma_zalloc(rxb->zone, M_NOWAIT);
 			}
 			if (cl == NULL)
 				break;
 		}
 		fl->cl_allocated++;
 		n--;
 
 		pa = pmap_kextract((vm_offset_t)cl);
 		sd->cl = cl;
 		sd->zidx = zidx;
 
 		/* The hw buffer size index is OR'd into the address. */
 		if (fl->flags & FL_BUF_PACKING) {
 			*d = htobe64(pa | rxb->hwidx2);
 			sd->moff = rxb->size2;
 		} else {
 			*d = htobe64(pa | rxb->hwidx1);
 			sd->moff = 0;
 		}
 recycled:
 		sd->nmbuf = 0;
 		d++;
 		sd++;
 		/* Extra work whenever a hardware descriptor is completed. */
 		if (__predict_false((++fl->pidx & 7) == 0)) {
 			uint16_t pidx = fl->pidx >> 3;
 
 			/* Wrap around at the end of the ring. */
 			if (__predict_false(pidx == fl->sidx)) {
 				fl->pidx = 0;
 				pidx = 0;
 				sd = fl->sdesc;
 				d = fl->desc;
 			}
 			/*
 			 * Stop if the remaining budget is less than 8 or if
 			 * the limit computed at the top has been reached.
 			 */
 			if (n < 8 || pidx == max_pidx)
 				break;
 
 			/* Ring the doorbell once 4 descriptors are pending. */
 			if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
 				ring_fl_db(sc, fl);
 		}
 	}
 
 	/* Hand over any completed descriptors not yet given to the chip. */
 	if ((fl->pidx >> 3) != fl->dbidx)
 		ring_fl_db(sc, fl);
 
 	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
 }
 
 /*
  * Attempt to refill all starving freelists.
  */
 static void
 refill_sfl(void *arg)
 {
 	struct adapter *sc = arg;
 	struct sge_fl *fl, *fl_temp;
 
 	mtx_assert(&sc->sfl_lock, MA_OWNED);
 	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
 		FL_LOCK(fl);
 		refill_fl(sc, fl, 64);
 		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
 			TAILQ_REMOVE(&sc->sfl, fl, link);
 			fl->flags &= ~FL_STARVING;
 		}
 		FL_UNLOCK(fl);
 	}
 
 	if (!TAILQ_EMPTY(&sc->sfl))
 		callout_schedule(&sc->sfl_callout, hz / 5);
 }
 
 /*
  * Drops the driver's reference on every buffer in the given freelist and
  * clears the freelist's software descriptors.  Buffers that still have kernel
  * references cannot be freed here and will prevent the driver from being
  * unloaded safely.  A partially assembled mbuf chain (FL_BUF_RESUME) is freed
  * as well.
  */
 void
 free_fl_buffers(struct adapter *sc, struct sge_fl *fl)
 {
 	struct cluster_metadata *clm;
 	struct fl_sdesc *sd;
 	int i;
 
 	for (i = 0, sd = fl->sdesc; i < fl->sidx * 8; i++, sd++) {
 		if (sd->cl != NULL) {
 			if (sd->nmbuf == 0) {
 				/* Freed directly, no refcount involved. */
 				uma_zfree(sc->sge.rx_buf_info[sd->zidx].zone,
 				    sd->cl);
 			} else if (fl->flags & FL_BUF_PACKING) {
 				/* Free only if ours was the last reference. */
 				clm = cl_metadata(sd);
 				if (atomic_fetchadd_int(&clm->refcount, -1) ==
 				    1) {
 					uma_zfree(
 					    sc->sge.rx_buf_info[sd->zidx].zone,
 					    sd->cl);
 					counter_u64_add(extfree_rels, 1);
 				}
 			}
 			sd->cl = NULL;
 		}
 	}
 
 	if (!(fl->flags & FL_BUF_RESUME))
 		return;
 
 	m_freem(fl->m0);
 	fl->flags &= ~FL_BUF_RESUME;
 }
 
 static inline void
 get_pkt_gl(struct mbuf *m, struct sglist *gl)
 {
 	int rc;
 
 	M_ASSERTPKTHDR(m);
 
 	sglist_reset(gl);
 	rc = sglist_append_mbuf(gl, m);
 	if (__predict_false(rc != 0)) {
 		panic("%s: mbuf %p (%d segs) was vetted earlier but now fails "
 		    "with %d.", __func__, m, mbuf_nsegs(m), rc);
 	}
 
 	KASSERT(gl->sg_nseg == mbuf_nsegs(m),
 	    ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
 	    mbuf_nsegs(m), gl->sg_nseg));
 #if 0	/* vm_wr not readily available here. */
 	KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= max_nsegs_allowed(m, vm_wr),
 	    ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
 		gl->sg_nseg, max_nsegs_allowed(m, vm_wr)));
 #endif
 }
 
 /*
  * Length, in 16-byte units, of a txpkt WR that carries a gather list with
  * 'nsegs' segments and 'extra' additional bytes.  The firmware work request
  * header is included.
  */
 static inline u_int
 txpkt_len16(u_int nsegs, const u_int extra)
 {
 	u_int nbytes, rest;
 
 	MPASS(nsegs > 0);
 
 	/* The first segment lives inside the ulptx_sgl itself. */
 	rest = nsegs - 1;
 
 	nbytes = extra + sizeof(struct fw_eth_tx_pkt_wr) +
 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
 	/* 24 bytes per pair of remaining segments, 16 for an odd one. */
 	nbytes += 8 * ((3 * rest) / 2 + (rest & 1));
 
 	return (howmany(nbytes, 16));
 }
 
 /*
  * Length, in 16-byte units, of a txpkt_vm WR that carries a gather list with
  * 'nsegs' segments and 'extra' additional bytes.  The firmware work request
  * header is included.
  */
 static inline u_int
 txpkt_vm_len16(u_int nsegs, const u_int extra)
 {
 	u_int nbytes, rest;
 
 	MPASS(nsegs > 0);
 
 	/* The first segment lives inside the ulptx_sgl itself. */
 	rest = nsegs - 1;
 
 	nbytes = extra + sizeof(struct fw_eth_tx_pkt_vm_wr) +
 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
 	/* 24 bytes per pair of remaining segments, 16 for an odd one. */
 	nbytes += 8 * ((3 * rest) / 2 + (rest & 1));
 
 	return (howmany(nbytes, 16));
 }
 
 /*
  * Computes the len16 of the work request needed to transmit 'm' and records
  * it in the mbuf.  'vm_wr' selects the txpkt_vm flavor of the work request.
  * Room for an LSO CPL is added when the mbuf needs TSO; the tunnel LSO CPL is
  * used instead for VXLAN TSO, but only with the non-VM work request.
  */
 static inline void
 calculate_mbuf_len16(struct mbuf *m, bool vm_wr)
 {
 	u_int extra = 0;
 
 	if (needs_tso(m)) {
 		if (!vm_wr && needs_vxlan_tso(m))
 			extra = sizeof(struct cpl_tx_tnl_lso);
 		else
 			extra = sizeof(struct cpl_tx_pkt_lso_core);
 	}
 
 	if (vm_wr)
 		set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), extra));
 	else
 		set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), extra));
 }
 
 /*
  * Length, in 16-byte units, of one packet's share of a txpkts type 0 WR when
  * the packet has a gather list with 'nsegs' segments.  The firmware work
  * request header is not included.
  */
 static inline u_int
 txpkts0_len16(u_int nsegs)
 {
 	u_int nbytes, rest;
 
 	MPASS(nsegs > 0);
 
 	/* The first segment lives inside the ulptx_sgl itself. */
 	rest = nsegs - 1;
 
 	nbytes = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) +
 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
 	/* 24 bytes per pair of remaining segments, 16 for an odd one. */
 	nbytes += 8 * ((3 * rest) / 2 + (rest & 1));
 
 	return (howmany(nbytes, 16));
 }
 
 /*
  * Length, in 16-byte units, of one packet's share of a txpkts type 1 WR: a
  * cpl_tx_pkt_core followed by a ulptx_sgl.  The firmware work request header
  * is not included.
  */
 static inline u_int
 txpkts1_len16(void)
 {
 	const u_int nbytes = sizeof(struct cpl_tx_pkt_core) +
 	    sizeof(struct ulptx_sgl);
 
 	return (howmany(nbytes, 16));
 }
 
 /*
  * Number of bytes left in 'ndesc' descriptors after the fw_eth_tx_pkt_wr and
  * cpl_tx_pkt_core headers have been accounted for.
  */
 static inline u_int
 imm_payload(u_int ndesc)
 {
 	const size_t hdrs = sizeof(struct fw_eth_tx_pkt_wr) +
 	    sizeof(struct cpl_tx_pkt_core);
 
 	return (ndesc * EQ_ESIZE - hdrs);
 }
 
 /*
  * Builds the checksum related bits of a tx packet's control word from the
  * checksum flags and header lengths recorded in the mbuf's packet header.
  *
  * If the mbuf doesn't need any hardware checksumming then both the IP and
  * the L4 checksums are disabled.  Otherwise the result carries the checksum
  * type, the L3 header length, and the L2 header length (in the field that is
  * appropriate for the chip), plus the disable bit for whichever of the two
  * checksums is not wanted.  For VXLAN the lengths describe the inner frame
  * and all the outer headers are counted as part of the L2 header.
  */
 static inline uint64_t
 csum_to_ctrl(struct adapter *sc, struct mbuf *m)
 {
 	uint64_t ctrl;
 	int csum_type, l2hlen, l3hlen;
 	int x, y;
 	/* Rows: TCP, UDP, neither.  Columns: IPv4, IPv6. */
 	static const int csum_types[3][2] = {
 		{TX_CSUM_TCPIP, TX_CSUM_TCPIP6},
 		{TX_CSUM_UDPIP, TX_CSUM_UDPIP6},
 		{TX_CSUM_IP, 0}
 	};
 
 	M_ASSERTPKTHDR(m);
 
 	if (!needs_hwcsum(m))
 		return (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS);
 
 	MPASS(m->m_pkthdr.l2hlen >= ETHER_HDR_LEN);
 	MPASS(m->m_pkthdr.l3hlen >= sizeof(struct ip));
 
 	if (needs_vxlan_csum(m)) {
 		MPASS(m->m_pkthdr.l4hlen > 0);
 		MPASS(m->m_pkthdr.l5hlen > 0);
 		MPASS(m->m_pkthdr.inner_l2hlen >= ETHER_HDR_LEN);
 		MPASS(m->m_pkthdr.inner_l3hlen >= sizeof(struct ip));
 
 		/* Everything up to the inner L3 header, less one Ethernet hdr. */
 		l2hlen = m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen +
 		    m->m_pkthdr.l4hlen + m->m_pkthdr.l5hlen +
 		    m->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN;
 		l3hlen = m->m_pkthdr.inner_l3hlen;
 	} else {
 		l2hlen = m->m_pkthdr.l2hlen - ETHER_HDR_LEN;
 		l3hlen = m->m_pkthdr.l3hlen;
 	}
 
 	ctrl = 0;
 	if (!needs_l3_csum(m))
 		ctrl |= F_TXPKT_IPCSUM_DIS;
 
 	/* x selects the row (L4 protocol) in csum_types. */
 	if (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_INNER_IP_TCP |
 	    CSUM_IP6_TCP | CSUM_INNER_IP6_TCP))
 		x = 0;	/* TCP */
 	else if (m->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_INNER_IP_UDP |
 	    CSUM_IP6_UDP | CSUM_INNER_IP6_UDP))
 		x = 1;	/* UDP */
 	else
 		x = 2;
 
 	/* y selects the column (IP version) in csum_types. */
 	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP |
 	    CSUM_INNER_IP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_UDP))
 		y = 0;	/* IPv4 */
 	else {
 		MPASS(m->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP |
 		    CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_UDP));
 		y = 1;	/* IPv6 */
 	}
 	/*
 	 * needs_hwcsum returned true earlier so there must be some kind of
 	 * checksum to calculate.
 	 */
 	csum_type = csum_types[x][y];
 	MPASS(csum_type != 0);
 	/* IP header checksum only; nothing to do at L4. */
 	if (csum_type == TX_CSUM_IP)
 		ctrl |= F_TXPKT_L4CSUM_DIS;
 	ctrl |= V_TXPKT_CSUM_TYPE(csum_type) | V_TXPKT_IPHDR_LEN(l3hlen);
 	/* T6 and later use a different field for the Ethernet header length. */
 	if (chip_id(sc) <= CHELSIO_T5)
 		ctrl |= V_TXPKT_ETHHDR_LEN(l2hlen);
 	else
 		ctrl |= V_T6_TXPKT_ETHHDR_LEN(l2hlen);
 
 	return (ctrl);
 }
 
 /*
  * Writes a CPL_TX_PKT_LSO header for the mbuf 'm0' at 'cpl' and returns a
  * pointer to the location immediately after it.  The L2, L3, and L4 header
  * lengths must already be recorded in the mbuf's packet header.
  */
 static inline void *
 write_lso_cpl(void *cpl, struct mbuf *m0)
 {
 	struct cpl_tx_pkt_lso_core *lso = cpl;
 	uint32_t ctrl;
 
 	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 	    m0->m_pkthdr.l4hlen > 0,
 	    ("%s: mbuf %p needs TSO but missing header lengths",
 		__func__, m0));
 
 	/* Header lengths are shifted right by 2 for the control word. */
 	ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
 	    F_LSO_LAST_SLICE;
 	ctrl |= V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2);
 	ctrl |= V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2);
 	ctrl |= V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 	/* An L3 header the size of an IPv6 header is taken to be IPv6. */
 	if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 		ctrl |= F_LSO_IPV6;
 
 	lso->lso_ctrl = htobe32(ctrl);
 	lso->ipid_ofst = htobe16(0);
 	lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 	lso->seqno_offset = htobe32(0);
 	lso->len = htobe32(m0->m_pkthdr.len);
 
 	return (lso + 1);
 }
 
 /*
  * Write a tunnel LSO CPL (tunnel type VXLAN) for m0 at 'cpl'.  Both the outer
  * (l2hlen..l5hlen) and the inner (inner_l2hlen..inner_l5hlen) header lengths
  * must already be set in the mbuf's packet header.
  *
  * Returns the address immediately after the CPL that was written.
  */
 static void *
 write_tnl_lso_cpl(void *cpl, struct mbuf *m0)
 {
 	struct cpl_tx_tnl_lso *tnl_lso = cpl;
 	uint32_t ctrl;
 
 	KASSERT(m0->m_pkthdr.inner_l2hlen > 0 &&
 	    m0->m_pkthdr.inner_l3hlen > 0 && m0->m_pkthdr.inner_l4hlen > 0 &&
 	    m0->m_pkthdr.inner_l5hlen > 0,
 	    ("%s: mbuf %p needs VXLAN_TSO but missing inner header lengths",
 		__func__, m0));
 	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 	    m0->m_pkthdr.l4hlen > 0 && m0->m_pkthdr.l5hlen > 0,
 	    ("%s: mbuf %p needs VXLAN_TSO but missing outer header lengths",
 		__func__, m0));
 
 	/*
 	 * Outer headers.  The Ethernet and IP header lengths are shifted right
 	 * by 2, i.e. they are expressed in 4-byte words.
 	 */
 	ctrl = V_CPL_TX_TNL_LSO_OPCODE(CPL_TX_TNL_LSO) |
 	    F_CPL_TX_TNL_LSO_FIRST | F_CPL_TX_TNL_LSO_LAST |
 	    V_CPL_TX_TNL_LSO_ETHHDRLENOUT(
 		(m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) |
 	    V_CPL_TX_TNL_LSO_IPHDRLENOUT(m0->m_pkthdr.l3hlen >> 2) |
 	    F_CPL_TX_TNL_LSO_IPLENSETOUT;
 	/*
 	 * An outer L3 header exactly as long as struct ip6_hdr is treated as
 	 * IPv6; anything else gets the IPv4-only flags (header checksum and
 	 * IP id increment).
 	 */
 	if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 		ctrl |= F_CPL_TX_TNL_LSO_IPV6OUT;
 	else {
 		ctrl |= F_CPL_TX_TNL_LSO_IPHDRCHKOUT |
 		    F_CPL_TX_TNL_LSO_IPIDINCOUT;
 	}
 	tnl_lso->op_to_IpIdSplitOut = htobe32(ctrl);
 	tnl_lso->IpIdOffsetOut = 0;
 	/* The tunnel header length is the sum of all outer header lengths. */
 	tnl_lso->UdpLenSetOut_to_TnlHdrLen =
 		htobe16(F_CPL_TX_TNL_LSO_UDPCHKCLROUT |
 		    F_CPL_TX_TNL_LSO_UDPLENSETOUT |
 		    V_CPL_TX_TNL_LSO_TNLHDRLEN(m0->m_pkthdr.l2hlen +
 			m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen +
 			m0->m_pkthdr.l5hlen) |
 		    V_CPL_TX_TNL_LSO_TNLTYPE(TX_TNL_TYPE_VXLAN));
 	tnl_lso->r1 = 0;
 
 	/* Inner headers, again in 4-byte words. */
 	ctrl = V_CPL_TX_TNL_LSO_ETHHDRLEN(
 	    (m0->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN) >> 2) |
 	    V_CPL_TX_TNL_LSO_IPHDRLEN(m0->m_pkthdr.inner_l3hlen >> 2) |
 	    V_CPL_TX_TNL_LSO_TCPHDRLEN(m0->m_pkthdr.inner_l4hlen >> 2);
 	if (m0->m_pkthdr.inner_l3hlen == sizeof(struct ip6_hdr))
 		ctrl |= F_CPL_TX_TNL_LSO_IPV6;
 	tnl_lso->Flow_to_TcpHdrLen = htobe32(ctrl);
 	tnl_lso->IpIdOffset = 0;
 	tnl_lso->IpIdSplit_to_Mss =
 	    htobe16(V_CPL_TX_TNL_LSO_MSS(m0->m_pkthdr.tso_segsz));
 	tnl_lso->TCPSeqOffset = 0;
 	/* Total length of the packet being segmented. */
 	tnl_lso->EthLenOffset_Size =
 	    htobe32(V_CPL_TX_TNL_LSO_SIZE(m0->m_pkthdr.len));
 
 	return (tnl_lso + 1);
 }
 
 /*
  * Number of bytes at the start of the frame that the VM work requests carry in
  * their own L2 header fields (copied starting at ethmacdst).
  */
 #define VM_TX_L2HDR_LEN	16	/* ethmacdst to vlantci */
 
 /*
  * Write a VM txpkt WR for this packet to the hardware descriptors and update
  * the software descriptor at the current pidx.  It is guaranteed that enough
  * descriptors are available.
  *
  * eq->pidx is not modified in this function; presumably the caller advances it
  * by the return value.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0)
 {
 	struct sge_eq *eq;
 	struct fw_eth_tx_pkt_vm_wr *wr;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen, nsegs;
 	caddr_t dst;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m0);
 
 	len16 = mbuf_len16(m0);
 	/* NOTE(review): nsegs is assigned here but never read afterwards. */
 	nsegs = mbuf_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	/* ctrl is first used as the immediate data length of the WR. */
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	ndesc = tx_len16_to_desc(len16);
 
 	/* Firmware work request header */
 	eq = &txq->eq;
 	wr = (void *)&eq->desc[eq->pidx];
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3[0] = 0;
 	wr->r3[1] = 0;
 
 	/*
 	 * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci.
 	 * vlantci is ignored unless the ethtype is 0x8100, so it's
 	 * simpler to always copy it rather than making it
 	 * conditional.  Also, it seems that we do not have to set
 	 * vlantci or fake the ethtype when doing VLAN tag insertion.
 	 */
 	m_copydata(m0, 0, VM_TX_L2HDR_LEN, wr->ethmacdst);
 
 	/* The optional LSO CPL sits between the WR header and the pkt CPL. */
 	if (needs_tso(m0)) {
 		cpl = write_lso_cpl(wr + 1, m0);
 		txq->tso_wrs++;
 	} else
 		cpl = (void *)(wr + 1);
 
 	/* Checksum offload */
 	ctrl1 = csum_to_ctrl(sc, m0);
 	if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
 		txq->txcsum++;	/* some hardware assistance provided */
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD |
 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 		txq->vlan_insertion++;
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 
 	/*
 	 * A packet using TSO will use up an entire descriptor for the
 	 * firmware work request header, LSO CPL, and TX_PKT_XT CPL.
 	 * If this descriptor is the last descriptor in the ring, wrap
 	 * around to the front of the ring explicitly for the start of
 	 * the sgl.
 	 */
 	if (dst == (void *)&eq->desc[eq->sidx]) {
 		dst = (void *)&eq->desc[0];
 		/* Already wrapped; the SGL itself cannot wrap again. */
 		write_gl_to_txd(txq, m0, &dst, 0);
 	} else
 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
 	txq->sgl_wrs++;
 	txq->txpkt_wrs++;
 
 	/* Record the mbuf and descriptor count against the first descriptor. */
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Copy a pre-built ("raw") work request, held in the mbuf chain m0, into the
  * hardware descriptor ring starting at 'wr', and record it in the software
  * descriptor for the current pidx.  The caller guarantees that 'available'
  * descriptors are enough for the WR.
  *
  * eq->pidx is left untouched here.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_raw_wr(struct sge_txq *txq, void *wr, struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *sd;
 	struct mbuf *seg;
 	caddr_t wptr = wr;
 	int ndesc;
 
 	ndesc = tx_len16_to_desc(mbuf_len16(m0));
 	MPASS(ndesc <= available);
 
 	/* Copy every mbuf of the chain; copy_to_txd handles ring wrap. */
 	seg = m0;
 	while (seg != NULL) {
 		copy_to_txd(eq, mtod(seg, caddr_t), &wptr, seg->m_len);
 		seg = seg->m_next;
 	}
 
 	txq->raw_wrs++;
 
 	sd = &txq->sdesc[eq->pidx];
 	sd->m = m0;
 	sd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write a txpkt WR for this packet to the hardware descriptors and update the
  * software descriptor at the current pidx.  It is guaranteed that enough
  * descriptors are available.
  *
  * The payload is sent as immediate data when the packet does not need TSO, is
  * not flagged MC_NOMAP, is no longer than imm_payload(2), and at least 2
  * descriptors are available.  Otherwise it is described by an SGL.
  *
  * eq->pidx is not modified in this function; presumably the caller advances it
  * by the return value.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkt_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0,
     u_int available)
 {
 	struct sge_eq *eq;
 	struct fw_eth_tx_pkt_wr *wr;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen, nsegs;
 	caddr_t dst;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m0);
 
 	len16 = mbuf_len16(m0);
 	nsegs = mbuf_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	/* ctrl accumulates the immediate data length of the WR. */
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0)) {
 		if (needs_vxlan_tso(m0))
 			ctrl += sizeof(struct cpl_tx_tnl_lso);
 		else
 			ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	} else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) &&
 	    available >= 2) {
 		/* Immediate data.  Recalculate len16 and set nsegs to 0. */
 		ctrl += pktlen;
 		len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
 		    sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
 		nsegs = 0;
 	}
 	ndesc = tx_len16_to_desc(len16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	eq = &txq->eq;
 	wr = (void *)&eq->desc[eq->pidx];
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	/* The optional LSO CPL sits between the WR header and the pkt CPL. */
 	if (needs_tso(m0)) {
 		if (needs_vxlan_tso(m0)) {
 			cpl = write_tnl_lso_cpl(wr + 1, m0);
 			txq->vxlan_tso_wrs++;
 		} else {
 			cpl = write_lso_cpl(wr + 1, m0);
 			txq->tso_wrs++;
 		}
 	} else
 		cpl = (void *)(wr + 1);
 
 	/* Checksum offload */
 	ctrl1 = csum_to_ctrl(sc, m0);
 	if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) {
 		/* some hardware assistance provided */
 		if (needs_vxlan_csum(m0))
 			txq->vxlan_txcsum++;
 		else
 			txq->txcsum++;
 	}
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD |
 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 		txq->vlan_insertion++;
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 	/* If the headers ended exactly at the end of the ring, wrap first. */
 	if (__predict_false((uintptr_t)dst == (uintptr_t)&eq->desc[eq->sidx]))
 		dst = (caddr_t)&eq->desc[0];
 	if (nsegs > 0) {
 
 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
 		txq->sgl_wrs++;
 	} else {
 		struct mbuf *m;
 
 		/* Immediate data: copy the payload itself into the ring. */
 		for (m = m0; m != NULL; m = m->m_next) {
 			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
 #ifdef INVARIANTS
 			pktlen -= m->m_len;
 #endif
 		}
 #ifdef INVARIANTS
 		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
 #endif
 		txq->imm_wrs++;
 	}
 
 	txq->txpkt_wrs++;
 
 	/* Record the mbuf and descriptor count against the first descriptor. */
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Compare the start of m's data with the L2 header saved in txp.  Returns true
  * when they differ.  The full VM_TX_L2HDR_LEN bytes are compared only when
  * txp->ethtype matches ETHERTYPE_VLAN; otherwise only a struct ether_header's
  * worth of bytes is compared.
  */
 static inline bool
 cmp_l2hdr(struct txpkts *txp, struct mbuf *m)
 {
 	int cmplen = sizeof(struct ether_header);
 
 	MPASS(txp->npkt > 0);
 	MPASS(m->m_len >= VM_TX_L2HDR_LEN);
 
 	if (txp->ethtype == be16toh(ETHERTYPE_VLAN))
 		cmplen = VM_TX_L2HDR_LEN;
 
 	return (memcmp(m->m_data, &txp->ethmacdst[0], cmplen) != 0);
 }
 
 /*
  * Remember the first VM_TX_L2HDR_LEN bytes of m's data in txp, starting at
  * txp->ethmacdst.  The first mbuf must hold at least that many bytes.
  */
 static inline void
 save_l2hdr(struct txpkts *txp, struct mbuf *m)
 {
 	const void *l2hdr;
 
 	MPASS(m->m_len >= VM_TX_L2HDR_LEN);
 
 	l2hdr = mtod(m, const void *);
 	memcpy(&txp->ethmacdst[0], l2hdr, VM_TX_L2HDR_LEN);
 }
 
 /*
  * Try to add mbuf m to the txpkts (coalesced) work request being assembled in
  * txq->txp, VF flavor.  'avail' is the number of descriptors available.
  *
  * Returns:
  *   0       m was added.  *send is true if the WR is now full (max_npkt).
  *   EINVAL  m cannot be coalesced at all.  *send is true if there are
  *           previously added packets pending in txp.
  *   EAGAIN  m does not fit with what is already pending.  *send is true.
  */
 static int
 add_to_txpkts_vf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m,
     int avail, bool *send)
 {
 	struct txpkts *txp = &txq->txp;
 
 	/* Cannot have TSO and coalesce at the same time. */
 	if (cannot_use_txpkts(m)) {
 		/* Label is also reached by goto from the checks below. */
 cannot_coalesce:
 		*send = txp->npkt > 0;
 		return (EINVAL);
 	}
 
 	/* VF allows coalescing of type 1 (1 GL) only */
 	if (mbuf_nsegs(m) > 1)
 		goto cannot_coalesce;
 
 	*send = false;
 	if (txp->npkt > 0) {
 		MPASS(tx_len16_to_desc(txp->len16) <= avail);
 		MPASS(txp->npkt < txp->max_npkt);
 		MPASS(txp->wr_type == 1);	/* VF supports type 1 only */
 
 		/* Not enough descriptors for one more packet. */
 		if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) > avail) {
 retry_after_send:
 			*send = true;
 			return (EAGAIN);
 		}
 		/* Total payload of a WR is capped at 65535 bytes. */
 		if (m->m_pkthdr.len + txp->plen > 65535)
 			goto retry_after_send;
 		/* All packets in the WR must have the same saved L2 header. */
 		if (cmp_l2hdr(txp, m))
 			goto retry_after_send;
 
 		txp->len16 += txpkts1_len16();
 		txp->plen += m->m_pkthdr.len;
 		txp->mb[txp->npkt++] = m;
 		if (txp->npkt == txp->max_npkt)
 			*send = true;
 	} else {
 		/* First packet: WR header plus one type 1 entry. */
 		txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_vm_wr), 16) +
 		    txpkts1_len16();
 		if (tx_len16_to_desc(txp->len16) > avail)
 			goto cannot_coalesce;
 		txp->npkt = 1;
 		txp->wr_type = 1;
 		txp->plen = m->m_pkthdr.len;
 		txp->mb[0] = m;
 		save_l2hdr(txp, m);
 	}
 	return (0);
 }
 
 /*
  * Try to add mbuf m to the txpkts (coalesced) work request being assembled in
  * txq->txp, PF flavor.  'avail' is the number of descriptors available.
  *
  * The first packet decides the WR type: type 0 if it has more than one
  * segment, type 1 otherwise.  A type 1 WR accepts only single segment packets
  * afterwards.
  *
  * Returns:
  *   0       m was added.  *send is true if the WR is now full (max_npkt).
  *   EINVAL  m cannot be coalesced at all.  *send is true if there are
  *           previously added packets pending in txp.
  *   EAGAIN  m does not fit with what is already pending.  *send is true.
  */
 static int
 add_to_txpkts_pf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m,
     int avail, bool *send)
 {
 	struct txpkts *txp = &txq->txp;
 	int nsegs;
 
 	MPASS(!(sc->flags & IS_VF));
 
 	/* Cannot have TSO and coalesce at the same time. */
 	if (cannot_use_txpkts(m)) {
 		/* Label is also reached by goto from the checks below. */
 cannot_coalesce:
 		*send = txp->npkt > 0;
 		return (EINVAL);
 	}
 
 	*send = false;
 	nsegs = mbuf_nsegs(m);
 	if (txp->npkt == 0) {
 		/* Total payload of a WR is capped at 65535 bytes. */
 		if (m->m_pkthdr.len > 65535)
 			goto cannot_coalesce;
 		if (nsegs > 1) {
 			txp->wr_type = 0;
 			txp->len16 =
 			    howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) +
 			    txpkts0_len16(nsegs);
 		} else {
 			txp->wr_type = 1;
 			txp->len16 =
 			    howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) +
 			    txpkts1_len16();
 		}
 		if (tx_len16_to_desc(txp->len16) > avail)
 			goto cannot_coalesce;
 		txp->npkt = 1;
 		txp->plen = m->m_pkthdr.len;
 		txp->mb[0] = m;
 	} else {
 		MPASS(tx_len16_to_desc(txp->len16) <= avail);
 		MPASS(txp->npkt < txp->max_npkt);
 
 		if (m->m_pkthdr.len + txp->plen > 65535) {
 retry_after_send:
 			*send = true;
 			return (EAGAIN);
 		}
 
 		MPASS(txp->wr_type == 0 || txp->wr_type == 1);
 		if (txp->wr_type == 0) {
 			/* Must fit in both the ring space and a single WR. */
 			if (tx_len16_to_desc(txp->len16 +
 			    txpkts0_len16(nsegs)) > min(avail, SGE_MAX_WR_NDESC))
 				goto retry_after_send;
 			txp->len16 += txpkts0_len16(nsegs);
 		} else {
 			if (nsegs != 1)
 				goto retry_after_send;
 			if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) >
 			    avail)
 				goto retry_after_send;
 			txp->len16 += txpkts1_len16();
 		}
 
 		txp->plen += m->m_pkthdr.len;
 		txp->mb[txp->npkt++] = m;
 		if (txp->npkt == txp->max_npkt)
 			*send = true;
 	}
 	return (0);
 }
 
 /*
  * Write a txpkts WR for the packets in txp to the hardware descriptors and
  * update the software descriptor at the current pidx.  It is guaranteed that
  * enough descriptors are available.
  *
  * The mbufs in txp are linked together via m_nextpkt and only the first one
  * is recorded in the software descriptor.
  *
  * eq->pidx is not modified in this function; presumably the caller advances it
  * by the return value.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkts_wr(struct adapter *sc, struct sge_txq *txq)
 {
 	const struct txpkts *txp = &txq->txp;
 	struct sge_eq *eq = &txq->eq;
 	struct fw_eth_tx_pkts_wr *wr;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint64_t ctrl1;
 	int ndesc, i, checkwrap;
 	struct mbuf *m, *last;
 	void *flitp;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	MPASS(txp->npkt > 0);
 	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
 
 	/* Firmware work request header */
 	wr = (void *)&eq->desc[eq->pidx];
 	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
 	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16));
 	wr->plen = htobe16(txp->plen);
 	wr->npkt = txp->npkt;
 	wr->r3 = 0;
 	wr->type = txp->wr_type;
 	flitp = wr + 1;
 
 	/*
 	 * At this point we are 16B into a hardware descriptor.  If checkwrap is
 	 * set then we know the WR is going to wrap around somewhere.  We'll
 	 * check for that at appropriate points.
 	 */
 	ndesc = tx_len16_to_desc(txp->len16);
 	last = NULL;
 	checkwrap = eq->sidx - ndesc < eq->pidx;
 	for (i = 0; i < txp->npkt; i++) {
 		m = txp->mb[i];
 		if (txp->wr_type == 0) {
 			/* Type 0: each packet is wrapped in ULP commands. */
 			struct ulp_txpkt *ulpmc;
 			struct ulptx_idata *ulpsc;
 
 			/* ULP master command */
 			ulpmc = flitp;
 			ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
 			    V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
 			ulpmc->len = htobe32(txpkts0_len16(mbuf_nsegs(m)));
 
 			/* ULP subcommand */
 			ulpsc = (void *)(ulpmc + 1);
 			ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
 			    F_ULP_TX_SC_MORE);
 			ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
 
 			cpl = (void *)(ulpsc + 1);
 			if (checkwrap &&
 			    (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx])
 				cpl = (void *)&eq->desc[0];
 		} else {
 			/* Type 1: the CPL follows directly. */
 			cpl = flitp;
 		}
 
 		/* Checksum offload */
 		ctrl1 = csum_to_ctrl(sc, m);
 		if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) {
 			/* some hardware assistance provided */
 			if (needs_vxlan_csum(m))
 				txq->vxlan_txcsum++;
 			else
 				txq->txcsum++;
 		}
 
 		/* VLAN tag insertion */
 		if (needs_vlan_insertion(m)) {
 			ctrl1 |= F_TXPKT_VLAN_VLD |
 			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
 			txq->vlan_insertion++;
 		}
 
 		/* CPL header */
 		cpl->ctrl0 = txq->cpl_ctrl0;
 		cpl->pack = 0;
 		cpl->len = htobe16(m->m_pkthdr.len);
 		cpl->ctrl1 = htobe64(ctrl1);
 
 		/* SGL for this packet, wrapping first if at the ring's end. */
 		flitp = cpl + 1;
 		if (checkwrap &&
 		    (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
 			flitp = (void *)&eq->desc[0];
 
 		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
 
 		/* Chain the packets so they can be found from mb[0]. */
 		if (last != NULL)
 			last->m_nextpkt = m;
 		last = m;
 	}
 
 	txq->sgl_wrs++;
 	if (txp->wr_type == 0) {
 		txq->txpkts0_pkts += txp->npkt;
 		txq->txpkts0_wrs++;
 	} else {
 		txq->txpkts1_pkts += txp->npkt;
 		txq->txpkts1_wrs++;
 	}
 
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = txp->mb[0];
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * VM (VF) variant of the txpkts WR writer.  Writes one FW_ETH_TX_PKTS_VM_WR for
  * the packets in txp, all of which must be single segment (type 1), and
  * records the first mbuf in the software descriptor at the current pidx.  The
  * mbufs are linked together via m_nextpkt.
  *
  * eq->pidx is not modified in this function; presumably the caller advances it
  * by the return value.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkts_vm_wr(struct adapter *sc, struct sge_txq *txq)
 {
 	const struct txpkts *txp = &txq->txp;
 	struct sge_eq *eq = &txq->eq;
 	struct fw_eth_tx_pkts_vm_wr *wr;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint64_t ctrl1;
 	int ndesc, i;
 	struct mbuf *m, *last;
 	void *flitp;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	MPASS(txp->npkt > 0);
 	MPASS(txp->wr_type == 1);	/* VF supports type 1 only */
 	MPASS(txp->mb[0] != NULL);
 	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
 
 	/* Firmware work request header */
 	wr = (void *)&eq->desc[eq->pidx];
 	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_VM_WR));
 	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16));
 	wr->r3 = 0;
 	wr->plen = htobe16(txp->plen);
 	wr->npkt = txp->npkt;
 	wr->r4 = 0;
 	/*
 	 * L2 header saved when the first packet was added to txp.
 	 * NOTE(review): the literal 16 looks like VM_TX_L2HDR_LEN -- confirm.
 	 */
 	memcpy(&wr->ethmacdst[0], &txp->ethmacdst[0], 16);
 	flitp = wr + 1;
 
 	/*
 	 * At this point we are 32B into a hardware descriptor.  Each mbuf in
 	 * the WR will take 32B so we check for the end of the descriptor ring
 	 * before writing odd mbufs (mb[1], 3, 5, ..)
 	 */
 	ndesc = tx_len16_to_desc(txp->len16);
 	last = NULL;
 	for (i = 0; i < txp->npkt; i++) {
 		m = txp->mb[i];
 		if (i & 1 && (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
 			flitp = &eq->desc[0];
 		cpl = flitp;
 
 		/* Checksum offload */
 		ctrl1 = csum_to_ctrl(sc, m);
 		if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
 			txq->txcsum++;	/* some hardware assistance provided */
 
 		/* VLAN tag insertion */
 		if (needs_vlan_insertion(m)) {
 			ctrl1 |= F_TXPKT_VLAN_VLD |
 			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
 			txq->vlan_insertion++;
 		}
 
 		/* CPL header */
 		cpl->ctrl0 = txq->cpl_ctrl0;
 		cpl->pack = 0;
 		cpl->len = htobe16(m->m_pkthdr.len);
 		cpl->ctrl1 = htobe64(ctrl1);
 
 		/* Single segment SGL; no wrap check needed inside it. */
 		flitp = cpl + 1;
 		MPASS(mbuf_nsegs(m) == 1);
 		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), 0);
 
 		/* Chain the packets so they can be found from mb[0]. */
 		if (last != NULL)
 			last->m_nextpkt = m;
 		last = m;
 	}
 
 	txq->sgl_wrs++;
 	txq->txpkts1_pkts += txp->npkt;
 	txq->txpkts1_wrs++;
 
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = txp->mb[0];
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write the scatter/gather list for mbuf m into the descriptor ring at *to,
  * which must be 16 byte aligned and inside the ring.  On return *to points
  * just past the SGL (or at the start of the ring if the SGL ended exactly at
  * the end of the ring).
  *
  * checkwrap is non-zero if the SGL may cross the end of the ring, in which case
  * it is written flit by flit with a wrap check before each flit.
  *
  * If the SGL ends on an address that is not 16 byte aligned, this function will
  * add a 0 filled flit at the end.
  */
 static void
 write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct sglist *gl = txq->gl;
 	struct sglist_seg *seg;
 	__be64 *flitp, *wrap;
 	struct ulptx_sgl *usgl;
 	int i, nflits, nsegs;
 
 	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	get_pkt_gl(m, gl);
 	nsegs = gl->sg_nseg;
 	MPASS(nsegs > 0);
 
 	/*
 	 * 2 flits for the ulptx_sgl header with the first segment, then 3 flits
 	 * per pair of additional segments (2 flits for an unpaired last one).
 	 */
 	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
 	flitp = (__be64 *)(*to);
 	wrap = (__be64 *)(&eq->desc[eq->sidx]);
 	seg = &gl->sg_segs[0];
 	usgl = (void *)flitp;
 
 	/*
 	 * We start at a 16 byte boundary somewhere inside the tx descriptor
 	 * ring, so we're at least 16 bytes away from the status page.  There is
 	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
 	 */
 
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 	usgl->len0 = htobe32(seg->ss_len);
 	usgl->addr0 = htobe64(seg->ss_paddr);
 	seg++;
 
 	if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {
 
 		/* Won't wrap around at all */
 
 		for (i = 0; i < nsegs - 1; i++, seg++) {
 			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
 			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
 		}
 		/* Odd number of extra segments: zero the unused length. */
 		if (i & 1)
 			usgl->sge[i / 2].len[1] = htobe32(0);
 		flitp += nflits;
 	} else {
 
 		/* Will wrap somewhere in the rest of the SGL */
 
 		/* 2 flits already written, write the rest flit by flit */
 		flitp = (void *)(usgl + 1);
 		for (i = 0; i < nflits - 2; i++) {
 			if (flitp == wrap)
 				flitp = (void *)eq->desc;
 			*flitp++ = get_flit(seg, nsegs - 1, i);
 		}
 	}
 
 	/* Pad with a zero flit so that the SGL ends on a 16 byte boundary. */
 	if (nflits & 1) {
 		MPASS(((uintptr_t)flitp) & 0xf);
 		*flitp++ = 0;
 	}
 
 	MPASS((((uintptr_t)flitp) & 0xf) == 0);
 	if (__predict_false(flitp == wrap))
 		*to = (void *)eq->desc;
 	else
 		*to = (void *)flitp;
 }
 
 /*
  * Copy 'len' bytes from 'from' into the descriptor ring at *to and advance *to
  * past the copied data.  A copy that would run beyond the end of the ring is
  * split in two, with the remainder placed at the start of the ring.  *to must
  * point inside the ring on entry.
  */
 static inline void
 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
 {
 	int head, tail;
 
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	/* Common case: everything fits before the end of the ring. */
 	if (__predict_true((uintptr_t)(*to) + len <=
 	    (uintptr_t)&eq->desc[eq->sidx])) {
 		bcopy(from, *to, len);
 		*to += len;
 		return;
 	}
 
 	/* 'head' bytes go up to the end of the ring, 'tail' bytes wrap. */
 	head = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to);
 	tail = len - head;
 	bcopy(from, *to, head);
 	bcopy(from + head, (void *)eq->desc, tail);
 	*to = (caddr_t)eq->desc + tail;
 }
 
 /*
  * Ring the doorbell for egress queue eq to tell the hardware about n new
  * descriptors, then move eq->dbidx forward by n.
  *
  * The doorbell mechanism used is the one for the lowest bit set in
  * eq->doorbells (via ffs), except that DOORBELL_WCWR is not considered when
  * n > 1.  A write barrier is issued before any doorbell is rung.
  */
 static inline void
 ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
 {
 	u_int db;
 
 	MPASS(n > 0);
 
 	db = eq->doorbells;
 	if (n > 1)
 		clrbit(&db, DOORBELL_WCWR);
 	wmb();
 
 	switch (ffs(db) - 1) {
 	case DOORBELL_UDB:
 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
 		break;
 
 	case DOORBELL_WCWR: {
 		volatile uint64_t *dst, *src;
 		int i;
 
 		/*
 		 * Queues whose 128B doorbell segment fits in the page do not
 		 * use relative qid (udb_qid is always 0).  Only queues with
 		 * doorbell segments can do WCWR.
 		 */
 		KASSERT(eq->udb_qid == 0 && n == 1,
 		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
 		    __func__, eq->doorbells, n, eq->dbidx, eq));
 
 		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
 		    UDBS_DB_OFFSET);
 		/* Copy the whole descriptor at dbidx, 8 bytes at a time. */
 		i = eq->dbidx;
 		src = (void *)&eq->desc[i];
 		while (src != (void *)&eq->desc[i + 1])
 			*dst++ = *src++;
 		wmb();
 		break;
 	}
 
 	case DOORBELL_UDBWC:
 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
 		wmb();
 		break;
 
 	case DOORBELL_KDB:
 		t4_write_reg(sc, sc->sge_kdoorbell_reg,
 		    V_QID(eq->cntxt_id) | V_PIDX(n));
 		break;
 	}
 
 	/* NOTE(review): IDXINCR presumably wraps dbidx at sidx -- confirm. */
 	IDXINCR(eq->dbidx, n, eq->sidx);
 }
 
 /*
  * Distance, as computed by IDXDIFF over a ring of eq->sidx entries, from the
  * driver's cidx to the cidx most recently published by the hardware.
  */
 static inline u_int
 reclaimable_tx_desc(struct sge_eq *eq)
 {
 	uint16_t hw_cidx = read_hw_cidx(eq);
 
 	return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx));
 }
 
 static inline u_int
 total_available_tx_desc(struct sge_eq *eq)
 {
 	uint16_t hw_cidx, pidx;
 
 	hw_cidx = read_hw_cidx(eq);
 	pidx = eq->pidx;
 
 	if (pidx == hw_cidx)
 		return (eq->sidx - 1);
 	else
 		return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1);
 }
 
 /*
  * Return, in host byte order, the cidx stored in the sge_qstat structure that
  * is located right after the last descriptor (at desc[sidx]).
  */
 static inline uint16_t
 read_hw_cidx(struct sge_eq *eq)
 {
 	struct sge_qstat *status_page;
 	uint16_t raw_cidx;
 
 	status_page = (void *)&eq->desc[eq->sidx];
 	/* Read it exactly once so that the conversion sees a stable value. */
 	raw_cidx = status_page->cidx;
 
 	return (be16toh(raw_cidx));
 }
 
 /*
  * Reclaim approximately 'n' descriptors: whole work requests are reclaimed, so
  * the count can go over n.  Every mbuf hanging off a reclaimed software
  * descriptor (including those linked via m_nextpkt) is freed.
  *
  * Returns the number of descriptors actually reclaimed.
  */
 static u_int
 reclaim_tx_descs(struct sge_txq *txq, u_int n)
 {
 	struct sge_eq *eq = &txq->eq;
 	u_int pending, done;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	MPASS(n > 0);
 
 	done = 0;
 	pending = reclaimable_tx_desc(eq);
 	while (pending && done < n) {
 		struct tx_sdesc *txsd = &txq->sdesc[eq->cidx];
 		struct mbuf *m, *nextpkt;
 		int ndesc = txsd->desc_used;
 
 		/* Firmware doesn't return "partial" credits. */
 		KASSERT(pending >= ndesc,
 		    ("%s: unexpected number of credits: %d, %d",
 		    __func__, pending, ndesc));
 		KASSERT(ndesc != 0,
 		    ("%s: descriptor with no credits: cidx %d",
 		    __func__, eq->cidx));
 
 		/* Unlink and free each packet recorded for this WR. */
 		m = txsd->m;
 		while (m != NULL) {
 			nextpkt = m->m_nextpkt;
 			m->m_nextpkt = NULL;
 			m_freem(m);
 			m = nextpkt;
 		}
 
 		done += ndesc;
 		pending -= ndesc;
 		IDXINCR(eq->cidx, ndesc, eq->sidx);
 	}
 
 	return (done);
 }
 
 /*
  * Task handler that reclaims tx descriptors in batches of 32 for the txq in
  * 'arg'.  It stops as soon as the txq lock cannot be acquired without
  * blocking or a pass reclaims nothing.  The incoming value of 'n' is not
  * used.
  */
 static void
 tx_reclaim(void *arg, int n)
 {
 	struct sge_txq *txq = arg;
 	struct sge_eq *eq = &txq->eq;
 
 	for (;;) {
 		if (TXQ_TRYLOCK(txq) == 0)
 			return;
 		n = reclaim_tx_descs(txq, 32);
 		/* Ring fully drained: bring equeqidx up to pidx. */
 		if (eq->cidx == eq->pidx)
 			eq->equeqidx = eq->pidx;
 		TXQ_UNLOCK(txq);
 		if (n <= 0)
 			return;
 	}
 }
 
 /*
  * Return flit number 'idx' (big-endian) of the part of an SGL that describes
  * the 'nsegs' segments in 'segs'.  Segments are laid out in pairs, 3 flits per
  * pair: one flit with both lengths, then the two addresses.  The second length
  * is left as 0 if the pair has no second segment.
  */
 static __be64
 get_flit(struct sglist_seg *segs, int nsegs, int idx)
 {
 	int pair = (idx / 3) * 2;	/* index of the pair's first segment */
 	int slot = idx % 3;
 	uint64_t lens;
 
 	if (slot == 1)
 		return (htobe64(segs[pair].ss_paddr));
 	if (slot == 2)
 		return (htobe64(segs[pair + 1].ss_paddr));
 	if (slot == 0) {
 		lens = (uint64_t)segs[pair].ss_len << 32;
 		if (pair + 1 < nsegs)
 			lens |= (uint64_t)(segs[pair + 1].ss_len);
 		return (htobe64(lens));
 	}
 
 	return (0);
 }
 
 /*
  * Pick an entry of sc->sge.rx_buf_info to refill a freelist from.
  *
  * Entries are examined in index order.  The first usable entry that is big
  * enough for 'maxp' is returned right away (size2 is compared when packing,
  * size1 otherwise).  If there is none, the last usable entry seen is returned,
  * or -1 if no entry was usable.  The scan stops at the first entry whose size1
  * exceeds largest_rx_cluster.
  */
 static int
 find_refill_source(struct adapter *sc, int maxp, bool packing)
 {
 	struct rx_buf_info *rxb;
 	int i, fallback;
 
 	fallback = -1;
 	for (i = 0; i < SW_ZONE_SIZES; i++) {
 		rxb = &sc->sge.rx_buf_info[i];
 		if (packing) {
 			if (rxb->hwidx2 == -1)
 				continue;
 			if (rxb->size1 < PAGE_SIZE &&
 			    rxb->size1 < largest_rx_cluster)
 				continue;
 			if (rxb->size1 > largest_rx_cluster)
 				break;
 			MPASS(rxb->size1 - rxb->size2 >= CL_METADATA_SIZE);
 			if (rxb->size2 >= maxp)
 				return (i);
 		} else {
 			if (rxb->hwidx1 == -1)
 				continue;
 			if (rxb->size1 > largest_rx_cluster)
 				break;
 			if (rxb->size1 >= maxp)
 				return (i);
 		}
 		fallback = i;
 	}
 
 	return (fallback);
 }
 
 /*
  * Mark freelist fl as starving, append it to the adapter's sfl list, and
  * (re)arm the sfl callout to run refill_sfl in hz / 5 ticks.  Nothing is done
  * for a freelist that is flagged FL_DOOMED.  Both the sfl lock and the
  * freelist lock are held while this happens.
  */
 static void
 add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
 {
 
 	mtx_lock(&sc->sfl_lock);
 	FL_LOCK(fl);
 
 	if (!(fl->flags & FL_DOOMED)) {
 		fl->flags |= FL_STARVING;
 		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
 		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
 	}
 
 	FL_UNLOCK(fl);
 	mtx_unlock(&sc->sfl_lock);
 }
 
 /*
  * Egress update for a work request queue: clear eq->equiq and schedule the
  * queue's tx task on the taskqueue of its tx channel.
  */
 static void
 handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
 {
 	struct sge_wrq *wrq;
 
 	wrq = (void *)eq;
 	atomic_readandclear_int(&eq->equiq);
 	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
 }
 
 /*
  * Egress update for an Ethernet tx queue: clear eq->equiq, then either
  * schedule the reclaim task (ring idle) or ask the mp_ring to check its
  * drainage (ring busy).
  */
 static void
 handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
 {
 	struct sge_txq *txq;
 
 	MPASS(eq->type == EQ_ETH);
 
 	txq = (void *)eq;
 	atomic_readandclear_int(&eq->equiq);
 
 	if (!mp_ring_is_idle(txq->r)) {
 		mp_ring_check_drainage(txq->r, 64);
 		return;
 	}
 	taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
 }
 
 /*
  * CPL handler for SGE egress updates.  Looks up the egress queue named in the
  * message and hands it to the update routine for its type.  The message must
  * not come with a payload mbuf.  Always returns 0.
  */
 static int
 handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	/* Indexed by eq->type; there is no handler in slot 0. */
 	static void (*h[])(struct adapter *, struct sge_eq *) = {
 		NULL,
 		&handle_wrq_egr_update,
 		&handle_eth_egr_update,
 		&handle_wrq_egr_update
 	};
 	const struct cpl_sge_egr_update *egr = (const void *)(rss + 1);
 	struct adapter *sc = iq->adapter;
 	struct sge *s = &sc->sge;
 	struct sge_eq *eq;
 	unsigned int qid;
 
 	qid = G_EGR_QID(ntohl(egr->opcode_qid));
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	eq = s->eqmap[qid - s->eq_start - s->eq_base];
 	(*h[eq->type])(sc, eq);
 
 	return (0);
 }
 
 /*
  * handle_fw_msg works for both fw4_msg and fw6_msg because the data field is
  * at the same offset in both structures, which is what this asserts.
  */
 CTASSERT(offsetof(struct cpl_fw4_msg, data) == \
     offsetof(struct cpl_fw6_msg, data));
 
 /*
  * CPL handler for firmware messages.  A message of type FW_TYPE_RSSCPL or
  * FW6_TYPE_RSSCPL carries an rss_header (plus CPL) in its data and is passed
  * to the CPL handler for the opcode in that header.  Any other message goes to
  * the firmware message handler registered for its type.  Returns whatever the
  * selected handler returns.
  */
 static int
 handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
 	const struct rss_header *rss2;
 	struct adapter *sc = iq->adapter;
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	if (cpl->type != FW_TYPE_RSSCPL && cpl->type != FW6_TYPE_RSSCPL)
 		return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0]));
 
 	/* Encapsulated CPL: dispatch on the inner opcode. */
 	rss2 = (const struct rss_header *)&cpl->data[0];
 	return (t4_cpl_handler[rss2->opcode](iq, rss2, m));
 }
 
 /**
  *	t4_handle_wrerr_rpl - process a FW work request error message
  *	@adap: the adapter
  *	@rpl: start of the FW message
  *
  *	Logs the contents of the message at LOG_ERR.  Returns 0 if the message
  *	was a FW_ERROR_CMD of a known error type, EINVAL otherwise (wrong
  *	opcode or unknown error type).
  */
 static int
 t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl)
 {
 	/* The opcode is the very first byte of the message. */
 	u8 opcode = *(const u8 *)rpl;
 	const struct fw_error_cmd *e = (const void *)rpl;
 	unsigned int i;
 
 	if (opcode != FW_ERROR_CMD) {
 		log(LOG_ERR,
 		    "%s: Received WRERR_RPL message with opcode %#x\n",
 		    device_get_nameunit(adap->dev), opcode);
 		return (EINVAL);
 	}
 	/* Common prefix; each case below adds its details and the newline. */
 	log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev),
 	    G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" :
 	    "non-fatal");
 	switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) {
 	case FW_ERROR_TYPE_EXCEPTION:
 		log(LOG_ERR, "exception info:\n");
 		for (i = 0; i < nitems(e->u.exception.info); i++)
 			log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ",
 			    be32toh(e->u.exception.info[i]));
 		log(LOG_ERR, "\n");
 		break;
 	case FW_ERROR_TYPE_HWMODULE:
 		log(LOG_ERR, "HW module regaddr %08x regval %08x\n",
 		    be32toh(e->u.hwmodule.regaddr),
 		    be32toh(e->u.hwmodule.regval));
 		break;
 	case FW_ERROR_TYPE_WR:
 		log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n",
 		    be16toh(e->u.wr.cidx),
 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)),
 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)),
 		    be32toh(e->u.wr.eqid));
 		/* Raw bytes of the offending work request's header. */
 		for (i = 0; i < nitems(e->u.wr.wrhdr); i++)
 			log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ",
 			    e->u.wr.wrhdr[i]);
 		log(LOG_ERR, "\n");
 		break;
 	case FW_ERROR_TYPE_ACL:
 		log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s",
 		    be16toh(e->u.acl.cidx),
 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)),
 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)),
 		    be32toh(e->u.acl.eqid),
 		    G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? "vlanid" :
 		    "MAC");
 		for (i = 0; i < nitems(e->u.acl.val); i++)
 			log(LOG_ERR, " %02x", e->u.acl.val[i]);
 		log(LOG_ERR, "\n");
 		break;
 	default:
 		log(LOG_ERR, "type %#x\n",
 		    G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type)));
 		return (EINVAL);
 	}
 	return (0);
 }
 
 /*
  * Returns true if hardware buffer size index 'idx' is referenced (as hwidx1 or
  * hwidx2) by any rx_buf_info entry whose size1 does not exceed
  * largest_rx_cluster.
  */
 static inline bool
 bufidx_used(struct adapter *sc, int idx)
 {
 	struct rx_buf_info *rxb;
 	int i;
 
 	for (i = 0; i < SW_ZONE_SIZES; i++) {
 		rxb = &sc->sge.rx_buf_info[i];
 		if (rxb->size1 <= largest_rx_cluster &&
 		    (rxb->hwidx1 == idx || rxb->hwidx2 == idx))
 			return (true);
 	}
 
 	return (false);
 }
 
 /*
  * Sysctl handler that reports the SGE freelist buffer sizes as a space
  * separated string.  A size whose index is in use (see bufidx_used) is
  * followed by a '*'.
  */
 static int
 sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
 {
 	struct adapter *sc = arg1;
 	struct sge_params *sp = &sc->params.sge;
 	struct sbuf sb;
 	int i, rc;
 	char c;
 
 	sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
 	for (i = 0; i < SGE_FLBUF_SIZES; i++) {
 		/*
 		 * NOTE(review): an unused size is printed with a '\0' for %c;
 		 * this presumably relies on sbuf_printf not storing NUL
 		 * characters -- confirm.
 		 */
 		c = bufidx_used(sc, i) ? '*' : '\0';
 		sbuf_printf(&sb, "%u%c ", sp->sge_fl_buffer_size[i], c);
 	}
 	sbuf_trim(&sb);
 	sbuf_finish(&sb);
 
 	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
 	sbuf_delete(&sb);
 
 	return (rc);
 }
 
 #ifdef RATELIMIT
 #if defined(INET) || defined(INET6)
 /*
  * len16 (length in 16 byte units) of an ethofld tx WR, including the firmware
  * work request header.
  *
  * nsegs is the number of segments in the gather list, immhdrs the number of
  * header bytes sent as immediate data (must be > 0), and tso is non-zero if an
  * LSO CPL is needed.  With nsegs == 0 only the WR header, the pkt CPL, and the
  * immediate headers are counted.
  */
 static inline u_int
 txpkt_eo_len16(u_int nsegs, u_int immhdrs, u_int tso)
 {
 	u_int nbytes;
 
 	MPASS(immhdrs > 0);
 
 	nbytes = roundup2(sizeof(struct fw_eth_tx_eo_wr) +
 	    sizeof(struct cpl_tx_pkt_core) + immhdrs, 16);
 	if (__predict_false(nsegs == 0))
 		return (howmany(nbytes, 16));
 
 	nsegs--; /* first segment is part of ulptx_sgl */
 	nbytes += sizeof(struct ulptx_sgl) +
 	    8 * ((3 * nsegs) / 2 + (nsegs & 1));
 	if (tso)
 		nbytes += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(nbytes, 16));
 }
 #endif
 
 /*
  * Size of the FLOWC work request sent for an etid: the fw_flowc_wr header
  * followed by ETID_FLOWC_NPARAMS mnemonic/value pairs, rounded up to a multiple
  * of 16 bytes (ETID_FLOWC_LEN) and also expressed in 16 byte units
  * (ETID_FLOWC_LEN16).
  */
 #define ETID_FLOWC_NPARAMS 6
 #define ETID_FLOWC_LEN (roundup2((sizeof(struct fw_flowc_wr) + \
     ETID_FLOWC_NPARAMS * sizeof(struct fw_flowc_mnemval)), 16))
 #define ETID_FLOWC_LEN16 (howmany(ETID_FLOWC_LEN, 16))
 
 /*
  * Build and commit the initial FLOWC work request for a rate tag's etid on
  * cst->eo_txq.  The WR carries ETID_FLOWC_NPARAMS parameters (PF/VF, channel,
  * port, ingress queue id, EO state and scheduling class) and does not request
  * a completion of its own.
  *
  * The caller must hold cst->lock, and EO_FLOWC_PENDING must be set with
  * EO_FLOWC_RPL_PENDING clear.  On success the flags are flipped to
  * EO_FLOWC_RPL_PENDING and ETID_FLOWC_LEN16 tx credits are consumed.
  *
  * 'vi' is not referenced by this function.
  *
  * Returns 0 on success, or ENOMEM if no room for the work request could be
  * obtained from the queue (in which case no state is modified).
  */
 static int
 send_etid_flowc_wr(struct cxgbe_rate_tag *cst, struct port_info *pi,
     struct vi_info *vi)
 {
 	struct wrq_cookie cookie;
 	u_int pfvf = pi->adapter->pf << S_FW_VIID_PFN;
 	struct fw_flowc_wr *flowc;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 	MPASS((cst->flags & (EO_FLOWC_PENDING | EO_FLOWC_RPL_PENDING)) ==
 	    EO_FLOWC_PENDING);
 
 	flowc = start_wrq_wr(&cst->eo_txq->wrq, ETID_FLOWC_LEN16, &cookie);
 	if (__predict_false(flowc == NULL))
 		return (ENOMEM);
 
 	bzero(flowc, ETID_FLOWC_LEN);
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(ETID_FLOWC_NPARAMS) | V_FW_WR_COMPL(0));
 	/* htobe32 (not htonl) to match every other field written here. */
 	flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLOWC_LEN16) |
 	    V_FW_WR_FLOWID(cst->etid));
 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
 	flowc->mnemval[0].val = htobe32(pfvf);
 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
 	flowc->mnemval[1].val = htobe32(pi->tx_chan);
 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
 	flowc->mnemval[2].val = htobe32(pi->tx_chan);
 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
 	flowc->mnemval[3].val = htobe32(cst->iqid);
 	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_EOSTATE;
 	flowc->mnemval[4].val = htobe32(FW_FLOWC_MNEM_EOSTATE_ESTABLISHED);
 	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
 	flowc->mnemval[5].val = htobe32(cst->schedcl);
 
 	commit_wrq_wr(&cst->eo_txq->wrq, flowc, &cookie);
 
 	cst->flags &= ~EO_FLOWC_PENDING;
 	cst->flags |= EO_FLOWC_RPL_PENDING;
 	MPASS(cst->tx_credits >= ETID_FLOWC_LEN16);	/* flowc is first WR. */
 	cst->tx_credits -= ETID_FLOWC_LEN16;
 
 	return (0);
 }
 
 /* Length, in 16-byte units, of a FLOWC work request with no parameters. */
 #define ETID_FLUSH_LEN16 (howmany(sizeof (struct fw_flowc_wr), 16))
 
 /*
  * Send a parameterless FLOWC work request, with a completion requested, on
  * the tag's eo_txq.  Marks the tag EO_FLUSH_RPL_PENDING, consumes
  * ETID_FLUSH_LEN16 tx credits and accounts for one more expected completion.
  *
  * The caller must hold cst->lock.
  */
 void
 send_etid_flush_wr(struct cxgbe_rate_tag *cst)
 {
 	struct fw_flowc_wr *flowc;
 	struct wrq_cookie cookie;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 
 	flowc = start_wrq_wr(&cst->eo_txq->wrq, ETID_FLUSH_LEN16, &cookie);
 	/*
 	 * NOTE(review): there is no recovery path when the queue has no room;
 	 * CXGBE_UNIMPLEMENTED is defined elsewhere -- presumably it does not
 	 * return, since flowc is dereferenced right after.  Confirm.
 	 */
 	if (__predict_false(flowc == NULL))
 		CXGBE_UNIMPLEMENTED(__func__);
 
 	bzero(flowc, ETID_FLUSH_LEN16 * 16);
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(0) | F_FW_WR_COMPL);
 	flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLUSH_LEN16) |
 	    V_FW_WR_FLOWID(cst->etid));
 
 	commit_wrq_wr(&cst->eo_txq->wrq, flowc, &cookie);
 
 	cst->flags |= EO_FLUSH_RPL_PENDING;
 	MPASS(cst->tx_credits >= ETID_FLUSH_LEN16);
 	cst->tx_credits -= ETID_FLUSH_LEN16;
 	/* F_FW_WR_COMPL was set above, so one more completion is expected. */
 	cst->ncompl++;
 }
 
 static void
 write_ethofld_wr(struct cxgbe_rate_tag *cst, struct fw_eth_tx_eo_wr *wr,
     struct mbuf *m0, int compl)
 {
 	struct cpl_tx_pkt_core *cpl;
 	uint64_t ctrl1;
 	uint32_t ctrl;	/* used in many unrelated places */
 	int len16, pktlen, nsegs, immhdrs;
 	caddr_t dst;
 	uintptr_t p;
 	struct ulptx_sgl *usgl;
 	struct sglist sg;
 	struct sglist_seg segs[38];	/* XXX: find real limit.  XXX: get off the stack */
 
 	mtx_assert(&cst->lock, MA_OWNED);
 	M_ASSERTPKTHDR(m0);
 	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 	    m0->m_pkthdr.l4hlen > 0,
 	    ("%s: ethofld mbuf %p is missing header lengths", __func__, m0));
 
 	len16 = mbuf_eo_len16(m0);
 	nsegs = mbuf_eo_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen;
 	ctrl += immhdrs;
 
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_EO_WR) |
 	    V_FW_ETH_TX_EO_WR_IMMDLEN(ctrl) | V_FW_WR_COMPL(!!compl));
 	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) |
 	    V_FW_WR_FLOWID(cst->etid));
 	wr->r3 = 0;
 	if (needs_outer_udp_csum(m0)) {
 		wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG;
 		wr->u.udpseg.ethlen = m0->m_pkthdr.l2hlen;
 		wr->u.udpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
 		wr->u.udpseg.udplen = m0->m_pkthdr.l4hlen;
 		wr->u.udpseg.rtplen = 0;
 		wr->u.udpseg.r4 = 0;
 		wr->u.udpseg.mss = htobe16(pktlen - immhdrs);
 		wr->u.udpseg.schedpktsize = wr->u.udpseg.mss;
 		wr->u.udpseg.plen = htobe32(pktlen - immhdrs);
 		cpl = (void *)(wr + 1);
 	} else {
 		MPASS(needs_outer_tcp_csum(m0));
 		wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG;
 		wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen;
 		wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
 		wr->u.tcpseg.tcplen = m0->m_pkthdr.l4hlen;
 		wr->u.tcpseg.tsclk_tsoff = mbuf_eo_tsclk_tsoff(m0);
 		wr->u.tcpseg.r4 = 0;
 		wr->u.tcpseg.r5 = 0;
 		wr->u.tcpseg.plen = htobe32(pktlen - immhdrs);
 
 		if (needs_tso(m0)) {
 			struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 			wr->u.tcpseg.mss = htobe16(m0->m_pkthdr.tso_segsz);
 
 			ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) |
 			    F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE |
 			    V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen -
 				ETHER_HDR_LEN) >> 2) |
 			    V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
 			    V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 			if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 				ctrl |= F_LSO_IPV6;
 			lso->lso_ctrl = htobe32(ctrl);
 			lso->ipid_ofst = htobe16(0);
 			lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 			lso->seqno_offset = htobe32(0);
 			lso->len = htobe32(pktlen);
 
 			cpl = (void *)(lso + 1);
 		} else {
 			wr->u.tcpseg.mss = htobe16(0xffff);
 			cpl = (void *)(wr + 1);
 		}
 	}
 
 	/* Checksum offload must be requested for ethofld. */
 	MPASS(needs_outer_l4_csum(m0));
 	ctrl1 = csum_to_ctrl(cst->adapter, m0);
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD |
 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = cst->ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* Copy Ethernet, IP & TCP/UDP hdrs as immediate data */
 	p = (uintptr_t)(cpl + 1);
 	m_copydata(m0, 0, immhdrs, (void *)p);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 	if (nsegs > 0) {
 		int i, pad;
 
 		/* zero-pad upto next 16Byte boundary, if not 16Byte aligned */
 		p += immhdrs;
 		pad = 16 - (immhdrs & 0xf);
 		bzero((void *)p, pad);
 
 		usgl = (void *)(p + pad);
 		usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 		    V_ULPTX_NSGE(nsegs));
 
 		sglist_init(&sg, nitems(segs), segs);
 		for (; m0 != NULL; m0 = m0->m_next) {
 			if (__predict_false(m0->m_len == 0))
 				continue;
 			if (immhdrs >= m0->m_len) {
 				immhdrs -= m0->m_len;
 				continue;
 			}
 			if (m0->m_flags & M_EXTPG)
 				sglist_append_mbuf_epg(&sg, m0,
 				    mtod(m0, vm_offset_t), m0->m_len);
                         else
 				sglist_append(&sg, mtod(m0, char *) + immhdrs,
 				    m0->m_len - immhdrs);
 			immhdrs = 0;
 		}
 		MPASS(sg.sg_nseg == nsegs);
 
 		/*
 		 * Zero pad last 8B in case the WR doesn't end on a 16B
 		 * boundary.
 		 */
 		*(uint64_t *)((char *)wr + len16 * 16 - 8) = 0;
 
 		usgl->len0 = htobe32(segs[0].ss_len);
 		usgl->addr0 = htobe64(segs[0].ss_paddr);
 		for (i = 0; i < nsegs - 1; i++) {
 			usgl->sge[i / 2].len[i & 1] = htobe32(segs[i + 1].ss_len);
 			usgl->sge[i / 2].addr[i & 1] = htobe64(segs[i + 1].ss_paddr);
 		}
 		if (i & 1)
 			usgl->sge[i / 2].len[1] = htobe32(0);
 	}
 
 }
 
 /*
  * Push as many packets as possible from cst->pending_tx to the hardware.
  *
  * For each packet at the head of pending_tx this reserves room on the tag's
  * work request queue, writes and commits the work request, and moves the
  * mbuf to cst->pending_fwack.  It stops, leaving the packet queued, when the
  * tag has too few tx credits or the queue has no room.
  *
  * The caller must hold cst->lock.
  */
 static void
 ethofld_tx(struct cxgbe_rate_tag *cst)
 {
 	struct mbuf *m;
 	struct wrq_cookie cookie;
 	int next_credits, compl;
 	struct fw_eth_tx_eo_wr *wr;
 
 	mtx_assert(&cst->lock, MA_OWNED);
 
 	while ((m = mbufq_first(&cst->pending_tx)) != NULL) {
 		M_ASSERTPKTHDR(m);
 
 		/* How many len16 credits do we need to send this mbuf. */
 		next_credits = mbuf_eo_len16(m);
 		MPASS(next_credits > 0);
 		if (next_credits > cst->tx_credits) {
 			/*
 			 * Tx will make progress eventually because there is at
 			 * least one outstanding fw4_ack that will return
 			 * credits and kick the tx.
 			 */
 			MPASS(cst->ncompl > 0);
 			return;
 		}
 		wr = start_wrq_wr(&cst->eo_txq->wrq, next_credits, &cookie);
 		if (__predict_false(wr == NULL)) {
 			/* XXX: wishful thinking, not a real assertion. */
 			MPASS(cst->ncompl > 0);
 			return;
 		}
 		cst->tx_credits -= next_credits;
 		cst->tx_nocompl += next_credits;
 		/*
 		 * Request a completion if none is outstanding, or once the
 		 * credits sent without one reach half of the tag's total.
 		 */
 		compl = cst->ncompl == 0 || cst->tx_nocompl >= cst->tx_total / 2;
 		ETHER_BPF_MTAP(cst->com.ifp, m);
 		write_ethofld_wr(cst, wr, m, compl);
 		commit_wrq_wr(&cst->eo_txq->wrq, wr, &cookie);
 		if (compl) {
 			cst->ncompl++;
 			cst->tx_nocompl	= 0;
 		}
 		(void) mbufq_dequeue(&cst->pending_tx);
 
 		/*
 		 * Drop the mbuf's reference on the tag now rather
 		 * than waiting until m_freem().  This ensures that
 		 * cxgbe_rate_tag_free gets called when the inp drops
 		 * its reference on the tag and there are no more
 		 * mbufs in the pending_tx queue and can flush any
 		 * pending requests.  Otherwise if the last mbuf
 		 * doesn't request a completion the etid will never be
 		 * released.
 		 */
 		m->m_pkthdr.snd_tag = NULL;
 		m->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
 		m_snd_tag_rele(&cst->com);
 
 		/* Keep the mbuf until the firmware acknowledges its credits. */
 		mbufq_enqueue(&cst->pending_fwack, m);
 	}
 }
 
 /*
  * Queue the packet m0 on the rate tag attached to it and try to transmit.
  *
  * On the tag's first packet (EO_FLOWC_PENDING set) this also selects the
  * offload tx queue and ingress queue id from the packet's flow hash (or a
  * random value if the mbuf carries no hash) and sends the initial FLOWC work
  * request.
  *
  * Returns 0 if the packet was queued.  Otherwise returns the error from
  * send_etid_flowc_wr(), or ENOBUFS if queueing m0 would push the tag's
  * backlog past eo_max_backlog; in both failure cases m0 is freed.
  */
 int
 ethofld_transmit(struct ifnet *ifp, struct mbuf *m0)
 {
 	struct cxgbe_rate_tag *cst;
 	int rc;
 
 	MPASS(m0->m_nextpkt == NULL);
 	MPASS(m0->m_pkthdr.csum_flags & CSUM_SND_TAG);
 	MPASS(m0->m_pkthdr.snd_tag != NULL);
 	cst = mst_to_crt(m0->m_pkthdr.snd_tag);
 
 	mtx_lock(&cst->lock);
 	MPASS(cst->flags & EO_SND_TAG_REF);
 
 	if (__predict_false(cst->flags & EO_FLOWC_PENDING)) {
 		struct vi_info *vi = ifp->if_softc;
 		struct port_info *pi = vi->pi;
 		struct adapter *sc = pi->adapter;
 		/* NOTE(review): masking assumes rss_size is a power of 2. */
 		const uint32_t rss_mask = vi->rss_size - 1;
 		uint32_t rss_hash;
 
 		cst->eo_txq = &sc->sge.ofld_txq[vi->first_ofld_txq];
 		if (M_HASHTYPE_ISHASH(m0))
 			rss_hash = m0->m_pkthdr.flowid;
 		else
 			rss_hash = arc4random();
 		/* We assume RSS hashing */
 		cst->iqid = vi->rss[rss_hash & rss_mask];
 		cst->eo_txq += rss_hash % vi->nofldtxq;
 		rc = send_etid_flowc_wr(cst, pi, vi);
 		if (rc != 0)
 			goto done;
 	}
 
 	if (__predict_false(cst->plen + m0->m_pkthdr.len > eo_max_backlog)) {
 		rc = ENOBUFS;
 		goto done;
 	}
 
 	mbufq_enqueue(&cst->pending_tx, m0);
 	cst->plen += m0->m_pkthdr.len;
 
 	/*
 	 * Hold an extra reference on the tag while generating work
 	 * requests to ensure that we don't try to free the tag during
 	 * ethofld_tx() in case we are sending the final mbuf after
 	 * the inp was freed.
 	 */
 	m_snd_tag_ref(&cst->com);
 	ethofld_tx(cst);
 	mtx_unlock(&cst->lock);
 	m_snd_tag_rele(&cst->com);
 	return (0);
 
 	/* Error exit: only reached with rc set to a non-zero error. */
 done:
 	mtx_unlock(&cst->lock);
 	if (__predict_false(rc != 0))
 		m_freem(m0);
 	return (rc);
 }
 
 /*
  * Handle a CPL_FW4_ACK for an ethofld tid (etid).
  *
  * The returned credits are matched against the mbufs on the tag's
  * pending_fwack queue, which are freed as their work requests are
  * acknowledged; the credits are then added back to the tag and transmission
  * is restarted if the tag is still referenced and the next pending packet
  * fits.  If the queue runs dry while credits remain, they are taken to
  * belong to the final flush and the tag is freed.
  *
  * m0 is not referenced by this function.  Always returns 0.
  */
 static int
 ethofld_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	struct mbuf *m;
 	u_int etid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 	struct cxgbe_rate_tag *cst;
 	/* Working copy; cpl->credits keeps the full amount for later. */
 	uint8_t credits = cpl->credits;
 
 	cst = lookup_etid(sc, etid);
 	mtx_lock(&cst->lock);
 	if (__predict_false(cst->flags & EO_FLOWC_RPL_PENDING)) {
 		/* The first ack also covers the initial FLOWC work request. */
 		MPASS(credits >= ETID_FLOWC_LEN16);
 		credits -= ETID_FLOWC_LEN16;
 		cst->flags &= ~EO_FLOWC_RPL_PENDING;
 	}
 
 	KASSERT(cst->ncompl > 0,
 	    ("%s: etid %u (%p) wasn't expecting completion.",
 	    __func__, etid, cst));
 	cst->ncompl--;
 
 	while (credits > 0) {
 		m = mbufq_dequeue(&cst->pending_fwack);
 		if (__predict_false(m == NULL)) {
 			/*
 			 * The remaining credits are for the final flush that
 			 * was issued when the tag was freed by the kernel.
 			 */
 			MPASS((cst->flags &
 			    (EO_FLUSH_RPL_PENDING | EO_SND_TAG_REF)) ==
 			    EO_FLUSH_RPL_PENDING);
 			MPASS(credits == ETID_FLUSH_LEN16);
 			MPASS(cst->tx_credits + cpl->credits == cst->tx_total);
 			MPASS(cst->ncompl == 0);
 
 			cst->flags &= ~EO_FLUSH_RPL_PENDING;
 			cst->tx_credits += cpl->credits;
 			cxgbe_rate_tag_free_locked(cst);
 			return (0);	/* cst is gone. */
 		}
 		/* m cannot be NULL here; the NULL case returned above. */
 		KASSERT(m != NULL,
 		    ("%s: too many credits (%u, %u)", __func__, cpl->credits,
 		    credits));
 		KASSERT(credits >= mbuf_eo_len16(m),
 		    ("%s: too few credits (%u, %u, %u)", __func__,
 		    cpl->credits, credits, mbuf_eo_len16(m)));
 		credits -= mbuf_eo_len16(m);
 		cst->plen -= m->m_pkthdr.len;
 		m_freem(m);
 	}
 
 	cst->tx_credits += cpl->credits;
 	MPASS(cst->tx_credits <= cst->tx_total);
 
 	if (cst->flags & EO_SND_TAG_REF) {
 		/*
 		 * As with ethofld_transmit(), hold an extra reference
 		 * so that the tag is stable across ethold_tx().
 		 */
 		m_snd_tag_ref(&cst->com);
 		m = mbufq_first(&cst->pending_tx);
 		if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m))
 			ethofld_tx(cst);
 		mtx_unlock(&cst->lock);
 		m_snd_tag_rele(&cst->com);
 	} else {
 		/*
 		 * There shouldn't be any pending packets if the tag
 		 * was freed by the kernel since any pending packet
 		 * should hold a reference to the tag.
 		 */
 		MPASS(mbufq_first(&cst->pending_tx) == NULL);
 		mtx_unlock(&cst->lock);
 	}
 
 	return (0);
 }
 #endif
diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h
index 7c37a785f23c..e8d4dcda85de 100644
--- a/sys/dev/mlx5/mlx5_en/en.h
+++ b/sys/dev/mlx5/mlx5_en/en.h
@@ -1,1224 +1,1219 @@
 /*-
  * Copyright (c) 2015-2021 Mellanox Technologies. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _MLX5_EN_H_
 #define	_MLX5_EN_H_
 
 #include <linux/kmod.h>
 #include <linux/page.h>
 #include <linux/slab.h>
 #include <linux/if_vlan.h>
 #include <linux/if_ether.h>
 #include <linux/vmalloc.h>
 #include <linux/moduleparam.h>
 #include <linux/delay.h>
 #include <linux/etherdevice.h>
 #include <linux/ktime.h>
 #include <linux/net_dim.h>
 
 #include <netinet/in_systm.h>
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_lro.h>
 #include <netinet/udp.h>
 #include <net/ethernet.h>
 #include <net/pfil.h>
 #include <sys/buf_ring.h>
 #include <sys/kthread.h>
 #include <sys/counter.h>
 
 #include "opt_rss.h"
 
 #ifdef	RSS
 #include <net/rss_config.h>
 #include <netinet/in_rss.h>
 #endif
 
 #include <machine/bus.h>
 
 #include <dev/mlx5/driver.h>
 #include <dev/mlx5/qp.h>
 #include <dev/mlx5/cq.h>
 #include <dev/mlx5/port.h>
 #include <dev/mlx5/vport.h>
 #include <dev/mlx5/diagnostics.h>
 
 #include <dev/mlx5/mlx5_core/wq.h>
 #include <dev/mlx5/mlx5_core/transobj.h>
 #include <dev/mlx5/mlx5_core/mlx5_core.h>
 
 #define	MLX5E_MAX_PRIORITY 8
 
 #define	MLX5E_MAX_FEC_10X_25X 4
 #define	MLX5E_MAX_FEC_50X 4
 
 /* IEEE 802.1Qaz standard supported values */
 #define	IEEE_8021QAZ_MAX_TCS	8
 
 #define	MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE                0x7
 #define	MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE                0xa
 #define	MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE                0xe
 
 #define	MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE                0x7
 #define	MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE                0xa
 #define	MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE                0xe
 
 #define	MLX5E_MAX_BUSDMA_RX_SEGS 15
 
 #ifndef MLX5E_MAX_RX_BYTES
 #define	MLX5E_MAX_RX_BYTES MCLBYTES
 #endif
 
 #define	MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ \
     MIN(65535, 7 * MLX5E_MAX_RX_BYTES)
 
 #define	MLX5E_DIM_DEFAULT_PROFILE 3
 #define	MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO	16
 #define	MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC      0x10
 #define	MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE	0x3
 #define	MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS      0x20
 #define	MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC      0x10
 #define	MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS      0x20
 #define	MLX5E_PARAMS_DEFAULT_MIN_RX_WQES                0x80
 #define	MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ         0x7
 #define	MLX5E_CACHELINE_SIZE CACHE_LINE_SIZE
 #define	MLX5E_HW2SW_MTU(hwmtu) \
     ((hwmtu) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + ETHER_CRC_LEN))
 #define	MLX5E_SW2HW_MTU(swmtu) \
     ((swmtu) + (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + ETHER_CRC_LEN))
 #define	MLX5E_SW2MB_MTU(swmtu) \
     (MLX5E_SW2HW_MTU(swmtu) + MLX5E_NET_IP_ALIGN)
 #define	MLX5E_MTU_MIN		72	/* Min MTU allowed by the kernel */
 #define	MLX5E_MTU_MAX		MIN(ETHERMTU_JUMBO, MJUM16BYTES)	/* Max MTU of Ethernet
 									 * jumbo frames */
 
 #define	MLX5E_BUDGET_MAX	8192	/* RX and TX */
 #define	MLX5E_RX_BUDGET_MAX	256
 #define	MLX5E_SQ_BF_BUDGET	16
 #define	MLX5E_SQ_TX_QUEUE_SIZE	4096	/* SQ drbr queue size */
 
 #define	MLX5E_MAX_TX_NUM_TC	8	/* units */
 #define	MLX5E_MAX_TX_HEADER	192	/* bytes */
 #define	MLX5E_MAX_TX_PAYLOAD_SIZE	65536	/* bytes */
 #define	MLX5E_MAX_TX_MBUF_SIZE	65536	/* bytes */
 #define	MLX5E_MAX_TX_MBUF_FRAGS	\
     ((MLX5_SEND_WQE_MAX_WQEBBS * MLX5_SEND_WQEBB_NUM_DS) - \
     (MLX5E_MAX_TX_HEADER / MLX5_SEND_WQE_DS) - \
     1 /* the maximum value of the DS counter is 0x3F and not 0x40 */)	/* units */
 #define	MLX5E_MAX_TX_INLINE \
   (MLX5E_MAX_TX_HEADER - sizeof(struct mlx5e_tx_wqe) + \
   sizeof(((struct mlx5e_tx_wqe *)0)->eth.inline_hdr_start))	/* bytes */
 
 #define	MLX5E_100MB (100000)
 #define	MLX5E_1GB   (1000000)
 
 /*
  * Zero the structure pointed to by 'ptr' starting at member 'field' and
  * continuing to the end of the structure.  Members located before 'field'
  * are left untouched.
  */
 #define	MLX5E_ZERO(ptr, field)	      \
 	memset(&(ptr)->field, 0, \
 	    sizeof(*(ptr)) - __offsetof(__typeof(*(ptr)), field))
 
 MALLOC_DECLARE(M_MLX5EN);
 
 struct mlx5_core_dev;
 struct mlx5e_cq;
 
 typedef void (mlx5e_cq_comp_t)(struct mlx5_core_cq *, struct mlx5_eqe *);
 
 /*
  * Logging helpers.  Each one prints through if_printf() on '_dev', prefixing
  * the caller's message with a severity tag ("ERR: ", "WARN: " or "INFO: "),
  * the calling function's name, the source line and the pid of the current
  * thread's process.  'format' must be a string literal because it is
  * concatenated with the prefix at compile time.
  */
 #define	mlx5_en_err(_dev, format, ...)				\
 	if_printf(_dev, "ERR: ""%s:%d:(pid %d): " format, \
 	    __func__, __LINE__, curthread->td_proc->p_pid,	\
 	    ##__VA_ARGS__)
 
 #define	mlx5_en_warn(_dev, format, ...)				\
 	if_printf(_dev, "WARN: ""%s:%d:(pid %d): " format, \
 	    __func__, __LINE__, curthread->td_proc->p_pid,	\
 	    ##__VA_ARGS__)
 
 #define	mlx5_en_info(_dev, format, ...)				\
 	if_printf(_dev, "INFO: ""%s:%d:(pid %d): " format, \
 	    __func__, __LINE__, curthread->td_proc->p_pid,	\
 	    ##__VA_ARGS__)
 
 /*
  * Selector macros for the statistics lists below.  Every list entry has the
  * form m(count, type, field, name, description); passing one of these as 'm'
  * picks out part of each entry:
  *   MLX5E_STATS_COUNT   - the count term (the lists use "+1", so expanding a
  *                         list after a leading 0 sums to the entry count)
  *   MLX5E_STATS_VAR     - a member declaration "type field;"
  *   MLX5E_STATS_COUNTER - a member declaration "counter_<type>_t field;"
  *   MLX5E_STATS_DESC    - "name, description," for building string tables
  */
 #define	MLX5E_STATS_COUNT(a, ...) a
 #define	MLX5E_STATS_VAR(a, b, c, ...) b c;
 #define	MLX5E_STATS_COUNTER(a, b, c, ...) counter_##b##_t c;
 #define	MLX5E_STATS_DESC(a, b, c, d, e, ...) d, e,
 
 #define	MLX5E_VPORT_STATS(m)						\
   /* HW counters */							\
   m(+1, u64, rx_packets, "rx_packets", "Received packets")		\
   m(+1, u64, rx_bytes, "rx_bytes", "Received bytes")			\
   m(+1, u64, tx_packets, "tx_packets", "Transmitted packets")		\
   m(+1, u64, tx_bytes, "tx_bytes", "Transmitted bytes")			\
   m(+1, u64, rx_error_packets, "rx_error_packets", "Received error packets") \
   m(+1, u64, rx_error_bytes, "rx_error_bytes", "Received error bytes")	\
   m(+1, u64, tx_error_packets, "tx_error_packets", "Transmitted error packets") \
   m(+1, u64, tx_error_bytes, "tx_error_bytes", "Transmitted error bytes") \
   m(+1, u64, rx_unicast_packets, "rx_unicast_packets", "Received unicast packets") \
   m(+1, u64, rx_unicast_bytes, "rx_unicast_bytes", "Received unicast bytes") \
   m(+1, u64, tx_unicast_packets, "tx_unicast_packets", "Transmitted unicast packets") \
   m(+1, u64, tx_unicast_bytes, "tx_unicast_bytes", "Transmitted unicast bytes") \
   m(+1, u64, rx_multicast_packets, "rx_multicast_packets", "Received multicast packets") \
   m(+1, u64, rx_multicast_bytes, "rx_multicast_bytes", "Received multicast bytes") \
   m(+1, u64, tx_multicast_packets, "tx_multicast_packets", "Transmitted multicast packets") \
   m(+1, u64, tx_multicast_bytes, "tx_multicast_bytes", "Transmitted multicast bytes") \
   m(+1, u64, rx_broadcast_packets, "rx_broadcast_packets", "Received broadcast packets") \
   m(+1, u64, rx_broadcast_bytes, "rx_broadcast_bytes", "Received broadcast bytes") \
   m(+1, u64, tx_broadcast_packets, "tx_broadcast_packets", "Transmitted broadcast packets") \
   m(+1, u64, tx_broadcast_bytes, "tx_broadcast_bytes", "Transmitted broadcast bytes") \
   m(+1, u64, rx_out_of_buffer, "rx_out_of_buffer", "Receive out of buffer, no recv wqes events") \
   /* SW counters */							\
   m(+1, u64, tso_packets, "tso_packets", "Transmitted TSO packets")	\
   m(+1, u64, tso_bytes, "tso_bytes", "Transmitted TSO bytes")		\
   m(+1, u64, lro_packets, "lro_packets", "Received LRO packets")		\
   m(+1, u64, lro_bytes, "lro_bytes", "Received LRO bytes")		\
   m(+1, u64, sw_lro_queued, "sw_lro_queued", "Packets queued for SW LRO")	\
   m(+1, u64, sw_lro_flushed, "sw_lro_flushed", "Packets flushed from SW LRO")	\
   m(+1, u64, rx_csum_good, "rx_csum_good", "Received checksum valid packets") \
   m(+1, u64, rx_csum_none, "rx_csum_none", "Received no checksum packets") \
   m(+1, u64, tx_csum_offload, "tx_csum_offload", "Transmit checksum offload packets") \
   m(+1, u64, tx_queue_dropped, "tx_queue_dropped", "Transmit queue dropped") \
   m(+1, u64, tx_defragged, "tx_defragged", "Transmit queue defragged") \
   m(+1, u64, rx_wqe_err, "rx_wqe_err", "Receive WQE errors") \
   m(+1, u64, tx_jumbo_packets, "tx_jumbo_packets", "TX packets greater than 1518 octets") \
   m(+1, u64, rx_steer_missed_packets, "rx_steer_missed_packets", "RX packets dropped by steering rule(s)")
 
 /* Number of entries in MLX5E_VPORT_STATS (each entry contributes "+1"). */
 #define	MLX5E_VPORT_STATS_NUM (0 MLX5E_VPORT_STATS(MLX5E_STATS_COUNT))
 
 /*
  * Virtual port statistics: a sysctl context followed by one u64 member per
  * MLX5E_VPORT_STATS entry, declared in list order.
  */
 struct mlx5e_vport_stats {
 	struct	sysctl_ctx_list ctx;
 	/*
 	 * Zero-length marker placed immediately before the counters.
 	 * NOTE(review): presumably lets callers index the counters as
 	 * arg[0 .. MLX5E_VPORT_STATS_NUM - 1]; confirm against users.
 	 */
 	u64	arg [0];
 	MLX5E_VPORT_STATS(MLX5E_STATS_VAR)
 };
 
 #define	MLX5E_PPORT_IEEE802_3_STATS(m)					\
   m(+1, u64, frames_tx, "frames_tx", "Frames transmitted")		\
   m(+1, u64, frames_rx, "frames_rx", "Frames received")			\
   m(+1, u64, check_seq_err, "check_seq_err", "Sequence errors")		\
   m(+1, u64, alignment_err, "alignment_err", "Alignment errors")	\
   m(+1, u64, octets_tx, "octets_tx", "Bytes transmitted")		\
   m(+1, u64, octets_received, "octets_received", "Bytes received")	\
   m(+1, u64, multicast_xmitted, "multicast_xmitted", "Multicast transmitted") \
   m(+1, u64, broadcast_xmitted, "broadcast_xmitted", "Broadcast transmitted") \
   m(+1, u64, multicast_rx, "multicast_rx", "Multicast received")	\
   m(+1, u64, broadcast_rx, "broadcast_rx", "Broadcast received")	\
   m(+1, u64, in_range_len_errors, "in_range_len_errors", "In range length errors") \
   m(+1, u64, out_of_range_len, "out_of_range_len", "Out of range length errors") \
   m(+1, u64, too_long_errors, "too_long_errors", "Too long errors")	\
   m(+1, u64, symbol_err, "symbol_err", "Symbol errors")			\
   m(+1, u64, mac_control_tx, "mac_control_tx", "MAC control transmitted") \
   m(+1, u64, mac_control_rx, "mac_control_rx", "MAC control received")	\
   m(+1, u64, unsupported_op_rx, "unsupported_op_rx", "Unsupported operation received") \
   m(+1, u64, pause_ctrl_rx, "pause_ctrl_rx", "Pause control received")	\
   m(+1, u64, pause_ctrl_tx, "pause_ctrl_tx", "Pause control transmitted")
 
 #define	MLX5E_PPORT_RFC2819_STATS(m)					\
   m(+1, u64, drop_events, "drop_events", "Dropped events")		\
   m(+1, u64, octets, "octets", "Octets")					\
   m(+1, u64, pkts, "pkts", "Packets")					\
   m(+1, u64, broadcast_pkts, "broadcast_pkts", "Broadcast packets")	\
   m(+1, u64, multicast_pkts, "multicast_pkts", "Multicast packets")	\
   m(+1, u64, crc_align_errors, "crc_align_errors", "CRC alignment errors") \
   m(+1, u64, undersize_pkts, "undersize_pkts", "Undersized packets")	\
   m(+1, u64, oversize_pkts, "oversize_pkts", "Oversized packets")	\
   m(+1, u64, fragments, "fragments", "Fragments")			\
   m(+1, u64, jabbers, "jabbers", "Jabbers")				\
   m(+1, u64, collisions, "collisions", "Collisions")
 
 #define	MLX5E_PPORT_RFC2819_STATS_DEBUG(m)				\
   m(+1, u64, p64octets, "p64octets", "Bytes")				\
   m(+1, u64, p65to127octets, "p65to127octets", "Bytes")			\
   m(+1, u64, p128to255octets, "p128to255octets", "Bytes")		\
   m(+1, u64, p256to511octets, "p256to511octets", "Bytes")		\
   m(+1, u64, p512to1023octets, "p512to1023octets", "Bytes")		\
   m(+1, u64, p1024to1518octets, "p1024to1518octets", "Bytes")		\
   m(+1, u64, p1519to2047octets, "p1519to2047octets", "Bytes")		\
   m(+1, u64, p2048to4095octets, "p2048to4095octets", "Bytes")		\
   m(+1, u64, p4096to8191octets, "p4096to8191octets", "Bytes")		\
   m(+1, u64, p8192to10239octets, "p8192to10239octets", "Bytes")
 
 #define	MLX5E_PPORT_RFC2863_STATS_DEBUG(m)				\
   m(+1, u64, in_octets, "in_octets", "In octets")			\
   m(+1, u64, in_ucast_pkts, "in_ucast_pkts", "In unicast packets")	\
   m(+1, u64, in_discards, "in_discards", "In discards")			\
   m(+1, u64, in_errors, "in_errors", "In errors")			\
   m(+1, u64, in_unknown_protos, "in_unknown_protos", "In unknown protocols") \
   m(+1, u64, out_octets, "out_octets", "Out octets")			\
   m(+1, u64, out_ucast_pkts, "out_ucast_pkts", "Out unicast packets")	\
   m(+1, u64, out_discards, "out_discards", "Out discards")		\
   m(+1, u64, out_errors, "out_errors", "Out errors")			\
   m(+1, u64, in_multicast_pkts, "in_multicast_pkts", "In multicast packets") \
   m(+1, u64, in_broadcast_pkts, "in_broadcast_pkts", "In broadcast packets") \
   m(+1, u64, out_multicast_pkts, "out_multicast_pkts", "Out multicast packets") \
   m(+1, u64, out_broadcast_pkts, "out_broadcast_pkts", "Out broadcast packets")
 
 #define	MLX5E_PPORT_ETHERNET_EXTENDED_STATS_DEBUG(m)				\
   m(+1, u64, port_transmit_wait, "port_transmit_wait", "Port transmit wait") \
   m(+1, u64, ecn_marked, "ecn_marked", "ECN marked")			\
   m(+1, u64, no_buffer_discard_mc, "no_buffer_discard_mc", "No buffer discard mc") \
   m(+1, u64, rx_ebp, "rx_ebp", "RX EBP")					\
   m(+1, u64, tx_ebp, "tx_ebp", "TX EBP")					\
   m(+1, u64, rx_buffer_almost_full, "rx_buffer_almost_full", "RX buffer almost full") \
   m(+1, u64, rx_buffer_full, "rx_buffer_full", "RX buffer full")	\
   m(+1, u64, rx_icrc_encapsulated, "rx_icrc_encapsulated", "RX ICRC encapsulated") \
   m(+1, u64, ex_reserved_0, "ex_reserved_0", "Reserved") \
   m(+1, u64, ex_reserved_1, "ex_reserved_1", "Reserved") \
   m(+1, u64, tx_stat_p64octets, "tx_stat_p64octets", "Bytes")			\
   m(+1, u64, tx_stat_p65to127octets, "tx_stat_p65to127octets", "Bytes")		\
   m(+1, u64, tx_stat_p128to255octets, "tx_stat_p128to255octets", "Bytes")	\
   m(+1, u64, tx_stat_p256to511octets, "tx_stat_p256to511octets", "Bytes")	\
   m(+1, u64, tx_stat_p512to1023octets, "tx_stat_p512to1023octets", "Bytes")	\
   m(+1, u64, tx_stat_p1024to1518octets, "tx_stat_p1024to1518octets", "Bytes")	\
   m(+1, u64, tx_stat_p1519to2047octets, "tx_stat_p1519to2047octets", "Bytes")	\
   m(+1, u64, tx_stat_p2048to4095octets, "tx_stat_p2048to4095octets", "Bytes")	\
   m(+1, u64, tx_stat_p4096to8191octets, "tx_stat_p4096to8191octets", "Bytes")	\
   m(+1, u64, tx_stat_p8192to10239octets, "tx_stat_p8192to10239octets", "Bytes")
 
 #define	MLX5E_PPORT_STATISTICAL_DEBUG(m)				\
   m(+1, u64, phy_time_since_last_clear, "phy_time_since_last_clear",	\
     "Time since last clear in milliseconds")				\
   m(+1, u64, phy_received_bits, "phy_received_bits",			\
     "Total amount of traffic received in bits before error correction")	\
   m(+1, u64, phy_symbol_errors, "phy_symbol_errors",			\
     "Total number of symbol errors before error correction")		\
   m(+1, u64, phy_corrected_bits, "phy_corrected_bits",			\
     "Total number of corrected bits ")					\
   m(+1, u64, phy_corrected_bits_lane0, "phy_corrected_bits_lane0",	\
     "Total number of corrected bits for lane 0")			\
   m(+1, u64, phy_corrected_bits_lane1, "phy_corrected_bits_lane1",	\
     "Total number of corrected bits for lane 1")			\
   m(+1, u64, phy_corrected_bits_lane2, "phy_corrected_bits_lane2",	\
     "Total number of corrected bits for lane 2")			\
   m(+1, u64, phy_corrected_bits_lane3, "phy_corrected_bits_lane3",	\
     "Total number of corrected bits for lane 3")
 
 /*
  * Physical layer debug counters.  Each entry is
  * m(count, type, field, name, description).  Only description strings are
  * corrected here ("correctable", "uncorrectable", and the missing space in
  * "had exactly"); field and counter names are unchanged.
  */
 #define	MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG(m)			\
   m(+1, u64, time_since_last_clear, "time_since_last_clear",		\
     "Time since the last counters clear event (msec)")			\
   m(+1, u64, symbol_errors, "symbol_errors", "Symbol errors")		\
   m(+1, u64, sync_headers_errors, "sync_headers_errors",		\
     "Sync header error counter")					\
   m(+1, u64, bip_errors_lane0, "edpl_bip_errors_lane0",			\
     "Indicates the number of PRBS errors on lane 0")			\
   m(+1, u64, bip_errors_lane1, "edpl_bip_errors_lane1",			\
     "Indicates the number of PRBS errors on lane 1")			\
   m(+1, u64, bip_errors_lane2, "edpl_bip_errors_lane2",			\
     "Indicates the number of PRBS errors on lane 2")			\
   m(+1, u64, bip_errors_lane3, "edpl_bip_errors_lane3",			\
     "Indicates the number of PRBS errors on lane 3")			\
   m(+1, u64, fc_corrected_blocks_lane0, "fc_corrected_blocks_lane0",	\
     "FEC correctable block counter lane 0")				\
   m(+1, u64, fc_corrected_blocks_lane1, "fc_corrected_blocks_lane1",	\
     "FEC correctable block counter lane 1")				\
   m(+1, u64, fc_corrected_blocks_lane2, "fc_corrected_blocks_lane2",	\
     "FEC correctable block counter lane 2")				\
   m(+1, u64, fc_corrected_blocks_lane3, "fc_corrected_blocks_lane3",	\
     "FEC correctable block counter lane 3")				\
   m(+1, u64, rs_corrected_blocks, "rs_corrected_blocks",		\
     "FEC correctable block counter")					\
   m(+1, u64, rs_uncorrectable_blocks, "rs_uncorrectable_blocks",	\
     "FEC uncorrectable block counter")					\
   m(+1, u64, rs_no_errors_blocks, "rs_no_errors_blocks",		\
     "The number of RS-FEC blocks received that had no errors")		\
   m(+1, u64, rs_single_error_blocks, "rs_single_error_blocks",		\
     "The number of corrected RS-FEC blocks received that had "		\
     "exactly 1 error symbol")						\
   m(+1, u64, rs_corrected_symbols_total, "rs_corrected_symbols_total",	\
     "Port FEC corrected symbol counter")				\
   m(+1, u64, rs_corrected_symbols_lane0, "rs_corrected_symbols_lane0",	\
     "FEC corrected symbol counter lane 0")				\
   m(+1, u64, rs_corrected_symbols_lane1, "rs_corrected_symbols_lane1",	\
     "FEC corrected symbol counter lane 1")				\
   m(+1, u64, rs_corrected_symbols_lane2, "rs_corrected_symbols_lane2",	\
     "FEC corrected symbol counter lane 2")				\
   m(+1, u64, rs_corrected_symbols_lane3, "rs_corrected_symbols_lane3",	\
     "FEC corrected symbol counter lane 3")
 
 /* Per priority statistics for PFC */
 #define	MLX5E_PPORT_PER_PRIO_STATS_SUB(m,n,p)			\
   m(n, p, +1, u64, rx_octets, "rx_octets", "Received octets")		\
   m(n, p, +1, u64, rx_uc_frames, "rx_uc_frames", "Received unicast frames") \
   m(n, p, +1, u64, rx_mc_frames, "rx_mc_frames", "Received multicast frames") \
   m(n, p, +1, u64, rx_bc_frames, "rx_bc_frames", "Received broadcast frames") \
   m(n, p, +1, u64, rx_frames, "rx_frames", "Received frames")		\
   m(n, p, +1, u64, tx_octets, "tx_octets", "Transmitted octets")	\
   m(n, p, +1, u64, tx_uc_frames, "tx_uc_frames", "Transmitted unicast frames") \
   m(n, p, +1, u64, tx_mc_frames, "tx_mc_frames", "Transmitted multicast frames") \
   m(n, p, +1, u64, tx_bc_frames, "tx_bc_frames", "Transmitted broadcast frames") \
   m(n, p, +1, u64, tx_frames, "tx_frames", "Transmitted frames")	\
   m(n, p, +1, u64, rx_pause, "rx_pause", "Received pause frames")	\
   m(n, p, +1, u64, rx_pause_duration, "rx_pause_duration",		\
 	"Received pause duration")					\
   m(n, p, +1, u64, tx_pause, "tx_pause", "Transmitted pause frames")	\
   m(n, p, +1, u64, tx_pause_duration, "tx_pause_duration",		\
 	"Transmitted pause duration")					\
   m(n, p, +1, u64, rx_pause_transition, "rx_pause_transition",		\
 	"Received pause transitions")					\
   m(n, p, +1, u64, rx_discards, "rx_discards", "Discarded received frames") \
   m(n, p, +1, u64, device_stall_minor_watermark,			\
 	"device_stall_minor_watermark", "Device stall minor watermark")	\
   m(n, p, +1, u64, device_stall_critical_watermark,			\
 	"device_stall_critical_watermark", "Device stall critical watermark")
 
 /*
  * Adapter used to expand MLX5E_PPORT_PER_PRIO_STATS_SUB once per priority.
  * For priority 'p' it rewrites an entry (c, t, f, s, d) into
  * m(c, t, pri_<p>_<f>, "prio<p>_" s, "Priority <p> - " d), so every field,
  * name and description carries the priority number.
  */
 #define	MLX5E_PPORT_PER_PRIO_STATS_PREFIX(m,p,c,t,f,s,d) \
   m(c, t, pri_##p##_##f, "prio" #p "_" s, "Priority " #p " - " d)
 
 /* Must match the number of per-priority expansions listed below. */
 #define	MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO 8
 
 /* The full per-priority counter list: the SUB list repeated for 0..7. */
 #define	MLX5E_PPORT_PER_PRIO_STATS(m) \
   MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,0) \
   MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,1) \
   MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,2) \
   MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,3) \
   MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,4) \
   MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,5) \
   MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,6) \
   MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,7)
 
 #define	MLX5E_PCIE_PERFORMANCE_COUNTERS_64(m)				\
   m(+1, u64, life_time_counter_high, "life_time_counter",		\
     "Life time counter.", pcie_perf_counters)				\
   m(+1, u64, tx_overflow_buffer_pkt, "tx_overflow_buffer_pkt",		\
     "The number of packets dropped due to lack of PCIe buffers "	\
     "in receive path from NIC port toward the hosts.",			\
     pcie_perf_counters)							\
   m(+1, u64, tx_overflow_buffer_marked_pkt,				\
     "tx_overflow_buffer_marked_pkt",					\
     "The number of packets marked due to lack of PCIe buffers "		\
     "in receive path from NIC port toward the hosts.",			\
     pcie_perf_counters)
 
 #define	MLX5E_PCIE_PERFORMANCE_COUNTERS_32(m)				\
   m(+1, u64, rx_errors, "rx_errors",					\
     "Number of transitions to recovery due to Framing "			\
     "errors and CRC errors.", pcie_perf_counters)			\
   m(+1, u64, tx_errors, "tx_errors", "Number of transitions "		\
     "to recovery due to EIEOS and TS errors.", pcie_perf_counters)	\
   m(+1, u64, l0_to_recovery_eieos, "l0_to_recovery_eieos", "Number of "	\
     "transitions to recovery due to getting EIEOS.", pcie_perf_counters)\
   m(+1, u64, l0_to_recovery_ts, "l0_to_recovery_ts", "Number of "	\
     "transitions to recovery due to getting TS.", pcie_perf_counters)	\
   m(+1, u64, l0_to_recovery_framing, "l0_to_recovery_framing", "Number "\
     "of transitions to recovery due to identifying framing "		\
     "errors at gen3/4.", pcie_perf_counters)				\
   m(+1, u64, l0_to_recovery_retrain, "l0_to_recovery_retrain",		\
     "Number of transitions to recovery due to link retrain request "	\
     "from data link.", pcie_perf_counters)				\
   m(+1, u64, crc_error_dllp, "crc_error_dllp", "Number of transitions "	\
     "to recovery due to identifying CRC DLLP errors.",			\
     pcie_perf_counters)							\
   m(+1, u64, crc_error_tlp, "crc_error_tlp", "Number of transitions to "\
     "recovery due to identifying CRC TLP errors.", pcie_perf_counters)	\
   m(+1, u64, outbound_stalled_reads, "outbound_stalled_reads",		\
     "The percentage of time within the last second that the NIC had "	\
     "outbound non-posted read requests but could not perform the "	\
     "operation due to insufficient non-posted credits.",		\
     pcie_perf_counters)							\
   m(+1, u64, outbound_stalled_writes, "outbound_stalled_writes",	\
     "The percentage of time within the last second that the NIC had "	\
     "outbound posted writes requests but could not perform the "	\
     "operation due to insufficient posted credits.",			\
     pcie_perf_counters)							\
   m(+1, u64, outbound_stalled_reads_events,				\
     "outbound_stalled_reads_events", "The number of events where "	\
     "outbound_stalled_reads was above a threshold.",			\
     pcie_perf_counters)							\
   m(+1, u64, outbound_stalled_writes_events,				\
     "outbound_stalled_writes_events",					\
     "The number of events where outbound_stalled_writes was above "	\
     "a threshold.", pcie_perf_counters)
 
 #define	MLX5E_PCIE_TIMERS_AND_STATES_COUNTERS_32(m)			\
   m(+1, u64, time_to_boot_image_start, "time_to_boot_image_start",	\
     "Time from start until FW boot image starts running in usec.",	\
     pcie_timers_states)							\
   m(+1, u64, time_to_link_image, "time_to_link_image",			\
     "Time from start until FW pci_link image starts running in usec.",	\
     pcie_timers_states)							\
   m(+1, u64, calibration_time, "calibration_time",			\
     "Time it took FW to do calibration in usec.",			\
     pcie_timers_states)							\
   m(+1, u64, time_to_first_perst, "time_to_first_perst",		\
     "Time form start until FW handle first perst. in usec.",		\
     pcie_timers_states)							\
   m(+1, u64, time_to_detect_state, "time_to_detect_state",		\
     "Time from start until first transition to LTSSM.Detect_Q in usec",	\
     pcie_timers_states)							\
   m(+1, u64, time_to_l0, "time_to_l0",					\
     "Time from start until first transition to LTSSM.L0 in usec",	\
     pcie_timers_states)							\
   m(+1, u64, time_to_crs_en, "time_to_crs_en",				\
     "Time from start until crs is enabled in usec",			\
     pcie_timers_states)							\
   m(+1, u64, time_to_plastic_image_start, "time_to_plastic_image_start",\
     "Time form start until FW plastic image starts running in usec.",	\
     pcie_timers_states)							\
   m(+1, u64, time_to_iron_image_start, "time_to_iron_image_start",	\
     "Time form start until FW iron image starts running in usec.",	\
     pcie_timers_states)							\
   m(+1, u64, perst_handler, "perst_handler",				\
     "Number of persts arrived.", pcie_timers_states)			\
   m(+1, u64, times_in_l1, "times_in_l1",				\
     "Number of times LTSSM entered L1 flow.", pcie_timers_states)	\
   m(+1, u64, times_in_l23, "times_in_l23",				\
     "Number of times LTSSM entered L23 flow.", pcie_timers_states)	\
   m(+1, u64, dl_down, "dl_down",					\
     "Number of moves for DL_active to DL_down.", pcie_timers_states)	\
   m(+1, u64, config_cycle1usec, "config_cycle1usec",			\
     "Number of configuration requests that firmware "			\
     "handled in less than 1 usec.", pcie_timers_states)			\
   m(+1, u64, config_cycle2to7usec, "config_cycle2to7usec",		\
     "Number of configuration requests that firmware "			\
     "handled within 2 to 7 usec.", pcie_timers_states)			\
   m(+1, u64, config_cycle8to15usec, "config_cycle8to15usec",		\
     "Number of configuration requests that firmware "			\
     "handled within 8 to 15 usec.", pcie_timers_states)			\
   m(+1, u64, config_cycle16to63usec, "config_cycle16to63usec",		\
     "Number of configuration requests that firmware "			\
     "handled within 16 to 63 usec.", pcie_timers_states)		\
   m(+1, u64, config_cycle64usec, "config_cycle64usec",			\
     "Number of configuration requests that firmware "			\
     "handled took more than 64 usec.", pcie_timers_states)		\
   m(+1, u64, correctable_err_msg_sent, "correctable_err_msg_sent",	\
     "Number of correctable error messages sent.", pcie_timers_states)	\
   m(+1, u64, non_fatal_err_msg_sent, "non_fatal_err_msg_sent",		\
     "Number of non-Fatal error msg sent.", pcie_timers_states)		\
   m(+1, u64, fatal_err_msg_sent, "fatal_err_msg_sent",			\
     "Number of fatal error msg sent.", pcie_timers_states)
 
 #define	MLX5E_PCIE_LANE_COUNTERS_32(m)				\
   m(+1, u64, error_counter_lane0, "error_counter_lane0",	\
     "Error counter for PCI lane 0", pcie_lanes_counters)	\
   m(+1, u64, error_counter_lane1, "error_counter_lane1",	\
     "Error counter for PCI lane 1", pcie_lanes_counters)	\
   m(+1, u64, error_counter_lane2, "error_counter_lane2",	\
     "Error counter for PCI lane 2", pcie_lanes_counters)	\
   m(+1, u64, error_counter_lane3, "error_counter_lane3",	\
     "Error counter for PCI lane 3", pcie_lanes_counters)	\
   m(+1, u64, error_counter_lane4, "error_counter_lane4",	\
     "Error counter for PCI lane 4", pcie_lanes_counters)	\
   m(+1, u64, error_counter_lane5, "error_counter_lane5",	\
     "Error counter for PCI lane 5", pcie_lanes_counters)	\
   m(+1, u64, error_counter_lane6, "error_counter_lane6",	\
     "Error counter for PCI lane 6", pcie_lanes_counters)	\
   m(+1, u64, error_counter_lane7, "error_counter_lane7",	\
     "Error counter for PCI lane 7", pcie_lanes_counters)	\
   m(+1, u64, error_counter_lane8, "error_counter_lane8",	\
     "Error counter for PCI lane 8", pcie_lanes_counters)	\
   m(+1, u64, error_counter_lane9, "error_counter_lane9",	\
     "Error counter for PCI lane 9", pcie_lanes_counters)	\
   m(+1, u64, error_counter_lane10, "error_counter_lane10",	\
     "Error counter for PCI lane 10", pcie_lanes_counters)	\
   m(+1, u64, error_counter_lane11, "error_counter_lane11",	\
     "Error counter for PCI lane 11", pcie_lanes_counters)	\
   m(+1, u64, error_counter_lane12, "error_counter_lane12",	\
     "Error counter for PCI lane 12", pcie_lanes_counters)	\
   m(+1, u64, error_counter_lane13, "error_counter_lane13",	\
     "Error counter for PCI lane 13", pcie_lanes_counters)	\
   m(+1, u64, error_counter_lane14, "error_counter_lane14",	\
     "Error counter for PCI lane 14", pcie_lanes_counters)	\
   m(+1, u64, error_counter_lane15, "error_counter_lane15",	\
     "Error counter for PCI lane 15", pcie_lanes_counters)
 
 /*
  * Make sure to update mlx5e_update_pport_counters()
  * when adding a new MLX5E_PPORT_STATS block
  */
 #define	MLX5E_PPORT_STATS(m)			\
   MLX5E_PPORT_PER_PRIO_STATS(m)		\
   MLX5E_PPORT_IEEE802_3_STATS(m)		\
   MLX5E_PPORT_RFC2819_STATS(m)
 
 #define	MLX5E_PORT_STATS_DEBUG(m)		\
   MLX5E_PPORT_RFC2819_STATS_DEBUG(m)		\
   MLX5E_PPORT_RFC2863_STATS_DEBUG(m)		\
   MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG(m)	\
   MLX5E_PPORT_ETHERNET_EXTENDED_STATS_DEBUG(m)	\
   MLX5E_PPORT_STATISTICAL_DEBUG(m)		\
   MLX5E_PCIE_PERFORMANCE_COUNTERS_64(m) \
   MLX5E_PCIE_PERFORMANCE_COUNTERS_32(m) \
   MLX5E_PCIE_TIMERS_AND_STATES_COUNTERS_32(m) \
   MLX5E_PCIE_LANE_COUNTERS_32(m)
 
 #define	MLX5E_PPORT_IEEE802_3_STATS_NUM \
   (0 MLX5E_PPORT_IEEE802_3_STATS(MLX5E_STATS_COUNT))
 #define	MLX5E_PPORT_RFC2819_STATS_NUM \
   (0 MLX5E_PPORT_RFC2819_STATS(MLX5E_STATS_COUNT))
 #define	MLX5E_PPORT_STATS_NUM \
   (0 MLX5E_PPORT_STATS(MLX5E_STATS_COUNT))
 
 #define	MLX5E_PPORT_PER_PRIO_STATS_NUM \
   (0 MLX5E_PPORT_PER_PRIO_STATS(MLX5E_STATS_COUNT))
 #define	MLX5E_PPORT_RFC2819_STATS_DEBUG_NUM \
   (0 MLX5E_PPORT_RFC2819_STATS_DEBUG(MLX5E_STATS_COUNT))
 #define	MLX5E_PPORT_RFC2863_STATS_DEBUG_NUM \
   (0 MLX5E_PPORT_RFC2863_STATS_DEBUG(MLX5E_STATS_COUNT))
 #define	MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG_NUM \
   (0 MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG(MLX5E_STATS_COUNT))
 #define	MLX5E_PPORT_ETHERNET_EXTENDED_STATS_DEBUG_NUM \
   (0 MLX5E_PPORT_ETHERNET_EXTENDED_STATS_DEBUG(MLX5E_STATS_COUNT))
 #define	MLX5E_PPORT_STATISTICAL_DEBUG_NUM \
   (0 MLX5E_PPORT_STATISTICAL_DEBUG(MLX5E_STATS_COUNT))
 #define	MLX5E_PORT_STATS_DEBUG_NUM \
   (0 MLX5E_PORT_STATS_DEBUG(MLX5E_STATS_COUNT))
 
 struct mlx5e_pport_stats {
 	struct	sysctl_ctx_list ctx;
 	u64	arg [0];
 	MLX5E_PPORT_STATS(MLX5E_STATS_VAR)
 };
 
 struct mlx5e_port_stats_debug {
 	struct	sysctl_ctx_list ctx;
 	u64	arg [0];
 	MLX5E_PORT_STATS_DEBUG(MLX5E_STATS_VAR)
 };
 
 #define	MLX5E_RQ_STATS(m)					\
   m(+1, u64, packets, "packets", "Received packets")		\
   m(+1, u64, bytes, "bytes", "Received bytes")			\
   m(+1, u64, csum_none, "csum_none", "Received packets")		\
   m(+1, u64, lro_packets, "lro_packets", "Received LRO packets")	\
   m(+1, u64, lro_bytes, "lro_bytes", "Received LRO bytes")	\
   m(+1, u64, sw_lro_queued, "sw_lro_queued", "Packets queued for SW LRO")	\
   m(+1, u64, sw_lro_flushed, "sw_lro_flushed", "Packets flushed from SW LRO")	\
   m(+1, u64, wqe_err, "wqe_err", "Received packets")
 
 #define	MLX5E_RQ_STATS_NUM (0 MLX5E_RQ_STATS(MLX5E_STATS_COUNT))
 
 struct mlx5e_rq_stats {
 	struct	sysctl_ctx_list ctx;
 	u64	arg [0];
 	MLX5E_RQ_STATS(MLX5E_STATS_VAR)
 };
 
 #define	MLX5E_SQ_STATS(m)						\
   m(+1, u64, packets, "packets", "Transmitted packets")			\
   m(+1, u64, bytes, "bytes", "Transmitted bytes")			\
   m(+1, u64, tso_packets, "tso_packets", "Transmitted packets")		\
   m(+1, u64, tso_bytes, "tso_bytes", "Transmitted bytes")		\
   m(+1, u64, csum_offload_none, "csum_offload_none", "Transmitted packets")	\
   m(+1, u64, defragged, "defragged", "Transmitted packets")		\
   m(+1, u64, dropped, "dropped", "Transmitted packets")			\
   m(+1, u64, enobuf, "enobuf", "Transmitted packets")			\
   m(+1, u64, cqe_err, "cqe_err", "Transmit CQE errors")			\
   m(+1, u64, nop, "nop", "Transmitted packets")
 
 #define	MLX5E_SQ_STATS_NUM (0 MLX5E_SQ_STATS(MLX5E_STATS_COUNT))
 
 struct mlx5e_sq_stats {
 	struct	sysctl_ctx_list ctx;
 	u64	arg [0];
 	MLX5E_SQ_STATS(MLX5E_STATS_VAR)
 };
 
 struct mlx5e_stats {
 	struct mlx5e_vport_stats vport;
 	struct mlx5e_pport_stats pport;
 	struct mlx5e_port_stats_debug port_stats_debug;
 };
 
 struct mlx5e_rq_param {
 	u32	rqc [MLX5_ST_SZ_DW(rqc)];
 	struct mlx5_wq_param wq;
 };
 
 struct mlx5e_sq_param {
 	u32	sqc [MLX5_ST_SZ_DW(sqc)];
 	struct mlx5_wq_param wq;
 };
 
 struct mlx5e_cq_param {
 	u32	cqc [MLX5_ST_SZ_DW(cqc)];
 	struct mlx5_wq_param wq;
 };
 
 struct mlx5e_params {
 	u8	log_sq_size;
 	u8	log_rq_size;
 	u16	num_channels;
 	u8	default_vlan_prio;
 	u8	num_tc;
 	u8	rx_cq_moderation_mode;
 	u8	tx_cq_moderation_mode;
 	u16	rx_cq_moderation_usec;
 	u16	rx_cq_moderation_pkts;
 	u16	tx_cq_moderation_usec;
 	u16	tx_cq_moderation_pkts;
 	u16	min_rx_wqes;
 	bool	hw_lro_en;
 	bool	cqe_zipping_en;
 	u32	lro_wqe_sz;
 	u16	rx_hash_log_tbl_sz;
 	u32	tx_pauseframe_control __aligned(4);
 	u32	rx_pauseframe_control __aligned(4);
 	u16	tx_max_inline;
 	u8	tx_min_inline_mode;
 	u8	tx_priority_flow_control;
 	u8	rx_priority_flow_control;
 	u8	channels_rsss;
 };
 
 #define	MLX5E_PARAMS(m)							\
   m(+1, u64, tx_queue_size_max, "tx_queue_size_max", "Max send queue size") \
   m(+1, u64, rx_queue_size_max, "rx_queue_size_max", "Max receive queue size") \
   m(+1, u64, tx_queue_size, "tx_queue_size", "Default send queue size")	\
   m(+1, u64, rx_queue_size, "rx_queue_size", "Default receive queue size") \
   m(+1, u64, channels, "channels", "Default number of channels")		\
   m(+1, u64, channels_rsss, "channels_rsss", "Default channels receive side scaling stride") \
   m(+1, u64, coalesce_usecs_max, "coalesce_usecs_max", "Maximum usecs for joining packets") \
   m(+1, u64, coalesce_pkts_max, "coalesce_pkts_max", "Maximum packets to join") \
   m(+1, u64, rx_coalesce_usecs, "rx_coalesce_usecs", "Limit in usec for joining rx packets") \
   m(+1, u64, rx_coalesce_pkts, "rx_coalesce_pkts", "Maximum number of rx packets to join") \
   m(+1, u64, rx_coalesce_mode, "rx_coalesce_mode", "0: EQE fixed mode 1: CQE fixed mode 2: EQE auto mode 3: CQE auto mode") \
   m(+1, u64, tx_coalesce_usecs, "tx_coalesce_usecs", "Limit in usec for joining tx packets") \
   m(+1, u64, tx_coalesce_pkts, "tx_coalesce_pkts", "Maximum number of tx packets to join") \
   m(+1, u64, tx_coalesce_mode, "tx_coalesce_mode", "0: EQE mode 1: CQE mode") \
   m(+1, u64, tx_completion_fact, "tx_completion_fact", "1..MAX: Completion event ratio") \
   m(+1, u64, tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio") \
   m(+1, u64, hw_lro, "hw_lro", "set to enable hw_lro") \
   m(+1, u64, cqe_zipping, "cqe_zipping", "0 : CQE zipping disabled") \
   m(+1, u64, modify_tx_dma, "modify_tx_dma", "0: Enable TX 1: Disable TX") \
   m(+1, u64, modify_rx_dma, "modify_rx_dma", "0: Enable RX 1: Disable RX") \
   m(+1, u64, diag_pci_enable, "diag_pci_enable", "0: Disabled 1: Enabled") \
   m(+1, u64, diag_general_enable, "diag_general_enable", "0: Disabled 1: Enabled") \
   m(+1, u64, hw_mtu, "hw_mtu", "Current hardware MTU value") \
   m(+1, u64, mc_local_lb, "mc_local_lb", "0: Local multicast loopback enabled 1: Disabled") \
   m(+1, u64, uc_local_lb, "uc_local_lb", "0: Local unicast loopback enabled 1: Disabled") \
   m(+1, s64, irq_cpu_base, "irq_cpu_base", "-1: Don't bind IRQ 0..NCPU-1: select this base CPU when binding IRQs") \
   m(+1, s64, irq_cpu_stride, "irq_cpu_stride", "0..NCPU-1: Distance between IRQ vectors when binding them")
 
 #define	MLX5E_PARAMS_NUM (0 MLX5E_PARAMS(MLX5E_STATS_COUNT))
 
 struct mlx5e_params_ethtool {
 	u64	arg [0];
 	MLX5E_PARAMS(MLX5E_STATS_VAR)
 	u64	max_bw_value[IEEE_8021QAZ_MAX_TCS];
 	u8	max_bw_share[IEEE_8021QAZ_MAX_TCS];
 	u8	prio_tc[MLX5E_MAX_PRIORITY];
 	u8	dscp2prio[MLX5_MAX_SUPPORTED_DSCP];
 	u8	trust_state;
 	u8	fec_mask_10x_25x[MLX5E_MAX_FEC_10X_25X];
 	u16	fec_mask_50x[MLX5E_MAX_FEC_50X];
 	u8	fec_avail_10x_25x[MLX5E_MAX_FEC_10X_25X];
 	u16	fec_avail_50x[MLX5E_MAX_FEC_50X];
 	u32	fec_mode_active;
 	u32	hw_mtu_msb;
 	s32	hw_val_temp[MLX5_MAX_TEMPERATURE];
 	u32	hw_num_temp;
 };
 
 struct mlx5e_cq {
 	/* data path - accessed per cqe */
 	struct mlx5_cqwq wq;
 
 	/* data path - accessed per HW polling */
 	struct mlx5_core_cq mcq;
 
 	/* control */
 	struct mlx5e_priv *priv;
 	struct mlx5_wq_ctrl wq_ctrl;
 } __aligned(MLX5E_CACHELINE_SIZE);
 
 struct mlx5e_rq_mbuf {
 	bus_dmamap_t	dma_map;
 	caddr_t		data;
 	struct mbuf	*mbuf;
 };
 
 struct mlx5e_rq {
 	/* persistant fields */
 	struct mtx mtx;
 	struct mlx5e_rq_stats stats;
 	struct callout watchdog;
 
 	/* data path */
 #define	mlx5e_rq_zero_start wq
 	struct mlx5_wq_ll wq;
 	bus_dma_tag_t dma_tag;
 	u32	wqe_sz;
 	u32	nsegs;
 	struct mlx5e_rq_mbuf *mbuf;
 	struct ifnet *ifp;
 	struct mlx5e_cq cq;
 	struct lro_ctrl lro;
 	volatile int enabled;
 	int	ix;
 
 	/* Dynamic Interrupt Moderation */
 	struct net_dim dim;
 
 	/* control */
 	struct mlx5_wq_ctrl wq_ctrl;
 	u32	rqn;
 	struct mlx5e_channel *channel;
 } __aligned(MLX5E_CACHELINE_SIZE);
 
 struct mlx5e_sq_mbuf {
 	bus_dmamap_t dma_map;
 	struct mbuf *mbuf;
 	volatile s32 *p_refcount;	/* in use refcount, if any */
 	u32	num_bytes;
 	u32	num_wqebbs;
 };
 
 enum {
 	MLX5E_SQ_READY,
 	MLX5E_SQ_FULL
 };
 
 struct mlx5e_sq {
 	/* persistant fields */
 	struct	mtx lock;
 	struct	mtx comp_lock;
 	struct	mlx5e_sq_stats stats;
 	struct	callout cev_callout;
 
 	/* data path */
 #define	mlx5e_sq_zero_start dma_tag
 	bus_dma_tag_t dma_tag;
 
 	/* dirtied @completion */
 	u16	cc;
 
 	/* dirtied @xmit */
 	u16	pc __aligned(MLX5E_CACHELINE_SIZE);
 	u16	cev_counter;		/* completion event counter */
 	u16	cev_factor;		/* completion event factor */
 	u16	cev_next_state;		/* next completion event state */
 #define	MLX5E_CEV_STATE_INITIAL 0	/* timer not started */
 #define	MLX5E_CEV_STATE_SEND_NOPS 1	/* send NOPs */
 #define	MLX5E_CEV_STATE_HOLD_NOPS 2	/* don't send NOPs yet */
 	u16	running;		/* set if SQ is running */
 	union {
 		u32	d32[2];
 		u64	d64;
 	} doorbell;
 
 	struct	mlx5e_cq cq;
 
 	/* pointers to per packet info: write@xmit, read@completion */
 	struct	mlx5e_sq_mbuf *mbuf;
 
 	/* read only */
 	struct	mlx5_wq_cyc wq;
 	void __iomem *uar_map;
 	struct	ifnet *ifp;
 	u32	sqn;
 	u32	mkey_be;
 	u16	max_inline;
 	u8	min_inline_mode;
 	u8	min_insert_caps;
 #define	MLX5E_INSERT_VLAN 1
 #define	MLX5E_INSERT_NON_VLAN 2
 
 	/* control path */
 	struct	mlx5_wq_ctrl wq_ctrl;
 	struct	mlx5e_priv *priv;
 	int	tc;
 } __aligned(MLX5E_CACHELINE_SIZE);
 
 static inline bool
 mlx5e_sq_has_room_for(struct mlx5e_sq *sq, u16 n)
 {
 	u16 cc = sq->cc;
 	u16 pc = sq->pc;
 
 	return ((sq->wq.sz_m1 & (cc - pc)) >= n || cc == pc);
 }
 
 static inline u32
 mlx5e_sq_queue_level(struct mlx5e_sq *sq)
 {
 	u16 cc;
 	u16 pc;
 
 	if (sq == NULL)
 		return (0);
 
 	cc = sq->cc;
 	pc = sq->pc;
 
 	return (((sq->wq.sz_m1 & (pc - cc)) *
 	    IF_SND_QUEUE_LEVEL_MAX) / sq->wq.sz_m1);
 }
 
 struct mlx5e_channel {
 	struct mlx5e_rq rq;
 	struct m_snd_tag tag;
 	struct mlx5_sq_bfreg bfreg;
 	struct mlx5e_sq sq[MLX5E_MAX_TX_NUM_TC];
 	struct mlx5e_priv *priv;
 	struct completion completion;
 	int	ix;
 } __aligned(MLX5E_CACHELINE_SIZE);
 
 enum mlx5e_traffic_types {
 	MLX5E_TT_IPV4_TCP,
 	MLX5E_TT_IPV6_TCP,
 	MLX5E_TT_IPV4_UDP,
 	MLX5E_TT_IPV6_UDP,
 	MLX5E_TT_IPV4_IPSEC_AH,
 	MLX5E_TT_IPV6_IPSEC_AH,
 	MLX5E_TT_IPV4_IPSEC_ESP,
 	MLX5E_TT_IPV6_IPSEC_ESP,
 	MLX5E_TT_IPV4,
 	MLX5E_TT_IPV6,
 	MLX5E_TT_ANY,
 	MLX5E_NUM_TT,
 };
 
 enum {
 	MLX5E_RQT_SPREADING = 0,
 	MLX5E_RQT_DEFAULT_RQ = 1,
 	MLX5E_NUM_RQT = 2,
 };
 
 struct mlx5_flow_rule;
 
 struct mlx5e_eth_addr_info {
 	u8	addr [ETH_ALEN + 2];
 	u32	tt_vec;
 	/* flow table rule per traffic type */
 	struct mlx5_flow_rule	*ft_rule[MLX5E_NUM_TT];
 };
 
 #define	MLX5E_ETH_ADDR_HASH_SIZE (1 << BITS_PER_BYTE)
 
 struct mlx5e_eth_addr_hash_node;
 
 struct mlx5e_eth_addr_hash_head {
 	struct mlx5e_eth_addr_hash_node *lh_first;
 };
 
 struct mlx5e_eth_addr_db {
 	struct mlx5e_eth_addr_hash_head if_uc[MLX5E_ETH_ADDR_HASH_SIZE];
 	struct mlx5e_eth_addr_hash_head if_mc[MLX5E_ETH_ADDR_HASH_SIZE];
 	struct mlx5e_eth_addr_info broadcast;
 	struct mlx5e_eth_addr_info allmulti;
 	struct mlx5e_eth_addr_info promisc;
 	bool	broadcast_enabled;
 	bool	allmulti_enabled;
 	bool	promisc_enabled;
 };
 
 enum {
 	MLX5E_STATE_ASYNC_EVENTS_ENABLE,
 	MLX5E_STATE_OPENED,
 };
 
 enum {
 	MLX5_BW_NO_LIMIT   = 0,
 	MLX5_100_MBPS_UNIT = 3,
 	MLX5_GBPS_UNIT     = 4,
 };
 
 struct mlx5e_vlan_db {
 	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
 	struct mlx5_flow_rule	*active_vlans_ft_rule[VLAN_N_VID];
 	struct mlx5_flow_rule	*untagged_ft_rule;
 	struct mlx5_flow_rule	*any_cvlan_ft_rule;
 	struct mlx5_flow_rule	*any_svlan_ft_rule;
 	bool	filter_disabled;
 };
 
 struct mlx5e_vxlan_db_el {
 	u_int refcount;
 	u_int proto;
 	u_int port;
 	bool installed;
 	struct mlx5_flow_rule *vxlan_ft_rule;
 	TAILQ_ENTRY(mlx5e_vxlan_db_el) link;
 };
 
 struct mlx5e_vxlan_db {
 	TAILQ_HEAD(, mlx5e_vxlan_db_el) head;
 };
 
 struct mlx5e_flow_table {
 	int num_groups;
 	struct mlx5_flow_table *t;
 	struct mlx5_flow_group **g;
 };
 
 struct mlx5e_flow_tables {
 	struct mlx5_flow_namespace *ns;
 	struct mlx5e_flow_table vlan;
 	struct mlx5e_flow_table vxlan;
 	struct mlx5_flow_rule *vxlan_catchall_ft_rule;
 	struct mlx5e_flow_table main;
 	struct mlx5e_flow_table main_vxlan;
 	struct mlx5_flow_rule *main_vxlan_rule[MLX5E_NUM_TT];
 	struct mlx5e_flow_table inner_rss;
 };
 
 struct mlx5e_xmit_args {
 	volatile s32 *pref;
 	u32 tisn;
 	u16 ihs;
 };
 
 #include "en_rl.h"
 #include "en_hw_tls.h"
 
 #define	MLX5E_TSTMP_PREC 10
 
 struct mlx5e_clbr_point {
 	uint64_t base_curr;
 	uint64_t base_prev;
 	uint64_t clbr_hw_prev;
 	uint64_t clbr_hw_curr;
 	u_int clbr_gen;
 };
 
 struct mlx5e_dcbx {
 	u32	cable_len;
 	u32	xoff;
 };
 
 struct mlx5e_priv {
 	struct mlx5_core_dev *mdev;     /* must be first */
 
 	/* priv data path fields - start */
 	int	order_base_2_num_channels;
 	int	queue_mapping_channel_mask;
 	int	num_tc;
 	int	default_vlan_prio;
 	/* priv data path fields - end */
 
 	unsigned long state;
 	int	gone;
 #define	PRIV_LOCK(priv) sx_xlock(&(priv)->state_lock)
 #define	PRIV_UNLOCK(priv) sx_xunlock(&(priv)->state_lock)
 #define	PRIV_LOCKED(priv) sx_xlocked(&(priv)->state_lock)
 #define	PRIV_ASSERT_LOCKED(priv) sx_assert(&(priv)->state_lock, SA_XLOCKED)
 	struct sx state_lock;		/* Protects Interface state */
 	u32	pdn;
 	u32	tdn;
 	struct mlx5_core_mkey mr;
 
 	u32	tisn[MLX5E_MAX_TX_NUM_TC];
 	u32	rqtn;
 	u32	tirn[MLX5E_NUM_TT];
 	u32	tirn_inner_vxlan[MLX5E_NUM_TT];
 
 	struct mlx5e_flow_tables fts;
 	struct mlx5e_eth_addr_db eth_addr;
 	struct mlx5e_vlan_db vlan;
 	struct mlx5e_vxlan_db vxlan;
 
 	struct mlx5e_params params;
 	struct mlx5e_params_ethtool params_ethtool;
 	union mlx5_core_pci_diagnostics params_pci;
 	union mlx5_core_general_diagnostics params_general;
 	struct mtx async_events_mtx;	/* sync hw events */
 	struct work_struct update_stats_work;
 	struct work_struct update_carrier_work;
 	struct work_struct set_rx_mode_work;
 	MLX5_DECLARE_DOORBELL_LOCK(doorbell_lock)
 
 	struct ifnet *ifp;
 	struct sysctl_ctx_list sysctl_ctx;
 	struct sysctl_oid *sysctl_ifnet;
 	struct sysctl_oid *sysctl_hw;
 	int	sysctl_debug;
 	struct mlx5e_stats stats;
 	int	counter_set_id;
 
 	struct workqueue_struct *wq;
 
 	eventhandler_tag vlan_detach;
 	eventhandler_tag vlan_attach;
 	struct ifmedia media;
 	int	media_status_last;
 	int	media_active_last;
 	eventhandler_tag vxlan_start;
 	eventhandler_tag vxlan_stop;
 
 	struct callout watchdog;
 
 	struct mlx5e_rl_priv_data rl;
 
 	struct mlx5e_tls tls;
 
 	struct callout tstmp_clbr;
 	int	clbr_done;
 	int	clbr_curr;
 	struct mlx5e_clbr_point clbr_points[2];
 	u_int	clbr_gen;
 
 	struct mlx5e_dcbx dcbx;
 	bool	sw_is_port_buf_owner;
 
 	struct pfil_head *pfil;
 	struct mlx5e_channel channel[];
 };
 
 #define	MLX5E_NET_IP_ALIGN 2
 
 struct mlx5e_tx_wqe {
 	struct mlx5_wqe_ctrl_seg ctrl;
 	struct mlx5_wqe_eth_seg eth;
 };
 
 struct mlx5e_tx_umr_wqe {
 	struct mlx5_wqe_ctrl_seg ctrl;
 	struct mlx5_wqe_umr_ctrl_seg umr;
 	uint8_t mkc[64];
 };
 
 struct mlx5e_tx_psv_wqe {
 	struct mlx5_wqe_ctrl_seg ctrl;
 	struct mlx5_seg_set_psv psv;
 };
 
 struct mlx5e_rx_wqe {
 	struct mlx5_wqe_srq_next_seg next;
 	struct mlx5_wqe_data_seg data[];
 };
 
 /* the size of the structure above must be power of two */
 CTASSERT(powerof2(sizeof(struct mlx5e_rx_wqe)));
 
 struct mlx5e_eeprom {
 	int	lock_bit;
 	int	i2c_addr;
 	int	page_num;
 	int	device_addr;
 	int	module_num;
 	int	len;
 	int	type;
 	int	page_valid;
 	u32	*data;
 };
 
 #define	MLX5E_FLD_MAX(typ, fld) ((1ULL << __mlx5_bit_sz(typ, fld)) - 1ULL)
 
 bool	mlx5e_do_send_cqe(struct mlx5e_sq *);
 int	mlx5e_get_full_header_size(const struct mbuf *, const struct tcphdr **);
 int	mlx5e_xmit(struct ifnet *, struct mbuf *);
 
 int	mlx5e_open_locked(struct ifnet *);
 int	mlx5e_close_locked(struct ifnet *);
 
 void	mlx5e_cq_error_event(struct mlx5_core_cq *mcq, int event);
 mlx5e_cq_comp_t mlx5e_rx_cq_comp;
 mlx5e_cq_comp_t mlx5e_tx_cq_comp;
 struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq);
 
 void	mlx5e_dim_work(struct work_struct *);
 void	mlx5e_dim_build_cq_param(struct mlx5e_priv *, struct mlx5e_cq_param *);
 
 int	mlx5e_open_flow_table(struct mlx5e_priv *priv);
 void	mlx5e_close_flow_table(struct mlx5e_priv *priv);
 void	mlx5e_set_rx_mode_core(struct mlx5e_priv *priv);
 void	mlx5e_set_rx_mode_work(struct work_struct *work);
 
 void	mlx5e_vlan_rx_add_vid(void *, struct ifnet *, u16);
 void	mlx5e_vlan_rx_kill_vid(void *, struct ifnet *, u16);
 void	mlx5e_enable_vlan_filter(struct mlx5e_priv *priv);
 void	mlx5e_disable_vlan_filter(struct mlx5e_priv *priv);
 int	mlx5e_add_all_vlan_rules(struct mlx5e_priv *priv);
 void	mlx5e_del_all_vlan_rules(struct mlx5e_priv *priv);
 
 void	mlx5e_vxlan_start(void *arg, struct ifnet *ifp, sa_family_t family,
 	    u_int port);
 void	mlx5e_vxlan_stop(void *arg, struct ifnet *ifp, sa_family_t family,
 	    u_int port);
 
 int	mlx5e_add_all_vxlan_rules(struct mlx5e_priv *priv);
 void	mlx5e_del_all_vxlan_rules(struct mlx5e_priv *priv);
 
 static inline void
 mlx5e_tx_notify_hw(struct mlx5e_sq *sq, u32 *wqe)
 {
 	/* ensure wqe is visible to device before updating doorbell record */
 	wmb();
 
 	*sq->wq.db = cpu_to_be32(sq->pc);
 
 	/*
 	 * Ensure the doorbell record is visible to device before ringing
 	 * the doorbell:
 	 */
 	wmb();
 
 	mlx5_write64(wqe, sq->uar_map,
 	    MLX5_GET_DOORBELL_LOCK(&sq->priv->doorbell_lock));
 }
 
 static inline void
 mlx5e_cq_arm(struct mlx5e_cq *cq, spinlock_t *dblock)
 {
 	struct mlx5_core_cq *mcq;
 
 	mcq = &cq->mcq;
 	mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, dblock, cq->wq.cc);
 }
 
 #define	mlx5e_dbg(_IGN, _priv, ...) mlx5_core_dbg((_priv)->mdev, __VA_ARGS__)
 
 extern const struct ethtool_ops mlx5e_ethtool_ops;
 void	mlx5e_create_ethtool(struct mlx5e_priv *);
 void	mlx5e_create_stats(struct sysctl_ctx_list *,
     struct sysctl_oid_list *, const char *,
     const char **, unsigned, u64 *);
 void	mlx5e_create_counter_stats(struct sysctl_ctx_list *,
     struct sysctl_oid_list *, const char *,
     const char **, unsigned, counter_u64_t *);
 void	mlx5e_send_nop(struct mlx5e_sq *, u32);
 int	mlx5e_sq_dump_xmit(struct mlx5e_sq *, struct mlx5e_xmit_args *, struct mbuf **);
 int	mlx5e_sq_xmit(struct mlx5e_sq *, struct mbuf **);
 void	mlx5e_sq_cev_timeout(void *);
 int	mlx5e_refresh_channel_params(struct mlx5e_priv *);
 int	mlx5e_open_cq(struct mlx5e_priv *, struct mlx5e_cq_param *,
     struct mlx5e_cq *, mlx5e_cq_comp_t *, int eq_ix);
 void	mlx5e_close_cq(struct mlx5e_cq *);
 void	mlx5e_free_sq_db(struct mlx5e_sq *);
 int	mlx5e_alloc_sq_db(struct mlx5e_sq *);
 int	mlx5e_enable_sq(struct mlx5e_sq *, struct mlx5e_sq_param *,
     const struct mlx5_sq_bfreg *, int tis_num);
 int	mlx5e_modify_sq(struct mlx5e_sq *, int curr_state, int next_state);
 void	mlx5e_disable_sq(struct mlx5e_sq *);
 void	mlx5e_drain_sq(struct mlx5e_sq *);
 void	mlx5e_modify_tx_dma(struct mlx5e_priv *priv, uint8_t value);
 void	mlx5e_modify_rx_dma(struct mlx5e_priv *priv, uint8_t value);
 void	mlx5e_resume_sq(struct mlx5e_sq *sq);
 void	mlx5e_update_sq_inline(struct mlx5e_sq *sq);
 void	mlx5e_refresh_sq_inline(struct mlx5e_priv *priv);
 int	mlx5e_update_buf_lossy(struct mlx5e_priv *priv);
 int	mlx5e_fec_update(struct mlx5e_priv *priv);
 int	mlx5e_hw_temperature_update(struct mlx5e_priv *priv);
 
-if_snd_tag_alloc_t mlx5e_ul_snd_tag_alloc;
-if_snd_tag_modify_t mlx5e_ul_snd_tag_modify;
-if_snd_tag_query_t mlx5e_ul_snd_tag_query;
-if_snd_tag_free_t mlx5e_ul_snd_tag_free;
-
 #endif					/* _MLX5_EN_H_ */
diff --git a/sys/dev/mlx5/mlx5_en/en_hw_tls.h b/sys/dev/mlx5/mlx5_en/en_hw_tls.h
index eca9843c7673..5f2c5da5dfc0 100644
--- a/sys/dev/mlx5/mlx5_en/en_hw_tls.h
+++ b/sys/dev/mlx5/mlx5_en/en_hw_tls.h
@@ -1,104 +1,101 @@
 /*-
  * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _MLX5_TLS_H_
 #define	_MLX5_TLS_H_
 
 #include <sys/queue.h>
 
 #define	MLX5E_TLS_TAG_LOCK(tag)		mtx_lock(&(tag)->mtx)
 #define	MLX5E_TLS_TAG_UNLOCK(tag)	mtx_unlock(&(tag)->mtx)
 
 #define	MLX5E_TLS_STAT_INC(tag, field, num) \
 	counter_u64_add((tag)->tls->stats.field, num)
 
 enum {
       MLX5E_TLS_LOOP = 0,
       MLX5E_TLS_FAILURE = 1,
       MLX5E_TLS_DEFERRED = 2,
       MLX5E_TLS_CONTINUE = 3,
 };
 
 struct mlx5e_tls_tag {
 	struct m_snd_tag tag;
 	STAILQ_ENTRY(mlx5e_tls_tag) entry;
 	volatile s32 refs;	/* number of pending mbufs */
 	uint32_t tisn;		/* HW TIS context number */
 	uint32_t dek_index;	/* HW TLS context number */
 	struct mlx5e_tls *tls;
 	struct m_snd_tag *rl_tag;
 	struct mtx mtx;
 	uint32_t expected_seq; /* expected TCP sequence number */
 	uint32_t state;	/* see MLX5E_TLS_ST_XXX */
 #define	MLX5E_TLS_ST_INIT 0
 #define	MLX5E_TLS_ST_SETUP 1
 #define	MLX5E_TLS_ST_TXRDY 2
 #define	MLX5E_TLS_ST_FREED 3
 	struct work_struct work;
 
 	uint32_t dek_index_ok:1;
 
 	/* parameters needed */
 	uint8_t crypto_params[128] __aligned(4);
 } __aligned(MLX5E_CACHELINE_SIZE);
 
 #define	MLX5E_TLS_STATS(m)					\
   m(+1, u64, tx_packets, "tx_packets", "Transmitted packets")	\
   m(+1, u64, tx_bytes, "tx_bytes", "Transmitted bytes")		\
   m(+1, u64, tx_packets_ooo, "tx_packets_ooo", "Transmitted packets out of order") \
   m(+1, u64, tx_bytes_ooo, "tx_bytes_ooo", "Transmitted bytes out of order") \
   m(+1, u64, tx_error, "tx_error", "Transmitted packets with error")
 
 #define	MLX5E_TLS_STATS_NUM (0 MLX5E_TLS_STATS(MLX5E_STATS_COUNT))
 
 struct mlx5e_tls_stats {
 	struct	sysctl_ctx_list ctx;
 	counter_u64_t	arg[0];
 	MLX5E_TLS_STATS(MLX5E_STATS_COUNTER)
 };
 
 struct mlx5e_tls {
 	struct sysctl_ctx_list ctx;
 	struct mlx5e_tls_stats stats;
 	struct workqueue_struct *wq;
 	uma_zone_t zone;
 	uint32_t max_resources;		/* max number of resources */
 	volatile uint32_t num_resources;	/* current number of resources */
 	int init;			/* set when ready */
 	char zname[32];
 };
 
 int mlx5e_tls_init(struct mlx5e_priv *);
 void mlx5e_tls_cleanup(struct mlx5e_priv *);
 int mlx5e_sq_tls_xmit(struct mlx5e_sq *, struct mlx5e_xmit_args *, struct mbuf **);
 
 if_snd_tag_alloc_t mlx5e_tls_snd_tag_alloc;
-if_snd_tag_modify_t mlx5e_tls_snd_tag_modify;
-if_snd_tag_query_t mlx5e_tls_snd_tag_query;
-if_snd_tag_free_t mlx5e_tls_snd_tag_free;
 
 #endif					/* _MLX5_TLS_H_ */
diff --git a/sys/dev/mlx5/mlx5_en/en_rl.h b/sys/dev/mlx5/mlx5_en/en_rl.h
index f30e8ba8cc07..1d7f7afc487d 100644
--- a/sys/dev/mlx5/mlx5_en/en_rl.h
+++ b/sys/dev/mlx5/mlx5_en/en_rl.h
@@ -1,175 +1,172 @@
 /*-
  * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef __MLX5_EN_RL_H__
 #define __MLX5_EN_RL_H__
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
 #include <sys/proc.h>
 #include <sys/condvar.h>
 #include <sys/interrupt.h>
 #include <sys/unistd.h>
 
 #include <sys/queue.h>
 
 #define	MLX5E_RL_MAX_WORKERS		128	/* limited by Toeplitz hash */
 #define	MLX5E_RL_MAX_TX_RATES		(64 * 1024)	/* software limit */
 #define	MLX5E_RL_DEF_SQ_PER_WORKER	(12 * 1024)	/* software limit */
 #define	MLX5E_RL_MAX_SQS		(120 * 1024)	/* software limit */
 
 #define	MLX5E_RL_TX_COAL_USEC_DEFAULT	32
 #define	MLX5E_RL_TX_COAL_PKTS_DEFAULT	4
 #define	MLX5E_RL_TX_COAL_MODE_DEFAULT	0
 #define	MLX5E_RL_TX_COMP_FACT_DEFAULT	1
 
 #define	MLX5E_RL_WORKER_LOCK(rlw)		mtx_lock(&(rlw)->mtx)
 #define	MLX5E_RL_WORKER_UNLOCK(rlw)		mtx_unlock(&(rlw)->mtx)
 
 #define	MLX5E_RL_RLOCK(rl) sx_slock(&(rl)->rl_sxlock)
 #define	MLX5E_RL_RUNLOCK(rl) sx_sunlock(&(rl)->rl_sxlock)
 
 #define	MLX5E_RL_WLOCK(rl) sx_xlock(&(rl)->rl_sxlock)
 #define	MLX5E_RL_WUNLOCK(rl) sx_xunlock(&(rl)->rl_sxlock)
 
 #define	MLX5E_RL_PARAMS(m) \
   m(+1, u64, tx_queue_size, "tx_queue_size", "Default send queue size") \
   m(+1, u64, tx_coalesce_usecs, "tx_coalesce_usecs", "Limit in usec for joining TX packets") \
   m(+1, u64, tx_coalesce_pkts, "tx_coalesce_pkts", "Maximum number of TX packets to join") \
   m(+1, u64, tx_coalesce_mode, "tx_coalesce_mode", "0: EQE mode 1: CQE mode") \
   m(+1, u64, tx_completion_fact, "tx_completion_fact", "1..MAX: Completion event ratio") \
   m(+1, u64, tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio") \
   m(+1, u64, tx_worker_threads_max, "tx_worker_threads_max", "Max number of TX worker threads") \
   m(+1, u64, tx_worker_threads_def, "tx_worker_threads_def", "Default number of TX worker threads") \
   m(+1, u64, tx_channels_per_worker_max, "tx_channels_per_worker_max", "Max number of TX channels per worker") \
   m(+1, u64, tx_channels_per_worker_def, "tx_channels_per_worker_def", "Default number of TX channels per worker") \
   m(+1, u64, tx_rates_max, "tx_rates_max", "Max number of TX rates") \
   m(+1, u64, tx_rates_def, "tx_rates_def", "Default number of TX rates") \
   m(+1, u64, tx_limit_min, "tx_limit_min", "Minimum TX rate in bits/s") \
   m(+1, u64, tx_limit_max, "tx_limit_max", "Maximum TX rate in bits/s") \
   m(+1, u64, tx_burst_size, "tx_burst_size", "Current burst size in number of packets. A value of zero means use firmware default.") \
   m(+1, u64, tx_burst_size_max, "tx_burst_size_max", "Maximum burst size in number of packets") \
   m(+1, u64, tx_burst_size_min, "tx_burst_size_min", "Minimum burst size in number of packets")
 
 #define	MLX5E_RL_PARAMS_NUM (0 MLX5E_RL_PARAMS(MLX5E_STATS_COUNT))
 
 #define MLX5E_RL_STATS(m) \
   m(+1, u64, tx_allocate_resource_failure, "tx_allocate_resource_failure", "Number of times firmware resource allocation failed") \
   m(+1, u64, tx_add_new_rate_failure, "tx_add_new_rate_failure", "Number of times adding a new firmware rate failed") \
   m(+1, u64, tx_modify_rate_failure, "tx_modify_rate_failure", "Number of times modifying a firmware rate failed") \
   m(+1, u64, tx_active_connections, "tx_active_connections", "Number of active connections") \
   m(+1, u64, tx_open_queues, "tx_open_queues", "Number of open TX queues") \
   m(+1, u64, tx_available_resource_failure, "tx_available_resource_failure", "Number of times TX resources were not available")
 
 #define MLX5E_RL_STATS_NUM (0 MLX5E_RL_STATS(MLX5E_STATS_COUNT))
 
 #define	MLX5E_RL_TABLE_PARAMS(m) \
   m(+1, u64, tx_limit_add, "tx_limit_add", "Add TX rate limit in bits/s to empty slot") \
   m(+1, u64, tx_limit_clr, "tx_limit_clr", "Clear all TX rates in table") \
   m(+1, u64, tx_allowed_deviation, "tx_allowed_deviation", "Relative rate deviation allowed in 1/1000") \
   m(+1, u64, tx_allowed_deviation_min, "tx_allowed_deviation_min", "Minimum allowed rate deviation in 1/1000") \
   m(+1, u64, tx_allowed_deviation_max, "tx_allowed_deviation_max", "Maximum allowed rate deviation in 1/1000")
 
 #define	MLX5E_RL_TABLE_PARAMS_NUM (0 MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_COUNT))
 
 #define	MLX5E_RL_PARAMS_INDEX(n)			\
     (__offsetof(struct mlx5e_rl_params, n) / sizeof(uint64_t))
 
 struct mlx5e_priv;
 
 /* Indicates channel's state */
 enum {
 	MLX5E_RL_ST_FREE,
 	MLX5E_RL_ST_USED,
 	MLX5E_RL_ST_MODIFY,
 	MLX5E_RL_ST_DESTROY,
 };
 
 struct mlx5e_rl_stats {
 	u64	arg [0];
 	MLX5E_RL_STATS(MLX5E_STATS_VAR)
 };
 
 struct mlx5e_rl_params {
 	u64	arg [0];
 	MLX5E_RL_PARAMS(MLX5E_STATS_VAR)
 	u64	table_arg [0];
 	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_VAR)
 };
 
 struct mlx5e_rl_channel_param {
 	struct mlx5e_sq_param sq;
 	struct mlx5e_cq_param cq;
 };
 
 struct mlx5e_rl_channel {
 	struct m_snd_tag tag;
 	STAILQ_ENTRY(mlx5e_rl_channel) entry;
 	struct mlx5e_sq * volatile sq;
 	struct mlx5e_rl_worker *worker;
 	uint64_t new_rate;
 	uint64_t init_rate;
 	uint64_t last_rate;
 	uint16_t last_burst;
 	uint16_t state;
 };
 
 struct mlx5e_rl_worker {
 	struct mtx mtx;
 	struct cv cv;
 	STAILQ_HEAD(, mlx5e_rl_channel) index_list_head;
 	STAILQ_HEAD(, mlx5e_rl_channel) process_head;
 	struct mlx5e_priv *priv;
 	struct mlx5e_rl_channel *channels;
 	unsigned worker_done;
 };
 
 struct mlx5e_rl_priv_data {
 	struct sx rl_sxlock;
 	struct sysctl_ctx_list ctx;
 	struct mlx5e_rl_channel_param chan_param;
 	struct mlx5e_rl_params param;
 	struct mlx5e_rl_stats stats;
 	struct mlx5e_rl_worker *workers;
 	struct mlx5e_priv *priv;
 	uint64_t *rate_limit_table;
 	unsigned opened;
 	uint32_t tisn;
 };
 
 int mlx5e_rl_init(struct mlx5e_priv *priv);
 void mlx5e_rl_cleanup(struct mlx5e_priv *priv);
 void mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl);
 
 if_snd_tag_alloc_t mlx5e_rl_snd_tag_alloc;
-if_snd_tag_modify_t mlx5e_rl_snd_tag_modify;
-if_snd_tag_query_t mlx5e_rl_snd_tag_query;
-if_snd_tag_free_t mlx5e_rl_snd_tag_free;
 
 #endif		/* __MLX5_EN_RL_H__ */
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c
index 1a92e5aa222a..6140671fe0c2 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c
@@ -1,805 +1,808 @@
 /*-
  * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_kern_tls.h"
 
 #include "en.h"
 
 #include <dev/mlx5/tls.h>
 
 #include <linux/delay.h>
 #include <sys/ktls.h>
 #include <opencrypto/cryptodev.h>
 
 #ifdef KERN_TLS
 
+#ifdef RATELIMIT
+static if_snd_tag_modify_t mlx5e_tls_rl_snd_tag_modify;
+#endif
+static if_snd_tag_query_t mlx5e_tls_snd_tag_query;
+static if_snd_tag_free_t mlx5e_tls_snd_tag_free;
+
+static const struct if_snd_tag_sw mlx5e_tls_snd_tag_sw = {
+	.snd_tag_query = mlx5e_tls_snd_tag_query,
+	.snd_tag_free = mlx5e_tls_snd_tag_free,
+	.type = IF_SND_TAG_TYPE_TLS
+};
+
+#ifdef RATELIMIT
+static const struct if_snd_tag_sw mlx5e_tls_rl_snd_tag_sw = {
+	.snd_tag_modify = mlx5e_tls_rl_snd_tag_modify,
+	.snd_tag_query = mlx5e_tls_snd_tag_query,
+	.snd_tag_free = mlx5e_tls_snd_tag_free,
+	.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT
+};
+#endif
+
 MALLOC_DEFINE(M_MLX5E_TLS, "MLX5E_TLS", "MLX5 ethernet HW TLS");
 
 /* software TLS context */
 struct mlx5_ifc_sw_tls_cntx_bits {
 	struct mlx5_ifc_tls_static_params_bits param;
 	struct mlx5_ifc_tls_progress_params_bits progress;
 	struct {
 		uint8_t key_data[8][0x20];
 		uint8_t key_len[0x20];
 	} key;
 };
 
 CTASSERT(MLX5_ST_SZ_BYTES(sw_tls_cntx) <= sizeof(((struct mlx5e_tls_tag *)0)->crypto_params));
 CTASSERT(MLX5_ST_SZ_BYTES(mkc) == sizeof(((struct mlx5e_tx_umr_wqe *)0)->mkc));
 
 static const char *mlx5e_tls_stats_desc[] = {
 	MLX5E_TLS_STATS(MLX5E_STATS_DESC)
 };
 
 static void mlx5e_tls_work(struct work_struct *);
 
 static int
 mlx5e_tls_tag_zinit(void *mem, int size, int flags)
 {
 	struct mlx5e_tls_tag *ptag = mem;
 
 	MPASS(size == sizeof(*ptag));
 
 	memset(ptag, 0, sizeof(*ptag));
 	mtx_init(&ptag->mtx, "mlx5-tls-tag-mtx", NULL, MTX_DEF);
 	INIT_WORK(&ptag->work, mlx5e_tls_work);
 
 	return (0);
 }
 
 static void
 mlx5e_tls_tag_zfini(void *mem, int size)
 {
 	struct mlx5e_tls_tag *ptag = mem;
 	struct mlx5e_priv *priv;
 	struct mlx5e_tls *ptls;
 
 	ptls = ptag->tls;
 	priv = container_of(ptls, struct mlx5e_priv, tls);
 
 	flush_work(&ptag->work);
 
 	if (ptag->tisn != 0) {
 		mlx5_tls_close_tis(priv->mdev, ptag->tisn);
 		atomic_add_32(&ptls->num_resources, -1U);
 	}
 
 	mtx_destroy(&ptag->mtx);
 }
 
 static void
 mlx5e_tls_tag_zfree(struct mlx5e_tls_tag *ptag)
 {
 
 	/* reset some variables */
 	ptag->state = MLX5E_TLS_ST_INIT;
 	ptag->dek_index = 0;
 	ptag->dek_index_ok = 0;
 
 	/* avoid leaking keys */
 	memset(ptag->crypto_params, 0, sizeof(ptag->crypto_params));
 
 	/* update number of TIS contexts */
 	if (ptag->tisn == 0)
 		atomic_add_32(&ptag->tls->num_resources, -1U);
 
 	/* return tag to UMA */
 	uma_zfree(ptag->tls->zone, ptag);
 }
 
 int
 mlx5e_tls_init(struct mlx5e_priv *priv)
 {
 	struct mlx5e_tls *ptls = &priv->tls;
 	struct sysctl_oid *node;
 	uint32_t x;
 
 	if (MLX5_CAP_GEN(priv->mdev, tls_tx) == 0)
 		return (0);
 
 	ptls->wq = create_singlethread_workqueue("mlx5-tls-wq");
 	if (ptls->wq == NULL)
 		return (ENOMEM);
 
 	sysctl_ctx_init(&ptls->ctx);
 
 	snprintf(ptls->zname, sizeof(ptls->zname),
 	    "mlx5_%u_tls", device_get_unit(priv->mdev->pdev->dev.bsddev));
 
 	ptls->zone = uma_zcreate(ptls->zname, sizeof(struct mlx5e_tls_tag),
 	    NULL, NULL, mlx5e_tls_tag_zinit, mlx5e_tls_tag_zfini, UMA_ALIGN_CACHE, 0);
 
 	ptls->max_resources = 1U << MLX5_CAP_GEN(priv->mdev, log_max_dek);
 
 	for (x = 0; x != MLX5E_TLS_STATS_NUM; x++)
 		ptls->stats.arg[x] = counter_u64_alloc(M_WAITOK);
 
 	ptls->init = 1;
 
 	node = SYSCTL_ADD_NODE(&priv->sysctl_ctx,
 	    SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
 	    "tls", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Hardware TLS offload");
 	if (node == NULL)
 		return (0);
 
 	mlx5e_create_counter_stats(&ptls->ctx,
 	    SYSCTL_CHILDREN(node), "stats",
 	    mlx5e_tls_stats_desc, MLX5E_TLS_STATS_NUM,
 	    ptls->stats.arg);
 
 	return (0);
 }
 
 void
 mlx5e_tls_cleanup(struct mlx5e_priv *priv)
 {
 	struct mlx5e_tls *ptls = &priv->tls;
 	uint32_t x;
 
 	if (MLX5_CAP_GEN(priv->mdev, tls_tx) == 0)
 		return;
 
 	ptls->init = 0;
 	flush_workqueue(ptls->wq);
 	sysctl_ctx_free(&ptls->ctx);
 	uma_zdestroy(ptls->zone);
 	destroy_workqueue(ptls->wq);
 
 	/* check if all resources are freed */
 	MPASS(priv->tls.num_resources == 0);
 
 	for (x = 0; x != MLX5E_TLS_STATS_NUM; x++)
 		counter_u64_free(ptls->stats.arg[x]);
 }
 
 static void
 mlx5e_tls_work(struct work_struct *work)
 {
 	struct mlx5e_tls_tag *ptag;
 	struct mlx5e_priv *priv;
 	int err;
 
 	ptag = container_of(work, struct mlx5e_tls_tag, work);
 	priv = container_of(ptag->tls, struct mlx5e_priv, tls);
 
 	switch (ptag->state) {
 	case MLX5E_TLS_ST_INIT:
 		/* try to open TIS, if not present */
 		if (ptag->tisn == 0) {
 			err = mlx5_tls_open_tis(priv->mdev, 0, priv->tdn,
 			    priv->pdn, &ptag->tisn);
 			if (err) {
 				MLX5E_TLS_STAT_INC(ptag, tx_error, 1);
 				break;
 			}
 		}
 		MLX5_SET(sw_tls_cntx, ptag->crypto_params, progress.pd, ptag->tisn);
 
 		/* try to allocate a DEK context ID */
 		err = mlx5_encryption_key_create(priv->mdev, priv->pdn,
 		    MLX5_ADDR_OF(sw_tls_cntx, ptag->crypto_params, key.key_data),
 		    MLX5_GET(sw_tls_cntx, ptag->crypto_params, key.key_len),
 		    &ptag->dek_index);
 		if (err) {
 			MLX5E_TLS_STAT_INC(ptag, tx_error, 1);
 			break;
 		}
 
 		MLX5_SET(sw_tls_cntx, ptag->crypto_params, param.dek_index, ptag->dek_index);
 
 		ptag->dek_index_ok = 1;
 
 		MLX5E_TLS_TAG_LOCK(ptag);
 		if (ptag->state == MLX5E_TLS_ST_INIT)
 			ptag->state = MLX5E_TLS_ST_SETUP;
 		MLX5E_TLS_TAG_UNLOCK(ptag);
 		break;
 
 	case MLX5E_TLS_ST_FREED:
 		/* wait for all refs to go away */
 		while (ptag->refs != 0)
 			msleep(1);
 
 		/* try to destroy DEK context by ID */
 		if (ptag->dek_index_ok)
 			err = mlx5_encryption_key_destroy(priv->mdev, ptag->dek_index);
 
 		/* free tag */
 		mlx5e_tls_tag_zfree(ptag);
 		break;
 
 	default:
 		break;
 	}
 }
 
 static int
 mlx5e_tls_set_params(void *ctx, const struct tls_session_params *en)
 {
 
 	MLX5_SET(sw_tls_cntx, ctx, param.const_2, 2);
 	if (en->tls_vminor == TLS_MINOR_VER_TWO)
 		MLX5_SET(sw_tls_cntx, ctx, param.tls_version, 2); /* v1.2 */
 	else
 		MLX5_SET(sw_tls_cntx, ctx, param.tls_version, 3); /* v1.3 */
 	MLX5_SET(sw_tls_cntx, ctx, param.const_1, 1);
 	MLX5_SET(sw_tls_cntx, ctx, param.encryption_standard, 1); /* TLS */
 
 	/* copy the initial vector in place */
 	switch (en->iv_len) {
 	case MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.gcm_iv):
 	case MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.gcm_iv) +
 	     MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.implicit_iv):
 		memcpy(MLX5_ADDR_OF(sw_tls_cntx, ctx, param.gcm_iv),
 		    en->iv, en->iv_len);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	if (en->cipher_key_len <= MLX5_FLD_SZ_BYTES(sw_tls_cntx, key.key_data)) {
 		memcpy(MLX5_ADDR_OF(sw_tls_cntx, ctx, key.key_data),
 		    en->cipher_key, en->cipher_key_len);
 		MLX5_SET(sw_tls_cntx, ctx, key.key_len, en->cipher_key_len);
 	} else {
 		return (EINVAL);
 	}
 	return (0);
 }
 
 /* Verify zero default */
 CTASSERT(MLX5E_TLS_ST_INIT == 0);
 
 int
 mlx5e_tls_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	union if_snd_tag_alloc_params rl_params;
+	const struct if_snd_tag_sw *snd_tag_sw;
 	struct mlx5e_priv *priv;
 	struct mlx5e_tls_tag *ptag;
 	const struct tls_session_params *en;
 	int error;
 
 	priv = ifp->if_softc;
 
 	if (priv->gone != 0 || priv->tls.init == 0)
 		return (EOPNOTSUPP);
 
 	/* allocate new tag from zone, if any */
 	ptag = uma_zalloc(priv->tls.zone, M_NOWAIT);
 	if (ptag == NULL)
 		return (ENOMEM);
 
 	/* sanity check default values */
 	MPASS(ptag->state == MLX5E_TLS_ST_INIT);
 	MPASS(ptag->dek_index == 0);
 	MPASS(ptag->dek_index_ok == 0);
 
 	/* setup TLS tag */
 	ptag->tls = &priv->tls;
 
 	/* check if there is no TIS context */
 	if (ptag->tisn == 0) {
 		uint32_t value;
 
 		value = atomic_fetchadd_32(&priv->tls.num_resources, 1U);
 
 		/* check resource limits */
 		if (value >= priv->tls.max_resources) {
 			error = ENOMEM;
 			goto failure;
 		}
 	}
 
 	en = &params->tls.tls->params;
 
 	/* only TLS v1.2 and v1.3 is currently supported */
 	if (en->tls_vmajor != TLS_MAJOR_VER_ONE ||
 	    (en->tls_vminor != TLS_MINOR_VER_TWO
 #ifdef TLS_MINOR_VER_THREE
 	     && en->tls_vminor != TLS_MINOR_VER_THREE
 #endif
 	     )) {
 		error = EPROTONOSUPPORT;
 		goto failure;
 	}
 
 	switch (en->cipher_algorithm) {
 	case CRYPTO_AES_NIST_GCM_16:
 		switch (en->cipher_key_len) {
 		case 128 / 8:
 			if (en->tls_vminor == TLS_MINOR_VER_TWO) {
 				if (MLX5_CAP_TLS(priv->mdev, tls_1_2_aes_gcm_128) == 0) {
 					error = EPROTONOSUPPORT;
 					goto failure;
 				}
 			} else {
 				if (MLX5_CAP_TLS(priv->mdev, tls_1_3_aes_gcm_128) == 0) {
 					error = EPROTONOSUPPORT;
 					goto failure;
 				}
 			}
 			error = mlx5e_tls_set_params(ptag->crypto_params, en);
 			if (error)
 				goto failure;
 			break;
 
 		case 256 / 8:
 			if (en->tls_vminor == TLS_MINOR_VER_TWO) {
 				if (MLX5_CAP_TLS(priv->mdev, tls_1_2_aes_gcm_256) == 0) {
 					error = EPROTONOSUPPORT;
 					goto failure;
 				}
 			} else {
 				if (MLX5_CAP_TLS(priv->mdev, tls_1_3_aes_gcm_256) == 0) {
 					error = EPROTONOSUPPORT;
 					goto failure;
 				}
 			}
 			error = mlx5e_tls_set_params(ptag->crypto_params, en);
 			if (error)
 				goto failure;
 			break;
 
 		default:
 			error = EINVAL;
 			goto failure;
 		}
 		break;
 	default:
 		error = EPROTONOSUPPORT;
 		goto failure;
 	}
 
 	memset(&rl_params, 0, sizeof(rl_params));
 	rl_params.hdr = params->hdr;
 	switch (params->hdr.type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
 		rl_params.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT;
 		rl_params.rate_limit.max_rate = params->tls_rate_limit.max_rate;
+		snd_tag_sw = &mlx5e_tls_rl_snd_tag_sw;
 		break;
 #endif
 	case IF_SND_TAG_TYPE_TLS:
 		rl_params.hdr.type = IF_SND_TAG_TYPE_UNLIMITED;
+		snd_tag_sw = &mlx5e_tls_snd_tag_sw;
 		break;
 	default:
 		error = EOPNOTSUPP;
 		goto failure;
 	}
 
 	error = m_snd_tag_alloc(ifp, &rl_params, &ptag->rl_tag);
 	if (error)
 		goto failure;
 
 	/* store pointer to mbuf tag */
 	MPASS(ptag->tag.refcount == 0);
-	m_snd_tag_init(&ptag->tag, ifp, params->hdr.type);
+	m_snd_tag_init(&ptag->tag, ifp, snd_tag_sw);
 	*ppmt = &ptag->tag;
 
 	queue_work(priv->tls.wq, &ptag->work);
 	flush_work(&ptag->work);
 
 	return (0);
 
 failure:
 	mlx5e_tls_tag_zfree(ptag);
 	return (error);
 }
 
-int
-mlx5e_tls_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
-{
 #ifdef RATELIMIT
+static int
+mlx5e_tls_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
+{
 	union if_snd_tag_modify_params rl_params;
 	struct mlx5e_tls_tag *ptag =
 	    container_of(pmt, struct mlx5e_tls_tag, tag);
 	int error;
-#endif
 
-	switch (pmt->type) {
-#ifdef RATELIMIT
-	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
-		memset(&rl_params, 0, sizeof(rl_params));
-		rl_params.rate_limit.max_rate = params->tls_rate_limit.max_rate;
-		error = ptag->rl_tag->ifp->if_snd_tag_modify(ptag->rl_tag,
-		    &rl_params);
-		return (error);
-#endif
-	default:
-		return (EOPNOTSUPP);
-	}
+	memset(&rl_params, 0, sizeof(rl_params));
+	rl_params.rate_limit.max_rate = params->tls_rate_limit.max_rate;
+	error = ptag->rl_tag->sw->snd_tag_modify(ptag->rl_tag, &rl_params);
+	return (error);
 }
+#endif
 
-int
+static int
 mlx5e_tls_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
 {
 	struct mlx5e_tls_tag *ptag =
 	    container_of(pmt, struct mlx5e_tls_tag, tag);
-	int error;
 
-	switch (pmt->type) {
-#ifdef RATELIMIT
-	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
-#endif
-	case IF_SND_TAG_TYPE_TLS:
-		error = ptag->rl_tag->ifp->if_snd_tag_query(ptag->rl_tag,
-		    params);
-		break;
-	default:
-		error = EOPNOTSUPP;
-		break;
-	}
-	return (error);
+	return (ptag->rl_tag->sw->snd_tag_query(ptag->rl_tag, params));
 }
 
-void
+static void
 mlx5e_tls_snd_tag_free(struct m_snd_tag *pmt)
 {
 	struct mlx5e_tls_tag *ptag =
 	    container_of(pmt, struct mlx5e_tls_tag, tag);
 	struct mlx5e_priv *priv;
 
 	m_snd_tag_rele(ptag->rl_tag);
 
 	MLX5E_TLS_TAG_LOCK(ptag);
 	ptag->state = MLX5E_TLS_ST_FREED;
 	MLX5E_TLS_TAG_UNLOCK(ptag);
 
 	priv = ptag->tag.ifp->if_softc;
 	queue_work(priv->tls.wq, &ptag->work);
 }
 
 CTASSERT((MLX5_FLD_SZ_BYTES(sw_tls_cntx, param) % 16) == 0);
 
 static void
 mlx5e_tls_send_static_parameters(struct mlx5e_sq *sq, struct mlx5e_tls_tag *ptag)
 {
 	const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_umr_wqe) +
 	    MLX5_FLD_SZ_BYTES(sw_tls_cntx, param), MLX5_SEND_WQE_DS);
 	struct mlx5e_tx_umr_wqe *wqe;
 	u16 pi;
 
 	pi = sq->pc & sq->wq.sz_m1;
 	wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
 
 	memset(wqe, 0, sizeof(*wqe));
 
 	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) |
 	    MLX5_OPCODE_UMR | (MLX5_OPCODE_MOD_UMR_TLS_TIS_STATIC_PARAMS << 24));
 	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
 	wqe->ctrl.imm = cpu_to_be32(ptag->tisn << 8);
 
 	if (mlx5e_do_send_cqe(sq))
 		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL;
 	else
 		wqe->ctrl.fm_ce_se = MLX5_FENCE_MODE_INITIATOR_SMALL;
 
 	/* fill out UMR control segment */
 	wqe->umr.flags = 0x80;	/* inline data */
 	wqe->umr.bsf_octowords = cpu_to_be16(MLX5_FLD_SZ_BYTES(sw_tls_cntx, param) / 16);
 
 	/* copy in the static crypto parameters */
 	memcpy(wqe + 1, MLX5_ADDR_OF(sw_tls_cntx, ptag->crypto_params, param),
 	    MLX5_FLD_SZ_BYTES(sw_tls_cntx, param));
 
 	/* copy data for doorbell */
 	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));
 
 	sq->mbuf[pi].mbuf = NULL;
 	sq->mbuf[pi].num_bytes = 0;
 	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
 	sq->mbuf[pi].p_refcount = &ptag->refs;
 	atomic_add_int(&ptag->refs, 1);
 	sq->pc += sq->mbuf[pi].num_wqebbs;
 }
 
 CTASSERT(MLX5_FLD_SZ_BYTES(sw_tls_cntx, progress) ==
     sizeof(((struct mlx5e_tx_psv_wqe *)0)->psv));
 
 static void
 mlx5e_tls_send_progress_parameters(struct mlx5e_sq *sq, struct mlx5e_tls_tag *ptag)
 {
 	const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_psv_wqe),
 	    MLX5_SEND_WQE_DS);
 	struct mlx5e_tx_psv_wqe *wqe;
 	u16 pi;
 
 	pi = sq->pc & sq->wq.sz_m1;
 	wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
 
 	memset(wqe, 0, sizeof(*wqe));
 
 	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) |
 	    MLX5_OPCODE_SET_PSV | (MLX5_OPCODE_MOD_PSV_TLS_TIS_PROGRESS_PARAMS << 24));
 	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
 
 	if (mlx5e_do_send_cqe(sq))
 		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
 
 	/* copy in the PSV control segment */
 	memcpy(&wqe->psv, MLX5_ADDR_OF(sw_tls_cntx, ptag->crypto_params, progress),
 	    sizeof(wqe->psv));
 
 	/* copy data for doorbell */
 	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));
 
 	sq->mbuf[pi].mbuf = NULL;
 	sq->mbuf[pi].num_bytes = 0;
 	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
 	sq->mbuf[pi].p_refcount = &ptag->refs;
 	atomic_add_int(&ptag->refs, 1);
 	sq->pc += sq->mbuf[pi].num_wqebbs;
 }
 
 static void
 mlx5e_tls_send_nop(struct mlx5e_sq *sq, struct mlx5e_tls_tag *ptag)
 {
 	const u32 ds_cnt = MLX5_SEND_WQEBB_NUM_DS;
 	struct mlx5e_tx_wqe *wqe;
 	u16 pi;
 
 	pi = sq->pc & sq->wq.sz_m1;
 	wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
 
 	memset(&wqe->ctrl, 0, sizeof(wqe->ctrl));
 
 	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP);
 	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
 	if (mlx5e_do_send_cqe(sq))
 		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL;
 	else
 		wqe->ctrl.fm_ce_se = MLX5_FENCE_MODE_INITIATOR_SMALL;
 
 	/* Copy data for doorbell */
 	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));
 
 	sq->mbuf[pi].mbuf = NULL;
 	sq->mbuf[pi].num_bytes = 0;
 	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
 	sq->mbuf[pi].p_refcount = &ptag->refs;
 	atomic_add_int(&ptag->refs, 1);
 	sq->pc += sq->mbuf[pi].num_wqebbs;
 }
 
 #define	SBTLS_MBUF_NO_DATA ((struct mbuf *)1)
 
 static struct mbuf *
 sbtls_recover_record(struct mbuf *mb, int wait, uint32_t tcp_old, uint32_t *ptcp_seq, bool *pis_start)
 {
 	struct mbuf *mr, *top;
 	uint32_t offset;
 	uint32_t delta;
 
 	/* check format of incoming mbuf */
 	if (mb->m_next == NULL ||
 	    (mb->m_next->m_flags & (M_EXTPG | M_EXT)) != (M_EXTPG | M_EXT)) {
 		top = NULL;
 		goto done;
 	}
 
 	/* get unmapped data offset */
 	offset = mtod(mb->m_next, uintptr_t);
 
 	/* check if we don't need to re-transmit anything */
 	if (offset == 0) {
 		top = SBTLS_MBUF_NO_DATA;
 		*pis_start = true;
 		goto done;
 	}
 
 	/* try to get a new  packet header */
 	top = m_gethdr(wait, MT_DATA);
 	if (top == NULL)
 		goto done;
 
 	mr = m_get(wait, MT_DATA);
 	if (mr == NULL) {
 		m_free(top);
 		top = NULL;
 		goto done;
 	}
 
 	top->m_next = mr;
 
 	mb_dupcl(mr, mb->m_next);
 
 	/* the beginning of the TLS record */
 	mr->m_data = NULL;
 
 	/* setup packet header length */
 	top->m_pkthdr.len = mr->m_len = offset;
 	top->m_len = 0;
 
 	/* check for partial re-transmit */
 	delta = *ptcp_seq - tcp_old;
 
 	if (delta < offset) {
 		m_adj(top, offset - delta);
 		offset = delta;
 
 		/* continue where we left off */
 		*pis_start = false;
 	} else {
 		*pis_start = true;
 	}
 
 	/*
 	 * Rewind the TCP sequence number by the amount of data
 	 * retransmitted:
 	 */
 	*ptcp_seq -= offset;
 done:
 	return (top);
 }
 
 static int
 mlx5e_sq_tls_populate(struct mbuf *mb, uint64_t *pseq)
 {
 
 	for (; mb != NULL; mb = mb->m_next) {
 		if (!(mb->m_flags & M_EXTPG))
 			continue;
 		*pseq = mb->m_epg_seqno;
 		return (1);
 	}
 	return (0);
 }
 
 int
 mlx5e_sq_tls_xmit(struct mlx5e_sq *sq, struct mlx5e_xmit_args *parg, struct mbuf **ppmb)
 {
 	struct mlx5e_tls_tag *ptls_tag;
 	struct m_snd_tag *ptag;
 	const struct tcphdr *th;
 	struct mbuf *mb = *ppmb;
 	u64 rcd_sn;
 	u32 header_size;
 	u32 mb_seq;
 
 	if ((mb->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0)
 		return (MLX5E_TLS_CONTINUE);
 
 	ptag = mb->m_pkthdr.snd_tag;
 
 	if (
 #ifdef RATELIMIT
-	    ptag->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT &&
+	    ptag->sw->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT &&
 #endif
-	    ptag->type != IF_SND_TAG_TYPE_TLS)
+	    ptag->sw->type != IF_SND_TAG_TYPE_TLS)
 		return (MLX5E_TLS_CONTINUE);
 
 	ptls_tag = container_of(ptag, struct mlx5e_tls_tag, tag);
 
 	header_size = mlx5e_get_full_header_size(mb, &th);
 	if (unlikely(header_size == 0 || th == NULL))
 		return (MLX5E_TLS_FAILURE);
 
 	/*
 	 * Send non-TLS TCP packets AS-IS:
 	 */
 	if (header_size == mb->m_pkthdr.len ||
 	    mlx5e_sq_tls_populate(mb, &rcd_sn) == 0) {
 		parg->tisn = 0;
 		parg->ihs = header_size;
 		return (MLX5E_TLS_CONTINUE);
 	}
 
 	mb_seq = ntohl(th->th_seq);
 
 	MLX5E_TLS_TAG_LOCK(ptls_tag);
 	switch (ptls_tag->state) {
 	case MLX5E_TLS_ST_INIT:
 		MLX5E_TLS_TAG_UNLOCK(ptls_tag);
 		return (MLX5E_TLS_FAILURE);
 	case MLX5E_TLS_ST_SETUP:
 		ptls_tag->state = MLX5E_TLS_ST_TXRDY;
 		ptls_tag->expected_seq = ~mb_seq;	/* force setup */
 	default:
 		MLX5E_TLS_TAG_UNLOCK(ptls_tag);
 		break;
 	}
 
 	if (unlikely(ptls_tag->expected_seq != mb_seq)) {
 		bool is_start;
 		struct mbuf *r_mb;
 		uint32_t tcp_seq = mb_seq;
 
 		r_mb = sbtls_recover_record(mb, M_NOWAIT, ptls_tag->expected_seq, &tcp_seq, &is_start);
 		if (r_mb == NULL) {
 			MLX5E_TLS_STAT_INC(ptls_tag, tx_error, 1);
 			return (MLX5E_TLS_FAILURE);
 		}
 
 		MLX5E_TLS_STAT_INC(ptls_tag, tx_packets_ooo, 1);
 
 		/* check if this is the first fragment of a TLS record */
 		if (is_start) {
 			/* setup TLS static parameters */
 			MLX5_SET64(sw_tls_cntx, ptls_tag->crypto_params,
 			    param.initial_record_number, rcd_sn);
 
 			/*
 			 * NOTE: The sendqueue should have enough room to
 			 * carry both the static and the progress parameters
 			 * when we get here!
 			 */
 			mlx5e_tls_send_static_parameters(sq, ptls_tag);
 			mlx5e_tls_send_progress_parameters(sq, ptls_tag);
 
 			if (r_mb == SBTLS_MBUF_NO_DATA) {
 				mlx5e_tls_send_nop(sq, ptls_tag);
 				ptls_tag->expected_seq = mb_seq;
 				return (MLX5E_TLS_LOOP);
 			}
 		}
 
 		MLX5E_TLS_STAT_INC(ptls_tag, tx_bytes_ooo, r_mb->m_pkthdr.len);
 
 		/* setup transmit arguments */
 		parg->tisn = ptls_tag->tisn;
 		parg->pref = &ptls_tag->refs;
 
 		/* try to send DUMP data */
 		if (mlx5e_sq_dump_xmit(sq, parg, &r_mb) != 0) {
 			m_freem(r_mb);
 			ptls_tag->expected_seq = tcp_seq;
 			return (MLX5E_TLS_FAILURE);
 		} else {
 			ptls_tag->expected_seq = mb_seq;
 			return (MLX5E_TLS_LOOP);
 		}
 	} else {
 		MLX5E_TLS_STAT_INC(ptls_tag, tx_packets, 1);
 		MLX5E_TLS_STAT_INC(ptls_tag, tx_bytes, mb->m_pkthdr.len);
 	}
 	ptls_tag->expected_seq += mb->m_pkthdr.len - header_size;
 
 	parg->tisn = ptls_tag->tisn;
 	parg->ihs = header_size;
 	parg->pref = &ptls_tag->refs;
 	return (MLX5E_TLS_CONTINUE);
 }
 
 #else
 
 int
 mlx5e_tls_init(struct mlx5e_priv *priv)
 {
 
 	return (0);
 }
 
 void
 mlx5e_tls_cleanup(struct mlx5e_priv *priv)
 {
 	/* NOP */
 }
 
 #endif		/* KERN_TLS */
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
index 40e0d2b0c342..9d8854528d4a 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -1,4912 +1,4843 @@
 /*-
  * Copyright (c) 2015-2021 Mellanox Technologies. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_kern_tls.h"
 
 #include "en.h"
 
 #include <sys/eventhandler.h>
 #include <sys/sockio.h>
 #include <machine/atomic.h>
 
 #include <net/debugnet.h>
 
 static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs);
+static if_snd_tag_query_t mlx5e_ul_snd_tag_query;
+static if_snd_tag_free_t mlx5e_ul_snd_tag_free;
 
 /*
  * Groups the receive-queue, send-queue and RX/TX completion-queue
  * parameter sets.
  * NOTE(review): presumably everything needed to open one channel -
  * confirm against the users of this structure.
  */
 struct mlx5e_channel_param {
 	struct mlx5e_rq_param rq;
 	struct mlx5e_sq_param sq;
 	struct mlx5e_cq_param rx_cq;
 	struct mlx5e_cq_param tx_cq;
 };
 
 /*
  * One media description as stored in the link mode tables: an ifmedia
  * subtype together with the matching line rate.
  */
 struct media {
 	u32	subtype;	/* IFM_* subtype; 0 means "no entry" */
 	u64	baudrate;	/* initialized with IF_Mbps() / IF_Gbps() */
 };
 
 /*
  * Media table for the non-extended PTYS protocol mask.  The index is
  * the bit number of the link mode: mlx5e_update_carrier() indexes the
  * table with ilog2() of the operational mask and
  * mlx5e_find_link_mode() turns a matching index back into a mask bit
  * with MLX5E_PROT_MASK().  Slots that are not listed stay zeroed, so
  * their subtype is 0.
  *
  * Only IFM_10G_ER and IFM_40G_LR4 are stored for the LR/ER pairs; the
  * users of the table translate to and from IFM_10G_LR / IFM_40G_ER4.
  */
 static const struct media mlx5e_mode_table[MLX5E_LINK_SPEEDS_NUMBER] =
 {
 	[MLX5E_1000BASE_CX_SGMII] = {
 		.subtype = IFM_1000_CX_SGMII,
 		.baudrate = IF_Mbps(1000ULL),
 	},
 	[MLX5E_1000BASE_KX] = {
 		.subtype = IFM_1000_KX,
 		.baudrate = IF_Mbps(1000ULL),
 	},
 	[MLX5E_10GBASE_CX4] = {
 		.subtype = IFM_10G_CX4,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_KX4] = {
 		.subtype = IFM_10G_KX4,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_KR] = {
 		.subtype = IFM_10G_KR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_20GBASE_KR2] = {
 		.subtype = IFM_20G_KR2,
 		.baudrate = IF_Gbps(20ULL),
 	},
 	[MLX5E_40GBASE_CR4] = {
 		.subtype = IFM_40G_CR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_KR4] = {
 		.subtype = IFM_40G_KR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_56GBASE_R4] = {
 		.subtype = IFM_56G_R4,
 		.baudrate = IF_Gbps(56ULL),
 	},
 	[MLX5E_10GBASE_CR] = {
 		.subtype = IFM_10G_CR1,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_SR] = {
 		.subtype = IFM_10G_SR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_ER_LR] = {
 		.subtype = IFM_10G_ER,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_40GBASE_SR4] = {
 		.subtype = IFM_40G_SR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_LR4_ER4] = {
 		.subtype = IFM_40G_LR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_100GBASE_CR4] = {
 		.subtype = IFM_100G_CR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GBASE_SR4] = {
 		.subtype = IFM_100G_SR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GBASE_KR4] = {
 		.subtype = IFM_100G_KR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GBASE_LR4] = {
 		.subtype = IFM_100G_LR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100BASE_TX] = {
 		.subtype = IFM_100_TX,
 		.baudrate = IF_Mbps(100ULL),
 	},
 	[MLX5E_1000BASE_T] = {
 		.subtype = IFM_1000_T,
 		.baudrate = IF_Mbps(1000ULL),
 	},
 	[MLX5E_10GBASE_T] = {
 		.subtype = IFM_10G_T,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_25GBASE_CR] = {
 		.subtype = IFM_25G_CR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GBASE_KR] = {
 		.subtype = IFM_25G_KR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GBASE_SR] = {
 		.subtype = IFM_25G_SR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_50GBASE_CR2] = {
 		.subtype = IFM_50G_CR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GBASE_KR2] = {
 		.subtype = IFM_50G_KR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GBASE_KR4] = {
 		.subtype = IFM_50G_KR4,
 		.baudrate = IF_Gbps(50ULL),
 	},
 };
 
 /*
  * Media table for the extended PTYS protocol mask.  The first index is
  * the bit number of the extended link mode, the second one the cable
  * type.  The MLX5E_CABLE_TYPE_UNKNOWN column doubles as the fallback
  * entry: mlx5e_update_carrier() uses it when the cable type query
  * fails or when the slot for the reported cable type is empty
  * (subtype 0).  Slots that are not listed stay zeroed.
  */
 static const struct media mlx5e_ext_mode_table[MLX5E_EXT_LINK_SPEEDS_NUMBER][MLX5E_CABLE_TYPE_NUMBER] =
 {
 	/* 100 Mbit/s */
 	[MLX5E_SGMII_100M][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_100_SGMII,
 		.baudrate = IF_Mbps(100),
 	},
 
 	/* 1 Gbit/s */
 	[MLX5E_1000BASE_X_SGMII][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_1000_CX,
 		.baudrate = IF_Mbps(1000),
 	},
 	[MLX5E_1000BASE_X_SGMII][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_1000_SX,
 		.baudrate = IF_Mbps(1000),
 	},
 
 	/* 5 Gbit/s */
 	[MLX5E_5GBASE_R][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_5000_KR,
 		.baudrate = IF_Mbps(5000),
 	},
 	[MLX5E_5GBASE_R][MLX5E_CABLE_TYPE_TWISTED_PAIR] = {
 		.subtype = IFM_5000_T,
 		.baudrate = IF_Mbps(5000),
 	},
 
 	/* 10 Gbit/s */
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_10G_KR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_10G_CR1,
 		.baudrate = IF_Gbps(10ULL),
 	},
 	[MLX5E_10GBASE_XFI_XAUI_1][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_10G_SR,
 		.baudrate = IF_Gbps(10ULL),
 	},
 
 	/* 40 Gbit/s */
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_40G_KR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_40G_CR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 	[MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_40G_SR4,
 		.baudrate = IF_Gbps(40ULL),
 	},
 
 	/* 25 Gbit/s */
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_25G_KR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_25G_CR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_25G_SR,
 		.baudrate = IF_Gbps(25ULL),
 	},
 	[MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CABLE_TYPE_TWISTED_PAIR] = {
 		.subtype = IFM_25G_T,
 		.baudrate = IF_Gbps(25ULL),
 	},
 
 	/* 50 Gbit/s, 50GAUI-2 / LAUI-2 link mode */
 	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_50G_KR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_50G_CR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_50G_SR2,
 		.baudrate = IF_Gbps(50ULL),
 	},
 
 	/* 50 Gbit/s, 50GAUI-1 / LAUI-1 link mode */
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_50G_KR_PAM4,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_50G_CP,
 		.baudrate = IF_Gbps(50ULL),
 	},
 	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_50G_SR,
 		.baudrate = IF_Gbps(50ULL),
 	},
 
 	/* 100 Gbit/s, CAUI-4 link mode */
 	[MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_100G_KR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_100G_CR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_100G_SR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 
 	/* 100 Gbit/s, 100GAUI-1 link mode */
 	[MLX5E_100GAUI_1_100GBASE_CR_KR][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_100G_KR_PAM4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GAUI_1_100GBASE_CR_KR][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_100G_CR_PAM4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GAUI_1_100GBASE_CR_KR][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_100G_SR2,	/* XXX */
 		.baudrate = IF_Gbps(100ULL),
 	},
 
 	/* 100 Gbit/s, 100GAUI-2 link mode */
 	[MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_100G_KR4,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_100G_CP2,
 		.baudrate = IF_Gbps(100ULL),
 	},
 	[MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_100G_SR2,
 		.baudrate = IF_Gbps(100ULL),
 	},
 
 	/* 200 Gbit/s, 200GAUI-2 link mode */
 	[MLX5E_200GAUI_2_200GBASE_CR2_KR2][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_200G_KR4_PAM4,	/* XXX */
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_2_200GBASE_CR2_KR2][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_200G_CR4_PAM4,	/* XXX */
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_2_200GBASE_CR2_KR2][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_200G_SR4,	/* XXX */
 		.baudrate = IF_Gbps(200ULL),
 	},
 
 	/* 200 Gbit/s, 200GAUI-4 link mode */
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_200G_KR4_PAM4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = {
 		.subtype = IFM_200G_CR4_PAM4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = {
 		.subtype = IFM_200G_SR4,
 		.baudrate = IF_Gbps(200ULL),
 	},
 
 	/* 400 Gbit/s, 400GAUI-8 link mode */
 	[MLX5E_400GAUI_8][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_400G_LR8,	/* XXX */
 		.baudrate = IF_Gbps(400ULL),
 	},
 
 	/* 400 Gbit/s, 400GAUI-4 link mode */
 	[MLX5E_400GAUI_4_400GBASE_CR4_KR4][MLX5E_CABLE_TYPE_UNKNOWN] = {
 		.subtype = IFM_400G_LR8,	/* XXX */
 		.baudrate = IF_Gbps(400ULL),
 	},
 };
 
+/*
+ * Send tag method table for tags of type IF_SND_TAG_TYPE_UNLIMITED:
+ * binds the query and free handlers declared near the top of the file.
+ */
+static const struct if_snd_tag_sw mlx5e_ul_snd_tag_sw = {
+	.snd_tag_query = mlx5e_ul_snd_tag_query,
+	.snd_tag_free = mlx5e_ul_snd_tag_free,
+	.type = IF_SND_TAG_TYPE_UNLIMITED
+};
 /* NOTE(review): presumably declares the debugnet hooks of this driver - confirm */
 DEBUGNET_DEFINE(mlx5_en);
 
 /* malloc type M_MLX5EN, short name "MLX5EN", description "MLX5 Ethernet" */
 MALLOC_DEFINE(M_MLX5EN, "MLX5EN", "MLX5 Ethernet");
 
 static void
 mlx5e_update_carrier(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
 	u32 eth_proto_oper;
 	int error;
 	u8 i;
 	u8 cable_type;
 	u8 port_state;
 	u8 is_er_type;
 	bool ext;
 	struct media media_entry = {};
 
 	port_state = mlx5_query_vport_state(mdev,
 	    MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT, 0);
 
 	if (port_state == VPORT_STATE_UP) {
 		priv->media_status_last |= IFM_ACTIVE;
 	} else {
 		priv->media_status_last &= ~IFM_ACTIVE;
 		priv->media_active_last = IFM_ETHER;
 		if_link_state_change(priv->ifp, LINK_STATE_DOWN);
 		return;
 	}
 
 	error = mlx5_query_port_ptys(mdev, out, sizeof(out),
 	    MLX5_PTYS_EN, 1);
 	if (error) {
 		priv->media_active_last = IFM_ETHER;
 		priv->ifp->if_baudrate = 1;
 		mlx5_en_err(priv->ifp, "query port ptys failed: 0x%x\n",
 		    error);
 		return;
 	}
 
 	ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet);
 	eth_proto_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
 	    eth_proto_oper);
 
 	i = ilog2(eth_proto_oper);
 
 	if (ext) {
 		error = mlx5_query_pddr_cable_type(mdev, 1, &cable_type);
 		if (error != 0) {
 			/* use fallback entry */
 			media_entry = mlx5e_ext_mode_table[i][MLX5E_CABLE_TYPE_UNKNOWN];
 
 			mlx5_en_err(priv->ifp,
 			    "query port pddr failed: %d\n", error);
 		} else {
 			media_entry = mlx5e_ext_mode_table[i][cable_type];
 
 			/* check if we should use fallback entry */
 			if (media_entry.subtype == 0)
 				media_entry = mlx5e_ext_mode_table[i][MLX5E_CABLE_TYPE_UNKNOWN];
 		}
 	} else {
 		media_entry = mlx5e_mode_table[i];
 	}
 
 	if (media_entry.subtype == 0) {
 		mlx5_en_err(priv->ifp,
 		    "Could not find operational media subtype\n");
 		return;
 	}
 
 	switch (media_entry.subtype) {
 	case IFM_10G_ER:
 		error = mlx5_query_pddr_range_info(mdev, 1, &is_er_type);
 		if (error != 0) {
 			mlx5_en_err(priv->ifp,
 			    "query port pddr failed: %d\n", error);
 		}
 		if (error != 0 || is_er_type == 0)
 			media_entry.subtype = IFM_10G_LR;
 		break;
 	case IFM_40G_LR4:
 		error = mlx5_query_pddr_range_info(mdev, 1, &is_er_type);
 		if (error != 0) {
 			mlx5_en_err(priv->ifp,
 			    "query port pddr failed: %d\n", error);
 		}
 		if (error == 0 && is_er_type != 0)
 			media_entry.subtype = IFM_40G_ER4;
 		break;
 	}
 	priv->media_active_last = media_entry.subtype | IFM_ETHER | IFM_FDX;
 	priv->ifp->if_baudrate = media_entry.baudrate;
 
 	if_link_state_change(priv->ifp, LINK_STATE_UP);
 }
 
 /*
  * Fill in an ifmediareq from the cached media state: the status word
  * is copied as-is, while the active/current media words additionally
  * carry the pause flags that are enabled in priv->params.
  */
 static void
 mlx5e_media_status(struct ifnet *dev, struct ifmediareq *ifmr)
 {
 	struct mlx5e_priv *priv = dev->if_softc;
 
 	ifmr->ifm_status = priv->media_status_last;
 
 	/* start from the cached media word and add the pause flags */
 	ifmr->ifm_active = priv->media_active_last;
 	if (priv->params.rx_pauseframe_control)
 		ifmr->ifm_active |= IFM_ETH_RXPAUSE;
 	if (priv->params.tx_pauseframe_control)
 		ifmr->ifm_active |= IFM_ETH_TXPAUSE;
 
 	/* current and active media are always reported identically */
 	ifmr->ifm_current = ifmr->ifm_active;
 }
 
 /*
  * Translate an ifmedia subtype into a mask of link mode bits.
  *
  * Every table slot whose subtype equals the requested one contributes
  * MLX5E_PROT_MASK(index) to the result.  "ext" selects the extended
  * table (all cable type columns are searched) instead of the regular
  * one.  Returns 0 when the subtype is 0 or nothing matches.
  */
 static u32
 mlx5e_find_link_mode(u32 subtype, bool ext)
 {
 	u32 mask = 0;
 	unsigned speed;
 	unsigned cable;
 
 	/* subtype 0 is what unused table slots contain; never match it */
 	if (subtype == 0)
 		return (mask);
 
 	/*
 	 * The tables only store IFM_10G_ER and IFM_40G_LR4, so the
 	 * sibling subtypes are mapped onto those before searching.
 	 */
 	if (subtype == IFM_10G_LR)
 		subtype = IFM_10G_ER;
 	else if (subtype == IFM_40G_ER4)
 		subtype = IFM_40G_LR4;
 
 	if (!ext) {
 		for (speed = 0; speed < MLX5E_LINK_SPEEDS_NUMBER; speed++) {
 			if (mlx5e_mode_table[speed].subtype == subtype)
 				mask |= MLX5E_PROT_MASK(speed);
 		}
 		return (mask);
 	}
 
 	for (speed = 0; speed < MLX5E_EXT_LINK_SPEEDS_NUMBER; speed++) {
 		for (cable = 0; cable < MLX5E_CABLE_TYPE_NUMBER; cable++) {
 			if (mlx5e_ext_mode_table[speed][cable].subtype == subtype)
 				mask |= MLX5E_PROT_MASK(speed);
 		}
 	}
 	return (mask);
 }
 
 /*
  * Hand the global pause frame and per-priority flow control settings
  * stored in priv->params to mlx5_set_port_pause_and_pfc() and return
  * its result unchanged.
  * NOTE(review): the constant 1 is presumably the port number - confirm.
  */
 static int
 mlx5e_set_port_pause_and_pfc(struct mlx5e_priv *priv)
 {
 	return (mlx5_set_port_pause_and_pfc(priv->mdev, 1,
 	    priv->params.rx_pauseframe_control,
 	    priv->params.tx_pauseframe_control,
 	    priv->params.rx_priority_flow_control,
 	    priv->params.tx_priority_flow_control));
 }
 
 /*
  * Program the PFC settings unless that is currently not possible.
  *
  * Returns -ENXIO when priv->gone is set and -EINVAL (after logging)
  * while global pause frames are enabled in either direction;
  * otherwise the result of mlx5e_set_port_pause_and_pfc() is returned
  * as-is.
  */
 static int
 mlx5e_set_port_pfc(struct mlx5e_priv *priv)
 {
 	if (priv->gone != 0)
 		return (-ENXIO);
 
 	/* global pause frames and PFC are mutually exclusive */
 	if (priv->params.rx_pauseframe_control ||
 	    priv->params.tx_pauseframe_control) {
 		mlx5_en_err(priv->ifp,
 		    "Global pauseframes must be disabled before enabling PFC.\n");
 		return (-EINVAL);
 	}
 
 	return (mlx5e_set_port_pause_and_pfc(priv));
 }
 
 /*
  * Apply the media word stored in priv->media to the port.
  *
  * The priv lock is taken here unless PRIV_LOCKED() reports that it is
  * already held, and it is only released again if it was taken here.
  *
  * The requested subtype is translated into a link mode mask and
  * intersected with the capability mask read from the PTYS register;
  * IFM_AUTO selects the whole capability mask instead.  Requesting
  * global pause frames is refused while PFC is enabled in either
  * direction.  The port is then taken down, reprogrammed, and brought
  * back up only if the interface was in the opened state.
  *
  * Returns 0 on success and EINVAL for a rejected request.  An error
  * from the PTYS query is returned as-is, and the result of the
  * pause/PFC helper is returned negated.
  */
 static int
 mlx5e_media_change(struct ifnet *dev)
 {
 	struct mlx5e_priv *priv = dev->if_softc;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 eth_proto_cap;
 	u32 link_mode;
 	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
 	int was_opened;
 	int locked;
 	int error;
 	bool ext;
 
 	/* remember whether the lock has to be dropped again at "done" */
 	locked = PRIV_LOCKED(priv);
 	if (!locked)
 		PRIV_LOCK(priv);
 
 	if (IFM_TYPE(priv->media.ifm_media) != IFM_ETHER) {
 		error = EINVAL;
 		goto done;
 	}
 
 	error = mlx5_query_port_ptys(mdev, out, sizeof(out),
 	    MLX5_PTYS_EN, 1);
 	if (error != 0) {
 		mlx5_en_err(dev, "Query port media capability failed\n");
 		goto done;
 	}
 
 	ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet);
 	link_mode = mlx5e_find_link_mode(IFM_SUBTYPE(priv->media.ifm_media), ext);
 
 	/* query supported capabilities */
 	eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
 	    eth_proto_capability);
 
 	/* check for autoselect */
 	if (IFM_SUBTYPE(priv->media.ifm_media) == IFM_AUTO) {
 		link_mode = eth_proto_cap;
 		if (link_mode == 0) {
 			mlx5_en_err(dev, "Port media capability is zero\n");
 			error = EINVAL;
 			goto done;
 		}
 	} else {
 		/* keep only the requested modes the port supports */
 		link_mode = link_mode & eth_proto_cap;
 		if (link_mode == 0) {
 			mlx5_en_err(dev, "Not supported link mode requested\n");
 			error = EINVAL;
 			goto done;
 		}
 	}
 	if (priv->media.ifm_media & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE)) {
 		/* check if PFC is enabled */
 		if (priv->params.rx_priority_flow_control ||
 		    priv->params.tx_priority_flow_control) {
 			mlx5_en_err(dev, "PFC must be disabled before enabling global pauseframes.\n");
 			error = EINVAL;
 			goto done;
 		}
 	}
 	/* update pauseframe control bits */
 	priv->params.rx_pauseframe_control =
 	    (priv->media.ifm_media & IFM_ETH_RXPAUSE) ? 1 : 0;
 	priv->params.tx_pauseframe_control =
 	    (priv->media.ifm_media & IFM_ETH_TXPAUSE) ? 1 : 0;
 
 	/* check if device is opened */
 	was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
 
 	/*
 	 * reconfigure the hardware; the return values of the port
 	 * status and protocol calls are not checked, only the result
 	 * of the pause/PFC update is reported
 	 */
 	mlx5_set_port_status(mdev, MLX5_PORT_DOWN);
 	mlx5_set_port_proto(mdev, link_mode, MLX5_PTYS_EN, ext);
 	error = -mlx5e_set_port_pause_and_pfc(priv);
 	if (was_opened)
 		mlx5_set_port_status(mdev, MLX5_PORT_UP);
 
 done:
 	if (!locked)
 		PRIV_UNLOCK(priv);
 	return (error);
 }
 
 /*
  * Work item handler: with the priv lock held, refresh the carrier
  * state, but only while the interface is in the opened state.
  */
 static void
 mlx5e_update_carrier_work(struct work_struct *work)
 {
 	struct mlx5e_priv *priv;
 
 	/* the work item is embedded in the priv structure */
 	priv = container_of(work, struct mlx5e_priv, update_carrier_work);
 
 	PRIV_LOCK(priv);
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0)
 		mlx5e_update_carrier(priv);
 	PRIV_UNLOCK(priv);
 }
 
 /*
  * Expanders passed to the MLX5E_PCIE_*_COUNTERS_*() list macros.  Of
  * the six arguments only "c" (the counter name) and "f" (the counter
  * set member) are used.  Both rely on "s_debug" and "out" being in
  * scope at the point of expansion.
  */
 #define	MLX5E_PCIE_PERF_GET_64(a,b,c,d,e,f)    \
 	s_debug->c = MLX5_GET64(mpcnt_reg, out, counter_set.f.c);
 
 #define	MLX5E_PCIE_PERF_GET_32(a,b,c,d,e,f)    \
 	s_debug->c = MLX5_GET(mpcnt_reg, out, counter_set.f.c);
 
 /*
  * Read the PCIe counter groups through the MPCNT register and store
  * them in priv->stats.port_stats_debug.
  *
  * Three groups are queried in turn: performance counters, timers and
  * states counters, and lane counters.  The first failing register
  * access ends the update; counters stored before that point are kept.
  * Nothing is reported to the caller.
  */
 static void
 mlx5e_update_pcie_counters(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_port_stats_debug *s_debug = &priv->stats.port_stats_debug;
 	const unsigned sz = MLX5_ST_SZ_BYTES(mpcnt_reg);
 	void *out;
 	void *in;
 	int err;
 
 	/* allocate firmware request structures */
 	in = mlx5_vzalloc(sz);
 	out = mlx5_vzalloc(sz);
 	if (in == NULL || out == NULL)
 		goto free_out;
 
 	MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP);
 	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0);
 	if (err != 0)
 		goto free_out;
 
 	MLX5E_PCIE_PERFORMANCE_COUNTERS_64(MLX5E_PCIE_PERF_GET_64)
 	MLX5E_PCIE_PERFORMANCE_COUNTERS_32(MLX5E_PCIE_PERF_GET_32)
 
 	/* the same request buffer is reused; only the group changes */
 	MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_TIMERS_AND_STATES_COUNTERS_GROUP);
 	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0);
 	if (err != 0)
 		goto free_out;
 
 	MLX5E_PCIE_TIMERS_AND_STATES_COUNTERS_32(MLX5E_PCIE_PERF_GET_32)
 
 	MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_LANE_COUNTERS_GROUP);
 	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0);
 	if (err != 0)
 		goto free_out;
 
 	MLX5E_PCIE_LANE_COUNTERS_32(MLX5E_PCIE_PERF_GET_32)
 
 free_out:
 	/* free firmware request structures */
 	kvfree(in);
 	kvfree(out);
 }
 
 /*
  * This function reads the physical port counters from the firmware
  * using a pre-defined layout defined by various MLX5E_PPORT_XXX()
  * macros. The output is converted from big-endian 64-bit values into
  * host endian ones and stored in the "priv->stats.pport" structure.
  * Debug counters go to "priv->stats.port_stats_debug" instead.
  *
  * Each counter group is fetched with a separate PPCNT register access
  * that reuses the same request and response buffers. The return values
  * of these accesses are not checked. The function also triggers the
  * PCIe counter update.
  */
 static void
 mlx5e_update_pport_counters(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_pport_stats *s = &priv->stats.pport;
 	struct mlx5e_port_stats_debug *s_debug = &priv->stats.port_stats_debug;
 	u32 *in;
 	u32 *out;
 	const u64 *ptr;
 	unsigned sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
 	unsigned x;	/* index into the counter set just read */
 	unsigned y;	/* index into the destination array */
 	unsigned z;	/* priority being read */
 
 	/* allocate firmware request structures */
 	in = mlx5_vzalloc(sz);
 	out = mlx5_vzalloc(sz);
 	if (in == NULL || out == NULL)
 		goto free_out;
 
 	/*
 	 * Get pointer to the 64-bit counter set which is located at a
 	 * fixed offset in the output firmware request structure:
 	 */
 	ptr = (const uint64_t *)MLX5_ADDR_OF(ppcnt_reg, out, counter_set);
 
 	MLX5_SET(ppcnt_reg, in, local_port, 1);
 
 	/*
 	 * read IEEE802_3 counter group using predefined counter layout;
 	 * the first MLX5E_PPORT_PER_PRIO_STATS_NUM slots of s->arg[] are
 	 * skipped here and filled by the per-priority loop at the end
 	 */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0, y = MLX5E_PPORT_PER_PRIO_STATS_NUM;
 	     x != MLX5E_PPORT_IEEE802_3_STATS_NUM; x++, y++)
 		s->arg[y] = be64toh(ptr[x]);
 
 	/* read RFC2819 counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM; x++, y++)
 		s->arg[y] = be64toh(ptr[x]);
 
 	/*
 	 * the remainder of the RFC2819 group goes to the debug counters:
 	 * x keeps running while y restarts at 0 for s_debug->arg[], and
 	 * y then keeps growing across all following debug groups
 	 */
 	for (y = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM +
 	    MLX5E_PPORT_RFC2819_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read RFC2863 counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_RFC2863_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read physical layer stats counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read Extended Ethernet counter group using predefined counter layout */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP);
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 	for (x = 0; x != MLX5E_PPORT_ETHERNET_EXTENDED_STATS_DEBUG_NUM; x++, y++)
 		s_debug->arg[y] = be64toh(ptr[x]);
 
 	/* read Extended Statistical Group, only if the device reports it */
 	if (MLX5_CAP_GEN(mdev, pcam_reg) &&
 	    MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group) &&
 	    MLX5_CAP_PCAM_FEATURE(mdev, per_lane_error_counters)) {
 		/* read Extended Statistical counter group using predefined counter layout */
 		MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP);
 		mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 
 		for (x = 0; x != MLX5E_PPORT_STATISTICAL_DEBUG_NUM; x++, y++)
 			s_debug->arg[y] = be64toh(ptr[x]);
 	}
 
 	/* read PCIE counters */
 	mlx5e_update_pcie_counters(priv);
 
 	/* read per-priority counters */
 	MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_PRIORITY_COUNTERS_GROUP);
 
 	/* iterate all the priorities; y restarts at the head of s->arg[] */
 	for (y = z = 0; z != MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO; z++) {
 		MLX5_SET(ppcnt_reg, in, prio_tc, z);
 		mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
 
 		/* read per priority stats counter group using predefined counter layout */
 		for (x = 0; x != (MLX5E_PPORT_PER_PRIO_STATS_NUM /
 		    MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO); x++, y++)
 			s->arg[y] = be64toh(ptr[x]);
 	}
 
 free_out:
 	/* free firmware request structures */
 	kvfree(in);
 	kvfree(out);
 }
 
 /*
  * Refresh priv->stats.vport.rx_steer_missed_packets from the
  * QUERY_VNIC_ENV command.  Nothing happens when the device lacks the
  * nic_receive_steering_discard capability, and the counter is left
  * untouched when the command fails.
  */
 static void
 mlx5e_grp_vnic_env_update_stats(struct mlx5e_priv *priv)
 {
 	u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {};
 	u32 out[MLX5_ST_SZ_DW(query_vnic_env_out)] = {};
 	int err;
 
 	if (!MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard))
 		return;
 
 	/* build the request */
 	MLX5_SET(query_vnic_env_in, in, opcode, MLX5_CMD_OP_QUERY_VNIC_ENV);
 	MLX5_SET(query_vnic_env_in, in, op_mod, 0);
 	MLX5_SET(query_vnic_env_in, in, other_vport, 0);
 
 	err = mlx5_cmd_exec(priv->mdev, in, sizeof(in), out, sizeof(out));
 	if (err == 0) {
 		priv->stats.vport.rx_steer_missed_packets =
 		    MLX5_GET64(query_vnic_env_out, out,
 		    vport_env.nic_receive_steering_discard);
 	}
 }
 
 /*
  * This function is called regularly to collect all statistics
  * counters from the firmware. The values can be viewed through the
  * sysctl interface. Execution is serialized using the priv's global
  * configuration lock.
  *
  * The software counters of all channels (and of the rate-limit queues
  * when RATELIMIT is defined) are summed up first, then the vport and
  * physical port counters are read from the device. If the response
  * buffer cannot be allocated, the counter update is skipped entirely,
  * but the diagnostics, FEC and temperature updates at the end still
  * run.
  */
 static void
 mlx5e_update_stats_locked(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_vport_stats *s = &priv->stats.vport;
 	struct mlx5e_sq_stats *sq_stats;
 #if (__FreeBSD_version < 1100000)
 	struct ifnet *ifp = priv->ifp;
 #endif
 
 	u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)];
 	u32 *out;
 	int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out);
 	u64 tso_packets = 0;
 	u64 tso_bytes = 0;
 	u64 tx_queue_dropped = 0;
 	u64 tx_defragged = 0;
 	u64 tx_offload_none = 0;
 	u64 lro_packets = 0;
 	u64 lro_bytes = 0;
 	u64 sw_lro_queued = 0;
 	u64 sw_lro_flushed = 0;
 	u64 rx_csum_none = 0;
 	u64 rx_wqe_err = 0;
 	u64 rx_packets = 0;
 	u64 rx_bytes = 0;
 	u32 rx_out_of_buffer = 0;
 	int error;
 	int i;
 	int j;
 
 	out = mlx5_vzalloc(outlen);
 	if (out == NULL)
 		goto free_out;
 
 	/* Collect first the SW counters and then HW for consistency */
 	for (i = 0; i < priv->params.num_channels; i++) {
 		struct mlx5e_channel *pch = priv->channel + i;
 		struct mlx5e_rq *rq = &pch->rq;
 		struct mlx5e_rq_stats *rq_stats = &pch->rq.stats;
 
 		/* collect stats from LRO */
 		rq_stats->sw_lro_queued = rq->lro.lro_queued;
 		rq_stats->sw_lro_flushed = rq->lro.lro_flushed;
 		sw_lro_queued += rq_stats->sw_lro_queued;
 		sw_lro_flushed += rq_stats->sw_lro_flushed;
 		lro_packets += rq_stats->lro_packets;
 		lro_bytes += rq_stats->lro_bytes;
 		rx_csum_none += rq_stats->csum_none;
 		rx_wqe_err += rq_stats->wqe_err;
 		rx_packets += rq_stats->packets;
 		rx_bytes += rq_stats->bytes;
 
 		/* one send queue per traffic class */
 		for (j = 0; j < priv->num_tc; j++) {
 			sq_stats = &pch->sq[j].stats;
 
 			tso_packets += sq_stats->tso_packets;
 			tso_bytes += sq_stats->tso_bytes;
 			/* both "dropped" and "enobuf" count as queue drops */
 			tx_queue_dropped += sq_stats->dropped;
 			tx_queue_dropped += sq_stats->enobuf;
 			tx_defragged += sq_stats->defragged;
 			tx_offload_none += sq_stats->csum_offload_none;
 		}
 	}
 
 #ifdef RATELIMIT
 	/* Collect statistics from all rate-limit queues */
 	for (j = 0; j < priv->rl.param.tx_worker_threads_def; j++) {
 		struct mlx5e_rl_worker *rlw = priv->rl.workers + j;
 
 		for (i = 0; i < priv->rl.param.tx_channels_per_worker_def; i++) {
 			struct mlx5e_rl_channel *channel = rlw->channels + i;
 			struct mlx5e_sq *sq = channel->sq;
 
 			/* skip rate-limit channels without a send queue */
 			if (sq == NULL)
 				continue;
 
 			sq_stats = &sq->stats;
 
 			tso_packets += sq_stats->tso_packets;
 			tso_bytes += sq_stats->tso_bytes;
 			tx_queue_dropped += sq_stats->dropped;
 			tx_queue_dropped += sq_stats->enobuf;
 			tx_defragged += sq_stats->defragged;
 			tx_offload_none += sq_stats->csum_offload_none;
 		}
 	}
 #endif
 
 	/* update counters */
 	s->tso_packets = tso_packets;
 	s->tso_bytes = tso_bytes;
 	s->tx_queue_dropped = tx_queue_dropped;
 	s->tx_defragged = tx_defragged;
 	s->lro_packets = lro_packets;
 	s->lro_bytes = lro_bytes;
 	s->sw_lro_queued = sw_lro_queued;
 	s->sw_lro_flushed = sw_lro_flushed;
 	s->rx_csum_none = rx_csum_none;
 	s->rx_wqe_err = rx_wqe_err;
 	s->rx_packets = rx_packets;
 	s->rx_bytes = rx_bytes;
 
 	mlx5e_grp_vnic_env_update_stats(priv);
 
 	/* HW counters */
 	memset(in, 0, sizeof(in));
 
 	MLX5_SET(query_vport_counter_in, in, opcode,
 	    MLX5_CMD_OP_QUERY_VPORT_COUNTER);
 	MLX5_SET(query_vport_counter_in, in, op_mod, 0);
 	MLX5_SET(query_vport_counter_in, in, other_vport, 0);
 
 	memset(out, 0, outlen);
 
 	/* get number of out-of-buffer drops first */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0 &&
 	    mlx5_vport_query_out_of_rx_buffer(mdev, priv->counter_set_id,
 	    &rx_out_of_buffer) == 0) {
 		s->rx_out_of_buffer = rx_out_of_buffer;
 	}
 
 	/* get port statistics; on failure the old values are kept */
 	if (mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen) == 0) {
 #define	MLX5_GET_CTR(out, x) \
 	MLX5_GET64(query_vport_counter_out, out, x)
 
 		s->rx_error_packets =
 		    MLX5_GET_CTR(out, received_errors.packets);
 		s->rx_error_bytes =
 		    MLX5_GET_CTR(out, received_errors.octets);
 		s->tx_error_packets =
 		    MLX5_GET_CTR(out, transmit_errors.packets);
 		s->tx_error_bytes =
 		    MLX5_GET_CTR(out, transmit_errors.octets);
 
 		s->rx_unicast_packets =
 		    MLX5_GET_CTR(out, received_eth_unicast.packets);
 		s->rx_unicast_bytes =
 		    MLX5_GET_CTR(out, received_eth_unicast.octets);
 		s->tx_unicast_packets =
 		    MLX5_GET_CTR(out, transmitted_eth_unicast.packets);
 		s->tx_unicast_bytes =
 		    MLX5_GET_CTR(out, transmitted_eth_unicast.octets);
 
 		s->rx_multicast_packets =
 		    MLX5_GET_CTR(out, received_eth_multicast.packets);
 		s->rx_multicast_bytes =
 		    MLX5_GET_CTR(out, received_eth_multicast.octets);
 		s->tx_multicast_packets =
 		    MLX5_GET_CTR(out, transmitted_eth_multicast.packets);
 		s->tx_multicast_bytes =
 		    MLX5_GET_CTR(out, transmitted_eth_multicast.octets);
 
 		s->rx_broadcast_packets =
 		    MLX5_GET_CTR(out, received_eth_broadcast.packets);
 		s->rx_broadcast_bytes =
 		    MLX5_GET_CTR(out, received_eth_broadcast.octets);
 		s->tx_broadcast_packets =
 		    MLX5_GET_CTR(out, transmitted_eth_broadcast.packets);
 		s->tx_broadcast_bytes =
 		    MLX5_GET_CTR(out, transmitted_eth_broadcast.octets);
 
 		/* TX totals are the sum of the three cast types */
 		s->tx_packets = s->tx_unicast_packets +
 		    s->tx_multicast_packets + s->tx_broadcast_packets;
 		s->tx_bytes = s->tx_unicast_bytes + s->tx_multicast_bytes +
 		    s->tx_broadcast_bytes;
 
 		/* Update calculated offload counters */
 		s->tx_csum_offload = s->tx_packets - tx_offload_none;
 		s->rx_csum_good = s->rx_packets - s->rx_csum_none;
 	}
 
 	/* Get physical port counters */
 	mlx5e_update_pport_counters(priv);
 
 	/* jumbo packets: sum of the TX size buckets from 1519 octets up */
 	s->tx_jumbo_packets =
 	    priv->stats.port_stats_debug.tx_stat_p1519to2047octets +
 	    priv->stats.port_stats_debug.tx_stat_p2048to4095octets +
 	    priv->stats.port_stats_debug.tx_stat_p4096to8191octets +
 	    priv->stats.port_stats_debug.tx_stat_p8192to10239octets;
 
 #if (__FreeBSD_version < 1100000)
 	/* no get_counters interface in fbsd 10 */
 	ifp->if_ipackets = s->rx_packets;
 	ifp->if_ierrors = priv->stats.pport.in_range_len_errors +
 	    priv->stats.pport.out_of_range_len +
 	    priv->stats.pport.too_long_errors +
 	    priv->stats.pport.check_seq_err +
 	    priv->stats.pport.alignment_err;
 	ifp->if_iqdrops = s->rx_out_of_buffer;
 	ifp->if_opackets = s->tx_packets;
 	ifp->if_oerrors = priv->stats.port_stats_debug.out_discards;
 	ifp->if_snd.ifq_drops = s->tx_queue_dropped;
 	ifp->if_ibytes = s->rx_bytes;
 	ifp->if_obytes = s->tx_bytes;
 	ifp->if_collisions =
 	    priv->stats.pport.collisions;
 #endif
 
 free_out:
 	/* also reached with out == NULL when the allocation failed */
 	kvfree(out);
 
 	/* Update diagnostics, if any */
 	if (priv->params_ethtool.diag_pci_enable ||
 	    priv->params_ethtool.diag_general_enable) {
 		error = mlx5_core_get_diagnostics_full(mdev,
 		    priv->params_ethtool.diag_pci_enable ? &priv->params_pci : NULL,
 		    priv->params_ethtool.diag_general_enable ? &priv->params_general : NULL);
 		if (error != 0)
 			mlx5_en_err(priv->ifp,
 			    "Failed reading diagnostics: %d\n", error);
 	}
 
 	/* Update FEC, if any; EOPNOTSUPP is not logged */
 	error = mlx5e_fec_update(priv);
 	if (error != 0 && error != EOPNOTSUPP) {
 		mlx5_en_err(priv->ifp,
 		    "Updating FEC failed: %d\n", error);
 	}
 
 	/* Update temperature, if any */
 	if (priv->params_ethtool.hw_num_temp != 0) {
 		error = mlx5e_hw_temperature_update(priv);
 		if (error != 0 && error != EOPNOTSUPP) {
 			mlx5_en_err(priv->ifp,
 			    "Updating temperature failed: %d\n", error);
 		}
 	}
 }
 
 /*
  * Work item handler: with the priv lock held, collect the statistics
  * if the interface is opened and the core device is not in the
  * teardown state.
  */
 static void
 mlx5e_update_stats_work(struct work_struct *work)
 {
 	struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv,
 	    update_stats_work);
 
 	PRIV_LOCK(priv);
 
 	/* nothing to collect while the interface is closed */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0)
 		goto done;
 
 	/* leave the device alone while it is being torn down */
 	if (test_bit(MLX5_INTERFACE_STATE_TEARDOWN, &priv->mdev->intf_state))
 		goto done;
 
 	mlx5e_update_stats_locked(priv);
 done:
 	PRIV_UNLOCK(priv);
 }
 
 /*
  * Callout handler for priv->watchdog: queue the statistics work item
  * on priv->wq and re-arm the callout so that this function runs again
  * after hz / 4 ticks.  "arg" is the priv pointer.
  */
 static void
 mlx5e_update_stats(void *arg)
 {
 	struct mlx5e_priv *priv = arg;
 
 	queue_work(priv->wq, &priv->update_stats_work);
 
 	callout_reset(&priv->watchdog, hz / 4, &mlx5e_update_stats, priv);
 }
 
 /*
  * React to a device event: port up and port down schedule the carrier
  * update work item, every other event is ignored.
  */
 static void
 mlx5e_async_event_sub(struct mlx5e_priv *priv,
     enum mlx5_dev_event event)
 {
 	if (event != MLX5_DEV_EVENT_PORT_UP &&
 	    event != MLX5_DEV_EVENT_PORT_DOWN)
 		return;
 
 	queue_work(priv->wq, &priv->update_carrier_work);
 }
 
 /*
  * Device event entry point.  While holding async_events_mtx, the event
  * is forwarded to mlx5e_async_event_sub() only if the
  * MLX5E_STATE_ASYNC_EVENTS_ENABLE bit is set; otherwise it is dropped.
  * "vpriv" is the priv pointer; "mdev" and "param" are not used here.
  */
 static void
 mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv,
     enum mlx5_dev_event event, unsigned long param)
 {
 	struct mlx5e_priv *priv = vpriv;
 
 	mtx_lock(&priv->async_events_mtx);
 	if (test_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state))
 		mlx5e_async_event_sub(priv, event);
 	mtx_unlock(&priv->async_events_mtx);
 }
 
 /*
  * Set the MLX5E_STATE_ASYNC_EVENTS_ENABLE bit that mlx5e_async_event()
  * tests before forwarding an event.  Unlike the disable path, this
  * does not take async_events_mtx.
  */
 static void
 mlx5e_enable_async_events(struct mlx5e_priv *priv)
 {
 	set_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state);
 }
 
 /*
  * Stop dispatching asynchronous events.  The enable bit is cleared
  * while holding async_events_mtx, the same mutex that
  * mlx5e_async_event() holds while checking the bit and dispatching.
  */
 static void
 mlx5e_disable_async_events(struct mlx5e_priv *priv)
 {
 	mtx_lock(&priv->async_events_mtx);
 	clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state);
 	mtx_unlock(&priv->async_events_mtx);
 }
 
 /* Forward declaration; the callout and its reset helper call each other. */
 static void mlx5e_calibration_callout(void *arg);
 
 /*
  * Timestamp calibration tunables, exported below under the "calibr"
  * sysctl node.  The two interval values are multiplied by hz when the
  * calibration callout is armed.
  */
 static int mlx5e_calibration_duration = 20;
 static int mlx5e_fast_calibration = 1;
 static int mlx5e_normal_calibration = 30;
 
 static SYSCTL_NODE(_hw_mlx5, OID_AUTO, calibr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "MLX5 timestamp calibration parameters");
 
 SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, duration, CTLFLAG_RWTUN,
     &mlx5e_calibration_duration, 0,
     "Duration of initial calibration");
 SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, fast, CTLFLAG_RWTUN,
     &mlx5e_fast_calibration, 0,
     "Recalibration interval during initial calibration");
 SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, normal, CTLFLAG_RWTUN,
     &mlx5e_normal_calibration, 0,
     "Recalibration interval during normal operations");
 
 /*
  * Ignites the calibration process.
  *
  * While no calibration has completed (clbr_done == 0) the calibration
  * routine is invoked directly.  Otherwise the tstmp_clbr callout is
  * armed: the fast interval is used while clbr_done is still below
  * mlx5e_calibration_duration, the normal interval afterwards.  The
  * chosen interval is multiplied by hz.
  */
 static void
 mlx5e_reset_calibration_callout(struct mlx5e_priv *priv)
 {
 	int interval;
 
 	if (priv->clbr_done == 0) {
 		mlx5e_calibration_callout(priv);
 		return;
 	}
 
 	if (priv->clbr_done < mlx5e_calibration_duration)
 		interval = mlx5e_fast_calibration;
 	else
 		interval = mlx5e_normal_calibration;
 
 	callout_reset_curcpu(&priv->tstmp_clbr, interval * hz,
 	    mlx5e_calibration_callout, priv);
 }
 
 /*
  * Flatten a timespec into a single 64-bit count.
  *
  * Despite the "usec" in the name, the result is expressed in
  * nanoseconds: seconds are scaled by 10^9 and tv_nsec is added.
  */
 static uint64_t
 mlx5e_timespec2usec(const struct timespec *ts)
 {
 	const uint64_t sec_part = (uint64_t)ts->tv_sec * 1000000000;
 
 	return (sec_part + ts->tv_nsec);
 }
 
 /*
  * Read the 64-bit internal timer from the device initialization
  * segment.  The timer is exposed as two 32-bit big-endian registers;
  * the high word is sampled before and after the low word and the
  * read is retried until both high-word samples agree, so that the
  * halves combined below belong to the same 64-bit value.
  */
 static uint64_t
 mlx5e_hw_clock(struct mlx5e_priv *priv)
 {
 	struct mlx5_init_seg *iseg;
 	uint32_t hw_h, hw_h1, hw_l;
 
 	iseg = priv->mdev->iseg;
 	do {
 		hw_h = ioread32be(&iseg->internal_timer_h);
 		hw_l = ioread32be(&iseg->internal_timer_l);
 		hw_h1 = ioread32be(&iseg->internal_timer_h);
 	} while (hw_h1 != hw_h);
 	return (((uint64_t)hw_h << 32) | hw_l);
 }
 
 /*
  * The calibration callout, it runs either in the context of the
  * thread which enables calibration, or in callout.  It takes the
  * snapshot of system and adapter clocks, then advances the pointers to
  * the calibration point to allow rx path to read the consistent data
  * lockless.
  */
 static void
 mlx5e_calibration_callout(void *arg)
 {
 	struct mlx5e_priv *priv;
 	struct mlx5e_clbr_point *next, *curr;
 	struct timespec ts;
 	int clbr_curr_next;
 
 	priv = arg;
 	curr = &priv->clbr_points[priv->clbr_curr];
 	clbr_curr_next = priv->clbr_curr + 1;
 	if (clbr_curr_next >= nitems(priv->clbr_points))
 		clbr_curr_next = 0;
 	next = &priv->clbr_points[clbr_curr_next];
 
 	next->base_prev = curr->base_curr;
 	next->clbr_hw_prev = curr->clbr_hw_curr;
 
 	next->clbr_hw_curr = mlx5e_hw_clock(priv);
 	if (((next->clbr_hw_curr - curr->clbr_hw_curr) >> MLX5E_TSTMP_PREC) ==
 	    0) {
 		if (priv->clbr_done != 0) {
 			mlx5_en_err(priv->ifp,
 			    "HW failed tstmp frozen %#jx %#jx, disabling\n",
 			     next->clbr_hw_curr, curr->clbr_hw_prev);
 			priv->clbr_done = 0;
 		}
 		atomic_store_rel_int(&curr->clbr_gen, 0);
 		return;
 	}
 
 	nanouptime(&ts);
 	next->base_curr = mlx5e_timespec2usec(&ts);
 
 	curr->clbr_gen = 0;
 	atomic_thread_fence_rel();
 	priv->clbr_curr = clbr_curr_next;
 	atomic_store_rel_int(&next->clbr_gen, ++(priv->clbr_gen));
 
 	if (priv->clbr_done < mlx5e_calibration_duration)
 		priv->clbr_done++;
 	mlx5e_reset_calibration_callout(priv);
 }
 
 /*
  * Description strings for the per-RQ statistics, generated by
  * expanding MLX5E_RQ_STATS with MLX5E_STATS_DESC.  Passed to
  * mlx5e_create_stats() when a receive queue is created.
  */
 static const char *mlx5e_rq_stats_desc[] = {
 	MLX5E_RQ_STATS(MLX5E_STATS_DESC)
 };
 
 /*
  * Allocate the software side of a receive queue: the busdma tag, the
  * linked-list work queue, the LRO state, the per-WQE mbuf array with
  * one DMA map per entry, the DIM work item and the "rxstat<ix>"
  * sysctl statistics.
  *
  * Returns zero on success.  On failure everything allocated so far is
  * released again and an error code is returned; note that the results
  * of bus_dma_tag_create(), bus_dmamap_create() and
  * tcp_lro_init_args() are negated before being returned.
  */
 static int
 mlx5e_create_rq(struct mlx5e_channel *c,
     struct mlx5e_rq_param *param,
     struct mlx5e_rq *rq)
 {
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	char buffer[16];
 	void *rqc = param->rqc;
 	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
 	int wq_sz;
 	int err;
 	int i;
 	u32 nsegs, wqe_sz;
 
 	/* only "nsegs" is consumed below, to size the DMA tag */
 	err = mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
 	if (err != 0)
 		goto done;
 
 	/* Create DMA descriptor TAG */
 	if ((err = -bus_dma_tag_create(
 	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
 	    1,				/* any alignment */
 	    0,				/* no boundary */
 	    BUS_SPACE_MAXADDR,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,		/* highaddr */
 	    NULL, NULL,			/* filter, filterarg */
 	    nsegs * MLX5E_MAX_RX_BYTES,	/* maxsize */
 	    nsegs,			/* nsegments */
 	    nsegs * MLX5E_MAX_RX_BYTES,	/* maxsegsize */
 	    0,				/* flags */
 	    NULL, NULL,			/* lockfunc, lockfuncarg */
 	    &rq->dma_tag)))
 		goto done;
 
 	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wq,
 	    &rq->wq_ctrl);
 	if (err)
 		goto err_free_dma_tag;
 
 	/* point the doorbell at the receive doorbell record */
 	rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
 
 	/* same computation as above, this time stored in the RQ itself */
 	err = mlx5e_get_wqe_sz(priv, &rq->wqe_sz, &rq->nsegs);
 	if (err != 0)
 		goto err_rq_wq_destroy;
 
 	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
 
 	err = -tcp_lro_init_args(&rq->lro, priv->ifp, TCP_LRO_ENTRIES, wq_sz);
 	if (err)
 		goto err_rq_wq_destroy;
 
 	rq->mbuf = malloc_domainset(wq_sz * sizeof(rq->mbuf[0]), M_MLX5EN,
 	    mlx5_dev_domainset(mdev), M_WAITOK | M_ZERO);
 	for (i = 0; i != wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
 		int j;
 
 		err = -bus_dmamap_create(rq->dma_tag, 0, &rq->mbuf[i].dma_map);
 		if (err != 0) {
 			/* destroy the maps created by earlier iterations */
 			while (i--)
 				bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map);
 			goto err_rq_mbuf_free;
 		}
 
 		/* set value for constant fields */
 		for (j = 0; j < rq->nsegs; j++)
 			wqe->data[j].lkey = cpu_to_be32(priv->mr.key);
 	}
 
 	INIT_WORK(&rq->dim.work, mlx5e_dim_work);
 	/* moderation modes below 2 run with DIM disabled */
 	if (priv->params.rx_cq_moderation_mode < 2) {
 		rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED;
 	} else {
 		/* "param" is embedded in a channel param; fetch its RX CQ context */
 		void *cqc = container_of(param,
 		    struct mlx5e_channel_param, rq)->rx_cq.cqc;
 
 		switch (MLX5_GET(cqc, cqc, cq_period_mode)) {
 		case MLX5_CQ_PERIOD_MODE_START_FROM_EQE:
 			rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 			break;
 		case MLX5_CQ_PERIOD_MODE_START_FROM_CQE:
 			rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE;
 			break;
 		default:
 			rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED;
 			break;
 		}
 	}
 
 	rq->ifp = priv->ifp;
 	rq->channel = c;
 	rq->ix = c->ix;
 
 	snprintf(buffer, sizeof(buffer), "rxstat%d", c->ix);
 	mlx5e_create_stats(&rq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    buffer, mlx5e_rq_stats_desc, MLX5E_RQ_STATS_NUM,
 	    rq->stats.arg);
 	return (0);
 
 	/* unwind in reverse order of allocation */
 err_rq_mbuf_free:
 	free(rq->mbuf, M_MLX5EN);
 	tcp_lro_free(&rq->lro);
 err_rq_wq_destroy:
 	mlx5_wq_destroy(&rq->wq_ctrl);
 err_free_dma_tag:
 	bus_dma_tag_destroy(rq->dma_tag);
 done:
 	return (err);
 }
 
 /*
  * Release everything mlx5e_create_rq() allocated: the sysctl
  * statistics, the LRO state, any mbufs still attached to work queue
  * entries together with their DMA maps, the mbuf array, the work
  * queue and finally the DMA tag.
  */
 static void
 mlx5e_destroy_rq(struct mlx5e_rq *rq)
 {
 	int wq_sz;
 	int i;
 
 	/* destroy all sysctl nodes */
 	sysctl_ctx_free(&rq->stats.ctx);
 
 	/* free leftover LRO packets, if any */
 	tcp_lro_free(&rq->lro);
 
 	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
 	for (i = 0; i != wq_sz; i++) {
 		/* entries that still hold an mbuf must be unloaded first */
 		if (rq->mbuf[i].mbuf != NULL) {
 			bus_dmamap_unload(rq->dma_tag, rq->mbuf[i].dma_map);
 			m_freem(rq->mbuf[i].mbuf);
 		}
 		bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map);
 	}
 	free(rq->mbuf, M_MLX5EN);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 	bus_dma_tag_destroy(rq->dma_tag);
 }
 
 /*
  * Build a create_rq command from the RQ context template in "param"
  * plus the per-queue settings (CQ number, timestamp format, counter
  * set, page layout and doorbell address) and pass it to
  * mlx5_core_create_rq().  The queue is requested in the RST state and
  * its number is stored in rq->rqn.
  *
  * Returns -ENOMEM if the command buffer cannot be allocated, else the
  * result of mlx5_core_create_rq().
  */
 static int
 mlx5e_enable_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param)
 {
 	struct mlx5e_channel *c = rq->channel;
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	void *in;
 	void *rqc;
 	void *wq;
 	int inlen;
 	int err;
 	u8 ts_format;
 
 	/* command header plus one 64-bit address per work queue page */
 	inlen = MLX5_ST_SZ_BYTES(create_rq_in) +
 	    sizeof(u64) * rq->wq_ctrl.buf.npages;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	ts_format = mlx5_get_rq_default_ts(mdev);
 	rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
 	wq = MLX5_ADDR_OF(rqc, rqc, wq);
 
 	/* start from the template, then override the per-queue fields */
 	memcpy(rqc, param->rqc, sizeof(param->rqc));
 
 	MLX5_SET(rqc, rqc, cqn, c->rq.cq.mcq.cqn);
 	MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
 	MLX5_SET(rqc, rqc, ts_format, ts_format);
 	MLX5_SET(rqc, rqc, flush_in_error_en, 1);
 	/* a negative counter_set_id means none is attached */
 	if (priv->counter_set_id >= 0)
 		MLX5_SET(rqc, rqc, counter_set_id, priv->counter_set_id);
 	MLX5_SET(wq, wq, log_wq_pg_sz, rq->wq_ctrl.buf.page_shift -
 	    PAGE_SHIFT);
 	MLX5_SET64(wq, wq, dbr_addr, rq->wq_ctrl.db.dma);
 
 	mlx5_fill_page_array(&rq->wq_ctrl.buf,
 	    (__be64 *) MLX5_ADDR_OF(wq, wq, pas));
 
 	err = mlx5_core_create_rq(mdev, in, inlen, &rq->rqn);
 
 	kvfree(in);
 
 	return (err);
 }
 
 /*
  * Request a state transition of the receive queue from "curr_state"
  * to "next_state" through mlx5_core_modify_rq().
  *
  * Returns -ENOMEM if the command buffer cannot be allocated, else the
  * result of mlx5_core_modify_rq().
  */
 static int
 mlx5e_modify_rq(struct mlx5e_rq *rq, int curr_state, int next_state)
 {
 	struct mlx5e_channel *c = rq->channel;
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 
 	void *in;
 	void *rqc;
 	int inlen;
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(modify_rq_in);
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
 
 	/* the command names the current state, the context the new one */
 	MLX5_SET(modify_rq_in, in, rqn, rq->rqn);
 	MLX5_SET(modify_rq_in, in, rq_state, curr_state);
 	MLX5_SET(rqc, rqc, state, next_state);
 
 	err = mlx5_core_modify_rq(mdev, in, inlen);
 
 	kvfree(in);
 
 	return (err);
 }
 
 /*
  * Destroy the receive queue object identified by rq->rqn on the core
  * device that owns the queue's channel.
  */
 static void
 mlx5e_disable_rq(struct mlx5e_rq *rq)
 {
 
 	mlx5_core_destroy_rq(rq->channel->priv->mdev, rq->rqn);
 }
 
 /*
  * Poll until the receive work queue holds at least
  * priv->params.min_rx_wqes entries.  The fill level is checked up to
  * 1000 times with an msleep(4) after each unsuccessful check.
  *
  * Returns zero once the threshold is reached, -ETIMEDOUT otherwise.
  */
 static int
 mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq)
 {
 	struct mlx5e_priv *priv = rq->channel->priv;
 	struct mlx5_wq_ll *wq = &rq->wq;
 	int attempts_left = 1000;
 
 	while (attempts_left-- > 0) {
 		if (wq->cur_sz >= priv->params.min_rx_wqes)
 			return (0);
 
 		msleep(4);
 	}
 	return (-ETIMEDOUT);
 }
 
 /*
  * Bring up a receive queue in three steps: create the software
  * state, create the queue object, then move it from RST to RDY.  On
  * success the channel's RQ is flagged as enabled.  If a later step
  * fails, the steps already completed are undone and the error is
  * returned.
  */
 static int
 mlx5e_open_rq(struct mlx5e_channel *c,
     struct mlx5e_rq_param *param,
     struct mlx5e_rq *rq)
 {
 	int err;
 
 	err = mlx5e_create_rq(c, param, rq);
 	if (err)
 		return (err);
 
 	err = mlx5e_enable_rq(rq, param);
 	if (!err) {
 		err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST,
 		    MLX5_RQC_STATE_RDY);
 		if (!err) {
 			c->rq.enabled = 1;
 			return (0);
 		}
 		/* the queue object exists but could not be made ready */
 		mlx5e_disable_rq(rq);
 	}
 	mlx5e_destroy_rq(rq);
 
 	return (err);
 }
 
 /*
  * First stage of closing a receive queue: under rq->mtx mark the
  * queue disabled and stop its watchdog callout, then request the
  * RDY to ERR transition.
  *
  * NOTE(review): the return value of mlx5e_modify_rq() is ignored
  * here, whereas mlx5e_drain_sq() logs the equivalent SQ failure -
  * confirm that this is intentional.
  */
 static void
 mlx5e_close_rq(struct mlx5e_rq *rq)
 {
 	mtx_lock(&rq->mtx);
 	rq->enabled = 0;
 	callout_stop(&rq->watchdog);
 	mtx_unlock(&rq->mtx);
 
 	mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR);
 }
 
 /*
  * Second stage of closing a receive queue: destroy the queue object,
  * close its completion queue, wait for any pending DIM work to finish
  * and finally free the software state.
  */
 static void
 mlx5e_close_rq_wait(struct mlx5e_rq *rq)
 {
 
 	mlx5e_disable_rq(rq);
 	mlx5e_close_cq(&rq->cq);
 	cancel_work_sync(&rq->dim.work);
 	mlx5e_destroy_rq(rq);
 }
 
 /*
  * Free the per-WQE bookkeeping array of a send queue.  For every
  * entry a still-referenced counter is decremented, a still-attached
  * mbuf is unloaded and freed, and the DMA map is destroyed; then the
  * array itself is released.
  */
 void
 mlx5e_free_sq_db(struct mlx5e_sq *sq)
 {
 	int wq_sz = mlx5_wq_cyc_get_size(&sq->wq);
 	int x;
 
 	for (x = 0; x != wq_sz; x++) {
 		/* drop the reference this entry still holds, if any */
 		if (unlikely(sq->mbuf[x].p_refcount != NULL)) {
 			atomic_add_int(sq->mbuf[x].p_refcount, -1);
 			sq->mbuf[x].p_refcount = NULL;
 		}
 		if (sq->mbuf[x].mbuf != NULL) {
 			bus_dmamap_unload(sq->dma_tag, sq->mbuf[x].dma_map);
 			m_freem(sq->mbuf[x].mbuf);
 		}
 		bus_dmamap_destroy(sq->dma_tag, sq->mbuf[x].dma_map);
 	}
 	free(sq->mbuf, M_MLX5EN);
 }
 
 /*
  * Allocate the zeroed per-WQE bookkeeping array of a send queue and
  * create one DMA map per entry.
  *
  * Returns zero on success.  If a map cannot be created, the maps
  * created so far and the array are released and the negated
  * bus_dmamap_create() result is returned.
  */
 int
 mlx5e_alloc_sq_db(struct mlx5e_sq *sq)
 {
 	int wq_sz = mlx5_wq_cyc_get_size(&sq->wq);
 	int err;
 	int x;
 
 	sq->mbuf = malloc_domainset(wq_sz * sizeof(sq->mbuf[0]), M_MLX5EN,
 	    mlx5_dev_domainset(sq->priv->mdev), M_WAITOK | M_ZERO);
 
 	/* Create DMA descriptor MAPs */
 	for (x = 0; x != wq_sz; x++) {
 		err = -bus_dmamap_create(sq->dma_tag, 0, &sq->mbuf[x].dma_map);
 		if (err != 0) {
 			/* undo the maps created by earlier iterations */
 			while (x--)
 				bus_dmamap_destroy(sq->dma_tag, sq->mbuf[x].dma_map);
 			free(sq->mbuf, M_MLX5EN);
 			return (err);
 		}
 	}
 	return (0);
 }
 
 /*
  * Description strings for the per-SQ statistics, generated by
  * expanding MLX5E_SQ_STATS with MLX5E_STATS_DESC.  Passed to
  * mlx5e_create_stats() when a send queue is created.
  */
 static const char *mlx5e_sq_stats_desc[] = {
 	MLX5E_SQ_STATS(MLX5E_STATS_DESC)
 };
 
 /*
  * Refresh the inline settings of a send queue from the interface
  * parameters and recompute which insertion capabilities are required.
  */
 void
 mlx5e_update_sq_inline(struct mlx5e_sq *sq)
 {
 	sq->max_inline = sq->priv->params.tx_max_inline;
 	sq->min_inline_mode = sq->priv->params.tx_min_inline_mode;
 
 	/*
 	 * With PCP trust state and an inline mode other than NONE no
 	 * insertion capabilities are needed.
 	 */
 	if (sq->priv->params_ethtool.trust_state == MLX5_QPTS_TRUST_PCP &&
 	    sq->min_inline_mode != MLX5_INLINE_MODE_NONE) {
 		sq->min_insert_caps = 0;
 		return;
 	}
 
 	/*
 	 * Trust state is DSCP, or inline mode is NONE which indicates
 	 * CX-5 or newer hardware.
 	 */
 	sq->min_insert_caps = MLX5_CAP_ETH(sq->priv->mdev, wqe_vlan_insert) ?
 	    (MLX5E_INSERT_VLAN | MLX5E_INSERT_NON_VLAN) :
 	    MLX5E_INSERT_NON_VLAN;
 }
 
 static void
 mlx5e_refresh_sq_inline_sub(struct mlx5e_priv *priv, struct mlx5e_channel *c)
 {
 	int i;
 
 	for (i = 0; i != priv->num_tc; i++) {
 		mtx_lock(&c->sq[i].lock);
 		mlx5e_update_sq_inline(&c->sq[i]);
 		mtx_unlock(&c->sq[i].lock);
 	}
 }
 
 /*
  * Re-evaluate the inline settings of all send queues of all channels.
  * Nothing is done while the channels are closed.
  */
 void
 mlx5e_refresh_sq_inline(struct mlx5e_priv *priv)
 {
 	int ch;
 
 	/* only walk the channels while they are open */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0) {
 		for (ch = 0; ch < priv->params.num_channels; ch++) {
 			mlx5e_refresh_sq_inline_sub(priv,
 			    &priv->channel[ch]);
 		}
 	}
 }
 
 /*
  * Allocate the software side of a send queue for traffic class "tc":
  * the busdma tag, the cyclic work queue, the per-WQE bookkeeping
  * array, the inline settings and the "txstat<ix>tc<tc>" sysctl
  * statistics.
  *
  * Returns zero on success.  On failure the resources allocated so far
  * are released and an error code is returned; the result of
  * bus_dma_tag_create() is negated before being returned.
  */
 static int
 mlx5e_create_sq(struct mlx5e_channel *c,
     int tc,
     struct mlx5e_sq_param *param,
     struct mlx5e_sq *sq)
 {
 	struct mlx5e_priv *priv = c->priv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	char buffer[16];
 	void *sqc = param->sqc;
 	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
 	int err;
 
 	/* Create DMA descriptor TAG */
 	if ((err = -bus_dma_tag_create(
 	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
 	    1,				/* any alignment */
 	    0,				/* no boundary */
 	    BUS_SPACE_MAXADDR,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,		/* highaddr */
 	    NULL, NULL,			/* filter, filterarg */
 	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
 	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
 	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
 	    0,				/* flags */
 	    NULL, NULL,			/* lockfunc, lockfuncarg */
 	    &sq->dma_tag)))
 		goto done;
 
 	sq->mkey_be = cpu_to_be32(priv->mr.key);
 	sq->ifp = priv->ifp;
 	sq->priv = priv;
 	sq->tc = tc;
 
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
 	    &sq->wq_ctrl);
 	if (err)
 		goto err_free_dma_tag;
 
 	/* point the doorbell at the send doorbell record */
 	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
 
 	err = mlx5e_alloc_sq_db(sq);
 	if (err)
 		goto err_sq_wq_destroy;
 
 	mlx5e_update_sq_inline(sq);
 
 	snprintf(buffer, sizeof(buffer), "txstat%dtc%d", c->ix, tc);
 	mlx5e_create_stats(&sq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    buffer, mlx5e_sq_stats_desc, MLX5E_SQ_STATS_NUM,
 	    sq->stats.arg);
 
 	return (0);
 
 	/* unwind in reverse order of allocation */
 err_sq_wq_destroy:
 	mlx5_wq_destroy(&sq->wq_ctrl);
 
 err_free_dma_tag:
 	bus_dma_tag_destroy(sq->dma_tag);
 done:
 	return (err);
 }
 
 /*
  * Release everything mlx5e_create_sq() allocated: the sysctl
  * statistics, the per-WQE bookkeeping array, the work queue and the
  * DMA tag.
  */
 static void
 mlx5e_destroy_sq(struct mlx5e_sq *sq)
 {
 	/* destroy all sysctl nodes */
 	sysctl_ctx_free(&sq->stats.ctx);
 
 	mlx5e_free_sq_db(sq);
 	mlx5_wq_destroy(&sq->wq_ctrl);
 	bus_dma_tag_destroy(sq->dma_tag);
 }
 
 /*
  * Build a create_sq command from the SQ context template in "param"
  * plus the per-queue settings (TIS number, CQ number, timestamp
  * format, UAR page from "bfreg", page layout and doorbell address)
  * and pass it to mlx5_core_create_sq().  The queue is requested in
  * the RST state and its number is stored in sq->sqn.  As a side
  * effect sq->uar_map is set from "bfreg".
  *
  * Returns -ENOMEM if the command buffer cannot be allocated, else the
  * result of mlx5_core_create_sq().
  */
 int
 mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param,
     const struct mlx5_sq_bfreg *bfreg, int tis_num)
 {
 	void *in;
 	void *sqc;
 	void *wq;
 	int inlen;
 	int err;
 	u8 ts_format;
 
 	/* command header plus one 64-bit address per work queue page */
 	inlen = MLX5_ST_SZ_BYTES(create_sq_in) +
 	    sizeof(u64) * sq->wq_ctrl.buf.npages;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	sq->uar_map = bfreg->map;
 
 	ts_format = mlx5_get_sq_default_ts(sq->priv->mdev);
 	sqc = MLX5_ADDR_OF(create_sq_in, in, ctx);
 	wq = MLX5_ADDR_OF(sqc, sqc, wq);
 
 	/* start from the template, then override the per-queue fields */
 	memcpy(sqc, param->sqc, sizeof(param->sqc));
 
 	MLX5_SET(sqc, sqc, tis_num_0, tis_num);
 	MLX5_SET(sqc, sqc, cqn, sq->cq.mcq.cqn);
 	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST);
 	MLX5_SET(sqc, sqc, ts_format, ts_format);
 	MLX5_SET(sqc, sqc, tis_lst_sz, 1);
 	MLX5_SET(sqc, sqc, flush_in_error_en, 1);
 	MLX5_SET(sqc, sqc, allow_swp, 1);
 
 	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
 	MLX5_SET(wq, wq, uar_page, bfreg->index);
 	MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift -
 	    PAGE_SHIFT);
 	MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma);
 
 	mlx5_fill_page_array(&sq->wq_ctrl.buf,
 	    (__be64 *) MLX5_ADDR_OF(wq, wq, pas));
 
 	err = mlx5_core_create_sq(sq->priv->mdev, in, inlen, &sq->sqn);
 
 	kvfree(in);
 
 	return (err);
 }
 
 /*
  * Request a state transition of the send queue from "curr_state" to
  * "next_state" through mlx5_core_modify_sq().
  *
  * Returns -ENOMEM if the command buffer cannot be allocated, else the
  * result of mlx5_core_modify_sq().
  */
 int
 mlx5e_modify_sq(struct mlx5e_sq *sq, int curr_state, int next_state)
 {
 	void *in;
 	void *sqc;
 	int inlen;
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
 
 	/* the command names the current state, the context the new one */
 	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
 	MLX5_SET(modify_sq_in, in, sq_state, curr_state);
 	MLX5_SET(sqc, sqc, state, next_state);
 
 	err = mlx5_core_modify_sq(sq->priv->mdev, in, inlen);
 
 	kvfree(in);
 
 	return (err);
 }
 
 /*
  * Destroy the send queue object identified by sq->sqn on the core
  * device of the owning interface.
  */
 void
 mlx5e_disable_sq(struct mlx5e_sq *sq)
 {
 
 	mlx5_core_destroy_sq(sq->priv->mdev, sq->sqn);
 }
 
 /*
  * Bring up the send queue of traffic class "tc" in three steps:
  * create the software state, create the queue object, then move it
  * from RST to RDY.  On success the queue is marked running.  If a
  * later step fails, the steps already completed are undone and the
  * error is returned.
  */
 static int
 mlx5e_open_sq(struct mlx5e_channel *c,
     int tc,
     struct mlx5e_sq_param *param,
     struct mlx5e_sq *sq)
 {
 	int err;
 
 	/* take the TX completion event factor, but never let it be zero */
 	sq->cev_factor = c->priv->params_ethtool.tx_completion_fact;
 	if (sq->cev_factor == 0)
 		sq->cev_factor = 1;
 
 	err = mlx5e_create_sq(c, tc, param, sq);
 	if (err)
 		return (err);
 
 	err = mlx5e_enable_sq(sq, param, &c->bfreg, c->priv->tisn[tc]);
 	if (!err) {
 		err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST,
 		    MLX5_SQC_STATE_RDY);
 		if (!err) {
 			WRITE_ONCE(sq->running, 1);
 			return (0);
 		}
 		/* the queue object exists but could not be made ready */
 		mlx5e_disable_sq(sq);
 	}
 	mlx5e_destroy_sq(sq);
 
 	return (err);
 }
 
 /*
  * Post NOPs until sq->cev_counter reaches zero, then ring the
  * doorbell if one is pending.  Must be called with sq->lock held.
  *
  * When the ring has no room, a caller passing a non-zero "can_sleep"
  * makes the function drop the lock, sleep and retry; otherwise the
  * function gives up early and only writes the pending doorbell.
  */
 static void
 mlx5e_sq_send_nops_locked(struct mlx5e_sq *sq, int can_sleep)
 {
 	/* fill up remainder with NOPs */
 	while (sq->cev_counter != 0) {
 		while (!mlx5e_sq_has_room_for(sq, 1)) {
 			if (can_sleep != 0) {
 				/* never sleep while holding the SQ lock */
 				mtx_unlock(&sq->lock);
 				msleep(4);
 				mtx_lock(&sq->lock);
 			} else {
 				goto done;
 			}
 		}
 		/* send a single NOP */
 		mlx5e_send_nop(sq, 1);
 		atomic_thread_fence_rel();
 	}
 done:
 	/* Check if we need to write the doorbell */
 	if (likely(sq->doorbell.d64 != 0)) {
 		mlx5e_tx_notify_hw(sq, sq->doorbell.d32);
 		sq->doorbell.d64 = 0;
 	}
 }
 
 /*
  * Completion event factor timer, called with sq->lock held.
  *
  * In the SEND_NOPS state the ring is topped up with NOPs without
  * sleeping; once cev_counter has dropped to zero the state returns to
  * INITIAL and the timer is left stopped.  In any other state the
  * NOPs are deferred to the next expiry.  Unless it stopped, the timer
  * is re-armed to fire after hz ticks.
  */
 void
 mlx5e_sq_cev_timeout(void *arg)
 {
 	struct mlx5e_sq *sq = arg;
 
 	mtx_assert(&sq->lock, MA_OWNED);
 
 	if (sq->cev_next_state == MLX5E_CEV_STATE_SEND_NOPS) {
 		/* fill TX ring with NOPs, if any */
 		mlx5e_sq_send_nops_locked(sq, 0);
 
 		/* all done, no need to run again */
 		if (sq->cev_counter == 0) {
 			sq->cev_next_state = MLX5E_CEV_STATE_INITIAL;
 			return;
 		}
 	} else {
 		/* send NOPs on next timeout */
 		sq->cev_next_state = MLX5E_CEV_STATE_SEND_NOPS;
 	}
 
 	/* restart timer */
 	callout_reset_curcpu(&sq->cev_callout, hz, mlx5e_sq_cev_timeout, sq);
 }
 
 /*
  * Stop a running send queue and wait for it to drain.
  *
  * The queue is marked not running, its completion event timer is
  * stopped and the ring is flushed with NOPs.  The function then
  * polls the completion handler until the consumer counter catches up
  * with the producer counter, moves the queue from RDY to ERR and
  * polls once more.  Both polling loops stop early when the device is
  * in internal error state or the PCI channel is offline; the first
  * one also stops when the link is not active.
  */
 void
 mlx5e_drain_sq(struct mlx5e_sq *sq)
 {
 	int error;
 	struct mlx5_core_dev *mdev= sq->priv->mdev;
 
 	/*
 	 * Check if already stopped.
 	 *
 	 * NOTE: Serialization of this function is managed by the
 	 * caller ensuring the priv's state lock is locked or in case
 	 * of rate limit support, a single thread manages drain and
 	 * resume of SQs. The "running" variable can therefore safely
 	 * be read without any locks.
 	 */
 	if (READ_ONCE(sq->running) == 0)
 		return;
 
 	/* don't put more packets into the SQ */
 	WRITE_ONCE(sq->running, 0);
 
 	/* serialize access to DMA rings */
 	mtx_lock(&sq->lock);
 
 	/* teardown event factor timer, if any */
 	sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
 	callout_stop(&sq->cev_callout);
 
 	/* send dummy NOPs in order to flush the transmit ring */
 	mlx5e_sq_send_nops_locked(sq, 1);
 	mtx_unlock(&sq->lock);
 
 	/* wait till SQ is empty or link is down */
 	mtx_lock(&sq->lock);
 	while (sq->cc != sq->pc &&
 	    (sq->priv->media_status_last & IFM_ACTIVE) != 0 &&
 	    mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR &&
 	    pci_channel_offline(mdev->pdev) == 0) {
 		/* the completion handler is invoked without the SQ lock */
 		mtx_unlock(&sq->lock);
 		msleep(1);
 		sq->cq.mcq.comp(&sq->cq.mcq, NULL);
 		mtx_lock(&sq->lock);
 	}
 	mtx_unlock(&sq->lock);
 
 	/* error out remaining requests */
 	error = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
 	if (error != 0) {
 		mlx5_en_err(sq->ifp,
 		    "mlx5e_modify_sq() from RDY to ERR failed: %d\n", error);
 	}
 
 	/* wait till SQ is empty */
 	mtx_lock(&sq->lock);
 	while (sq->cc != sq->pc &&
 	       mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR &&
 	       pci_channel_offline(mdev->pdev) == 0) {
 		mtx_unlock(&sq->lock);
 		msleep(1);
 		sq->cq.mcq.comp(&sq->cq.mcq, NULL);
 		mtx_lock(&sq->lock);
 	}
 	mtx_unlock(&sq->lock);
 }
 
 /*
  * Fully close a send queue: drain it, destroy the queue object and
  * free the software state, in that order.
  */
 static void
 mlx5e_close_sq_wait(struct mlx5e_sq *sq)
 {
 
 	mlx5e_drain_sq(sq);
 	mlx5e_disable_sq(sq);
 	mlx5e_destroy_sq(sq);
 }
 
 /*
  * Allocate the software side of a completion queue: resolve the IRQ
  * for vector "eq_ix", create the CQ work queue, initialise the core
  * CQ fields (doorbells, vector, completion and error callbacks) and
  * preset the op_own byte of every CQE.
  *
  * Returns zero on success, or the error from mlx5_vector2eqn() or
  * mlx5_cqwq_create().
  */
 static int
 mlx5e_create_cq(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param,
     struct mlx5e_cq *cq,
     mlx5e_cq_comp_t *comp,
     int eq_ix)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5_core_cq *mcq = &cq->mcq;
 	int eqn_not_used;
 	int irqn;
 	int err;
 	u32 i;
 
 	/* only the IRQ number is needed at this point */
 	err = mlx5_vector2eqn(mdev, eq_ix, &eqn_not_used, &irqn);
 	if (err)
 		return (err);
 
 	err = mlx5_cqwq_create(mdev, &param->wq, param->cqc, &cq->wq,
 	    &cq->wq_ctrl);
 	if (err)
 		return (err);
 
 	mcq->cqe_sz = 64;
 	/* the two doorbell words are adjacent: consumer index, then arm */
 	mcq->set_ci_db = cq->wq_ctrl.db.db;
 	mcq->arm_db = cq->wq_ctrl.db.db + 1;
 	*mcq->set_ci_db = 0;
 	*mcq->arm_db = 0;
 	mcq->vector = eq_ix;
 	mcq->comp = comp;
 	mcq->event = mlx5e_cq_error_event;
 	mcq->irqn = irqn;
 
 	/* NOTE(review): 0xf1 presumably marks the CQE invalid and HW-owned - confirm */
 	for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
 		struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
 
 		cqe->op_own = 0xf1;
 	}
 
 	cq->priv = priv;
 
 	return (0);
 }
 
 /* Free the work queue that backs the completion queue. */
 static void
 mlx5e_destroy_cq(struct mlx5e_cq *cq)
 {
 	mlx5_wq_destroy(&cq->wq_ctrl);
 }
 
 /*
  * Build a create_cq command from the CQ context template in "param"
  * plus the per-queue settings (event queue number, page layout and
  * doorbell address), pass it to mlx5_core_create_cq() and arm the
  * new completion queue.
  *
  * Returns zero on success, the error from mlx5_vector2eqn() if the
  * event queue for "eq_ix" cannot be resolved, -ENOMEM if the command
  * buffer cannot be allocated, else the error from
  * mlx5_core_create_cq().
  */
 static int
 mlx5e_enable_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param, int eq_ix)
 {
 	struct mlx5_core_cq *mcq = &cq->mcq;
 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
 	void *in;
 	void *cqc;
 	int inlen;
 	int irqn_not_used;
 	int eqn;
 	int err;
 
 	/*
 	 * Resolve the event queue number first, so that a failed
 	 * lookup can never program an uninitialized EQN.
 	 */
 	err = mlx5_vector2eqn(cq->priv->mdev, eq_ix, &eqn, &irqn_not_used);
 	if (err)
 		return (err);
 
 	/* command header plus one 64-bit address per work queue page */
 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
 	    sizeof(u64) * cq->wq_ctrl.buf.npages;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
 
 	/* start from the template, then override the per-queue fields */
 	memcpy(cqc, param->cqc, sizeof(param->cqc));
 
 	mlx5_fill_page_array(&cq->wq_ctrl.buf,
 	    (__be64 *) MLX5_ADDR_OF(create_cq_in, in, pas));
 
 	MLX5_SET(cqc, cqc, c_eqn, eqn);
 	MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
 	    PAGE_SHIFT);
 	MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma);
 
 	err = mlx5_core_create_cq(cq->priv->mdev, mcq, in, inlen, out, sizeof(out));
 
 	kvfree(in);
 
 	if (err)
 		return (err);
 
 	mlx5e_cq_arm(cq, MLX5_GET_DOORBELL_LOCK(&cq->priv->doorbell_lock));
 
 	return (0);
 }
 
 /* Destroy the completion queue object on the core device. */
 static void
 mlx5e_disable_cq(struct mlx5e_cq *cq)
 {
 
 	mlx5_core_destroy_cq(cq->priv->mdev, &cq->mcq);
 }
 
 /*
  * Open a completion queue bound to vector "eq_ix": allocate its
  * software state, then create the queue object.  If the second step
  * fails the software state is freed again.
  *
  * Returns zero on success or the error of the failing step.
  */
 int
 mlx5e_open_cq(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param,
     struct mlx5e_cq *cq,
     mlx5e_cq_comp_t *comp,
     int eq_ix)
 {
 	int err;
 
 	err = mlx5e_create_cq(priv, param, cq, comp, eq_ix);
 	if (err)
 		return (err);
 
 	err = mlx5e_enable_cq(cq, param, eq_ix);
 	if (err) {
 		mlx5e_destroy_cq(cq);
 		return (err);
 	}
 
 	return (0);
 }
 
 /*
  * Close a completion queue: destroy the queue object first, then free
  * the software state.
  */
 void
 mlx5e_close_cq(struct mlx5e_cq *cq)
 {
 	mlx5e_disable_cq(cq);
 	mlx5e_destroy_cq(cq);
 }
 
 /*
  * Open one transmit completion queue per traffic class of the
  * channel.  If one of them fails to open, the queues opened before it
  * are closed again in reverse order and the error is returned.
  */
 static int
 mlx5e_open_tx_cqs(struct mlx5e_channel *c,
     struct mlx5e_channel_param *cparam)
 {
 	int err;
 	int tc;
 
 	for (tc = 0; tc < c->priv->num_tc; tc++) {
 		/* open completion queue */
 		err = mlx5e_open_cq(c->priv, &cparam->tx_cq, &c->sq[tc].cq,
 		    &mlx5e_tx_cq_comp, c->ix);
 		if (!err)
 			continue;
 
 		/* roll back the traffic classes that were already opened */
 		while (tc-- > 0)
 			mlx5e_close_cq(&c->sq[tc].cq);
 
 		return (err);
 	}
 	return (0);
 }
 
 /* Close the transmit completion queue of every traffic class. */
 static void
 mlx5e_close_tx_cqs(struct mlx5e_channel *c)
 {
 	int tc;
 
 	for (tc = 0; tc < c->priv->num_tc; tc++)
 		mlx5e_close_cq(&c->sq[tc].cq);
 }
 
 /*
  * Open one send queue per traffic class of the channel.  If one of
  * them fails to open, the queues opened before it are closed again in
  * reverse order and the error is returned.
  */
 static int
 mlx5e_open_sqs(struct mlx5e_channel *c,
     struct mlx5e_channel_param *cparam)
 {
 	int err;
 	int tc;
 
 	for (tc = 0; tc < c->priv->num_tc; tc++) {
 		err = mlx5e_open_sq(c, tc, &cparam->sq, &c->sq[tc]);
 		if (!err)
 			continue;
 
 		/* roll back the traffic classes that were already opened */
 		while (tc-- > 0)
 			mlx5e_close_sq_wait(&c->sq[tc]);
 
 		return (err);
 	}
 
 	return (0);
 }
 
 /* Drain, disable and destroy the send queue of every traffic class. */
 static void
 mlx5e_close_sqs_wait(struct mlx5e_channel *c)
 {
 	int tc;
 
 	for (tc = 0; tc < c->priv->num_tc; tc++)
 		mlx5e_close_sq_wait(&c->sq[tc]);
 }
 
 /*
  * One-time initialisation of the persistent parts of a channel: the
  * back pointer and index, the send tag, the completion, the RX mutex
  * and watchdog callout, and for every possible traffic class the two
  * SQ mutexes and the completion event callout.  The counterpart is
  * mlx5e_chan_static_destroy().
  */
 static void
 mlx5e_chan_static_init(struct mlx5e_priv *priv, struct mlx5e_channel *c, int ix)
 {
 	int tc;
 
 	/* setup priv and channel number */
 	c->priv = priv;
 	c->ix = ix;
 
 	/* setup send tag */
-	m_snd_tag_init(&c->tag, c->priv->ifp, IF_SND_TAG_TYPE_UNLIMITED);
+	m_snd_tag_init(&c->tag, c->priv->ifp, &mlx5e_ul_snd_tag_sw);
 
 	init_completion(&c->completion);
 
 	mtx_init(&c->rq.mtx, "mlx5rx", MTX_NETWORK_LOCK, MTX_DEF);
 
 	/* the watchdog callout runs with the RX mutex held */
 	callout_init_mtx(&c->rq.watchdog, &c->rq.mtx, 0);
 
 	/* all MLX5E_MAX_TX_NUM_TC slots are set up, not just num_tc */
 	for (tc = 0; tc != MLX5E_MAX_TX_NUM_TC; tc++) {
 		struct mlx5e_sq *sq = c->sq + tc;
 
 		mtx_init(&sq->lock, "mlx5tx",
 		    MTX_NETWORK_LOCK " TX", MTX_DEF);
 		mtx_init(&sq->comp_lock, "mlx5comp",
 		    MTX_NETWORK_LOCK " TX", MTX_DEF);
 
 		callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
 	}
 }
 
 /*
  * Release the reference on the channel's send tag and block until
  * the channel's completion is signalled.
  */
 static void
 mlx5e_chan_wait_for_completion(struct mlx5e_channel *c)
 {
 
 	m_snd_tag_rele(&c->tag);
 	wait_for_completion(&c->completion);
 }
 
 /*
  * Wait for the completion of the first "channels" channels of the
  * interface, one after the other in ascending order.
  */
 static void
 mlx5e_priv_wait_for_completion(struct mlx5e_priv *priv, const uint32_t channels)
 {
 	uint32_t ch = 0;
 
 	while (ch != channels) {
 		mlx5e_chan_wait_for_completion(&priv->channel[ch]);
 		ch++;
 	}
 }
 
 /*
  * Undo mlx5e_chan_static_init(): drain the RX watchdog callout and
  * destroy the RX mutex, then for every possible traffic class drain
  * the completion event callout and destroy both SQ mutexes.  Each
  * callout is drained before the mutex it was initialised with is
  * destroyed.
  */
 static void
 mlx5e_chan_static_destroy(struct mlx5e_channel *c)
 {
 	int tc;
 
 	callout_drain(&c->rq.watchdog);
 
 	mtx_destroy(&c->rq.mtx);
 
 	for (tc = 0; tc != MLX5E_MAX_TX_NUM_TC; tc++) {
 		callout_drain(&c->sq[tc].cev_callout);
 		mtx_destroy(&c->sq[tc].lock);
 		mtx_destroy(&c->sq[tc].comp_lock);
 	}
 }
 
 /*
  * Open one channel: the TX completion queues, the RX completion
  * queue, the send queues and the receive queue, in that order.  After
  * a successful open the RX completion handler is invoked once inside
  * the network epoch.
  *
  * Returns zero on success.  On failure everything opened so far is
  * closed again in reverse order and the error is returned.
  */
 static int
 mlx5e_open_channel(struct mlx5e_priv *priv,
     struct mlx5e_channel_param *cparam,
     struct mlx5e_channel *c)
 {
 	struct epoch_tracker et;
 	int i, err;
 
 	/* zero non-persistent data */
 	MLX5E_ZERO(&c->rq, mlx5e_rq_zero_start);
 	for (i = 0; i != priv->num_tc; i++)
 		MLX5E_ZERO(&c->sq[i], mlx5e_sq_zero_start);
 
 	/* open transmit completion queue */
 	err = mlx5e_open_tx_cqs(c, cparam);
 	if (err)
 		goto err_free;
 
 	/* open receive completion queue */
 	err = mlx5e_open_cq(c->priv, &cparam->rx_cq, &c->rq.cq,
 	    &mlx5e_rx_cq_comp, c->ix);
 	if (err)
 		goto err_close_tx_cqs;
 
 	err = mlx5e_open_sqs(c, cparam);
 	if (err)
 		goto err_close_rx_cq;
 
 	err = mlx5e_open_rq(c, &cparam->rq, &c->rq);
 	if (err)
 		goto err_close_sqs;
 
 	/* poll receive queue initially */
 	NET_EPOCH_ENTER(et);
 	c->rq.cq.mcq.comp(&c->rq.cq.mcq, NULL);
 	NET_EPOCH_EXIT(et);
 
 	return (0);
 
 err_close_sqs:
 	mlx5e_close_sqs_wait(c);
 
 err_close_rx_cq:
 	mlx5e_close_cq(&c->rq.cq);
 
 err_close_tx_cqs:
 	mlx5e_close_tx_cqs(c);
 
 err_free:
 	return (err);
 }
 
 /*
  * First stage of closing a channel: only the receive queue is
  * stopped here, see mlx5e_close_rq().  The remaining teardown is done
  * by mlx5e_close_channel_wait().
  */
 static void
 mlx5e_close_channel(struct mlx5e_channel *c)
 {
 	mlx5e_close_rq(&c->rq);
 }
 
 /*
  * Second stage of closing a channel: tear down the receive queue
  * (which also closes the RX completion queue), then the send queues
  * and finally the TX completion queues.
  */
 static void
 mlx5e_close_channel_wait(struct mlx5e_channel *c)
 {
 	mlx5e_close_rq_wait(&c->rq);
 	mlx5e_close_sqs_wait(c);
 	mlx5e_close_tx_cqs(c);
 }
 
 /*
  * Compute the receive buffer size and the number of data segments
  * per receive WQE.
  *
  * The required length is the LRO WQE size when hardware LRO is
  * enabled, else it is derived from the interface MTU.  It is rounded
  * up to one of the mbuf cluster sizes, and the segment count is
  * raised until "count + 1" is a power of two.
  *
  * Returns zero and stores the results through "wqe_sz" and "nsegs",
  * or -ENOMEM when the length exceeds MJUM16BYTES or the segment count
  * exceeds MLX5E_MAX_BUSDMA_RX_SEGS; the outputs are left untouched on
  * failure.
  */
 static int
 mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs)
 {
 	u32 len, segs;
 
 	if (priv->params.hw_lro_en)
 		len = priv->params.lro_wqe_sz;
 	else
 		len = MLX5E_SW2MB_MTU(priv->ifp->if_mtu);
 
 	if (len > MJUM16BYTES)
 		return (-ENOMEM);
 
 	/* round up to the smallest cluster size that can hold "len" */
 	if (len > MJUM9BYTES) {
 		len = MJUM16BYTES;
 	} else if (len > MJUMPAGESIZE) {
 		len = MJUM9BYTES;
 	} else if (len > MCLBYTES) {
 		len = MJUMPAGESIZE;
 	} else {
 		len = MCLBYTES;
 	}
 
 	/*
 	 * segs + 1 must be a power of two, because stride size must be.
 	 * Stride size is 16 * (segs + 1), as the first segment is
 	 * control.
 	 */
 	segs = howmany(len, MLX5E_MAX_RX_BYTES);
 	while (!powerof2(segs + 1))
 		segs++;
 
 	if (segs > MLX5E_MAX_BUSDMA_RX_SEGS)
 		return (-ENOMEM);
 
 	*wqe_sz = len;
 	*nsegs = segs;
 	return (0);
 }
 
 /*
  * Fill in the work queue part of the RQ context template: a
  * linked-list queue with aligned end padding, a stride derived from
  * the number of data segments per WQE, the configured queue size and
  * the protection domain.
  */
 static void
 mlx5e_build_rq_param(struct mlx5e_priv *priv,
     struct mlx5e_rq_param *param)
 {
 	void *rqc = param->rqc;
 	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
 	u32 wqe_sz = 0;
 	u32 nsegs = 0;
 
 	/*
 	 * This function cannot report errors.  mlx5e_get_wqe_sz()
 	 * leaves its outputs untouched on failure, so they are
 	 * pre-initialised above to keep the stride computation below
 	 * from reading indeterminate values.  mlx5e_create_rq()
 	 * performs the same query and does fail on error.
 	 */
 	(void)mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
 	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
 	MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) +
 	    nsegs * sizeof(struct mlx5_wqe_data_seg)));
 	MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size);
 	MLX5_SET(wq, wq, pd, priv->pdn);
 
 	param->wq.linear = 1;
 }
 
 /*
  * Fill in the work queue part of the SQ context template: the
  * configured queue size, a stride of one send WQE basic block and the
  * protection domain.
  */
 static void
 mlx5e_build_sq_param(struct mlx5e_priv *priv,
     struct mlx5e_sq_param *param)
 {
 	void *sqc = param->sqc;
 	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
 
 	MLX5_SET(wq, wq, log_wq_sz, priv->params.log_sq_size);
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
 	MLX5_SET(wq, wq, pd, priv->pdn);
 
 	param->wq.linear = 1;
 }
 
 /*
  * Settings shared by RX and TX completion queues: the UAR page index
  * of the core device.
  */
 static void
 mlx5e_build_common_cq_param(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param)
 {
 	void *cqc = param->cqc;
 
 	MLX5_SET(cqc, cqc, uar_page, priv->mdev->priv.uar->index);
 }
 
 /*
  * Fetch the default DIM moderation profile for the given period mode
  * into "*ptr".  With hardware LRO enabled the packet count of the
  * profile is capped at MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO.
  */
 static void
 mlx5e_get_default_profile(struct mlx5e_priv *priv, int mode, struct net_dim_cq_moder *ptr)
 {
 
 	*ptr = net_dim_get_profile(mode, MLX5E_DIM_DEFAULT_PROFILE);
 
 	/* apply LRO restrictions */
 	if (priv->params.hw_lro_en &&
 	    ptr->pkts > MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO) {
 		ptr->pkts = MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO;
 	}
 }
 
 /*
  * Fill in the RX completion queue context template: optional CQE
  * compression, the queue size and the interrupt moderation settings
  * selected by priv->params.rx_cq_moderation_mode:
  *
  *   0 - configured usec/pkts, period starts from EQE
  *   1 - configured usec/pkts, period starts from CQE when supported
  *   2 - default DIM profile, period starts from EQE
  *   3 - default DIM profile, period starts from CQE when supported
  *
  * Any other mode leaves the moderation fields untouched.
  */
 static void
 mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param)
 {
 	struct net_dim_cq_moder curr;
 	void *cqc = param->cqc;
 
 	/*
 	 * We use MLX5_CQE_FORMAT_HASH because the RX hash mini CQE
 	 * format is more beneficial for FreeBSD use case.
 	 *
 	 * Adding support for MLX5_CQE_FORMAT_CSUM will require changes
 	 * in mlx5e_decompress_cqe.
 	 */
 	if (priv->params.cqe_zipping_en) {
 		MLX5_SET(cqc, cqc, mini_cqe_res_format, MLX5_CQE_FORMAT_HASH);
 		MLX5_SET(cqc, cqc, cqe_compression_en, 1);
 	}
 
 	MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_rq_size);
 
 	switch (priv->params.rx_cq_moderation_mode) {
 	case 0:
 		MLX5_SET(cqc, cqc, cq_period, priv->params.rx_cq_moderation_usec);
 		MLX5_SET(cqc, cqc, cq_max_count, priv->params.rx_cq_moderation_pkts);
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	case 1:
 		MLX5_SET(cqc, cqc, cq_period, priv->params.rx_cq_moderation_usec);
 		MLX5_SET(cqc, cqc, cq_max_count, priv->params.rx_cq_moderation_pkts);
 		/* fall back to EQE based periods when the device lacks the capability */
 		if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe))
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
 		else
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	case 2:
 		mlx5e_get_default_profile(priv, NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE, &curr);
 		MLX5_SET(cqc, cqc, cq_period, curr.usec);
 		MLX5_SET(cqc, cqc, cq_max_count, curr.pkts);
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	case 3:
 		mlx5e_get_default_profile(priv, NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE, &curr);
 		MLX5_SET(cqc, cqc, cq_period, curr.usec);
 		MLX5_SET(cqc, cqc, cq_max_count, curr.pkts);
 		/* fall back to EQE based periods when the device lacks the capability */
 		if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe))
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
 		else
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	default:
 		break;
 	}
 
 	mlx5e_dim_build_cq_param(priv, param);
 
 	mlx5e_build_common_cq_param(priv, param);
 }
 
 /*
  * Fill in the TX completion queue context template: the queue size,
  * the configured moderation period and packet count, and the period
  * mode.  Moderation mode 0 always counts the period from the EQE;
  * any other mode counts from the CQE when the device supports it and
  * from the EQE otherwise.
  */
 static void
 mlx5e_build_tx_cq_param(struct mlx5e_priv *priv,
     struct mlx5e_cq_param *param)
 {
 	void *cqc = param->cqc;
 
 	MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_sq_size);
 	MLX5_SET(cqc, cqc, cq_period, priv->params.tx_cq_moderation_usec);
 	MLX5_SET(cqc, cqc, cq_max_count, priv->params.tx_cq_moderation_pkts);
 
 	if (priv->params.tx_cq_moderation_mode != 0 &&
 	    MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) {
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
 	} else {
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 	}
 
 	mlx5e_build_common_cq_param(priv, param);
 }
 
 /*
  * Build the complete parameter set for one channel: the receive
  * queue, the send queue and their two completion queues.
  */
 static void
 mlx5e_build_channel_param(struct mlx5e_priv *priv,
     struct mlx5e_channel_param *cparam)
 {
 	/* start from an all-zero parameter block */
 	bzero(cparam, sizeof(*cparam));
 
 	/* queues */
 	mlx5e_build_rq_param(priv, &cparam->rq);
 	mlx5e_build_sq_param(priv, &cparam->sq);
 
 	/* completion queues */
 	mlx5e_build_rx_cq_param(priv, &cparam->rx_cq);
 	mlx5e_build_tx_cq_param(priv, &cparam->tx_cq);
 }
 
 static int
 mlx5e_open_channels(struct mlx5e_priv *priv)
 {
 	struct mlx5e_channel_param *cparam;
 	int err;
 	int i;
 	int j;
 
 	cparam = malloc(sizeof(*cparam), M_MLX5EN, M_WAITOK);
 
 	mlx5e_build_channel_param(priv, cparam);
 	for (i = 0; i < priv->params.num_channels; i++) {
 		err = mlx5e_open_channel(priv, cparam, &priv->channel[i]);
 		if (err)
 			goto err_close_channels;
 
 		/* Bind interrupt vectors, if any. */
 		if (priv->params_ethtool.irq_cpu_base > -1) {
 			cpuset_t cpuset;
 			int cpu;
 			int irq;
 			int eqn;
 			int nirq;
 
 			err = mlx5_vector2eqn(priv->mdev, i,
 			    &eqn, &nirq);
 
 			/* error here is non-fatal */
 			if (err != 0)
 				continue;
 
 			irq = priv->mdev->priv.msix_arr[nirq].vector;
 			cpu = (unsigned)(priv->params_ethtool.irq_cpu_base +
 			    i * priv->params_ethtool.irq_cpu_stride) % (unsigned)mp_ncpus;
 
 			CPU_ZERO(&cpuset);
 			CPU_SET(cpu, &cpuset);
 			intr_setaffinity(irq, CPU_WHICH_INTRHANDLER, &cpuset);
 		}
 	}
 
 	for (j = 0; j < priv->params.num_channels; j++) {
 		err = mlx5e_wait_for_min_rx_wqes(&priv->channel[j].rq);
 		if (err)
 			goto err_close_channels;
 	}
 	free(cparam, M_MLX5EN);
 	return (0);
 
 err_close_channels:
 	while (i--) {
 		mlx5e_close_channel(&priv->channel[i]);
 		mlx5e_close_channel_wait(&priv->channel[i]);
 	}
 	free(cparam, M_MLX5EN);
 	return (err);
 }
 
 /*
  * Close all channels in two passes: first request the close on
  * every channel, then wait for each of them to finish.
  */
 static void
 mlx5e_close_channels(struct mlx5e_priv *priv)
 {
 	int ch;
 
 	ch = 0;
 	while (ch < priv->params.num_channels) {
 		mlx5e_close_channel(priv->channel + ch);
 		ch++;
 	}
 
 	ch = 0;
 	while (ch < priv->params.num_channels) {
 		mlx5e_close_channel_wait(priv->channel + ch);
 		ch++;
 	}
 }
 
 /*
  * Apply the current TX completion moderation settings to the
  * completion queue of the given send queue.
  *
  * When the device supports modifying the CQ period mode, the mode is
  * updated as well: moderation modes 0 and 2 select "start from EQE",
  * all other values select "start from CQE".
  *
  * Returns the result of the firmware modify command.
  */
 static int
 mlx5e_refresh_sq_params(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
 {
 	uint8_t period_mode;
 
 	/* without mode-modify support only period and count are updated */
 	if (!MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) {
 		return (mlx5_core_modify_cq_moderation(priv->mdev,
 		    &sq->cq.mcq,
 		    priv->params.tx_cq_moderation_usec,
 		    priv->params.tx_cq_moderation_pkts));
 	}
 
 	if (priv->params.tx_cq_moderation_mode == 0 ||
 	    priv->params.tx_cq_moderation_mode == 2)
 		period_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
 	else
 		period_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE;
 
 	return (mlx5_core_modify_cq_moderation_mode(priv->mdev,
 	    &sq->cq.mcq,
 	    priv->params.tx_cq_moderation_usec,
 	    priv->params.tx_cq_moderation_pkts,
 	    period_mode));
 }
 
 static int
 mlx5e_refresh_rq_params(struct mlx5e_priv *priv, struct mlx5e_rq *rq)
 {
 
 	if (MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) {
 		uint8_t cq_mode;
 		uint8_t dim_mode;
 		int retval;
 
 		switch (priv->params.rx_cq_moderation_mode) {
 		case 0:
 		case 2:
 			cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
 			dim_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
 			break;
 		default:
 			cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE;
 			dim_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE;
 			break;
 		}
 
 		/* tear down dynamic interrupt moderation */
 		mtx_lock(&rq->mtx);
 		rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED;
 		mtx_unlock(&rq->mtx);
 
 		/* wait for dynamic interrupt moderation work task, if any */
 		cancel_work_sync(&rq->dim.work);
 
 		if (priv->params.rx_cq_moderation_mode >= 2) {
 			struct net_dim_cq_moder curr;
 
 			mlx5e_get_default_profile(priv, dim_mode, &curr);
 
 			retval = mlx5_core_modify_cq_moderation_mode(priv->mdev, &rq->cq.mcq,
 			    curr.usec, curr.pkts, cq_mode);
 
 			/* set dynamic interrupt moderation mode and zero defaults */
 			mtx_lock(&rq->mtx);
 			rq->dim.mode = dim_mode;
 			rq->dim.state = 0;
 			rq->dim.profile_ix = MLX5E_DIM_DEFAULT_PROFILE;
 			mtx_unlock(&rq->mtx);
 		} else {
 			retval = mlx5_core_modify_cq_moderation_mode(priv->mdev, &rq->cq.mcq,
 			    priv->params.rx_cq_moderation_usec,
 			    priv->params.rx_cq_moderation_pkts,
 			    cq_mode);
 		}
 		return (retval);
 	}
 
 	return (mlx5_core_modify_cq_moderation(priv->mdev, &rq->cq.mcq,
 	    priv->params.rx_cq_moderation_usec,
 	    priv->params.rx_cq_moderation_pkts));
 }
 
 static int
 mlx5e_refresh_channel_params_sub(struct mlx5e_priv *priv, struct mlx5e_channel *c)
 {
 	int err;
 	int i;
 
 	err = mlx5e_refresh_rq_params(priv, &c->rq);
 	if (err)
 		goto done;
 
 	for (i = 0; i != priv->num_tc; i++) {
 		err = mlx5e_refresh_sq_params(priv, &c->sq[i]);
 		if (err)
 			goto done;
 	}
 done:
 	return (err);
 }
 
 int
 mlx5e_refresh_channel_params(struct mlx5e_priv *priv)
 {
 	int i;
 
 	/* check if channels are closed */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0)
 		return (EINVAL);
 
 	for (i = 0; i < priv->params.num_channels; i++) {
 		int err;
 
 		err = mlx5e_refresh_channel_params_sub(priv, &priv->channel[i]);
 		if (err)
 			return (err);
 	}
 	return (0);
 }
 
 static int
 mlx5e_open_tis(struct mlx5e_priv *priv, int tc)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
 	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
 
 	memset(in, 0, sizeof(in));
 
 	MLX5_SET(tisc, tisc, prio, tc);
 	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);
 
 	return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->tisn[tc]));
 }
 
 /* Destroy the TIS that was created for traffic class "tc". */
 static void
 mlx5e_close_tis(struct mlx5e_priv *priv, int tc)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 
 	mlx5_core_destroy_tis(mdev, priv->tisn[tc], 0);
 }
 
 static int
 mlx5e_open_tises(struct mlx5e_priv *priv)
 {
 	int num_tc = priv->num_tc;
 	int err;
 	int tc;
 
 	for (tc = 0; tc < num_tc; tc++) {
 		err = mlx5e_open_tis(priv, tc);
 		if (err)
 			goto err_close_tises;
 	}
 
 	return (0);
 
 err_close_tises:
 	for (tc--; tc >= 0; tc--)
 		mlx5e_close_tis(priv, tc);
 
 	return (err);
 }
 
 /* Destroy the TIS of every traffic class. */
 static void
 mlx5e_close_tises(struct mlx5e_priv *priv)
 {
 	const int count = priv->num_tc;
 	int tc = 0;
 
 	while (tc < count) {
 		mlx5e_close_tis(priv, tc);
 		tc++;
 	}
 }
 
 static int
 mlx5e_open_rqt(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 *in;
 	u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {0};
 	void *rqtc;
 	int inlen;
 	int err;
 	int sz;
 	int i;
 
 	sz = 1 << priv->params.rx_hash_log_tbl_sz;
 
 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz;
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
 
 	MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
 	MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
 
 	for (i = 0; i < sz; i++) {
 		int ix = i;
 #ifdef RSS
 		ix = rss_get_indirection_to_bucket(ix);
 #endif
 		/* ensure we don't overflow */
 		ix %= priv->params.num_channels;
 
 		/* apply receive side scaling stride, if any */
 		ix -= ix % (int)priv->params.channels_rsss;
 
 		MLX5_SET(rqtc, rqtc, rq_num[i], priv->channel[ix].rq.rqn);
 	}
 
 	MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT);
 
 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
 	if (!err)
 		priv->rqtn = MLX5_GET(create_rqt_out, out, rqtn);
 
 	kvfree(in);
 
 	return (err);
 }
 
 /*
  * Destroy the receive queue table identified by priv->rqtn.  The
  * result of the firmware command is not checked.
  */
 static void
 mlx5e_close_rqt(struct mlx5e_priv *priv)
 {
 	u32 reply[MLX5_ST_SZ_DW(destroy_rqt_out)] = {0};
 	u32 cmd[MLX5_ST_SZ_DW(destroy_rqt_in)] = {0};
 
 	MLX5_SET(destroy_rqt_in, cmd, rqtn, priv->rqtn);
 	MLX5_SET(destroy_rqt_in, cmd, opcode, MLX5_CMD_OP_DESTROY_RQT);
 
 	(void)mlx5_cmd_exec(priv->mdev, cmd, sizeof(cmd), reply,
 	    sizeof(reply));
 }
 
 #define	MLX5E_RSS_KEY_SIZE (10 * 4)	/* bytes */
 
 /*
  * Store the RSS hash key at "key_ptr".
  *
  * With RSS compiled in, the key is obtained through rss_getkey().
  * Otherwise a fixed, built-in key of MLX5E_RSS_KEY_SIZE bytes is
  * copied, so the destination must have room for that many bytes.
  */
 static void
 mlx5e_get_rss_key(void *key_ptr)
 {
 #ifndef RSS
 	static const u32 fallback_key[] = {
 	    cpu_to_be32(0xD181C62C),
 	    cpu_to_be32(0xF7F4DB5B),
 	    cpu_to_be32(0x1983A2FC),
 	    cpu_to_be32(0x943E1ADB),
 	    cpu_to_be32(0xD9389E6B),
 	    cpu_to_be32(0xD1039C2C),
 	    cpu_to_be32(0xA74499AD),
 	    cpu_to_be32(0x593D56D9),
 	    cpu_to_be32(0xF3253C06),
 	    cpu_to_be32(0x2ADC1FFC),
 	};
 
 	/* the table length and the advertised key size must agree */
 	CTASSERT(sizeof(fallback_key) == MLX5E_RSS_KEY_SIZE);
 	memcpy(key_ptr, fallback_key, sizeof(fallback_key));
 #else
 	rss_getkey(key_ptr);
 #endif
 }
 
 /*
  * Fill in the TIR context "tirc" for the traffic type "tt".
  *
  * MLX5E_TT_ANY is dispatched directly to the receive queue of
  * channel 0.  Every other traffic type is dispatched indirectly
  * through the receive queue table priv->rqtn using a Toeplitz hash,
  * and the hash field selector is programmed according to "tt".
  *
  * When "inner_vxlan" is true, the inner hash field selector is
  * written instead of the outer one and tunneled offload is enabled.
  */
 static void
 mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 * tirc, int tt, bool inner_vxlan)
 {
 	void *hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
 	void *hfsi = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner);
 	/* selector that the second switch statement below writes to */
 	void *hfs = inner_vxlan ? hfsi : hfso;
 	__be32 *hkey;
 
 	MLX5_SET(tirc, tirc, transport_domain, priv->tdn);
 
 	/* subtracted from the LRO WQE size when computing lro_max_msg_sz */
 #define	ROUGH_MAX_L2_L3_HDR_SZ 256
 
 	/* hash on the IP addresses only */
 #define	MLX5_HASH_IP     (MLX5_HASH_FIELD_SEL_SRC_IP   |\
 			  MLX5_HASH_FIELD_SEL_DST_IP)
 
 	/* hash on the IP addresses and the L4 ports */
 #define	MLX5_HASH_ALL    (MLX5_HASH_FIELD_SEL_SRC_IP   |\
 			  MLX5_HASH_FIELD_SEL_DST_IP   |\
 			  MLX5_HASH_FIELD_SEL_L4_SPORT |\
 			  MLX5_HASH_FIELD_SEL_L4_DPORT)
 
 	/* hash on the IP addresses and the IPsec SPI */
 #define	MLX5_HASH_IP_IPSEC_SPI	(MLX5_HASH_FIELD_SEL_SRC_IP   |\
 				 MLX5_HASH_FIELD_SEL_DST_IP   |\
 				 MLX5_HASH_FIELD_SEL_IPSEC_SPI)
 
 	/*
 	 * Hardware LRO: enable it for IPv4 and IPv6, derive the maximum
 	 * message size from the LRO WQE size and take the timeout from
 	 * entry 2 of the supported timer periods.
 	 */
 	if (priv->params.hw_lro_en) {
 		MLX5_SET(tirc, tirc, lro_enable_mask,
 		    MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO |
 		    MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO);
 		MLX5_SET(tirc, tirc, lro_max_msg_sz,
 		    (priv->params.lro_wqe_sz -
 		    ROUGH_MAX_L2_L3_HDR_SZ) >> 8);
 		/* TODO: add the option to choose timer value dynamically */
 		MLX5_SET(tirc, tirc, lro_timeout_period_usecs,
 		    MLX5_CAP_ETH(priv->mdev,
 		    lro_timer_supported_periods[2]));
 	}
 
 	if (inner_vxlan)
 		MLX5_SET(tirc, tirc, tunneled_offload_en, 1);
 
 	/* setup parameters for hashing TIR type, if any */
 	switch (tt) {
 	case MLX5E_TT_ANY:
 		/* no hashing: everything goes to the RQ of channel 0 */
 		MLX5_SET(tirc, tirc, disp_type,
 		    MLX5_TIRC_DISP_TYPE_DIRECT);
 		MLX5_SET(tirc, tirc, inline_rqn,
 		    priv->channel[0].rq.rqn);
 		break;
 	default:
 		/* spread over the RQ table using a Toeplitz hash */
 		MLX5_SET(tirc, tirc, disp_type,
 		    MLX5_TIRC_DISP_TYPE_INDIRECT);
 		MLX5_SET(tirc, tirc, indirect_table,
 		    priv->rqtn);
 		MLX5_SET(tirc, tirc, rx_hash_fn,
 		    MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ);
 		hkey = (__be32 *) MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
 
 		/* the key field of the context must be able to hold the key */
 		CTASSERT(MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key) >=
 		    MLX5E_RSS_KEY_SIZE);
 #ifdef RSS
 		/*
 		 * The FreeBSD RSS implementation does currently not
 		 * support symmetric Toeplitz hashes:
 		 */
 		MLX5_SET(tirc, tirc, rx_hash_symmetric, 0);
 #else
 		MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
 #endif
 		mlx5e_get_rss_key(hkey);
 		break;
 	}
 
 	/*
 	 * Program the hash field selector for the traffic type.
 	 *
 	 * For the TCP and UDP cases below: when RSS is compiled in, the
 	 * "else" inside the #ifdef binds to the MLX5_SET() that follows
 	 * the #endif, so the L4 ports are only hashed when the RSS hash
 	 * configuration has the matching hash type set; otherwise only
 	 * the IP addresses are hashed.  Without RSS the ports are
 	 * always included.
 	 */
 	switch (tt) {
 	case MLX5E_TT_IPV4_TCP:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfs, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_TCP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV4)) {
 			MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	case MLX5E_TT_IPV6_TCP:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfs, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_TCP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV6)) {
 			MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	case MLX5E_TT_IPV4_UDP:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfs, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_UDP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV4)) {
 			MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	case MLX5E_TT_IPV6_UDP:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfs, l4_prot_type,
 		    MLX5_L4_PROT_TYPE_UDP);
 #ifdef RSS
 		if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV6)) {
 			MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 			    MLX5_HASH_IP);
 		} else
 #endif
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_ALL);
 		break;
 
 	/* IPsec traffic types hash on the IP addresses and the SPI */
 	case MLX5E_TT_IPV4_IPSEC_AH:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	case MLX5E_TT_IPV6_IPSEC_AH:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	case MLX5E_TT_IPV4_IPSEC_ESP:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	case MLX5E_TT_IPV6_IPSEC_ESP:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_IP_IPSEC_SPI);
 		break;
 
 	/* plain IP traffic types hash on the IP addresses only */
 	case MLX5E_TT_IPV4:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV4);
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_IP);
 		break;
 
 	case MLX5E_TT_IPV6:
 		MLX5_SET(rx_hash_field_select, hfs, l3_prot_type,
 		    MLX5_L3_PROT_TYPE_IPV6);
 		MLX5_SET(rx_hash_field_select, hfs, selected_fields,
 		    MLX5_HASH_IP);
 		break;
 
 	/* remaining types, including MLX5E_TT_ANY, set no hash fields */
 	default:
 		break;
 	}
 }
 
 /*
  * Create the TIR for traffic type "tt".
  *
  * The TIR number is stored in priv->tirn_inner_vxlan[tt] when
  * "inner_vxlan" is true and in priv->tirn[tt] otherwise.  Returns
  * -ENOMEM when the command buffer cannot be allocated, else the
  * result of the create command.
  */
 static int
 mlx5e_open_tir(struct mlx5e_priv *priv, int tt, bool inner_vxlan)
 {
 	const int cmdlen = MLX5_ST_SZ_BYTES(create_tir_in);
 	u32 *cmd;
 	void *ctx;
 	int error;
 
 	cmd = mlx5_vzalloc(cmdlen);
 	if (cmd == NULL)
 		return (-ENOMEM);
 
 	ctx = MLX5_ADDR_OF(create_tir_in, cmd, tir_context);
 	mlx5e_build_tir_ctx(priv, ctx, tt, inner_vxlan);
 
 	if (inner_vxlan) {
 		error = mlx5_core_create_tir(priv->mdev, cmd, cmdlen,
 		    &priv->tirn_inner_vxlan[tt]);
 	} else {
 		error = mlx5_core_create_tir(priv->mdev, cmd, cmdlen,
 		    &priv->tirn[tt]);
 	}
 
 	kvfree(cmd);
 
 	return (error);
 }
 
 /*
  * Destroy the TIR of traffic type "tt", taken from the inner VXLAN
  * set when "inner_vxlan" is true and from the regular set otherwise.
  */
 static void
 mlx5e_close_tir(struct mlx5e_priv *priv, int tt, bool inner_vxlan)
 {
 	if (inner_vxlan) {
 		mlx5_core_destroy_tir(priv->mdev,
 		    priv->tirn_inner_vxlan[tt], 0);
 	} else {
 		mlx5_core_destroy_tir(priv->mdev, priv->tirn[tt], 0);
 	}
 }
 
 /*
  * Create one TIR per traffic type.  If any creation fails, the TIRs
  * created so far are destroyed again, newest first, and the error
  * code is returned.  Returns zero on success.
  */
 static int
 mlx5e_open_tirs(struct mlx5e_priv *priv, bool inner_vxlan)
 {
 	int error;
 	int tt;
 
 	tt = 0;
 	while (tt < MLX5E_NUM_TT) {
 		error = mlx5e_open_tir(priv, tt, inner_vxlan);
 		if (error != 0) {
 			/* roll back everything below the failing index */
 			while (tt-- > 0)
 				mlx5e_close_tir(priv, tt, inner_vxlan);
 			return (error);
 		}
 		tt++;
 	}
 	return (0);
 }
 
 /* Destroy the TIR of every traffic type in the selected set. */
 static void
 mlx5e_close_tirs(struct mlx5e_priv *priv, bool inner_vxlan)
 {
 	int tt = 0;
 
 	while (tt < MLX5E_NUM_TT) {
 		mlx5e_close_tir(priv, tt, inner_vxlan);
 		tt++;
 	}
 }
 
 /*
  * Program a new MTU into the port and the vport context and record
  * the resulting hardware MTU.
  *
  * SW MTU does not include headers,
  * HW MTU includes all headers and checksums.
  *
  * "sw_mtu" is converted with MLX5E_SW2HW_MTU() before being written.
  * A failure to set the port MTU is returned right away.  A failure to
  * update the vport context is only logged.  ifp->if_mtu is then set
  * to "sw_mtu" and the effective MTU is read back, from the vport
  * first and from the port operational MTU as a fallback.
  *
  * Returns zero on success, the error of the failing mlx5 call,
  * -E2BIG when the MTU read back is smaller than requested or -EINVAL
  * when it is bigger.  In the two mismatch cases the MTU that was read
  * back is still stored in priv->params_ethtool.
  */
 static int
 mlx5e_set_dev_port_mtu(struct ifnet *ifp, int sw_mtu)
 {
 	struct mlx5e_priv *priv = ifp->if_softc;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	int hw_mtu;
 	int err;
 
 	hw_mtu = MLX5E_SW2HW_MTU(sw_mtu);
 
 	err = mlx5_set_port_mtu(mdev, hw_mtu);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5_set_port_mtu failed setting %d, err=%d\n",
 		    sw_mtu, err);
 		return (err);
 	}
 
 	/* Update vport context MTU; a failure here is not fatal */
 	err = mlx5_set_vport_mtu(mdev, hw_mtu);
 	if (err) {
 		mlx5_en_err(ifp,
 		    "Failed updating vport context with MTU size, err=%d\n",
 		    err);
 	}
 
 	ifp->if_mtu = sw_mtu;
 
 	/* read back what the hardware actually uses */
 	err = mlx5_query_vport_mtu(mdev, &hw_mtu);
 	if (err || !hw_mtu) {
 		/* fallback to port oper mtu */
 		err = mlx5_query_port_oper_mtu(mdev, &hw_mtu);
 	}
 	if (err) {
 		mlx5_en_err(ifp,
 		    "Query port MTU, after setting new MTU value, failed\n");
 		return (err);
 	} else if (MLX5E_HW2SW_MTU(hw_mtu) < sw_mtu) {
 		err = -E2BIG;
 		mlx5_en_err(ifp,
 		    "Port MTU %d is smaller than ifp mtu %d\n",
 		    hw_mtu, sw_mtu);
 	} else if (MLX5E_HW2SW_MTU(hw_mtu) > sw_mtu) {
 		err = -EINVAL;
 		mlx5_en_err(ifp,
 		    "Port MTU %d is bigger than ifp mtu %d\n",
 		    hw_mtu, sw_mtu);
 	}
 	priv->params_ethtool.hw_mtu = hw_mtu;
 
 	/* compute MSB: clear the lowest set bit until one bit remains */
 	while (hw_mtu & (hw_mtu - 1))
 		hw_mtu &= (hw_mtu - 1);
 	priv->params_ethtool.hw_mtu_msb = hw_mtu;
 
 	return (err);
 }
 
 int
 mlx5e_open_locked(struct ifnet *ifp)
 {
 	struct mlx5e_priv *priv = ifp->if_softc;
 	int err;
 	u16 set_id;
 
 	/* check if already opened */
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0)
 		return (0);
 
 #ifdef RSS
 	if (rss_getnumbuckets() > priv->params.num_channels) {
 		mlx5_en_info(ifp,
 		    "NOTE: There are more RSS buckets(%u) than channels(%u) available\n",
 		    rss_getnumbuckets(), priv->params.num_channels);
 	}
 #endif
 	err = mlx5e_open_tises(priv);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5e_open_tises failed, %d\n", err);
 		return (err);
 	}
 	err = mlx5_vport_alloc_q_counter(priv->mdev,
 	    MLX5_INTERFACE_PROTOCOL_ETH, &set_id);
 	if (err) {
 		mlx5_en_err(priv->ifp,
 		    "mlx5_vport_alloc_q_counter failed: %d\n", err);
 		goto err_close_tises;
 	}
 	/* store counter set ID */
 	priv->counter_set_id = set_id;
 
 	err = mlx5e_open_channels(priv);
 	if (err) {
 		mlx5_en_err(ifp,
 		    "mlx5e_open_channels failed, %d\n", err);
 		goto err_dalloc_q_counter;
 	}
 	err = mlx5e_open_rqt(priv);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5e_open_rqt failed, %d\n", err);
 		goto err_close_channels;
 	}
 	err = mlx5e_open_tirs(priv, false);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5e_open_tir(main) failed, %d\n", err);
 		goto err_close_rqls;
 	}
 	if ((ifp->if_capenable & IFCAP_VXLAN_HWCSUM) != 0) {
 		err = mlx5e_open_tirs(priv, true);
 		if (err) {
 			mlx5_en_err(ifp, "mlx5e_open_tir(inner) failed, %d\n",
 			    err);
 			goto err_close_tirs;
 		}
 	}
 	err = mlx5e_open_flow_table(priv);
 	if (err) {
 		mlx5_en_err(ifp,
 		    "mlx5e_open_flow_table failed, %d\n", err);
 		goto err_close_tirs_inner;
 	}
 	err = mlx5e_add_all_vlan_rules(priv);
 	if (err) {
 		mlx5_en_err(ifp,
 		    "mlx5e_add_all_vlan_rules failed, %d\n", err);
 		goto err_close_flow_table;
 	}
 	if ((ifp->if_capenable & IFCAP_VXLAN_HWCSUM) != 0) {
 		err = mlx5e_add_all_vxlan_rules(priv);
 		if (err) {
 			mlx5_en_err(ifp,
 			    "mlx5e_add_all_vxlan_rules failed, %d\n", err);
 			goto err_del_vlan_rules;
 		}
 	}
 	set_bit(MLX5E_STATE_OPENED, &priv->state);
 
 	mlx5e_update_carrier(priv);
 	mlx5e_set_rx_mode_core(priv);
 
 	return (0);
 
 err_del_vlan_rules:
 	mlx5e_del_all_vlan_rules(priv);
 
 err_close_flow_table:
 	mlx5e_close_flow_table(priv);
 
 err_close_tirs_inner:
 	if ((ifp->if_capenable & IFCAP_VXLAN_HWCSUM) != 0)
 		mlx5e_close_tirs(priv, true);
 
 err_close_tirs:
 	mlx5e_close_tirs(priv, false);
 
 err_close_rqls:
 	mlx5e_close_rqt(priv);
 
 err_close_channels:
 	mlx5e_close_channels(priv);
 
 err_dalloc_q_counter:
 	mlx5_vport_dealloc_q_counter(priv->mdev,
 	    MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id);
 
 err_close_tises:
 	mlx5e_close_tises(priv);
 
 	return (err);
 }
 
 /*
  * Open the interface given its private structure in "arg".
  *
  * Takes the private lock, sets the port status to up, opens the
  * interface and marks it running.  A failure to set the port status
  * is only logged, and the result of mlx5e_open_locked() is not
  * checked: IFF_DRV_RUNNING is set in either case.
  */
 static void
 mlx5e_open(void *arg)
 {
 	struct mlx5e_priv *priv;
 
 	priv = arg;
 
 	PRIV_LOCK(priv);
 	if (mlx5_set_port_status(priv->mdev, MLX5_PORT_UP) != 0) {
 		mlx5_en_err(priv->ifp,
 		    "Setting port status to up failed\n");
 	}
 
 	mlx5e_open_locked(priv->ifp);
 	priv->ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	PRIV_UNLOCK(priv);
 }
 
 /*
  * Take the interface out of the opened state and release what
  * mlx5e_open_locked() created.
  *
  * The opened bit is cleared first; then the receive mode is updated,
  * the VLAN rules and (with IFCAP_VXLAN_HWCSUM) the VXLAN rules are
  * removed, the link is reported down and the flow table, TIRs,
  * receive queue table, channels, queue counter and TISes are released
  * in that order.
  *
  * Always returns zero, also when the interface was already closed.
  */
 int
 mlx5e_close_locked(struct ifnet *ifp)
 {
 	struct mlx5e_priv *priv = ifp->if_softc;
 
 	/* nothing to do when the interface is already closed */
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
 		return (0);
 
 	clear_bit(MLX5E_STATE_OPENED, &priv->state);
 
 	/* filters and steering rules */
 	mlx5e_set_rx_mode_core(priv);
 	mlx5e_del_all_vlan_rules(priv);
 	if ((ifp->if_capenable & IFCAP_VXLAN_HWCSUM) != 0) {
 		mlx5e_del_all_vxlan_rules(priv);
 	}
 	if_link_state_change(priv->ifp, LINK_STATE_DOWN);
 	mlx5e_close_flow_table(priv);
 
 	/* TIRs, inner VXLAN set first */
 	if ((ifp->if_capenable & IFCAP_VXLAN_HWCSUM) != 0) {
 		mlx5e_close_tirs(priv, true);
 	}
 	mlx5e_close_tirs(priv, false);
 
 	/* queues and what they depend on */
 	mlx5e_close_rqt(priv);
 	mlx5e_close_channels(priv);
 	mlx5_vport_dealloc_q_counter(priv->mdev,
 	    MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id);
 	mlx5e_close_tises(priv);
 
 	return (0);
 }
 
 #if (__FreeBSD_version >= 1100000)
 /*
  * Return the value of the interface counter "cnt" from the statistics
  * kept in the private structure.  Counters not handled here are
  * passed on to if_get_counter_default().
  *
  * NOTE: the private lock is deliberately not taken; the original
  * code marks PRIV_LOCK() as not allowed in this function.
  */
 static uint64_t
 mlx5e_get_counter(struct ifnet *ifp, ift_counter cnt)
 {
 	struct mlx5e_priv *priv = ifp->if_softc;
 	u64 errors;
 
 	switch (cnt) {
 	case IFCOUNTER_IPACKETS:
 		return (priv->stats.vport.rx_packets);
 	case IFCOUNTER_IERRORS:
 		/* sum of the physical port receive error counters */
 		errors = priv->stats.pport.in_range_len_errors;
 		errors += priv->stats.pport.out_of_range_len;
 		errors += priv->stats.pport.too_long_errors;
 		errors += priv->stats.pport.check_seq_err;
 		errors += priv->stats.pport.alignment_err;
 		return (errors);
 	case IFCOUNTER_IQDROPS:
 		return (priv->stats.vport.rx_out_of_buffer);
 	case IFCOUNTER_OPACKETS:
 		return (priv->stats.vport.tx_packets);
 	case IFCOUNTER_OERRORS:
 		return (priv->stats.port_stats_debug.out_discards);
 	case IFCOUNTER_IBYTES:
 		return (priv->stats.vport.rx_bytes);
 	case IFCOUNTER_OBYTES:
 		return (priv->stats.vport.tx_bytes);
 	case IFCOUNTER_IMCASTS:
 		return (priv->stats.vport.rx_multicast_packets);
 	case IFCOUNTER_OMCASTS:
 		return (priv->stats.vport.tx_multicast_packets);
 	case IFCOUNTER_OQDROPS:
 		return (priv->stats.vport.tx_queue_dropped);
 	case IFCOUNTER_COLLISIONS:
 		return (priv->stats.pport.collisions);
 	default:
 		return (if_get_counter_default(ifp, cnt));
 	}
 }
 #endif
 
 /*
  * Request an update of the receive mode by queueing the
  * "set_rx_mode_work" item on the work queue of the interface.
  */
 static void
 mlx5e_set_rx_mode(struct ifnet *ifp)
 {
 	struct mlx5e_priv *priv;
 
 	priv = ifp->if_softc;
 	queue_work(priv->wq, &priv->set_rx_mode_work);
 }
 
 /*
  * Interface ioctl handler.
  *
  * Handles the MTU, interface flags, multicast, media, capability,
  * I2C/EEPROM, down-reason and RSS query requests listed in the switch
  * below; every other command is passed on to ether_ioctl().
  *
  * Returns ENXIO when the softc is missing or the device is going
  * away, otherwise zero or an errno value for the handled command.
  */
 static int
 mlx5e_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
 	struct mlx5e_priv *priv;
 	struct ifreq *ifr;
 	struct ifdownreason *ifdr;
 	struct ifi2creq i2c;
 	struct ifrsskey *ifrk;
 	struct ifrsshash *ifrh;
 	int error = 0;
 	int mask = 0;
 	int size_read = 0;
 	int module_status;
 	int module_num;
 	int max_mtu;
 	uint8_t read_addr;
 
 	priv = ifp->if_softc;
 
 	/* check if detaching */
 	if (priv == NULL || priv->gone != 0)
 		return (ENXIO);
 
 	switch (command) {
 	case SIOCSIFMTU:
 		ifr = (struct ifreq *)data;
 
 		PRIV_LOCK(priv);
 		/* NOTE(review): the result of this query is not checked */
 		mlx5_query_port_max_mtu(priv->mdev, &max_mtu);
 
 		if (ifr->ifr_mtu >= MLX5E_MTU_MIN &&
 		    ifr->ifr_mtu <= MIN(MLX5E_MTU_MAX, max_mtu)) {
 			int was_opened;
 
 			/* the MTU is changed while the interface is closed */
 			was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
 			if (was_opened)
 				mlx5e_close_locked(ifp);
 
 			/* set new MTU */
 			mlx5e_set_dev_port_mtu(ifp, ifr->ifr_mtu);
 
 			/* results of set and reopen are not propagated */
 			if (was_opened)
 				mlx5e_open_locked(ifp);
 		} else {
 			error = EINVAL;
 			mlx5_en_err(ifp,
 			    "Invalid MTU value. Min val: %d, Max val: %d\n",
 			    MLX5E_MTU_MIN, MIN(MLX5E_MTU_MAX, max_mtu));
 		}
 		PRIV_UNLOCK(priv);
 		break;
 	case SIOCSIFFLAGS:
 		/* already up and running: only the receive mode may change */
 		if ((ifp->if_flags & IFF_UP) &&
 		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			mlx5e_set_rx_mode(ifp);
 			break;
 		}
 		PRIV_LOCK(priv);
 		if (ifp->if_flags & IFF_UP) {
 			/* up requested while not running: open the interface */
 			if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 				if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0)
 					mlx5e_open_locked(ifp);
 				ifp->if_drv_flags |= IFF_DRV_RUNNING;
 				mlx5_set_port_status(priv->mdev, MLX5_PORT_UP);
 			}
 		} else {
 			/* down requested while running: close the interface */
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 				mlx5_set_port_status(priv->mdev,
 				    MLX5_PORT_DOWN);
 				if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0)
 					mlx5e_close_locked(ifp);
 				mlx5e_update_carrier(priv);
 				ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 			}
 		}
 		PRIV_UNLOCK(priv);
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		mlx5e_set_rx_mode(ifp);
 		break;
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 	case SIOCGIFXMEDIA:
 		ifr = (struct ifreq *)data;
 		error = ifmedia_ioctl(ifp, ifr, &priv->media, command);
 		break;
 	case SIOCSIFCAP:
 		ifr = (struct ifreq *)data;
 		PRIV_LOCK(priv);
 		/* bits that differ between the request and the current state */
 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
 
 		if (mask & IFCAP_TXCSUM) {
 			ifp->if_capenable ^= IFCAP_TXCSUM;
 			ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP);
 
 			/* TSO4 is switched off together with TX checksumming */
 			if (IFCAP_TSO4 & ifp->if_capenable &&
 			    !(IFCAP_TXCSUM & ifp->if_capenable)) {
 				mask &= ~IFCAP_TSO4;
 				ifp->if_capenable &= ~IFCAP_TSO4;
 				ifp->if_hwassist &= ~CSUM_IP_TSO;
 				mlx5_en_err(ifp,
 				    "tso4 disabled due to -txcsum.\n");
 			}
 		}
 		if (mask & IFCAP_TXCSUM_IPV6) {
 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
 			ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6);
 
 			/* TSO6 is switched off together with IPv6 TX checksumming */
 			if (IFCAP_TSO6 & ifp->if_capenable &&
 			    !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) {
 				mask &= ~IFCAP_TSO6;
 				ifp->if_capenable &= ~IFCAP_TSO6;
 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
 				mlx5_en_err(ifp,
 				    "tso6 disabled due to -txcsum6.\n");
 			}
 		}
 		if (mask & IFCAP_MEXTPG)
 			ifp->if_capenable ^= IFCAP_MEXTPG;
 		if (mask & IFCAP_TXTLS4)
 			ifp->if_capenable ^= IFCAP_TXTLS4;
 		if (mask & IFCAP_TXTLS6)
 			ifp->if_capenable ^= IFCAP_TXTLS6;
 #ifdef RATELIMIT
 		if (mask & IFCAP_TXTLS_RTLMT)
 			ifp->if_capenable ^= IFCAP_TXTLS_RTLMT;
 #endif
 		if (mask & IFCAP_RXCSUM)
 			ifp->if_capenable ^= IFCAP_RXCSUM;
 		if (mask & IFCAP_RXCSUM_IPV6)
 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
 		if (mask & IFCAP_TSO4) {
 			/* enabling TSO4 requires TX checksumming to be on */
 			if (!(IFCAP_TSO4 & ifp->if_capenable) &&
 			    !(IFCAP_TXCSUM & ifp->if_capenable)) {
 				mlx5_en_err(ifp, "enable txcsum first.\n");
 				error = EAGAIN;
 				/* changes applied above are kept */
 				goto out;
 			}
 			ifp->if_capenable ^= IFCAP_TSO4;
 			ifp->if_hwassist ^= CSUM_IP_TSO;
 		}
 		if (mask & IFCAP_TSO6) {
 			/* enabling TSO6 requires IPv6 TX checksumming to be on */
 			if (!(IFCAP_TSO6 & ifp->if_capenable) &&
 			    !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) {
 				mlx5_en_err(ifp, "enable txcsum6 first.\n");
 				error = EAGAIN;
 				/* changes applied above are kept */
 				goto out;
 			}
 			ifp->if_capenable ^= IFCAP_TSO6;
 			ifp->if_hwassist ^= CSUM_IP6_TSO;
 		}
 		if (mask & IFCAP_VLAN_HWTSO)
 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
 		if (mask & IFCAP_VLAN_HWFILTER) {
 			/* act on the current state before toggling the bit */
 			if (ifp->if_capenable & IFCAP_VLAN_HWFILTER)
 				mlx5e_disable_vlan_filter(priv);
 			else
 				mlx5e_enable_vlan_filter(priv);
 
 			ifp->if_capenable ^= IFCAP_VLAN_HWFILTER;
 		}
 		if (mask & IFCAP_VLAN_HWTAGGING)
 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
 		if (mask & IFCAP_WOL_MAGIC)
 			ifp->if_capenable ^= IFCAP_WOL_MAGIC;
 		if (mask & IFCAP_VXLAN_HWCSUM) {
 			/*
 			 * Open and close consult IFCAP_VXLAN_HWCSUM, so the
 			 * bit is toggled while the interface is closed.
 			 */
 			int was_opened = test_bit(MLX5E_STATE_OPENED,
 			    &priv->state);
 			if (was_opened)
 				mlx5e_close_locked(ifp);
 			ifp->if_capenable ^= IFCAP_VXLAN_HWCSUM;
 			ifp->if_hwassist ^= CSUM_INNER_IP | CSUM_INNER_IP_UDP |
 			    CSUM_INNER_IP_TCP | CSUM_INNER_IP6_UDP |
 			    CSUM_INNER_IP6_TCP;
 			if (was_opened)
 				mlx5e_open_locked(ifp);
 		}
 		if (mask & IFCAP_VXLAN_HWTSO) {
 			ifp->if_capenable ^= IFCAP_VXLAN_HWTSO;
 			ifp->if_hwassist ^= CSUM_INNER_IP_TSO |
 			    CSUM_INNER_IP6_TSO;
 		}
 
 		VLAN_CAPABILITIES(ifp);
 		/* turn off LRO means also turn of HW LRO - if it's on */
 		if (mask & IFCAP_LRO) {
 			int was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
 			bool need_restart = false;
 
 			ifp->if_capenable ^= IFCAP_LRO;
 
 			/* figure out if updating HW LRO is needed */
 			if (!(ifp->if_capenable & IFCAP_LRO)) {
 				if (priv->params.hw_lro_en) {
 					priv->params.hw_lro_en = false;
 					need_restart = true;
 				}
 			} else {
 				/* HW LRO comes back only if ethtool asked for it */
 				if (priv->params.hw_lro_en == false &&
 				    priv->params_ethtool.hw_lro != 0) {
 					priv->params.hw_lro_en = true;
 					need_restart = true;
 				}
 			}
 			/* a changed HW LRO setting takes effect on reopen */
 			if (was_opened && need_restart) {
 				mlx5e_close_locked(ifp);
 				mlx5e_open_locked(ifp);
 			}
 		}
 		if (mask & IFCAP_HWRXTSTMP) {
 			ifp->if_capenable ^= IFCAP_HWRXTSTMP;
 			if (ifp->if_capenable & IFCAP_HWRXTSTMP) {
 				/* enabled: start calibration unless already done */
 				if (priv->clbr_done == 0)
 					mlx5e_reset_calibration_callout(priv);
 			} else {
 				/* disabled: stop the calibration callout */
 				callout_drain(&priv->tstmp_clbr);
 				priv->clbr_done = 0;
 			}
 		}
 out:
 		PRIV_UNLOCK(priv);
 		break;
 
 	case SIOCGI2C:
 		ifr = (struct ifreq *)data;
 
 		/*
 		 * Copy from the user-space address ifr_data to the
 		 * kernel-space address i2c
 		 */
 		error = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
 		if (error)
 			break;
 
 		/* the requested length must fit the data buffer */
 		if (i2c.len > sizeof(i2c.data)) {
 			error = EINVAL;
 			break;
 		}
 
 		PRIV_LOCK(priv);
 		/* Get module_num which is required for the query_eeprom */
 		error = mlx5_query_module_num(priv->mdev, &module_num);
 		if (error) {
 			mlx5_en_err(ifp,
 			    "Query module num failed, eeprom reading is not supported\n");
 			error = EINVAL;
 			goto err_i2c;
 		}
 		/* Check if module is present before doing an access */
 		module_status = mlx5_query_module_status(priv->mdev, module_num);
 		if (module_status != MLX5_MODULE_STATUS_PLUGGED_ENABLED) {
 			error = EINVAL;
 			goto err_i2c;
 		}
 		/*
 		 * Currently 0XA0 and 0xA2 are the only addresses permitted.
 		 * The internal conversion is as follows:
 		 */
 		if (i2c.dev_addr == 0xA0)
 			read_addr = MLX5_I2C_ADDR_LOW;
 		else if (i2c.dev_addr == 0xA2)
 			read_addr = MLX5_I2C_ADDR_HIGH;
 		else {
 			mlx5_en_err(ifp,
 			    "Query eeprom failed, Invalid Address: %X\n",
 			    i2c.dev_addr);
 			error = EINVAL;
 			goto err_i2c;
 		}
 		/* first read; size_read receives the number of bytes read */
 		error = mlx5_query_eeprom(priv->mdev,
 		    read_addr, MLX5_EEPROM_LOW_PAGE,
 		    (uint32_t)i2c.offset, (uint32_t)i2c.len, module_num,
 		    (uint32_t *)i2c.data, &size_read);
 		if (error) {
 			mlx5_en_err(ifp,
 			    "Query eeprom failed, eeprom reading is not supported\n");
 			error = EINVAL;
 			goto err_i2c;
 		}
 
 		/*
 		 * Requests longer than MLX5_EEPROM_MAX_BYTES get a second
 		 * read for the remainder, continuing after the bytes
 		 * returned by the first one.
 		 */
 		if (i2c.len > MLX5_EEPROM_MAX_BYTES) {
 			error = mlx5_query_eeprom(priv->mdev,
 			    read_addr, MLX5_EEPROM_LOW_PAGE,
 			    (uint32_t)(i2c.offset + size_read),
 			    (uint32_t)(i2c.len - size_read), module_num,
 			    (uint32_t *)(i2c.data + size_read), &size_read);
 		}
 		if (error) {
 			mlx5_en_err(ifp,
 			    "Query eeprom failed, eeprom reading is not supported\n");
 			error = EINVAL;
 			goto err_i2c;
 		}
 
 		/* hand the filled request back to user space */
 		error = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c));
 err_i2c:
 		PRIV_UNLOCK(priv);
 		break;
 	case SIOCGIFDOWNREASON:
 		ifdr = (struct ifdownreason *)data;
 		bzero(ifdr->ifdr_msg, sizeof(ifdr->ifdr_msg));
 		PRIV_LOCK(priv);
 		/* the sign of the query result is flipped before returning it */
 		error = -mlx5_query_pddr_troubleshooting_info(priv->mdev, NULL,
 		    ifdr->ifdr_msg, sizeof(ifdr->ifdr_msg));
 		PRIV_UNLOCK(priv);
 		if (error == 0)
 			ifdr->ifdr_reason = IFDR_REASON_MSG;
 		break;
 
 	case SIOCGIFRSSKEY:
 		/* report the Toeplitz key returned by mlx5e_get_rss_key() */
 		ifrk = (struct ifrsskey *)data;
 		ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
 		ifrk->ifrk_keylen = MLX5E_RSS_KEY_SIZE;
 		CTASSERT(sizeof(ifrk->ifrk_key) >= MLX5E_RSS_KEY_SIZE);
 		mlx5e_get_rss_key(ifrk->ifrk_key);
 		break;
 
 	case SIOCGIFRSSHASH:
 		/* report a fixed set of hash types */
 		ifrh = (struct ifrsshash *)data;
 		ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
 		ifrh->ifrh_types =
 		    RSS_TYPE_IPV4 |
 		    RSS_TYPE_TCP_IPV4 |
 		    RSS_TYPE_UDP_IPV4 |
 		    RSS_TYPE_IPV6 |
 		    RSS_TYPE_TCP_IPV6 |
 		    RSS_TYPE_UDP_IPV6;
 		break;
 
 	default:
 		/* everything else is left to the generic Ethernet handler */
 		error = ether_ioctl(ifp, command, data);
 		break;
 	}
 	return (error);
 }
 
 static int
 mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev)
 {
 	/*
 	 * TODO: uncoment once FW really sets all these bits if
 	 * (!mdev->caps.eth.rss_ind_tbl_cap || !mdev->caps.eth.csum_cap ||
 	 * !mdev->caps.eth.max_lso_cap || !mdev->caps.eth.vlan_cap ||
 	 * !(mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_SCQE_BRK_MOD)) return
 	 * -ENOTSUPP;
 	 */
 
 	/* TODO: add more must-to-have features */
 
 	if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
 		return (-ENODEV);
 
 	return (0);
 }
 
 /*
  * Compute the maximum number of bytes that may be inlined into a
  * transmit work request.
  *
  * The value is derived from half of the blue flame register size
  * less the size of the TX WQE structure (plus 2), and is then clamped
  * to the range from an Ethernet header with VLAN encapsulation up to
  * MLX5E_MAX_TX_INLINE.
  */
 static u16
 mlx5e_get_max_inline_cap(struct mlx5_core_dev *mdev)
 {
 	const int lower = ETHER_VLAN_ENCAP_LEN + ETHER_HDR_LEN;
 	const int upper = MLX5E_MAX_TX_INLINE;
 	const int avail =
 	    ((1U << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2U) -
 	    (sizeof(struct mlx5e_tx_wqe) - 2);
 
 	/* clamp against the driver limits, upper bound first */
 	if (avail > upper)
 		return (upper);
 	if (avail < lower)
 		return (lower);
 	return (avail);
 }
 
 /*
  * Fill in the default parameters of a freshly allocated priv and
  * initialize its work items.  Returns 0 on success or the error from
  * mlx5_query_min_inline(); in the error case the fields assigned after
  * that query are left untouched.
  */
 static int
 mlx5e_build_ifp_priv(struct mlx5_core_dev *mdev,
     struct mlx5e_priv *priv,
     int num_comp_vectors)
 {
 	int err;
 
 	/*
 	 * TODO: Consider link speed for setting "log_sq_size",
 	 * "log_rq_size" and "cq_moderation_xxx":
 	 */
 	priv->params.log_sq_size = MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
 	priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
 
 	/* RX moderation period and mode follow the start-from-CQE capability */
 	if (MLX5_CAP_GEN(mdev, cq_period_start_from_cqe)) {
 		priv->params.rx_cq_moderation_usec =
 		    MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE;
 		priv->params.rx_cq_moderation_mode = 1;
 	} else {
 		priv->params.rx_cq_moderation_usec =
 		    MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC;
 		priv->params.rx_cq_moderation_mode = 0;
 	}
 	priv->params.rx_cq_moderation_pkts =
 	    MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS;
 	priv->params.tx_cq_moderation_usec =
 	    MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC;
 	priv->params.tx_cq_moderation_pkts =
 	    MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
 	priv->params.min_rx_wqes = MLX5E_PARAMS_DEFAULT_MIN_RX_WQES;
 
 	/* use the larger of the default and order_base_2() of the vectors */
 	priv->params.rx_hash_log_tbl_sz =
 	    MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ;
 	if (order_base_2(num_comp_vectors) >
 	    MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ) {
 		priv->params.rx_hash_log_tbl_sz =
 		    order_base_2(num_comp_vectors);
 	}
 
 	priv->params.num_tc = 1;
 	priv->params.default_vlan_prio = 0;
 	priv->counter_set_id = -1;
 	priv->params.tx_max_inline = mlx5e_get_max_inline_cap(mdev);
 
 	err = mlx5_query_min_inline(mdev, &priv->params.tx_min_inline_mode);
 	if (err != 0)
 		return (err);
 
 	/*
 	 * hw lro is currently defaulted to off. when it won't anymore we
 	 * will consider the HW capability: "!!MLX5_CAP_ETH(mdev, lro_cap)"
 	 */
 	priv->params.hw_lro_en = false;
 	priv->params.lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ;
 
 	/*
 	 * CQE zipping is currently defaulted to off. when it won't
 	 * anymore we will consider the HW capability:
 	 * "!!MLX5_CAP_GEN(mdev, cqe_compression)"
 	 */
 	priv->params.cqe_zipping_en = false;
 
 	/* channel bookkeeping derived from the completion vector count */
 	priv->mdev = mdev;
 	priv->params.num_channels = num_comp_vectors;
 	priv->params.channels_rsss = 1;
 	priv->order_base_2_num_channels = order_base_2(num_comp_vectors);
 	priv->queue_mapping_channel_mask =
 	    roundup_pow_of_two(num_comp_vectors) - 1;
 
 	/* mirror selected parameters at the top level of the priv */
 	priv->num_tc = priv->params.num_tc;
 	priv->default_vlan_prio = priv->params.default_vlan_prio;
 
 	INIT_WORK(&priv->update_stats_work, mlx5e_update_stats_work);
 	INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work);
 	INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work);
 
 	return (0);
 }
 
 /*
  * Set the relaxed ordering read/write bits of a memory key context.
  * Each bit is set only when PCI relaxed ordering is enabled for the
  * device and the matching general capability is reported.
  */
 static void
 mlx5e_mkey_set_relaxed_ordering(struct mlx5_core_dev *mdev, void *mkc)
 {
 	bool pci_ro =
 	    pci_get_relaxed_ordering_enabled(mdev->pdev->dev.bsddev);
 	bool allow_read =
 	    pci_ro && MLX5_CAP_GEN(mdev, relaxed_ordering_read);
 	bool allow_write =
 	    pci_ro && MLX5_CAP_GEN(mdev, relaxed_ordering_write);
 
 	MLX5_SET(mkc, mkc, relaxed_ordering_read, allow_read);
 	MLX5_SET(mkc, mkc, relaxed_ordering_write, allow_write);
 }
 
 /*
  * Create a memory key in PA access mode with local read and write
  * enabled on protection domain "pdn".  Returns -ENOMEM when the
  * command inbox cannot be allocated, otherwise the result of
  * mlx5_core_create_mkey().
  */
 static int
 mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn,
     struct mlx5_core_mkey *mkey)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct ifnet *ifp = priv->ifp;
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	u32 *inbox;
 	void *ctx;
 	int err;
 
 	inbox = mlx5_vzalloc(inlen);
 	if (inbox == NULL) {
 		mlx5_en_err(ifp, "failed to allocate inbox\n");
 		return (-ENOMEM);
 	}
 
 	/* fill in the mkey context embedded in the command */
 	ctx = MLX5_ADDR_OF(create_mkey_in, inbox, memory_key_mkey_entry);
 	MLX5_SET(mkc, ctx, access_mode, MLX5_ACCESS_MODE_PA);
 	MLX5_SET(mkc, ctx, umr_en, 1);	/* used by HW TLS */
 	MLX5_SET(mkc, ctx, lw, 1);
 	MLX5_SET(mkc, ctx, lr, 1);
 	mlx5e_mkey_set_relaxed_ordering(mdev, ctx);
 	MLX5_SET(mkc, ctx, pd, pdn);
 	MLX5_SET(mkc, ctx, length64, 1);
 	MLX5_SET(mkc, ctx, qpn, 0xffffff);
 
 	err = mlx5_core_create_mkey(mdev, mkey, inbox, inlen);
 	if (err != 0) {
 		mlx5_en_err(ifp, "mlx5_core_create_mkey failed, %d\n",
 		    err);
 	}
 
 	/* the inbox is only needed for the duration of the command */
 	kvfree(inbox);
 	return (err);
 }
 
 /*
  * Description strings for the vport statistics, produced by expanding
  * the MLX5E_VPORT_STATS() list with MLX5E_STATS_DESC.  Handed to
  * mlx5e_create_stats() for the "vstats" sysctl node.
  */
 static const char *mlx5e_vport_stats_desc[] = {
 	MLX5E_VPORT_STATS(MLX5E_STATS_DESC)
 };
 
 /*
  * Description strings for the physical port statistics, produced by
  * expanding the MLX5E_PPORT_STATS() list with MLX5E_STATS_DESC.  Handed
  * to mlx5e_create_stats() for the "pstats" sysctl node.
  */
 static const char *mlx5e_pport_stats_desc[] = {
 	MLX5E_PPORT_STATS(MLX5E_STATS_DESC)
 };
 
 /*
  * Initialize the locks, the watchdog callout, the per-channel static
  * state and one bfreg per channel.  Returns 0 on success.  When a bfreg
  * allocation fails everything set up here is undone again and the
  * allocation error is returned.
  */
 static int
 mlx5e_priv_static_init(struct mlx5e_priv *priv, struct mlx5_core_dev *mdev,
     const uint32_t channels)
 {
 	uint32_t i;
 	int err;
 
 	/* locks and the watchdog callout */
 	mtx_init(&priv->async_events_mtx, "mlx5async", MTX_NETWORK_LOCK, MTX_DEF);
 	sx_init(&priv->state_lock, "mlx5state");
 	callout_init_mtx(&priv->watchdog, &priv->async_events_mtx, 0);
 	MLX5_INIT_DOORBELL_LOCK(&priv->doorbell_lock);
 
 	for (i = 0; i < channels; i++)
 		mlx5e_chan_static_init(priv, &priv->channel[i], i);
 
 	/* allocate a bfreg for every channel, stopping at the first error */
 	for (i = 0; i < channels; i++) {
 		err = mlx5_alloc_bfreg(mdev, &priv->channel[i].bfreg, false, false);
 		if (err != 0)
 			break;
 	}
 	if (i == channels)
 		return (0);
 
 	/* release the bfregs that were allocated, most recent first */
 	while (i != 0) {
 		i--;
 		mlx5_free_bfreg(mdev, &priv->channel[i].bfreg);
 	}
 
 	for (i = 0; i < channels; i++)
 		mlx5e_chan_static_destroy(&priv->channel[i]);
 	callout_drain(&priv->watchdog);
 	mtx_destroy(&priv->async_events_mtx);
 	sx_destroy(&priv->state_lock);
 	return (err);
 }
 
 /*
  * Counterpart of mlx5e_priv_static_init(): frees the bfreg of every
  * channel, destroys the per-channel static state, drains the watchdog
  * callout and destroys the two locks.
  */
 static void
 mlx5e_priv_static_destroy(struct mlx5e_priv *priv, struct mlx5_core_dev *mdev,
     const uint32_t channels)
 {
 	uint32_t i;
 
 	/* all bfregs are released before any channel state is destroyed */
 	for (i = 0; i < channels; i++) {
 		mlx5_free_bfreg(mdev, &priv->channel[i].bfreg);
 	}
 	for (i = 0; i < channels; i++) {
 		mlx5e_chan_static_destroy(&priv->channel[i]);
 	}
 
 	callout_drain(&priv->watchdog);
 	mtx_destroy(&priv->async_events_mtx);
 	sx_destroy(&priv->state_lock);
 }
 
 /*
  * Sysctl handler that reports the firmware revision of the device
  * behind arg1 as a "major.minor.sub" string.
  */
 static int
 sysctl_firmware(SYSCTL_HANDLER_ARGS)
 {
 	struct mlx5e_priv *priv = arg1;
 	/*
 	 * The string format is "%d.%d.%d".  Per the original sizing note
 	 * each of fw_rev_{maj,min,sub} is a u16, i.e. at most five
 	 * digits.  Three numbers, two "." separators and the terminating
 	 * NUL therefore need at most 5 * 3 + 3 = 18 chars.
 	 */
 	char version[18];
 
 	snprintf(version, sizeof(version), "%d.%d.%d",
 	    fw_rev_maj(priv->mdev), fw_rev_min(priv->mdev),
 	    fw_rev_sub(priv->mdev));
 
 	return (sysctl_handle_string(oidp, version, sizeof(version), req));
 }
 
 /* Drain the send queue of every traffic class of the given channel. */
 static void
 mlx5e_disable_tx_dma(struct mlx5e_channel *ch)
 {
 	int tc = 0;
 
 	while (tc < ch->priv->num_tc) {
 		mlx5e_drain_sq(&ch->sq[tc]);
 		tc++;
 	}
 }
 
 /*
  * Build a doorbell value consisting of a NOP opcode word and the send
  * queue number shifted left by eight (both stored big-endian), pass it
  * to mlx5e_tx_notify_hw() and then clear the cached doorbell value.
  * mlx5e_resume_sq() calls this before moving the queue from RST to RDY.
  */
 static void
 mlx5e_reset_sq_doorbell_record(struct mlx5e_sq *sq)
 {
 
 	sq->doorbell.d32[0] = cpu_to_be32(MLX5_OPCODE_NOP);
 	sq->doorbell.d32[1] = cpu_to_be32(sq->sqn << 8);
 	mlx5e_tx_notify_hw(sq, sq->doorbell.d32);
 	/* leave no stale doorbell value behind in the queue structure */
 	sq->doorbell.d64 = 0;
 }
 
 void
 mlx5e_resume_sq(struct mlx5e_sq *sq)
 {
 	int err;
 
 	/* check if already enabled */
 	if (READ_ONCE(sq->running) != 0)
 		return;
 
 	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_ERR,
 	    MLX5_SQC_STATE_RST);
 	if (err != 0) {
 		mlx5_en_err(sq->ifp,
 		    "mlx5e_modify_sq() from ERR to RST failed: %d\n", err);
 	}
 
 	sq->cc = 0;
 	sq->pc = 0;
 
 	/* reset doorbell prior to moving from RST to RDY */
 	mlx5e_reset_sq_doorbell_record(sq);
 
 	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST,
 	    MLX5_SQC_STATE_RDY);
 	if (err != 0) {
 		mlx5_en_err(sq->ifp,
 		    "mlx5e_modify_sq() from RST to RDY failed: %d\n", err);
 	}
 
 	sq->cev_next_state = MLX5E_CEV_STATE_INITIAL;
 	WRITE_ONCE(sq->running, 1);
 }
 
 /* Resume the send queue of every traffic class of the given channel. */
 static void
 mlx5e_enable_tx_dma(struct mlx5e_channel *ch)
 {
 	int tc = 0;
 
 	while (tc < ch->priv->num_tc) {
 		mlx5e_resume_sq(&ch->sq[tc]);
 		tc++;
 	}
 }
 
 /*
  * Stop reception on the channel's receive queue.
  *
  * The queue is marked disabled and its watchdog callout stopped under
  * the RQ mutex, then the queue is moved RDY -> ERR, the completion
  * handler is polled until the work queue is empty, and finally the
  * queue is moved ERR -> RST.  Failed state transitions are logged only.
  */
 static void
 mlx5e_disable_rx_dma(struct mlx5e_channel *ch)
 {
 	struct mlx5e_rq *rq = &ch->rq;
 	struct epoch_tracker et;
 	int err;
 
 	mtx_lock(&rq->mtx);
 	rq->enabled = 0;
 	callout_stop(&rq->watchdog);
 	mtx_unlock(&rq->mtx);
 
 	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR);
 	if (err != 0) {
 		/* the message names the transition that was actually attempted */
 		mlx5_en_err(rq->ifp,
 		    "mlx5e_modify_rq() from RDY to ERR failed: %d\n", err);
 	}
 
 	/* run the completion handler by hand until the queue has drained */
 	while (!mlx5_wq_ll_is_empty(&rq->wq)) {
 		msleep(1);
 		NET_EPOCH_ENTER(et);
 		rq->cq.mcq.comp(&rq->cq.mcq, NULL);
 		NET_EPOCH_EXIT(et);
 	}
 
 	/*
 	 * Transitioning into RST state will allow the FW to track less ERR state queues,
 	 * thus reducing the recv queue flushing time
 	 */
 	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_ERR, MLX5_RQC_STATE_RST);
 	if (err != 0) {
 		mlx5_en_err(rq->ifp,
 		    "mlx5e_modify_rq() from ERR to RST failed: %d\n", err);
 	}
 }
 
 static void
 mlx5e_enable_rx_dma(struct mlx5e_channel *ch)
 {
 	struct mlx5e_rq *rq = &ch->rq;
 	struct epoch_tracker et;
 	int err;
 
 	rq->wq.wqe_ctr = 0;
 	mlx5_wq_ll_update_db_record(&rq->wq);
 	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
 	if (err != 0) {
 		mlx5_en_err(rq->ifp,
 		    "mlx5e_modify_rq() from RST to RDY failed: %d\n", err);
         }
 
 	rq->enabled = 1;
 
 	NET_EPOCH_ENTER(et);
 	rq->cq.mcq.comp(&rq->cq.mcq, NULL);
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Disable (value != 0) or enable (value == 0) TX DMA on all channels.
  * Does nothing unless the interface is in the OPENED state.
  */
 void
 mlx5e_modify_tx_dma(struct mlx5e_priv *priv, uint8_t value)
 {
 	struct mlx5e_channel *ch;
 	int n;
 
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
 		return;
 
 	for (n = 0; n < priv->params.num_channels; n++) {
 		ch = &priv->channel[n];
 		if (value != 0)
 			mlx5e_disable_tx_dma(ch);
 		else
 			mlx5e_enable_tx_dma(ch);
 	}
 }
 
 /*
  * Disable (value != 0) or enable (value == 0) RX DMA on all channels.
  * Does nothing unless the interface is in the OPENED state.
  */
 void
 mlx5e_modify_rx_dma(struct mlx5e_priv *priv, uint8_t value)
 {
 	struct mlx5e_channel *ch;
 	int n;
 
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
 		return;
 
 	for (n = 0; n < priv->params.num_channels; n++) {
 		ch = &priv->channel[n];
 		if (value != 0)
 			mlx5e_disable_rx_dma(ch);
 		else
 			mlx5e_enable_rx_dma(ch);
 	}
 }
 
 /* Register the "fw_version" and "board_id" entries below the hw node. */
 static void
 mlx5e_add_hw_stats(struct mlx5e_priv *priv)
 {
 	struct sysctl_oid_list *hw_list = SYSCTL_CHILDREN(priv->sysctl_hw);
 
 	/* firmware version, formatted on demand by sysctl_firmware() */
 	SYSCTL_ADD_PROC(&priv->sysctl_ctx, hw_list, OID_AUTO, "fw_version",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, priv, 0,
 	    sysctl_firmware, "A", "HCA firmware version");
 
 	/* board identifier string taken from the core device */
 	SYSCTL_ADD_STRING(&priv->sysctl_ctx, hw_list, OID_AUTO, "board_id",
 	    CTLFLAG_RD, priv->mdev->board_id, 0, "Board ID");
 }
 
 static int
 mlx5e_sysctl_tx_priority_flow_control(SYSCTL_HANDLER_ARGS)
 {
 	struct mlx5e_priv *priv = arg1;
 	uint8_t temp[MLX5E_MAX_PRIORITY];
 	uint32_t tx_pfc;
 	int err;
 	int i;
 
 	PRIV_LOCK(priv);
 
 	tx_pfc = priv->params.tx_priority_flow_control;
 
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++)
 		temp[i] = (tx_pfc >> i) & 1;
 
 	err = SYSCTL_OUT(req, temp, MLX5E_MAX_PRIORITY);
 	if (err || !req->newptr)
 		goto done;
 	err = SYSCTL_IN(req, temp, MLX5E_MAX_PRIORITY);
 	if (err)
 		goto done;
 
 	priv->params.tx_priority_flow_control = 0;
 
 	/* range check input value */
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++) {
 		if (temp[i] > 1) {
 			err = ERANGE;
 			goto done;
 		}
 		priv->params.tx_priority_flow_control |= (temp[i] << i);
 	}
 
 	/* check if update is required */
 	if (tx_pfc != priv->params.tx_priority_flow_control)
 		err = -mlx5e_set_port_pfc(priv);
 done:
 	if (err != 0)
 		priv->params.tx_priority_flow_control= tx_pfc;
 	PRIV_UNLOCK(priv);
 
 	return (err);
 }
 
 static int
 mlx5e_sysctl_rx_priority_flow_control(SYSCTL_HANDLER_ARGS)
 {
 	struct mlx5e_priv *priv = arg1;
 	uint8_t temp[MLX5E_MAX_PRIORITY];
 	uint32_t rx_pfc;
 	int err;
 	int i;
 
 	PRIV_LOCK(priv);
 
 	rx_pfc = priv->params.rx_priority_flow_control;
 
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++)
 		temp[i] = (rx_pfc >> i) & 1;
 
 	err = SYSCTL_OUT(req, temp, MLX5E_MAX_PRIORITY);
 	if (err || !req->newptr)
 		goto done;
 	err = SYSCTL_IN(req, temp, MLX5E_MAX_PRIORITY);
 	if (err)
 		goto done;
 
 	priv->params.rx_priority_flow_control = 0;
 
 	/* range check input value */
 	for (i = 0; i != MLX5E_MAX_PRIORITY; i++) {
 		if (temp[i] > 1) {
 			err = ERANGE;
 			goto done;
 		}
 		priv->params.rx_priority_flow_control |= (temp[i] << i);
 	}
 
 	/* check if update is required */
 	if (rx_pfc != priv->params.rx_priority_flow_control) {
 		err = -mlx5e_set_port_pfc(priv);
 		if (err == 0 && priv->sw_is_port_buf_owner)
 			err = mlx5e_update_buf_lossy(priv);
 	}
 done:
 	if (err != 0)
 		priv->params.rx_priority_flow_control= rx_pfc;
 	PRIV_UNLOCK(priv);
 
 	return (err);
 }
 
 /*
  * Establish the default pause frame and PFC configuration, register
  * the related sysctls and push the configuration to the firmware.  If
  * the firmware update returns -EINVAL, PFC is switched off in both
  * directions and the update is attempted once more.
  */
 static void
 mlx5e_setup_pauseframes(struct mlx5e_priv *priv)
 {
 #if (__FreeBSD_version < 1100000)
 	char path[96];
 #endif
 	int error;
 
 	/* global pause frames are on by default in both directions */
 	priv->params.rx_pauseframe_control = 1;
 	priv->params.tx_pauseframe_control = 1;
 
 	/* priority flow control, PFC, is off by default */
 	priv->params.rx_priority_flow_control = 0;
 	priv->params.tx_priority_flow_control = 0;
 
 #if (__FreeBSD_version < 1100000)
 	/* look up the TX tunable by its full sysctl path, if present */
 	snprintf(path, sizeof(path), "dev.mce.%d.tx_pauseframe_control",
 	    device_get_unit(priv->mdev->pdev->dev.bsddev));
 	TUNABLE_INT_FETCH(path, &priv->params.tx_pauseframe_control);
 
 	/* look up the RX tunable by its full sysctl path, if present */
 	snprintf(path, sizeof(path), "dev.mce.%d.rx_pauseframe_control",
 	    device_get_unit(priv->mdev->pdev->dev.bsddev));
 	TUNABLE_INT_FETCH(path, &priv->params.rx_pauseframe_control);
 #endif
 
 	/* pause frame sysctls */
 	SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "tx_pauseframe_control", CTLFLAG_RDTUN,
 	    &priv->params.tx_pauseframe_control, 0,
 	    "Set to enable TX pause frames. Clear to disable.");
 
 	SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "rx_pauseframe_control", CTLFLAG_RDTUN,
 	    &priv->params.rx_pauseframe_control, 0,
 	    "Set to enable RX pause frames. Clear to disable.");
 
 	/* priority flow control, PFC, sysctls */
 	SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "tx_priority_flow_control", CTLTYPE_U8 | CTLFLAG_RWTUN |
 	    CTLFLAG_MPSAFE, priv, 0, &mlx5e_sysctl_tx_priority_flow_control, "CU",
 	    "Set to enable TX ports flow control frames for priorities 0..7. Clear to disable.");
 
 	SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "rx_priority_flow_control", CTLTYPE_U8 | CTLFLAG_RWTUN |
 	    CTLFLAG_MPSAFE, priv, 0, &mlx5e_sysctl_rx_priority_flow_control, "CU",
 	    "Set to enable RX ports flow control frames for priorities 0..7. Clear to disable.");
 
 	PRIV_LOCK(priv);
 
 	/* normalize both pause frame settings to exactly 0 or 1 */
 	priv->params.tx_pauseframe_control =
 	    (priv->params.tx_pauseframe_control != 0);
 	priv->params.rx_pauseframe_control =
 	    (priv->params.rx_pauseframe_control != 0);
 
 	/* update firmware */
 	error = mlx5e_set_port_pause_and_pfc(priv);
 	if (error == -EINVAL) {
 		mlx5_en_err(priv->ifp,
 		    "Global pauseframes must be disabled before enabling PFC.\n");
 
 		/* drop PFC and try again; the second result is ignored */
 		priv->params.rx_priority_flow_control = 0;
 		priv->params.tx_priority_flow_control = 0;
 		(void) mlx5e_set_port_pause_and_pfc(priv);
 	}
 	PRIV_UNLOCK(priv);
 }
 
 /*
  * Hand out the "unlimited" send tag embedded in the channel selected
  * from the flow ID.
  *
  * Returns EOPNOTSUPP when the interface is marked gone or the flow has
  * no hash type, and ENXIO when the first send queue of the selected
  * channel is not running.  Otherwise a reference is taken on the
  * channel's tag, the tag is stored in *ppmt and 0 is returned.
  */
-int
+static int
 mlx5e_ul_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	struct mlx5e_priv *priv;
 	struct mlx5e_channel *pch;
 
 	priv = ifp->if_softc;
 
 	if (unlikely(priv->gone || params->hdr.flowtype == M_HASHTYPE_NONE)) {
 		return (EOPNOTSUPP);
 	} else {
 		/* keep this code synced with mlx5e_select_queue() */
 		u32 ch = priv->params.num_channels;
 #ifdef RSS
 		u32 temp;
 
 		/* prefer the RSS bucket when the hash maps to one */
 		if (rss_hash2bucket(params->hdr.flowid,
 		    params->hdr.flowtype, &temp) == 0)
 			ch = temp % ch;
 		else
 #endif
 			ch = (params->hdr.flowid % 128) % ch;
 
 		/*
 		 * NOTE: The channels array is only freed at detach
 		 * and it is safe to return a pointer to the send tag
 		 * inside the channels structure as long as we
 		 * reference the priv.
 		 */
 		pch = priv->channel + ch;
 
 		/* check if send queue is not running */
 		if (unlikely(pch->sq[0].running == 0))
 			return (ENXIO);
 		m_snd_tag_ref(&pch->tag);
 		*ppmt = &pch->tag;
 		return (0);
 	}
 }
 
 /*
  * Query an "unlimited" send tag: reports a max_rate of -1ULL and the
  * queue level of the first send queue of the channel that embeds the
  * tag.  Always returns 0.
  */
-int
+static int
 mlx5e_ul_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
 {
 	/* the tag is embedded in its channel; recover the channel from it */
 	struct mlx5e_channel *pch =
 	    container_of(pmt, struct mlx5e_channel, tag);
 
 	params->unlimited.max_rate = -1ULL;
 	params->unlimited.queue_level = mlx5e_sq_queue_level(&pch->sq[0]);
 	return (0);
 }
 
 /*
  * Free callback for an "unlimited" send tag: signals the completion of
  * the channel that embeds the tag.
  * NOTE(review): presumably this is what the detach path waits on when
  * it waits for all unlimited send tags to complete; confirm.
  */
-void
+static void
 mlx5e_ul_snd_tag_free(struct m_snd_tag *pmt)
 {
 	/* the tag is embedded in its channel; recover the channel from it */
 	struct mlx5e_channel *pch =
 	    container_of(pmt, struct mlx5e_channel, tag);
 
 	complete(&pch->completion);
 }
 
 static int
 mlx5e_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 
 	switch (params->hdr.type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
 		return (mlx5e_rl_snd_tag_alloc(ifp, params, ppmt));
 #ifdef KERN_TLS
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
 		return (mlx5e_tls_snd_tag_alloc(ifp, params, ppmt));
 #endif
 #endif
 	case IF_SND_TAG_TYPE_UNLIMITED:
 		return (mlx5e_ul_snd_tag_alloc(ifp, params, ppmt));
 #ifdef KERN_TLS
 	case IF_SND_TAG_TYPE_TLS:
 		return (mlx5e_tls_snd_tag_alloc(ifp, params, ppmt));
 #endif
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
-static int
-mlx5e_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
-{
-
-	switch (pmt->type) {
-#ifdef RATELIMIT
-	case IF_SND_TAG_TYPE_RATE_LIMIT:
-		return (mlx5e_rl_snd_tag_modify(pmt, params));
-#ifdef KERN_TLS
-	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
-		return (mlx5e_tls_snd_tag_modify(pmt, params));
-#endif
-#endif
-	case IF_SND_TAG_TYPE_UNLIMITED:
-#ifdef KERN_TLS
-	case IF_SND_TAG_TYPE_TLS:
-#endif
-	default:
-		return (EOPNOTSUPP);
-	}
-}
-
-static int
-mlx5e_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
-{
-
-	switch (pmt->type) {
-#ifdef RATELIMIT
-	case IF_SND_TAG_TYPE_RATE_LIMIT:
-		return (mlx5e_rl_snd_tag_query(pmt, params));
-#ifdef KERN_TLS
-	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
-		return (mlx5e_tls_snd_tag_query(pmt, params));
-#endif
-#endif
-	case IF_SND_TAG_TYPE_UNLIMITED:
-		return (mlx5e_ul_snd_tag_query(pmt, params));
-#ifdef KERN_TLS
-	case IF_SND_TAG_TYPE_TLS:
-		return (mlx5e_tls_snd_tag_query(pmt, params));
-#endif
-	default:
-		return (EOPNOTSUPP);
-	}
-}
-
 #ifdef RATELIMIT
 /*
  * Fixed table of hardware rates reported by mlx5e_ratelimit_query().
  * The trailing comment on each entry is the entry multiplied by eight.
  * NOTE(review): presumably the entries are bytes per second and the
  * comments the same rate in bits per second; confirm the units.
  */
 #define NUM_HDWR_RATES_MLX 13
 static const uint64_t adapter_rates_mlx[NUM_HDWR_RATES_MLX] = {
 	135375,			/* 1,083,000 */
 	180500,			/* 1,444,000 */
 	270750,			/* 2,166,000 */
 	361000,			/* 2,888,000 */
 	541500,			/* 4,332,000 */
 	721875,			/* 5,775,000 */
 	1082875,		/* 8,663,000 */
 	1443875,		/* 11,551,000 */
 	2165750,		/* 17,326,000 */
 	2887750,		/* 23,102,000 */
 	4331625,		/* 34,653,000 */
 	5775500,		/* 46,204,000 */
 	8663125			/* 69,305,000 */
 };
 
 /*
  * Rate limit query callback: describes the fixed hardware rate table
  * to the caller.  The interface argument is not consulted.
  */
 static void
 mlx5e_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
 {
 	/*
 	 * This function needs updating by the driver maintainer!
 	 *
 	 * For the MLX card there are currently (ConnectX-4?) 13
 	 * pre-set rates, and other adapters, i.e. ConnectX-5, 6, 7??,
 	 * may differ.
 	 *
 	 * This code should be updated to look at ifp and figure out
 	 * the settings of the specific adapter type: how many rates
 	 * there are and whether they are fixed (as is shown here) or
 	 * dynamic (example chelsio t4).  If there is a maximum number
 	 * of flows the adapter can handle, that too needs to be
 	 * reflected in the max_flows field.
 	 */
 	q->flags = RT_IS_FIXED_TABLE;
 	q->rate_table = adapter_rates_mlx;
 	q->number_of_rates = NUM_HDWR_RATES_MLX;
 	q->min_segment_burst = 1;
 	q->max_flows = 0;	/* mlx has no limit */
 }
 #endif
 
-static void
-mlx5e_snd_tag_free(struct m_snd_tag *pmt)
-{
-
-	switch (pmt->type) {
-#ifdef RATELIMIT
-	case IF_SND_TAG_TYPE_RATE_LIMIT:
-		mlx5e_rl_snd_tag_free(pmt);
-		break;
-#ifdef KERN_TLS
-	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
-		mlx5e_tls_snd_tag_free(pmt);
-		break;
-#endif
-#endif
-	case IF_SND_TAG_TYPE_UNLIMITED:
-		mlx5e_ul_snd_tag_free(pmt);
-		break;
-#ifdef KERN_TLS
-	case IF_SND_TAG_TYPE_TLS:
-		mlx5e_tls_snd_tag_free(pmt);
-		break;
-#endif
-	default:
-		break;
-	}
-}
-
 /*
  * Add one Ethernet media type to the interface's media list in all
  * eight combinations of full duplex, RX pause and TX pause.
  */
 static void
 mlx5e_ifm_add(struct mlx5e_priv *priv, int type)
 {
 	/* option bits per entry, in the order the entries are added */
 	static const int variants[] = {
 		0,
 		IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE,
 		IFM_ETH_RXPAUSE,
 		IFM_ETH_TXPAUSE,
 		IFM_FDX,
 		IFM_FDX | IFM_ETH_RXPAUSE,
 		IFM_FDX | IFM_ETH_TXPAUSE,
 		IFM_FDX | IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE,
 	};
 	unsigned int n;
 
 	for (n = 0; n != sizeof(variants) / sizeof(variants[0]); n++) {
 		ifmedia_add(&priv->media, type | IFM_ETHER | variants[n],
 		    0, NULL);
 	}
 }
 
 /*
  * Create and attach the Ethernet interface for a core device.
  *
  * Allocates the priv together with room for one channel structure per
  * completion vector, allocates and configures the ifnet, builds the
  * sysctl trees, allocates the protection domain, transport domain and
  * memory key, initializes rate limiting and TLS, populates the media
  * list, attaches the interface and registers the event handlers, the
  * statistics and the pfil head.
  *
  * Returns the new priv, or NULL on failure after releasing what had
  * been set up by then.
  */
 static void *
 mlx5e_create_ifp(struct mlx5_core_dev *mdev)
 {
 	struct ifnet *ifp;
 	struct mlx5e_priv *priv;
 	u8 dev_addr[ETHER_ADDR_LEN] __aligned(4);
 	struct sysctl_oid_list *child;
 	int ncv = mdev->priv.eq_table.num_comp_vectors;
 	char unit[16];
 	struct pfil_head_args pa;
 	int err;
 	u32 eth_proto_cap;
 	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
 	bool ext;
 	struct media media_entry = {};
 
 	if (mlx5e_check_required_hca_cap(mdev)) {
 		mlx5_core_dbg(mdev, "mlx5e_check_required_hca_cap() failed\n");
 		return (NULL);
 	}
 
 	/*
 	 * Try to allocate the priv and make room for worst-case
 	 * number of channel structures:
 	 */
 	priv = malloc_domainset(sizeof(*priv) +
 	    (sizeof(priv->channel[0]) * mdev->priv.eq_table.num_comp_vectors),
 	    M_MLX5EN, mlx5_dev_domainset(mdev), M_WAITOK | M_ZERO);
 
 	ifp = priv->ifp = if_alloc_dev(IFT_ETHER, mdev->pdev->dev.bsddev);
 	if (ifp == NULL) {
 		mlx5_core_err(mdev, "if_alloc() failed\n");
 		goto err_free_priv;
 	}
 	/* setup all static fields */
 	if (mlx5e_priv_static_init(priv, mdev, mdev->priv.eq_table.num_comp_vectors)) {
 		mlx5_core_err(mdev, "mlx5e_priv_static_init() failed\n");
 		goto err_free_ifp;
 	}
 
 	ifp->if_softc = priv;
 	if_initname(ifp, "mce", device_get_unit(mdev->pdev->dev.bsddev));
 	ifp->if_mtu = ETHERMTU;
 	ifp->if_init = mlx5e_open;
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST |
 	    IFF_KNOWSEPOCH;
 	ifp->if_ioctl = mlx5e_ioctl;
 	ifp->if_transmit = mlx5e_xmit;
 	ifp->if_qflush = if_qflush;
 #if (__FreeBSD_version >= 1100000)
 	ifp->if_get_counter = mlx5e_get_counter;
 #endif
 	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	/*
 	 * Set driver features
 	 */
 	ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6;
 	ifp->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING;
 	ifp->if_capabilities |= IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER;
 	ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
 	ifp->if_capabilities |= IFCAP_LRO;
 	ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO;
 	ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP;
 	ifp->if_capabilities |= IFCAP_MEXTPG;
 	ifp->if_capabilities |= IFCAP_TXTLS4 | IFCAP_TXTLS6;
 #ifdef RATELIMIT
 	ifp->if_capabilities |= IFCAP_TXRTLMT | IFCAP_TXTLS_RTLMT;
 #endif
 	ifp->if_capabilities |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO;
 	ifp->if_snd_tag_alloc = mlx5e_snd_tag_alloc;
-	ifp->if_snd_tag_free = mlx5e_snd_tag_free;
-	ifp->if_snd_tag_modify = mlx5e_snd_tag_modify;
-	ifp->if_snd_tag_query = mlx5e_snd_tag_query;
 #ifdef RATELIMIT
 	ifp->if_ratelimit_query = mlx5e_ratelimit_query;
 #endif
 	/* set TSO limits so that we don't have to drop TX packets */
 	ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
 	ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */;
 	ifp->if_hw_tsomaxsegsize = MLX5E_MAX_TX_MBUF_SIZE;
 
 	/* every capability is enabled by default */
 	ifp->if_capenable = ifp->if_capabilities;
 	ifp->if_hwassist = 0;
 	if (ifp->if_capenable & IFCAP_TSO)
 		ifp->if_hwassist |= CSUM_TSO;
 	if (ifp->if_capenable & IFCAP_TXCSUM)
 		ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP);
 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
 		ifp->if_hwassist |= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6);
 	if (ifp->if_capabilities & IFCAP_VXLAN_HWCSUM)
 		ifp->if_hwassist |= CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP |
 		    CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP |
 		    CSUM_ENCAP_VXLAN;
 	if (ifp->if_capabilities  & IFCAP_VXLAN_HWTSO)
 		ifp->if_hwassist |= CSUM_INNER_IP6_TSO | CSUM_INNER_IP_TSO;
 
 	/* ifnet sysctl tree */
 	sysctl_ctx_init(&priv->sysctl_ctx);
 	priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dev),
 	    OID_AUTO, ifp->if_dname, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
 	    "MLX5 ethernet - interface name");
 	if (priv->sysctl_ifnet == NULL) {
 		mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n");
 		goto err_free_sysctl;
 	}
 	snprintf(unit, sizeof(unit), "%d", ifp->if_dunit);
 	priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, unit, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
 	    "MLX5 ethernet - interface unit");
 	if (priv->sysctl_ifnet == NULL) {
 		mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n");
 		goto err_free_sysctl;
 	}
 
 	/* HW sysctl tree */
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(mdev->pdev->dev.bsddev));
 	priv->sysctl_hw = SYSCTL_ADD_NODE(&priv->sysctl_ctx, child,
 	    OID_AUTO, "hw", CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
 	    "MLX5 ethernet dev hw");
 	if (priv->sysctl_hw == NULL) {
 		mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n");
 		goto err_free_sysctl;
 	}
 
 	err = mlx5e_build_ifp_priv(mdev, priv, ncv);
 	if (err) {
 		mlx5_core_err(mdev, "mlx5e_build_ifp_priv() failed (%d)\n", err);
 		goto err_free_sysctl;
 	}
 
 	/* reuse mlx5core's watchdog workqueue */
 	priv->wq = mdev->priv.health.wq_watchdog;
 
 	err = mlx5_core_alloc_pd(mdev, &priv->pdn, 0);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5_core_alloc_pd failed, %d\n", err);
 		goto err_free_wq;
 	}
 	err = mlx5_alloc_transport_domain(mdev, &priv->tdn, 0);
 	if (err) {
 		mlx5_en_err(ifp,
 		    "mlx5_alloc_transport_domain failed, %d\n", err);
 		goto err_dealloc_pd;
 	}
 	err = mlx5e_create_mkey(priv, priv->pdn, &priv->mr);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5e_create_mkey failed, %d\n", err);
 		goto err_dealloc_transport_domain;
 	}
 	mlx5_query_nic_vport_mac_address(priv->mdev, 0, dev_addr);
 
 	/* check if we should generate a random MAC address */
 	if (MLX5_CAP_GEN(priv->mdev, vport_group_manager) == 0 &&
 	    is_zero_ether_addr(dev_addr)) {
 		random_ether_addr(dev_addr);
 		mlx5_en_err(ifp, "Assigned random MAC address\n");
 	}
 
 	err = mlx5e_rl_init(priv);
 	if (err) {
 		mlx5_en_err(ifp, "mlx5e_rl_init failed, %d\n", err);
 		goto err_create_mkey;
 	}
 
 	err = mlx5e_tls_init(priv);
 	if (err) {
 		if_printf(ifp, "%s: mlx5e_tls_init failed\n", __func__);
 		goto err_rl_init;
 	}
 
 	/* set default MTU */
 	mlx5e_set_dev_port_mtu(ifp, ifp->if_mtu);
 
 	/* Set default media status */
 	priv->media_status_last = IFM_AVALID;
 	priv->media_active_last = IFM_ETHER | IFM_AUTO | IFM_FDX;
 
 	/* setup default pauseframes configuration */
 	mlx5e_setup_pauseframes(priv);
 
 	/*
 	 * Setup supported medias.  Keep the query result in "err" so that
 	 * the failure message reports the actual error instead of the
 	 * stale value left over from mlx5e_tls_init().  A failed query is
 	 * not fatal; the media list then only gets the autoselect entries.
 	 */
 	err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1);
 	if (err == 0) {
 		ext = MLX5_CAP_PCAM_FEATURE(mdev,
 		    ptys_extended_ethernet);
 		eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
 		    eth_proto_capability);
 	} else {
 		ext = false;
 		eth_proto_cap = 0;
 		mlx5_en_err(ifp, "Query port media capability failed, %d\n", err);
 	}
 
 	ifmedia_init(&priv->media, IFM_IMASK,
 	    mlx5e_media_change, mlx5e_media_status);
 
 	if (ext) {
 		for (unsigned i = 0; i != MLX5E_EXT_LINK_SPEEDS_NUMBER; i++) {
 			/* check if hardware has the right capability */
 			if (MLX5E_PROT_MASK(i) & ~eth_proto_cap)
 				continue;
 			for (unsigned j = 0; j != MLX5E_CABLE_TYPE_NUMBER; j++) {
 				media_entry = mlx5e_ext_mode_table[i][j];
 				if (media_entry.subtype == 0)
 					continue;
 				/* check if this subtype was already added */
 				for (unsigned k = 0; k != i; k++) {
 					/* check if hardware has the right capability */
 					if (MLX5E_PROT_MASK(k) & ~eth_proto_cap)
 						continue;
 					for (unsigned m = 0; m != MLX5E_CABLE_TYPE_NUMBER; m++) {
 						if (media_entry.subtype == mlx5e_ext_mode_table[k][m].subtype)
 							goto skip_ext_media;
 					}
 				}
 				mlx5e_ifm_add(priv, media_entry.subtype);
 			skip_ext_media:;
 			}
 		}
 	} else {
 		for (unsigned i = 0; i != MLX5E_LINK_SPEEDS_NUMBER; i++) {
 			media_entry = mlx5e_mode_table[i];
 			if (media_entry.subtype == 0)
 				continue;
 			if (MLX5E_PROT_MASK(i) & ~eth_proto_cap)
 				continue;
 			/* check if this subtype was already added */
 			for (unsigned k = 0; k != i; k++) {
 				if (media_entry.subtype == mlx5e_mode_table[k].subtype)
 					goto skip_media;
 			}
 			mlx5e_ifm_add(priv, media_entry.subtype);
 
 			/* NOTE: 10G ER and LR shares the same entry */
 			if (media_entry.subtype == IFM_10G_ER)
 				mlx5e_ifm_add(priv, IFM_10G_LR);
 		skip_media:;
 		}
 	}
 
 	mlx5e_ifm_add(priv, IFM_AUTO);
 
 	/* Set autoselect by default */
 	ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO | IFM_FDX |
 	    IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE);
 
 	DEBUGNET_SET(ifp, mlx5_en);
 
 	ether_ifattach(ifp, dev_addr);
 
 	/* Register for VLAN events */
 	priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
 	    mlx5e_vlan_rx_add_vid, priv, EVENTHANDLER_PRI_FIRST);
 	priv->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
 	    mlx5e_vlan_rx_kill_vid, priv, EVENTHANDLER_PRI_FIRST);
 
 	/* Register for VxLAN events */
 	priv->vxlan_start = EVENTHANDLER_REGISTER(vxlan_start,
 	    mlx5e_vxlan_start, priv, EVENTHANDLER_PRI_ANY);
 	priv->vxlan_stop = EVENTHANDLER_REGISTER(vxlan_stop,
 	    mlx5e_vxlan_stop, priv, EVENTHANDLER_PRI_ANY);
 
 	/* Link is down by default */
 	if_link_state_change(ifp, LINK_STATE_DOWN);
 
 	mlx5e_enable_async_events(priv);
 
 	mlx5e_add_hw_stats(priv);
 
 	mlx5e_create_stats(&priv->stats.vport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    "vstats", mlx5e_vport_stats_desc, MLX5E_VPORT_STATS_NUM,
 	    priv->stats.vport.arg);
 
 	mlx5e_create_stats(&priv->stats.pport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    "pstats", mlx5e_pport_stats_desc, MLX5E_PPORT_STATS_NUM,
 	    priv->stats.pport.arg);
 
 	mlx5e_create_ethtool(priv);
 
 	mtx_lock(&priv->async_events_mtx);
 	mlx5e_update_stats(priv);
 	mtx_unlock(&priv->async_events_mtx);
 
 	SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
 	    OID_AUTO, "rx_clbr_done", CTLFLAG_RD,
 	    &priv->clbr_done, 0,
 	    "RX timestamps calibration state");
 	callout_init(&priv->tstmp_clbr, CALLOUT_DIRECT);
 	mlx5e_reset_calibration_callout(priv);
 
 	/* register an inbound Ethernet pfil head named after the interface */
 	pa.pa_version = PFIL_VERSION;
 	pa.pa_flags = PFIL_IN;
 	pa.pa_type = PFIL_TYPE_ETHERNET;
 	pa.pa_headname = ifp->if_xname;
 	priv->pfil = pfil_head_register(&pa);
 
 	return (priv);
 
 	/* error unwinding; each label falls through into the ones below it */
 err_rl_init:
 	mlx5e_rl_cleanup(priv);
 
 err_create_mkey:
 	mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
 
 err_dealloc_transport_domain:
 	mlx5_dealloc_transport_domain(mdev, priv->tdn, 0);
 
 err_dealloc_pd:
 	mlx5_core_dealloc_pd(mdev, priv->pdn, 0);
 
 err_free_wq:
 	flush_workqueue(priv->wq);
 
 err_free_sysctl:
 	sysctl_ctx_free(&priv->sysctl_ctx);
 	if (priv->sysctl_debug)
 		sysctl_ctx_free(&priv->stats.port_stats_debug.ctx);
 	mlx5e_priv_static_destroy(priv, mdev, mdev->priv.eq_table.num_comp_vectors);
 
 err_free_ifp:
 	if_free(ifp);
 
 err_free_priv:
 	free(priv, M_MLX5EN);
 	return (NULL);
 }
 
 /*
  * Detach and free the interface created by mlx5e_create_ifp().
  *
  * Marks the priv as gone, waits for outstanding rate limit, TLS and
  * unlimited send tag users, drains the callouts, deregisters the event
  * handlers, closes the device, detaches the ifnet and then releases
  * the sysctl contexts, hardware objects, static state and memory.
  */
 static void
 mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vpriv)
 {
 	struct mlx5e_priv *priv = vpriv;
 	struct ifnet *ifp = priv->ifp;
 
 	/* don't allow more IOCTLs */
 	priv->gone = 1;
 
 	/* XXX wait a bit to allow IOCTL handlers to complete */
 	pause("W", hz);
 
 #ifdef RATELIMIT
 	/*
 	 * The kernel can have reference(s) via the m_snd_tag's into
 	 * the ratelimit channels, and these must go away before
 	 * detaching:
 	 */
 	while (READ_ONCE(priv->rl.stats.tx_active_connections) != 0) {
 		mlx5_en_err(priv->ifp,
 		    "Waiting for all ratelimit connections to terminate\n");
 		pause("W", hz);
 	}
 #endif
 
 #ifdef KERN_TLS
 	/* wait for all TLS tags to get freed */
 	while (priv->tls.init != 0 &&
 	    uma_zone_get_cur(priv->tls.zone) != 0)  {
 		mlx5_en_err(priv->ifp,
 		    "Waiting for all TLS connections to terminate\n");
 		pause("W", hz);
 	}
 #endif
 	/* wait for all unlimited send tags to complete */
 	mlx5e_priv_wait_for_completion(priv, mdev->priv.eq_table.num_comp_vectors);
 
 	/* stop watchdog timer */
 	callout_drain(&priv->watchdog);
 
 	/* stop the timestamp calibration callout */
 	callout_drain(&priv->tstmp_clbr);
 
 	/* deregister whichever VLAN and VxLAN event handlers were registered */
 	if (priv->vlan_attach != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach);
 	if (priv->vlan_detach != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach);
 	if (priv->vxlan_start != NULL)
 		EVENTHANDLER_DEREGISTER(vxlan_start, priv->vxlan_start);
 	if (priv->vxlan_stop != NULL)
 		EVENTHANDLER_DEREGISTER(vxlan_stop, priv->vxlan_stop);
 
 	/* make sure device gets closed */
 	PRIV_LOCK(priv);
 	mlx5e_close_locked(ifp);
 	PRIV_UNLOCK(priv);
 
 	/* deregister pfil */
 	if (priv->pfil != NULL) {
 		pfil_head_unregister(priv->pfil);
 		priv->pfil = NULL;
 	}
 
 	/* unregister device */
 	ifmedia_removeall(&priv->media);
 	ether_ifdetach(ifp);
 
 	/* TLS is cleaned up before rate limiting, the reverse of setup */
 	mlx5e_tls_cleanup(priv);
 	mlx5e_rl_cleanup(priv);
 
 	/* destroy all remaining sysctl nodes */
 	sysctl_ctx_free(&priv->stats.vport.ctx);
 	sysctl_ctx_free(&priv->stats.pport.ctx);
 	if (priv->sysctl_debug)
 		sysctl_ctx_free(&priv->stats.port_stats_debug.ctx);
 	sysctl_ctx_free(&priv->sysctl_ctx);
 
 	/* release the memory key, transport domain and protection domain */
 	mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
 	mlx5_dealloc_transport_domain(priv->mdev, priv->tdn, 0);
 	mlx5_core_dealloc_pd(priv->mdev, priv->pdn, 0);
 	mlx5e_disable_async_events(priv);
 	flush_workqueue(priv->wq);
 	mlx5e_priv_static_destroy(priv, mdev, mdev->priv.eq_table.num_comp_vectors);
 	if_free(ifp);
 	free(priv, M_MLX5EN);
 }
 
 #ifdef DEBUGNET
 /*
  * Report the debugnet receive parameters of the interface: the number
  * of receive rings, the number of clusters and the cluster size.
  */
 static void
 mlx5_en_debugnet_init(struct ifnet *dev, int *nrxr, int *ncl, int *clsize)
 {
 	struct mlx5e_priv *softc = if_getsoftc(dev);
 
 	/* compile time constants, independent of the current configuration */
 	*clsize = MLX5E_MAX_RX_BYTES;
 	*ncl = DEBUGNET_MAX_IN_FLIGHT;
 
 	/* the channel count is sampled while holding the private lock */
 	PRIV_LOCK(softc);
 	*nrxr = softc->params.num_channels;
 	PRIV_UNLOCK(softc);
 }
 
 /*
  * Debugnet event callback. Intentionally empty: no action is taken
  * for any value of "event".
  */
 static void
 mlx5_en_debugnet_event(struct ifnet *dev, enum debugnet_ev event)
 {
 }
 
 /*
  * Debugnet transmit callback: send mbuf "m" on the first send queue
  * of the first channel.
  *
  * Returns zero on success, ENOENT when the interface or the send
  * queue is not usable and ENOBUFS when the packet could not be
  * queued.
  */
 static int
 mlx5_en_debugnet_transmit(struct ifnet *dev, struct mbuf *m)
 {
 	struct mlx5e_priv *priv = if_getsoftc(dev);
 	struct mlx5e_sq *txq;
 	int retval;
 
 	/* IFF_DRV_RUNNING must be set and IFF_DRV_OACTIVE must be clear */
 	if ((if_getdrvflags(dev) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 	    IFF_DRV_RUNNING)
 		return (ENOENT);
 
 	/* the last media status must report IFM_ACTIVE */
 	if ((priv->media_status_last & IFM_ACTIVE) == 0)
 		return (ENOENT);
 
 	txq = &priv->channel[0].sq[0];
 
 	/* drop the packet when the send queue is stopped */
 	if (txq->running == 0) {
 		m_freem(m);
 		return (ENOENT);
 	}
 
 	retval = 0;
 	if (mlx5e_sq_xmit(txq, &m) != 0) {
 		m_freem(m);
 		retval = ENOBUFS;
 	}
 
 	/* write any pending doorbell, regardless of the transmit result */
 	if (likely(txq->doorbell.d64 != 0)) {
 		mlx5e_tx_notify_hw(txq, txq->doorbell.d32);
 		txq->doorbell.d64 = 0;
 	}
 	return (retval);
 }
 
 /*
  * Debugnet poll callback: poll the interrupts of the core device.
  *
  * Returns ENOENT when the interface is not running or the last media
  * status is not active, else zero. The "count" argument is not used.
  */
 static int
 mlx5_en_debugnet_poll(struct ifnet *dev, int count)
 {
 	struct mlx5e_priv *softc = if_getsoftc(dev);
 
 	if ((if_getdrvflags(dev) & IFF_DRV_RUNNING) == 0)
 		return (ENOENT);
 	if ((softc->media_status_last & IFM_ACTIVE) == 0)
 		return (ENOENT);
 
 	mlx5_poll_interrupts(softc->mdev);
 	return (0);
 }
 #endif /* DEBUGNET */
 
 /*
  * Return the ifnet pointer stored in the private data "vpriv", which
  * is interpreted as a pointer to struct mlx5e_priv.
  */
 static void *
 mlx5e_get_ifp(void *vpriv)
 {
 	return (((struct mlx5e_priv *)vpriv)->ifp);
 }
 
 /*
  * Interface description passed to mlx5_register_interface() and
  * mlx5_unregister_interface() below. It names the add, remove, event
  * and get_dev callbacks and the protocol (Ethernet).
  */
 static struct mlx5_interface mlx5e_interface = {
 	.add = mlx5e_create_ifp,
 	.remove = mlx5e_destroy_ifp,
 	.event = mlx5e_async_event,
 	.protocol = MLX5_INTERFACE_PROTOCOL_ETH,
 	.get_dev = mlx5e_get_ifp,
 };
 
 /* Module init routine: register the mlx5e interface description. */
 void
 mlx5e_init(void)
 {
 	mlx5_register_interface(&mlx5e_interface);
 }
 
 /* Module exit routine: unregister the mlx5e interface description. */
 void
 mlx5e_cleanup(void)
 {
 	mlx5_unregister_interface(&mlx5e_interface);
 }
 
 /* hook up the init and exit routines, both at SI_ORDER_SIXTH */
 module_init_order(mlx5e_init, SI_ORDER_SIXTH);
 module_exit_order(mlx5e_cleanup, SI_ORDER_SIXTH);
 
 /* the linuxkpi dependency is only declared when __FreeBSD_version >= 1100000 */
 #if (__FreeBSD_version >= 1100000)
 MODULE_DEPEND(mlx5en, linuxkpi, 1, 1, 1);
 #endif
 MODULE_DEPEND(mlx5en, mlx5, 1, 1, 1);
 MODULE_VERSION(mlx5en, 1);
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
index 43532c4d0cc0..a95a227e639d 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
@@ -1,1572 +1,1581 @@
 /*-
  * Copyright (c) 2016-2020 Mellanox Technologies. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "en.h"
 
 #ifdef RATELIMIT
 
 static int mlx5e_rl_open_workers(struct mlx5e_priv *);
 static void mlx5e_rl_close_workers(struct mlx5e_priv *);
 static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
 static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
     struct sysctl_oid *, const char *name, const char *desc);
 static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
       struct sysctl_oid *node, const char *name, const char *desc);
 static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
 static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
+static if_snd_tag_modify_t mlx5e_rl_snd_tag_modify;
+static if_snd_tag_query_t mlx5e_rl_snd_tag_query;
+static if_snd_tag_free_t mlx5e_rl_snd_tag_free;
+
+static const struct if_snd_tag_sw mlx5e_rl_snd_tag_sw = {	/* send tag methods for rate limit tags */
+	.snd_tag_modify = mlx5e_rl_snd_tag_modify,
+	.snd_tag_query = mlx5e_rl_snd_tag_query,
+	.snd_tag_free = mlx5e_rl_snd_tag_free,
+	.type = IF_SND_TAG_TYPE_RATE_LIMIT	/* tag type of all tags using this table */
+};
 
 static void
 mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
     struct mlx5e_sq_param *param)
 {
 	void *sqc = param->sqc;
 	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
 	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
 
 	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
 	MLX5_SET(wq, wq, pd, rl->priv->pdn);
 
 	param->wq.linear = 1;
 }
 
 static void
 mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
     struct mlx5e_cq_param *param)
 {
 	void *cqc = param->cqc;
 	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
 
 	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
 	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
 	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
 	MLX5_SET(cqc, cqc, uar_page, rl->priv->mdev->priv.uar->index);
 
 	switch (rl->param.tx_coalesce_mode) {
 	case 0:
 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	default:
 		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
 		else
 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 		break;
 	}
 }
 
 /*
  * Reset "cparam" to zero and build the completion queue and send
  * queue parameters of a rate limit channel.
  */
 static void
 mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
     struct mlx5e_rl_channel_param *cparam)
 {
 	memset(cparam, 0, sizeof(*cparam));
 
 	/* the two parts are filled in independently of each other */
 	mlx5e_rl_build_cq_param(rl, &cparam->cq);
 	mlx5e_rl_build_sq_param(rl, &cparam->sq);
 }
 
 /*
  * Allocate the software state of a rate limit send queue: the DMA
  * tag, the cyclic work queue and the send queue database.
  *
  * Returns zero on success, else the error of the failing step after
  * undoing the steps already done. The return value of
  * bus_dma_tag_create() is negated before it is stored in "err". The
  * "ix" argument is not used by this function.
  */
 static int
 mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
     struct mlx5e_sq_param *param, int ix)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	void *sqc = param->sqc;
 	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
 	int err;
 
 	/* Create DMA descriptor TAG */
 	if ((err = -bus_dma_tag_create(
 	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
 	    1,				/* any alignment */
 	    0,				/* no boundary */
 	    BUS_SPACE_MAXADDR,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,		/* highaddr */
 	    NULL, NULL,			/* filter, filterarg */
 	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
 	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
 	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
 	    0,				/* flags */
 	    NULL, NULL,			/* lockfunc, lockfuncarg */
 	    &sq->dma_tag)))
 		goto done;
 
 	/* link the send queue to its interface and memory key */
 	sq->mkey_be = cpu_to_be32(priv->mr.key);
 	sq->ifp = priv->ifp;
 	sq->priv = priv;
 
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
 	    &sq->wq_ctrl);
 	if (err)
 		goto err_free_dma_tag;
 
 	/* point the doorbell record at the send (MLX5_SND_DBR) entry */
 	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
 
 	err = mlx5e_alloc_sq_db(sq);
 	if (err)
 		goto err_sq_wq_destroy;
 
 	mlx5e_update_sq_inline(sq);
 
 	return (0);
 
 	/* unwind in reverse order of allocation */
 err_sq_wq_destroy:
 	mlx5_wq_destroy(&sq->wq_ctrl);
 err_free_dma_tag:
 	bus_dma_tag_destroy(sq->dma_tag);
 done:
 	return (err);
 }
 
 /*
  * Release the software state of a rate limit send queue in the
  * reverse order of mlx5e_rl_create_sq(): database, work queue and
  * DMA tag.
  */
 static void
 mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
 {
 
 	mlx5e_free_sq_db(sq);
 	mlx5_wq_destroy(&sq->wq_ctrl);
 	bus_dma_tag_destroy(sq->dma_tag);
 }
 
 /*
  * Create and enable a rate limit send queue, move it from the reset
  * to the ready state and mark it as running.
  *
  * Returns zero on success. On failure the steps already done are
  * undone and the error of the failing step is returned.
  */
 static int
 mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
     struct mlx5e_sq_param *param, int ix)
 {
 	int err;
 
 	err = mlx5e_rl_create_sq(priv, sq, param, ix);
 	if (err)
 		return (err);
 
 	err = mlx5e_enable_sq(sq, param, &priv->channel[ix].bfreg, priv->rl.tisn);
 	if (!err) {
 		err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
 		if (!err) {
 			WRITE_ONCE(sq->running, 1);
 			return (0);
 		}
 		/* the state change failed, disable the queue again */
 		mlx5e_disable_sq(sq);
 	}
 
 	/* common failure path: release what mlx5e_rl_create_sq() set up */
 	mlx5e_rl_destroy_sq(sq);
 
 	return (err);
 }
 
 /*
  * Initialize the mutexes, the completion event callout and the
  * completion event factor of a rate limit send queue.
  */
 static void
 mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
 {
 	/* copy the TX completion event factor and make sure it is not zero */
 	sq->cev_factor = priv->rl.param.tx_completion_fact;
 	if (sq->cev_factor == 0)
 		sq->cev_factor = 1;
 
 	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
 	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);
 
 	/* the callout is associated with the send queue lock */
 	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
 }
 
 /*
  * Allocate and open a rate limit channel, which consists of a send
  * queue and its TX completion queue on completion vector "eq_ix".
  *
  * On success the new send queue is stored in "*ppsq" and zero is
  * returned. On failure everything is released, the
  * tx_allocate_resource_failure counter is incremented and the error
  * is returned; "*ppsq" is not written in that case.
  */
 static int
 mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
     struct mlx5e_rl_channel_param *cparam,
     struct mlx5e_sq *volatile *ppsq)
 {
 	struct mlx5e_priv *priv = rlw->priv;
 	struct mlx5e_sq *sq;
 	int err;
 
 	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);
 
 	/* the mutexes must exist before the queues are opened */
 	mlx5e_rl_chan_mtx_init(priv, sq);
 
 	/* TX completion queue first, then the send queue */
 	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
 	    &mlx5e_tx_cq_comp, eq_ix);
 	if (!err) {
 		err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
 		if (!err) {
 			/* publish the TX channel pointer */
 			*ppsq = sq;
 
 			/* run the completion handler once initially */
 			sq->cq.mcq.comp(&sq->cq.mcq, NULL);
 
 			return (0);
 		}
 		mlx5e_close_cq(&sq->cq);
 	}
 
 	/* failure: destroy the mutexes and release the memory */
 	mtx_destroy(&sq->lock);
 	mtx_destroy(&sq->comp_lock);
 	free(sq, M_MLX5EN);
 	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
 	return (err);
 }
 
 /*
  * Close the rate limit channel referenced by "*ppsq", if any, and
  * free its memory. The pointer is set to NULL before the teardown
  * starts, so calling the function again on the same location is a
  * no-op.
  */
 static void
 mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
 {
 	struct mlx5e_sq *sq = *ppsq;
 
 	/* check if channel is already closed */
 	if (sq == NULL)
 		return;
 	/* ensure channel pointer is no longer used */
 	*ppsq = NULL;
 
 	/* teardown and destroy SQ */
 	mlx5e_drain_sq(sq);
 	mlx5e_disable_sq(sq);
 	mlx5e_rl_destroy_sq(sq);
 
 	/* close CQ */
 	mlx5e_close_cq(&sq->cq);
 
 	/* destroy mutexes */
 	mtx_destroy(&sq->lock);
 	mtx_destroy(&sq->comp_lock);
 
 	free(sq, M_MLX5EN);
 }
 
 /*
  * Recompute the maximum TX completion factor from the current TX
  * queue size and clamp the configured completion factor to the range
  * from one to that maximum.
  */
 static void
 mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
 {
 	uint64_t limit;
 
 	/*
 	 * Completion events may be at most half a TX queue apart. A
 	 * single IP packet can consume up to MLX5_SEND_WQE_MAX_WQEBBS
 	 * queue entries, which gives the worst case value below:
 	 */
 	limit = rl->param.tx_queue_size /
 	    (2 * MLX5_SEND_WQE_MAX_WQEBBS);
 
 	/* keep the maximum within 1 ... 65535, so that it fits 16 bits */
 	if (limit > 65535)
 		limit = 65535;
 	else if (limit < 1)
 		limit = 1;
 
 	/* the TX queue size may have changed, so store the new maximum */
 	rl->param.tx_completion_fact_max = limit;
 
 	/* bring the current TX completion factor within the limits */
 	if (rl->param.tx_completion_fact < 1)
 		rl->param.tx_completion_fact = 1;
 	else if (rl->param.tx_completion_fact > limit)
 		rl->param.tx_completion_fact = limit;
 }
 
 /*
  * Issue a modify SQ command which keeps the send queue in the ready
  * state and sets its packet pacing rate limit index to "rl_index".
  *
  * Returns -ENOMEM when the command buffer cannot be allocated, else
  * the return value of mlx5_core_modify_sq().
  */
 static int
 mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
 {
 	struct mlx5_core_dev *mdev = sq->priv->mdev;
 	const int inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
 	void *in;
 	void *sqc;
 	int err;
 
 	in = mlx5_vzalloc(inlen);
 	if (in == NULL)
 		return (-ENOMEM);
 
 	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
 
 	/* ready to ready transition, only the rate limit index changes */
 	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
 	MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
 	MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
 	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
 	MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);
 
 	err = mlx5_core_modify_sq(mdev, in, inlen);
 
 	/* the command buffer is not needed after the command completed */
 	kvfree(in);
 
 	return (err);
 }
 
 /*
  * This function will search the configured rate limit table for the
  * best match to avoid that a single socket based application can
  * allocate all the available hardware rates. If the user selected
  * rate deviates too much from the closest rate available in the rate
  * limit table, unlimited rate will be selected.
  *
  * "rl" is the rate limit state whose table and parameters are read.
  * "user_rate" is the requested rate, in the same unit as the table
  * entries.
  *
  * Returns the closest non-zero table entry, or zero (unlimited) when
  * the table has no non-zero entry or the closest entry is further
  * away than tx_allowed_deviation / 1000 of the requested rate, with
  * the requested rate clamped to tx_limit_max for this computation.
  */
 static uint64_t
 mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
 {
 	uint64_t distance = -1ULL;
 	uint64_t diff;
 	uint64_t retval = 0;		/* unlimited */
 	uint64_t scaled;
 	uint64_t threshold;
 	uint64_t x;
 
 	/* search for closest rate */
 	for (x = 0; x != rl->param.tx_rates_def; x++) {
 		uint64_t rate = rl->rate_limit_table[x];
 
 		/* zero marks an unused table entry */
 		if (rate == 0)
 			continue;
 
 		if (rate > user_rate)
 			diff = rate - user_rate;
 		else
 			diff = user_rate - rate;
 
 		/* check if distance is smaller than previous rate */
 		if (diff < distance) {
 			distance = diff;
 			retval = rate;
 		}
 	}
 
 	/* range check for multiplication below */
 	if (user_rate > rl->param.tx_limit_max)
 		user_rate = rl->param.tx_limit_max;
 
 	/*
 	 * Compute the allowed deviation, rounded up. The rounding is
 	 * done after the division, because adding 999 to the product
 	 * first, like howmany() does, can wrap around when the
 	 * product is close to the 64-bit maximum.
 	 *
 	 * NOTE(review): this assumes tx_allowed_deviation is kept at
 	 * or below tx_allowed_deviation_max, so that the
 	 * multiplication itself cannot overflow - confirm against
 	 * the sysctl handler.
 	 */
 	scaled = user_rate * rl->param.tx_allowed_deviation;
 	threshold = scaled / 1000ULL;
 	if ((scaled % 1000ULL) != 0)
 		threshold++;
 
 	/* fallback to unlimited, if rate deviates too much */
 	if (distance > threshold)
 		retval = 0;
 
 	return (retval);
 }
 
 /*
  * This function sets the requested rate for a rate limit channel, in
  * bits per second. The requested rate will be filtered through the
  * find best rate function above.
  *
  * The worker lock must be held on entry and is held again on return,
  * but it is dropped and re-acquired while the function runs. A
  * "rate" of zero selects rate index zero, which is used for
  * unlimited. The reference on the previously set rate, if any, is
  * released.
  *
  * Returns the negated result of mlx5e_rl_modify_sq(), or zero when
  * the channel has no running send queue.
  */
 static int
 mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
     struct mlx5e_rl_channel *channel, uint64_t rate)
 {
 	struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
 	struct mlx5e_sq *sq;
 	uint64_t temp;
 	uint16_t index;
 	uint16_t burst;
 	int error;
 
 	if (rate != 0) {
 		/* the rate lookup below runs without the worker lock */
 		MLX5E_RL_WORKER_UNLOCK(rlw);
 
 		MLX5E_RL_RLOCK(rl);
 
 		/* get current burst size in bytes */
 		temp = rl->param.tx_burst_size *
 		    MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu);
 
 		/* limit burst size to 64K currently */
 		if (temp > 65535)
 			temp = 65535;
 		burst = temp;
 
 		/* find best rate */
 		rate = mlx5e_rl_find_best_rate_locked(rl, rate);
 
 		MLX5E_RL_RUNLOCK(rl);
 
 		if (rate == 0) {
 			/* rate doesn't exist, fallback to unlimited */
 			index = 0;
 			rate = 0;
 			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
 		} else {
 			/* get a reference on the new rate */
 			error = -mlx5_rl_add_rate(rlw->priv->mdev,
 			    howmany(rate, 1000), burst, &index);
 
 			if (error != 0) {
 				/* adding rate failed, fallback to unlimited */
 				index = 0;
 				rate = 0;
 				atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
 			}
 		}
 		MLX5E_RL_WORKER_LOCK(rlw);
 	} else {
 		index = 0;
 		burst = 0;	/* default */
 	}
 
 	/* atomically swap rates */
 	temp = channel->last_rate;
 	channel->last_rate = rate;
 	rate = temp;
 
 	/* atomically swap burst size */
 	temp = channel->last_burst;
 	channel->last_burst = burst;
 	burst = temp;
 
 	/* from here on "rate" and "burst" hold the previous values */
 	MLX5E_RL_WORKER_UNLOCK(rlw);
 	/* put reference on the old rate, if any */
 	if (rate != 0) {
 		mlx5_rl_remove_rate(rlw->priv->mdev,
 		    howmany(rate, 1000), burst);
 	}
 
 	/* set new rate, if SQ is running */
 	sq = channel->sq;
 	if (sq != NULL && READ_ONCE(sq->running) != 0) {
 		error = mlx5e_rl_modify_sq(sq, index);
 		if (error != 0)
 			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
 	} else
 		error = 0;
 	/* re-acquire the worker lock for the caller */
 	MLX5E_RL_WORKER_LOCK(rlw);
 
 	return (-error);
 }
 
 /*
  * Main function of a rate limit worker thread. "arg" is the
  * struct mlx5e_rl_worker served by this thread.
  *
  * The thread opens the send queues of its channels, then processes
  * the channels queued on "process_head" until "worker_done" is set.
  * Before exiting it closes all send queues, clears "worker_done" and
  * wakes up the waiters on the worker condition variable.
  */
 static void
 mlx5e_rl_worker(void *arg)
 {
 	struct thread *td;
 	struct mlx5e_rl_worker *rlw = arg;
 	struct mlx5e_rl_channel *channel;
 	struct mlx5e_priv *priv;
 	unsigned ix;
 	uint64_t x;
 	int error;
 
 	/* set thread priority */
 	td = curthread;
 
 	thread_lock(td);
 	sched_prio(td, PI_SWI(SWI_NET));
 	thread_unlock(td);
 
 	priv = rlw->priv;
 
 	/* compute completion vector */
 	ix = (rlw - priv->rl.workers) %
 	    priv->mdev->priv.eq_table.num_comp_vectors;
 
 	/* TODO bind to CPU */
 
 	/* open all the SQs */
 	MLX5E_RL_WORKER_LOCK(rlw);
 	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
 		struct mlx5e_rl_channel *channel = rlw->channels + x;
 
 #if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
 		/* without pre-allocation, free channels get no SQ here */
 		if (channel->state == MLX5E_RL_ST_FREE)
 			continue;
 #endif
 		MLX5E_RL_WORKER_UNLOCK(rlw);
 
 		MLX5E_RL_RLOCK(&priv->rl);
 		error = mlx5e_rl_open_channel(rlw, ix,
 		    &priv->rl.chan_param, &channel->sq);
 		MLX5E_RL_RUNLOCK(&priv->rl);
 
 		/*
 		 * NOTE(review): tx_open_queues is not incremented for
 		 * the SQs opened here, while the close loop at the end
 		 * of this function decrements it for every open SQ -
 		 * confirm that this accounting is intended.
 		 */
 		MLX5E_RL_WORKER_LOCK(rlw);
 		if (error != 0) {
 			mlx5_en_err(priv->ifp,
 			    "mlx5e_rl_open_channel failed: %d\n", error);
 			break;
 		}
 		mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
 	}
 	/* main loop, entered with the worker lock held */
 	while (1) {
 		if (STAILQ_FIRST(&rlw->process_head) == NULL) {
 			/* check if we are tearing down */
 			if (rlw->worker_done != 0)
 				break;
 			cv_wait(&rlw->cv, &rlw->mtx);
 		}
 		/* check if we are tearing down */
 		if (rlw->worker_done != 0)
 			break;
 		channel = STAILQ_FIRST(&rlw->process_head);
 		if (channel != NULL) {
 			STAILQ_REMOVE_HEAD(&rlw->process_head, entry);
 
 			switch (channel->state) {
 			case MLX5E_RL_ST_MODIFY:
 				channel->state = MLX5E_RL_ST_USED;
 				MLX5E_RL_WORKER_UNLOCK(rlw);
 
 				/* create channel by demand */
 				if (channel->sq == NULL) {
 					MLX5E_RL_RLOCK(&priv->rl);
 					error = mlx5e_rl_open_channel(rlw, ix,
 					    &priv->rl.chan_param, &channel->sq);
 					MLX5E_RL_RUNLOCK(&priv->rl);
 
 					if (error != 0) {
 						mlx5_en_err(priv->ifp,
 						    "mlx5e_rl_open_channel failed: %d\n", error);
 					} else {
 						atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
 					}
 				} else {
 					mlx5e_resume_sq(channel->sq);
 				}
 
 				MLX5E_RL_WORKER_LOCK(rlw);
 				/* convert from bytes/s to bits/s and set new rate */
 				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
 				    channel->new_rate * 8ULL);
 				if (error != 0) {
 					mlx5_en_err(priv->ifp,
 					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
 					    error);
 				}
 				break;
 
 			case MLX5E_RL_ST_DESTROY:
 				/* release the rate reference by selecting rate zero */
 				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
 				if (error != 0) {
 					mlx5_en_err(priv->ifp,
 					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
 					    error);
 				}
 				if (channel->sq != NULL) {
 					/*
 					 * Make sure all packets are
 					 * transmitted before SQ is
 					 * returned to free list:
 					 */
 					MLX5E_RL_WORKER_UNLOCK(rlw);
 					mlx5e_drain_sq(channel->sq);
 					MLX5E_RL_WORKER_LOCK(rlw);
 				}
 				/* put the channel back into the free list */
 				STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
 				channel->state = MLX5E_RL_ST_FREE;
 				atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
 				break;
 			default:
 				/* NOP */
 				break;
 			}
 		}
 	}
 
 	/* close all the SQs */
 	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
 		struct mlx5e_rl_channel *channel = rlw->channels + x;
 
 		/* update the initial rate */
 		channel->init_rate = channel->last_rate;
 
 		/* make sure we free up the rate resource */
 		mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
 
 		if (channel->sq != NULL) {
 			MLX5E_RL_WORKER_UNLOCK(rlw);
 			mlx5e_rl_close_channel(&channel->sq);
 			atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
 			MLX5E_RL_WORKER_LOCK(rlw);
 		}
 	}
 
 	/* tell mlx5e_rl_close_workers() that this thread is done */
 	rlw->worker_done = 0;
 	cv_broadcast(&rlw->cv);
 	MLX5E_RL_WORKER_UNLOCK(rlw);
 
 	kthread_exit();
 }
 
 /*
  * Create the TIS used by the rate limit send queues, with priority
  * zero and the transport domain of "priv". The TIS number is stored
  * in "priv->rl.tisn".
  *
  * Returns the result of mlx5_core_create_tis().
  */
 static int
 mlx5e_rl_open_tis(struct mlx5e_priv *priv)
 {
 	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
 	void *tisc;
 
 	/* start from an all-zero command */
 	memset(in, 0, sizeof(in));
 
 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
 	MLX5_SET(tisc, tisc, prio, 0);
 	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);
 
 	return (mlx5_core_create_tis(priv->mdev, in, sizeof(in), &priv->rl.tisn));
 }
 
 /* Destroy the TIS created by mlx5e_rl_open_tis(). */
 static void
 mlx5e_rl_close_tis(struct mlx5e_priv *priv)
 {
 	mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn, 0);
 }
 
 /*
  * Set the default values of the rate limit parameters in "param",
  * based on compile time constants and on the limits read from "mdev".
  */
 static void
 mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
     struct mlx5_core_dev *mdev)
 {
 	/* ratelimit workers */
 	param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
 	param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;
 
 	/* range check */
 	if (param->tx_worker_threads_def == 0 ||
 	    param->tx_worker_threads_def > param->tx_worker_threads_max)
 		param->tx_worker_threads_def = param->tx_worker_threads_max;
 
 	/* ratelimit channels */
 	param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
 	    param->tx_worker_threads_def;
 	param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;
 
 	/* range check */
 	if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
 		param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;
 
 	/* set default burst size */
 	param->tx_burst_size = 4;	/* MTUs */
 
 	/*
 	 * Set maximum burst size
 	 *
 	 * The burst size is multiplied by the MTU and clamped to the
 	 * range 0 ... 65535 bytes inclusively before being fed into
 	 * the firmware.
 	 *
 	 * NOTE: If the burst size or MTU is changed only ratelimit
 	 * connections made after the change will use the new burst
 	 * size.
 	 */
 	param->tx_burst_size_max = 255;
 
 	/* get firmware rate limits in 1000bit/s and convert them to bit/s */
 	param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
 	param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;
 
 	/* ratelimit table size */
 	param->tx_rates_max = mdev->priv.rl_table.max_size;
 
 	/* range check */
 	if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
 		param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;
 
 	/* set default number of rates */
 	param->tx_rates_def = param->tx_rates_max;
 
 	/*
 	 * set maximum allowed rate deviation; the field is left
 	 * untouched when the maximum rate limit is zero
 	 */
 	if (param->tx_limit_max != 0) {
 		/*
 		 * Make sure the deviation multiplication doesn't
 		 * overflow unsigned 64-bit:
 		 */
 		param->tx_allowed_deviation_max = -1ULL /
 		    param->tx_limit_max;
 	}
 	/* set default rate deviation */
 	param->tx_allowed_deviation = 50;	/* 5.0% */
 
 	/* channel parameters */
 	param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
 	param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
 	param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
 	param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
 	param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
 }
 
 /*
  * String tables for the rate limit sysctls. mlx5e_rl_init() reads
  * them in pairs: entry 2 * i is passed as the name and entry
  * 2 * i + 1 as the description of item "i".
  */
 static const char *mlx5e_rl_params_desc[] = {
 	MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
 };
 
 static const char *mlx5e_rl_table_params_desc[] = {
 	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
 };
 
 static const char *mlx5e_rl_stats_desc[] = {
 	MLX5E_RL_STATS(MLX5E_STATS_DESC)
 };
 
 /*
  * Set up rate limiting support for the interface "priv".
  *
  * The function does nothing and returns zero when the device does not
  * report the qos and packet_pacing capabilities. Otherwise it opens
  * the rate limit TIS, sets the default parameters, creates the
  * sysctl nodes, allocates the worker, channel and rate tables and
  * starts the worker threads.
  *
  * Returns zero on success, or the error of mlx5e_rl_open_tis(). A
  * failure to start the worker threads is only logged.
  */
 int
 mlx5e_rl_init(struct mlx5e_priv *priv)
 {
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
 	struct sysctl_oid *node;
 	struct sysctl_oid *stats;
 	char buf[64];
 	uint64_t i;
 	uint64_t j;
 	int error;
 
 	/* check if there is support for packet pacing */
 	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
 		return (0);
 
 	rl->priv = priv;
 
 	sysctl_ctx_init(&rl->ctx);
 
 	sx_init(&rl->rl_sxlock, "ratelimit-sxlock");
 
 	/* open own TIS domain for ratelimit SQs */
 	error = mlx5e_rl_open_tis(priv);
 	if (error)
 		goto done;
 
 	/* setup default value for parameters */
 	mlx5e_rl_set_default_params(&rl->param, priv->mdev);
 
 	/* update the completion factor */
 	mlx5e_rl_sync_tx_completion_fact(rl);
 
 	/* create root node */
 	node = SYSCTL_ADD_NODE(&rl->ctx,
 	    SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
 	    "rate_limit", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Rate limiting support");
 
 	/* the sysctls are optional; setup continues without the root node */
 	if (node != NULL) {
 		/* create SYSCTLs */
 		for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
 			mlx5e_rl_sysctl_add_u64_oid(rl,
 			    MLX5E_RL_PARAMS_INDEX(arg[i]),
 			    node, mlx5e_rl_params_desc[2 * i],
 			    mlx5e_rl_params_desc[2 * i + 1]);
 		}
 
 		stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
 		    OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
 		    "Rate limiting statistics");
 		if (stats != NULL) {
 			/* create SYSCTLs */
 			for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
 				mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
 				    stats, mlx5e_rl_stats_desc[2 * i],
 				    mlx5e_rl_stats_desc[2 * i + 1]);
 			}
 		}
 	}
 
 	/* allocate workers array */
 	rl->workers = malloc(sizeof(rl->workers[0]) *
 	    rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);
 
 	/* allocate rate limit array */
 	rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
 	    rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);
 
 	if (node != NULL) {
 		/* create more SYSCTls */
 		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
 		    "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
 		    CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
 		    "A", "Show table of all configured TX rates");
 
 		/* try to fetch rate table from kernel environment */
 		for (i = 0; i != rl->param.tx_rates_def; i++) {
 			/* compute path for tunable */
 			snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
 			    device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
 			if (TUNABLE_QUAD_FETCH(buf, &j))
 				mlx5e_rl_tx_limit_add(rl, j);
 		}
 
 		/* setup rate table sysctls */
 		for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
 			mlx5e_rl_sysctl_add_u64_oid(rl,
 			    MLX5E_RL_PARAMS_INDEX(table_arg[i]),
 			    node, mlx5e_rl_table_params_desc[2 * i],
 			    mlx5e_rl_table_params_desc[2 * i + 1]);
 		}
 	}
 
 	/* initialize every worker and put all of its channels on the free list */
 	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
 		struct mlx5e_rl_worker *rlw = rl->workers + j;
 
 		rlw->priv = priv;
 
 		cv_init(&rlw->cv, "mlx5-worker-cv");
 		mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
 		STAILQ_INIT(&rlw->index_list_head);
 		STAILQ_INIT(&rlw->process_head);
 
 		rlw->channels = malloc(sizeof(rlw->channels[0]) *
 		    rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);
 
 		MLX5E_RL_WORKER_LOCK(rlw);
 		for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
 			struct mlx5e_rl_channel *channel = rlw->channels + i;
 			channel->worker = rlw;
-			channel->tag.type = IF_SND_TAG_TYPE_RATE_LIMIT;
 			STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
 		}
 		MLX5E_RL_WORKER_UNLOCK(rlw);
 	}
 
 	/* start the worker threads */
 	PRIV_LOCK(priv);
 	error = mlx5e_rl_open_workers(priv);
 	PRIV_UNLOCK(priv);
 
 	if (error != 0) {
 		mlx5_en_err(priv->ifp,
 		    "mlx5e_rl_open_workers failed: %d\n", error);
 	}
 
 	return (0);
 
 	/* only reached when the TIS could not be created */
 done:
 	sysctl_ctx_free(&rl->ctx);
 	sx_destroy(&rl->rl_sxlock);
 	return (error);
 }
 
 /*
  * Build the channel parameters and start one worker thread per
  * rate limit worker.
  *
  * Returns -EINVAL when the interface is going away or the workers
  * are already opened, else zero. A thread which cannot be created
  * is logged and its worker gets "worker_done" set.
  */
 static int
 mlx5e_rl_open_workers(struct mlx5e_priv *priv)
 {
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
 	struct thread *rl_thread = NULL;
 	struct proc *rl_proc = NULL;
 	uint64_t idx;
 	int error;
 
 	if (priv->gone || rl->opened)
 		return (-EINVAL);
 
 	/* the channel parameters are computed once, under the write lock */
 	MLX5E_RL_WLOCK(rl);
 	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
 	MLX5E_RL_WUNLOCK(rl);
 
 	idx = 0;
 	while (idx < rl->param.tx_worker_threads_def) {
 		struct mlx5e_rl_worker *rlw = &rl->workers[idx];
 
 		/* start worker thread */
 		error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
 		    RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)idx);
 		if (error != 0) {
 			mlx5_en_err(rl->priv->ifp,
 			    "kproc_kthread_add failed: %d\n", error);
 			/* mark the worker as having no thread */
 			rlw->worker_done = 1;
 		}
 		idx++;
 	}
 
 	rl->opened = 1;
 
 	return (0);
 }
 
 /*
  * Stop all rate limit worker threads and wait until they have
  * exited. Does nothing when the workers are not opened.
  *
  * The "worker_done" field is used as a handshake: setting it to one
  * asks the thread to exit, and the thread clears it again right
  * before it exits.
  */
 static void
 mlx5e_rl_close_workers(struct mlx5e_priv *priv)
 {
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
 	uint64_t y;
 
 	if (rl->opened == 0)
 		return;
 
 	/* tear down worker threads simultaneously */
 	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
 		struct mlx5e_rl_worker *rlw = rl->workers + y;
 
 		/* tear down worker before freeing SQs */
 		MLX5E_RL_WORKER_LOCK(rlw);
 		if (rlw->worker_done == 0) {
 			rlw->worker_done = 1;
 			cv_broadcast(&rlw->cv);
 		} else {
 			/*
 			 * XXX thread not started; mlx5e_rl_open_workers()
 			 * sets worker_done when thread creation fails.
 			 * Clear it so that the wait loop below does not
 			 * block on this worker.
 			 */
 			rlw->worker_done = 0;
 		}
 		MLX5E_RL_WORKER_UNLOCK(rlw);
 	}
 
 	/* wait for worker threads to exit */
 	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
 		struct mlx5e_rl_worker *rlw = rl->workers + y;
 
 		/* tear down worker before freeing SQs */
 		MLX5E_RL_WORKER_LOCK(rlw);
 		while (rlw->worker_done != 0)
 			cv_wait(&rlw->cv, &rlw->mtx);
 		MLX5E_RL_WORKER_UNLOCK(rlw);
 	}
 
 	rl->opened = 0;
 }
 
 /*
  * Clear all entries of the rate limit table while holding the rate
  * limit write lock.
  */
 static void
 mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
 {
 	unsigned idx = 0;
 
 	MLX5E_RL_WLOCK(rl);
 	while (idx != rl->param.tx_rates_def)
 		rl->rate_limit_table[idx++] = 0;
 	MLX5E_RL_WUNLOCK(rl);
 }
 
 /*
  * Undo mlx5e_rl_init(): remove the sysctls, stop the worker threads,
  * clear the rate table, close the TIS and free all memory and locks.
  * Does nothing when the device does not report the qos and
  * packet_pacing capabilities.
  */
 void
 mlx5e_rl_cleanup(struct mlx5e_priv *priv)
 {
 	struct mlx5e_rl_priv_data *rl = &priv->rl;
 	uint64_t y;
 
 	/* check if there is support for packet pacing */
 	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
 		return;
 
 	/* remove the sysctl nodes before the state they refer to goes away */
 
 	sysctl_ctx_free(&rl->ctx);
 
 	PRIV_LOCK(priv);
 	mlx5e_rl_close_workers(priv);
 	PRIV_UNLOCK(priv);
 
 	mlx5e_rl_reset_rates(rl);
 
 	/* close TIS domain */
 	mlx5e_rl_close_tis(priv);
 
 	/* release the per-worker resources */
 	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
 		struct mlx5e_rl_worker *rlw = rl->workers + y;
 
 		cv_destroy(&rlw->cv);
 		mtx_destroy(&rlw->mtx);
 		free(rlw->channels, M_MLX5EN);
 	}
 	free(rl->rate_limit_table, M_MLX5EN);
 	free(rl->workers, M_MLX5EN);
 	sx_destroy(&rl->rl_sxlock);
 }
 
 /*
  * Append "channel" to the process queue of worker "rlw" and wake up
  * the waiters on the worker condition variable. The callers in this
  * file invoke the function with the worker lock held.
  */
 static void
 mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
     struct mlx5e_rl_channel *channel)
 {
 	STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
 	cv_broadcast(&rlw->cv);
 }
 
 /*
  * Request destruction of "channel" by its worker "rlw". A NULL
  * channel is ignored. Channels in other states than used or modify
  * are left untouched.
  */
 static void
 mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
 {
 	if (channel == NULL)
 		return;
 
 	MLX5E_RL_WORKER_LOCK(rlw);
 	if (channel->state == MLX5E_RL_ST_USED) {
 		/* hand the channel over to the worker for destruction */
 		channel->state = MLX5E_RL_ST_DESTROY;
 		mlx5e_rlw_queue_channel_locked(rlw, channel);
 	} else if (channel->state == MLX5E_RL_ST_MODIFY) {
 		/* only the state is changed; the channel is not queued here */
 		channel->state = MLX5E_RL_ST_DESTROY;
 	}
 	MLX5E_RL_WORKER_UNLOCK(rlw);
 }
 
 /*
  * Record "rate" as the new rate of "channel" and, when the channel is
  * in the used state, queue it on worker "rlw" for modification.
  *
  * Always returns zero.
  */
 static int
 mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
 {
 
 	MLX5E_RL_WORKER_LOCK(rlw);
 	/* the new rate is stored regardless of the channel state */
 	channel->new_rate = rate;
 	if (channel->state == MLX5E_RL_ST_USED) {
 		channel->state = MLX5E_RL_ST_MODIFY;
 		mlx5e_rlw_queue_channel_locked(rlw, channel);
 	}
 	MLX5E_RL_WORKER_UNLOCK(rlw);
 
 	return (0);
 }
 
 /*
  * Report the last rate and the send queue level of "channel" through
  * "params".
  *
  * Returns zero when the channel is in the USED state, EBUSY when it is
  * in the MODIFY state (the parameters are filled in for both), and
  * EINVAL for any other state, in which case "params" is not written.
  */
 static int
 mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel,
     union if_snd_tag_query_params *params)
 {
 	int retval = EINVAL;
 
 	MLX5E_RL_WORKER_LOCK(rlw);
 	if (channel->state == MLX5E_RL_ST_USED ||
 	    channel->state == MLX5E_RL_ST_MODIFY) {
 		/* a pending rate modification is reported as busy */
 		retval = (channel->state == MLX5E_RL_ST_MODIFY) ? EBUSY : 0;
 
 		params->rate_limit.max_rate = channel->last_rate;
 		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
 	}
 	MLX5E_RL_WORKER_UNLOCK(rlw);
 
 	return (retval);
 }
 
 /*
  * Take the first channel off the worker's list of available channels.
  *
  * On success the channel is put in the USED state, the active
  * connection counter is incremented and zero is returned. When the list
  * is empty the resource failure counter is incremented and ENOMEM is
  * returned. "*pchannel" receives the channel, or NULL on failure.
  */
 static int
 mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
     struct mlx5e_rl_channel **pchannel)
 {
 	struct mlx5e_rl_channel *channel;
 	int retval;
 
 	MLX5E_RL_WORKER_LOCK(rlw);
 	channel = STAILQ_FIRST(&rlw->index_list_head);
 	if (channel == NULL) {
 		/* no channel is available */
 		atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
 		retval = ENOMEM;
 	} else {
 		/* remove the head entry from the available list */
 		STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
 		channel->state = MLX5E_RL_ST_USED;
 		atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
 		retval = 0;
 	}
 	MLX5E_RL_WORKER_UNLOCK(rlw);
 
 	*pchannel = channel;
 #ifdef RATELIMIT_DEBUG
 	mlx5_en_info(rlw->priv->ifp,
 	    "Channel pointer for rate limit connection is %p\n", channel);
 #endif
 	return (retval);
 }
 
 /*
  * Allocate a rate limit send tag for "ifp".
  *
  * Returns EOPNOTSUPP when the device does not report packet pacing
  * support, is going away, or the requested tag type is not
  * IF_SND_TAG_TYPE_RATE_LIMIT. Otherwise a channel is taken from the
  * worker selected by the flow ID, the requested maximum rate is applied
  * and a pointer to the channel's send tag is stored in "*ppmt".
  *
  * Returns zero on success, else the error from the channel lookup or
  * the rate modification. "*ppmt" is only written on success.
  */
 int
 mlx5e_rl_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	struct mlx5e_rl_channel *channel;
 	struct mlx5e_rl_worker *rlw;
 	struct mlx5e_priv *priv;
 	int error;
 
 	priv = ifp->if_softc;
 
 	/* check if there is support for packet pacing or if device is going away */
 	if (!MLX5_CAP_GEN(priv->mdev, qos) ||
 	    !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
 	    params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
 		return (EOPNOTSUPP);
 
 	/* compute worker thread this TCP connection belongs to */
 	rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
 	    priv->rl.param.tx_worker_threads_def);
 
 	error = mlx5e_find_available_tx_ring_index(rlw, &channel);
 	if (error != 0)
 		goto done;
 
 	error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
 	if (error != 0) {
 		/* give the channel back when the rate could not be set */
 		mlx5e_rl_free(rlw, channel);
 		goto done;
 	}
 
 	/* store pointer to mbuf tag */
 	MPASS(channel->tag.refcount == 0);
-	m_snd_tag_init(&channel->tag, ifp, IF_SND_TAG_TYPE_RATE_LIMIT);
+	m_snd_tag_init(&channel->tag, ifp, &mlx5e_rl_snd_tag_sw);
 	*ppmt = &channel->tag;
 done:
 	return (error);
 }
 
 
-int
+static int
 mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
 {
 	/*
 	 * Recover the channel embedding the send tag "pmt" and request
 	 * the new maximum rate on the worker owning that channel.
 	 * Returns the result of mlx5e_rl_modify().
 	 */
 	struct mlx5e_rl_channel *channel =
 	    container_of(pmt, struct mlx5e_rl_channel, tag);
 
 	return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
 }
 
-int
+static int
 mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
 {
 	/*
 	 * Recover the channel embedding the send tag "pmt" and report
 	 * its state through "params". Returns the result of
 	 * mlx5e_rl_query().
 	 */
 	struct mlx5e_rl_channel *channel =
 	    container_of(pmt, struct mlx5e_rl_channel, tag);
 
 	return (mlx5e_rl_query(channel->worker, channel, params));
 }
 
-void
+static void
 mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
 {
 	/*
 	 * Recover the channel embedding the send tag "pmt" and hand it
 	 * to mlx5e_rl_free() together with the worker owning it.
 	 */
 	struct mlx5e_rl_channel *channel =
 	    container_of(pmt, struct mlx5e_rl_channel, tag);
 
 	mlx5e_rl_free(channel->worker, channel);
 }
 
 /*
  * Sysctl handler printing the rate limit table. "arg1" is the rate
  * limit private data. One line is printed for every non-zero table
  * entry, showing the entry index, the burst size and the rate.
  *
  * Returns the error from wiring the old buffer, else the result of
  * finishing the string buffer.
  */
 static int
 mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
 {
 	struct mlx5e_rl_priv_data *rl = arg1;
 	struct mlx5e_priv *priv = rl->priv;
 	struct sbuf sbuf;
 	unsigned entry;
 	int error;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 
 	PRIV_LOCK(priv);
 
 	sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);
 
 	/* table header */
 	sbuf_printf(&sbuf,
 	    "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
 	    "\t" "--------------------------------------------\n");
 
 	/* the table is read with the rate limit read lock held */
 	MLX5E_RL_RLOCK(rl);
 	for (entry = 0; entry != rl->param.tx_rates_def; entry++) {
 		/* only entries holding a rate are printed */
 		if (rl->rate_limit_table[entry] != 0) {
 			sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
 			    entry, (unsigned)rl->param.tx_burst_size,
 			    (long long)rl->rate_limit_table[entry]);
 		}
 	}
 	MLX5E_RL_RUNLOCK(rl);
 
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 
 	PRIV_UNLOCK(priv);
 
 	return (error);
 }
 
 /*
  * Rebuild the shared channel parameters and push the current TX
  * coalescing settings to the completion queue of every rate limit
  * channel that has a send queue.
  *
  * Always returns zero; the results of the CQ moderation calls are not
  * checked.
  */
 static int
 mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
 {
 	uint64_t worker;
 	uint64_t chan;
 
 	/* compute channel parameters once */
 	MLX5E_RL_WLOCK(rl);
 	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
 	MLX5E_RL_WUNLOCK(rl);
 
 	for (worker = 0; worker != rl->param.tx_worker_threads_def; worker++) {
 		struct mlx5e_rl_worker *rlw = &rl->workers[worker];
 
 		for (chan = 0; chan != rl->param.tx_channels_per_worker_def; chan++) {
 			struct mlx5e_sq *sq = rlw->channels[chan].sq;
 
 			/* skip channels without a send queue */
 			if (sq == NULL)
 				continue;
 
 			/* also update the mode when the device allows it */
 			if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
 				mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
 				    rl->param.tx_coalesce_usecs,
 				    rl->param.tx_coalesce_pkts,
 				    rl->param.tx_coalesce_mode);
 			} else {
 				mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
 				    rl->param.tx_coalesce_usecs,
 				    rl->param.tx_coalesce_pkts);
 			}
 		}
 	}
 	return (0);
 }
 
 /*
  * Call mlx5e_update_sq_inline() on the send queue of every rate limit
  * channel that has one. Each send queue is updated with its own lock
  * held.
  */
 void
 mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
 {
 	uint64_t worker;
 	uint64_t chan;
 
 	for (worker = 0; worker != rl->param.tx_worker_threads_def; worker++) {
 		struct mlx5e_rl_worker *rlw = &rl->workers[worker];
 
 		for (chan = 0; chan != rl->param.tx_channels_per_worker_def; chan++) {
 			struct mlx5e_sq *sq = rlw->channels[chan].sq;
 
 			/* channels without a send queue are skipped */
 			if (sq != NULL) {
 				mtx_lock(&sq->lock);
 				mlx5e_update_sq_inline(sq);
 				mtx_unlock(&sq->lock);
 			}
 		}
 	}
 }
 
 /*
  * Add the rate "value" to the rate limit table.
  *
  * Returns EINVAL when the rate is below 1000 or rejected by
  * mlx5_rl_is_in_range(), EEXIST when the rate is already in the table,
  * ENOMEM when the table has no free entry and zero on success.
  */
 static int
 mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
 {
 	unsigned x;
 	int error = ENOMEM;
 
 	if (value < 1000)
 		return (EINVAL);
 	if (mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
 		return (EINVAL);
 
 	MLX5E_RL_WLOCK(rl);
 
 	/* refuse a rate which is already present in the table */
 	for (x = 0; x != rl->param.tx_rates_def; x++) {
 		if (rl->rate_limit_table[x] == value) {
 			error = EEXIST;
 			goto done;
 		}
 	}
 
 	/* store the rate in the first free entry, if any */
 	for (x = 0; x != rl->param.tx_rates_def; x++) {
 		if (rl->rate_limit_table[x] == 0) {
 			rl->rate_limit_table[x] = value;
 			error = 0;
 			break;
 		}
 	}
 done:
 	MLX5E_RL_WUNLOCK(rl);
 
 	return (error);
 }
 
 /*
  * Remove the rate "value" from the rate limit table by zeroing the
  * first entry holding it.
  *
  * Returns EINVAL for a zero rate, ENOENT when the rate is not in the
  * table and zero on success.
  */
 static int
 mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
 {
 	unsigned x;
 	int error = ENOENT;
 
 	if (value == 0)
 		return (EINVAL);
 
 	MLX5E_RL_WLOCK(rl);
 
 	/* look for the entry holding the rate and free it */
 	for (x = 0; x != rl->param.tx_rates_def; x++) {
 		if (rl->rate_limit_table[x] == value) {
 			rl->rate_limit_table[x] = 0;
 			error = 0;
 			break;
 		}
 	}
 	MLX5E_RL_WUNLOCK(rl);
 
 	return (error);
 }
 
 /*
  * Sysctl handler shared by the rate limit parameters. "arg1" is the
  * rate limit private data and "arg2" is the index of the parameter
  * inside rl->param.arg[].
  *
  * The current value is passed through sysctl_handle_64(). When a new
  * value differing from the stored one is supplied, it is range checked
  * where applicable, stored and applied. Some parameters require the
  * workers to be closed and opened again around the update. When "req"
  * is NULL the stored value is run through the same update path.
  *
  * The whole handler runs with the private lock held. Returns ENXIO
  * when the device is gone, else the error from the sysctl or update
  * code, or zero.
  */
 static int
 mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	struct mlx5e_rl_priv_data *rl = arg1;
 	struct mlx5e_priv *priv = rl->priv;
 	unsigned mode_modify;
 	unsigned was_opened;
 	uint64_t value;
 	uint64_t old;
 	int error;
 
 	PRIV_LOCK(priv);
 
 	/* fetch the current value of the parameter */
 	MLX5E_RL_RLOCK(rl);
 	value = rl->param.arg[arg2];
 	MLX5E_RL_RUNLOCK(rl);
 
 	if (req != NULL) {
 		/* NOTE(review): "old" is assigned but never read below */
 		old = value;
 		error = sysctl_handle_64(oidp, &value, 0, req);
 		/* done on error, on a pure read, or when nothing changes */
 		if (error || req->newptr == NULL ||
 		    value == rl->param.arg[arg2])
 			goto done;
 	} else {
 		old = 0;
 		error = 0;
 	}
 
 	/* check if device is gone */
 	if (priv->gone) {
 		error = ENXIO;
 		goto done;
 	}
 	was_opened = rl->opened;
 	mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);
 
 	switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
 	case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
 		/* clamp to the range [1, tx_worker_threads_max] */
 		if (value > rl->param.tx_worker_threads_max)
 			value = rl->param.tx_worker_threads_max;
 		else if (value < 1)
 			value = 1;
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
 		/* clamp to the range [1, tx_channels_per_worker_max] */
 		if (value > rl->param.tx_channels_per_worker_max)
 			value = rl->param.tx_channels_per_worker_max;
 		else if (value < 1)
 			value = 1;
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
 		/* clamp to the range [1, tx_rates_max] */
 		if (value > rl->param.tx_rates_max)
 			value = rl->param.tx_rates_max;
 		else if (value < 1)
 			value = 1;
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
 		/* range check */
 		if (value < 1)
 			value = 0;
 		else if (value > MLX5E_FLD_MAX(cqc, cq_period))
 			value = MLX5E_FLD_MAX(cqc, cq_period);
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 
 		/* check to avoid down and up the network interface */
 		if (was_opened)
 			error = mlx5e_rl_refresh_channel_params(rl);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
 		/* import TX coal pkts */
 		if (value < 1)
 			value = 0;
 		else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
 			value = MLX5E_FLD_MAX(cqc, cq_max_count);
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 
 		/* check to avoid down and up the network interface */
 		if (was_opened)
 			error = mlx5e_rl_refresh_channel_params(rl);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
 		/*
 		 * The workers are only closed when the device cannot
 		 * modify the CQ period mode on the fly.
 		 */
 		/* network interface must be down */
 		if (was_opened != 0 && mode_modify == 0)
 			mlx5e_rl_close_workers(priv);
 
 		/* import TX coalesce mode */
 		if (value != 0)
 			value = 1;
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 
 		/* restart network interface, if any */
 		if (was_opened != 0) {
 			if (mode_modify == 0)
 				mlx5e_rl_open_workers(priv);
 			else
 				error = mlx5e_rl_refresh_channel_params(rl);
 		}
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
 		/* network interface must be down */
 		if (was_opened)
 			mlx5e_rl_close_workers(priv);
 
 		/* import TX queue size */
 		if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
 			value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
 		else if (value > priv->params_ethtool.tx_queue_size_max)
 			value = priv->params_ethtool.tx_queue_size_max;
 
 		/* store actual TX queue size */
 		value = 1ULL << order_base_2(value);
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 
 		/* verify TX completion factor */
 		mlx5e_rl_sync_tx_completion_fact(rl);
 
 		/* restart network interface, if any */
 		if (was_opened)
 			mlx5e_rl_open_workers(priv);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
 		/* network interface must be down */
 		if (was_opened)
 			mlx5e_rl_close_workers(priv);
 
 		/* store new value */
 		rl->param.arg[arg2] = value;
 
 		/* verify parameter */
 		mlx5e_rl_sync_tx_completion_fact(rl);
 
 		/* restart network interface, if any */
 		if (was_opened)
 			mlx5e_rl_open_workers(priv);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
 		/* the value is a rate to add; it is not stored in arg[] */
 		error = mlx5e_rl_tx_limit_add(rl, value);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
 		/* the value is a rate to remove; it is not stored in arg[] */
 		error = mlx5e_rl_tx_limit_clr(rl, value);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
 		/* range check */
 		if (value > rl->param.tx_allowed_deviation_max)
 			value = rl->param.tx_allowed_deviation_max;
 		else if (value < rl->param.tx_allowed_deviation_min)
 			value = rl->param.tx_allowed_deviation_min;
 
 		MLX5E_RL_WLOCK(rl);
 		rl->param.arg[arg2] = value;
 		MLX5E_RL_WUNLOCK(rl);
 		break;
 
 	case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
 		/* range check */
 		if (value > rl->param.tx_burst_size_max)
 			value = rl->param.tx_burst_size_max;
 		else if (value < rl->param.tx_burst_size_min)
 			value = rl->param.tx_burst_size_min;
 
 		MLX5E_RL_WLOCK(rl);
 		rl->param.arg[arg2] = value;
 		MLX5E_RL_WUNLOCK(rl);
 		break;
 
 	default:
 		break;
 	}
 done:
 	PRIV_UNLOCK(priv);
 	return (error);
 }
 
 static void
 mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
     struct sysctl_oid *node, const char *name, const char *desc)
 {
 	/*
 	 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
 	 * take care of loading default sysctl value from the kernel
 	 * environment, if any:
 	 */
 	if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
 		/* read-only SYSCTLs */
 		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
 		    name, CTLTYPE_U64 | CTLFLAG_RD |
 		    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
 	} else {
 		if (strstr(name, "_def") != 0) {
 #ifdef RATELIMIT_DEBUG
 			/* tunable read-only advanced SYSCTLs */
 			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
 			    name, CTLTYPE_U64 | CTLFLAG_RDTUN |
 			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
 #endif
 		} else {
 			/* read-write SYSCTLs */
 			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
 			    name, CTLTYPE_U64 | CTLFLAG_RWTUN |
 			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
 		}
 	}
 }
 
 /*
  * Register a read-only 64-bit sysctl named "name" below "node" which
  * exports the statistics counter rl->stats.arg[x].
  */
 static void
 mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
     struct sysctl_oid *node, const char *name, const char *desc)
 {
 	/* read-only SYSCTLs */
 	SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
 	    CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
 }
 
 #else
 
 /*
  * Stub used when RATELIMIT is not defined: there is nothing to set up,
  * so success is reported.
  */
 int
 mlx5e_rl_init(struct mlx5e_priv *priv)
 {
 
 	return (0);
 }
 
 /*
  * Stub used when RATELIMIT is not defined: there is nothing to tear
  * down.
  */
 void
 mlx5e_rl_cleanup(struct mlx5e_priv *priv)
 {
 	/* NOP */
 }
 
 #endif		/* RATELIMIT */
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
index e85522bdfad7..e469482c99bd 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
@@ -1,1184 +1,1184 @@
 /*-
  * Copyright (c) 2015-2019 Mellanox Technologies. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_kern_tls.h"
 
 #include "en.h"
 #include <machine/atomic.h>
 
 /*
  * Advance the completion event counter of "sq" and tell whether the
  * current work request should generate a completion event. True is
  * returned, and the counter is reset, once the counter reaches
  * sq->cev_factor.
  */
 static inline bool
 mlx5e_do_send_cqe_inline(struct mlx5e_sq *sq)
 {
 	/* interleave the CQEs */
 	if (++sq->cev_counter < sq->cev_factor)
 		return (false);
 
 	sq->cev_counter = 0;
 	return (true);
 }
 
 /*
  * Externally visible wrapper around mlx5e_do_send_cqe_inline(); see
  * that function for the return value.
  */
 bool
 mlx5e_do_send_cqe(struct mlx5e_sq *sq)
 {
 
 	return (mlx5e_do_send_cqe_inline(sq));
 }
 
 /*
  * Build a NOP work request of "ds_cnt" data segments at the current
  * producer position of "sq", copy its control segment into the doorbell
  * data and advance the producer counter by the number of building
  * blocks used.
  */
 void
 mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt)
 {
 	const u16 pi = sq->pc & sq->wq.sz_m1;
 	struct mlx5e_tx_wqe *wqe;
 
 	wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
 	memset(&wqe->ctrl, 0, sizeof(wqe->ctrl));
 
 	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP);
 	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
 
 	/* request a completion event only when it is due */
 	wqe->ctrl.fm_ce_se = mlx5e_do_send_cqe_inline(sq) ?
 	    MLX5_WQE_CTRL_CQ_UPDATE : 0;
 
 	/* Copy data for doorbell */
 	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));
 
 	/* no mbuf and no payload bytes belong to a NOP */
 	sq->mbuf[pi].mbuf = NULL;
 	sq->mbuf[pi].num_bytes = 0;
 	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
 	sq->pc += sq->mbuf[pi].num_wqebbs;
 }
 
 #if (__FreeBSD_version >= 1100000)
 /*
  * Value obtained from m_ether_tcpip_hash_init(). It is passed to
  * m_ether_tcpip_hash() when selecting a channel for mbufs which carry
  * no flow ID.
  */
 static uint32_t mlx5e_hash_value;
 
 /* SYSINIT callback setting up mlx5e_hash_value; "arg" is not used. */
 static void
 mlx5e_hash_init(void *arg)
 {
 	mlx5e_hash_value = m_ether_tcpip_hash_init();
 }
 
 /* Make kernel call mlx5e_hash_init after the random stack finished initializing */
 SYSINIT(mlx5e_hash_init, SI_SUB_RANDOM, SI_ORDER_ANY, &mlx5e_hash_init, NULL);
 #endif
 
 /*
  * Resolve the send queue referenced by the send tag attached to "mb".
  *
  * Rate limit tags map to the send queue of their rate limit channel and
  * unlimited tags map to the first send queue of their channel. TLS tags
  * are unwrapped to their "rl_tag" and the lookup is repeated on that
  * tag.
  *
  * Returns NULL when the tag type is not handled or when the send queue
  * found is not running. "ifp" is not used.
  */
 static struct mlx5e_sq *
 mlx5e_select_queue_by_send_tag(struct ifnet *ifp, struct mbuf *mb)
 {
 	struct m_snd_tag *mb_tag;
 	struct mlx5e_sq *sq;
 
 	mb_tag = mb->m_pkthdr.snd_tag;
 
 #ifdef KERN_TLS
 top:
 #endif
 	/* get pointer to sendqueue */
-	switch (mb_tag->type) {
+	switch (mb_tag->sw->type) {
 #ifdef RATELIMIT
 	case IF_SND_TAG_TYPE_RATE_LIMIT:
 		sq = container_of(mb_tag,
 		    struct mlx5e_rl_channel, tag)->sq;
 		break;
 #ifdef KERN_TLS
 	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
 		/* continue with the tag wrapped by the TLS tag */
 		mb_tag = container_of(mb_tag, struct mlx5e_tls_tag, tag)->rl_tag;
 		goto top;
 #endif
 #endif
 	case IF_SND_TAG_TYPE_UNLIMITED:
 		sq = &container_of(mb_tag,
 		    struct mlx5e_channel, tag)->sq[0];
 		KASSERT((mb_tag->refcount > 0),
 		    ("mlx5e_select_queue: Channel refs are zero for unlimited tag"));
 		break;
 #ifdef KERN_TLS
 	case IF_SND_TAG_TYPE_TLS:
 		/* continue with the tag wrapped by the TLS tag */
 		mb_tag = container_of(mb_tag, struct mlx5e_tls_tag, tag)->rl_tag;
 		goto top;
 #endif
 	default:
 		sq = NULL;
 		break;
 	}
 
 	/* check if valid */
 	if (sq != NULL && READ_ONCE(sq->running) != 0)
 		return (sq);
 
 	return (NULL);
 }
 
 /*
  * Select a send queue for "mb" from the channels of "ifp".
  *
  * The traffic class is taken from the top three bits of the VLAN tag
  * when the mbuf carries one and the result is below priv->num_tc,
  * else the default VLAN priority is used. The channel is derived from
  * the flow ID when a hash type is set, else from a hash computed over
  * the packet headers.
  *
  * Returns NULL when the selected send queue is not running.
  */
 static struct mlx5e_sq *
 mlx5e_select_queue(struct ifnet *ifp, struct mbuf *mb)
 {
 	struct mlx5e_priv *priv = ifp->if_softc;
 	struct mlx5e_sq *sq;
 	u32 ch;
 	u32 tc;
 
 	/* obtain VLAN information if present */
 	if (mb->m_flags & M_VLANTAG) {
 		tc = (mb->m_pkthdr.ether_vtag >> 13);
 		if (tc >= priv->num_tc)
 			tc = priv->default_vlan_prio;
 	} else {
 		tc = priv->default_vlan_prio;
 	}
 
 	/* "ch" holds the channel count until it is reduced to an index */
 	ch = priv->params.num_channels;
 
 	/* check if flowid is set */
 	if (M_HASHTYPE_GET(mb) != M_HASHTYPE_NONE) {
 #ifdef RSS
 		u32 temp;
 
 		/* the "else" below pairs with the statement after #endif */
 		if (rss_hash2bucket(mb->m_pkthdr.flowid,
 		    M_HASHTYPE_GET(mb), &temp) == 0)
 			ch = temp % ch;
 		else
 #endif
 			ch = (mb->m_pkthdr.flowid % 128) % ch;
 	} else {
 #if (__FreeBSD_version >= 1100000)
 		ch = m_ether_tcpip_hash(MBUF_HASHFLAG_L3 |
 		    MBUF_HASHFLAG_L4, mb, mlx5e_hash_value) % ch;
 #else
 		/*
 		 * m_ether_tcpip_hash not present in stable, so just
 		 * throw unhashed mbufs on queue 0
 		 */
 		ch = 0;
 #endif
 	}
 
 	/* check if send queue is running */
 	sq = &priv->channel[ch].sq[tc];
 	if (likely(READ_ONCE(sq->running) != 0))
 		return (sq);
 	return (NULL);
 }
 
 /*
  * Compute the number of leading packet bytes to treat as the L2 header
  * of "mb".
  *
  * For IPv4 and IPv6 packets the result is the Ethernet header, plus the
  * VLAN encapsulation when present, plus four more bytes. When the first
  * mbuf is too short to hold the Ethernet header, the ethertype is
  * neither IPv4 nor IPv6, or the packet is shorter than the computed
  * size, the result is the smaller of the packet length and
  * sq->max_inline.
  */
 static inline u16
 mlx5e_get_l2_header_size(struct mlx5e_sq *sq, struct mbuf *mb)
 {
 	struct ether_vlan_header *eh;
 	uint16_t eth_type;
 	int min_inline;
 
 	eh = mtod(mb, struct ether_vlan_header *);
 	if (unlikely(mb->m_len < ETHER_HDR_LEN)) {
 		goto max_inline;
 	} else if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 		if (unlikely(mb->m_len < (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)))
 			goto max_inline;
 		eth_type = ntohs(eh->evl_proto);
 		min_inline = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 	} else {
 		eth_type = ntohs(eh->evl_encap_proto);
 		min_inline = ETHER_HDR_LEN;
 	}
 
 	switch (eth_type) {
 	case ETHERTYPE_IP:
 	case ETHERTYPE_IPV6:
 		/*
 		 * Make sure the TOS(IPv4) or traffic class(IPv6)
 		 * field gets inlined. Else the SQ may stall.
 		 */
 		min_inline += 4;
 		break;
 	default:
 		goto max_inline;
 	}
 
 	/*
 	 * m_copydata() will be used on the remaining header which
 	 * does not need to reside within the first m_len bytes of
 	 * data:
 	 */
 	if (mb->m_pkthdr.len < min_inline)
 		goto max_inline;
 	return (min_inline);
 
 max_inline:
 	return (MIN(mb->m_pkthdr.len, sq->max_inline));
 }
 
 /*
  * This function parses IPv4 and IPv6 packets looking for TCP and UDP
  * headers.
  *
  * Upon return the pointer which the "ppth" argument points at is set
  * to the location of the TCP header. NULL is used if no TCP header is
  * present or if the parsing failed. The "ppth" argument may be NULL.
  *
  * The return value indicates the number of bytes from the beginning
  * of the packet until the first byte after the TCP or UDP header. If
  * this function returns zero, the parsing failed.
  */
 int
 mlx5e_get_full_header_size(const struct mbuf *mb, const struct tcphdr **ppth)
 {
 	const struct ether_vlan_header *eh;
 	const struct tcphdr *th;
 	const struct ip *ip;
 	int ip_hlen, tcp_hlen;
 	const struct ip6_hdr *ip6;
 	uint16_t eth_type;
 	int eth_hdr_len;
 
 	/* the Ethernet header must reside in the first mbuf */
 	eh = mtod(mb, const struct ether_vlan_header *);
 	if (unlikely(mb->m_len < ETHER_HDR_LEN))
 		goto failure;
 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 		if (unlikely(mb->m_len < ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN))
 			goto failure;
 		eth_type = ntohs(eh->evl_proto);
 		eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 	} else {
 		eth_type = ntohs(eh->evl_encap_proto);
 		eth_hdr_len = ETHER_HDR_LEN;
 	}
 
 	/* from here on "eth_hdr_len" accumulates all header lengths */
 	switch (eth_type) {
 	case ETHERTYPE_IP:
 		ip = (const struct ip *)(mb->m_data + eth_hdr_len);
 		if (unlikely(mb->m_len < eth_hdr_len + sizeof(*ip)))
 			goto failure;
 		switch (ip->ip_p) {
 		case IPPROTO_TCP:
 			ip_hlen = ip->ip_hl << 2;
 			eth_hdr_len += ip_hlen;
 			goto tcp_packet;
 		case IPPROTO_UDP:
 			ip_hlen = ip->ip_hl << 2;
 			eth_hdr_len += ip_hlen + sizeof(struct udphdr);
 			th = NULL;
 			goto udp_packet;
 		default:
 			goto failure;
 		}
 		break;
 	case ETHERTYPE_IPV6:
 		ip6 = (const struct ip6_hdr *)(mb->m_data + eth_hdr_len);
 		if (unlikely(mb->m_len < eth_hdr_len + sizeof(*ip6)))
 			goto failure;
 		/* only the next header field of the fixed header is checked */
 		switch (ip6->ip6_nxt) {
 		case IPPROTO_TCP:
 			eth_hdr_len += sizeof(*ip6);
 			goto tcp_packet;
 		case IPPROTO_UDP:
 			eth_hdr_len += sizeof(*ip6) + sizeof(struct udphdr);
 			th = NULL;
 			goto udp_packet;
 		default:
 			goto failure;
 		}
 		break;
 	default:
 		goto failure;
 	}
 tcp_packet:
 	if (unlikely(mb->m_len < eth_hdr_len + sizeof(*th))) {
 		/*
 		 * The TCP header is not fully inside the first mbuf. It
 		 * is only accepted when it starts exactly at the
 		 * beginning of the next mbuf.
 		 */
 		const struct mbuf *m_th = mb->m_next;
 		if (unlikely(mb->m_len != eth_hdr_len ||
 		    m_th == NULL || m_th->m_len < sizeof(*th)))
 			goto failure;
 		th = (const struct tcphdr *)(m_th->m_data);
 	} else {
 		th = (const struct tcphdr *)(mb->m_data + eth_hdr_len);
 	}
 	tcp_hlen = th->th_off << 2;
 	eth_hdr_len += tcp_hlen;
 udp_packet:
 	/*
 	 * m_copydata() will be used on the remaining header which
 	 * does not need to reside within the first m_len bytes of
 	 * data:
 	 */
 	if (unlikely(mb->m_pkthdr.len < eth_hdr_len))
 		goto failure;
 	if (ppth != NULL)
 		*ppth = th;
 	return (eth_hdr_len);
 failure:
 	if (ppth != NULL)
 		*ppth = NULL;
 	return (0);
 }
 
 /*
  * Locate the header starting "eth_hdr_len" bytes into the packet inside
  * a mbuf chain. "*poffset" is the packet offset at which the data of
  * "*mb" starts.
  *
  * When the length of "*mb" equals "eth_hdr_len" the chain is advanced
  * to the next mbuf and "*poffset" is updated. Returns a pointer to the
  * header when at least "min_len" bytes of it reside in "*mb". Returns
  * NULL upon failure.
  */
 static inline void *
 mlx5e_parse_mbuf_chain(const struct mbuf **mb, int *poffset, int eth_hdr_len,
     int min_len)
 {
 	/* step to the next mbuf when the header starts right after this one */
 	if (unlikely((*mb)->m_len == eth_hdr_len)) {
 		*poffset = eth_hdr_len;
 		*mb = (*mb)->m_next;
 		if (unlikely(*mb == NULL))
 			return (NULL);
 	}
 
 	/* the requested bytes must fit in the current mbuf */
 	if (unlikely((*mb)->m_len < eth_hdr_len - *poffset + min_len))
 		return (NULL);
 
 	return ((*mb)->m_data + eth_hdr_len - *poffset);
 }
 
 /*
  * This function parses IPv4 and IPv6 packets looking for UDP, VXLAN
  * and TCP headers.
  *
  * While parsing, the software parser offsets and flags and the
  * checksum flags of "wqe" are filled in. The "cs_mask" argument
  * selects how deep the inner headers are parsed. The "opcode" argument
  * is not used.
  *
  * The return value indicates the number of bytes from the beginning
  * of the packet until the first byte after the TCP header. If this
  * function returns zero, the parsing failed.
  */
 static int
 mlx5e_get_vxlan_header_size(const struct mbuf *mb, struct mlx5e_tx_wqe *wqe,
     uint8_t cs_mask, uint8_t opcode)
 {
 	const struct ether_vlan_header *eh;
 	struct ip *ip4;
 	struct ip6_hdr *ip6;
 	struct tcphdr *th;
 	struct udphdr *udp;
 	bool has_outer_vlan_tag;
 	uint16_t eth_type;
 	uint8_t ip_type;
 	int pkt_hdr_len;
 	int eth_hdr_len;
 	int tcp_hlen;
 	int ip_hlen;
 	int offset;
 
 	/* sample the packet header fields before "mb" is advanced */
 	pkt_hdr_len = mb->m_pkthdr.len;
 	has_outer_vlan_tag = (mb->m_flags & M_VLANTAG) != 0;
 	offset = 0;
 
 	eh = mtod(mb, const struct ether_vlan_header *);
 	if (unlikely(mb->m_len < ETHER_HDR_LEN))
 		return (0);
 
 	/* "eth_type" is kept in network byte order in this function */
 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 		if (unlikely(mb->m_len < ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN))
 			return (0);
 		eth_type = eh->evl_proto;
 		eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 	} else {
 		eth_type = eh->evl_encap_proto;
 		eth_hdr_len = ETHER_HDR_LEN;
 	}
 
 	/* outer IP header, which must be followed by UDP */
 	switch (eth_type) {
 	case htons(ETHERTYPE_IP):
 		ip4 = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len,
 		    sizeof(*ip4));
 		if (unlikely(ip4 == NULL))
 			return (0);
 		ip_type = ip4->ip_p;
 		if (unlikely(ip_type != IPPROTO_UDP))
 			return (0);
 		wqe->eth.swp_outer_l3_offset = eth_hdr_len / 2;
 		wqe->eth.cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
 		ip_hlen = ip4->ip_hl << 2;
 		eth_hdr_len += ip_hlen;
 		udp = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len,
 		    sizeof(*udp));
 		if (unlikely(udp == NULL))
 			return (0);
 		wqe->eth.swp_outer_l4_offset = eth_hdr_len / 2;
 		wqe->eth.swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L4_TYPE;
 		eth_hdr_len += sizeof(*udp);
 		break;
 	case htons(ETHERTYPE_IPV6):
 		ip6 = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len,
 		    sizeof(*ip6));
 		if (unlikely(ip6 == NULL))
 			return (0);
 		ip_type = ip6->ip6_nxt;
 		if (unlikely(ip_type != IPPROTO_UDP))
 			return (0);
 		wqe->eth.swp_outer_l3_offset = eth_hdr_len / 2;
 		wqe->eth.cs_flags = MLX5_ETH_WQE_L4_CSUM;
 		eth_hdr_len += sizeof(*ip6);
 		udp = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len,
 		    sizeof(*udp));
 		if (unlikely(udp == NULL))
 			return (0);
 		wqe->eth.swp_outer_l4_offset = eth_hdr_len / 2;
 		wqe->eth.swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L4_TYPE |
 		    MLX5_ETH_WQE_SWP_OUTER_L3_TYPE;
 		eth_hdr_len += sizeof(*udp);
 		break;
 	default:
 		return (0);
 	}
 
 	/*
 	 * If the hardware is not computing inner IP checksum, then
 	 * skip inlining the inner outer UDP and VXLAN header:
 	 */
 	if (unlikely((cs_mask & MLX5_ETH_WQE_L3_INNER_CSUM) == 0))
 		goto done;
 	/* skip 8 bytes, presumably the VXLAN header - TODO confirm */
 	if (unlikely(mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len,
 	    8) == NULL))
 		return (0);
 	eth_hdr_len += 8;
 
 	/* Check for ethernet header again. */
 	eh = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len, ETHER_HDR_LEN);
 	if (unlikely(eh == NULL))
 		return (0);
 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 		if (unlikely(mb->m_len < eth_hdr_len - offset + ETHER_HDR_LEN +
 		    ETHER_VLAN_ENCAP_LEN))
 			return (0);
 		eth_type = eh->evl_proto;
 		eth_hdr_len += ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 	} else {
 		eth_type = eh->evl_encap_proto;
 		eth_hdr_len += ETHER_HDR_LEN;
 	}
 
 	/* Check for IP header again. */
 	switch (eth_type) {
 	case htons(ETHERTYPE_IP):
 		ip4 = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len,
 		    sizeof(*ip4));
 		if (unlikely(ip4 == NULL))
 			return (0);
 		wqe->eth.swp_inner_l3_offset = eth_hdr_len / 2;
 		wqe->eth.cs_flags |= MLX5_ETH_WQE_L3_INNER_CSUM;
 		ip_type = ip4->ip_p;
 		ip_hlen = ip4->ip_hl << 2;
 		eth_hdr_len += ip_hlen;
 		break;
 	case htons(ETHERTYPE_IPV6):
 		ip6 = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len,
 		    sizeof(*ip6));
 		if (unlikely(ip6 == NULL))
 			return (0);
 		wqe->eth.swp_inner_l3_offset = eth_hdr_len / 2;
 		wqe->eth.swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_TYPE;
 		ip_type = ip6->ip6_nxt;
 		eth_hdr_len += sizeof(*ip6);
 		break;
 	default:
 		return (0);
 	}
 
 	/*
 	 * If the hardware is not computing inner UDP/TCP checksum,
 	 * then skip inlining the inner UDP/TCP header:
 	 */
 	if (unlikely((cs_mask & MLX5_ETH_WQE_L4_INNER_CSUM) == 0))
 		goto done;
 
 	/* inner L4 header */
 	switch (ip_type) {
 	case IPPROTO_UDP:
 		udp = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len,
 		    sizeof(*udp));
 		if (unlikely(udp == NULL))
 			return (0);
 		wqe->eth.swp_inner_l4_offset = (eth_hdr_len / 2);
 		wqe->eth.cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM;
 		wqe->eth.swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_TYPE;
 		eth_hdr_len += sizeof(*udp);
 		break;
 	case IPPROTO_TCP:
 		th = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len,
 		    sizeof(*th));
 		if (unlikely(th == NULL))
 			return (0);
 		wqe->eth.swp_inner_l4_offset = eth_hdr_len / 2;
 		wqe->eth.cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM;
 		tcp_hlen = th->th_off << 2;
 		eth_hdr_len += tcp_hlen;
 		break;
 	default:
 		return (0);
 	}
 done:
 	/* the headers found must fit inside the packet */
 	if (unlikely(pkt_hdr_len < eth_hdr_len))
 		return (0);
 
 	/* Account for software inserted VLAN tag, if any. */
 	if (unlikely(has_outer_vlan_tag)) {
 		wqe->eth.swp_outer_l3_offset += ETHER_VLAN_ENCAP_LEN / 2;
 		wqe->eth.swp_outer_l4_offset += ETHER_VLAN_ENCAP_LEN / 2;
 		wqe->eth.swp_inner_l3_offset += ETHER_VLAN_ENCAP_LEN / 2;
 		wqe->eth.swp_inner_l4_offset += ETHER_VLAN_ENCAP_LEN / 2;
 	}
 
 	/*
 	 * When inner checksums are set, outer L4 checksum flag must
 	 * be disabled.
 	 */
 	if (wqe->eth.cs_flags & (MLX5_ETH_WQE_L3_INNER_CSUM |
 	    MLX5_ETH_WQE_L4_INNER_CSUM))
 		wqe->eth.cs_flags &= ~MLX5_ETH_WQE_L4_CSUM;
 
 	return (eth_hdr_len);
 }
 
 /*
  * Layout of one DUMP work request: a control segment followed by a
  * single data segment, aligned to the size of a send building block.
  */
 struct mlx5_wqe_dump_seg {
 	struct mlx5_wqe_ctrl_seg ctrl;
 	struct mlx5_wqe_data_seg data;
 } __aligned(MLX5_SEND_WQE_BB);
 
 /* two data segments must fit in exactly one send building block */
 CTASSERT(DIV_ROUND_UP(2, MLX5_SEND_WQEBB_NUM_DS) == 1);
 
 /*
  * Queue the mbuf chain "*mbp" on "sq" as a series of DUMP work
  * requests. Every DMA segment is split into pieces of at most
  * "hw_mtu_msb" bytes and one work request is built per piece, using
  * the TIS number from "parg".
  *
  * The mbuf is always consumed: "*mbp" is set to NULL on every return
  * path. On success the mbuf is stored at the first producer index used
  * and the reference counter pointed to by "parg->pref" is incremented.
  *
  * Returns zero on success or when there was nothing to send, ENOBUFS
  * when the send queue lacks room, else the error from loading the DMA
  * map.
  */
 int
 mlx5e_sq_dump_xmit(struct mlx5e_sq *sq, struct mlx5e_xmit_args *parg, struct mbuf **mbp)
 {
 	bus_dma_segment_t segs[MLX5E_MAX_TX_MBUF_FRAGS];
 	struct mlx5_wqe_dump_seg *wqe;
 	struct mlx5_wqe_dump_seg *wqe_last;
 	int nsegs;
 	int xsegs;
 	u32 off;
 	u32 msb;
 	int err;
 	int x;
 	struct mbuf *mb;
 	const u32 ds_cnt = 2;
 	u16 pi;
 	const u8 opcode = MLX5_OPCODE_DUMP;
 
 	/* get pointer to mbuf */
 	mb = *mbp;
 
 	/* get producer index */
 	pi = sq->pc & sq->wq.sz_m1;
 
 	sq->mbuf[pi].num_bytes = mb->m_pkthdr.len;
 	sq->mbuf[pi].num_wqebbs = 0;
 
 	/* check number of segments in mbuf */
 	err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map,
 	    mb, segs, &nsegs, BUS_DMA_NOWAIT);
 	if (err == EFBIG) {
 		/* update statistics */
 		sq->stats.defragged++;
 		/* too many mbuf fragments */
 		mb = m_defrag(*mbp, M_NOWAIT);
 		if (mb == NULL) {
 			/* drop the original chain, "err" is still EFBIG */
 			mb = *mbp;
 			goto tx_drop;
 		}
 		/* try again */
 		err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map,
 		    mb, segs, &nsegs, BUS_DMA_NOWAIT);
 	}
 
 	if (err != 0)
 		goto tx_drop;
 
 	/* make sure all mbuf data, if any, is visible to the bus */
 	bus_dmamap_sync(sq->dma_tag, sq->mbuf[pi].dma_map,
 	    BUS_DMASYNC_PREWRITE);
 
 	/* compute number of real DUMP segments */
 	msb = sq->priv->params_ethtool.hw_mtu_msb;
 	for (x = xsegs = 0; x != nsegs; x++)
 		xsegs += howmany((u32)segs[x].ds_len, msb);
 
 	/* check if there are no segments */
 	if (unlikely(xsegs == 0)) {
 		bus_dmamap_unload(sq->dma_tag, sq->mbuf[pi].dma_map);
 		m_freem(mb);
 		*mbp = NULL;	/* safety clear */
 		return (0);
 	}
 
 	/* return ENOBUFS if the queue is full */
 	if (unlikely(!mlx5e_sq_has_room_for(sq, xsegs))) {
 		sq->stats.enobuf++;
 		bus_dmamap_unload(sq->dma_tag, sq->mbuf[pi].dma_map);
 		m_freem(mb);
 		*mbp = NULL;	/* safety clear */
 		return (ENOBUFS);
 	}
 
 	/* "wqe_last" marks the end of the ring, where "wqe" must wrap */
 	wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
 	wqe_last = mlx5_wq_cyc_get_wqe(&sq->wq, sq->wq.sz_m1);
 
 	for (x = 0; x != nsegs; x++) {
 		for (off = 0; off < segs[x].ds_len; off += msb) {
 			u32 len = segs[x].ds_len - off;
 
 			/* limit length */
 			if (likely(len > msb))
 				len = msb;
 
 			memset(&wqe->ctrl, 0, sizeof(wqe->ctrl));
 
 			/* fill control segment */
 			wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode);
 			wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
 			wqe->ctrl.imm = cpu_to_be32(parg->tisn << 8);
 
 			/* fill data segment */
 			wqe->data.addr = cpu_to_be64((uint64_t)segs[x].ds_addr + off);
 			wqe->data.lkey = sq->mkey_be;
 			wqe->data.byte_count = cpu_to_be32(len);
 
 			/* advance to next building block */
 			if (unlikely(wqe == wqe_last))
 				wqe = mlx5_wq_cyc_get_wqe(&sq->wq, 0);
 			else
 				wqe++;
 
 			sq->mbuf[pi].num_wqebbs++;
 			sq->pc++;
 		}
 	}
 
 	/* from here on "wqe" is the first and "wqe_last" the last WQE built */
 	wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
 	wqe_last = mlx5_wq_cyc_get_wqe(&sq->wq, (sq->pc - 1) & sq->wq.sz_m1);
 
 	/* put in place data fence */
 	wqe->ctrl.fm_ce_se |= MLX5_FENCE_MODE_INITIATOR_SMALL;
 
 	/* check if we should generate a completion event */
 	if (mlx5e_do_send_cqe_inline(sq))
 		wqe_last->ctrl.fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE;
 
 	/* copy data for doorbell */
 	memcpy(sq->doorbell.d32, wqe_last, sizeof(sq->doorbell.d32));
 
 	/* store pointer to mbuf */
 	sq->mbuf[pi].mbuf = mb;
 	sq->mbuf[pi].p_refcount = parg->pref;
 	atomic_add_int(parg->pref, 1);
 
 	/* count all traffic going out */
 	sq->stats.packets++;
 	sq->stats.bytes += sq->mbuf[pi].num_bytes;
 
 	*mbp = NULL;	/* safety clear */
 	return (0);
 
 tx_drop:
 	/* count the drop and free the mbuf chain */
 	sq->stats.dropped++;
 	*mbp = NULL;
 	m_freem(mb);
 	return err;
 }
 
 /*
  * Build one send WQE for the packet in *mbp and queue it on "sq".
  *
  * Outcomes visible in this function:
  *  - ENOBUFS is returned, with *mbp untouched, when the queue has no room
  *    for 2 * MLX5_SEND_WQE_MAX_WQEBBS building blocks.
  *  - ENOMEM is returned, with *mbp untouched, when the producer index is
  *    still too close to the end of the ring after a NOP was sent.
  *  - On the tx_drop path the mbuf is freed, *mbp is cleared, the
  *    "dropped" counter is bumped and "err" is returned.
  *  - On success *mbp is cleared and 0 is returned.  The mbuf is stored in
  *    sq->mbuf[pi], or freed right away if every byte was inlined.
  *
  * The doorbell is not written here; only sq->doorbell.d32 is filled in.
  */
 int
 mlx5e_sq_xmit(struct mlx5e_sq *sq, struct mbuf **mbp)
 {
 	bus_dma_segment_t segs[MLX5E_MAX_TX_MBUF_FRAGS];
 	struct mlx5e_xmit_args args = {};
 	struct mlx5_wqe_data_seg *dseg;
 	struct mlx5e_tx_wqe *wqe;
 	struct ifnet *ifp;
 	int nsegs;
 	int err;
 	int x;
 	struct mbuf *mb;
 	u16 ds_cnt;
 	u16 pi;
 	u8 opcode;
 
 #ifdef KERN_TLS
 top:
 #endif
 	/* Return ENOBUFS if the queue is full */
 	if (unlikely(!mlx5e_sq_has_room_for(sq, 2 * MLX5_SEND_WQE_MAX_WQEBBS))) {
 		sq->stats.enobuf++;
 		return (ENOBUFS);
 	}
 
 	/* Align SQ edge with NOPs to avoid WQE wrap around */
 	pi = ((~sq->pc) & sq->wq.sz_m1);
 	if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1)) {
 		/* Send one multi NOP message instead of many */
 		mlx5e_send_nop(sq, (pi + 1) * MLX5_SEND_WQEBB_NUM_DS);
 		pi = ((~sq->pc) & sq->wq.sz_m1);
 		/* Still too close to the ring edge after the NOP: give up */
 		if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1)) {
 			sq->stats.enobuf++;
 			return (ENOMEM);
 		}
 	}
 
 #ifdef KERN_TLS
 	/* Special handling for TLS packets, if any */
 	switch (mlx5e_sq_tls_xmit(sq, &args, mbp)) {
 	case MLX5E_TLS_LOOP:
 		goto top;
 	case MLX5E_TLS_FAILURE:
 		mb = *mbp;
 		err = ENOMEM;
 		goto tx_drop;
 	case MLX5E_TLS_DEFERRED:
 		return (0);
 	case MLX5E_TLS_CONTINUE:
 	default:
 		break;
 	}
 #endif
 
 	/* Setup local variables */
 	pi = sq->pc & sq->wq.sz_m1;
 	wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
 	ifp = sq->ifp;
 
 	memset(wqe, 0, sizeof(*wqe));
 
 	/* get pointer to mbuf */
 	mb = *mbp;
 
 	/* Send a copy of the frame to the BPF listener, if any */
 	if (ifp != NULL && ifp->if_bpf != NULL)
 		ETHER_BPF_MTAP(ifp, mb);
 
 	/* Translate the mbuf checksum requests into WQE checksum flags */
 	if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) {
 		wqe->eth.cs_flags |= MLX5_ETH_WQE_L3_CSUM;
 	}
 	if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) {
 		wqe->eth.cs_flags |= MLX5_ETH_WQE_L4_CSUM;
 	}
 	if (wqe->eth.cs_flags == 0) {
 		sq->stats.csum_offload_none++;
 	}
 	/*
 	 * Select the opcode and the inline header size (args.ihs), and
 	 * compute the byte count accounted for this packet.
 	 */
 	if (mb->m_pkthdr.csum_flags & CSUM_TSO) {
 		u32 payload_len;
 		u32 mss = mb->m_pkthdr.tso_segsz;
 		u32 num_pkts;
 
 		wqe->eth.mss = cpu_to_be16(mss);
 		opcode = MLX5_OPCODE_LSO;
 		if (args.ihs == 0)
 			args.ihs = mlx5e_get_full_header_size(mb, NULL);
 		if (unlikely(args.ihs == 0)) {
 			err = EINVAL;
 			goto tx_drop;
 		}
 		payload_len = mb->m_pkthdr.len - args.ihs;
 		if (payload_len == 0)
 			num_pkts = 1;
 		else
 			num_pkts = DIV_ROUND_UP(payload_len, mss);
 		/* Headers are counted once per resulting segment */
 		sq->mbuf[pi].num_bytes = payload_len + (num_pkts * args.ihs);
 
 
 		sq->stats.tso_packets++;
 		sq->stats.tso_bytes += payload_len;
 	} else if (mb->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN) {
 		/* check for inner TCP TSO first */
 		if (mb->m_pkthdr.csum_flags & (CSUM_INNER_IP_TSO |
 		    CSUM_INNER_IP6_TSO)) {
 			u32 payload_len;
 			u32 mss = mb->m_pkthdr.tso_segsz;
 			u32 num_pkts;
 
 			wqe->eth.mss = cpu_to_be16(mss);
 			opcode = MLX5_OPCODE_LSO;
 
 			if (likely(args.ihs == 0)) {
 				args.ihs = mlx5e_get_vxlan_header_size(mb, wqe,
 				       MLX5_ETH_WQE_L3_INNER_CSUM |
 				       MLX5_ETH_WQE_L4_INNER_CSUM |
 				       MLX5_ETH_WQE_L4_CSUM |
 				       MLX5_ETH_WQE_L3_CSUM,
 				       opcode);
 				if (unlikely(args.ihs == 0)) {
 					err = EINVAL;
 					goto tx_drop;
 				}
 			}
 
 			payload_len = mb->m_pkthdr.len - args.ihs;
 			if (payload_len == 0)
 				num_pkts = 1;
 			else
 				num_pkts = DIV_ROUND_UP(payload_len, mss);
 			sq->mbuf[pi].num_bytes = payload_len +
 			    num_pkts * args.ihs;
 
 			sq->stats.tso_packets++;
 			sq->stats.tso_bytes += payload_len;
 		} else {
 			opcode = MLX5_OPCODE_SEND;
 
 			if (likely(args.ihs == 0)) {
 				uint8_t cs_mask;
 
 				/* Pick the checksum mask from the inner csum flags */
 				if (mb->m_pkthdr.csum_flags &
 				    (CSUM_INNER_IP_TCP | CSUM_INNER_IP_UDP |
 				     CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_UDP)) {
 					cs_mask =
 					    MLX5_ETH_WQE_L3_INNER_CSUM |
 					    MLX5_ETH_WQE_L4_INNER_CSUM |
 					    MLX5_ETH_WQE_L4_CSUM |
 					    MLX5_ETH_WQE_L3_CSUM;
 				} else if (mb->m_pkthdr.csum_flags & CSUM_INNER_IP) {
 					cs_mask =
 					    MLX5_ETH_WQE_L3_INNER_CSUM |
 					    MLX5_ETH_WQE_L4_CSUM |
 					    MLX5_ETH_WQE_L3_CSUM;
 				} else {
 					cs_mask =
 					    MLX5_ETH_WQE_L4_CSUM |
 					    MLX5_ETH_WQE_L3_CSUM;
 				}
 				args.ihs = mlx5e_get_vxlan_header_size(mb, wqe,
 				    cs_mask, opcode);
 				if (unlikely(args.ihs == 0)) {
 					err = EINVAL;
 					goto tx_drop;
 				}
 			}
 
 			sq->mbuf[pi].num_bytes = max_t (unsigned int,
 			    mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN);
 		}
 	} else {
 		opcode = MLX5_OPCODE_SEND;
 
 		if (args.ihs == 0) {
 			switch (sq->min_inline_mode) {
 			case MLX5_INLINE_MODE_IP:
 			case MLX5_INLINE_MODE_TCP_UDP:
 				args.ihs = mlx5e_get_full_header_size(mb, NULL);
 				/* Fall back to the L2 header size on failure */
 				if (unlikely(args.ihs == 0))
 					args.ihs = mlx5e_get_l2_header_size(sq, mb);
 				break;
 			case MLX5_INLINE_MODE_L2:
 				args.ihs = mlx5e_get_l2_header_size(sq, mb);
 				break;
 			case MLX5_INLINE_MODE_NONE:
 				/* FALLTHROUGH */
 			default:
 				if ((mb->m_flags & M_VLANTAG) != 0 &&
 				    (sq->min_insert_caps & MLX5E_INSERT_VLAN) != 0) {
 					/* inlining VLAN data is not required */
 					wqe->eth.vlan_cmd = htons(0x8000); /* bit 0 CVLAN */
 					wqe->eth.vlan_hdr = htons(mb->m_pkthdr.ether_vtag);
 					args.ihs = 0;
 				} else if ((mb->m_flags & M_VLANTAG) == 0 &&
 				    (sq->min_insert_caps & MLX5E_INSERT_NON_VLAN) != 0) {
 					/* inlining non-VLAN data is not required */
 					args.ihs = 0;
 				} else {
 					/* we are forced to inlining L2 header, if any */
 					args.ihs = mlx5e_get_l2_header_size(sq, mb);
 				}
 				break;
 			}
 		}
 		sq->mbuf[pi].num_bytes = max_t (unsigned int,
 		    mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN);
 	}
 
 	/*
 	 * Copy args.ihs header bytes into the WQE and trim them from the
 	 * mbuf chain; m_adj() leaves the remaining data to be DMA mapped.
 	 */
 	if (likely(args.ihs == 0)) {
 		/* nothing to inline */
 	} else if ((mb->m_flags & M_VLANTAG) != 0) {
 		struct ether_vlan_header *eh = (struct ether_vlan_header *)
 		    wqe->eth.inline_hdr_start;
 
 		/* Range checks */
 		if (unlikely(args.ihs > (sq->max_inline - ETHER_VLAN_ENCAP_LEN))) {
 			/* Clamping is not allowed for TSO and VXLAN packets */
 			if (mb->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_ENCAP_VXLAN)) {
 				err = EINVAL;
 				goto tx_drop;
 			}
 			args.ihs = (sq->max_inline - ETHER_VLAN_ENCAP_LEN);
 		} else if (unlikely(args.ihs < ETHER_HDR_LEN)) {
 			err = EINVAL;
 			goto tx_drop;
 		}
 		m_copydata(mb, 0, ETHER_HDR_LEN, (caddr_t)eh);
 		m_adj(mb, ETHER_HDR_LEN);
 		/* Insert 4 bytes VLAN tag into data stream */
 		eh->evl_proto = eh->evl_encap_proto;
 		eh->evl_encap_proto = htons(ETHERTYPE_VLAN);
 		eh->evl_tag = htons(mb->m_pkthdr.ether_vtag);
 		/* Copy rest of header data, if any */
 		m_copydata(mb, 0, args.ihs - ETHER_HDR_LEN, (caddr_t)(eh + 1));
 		m_adj(mb, args.ihs - ETHER_HDR_LEN);
 		/* Extend header by 4 bytes */
 		args.ihs += ETHER_VLAN_ENCAP_LEN;
 		wqe->eth.inline_hdr_sz = cpu_to_be16(args.ihs);
 	} else {
 		/* check if inline header size is too big */
 		if (unlikely(args.ihs > sq->max_inline)) {
 			if (unlikely(mb->m_pkthdr.csum_flags & (CSUM_TSO |
 			    CSUM_ENCAP_VXLAN))) {
 				err = EINVAL;
 				goto tx_drop;
 			}
 			args.ihs = sq->max_inline;
 		}
 		m_copydata(mb, 0, args.ihs, wqe->eth.inline_hdr_start);
 		m_adj(mb, args.ihs);
 		wqe->eth.inline_hdr_sz = cpu_to_be16(args.ihs);
 	}
 
 	/*
 	 * Locate the first data segment: it follows the fixed part of the
 	 * WQE plus any inline header bytes that spill past inline_hdr_start.
 	 */
 	ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
 	if (args.ihs > sizeof(wqe->eth.inline_hdr_start)) {
 		ds_cnt += DIV_ROUND_UP(args.ihs - sizeof(wqe->eth.inline_hdr_start),
 		    MLX5_SEND_WQE_DS);
 	}
 	dseg = ((struct mlx5_wqe_data_seg *)&wqe->ctrl) + ds_cnt;
 
 	err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map,
 	    mb, segs, &nsegs, BUS_DMA_NOWAIT);
 	if (err == EFBIG) {
 		/* Update statistics */
 		sq->stats.defragged++;
 		/* Too many mbuf fragments */
 		mb = m_defrag(*mbp, M_NOWAIT);
 		if (mb == NULL) {
 			/* err is still EFBIG here; drop the original chain */
 			mb = *mbp;
 			goto tx_drop;
 		}
 		/* Try again */
 		err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map,
 		    mb, segs, &nsegs, BUS_DMA_NOWAIT);
 	}
 	/* Catch errors */
 	if (err != 0)
 		goto tx_drop;
 
 	/* Make sure all mbuf data, if any, is visible to the bus */
 	if (nsegs != 0) {
 		bus_dmamap_sync(sq->dma_tag, sq->mbuf[pi].dma_map,
 		    BUS_DMASYNC_PREWRITE);
 	} else {
 		/* All data was inlined, free the mbuf. */
 		bus_dmamap_unload(sq->dma_tag, sq->mbuf[pi].dma_map);
 		m_freem(mb);
 		mb = NULL;
 	}
 
 	/* Emit one data segment per non-empty DMA segment */
 	for (x = 0; x != nsegs; x++) {
 		if (segs[x].ds_len == 0)
 			continue;
 		dseg->addr = cpu_to_be64((uint64_t)segs[x].ds_addr);
 		dseg->lkey = sq->mkey_be;
 		dseg->byte_count = cpu_to_be32((uint32_t)segs[x].ds_len);
 		dseg++;
 	}
 
 	/* Final count of data segment units used by this WQE */
 	ds_cnt = (dseg - ((struct mlx5_wqe_data_seg *)&wqe->ctrl));
 
 	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode);
 	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
 	wqe->ctrl.imm = cpu_to_be32(args.tisn << 8);
 
 	/* Request a completion event only when the helper says so */
 	if (mlx5e_do_send_cqe_inline(sq))
 		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
 	else
 		wqe->ctrl.fm_ce_se = 0;
 
 	/* Copy data for doorbell */
 	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));
 
 	/* Store pointer to mbuf */
 	sq->mbuf[pi].mbuf = mb;
 	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
 	sq->mbuf[pi].p_refcount = args.pref;
 	if (unlikely(args.pref != NULL))
 		atomic_add_int(args.pref, 1);
 	/* Advance the producer counter past the building blocks just used */
 	sq->pc += sq->mbuf[pi].num_wqebbs;
 
 	/* Count all traffic going out */
 	sq->stats.packets++;
 	sq->stats.bytes += sq->mbuf[pi].num_bytes;
 
 	*mbp = NULL;	/* safety clear */
 	return (0);
 
 tx_drop:
 	/* Common failure exit: account the drop and release the packet */
 	sq->stats.dropped++;
 	*mbp = NULL;
 	m_freem(mb);
 	return err;
 }
 
 /*
  * Reap transmit completions for "sq".
  *
  * Completion entries are consumed while "budget" is positive; each one
  * costs sq->cev_factor units of budget.  For every completion, send
  * queue slots are released starting at the consumer counter until the
  * slot containing the reported WQE counter has been processed, or
  * sq->cev_factor slots have been visited without a match (counted as a
  * CQE error).  Releasing a slot drops its optional reference count,
  * unloads its DMA map and frees its mbuf.
  */
 static void
 mlx5e_poll_tx_cq(struct mlx5e_sq *sq, int budget)
 {
 	u16 sqcc;
 
 	/*
 	 * sq->cc must be updated only after mlx5_cqwq_update_db_record(),
 	 * otherwise a cq overrun may occur
 	 */
 	sqcc = sq->cc;
 
 	while (budget > 0) {
 		struct mlx5_cqe64 *cqe;
 		struct mbuf *mb;
 		bool match;
 		u16 sqcc_this;
 		u16 delta;
 		u16 x;
 		u16 ci;
 
 		cqe = mlx5e_get_cqe(&sq->cq);
 		if (!cqe)
 			break;
 
 		mlx5_cqwq_pop(&sq->cq.wq);
 
 		/* check if the completion event indicates an error */
 		if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ))
 			sq->stats.cqe_err++;
 
 		/* setup local variables */
 		sqcc_this = be16toh(cqe->wqe_counter);
 		match = false;
 
 		/* update budget according to the event factor */
 		budget -= sq->cev_factor;
 
 		for (x = 0;; x++) {
 			if (unlikely(match != false)) {
 				break;
 			} else if (unlikely(x == sq->cev_factor)) {
 				/* WQE counter match not found */
 				sq->stats.cqe_err++;
 				break;
 			}
 			ci = sqcc & sq->wq.sz_m1;
 			/* u16 subtraction, so the distance wraps with the counters */
 			delta = sqcc_this - sqcc;
 			/* this slot is the last one when it covers the reported counter */
 			match = (delta < sq->mbuf[ci].num_wqebbs);
 			mb = sq->mbuf[ci].mbuf;
 			sq->mbuf[ci].mbuf = NULL;
 
 			/* drop the reference taken when the slot was filled, if any */
 			if (unlikely(sq->mbuf[ci].p_refcount != NULL)) {
 				atomic_add_int(sq->mbuf[ci].p_refcount, -1);
 				sq->mbuf[ci].p_refcount = NULL;
 			}
 
 			if (mb == NULL) {
 				/* slots without mbuf and without bytes are counted as NOPs */
 				if (unlikely(sq->mbuf[ci].num_bytes == 0))
 					sq->stats.nop++;
 			} else {
 				bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map,
 				    BUS_DMASYNC_POSTWRITE);
 				bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map);
 
 				/* Free transmitted mbuf */
 				m_freem(mb);
 			}
 			/* advance the local consumer counter past this slot */
 			sqcc += sq->mbuf[ci].num_wqebbs;
 		}
 	}
 
 	mlx5_cqwq_update_db_record(&sq->cq.wq);
 
 	/* Ensure cq space is freed before enabling more cqes */
 	atomic_thread_fence_rel();
 
 	/* publish the new consumer counter */
 	sq->cc = sqcc;
 }
 
 /*
  * Transmit one packet on "sq"; the caller holds the queue lock.
  *
  * The mbuf is always consumed.  Returns ENETDOWN when the interface or
  * the queue is not running, ENOBUFS when the packet could not be
  * queued, and zero otherwise.
  */
 static int
 mlx5e_xmit_locked(struct ifnet *ifp, struct mlx5e_sq *sq, struct mbuf *mb)
 {
 	int status;
 
 	/* Refuse to transmit while the interface or the queue is down */
 	if (unlikely((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    READ_ONCE(sq->running) == 0)) {
 		m_freem(mb);
 		return (ENETDOWN);
 	}
 
 	/*
 	 * Queue the packet.  Every failure is reported as ENOBUFS; the
 	 * mbuf pointer may have been cleared by the callee, and m_freem()
 	 * accepts NULL.
 	 */
 	status = 0;
 	if (mlx5e_sq_xmit(sq, &mb) != 0) {
 		m_freem(mb);
 		status = ENOBUFS;
 	}
 
 	/* Ring the doorbell if there is pending doorbell data */
 	if (likely(sq->doorbell.d64 != 0)) {
 		mlx5e_tx_notify_hw(sq, sq->doorbell.d32);
 		sq->doorbell.d64 = 0;
 	}
 
 	/*
 	 * The event timer, which flushes the transmit ring on timeout,
 	 * is started only from the initial state and only when the
 	 * completion event factor differs from one.  In every other
 	 * case NOPs are held back.
 	 */
 	if (likely(sq->cev_next_state != MLX5E_CEV_STATE_INITIAL ||
 	    sq->cev_factor == 1)) {
 		/* don't send NOPs yet */
 		sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
 	} else {
 		/* start the timer */
 		mlx5e_sq_cev_timeout(sq);
 	}
 	return (status);
 }
 
 int
 mlx5e_xmit(struct ifnet *ifp, struct mbuf *mb)
 {
 	struct mlx5e_sq *sq;
 	int ret;
 
 	if (mb->m_pkthdr.csum_flags & CSUM_SND_TAG) {
 		MPASS(mb->m_pkthdr.snd_tag->ifp == ifp);
 		sq = mlx5e_select_queue_by_send_tag(ifp, mb);
 		if (unlikely(sq == NULL)) {
 			goto select_queue;
 		}
 	} else {
 select_queue:
 		sq = mlx5e_select_queue(ifp, mb);
 		if (unlikely(sq == NULL)) {
 			/* Free mbuf */
 			m_freem(mb);
 
 			/* Invalid send queue */
 			return (ENXIO);
 		}
 	}
 
 	mtx_lock(&sq->lock);
 	ret = mlx5e_xmit_locked(ifp, sq, mb);
 	mtx_unlock(&sq->lock);
 
 	return (ret);
 }
 
 /*
  * Completion callback for a transmit completion queue: poll the owning
  * send queue and re-arm the completion queue, all under the completion
  * lock.
  */
 void
 mlx5e_tx_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe __unused)
 {
 	struct mlx5e_sq *sq;
 
 	/* The core CQ is embedded in the send queue structure */
 	sq = container_of(mcq, struct mlx5e_sq, cq.mcq);
 
 	mtx_lock(&sq->comp_lock);
 	mlx5e_poll_tx_cq(sq, MLX5E_BUDGET_MAX);
 	mlx5e_cq_arm(&sq->cq, MLX5_GET_DOORBELL_LOCK(&sq->priv->doorbell_lock));
 	mtx_unlock(&sq->comp_lock);
 }
diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c
index 123985a7dec2..36316f2bc3ff 100644
--- a/sys/kern/kern_mbuf.c
+++ b/sys/kern/kern_mbuf.c
@@ -1,1708 +1,1709 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2004, 2005,
  *	Bosko Milekic <bmilekic@FreeBSD.org>.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_param.h"
 #include "opt_kern_tls.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/domainset.h>
 #include <sys/malloc.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/refcount.h>
 #include <sys/sf_buf.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_map.h>
 #include <vm/uma.h>
 #include <vm/uma_dbg.h>
 
 /*
  * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
  * Zones.
  *
  * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
  * Zone.  The Zone can be capped at kern.ipc.nmbclusters, if the
  * administrator so desires.
  *
  * Mbufs are allocated from a UMA Primary Zone called the Mbuf
  * Zone.
  *
  * Additionally, FreeBSD provides a Packet Zone, which it
  * configures as a Secondary Zone to the Mbuf Primary Zone,
  * thus sharing backend Slab kegs with the Mbuf Primary Zone.
  *
  * Thus common-case allocations and locking are simplified:
  *
  *  m_clget()                m_getcl()
  *    |                         |
  *    |   .------------>[(Packet Cache)]    m_get(), m_gethdr()
  *    |   |             [     Packet   ]            |
  *  [(Cluster Cache)]   [    Secondary ]   [ (Mbuf Cache)     ]
  *  [ Cluster Zone  ]   [     Zone     ]   [ Mbuf Primary Zone ]
  *        |                       \________         |
  *  [ Cluster Keg   ]                      \       /
  *        |	                         [ Mbuf Keg   ]
  *  [ Cluster Slabs ]                         |
  *        |                              [ Mbuf Slabs ]
  *         \____________(VM)_________________/
  *
  *
  * Whenever an object is allocated with uma_zalloc() out of
  * one of the Zones its _ctor_ function is executed.  The same
  * for any deallocation through uma_zfree() the _dtor_ function
  * is executed.
  *
  * Caches are per-CPU and are filled from the Primary Zone.
  *
  * Whenever an object is allocated from the underlying global
  * memory pool it gets pre-initialized with the _zinit_ functions.
  * When the Kegs are overfull, objects get decommissioned with
  * _zfini_ functions and freed back to the global memory pool.
  *
  */
 
 /*
  * Per-zone item limits.  They are seeded from loader tunables in
  * tunable_mbinit() and exported through the kern.ipc sysctl handlers
  * defined further down in this file.
  */
 int nmbufs;			/* limits number of mbufs */
 int nmbclusters;		/* limits number of mbuf clusters */
 int nmbjumbop;			/* limits number of page size jumbo clusters */
 int nmbjumbo9;			/* limits number of 9k jumbo clusters */
 int nmbjumbo16;			/* limits number of 16k jumbo clusters */
 
 bool mb_use_ext_pgs = true;	/* use M_EXTPG mbufs for sendfile & TLS */
 SYSCTL_BOOL(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLFLAG_RWTUN,
     &mb_use_ext_pgs, 0,
     "Use unmapped mbufs for sendfile(2) and TLS offload");
 
 static quad_t maxmbufmem;	/* overall real memory limit for all mbufs */
 
 SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0,
     "Maximum real memory allocatable to various mbuf types");
 
 /* Counter exported as kern.ipc.num_snd_tags; allocated in mbuf_init(). */
 static counter_u64_t snd_tag_count;
 SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW,
     &snd_tag_count, "# of active mbuf send tags");
 
 /*
  * tunable_mbinit() has to be run before any mbuf allocations are done.
  */
 static void
 tunable_mbinit(void *dummy)
 {
 	quad_t kmem_avail, hard_cap;
 	int cluster_total;
 
 	/*
 	 * Memory usable for mbufs is bounded by the smaller of physical
 	 * memory and the kmem size.  The default budget is one half of
 	 * that amount; a tunable may raise it, but never above three
 	 * quarters.
 	 */
 	kmem_avail = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size);
 	hard_cap = kmem_avail / 4 * 3;
 	maxmbufmem = kmem_avail / 2;
 	TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem);
 	if (maxmbufmem > hard_cap)
 		maxmbufmem = hard_cap;
 
 	/* Each cluster limit falls back to a share of the budget. */
 	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
 	if (nmbclusters == 0)
 		nmbclusters = maxmbufmem / MCLBYTES / 4;
 
 	TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop);
 	if (nmbjumbop == 0)
 		nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4;
 
 	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9);
 	if (nmbjumbo9 == 0)
 		nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6;
 
 	TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16);
 	if (nmbjumbo16 == 0)
 		nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6;
 
 	/*
 	 * There must be at least one mbuf for every cluster of every
 	 * type, so a too-small mbuf limit is replaced.
 	 */
 	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
 	cluster_total = nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16;
 	if (nmbufs < cluster_total)
 		nmbufs = lmax(maxmbufmem / MSIZE / 5, cluster_total);
 }
 SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL);
 
 static int
 sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbclusters;
 
 	newnmbclusters = nmbclusters;
 	error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
 	if (error == 0 && req->newptr && newnmbclusters != nmbclusters) {
 		if (newnmbclusters > nmbclusters &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbclusters = newnmbclusters;
 			nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
 			EVENTHANDLER_INVOKE(nmbclusters_change);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbclusters, 0,
     sysctl_nmbclusters, "IU",
     "Maximum number of mbuf clusters allowed");
 
 static int
 sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbjumbop;
 
 	newnmbjumbop = nmbjumbop;
 	error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
 	if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) {
 		if (newnmbjumbop > nmbjumbop &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbjumbop = newnmbjumbop;
 			nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbop, 0,
     sysctl_nmbjumbop, "IU",
     "Maximum number of mbuf page size jumbo clusters allowed");
 
 static int
 sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbjumbo9;
 
 	newnmbjumbo9 = nmbjumbo9;
 	error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
 	if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) {
 		if (newnmbjumbo9 > nmbjumbo9 &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbjumbo9 = newnmbjumbo9;
 			nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbo9, 0,
     sysctl_nmbjumbo9, "IU",
     "Maximum number of mbuf 9k jumbo clusters allowed");
 
 static int
 sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbjumbo16;
 
 	newnmbjumbo16 = nmbjumbo16;
 	error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req);
 	if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) {
 		if (newnmbjumbo16 > nmbjumbo16 &&
 		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbjumbo16 = newnmbjumbo16;
 			nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbo16, 0,
     sysctl_nmbjumbo16, "IU",
     "Maximum number of mbuf 16k jumbo clusters allowed");
 
 static int
 sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
 {
 	int error, newnmbufs;
 
 	newnmbufs = nmbufs;
 	error = sysctl_handle_int(oidp, &newnmbufs, 0, req);
 	if (error == 0 && req->newptr && newnmbufs != nmbufs) {
 		if (newnmbufs > nmbufs) {
 			nmbufs = newnmbufs;
 			nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
 			EVENTHANDLER_INVOKE(nmbufs_change);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &nmbufs, 0, sysctl_nmbufs, "IU",
     "Maximum number of mbufs allowed");
 
 /*
  * Zones from which we allocate.
  */
 uma_zone_t	zone_mbuf;	/* MSIZE items; created in mbuf_init() */
 uma_zone_t	zone_clust;	/* MCLBYTES items */
 uma_zone_t	zone_pack;	/* secondary zone layered on zone_mbuf */
 uma_zone_t	zone_jumbop;	/* MJUMPAGESIZE items */
 uma_zone_t	zone_jumbo9;	/* MJUM9BYTES items */
 uma_zone_t	zone_jumbo16;	/* MJUM16BYTES items */
 
 /*
  * Local prototypes.
  */
 static int	mb_ctor_mbuf(void *, int, void *, int);
 static int	mb_ctor_clust(void *, int, void *, int);
 static int	mb_ctor_pack(void *, int, void *, int);
 static void	mb_dtor_mbuf(void *, int, void *);
 static void	mb_dtor_pack(void *, int, void *);
 static int	mb_zinit_pack(void *, int, int);
 static void	mb_zfini_pack(void *, int);
 static void	mb_reclaim(uma_zone_t, int);
 
 /* Ensure that MSIZE is a power of 2. */
 CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
 
 _Static_assert(sizeof(struct mbuf) <= MSIZE,
     "size of mbuf exceeds MSIZE");
 /*
  * Initialize FreeBSD Network buffer allocation.
  *
  * Creates the mbuf, cluster, packet and jumbo zones, applies the
  * configured item limits to them, hooks mb_reclaim() up as the
  * limit-reached action and as a vm_lowmem handler, and allocates the
  * send tag counter.  Registered through SYSINIT at SI_SUB_MBUF.
  */
 static void
 mbuf_init(void *dummy)
 {
 
 	/*
 	 * Configure UMA zones for Mbufs, Clusters, and Packets.
 	 */
 	zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE,
 	    mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL,
 	    MSIZE - 1, UMA_ZONE_CONTIG | UMA_ZONE_MAXBUCKET);
 	/* A limit is only applied when one is configured; keep what UMA returns */
 	if (nmbufs > 0)
 		nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
 	uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached");
 	uma_zone_set_maxaction(zone_mbuf, mb_reclaim);
 
 	zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
 	    mb_ctor_clust, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
 	if (nmbclusters > 0)
 		nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
 	uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached");
 	uma_zone_set_maxaction(zone_clust, mb_reclaim);
 
 	/* The packet zone is a secondary zone on top of the mbuf zone */
 	zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
 	    mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);
 
 	/* Make jumbo frame zone too. Page size, 9k and 16k. */
 	zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE,
 	    mb_ctor_clust, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
 	if (nmbjumbop > 0)
 		nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
 	uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached");
 	uma_zone_set_maxaction(zone_jumbop, mb_reclaim);
 
 	zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
 	    mb_ctor_clust, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
 	if (nmbjumbo9 > 0)
 		nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
 	uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached");
 	uma_zone_set_maxaction(zone_jumbo9, mb_reclaim);
 
 	zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
 	    mb_ctor_clust, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
 	if (nmbjumbo16 > 0)
 		nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
 	uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
 	uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);
 
 	/*
 	 * Hook event handler for low-memory situation, used to
 	 * drain protocols and push data back to the caches (UMA
 	 * later pushes it back to VM).
 	 */
 	EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
 	    EVENTHANDLER_PRI_FIRST);
 
 	/* Backing storage for the kern.ipc.num_snd_tags counter */
 	snd_tag_count = counter_u64_alloc(M_WAITOK);
 }
 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);
 
 #ifdef DEBUGNET
 /*
  * debugnet makes use of a pre-allocated pool of mbufs and clusters.  When
  * debugnet is configured, we initialize a set of UMA cache zones which return
  * items from this pool.  At panic-time, the regular UMA zone pointers are
  * overwritten with those of the cache zones so that drivers may allocate and
  * free mbufs and clusters without attempting to allocate physical memory.
  *
  * We keep mbufs and clusters in a pair of mbuf queues.  In particular, for
  * the purpose of caching clusters, we treat them as mbufs.
  */
 /* Pool of pre-allocated mbufs backing dn_zone_mbuf. */
 static struct mbufq dn_mbufq =
     { STAILQ_HEAD_INITIALIZER(dn_mbufq.mq_head), 0, INT_MAX };
 /* Pool of pre-allocated clusters backing dn_zone_clust. */
 static struct mbufq dn_clustq =
     { STAILQ_HEAD_INITIALIZER(dn_clustq.mq_head), 0, INT_MAX };
 
 /* Cluster size passed to the last debugnet_mbuf_reinit() call. */
 static int dn_clsize;
 /* Cache zones created by debugnet_mbuf_reinit(). */
 static uma_zone_t dn_zone_mbuf;
 static uma_zone_t dn_zone_clust;
 static uma_zone_t dn_zone_pack;
 
 /*
  * Regular zone pointers saved by debugnet_mbuf_start() and put back by
  * debugnet_mbuf_finish().
  */
 static struct debugnet_saved_zones {
 	uma_zone_t dsz_mbuf;
 	uma_zone_t dsz_clust;
 	uma_zone_t dsz_pack;
 	uma_zone_t dsz_jumbop;
 	uma_zone_t dsz_jumbo9;
 	uma_zone_t dsz_jumbo16;
 	/* true while the debugnet zones are installed */
 	bool dsz_debugnet_zones_enabled;
 } dn_saved_zones;
 
 /*
  * Import callback of the debugnet mbuf and cluster cache zones: hand
  * out up to "count" items from the queue passed as "arg".  Returns the
  * number of items stored.
  */
 static int
 dn_buf_import(void *arg, void **store, int count, int domain __unused,
     int flags)
 {
 	struct mbufq *queue = arg;
 	struct mbuf *buf;
 	int filled = 0;
 
 	while (filled < count) {
 		buf = mbufq_dequeue(queue);
 		if (buf == NULL)
 			break;
 		/* The item size depends on which pool is being drained. */
 		trash_init(buf, queue == &dn_mbufq ? MSIZE : dn_clsize, flags);
 		store[filled++] = buf;
 	}
 	KASSERT((flags & M_WAITOK) == 0 || filled == count,
 	    ("%s: ran out of pre-allocated mbufs", __func__));
 	return (filled);
 }
 
 /*
  * Release callback of the debugnet mbuf and cluster cache zones: put
  * the items back on the queue passed as "arg".
  */
 static void
 dn_buf_release(void *arg, void **store, int count)
 {
 	struct mbufq *queue = arg;
 	struct mbuf *buf;
 	int idx = 0;
 
 	while (idx < count) {
 		buf = store[idx++];
 		(void)mbufq_enqueue(queue, buf);
 	}
 }
 
 /*
  * Import callback of the debugnet packet cache zone: build up to
  * "count" packets, each one an mbuf with a cluster from dn_zone_clust
  * attached.  Returns the number of packets stored.
  */
 static int
 dn_pack_import(void *arg __unused, void **store, int count, int domain __unused,
     int flags __unused)
 {
 	struct mbuf *hdr;
 	void *cl;
 	int built = 0;
 
 	while (built < count) {
 		hdr = m_get(MT_DATA, M_NOWAIT);
 		if (hdr == NULL)
 			break;
 		cl = uma_zalloc(dn_zone_clust, M_NOWAIT);
 		if (cl == NULL) {
 			/* No cluster left: give the lone mbuf back. */
 			m_free(hdr);
 			break;
 		}
 		mb_ctor_clust(cl, dn_clsize, hdr, 0);
 		store[built++] = hdr;
 	}
 	KASSERT((flags & M_WAITOK) == 0 || built == count,
 	    ("%s: ran out of pre-allocated mbufs", __func__));
 	return (built);
 }
 
 /*
  * Release callback of the debugnet packet cache zone: split every
  * packet and return the cluster and the mbuf to their cache zones.
  */
 static void
 dn_pack_release(void *arg __unused, void **store, int count)
 {
 	struct mbuf *pkt;
 	void *cl;
 	int idx = 0;
 
 	while (idx < count) {
 		pkt = store[idx++];
 		/* Fetch the cluster pointer before the mbuf goes away. */
 		cl = pkt->m_ext.ext_buf;
 		uma_zfree(dn_zone_clust, cl);
 		uma_zfree(dn_zone_mbuf, pkt);
 	}
 }
 
 /*
  * Free the pre-allocated mbufs and clusters reserved for debugnet, and destroy
  * the corresponding UMA cache zones.
  */
 void
 debugnet_mbuf_drain(void)
 {
 	struct mbuf *m;
 	void *item;
 
 	/* Tear down whichever cache zones currently exist. */
 	if (dn_zone_mbuf != NULL) {
 		uma_zdestroy(dn_zone_mbuf);
 		dn_zone_mbuf = NULL;
 	}
 	if (dn_zone_clust != NULL) {
 		uma_zdestroy(dn_zone_clust);
 		dn_zone_clust = NULL;
 	}
 	if (dn_zone_pack != NULL) {
 		uma_zdestroy(dn_zone_pack);
 		dn_zone_pack = NULL;
 	}
 
 	/* Empty the mbuf pool. */
 	for (;;) {
 		m = mbufq_dequeue(&dn_mbufq);
 		if (m == NULL)
 			break;
 		m_free(m);
 	}
 
 	/* Empty the cluster pool into the zone matching the cluster size. */
 	for (;;) {
 		item = mbufq_dequeue(&dn_clustq);
 		if (item == NULL)
 			break;
 		uma_zfree(m_getzone(dn_clsize), item);
 	}
 }
 
 /*
  * Callback invoked immediately prior to starting a debugnet connection.
  */
 void
 debugnet_mbuf_start(void)
 {
 
 	MPASS(!dn_saved_zones.dsz_debugnet_zones_enabled);
 
 	/* Remember the regular zones so they can be put back later. */
 	dn_saved_zones.dsz_mbuf = zone_mbuf;
 	dn_saved_zones.dsz_clust = zone_clust;
 	dn_saved_zones.dsz_pack = zone_pack;
 	dn_saved_zones.dsz_jumbop = zone_jumbop;
 	dn_saved_zones.dsz_jumbo9 = zone_jumbo9;
 	dn_saved_zones.dsz_jumbo16 = zone_jumbo16;
 	dn_saved_zones.dsz_debugnet_zones_enabled = true;
 
 	/*
 	 * Every cluster zone, jumbo ones included, is redirected to the
 	 * single debugnet cluster zone, which returns buffers of the size
 	 * requested by the drivers.  The driver has to reinitialize the
 	 * zones if the MTU of a debugnet-enabled interface changes.
 	 */
 	printf("debugnet: overwriting mbuf zone pointers\n");
 	zone_mbuf = dn_zone_mbuf;
 	zone_pack = dn_zone_pack;
 	zone_clust = dn_zone_clust;
 	zone_jumbop = dn_zone_clust;
 	zone_jumbo9 = dn_zone_clust;
 	zone_jumbo16 = dn_zone_clust;
 }
 
 /*
  * Callback invoked when a debugnet connection is closed/finished.
  */
 void
 debugnet_mbuf_finish(void)
 {
 
 	MPASS(dn_saved_zones.dsz_debugnet_zones_enabled);
 
 	printf("debugnet: restoring mbuf zone pointers\n");
 
 	/* Put back the zone pointers captured by debugnet_mbuf_start(). */
 	zone_jumbo16 = dn_saved_zones.dsz_jumbo16;
 	zone_jumbo9 = dn_saved_zones.dsz_jumbo9;
 	zone_jumbop = dn_saved_zones.dsz_jumbop;
 	zone_pack = dn_saved_zones.dsz_pack;
 	zone_clust = dn_saved_zones.dsz_clust;
 	zone_mbuf = dn_saved_zones.dsz_mbuf;
 
 	/* Wipe the saved state; this also clears the "enabled" marker. */
 	memset(&dn_saved_zones, 0, sizeof(dn_saved_zones));
 }
 
 /*
  * Reinitialize the debugnet mbuf+cluster pool and cache zones.
  */
 void
 debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize)
 {
 	struct mbuf *m;
 	void *item;
 
 	/* Discard any pool and cache zones left over from an earlier call. */
 	debugnet_mbuf_drain();
 
 	dn_clsize = clsize;
 
 	/* Cache zone for plain mbufs, backed by dn_mbufq. */
 	dn_zone_mbuf = uma_zcache_create("debugnet_" MBUF_MEM_NAME,
 	    MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL,
 	    dn_buf_import, dn_buf_release,
 	    &dn_mbufq, UMA_ZONE_NOBUCKET);
 
 	/* Cache zone for clusters of 'clsize' bytes, backed by dn_clustq. */
 	dn_zone_clust = uma_zcache_create("debugnet_" MBUF_CLUSTER_MEM_NAME,
 	    clsize, mb_ctor_clust, NULL, NULL, NULL,
 	    dn_buf_import, dn_buf_release,
 	    &dn_clustq, UMA_ZONE_NOBUCKET);
 
 	/* Cache zone for mbuf+cluster packets; it has no private queue. */
 	dn_zone_pack = uma_zcache_create("debugnet_" MBUF_PACKET_MEM_NAME,
 	    MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL,
 	    dn_pack_import, dn_pack_release,
 	    NULL, UMA_ZONE_NOBUCKET);
 
 	/*
 	 * Pre-populate the pools by allocating from the regular zones and
 	 * freeing into the debugnet cache zones.  m_get() takes its
 	 * arguments as (how, type); passing them in that order makes this a
 	 * sleeping M_WAITOK allocation of an MT_DATA mbuf, matching the
 	 * cluster loop below.
 	 */
 	while (nmbuf-- > 0) {
 		m = m_get(M_WAITOK, MT_DATA);
 		uma_zfree(dn_zone_mbuf, m);
 	}
 	while (nclust-- > 0) {
 		item = uma_zalloc(m_getzone(dn_clsize), M_WAITOK);
 		uma_zfree(dn_zone_clust, item);
 	}
 }
 #endif /* DEBUGNET */
 
 /*
  * Constructor for Mbuf primary zone.
  *
  * The 'arg' pointer points to a mb_args structure which
  * contains call-specific information required to support the
  * mbuf allocation API.  See mbuf.h.
  */
 static int
 mb_ctor_mbuf(void *mem, int size, void *arg, int how)
 {
 	struct mb_args *mba = arg;
 	short mtype = mba->type;
 	int flags;
 
 	/*
 	 * MT_NOINIT means the caller will initialize the mbuf itself later
 	 * on, including any MAC labels, so there is nothing to do here.
 	 */
 	if (mtype == MT_NOINIT)
 		return (0);
 
 	flags = mba->flags;
 	MPASS((flags & M_NOFREE) == 0);
 
 	/* Hand the m_init() result straight back to the caller. */
 	return (m_init((struct mbuf *)mem, how, mtype, flags));
 }
 
 /*
  * The Mbuf primary zone destructor.
  */
 static void
 mb_dtor_mbuf(void *mem, int size, void *arg)
 {
 	struct mbuf *mb = mem;
 
 	KASSERT((mb->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
 	KASSERT(((unsigned long)arg & 0x1) == 0, ("%s: obsolete MB_DTOR_SKIP passed", __func__));
 
 	/* Only packet-header mbufs can carry a tag list. */
 	if ((mb->m_flags & M_PKTHDR) == 0)
 		return;
 	if (!SLIST_EMPTY(&mb->m_pkthdr.tags))
 		m_tag_delete_chain(mb, NULL);
 }
 
 /*
  * The Mbuf Packet zone destructor.
  */
 static void
 mb_dtor_pack(void *mem, int size, void *arg)
 {
 	struct mbuf *pkt = mem;
 
 	/* A packet-header mbuf gets its tag chain deleted. */
 	if (pkt->m_flags & M_PKTHDR)
 		m_tag_delete_chain(pkt, NULL);
 
 	/* Sanity-check that the attached cluster is in packet-zone shape. */
 	KASSERT((pkt->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
 	KASSERT(pkt->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__));
 	KASSERT(pkt->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__));
 	KASSERT(pkt->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__));
 	KASSERT(pkt->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__));
 	KASSERT(pkt->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__));
 	KASSERT(pkt->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__));
 #if defined(INVARIANTS) && !defined(KMSAN)
 	trash_dtor(pkt->m_ext.ext_buf, MCLBYTES, arg);
 #endif
 	/*
 	 * When the cluster zone reports exhaustion, drain the packet zone so
 	 * that anything blocked waiting on zone_clust can make progress.
 	 * The exhaustion check is done without the zone lock and may miss
 	 * the condition; that is accepted on purpose to avoid taking the
 	 * lock on every mbuf free.
 	 */
 	if (uma_zone_exhausted(zone_clust))
 		uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
 }
 
 /*
  * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor.
  *
  * Here the 'arg' pointer points to the Mbuf which we
  * are configuring cluster storage for.  If 'arg' is
  * empty we allocate just the cluster without setting
  * the mbuf to it.  See mbuf.h.
  */
 static int
 mb_ctor_clust(void *mem, int size, void *arg, int how)
 {
 	struct mbuf *mb = arg;
 
 	/* No mbuf supplied: the caller wants a bare cluster. */
 	if (mb == NULL)
 		return (0);
 
 	/* Attach the cluster and point the mbuf's data at its start. */
 	mb->m_ext.ext_buf = (char *)mem;
 	mb->m_data = mb->m_ext.ext_buf;
 	mb->m_flags |= M_EXT;
 
 	/* Describe the storage: size-derived type, no custom free routine. */
 	mb->m_ext.ext_size = size;
 	mb->m_ext.ext_type = m_gettype(size);
 	mb->m_ext.ext_free = NULL;
 	mb->m_ext.ext_arg1 = NULL;
 	mb->m_ext.ext_arg2 = NULL;
 
 	/* The reference count is embedded in this mbuf and starts at one. */
 	mb->m_ext.ext_flags = EXT_FLAG_EMBREF;
 	mb->m_ext.ext_count = 1;
 
 	return (0);
 }
 
 /*
  * The Packet secondary zone's init routine, executed on the
  * object's transition from mbuf keg slab to zone cache.
  */
 static int
 mb_zinit_pack(void *mem, int size, int how)
 {
 	struct mbuf *pkt = mem;		/* Freshly obtained, untouched mbuf. */
 
 	/* Attach a cluster; either failure indication means ENOMEM. */
 	if (uma_zalloc_arg(zone_clust, pkt, how) == NULL)
 		return (ENOMEM);
 	if (pkt->m_ext.ext_buf == NULL)
 		return (ENOMEM);
 
 	/* Replace the type set by the cluster constructor. */
 	pkt->m_ext.ext_type = EXT_PACKET;
 #if defined(INVARIANTS) && !defined(KMSAN)
 	trash_init(pkt->m_ext.ext_buf, MCLBYTES, how);
 #endif
 	return (0);
 }
 
 /*
  * The Packet secondary zone's fini routine, executed on the
  * object's transition from zone cache to keg slab.
  */
 static void
 mb_zfini_pack(void *mem, int size)
 {
 	struct mbuf *pkt = mem;
 
 #if defined(INVARIANTS) && !defined(KMSAN)
 	trash_fini(pkt->m_ext.ext_buf, MCLBYTES);
 #endif
 	/* Return the attached cluster to the cluster zone. */
 	uma_zfree_arg(zone_clust, pkt->m_ext.ext_buf, NULL);
 #if defined(INVARIANTS) && !defined(KMSAN)
 	trash_dtor(mem, size, NULL);
 #endif
 }
 
 /*
  * The "packet" keg constructor.
  */
 static int
 mb_ctor_pack(void *mem, int size, void *arg, int how)
 {
 	struct mb_args *mba = arg;
 	struct mbuf *pkt = mem;
 	int flags = mba->flags;
 	short mtype = mba->type;
 	int rv;
 
 	MPASS((flags & M_NOFREE) == 0);
 
 #if defined(INVARIANTS) && !defined(KMSAN)
 	trash_ctor(pkt->m_ext.ext_buf, MCLBYTES, arg, how);
 #endif
 
 	rv = m_init(pkt, how, mtype, flags);
 
 	/*
 	 * The m_ext fields were set up when the packet entered the zone, so
 	 * only the flags and the data pointer need to be (re)established.
 	 */
 	pkt->m_flags = (flags | M_EXT);
 	pkt->m_data = pkt->m_ext.ext_buf;
 
 	return (rv);
 }
 
 /*
  * This is the protocol drain routine.  Called by UMA whenever any of the
  * mbuf zones is close to its limit.
  *
  * No locks should be held when this is called.  The drain routines have to
  * presently acquire some locks which raises the possibility of lock order
  * reversal.
  */
 static void
 mb_reclaim(uma_zone_t zone __unused, int pending __unused)
 {
 	struct epoch_tracker et;
 	struct protosw *psw;
 	struct domain *dom;
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__);
 
 	/* Walk every protocol of every domain inside the network epoch. */
 	NET_EPOCH_ENTER(et);
 	for (dom = domains; dom != NULL; dom = dom->dom_next) {
 		for (psw = dom->dom_protosw; psw < dom->dom_protoswNPROTOSW;
 		    psw++) {
 			/* Protocols without a drain hook are skipped. */
 			if (psw->pr_drain == NULL)
 				continue;
 			(*psw->pr_drain)();
 		}
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Free "count" units of I/O from an mbuf chain.  They could be held
  * in M_EXTPG or just as a normal mbuf.  This code is intended to be
  * called in an error path (I/O error, closed connection, etc).
  */
 void
 mb_free_notready(struct mbuf *m, int count)
 {
 	int i;
 
 	for (i = 0; i < count && m != NULL; i++) {
 		if ((m->m_flags & M_EXTPG) != 0) {
 			/*
 			 * One unit is one not-ready page; the mbuf itself is
 			 * freed only when its last such page is accounted for.
 			 */
 			m->m_epg_nrdy--;
 			if (m->m_epg_nrdy == 0)
 				m = m_free(m);
 		} else {
 			/* An ordinary mbuf is a single unit. */
 			m = m_free(m);
 		}
 	}
 	KASSERT(i == count, ("Removed only %d items from %p", i, m));
 }
 
 /*
  * Compress an unmapped mbuf into a simple mbuf when it holds a small
  * amount of data.  This is used as a DOS defense to avoid having
  * small packets tie up wired pages, an ext_pgs structure, and an
  * mbuf.  Since this converts the existing mbuf in place, it can only
  * be used if there are no other references to 'm'.
  */
 /*
  * Returns 0 after converting 'm' in place, or EBUSY (leaving 'm'
  * untouched) when the external storage has a reference count other
  * than one.
  */
 int
 mb_unmapped_compress(struct mbuf *m)
 {
 	volatile u_int *refcnt;
 	char buf[MLEN];		/* Bounce buffer; m_len <= MLEN is asserted. */
 
 	/*
 	 * Assert that 'm' does not have a packet header.  If 'm' had
 	 * a packet header, it would only be able to hold MHLEN bytes
 	 * and m_data would have to be initialized differently.
 	 */
 	KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXTPG),
             ("%s: m %p !M_EXTPG or M_PKTHDR", __func__, m));
 	KASSERT(m->m_len <= MLEN, ("m_len too large %p", m));
 
 	/* Locate the reference count: embedded in 'm' or held elsewhere. */
 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
 		refcnt = &m->m_ext.ext_count;
 	} else {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		refcnt = m->m_ext.ext_cnt;
 	}
 
 	/* Any count other than one means the conversion is refused. */
 	if (*refcnt != 1)
 		return (EBUSY);
 
 	/* Save the payload before the backing pages are released. */
 	m_copydata(m, 0, m->m_len, buf);
 
 	/* Free the backing pages. */
 	m->m_ext.ext_free(m);
 
 	/* Turn 'm' into a "normal" mbuf. */
 	m->m_flags &= ~(M_EXT | M_RDONLY | M_EXTPG);
 	m->m_data = m->m_dat;
 
 	/* Copy data back into m. */
 	bcopy(buf, mtod(m, char *), m->m_len);
 
 	return (0);
 }
 
 /*
  * These next few routines are used to permit downgrading an unmapped
  * mbuf to a chain of mapped mbufs.  This is used when an interface
  * doesn't support unmapped mbufs or if checksums need to be
  * computed in software.
  *
  * Each unmapped mbuf is converted to a chain of mbufs.  First, any
  * TLS header data is stored in a regular mbuf.  Second, each page of
  * unmapped data is stored in an mbuf with an EXT_SFBUF external
  * cluster.  These mbufs use an sf_buf to provide a valid KVA for the
  * associated physical page.  They also hold a reference on the
  * original M_EXTPG mbuf to ensure the physical page doesn't go away.
  * Finally, any TLS trailer data is stored in a regular mbuf.
  *
  * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF
  * mbufs.  It frees the associated sf_buf and releases its reference
  * on the original M_EXTPG mbuf.
  *
  * _mb_unmapped_to_ext() is a helper function that converts a single
  * unmapped mbuf into a chain of mbufs.
  *
  * mb_unmapped_to_ext() is the public function that walks an mbuf
  * chain converting any unmapped mbufs to mapped mbufs.  It returns
  * the new chain of mapped mbufs on success.  On failure it frees
  * the original mbuf chain and returns NULL.
  */
 static void
 mb_unmapped_free_mext(struct mbuf *m)
 {
 
 	/* ext_arg1 holds the sf_buf that provided the mapping. */
 	sf_buf_free(m->m_ext.ext_arg1);
 
 	/* ext_arg2 holds the backing M_EXTPG mbuf; drop our reference. */
 	mb_free_extpg(m->m_ext.ext_arg2);
 }
 
 /*
  * Convert the single M_EXTPG mbuf 'm' into a chain of mapped mbufs
  * covering the same m_len bytes.  Header and trailer bytes are copied
  * into plain mbufs; each data page is wrapped in an EXT_SFBUF mbuf
  * whose sf_buf supplies the mapping.  All allocations are non-sleeping.
  *
  * Returns the head of the new chain on success, or NULL on failure.
  * In both cases m_free() is called on 'm' before returning; on failure
  * the partially built chain is freed as well.
  */
 static struct mbuf *
 _mb_unmapped_to_ext(struct mbuf *m)
 {
 	struct mbuf *m_new, *top, *prev, *mref;
 	struct sf_buf *sf;
 	vm_page_t pg;
 	int i, len, off, pglen, pgoff, seglen, segoff;
 	volatile u_int *refcnt;
 	u_int ref_inc = 0;	/* Number of EXT_SFBUF mbufs created so far. */
 
 	M_ASSERTEXTPG(m);
 	len = m->m_len;
 	KASSERT(m->m_epg_tls == NULL, ("%s: can't convert TLS mbuf %p",
 	    __func__, m));
 
 	/* See if this is the mbuf that holds the embedded refcount. */
 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
 		refcnt = &m->m_ext.ext_count;
 		mref = m;
 	} else {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		refcnt = m->m_ext.ext_cnt;
 		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
 	}
 
 	/* Skip over any data removed from the front. */
 	off = mtod(m, vm_offset_t);
 
 	/* Stage 1: copy whatever part of the header is still in range. */
 	top = NULL;
 	if (m->m_epg_hdrlen != 0) {
 		if (off >= m->m_epg_hdrlen) {
 			/* The whole header was trimmed; just consume it. */
 			off -= m->m_epg_hdrlen;
 		} else {
 			seglen = m->m_epg_hdrlen - off;
 			segoff = off;
 			seglen = min(seglen, len);
 			off = 0;
 			len -= seglen;
 			m_new = m_get(M_NOWAIT, MT_DATA);
 			if (m_new == NULL)
 				goto fail;
 			m_new->m_len = seglen;
 			prev = top = m_new;
 			memcpy(mtod(m_new, void *), &m->m_epg_hdr[segoff],
 			    seglen);
 		}
 	}
 
 	/* Stage 2: one EXT_SFBUF mbuf per page that still holds data. */
 	pgoff = m->m_epg_1st_off;
 	for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
 		pglen = m_epg_pagelen(m, i, pgoff);
 		if (off >= pglen) {
 			/* This page lies entirely before the data start. */
 			off -= pglen;
 			pgoff = 0;
 			continue;
 		}
 		seglen = pglen - off;
 		segoff = pgoff + off;
 		off = 0;
 		seglen = min(seglen, len);
 		len -= seglen;
 
 		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
 		m_new = m_get(M_NOWAIT, MT_DATA);
 		if (m_new == NULL)
 			goto fail;
 		/*
 		 * Link the new mbuf in before allocating the sf_buf so the
 		 * failure path frees it along with the rest of 'top'.
 		 */
 		if (top == NULL) {
 			top = prev = m_new;
 		} else {
 			prev->m_next = m_new;
 			prev = m_new;
 		}
 		sf = sf_buf_alloc(pg, SFB_NOWAIT);
 		if (sf == NULL)
 			goto fail;
 
 		ref_inc++;
 		m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE,
 		    mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF);
 		m_new->m_data += segoff;
 		m_new->m_len = seglen;
 
 		/* Only the first page can start at a non-zero offset. */
 		pgoff = 0;
 	};
 
 	/* Stage 3: anything left over must come from the trailer. */
 	if (len != 0) {
 		KASSERT((off + len) <= m->m_epg_trllen,
 		    ("off + len > trail (%d + %d > %d)", off, len,
 		    m->m_epg_trllen));
 		m_new = m_get(M_NOWAIT, MT_DATA);
 		if (m_new == NULL)
 			goto fail;
 		if (top == NULL)
 			top = m_new;
 		else
 			prev->m_next = m_new;
 		m_new->m_len = len;
 		memcpy(mtod(m_new, void *), &m->m_epg_trail[off], len);
 	}
 
 	if (ref_inc != 0) {
 		/*
 		 * Obtain an additional reference on the old mbuf for
 		 * each created EXT_SFBUF mbuf.  They will be dropped
 		 * in mb_unmapped_free_mext().
 		 */
 		if (*refcnt == 1)
 			*refcnt += ref_inc;
 		else
 			atomic_add_int(refcnt, ref_inc);
 	}
 	/* Drop the caller's reference on the original mbuf. */
 	m_free(m);
 	return (top);
 
 fail:
 	if (ref_inc != 0) {
 		/*
 		 * Obtain an additional reference on the old mbuf for
 		 * each created EXT_SFBUF mbuf.  They will be
 		 * immediately dropped when these mbufs are freed
 		 * below.
 		 */
 		if (*refcnt == 1)
 			*refcnt += ref_inc;
 		else
 			atomic_add_int(refcnt, ref_inc);
 	}
 	m_free(m);
 	m_freem(top);
 	return (NULL);
 }
 
 /*
  * Walk the chain 'top', replacing every M_EXTPG mbuf with the chain of
  * mapped mbufs produced by _mb_unmapped_to_ext().  Returns the head of
  * the resulting chain, or NULL after freeing everything on failure.
  */
 struct mbuf *
 mb_unmapped_to_ext(struct mbuf *top)
 {
 	struct mbuf *m, *next, *prev;
 
 	prev = NULL;
 	for (m = top; m != NULL; m = next) {
 		/* m might be freed, so cache the next pointer. */
 		next = m->m_next;
 		if (m->m_flags & M_EXTPG) {
 			if (prev != NULL) {
 				/*
 				 * Remove 'm' from the new chain so
 				 * that the 'top' chain terminates
 				 * before 'm' in case 'top' is freed
 				 * due to an error.
 				 */
 				prev->m_next = NULL;
 			}
 			m = _mb_unmapped_to_ext(m);
 			if (m == NULL) {
 				/*
 				 * The helper has already released the mbuf
 				 * it failed to convert.  While prev is still
 				 * NULL that mbuf was 'top' itself, so there
 				 * is no already-processed prefix to free and
 				 * touching 'top' again would release it a
 				 * second time.
 				 */
 				if (prev != NULL)
 					m_freem(top);
 				m_freem(next);
 				return (NULL);
 			}
 			if (prev == NULL) {
 				top = m;
 			} else {
 				prev->m_next = m;
 			}
 
 			/*
 			 * Replaced one mbuf with a chain, so we must
 			 * find the end of chain.
 			 */
 			prev = m_last(m);
 		} else {
 			if (prev != NULL) {
 				prev->m_next = m;
 			}
 			prev = m;
 		}
 	}
 	return (top);
 }
 
 /*
  * Allocate an empty M_EXTPG mbuf.  The ext_free routine is
  * responsible for freeing any pages backing this mbuf when it is
  * freed.
  */
 struct mbuf *
 mb_alloc_ext_pgs(int how, m_ext_free_t ext_free)
 {
 	struct mbuf *mb;
 
 	mb = m_get(how, MT_DATA);
 	if (mb == NULL)
 		return (NULL);
 
 	/* Mark the mbuf as read-only, page-backed external storage. */
 	mb->m_flags |= (M_EXT | M_RDONLY | M_EXTPG);
 	mb->m_data = NULL;
 
 	/* No pages, header or trailer are attached yet. */
 	mb->m_epg_npgs = 0;
 	mb->m_epg_nrdy = 0;
 	mb->m_epg_1st_off = 0;
 	mb->m_epg_last_len = 0;
 	mb->m_epg_hdrlen = 0;
 	mb->m_epg_trllen = 0;
 	mb->m_epg_flags = 0;
 	mb->m_epg_tls = NULL;
 	mb->m_epg_so = NULL;
 
 	/* Embedded reference count of one; caller-supplied free routine. */
 	mb->m_ext.ext_flags = EXT_FLAG_EMBREF;
 	mb->m_ext.ext_count = 1;
 	mb->m_ext.ext_size = 0;
 	mb->m_ext.ext_free = ext_free;
 
 	return (mb);
 }
 
 /*
  * Clean up after mbufs with M_EXT storage attached to them if the
  * reference count hits 1.
  */
 /*
  * 'm' must have M_EXT set.  'mref' below is the mbuf that physically
  * contains the reference count, which may or may not be 'm' itself.
  */
 void
 mb_free_ext(struct mbuf *m)
 {
 	volatile u_int *refcnt;
 	struct mbuf *mref;
 	int freembuf;
 
 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
 
 	/* See if this is the mbuf that holds the embedded refcount. */
 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
 		refcnt = &m->m_ext.ext_count;
 		mref = m;
 	} else {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		refcnt = m->m_ext.ext_cnt;
 		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
 	}
 
 	/*
 	 * Check if the header is embedded in the cluster.  It is
 	 * important that we can't touch any of the mbuf fields
 	 * after we have freed the external storage, since mbuf
 	 * could have been embedded in it.  For now, the mbufs
 	 * embedded into the cluster are always of type EXT_EXTREF,
 	 * and for this type we won't free the mref.
 	 */
 	if (m->m_flags & M_NOFREE) {
 		freembuf = 0;
 		KASSERT(m->m_ext.ext_type == EXT_EXTREF ||
 		    m->m_ext.ext_type == EXT_RXRING,
 		    ("%s: no-free mbuf %p has wrong type", __func__, m));
 	} else
 		freembuf = 1;
 
 	/*
 	 * Free attached storage if this mbuf is the only reference to it.
 	 * A count of exactly one is tested with a plain read first; only
 	 * otherwise is the atomic decrement performed.
 	 */
 	if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
 		switch (m->m_ext.ext_type) {
 		case EXT_PACKET:
 			/* The packet zone is special. */
 			/*
 			 * The mbuf and cluster go back together, so the
 			 * count is put back to one if the decrement above
 			 * took it to zero.
 			 */
 			if (*refcnt == 0)
 				*refcnt = 1;
 			uma_zfree(zone_pack, mref);
 			break;
 		case EXT_CLUSTER:
 			uma_zfree(zone_clust, m->m_ext.ext_buf);
 			m_free_raw(mref);
 			break;
 		case EXT_JUMBOP:
 			uma_zfree(zone_jumbop, m->m_ext.ext_buf);
 			m_free_raw(mref);
 			break;
 		case EXT_JUMBO9:
 			uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
 			m_free_raw(mref);
 			break;
 		case EXT_JUMBO16:
 			uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
 			m_free_raw(mref);
 			break;
 		case EXT_SFBUF:
 		case EXT_NET_DRV:
 		case EXT_MOD_TYPE:
 		case EXT_DISPOSABLE:
 			/* Custom storage: the free routine of mref is used. */
 			KASSERT(mref->m_ext.ext_free != NULL,
 			    ("%s: ext_free not set", __func__));
 			mref->m_ext.ext_free(mref);
 			m_free_raw(mref);
 			break;
 		case EXT_EXTREF:
 			/* Only the free routine runs; mref is not freed. */
 			KASSERT(m->m_ext.ext_free != NULL,
 			    ("%s: ext_free not set", __func__));
 			m->m_ext.ext_free(m);
 			break;
 		case EXT_RXRING:
 			/* Nothing is freed for this type. */
 			KASSERT(m->m_ext.ext_free == NULL,
 			    ("%s: ext_free is set", __func__));
 			break;
 		default:
 			KASSERT(m->m_ext.ext_type == 0,
 			    ("%s: unknown ext_type", __func__));
 		}
 	}
 
 	/*
 	 * When 'm' is not the refcount holder it is freed here, unless it
 	 * was marked M_NOFREE; the holder case is handled in the switch.
 	 */
 	if (freembuf && m != mref)
 		m_free_raw(m);
 }
 
 /*
  * Clean up after mbufs with M_EXTPG storage attached to them if the
  * reference count hits 1.
  */
 /*
  * 'm' must be an M_EXTPG mbuf.  'mref' below is the mbuf that
  * physically contains the reference count, which may be 'm' itself.
  */
 void
 mb_free_extpg(struct mbuf *m)
 {
 	volatile u_int *refcnt;
 	struct mbuf *mref;
 
 	M_ASSERTEXTPG(m);
 
 	/* See if this is the mbuf that holds the embedded refcount. */
 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
 		refcnt = &m->m_ext.ext_count;
 		mref = m;
 	} else {
 		KASSERT(m->m_ext.ext_cnt != NULL,
 		    ("%s: no refcounting pointer on %p", __func__, m));
 		refcnt = m->m_ext.ext_cnt;
 		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
 	}
 
 	/* Free attached storage if this mbuf is the only reference to it. */
 	if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
 		KASSERT(mref->m_ext.ext_free != NULL,
 		    ("%s: ext_free not set", __func__));
 
 		mref->m_ext.ext_free(mref);
 #ifdef KERN_TLS
 		/*
 		 * With a TLS session attached, try to drop its reference
 		 * here; if this would be the last reference the mbuf is
 		 * handed to ktls_enqueue_to_free() instead of being freed
 		 * directly.  Note the dangling 'else' that pairs with the
 		 * m_free_raw() after the #endif.
 		 */
 		if (mref->m_epg_tls != NULL &&
 		    !refcount_release_if_not_last(&mref->m_epg_tls->refcount))
 			ktls_enqueue_to_free(mref);
 		else
 #endif
 			m_free_raw(mref);
 	}
 
 	/* A non-holder mbuf is always freed, whatever the count was. */
 	if (m != mref)
 		m_free_raw(m);
 }
 
 /*
  * Official mbuf(9) allocation KPI for stack and drivers:
  *
  * m_get()	- a single mbuf without any attachments, sys/mbuf.h.
  * m_gethdr()	- a single mbuf initialized as M_PKTHDR, sys/mbuf.h.
  * m_getcl()	- an mbuf + 2k cluster, sys/mbuf.h.
  * m_clget()	- attach cluster to already allocated mbuf.
  * m_cljget()	- attach jumbo cluster to already allocated mbuf.
  * m_get2()	- allocate minimum mbuf that would fit size argument.
  * m_getm2()	- allocate a chain of mbufs/clusters.
  * m_extadd()	- attach external cluster to mbuf.
  *
  * m_free()	- free single mbuf with its tags and ext, sys/mbuf.h.
  * m_freem()	- free chain of mbufs.
  */
 
 int
 m_clget(struct mbuf *m, int how)
 {
 
 	KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
 	    __func__, m));
 
 	/* Cleared first so that a failed allocation can be detected. */
 	m->m_ext.ext_buf = NULL;
 	uma_zalloc_arg(zone_clust, m, how);
 
 	/*
 	 * A non-sleeping request that came back empty gets one more try
 	 * after draining the packet zone, which may free up some clusters.
 	 */
 	if (m->m_ext.ext_buf == NULL && (how & M_NOWAIT) != 0) {
 		uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
 		uma_zalloc_arg(zone_clust, m, how);
 	}
 	MBUF_PROBE2(m__clget, m, how);
 
 	/* Non-zero exactly when a cluster ended up attached. */
 	return (m->m_flags & M_EXT);
 }
 
 /*
  * m_cljget() is different from m_clget() as it can allocate clusters without
  * attaching them to an mbuf.  In that case the return value is the pointer
  * to the cluster of the requested size.  If an mbuf was specified, it gets
  * the cluster attached to it and the return value can be safely ignored.
  * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
  */
 void *
 m_cljget(struct mbuf *m, int how, int size)
 {
 	void *cl;
 
 	/* With an mbuf supplied it must not already have storage. */
 	if (m != NULL) {
 		KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
 		    __func__, m));
 		m->m_ext.ext_buf = NULL;
 	}
 
 	/* Allocate from the zone matching 'size'; 'm' may be NULL. */
 	cl = uma_zalloc_arg(m_getzone(size), m, how);
 
 	MBUF_PROBE4(m__cljget, m, how, size, cl);
 
 	return (cl);
 }
 
 /*
  * m_get2() allocates minimum mbuf that would fit "size" argument.
  */
 struct mbuf *
 m_get2(int size, int how, short type, int flags)
 {
 	struct mb_args args;
 	struct mbuf *mb;
 
 	args.type = type;
 	args.flags = flags;
 
 	/* Fits in the mbuf's own data area: a plain mbuf is enough. */
 	if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0))
 		return (uma_zalloc_arg(zone_mbuf, &args, how));
 
 	/* Fits in a standard cluster: take one from the packet zone. */
 	if (size <= MCLBYTES)
 		return (uma_zalloc_arg(zone_pack, &args, how));
 
 	/* Anything beyond a page-sized jumbo cluster is refused. */
 	if (size > MJUMPAGESIZE)
 		return (NULL);
 
 	/* Otherwise: an mbuf plus a page-sized jumbo cluster. */
 	mb = uma_zalloc_arg(zone_mbuf, &args, how);
 	if (mb == NULL)
 		return (NULL);
 
 	if (uma_zalloc_arg(zone_jumbop, mb, how) == NULL) {
 		m_free_raw(mb);
 		return (NULL);
 	}
 
 	return (mb);
 }
 
 /*
  * m_get3() allocates minimum mbuf that would fit "size" argument.
  * Unlike m_get2() it can allocate clusters up to MJUM16BYTES.
  */
 struct mbuf *
 m_get3(int size, int how, short type, int flags)
 {
 	struct mb_args args;
 	struct mbuf *m, *n;
 	uma_zone_t zone;
 
 	/* Sizes up to a page-sized jumbo cluster are m_get2()'s job. */
 	if (size <= MJUMPAGESIZE)
 		return (m_get2(size, how, type, flags));
 
 	/* Nothing larger than a 16k jumbo cluster can be provided. */
 	if (size > MJUM16BYTES)
 		return (NULL);
 
 	args.flags = flags;
 	args.type = type;
 
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	if (m == NULL)
 		return (NULL);
 
 	/* Pick the smallest jumbo zone that can hold 'size' bytes. */
 	if (size <= MJUM9BYTES)
 		zone = zone_jumbo9;
 	else
 		zone = zone_jumbo16;
 
 	/*
 	 * Allocate from the zone chosen above.  Using zone_jumbop here
 	 * would attach a cluster smaller than the requested size.
 	 */
 	n = uma_zalloc_arg(zone, m, how);
 	if (n == NULL) {
 		m_free_raw(m);
 		return (NULL);
 	}
 
 	return (m);
 }
 
 /*
  * m_getjcl() returns an mbuf with a cluster of the specified size attached.
  * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
  */
 struct mbuf *
 m_getjcl(int how, short type, int flags, int size)
 {
 	struct mb_args args;
 	struct mbuf *mb;
 
 	/* Standard-size clusters are served by m_getcl(). */
 	if (size == MCLBYTES)
 		return (m_getcl(how, type, flags));
 
 	args.type = type;
 	args.flags = flags;
 
 	mb = uma_zalloc_arg(zone_mbuf, &args, how);
 	if (mb == NULL)
 		return (NULL);
 
 	/* Attach a cluster from the zone that matches 'size'. */
 	if (uma_zalloc_arg(m_getzone(size), mb, how) == NULL) {
 		m_free_raw(mb);
 		return (NULL);
 	}
 	MBUF_PROBE5(m__getjcl, how, type, flags, size, mb);
 	return (mb);
 }
 
 /*
  * Allocate a given length worth of mbufs and/or clusters (whatever fits
  * best) and return a pointer to the top of the allocated chain.  If an
  * existing mbuf chain is provided, then we will append the new chain
  * to the existing one and return a pointer to the provided mbuf.
  */
 struct mbuf *
 m_getm2(struct mbuf *m, int len, int how, short type, int flags)
 {
 	struct mbuf *mb, *nm = NULL, *mtail = NULL;
 
 	KASSERT(len >= 0, ("%s: len is < 0", __func__));
 
 	/* Validate flags. */
 	flags &= (M_PKTHDR | M_EOR);
 
 	/* Packet header mbuf must be first in chain. */
 	if ((flags & M_PKTHDR) && m != NULL)
 		flags &= ~M_PKTHDR;
 
 	/* Loop and append maximum sized mbufs to the chain tail. */
 	while (len > 0) {
 		mb = NULL;
 		/*
 		 * For large remainders first try, without sleeping, for a
 		 * page-sized jumbo cluster; fall back to the caller's 'how'
 		 * with a smaller buffer if that is not available.
 		 */
 		if (len > MCLBYTES) {
 			mb = m_getjcl(M_NOWAIT, type, (flags & M_PKTHDR),
 			    MJUMPAGESIZE);
 		}
 		if (mb == NULL) {
 			if (len >= MINCLSIZE)
 				mb = m_getcl(how, type, (flags & M_PKTHDR));
 			else if (flags & M_PKTHDR)
 				mb = m_gethdr(how, type);
 			else
 				mb = m_get(how, type);
 
 			/*
 			 * Fail the whole operation if one mbuf can't be
 			 * allocated.
 			 */
 			if (mb == NULL) {
 				m_freem(nm);
 				return (NULL);
 			}
 		}
 
 		/* Book keeping. */
 		len -= M_SIZE(mb);
 		if (mtail != NULL)
 			mtail->m_next = mb;
 		else
 			nm = mb;
 		mtail = mb;
 		flags &= ~M_PKTHDR;	/* Only valid on the first mbuf. */
 	}
 
 	/*
 	 * M_EOR is only valid on the last mbuf.  When len was 0 nothing was
 	 * allocated and mtail is still NULL, so there is no mbuf to mark.
 	 */
 	if ((flags & M_EOR) && mtail != NULL)
 		mtail->m_flags |= M_EOR;
 
 	/* If mbuf was supplied, append new chain to the end of it. */
 	if (m != NULL) {
 		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
 			;
 		mtail->m_next = nm;
 		mtail->m_flags &= ~M_EOR;
 	} else
 		m = nm;
 
 	return (m);
 }
 
 /*-
  * Configure a provided mbuf to refer to the provided external storage
  * buffer and setup a reference count for said buffer.
  *
  * Arguments:
  *    mb     The existing mbuf to which to attach the provided buffer.
  *    buf    The address of the provided external storage buffer.
  *    size   The size of the provided buffer.
  *    freef  A pointer to a routine that is responsible for freeing the
  *           provided external storage buffer.
  *    arg1, arg2
  *           Opaque pointers (of any type) stored in the mbuf's ext_arg1
  *           and ext_arg2 for use by the freef routine (may be NULL).
  *    flags  Any other flags to be passed to the provided mbuf.
  *    type   The type that the external storage buffer should be
  *           labeled with.
  *
  * Returns:
  *    Nothing.
  */
 void
 m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef,
     void *arg1, void *arg2, int flags, int type)
 {
 
 	KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));
 
 	/* Record the external buffer and how to release it. */
 	mb->m_ext.ext_buf = buf;
 	mb->m_ext.ext_size = size;
 	mb->m_ext.ext_type = type;
 	mb->m_ext.ext_free = freef;
 	mb->m_ext.ext_arg1 = arg1;
 	mb->m_ext.ext_arg2 = arg2;
 
 	/* Data starts at the beginning of the buffer. */
 	mb->m_data = buf;
 	mb->m_flags |= (M_EXT | flags);
 
 	/*
 	 * EXT_EXTREF storage gets no embedded reference count; every other
 	 * type starts with an embedded count of one.
 	 */
 	if (type == EXT_EXTREF) {
 		mb->m_ext.ext_flags = 0;
 		return;
 	}
 	mb->m_ext.ext_flags = EXT_FLAG_EMBREF;
 	mb->m_ext.ext_count = 1;
 }
 
 /*
  * Free an entire chain of mbufs and associated external buffers, if
  * applicable.
  */
 void
 m_freem(struct mbuf *mb)
 {
 
 	MBUF_PROBE1(m__freem, mb);
 
 	/* m_free() hands back the successor, which drives the walk. */
 	for (; mb != NULL; mb = m_free(mb))
 		continue;
 }
 
 /*
  * Temporary primitive to allow freeing without going through m_free.
  */
 void
 m_free_raw(struct mbuf *mb)
 {
 
 	/* Return the mbuf straight to zone_mbuf; nothing else is touched. */
 	uma_zfree(zone_mbuf, mb);
 }
 
 int
 m_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
     struct m_snd_tag **mstp)
 {
 
 	/* Delegate to the interface's allocator when it provides one. */
 	if (ifp->if_snd_tag_alloc != NULL)
 		return (ifp->if_snd_tag_alloc(ifp, params, mstp));
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Initialize the send tag 'mst' for interface 'ifp': take a reference
  * on the interface, record it in the tag, start the tag's reference
  * count at one and bump the global snd_tag_count counter.
  */
 void
-m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp, u_int type)
+m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp,
+    const struct if_snd_tag_sw *sw)
 {
 
 	/* The interface reference is held for the lifetime of the tag. */
 	if_ref(ifp);
 	mst->ifp = ifp;
 	refcount_init(&mst->refcount, 1);
-	mst->type = type;
+	mst->sw = sw;
 	counter_u64_add(snd_tag_count, 1);
 }
 
 /*
  * Destroy the send tag 'mst': run its free routine, release the
  * interface reference recorded in the tag and decrement the global
  * snd_tag_count counter.
  */
 void
 m_snd_tag_destroy(struct m_snd_tag *mst)
 {
 	struct ifnet *ifp;
 
 	/*
 	 * Read the interface pointer before the free routine runs --
 	 * presumably because 'mst' may no longer be valid afterwards
 	 * (NOTE(review): confirm against the snd_tag_free implementations).
 	 */
 	ifp = mst->ifp;
-	ifp->if_snd_tag_free(mst);
+	mst->sw->snd_tag_free(mst);
 	if_rele(ifp);
 	counter_u64_add(snd_tag_count, -1);
 }
 
 /*
  * Allocate an mbuf with anonymous external pages.
  */
 struct mbuf *
 mb_alloc_ext_plus_pages(int len, int how)
 {
 	struct mbuf *mb;
 	vm_page_t page;
 	int idx, npages;
 
 	mb = mb_alloc_ext_pgs(how, mb_free_mext_pgs);
 	if (mb == NULL)
 		return (NULL);
 	mb->m_epg_flags |= EPG_FLAG_ANON;
 
 	/* One wired page per PAGE_SIZE of 'len', rounded up. */
 	npages = howmany(len, PAGE_SIZE);
 	for (idx = 0; idx < npages; idx++) {
 		for (;;) {
 			page = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 			    VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | VM_ALLOC_WIRED);
 			if (page != NULL)
 				break;
 			if (how == M_NOWAIT) {
 				/*
 				 * Record how many pages were attached so far
 				 * before freeing the mbuf.
 				 */
 				mb->m_epg_npgs = idx;
 				m_free(mb);
 				return (NULL);
 			}
 			/* Sleeping is allowed: wait and try again. */
 			vm_wait(NULL);
 		}
 		mb->m_epg_pa[idx] = VM_PAGE_TO_PHYS(page);
 	}
 	mb->m_epg_npgs = npages;
 	return (mb);
 }
 
 /*
  * Copy the data in the mbuf chain to a chain of mbufs with anonymous external
  * unmapped pages.
  * len is the length of data in the input mbuf chain.
  * mlen is the maximum number of bytes put into each ext_page mbuf.
  */
 /*
  * Returns the head of the new chain, or NULL when len is 0, when an
  * allocation fails, or when the input chain runs out before 'len'
  * bytes were copied.  If 'mlast' is not NULL it receives the last mbuf
  * of the new chain on success.  The input chain 'mp' is only read.
  */
 struct mbuf *
 mb_mapped_to_unmapped(struct mbuf *mp, int len, int mlen, int how,
     struct mbuf **mlast)
 {
 	struct mbuf *m, *mout;
 	char *pgpos, *mbpos;
 	int i, mblen, mbufsiz, pglen, xfer;
 
 	if (len == 0)
 		return (NULL);
 	mbufsiz = min(mlen, len);
 	m = mout = mb_alloc_ext_plus_pages(mbufsiz, how);
 	if (m == NULL)
 		return (m);
 	/*
 	 * Output cursor: pgpos/pglen track the write position and the room
 	 * left in page i of the current output mbuf.  Input cursor:
 	 * mbpos/mblen track the read position and bytes left in the current
 	 * input mbuf.
 	 */
 	pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[0]);
 	pglen = PAGE_SIZE;
 	mblen = 0;
 	i = 0;
 	do {
 		if (pglen == 0) {
 			/* Current page is full: move to the next one. */
 			if (++i == m->m_epg_npgs) {
 				/*
 				 * All pages of this mbuf are full, so its
 				 * last page holds PAGE_SIZE bytes; chain a
 				 * fresh mbuf for the remaining data.
 				 */
 				m->m_epg_last_len = PAGE_SIZE;
 				mbufsiz = min(mlen, len);
 				m->m_next = mb_alloc_ext_plus_pages(mbufsiz,
 				    how);
 				m = m->m_next;
 				if (m == NULL) {
 					m_freem(mout);
 					return (m);
 				}
 				i = 0;
 			}
 			pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]);
 			pglen = PAGE_SIZE;
 		}
 		/* Advance to the next input mbuf that has data in it. */
 		while (mblen == 0) {
 			if (mp == NULL) {
 				/* Input ended before 'len' bytes were seen. */
 				m_freem(mout);
 				return (NULL);
 			}
 			KASSERT((mp->m_flags & M_EXTPG) == 0,
 			    ("mb_copym_ext_pgs: ext_pgs input mbuf"));
 			mbpos = mtod(mp, char *);
 			mblen = mp->m_len;
 			mp = mp->m_next;
 		}
 		/* Copy as much as both the input mbuf and the page allow. */
 		xfer = min(mblen, pglen);
 		memcpy(pgpos, mbpos, xfer);
 		pgpos += xfer;
 		mbpos += xfer;
 		pglen -= xfer;
 		mblen -= xfer;
 		len -= xfer;
 		m->m_len += xfer;
 	} while (len > 0);
 	/* The final page is filled only up to the current write position. */
 	m->m_epg_last_len = PAGE_SIZE - pglen;
 	if (mlast != NULL)
 		*mlast = m;
 	return (mout);
 }
diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index 28fc7a0a97ec..9e9a6b5b60fb 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -1,2559 +1,2557 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2014-2019 Netflix Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/domainset.h>
 #include <sys/endian.h>
 #include <sys/ktls.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/rmlock.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/refcount.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/kthread.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
 #include <machine/pcb.h>
 #endif
 #include <machine/vmparam.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #ifdef RSS
 #include <net/netisr.h>
 #include <net/rss_config.h>
 #endif
 #include <net/route.h>
 #include <net/route/nhop.h>
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #endif
 #include <netinet/tcp_var.h>
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #include <opencrypto/cryptodev.h>
 #include <opencrypto/ktls.h>
 #include <vm/uma_dbg.h>
 #include <vm/vm.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pagequeue.h>
 
 /*
  * Per-CPU work queue.  ktls_init() allocates an array of mp_maxid + 1 of
  * these and, for every CPU, initializes the entry and starts a
  * ktls_work_thread() with the entry as its context.  The structure is
  * cache-line aligned.
  */
 struct ktls_wq {
 	/* Initialized in ktls_init() with the name "ktls work queue". */
 	struct mtx	mtx;
 	/* Queue of mbufs; NOTE(review): presumably records awaiting SW encryption. */
 	STAILQ_HEAD(, mbuf) m_head;
 	/* Queue of sockets; NOTE(review): presumably awaiting SW decryption. */
 	STAILQ_HEAD(, socket) so_head;
 	/* NOTE(review): usage is not visible in this part of the file. */
 	bool		running;
 	int		lastallocfail;
 } __aligned(CACHE_LINE_SIZE);
 
 /*
  * State of a per-domain buffer allocation thread.  One instance is embedded
  * in each struct ktls_domain_info; ktls_init() starts the thread.
  */
 struct ktls_alloc_thread {
 	/* NOTE(review): presumably statistics kept by ktls_alloc_thread(). */
 	uint64_t wakeups;
 	uint64_t allocs;
 	/* Thread handle filled in by kproc_kthread_add() in ktls_init(). */
 	struct thread *td;
 	/* NOTE(review): usage is not visible in this part of the file. */
 	int running;
 };
 
 /*
  * Per-domain bookkeeping.  When ktls_bind_threads > 1, ktls_init() records
  * each CPU under the domain reported by its pcpu data; ktls_get_cpu() then
  * picks a CPU from cpu[0 .. count - 1] for connections tied to the domain.
  */
 struct ktls_domain_info {
 	/* Number of valid entries in cpu[]. */
 	int count;
 	/* CPU ids recorded for this domain. */
 	int cpu[MAXCPU];
 	/* Allocation thread started for this domain by ktls_init(). */
 	struct ktls_alloc_thread alloc_td;
 };
 
 /* Per-domain state, indexed by domain number. */
 struct ktls_domain_info ktls_domains[MAXMEMDOM];
 /* Array of work queues indexed by CPU id; allocated in ktls_init(). */
 static struct ktls_wq *ktls_wq;
 /* Process handle passed to kproc_kthread_add() for every KTLS thread. */
 static struct proc *ktls_proc;
 /* Zone for struct ktls_session. */
 static uma_zone_t ktls_session_zone;
 /* Cache zone of output buffers; only created if ktls_sw_buffer_cache is set. */
 static uma_zone_t ktls_buffer_zone;
 /* Maps a worker index (0 .. ktls_number_threads - 1) to a CPU id. */
 static uint16_t ktls_cpuid_lookup[MAXCPU];
 
 /* Root sysctl nodes: kern.ipc.tls and kern.ipc.tls.stats. */
 SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Kernel TLS offload");
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Kernel TLS offload stats");
 
 /*
  * Thread binding policy, consumed by ktls_init(): 0 leaves worker threads
  * unbound, 1 binds each to its CPU, values above 1 bind to the CPU's
  * domain.  Defaults to 1 when the kernel is built with RSS.
  */
 #ifdef RSS
 static int ktls_bind_threads = 1;
 #else
 static int ktls_bind_threads;
 #endif
 SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN,
     &ktls_bind_threads, 0,
     "Bind crypto threads to cores (1) or cores and domains (2) at boot");
 
 /*
  * Maximum TLS record size.  ktls_buffer_import() asserts that this is a
  * multiple of the page size.
  */
 static u_int ktls_maxlen = 16384;
 SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RDTUN,
     &ktls_maxlen, 0, "Maximum TLS record size");
 
 /* Number of worker threads created by ktls_init() (one per CPU). */
 static int ktls_number_threads;
 SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD,
     &ktls_number_threads, 0,
     "Number of TLS threads in thread-pool");
 
 /* Not static: NOTE(review) presumably referenced from other files. */
 unsigned int ktls_ifnet_max_rexmit_pct = 2;
 SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN,
     &ktls_ifnet_max_rexmit_pct, 2,
     "Max percent bytes retransmitted before ifnet TLS is disabled");
 
 /* Master switch checked first by ktls_enable_rx() and ktls_enable_tx(). */
 static bool ktls_offload_enable;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN,
     &ktls_offload_enable, 0,
     "Enable support for kernel TLS offload");
 
 /* When false, ktls_enable_rx() refuses AES-CBC sessions with ENOTSUP. */
 static bool ktls_cbc_enable = true;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RWTUN,
     &ktls_cbc_enable, 1,
     "Enable Support of AES-CBC crypto for kernel TLS");
 
 /*
  * Controls whether ktls_init() creates ktls_buffer_zone and the per-domain
  * allocation threads.
  */
 static bool ktls_sw_buffer_cache = true;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, sw_buffer_cache, CTLFLAG_RDTUN,
     &ktls_sw_buffer_cache, 1,
     "Enable caching of output buffers for SW encryption");
 
 static int ktls_max_alloc = 128;
 SYSCTL_INT(_kern_ipc_tls, OID_AUTO, max_alloc, CTLFLAG_RWTUN,
     &ktls_max_alloc, 128,
     "Max number of 16k buffers to allocate in thread context");
 
 /*
  * Global statistics counters.  Each counter is exported read-only through
  * sysctl; the description string of each entry states what it counts.
  */
 static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active);
 SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
     &ktls_tasks_active, "Number of active tasks");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_queued);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD,
     &ktls_cnt_tx_queued,
     "Number of TLS records in queue to tasks for SW encryption");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_cnt_rx_queued);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD,
     &ktls_cnt_rx_queued,
     "Number of TLS sockets in queue to tasks for SW decryption");
 
 /* Bumped at the end of a successful ktls_enable_rx(). */
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_total);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total,
     CTLFLAG_RD, &ktls_offload_total,
     "Total successful TLS setups (parameters set)");
 
 /* Bumped on every enable attempt that gets past the initial checks. */
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_enable_calls);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls,
     CTLFLAG_RD, &ktls_offload_enable_calls,
     "Total number of TLS enable calls made");
 
 /*
  * Incremented when a session is created or cloned and decremented in
  * ktls_cleanup().
  */
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_active);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD,
     &ktls_offload_active, "Total Active TLS sessions");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_corrupted_records);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD,
     &ktls_offload_corrupted_records, "Total corrupted TLS records received");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_failed_crypto);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD,
     &ktls_offload_failed_crypto, "Total TLS crypto failures");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_ifnet);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD,
     &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_sw);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD,
     &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD,
     &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed, CTLFLAG_RD,
     &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from ifnet");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok, CTLFLAG_RD,
     &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from ifnet");
 
 /*
  * Per-backend sysctl nodes (kern.ipc.tls.sw, .ifnet and, with TCP_OFFLOAD,
  * .toe) followed by their per-cipher session counters.  The per-cipher
  * counters are incremented by the matching ktls_try_*() function and
  * decremented in ktls_cleanup().
  */
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Software TLS session stats");
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Hardware (ifnet) TLS session stats");
 #ifdef TCP_OFFLOAD
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "TOE TLS session stats");
 #endif
 
 static COUNTER_U64_DEFINE_EARLY(ktls_sw_cbc);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc,
     "Active number of software TLS sessions using AES-CBC");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_sw_gcm);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm,
     "Active number of software TLS sessions using AES-GCM");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_sw_chacha20);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, chacha20, CTLFLAG_RD,
     &ktls_sw_chacha20,
     "Active number of software TLS sessions using Chacha20-Poly1305");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_cbc);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD,
     &ktls_ifnet_cbc,
     "Active number of ifnet TLS sessions using AES-CBC");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_gcm);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD,
     &ktls_ifnet_gcm,
     "Active number of ifnet TLS sessions using AES-GCM");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_chacha20);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, chacha20, CTLFLAG_RD,
     &ktls_ifnet_chacha20,
     "Active number of ifnet TLS sessions using Chacha20-Poly1305");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD,
     &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_dropped);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD,
     &ktls_ifnet_reset_dropped,
     "TLS sessions dropped after failing to update ifnet send tag");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_failed);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD,
     &ktls_ifnet_reset_failed,
     "TLS sessions that failed to allocate a new ifnet send tag");
 
 /*
  * Administrative switch consulted by ktls_alloc_snd_tag(): when it is 0,
  * non-forced requests for an ifnet TLS send tag fail with ENXIO.
  *
  * NOTE(review): the variable is declared int but exported with SYSCTL_UINT,
  * and it has no explicit initializer although 1 is passed as the macro's
  * value argument -- confirm which default is intended.
  */
 static int ktls_ifnet_permitted;
 SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN,
     &ktls_ifnet_permitted, 1,
     "Whether to permit hardware (ifnet) TLS sessions");
 
 #ifdef TCP_OFFLOAD
 static COUNTER_U64_DEFINE_EARLY(ktls_toe_cbc);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, cbc, CTLFLAG_RD,
     &ktls_toe_cbc,
     "Active number of TOE TLS sessions using AES-CBC");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_toe_gcm);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, gcm, CTLFLAG_RD,
     &ktls_toe_gcm,
     "Active number of TOE TLS sessions using AES-GCM");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_toe_chacha20);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, chacha20, CTLFLAG_RD,
     &ktls_toe_chacha20,
     "Active number of TOE TLS sessions using Chacha20-Poly1305");
 #endif
 
 /* malloc(9) type used for the work queue array and for key material. */
 static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS");
 
 /* Forward declarations for functions referenced before their definition. */
 static void ktls_cleanup(struct ktls_session *tls);
 #if defined(INET) || defined(INET6)
 /* Task handler installed on each session via TASK_INIT(). */
 static void ktls_reset_send_tag(void *context, int pending);
 #endif
 /* Thread entry points started from ktls_init(). */
 static void ktls_work_thread(void *ctx);
 static void ktls_alloc_thread(void *ctx);
 
 #if defined(INET) || defined(INET6)
 /*
  * Choose the CPU whose work queue will service the connection behind 'so'.
  *
  * With RSS the RSS mapping of the flow wins whenever it yields a CPU.
  * Otherwise the flowid shards connections in a repeatable way, either
  * across the CPUs of the connection's NUMA domain (NUMA kernels with
  * ktls_bind_threads > 1 and a known domain) or across all worker threads.
  * Keeping one connection on one queue matters: TLS 1.0 sessions rely on
  * the serialization this provides.
  */
 static u_int
 ktls_get_cpu(struct socket *so)
 {
 	struct inpcb *pcb;
 #ifdef NUMA
 	struct ktls_domain_info *dom;
 #endif
 #ifdef RSS
 	u_int rss_cpu;
 #endif
 
 	pcb = sotoinpcb(so);
 
 #ifdef RSS
 	rss_cpu = rss_hash2cpuid(pcb->inp_flowid, pcb->inp_flowtype);
 	if (rss_cpu != NETISR_CPUID_NONE)
 		return (rss_cpu);
 #endif
 
 #ifdef NUMA
 	if (ktls_bind_threads > 1 && pcb->inp_numa_domain != M_NODOM) {
 		dom = &ktls_domains[pcb->inp_numa_domain];
 		return (dom->cpu[pcb->inp_flowid % dom->count]);
 	}
 #endif
 
 	return (ktls_cpuid_lookup[pcb->inp_flowid % ktls_number_threads]);
 }
 #endif
 
 static int
 ktls_buffer_import(void *arg, void **store, int count, int domain, int flags)
 {
 	vm_page_t m;
 	int i;
 
 	KASSERT((ktls_maxlen & PAGE_MASK) == 0,
 	    ("%s: ktls max length %d is not page size-aligned",
 	    __func__, ktls_maxlen));
 
 	for (i = 0; i < count; i++) {
 		m = vm_page_alloc_contig_domain(NULL, 0, domain,
 		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 		    VM_ALLOC_NODUMP | malloc2vm_flags(flags),
 		    atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0,
 		    VM_MEMATTR_DEFAULT);
 		if (m == NULL)
 			break;
 		store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	}
 	return (i);
 }
 
 /*
  * Release callback of the "ktls_buffers" cache zone: for each of the
  * 'count' buffers in store[], unwire and free every one of its
  * atop(ktls_maxlen) pages.
  */
 static void
 ktls_buffer_release(void *arg __unused, void **store, int count)
 {
 	vm_page_t first, pg;
 	vm_offset_t va;
 	int buf, idx;
 
 	for (buf = 0; buf < count; buf++) {
 		/* Translate the direct-map address back to its first page. */
 		va = (vm_offset_t)store[buf];
 		first = PHYS_TO_VM_PAGE(DMAP_TO_PHYS(va));
 		for (idx = 0; idx < atop(ktls_maxlen); idx++) {
 			pg = first + idx;
 			(void)vm_page_unwire_noq(pg);
 			vm_page_free(pg);
 		}
 	}
 }
 
 /*
  * Return the buffer backing an M_EXTPG mbuf to ktls_buffer_zone.  The
  * buffer is identified by the direct-map address of the mbuf's first page.
  */
 static void
 ktls_free_mext_contig(struct mbuf *m)
 {
 	void *buf;
 
 	M_ASSERTEXTPG(m);
 	buf = (void *)PHYS_TO_DMAP(m->m_epg_pa[0]);
 	uma_zfree(ktls_buffer_zone, buf);
 }
 
 /*
  * One-time KTLS initialization, run from SYSINIT at SI_SUB_SMP + 1.
  *
  * Allocates the per-CPU work queue array, creates the session zone and
  * (if ktls_sw_buffer_cache is set) the buffer cache zone, starts one
  * worker thread per CPU, optionally binds the workers according to
  * ktls_bind_threads, and starts one allocation thread per non-empty
  * domain.  Any failure to create or bind a thread panics.
  */
 static void
 ktls_init(void *dummy __unused)
 {
 	struct thread *td;
 	struct pcpu *pc;
 	cpuset_t mask;
 	int count, domain, error, i;
 
 	/* Indexed by CPU id, hence mp_maxid + 1 entries. */
 	ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS,
 	    M_WAITOK | M_ZERO);
 
 	ktls_session_zone = uma_zcreate("ktls_session",
 	    sizeof(struct ktls_session),
 	    NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_CACHE, 0);
 
 	if (ktls_sw_buffer_cache) {
 		ktls_buffer_zone = uma_zcache_create("ktls_buffers",
 		    roundup2(ktls_maxlen, PAGE_SIZE), NULL, NULL, NULL, NULL,
 		    ktls_buffer_import, ktls_buffer_release, NULL,
 		    UMA_ZONE_FIRSTTOUCH);
 	}
 
 	/*
 	 * Initialize the workqueues to run the TLS work.  We create a
 	 * work queue for each CPU.
 	 */
 	CPU_FOREACH(i) {
 		STAILQ_INIT(&ktls_wq[i].m_head);
 		STAILQ_INIT(&ktls_wq[i].so_head);
 		mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF);
 		error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i],
 		    &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i);
 		if (error)
 			panic("Can't add KTLS thread %d error %d", i, error);
 
 		/*
 		 * Bind threads to cores.  If ktls_bind_threads is >
 		 * 1, then we bind to the NUMA domain.
 		 */
 		if (ktls_bind_threads) {
 			if (ktls_bind_threads > 1) {
 				/* Record this CPU under its domain. */
 				pc = pcpu_find(i);
 				domain = pc->pc_domain;
 				CPU_COPY(&cpuset_domain[domain], &mask);
 				count = ktls_domains[domain].count;
 				ktls_domains[domain].cpu[count] = i;
 				ktls_domains[domain].count++;
 			} else {
 				CPU_SETOF(i, &mask);
 			}
 			error = cpuset_setthread(td->td_tid, &mask);
 			if (error)
 				panic(
 			    "Unable to bind KTLS thread for CPU %d error %d",
 				     i, error);
 		}
 		/* Dense worker index -> CPU id map used by ktls_get_cpu(). */
 		ktls_cpuid_lookup[ktls_number_threads] = i;
 		ktls_number_threads++;
 	}
 
 	/*
 	 * Start an allocation thread per-domain to perform blocking allocations
 	 * of 16k physically contiguous TLS crypto destination buffers.
 	 */
 	if (ktls_sw_buffer_cache) {
 		for (domain = 0; domain < vm_ndomains; domain++) {
 			/* Skip domains without memory or without CPUs. */
 			if (VM_DOMAIN_EMPTY(domain))
 				continue;
 			if (CPU_EMPTY(&cpuset_domain[domain]))
 				continue;
 			error = kproc_kthread_add(ktls_alloc_thread,
 			    &ktls_domains[domain], &ktls_proc,
 			    &ktls_domains[domain].alloc_td.td,
 			    0, 0, "KTLS", "alloc_%d", domain);
 			if (error)
 				panic("Can't add KTLS alloc thread %d error %d",
 				    domain, error);
 			CPU_COPY(&cpuset_domain[domain], &mask);
 			error = cpuset_setthread(ktls_domains[domain].alloc_td.td->td_tid,
 			    &mask);
 			if (error)
 				panic("Unable to bind KTLS alloc %d error %d",
 				    domain, error);
 		}
 	}
 
 	/*
 	 * If we somehow have an empty domain, fall back to choosing
 	 * among all KTLS threads.
 	 *
 	 * Only the policy variable is changed here; ktls_get_cpu() then
 	 * stops using the per-domain CPU lists.  The bindings applied to
 	 * the worker threads above are not redone.
 	 */
 	if (ktls_bind_threads > 1) {
 		for (i = 0; i < vm_ndomains; i++) {
 			if (ktls_domains[i].count == 0) {
 				ktls_bind_threads = 1;
 				break;
 			}
 		}
 	}
 
 	if (bootverbose)
 		printf("KTLS: Initialized %d threads\n", ktls_number_threads);
 }
 SYSINIT(ktls, SI_SUB_SMP + 1, SI_ORDER_ANY, ktls_init, NULL);
 
 #if defined(INET) || defined(INET6)
 /*
  * Validate the parameters in 'en' and build a new TLS session from them.
  *
  * so	socket the session is for; used to choose the work queue.
  * en	parameters supplied by the caller.  The key and IV pointers are
  *	user addresses read with copyin().  en->iv_len is reset to 0 for
  *	AES-CBC suites that use explicit IVs.
  * tlsp	receives the new session (reference count 1) on success and is
  *	left untouched on failure.
  *
  * Returns 0 on success, EINVAL for unsupported or inconsistent
  * parameters, or the error from copyin().  On failure nothing is left
  * allocated.
  */
 static int
 ktls_create_session(struct socket *so, struct tls_enable *en,
     struct ktls_session **tlsp)
 {
 	struct ktls_session *tls;
 	int error;
 
 	/* Only TLS 1.0 - 1.3 are supported. */
 	if (en->tls_vmajor != TLS_MAJOR_VER_ONE)
 		return (EINVAL);
 	if (en->tls_vminor < TLS_MINOR_VER_ZERO ||
 	    en->tls_vminor > TLS_MINOR_VER_THREE)
 		return (EINVAL);
 
 	if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE)
 		return (EINVAL);
 	if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE)
 		return (EINVAL);
 	if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv))
 		return (EINVAL);
 
 	/* All supported algorithms require a cipher key. */
 	if (en->cipher_key_len == 0)
 		return (EINVAL);
 
 	/* No flags are currently supported. */
 	if (en->flags != 0)
 		return (EINVAL);
 
 	/* Common checks for supported algorithms. */
 	switch (en->cipher_algorithm) {
 	case CRYPTO_AES_NIST_GCM_16:
 		/*
 		 * auth_algorithm isn't used, but permit GMAC values
 		 * for compatibility.
 		 */
 		switch (en->auth_algorithm) {
 		case 0:
 #ifdef COMPAT_FREEBSD12
 		/* XXX: Really 13.0-current COMPAT. */
 		case CRYPTO_AES_128_NIST_GMAC:
 		case CRYPTO_AES_192_NIST_GMAC:
 		case CRYPTO_AES_256_NIST_GMAC:
 #endif
 			break;
 		default:
 			return (EINVAL);
 		}
 		if (en->auth_key_len != 0)
 			return (EINVAL);
 
 		/*
 		 * AEAD cipher suites only exist from TLS 1.2 on.  Older
 		 * versions have no defined nonce layout, so the IV
 		 * length could not be validated for them.
 		 */
 		if (en->tls_vminor < TLS_MINOR_VER_TWO)
 			return (EINVAL);
 		if ((en->tls_vminor == TLS_MINOR_VER_TWO &&
 			en->iv_len != TLS_AEAD_GCM_LEN) ||
 		    (en->tls_vminor == TLS_MINOR_VER_THREE &&
 			en->iv_len != TLS_1_3_GCM_IV_LEN))
 			return (EINVAL);
 		break;
 	case CRYPTO_AES_CBC:
 		/* TLS 1.3 only defines AEAD cipher suites. */
 		if (en->tls_vminor == TLS_MINOR_VER_THREE)
 			return (EINVAL);
 		switch (en->auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 			/*
 			 * TLS 1.0 requires an implicit IV.  TLS 1.1+
 			 * all use explicit IVs.
 			 */
 			if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
 				if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN)
 					return (EINVAL);
 				break;
 			}
 
 			/* FALLTHROUGH */
 		case CRYPTO_SHA2_256_HMAC:
 		case CRYPTO_SHA2_384_HMAC:
 			/* Ignore any supplied IV. */
 			en->iv_len = 0;
 			break;
 		default:
 			return (EINVAL);
 		}
 		if (en->auth_key_len == 0)
 			return (EINVAL);
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		if (en->auth_algorithm != 0 || en->auth_key_len != 0)
 			return (EINVAL);
 		if (en->tls_vminor != TLS_MINOR_VER_TWO &&
 		    en->tls_vminor != TLS_MINOR_VER_THREE)
 			return (EINVAL);
 		if (en->iv_len != TLS_CHACHA20_IV_LEN)
 			return (EINVAL);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
 
 	counter_u64_add(ktls_offload_active, 1);
 
 	refcount_init(&tls->refcount, 1);
 	TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls);
 
 	tls->wq_index = ktls_get_cpu(so);
 
 	tls->params.cipher_algorithm = en->cipher_algorithm;
 	tls->params.auth_algorithm = en->auth_algorithm;
 	tls->params.tls_vmajor = en->tls_vmajor;
 	tls->params.tls_vminor = en->tls_vminor;
 	tls->params.flags = en->flags;
 	tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen);
 
 	/* Set the header and trailer lengths. */
 	tls->params.tls_hlen = sizeof(struct tls_record_layer);
 	switch (en->cipher_algorithm) {
 	case CRYPTO_AES_NIST_GCM_16:
 		/*
 		 * TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte
 		 * nonce.  TLS 1.3 uses a 12 byte implicit IV.
 		 */
 		if (en->tls_vminor < TLS_MINOR_VER_THREE)
 			tls->params.tls_hlen += sizeof(uint64_t);
 		tls->params.tls_tlen = AES_GMAC_HASH_LEN;
 		tls->params.tls_bs = 1;
 		break;
 	case CRYPTO_AES_CBC:
 		switch (en->auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 			if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
 				/* Implicit IV, no nonce. */
 			} else {
 				tls->params.tls_hlen += AES_BLOCK_LEN;
 			}
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA1_HASH_LEN;
 			break;
 		case CRYPTO_SHA2_256_HMAC:
 			tls->params.tls_hlen += AES_BLOCK_LEN;
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA2_256_HASH_LEN;
 			break;
 		case CRYPTO_SHA2_384_HMAC:
 			tls->params.tls_hlen += AES_BLOCK_LEN;
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA2_384_HASH_LEN;
 			break;
 		default:
 			panic("invalid hmac");
 		}
 		tls->params.tls_bs = AES_BLOCK_LEN;
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		/*
 		 * Chacha20 uses a 12 byte implicit IV.
 		 */
 		tls->params.tls_tlen = POLY1305_HASH_LEN;
 		tls->params.tls_bs = 1;
 		break;
 	default:
 		panic("invalid cipher");
 	}
 
 	/*
 	 * TLS 1.3 includes optional padding which we do not support,
 	 * and also puts the "real" record type at the end of the
 	 * encrypted data.
 	 */
 	if (en->tls_vminor == TLS_MINOR_VER_THREE)
 		tls->params.tls_tlen += sizeof(uint8_t);
 
 	KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN,
 	    ("TLS header length too long: %d", tls->params.tls_hlen));
 	KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN,
 	    ("TLS trailer length too long: %d", tls->params.tls_tlen));
 
 	if (en->auth_key_len != 0) {
 		tls->params.auth_key_len = en->auth_key_len;
 		tls->params.auth_key = malloc(en->auth_key_len, M_KTLS,
 		    M_WAITOK);
 		error = copyin(en->auth_key, tls->params.auth_key,
 		    en->auth_key_len);
 		if (error)
 			goto out;
 	}
 
 	tls->params.cipher_key_len = en->cipher_key_len;
 	tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK);
 	error = copyin(en->cipher_key, tls->params.cipher_key,
 	    en->cipher_key_len);
 	if (error)
 		goto out;
 
 	/*
 	 * This holds the implicit portion of the nonce for AEAD
 	 * ciphers and the initial implicit IV for TLS 1.0.  The
 	 * explicit portions of the IV are generated in ktls_frame().
 	 */
 	if (en->iv_len != 0) {
 		tls->params.iv_len = en->iv_len;
 		error = copyin(en->iv, tls->params.iv, en->iv_len);
 		if (error)
 			goto out;
 
 		/*
 		 * For TLS 1.2 with GCM, generate an 8-byte nonce as a
 		 * counter to generate unique explicit IVs.
 		 *
 		 * Store this counter in the last 8 bytes of the IV
 		 * array so that it is 8-byte aligned.
 		 */
 		if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
 		    en->tls_vminor == TLS_MINOR_VER_TWO)
 			arc4rand(tls->params.iv + 8, sizeof(uint64_t), 0);
 	}
 
 	*tlsp = tls;
 	return (0);
 
 out:
 	/*
 	 * ktls_cleanup() only releases what the session refers to (keys,
 	 * counters); the session itself has to be handed back to its
 	 * zone here, otherwise it is leaked.
 	 */
 	ktls_cleanup(tls);
 	uma_zfree(ktls_session_zone, tls);
 	return (error);
 }
 
 /*
  * Create a new session with the same parameters and work queue as 'tls'.
  * The key buffers are duplicated so that the two sessions can be torn
  * down independently.  The new session starts with a reference count of
  * 1 and its own reset-tag task.  All allocations use M_WAITOK.
  */
 static struct ktls_session *
 ktls_clone_session(struct ktls_session *tls)
 {
 	struct ktls_session *copy;
 
 	copy = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
 	counter_u64_add(ktls_offload_active, 1);
 
 	refcount_init(&copy->refcount, 1);
 	TASK_INIT(&copy->reset_tag_task, 0, ktls_reset_send_tag, copy);
 
 	/* Start from a shallow copy of the original's settings. */
 	copy->params = tls->params;
 	copy->wq_index = tls->wq_index;
 
 	/* The authentication key is optional; give the copy its own. */
 	if (copy->params.auth_key != NULL) {
 		copy->params.auth_key = malloc(tls->params.auth_key_len,
 		    M_KTLS, M_WAITOK);
 		memcpy(copy->params.auth_key, tls->params.auth_key,
 		    tls->params.auth_key_len);
 	}
 
 	/* The cipher key is always duplicated. */
 	copy->params.cipher_key = malloc(tls->params.cipher_key_len,
 	    M_KTLS, M_WAITOK);
 	memcpy(copy->params.cipher_key, tls->params.cipher_key,
 	    tls->params.cipher_key_len);
 
 	return (copy);
 }
 #endif
 
 /*
  * Undo the accounting for a session and release what it refers to: the
  * per-mode/per-cipher counter, the backend state (OCF state for software
  * sessions, the send tag for ifnet sessions), both keys and the IV.
  * The session structure itself is not freed here.
  */
 static void
 ktls_cleanup(struct ktls_session *tls)
 {
 	const int alg = tls->params.cipher_algorithm;
 
 	counter_u64_add(ktls_offload_active, -1);
 
 	if (tls->mode == TCP_TLS_MODE_SW) {
 		if (alg == CRYPTO_AES_CBC)
 			counter_u64_add(ktls_sw_cbc, -1);
 		else if (alg == CRYPTO_AES_NIST_GCM_16)
 			counter_u64_add(ktls_sw_gcm, -1);
 		else if (alg == CRYPTO_CHACHA20_POLY1305)
 			counter_u64_add(ktls_sw_chacha20, -1);
 		ktls_ocf_free(tls);
 	} else if (tls->mode == TCP_TLS_MODE_IFNET) {
 		if (alg == CRYPTO_AES_CBC)
 			counter_u64_add(ktls_ifnet_cbc, -1);
 		else if (alg == CRYPTO_AES_NIST_GCM_16)
 			counter_u64_add(ktls_ifnet_gcm, -1);
 		else if (alg == CRYPTO_CHACHA20_POLY1305)
 			counter_u64_add(ktls_ifnet_chacha20, -1);
 		if (tls->snd_tag != NULL)
 			m_snd_tag_rele(tls->snd_tag);
 	}
 #ifdef TCP_OFFLOAD
 	else if (tls->mode == TCP_TLS_MODE_TOE) {
 		if (alg == CRYPTO_AES_CBC)
 			counter_u64_add(ktls_toe_cbc, -1);
 		else if (alg == CRYPTO_AES_NIST_GCM_16)
 			counter_u64_add(ktls_toe_gcm, -1);
 		else if (alg == CRYPTO_CHACHA20_POLY1305)
 			counter_u64_add(ktls_toe_chacha20, -1);
 	}
 #endif
 
 	/* Key material is released with zfree(). */
 	if (tls->params.auth_key != NULL) {
 		zfree(tls->params.auth_key, M_KTLS);
 		tls->params.auth_key_len = 0;
 		tls->params.auth_key = NULL;
 	}
 	if (tls->params.cipher_key != NULL) {
 		zfree(tls->params.cipher_key, M_KTLS);
 		tls->params.cipher_key_len = 0;
 		tls->params.cipher_key = NULL;
 	}
 	explicit_bzero(tls->params.iv, sizeof(tls->params.iv));
 }
 
 #if defined(INET) || defined(INET6)
 
 #ifdef TCP_OFFLOAD
 /*
  * Try to set up 'tls' on a TOE connection.
  *
  * Returns ECONNRESET if the connection is gone, EOPNOTSUPP if the
  * connection is not offloaded (TF_TOE clear), or the result of
  * tcp_offload_alloc_tls_session().  On success the session is switched
  * to TCP_TLS_MODE_TOE and the matching TOE counter is incremented.
  */
 static int
 ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	int error;
 
 	inp = so->so_pcb;
 	INP_WLOCK(inp);
 
 	/* The connection must still be alive and attached to a socket. */
 	if ((inp->inp_flags2 & INP_FREED) != 0 ||
 	    (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0 ||
 	    inp->inp_socket == NULL) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 
 	tp = intotcpcb(inp);
 	if ((tp->t_flags & TF_TOE) == 0) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	error = tcp_offload_alloc_tls_session(tp, tls, direction);
 	INP_WUNLOCK(inp);
 	if (error != 0)
 		return (error);
 
 	tls->mode = TCP_TLS_MODE_TOE;
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		counter_u64_add(ktls_toe_cbc, 1);
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		counter_u64_add(ktls_toe_gcm, 1);
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		counter_u64_add(ktls_toe_chacha20, 1);
 		break;
 	}
 	return (0);
 }
 #endif
 
 /*
  * Common code used when first enabling ifnet TLS on a connection or
  * when allocating a new ifnet TLS session due to a routing change.
  * This function allocates a new TLS send tag on whatever interface
  * the connection is currently routed over.
  */
 /*
  * Allocate a TLS send tag on the interface the connection's cached route
  * points at.  'mstp' is handed to m_snd_tag_alloc() to receive the tag.
  *
  * Returns ECONNRESET if the connection is gone, ENXIO if ifnet TLS is
  * administratively disabled (and 'force' is false) or no route is
  * cached, EOPNOTSUPP if the interface lacks the required capabilities,
  * or the result of m_snd_tag_alloc().
  */
 static int
 ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force,
     struct m_snd_tag **mstp)
 {
 	union if_snd_tag_alloc_params params;
 	struct ifnet *ifp;
 	struct nhop_object *nh;
 	struct tcpcb *tp;
 	int error, txtls_cap;
 
 	INP_RLOCK(inp);
 	if ((inp->inp_flags2 & INP_FREED) != 0 ||
 	    (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0 ||
 	    inp->inp_socket == NULL) {
 		error = ECONNRESET;
 		goto unlock;
 	}
 	tp = intotcpcb(inp);
 
 	/*
 	 * Administrative control: unless the request is forced, ifnet
 	 * TLS is denied while ktls_ifnet_permitted is 0.
 	 */
 	if (!force && ktls_ifnet_permitted == 0) {
 		error = ENXIO;
 		goto unlock;
 	}
 
 	/*
 	 * XXX: The interface is taken from the route cached in the
 	 * inpcb.  This should perhaps instead use
 	 * rtalloc1_fib(dst, 0, 0, fibnum).  Since KTLS is only
 	 * enabled after a connection has completed key negotiation in
 	 * userland, the cached route will be present in practice.
 	 */
 	nh = inp->inp_route.ro_nh;
 	if (nh == NULL) {
 		error = ENXIO;
 		goto unlock;
 	}
 	ifp = nh->nh_ifp;
 	if_ref(ifp);
 
 	/*
 	 * A connection that already has a pacing rate gets a combined
 	 * TLS + ratelimit tag when the interface supports it.
 	 */
 	if (tp->t_pacing_rate != -1 &&
 	    (ifp->if_capenable & IFCAP_TXTLS_RTLMT) != 0) {
 		params.hdr.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT;
 		params.tls_rate_limit.inp = inp;
 		params.tls_rate_limit.tls = tls;
 		params.tls_rate_limit.max_rate = tp->t_pacing_rate;
 	} else {
 		params.hdr.type = IF_SND_TAG_TYPE_TLS;
 		params.tls.inp = inp;
 		params.tls.tls = tls;
 	}
 	params.hdr.flowid = inp->inp_flowid;
 	params.hdr.flowtype = inp->inp_flowtype;
 	params.hdr.numa_domain = inp->inp_numa_domain;
 	INP_RUNLOCK(inp);
 
 	/* The interface must handle M_EXTPG mbufs and TX TLS for the AF. */
 	if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) {
 		error = EOPNOTSUPP;
 	} else {
 		if ((inp->inp_vflag & INP_IPV6) != 0)
 			txtls_cap = IFCAP_TXTLS6;
 		else
 			txtls_cap = IFCAP_TXTLS4;
 		if ((ifp->if_capenable & txtls_cap) == 0)
 			error = EOPNOTSUPP;
 		else
 			error = m_snd_tag_alloc(ifp, &params, mstp);
 	}
 	if_rele(ifp);
 	return (error);
 
 unlock:
 	INP_RUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Try to set up 'tls' as an ifnet (NIC) TLS session.  On success the
  * session is switched to TCP_TLS_MODE_IFNET, takes over the send tag and
  * the matching ifnet counter is incremented.  Returns the result of
  * ktls_alloc_snd_tag().
  */
 static int
 ktls_try_ifnet(struct socket *so, struct ktls_session *tls, bool force)
 {
 	struct m_snd_tag *tag;
 	int rv;
 
 	rv = ktls_alloc_snd_tag(so->so_pcb, tls, force, &tag);
 	if (rv != 0)
 		return (rv);
 
 	tls->mode = TCP_TLS_MODE_IFNET;
 	tls->snd_tag = tag;
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		counter_u64_add(ktls_ifnet_cbc, 1);
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		counter_u64_add(ktls_ifnet_gcm, 1);
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		counter_u64_add(ktls_ifnet_chacha20, 1);
 		break;
 	}
 	return (0);
 }
 
 /*
  * Try to set up 'tls' as a software session via ktls_ocf_try().  On
  * success the session is switched to TCP_TLS_MODE_SW and the matching
  * software counter is incremented.  Returns the result of ktls_ocf_try().
  */
 static int
 ktls_try_sw(struct socket *so, struct ktls_session *tls, int direction)
 {
 	int rv;
 
 	rv = ktls_ocf_try(so, tls, direction);
 	if (rv == 0) {
 		tls->mode = TCP_TLS_MODE_SW;
 		switch (tls->params.cipher_algorithm) {
 		case CRYPTO_AES_CBC:
 			counter_u64_add(ktls_sw_cbc, 1);
 			break;
 		case CRYPTO_AES_NIST_GCM_16:
 			counter_u64_add(ktls_sw_gcm, 1);
 			break;
 		case CRYPTO_CHACHA20_POLY1305:
 			counter_u64_add(ktls_sw_chacha20, 1);
 			break;
 		}
 	}
 	return (rv);
 }
 
 /*
  * KTLS RX stores data in the socket buffer as a list of TLS records,
  * where each record is stored as a control message containing the TLS
  * header followed by data mbufs containing the decrypted data.  This
  * is different from KTLS TX which always uses an mb_ext_pgs mbuf for
  * both encrypted and decrypted data.  TLS records decrypted by a NIC
  * should be queued to the socket buffer as records, but encrypted
  * data which needs to be decrypted by software arrives as a stream of
  * regular mbufs which need to be converted.  In addition, there may
  * already be pending encrypted data in the socket buffer when KTLS RX
  * is enabled.
  *
  * To manage not-yet-decrypted data for KTLS RX, the following scheme
  * is used:
  *
  * - A single chain of NOTREADY mbufs is hung off of sb_mtls.
  *
  * - ktls_check_rx checks this chain of mbufs reading the TLS header
  *   from the first mbuf.  Once all of the data for that TLS record is
  *   queued, the socket is queued to a worker thread.
  *
  * - The worker thread calls ktls_decrypt to decrypt TLS records in
  *   the TLS chain.  Each TLS record is detached from the TLS chain,
  *   decrypted, and inserted into the regular socket buffer chain as a
  *   record starting with a control message holding the TLS header and
  *   a chain of mbufs holding the decrypted data.
  */
 
 /*
  * Move everything currently in the socket buffer's data chain onto its
  * TLS chain: each mbuf is flagged M_NOTREADY and its length is moved
  * from sb_acc to sb_tlscc.  The regular chain is left empty.
  */
 static void
 sb_mark_notready(struct sockbuf *sb)
 {
 	struct mbuf *cur;
 
 	/* Detach the data chain and hang it off sb_mtls. */
 	cur = sb->sb_mb;
 	sb->sb_mtls = cur;
 	sb->sb_mb = NULL;
 	sb->sb_mbtail = NULL;
 	sb->sb_lastrecord = NULL;
 
 	while (cur != NULL) {
 		KASSERT(cur->m_nextpkt == NULL, ("%s: m_nextpkt != NULL",
 		    __func__));
 		KASSERT((cur->m_flags & M_NOTAVAIL) == 0,
 		    ("%s: mbuf not avail", __func__));
 		KASSERT(sb->sb_acc >= cur->m_len, ("%s: sb_acc < m->m_len",
 		    __func__));
 		cur->m_flags |= M_NOTREADY;
 		sb->sb_acc -= cur->m_len;
 		sb->sb_tlscc += cur->m_len;
 		/* Ends up pointing at the last mbuf of the chain. */
 		sb->sb_mtlstail = cur;
 		cur = cur->m_next;
 	}
 	KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc,
 	    ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc,
 	    sb->sb_ccc));
 }
 
 /*
  * Enable KTLS on the receive side of 'so' using the parameters in 'en'.
  *
  * A new session is created and bound to TOE (only when TCP_OFFLOAD is
  * compiled in) or, failing that, to the software backend.  On success
  * the session is installed in so_rcv and 0 is returned.
  *
  * Returns ENOTSUP if KTLS is disabled, if AES-CBC is requested while
  * ktls_cbc_enable is off, or if TLS 1.3 is requested; EINVAL for
  * listening or non-TCP sockets; EALREADY if so_rcv already has a
  * session; otherwise the error from session creation or backend setup.
  */
 int
 ktls_enable_rx(struct socket *so, struct tls_enable *en)
 {
 	struct ktls_session *tls;
 	int error;
 
 	if (!ktls_offload_enable)
 		return (ENOTSUP);
 	if (SOLISTENING(so))
 		return (EINVAL);
 
 	counter_u64_add(ktls_offload_enable_calls, 1);
 
 	/*
 	 * This should always be true since only the TCP socket option
 	 * invokes this function.
 	 */
 	if (so->so_proto->pr_protocol != IPPROTO_TCP)
 		return (EINVAL);
 
 	/*
 	 * XXX: Don't overwrite existing sessions.  We should permit
 	 * this to support rekeying in the future.
 	 */
 	if (so->so_rcv.sb_tls_info != NULL)
 		return (EALREADY);
 
 	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
 		return (ENOTSUP);
 
 	/* TLS 1.3 is not yet supported. */
 	if (en->tls_vmajor == TLS_MAJOR_VER_ONE &&
 	    en->tls_vminor == TLS_MINOR_VER_THREE)
 		return (ENOTSUP);
 
 	error = ktls_create_session(so, en, &tls);
 	if (error)
 		return (error);
 
 	/*
 	 * With TCP_OFFLOAD the 'if (error)' inside the #ifdef guards the
 	 * software fallback; without it the software path is taken
 	 * unconditionally.
 	 */
 #ifdef TCP_OFFLOAD
 	error = ktls_try_toe(so, tls, KTLS_RX);
 	if (error)
 #endif
 		error = ktls_try_sw(so, tls, KTLS_RX);
 
 	if (error) {
 		ktls_cleanup(tls);
 		return (error);
 	}
 
 	/* Mark the socket as using TLS offload. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq);
 	so->so_rcv.sb_tls_info = tls;
 	so->so_rcv.sb_flags |= SB_TLS_RX;
 
 	/* Mark existing data as not ready until it can be decrypted. */
 	if (tls->mode != TCP_TLS_MODE_TOE) {
 		sb_mark_notready(&so->so_rcv);
 		/* Queue the socket for decryption if a full record is present. */
 		ktls_check_rx(&so->so_rcv);
 	}
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	counter_u64_add(ktls_offload_total, 1);
 
 	return (0);
 }
 
 /*
  * Enable KTLS on the send side of 'so' using the parameters in 'en'.
  *
  * A new session is created and bound to the first backend that
  * accepts it, in the order TOE (only when TCP_OFFLOAD is compiled in),
  * ifnet TLS, software TLS.  On success the session is installed in
  * so_snd and 0 is returned.
  *
  * Returns ENOTSUP if KTLS is disabled or if AES-CBC is requested while
  * ktls_cbc_enable is off; EINVAL for listening or non-TCP sockets;
  * EALREADY if so_snd already has a session; ENXIO if mb_use_ext_pgs is
  * zero; otherwise the error from session creation, backend setup, or
  * SOCK_IO_SEND_LOCK().
  */
 int
 ktls_enable_tx(struct socket *so, struct tls_enable *en)
 {
 	struct ktls_session *tls;
 	struct inpcb *inp;
 	int error;
 
 	if (!ktls_offload_enable)
 		return (ENOTSUP);
 	if (SOLISTENING(so))
 		return (EINVAL);
 
 	counter_u64_add(ktls_offload_enable_calls, 1);
 
 	/*
 	 * This should always be true since only the TCP socket option
 	 * invokes this function.
 	 */
 	if (so->so_proto->pr_protocol != IPPROTO_TCP)
 		return (EINVAL);
 
 	/*
 	 * XXX: Don't overwrite existing sessions.  We should permit
 	 * this to support rekeying in the future.
 	 */
 	if (so->so_snd.sb_tls_info != NULL)
 		return (EALREADY);
 
 	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
 		return (ENOTSUP);
 
 	/* TLS requires ext pgs */
 	if (mb_use_ext_pgs == 0)
 		return (ENXIO);
 
 	error = ktls_create_session(so, en, &tls);
 	if (error)
 		return (error);
 
 	/*
 	 * Prefer TOE -> ifnet TLS -> software TLS.  With TCP_OFFLOAD the
 	 * 'if (error)' inside the #ifdef guards the ifnet attempt; without
 	 * it the ifnet attempt is made unconditionally.
 	 */
 #ifdef TCP_OFFLOAD
 	error = ktls_try_toe(so, tls, KTLS_TX);
 	if (error)
 #endif
 		error = ktls_try_ifnet(so, tls, false);
 	if (error)
 		error = ktls_try_sw(so, tls, KTLS_TX);
 
 	if (error) {
 		ktls_cleanup(tls);
 		return (error);
 	}
 
 	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
 	if (error) {
 		ktls_cleanup(tls);
 		return (error);
 	}
 
 	/*
 	 * Write lock the INP when setting sb_tls_info so that
 	 * routines in tcp_ratelimit.c can read sb_tls_info while
 	 * holding the INP lock.
 	 */
 	inp = so->so_pcb;
 	INP_WLOCK(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_tls_seqno = be64dec(en->rec_seq);
 	so->so_snd.sb_tls_info = tls;
 	/* Any mode other than software is flagged SB_TLS_IFNET. */
 	if (tls->mode != TCP_TLS_MODE_SW)
 		so->so_snd.sb_flags |= SB_TLS_IFNET;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	INP_WUNLOCK(inp);
 	SOCK_IO_SEND_UNLOCK(so);
 
 	counter_u64_add(ktls_offload_total, 1);
 
 	return (0);
 }
 
 int
 ktls_get_rx_mode(struct socket *so)
 {
 	struct ktls_session *tls;
 	struct inpcb *inp;
 	int mode;
 
 	if (SOLISTENING(so))
 		return (EINVAL);
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 	SOCKBUF_LOCK(&so->so_rcv);
 	tls = so->so_rcv.sb_tls_info;
 	if (tls == NULL)
 		mode = TCP_TLS_MODE_NONE;
 	else
 		mode = tls->mode;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	return (mode);
 }
 
 int
 ktls_get_tx_mode(struct socket *so)
 {
 	struct ktls_session *tls;
 	struct inpcb *inp;
 	int mode;
 
 	if (SOLISTENING(so))
 		return (EINVAL);
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	tls = so->so_snd.sb_tls_info;
 	if (tls == NULL)
 		mode = TCP_TLS_MODE_NONE;
 	else
 		mode = tls->mode;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	return (mode);
 }
 
 /*
  * Switch between SW and ifnet TLS sessions as requested.
  *
  * The caller must hold the INP write lock.  The lock is dropped while
  * the replacement session is prepared and is re-acquired before
  * returning on every path that dropped it.
  *
  * Returns 0 if the switch succeeded, if the socket has no TX session,
  * or if the session is already in the requested mode.  Returns EINVAL
  * for listening sockets or a mode other than TCP_TLS_MODE_SW /
  * TCP_TLS_MODE_IFNET, EBUSY if the socket's session changed while the
  * locks were dropped, or the error from backend setup or
  * SOCK_IO_SEND_LOCK().
  */
 int
 ktls_set_tx_mode(struct socket *so, int mode)
 {
 	struct ktls_session *tls, *tls_new;
 	struct inpcb *inp;
 	int error;
 
 	if (SOLISTENING(so))
 		return (EINVAL);
 	switch (mode) {
 	case TCP_TLS_MODE_SW:
 	case TCP_TLS_MODE_IFNET:
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	tls = so->so_snd.sb_tls_info;
 	if (tls == NULL) {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		return (0);
 	}
 
 	if (tls->mode == mode) {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		return (0);
 	}
 
 	/* Keep the old session alive while the locks are dropped. */
 	tls = ktls_hold(tls);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	INP_WUNLOCK(inp);
 
 	tls_new = ktls_clone_session(tls);
 
 	if (mode == TCP_TLS_MODE_IFNET)
 		error = ktls_try_ifnet(so, tls_new, true);
 	else
 		error = ktls_try_sw(so, tls_new, KTLS_TX);
 	if (error) {
 		counter_u64_add(ktls_switch_failed, 1);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (error);
 	}
 
 	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
 	if (error) {
 		counter_u64_add(ktls_switch_failed, 1);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (error);
 	}
 
 	/*
 	 * If we raced with another session change, keep the existing
 	 * session.
 	 */
 	if (tls != so->so_snd.sb_tls_info) {
 		counter_u64_add(ktls_switch_failed, 1);
 		SOCK_IO_SEND_UNLOCK(so);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (EBUSY);
 	}
 
 	/*
 	 * Install the replacement session.  NOTE(review): SB_TLS_IFNET is
 	 * only ever set here; it is not cleared when switching to SW mode
 	 * -- confirm that is intended.
 	 */
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_tls_info = tls_new;
 	if (tls_new->mode != TCP_TLS_MODE_SW)
 		so->so_snd.sb_flags |= SB_TLS_IFNET;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	SOCK_IO_SEND_UNLOCK(so);
 
 	/*
 	 * Drop two references on 'tls'.  The first is for the
 	 * ktls_hold() above.  The second drops the reference from the
 	 * socket buffer.
 	 */
 	KASSERT(tls->refcount >= 2, ("too few references on old session"));
 	ktls_free(tls);
 	ktls_free(tls);
 
 	if (mode == TCP_TLS_MODE_IFNET)
 		counter_u64_add(ktls_switch_to_ifnet, 1);
 	else
 		counter_u64_add(ktls_switch_to_sw, 1);
 
 	/* Return with the INP write lock held, as on entry. */
 	INP_WLOCK(inp);
 	return (0);
 }
 
 /*
  * Try to allocate a new TLS send tag.  This task is scheduled when
  * ip_output detects a route change while trying to transmit a packet
  * holding a TLS record.  If a new tag is allocated, replace the tag
  * in the TLS session.  Subsequent packets on the connection will use
  * the new tag.  If a new tag cannot be allocated, drop the
  * connection.
  *
  * 'context' is the ktls_session whose tag is being reset; 'pending' is
  * asserted to be 1.  The session reference and inpcb reference taken
  * by ktls_output_eagain() when it queued this task are released here.
  */
 static void
 ktls_reset_send_tag(void *context, int pending)
 {
 	struct epoch_tracker et;
 	struct ktls_session *tls;
 	struct m_snd_tag *old, *new;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	int error;
 
 	MPASS(pending == 1);
 
 	tls = context;
 	inp = tls->inp;
 
 	/*
 	 * Free the old tag first before allocating a new one.
 	 * ip[6]_output_send() will treat a NULL send tag the same as
 	 * an ifp mismatch and drop packets until a new tag is
 	 * allocated.
 	 *
 	 * Write-lock the INP when changing tls->snd_tag since
 	 * ip[6]_output_send() holds a read-lock when reading the
 	 * pointer.
 	 */
 	INP_WLOCK(inp);
 	old = tls->snd_tag;
 	tls->snd_tag = NULL;
 	INP_WUNLOCK(inp);
 	if (old != NULL)
 		m_snd_tag_rele(old);
 
 	error = ktls_alloc_snd_tag(inp, tls, true, &new);
 
 	if (error == 0) {
 		INP_WLOCK(inp);
 		tls->snd_tag = new;
 		/* reset_pending is protected by the pool mutex keyed on 'tls'. */
 		mtx_pool_lock(mtxpool_sleep, tls);
 		tls->reset_pending = false;
 		mtx_pool_unlock(mtxpool_sleep, tls);
 		/*
 		 * Presumably in_pcbrele_wlocked() returns true when it
 		 * released the last reference and the lock no longer needs
 		 * to be dropped -- confirm against in_pcb.c.
 		 */
 		if (!in_pcbrele_wlocked(inp))
 			INP_WUNLOCK(inp);
 
 		counter_u64_add(ktls_ifnet_reset, 1);
 
 		/*
 		 * XXX: Should we kick tcp_output explicitly now that
 		 * the send tag is fixed or just rely on timers?
 		 */
 	} else {
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 		if (!in_pcbrele_wlocked(inp)) {
 			if (!(inp->inp_flags & INP_TIMEWAIT) &&
 			    !(inp->inp_flags & INP_DROPPED)) {
 				tp = intotcpcb(inp);
 				CURVNET_SET(tp->t_vnet);
 				tp = tcp_drop(tp, ECONNABORTED);
 				CURVNET_RESTORE();
 				/*
 				 * The INP is only unlocked here when
 				 * tcp_drop() returns a non-NULL tcpcb.
 				 */
 				if (tp != NULL)
 					INP_WUNLOCK(inp);
 				counter_u64_add(ktls_ifnet_reset_dropped, 1);
 			} else
 				INP_WUNLOCK(inp);
 		}
 		NET_EPOCH_EXIT(et);
 
 		counter_u64_add(ktls_ifnet_reset_failed, 1);
 
 		/*
 		 * Leave reset_pending true to avoid future tasks while
 		 * the socket goes away.
 		 */
 	}
 
 	/* Drop the session reference held on behalf of this task. */
 	ktls_free(tls);
 }
 
 int
 ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls)
 {
 
 	if (inp == NULL)
 		return (ENOBUFS);
 
 	INP_LOCK_ASSERT(inp);
 
 	/*
 	 * See if we should schedule a task to update the send tag for
 	 * this session.
 	 */
 	mtx_pool_lock(mtxpool_sleep, tls);
 	if (!tls->reset_pending) {
 		(void) ktls_hold(tls);
 		in_pcbref(inp);
 		tls->inp = inp;
 		tls->reset_pending = true;
 		taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task);
 	}
 	mtx_pool_unlock(mtxpool_sleep, tls);
 	return (ENOBUFS);
 }
 
 #ifdef RATELIMIT
 /*
  * Apply a new maximum pacing rate to the send tag of an ifnet TLS
  * session.
  *
  * The session must be in TCP_TLS_MODE_IFNET.  If the session currently
  * has no send tag, nothing is done and 0 is returned; otherwise the
  * result of the send tag's modify method is returned.
  */
 int
 ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate)
 {
 	union if_snd_tag_modify_params params = {
 		.rate_limit.max_rate = max_pacing_rate,
 		.rate_limit.flags = M_NOWAIT,
 	};
 	struct m_snd_tag *mst;
-	struct ifnet *ifp;
 
 	/* Can't get to the inp, but it should be locked. */
 	/* INP_LOCK_ASSERT(inp); */
 
 	MPASS(tls->mode == TCP_TLS_MODE_IFNET);
 
 	if (tls->snd_tag == NULL) {
 		/*
 		 * Resetting send tag, ignore this change.  The
 		 * pending reset may or may not see this updated rate
 		 * in the tcpcb.  If it doesn't, we will just lose
 		 * this rate change.
 		 */
 		return (0);
 	}
 
 	MPASS(tls->snd_tag != NULL);
-	MPASS(tls->snd_tag->type == IF_SND_TAG_TYPE_TLS_RATE_LIMIT);
+	MPASS(tls->snd_tag->sw->type == IF_SND_TAG_TYPE_TLS_RATE_LIMIT);
 
 	mst = tls->snd_tag;
-	ifp = mst->ifp;
-	return (ifp->if_snd_tag_modify(mst, &params));
+	return (mst->sw->snd_tag_modify(mst, &params));
 }
 #endif
 #endif
 
 /*
  * Tear down a TLS session: run ktls_cleanup() on it and then return
  * the session structure to ktls_session_zone.
  */
 void
 ktls_destroy(struct ktls_session *tls)
 {
 
 	ktls_cleanup(tls);
 	uma_zfree(ktls_session_zone, tls);
 }
 
 /*
  * Assign TLS sequence numbers to a chain of mbufs.
  *
  * Each mbuf reachable from 'm' via m_next is given the current value
  * of sb->sb_tls_seqno, which is then advanced by one.  Every mbuf in
  * the chain must be an M_EXTPG mbuf.
  */
 void
 ktls_seq(struct sockbuf *sb, struct mbuf *m)
 {
 	struct mbuf *rec;
 
 	rec = m;
 	while (rec != NULL) {
 		KASSERT((rec->m_flags & M_EXTPG) != 0,
 		    ("ktls_seq: mapped mbuf %p", rec));
 
 		rec->m_epg_seqno = sb->sb_tls_seqno++;
 		rec = rec->m_next;
 	}
 }
 
 /*
  * Add TLS framing (headers and trailers) to a chain of mbufs.  Each
  * mbuf in the chain must be an unmapped mbuf.  The payload of the
  * mbuf must be populated with the payload of each TLS record.
  *
  * The record_type argument specifies the TLS record type used when
  * populating the TLS header.
  *
  * The enq_cnt argument on return is set to the number of pages of
  * payload data for this entire chain that need to be encrypted via SW
  * encryption.  The returned value should be passed to ktls_enqueue
  * when scheduling encryption of this chain of mbufs.  To handle the
  * special case of empty fragments for TLS 1.0 sessions, an empty
  * fragment counts as one page.  For sessions that are not in SW mode
  * *enq_cnt is left at zero.
  *
  * Each framed mbuf takes its own reference on 'tls'.
  */
 void
 ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt,
     uint8_t record_type)
 {
 	struct tls_record_layer *tlshdr;
 	struct mbuf *m;
 	uint64_t *noncep;
 	uint16_t tls_len;
 	int maxlen;
 
 	maxlen = tls->params.max_frame_len;
 	*enq_cnt = 0;
 	for (m = top; m != NULL; m = m->m_next) {
 		/*
 		 * All mbufs in the chain should be TLS records whose
 		 * payload does not exceed the maximum frame length.
 		 *
 		 * Empty TLS records are permitted when using CBC.
 		 */
 		KASSERT(m->m_len <= maxlen &&
 		    (tls->params.cipher_algorithm == CRYPTO_AES_CBC ?
 		    m->m_len >= 0 : m->m_len > 0),
 		    ("ktls_frame: m %p len %d\n", m, m->m_len));
 
 		/*
 		 * TLS frames require unmapped mbufs to store session
 		 * info.
 		 */
 		KASSERT((m->m_flags & M_EXTPG) != 0,
 		    ("ktls_frame: mapped mbuf %p (top = %p)\n", m, top));
 
 		/* Payload length, captured before framing is added. */
 		tls_len = m->m_len;
 
 		/* Save a reference to the session. */
 		m->m_epg_tls = ktls_hold(tls);
 
 		m->m_epg_hdrlen = tls->params.tls_hlen;
 		m->m_epg_trllen = tls->params.tls_tlen;
 		if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) {
 			int bs, delta;
 
 			/*
 			 * AES-CBC pads messages to a multiple of the
 			 * block size.  Note that the padding is
 			 * applied after the digest and the encryption
 			 * is done on the "plaintext || mac || padding".
 			 * At least one byte of padding is always
 			 * present.
 			 *
 			 * Compute the final trailer length assuming
 			 * at most one block of padding.
 			 * tls->params.tls_tlen is the maximum
 			 * possible trailer length (padding + digest).
 			 * delta holds the number of excess padding
 			 * bytes if the maximum were used.  Those
 			 * extra bytes are removed.
 			 *
 			 * NOTE(review): the '& (bs - 1)' mask assumes
 			 * tls_bs is a power of two -- confirm.
 			 */
 			bs = tls->params.tls_bs;
 			delta = (tls_len + tls->params.tls_tlen) & (bs - 1);
 			m->m_epg_trllen -= delta;
 		}
 		/* m_len now covers header + payload + trailer. */
 		m->m_len += m->m_epg_hdrlen + m->m_epg_trllen;
 
 		/* Populate the TLS header. */
 		tlshdr = (void *)m->m_epg_hdr;
 		tlshdr->tls_vmajor = tls->params.tls_vmajor;
 
 		/*
 		 * TLS 1.3 masquerades as TLS 1.2 with a record type
 		 * of TLS_RLTYPE_APP.
 		 */
 		if (tls->params.tls_vminor == TLS_MINOR_VER_THREE &&
 		    tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) {
 			tlshdr->tls_vminor = TLS_MINOR_VER_TWO;
 			tlshdr->tls_type = TLS_RLTYPE_APP;
 			/* save the real record type for later */
 			m->m_epg_record_type = record_type;
 			m->m_epg_trail[0] = record_type;
 		} else {
 			tlshdr->tls_vminor = tls->params.tls_vminor;
 			tlshdr->tls_type = record_type;
 		}
 		/* The length field excludes the record header itself. */
 		tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr));
 
 		/*
 		 * Store nonces / explicit IVs after the end of the
 		 * TLS header.
 		 *
 		 * For GCM with TLS 1.2, an 8 byte nonce is copied
 		 * from the end of the IV.  The nonce is then
 		 * incremented for use by the next record.
 		 *
 		 * For CBC, a random nonce is inserted for TLS 1.1+.
 		 */
 		if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
 		    tls->params.tls_vminor == TLS_MINOR_VER_TWO) {
 			noncep = (uint64_t *)(tls->params.iv + 8);
 			be64enc(tlshdr + 1, *noncep);
 			(*noncep)++;
 		} else if (tls->params.cipher_algorithm == CRYPTO_AES_CBC &&
 		    tls->params.tls_vminor >= TLS_MINOR_VER_ONE)
 			arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0);
 
 		/*
 		 * When using SW encryption, mark the mbuf not ready.
 		 * It will be marked ready via sbready() after the
 		 * record has been encrypted.
 		 *
 		 * When using ifnet TLS, unencrypted TLS records are
 		 * sent down the stack to the NIC.
 		 */
 		if (tls->mode == TCP_TLS_MODE_SW) {
 			m->m_flags |= M_NOTREADY;
 			if (__predict_false(tls_len == 0)) {
 				/* TLS 1.0 empty fragment. */
 				m->m_epg_nrdy = 1;
 			} else
 				m->m_epg_nrdy = m->m_epg_npgs;
 			*enq_cnt += m->m_epg_nrdy;
 		}
 	}
 }
 
 /*
  * Check whether the TLS chain of a KTLS RX socket buffer holds a
  * complete TLS record and, if so, queue the owning socket to a KTLS
  * work queue for decryption.
  *
  * The socket buffer lock must be held.  Nothing is done if the socket
  * is already marked SB_TLS_RX_RUNNING.  If the receive side has been
  * shut down while only part of a record is queued, so_error is set to
  * EMSGSIZE.
  */
 void
 ktls_check_rx(struct sockbuf *sb)
 {
 	struct tls_record_layer hdr;
 	struct ktls_wq *wq;
 	struct socket *so;
 	bool eof, running;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX",
 	    __func__, sb));
 	so = __containerof(sb, struct socket, so_rcv);
 
 	/* Already queued to, or being processed by, a worker. */
 	if ((sb->sb_flags & SB_TLS_RX_RUNNING) != 0)
 		return;
 
 	eof = (sb->sb_state & SBS_CANTRCVMORE) != 0;
 
 	/* Wait until at least a full TLS header is queued. */
 	if (sb->sb_tlscc < sizeof(hdr)) {
 		if (eof && sb->sb_tlscc != 0)
 			so->so_error = EMSGSIZE;
 		return;
 	}
 
 	m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr);
 
 	/* Wait until the whole record described by the header is queued. */
 	if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) {
 		if (eof)
 			so->so_error = EMSGSIZE;
 		return;
 	}
 
 	sb->sb_flags |= SB_TLS_RX_RUNNING;
 
 	/* The worker drops this reference when it is done. */
 	soref(so);
 	wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index];
 	mtx_lock(&wq->mtx);
 	STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list);
 	running = wq->running;
 	mtx_unlock(&wq->mtx);
 
 	/* Only wake the worker if it was not running when we queued. */
 	if (!running)
 		wakeup(wq);
 	counter_u64_add(ktls_cnt_rx_queued, 1);
 }
 
 /*
  * Detach the first 'len' bytes of the TLS chain (sb_mtls) from 'sb'
  * and return them as a separate mbuf chain.
  *
  * The socket buffer lock must be held and 'len' must not exceed
  * sb_tlscc.  If the record ends in the middle of an mbuf, that mbuf is
  * split; the split may temporarily drop the socket buffer lock to
  * allocate with M_WAITOK.  If sb_mtls changed while the lock was
  * dropped, NULL is returned and the caller is expected to retry.
  *
  * On success the detached mbufs are removed from the TLS accounting
  * via sbfree_ktls_rx(), sb_tlsdcc is set to 'len', and 'len' is added
  * to sb_ccc.
  */
 static struct mbuf *
 ktls_detach_record(struct sockbuf *sb, int len)
 {
 	struct mbuf *m, *n, *top;
 	int remain;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	MPASS(len <= sb->sb_tlscc);
 
 	/*
 	 * If TLS chain is the exact size of the record,
 	 * just grab the whole record.
 	 */
 	top = sb->sb_mtls;
 	if (sb->sb_tlscc == len) {
 		sb->sb_mtls = NULL;
 		sb->sb_mtlstail = NULL;
 		goto out;
 	}
 
 	/*
 	 * While it would be nice to use m_split() here, we need
 	 * to know exactly what m_split() allocates to update the
 	 * accounting, so do it inline instead.
 	 */
 	/* Find the mbuf 'm' containing the last byte of the record. */
 	remain = len;
 	for (m = top; remain > m->m_len; m = m->m_next)
 		remain -= m->m_len;
 
 	/* Easy case: don't have to split 'm'. */
 	if (remain == m->m_len) {
 		sb->sb_mtls = m->m_next;
 		if (sb->sb_mtls == NULL)
 			sb->sb_mtlstail = NULL;
 		m->m_next = NULL;
 		goto out;
 	}
 
 	/*
 	 * Need to allocate an mbuf to hold the remainder of 'm'.  Try
 	 * with M_NOWAIT first.
 	 */
 	n = m_get(M_NOWAIT, MT_DATA);
 	if (n == NULL) {
 		/*
 		 * Use M_WAITOK with socket buffer unlocked.  If
 		 * 'sb_mtls' changes while the lock is dropped, return
 		 * NULL to force the caller to retry.
 		 */
 		SOCKBUF_UNLOCK(sb);
 
 		n = m_get(M_WAITOK, MT_DATA);
 
 		SOCKBUF_LOCK(sb);
 		if (sb->sb_mtls != top) {
 			m_free(n);
 			return (NULL);
 		}
 	}
 	n->m_flags |= M_NOTREADY;
 
 	/* Store remainder in 'n'. */
 	n->m_len = m->m_len - remain;
 	if (m->m_flags & M_EXT) {
 		/* Share the external storage instead of copying it. */
 		n->m_data = m->m_data + remain;
 		mb_dupcl(n, m);
 	} else {
 		bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len);
 	}
 
 	/* Trim 'm' and update accounting. */
 	m->m_len -= n->m_len;
 	sb->sb_tlscc -= n->m_len;
 	sb->sb_ccc -= n->m_len;
 
 	/* Account for 'n'. */
 	sballoc_ktls_rx(sb, n);
 
 	/* Insert 'n' into the TLS chain. */
 	sb->sb_mtls = n;
 	n->m_next = m->m_next;
 	if (sb->sb_mtlstail == m)
 		sb->sb_mtlstail = n;
 
 	/* Detach the record from the TLS chain. */
 	m->m_next = NULL;
 
 out:
 	MPASS(m_length(top, NULL) == len);
 	/* Move the detached bytes from the TLS count to the 'dcc' count. */
 	for (m = top; m != NULL; m = m->m_next)
 		sbfree_ktls_rx(sb, m);
 	sb->sb_tlsdcc = len;
 	sb->sb_ccc += len;
 	SBCHECK(sb);
 	return (top);
 }
 
 /*
  * Decrypt the complete TLS records queued on the TLS chain of so's
  * receive buffer.
  *
  * The socket must have SB_TLS_RX_RUNNING set.  While a full record is
  * available it is detached from the TLS chain, decrypted with the
  * socket buffer unlocked, stripped of its TLS header and trailer, and
  * appended to the socket buffer behind a TLS_GET_RECORD control
  * message.  A record whose header fails validation aborts the
  * connection.  A record that fails to decrypt is dropped, so_error is
  * set to EBADMSG, and processing continues with the next record.
  *
  * Before returning, the socket reference taken when the socket was
  * queued (see ktls_check_rx()) is released.
  */
 static void
 ktls_decrypt(struct socket *so)
 {
 	char tls_header[MBUF_PEXT_HDR_LEN];
 	struct ktls_session *tls;
 	struct sockbuf *sb;
 	struct tls_record_layer *hdr;
 	struct tls_get_record tgr;
 	struct mbuf *control, *data, *m;
 	uint64_t seqno;
 	int error, remain, tls_len, trail_len;
 
 	/* 'hdr' is a typed view of the local header copy. */
 	hdr = (struct tls_record_layer *)tls_header;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING,
 	    ("%s: socket %p not running", __func__, so));
 
 	tls = sb->sb_tls_info;
 	MPASS(tls != NULL);
 
 	for (;;) {
 		/* Is there enough queued for a TLS header? */
 		if (sb->sb_tlscc < tls->params.tls_hlen)
 			break;
 
 		m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header);
 		/* Total on-the-wire record size, including the header. */
 		tls_len = sizeof(*hdr) + ntohs(hdr->tls_length);
 
 		if (hdr->tls_vmajor != tls->params.tls_vmajor ||
 		    hdr->tls_vminor != tls->params.tls_vminor)
 			error = EINVAL;
 		else if (tls_len < tls->params.tls_hlen || tls_len >
 		    tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 +
 		    tls->params.tls_tlen)
 			error = EMSGSIZE;
 		else
 			error = 0;
 		if (__predict_false(error != 0)) {
 			/*
 			 * We have a corrupted record and are likely
 			 * out of sync.  The connection isn't
 			 * recoverable at this point, so abort it.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			counter_u64_add(ktls_offload_corrupted_records, 1);
 
 			CURVNET_SET(so->so_vnet);
 			so->so_proto->pr_usrreqs->pru_abort(so);
 			so->so_error = error;
 			CURVNET_RESTORE();
 			goto deref;
 		}
 
 		/* Is the entire record queued? */
 		if (sb->sb_tlscc < tls_len)
 			break;
 
 		/*
 		 * Split out the portion of the mbuf chain containing
 		 * this TLS record.
 		 */
 		data = ktls_detach_record(sb, tls_len);
 		/* NULL means the chain changed during the split; retry. */
 		if (data == NULL)
 			continue;
 		MPASS(sb->sb_tlsdcc == tls_len);
 
 		seqno = sb->sb_tls_seqno;
 		sb->sb_tls_seqno++;
 		SBCHECK(sb);
 		SOCKBUF_UNLOCK(sb);
 
 		error = tls->sw_decrypt(tls, hdr, data, seqno, &trail_len);
 		if (error) {
 			counter_u64_add(ktls_offload_failed_crypto, 1);
 
 			SOCKBUF_LOCK(sb);
 			if (sb->sb_tlsdcc == 0) {
 				/*
 				 * sbcut/drop/flush discarded these
 				 * mbufs.
 				 */
 				m_freem(data);
 				break;
 			}
 
 			/*
 			 * Drop this TLS record's data, but keep
 			 * decrypting subsequent records.
 			 */
 			sb->sb_ccc -= tls_len;
 			sb->sb_tlsdcc = 0;
 
 			CURVNET_SET(so->so_vnet);
 			so->so_error = EBADMSG;
 			/*
 			 * The lock is re-acquired below, so
 			 * sorwakeup_locked() presumably returns with the
 			 * socket buffer unlocked.
 			 */
 			sorwakeup_locked(so);
 			CURVNET_RESTORE();
 
 			m_freem(data);
 
 			SOCKBUF_LOCK(sb);
 			continue;
 		}
 
 		/* Allocate the control mbuf. */
 		tgr.tls_type = hdr->tls_type;
 		tgr.tls_vmajor = hdr->tls_vmajor;
 		tgr.tls_vminor = hdr->tls_vminor;
 		/* Payload length: record minus header and trailer. */
 		tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen -
 		    trail_len);
 		control = sbcreatecontrol_how(&tgr, sizeof(tgr),
 		    TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK);
 
 		SOCKBUF_LOCK(sb);
 		if (sb->sb_tlsdcc == 0) {
 			/* sbcut/drop/flush discarded these mbufs. */
 			MPASS(sb->sb_tlscc == 0);
 			m_freem(data);
 			m_freem(control);
 			break;
 		}
 
 		/*
 		 * Clear the 'dcc' accounting in preparation for
 		 * adding the decrypted record.
 		 */
 		sb->sb_ccc -= tls_len;
 		sb->sb_tlsdcc = 0;
 		SBCHECK(sb);
 
 		/* If there is no payload, drop all of the data. */
 		if (tgr.tls_length == htobe16(0)) {
 			m_freem(data);
 			data = NULL;
 		} else {
 			/* Trim header. */
 			remain = tls->params.tls_hlen;
 			while (remain > 0) {
 				if (data->m_len > remain) {
 					data->m_data += remain;
 					data->m_len -= remain;
 					break;
 				}
 				/* Header spans this whole mbuf; free it. */
 				remain -= data->m_len;
 				data = m_free(data);
 			}
 
 			/* Trim trailer and clear M_NOTREADY. */
 			remain = be16toh(tgr.tls_length);
 			m = data;
 			for (m = data; remain > m->m_len; m = m->m_next) {
 				m->m_flags &= ~M_NOTREADY;
 				remain -= m->m_len;
 			}
 			/* 'm' holds the last payload byte; cut the rest. */
 			m->m_len = remain;
 			m_freem(m->m_next);
 			m->m_next = NULL;
 			m->m_flags &= ~M_NOTREADY;
 
 			/* Set EOR on the final mbuf. */
 			m->m_flags |= M_EOR;
 		}
 
 		sbappendcontrol_locked(sb, data, control, 0);
 	}
 
 	/* Allow ktls_check_rx() to queue this socket again. */
 	sb->sb_flags &= ~SB_TLS_RX_RUNNING;
 
 	/* A partial record left behind at EOF can never complete. */
 	if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0)
 		so->so_error = EMSGSIZE;
 
 	sorwakeup_locked(so);
 
 deref:
 	SOCKBUF_UNLOCK_ASSERT(sb);
 
 	CURVNET_SET(so->so_vnet);
 	SOCK_LOCK(so);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Queue an mbuf on its session's KTLS work queue so that it is freed
  * there.  The mbuf is tagged EPG_FLAG_2FREE and the queue is woken up
  * if it was not running at the time of insertion.
  */
 void
 ktls_enqueue_to_free(struct mbuf *m)
 {
 	struct ktls_wq *queue;
 	bool was_running;
 
 	/* Tag the mbuf so the queue's consumer frees it. */
 	m->m_epg_flags |= EPG_FLAG_2FREE;
 
 	queue = &ktls_wq[m->m_epg_tls->wq_index];
 
 	mtx_lock(&queue->mtx);
 	STAILQ_INSERT_TAIL(&queue->m_head, m, m_epg_stailq);
 	was_running = queue->running;
 	mtx_unlock(&queue->mtx);
 
 	if (was_running)
 		return;
 	wakeup(queue);
 }
 
 /*
  * Try to obtain a buffer from ktls_buffer_zone to use as the
  * encryption destination for 'm'.
  *
  * Returns NULL without attempting an allocation when 'm' has two pages
  * or fewer, when the zone does not exist, or when an allocation on
  * this work queue failed less than 'hz' ticks ago.  When the
  * allocation itself fails, the failure time is recorded in 'wq' and
  * the current domain's allocation thread is woken if it is not
  * running.
  */
 static void *
 ktls_buffer_alloc(struct ktls_wq *wq, struct mbuf *m)
 {
 	void *buf;
 	int domain, running;
 
 	if (m->m_epg_npgs <= 2 || ktls_buffer_zone == NULL)
 		return (NULL);
 
 	/*
 	 * Back off after a failed attempt.  ktls_buffer_import() takes a
 	 * per-domain mutex to inspect the free page queues and may keep
 	 * failing when memory is fragmented.
 	 */
 	if ((u_int)(ticks - wq->lastallocfail) < hz)
 		return (NULL);
 
 	buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM);
 	if (buf != NULL)
 		return (buf);
 
 	/* Allocation failed: remember when, then poke the alloc thread. */
 	domain = PCPU_GET(domain);
 	wq->lastallocfail = ticks;
 	if (VM_DOMAIN_EMPTY(domain))
 		return (NULL);
 
 	/*
 	 * The 'running' check is racy, but harmlessly so: the worst
 	 * outcomes are a spurious wakeup when several threads fail before
 	 * the alloc thread wakes, or waiting an extra second after
 	 * observing a stale running == true.
 	 */
 	running = atomic_load_int(&ktls_domains[domain].alloc_td.running);
 	if (!running)
 		wakeup(&ktls_domains[domain].alloc_td);
 	return (NULL);
 }
 
 /*
  * Encrypt a single TLS record held in 'm' using the session's
  * sw_encrypt method.
  *
  * Anonymous mbufs (EPG_FLAG_ANON) are encrypted in place.  For other
  * mbufs a destination is set up in 'state': either one buffer from
  * ktls_buffer_alloc() (recorded in state->cbuf) or one freshly
  * allocated wired page per source page, with the physical addresses
  * stored in state->parray.  If encryption fails those destination
  * pages are released here.
  *
  * Returns the value returned by sw_encrypt.
  */
 static int
 ktls_encrypt_record(struct ktls_wq *wq, struct mbuf *m,
     struct ktls_session *tls, struct ktls_ocf_encrypt_state *state)
 {
 	vm_page_t pg;
 	int error, i, len, off;
 
 	KASSERT((m->m_flags & (M_EXTPG | M_NOTREADY)) == (M_EXTPG | M_NOTREADY),
 	    ("%p not unready & nomap mbuf\n", m));
 	KASSERT(ptoa(m->m_epg_npgs) <= ktls_maxlen,
 	    ("page count %d larger than maximum frame length %d", m->m_epg_npgs,
 	    ktls_maxlen));
 
 	/* Anonymous mbufs are encrypted in place. */
 	if ((m->m_epg_flags & EPG_FLAG_ANON) != 0)
 		return (tls->sw_encrypt(state, tls, m, NULL, 0));
 
 	/*
 	 * For file-backed mbufs (from sendfile), anonymous wired
 	 * pages are allocated and used as the encryption destination.
 	 */
 	if ((state->cbuf = ktls_buffer_alloc(wq, m)) != NULL) {
 		/* One buffer: a single iovec covers the whole payload. */
 		len = ptoa(m->m_epg_npgs - 1) + m->m_epg_last_len -
 		    m->m_epg_1st_off;
 		state->dst_iov[0].iov_base = (char *)state->cbuf +
 		    m->m_epg_1st_off;
 		state->dst_iov[0].iov_len = len;
 		state->parray[0] = DMAP_TO_PHYS((vm_offset_t)state->cbuf);
 		i = 1;
 	} else {
 		/* One page and one iovec per source page. */
 		off = m->m_epg_1st_off;
 		for (i = 0; i < m->m_epg_npgs; i++, off = 0) {
 			/* Retry until a page is obtained. */
 			do {
 				pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 				    VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP |
 				    VM_ALLOC_WIRED | VM_ALLOC_WAITFAIL);
 			} while (pg == NULL);
 
 			len = m_epg_pagelen(m, i, off);
 			state->parray[i] = VM_PAGE_TO_PHYS(pg);
 			state->dst_iov[i].iov_base =
 			    (char *)PHYS_TO_DMAP(state->parray[i]) + off;
 			state->dst_iov[i].iov_len = len;
 		}
 	}
 	/* The final iovec receives the TLS trailer. */
 	KASSERT(i + 1 <= nitems(state->dst_iov), ("dst_iov is too small"));
 	state->dst_iov[i].iov_base = m->m_epg_trail;
 	state->dst_iov[i].iov_len = m->m_epg_trllen;
 
 	error = tls->sw_encrypt(state, tls, m, state->dst_iov, i + 1);
 
 	if (__predict_false(error != 0)) {
 		/* Free the anonymous pages. */
 		if (state->cbuf != NULL)
 			uma_zfree(ktls_buffer_zone, state->cbuf);
 		else {
 			for (i = 0; i < m->m_epg_npgs; i++) {
 				pg = PHYS_TO_VM_PAGE(state->parray[i]);
 				(void)vm_page_unwire_noq(pg);
 				vm_page_free(pg);
 			}
 		}
 	}
 	return (error);
 }
 
 /*
  * Queue a chain of not-yet-encrypted TLS records, starting at 'm', on
  * the session's KTLS work queue for software encryption.
  *
  * 'page_count' is the number of pages to encrypt for the chain and
  * must be non-zero; it and 'so' are recorded in 'm' for the worker.
  * The caller must already have taken a socket reference with soref()
  * on behalf of the queued work.
  */
 void
 ktls_enqueue(struct mbuf *m, struct socket *so, int page_count)
 {
 	struct ktls_wq *queue;
 	bool was_running;
 
 	KASSERT(((m->m_flags & (M_EXTPG | M_NOTREADY)) ==
 	    (M_EXTPG | M_NOTREADY)),
 	    ("ktls_enqueue: %p not unready & nomap mbuf\n", m));
 	KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count"));
 
 	KASSERT(m->m_epg_tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf"));
 
 	/* Stash what the worker needs in the head mbuf. */
 	m->m_epg_enc_cnt = page_count;
 	m->m_epg_so = so;
 
 	queue = &ktls_wq[m->m_epg_tls->wq_index];
 
 	mtx_lock(&queue->mtx);
 	STAILQ_INSERT_TAIL(&queue->m_head, m, m_epg_stailq);
 	was_running = queue->running;
 	mtx_unlock(&queue->mtx);
 
 	/* Only wake the worker if it was not running when we queued. */
 	if (!was_running)
 		wakeup(queue);
 	counter_u64_add(ktls_cnt_tx_queued, 1);
 }
 
 /*
  * Finish encryption of a file-backed mbuf (from sendfile): release
  * the original pages through the mbuf's current ext_free routine and
  * point the mbuf at the anonymous pages that ktls_encrypt_record()
  * recorded in 'state'.  The mbuf is then flagged EPG_FLAG_ANON.
  */
 static void
 ktls_finish_nonanon(struct mbuf *m, struct ktls_ocf_encrypt_state *state)
 {
 	int pgno;
 
 	MPASS((m->m_epg_flags & EPG_FLAG_ANON) == 0);
 
 	/* Let go of the file's pages first. */
 	m->m_ext.ext_free(m);
 
 	if (state->cbuf == NULL) {
 		/* Individually allocated pages: one parray entry each. */
 		pgno = 0;
 		while (pgno < m->m_epg_npgs) {
 			m->m_epg_pa[pgno] = state->parray[pgno];
 			pgno++;
 		}
 
 		/* Use the basic free routine. */
 		m->m_ext.ext_free = mb_free_mext_pgs;
 	} else {
 		/* One contiguous buffer: derive each page from its base. */
 		pgno = 0;
 		while (pgno < m->m_epg_npgs) {
 			m->m_epg_pa[pgno] = state->parray[0] + ptoa(pgno);
 			pgno++;
 		}
 
 		/* Contig pages should go back to the cache. */
 		m->m_ext.ext_free = ktls_free_mext_contig;
 	}
 
 	/* The replacement pages are writable. */
 	m->m_epg_flags |= EPG_FLAG_ANON;
 }
 
 /*
  * Encrypt the chain of TLS records that was queued with 'top' by
  * ktls_enqueue().
  *
  * The socket and page count are read from 'top'.  Each record is
  * encrypted with ktls_encrypt_record() and loses its session
  * reference once done.  On success the records are handed to the
  * protocol's pru_ready method; on failure the connection is aborted,
  * so_error is set to EIO and the not-ready mbufs are freed.  The
  * socket reference taken by the caller of ktls_enqueue() is released
  * before returning.
  */
 static __noinline void
 ktls_encrypt(struct ktls_wq *wq, struct mbuf *top)
 {
 	struct ktls_ocf_encrypt_state state;
 	struct ktls_session *tls;
 	struct socket *so;
 	struct mbuf *m;
 	int error, npages, total_pages;
 
 	so = top->m_epg_so;
 	tls = top->m_epg_tls;
 	KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top));
 	KASSERT(so != NULL, ("so = NULL, top = %p\n", top));
 #ifdef INVARIANTS
 	top->m_epg_so = NULL;
 #endif
 	total_pages = top->m_epg_enc_cnt;
 	npages = 0;
 
 	/*
 	 * Encrypt the TLS records in the chain of mbufs starting with
 	 * 'top'.  'total_pages' gives us a total count of pages and is
 	 * used to know when we have finished encrypting the TLS
 	 * records originally queued with 'top'.
 	 *
 	 * NB: These mbufs are queued in the socket buffer and
 	 * 'm_next' is traversing the mbufs in the socket buffer.  The
 	 * socket buffer lock is not held while traversing this chain.
 	 * Since the mbufs are all marked M_NOTREADY their 'm_next'
 	 * pointers should be stable.  However, the 'm_next' of the
 	 * last mbuf encrypted is not necessarily NULL.  It can point
 	 * to other mbufs appended while 'top' was on the TLS work
 	 * queue.
 	 *
 	 * Each mbuf holds an entire TLS record.
 	 */
 	error = 0;
 	for (m = top; npages != total_pages; m = m->m_next) {
 		KASSERT(m->m_epg_tls == tls,
 		    ("different TLS sessions in a single mbuf chain: %p vs %p",
 		    tls, m->m_epg_tls));
 		KASSERT(npages + m->m_epg_npgs <= total_pages,
 		    ("page count mismatch: top %p, total_pages %d, m %p", top,
 		    total_pages, m));
 
 		error = ktls_encrypt_record(wq, m, tls, &state);
 		if (error) {
 			counter_u64_add(ktls_offload_failed_crypto, 1);
 			break;
 		}
 
 		/* Swap file-backed pages for the encrypted copies. */
 		if ((m->m_epg_flags & EPG_FLAG_ANON) == 0)
 			ktls_finish_nonanon(m, &state);
 
 		npages += m->m_epg_nrdy;
 
 		/*
 		 * Drop a reference to the session now that it is no
 		 * longer needed.  Existing code depends on encrypted
 		 * records having no associated session vs
 		 * yet-to-be-encrypted records having an associated
 		 * session.
 		 */
 		m->m_epg_tls = NULL;
 		ktls_free(tls);
 	}
 
 	CURVNET_SET(so->so_vnet);
 	if (error == 0) {
 		(void)(*so->so_proto->pr_usrreqs->pru_ready)(so, top, npages);
 	} else {
 		so->so_proto->pr_usrreqs->pru_abort(so);
 		so->so_error = EIO;
 		mb_free_notready(top, total_pages);
 	}
 
 	/* Release the socket reference held for the queued work. */
 	SOCK_LOCK(so);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Completion handler for one asynchronously encrypted TLS record.
  *
  * 'state' is the per-record state allocated by ktls_encrypt_async();
  * it is freed here.  'error' is the result of the encryption.  The
  * record's session reference is dropped; on success the record is
  * handed to the protocol's pru_ready method, otherwise the connection
  * is aborted, so_error is set to EIO and the record is freed.  The
  * per-record socket reference stored in 'state' is released before
  * returning.
  */
 void
 ktls_encrypt_cb(struct ktls_ocf_encrypt_state *state, int error)
 {
 	struct ktls_session *tls;
 	struct socket *so;
 	struct mbuf *m;
 	int npages;
 
 	m = state->m;
 
 	/* Must run before 'state' is freed: it reads cbuf/parray. */
 	if ((m->m_epg_flags & EPG_FLAG_ANON) == 0)
 		ktls_finish_nonanon(m, state);
 
 	so = state->so;
 	free(state, M_KTLS);
 
 	/*
 	 * Drop a reference to the session now that it is no longer
 	 * needed.  Existing code depends on encrypted records having
 	 * no associated session vs yet-to-be-encrypted records having
 	 * an associated session.
 	 */
 	tls = m->m_epg_tls;
 	m->m_epg_tls = NULL;
 	ktls_free(tls);
 
 	if (error != 0)
 		counter_u64_add(ktls_offload_failed_crypto, 1);
 
 	CURVNET_SET(so->so_vnet);
 	npages = m->m_epg_nrdy;
 
 	if (error == 0) {
 		(void)(*so->so_proto->pr_usrreqs->pru_ready)(so, m, npages);
 	} else {
 		so->so_proto->pr_usrreqs->pru_abort(so);
 		so->so_error = EIO;
 		mb_free_notready(m, npages);
 	}
 
 	SOCK_LOCK(so);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Similar to ktls_encrypt, but used with asynchronous OCF backends
  * (coprocessors) where encryption does not use host CPU resources and
  * it can be beneficial to queue more requests than CPUs.
  *
  * A separate state structure and socket reference are allocated for
  * every record; both are released by ktls_encrypt_cb() when the record
  * completes, or here if submitting the record fails.  On a submission
  * failure the connection is aborted, so_error is set to EIO and the
  * records not yet submitted are freed.  The socket reference taken by
  * the caller of ktls_enqueue() is released before returning.
  */
 static __noinline void
 ktls_encrypt_async(struct ktls_wq *wq, struct mbuf *top)
 {
 	struct ktls_ocf_encrypt_state *state;
 	struct ktls_session *tls;
 	struct socket *so;
 	struct mbuf *m, *n;
 	int error, mpages, npages, total_pages;
 
 	so = top->m_epg_so;
 	tls = top->m_epg_tls;
 	KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top));
 	KASSERT(so != NULL, ("so = NULL, top = %p\n", top));
 #ifdef INVARIANTS
 	top->m_epg_so = NULL;
 #endif
 	total_pages = top->m_epg_enc_cnt;
 	npages = 0;
 
 	error = 0;
 	for (m = top; npages != total_pages; m = n) {
 		KASSERT(m->m_epg_tls == tls,
 		    ("different TLS sessions in a single mbuf chain: %p vs %p",
 		    tls, m->m_epg_tls));
 		KASSERT(npages + m->m_epg_npgs <= total_pages,
 		    ("page count mismatch: top %p, total_pages %d, m %p", top,
 		    total_pages, m));
 
 		state = malloc(sizeof(*state), M_KTLS, M_WAITOK | M_ZERO);
 		/* Per-record reference, released by ktls_encrypt_cb(). */
 		soref(so);
 		state->so = so;
 		state->m = m;
 
 		/*
 		 * Read these before submitting the record; presumably the
 		 * completion callback may run before this loop looks at
 		 * 'm' again -- confirm against the OCF backend.
 		 */
 		mpages = m->m_epg_nrdy;
 		n = m->m_next;
 
 		error = ktls_encrypt_record(wq, m, tls, state);
 		if (error) {
 			counter_u64_add(ktls_offload_failed_crypto, 1);
 			free(state, M_KTLS);
 			/* Undo the per-record soref() above. */
 			CURVNET_SET(so->so_vnet);
 			SOCK_LOCK(so);
 			sorele(so);
 			CURVNET_RESTORE();
 			break;
 		}
 
 		npages += mpages;
 	}
 
 	CURVNET_SET(so->so_vnet);
 	if (error != 0) {
 		so->so_proto->pr_usrreqs->pru_abort(so);
 		so->so_error = EIO;
 		/* Free the failed record and everything after it. */
 		mb_free_notready(m, total_pages - npages);
 	}
 
 	/* Release the socket reference held for the queued work. */
 	SOCK_LOCK(so);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 static void
 ktls_alloc_thread(void *ctx)
 {
 	struct ktls_domain_info *ktls_domain = ctx;
 	struct ktls_alloc_thread *sc = &ktls_domain->alloc_td;
 	void **buf;
 	struct sysctl_oid *oid;
 	char name[80];
 	int i, nbufs;
 
 	curthread->td_domain.dr_policy =
 	    DOMAINSET_PREF(PCPU_GET(domain));
 	snprintf(name, sizeof(name), "domain%d", PCPU_GET(domain));
 	if (bootverbose)
 		printf("Starting KTLS alloc thread for domain %d\n",
 		    PCPU_GET(domain));
 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_kern_ipc_tls), OID_AUTO,
 	    name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs",
 	    CTLFLAG_RD,  &sc->allocs, 0, "buffers allocated");
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wakeups",
 	    CTLFLAG_RD,  &sc->wakeups, 0, "thread wakeups");
 	SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "running",
 	    CTLFLAG_RD,  &sc->running, 0, "thread running");
 
 	buf = NULL;
 	nbufs = 0;
 	for (;;) {
 		atomic_store_int(&sc->running, 0);
 		tsleep(sc, PZERO | PNOLOCK, "-",  0);
 		atomic_store_int(&sc->running, 1);
 		sc->wakeups++;
 		if (nbufs != ktls_max_alloc) {
 			free(buf, M_KTLS);
 			nbufs = atomic_load_int(&ktls_max_alloc);
 			buf = malloc(sizeof(void *) * nbufs, M_KTLS,
 			    M_WAITOK | M_ZERO);
 		}
 		/*
 		 * Below we allocate nbufs with different allocation
 		 * flags than we use when allocating normally during
 		 * encryption in the ktls worker thread.  We specify
 		 * M_NORECLAIM in the worker thread. However, we omit
 		 * that flag here and add M_WAITOK so that the VM
 		 * system is permitted to perform expensive work to
 		 * defragment memory.  We do this here, as it does not
 		 * matter if this thread blocks.  If we block a ktls
 		 * worker thread, we risk developing backlogs of
 		 * buffers to be encrypted, leading to surges of
 		 * traffic and potential NIC output drops.
 		 */
 		for (i = 0; i < nbufs; i++) {
 			buf[i] = uma_zalloc(ktls_buffer_zone, M_WAITOK);
 			sc->allocs++;
 		}
 		for (i = 0; i < nbufs; i++) {
 			uma_zfree(ktls_buffer_zone, buf[i]);
 			buf[i] = NULL;
 		}
 	}
 }
 
 /*
  * Main loop of a KTLS worker thread; 'ctx' is the struct ktls_wq it
  * services.
  *
  * The thread sleeps on the work queue while both the mbuf queue and
  * the socket queue are empty.  When work arrives it moves both queues
  * onto local lists while holding wq->mtx, drops the mutex, and then
  * processes the local lists without the lock held.
  */
 static void
 ktls_work_thread(void *ctx)
 {
 	struct ktls_wq *wq = ctx;
 	struct mbuf *m, *n;
 	struct socket *so, *son;
 	STAILQ_HEAD(, mbuf) local_m_head;
 	STAILQ_HEAD(, socket) local_so_head;
 
 	if (ktls_bind_threads > 1) {
 		curthread->td_domain.dr_policy =
 			DOMAINSET_PREF(PCPU_GET(domain));
 	}
 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
 	fpu_kern_thread(0);
 #endif
 	for (;;) {
 		mtx_lock(&wq->mtx);
 		while (STAILQ_EMPTY(&wq->m_head) &&
 		    STAILQ_EMPTY(&wq->so_head)) {
 			wq->running = false;
 			mtx_sleep(wq, &wq->mtx, 0, "-", 0);
 			wq->running = true;
 		}
 
 		/* Take over everything that is currently queued. */
 		STAILQ_INIT(&local_m_head);
 		STAILQ_CONCAT(&local_m_head, &wq->m_head);
 		STAILQ_INIT(&local_so_head);
 		STAILQ_CONCAT(&local_so_head, &wq->so_head);
 		mtx_unlock(&wq->mtx);
 
 		STAILQ_FOREACH_SAFE(m, &local_m_head, m_epg_stailq, n) {
 			if (m->m_epg_flags & EPG_FLAG_2FREE) {
 				/*
 				 * Queued only to be freed: drop the
 				 * session reference and release the mbuf.
 				 */
 				ktls_free(m->m_epg_tls);
 				m_free_raw(m);
 			} else {
 				/*
 				 * The session's sync_dispatch flag selects
 				 * the synchronous or asynchronous path.
 				 */
 				if (m->m_epg_tls->sync_dispatch)
 					ktls_encrypt(wq, m);
 				else
 					ktls_encrypt_async(wq, m);
 				counter_u64_add(ktls_cnt_tx_queued, -1);
 			}
 		}
 
 		/* Sockets queued for receive-side processing. */
 		STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) {
 			ktls_decrypt(so);
 			counter_u64_add(ktls_cnt_rx_queued, -1);
 		}
 	}
 }
 
 #if defined(INET) || defined(INET6)
 /*
  * Task handler queued by ktls_disable_ifnet(); 'context' is the TLS
  * session.
  *
  * With the inpcb write-locked, and unless the inpcb is in timewait,
  * dropped or freed, switch the socket's transmit side to
  * TCP_TLS_MODE_SW (ENXIO if the send buffer has no TLS session),
  * update the ifnet-disable counters, and on success call the TCP
  * stack's tfb_hwtls_change hook when one is present.  On the way out
  * the socket reference, the inpcb reference and the session reference
  * are all dropped.
  */
 static void
 ktls_disable_ifnet_help(void *context, int pending __unused)
 {
 	struct ktls_session *tls;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	int err;
 
 	tls = context;
 	inp = tls->inp;
 	/*
 	 * NOTE(review): this early return does not call ktls_free(tls).
 	 * ktls_disable_ifnet() sets tls->inp before enqueueing, so this
 	 * is presumably unreachable from that path -- confirm.
 	 */
 	if (inp == NULL)
 		return;
 	INP_WLOCK(inp);
 	so = inp->inp_socket;
 	MPASS(so != NULL);
 	if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
 	    (inp->inp_flags2 & INP_FREED)) {
 		goto out;
 	}
 
 	if (so->so_snd.sb_tls_info != NULL)
 		err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW);
 	else
 		err = ENXIO;
 	if (err == 0) {
 		counter_u64_add(ktls_ifnet_disable_ok, 1);
 		/* ktls_set_tx_mode() drops inp wlock, so recheck flags */
 		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
 		    (inp->inp_flags2 & INP_FREED) == 0 &&
 		    (tp = intotcpcb(inp)) != NULL &&
 		    tp->t_fb->tfb_hwtls_change != NULL)
 			(*tp->t_fb->tfb_hwtls_change)(tp, 0);
 	} else {
 		counter_u64_add(ktls_ifnet_disable_fail, 1);
 	}
 
 out:
 	/* Release the references taken when the task was queued. */
 	SOCK_LOCK(so);
 	sorele(so);
 	/* Unlock only if in_pcbrele_wlocked() reports it did not free inp. */
 	if (!in_pcbrele_wlocked(inp))
 		INP_WUNLOCK(inp);
 	ktls_free(tls);
 }
 
 /*
  * Called when re-transmits are becoming a substantial portion of the
  * sends on this connection.  When this happens, we transition the
  * connection to software TLS.  This is needed because most inline TLS
  * NICs keep crypto state only for in-order transmits.  This means
  * that to handle a TCP rexmit (which is out-of-order), the NIC must
  * re-DMA the entire TLS record up to and including the current
  * segment.  This means that when re-transmitting the last ~1448 byte
  * segment of a 16KB TLS record, we could wind up re-DMA'ing an order
  * of magnitude more data than we are sending.  This can cause the
  * PCIe link to saturate well before the network, which can cause
  * output drops, and a general loss of capacity.
  */
 void
 ktls_disable_ifnet(void *arg)
 {
 	struct tcpcb *tp;
 	struct inpcb *inp;
 	struct socket *so;
 	struct ktls_session *tls;
 
 	/* 'arg' is the tcpcb; its inpcb must already be write-locked. */
 	tp = arg;
 	inp = tp->t_inpcb;
 	INP_WLOCK_ASSERT(inp);
 	so = inp->inp_socket;
 	SOCK_LOCK(so);
 	/*
 	 * NOTE(review): tls is dereferenced without a NULL check; this
 	 * assumes the caller only invokes us on sockets that have a
 	 * transmit TLS session -- confirm.
 	 */
 	tls = so->so_snd.sb_tls_info;
 	if (tls->disable_ifnet_pending) {
 		SOCK_UNLOCK(so);
 		return;
 	}
 
 	/*
 	 * note that disable_ifnet_pending is never cleared; disabling
 	 * ifnet can only be done once per session, so we never want
 	 * to do it again
 	 */
 
 	/*
 	 * Take session, inpcb and socket references for the deferred
 	 * task; ktls_disable_ifnet_help() releases all three.
 	 */
 	(void)ktls_hold(tls);
 	in_pcbref(inp);
 	soref(so);
 	tls->disable_ifnet_pending = true;
 	tls->inp = inp;
 	SOCK_UNLOCK(so);
 	/* Defer the actual mode switch to taskqueue_thread. */
 	TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls);
 	(void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task);
 }
 #endif
diff --git a/sys/net/if_dead.c b/sys/net/if_dead.c
index b01d17fe9b1b..5721e9490776 100644
--- a/sys/net/if_dead.c
+++ b/sys/net/if_dead.c
@@ -1,163 +1,143 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * When an interface has been detached but not yet freed, we set the various
  * ifnet function pointers to "ifdead" versions.  This prevents unexpected
  * calls from the network stack into the device driver after if_detach() has
  * returned.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 
 /* if_output for a dead interface: free the mbuf chain and fail with ENXIO. */
 static int
 ifdead_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa,
     struct route *ro)
 {
 
 	m_freem(m);
 	return (ENXIO);
 }
 
 /* if_input for a dead interface: free the received mbuf chain. */
 static void
 ifdead_input(struct ifnet *ifp, struct mbuf *m)
 {
 
 	m_freem(m);
 }
 
 /* if_start for a dead interface: intentionally does nothing. */
 static void
 ifdead_start(struct ifnet *ifp)
 {
 
 }
 
 /* if_ioctl for a dead interface: every request fails with ENXIO. */
 static int
 ifdead_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 
 	return (ENXIO);
 }
 
 /*
  * if_resolvemulti for a dead interface: clear the caller's link-layer
  * address pointer and fail with ENXIO.
  */
 static int
 ifdead_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
     struct sockaddr *sa)
 {
 
 	*llsa = NULL;
 	return (ENXIO);
 }
 
 /* if_qflush for a dead interface: intentionally does nothing. */
 static void
 ifdead_qflush(struct ifnet *ifp)
 {
 
 }
 
 /* if_transmit for a dead interface: free the mbuf chain and fail with ENXIO. */
 static int
 ifdead_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 
 	m_freem(m);
 	return (ENXIO);
 }
 
 /* if_get_counter for a dead interface: every counter reads as zero. */
 static uint64_t
 ifdead_get_counter(struct ifnet *ifp, ift_counter cnt)
 {
 
 	return (0);
 }
 
 /*
  * if_snd_tag_alloc for a dead interface: send tags cannot be allocated;
  * always fails with EOPNOTSUPP and leaves *ppmt untouched.
  */
 static int
 ifdead_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	return (EOPNOTSUPP);
 }
 
-static int
-ifdead_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
-{
-	return (EOPNOTSUPP);
-}
-
-static int
-ifdead_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
-{
-	return (EOPNOTSUPP);
-}
-
-static void
-ifdead_snd_tag_free(struct m_snd_tag *pmt)
-{
-}
-
 /*
  * if_ratelimit_query for a dead interface: fill in the result so that
  * it reports no rate-limit support, i.e. RT_NOSUPPORT with no rate
  * table, zero rates and zero flows.
  */
 static void
 ifdead_ratelimit_query(struct ifnet *ifp __unused,
       struct if_ratelimit_query_results *q)
 {
 	/* A dead interface never offers hardware rate limiting. */
 	q->flags = RT_NOSUPPORT;
 	q->rate_table = NULL;
 	q->number_of_rates = 0;
 	q->max_flows = 0;
 }
 
 /*
  * Redirect the interface's method pointers to the "ifdead" stubs
  * defined in this file, so that later calls through 'ifp' reach those
  * stubs.
  */
 void
 if_dead(struct ifnet *ifp)
 {
 
 	ifp->if_output = ifdead_output;
 	ifp->if_input = ifdead_input;
 	ifp->if_start = ifdead_start;
 	ifp->if_ioctl = ifdead_ioctl;
 	ifp->if_resolvemulti = ifdead_resolvemulti;
 	ifp->if_qflush = ifdead_qflush;
 	ifp->if_transmit = ifdead_transmit;
 	ifp->if_get_counter = ifdead_get_counter;
 	ifp->if_snd_tag_alloc = ifdead_snd_tag_alloc;
-	ifp->if_snd_tag_modify = ifdead_snd_tag_modify;
-	ifp->if_snd_tag_query = ifdead_snd_tag_query;
-	ifp->if_snd_tag_free = ifdead_snd_tag_free;
 	ifp->if_ratelimit_query = ifdead_ratelimit_query;
 }
diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c
index c53e5b283b76..8f7900277f01 100644
--- a/sys/net/if_lagg.c
+++ b/sys/net/if_lagg.c
@@ -1,2641 +1,2699 @@
 /*	$OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $	*/
 
 /*
  * Copyright (c) 2005, 2006 Reyk Floeter <reyk@openbsd.org>
  * Copyright (c) 2007 Andrew Thompson <thompsa@FreeBSD.org>
  * Copyright (c) 2014, 2016 Marcelo Araujo <araujo@FreeBSD.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
  * copyright notice and this permission notice appear in all copies.
  *
  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/sx.h>
 #include <sys/taskqueue.h>
 #include <sys/eventhandler.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_clone.h>
 #include <net/if_arp.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/bpf.h>
 #include <net/route.h>
 #include <net/vnet.h>
 #include <net/infiniband.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #endif
 #ifdef INET
 #include <netinet/in_systm.h>
 #include <netinet/if_ether.h>
 #endif
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
 #endif
 
 #include <net/if_vlan_var.h>
 #include <net/if_lagg.h>
 #include <net/ieee8023ad_lacp.h>
 
 #ifdef INET6
 /*
  * XXX: declare here to avoid to include many inet6 related files..
  * should be more generalized?
  */
 extern void	nd6_setmtu(struct ifnet *);
 #endif
 
 #define	LAGG_SX_INIT(_sc)	sx_init(&(_sc)->sc_sx, "if_lagg sx")
 #define	LAGG_SX_DESTROY(_sc)	sx_destroy(&(_sc)->sc_sx)
 #define	LAGG_XLOCK(_sc)		sx_xlock(&(_sc)->sc_sx)
 #define	LAGG_XUNLOCK(_sc)	sx_xunlock(&(_sc)->sc_sx)
 #define	LAGG_SXLOCK_ASSERT(_sc)	sx_assert(&(_sc)->sc_sx, SA_LOCKED)
 #define	LAGG_XLOCK_ASSERT(_sc)	sx_assert(&(_sc)->sc_sx, SA_XLOCKED)
 
 /*
  * Special flags we should propagate to the lagg ports.  Each entry
  * pairs an interface flag with the function used to apply it to a
  * port; the table is terminated by a {0, NULL} entry.
  */
 static struct {
 	int flag;
 	int (*func)(struct ifnet *, int);
 } lagg_pflags[] = {
 	{IFF_PROMISC, ifpromisc},
 	{IFF_ALLMULTI, if_allmulti},
 	{0, NULL}
 };
 
 /*
  * Send tag handed out by lagg: the embedded common tag header followed
  * by a pointer to another send tag.
  * NOTE(review): 'tag' is presumably the tag allocated on the selected
  * member port -- confirm against lagg_snd_tag_alloc().
  */
 struct lagg_snd_tag {
 	struct m_snd_tag com;
 	struct m_snd_tag *tag;
 };
 
 VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */
 #define	V_lagg_list	VNET(lagg_list)
 VNET_DEFINE_STATIC(struct mtx, lagg_list_mtx);
 #define	V_lagg_list_mtx	VNET(lagg_list_mtx)
 #define	LAGG_LIST_LOCK_INIT(x)		mtx_init(&V_lagg_list_mtx, \
 					"if_lagg list", NULL, MTX_DEF)
 #define	LAGG_LIST_LOCK_DESTROY(x)	mtx_destroy(&V_lagg_list_mtx)
 #define	LAGG_LIST_LOCK(x)		mtx_lock(&V_lagg_list_mtx)
 #define	LAGG_LIST_UNLOCK(x)		mtx_unlock(&V_lagg_list_mtx)
 eventhandler_tag	lagg_detach_cookie = NULL;
 
 static int	lagg_clone_create(struct if_clone *, int, caddr_t);
 static void	lagg_clone_destroy(struct ifnet *);
 VNET_DEFINE_STATIC(struct if_clone *, lagg_cloner);
 #define	V_lagg_cloner	VNET(lagg_cloner)
 static const char laggname[] = "lagg";
 static MALLOC_DEFINE(M_LAGG, laggname, "802.3AD Link Aggregation Interface");
 
 static void	lagg_capabilities(struct lagg_softc *);
 static int	lagg_port_create(struct lagg_softc *, struct ifnet *);
 static int	lagg_port_destroy(struct lagg_port *, int);
 static struct mbuf *lagg_input_ethernet(struct ifnet *, struct mbuf *);
 static struct mbuf *lagg_input_infiniband(struct ifnet *, struct mbuf *);
 static void	lagg_linkstate(struct lagg_softc *);
 static void	lagg_port_state(struct ifnet *, int);
 static int	lagg_port_ioctl(struct ifnet *, u_long, caddr_t);
 static int	lagg_port_output(struct ifnet *, struct mbuf *,
 		    const struct sockaddr *, struct route *);
 static void	lagg_port_ifdetach(void *arg __unused, struct ifnet *);
 #ifdef LAGG_PORT_STACKING
 static int	lagg_port_checkstacking(struct lagg_softc *);
 #endif
 static void	lagg_port2req(struct lagg_port *, struct lagg_reqport *);
 static void	lagg_init(void *);
 static void	lagg_stop(struct lagg_softc *);
 static int	lagg_ioctl(struct ifnet *, u_long, caddr_t);
 #if defined(KERN_TLS) || defined(RATELIMIT)
 static int	lagg_snd_tag_alloc(struct ifnet *,
 		    union if_snd_tag_alloc_params *,
 		    struct m_snd_tag **);
 static int	lagg_snd_tag_modify(struct m_snd_tag *,
 		    union if_snd_tag_modify_params *);
 static int	lagg_snd_tag_query(struct m_snd_tag *,
 		    union if_snd_tag_query_params *);
 static void	lagg_snd_tag_free(struct m_snd_tag *);
 static struct m_snd_tag *lagg_next_snd_tag(struct m_snd_tag *);
 static void     lagg_ratelimit_query(struct ifnet *,
 		    struct if_ratelimit_query_results *);
 #endif
 static int	lagg_setmulti(struct lagg_port *);
 static int	lagg_clrmulti(struct lagg_port *);
 static	int	lagg_setcaps(struct lagg_port *, int cap);
 static	int	lagg_setflag(struct lagg_port *, int, int,
 		    int (*func)(struct ifnet *, int));
 static	int	lagg_setflags(struct lagg_port *, int status);
 static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt);
 static int	lagg_transmit_ethernet(struct ifnet *, struct mbuf *);
 static int	lagg_transmit_infiniband(struct ifnet *, struct mbuf *);
 static void	lagg_qflush(struct ifnet *);
 static int	lagg_media_change(struct ifnet *);
 static void	lagg_media_status(struct ifnet *, struct ifmediareq *);
 static struct lagg_port *lagg_link_active(struct lagg_softc *,
 	    struct lagg_port *);
 
 /* Simple round robin */
 static void	lagg_rr_attach(struct lagg_softc *);
 static int	lagg_rr_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 
 /* Active failover */
 static int	lagg_fail_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 
 /* Loadbalancing */
 static void	lagg_lb_attach(struct lagg_softc *);
 static void	lagg_lb_detach(struct lagg_softc *);
 static int	lagg_lb_port_create(struct lagg_port *);
 static void	lagg_lb_port_destroy(struct lagg_port *);
 static int	lagg_lb_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 static int	lagg_lb_porttable(struct lagg_softc *, struct lagg_port *);
 
 /* Broadcast */
 static int    lagg_bcast_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_bcast_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 
 /* 802.3ad LACP */
 static void	lagg_lacp_attach(struct lagg_softc *);
 static void	lagg_lacp_detach(struct lagg_softc *);
 static int	lagg_lacp_start(struct lagg_softc *, struct mbuf *);
 static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *,
 		    struct mbuf *);
 static void	lagg_lacp_lladdr(struct lagg_softc *);
 
 /*
  * lagg protocol table.
  *
  * One entry per aggregation protocol, holding its optional callbacks.
  * The lagg_proto_*() wrappers look entries up by indexing this array
  * with the protocol number, so the entries are presumably required to
  * appear in lagg_proto enum order -- confirm against the enum.
  * Callbacks left unset are NULL; the wrappers check for NULL on all of
  * them except pr_start and pr_input, which are called unconditionally.
  */
 static const struct lagg_proto {
 	lagg_proto	pr_num;
 	void		(*pr_attach)(struct lagg_softc *);
 	void		(*pr_detach)(struct lagg_softc *);
 	int		(*pr_start)(struct lagg_softc *, struct mbuf *);
 	struct mbuf *	(*pr_input)(struct lagg_softc *, struct lagg_port *,
 			    struct mbuf *);
 	int		(*pr_addport)(struct lagg_port *);
 	void		(*pr_delport)(struct lagg_port *);
 	void		(*pr_linkstate)(struct lagg_port *);
 	void 		(*pr_init)(struct lagg_softc *);
 	void 		(*pr_stop)(struct lagg_softc *);
 	void 		(*pr_lladdr)(struct lagg_softc *);
 	void		(*pr_request)(struct lagg_softc *, void *);
 	void		(*pr_portreq)(struct lagg_port *, void *);
 } lagg_protos[] = {
     {
 	.pr_num = LAGG_PROTO_NONE
     },
     {
 	.pr_num = LAGG_PROTO_ROUNDROBIN,
 	.pr_attach = lagg_rr_attach,
 	.pr_start = lagg_rr_start,
 	.pr_input = lagg_rr_input,
     },
     {
 	.pr_num = LAGG_PROTO_FAILOVER,
 	.pr_start = lagg_fail_start,
 	.pr_input = lagg_fail_input,
     },
     {
 	.pr_num = LAGG_PROTO_LOADBALANCE,
 	.pr_attach = lagg_lb_attach,
 	.pr_detach = lagg_lb_detach,
 	.pr_start = lagg_lb_start,
 	.pr_input = lagg_lb_input,
 	.pr_addport = lagg_lb_port_create,
 	.pr_delport = lagg_lb_port_destroy,
     },
     {
 	.pr_num = LAGG_PROTO_LACP,
 	.pr_attach = lagg_lacp_attach,
 	.pr_detach = lagg_lacp_detach,
 	.pr_start = lagg_lacp_start,
 	.pr_input = lagg_lacp_input,
 	.pr_addport = lacp_port_create,
 	.pr_delport = lacp_port_destroy,
 	.pr_linkstate = lacp_linkstate,
 	.pr_init = lacp_init,
 	.pr_stop = lacp_stop,
 	.pr_lladdr = lagg_lacp_lladdr,
 	.pr_request = lacp_req,
 	.pr_portreq = lacp_portreq,
     },
     {
 	.pr_num = LAGG_PROTO_BROADCAST,
 	.pr_start = lagg_bcast_start,
 	.pr_input = lagg_bcast_input,
     },
 };
 
 SYSCTL_DECL(_net_link);
 SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Link Aggregation");
 
 /* Allow input on any failover links */
 VNET_DEFINE_STATIC(int, lagg_failover_rx_all);
 #define	V_lagg_failover_rx_all	VNET(lagg_failover_rx_all)
 SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(lagg_failover_rx_all), 0,
     "Accept input from any interface in a failover lagg");
 
 /* Default value for using flowid */
 VNET_DEFINE_STATIC(int, def_use_flowid) = 0;
 #define	V_def_use_flowid	VNET(def_use_flowid)
 SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN,
     &VNET_NAME(def_use_flowid), 0,
     "Default setting for using flow id for load sharing");
 
 /* Default value for using numa */
 VNET_DEFINE_STATIC(int, def_use_numa) = 1;
 #define	V_def_use_numa	VNET(def_use_numa)
 SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_numa, CTLFLAG_RWTUN,
     &VNET_NAME(def_use_numa), 0,
     "Use numa to steer flows");
 
 /* Default value for flowid shift */
 VNET_DEFINE_STATIC(int, def_flowid_shift) = 16;
 #define	V_def_flowid_shift	VNET(def_flowid_shift)
 SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN,
     &VNET_NAME(def_flowid_shift), 0,
     "Default setting for flowid shift for load sharing");
 
 /*
  * Per-vnet setup: initialize the lagg list mutex and list head, then
  * register the "lagg" interface cloner.
  */
 static void
 vnet_lagg_init(const void *unused __unused)
 {
 
 	LAGG_LIST_LOCK_INIT();
 	SLIST_INIT(&V_lagg_list);
 	V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create,
 	    lagg_clone_destroy, 0);
 }
 VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_lagg_init, NULL);
 
 /*
  * Per-vnet teardown: detach the "lagg" interface cloner, then destroy
  * the lagg list mutex.
  */
 static void
 vnet_lagg_uninit(const void *unused __unused)
 {
 
 	if_clone_detach(V_lagg_cloner);
 	LAGG_LIST_LOCK_DESTROY();
 }
 VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_lagg_uninit, NULL);
 
 /*
  * Module event handler.
  *
  * MOD_LOAD installs the lagg input and link-state hooks and registers
  * the ifnet departure handler; MOD_UNLOAD deregisters the handler and
  * clears the hooks.  Any other event type yields EOPNOTSUPP.
  */
 static int
 lagg_modevent(module_t mod, int type, void *data)
 {
 	int error = 0;
 
 	if (type == MOD_LOAD) {
 		lagg_input_ethernet_p = lagg_input_ethernet;
 		lagg_input_infiniband_p = lagg_input_infiniband;
 		lagg_linkstate_p = lagg_port_state;
 		lagg_detach_cookie = EVENTHANDLER_REGISTER(
 		    ifnet_departure_event, lagg_port_ifdetach, NULL,
 		    EVENTHANDLER_PRI_ANY);
 	} else if (type == MOD_UNLOAD) {
 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 		    lagg_detach_cookie);
 		lagg_input_ethernet_p = NULL;
 		lagg_input_infiniband_p = NULL;
 		lagg_linkstate_p = NULL;
 	} else {
 		error = EOPNOTSUPP;
 	}
 	return (error);
 }
 
 /* Module description: name, event handler, and no extra argument. */
 static moduledata_t lagg_mod = {
 	"if_lagg",
 	lagg_modevent,
 	0
 };
 
 DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_lagg, 1);
 MODULE_DEPEND(if_lagg, if_infiniband, 1, 1, 1);
 
 /*
  * Attach aggregation protocol 'pr' to 'sc'.  The softc must be
  * exclusively locked and must currently have no protocol.  The
  * protocol's attach hook, when present, runs before sc_proto is
  * updated.
  */
 static void
 lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr)
 {
 	void (*attach)(struct lagg_softc *);
 
 	LAGG_XLOCK_ASSERT(sc);
 	KASSERT(sc->sc_proto == LAGG_PROTO_NONE, ("%s: sc %p has proto",
 	    __func__, sc));
 
 	if ((sc->sc_ifflags & IFF_DEBUG) != 0)
 		if_printf(sc->sc_ifp, "using proto %u\n", pr);
 
 	attach = lagg_protos[pr].pr_attach;
 	if (attach != NULL)
 		attach(sc);
 	sc->sc_proto = pr;
 }
 
 /*
  * Detach the current aggregation protocol from 'sc'.  sc_proto is
  * reset to LAGG_PROTO_NONE first; the old protocol's detach hook,
  * when present, runs afterwards.
  */
 static void
 lagg_proto_detach(struct lagg_softc *sc)
 {
 	void (*detach)(struct lagg_softc *);
 	lagg_proto old;
 
 	LAGG_XLOCK_ASSERT(sc);
 	old = sc->sc_proto;
 	sc->sc_proto = LAGG_PROTO_NONE;
 
 	detach = lagg_protos[old].pr_detach;
 	if (detach != NULL)
 		detach(sc);
 }
 
 static int
 lagg_proto_start(struct lagg_softc *sc, struct mbuf *m)
 {
 
 	return (lagg_protos[sc->sc_proto].pr_start(sc, m));
 }
 
 static struct mbuf *
 lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 
 	return (lagg_protos[sc->sc_proto].pr_input(sc, lp, m));
 }
 
 /*
  * Run the current protocol's add-port hook on 'lp' and return its
  * result; returns 0 when the protocol has no such hook.
  */
 static int
 lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp)
 {
 	int (*addport)(struct lagg_port *);
 
 	addport = lagg_protos[sc->sc_proto].pr_addport;
 	return (addport != NULL ? addport(lp) : 0);
 }
 
 /* Run the current protocol's delete-port hook on 'lp', if it has one. */
 static void
 lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp)
 {
 	void (*delport)(struct lagg_port *);
 
 	delport = lagg_protos[sc->sc_proto].pr_delport;
 	if (delport == NULL)
 		return;
 	delport(lp);
 }
 
 /* Run the current protocol's link-state hook on 'lp', if it has one. */
 static void
 lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp)
 {
 	void (*linkstate)(struct lagg_port *);
 
 	linkstate = lagg_protos[sc->sc_proto].pr_linkstate;
 	if (linkstate == NULL)
 		return;
 	linkstate(lp);
 }
 
 /* Run the current protocol's init hook on 'sc', if it has one. */
 static void
 lagg_proto_init(struct lagg_softc *sc)
 {
 	void (*init)(struct lagg_softc *);
 
 	init = lagg_protos[sc->sc_proto].pr_init;
 	if (init == NULL)
 		return;
 	init(sc);
 }
 
 /* Run the current protocol's stop hook on 'sc', if it has one. */
 static void
 lagg_proto_stop(struct lagg_softc *sc)
 {
 	void (*stop)(struct lagg_softc *);
 
 	stop = lagg_protos[sc->sc_proto].pr_stop;
 	if (stop == NULL)
 		return;
 	stop(sc);
 }
 
 /* Run the current protocol's link-layer address hook on 'sc', if any. */
 static void
 lagg_proto_lladdr(struct lagg_softc *sc)
 {
 	void (*lladdr)(struct lagg_softc *);
 
 	lladdr = lagg_protos[sc->sc_proto].pr_lladdr;
 	if (lladdr == NULL)
 		return;
 	lladdr(sc);
 }
 
 /*
  * Forward the opaque request 'v' to the current protocol's request
  * hook, if it has one.
  */
 static void
 lagg_proto_request(struct lagg_softc *sc, void *v)
 {
 	void (*request)(struct lagg_softc *, void *);
 
 	request = lagg_protos[sc->sc_proto].pr_request;
 	if (request == NULL)
 		return;
 	request(sc, v);
 }
 
 /*
  * Forward the opaque per-port request 'v' for 'lp' to the current
  * protocol's port-request hook, if it has one.
  */
 static void
 lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v)
 {
 	void (*portreq)(struct lagg_port *, void *);
 
 	portreq = lagg_protos[sc->sc_proto].pr_portreq;
 	if (portreq == NULL)
 		return;
 	portreq(lp, v);
 }
 
 /*
  * vlan_config event handler.  When the event is for this lagg (the
  * interface's softc matches the registered argument), the event is
  * re-issued for every member port with the softc exclusively locked.
  */
 static void
 lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
 {
 	struct lagg_softc *sc;
 	struct lagg_port *port;
 
 	sc = ifp->if_softc;
 	if (sc != arg) {
 		/* Not our event */
 		return;
 	}
 
 	LAGG_XLOCK(sc);
 	CK_SLIST_FOREACH(port, &sc->sc_ports, lp_entries) {
 		EVENTHANDLER_INVOKE(vlan_config, port->lp_ifp, vtag);
 	}
 	LAGG_XUNLOCK(sc);
 }
 
 /*
  * vlan_unconfig event handler.  When the event is for this lagg (the
  * interface's softc matches the registered argument), the event is
  * re-issued for every member port with the softc exclusively locked.
  */
 static void
 lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
 {
 	struct lagg_softc *sc;
 	struct lagg_port *port;
 
 	sc = ifp->if_softc;
 	if (sc != arg) {
 		/* Not our event */
 		return;
 	}
 
 	LAGG_XLOCK(sc);
 	CK_SLIST_FOREACH(port, &sc->sc_ports, lp_entries) {
 		EVENTHANDLER_INVOKE(vlan_unconfig, port->lp_ifp, vtag);
 	}
 	LAGG_XUNLOCK(sc);
 }
 
 /*
  * Cloner create callback: build a new lagg interface with unit number
  * 'unit'.
  *
  * 'params', when non-NULL, is copied in as a struct iflaggparam whose
  * lagg_type selects an Ethernet or InfiniBand lagg; with no params an
  * Ethernet lagg is created.  Returns 0 on success, the copyin() error,
  * EINVAL for an unknown lagg type, or ENOSPC if if_alloc() fails.
  */
 static int
 lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct iflaggparam iflp;
 	struct lagg_softc *sc;
 	struct ifnet *ifp;
 	int if_type;
 	int error;
 	/* Static storage without an initializer: an all-zero address. */
 	static const uint8_t eaddr[LAGG_ADDR_LEN];
 
 	if (params != NULL) {
 		error = copyin(params, &iflp, sizeof(iflp));
 		if (error)
 			return (error);
 
 		switch (iflp.lagg_type) {
 		case LAGG_TYPE_ETHERNET:
 			if_type = IFT_ETHER;
 			break;
 		case LAGG_TYPE_INFINIBAND:
 			if_type = IFT_INFINIBAND;
 			break;
 		default:
 			return (EINVAL);
 		}
 	} else {
 		if_type = IFT_ETHER;
 	}
 
 	sc = malloc(sizeof(*sc), M_LAGG, M_WAITOK|M_ZERO);
 	ifp = sc->sc_ifp = if_alloc(if_type);
 	if (ifp == NULL) {
 		free(sc, M_LAGG);
 		return (ENOSPC);
 	}
 	LAGG_SX_INIT(sc);
 
 	mtx_init(&sc->sc_mtx, "lagg-mtx", NULL, MTX_DEF);
 	callout_init_mtx(&sc->sc_watchdog, &sc->sc_mtx, 0);
 
 	/* Held until the softc is on the global list (see the end). */
 	LAGG_XLOCK(sc);
 	/* Seed per-lagg options from the sysctl defaults. */
 	if (V_def_use_flowid)
 		sc->sc_opts |= LAGG_OPT_USE_FLOWID;
 	if (V_def_use_numa)
 		sc->sc_opts |= LAGG_OPT_USE_NUMA;
 	sc->flowid_shift = V_def_flowid_shift;
 
 	/* Hash all layers by default */
 	sc->sc_flags = MBUF_HASHFLAG_L2|MBUF_HASHFLAG_L3|MBUF_HASHFLAG_L4;
 
 	lagg_proto_attach(sc, LAGG_PROTO_DEFAULT);
 
 	CK_SLIST_INIT(&sc->sc_ports);
 
 	switch (if_type) {
 	case IFT_ETHER:
 		/* Initialise pseudo media types */
 		ifmedia_init(&sc->sc_media, 0, lagg_media_change,
 		    lagg_media_status);
 		ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
 		ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);
 
 		if_initname(ifp, laggname, unit);
 		ifp->if_transmit = lagg_transmit_ethernet;
 		break;
 	case IFT_INFINIBAND:
 		if_initname(ifp, laggname, unit);
 		ifp->if_transmit = lagg_transmit_infiniband;
 		break;
 	default:
 		break;
 	}
 	/* Methods common to both interface types. */
 	ifp->if_softc = sc;
 	ifp->if_qflush = lagg_qflush;
 	ifp->if_init = lagg_init;
 	ifp->if_ioctl = lagg_ioctl;
 	ifp->if_get_counter = lagg_get_counter;
 	ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	ifp->if_snd_tag_alloc = lagg_snd_tag_alloc;
-	ifp->if_snd_tag_modify = lagg_snd_tag_modify;
-	ifp->if_snd_tag_query = lagg_snd_tag_query;
-	ifp->if_snd_tag_free = lagg_snd_tag_free;
-	ifp->if_next_snd_tag = lagg_next_snd_tag;
 	ifp->if_ratelimit_query = lagg_ratelimit_query;
 #endif
 	ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
 
 	/*
 	 * Attach as an ordinary ethernet device, children will be attached
 	 * as special device IFT_IEEE8023ADLAG or IFT_INFINIBANDLAG.
 	 */
 	switch (if_type) {
 	case IFT_ETHER:
 		ether_ifattach(ifp, eaddr);
 		break;
 	case IFT_INFINIBAND:
 		infiniband_ifattach(ifp, eaddr, sc->sc_bcast_addr);
 		break;
 	default:
 		break;
 	}
 
 	/* Handlers that forward vlan (un)config events to member ports. */
 	sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
 		lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
 	sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
 		lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);
 
 	/* Insert into the global list of laggs */
 	LAGG_LIST_LOCK();
 	SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries);
 	LAGG_LIST_UNLOCK();
 	LAGG_XUNLOCK(sc);
 
 	return (0);
 }
 
 /*
  * Cloner destroy callback: tear down the lagg interface 'ifp'.
  *
  * With the softc exclusively locked: mark it as being destroyed, stop
  * it, clear IFF_UP, deregister the vlan event handlers, destroy every
  * member port and detach the aggregation protocol.  After the lock is
  * dropped the interface is detached and freed, the softc is removed
  * from the global lagg list, and its locks and memory are released.
  */
 static void
 lagg_clone_destroy(struct ifnet *ifp)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct lagg_port *lp;
 
 	LAGG_XLOCK(sc);
 	/* lagg_port_create() refuses new ports once this is set. */
 	sc->sc_destroying = 1;
 	lagg_stop(sc);
 	ifp->if_flags &= ~IFF_UP;
 
 	EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach);
 	EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach);
 
 	/* Shutdown and remove lagg ports */
 	while ((lp = CK_SLIST_FIRST(&sc->sc_ports)) != NULL)
 		lagg_port_destroy(lp, 1);
 
 	/* Unhook the aggregation protocol */
 	lagg_proto_detach(sc);
 	LAGG_XUNLOCK(sc);
 
 	/* Detach according to how the interface was attached. */
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 		ifmedia_removeall(&sc->sc_media);
 		ether_ifdetach(ifp);
 		break;
 	case IFT_INFINIBAND:
 		infiniband_ifdetach(ifp);
 		break;
 	default:
 		break;
 	}
 	if_free(ifp);
 
 	LAGG_LIST_LOCK();
 	SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries);
 	LAGG_LIST_UNLOCK();
 
 	mtx_destroy(&sc->sc_mtx);
 	LAGG_SX_DESTROY(sc);
 	free(sc, M_LAGG);
 }
 
 /*
  * Recompute the lagg interface's capabilities from its member ports.
  *
  * The enabled capabilities, supported capabilities and hwassist bits
  * of the lagg become the intersection (bitwise AND) of the values of
  * all ports, and the TSO limits are combined via
  * if_hw_tsomax_common().  The common enabled set is also pushed back
  * to every port.  The lagg interface is only updated, and its
  * if_lastchange refreshed, when something actually changed.
  * The softc must be exclusively locked.
  */
 static void
 lagg_capabilities(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 	int cap, ena, pena;
 	uint64_t hwa;
 	struct ifnet_hw_tsomax hw_tsomax;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	/* Get common enabled capabilities for the lagg ports */
 	ena = ~0;
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		ena &= lp->lp_ifp->if_capenable;
 	/* Still ~0 when nothing was ANDed in (no ports): treat as none. */
 	ena = (ena == ~0 ? 0 : ena);
 
 	/*
 	 * Apply common enabled capabilities back to the lagg ports.
 	 * May require several iterations if they are dependent.
 	 */
 	do {
 		pena = ena;
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			lagg_setcaps(lp, ena);
 			ena &= lp->lp_ifp->if_capenable;
 		}
 	} while (pena != ena);
 
 	/* Get other capabilities from the lagg ports */
 	cap = ~0;
 	hwa = ~(uint64_t)0;
 	memset(&hw_tsomax, 0, sizeof(hw_tsomax));
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		cap &= lp->lp_ifp->if_capabilities;
 		hwa &= lp->lp_ifp->if_hwassist;
 		if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax);
 	}
 	/* As above: an untouched all-ones value means "none". */
 	cap = (cap == ~0 ? 0 : cap);
 	hwa = (hwa == ~(uint64_t)0 ? 0 : hwa);
 
 	/*
 	 * if_hw_tsomax_update() is evaluated last, so it only runs when
 	 * none of the preceding comparisons already detected a change.
 	 */
 	if (sc->sc_ifp->if_capabilities != cap ||
 	    sc->sc_ifp->if_capenable != ena ||
 	    sc->sc_ifp->if_hwassist != hwa ||
 	    if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) {
 		sc->sc_ifp->if_capabilities = cap;
 		sc->sc_ifp->if_capenable = ena;
 		sc->sc_ifp->if_hwassist = hwa;
 		getmicrotime(&sc->sc_ifp->if_lastchange);
 
 		if (sc->sc_ifflags & IFF_DEBUG)
 			if_printf(sc->sc_ifp,
 			    "capabilities 0x%08x enabled 0x%08x\n", cap, ena);
 	}
 }
 
 static int
 lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp)
 {
 	struct lagg_softc *sc_ptr;
 	struct lagg_port *lp, *tlp;
 	struct ifreq ifr;
 	int error, i, oldmtu;
 	int if_type;
 	uint64_t *pval;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	if (sc->sc_ifp == ifp) {
 		if_printf(sc->sc_ifp,
 		    "cannot add a lagg to itself as a port\n");
 		return (EINVAL);
 	}
 
 	if (sc->sc_destroying == 1)
 		return (ENXIO);
 
 	/* Limit the maximal number of lagg ports */
 	if (sc->sc_count >= LAGG_MAX_PORTS)
 		return (ENOSPC);
 
 	/* Check if port has already been associated to a lagg */
 	if (ifp->if_lagg != NULL) {
 		/* Port is already in the current lagg? */
 		lp = (struct lagg_port *)ifp->if_lagg;
 		if (lp->lp_softc == sc)
 			return (EEXIST);
 		return (EBUSY);
 	}
 
 	switch (sc->sc_ifp->if_type) {
 	case IFT_ETHER:
 		/* XXX Disallow non-ethernet interfaces (this should be any of 802) */
 		if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN)
 			return (EPROTONOSUPPORT);
 		if_type = IFT_IEEE8023ADLAG;
 		break;
 	case IFT_INFINIBAND:
 		/* XXX Disallow non-infiniband interfaces */
 		if (ifp->if_type != IFT_INFINIBAND)
 			return (EPROTONOSUPPORT);
 		if_type = IFT_INFINIBANDLAG;
 		break;
 	default:
 		break;
 	}
 
 	/* Allow the first Ethernet member to define the MTU */
 	oldmtu = -1;
 	if (CK_SLIST_EMPTY(&sc->sc_ports)) {
 		sc->sc_ifp->if_mtu = ifp->if_mtu;
 	} else if (sc->sc_ifp->if_mtu != ifp->if_mtu) {
 		if (ifp->if_ioctl == NULL) {
 			if_printf(sc->sc_ifp, "cannot change MTU for %s\n",
 			    ifp->if_xname);
 			return (EINVAL);
 		}
 		oldmtu = ifp->if_mtu;
 		strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name));
 		ifr.ifr_mtu = sc->sc_ifp->if_mtu;
 		error = (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr);
 		if (error != 0) {
 			if_printf(sc->sc_ifp, "invalid MTU for %s\n",
 			    ifp->if_xname);
 			return (error);
 		}
 		ifr.ifr_mtu = oldmtu;
 	}
 
 	lp = malloc(sizeof(struct lagg_port), M_LAGG, M_WAITOK|M_ZERO);
 	lp->lp_softc = sc;
 
 	/* Check if port is a stacked lagg */
 	LAGG_LIST_LOCK();
 	SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) {
 		if (ifp == sc_ptr->sc_ifp) {
 			LAGG_LIST_UNLOCK();
 			free(lp, M_LAGG);
 			if (oldmtu != -1)
 				(*ifp->if_ioctl)(ifp, SIOCSIFMTU,
 				    (caddr_t)&ifr);
 			return (EINVAL);
 			/* XXX disable stacking for the moment, its untested */
 #ifdef LAGG_PORT_STACKING
 			lp->lp_flags |= LAGG_PORT_STACK;
 			if (lagg_port_checkstacking(sc_ptr) >=
 			    LAGG_MAX_STACKING) {
 				LAGG_LIST_UNLOCK();
 				free(lp, M_LAGG);
 				if (oldmtu != -1)
 					(*ifp->if_ioctl)(ifp, SIOCSIFMTU,
 					    (caddr_t)&ifr);
 				return (E2BIG);
 			}
 #endif
 		}
 	}
 	LAGG_LIST_UNLOCK();
 
 	if_ref(ifp);
 	lp->lp_ifp = ifp;
 
 	bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ifp->if_addrlen);
 	lp->lp_ifcapenable = ifp->if_capenable;
 	if (CK_SLIST_EMPTY(&sc->sc_ports)) {
 		bcopy(IF_LLADDR(ifp), IF_LLADDR(sc->sc_ifp), ifp->if_addrlen);
 		lagg_proto_lladdr(sc);
 		EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
 	} else {
 		if_setlladdr(ifp, IF_LLADDR(sc->sc_ifp), ifp->if_addrlen);
 	}
 	lagg_setflags(lp, 1);
 
 	if (CK_SLIST_EMPTY(&sc->sc_ports))
 		sc->sc_primary = lp;
 
 	/* Change the interface type */
 	lp->lp_iftype = ifp->if_type;
 	ifp->if_type = if_type;
 	ifp->if_lagg = lp;
 	lp->lp_ioctl = ifp->if_ioctl;
 	ifp->if_ioctl = lagg_port_ioctl;
 	lp->lp_output = ifp->if_output;
 	ifp->if_output = lagg_port_output;
 
 	/* Read port counters */
 	pval = lp->port_counters.val;
 	for (i = 0; i < IFCOUNTERS; i++, pval++)
 		*pval = ifp->if_get_counter(ifp, i);
 
 	/*
 	 * Insert into the list of ports.
 	 * Keep ports sorted by if_index. It is handy, when configuration
 	 * is predictable and `ifconfig laggN create ...` command
 	 * will lead to the same result each time.
 	 */
 	CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) {
 		if (tlp->lp_ifp->if_index < ifp->if_index && (
 		    CK_SLIST_NEXT(tlp, lp_entries) == NULL ||
 		    ((struct  lagg_port*)CK_SLIST_NEXT(tlp, lp_entries))->lp_ifp->if_index >
 		    ifp->if_index))
 			break;
 	}
 	if (tlp != NULL)
 		CK_SLIST_INSERT_AFTER(tlp, lp, lp_entries);
 	else
 		CK_SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries);
 	sc->sc_count++;
 
 	lagg_setmulti(lp);
 
 	if ((error = lagg_proto_addport(sc, lp)) != 0) {
 		/* Remove the port, without calling pr_delport. */
 		lagg_port_destroy(lp, 0);
 		if (oldmtu != -1)
 			(*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr);
 		return (error);
 	}
 
 	/* Update lagg capabilities */
 	lagg_capabilities(sc);
 	lagg_linkstate(sc);
 
 	return (0);
 }
 
 #ifdef LAGG_PORT_STACKING
 /*
  * Return the nesting depth of 'sc': one more than the deepest lagg found
  * among its ports flagged LAGG_PORT_STACK (1 when there is none).
  */
 static int
 lagg_port_checkstacking(struct lagg_softc *sc)
 {
 	struct lagg_softc *child;
 	struct lagg_port *port;
 	int deepest;
 
 	deepest = 0;
 	LAGG_SXLOCK_ASSERT(sc);
 	CK_SLIST_FOREACH(port, &sc->sc_ports, lp_entries) {
 		if ((port->lp_flags & LAGG_PORT_STACK) == 0)
 			continue;
 		child = (struct lagg_softc *)port->lp_ifp->if_softc;
 		deepest = MAX(deepest, lagg_port_checkstacking(child));
 	}
 
 	return (deepest + 1);
 }
 #endif
 
 /*
  * Deferred part of port removal: drop the reference held on the port's
  * ifnet and release the lagg_port that embeds the epoch context 'ec'.
  */
 static void
 lagg_port_destroy_cb(epoch_context_t ec)
 {
 	struct lagg_port *port;
 
 	port = __containerof(ec, struct lagg_port, lp_epoch_ctx);
 	if_rele(port->lp_ifp);
 	free(port, M_LAGG);
 }
 
 /*
  * Detach the port 'lp' from its lagg and give the interface back.
  *
  * When 'rundelport' is non-zero the aggregation protocol is told about
  * the removal through lagg_proto_delport().  The interface's saved type,
  * ioctl and output methods are restored, its counter deltas are folded
  * into the lagg's detached_counters, and a new primary port is chosen if
  * the removed port was the primary one.  For a port that is not itself
  * detaching (lp_detaching == 0) the multicast memberships, flags,
  * capabilities and link-level address are restored as well.
  *
  * The lagg exclusive lock must be held.  The lagg_port is freed later by
  * lagg_port_destroy_cb() through NET_EPOCH_CALL().  Always returns 0.
  */
 static int
 lagg_port_destroy(struct lagg_port *lp, int rundelport)
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	struct lagg_port *lp_ptr, *lp0;
 	struct ifnet *ifp = lp->lp_ifp;
 	uint64_t *pval, vdiff;
 	int i;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	if (rundelport)
 		lagg_proto_delport(sc, lp);
 
 	if (lp->lp_detaching == 0)
 		lagg_clrmulti(lp);
 
 	/* Restore interface */
 	ifp->if_type = lp->lp_iftype;
 	ifp->if_ioctl = lp->lp_ioctl;
 	ifp->if_output = lp->lp_output;
 	ifp->if_lagg = NULL;
 
 	/* Update detached port counters */
 	pval = lp->port_counters.val;
 	for (i = 0; i < IFCOUNTERS; i++, pval++) {
 		/* Traffic seen since the port joined the lagg. */
 		vdiff = ifp->if_get_counter(ifp, i) - *pval;
 		sc->detached_counters.val[i] += vdiff;
 	}
 
 	/* Finally, remove the port from the lagg */
 	CK_SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries);
 	sc->sc_count--;
 
 	/* Update the primary interface */
 	if (lp == sc->sc_primary) {
 		uint8_t lladdr[LAGG_ADDR_LEN];
 
 		/* New address: the next port's own one, or zero if none left. */
 		if ((lp0 = CK_SLIST_FIRST(&sc->sc_ports)) == NULL)
 			bzero(&lladdr, LAGG_ADDR_LEN);
 		else
 			bcopy(lp0->lp_lladdr, lladdr, LAGG_ADDR_LEN);
 		sc->sc_primary = lp0;
 		if (sc->sc_destroying == 0) {
 			bcopy(lladdr, IF_LLADDR(sc->sc_ifp), sc->sc_ifp->if_addrlen);
 			lagg_proto_lladdr(sc);
 			EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
 
 			/*
 			 * Update lladdr for each port (new primary needs update
 			 * as well, to switch from old lladdr to its 'real' one).
 			 * We can skip this if the lagg is being destroyed.
 			 */
 			CK_SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries)
 				if_setlladdr(lp_ptr->lp_ifp, lladdr,
 				    lp_ptr->lp_ifp->if_addrlen);
 		}
 	}
 
 	if (lp->lp_ifflags)
 		if_printf(ifp, "%s: lp_ifflags unclean\n", __func__);
 
 	/* Hand the port back in the state recorded when it was added. */
 	if (lp->lp_detaching == 0) {
 		lagg_setflags(lp, 0);
 		lagg_setcaps(lp, lp->lp_ifcapenable);
 		if_setlladdr(ifp, lp->lp_lladdr, ifp->if_addrlen);
 	}
 
 	/*
 	 * Free the port and release its ifnet reference after a grace
 	 * period has elapsed.
 	 */
 	NET_EPOCH_CALL(lagg_port_destroy_cb, &lp->lp_epoch_ctx);
 	/* Update lagg capabilities */
 	lagg_capabilities(sc);
 	lagg_linkstate(sc);
 
 	return (0);
 }
 
 /*
  * ioctl handler that lagg_port_create() installs on member ports.
  *
  * Only SIOCGLAGGPORT, SIOCSIFCAP and SIOCSIFMTU are handled here; every
  * other request, and any request on an interface that is not (or no
  * longer) a lagg port, is passed to the ioctl method saved in
  * lp->lp_ioctl.  EINVAL is returned when no saved method is available.
  */
 static int
 lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct epoch_tracker et;
 	struct lagg_reqport *rp = (struct lagg_reqport *)data;
 	struct lagg_softc *sc;
 	struct lagg_port *lp = NULL;
 	int error = 0;
 
 	/* Should be checked by the caller */
 	switch (ifp->if_type) {
 	case IFT_IEEE8023ADLAG:
 	case IFT_INFINIBANDLAG:
 		if ((lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL)
 			goto fallback;
 		break;
 	default:
 		goto fallback;
 	}
 
 	switch (cmd) {
 	case SIOCGLAGGPORT:
 		/* The request must name this very interface. */
 		if (rp->rp_portname[0] == '\0' ||
 		    ifunit(rp->rp_portname) != ifp) {
 			error = EINVAL;
 			break;
 		}
 
 		NET_EPOCH_ENTER(et);
 		/* Re-read the port inside the epoch section. */
 		if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) {
 			error = ENOENT;
 			NET_EPOCH_EXIT(et);
 			break;
 		}
 
 		lagg_port2req(lp, rp);
 		NET_EPOCH_EXIT(et);
 		break;
 
 	case SIOCSIFCAP:
 		if (lp->lp_ioctl == NULL) {
 			error = EINVAL;
 			break;
 		}
 		/* Let the port's own handler apply the change first. */
 		error = (*lp->lp_ioctl)(ifp, cmd, data);
 		if (error)
 			break;
 
 		/* Update lagg interface capabilities */
 		LAGG_XLOCK(sc);
 		lagg_capabilities(sc);
 		LAGG_XUNLOCK(sc);
 		VLAN_CAPABILITIES(sc->sc_ifp);
 		break;
 
 	case SIOCSIFMTU:
 		/* Do not allow the MTU to be changed once joined */
 		error = EINVAL;
 		break;
 
 	default:
 		goto fallback;
 	}
 
 	return (error);
 
 fallback:
 	/* Defer to the ioctl method the port had before it joined. */
 	if (lp != NULL && lp->lp_ioctl != NULL)
 		return ((*lp->lp_ioctl)(ifp, cmd, data));
 
 	return (EINVAL);
 }
 
 /*
  * Return the value of counter 'cnt' for the lagg interface 'ifp'.
  *
  * The result is the sum of three parts:
  * 1) for every current port, the difference between its present counter
  *    and the value recorded in port_counters when it was attached,
  * 2) the counters of the lagg logical interface itself,
  * 3) the deltas accumulated in detached_counters for ports that have
  *    been removed.
  *
  * Bookkeeping done elsewhere on attach/detach:
  * 1) On port attach all of its counters are stored in port_counters.
  * 2) On port detach the difference between the stored and the current
  *    counters is added to detached_counters.
  */
 static uint64_t
 lagg_get_counter(struct ifnet *ifp, ift_counter cnt)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct epoch_tracker et;
 	struct lagg_port *port;
 	uint64_t total = 0;
 
 	/* Revise this when we've got non-generic counters. */
 	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
 
 	NET_EPOCH_ENTER(et);
 	CK_SLIST_FOREACH(port, &sc->sc_ports, lp_entries) {
 		struct ifnet *portifp;
 		uint64_t base;
 
 		/* Value saved at attach time, then the live one. */
 		base = port->port_counters.val[cnt];
 		portifp = port->lp_ifp;
 		total += portifp->if_get_counter(portifp, cnt) - base;
 	}
 	NET_EPOCH_EXIT(et);
 
 	/* Counts made by upper layers on the logical interface. */
 	total += if_get_counter_default(ifp, cnt);
 
 	/* Contribution of ports that are no longer attached. */
 	total += sc->detached_counters.val[cnt];
 
 	return (total);
 }
 
 /*
  * Output method installed on member ports.  Only frames addressed with
  * pseudo_AF_HDRCMPLT or AF_UNSPEC are passed to the port's saved output
  * routine; everything else is freed and reported as ENETDOWN.
  */
 static int
 lagg_port_output(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
 	struct lagg_port *port = ifp->if_lagg;
 	int direct;
 
 	direct = (dst->sa_family == pseudo_AF_HDRCMPLT ||
 	    dst->sa_family == AF_UNSPEC);
 	if (direct && port != NULL)
 		return ((*port->lp_output)(ifp, m, dst, ro));
 
 	/* drop any other frames */
 	m_freem(m);
 	return (ENETDOWN);
 }
 
 /*
  * Interface departure hook: if the departing ifnet is a lagg port (and is
  * not merely being renamed), mark it as detaching and remove it from its
  * lagg.
  */
 static void
 lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp)
 {
 	struct lagg_port *port = ifp->if_lagg;
 	struct lagg_softc *lagg;
 
 	/* Nothing to do for non-members or for a plain rename. */
 	if (port == NULL || (ifp->if_flags & IFF_RENAMING) != 0)
 		return;
 
 	lagg = port->lp_softc;
 
 	LAGG_XLOCK(lagg);
 	port->lp_detaching = 1;
 	lagg_port_destroy(port, 1);
 	LAGG_XUNLOCK(lagg);
 	VLAN_CAPABILITIES(lagg->sc_ifp);
 }
 
 /*
  * Fill the request structure 'rp' with the state of port 'lp': names,
  * priority, flags, protocol specific data and the protocol dependent
  * MASTER/ACTIVE/COLLECTING/DISTRIBUTING flags.
  */
 static void
 lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp)
 {
 	struct lagg_softc *lagg = lp->lp_softc;
 
 	strlcpy(rp->rp_ifname, lagg->sc_ifname, sizeof(rp->rp_ifname));
 	strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname));
 	rp->rp_prio = lp->lp_prio;
 	rp->rp_flags = lp->lp_flags;
 	lagg_proto_portreq(lagg, lp, &rp->rp_psc);
 
 	/* Add protocol specific flags */
 	switch (lagg->sc_proto) {
 	case LAGG_PROTO_FAILOVER:
 		if (lagg->sc_primary == lp)
 			rp->rp_flags |= LAGG_PORT_MASTER;
 		if (lagg_link_active(lagg, lagg->sc_primary) == lp)
 			rp->rp_flags |= LAGG_PORT_ACTIVE;
 		break;
 
 	case LAGG_PROTO_ROUNDROBIN:
 	case LAGG_PROTO_LOADBALANCE:
 	case LAGG_PROTO_BROADCAST:
 		if (LAGG_PORTACTIVE(lp))
 			rp->rp_flags |= LAGG_PORT_ACTIVE;
 		break;
 
 	case LAGG_PROTO_LACP:
 		/* LACP has a different definition of active */
 		if (lacp_isactive(lp))
 			rp->rp_flags |= LAGG_PORT_ACTIVE;
 		if (lacp_iscollecting(lp))
 			rp->rp_flags |= LAGG_PORT_COLLECTING;
 		if (lacp_isdistributing(lp))
 			rp->rp_flags |= LAGG_PORT_DISTRIBUTING;
 		break;
 	}
 }
 
 /*
  * Periodic callout for infiniband laggs.  Copies the link-level and
  * broadcast address of the currently active port to the lagg interface
  * whenever they differ, announces the change through iflladdr_event, and
  * re-arms itself to run again after 'hz' ticks.
  *
  * 'arg' is the lagg softc.
  */
 static void
 lagg_watchdog_infiniband(void *arg)
 {
 	struct epoch_tracker et;
 	struct lagg_softc *sc;
 	struct lagg_port *lp;
 	struct ifnet *ifp;
 	struct ifnet *lp_ifp;
 
 	sc = arg;
 
 	/*
 	 * Because infiniband nodes have a fixed MAC address, which is
 	 * generated by the so-called GID, we need to regularly update
 	 * the link level address of the parent lagg<N> device when
 	 * the active port changes. Possibly we could piggy-back on
 	 * link up/down events as well, but using a timer also provides
 	 * a guarantee against too frequent events. This operation
 	 * does not have to be atomic.
 	 */
 	NET_EPOCH_ENTER(et);
 	lp = lagg_link_active(sc, sc->sc_primary);
 	if (lp != NULL) {
 		ifp = sc->sc_ifp;
 		lp_ifp = lp->lp_ifp;
 
 		/* Only touch the lagg when one of the addresses changed. */
 		if (ifp != NULL && lp_ifp != NULL &&
 		    (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp_ifp), ifp->if_addrlen) != 0 ||
 		     memcmp(sc->sc_bcast_addr, lp_ifp->if_broadcastaddr, ifp->if_addrlen) != 0)) {
 			memcpy(IF_LLADDR(ifp), IF_LLADDR(lp_ifp), ifp->if_addrlen);
 			memcpy(sc->sc_bcast_addr, lp_ifp->if_broadcastaddr, ifp->if_addrlen);
 
 			CURVNET_SET(ifp->if_vnet);
 			EVENTHANDLER_INVOKE(iflladdr_event, ifp);
 			CURVNET_RESTORE();
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	/* Schedule the next run. */
 	callout_reset(&sc->sc_watchdog, hz, &lagg_watchdog_infiniband, arg);
 }
 
 /*
  * Bring the lagg interface into the running state.  Does nothing when
  * IFF_DRV_RUNNING is already set; otherwise the ports' link-level
  * addresses are synchronised with the lagg, the protocol is initialised
  * and, for infiniband, the address watchdog is started.
  */
 static void
 lagg_init(void *xsc)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)xsc;
 	struct ifnet *laggifp = sc->sc_ifp;
 	struct lagg_port *port;
 
 	LAGG_XLOCK(sc);
 	if ((laggifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		laggifp->if_drv_flags |= IFF_DRV_RUNNING;
 
 		/*
 		 * Update the port lladdrs if needed.
 		 * This might be if_setlladdr() notification
 		 * that lladdr has been changed.
 		 */
 		CK_SLIST_FOREACH(port, &sc->sc_ports, lp_entries) {
 			if (memcmp(IF_LLADDR(laggifp),
 			    IF_LLADDR(port->lp_ifp),
 			    laggifp->if_addrlen) == 0)
 				continue;
 			if_setlladdr(port->lp_ifp, IF_LLADDR(laggifp),
 			    laggifp->if_addrlen);
 		}
 
 		lagg_proto_init(sc);
 
 		if (laggifp->if_type == IFT_INFINIBAND) {
 			mtx_lock(&sc->sc_mtx);
 			lagg_watchdog_infiniband(sc);
 			mtx_unlock(&sc->sc_mtx);
 		}
 	}
 	LAGG_XUNLOCK(sc);
 }
 
 /*
  * Take a running lagg interface out of service: clear IFF_DRV_RUNNING,
  * stop the protocol and stop/drain the watchdog callout.  The lagg
  * exclusive lock must be held.
  */
 static void
 lagg_stop(struct lagg_softc *sc)
 {
 	struct ifnet *laggifp = sc->sc_ifp;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	if (laggifp->if_drv_flags & IFF_DRV_RUNNING) {
 		laggifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 
 		lagg_proto_stop(sc);
 
 		mtx_lock(&sc->sc_mtx);
 		callout_stop(&sc->sc_watchdog);
 		mtx_unlock(&sc->sc_mtx);
 
 		callout_drain(&sc->sc_watchdog);
 	}
 }
 
 /*
  * ioctl handler of the lagg interface itself.
  *
  * Handles the lagg specific requests (SIOCGLAGG, SIOCSLAGG,
  * SIOC[GS]LAGGOPTS, SIOCGLAGGFLAGS, SIOCSLAGGHASH, SIOCGLAGGPORT,
  * SIOCSLAGGPORT, SIOCSLAGGDELPORT) and the generic interface requests
  * that have to be propagated to the member ports (flags, multicast,
  * media, capabilities, MTU).  Everything else goes to ether_ioctl().
  * Requests that modify the configuration are subject to
  * priv_check(PRIV_NET_LAGG).
  *
  * Returns 0 on success or an errno value.
  */
 static int
 lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct epoch_tracker et;
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct lagg_reqall *ra = (struct lagg_reqall *)data;
 	struct lagg_reqopts *ro = (struct lagg_reqopts *)data;
 	struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf;
 	struct lagg_reqflags *rf = (struct lagg_reqflags *)data;
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct lagg_port *lp;
 	struct ifnet *tpif;
 	struct thread *td = curthread;
 	char *buf, *outbuf;
 	int count, buflen, len, error = 0, oldmtu;
 
 	bzero(&rpbuf, sizeof(rpbuf));
 
 	/* XXX: This can race with lagg_clone_destroy. */
 
 	switch (cmd) {
 	case SIOCGLAGG:
 		/* Snapshot the port list into a kernel buffer, then copy out. */
 		LAGG_XLOCK(sc);
 		buflen = sc->sc_count * sizeof(struct lagg_reqport);
 		outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
 		ra->ra_proto = sc->sc_proto;
 		lagg_proto_request(sc, &ra->ra_psc);
 		count = 0;
 		buf = outbuf;
 		len = min(ra->ra_size, buflen);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			if (len < sizeof(rpbuf))
 				break;
 
 			lagg_port2req(lp, &rpbuf);
 			memcpy(buf, &rpbuf, sizeof(rpbuf));
 			count++;
 			buf += sizeof(rpbuf);
 			len -= sizeof(rpbuf);
 		}
 		LAGG_XUNLOCK(sc);
 		ra->ra_ports = count;
 		ra->ra_size = count * sizeof(rpbuf);
 		error = copyout(outbuf, ra->ra_port, ra->ra_size);
 		free(outbuf, M_TEMP);
 		break;
 	case SIOCSLAGG:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if (ra->ra_proto >= LAGG_PROTO_MAX) {
 			error = EPROTONOSUPPORT;
 			break;
 		}
 		/* Infiniband only supports the failover protocol. */
 		if (ra->ra_proto != LAGG_PROTO_FAILOVER &&
 		    ifp->if_type == IFT_INFINIBAND) {
 			error = EPROTONOSUPPORT;
 			break;
 		}
 		LAGG_XLOCK(sc);
 		lagg_proto_detach(sc);
 		lagg_proto_attach(sc, ra->ra_proto);
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCGLAGGOPTS:
 		LAGG_XLOCK(sc);
 		ro->ro_opts = sc->sc_opts;
 		if (sc->sc_proto == LAGG_PROTO_LACP) {
 			struct lacp_softc *lsc;
 
 			/* LACP options live in the protocol softc. */
 			lsc = (struct lacp_softc *)sc->sc_psc;
 			if (lsc->lsc_debug.lsc_tx_test != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_TXTEST;
 			if (lsc->lsc_debug.lsc_rx_test != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_RXTEST;
 			if (lsc->lsc_strict_mode != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_STRICT;
 			if (lsc->lsc_fast_timeout != 0)
 				ro->ro_opts |= LAGG_OPT_LACP_FAST_TIMO;
 
 			ro->ro_active = sc->sc_active;
 		} else {
 			ro->ro_active = 0;
 			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 				ro->ro_active += LAGG_PORTACTIVE(lp);
 		}
 		ro->ro_bkt = sc->sc_stride;
 		ro->ro_flapping = sc->sc_flapping;
 		ro->ro_flowid_shift = sc->flowid_shift;
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCSLAGGOPTS:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 
 		/*
 		 * The stride option was added without defining a corresponding
 		 * LAGG_OPT flag, so handle a non-zero value before checking
 		 * anything else to preserve compatibility.
 		 */
 		LAGG_XLOCK(sc);
 		if (ro->ro_opts == 0 && ro->ro_bkt != 0) {
 			if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN) {
 				LAGG_XUNLOCK(sc);
 				error = EINVAL;
 				break;
 			}
 			sc->sc_stride = ro->ro_bkt;
 		}
 		if (ro->ro_opts == 0) {
 			LAGG_XUNLOCK(sc);
 			break;
 		}
 
 		/*
 		 * Set options.  LACP options are stored in sc->sc_psc,
 		 * not in sc_opts.
 		 */
 		int valid, lacp;
 
 		/* Exactly one option per request; negative means "clear". */
 		switch (ro->ro_opts) {
 		case LAGG_OPT_USE_FLOWID:
 		case -LAGG_OPT_USE_FLOWID:
 		case LAGG_OPT_USE_NUMA:
 		case -LAGG_OPT_USE_NUMA:
 		case LAGG_OPT_FLOWIDSHIFT:
 		case LAGG_OPT_RR_LIMIT:
 			valid = 1;
 			lacp = 0;
 			break;
 		case LAGG_OPT_LACP_TXTEST:
 		case -LAGG_OPT_LACP_TXTEST:
 		case LAGG_OPT_LACP_RXTEST:
 		case -LAGG_OPT_LACP_RXTEST:
 		case LAGG_OPT_LACP_STRICT:
 		case -LAGG_OPT_LACP_STRICT:
 		case LAGG_OPT_LACP_FAST_TIMO:
 		case -LAGG_OPT_LACP_FAST_TIMO:
 			valid = lacp = 1;
 			break;
 		default:
 			valid = lacp = 0;
 			break;
 		}
 
 		if (valid == 0 ||
 		    (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) {
 			/* Invalid combination of options specified. */
 			error = EINVAL;
 			LAGG_XUNLOCK(sc);
 			break;	/* Return from SIOCSLAGGOPTS. */
 		}
 
 		/*
 		 * Store new options into sc->sc_opts except for
 		 * FLOWIDSHIFT, RR and LACP options.
 		 */
 		if (lacp == 0) {
 			if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT)
 				sc->flowid_shift = ro->ro_flowid_shift;
 			else if (ro->ro_opts == LAGG_OPT_RR_LIMIT) {
 				if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN ||
 				    ro->ro_bkt == 0) {
 					error = EINVAL;
 					LAGG_XUNLOCK(sc);
 					break;
 				}
 				sc->sc_stride = ro->ro_bkt;
 			} else if (ro->ro_opts > 0)
 				sc->sc_opts |= ro->ro_opts;
 			else {
 				/*
 				 * ro_opts holds the negated flag here.
 				 * Negate it back before complementing:
 				 * ~(-flag) equals flag - 1 and would also
 				 * clear every option above the requested
 				 * one.
 				 */
 				sc->sc_opts &= ~(-ro->ro_opts);
 			}
 		} else {
 			struct lacp_softc *lsc;
 			struct lacp_port *lp;
 
 			lsc = (struct lacp_softc *)sc->sc_psc;
 
 			switch (ro->ro_opts) {
 			case LAGG_OPT_LACP_TXTEST:
 				lsc->lsc_debug.lsc_tx_test = 1;
 				break;
 			case -LAGG_OPT_LACP_TXTEST:
 				lsc->lsc_debug.lsc_tx_test = 0;
 				break;
 			case LAGG_OPT_LACP_RXTEST:
 				lsc->lsc_debug.lsc_rx_test = 1;
 				break;
 			case -LAGG_OPT_LACP_RXTEST:
 				lsc->lsc_debug.lsc_rx_test = 0;
 				break;
 			case LAGG_OPT_LACP_STRICT:
 				lsc->lsc_strict_mode = 1;
 				break;
 			case -LAGG_OPT_LACP_STRICT:
 				lsc->lsc_strict_mode = 0;
 				break;
 			case LAGG_OPT_LACP_FAST_TIMO:
 				LACP_LOCK(lsc);
 				LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
 					lp->lp_state |= LACP_STATE_TIMEOUT;
 				LACP_UNLOCK(lsc);
 				lsc->lsc_fast_timeout = 1;
 				break;
 			case -LAGG_OPT_LACP_FAST_TIMO:
 				LACP_LOCK(lsc);
 				LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
 					lp->lp_state &= ~LACP_STATE_TIMEOUT;
 				LACP_UNLOCK(lsc);
 				lsc->lsc_fast_timeout = 0;
 				break;
 			}
 		}
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCGLAGGFLAGS:
 		rf->rf_flags = 0;
 		LAGG_XLOCK(sc);
 		if (sc->sc_flags & MBUF_HASHFLAG_L2)
 			rf->rf_flags |= LAGG_F_HASHL2;
 		if (sc->sc_flags & MBUF_HASHFLAG_L3)
 			rf->rf_flags |= LAGG_F_HASHL3;
 		if (sc->sc_flags & MBUF_HASHFLAG_L4)
 			rf->rf_flags |= LAGG_F_HASHL4;
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCSLAGGHASH:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) {
 			error = EINVAL;
 			break;
 		}
 		LAGG_XLOCK(sc);
 		sc->sc_flags = 0;
 		if (rf->rf_flags & LAGG_F_HASHL2)
 			sc->sc_flags |= MBUF_HASHFLAG_L2;
 		if (rf->rf_flags & LAGG_F_HASHL3)
 			sc->sc_flags |= MBUF_HASHFLAG_L3;
 		if (rf->rf_flags & LAGG_F_HASHL4)
 			sc->sc_flags |= MBUF_HASHFLAG_L4;
 		LAGG_XUNLOCK(sc);
 		break;
 	case SIOCGLAGGPORT:
 		if (rp->rp_portname[0] == '\0' ||
 		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
 			error = EINVAL;
 			break;
 		}
 
 		NET_EPOCH_ENTER(et);
 		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
 		    lp->lp_softc != sc) {
 			error = ENOENT;
 			NET_EPOCH_EXIT(et);
 			if_rele(tpif);
 			break;
 		}
 
 		lagg_port2req(lp, rp);
 		NET_EPOCH_EXIT(et);
 		if_rele(tpif);
 		break;
 	case SIOCSLAGGPORT:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if (rp->rp_portname[0] == '\0' ||
 		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
 			error = EINVAL;
 			break;
 		}
 #ifdef INET6
 		/*
 		 * A laggport interface should not have inet6 address
 		 * because two interfaces with a valid link-local
 		 * scope zone must not be merged in any form.  This
 		 * restriction is needed to prevent violation of
 		 * link-local scope zone.  Attempts to add a laggport
 		 * interface which has inet6 addresses triggers
 		 * removal of all inet6 addresses on the member
 		 * interface.
 		 */
 		if (in6ifa_llaonifp(tpif)) {
 			in6_ifdetach(tpif);
 			if_printf(sc->sc_ifp,
 			    "IPv6 addresses on %s have been removed "
 			    "before adding it as a member to prevent "
 			    "IPv6 address scope violation.\n",
 			    tpif->if_xname);
 		}
 #endif
 		oldmtu = ifp->if_mtu;
 		LAGG_XLOCK(sc);
 		error = lagg_port_create(sc, tpif);
 		LAGG_XUNLOCK(sc);
 		if_rele(tpif);
 
 		/*
 		 * LAGG MTU may change during addition of the first port.
 		 * If it did, do network layer specific procedure.
 		 */
 		if (ifp->if_mtu != oldmtu) {
 #ifdef INET6
 			nd6_setmtu(ifp);
 #endif
 			rt_updatemtu(ifp);
 		}
 
 		VLAN_CAPABILITIES(ifp);
 		break;
 	case SIOCSLAGGDELPORT:
 		error = priv_check(td, PRIV_NET_LAGG);
 		if (error)
 			break;
 		if (rp->rp_portname[0] == '\0' ||
 		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
 			error = EINVAL;
 			break;
 		}
 
 		LAGG_XLOCK(sc);
 		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
 		    lp->lp_softc != sc) {
 			error = ENOENT;
 			LAGG_XUNLOCK(sc);
 			if_rele(tpif);
 			break;
 		}
 
 		error = lagg_port_destroy(lp, 1);
 		LAGG_XUNLOCK(sc);
 		if_rele(tpif);
 		VLAN_CAPABILITIES(ifp);
 		break;
 	case SIOCSIFFLAGS:
 		/* Set flags on ports too */
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			lagg_setflags(lp, 1);
 		}
 
 		if (!(ifp->if_flags & IFF_UP) &&
 		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			/*
 			 * If interface is marked down and it is running,
 			 * then stop and disable it.
 			 */
 			lagg_stop(sc);
 			LAGG_XUNLOCK(sc);
 		} else if ((ifp->if_flags & IFF_UP) &&
 		    !(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			/*
 			 * If interface is marked up and it is stopped, then
 			 * start it.
 			 */
 			LAGG_XUNLOCK(sc);
 			(*ifp->if_init)(sc);
 		} else
 			LAGG_XUNLOCK(sc);
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/* Rebuild every port's multicast list from the lagg's. */
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			lagg_clrmulti(lp);
 			lagg_setmulti(lp);
 		}
 		LAGG_XUNLOCK(sc);
 		error = 0;
 		break;
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		if (ifp->if_type == IFT_INFINIBAND)
 			error = EINVAL;
 		else
 			error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
 		break;
 
 	case SIOCSIFCAP:
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			if (lp->lp_ioctl != NULL)
 				(*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
 		}
 		lagg_capabilities(sc);
 		LAGG_XUNLOCK(sc);
 		VLAN_CAPABILITIES(ifp);
 		error = 0;
 		break;
 
 	case SIOCSIFMTU:
 		LAGG_XLOCK(sc);
 		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 			if (lp->lp_ioctl != NULL)
 				error = (*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
 			else
 				error = EINVAL;
 			if (error != 0) {
 				if_printf(ifp,
 				    "failed to change MTU to %d on port %s, "
 				    "reverting all ports to original MTU (%d)\n",
 				    ifr->ifr_mtu, lp->lp_ifp->if_xname, ifp->if_mtu);
 				break;
 			}
 		}
 		if (error == 0) {
 			ifp->if_mtu = ifr->ifr_mtu;
 		} else {
 			/* set every port back to the original MTU */
 			ifr->ifr_mtu = ifp->if_mtu;
 			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 				if (lp->lp_ioctl != NULL)
 					(*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
 			}
 		}
 		LAGG_XUNLOCK(sc);
 		break;
 
 	default:
 		error = ether_ioctl(ifp, cmd, data);
 		break;
 	}
 	return (error);
 }
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 /*
  * Send tag method tables, one per tag type that lagg_snd_tag_alloc()
  * selects from.  All tables route to the same lagg_snd_tag_* wrappers and
  * differ only in their 'type' field.
  */
+#ifdef RATELIMIT
+static const struct if_snd_tag_sw lagg_snd_tag_ul_sw = {
+	.snd_tag_modify = lagg_snd_tag_modify,
+	.snd_tag_query = lagg_snd_tag_query,
+	.snd_tag_free = lagg_snd_tag_free,
+	.next_snd_tag = lagg_next_snd_tag,
+	.type = IF_SND_TAG_TYPE_UNLIMITED
+};
+
+static const struct if_snd_tag_sw lagg_snd_tag_rl_sw = {
+	.snd_tag_modify = lagg_snd_tag_modify,
+	.snd_tag_query = lagg_snd_tag_query,
+	.snd_tag_free = lagg_snd_tag_free,
+	.next_snd_tag = lagg_next_snd_tag,
+	.type = IF_SND_TAG_TYPE_RATE_LIMIT
+};
+#endif
+
+#ifdef KERN_TLS
+static const struct if_snd_tag_sw lagg_snd_tag_tls_sw = {
+	.snd_tag_modify = lagg_snd_tag_modify,
+	.snd_tag_query = lagg_snd_tag_query,
+	.snd_tag_free = lagg_snd_tag_free,
+	.next_snd_tag = lagg_next_snd_tag,
+	.type = IF_SND_TAG_TYPE_TLS
+};
+
+#ifdef RATELIMIT
+static const struct if_snd_tag_sw lagg_snd_tag_tls_rl_sw = {
+	.snd_tag_modify = lagg_snd_tag_modify,
+	.snd_tag_query = lagg_snd_tag_query,
+	.snd_tag_free = lagg_snd_tag_free,
+	.next_snd_tag = lagg_next_snd_tag,
+	.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT
+};
+#endif
+#endif
+
+
 /* Map an embedded 'com' send tag back to its enclosing lagg_snd_tag. */
 static inline struct lagg_snd_tag *
 mst_to_lst(struct m_snd_tag *mst)
 {
 	struct lagg_snd_tag *outer;
 
 	outer = __containerof(mst, struct lagg_snd_tag, com);
 	return (outer);
 }
 
 /*
  * Look up the port used by a specific flow.  This only works for lagg
  * protocols with deterministic port mappings (e.g. not roundrobin).
  * In addition protocols which use a hash to map flows to ports must
  * be configured to use the mbuf flowid rather than hashing packet
  * contents.
  *
  * Returns the selected port, or NULL when the protocol has no
  * deterministic mapping, flowid use is disabled, the flow carries no
  * hash type, or the lagg currently has no ports to choose from.
  */
 static struct lagg_port *
 lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype,
     uint8_t numa_domain)
 {
 	struct lagg_softc *sc;
 	struct lagg_port *lp;
 	struct lagg_lb *lb;
 	uint32_t hash, p;
 	int err;
 
 	sc = ifp->if_softc;
 
 	switch (sc->sc_proto) {
 	case LAGG_PROTO_FAILOVER:
 		return (lagg_link_active(sc, sc->sc_primary));
 	case LAGG_PROTO_LOADBALANCE:
 		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
 		    flowtype == M_HASHTYPE_NONE)
 			return (NULL);
 		/* Without ports the modulo below would divide by zero. */
 		if (sc->sc_count == 0)
 			return (NULL);
 		p = flowid >> sc->flowid_shift;
 		p %= sc->sc_count;
 		lb = (struct lagg_lb *)sc->sc_psc;
 		lp = lb->lb_ports[p];
 		return (lagg_link_active(sc, lp));
 	case LAGG_PROTO_LACP:
 		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
 		    flowtype == M_HASHTYPE_NONE)
 			return (NULL);
 		hash = flowid >> sc->flowid_shift;
 		return (lacp_select_tx_port_by_hash(sc, hash, numa_domain, &err));
 	default:
 		return (NULL);
 	}
 }
 
 /*
  * Allocate a send tag on the lagg interface.
  *
  * The method table matching params->hdr.type is chosen first (other types
  * yield EOPNOTSUPP).  The port that carries the flow is then looked up
  * inside a net epoch section and a reference is taken on its ifnet.  A
  * lagg_snd_tag wrapper is allocated with M_NOWAIT, the real tag is
  * allocated on the port with m_snd_tag_alloc(), and the wrapper's own tag
  * is initialised and returned through *ppmt.
  *
  * Returns 0 on success, EOPNOTSUPP when no port can be determined, ENOMEM
  * when the wrapper cannot be allocated, or the error from
  * m_snd_tag_alloc().
  */
 static int
 lagg_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	struct epoch_tracker et;
+	const struct if_snd_tag_sw *sw;
 	struct lagg_snd_tag *lst;
 	struct lagg_softc *sc;
 	struct lagg_port *lp;
 	struct ifnet *lp_ifp;
 	int error;
 
 	sc = ifp->if_softc;
 
+	switch (params->hdr.type) {
+#ifdef RATELIMIT
+	case IF_SND_TAG_TYPE_UNLIMITED:
+		sw = &lagg_snd_tag_ul_sw;
+		break;
+	case IF_SND_TAG_TYPE_RATE_LIMIT:
+		sw = &lagg_snd_tag_rl_sw;
+		break;
+#endif
+#ifdef KERN_TLS
+	case IF_SND_TAG_TYPE_TLS:
+		sw = &lagg_snd_tag_tls_sw;
+		break;
+#ifdef RATELIMIT
+	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
+		sw = &lagg_snd_tag_tls_rl_sw;
+		break;
+#endif
+#endif
+	default:
+		return (EOPNOTSUPP);
+	}
+
 	NET_EPOCH_ENTER(et);
 	lp = lookup_snd_tag_port(ifp, params->hdr.flowid,
 	    params->hdr.flowtype, params->hdr.numa_domain);
 	if (lp == NULL) {
 		NET_EPOCH_EXIT(et);
 		return (EOPNOTSUPP);
 	}
 	if (lp->lp_ifp == NULL) {
 		NET_EPOCH_EXIT(et);
 		return (EOPNOTSUPP);
 	}
 	lp_ifp = lp->lp_ifp;
 	/* Keep the port's ifnet referenced once the epoch section ends. */
 	if_ref(lp_ifp);
 	NET_EPOCH_EXIT(et);
 
 	lst = malloc(sizeof(*lst), M_LAGG, M_NOWAIT);
 	if (lst == NULL) {
 		if_rele(lp_ifp);
 		return (ENOMEM);
 	}
 
 	/* Allocate the real tag on the selected port. */
 	error = m_snd_tag_alloc(lp_ifp, params, &lst->tag);
 	if_rele(lp_ifp);
 	if (error) {
 		free(lst, M_LAGG);
 		return (error);
 	}
 
-	m_snd_tag_init(&lst->com, ifp, lst->tag->type);
+	m_snd_tag_init(&lst->com, ifp, sw);
 
 	*ppmt = &lst->com;
 	return (0);
 }
 
 static struct m_snd_tag *
 lagg_next_snd_tag(struct m_snd_tag *mst)
 {
 	struct lagg_snd_tag *lst;
 
 	lst = mst_to_lst(mst);
 	return (lst->tag);
 }
 
 /*
  * Forward a modify request on a lagg send tag to the wrapped port tag
  * and return that call's result.
  */
 static int
 lagg_snd_tag_modify(struct m_snd_tag *mst,
     union if_snd_tag_modify_params *params)
 {
 	struct lagg_snd_tag *lst;
 
 	lst = mst_to_lst(mst);
-	return (lst->tag->ifp->if_snd_tag_modify(lst->tag, params));
+	return (lst->tag->sw->snd_tag_modify(lst->tag, params));
 }
 
 /*
  * Forward a query request on a lagg send tag to the wrapped port tag
  * and return that call's result.
  */
 static int
 lagg_snd_tag_query(struct m_snd_tag *mst,
     union if_snd_tag_query_params *params)
 {
 	struct lagg_snd_tag *lst;
 
 	lst = mst_to_lst(mst);
-	return (lst->tag->ifp->if_snd_tag_query(lst->tag, params));
+	return (lst->tag->sw->snd_tag_query(lst->tag, params));
 }
 
 /*
  * Release a lagg send tag: drop the reference on the wrapped port tag,
  * then free the wrapper itself.
  */
 static void
 lagg_snd_tag_free(struct m_snd_tag *mst)
 {
 	struct lagg_snd_tag *wrapper = mst_to_lst(mst);
 
 	m_snd_tag_rele(wrapper->tag);
 	free(wrapper, M_LAGG);
 }
 
 /*
  * Rate limit query for the lagg interface.  The lagg is only an indirect
  * interface, so the result carries no rate table and is flagged
  * RT_IS_INDIRECT: the caller has to obtain a ratelimit tag on the actual
  * interface the flow will use.
  */
 static void
 lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
 {
 
 	q->flags = RT_IS_INDIRECT;
 	q->rate_table = NULL;
 	q->number_of_rates = 0;
 	q->max_flows = 0;
 }
 #endif
 
 /*
  * Mirror the link-layer multicast addresses of the lagg interface onto
  * the port 'lp'.
  *
  * Under the lagg's address lock one lagg_mc entry is queued on
  * lp_mc_head for every AF_LINK multicast address; afterwards each queued
  * address is joined on the port with if_addmulti().  Returns 0, ENOMEM
  * when an entry cannot be allocated, or the first if_addmulti() error.
  */
 static int
 lagg_setmulti(struct lagg_port *lp)
 {
 	struct ifnet *portifp = lp->lp_ifp;
 	struct ifnet *laggifp = lp->lp_softc->sc_ifp;
 	struct ifmultiaddr *ifma;
 	struct lagg_mc *entry;
 	int rc;
 
 	IF_ADDR_WLOCK(laggifp);
 	CK_STAILQ_FOREACH(ifma, &laggifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
 		entry = malloc(sizeof(struct lagg_mc), M_LAGG, M_NOWAIT);
 		if (entry == NULL) {
 			IF_ADDR_WUNLOCK(laggifp);
 			return (ENOMEM);
 		}
 		bcopy(ifma->ifma_addr, &entry->mc_addr,
 		    ifma->ifma_addr->sa_len);
 		/* The copy has to refer to the port, not to the lagg. */
 		entry->mc_addr.sdl_index = portifp->if_index;
 		entry->mc_ifma = NULL;
 		SLIST_INSERT_HEAD(&lp->lp_mc_head, entry, mc_entries);
 	}
 	IF_ADDR_WUNLOCK(laggifp);
 
 	SLIST_FOREACH (entry, &lp->lp_mc_head, mc_entries) {
 		rc = if_addmulti(portifp,
 		    (struct sockaddr *)&entry->mc_addr, &entry->mc_ifma);
 		if (rc != 0)
 			return (rc);
 	}
 	return (0);
 }
 
 /*
  * Empty the multicast record list of port lp.
  *
  * Every lagg_mc on lp->lp_mc_head is unlinked and freed.  If the record
  * has an ifma and the port is not detaching (lp_detaching == 0),
  * if_delmulti_ifma() is called on it first.
  *
  * Asserts that LAGG_XLOCK is held.  Always returns 0.
  */
 static int
 lagg_clrmulti(struct lagg_port *lp)
 {
 	struct lagg_mc *mc;
 
 	LAGG_XLOCK_ASSERT(lp->lp_softc);
 	while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) {
 		SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries);
 		if (mc->mc_ifma && lp->lp_detaching == 0)
 			if_delmulti_ifma(mc->mc_ifma);
 		free(mc, M_LAGG);
 	}
 	return (0);
 }
 
 /*
  * Set the enabled capabilities of port lp to cap.
  *
  * The request is issued as SIOCSIFCAP through the port's lp_ioctl handler.
  *
  * Returns 0 without calling the handler when the port's if_capenable
  * already equals cap, ENXIO when lp_ioctl is NULL, otherwise whatever the
  * handler returns.
  */
 static int
 lagg_setcaps(struct lagg_port *lp, int cap)
 {
 	struct ifreq ifr;
 
 	if (lp->lp_ifp->if_capenable == cap)
 		return (0);
 	if (lp->lp_ioctl == NULL)
 		return (ENXIO);
 	/*
 	 * NOTE(review): only ifr_reqcap is set; the rest of the on-stack
 	 * ifreq is passed to the handler uninitialised.
 	 */
 	ifr.ifr_reqcap = cap;
 	return ((*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAP, (caddr_t)&ifr));
 }
 
 /*
  * Handle a ref counted flag that should be set on the lagg port as well.
  *
  * lp:     port to update.
  * flag:   the interface flag bit being propagated.
  * status: non-zero to copy the current state of "flag" from the lagg
  *         ifnet to the port, zero to clear it on the port.
  * func:   called as func(port ifnet, value) to apply the change, where
  *         value is either the flag bit or 0.
  *
  * Asserts that LAGG_XLOCK is held.  Returns 0, or the error from func, in
  * which case lp_ifflags is left untouched.
  */
 static int
 lagg_setflag(struct lagg_port *lp, int flag, int status,
     int (*func)(struct ifnet *, int))
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	struct ifnet *scifp = sc->sc_ifp;
 	struct ifnet *ifp = lp->lp_ifp;
 	int error;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	status = status ? (scifp->if_flags & flag) : 0;
 	/* Now "status" contains the flag value or 0 */
 
 	/*
 	 * See if recorded ports status is different from what
 	 * we want it to be.  If it is, flip it.  We record ports
 	 * status in lp_ifflags so that we won't clear ports flag
 	 * we haven't set.  In fact, we don't clear or set ports
 	 * flags directly, but get or release references to them.
 	 * That's why we can be sure that recorded flags still are
 	 * in accord with actual ports flags.
 	 */
 	if (status != (lp->lp_ifflags & flag)) {
 		error = (*func)(ifp, status);
 		if (error)
 			return (error);
 		lp->lp_ifflags &= ~flag;
 		lp->lp_ifflags |= status;
 	}
 	return (0);
 }
 
 /*
  * Handle IFF_* flags that require certain changes on the lagg port:
  * if "status" is true, update the port's flags to match the lagg;
  * if "status" is false, forcibly clear the flags set on the port.
  *
  * Walks lagg_pflags[] until the entry whose flag is 0, calling
  * lagg_setflag() with each entry's flag and func.  Stops at, and returns,
  * the first error; returns 0 if every entry succeeded.
  */
 static int
 lagg_setflags(struct lagg_port *lp, int status)
 {
 	int error, i;
 
 	for (i = 0; lagg_pflags[i].flag; i++) {
 		error = lagg_setflag(lp, lagg_pflags[i].flag,
 		    status, lagg_pflags[i].func);
 		if (error)
 			return (error);
 	}
 	return (0);
 }
 
 /*
  * Transmit routine of the lagg interface for Ethernet (the signature
  * matches if_transmit_fn_t).
  *
  * Runs inside the network epoch.  If no protocol is configured
  * (LAGG_PROTO_NONE) or the lagg has no ports, m is freed, OERRORS is
  * incremented on ifp and ENXIO is returned.  Otherwise the packet is
  * tapped to BPF on ifp and passed to lagg_proto_start(), whose result is
  * returned.
  */
 static int
 lagg_transmit_ethernet(struct ifnet *ifp, struct mbuf *m)
 {
 	struct epoch_tracker et;
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	int error;
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	/* A send tag attached to the mbuf must belong to this ifnet. */
 	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG)
 		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
 #endif
 	NET_EPOCH_ENTER(et);
 	/* We need a Tx algorithm and at least one port */
 	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
 		NET_EPOCH_EXIT(et);
 		m_freem(m);
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENXIO);
 	}
 
 	ETHER_BPF_MTAP(ifp, m);
 
 	error = lagg_proto_start(sc, m);
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
 /*
  * Transmit routine of the lagg interface for InfiniBand (the signature
  * matches if_transmit_fn_t).
  *
  * Runs inside the network epoch.  If no protocol is configured
  * (LAGG_PROTO_NONE) or the lagg has no ports, m is freed, OERRORS is
  * incremented on ifp and ENXIO is returned.  Otherwise the packet is
  * tapped to BPF on ifp with INFINIBAND_BPF_MTAP and passed to
  * lagg_proto_start(), whose result is returned.
  */
 static int
 lagg_transmit_infiniband(struct ifnet *ifp, struct mbuf *m)
 {
 	struct epoch_tracker et;
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	int error;
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	/* A send tag attached to the mbuf must belong to this ifnet. */
 	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG)
 		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
 #endif
 	NET_EPOCH_ENTER(et);
 	/* We need a Tx algorithm and at least one port */
 	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
 		NET_EPOCH_EXIT(et);
 		m_freem(m);
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENXIO);
 	}
 
 	INFINIBAND_BPF_MTAP(ifp, m);
 
 	error = lagg_proto_start(sc, m);
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
 /*
  * The ifp->if_qflush entry point for lagg(4) is no-op: the body is empty
  * and ifp is unused.
  */
 static void
 lagg_qflush(struct ifnet *ifp __unused)
 {
 }
 
 /*
  * Input path for a packet received on an Ethernet lagg port.
  *
  * ifp is the port's ifnet; its if_lagg pointer yields the lagg_port and,
  * through it, the lagg softc and the lagg ifnet (scifp).
  *
  * The packet is freed and NULL returned when the lagg ifnet is not
  * IFF_DRV_RUNNING, the port is detaching, or no protocol is configured.
  * Otherwise it is tapped to BPF on the lagg ifnet and handed to
  * lagg_proto_input().  If that returns a packet while the lagg ifnet has
  * IFF_MONITOR set, the packet is freed and NULL returned; otherwise the
  * result of lagg_proto_input() is returned.
  */
 static struct mbuf *
 lagg_input_ethernet(struct ifnet *ifp, struct mbuf *m)
 {
 	struct epoch_tracker et;
 	struct lagg_port *lp = ifp->if_lagg;
 	struct lagg_softc *sc = lp->lp_softc;
 	struct ifnet *scifp = sc->sc_ifp;
 
 	NET_EPOCH_ENTER(et);
 	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    lp->lp_detaching != 0 ||
 	    sc->sc_proto == LAGG_PROTO_NONE) {
 		NET_EPOCH_EXIT(et);
 		m_freem(m);
 		return (NULL);
 	}
 
 	ETHER_BPF_MTAP(scifp, m);
 
 	m = lagg_proto_input(sc, lp, m);
 	/* In monitor mode nothing is passed further up. */
 	if (m != NULL && (scifp->if_flags & IFF_MONITOR) != 0) {
 		m_freem(m);
 		m = NULL;
 	}
 
 	NET_EPOCH_EXIT(et);
 	return (m);
 }
 
 /*
  * Input path for a packet received on an InfiniBand lagg port.
  *
  * ifp is the port's ifnet; its if_lagg pointer yields the lagg_port and,
  * through it, the lagg softc and the lagg ifnet (scifp).
  *
  * The packet is freed and NULL returned when the lagg ifnet is not
  * IFF_DRV_RUNNING, the port is detaching, or no protocol is configured.
  * Otherwise it is tapped to BPF on the lagg ifnet with
  * INFINIBAND_BPF_MTAP and handed to lagg_proto_input().  If that returns
  * a packet while the lagg ifnet has IFF_MONITOR set, the packet is freed
  * and NULL returned; otherwise the result of lagg_proto_input() is
  * returned.
  */
 static struct mbuf *
 lagg_input_infiniband(struct ifnet *ifp, struct mbuf *m)
 {
 	struct epoch_tracker et;
 	struct lagg_port *lp = ifp->if_lagg;
 	struct lagg_softc *sc = lp->lp_softc;
 	struct ifnet *scifp = sc->sc_ifp;
 
 	NET_EPOCH_ENTER(et);
 	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    lp->lp_detaching != 0 ||
 	    sc->sc_proto == LAGG_PROTO_NONE) {
 		NET_EPOCH_EXIT(et);
 		m_freem(m);
 		return (NULL);
 	}
 
 	INFINIBAND_BPF_MTAP(scifp, m);
 
 	m = lagg_proto_input(sc, lp, m);
 	/* In monitor mode nothing is passed further up. */
 	if (m != NULL && (scifp->if_flags & IFF_MONITOR) != 0) {
 		m_freem(m);
 		m = NULL;
 	}
 
 	NET_EPOCH_EXIT(et);
 	return (m);
 }
 
 /*
  * Media change callback for the lagg interface.
  *
  * The request is ignored and 0 is always returned.  When IFF_DEBUG is set
  * in sc_ifflags the function name is printed.
  */
 static int
 lagg_media_change(struct ifnet *ifp)
 {
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 
 	if (sc->sc_ifflags & IFF_DEBUG)
 		printf("%s\n", __func__);
 
 	/* Ignore */
 	return (0);
 }
 
 /*
  * Media status callback for the lagg interface.
  *
  * imr always reports IFM_AVALID with an active media word of
  * IFM_ETHER | IFM_AUTO.  IFM_ACTIVE is added to the status when at least
  * one port on sc_ports satisfies LAGG_PORTACTIVE().  The port list is
  * walked inside the network epoch.
  */
 static void
 lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr)
 {
 	struct epoch_tracker et;
 	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
 	struct lagg_port *lp;
 
 	imr->ifm_status = IFM_AVALID;
 	imr->ifm_active = IFM_ETHER | IFM_AUTO;
 
 	NET_EPOCH_ENTER(et);
 	/* The whole list is walked; there is no early exit on a match. */
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (LAGG_PORTACTIVE(lp))
 			imr->ifm_status |= IFM_ACTIVE;
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Recompute the link state and if_baudrate of the lagg interface from its
  * ports.
  *
  * Asserts that LAGG_XLOCK is held.  Does nothing when the protocol is
  * LACP.  The new link state is LINK_STATE_UP if any port's ifnet reports
  * LINK_STATE_UP, LINK_STATE_DOWN otherwise, and is delivered through
  * if_link_state_change().  if_baudrate is then set per protocol:
  * failover uses the primary port's rate (0 without a primary);
  * round-robin, loadbalance and broadcast use the sum over all ports.
  */
 static void
 lagg_linkstate(struct lagg_softc *sc)
 {
 	struct epoch_tracker et;
 	struct lagg_port *lp;
 	int new_link = LINK_STATE_DOWN;
 	uint64_t speed;
 
 	LAGG_XLOCK_ASSERT(sc);
 
 	/* LACP handles link state itself */
 	if (sc->sc_proto == LAGG_PROTO_LACP)
 		return;
 
 	/* Our link is considered up if at least one of our ports is active */
 	NET_EPOCH_ENTER(et);
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (lp->lp_ifp->if_link_state == LINK_STATE_UP) {
 			new_link = LINK_STATE_UP;
 			break;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 	if_link_state_change(sc->sc_ifp, new_link);
 
 	/* Update if_baudrate to reflect the max possible speed */
 	/*
 	 * There is no default case: for any protocol value not listed
 	 * below if_baudrate is left unchanged.
 	 */
 	switch (sc->sc_proto) {
 		case LAGG_PROTO_FAILOVER:
 			sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ?
 			    sc->sc_primary->lp_ifp->if_baudrate : 0;
 			break;
 		case LAGG_PROTO_ROUNDROBIN:
 		case LAGG_PROTO_LOADBALANCE:
 		case LAGG_PROTO_BROADCAST:
 			speed = 0;
 			NET_EPOCH_ENTER(et);
 			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 				speed += lp->lp_ifp->if_baudrate;
 			NET_EPOCH_EXIT(et);
 			sc->sc_ifp->if_baudrate = speed;
 			break;
 		case LAGG_PROTO_LACP:
 			/* LACP updates if_baudrate itself */
 			break;
 	}
 }
 
 /*
  * Link state notification for a port ifnet.
  *
  * Returns without doing anything when ifp has no lagg port attached
  * (if_lagg is NULL) or the port has no softc.  Otherwise, with LAGG_XLOCK
  * held, the lagg's own link state is recomputed with lagg_linkstate() and
  * the protocol is notified through lagg_proto_linkstate().
  *
  * The "state" argument is not used in this function.
  */
 static void
 lagg_port_state(struct ifnet *ifp, int state)
 {
 	struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg;
 	struct lagg_softc *sc = NULL;
 
 	if (lp != NULL)
 		sc = lp->lp_softc;
 	if (sc == NULL)
 		return;
 
 	LAGG_XLOCK(sc);
 	lagg_linkstate(sc);
 	lagg_proto_linkstate(sc, lp);
 	LAGG_XUNLOCK(sc);
 }
 
 /*
  * Pick an active port, preferring lp.
  *
  * Order of preference:
  *   1. lp itself, if LAGG_PORTACTIVE(lp);
  *   2. the port following lp on the list, if it is active;
  *   3. the first active port found on sc->sc_ports.
  * When lp is NULL only step 3 is performed.
  *
  * Returns the chosen port, or NULL when no port is active.
  */
 struct lagg_port *
 lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp)
 {
 	struct lagg_port *lp_next, *rval = NULL;
 
 	/*
 	 * Search a port which reports an active link state.
 	 */
 
 #ifdef INVARIANTS
 	/*
 	 * This is called either within the network epoch
 	 * or with LAGG_XLOCK(sc) held.
 	 */
 	if (!in_epoch(net_epoch_preempt))
 		LAGG_XLOCK_ASSERT(sc);
 #endif
 
 	if (lp == NULL)
 		goto search;
 	if (LAGG_PORTACTIVE(lp)) {
 		rval = lp;
 		goto found;
 	}
 	if ((lp_next = CK_SLIST_NEXT(lp, lp_entries)) != NULL &&
 	    LAGG_PORTACTIVE(lp_next)) {
 		rval = lp_next;
 		goto found;
 	}
 
 search:
 	CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
 		if (LAGG_PORTACTIVE(lp_next)) {
 			return (lp_next);
 		}
 	}
 	/* Nothing active: fall through with rval still NULL. */
 found:
 	return (rval);
 }
 
 /*
  * Hand packet m to port ifnet ifp through its if_transmit method and
  * return that method's result.
  *
  * With KERN_TLS or RATELIMIT, a packet carrying CSUM_SND_TAG has its send
  * tag treated as a lagg send tag.  If the tag wrapped inside it belongs
  * to a different ifnet than ifp, the packet is freed and EAGAIN is
  * returned.  Otherwise the packet's tag is replaced by a new reference to
  * the wrapped tag and the reference to the lagg tag is released before
  * transmission.
  */
 int
 lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
 {
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) {
 		struct lagg_snd_tag *lst;
 		struct m_snd_tag *mst;
 
 		mst = m->m_pkthdr.snd_tag;
 		lst = mst_to_lst(mst);
 		if (lst->tag->ifp != ifp) {
 			m_freem(m);
 			return (EAGAIN);
 		}
 		/* Swap the lagg tag for the wrapped tag. */
 		m->m_pkthdr.snd_tag = m_snd_tag_ref(lst->tag);
 		m_snd_tag_rele(mst);
 	}
 #endif
 	return (ifp->if_transmit)(ifp, m);
 }
 
 /*
  * Simple round robin aggregation
  */
 
 /*
  * Initialise the round-robin state of sc: the sequence counter starts at
  * 0 and the stride at 1.
  */
 static void
 lagg_rr_attach(struct lagg_softc *sc)
 {
 	sc->sc_seq = 0;
 	sc->sc_stride = 1;
 }
 
 /*
  * Round-robin transmit.
  *
  * The shared sequence counter is atomically fetched and incremented; the
  * fetched value divided by sc_stride, modulo sc_count, is the index of
  * the port to try, found by walking sc_ports from its head.
  * lagg_link_active() then substitutes another active port if that one is
  * not usable.
  *
  * Returns the result of lagg_enqueue(), or ENETDOWN (after freeing m and
  * counting an output error on the lagg ifnet) when no port is active.
  */
 static int
 lagg_rr_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_port *lp;
 	uint32_t p;
 
 	p = atomic_fetchadd_32(&sc->sc_seq, 1);
 	/*
 	 * NOTE(review): sc_stride and sc_count are used as divisors without
 	 * a check in this function; presumably callers guarantee both are
 	 * non-zero -- confirm.
 	 */
 	p /= sc->sc_stride;
 	p %= sc->sc_count;
 	lp = CK_SLIST_FIRST(&sc->sc_ports);
 
 	while (p--)
 		lp = CK_SLIST_NEXT(lp, lp_entries);
 
 	/*
 	 * Check the port's link state. This will return the next active
 	 * port if the link is down or the port is NULL.
 	 */
 	if ((lp = lagg_link_active(sc, lp)) == NULL) {
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Send mbuf */
 	return (lagg_enqueue(lp->lp_ifp, m));
 }
 
 /*
  * Round-robin input: every packet is accepted.  The receive interface of
  * m is rewritten to the lagg ifnet and m is returned.  lp is not used.
  */
 static struct mbuf *
 lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 
 	/* Just pass in the packet to our lagg device */
 	m->m_pkthdr.rcvif = ifp;
 
 	return (m);
 }
 
 /*
  * Broadcast mode
  */
 
 /*
  * Broadcast transmit: send m out of every active port.
  *
  * The port list is walked inside the network epoch (asserted).  Sending
  * lags one port behind the walk: when an active port is found, a copy of
  * m is enqueued on the previously found active port ("last").  After the
  * walk the original m goes to the remaining port, re-validated through
  * lagg_link_active().
  *
  * If a copy cannot be allocated the walk stops; the original m is still
  * sent on "last", later ports get nothing, and one output error is
  * counted on the lagg ifnet.
  *
  * Returns ENOENT if no active port was seen, ENETDOWN if
  * lagg_link_active() finds none afterwards (m is freed in both cases),
  * otherwise the result of the final lagg_enqueue().
  *
  * NOTE(review): active_ports is incremented but never read here; the
  * "ret = ENOBUFS" store is never returned (ret is reassigned or a
  * constant is returned); results of lagg_enqueue() for the copies are
  * ignored.
  */
 static int
 lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	int active_ports = 0;
 	int errors = 0;
 	int ret;
 	struct lagg_port *lp, *last = NULL;
 	struct mbuf *m0;
 
 	NET_EPOCH_ASSERT();
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 		if (!LAGG_PORTACTIVE(lp))
 			continue;
 
 		active_ports++;
 
 		if (last != NULL) {
 			m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 			if (m0 == NULL) {
 				ret = ENOBUFS;
 				errors++;
 				break;
 			}
 			lagg_enqueue(last->lp_ifp, m0);
 		}
 		last = lp;
 	}
 
 	if (last == NULL) {
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (ENOENT);
 	}
 	if ((last = lagg_link_active(sc, last)) == NULL) {
 		errors++;
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	ret = lagg_enqueue(last->lp_ifp, m);
 	if (errors != 0)
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors);
 
 	return (ret);
 }
 
 /*
  * Broadcast input: every packet is accepted.  The receive interface of m
  * is rewritten to the lagg ifnet and m is returned.  lp is not used.
  */
 static struct mbuf*
 lagg_bcast_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 
 	/* Just pass in the packet to our lagg device */
 	m->m_pkthdr.rcvif = ifp;
 	return (m);
 }
 
 /*
  * Active failover
  */
 
 /*
  * Failover transmit: send m on the port chosen by lagg_link_active()
  * starting from sc_primary.
  *
  * Returns the result of lagg_enqueue(), or ENETDOWN (after freeing m and
  * counting an output error on the lagg ifnet) when no port is active.
  */
 static int
 lagg_fail_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_port *lp;
 
 	/* Use the master port if active or the next available port */
 	if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) {
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Send mbuf */
 	return (lagg_enqueue(lp->lp_ifp, m));
 }
 
 /*
  * Failover input filter for a packet received on port lp.
  *
  * The packet is accepted (receive interface rewritten to the lagg ifnet,
  * m returned) when:
  *   - lp is the primary port, or V_lagg_failover_rx_all is set; or
  *   - the primary port is not active and lagg_link_active() returns
  *     either NULL or lp.
  * In every other case the packet is freed and NULL is returned.
  */
 static struct mbuf *
 lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 	struct lagg_port *tmp_tp;
 
 	if (lp == sc->sc_primary || V_lagg_failover_rx_all) {
 		m->m_pkthdr.rcvif = ifp;
 		return (m);
 	}
 
 	if (!LAGG_PORTACTIVE(sc->sc_primary)) {
 		tmp_tp = lagg_link_active(sc, sc->sc_primary);
 		/*
 		 * If tmp_tp is null, we've received a packet when all
 		 * our links are down. Weird, but process it anyways.
 		 */
 		if ((tmp_tp == NULL || tmp_tp == lp)) {
 			m->m_pkthdr.rcvif = ifp;
 			return (m);
 		}
 	}
 
 	m_freem(m);
 	return (NULL);
 }
 
 /*
  * Loadbalancing
  */
 
 /*
  * Set up load-balance state for sc.
  *
  * Asserts that LAGG_XLOCK is held.  A zeroed lagg_lb is allocated
  * (M_WAITOK), its hash key is taken from m_ether_tcpip_hash_init(), and
  * it is stored in sc->sc_psc.  lagg_lb_port_create() is then called for
  * every port already on sc_ports; its return value is not checked.
  */
 static void
 lagg_lb_attach(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 	struct lagg_lb *lb;
 
 	LAGG_XLOCK_ASSERT(sc);
 	lb = malloc(sizeof(struct lagg_lb), M_LAGG, M_WAITOK | M_ZERO);
 	lb->lb_key = m_ether_tcpip_hash_init();
 	sc->sc_psc = lb;
 
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lagg_lb_port_create(lp);
 }
 
 /*
  * Tear down load-balance state: free the lagg_lb stored in sc->sc_psc if
  * there is one.  sc->sc_psc itself is not cleared by this function.
  */
 static void
 lagg_lb_detach(struct lagg_softc *sc)
 {
 	struct lagg_lb *lb;
 
 	lb = (struct lagg_lb *)sc->sc_psc;
 	if (lb != NULL)
 		free(lb, M_LAGG);
 }
 
 /*
  * Rebuild the load-balance port table from sc->sc_ports.
  *
  * lb_ports is zeroed and then filled, in list order, with every port
  * except lp (pass lp == NULL to include all ports).  With IFF_DEBUG set
  * in sc_ifflags each assignment is printed.
  *
  * Asserts that LAGG_XLOCK is held.  Returns 0, or EINVAL when the list
  * holds more than LAGG_MAX_PORTS eligible ports; in that case the table
  * contains the first LAGG_MAX_PORTS of them.
  */
 static int
 lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp)
 {
 	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
 	struct lagg_port *lp_next;
 	int i = 0, rv;
 
 	rv = 0;
 	bzero(&lb->lb_ports, sizeof(lb->lb_ports));
 	LAGG_XLOCK_ASSERT(sc);
 	CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
 		if (lp_next == lp)
 			continue;
 		if (i >= LAGG_MAX_PORTS) {
 			rv = EINVAL;
 			break;
 		}
 		if (sc->sc_ifflags & IFF_DEBUG)
 			printf("%s: port %s at index %d\n",
 			    sc->sc_ifname, lp_next->lp_ifp->if_xname, i);
 		lb->lb_ports[i++] = lp_next;
 	}
 
 	return (rv);
 }
 
 /*
  * Load-balance hook for a new port: rebuild the port table with every
  * port on the list (no port excluded).  Returns the result of
  * lagg_lb_porttable().
  */
 static int
 lagg_lb_port_create(struct lagg_port *lp)
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	return (lagg_lb_porttable(sc, NULL));
 }
 
 /*
  * Load-balance hook for a departing port: rebuild the port table with lp
  * excluded.  The result of lagg_lb_porttable() is discarded.
  */
 static void
 lagg_lb_port_destroy(struct lagg_port *lp)
 {
 	struct lagg_softc *sc = lp->lp_softc;
 	lagg_lb_porttable(sc, lp);
 }
 
 /*
  * Load-balance transmit.
  *
  * A hash value is chosen for the packet: the mbuf's flowid shifted right
  * by sc->flowid_shift when LAGG_OPT_USE_FLOWID is set and the mbuf has a
  * hash type, otherwise m_ether_tcpip_hash() over the packet with the
  * table's key.  The value modulo sc_count indexes lb_ports;
  * lagg_link_active() then substitutes another active port if that slot
  * is empty or not usable.
  *
  * Returns the result of lagg_enqueue(), or ENETDOWN (after freeing m and
  * counting an output error on the lagg ifnet) when no port is active.
  */
 static int
 lagg_lb_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
 	struct lagg_port *lp = NULL;
 	uint32_t p = 0;
 
 	if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) &&
 	    M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
 		p = m->m_pkthdr.flowid >> sc->flowid_shift;
 	else
 		p = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key);
 	/*
 	 * NOTE(review): sc_count is used as a divisor without a check in
 	 * this function; presumably callers guarantee it is non-zero --
 	 * confirm.
 	 */
 	p %= sc->sc_count;
 	lp = lb->lb_ports[p];
 
 	/*
 	 * Check the port's link state. This will return the next active
 	 * port if the link is down or the port is NULL.
 	 */
 	if ((lp = lagg_link_active(sc, lp)) == NULL) {
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Send mbuf */
 	return (lagg_enqueue(lp->lp_ifp, m));
 }
 
 /*
  * Load-balance input: every packet is accepted.  The receive interface
  * of m is rewritten to the lagg ifnet and m is returned.  lp is not used.
  */
 static struct mbuf *
 lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 
 	/* Just pass in the packet to our lagg device */
 	m->m_pkthdr.rcvif = ifp;
 
 	return (m);
 }
 
 /*
  * 802.3ad LACP
  */
 
 /*
  * Attach LACP to sc: call lacp_attach() and then lacp_port_create() for
  * every port already on sc_ports.  LAGG_XLOCK is asserted after
  * lacp_attach() returns.
  */
 static void
 lagg_lacp_attach(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 
 	lacp_attach(sc);
 	LAGG_XLOCK_ASSERT(sc);
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_create(lp);
 }
 
 /*
  * Detach LACP from sc.
  *
  * Asserts that LAGG_XLOCK is held.  lacp_port_destroy() is called for
  * every port, then sc->sc_psc is cleared and its previous value is passed
  * to lacp_detach().
  */
 static void
 lagg_lacp_detach(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 	void *psc;
 
 	LAGG_XLOCK_ASSERT(sc);
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_destroy(lp);
 
 	/* Clear sc_psc before handing the old state to lacp_detach(). */
 	psc = sc->sc_psc;
 	sc->sc_psc = NULL;
 	lacp_detach(psc);
 }
 
 /*
  * LACP link-layer address hook: destroy the LACP state of every port on
  * sc_ports and then create it again for every port.  Asserts
  * LAGG_SXLOCK.
  */
 static void
 lagg_lacp_lladdr(struct lagg_softc *sc)
 {
 	struct lagg_port *lp;
 
 	LAGG_SXLOCK_ASSERT(sc);
 
 	/* purge all the lacp ports */
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_destroy(lp);
 
 	/* add them back in */
 	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 		lacp_port_create(lp);
 }
 
 /*
  * LACP transmit: send m on the port chosen by lacp_select_tx_port().
  *
  * Returns the result of lagg_enqueue().  When no port is selected, m is
  * freed, an output error is counted on the lagg ifnet, and the error code
  * stored by lacp_select_tx_port() in err is returned.
  */
 static int
 lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m)
 {
 	struct lagg_port *lp;
 	int err;
 
 	lp = lacp_select_tx_port(sc, m, &err);
 	if (lp == NULL) {
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (err);
 	}
 
 	/* Send mbuf */
 	return (lagg_enqueue(lp->lp_ifp, m));
 }
 
 /*
  * LACP input filter for a packet received on port lp.
  *
  * The Ethernet header at the start of the mbuf data is read.  A frame
  * with ether type ETHERTYPE_SLOW and without M_VLANTAG is passed to
  * lacp_input(); if that returns NULL, NULL is returned here as well.
  * Any remaining packet is freed (NULL returned) unless the port is both
  * collecting and active according to lacp_iscollecting() and
  * lacp_isactive().  Otherwise the receive interface of m is rewritten to
  * the lagg ifnet and m is returned.
  */
 static struct mbuf *
 lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
 {
 	struct ifnet *ifp = sc->sc_ifp;
 	struct ether_header *eh;
 	u_short etype;
 
 	eh = mtod(m, struct ether_header *);
 	etype = ntohs(eh->ether_type);
 
 	/* Tap off LACP control messages */
 	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) {
 		m = lacp_input(lp, m);
 		if (m == NULL)
 			return (NULL);
 	}
 
 	/*
 	 * If the port is not collecting or not in the active aggregator then
 	 * free and return.
 	 */
 	if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) {
 		m_freem(m);
 		return (NULL);
 	}
 
 	m->m_pkthdr.rcvif = ifp;
 	return (m);
 }
diff --git a/sys/net/if_var.h b/sys/net/if_var.h
index 052ec6b407a0..45fba9513a8b 100644
--- a/sys/net/if_var.h
+++ b/sys/net/if_var.h
@@ -1,801 +1,808 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)if.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef	_NET_IF_VAR_H_
 #define	_NET_IF_VAR_H_
 
 /*
  * Structures defining a network interface, providing a packet
  * transport mechanism (ala level 0 of the PUP protocols).
  *
  * Each interface accepts output datagrams of a specified maximum
  * length, and provides higher level routines with input datagrams
  * received from its medium.
  *
  * Output occurs when the routine if_output is called, with three parameters:
  *	(*ifp->if_output)(ifp, m, dst, ro)
  * Here m is the mbuf chain to be sent and dst is the destination address.
  * The output routine encapsulates the supplied datagram if necessary,
  * and then transmits it on its medium.
  *
  * On input, each interface unwraps the data received by it, and either
  * places it on the input queue of an internetwork datagram routine
  * and posts the associated software interrupt, or passes the datagram to a raw
  * packet input routine.
  *
  * Routines exist for locating interfaces by their addresses
  * or for locating an interface on a certain network, as well as more general
  * routing and gateway routines maintaining information used to locate
  * interfaces.  These routines live in the files if.c and route.c
  */
 
 struct	rtentry;		/* ifa_rtrequest */
 struct	socket;
 struct	carp_if;
 struct	carp_softc;
 struct  ifvlantrunk;
 struct	route;			/* if_output */
 struct	vnet;
 struct	ifmedia;
 struct	netmap_adapter;
 struct	debugnet_methods;
 
 #ifdef _KERNEL
 #include <sys/_eventhandler.h>
 #include <sys/mbuf.h>		/* ifqueue only? */
 #include <sys/buf_ring.h>
 #include <net/vnet.h>
 #endif /* _KERNEL */
 #include <sys/ck.h>
 #include <sys/counter.h>
 #include <sys/epoch.h>
 #include <sys/lock.h>		/* XXX */
 #include <sys/mutex.h>		/* struct ifqueue */
 #include <sys/rwlock.h>		/* XXX */
 #include <sys/sx.h>		/* XXX */
 #include <sys/_task.h>		/* if_link_task */
 #define	IF_DUNIT_NONE	-1
 
 #include <net/altq/if_altq.h>
 
 CK_STAILQ_HEAD(ifnethead, ifnet);	/* we use TAILQs so that the order of */
 CK_STAILQ_HEAD(ifaddrhead, ifaddr);	/* instantiation is preserved in the list */
 CK_STAILQ_HEAD(ifmultihead, ifmultiaddr);
 CK_STAILQ_HEAD(ifgrouphead, ifg_group);
 
 #ifdef _KERNEL
 VNET_DECLARE(struct pfil_head *, link_pfil_head);
 #define	V_link_pfil_head	VNET(link_pfil_head)
 #define	PFIL_ETHER_NAME		"ethernet"
 
 #define	HHOOK_IPSEC_INET	0
 #define	HHOOK_IPSEC_INET6	1
 #define	HHOOK_IPSEC_COUNT	2
 VNET_DECLARE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]);
 VNET_DECLARE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]);
 #define	V_ipsec_hhh_in	VNET(ipsec_hhh_in)
 #define	V_ipsec_hhh_out	VNET(ipsec_hhh_out)
 #endif /* _KERNEL */
 
 /*
  * Interface statistics counters.  The values index the if_counters[]
  * array of struct ifnet, which has IFCOUNTERS elements.
  */
 typedef enum {
 	IFCOUNTER_IPACKETS = 0,
 	IFCOUNTER_IERRORS,
 	IFCOUNTER_OPACKETS,
 	IFCOUNTER_OERRORS,
 	IFCOUNTER_COLLISIONS,
 	IFCOUNTER_IBYTES,
 	IFCOUNTER_OBYTES,
 	IFCOUNTER_IMCASTS,
 	IFCOUNTER_OMCASTS,
 	IFCOUNTER_IQDROPS,
 	IFCOUNTER_OQDROPS,
 	IFCOUNTER_NOPROTO,
 	IFCOUNTERS /* Array size. */
 } ift_counter;
 
 typedef struct ifnet * if_t;
 
 typedef	void (*if_start_fn_t)(if_t);
 typedef	int (*if_ioctl_fn_t)(if_t, u_long, caddr_t);
 typedef	void (*if_init_fn_t)(void *);
 typedef void (*if_qflush_fn_t)(if_t);
 typedef int (*if_transmit_fn_t)(if_t, struct mbuf *);
 typedef	uint64_t (*if_get_counter_t)(if_t, ift_counter);
 
 /* The three TSO limits of a network adapter, grouped in one structure. */
 struct ifnet_hw_tsomax {
 	u_int	tsomaxbytes;	/* TSO total burst length limit in bytes */
 	u_int	tsomaxsegcount;	/* TSO maximum segment count */
 	u_int	tsomaxsegsize;	/* TSO maximum segment size in bytes */
 };
 
 /* Interface encap request types (the rtype field of struct if_encap_req) */
 typedef enum {
 	IFENCAP_LL = 1			/* pre-calculate link-layer header */
 } ife_type;
 
 /*
  * The structure below allows requesting various pre-calculated L2/L3 headers
  * for different media. Requests vary by type (rtype field).
  *
  * IFENCAP_LL type: pre-calculates link header based on address family
  *   and destination lladdr.
  *
  *   Input data fields:
  *     buf: pointer to destination buffer
  *     bufsize: buffer size
  *     flags: IFENCAP_FLAG_BROADCAST if destination is broadcast
  *     family: address family defined by AF_ constant.
  *     lladdr: pointer to link-layer address
  *     lladdr_len: length of link-layer address
  *     hdata: pointer to L3 header (optional, used for ARP requests).
  *   Output data fields:
  *     buf: encap data is stored here
  *     bufsize: resulting encap length is stored here
  *     lladdr_off: offset of link-layer address from encap hdr start
  *     hdata: L3 header may be altered if necessary
  */
 
 /*
  * Argument of the ifnet if_requestencap method.  The (r)/(w)/(rw) tags on
  * the fields mark them as read, written, or both -- presumably from the
  * point of view of the routine servicing the request (TODO confirm).
  */
 struct if_encap_req {
 	u_char		*buf;		/* Destination buffer (w) */
 	size_t		bufsize;	/* size of provided buffer (r) */
 	ife_type	rtype;		/* request type (r) */
 	uint32_t	flags;		/* Request flags (r) */
 	int		family;		/* Address family AF_* (r) */
 	int		lladdr_off;	/* offset from header start (w) */
 	int		lladdr_len;	/* lladdr length (r) */
 	char		*lladdr;	/* link-level address pointer (r) */
 	char		*hdata;		/* Upper layer header data (rw) */
 };
 
 #define	IFENCAP_FLAG_BROADCAST	0x02	/* Destination is broadcast */
 
 /*
  * Network interface send tag support. The storage of "struct
  * m_snd_tag" comes from the network driver and it is free to allocate
  * as much additional space as it wants for its own use.
  */
 struct ktls_session;
 struct m_snd_tag;
 
 #define	IF_SND_TAG_TYPE_RATE_LIMIT 0
 #define	IF_SND_TAG_TYPE_UNLIMITED 1
 #define	IF_SND_TAG_TYPE_TLS 2
 #define	IF_SND_TAG_TYPE_TLS_RATE_LIMIT 3
 #define	IF_SND_TAG_TYPE_MAX 4
 
 /*
  * Common header of a send tag allocation request.  It is the first member
  * of every if_snd_tag_alloc_* structure and of
  * union if_snd_tag_alloc_params.
  */
 struct if_snd_tag_alloc_header {
 	uint32_t type;		/* send tag type, see IF_SND_TAG_XXX */
 	uint32_t flowid;	/* mbuf hash value */
 	uint32_t flowtype;	/* mbuf hash type */
 	uint8_t numa_domain;	/* numa domain of associated inp */
 };
 
 /*
  * Allocation parameters for a rate limit send tag.  The same structure is
  * used for the "unlimited" member of union if_snd_tag_alloc_params.
  */
 struct if_snd_tag_alloc_rate_limit {
 	struct if_snd_tag_alloc_header hdr;
 	uint64_t max_rate;	/* in bytes/s */
 	uint32_t flags;		/* M_NOWAIT or M_WAITOK */
 	uint32_t reserved;	/* alignment */
 };
 
 /*
  * Allocation parameters for a TLS send tag: the common header plus
  * pointers to an inpcb and a ktls_session.
  */
 struct if_snd_tag_alloc_tls {
 	struct if_snd_tag_alloc_header hdr;
 	struct inpcb *inp;
 	const struct ktls_session *tls;
 };
 
 /*
  * Allocation parameters for a combined TLS and rate limit send tag: the
  * fields of struct if_snd_tag_alloc_tls followed by a maximum rate.
  */
 struct if_snd_tag_alloc_tls_rate_limit {
 	struct if_snd_tag_alloc_header hdr;
 	struct inpcb *inp;
 	const struct ktls_session *tls;
 	uint64_t max_rate;	/* in bytes/s */
 };
 
 /*
  * Rate limit parameters of an existing send tag.  Every member of
  * union if_snd_tag_modify_params and union if_snd_tag_query_params has
  * this type.
  */
 struct if_snd_tag_rate_limit_params {
 	uint64_t max_rate;	/* in bytes/s */
 	uint32_t queue_level;	/* 0 (empty) .. 65535 (full) */
 #define	IF_SND_QUEUE_LEVEL_MIN 0
 #define	IF_SND_QUEUE_LEVEL_MAX 65535
 	uint32_t flags;		/* M_NOWAIT or M_WAITOK */
 };
 
 /*
  * Argument of if_snd_tag_alloc_t.  Every member starts with a
  * struct if_snd_tag_alloc_header, so hdr can be read whichever member
  * was filled in.
  */
 union if_snd_tag_alloc_params {
 	struct if_snd_tag_alloc_header hdr;
 	struct if_snd_tag_alloc_rate_limit rate_limit;
 	struct if_snd_tag_alloc_rate_limit unlimited;
 	struct if_snd_tag_alloc_tls tls;
 	struct if_snd_tag_alloc_tls_rate_limit tls_rate_limit;
 };
 
 /* Argument of if_snd_tag_modify_t; all members share one layout. */
 union if_snd_tag_modify_params {
 	struct if_snd_tag_rate_limit_params rate_limit;
 	struct if_snd_tag_rate_limit_params unlimited;
 	struct if_snd_tag_rate_limit_params tls_rate_limit;
 };
 
 /* Argument of if_snd_tag_query_t; all members share one layout. */
 union if_snd_tag_query_params {
 	struct if_snd_tag_rate_limit_params rate_limit;
 	struct if_snd_tag_rate_limit_params unlimited;
 	struct if_snd_tag_rate_limit_params tls_rate_limit;
 };
 
+typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
+    struct m_snd_tag **);
+typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *);
+typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *);
+typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
+typedef struct m_snd_tag *(if_next_send_tag_t)(struct m_snd_tag *);
+
+struct if_snd_tag_sw {			/* send tag method table */
+	if_snd_tag_modify_t *snd_tag_modify;	/* lagg calls tag->sw->snd_tag_modify */
+	if_snd_tag_query_t *snd_tag_query;	/* lagg calls tag->sw->snd_tag_query */
+	if_snd_tag_free_t *snd_tag_free;	/* takes the tag, returns nothing */
+	if_next_send_tag_t *next_snd_tag;	/* takes a tag, returns a tag */
+	u_int	type;			/* One of IF_SND_TAG_TYPE_*. */
+};
+
 /* Query return flags */
 #define RT_NOSUPPORT	  0x00000000	/* Not supported */
 #define RT_IS_INDIRECT    0x00000001	/*
 					 * Interface like a lagg, select
 					 * the actual interface for
 					 * capabilities.
 					 */
 #define RT_IS_SELECTABLE  0x00000002	/*
 					 * No rate table, you select
 					 * rates and the first
 					 * number_of_rates are created.
 					 */
 #define RT_IS_FIXED_TABLE 0x00000004	/* A fixed table is attached */
 #define RT_IS_UNUSABLE	  0x00000008	/* It is not usable for this */
 #define RT_IS_SETUP_REQ	  0x00000010	/* The interface setup must be called before use */
 
 /*
  * Result structure filled in by an if_ratelimit_query_t routine.  The
  * flags field takes the RT_* "Query return flags" defined above.
  */
 struct if_ratelimit_query_results {
 	const uint64_t *rate_table;	/* Pointer to table if present */
 	uint32_t flags;			/* Flags indicating results */
 	uint32_t max_flows;		/* Max flows using, 0=unlimited */
 	uint32_t number_of_rates;	/* How many unique rates can be created */
 	uint32_t min_segment_burst;	/* The amount the adapter bursts at each send */
 };
 
-typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
-    struct m_snd_tag **);
-typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *);
-typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *);
-typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
-typedef struct m_snd_tag *(if_next_send_tag_t)(struct m_snd_tag *);
 typedef void (if_ratelimit_query_t)(struct ifnet *,
     struct if_ratelimit_query_results *);
 typedef int (if_ratelimit_setup_t)(struct ifnet *, uint64_t, uint32_t);
 
 /*
  * Structure defining a network interface.
  */
 struct ifnet {
 	/* General book keeping of interface lists. */
 	CK_STAILQ_ENTRY(ifnet) if_link; 	/* all struct ifnets are chained (CK_) */
 	LIST_ENTRY(ifnet) if_clones;	/* interfaces of a cloner */
 	CK_STAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if (CK_) */
 					/* protected by if_addr_lock */
 	u_char	if_alloctype;		/* if_type at time of allocation */
 	uint8_t	if_numa_domain;		/* NUMA domain of device */
 	/* Driver and protocol specific information that remains stable. */
 	void	*if_softc;		/* pointer to driver state */
 	void	*if_llsoftc;		/* link layer softc */
 	void	*if_l2com;		/* pointer to protocol bits */
 	const char *if_dname;		/* driver name */
 	int	if_dunit;		/* unit or IF_DUNIT_NONE */
 	u_short	if_index;		/* numeric abbreviation for this if  */
 	short	if_index_reserved;	/* spare space to grow if_index */
 	char	if_xname[IFNAMSIZ];	/* external name (name + unit) */
 	char	*if_description;	/* interface description */
 
 	/* Variable fields that are touched by the stack and drivers. */
 	int	if_flags;		/* up/down, broadcast, etc. */
 	int	if_drv_flags;		/* driver-managed status flags */
 	int	if_capabilities;	/* interface features & capabilities */
 	int	if_capenable;		/* enabled features & capabilities */
 	void	*if_linkmib;		/* link-type-specific MIB data */
 	size_t	if_linkmiblen;		/* length of above data */
 	u_int	if_refcount;		/* reference count */
 
 	/* These fields are shared with struct if_data. */
 	uint8_t		if_type;	/* ethernet, tokenring, etc */
 	uint8_t		if_addrlen;	/* media address length */
 	uint8_t		if_hdrlen;	/* media header length */
 	uint8_t		if_link_state;	/* current link state */
 	uint32_t	if_mtu;		/* maximum transmission unit */
 	uint32_t	if_metric;	/* routing metric (external only) */
 	uint64_t	if_baudrate;	/* linespeed */
 	uint64_t	if_hwassist;	/* HW offload capabilities, see IFCAP */
 	time_t		if_epoch;	/* uptime at attach or stat reset */
 	struct timeval	if_lastchange;	/* time of last administrative change */
 
 	struct  ifaltq if_snd;		/* output queue (includes altq) */
 	struct	task if_linktask;	/* task for link change events */
 	struct	task if_addmultitask;	/* task for SIOCADDMULTI */
 
 	/* Addresses of different protocol families assigned to this if. */
 	struct mtx if_addr_lock;	/* lock to protect address lists */
 		/*
 		 * if_addrhead is the list of all addresses associated to
 		 * an interface.
 		 * Some code in the kernel assumes that first element
 		 * of the list has type AF_LINK, and contains sockaddr_dl
 		 * addresses which store the link-level address and the name
 		 * of the interface.
 		 * However, access to the AF_LINK address through this
 		 * field is deprecated. Use if_addr or ifaddr_byindex() instead.
 		 */
 	struct	ifaddrhead if_addrhead;	/* linked list of addresses per if */
 	struct	ifmultihead if_multiaddrs; /* multicast addresses configured */
 	int	if_amcount;		/* number of all-multicast requests */
 	struct	ifaddr	*if_addr;	/* pointer to link-level address */
 	void	*if_hw_addr;		/* hardware link-level address */
 	const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */
 	struct	mtx if_afdata_lock;
 	void	*if_afdata[AF_MAX];
 	int	if_afdata_initialized;
 
 	/* Additional features hung off the interface. */
 	u_int	if_fib;			/* interface FIB */
 	struct	vnet *if_vnet;		/* pointer to network stack instance */
 	struct	vnet *if_home_vnet;	/* where this ifnet originates from */
 	struct  ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */
 	struct	bpf_if *if_bpf;		/* packet filter structure */
 	int	if_pcount;		/* number of promiscuous listeners */
 	void	*if_bridge;		/* bridge glue */
 	void	*if_lagg;		/* lagg glue */
 	void	*if_pf_kif;		/* pf glue */
 	struct	carp_if *if_carp;	/* carp interface structure */
 	struct	label *if_label;	/* interface MAC label */
 	struct	netmap_adapter *if_netmap; /* netmap(4) softc */
 
 	/* Various procedures of the layer2 encapsulation and drivers. */
 	int	(*if_output)		/* output routine (enqueue) */
 		(struct ifnet *, struct mbuf *, const struct sockaddr *,
 		     struct route *);
 	void	(*if_input)		/* input routine (from h/w driver) */
 		(struct ifnet *, struct mbuf *);
 	struct mbuf *(*if_bridge_input)(struct ifnet *, struct mbuf *);
 	int	(*if_bridge_output)(struct ifnet *, struct mbuf *, struct sockaddr *,
 		    struct rtentry *);
 	void (*if_bridge_linkstate)(struct ifnet *ifp);
 	if_start_fn_t	if_start;	/* initiate output routine */
 	if_ioctl_fn_t	if_ioctl;	/* ioctl routine */
 	if_init_fn_t	if_init;	/* Init routine */
 	int	(*if_resolvemulti)	/* validate/resolve multicast */
 		(struct ifnet *, struct sockaddr **, struct sockaddr *);
 	if_qflush_fn_t	if_qflush;	/* flush any queue */
 	if_transmit_fn_t if_transmit;   /* initiate output routine */
 
 	void	(*if_reassign)		/* reassign to vnet routine */
 		(struct ifnet *, struct vnet *, char *);
 	if_get_counter_t if_get_counter; /* get counter values */
 	int	(*if_requestencap)	/* make link header from request */
 		(struct ifnet *, struct if_encap_req *);
 
 	/* Statistics. */
 	counter_u64_t	if_counters[IFCOUNTERS];
 
 	/* Stuff that's only temporary and doesn't belong here. */
 
 	/*
 	 * Network adapter TSO limits:
 	 * ===========================
 	 *
 	 * If the "if_hw_tsomax" field is zero the maximum segment
 	 * length limit does not apply. If the "if_hw_tsomaxsegcount"
 	 * or the "if_hw_tsomaxsegsize" field is zero the TSO segment
 	 * count limit does not apply. If all three fields are zero,
 	 * there is no TSO limit.
 	 *
 	 * NOTE: The TSO limits should reflect the values used in the
 	 * BUSDMA tag a network adapter is using to load a mbuf chain
 	 * for transmission. The TCP/IP network stack will subtract
 	 * space for all linklevel and protocol level headers and
 	 * ensure that the full mbuf chain passed to the network
 	 * adapter fits within the given limits.
 	 */
 	u_int	if_hw_tsomax;		/* TSO maximum size in bytes */
 	u_int	if_hw_tsomaxsegcount;	/* TSO maximum segment count */
 	u_int	if_hw_tsomaxsegsize;	/* TSO maximum segment size in bytes */
 
 	/*
 	 * Network adapter send tag support:
 	 */
 	if_snd_tag_alloc_t *if_snd_tag_alloc;
-	if_snd_tag_modify_t *if_snd_tag_modify;
-	if_snd_tag_query_t *if_snd_tag_query;
-	if_snd_tag_free_t *if_snd_tag_free;
-	if_next_send_tag_t *if_next_snd_tag;
+
+	/* Ratelimit (packet pacing) */
 	if_ratelimit_query_t *if_ratelimit_query;
 	if_ratelimit_setup_t *if_ratelimit_setup;
 
 	/* Ethernet PCP */
 	uint8_t if_pcp;
 
 	/*
 	 * Debugnet (Netdump) hooks to be called while in db/panic.
 	 */
 	struct debugnet_methods *if_debugnet_methods;
 	struct epoch_context	if_epoch_ctx;
 
 	/*
 	 * Spare fields to be added before branching a stable branch, so
 	 * that structure can be enhanced without changing the kernel
 	 * binary interface.
 	 */
 	int	if_ispare[4];		/* general use */
 };
 
 /* for compatibility with other BSDs */
 #define	if_name(ifp)	((ifp)->if_xname)
 
 #define	IF_NODOM	255
 /*
  * Locks for address lists on the network interface.
  */
 #define	IF_ADDR_LOCK_INIT(if)	mtx_init(&(if)->if_addr_lock, "if_addr_lock", NULL, MTX_DEF)
 #define	IF_ADDR_LOCK_DESTROY(if)	mtx_destroy(&(if)->if_addr_lock)
 
 #define	IF_ADDR_WLOCK(if)	mtx_lock(&(if)->if_addr_lock)
 #define	IF_ADDR_WUNLOCK(if)	mtx_unlock(&(if)->if_addr_lock)
 #define	IF_ADDR_LOCK_ASSERT(if)	MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(if)->if_addr_lock))
 #define	IF_ADDR_WLOCK_ASSERT(if) mtx_assert(&(if)->if_addr_lock, MA_OWNED)
 
 #ifdef _KERNEL
 /* interface link layer address change event */
 typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t);
 /* interface address change event */
 typedef void (*ifaddr_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(ifaddr_event, ifaddr_event_handler_t);
 typedef void (*ifaddr_event_ext_handler_t)(void *, struct ifnet *,
     struct ifaddr *, int);
 EVENTHANDLER_DECLARE(ifaddr_event_ext, ifaddr_event_ext_handler_t);
 #define	IFADDR_EVENT_ADD	0
 #define	IFADDR_EVENT_DEL	1
 /* new interface arrival event */
 typedef void (*ifnet_arrival_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(ifnet_arrival_event, ifnet_arrival_event_handler_t);
 /* interface departure event */
 typedef void (*ifnet_departure_event_handler_t)(void *, struct ifnet *);
 EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t);
 /* Interface link state change event */
 typedef void (*ifnet_link_event_handler_t)(void *, struct ifnet *, int);
 EVENTHANDLER_DECLARE(ifnet_link_event, ifnet_link_event_handler_t);
 /* Interface up/down event */
 #define IFNET_EVENT_UP		0
 #define IFNET_EVENT_DOWN	1
 #define IFNET_EVENT_PCP		2	/* priority code point, PCP */
 
 typedef void (*ifnet_event_fn)(void *, struct ifnet *ifp, int event);
 EVENTHANDLER_DECLARE(ifnet_event, ifnet_event_fn);
 
 /*
  * interface groups
  */
 /*
  * One interface group record.  It carries a fixed-size character buffer
  * (IFNAMSIZ bytes), a counter, an opaque pointer, the head of this group's
  * list of ifg_member entries, and the linkage that chains ifg_group records
  * together.  Both list fields use the CK_STAILQ (Concurrency Kit) macros.
  */
 struct ifg_group {
 	/* presumably the group's name -- confirm against if_addgroup() */
 	char				 ifg_group[IFNAMSIZ];
 	/* by its name a reference count; the update sites are not in this header */
 	u_int				 ifg_refcnt;
 	/* opaque; by its name pf glue, like if_pf_kif in struct ifnet */
 	void				*ifg_pf_kif;
 	CK_STAILQ_HEAD(, ifg_member)	 ifg_members; /* (CK_) */
 	CK_STAILQ_ENTRY(ifg_group)		 ifg_next; /* (CK_) */
 };
 
 /*
  * One entry on an ifg_group's ifg_members list: the list linkage plus a
  * pointer to an ifnet.
  */
 struct ifg_member {
 	CK_STAILQ_ENTRY(ifg_member)	 ifgm_next; /* (CK_) */
 	struct ifnet		*ifgm_ifp;	/* presumably the member interface */
 };
 
 /*
  * List node that refers to one ifg_group.  The list head these nodes hang
  * off is not declared in this part of the header.
  * NOTE(review): presumably the per-interface list of group memberships --
  * confirm against struct ifnet.
  */
 struct ifg_list {
 	struct ifg_group	*ifgl_group;
 	CK_STAILQ_ENTRY(ifg_list)	 ifgl_next; /* (CK_) */
 };
 
 #ifdef _SYS_EVENTHANDLER_H_
 /* group attach event */
 typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *);
 EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t);
 /* group detach event */
 typedef void (*group_detach_event_handler_t)(void *, struct ifg_group *);
 EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t);
 /* group change event */
 typedef void (*group_change_event_handler_t)(void *, const char *);
 EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t);
 #endif /* _SYS_EVENTHANDLER_H_ */
 
 #define	IF_AFDATA_LOCK_INIT(ifp)	\
 	mtx_init(&(ifp)->if_afdata_lock, "if_afdata", NULL, MTX_DEF)
 
 #define	IF_AFDATA_WLOCK(ifp)	mtx_lock(&(ifp)->if_afdata_lock)
 #define	IF_AFDATA_WUNLOCK(ifp)	mtx_unlock(&(ifp)->if_afdata_lock)
 #define	IF_AFDATA_LOCK(ifp)	IF_AFDATA_WLOCK(ifp)
 #define	IF_AFDATA_UNLOCK(ifp)	IF_AFDATA_WUNLOCK(ifp)
 #define	IF_AFDATA_TRYLOCK(ifp)	mtx_trylock(&(ifp)->if_afdata_lock)
 #define	IF_AFDATA_DESTROY(ifp)	mtx_destroy(&(ifp)->if_afdata_lock)
 
 #define	IF_AFDATA_LOCK_ASSERT(ifp)	MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ifp)->if_afdata_lock))
 #define	IF_AFDATA_WLOCK_ASSERT(ifp)	mtx_assert(&(ifp)->if_afdata_lock, MA_OWNED)
 #define	IF_AFDATA_UNLOCK_ASSERT(ifp)	mtx_assert(&(ifp)->if_afdata_lock, MA_NOTOWNED)
 
 /*
  * 72 was chosen below because it is the size of a TCP/IP
  * header (40) + the minimum mss (32).
  */
 #define	IF_MINMTU	72
 #define	IF_MAXMTU	65535
 
 #define	TOEDEV(ifp)	((ifp)->if_llsoftc)
 
 /*
  * The ifaddr structure contains information about one address
  * of an interface.  They are maintained by the different address families,
  * are allocated and attached when an address is set, and are linked
  * together so all addresses for an interface can be located.
  *
  * NOTE: a 'struct ifaddr' is always at the beginning of a larger
  * chunk of malloc'ed memory, where we store the three addresses
  * (ifa_addr, ifa_dstaddr and ifa_netmask) referenced here.
  */
 struct ifaddr {
 	struct	sockaddr *ifa_addr;	/* address of interface */
 	struct	sockaddr *ifa_dstaddr;	/* other end of p-to-p link */
 	/* ifa_broadaddr is an alias for the ifa_dstaddr field above. */
 #define	ifa_broadaddr	ifa_dstaddr	/* broadcast address interface */
 	struct	sockaddr *ifa_netmask;	/* used to determine subnet */
 	struct	ifnet *ifa_ifp;		/* back-pointer to interface */
 	struct	carp_softc *ifa_carp;	/* pointer to CARP data */
 	CK_STAILQ_ENTRY(ifaddr) ifa_link;	/* queue macro glue */
 	u_short	ifa_flags;		/* mostly rt_flags for cloning */
 	/* The two IFA_* names below are aliases for routing flag values. */
 #define	IFA_ROUTE	RTF_UP		/* route installed */
 #define	IFA_RTSELF	RTF_HOST	/* loopback route to self installed */
 	u_int	ifa_refcnt;		/* references to this structure */
 
 	/*
 	 * Per-address counter(9) handles.  By their names these count input
 	 * and output packets and bytes; the update sites are not in this
 	 * header.
 	 */
 	counter_u64_t	ifa_ipackets;
 	counter_u64_t	ifa_opackets;
 	counter_u64_t	ifa_ibytes;
 	counter_u64_t	ifa_obytes;
 	/* epoch(9) context; presumably used to defer the free -- confirm */
 	struct	epoch_context	ifa_epoch_ctx;
 };
 
 struct ifaddr *	ifa_alloc(size_t size, int flags);
 void	ifa_free(struct ifaddr *ifa);
 void	ifa_ref(struct ifaddr *ifa);
 int __result_use_check ifa_try_ref(struct ifaddr *ifa);
 
 /*
  * Multicast address structure.  This is analogous to the ifaddr
  * structure except that it keeps track of multicast addresses.
  */
 #define IFMA_F_ENQUEUED		0x1
 struct ifmultiaddr {
 	CK_STAILQ_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */
 	struct	sockaddr *ifma_addr; 	/* address this membership is for */
 	struct	sockaddr *ifma_lladdr;	/* link-layer translation, if any */
 	struct	ifnet *ifma_ifp;	/* back-pointer to interface */
 	u_int	ifma_refcount;		/* reference count */
 	/* flag bits; presumably where IFMA_F_ENQUEUED is kept -- confirm */
 	int	ifma_flags;
 	void	*ifma_protospec;	/* protocol-specific state, if any */
 	struct	ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */
 	/* epoch(9) context; presumably used to defer the free -- confirm */
 	struct	epoch_context	ifma_epoch_ctx;
 };
 
 extern	struct sx ifnet_sxlock;
 
 #define	IFNET_WLOCK()		sx_xlock(&ifnet_sxlock)
 #define	IFNET_WUNLOCK()		sx_xunlock(&ifnet_sxlock)
 #define	IFNET_RLOCK_ASSERT()	sx_assert(&ifnet_sxlock, SA_SLOCKED)
 #define	IFNET_WLOCK_ASSERT()	sx_assert(&ifnet_sxlock, SA_XLOCKED)
 #define	IFNET_RLOCK()		sx_slock(&ifnet_sxlock)
 #define	IFNET_RUNLOCK()		sx_sunlock(&ifnet_sxlock)
 
 /*
  * Look up an ifnet given its index; the _ref variant also acquires a
  * reference that must be freed using if_rele().  It is almost always a bug
  * to call ifnet_byindex() instead of ifnet_byindex_ref().
  */
 struct ifnet	*ifnet_byindex(u_short idx);
 struct ifnet	*ifnet_byindex_ref(u_short idx);
 
 /*
  * Given the index, ifaddr_byindex() returns the one and only
  * link-level ifaddr for the interface. You are not supposed to use
  * it to traverse the list of addresses associated to the interface.
  */
 struct ifaddr	*ifaddr_byindex(u_short idx);
 
 VNET_DECLARE(struct ifnethead, ifnet);
 VNET_DECLARE(struct ifgrouphead, ifg_head);
 VNET_DECLARE(int, if_index);
 VNET_DECLARE(struct ifnet *, loif);	/* first loopback interface */
 
 #define	V_ifnet		VNET(ifnet)
 #define	V_ifg_head	VNET(ifg_head)
 #define	V_if_index	VNET(if_index)
 #define	V_loif		VNET(loif)
 
 #ifdef MCAST_VERBOSE
 #define MCDPRINTF printf
 #else
 #define MCDPRINTF(...)
 #endif
 
 int	if_addgroup(struct ifnet *, const char *);
 int	if_delgroup(struct ifnet *, const char *);
 int	if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **);
 int	if_allmulti(struct ifnet *, int);
 struct	ifnet* if_alloc(u_char);
 struct	ifnet* if_alloc_dev(u_char, device_t dev);
 struct	ifnet* if_alloc_domain(u_char, int numa_domain);
 void	if_attach(struct ifnet *);
 void	if_dead(struct ifnet *);
 int	if_delmulti(struct ifnet *, struct sockaddr *);
 void	if_delmulti_ifma(struct ifmultiaddr *);
 void	if_delmulti_ifma_flags(struct ifmultiaddr *, int flags);
 void	if_detach(struct ifnet *);
 void	if_purgeaddrs(struct ifnet *);
 void	if_delallmulti(struct ifnet *);
 void	if_down(struct ifnet *);
 struct ifmultiaddr *
 	if_findmulti(struct ifnet *, const struct sockaddr *);
 void	if_freemulti(struct ifmultiaddr *ifma);
 void	if_free(struct ifnet *);
 void	if_initname(struct ifnet *, const char *, int);
 void	if_link_state_change(struct ifnet *, int);
 int	if_printf(struct ifnet *, const char *, ...) __printflike(2, 3);
 int	if_log(struct ifnet *, int, const char *, ...) __printflike(3, 4);
 void	if_ref(struct ifnet *);
 void	if_rele(struct ifnet *);
 bool	__result_use_check if_try_ref(struct ifnet *);
 int	if_setlladdr(struct ifnet *, const u_char *, int);
 int	if_tunnel_check_nesting(struct ifnet *, struct mbuf *, uint32_t, int);
 void	if_up(struct ifnet *);
 int	ifioctl(struct socket *, u_long, caddr_t, struct thread *);
 int	ifpromisc(struct ifnet *, int);
 struct	ifnet *ifunit(const char *);
 struct	ifnet *ifunit_ref(const char *);
 
 int	ifa_add_loopback_route(struct ifaddr *, struct sockaddr *);
 int	ifa_del_loopback_route(struct ifaddr *, struct sockaddr *);
 int	ifa_switch_loopback_route(struct ifaddr *, struct sockaddr *);
 
 struct	ifaddr *ifa_ifwithaddr(const struct sockaddr *);
 int		ifa_ifwithaddr_check(const struct sockaddr *);
 struct	ifaddr *ifa_ifwithbroadaddr(const struct sockaddr *, int);
 struct	ifaddr *ifa_ifwithdstaddr(const struct sockaddr *, int);
 struct	ifaddr *ifa_ifwithnet(const struct sockaddr *, int, int);
 struct	ifaddr *ifa_ifwithroute(int, const struct sockaddr *,
     const struct sockaddr *, u_int);
 struct	ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *);
 int	ifa_preferred(struct ifaddr *, struct ifaddr *);
 
 int	if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen);
 
 typedef	void *if_com_alloc_t(u_char type, struct ifnet *ifp);
 typedef	void if_com_free_t(void *com, u_char type);
 void	if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f);
 void	if_deregister_com_alloc(u_char type);
 void	if_data_copy(struct ifnet *, struct if_data *);
 uint64_t if_get_counter_default(struct ifnet *, ift_counter);
 void	if_inc_counter(struct ifnet *, ift_counter, int64_t);
 
 #define IF_LLADDR(ifp)							\
     LLADDR((struct sockaddr_dl *)((ifp)->if_addr->ifa_addr))
 
 uint64_t if_setbaudrate(if_t ifp, uint64_t baudrate);
 uint64_t if_getbaudrate(if_t ifp);
 int if_setcapabilities(if_t ifp, int capabilities);
 int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit);
 int if_getcapabilities(if_t ifp);
 int if_togglecapenable(if_t ifp, int togglecap);
 int if_setcapenable(if_t ifp, int capenable);
 int if_setcapenablebit(if_t ifp, int setcap, int clearcap);
 int if_getcapenable(if_t ifp);
 const char *if_getdname(if_t ifp);
 int if_setdev(if_t ifp, void *dev);
 int if_setdrvflagbits(if_t ifp, int if_setflags, int clear_flags);
 int if_getdrvflags(if_t ifp);
 int if_setdrvflags(if_t ifp, int flags);
 int if_clearhwassist(if_t ifp);
 int if_sethwassistbits(if_t ifp, int toset, int toclear);
 int if_sethwassist(if_t ifp, int hwassist_bit);
 int if_gethwassist(if_t ifp);
 int if_setsoftc(if_t ifp, void *softc);
 void *if_getsoftc(if_t ifp);
 int if_setflags(if_t ifp, int flags);
 int if_gethwaddr(if_t ifp, struct ifreq *);
 int if_setmtu(if_t ifp, int mtu);
 int if_getmtu(if_t ifp);
 int if_getmtu_family(if_t ifp, int family);
 int if_setflagbits(if_t ifp, int set, int clear);
 int if_getflags(if_t ifp);
 int if_sendq_empty(if_t ifp);
 int if_setsendqready(if_t ifp);
 int if_setsendqlen(if_t ifp, int tx_desc_count);
 int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax);
 int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount);
 int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize);
 u_int if_gethwtsomax(if_t ifp);
 u_int if_gethwtsomaxsegcount(if_t ifp);
 u_int if_gethwtsomaxsegsize(if_t ifp);
 int if_input(if_t ifp, struct mbuf* sendmp);
 int if_sendq_prepend(if_t ifp, struct mbuf *m);
 struct mbuf *if_dequeue(if_t ifp);
 int if_setifheaderlen(if_t ifp, int len);
 void if_setrcvif(struct mbuf *m, if_t ifp);
 void if_setvtag(struct mbuf *m, u_int16_t tag);
 u_int16_t if_getvtag(struct mbuf *m);
 int if_vlantrunkinuse(if_t ifp);
 caddr_t if_getlladdr(if_t ifp);
 void *if_gethandle(u_char);
 void if_bpfmtap(if_t ifp, struct mbuf *m);
 void if_etherbpfmtap(if_t ifp, struct mbuf *m);
 void if_vlancap(if_t ifp);
 
 /*
  * Traversing through interface address lists.
  */
 struct sockaddr_dl;
 typedef u_int iflladdr_cb_t(void *, struct sockaddr_dl *, u_int);
 u_int if_foreach_lladdr(if_t, iflladdr_cb_t, void *);
 u_int if_foreach_llmaddr(if_t, iflladdr_cb_t, void *);
 u_int if_lladdr_count(if_t);
 u_int if_llmaddr_count(if_t);
 
 int if_getamcount(if_t ifp);
 struct ifaddr * if_getifaddr(if_t ifp);
 
 /* Functions */
 void if_setinitfn(if_t ifp, void (*)(void *));
 void if_setioctlfn(if_t ifp, int (*)(if_t, u_long, caddr_t));
 void if_setstartfn(if_t ifp, void (*)(if_t));
 void if_settransmitfn(if_t ifp, if_transmit_fn_t);
 void if_setqflushfn(if_t ifp, if_qflush_fn_t);
 void if_setgetcounterfn(if_t ifp, if_get_counter_t);
 
 /* Revisit the below. These are inline functions originally */
 int drbr_inuse_drv(if_t ifp, struct buf_ring *br);
 struct mbuf* drbr_dequeue_drv(if_t ifp, struct buf_ring *br);
 int drbr_needs_enqueue_drv(if_t ifp, struct buf_ring *br);
 int drbr_enqueue_drv(if_t ifp, struct buf_ring *br, struct mbuf *m);
 
 /* TSO */
 void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *);
 int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *);
 
 /* accessors for struct ifreq */
 void *ifr_data_get_ptr(void *ifrp);
 void *ifr_buffer_get_buffer(void *data);
 size_t ifr_buffer_get_length(void *data);
 
 int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *);
 
 #ifdef DEVICE_POLLING
 enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS };
 
 typedef	int poll_handler_t(if_t ifp, enum poll_cmd cmd, int count);
 int    ether_poll_register(poll_handler_t *h, if_t ifp);
 int    ether_poll_deregister(if_t ifp);
 #endif /* DEVICE_POLLING */
 
 #endif /* _KERNEL */
 
 #include <net/ifq.h>	/* XXXAO: temporary unconditional include */
 
 #endif /* !_NET_IF_VAR_H_ */
diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c
index 10a254d22440..07c325d0cb12 100644
--- a/sys/net/if_vlan.c
+++ b/sys/net/if_vlan.c
@@ -1,2220 +1,2278 @@
 /*-
  * Copyright 1998 Massachusetts Institute of Technology
  * Copyright 2012 ADARA Networks, Inc.
  * Copyright 2017 Dell EMC Isilon
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to ADARA Networks, Inc.
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  *
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * if_vlan.c - pseudo-device driver for IEEE 802.1Q virtual LANs.
  * This is sort of sneaky in the implementation, since
  * we need to pretend to be enough of an Ethernet implementation
  * to make arp work.  The way we do this is by telling everyone
  * that we are an Ethernet, and then catch the packets that
  * ether_output() sends to us via if_transmit(), rewrite them for
  * use by the real outgoing interface, and ask it to send them.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_vlan.h"
 #include "opt_ratelimit.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/rmlock.h>
 #include <sys/priv.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/taskqueue.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #ifdef INET
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #endif
 
 #ifdef INET6
 /*
  * XXX: declared here to avoid including many inet6-related headers.
  * Should this be generalized?
  */
 extern void	nd6_setmtu(struct ifnet *);
 #endif
 
 #define	VLAN_DEF_HWIDTH	4
 #define	VLAN_IFFLAGS	(IFF_BROADCAST | IFF_MULTICAST)
 
 #define	UP_AND_RUNNING(ifp) \
     ((ifp)->if_flags & IFF_UP && (ifp)->if_drv_flags & IFF_DRV_RUNNING)
 
 CK_SLIST_HEAD(ifvlanhead, ifvlan);
 
 /*
  * Per-parent trunk state: ties a parent interface to the vlans configured
  * on it.  The vlans are kept either in a fixed array (VLAN_ARRAY) or in a
  * dynamically sized hash table of CK_SLIST buckets.
  */
 struct ifvlantrunk {
 	struct	ifnet   *parent;	/* parent interface of this trunk */
 	struct	mtx	lock;		/* per-trunk mutex, see TRUNK_*() */
 #ifdef VLAN_ARRAY
 #define	VLAN_ARRAY_SIZE	(EVL_VLID_MASK + 1)
 	struct	ifvlan	*vlans[VLAN_ARRAY_SIZE]; /* static table */
 #else
 	struct	ifvlanhead *hash;	/* dynamic hash-list table */
 	uint16_t	hmask;		/* bucket count - 1; mask for HASH() */
 	uint16_t	hwidth;		/* bucket count is 1 << hwidth */
 #endif
 	/*
 	 * In the hash-table case this is bumped by vlan_inshash() and
 	 * dropped by vlan_remhash(), so it tracks the number of vlans in
 	 * the table; it also drives the grow/shrink thresholds there.
 	 */
 	int		refcnt;
 };
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 /*
  * Send tag wrapper used by the vlan driver.  'com' is an embedded
  * m_snd_tag; mst_to_vst() recovers the enclosing vlan_snd_tag from a
  * pointer to it.  'tag' points at another m_snd_tag.
  * NOTE(review): 'tag' is presumably the parent interface's send tag --
  * confirm in vlan_snd_tag_alloc().
  */
 struct vlan_snd_tag {
 	struct m_snd_tag com;
 	struct m_snd_tag *tag;
 };
 
 /*
  * Map a pointer to the 'com' member of a struct vlan_snd_tag back to the
  * vlan_snd_tag that contains it.
  */
 static inline struct vlan_snd_tag *
 mst_to_vst(struct m_snd_tag *mst)
 {
 	struct vlan_snd_tag *vst;
 
 	vst = __containerof(mst, struct vlan_snd_tag, com);
 	return (vst);
 }
 #endif
 
 /*
  * This macro provides a facility to iterate over every vlan on a trunk with
  * the assumption that none will be added/removed during iteration.
  */
 #ifdef VLAN_ARRAY
 #define VLAN_FOREACH(_ifv, _trunk) \
 	size_t _i; \
 	for (_i = 0; _i < VLAN_ARRAY_SIZE; _i++) \
 		if (((_ifv) = (_trunk)->vlans[_i]) != NULL)
 #else /* VLAN_ARRAY */
 #define VLAN_FOREACH(_ifv, _trunk) \
 	struct ifvlan *_next; \
 	size_t _i; \
 	for (_i = 0; _i < (1 << (_trunk)->hwidth); _i++) \
 		CK_SLIST_FOREACH_SAFE((_ifv), &(_trunk)->hash[_i], ifv_list, _next)
 #endif /* VLAN_ARRAY */
 
 /*
  * This macro provides a facility to iterate over every vlan on a trunk while
  * also modifying the number of vlans on the trunk. The iteration continues
  * until some condition is met or there are no more vlans on the trunk.
  */
 #ifdef VLAN_ARRAY
 /* The VLAN_ARRAY case is simple -- just a for loop using the condition. */
 #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \
 	size_t _i; \
 	for (_i = 0; !(_cond) && _i < VLAN_ARRAY_SIZE; _i++) \
 		if (((_ifv) = (_trunk)->vlans[_i]))
 #else /* VLAN_ARRAY */
 /*
  * The hash table case is more complicated. We allow for the hash table to be
  * modified (i.e. vlans removed) while we are iterating over it. To allow for
  * this we must restart the iteration every time we "touch" something during
  * the iteration, since removal will resize the hash table and invalidate our
  * current position. If acting on the touched element causes the trunk to be
  * emptied, then iteration also stops.
  */
 #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \
 	size_t _i; \
 	bool _touch = false; \
 	for (_i = 0; \
 	    !(_cond) && _i < (1 << (_trunk)->hwidth); \
 	    _i = (_touch && ((_trunk) != NULL) ? 0 : _i + 1), _touch = false) \
 		if (((_ifv) = CK_SLIST_FIRST(&(_trunk)->hash[_i])) != NULL && \
 		    (_touch = true))
 #endif /* VLAN_ARRAY */
 
 /*
  * One link-level address record kept on a vlan's vlan_mc_listhead list
  * (see struct ifvlan).
  */
 struct vlan_mc_entry {
 	struct sockaddr_dl		mc_addr;	/* the link-level address */
 	CK_SLIST_ENTRY(vlan_mc_entry)	mc_entries;	/* list linkage */
 	/* vlan_mc_free() maps this context back to the entry and frees it */
 	struct epoch_context		mc_epoch_ctx;
 };
 
 /*
  * Per-vlan state.  It links a vlan to its trunk (and through it to the
  * parent interface), holds the 802.1Q tag values, and carries the list of
  * vlan_mc_entry records for this vlan.
  */
 struct ifvlan {
 	struct	ifvlantrunk *ifv_trunk;	/* trunk this vlan is on, see TRUNK() */
 	struct	ifnet *ifv_ifp;		/* presumably the vlan's own ifnet */
 	/* TRUNK() yields the vlan's trunk; PARENT() yields that trunk's parent. */
 #define	TRUNK(ifv)	((ifv)->ifv_trunk)
 #define	PARENT(ifv)	(TRUNK(ifv)->parent)
 	void	*ifv_cookie;		/* opaque; its users are not in view */
 	int	ifv_pflags;	/* special flags we have set on parent */
 	int	ifv_capenable;
 	int	ifv_encaplen;	/* encapsulation length */
 	int	ifv_mtufudge;	/* MTU fudged by this much */
 	int	ifv_mintu;	/* min transmission unit */
 	struct  ether_8021q_tag ifv_qtag;
 	/* Shorthand for the members of ifv_qtag. */
 #define ifv_proto	ifv_qtag.proto
 #define ifv_vid		ifv_qtag.vid
 #define ifv_pcp		ifv_qtag.pcp
 	/* NOTE(review): presumably runs vlan_lladdr_fn() -- confirm */
 	struct task lladdr_task;
 	CK_SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead;
 #ifndef VLAN_ARRAY
 	/* hash bucket linkage, used by vlan_inshash() and vlan_remhash() */
 	CK_SLIST_ENTRY(ifvlan) ifv_list;
 #endif
 };
 
 /*
  * Special flags we should propagate to parent.
  *
  * Each entry pairs an interface flag with the function that takes an ifnet
  * and an int to apply it (ifpromisc() for IFF_PROMISC, if_allmulti() for
  * IFF_ALLMULTI).  The table ends with a {0, NULL} sentinel entry.
  */
 static struct {
 	int flag;
 	int (*func)(struct ifnet *, int);
 } vlan_pflags[] = {
 	{IFF_PROMISC, ifpromisc},
 	{IFF_ALLMULTI, if_allmulti},
 	{0, NULL}
 };
 
 extern int vlan_mtag_pcp;
 
 static const char vlanname[] = "vlan";
 static MALLOC_DEFINE(M_VLAN, vlanname, "802.1Q Virtual LAN Interface");
 
 static eventhandler_tag ifdetach_tag;
 static eventhandler_tag iflladdr_tag;
 
 /*
  * if_vlan uses two module-level synchronization primitives to allow concurrent
  * modification of vlan interfaces and (mostly) allow for vlans to be destroyed
  * while they are being used for tx/rx. To accomplish this in a way that has
  * acceptable performance and cooperation with other parts of the network stack
  * there is a non-sleepable epoch(9) and an sx(9).
  *
  * The performance-sensitive paths that warrant using the epoch(9) are
  * vlan_transmit and vlan_input. Both have to check for the vlan interface's
  * existence using if_vlantrunk, and being in the network tx/rx paths the use
  * of an epoch(9) gives a measurable improvement in performance.
  *
  * The reason for having an sx(9) is mostly because there are still areas that
  * must be sleepable and also have safe concurrent access to a vlan interface.
  * Since the sx(9) exists, it is used by default in most paths unless sleeping
  * is not permitted, or if it is not clear whether sleeping is permitted.
  *
  */
 #define _VLAN_SX_ID ifv_sx
 
 static struct sx _VLAN_SX_ID;
 
 #define VLAN_LOCKING_INIT() \
 	sx_init_flags(&_VLAN_SX_ID, "vlan_sx", SX_RECURSE)
 
 #define VLAN_LOCKING_DESTROY() \
 	sx_destroy(&_VLAN_SX_ID)
 
 #define	VLAN_SLOCK()			sx_slock(&_VLAN_SX_ID)
 #define	VLAN_SUNLOCK()			sx_sunlock(&_VLAN_SX_ID)
 #define	VLAN_XLOCK()			sx_xlock(&_VLAN_SX_ID)
 #define	VLAN_XUNLOCK()			sx_xunlock(&_VLAN_SX_ID)
 #define	VLAN_SLOCK_ASSERT()		sx_assert(&_VLAN_SX_ID, SA_SLOCKED)
 #define	VLAN_XLOCK_ASSERT()		sx_assert(&_VLAN_SX_ID, SA_XLOCKED)
 #define	VLAN_SXLOCK_ASSERT()		sx_assert(&_VLAN_SX_ID, SA_LOCKED)
 
 /*
  * We also have a per-trunk mutex that should be acquired when changing
  * its state.
  */
 /* Thin wrappers around mtx(9) operating on the trunk's 'lock' member. */
 #define	TRUNK_LOCK_INIT(trunk)		mtx_init(&(trunk)->lock, vlanname, NULL, MTX_DEF)
 #define	TRUNK_LOCK_DESTROY(trunk)	mtx_destroy(&(trunk)->lock)
 #define	TRUNK_WLOCK(trunk)		mtx_lock(&(trunk)->lock)
 #define	TRUNK_WUNLOCK(trunk)		mtx_unlock(&(trunk)->lock)
 /*
  * NOTE(review): unlike the macros above, this expansion ends in its own
  * ';', so "TRUNK_WLOCK_ASSERT(t);" expands to two statements and cannot be
  * used as the unbraced body of an if/else.
  */
 #define	TRUNK_WLOCK_ASSERT(trunk)	mtx_assert(&(trunk)->lock, MA_OWNED);
 
 /*
  * The VLAN_ARRAY substitutes the dynamic hash with a static array
  * with 4096 entries. In theory this can give a boost in processing,
  * however in practice it does not. Probably this is because the array
  * is too big to fit into CPU cache.
  */
 #ifndef VLAN_ARRAY
 static	void vlan_inithash(struct ifvlantrunk *trunk);
 static	void vlan_freehash(struct ifvlantrunk *trunk);
 static	int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv);
 static	int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv);
 static	void vlan_growhash(struct ifvlantrunk *trunk, int howmuch);
 static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk,
 	uint16_t vid);
 #endif
 static	void trunk_destroy(struct ifvlantrunk *trunk);
 
 static	void vlan_init(void *foo);
 static	void vlan_input(struct ifnet *ifp, struct mbuf *m);
 static	int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr);
 #if defined(KERN_TLS) || defined(RATELIMIT)
 static	int vlan_snd_tag_alloc(struct ifnet *,
     union if_snd_tag_alloc_params *, struct m_snd_tag **);
 static	int vlan_snd_tag_modify(struct m_snd_tag *,
     union if_snd_tag_modify_params *);
 static	int vlan_snd_tag_query(struct m_snd_tag *,
     union if_snd_tag_query_params *);
 static	void vlan_snd_tag_free(struct m_snd_tag *);
 static struct m_snd_tag *vlan_next_snd_tag(struct m_snd_tag *);
 static void vlan_ratelimit_query(struct ifnet *,
     struct if_ratelimit_query_results *);
 #endif
 static	void vlan_qflush(struct ifnet *ifp);
 static	int vlan_setflag(struct ifnet *ifp, int flag, int status,
     int (*func)(struct ifnet *, int));
 static	int vlan_setflags(struct ifnet *ifp, int status);
 static	int vlan_setmulti(struct ifnet *ifp);
 static	int vlan_transmit(struct ifnet *ifp, struct mbuf *m);
 #ifdef ALTQ
 static void vlan_altq_start(struct ifnet *ifp);
 static	int vlan_altq_transmit(struct ifnet *ifp, struct mbuf *m);
 #endif
 static	int vlan_output(struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *dst, struct route *ro);
 static	void vlan_unconfig(struct ifnet *ifp);
 static	void vlan_unconfig_locked(struct ifnet *ifp, int departing);
 static	int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag,
 	uint16_t proto);
 static	void vlan_link_state(struct ifnet *ifp);
 static	void vlan_capabilities(struct ifvlan *ifv);
 static	void vlan_trunk_capabilities(struct ifnet *ifp);
 
 static	struct ifnet *vlan_clone_match_ethervid(const char *, int *);
 static	int vlan_clone_match(struct if_clone *, const char *);
 static	int vlan_clone_create(struct if_clone *, char *, size_t, caddr_t);
 static	int vlan_clone_destroy(struct if_clone *, struct ifnet *);
 
 static	void vlan_ifdetach(void *arg, struct ifnet *ifp);
 static  void vlan_iflladdr(void *arg, struct ifnet *ifp);
 
 static  void vlan_lladdr_fn(void *arg, int pending);
 
 static struct if_clone *vlan_cloner;
 
 #ifdef VIMAGE
 VNET_DEFINE_STATIC(struct if_clone *, vlan_cloner);
 #define	V_vlan_cloner	VNET(vlan_cloner)
 #endif
 
+#ifdef RATELIMIT
+static const struct if_snd_tag_sw vlan_snd_tag_ul_sw = {
+	.snd_tag_modify = vlan_snd_tag_modify,
+	.snd_tag_query = vlan_snd_tag_query,
+	.snd_tag_free = vlan_snd_tag_free,
+	.next_snd_tag = vlan_next_snd_tag,
+	.type = IF_SND_TAG_TYPE_UNLIMITED
+};
+
+static const struct if_snd_tag_sw vlan_snd_tag_rl_sw = {
+	.snd_tag_modify = vlan_snd_tag_modify,
+	.snd_tag_query = vlan_snd_tag_query,
+	.snd_tag_free = vlan_snd_tag_free,
+	.next_snd_tag = vlan_next_snd_tag,
+	.type = IF_SND_TAG_TYPE_RATE_LIMIT
+};
+#endif
+
+#ifdef KERN_TLS
+static const struct if_snd_tag_sw vlan_snd_tag_tls_sw = {
+	.snd_tag_modify = vlan_snd_tag_modify,
+	.snd_tag_query = vlan_snd_tag_query,
+	.snd_tag_free = vlan_snd_tag_free,
+	.next_snd_tag = vlan_next_snd_tag,
+	.type = IF_SND_TAG_TYPE_TLS
+};
+
+#ifdef RATELIMIT
+static const struct if_snd_tag_sw vlan_snd_tag_tls_rl_sw = {
+	.snd_tag_modify = vlan_snd_tag_modify,
+	.snd_tag_query = vlan_snd_tag_query,
+	.snd_tag_free = vlan_snd_tag_free,
+	.next_snd_tag = vlan_next_snd_tag,
+	.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT
+};
+#endif
+#endif
+
 /*
  * Release the vlan_mc_entry that embeds 'ctx' as its mc_epoch_ctx member.
  * The signature matches an epoch(9) callback; the place where it is
  * registered is not in this part of the file.
  */
 static void
 vlan_mc_free(struct epoch_context *ctx)
 {
 	struct vlan_mc_entry *entry;
 
 	entry = __containerof(ctx, struct vlan_mc_entry, mc_epoch_ctx);
 	free(entry, M_VLAN);
 }
 
 #ifndef VLAN_ARRAY
 #define HASH(n, m)	((((n) >> 8) ^ ((n) >> 4) ^ (n)) & (m))
 
 static void
 vlan_inithash(struct ifvlantrunk *trunk)
 {
 	int i, n;
 
 	/*
 	 * The trunk must not be locked here since we call malloc(M_WAITOK).
 	 * It is OK in case this function is called before the trunk struct
 	 * gets hooked up and becomes visible from other threads.
 	 */
 
 	KASSERT(trunk->hwidth == 0 && trunk->hash == NULL,
 	    ("%s: hash already initialized", __func__));
 
 	trunk->hwidth = VLAN_DEF_HWIDTH;
 	n = 1 << trunk->hwidth;
 	trunk->hmask = n - 1;
 	trunk->hash = malloc(sizeof(struct ifvlanhead) * n, M_VLAN, M_WAITOK);
 	for (i = 0; i < n; i++)
 		CK_SLIST_INIT(&trunk->hash[i]);
 }
 
 /*
  * Free the trunk's hash table and reset its hash bookkeeping to the
  * "not initialized" state.  With INVARIANTS, first assert that every
  * bucket is empty.
  */
 static void
 vlan_freehash(struct ifvlantrunk *trunk)
 {
 #ifdef INVARIANTS
 	int bucket, nbuckets;
 
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 	nbuckets = 1 << trunk->hwidth;
 	for (bucket = 0; bucket < nbuckets; bucket++) {
 		KASSERT(CK_SLIST_EMPTY(&trunk->hash[bucket]),
 		    ("%s: hash table not empty", __func__));
 	}
 #endif
 	free(trunk->hash, M_VLAN);
 	trunk->hash = NULL;
 	trunk->hmask = 0;
 	trunk->hwidth = 0;
 }
 
 static int
 vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 	int i, b;
 	struct ifvlan *ifv2;
 
 	VLAN_XLOCK_ASSERT();
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 
 	b = 1 << trunk->hwidth;
 	i = HASH(ifv->ifv_vid, trunk->hmask);
 	CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list)
 		if (ifv->ifv_vid == ifv2->ifv_vid)
 			return (EEXIST);
 
 	/*
 	 * Grow the hash when the number of vlans exceeds half of the number of
 	 * hash buckets squared. This will make the average linked-list length
 	 * buckets/2.
 	 */
 	if (trunk->refcnt > (b * b) / 2) {
 		vlan_growhash(trunk, 1);
 		i = HASH(ifv->ifv_vid, trunk->hmask);
 	}
 	CK_SLIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list);
 	trunk->refcnt++;
 
 	return (0);
 }
 
 static int
 vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 	int i, b;
 	struct ifvlan *ifv2;
 
 	VLAN_XLOCK_ASSERT();
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 
 	b = 1 << trunk->hwidth;
 	i = HASH(ifv->ifv_vid, trunk->hmask);
 	CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list)
 		if (ifv2 == ifv) {
 			trunk->refcnt--;
 			CK_SLIST_REMOVE(&trunk->hash[i], ifv2, ifvlan, ifv_list);
 			if (trunk->refcnt < (b * b) / 2)
 				vlan_growhash(trunk, -1);
 			return (0);
 		}
 
 	panic("%s: vlan not found\n", __func__);
 	return (ENOENT); /*NOTREACHED*/
 }
 
 /*
  * Grow the hash larger or smaller if memory permits.
  */
 static void
 vlan_growhash(struct ifvlantrunk *trunk, int howmuch)
 {
 	struct ifvlan *ifv;
 	struct ifvlanhead *hash2;
 	int hwidth2, i, j, n, n2;
 
 	VLAN_XLOCK_ASSERT();
 	KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
 
 	if (howmuch == 0) {
 		/* Harmless yet obvious coding error */
 		printf("%s: howmuch is 0\n", __func__);
 		return;
 	}
 
 	hwidth2 = trunk->hwidth + howmuch;
 	n = 1 << trunk->hwidth;
 	n2 = 1 << hwidth2;
 	/* Do not shrink the table below the default */
 	if (hwidth2 < VLAN_DEF_HWIDTH)
 		return;
 
 	hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_WAITOK);
 	if (hash2 == NULL) {
 		printf("%s: out of memory -- hash size not changed\n",
 		    __func__);
 		return;		/* We can live with the old hash table */
 	}
 	for (j = 0; j < n2; j++)
 		CK_SLIST_INIT(&hash2[j]);
 	for (i = 0; i < n; i++)
 		while ((ifv = CK_SLIST_FIRST(&trunk->hash[i])) != NULL) {
 			CK_SLIST_REMOVE(&trunk->hash[i], ifv, ifvlan, ifv_list);
 			j = HASH(ifv->ifv_vid, n2 - 1);
 			CK_SLIST_INSERT_HEAD(&hash2[j], ifv, ifv_list);
 		}
 	NET_EPOCH_WAIT();
 	free(trunk->hash, M_VLAN);
 	trunk->hash = hash2;
 	trunk->hwidth = hwidth2;
 	trunk->hmask = n2 - 1;
 
 	if (bootverbose)
 		if_printf(trunk->parent,
 		    "VLAN hash table resized from %d to %d buckets\n", n, n2);
 }
 
 /*
  * Look up the vlan with the given VID in the trunk's hash table.
  * Returns NULL when no such vlan exists.  Must be called inside the
  * network epoch.
  */
 static __inline struct ifvlan *
 vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid)
 {
 	struct ifvlanhead *head;
 	struct ifvlan *ifv;
 
 	NET_EPOCH_ASSERT();
 
 	head = &trunk->hash[HASH(vid, trunk->hmask)];
 	CK_SLIST_FOREACH(ifv, head, ifv_list) {
 		if (ifv->ifv_vid == vid)
 			break;
 	}
 	/* "ifv" is NULL here if the loop ran off the end of the bucket. */
 	return (ifv);
 }
 
 #if 0
 /*
  * Debugging code to view the hashtables.
  *
  * Prints one line per bucket: the bucket index followed by the interface
  * name of every vlan chained in that bucket.  Compiled out by the
  * surrounding "#if 0".
  */
 static void
 vlan_dumphash(struct ifvlantrunk *trunk)
 {
 	int i;
 	struct ifvlan *ifv;
 
 	for (i = 0; i < (1 << trunk->hwidth); i++) {
 		printf("%d: ", i);
 		CK_SLIST_FOREACH(ifv, &trunk->hash[i], ifv_list)
 			printf("%s ", ifv->ifv_ifp->if_xname);
 		printf("\n");
 	}
 }
 #endif /* 0 */
 #else
 
 /*
  * VLAN_ARRAY variant: the trunk keeps a table indexed directly by VID, so
  * lookup is a plain array read.
  */
 static __inline struct ifvlan *
 vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid)
 {
 	struct ifvlan *ifv;
 
 	ifv = trunk->vlans[vid];
 	return (ifv);
 }
 
 /*
  * VLAN_ARRAY variant: store "ifv" in the slot for its VID.
  * Returns EEXIST if the slot is already occupied, 0 otherwise.
  */
 static __inline int
 vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 
 	if (trunk->vlans[ifv->ifv_vid] == NULL) {
 		trunk->vlans[ifv->ifv_vid] = ifv;
 		trunk->refcnt++;
 		return (0);
 	}
 	return (EEXIST);
 }
 
 /*
  * VLAN_ARRAY variant: clear the slot for the VID of "ifv" and drop the
  * trunk's vlan count.  Always returns 0.
  */
 static __inline int
 vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
 {
 
 	trunk->refcnt--;
 	trunk->vlans[ifv->ifv_vid] = NULL;
 	return (0);
 }
 
 /*
  * VLAN_ARRAY variant: intentionally empty, there is no separately
  * allocated hash table to release.
  */
 static __inline void
 vlan_freehash(struct ifvlantrunk *trunk)
 {
 }
 
 /*
  * VLAN_ARRAY variant: intentionally empty, there is no hash table to set
  * up.
  */
 static __inline void
 vlan_inithash(struct ifvlantrunk *trunk)
 {
 }
 
 #endif /* !VLAN_ARRAY */
 
 static void
 trunk_destroy(struct ifvlantrunk *trunk)
 {
 	VLAN_XLOCK_ASSERT();
 
 	vlan_freehash(trunk);
 	trunk->parent->if_vlantrunk = NULL;
 	TRUNK_LOCK_DESTROY(trunk);
 	if_rele(trunk->parent);
 	free(trunk, M_VLAN);
 }
 
 /*
  * Rebuild the multicast filter of a vlan interface.  The addresses are
  * really programmed into the parent, because there is only one physical
  * interface.  As a side effect the parent may receive multicast traffic
  * it does not want, which the upper protocol layers discard later.
  *
  * Returns 0 on success, ENOMEM if a list entry cannot be allocated, or
  * the first error returned by if_addmulti().  The caller must hold the
  * exclusive vlan lock.
  */
 static int
 vlan_setmulti(struct ifnet *ifp)
 {
 	struct vlan_mc_entry	*mc;
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*parent;
 	struct ifvlan		*sc;
 	int			error;
 
 	VLAN_XLOCK_ASSERT();
 
 	/* Find the parent. */
 	sc = ifp->if_softc;
 	parent = PARENT(sc);
 	error = 0;
 
 	CURVNET_SET_QUIET(parent->if_vnet);
 
 	/* Drop every address previously programmed into the parent. */
 	for (;;) {
 		mc = CK_SLIST_FIRST(&sc->vlan_mc_listhead);
 		if (mc == NULL)
 			break;
 		CK_SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries);
 		(void)if_delmulti(parent, (struct sockaddr *)&mc->mc_addr);
 		NET_EPOCH_CALL(vlan_mc_free, &mc->mc_epoch_ctx);
 	}
 
 	/*
 	 * Record the vlan's current link-layer multicast addresses on our
 	 * own list while the address lock is held.
 	 */
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
 		mc = malloc(sizeof(struct vlan_mc_entry), M_VLAN, M_NOWAIT);
 		if (mc == NULL) {
 			error = ENOMEM;
 			break;
 		}
 		bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len);
 		mc->mc_addr.sdl_index = parent->if_index;
 		CK_SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries);
 	}
 	IF_ADDR_WUNLOCK(ifp);
 
 	/* Program the recorded addresses into the parent. */
 	if (error == 0) {
 		CK_SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) {
 			error = if_addmulti(parent,
 			    (struct sockaddr *)&mc->mc_addr, NULL);
 			if (error != 0)
 				break;
 		}
 	}
 
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * Handler for link-layer address changes on a parent interface.  When
  * the parent's address changes, the same address is pushed to every vlan
  * configured on it.
  */
 static void
 vlan_iflladdr(void *arg __unused, struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct sockaddr_dl *child_sdl;
 	struct ifnet *child;
 	struct ifvlan *ifv;
 
 	/* Need the epoch since this is run on taskqueue_swi. */
 	NET_EPOCH_ENTER(et);
 	trunk = ifp->if_vlantrunk;
 	if (trunk != NULL) {
 		/*
 		 * It is a trunk.  Take the lock exclusively so concurrent
 		 * SIOCSIFLLADDR ioctls on the parent cannot garble the
 		 * lladdr of a child vlan.
 		 */
 		TRUNK_WLOCK(trunk);
 		VLAN_FOREACH(ifv, trunk) {
 			/*
 			 * Copy the new lladdr into the child and enqueue a
 			 * task to actually call if_setlladdr.  That call
 			 * has to be deferred to a taskqueue because it goes
 			 * through the if_vlan ioctl path and tries to
 			 * acquire the global lock.
 			 */
 			child = ifv->ifv_ifp;
 			bcopy(IF_LLADDR(ifp), IF_LLADDR(child),
 			    ifp->if_addrlen);
 			child_sdl =
 			    (struct sockaddr_dl *)child->if_addr->ifa_addr;
 			child_sdl->sdl_alen = ifp->if_addrlen;
 			taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task);
 		}
 		TRUNK_WUNLOCK(trunk);
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * A handler for network interface departure events.
  * Track departure of trunks here so that we don't access invalid
  * pointers or whatever if a trunk is ripped from under us, e.g.,
  * by ejecting its hot-plug card.  However, if an ifnet is simply
  * being renamed, then there's no need to tear down the state.
  *
  * Runs with the exclusive vlan lock held for the whole purge and
  * unconfigures every vlan on the departing interface with the
  * "departing" flag set.
  */
 static void
 vlan_ifdetach(void *arg __unused, struct ifnet *ifp)
 {
 	struct ifvlan *ifv;
 	struct ifvlantrunk *trunk;
 
 	/* If the ifnet is just being renamed, don't do anything. */
 	if (ifp->if_flags & IFF_RENAMING)
 		return;
 	VLAN_XLOCK();
 	trunk = ifp->if_vlantrunk;
 	/* Not a vlan parent: nothing to tear down. */
 	if (trunk == NULL) {
 		VLAN_XUNLOCK();
 		return;
 	}
 
 	/*
 	 * OK, it's a trunk.  Loop over and detach all vlan's on it.
 	 * Check trunk pointer after each vlan_unconfig() as it will
 	 * free it and set to NULL after the last vlan was detached.
 	 */
 	VLAN_FOREACH_UNTIL_SAFE(ifv, ifp->if_vlantrunk,
 	    ifp->if_vlantrunk == NULL)
 		vlan_unconfig_locked(ifv->ifv_ifp, 1);
 
 	/* Trunk should have been destroyed in vlan_unconfig(). */
 	KASSERT(ifp->if_vlantrunk == NULL, ("%s: purge failed", __func__));
 	VLAN_XUNLOCK();
 }
 
 /*
  * Return the parent (trunk) interface of a vlan interface, or NULL when
  * "ifp" is not a vlan or is not attached to a trunk.  Must be called
  * inside the network epoch.
  */
 static struct ifnet  *
 vlan_trunkdev(struct ifnet *ifp)
 {
 	struct ifvlan *ifv;
 
 	NET_EPOCH_ASSERT();
 
 	if (ifp->if_type != IFT_L2VLAN)
 		return (NULL);
 
 	ifv = ifp->if_softc;
 	if (!ifv->ifv_trunk)
 		return (NULL);
 	return (PARENT(ifv));
 }
 
 /*
  * Report the 12-bit VLAN VID of a vlan interface through "vidp", for use
  * by external components such as Infiniband.  Returns EINVAL when "ifp"
  * is not a vlan interface, 0 otherwise.
  *
  * XXXRW: The function name is historical; vlan_vid() would be more
  * accurate.
  */
 static int
 vlan_tag(struct ifnet *ifp, uint16_t *vidp)
 {
 	struct ifvlan *ifv;
 
 	if (ifp->if_type == IFT_L2VLAN) {
 		ifv = ifp->if_softc;
 		*vidp = ifv->ifv_vid;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Report the PCP value stored for a vlan interface through "pcpp".
  * Returns EINVAL when "ifp" is not a vlan interface, 0 otherwise.
  */
 static int
 vlan_pcp(struct ifnet *ifp, uint16_t *pcpp)
 {
 	struct ifvlan *ifv;
 
 	if (ifp->if_type == IFT_L2VLAN) {
 		ifv = ifp->if_softc;
 		*pcpp = ifv->ifv_pcp;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Fetch the driver-specific cookie stored on a vlan interface, or NULL
  * when "ifp" is not a vlan interface.  The driver is responsible for
  * synchronizing this with vlan_setcookie().
  */
 static void *
 vlan_cookie(struct ifnet *ifp)
 {
 	struct ifvlan *ifv;
 
 	if (ifp->if_type == IFT_L2VLAN) {
 		ifv = ifp->if_softc;
 		return (ifv->ifv_cookie);
 	}
 	return (NULL);
 }
 
 /*
  * Record a driver-private, per-instance cookie in the softc of a vlan
  * interface.  Returns EINVAL when "ifp" is not a vlan interface,
  * 0 otherwise.
  */
 static int
 vlan_setcookie(struct ifnet *ifp, void *cookie)
 {
 	struct ifvlan *ifv;
 
 	if (ifp->if_type == IFT_L2VLAN) {
 		ifv = ifp->if_softc;
 		ifv->ifv_cookie = cookie;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Return the vlan interface configured with the given VID on parent
  * "ifp", or NULL if "ifp" has no trunk or no vlan uses that VID.  Must be
  * called inside the network epoch.
  */
 static struct ifnet *
 vlan_devat(struct ifnet *ifp, uint16_t vid)
 {
 	struct ifvlantrunk *trunk;
 	struct ifvlan *ifv;
 
 	NET_EPOCH_ASSERT();
 
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL)
 		return (NULL);
 
 	ifv = vlan_gethash(trunk, vid);
 	if (ifv == NULL)
 		return (NULL);
 	return (ifv->ifv_ifp);
 }
 
 /*
  * VLAN support can be loaded as a module.  The only place in the
  * system that's intimately aware of this is ether_input.  We hook
  * into this code through vlan_input_p which is defined there and
  * set here.  No one else in the system should be aware of this so
  * we use an explicit reference here.
  */
 extern	void (*vlan_input_p)(struct ifnet *, struct mbuf *);
 
 /* For if_link_state_change() eyes only... */
 extern	void (*vlan_link_state_p)(struct ifnet *);
 
 /*
  * Module event handler for if_vlan.
  *
  * MOD_LOAD registers the interface departure and link-layer address
  * event handlers, initializes locking, publishes the vlan hook function
  * pointers and (without VIMAGE) creates the interface cloner.
  * MOD_UNLOAD undoes all of that.  Any other event returns EOPNOTSUPP.
  *
  * Returns 0 on success or ENOMEM if an event handler cannot be
  * registered.
  */
 static int
 vlan_modevent(module_t mod, int type, void *data)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		ifdetach_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
 		    vlan_ifdetach, NULL, EVENTHANDLER_PRI_ANY);
 		if (ifdetach_tag == NULL)
 			return (ENOMEM);
 		iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event,
 		    vlan_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
 		if (iflladdr_tag == NULL) {
 			/* Do not leave the first handler registered. */
 			EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 			    ifdetach_tag);
 			return (ENOMEM);
 		}
 		VLAN_LOCKING_INIT();
 		vlan_input_p = vlan_input;
 		vlan_link_state_p = vlan_link_state;
 		vlan_trunk_cap_p = vlan_trunk_capabilities;
 		vlan_trunkdev_p = vlan_trunkdev;
 		vlan_cookie_p = vlan_cookie;
 		vlan_setcookie_p = vlan_setcookie;
 		vlan_tag_p = vlan_tag;
 		vlan_pcp_p = vlan_pcp;
 		vlan_devat_p = vlan_devat;
 #ifndef VIMAGE
 		vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match,
 		    vlan_clone_create, vlan_clone_destroy);
 #endif
 		if (bootverbose)
 			printf("vlan: initialized, using "
 #ifdef VLAN_ARRAY
 			       "full-size arrays"
 #else
 			       "hash tables with chaining"
 #endif
 
 			       "\n");
 		break;
 	case MOD_UNLOAD:
 #ifndef VIMAGE
 		if_clone_detach(vlan_cloner);
 #endif
 		EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_tag);
 		EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_tag);
 		/*
 		 * Clear every hook published by MOD_LOAD so that nothing
 		 * can call into the unloaded module.
 		 */
 		vlan_input_p = NULL;
 		vlan_link_state_p = NULL;
 		vlan_trunk_cap_p = NULL;
 		vlan_trunkdev_p = NULL;
 		vlan_tag_p = NULL;
 		vlan_pcp_p = NULL;
 		vlan_cookie_p = NULL;
 		vlan_setcookie_p = NULL;
 		vlan_devat_p = NULL;
 		VLAN_LOCKING_DESTROY();
 		if (bootverbose)
 			printf("vlan: unloaded\n");
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 /*
  * Module descriptor: module name, event handler and a third field that
  * is left as 0.
  */
 static moduledata_t vlan_mod = {
 	"if_vlan",
 	vlan_modevent,
 	0
 };
 
 /* Register the module and declare its version. */
 DECLARE_MODULE(if_vlan, vlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_vlan, 3);
 
 #ifdef VIMAGE
 /*
  * VIMAGE only: create the vlan interface cloner and record it in
  * V_vlan_cloner.  Registered through VNET_SYSINIT below.
  */
 static void
 vnet_vlan_init(const void *unused __unused)
 {
 
 	vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match,
 		    vlan_clone_create, vlan_clone_destroy);
 	V_vlan_cloner = vlan_cloner;
 }
 VNET_SYSINIT(vnet_vlan_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_vlan_init, NULL);
 
 /*
  * VIMAGE only: detach the cloner recorded in V_vlan_cloner.  Registered
  * through VNET_SYSUNINIT below.
  */
 static void
 vnet_vlan_uninit(const void *unused __unused)
 {
 
 	if_clone_detach(V_vlan_cloner);
 }
 VNET_SYSUNINIT(vnet_vlan_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_vlan_uninit, NULL);
 #endif
 
 /*
  * Check for <etherif>.<vlan>[.<vlan> ...] style interface names.
  */
 static struct ifnet *
 vlan_clone_match_ethervid(const char *name, int *vidp)
 {
 	char ifname[IFNAMSIZ];
 	char *cp;
 	struct ifnet *ifp;
 	int vid;
 
 	strlcpy(ifname, name, IFNAMSIZ);
 	if ((cp = strrchr(ifname, '.')) == NULL)
 		return (NULL);
 	*cp = '\0';
 	if ((ifp = ifunit_ref(ifname)) == NULL)
 		return (NULL);
 	/* Parse VID. */
 	if (*++cp == '\0') {
 		if_rele(ifp);
 		return (NULL);
 	}
 	vid = 0;
 	for(; *cp >= '0' && *cp <= '9'; cp++)
 		vid = (vid * 10) + (*cp - '0');
 	if (*cp != '\0') {
 		if_rele(ifp);
 		return (NULL);
 	}
 	if (vidp != NULL)
 		*vidp = vid;
 
 	return (ifp);
 }
 
 static int
 vlan_clone_match(struct if_clone *ifc, const char *name)
 {
 	struct ifnet *ifp;
 	const char *cp;
 
 	ifp = vlan_clone_match_ethervid(name, NULL);
 	if (ifp != NULL) {
 		if_rele(ifp);
 		return (1);
 	}
 
 	if (strncmp(vlanname, name, strlen(vlanname)) != 0)
 		return (0);
 	for (cp = name + 4; *cp != '\0'; cp++) {
 		if (*cp < '0' || *cp > '9')
 			return (0);
 	}
 
 	return (1);
 }
 
 /*
  * Cloner create routine: allocate a new vlan interface named "name" and,
  * when a parent is known, attach it to that parent.
  *
  * "params", if not NULL, is a user pointer to a struct vlanreq that
  * supplies the parent name, the VID and the protocol.  "name"/"len" is
  * the requested interface name buffer; in the wildcard case the
  * allocated unit number is appended to it.
  *
  * Returns 0 on success or an errno value, including a copyin() error,
  * ENXIO (unknown parent or unparsable name), EINVAL (parent named in
  * "params" differs from the one in the name), ENOSPC (if_alloc failed),
  * or whatever ifc_alloc_unit()/vlan_config() report.
  */
 static int
 vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 {
 	char *dp;
 	bool wildcard = false;
 	bool subinterface = false;
 	int unit;
 	int error;
 	int vid = 0;
 	uint16_t proto = ETHERTYPE_VLAN;
 	struct ifvlan *ifv;
 	struct ifnet *ifp;
 	struct ifnet *p = NULL;
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 	struct vlanreq vlr;
 	static const u_char eaddr[ETHER_ADDR_LEN];	/* 00:00:00:00:00:00 */
 
 
 	/*
 	 * There are three ways to specify the cloned device:
 	 * o pass a parameter block with the clone request.
 	 * o specify parameters in the text of the clone device name
 	 * o specify no parameters and get an unattached device that
 	 *   must be configured separately.
 	 * The first technique is preferred; the latter two are supported
 	 * for backwards compatibility.
 	 *
 	 * XXXRW: Note historic use of the word "tag" here.  New ioctls may be
 	 * called for.
 	 */
 
 	if (params) {
 		error = copyin(params, &vlr, sizeof(vlr));
 		if (error)
 			return error;
 		vid = vlr.vlr_tag;
 		proto = vlr.vlr_proto;
 
 #ifdef COMPAT_FREEBSD12
 		/* A zero protocol is treated as the default ethertype. */
 		if (proto == 0)
 			proto = ETHERTYPE_VLAN;
 #endif
 		/* From here on "p" holds a reference that must be released. */
 		p = ifunit_ref(vlr.vlr_parent);
 		if (p == NULL)
 			return (ENXIO);
 	}
 
 	if ((error = ifc_name2unit(name, &unit)) == 0) {
 
 		/*
 		 * vlanX interface. Set wildcard to true if the unit number
 		 * is not fixed (-1)
 		 */
 		wildcard = (unit < 0);
 	} else {
 		/* Not a vlanX name; try the <parent>.<vid> form instead. */
 		struct ifnet *p_tmp = vlan_clone_match_ethervid(name, &vid);
 		if (p_tmp != NULL) {
 			error = 0;
 			subinterface = true;
 			unit = IF_DUNIT_NONE;
 			wildcard = false;
 			if (p != NULL) {
 				/*
 				 * A parent was also given in "params"; keep
 				 * that reference and require both to agree.
 				 */
 				if_rele(p_tmp);
 				if (p != p_tmp)
 					error = EINVAL;
 			} else
 				p = p_tmp;
 		} else
 			error = ENXIO;
 	}
 
 	if (error != 0) {
 		if (p != NULL)
 			if_rele(p);
 		return (error);
 	}
 
 	if (!subinterface) {
 		/* vlanX interface, mark X as busy or allocate new unit # */
 		error = ifc_alloc_unit(ifc, &unit);
 		if (error != 0) {
 			if (p != NULL)
 				if_rele(p);
 			return (error);
 		}
 	}
 
 	/* In the wildcard case, we need to update the name. */
 	if (wildcard) {
 		/* Advance "dp" to the terminating NUL of "name". */
 		for (dp = name; *dp != '\0'; dp++);
 		if (snprintf(dp, len - (dp-name), "%d", unit) >
 		    len - (dp-name) - 1) {
 			panic("%s: interface name too long", __func__);
 		}
 	}
 
 	ifv = malloc(sizeof(struct ifvlan), M_VLAN, M_WAITOK | M_ZERO);
 	ifp = ifv->ifv_ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		/* Undo the unit allocation, the softc and the parent ref. */
 		if (!subinterface)
 			ifc_free_unit(ifc, unit);
 		free(ifv, M_VLAN);
 		if (p != NULL)
 			if_rele(p);
 		return (ENOSPC);
 	}
 	CK_SLIST_INIT(&ifv->vlan_mc_listhead);
 	ifp->if_softc = ifv;
 	/*
 	 * Set the name manually rather than using if_initname because
 	 * we don't conform to the default naming convention for interfaces.
 	 */
 	strlcpy(ifp->if_xname, name, IFNAMSIZ);
 	ifp->if_dname = vlanname;
 	ifp->if_dunit = unit;
 
 	ifp->if_init = vlan_init;
 #ifdef ALTQ
 	ifp->if_start = vlan_altq_start;
 	ifp->if_transmit = vlan_altq_transmit;
 	IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
 	ifp->if_snd.ifq_drv_maxlen = 0;
 	IFQ_SET_READY(&ifp->if_snd);
 #else
 	ifp->if_transmit = vlan_transmit;
 #endif
 	ifp->if_qflush = vlan_qflush;
 	ifp->if_ioctl = vlan_ioctl;
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	ifp->if_snd_tag_alloc = vlan_snd_tag_alloc;
-	ifp->if_snd_tag_modify = vlan_snd_tag_modify;
-	ifp->if_snd_tag_query = vlan_snd_tag_query;
-	ifp->if_snd_tag_free = vlan_snd_tag_free;
-	ifp->if_next_snd_tag = vlan_next_snd_tag;
 	ifp->if_ratelimit_query = vlan_ratelimit_query;
 #endif
 	ifp->if_flags = VLAN_IFFLAGS;
 	ether_ifattach(ifp, eaddr);
 	/* Now undo some of the damage... */
 	ifp->if_baudrate = 0;
 	ifp->if_type = IFT_L2VLAN;
 	ifp->if_hdrlen = ETHER_VLAN_ENCAP_LEN;
 	ifa = ifp->if_addr;
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	sdl->sdl_type = IFT_L2VLAN;
 
 	if (p != NULL) {
 		error = vlan_config(ifv, p, vid, proto);
 		/* The reference on "p" is no longer needed either way. */
 		if_rele(p);
 		if (error != 0) {
 			/*
 			 * Since we've partially failed, we need to back
 			 * out all the way, otherwise userland could get
 			 * confused.  Thus, we destroy the interface.
 			 */
 			ether_ifdetach(ifp);
 			vlan_unconfig(ifp);
 			if_free(ifp);
 			if (!subinterface)
 				ifc_free_unit(ifc, unit);
 			free(ifv, M_VLAN);
 
 			return (error);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Cloner destroy routine: detach, unconfigure and free a vlan interface
  * and release its unit number if it has one.
  *
  * Returns EBUSY if "ifp" itself has a vlan trunk attached (i.e. it is a
  * parent of other vlans), 0 otherwise.
  */
 static int
 vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
 {
 	struct ifvlan *ifv = ifp->if_softc;
 	/* Saved now because "ifp" is freed before the unit is released. */
 	int unit = ifp->if_dunit;
 
 	if (ifp->if_vlantrunk)
 		return (EBUSY);
 
 #ifdef ALTQ
 	IFQ_PURGE(&ifp->if_snd);
 #endif
 	ether_ifdetach(ifp);	/* first, remove it from system-wide lists */
 	vlan_unconfig(ifp);	/* now it can be unconfigured and freed */
 	/*
 	 * We should have the only reference to the ifv now, so we can now
 	 * drain any remaining lladdr task before freeing the ifnet and the
 	 * ifvlan.
 	 */
 	taskqueue_drain(taskqueue_thread, &ifv->lladdr_task);
 	NET_EPOCH_WAIT();
 	if_free(ifp);
 	free(ifv, M_VLAN);
 	/* Sub-interfaces (<parent>.<vid>) have no unit number to free. */
 	if (unit != IF_DUNIT_NONE)
 		ifc_free_unit(ifc, unit);
 
 	return (0);
 }
 
 /*
  * The ifp->if_init entry point for vlan(4) is a no-op.
  * The argument is ignored.
  */
 static void
 vlan_init(void *foo __unused)
 {
 }
 
 /*
  * The if_transmit method for vlan(4) interface.
  *
  * Tags the packet for this vlan and hands it to the parent's
  * if_transmit.  Returns ENETDOWN if the vlan has no trunk or the parent
  * is not up and running, EAGAIN if the packet's send tag belongs to a
  * different parent, and otherwise the parent's if_transmit result.
  * On the error paths handled here the mbuf is freed and OERRORS is
  * incremented.  Must be called inside the network epoch.
  */
 static int
 vlan_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ifvlan *ifv;
 	struct ifnet *p;
 	int error, len, mcast;
 
 	NET_EPOCH_ASSERT();
 
 	ifv = ifp->if_softc;
 	if (TRUNK(ifv) == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 	p = PARENT(ifv);
 	/* Capture length and mcast now; "m" is handed off further down. */
 	len = m->m_pkthdr.len;
 	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
 
 	BPF_MTAP(ifp, m);
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) {
 		struct vlan_snd_tag *vst;
 		struct m_snd_tag *mst;
 
 		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
 		mst = m->m_pkthdr.snd_tag;
 		vst = mst_to_vst(mst);
 		/* The wrapped tag must belong to the current parent. */
 		if (vst->tag->ifp != p) {
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			m_freem(m);
 			return (EAGAIN);
 		}
 
 		/*
 		 * Swap the vlan-level send tag for a new reference on the
 		 * wrapped tag, then drop the reference on the vlan-level
 		 * tag.
 		 */
 		m->m_pkthdr.snd_tag = m_snd_tag_ref(vst->tag);
 		m_snd_tag_rele(mst);
 	}
 #endif
 
 	/*
 	 * Do not run parent's if_transmit() if the parent is not up,
 	 * or parent's driver will cause a system crash.
 	 */
 	if (!UP_AND_RUNNING(p)) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/*
 	 * NOTE(review): this path returns 0 although nothing was sent and
 	 * does not free "m" here; presumably ether_8021q_frame() disposes
 	 * of the mbuf on failure -- confirm.
 	 */
 	if (!ether_8021q_frame(&m, ifp, p, &ifv->ifv_qtag)) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (0);
 	}
 
 	/*
 	 * Send it, precisely as ether_output() would have.
 	 */
 	error = (p->if_transmit)(p, m);
 	if (error == 0) {
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
 		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast);
 	} else
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	return (error);
 }
 
 /*
  * if_output wrapper for vlan interfaces.  Walks up through any stacked
  * vlans to the first non-vlan parent and calls that parent's if_output
  * with the original "ifp".  If a vlan without a trunk is met on the way,
  * the mbuf is freed and ENETDOWN is returned.  Must be called inside the
  * network epoch.
  */
 static int
 vlan_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
     struct route *ro)
 {
 	struct ifnet *parent;
 	struct ifvlan *ifv;
 
 	NET_EPOCH_ASSERT();
 
 	/* Find the first non-VLAN parent interface. */
 	ifv = ifp->if_softc;
 	for (;;) {
 		if (TRUNK(ifv) == NULL) {
 			m_freem(m);
 			return (ENETDOWN);
 		}
 		parent = PARENT(ifv);
 		ifv = parent->if_softc;
 		if (parent->if_type != IFT_L2VLAN)
 			break;
 	}
 
 	return (parent->if_output(ifp, m, dst, ro));
 }
 
 #ifdef ALTQ
 /*
  * ALTQ if_start method: drain the interface send queue, passing every
  * dequeued packet to vlan_transmit().  The queue lock is held for the
  * whole drain.
  */
 static void
 vlan_altq_start(if_t ifp)
 {
 	struct ifaltq *sendq = &ifp->if_snd;
 	struct mbuf *pkt;
 
 	IFQ_LOCK(sendq);
 	for (;;) {
 		IFQ_DEQUEUE_NOLOCK(sendq, pkt);
 		if (pkt == NULL)
 			break;
 		vlan_transmit(ifp, pkt);
 	}
 	IFQ_UNLOCK(sendq);
 }
 
 /*
  * ALTQ if_transmit method.  When ALTQ is enabled on the send queue the
  * packet is enqueued and, if that succeeded, the queue is drained.
  * Otherwise the packet goes straight to vlan_transmit().  Returns the
  * enqueue or transmit result.
  */
 static int
 vlan_altq_transmit(if_t ifp, struct mbuf *m)
 {
 	int err;
 
 	if (!ALTQ_IS_ENABLED(&ifp->if_snd))
 		return (vlan_transmit(ifp, m));
 
 	IFQ_ENQUEUE(&ifp->if_snd, m, err);
 	if (err == 0)
 		vlan_altq_start(ifp);
 	return (err);
 }
 #endif	/* ALTQ */
 
 /*
  * The ifp->if_qflush entry point for vlan(4) is a no-op.
  * The argument is ignored.
  */
 static void
 vlan_qflush(struct ifnet *ifp __unused)
 {
 }
 
 /*
  * Input path for tagged frames received on parent "ifp".
  *
  * Extracts the 802.1q tag (out-of-band from the mbuf header, or in-band
  * from the Ethernet header, which is then stripped), looks up the vlan
  * for its VID and passes the packet to that vlan's if_input with rcvif
  * rewritten.  Packets for an unknown or not-running vlan are counted as
  * NOPROTO on the parent and freed.  Must be called inside the network
  * epoch.
  */
 static void
 vlan_input(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ifvlantrunk *trunk;
 	struct ifvlan *ifv;
 	struct m_tag *mtag;
 	uint16_t vid, tag;
 
 	NET_EPOCH_ASSERT();
 
 	trunk = ifp->if_vlantrunk;
 	/* No vlans are configured on this interface: drop. */
 	if (trunk == NULL) {
 		m_freem(m);
 		return;
 	}
 
 	if (m->m_flags & M_VLANTAG) {
 		/*
 		 * Packet is tagged, but m contains a normal
 		 * Ethernet frame; the tag is stored out-of-band.
 		 */
 		tag = m->m_pkthdr.ether_vtag;
 		m->m_flags &= ~M_VLANTAG;
 	} else {
 		struct ether_vlan_header *evl;
 
 		/*
 		 * Packet is tagged in-band as specified by 802.1q.
 		 */
 		switch (ifp->if_type) {
 		case IFT_ETHER:
 			/*
 			 * Make the whole VLAN header contiguous.  "m" is
 			 * not freed here on failure; presumably m_pullup()
 			 * has already released the chain -- confirm.
 			 */
 			if (m->m_len < sizeof(*evl) &&
 			    (m = m_pullup(m, sizeof(*evl))) == NULL) {
 				if_printf(ifp, "cannot pullup VLAN header\n");
 				return;
 			}
 			evl = mtod(m, struct ether_vlan_header *);
 			tag = ntohs(evl->evl_tag);
 
 			/*
 			 * Remove the 802.1q header by copying the Ethernet
 			 * addresses over it and adjusting the beginning of
 			 * the data in the mbuf.  The encapsulated Ethernet
 			 * type field is already in place.
 			 */
 			bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
 			      ETHER_HDR_LEN - ETHER_TYPE_LEN);
 			m_adj(m, ETHER_VLAN_ENCAP_LEN);
 			break;
 
 		default:
 #ifdef INVARIANTS
 			panic("%s: %s has unsupported if_type %u",
 			      __func__, ifp->if_xname, ifp->if_type);
 #endif
 			/* Without INVARIANTS: count and drop the packet. */
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			m_freem(m);
 			return;
 		}
 	}
 
 	vid = EVL_VLANOFTAG(tag);
 
 	ifv = vlan_gethash(trunk, vid);
 	if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) {
 		if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 		m_freem(m);
 		return;
 	}
 
 	if (vlan_mtag_pcp) {
 		/*
 		 * While uncommon, it is possible that we will find a 802.1q
 		 * packet encapsulated inside another packet that also had an
 		 * 802.1q header.  For example, ethernet tunneled over IPSEC
 		 * arriving over ethernet.  In that case, we replace the
 		 * existing 802.1q PCP m_tag value.
 		 */
 		mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL);
 		if (mtag == NULL) {
 			mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_IN,
 			    sizeof(uint8_t), M_NOWAIT);
 			if (mtag == NULL) {
 				if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 				m_freem(m);
 				return;
 			}
 			m_tag_prepend(m, mtag);
 		}
 		/* The one-byte payload sits right after the m_tag header. */
 		*(uint8_t *)(mtag + 1) = EVL_PRIOFTAG(tag);
 	}
 
 	m->m_pkthdr.rcvif = ifv->ifv_ifp;
 	if_inc_counter(ifv->ifv_ifp, IFCOUNTER_IPACKETS, 1);
 
 	/* Pass it back through the parent's input routine. */
 	(*ifv->ifv_ifp->if_input)(ifv->ifv_ifp, m);
 }
 
 /*
  * Taskqueue handler for lladdr_task.  "arg" is the ifvlan whose
  * interface already has the new link-layer address copied in; this
  * applies it through if_setlladdr() in the interface's vnet.
  */
 static void
 vlan_lladdr_fn(void *arg, int pending __unused)
 {
 	struct ifvlan *ifv = (struct ifvlan *)arg;
 	struct ifnet *ifp = ifv->ifv_ifp;
 
 	CURVNET_SET(ifp->if_vnet);
 
 	/* The ifv_ifp already has the lladdr copied in. */
 	if_setlladdr(ifp, IF_LLADDR(ifp), ifp->if_addrlen);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Attach vlan "ifv" to parent interface "p" using VLAN id "vid" and
  * protocol "proto".  Creates the parent's trunk if this is the first
  * vlan on it.
  *
  * Returns 0 on success, EPROTONOSUPPORT if the parent is of an
  * unsupported type or lacks the required flags, EINVAL for a reserved or
  * out-of-range VID, EBUSY if "ifv" is already attached, or the error
  * from vlan_inshash().
  */
 static int
 vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid,
 	uint16_t proto)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifnet *ifp;
 	int error = 0;
 
 	/*
 	 * We can handle non-ethernet hardware types as long as
 	 * they handle the tagging and headers themselves.
 	 */
 	if (p->if_type != IFT_ETHER &&
 	    p->if_type != IFT_L2VLAN &&
 	    (p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0)
 		return (EPROTONOSUPPORT);
 	if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS)
 		return (EPROTONOSUPPORT);
 	/*
 	 * Don't let the caller set up a VLAN VID with
 	 * anything except VLID bits.
 	 * VID numbers 0x0 and 0xFFF are reserved.
 	 */
 	if (vid == 0 || vid == 0xFFF || (vid & ~EVL_VLID_MASK))
 		return (EINVAL);
 	if (ifv->ifv_trunk)
 		return (EBUSY);
 
 	VLAN_XLOCK();
 	if (p->if_vlantrunk == NULL) {
 		/* First vlan on this parent: create and publish a trunk. */
 		trunk = malloc(sizeof(struct ifvlantrunk),
 		    M_VLAN, M_WAITOK | M_ZERO);
 		vlan_inithash(trunk);
 		TRUNK_LOCK_INIT(trunk);
 		TRUNK_WLOCK(trunk);
 		p->if_vlantrunk = trunk;
 		trunk->parent = p;
 		/* The trunk holds its own reference on the parent. */
 		if_ref(trunk->parent);
 		TRUNK_WUNLOCK(trunk);
 	} else {
 		trunk = p->if_vlantrunk;
 	}
 
 	ifv->ifv_vid = vid;	/* must set this before vlan_inshash() */
 	ifv->ifv_pcp = 0;       /* Default: best effort delivery. */
 	error = vlan_inshash(trunk, ifv);
 	if (error)
 		goto done;
 	ifv->ifv_proto = proto;
 	ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN;
 	ifv->ifv_mintu = ETHERMIN;
 	ifv->ifv_pflags = 0;
 	ifv->ifv_capenable = -1;
 
 	/*
 	 * If the parent supports the VLAN_MTU capability,
 	 * i.e. can Tx/Rx larger than ETHER_MAX_LEN frames,
 	 * use it.
 	 */
 	if (p->if_capenable & IFCAP_VLAN_MTU) {
 		/*
 		 * No need to fudge the MTU since the parent can
 		 * handle extended frames.
 		 */
 		ifv->ifv_mtufudge = 0;
 	} else {
 		/*
 		 * Fudge the MTU by the encapsulation size.  This
 		 * makes us incompatible with strictly compliant
 		 * 802.1Q implementations, but allows us to use
 		 * the feature with other NetBSD implementations,
 		 * which might still be useful.
 		 */
 		ifv->ifv_mtufudge = ifv->ifv_encaplen;
 	}
 
 	ifv->ifv_trunk = trunk;
 	ifp = ifv->ifv_ifp;
 	/*
 	 * Initialize fields from our parent.  This duplicates some
 	 * work with ether_ifattach() but allows for non-ethernet
 	 * interfaces to also work.
 	 */
 	ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge;
 	ifp->if_baudrate = p->if_baudrate;
 	ifp->if_input = p->if_input;
 	ifp->if_resolvemulti = p->if_resolvemulti;
 	ifp->if_addrlen = p->if_addrlen;
 	ifp->if_broadcastaddr = p->if_broadcastaddr;
 	ifp->if_pcp = ifv->ifv_pcp;
 
 	/*
 	 * We wrap the parent's if_output using vlan_output to ensure that it
 	 * can't become stale.
 	 */
 	ifp->if_output = vlan_output;
 
 	/*
 	 * Copy only a selected subset of flags from the parent.
 	 * Other flags are none of our business.
 	 */
 #define VLAN_COPY_FLAGS (IFF_SIMPLEX)
 	ifp->if_flags &= ~VLAN_COPY_FLAGS;
 	ifp->if_flags |= p->if_flags & VLAN_COPY_FLAGS;
 #undef VLAN_COPY_FLAGS
 
 	ifp->if_link_state = p->if_link_state;
 
 	/* vlan_capabilities() asserts the network epoch. */
 	NET_EPOCH_ENTER(et);
 	vlan_capabilities(ifv);
 	NET_EPOCH_EXIT(et);
 
 	/*
 	 * Set up our interface address to reflect the underlying
 	 * physical interface's.
 	 */
 	TASK_INIT(&ifv->lladdr_task, 0, vlan_lladdr_fn, ifv);
 	((struct sockaddr_dl *)ifp->if_addr->ifa_addr)->sdl_alen =
 	    p->if_addrlen;
 
 	/*
 	 * Do not schedule link address update if it was the same
 	 * as previous parent's. This helps avoid updating for each
 	 * associated llentry.
 	 */
 	if (memcmp(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen) != 0) {
 		bcopy(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen);
 		taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task);
 	}
 
 	/* We are ready for operation now. */
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 
 	/* Update flags on the parent, if necessary. */
 	vlan_setflags(ifp, 1);
 
 	/*
 	 * Configure multicast addresses that may already be
 	 * joined on the vlan device.
 	 */
 	(void)vlan_setmulti(ifp);
 
 done:
 	/* Notify vlan_config listeners only on success. */
 	if (error == 0)
 		EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_vid);
 	VLAN_XUNLOCK();
 
 	return (error);
 }
 
 /*
  * Detach a vlan interface from its parent.  Takes the exclusive vlan
  * lock around vlan_unconfig_locked() with "departing" set to 0.
  */
 static void
 vlan_unconfig(struct ifnet *ifp)
 {
 
 	VLAN_XLOCK();
 	vlan_unconfig_locked(ifp, 0);
 	VLAN_XUNLOCK();
 }
 
 /*
  * Detach vlan interface "ifp" from its trunk, if it has one, and reset
  * its interface state.  "departing" is non-zero when the parent itself
  * is going away; in that case multicast addresses are not removed from
  * the parent.  Destroys the trunk when the last vlan leaves it.  The
  * caller must hold the exclusive vlan lock.
  */
 static void
 vlan_unconfig_locked(struct ifnet *ifp, int departing)
 {
 	struct ifvlantrunk *trunk;
 	struct vlan_mc_entry *mc;
 	struct ifvlan *ifv;
 	struct ifnet  *parent;
 	int error;
 
 	VLAN_XLOCK_ASSERT();
 
 	ifv = ifp->if_softc;
 	trunk = ifv->ifv_trunk;
 	parent = NULL;
 
 	if (trunk != NULL) {
 		parent = trunk->parent;
 
 		/*
 		 * Since the interface is being unconfigured, we need to
 		 * empty the list of multicast groups that we may have joined
 		 * while we were alive from the parent's list.
 		 */
 		while ((mc = CK_SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) {
 			/*
 			 * If the parent interface is being detached,
 			 * all its multicast addresses have already
 			 * been removed.  Warn about errors if
 			 * if_delmulti() does fail, but don't abort as
 			 * all callers expect vlan destruction to
 			 * succeed.
 			 */
 			if (!departing) {
 				error = if_delmulti(parent,
 				    (struct sockaddr *)&mc->mc_addr);
 				if (error)
 					if_printf(ifp,
 		    "Failed to delete multicast address from parent: %d\n",
 					    error);
 			}
 			CK_SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries);
 			/* The entry is freed later by the epoch callback. */
 			NET_EPOCH_CALL(vlan_mc_free, &mc->mc_epoch_ctx);
 		}
 
 		vlan_setflags(ifp, 0); /* clear special flags on parent */
 
 		vlan_remhash(trunk, ifv);
 		ifv->ifv_trunk = NULL;
 
 		/*
 		 * Check if we were the last.
 		 */
 		if (trunk->refcnt == 0) {
 			/*
 			 * Unpublish the trunk and wait out the network
 			 * epoch before destroying it.
 			 */
 			parent->if_vlantrunk = NULL;
 			NET_EPOCH_WAIT();
 			trunk_destroy(trunk);
 		}
 	}
 
 	/* Disconnect from parent. */
 	if (ifv->ifv_pflags)
 		if_printf(ifp, "%s: ifv_pflags unclean\n", __func__);
 	ifp->if_mtu = ETHERMTU;
 	ifp->if_link_state = LINK_STATE_UNKNOWN;
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 
 	/*
 	 * Only dispatch an event if vlan was
 	 * attached, otherwise there is nothing
 	 * to cleanup anyway.
 	 */
 	if (parent != NULL)
 		EVENTHANDLER_INVOKE(vlan_unconfig, parent, ifv->ifv_vid);
 }
 
 /*
  * Propagate one reference-counted flag from a vlan to its parent.
  *
  * "flag" is the if_flags bit concerned.  A non-zero "status" means the
  * parent should follow the vlan's current setting of that bit; zero
  * means the bit should be released on the parent.  "func" is called
  * with the parent and the wanted value to take or release the
  * reference.  Returns 0 or the error from "func".
  */
 static int
 vlan_setflag(struct ifnet *ifp, int flag, int status,
     int (*func)(struct ifnet *, int))
 {
 	struct ifvlan *ifv;
 	int error, recorded;
 
 	VLAN_SXLOCK_ASSERT();
 
 	ifv = ifp->if_softc;
 
 	/* Reduce "status" to either the flag value or 0. */
 	if (status)
 		status = ifp->if_flags & flag;
 	else
 		status = 0;
 
 	/*
 	 * ifv_pflags remembers what this vlan has requested from the
 	 * parent, so a flag that this vlan did not set is never cleared.
 	 * The parent's flags are not written directly; references to them
 	 * are taken or released through "func", which keeps the recorded
 	 * state in step with the parent's actual flags.
 	 */
 	recorded = ifv->ifv_pflags & flag;
 	if (status == recorded)
 		return (0);
 
 	error = (*func)(PARENT(ifv), status);
 	if (error)
 		return (error);
 	ifv->ifv_pflags &= ~flag;
 	ifv->ifv_pflags |= status;
 	return (0);
 }
 
 /*
  * Apply vlan_setflag() for every entry of the vlan_pflags table.
  * A true "status" brings the parent in line with our if_flags; a false
  * "status" forcibly clears the flags set on the parent.
  * Stops at the first failure and returns its error; returns 0 otherwise.
  */
 static int
 vlan_setflags(struct ifnet *ifp, int status)
 {
 	int idx, rc;
 
 	/* The table ends at the first entry whose flag is 0. */
 	idx = 0;
 	while (vlan_pflags[idx].flag != 0) {
 		rc = vlan_setflag(ifp, vlan_pflags[idx].flag, status,
 		    vlan_pflags[idx].func);
 		if (rc != 0)
 			return (rc);
 		idx++;
 	}
 	return (0);
 }
 
 /*
  * Copy the parent's baudrate and link state to every vlan configured on
  * it.  Nothing happens when the parent has no trunk.
  */
 static void
 vlan_link_state(struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifvlan *child;
 
 	NET_EPOCH_ENTER(et);
 	trunk = ifp->if_vlantrunk;
 	if (trunk != NULL) {
 		TRUNK_WLOCK(trunk);
 		VLAN_FOREACH(child, trunk) {
 			child->ifv_ifp->if_baudrate =
 			    trunk->parent->if_baudrate;
 			if_link_state_change(child->ifv_ifp,
 			    trunk->parent->if_link_state);
 		}
 		TRUNK_WUNLOCK(trunk);
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Recompute the capabilities of a vlan interface from its parent.
  *
  * Three values are built up and stored on the vlan's ifnet at the end:
  *   cap - if_capabilities, what the vlan can support;
  *   ena - if_capenable, what is currently enabled;
  *   hwa - if_hwassist, the CSUM_* flags matching the enabled offloads.
  * "mena" is the parent's enabled set masked by ifv_capenable, the set
  * requested for this vlan.
  */
 static void
 vlan_capabilities(struct ifvlan *ifv)
 {
 	struct ifnet *p;
 	struct ifnet *ifp;
 	struct ifnet_hw_tsomax hw_tsomax;
 	int cap = 0, ena = 0, mena;
 	u_long hwa = 0;
 
 	NET_EPOCH_ASSERT();
 	VLAN_SXLOCK_ASSERT();
 
 	p = PARENT(ifv);
 	ifp = ifv->ifv_ifp;
 
 	/* Mask parent interface enabled capabilities disabled by user. */
 	mena = p->if_capenable & ifv->ifv_capenable;
 
 	/*
 	 * If the parent interface can do checksum offloading
 	 * on VLANs, then propagate its hardware-assisted
 	 * checksumming flags. Also assert that checksum
 	 * offloading requires hardware VLAN tagging.
 	 */
 	if (p->if_capabilities & IFCAP_VLAN_HWCSUM)
 		cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
 	if (p->if_capenable & IFCAP_VLAN_HWCSUM &&
 	    p->if_capenable & IFCAP_VLAN_HWTAGGING) {
 		ena |= mena & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
 		if (ena & IFCAP_TXCSUM)
 			hwa |= p->if_hwassist & (CSUM_IP | CSUM_TCP |
 			    CSUM_UDP | CSUM_SCTP);
 		if (ena & IFCAP_TXCSUM_IPV6)
 			hwa |= p->if_hwassist & (CSUM_TCP_IPV6 |
 			    CSUM_UDP_IPV6 | CSUM_SCTP_IPV6);
 	}
 
 	/*
 	 * If the parent interface can do TSO on VLANs then
 	 * propagate the hardware-assisted flag. TSO on VLANs
 	 * does not necessarily require hardware VLAN tagging.
 	 */
 	/* The TSO limits are refreshed regardless of IFCAP_VLAN_HWTSO. */
 	memset(&hw_tsomax, 0, sizeof(hw_tsomax));
 	if_hw_tsomax_common(p, &hw_tsomax);
 	if_hw_tsomax_update(ifp, &hw_tsomax);
 	if (p->if_capabilities & IFCAP_VLAN_HWTSO)
 		cap |= p->if_capabilities & IFCAP_TSO;
 	if (p->if_capenable & IFCAP_VLAN_HWTSO) {
 		ena |= mena & IFCAP_TSO;
 		if (ena & IFCAP_TSO)
 			hwa |= p->if_hwassist & CSUM_TSO;
 	}
 
 	/*
 	 * If the parent interface can do LRO and checksum offloading on
 	 * VLANs, then guess it may do LRO on VLANs.  False positive here
 	 * cost nothing, while false negative may lead to some confusions.
 	 *
 	 * NOTE(review): unlike the other capabilities in this function, the
 	 * enabled LRO bit is taken from p->if_capenable rather than from
 	 * mena, so ifv_capenable does not mask it -- confirm this is
 	 * intended.
 	 */
 	if (p->if_capabilities & IFCAP_VLAN_HWCSUM)
 		cap |= p->if_capabilities & IFCAP_LRO;
 	if (p->if_capenable & IFCAP_VLAN_HWCSUM)
 		ena |= p->if_capenable & IFCAP_LRO;
 
 	/*
 	 * If the parent interface can offload TCP connections over VLANs then
 	 * propagate its TOE capability to the VLAN interface.
 	 *
 	 * All TOE drivers in the tree today can deal with VLANs.  If this
 	 * changes then IFCAP_VLAN_TOE should be promoted to a full capability
 	 * with its own bit.
 	 */
 #define	IFCAP_VLAN_TOE IFCAP_TOE
 	if (p->if_capabilities & IFCAP_VLAN_TOE)
 		cap |= p->if_capabilities & IFCAP_TOE;
 	if (p->if_capenable & IFCAP_VLAN_TOE) {
 		/* Share the parent's TOE device with the vlan. */
 		TOEDEV(ifp) = TOEDEV(p);
 		ena |= mena & IFCAP_TOE;
 	}
 
 	/*
 	 * If the parent interface supports dynamic link state, so does the
 	 * VLAN interface.
 	 */
 	cap |= (p->if_capabilities & IFCAP_LINKSTATE);
 	ena |= (mena & IFCAP_LINKSTATE);
 
 #ifdef RATELIMIT
 	/*
 	 * If the parent interface supports ratelimiting, so does the
 	 * VLAN interface.
 	 */
 	cap |= (p->if_capabilities & IFCAP_TXRTLMT);
 	ena |= (mena & IFCAP_TXRTLMT);
 #endif
 
 	/*
 	 * If the parent interface supports unmapped mbufs, so does
 	 * the VLAN interface.  Note that this should be fine even for
 	 * interfaces that don't support hardware tagging as headers
 	 * are prepended in normal mbufs to unmapped mbufs holding
 	 * payload data.
 	 */
 	cap |= (p->if_capabilities & IFCAP_MEXTPG);
 	ena |= (mena & IFCAP_MEXTPG);
 
 	/*
 	 * If the parent interface can offload encryption and segmentation
 	 * of TLS records over TCP, propagate its capability to the VLAN
 	 * interface.
 	 *
 	 * All TLS drivers in the tree today can deal with VLANs.  If
 	 * this ever changes, then a new IFCAP_VLAN_TXTLS can be
 	 * defined.
 	 */
 	if (p->if_capabilities & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT))
 		cap |= p->if_capabilities & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT);
 	if (p->if_capenable & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT))
 		ena |= mena & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT);
 
 	/* Publish the freshly computed sets on the vlan interface. */
 	ifp->if_capabilities = cap;
 	ifp->if_capenable = ena;
 	ifp->if_hwassist = hwa;
 }
 
 static void
 vlan_trunk_capabilities(struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifvlantrunk *trunk;
 	struct ifvlan *ifv;
 
 	VLAN_SLOCK();
 	trunk = ifp->if_vlantrunk;
 	if (trunk == NULL) {
 		VLAN_SUNLOCK();
 		return;
 	}
 	NET_EPOCH_ENTER(et);
 	VLAN_FOREACH(ifv, trunk)
 		vlan_capabilities(ifv);
 	NET_EPOCH_EXIT(et);
 	VLAN_SUNLOCK();
 }
 
 /*
  * ioctl handler for a vlan interface.
  *
  * "data" is interpreted according to "cmd" (as a struct ifreq, a struct
  * ifaddr or a struct ifmediareq).  Requests that need the parent consult
  * it only while the vlan is attached to a trunk.  Commands that are not
  * listed in the switch fail with EINVAL.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifnet *p;
 	struct ifreq *ifr;
 	struct ifaddr *ifa;
 	struct ifvlan *ifv;
 	struct ifvlantrunk *trunk;
 	struct vlanreq vlr;
 	int error = 0, oldmtu;
 
 	/* Both views alias "data"; which one is valid depends on cmd. */
 	ifr = (struct ifreq *)data;
 	ifa = (struct ifaddr *) data;
 	ifv = ifp->if_softc;
 
 	switch (cmd) {
 	case SIOCSIFADDR:
 		/* Assigning an address marks the interface up. */
 		ifp->if_flags |= IFF_UP;
 #ifdef INET
 		if (ifa->ifa_addr->sa_family == AF_INET)
 			arp_ifinit(ifp, ifa);
 #endif
 		break;
 	case SIOCGIFADDR:
 		/* Return the link-level address of the vlan. */
 		bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0],
 		    ifp->if_addrlen);
 		break;
 	case SIOCGIFMEDIA:
 		/* Media is the parent's; EINVAL without a trunk. */
 		VLAN_SLOCK();
 		if (TRUNK(ifv) != NULL) {
 			p = PARENT(ifv);
 			/* Hold a reference across the parent's ioctl. */
 			if_ref(p);
 			error = (*p->if_ioctl)(p, SIOCGIFMEDIA, data);
 			if_rele(p);
 			/* Limit the result to the parent's current config. */
 			if (error == 0) {
 				struct ifmediareq *ifmr;
 
 				ifmr = (struct ifmediareq *)data;
 				if (ifmr->ifm_count >= 1 && ifmr->ifm_ulist) {
 					ifmr->ifm_count = 1;
 					error = copyout(&ifmr->ifm_current,
 						ifmr->ifm_ulist,
 						sizeof(int));
 				}
 			}
 		} else {
 			error = EINVAL;
 		}
 		VLAN_SUNLOCK();
 		break;
 
 	case SIOCSIFMEDIA:
 		/* Media cannot be set on the vlan itself. */
 		error = EINVAL;
 		break;
 
 	case SIOCSIFMTU:
 		/*
 		 * Set the interface MTU.
 		 *
 		 * The new value must lie between ifv_mintu and the parent's
 		 * MTU, both reduced by ifv_mtufudge; EINVAL otherwise or
 		 * when there is no trunk.
 		 */
 		VLAN_SLOCK();
 		trunk = TRUNK(ifv);
 		if (trunk != NULL) {
 			TRUNK_WLOCK(trunk);
 			if (ifr->ifr_mtu >
 			     (PARENT(ifv)->if_mtu - ifv->ifv_mtufudge) ||
 			    ifr->ifr_mtu <
 			     (ifv->ifv_mintu - ifv->ifv_mtufudge))
 				error = EINVAL;
 			else
 				ifp->if_mtu = ifr->ifr_mtu;
 			TRUNK_WUNLOCK(trunk);
 		} else
 			error = EINVAL;
 		VLAN_SUNLOCK();
 		break;
 
 	case SIOCSETVLAN:
 #ifdef VIMAGE
 		/*
 		 * XXXRW/XXXBZ: The goal in these checks is to allow a VLAN
 		 * interface to be delegated to a jail without allowing the
 		 * jail to change what underlying interface/VID it is
 		 * associated with.  We are not entirely convinced that this
 		 * is the right way to accomplish that policy goal.
 		 */
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		error = copyin(ifr_data_get_ptr(ifr), &vlr, sizeof(vlr));
 		if (error)
 			break;
 		/* An empty parent name requests detaching from the parent. */
 		if (vlr.vlr_parent[0] == '\0') {
 			vlan_unconfig(ifp);
 			break;
 		}
 		/* Look the parent up by name; the reference is dropped below. */
 		p = ifunit_ref(vlr.vlr_parent);
 		if (p == NULL) {
 			error = ENOENT;
 			break;
 		}
 #ifdef COMPAT_FREEBSD12
 		/* A zero protocol is treated as ETHERTYPE_VLAN. */
 		if (vlr.vlr_proto == 0)
 			vlr.vlr_proto = ETHERTYPE_VLAN;
 #endif
 		oldmtu = ifp->if_mtu;
 		error = vlan_config(ifv, p, vlr.vlr_tag, vlr.vlr_proto);
 		if_rele(p);
 
 		/*
 		 * VLAN MTU may change during addition of the vlandev.
 		 * If it did, do network layer specific procedure.
 		 */
 		if (ifp->if_mtu != oldmtu) {
 #ifdef INET6
 			nd6_setmtu(ifp);
 #endif
 			rt_updatemtu(ifp);
 		}
 		break;
 
 	case SIOCGETVLAN:
 #ifdef VIMAGE
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		/* An unattached vlan reports an all-zero vlanreq. */
 		bzero(&vlr, sizeof(vlr));
 		VLAN_SLOCK();
 		if (TRUNK(ifv) != NULL) {
 			strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname,
 			    sizeof(vlr.vlr_parent));
 			vlr.vlr_tag = ifv->ifv_vid;
 			vlr.vlr_proto = ifv->ifv_proto;
 		}
 		VLAN_SUNLOCK();
 		error = copyout(&vlr, ifr_data_get_ptr(ifr), sizeof(vlr));
 		break;
 
 	case SIOCSIFFLAGS:
 		/*
 		 * We should propagate selected flags to the parent,
 		 * e.g., promiscuous mode.
 		 */
 		VLAN_XLOCK();
 		if (TRUNK(ifv) != NULL)
 			error = vlan_setflags(ifp, 1);
 		VLAN_XUNLOCK();
 		break;
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/*
 		 * If we don't have a parent, just remember the membership for
 		 * when we do.
 		 *
 		 * XXX We need the rmlock here to avoid sleeping while
 		 * holding in6_multi_mtx.
 		 */
 		VLAN_XLOCK();
 		trunk = TRUNK(ifv);
 		if (trunk != NULL)
 			error = vlan_setmulti(ifp);
 		VLAN_XUNLOCK();
 
 		break;
 	case SIOCGVLANPCP:
 #ifdef VIMAGE
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		ifr->ifr_vlan_pcp = ifv->ifv_pcp;
 		break;
 
 	case SIOCSVLANPCP:
 #ifdef VIMAGE
 		if (ifp->if_vnet != ifp->if_home_vnet) {
 			error = EPERM;
 			break;
 		}
 #endif
 		/* Requires PRIV_NET_SETVLANPCP and a value <= VLAN_PCP_MAX. */
 		error = priv_check(curthread, PRIV_NET_SETVLANPCP);
 		if (error)
 			break;
 		if (ifr->ifr_vlan_pcp > VLAN_PCP_MAX) {
 			error = EINVAL;
 			break;
 		}
 		ifv->ifv_pcp = ifr->ifr_vlan_pcp;
 		ifp->if_pcp = ifv->ifv_pcp;
 		/* broadcast event about PCP change */
 		EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP);
 		break;
 
 	case SIOCSIFCAP:
 		/*
 		 * Record the requested capability set; the effective
 		 * capabilities are only recomputed when a trunk is attached.
 		 */
 		VLAN_SLOCK();
 		ifv->ifv_capenable = ifr->ifr_reqcap;
 		trunk = TRUNK(ifv);
 		if (trunk != NULL) {
 			struct epoch_tracker et;
 
 			NET_EPOCH_ENTER(et);
 			vlan_capabilities(ifv);
 			NET_EPOCH_EXIT(et);
 		}
 		VLAN_SUNLOCK();
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 #if defined(KERN_TLS) || defined(RATELIMIT)
 /*
  * Allocate a send tag for a vlan interface.
  *
  * The real tag is allocated on the parent interface and stored in a
  * vlan_snd_tag wrapper; the wrapper's embedded tag is what is handed back
  * through "ppmt".
  *
  * Returns 0 on success, EOPNOTSUPP when the vlan has no parent, ENOMEM
  * when the wrapper cannot be allocated, or the error returned by
  * m_snd_tag_alloc() on the parent.
  */
 static int
 vlan_snd_tag_alloc(struct ifnet *ifp,
     union if_snd_tag_alloc_params *params,
     struct m_snd_tag **ppmt)
 {
 	struct epoch_tracker et;
+	const struct if_snd_tag_sw *sw;
 	struct vlan_snd_tag *vst;
 	struct ifvlan *ifv;
 	struct ifnet *parent;
 	int error;
 
+	switch (params->hdr.type) {
+#ifdef RATELIMIT
+	case IF_SND_TAG_TYPE_UNLIMITED:
+		sw = &vlan_snd_tag_ul_sw;
+		break;
+	case IF_SND_TAG_TYPE_RATE_LIMIT:
+		sw = &vlan_snd_tag_rl_sw;
+		break;
+#endif
+#ifdef KERN_TLS
+	case IF_SND_TAG_TYPE_TLS:
+		sw = &vlan_snd_tag_tls_sw;
+		break;
+#ifdef RATELIMIT
+	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
+		sw = &vlan_snd_tag_tls_rl_sw;
+		break;
+#endif
+#endif
+	default:
+		return (EOPNOTSUPP);
+	}
+
 	/* Look up the parent and take a reference inside the net epoch. */
 	NET_EPOCH_ENTER(et);
 	ifv = ifp->if_softc;
 	if (ifv->ifv_trunk != NULL)
 		parent = PARENT(ifv);
 	else
 		parent = NULL;
 	if (parent == NULL) {
 		NET_EPOCH_EXIT(et);
 		return (EOPNOTSUPP);
 	}
 	if_ref(parent);
 	NET_EPOCH_EXIT(et);
 
 	/* Non-sleeping allocation of the wrapper. */
 	vst = malloc(sizeof(*vst), M_VLAN, M_NOWAIT);
 	if (vst == NULL) {
 		if_rele(parent);
 		return (ENOMEM);
 	}
 
 	/* The parent reference is only needed for the allocation call. */
 	error = m_snd_tag_alloc(parent, params, &vst->tag);
 	if_rele(parent);
 	if (error) {
 		free(vst, M_VLAN);
 		return (error);
 	}
 
-	m_snd_tag_init(&vst->com, ifp, vst->tag->type);
+	m_snd_tag_init(&vst->com, ifp, sw);
 
 	*ppmt = &vst->com;
 	return (0);
 }
 
 static struct m_snd_tag *
 vlan_next_snd_tag(struct m_snd_tag *mst)
 {
 	struct vlan_snd_tag *vst;
 
 	vst = mst_to_vst(mst);
 	return (vst->tag);
 }
 
 /*
  * Modify a vlan send tag by forwarding the request to the wrapped tag.
  * Returns whatever the wrapped tag's modify method returns.
  */
 static int
 vlan_snd_tag_modify(struct m_snd_tag *mst,
     union if_snd_tag_modify_params *params)
 {
 	struct vlan_snd_tag *vst;
 
 	vst = mst_to_vst(mst);
-	return (vst->tag->ifp->if_snd_tag_modify(vst->tag, params));
+	return (vst->tag->sw->snd_tag_modify(vst->tag, params));
 }
 
 /*
  * Query a vlan send tag by forwarding the request to the wrapped tag.
  * Returns whatever the wrapped tag's query method returns.
  */
 static int
 vlan_snd_tag_query(struct m_snd_tag *mst,
     union if_snd_tag_query_params *params)
 {
 	struct vlan_snd_tag *vst;
 
 	vst = mst_to_vst(mst);
-	return (vst->tag->ifp->if_snd_tag_query(vst->tag, params));
+	return (vst->tag->sw->snd_tag_query(vst->tag, params));
 }
 
 /*
  * Release a vlan send tag: drop the reference held on the wrapped tag,
  * then free the wrapper itself.
  */
 static void
 vlan_snd_tag_free(struct m_snd_tag *mst)
 {
 	struct vlan_snd_tag *wrapper = mst_to_vst(mst);
 
 	m_snd_tag_rele(wrapper->tag);
 	free(wrapper, M_VLAN);
 }
 
 /*
  * Answer a ratelimit query for a vlan.
  *
  * A vlan is an indirect interface, so no rate table is reported here;
  * the caller has to obtain a ratelimit tag on the interface the flow
  * will actually use.
  */
 static void
 vlan_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
 {
 
 	q->flags = RT_IS_INDIRECT;
 	q->rate_table = NULL;
 	q->number_of_rates = 0;
 	q->max_flows = 0;
 }
 
 #endif
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 4c21bdbf1347..04d34b022772 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -1,3597 +1,3582 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1991, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2007-2009 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_ipsec.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 #include "opt_pcbgroup.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/callout.h>
 #include <sys/eventhandler.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_llatbl.h>
 #include <net/route.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #ifdef INET
 #include <netinet/in_var.h>
 #include <netinet/in_fib.h>
 #endif
 #include <netinet/ip_var.h>
 #include <netinet/tcp_var.h>
 #ifdef TCPHPTS
 #include <netinet/tcp_hpts.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #endif /* INET6 */
 #include <net/route/nhop.h>
 #endif
 
 #include <netipsec/ipsec_support.h>
 
 #include <security/mac/mac_framework.h>
 
 #define	INPCBLBGROUP_SIZMIN	8
 #define	INPCBLBGROUP_SIZMAX	256
 
 static struct callout	ipport_tick_callout;
 
 /*
  * These configure the range of local port addresses assigned to
  * "unspecified" outgoing connections/packets/whatever.
  */
 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */
 
 /*
  * Reserved ports accessible only to root. There are significant
  * security considerations that must be accounted for when changing these,
  * but the security benefits can be great. Please be careful.
  */
 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
 VNET_DEFINE(int, ipport_reservedlow);
 
 /* Variables dealing with random ephemeral port allocation. */
 VNET_DEFINE(int, ipport_randomized) = 1;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_randomcps) = 10;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_randomtime) = 45;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_stoprandom);		/* toggled by ipport_tick */
 VNET_DEFINE(int, ipport_tcpallocs);
 VNET_DEFINE_STATIC(int, ipport_tcplastcount);
 
 #define	V_ipport_tcplastcount		VNET(ipport_tcplastcount)
 
 static void	in_pcbremlists(struct inpcb *inp);
 #ifdef INET
 static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
 			    struct in_addr faddr, u_int fport_arg,
 			    struct in_addr laddr, u_int lport_arg,
 			    int lookupflags, struct ifnet *ifp,
 			    uint8_t numa_domain);
 
 #define RANGECHK(var, min, max) \
 	if ((var) < (min)) { (var) = (min); } \
 	else if ((var) > (max)) { (var) = (max); }
 
 /*
  * Sysctl handler shared by the port range OIDs.  After the generic
  * integer handler has succeeded, every automatic port range bound is
  * clamped back into its legal interval.
  */
 static int
 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
 {
 	int rc;
 
 	rc = sysctl_handle_int(oidp, arg1, arg2, req);
 	if (rc != 0)
 		return (rc);
 
 	/* Low (reserved) range bounds. */
 	RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
 	RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
 	/* Default and high range bounds. */
 	RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
 	RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
 	RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
 	RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
 	return (0);
 }
 
 #undef RANGECHK
 
 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IP Ports");
 
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
 	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
 	&VNET_NAME(ipport_reservedhigh), 0, "");
 /*
  * ipport_reservedlow is declared with VNET_DEFINE and registered through
  * VNET_NAME(), so the OID carries CTLFLAG_VNET like the other portrange
  * OIDs that reference per-vnet variables.
  */
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
 	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
 	&VNET_NAME(ipport_reservedlow), 0, "");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
 	"allocations before switching to a sequental one");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomtime), 0,
 	"Minimum time to keep sequental port "
 	"allocation before switching to a random one");
 
 #ifdef RATELIMIT
 counter_u64_t rate_limit_new;
 counter_u64_t rate_limit_chg;
 counter_u64_t rate_limit_active;
 counter_u64_t rate_limit_alloc_fail;
 counter_u64_t rate_limit_set_ok;
 
 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "IP Rate Limiting");
 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
     &rate_limit_active, "Active rate limited connections");
 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
    &rate_limit_alloc_fail, "Rate limited connection failures");
 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
    &rate_limit_set_ok, "Rate limited setting succeeded");
 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
    &rate_limit_new, "Total Rate limit new attempts");
 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
    &rate_limit_chg, "Total Rate limited change attempts");
 
 #endif /* RATELIMIT */
 
 #endif /* INET */
 
 /*
  * in_pcb.c: manage the Protocol Control Blocks.
  *
  * NOTE: It is assumed that most of these functions will be called with
  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
  * functions often modify hash chains or addresses in pcbs.
  */
 
 /*
  * Allocate a zeroed load balance group with room for "size" inpcb
  * slots, fill in its identity (vflag, port, NUMA domain, local address)
  * and link it at the head of "hdr".
  * Returns the new group, or NULL if the non-sleeping allocation fails.
  */
 static struct inpcblbgroup *
 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
     uint16_t port, const union in_dependaddr *addr, int size,
     uint8_t numa_domain)
 {
 	struct inpcblbgroup *lbg;
 
 	/* The slot array is the trailing member; size the block to fit. */
 	lbg = malloc(__offsetof(struct inpcblbgroup, il_inp[size]), M_PCB,
 	    M_NOWAIT | M_ZERO);
 	if (lbg == NULL)
 		return (NULL);
 
 	lbg->il_inpsiz = size;
 	lbg->il_dependladdr = *addr;
 	lbg->il_numa_domain = numa_domain;
 	lbg->il_lport = port;
 	lbg->il_vflag = vflag;
 
 	CK_LIST_INSERT_HEAD(hdr, lbg, il_list);
 	return (lbg);
 }
 
 /*
  * Epoch callback: free the load balance group that embeds "ctx" as its
  * il_epoch_ctx member.
  */
 static void
 in_pcblbgroup_free_deferred(epoch_context_t ctx)
 {
 
 	free(__containerof(ctx, struct inpcblbgroup, il_epoch_ctx), M_PCB);
 }
 
 /*
  * Unlink a load balance group from its hash chain and schedule the
  * actual free through the net epoch callback mechanism.
  */
 static void
 in_pcblbgroup_free(struct inpcblbgroup *grp)
 {
 	CK_LIST_REMOVE(grp, il_list);
 	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
 }
 
 /*
  * Replace "old_grp" with a new group of "size" slots that has the same
  * identity and members.  On success the old group is released and the
  * replacement is returned; on allocation failure NULL is returned and
  * the old group is left untouched.
  */
 static struct inpcblbgroup *
 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
     struct inpcblbgroup *old_grp, int size)
 {
 	struct inpcblbgroup *new_grp;
 	int slot;
 
 	new_grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
 	    old_grp->il_lport, &old_grp->il_dependladdr, size,
 	    old_grp->il_numa_domain);
 	if (new_grp == NULL)
 		return (NULL);
 
 	KASSERT(old_grp->il_inpcnt < new_grp->il_inpsiz,
 	    ("invalid new local group size %d and old local group count %d",
 	     new_grp->il_inpsiz, old_grp->il_inpcnt));
 
 	/* Copy the members first, then publish the count. */
 	slot = 0;
 	while (slot < old_grp->il_inpcnt) {
 		new_grp->il_inp[slot] = old_grp->il_inp[slot];
 		slot++;
 	}
 	new_grp->il_inpcnt = old_grp->il_inpcnt;
 
 	in_pcblbgroup_free(old_grp);
 	return (new_grp);
 }
 
 /*
  * Drop the PCB stored at index 'i' of the group: the entries after it
  * are moved up by one slot and the count is decremented.  If the group
  * is larger than the minimum size and at most a quarter full, it is
  * replaced by one of half the size and *grpp is updated; when that
  * replacement cannot be allocated the current group is kept.
  */
 static void
 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
     int i)
 {
 	struct inpcblbgroup *cur, *smaller;
 	int next;
 
 	cur = *grpp;
 	for (next = i + 1; next < cur->il_inpcnt; next++)
 		cur->il_inp[next - 1] = cur->il_inp[next];
 	cur->il_inpcnt--;
 
 	/* Keep the group as is unless it is both shrinkable and sparse. */
 	if (cur->il_inpsiz <= INPCBLBGROUP_SIZMIN ||
 	    cur->il_inpcnt > cur->il_inpsiz / 4)
 		return;
 
 	smaller = in_pcblbgroup_resize(hdr, cur, cur->il_inpsiz / 2);
 	if (smaller != NULL)
 		*grpp = smaller;
 }
 
 /*
  * Insert a PCB into the load balance group (SO_REUSEPORT_LB) that
  * matches its vflag, local port, local address and "numa_domain",
  * creating or growing the group as needed.
  *
  * Returns 0 when the PCB was added, and also when it was deliberately
  * left out (jailed socket, IPv4-mapped INET6 wildcard socket, or group
  * already at INPCBLBGROUP_SIZMAX).  Returns ENOBUFS when a group could
  * not be allocated.
  */
 static int
 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
 {
 	const static struct timeval interval = { 60, 0 };
 	static struct timeval lastprint;
 	struct inpcbinfo *pcbinfo;
 	struct inpcblbgrouphead *bucket;
 	struct inpcblbgroup *lbg;
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	/* Jailed sockets never join a local group. */
 	if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred))
 		return (0);
 
 #ifdef INET6
 	/* Neither do IPv4 mapped INET6 wildcard sockets. */
 	if ((inp->inp_vflag & INP_IPV4) != 0 &&
 	    inp->inp_laddr.s_addr == INADDR_ANY &&
 	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6))
 		return (0);
 #endif
 
 	bucket = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
 
 	/* Search the chain; lbg is NULL afterwards if nothing matched. */
 	CK_LIST_FOREACH(lbg, bucket, il_list) {
 		if (lbg->il_vflag != inp->inp_vflag ||
 		    lbg->il_lport != inp->inp_lport ||
 		    lbg->il_numa_domain != numa_domain)
 			continue;
 		if (memcmp(&lbg->il_dependladdr,
 		    &inp->inp_inc.inc_ie.ie_dependladdr,
 		    sizeof(lbg->il_dependladdr)) == 0)
 			break;
 	}
 
 	if (lbg == NULL) {
 		/* No such group yet: start one at the minimum size. */
 		lbg = in_pcblbgroup_alloc(bucket, inp->inp_vflag,
 		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
 		    INPCBLBGROUP_SIZMIN, numa_domain);
 		if (lbg == NULL)
 			return (ENOBUFS);
 	} else if (lbg->il_inpcnt == lbg->il_inpsiz) {
 		/* Group is full: refuse at the cap, otherwise double it. */
 		if (lbg->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
 			if (ratecheck(&lastprint, &interval))
 				printf("lb group port %d, limit reached\n",
 				    ntohs(lbg->il_lport));
 			return (0);
 		}
 
 		lbg = in_pcblbgroup_resize(bucket, lbg, lbg->il_inpsiz * 2);
 		if (lbg == NULL)
 			return (ENOBUFS);
 	}
 
 	KASSERT(lbg->il_inpcnt < lbg->il_inpsiz,
 	    ("invalid local group size %d and count %d", lbg->il_inpsiz,
 	    lbg->il_inpcnt));
 
 	lbg->il_inp[lbg->il_inpcnt++] = inp;
 	return (0);
 }
 
 /*
  * Take a PCB out of the load balance group that contains it, if any.
  * The group is freed when the PCB was its only member; otherwise the
  * remaining members are compacted and the group may shrink.
  */
 static void
 in_pcbremlbgrouphash(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcblbgrouphead *bucket;
 	struct inpcblbgroup *lbg;
 	int slot;
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	bucket = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
 	CK_LIST_FOREACH(lbg, bucket, il_list) {
 		/* Find the slot holding inp in this group, if there is one. */
 		for (slot = 0; slot < lbg->il_inpcnt; slot++) {
 			if (lbg->il_inp[slot] == inp)
 				break;
 		}
 		if (slot == lbg->il_inpcnt)
 			continue;
 
 		if (lbg->il_inpcnt > 1) {
 			/* Pull up inpcbs, shrink group if possible. */
 			in_pcblbgroup_reorder(bucket, &lbg, slot);
 		} else {
 			/* We are the last, free this local group. */
 			in_pcblbgroup_free(lbg);
 		}
 		return;
 	}
 }
 
 /*
  * Move "inp" to the load balance group for the NUMA domain selected by
  * "arg": TCP_REUSPORT_LB_NUMA_NODOM selects M_NODOM,
  * TCP_REUSPORT_LB_NUMA_CURDOM selects the domain of the current CPU, and
  * any other value must be a domain index in [0, vm_ndomains).
  *
  * The caller holds the inpcb write lock; the pcbinfo hash lock is taken
  * and released here.
  *
  * Returns 0 on success (including when the inpcb already sits in a group
  * for that domain), EINVAL for an invalid "arg", ENOENT when the inpcb is
  * not a member of any load balance group on its port's hash chain, or the
  * error from in_pcbinslbgrouphash() when it could not be added to the new
  * group.
  */
 int
 in_pcblbgroup_numa(struct inpcb *inp, int arg)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	int err, i;
 	uint8_t numa_domain;
 
 	switch (arg) {
 	case TCP_REUSPORT_LB_NUMA_NODOM:
 		numa_domain = M_NODOM;
 		break;
 	case TCP_REUSPORT_LB_NUMA_CURDOM:
 		numa_domain = PCPU_GET(domain);
 		break;
 	default:
 		if (arg < 0 || arg >= vm_ndomains)
 			return (EINVAL);
 		numa_domain = arg;
 		break;
 	}
 
 	err = 0;
 	pcbinfo = inp->inp_pcbinfo;
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK(pcbinfo);
 	hdr = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 		for (i = 0; i < grp->il_inpcnt; ++i) {
 			if (grp->il_inp[i] != inp)
 				continue;
 
 			/* Already in a group for the requested domain. */
 			if (grp->il_numa_domain == numa_domain) {
 				goto abort_with_hash_wlock;
 			}
 
 			/* Remove it from the old group. */
 			in_pcbremlbgrouphash(inp);
 
 			/*
 			 * Add it to the new group based on numa domain.
 			 * The old membership is already gone at this point,
 			 * so a failure is reported to the caller rather than
 			 * being returned as success.
 			 */
 			err = in_pcbinslbgrouphash(inp, numa_domain);
 			goto abort_with_hash_wlock;
 		}
 	}
 	err = ENOENT;
 abort_with_hash_wlock:
 	INP_HASH_WUNLOCK(pcbinfo);
 	return (err);
 }
 
 /*
  * Zone fini routine for inpcbs.  Protocols set up their inpcbs
  * differently (each naming the lock its own way), but teardown is
  * common: destroy the inpcb lock.
  */
 static void
 inpcb_fini(void *mem, int size)
 {
 	struct inpcb *pcb;
 
 	pcb = mem;
 	INP_LOCK_DESTROY(pcb);
 }
 
 /*
  * Initialize an inpcbinfo -- we should be able to reduce the number of
  * arguments in time.
  *
  * Sets up the three pcbinfo locks, the pcb list, the connection, port and
  * load balance group hash tables, and the UMA zone that inpcbs are
  * allocated from.  "hashfields" is only consumed when PCBGROUP is
  * configured.
  */
 void
 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
     struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
     char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields)
 {
 
 	/* Cap the port hash size at IPPORT_MAX + 1 buckets. */
 	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
 
 	INP_INFO_LOCK_INIT(pcbinfo, name);
 	INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");	/* XXXRW: argument? */
 	INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
 #ifdef VIMAGE
 	pcbinfo->ipi_vnet = curvnet;
 #endif
 	pcbinfo->ipi_listhead = listhead;
 	CK_LIST_INIT(pcbinfo->ipi_listhead);
 	pcbinfo->ipi_count = 0;
 	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
 	    &pcbinfo->ipi_hashmask);
 	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_porthashmask);
 	/* The load balance group table is sized like the port hash. */
 	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_lbgrouphashmask);
 #ifdef PCBGROUP
 	in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
 #endif
 	/* inpcbzone_init is the zone's init hook, inpcb_fini its fini hook. */
 	pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
 	    NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0);
 	/* Limit the zone to maxsockets items and warn when that is hit. */
 	uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
 	uma_zone_set_warning(pcbinfo->ipi_zone,
 	    "kern.ipc.maxsockets limit reached");
 }
 
 /*
  * Destroy an inpcbinfo.
  *
  * Asserts that no inpcbs remain (ipi_count == 0), then releases the hash
  * tables, the inpcb zone and finally the three pcbinfo locks.
  */
 void
 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
 {
 
 	KASSERT(pcbinfo->ipi_count == 0,
 	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
 
 	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
 	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
 	    pcbinfo->ipi_porthashmask);
 	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
 	    pcbinfo->ipi_lbgrouphashmask);
 #ifdef PCBGROUP
 	in_pcbgroup_destroy(pcbinfo);
 #endif
 	uma_zdestroy(pcbinfo->ipi_zone);
 	/* Locks are destroyed in the reverse order of their initialization. */
 	INP_LIST_LOCK_DESTROY(pcbinfo);
 	INP_HASH_LOCK_DESTROY(pcbinfo);
 	INP_INFO_LOCK_DESTROY(pcbinfo);
 }
 
 /*
  * Allocate a PCB and associate it with the socket.
  * On success return with the PCB locked.
  *
  * Returns 0 on success, ENOBUFS when the zone allocation fails, or the
  * error from MAC or IPsec initialization (when those are compiled in); on
  * such an error the credential reference is dropped and the PCB is
  * returned to the zone.
  */
 int
 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
 {
 	struct inpcb *inp;
 	int error;
 
 	error = 0;
 	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
 	if (inp == NULL)
 		return (ENOBUFS);
 	/* Clear inp_zero_size bytes starting at inp_start_zero. */
 	bzero(&inp->inp_start_zero, inp_zero_size);
 #ifdef NUMA
 	inp->inp_numa_domain = M_NODOM;
 #endif
 	inp->inp_pcbinfo = pcbinfo;
 	inp->inp_socket = so;
 	/* Take our own reference on the socket's credential. */
 	inp->inp_cred = crhold(so->so_cred);
 	inp->inp_inc.inc_fibnum = so->so_fibnum;
 #ifdef MAC
 	error = mac_inpcb_init(inp, M_NOWAIT);
 	if (error != 0)
 		goto out;
 	mac_inpcb_create(so, inp);
 #endif
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	error = ipsec_init_pcbpolicy(inp);
 	if (error != 0) {
 #ifdef MAC
 		/* Undo the MAC initialization done above. */
 		mac_inpcb_destroy(inp);
 #endif
 		goto out;
 	}
 #endif /*IPSEC*/
 #ifdef INET6
 	if (INP_SOCKAF(so) == AF_INET6) {
 		inp->inp_vflag |= INP_IPV6PROTO;
 		if (V_ip6_v6only)
 			inp->inp_flags |= IN6P_IPV6_V6ONLY;
 	}
 #endif
 	/* The PCB lock taken here is still held when we return 0. */
 	INP_WLOCK(inp);
 	INP_LIST_WLOCK(pcbinfo);
 	CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
 	pcbinfo->ipi_count++;
 	so->so_pcb = (caddr_t)inp;
 #ifdef INET6
 	if (V_ip6_auto_flowlabel)
 		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
 #endif
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	refcount_init(&inp->inp_refcount, 1);	/* Reference from inpcbinfo */
 
 	/*
 	 * Routes in inpcb's can cache L2 as well; they are guaranteed
 	 * to be cleaned up.
 	 */
 	inp->inp_route.ro_flags = RT_LLE_CACHE;
 	INP_LIST_WUNLOCK(pcbinfo);
 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
 	/* The label only exists when something above can jump to it. */
 out:
 	if (error != 0) {
 		crfree(inp->inp_cred);
 		uma_zfree(pcbinfo->ipi_zone, inp);
 	}
 #endif
 	return (error);
 }
 
 #ifdef INET
 /*
  * Bind an inpcb to the local address/port described by nam (or to an
  * automatically chosen port when nam is NULL or carries port 0), and enter
  * it into the hash.
  *
  * Returns EINVAL if the inpcb already has a local address or port, the error
  * from in_pcbbind_setup(), EAGAIN if the hash insertion fails (the local
  * address and port are reset in that case), or 0 on success.
  */
 int
 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 	struct sockaddr_in *sin;
 	int rc, wildport;
 
 	KASSERT(nam == NULL || nam->sa_family == AF_INET,
 	    ("%s: invalid address family for %p", __func__, nam));
 	KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in),
 	    ("%s: invalid address length for %p", __func__, nam));
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	/* An inpcb that already has a local address or port cannot rebind. */
 	if (inp->inp_laddr.s_addr != INADDR_ANY || inp->inp_lport != 0)
 		return (EINVAL);
 
 	/*
 	 * Remember whether the caller asked for an automatic port before
 	 * handing nam to in_pcbbind_setup().
 	 */
 	sin = (struct sockaddr_in *)nam;
 	if (sin == NULL || sin->sin_port == 0)
 		wildport = 1;
 	else
 		wildport = 0;
 
 	rc = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
 	    &inp->inp_lport, cred);
 	if (rc)
 		return (rc);
 
 	if (in_pcbinshash(inp) == 0) {
 		if (wildport)
 			inp->inp_flags |= INP_ANONPORT;
 		return (0);
 	}
 
 	/* Hash insertion failed: undo the binding. */
 	inp->inp_laddr.s_addr = INADDR_ANY;
 	inp->inp_lport = 0;
 	return (EAGAIN);
 }
 #endif
 
 #if defined(INET) || defined(INET6)
 /*
  * Assign a local port like in_pcb_lport(), but also used with connect()
  * and a foreign address and port.  If fsa is non-NULL, choose a local port
  * that is unused with those, otherwise one that is completely unused.
  * lsa can be NULL for IPv6.
  *
  * The port range searched depends on INP_HIGHPORT / INP_LOWPORT in
  * inp_flags; INP_LOWPORT additionally requires PRIV_NETINET_RESERVEDPORT.
  * The search starts after the per-range "last port" cursor kept in the
  * pcbinfo (optionally randomized first) and walks the range once.
  *
  * Returns 0 with *lportp set to the chosen port in network byte order,
  * EADDRNOTAVAIL when every port in the range is taken, or the error from
  * priv_check_cred() for the low port range.
  */
 int
 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
     struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcb *tmpinp;
 	unsigned short *lastport;
 	int count, dorandom, error;
 	u_short aux, first, last, lport;
 #ifdef INET
 	struct in_addr laddr, faddr;
 #endif
 #ifdef INET6
 	struct in6_addr *laddr6, *faddr6;
 #endif
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	/*
 	 * Because no actual state changes occur here, a global write lock on
 	 * the pcbinfo isn't required.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if (inp->inp_flags & INP_HIGHPORT) {
 		first = V_ipport_hifirstauto;	/* sysctl */
 		last  = V_ipport_hilastauto;
 		lastport = &pcbinfo->ipi_lasthi;
 	} else if (inp->inp_flags & INP_LOWPORT) {
 		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
 		if (error)
 			return (error);
 		first = V_ipport_lowfirstauto;	/* 1023 */
 		last  = V_ipport_lowlastauto;	/* 600 */
 		lastport = &pcbinfo->ipi_lastlow;
 	} else {
 		first = V_ipport_firstauto;	/* sysctl */
 		last  = V_ipport_lastauto;
 		lastport = &pcbinfo->ipi_lastport;
 	}
 	/*
 	 * For UDP(-Lite), use random port allocation as long as the user
 	 * allows it.  For TCP (and as of yet unknown) connections,
 	 * use random port allocation only if the user allows it AND
 	 * ipport_tick() allows it.
 	 */
 	if (V_ipport_randomized &&
 		(!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
 		pcbinfo == &V_ulitecbinfo))
 		dorandom = 1;
 	else
 		dorandom = 0;
 	/*
 	 * It makes no sense to do random port allocation if
 	 * we have the only port available.
 	 */
 	if (first == last)
 		dorandom = 0;
 	/*
 	 * Make sure to not include UDP(-Lite) packets in the count.
 	 * Both comparisons must hold: with "||" the test is always true
 	 * (a pointer cannot equal two distinct objects at once) and UDP
 	 * allocations would be counted as well.
 	 */
 	if (pcbinfo != &V_udbinfo && pcbinfo != &V_ulitecbinfo)
 		V_ipport_tcpallocs++;
 	/*
 	 * Instead of having two loops further down counting up or down
 	 * make sure that first is always <= last and go with only one
 	 * code path implementing all logic.
 	 */
 	if (first > last) {
 		aux = first;
 		first = last;
 		last = aux;
 	}
 
 #ifdef INET
 	laddr.s_addr = INADDR_ANY;
 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
 		if (lsa != NULL)
 			laddr = ((struct sockaddr_in *)lsa)->sin_addr;
 		if (fsa != NULL)
 			faddr = ((struct sockaddr_in *)fsa)->sin_addr;
 	}
 #endif
 #ifdef INET6
 	laddr6 = NULL;
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		if (lsa != NULL)
 			laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
 		if (fsa != NULL)
 			faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
 	}
 #endif
 
 	tmpinp = NULL;
 	lport = *lportp;
 
 	/*
 	 * Randomize the starting cursor.  first != last is guaranteed here
 	 * because dorandom was cleared above for a single-port range, so the
 	 * modulus is never zero.
 	 */
 	if (dorandom)
 		*lastport = first + (arc4random() % (last - first));
 
 	count = last - first;
 
 	do {
 		if (count-- < 0)	/* completely used? */
 			return (EADDRNOTAVAIL);
 		/* Advance the cursor, wrapping back to the start of the range. */
 		++*lastport;
 		if (*lastport < first || *lastport > last)
 			*lastport = first;
 		lport = htons(*lastport);
 
 		if (fsa != NULL) {
 			/*
 			 * Connected lookup: only the exact 4-tuple must be
 			 * free.  NOTE(review): lsa is dereferenced here
 			 * whenever fsa is non-NULL, so callers passing fsa
 			 * must also pass lsa -- confirm against callers.
 			 */
 #ifdef INET
 			if (lsa->sa_family == AF_INET) {
 				tmpinp = in_pcblookup_hash_locked(pcbinfo,
 				    faddr, fport, laddr, lport, lookupflags,
 				    NULL, M_NODOM);
 			}
 #endif
 #ifdef INET6
 			if (lsa->sa_family == AF_INET6) {
 				tmpinp = in6_pcblookup_hash_locked(pcbinfo,
 				    faddr6, fport, laddr6, lport, lookupflags,
 				    NULL, M_NODOM);
 			}
 #endif
 		} else {
 			/* No foreign address: the local port must be unused. */
 #ifdef INET6
 			if ((inp->inp_vflag & INP_IPV6) != 0)
 				tmpinp = in6_pcblookup_local(pcbinfo,
 				    &inp->in6p_laddr, lport, lookupflags, cred);
 #endif
 #if defined(INET) && defined(INET6)
 			else
 #endif
 #ifdef INET
 				tmpinp = in_pcblookup_local(pcbinfo, laddr,
 				    lport, lookupflags, cred);
 #endif
 		}
 	} while (tmpinp != NULL);
 
 	*lportp = lport;
 
 	return (0);
 }
 
 /*
  * Select a local port (number) to use.
  *
  * Thin front end for in_pcb_lport_dest() without a foreign address: when
  * laddrp is given it is wrapped in a zeroed AF_INET sockaddr, otherwise a
  * NULL local sockaddr is passed through.
  */
 int
 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
     struct ucred *cred, int lookupflags)
 {
 	struct sockaddr_in sin;
 	struct sockaddr *lsa;
 
 	lsa = NULL;
 	if (laddrp != NULL) {
 		bzero(&sin, sizeof(sin));
 		sin.sin_family = AF_INET;
 		sin.sin_addr = *laddrp;
 		lsa = (struct sockaddr *)&sin;
 	}
 
 	return (in_pcb_lport_dest(inp, lsa, lportp, NULL, 0, cred,
 	    lookupflags));
 }
 
 /*
  * Return cached socket options.
  *
  * Translates the INP_REUSEPORT_LB, INP_REUSEPORT and INP_REUSEADDR bits of
  * inp_flags2 into the corresponding SO_* option bits.
  */
 int
 inp_so_options(const struct inpcb *inp)
 {
 	int opts;
 
 	opts = ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) ?
 	    SO_REUSEPORT_LB : 0;
 	opts |= ((inp->inp_flags2 & INP_REUSEPORT) != 0) ? SO_REUSEPORT : 0;
 	opts |= ((inp->inp_flags2 & INP_REUSEADDR) != 0) ? SO_REUSEADDR : 0;
 
 	return (opts);
 }
 #endif /* INET || INET6 */
 
 /*
  * Check if a new BINDMULTI socket is allowed to be created.
  *
  * ni points to the new inp.
  * oi points to the existing inp.
  *
  * Returns 1 when the bind may proceed and 0 when it must be refused.  If
  * ni does not have INP_BINDMULTI set there is nothing to check; otherwise
  * the owning uids must match and oi must have INP_BINDMULTI set too.
  */
 int
 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
 {
 
 	/* Nothing to enforce unless the new inp asked for BINDMULTI. */
 	if ((ni->inp_flags2 & INP_BINDMULTI) == 0)
 		return (1);
 
 	/* Both inps must be owned by the same uid. */
 	if (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid)
 		return (0);
 
 	/* The existing inp must have been bound with BINDMULTI as well. */
 	if ((oi->inp_flags2 & INP_BINDMULTI) == 0)
 		return (0);
 
 	return (1);
 }
 
 #ifdef INET
 /*
  * Set up a bind operation on a PCB, performing port allocation
  * as required, but do not actually modify the PCB. Callers can
  * either complete the bind by setting inp_laddr/inp_lport and
  * calling in_pcbinshash(), or they can just use the resulting
  * port and address to authorise the sending of a once-off packet.
  *
  * On entry *laddrp / *lportp hold the current local address and port;
  * nam, if non-NULL, is the requested AF_INET address.  On success they
  * are updated with the address and port to use and 0 is returned.
  *
  * On error, the values of *laddrp and *lportp are not changed.
  */
 int
 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
     u_short *lportp, struct ucred *cred)
 {
 	struct socket *so = inp->inp_socket;
 	struct sockaddr_in *sin;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct in_addr laddr;
 	u_short lport = 0;
 	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
 	int error;
 
 	/*
 	 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
 	 * so that we don't have to add to the (already messy) code below.
 	 */
 	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
 
 	/*
 	 * No state changes, so read locks are sufficient here.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if (CK_STAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
 		return (EADDRNOTAVAIL);
 	laddr.s_addr = *laddrp;
 	/* An explicit address cannot be given once a local address is set. */
 	if (nam != NULL && laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 	/* Without any reuse option, wildcard matches count as conflicts. */
 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
 		lookupflags = INPLOOKUP_WILDCARD;
 	if (nam == NULL) {
 		if ((error = prison_local_ip4(cred, &laddr)) != 0)
 			return (error);
 	} else {
 		sin = (struct sockaddr_in *)nam;
 		KASSERT(sin->sin_family == AF_INET,
 		    ("%s: invalid family for address %p", __func__, sin));
 		KASSERT(sin->sin_len == sizeof(*sin),
 		    ("%s: invalid length for address %p", __func__, sin));
 
 		error = prison_local_ip4(cred, &sin->sin_addr);
 		if (error)
 			return (error);
 		if (sin->sin_port != *lportp) {
 			/* Don't allow the port to change. */
 			if (*lportp != 0)
 				return (EINVAL);
 			lport = sin->sin_port;
 		}
 		/* NB: lport is left as 0 if the port isn't being changed. */
 		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
 			/*
 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
 			 * allow complete duplication of binding if
 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
 			 * and a multicast address is bound on both
 			 * new and duplicated sockets.
 			 */
 			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
 			/*
 			 * XXX: How to deal with SO_REUSEPORT_LB here?
 			 * Treat same as SO_REUSEPORT for now.
 			 */
 			if ((so->so_options &
 			    (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
 				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
 		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
 			/*
 			 * The caller's sockaddr is modified in place here:
 			 * the port (already saved in lport above when it is
 			 * being set) and sin_zero are cleared before the
 			 * address is handed to ifa_ifwithaddr_check().
 			 */
 			sin->sin_port = 0;		/* yech... */
 			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
 			/*
 			 * Is the address a local IP address?
 			 * If INP_BINDANY is set, then the socket may be bound
 			 * to any endpoint address, local or not.
 			 */
 			if ((inp->inp_flags & INP_BINDANY) == 0 &&
 			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
 				return (EADDRNOTAVAIL);
 		}
 		laddr = sin->sin_addr;
 		if (lport) {
 			struct inpcb *t;
 			struct tcptw *tw;
 
 			/* GROSS */
 			if (ntohs(lport) <= V_ipport_reservedhigh &&
 			    ntohs(lport) >= V_ipport_reservedlow &&
 			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
 				return (EACCES);
 			/*
 			 * Cross-uid conflict check, performed only for
 			 * non-multicast addresses and when the inpcb's
 			 * credential lacks PRIV_NETINET_REUSEPORT.
 			 */
 			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
 			    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
 				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 				    lport, INPLOOKUP_WILDCARD, cred);
 	/*
 	 * XXX
 	 * This entire block sorely needs a rewrite.
 	 */
 				if (t &&
 				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
 				    (so->so_type != SOCK_STREAM ||
 				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
 				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
 				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
 				     (t->inp_flags2 & INP_REUSEPORT) ||
 				     (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
 				    (inp->inp_cred->cr_uid !=
 				     t->inp_cred->cr_uid))
 					return (EADDRINUSE);
 
 				/*
 				 * If the socket is a BINDMULTI socket, then
 				 * the credentials need to match and the
 				 * original socket also has to have been bound
 				 * with BINDMULTI.
 				 */
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 			/*
 			 * Second lookup, this time honouring the reuse
 			 * options via lookupflags computed above.
 			 */
 			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 			    lport, lookupflags, cred);
 			if (t && (t->inp_flags & INP_TIMEWAIT)) {
 				/*
 				 * XXXRW: If an inpcb has had its timewait
 				 * state recycled, we treat the address as
 				 * being in use (for now).  This is better
 				 * than a panic, but not desirable.
 				 */
 				tw = intotw(t);
 				if (tw == NULL ||
 				    ((reuseport & tw->tw_so_options) == 0 &&
 					(reuseport_lb &
 				            tw->tw_so_options) == 0)) {
 					return (EADDRINUSE);
 				}
 			} else if (t &&
 				   ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 				   (reuseport & inp_so_options(t)) == 0 &&
 				   (reuseport_lb & inp_so_options(t)) == 0) {
 				/*
 				 * With INET6 compiled in, the return below is
 				 * guarded by the condition; without INET6 it
 				 * is unconditional.
 				 */
 #ifdef INET6
 				if (ntohl(sin->sin_addr.s_addr) !=
 				    INADDR_ANY ||
 				    ntohl(t->inp_laddr.s_addr) !=
 				    INADDR_ANY ||
 				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
 				    (t->inp_vflag & INP_IPV6PROTO) == 0)
 #endif
 						return (EADDRINUSE);
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 		}
 	}
 	/* Keep an already assigned port; otherwise pick one automatically. */
 	if (*lportp != 0)
 		lport = *lportp;
 	if (lport == 0) {
 		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
 		if (error != 0)
 			return (error);
 	}
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	return (0);
 }
 
 /*
  * Connect from a socket to a specified address.
  * Both address and port must be specified in argument sin.
  * If don't have a local address for this socket yet,
  * then pick one.
  *
  * m is handed through to in_pcbrehash_mbuf() / in_pcbinshash_mbuf();
  * rehash selects which of the two is used once the 4-tuple is committed.
  *
  * Returns the error from in_pcbconnect_setup(), EAGAIN if the initial
  * hash insertion fails, or 0 on success.
  */
 int
 in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
     struct ucred *cred, struct mbuf *m, bool rehash)
 {
 	u_short lport, fport;
 	in_addr_t laddr, faddr;
 	int anonport, error;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	/* Work on copies; the inpcb is only updated after setup succeeds. */
 	lport = inp->inp_lport;
 	laddr = inp->inp_laddr.s_addr;
 	anonport = (lport == 0);
 	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
 	    NULL, cred);
 	if (error)
 		return (error);
 
 	/* Do the initial binding of the local address if required. */
 	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
 		KASSERT(rehash == true,
 		    ("Rehashing required for unbound inps"));
 		inp->inp_lport = lport;
 		inp->inp_laddr.s_addr = laddr;
 		if (in_pcbinshash(inp) != 0) {
 			/* Roll back the tentative binding. */
 			inp->inp_laddr.s_addr = INADDR_ANY;
 			inp->inp_lport = 0;
 			return (EAGAIN);
 		}
 	}
 
 	/* Commit the remaining changes. */
 	inp->inp_lport = lport;
 	inp->inp_laddr.s_addr = laddr;
 	inp->inp_faddr.s_addr = faddr;
 	inp->inp_fport = fport;
 	if (rehash) {
 		in_pcbrehash_mbuf(inp, m);
 	} else {
 		in_pcbinshash_mbuf(inp, m);
 	}
 
 	/* The local port was unset on entry, so it was chosen for us. */
 	if (anonport)
 		inp->inp_flags |= INP_ANONPORT;
 	return (0);
 }
 
 /*
  * Convenience wrapper around in_pcbconnect_mbuf() with no mbuf and
  * rehashing enabled.
  */
 int
 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 
 	return (in_pcbconnect_mbuf(inp, nam, cred, NULL, true));
 }
 
 /*
  * Do proper source address selection on an unbound socket in case
  * of connect. Take jails into account as well.
  *
  * faddr is the destination; the chosen source address is stored in *laddr.
  * cred may be NULL, in which case no jail restrictions are applied.
  *
  * Returns 0 on success, ENETUNREACH when no usable interface address can
  * be found on the non-jailed paths, or the result of prison_get_ip4() when
  * falling back to the default jail address.
  */
 int
 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
     struct ucred *cred)
 {
 	struct ifaddr *ifa;
 	struct sockaddr *sa;
 	struct sockaddr_in *sin, dst;
 	struct nhop_object *nh;
 	int error;
 
 	NET_EPOCH_ASSERT();
 	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
 	/*
 	 * Bypass source address selection and use the primary jail IP
 	 * if requested.
 	 */
 	if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
 		return (0);
 
 	error = 0;
 
 	/* Build a sockaddr_in for the destination; sin aliases dst for now. */
 	nh = NULL;
 	bzero(&dst, sizeof(dst));
 	sin = &dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(struct sockaddr_in);
 	sin->sin_addr.s_addr = faddr->s_addr;
 
 	/*
 	 * If route is known our src addr is taken from the i/f,
 	 * else punt.
 	 *
 	 * Find out route to destination.
 	 */
 	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
 		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
 		    0, NHR_NONE, 0);
 
 	/*
 	 * If we found a route, use the address corresponding to
 	 * the outgoing interface.
 	 *
 	 * Otherwise assume faddr is reachable on a directly connected
 	 * network and try to find a corresponding interface to take
 	 * the source address from.
 	 */
 	if (nh == NULL || nh->nh_ifp == NULL) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
 		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
 					inp->inp_socket->so_fibnum));
 		if (ia == NULL) {
 			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
 						inp->inp_socket->so_fibnum));
 		}
 		if (ia == NULL) {
 			error = ENETUNREACH;
 			goto done;
 		}
 
 		/* Not jailed (or no IPv4 restriction): use what we found. */
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed: look for an address on that interface the jail owns. */
 		ifp = ia->ia_ifp;
 		ia = NULL;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * If the outgoing interface on the route found is not
 	 * a loopback interface, use the address from that interface.
 	 * In case of jails do those three steps:
 	 * 1. check if the interface address belongs to the jail. If so use it.
 	 * 2. check if we have any address on the outgoing interface
 	 *    belonging to this jail. If so use it.
 	 * 3. as a last resort return the 'default' jail address.
 	 */
 	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
 		/* If not jailed, use the default returned. */
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			ia = (struct in_ifaddr *)nh->nh_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed. */
 		/* 1. Check if the iface address belongs to the jail. */
 		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
 		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 			ia = (struct in_ifaddr *)nh->nh_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/*
 		 * 2. Check if we have any address on the outgoing interface
 		 *    belonging to this jail.
 		 */
 		ia = NULL;
 		ifp = nh->nh_ifp;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * The outgoing interface is marked with 'loopback net', so a route
 	 * to ourselves is here.
 	 * Try to find the interface of the destination address and then
 	 * take the address from there. That interface is not necessarily
 	 * a loopback interface.
 	 * In case of jails, check that it is an address of the jail
 	 * and if we cannot find, fall back to the 'default' jail address.
 	 */
 	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
 		struct in_ifaddr *ia;
 
 		/*
 		 * dst is used directly here because sin may have been
 		 * repointed at an interface address by the code above.
 		 */
 		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
 					inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
 						inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));
 
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			if (ia == NULL) {
 				error = ENETUNREACH;
 				goto done;
 			}
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed. */
 		if (ia != NULL) {
 			struct ifnet *ifp;
 
 			ifp = ia->ia_ifp;
 			ia = NULL;
 			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 				sa = ifa->ifa_addr;
 				if (sa->sa_family != AF_INET)
 					continue;
 				sin = (struct sockaddr_in *)sa;
 				if (prison_check_ip4(cred,
 				    &sin->sin_addr) == 0) {
 					ia = (struct in_ifaddr *)ifa;
 					break;
 				}
 			}
 			if (ia != NULL) {
 				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 				goto done;
 			}
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 done:
 	return (error);
 }
 
 /*
  * Set up for a connect from a socket to the specified address.
  * On entry, *laddrp and *lportp should contain the current local
  * address and port for the PCB; these are updated to the values
  * that should be placed in inp_laddr and inp_lport to complete
  * the connect.
  *
  * On success, *faddrp and *fportp will be set to the remote address
  * and port. These are not updated in the error case.
  *
  * If the operation fails because the connection already exists,
  * *oinpp will be set to the PCB of that connection so that the
  * caller can decide to override it. In all other cases, *oinpp
  * is set to NULL.
  *
  * oinpp itself may be NULL, in which case nothing is reported through it.
  * Returns 0 on success, EADDRNOTAVAIL for a zero destination port,
  * EADDRINUSE when the 4-tuple is already in use, or an error from source
  * address selection or local port allocation.
  */
 int
 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
     struct inpcb **oinpp, struct ucred *cred)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 	struct in_ifaddr *ia;
 	struct inpcb *oinp;
 	struct in_addr laddr, faddr;
 	u_short lport, fport;
 	int error;
 
 	KASSERT(sin->sin_family == AF_INET,
 	    ("%s: invalid address family for %p", __func__, sin));
 	KASSERT(sin->sin_len == sizeof(*sin),
 	    ("%s: invalid address length for %p", __func__, sin));
 
 	/*
 	 * Because a global state change doesn't actually occur here, a read
 	 * lock is sufficient.
 	 */
 	NET_EPOCH_ASSERT();
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (oinpp != NULL)
 		*oinpp = NULL;
 	if (sin->sin_port == 0)
 		return (EADDRNOTAVAIL);
 	/* Work on local copies; outputs are written only on success. */
 	laddr.s_addr = *laddrp;
 	lport = *lportp;
 	faddr = sin->sin_addr;
 	fport = sin->sin_port;
 #ifdef ROUTE_MPATH
 	if (CALC_FLOWID_OUTBOUND) {
 		uint32_t hash_val, hash_type;
 
 		hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
 		    inp->inp_socket->so_proto->pr_protocol, &hash_type);
 
 		inp->inp_flowid = hash_val;
 		inp->inp_flowtype = hash_type;
 	}
 #endif
 	if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
 		/*
 		 * If the destination address is INADDR_ANY,
 		 * use the primary local address.
 		 * If the supplied address is INADDR_BROADCAST,
 		 * and the primary interface supports broadcast,
 		 * choose the broadcast address for that interface.
 		 */
 		if (faddr.s_addr == INADDR_ANY) {
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			faddr =
 			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			if (cred != NULL &&
 			    (error = prison_get_ip4(cred, &faddr)) != 0)
 				return (error);
 		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
 			    IFF_BROADCAST)
 				faddr = satosin(&CK_STAILQ_FIRST(
 				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 		}
 	}
 	if (laddr.s_addr == INADDR_ANY) {
 		/*
 		 * No local address yet: run source address selection.  The
 		 * error is checked only after the multicast override below,
 		 * which may replace both laddr and error.
 		 */
 		error = in_pcbladdr(inp, &faddr, &laddr, cred);
 		/*
 		 * If the destination address is multicast and an outgoing
 		 * interface has been set as a multicast option, prefer the
 		 * address of that interface as our source address.
 		 */
 		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
 		    inp->inp_moptions != NULL) {
 			struct ip_moptions *imo;
 			struct ifnet *ifp;
 
 			imo = inp->inp_moptions;
 			if (imo->imo_multicast_ifp != NULL) {
 				ifp = imo->imo_multicast_ifp;
 				IN_IFADDR_RLOCK(&in_ifa_tracker);
 				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 					if ((ia->ia_ifp == ifp) &&
 					    (cred == NULL ||
 					    prison_check_ip4(cred,
 					    &ia->ia_addr.sin_addr) == 0))
 						break;
 				}
 				if (ia == NULL)
 					error = EADDRNOTAVAIL;
 				else {
 					laddr = ia->ia_addr.sin_addr;
 					error = 0;
 				}
 				IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			}
 		}
 		if (error)
 			return (error);
 	}
 
 	if (lport != 0) {
 		/* Local port already fixed: the full 4-tuple must be unused. */
 		oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
 		    fport, laddr, lport, 0, NULL, M_NODOM);
 		if (oinp != NULL) {
 			if (oinpp != NULL)
 				*oinpp = oinp;
 			return (EADDRINUSE);
 		}
 	} else {
 		/* No local port yet: allocate one that is free for this peer. */
 		struct sockaddr_in lsin, fsin;
 
 		bzero(&lsin, sizeof(lsin));
 		bzero(&fsin, sizeof(fsin));
 		lsin.sin_family = AF_INET;
 		lsin.sin_addr = laddr;
 		fsin.sin_family = AF_INET;
 		fsin.sin_addr = faddr;
 		error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
 		    &lport, (struct sockaddr *)& fsin, fport, cred,
 		    INPLOOKUP_WILDCARD);
 		if (error)
 			return (error);
 	}
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	*faddrp = faddr.s_addr;
 	*fportp = fport;
 	return (0);
 }
 
 /*
  * Clear the foreign address and port of an inpcb and rehash it.
  * The inpcb write lock and the pcbinfo hash write lock are asserted.
  */
 void
 in_pcbdisconnect(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	inp->inp_faddr.s_addr = INADDR_ANY;
 	inp->inp_fport = 0;
 	/* The foreign half of the tuple changed, so the hash entry must too. */
 	in_pcbrehash(inp);
 }
 #endif /* INET */
 
 /*
  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
  * For most protocols, this will be invoked immediately prior to calling
  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
  * socket, in which case in_pcbfree() is deferred.
  */
 void
 in_pcbdetach(struct inpcb *inp)
 {
 
 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
 
 #ifdef RATELIMIT
 	if (inp->inp_snd_tag != NULL)
 		in_pcbdetach_txrtlmt(inp);
 #endif
 	inp->inp_socket->so_pcb = NULL;
 	inp->inp_socket = NULL;
 }
 
 /*
  * in_pcbref() bumps the reference count on an inpcb in order to maintain
  * stability of an inpcb pointer despite the inpcb lock being released.  This
  * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
  * but where the inpcb lock may already be held, or when acquiring a
  * reference via a pcbgroup.
  *
  * in_pcbref() should be used only to provide brief memory stability, and
  * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
  * garbage collect the inpcb if it has been in_pcbfree()'d from another
  * context.  Until in_pcbrele() has returned that the inpcb is still valid,
  * lock and rele are the *only* safe operations that may be performed on the
  * inpcb.
  *
  * While the inpcb will not be freed, releasing the inpcb lock means that the
  * connection's state may change, so the caller should be careful to
  * revalidate any cached state on reacquiring the lock.  Drop the reference
  * using in_pcbrele().
  */
 void
 in_pcbref(struct inpcb *inp)
 {
 
 	/* The caller must already own a reference. */
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	refcount_acquire(&inp->inp_refcount);
 }
 
 /*
  * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
  * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
  * return a flag indicating whether or not the inpcb remains valid.  If it is
  * valid, we return with the inpcb lock held.
  *
  * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
  * reference on an inpcb.  Historically more work was done here (actually, in
  * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
  * need for the pcbinfo lock in in_pcbrele().  Deferring the free is entirely
  * about memory stability (and continued use of the write lock).
  *
  * This variant expects the inpcb read lock.  It returns 0 when the inpcb
  * is still valid (lock still held) and 1 when the inpcb has been freed or
  * marked INP_FREED (lock released).
  */
 int
 in_pcbrele_rlocked(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	INP_RLOCK_ASSERT(inp);
 
 	/* Taken when this was not the last reference. */
 	if (refcount_release(&inp->inp_refcount) == 0) {
 		/*
 		 * If the inpcb has been freed, let the caller know, even if
 		 * this isn't the last reference.
 		 */
 		if (inp->inp_flags2 & INP_FREED) {
 			INP_RUNLOCK(inp);
 			return (1);
 		}
 		return (0);
 	}
 
 	/* Last reference: the inpcb must already be detached from its socket. */
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 #ifdef TCPHPTS
 	if (inp->inp_in_hpts || inp->inp_in_input) {
 		struct tcp_hpts_entry *hpts;
 		/*
 		 * We should not be on the hpts at
 		 * this point in any form. we must
 		 * get the lock to be sure.
 		 */
 		hpts = tcp_hpts_lock(inp);
 		if (inp->inp_in_hpts)
 			panic("Hpts:%p inp:%p at free still on hpts",
 			      hpts, inp);
 		mtx_unlock(&hpts->p_mtx);
 		hpts = tcp_input_lock(inp);
 		if (inp->inp_in_input)
 			panic("Hpts:%p inp:%p at free still on input hpts",
 			      hpts, inp);
 		mtx_unlock(&hpts->p_mtx);
 	}
 #endif
 	/* Unlock, then hand the memory back to the owning pcbinfo's zone. */
 	INP_RUNLOCK(inp);
 	pcbinfo = inp->inp_pcbinfo;
 	uma_zfree(pcbinfo->ipi_zone, inp);
 	return (1);
 }
 
 /*
  * Write-locked counterpart of in_pcbrele_rlocked(): drop one reference on
  * an inpcb whose write lock is held.  Returns 0 with the lock still held
  * when references remain and the inpcb is not marked INP_FREED; otherwise
  * the lock is released and 1 is returned, with the inpcb memory freed if
  * this was the last reference.
  */
 int
 in_pcbrele_wlocked(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Taken when this was not the last reference. */
 	if (refcount_release(&inp->inp_refcount) == 0) {
 		/*
 		 * If the inpcb has been freed, let the caller know, even if
 		 * this isn't the last reference.
 		 */
 		if (inp->inp_flags2 & INP_FREED) {
 			INP_WUNLOCK(inp);
 			return (1);
 		}
 		return (0);
 	}
 
 	/* Last reference: the inpcb must already be detached from its socket. */
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 #ifdef TCPHPTS
 	if (inp->inp_in_hpts || inp->inp_in_input) {
 		struct tcp_hpts_entry *hpts;
 		/*
 		 * We should not be on the hpts at
 		 * this point in any form. we must
 		 * get the lock to be sure.
 		 */
 		hpts = tcp_hpts_lock(inp);
 		if (inp->inp_in_hpts)
 			panic("Hpts:%p inp:%p at free still on hpts",
 			      hpts, inp);
 		mtx_unlock(&hpts->p_mtx);
 		hpts = tcp_input_lock(inp);
 		if (inp->inp_in_input)
 			panic("Hpts:%p inp:%p at free still on input hpts",
 			      hpts, inp);
 		mtx_unlock(&hpts->p_mtx);
 	}
 #endif
 	/* Unlock, then hand the memory back to the owning pcbinfo's zone. */
 	INP_WUNLOCK(inp);
 	pcbinfo = inp->inp_pcbinfo;
 	uma_zfree(pcbinfo->ipi_zone, inp);
 	return (1);
 }
 
 /*
  * Temporary wrapper.
  *
  * Forwards to in_pcbrele_wlocked(), so the inpcb write lock must be held
  * and the return value has the same meaning.
  */
 int
 in_pcbrele(struct inpcb *inp)
 {
 
 	return (in_pcbrele_wlocked(inp));
 }
 
 /*
  * Epoch callback: drop the reference held on every inpcb recorded in an
  * in_pcblist, then free the list itself.  Each inpcb is read-locked for
  * the release; the whole walk runs under the pcbinfo info write lock.
  */
 void
 in_pcblist_rele_rlocked(epoch_context_t ctx)
 {
 	struct in_pcblist *batch;
 	struct inpcbinfo *info;
 	struct inpcb *cur;
 	int idx, total;
 
 	batch = __containerof(ctx, struct in_pcblist, il_epoch_ctx);
 	info = batch->il_pcbinfo;
 	total = batch->il_count;
 
 	INP_INFO_WLOCK(info);
 	idx = 0;
 	while (idx < total) {
 		cur = batch->il_inp_list[idx];
 		idx++;
 		INP_RLOCK(cur);
 		/*
 		 * A zero return means the inpcb is still valid and still
 		 * read-locked, so the lock has to be dropped here.
 		 */
 		if (in_pcbrele_rlocked(cur) == 0)
 			INP_RUNLOCK(cur);
 	}
 	INP_INFO_WUNLOCK(info);
 
 	free(batch, M_TEMP);
 }
 
 /*
  * Epoch callback: free the inpcbport that embeds the given epoch context.
  */
 static void
 inpcbport_free(epoch_context_t ctx)
 {
 
 	free(__containerof(ctx, struct inpcbport, phd_epoch_ctx), M_PCB);
 }
 
 /*
  * Epoch callback scheduled by in_pcbfree(): release the resources hanging
  * off an inpcb (multicast options, IPsec policy, IPv6 options, IP options,
  * credential, MAC label) and drop the final reference.  The multicast
  * option pointers are detached from the inpcb first and freed only after
  * the inpcb itself has been released.
  */
 static void
 in_pcbfree_deferred(epoch_context_t ctx)
 {
 	struct inpcb *inp;
 	int released __unused;
 
 	inp = __containerof(ctx, struct inpcb, inp_epoch_ctx);
 
 	INP_WLOCK(inp);
 	CURVNET_SET(inp->inp_vnet);
 #ifdef INET
 	/* Stash the IPv4 multicast options; freed at the end. */
 	struct ip_moptions *imo = inp->inp_moptions;
 	inp->inp_moptions = NULL;
 #endif
 	/* XXXRW: Do as much as possible here. */
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	if (inp->inp_sp != NULL)
 		ipsec_delete_pcbpolicy(inp);
 #endif
 #ifdef INET6
 	/* Stash the IPv6 multicast options as well, if this is a v6 pcb. */
 	struct ip6_moptions *im6o = NULL;
 	if (inp->inp_vflag & INP_IPV6PROTO) {
 		ip6_freepcbopts(inp->in6p_outputopts);
 		im6o = inp->in6p_moptions;
 		inp->in6p_moptions = NULL;
 	}
 #endif
 	if (inp->inp_options)
 		(void)m_free(inp->inp_options);
 	inp->inp_vflag = 0;
 	crfree(inp->inp_cred);
 #ifdef MAC
 	mac_inpcb_destroy(inp);
 #endif
 	/*
 	 * This is asserted to be the last reference, so the call unlocks
 	 * and frees the inpcb; inp must not be touched afterwards.
 	 */
 	released = in_pcbrele_wlocked(inp);
 	MPASS(released);
 #ifdef INET6
 	ip6_freemoptions(im6o);
 #endif
 #ifdef INET
 	inp_freemoptions(imo);
 #endif
 	CURVNET_RESTORE();
 }
 
 /*
  * Unconditionally schedule an inpcb to be freed by decrementing its
  * reference count, which should occur only after the inpcb has been detached
  * from its socket.  If another thread holds a temporary reference (acquired
  * using in_pcbref()) then the free is deferred until that reference is
  * released using in_pcbrele(), but the inpcb is still unlocked.  Almost all
  * work, including removal from global lists, is done in this context, where
  * the pcbinfo lock is held.
  *
  * The inpcb write lock must be held on entry and is released on return.
  * The remaining teardown runs later in in_pcbfree_deferred() via
  * NET_EPOCH_CALL().
  */
 void
 in_pcbfree(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 	KASSERT((inp->inp_flags2 & INP_FREED) == 0,
 	    ("%s: called twice for pcb %p", __func__, inp));
 	/*
 	 * A second call on an already-freed inpcb is tolerated when the
 	 * KASSERT above is compiled out: just unlock and return.
 	 */
 	if (inp->inp_flags2 & INP_FREED) {
 		INP_WUNLOCK(inp);
 		return;
 	}
 
 	INP_WLOCK_ASSERT(inp);
 	INP_LIST_WLOCK(pcbinfo);
 	in_pcbremlists(inp);
 	INP_LIST_WUNLOCK(pcbinfo);
 	RO_INVALIDATE_CACHE(&inp->inp_route);
 	/* mark as destruction in progress */
 	inp->inp_flags2 |= INP_FREED;
 	INP_WUNLOCK(inp);
 	NET_EPOCH_CALL(in_pcbfree_deferred, &inp->inp_epoch_ctx);
 }
 
 /*
  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
  * port reservation, and preventing it from being returned by inpcb lookups.
  *
  * It is used by TCP to mark an inpcb as unused and avoid future packet
  * delivery or event notification when a socket remains open but TCP has
  * closed.  This might occur as a result of a shutdown()-initiated TCP close
  * or a RST on the wire, and allows the port binding to be reused while still
  * maintaining the invariant that so_pcb always points to a valid inpcb until
  * in_pcbdetach().
  *
  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
  * in_pcbnotifyall() and in_pcbpurgeif0()?
  */
 void
 in_pcbdrop(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 #ifdef INVARIANTS
 	/* A pcb with both a socket and protocol state must hold extra refs. */
 	if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
 		MPASS(inp->inp_refcount > 1);
 #endif
 
 	/*
 	 * XXXRW: Possibly we should protect the setting of INP_DROPPED with
 	 * the hash lock...?
 	 */
 	inp->inp_flags |= INP_DROPPED;
 	if (inp->inp_flags & INP_INHASHLIST) {
 		struct inpcbport *phd = inp->inp_phd;
 
 		/* Unhash under the hash write lock. */
 		INP_HASH_WLOCK(inp->inp_pcbinfo);
 		in_pcbremlbgrouphash(inp);
 		CK_LIST_REMOVE(inp, inp_hash);
 		CK_LIST_REMOVE(inp, inp_portlist);
 		/* Last pcb on this port: retire the port head via the epoch. */
 		if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
 			CK_LIST_REMOVE(phd, phd_hash);
 			NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx);
 		}
 		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
 		inp->inp_flags &= ~INP_INHASHLIST;
 #ifdef PCBGROUP
 		in_pcbgroup_remove(inp);
 #endif
 	}
 }
 
 #ifdef INET
 /*
  * Common routines to return the socket addresses associated with inpcbs.
  */
 /*
  * Allocate (sleeping if necessary) a zeroed sockaddr_in describing the given
  * port and address.  The result is allocated from M_SONAME.
  */
 struct sockaddr *
 in_sockaddr(in_port_t port, struct in_addr *addr_p)
 {
 	struct sockaddr_in *sa4;
 
 	sa4 = malloc(sizeof(*sa4), M_SONAME, M_WAITOK | M_ZERO);
 
 	sa4->sin_len = sizeof(*sa4);
 	sa4->sin_family = AF_INET;
 	sa4->sin_port = port;
 	sa4->sin_addr = *addr_p;
 
 	return ((struct sockaddr *)sa4);
 }
 
 int
 in_getsockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_lport;
 	addr = inp->inp_laddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 int
 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_fport;
 	addr = inp->inp_faddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 /*
  * Invoke notify(inp, errno) on every IPv4 inpcb in pcbinfo whose foreign
  * address equals faddr and which still has a socket.  The walk runs with the
  * pcbinfo write lock held and each inpcb write-locked for the callback.
  */
 void
 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
     struct inpcb *(*notify)(struct inpcb *, int))
 {
 	struct inpcb *inp, *inp_temp;
 
 	INP_INFO_WLOCK(pcbinfo);
 	CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
 		INP_WLOCK(inp);
 #ifdef INET6
 		/* Skip pcbs that are not IPv4-capable. */
 		if ((inp->inp_vflag & INP_IPV4) == 0) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 #endif
 		if (inp->inp_faddr.s_addr != faddr.s_addr ||
 		    inp->inp_socket == NULL) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 		/*
 		 * Only unlock when the callback hands the inpcb back.
 		 * NOTE(review): presumably a NULL return means the callback
 		 * already released the lock -- confirm against the callbacks.
 		 */
 		if ((*notify)(inp, errno))
 			INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 /*
  * For every IPv4 inpcb in pcbinfo that has multicast options, stop using
  * ifp as the outgoing multicast interface and leave every group that was
  * joined through ifp.
  */
 void
 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
 {
 	struct inpcb *inp;
 	struct in_multi *inm;
 	struct in_mfilter *imf;
 	struct ip_moptions *imo;
 
 	INP_INFO_WLOCK(pcbinfo);
 	CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
 		INP_WLOCK(inp);
 		imo = inp->inp_moptions;
 		if ((inp->inp_vflag & INP_IPV4) &&
 		    imo != NULL) {
 			/*
 			 * Unselect the outgoing interface if it is being
 			 * detached.
 			 */
 			if (imo->imo_multicast_ifp == ifp)
 				imo->imo_multicast_ifp = NULL;
 
 			/*
 			 * Drop multicast group membership if we joined
 			 * through the interface being detached.
 			 *
 			 * XXX This can all be deferred to an epoch_call
 			 */
 restart:
 			/*
 			 * The filter list is modified inside the loop, so the
 			 * scan starts over after each removal.
 			 */
 			IP_MFILTER_FOREACH(imf, &imo->imo_head) {
 				if ((inm = imf->imf_inm) == NULL)
 					continue;
 				if (inm->inm_ifp != ifp)
 					continue;
 				ip_mfilter_remove(&imo->imo_head, imf);
 				IN_MULTI_LOCK_ASSERT();
 				in_leavegroup_locked(inm, NULL);
 				ip_mfilter_free(imf);
 				goto restart;
 			}
 		}
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 /*
  * Lookup a PCB based on the local address and port.  Caller must hold the
  * hash lock.  No inpcb locks or references are acquired.
  */
 /* Extra wildcard cost charged to a pcb that also carries INP_IPV6. */
 #define INP_LOOKUP_MAPPED_PCB_COST	3
 struct inpcb *
 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
     u_short lport, int lookupflags, struct ucred *cred)
 {
 	struct inpcb *inp;
 	/*
 	 * matchwild starts one above the largest cost a candidate can
 	 * accumulate, so the first acceptable candidate always wins.
 	 */
 #ifdef INET6
 	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
 #else
 	int matchwild = 3;
 #endif
 	int wildcard;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
 		struct inpcbhead *head;
 		/*
 		 * Look for an unconnected (wildcard foreign addr) PCB that
 		 * matches the local address and port we're looking for.
 		 */
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
 			    inp->inp_laddr.s_addr == laddr.s_addr &&
 			    inp->inp_lport == lport) {
 				/*
 				 * Found?
 				 */
 				/* Without a cred any match is accepted. */
 				if (cred == NULL ||
 				    prison_equal_ip4(cred->cr_prison,
 					inp->inp_cred->cr_prison))
 					return (inp);
 			}
 		}
 		/*
 		 * Not found.
 		 */
 		return (NULL);
 	} else {
 		struct inpcbporthead *porthash;
 		struct inpcbport *phd;
 		struct inpcb *match = NULL;
 		/*
 		 * Best fit PCB lookup.
 		 *
 		 * First see if this local port is in use by looking on the
 		 * port hash list.
 		 */
 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
 		    pcbinfo->ipi_porthashmask)];
 		CK_LIST_FOREACH(phd, porthash, phd_hash) {
 			if (phd->phd_port == lport)
 				break;
 		}
 		if (phd != NULL) {
 			/*
 			 * Port is in use by one or more PCBs. Look for best
 			 * fit.
 			 */
 			CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
 				wildcard = 0;
 				if (cred != NULL &&
 				    !prison_equal_ip4(inp->inp_cred->cr_prison,
 					cred->cr_prison))
 					continue;
 #ifdef INET6
 				/* XXX inp locking */
 				if ((inp->inp_vflag & INP_IPV4) == 0)
 					continue;
 				/*
 				 * We never select the PCB that has
 				 * INP_IPV6 flag and is bound to :: if
 				 * we have another PCB which is bound
 				 * to 0.0.0.0.  If a PCB has the
 				 * INP_IPV6 flag, then we set its cost
 				 * higher than IPv4 only PCBs.
 				 *
 				 * Note that the case only happens
 				 * when a socket is bound to ::, under
 				 * the condition that the use of the
 				 * mapped address is allowed.
 				 */
 				if ((inp->inp_vflag & INP_IPV6) != 0)
 					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
 #endif
 				/* A connected pcb costs one point. */
 				if (inp->inp_faddr.s_addr != INADDR_ANY)
 					wildcard++;
 				/*
 				 * One point when exactly one of the two local
 				 * addresses is a wildcard; two different
 				 * specific addresses never match.
 				 */
 				if (inp->inp_laddr.s_addr != INADDR_ANY) {
 					if (laddr.s_addr == INADDR_ANY)
 						wildcard++;
 					else if (inp->inp_laddr.s_addr != laddr.s_addr)
 						continue;
 				} else {
 					if (laddr.s_addr != INADDR_ANY)
 						wildcard++;
 				}
 				/* Keep the cheapest; cost 0 cannot be beaten. */
 				if (wildcard < matchwild) {
 					match = inp;
 					matchwild = wildcard;
 					if (matchwild == 0)
 						break;
 				}
 			}
 		}
 		return (match);
 	}
 }
 #undef INP_LOOKUP_MAPPED_PCB_COST
 
 /*
  * Pick an inpcb from the load-balance groups registered for lport.  Within
  * a matching group the member is selected by hashing (faddr, lport, fport)
  * modulo the group's member count.  Returns NULL if no group matches.
  */
 static struct inpcb *
 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
     const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
     uint16_t fport, int lookupflags, int numa_domain)
 {
 	struct inpcb *local_wild, *numa_wild;
 	const struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	uint32_t idx;
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	hdr = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
 
 	/*
 	 * Order of socket selection:
 	 * 1. non-wild.
 	 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
 	 *
 	 * NOTE:
 	 * - Load balanced group does not contain jailed sockets
 	 * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
 	 */
 	local_wild = NULL;
 	numa_wild = NULL;
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 #ifdef INET6
 		if (!(grp->il_vflag & INP_IPV4))
 			continue;
 #endif
 		if (grp->il_lport != lport)
 			continue;
 
 		/* NOTE(review): relies on il_inpcnt being non-zero -- confirm. */
 		idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) %
 		    grp->il_inpcnt;
 		if (grp->il_laddr.s_addr == laddr->s_addr) {
 			/*
 			 * Exact local address: return at once when the NUMA
 			 * domain is acceptable, otherwise keep as a fallback.
 			 */
 			if (numa_domain == M_NODOM ||
 			    grp->il_numa_domain == numa_domain) {
 				return (grp->il_inp[idx]);
 			} else {
 				numa_wild = grp->il_inp[idx];
 			}
 		}
 		/*
 		 * Wildcard-bound group: record it if nothing is recorded yet,
 		 * or replace the recorded one when the domain is acceptable.
 		 */
 		if (grp->il_laddr.s_addr == INADDR_ANY &&
 		    (lookupflags & INPLOOKUP_WILDCARD) != 0 &&
 		    (local_wild == NULL || numa_domain == M_NODOM ||
 			grp->il_numa_domain == numa_domain)) {
 			local_wild = grp->il_inp[idx];
 		}
 	}
 	/* An exact-address group in another domain beats a wildcard group. */
 	if (numa_wild != NULL)
 		return (numa_wild);
 
 	return (local_wild);
 }
 
 #ifdef PCBGROUP
 /*
  * Lookup PCB in hash list, using pcbgroup tables.
  *
  * Searches the given pcbgroup for an exact 4-tuple match first and, when
  * INPLOOKUP_WILDCARD is set, falls back to wildcard candidates (the
  * group-local table under RSS, then the pcbinfo-wide wildcard table).  The
  * group lock is taken for the search and is dropped on every return path.
  *
  * Returns the matching inpcb locked as requested by INPLOOKUP_WLOCKPCB or
  * INPLOOKUP_RLOCKPCB, or NULL if nothing matched or the match is being
  * freed.  Panics if neither lock flag is set.
  */
 static struct inpcb *
 in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
     struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
     u_int lport_arg, int lookupflags, struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 	bool locked;
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	INP_GROUP_LOCK(pcbgroup);
 	head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
 	    pcbgroup->ipg_hashmask)];
 	CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_laddr.s_addr == laddr.s_addr &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP4))
 				goto found;
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL) {
 		inp = tmpinp;
 		goto found;
 	}
 
 #ifdef	RSS
 	/*
 	 * For incoming connections, we may wish to do a wildcard
 	 * match for an RSS-local socket.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		struct inpcbhead *head;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 
 		head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
 		    lport, 0, pcbgroup->ipg_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 #ifdef INET6
 		if (inp == NULL)
 			inp = local_wild_mapped;
 #endif
 		if (inp != NULL)
 			goto found;
 	}
 #endif
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		struct inpcbhead *head;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 		head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_wildmask)];
 		CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 #ifdef INET6
 		if (inp == NULL)
 			inp = local_wild_mapped;
 #endif
 		if (inp != NULL)
 			goto found;
 	} /* if (lookupflags & INPLOOKUP_WILDCARD) */
 	INP_GROUP_UNLOCK(pcbgroup);
 	return (NULL);
 
 found:
 	/*
 	 * Try to take the inpcb lock without blocking while the group lock is
 	 * held.  If that fails, pin the inpcb with a reference, drop the group
 	 * lock, and only then block on the inpcb lock.
 	 */
 	if (lookupflags & INPLOOKUP_WLOCKPCB)
 		locked = INP_TRY_WLOCK(inp);
 	else if (lookupflags & INPLOOKUP_RLOCKPCB)
 		locked = INP_TRY_RLOCK(inp);
 	else
 		panic("%s: locking bug", __func__);
 	if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) {
 		if (lookupflags & INPLOOKUP_WLOCKPCB)
 			INP_WUNLOCK(inp);
 		else
 			INP_RUNLOCK(inp);
 		/* The group lock is still held here; do not leak it. */
 		INP_GROUP_UNLOCK(pcbgroup);
 		return (NULL);
 	} else if (!locked)
 		in_pcbref(inp);
 	INP_GROUP_UNLOCK(pcbgroup);
 	if (!locked) {
 		/* Drop the temporary reference once the lock is held. */
 		if (lookupflags & INPLOOKUP_WLOCKPCB) {
 			INP_WLOCK(inp);
 			if (in_pcbrele_wlocked(inp))
 				return (NULL);
 		} else {
 			INP_RLOCK(inp);
 			if (in_pcbrele_rlocked(inp))
 				return (NULL);
 		}
 	}
 #ifdef INVARIANTS
 	if (lookupflags & INPLOOKUP_WLOCKPCB)
 		INP_WLOCK_ASSERT(inp);
 	else
 		INP_RLOCK_ASSERT(inp);
 #endif
 	return (inp);
 }
 #endif /* PCBGROUP */
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
  * that the caller has locked the hash list, and will not perform any further
  * locking or reference operations on either the hash list or the connection.
  */
 static struct inpcb *
 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
     struct ifnet *ifp, uint8_t numa_domain)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	/* Ports are compared as u_short; the u_int arguments are narrowed. */
 	u_short fport = fport_arg, lport = lport_arg;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
 	    pcbinfo->ipi_hashmask)];
 	CK_LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_laddr.s_addr == laddr.s_addr &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			/* A jailed (PR_IP4) match wins at once. */
 			if (prison_flag(inp->inp_cred, PR_IP4))
 				return (inp);
 			/* Otherwise remember the first non-jailed match. */
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL)
 		return (tmpinp);
 
 	/*
 	 * Then look in lb group (for wildcard match).
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
 		    fport, lookupflags, numa_domain);
 		if (inp != NULL)
 			return (inp);
 	}
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				/* Jailed pcbs must be allowed to use laddr. */
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				/*
 				 * Once a non-jailed exact match is known, only
 				 * jailed pcbs can still improve the result.
 				 */
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					return (inp);
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 		/* Candidates in decreasing order of preference. */
 		if (jail_wild != NULL)
 			return (jail_wild);
 		if (local_exact != NULL)
 			return (local_exact);
 		if (local_wild != NULL)
 			return (local_wild);
 #ifdef INET6
 		if (local_wild_mapped != NULL)
 			return (local_wild_mapped);
 #endif
 	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 
 	return (NULL);
 }
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
  * hash list lock, and will return the inpcb locked (i.e., requires
  * INPLOOKUP_LOCKPCB).
  */
 static struct inpcb *
 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, uint8_t numa_domain)
 {
 	struct inpcb *found;
 
 	/* Only the wildcard bit is meaningful to the unlocked lookup. */
 	found = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
 	if (found == NULL)
 		return (NULL);
 
 	/* Lock the result the way the caller asked for. */
 	if (lookupflags & INPLOOKUP_WLOCKPCB) {
 		INP_WLOCK(found);
 	} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
 		INP_RLOCK(found);
 	} else
 		panic("%s: locking bug", __func__);
 
 	/* A pcb already marked as freed is reported as "not found". */
 	if (__predict_false(found->inp_flags2 & INP_FREED)) {
 		INP_UNLOCK(found);
 		return (NULL);
 	}
 
 	return (found);
 }
 
 /*
  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
  * from which a pre-calculated hash value may be extracted.
  *
  * Possibly more of this logic should be in in_pcbgroup.c.
  */
 struct inpcb *
 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
 {
 #if defined(PCBGROUP) && !defined(RSS)
 	struct inpcbgroup *pcbgroup;
 #endif
 
 	/* Callers must pass only known flags and request a pcb lock type. */
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 	/*
 	 * When not using RSS, use connection groups in preference to the
 	 * reservation table when looking up 4-tuples.  When using RSS, just
 	 * use the reservation table, due to the cost of the Toeplitz hash
 	 * in software.
 	 *
 	 * XXXRW: This policy belongs in the pcbgroup code, as in principle
 	 * we could be doing RSS with a non-Toeplitz hash that is affordable
 	 * in software.
 	 */
 #if defined(PCBGROUP) && !defined(RSS)
 	if (in_pcbgroup_enabled(pcbinfo)) {
 		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 	}
 #endif
 	/* No mbuf is available here, so no NUMA domain is passed (M_NODOM). */
 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp, M_NODOM));
 }
 
 /*
  * Variant of in_pcblookup() that also takes the mbuf being processed.  With
  * PCBGROUP the mbuf's hash type and flowid may select the connection group;
  * otherwise the mbuf's NUMA domain is passed to the hash lookup.
  */
 struct inpcb *
 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, struct mbuf *m)
 {
 #ifdef PCBGROUP
 	struct inpcbgroup *pcbgroup;
 #endif
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 #ifdef PCBGROUP
 	/*
 	 * If we can use a hardware-generated hash to look up the connection
 	 * group, use that connection group to find the inpcb.  Otherwise
 	 * fall back on a software hash -- or the reservation table if we're
 	 * using RSS.
 	 *
 	 * XXXRW: As above, that policy belongs in the pcbgroup code.
 	 */
 	if (in_pcbgroup_enabled(pcbinfo) &&
 	    !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
 		pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
 		    m->m_pkthdr.flowid);
 		if (pcbgroup != NULL)
 			return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
 			    fport, laddr, lport, lookupflags, ifp));
 #ifndef RSS
 		/* No group for this hash: derive one from the 4-tuple. */
 		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 #endif
 	}
 #endif
 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp, m->m_pkthdr.numa_domain));
 }
 #endif /* INET */
 
 /*
  * Insert PCB onto various hash lists.
  *
  * The inpcb is added to the connection hash, to the per-port list (creating
  * the port head if this is the first inpcb on the port) and, when
  * SO_REUSEPORT_LB is set, to a load-balance group.  The caller must hold the
  * inpcb write lock and the hash write lock.  With PCBGROUP, m (which may be
  * NULL) is passed on to the connection-group update.
  *
  * Returns 0 on success or an errno value on allocation failure, in which
  * case the inpcb is left on none of the lists.
  */
 static int
 in_pcbinshash_internal(struct inpcb *inp, struct mbuf *m)
 {
 	struct inpcbhead *pcbhash;
 	struct inpcbporthead *pcbporthash;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbport *phd;
 	u_int32_t hashkey_faddr;
 	int so_options;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
 	    ("in_pcbinshash: INP_INHASHLIST"));
 
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
 	else
 #endif
 	hashkey_faddr = inp->inp_faddr.s_addr;
 
 	pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
 		 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 
 	pcbporthash = &pcbinfo->ipi_porthashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
 
 	/*
 	 * Add entry to load balance group.
 	 * Only do this if SO_REUSEPORT_LB is set.
 	 */
 	so_options = inp_so_options(inp);
 	if (so_options & SO_REUSEPORT_LB) {
 		int ret = in_pcbinslbgrouphash(inp, M_NODOM);
 		if (ret) {
 			/* pcb lb group malloc fail (ret=ENOBUFS). */
 			return (ret);
 		}
 	}
 
 	/*
 	 * Go through port list and look for a head for this lport.
 	 */
 	CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
 		if (phd->phd_port == inp->inp_lport)
 			break;
 	}
 	/*
 	 * If none exists, malloc one and tack it on.
 	 */
 	if (phd == NULL) {
 		phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
 		if (phd == NULL) {
 			/*
 			 * Undo the load-balance group insertion done above,
 			 * so a failed insert does not leave the inpcb
 			 * reachable through the group.
 			 */
 			if (so_options & SO_REUSEPORT_LB)
 				in_pcbremlbgrouphash(inp);
 			return (ENOBUFS); /* XXX */
 		}
 		bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context));
 		phd->phd_port = inp->inp_lport;
 		CK_LIST_INIT(&phd->phd_pcblist);
 		CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
 	}
 	inp->inp_phd = phd;
 	CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
 	CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
 	inp->inp_flags |= INP_INHASHLIST;
 #ifdef PCBGROUP
 	if (m != NULL) {
 		in_pcbgroup_update_mbuf(inp, m);
 	} else {
 		in_pcbgroup_update(inp);
 	}
 #endif
 	return (0);
 }
 
 /*
  * Insert inp onto the hash lists with no mbuf available; see
  * in_pcbinshash_internal() for the return value.
  */
 int
 in_pcbinshash(struct inpcb *inp)
 {
 
 	return (in_pcbinshash_internal(inp, NULL));
 }
 
 /*
  * Insert inp onto the hash lists, passing mbuf m through to
  * in_pcbinshash_internal().
  */
 int
 in_pcbinshash_mbuf(struct inpcb *inp, struct mbuf *m)
 {
 
 	return (in_pcbinshash_internal(inp, m));
 }
 
 /*
  * Move PCB to the proper hash bucket when { faddr, fport } have been
  * changed. NOTE: This does not handle the case of the lport changing (the
  * hashed port list would have to be updated as well), so the lport must
  * not change after in_pcbinshash() has been called.
  */
 void
 in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbhead *head;
 	u_int32_t hashkey_faddr;
 
 	/* Caller holds both the inpcb write lock and the hash write lock. */
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT(inp->inp_flags & INP_INHASHLIST,
 	    ("in_pcbrehash: !INP_INHASHLIST"));
 
 	/* IPv6 pcbs hash on a key derived from the 128-bit foreign address. */
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
 	else
 #endif
 	hashkey_faddr = inp->inp_faddr.s_addr;
 
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
 		inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 
 	/* Move the pcb from its old bucket to the head of the new one. */
 	CK_LIST_REMOVE(inp, inp_hash);
 	CK_LIST_INSERT_HEAD(head, inp, inp_hash);
 
 #ifdef PCBGROUP
 	if (m != NULL)
 		in_pcbgroup_update_mbuf(inp, m);
 	else
 		in_pcbgroup_update(inp);
 #endif
 }
 
 /*
  * Rehash inp with no mbuf available; equivalent to
  * in_pcbrehash_mbuf(inp, NULL).
  */
 void
 in_pcbrehash(struct inpcb *inp)
 {
 
 	in_pcbrehash_mbuf(inp, NULL);
 }
 
 /*
  * Remove PCB from various lists.
  */
 static void
 in_pcbremlists(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 
 	/* Caller holds the inpcb write lock and the pcbinfo list write lock. */
 	INP_WLOCK_ASSERT(inp);
 	INP_LIST_WLOCK_ASSERT(pcbinfo);
 
 	/* Stamp the pcb with a fresh generation number. */
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	if (inp->inp_flags & INP_INHASHLIST) {
 		struct inpcbport *phd = inp->inp_phd;
 
 		INP_HASH_WLOCK(pcbinfo);
 
 		/* XXX: Only do if SO_REUSEPORT_LB set? */
 		in_pcbremlbgrouphash(inp);
 
 		CK_LIST_REMOVE(inp, inp_hash);
 		CK_LIST_REMOVE(inp, inp_portlist);
 		/* Last pcb on this port: retire the port head via the epoch. */
 		if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
 			CK_LIST_REMOVE(phd, phd_hash);
 			NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx);
 		}
 		INP_HASH_WUNLOCK(pcbinfo);
 		inp->inp_flags &= ~INP_INHASHLIST;
 	}
 	/* Unlink from the global pcb list and adjust the pcb count. */
 	CK_LIST_REMOVE(inp, inp_list);
 	pcbinfo->ipi_count--;
 #ifdef PCBGROUP
 	in_pcbgroup_remove(inp);
 #endif
 }
 
 /*
  * Check for alternatives when higher level complains
  * about service problems.  For now, invalidate cached
  * routing information.  If the route was created dynamically
  * (by a redirect), time to try a default gateway again.
  */
 void
 in_losing(struct inpcb *inp)
 {
 
 	/* The only action taken for now: drop the cached route. */
 	RO_INVALIDATE_CACHE(&inp->inp_route);
 }
 
 /*
  * A set label operation has occurred at the socket layer, propagate the
  * label change into the in_pcb for the socket.
  */
 void
 in_pcbsosetlabel(struct socket *so)
 {
 	/* Without MAC support this function is an empty stub. */
 #ifdef MAC
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
 
 	/* Lock order here: inpcb write lock first, then the socket lock. */
 	INP_WLOCK(inp);
 	SOCK_LOCK(so);
 	mac_inpcb_sosetlabel(so, inp);
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(inp);
 #endif
 }
 
 /*
  * ipport_tick runs once per second, determining if random port allocation
  * should be continued.  If more than ipport_randomcps ports have been
  * allocated in the last second, then we return to sequential port
  * allocation. We return to random allocation only once we drop below
  * ipport_randomcps for at least ipport_randomtime seconds.
  */
 static void
 ipport_tick(void *xtp)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);	/* XXX appease INVARIANTS here */
 		/*
 		 * At or below the allowed allocation rate since the last
 		 * tick: count the stop-random timer down towards zero.
 		 * Above it: re-arm the timer to the full randomtime.
 		 */
 		if (V_ipport_tcpallocs <=
 		    V_ipport_tcplastcount + V_ipport_randomcps) {
 			if (V_ipport_stoprandom > 0)
 				V_ipport_stoprandom--;
 		} else
 			V_ipport_stoprandom = V_ipport_randomtime;
 		/* Remember the current total for the next tick's comparison. */
 		V_ipport_tcplastcount = V_ipport_tcpallocs;
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	/* Re-arm for hz ticks from now. */
 	callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
 }
 
 /*
  * Event handler (registered for shutdown_pre_sync by ipport_tick_init())
  * that stops the ipport_tick callout.
  */
 static void
 ip_fini(void *xtp)
 {
 
 	callout_stop(&ipport_tick_callout);
 }
 
 /*
  * The ipport_callout should start running at about the time we attach the
  * inet or inet6 domains.
  */
 static void
 ipport_tick_init(const void *unused __unused)
 {
 
 	/* Start ipport_tick. */
 	callout_init(&ipport_tick_callout, 1);
 	/* First run is scheduled one tick from now. */
 	callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
 	/* Arrange for the callout to be stopped at shutdown. */
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
 }
 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
     ipport_tick_init, NULL);
 
 /* Function wrapper around the INP_WLOCK() macro. */
 void
 inp_wlock(struct inpcb *inp)
 {
 
 	INP_WLOCK(inp);
 }
 
 /* Function wrapper around the INP_WUNLOCK() macro. */
 void
 inp_wunlock(struct inpcb *inp)
 {
 
 	INP_WUNLOCK(inp);
 }
 
 /* Function wrapper around the INP_RLOCK() macro. */
 void
 inp_rlock(struct inpcb *inp)
 {
 
 	INP_RLOCK(inp);
 }
 
 /* Function wrapper around the INP_RUNLOCK() macro. */
 void
 inp_runlock(struct inpcb *inp)
 {
 
 	INP_RUNLOCK(inp);
 }
 
 #ifdef INVARIANT_SUPPORT
 /*
  * Function wrapper around INP_WLOCK_ASSERT(); note that it asserts the
  * write lock specifically.
  */
 void
 inp_lock_assert(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 }
 
 /* Function wrapper around the INP_UNLOCK_ASSERT() macro. */
 void
 inp_unlock_assert(struct inpcb *inp)
 {
 
 	INP_UNLOCK_ASSERT(inp);
 }
 #endif
 
 /*
  * Call func(inp, arg) for every inpcb on the TCP pcbinfo list (V_tcbinfo).
  * The pcbinfo write lock is held for the whole walk and each inpcb is
  * write-locked around its callback.
  */
 void
 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
 {
 	struct inpcb *inp;
 
 	INP_INFO_WLOCK(&V_tcbinfo);
 	CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
 		INP_WLOCK(inp);
 		func(inp, arg);
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 }
 
 /* Return the inpcb's socket pointer; the inpcb must be write-locked. */
 struct socket *
 inp_inpcbtosocket(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return (inp->inp_socket);
 }
 
 /*
  * Return inp_ppcb cast to a tcpcb pointer; the inpcb must be write-locked.
  * No check is made that the inpcb actually belongs to TCP.
  */
 struct tcpcb *
 inp_inpcbtotcpcb(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return ((struct tcpcb *)inp->inp_ppcb);
 }
 
 /* Return the inpcb's inp_ip_tos value.  No lock is asserted here. */
 int
 inp_ip_tos_get(const struct inpcb *inp)
 {
 
 	return (inp->inp_ip_tos);
 }
 
 /*
  * Store val into the inpcb's inp_ip_tos field.  No lock is asserted and val
  * is not range-checked here.
  */
 void
 inp_ip_tos_set(struct inpcb *inp, int val)
 {
 
 	inp->inp_ip_tos = val;
 }
 
 /*
  * Copy the inpcb's IPv4 local/foreign addresses and ports into the four
  * output parameters, exactly as stored in the inpcb.  The inpcb must be
  * locked.
  */
 void
 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
     uint32_t *faddr, uint16_t *fp)
 {
 
 	INP_LOCK_ASSERT(inp);
 	*laddr = inp->inp_laddr.s_addr;
 	*faddr = inp->inp_faddr.s_addr;
 	*lp = inp->inp_lport;
 	*fp = inp->inp_fport;
 }
 
 /* Function wrapper around the sotoinpcb() accessor. */
 struct inpcb *
 so_sotoinpcb(struct socket *so)
 {
 
 	return (sotoinpcb(so));
 }
 
 /* Function wrapper around the sototcpcb() accessor. */
 struct tcpcb *
 so_sototcpcb(struct socket *so)
 {
 
 	return (sototcpcb(so));
 }
 
 /*
  * Create an external-format (``xinpcb'') structure using the information in
  * the kernel-format in_pcb structure pointed to by inp.  This is done to
  * reduce the spew of irrelevant information over this interface, to isolate
  * user code from changes in the kernel structure, and potentially to provide
  * information-hiding if we decide that some of this information should be
  * hidden from users.
  */
 void
 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
 {
 
 	bzero(xi, sizeof(*xi));
 	xi->xi_len = sizeof(struct xinpcb);
 	if (inp->inp_socket)
 		sotoxsocket(inp->inp_socket, &xi->xi_socket);
 	bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
 	xi->inp_gencnt = inp->inp_gencnt;
 	xi->inp_ppcb = (uintptr_t)inp->inp_ppcb;
 	xi->inp_flow = inp->inp_flow;
 	xi->inp_flowid = inp->inp_flowid;
 	xi->inp_flowtype = inp->inp_flowtype;
 	xi->inp_flags = inp->inp_flags;
 	xi->inp_flags2 = inp->inp_flags2;
 	xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket;
 	xi->in6p_cksum = inp->in6p_cksum;
 	xi->in6p_hops = inp->in6p_hops;
 	xi->inp_ip_tos = inp->inp_ip_tos;
 	xi->inp_vflag = inp->inp_vflag;
 	xi->inp_ip_ttl = inp->inp_ip_ttl;
 	xi->inp_ip_p = inp->inp_ip_p;
 	xi->inp_ip_minttl = inp->inp_ip_minttl;
 }
 
 #ifdef DDB
 /* Emit `indent` spaces on the debugger console (nothing if indent <= 0). */
 static void
 db_print_indent(int indent)
 {
 	int remaining;
 
 	for (remaining = indent; remaining > 0; remaining--)
 		db_printf(" ");
 }
 
 /*
  * Print an in_conninfo for DDB: a heading line with "name" and the
  * structure's address at the given indentation, followed by the local and
  * foreign address/port pairs indented two columns further.  Ports are
  * converted with ntohs() before printing.
  */
 static void
 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
 {
 	char foreign[48], local[48];
 	const int inner = indent + 2;
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inc);
 
 #ifdef INET6
 	if ((inc->inc_flags & INC_ISIPV6) != 0) {
 		/* IPv6 endpoints. */
 		ip6_sprintf(local, &inc->inc6_laddr);
 		ip6_sprintf(foreign, &inc->inc6_faddr);
 	} else
 #endif
 	{
 		/* IPv4 endpoints. */
 		inet_ntoa_r(inc->inc_laddr, local);
 		inet_ntoa_r(inc->inc_faddr, foreign);
 	}
 
 	db_print_indent(inner);
 	db_printf("inc_laddr %s   inc_lport %u\n", local,
 	    ntohs(inc->inc_lport));
 
 	db_print_indent(inner);
 	db_printf("inc_faddr %s   inc_fport %u\n", foreign,
 	    ntohs(inc->inc_fport));
 }
 
 static void
 db_print_inpflags(int inp_flags)
 {
 	int comma;
 
 	comma = 0;
 	if (inp_flags & INP_RECVOPTS) {
 		db_printf("%sINP_RECVOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVRETOPTS) {
 		db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVDSTADDR) {
 		db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_ORIGDSTADDR) {
 		db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HDRINCL) {
 		db_printf("%sINP_HDRINCL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HIGHPORT) {
 		db_printf("%sINP_HIGHPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_LOWPORT) {
 		db_printf("%sINP_LOWPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_ANONPORT) {
 		db_printf("%sINP_ANONPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVIF) {
 		db_printf("%sINP_RECVIF", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_MTUDISC) {
 		db_printf("%sINP_MTUDISC", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTTL) {
 		db_printf("%sINP_RECVTTL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_DONTFRAG) {
 		db_printf("%sINP_DONTFRAG", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTOS) {
 		db_printf("%sINP_RECVTOS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_IPV6_V6ONLY) {
 		db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_PKTINFO) {
 		db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPLIMIT) {
 		db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPOPTS) {
 		db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_DSTOPTS) {
 		db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDR) {
 		db_printf("%sIN6P_RTHDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDRDSTOPTS) {
 		db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_TCLASS) {
 		db_printf("%sIN6P_TCLASS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_AUTOFLOWLABEL) {
 		db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_TIMEWAIT) {
 		db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_ONESBCAST) {
 		db_printf("%sINP_ONESBCAST", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_DROPPED) {
 		db_printf("%sINP_DROPPED", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_SOCKREF) {
 		db_printf("%sINP_SOCKREF", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & IN6P_RFC2292) {
 		db_printf("%sIN6P_RFC2292", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_MTU) {
 		db_printf("IN6P_MTU%s", comma ? ", " : "");
 		comma = 1;
 	}
 }
 
 /*
  * Print the symbolic names of the bits set in inp_vflag as a ", " separated
  * list on the DDB console.
  */
 static void
 db_print_inpvflag(u_char inp_vflag)
 {
 	const char *sep = "";
 
 	if ((inp_vflag & INP_IPV4) != 0) {
 		db_printf("%sINP_IPV4", sep);
 		sep = ", ";
 	}
 	if ((inp_vflag & INP_IPV6) != 0) {
 		db_printf("%sINP_IPV6", sep);
 		sep = ", ";
 	}
 	if ((inp_vflag & INP_IPV6PROTO) != 0)
 		db_printf("%sINP_IPV6PROTO", sep);
 }
 
 /*
  * Dump the fields of an inpcb on the DDB console.
  *
  * A heading line with "name" and the structure's address is printed at
  * "indent"; every following line is indented two columns further.  The
  * address-family specific section prints the in6p_* fields when INP_IPV6 is
  * set in inp_vflag (INET6 kernels only) and the IPv4 fields otherwise.
  */
 static void
 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
 {
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inp);
 
 	indent += 2;
 
 	db_print_indent(indent);
 	db_printf("inp_flow: 0x%x\n", inp->inp_flow);
 
 	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
 
 	db_print_indent(indent);
 	db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
 	    inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
 
 	db_print_indent(indent);
 	db_printf("inp_label: %p   inp_flags: 0x%x (",
 	   inp->inp_label, inp->inp_flags);
 	db_print_inpflags(inp->inp_flags);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
 	    inp->inp_vflag);
 	db_print_inpvflag(inp->inp_vflag);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
 	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
 
 	db_print_indent(indent);
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6) {
 		db_printf("in6p_options: %p   in6p_outputopts: %p   "
 		    "in6p_moptions: %p\n", inp->in6p_options,
 		    inp->in6p_outputopts, inp->in6p_moptions);
 		/*
 		 * The IPv6 section spans two lines; indent the second one
 		 * as well so it lines up with the rest of the dump.
 		 */
 		db_print_indent(indent);
 		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
 		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
 		    inp->in6p_hops);
 	} else
 #endif
 	{
 		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
 		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
 		    inp->inp_options, inp->inp_moptions);
 	}
 
 	db_print_indent(indent);
 	db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
 	    (uintmax_t)inp->inp_gencnt);
 }
 
 /*
  * DDB "show inpcb <addr>" command: interpret the address argument as a
  * struct inpcb pointer and dump it.  Without an address only a usage line
  * is printed.
  */
 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
 {
 
 	if (have_addr) {
 		db_print_inpcb((struct inpcb *)addr, "inpcb", 0);
 		return;
 	}
 	db_printf("usage: show inpcb <addr>\n");
 }
 #endif /* DDB */
 
 #ifdef RATELIMIT
 /*
  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
  * if any.
  *
  * Returns EINVAL when the inpcb has no send tag, EOPNOTSUPP when the
  * tag's method table ("mst->sw") has no snd_tag_modify handler, and
  * otherwise whatever that handler returns.
  */
 int
 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
 {
 	/* Request block handed to the handler: new rate plus M_NOWAIT. */
 	union if_snd_tag_modify_params params = {
 		.rate_limit.max_rate = max_pacing_rate,
 		.rate_limit.flags = M_NOWAIT,
 	};
 	struct m_snd_tag *mst;
-	struct ifnet *ifp;
 	int error;
 
 	mst = inp->inp_snd_tag;
 	if (mst == NULL)
 		return (EINVAL);
 
-	ifp = mst->ifp;
-	if (ifp == NULL)
-		return (EINVAL);
-
-	if (ifp->if_snd_tag_modify == NULL) {
+	if (mst->sw->snd_tag_modify == NULL) {
 		error = EOPNOTSUPP;
 	} else {
-		error = ifp->if_snd_tag_modify(mst, &params);
+		error = mst->sw->snd_tag_modify(mst, &params);
 	}
 	return (error);
 }
 
 /*
  * Query existing TX rate limit based on the existing
  * "inp->inp_snd_tag", if any.
  *
  * On success the rate reported by the handler is stored through
  * "p_max_pacing_rate" when that pointer is not NULL.  Returns EINVAL
  * when the inpcb has no send tag, EOPNOTSUPP when the tag's method
  * table ("mst->sw") has no snd_tag_query handler, and otherwise
  * whatever that handler returns.
  */
 int
 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
 {
 	/* Zero-initialized result block filled in by the handler. */
 	union if_snd_tag_query_params params = { };
 	struct m_snd_tag *mst;
-	struct ifnet *ifp;
 	int error;
 
 	mst = inp->inp_snd_tag;
 	if (mst == NULL)
 		return (EINVAL);
 
-	ifp = mst->ifp;
-	if (ifp == NULL)
-		return (EINVAL);
-
-	if (ifp->if_snd_tag_query == NULL) {
+	if (mst->sw->snd_tag_query == NULL) {
 		error = EOPNOTSUPP;
 	} else {
-		error = ifp->if_snd_tag_query(mst, &params);
-		if (error == 0 &&  p_max_pacing_rate != NULL)
+		error = mst->sw->snd_tag_query(mst, &params);
+		if (error == 0 && p_max_pacing_rate != NULL)
 			*p_max_pacing_rate = params.rate_limit.max_rate;
 	}
 	return (error);
 }
 
 /*
  * Query existing TX queue level based on the existing
  * "inp->inp_snd_tag", if any.
  *
  * On success the queue level reported by the handler is stored through
  * "p_txqueue_level" when that pointer is not NULL.  Returns EINVAL when
  * the inpcb has no send tag, EOPNOTSUPP when the tag's method table
  * ("mst->sw") has no snd_tag_query handler, and otherwise whatever that
  * handler returns.
  */
 int
 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
 {
 	/* Zero-initialized result block filled in by the handler. */
 	union if_snd_tag_query_params params = { };
 	struct m_snd_tag *mst;
-	struct ifnet *ifp;
 	int error;
 
 	mst = inp->inp_snd_tag;
 	if (mst == NULL)
 		return (EINVAL);
 
-	ifp = mst->ifp;
-	if (ifp == NULL)
-		return (EINVAL);
-
-	if (ifp->if_snd_tag_query == NULL)
+	if (mst->sw->snd_tag_query == NULL)
 		return (EOPNOTSUPP);
 
-	error = ifp->if_snd_tag_query(mst, &params);
-	if (error == 0 &&  p_txqueue_level != NULL)
+	error = mst->sw->snd_tag_query(mst, &params);
+	if (error == 0 && p_txqueue_level != NULL)
 		*p_txqueue_level = params.rate_limit.queue_level;
 	return (error);
 }
 
 /*
  * Allocate a new TX rate limit send tag from the network interface
  * given by the "ifp" argument and store it through "st" (callers in
  * this file pass "&inp->inp_snd_tag").
  *
  * A max_pacing_rate of -1U requests an IF_SND_TAG_TYPE_UNLIMITED tag;
  * any other value requests IF_SND_TAG_TYPE_RATE_LIMIT.  The inpcb must
  * be write-locked (asserted).  Returns EINVAL when "*st" already holds
  * a tag or the inpcb is in TIMEWAIT/DROPPED state, otherwise the result
  * of m_snd_tag_alloc().
  */
 int
 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
     uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate,
     struct m_snd_tag **st)
 {
 	union if_snd_tag_alloc_params params = {
 		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
 		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
 		.rate_limit.hdr.flowid = flowid,
 		.rate_limit.hdr.flowtype = flowtype,
 		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
 		.rate_limit.max_rate = max_pacing_rate,
 		.rate_limit.flags = M_NOWAIT,
 	};
 	int error;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Refuse to overwrite an existing tag; it would be leaked. */
 	if (*st != NULL)
 		return (EINVAL);
 
 	/* Likewise refuse while the INP is being torn down. */
 	if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0)
 		return (EINVAL);
 
 	error = m_snd_tag_alloc(ifp, &params, st);
 #ifdef INET
 	switch (error) {
 	case 0:
 		counter_u64_add(rate_limit_set_ok, 1);
 		counter_u64_add(rate_limit_active, 1);
 		break;
 	case EOPNOTSUPP:
 		/* Lack of support is not counted as an allocation failure. */
 		break;
 	default:
 		counter_u64_add(rate_limit_alloc_fail, 1);
 		break;
 	}
 #endif
 	return (error);
 }
 
 /*
  * Release a reference on the given send tag and, on INET kernels,
  * decrement the rate_limit_active counter.
  */
 void
 in_pcbdetach_tag(struct m_snd_tag *mst)
 {
 	m_snd_tag_rele(mst);
 
 #ifdef INET
 	counter_u64_add(rate_limit_active, -1);
 #endif
 }
 
 /*
  * Detach and release the TX rate limit tag referenced by
  * "inp->inp_snd_tag", if there is one.  The pointer in the inpcb is
  * always cleared.  The inpcb must be write-locked (asserted).
  */
 void
 in_pcbdetach_txrtlmt(struct inpcb *inp)
 {
 	struct m_snd_tag *old_tag;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Unhook the tag from the inpcb before dropping the reference. */
 	old_tag = inp->inp_snd_tag;
 	inp->inp_snd_tag = NULL;
 
 	if (old_tag != NULL) {
 		m_snd_tag_rele(old_tag);
 #ifdef INET
 		counter_u64_add(rate_limit_active, -1);
 #endif
 	}
 }
 
 /*
  * Bring the inpcb's send tag in line with "max_pacing_rate" for output
  * on "ifp": attach a tag, modify the existing one, or detach it.
  *
  * INP_RATE_LIMIT_CHANGED is cleared when the result is 0 or EOPNOTSUPP
  * and left set otherwise, so that a later call retries.
  */
 int
 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp,
     struct mbuf *mb, uint32_t max_pacing_rate)
 {
 	int error = 0;
 
 	/*
 	 * A tag that belongs to a different interface (the route has
 	 * changed) is dropped first.  CHANGED is raised so that the
 	 * allocation is retried later should it fail below.
 	 */
 	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
 		in_pcbdetach_txrtlmt(inp);
 		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 	}
 
 	/*
 	 * NOTE: Attaching to a network interface takes a reference so
 	 * that the interface cannot go away while ratelimit connections
 	 * remain.  The interface pointers compared here are therefore
 	 * valid, except when comparing against NULL.
 	 */
 	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
 		/* No limit requested and no tag present: nothing to do. */
 	} else if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) {
 		/* Rate limiting is not enabled on this interface. */
 		if (inp->inp_snd_tag != NULL)
 			in_pcbdetach_txrtlmt(inp);
 	} else if (inp->inp_snd_tag != NULL) {
 		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
 	} else if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
 		/*
 		 * Packet pacing with RSS needs a valid RSS hash; wait
 		 * until the mbuf carries one.
 		 */
 		error = EAGAIN;
 	} else {
 		error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
 		    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
 	}
 
 	if (error == 0 || error == EOPNOTSUPP)
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 
 	return (error);
 }
 
 /*
  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
  * is set in the fast path and will attach/detach/modify the TX rate
  * limit send tag based on the socket's so_max_pacing_rate value.
  */
 void
 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
 {
 	struct socket *socket;
 	uint32_t max_pacing_rate;
 	bool did_upgrade;
 	int error;
 
 	if (inp == NULL)
 		return;
 
 	socket = inp->inp_socket;
 	if (socket == NULL)
 		return;
 
 	if (!INP_WLOCKED(inp)) {
 		/*
 		 * NOTE: If the write locking fails, we need to bail
 		 * out and use the non-ratelimited ring for the
 		 * transmit until there is a new chance to get the
 		 * write lock.
 		 */
 		if (!INP_TRY_UPGRADE(inp))
 			return;
 		did_upgrade = 1;
 	} else {
 		did_upgrade = 0;
 	}
 
 	/*
 	 * NOTE: The so_max_pacing_rate value is read unlocked,
 	 * because atomic updates are not required since the variable
 	 * is checked at every mbuf we send. It is assumed that the
 	 * variable read itself will be atomic.
 	 */
 	max_pacing_rate = socket->so_max_pacing_rate;
 
 	error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
 
 	if (did_upgrade)
 		INP_DOWNGRADE(inp);
 }
 
 /*
  * Track route changes for TX rate limiting: drop the current send tag
  * and raise INP_RATE_LIMIT_CHANGED so that a new tag is allocated later.
  *
  * Nothing is done when "inp" is NULL, when it has no send tag, or when
  * the inpcb is not write-locked and the lock upgrade fails.
  */
 void
 in_pcboutput_eagain(struct inpcb *inp)
 {
 	bool did_upgrade;
 
 	if (inp == NULL || inp->inp_snd_tag == NULL)
 		return;
 
 	/*
 	 * NOTE: When the write lock cannot be obtained, bail out and
 	 * keep transmitting on the non-ratelimited ring until there is
 	 * another chance to take it.
 	 */
 	did_upgrade = !INP_WLOCKED(inp);
 	if (did_upgrade && !INP_TRY_UPGRADE(inp))
 		return;
 
 	/* Drop the current tag ... */
 	in_pcbdetach_txrtlmt(inp);
 
 	/* ... and request allocation of a fresh one. */
 	inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 
 	if (did_upgrade)
 		INP_DOWNGRADE(inp);
 }
 
 #ifdef INET
 /*
  * Allocate the rate limit statistics counters.  The argument is unused.
  */
 static void
 rl_init(void *st)
 {
 
 	/* Tag lifecycle counters. */
 	rate_limit_new = counter_u64_alloc(M_WAITOK);
 	rate_limit_chg = counter_u64_alloc(M_WAITOK);
 	rate_limit_active = counter_u64_alloc(M_WAITOK);
 
 	/* Allocation outcome counters. */
 	rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
 	rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
 }
 
 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
 #endif
 #endif /* RATELIMIT */
diff --git a/sys/netinet/tcp_ratelimit.c b/sys/netinet/tcp_ratelimit.c
index c33b2872e91f..528dc062fd97 100644
--- a/sys/netinet/tcp_ratelimit.c
+++ b/sys/netinet/tcp_ratelimit.c
@@ -1,1734 +1,1734 @@
 /*-
  *
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2018-2020
  *	Netflix Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 /**
  * Author: Randall Stewart <rrs@netflix.com>
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 #include "opt_ratelimit.h"
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/eventhandler.h>
 #include <sys/mutex.h>
 #include <sys/ck.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #define TCPSTATES		/* for logging */
 #include <netinet/tcp_var.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcp_hpts.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcp_ratelimit.h>
 #ifndef USECS_IN_SECOND
 #define USECS_IN_SECOND 1000000
 #endif
 /*
  * For the purposes of each send, what is the size
  * of an ethernet frame.
  */
 MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
 #ifdef RATELIMIT
 
 /*
  * The following preferred table will seem weird to
  * the casual viewer. Why do we not have any rates below
  * 1Mbps? Why do we have a rate at 1.44Mbps called common?
  * Why do the rates cluster in the 1-100Mbps range more
  * than others? Why does the table jump around at the beginning
  * and then rise more consistently?
  *
  * Let me try to answer those questions. A lot of
  * this is dependent on the hardware. We have three basic
  * supporters of rate limiting
  *
  * Chelsio - Supporting 16 configurable rates.
  * Mlx  - c4 supporting 13 fixed rates.
  * Mlx  - c5 & c6 supporting 127 configurable rates.
  *
  * The c4 is why we have a common rate that is available
  * in all rate tables. This is a selected rate from the
  * c4 table and we assure its available in all ratelimit
  * tables. This way the tcp_ratelimit code has an assured
  * rate it should always be able to get. This answers a
  * couple of the questions above.
  *
  * So what about the rest, well the table is built to
  * try to get the most out of a joint hardware/software
  * pacing system.  The software pacer will always pick
  * a rate higher than the b/w that it is estimating
  * on the path. This is done for two reasons.
  * a) So we can discover more b/w
  * and
  * b) So we can send a block of MSS's down and then
  *    have the software timer go off after the previous
  *    send is completely out of the hardware.
  *
  * But when we do <b> we don't want to have the delay
  * between the last packet sent by the hardware be
  * excessively long (to reach our desired rate).
  *
  * So let me give an example for clarity.
  *
  * Lets assume that the tcp stack sees that 29,110,000 bps is
  * what the bw of the path is. The stack would select the
  * rate 31Mbps. 31Mbps means that each send that is done
  * by the hardware will cause a 390 micro-second gap between
  * the packets sent at that rate. For 29,110,000 bps we
  * would need 416 micro-seconds gap between each send.
  *
  * Note that we are calculating a complete time for pacing
  * which includes the ethernet, IP and TCP overhead. So
  * a full 1514 bytes is used for the above calculations.
  * My testing has shown that both cards are also using this
  * as their basis i.e. full payload size of the ethernet frame.
  * The TCP stack caller needs to be aware of this and make the
  * appropriate overhead calculations be included in its choices.
  *
  * Now, continuing our example, we pick a MSS size based on the
  * delta between the two rates (416 - 390) divided into the rate
  * we really wish to send at rounded up.  That results in a MSS
  * send of 17 mss's at once. The hardware then will
  * run out of data in a single 17MSS send in 6,630 micro-seconds.
  *
  * On the other hand the software pacer will send more data
  * in 7,072 micro-seconds. This means that we will refill
  * the hardware 52 microseconds after it would have sent
  * next if it had not ran out of data. This is a win since we are
  * only sending every 7ms or so and yet all the packets are spaced on
  * the wire with 94% of what they should be and only
  * the last packet is delayed extra to make up for the
  * difference.
  *
  * Note that the above formula has two important caveats.
  * If we are (b/w wise) over 100Mbps we double the result
  * of the MSS calculation. The second caveat is if we are 500Mbps
  * or more we just send the maximum MSS at once i.e. 45MSS. At
  * the higher b/w's even the cards have limits to what times (timer granularity)
  * they can insert between packets and start to send more than one
  * packet at a time on the wire.
  *
  */
 #define COMMON_RATE 180500
 /*
  * Preferred hardware pacing rates, in bytes per second (for example
  * 1250000 bytes/s is the 10Mbps entry).
  *
  * The table consists of two groups that are each in ascending order:
  * the first RS_ORDERED_COUNT entries, and the entries starting at index
  * RS_NEXT_ORDER_GROUP.  Taken as a whole the table is therefore not
  * sorted.  Note that 12500000 (100Mbps) appears in both groups and that
  * the "rate N" labels in the comments skip 18.
  */
 const uint64_t desired_rates[] = {
 	122500,			/* 1Mbps  - rate 1 */
 	180500,			/* 1.44Mbps - rate 2  common rate */
 	375000,			/* 3Mbps    - rate 3 */
 	625000,			/* 5Mbps    - rate 4 */
 	1250000,		/* 10Mbps   - rate 5 */
 	1875000,		/* 15Mbps   - rate 6 */
 	2500000,		/* 20Mbps   - rate 7 */
 	3125000,	       	/* 25Mbps   - rate 8 */
 	3750000,		/* 30Mbps   - rate 9 */
 	4375000,		/* 35Mbps   - rate 10 */
 	5000000,		/* 40Mbps   - rate 11 */
 	6250000,		/* 50Mbps   - rate 12 */
 	12500000,		/* 100Mbps  - rate 13 */
 	25000000,		/* 200Mbps  - rate 14 */
 	50000000,		/* 400Mbps  - rate 15 */
 	100000000,		/* 800Mbps  - rate 16 */
 	5625000,		/* 45Mbps   - rate 17 */
 	6875000,		/* 55Mbps   - rate 19 */
 	7500000,		/* 60Mbps   - rate 20 */
 	8125000,		/* 65Mbps   - rate 21 */
 	8750000,		/* 70Mbps   - rate 22 */
 	9375000,		/* 75Mbps   - rate 23 */
 	10000000,		/* 80Mbps   - rate 24 */
 	10625000,		/* 85Mbps   - rate 25 */
 	11250000,		/* 90Mbps   - rate 26 */
 	11875000,		/* 95Mbps   - rate 27 */
 	12500000,		/* 100Mbps  - rate 28 */
 	13750000,		/* 110Mbps  - rate 29 */
 	15000000,		/* 120Mbps  - rate 30 */
 	16250000,		/* 130Mbps  - rate 31 */
 	17500000,		/* 140Mbps  - rate 32 */
 	18750000,		/* 150Mbps  - rate 33 */
 	20000000,		/* 160Mbps  - rate 34 */
 	21250000,		/* 170Mbps  - rate 35 */
 	22500000,		/* 180Mbps  - rate 36 */
 	23750000,		/* 190Mbps  - rate 37 */
 	26250000,		/* 210Mbps  - rate 38 */
 	27500000,		/* 220Mbps  - rate 39 */
 	28750000,		/* 230Mbps  - rate 40 */
 	30000000,	       	/* 240Mbps  - rate 41 */
 	31250000,		/* 250Mbps  - rate 42 */
 	34375000,		/* 275Mbps  - rate 43 */
 	37500000,		/* 300Mbps  - rate 44 */
 	40625000,		/* 325Mbps  - rate 45 */
 	43750000,		/* 350Mbps  - rate 46 */
 	46875000,		/* 375Mbps  - rate 47 */
 	53125000,		/* 425Mbps  - rate 48 */
 	56250000,		/* 450Mbps  - rate 49 */
 	59375000,		/* 475Mbps  - rate 50 */
 	62500000,		/* 500Mbps  - rate 51 */
 	68750000,		/* 550Mbps  - rate 52 */
 	75000000,		/* 600Mbps  - rate 53 */
 	81250000,		/* 650Mbps  - rate 54 */
 	87500000,		/* 700Mbps  - rate 55 */
 	93750000,		/* 750Mbps  - rate 56 */
 	106250000,		/* 850Mbps  - rate 57 */
 	112500000,		/* 900Mbps  - rate 58 */
 	125000000,		/* 1Gbps    - rate 59 */
 	156250000,		/* 1.25Gbps - rate 60 */
 	187500000,		/* 1.5Gbps  - rate 61 */
 	218750000,		/* 1.75Gbps - rate 62 */
 	250000000,		/* 2Gbps    - rate 63 */
 	281250000,		/* 2.25Gbps - rate 64 */
 	312500000,		/* 2.5Gbps  - rate 65 */
 	343750000,		/* 2.75Gbps - rate 66 */
 	375000000,		/* 3Gbps    - rate 67 */
 	500000000,		/* 4Gbps    - rate 68 */
 	625000000,		/* 5Gbps    - rate 69 */
 	750000000,		/* 6Gbps    - rate 70 */
 	875000000,		/* 7Gbps    - rate 71 */
 	1000000000,		/* 8Gbps    - rate 72 */
 	1125000000,		/* 9Gbps    - rate 73 */
 	1250000000,		/* 10Gbps   - rate 74 */
 	1875000000,		/* 15Gbps   - rate 75 */
 	2500000000		/* 20Gbps   - rate 76 */
 };
 
 /* Number of entries in desired_rates[]. */
 #define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
 #define RS_ORDERED_COUNT 16	/*
 				 * Number of entries that are in order
 				 * at the beginning of the table;
 				 * beyond this a sort is required.
 				 */
 #define RS_NEXT_ORDER_GROUP 16	/*
 				 * Index in the table at which the
 				 * second ordered group begins
 				 * (index wise means -1).
 				 */
 #define ALL_HARDWARE_RATES 1004 /*
 				 * 1Meg - 1Gig in 1 Meg steps
 				 * plus 100, 200k  and 500k and
 				 * 10Gig
 				 */
 
 /* Bits per second. */
 #define RS_ONE_MEGABIT_PERSEC 1000000
 #define RS_ONE_GIGABIT_PERSEC 1000000000
 #define RS_TEN_GIGABIT_PERSEC 10000000000
 
 /*
  * List of per-interface rate sets and the mutex taken around updates to
  * it; rs_mtx is also held while rs_number_dead and a rate set's rs_flags
  * are updated (see rs_destroy()).
  */
 static struct head_tcp_rate_set int_rs;
 static struct mtx rs_mtx;
 uint32_t rs_number_alive;
 uint32_t rs_number_dead;
 static uint32_t rs_floor_mss = 0;
 static uint32_t wait_time_floor = 8000;	/* 8 ms */
 static uint32_t rs_hw_floor_mss = 16;
 static uint32_t num_of_waits_allowed = 1; /* How many time blocks are we willing to wait */
 
 /* Tunables and statistics exported under net.inet.tcp.rl. */
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TCP Ratelimit stats");
 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
     &rs_number_alive, 0,
     "Number of interfaces initialized for ratelimiting");
 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
     &rs_number_dead, 0,
     "Number of interfaces departing from ratelimiting");
 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, floor_mss, CTLFLAG_RW,
     &rs_floor_mss, 0,
     "Number of MSS that will override the normal minimums (0 means don't enforce)");
 /*
  * NOTE(review): the value argument here (2000) differs from the
  * initializer of wait_time_floor (8000) -- confirm which is intended.
  */
 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, wait_floor, CTLFLAG_RW,
     &wait_time_floor, 2000,
     "Has b/w increases what is the wait floor we are willing to wait at the end?");
 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, time_blocks, CTLFLAG_RW,
     &num_of_waits_allowed, 1,
     "How many time blocks on the end should software pacing be willing to wait?");
 
 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, hw_floor_mss, CTLFLAG_RW,
     &rs_hw_floor_mss, 16,
     "Number of mss that are a minum for hardware pacing?");
 
 
 /*
  * Register the per-interface sysctl leaves for the rate set "rs" beneath
  * the node "rl_sysctl_root".  All OIDs are added to rs->sysctl_ctx.
  *
  * The "disable" leaf is read-only when the rate set is flagged
  * RS_INTF_NO_SUP and read-write otherwise.  When the kernel is built with
  * DETAILED_RATELIMIT_SYSCTL and the rate set has a rate table, a "rate"
  * sub-tree with one numbered node per table entry is added as well.
  */
 static void
 rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
 {
 	/*
 	 * Add sysctl entries for this interface.
 	 */
 	if (rs->rs_flags & RS_INTF_NO_SUP) {
 		SYSCTL_ADD_S32(&rs->sysctl_ctx,
 		   SYSCTL_CHILDREN(rl_sysctl_root),
 		   OID_AUTO, "disable", CTLFLAG_RD,
 		   &rs->rs_disable, 0,
 		   "Disable this interface from new hdwr limiting?");
 	} else {
 		SYSCTL_ADD_S32(&rs->sysctl_ctx,
 		   SYSCTL_CHILDREN(rl_sysctl_root),
 		   OID_AUTO, "disable", CTLFLAG_RW,
 		   &rs->rs_disable, 0,
 		   "Disable this interface from new hdwr limiting?");
 	}
 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "minseg", CTLFLAG_RW,
 	    &rs->rs_min_seg, 0,
 	    "What is the minimum we need to send on this interface?");
 	SYSCTL_ADD_U64(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "flow_limit", CTLFLAG_RW,
 	    &rs->rs_flow_limit, 0,
 	    "What is the limit for number of flows (0=unlimited)?");
 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "highest", CTLFLAG_RD,
 	    &rs->rs_highest_valid, 0,
 	    "Highest valid rate");
 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "lowest", CTLFLAG_RD,
 	    &rs->rs_lowest_valid, 0,
 	    "Lowest valid rate");
 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "flags", CTLFLAG_RD,
 	    &rs->rs_flags, 0,
 	    "What lags are on the entry?");
 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "numrates", CTLFLAG_RD,
 	    &rs->rs_rate_cnt, 0,
 	    "How many rates re there?");
 	SYSCTL_ADD_U64(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "flows_using", CTLFLAG_RD,
 	    &rs->rs_flows_using, 0,
 	    "How many flows are using this interface now?");
 #ifdef DETAILED_RATELIMIT_SYSCTL
 	if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
 		/* Export every entry of the rate table. */
 		int i;
 		struct sysctl_oid *rl_rates;
 		struct sysctl_oid *rl_rate_num;
 		char rate_num[16];
 		rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 					    SYSCTL_CHILDREN(rl_sysctl_root),
 					    OID_AUTO,
 					    "rate",
 					    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 					    "Ratelist");
 		for( i = 0; i < rs->rs_rate_cnt; i++) {
 			/* The node is named after the table index. */
 			sprintf(rate_num, "%d", i);
 			rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 					    SYSCTL_CHILDREN(rl_rates),
 					    OID_AUTO,
 					    rate_num,
 					    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 					    "Individual Rate");
 			SYSCTL_ADD_U32(&rs->sysctl_ctx,
 				       SYSCTL_CHILDREN(rl_rate_num),
 				       OID_AUTO, "flags", CTLFLAG_RD,
 				       &rs->rs_rlt[i].flags, 0,
 				       "Flags on this rate");
 			SYSCTL_ADD_U32(&rs->sysctl_ctx,
 				       SYSCTL_CHILDREN(rl_rate_num),
 				       OID_AUTO, "pacetime", CTLFLAG_RD,
 				       &rs->rs_rlt[i].time_between, 0,
 				       "Time hardware inserts between 1500 byte sends");
 			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
 				       SYSCTL_CHILDREN(rl_rate_num),
 				       OID_AUTO, "rate", CTLFLAG_RD,
 				       &rs->rs_rlt[i].rate,
 				       "Rate in bytes per second");
 			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
 				       SYSCTL_CHILDREN(rl_rate_num),
 				       OID_AUTO, "using", CTLFLAG_RD,
 				       &rs->rs_rlt[i].using,
 				       "Number of flows using");
 			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
 				       SYSCTL_CHILDREN(rl_rate_num),
 				       OID_AUTO, "enobufs", CTLFLAG_RD,
 				       &rs->rs_rlt[i].rs_num_enobufs,
 				       "Number of enobufs logged on this rate");
 
 		}
 	}
 #endif
 }
 
 /*
  * Epoch callback that completes the deferred destruction of a rate set.
  * "ctx" is the rs_epoch_ctx member embedded in the rate set.
  *
  * Under rs_mtx the RS_FUNERAL_SCHD flag is cleared and rs_number_dead is
  * decremented.  The rate set, its sysctl context and its rate table are
  * freed only when no flow is using it any more.
  */
 static void
 rs_destroy(epoch_context_t ctx)
 {
 	struct tcp_rate_set *rs;
 	bool still_in_use;
 
 	rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
 
 	mtx_lock(&rs_mtx);
 	rs->rs_flags &= ~RS_FUNERAL_SCHD;
 	/*
 	 * It is possible, if unlikely, that a lookup found this
 	 * interface while the delete was in progress and the DEAD flag
 	 * was being applied: after rs_flows_using was seen as 0 and the
 	 * epoch call was scheduled, another thread may have incremented
 	 * rs_flows_using.  Users hold a plain pointer and the counter is
 	 * only updated atomically, so nothing else protects against
 	 * this.  Re-check the counter here before deleting.
 	 */
 	still_in_use = (rs->rs_flows_using != 0);
 	rs_number_dead--;
 	mtx_unlock(&rs_mtx);
 
 	if (still_in_use)
 		return;
 
 	sysctl_ctx_free(&rs->sysctl_ctx);
 	free(rs->rs_rlt, M_TCPPACE);
 	free(rs, M_TCPPACE);
 }
 
 /*
  * Schedule rs_destroy() for the given rate set through NET_EPOCH_CALL,
  * at most once: RS_FUNERAL_SCHD marks a destruction that is already
  * pending.  rs_mtx must be held by the caller (asserted).
  */
 static void
 rs_defer_destroy(struct tcp_rate_set *rs)
 {
 
 	mtx_assert(&rs_mtx, MA_OWNED);
 
 	if ((rs->rs_flags & RS_FUNERAL_SCHD) == 0) {
 		rs_number_dead++;
 
 		/* Mark as pending so that the callback is queued only once. */
 		rs->rs_flags |= RS_FUNERAL_SCHD;
 		NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx);
 	}
 }
 
 #ifdef INET
 extern counter_u64_t rate_limit_new;
 extern counter_u64_t rate_limit_chg;
 extern counter_u64_t rate_limit_set_ok;
 extern counter_u64_t rate_limit_active;
 extern counter_u64_t rate_limit_alloc_fail;
 #endif
 
 /*
  * Allocate an IF_SND_TAG_TYPE_RATE_LIMIT send tag on "ifp" for the given
  * flow at "cfg_rate" and store it through "tag".  Returns the result of
  * m_snd_tag_alloc(); on INET kernels the rate limit counters are updated
  * to reflect the outcome.
  */
 static int
 rl_attach_txrtlmt(struct ifnet *ifp,
     uint32_t flowtype,
     int flowid,
     uint64_t cfg_rate,
     struct m_snd_tag **tag)
 {
 	union if_snd_tag_alloc_params params = {
 		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
 		.rate_limit.hdr.flowid = flowid,
 		.rate_limit.hdr.flowtype = flowtype,
 		.rate_limit.max_rate = cfg_rate,
 		.rate_limit.flags = M_NOWAIT,
 	};
 	int error;
 
 	error = m_snd_tag_alloc(ifp, &params, tag);
 #ifdef INET
 	switch (error) {
 	case 0:
 		counter_u64_add(rate_limit_set_ok, 1);
 		counter_u64_add(rate_limit_active, 1);
 		break;
 	case EOPNOTSUPP:
 		/* Lack of support is not counted as an allocation failure. */
 		break;
 	default:
 		counter_u64_add(rate_limit_alloc_fail, 1);
 		break;
 	}
 #endif
 	return (error);
 }
 
 /*
  * Fill rs->rs_rlt[0 .. rs_rate_cnt) from "rate_table_act", which holds
  * two separately ordered groups: indices [0, RS_NEXT_ORDER_GROUP) and
  * [RS_NEXT_ORDER_GROUP, MAX_HDWR_RATES).  The groups are merged by
  * repeatedly taking the smaller of the two group heads; on a tie the
  * entry from the second group is taken.  The flags and time_between
  * fields of every slot are zeroed.
  *
  * Once both groups are exhausted any remaining slots keep their
  * previous rate value.
  */
 static void
 populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
 {
 	int slot, lo, hi;
 	uint8_t lo_done, hi_done;
 
 	lo = 0;
 	hi = RS_NEXT_ORDER_GROUP;
 	lo_done = 0;
 	hi_done = 0;
 
 	for (slot = 0; slot < rs->rs_rate_cnt; slot++) {
 		rs->rs_rlt[slot].flags = 0;
 		rs->rs_rlt[slot].time_between = 0;
 
 		if (!lo_done &&
 		    (hi_done || rate_table_act[lo] < rate_table_act[hi])) {
 			/* The head of the first group is next. */
 			rs->rs_rlt[slot].rate = rate_table_act[lo];
 			lo++;
 			lo_done = (lo == RS_NEXT_ORDER_GROUP);
 		} else if (!hi_done) {
 			/* The head of the second group is next. */
 			rs->rs_rlt[slot].rate = rate_table_act[hi];
 			hi++;
 			hi_done = (hi == MAX_HDWR_RATES);
 		}
 	}
 }
 
 /*
  * Query ifp for its hardware rate-limit abilities, build a tcp_rate_set
  * describing them, register its sysctl node and link it onto int_rs.
  *
  * Depending on the flags the driver reports, the new set is either a
  * place holder (no support, or an indirect interface) or carries a rate
  * table that is taken from the driver, from desired_rates, or generated
  * (when the count reaches ALL_HARDWARE_RATES).  Unless RS_NO_PRE is set a
  * send tag is allocated for every rate, highest first.
  *
  * Returns the new set, or NULL on failure.  *error (when non-NULL) is
  * written only for allocation failures and setup/tag errors; the other
  * NULL returns leave it untouched.
  */
 static struct tcp_rate_set *
 rt_setup_new_rs(struct ifnet *ifp, int *error)
 {
 	struct tcp_rate_set *rs;
 	const uint64_t *rate_table_act;
 	uint64_t lentim, res;
 	size_t sz;
 	uint32_t hash_type;
 	int i;
 	struct if_ratelimit_query_results rl;
 	struct sysctl_oid *rl_sysctl_root;
 	struct epoch_tracker et;
 
 	/*
 	 * rs_mtx is acquired below when the new set is linked onto
 	 * int_rs, so it must not be held on entry.
 	 */
 	if (ifp->if_ratelimit_query == NULL) {
 		/*
 		 * We can do nothing if we cannot
 		 * get a query back from the driver.
 		 */
 		printf("Warning:No query functions for %s:%d-- failed\n",
 		       ifp->if_dname, ifp->if_dunit);
 		return (NULL);
 	}
 	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
 	if (rs == NULL) {
 		if (error)
 			*error = ENOMEM;
 		printf("Warning:No memory for malloc of tcp_rate_set\n");
 		return (NULL);
 	}
 	memset(&rl, 0, sizeof(rl));
 	rl.flags = RT_NOSUPPORT;
 	ifp->if_ratelimit_query(ifp, &rl);
 	if (rl.flags & RT_IS_UNUSABLE) {
 		/*
 		 * The interface does not really support
 		 * the rate-limiting.
 		 */
 		memset(rs, 0, sizeof(struct tcp_rate_set));
 		rs->rs_ifp = ifp;
 		rs->rs_if_dunit = ifp->if_dunit;
 		rs->rs_flags = RS_INTF_NO_SUP;
 		rs->rs_disable = 1;
 		rs_number_alive++;
 		sysctl_ctx_init(&rs->sysctl_ctx);
 		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
 		    OID_AUTO,
 		    rs->rs_ifp->if_xname,
 		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 		    "");
 		rl_add_syctl_entries(rl_sysctl_root, rs);
 		NET_EPOCH_ENTER(et);
 		mtx_lock(&rs_mtx);
 		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
 		mtx_unlock(&rs_mtx);
 		NET_EPOCH_EXIT(et);
 		return (rs);
 	} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
 		/* Place holder; the real interface is looked up per flow. */
 		memset(rs, 0, sizeof(struct tcp_rate_set));
 		rs->rs_ifp = ifp;
 		rs->rs_if_dunit = ifp->if_dunit;
 		rs->rs_flags = RS_IS_DEFF;
 		rs_number_alive++;
 		sysctl_ctx_init(&rs->sysctl_ctx);
 		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
 		    OID_AUTO,
 		    rs->rs_ifp->if_xname,
 		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 		    "");
 		rl_add_syctl_entries(rl_sysctl_root, rs);
 		NET_EPOCH_ENTER(et);
 		mtx_lock(&rs_mtx);
 		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
 		mtx_unlock(&rs_mtx);
 		NET_EPOCH_EXIT(et);
 		return (rs);
 	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
 		/* Mellanox C4 likely */
 		rs->rs_ifp = ifp;
 		rs->rs_if_dunit = ifp->if_dunit;
 		rs->rs_rate_cnt = rl.number_of_rates;
 		rs->rs_min_seg = rl.min_segment_burst;
 		rs->rs_highest_valid = 0;
 		rs->rs_flow_limit = rl.max_flows;
 		rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
 		rs->rs_disable = 0;
 		rate_table_act = rl.rate_table;
 	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
 		/* Chelsio, C5 and C6 of Mellanox? */
 		rs->rs_ifp = ifp;
 		rs->rs_if_dunit = ifp->if_dunit;
 		rs->rs_rate_cnt = rl.number_of_rates;
 		rs->rs_min_seg = rl.min_segment_burst;
 		rs->rs_disable = 0;
 		rs->rs_flow_limit = rl.max_flows;
 		rate_table_act = desired_rates;
 		if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
 		    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
 			/*
 			 * Our desired table is not big
 			 * enough, do what we can.
 			 */
 			rs->rs_rate_cnt = MAX_HDWR_RATES;
 		}
 		if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
 			rs->rs_flags = RS_IS_INTF;
 		else
 			rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
 		if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
 			rs->rs_rate_cnt = ALL_HARDWARE_RATES;
 	} else {
 		free(rs, M_TCPPACE);
 		return (NULL);
 	}
 	if (rs->rs_rate_cnt == 0) {
 		/*
 		 * No rates at all: there is nothing to pace with, and the
 		 * "highest entry" checks below would index before the
 		 * start of the table.
 		 */
 		free(rs, M_TCPPACE);
 		return (NULL);
 	}
 	sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
 	/*
 	 * Zero the table: the tag loop below may stop early, and the
 	 * entries it never reached must still have clean flags/tag fields
 	 * because the departure and shutdown handlers walk every entry.
 	 */
 	rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT | M_ZERO);
 	if (rs->rs_rlt == NULL) {
 		if (error)
 			*error = ENOMEM;
 		free(rs, M_TCPPACE);
 		return (NULL);
 	}
 	if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
 		/*
 		 * The interface supports all
 		 * the rates we could possibly want.
 		 */
 		uint64_t rat;
 
 		rs->rs_rlt[0].rate = 12500;	/* 100k */
 		rs->rs_rlt[1].rate = 25000;	/* 200k */
 		rs->rs_rlt[2].rate = 62500;	/* 500k */
 		/* Note 125000 == 1Megabit
 		 * populate 1Meg - 1000meg.
 		 */
 		for (i = 3, rat = 125000; i < (ALL_HARDWARE_RATES-1); i++) {
 			rs->rs_rlt[i].rate = rat;
 			rat += 125000;
 		}
 		rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
 	} else if (rs->rs_flags & RS_INT_TBL) {
 		/* We populate this in a special way */
 		populate_canned_table(rs, rate_table_act);
 	} else {
 		/*
 		 * Just copy in the rates from
 		 * the table, it is in order.
 		 */
 		for (i = 0; i < rs->rs_rate_cnt; i++) {
 			rs->rs_rlt[i].rate = rate_table_act[i];
 			rs->rs_rlt[i].time_between = 0;
 			rs->rs_rlt[i].flags = 0;
 		}
 	}
 	for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
 		/*
 		 * We go backwards through the list so that if we can't get
 		 * a rate and fail to init one, we have at least a chance of
 		 * getting the highest one.
 		 */
 		rs->rs_rlt[i].ptbl = rs;
 		rs->rs_rlt[i].tag = NULL;
 		rs->rs_rlt[i].using = 0;
 		rs->rs_rlt[i].rs_num_enobufs = 0;
 		/*
 		 * Calculate the time between.
 		 */
 		lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
 		res = lentim / rs->rs_rlt[i].rate;
 		if (res > 0)
 			rs->rs_rlt[i].time_between = res;
 		else
 			rs->rs_rlt[i].time_between = 1;
 		if (rs->rs_flags & RS_NO_PRE) {
 			rs->rs_rlt[i].flags = HDWRPACE_INITED;
 			rs->rs_lowest_valid = i;
 		} else {
 			int err = 0;
 
 			/* Only call the setup hook if the driver has one. */
 			if ((rl.flags & RT_IS_SETUP_REQ) &&
 			    (ifp->if_ratelimit_setup != NULL)) {
 				err = ifp->if_ratelimit_setup(ifp,
 				    rs->rs_rlt[i].rate, i);
 			}
 			if (err == 0) {
 #ifdef RSS
 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
 #else
 				hash_type = M_HASHTYPE_OPAQUE_HASH;
 #endif
 				err = rl_attach_txrtlmt(ifp,
 				    hash_type,
 				    (i + 1),
 				    rs->rs_rlt[i].rate,
 				    &rs->rs_rlt[i].tag);
 			}
 			if (err) {
 				if (error)
 					*error = err;
 				if (i == (rs->rs_rate_cnt - 1)) {
 					/*
 					 * Huh - first rate and we can't get
 					 * it?
 					 */
 					free(rs->rs_rlt, M_TCPPACE);
 					free(rs, M_TCPPACE);
 					return (NULL);
 				}
 				/* Keep the higher rates we already have. */
 				break;
 			}
 			rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
 			rs->rs_lowest_valid = i;
 		}
 	}
 	/* Did we get at least 1 rate? */
 	if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
 		rs->rs_highest_valid = rs->rs_rate_cnt - 1;
 	else {
 		free(rs->rs_rlt, M_TCPPACE);
 		free(rs, M_TCPPACE);
 		return (NULL);
 	}
 	rs_number_alive++;
 	sysctl_ctx_init(&rs->sysctl_ctx);
 	rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 	    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
 	    OID_AUTO,
 	    rs->rs_ifp->if_xname,
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "");
 	rl_add_syctl_entries(rl_sysctl_root, rs);
 	NET_EPOCH_ENTER(et);
 	mtx_lock(&rs_mtx);
 	CK_LIST_INSERT_HEAD(&int_rs, rs, next);
 	mtx_unlock(&rs_mtx);
 	NET_EPOCH_EXIT(et);
 	return (rs);
 }
 
 /*
  * Index-based rate lookup for a rate set that carries the full generated
  * table; tcp_find_suitable_rate() dispatches here when RS_INT_TBL is set
  * and rs_rate_cnt >= ALL_HARDWARE_RATES.
  *
  * The indexing below relies on the layout rt_setup_new_rs() generates:
  * three sub-megabit entries at indexes 0-2, then one entry per 125000
  * bytes/sec (1 Megabit) from index 3 on, and one final top entry at
  * ALL_HARDWARE_RATES-1.
  *
  * flags selects the match mode (RS_PACING_LT, RS_PACING_EXACT_MATCH, or
  * otherwise greater-than with optional RS_PACING_GEQ).  With
  * RS_PACING_SUB_OK an alternate entry noted during the search is returned
  * when no entry matched.  When lower_rate is non-NULL it always receives
  * previous_rate (0 if none was recorded).
  *
  * Returns the chosen table entry or NULL.
  *
  * For an explanation of why the argument is volatile please
  * look at the comments around rt_setup_rate().
  */
 static const struct tcp_hwrate_limit_table *
 tcp_int_find_suitable_rate(const volatile struct tcp_rate_set *rs,
     uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
 {
 	struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
 	uint64_t mbits_per_sec, ind_calc, previous_rate = 0;
 	int i;
 
 	/* Despite the name this is the request in bits per second. */
 	mbits_per_sec = (bytes_per_sec * 8);
 	if (flags & RS_PACING_LT) {
 		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
 		    (rs->rs_lowest_valid <= 2)){
 			/*
 			 * Smaller than 1Meg, only
 			 * 3 entries can match it.
 			 */
 			previous_rate = 0;
 			for(i = rs->rs_lowest_valid; i < 3; i++) {
 				if (bytes_per_sec <= rs->rs_rlt[i].rate) {
 					rte = &rs->rs_rlt[i];
 					break;
 				} else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
 					arte = &rs->rs_rlt[i];
 				}
 				previous_rate = rs->rs_rlt[i].rate;
 			}
 			goto done;
 		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
 			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
 			/*
 			 * Larger than 1G (the majority of
 			 * our table.
 			 */
 			if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			else
 				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
 			goto done;
 		}
 		/*
 		 * If we reach here its in our table (between 1Meg - 1000Meg),
 		 * just take the rounded down mbits per second, and add
 		 * 1Megabit to it, from this we can calculate
 		 * the index in the table.
 		 */
 		ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
 		if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
 			ind_calc++;
 		/* our table is offset by 3, we add 2 */
 		ind_calc += 2;
 		if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 			/* This should not happen */
 			ind_calc = ALL_HARDWARE_RATES-1;
 		}
 		if ((ind_calc >= rs->rs_lowest_valid) &&
 		    (ind_calc <= rs->rs_highest_valid)) {
 			rte = &rs->rs_rlt[ind_calc];
 			if (ind_calc >= 1)
 				previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
 		}
 	} else if (flags & RS_PACING_EXACT_MATCH) {
 		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
 		    (rs->rs_lowest_valid <= 2)){
 			for(i = rs->rs_lowest_valid; i < 3; i++) {
 				if (bytes_per_sec == rs->rs_rlt[i].rate) {
 					rte = &rs->rs_rlt[i];
 					break;
 				}
 			}
 		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
 			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
 			/* > 1Gbps only one rate */
 			if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
 				/* Its 10G wow */
 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			}
 		} else {
 			/* Ok it must be a exact meg (its between 1G and 1Meg) */
 			ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
 			if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
 				/* its an exact Mbps */
 				ind_calc += 2;
 				if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 					/* This should not happen */
 					ind_calc = ALL_HARDWARE_RATES-1;
 				}
 				if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
 					rte = &rs->rs_rlt[ind_calc];
 			}
 		}
 	} else {
 		/* we want greater than the requested rate */
 		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
 		    (rs->rs_lowest_valid <= 2)){
 			arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
 			for (i=2; i>=rs->rs_lowest_valid; i--) {
 				if (bytes_per_sec < rs->rs_rlt[i].rate) {
 					rte = &rs->rs_rlt[i];
 					if (i >= 1) {
 						previous_rate = rs->rs_rlt[(i-1)].rate;
 					}
 					break;
 				} else if ((flags & RS_PACING_GEQ) &&
 					   (bytes_per_sec == rs->rs_rlt[i].rate)) {
 					rte = &rs->rs_rlt[i];
 					if (i >= 1) {
 						previous_rate = rs->rs_rlt[(i-1)].rate;
 					}
 					break;
 				} else {
 					arte = &rs->rs_rlt[i]; /* new alternate */
 				}
 			}
 		} else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
 			if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
 			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
 				/* Our top rate is larger than the request */
 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			} else if ((flags & RS_PACING_GEQ) &&
 				   (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
 				   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
 				/* It matches our top rate */
 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			} else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
 				/* The top rate is an alternative */
 				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			}
 			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
 		} else {
 			/* Its in our range 1Meg - 1Gig */
 			if (flags & RS_PACING_GEQ) {
 				/*
 				 * NOTE(review): unlike every other index
 				 * computation in this function, this branch
 				 * does not add the table offset of 2, does
 				 * not test HDWRPACE_INITED, and leaves rte
 				 * NULL when the request is not an exact
 				 * Megabit multiple -- confirm that is
 				 * intended.
 				 */
 				ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
 				if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
 					if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 						/* This should not happen */
 						ind_calc = (ALL_HARDWARE_RATES-1);
 					}
 					rte = &rs->rs_rlt[ind_calc];
 					if (ind_calc >= 1)
 						previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
 				}
 				goto done;
 			}
 			/* Round the request up to the next whole Megabit. */
 			ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
 			ind_calc += 2;
 			if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 				/* This should not happen */
 				ind_calc = ALL_HARDWARE_RATES-1;
 			}
 			if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) {
 				rte = &rs->rs_rlt[ind_calc];
 				if (ind_calc >= 1)
 					previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
 			}
 		}
 	}
done:
 	/* Fall back to the alternate only when the caller allows it. */
 	if ((rte == NULL) &&
 	    (arte != NULL) &&
 	    (flags & RS_PACING_SUB_OK)) {
 		/* We can use the substitute */
 		rte = arte;
 	}
 	if (lower_rate)
 		*lower_rate = previous_rate;
 	return (rte);
 }
 
 /*
  * Pick an entry of rs->rs_rlt for a requested rate of bytes_per_sec under
  * the matching rules in flags (listed below).  Rate sets carrying the
  * full generated table are handed to tcp_int_find_suitable_rate();
  * everything else is scanned linearly between rs_lowest_valid and
  * rs_highest_valid.
  *
  * Returns the chosen entry or NULL.  In the linear scans *lower_rate is
  * written only on the paths marked below, so callers should not assume
  * it is always set.
  *
  * For an explanation of why the argument is volatile please
  * look at the comments around rt_setup_rate().
  */
 static const struct tcp_hwrate_limit_table *
 tcp_find_suitable_rate(const volatile struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
 {
 	/**
 	 * Hunt the rate table with the restrictions in flags and find a
 	 * suitable rate if possible.
 	 * RS_PACING_EXACT_MATCH - look for an exact match to rate.
 	 * RS_PACING_GT     - must be greater than.
 	 * RS_PACING_GEQ    - must be greater than or equal.
 	 * RS_PACING_LT     - must be less than.
 	 * RS_PACING_SUB_OK - If we don't meet criteria a
 	 *                    substitute is ok.
 	 */
 	int i, matched;
 	struct tcp_hwrate_limit_table *rte = NULL;
 	uint64_t previous_rate = 0;
 
 	if ((rs->rs_flags & RS_INT_TBL) &&
 	    (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
 		/*
 		 * Here we don't want to paw thru
 		 * a big table, we have everything
 		 * from 1Meg - 1000Meg in 1Meg increments.
 		 * Use an alternate method to "lookup".
 		 */
 		return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate));
 	}
 	if ((flags & RS_PACING_LT) ||
 	    (flags & RS_PACING_EXACT_MATCH)) {
 		/*
 		 * For exact and less than we go forward through the table.
 		 * This way when we find one larger we stop (exact was a
 		 * toss up).
 		 */
 		for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
 			if ((flags & RS_PACING_EXACT_MATCH) &&
 			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
 				rte = &rs->rs_rlt[i];
 				matched = 1;
 				/* lower_rate: the entry visited just before. */
 				if (lower_rate != NULL)
 					*lower_rate = previous_rate;
 				break;
 			} else if ((flags & RS_PACING_LT) &&
 			    (bytes_per_sec <= rs->rs_rlt[i].rate)) {
 				/*
 				 * Note the test accepts an entry whose rate
 				 * is at or above the request.
 				 */
 				rte = &rs->rs_rlt[i];
 				matched = 1;
 				if (lower_rate != NULL)
 					*lower_rate = previous_rate;
 				break;
 			}
 			previous_rate = rs->rs_rlt[i].rate;
 			/*
 			 * NOTE(review): this leaves the loop as soon as an
 			 * entry is below the request.  If the table is in
 			 * ascending order that ends the scan before any
 			 * larger entry is examined -- confirm intent.
 			 */
 			if (bytes_per_sec > rs->rs_rlt[i].rate)
 				break;
 		}
 		if ((matched == 0) &&
 		    (flags & RS_PACING_LT) &&
 		    (flags & RS_PACING_SUB_OK)) {
 			/* Kick in a substitute (the lowest) */
 			rte = &rs->rs_rlt[rs->rs_lowest_valid];
 		}
 	} else {
 		/*
 		 * Here we go backward through the table so that we can find
 		 * the one greater in theory faster (but its probably a
 		 * wash).
 		 */
 		for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
 			if (rs->rs_rlt[i].rate > bytes_per_sec) {
 				/* A possible candidate */
 				rte = &rs->rs_rlt[i];
 			}
 			if ((flags & RS_PACING_GEQ) &&
 			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
 				/* An exact match and we want equal */
 				matched = 1;
 				rte = &rs->rs_rlt[i];
 				break;
 			} else if (rte) {
 				/*
 				 * Found one that is larger than but don't
 				 * stop, there may be a more closer match.
 				 */
 				matched = 1;
 			}
 			if (rs->rs_rlt[i].rate < bytes_per_sec) {
 				/*
 				 * We found a table entry that is smaller,
 				 * stop there will be none greater or equal.
 				 */
 				/* Only here does this scan report lower_rate. */
 				if (lower_rate != NULL)
 					*lower_rate = rs->rs_rlt[i].rate;
 				break;
 			}
 		}
 		if ((matched == 0) &&
 		    (flags & RS_PACING_SUB_OK)) {
 			/* Kick in a substitute (the highest) */
 			rte = &rs->rs_rlt[rs->rs_highest_valid];
 		}
 	}
 	return (rte);
 }
 
 /*
  * Find the interface that ends up carrying rate-limited traffic sent on
  * ifp for the flow described by inp.
  *
  * A temporary rate-limit send tag (max_rate COMMON_RATE, M_NOWAIT) is
  * allocated on ifp, the chain of "next" send tags is followed to its
  * end, and the ifnet recorded in the last tag is returned.  The
  * temporary tag is released before returning; this function itself takes
  * no reference on the returned ifnet.
  *
  * Returns NULL when the tag cannot be allocated; in that case the
  * allocation error is stored in *error when error is non-NULL.
  */
 static struct ifnet *
 rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
 {
 	struct ifnet *tifp;
 	struct m_snd_tag *tag, *ntag;
 	union if_snd_tag_alloc_params params = {
 		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
 		.rate_limit.hdr.flowid = inp->inp_flowid,
 		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
 		.rate_limit.max_rate = COMMON_RATE,
 		.rate_limit.flags = M_NOWAIT,
 	};
 	int err;
 	/* The flow type depends on whether the kernel is built with RSS. */
 #ifdef RSS
 	params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
 	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
 #else
 	params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
 #endif
 	err = m_snd_tag_alloc(ifp, &params, &tag);
 	if (err) {
 		/* Failed to setup a tag? */
 		if (error)
 			*error = err;
 		return (NULL);
 	}
 	ntag = tag;
 	/* Follow the chain until a tag offers no "next" method. */
-	while(ntag->ifp->if_next_snd_tag != NULL) {
-		ntag = ntag->ifp->if_next_snd_tag(ntag);
+	while (ntag->sw->next_snd_tag != NULL) {
+		ntag = ntag->sw->next_snd_tag(ntag);
 	}
 	tifp = ntag->ifp;
 	/* Only the tag allocated above is released. */
 	m_snd_tag_rele(tag);
 	return (tifp);
 }
 
 /*
  * Atomically add one to the "using" counter of a rate table entry.  The
  * entry is handed around as const, so the qualifier is cast away here.
  */
 static void
 rl_increment_using(const struct tcp_hwrate_limit_table *rte)
 {
 
 	atomic_add_long(
 	    &__DECONST(struct tcp_hwrate_limit_table *, rte)->using, 1);
 }
 
 /*
  * Atomically subtract one from the "using" counter of a rate table
  * entry.  The entry is handed around as const, so the qualifier is cast
  * away here.
  */
 static void
 rl_decrement_using(const struct tcp_hwrate_limit_table *rte)
 {
 
 	atomic_subtract_long(
 	    &__DECONST(struct tcp_hwrate_limit_table *, rte)->using, 1);
 }
 
 /*
  * Atomically add one to the rs_num_enobufs counter of a rate table
  * entry.  The entry is handed around as const, so the qualifier is cast
  * away here.
  */
 void
 tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
 {
 
 	atomic_add_long(
 	    &__DECONST(struct tcp_hwrate_limit_table *, rte)->rs_num_enobufs,
 	    1);
 }
 
 /*
  * Do NOT take the __noinline off find_rs_for_ifp().  When it is inlined
  * into rt_setup_rate() a compiler bug shows up: the compiler decides
  * that the list can never be empty.  The consequence is a crash from a
  * NULL dereference if an ifp is removed just as a hw rate limit is
  * being attempted.  If you are working on the compiler and want to
  * "test" this, go ahead and remove the noinline; otherwise let sleeping
  * dogs lie until such time as we get a compiler fix 10/2/20 -- RRS
  *
  * Returns the rate set on int_rs whose ifp and unit number both match
  * ifp, or NULL when there is none.
  */
 static __noinline struct tcp_rate_set *
 find_rs_for_ifp(struct ifnet *ifp)
 {
 	struct tcp_rate_set *cand;
 
 	CK_LIST_FOREACH(cand, &int_rs, next) {
 		if (cand->rs_ifp != ifp)
 			continue;
 		if (cand->rs_if_dunit != ifp->if_dunit)
 			continue;
 		/* Both the pointer and the unit number agree. */
 		return (cand);
 	}
 	return (NULL);
 }
 
 
 /*
  * Select a hardware pacing rate for inp on ifp and attach a rate-limit
  * send tag to the inpcb.
  *
  * The whole lookup runs inside a network epoch section.  The rate set
  * for ifp is looked up; when it is an indirect (RS_IS_DEFF) set the real
  * interface is resolved and the lookup restarts on it.  The send tag
  * itself is always attached through the interface originally passed in
  * (oifp).
  *
  * Returns the selected table entry, or NULL on failure.  When error is
  * non-NULL it is set to ENODEV (no usable rate set), ENOSPC (set disabled
  * or flow limit reached) or the attach error; it is not written on
  * success or when no suitable rate is found.  lower_rate is passed
  * through to tcp_find_suitable_rate().
  */
 static const struct tcp_hwrate_limit_table *
 rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
     uint32_t flags, int *error, uint64_t *lower_rate)
 {
 	/* First lets find the interface if it exists */
 	const struct tcp_hwrate_limit_table *rte;
 	/*
 	 * So why is rs volatile? This is to defeat a
 	 * compiler bug where in the compiler is convinced
 	 * that rs can never be NULL (which is not true). Because
 	 * of its conviction it nicely optimizes out the if ((rs == NULL
 	 * below which means if you get a NULL back you dereference it.
 	 */
 	volatile struct tcp_rate_set *rs;
 	struct epoch_tracker et;
 	struct ifnet *oifp = ifp;
 	int err;
 
 	NET_EPOCH_ENTER(et);
use_real_interface:
 	rs = find_rs_for_ifp(ifp);
 	if ((rs == NULL) ||
 	    (rs->rs_flags & RS_INTF_NO_SUP) ||
 	    (rs->rs_flags & RS_IS_DEAD)) {
 		/*
 		 * This means we got a packet *before*
 		 * the IF-UP was processed below, <or>
 		 * while or after we already received an interface
 		 * departed event. In either case we really don't
 		 * want to do anything with pacing, in
 		 * the departing case the packet is not
 		 * going to go very far. The new case
 		 * might be arguable, but its impossible
 		 * to tell from the departing case.
 		 */
 		if (error)
 			*error = ENODEV;
 		NET_EPOCH_EXIT(et);
 		return (NULL);
 	}
 
 	/*
 	 * rs was already found non-NULL above; the repeated NULL test here
 	 * is redundant in plain program order.
 	 */
 	if ((rs == NULL) || (rs->rs_disable != 0)) {
 		if (error)
 			*error = ENOSPC;
 		NET_EPOCH_EXIT(et);
 		return (NULL);
 	}
 	if (rs->rs_flags & RS_IS_DEFF) {
 		/* We need to find the real interface */
 		struct ifnet *tifp;
 
 		tifp = rt_find_real_interface(ifp, inp, error);
 		if (tifp == NULL) {
 			/*
 			 * NOTE(review): rs_disable was just tested to be
 			 * zero, so unless it changes concurrently this
 			 * ENOTSUP assignment does not fire and *error keeps
 			 * what rt_find_real_interface() stored.
 			 */
 			if (rs->rs_disable && error)
 				*error = ENOTSUP;
 			NET_EPOCH_EXIT(et);
 			return (NULL);
 		}
 		KASSERT((tifp != ifp),
 			("Lookup failure ifp:%p inp:%p rt_find_real_interface() returns the same interface tifp:%p?\n",
 			 ifp, inp, tifp));
 		ifp = tifp;
 		goto use_real_interface;
 	}
 	/*
 	 * A non-zero rs_flow_limit caps the number of flows.  The check
 	 * here and the increment at the end of the function are separate
 	 * steps.
 	 */
 	if (rs->rs_flow_limit &&
 	    ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
 		if (error)
 			*error = ENOSPC;
 		NET_EPOCH_EXIT(et);
 		return (NULL);
 	}
 	rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
 	if (rte) {
 		/* Attach through the caller's interface, not the resolved one. */
 		err = in_pcbattach_txrtlmt(inp, oifp,
 		    inp->inp_flowtype,
 		    inp->inp_flowid,
 		    rte->rate,
 		    &inp->inp_snd_tag);
 		if (err) {
 			/* Failed to attach */
 			if (error)
 				*error = err;
 			rte = NULL;
 		} else {
 			KASSERT((inp->inp_snd_tag != NULL) ,
 				("Setup rate has no snd_tag inp:%p rte:%p rate:%llu rs:%p",
 				 inp, rte, (unsigned long long)rte->rate, rs));
 #ifdef INET
 			counter_u64_add(rate_limit_new, 1);
 #endif
 		}
 	}
 	if (rte) {
 		/*
 		 * We use an atomic here for accounting so we don't have to
 		 * use locks when freeing.
 		 */
 		atomic_add_64(&rs->rs_flows_using, 1);
 	}
 	NET_EPOCH_EXIT(et);
 	return (rte);
 }
 
 /*
  * ifnet_link_event handler.  When a rate-limit capable interface reports
  * link up and no rate set exists for it yet, build one.
  */
 static void
 tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
 {
 	struct epoch_tracker et;
 	struct tcp_rate_set *existing;
 	int error;
 
 	/* Only an interface going up is of interest ... */
 	if (link_state != LINK_STATE_UP)
 		return;
 	/* ... and only if it has rate limiting enabled. */
 	if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0)
 		return;
 
 	NET_EPOCH_ENTER(et);
 	mtx_lock(&rs_mtx);
 	existing = find_rs_for_ifp(ifp);
 	mtx_unlock(&rs_mtx);
 	NET_EPOCH_EXIT(et);
 	if (existing != NULL) {
 		/* We already have initialized this guy */
 		return;
 	}
 	/* The result and error code are not used by this handler. */
 	rt_setup_new_rs(ifp, &error);
 }
 
 /*
  * ifnet_departure_event handler.  Unlink the rate set of the departing
  * interface, mark it dead, detach every send tag it holds and flag each
  * entry as departed.  Destruction is scheduled right away when no flow
  * is using the set.
  */
 static void
 tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct tcp_rate_set *rs;
 	struct tcp_hwrate_limit_table *ent;
 	int idx;
 
 	NET_EPOCH_ENTER(et);
 	mtx_lock(&rs_mtx);
 	rs = find_rs_for_ifp(ifp);
 	if (rs != NULL) {
 		CK_LIST_REMOVE(rs, next);
 		rs_number_alive--;
 		rs->rs_flags |= RS_IS_DEAD;
 		for (idx = 0; idx < rs->rs_rate_cnt; idx++) {
 			ent = &rs->rs_rlt[idx];
 			if ((ent->flags & HDWRPACE_TAGPRESENT) != 0) {
 				in_pcbdetach_tag(ent->tag);
 				ent->tag = NULL;
 			}
 			/* Replaces all previous flags of the entry. */
 			ent->flags = HDWRPACE_IFPDEPARTED;
 		}
 		if (rs->rs_flows_using == 0)
 			rs_defer_destroy(rs);
 	}
 	mtx_unlock(&rs_mtx);
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * shutdown_pre_sync handler.  Every rate set on int_rs is unlinked,
  * marked dead and stripped of its send tags; sets that no flow is using
  * are scheduled for destruction.
  */
 static void
 tcp_rl_shutdown(void *arg __unused, int howto __unused)
 {
 	struct epoch_tracker et;
 	struct tcp_rate_set *rs, *rs_next;
 	struct tcp_hwrate_limit_table *ent;
 	int idx;
 
 	NET_EPOCH_ENTER(et);
 	mtx_lock(&rs_mtx);
 	/* The _SAFE walk lets us unlink the current element. */
 	CK_LIST_FOREACH_SAFE(rs, &int_rs, next, rs_next) {
 		CK_LIST_REMOVE(rs, next);
 		rs_number_alive--;
 		rs->rs_flags |= RS_IS_DEAD;
 		for (idx = 0; idx < rs->rs_rate_cnt; idx++) {
 			ent = &rs->rs_rlt[idx];
 			if ((ent->flags & HDWRPACE_TAGPRESENT) != 0) {
 				in_pcbdetach_tag(ent->tag);
 				ent->tag = NULL;
 			}
 			/* Replaces all previous flags of the entry. */
 			ent->flags = HDWRPACE_IFPDEPARTED;
 		}
 		if (rs->rs_flows_using == 0)
 			rs_defer_destroy(rs);
 	}
 	mtx_unlock(&rs_mtx);
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Set up hardware pacing at (about) bytes_per_sec for a connection that
  * has no send tag yet.  The inpcb write lock must be held.
  *
  * Returns the rate table entry now in use (and records its rate in
  * tp->t_pacing_rate), or NULL on failure.  error may be NULL; when it is
  * not, it receives 0 on success, ENODEV when the interface (or its ifnet
  * TLS mode) cannot rate limit, EINVAL when a send tag is already
  * attached, or the error reported by rt_setup_rate().
  */
 const struct tcp_hwrate_limit_table *
 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
     uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
 {
 	const struct tcp_hwrate_limit_table *rte;
 #ifdef KERN_TLS
 	struct ktls_session *tls;
 #endif
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (tp->t_inpcb->inp_snd_tag == NULL) {
 		/*
 		 * We are setting up a rate for the first time.
 		 */
 		if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) {
 			/* Not supported by the egress */
 			if (error)
 				*error = ENODEV;
 			return (NULL);
 		}
 #ifdef KERN_TLS
 		tls = NULL;
 		if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
 			tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info;
 
 			if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 ||
 			    tls->mode != TCP_TLS_MODE_IFNET) {
 				if (error)
 					*error = ENODEV;
 				return (NULL);
 			}
 		}
 #endif
 		rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error, lower_rate);
 		if (rte)
 			rl_increment_using(rte);
 #ifdef KERN_TLS
 		if (rte != NULL && tls != NULL && tls->snd_tag != NULL) {
 			/*
 			 * Fake a route change error to reset the TLS
 			 * send tag.  This will convert the existing
 			 * tag to a TLS ratelimit tag.
 			 */
-			MPASS(tls->snd_tag->type == IF_SND_TAG_TYPE_TLS);
+			MPASS(tls->snd_tag->sw->type == IF_SND_TAG_TYPE_TLS);
 			ktls_output_eagain(tp->t_inpcb, tls);
 		}
 #endif
 	} else {
 		/*
 		 * We are modifying a rate, wrong interface?
 		 */
 		if (error)
 			*error = EINVAL;
 		rte = NULL;
 	}
 	if (rte != NULL) {
 		tp->t_pacing_rate = rte->rate;
 		/* error is optional, as on every other path above. */
 		if (error)
 			*error = 0;
 	}
 	return (rte);
 }
 
 /*
  * Move a connection that is already hardware paced (current entry crte)
  * to the rate that best fits bytes_per_sec.  The inpcb write lock must
  * be held.
  *
  * Returns the entry now in use, which may be crte itself.  NULL is
  * returned when the change could not be made; except for the early
  * argument/TLS checks, a NULL return means the old rate has been
  * released as well.  error may be NULL; otherwise it receives 0 on
  * success or the reason for the failure.
  */
 const struct tcp_hwrate_limit_table *
 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
     struct tcpcb *tp, struct ifnet *ifp,
     uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
 {
 	const struct tcp_hwrate_limit_table *nrte;
 	const struct tcp_rate_set *rs;
 #ifdef KERN_TLS
 	struct ktls_session *tls = NULL;
 #endif
 	int err;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (crte == NULL) {
 		/* Wrong interface */
 		if (error)
 			*error = EINVAL;
 		return (NULL);
 	}
 
 #ifdef KERN_TLS
 	if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
 		tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info;
 		MPASS(tls->mode == TCP_TLS_MODE_IFNET);
 		if (tls->snd_tag != NULL &&
-		    tls->snd_tag->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) {
+		    tls->snd_tag->sw->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) {
 			/*
 			 * NIC probably doesn't support ratelimit TLS
 			 * tags if it didn't allocate one when an
 			 * existing rate was present, so ignore.
 			 */
 			if (error)
 				*error = EOPNOTSUPP;
 			return (NULL);
 		}
 	}
 #endif
 	if (tp->t_inpcb->inp_snd_tag == NULL) {
 		/* Wrong interface */
 		if (error)
 			*error = EINVAL;
 		return (NULL);
 	}
 	rs = crte->ptbl;
 	if ((rs->rs_flags & RS_IS_DEAD) ||
 	    (crte->flags & HDWRPACE_IFPDEPARTED)) {
 		/* Release the rate, and try anew */
 
 		tcp_rel_pacing_rate(crte, tp);
 		nrte = tcp_set_pacing_rate(tp, ifp,
 		    bytes_per_sec, flags, error, lower_rate);
 		return (nrte);
 	}
 	nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
 	if (nrte == crte) {
 		/* No change */
 		if (error)
 			*error = 0;
 		return (crte);
 	}
 	if (nrte == NULL) {
 		/* Release the old rate */
 		if (error)
 			*error = ENOENT;
 		tcp_rel_pacing_rate(crte, tp);
 		return (NULL);
 	}
 	rl_decrement_using(crte);
 	rl_increment_using(nrte);
 	/* Change rates to our new entry */
 #ifdef KERN_TLS
 	if (tls != NULL)
 		err = ktls_modify_txrtlmt(tls, nrte->rate);
 	else
 #endif
 		err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
 	if (err) {
 		struct tcp_rate_set *lrs;
 		uint64_t pre;
 
 		rl_decrement_using(nrte);
 		/*
 		 * The connection ends up with no rate at all, so the flow
 		 * count it held on the rate set has to be given back too;
 		 * otherwise the flow limit drifts and a dead rate set is
 		 * never destroyed.
 		 */
 		lrs = __DECONST(struct tcp_rate_set *, rs);
 		pre = atomic_fetchadd_64(&lrs->rs_flows_using, -1);
 		/* Do we still have a snd-tag attached? */
 		if (tp->t_inpcb->inp_snd_tag)
 			in_pcbdetach_txrtlmt(tp->t_inpcb);
 		if (pre == 1) {
 			struct epoch_tracker et;
 
 			/* We were the last flow; reap the set if dead. */
 			NET_EPOCH_ENTER(et);
 			mtx_lock(&rs_mtx);
 			if (lrs->rs_flags & RS_IS_DEAD)
 				rs_defer_destroy(lrs);
 			mtx_unlock(&rs_mtx);
 			NET_EPOCH_EXIT(et);
 		}
 		if (error)
 			*error = err;
 		return (NULL);
 	} else {
 #ifdef INET
 		counter_u64_add(rate_limit_chg, 1);
 #endif
 	}
 	if (error)
 		*error = 0;
 	tp->t_pacing_rate = nrte->rate;
 	return (nrte);
 }
 
 /*
  * Give up the hardware pacing rate crte held by tp: the entry's "using"
  * count and the owning rate set's flow count are dropped, and the
  * rate-limit send tag is detached from the inpcb.  When this was the
  * last flow on a rate set already marked dead, its destruction is
  * scheduled.  The inpcb write lock must be held.
  */
 void
 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
 {
 	struct tcp_rate_set *owner;
 	struct epoch_tracker et;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	tp->t_pacing_rate = -1;
 	/*
 	 * The table entry only hands out a const view of its rate set;
 	 * the const has to go so the flow count can be released.
 	 */
 	owner = __DECONST(struct tcp_rate_set *, crte->ptbl);
 	rl_decrement_using(crte);
 	if (atomic_fetchadd_64(&owner->rs_flows_using, -1) == 1) {
 		/* That was the last flow.  Is the set dead? */
 		NET_EPOCH_ENTER(et);
 		mtx_lock(&rs_mtx);
 		if (owner->rs_flags & RS_IS_DEAD)
 			rs_defer_destroy(owner);
 		mtx_unlock(&rs_mtx);
 		NET_EPOCH_EXIT(et);
 	}
 
 	/*
 	 * XXX: If this connection is using ifnet TLS, should we
 	 * switch it to using an unlimited rate, or perhaps use
 	 * ktls_output_eagain() to reset the send tag to a plain
 	 * TLS tag?
 	 */
 	in_pcbdetach_txrtlmt(tp->t_inpcb);
 }
 
 #define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits in bytes */
 #define ONE_HUNDRED_MBPS 12500000	/* 100Mbps in bytes per second */
 #define FIVE_HUNDRED_MBPS 62500000	/* 500Mbps in bytes per second */
 #define MAX_MSS_SENT 43	/* 43 mss = 43 x 1500 = 64,500 bytes */
 
 /*
  * Emit a TCP_HDWR_PACE_SIZE log record describing a pacing burst size
  * decision.  Nothing is done when logging is off for tp.  The arguments
  * are copied verbatim into the record: segsiz, new_tso, time_between,
  * calc_time_between, segs, res_div, mult and mod go to flex1..flex8, bw
  * to cur_del_rate and hw_rate to delRate.
  */
 static void
 tcp_log_pacing_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, uint32_t new_tso,
 		    uint64_t hw_rate, uint32_t time_between, uint32_t calc_time_between,
 		    uint32_t segs, uint32_t res_div, uint16_t mult, uint8_t mod)
 {
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 
 		memset(&log, 0, sizeof(log));
 		log.u_bbr.flex1 = segsiz;
 		log.u_bbr.flex2 = new_tso;
 		log.u_bbr.flex3 = time_between;
 		log.u_bbr.flex4 = calc_time_between;
 		log.u_bbr.flex5 = segs;
 		log.u_bbr.flex6 = res_div;
 		log.u_bbr.flex7 = mult;
 		log.u_bbr.flex8 = mod;
 		/* One clock read fills both tv and the record's timestamp. */
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.cur_del_rate = bw;
 		log.u_bbr.delRate = hw_rate;
 		TCP_LOG_EVENTP(tp, NULL,
 		    &tp->t_inpcb->inp_socket->so_rcv,
 		    &tp->t_inpcb->inp_socket->so_snd,
 		    TCP_HDWR_PACE_SIZE, 0,
 		    0, &log, false, &tv);
 	}
 }
 
 /*
  * Work out how many bytes may be handed down in one burst when pacing at
  * bw bytes per second with segments of segsiz bytes.
  *
  * te is the hardware rate entry in use, or NULL for software-only
  * pacing.  can_use_1mss permits a single-segment minimum for very low
  * rates.  err may be NULL; otherwise it receives 0, or -1 when the
  * hardware rate is not faster than bw (the software-only size is
  * returned in that case).
  *
  * Returns the burst size in bytes.
  */
 uint32_t
 tcp_get_pacing_burst_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz,
     int can_use_1mss, const struct tcp_hwrate_limit_table *te, int *err)
 {
 	/*
 	 * We use the google formula to calculate the
 	 * TSO size. I.E.
 	 * bw < 24Meg
 	 *   tso = 2mss
 	 * else
 	 *   tso = min(bw/1000, 64k)
 	 *
 	 * Note for these calculations we ignore the
 	 * packet overhead (enet hdr, ip hdr and tcp hdr).
 	 */
 	uint64_t lentim, res, bytes;
 	uint32_t new_tso, min_tso_segs;
 
 	bytes = bw / 1000;
 	if (bytes > (64 * 1000))
 		bytes = 64 * 1000;
 	/* Round up */
 	new_tso = (bytes + segsiz - 1) / segsiz;
 	if (can_use_1mss && (bw < ONE_POINT_TWO_MEG))
 		min_tso_segs = 1;
 	else
 		min_tso_segs = 2;
 	if (rs_floor_mss && (new_tso < rs_floor_mss))
 		new_tso = rs_floor_mss;
 	else if (new_tso < min_tso_segs)
 		new_tso = min_tso_segs;
 	if (new_tso > MAX_MSS_SENT)
 		new_tso = MAX_MSS_SENT;
 	new_tso *= segsiz;
 	tcp_log_pacing_size(tp, bw, segsiz, new_tso,
 			    0, 0, 0, 0, 0, 0, 1);
 	/*
 	 * If we are not doing hardware pacing
 	 * then we are done.
 	 */
 	if (te == NULL) {
 		if (err)
 			*err = 0;
 		return (new_tso);
 	}
 	/*
 	 * For hardware pacing we look at the
 	 * rate you are sending at and compare
 	 * that to the rate you have in hardware.
 	 *
 	 * If the hardware rate is slower than your
 	 * software rate then you are in error and
 	 * we will build a queue in our hardware which
 	 * is probably not desired, in such a case
 	 * just return the non-hardware TSO size.
 	 *
 	 * If the rate in hardware is faster (which
 	 * it should be) then look at how long it
 	 * takes to send one ethernet segment size at
 	 * your b/w and compare that to the time it
 	 * takes to send at the rate you had selected.
 	 *
 	 * If your time is greater (which we hope it is)
 	 * we get the delta between the two, and then
 	 * divide that into your pacing time. This tells
 	 * us how many MSS you can send down at once (rounded up).
 	 *
 	 * Note we also double this value if the b/w is over
 	 * 100Mbps. If its over 500meg we just set you to the
 	 * max (43 segments).
 	 */
 	if ((te->rate > FIVE_HUNDRED_MBPS) || (te->rate == bw)) {
 		/*
 		 * Either a very fast hardware rate, or we are pacing at
 		 * exactly the hdwr rate: use the maximum burst.
 		 */
 		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
 				    te->rate, te->time_between, (uint32_t)0,
 				    (segsiz * MAX_MSS_SENT), 0, 0, 3);
 		/* Report success here like every other successful return. */
 		if (err)
 			*err = 0;
 		return (segsiz * MAX_MSS_SENT);
 	}
 	lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
 	/*
 	 * NOTE(review): a bw of 0 would divide by zero here; callers
 	 * presumably never pass 0 together with a non-NULL te -- confirm.
 	 */
 	res = lentim / bw;
 	if (res > te->time_between) {
 		uint32_t delta, segs, res_div;
 
 		res_div = ((res * num_of_waits_allowed) + wait_time_floor);
 		delta = res - te->time_between;
 		segs = (res_div + delta - 1)/delta;
 		if (segs < min_tso_segs)
 			segs = min_tso_segs;
 		if (segs < rs_hw_floor_mss)
 			segs = rs_hw_floor_mss;
 		if (segs > MAX_MSS_SENT)
 			segs = MAX_MSS_SENT;
 		segs *= segsiz;
 		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
 				    te->rate, te->time_between, (uint32_t)res,
 				    segs, res_div, 1, 3);
 		if (err)
 			*err = 0;
 		if (segs < new_tso) {
 			/* unexpected ? */
 			return (new_tso);
 		} else {
 			return (segs);
 		}
 	} else {
 		/*
 		 * Your time is smaller which means
 		 * we will grow a queue on our
 		 * hardware. Send back the non-hardware
 		 * rate.
 		 */
 		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
 				    te->rate, te->time_between, (uint32_t)res,
 				    0, 0, 0, 4);
 		if (err)
 			*err = -1;
 		return (new_tso);
 	}
 }
 
 /*
  * Return the highest hardware pacing rate available for inp on ifp.
  * Indirect (RS_IS_DEFF) rate sets are resolved to the real interface
  * first.  0 is returned when the interface has no rate set, has one
  * without a rate table (for example a no-support place holder), or when
  * the real interface cannot be found.
  */
 uint64_t
 tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp)
 {
 	struct epoch_tracker et;
 	struct tcp_rate_set *rs;
 	uint64_t rate_ret;
 
 	NET_EPOCH_ENTER(et);
 	for (;;) {
 		rs = find_rs_for_ifp(ifp);
 		if (rs == NULL || (rs->rs_flags & RS_IS_DEFF) == 0)
 			break;
 		/* We need to find the real interface */
 		ifp = rt_find_real_interface(ifp, inp, NULL);
 		if (ifp == NULL) {
 			NET_EPOCH_EXIT(et);
 			return (0);
 		}
 	}
 	if (rs == NULL || rs->rs_rlt == NULL) {
 		/*
 		 * This interface does not do ratelimiting.  Place-holder
 		 * sets are created without a rate table, so there is
 		 * nothing to index.
 		 */
 		rate_ret = 0;
 	} else {
 		/* Lets return the highest rate this guy has */
 		rate_ret = rs->rs_rlt[rs->rs_highest_valid].rate;
 	}
 	NET_EPOCH_EXIT(et);
 	return (rate_ret);
 }
 
 static eventhandler_tag rl_ifnet_departs;
 static eventhandler_tag rl_ifnet_arrives;
 static eventhandler_tag rl_shutdown_start;
 
 /*
  * One-time setup of the rate-limit bookkeeping: the global list, the
  * counters and the mutex are initialized, then the handlers for
  * interface departure, link state changes and shutdown are registered.
  */
 static void
 tcp_rs_init(void *st __unused)
 {
 
 	/* Global state first ... */
 	CK_LIST_INIT(&int_rs);
 	rs_number_alive = 0;
 	rs_number_dead = 0;
 	mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
 
 	/* ... then the event hooks that use it. */
 	rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
 	    tcp_rl_ifnet_departure, NULL, EVENTHANDLER_PRI_ANY);
 	rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
 	    tcp_rl_ifnet_link, NULL, EVENTHANDLER_PRI_ANY);
 	rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
 	    tcp_rl_shutdown, NULL, SHUTDOWN_PRI_FIRST);
 
 	printf("TCP_ratelimit: Is now initialized\n");
 }
 
 SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
 #endif
diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
index 58ada4d0b7b2..9c196f30b319 100644
--- a/sys/sys/mbuf.h
+++ b/sys/sys/mbuf.h
@@ -1,1672 +1,1675 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)mbuf.h	8.5 (Berkeley) 2/19/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_MBUF_H_
 #define	_SYS_MBUF_H_
 
 /* XXX: These includes suck. Sorry! */
 #include <sys/queue.h>
 #ifdef _KERNEL
 #include <sys/systm.h>
 #include <sys/refcount.h>
 #include <vm/uma.h>
 #ifdef WITNESS
 #include <sys/lock.h>
 #endif
 #endif
 
 #ifdef _KERNEL
 #include <sys/sdt.h>
 
 /*
  * Convenience wrappers for the mbuf probes declared below: each forwards
  * 'probe' and its 1-5 arguments to the matching SDT_PROBEn() macro, always
  * passing 'sdt' as the first argument and leaving the next two empty.
  */
 #define	MBUF_PROBE1(probe, arg0)					\
 	SDT_PROBE1(sdt, , , probe, arg0)
 #define	MBUF_PROBE2(probe, arg0, arg1)					\
 	SDT_PROBE2(sdt, , , probe, arg0, arg1)
 #define	MBUF_PROBE3(probe, arg0, arg1, arg2)				\
 	SDT_PROBE3(sdt, , , probe, arg0, arg1, arg2)
 #define	MBUF_PROBE4(probe, arg0, arg1, arg2, arg3)			\
 	SDT_PROBE4(sdt, , , probe, arg0, arg1, arg2, arg3)
 #define	MBUF_PROBE5(probe, arg0, arg1, arg2, arg3, arg4)		\
 	SDT_PROBE5(sdt, , , probe, arg0, arg1, arg2, arg3, arg4)
 
 SDT_PROBE_DECLARE(sdt, , , m__init);
 SDT_PROBE_DECLARE(sdt, , , m__gethdr_raw);
 SDT_PROBE_DECLARE(sdt, , , m__gethdr);
 SDT_PROBE_DECLARE(sdt, , , m__get_raw);
 SDT_PROBE_DECLARE(sdt, , , m__get);
 SDT_PROBE_DECLARE(sdt, , , m__getcl);
 SDT_PROBE_DECLARE(sdt, , , m__getjcl);
 SDT_PROBE_DECLARE(sdt, , , m__clget);
 SDT_PROBE_DECLARE(sdt, , , m__cljget);
 SDT_PROBE_DECLARE(sdt, , , m__cljset);
 SDT_PROBE_DECLARE(sdt, , , m__free);
 SDT_PROBE_DECLARE(sdt, , , m__freem);
 
 #endif /* _KERNEL */
 
 /*
  * Mbufs are of a single size, MSIZE (sys/param.h), which includes overhead.
  * An mbuf may add a single "mbuf cluster" of size MCLBYTES (also in
  * sys/param.h), which has no additional overhead and is used instead of the
  * internal data area; this is done when at least MINCLSIZE of data must be
  * stored.  Additionally, it is possible to allocate a separate buffer
  * externally and attach it to the mbuf in a way similar to that of mbuf
  * clusters.
  *
  * NB: These calculations do not take actual compiler-induced alignment and
  * padding inside the complete struct mbuf into account.  Appropriate
  * attention is required when changing members of struct mbuf.
  *
  * MLEN is data length in a normal mbuf.
  * MHLEN is data length in an mbuf with pktheader.
  * MINCLSIZE is the smallest amount of data that should be put into a cluster.
  *
  * Compile-time assertions in uipc_mbuf.c test these values to ensure that
  * they are sensible.
  */
 struct mbuf;
 #define	MHSIZE		offsetof(struct mbuf, m_dat)
 #define	MPKTHSIZE	offsetof(struct mbuf, m_pktdat)
 #define	MLEN		((int)(MSIZE - MHSIZE))
 #define	MHLEN		((int)(MSIZE - MPKTHSIZE))
 #define	MINCLSIZE	(MHLEN + 1)
 #define	M_NODOM		255
 
 #ifdef _KERNEL
 /*-
  * Macro for type conversion: convert mbuf pointer to data pointer of correct
  * type:
  *
  * mtod(m, t)	-- Convert mbuf pointer to data pointer of correct type.
  * mtodo(m, o) -- Same as above but with offset 'o' into data.
  */
 #define	mtod(m, t)	((t)((m)->m_data))
 #define	mtodo(m, o)	((void *)(((m)->m_data) + (o)))
 
 /*
  * Argument structure passed to UMA routines during mbuf and packet
  * allocations.
  */
 struct mb_args {
 	int	flags;	/* Flags for mbuf being allocated */
 	short	type;	/* Type of mbuf being allocated */
 };
 #endif /* _KERNEL */
 
 /*
  * Packet tag structure (see below for details).
  */
 struct m_tag {
 	SLIST_ENTRY(m_tag)	m_tag_link;	/* List of packet tags */
 	u_int16_t		m_tag_id;	/* Tag ID */
 	u_int16_t		m_tag_len;	/* Length of data */
 	u_int32_t		m_tag_cookie;	/* ABI/Module ID */
 	void			(*m_tag_free)(struct m_tag *);
 };
 
 /*
  * Static network interface owned tag.
  * Allocated through ifp->if_snd_tag_alloc().
  */
+struct if_snd_tag_sw;
+
 struct m_snd_tag {
 	struct ifnet *ifp;		/* network interface tag belongs to */
+	const struct if_snd_tag_sw *sw;
 	volatile u_int refcount;
-	u_int	type;			/* One of IF_SND_TAG_TYPE_*. */
 };
 
 /*
  * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set.
  * Size ILP32: 48
  *	 LP64: 56
  * Compile-time assertions in uipc_mbuf.c test these values to ensure that
  * they are correct.
  */
 struct pkthdr {
 	union {
 		struct m_snd_tag *snd_tag;	/* send tag, if any */
 		struct ifnet	*rcvif;		/* rcv interface */
 	};
 	SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */
 	int32_t		 len;		/* total packet length */
 
 	/* Layer crossing persistent information. */
 	uint32_t	 flowid;	/* packet's 4-tuple system */
 	uint32_t	 csum_flags;	/* checksum and offload features */
 	uint16_t	 fibnum;	/* this packet should use this fib */
 	uint8_t		 numa_domain;	/* NUMA domain of recvd pkt */
 	uint8_t		 rsstype;	/* hash type */
 	union {
 		uint64_t	rcv_tstmp;	/* timestamp in ns */
 		struct {
 			uint8_t		 l2hlen;	/* layer 2 hdr len */
 			uint8_t		 l3hlen;	/* layer 3 hdr len */
 			uint8_t		 l4hlen;	/* layer 4 hdr len */
 			uint8_t		 l5hlen;	/* layer 5 hdr len */
 			uint8_t		 inner_l2hlen;
 			uint8_t		 inner_l3hlen;
 			uint8_t		 inner_l4hlen;
 			uint8_t		 inner_l5hlen;
 		};
 	};
 	union {
 		uint8_t  eight[8];
 		uint16_t sixteen[4];
 		uint32_t thirtytwo[2];
 		uint64_t sixtyfour[1];
 		uintptr_t unintptr[1];
 		void	*ptr;
 	} PH_per;
 
 	/* Layer specific non-persistent local storage for reassembly, etc. */
 	union {
 		uint8_t  eight[8];
 		uint16_t sixteen[4];
 		uint32_t thirtytwo[2];
 		uint64_t sixtyfour[1];
 		uintptr_t unintptr[1];
 		void 	*ptr;
 	} PH_loc;
 };
 #define	ether_vtag	PH_per.sixteen[0]
 #define tcp_tun_port	PH_per.sixteen[0] /* outbound */
 #define	PH_vt		PH_per
 #define	vt_nrecs	sixteen[0]	  /* mld and v6-ND */
 #define	tso_segsz	PH_per.sixteen[1] /* inbound after LRO */
 #define	lro_nsegs	tso_segsz	  /* inbound after LRO */
 #define	csum_data	PH_per.thirtytwo[1] /* inbound from hardware up */
 #define	lro_tcp_d_len	PH_loc.sixteen[0] /* inbound during LRO (no reassembly) */
 #define	lro_tcp_d_csum	PH_loc.sixteen[1] /* inbound during LRO (no reassembly) */
 #define	lro_tcp_h_off	PH_loc.sixteen[2] /* inbound during LRO (no reassembly) */
 #define	lro_etype	PH_loc.sixteen[3] /* inbound during LRO (no reassembly) */
 /* Note PH_loc is used during IP reassembly (all 8 bytes as a ptr) */
 
 /*
  * TLS records for TLS 1.0-1.2 can have the following header lengths:
  * - 5 (AES-CBC with implicit IV)
  * - 21 (AES-CBC with explicit IV)
  * - 13 (AES-GCM with 8 byte explicit IV)
  */
 #define	MBUF_PEXT_HDR_LEN	23
 
 /*
  * TLS records for TLS 1.0-1.2 can have the following maximum trailer
  * lengths:
  * - 16 (AES-GCM)
  * - 36 (AES-CBC with SHA1 and up to 16 bytes of padding)
  * - 48 (AES-CBC with SHA2-256 and up to 16 bytes of padding)
  * - 64 (AES-CBC with SHA2-384 and up to 16 bytes of padding)
  */
 #define	MBUF_PEXT_TRAIL_LEN	64
 
 #if defined(__LP64__)
 #define MBUF_PEXT_MAX_PGS (40 / sizeof(vm_paddr_t))
 #else
 #define MBUF_PEXT_MAX_PGS (72 / sizeof(vm_paddr_t))
 #endif
 
 #define	MBUF_PEXT_MAX_BYTES						\
     (MBUF_PEXT_MAX_PGS * PAGE_SIZE + MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN)
 
 struct ktls_session;
 struct socket;
 
 /*
  * Description of external storage mapped into mbuf; valid only if M_EXT is
  * set.
  * Size ILP32: 28
  *	 LP64: 48
  * Compile-time assertions in uipc_mbuf.c test these values to ensure that
  * they are correct.
  */
 typedef	void m_ext_free_t(struct mbuf *);
 struct m_ext {
 	union {
 		/*
 		 * If EXT_FLAG_EMBREF is set, then we use refcount in the
 		 * mbuf, the 'ext_count' member.  Otherwise, we have a
 		 * shadow copy and we use pointer 'ext_cnt'.  The original
 		 * mbuf is responsible to carry the pointer to free routine
 		 * and its arguments.  They aren't copied into shadows in
 		 * mb_dupcl() to avoid dereferencing next cachelines.
 		 */
 		volatile u_int	 ext_count;
 		volatile u_int	*ext_cnt;
 	};
 	uint32_t	 ext_size;	/* size of buffer, for ext_free */
 	uint32_t	 ext_type:8,	/* type of external storage */
 			 ext_flags:24;	/* external storage mbuf flags */
 	union {
 		struct {
 			/*
 			 * Regular M_EXT mbuf:
 			 * o ext_buf always points to the external buffer.
 			 * o ext_free (below) and two optional arguments
 			 *   ext_arg1 and ext_arg2 store the free context for
 			 *   the external storage.  They are set only in the
 			 *   refcount carrying mbuf, the one with
 			 *   EXT_FLAG_EMBREF flag, with exclusion for
 			 *   EXT_EXTREF type, where the free context is copied
 			 *   into all mbufs that use same external storage.
 			 */
 			char 	*ext_buf;	/* start of buffer */
 #define	m_ext_copylen	offsetof(struct m_ext, ext_arg2)
 			void	*ext_arg2;
 		};
 		struct {
 			/*
 			 * Multi-page M_EXTPG mbuf:
 			 * o extpg_pa - page vector.
 			 * o extpg_trail and extpg_hdr - TLS trailer and
 			 *   header.
 			 * Uses ext_free and may also use ext_arg1.
 			 */
 			vm_paddr_t	extpg_pa[MBUF_PEXT_MAX_PGS];
 			char		extpg_trail[MBUF_PEXT_TRAIL_LEN];
 			char		extpg_hdr[MBUF_PEXT_HDR_LEN];
 			/* Pretend these 3 fields are part of mbuf itself. */
 #define	m_epg_pa	m_ext.extpg_pa
 #define	m_epg_trail	m_ext.extpg_trail
 #define	m_epg_hdr	m_ext.extpg_hdr
 #define	m_epg_ext_copylen	offsetof(struct m_ext, ext_free)
 		};
 	};
 	/*
 	 * Free method and optional argument pointer, both
 	 * used by M_EXT and M_EXTPG.
 	 */
 	m_ext_free_t	*ext_free;
 	void		*ext_arg1;
 };
 
 /*
  * The core of the mbuf object along with some shortcut defines for practical
  * purposes.
  */
 struct mbuf {
 	/*
 	 * Header present at the beginning of every mbuf.
 	 * Size ILP32: 24
 	 *      LP64: 32
 	 * Compile-time assertions in uipc_mbuf.c test these values to ensure
 	 * that they are correct.
 	 */
 	union {	/* next buffer in chain */
 		struct mbuf		*m_next;
 		SLIST_ENTRY(mbuf)	m_slist;
 		STAILQ_ENTRY(mbuf)	m_stailq;
 	};
 	union {	/* next chain in queue/record */
 		struct mbuf		*m_nextpkt;
 		SLIST_ENTRY(mbuf)	m_slistpkt;
 		STAILQ_ENTRY(mbuf)	m_stailqpkt;
 	};
 	caddr_t		 m_data;	/* location of data */
 	int32_t		 m_len;		/* amount of data in this mbuf */
 	uint32_t	 m_type:8,	/* type of data in this mbuf */
 			 m_flags:24;	/* flags; see below */
 #if !defined(__LP64__)
 	uint32_t	 m_pad;		/* pad for 64bit alignment */
 #endif
 
 	/*
 	 * A set of optional headers (packet header, external storage header)
 	 * and internal data storage.  Historically, these arrays were sized
 	 * to MHLEN (space left after a packet header) and MLEN (space left
 	 * after only a regular mbuf header); they are now variable size in
 	 * order to support future work on variable-size mbufs.
 	 */
 	union {
 		struct {
 			union {
 				/* M_PKTHDR set. */
 				struct pkthdr	m_pkthdr;
 
 				/* M_EXTPG set.
 				 * Multi-page M_EXTPG mbuf has its meta data
 				 * split between the below anonymous structure
 				 * and m_ext.  It carries vector of pages,
 				 * optional header and trailer char vectors
 				 * and pointers to socket/TLS data.
 				 */
 #define	m_epg_startcopy		m_epg_npgs
 #define	m_epg_endcopy		m_epg_stailq
 				struct {
 					/* Overall count of pages and count of
 					 * pages with I/O pending. */
 					uint8_t	m_epg_npgs;
 					uint8_t	m_epg_nrdy;
 					/* TLS header and trailer lengths.
 					 * The data itself resides in m_ext. */
 					uint8_t	m_epg_hdrlen;
 					uint8_t	m_epg_trllen;
 					/* Offset into 1st page and length of
 					 * data in the last page. */
 					uint16_t m_epg_1st_off;
 					uint16_t m_epg_last_len;
 					uint8_t	m_epg_flags;
 #define	EPG_FLAG_ANON	0x1	/* Data can be encrypted in place. */
 #define	EPG_FLAG_2FREE	0x2	/* Scheduled for free. */
 					uint8_t	m_epg_record_type;
 					uint8_t	__spare[2];
 					int	m_epg_enc_cnt;
 					struct ktls_session *m_epg_tls;
 					struct socket	*m_epg_so;
 					uint64_t	m_epg_seqno;
 					STAILQ_ENTRY(mbuf) m_epg_stailq;
 				};
 			};
 			union {
 				/* M_EXT or M_EXTPG set. */
 				struct m_ext	m_ext;
 				/* M_PKTHDR set, neither M_EXT nor M_EXTPG. */
 				char		m_pktdat[0];
 			};
 		};
 		char	m_dat[0];			/* !M_PKTHDR, !M_EXT */
 	};
 };
 
 #ifdef _KERNEL
 /*
  * Return the length of page 'pidx' of the multi-page mbuf 'm'.
  *
  * The page whose index is m_epg_npgs - 1 yields m_epg_last_len; every other
  * page yields PAGE_SIZE minus 'pgoff'.  A non-zero 'pgoff' is only accepted
  * together with pidx == 0 (asserted).
  */
 static inline int
 m_epg_pagelen(const struct mbuf *m, int pidx, int pgoff)
 {
 	const int last_idx = m->m_epg_npgs - 1;

 	KASSERT(pidx == 0 || pgoff == 0,
 	    ("page %d with non-zero offset %d in %p", pidx, pgoff, m));

 	return (pidx == last_idx ? m->m_epg_last_len : PAGE_SIZE - pgoff);
 }
 
 #ifdef INVARIANTS
 /*
  * Assert 'ex' on a multi-page mbuf.  NB: the panic message prints the
  * variable literally named 'm' at the point of expansion, so callers must
  * have such a variable in scope.
  */
 #define	MCHECK(ex, msg)	KASSERT((ex),				\
 	    ("Multi page mbuf %p with " #msg " at %s:%d",	\
 	    m, __FILE__, __LINE__))
 /*
  * Check the M_EXTPG metadata of 'm' for internal consistency: page counts,
  * first-page offset, last-page length and TLS header/trailer lengths.
  *
  * NB: This expects a non-empty buffer (npgs > 0 and
  * last_pg_len > 0).
  */
 #define	MBUF_EXT_PGS_ASSERT_SANITY(m)	do {				\
 	MCHECK((m)->m_epg_npgs > 0, "no valid pages");			\
 	MCHECK((m)->m_epg_npgs <= nitems((m)->m_epg_pa),		\
 	    "too many pages");						\
 	MCHECK((m)->m_epg_nrdy <= (m)->m_epg_npgs,			\
 	    "too many ready pages");					\
 	MCHECK((m)->m_epg_1st_off < PAGE_SIZE,				\
 	    "too large page offset");					\
 	MCHECK((m)->m_epg_last_len > 0, "zero last page length");	\
 	MCHECK((m)->m_epg_last_len <= PAGE_SIZE,			\
 	    "too large last page length");				\
 	if ((m)->m_epg_npgs == 1) {					\
 		MCHECK((m)->m_epg_1st_off +				\
 		    (m)->m_epg_last_len <= PAGE_SIZE,			\
 		    "single page too large");				\
 	}								\
 	MCHECK((m)->m_epg_hdrlen <= sizeof((m)->m_epg_hdr),		\
 	    "too large header length");					\
 	MCHECK((m)->m_epg_trllen <= sizeof((m)->m_epg_trail),		\
 	    "too large trailer length");				\
 } while (0)
 #else
 /* No trailing ';' so the macro stays a single statement in if/else. */
 #define	MBUF_EXT_PGS_ASSERT_SANITY(m)	do {} while (0)
 #endif
 #endif
 
 /*
  * mbuf flags of global significance and layer crossing.
  * Those of only protocol/layer specific significance are to be mapped
  * to M_PROTO[1-11] and cleared at layer handoff boundaries.
  * NB: Limited to the lower 24 bits.
  */
 #define	M_EXT		0x00000001 /* has associated external storage */
 #define	M_PKTHDR	0x00000002 /* start of record */
 #define	M_EOR		0x00000004 /* end of record */
 #define	M_RDONLY	0x00000008 /* associated data is marked read-only */
 #define	M_BCAST		0x00000010 /* send/received as link-level broadcast */
 #define	M_MCAST		0x00000020 /* send/received as link-level multicast */
 #define	M_PROMISC	0x00000040 /* packet was not for us */
 #define	M_VLANTAG	0x00000080 /* ether_vtag is valid */
 #define	M_EXTPG		0x00000100 /* has array of unmapped pages and TLS */
 #define	M_NOFREE	0x00000200 /* do not free mbuf, embedded in cluster */
 #define	M_TSTMP		0x00000400 /* rcv_tstmp field is valid */
 #define	M_TSTMP_HPREC	0x00000800 /* rcv_tstmp is high-prec, typically
 				      hw-stamped on port (useful for IEEE 1588
 				      and 802.1AS) */
 #define M_TSTMP_LRO	0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */
 
 #define	M_PROTO1	0x00002000 /* protocol-specific */
 #define	M_PROTO2	0x00004000 /* protocol-specific */
 #define	M_PROTO3	0x00008000 /* protocol-specific */
 #define	M_PROTO4	0x00010000 /* protocol-specific */
 #define	M_PROTO5	0x00020000 /* protocol-specific */
 #define	M_PROTO6	0x00040000 /* protocol-specific */
 #define	M_PROTO7	0x00080000 /* protocol-specific */
 #define	M_PROTO8	0x00100000 /* protocol-specific */
 #define	M_PROTO9	0x00200000 /* protocol-specific */
 #define	M_PROTO10	0x00400000 /* protocol-specific */
 #define	M_PROTO11	0x00800000 /* protocol-specific */
 
 /*
  * Flags to purge when crossing layers.
  */
 #define	M_PROTOFLAGS \
     (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\
      M_PROTO9|M_PROTO10|M_PROTO11)
 
 /*
  * Flags preserved when copying m_pkthdr.
  */
 #define M_COPYFLAGS \
     (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG|M_TSTMP| \
      M_TSTMP_HPREC|M_TSTMP_LRO|M_PROTOFLAGS)
 
 /*
  * Mbuf flag description for use with printf(9) %b identifier.
  */
 #define	M_FLAG_BITS \
     "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \
     "\7M_PROMISC\10M_VLANTAG\11M_EXTPG\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC\15M_TSTMP_LRO"
 /*
  * NB: %b bit numbers are octal escapes.  M_PROTO11 (0x00800000) is bit 24,
  * i.e. "\30"; "\28" is not an octal escape and would be read as "\2" + '8'.
  */
 #define	M_FLAG_PROTOBITS \
     "\16M_PROTO1\17M_PROTO2\20M_PROTO3\21M_PROTO4" \
     "\22M_PROTO5\23M_PROTO6\24M_PROTO7\25M_PROTO8\26M_PROTO9" \
     "\27M_PROTO10\30M_PROTO11"
 #define	M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS)
 
 /*
  * Network interface cards are able to hash protocol fields (such as IPv4
  * addresses and TCP port numbers) to classify packets into flows.  These flows
  * can then be used to maintain ordering while delivering packets to the OS
  * via parallel input queues, as well as to provide a stateless affinity
  * model.  NIC drivers can pass up the hash via m->m_pkthdr.flowid, and set
  * m_flag fields to indicate how the hash should be interpreted by the
  * network stack.
  *
  * Most NICs support RSS, which provides ordering and explicit affinity, and
  * use the hash m_flag bits to indicate what header fields were covered by
  * the hash.  M_HASHTYPE_OPAQUE and M_HASHTYPE_OPAQUE_HASH can be set by non-
  * RSS cards or configurations that provide an opaque flow identifier, allowing
  * for ordering and distribution without explicit affinity.  Additionally,
  * M_HASHTYPE_OPAQUE_HASH indicates that the flow identifier has hash
  * properties.
  *
  * The meaning of the IPV6_EX suffix:
  * "o  Home address from the home address option in the IPv6 destination
  *     options header.  If the extension header is not present, use the Source
  *     IPv6 Address.
  *  o  IPv6 address that is contained in the Routing-Header-Type-2 from the
  *     associated extension header.  If the extension header is not present,
  *     use the Destination IPv6 Address."
  * Quoted from:
  * https://docs.microsoft.com/en-us/windows-hardware/drivers/network/rss-hashing-types#ndishashipv6ex
  */
 #define	M_HASHTYPE_HASHPROP		0x80	/* has hash properties */
 #define	M_HASHTYPE_INNER		0x40	/* calculated from inner headers */
 #define	M_HASHTYPE_HASH(t)		(M_HASHTYPE_HASHPROP | (t))
 /* Microsoft RSS standard hash types */
 #define	M_HASHTYPE_NONE			0
 #define	M_HASHTYPE_RSS_IPV4		M_HASHTYPE_HASH(1) /* IPv4 2-tuple */
 #define	M_HASHTYPE_RSS_TCP_IPV4		M_HASHTYPE_HASH(2) /* TCPv4 4-tuple */
 #define	M_HASHTYPE_RSS_IPV6		M_HASHTYPE_HASH(3) /* IPv6 2-tuple */
 #define	M_HASHTYPE_RSS_TCP_IPV6		M_HASHTYPE_HASH(4) /* TCPv6 4-tuple */
 #define	M_HASHTYPE_RSS_IPV6_EX		M_HASHTYPE_HASH(5) /* IPv6 2-tuple +
 							    * ext hdrs */
 #define	M_HASHTYPE_RSS_TCP_IPV6_EX	M_HASHTYPE_HASH(6) /* TCPv6 4-tuple +
 							    * ext hdrs */
 #define	M_HASHTYPE_RSS_UDP_IPV4		M_HASHTYPE_HASH(7) /* IPv4 UDP 4-tuple*/
 #define	M_HASHTYPE_RSS_UDP_IPV6		M_HASHTYPE_HASH(9) /* IPv6 UDP 4-tuple*/
 #define	M_HASHTYPE_RSS_UDP_IPV6_EX	M_HASHTYPE_HASH(10)/* IPv6 UDP 4-tuple +
 							    * ext hdrs */
 
 #define	M_HASHTYPE_OPAQUE		0x3f	/* ordering, not affinity */
 #define	M_HASHTYPE_OPAQUE_HASH		M_HASHTYPE_HASH(M_HASHTYPE_OPAQUE)
 						/* ordering+hash, not affinity*/
 
 /*
  * Accessors for the hash type kept in m_pkthdr.rsstype.
  *
  * M_HASHTYPE_GET() and M_HASHTYPE_TEST() mask out the M_HASHTYPE_INNER bit
  * before returning/comparing.  M_HASHTYPE_SET() and M_HASHTYPE_CLEAR()
  * overwrite the whole field, including that bit; M_HASHTYPE_SETINNER() ORs
  * the bit into the current value.
  */
 #define	M_HASHTYPE_CLEAR(m)	((m)->m_pkthdr.rsstype = 0)
 #define	M_HASHTYPE_GET(m)	((m)->m_pkthdr.rsstype & ~M_HASHTYPE_INNER)
 #define	M_HASHTYPE_SET(m, v)	((m)->m_pkthdr.rsstype = (v))
 #define	M_HASHTYPE_TEST(m, v)	(M_HASHTYPE_GET(m) == (v))
 #define	M_HASHTYPE_ISHASH(m)	\
     (((m)->m_pkthdr.rsstype & M_HASHTYPE_HASHPROP) != 0)
 #define	M_HASHTYPE_SETINNER(m)	do {			\
 	(m)->m_pkthdr.rsstype |= M_HASHTYPE_INNER;	\
     } while (0)
 
 /*
  * External mbuf storage buffer types.
  */
 #define	EXT_CLUSTER	1	/* mbuf cluster */
 #define	EXT_SFBUF	2	/* sendfile(2)'s sf_buf */
 #define	EXT_JUMBOP	3	/* jumbo cluster page sized */
 #define	EXT_JUMBO9	4	/* jumbo cluster 9216 bytes */
 #define	EXT_JUMBO16	5	/* jumbo cluster 16184 bytes */
 #define	EXT_PACKET	6	/* mbuf+cluster from packet zone */
 #define	EXT_MBUF	7	/* external mbuf reference */
 #define	EXT_RXRING	8	/* data in NIC receive ring */
 
 #define	EXT_VENDOR1	224	/* for vendor-internal use */
 #define	EXT_VENDOR2	225	/* for vendor-internal use */
 #define	EXT_VENDOR3	226	/* for vendor-internal use */
 #define	EXT_VENDOR4	227	/* for vendor-internal use */
 
 #define	EXT_EXP1	244	/* for experimental use */
 #define	EXT_EXP2	245	/* for experimental use */
 #define	EXT_EXP3	246	/* for experimental use */
 #define	EXT_EXP4	247	/* for experimental use */
 
 #define	EXT_NET_DRV	252	/* custom ext_buf provided by net driver(s) */
 #define	EXT_MOD_TYPE	253	/* custom module's ext_buf type */
 #define	EXT_DISPOSABLE	254	/* can throw this buffer away w/page flipping */
 #define	EXT_EXTREF	255	/* has externally maintained ext_cnt ptr */
 
 /*
  * Flags for external mbuf buffer types.
  * NB: limited to the lower 24 bits.
  */
 #define	EXT_FLAG_EMBREF		0x000001	/* embedded ext_count */
 #define	EXT_FLAG_EXTREF		0x000002	/* external ext_cnt, notyet */
 
 #define	EXT_FLAG_NOFREE		0x000010	/* don't free mbuf to pool, notyet */
 
 #define	EXT_FLAG_VENDOR1	0x010000	/* These flags are vendor */
 #define	EXT_FLAG_VENDOR2	0x020000	/* or submodule specific, */
 #define	EXT_FLAG_VENDOR3	0x040000	/* not used by mbuf code. */
 #define	EXT_FLAG_VENDOR4	0x080000	/* Set/read by submodule. */
 
 #define	EXT_FLAG_EXP1		0x100000	/* for experimental use */
 #define	EXT_FLAG_EXP2		0x200000	/* for experimental use */
 #define	EXT_FLAG_EXP3		0x400000	/* for experimental use */
 #define	EXT_FLAG_EXP4		0x800000	/* for experimental use */
 
 /*
  * EXT flag description for use with printf(9) %b identifier.
  */
 #define	EXT_FLAG_BITS \
     "\20\1EXT_FLAG_EMBREF\2EXT_FLAG_EXTREF\5EXT_FLAG_NOFREE" \
     "\21EXT_FLAG_VENDOR1\22EXT_FLAG_VENDOR2\23EXT_FLAG_VENDOR3" \
     "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \
     "\30EXT_FLAG_EXP4"
 
 /*
  * Flags indicating checksum, segmentation and other offload work to be
  * done, or already done, by hardware or lower layers.  It is split into
  * separate inbound and outbound flags.
  *
  * Outbound flags that are set by upper protocol layers requesting lower
  * layers, or ideally the hardware, to perform these offloading tasks.
  * For outbound packets this field and its flags can be directly tested
  * against ifnet if_hwassist.  Note that the outbound and the inbound flags do
  * not collide right now but they could be allowed to (as long as the flags are
  * scrubbed appropriately when the direction of an mbuf changes).  CSUM_BITS
  * would also have to split into CSUM_BITS_TX and CSUM_BITS_RX.
  *
  * CSUM_INNER_<x> is the same as CSUM_<x> but it applies to the inner frame.
  * The CSUM_ENCAP_<x> bits identify the outer encapsulation.
  */
 #define	CSUM_IP			0x00000001	/* IP header checksum offload */
 #define	CSUM_IP_UDP		0x00000002	/* UDP checksum offload */
 #define	CSUM_IP_TCP		0x00000004	/* TCP checksum offload */
 #define	CSUM_IP_SCTP		0x00000008	/* SCTP checksum offload */
 #define	CSUM_IP_TSO		0x00000010	/* TCP segmentation offload */
 #define	CSUM_IP_ISCSI		0x00000020	/* iSCSI checksum offload */
 
 #define	CSUM_INNER_IP6_UDP	0x00000040
 #define	CSUM_INNER_IP6_TCP	0x00000080
 #define	CSUM_INNER_IP6_TSO	0x00000100
 #define	CSUM_IP6_UDP		0x00000200	/* UDP checksum offload */
 #define	CSUM_IP6_TCP		0x00000400	/* TCP checksum offload */
 #define	CSUM_IP6_SCTP		0x00000800	/* SCTP checksum offload */
 #define	CSUM_IP6_TSO		0x00001000	/* TCP segmentation offload */
 #define	CSUM_IP6_ISCSI		0x00002000	/* iSCSI checksum offload */
 
 #define	CSUM_INNER_IP		0x00004000
 #define	CSUM_INNER_IP_UDP	0x00008000
 #define	CSUM_INNER_IP_TCP	0x00010000
 #define	CSUM_INNER_IP_TSO	0x00020000
 
 #define	CSUM_ENCAP_VXLAN	0x00040000	/* VXLAN outer encapsulation */
 #define	CSUM_ENCAP_RSVD1	0x00080000
 
 /* Inbound checksum support where the checksum was verified by hardware. */
 #define	CSUM_INNER_L3_CALC	0x00100000
 #define	CSUM_INNER_L3_VALID	0x00200000
 #define	CSUM_INNER_L4_CALC	0x00400000
 #define	CSUM_INNER_L4_VALID	0x00800000
 #define	CSUM_L3_CALC		0x01000000	/* calculated layer 3 csum */
 #define	CSUM_L3_VALID		0x02000000	/* checksum is correct */
 #define	CSUM_L4_CALC		0x04000000	/* calculated layer 4 csum */
 #define	CSUM_L4_VALID		0x08000000	/* checksum is correct */
 #define	CSUM_L5_CALC		0x10000000	/* calculated layer 5 csum */
 #define	CSUM_L5_VALID		0x20000000	/* checksum is correct */
 #define	CSUM_COALESCED		0x40000000	/* contains merged segments */
 
 #define	CSUM_SND_TAG		0x80000000	/* Packet header has send tag */
 
 #define CSUM_FLAGS_TX (CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_SCTP | \
     CSUM_IP_TSO | CSUM_IP_ISCSI | CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | \
     CSUM_INNER_IP6_TSO | CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_SCTP | \
     CSUM_IP6_TSO | CSUM_IP6_ISCSI | CSUM_INNER_IP | CSUM_INNER_IP_UDP | \
     CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN | \
     CSUM_ENCAP_RSVD1 | CSUM_SND_TAG)
 
 #define CSUM_FLAGS_RX (CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID | \
     CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID | CSUM_L3_CALC | CSUM_L3_VALID | \
     CSUM_L4_CALC | CSUM_L4_VALID | CSUM_L5_CALC | CSUM_L5_VALID | \
     CSUM_COALESCED)
 
 /*
  * CSUM flag description for use with printf(9) %b identifier.
  */
 #define	CSUM_BITS \
     "\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \
     "\6CSUM_IP_ISCSI\7CSUM_INNER_IP6_UDP\10CSUM_INNER_IP6_TCP" \
     "\11CSUM_INNER_IP6_TSO\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP" \
     "\15CSUM_IP6_TSO\16CSUM_IP6_ISCSI\17CSUM_INNER_IP\20CSUM_INNER_IP_UDP" \
     "\21CSUM_INNER_IP_TCP\22CSUM_INNER_IP_TSO\23CSUM_ENCAP_VXLAN" \
     "\24CSUM_ENCAP_RSVD1\25CSUM_INNER_L3_CALC\26CSUM_INNER_L3_VALID" \
     "\27CSUM_INNER_L4_CALC\30CSUM_INNER_L4_VALID\31CSUM_L3_CALC" \
     "\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID\35CSUM_L5_CALC" \
     "\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG"
 
 /* CSUM flags compatibility mappings. */
 #define	CSUM_IP_CHECKED		CSUM_L3_CALC
 #define	CSUM_IP_VALID		CSUM_L3_VALID
 #define	CSUM_DATA_VALID		CSUM_L4_VALID
 #define	CSUM_PSEUDO_HDR		CSUM_L4_CALC
 #define	CSUM_SCTP_VALID		CSUM_L4_VALID
 #define	CSUM_DELAY_DATA		(CSUM_TCP|CSUM_UDP)
 #define	CSUM_DELAY_IP		CSUM_IP		/* Only v4, no v6 IP hdr csum */
 #define	CSUM_DELAY_DATA_IPV6	(CSUM_TCP_IPV6|CSUM_UDP_IPV6)
 #define	CSUM_DATA_VALID_IPV6	CSUM_DATA_VALID
 #define	CSUM_TCP		CSUM_IP_TCP
 #define	CSUM_UDP		CSUM_IP_UDP
 #define	CSUM_SCTP		CSUM_IP_SCTP
 #define	CSUM_TSO		(CSUM_IP_TSO|CSUM_IP6_TSO)
 #define	CSUM_INNER_TSO		(CSUM_INNER_IP_TSO|CSUM_INNER_IP6_TSO)
 #define	CSUM_UDP_IPV6		CSUM_IP6_UDP
 #define	CSUM_TCP_IPV6		CSUM_IP6_TCP
 #define	CSUM_SCTP_IPV6		CSUM_IP6_SCTP
 
 /*
  * mbuf types describing the content of the mbuf (including external storage).
  */
 #define	MT_NOTMBUF	0	/* USED INTERNALLY ONLY! Object is not mbuf */
 #define	MT_DATA		1	/* dynamic (data) allocation */
 #define	MT_HEADER	MT_DATA	/* packet header, use M_PKTHDR instead */
 
 #define	MT_VENDOR1	4	/* for vendor-internal use */
 #define	MT_VENDOR2	5	/* for vendor-internal use */
 #define	MT_VENDOR3	6	/* for vendor-internal use */
 #define	MT_VENDOR4	7	/* for vendor-internal use */
 
 #define	MT_SONAME	8	/* socket name */
 
 #define	MT_EXP1		9	/* for experimental use */
 #define	MT_EXP2		10	/* for experimental use */
 #define	MT_EXP3		11	/* for experimental use */
 #define	MT_EXP4		12	/* for experimental use */
 
 #define	MT_CONTROL	14	/* extra-data protocol message */
 #define	MT_EXTCONTROL	15	/* control message with externalized contents */
 #define	MT_OOBDATA	16	/* expedited data  */
 
 #define	MT_NOINIT	255	/* Not a type but a flag to allocate
 				   a non-initialized mbuf */
 
 /*
  * String names of mbuf-related UMA(9) and malloc(9) types.  Exposed to
  * !_KERNEL so that monitoring tools can look up the zones with
  * libmemstat(3).
  */
 #define	MBUF_MEM_NAME		"mbuf"
 #define	MBUF_CLUSTER_MEM_NAME	"mbuf_cluster"
 #define	MBUF_PACKET_MEM_NAME	"mbuf_packet"
 #define	MBUF_JUMBOP_MEM_NAME	"mbuf_jumbo_page"
 #define	MBUF_JUMBO9_MEM_NAME	"mbuf_jumbo_9k"
 #define	MBUF_JUMBO16_MEM_NAME	"mbuf_jumbo_16k"
 #define	MBUF_TAG_MEM_NAME	"mbuf_tag"
 #define	MBUF_EXTREFCNT_MEM_NAME	"mbuf_ext_refcnt"
 #define	MBUF_EXTPGS_MEM_NAME	"mbuf_extpgs"
 
 #ifdef _KERNEL
 union if_snd_tag_alloc_params;
 
 #ifdef WITNESS
 /*
  * Issue a WITNESS warning when 'how' equals M_WAITOK.  The argument is
  * parenthesized so that an expression argument compares as a whole.
  */
 #define	MBUF_CHECKSLEEP(how) do {					\
 	if ((how) == M_WAITOK)						\
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,		\
 		    "Sleeping in \"%s\"", __func__);			\
 } while (0)
 #else
 /* Without WITNESS this is an empty statement. */
 #define	MBUF_CHECKSLEEP(how)	do {} while (0)
 #endif
 
 /*
  * Network buffer allocation API
  *
  * The rest of it is defined in kern/kern_mbuf.c
  */
 extern uma_zone_t	zone_mbuf;
 extern uma_zone_t	zone_clust;
 extern uma_zone_t	zone_pack;
 extern uma_zone_t	zone_jumbop;
 extern uma_zone_t	zone_jumbo9;
 extern uma_zone_t	zone_jumbo16;
 extern uma_zone_t	zone_extpgs;
 
 void		 mb_dupcl(struct mbuf *, struct mbuf *);
 void		 mb_free_ext(struct mbuf *);
 void		 mb_free_extpg(struct mbuf *);
 void		 mb_free_mext_pgs(struct mbuf *);
 struct mbuf	*mb_alloc_ext_pgs(int, m_ext_free_t);
 struct mbuf	*mb_alloc_ext_plus_pages(int, int);
 struct mbuf	*mb_mapped_to_unmapped(struct mbuf *, int, int, int,
 		    struct mbuf **);
 int		 mb_unmapped_compress(struct mbuf *m);
 struct mbuf 	*mb_unmapped_to_ext(struct mbuf *m);
 void		 mb_free_notready(struct mbuf *m, int count);
 void		 m_adj(struct mbuf *, int);
 void		 m_adj_decap(struct mbuf *, int);
 int		 m_apply(struct mbuf *, int, int,
 		    int (*)(void *, void *, u_int), void *);
 int		 m_append(struct mbuf *, int, c_caddr_t);
 void		 m_cat(struct mbuf *, struct mbuf *);
 void		 m_catpkt(struct mbuf *, struct mbuf *);
 int		 m_clget(struct mbuf *m, int how);
 void 		*m_cljget(struct mbuf *m, int how, int size);
 struct mbuf	*m_collapse(struct mbuf *, int, int);
 void		 m_copyback(struct mbuf *, int, int, c_caddr_t);
 void		 m_copydata(const struct mbuf *, int, int, caddr_t);
 struct mbuf	*m_copym(struct mbuf *, int, int, int);
 struct mbuf	*m_copypacket(struct mbuf *, int);
 void		 m_copy_pkthdr(struct mbuf *, struct mbuf *);
 struct mbuf	*m_copyup(struct mbuf *, int, int);
 struct mbuf	*m_defrag(struct mbuf *, int);
 void		 m_demote_pkthdr(struct mbuf *);
 void		 m_demote(struct mbuf *, int, int);
 struct mbuf	*m_devget(char *, int, int, struct ifnet *,
 		    void (*)(char *, caddr_t, u_int));
 void		 m_dispose_extcontrolm(struct mbuf *m);
 struct mbuf	*m_dup(const struct mbuf *, int);
 int		 m_dup_pkthdr(struct mbuf *, const struct mbuf *, int);
 void		 m_extadd(struct mbuf *, char *, u_int, m_ext_free_t,
 		    void *, void *, int, int);
 u_int		 m_fixhdr(struct mbuf *);
 struct mbuf	*m_fragment(struct mbuf *, int, int);
 void		 m_freem(struct mbuf *);
 void		 m_free_raw(struct mbuf *);
 struct mbuf	*m_get2(int, int, short, int);
 struct mbuf	*m_get3(int, int, short, int);
 struct mbuf	*m_getjcl(int, short, int, int);
 struct mbuf	*m_getm2(struct mbuf *, int, int, short, int);
 struct mbuf	*m_getptr(struct mbuf *, int, int *);
 u_int		 m_length(struct mbuf *, struct mbuf **);
 int		 m_mbuftouio(struct uio *, const struct mbuf *, int);
 void		 m_move_pkthdr(struct mbuf *, struct mbuf *);
 int		 m_pkthdr_init(struct mbuf *, int);
 struct mbuf	*m_prepend(struct mbuf *, int, int);
 void		 m_print(const struct mbuf *, int);
 struct mbuf	*m_pulldown(struct mbuf *, int, int, int *);
 struct mbuf	*m_pullup(struct mbuf *, int);
 int		 m_sanity(struct mbuf *, int);
 struct mbuf	*m_split(struct mbuf *, int, int);
 struct mbuf	*m_uiotombuf(struct uio *, int, int, int, int);
 int		 m_unmapped_uiomove(const struct mbuf *, int, struct uio *,
 		    int);
 struct mbuf	*m_unshare(struct mbuf *, int);
 int		 m_snd_tag_alloc(struct ifnet *,
 		    union if_snd_tag_alloc_params *, struct m_snd_tag **);
-void		 m_snd_tag_init(struct m_snd_tag *, struct ifnet *, u_int);
+void		 m_snd_tag_init(struct m_snd_tag *, struct ifnet *,
+		    const struct if_snd_tag_sw *);
 void		 m_snd_tag_destroy(struct m_snd_tag *);
 
 /*
  * Translate a buffer size into the matching EXT_* storage type.
  * Any size other than the ones listed below is a programming error and
  * results in a panic.
  */
 static __inline int
 m_gettype(int size)
 {
 
 	switch (size) {
 	case MSIZE:
 		return (EXT_MBUF);
 	case MCLBYTES:
 		return (EXT_CLUSTER);
 #if MJUMPAGESIZE != MCLBYTES
 	case MJUMPAGESIZE:
 		return (EXT_JUMBOP);
 #endif
 	case MJUM9BYTES:
 		return (EXT_JUMBO9);
 	case MJUM16BYTES:
 		return (EXT_JUMBO16);
 	default:
 		panic("%s: invalid cluster size %d", __func__, size);
 	}
 }
 
 /*
  * Associate an externally reference-counted buffer with an mbuf.
  *
  * A new reference is taken on *ref_cnt, the external storage descriptor of
  * m is filled in as EXT_EXTREF, and m_data is pointed at the start of buf.
  * ref_cnt must not be NULL.
  */
 static __inline void
 m_extaddref(struct mbuf *m, char *buf, u_int size, u_int *ref_cnt,
     m_ext_free_t freef, void *arg1, void *arg2)
 {
 
 	KASSERT(ref_cnt != NULL, ("%s: ref_cnt not provided", __func__));
 
 	/* This mbuf holds its own reference on the caller's counter. */
 	atomic_add_int(ref_cnt, 1);
 
 	/* Describe the external storage. */
 	m->m_ext.ext_type = EXT_EXTREF;
 	m->m_ext.ext_flags = 0;
 	m->m_ext.ext_cnt = ref_cnt;
 	m->m_ext.ext_buf = buf;
 	m->m_ext.ext_size = size;
 	m->m_ext.ext_free = freef;
 	m->m_ext.ext_arg1 = arg1;
 	m->m_ext.ext_arg2 = arg2;
 
 	/* Switch the mbuf over to the external buffer. */
 	m->m_data = buf;
 	m->m_flags |= M_EXT;
 }
 
 /*
  * Return the UMA zone that backs clusters of the given size.
  * Panics if size is not one of the supported cluster sizes.
  */
 static __inline uma_zone_t
 m_getzone(int size)
 {
 
 	switch (size) {
 	case MCLBYTES:
 		return (zone_clust);
 #if MJUMPAGESIZE != MCLBYTES
 	case MJUMPAGESIZE:
 		return (zone_jumbop);
 #endif
 	case MJUM9BYTES:
 		return (zone_jumbo9);
 	case MJUM16BYTES:
 		return (zone_jumbo16);
 	default:
 		panic("%s: invalid cluster size %d", __func__, size);
 	}
 }
 
 /*
  * Initialize an mbuf with linear storage.
  *
  * Inline because the consumer text overhead will be roughly the same to
  * initialize or call a function with this many parameters and M_PKTHDR
  * should go away with constant propagation for !MGETHDR.
  */
 static __inline int
 m_init(struct mbuf *m, int how, short type, int flags)
 {
 	int error;
 
 	m->m_next = NULL;
 	m->m_nextpkt = NULL;
 	m->m_data = m->m_dat;
 	m->m_len = 0;
 	m->m_flags = flags;
 	m->m_type = type;
 	if (flags & M_PKTHDR)
 		error = m_pkthdr_init(m, how);
 	else
 		error = 0;
 
 	MBUF_PROBE5(m__init, m, how, type, flags, error);
 	return (error);
 }
 
 static __inline struct mbuf *
 m_get_raw(int how, short type)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = 0;
 	args.type = type | MT_NOINIT;
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	MBUF_PROBE3(m__get_raw, how, type, m);
 	return (m);
 }
 
 static __inline struct mbuf *
 m_get(int how, short type)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = 0;
 	args.type = type;
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	MBUF_PROBE3(m__get, how, type, m);
 	return (m);
 }
 
 static __inline struct mbuf *
 m_gethdr_raw(int how, short type)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = M_PKTHDR;
 	args.type = type | MT_NOINIT;
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	MBUF_PROBE3(m__gethdr_raw, how, type, m);
 	return (m);
 }
 
 static __inline struct mbuf *
 m_gethdr(int how, short type)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = M_PKTHDR;
 	args.type = type;
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	MBUF_PROBE3(m__gethdr, how, type, m);
 	return (m);
 }
 
 static __inline struct mbuf *
 m_getcl(int how, short type, int flags)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = flags;
 	args.type = type;
 	m = uma_zalloc_arg(zone_pack, &args, how);
 	MBUF_PROBE4(m__getcl, how, type, flags, m);
 	return (m);
 }
 
 /*
  * Attach the cluster cl of the given EXT_* type to the mbuf m.
  *
  * The external storage descriptor is set up with an embedded reference
  * count of one and no free routine or arguments, and m_data is pointed at
  * the cluster.  An unrecognized type panics.
  *
  * XXX: m_cljset() is a dangerous API.  Only a new, unreferenced cluster
  * may be attached to an mbuf(9).  That cannot be asserted here, so the
  * responsibility lies entirely with the callers.
  */
 static __inline void
 m_cljset(struct mbuf *m, void *cl, int type)
 {
 	int size;
 
 	if (type == EXT_CLUSTER)
 		size = MCLBYTES;
 #if MJUMPAGESIZE != MCLBYTES
 	else if (type == EXT_JUMBOP)
 		size = MJUMPAGESIZE;
 #endif
 	else if (type == EXT_JUMBO9)
 		size = MJUM9BYTES;
 	else if (type == EXT_JUMBO16)
 		size = MJUM16BYTES;
 	else
 		panic("%s: unknown cluster type %d", __func__, type);
 
 	m->m_ext.ext_buf = cl;
 	m->m_data = m->m_ext.ext_buf;
 	m->m_ext.ext_arg2 = NULL;
 	m->m_ext.ext_arg1 = NULL;
 	m->m_ext.ext_free = NULL;
 	m->m_ext.ext_size = size;
 	m->m_ext.ext_type = type;
 	m->m_ext.ext_flags = EXT_FLAG_EMBREF;
 	m->m_ext.ext_count = 1;
 	m->m_flags |= M_EXT;
 	MBUF_PROBE3(m__cljset, m, cl, type);
 }
 
 /*
  * Set the type of mbuf m to new_type.  Only the m_type field is written.
  */
 static __inline void
 m_chtype(struct mbuf *m, short new_type)
 {
 
 	m->m_type = new_type;
 }
 
 static __inline void
 m_clrprotoflags(struct mbuf *m)
 {
 
 	while (m) {
 		m->m_flags &= ~M_PROTOFLAGS;
 		m = m->m_next;
 	}
 }
 
 static __inline struct mbuf *
 m_last(struct mbuf *m)
 {
 
 	while (m->m_next)
 		m = m->m_next;
 	return (m);
 }
 
 /*
  * Return the current reference count of the external storage of m.
  * With EXT_FLAG_EMBREF the count is the embedded ext_count; otherwise it
  * is read through the ext_cnt pointer.  m must have M_EXT set.
  */
 static inline u_int
 m_extrefcnt(struct mbuf *m)
 {
 
 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT missing", __func__));
 
 	if ((m->m_ext.ext_flags & EXT_FLAG_EMBREF) != 0)
 		return (m->m_ext.ext_count);
 	return (*m->m_ext.ext_cnt);
 }
 
 /*
  * mbuf, cluster, and external object allocation macros (for compatibility
  * purposes).
  *
  * Each macro is a thin wrapper around a function declared or defined in
  * this header.  MGET() and MGETHDR() assign the allocation result to their
  * first argument; MEXTADD() casts buf to (char *) before calling
  * m_extadd(); m_getm() calls m_getm2() with M_PKTHDR as the final argument.
  */
 #define	M_MOVE_PKTHDR(to, from)	m_move_pkthdr((to), (from))
 #define	MGET(m, how, type)	((m) = m_get((how), (type)))
 #define	MGETHDR(m, how, type)	((m) = m_gethdr((how), (type)))
 #define	MCLGET(m, how)		m_clget((m), (how))
 #define	MEXTADD(m, buf, size, free, arg1, arg2, flags, type)		\
     m_extadd((m), (char *)(buf), (size), (free), (arg1), (arg2),	\
     (flags), (type))
 #define	m_getm(m, len, how, type)					\
     m_getm2((m), (len), (how), (type), M_PKTHDR)
 
 /*
  * Evaluate TRUE if it's safe to write to the mbuf m's data region (this can
  * be both the local data payload, or an external buffer area, depending on
  * whether M_EXT is set).
  *
  * The mbuf is considered writable when neither M_RDONLY nor M_EXTPG is set
  * and, if M_EXT is set, m_extrefcnt() reports exactly one reference.
  * Note that the argument m is evaluated more than once.
  */
 #define	M_WRITABLE(m)	(((m)->m_flags & (M_RDONLY | M_EXTPG)) == 0 &&	\
 			 (!(((m)->m_flags & M_EXT)) ||			\
 			 (m_extrefcnt(m) == 1)))
 
 /*
  * Check if the supplied mbuf has a packet header, or else panic.
  * The check is a KASSERT that m is non-NULL and has M_PKTHDR set.
  */
 #define	M_ASSERTPKTHDR(m)						\
 	KASSERT((m) != NULL && (m)->m_flags & M_PKTHDR,			\
 	    ("%s: no mbuf packet header!", __func__))
 
 /*
  * Check if mbuf is multipage: M_EXTPG must be set and M_PKTHDR must be
  * clear, otherwise the KASSERT fires.
  */
 #define M_ASSERTEXTPG(m)						\
 	KASSERT(((m)->m_flags & (M_EXTPG|M_PKTHDR)) == M_EXTPG,		\
 	    ("%s: m %p is not multipage!", __func__, m))
 
 /*
  * Ensure that the supplied mbuf is a valid, non-free mbuf.
  *
  * XXX: Broken at the moment.  Need some UMA magic to make it work again.
  * The tested expression masks m_flags with 0, so the assertion can never
  * fire; the macro currently only checks that m can be used as a
  * struct mbuf pointer.
  *
  * The argument is parenthesized before the cast so that expression
  * arguments (e.g. pointer arithmetic) are cast as a whole.
  */
 #define	M_ASSERTVALID(m)						\
 	KASSERT((((struct mbuf *)(m))->m_flags & 0) == 0,		\
 	    ("%s: attempted use of a free mbuf!", __func__))
 
 /*
  * Check whether any mbuf in the chain is unmapped.
  *
  * With INVARIANTS the chain is walked through m_next and a KASSERT fires
  * for the first mbuf that has M_EXTPG set.  Without INVARIANTS the macro
  * expands to nothing.
  */
 #ifdef INVARIANTS
 #define	M_ASSERTMAPPED(m) do {						\
 	for (struct mbuf *__m = (m); __m != NULL; __m = __m->m_next)	\
 		KASSERT((__m->m_flags & M_EXTPG) == 0,			\
 		    ("%s: chain %p contains an unmapped mbuf", __func__, (m)));\
 } while (0)
 #else
 #define	M_ASSERTMAPPED(m)
 #endif
 
 /*
  * Return the address of the start of the buffer associated with an mbuf,
  * handling external storage, packet-header mbufs, and regular data mbufs.
  *
  * Evaluates to NULL for M_EXTPG mbufs; that test takes precedence over
  * M_EXT and M_PKTHDR.  The argument m is evaluated more than once.
  */
 #define	M_START(m)							\
 	(((m)->m_flags & M_EXTPG) ? NULL :				\
 	 ((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf :			\
 	 ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] :		\
 	 &(m)->m_dat[0])
 
 /*
  * Return the size of the buffer associated with an mbuf, handling external
  * storage, packet-header mbufs, and regular data mbufs.
  *
  * Yields ext_size when M_EXT is set, MHLEN when only M_PKTHDR is set, and
  * MLEN otherwise.  Unlike M_START(), M_EXTPG is not special-cased here.
  */
 #define	M_SIZE(m)							\
 	(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size :			\
 	 ((m)->m_flags & M_PKTHDR) ? MHLEN :				\
 	 MLEN)
 
 /*
  * Position m_data in a freshly allocated mbuf so that an object of len
  * bytes sits at the end of the buffer, rounded down to a multiple of
  * sizeof(long).
  *
  * The mbuf must still be untouched, i.e. m_data must equal M_START(m).
  *
  * NB: M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() used to be separate macros,
  * each asserting that it was applied to the right kind of storage, which
  * forced callers to test the storage type themselves.  That layout
  * knowledge is centralized here instead.
  */
 static __inline void
 m_align(struct mbuf *m, int len)
 {
 	int slack;
 
 	KASSERT(m->m_data == M_START(m),
 	    ("%s: not a virgin mbuf", __func__));
 
 	/* Bytes left in front of the object, before rounding down. */
 	slack = M_SIZE(m) - len;
 	m->m_data += slack & ~(sizeof(long) - 1);
 }
 
 /* Legacy names; all three expand to the same m_align() call. */
 #define	M_ALIGN(m, len)		m_align(m, len)
 #define	MH_ALIGN(m, len)	m_align(m, len)
 #define	MEXT_ALIGN(m, len)	m_align(m, len)
 
 /*
  * Compute the amount of space available before the current start of data in
  * an mbuf.  Evaluates to 0 when M_WRITABLE(m) is false.
  *
  * The M_WRITABLE() is a temporary, conservative safety measure: the burden
  * of checking writability of the mbuf data area rests solely with the caller.
  *
  * NB: In previous versions, M_LEADINGSPACE() would only check M_WRITABLE()
  * for mbufs with external storage.  We now allow mbuf-embedded data to be
  * read-only as well.
  */
 #define	M_LEADINGSPACE(m)						\
 	(M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0)
 
 /*
  * Compute the amount of space available after the end of data in an mbuf,
  * i.e. the distance from m_data + m_len to the end of the buffer.
  * Evaluates to 0 when M_WRITABLE(m) is false.
  *
  * The M_WRITABLE() is a temporary, conservative safety measure: the burden
  * of checking writability of the mbuf data area rests solely with the caller.
  *
  * NB: In previous versions, M_TRAILINGSPACE() would only check M_WRITABLE()
  * for mbufs with external storage.  We now allow mbuf-embedded data to be
  * read-only as well.
  */
 #define	M_TRAILINGSPACE(m)						\
 	(M_WRITABLE(m) ?						\
 	    ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0)
 
 /*
  * Arrange to prepend space of size plen to mbuf m.  If a new mbuf must be
  * allocated, how specifies whether to wait.  If the allocation fails, the
  * original mbuf chain is freed and m is set to NULL.
  *
  * When the leading space of m is large enough, m_data and m_len are
  * adjusted in place; otherwise m_prepend() is called.  If the resulting
  * mbuf carries a packet header, its pkthdr.len is increased by plen.
  *
  * m must be an lvalue, since its address is taken and the result is stored
  * back through it.  Each argument is evaluated exactly once: plen and how
  * are captured in locals, and the captured value of how is what gets
  * passed to MBUF_CHECKSLEEP() as well.
  */
 #define	M_PREPEND(m, plen, how) do {					\
 	struct mbuf **_mmp = &(m);					\
 	struct mbuf *_mm = *_mmp;					\
 	int _mplen = (plen);						\
 	int __mhow = (how);						\
 									\
 	MBUF_CHECKSLEEP(__mhow);					\
 	if (M_LEADINGSPACE(_mm) >= _mplen) {				\
 		_mm->m_data -= _mplen;					\
 		_mm->m_len += _mplen;					\
 	} else								\
 		_mm = m_prepend(_mm, _mplen, __mhow);			\
 	if (_mm != NULL && _mm->m_flags & M_PKTHDR)			\
 		_mm->m_pkthdr.len += _mplen;				\
 	*_mmp = _mm;							\
 } while (0)
 
 /*
  * Change mbuf to new type.  Expands to m_chtype().
  *
  * NOTE(review): this was historically described as a relatively expensive
  * operation to be avoided; the m_chtype() defined in this header is a
  * single store to m_type.
  */
 #define	MCHTYPE(m, t)	m_chtype((m), (t))
 
 /*
  * Return the receive interface recorded in the packet header of m, or
  * NULL when CSUM_SND_TAG is set in csum_flags.  m must carry a packet
  * header.
  */
 static __inline struct ifnet *
 m_rcvif(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	return ((m->m_pkthdr.csum_flags & CSUM_SND_TAG) ? NULL :
 	    m->m_pkthdr.rcvif);
 }
 
 /* Length to m_copy to copy all. */
 #define	M_COPYALL	1000000000
 
 extern int		max_datalen;	/* MHLEN - max_hdr */
 extern int		max_hdr;	/* Largest link + protocol header */
 extern int		max_linkhdr;	/* Largest link-level header */
 extern int		max_protohdr;	/* Largest protocol header */
 extern int		nmbclusters;	/* Maximum number of clusters */
 extern bool		mb_use_ext_pgs;	/* Use ext_pgs for sendfile */
 
 /*-
  * Network packets may have annotations attached by affixing a list of
  * "packet tags" to the pkthdr structure.  Packet tags are dynamically
  * allocated semi-opaque data structures that have a fixed header
  * (struct m_tag) that specifies the size of the memory block and a
  * <cookie,type> pair that identifies it.  The cookie is a 32-bit unique
  * unsigned value used to identify a module or ABI.  By convention this value
  * is chosen as the date+time that the module is created, expressed as the
  * number of seconds since the epoch (e.g., using date -u +'%s').  The type
  * value is an ABI/module-specific value that identifies a particular
  * annotation and is private to the module.  For compatibility with systems
  * like OpenBSD that define packet tags w/o an ABI/module cookie, the value
  * MTAG_ABI_COMPAT is used to implement m_tag_get and m_tag_find
  * compatibility shim functions and several tag types are defined below.
  * Users that do not require compatibility should use a private cookie value
  * so that packet tag-related definitions can be maintained privately.
  *
  * Note that the packet tag returned by m_tag_alloc has the default memory
  * alignment implemented by malloc.  To reference private data one can use a
  * construct like:
  *
  *	struct m_tag *mtag = m_tag_alloc(...);
  *	struct foo *p = (struct foo *)(mtag+1);
  *
  * if the alignment of struct m_tag is sufficient for referencing members of
  * struct foo.  Otherwise it is necessary to embed struct m_tag within the
  * private data structure to ensure proper alignment; e.g.,
  *
  *	struct foo {
  *		struct m_tag	tag;
  *		...
  *	};
  *	struct foo *p = (struct foo *) m_tag_alloc(...);
  *	struct m_tag *mtag = &p->tag;
  */
 
 /*
  * Persistent tags stay with an mbuf until the mbuf is reclaimed.  Otherwise
  * tags are expected to ``vanish'' when they pass through a network
  * interface.  For most interfaces this happens normally as the tags are
  * reclaimed when the mbuf is free'd.  However in some special cases
  * reclaiming must be done manually.  An example is packets that pass through
  * the loopback interface.  Also, one must be careful to do this when
  * ``turning around'' packets (e.g., icmp_reflect).
  *
  * To mark a tag persistent bit-or this flag in when defining the tag id.
  * The tag will then be treated as described above.
  */
 #define	MTAG_PERSISTENT				0x800
 
 #define	PACKET_TAG_NONE				0  /* Nadda */
 
 /* Packet tags for use with MTAG_ABI_COMPAT. */
 #define	PACKET_TAG_IPSEC_IN_DONE		1  /* IPsec applied, in */
 #define	PACKET_TAG_IPSEC_OUT_DONE		2  /* IPsec applied, out */
 #define	PACKET_TAG_IPSEC_IN_CRYPTO_DONE		3  /* NIC IPsec crypto done */
 #define	PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED	4  /* NIC IPsec crypto req'ed */
 #define	PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO	5  /* NIC notifies IPsec */
 #define	PACKET_TAG_IPSEC_PENDING_TDB		6  /* Reminder to do IPsec */
 #define	PACKET_TAG_BRIDGE			7  /* Bridge processing done */
 #define	PACKET_TAG_GIF				8  /* GIF processing done */
 #define	PACKET_TAG_GRE				9  /* GRE processing done */
 #define	PACKET_TAG_IN_PACKET_CHECKSUM		10 /* NIC checksumming done */
 #define	PACKET_TAG_ENCAP			11 /* Encap.  processing */
 #define	PACKET_TAG_IPSEC_SOCKET			12 /* IPSEC socket ref */
 #define	PACKET_TAG_IPSEC_HISTORY		13 /* IPSEC history */
 #define	PACKET_TAG_IPV6_INPUT			14 /* IPV6 input processing */
 #define	PACKET_TAG_DUMMYNET			15 /* dummynet info */
 #define	PACKET_TAG_DIVERT			17 /* divert info */
 #define	PACKET_TAG_IPFORWARD			18 /* ipforward info */
 #define	PACKET_TAG_MACLABEL	(19 | MTAG_PERSISTENT) /* MAC label */
 #define	PACKET_TAG_PF		(21 | MTAG_PERSISTENT) /* PF/ALTQ information */
 #define	PACKET_TAG_RTSOCKFAM			25 /* rtsock sa family */
 #define	PACKET_TAG_IPOPTIONS			27 /* Saved IP options */
 #define	PACKET_TAG_CARP				28 /* CARP info */
 #define	PACKET_TAG_IPSEC_NAT_T_PORTS		29 /* two uint16_t */
 #define	PACKET_TAG_ND_OUTGOING			30 /* ND outgoing */
 
 /* Specific cookies and tags. */
 
 /* Packet tag routines. */
 struct m_tag	*m_tag_alloc(u_int32_t, int, int, int);
 void		 m_tag_delete(struct mbuf *, struct m_tag *);
 void		 m_tag_delete_chain(struct mbuf *, struct m_tag *);
 void		 m_tag_free_default(struct m_tag *);
 struct m_tag	*m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *);
 struct m_tag	*m_tag_copy(struct m_tag *, int);
 int		 m_tag_copy_chain(struct mbuf *, const struct mbuf *, int);
 void		 m_tag_delete_nonpersistent(struct mbuf *);
 
 /*
  * Initialize the list of tags associated with an mbuf.
  * The tag list head in m's packet header is reset to the empty list.
  */
 static __inline void
 m_tag_init(struct mbuf *m)
 {
 
 	SLIST_INIT(&m->m_pkthdr.tags);
 }
 
 /*
  * Fill in the identifying fields of a tag: cookie, id (type) and length.
  * The free method is deliberately left alone; the caller is expected to
  * set it.
  *
  * XXX probably should be called m_tag_init, but that was already taken.
  */
 static __inline void
 m_tag_setup(struct m_tag *t, u_int32_t cookie, int type, int len)
 {
 
 	t->m_tag_cookie = cookie;
 	t->m_tag_id = type;
 	t->m_tag_len = len;
 }
 
 /*
  * Release a tag by invoking the free method stored in the tag itself.
  */
 static __inline void
 m_tag_free(struct m_tag *t)
 {
 
 	t->m_tag_free(t);
 }
 
 /*
  * Return the first tag associated with an mbuf, i.e. the head of the tag
  * list in m's packet header.
  */
 static __inline struct m_tag *
 m_tag_first(struct mbuf *m)
 {
 
 	return (SLIST_FIRST(&m->m_pkthdr.tags));
 }
 
 /*
  * Return the next tag in the list of tags associated with an mbuf.
  * Only the tag t is consulted; the mbuf argument is unused.
  */
 static __inline struct m_tag *
 m_tag_next(struct mbuf *m __unused, struct m_tag *t)
 {
 
 	return (SLIST_NEXT(t, m_tag_link));
 }
 
 /*
  * Prepend a tag to the list of tags associated with an mbuf.
  * The tag becomes the new head of the list in m's packet header.
  */
 static __inline void
 m_tag_prepend(struct mbuf *m, struct m_tag *t)
 {
 
 	SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link);
 }
 
 /*
  * Unlink a tag from the list of tags associated with an mbuf.
  * The tag is only removed from the list; it is not freed here.
  */
 static __inline void
 m_tag_unlink(struct mbuf *m, struct m_tag *t)
 {
 
 	SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link);
 }
 
 /* These are for OpenBSD compatibility. */
 #define	MTAG_ABI_COMPAT		0		/* compatibility ABI */
 
 /*
  * Compatibility wrapper: allocate a tag through m_tag_alloc() using the
  * MTAG_ABI_COMPAT cookie.
  */
 static __inline struct m_tag *
 m_tag_get(int type, int length, int wait)
 {
 	return (m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait));
 }
 
 /*
  * Compatibility wrapper: look up a tag of the given type under the
  * MTAG_ABI_COMPAT cookie.  An empty tag list short-circuits to NULL
  * without calling m_tag_locate().
  */
 static __inline struct m_tag *
 m_tag_find(struct mbuf *m, int type, struct m_tag *start)
 {
 
 	if (SLIST_EMPTY(&m->m_pkthdr.tags))
 		return (NULL);
 	return (m_tag_locate(m, MTAG_ABI_COMPAT, type, start));
 }
 
 /*
  * Acquire an additional reference on a send tag and return the same tag,
  * so the call can be used inline in an assignment.
  */
 static inline struct m_snd_tag *
 m_snd_tag_ref(struct m_snd_tag *mst)
 {
 
 	refcount_acquire(&mst->refcount);
 	return (mst);
 }
 
 /*
  * Drop one reference on a send tag.  When refcount_release() reports
  * that this was the final reference, the tag is destroyed.
  */
 static inline void
 m_snd_tag_rele(struct m_snd_tag *mst)
 {
 
 	if (!refcount_release(&mst->refcount))
 		return;
 	m_snd_tag_destroy(mst);
 }
 
 /*
  * Free a single mbuf and return its successor (m_next), which is read
  * before anything is released.
  *
  * In order: the tag chain is deleted when both M_PKTHDR and M_NOFREE are
  * set; a send tag reference is dropped when the packet header has
  * CSUM_SND_TAG set; then the storage is released through mb_free_extpg()
  * for M_EXTPG, mb_free_ext() for M_EXT, or back to zone_mbuf unless
  * M_NOFREE is set.
  */
 static __inline struct mbuf *
 m_free(struct mbuf *m)
 {
 	struct mbuf *next;
 
 	next = m->m_next;
 	MBUF_PROBE1(m__free, m);
 
 	if ((m->m_flags & (M_PKTHDR|M_NOFREE)) == (M_PKTHDR|M_NOFREE))
 		m_tag_delete_chain(m, NULL);
 	if ((m->m_flags & M_PKTHDR) != 0 &&
 	    (m->m_pkthdr.csum_flags & CSUM_SND_TAG) != 0)
 		m_snd_tag_rele(m->m_pkthdr.snd_tag);
 
 	if ((m->m_flags & M_EXTPG) != 0)
 		mb_free_extpg(m);
 	else if ((m->m_flags & M_EXT) != 0)
 		mb_free_ext(m);
 	else if ((m->m_flags & M_NOFREE) == 0)
 		uma_zfree(zone_mbuf, m);
 
 	return (next);
 }
 
 /*
  * Return the FIB number stored in the packet header of m.
  * m must carry a packet header (checked with KASSERT).
  */
 static __inline int
 rt_m_getfib(struct mbuf *m)
 {
 
 	KASSERT(m->m_flags & M_PKTHDR,
 	    ("Attempt to get FIB from non header mbuf."));
 	return (m->m_pkthdr.fibnum);
 }
 
 /* Read the FIB number of a packet-header mbuf; see rt_m_getfib(). */
 #define M_GETFIB(_m)   rt_m_getfib(_m)
 
 /*
  * Store _fib as the FIB number in the packet header of _m.
  * _m must carry a packet header (checked with KASSERT).
  */
 #define M_SETFIB(_m, _fib) do {						\
         KASSERT((_m)->m_flags & M_PKTHDR, ("Attempt to set FIB on non header mbuf."));	\
 	((_m)->m_pkthdr.fibnum) = (_fib);				\
 } while (0)
 
 /* flags passed as first argument for "m_xxx_tcpip_hash()" */
 #define	MBUF_HASHFLAG_L2	(1 << 2)
 #define	MBUF_HASHFLAG_L3	(1 << 3)
 #define	MBUF_HASHFLAG_L4	(1 << 4)
 
 /* mbuf hashing helper routines */
 uint32_t	m_ether_tcpip_hash_init(void);
 uint32_t	m_ether_tcpip_hash(const uint32_t, const struct mbuf *, uint32_t);
 uint32_t	m_infiniband_tcpip_hash_init(void);
 uint32_t	m_infiniband_tcpip_hash(const uint32_t, const struct mbuf *, uint32_t);
 
 #ifdef MBUF_PROFILING
  void m_profile(struct mbuf *m);
  #define M_PROFILE(m) m_profile(m)
 #else
  #define M_PROFILE(m)
 #endif
 
 /*
  * A tail queue of packets (mbufs linked through m_stailqpkt) with an
  * element count and an optional length limit.
  */
 struct mbufq {
 	STAILQ_HEAD(, mbuf)	mq_head;	/* queued packets */
 	int			mq_len;		/* current number of packets */
 	int			mq_maxlen;	/* limit checked by mbufq_full();
 						   values <= 0 mean no limit */
 };
 
 /*
  * Initialize mq as an empty queue with the given maximum length.
  */
 static inline void
 mbufq_init(struct mbufq *mq, int maxlen)
 {
 
 	mq->mq_len = 0;
 	mq->mq_maxlen = maxlen;
 	STAILQ_INIT(&mq->mq_head);
 }
 
 /*
  * Detach every packet from mq and return the former head of the queue
  * (NULL if it was empty).  The packets remain linked to each other; the
  * queue itself is left empty with a length of zero.
  */
 static inline struct mbuf *
 mbufq_flush(struct mbufq *mq)
 {
 	struct mbuf *head;
 
 	head = STAILQ_FIRST(&mq->mq_head);
 	mq->mq_len = 0;
 	STAILQ_INIT(&mq->mq_head);
 	return (head);
 }
 
 /*
  * Empty mq and free every packet that was on it with m_freem().
  * The link to the following packet is read before the current one is
  * freed.
  */
 static inline void
 mbufq_drain(struct mbufq *mq)
 {
 	struct mbuf *cur, *next;
 
 	for (cur = mbufq_flush(mq); cur != NULL; cur = next) {
 		next = STAILQ_NEXT(cur, m_stailqpkt);
 		m_freem(cur);
 	}
 }
 
 /*
  * Return the packet at the head of mq without removing it.
  */
 static inline struct mbuf *
 mbufq_first(const struct mbufq *mq)
 {
 
 	return (STAILQ_FIRST(&mq->mq_head));
 }
 
 /*
  * Return the packet at the tail of mq without removing it.
  */
 static inline struct mbuf *
 mbufq_last(const struct mbufq *mq)
 {
 
 	return (STAILQ_LAST(&mq->mq_head, mbuf, m_stailqpkt));
 }
 
 /*
  * Return non-zero when mq has reached its length limit.
  * A limit of zero or less means the queue is never considered full.
  */
 static inline int
 mbufq_full(const struct mbufq *mq)
 {
 
 	if (mq->mq_maxlen <= 0)
 		return (0);
 	return (mq->mq_len >= mq->mq_maxlen);
 }
 
 /*
  * Return the number of packets currently accounted for in mq.
  */
 static inline int
 mbufq_len(const struct mbufq *mq)
 {
 
 	return (mq->mq_len);
 }
 
 /*
  * Append packet m to the tail of mq.
  * Returns 0 on success, or ENOBUFS (leaving the queue and m untouched)
  * when mbufq_full() reports that the queue is full.
  */
 static inline int
 mbufq_enqueue(struct mbufq *mq, struct mbuf *m)
 {
 
 	if (!mbufq_full(mq)) {
 		STAILQ_INSERT_TAIL(&mq->mq_head, m, m_stailqpkt);
 		mq->mq_len++;
 		return (0);
 	}
 	return (ENOBUFS);
 }
 
 /*
  * Remove and return the packet at the head of mq, or NULL if the queue
  * is empty.  The m_nextpkt link of the returned packet is cleared.
  */
 static inline struct mbuf *
 mbufq_dequeue(struct mbufq *mq)
 {
 	struct mbuf *head;
 
 	head = STAILQ_FIRST(&mq->mq_head);
 	if (head == NULL)
 		return (NULL);
 
 	STAILQ_REMOVE_HEAD(&mq->mq_head, m_stailqpkt);
 	head->m_nextpkt = NULL;
 	mq->mq_len--;
 	return (head);
 }
 
 /*
  * Insert packet m at the head of mq.
  * Unlike mbufq_enqueue(), this does not check the queue's maximum length.
  */
 static inline void
 mbufq_prepend(struct mbufq *mq, struct mbuf *m)
 {
 
 	STAILQ_INSERT_HEAD(&mq->mq_head, m, m_stailqpkt);
 	mq->mq_len++;
 }
 
 /*
  * Move every packet of mq_src to the tail of mq_dst and account for them
  * in mq_dst's length; mq_src's length is reset to zero.
  *
  * Note: this doesn't enforce the maximum list size for dst.
  */
 static inline void
 mbufq_concat(struct mbufq *mq_dst, struct mbufq *mq_src)
 {
 
 	STAILQ_CONCAT(&mq_dst->mq_head, &mq_src->mq_head);
 	mq_dst->mq_len += mq_src->mq_len;
 	mq_src->mq_len = 0;
 }
 
 #ifdef _SYS_TIMESPEC_H_
 /*
  * Convert the receive timestamp in the packet header of m into *ts.
  * rcv_tstmp is split at 1000000000: the quotient becomes tv_sec and the
  * remainder tv_nsec.  m must carry a packet header and have M_TSTMP or
  * M_TSTMP_LRO set.
  */
 static inline void
 mbuf_tstmp2timespec(struct mbuf *m, struct timespec *ts)
 {
 
 	KASSERT((m->m_flags & M_PKTHDR) != 0, ("mbuf %p no M_PKTHDR", m));
 	KASSERT((m->m_flags & (M_TSTMP|M_TSTMP_LRO)) != 0,
 	    ("mbuf %p no M_TSTMP or M_TSTMP_LRO", m));
 
 	ts->tv_nsec = m->m_pkthdr.rcv_tstmp % 1000000000;
 	ts->tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
 }
 #endif
 
 #ifdef DEBUGNET
 /* Invoked from the debugnet client code. */
 void	debugnet_mbuf_drain(void);
 void	debugnet_mbuf_start(void);
 void	debugnet_mbuf_finish(void);
 void	debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize);
 #endif
 
 /*
  * Report whether m is an M_EXTPG mbuf with a non-NULL m_epg_tls pointer.
  * m_epg_tls is only examined when M_EXTPG is set.
  */
 static inline bool
 mbuf_has_tls_session(struct mbuf *m)
 {
 
 	return ((m->m_flags & M_EXTPG) != 0 && m->m_epg_tls != NULL);
 }
 
 #endif /* _KERNEL */
 #endif /* !_SYS_MBUF_H_ */