Index: stable/11/sys/dev/bnxt/bnxt.h
===================================================================
--- stable/11/sys/dev/bnxt/bnxt.h	(revision 333337)
+++ stable/11/sys/dev/bnxt/bnxt.h	(revision 333338)
@@ -1,592 +1,594 @@
 /*-
  * Broadcom NetXtreme-C/E network driver.
  *
  * Copyright (c) 2016 Broadcom, All Rights Reserved.
  * The term Broadcom refers to Broadcom Limited and/or its subsidiaries
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS'
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifndef _BNXT_H
 #define _BNXT_H
 
 #include <sys/types.h>
 #include <sys/bus.h>
 #include <sys/bus_dma.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/iflib.h>
 
 #include "hsi_struct_def.h"
 
 /* PCI IDs */
 #define BROADCOM_VENDOR_ID	0x14E4
 
 #define BCM57301	0x16c8
 #define BCM57302	0x16c9
 #define BCM57304	0x16ca
 #define BCM57311	0x16ce
 #define BCM57312	0x16cf
 #define BCM57314	0x16df
 #define BCM57402	0x16d0
 #define BCM57402_NPAR	0x16d4
 #define BCM57404	0x16d1
 #define BCM57404_NPAR	0x16e7
 #define BCM57406	0x16d2
 #define BCM57406_NPAR	0x16e8
 #define BCM57407	0x16d5
 #define BCM57407_NPAR	0x16ea
 #define BCM57407_SFP	0x16e9
 #define BCM57412	0x16d6
 #define BCM57412_NPAR1	0x16de
 #define BCM57412_NPAR2	0x16eb
 #define BCM57414	0x16d7
 #define BCM57414_NPAR1	0x16ec
 #define BCM57414_NPAR2	0x16ed
 #define BCM57416	0x16d8
 #define BCM57416_NPAR1	0x16ee
 #define BCM57416_NPAR2	0x16ef
 #define BCM57416_SFP	0x16e3
 #define BCM57417	0x16d9
 #define BCM57417_NPAR1	0x16c0
 #define BCM57417_NPAR2	0x16cc
 #define BCM57417_SFP	0x16e2
 #define BCM57454	0x1614
 #define BCM58700	0x16cd
 #define NETXTREME_C_VF1	0x16cb
 #define NETXTREME_C_VF2	0x16e1
 #define NETXTREME_C_VF3	0x16e5
 #define NETXTREME_E_VF1	0x16c1
 #define NETXTREME_E_VF2	0x16d3
 #define NETXTREME_E_VF3	0x16dc
 
 #define CSUM_OFFLOAD		(CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \
 				 CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \
 				 CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP)
 
 #define BNXT_MAX_MTU	9000
 
 /* Completion related defines */
 /*
  * True when the CMPL_BASE_V bit in the info3_v word of the completion entry
  * at cmp matches v_bit.  Both sides are normalised to 0/1 with !! before
  * being compared, so v_bit may be any zero/non-zero value.
  */
 #define CMP_VALID(cmp, v_bit) \
 	((!!(((struct cmpl_base *)(cmp))->info3_v & htole32(CMPL_BASE_V))) == !!(v_bit) )
 
 /*
  * Advance the consumer index cons by one.  When it reaches
  * (ring)->ring_size it wraps to 0 and v_bit is logically inverted.
  * cons and v_bit must be modifiable lvalues; each may be evaluated more
  * than once.
  */
 #define NEXT_CP_CONS_V(ring, cons, v_bit) do {				    \
 	if (__predict_false(++(cons) == (ring)->ring_size))		    \
 		((cons) = 0, (v_bit) = !(v_bit));			    \
 } while (0)
 
 /*
  * Index that follows idx on ring, wrapping to 0 at (ring)->ring_size.
  * idx is evaluated twice, so it must not have side effects.
  */
 #define RING_NEXT(ring, idx) \
 	(__predict_false((idx) + 1 == (ring)->ring_size) ? 0 : (idx) + 1)
 
 /*
  * Prefetch the completion entry that lies
  * CACHE_LINE_SIZE / sizeof(struct cmpl_base) entries past idx in the
  * completion ring cpr.  The index is wrapped by masking with
  * ring_size - 1, which is only correct when ring_size is a power of two.
  */
 #define CMPL_PREFETCH_NEXT(cpr, idx)					    \
 	__builtin_prefetch(&((struct cmpl_base *)(cpr)->ring.vaddr)[((idx) +\
 	    (CACHE_LINE_SIZE / sizeof(struct cmpl_base))) &		    \
 	    ((cpr)->ring.ring_size - 1)])
 
 /*
  * If we update the index, a write barrier is needed after the write to ensure
  * the completion ring has space before the RX/TX ring does.  Since we can't
  * make the RX and AG doorbells covered by the same barrier without remapping
  * MSI-X vectors, we create the barrier over the entire doorbell bar.
  * TODO: Remap the MSI-X vectors to allow a barrier to only cover the doorbells
  *       for a single ring group.
  *
  * A barrier of just the size of the write is used to ensure the ordering
  * remains correct and no writes are lost.
  */
 /*
  * Write CMPL_DOORBELL_KEY_CMPL | CMPL_DOORBELL_MASK to the ring's doorbell
  * register without supplying a consumer index.  Two write barriers are
  * issued before the doorbell write: one over the 4-byte doorbell register
  * and one over the whole doorbell BAR.
  * NOTE(review): CMPL_DOORBELL_MASK presumably masks the ring's interrupt;
  * confirm against the HSI definition.
  */
 #define BNXT_CP_DISABLE_DB(ring) do {					    \
 	bus_space_barrier((ring)->softc->doorbell_bar.tag,		    \
 	    (ring)->softc->doorbell_bar.handle, (ring)->doorbell, 4,	    \
 	    BUS_SPACE_BARRIER_WRITE);					    \
 	bus_space_barrier((ring)->softc->doorbell_bar.tag,		    \
 	    (ring)->softc->doorbell_bar.handle, 0,			    \
 	    (ring)->softc->doorbell_bar.size, BUS_SPACE_BARRIER_WRITE);	    \
 	bus_space_write_4((ring)->softc->doorbell_bar.tag,		    \
 	    (ring)->softc->doorbell_bar.handle, (ring)->doorbell,	    \
 	    htole32(CMPL_DOORBELL_KEY_CMPL | CMPL_DOORBELL_MASK));	    \
 } while (0)
 
 /*
  * Write CMPL_DOORBELL_KEY_CMPL alone (no mask bit, no consumer index) to
  * the ring's doorbell register.  As in BNXT_CP_DISABLE_DB, both the
  * register-sized and the whole-BAR write barriers precede the write.
  */
 #define BNXT_CP_ENABLE_DB(ring) do {					    \
 	bus_space_barrier((ring)->softc->doorbell_bar.tag,		    \
 	    (ring)->softc->doorbell_bar.handle, (ring)->doorbell, 4,	    \
 	    BUS_SPACE_BARRIER_WRITE);					    \
 	bus_space_barrier((ring)->softc->doorbell_bar.tag,		    \
 	    (ring)->softc->doorbell_bar.handle, 0,			    \
 	    (ring)->softc->doorbell_bar.size, BUS_SPACE_BARRIER_WRITE);	    \
 	bus_space_write_4((ring)->softc->doorbell_bar.tag,		    \
 	    (ring)->softc->doorbell_bar.handle, (ring)->doorbell,	    \
 	    htole32(CMPL_DOORBELL_KEY_CMPL));				    \
 } while (0)
 
 /*
  * Publish consumer index cons to the completion doorbell:
  * CMPL_DOORBELL_KEY_CMPL | CMPL_DOORBELL_IDX_VALID | cons, with the mask
  * bit clear.  A register-sized write barrier precedes the doorbell write
  * and a whole-BAR write barrier follows it.
  */
 #define BNXT_CP_IDX_ENABLE_DB(ring, cons) do {				    \
 	bus_space_barrier((ring)->softc->doorbell_bar.tag,		    \
 	    (ring)->softc->doorbell_bar.handle, (ring)->doorbell, 4,	    \
 	    BUS_SPACE_BARRIER_WRITE);					    \
 	bus_space_write_4((ring)->softc->doorbell_bar.tag,		    \
 	    (ring)->softc->doorbell_bar.handle, (ring)->doorbell,	    \
 	    htole32(CMPL_DOORBELL_KEY_CMPL | CMPL_DOORBELL_IDX_VALID |	    \
 	    (cons)));							    \
 	bus_space_barrier((ring)->softc->doorbell_bar.tag,		    \
 	    (ring)->softc->doorbell_bar.handle, 0,			    \
 	    (ring)->softc->doorbell_bar.size, BUS_SPACE_BARRIER_WRITE);	    \
 } while (0)
 
 /*
  * Publish consumer index cons to the completion doorbell with
  * CMPL_DOORBELL_MASK also set:
  * CMPL_DOORBELL_KEY_CMPL | CMPL_DOORBELL_IDX_VALID | CMPL_DOORBELL_MASK |
  * cons.  A register-sized write barrier precedes the doorbell write and a
  * whole-BAR write barrier follows it.
  */
 #define BNXT_CP_IDX_DISABLE_DB(ring, cons) do {				    \
 	bus_space_barrier((ring)->softc->doorbell_bar.tag,		    \
 	    (ring)->softc->doorbell_bar.handle, (ring)->doorbell, 4,	    \
 	    BUS_SPACE_BARRIER_WRITE);					    \
 	bus_space_write_4((ring)->softc->doorbell_bar.tag,		    \
 	    (ring)->softc->doorbell_bar.handle, (ring)->doorbell,	    \
 	    htole32(CMPL_DOORBELL_KEY_CMPL | CMPL_DOORBELL_IDX_VALID |	    \
 	    CMPL_DOORBELL_MASK | (cons)));				    \
 	bus_space_barrier((ring)->softc->doorbell_bar.tag,		    \
 	    (ring)->softc->doorbell_bar.handle, 0,			    \
 	    (ring)->softc->doorbell_bar.size, BUS_SPACE_BARRIER_WRITE);	    \
 } while (0)
 
 /*
  * Ring the TX doorbell: a write barrier over the 4-byte doorbell register,
  * then a write of TX_DOORBELL_KEY_TX | idx to it.  No whole-BAR barrier is
  * issued here.
  */
 #define BNXT_TX_DB(ring, idx) do {					    \
 	bus_space_barrier((ring)->softc->doorbell_bar.tag,		    \
 	    (ring)->softc->doorbell_bar.handle, (ring)->doorbell, 4,	    \
 	    BUS_SPACE_BARRIER_WRITE);					    \
 	bus_space_write_4(						    \
 	    (ring)->softc->doorbell_bar.tag,				    \
 	    (ring)->softc->doorbell_bar.handle,				    \
 	    (ring)->doorbell, htole32(TX_DOORBELL_KEY_TX | (idx)));	    \
 } while (0)
 
 /*
  * Ring the RX doorbell: a write barrier over the 4-byte doorbell register,
  * then a write of RX_DOORBELL_KEY_RX | idx to it.  No whole-BAR barrier is
  * issued here.
  */
 #define BNXT_RX_DB(ring, idx) do {					    \
 	bus_space_barrier((ring)->softc->doorbell_bar.tag,		    \
 	    (ring)->softc->doorbell_bar.handle, (ring)->doorbell, 4,	    \
 	    BUS_SPACE_BARRIER_WRITE);					    \
 	bus_space_write_4(						    \
 	    (ring)->softc->doorbell_bar.tag,				    \
 	    (ring)->softc->doorbell_bar.handle,				    \
 	    (ring)->doorbell, htole32(RX_DOORBELL_KEY_RX | (idx)));	    \
 } while (0)
 
 /* Lock macros */
 /*
  * Wrappers around the softc's hwrm_lock mutex.  It is initialised as an
  * MTX_DEF mutex; _name is passed as mtx_init()'s name argument and
  * "BNXT HWRM Lock" as its type argument.
  */
 #define BNXT_HWRM_LOCK_INIT(_softc, _name) \
     mtx_init(&(_softc)->hwrm_lock, _name, "BNXT HWRM Lock", MTX_DEF)
 #define BNXT_HWRM_LOCK(_softc)		mtx_lock(&(_softc)->hwrm_lock)
 #define BNXT_HWRM_UNLOCK(_softc)	mtx_unlock(&(_softc)->hwrm_lock)
 #define BNXT_HWRM_LOCK_DESTROY(_softc)	mtx_destroy(&(_softc)->hwrm_lock)
 /* Asserts that the calling thread currently owns hwrm_lock. */
 #define BNXT_HWRM_LOCK_ASSERT(_softc)	mtx_assert(&(_softc)->hwrm_lock,    \
     MA_OWNED)
 
 /* Chip info */
 #define BNXT_TSO_SIZE	UINT16_MAX
 
+#define BNXT_MIN_FRAME_SIZE	52	/* Frames must be padded to this size for some A0 chips */
+
 /* NVRAM access */
 enum bnxt_nvm_directory_type {
 	BNX_DIR_TYPE_UNUSED = 0,
 	BNX_DIR_TYPE_PKG_LOG = 1,
 	BNX_DIR_TYPE_UPDATE = 2,
 	BNX_DIR_TYPE_CHIMP_PATCH = 3,
 	BNX_DIR_TYPE_BOOTCODE = 4,
 	BNX_DIR_TYPE_VPD = 5,
 	BNX_DIR_TYPE_EXP_ROM_MBA = 6,
 	BNX_DIR_TYPE_AVS = 7,
 	BNX_DIR_TYPE_PCIE = 8,
 	BNX_DIR_TYPE_PORT_MACRO = 9,
 	BNX_DIR_TYPE_APE_FW = 10,
 	BNX_DIR_TYPE_APE_PATCH = 11,
 	BNX_DIR_TYPE_KONG_FW = 12,
 	BNX_DIR_TYPE_KONG_PATCH = 13,
 	BNX_DIR_TYPE_BONO_FW = 14,
 	BNX_DIR_TYPE_BONO_PATCH = 15,
 	BNX_DIR_TYPE_TANG_FW = 16,
 	BNX_DIR_TYPE_TANG_PATCH = 17,
 	BNX_DIR_TYPE_BOOTCODE_2 = 18,
 	BNX_DIR_TYPE_CCM = 19,
 	BNX_DIR_TYPE_PCI_CFG = 20,
 	BNX_DIR_TYPE_TSCF_UCODE = 21,
 	BNX_DIR_TYPE_ISCSI_BOOT = 22,
 	BNX_DIR_TYPE_ISCSI_BOOT_IPV6 = 24,
 	BNX_DIR_TYPE_ISCSI_BOOT_IPV4N6 = 25,
 	BNX_DIR_TYPE_ISCSI_BOOT_CFG6 = 26,
 	BNX_DIR_TYPE_EXT_PHY = 27,
 	BNX_DIR_TYPE_SHARED_CFG = 40,
 	BNX_DIR_TYPE_PORT_CFG = 41,
 	BNX_DIR_TYPE_FUNC_CFG = 42,
 	BNX_DIR_TYPE_MGMT_CFG = 48,
 	BNX_DIR_TYPE_MGMT_DATA = 49,
 	BNX_DIR_TYPE_MGMT_WEB_DATA = 50,
 	BNX_DIR_TYPE_MGMT_WEB_META = 51,
 	BNX_DIR_TYPE_MGMT_EVENT_LOG = 52,
 	BNX_DIR_TYPE_MGMT_AUDIT_LOG = 53
 };
 
 enum bnxnvm_pkglog_field_index {
 	BNX_PKG_LOG_FIELD_IDX_INSTALLED_TIMESTAMP	= 0,
 	BNX_PKG_LOG_FIELD_IDX_PKG_DESCRIPTION		= 1,
 	BNX_PKG_LOG_FIELD_IDX_PKG_VERSION		= 2,
 	BNX_PKG_LOG_FIELD_IDX_PKG_TIMESTAMP		= 3,
 	BNX_PKG_LOG_FIELD_IDX_PKG_CHECKSUM		= 4,
 	BNX_PKG_LOG_FIELD_IDX_INSTALLED_ITEMS		= 5,
 	BNX_PKG_LOG_FIELD_IDX_INSTALLED_MASK		= 6
 };
 
 #define BNX_DIR_ORDINAL_FIRST		0
 #define BNX_DIR_EXT_NONE		0
 
 /*
  * Bookkeeping for one mapped PCI BAR.  tag and handle are the values the
  * doorbell macros and the HWRM code pass to bus_space_*(); size is used as
  * the length of the whole-BAR write barrier.
  */
 struct bnxt_bar_info {
 	struct resource		*res;	/* presumably the bus resource backing the BAR; allocation is not shown here */
 	bus_space_tag_t		tag;
 	bus_space_handle_t	handle;
 	bus_size_t		size;
 	int			rid;	/* presumably the resource ID of res; confirm at the allocation site */
 };
 
 struct bnxt_link_info {
 	uint8_t		media_type;
 	uint8_t		transceiver;
 	uint8_t		phy_addr;
 	uint8_t		phy_link_status;
 	uint8_t		wire_speed;
 	uint8_t		loop_back;
 	uint8_t		link_up;
 	uint8_t		last_link_up;
 	uint8_t		duplex;
 	uint8_t		last_duplex;
 	uint8_t		pause;
 	uint8_t		last_pause;
 	uint8_t		auto_pause;
 	uint8_t		force_pause;
 	uint8_t		duplex_setting;
 	uint8_t		auto_mode;
 #define PHY_VER_LEN		3
 	uint8_t		phy_ver[PHY_VER_LEN];
 	uint8_t		phy_type;
 	uint16_t	link_speed;
 	uint16_t	support_speeds;
 	uint16_t	auto_link_speeds;
 	uint16_t	auto_link_speed;
 	uint16_t	force_link_speed;
 	uint32_t	preemphasis;
 
 	/* copy of requested setting */
 	uint8_t		autoneg;
 #define BNXT_AUTONEG_SPEED	1
 #define BNXT_AUTONEG_FLOW_CTRL	2
 	uint8_t		req_duplex;
 	uint8_t		req_flow_ctrl;
 	uint16_t	req_link_speed;
 };
 
 enum bnxt_cp_type {
 	BNXT_DEFAULT,
 	BNXT_TX,
 	BNXT_RX,
 	BNXT_SHARED
 };
 
 struct bnxt_cos_queue {
 	uint8_t	id;
 	uint8_t	profile;
 };
 
 struct bnxt_func_info {
 	uint32_t	fw_fid;
 	uint8_t		mac_addr[ETHER_ADDR_LEN];
 	uint16_t	max_rsscos_ctxs;
 	uint16_t	max_cp_rings;
 	uint16_t	max_tx_rings;
 	uint16_t	max_rx_rings;
 	uint16_t	max_hw_ring_grps;
 	uint16_t	max_irqs;
 	uint16_t	max_l2_ctxs;
 	uint16_t	max_vnics;
 	uint16_t	max_stat_ctxs;
 };
 
 struct bnxt_pf_info {
 #define BNXT_FIRST_PF_FID	1
 #define BNXT_FIRST_VF_FID	128
 	uint8_t		port_id;
 	uint32_t	first_vf_id;
 	uint16_t	active_vfs;
 	uint16_t	max_vfs;
 	uint32_t	max_encap_records;
 	uint32_t	max_decap_records;
 	uint32_t	max_tx_em_flows;
 	uint32_t	max_tx_wm_flows;
 	uint32_t	max_rx_em_flows;
 	uint32_t	max_rx_wm_flows;
 	unsigned long	*vf_event_bmap;
 	uint16_t	hwrm_cmd_req_pages;
 	void		*hwrm_cmd_req_addr[4];
 	bus_addr_t	hwrm_cmd_req_dma_addr[4];
 };
 
 struct bnxt_vf_info {
 	uint16_t	fw_fid;
 	uint8_t		mac_addr[ETHER_ADDR_LEN];
 	uint16_t	max_rsscos_ctxs;
 	uint16_t	max_cp_rings;
 	uint16_t	max_tx_rings;
 	uint16_t	max_rx_rings;
 	uint16_t	max_hw_ring_grps;
 	uint16_t	max_l2_ctxs;
 	uint16_t	max_irqs;
 	uint16_t	max_vnics;
 	uint16_t	max_stat_ctxs;
 	uint32_t	vlan;
 #define BNXT_VF_QOS		0x1
 #define BNXT_VF_SPOOFCHK	0x2
 #define BNXT_VF_LINK_FORCED	0x4
 #define BNXT_VF_LINK_UP		0x8
 	uint32_t	flags;
 	uint32_t	func_flags; /* func cfg flags */
 	uint32_t	min_tx_rate;
 	uint32_t	max_tx_rate;
 	void		*hwrm_cmd_req_addr;
 	bus_addr_t	hwrm_cmd_req_dma_addr;
 };
 
 #define BNXT_FLAG_VF		(1<<1)
 
 #define BNXT_PF(softc)		(!((softc)->flags & BNXT_FLAG_VF))
 #define BNXT_VF(softc)		((softc)->flags & BNXT_FLAG_VF)
 
 struct bnxt_vlan_tag {
 	SLIST_ENTRY(bnxt_vlan_tag) next;
 	uint16_t	tpid;
 	uint16_t	tag;
 };
 
 struct bnxt_vnic_info {
 	uint16_t	id;
 	uint16_t	def_ring_grp;
 	uint16_t	cos_rule;
 	uint16_t	lb_rule;
 	uint16_t	mru;
 
 	uint32_t	rx_mask;
 	bool		vlan_only;
 	struct iflib_dma_info mc_list;
 	int		mc_list_count;
 #define BNXT_MAX_MC_ADDRS		16
 
 	uint32_t	flags;
 #define BNXT_VNIC_FLAG_DEFAULT		0x01
 #define BNXT_VNIC_FLAG_BD_STALL		0x02
 #define BNXT_VNIC_FLAG_VLAN_STRIP	0x04
 
 	uint64_t	filter_id;
 	uint32_t	flow_id;
 
 	uint16_t	rss_id;
 	uint32_t	rss_hash_type;
 	uint8_t		rss_hash_key[HW_HASH_KEY_SIZE];
 	struct iflib_dma_info rss_hash_key_tbl;
 	struct iflib_dma_info	rss_grp_tbl;
 	SLIST_HEAD(vlan_head, bnxt_vlan_tag) vlan_tags;
 	struct iflib_dma_info vlan_tag_list;
 };
 
 struct bnxt_grp_info {
 	uint16_t	stats_ctx;
 	uint16_t	grp_id;
 	uint16_t	rx_ring_id;
 	uint16_t	cp_ring_id;
 	uint16_t	ag_ring_id;
 };
 
 struct bnxt_ring {
 	uint64_t		paddr;
 	vm_offset_t		doorbell;
 	caddr_t			vaddr;
 	struct bnxt_softc	*softc;
 	uint32_t		ring_size;	/* Must be a power of two */
 	uint16_t		id;		/* Logical ID */
 	uint16_t		phys_id;
+	struct bnxt_full_tpa_start *tpa_start;
 };
 
 /*
  * Completion ring state: the underlying ring plus the consumer index and
  * the valid-bit value currently expected (see NEXT_CP_CONS_V and
  * CMP_VALID), along with its interrupt and statistics context.
  */
 struct bnxt_cp_ring {
 	struct bnxt_ring	ring;
 	struct if_irq		irq;
 	uint32_t		cons;
 	bool			v_bit;		/* Value of valid bit */
 	struct ctx_hw_stats	*stats;
 	uint32_t		stats_ctx_id;
 	uint32_t		last_idx;	/* Used by RX rings only
 						 * set to the last read pidx
 						 */
 };
 
 /*
  * Holds both parts of an RX TPA-start completion side by side: the base
  * record (low) and its _hi counterpart (high).
  */
 struct bnxt_full_tpa_start {
 	struct rx_tpa_start_cmpl low;
 	struct rx_tpa_start_cmpl_hi high;
 };
 
 /* All the version information for the part */
 #define BNXT_VERSTR_SIZE	(3*3+2+1)	/* ie: "255.255.255\0" */
 #define BNXT_NAME_SIZE		17
 struct bnxt_ver_info {
 	uint8_t		hwrm_if_major;
 	uint8_t		hwrm_if_minor;
 	uint8_t		hwrm_if_update;
 	char		hwrm_if_ver[BNXT_VERSTR_SIZE];
 	char		driver_hwrm_if_ver[BNXT_VERSTR_SIZE];
 	char		hwrm_fw_ver[BNXT_VERSTR_SIZE];
 	char		mgmt_fw_ver[BNXT_VERSTR_SIZE];
 	char		netctrl_fw_ver[BNXT_VERSTR_SIZE];
 	char		roce_fw_ver[BNXT_VERSTR_SIZE];
 	char		phy_ver[BNXT_VERSTR_SIZE];
 	char		pkg_ver[64];
 
 	char		hwrm_fw_name[BNXT_NAME_SIZE];
 	char		mgmt_fw_name[BNXT_NAME_SIZE];
 	char		netctrl_fw_name[BNXT_NAME_SIZE];
 	char		roce_fw_name[BNXT_NAME_SIZE];
 	char		phy_vendor[BNXT_NAME_SIZE];
 	char		phy_partnumber[BNXT_NAME_SIZE];
 
 	uint16_t	chip_num;
 	uint8_t		chip_rev;
 	uint8_t		chip_metal;
 	uint8_t		chip_bond_id;
 	uint8_t		chip_type;
 
 	uint8_t		hwrm_min_major;
 	uint8_t		hwrm_min_minor;
 	uint8_t		hwrm_min_update;
 
 	struct sysctl_ctx_list	ver_ctx;
 	struct sysctl_oid	*ver_oid;
 };
 
 struct bnxt_nvram_info {
 	uint16_t	mfg_id;
 	uint16_t	device_id;
 	uint32_t	sector_size;
 	uint32_t	size;
 	uint32_t	reserved_size;
 	uint32_t	available_size;
 
 	struct sysctl_ctx_list	nvm_ctx;
 	struct sysctl_oid	*nvm_oid;
 };
 
 struct bnxt_softc {
 	device_t	dev;
 	if_ctx_t	ctx;
 	if_softc_ctx_t	scctx;
 	if_shared_ctx_t	sctx;
 	struct ifmedia	*media;
 
 	struct bnxt_bar_info	hwrm_bar;
 	struct bnxt_bar_info	doorbell_bar;
 	struct bnxt_link_info	link_info;
 #define BNXT_FLAG_NPAR		1
 	uint32_t		flags;
 	uint32_t		total_msix;
 
 	struct bnxt_func_info	func;
 	struct bnxt_pf_info	pf;
 	struct bnxt_vf_info	vf;
 
 	uint16_t		hwrm_cmd_seq;
 	uint32_t		hwrm_cmd_timeo;	/* milliseconds */
 	struct iflib_dma_info	hwrm_cmd_resp;
 	/* Interrupt info for HWRM */
 	struct if_irq		irq;
 	struct mtx		hwrm_lock;
 	uint16_t		hwrm_max_req_len;
 
 #define BNXT_MAX_QUEUE		8
 	uint8_t			max_tc;
 	struct bnxt_cos_queue	q_info[BNXT_MAX_QUEUE];
 
 	struct iflib_dma_info	hw_rx_port_stats;
 	struct iflib_dma_info	hw_tx_port_stats;
 	struct rx_port_stats	*rx_port_stats;
 	struct tx_port_stats	*tx_port_stats;
 
 	int			num_cp_rings;
 
 	struct bnxt_ring	*tx_rings;
 	struct bnxt_cp_ring	*tx_cp_rings;
 	struct iflib_dma_info	tx_stats;
 	int			ntxqsets;
 
 	struct bnxt_vnic_info	vnic_info;
 	struct bnxt_ring	*ag_rings;
 	struct bnxt_ring	*rx_rings;
 	struct bnxt_cp_ring	*rx_cp_rings;
 	struct bnxt_grp_info	*grp_info;
 	struct iflib_dma_info	rx_stats;
 	int			nrxqsets;
 
 	struct bnxt_cp_ring	def_cp_ring;
 	struct iflib_dma_info	def_cp_ring_mem;
 	struct grouptask	def_cp_task;
 
 	struct sysctl_ctx_list	hw_stats;
 	struct sysctl_oid	*hw_stats_oid;
 
-	struct bnxt_full_tpa_start *tpa_start;
 	struct bnxt_ver_info	*ver_info;
 	struct bnxt_nvram_info	*nvm_info;
 };
 
 struct bnxt_filter_info {
 	STAILQ_ENTRY(bnxt_filter_info) next;
 	uint64_t	fw_l2_filter_id;
 #define INVALID_MAC_INDEX ((uint16_t)-1)
 	uint16_t	mac_index;
 
 	/* Filter Characteristics */
 	uint32_t	flags;
 	uint32_t	enables;
 	uint8_t		l2_addr[ETHER_ADDR_LEN];
 	uint8_t		l2_addr_mask[ETHER_ADDR_LEN];
 	uint16_t	l2_ovlan;
 	uint16_t	l2_ovlan_mask;
 	uint16_t	l2_ivlan;
 	uint16_t	l2_ivlan_mask;
 	uint8_t		t_l2_addr[ETHER_ADDR_LEN];
 	uint8_t		t_l2_addr_mask[ETHER_ADDR_LEN];
 	uint16_t	t_l2_ovlan;
 	uint16_t	t_l2_ovlan_mask;
 	uint16_t	t_l2_ivlan;
 	uint16_t	t_l2_ivlan_mask;
 	uint8_t		tunnel_type;
 	uint16_t	mirror_vnic_id;
 	uint32_t	vni;
 	uint8_t		pri_hint;
 	uint64_t	l2_filter_id_hint;
 };
 
 /* Function declarations */
 void bnxt_report_link(struct bnxt_softc *softc);
 bool bnxt_check_hwrm_version(struct bnxt_softc *softc);
 
 #endif /* _BNXT_H */
Index: stable/11/sys/dev/bnxt/bnxt_hwrm.c
===================================================================
--- stable/11/sys/dev/bnxt/bnxt_hwrm.c	(revision 333337)
+++ stable/11/sys/dev/bnxt/bnxt_hwrm.c	(revision 333338)
@@ -1,1485 +1,1485 @@
 /*-
  * Broadcom NetXtreme-C/E network driver.
  *
  * Copyright (c) 2016 Broadcom, All Rights Reserved.
  * The term Broadcom refers to Broadcom Limited and/or its subsidiaries
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS'
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/endian.h>
 
 #include "bnxt.h"
 #include "bnxt_hwrm.h"
 #include "hsi_struct_def.h"
 
 static int bnxt_hwrm_err_map(uint16_t err);
 static inline int _is_valid_ether_addr(uint8_t *);
 static inline void get_random_ether_addr(uint8_t *);
 static void	bnxt_hwrm_set_link_common(struct bnxt_softc *softc,
 		    struct hwrm_port_phy_cfg_input *req);
 static void	bnxt_hwrm_set_pause_common(struct bnxt_softc *softc,
 		    struct hwrm_port_phy_cfg_input *req);
 static void	bnxt_hwrm_set_eee(struct bnxt_softc *softc,
 		    struct hwrm_port_phy_cfg_input *req);
 static int	_hwrm_send_message(struct bnxt_softc *, void *, uint32_t);
 static int	hwrm_send_message(struct bnxt_softc *, void *, uint32_t);
 static void bnxt_hwrm_cmd_hdr_init(struct bnxt_softc *, void *, uint16_t);
 
 /* NVRam stuff has a five minute timeout */
 #define BNXT_NVM_TIMEO	(5 * 60 * 1000)
 
 /*
  * Translate an HWRM firmware error code into an errno value.
  *
  * Returns 0 for HWRM_ERR_CODE_SUCCESS; EINVAL, EACCES, ENOMEM, ENOSYS or EIO
  * for the codes listed below; and EDOOFUS for HWRM_ERR_CODE_HWRM_ERROR,
  * HWRM_ERR_CODE_UNKNOWN_ERR and any code that is not listed.
  */
 static int
 bnxt_hwrm_err_map(uint16_t err)
 {
 	/* Every case returns, so no fall-through return is needed. */
 	switch (err) {
 	case HWRM_ERR_CODE_SUCCESS:
 		return 0;
 	case HWRM_ERR_CODE_INVALID_PARAMS:
 	case HWRM_ERR_CODE_INVALID_FLAGS:
 	case HWRM_ERR_CODE_INVALID_ENABLES:
 		return EINVAL;
 	case HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED:
 		return EACCES;
 	case HWRM_ERR_CODE_RESOURCE_ALLOC_ERROR:
 		return ENOMEM;
 	case HWRM_ERR_CODE_CMD_NOT_SUPPORTED:
 		return ENOSYS;
 	case HWRM_ERR_CODE_FAIL:
 		return EIO;
 	case HWRM_ERR_CODE_HWRM_ERROR:
 	case HWRM_ERR_CODE_UNKNOWN_ERR:
 	default:
 		return EDOOFUS;
 	}
 }
 
 /*
  * Allocate the PAGE_SIZE DMA buffer softc->hwrm_cmd_resp with
  * BUS_DMA_NOWAIT.  Returns whatever iflib_dma_alloc() returns.
  */
 int
 bnxt_alloc_hwrm_dma_mem(struct bnxt_softc *softc)
 {
 	return (iflib_dma_alloc(softc->ctx, PAGE_SIZE, &softc->hwrm_cmd_resp,
 	    BUS_DMA_NOWAIT));
 }
 
 void
 bnxt_free_hwrm_dma_mem(struct bnxt_softc *softc)
 {
 	if (softc->hwrm_cmd_resp.idi_vaddr)
 		iflib_dma_free(&softc->hwrm_cmd_resp);
 	softc->hwrm_cmd_resp.idi_vaddr = NULL;
 	return;
 }
 
 /*
  * Fill in the common HWRM request header at the start of request: the
  * request type, 0xffff for both cmpl_ring and target_id, and the bus
  * address of the shared response buffer.
  */
 static void
 bnxt_hwrm_cmd_hdr_init(struct bnxt_softc *softc, void *request,
     uint16_t req_type)
 {
 	struct input *hdr = request;
 
 	hdr->resp_addr = htole64(softc->hwrm_cmd_resp.idi_paddr);
 	hdr->target_id = 0xffff;
 	hdr->cmpl_ring = 0xffff;
 	hdr->req_type = htole16(req_type);
 }
 
 /*
  * Send one HWRM request through the HWRM BAR and poll for its response in
  * softc->hwrm_cmd_resp.
  *
  * msg points to a request that begins with a struct input header; msg_len
  * is its length in bytes and is written out in 4-byte units.  The function
  * does not take the HWRM lock itself; the callers in this file invoke it
  * with BNXT_HWRM_LOCK held.
  *
  * Returns 0 on success, ETIMEDOUT if either polling loop runs for
  * hwrm_cmd_timeo iterations, or the errno produced by bnxt_hwrm_err_map()
  * for a non-zero firmware error code.
  */
 static int
 _hwrm_send_message(struct bnxt_softc *softc, void *msg, uint32_t msg_len)
 {
 	struct input *req = msg;
 	struct hwrm_err_output *resp = (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	uint32_t *data = msg;
 	int i;
 	uint16_t cp_ring_id;
 	uint8_t *valid;
 	uint16_t err;
 
 	/* TODO: DMASYNC in here. */
 	req->seq_id = htole16(softc->hwrm_cmd_seq++);
 	/* Zero the response page so resp_len and the valid byte start at 0. */
 	memset(resp, 0, PAGE_SIZE);
 	/* NOTE(review): cp_ring_id is assigned but never read in this function. */
 	cp_ring_id = le16toh(req->cmpl_ring);
 
 	/* Write request msg to hwrm channel */
 	for (i = 0; i < msg_len; i += 4) {
 		bus_space_write_4(softc->hwrm_bar.tag,
 				  softc->hwrm_bar.handle,
 				  i, *data);
 		data++;
 	}
 
 	/*
 	 * Clear to the end of the request buffer.  The bound is the
 	 * compile-time HWRM_MAX_REQ_LEN, not softc->hwrm_max_req_len.
 	 */
 	for (i = msg_len; i < HWRM_MAX_REQ_LEN; i += 4)
 		bus_space_write_4(softc->hwrm_bar.tag, softc->hwrm_bar.handle,
 		    i, 0);
 
 	/* Ring channel doorbell */
 	bus_space_write_4(softc->hwrm_bar.tag,
 			  softc->hwrm_bar.handle,
 			  0x100, htole32(1));
 
 	/*
 	 * Check if response len is updated.  One DELAY(1000) per iteration,
 	 * at most hwrm_cmd_timeo iterations.
 	 * NOTE(review): resp_len is read without a byte swap, unlike
 	 * error_code below -- confirm the field's width and endianness.
 	 */
 	for (i = 0; i < softc->hwrm_cmd_timeo; i++) {
 		if (resp->resp_len && resp->resp_len <= 4096)
 			break;
 		DELAY(1000);
 	}
 	if (i >= softc->hwrm_cmd_timeo) {
 		device_printf(softc->dev,
 		    "Timeout sending %s: (timeout: %u) seq: %d\n",
 		    GET_HWRM_REQ_TYPE(req->req_type), softc->hwrm_cmd_timeo,
 		    le16toh(req->seq_id));
 		return ETIMEDOUT;
 	}
 	/*
 	 * Last byte of resp contains the valid key.  This second wait has
 	 * its own budget of hwrm_cmd_timeo iterations.
 	 */
 	valid = (uint8_t *)resp + resp->resp_len - 1;
 	for (i = 0; i < softc->hwrm_cmd_timeo; i++) {
 		if (*valid == HWRM_RESP_VALID_KEY)
 			break;
 		DELAY(1000);
 	}
 	if (i >= softc->hwrm_cmd_timeo) {
 		device_printf(softc->dev, "Timeout sending %s: "
 		    "(timeout: %u) msg {0x%x 0x%x} len:%d v: %d\n",
 		    GET_HWRM_REQ_TYPE(req->req_type),
 		    softc->hwrm_cmd_timeo, le16toh(req->req_type),
 		    le16toh(req->seq_id), msg_len,
 		    *valid);
 		return ETIMEDOUT;
 	}
 
 	err = le16toh(resp->error_code);
 	if (err) {
 		/* HWRM_ERR_CODE_FAIL is a "normal" error, don't log */
 		if (err != HWRM_ERR_CODE_FAIL) {
 			device_printf(softc->dev,
 			    "%s command returned %s error.\n",
 			    GET_HWRM_REQ_TYPE(req->req_type),
 			    GET_HWRM_ERROR_CODE(err));
 		}
 		return bnxt_hwrm_err_map(err);
 	}
 
 	return 0;
 }
 
 /*
  * Locked wrapper: holds the HWRM lock for the duration of one
  * _hwrm_send_message() call and returns its result.
  */
 static int
 hwrm_send_message(struct bnxt_softc *softc, void *msg, uint32_t msg_len)
 {
 	int error;
 
 	BNXT_HWRM_LOCK(softc);
 	error = _hwrm_send_message(softc, msg, msg_len);
 	BNXT_HWRM_UNLOCK(softc);
 
 	return (error);
 }
 
 /*
  * Issue HWRM_QUEUE_QPORTCFG and cache the result in softc->max_tc and
  * softc->q_info[].
  *
  * max_tc is clamped to BNXT_MAX_QUEUE.  The (id, profile) byte pairs are
  * read sequentially from the response, starting at queue_id0.  The
  * response buffer is read while the HWRM lock is held.
  *
  * Returns 0 on success, the error from _hwrm_send_message() if the command
  * fails, or EINVAL when the firmware reports zero configurable queues.
  */
 int
 bnxt_hwrm_queue_qportcfg(struct bnxt_softc *softc)
 {
 	struct hwrm_queue_qportcfg_input req = {0};
 	struct hwrm_queue_qportcfg_output *resp =
 	    (void *)softc->hwrm_cmd_resp.idi_vaddr;
 
 	int	rc = 0;
 	uint8_t	*qptr;
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_QUEUE_QPORTCFG);
 
 	BNXT_HWRM_LOCK(softc);
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	if (rc)
 		goto qportcfg_exit;
 
 	if (!resp->max_configurable_queues) {
 		/*
 		 * Positive errno, like bnxt_hwrm_err_map() and the
 		 * ETIMEDOUT paths of _hwrm_send_message().
 		 */
 		rc = EINVAL;
 		goto qportcfg_exit;
 	}
 	softc->max_tc = resp->max_configurable_queues;
 	if (softc->max_tc > BNXT_MAX_QUEUE)
 		softc->max_tc = BNXT_MAX_QUEUE;
 
 	qptr = &resp->queue_id0;
 	for (int i = 0; i < softc->max_tc; i++) {
 		softc->q_info[i].id = *qptr++;
 		softc->q_info[i].profile = *qptr++;
 	}
 
 qportcfg_exit:
 	BNXT_HWRM_UNLOCK(softc);
 	return (rc);
 }
 
 
 /*
  * Issue HWRM_VER_GET and cache the reported version information in
  * softc->ver_info (dereferenced without a NULL check).
  *
  * Before sending, hwrm_max_req_len and hwrm_cmd_timeo are reset to
  * HWRM_MAX_REQ_LEN and 1000.  After a successful reply they are replaced
  * by the firmware's max_req_win_len and def_req_timeout when those are
  * non-zero.  The response buffer is read while the HWRM lock is held.
  *
  * Returns 0 on success or the error from _hwrm_send_message(), in which
  * case softc->ver_info is not modified.
  */
 int
 bnxt_hwrm_ver_get(struct bnxt_softc *softc)
 {
 	struct hwrm_ver_get_input	req = {0};
 	struct hwrm_ver_get_output	*resp =
 	    (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	int				rc;
 	const char nastr[] = "<not installed>";
 	const char naver[] = "<N/A>";
 
 	/* Defaults that apply to this very request. */
 	softc->hwrm_max_req_len = HWRM_MAX_REQ_LEN;
 	softc->hwrm_cmd_timeo = 1000;
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_VER_GET);
 
 	/* Report the interface version this driver was built against. */
 	req.hwrm_intf_maj = HWRM_VERSION_MAJOR;
 	req.hwrm_intf_min = HWRM_VERSION_MINOR;
 	req.hwrm_intf_upd = HWRM_VERSION_UPDATE;
 
 	BNXT_HWRM_LOCK(softc);
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	if (rc)
 		goto fail;
 
 	snprintf(softc->ver_info->hwrm_if_ver, BNXT_VERSTR_SIZE, "%d.%d.%d",
 	    resp->hwrm_intf_maj, resp->hwrm_intf_min, resp->hwrm_intf_upd);
 	softc->ver_info->hwrm_if_major = resp->hwrm_intf_maj;
 	softc->ver_info->hwrm_if_minor = resp->hwrm_intf_min;
 	softc->ver_info->hwrm_if_update = resp->hwrm_intf_upd;
 	snprintf(softc->ver_info->hwrm_fw_ver, BNXT_VERSTR_SIZE, "%d.%d.%d",
 	    resp->hwrm_fw_maj, resp->hwrm_fw_min, resp->hwrm_fw_bld);
 	strlcpy(softc->ver_info->driver_hwrm_if_ver, HWRM_VERSION_STR,
 	    BNXT_VERSTR_SIZE);
 	strlcpy(softc->ver_info->hwrm_fw_name, resp->hwrm_fw_name,
 	    BNXT_NAME_SIZE);
 
 	/*
 	 * For the management, netctrl and RoCE firmware, an all-zero
 	 * maj/min/bld triple is reported as "<N/A>" / "<not installed>".
 	 */
 	if (resp->mgmt_fw_maj == 0 && resp->mgmt_fw_min == 0 &&
 	    resp->mgmt_fw_bld == 0) {
 		strlcpy(softc->ver_info->mgmt_fw_ver, naver, BNXT_VERSTR_SIZE);
 		strlcpy(softc->ver_info->mgmt_fw_name, nastr, BNXT_NAME_SIZE);
 	}
 	else {
 		snprintf(softc->ver_info->mgmt_fw_ver, BNXT_VERSTR_SIZE,
 		    "%d.%d.%d", resp->mgmt_fw_maj, resp->mgmt_fw_min,
 		    resp->mgmt_fw_bld);
 		strlcpy(softc->ver_info->mgmt_fw_name, resp->mgmt_fw_name,
 		    BNXT_NAME_SIZE);
 	}
 	if (resp->netctrl_fw_maj == 0 && resp->netctrl_fw_min == 0 &&
 	    resp->netctrl_fw_bld == 0) {
 		strlcpy(softc->ver_info->netctrl_fw_ver, naver,
 		    BNXT_VERSTR_SIZE);
 		strlcpy(softc->ver_info->netctrl_fw_name, nastr,
 		    BNXT_NAME_SIZE);
 	}
 	else {
 		snprintf(softc->ver_info->netctrl_fw_ver, BNXT_VERSTR_SIZE,
 		    "%d.%d.%d", resp->netctrl_fw_maj, resp->netctrl_fw_min,
 		    resp->netctrl_fw_bld);
 		strlcpy(softc->ver_info->netctrl_fw_name, resp->netctrl_fw_name,
 		    BNXT_NAME_SIZE);
 	}
 	if (resp->roce_fw_maj == 0 && resp->roce_fw_min == 0 &&
 	    resp->roce_fw_bld == 0) {
 		strlcpy(softc->ver_info->roce_fw_ver, naver, BNXT_VERSTR_SIZE);
 		strlcpy(softc->ver_info->roce_fw_name, nastr, BNXT_NAME_SIZE);
 	}
 	else {
 		snprintf(softc->ver_info->roce_fw_ver, BNXT_VERSTR_SIZE,
 		    "%d.%d.%d", resp->roce_fw_maj, resp->roce_fw_min,
 		    resp->roce_fw_bld);
 		strlcpy(softc->ver_info->roce_fw_name, resp->roce_fw_name,
 		    BNXT_NAME_SIZE);
 	}
 	softc->ver_info->chip_num = le16toh(resp->chip_num);
 	softc->ver_info->chip_rev = resp->chip_rev;
 	softc->ver_info->chip_metal = resp->chip_metal;
 	softc->ver_info->chip_bond_id = resp->chip_bond_id;
 	softc->ver_info->chip_type = resp->chip_platform_type;
 
 	/* Firmware-supplied limits override the defaults set above. */
 	if (resp->max_req_win_len)
 		softc->hwrm_max_req_len = le16toh(resp->max_req_win_len);
 	if (resp->def_req_timeout)
 		softc->hwrm_cmd_timeo = le16toh(resp->def_req_timeout);
 
 fail:
 	BNXT_HWRM_UNLOCK(softc);
 	return rc;
 }
 
 /*
  * Register the driver with firmware (HWRM_FUNC_DRV_RGTR), reporting the OS
  * type as FreeBSD and a version triple derived from __FreeBSD_version.
  * Returns the result of hwrm_send_message().
  */
 int
 bnxt_hwrm_func_drv_rgtr(struct bnxt_softc *softc)
 {
 	struct hwrm_func_drv_rgtr_input req = {0};
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_FUNC_DRV_RGTR);
 
 	/* Split __FreeBSD_version into its major/minor/update digits. */
 	req.ver_maj = __FreeBSD_version / 100000;
 	req.ver_min = (__FreeBSD_version / 1000) % 100;
 	req.ver_upd = (__FreeBSD_version / 100) % 10;
 
 	req.os_type = htole16(HWRM_FUNC_DRV_RGTR_INPUT_OS_TYPE_FREEBSD);
 	req.enables = htole32(HWRM_FUNC_DRV_RGTR_INPUT_ENABLES_VER |
 	    HWRM_FUNC_DRV_RGTR_INPUT_ENABLES_OS_TYPE);
 
 	return (hwrm_send_message(softc, &req, sizeof(req)));
 }
 
 
 /*
  * Unregister the driver from firmware (HWRM_FUNC_DRV_UNRGTR).  When
  * shutdown is true the PREPARE_FOR_SHUTDOWN flag is set in the request.
  * Returns the result of hwrm_send_message().
  */
 int
 bnxt_hwrm_func_drv_unrgtr(struct bnxt_softc *softc, bool shutdown)
 {
 	struct hwrm_func_drv_unrgtr_input req = {0};
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_FUNC_DRV_UNRGTR);
 
 	if (shutdown) {
 		req.flags |=
 		    HWRM_FUNC_DRV_UNRGTR_INPUT_FLAGS_PREPARE_FOR_SHUTDOWN;
 	}
 
 	return (hwrm_send_message(softc, &req, sizeof(req)));
 }
 
 
 /*
  * Returns FALSE when the low bit of the first octet is set or when the
  * first ETHER_ADDR_LEN bytes of addr are all zero; TRUE otherwise.
  */
 static inline int
 _is_valid_ether_addr(uint8_t *addr)
 {
 	char zeros[6] = { 0, 0, 0, 0, 0, 0 };
 
 	if (addr[0] & 1)
 		return (FALSE);
 	if (bcmp(addr, zeros, ETHER_ADDR_LEN) == 0)
 		return (FALSE);
 
 	return (TRUE);
 }
 
 /*
  * Fill addr with ETHER_ADDR_LEN random bytes from arc4rand(), with bit 0
  * of the first octet cleared and bit 1 set.
  */
 static inline void
 get_random_ether_addr(uint8_t *addr)
 {
 	uint8_t rnd[ETHER_ADDR_LEN];
 
 	arc4rand(&rnd, sizeof(rnd), 0);
 	rnd[0] = (rnd[0] & 0xFE) | 0x02;
 	bcopy(rnd, addr, sizeof(rnd));
 }
 
 /*
  * Issue HWRM_FUNC_QCAPS for fid 0xffff and cache the reported limits in
  * softc->func and, for a PF, softc->pf.
  *
  * If the reported MAC address fails _is_valid_ether_addr(), it is replaced
  * by a random address and a message is logged.  The response buffer is
  * read while the HWRM lock is held.
  *
  * Returns 0 on success or the error from _hwrm_send_message().
  */
 int
 bnxt_hwrm_func_qcaps(struct bnxt_softc *softc)
 {
 	int rc = 0;
 	struct hwrm_func_qcaps_input req = {0};
 	struct hwrm_func_qcaps_output *resp =
 	    (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	struct bnxt_func_info *func = &softc->func;
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_FUNC_QCAPS);
 	req.fid = htole16(0xffff);
 
 	BNXT_HWRM_LOCK(softc);
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	if (rc)
 		goto fail;
 
 	func->fw_fid = le16toh(resp->fid);
 	memcpy(func->mac_addr, resp->mac_address, ETHER_ADDR_LEN);
 	func->max_rsscos_ctxs = le16toh(resp->max_rsscos_ctx);
 	func->max_cp_rings = le16toh(resp->max_cmpl_rings);
 	func->max_tx_rings = le16toh(resp->max_tx_rings);
 	func->max_rx_rings = le16toh(resp->max_rx_rings);
 	/*
 	 * NOTE(review): the destination is a uint16_t (struct
 	 * bnxt_func_info), so this le32toh() result is truncated on
 	 * assignment -- confirm the width of the response field.
 	 */
 	func->max_hw_ring_grps = le32toh(resp->max_hw_ring_grps);
 	/* A zero report falls back to the TX ring limit. */
 	if (!func->max_hw_ring_grps)
 		func->max_hw_ring_grps = func->max_tx_rings;
 	func->max_l2_ctxs = le16toh(resp->max_l2_ctxs);
 	func->max_vnics = le16toh(resp->max_vnics);
 	func->max_stat_ctxs = le16toh(resp->max_stat_ctx);
 	if (BNXT_PF(softc)) {
 		struct bnxt_pf_info *pf = &softc->pf;
 
 		/*
 		 * NOTE(review): pf->port_id is a uint8_t, so the le16toh()
 		 * result is narrowed here.
 		 */
 		pf->port_id = le16toh(resp->port_id);
 		pf->first_vf_id = le16toh(resp->first_vf_id);
 		pf->max_vfs = le16toh(resp->max_vfs);
 		pf->max_encap_records = le32toh(resp->max_encap_records);
 		pf->max_decap_records = le32toh(resp->max_decap_records);
 		pf->max_tx_em_flows = le32toh(resp->max_tx_em_flows);
 		pf->max_tx_wm_flows = le32toh(resp->max_tx_wm_flows);
 		pf->max_rx_em_flows = le32toh(resp->max_rx_em_flows);
 		pf->max_rx_wm_flows = le32toh(resp->max_rx_wm_flows);
 	}
 	if (!_is_valid_ether_addr(func->mac_addr)) {
 		device_printf(softc->dev, "Invalid ethernet address, generating random locally administered address\n");
 		get_random_ether_addr(func->mac_addr);
 	}
 
 fail:
 	BNXT_HWRM_UNLOCK(softc);
 	return rc;
 }
 
 int
 bnxt_hwrm_func_reset(struct bnxt_softc *softc)
 {
 	struct hwrm_func_reset_input req = {0};
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_FUNC_RESET);
 	req.enables = 0;
 
 	return hwrm_send_message(softc, &req, sizeof(req));
 }
 
 static void
 bnxt_hwrm_set_link_common(struct bnxt_softc *softc,
     struct hwrm_port_phy_cfg_input *req)
 {
 	uint8_t autoneg = softc->link_info.autoneg;
 	uint16_t fw_link_speed = softc->link_info.req_link_speed;
 
 	if (autoneg & BNXT_AUTONEG_SPEED) {
 		req->auto_mode |=
 		    HWRM_PORT_PHY_CFG_INPUT_AUTO_MODE_ALL_SPEEDS;
 
 		req->enables |=
 		    htole32(HWRM_PORT_PHY_CFG_INPUT_ENABLES_AUTO_MODE);
 		req->flags |=
 		    htole32(HWRM_PORT_PHY_CFG_INPUT_FLAGS_RESTART_AUTONEG);
 	} else {
 		req->force_link_speed = htole16(fw_link_speed);
 		req->flags |= htole32(HWRM_PORT_PHY_CFG_INPUT_FLAGS_FORCE);
 	}
 
 	/* tell chimp that the setting takes effect immediately */
 	req->flags |= htole32(HWRM_PORT_PHY_CFG_INPUT_FLAGS_RESET_PHY);
 }
 
 
 static void
 bnxt_hwrm_set_pause_common(struct bnxt_softc *softc,
     struct hwrm_port_phy_cfg_input *req)
 {
 	if (softc->link_info.autoneg & BNXT_AUTONEG_FLOW_CTRL) {
 		req->auto_pause =
 		    HWRM_PORT_PHY_CFG_INPUT_AUTO_PAUSE_AUTONEG_PAUSE;
 		if (softc->link_info.req_flow_ctrl &
 		    HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_RX)
 			req->auto_pause |=
 			    HWRM_PORT_PHY_CFG_INPUT_AUTO_PAUSE_RX;
 		if (softc->link_info.req_flow_ctrl &
 		    HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_TX)
 			req->auto_pause |=
 			    HWRM_PORT_PHY_CFG_INPUT_AUTO_PAUSE_RX;
 		req->enables |=
 		    htole32(HWRM_PORT_PHY_CFG_INPUT_ENABLES_AUTO_PAUSE);
 	} else {
 		if (softc->link_info.req_flow_ctrl &
 		    HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_RX)
 			req->force_pause |=
 			    HWRM_PORT_PHY_CFG_INPUT_FORCE_PAUSE_RX;
 		if (softc->link_info.req_flow_ctrl &
 		    HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_TX)
 			req->force_pause |=
 			    HWRM_PORT_PHY_CFG_INPUT_FORCE_PAUSE_TX;
 		req->enables |=
 			htole32(HWRM_PORT_PHY_CFG_INPUT_ENABLES_FORCE_PAUSE);
 		req->auto_pause = req->force_pause;
 		req->enables |= htole32(
 		    HWRM_PORT_PHY_CFG_INPUT_ENABLES_AUTO_PAUSE);
 	}
 }
 
 
 /* JFV this needs interface connection */
 /*
  * Fill in the EEE portion of a PORT_PHY_CFG request.
  *
  * eee_enabled is hard-coded to false and the enable path is compiled out
  * with #if 0, so as written this always ORs EEE_DISABLE into req->flags.
  * softc is currently unused by the active code.
  */
 static void
 bnxt_hwrm_set_eee(struct bnxt_softc *softc, struct hwrm_port_phy_cfg_input *req)
 {
 	/* struct ethtool_eee *eee = &softc->eee; */
 	bool	eee_enabled = false;
 
 	if (eee_enabled) {
 #if 0
 		/* Disabled: refers to an 'eee' object that is not declared. */
 		uint16_t eee_speeds;
 		uint32_t flags = HWRM_PORT_PHY_CFG_INPUT_FLAGS_EEE_ENABLE;
 
 		if (eee->tx_lpi_enabled)
 			flags |= HWRM_PORT_PHY_CFG_INPUT_FLAGS_EEE_TX_LPI;
 
 		req->flags |= htole32(flags);
 		eee_speeds = bnxt_get_fw_auto_link_speeds(eee->advertised);
 		req->eee_link_speed_mask = htole16(eee_speeds);
 		req->tx_lpi_timer = htole32(eee->tx_lpi_timer);
 #endif
 	} else {
 		/* The only path taken today: request EEE to be disabled. */
 		req->flags |=
 		    htole32(HWRM_PORT_PHY_CFG_INPUT_FLAGS_EEE_DISABLE);
 	}
 }
 
 
 int
 bnxt_hwrm_set_link_setting(struct bnxt_softc *softc, bool set_pause,
     bool set_eee)
 {
 	struct hwrm_port_phy_cfg_input req = {0};
 
 	if (softc->flags & BNXT_FLAG_NPAR)
 		return ENOTSUP;
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_PORT_PHY_CFG);
 	if (set_pause)
 		bnxt_hwrm_set_pause_common(softc, &req);
 
 	bnxt_hwrm_set_link_common(softc, &req);
 	if (set_eee)
 		bnxt_hwrm_set_eee(softc, &req);
 	return hwrm_send_message(softc, &req, sizeof(req));
 }
 
 
 int
 bnxt_hwrm_set_pause(struct bnxt_softc *softc)
 {
 	struct hwrm_port_phy_cfg_input req = {0};
 	int rc;
 
 	if (softc->flags & BNXT_FLAG_NPAR)
 		return ENOTSUP;
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_PORT_PHY_CFG);
 	bnxt_hwrm_set_pause_common(softc, &req);
 
 	if (softc->link_info.autoneg & BNXT_AUTONEG_FLOW_CTRL)
 		bnxt_hwrm_set_link_common(softc, &req);
 
 	BNXT_HWRM_LOCK(softc);
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	if (!rc && !(softc->link_info.autoneg & BNXT_AUTONEG_FLOW_CTRL)) {
 		/* since changing of pause setting doesn't trigger any link
 		 * change event, the driver needs to update the current pause
 		 * result upon successfully return of the phy_cfg command */
 		softc->link_info.pause =
 		softc->link_info.force_pause = softc->link_info.req_flow_ctrl;
 		softc->link_info.auto_pause = 0;
 		bnxt_report_link(softc);
 	}
 	BNXT_HWRM_UNLOCK(softc);
 	return rc;
 }
 
 /*
  * Send an HWRM_VNIC_CFG request describing 'vnic'.
  *
  * The DEFAULT, BD_STALL and VLAN_STRIP vnic flags are translated into the
  * matching request flags.  Only the default ring group, RSS rule and MRU
  * are marked valid in 'enables'; cos_rule and lb_rule are filled in but
  * not enabled.  Returns the result of hwrm_send_message().
  */
 int
 bnxt_hwrm_vnic_cfg(struct bnxt_softc *softc, struct bnxt_vnic_info *vnic)
 {
 	struct hwrm_vnic_cfg_input req = {0};
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_VNIC_CFG);
 
 	if (vnic->flags & BNXT_VNIC_FLAG_DEFAULT)
 		req.flags |= htole32(HWRM_VNIC_CFG_INPUT_FLAGS_DEFAULT);
 	if (vnic->flags & BNXT_VNIC_FLAG_BD_STALL)
 		req.flags |= htole32(HWRM_VNIC_CFG_INPUT_FLAGS_BD_STALL_MODE);
 	if (vnic->flags & BNXT_VNIC_FLAG_VLAN_STRIP)
 		req.flags |= htole32(HWRM_VNIC_CFG_INPUT_FLAGS_VLAN_STRIP_MODE);
 	req.enables = htole32(HWRM_VNIC_CFG_INPUT_ENABLES_DFLT_RING_GRP |
 	    HWRM_VNIC_CFG_INPUT_ENABLES_RSS_RULE |
 	    HWRM_VNIC_CFG_INPUT_ENABLES_MRU);
 	req.vnic_id = htole16(vnic->id);
 	req.dflt_ring_grp = htole16(vnic->def_ring_grp);
 	req.rss_rule = htole16(vnic->rss_id);
 	req.cos_rule = htole16(vnic->cos_rule);
 	req.lb_rule = htole16(vnic->lb_rule);
 	req.mru = htole16(vnic->mru);
 
 	return hwrm_send_message(softc, &req, sizeof(req));
 }
 
 int
 bnxt_hwrm_vnic_alloc(struct bnxt_softc *softc, struct bnxt_vnic_info *vnic)
 {
 	struct hwrm_vnic_alloc_input req = {0};
 	struct hwrm_vnic_alloc_output *resp =
 	    (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	int rc;
 
 	if (vnic->id != (uint16_t)HWRM_NA_SIGNATURE) {
 		device_printf(softc->dev,
 		    "Attempt to re-allocate vnic %04x\n", vnic->id);
 		return EDOOFUS;
 	}
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_VNIC_ALLOC);
 
 	if (vnic->flags & BNXT_VNIC_FLAG_DEFAULT)
 		req.flags = htole32(HWRM_VNIC_ALLOC_INPUT_FLAGS_DEFAULT);
 
 	BNXT_HWRM_LOCK(softc);
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	if (rc)
 		goto fail;
 
 	vnic->id = le32toh(resp->vnic_id);
 
 fail:
 	BNXT_HWRM_UNLOCK(softc);
 	return (rc);
 }
 
 /*
  * Allocate an RSS/COS/LB context through HWRM_VNIC_RSS_COS_LB_CTX_ALLOC
  * and store the returned id in *ctx_id.
  *
  * Returns EDOOFUS if *ctx_id is anything other than HWRM_NA_SIGNATURE,
  * otherwise the result of _hwrm_send_message().  *ctx_id is only written
  * on success.
  */
 int
 bnxt_hwrm_vnic_ctx_alloc(struct bnxt_softc *softc, uint16_t *ctx_id)
 {
 	struct hwrm_vnic_rss_cos_lb_ctx_alloc_input req = {0};
 	struct hwrm_vnic_rss_cos_lb_ctx_alloc_output *resp =
 	    (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	int rc;
 
 	if (*ctx_id != (uint16_t)HWRM_NA_SIGNATURE) {
 		device_printf(softc->dev,
 		    "Attempt to re-allocate vnic ctx %04x\n", *ctx_id);
 		return EDOOFUS;
 	}
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_VNIC_RSS_COS_LB_CTX_ALLOC);
 
 	BNXT_HWRM_LOCK(softc);
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	if (rc)
 		goto fail;
 
 	/*
 	 * The context id is a 16-bit little-endian field; a 32-bit swap
 	 * would move it out of the low half on big-endian hosts.
 	 */
 	*ctx_id = le16toh(resp->rss_cos_lb_ctx_id);
 
 fail:
 	BNXT_HWRM_UNLOCK(softc);
 	return (rc);
 }
 
 int
 bnxt_hwrm_ring_grp_alloc(struct bnxt_softc *softc, struct bnxt_grp_info *grp)
 {
 	struct hwrm_ring_grp_alloc_input req = {0};
 	struct hwrm_ring_grp_alloc_output *resp;
 	int rc = 0;
 
 	if (grp->grp_id != (uint16_t)HWRM_NA_SIGNATURE) {
 		device_printf(softc->dev,
 		    "Attempt to re-allocate ring group %04x\n", grp->grp_id);
 		return EDOOFUS;
 	}
 
 	resp = (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_RING_GRP_ALLOC);
 	req.cr = htole16(grp->cp_ring_id);
 	req.rr = htole16(grp->rx_ring_id);
 	req.ar = htole16(grp->ag_ring_id);
 	req.sc = htole16(grp->stats_ctx);
 
 	BNXT_HWRM_LOCK(softc);
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	if (rc)
 		goto fail;
 
 	grp->grp_id = le32toh(resp->ring_group_id);
 
 fail:
 	BNXT_HWRM_UNLOCK(softc);
 	return rc;
 }
 
 /*
  * Ring allocation message to the firmware
  *
  * Sends HWRM_RING_ALLOC for 'ring' with the given ring type and completion
  * ring id, and stores the returned id in ring->phys_id.  The stats context
  * is only marked valid when stat_ctx_id is not HWRM_NA_SIGNATURE.
  *
  * Returns EDOOFUS if ring->phys_id is anything other than
  * HWRM_NA_SIGNATURE, otherwise the result of _hwrm_send_message().
  *
  * Note: 'irq' is ignored by the compiled code; the interrupt mode is
  * always MSIX (see the #if 0 block below).
  */
 int
 bnxt_hwrm_ring_alloc(struct bnxt_softc *softc, uint8_t type,
     struct bnxt_ring *ring, uint16_t cmpl_ring_id, uint32_t stat_ctx_id,
     bool irq)
 {
 	struct hwrm_ring_alloc_input req = {0};
 	struct hwrm_ring_alloc_output *resp;
 	int rc;
 
 	if (ring->phys_id != (uint16_t)HWRM_NA_SIGNATURE) {
 		device_printf(softc->dev,
 		    "Attempt to re-allocate ring %04x\n", ring->phys_id);
 		return EDOOFUS;
 	}
 
 	resp = (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_RING_ALLOC);
 	req.enables = htole32(0);
 	req.fbo = htole32(0);
 
 	if (stat_ctx_id != HWRM_NA_SIGNATURE) {
 		req.enables |= htole32(
 		    HWRM_RING_ALLOC_INPUT_ENABLES_STAT_CTX_ID_VALID);
 		req.stat_ctx_id = htole32(stat_ctx_id);
 	}
 	req.ring_type = type;
 	req.page_tbl_addr = htole64(ring->paddr);
 	req.length = htole32(ring->ring_size);
 	req.logical_id = htole16(ring->id);
 	req.cmpl_ring_id = htole16(cmpl_ring_id);
 	/* Every ring is placed on the first entry of softc->q_info. */
 	req.queue_id = htole16(softc->q_info[0].id);
 #if 0
 	/* MODE_POLL appears to crash the firmware */
 	if (irq)
 		req.int_mode = HWRM_RING_ALLOC_INPUT_INT_MODE_MSIX;
 	else
 		req.int_mode = HWRM_RING_ALLOC_INPUT_INT_MODE_POLL;
 #else
 	req.int_mode = HWRM_RING_ALLOC_INPUT_INT_MODE_MSIX;
 #endif
 	/* The response buffer is read while the HWRM lock is still held. */
 	BNXT_HWRM_LOCK(softc);
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	if (rc)
 		goto fail;
 
 	ring->phys_id = le16toh(resp->ring_id);
 
 fail:
 	BNXT_HWRM_UNLOCK(softc);
 	return rc;
 }
 
 int
 bnxt_hwrm_stat_ctx_alloc(struct bnxt_softc *softc, struct bnxt_cp_ring *cpr,
     uint64_t paddr)
 {
 	struct hwrm_stat_ctx_alloc_input req = {0};
 	struct hwrm_stat_ctx_alloc_output *resp;
 	int rc = 0;
 
 	if (cpr->stats_ctx_id != HWRM_NA_SIGNATURE) {
 		device_printf(softc->dev,
 		    "Attempt to re-allocate stats ctx %08x\n",
 		    cpr->stats_ctx_id);
 		return EDOOFUS;
 	}
 
 	resp = (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_STAT_CTX_ALLOC);
 
 	req.update_period_ms = htole32(1000);
 	req.stats_dma_addr = htole64(paddr);
 
 	BNXT_HWRM_LOCK(softc);
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	if (rc)
 		goto fail;
 
 	cpr->stats_ctx_id = le32toh(resp->stat_ctx_id);
 
 fail:
 	BNXT_HWRM_UNLOCK(softc);
 
 	return rc;
 }
 
 /*
  * Program the RX mask, multicast table and VLAN tag table of 'vnic'
  * through HWRM_CFA_L2_SET_RX_MASK.
  *
  * When the vnic has VLAN tags, the tag table is rebuilt in freshly
  * allocated DMA memory (any previous table is freed first) and, unless
  * ANYVLAN_NONVLAN is already in the mask, VLANONLY or VLAN_NONVLAN is
  * added depending on vnic->vlan_only.  vnic->rx_mask itself is not
  * modified.
  *
  * Returns the iflib_dma_alloc() error if the table cannot be allocated,
  * otherwise the result of hwrm_send_message().
  */
 int
 bnxt_hwrm_cfa_l2_set_rx_mask(struct bnxt_softc *softc,
     struct bnxt_vnic_info *vnic)
 {
 	struct hwrm_cfa_l2_set_rx_mask_input req = {0};
 	struct bnxt_vlan_tag *tag;
 	uint32_t *tags;
 	uint32_t num_vlan_tags = 0;
 	uint32_t i;
 	uint32_t mask = vnic->rx_mask;
 	int rc;
 
 	SLIST_FOREACH(tag, &vnic->vlan_tags, next)
 		num_vlan_tags++;
 
 	if (num_vlan_tags) {
 		if (!(mask &
 		    HWRM_CFA_L2_SET_RX_MASK_INPUT_MASK_ANYVLAN_NONVLAN)) {
 			if (!vnic->vlan_only)
 				mask |= HWRM_CFA_L2_SET_RX_MASK_INPUT_MASK_VLAN_NONVLAN;
 			else
 				mask |=
 				    HWRM_CFA_L2_SET_RX_MASK_INPUT_MASK_VLANONLY;
 		}
 		if (vnic->vlan_tag_list.idi_vaddr) {
 			iflib_dma_free(&vnic->vlan_tag_list);
 			vnic->vlan_tag_list.idi_vaddr = NULL;
 		}
 		rc = iflib_dma_alloc(softc->ctx, 4 * num_vlan_tags,
 		    &vnic->vlan_tag_list, BUS_DMA_NOWAIT);
 		if (rc)
 			return rc;
 		tags = (uint32_t *)vnic->vlan_tag_list.idi_vaddr;
 
 		/* Each entry is the TPID in the high half, tag in the low. */
 		i = 0;
 		SLIST_FOREACH(tag, &vnic->vlan_tags, next) {
 			/*
 			 * Widen before shifting so a TPID with its top bit
 			 * set is not shifted into the sign bit of an int.
 			 */
 			tags[i] = htole32(((uint32_t)tag->tpid << 16) |
 			    tag->tag);
 			i++;
 		}
 	}
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_CFA_L2_SET_RX_MASK);
 
 	req.vnic_id = htole32(vnic->id);
 	req.mask = htole32(mask);
 	req.mc_tbl_addr = htole64(vnic->mc_list.idi_paddr);
 	req.num_mc_entries = htole32(vnic->mc_list_count);
 	req.vlan_tag_tbl_addr = htole64(vnic->vlan_tag_list.idi_paddr);
 	req.num_vlan_tags = htole32(num_vlan_tags);
 	return hwrm_send_message(softc, &req, sizeof(req));
 }
 
 
 int
 bnxt_hwrm_set_filter(struct bnxt_softc *softc, struct bnxt_vnic_info *vnic)
 {
 	struct hwrm_cfa_l2_filter_alloc_input	req = {0};
 	struct hwrm_cfa_l2_filter_alloc_output	*resp;
 	uint32_t enables = 0;
 	int rc = 0;
 
 	if (vnic->filter_id != -1) {
 		device_printf(softc->dev,
 		    "Attempt to re-allocate l2 ctx filter\n");
 		return EDOOFUS;
 	}
 
 	resp = (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_CFA_L2_FILTER_ALLOC);
 
 	req.flags = htole32(HWRM_CFA_L2_FILTER_ALLOC_INPUT_FLAGS_PATH_RX);
 	enables = HWRM_CFA_L2_FILTER_ALLOC_INPUT_ENABLES_L2_ADDR
 	    | HWRM_CFA_L2_FILTER_ALLOC_INPUT_ENABLES_L2_ADDR_MASK
 	    | HWRM_CFA_L2_FILTER_ALLOC_INPUT_ENABLES_DST_ID;
 	req.enables = htole32(enables);
 	req.dst_id = htole16(vnic->id);
 	memcpy(req.l2_addr, if_getlladdr(iflib_get_ifp(softc->ctx)),
 	    ETHER_ADDR_LEN);
 	memset(&req.l2_addr_mask, 0xff, sizeof(req.l2_addr_mask));
 
 	BNXT_HWRM_LOCK(softc);
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	if (rc)
 		goto fail;
 
 	vnic->filter_id = le64toh(resp->l2_filter_id);
 	vnic->flow_id = le64toh(resp->flow_id);
 
 fail:
 	BNXT_HWRM_UNLOCK(softc);
 	return (rc);
 }
 
 /*
  * Send HWRM_VNIC_RSS_CFG for 'vnic': the hash type plus the DMA addresses
  * of the vnic's ring-group table and hash-key table, applied to the RSS
  * context vnic->rss_id.  Returns the result of hwrm_send_message().
  */
 int
 bnxt_hwrm_rss_cfg(struct bnxt_softc *softc, struct bnxt_vnic_info *vnic,
     uint32_t hash_type)
 {
 	struct hwrm_vnic_rss_cfg_input	req = {0};
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_VNIC_RSS_CFG);
 
 	req.hash_type = htole32(hash_type);
 	req.ring_grp_tbl_addr = htole64(vnic->rss_grp_tbl.idi_paddr);
 	req.hash_key_tbl_addr = htole64(vnic->rss_hash_key_tbl.idi_paddr);
 	req.rss_ctx_idx = htole16(vnic->rss_id);
 
 	return hwrm_send_message(softc, &req, sizeof(req));
 }
 
 /*
  * Send HWRM_FUNC_CFG with fid 0xffff, setting the async event completion
  * ring to the default completion ring's physical id.  Only
  * ASYNC_EVENT_CR is marked valid.  Returns the result of
  * hwrm_send_message().
  */
 int
 bnxt_hwrm_func_cfg(struct bnxt_softc *softc)
 {
 	struct hwrm_func_cfg_input req = {0};
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_FUNC_CFG);
 
 	/* Request fields are little-endian, like every other HWRM field. */
 	req.fid = htole16(0xffff);
 	req.enables = htole32(HWRM_FUNC_CFG_INPUT_ENABLES_ASYNC_EVENT_CR);
 
 	req.async_event_cr = htole16(softc->def_cp_ring.ring.phys_id);
 
 	return hwrm_send_message(softc, &req, sizeof(req));
 }
 
 /*
  * Send HWRM_VNIC_TPA_CFG for 'vnic' with the caller-supplied flags.
  *
  * max_agg_segs, max_aggs and min_agg_len are marked valid and filled with
  * fixed values; the aggregation timer is left unset (its enable bit is
  * commented out).  Returns the result of hwrm_send_message().
  */
 int
 bnxt_hwrm_vnic_tpa_cfg(struct bnxt_softc *softc, struct bnxt_vnic_info *vnic,
     uint32_t flags)
 {
 	struct hwrm_vnic_tpa_cfg_input req = {0};
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_VNIC_TPA_CFG);
 
 	req.flags = htole32(flags);
 	req.vnic_id = htole16(vnic->id);
 	req.enables = htole32(HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MAX_AGG_SEGS |
 	    HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MAX_AGGS |
 	    /* HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MAX_AGG_TIMER | */
 	    HWRM_VNIC_TPA_CFG_INPUT_ENABLES_MIN_AGG_LEN);
 	/* TODO: Calculate this based on ring size? */
 	req.max_agg_segs = htole16(3);
 	/* Base this in the allocated TPA start size... */
-	req.max_aggs = htole16(2);
+	req.max_aggs = htole16(7);
 	/*
 	 * TODO: max_agg_timer?
 	 * req.max_agg_timer = htole32(XXX);
 	 */
 	req.min_agg_len = htole32(0);
 
 	return hwrm_send_message(softc, &req, sizeof(req));
 }
 
 int
 bnxt_hwrm_nvm_find_dir_entry(struct bnxt_softc *softc, uint16_t type,
     uint16_t *ordinal, uint16_t ext, uint16_t *index, bool use_index,
     uint8_t search_opt, uint32_t *data_length, uint32_t *item_length,
     uint32_t *fw_ver)
 {
 	struct hwrm_nvm_find_dir_entry_input req = {0};
 	struct hwrm_nvm_find_dir_entry_output *resp =
 	    (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	int	rc = 0;
 	uint32_t old_timeo;
 
 	MPASS(ordinal);
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_NVM_FIND_DIR_ENTRY);
 	if (use_index) {
 		req.enables = htole32(
 		    HWRM_NVM_FIND_DIR_ENTRY_INPUT_ENABLES_DIR_IDX_VALID);
 		req.dir_idx = htole16(*index);
 	}
 	req.dir_type = htole16(type);
 	req.dir_ordinal = htole16(*ordinal);
 	req.dir_ext = htole16(ext);
 	req.opt_ordinal = search_opt;
 
 	BNXT_HWRM_LOCK(softc);
 	old_timeo = softc->hwrm_cmd_timeo;
 	softc->hwrm_cmd_timeo = BNXT_NVM_TIMEO;
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	softc->hwrm_cmd_timeo = old_timeo;
 	if (rc)
 		goto exit;
 
 	if (item_length)
 		*item_length = le32toh(resp->dir_item_length);
 	if (data_length)
 		*data_length = le32toh(resp->dir_data_length);
 	if (fw_ver)
 		*fw_ver = le32toh(resp->fw_ver);
 	*ordinal = le16toh(resp->dir_ordinal);
 	if (index)
 		*index = le16toh(resp->dir_idx);
 
 exit:
 	BNXT_HWRM_UNLOCK(softc);
 	return (rc);
 }
 
 int
 bnxt_hwrm_nvm_read(struct bnxt_softc *softc, uint16_t index, uint32_t offset,
     uint32_t length, struct iflib_dma_info *data)
 {
 	struct hwrm_nvm_read_input req = {0};
 	int rc;
 	uint32_t old_timeo;
 
 	if (length > data->idi_size) {
 		rc = EINVAL;
 		goto exit;
 	}
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_NVM_READ);
 	req.host_dest_addr = htole64(data->idi_paddr);
 	req.dir_idx = htole16(index);
 	req.offset = htole32(offset);
 	req.len = htole32(length);
 	BNXT_HWRM_LOCK(softc);
 	old_timeo = softc->hwrm_cmd_timeo;
 	softc->hwrm_cmd_timeo = BNXT_NVM_TIMEO;
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	softc->hwrm_cmd_timeo = old_timeo;
 	BNXT_HWRM_UNLOCK(softc);
 	if (rc)
 		goto exit;
 	bus_dmamap_sync(data->idi_tag, data->idi_map, BUS_DMASYNC_POSTREAD);
 
 	goto exit;
 
 exit:
 	return rc;
 }
 
 int
 bnxt_hwrm_nvm_modify(struct bnxt_softc *softc, uint16_t index, uint32_t offset,
     void *data, bool cpyin, uint32_t length)
 {
 	struct hwrm_nvm_modify_input req = {0};
 	struct iflib_dma_info dma_data;
 	int rc;
 	uint32_t old_timeo;
 
 	if (length == 0 || !data)
 		return EINVAL;
 	rc = iflib_dma_alloc(softc->ctx, length, &dma_data,
 	    BUS_DMA_NOWAIT);
 	if (rc)
 		return ENOMEM;
 	if (cpyin) {
 		rc = copyin(data, dma_data.idi_vaddr, length);
 		if (rc)
 			goto exit;
 	}
 	else
 		memcpy(dma_data.idi_vaddr, data, length);
 	bus_dmamap_sync(dma_data.idi_tag, dma_data.idi_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_NVM_MODIFY);
 	req.host_src_addr = htole64(dma_data.idi_paddr);
 	req.dir_idx = htole16(index);
 	req.offset = htole32(offset);
 	req.len = htole32(length);
 	BNXT_HWRM_LOCK(softc);
 	old_timeo = softc->hwrm_cmd_timeo;
 	softc->hwrm_cmd_timeo = BNXT_NVM_TIMEO;
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	softc->hwrm_cmd_timeo = old_timeo;
 	BNXT_HWRM_UNLOCK(softc);
 
 exit:
 	iflib_dma_free(&dma_data);
 	return rc;
 }
 
 int
 bnxt_hwrm_fw_reset(struct bnxt_softc *softc, uint8_t processor,
     uint8_t *selfreset)
 {
 	struct hwrm_fw_reset_input req = {0};
 	struct hwrm_fw_reset_output *resp =
 	    (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	int rc;
 
 	MPASS(selfreset);
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_FW_RESET);
 	req.embedded_proc_type = processor;
 	req.selfrst_status = *selfreset;
 
 	BNXT_HWRM_LOCK(softc);
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	if (rc)
 		goto exit;
 	*selfreset = resp->selfrst_status;
 
 exit:
 	BNXT_HWRM_UNLOCK(softc);
 	return rc;
 }
 
 int
 bnxt_hwrm_fw_qstatus(struct bnxt_softc *softc, uint8_t type, uint8_t *selfreset)
 {
 	struct hwrm_fw_qstatus_input req = {0};
 	struct hwrm_fw_qstatus_output *resp =
 	    (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	int rc;
 
 	MPASS(selfreset);
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_FW_QSTATUS);
 	req.embedded_proc_type = type;
 
 	BNXT_HWRM_LOCK(softc);
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	if (rc)
 		goto exit;
 	*selfreset = resp->selfrst_status;
 
 exit:
 	BNXT_HWRM_UNLOCK(softc);
 	return rc;
 }
 
 /*
  * Write an NVM directory entry through HWRM_NVM_WRITE.
  *
  * When data_length is non-zero the payload is staged in a temporary DMA
  * buffer, filled with copyin() if 'cpyin' is true and memcpy() otherwise;
  * with a zero length no buffer is allocated and a source address of 0 is
  * sent.  'keep' sets the KEEP_ORIG_ACTIVE_IMG flag.  '*item_length', when
  * non-NULL, is sent as the requested item length and overwritten with the
  * value from the response on success; '*index', when non-NULL, receives
  * the directory index from the response.
  *
  * Returns ENOMEM if the DMA buffer cannot be allocated, the copyin() error
  * if the copy fails, otherwise the result of _hwrm_send_message().
  */
 int
 bnxt_hwrm_nvm_write(struct bnxt_softc *softc, void *data, bool cpyin,
     uint16_t type, uint16_t ordinal, uint16_t ext, uint16_t attr,
     uint16_t option, uint32_t data_length, bool keep, uint32_t *item_length,
     uint16_t *index)
 {
 	struct hwrm_nvm_write_input req = {0};
 	struct hwrm_nvm_write_output *resp =
 	    (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	struct iflib_dma_info dma_data;
 	int rc;
 	uint32_t old_timeo;
 
 	if (data_length) {
 		rc = iflib_dma_alloc(softc->ctx, data_length, &dma_data,
 		    BUS_DMA_NOWAIT);
 		if (rc)
 			return ENOMEM;
 		if (cpyin) {
 			rc = copyin(data, dma_data.idi_vaddr, data_length);
 			if (rc)
 				goto early_exit;
 		}
 		else
 			memcpy(dma_data.idi_vaddr, data, data_length);
 		bus_dmamap_sync(dma_data.idi_tag, dma_data.idi_map,
 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	}
 	else
 		/* Only idi_paddr of the otherwise unset dma_data is used. */
 		dma_data.idi_paddr = 0;
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_NVM_WRITE);
 
 	req.host_src_addr = htole64(dma_data.idi_paddr);
 	req.dir_type = htole16(type);
 	req.dir_ordinal = htole16(ordinal);
 	req.dir_ext = htole16(ext);
 	req.dir_attr = htole16(attr);
 	req.dir_data_length = htole32(data_length);
 	req.option = htole16(option);
 	if (keep) {
 		req.flags =
 		    htole16(HWRM_NVM_WRITE_INPUT_FLAGS_KEEP_ORIG_ACTIVE_IMG);
 	}
 	if (item_length)
 		req.dir_item_length = htole32(*item_length);
 
 	BNXT_HWRM_LOCK(softc);
 	/* Swap in the NVM timeout for the duration of this command only. */
 	old_timeo = softc->hwrm_cmd_timeo;
 	softc->hwrm_cmd_timeo = BNXT_NVM_TIMEO;
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	softc->hwrm_cmd_timeo = old_timeo;
 	if (rc)
 		goto exit;
 	if (item_length)
 		*item_length = le32toh(resp->dir_item_length);
 	if (index)
 		*index = le16toh(resp->dir_idx);
 
 	/* 'exit' is reached with the lock held, 'early_exit' without it. */
 exit:
 	BNXT_HWRM_UNLOCK(softc);
 early_exit:
 	if (data_length)
 		iflib_dma_free(&dma_data);
 	return rc;
 }
 
 int
 bnxt_hwrm_nvm_erase_dir_entry(struct bnxt_softc *softc, uint16_t index)
 {
 	struct hwrm_nvm_erase_dir_entry_input req = {0};
 	uint32_t old_timeo;
 	int rc;
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_NVM_ERASE_DIR_ENTRY);
 	req.dir_idx = htole16(index);
 	BNXT_HWRM_LOCK(softc);
 	old_timeo = softc->hwrm_cmd_timeo;
 	softc->hwrm_cmd_timeo = BNXT_NVM_TIMEO;
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	softc->hwrm_cmd_timeo = old_timeo;
 	BNXT_HWRM_UNLOCK(softc);
 	return rc;
 }
 
 int
 bnxt_hwrm_nvm_get_dir_info(struct bnxt_softc *softc, uint32_t *entries,
     uint32_t *entry_length)
 {
 	struct hwrm_nvm_get_dir_info_input req = {0};
 	struct hwrm_nvm_get_dir_info_output *resp =
 	    (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	int rc;
 	uint32_t old_timeo;
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_NVM_GET_DIR_INFO);
 
 	BNXT_HWRM_LOCK(softc);
 	old_timeo = softc->hwrm_cmd_timeo;
 	softc->hwrm_cmd_timeo = BNXT_NVM_TIMEO;
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	softc->hwrm_cmd_timeo = old_timeo;
 	if (rc)
 		goto exit;
 
 	if (entries)
 		*entries = le32toh(resp->entries);
 	if (entry_length)
 		*entry_length = le32toh(resp->entry_length);
 
 exit:
 	BNXT_HWRM_UNLOCK(softc);
 	return rc;
 }
 
 int
 bnxt_hwrm_nvm_get_dir_entries(struct bnxt_softc *softc, uint32_t *entries,
     uint32_t *entry_length, struct iflib_dma_info *dma_data)
 {
 	struct hwrm_nvm_get_dir_entries_input req = {0};
 	uint32_t ent;
 	uint32_t ent_len;
 	int rc;
 	uint32_t old_timeo;
 
 	if (!entries)
 		entries = &ent;
 	if (!entry_length)
 		entry_length = &ent_len;
 
 	rc = bnxt_hwrm_nvm_get_dir_info(softc, entries, entry_length);
 	if (rc)
 		goto exit;
 	if (*entries * *entry_length > dma_data->idi_size) {
 		rc = EINVAL;
 		goto exit;
 	}
 
 	/*
 	 * TODO: There's a race condition here that could blow up DMA memory...
 	 *	 we need to allocate the max size, not the currently in use
 	 *	 size.  The command should totally have a max size here.
 	 */
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_NVM_GET_DIR_ENTRIES);
 	req.host_dest_addr = htole64(dma_data->idi_paddr);
 	BNXT_HWRM_LOCK(softc);
 	old_timeo = softc->hwrm_cmd_timeo;
 	softc->hwrm_cmd_timeo = BNXT_NVM_TIMEO;
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	softc->hwrm_cmd_timeo = old_timeo;
 	BNXT_HWRM_UNLOCK(softc);
 	if (rc)
 		goto exit;
 	bus_dmamap_sync(dma_data->idi_tag, dma_data->idi_map,
 	    BUS_DMASYNC_POSTWRITE);
 
 exit:
 	return rc;
 }
 
 int
 bnxt_hwrm_nvm_get_dev_info(struct bnxt_softc *softc, uint16_t *mfg_id,
     uint16_t *device_id, uint32_t *sector_size, uint32_t *nvram_size,
     uint32_t *reserved_size, uint32_t *available_size)
 {
 	struct hwrm_nvm_get_dev_info_input req = {0};
 	struct hwrm_nvm_get_dev_info_output *resp =
 	    (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	int rc;
 	uint32_t old_timeo;
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_NVM_GET_DEV_INFO);
 
 	BNXT_HWRM_LOCK(softc);
 	old_timeo = softc->hwrm_cmd_timeo;
 	softc->hwrm_cmd_timeo = BNXT_NVM_TIMEO;
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	softc->hwrm_cmd_timeo = old_timeo;
 	if (rc)
 		goto exit;
 
 	if (mfg_id)
 		*mfg_id = le16toh(resp->manufacturer_id);
 	if (device_id)
 		*device_id = le16toh(resp->device_id);
 	if (sector_size)
 		*sector_size = le32toh(resp->sector_size);
 	if (nvram_size)
 		*nvram_size = le32toh(resp->nvram_size);
 	if (reserved_size)
 		*reserved_size = le32toh(resp->reserved_size);
 	if (available_size)
 		*available_size = le32toh(resp->available_size);
 
 exit:
 	BNXT_HWRM_UNLOCK(softc);
 	return rc;
 }
 
 /*
  * Ask the firmware to install a staged NVM update through
  * HWRM_NVM_INSTALL_UPDATE.  Every output pointer is optional; on success
  * each non-NULL one receives the corresponding response field.  Returns
  * the result of _hwrm_send_message().
  */
 int
 bnxt_hwrm_nvm_install_update(struct bnxt_softc *softc,
     uint32_t install_type, uint64_t *installed_items, uint8_t *result,
     uint8_t *problem_item, uint8_t *reset_required)
 {
 	struct hwrm_nvm_install_update_input req = {0};
 	struct hwrm_nvm_install_update_output *resp =
 	    (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	int rc;
 	uint32_t old_timeo;
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_NVM_INSTALL_UPDATE);
 	req.install_type = htole32(install_type);
 
 	BNXT_HWRM_LOCK(softc);
 	/* Swap in the NVM timeout for the duration of this command only. */
 	old_timeo = softc->hwrm_cmd_timeo;
 	softc->hwrm_cmd_timeo = BNXT_NVM_TIMEO;
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	softc->hwrm_cmd_timeo = old_timeo;
 	if (rc)
 		goto exit;
 
 	/* 64-bit field: a 32-bit conversion would drop the upper half. */
 	if (installed_items)
 		*installed_items = le64toh(resp->installed_items);
 	if (result)
 		*result = resp->result;
 	if (problem_item)
 		*problem_item = resp->problem_item;
 	if (reset_required)
 		*reset_required = resp->reset_required;
 
 exit:
 	BNXT_HWRM_UNLOCK(softc);
 	return rc;
 }
 
 int
 bnxt_hwrm_nvm_verify_update(struct bnxt_softc *softc, uint16_t type,
     uint16_t ordinal, uint16_t ext)
 {
 	struct hwrm_nvm_verify_update_input req = {0};
 	uint32_t old_timeo;
 	int rc;
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_NVM_VERIFY_UPDATE);
 
 	req.dir_type = htole16(type);
 	req.dir_ordinal = htole16(ordinal);
 	req.dir_ext = htole16(ext);
 
 	BNXT_HWRM_LOCK(softc);
 	old_timeo = softc->hwrm_cmd_timeo;
 	softc->hwrm_cmd_timeo = BNXT_NVM_TIMEO;
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	softc->hwrm_cmd_timeo = old_timeo;
 	BNXT_HWRM_UNLOCK(softc);
 	return rc;
 }
 
 int
 bnxt_hwrm_fw_get_time(struct bnxt_softc *softc, uint16_t *year, uint8_t *month,
     uint8_t *day, uint8_t *hour, uint8_t *minute, uint8_t *second,
     uint16_t *millisecond, uint16_t *zone)
 {
 	struct hwrm_fw_get_time_input req = {0};
 	struct hwrm_fw_get_time_output *resp =
 	    (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	int rc;
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_FW_GET_TIME);
 
 	BNXT_HWRM_LOCK(softc);
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	if (rc)
 		goto exit;
 
 	if (year)
 		*year = le16toh(resp->year);
 	if (month)
 		*month = resp->month;
 	if (day)
 		*day = resp->day;
 	if (hour)
 		*hour = resp->hour;
 	if (minute)
 		*minute = resp->minute;
 	if (second)
 		*second = resp->second;
 	if (millisecond)
 		*millisecond = le16toh(resp->millisecond);
 	if (zone)
 		*zone = le16toh(resp->zone);
 
 exit:
 	BNXT_HWRM_UNLOCK(softc);
 	return rc;
 }
 
 int
 bnxt_hwrm_fw_set_time(struct bnxt_softc *softc, uint16_t year, uint8_t month,
     uint8_t day, uint8_t hour, uint8_t minute, uint8_t second,
     uint16_t millisecond, uint16_t zone)
 {
 	struct hwrm_fw_set_time_input req = {0};
 
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_FW_SET_TIME);
 
 	req.year = htole16(year);
 	req.month = month;
 	req.day = day;
 	req.hour = hour;
 	req.minute = minute;
 	req.second = second;
 	req.millisecond = htole16(millisecond);
 	req.zone = htole16(zone);
 	return hwrm_send_message(softc, &req, sizeof(req));
 }
 
 /*
  * Query the PHY through HWRM_PORT_PHY_QCFG and refresh softc->link_info
  * and the PHY-related strings in softc->ver_info from the response.
  * Nothing is updated if the command fails.  Returns the result of
  * _hwrm_send_message().
  */
 int
 bnxt_hwrm_port_phy_qcfg(struct bnxt_softc *softc)
 {
 	struct bnxt_link_info *link_info = &softc->link_info;
 	struct hwrm_port_phy_qcfg_input req = {0};
 	struct hwrm_port_phy_qcfg_output *resp =
 	    (void *)softc->hwrm_cmd_resp.idi_vaddr;
 	int rc = 0;
 
 	/* The lock is held until all response fields have been copied. */
 	BNXT_HWRM_LOCK(softc);
 	bnxt_hwrm_cmd_hdr_init(softc, &req, HWRM_PORT_PHY_QCFG);
 
 	rc = _hwrm_send_message(softc, &req, sizeof(req));
 	if (rc)
 		goto exit;
 
 	link_info->phy_link_status = resp->link;
 	link_info->duplex =  resp->duplex;
 	link_info->pause = resp->pause;
 	link_info->auto_mode = resp->auto_mode;
 	link_info->auto_pause = resp->auto_pause;
 	link_info->force_pause = resp->force_pause;
 	/* duplex and duplex_setting are both taken from resp->duplex. */
 	link_info->duplex_setting = resp->duplex;
 	/* The reported speed is only recorded while the link is up. */
 	if (link_info->phy_link_status == HWRM_PORT_PHY_QCFG_OUTPUT_LINK_LINK)
 		link_info->link_speed = le16toh(resp->link_speed);
 	else
 		link_info->link_speed = 0;
 	link_info->force_link_speed = le16toh(resp->force_link_speed);
 	link_info->auto_link_speed = le16toh(resp->auto_link_speed);
 	link_info->support_speeds = le16toh(resp->support_speeds);
 	link_info->auto_link_speeds = le16toh(resp->auto_link_speed_mask);
 	link_info->preemphasis = le32toh(resp->preemphasis);
 	link_info->phy_ver[0] = resp->phy_maj;
 	link_info->phy_ver[1] = resp->phy_min;
 	link_info->phy_ver[2] = resp->phy_bld;
 	/* Keep a printable "maj.min.bld" copy of the PHY version. */
 	snprintf(softc->ver_info->phy_ver, sizeof(softc->ver_info->phy_ver),
 	    "%d.%d.%d", link_info->phy_ver[0], link_info->phy_ver[1],
 	    link_info->phy_ver[2]);
 	strlcpy(softc->ver_info->phy_vendor, resp->phy_vendor_name,
 	    BNXT_NAME_SIZE);
 	strlcpy(softc->ver_info->phy_partnumber, resp->phy_vendor_partnumber,
 	    BNXT_NAME_SIZE);
 	link_info->media_type = resp->media_type;
 	link_info->phy_type = resp->phy_type;
 	link_info->transceiver = resp->xcvr_pkg_type;
 	/* The PHY address shares a byte with the EEE config; mask it out. */
 	link_info->phy_addr = resp->eee_config_phy_addr &
 	    HWRM_PORT_PHY_QCFG_OUTPUT_PHY_ADDR_MASK;
 
 exit:
 	BNXT_HWRM_UNLOCK(softc);
 	return rc;
 }
Index: stable/11/sys/dev/bnxt/bnxt_txrx.c
===================================================================
--- stable/11/sys/dev/bnxt/bnxt_txrx.c	(revision 333337)
+++ stable/11/sys/dev/bnxt/bnxt_txrx.c	(revision 333338)
@@ -1,641 +1,660 @@
 /*-
  * Broadcom NetXtreme-C/E network driver.
  *
  * Copyright (c) 2016 Broadcom, All Rights Reserved.
  * The term Broadcom refers to Broadcom Limited and/or its subsidiaries
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS'
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/endian.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/ethernet.h>
 #include <net/iflib.h>
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_rss.h"
 
 #include "bnxt.h"
 
 /*
  * Function prototypes
  */
 
 static int bnxt_isc_txd_encap(void *sc, if_pkt_info_t pi);
-static void bnxt_isc_txd_flush(void *sc, uint16_t txqid, uint32_t pidx);
-static int bnxt_isc_txd_credits_update(void *sc, uint16_t txqid, uint32_t cidx,
-    bool clear);
+static void bnxt_isc_txd_flush(void *sc, uint16_t txqid, qidx_t pidx);
+static int bnxt_isc_txd_credits_update(void *sc, uint16_t txqid, bool clear);
 
-static void bnxt_isc_rxd_refill(void *sc, uint16_t rxqid, uint8_t flid,
+static void bnxt_isc_rxd_refill(void *sc, if_rxd_update_t iru);
+
+/* Former args, now fields of if_rxd_update_t: uint16_t rxqid, uint8_t flid,
     uint32_t pidx, uint64_t *paddrs, caddr_t *vaddrs, uint16_t count,
     uint16_t buf_size);
+*/
 static void bnxt_isc_rxd_flush(void *sc, uint16_t rxqid, uint8_t flid,
-    uint32_t pidx);
-static int bnxt_isc_rxd_available(void *sc, uint16_t rxqid, uint32_t idx,
-    int budget);
+    qidx_t pidx);
+static int bnxt_isc_rxd_available(void *sc, uint16_t rxqid, qidx_t idx,
+    qidx_t budget);
 static int bnxt_isc_rxd_pkt_get(void *sc, if_rxd_info_t ri);
 
 static int bnxt_intr(void *sc);
 
 struct if_txrx bnxt_txrx  = {
 	bnxt_isc_txd_encap,
 	bnxt_isc_txd_flush,
 	bnxt_isc_txd_credits_update,
 	bnxt_isc_rxd_available,
 	bnxt_isc_rxd_pkt_get,
 	bnxt_isc_rxd_refill,
 	bnxt_isc_rxd_flush,
 	bnxt_intr
 };
 
 /*
  * Device Dependent Packet Transmit and Receive Functions
  */
 
 static const uint16_t bnxt_tx_lhint[] = {
 	TX_BD_SHORT_FLAGS_LHINT_LT512,
 	TX_BD_SHORT_FLAGS_LHINT_LT1K,
 	TX_BD_SHORT_FLAGS_LHINT_LT2K,
 	TX_BD_SHORT_FLAGS_LHINT_LT2K,
 	TX_BD_SHORT_FLAGS_LHINT_GTE2K,
 };
 
 static int
 bnxt_isc_txd_encap(void *sc, if_pkt_info_t pi)
 {
 	struct bnxt_softc *softc = (struct bnxt_softc *)sc;
 	struct bnxt_ring *txr = &softc->tx_rings[pi->ipi_qsidx];
 	struct tx_bd_long *tbd;
 	struct tx_bd_long_hi *tbdh;
 	bool need_hi = false;
 	uint16_t flags_type;
 	uint16_t lflags;
 	uint32_t cfa_meta;
 	int seg = 0;
 
 	/* If we have offloads enabled, we need to use two BDs. */
 	if ((pi->ipi_csum_flags & (CSUM_OFFLOAD | CSUM_TSO | CSUM_IP)) ||
 	    pi->ipi_mflags & M_VLANTAG)
 		need_hi = true;
 
 	/* TODO: Devices before Cu+B1 need to not mix long and short BDs */
 	need_hi = true;
 
 	pi->ipi_new_pidx = pi->ipi_pidx;
 	tbd = &((struct tx_bd_long *)txr->vaddr)[pi->ipi_new_pidx];
 	pi->ipi_ndescs = 0;
 	/* No need to byte-swap the opaque value */
 	tbd->opaque = ((pi->ipi_nsegs + need_hi) << 24) | pi->ipi_new_pidx;
 	tbd->len = htole16(pi->ipi_segs[seg].ds_len);
 	tbd->addr = htole64(pi->ipi_segs[seg++].ds_addr);
 	flags_type = ((pi->ipi_nsegs + need_hi) <<
 	    TX_BD_SHORT_FLAGS_BD_CNT_SFT) & TX_BD_SHORT_FLAGS_BD_CNT_MASK;
 	if (pi->ipi_len >= 2048)
 		flags_type |= TX_BD_SHORT_FLAGS_LHINT_GTE2K;
 	else
 		flags_type |= bnxt_tx_lhint[pi->ipi_len >> 9];
 
 	if (need_hi) {
 		flags_type |= TX_BD_LONG_TYPE_TX_BD_LONG;
 
 		pi->ipi_new_pidx = RING_NEXT(txr, pi->ipi_new_pidx);
 		tbdh = &((struct tx_bd_long_hi *)txr->vaddr)[pi->ipi_new_pidx];
 		tbdh->mss = htole16(pi->ipi_tso_segsz);
 		tbdh->hdr_size = htole16((pi->ipi_ehdrlen + pi->ipi_ip_hlen +
 		    pi->ipi_tcp_hlen) >> 1);
 		tbdh->cfa_action = 0;
 		lflags = 0;
 		cfa_meta = 0;
 		if (pi->ipi_mflags & M_VLANTAG) {
 			/* TODO: Do we need to byte-swap the vtag here? */
 			cfa_meta = TX_BD_LONG_CFA_META_KEY_VLAN_TAG |
 			    pi->ipi_vtag;
 			cfa_meta |= TX_BD_LONG_CFA_META_VLAN_TPID_TPID8100;
 		}
 		tbdh->cfa_meta = htole32(cfa_meta);
 		if (pi->ipi_csum_flags & CSUM_TSO) {
 			lflags |= TX_BD_LONG_LFLAGS_LSO |
 			    TX_BD_LONG_LFLAGS_T_IPID;
 		}
 		else if(pi->ipi_csum_flags & CSUM_OFFLOAD) {
 			lflags |= TX_BD_LONG_LFLAGS_TCP_UDP_CHKSUM |
 			    TX_BD_LONG_LFLAGS_IP_CHKSUM;
 		}
 		else if(pi->ipi_csum_flags & CSUM_IP) {
 			lflags |= TX_BD_LONG_LFLAGS_IP_CHKSUM;
 		}
 		tbdh->lflags = htole16(lflags);
 	}
 	else {
 		flags_type |= TX_BD_SHORT_TYPE_TX_BD_SHORT;
 	}
 
 	for (; seg < pi->ipi_nsegs; seg++) {
 		tbd->flags_type = htole16(flags_type);
 		pi->ipi_new_pidx = RING_NEXT(txr, pi->ipi_new_pidx);
 		tbd = &((struct tx_bd_long *)txr->vaddr)[pi->ipi_new_pidx];
 		tbd->len = htole16(pi->ipi_segs[seg].ds_len);
 		tbd->addr = htole64(pi->ipi_segs[seg].ds_addr);
 		flags_type = TX_BD_SHORT_TYPE_TX_BD_SHORT;
 	}
 	flags_type |= TX_BD_SHORT_FLAGS_PACKET_END;
 	tbd->flags_type = htole16(flags_type);
 	pi->ipi_new_pidx = RING_NEXT(txr, pi->ipi_new_pidx);
 
 	return 0;
 }
 
 static void
-bnxt_isc_txd_flush(void *sc, uint16_t txqid, uint32_t pidx)
+bnxt_isc_txd_flush(void *sc, uint16_t txqid, qidx_t pidx)
 {
 	struct bnxt_softc *softc = (struct bnxt_softc *)sc;
 	struct bnxt_ring *tx_ring = &softc->tx_rings[txqid];
 
 	/* pidx is what we last set ipi_new_pidx to */
 	BNXT_TX_DB(tx_ring, pidx);
 	/* TODO: Cumulus+ doesn't need the double doorbell */
 	BNXT_TX_DB(tx_ring, pidx);
 	return;
 }
 
 static int
-bnxt_isc_txd_credits_update(void *sc, uint16_t txqid, uint32_t idx, bool clear)
+bnxt_isc_txd_credits_update(void *sc, uint16_t txqid, bool clear)
 {
 	struct bnxt_softc *softc = (struct bnxt_softc *)sc;
 	struct bnxt_cp_ring *cpr = &softc->tx_cp_rings[txqid];
 	struct tx_cmpl *cmpl = (struct tx_cmpl *)cpr->ring.vaddr;
 	int avail = 0;
 	uint32_t cons = cpr->cons;
 	bool v_bit = cpr->v_bit;
 	bool last_v_bit;
 	uint32_t last_cons;
 	uint16_t type;
 	uint16_t err;
 
 	for (;;) {
 		last_cons = cons;
 		last_v_bit = v_bit;
 		NEXT_CP_CONS_V(&cpr->ring, cons, v_bit);
 		CMPL_PREFETCH_NEXT(cpr, cons);
 
 		if (!CMP_VALID(&cmpl[cons], v_bit))
 			goto done;
 
 		type = cmpl[cons].flags_type & TX_CMPL_TYPE_MASK;
 		switch (type) {
 		case TX_CMPL_TYPE_TX_L2:
 			err = (le16toh(cmpl[cons].errors_v) &
 			    TX_CMPL_ERRORS_BUFFER_ERROR_MASK) >>
 			    TX_CMPL_ERRORS_BUFFER_ERROR_SFT;
 			if (err)
 				device_printf(softc->dev,
 				    "TX completion error %u\n", err);
 			/* No need to byte-swap the opaque value */
 			avail += cmpl[cons].opaque >> 24;
 			/*
 			 * If we're not clearing, iflib only cares if there's
 			 * at least one buffer.  Don't scan the whole ring in
 			 * this case.
 			 */
 			if (!clear)
 				goto done;
 			break;
 		default:
 			if (type & 1) {
 				NEXT_CP_CONS_V(&cpr->ring, cons, v_bit);
 				if (!CMP_VALID(&cmpl[cons], v_bit))
 					goto done;
 			}
 			device_printf(softc->dev,
 			    "Unhandled TX completion type %u\n", type);
 			break;
 		}
 	}
 done:
 
 	if (clear && avail) {
 		cpr->cons = last_cons;
 		cpr->v_bit = last_v_bit;
 		BNXT_CP_IDX_DISABLE_DB(&cpr->ring, cpr->cons);
 	}
 
 	return avail;
 }
 
 static void
-bnxt_isc_rxd_refill(void *sc, uint16_t rxqid, uint8_t flid,
-				uint32_t pidx, uint64_t *paddrs,
-				caddr_t *vaddrs, uint16_t count, uint16_t len)
+bnxt_isc_rxd_refill(void *sc, if_rxd_update_t iru)
 {
 	struct bnxt_softc *softc = (struct bnxt_softc *)sc;
 	struct bnxt_ring *rx_ring;
 	struct rx_prod_pkt_bd *rxbd;
 	uint16_t type;
 	uint16_t i;
+	uint16_t rxqid;
+	uint16_t count, len;
+	uint32_t pidx;
+	uint8_t flid;
+	uint64_t *paddrs;
+	caddr_t *vaddrs;
+	qidx_t	*frag_idxs;
 
+	rxqid = iru->iru_qsidx;
+	count = iru->iru_count;
+	len = iru->iru_buf_size;
+	pidx = iru->iru_pidx;
+	flid = iru->iru_flidx;
+	vaddrs = iru->iru_vaddrs;
+	paddrs = iru->iru_paddrs;
+	frag_idxs = iru->iru_idxs;
+
 	if (flid == 0) {
 		rx_ring = &softc->rx_rings[rxqid];
 		type = RX_PROD_PKT_BD_TYPE_RX_PROD_PKT;
 	}
 	else {
 		rx_ring = &softc->ag_rings[rxqid];
 		type = RX_PROD_AGG_BD_TYPE_RX_PROD_AGG;
 	}
 	rxbd = (void *)rx_ring->vaddr;
 
 	for (i=0; i<count; i++) {
 		rxbd[pidx].flags_type = htole16(type);
 		rxbd[pidx].len = htole16(len);
 		/* No need to byte-swap the opaque value */
-		rxbd[pidx].opaque = ((rxqid & 0xff) << 24) | (flid << 16)
-		    | pidx;
+		rxbd[pidx].opaque = (((rxqid & 0xff) << 24) | (flid << 16)
+		    | (frag_idxs[i]));
 		rxbd[pidx].addr = htole64(paddrs[i]);
 		if (++pidx == rx_ring->ring_size)
 			pidx = 0;
 	}
 	return;
 }
 
 static void
 bnxt_isc_rxd_flush(void *sc, uint16_t rxqid, uint8_t flid,
-    uint32_t pidx)
+    qidx_t pidx)
 {
 	struct bnxt_softc *softc = (struct bnxt_softc *)sc;
 	struct bnxt_ring *rx_ring;
 
 	if (flid == 0)
 		rx_ring = &softc->rx_rings[rxqid];
 	else
 		rx_ring = &softc->ag_rings[rxqid];
 
 	/*
 	 * We *must* update the completion ring before updating the RX ring
 	 * or we will overrun the completion ring and the device will wedge for
 	 * RX.
 	 */
 	if (softc->rx_cp_rings[rxqid].cons != UINT32_MAX)
 		BNXT_CP_IDX_DISABLE_DB(&softc->rx_cp_rings[rxqid].ring,
 		    softc->rx_cp_rings[rxqid].cons);
 	/* We're given the last filled RX buffer here, not the next empty one */
 	BNXT_RX_DB(rx_ring, RING_NEXT(rx_ring, pidx));
 	/* TODO: Cumulus+ doesn't need the double doorbell */
 	BNXT_RX_DB(rx_ring, RING_NEXT(rx_ring, pidx));
 	return;
 }
 
 static int
-bnxt_isc_rxd_available(void *sc, uint16_t rxqid, uint32_t idx, int budget)
+bnxt_isc_rxd_available(void *sc, uint16_t rxqid, qidx_t idx, qidx_t budget)
 {
 	struct bnxt_softc *softc = (struct bnxt_softc *)sc;
 	struct bnxt_cp_ring *cpr = &softc->rx_cp_rings[rxqid];
 	struct rx_pkt_cmpl *rcp;
-	struct rx_tpa_start_cmpl *rtpa;
 	struct rx_tpa_end_cmpl *rtpae;
 	struct cmpl_base *cmp = (struct cmpl_base *)cpr->ring.vaddr;
 	int avail = 0;
 	uint32_t cons = cpr->cons;
 	bool v_bit = cpr->v_bit;
 	uint8_t ags;
 	int i;
 	uint16_t type;
-	uint8_t agg_id;
 
 	for (;;) {
 		NEXT_CP_CONS_V(&cpr->ring, cons, v_bit);
 		CMPL_PREFETCH_NEXT(cpr, cons);
 
 		if (!CMP_VALID(&cmp[cons], v_bit))
 			goto cmpl_invalid;
 
 		type = le16toh(cmp[cons].type) & CMPL_BASE_TYPE_MASK;
 		switch (type) {
 		case CMPL_BASE_TYPE_RX_L2:
 			rcp = (void *)&cmp[cons];
 			ags = (rcp->agg_bufs_v1 & RX_PKT_CMPL_AGG_BUFS_MASK) >>
 			    RX_PKT_CMPL_AGG_BUFS_SFT;
 			NEXT_CP_CONS_V(&cpr->ring, cons, v_bit);
 			CMPL_PREFETCH_NEXT(cpr, cons);
 
 			if (!CMP_VALID(&cmp[cons], v_bit))
 				goto cmpl_invalid;
 
 			/* Now account for all the AG completions */
 			for (i=0; i<ags; i++) {
 				NEXT_CP_CONS_V(&cpr->ring, cons, v_bit);
 				CMPL_PREFETCH_NEXT(cpr, cons);
 				if (!CMP_VALID(&cmp[cons], v_bit))
 					goto cmpl_invalid;
 			}
 			avail++;
 			break;
 		case CMPL_BASE_TYPE_RX_TPA_END:
 			rtpae = (void *)&cmp[cons];
 			ags = (rtpae->agg_bufs_v1 &
 			    RX_TPA_END_CMPL_AGG_BUFS_MASK) >>
 			    RX_TPA_END_CMPL_AGG_BUFS_SFT;
 			NEXT_CP_CONS_V(&cpr->ring, cons, v_bit);
 			CMPL_PREFETCH_NEXT(cpr, cons);
 
 			if (!CMP_VALID(&cmp[cons], v_bit))
 				goto cmpl_invalid;
 			/* Now account for all the AG completions */
 			for (i=0; i<ags; i++) {
 				NEXT_CP_CONS_V(&cpr->ring, cons, v_bit);
 				CMPL_PREFETCH_NEXT(cpr, cons);
 				if (!CMP_VALID(&cmp[cons], v_bit))
 					goto cmpl_invalid;
 			}
 			avail++;
 			break;
 		case CMPL_BASE_TYPE_RX_TPA_START:
-			rtpa = (void *)&cmp[cons];
-			agg_id = (rtpa->agg_id &
-			    RX_TPA_START_CMPL_AGG_ID_MASK) >>
-			    RX_TPA_START_CMPL_AGG_ID_SFT;
-			softc->tpa_start[agg_id].low = *rtpa;
 			NEXT_CP_CONS_V(&cpr->ring, cons, v_bit);
 			CMPL_PREFETCH_NEXT(cpr, cons);
 
 			if (!CMP_VALID(&cmp[cons], v_bit))
 				goto cmpl_invalid;
-			softc->tpa_start[agg_id].high =
-			    ((struct rx_tpa_start_cmpl_hi *)cmp)[cons];
 			break;
 		case CMPL_BASE_TYPE_RX_AGG:
 			break;
 		default:
 			device_printf(softc->dev,
 			    "Unhandled completion type %d on RXQ %d\n",
 			    type, rxqid);
 
 			/* Odd completion types use two completions */
 			if (type & 1) {
 				NEXT_CP_CONS_V(&cpr->ring, cons, v_bit);
 				CMPL_PREFETCH_NEXT(cpr, cons);
 
 				if (!CMP_VALID(&cmp[cons], v_bit))
 					goto cmpl_invalid;
 			}
 			break;
 		}
 		if (avail > budget)
 			break;
 	}
 cmpl_invalid:
 
 	return avail;
 }
 
 static int
 bnxt_pkt_get_l2(struct bnxt_softc *softc, if_rxd_info_t ri,
     struct bnxt_cp_ring *cpr, uint16_t flags_type)
 {
 	struct rx_pkt_cmpl *rcp;
 	struct rx_pkt_cmpl_hi *rcph;
 	struct rx_abuf_cmpl *acp;
 	uint32_t flags2;
 	uint32_t errors;
 	uint8_t	ags;
 	int i;
 
 	rcp = &((struct rx_pkt_cmpl *)cpr->ring.vaddr)[cpr->cons];
 
 	/* Extract from the first 16-byte BD */
 	if (flags_type & RX_PKT_CMPL_FLAGS_RSS_VALID) {
 		ri->iri_flowid = le32toh(rcp->rss_hash);
 		/*
 		 * TODO: Extract something useful from rcp->rss_hash_type
 		 * (undocumented)
 		 * May be documented in the "LSI ES"
 		 * also check the firmware code.
 		 */
 		ri->iri_rsstype = M_HASHTYPE_OPAQUE;
 	}
 	else {
 		ri->iri_rsstype = M_HASHTYPE_NONE;
 	}
 	ags = (rcp->agg_bufs_v1 & RX_PKT_CMPL_AGG_BUFS_MASK) >>
 	    RX_PKT_CMPL_AGG_BUFS_SFT;
 	ri->iri_nfrags = ags + 1;
 	/* No need to byte-swap the opaque value */
 	ri->iri_frags[0].irf_flid = (rcp->opaque >> 16) & 0xff;
 	ri->iri_frags[0].irf_idx = rcp->opaque & 0xffff;
 	ri->iri_frags[0].irf_len = le16toh(rcp->len);
 	ri->iri_len = le16toh(rcp->len);
 
 	/* Now the second 16-byte BD */
 	NEXT_CP_CONS_V(&cpr->ring, cpr->cons, cpr->v_bit);
 	ri->iri_cidx = RING_NEXT(&cpr->ring, ri->iri_cidx);
 	rcph = &((struct rx_pkt_cmpl_hi *)cpr->ring.vaddr)[cpr->cons];
 
 	flags2 = le32toh(rcph->flags2);
 	errors = le16toh(rcph->errors_v2);
 	if ((flags2 & RX_PKT_CMPL_FLAGS2_META_FORMAT_MASK) ==
 	    RX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN) {
 		ri->iri_flags |= M_VLANTAG;
 		/* TODO: Should this be the entire 16-bits? */
 		ri->iri_vtag = le32toh(rcph->metadata) &
 		    (RX_PKT_CMPL_METADATA_VID_MASK | RX_PKT_CMPL_METADATA_DE |
 		    RX_PKT_CMPL_METADATA_PRI_MASK);
 	}
 	if (flags2 & RX_PKT_CMPL_FLAGS2_IP_CS_CALC) {
 		ri->iri_csum_flags |= CSUM_IP_CHECKED;
 		if (!(errors & RX_PKT_CMPL_ERRORS_IP_CS_ERROR))
 			ri->iri_csum_flags |= CSUM_IP_VALID;
 	}
 	if (flags2 & RX_PKT_CMPL_FLAGS2_L4_CS_CALC) {
 		ri->iri_csum_flags |= CSUM_L4_CALC;
 		if (!(errors & RX_PKT_CMPL_ERRORS_L4_CS_ERROR)) {
 			ri->iri_csum_flags |= CSUM_L4_VALID;
 			ri->iri_csum_data = 0xffff;
 		}
 	}
 
 	/* And finally the ag ring stuff. */
 	for (i=1; i < ri->iri_nfrags; i++) {
 		NEXT_CP_CONS_V(&cpr->ring, cpr->cons, cpr->v_bit);
 		ri->iri_cidx = RING_NEXT(&cpr->ring, ri->iri_cidx);
 		acp = &((struct rx_abuf_cmpl *)cpr->ring.vaddr)[cpr->cons];
 
 		/* No need to byte-swap the opaque value */
 		ri->iri_frags[i].irf_flid = (acp->opaque >> 16 & 0xff);
 		ri->iri_frags[i].irf_idx = acp->opaque & 0xffff;
 		ri->iri_frags[i].irf_len = le16toh(acp->len);
 		ri->iri_len += le16toh(acp->len);
 	}
 
 	return 0;
 }
 
 static int
 bnxt_pkt_get_tpa(struct bnxt_softc *softc, if_rxd_info_t ri,
     struct bnxt_cp_ring *cpr, uint16_t flags_type)
 {
 	struct rx_tpa_end_cmpl *agend =
 	    &((struct rx_tpa_end_cmpl *)cpr->ring.vaddr)[cpr->cons];
 	struct rx_tpa_end_cmpl_hi *agendh;
 	struct rx_abuf_cmpl *acp;
 	struct bnxt_full_tpa_start *tpas;
 	uint32_t flags2;
 	uint8_t	ags;
 	uint8_t agg_id;
 	int i;
 
 	/* Get the agg_id */
 	agg_id = (agend->agg_id & RX_TPA_END_CMPL_AGG_ID_MASK) >>
 	    RX_TPA_END_CMPL_AGG_ID_SFT;
-	tpas = &softc->tpa_start[agg_id];
+	tpas = &(softc->rx_rings[ri->iri_qsidx].tpa_start[agg_id]);
 
 	/* Extract from the first 16-byte BD */
 	if (le16toh(tpas->low.flags_type) & RX_TPA_START_CMPL_FLAGS_RSS_VALID) {
 		ri->iri_flowid = le32toh(tpas->low.rss_hash);
 		/*
 		 * TODO: Extract something useful from tpas->low.rss_hash_type
 		 * (undocumented)
 		 * May be documented in the "LSI ES"
 		 * also check the firmware code.
 		 */
 		ri->iri_rsstype = M_HASHTYPE_OPAQUE;
 	}
 	else {
 		ri->iri_rsstype = M_HASHTYPE_NONE;
 	}
 	ags = (agend->agg_bufs_v1 & RX_TPA_END_CMPL_AGG_BUFS_MASK) >>
 	    RX_TPA_END_CMPL_AGG_BUFS_SFT;
 	ri->iri_nfrags = ags + 1;
 	/* No need to byte-swap the opaque value */
-	ri->iri_frags[0].irf_flid = (tpas->low.opaque >> 16) & 0xff;
-	ri->iri_frags[0].irf_idx = tpas->low.opaque & 0xffff;
+	ri->iri_frags[0].irf_flid = ((tpas->low.opaque >> 16) & 0xff);
+	ri->iri_frags[0].irf_idx = (tpas->low.opaque & 0xffff);
 	ri->iri_frags[0].irf_len = le16toh(tpas->low.len);
 	ri->iri_len = le16toh(tpas->low.len);
 
 	/* Now the second 16-byte BD */
 	NEXT_CP_CONS_V(&cpr->ring, cpr->cons, cpr->v_bit);
 	ri->iri_cidx = RING_NEXT(&cpr->ring, ri->iri_cidx);
 	agendh = &((struct rx_tpa_end_cmpl_hi *)cpr->ring.vaddr)[cpr->cons];
 
 	flags2 = le32toh(tpas->high.flags2);
 	if ((flags2 & RX_TPA_START_CMPL_FLAGS2_META_FORMAT_MASK) ==
 	    RX_TPA_START_CMPL_FLAGS2_META_FORMAT_VLAN) {
 		ri->iri_flags |= M_VLANTAG;
 		/* TODO: Should this be the entire 16-bits? */
 		ri->iri_vtag = le32toh(tpas->high.metadata) &
 		    (RX_TPA_START_CMPL_METADATA_VID_MASK |
 		    RX_TPA_START_CMPL_METADATA_DE |
 		    RX_TPA_START_CMPL_METADATA_PRI_MASK);
 	}
 	if (flags2 & RX_TPA_START_CMPL_FLAGS2_IP_CS_CALC) {
 		ri->iri_csum_flags |= CSUM_IP_CHECKED;
 		ri->iri_csum_flags |= CSUM_IP_VALID;
 	}
 	if (flags2 & RX_TPA_START_CMPL_FLAGS2_L4_CS_CALC) {
 		ri->iri_csum_flags |= CSUM_L4_CALC;
 		ri->iri_csum_flags |= CSUM_L4_VALID;
 		ri->iri_csum_data = 0xffff;
 	}
 
 	/* Now the ag ring stuff. */
 	for (i=1; i < ri->iri_nfrags; i++) {
 		NEXT_CP_CONS_V(&cpr->ring, cpr->cons, cpr->v_bit);
 		ri->iri_cidx = RING_NEXT(&cpr->ring, ri->iri_cidx);
 		acp = &((struct rx_abuf_cmpl *)cpr->ring.vaddr)[cpr->cons];
 
 		/* No need to byte-swap the opaque value */
-		ri->iri_frags[i].irf_flid = (acp->opaque >> 16) & 0xff;
-		ri->iri_frags[i].irf_idx = acp->opaque & 0xffff;
+		ri->iri_frags[i].irf_flid = ((acp->opaque >> 16) & 0xff);
+		ri->iri_frags[i].irf_idx = (acp->opaque & 0xffff);
 		ri->iri_frags[i].irf_len = le16toh(acp->len);
 		ri->iri_len += le16toh(acp->len);
 	}
 
 	/* And finally, the empty BD at the end... */
 	ri->iri_nfrags++;
 	/* No need to byte-swap the opaque value */
-	ri->iri_frags[i].irf_flid = (agend->opaque >> 16) % 0xff;
-	ri->iri_frags[i].irf_idx = agend->opaque & 0xffff;
+	ri->iri_frags[i].irf_flid = ((agend->opaque >> 16) & 0xff);
+	ri->iri_frags[i].irf_idx = (agend->opaque & 0xffff);
 	ri->iri_frags[i].irf_len = le16toh(agend->len);
 	ri->iri_len += le16toh(agend->len);
 
 	return 0;
 }
 
 /* If we return anything but zero, iflib will assert... */
 static int
 bnxt_isc_rxd_pkt_get(void *sc, if_rxd_info_t ri)
 {
 	struct bnxt_softc *softc = (struct bnxt_softc *)sc;
 	struct bnxt_cp_ring *cpr = &softc->rx_cp_rings[ri->iri_qsidx];
+	struct cmpl_base *cmp_q = (struct cmpl_base *)cpr->ring.vaddr;
 	struct cmpl_base *cmp;
+	struct rx_tpa_start_cmpl *rtpa;
 	uint16_t flags_type;
 	uint16_t type;
+	uint8_t agg_id;
 
 	for (;;) {
 		NEXT_CP_CONS_V(&cpr->ring, cpr->cons, cpr->v_bit);
 		ri->iri_cidx = RING_NEXT(&cpr->ring, ri->iri_cidx);
 		CMPL_PREFETCH_NEXT(cpr, cpr->cons);
 		cmp = &((struct cmpl_base *)cpr->ring.vaddr)[cpr->cons];
 
 		flags_type = le16toh(cmp->type);
 		type = flags_type & CMPL_BASE_TYPE_MASK;
 
 		switch (type) {
 		case CMPL_BASE_TYPE_RX_L2:
 			return bnxt_pkt_get_l2(softc, ri, cpr, flags_type);
 		case CMPL_BASE_TYPE_RX_TPA_END:
 			return bnxt_pkt_get_tpa(softc, ri, cpr, flags_type);
 		case CMPL_BASE_TYPE_RX_TPA_START:
+			rtpa = (void *)&cmp_q[cpr->cons];
+			agg_id = (rtpa->agg_id &
+			    RX_TPA_START_CMPL_AGG_ID_MASK) >>
+			    RX_TPA_START_CMPL_AGG_ID_SFT;
+			softc->rx_rings[ri->iri_qsidx].tpa_start[agg_id].low = *rtpa;
+
 			NEXT_CP_CONS_V(&cpr->ring, cpr->cons, cpr->v_bit);
 			ri->iri_cidx = RING_NEXT(&cpr->ring, ri->iri_cidx);
 			CMPL_PREFETCH_NEXT(cpr, cpr->cons);
+
+			softc->rx_rings[ri->iri_qsidx].tpa_start[agg_id].high =
+			    ((struct rx_tpa_start_cmpl_hi *)cmp_q)[cpr->cons];
 			break;
 		default:
 			device_printf(softc->dev,
 			    "Unhandled completion type %d on RXQ %d get\n",
 			    type, ri->iri_qsidx);
 			if (type & 1) {
 				NEXT_CP_CONS_V(&cpr->ring, cpr->cons,
 				    cpr->v_bit);
 				ri->iri_cidx = RING_NEXT(&cpr->ring,
 				    ri->iri_cidx);
 				CMPL_PREFETCH_NEXT(cpr, cpr->cons);
 			}
 			break;
 		}
 	}
 
 	return 0;
 }
 
 static int
 bnxt_intr(void *sc)
 {
 	struct bnxt_softc *softc = (struct bnxt_softc *)sc;
 
 	device_printf(softc->dev, "STUB: %s @ %s:%d\n", __func__, __FILE__, __LINE__);
 	return ENOSYS;
 }
Index: stable/11/sys/dev/bnxt/if_bnxt.c
===================================================================
--- stable/11/sys/dev/bnxt/if_bnxt.c	(revision 333337)
+++ stable/11/sys/dev/bnxt/if_bnxt.c	(revision 333338)
@@ -1,2421 +1,2424 @@
 /*-
  * Broadcom NetXtreme-C/E network driver.
  *
  * Copyright (c) 2016 Broadcom, All Rights Reserved.
  * The term Broadcom refers to Broadcom Limited and/or its subsidiaries
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS'
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/module.h>
 #include <sys/rman.h>
 #include <sys/endian.h>
 #include <sys/sockio.h>
 #include <sys/priv.h>
 
 #include <machine/bus.h>
 #include <machine/resource.h>
 
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 
 #include <net/if.h>
 #include <net/if_media.h>
 #include <net/if_var.h>
 #include <net/ethernet.h>
 #include <net/iflib.h>
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_rss.h"
 
 #include "ifdi_if.h"
 
 #include "bnxt.h"
 #include "bnxt_hwrm.h"
 #include "bnxt_ioctl.h"
 #include "bnxt_sysctl.h"
 #include "hsi_struct_def.h"
 
 /*
  * PCI Device ID Table
  */
 
 static pci_vendor_info_t bnxt_vendor_info_array[] =
 {
     PVID(BROADCOM_VENDOR_ID, BCM57301,
 	"Broadcom BCM57301 NetXtreme-C 10Gb Ethernet Controller"),
     PVID(BROADCOM_VENDOR_ID, BCM57302,
 	"Broadcom BCM57302 NetXtreme-C 10Gb/25Gb Ethernet Controller"),
     PVID(BROADCOM_VENDOR_ID, BCM57304,
 	"Broadcom BCM57304 NetXtreme-C 10Gb/25Gb/40Gb/50Gb Ethernet Controller"),
     PVID(BROADCOM_VENDOR_ID, BCM57311,
 	"Broadcom BCM57311 NetXtreme-C 10Gb Ethernet"),
     PVID(BROADCOM_VENDOR_ID, BCM57312,
 	"Broadcom BCM57312 NetXtreme-C 10Gb/25Gb Ethernet"),
     PVID(BROADCOM_VENDOR_ID, BCM57314,
 	"Broadcom BCM57314 NetXtreme-C 10Gb/25Gb/40Gb/50Gb Ethernet"),
     PVID(BROADCOM_VENDOR_ID, BCM57402,
 	"Broadcom BCM57402 NetXtreme-E 10Gb Ethernet Controller"),
     PVID(BROADCOM_VENDOR_ID, BCM57402_NPAR,
 	"Broadcom BCM57402 NetXtreme-E Partition"),
     PVID(BROADCOM_VENDOR_ID, BCM57404,
 	"Broadcom BCM57404 NetXtreme-E 10Gb/25Gb Ethernet Controller"),
     PVID(BROADCOM_VENDOR_ID, BCM57404_NPAR,
 	"Broadcom BCM57404 NetXtreme-E Partition"),
     PVID(BROADCOM_VENDOR_ID, BCM57406,
 	"Broadcom BCM57406 NetXtreme-E 10GBase-T Ethernet Controller"),
     PVID(BROADCOM_VENDOR_ID, BCM57406_NPAR,
 	"Broadcom BCM57406 NetXtreme-E Partition"),
     PVID(BROADCOM_VENDOR_ID, BCM57407,
 	"Broadcom BCM57407 NetXtreme-E 10GBase-T Ethernet Controller"),
     PVID(BROADCOM_VENDOR_ID, BCM57407_NPAR,
 	"Broadcom BCM57407 NetXtreme-E Ethernet Partition"),
     PVID(BROADCOM_VENDOR_ID, BCM57407_SFP,
 	"Broadcom BCM57407 NetXtreme-E 25Gb Ethernet Controller"),
     PVID(BROADCOM_VENDOR_ID, BCM57412,
 	"Broadcom BCM57412 NetXtreme-E 10Gb Ethernet"),
     PVID(BROADCOM_VENDOR_ID, BCM57412_NPAR1,
 	"Broadcom BCM57412 NetXtreme-E Ethernet Partition"),
     PVID(BROADCOM_VENDOR_ID, BCM57412_NPAR2,
 	"Broadcom BCM57412 NetXtreme-E Ethernet Partition"),
     PVID(BROADCOM_VENDOR_ID, BCM57414,
 	"Broadcom BCM57414 NetXtreme-E 10Gb/25Gb Ethernet"),
     PVID(BROADCOM_VENDOR_ID, BCM57414_NPAR1,
 	"Broadcom BCM57414 NetXtreme-E Ethernet Partition"),
     PVID(BROADCOM_VENDOR_ID, BCM57414_NPAR2,
 	"Broadcom BCM57414 NetXtreme-E Ethernet Partition"),
     PVID(BROADCOM_VENDOR_ID, BCM57416,
 	"Broadcom BCM57416 NetXtreme-E 10GBase-T Ethernet"),
     PVID(BROADCOM_VENDOR_ID, BCM57416_NPAR1,
 	"Broadcom BCM57416 NetXtreme-E Ethernet Partition"),
     PVID(BROADCOM_VENDOR_ID, BCM57416_NPAR2,
 	"Broadcom BCM57416 NetXtreme-E Ethernet Partition"),
     PVID(BROADCOM_VENDOR_ID, BCM57416_SFP,
 	"Broadcom BCM57416 NetXtreme-E 10Gb Ethernet"),
     PVID(BROADCOM_VENDOR_ID, BCM57417,
 	"Broadcom BCM57417 NetXtreme-E 10GBase-T Ethernet"),
     PVID(BROADCOM_VENDOR_ID, BCM57417_NPAR1,
 	"Broadcom BCM57417 NetXtreme-E Ethernet Partition"),
     PVID(BROADCOM_VENDOR_ID, BCM57417_NPAR2,
 	"Broadcom BCM57417 NetXtreme-E Ethernet Partition"),
     PVID(BROADCOM_VENDOR_ID, BCM57417_SFP,
 	"Broadcom BCM57417 NetXtreme-E 10Gb/25Gb Ethernet"),
     PVID(BROADCOM_VENDOR_ID, BCM57454,
 	"Broadcom BCM57454 NetXtreme-E 10Gb/25Gb/40Gb/50Gb/100Gb Ethernet"),
     PVID(BROADCOM_VENDOR_ID, BCM58700,
 	"Broadcom BCM58700 Nitro 1Gb/2.5Gb/10Gb Ethernet"),
     PVID(BROADCOM_VENDOR_ID, NETXTREME_C_VF1,
 	"Broadcom NetXtreme-C Ethernet Virtual Function"),
     PVID(BROADCOM_VENDOR_ID, NETXTREME_C_VF2,
 	"Broadcom NetXtreme-C Ethernet Virtual Function"),
     PVID(BROADCOM_VENDOR_ID, NETXTREME_C_VF3,
 	"Broadcom NetXtreme-C Ethernet Virtual Function"),
     PVID(BROADCOM_VENDOR_ID, NETXTREME_E_VF1,
 	"Broadcom NetXtreme-E Ethernet Virtual Function"),
     PVID(BROADCOM_VENDOR_ID, NETXTREME_E_VF2,
 	"Broadcom NetXtreme-E Ethernet Virtual Function"),
     PVID(BROADCOM_VENDOR_ID, NETXTREME_E_VF3,
 	"Broadcom NetXtreme-E Ethernet Virtual Function"),
     /* required last entry */
 
     PVID_END
 };
 
 /*
  * Function prototypes
  */
 
 static void *bnxt_register(device_t dev);
 
 /* Soft queue setup and teardown */
 static int bnxt_tx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs,
     uint64_t *paddrs, int ntxqs, int ntxqsets);
 static int bnxt_rx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs,
     uint64_t *paddrs, int nrxqs, int nrxqsets);
 static void bnxt_queues_free(if_ctx_t ctx);
 
 /* Device setup and teardown */
 static int bnxt_attach_pre(if_ctx_t ctx);
 static int bnxt_attach_post(if_ctx_t ctx);
 static int bnxt_detach(if_ctx_t ctx);
 
 /* Device configuration */
 static void bnxt_init(if_ctx_t ctx);
 static void bnxt_stop(if_ctx_t ctx);
 static void bnxt_multi_set(if_ctx_t ctx);
 static int bnxt_mtu_set(if_ctx_t ctx, uint32_t mtu);
 static void bnxt_media_status(if_ctx_t ctx, struct ifmediareq * ifmr);
 static int bnxt_media_change(if_ctx_t ctx);
 static int bnxt_promisc_set(if_ctx_t ctx, int flags);
 static uint64_t	bnxt_get_counter(if_ctx_t, ift_counter);
 static void bnxt_update_admin_status(if_ctx_t ctx);
 
 /* Interrupt enable / disable */
 static void bnxt_intr_enable(if_ctx_t ctx);
 static int bnxt_queue_intr_enable(if_ctx_t ctx, uint16_t qid);
 static void bnxt_disable_intr(if_ctx_t ctx);
 static int bnxt_msix_intr_assign(if_ctx_t ctx, int msix);
 
 /* vlan support */
 static void bnxt_vlan_register(if_ctx_t ctx, uint16_t vtag);
 static void bnxt_vlan_unregister(if_ctx_t ctx, uint16_t vtag);
 
 /* ioctl */
 static int bnxt_priv_ioctl(if_ctx_t ctx, u_long command, caddr_t data);
 
 /* Internal support functions */
 static int bnxt_probe_phy(struct bnxt_softc *softc);
 static void bnxt_add_media_types(struct bnxt_softc *softc);
 static int bnxt_pci_mapping(struct bnxt_softc *softc);
 static void bnxt_pci_mapping_free(struct bnxt_softc *softc);
 static int bnxt_update_link(struct bnxt_softc *softc, bool chng_link_state);
 static int bnxt_handle_def_cp(void *arg);
 static int bnxt_handle_rx_cp(void *arg);
 static void bnxt_clear_ids(struct bnxt_softc *softc);
 static void inline bnxt_do_enable_intr(struct bnxt_cp_ring *cpr);
 static void inline bnxt_do_disable_intr(struct bnxt_cp_ring *cpr);
 static void bnxt_mark_cpr_invalid(struct bnxt_cp_ring *cpr);
 static void bnxt_def_cp_task(void *context);
 static void bnxt_handle_async_event(struct bnxt_softc *softc,
     struct cmpl_base *cmpl);
 static uint8_t get_phy_type(struct bnxt_softc *softc);
 static uint64_t bnxt_get_baudrate(struct bnxt_link_info *link);
 
 /*
  * Device Interface Declaration
  */
 
 static device_method_t bnxt_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_register, bnxt_register),
 	DEVMETHOD(device_probe, iflib_device_probe),
 	DEVMETHOD(device_attach, iflib_device_attach),
 	DEVMETHOD(device_detach, iflib_device_detach),
 	DEVMETHOD(device_shutdown, iflib_device_shutdown),
 	DEVMETHOD(device_suspend, iflib_device_suspend),
 	DEVMETHOD(device_resume, iflib_device_resume),
 	DEVMETHOD_END
 };
 
 static driver_t bnxt_driver = {
 	"bnxt", bnxt_methods, sizeof(struct bnxt_softc),
 };
 
 devclass_t bnxt_devclass;
 DRIVER_MODULE(bnxt, pci, bnxt_driver, bnxt_devclass, 0, 0);
 
 MODULE_DEPEND(bnxt, pci, 1, 1, 1);
 MODULE_DEPEND(bnxt, ether, 1, 1, 1);
 MODULE_DEPEND(bnxt, iflib, 1, 1, 1);
 
+IFLIB_PNP_INFO(pci, bnxt, bnxt_vendor_info_array);
+
 static device_method_t bnxt_iflib_methods[] = {
 	DEVMETHOD(ifdi_tx_queues_alloc, bnxt_tx_queues_alloc),
 	DEVMETHOD(ifdi_rx_queues_alloc, bnxt_rx_queues_alloc),
 	DEVMETHOD(ifdi_queues_free, bnxt_queues_free),
 
 	DEVMETHOD(ifdi_attach_pre, bnxt_attach_pre),
 	DEVMETHOD(ifdi_attach_post, bnxt_attach_post),
 	DEVMETHOD(ifdi_detach, bnxt_detach),
 
 	DEVMETHOD(ifdi_init, bnxt_init),
 	DEVMETHOD(ifdi_stop, bnxt_stop),
 	DEVMETHOD(ifdi_multi_set, bnxt_multi_set),
 	DEVMETHOD(ifdi_mtu_set, bnxt_mtu_set),
 	DEVMETHOD(ifdi_media_status, bnxt_media_status),
 	DEVMETHOD(ifdi_media_change, bnxt_media_change),
 	DEVMETHOD(ifdi_promisc_set, bnxt_promisc_set),
 	DEVMETHOD(ifdi_get_counter, bnxt_get_counter),
 	DEVMETHOD(ifdi_update_admin_status, bnxt_update_admin_status),
 
 	DEVMETHOD(ifdi_intr_enable, bnxt_intr_enable),
-	DEVMETHOD(ifdi_queue_intr_enable, bnxt_queue_intr_enable),
+	DEVMETHOD(ifdi_tx_queue_intr_enable, bnxt_queue_intr_enable),
+	DEVMETHOD(ifdi_rx_queue_intr_enable, bnxt_queue_intr_enable),
 	DEVMETHOD(ifdi_intr_disable, bnxt_disable_intr),
 	DEVMETHOD(ifdi_msix_intr_assign, bnxt_msix_intr_assign),
 
 	DEVMETHOD(ifdi_vlan_register, bnxt_vlan_register),
 	DEVMETHOD(ifdi_vlan_unregister, bnxt_vlan_unregister),
 
 	DEVMETHOD(ifdi_priv_ioctl, bnxt_priv_ioctl),
 
 	DEVMETHOD_END
 };
 
 static driver_t bnxt_iflib_driver = {
 	"bnxt", bnxt_iflib_methods, sizeof(struct bnxt_softc)
 };
 
 /*
  * iflib shared context
  */
 
 char bnxt_driver_version[] = "FreeBSD base";
 extern struct if_txrx bnxt_txrx;
 static struct if_shared_ctx bnxt_sctx_init = {
 	.isc_magic = IFLIB_MAGIC,
-	.isc_txrx = &bnxt_txrx,
 	.isc_driver = &bnxt_iflib_driver,
 	.isc_nfl = 2,				// Number of Free Lists
-	.isc_flags = IFLIB_HAS_RXCQ | IFLIB_HAS_TXCQ,
+	.isc_flags = IFLIB_HAS_RXCQ | IFLIB_HAS_TXCQ | IFLIB_NEED_ETHER_PAD,
 	.isc_q_align = PAGE_SIZE,
 	.isc_tx_maxsize = BNXT_TSO_SIZE,
 	.isc_tx_maxsegsize = BNXT_TSO_SIZE,
 	.isc_rx_maxsize = BNXT_TSO_SIZE,
 	.isc_rx_maxsegsize = BNXT_TSO_SIZE,
 
 	// Only use a single segment to avoid page size constraints
 	.isc_rx_nsegments = 1,
 	.isc_ntxqs = 2,
 	.isc_nrxqs = 3,
 	.isc_nrxd_min = {16, 16, 16},
 	.isc_nrxd_default = {PAGE_SIZE / sizeof(struct cmpl_base) * 8,
 	    PAGE_SIZE / sizeof(struct rx_prod_pkt_bd),
 	    PAGE_SIZE / sizeof(struct rx_prod_pkt_bd)},
 	.isc_nrxd_max = {INT32_MAX, INT32_MAX, INT32_MAX},
 	.isc_ntxd_min = {16, 16, 16},
 	.isc_ntxd_default = {PAGE_SIZE / sizeof(struct cmpl_base) * 2,
 	    PAGE_SIZE / sizeof(struct tx_bd_short)},
 	.isc_ntxd_max = {INT32_MAX, INT32_MAX, INT32_MAX},
 
 	.isc_admin_intrcnt = 1,
 	.isc_vendor_info = bnxt_vendor_info_array,
 	.isc_driver_version = bnxt_driver_version,
 };
 
 if_shared_ctx_t bnxt_sctx = &bnxt_sctx_init;
 
 /*
  * Device Methods
  */
 
 static void *
 bnxt_register(device_t dev)
 {
 	return bnxt_sctx;
 }
 
 /*
  * Device Dependent Configuration Functions
 */
 
 /* Soft queue setup and teardown */
 static int
 bnxt_tx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs,
     uint64_t *paddrs, int ntxqs, int ntxqsets)
 {
 	struct bnxt_softc *softc;
 	int i;
 	int rc;
 
 	softc = iflib_get_softc(ctx);
 
 	softc->tx_cp_rings = malloc(sizeof(struct bnxt_cp_ring) * ntxqsets,
 	    M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (!softc->tx_cp_rings) {
 		device_printf(iflib_get_dev(ctx),
 		    "unable to allocate TX completion rings\n");
 		rc = ENOMEM;
 		goto cp_alloc_fail;
 	}
 	softc->tx_rings = malloc(sizeof(struct bnxt_ring) * ntxqsets,
 	    M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (!softc->tx_rings) {
 		device_printf(iflib_get_dev(ctx),
 		    "unable to allocate TX rings\n");
 		rc = ENOMEM;
 		goto ring_alloc_fail;
 	}
 	rc = iflib_dma_alloc(ctx, sizeof(struct ctx_hw_stats) * ntxqsets,
 	    &softc->tx_stats, 0);
 	if (rc)
 		goto dma_alloc_fail;
 	bus_dmamap_sync(softc->tx_stats.idi_tag, softc->tx_stats.idi_map,
 	    BUS_DMASYNC_PREREAD);
 
 	for (i = 0; i < ntxqsets; i++) {
 		/* Set up the completion ring */
 		softc->tx_cp_rings[i].stats_ctx_id = HWRM_NA_SIGNATURE;
 		softc->tx_cp_rings[i].ring.phys_id =
 		    (uint16_t)HWRM_NA_SIGNATURE;
 		softc->tx_cp_rings[i].ring.softc = softc;
 		softc->tx_cp_rings[i].ring.id =
 		    (softc->scctx->isc_nrxqsets * 2) + 1 + i;
 		softc->tx_cp_rings[i].ring.doorbell =
 		    softc->tx_cp_rings[i].ring.id * 0x80;
 		softc->tx_cp_rings[i].ring.ring_size =
 		    softc->scctx->isc_ntxd[0];
 		softc->tx_cp_rings[i].ring.vaddr = vaddrs[i * ntxqs];
 		softc->tx_cp_rings[i].ring.paddr = paddrs[i * ntxqs];
 
 		/* Set up the TX ring */
 		softc->tx_rings[i].phys_id = (uint16_t)HWRM_NA_SIGNATURE;
 		softc->tx_rings[i].softc = softc;
 		softc->tx_rings[i].id =
 		    (softc->scctx->isc_nrxqsets * 2) + 1 + i;
 		softc->tx_rings[i].doorbell = softc->tx_rings[i].id * 0x80;
 		softc->tx_rings[i].ring_size = softc->scctx->isc_ntxd[1];
 		softc->tx_rings[i].vaddr = vaddrs[i * ntxqs + 1];
 		softc->tx_rings[i].paddr = paddrs[i * ntxqs + 1];
 
 		bnxt_create_tx_sysctls(softc, i);
 	}
 
 	softc->ntxqsets = ntxqsets;
 	return rc;
 
 dma_alloc_fail:
 	free(softc->tx_rings, M_DEVBUF);
 ring_alloc_fail:
 	free(softc->tx_cp_rings, M_DEVBUF);
 cp_alloc_fail:
 	return rc;
 }
 
 static void
 bnxt_queues_free(if_ctx_t ctx)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 
 	// Free TX queues
 	iflib_dma_free(&softc->tx_stats);
 	free(softc->tx_rings, M_DEVBUF);
 	softc->tx_rings = NULL;
 	free(softc->tx_cp_rings, M_DEVBUF);
 	softc->tx_cp_rings = NULL;
 	softc->ntxqsets = 0;
 
 	// Free RX queues
 	iflib_dma_free(&softc->rx_stats);
 	free(softc->grp_info, M_DEVBUF);
 	free(softc->ag_rings, M_DEVBUF);
 	free(softc->rx_rings, M_DEVBUF);
 	free(softc->rx_cp_rings, M_DEVBUF);
 }
 
 static int
 bnxt_rx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs,
     uint64_t *paddrs, int nrxqs, int nrxqsets)
 {
 	struct bnxt_softc *softc;
 	int i;
 	int rc;
 
 	softc = iflib_get_softc(ctx);
 
 	softc->rx_cp_rings = malloc(sizeof(struct bnxt_cp_ring) * nrxqsets,
 	    M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (!softc->rx_cp_rings) {
 		device_printf(iflib_get_dev(ctx),
 		    "unable to allocate RX completion rings\n");
 		rc = ENOMEM;
 		goto cp_alloc_fail;
 	}
 	softc->rx_rings = malloc(sizeof(struct bnxt_ring) * nrxqsets,
 	    M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (!softc->rx_rings) {
 		device_printf(iflib_get_dev(ctx),
 		    "unable to allocate RX rings\n");
 		rc = ENOMEM;
 		goto ring_alloc_fail;
 	}
 	softc->ag_rings = malloc(sizeof(struct bnxt_ring) * nrxqsets,
 	    M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (!softc->ag_rings) {
 		device_printf(iflib_get_dev(ctx),
 		    "unable to allocate aggregation rings\n");
 		rc = ENOMEM;
 		goto ag_alloc_fail;
 	}
 	softc->grp_info = malloc(sizeof(struct bnxt_grp_info) * nrxqsets,
 	    M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (!softc->grp_info) {
 		device_printf(iflib_get_dev(ctx),
 		    "unable to allocate ring groups\n");
 		rc = ENOMEM;
 		goto grp_alloc_fail;
 	}
 
 	rc = iflib_dma_alloc(ctx, sizeof(struct ctx_hw_stats) * nrxqsets,
 	    &softc->rx_stats, 0);
 	if (rc)
 		goto hw_stats_alloc_fail;
 	bus_dmamap_sync(softc->rx_stats.idi_tag, softc->rx_stats.idi_map,
 	    BUS_DMASYNC_PREREAD);
 
 	for (i = 0; i < nrxqsets; i++) {
 		/* Allocation the completion ring */
 		softc->rx_cp_rings[i].stats_ctx_id = HWRM_NA_SIGNATURE;
 		softc->rx_cp_rings[i].ring.phys_id =
 		    (uint16_t)HWRM_NA_SIGNATURE;
 		softc->rx_cp_rings[i].ring.softc = softc;
 		softc->rx_cp_rings[i].ring.id = i + 1;
 		softc->rx_cp_rings[i].ring.doorbell =
 		    softc->rx_cp_rings[i].ring.id * 0x80;
 		/*
 		 * If this ring overflows, RX stops working.
 		 */
 		softc->rx_cp_rings[i].ring.ring_size =
 		    softc->scctx->isc_nrxd[0];
 		softc->rx_cp_rings[i].ring.vaddr = vaddrs[i * nrxqs];
 		softc->rx_cp_rings[i].ring.paddr = paddrs[i * nrxqs];
 
 		/* Allocate the RX ring */
 		softc->rx_rings[i].phys_id = (uint16_t)HWRM_NA_SIGNATURE;
 		softc->rx_rings[i].softc = softc;
 		softc->rx_rings[i].id = i + 1;
 		softc->rx_rings[i].doorbell = softc->rx_rings[i].id * 0x80;
 		softc->rx_rings[i].ring_size = softc->scctx->isc_nrxd[1];
 		softc->rx_rings[i].vaddr = vaddrs[i * nrxqs + 1];
 		softc->rx_rings[i].paddr = paddrs[i * nrxqs + 1];
 
+		/* Allocate the per-ring TPA start buffer */
+		softc->rx_rings[i].tpa_start = malloc(sizeof(struct bnxt_full_tpa_start) *
+		    (RX_TPA_START_CMPL_AGG_ID_MASK >> RX_TPA_START_CMPL_AGG_ID_SFT),
+		    M_DEVBUF, M_NOWAIT | M_ZERO);
+		if (softc->rx_rings[i].tpa_start == NULL) {
+			rc = ENOMEM;
+			device_printf(softc->dev,
+			    "Unable to allocate space for TPA\n");
+			goto tpa_alloc_fail;
+		}
+
 		/* Allocate the AG ring */
 		softc->ag_rings[i].phys_id = (uint16_t)HWRM_NA_SIGNATURE;
 		softc->ag_rings[i].softc = softc;
 		softc->ag_rings[i].id = nrxqsets + i + 1;
 		softc->ag_rings[i].doorbell = softc->ag_rings[i].id * 0x80;
 		softc->ag_rings[i].ring_size = softc->scctx->isc_nrxd[2];
 		softc->ag_rings[i].vaddr = vaddrs[i * nrxqs + 2];
 		softc->ag_rings[i].paddr = paddrs[i * nrxqs + 2];
 
 		/* Allocate the ring group */
 		softc->grp_info[i].grp_id = (uint16_t)HWRM_NA_SIGNATURE;
 		softc->grp_info[i].stats_ctx =
 		    softc->rx_cp_rings[i].stats_ctx_id;
 		softc->grp_info[i].rx_ring_id = softc->rx_rings[i].phys_id;
 		softc->grp_info[i].ag_ring_id = softc->ag_rings[i].phys_id;
 		softc->grp_info[i].cp_ring_id =
 		    softc->rx_cp_rings[i].ring.phys_id;
 
 		bnxt_create_rx_sysctls(softc, i);
 	}
 
 	/* And finally, the VNIC */
 	softc->vnic_info.id = (uint16_t)HWRM_NA_SIGNATURE;
 	softc->vnic_info.flow_id = (uint16_t)HWRM_NA_SIGNATURE;
 	softc->vnic_info.filter_id = -1;
 	softc->vnic_info.def_ring_grp = (uint16_t)HWRM_NA_SIGNATURE;
 	softc->vnic_info.cos_rule = (uint16_t)HWRM_NA_SIGNATURE;
 	softc->vnic_info.lb_rule = (uint16_t)HWRM_NA_SIGNATURE;
 	softc->vnic_info.rx_mask = HWRM_CFA_L2_SET_RX_MASK_INPUT_MASK_BCAST;
 	softc->vnic_info.mc_list_count = 0;
 	softc->vnic_info.flags = BNXT_VNIC_FLAG_DEFAULT;
 	rc = iflib_dma_alloc(ctx, BNXT_MAX_MC_ADDRS * ETHER_ADDR_LEN,
 	    &softc->vnic_info.mc_list, 0);
 	if (rc)
 		goto mc_list_alloc_fail;
 
 	/* The VNIC RSS Hash Key */
 	rc = iflib_dma_alloc(ctx, HW_HASH_KEY_SIZE,
 	    &softc->vnic_info.rss_hash_key_tbl, 0);
 	if (rc)
 		goto rss_hash_alloc_fail;
 	bus_dmamap_sync(softc->vnic_info.rss_hash_key_tbl.idi_tag,
 	    softc->vnic_info.rss_hash_key_tbl.idi_map,
 	    BUS_DMASYNC_PREWRITE);
 	memcpy(softc->vnic_info.rss_hash_key_tbl.idi_vaddr,
 	    softc->vnic_info.rss_hash_key, HW_HASH_KEY_SIZE);
 
 	/* Allocate the RSS tables */
 	rc = iflib_dma_alloc(ctx, HW_HASH_INDEX_SIZE * sizeof(uint16_t),
 	    &softc->vnic_info.rss_grp_tbl, 0);
 	if (rc)
 		goto rss_grp_alloc_fail;
 	bus_dmamap_sync(softc->vnic_info.rss_grp_tbl.idi_tag,
 	    softc->vnic_info.rss_grp_tbl.idi_map,
 	    BUS_DMASYNC_PREWRITE);
 	memset(softc->vnic_info.rss_grp_tbl.idi_vaddr, 0xff,
 	    softc->vnic_info.rss_grp_tbl.idi_size);
 
 	softc->nrxqsets = nrxqsets;
 	return rc;
 
 rss_grp_alloc_fail:
 	iflib_dma_free(&softc->vnic_info.rss_hash_key_tbl);
 rss_hash_alloc_fail:
 	iflib_dma_free(&softc->vnic_info.mc_list);
+tpa_alloc_fail:
 mc_list_alloc_fail:
+	for (i = i - 1; i >= 0; i--)
+		free(softc->rx_rings[i].tpa_start, M_DEVBUF);
 	iflib_dma_free(&softc->rx_stats);
 hw_stats_alloc_fail:
 	free(softc->grp_info, M_DEVBUF);
 grp_alloc_fail:
 	free(softc->ag_rings, M_DEVBUF);
 ag_alloc_fail:
 	free(softc->rx_rings, M_DEVBUF);
 ring_alloc_fail:
 	free(softc->rx_cp_rings, M_DEVBUF);
 cp_alloc_fail:
 	return rc;
 }
 
 /* Device setup and teardown */
 static int
 bnxt_attach_pre(if_ctx_t ctx)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	if_softc_ctx_t scctx;
 	int rc = 0;
 
 	softc->ctx = ctx;
 	softc->dev = iflib_get_dev(ctx);
 	softc->media = iflib_get_media(ctx);
 	softc->scctx = iflib_get_softc_ctx(ctx);
 	softc->sctx = iflib_get_sctx(ctx);
 	scctx = softc->scctx;
 
 	/* TODO: Better way of detecting NPAR/VF is needed */
 	switch (softc->sctx->isc_vendor_info->pvi_device_id) {
 	case BCM57402_NPAR:
 	case BCM57404_NPAR:
 	case BCM57406_NPAR:
 	case BCM57407_NPAR:
 	case BCM57412_NPAR1:
 	case BCM57412_NPAR2:
 	case BCM57414_NPAR1:
 	case BCM57414_NPAR2:
 	case BCM57416_NPAR1:
 	case BCM57416_NPAR2:
 		softc->flags |= BNXT_FLAG_NPAR;
 		break;
 	case NETXTREME_C_VF1:
 	case NETXTREME_C_VF2:
 	case NETXTREME_C_VF3:
 	case NETXTREME_E_VF1:
 	case NETXTREME_E_VF2:
 	case NETXTREME_E_VF3:
 		softc->flags |= BNXT_FLAG_VF;
 		break;
 	}
 
 	pci_enable_busmaster(softc->dev);
 
 	if (bnxt_pci_mapping(softc))
 		return (ENXIO);
 
 	/* HWRM setup/init */
 	BNXT_HWRM_LOCK_INIT(softc, device_get_nameunit(softc->dev));
 	rc = bnxt_alloc_hwrm_dma_mem(softc);
 	if (rc)
 		goto dma_fail;
 
-	/* Allocate the TPA start buffer */
-	softc->tpa_start = malloc(sizeof(struct bnxt_full_tpa_start) *
-	    (RX_TPA_START_CMPL_AGG_ID_MASK >> RX_TPA_START_CMPL_AGG_ID_SFT),
-	    M_DEVBUF, M_NOWAIT | M_ZERO);
-	if (softc->tpa_start == NULL) {
-		rc = ENOMEM;
-		device_printf(softc->dev,
-		    "Unable to allocate space for TPA\n");
-		goto tpa_failed;
-	}
 
 	/* Get firmware version and compare with driver */
 	softc->ver_info = malloc(sizeof(struct bnxt_ver_info),
 	    M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (softc->ver_info == NULL) {
 		rc = ENOMEM;
 		device_printf(softc->dev,
 		    "Unable to allocate space for version info\n");
 		goto ver_alloc_fail;
 	}
 	/* Default minimum required HWRM version */
 	softc->ver_info->hwrm_min_major = 1;
 	softc->ver_info->hwrm_min_minor = 2;
 	softc->ver_info->hwrm_min_update = 2;
 
 	rc = bnxt_hwrm_ver_get(softc);
 	if (rc) {
 		device_printf(softc->dev, "attach: hwrm ver get failed\n");
 		goto ver_fail;
 	}
 
 	/* Get NVRAM info */
 	softc->nvm_info = malloc(sizeof(struct bnxt_nvram_info),
 	    M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (softc->nvm_info == NULL) {
 		rc = ENOMEM;
 		device_printf(softc->dev,
 		    "Unable to allocate space for NVRAM info\n");
 		goto nvm_alloc_fail;
 	}
 	rc = bnxt_hwrm_nvm_get_dev_info(softc, &softc->nvm_info->mfg_id,
 	    &softc->nvm_info->device_id, &softc->nvm_info->sector_size,
 	    &softc->nvm_info->size, &softc->nvm_info->reserved_size,
 	    &softc->nvm_info->available_size);
 
 	/* Register the driver with the FW */
 	rc = bnxt_hwrm_func_drv_rgtr(softc);
 	if (rc) {
 		device_printf(softc->dev, "attach: hwrm drv rgtr failed\n");
 		goto drv_rgtr_fail;
 	}
 
 	/* Get the HW capabilities */
 	rc = bnxt_hwrm_func_qcaps(softc);
 	if (rc)
 		goto failed;
 	iflib_set_mac(ctx, softc->func.mac_addr);
 
+	scctx->isc_txrx = &bnxt_txrx;
+	scctx->isc_tx_csum_flags = (CSUM_IP | CSUM_TCP | CSUM_UDP |
+	    CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_TSO);
+	scctx->isc_capenable =
+	    /* These are translated to hwassist bits */
+	    IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6 | IFCAP_TSO4 | IFCAP_TSO6 |
+	    /* These are checked by iflib */
+	    IFCAP_LRO | IFCAP_VLAN_HWFILTER |
+	    /* These are part of the iflib mask */
+	    IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_VLAN_MTU |
+	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO |
+	    /* These likely get lost... */
+	    IFCAP_VLAN_HWCSUM | IFCAP_JUMBO_MTU;
+
 	/* Get the queue config */
 	rc = bnxt_hwrm_queue_qportcfg(softc);
 	if (rc) {
 		device_printf(softc->dev, "attach: hwrm qportcfg failed\n");
 		goto failed;
 	}
 
 	/* Now perform a function reset */
 	rc = bnxt_hwrm_func_reset(softc);
 	bnxt_clear_ids(softc);
 	if (rc)
 		goto failed;
 
 	/* Now set up iflib sc */
 	scctx->isc_tx_nsegments = 31,
 	scctx->isc_tx_tso_segments_max = 31;
 	scctx->isc_tx_tso_size_max = BNXT_TSO_SIZE;
 	scctx->isc_tx_tso_segsize_max = BNXT_TSO_SIZE;
 	scctx->isc_vectors = softc->func.max_cp_rings;
+	scctx->isc_min_frame_size = BNXT_MIN_FRAME_SIZE;
+	scctx->isc_txrx = &bnxt_txrx;
+
 	if (scctx->isc_nrxd[0] <
 	    ((scctx->isc_nrxd[1] * 4) + scctx->isc_nrxd[2]))
 		device_printf(softc->dev,
 		    "WARNING: nrxd0 (%d) should be at least 4 * nrxd1 (%d) + nrxd2 (%d).  Driver may be unstable\n",
 		    scctx->isc_nrxd[0], scctx->isc_nrxd[1], scctx->isc_nrxd[2]);
 	if (scctx->isc_ntxd[0] < scctx->isc_ntxd[1] * 2)
 		device_printf(softc->dev,
 		    "WARNING: ntxd0 (%d) should be at least 2 * ntxd1 (%d).  Driver may be unstable\n",
 		    scctx->isc_ntxd[0], scctx->isc_ntxd[1]);
 	scctx->isc_txqsizes[0] = sizeof(struct cmpl_base) * scctx->isc_ntxd[0];
 	scctx->isc_txqsizes[1] = sizeof(struct tx_bd_short) *
 	    scctx->isc_ntxd[1];
 	scctx->isc_rxqsizes[0] = sizeof(struct cmpl_base) * scctx->isc_nrxd[0];
 	scctx->isc_rxqsizes[1] = sizeof(struct rx_prod_pkt_bd) *
 	    scctx->isc_nrxd[1];
 	scctx->isc_rxqsizes[2] = sizeof(struct rx_prod_pkt_bd) *
 	    scctx->isc_nrxd[2];
-	scctx->isc_max_rxqsets = min(pci_msix_count(softc->dev)-1,
+	scctx->isc_nrxqsets_max = min(pci_msix_count(softc->dev)-1,
 	    softc->func.max_cp_rings - 1);
-	scctx->isc_max_rxqsets = min(scctx->isc_max_rxqsets,
+	scctx->isc_nrxqsets_max = min(scctx->isc_nrxqsets_max,
 	    softc->func.max_rx_rings);
-	scctx->isc_max_txqsets = min(softc->func.max_rx_rings,
-	    softc->func.max_cp_rings - scctx->isc_max_rxqsets - 1);
+	scctx->isc_ntxqsets_max = min(softc->func.max_rx_rings,
+	    softc->func.max_cp_rings - scctx->isc_nrxqsets_max - 1);
 	scctx->isc_rss_table_size = HW_HASH_INDEX_SIZE;
 	scctx->isc_rss_table_mask = scctx->isc_rss_table_size - 1;
 
 	/* iflib will map and release this bar */
 	scctx->isc_msix_bar = pci_msix_table_bar(softc->dev);
 
 	/* Allocate the default completion ring */
 	softc->def_cp_ring.stats_ctx_id = HWRM_NA_SIGNATURE;
 	softc->def_cp_ring.ring.phys_id = (uint16_t)HWRM_NA_SIGNATURE;
 	softc->def_cp_ring.ring.softc = softc;
 	softc->def_cp_ring.ring.id = 0;
 	softc->def_cp_ring.ring.doorbell = softc->def_cp_ring.ring.id * 0x80;
 	softc->def_cp_ring.ring.ring_size = PAGE_SIZE /
 	    sizeof(struct cmpl_base);
 	rc = iflib_dma_alloc(ctx,
 	    sizeof(struct cmpl_base) * softc->def_cp_ring.ring.ring_size,
 	    &softc->def_cp_ring_mem, 0);
 	softc->def_cp_ring.ring.vaddr = softc->def_cp_ring_mem.idi_vaddr;
 	softc->def_cp_ring.ring.paddr = softc->def_cp_ring_mem.idi_paddr;
 	iflib_config_gtask_init(ctx, &softc->def_cp_task, bnxt_def_cp_task,
 	    "dflt_cp");
 
 	rc = bnxt_init_sysctl_ctx(softc);
 	if (rc)
 		goto init_sysctl_failed;
 	rc = bnxt_create_nvram_sysctls(softc->nvm_info);
 	if (rc)
 		goto failed;
 
 	arc4rand(softc->vnic_info.rss_hash_key, HW_HASH_KEY_SIZE, 0);
 	softc->vnic_info.rss_hash_type =
 	    HWRM_VNIC_RSS_CFG_INPUT_HASH_TYPE_IPV4 |
 	    HWRM_VNIC_RSS_CFG_INPUT_HASH_TYPE_TCP_IPV4 |
 	    HWRM_VNIC_RSS_CFG_INPUT_HASH_TYPE_UDP_IPV4 |
 	    HWRM_VNIC_RSS_CFG_INPUT_HASH_TYPE_IPV6 |
 	    HWRM_VNIC_RSS_CFG_INPUT_HASH_TYPE_TCP_IPV6 |
 	    HWRM_VNIC_RSS_CFG_INPUT_HASH_TYPE_UDP_IPV6;
 	rc = bnxt_create_config_sysctls_pre(softc);
 	if (rc)
 		goto failed;
 
 	/* Initialize the vlan list */
 	SLIST_INIT(&softc->vnic_info.vlan_tags);
 	softc->vnic_info.vlan_tag_list.idi_vaddr = NULL;
 
 	return (rc);
 
 failed:
 	bnxt_free_sysctl_ctx(softc);
 init_sysctl_failed:
 	bnxt_hwrm_func_drv_unrgtr(softc, false);
 drv_rgtr_fail:
 	free(softc->nvm_info, M_DEVBUF);
 nvm_alloc_fail:
 ver_fail:
 	free(softc->ver_info, M_DEVBUF);
 ver_alloc_fail:
-	free(softc->tpa_start, M_DEVBUF);
-tpa_failed:
 	bnxt_free_hwrm_dma_mem(softc);
 dma_fail:
 	BNXT_HWRM_LOCK_DESTROY(softc);
 	bnxt_pci_mapping_free(softc);
 	pci_disable_busmaster(softc->dev);
 	return (rc);
 }
 
 static int
 bnxt_attach_post(if_ctx_t ctx)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	if_t ifp = iflib_get_ifp(ctx);
-	int capabilities, enabling;
 	int rc;
 
 	bnxt_create_config_sysctls_post(softc);
 
 	/* Update link state etc... */
 	rc = bnxt_probe_phy(softc);
 	if (rc)
 		goto failed;
 
 	/* Needs to be done after probing the phy */
 	bnxt_create_ver_sysctls(softc);
 	bnxt_add_media_types(softc);
 	ifmedia_set(softc->media, IFM_ETHER | IFM_AUTO);
 
-	if_sethwassist(ifp, (CSUM_TCP | CSUM_UDP | CSUM_TCP_IPV6 |
-	    CSUM_UDP_IPV6 | CSUM_TSO));
-
-	capabilities =
-	    /* These are translated to hwassit bits */
-	    IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6 | IFCAP_TSO4 | IFCAP_TSO6 |
-	    /* These are checked by iflib */
-	    IFCAP_LRO | IFCAP_VLAN_HWFILTER |
-	    /* These are part of the iflib mask */
-	    IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_VLAN_MTU |
-	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO |
-	    /* These likely get lost... */
-	    IFCAP_VLAN_HWCSUM | IFCAP_JUMBO_MTU;
-
-	if_setcapabilities(ifp, capabilities);
-
-	enabling = capabilities;
-
-	if_setcapenable(ifp, enabling);
-
 	softc->scctx->isc_max_frame_size = ifp->if_mtu + ETHER_HDR_LEN +
 	    ETHER_CRC_LEN;
 
 failed:
 	return rc;
 }
 
 static int
 bnxt_detach(if_ctx_t ctx)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	struct bnxt_vlan_tag *tag;
 	struct bnxt_vlan_tag *tmp;
 	int i;
 
 	bnxt_do_disable_intr(&softc->def_cp_ring);
 	bnxt_free_sysctl_ctx(softc);
 	bnxt_hwrm_func_reset(softc);
 	bnxt_clear_ids(softc);
 	iflib_irq_free(ctx, &softc->def_cp_ring.irq);
 	iflib_config_gtask_deinit(&softc->def_cp_task);
 	/* We need to free() these here... */
 	for (i = softc->nrxqsets-1; i>=0; i--) {
 		iflib_irq_free(ctx, &softc->rx_cp_rings[i].irq);
 	}
 	iflib_dma_free(&softc->vnic_info.mc_list);
 	iflib_dma_free(&softc->vnic_info.rss_hash_key_tbl);
 	iflib_dma_free(&softc->vnic_info.rss_grp_tbl);
 	if (softc->vnic_info.vlan_tag_list.idi_vaddr)
 		iflib_dma_free(&softc->vnic_info.vlan_tag_list);
 	SLIST_FOREACH_SAFE(tag, &softc->vnic_info.vlan_tags, next, tmp)
 		free(tag, M_DEVBUF);
 	iflib_dma_free(&softc->def_cp_ring_mem);
-	free(softc->tpa_start, M_DEVBUF);
+	for (i = 0; i < softc->nrxqsets; i++)
+		free(softc->rx_rings[i].tpa_start, M_DEVBUF);
 	free(softc->ver_info, M_DEVBUF);
 	free(softc->nvm_info, M_DEVBUF);
 
 	bnxt_hwrm_func_drv_unrgtr(softc, false);
 	bnxt_free_hwrm_dma_mem(softc);
 	BNXT_HWRM_LOCK_DESTROY(softc);
 
 	pci_disable_busmaster(softc->dev);
 	bnxt_pci_mapping_free(softc);
 
 	return 0;
 }
 
 /* Device configuration */
 static void
 bnxt_init(if_ctx_t ctx)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	struct ifmediareq ifmr;
 	int i, j;
 	int rc;
 
 	rc = bnxt_hwrm_func_reset(softc);
 	if (rc)
 		return;
 	bnxt_clear_ids(softc);
 
 	/* Allocate the default completion ring */
 	softc->def_cp_ring.cons = UINT32_MAX;
 	softc->def_cp_ring.v_bit = 1;
 	bnxt_mark_cpr_invalid(&softc->def_cp_ring);
 	rc = bnxt_hwrm_ring_alloc(softc,
 	    HWRM_RING_ALLOC_INPUT_RING_TYPE_CMPL,
 	    &softc->def_cp_ring.ring,
 	    (uint16_t)HWRM_NA_SIGNATURE,
 	    HWRM_NA_SIGNATURE, true);
 	if (rc)
 		goto fail;
 
 	/* And now set the default CP ring as the async CP ring */
 	rc = bnxt_hwrm_func_cfg(softc);
 	if (rc)
 		goto fail;
 
 	for (i = 0; i < softc->nrxqsets; i++) {
 		/* Allocate the statistics context */
 		rc = bnxt_hwrm_stat_ctx_alloc(softc, &softc->rx_cp_rings[i],
 		    softc->rx_stats.idi_paddr +
 		    (sizeof(struct ctx_hw_stats) * i));
 		if (rc)
 			goto fail;
 
 		/* Allocate the completion ring */
 		softc->rx_cp_rings[i].cons = UINT32_MAX;
 		softc->rx_cp_rings[i].v_bit = 1;
 		softc->rx_cp_rings[i].last_idx = UINT32_MAX;
 		bnxt_mark_cpr_invalid(&softc->rx_cp_rings[i]);
 		rc = bnxt_hwrm_ring_alloc(softc,
 		    HWRM_RING_ALLOC_INPUT_RING_TYPE_CMPL,
 		    &softc->rx_cp_rings[i].ring, (uint16_t)HWRM_NA_SIGNATURE,
 		    HWRM_NA_SIGNATURE, true);
 		if (rc)
 			goto fail;
 
 		/* Allocate the RX ring */
 		rc = bnxt_hwrm_ring_alloc(softc,
 		    HWRM_RING_ALLOC_INPUT_RING_TYPE_RX,
 		    &softc->rx_rings[i], (uint16_t)HWRM_NA_SIGNATURE,
 		    HWRM_NA_SIGNATURE, false);
 		if (rc)
 			goto fail;
 		BNXT_RX_DB(&softc->rx_rings[i], 0);
 		/* TODO: Cumulus+ doesn't need the double doorbell */
 		BNXT_RX_DB(&softc->rx_rings[i], 0);
 
 		/* Allocate the AG ring */
 		rc = bnxt_hwrm_ring_alloc(softc,
 		    HWRM_RING_ALLOC_INPUT_RING_TYPE_RX,
 		    &softc->ag_rings[i], (uint16_t)HWRM_NA_SIGNATURE,
 		    HWRM_NA_SIGNATURE, false);
 		if (rc)
 			goto fail;
 		BNXT_RX_DB(&softc->rx_rings[i], 0);
 		/* TODO: Cumulus+ doesn't need the double doorbell */
 		BNXT_RX_DB(&softc->ag_rings[i], 0);
 
 		/* Allocate the ring group */
 		softc->grp_info[i].stats_ctx =
 		    softc->rx_cp_rings[i].stats_ctx_id;
 		softc->grp_info[i].rx_ring_id = softc->rx_rings[i].phys_id;
 		softc->grp_info[i].ag_ring_id = softc->ag_rings[i].phys_id;
 		softc->grp_info[i].cp_ring_id =
 		    softc->rx_cp_rings[i].ring.phys_id;
 		rc = bnxt_hwrm_ring_grp_alloc(softc, &softc->grp_info[i]);
 		if (rc)
 			goto fail;
 
 	}
 
 	/* Allocate the VNIC RSS context */
 	rc = bnxt_hwrm_vnic_ctx_alloc(softc, &softc->vnic_info.rss_id);
 	if (rc)
 		goto fail;
 
 	/* Allocate the vnic */
 	softc->vnic_info.def_ring_grp = softc->grp_info[0].grp_id;
 	softc->vnic_info.mru = softc->scctx->isc_max_frame_size;
 	rc = bnxt_hwrm_vnic_alloc(softc, &softc->vnic_info);
 	if (rc)
 		goto fail;
 	rc = bnxt_hwrm_vnic_cfg(softc, &softc->vnic_info);
 	if (rc)
 		goto fail;
 	rc = bnxt_hwrm_set_filter(softc, &softc->vnic_info);
 	if (rc)
 		goto fail;
 
 	/* Enable RSS on the VNICs */
 	for (i = 0, j = 0; i < HW_HASH_INDEX_SIZE; i++) {
 		((uint16_t *)
 		    softc->vnic_info.rss_grp_tbl.idi_vaddr)[i] =
 		    htole16(softc->grp_info[j].grp_id);
 		if (++j == softc->nrxqsets)
 			j = 0;
 	}
 
 	rc = bnxt_hwrm_rss_cfg(softc, &softc->vnic_info,
 	    softc->vnic_info.rss_hash_type);
 	if (rc)
 		goto fail;
 
-#ifdef notyet
-	/* Enable LRO/TPA/GRO */
+	/*
+	 * Enable LRO/TPA/GRO
+	 * TBD:
+	 *      Enable / Disable HW_LRO based on
+	 *      ifconfig lro / ifconfig -lro setting
+	 */
 	rc = bnxt_hwrm_vnic_tpa_cfg(softc, &softc->vnic_info,
 	    (if_getcapenable(iflib_get_ifp(ctx)) & IFCAP_LRO) ?
 	    HWRM_VNIC_TPA_CFG_INPUT_FLAGS_TPA : 0);
 	if (rc)
 		goto fail;
-#endif
 
 	for (i = 0; i < softc->ntxqsets; i++) {
 		/* Allocate the statistics context */
 		rc = bnxt_hwrm_stat_ctx_alloc(softc, &softc->tx_cp_rings[i],
 		    softc->tx_stats.idi_paddr +
 		    (sizeof(struct ctx_hw_stats) * i));
 		if (rc)
 			goto fail;
 
 		/* Allocate the completion ring */
 		softc->tx_cp_rings[i].cons = UINT32_MAX;
 		softc->tx_cp_rings[i].v_bit = 1;
 		bnxt_mark_cpr_invalid(&softc->tx_cp_rings[i]);
 		rc = bnxt_hwrm_ring_alloc(softc,
 		    HWRM_RING_ALLOC_INPUT_RING_TYPE_CMPL,
 		    &softc->tx_cp_rings[i].ring, (uint16_t)HWRM_NA_SIGNATURE,
 		    HWRM_NA_SIGNATURE, false);
 		if (rc)
 			goto fail;
 
 		/* Allocate the TX ring */
 		rc = bnxt_hwrm_ring_alloc(softc,
 		    HWRM_RING_ALLOC_INPUT_RING_TYPE_TX,
 		    &softc->tx_rings[i], softc->tx_cp_rings[i].ring.phys_id,
 		    softc->tx_cp_rings[i].stats_ctx_id, false);
 		if (rc)
 			goto fail;
 		BNXT_TX_DB(&softc->tx_rings[i], 0);
 		/* TODO: Cumulus+ doesn't need the double doorbell */
 		BNXT_TX_DB(&softc->tx_rings[i], 0);
 	}
 
 	bnxt_do_enable_intr(&softc->def_cp_ring);
 	bnxt_media_status(softc->ctx, &ifmr);
 	return;
 
 fail:
 	bnxt_hwrm_func_reset(softc);
 	bnxt_clear_ids(softc);
 	return;
 }
 
 static void
 bnxt_stop(if_ctx_t ctx)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 
 	bnxt_do_disable_intr(&softc->def_cp_ring);
 	bnxt_hwrm_func_reset(softc);
 	bnxt_clear_ids(softc);
 	return;
 }
 
 static void
 bnxt_multi_set(if_ctx_t ctx)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	if_t ifp = iflib_get_ifp(ctx);
 	uint8_t *mta;
 	int cnt, mcnt;
 
 	mcnt = if_multiaddr_count(ifp, -1);
 
 	if (mcnt > BNXT_MAX_MC_ADDRS) {
 		softc->vnic_info.rx_mask |=
 		    HWRM_CFA_L2_SET_RX_MASK_INPUT_MASK_ALL_MCAST;
 		bnxt_hwrm_cfa_l2_set_rx_mask(softc, &softc->vnic_info);
 	}
 	else {
 		softc->vnic_info.rx_mask &=
 		    ~HWRM_CFA_L2_SET_RX_MASK_INPUT_MASK_ALL_MCAST;
 		mta = softc->vnic_info.mc_list.idi_vaddr;
 		bzero(mta, softc->vnic_info.mc_list.idi_size);
 		if_multiaddr_array(ifp, mta, &cnt, mcnt);
 		bus_dmamap_sync(softc->vnic_info.mc_list.idi_tag,
 		    softc->vnic_info.mc_list.idi_map, BUS_DMASYNC_PREWRITE);
 		softc->vnic_info.mc_list_count = cnt;
 		softc->vnic_info.rx_mask |=
 		    HWRM_CFA_L2_SET_RX_MASK_INPUT_MASK_MCAST;
 		if (bnxt_hwrm_cfa_l2_set_rx_mask(softc, &softc->vnic_info))
 			device_printf(softc->dev,
 			    "set_multi: rx_mask set failed\n");
 	}
 }
 
 static int
 bnxt_mtu_set(if_ctx_t ctx, uint32_t mtu)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 
 	if (mtu > BNXT_MAX_MTU)
 		return EINVAL;
 
 	softc->scctx->isc_max_frame_size = mtu + ETHER_HDR_LEN + ETHER_CRC_LEN;
 	return 0;
 }
 
 static void
 bnxt_media_status(if_ctx_t ctx, struct ifmediareq * ifmr)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	struct bnxt_link_info *link_info = &softc->link_info;
 	uint8_t phy_type = get_phy_type(softc);
 
 	bnxt_update_link(softc, true);
 
 	ifmr->ifm_status = IFM_AVALID;
 	ifmr->ifm_active = IFM_ETHER;
 
 	if (link_info->link_up)
 		ifmr->ifm_status |= IFM_ACTIVE;
 	else
 		ifmr->ifm_status &= ~IFM_ACTIVE;
 
 	if (link_info->duplex == HWRM_PORT_PHY_QCFG_OUTPUT_DUPLEX_FULL)
 		ifmr->ifm_active |= IFM_FDX;
 	else
 		ifmr->ifm_active |= IFM_HDX;
 
 	switch (link_info->link_speed) {
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_100MB:
 		ifmr->ifm_active |= IFM_100_T;
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_1GB:
 		switch (phy_type) {
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKX:
 			ifmr->ifm_active |= IFM_1000_KX;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASET:
 			ifmr->ifm_active |= IFM_1000_T;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_SGMIIEXTPHY:
 			ifmr->ifm_active |= IFM_1000_SGMII;
 			break;
 		default:
 			ifmr->ifm_active |= IFM_UNKNOWN;
 			break;
 		}
 	break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_2_5GB:
 		switch (phy_type) {
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKX:
 			ifmr->ifm_active |= IFM_2500_KX;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASET:
 			ifmr->ifm_active |= IFM_2500_T;
 			break;
 		default:
 			ifmr->ifm_active |= IFM_UNKNOWN;
 			break;
 		}
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_10GB:
 		switch (phy_type) {
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASECR:
 			ifmr->ifm_active |= IFM_10G_CR1;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR4:
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR2:
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR:
 			ifmr->ifm_active |= IFM_10G_KR;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASELR:
 			ifmr->ifm_active |= IFM_10G_LR;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASESR:
 			ifmr->ifm_active |= IFM_10G_SR;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKX:
 			ifmr->ifm_active |= IFM_10G_KX4;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASET:
 			ifmr->ifm_active |= IFM_10G_T;
 			break;
 		default:
 			ifmr->ifm_active |= IFM_UNKNOWN;
 			break;
 		}
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_20GB:
 		ifmr->ifm_active |= IFM_20G_KR2;
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_25GB:
 		switch (phy_type) {
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASECR:
 			ifmr->ifm_active |= IFM_25G_CR;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR4:
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR2:
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR:
 			ifmr->ifm_active |= IFM_25G_KR;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASESR:
 			ifmr->ifm_active |= IFM_25G_SR;
 			break;
 		default:
 			ifmr->ifm_active |= IFM_UNKNOWN;
 			break;
 		}
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_40GB:
 		switch (phy_type) {
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASECR:
 			ifmr->ifm_active |= IFM_40G_CR4;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR4:
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR2:
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR:
 			ifmr->ifm_active |= IFM_40G_KR4;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASELR:
 			ifmr->ifm_active |= IFM_40G_LR4;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASESR:
 			ifmr->ifm_active |= IFM_40G_SR4;
 			break;
 		default:
 			ifmr->ifm_active |= IFM_UNKNOWN;
 			break;
 		}
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_50GB:
 		switch (phy_type) {
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASECR:
 			ifmr->ifm_active |= IFM_50G_CR2;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR4:
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR2:
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR:
 			ifmr->ifm_active |= IFM_50G_KR2;
 			break;
 		default:
 			ifmr->ifm_active |= IFM_UNKNOWN;
 			break;
 		}
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_100GB:
 		switch (phy_type) {
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASECR:
 			ifmr->ifm_active |= IFM_100G_CR4;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR4:
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR2:
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR:
 			ifmr->ifm_active |= IFM_100G_KR4;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASELR:
 			ifmr->ifm_active |= IFM_100G_LR4;
 			break;
 		case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASESR:
 			ifmr->ifm_active |= IFM_100G_SR4;
 			break;
 		default:
 			ifmr->ifm_active |= IFM_UNKNOWN;
 			break;
 		}
 	default:
 		return;
 	}
 
 	if (link_info->pause == (HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_TX |
 	    HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_RX))
 		ifmr->ifm_active |= (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE);
 	else if (link_info->pause == HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_TX)
 		ifmr->ifm_active |= IFM_ETH_TXPAUSE;
 	else if (link_info->pause == HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_RX)
 		ifmr->ifm_active |= IFM_ETH_RXPAUSE;
 
 	bnxt_report_link(softc);
 	return;
 }
 
 /*
  * iflib media-change callback.  Maps the ifmedia subtype currently selected
  * on the interface to a forced HWRM link speed, or to speed autonegotiation
  * for IFM_AUTO and for any subtype that is not recognised, then pushes the
  * setting to the firmware and refreshes the media status.
  *
  * Returns EINVAL when the selected media is not Ethernet, otherwise the
  * result of bnxt_hwrm_set_link_setting().
  */
 static int
 bnxt_media_change(if_ctx_t ctx)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	struct ifmedia *ifm = iflib_get_media(ctx);
 	struct bnxt_link_info *link = &softc->link_info;
 	struct ifmediareq ifmr;
 	bool use_autoneg = false;
 	int rc;
 
 	if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
 		return EINVAL;
 
 	/* Pick the forced speed; the autoneg bit is updated once below. */
 	switch (IFM_SUBTYPE(ifm->ifm_media)) {
 	case IFM_100_T:
 		link->req_link_speed =
 		    HWRM_PORT_PHY_CFG_INPUT_FORCE_LINK_SPEED_100MB;
 		break;
 	case IFM_1000_KX:
 	case IFM_1000_T:
 	case IFM_1000_SGMII:
 		link->req_link_speed =
 		    HWRM_PORT_PHY_CFG_INPUT_FORCE_LINK_SPEED_1GB;
 		break;
 	case IFM_2500_KX:
 	case IFM_2500_T:
 		link->req_link_speed =
 		    HWRM_PORT_PHY_CFG_INPUT_FORCE_LINK_SPEED_2_5GB;
 		break;
 	case IFM_10G_CR1:
 	case IFM_10G_KR:
 	case IFM_10G_LR:
 	case IFM_10G_SR:
 	case IFM_10G_T:
 		link->req_link_speed =
 		    HWRM_PORT_PHY_CFG_INPUT_FORCE_LINK_SPEED_10GB;
 		break;
 	case IFM_20G_KR2:
 		link->req_link_speed =
 		    HWRM_PORT_PHY_CFG_INPUT_FORCE_LINK_SPEED_20GB;
 		break;
 	case IFM_25G_CR:
 	case IFM_25G_KR:
 	case IFM_25G_SR:
 		link->req_link_speed =
 		    HWRM_PORT_PHY_CFG_INPUT_FORCE_LINK_SPEED_25GB;
 		break;
 	case IFM_40G_CR4:
 	case IFM_40G_KR4:
 	case IFM_40G_LR4:
 	case IFM_40G_SR4:
 		link->req_link_speed =
 		    HWRM_PORT_PHY_CFG_INPUT_FORCE_LINK_SPEED_40GB;
 		break;
 	case IFM_50G_CR2:
 	case IFM_50G_KR2:
 		link->req_link_speed =
 		    HWRM_PORT_PHY_CFG_INPUT_FORCE_LINK_SPEED_50GB;
 		break;
 	case IFM_100G_CR4:
 	case IFM_100G_KR4:
 	case IFM_100G_LR4:
 	case IFM_100G_SR4:
 		link->req_link_speed =
 		    HWRM_PORT_PHY_CFG_INPUT_FORCE_LINK_SPEED_100GB;
 		break;
 	default:
 		device_printf(softc->dev,
 		    "Unsupported media type!  Using auto\n");
 		/* FALLTHROUGH */
 	case IFM_AUTO:
 		use_autoneg = true;
 		break;
 	}
 
 	if (use_autoneg)
 		link->autoneg |= BNXT_AUTONEG_SPEED;
 	else
 		link->autoneg &= ~BNXT_AUTONEG_SPEED;
 
 	rc = bnxt_hwrm_set_link_setting(softc, true, true);
 	/* The request structure is scratch; only the refresh matters here. */
 	bnxt_media_status(softc->ctx, &ifmr);
 	return rc;
 }
 
 /*
  * iflib promiscuous/allmulti callback.  Rebuilds the ALL_MCAST and
  * PROMISCUOUS/ANYVLAN_NONVLAN bits of the VNIC RX mask from the interface
  * flags (and from the multicast address count) and sends the mask to the
  * firmware.  The `flags' argument is not consulted.
  *
  * Returns the result of bnxt_hwrm_cfa_l2_set_rx_mask().
  */
 static int
 bnxt_promisc_set(if_ctx_t ctx, int flags)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	if_t ifp = iflib_get_ifp(ctx);
 	bool want_all_mcast;
 	bool want_promisc;
 
 	/* The address count is only consulted when ALLMULTI is clear. */
 	want_all_mcast = (ifp->if_flags & IFF_ALLMULTI) != 0 ||
 	    if_multiaddr_count(ifp, -1) > BNXT_MAX_MC_ADDRS;
 	want_promisc = (ifp->if_flags & IFF_PROMISC) != 0;
 
 	if (want_all_mcast) {
 		softc->vnic_info.rx_mask |=
 		    HWRM_CFA_L2_SET_RX_MASK_INPUT_MASK_ALL_MCAST;
 	} else {
 		softc->vnic_info.rx_mask &=
 		    ~HWRM_CFA_L2_SET_RX_MASK_INPUT_MASK_ALL_MCAST;
 	}
 
 	if (want_promisc) {
 		softc->vnic_info.rx_mask |=
 		    HWRM_CFA_L2_SET_RX_MASK_INPUT_MASK_PROMISCUOUS |
 		    HWRM_CFA_L2_SET_RX_MASK_INPUT_MASK_ANYVLAN_NONVLAN;
 	} else {
 		softc->vnic_info.rx_mask &=
 		    ~(HWRM_CFA_L2_SET_RX_MASK_INPUT_MASK_PROMISCUOUS |
 		    HWRM_CFA_L2_SET_RX_MASK_INPUT_MASK_ANYVLAN_NONVLAN);
 	}
 
 	return bnxt_hwrm_cfa_l2_set_rx_mask(softc, &softc->vnic_info);
 }
 
 /*
  * iflib counter callback.  Counter ids below IFCOUNTERS are answered by
  * if_get_counter_default(); any other id reads as zero.
  */
 static uint64_t
 bnxt_get_counter(if_ctx_t ctx, ift_counter cnt)
 {
 	if_t ifp = iflib_get_ifp(ctx);
 
 	if (cnt >= IFCOUNTERS)
 		return 0;
 
 	return if_get_counter_default(ifp, cnt);
 }
 
 /* iflib admin-status callback; currently performs no work. */
 static void
 bnxt_update_admin_status(if_ctx_t ctx)
 {
 	/* TODO: decide whether anything needs to happen here. */
 }
 
 /*
  * Ring the enable doorbell of one completion ring.  Rings whose phys_id
  * still holds the HWRM_NA_SIGNATURE placeholder are left untouched.
  */
 static inline void
 bnxt_do_enable_intr(struct bnxt_cp_ring *cpr)
 {
 	if (cpr->ring.phys_id == (uint16_t)HWRM_NA_SIGNATURE)
 		return;
 
 	if (cpr->cons != UINT32_MAX) {
 		BNXT_CP_IDX_ENABLE_DB(&cpr->ring, cpr->cons);
 	} else {
 		/* First time enabling: do not set a consumer index. */
 		BNXT_CP_ENABLE_DB(&cpr->ring);
 	}
 }
 
 /*
  * Ring the disable doorbell of one completion ring, skipping rings whose
  * phys_id still holds the HWRM_NA_SIGNATURE placeholder.
  */
 static inline void
 bnxt_do_disable_intr(struct bnxt_cp_ring *cpr)
 {
 	if (cpr->ring.phys_id == (uint16_t)HWRM_NA_SIGNATURE)
 		return;
 
 	BNXT_CP_DISABLE_DB(&cpr->ring);
 }
 
 /*
  * Enable interrupts on the default completion ring and on every RX
  * completion ring.
  */
 static void
 bnxt_intr_enable(if_ctx_t ctx)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	int q = 0;
 
 	bnxt_do_enable_intr(&softc->def_cp_ring);
 	while (q < softc->nrxqsets) {
 		bnxt_do_enable_intr(&softc->rx_cp_rings[q]);
 		q++;
 	}
 }
 
 /*
  * Enable the interrupt of the RX completion ring with index `qid'.
  * Always returns 0.
  */
 static int
 bnxt_queue_intr_enable(if_ctx_t ctx, uint16_t qid)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	struct bnxt_cp_ring *rx_cpr = &softc->rx_cp_rings[qid];
 
 	bnxt_do_enable_intr(rx_cpr);
 
 	return 0;
 }
 
 /* Disable the interrupts of every TX and RX completion ring. */
 static void
 bnxt_disable_intr(if_ctx_t ctx)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	int q;
 
 	/*
 	 * NOTE: the TX interrupts should never have been enabled, so the
 	 * index is not updated for them.
 	 */
 	q = 0;
 	while (q < softc->ntxqsets) {
 		bnxt_do_disable_intr(&softc->tx_cp_rings[q]);
 		q++;
 	}
 
 	q = 0;
 	while (q < softc->nrxqsets) {
 		bnxt_do_disable_intr(&softc->rx_cp_rings[q]);
 		q++;
 	}
 }
 
 /*
  * Allocate the interrupt resources of the device: one admin interrupt for
  * the default completion ring, one interrupt per RX completion ring, and a
  * soft IRQ per TX queue set.  The `msix' argument is not consulted.
  *
  * Returns 0 on success.  When an interrupt cannot be allocated, the
  * interrupts obtained so far are released and the error reported by
  * iflib_irq_alloc_generic() is returned.
  */
 static int
 bnxt_msix_intr_assign(if_ctx_t ctx, int msix)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	int rc;
 	int i;
 
 	/* Admin interrupt for the default completion ring. */
 	rc = iflib_irq_alloc_generic(ctx, &softc->def_cp_ring.irq,
 	    softc->def_cp_ring.ring.id + 1, IFLIB_INTR_ADMIN,
 	    bnxt_handle_def_cp, softc, 0, "def_cp");
 	if (rc) {
 		device_printf(iflib_get_dev(ctx),
 		    "Failed to register default completion ring handler\n");
 		return rc;
 	}
 
 	/* One interrupt per RX completion ring. */
 	for (i=0; i<softc->scctx->isc_nrxqsets; i++) {
 		rc = iflib_irq_alloc_generic(ctx, &softc->rx_cp_rings[i].irq,
-		    softc->rx_cp_rings[i].ring.id + 1, IFLIB_INTR_RX,
+		    softc->rx_cp_rings[i].ring.id + 1, IFLIB_INTR_RXTX,
 		    bnxt_handle_rx_cp, &softc->rx_cp_rings[i], i, "rx_cp");
 		if (rc) {
 			device_printf(iflib_get_dev(ctx),
 			    "Failed to register RX completion ring handler\n");
 			/* Step back past the ring whose allocation failed. */
 			i--;
 			goto fail;
 		}
 	}
 
 	/* TX completions get a soft IRQ per queue set. */
 	for (i=0; i<softc->scctx->isc_ntxqsets; i++)
-		iflib_softirq_alloc_generic(ctx, i + 1, IFLIB_INTR_TX, NULL, i,
-		    "tx_cp");
+		iflib_softirq_alloc_generic(ctx, NULL, IFLIB_INTR_TX, NULL, i, "tx_cp");
 
 	return rc;
 
 fail:
 	/* Release the RX interrupts already obtained, then the admin one. */
 	for (; i>=0; i--)
 		iflib_irq_free(ctx, &softc->rx_cp_rings[i].irq);
 	iflib_irq_free(ctx, &softc->def_cp_ring.irq);
 	return rc;
 }
 
 /*
  * iflib VLAN-register callback: remember `vtag' in the VNIC's VLAN tag
  * list.
  *
  * We're explicitly allowing duplicates here.  They will need to be
  * removed as many times as they are added.
  *
  * The allocation is M_NOWAIT; when it fails the tag is silently not
  * recorded, because the callback has no way to report an error.
  */
 static void
 bnxt_vlan_register(if_ctx_t ctx, uint16_t vtag)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	struct bnxt_vlan_tag *new_tag;
 
 	new_tag = malloc(sizeof(*new_tag), M_DEVBUF, M_NOWAIT);
 	if (new_tag == NULL)
 		return;
 	new_tag->tag = vtag;
 	/* 802.1Q tag protocol identifier: hexadecimal 0x8100, not 8100. */
 	new_tag->tpid = 0x8100;
 	SLIST_INSERT_HEAD(&softc->vnic_info.vlan_tags, new_tag, next);
 }
 
 /*
  * iflib VLAN-unregister callback: remove and free the first entry of the
  * VNIC's VLAN tag list whose tag equals `vtag'.  Further entries with the
  * same tag, if any, stay on the list.
  */
 static void
 bnxt_vlan_unregister(if_ctx_t ctx, uint16_t vtag)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	struct bnxt_vlan_tag *entry;
 
 	SLIST_FOREACH(entry, &softc->vnic_info.vlan_tags, next) {
 		if (entry->tag != vtag)
 			continue;
 
 		/* Leave the loop right away: `entry' is gone after this. */
 		SLIST_REMOVE(&softc->vnic_info.vlan_tags, entry,
 		    bnxt_vlan_tag, next);
 		free(entry, M_DEVBUF);
 		break;
 	}
 }
 
 static int
 bnxt_priv_ioctl(if_ctx_t ctx, u_long command, caddr_t data)
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct ifreq_buffer *ifbuf = &ifr->ifr_ifru.ifru_buffer;
 	struct bnxt_ioctl_header *ioh =
 	    (struct bnxt_ioctl_header *)(ifbuf->buffer);
 	int rc = ENOTSUP;
 	struct bnxt_ioctl_data *iod = NULL;
 
 	switch (command) {
 	case SIOCGPRIVATE_0:
 		if ((rc = priv_check(curthread, PRIV_DRIVER)) != 0)
 			goto exit;
 
 		iod = malloc(ifbuf->length, M_DEVBUF, M_NOWAIT | M_ZERO);
 		if (!iod) {
 			rc = ENOMEM;
 			goto exit;
 		}
 		copyin(ioh, iod, ifbuf->length);
 
 		switch (ioh->type) {
 		case BNXT_HWRM_NVM_FIND_DIR_ENTRY:
 		{
 			struct bnxt_ioctl_hwrm_nvm_find_dir_entry *find =
 			    &iod->find;
 
 			rc = bnxt_hwrm_nvm_find_dir_entry(softc, find->type,
 			    &find->ordinal, find->ext, &find->index,
 			    find->use_index, find->search_opt,
 			    &find->data_length, &find->item_length,
 			    &find->fw_ver);
 			if (rc) {
 				iod->hdr.rc = rc;
 				copyout(&iod->hdr.rc, &ioh->rc,
 				    sizeof(ioh->rc));
 			}
 			else {
 				iod->hdr.rc = 0;
 				copyout(iod, ioh, ifbuf->length);
 			}
 
 			rc = 0;
 			goto exit;
 		}
 		case BNXT_HWRM_NVM_READ:
 		{
 			struct bnxt_ioctl_hwrm_nvm_read *rd = &iod->read;
 			struct iflib_dma_info dma_data;
 			size_t offset;
 			size_t remain;
 			size_t csize;
 
 			/*
 			 * Some HWRM versions can't read more than 0x8000 bytes
 			 */
 			rc = iflib_dma_alloc(softc->ctx,
 			    min(rd->length, 0x8000), &dma_data, BUS_DMA_NOWAIT);
 			if (rc)
 				break;
 			for (remain = rd->length, offset = 0;
 			    remain && offset < rd->length; offset += 0x8000) {
 				csize = min(remain, 0x8000);
 				rc = bnxt_hwrm_nvm_read(softc, rd->index,
 				    rd->offset + offset, csize, &dma_data);
 				if (rc) {
 					iod->hdr.rc = rc;
 					copyout(&iod->hdr.rc, &ioh->rc,
 					    sizeof(ioh->rc));
 					break;
 				}
 				else {
 					copyout(dma_data.idi_vaddr,
 					    rd->data + offset, csize);
 					iod->hdr.rc = 0;
 				}
 				remain -= csize;
 			}
 			if (iod->hdr.rc == 0)
 				copyout(iod, ioh, ifbuf->length);
 
 			iflib_dma_free(&dma_data);
 			rc = 0;
 			goto exit;
 		}
 		case BNXT_HWRM_FW_RESET:
 		{
 			struct bnxt_ioctl_hwrm_fw_reset *rst =
 			    &iod->reset;
 
 			rc = bnxt_hwrm_fw_reset(softc, rst->processor,
 			    &rst->selfreset);
 			if (rc) {
 				iod->hdr.rc = rc;
 				copyout(&iod->hdr.rc, &ioh->rc,
 				    sizeof(ioh->rc));
 			}
 			else {
 				iod->hdr.rc = 0;
 				copyout(iod, ioh, ifbuf->length);
 			}
 
 			rc = 0;
 			goto exit;
 		}
 		case BNXT_HWRM_FW_QSTATUS:
 		{
 			struct bnxt_ioctl_hwrm_fw_qstatus *qstat =
 			    &iod->status;
 
 			rc = bnxt_hwrm_fw_qstatus(softc, qstat->processor,
 			    &qstat->selfreset);
 			if (rc) {
 				iod->hdr.rc = rc;
 				copyout(&iod->hdr.rc, &ioh->rc,
 				    sizeof(ioh->rc));
 			}
 			else {
 				iod->hdr.rc = 0;
 				copyout(iod, ioh, ifbuf->length);
 			}
 
 			rc = 0;
 			goto exit;
 		}
 		case BNXT_HWRM_NVM_WRITE:
 		{
 			struct bnxt_ioctl_hwrm_nvm_write *wr =
 			    &iod->write;
 
 			rc = bnxt_hwrm_nvm_write(softc, wr->data, true,
 			    wr->type, wr->ordinal, wr->ext, wr->attr,
 			    wr->option, wr->data_length, wr->keep,
 			    &wr->item_length, &wr->index);
 			if (rc) {
 				iod->hdr.rc = rc;
 				copyout(&iod->hdr.rc, &ioh->rc,
 				    sizeof(ioh->rc));
 			}
 			else {
 				iod->hdr.rc = 0;
 				copyout(iod, ioh, ifbuf->length);
 			}
 
 			rc = 0;
 			goto exit;
 		}
 		case BNXT_HWRM_NVM_ERASE_DIR_ENTRY:
 		{
 			struct bnxt_ioctl_hwrm_nvm_erase_dir_entry *erase =
 			    &iod->erase;
 
 			rc = bnxt_hwrm_nvm_erase_dir_entry(softc, erase->index);
 			if (rc) {
 				iod->hdr.rc = rc;
 				copyout(&iod->hdr.rc, &ioh->rc,
 				    sizeof(ioh->rc));
 			}
 			else {
 				iod->hdr.rc = 0;
 				copyout(iod, ioh, ifbuf->length);
 			}
 
 			rc = 0;
 			goto exit;
 		}
 		case BNXT_HWRM_NVM_GET_DIR_INFO:
 		{
 			struct bnxt_ioctl_hwrm_nvm_get_dir_info *info =
 			    &iod->dir_info;
 
 			rc = bnxt_hwrm_nvm_get_dir_info(softc, &info->entries,
 			    &info->entry_length);
 			if (rc) {
 				iod->hdr.rc = rc;
 				copyout(&iod->hdr.rc, &ioh->rc,
 				    sizeof(ioh->rc));
 			}
 			else {
 				iod->hdr.rc = 0;
 				copyout(iod, ioh, ifbuf->length);
 			}
 
 			rc = 0;
 			goto exit;
 		}
 		case BNXT_HWRM_NVM_GET_DIR_ENTRIES:
 		{
 			struct bnxt_ioctl_hwrm_nvm_get_dir_entries *get =
 			    &iod->dir_entries;
 			struct iflib_dma_info dma_data;
 
 			rc = iflib_dma_alloc(softc->ctx, get->max_size,
 			    &dma_data, BUS_DMA_NOWAIT);
 			if (rc)
 				break;
 			rc = bnxt_hwrm_nvm_get_dir_entries(softc, &get->entries,
 			    &get->entry_length, &dma_data);
 			if (rc) {
 				iod->hdr.rc = rc;
 				copyout(&iod->hdr.rc, &ioh->rc,
 				    sizeof(ioh->rc));
 			}
 			else {
 				copyout(dma_data.idi_vaddr, get->data,
 				    get->entry_length * get->entries);
 				iod->hdr.rc = 0;
 				copyout(iod, ioh, ifbuf->length);
 			}
 			iflib_dma_free(&dma_data);
 
 			rc = 0;
 			goto exit;
 		}
 		case BNXT_HWRM_NVM_VERIFY_UPDATE:
 		{
 			struct bnxt_ioctl_hwrm_nvm_verify_update *vrfy =
 			    &iod->verify;
 
 			rc = bnxt_hwrm_nvm_verify_update(softc, vrfy->type,
 			    vrfy->ordinal, vrfy->ext);
 			if (rc) {
 				iod->hdr.rc = rc;
 				copyout(&iod->hdr.rc, &ioh->rc,
 				    sizeof(ioh->rc));
 			}
 			else {
 				iod->hdr.rc = 0;
 				copyout(iod, ioh, ifbuf->length);
 			}
 
 			rc = 0;
 			goto exit;
 		}
 		case BNXT_HWRM_NVM_INSTALL_UPDATE:
 		{
 			struct bnxt_ioctl_hwrm_nvm_install_update *inst =
 			    &iod->install;
 
 			rc = bnxt_hwrm_nvm_install_update(softc,
 			    inst->install_type, &inst->installed_items,
 			    &inst->result, &inst->problem_item,
 			    &inst->reset_required);
 			if (rc) {
 				iod->hdr.rc = rc;
 				copyout(&iod->hdr.rc, &ioh->rc,
 				    sizeof(ioh->rc));
 			}
 			else {
 				iod->hdr.rc = 0;
 				copyout(iod, ioh, ifbuf->length);
 			}
 
 			rc = 0;
 			goto exit;
 		}
 		case BNXT_HWRM_NVM_MODIFY:
 		{
 			struct bnxt_ioctl_hwrm_nvm_modify *mod = &iod->modify;
 
 			rc = bnxt_hwrm_nvm_modify(softc, mod->index,
 			    mod->offset, mod->data, true, mod->length);
 			if (rc) {
 				iod->hdr.rc = rc;
 				copyout(&iod->hdr.rc, &ioh->rc,
 				    sizeof(ioh->rc));
 			}
 			else {
 				iod->hdr.rc = 0;
 				copyout(iod, ioh, ifbuf->length);
 			}
 
 			rc = 0;
 			goto exit;
 		}
 		case BNXT_HWRM_FW_GET_TIME:
 		{
 			struct bnxt_ioctl_hwrm_fw_get_time *gtm =
 			    &iod->get_time;
 
 			rc = bnxt_hwrm_fw_get_time(softc, &gtm->year,
 			    &gtm->month, &gtm->day, &gtm->hour, &gtm->minute,
 			    &gtm->second, &gtm->millisecond, &gtm->zone);
 			if (rc) {
 				iod->hdr.rc = rc;
 				copyout(&iod->hdr.rc, &ioh->rc,
 				    sizeof(ioh->rc));
 			}
 			else {
 				iod->hdr.rc = 0;
 				copyout(iod, ioh, ifbuf->length);
 			}
 
 			rc = 0;
 			goto exit;
 		}
 		case BNXT_HWRM_FW_SET_TIME:
 		{
 			struct bnxt_ioctl_hwrm_fw_set_time *stm =
 			    &iod->set_time;
 
 			rc = bnxt_hwrm_fw_set_time(softc, stm->year,
 			    stm->month, stm->day, stm->hour, stm->minute,
 			    stm->second, stm->millisecond, stm->zone);
 			if (rc) {
 				iod->hdr.rc = rc;
 				copyout(&iod->hdr.rc, &ioh->rc,
 				    sizeof(ioh->rc));
 			}
 			else {
 				iod->hdr.rc = 0;
 				copyout(iod, ioh, ifbuf->length);
 			}
 
 			rc = 0;
 			goto exit;
 		}
 		}
 		break;
 	}
 
 exit:
 	if (iod)
 		free(iod, M_DEVBUF);
 	return rc;
 }
 
 /*
  * Support functions
  */
 /*
  * Query the PHY through bnxt_update_link() and seed the requested link
  * settings (autoneg flags, flow control, duplex, speed) from the values
  * that query left in softc->link_info.
  *
  * Returns 0 on success or the error from bnxt_update_link().
  */
 static int
 bnxt_probe_phy(struct bnxt_softc *softc)
 {
 	struct bnxt_link_info *link = &softc->link_info;
 	int rc;
 
 	rc = bnxt_update_link(softc, false);
 	if (rc != 0) {
 		device_printf(softc->dev,
 		    "Probe phy can't update link (rc: %x)\n", rc);
 		return (rc);
 	}
 
 	/* Initialize the requested-settings copy from what was reported. */
 	if (link->auto_mode != HWRM_PORT_PHY_QCFG_OUTPUT_AUTO_MODE_NONE)
 		link->autoneg |= BNXT_AUTONEG_SPEED;
 
 	if ((link->auto_pause & (HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_TX |
 	    HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_RX)) != 0) {
 		/* Flow-control autoneg only when both directions are set. */
 		if (link->auto_pause == (HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_TX |
 		    HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_RX))
 			link->autoneg |= BNXT_AUTONEG_FLOW_CTRL;
 		link->req_flow_ctrl = link->auto_pause;
 	} else if ((link->force_pause & (HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_TX |
 	    HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_RX)) != 0) {
 		link->req_flow_ctrl = link->force_pause;
 	}
 
 	link->req_duplex = link->duplex_setting;
 	link->req_link_speed = (link->autoneg & BNXT_AUTONEG_SPEED) ?
 	    link->auto_link_speed : link->force_link_speed;
 
 	return (rc);
 }
 
 /*
  * Helper for bnxt_add_media_types(): add `media' to the ifmedia list when
  * the SUPPORT_SPEEDS_<speed> bit is set in the local `supported' mask.
  */
 #define	BNXT_ADD_MEDIA(speed, media) do {				\
 	if (supported &							\
 	    HWRM_PORT_PHY_QCFG_OUTPUT_SUPPORT_SPEEDS_##speed)		\
 		ifmedia_add(softc->media, IFM_ETHER | (media), 0,	\
 		    NULL);						\
 } while (0)
 
 /*
  * Populate the interface's ifmedia list.  IFM_AUTO is always added; unless
  * BNXT_FLAG_NPAR is set, one entry is added for every speed that is both
  * present in link_info.support_speeds and meaningful for the PHY type
  * returned by get_phy_type().
  */
 static void
 bnxt_add_media_types(struct bnxt_softc *softc)
 {
 	uint16_t supported = softc->link_info.support_speeds;
 	uint8_t phy_type = get_phy_type(softc);
 
 	/* Auto is always supported */
 	ifmedia_add(softc->media, IFM_ETHER | IFM_AUTO, 0, NULL);
 
 	if (softc->flags & BNXT_FLAG_NPAR)
 		return;
 
 	switch (phy_type) {
 	case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASECR:
 		BNXT_ADD_MEDIA(100GB, IFM_100G_CR4);
 		BNXT_ADD_MEDIA(50GB, IFM_50G_CR2);
 		BNXT_ADD_MEDIA(40GB, IFM_40G_CR4);
 		BNXT_ADD_MEDIA(25GB, IFM_25G_CR);
 		BNXT_ADD_MEDIA(10GB, IFM_10G_CR1);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_UNKNOWN:
 		/* Auto only */
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR4:
 	case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR2:
 	case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR:
 		BNXT_ADD_MEDIA(100GB, IFM_100G_KR4);
 		BNXT_ADD_MEDIA(50GB, IFM_50G_KR2);
 		BNXT_ADD_MEDIA(40GB, IFM_40G_KR4);
 		BNXT_ADD_MEDIA(25GB, IFM_25G_KR);
 		BNXT_ADD_MEDIA(20GB, IFM_20G_KR2);
 		BNXT_ADD_MEDIA(10GB, IFM_10G_KR);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASELR:
 		BNXT_ADD_MEDIA(100GB, IFM_100G_LR4);
 		BNXT_ADD_MEDIA(40GB, IFM_40G_LR4);
 		BNXT_ADD_MEDIA(10GB, IFM_10G_LR);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASESR:
 		BNXT_ADD_MEDIA(100GB, IFM_100G_SR4);
 		BNXT_ADD_MEDIA(40GB, IFM_40G_SR4);
 		BNXT_ADD_MEDIA(25GB, IFM_25G_SR);
 		BNXT_ADD_MEDIA(10GB, IFM_10G_SR);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKX:
 		BNXT_ADD_MEDIA(10GB, IFM_10G_KX4);
 		BNXT_ADD_MEDIA(2_5GB, IFM_2500_KX);
 		BNXT_ADD_MEDIA(1GB, IFM_1000_KX);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASET:
 	case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASETE:
 		BNXT_ADD_MEDIA(10MB, IFM_10_T);
 		BNXT_ADD_MEDIA(100MB, IFM_100_T);
 		BNXT_ADD_MEDIA(1GB, IFM_1000_T);
 		BNXT_ADD_MEDIA(2_5GB, IFM_2500_T);
 		BNXT_ADD_MEDIA(10GB, IFM_10G_T);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_SGMIIEXTPHY:
 		BNXT_ADD_MEDIA(1GB, IFM_1000_SGMII);
 		break;
 	}
 }
 
 #undef BNXT_ADD_MEDIA
 
 /*
  * Allocate and activate PCI memory BAR `bar_num' and record its resource,
  * bus tag, bus handle and size in `bar'.  RF_SHAREABLE is requested in
  * addition to RF_ACTIVE when `shareable' is true.
  *
  * Returns 0 on success, EDOOFUS when `bar' already holds a resource, or
  * ENXIO when the resource cannot be allocated.
  */
 static int
 bnxt_map_bar(struct bnxt_softc *softc, struct bnxt_bar_info *bar, int bar_num, bool shareable)
 {
 	uint32_t rflags = shareable ? (RF_ACTIVE | RF_SHAREABLE) : RF_ACTIVE;
 
 	if (bar->res != NULL) {
 		device_printf(softc->dev, "Bar %d already mapped\n", bar_num);
 		return EDOOFUS;
 	}
 
 	bar->rid = PCIR_BAR(bar_num);
 	bar->res = bus_alloc_resource_any(softc->dev, SYS_RES_MEMORY,
 	    &bar->rid, rflags);
 	if (bar->res == NULL) {
 		device_printf(softc->dev,
 		    "PCI BAR%d mapping failure\n", bar_num);
 		return (ENXIO);
 	}
 
 	bar->tag = rman_get_bustag(bar->res);
 	bar->handle = rman_get_bushandle(bar->res);
 	bar->size = rman_get_size(bar->res);
 
 	return 0;
 }
 
 /*
  * Map the two BARs the driver uses: BAR 0 (HWRM, shareable) and BAR 2
  * (doorbell, not shareable).  BAR 2 is only attempted when BAR 0 mapped.
  *
  * Returns 0 on success or the first bnxt_map_bar() error.
  */
 static int
 bnxt_pci_mapping(struct bnxt_softc *softc)
 {
 	int rc = bnxt_map_bar(softc, &softc->hwrm_bar, 0, true);
 
 	if (rc == 0)
 		rc = bnxt_map_bar(softc, &softc->doorbell_bar, 2, false);
 
 	return rc;
 }
 
 static void
 bnxt_pci_mapping_free(struct bnxt_softc *softc)
 {
 	if (softc->hwrm_bar.res != NULL)
 		bus_release_resource(softc->dev, SYS_RES_MEMORY,
 		    softc->hwrm_bar.rid, softc->hwrm_bar.res);
 	softc->hwrm_bar.res = NULL;
 
 	if (softc->doorbell_bar.res != NULL)
 		bus_release_resource(softc->dev, SYS_RES_MEMORY,
 		    softc->doorbell_bar.rid, softc->doorbell_bar.res);
 	softc->doorbell_bar.res = NULL;
 }
 
 /*
  * Refresh the PHY configuration via bnxt_hwrm_port_phy_qcfg() and update
  * link_info.link_up.  With `chng_link_state' true, link_up follows the
  * reported PHY link status and a change is announced through
  * bnxt_report_link(); otherwise link_up is forced to 0.
  *
  * Returns 0 on success or the error from bnxt_hwrm_port_phy_qcfg().
  */
 static int
 bnxt_update_link(struct bnxt_softc *softc, bool chng_link_state)
 {
 	struct bnxt_link_info *link = &softc->link_info;
 	uint8_t was_up = link->link_up;
 	int rc;
 
 	rc = bnxt_hwrm_port_phy_qcfg(softc);
 	if (rc != 0)
 		return rc;
 
 	if (!chng_link_state) {
 		/* Always link down when not asked to update link state. */
 		link->link_up = 0;
 		return rc;
 	}
 
 	/* TODO: need to add more logic to report VF link */
 	link->link_up = (link->phy_link_status ==
 	    HWRM_PORT_PHY_QCFG_OUTPUT_LINK_LINK) ? 1 : 0;
 	if (was_up != link->link_up)
 		bnxt_report_link(softc);
 
 	return rc;
 }
 
 void
 bnxt_report_link(struct bnxt_softc *softc)
 {
 	const char *duplex = NULL, *flow_ctrl = NULL;
 
 	if (softc->link_info.link_up == softc->link_info.last_link_up) {
 		if (!softc->link_info.link_up)
 			return;
 		if (softc->link_info.pause == softc->link_info.last_pause &&
 		    softc->link_info.duplex == softc->link_info.last_duplex)
 			return;
 	}
 
 	if (softc->link_info.link_up) {
 		if (softc->link_info.duplex ==
 		    HWRM_PORT_PHY_QCFG_OUTPUT_DUPLEX_FULL)
 			duplex = "full duplex";
 		else
 			duplex = "half duplex";
 		if (softc->link_info.pause == (
 		    HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_TX |
 		    HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_RX))
 			flow_ctrl = "FC - receive & transmit";
 		else if (softc->link_info.pause ==
 		    HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_TX)
 			flow_ctrl = "FC - transmit";
 		else if (softc->link_info.pause ==
 		    HWRM_PORT_PHY_QCFG_OUTPUT_PAUSE_RX)
 			flow_ctrl = "FC - receive";
 		else
 			flow_ctrl = "none";
 		iflib_link_state_change(softc->ctx, LINK_STATE_UP,
 		    IF_Gbps(100));
 		device_printf(softc->dev, "Link is UP %s, %s\n", duplex,
 		    flow_ctrl);
 	} else {
 		iflib_link_state_change(softc->ctx, LINK_STATE_DOWN,
 		    bnxt_get_baudrate(&softc->link_info));
 		device_printf(softc->dev, "Link is Down\n");
 	}
 
 	softc->link_info.last_link_up = softc->link_info.link_up;
 	softc->link_info.last_pause = softc->link_info.pause;
 	softc->link_info.last_duplex = softc->link_info.duplex;
 }
 
 /*
  * Interrupt filter for an RX completion ring (`arg' is the
  * struct bnxt_cp_ring).  Rings the ring's disable doorbell and asks for
  * the interrupt thread to be scheduled.
  */
 static int
 bnxt_handle_rx_cp(void *arg)
 {
 	struct bnxt_cp_ring *rx_cpr = (struct bnxt_cp_ring *)arg;
 
 	/* No further interrupts for this queue until it is re-enabled. */
 	BNXT_CP_DISABLE_DB(&rx_cpr->ring);
 
 	return FILTER_SCHEDULE_THREAD;
 }
 
 /*
  * Interrupt filter for the default completion ring (`arg' is the softc).
  * Rings the ring's disable doorbell, enqueues softc->def_cp_task and
  * reports the interrupt as handled.
  */
 static int
 bnxt_handle_def_cp(void *arg)
 {
 	struct bnxt_softc *sc = (struct bnxt_softc *)arg;
 
 	BNXT_CP_DISABLE_DB(&sc->def_cp_ring.ring);
 	GROUPTASK_ENQUEUE(&sc->def_cp_task);
 
 	return FILTER_HANDLED;
 }
 
 /*
  * Reset every firmware-assigned identifier kept in the softc to its
  * "not assigned" placeholder: ring physical ids, stats context ids, ring
  * group ids and the VNIC ids.  The RSS group table is filled with 0xff
  * bytes.
  */
 static void
 bnxt_clear_ids(struct bnxt_softc *softc)
 {
 	const uint16_t na16 = (uint16_t)HWRM_NA_SIGNATURE;
 	int q;
 
 	softc->def_cp_ring.stats_ctx_id = HWRM_NA_SIGNATURE;
 	softc->def_cp_ring.ring.phys_id = na16;
 
 	for (q = 0; q < softc->ntxqsets; q++) {
 		softc->tx_cp_rings[q].stats_ctx_id = HWRM_NA_SIGNATURE;
 		softc->tx_cp_rings[q].ring.phys_id = na16;
 		softc->tx_rings[q].phys_id = na16;
 	}
 
 	for (q = 0; q < softc->nrxqsets; q++) {
 		softc->rx_cp_rings[q].stats_ctx_id = HWRM_NA_SIGNATURE;
 		softc->rx_cp_rings[q].ring.phys_id = na16;
 		softc->rx_rings[q].phys_id = na16;
 		softc->ag_rings[q].phys_id = na16;
 		softc->grp_info[q].grp_id = na16;
 	}
 
 	softc->vnic_info.filter_id = -1;
 	softc->vnic_info.id = na16;
 	softc->vnic_info.rss_id = na16;
 	memset(softc->vnic_info.rss_grp_tbl.idi_vaddr, 0xff,
 	    softc->vnic_info.rss_grp_tbl.idi_size);
 }
 
 /*
  * Set info3_v of every entry of the completion ring to the negation of the
  * ring's current v_bit.
  */
 static void
 bnxt_mark_cpr_invalid(struct bnxt_cp_ring *cpr)
 {
 	struct cmpl_base *entries = (void *)cpr->ring.vaddr;
 	int idx;
 
 	for (idx = 0; idx < cpr->ring.ring_size; idx++) {
 		struct cmpl_base *entry = &entries[idx];
 
 		entry->info3_v = !cpr->v_bit;
 	}
 }
 
 /*
  * Handle one HWRM async event completion.  Link status/speed events
  * trigger a media status refresh; every other event id is only logged,
  * as "Unhandled" when it is a listed id and as "Unknown" otherwise.
  */
 static void
 bnxt_handle_async_event(struct bnxt_softc *softc, struct cmpl_base *cmpl)
 {
 	struct hwrm_async_event_cmpl *event = (void *)cmpl;
 	uint16_t event_id = le16toh(event->event_id);
 	struct ifmediareq ifmr;
 
 	switch (event_id) {
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_LINK_STATUS_CHANGE:
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CHANGE:
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_CHANGE:
 		/* The request structure is scratch space for the refresh. */
 		bnxt_media_status(softc->ctx, &ifmr);
 		break;
 
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_LINK_MTU_CHANGE:
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE:
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_PORT_CONN_NOT_ALLOWED:
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_NOT_ALLOWED:
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_FUNC_DRVR_UNLOAD:
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_FUNC_DRVR_LOAD:
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_UNLOAD:
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_LOAD:
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_VF_FLR:
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_VF_MAC_ADDR_CHANGE:
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_PF_VF_COMM_STATUS_CHANGE:
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_VF_CFG_CHANGE:
 	case HWRM_ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR:
 		device_printf(softc->dev,
 		    "Unhandled async completion type %u\n", event_id);
 		break;
 
 	default:
 		device_printf(softc->dev,
 		    "Unknown async completion type %u\n", event_id);
 		break;
 	}
 }
 
 /*
  * Task body for the default completion ring; presumably the handler behind
  * softc->def_cp_task -- the registration is not in this part of the file.
  * `context' is the iflib context.
  *
  * Walks the ring starting from the saved consumer position, dispatching
  * HWRM async events and logging every other completion type, until it
  * meets an entry that is not valid.  It then stores the position of the
  * last consumed entry and rings the enable doorbell with that index.
  */
 static void
 bnxt_def_cp_task(void *context)
 {
 	if_ctx_t ctx = context;
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	struct bnxt_cp_ring *cpr = &softc->def_cp_ring;
 
 	/* Handle completions on the default completion ring */
 	struct cmpl_base *cmpl;
 	uint32_t cons = cpr->cons;
 	bool v_bit = cpr->v_bit;
 	bool last_v_bit;
 	uint32_t last_cons;
 	uint16_t type;
 
 	for (;;) {
 		/*
 		 * Remember the position before moving on, so the saved state
 		 * refers to the last entry that was actually consumed.
 		 */
 		last_cons = cons;
 		last_v_bit = v_bit;
 		/*
 		 * NOTE(review): NEXT_CP_CONS_V is defined elsewhere; it
 		 * presumably advances cons and updates v_bit on wrap.
 		 */
 		NEXT_CP_CONS_V(&cpr->ring, cons, v_bit);
 		cmpl = &((struct cmpl_base *)cpr->ring.vaddr)[cons];
 
 		/* Stop at the first entry that is not valid for v_bit. */
 		if (!CMP_VALID(cmpl, v_bit))
 			break;
 
 		type = le16toh(cmpl->type) & CMPL_BASE_TYPE_MASK;
 		switch (type) {
 		case CMPL_BASE_TYPE_HWRM_ASYNC_EVENT:
 			bnxt_handle_async_event(softc, cmpl);
 			break;
 		/* Listed types that this ring handler only logs. */
 		case CMPL_BASE_TYPE_TX_L2:
 		case CMPL_BASE_TYPE_RX_L2:
 		case CMPL_BASE_TYPE_RX_AGG:
 		case CMPL_BASE_TYPE_RX_TPA_START:
 		case CMPL_BASE_TYPE_RX_TPA_END:
 		case CMPL_BASE_TYPE_STAT_EJECT:
 		case CMPL_BASE_TYPE_HWRM_DONE:
 		case CMPL_BASE_TYPE_HWRM_FWD_REQ:
 		case CMPL_BASE_TYPE_HWRM_FWD_RESP:
 		case CMPL_BASE_TYPE_CQ_NOTIFICATION:
 		case CMPL_BASE_TYPE_SRQ_EVENT:
 		case CMPL_BASE_TYPE_DBQ_EVENT:
 		case CMPL_BASE_TYPE_QP_EVENT:
 		case CMPL_BASE_TYPE_FUNC_EVENT:
 			device_printf(softc->dev,
 			    "Unhandled completion type %u\n", type);
 			break;
 		default:
 			device_printf(softc->dev,
 			    "Unknown completion type %u\n", type);
 			break;
 		}
 	}
 
 	/* Save the last consumed position and ring the enable doorbell. */
 	cpr->cons = last_cons;
 	cpr->v_bit = last_v_bit;
 	BNXT_CP_IDX_ENABLE_DB(&cpr->ring, cpr->cons);
 }
 
 /*
  * Return the PHY type recorded in link_info.  When that type is UNKNOWN,
  * deduce one from the media type and, for DAC media, from the supported
  * speeds; UNKNOWN is returned when the media type gives no hint either.
  */
 static uint8_t
 get_phy_type(struct bnxt_softc *softc)
 {
 	struct bnxt_link_info *link = &softc->link_info;
 	uint8_t phy_type = link->phy_type;
 	uint16_t speeds;
 
 	if (phy_type != HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_UNKNOWN)
 		return phy_type;
 
 	/* Deduce the phy type from the media type and supported speeds */
 	speeds = link->support_speeds;
 
 	switch (link->media_type) {
 	case HWRM_PORT_PHY_QCFG_OUTPUT_MEDIA_TYPE_TP:
 		return HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASET;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_MEDIA_TYPE_DAC:
 		if (speeds & HWRM_PORT_PHY_QCFG_OUTPUT_SUPPORT_SPEEDS_2_5GB)
 			return HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKX;
 		if (speeds & HWRM_PORT_PHY_QCFG_OUTPUT_SUPPORT_SPEEDS_20GB)
 			return HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASEKR;
 		return HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASECR;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_MEDIA_TYPE_FIBRE:
 		return HWRM_PORT_PHY_QCFG_OUTPUT_PHY_TYPE_BASESR;
 	default:
 		return phy_type;
 	}
 }
 
 /*
  * Check that the firmware's HWRM interface version (hwrm_if_*) is not
  * older than the minimum the driver accepts (hwrm_min_*).  Versions are
  * compared component by component: major, then minor, then update.
  *
  * Returns true when the interface version is new enough; otherwise logs a
  * warning naming both versions and returns false.
  */
 bool
 bnxt_check_hwrm_version(struct bnxt_softc *softc)
 {
 	char buf[16];
 	bool too_old;
 
 	/* Bounded formatting: buf can never be overrun. */
 	snprintf(buf, sizeof(buf), "%hhu.%hhu.%hhu",
 	    softc->ver_info->hwrm_min_major,
 	    softc->ver_info->hwrm_min_minor,
 	    softc->ver_info->hwrm_min_update);
 
 	/* The first component that differs decides the outcome. */
 	if (softc->ver_info->hwrm_min_major !=
 	    softc->ver_info->hwrm_if_major)
 		too_old = softc->ver_info->hwrm_min_major >
 		    softc->ver_info->hwrm_if_major;
 	else if (softc->ver_info->hwrm_min_minor !=
 	    softc->ver_info->hwrm_if_minor)
 		too_old = softc->ver_info->hwrm_min_minor >
 		    softc->ver_info->hwrm_if_minor;
 	else
 		too_old = softc->ver_info->hwrm_min_update >
 		    softc->ver_info->hwrm_if_update;
 
 	if (too_old) {
 		device_printf(softc->dev,
 		    "WARNING: HWRM version %s is too old (older than %s)\n",
 		    softc->ver_info->hwrm_if_ver, buf);
 		return false;
 	}
 
 	return true;
 }
 
 /*
  * Translate the HWRM link speed in `link' to a baudrate in bits per
  * second.  Speeds that are not listed map to 100 Gbps.
  */
 static uint64_t
 bnxt_get_baudrate(struct bnxt_link_info *link)
 {
 	uint64_t rate;
 
 	switch (link->link_speed) {
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_10MB:
 		rate = IF_Mbps(10);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_100MB:
 		rate = IF_Mbps(100);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_1GB:
 		rate = IF_Gbps(1);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_2GB:
 		rate = IF_Gbps(2);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_2_5GB:
 		rate = IF_Mbps(2500);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_10GB:
 		rate = IF_Gbps(10);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_20GB:
 		rate = IF_Gbps(20);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_25GB:
 		rate = IF_Gbps(25);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_40GB:
 		rate = IF_Gbps(40);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_50GB:
 		rate = IF_Gbps(50);
 		break;
 	case HWRM_PORT_PHY_QCFG_OUTPUT_LINK_SPEED_100GB:
 		rate = IF_Gbps(100);
 		break;
 	default:
 		rate = IF_Gbps(100);
 		break;
 	}
 
 	return rate;
 }
Index: stable/11/sys/kern/kern_cpuset.c
===================================================================
--- stable/11/sys/kern/kern_cpuset.c	(revision 333337)
+++ stable/11/sys/kern/kern_cpuset.c	(revision 333338)
@@ -1,1342 +1,1350 @@
 /*-
  * Copyright (c) 2008,  Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  * 
  * Copyright (c) 2008 Nokia Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/capsicum.h>
 #include <sys/cpuset.h>
 #include <sys/sx.h>
 #include <sys/queue.h>
 #include <sys/libkern.h>
 #include <sys/limits.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_param.h>
 #include <vm/vm_phys.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif /* DDB */
 
 /*
  * cpusets provide a mechanism for creating and manipulating sets of
  * processors for the purpose of constraining the scheduling of threads to
  * specific processors.
  *
  * Each process belongs to an identified set, by default this is set 1.  Each
  * thread may further restrict the cpus it may run on to a subset of this
  * named set.  This creates an anonymous set which other threads and processes
  * may not join by number.
  *
  * The named set is referred to herein as the 'base' set to avoid ambiguity.
  * This set is usually a child of a 'root' set while the anonymous set may
  * simply be referred to as a mask.  In the syscall api these are referred to
  * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
  *
  * Threads inherit their set from their creator whether it be anonymous or
  * not.  This means that anonymous sets are immutable because they may be
  * shared.  To modify an anonymous set a new set is created with the desired
  * mask and the same parent as the existing anonymous set.  This gives the
  * illusion of each thread having a private mask.
  *
  * Via the syscall apis a user may ask to retrieve or modify the root, base,
  * or mask that is discovered via a pid, tid, or setid.  Modifying a set
  * modifies all numbered and anonymous child sets to comply with the new mask.
  * Modifying a pid or tid's mask applies only to that tid but must still
  * exist within the assigned parent set.
  *
  * A thread may not be assigned to a group separate from other threads in
  * the process.  This is to remove ambiguity when the setid is queried with
  * a pid argument.  There is no other technical limitation.
  *
  * This somewhat complex arrangement is intended to make it easy for
  * applications to query available processors and bind their threads to
  * specific processors while also allowing administrators to dynamically
  * reprovision by changing sets which apply to groups of processes.
  *
  * A simple application should not concern itself with sets at all and
  * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
  * meaning 'curthread'.  It may query available cpus for that tid with a
  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
  */
 /* UMA zone backing every struct cpuset; created in cpuset_thread0(). */
 static uma_zone_t cpuset_zone;
 /* Spin mutex held while the set lists and parent/child linkage change. */
 static struct mtx cpuset_lock;
 /* List of numbered (non-anonymous) sets, chained through cs_link. */
 static struct setlist cpuset_ids;
 /* Unit allocator for set ids; ids 0 and 1 are assigned by cpuset_thread0(). */
 static struct unrhdr *cpuset_unr;
 /* Set 0 (root of the tree) and set 1 (the default set). */
 static struct cpuset *cpuset_zero, *cpuset_default;
 
 /* Return the size of cpuset_t at the kernel level */
 SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD,
     SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)");
 
 /* Points at the mask of cpuset_zero once cpuset_thread0() has run. */
 cpuset_t *cpuset_root;
 /* Per memory domain cpu masks, indexed by domain id. */
 cpuset_t cpuset_domain[MAXMEMDOM];
 
 /*
  * Acquire a reference to a cpuset, all pointers must be tracked with refs.
  *
  * Returns the pointer that was passed in so the call can be used directly
  * in an assignment, e.g. "x->cs_parent = cpuset_ref(parent)".
  */
 struct cpuset *
 cpuset_ref(struct cpuset *set)
 {
 
 	refcount_acquire(&set->cs_ref);
 	return (set);
 }
 
 /*
  * Climb the parent chain starting at 'set' until either a set flagged
  * CPU_SET_ROOT or a set without a parent is reached.  That set is returned
  * with a new reference.
  */
 static struct cpuset *
 cpuset_refroot(struct cpuset *set)
 {
 	struct cpuset *cur;
 
 	cur = set;
 	while (cur->cs_parent != NULL && (cur->cs_flags & CPU_SET_ROOT) == 0)
 		cur = cur->cs_parent;
 
 	return (cpuset_ref(cur));
 }
 
 /*
  * Return the numbered set that 'set' belongs to, with a new reference.
  * An anonymous set (id CPUSET_INVALID) resolves to its parent; a numbered
  * set resolves to itself.
  */
 static struct cpuset *
 cpuset_refbase(struct cpuset *set)
 {
 	struct cpuset *base;
 
 	base = (set->cs_id == CPUSET_INVALID) ? set->cs_parent : set;
 
 	return (cpuset_ref(base));
 }
 
 /*
  * Drop one reference on 'set' in a context where it is safe to allocate.
  * When the last reference goes away the set is unlinked from its siblings
  * and from the id list, the reference it held on its parent is dropped,
  * and its memory and id are returned.
  */
 void
 cpuset_rel(struct cpuset *set)
 {
 	struct cpuset *parent;
 	cpusetid_t setid;
 
 	if (!refcount_release(&set->cs_ref))
 		return;
 
 	/* Last reference: detach the set while holding the cpuset lock. */
 	mtx_lock_spin(&cpuset_lock);
 	LIST_REMOVE(set, cs_siblings);
 	setid = set->cs_id;
 	if (setid != CPUSET_INVALID)
 		LIST_REMOVE(set, cs_link);
 	mtx_unlock_spin(&cpuset_lock);
 
 	/* The parent is released before the child's memory is returned. */
 	parent = set->cs_parent;
 	cpuset_rel(parent);
 	uma_zfree(cpuset_zone, set);
 	if (setid != CPUSET_INVALID)
 		free_unr(cpuset_unr, setid);
 }
 
 /*
  * Deferred flavour of cpuset_rel() for contexts that are not safe to
  * allocate/free.  If this was the last reference the set is unlinked and
  * parked on 'head'; cpuset_rel_complete() finishes the job later.
  */
 static void
 cpuset_rel_defer(struct setlist *head, struct cpuset *set)
 {
 
 	if (refcount_release(&set->cs_ref) != 0) {
 		mtx_lock_spin(&cpuset_lock);
 		LIST_REMOVE(set, cs_siblings);
 		if (set->cs_id != CPUSET_INVALID)
 			LIST_REMOVE(set, cs_link);
 		/* cs_link is free now and is reused to chain onto 'head'. */
 		LIST_INSERT_HEAD(head, set, cs_link);
 		mtx_unlock_spin(&cpuset_lock);
 	}
 }
 
 /*
  * Finish a release that was started by cpuset_rel_defer(): take the set
  * off the deferral list, drop its reference on the parent and return its
  * memory to the zone.
  */
 static void
 cpuset_rel_complete(struct cpuset *set)
 {
 	struct cpuset *parent;
 
 	parent = set->cs_parent;
 	LIST_REMOVE(set, cs_link);
 	cpuset_rel(parent);
 	uma_zfree(cpuset_zone, set);
 }
 
 /*
  * Look up a numbered set by id and return it with a new reference, or
  * NULL when no such set exists.  For a jailed 'td' the set is only
  * returned if the jail's cpuset is the set itself or one of its ancestors.
  */
 static struct cpuset *
 cpuset_lookup(cpusetid_t setid, struct thread *td)
 {
 	struct cpuset *jset, *set, *tset;
 
 	if (setid == CPUSET_INVALID)
 		return (NULL);
 
 	/* The reference is taken before the list lock is dropped. */
 	mtx_lock_spin(&cpuset_lock);
 	LIST_FOREACH(set, &cpuset_ids, cs_link) {
 		if (set->cs_id == setid) {
 			cpuset_ref(set);
 			break;
 		}
 	}
 	mtx_unlock_spin(&cpuset_lock);
 
 	KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
 	if (set == NULL || !jailed(td->td_ucred))
 		return (set);
 
 	/* Jailed caller: walk up from the set looking for the jail's set. */
 	jset = td->td_ucred->cr_prison->pr_cpuset;
 	tset = set;
 	while (tset != NULL && tset != jset)
 		tset = tset->cs_parent;
 	if (tset != NULL)
 		return (set);
 
 	/* Not below the jail's set; hide it from the caller. */
 	cpuset_rel(set);
 	return (NULL);
 }
 
 /*
  * Initialize the caller-provided storage 'set' as a child of 'parent'
  * with the given mask and id.  The new set starts with one reference and
  * holds a reference on its parent.  Returns EDEADLK when 'mask' shares no
  * cpu with the parent's mask.
  */
 static int
 _cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask,
     cpusetid_t id)
 {
 
 	if (!CPU_OVERLAP(&parent->cs_mask, mask))
 		return (EDEADLK);
 
 	/* Fields that can be set up before the set becomes visible. */
 	set->cs_flags = 0;
 	refcount_init(&set->cs_ref, 1);
 	LIST_INIT(&set->cs_children);
 	CPU_COPY(mask, &set->cs_mask);
 
 	/* Clip to the parent's mask and link the set in under the lock. */
 	mtx_lock_spin(&cpuset_lock);
 	CPU_AND(&set->cs_mask, &parent->cs_mask);
 	set->cs_parent = cpuset_ref(parent);
 	set->cs_id = id;
 	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
 	if (id != CPUSET_INVALID)
 		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
 	mtx_unlock_spin(&cpuset_lock);
 
 	return (0);
 }
 
 /*
  * Create a new non-anonymous set with the requested parent and mask.  May
  * return failures if the mask is invalid or a new number can not be
  * allocated.
  */
 static int
 cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
 {
 	struct cpuset *set;
 	cpusetid_t id;
 	int error;
 
 	id = alloc_unr(cpuset_unr);
 	if (id == -1)
 		return (ENFILE);
 	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
 	error = _cpuset_create(set, parent, mask, id);
 	if (error == 0)
 		return (0);
 	free_unr(cpuset_unr, id);
 	uma_zfree(cpuset_zone, set);
 
 	return (error);
 }
 
 /*
  * Dry run for cpuset_update(): recursively verify that applying 'mask' to
  * the subtree rooted at 'set' is allowed.  Fails with EPERM on a set marked
  * CPU_SET_RDONLY and with EDEADLK on a set that would end up empty.  When
  * 'check_mask' is zero the mask is taken as-is for 'set' itself; children
  * are always checked against the intersection.
  */
 static int
 cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask)
 {
 	struct cpuset *child;
 	cpuset_t newmask;
 	int error;
 
 	mtx_assert(&cpuset_lock, MA_OWNED);
 	if ((set->cs_flags & CPU_SET_RDONLY) != 0)
 		return (EPERM);
 
 	if (!check_mask) {
 		CPU_COPY(mask, &newmask);
 	} else {
 		if (!CPU_OVERLAP(&set->cs_mask, mask))
 			return (EDEADLK);
 		CPU_COPY(&set->cs_mask, &newmask);
 		CPU_AND(&newmask, mask);
 	}
 
 	/* Stop at the first child that reports a problem. */
 	LIST_FOREACH(child, &set->cs_children, cs_siblings) {
 		error = cpuset_testupdate(child, &newmask, 1);
 		if (error != 0)
 			return (error);
 	}
 	return (0);
 }
 
 /*
  * Intersect the mask of 'set' with 'mask' and push the result down through
  * all children.  No validity or permission checks are made here; callers
  * run cpuset_testupdate() first.
  */
 static void
 cpuset_update(struct cpuset *set, cpuset_t *mask)
 {
 	struct cpuset *child;
 
 	mtx_assert(&cpuset_lock, MA_OWNED);
 	CPU_AND(&set->cs_mask, mask);
 	LIST_FOREACH(child, &set->cs_children, cs_siblings) {
 		/* Each child is restricted by its parent's updated mask. */
 		cpuset_update(child, &set->cs_mask);
 	}
 }
 
 /*
  * Replace the mask of 'set' with a copy of 'mask' and restrict every set
  * below it accordingly.  The change is validated first and nothing is
  * modified when validation fails.
  *
  * Fails with the error from priv_check(), EPERM for a jailed caller trying
  * to change a root set, EINVAL when the mask is not a subset of the
  * parent's mask, or the error from cpuset_testupdate().
  */
 static int
 cpuset_modify(struct cpuset *set, cpuset_t *mask)
 {
 	struct cpuset *parent;
 	int error;
 
 	error = priv_check(curthread, PRIV_SCHED_CPUSET);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * From inside a jail the jail's dedicated root set is off limits,
 	 * while its child sets may still be changed.
 	 */
 	if (jailed(curthread->td_ucred) &&
 	    (set->cs_flags & CPU_SET_ROOT) != 0)
 		return (EPERM);
 
 	/* The new mask has to stay within what the parent offers. */
 	parent = set->cs_parent;
 	if (parent != NULL && !CPU_SUBSET(&parent->cs_mask, mask))
 		return (EINVAL);
 
 	mtx_lock_spin(&cpuset_lock);
 	error = cpuset_testupdate(set, mask, 0);
 	if (error == 0) {
 		CPU_COPY(mask, &set->cs_mask);
 		cpuset_update(set, mask);
 	}
 	mtx_unlock_spin(&cpuset_lock);
 
 	return (error);
 }
 
 /*
  * Resolve the 'which' parameter of several cpuset apis.
  *
  * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
  * checks for permission via p_cansched().
  *
  * For WHICH_SET returns a valid set with a new reference.
  *
  * -1 may be supplied for any argument to mean the current proc/thread or
  * the base set of the current thread.  May fail with ESRCH/EPERM.
  */
 int
 cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
     struct cpuset **setp)
 {
 	struct cpuset *set;
 	struct thread *td;
 	struct proc *p;
 	int error;
 
 	/* All three outputs start out NULL; each case fills in what applies. */
 	*pp = p = NULL;
 	*tdp = td = NULL;
 	*setp = set = NULL;
 	switch (which) {
 	case CPU_WHICH_PID:
 		if (id == -1) {
 			PROC_LOCK(curproc);
 			p = curproc;
 			break;
 		}
 		/* presumably pfind() returns the proc locked -- TODO confirm */
 		if ((p = pfind(id)) == NULL)
 			return (ESRCH);
 		break;
 	case CPU_WHICH_TID:
 		if (id == -1) {
 			PROC_LOCK(curproc);
 			p = curproc;
 			td = curthread;
 			break;
 		}
 		/* presumably tdfind() returns with td's proc locked -- TODO confirm */
 		td = tdfind(id, -1);
 		if (td == NULL)
 			return (ESRCH);
 		p = td->td_proc;
 		break;
 	case CPU_WHICH_CPUSET:
 		/* Set lookups return directly; no proc is locked on this path. */
 		if (id == -1) {
 			thread_lock(curthread);
 			set = cpuset_refbase(curthread->td_cpuset);
 			thread_unlock(curthread);
 		} else
 			set = cpuset_lookup(id, curthread);
 		if (set) {
 			*setp = set;
 			return (0);
 		}
 		return (ESRCH);
 	case CPU_WHICH_JAIL:
 	{
 		/* Find `set' for prison with given id. */
 		struct prison *pr;
 
 		sx_slock(&allprison_lock);
 		pr = prison_find_child(curthread->td_ucred->cr_prison, id);
 		sx_sunlock(&allprison_lock);
 		if (pr == NULL)
 			return (ESRCH);
 		cpuset_ref(pr->pr_cpuset);
 		*setp = pr->pr_cpuset;
 		/*
 		 * pr_mtx is released here without being taken in this
 		 * function; presumably prison_find_child() returns the
 		 * prison locked -- TODO confirm.
 		 */
 		mtx_unlock(&pr->pr_mtx);
 		return (0);
 	}
 	case CPU_WHICH_IRQ:
 	case CPU_WHICH_DOMAIN:
 		/* Nothing to resolve: all outputs stay NULL, nothing is locked. */
 		return (0);
 	default:
 		/*
 		 * NOTE(review): CPU_WHICH_INTRHANDLER and CPU_WHICH_ITHREAD
 		 * are not listed above, so they end up here and get EINVAL.
 		 * kern_cpuset_getaffinity() calls this function before it
 		 * dispatches on 'which' -- confirm that is intended.
 		 */
 		return (EINVAL);
 	}
 	/* Only the PID and TID cases reach this point, with 'p' locked. */
 	error = p_cansched(curthread, p);
 	if (error) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	/* A pid lookup reports the first thread of the process. */
 	if (td == NULL)
 		td = FIRST_THREAD_IN_PROC(p);
 	*pp = p;
 	*tdp = td;
 	return (0);
 }
 
 /*
  * Build an anonymous set with 'mask' in the storage supplied as 'fset'.
  * The new set hangs below 'set' when 'set' is numbered, and below the
  * parent of 'set' when 'set' is itself anonymous.  Fails with EDEADLK
  * unless 'mask' is a subset of the chosen parent's mask.
  */
 static int
 cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask)
 {
 	struct cpuset *parent;
 
 	parent = (set->cs_id == CPUSET_INVALID) ? set->cs_parent : set;
 	if (CPU_SUBSET(&parent->cs_mask, mask))
 		return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
 
 	return (EDEADLK);
 }
 
 /*
  * Handle two cases for replacing the base set or mask of an entire process.
  *
  * 1) Set is non-null and mask is null.  This reparents all anonymous sets
  *    to the provided set and replaces all non-anonymous td_cpusets with the
  *    provided set.
  * 2) Mask is non-null and set is null.  This replaces or creates anonymous
  *    sets for every thread with the existing base as a parent.
  *
  * This is overly complicated because we can't allocate while holding a 
  * spinlock and spinlocks must be held while changing and examining thread
  * state.
  */
 static int
 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
 {
 	struct setlist freelist;	/* preallocated, not yet used sets */
 	struct setlist droplist;	/* old sets awaiting final release */
 	struct cpuset *tdset;
 	struct cpuset *nset;
 	struct thread *td;
 	struct proc *p;
 	int threads;
 	int nfree;
 	int error;
 
 	/*
 	 * The algorithm requires two passes due to locking considerations.
 	 *
 	 * 1) Lookup the process and acquire the locks in the required order.
 	 * 2) If enough cpusets have not been allocated release the locks and
 	 *    allocate them.  Loop.
 	 */
 	LIST_INIT(&freelist);
 	LIST_INIT(&droplist);
 	nfree = 0;
 	for (;;) {
 		/* On success the proc comes back locked. */
 		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
 		if (error)
 			goto out;
 		/* Enough spare sets for every thread: leave with the lock held. */
 		if (nfree >= p->p_numthreads)
 			break;
 		threads = p->p_numthreads;
 		/* M_WAITOK allocations are done with the proc lock dropped. */
 		PROC_UNLOCK(p);
 		for (; nfree < threads; nfree++) {
 			nset = uma_zalloc(cpuset_zone, M_WAITOK);
 			LIST_INSERT_HEAD(&freelist, nset, cs_link);
 		}
 	}
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * Now that the appropriate locks are held and we have enough cpusets,
 	 * make sure the operation will succeed before applying changes.  The
 	 * proc lock prevents td_cpuset from changing between calls.
 	 */
 	error = 0;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		tdset = td->td_cpuset;
 		/*
 		 * Verify that a new mask doesn't specify cpus outside of
 		 * the set the thread is a member of.
 		 */
 		if (mask) {
 			if (tdset->cs_id == CPUSET_INVALID)
 				tdset = tdset->cs_parent;
 			if (!CPU_SUBSET(&tdset->cs_mask, mask))
 				error = EDEADLK;
 		/*
 		 * Verify that a new set won't leave an existing thread
 		 * mask without a cpu to run on.  It can, however, restrict
 		 * the set.
 		 */
 		} else if (tdset->cs_id == CPUSET_INVALID) {
 			if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
 				error = EDEADLK;
 		}
 		thread_unlock(td);
 		if (error)
 			goto unlock_out;
 	}
 	/*
 	 * Replace each thread's cpuset while using deferred release.  We
 	 * must do this because the thread lock must be held while operating
 	 * on the thread and this limits the type of operations allowed.
 	 */
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		/*
 		 * If we presently have an anonymous set or are applying a
 		 * mask we must create an anonymous shadow set.  That is
 		 * either parented to our existing base or the supplied set.
 		 *
 		 * If we have a base set with no anonymous shadow we simply
 		 * replace it outright.
 		 */
 		tdset = td->td_cpuset;
 		if (tdset->cs_id == CPUSET_INVALID || mask) {
 			/* Take one of the sets preallocated by the loop above. */
 			nset = LIST_FIRST(&freelist);
 			LIST_REMOVE(nset, cs_link);
 			if (mask)
 				error = cpuset_shadow(tdset, nset, mask);
 			else
 				error = _cpuset_create(nset, set,
 				    &tdset->cs_mask, CPUSET_INVALID);
 			if (error) {
 				/*
 				 * Hand the unused set back and stop; threads
 				 * handled on earlier iterations keep their
 				 * new sets.
 				 */
 				LIST_INSERT_HEAD(&freelist, nset, cs_link);
 				thread_unlock(td);
 				break;
 			}
 		} else
 			nset = cpuset_ref(set);
 		/* The old set is only queued here; it is freed after unlocking. */
 		cpuset_rel_defer(&droplist, tdset);
 		td->td_cpuset = nset;
 		sched_affinity(td);
 		thread_unlock(td);
 	}
 unlock_out:
 	PROC_UNLOCK(p);
 out:
 	/* With every lock dropped, finish the releases and free the spares. */
 	while ((nset = LIST_FIRST(&droplist)) != NULL)
 		cpuset_rel_complete(nset);
 	while ((nset = LIST_FIRST(&freelist)) != NULL) {
 		LIST_REMOVE(nset, cs_link);
 		uma_zfree(cpuset_zone, nset);
 	}
 	return (error);
 }
 
 /*
  * Format 'set' into 'buf' as its words in hexadecimal, separated by
  * commas, starting with word 0.  The caller has to supply a buffer of at
  * least CPUSETBUFSIZ bytes.  Returns 'buf'.
  */
 char *
 cpusetobj_strprint(char *buf, const cpuset_t *set)
 {
 	size_t off, word;
 
 	/* 'off' tracks how much of the buffer has been written so far. */
 	off = 0;
 	for (word = 0; word < (_NCPUWORDS - 1); word++)
 		off += snprintf(buf + off, CPUSETBUFSIZ - off, "%lx,",
 		    set->__bits[word]);
 
 	/* The last word is written without a trailing comma. */
 	snprintf(buf + off, CPUSETBUFSIZ - off, "%lx",
 	    set->__bits[_NCPUWORDS - 1]);
 	return (buf);
 }
 
 /*
  * Parse the comma separated hexadecimal words in 'buf' into 'set',
  * starting with word 0.  Strings longer than CPUSETBUFSIZ - 1 characters
  * or with more than _NCPUWORDS fields are rejected.  Words not present in
  * the string are left zero.  Returns 0 on success and -1 on failure.
  */
 int
 cpusetobj_strscan(cpuset_t *set, const char *buf)
 {
 	const char *cp;
 	u_int nwords, word;
 	int ret;
 
 	if (strlen(buf) > CPUSETBUFSIZ - 1)
 		return (-1);
 
 	/* A shorter mask is accepted: one word per comma separated field. */
 	nwords = 1;
 	for (cp = buf; *cp != '\0'; cp++) {
 		if (*cp == ',')
 			nwords++;
 	}
 	if (nwords > _NCPUWORDS)
 		return (-1);
 
 	CPU_ZERO(set);
 
 	/* Every word but the last is followed by a comma. */
 	for (word = 0; word + 1 < nwords; word++) {
 		ret = sscanf(buf, "%lx,", &set->__bits[word]);
 		if (ret == 0 || ret == -1)
 			return (-1);
 		buf = strstr(buf, ",");
 		if (buf == NULL)
 			return (-1);
 		buf++;
 	}
 	ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]);
 	if (ret == 0 || ret == -1)
 		return (-1);
 	return (0);
 }
 
 /*
  * Give the thread 'id' an anonymous set carrying 'mask'.  Returns the
  * error from cpuset_which() or cpuset_shadow(); in that case the thread's
  * set is left as it was.
  */
 int
 cpuset_setthread(lwpid_t id, cpuset_t *mask)
 {
 	struct cpuset *newset, *oldset;
 	struct thread *td;
 	struct proc *p;
 	int error;
 
 	/* Allocate before any lock is taken. */
 	newset = uma_zalloc(cpuset_zone, M_WAITOK);
 	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &oldset);
 	if (error != 0) {
 		uma_zfree(cpuset_zone, newset);
 		return (error);
 	}
 
 	oldset = NULL;
 	thread_lock(td);
 	error = cpuset_shadow(td->td_cpuset, newset, mask);
 	if (error == 0) {
 		/* Swap in the new set; the old one is released below. */
 		oldset = td->td_cpuset;
 		td->td_cpuset = newset;
 		sched_affinity(td);
 	}
 	thread_unlock(td);
 	PROC_UNLOCK(p);
 
 	if (oldset != NULL)
 		cpuset_rel(oldset);
 	if (error != 0)
 		uma_zfree(cpuset_zone, newset);
 	return (error);
 }
 
 /*
  * Apply new cpumask to the ithread.
  *
  * 'cpu' is either the single cpu the thread 'id' is to be bound to, or
  * NOCPU to put the thread back on an anonymous set below the default set.
  *
  * Returns 0 on success, the error from cpuset_which(), ENFILE when no
  * cpuset id can be allocated, or the error from creating the new set.
  */
 int
 cpuset_setithread(lwpid_t id, int cpu)
 {
 	struct cpuset *nset, *rset;
 	struct cpuset *parent, *old_set;
 	struct thread *td;
 	struct proc *p;
 	cpusetid_t cs_id;
 	cpuset_t mask;
 	int error;
 
 	/* Both sets are allocated up front, before any lock is taken. */
 	nset = uma_zalloc(cpuset_zone, M_WAITOK);
 	rset = uma_zalloc(cpuset_zone, M_WAITOK);
 	cs_id = CPUSET_INVALID;
 
 	CPU_ZERO(&mask);
 	if (cpu == NOCPU)
 		CPU_COPY(cpuset_root, &mask);
 	else
 		CPU_SET(cpu, &mask);
 
 	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &old_set);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * cpuset_which() returns with PROC_LOCK held, so every exit from
 	 * here on has to drop it.  A failed id allocation must also be
 	 * reported as an error; ENFILE matches cpuset_create().
 	 */
 	cs_id = alloc_unr(cpuset_unr);
 	if (cs_id == CPUSET_INVALID) {
 		PROC_UNLOCK(p);
 		error = ENFILE;
 		goto out;
 	}
 
 	old_set = td->td_cpuset;
 
 	if (cpu == NOCPU) {
 
 		/*
 		 * roll back to default set. We're not using cpuset_shadow()
 		 * here because we can fail CPU_SUBSET() check. This can happen
 		 * if default set does not contain all CPUs.
 		 */
 		error = _cpuset_create(nset, cpuset_default, &mask,
 		    CPUSET_INVALID);
 
 		goto applyset;
 	}
 
 	if (old_set->cs_id == 1 || (old_set->cs_id == CPUSET_INVALID &&
 	    old_set->cs_parent->cs_id == 1)) {
 
 		/*
 		 * Current set is either default (1) or
 		 * shadowed version of default set.
 		 *
 		 * Allocate new root set to be able to shadow it
 		 * with any mask.
 		 */
 		error = _cpuset_create(rset, cpuset_zero,
 		    &cpuset_zero->cs_mask, cs_id);
 		if (error != 0) {
 			PROC_UNLOCK(p);
 			goto out;
 		}
 		rset->cs_flags |= CPU_SET_ROOT;
 		parent = rset;
 		/* rset and cs_id are in use now; keep 'out' from freeing them. */
 		rset = NULL;
 		cs_id = CPUSET_INVALID;
 	} else {
 		/* Assume existing set was already allocated by previous call */
 		parent = old_set;
 		old_set = NULL;
 	}
 
 	error = cpuset_shadow(parent, nset, &mask);
 applyset:
 	if (error == 0) {
 		thread_lock(td);
 		td->td_cpuset = nset;
 		sched_affinity(td);
 		thread_unlock(td);
 		nset = NULL;
 	} else
 		old_set = NULL;
 	PROC_UNLOCK(p);
 	if (old_set != NULL)
 		cpuset_rel(old_set);
 out:
 	/* Give back whatever was allocated but not consumed. */
 	if (nset != NULL)
 		uma_zfree(cpuset_zone, nset);
 	if (rset != NULL)
 		uma_zfree(cpuset_zone, rset);
 	if (cs_id != CPUSET_INVALID)
 		free_unr(cpuset_unr, cs_id);
 	return (error);
 }
 
 
 /*
  * Creates system-wide cpusets and the cpuset for thread0 including two
  * sets:
  * 
  * 0 - The root set which should represent all valid processors in the
  *     system.  It is initially created with a mask of all processors
  *     because we don't know what processors are valid until cpuset_init()
  *     runs.  This set is immutable.
  * 1 - The default set which all processes are a member of until changed.
  *     This allows an administrator to move all threads off of given cpus to
  *     dedicate them to high priority tasks or save power etc.
  */
 struct cpuset *
 cpuset_thread0(void)
 {
 	struct cpuset *set;
 	int error, i;
 
 	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
 
 	/*
 	 * Create the root system set for the whole machine.  Doesn't use
 	 * cpuset_create() due to NULL parent.
 	 */
 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 	CPU_FILL(&set->cs_mask);
 	LIST_INIT(&set->cs_children);
 	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
 	set->cs_ref = 1;
 	set->cs_flags = CPU_SET_ROOT;
 	cpuset_zero = set;
 	cpuset_root = &set->cs_mask;
 
 	/*
 	 * Now derive a default, modifiable set from that to give out.
 	 */
 	set = uma_zalloc(cpuset_zone, M_WAITOK);
 	error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
 	KASSERT(error == 0, ("Error creating default set: %d\n", error));
 	cpuset_default = set;
 
 	/*
 	 * Initialize the unit allocator. 0 and 1 are allocated above.
 	 */
 	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
 
 	/*
 	 * If MD code has not initialized per-domain cpusets, place all
 	 * CPUs in domain 0.
 	 */
 	for (i = 0; i < MAXMEMDOM; i++)
 		if (!CPU_EMPTY(&cpuset_domain[i]))
 			goto domains_set;
 	CPU_COPY(&all_cpus, &cpuset_domain[0]);
 domains_set:
 
 	return (set);
 }
 
 /*
  * Create a cpuset, which would be cpuset_create() but
  * mark the new 'set' as root.
  *
  * We are not going to reparent the td to it.  Use cpuset_setproc_update_set()
  * for that.
  *
  * In case of no error, returns the set in *setp locked with a reference.
  */
 int
 cpuset_create_root(struct prison *pr, struct cpuset **setp)
 {
 	struct cpuset *set;
 	int error;
 
 	KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
 	KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));
 
 	error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
 	if (error)
 		return (error);
 
 	KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
 	    __func__, __LINE__));
 
 	/* Mark the set as root. */
 	set = *setp;
 	set->cs_flags |= CPU_SET_ROOT;
 
 	return (0);
 }
 
 /*
  * Move process 'p' onto the set 'set' via cpuset_setproc().
  *
  * A temporary reference keeps 'set' alive for the duration of the call.
  * It is dropped on every path: the threads that were moved hold their own
  * references, and on failure nothing else owns the temporary one, so
  * keeping it would leak the set.
  *
  * Returns 0 on success or the error from cpuset_setproc().
  */
 int
 cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
 {
 	int error;
 
 	KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
 	KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
 
 	cpuset_ref(set);
 	error = cpuset_setproc(p->p_pid, set, NULL);
 	cpuset_rel(set);
 	return (error);
 }
 
 /*
  * This is called once the final set of system cpus is known.  Modifies
  * the root set and all children and mark the root read-only.  
  */
 static void
 cpuset_init(void *arg)
 {
 	cpuset_t mask;
 
 	mask = all_cpus;
 	if (cpuset_modify(cpuset_zero, &mask))
 		panic("Can't set initial cpuset mask.\n");
 	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
 }
 SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_args {
 	cpusetid_t	*setid;
 };
 #endif
 int
 sys_cpuset(struct thread *td, struct cpuset_args *uap)
 {
 	struct cpuset *root;
 	struct cpuset *set;
 	int error;
 
 	thread_lock(td);
 	root = cpuset_refroot(td->td_cpuset);
 	thread_unlock(td);
 	error = cpuset_create(&set, root, &root->cs_mask);
 	cpuset_rel(root);
 	if (error)
 		return (error);
 	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
 	if (error == 0)
 		error = cpuset_setproc(-1, set, NULL);
 	cpuset_rel(set);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_setid_args {
 	cpuwhich_t	which;
 	id_t		id;
 	cpusetid_t	setid;
 };
 #endif
 int
 sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
 {
 
 	return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid));
 }
 
 /*
  * Move the process 'id' onto the numbered set 'setid'.  Only
  * CPU_WHICH_PID is accepted for 'which'; anything else yields EINVAL.
  * Returns ESRCH when the set cannot be found, otherwise the result of
  * cpuset_setproc().
  */
 int
 kern_cpuset_setid(struct thread *td, cpuwhich_t which,
     id_t id, cpusetid_t setid)
 {
 	struct cpuset *target;
 	int error;
 
 	/* Presently we only support per-process sets. */
 	if (which != CPU_WHICH_PID)
 		return (EINVAL);
 
 	target = cpuset_lookup(setid, td);
 	if (target == NULL)
 		return (ESRCH);
 
 	error = cpuset_setproc(id, target, NULL);
 	/* The lookup reference is no longer needed either way. */
 	cpuset_rel(target);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_getid_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	cpusetid_t	*setid;
 };
 #endif
 int
 sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
 {
 
 	return (kern_cpuset_getid(td, uap->level, uap->which, uap->id,
 	    uap->setid));
 }
 
 /*
  * Copy out the id of the set selected by (level, which, id).
  *
  * CPU_LEVEL_WHICH is only valid together with CPU_WHICH_CPUSET, and
  * CPU_WHICH_IRQ / CPU_WHICH_DOMAIN have no set id; all of those yield
  * EINVAL.  Otherwise returns the error from cpuset_which() or copyout().
  */
 int
 kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, cpusetid_t *setid)
 {
 	struct cpuset *rootset;
 	struct cpuset *set;
 	struct thread *ttd;
 	struct proc *p;
 	cpusetid_t tmpid;
 	int error;
 
 	if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET)
 		return (EINVAL);
 	error = cpuset_which(which, id, &p, &ttd, &set);
 	if (error != 0)
 		return (error);
 
 	if (which == CPU_WHICH_TID || which == CPU_WHICH_PID) {
 		/* Resolve the thread to its numbered set, then drop the proc. */
 		thread_lock(ttd);
 		set = cpuset_refbase(ttd->td_cpuset);
 		thread_unlock(ttd);
 		PROC_UNLOCK(p);
 	} else if (which == CPU_WHICH_IRQ || which == CPU_WHICH_DOMAIN)
 		return (EINVAL);
 
 	/* For the root level, trade the reference for one on the root. */
 	if (level == CPU_LEVEL_ROOT) {
 		rootset = cpuset_refroot(set);
 		cpuset_rel(set);
 		set = rootset;
 	}
 
 	/* Read the id while the reference is still held. */
 	tmpid = set->cs_id;
 	cpuset_rel(set);
 
 	return (copyout(&tmpid, setid, sizeof(tmpid)));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_getaffinity_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	size_t		cpusetsize;
 	cpuset_t	*mask;
 };
 #endif
 int
 sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
 {
 
 	return (kern_cpuset_getaffinity(td, uap->level, uap->which,
 	    uap->id, uap->cpusetsize, uap->mask));
 }
 
 /*
  * Copy out the cpu mask selected by (level, which, id) to 'maskp'.
  *
  * 'cpusetsize' is the size of the user buffer; it must be at least
  * sizeof(cpuset_t) and at most CPU_MAXSIZE / NBBY, else ERANGE.  Bytes past
  * sizeof(cpuset_t) are copied out as zero because the temporary buffer is
  * allocated with M_ZERO.
  *
  * In capability mode only (CPU_LEVEL_WHICH, TID or PID, id -1) is allowed;
  * anything else yields ECAPMODE.
  */
 int
 kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, size_t cpusetsize, cpuset_t *maskp)
 {
 	struct thread *ttd;
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct proc *p;
 	cpuset_t *mask;
 	int error;
 	size_t size;
 
 	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
 		return (ERANGE);
 	/* In Capability mode, you can only get your own CPU set. */
 	if (IN_CAPABILITY_MODE(td)) {
 	    if (level != CPU_LEVEL_WHICH)
 		return (ECAPMODE);
 	    if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
 		return (ECAPMODE);
 	    if (id != -1)
 		return (ECAPMODE);
 	}
 	size = cpusetsize;
 	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
 	/*
 	 * NOTE(review): cpuset_which() is called for every 'which' value and
 	 * its default case returns EINVAL.  Unless it lists
 	 * CPU_WHICH_INTRHANDLER and CPU_WHICH_ITHREAD, the intr_getaffinity()
 	 * call below is unreachable for those two -- confirm.
 	 */
 	error = cpuset_which(which, id, &p, &ttd, &set);
 	if (error)
 		goto out;
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 	case CPU_LEVEL_CPUSET:
 		switch (which) {
 		case CPU_WHICH_TID:
 		case CPU_WHICH_PID:
 			thread_lock(ttd);
 			set = cpuset_ref(ttd->td_cpuset);
 			thread_unlock(ttd);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			break;
 		case CPU_WHICH_IRQ:
+		case CPU_WHICH_INTRHANDLER:
+		case CPU_WHICH_ITHREAD:
 		case CPU_WHICH_DOMAIN:
 			/* Interrupts and domains have no root or base set. */
 			error = EINVAL;
 			goto out;
 		}
 		if (level == CPU_LEVEL_ROOT)
 			nset = cpuset_refroot(set);
 		else
 			nset = cpuset_refbase(set);
 		CPU_COPY(&nset->cs_mask, mask);
 		cpuset_rel(nset);
 		break;
 	case CPU_LEVEL_WHICH:
 		switch (which) {
 		case CPU_WHICH_TID:
 			thread_lock(ttd);
 			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
 			thread_unlock(ttd);
 			break;
 		case CPU_WHICH_PID:
 			/* A process reports the union of its threads' masks. */
 			FOREACH_THREAD_IN_PROC(p, ttd) {
 				thread_lock(ttd);
 				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
 				thread_unlock(ttd);
 			}
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			CPU_COPY(&set->cs_mask, mask);
 			break;
 		case CPU_WHICH_IRQ:
-			error = intr_getaffinity(id, mask);
+		case CPU_WHICH_INTRHANDLER:
+		case CPU_WHICH_ITHREAD:
+			error = intr_getaffinity(id, which, mask);
 			break;
 		case CPU_WHICH_DOMAIN:
 			if (id < 0 || id >= MAXMEMDOM)
 				error = ESRCH;
 			else
 				CPU_COPY(&cpuset_domain[id], mask);
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	/* Drop whatever cpuset_which() or the code above left referenced/locked. */
 	if (set)
 		cpuset_rel(set);
 	if (p)
 		PROC_UNLOCK(p);
 	if (error == 0)
 		error = copyout(mask, maskp, size);
 out:
 	free(mask, M_TEMP);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_setaffinity_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	size_t		cpusetsize;
 	const cpuset_t	*mask;
 };
 #endif
 int
 sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
 {
 
 	return (kern_cpuset_setaffinity(td, uap->level, uap->which,
 	    uap->id, uap->cpusetsize, uap->mask));
 }
 
 /*
  * Apply the user supplied mask 'maskp' to the object selected by
  * (level, which, id).
  *
  * 'cpusetsize' is the size of the user buffer; it must be at least
  * sizeof(cpuset_t) and at most CPU_MAXSIZE / NBBY, else ERANGE.  Any byte
  * past sizeof(cpuset_t) has to be zero, else EINVAL.
  *
  * In capability mode only (CPU_LEVEL_WHICH, TID or PID, id -1) is allowed;
  * anything else yields ECAPMODE.
  */
 int
 kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, size_t cpusetsize, const cpuset_t *maskp)
 {
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct thread *ttd;
 	struct proc *p;
 	cpuset_t *mask;
 	int error;
 
 	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
 		return (ERANGE);
 	/* In Capability mode, you can only set your own CPU set. */
 	if (IN_CAPABILITY_MODE(td)) {
 	    if (level != CPU_LEVEL_WHICH)
 		return (ECAPMODE);
 	    if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
 		return (ECAPMODE);
 	    if (id != -1)
 		return (ECAPMODE);
 	}
 	mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
 	error = copyin(maskp, mask, cpusetsize);
 	if (error)
 		goto out;
 	/*
 	 * Verify that no high bits are set.
 	 */
 	if (cpusetsize > sizeof(cpuset_t)) {
 		char *end;
 		char *cp;
 
 		/* Scan the bytes between sizeof(cpuset_t) and cpusetsize. */
 		end = cp = (char *)&mask->__bits;
 		end += cpusetsize;
 		cp += sizeof(cpuset_t);
 		while (cp != end)
 			if (*cp++ != 0) {
 				error = EINVAL;
 				goto out;
 			}
 
 	}
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 	case CPU_LEVEL_CPUSET:
 		error = cpuset_which(which, id, &p, &ttd, &set);
 		if (error)
 			break;
 		switch (which) {
 		case CPU_WHICH_TID:
 		case CPU_WHICH_PID:
 			thread_lock(ttd);
 			set = cpuset_ref(ttd->td_cpuset);
 			thread_unlock(ttd);
 			PROC_UNLOCK(p);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			break;
 		case CPU_WHICH_IRQ:
+		case CPU_WHICH_INTRHANDLER:
+		case CPU_WHICH_ITHREAD:
 		case CPU_WHICH_DOMAIN:
 			/* Interrupts and domains have no root or base set. */
 			error = EINVAL;
 			goto out;
 		}
 		if (level == CPU_LEVEL_ROOT)
 			nset = cpuset_refroot(set);
 		else
 			nset = cpuset_refbase(set);
 		error = cpuset_modify(nset, mask);
 		cpuset_rel(nset);
 		cpuset_rel(set);
 		break;
 	case CPU_LEVEL_WHICH:
 		switch (which) {
 		case CPU_WHICH_TID:
 			error = cpuset_setthread(id, mask);
 			break;
 		case CPU_WHICH_PID:
 			error = cpuset_setproc(id, NULL, mask);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			error = cpuset_which(which, id, &p, &ttd, &set);
 			if (error == 0) {
 				error = cpuset_modify(set, mask);
 				cpuset_rel(set);
 			}
 			break;
 		case CPU_WHICH_IRQ:
-			error = intr_setaffinity(id, mask);
+		case CPU_WHICH_INTRHANDLER:
+		case CPU_WHICH_ITHREAD:
 			/* cpuset_which() is not consulted on this path. */
+			error = intr_setaffinity(id, which, mask);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 out:
 	free(mask, M_TEMP);
 	return (error);
 }
 
 #ifdef DDB
 /*
  * Print the cpus contained in 'set' as a comma separated list of decimal
  * cpu ids on the debugger console, or "<none>" for an empty set.
  */
 void
 ddb_display_cpuset(const cpuset_t *set)
 {
 	int cpu, printed;
 
 	printed = 0;
 	for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
 		if (!CPU_ISSET(cpu, set))
 			continue;
 		/* Every entry after the first is preceded by a comma. */
 		if (printed != 0)
 			db_printf(",%d", cpu);
 		else {
 			db_printf("%d", cpu);
 			printed = 1;
 		}
 	}
 	if (printed == 0)
 		db_printf("<none>");
 }
 
 DB_SHOW_COMMAND(cpusets, db_show_cpusets)
 {
 	struct cpuset *set;
 
 	LIST_FOREACH(set, &cpuset_ids, cs_link) {
 		db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
 		    set, set->cs_id, set->cs_ref, set->cs_flags,
 		    (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
 		db_printf("  mask=");
 		ddb_display_cpuset(&set->cs_mask);
 		db_printf("\n");
 		if (db_pager_quit)
 			break;
 	}
 }
 #endif /* DDB */
Index: stable/11/sys/kern/kern_intr.c
===================================================================
--- stable/11/sys/kern/kern_intr.c	(revision 333337)
+++ stable/11/sys/kern/kern_intr.c	(revision 333338)
@@ -1,1934 +1,2008 @@
 /*-
  * Copyright (c) 1997, Stefan Esser <se@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_kstack_usage_prof.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/cpuset.h>
 #include <sys/rtprio.h>
 #include <sys/systm.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/random.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/unistd.h>
 #include <sys/vmmeter.h>
 #include <machine/atomic.h>
 #include <machine/cpu.h>
 #include <machine/md_var.h>
 #include <machine/stdarg.h>
 #ifdef DDB
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 /*
  * Describe an interrupt thread.  There is one of these per interrupt event.
  */
 struct intr_thread {
 	struct intr_event *it_event;
 	struct thread *it_thread;	/* Kernel thread. */
 	int	it_flags;		/* (j) IT_* flags. */
 	int	it_need;		/* Needs service. */
 };
 
 /* Interrupt thread flags kept in it_flags */
 #define	IT_DEAD		0x000001	/* Thread is waiting to exit. */
 #define	IT_WAIT		0x000002	/* Thread is waiting for completion. */
 
 struct	intr_entropy {
 	struct	thread *td;
 	uintptr_t event;
 };
 
 struct	intr_event *clk_intr_event;
 struct	intr_event *tty_intr_event;
 void	*vm_ih;
 struct proc *intrproc;
 
 static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads");
 
 static int intr_storm_threshold = 1000;
 SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RWTUN,
     &intr_storm_threshold, 0,
     "Number of consecutive interrupts before storm protection is enabled");
 static TAILQ_HEAD(, intr_event) event_list =
     TAILQ_HEAD_INITIALIZER(event_list);
 static struct mtx event_lock;
 MTX_SYSINIT(intr_event_list, &event_lock, "intr event list", MTX_DEF);
 
 static void	intr_event_update(struct intr_event *ie);
 #ifdef INTR_FILTER
 static int	intr_event_schedule_thread(struct intr_event *ie,
 		    struct intr_thread *ithd);
 static int	intr_filter_loop(struct intr_event *ie,
 		    struct trapframe *frame, struct intr_thread **ithd);
 static struct intr_thread *ithread_create(const char *name,
 			      struct intr_handler *ih);
 #else
 static int	intr_event_schedule_thread(struct intr_event *ie);
 static struct intr_thread *ithread_create(const char *name);
 #endif
 static void	ithread_destroy(struct intr_thread *ithread);
 static void	ithread_execute_handlers(struct proc *p, 
 		    struct intr_event *ie);
 #ifdef INTR_FILTER
 static void	priv_ithread_execute_handler(struct proc *p, 
 		    struct intr_handler *ih);
 #endif
 static void	ithread_loop(void *);
 static void	ithread_update(struct intr_thread *ithd);
 static void	start_softintr(void *);
 
 /* Map an interrupt type to an ithread priority. */
 u_char
 intr_priority(enum intr_type flags)
 {
 	u_char pri;
 
 	flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET |
 	    INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK | INTR_TYPE_AV);
 	switch (flags) {
 	case INTR_TYPE_TTY:
 		pri = PI_TTY;
 		break;
 	case INTR_TYPE_BIO:
 		pri = PI_DISK;
 		break;
 	case INTR_TYPE_NET:
 		pri = PI_NET;
 		break;
 	case INTR_TYPE_CAM:
 		pri = PI_DISK;
 		break;
 	case INTR_TYPE_AV:
 		pri = PI_AV;
 		break;
 	case INTR_TYPE_CLK:
 		pri = PI_REALTIME;
 		break;
 	case INTR_TYPE_MISC:
 		pri = PI_DULL;          /* don't care */
 		break;
 	default:
 		/* We didn't specify an interrupt level. */
 		panic("intr_priority: no interrupt type in flags");
 	}
 
 	return pri;
 }
 
 /*
  * Update an ithread based on the associated intr_event.
  */
 static void
 ithread_update(struct intr_thread *ithd)
 {
 	struct intr_event *ie;
 	struct thread *td;
 	u_char pri;
 
 	ie = ithd->it_event;
 	td = ithd->it_thread;
 
 	/* Determine the overall priority of this event. */
 	if (TAILQ_EMPTY(&ie->ie_handlers))
 		pri = PRI_MAX_ITHD;
 	else
 		pri = TAILQ_FIRST(&ie->ie_handlers)->ih_pri;
 
 	/* Update name and priority. */
 	strlcpy(td->td_name, ie->ie_fullname, sizeof(td->td_name));
 #ifdef KTR
 	sched_clear_tdname(td);
 #endif
 	thread_lock(td);
 	sched_prio(td, pri);
 	thread_unlock(td);
 }
 
 /*
  * Regenerate the full name of an interrupt event and update its priority.
  */
 static void
 intr_event_update(struct intr_event *ie)
 {
 	struct intr_handler *ih;
 	char *last;
 	int missed, space;
 
 	/* Start off with no entropy and just the name of the event. */
 	mtx_assert(&ie->ie_lock, MA_OWNED);
 	strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname));
 	ie->ie_flags &= ~IE_ENTROPY;
 	missed = 0;
 	space = 1;
 
 	/* Run through all the handlers updating values. */
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		if (strlen(ie->ie_fullname) + strlen(ih->ih_name) + 1 <
 		    sizeof(ie->ie_fullname)) {
 			strcat(ie->ie_fullname, " ");
 			strcat(ie->ie_fullname, ih->ih_name);
 			space = 0;
 		} else
 			missed++;
 		if (ih->ih_flags & IH_ENTROPY)
 			ie->ie_flags |= IE_ENTROPY;
 	}
 
 	/*
 	 * If the handler names were too long, add +'s to indicate missing
 	 * names. If we run out of room and still have +'s to add, change
 	 * the last character from a + to a *.
 	 */
 	last = &ie->ie_fullname[sizeof(ie->ie_fullname) - 2];
 	while (missed-- > 0) {
 		if (strlen(ie->ie_fullname) + 1 == sizeof(ie->ie_fullname)) {
 			if (*last == '+') {
 				*last = '*';
 				break;
 			} else
 				*last = '+';
 		} else if (space) {
 			strcat(ie->ie_fullname, " +");
 			space = 0;
 		} else
 			strcat(ie->ie_fullname, "+");
 	}
 
 	/*
 	 * If this event has an ithread, update it's priority and
 	 * name.
 	 */
 	if (ie->ie_thread != NULL)
 		ithread_update(ie->ie_thread);
 	CTR2(KTR_INTR, "%s: updated %s", __func__, ie->ie_fullname);
 }
 
 int
 intr_event_create(struct intr_event **event, void *source, int flags, int irq,
     void (*pre_ithread)(void *), void (*post_ithread)(void *),
     void (*post_filter)(void *), int (*assign_cpu)(void *, int),
     const char *fmt, ...)
 {
 	struct intr_event *ie;
 	va_list ap;
 
 	/* The only valid flag during creation is IE_SOFT. */
 	if ((flags & ~IE_SOFT) != 0)
 		return (EINVAL);
 	ie = malloc(sizeof(struct intr_event), M_ITHREAD, M_WAITOK | M_ZERO);
 	ie->ie_source = source;
 	ie->ie_pre_ithread = pre_ithread;
 	ie->ie_post_ithread = post_ithread;
 	ie->ie_post_filter = post_filter;
 	ie->ie_assign_cpu = assign_cpu;
 	ie->ie_flags = flags;
 	ie->ie_irq = irq;
 	ie->ie_cpu = NOCPU;
 	TAILQ_INIT(&ie->ie_handlers);
 	mtx_init(&ie->ie_lock, "intr event", NULL, MTX_DEF);
 
 	va_start(ap, fmt);
 	vsnprintf(ie->ie_name, sizeof(ie->ie_name), fmt, ap);
 	va_end(ap);
 	strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname));
 	mtx_lock(&event_lock);
 	TAILQ_INSERT_TAIL(&event_list, ie, ie_list);
 	mtx_unlock(&event_lock);
 	if (event != NULL)
 		*event = ie;
 	CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name);
 	return (0);
 }
 
 /*
  * Bind an interrupt event to the specified CPU.  Note that not all
  * platforms support binding an interrupt to a CPU.  For those
- * platforms this request will fail.  For supported platforms, any
- * associated ithreads as well as the primary interrupt context will
- * be bound to the specificed CPU.  Using a cpu id of NOCPU unbinds
+ * platforms this request will fail.  Using a cpu id of NOCPU unbinds
  * the interrupt event.
  */
-int
-intr_event_bind(struct intr_event *ie, int cpu)
+static int
+_intr_event_bind(struct intr_event *ie, int cpu, bool bindirq, bool bindithread)
 {
 	lwpid_t id;
 	int error;
 
 	/* Need a CPU to bind to. */
 	if (cpu != NOCPU && CPU_ABSENT(cpu))
 		return (EINVAL);
 
 	if (ie->ie_assign_cpu == NULL)
 		return (EOPNOTSUPP);
 
 	error = priv_check(curthread, PRIV_SCHED_CPUSET_INTR);
 	if (error)
 		return (error);
 
 	/*
 	 * If we have any ithreads try to set their mask first to verify
 	 * permissions, etc.
 	 */
-	mtx_lock(&ie->ie_lock);
-	if (ie->ie_thread != NULL) {
-		id = ie->ie_thread->it_thread->td_tid;
-		mtx_unlock(&ie->ie_lock);
-		error = cpuset_setithread(id, cpu);
-		if (error)
-			return (error);
-	} else
-		mtx_unlock(&ie->ie_lock);
-	error = ie->ie_assign_cpu(ie->ie_source, cpu);
-	if (error) {
+	if (bindithread) {		/* caller asked for the ithread to be bound */
 		mtx_lock(&ie->ie_lock);
 		if (ie->ie_thread != NULL) {
-			cpu = ie->ie_cpu;
 			id = ie->ie_thread->it_thread->td_tid;
 			mtx_unlock(&ie->ie_lock);
-			(void)cpuset_setithread(id, cpu);
+			error = cpuset_setithread(id, cpu);
+			if (error)
+				return (error);
 		} else
 			mtx_unlock(&ie->ie_lock);
+	}
+	if (bindirq)			/* pass the CPU to the source's assign_cpu hook */
+		error = ie->ie_assign_cpu(ie->ie_source, cpu);
+	if (error) {
+		if (bindithread) {	/* put the ithread back on the recorded ie_cpu */
+			mtx_lock(&ie->ie_lock);
+			if (ie->ie_thread != NULL) {
+				cpu = ie->ie_cpu;
+				id = ie->ie_thread->it_thread->td_tid;
+				mtx_unlock(&ie->ie_lock);
+				(void)cpuset_setithread(id, cpu);
+			} else
+				mtx_unlock(&ie->ie_lock);
+		}
 		return (error);
 	}
 
-	mtx_lock(&ie->ie_lock);
-	ie->ie_cpu = cpu;
-	mtx_unlock(&ie->ie_lock);
+	if (bindirq) {			/* ie_cpu is updated only for IRQ bindings */
+		mtx_lock(&ie->ie_lock);
+		ie->ie_cpu = cpu;
+		mtx_unlock(&ie->ie_lock);
+	}
 
 	return (error);
 }
 
+/*
+ * Bind an interrupt event to the specified CPU.  For supported platforms, any
+ * associated ithreads as well as the primary interrupt context will be bound
+ * to the specified CPU.
+ */
+int
+intr_event_bind(struct intr_event *ie, int cpu)
+{
+
+	return (_intr_event_bind(ie, cpu, true, true));
+}
+
+/*
+ * Bind an interrupt event to the specified CPU, but do not bind associated
+ * ithreads.
+ */
+int
+intr_event_bind_irqonly(struct intr_event *ie, int cpu)
+{
+
+	return (_intr_event_bind(ie, cpu, true, false));
+}
+
+/*
+ * Bind an interrupt event's ithread to the specified CPU.
+ */
+int
+intr_event_bind_ithread(struct intr_event *ie, int cpu)
+{
+
+	return (_intr_event_bind(ie, cpu, false, true));
+}
+
 static struct intr_event *
 intr_lookup(int irq)
 {
 	struct intr_event *ie;
 
 	mtx_lock(&event_lock);
 	TAILQ_FOREACH(ie, &event_list, ie_list)
 		if (ie->ie_irq == irq &&
 		    (ie->ie_flags & IE_SOFT) == 0 &&
 		    TAILQ_FIRST(&ie->ie_handlers) != NULL)
 			break;
 	mtx_unlock(&event_lock);
 	return (ie);
 }
 
 int
-intr_setaffinity(int irq, void *m)
+intr_setaffinity(int irq, int mode, void *m)
 {
 	struct intr_event *ie;
 	cpuset_t *mask;
 	int cpu, n;
 
 	mask = m;
 	cpu = NOCPU;
 	/*
 	 * If we're setting all cpus we can unbind.  Otherwise make sure
 	 * only one cpu is in the set.
 	 */
 	if (CPU_CMP(cpuset_root, mask)) {
 		for (n = 0; n < CPU_SETSIZE; n++) {
 			if (!CPU_ISSET(n, mask))
 				continue;
 			if (cpu != NOCPU)
 				return (EINVAL);
 			cpu = n;
 		}
 	}
 	ie = intr_lookup(irq);
 	if (ie == NULL)
 		return (ESRCH);
-	return (intr_event_bind(ie, cpu));
+	switch (mode) {			/* mode is the caller's CPU_WHICH_* selector */
+	case CPU_WHICH_IRQ:		/* bind interrupt and its ithread */
+		return (intr_event_bind(ie, cpu));
+	case CPU_WHICH_INTRHANDLER:	/* bind interrupt only, not the ithread */
+		return (intr_event_bind_irqonly(ie, cpu));
+	case CPU_WHICH_ITHREAD:		/* bind the ithread only */
+		return (intr_event_bind_ithread(ie, cpu));
+	default:			/* any other selector is rejected */
+		return (EINVAL);
+	}
 }
 
 int
-intr_getaffinity(int irq, void *m)
+intr_getaffinity(int irq, int mode, void *m)
 {
 	struct intr_event *ie;
+	struct thread *td;
+	struct proc *p;
 	cpuset_t *mask;
+	lwpid_t id;
+	int error;
 
 	mask = m;
 	ie = intr_lookup(irq);
 	if (ie == NULL)
 		return (ESRCH);
+	error = 0;
 	CPU_ZERO(mask);
-	mtx_lock(&ie->ie_lock);
-	if (ie->ie_cpu == NOCPU)
-		CPU_COPY(cpuset_root, mask);
-	else
-		CPU_SET(ie->ie_cpu, mask);
-	mtx_unlock(&ie->ie_lock);
+	switch (mode) {
+	case CPU_WHICH_IRQ:
+	case CPU_WHICH_INTRHANDLER:	/* report the recorded ie_cpu binding */
+		mtx_lock(&ie->ie_lock);
+		if (ie->ie_cpu == NOCPU)
+			CPU_COPY(cpuset_root, mask);
+		else
+			CPU_SET(ie->ie_cpu, mask);
+		mtx_unlock(&ie->ie_lock);
+		break;
+	case CPU_WHICH_ITHREAD:		/* report the ithread's cpuset mask */
+		mtx_lock(&ie->ie_lock);
+		if (ie->ie_thread == NULL) {
+			mtx_unlock(&ie->ie_lock);
+			CPU_COPY(cpuset_root, mask);
+		} else {
+			id = ie->ie_thread->it_thread->td_tid;
+			mtx_unlock(&ie->ie_lock);
+			error = cpuset_which(CPU_WHICH_TID, id, &p, &td, NULL);
+			if (error != 0)
+				return (error);
+			CPU_COPY(&td->td_cpuset->cs_mask, mask);
+			PROC_UNLOCK(p);
+		}
+		break;		/* must not fall into default and return EINVAL */
+	default:
+		return (EINVAL);
+	}
 	return (0);
 }
 
 int
 intr_event_destroy(struct intr_event *ie)
 {
 
 	mtx_lock(&event_lock);
 	mtx_lock(&ie->ie_lock);
 	if (!TAILQ_EMPTY(&ie->ie_handlers)) {
 		mtx_unlock(&ie->ie_lock);
 		mtx_unlock(&event_lock);
 		return (EBUSY);
 	}
 	TAILQ_REMOVE(&event_list, ie, ie_list);
 #ifndef notyet
 	if (ie->ie_thread != NULL) {
 		ithread_destroy(ie->ie_thread);
 		ie->ie_thread = NULL;
 	}
 #endif
 	mtx_unlock(&ie->ie_lock);
 	mtx_unlock(&event_lock);
 	mtx_destroy(&ie->ie_lock);
 	free(ie, M_ITHREAD);
 	return (0);
 }
 
 #ifndef INTR_FILTER
 static struct intr_thread *
 ithread_create(const char *name)
 {
 	struct intr_thread *ithd;
 	struct thread *td;
 	int error;
 
 	ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO);
 
 	error = kproc_kthread_add(ithread_loop, ithd, &intrproc,
 		    &td, RFSTOPPED | RFHIGHPID,
 	    	    0, "intr", "%s", name);
 	if (error)
 		panic("kproc_create() failed with %d", error);
 	thread_lock(td);
 	sched_class(td, PRI_ITHD);
 	TD_SET_IWAIT(td);
 	thread_unlock(td);
 	td->td_pflags |= TDP_ITHREAD;
 	ithd->it_thread = td;
 	CTR2(KTR_INTR, "%s: created %s", __func__, name);
 	return (ithd);
 }
 #else
 static struct intr_thread *
 ithread_create(const char *name, struct intr_handler *ih)
 {
 	struct intr_thread *ithd;
 	struct thread *td;
 	int error;
 
 	ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO);
 
 	error = kproc_kthread_add(ithread_loop, ih, &intrproc,
 		    &td, RFSTOPPED | RFHIGHPID,
 	    	    0, "intr", "%s", name);
 	if (error)
 		panic("kproc_create() failed with %d", error);
 	thread_lock(td);
 	sched_class(td, PRI_ITHD);
 	TD_SET_IWAIT(td);
 	thread_unlock(td);
 	td->td_pflags |= TDP_ITHREAD;
 	ithd->it_thread = td;
 	CTR2(KTR_INTR, "%s: created %s", __func__, name);
 	return (ithd);
 }
 #endif
 
 static void
 ithread_destroy(struct intr_thread *ithread)
 {
 	struct thread *td;
 
 	CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name);
 	td = ithread->it_thread;
 	thread_lock(td);
 	ithread->it_flags |= IT_DEAD;
 	if (TD_AWAITING_INTR(td)) {
 		TD_CLR_IWAIT(td);
 		sched_add(td, SRQ_INTR);
 	}
 	thread_unlock(td);
 }
 
 #ifndef INTR_FILTER
 int
 intr_event_add_handler(struct intr_event *ie, const char *name,
     driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri,
     enum intr_type flags, void **cookiep)
 {
 	struct intr_handler *ih, *temp_ih;
 	struct intr_thread *it;
 
 	if (ie == NULL || name == NULL || (handler == NULL && filter == NULL))
 		return (EINVAL);
 
 	/* Allocate and populate an interrupt handler structure. */
 	ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO);
 	ih->ih_filter = filter;
 	ih->ih_handler = handler;
 	ih->ih_argument = arg;
 	strlcpy(ih->ih_name, name, sizeof(ih->ih_name));
 	ih->ih_event = ie;
 	ih->ih_pri = pri;
 	if (flags & INTR_EXCL)
 		ih->ih_flags = IH_EXCLUSIVE;
 	if (flags & INTR_MPSAFE)
 		ih->ih_flags |= IH_MPSAFE;
 	if (flags & INTR_ENTROPY)
 		ih->ih_flags |= IH_ENTROPY;
 
 	/* We can only have one exclusive handler in a event. */
 	mtx_lock(&ie->ie_lock);
 	if (!TAILQ_EMPTY(&ie->ie_handlers)) {
 		if ((flags & INTR_EXCL) ||
 		    (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) {
 			mtx_unlock(&ie->ie_lock);
 			free(ih, M_ITHREAD);
 			return (EINVAL);
 		}
 	}
 
 	/* Create a thread if we need one. */
 	while (ie->ie_thread == NULL && handler != NULL) {
 		if (ie->ie_flags & IE_ADDING_THREAD)
 			msleep(ie, &ie->ie_lock, 0, "ithread", 0);
 		else {
 			ie->ie_flags |= IE_ADDING_THREAD;
 			mtx_unlock(&ie->ie_lock);
 			it = ithread_create("intr: newborn");
 			mtx_lock(&ie->ie_lock);
 			ie->ie_flags &= ~IE_ADDING_THREAD;
 			ie->ie_thread = it;
 			it->it_event = ie;
 			ithread_update(it);
 			wakeup(ie);
 		}
 	}
 
 	/* Add the new handler to the event in priority order. */
 	TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) {
 		if (temp_ih->ih_pri > ih->ih_pri)
 			break;
 	}
 	if (temp_ih == NULL)
 		TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next);
 	else
 		TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next);
 	intr_event_update(ie);
 
 	CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name,
 	    ie->ie_name);
 	mtx_unlock(&ie->ie_lock);
 
 	if (cookiep != NULL)
 		*cookiep = ih;
 	return (0);
 }
 #else
 int
 intr_event_add_handler(struct intr_event *ie, const char *name,
     driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri,
     enum intr_type flags, void **cookiep)
 {
 	struct intr_handler *ih, *temp_ih;
 	struct intr_thread *it;
 
 	if (ie == NULL || name == NULL || (handler == NULL && filter == NULL))
 		return (EINVAL);
 
 	/* Allocate and populate an interrupt handler structure. */
 	ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO);
 	ih->ih_filter = filter;
 	ih->ih_handler = handler;
 	ih->ih_argument = arg;
 	strlcpy(ih->ih_name, name, sizeof(ih->ih_name));
 	ih->ih_event = ie;
 	ih->ih_pri = pri;
 	if (flags & INTR_EXCL)
 		ih->ih_flags = IH_EXCLUSIVE;
 	if (flags & INTR_MPSAFE)
 		ih->ih_flags |= IH_MPSAFE;
 	if (flags & INTR_ENTROPY)
 		ih->ih_flags |= IH_ENTROPY;
 
 	/* We can only have one exclusive handler in a event. */
 	mtx_lock(&ie->ie_lock);
 	if (!TAILQ_EMPTY(&ie->ie_handlers)) {
 		if ((flags & INTR_EXCL) ||
 		    (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) {
 			mtx_unlock(&ie->ie_lock);
 			free(ih, M_ITHREAD);
 			return (EINVAL);
 		}
 	}
 
 	/* For filtered handlers, create a private ithread to run on. */
 	if (filter != NULL && handler != NULL) {
 		mtx_unlock(&ie->ie_lock);
 		it = ithread_create("intr: newborn", ih);
 		mtx_lock(&ie->ie_lock);
 		it->it_event = ie;
 		ih->ih_thread = it;
 		ithread_update(it); /* XXX - do we really need this?!?!? */
 	} else { /* Create the global per-event thread if we need one. */
 		while (ie->ie_thread == NULL && handler != NULL) {
 			if (ie->ie_flags & IE_ADDING_THREAD)
 				msleep(ie, &ie->ie_lock, 0, "ithread", 0);
 			else {
 				ie->ie_flags |= IE_ADDING_THREAD;
 				mtx_unlock(&ie->ie_lock);
 				it = ithread_create("intr: newborn", ih);
 				mtx_lock(&ie->ie_lock);
 				ie->ie_flags &= ~IE_ADDING_THREAD;
 				ie->ie_thread = it;
 				it->it_event = ie;
 				ithread_update(it);
 				wakeup(ie);
 			}
 		}
 	}
 
 	/* Add the new handler to the event in priority order. */
 	TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) {
 		if (temp_ih->ih_pri > ih->ih_pri)
 			break;
 	}
 	if (temp_ih == NULL)
 		TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next);
 	else
 		TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next);
 	intr_event_update(ie);
 
 	CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name,
 	    ie->ie_name);
 	mtx_unlock(&ie->ie_lock);
 
 	if (cookiep != NULL)
 		*cookiep = ih;
 	return (0);
 }
 #endif
 
 /*
  * Append a description preceded by a ':' to the name of the specified
  * interrupt handler.
  */
 int
 intr_event_describe_handler(struct intr_event *ie, void *cookie,
     const char *descr)
 {
 	struct intr_handler *ih;
 	size_t space;
 	char *start;
 
 	mtx_lock(&ie->ie_lock);
 #ifdef INVARIANTS
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		if (ih == cookie)
 			break;
 	}
 	if (ih == NULL) {
 		mtx_unlock(&ie->ie_lock);
 		panic("handler %p not found in interrupt event %p", cookie, ie);
 	}
 #endif
 	ih = cookie;
 
 	/*
 	 * Look for an existing description by checking for an
 	 * existing ":".  This assumes device names do not include
 	 * colons.  If one is found, prepare to insert the new
 	 * description at that point.  If one is not found, find the
 	 * end of the name to use as the insertion point.
 	 */
 	start = strchr(ih->ih_name, ':');
 	if (start == NULL)
 		start = strchr(ih->ih_name, 0);
 
 	/*
 	 * See if there is enough remaining room in the string for the
 	 * description + ":".  The "- 1" leaves room for the trailing
 	 * '\0'.  The "+ 1" accounts for the colon.
 	 */
 	space = sizeof(ih->ih_name) - (start - ih->ih_name) - 1;
 	if (strlen(descr) + 1 > space) {
 		mtx_unlock(&ie->ie_lock);
 		return (ENOSPC);
 	}
 
 	/* Append a colon followed by the description. */
 	*start = ':';
 	strcpy(start + 1, descr);
 	intr_event_update(ie);
 	mtx_unlock(&ie->ie_lock);
 	return (0);
 }
 
 /*
  * Return the ie_source field from the intr_event an intr_handler is
  * associated with.
  */
 void *
 intr_handler_source(void *cookie)
 {
 	struct intr_handler *ih;
 	struct intr_event *ie;
 
 	ih = (struct intr_handler *)cookie;
 	if (ih == NULL)
 		return (NULL);
 	ie = ih->ih_event;
 	KASSERT(ie != NULL,
 	    ("interrupt handler \"%s\" has a NULL interrupt event",
 	    ih->ih_name));
 	return (ie->ie_source);
 }
 
 /*
  * Sleep until an ithread finishes executing an interrupt handler.
  *
  * XXX Doesn't currently handle interrupt filters or fast interrupt
  * handlers.  This is intended for compatibility with linux drivers
  * only.  Do not use in BSD code.
  */
 void
 _intr_drain(int irq)
 {
 	struct intr_event *ie;
 	struct intr_thread *ithd;
 	struct thread *td;
 
 	ie = intr_lookup(irq);
 	if (ie == NULL)
 		return;
 	if (ie->ie_thread == NULL)
 		return;
 	ithd = ie->ie_thread;
 	td = ithd->it_thread;
 	/*
 	 * We set the flag and wait for it to be cleared to avoid
 	 * long delays with potentially busy interrupt handlers
 	 * were we to only sample TD_AWAITING_INTR() every tick.
 	 */
 	thread_lock(td);
 	if (!TD_AWAITING_INTR(td)) {
 		ithd->it_flags |= IT_WAIT;
 		while (ithd->it_flags & IT_WAIT) {
 			thread_unlock(td);
 			pause("idrain", 1);
 			thread_lock(td);
 		}
 	}
 	thread_unlock(td);
 	return;
 }
 
 
 #ifndef INTR_FILTER
 int
 intr_event_remove_handler(void *cookie)
 {
 	struct intr_handler *handler = (struct intr_handler *)cookie;
 	struct intr_event *ie;
 #ifdef INVARIANTS
 	struct intr_handler *ih;
 #endif
 #ifdef notyet
 	int dead;
 #endif
 
 	if (handler == NULL)
 		return (EINVAL);
 	ie = handler->ih_event;
 	KASSERT(ie != NULL,
 	    ("interrupt handler \"%s\" has a NULL interrupt event",
 	    handler->ih_name));
 	mtx_lock(&ie->ie_lock);
 	CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name,
 	    ie->ie_name);
 #ifdef INVARIANTS
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next)
 		if (ih == handler)
 			goto ok;
 	mtx_unlock(&ie->ie_lock);
 	panic("interrupt handler \"%s\" not found in interrupt event \"%s\"",
 	    ih->ih_name, ie->ie_name);
 ok:
 #endif
 	/*
 	 * If there is no ithread, then just remove the handler and return.
 	 * XXX: Note that an INTR_FAST handler might be running on another
 	 * CPU!
 	 */
 	if (ie->ie_thread == NULL) {
 		TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
 		mtx_unlock(&ie->ie_lock);
 		free(handler, M_ITHREAD);
 		return (0);
 	}
 
 	/*
 	 * If the interrupt thread is already running, then just mark this
 	 * handler as being dead and let the ithread do the actual removal.
 	 *
 	 * During a cold boot while cold is set, msleep() does not sleep,
 	 * so we have to remove the handler here rather than letting the
 	 * thread do it.
 	 */
 	thread_lock(ie->ie_thread->it_thread);
 	if (!TD_AWAITING_INTR(ie->ie_thread->it_thread) && !cold) {
 		handler->ih_flags |= IH_DEAD;
 
 		/*
 		 * Ensure that the thread will process the handler list
 		 * again and remove this handler if it has already passed
 		 * it on the list.
 		 *
 		 * The release part of the following store ensures
 		 * that the update of ih_flags is ordered before the
 		 * it_need setting.  See the comment before
 		 * atomic_cmpset_acq(&ithd->it_need, ...) operation in
 		 * the ithread_execute_handlers().
 		 */
 		atomic_store_rel_int(&ie->ie_thread->it_need, 1);
 	} else
 		TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
 	thread_unlock(ie->ie_thread->it_thread);
 	while (handler->ih_flags & IH_DEAD)
 		msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0);
 	intr_event_update(ie);
 #ifdef notyet
 	/*
 	 * XXX: This could be bad in the case of ppbus(8).  Also, I think
 	 * this could lead to races of stale data when servicing an
 	 * interrupt.
 	 */
 	dead = 1;
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		if (!(ih->ih_flags & IH_FAST)) {
 			dead = 0;
 			break;
 		}
 	}
 	if (dead) {
 		ithread_destroy(ie->ie_thread);
 		ie->ie_thread = NULL;
 	}
 #endif
 	mtx_unlock(&ie->ie_lock);
 	free(handler, M_ITHREAD);
 	return (0);
 }
 
 static int
 intr_event_schedule_thread(struct intr_event *ie)
 {
 	struct intr_entropy entropy;
 	struct intr_thread *it;
 	struct thread *td;
 	struct thread *ctd;
 	struct proc *p;
 
 	/*
 	 * If no ithread or no handlers, then we have a stray interrupt.
 	 */
 	if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) ||
 	    ie->ie_thread == NULL)
 		return (EINVAL);
 
 	ctd = curthread;
 	it = ie->ie_thread;
 	td = it->it_thread;
 	p = td->td_proc;
 
 	/*
 	 * If any of the handlers for this ithread claim to be good
 	 * sources of entropy, then gather some.
 	 */
 	if (ie->ie_flags & IE_ENTROPY) {
 		entropy.event = (uintptr_t)ie;
 		entropy.td = ctd;
 		random_harvest_queue(&entropy, sizeof(entropy), 2, RANDOM_INTERRUPT);
 	}
 
 	KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name));
 
 	/*
 	 * Set it_need to tell the thread to keep running if it is already
 	 * running.  Then, lock the thread and see if we actually need to
 	 * put it on the runqueue.
 	 *
 	 * Use store_rel to arrange that the store to ih_need in
 	 * swi_sched() is before the store to it_need and prepare for
 	 * transfer of this order to loads in the ithread.
 	 */
 	atomic_store_rel_int(&it->it_need, 1);
 	thread_lock(td);
 	if (TD_AWAITING_INTR(td)) {
 		CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
 		    td->td_name);
 		TD_CLR_IWAIT(td);
 		sched_add(td, SRQ_INTR);
 	} else {
 		CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d",
 		    __func__, p->p_pid, td->td_name, it->it_need, td->td_state);
 	}
 	thread_unlock(td);
 
 	return (0);
 }
 #else
 int
 intr_event_remove_handler(void *cookie)
 {
 	struct intr_handler *handler = (struct intr_handler *)cookie;
 	struct intr_event *ie;
 	struct intr_thread *it;
 #ifdef INVARIANTS
 	struct intr_handler *ih;
 #endif
 #ifdef notyet
 	int dead;
 #endif
 
 	if (handler == NULL)
 		return (EINVAL);
 	ie = handler->ih_event;
 	KASSERT(ie != NULL,
 	    ("interrupt handler \"%s\" has a NULL interrupt event",
 	    handler->ih_name));
 	mtx_lock(&ie->ie_lock);
 	CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name,
 	    ie->ie_name);
 #ifdef INVARIANTS
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next)
 		if (ih == handler)
 			goto ok;
 	mtx_unlock(&ie->ie_lock);
 	panic("interrupt handler \"%s\" not found in interrupt event \"%s\"",
 	    ih->ih_name, ie->ie_name);
 ok:
 #endif
 	/*
 	 * If there are no ithreads (per event and per handler), then
 	 * just remove the handler and return.  
 	 * XXX: Note that an INTR_FAST handler might be running on another CPU!
 	 */
 	if (ie->ie_thread == NULL && handler->ih_thread == NULL) {
 		TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
 		mtx_unlock(&ie->ie_lock);
 		free(handler, M_ITHREAD);
 		return (0);
 	}
 
 	/* Private or global ithread? */
 	it = (handler->ih_thread) ? handler->ih_thread : ie->ie_thread;
 	/*
 	 * If the interrupt thread is already running, then just mark this
 	 * handler as being dead and let the ithread do the actual removal.
 	 *
 	 * During a cold boot while cold is set, msleep() does not sleep,
 	 * so we have to remove the handler here rather than letting the
 	 * thread do it.
 	 */
 	thread_lock(it->it_thread);
 	if (!TD_AWAITING_INTR(it->it_thread) && !cold) {
 		handler->ih_flags |= IH_DEAD;
 
 		/*
 		 * Ensure that the thread will process the handler list
 		 * again and remove this handler if it has already passed
 		 * it on the list.
 		 *
 		 * The release part of the following store ensures
 		 * that the update of ih_flags is ordered before the
 		 * it_need setting.  See the comment before
 		 * atomic_cmpset_acq(&ithd->it_need, ...) operation in
 		 * the ithread_execute_handlers().
 		 */
 		atomic_store_rel_int(&it->it_need, 1);
 	} else
 		TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
 	thread_unlock(it->it_thread);
 	while (handler->ih_flags & IH_DEAD)
 		msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0);
 	/* 
 	 * At this point, the handler has been disconnected from the event,
 	 * so we can kill the private ithread if any.
 	 */
 	if (handler->ih_thread) {
 		ithread_destroy(handler->ih_thread);
 		handler->ih_thread = NULL;
 	}
 	intr_event_update(ie);
 #ifdef notyet
 	/*
 	 * XXX: This could be bad in the case of ppbus(8).  Also, I think
 	 * this could lead to races of stale data when servicing an
 	 * interrupt.
 	 */
 	dead = 1;
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		if (handler != NULL) {
 			dead = 0;
 			break;
 		}
 	}
 	if (dead) {
 		ithread_destroy(ie->ie_thread);
 		ie->ie_thread = NULL;
 	}
 #endif
 	mtx_unlock(&ie->ie_lock);
 	free(handler, M_ITHREAD);
 	return (0);
 }
 
 /*
  * Schedule the interrupt thread 'it' on behalf of event 'ie'.  This is the
  * two-argument form that swi_sched() calls when INTR_FILTER is defined.
  *
  * Returns EINVAL if ie is NULL, ie has no handlers, or it is NULL (a stray
  * interrupt).  Otherwise it sets it->it_need, hands the thread to
  * sched_add() if it was waiting for an interrupt, and returns 0.
  */
 static int
 intr_event_schedule_thread(struct intr_event *ie, struct intr_thread *it)
 {
 	struct intr_entropy entropy;
 	struct thread *td;
 	struct thread *ctd;
 	struct proc *p;
 
 	/*
 	 * If no ithread or no handlers, then we have a stray interrupt.
 	 */
 	if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) || it == NULL)
 		return (EINVAL);
 
 	/* ctd is the interrupted (current) thread, td the ithread to wake. */
 	ctd = curthread;
 	td = it->it_thread;
 	p = td->td_proc;
 
 	/*
 	 * If any of the handlers for this ithread claim to be good
 	 * sources of entropy, then gather some.
 	 */
 	if (ie->ie_flags & IE_ENTROPY) {
 		entropy.event = (uintptr_t)ie;
 		entropy.td = ctd;
 		random_harvest_queue(&entropy, sizeof(entropy), 2, RANDOM_INTERRUPT);
 	}
 
 	KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name));
 
 	/*
 	 * Set it_need to tell the thread to keep running if it is already
 	 * running.  Then, lock the thread and see if we actually need to
 	 * put it on the runqueue.
 	 *
 	 * Use store_rel to arrange that the store to ih_need in
 	 * swi_sched() is before the store to it_need and prepare for
 	 * transfer of this order to loads in the ithread.
 	 */
 	atomic_store_rel_int(&it->it_need, 1);
 	thread_lock(td);
 	if (TD_AWAITING_INTR(td)) {
 		CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
 		    td->td_name);
 		TD_CLR_IWAIT(td);
 		sched_add(td, SRQ_INTR);
 	} else {
 		/* Thread is not in interrupt wait; it_need alone is relied on. */
 		CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d",
 		    __func__, p->p_pid, td->td_name, it->it_need, td->td_state);
 	}
 	thread_unlock(td);
 
 	return (0);
 }
 #endif
 
 /*
  * CPU-binding hook for software interrupt events.  Software interrupts are
  * raised by code rather than being routed by a PIC, so there is nothing to
  * bind: every request is accepted and ignored.
  */
 static int
 swi_assign_cpu(void *arg, int cpu)
 {
 
 	/* Neither argument is consulted. */
 	(void)arg;
 	(void)cpu;
 	return (0);
 }
 
 /*
  * Attach a software interrupt handler to an interrupt event.
  *
  * If eventp is NULL, or points at a NULL event, a fresh IE_SOFT event is
  * created (and stored through eventp when eventp is non-NULL).  An existing
  * event must already carry IE_SOFT.  INTR_ENTROPY is not accepted in flags.
  *
  * Returns 0 on success, EINVAL for the rejected cases above, or the error
  * from intr_event_create()/intr_event_add_handler().
  */
 int
 swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler,
 	    void *arg, int pri, enum intr_type flags, void **cookiep)
 {
 	struct intr_event *swi_ie;
 	int rv;
 
 	if ((flags & INTR_ENTROPY) != 0)
 		return (EINVAL);
 
 	swi_ie = NULL;
 	if (eventp != NULL)
 		swi_ie = *eventp;
 
 	if (swi_ie == NULL) {
 		/* No event supplied: create one and report it to the caller. */
 		rv = intr_event_create(&swi_ie, NULL, IE_SOFT, 0,
 		    NULL, NULL, NULL, swi_assign_cpu, "swi%d:", pri);
 		if (rv != 0)
 			return (rv);
 		if (eventp != NULL)
 			*eventp = swi_ie;
 	} else if ((swi_ie->ie_flags & IE_SOFT) == 0) {
 		/* Only software events may take software handlers. */
 		return (EINVAL);
 	}
 
 	return (intr_event_add_handler(swi_ie, name, NULL, handler, arg,
 	    PI_SWI(pri), flags, cookiep));
 }
 
 /*
  * Request that a software interrupt handler be run.
  *
  * 'cookie' is used here as a struct intr_handler pointer.  The handler's
  * ih_need flag is always set.  Unless SWI_DELAY is present in 'flags', the
  * soft interrupt counter is bumped and the event's interrupt thread is
  * scheduled right away.
  */
 void
 swi_sched(void *cookie, int flags)
 {
 	struct intr_entropy entropy;
 	struct intr_handler *ih;
 	struct intr_event *ie;
 	int error;
 
 	ih = (struct intr_handler *)cookie;
 	ie = ih->ih_event;
 
 	CTR3(KTR_INTR, "swi_sched: %s %s need=%d", ie->ie_name, ih->ih_name,
 	    ih->ih_need);
 
 	/* Hand a sample to the entropy harvester. */
 	entropy.td = curthread;
 	entropy.event = (uintptr_t)ih;
 	random_harvest_queue(&entropy, sizeof(entropy), 1, RANDOM_SWI);
 
 	/*
 	 * Flag the handler as needing service.  A running ithread picks
 	 * this up on its next pass; an idle one sees it the next time it
 	 * runs.
 	 */
 	ih->ih_need = 1;
 
 	/* A delayed request only marks the handler; nothing is scheduled. */
 	if ((flags & SWI_DELAY) != 0)
 		return;
 
 	PCPU_INC(cnt.v_soft);
 #ifdef INTR_FILTER
 	error = intr_event_schedule_thread(ie, ie->ie_thread);
 #else
 	error = intr_event_schedule_thread(ie);
 #endif
 	KASSERT(error == 0, ("stray software interrupt"));
 }
 
 /*
  * Detach a software interrupt handler identified by its cookie.
  *
  * The interrupt event the handler belonged to is left in place even if it
  * ends up with no handlers.  Callers that want it gone can call
  * intr_event_destroy() themselves, although that is not an ideal interface.
  */
 int
 swi_remove(void *cookie)
 {
 	int error;
 
 	error = intr_event_remove_handler(cookie);
 	return (error);
 }
 
 #ifdef INTR_FILTER
 /*
  * Run a single handler 'ih' from its private interrupt thread.
  *
  * If the handler is flagged IH_DEAD it is unlinked from its event instead
  * of being run, and whoever sleeps on 'ih' is woken.  Otherwise the handler
  * function is called, under Giant unless the handler is IH_MPSAFE.
  * 'p' is only used for the KTR trace record.
  */
 static void
 priv_ithread_execute_handler(struct proc *p, struct intr_handler *ih)
 {
 	struct intr_event *ie;
 
 	ie = ih->ih_event;
 	/*
 	 * If this handler is marked for death, remove it from
 	 * the list of handlers and wake up the sleeper.
 	 */
 	if (ih->ih_flags & IH_DEAD) {
 		mtx_lock(&ie->ie_lock);
 		TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next);
 		ih->ih_flags &= ~IH_DEAD;
 		wakeup(ih);
 		mtx_unlock(&ie->ie_lock);
 		return;
 	}
 	
 	/* Execute this handler. */
 	CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x",
 	     __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument,
 	     ih->ih_name, ih->ih_flags);
 	
 	/* Handlers not marked MP-safe are serialized by Giant. */
 	if (!(ih->ih_flags & IH_MPSAFE))
 		mtx_lock(&Giant);
 	ih->ih_handler(ih->ih_argument);
 	if (!(ih->ih_flags & IH_MPSAFE))
 		mtx_unlock(&Giant);
 }
 #endif
 
 /*
  * This is a public function for use by drivers that mux interrupt
  * handlers for child devices from their interrupt handler.
  *
  * Walks every handler attached to 'ie'.  Handlers flagged IH_DEAD are
  * unlinked and their sleeper woken; handlers with no ih_handler are
  * skipped; for IE_SOFT events only handlers whose ih_need was set are run.
  * 'p' is only used for the KTR trace record.
  */
 void
 intr_event_execute_handlers(struct proc *p, struct intr_event *ie)
 {
 	struct intr_handler *ih, *ihn;
 
 	/* The _SAFE variant is needed because entries may be removed below. */
 	TAILQ_FOREACH_SAFE(ih, &ie->ie_handlers, ih_next, ihn) {
 		/*
 		 * If this handler is marked for death, remove it from
 		 * the list of handlers and wake up the sleeper.
 		 */
 		if (ih->ih_flags & IH_DEAD) {
 			mtx_lock(&ie->ie_lock);
 			TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next);
 			ih->ih_flags &= ~IH_DEAD;
 			wakeup(ih);
 			mtx_unlock(&ie->ie_lock);
 			continue;
 		}
 
 		/* Skip filter only handlers */
 		if (ih->ih_handler == NULL)
 			continue;
 
 		/*
 		 * For software interrupt threads, we only execute
 		 * handlers that have their need flag set.  Hardware
 		 * interrupt threads always invoke all of their handlers.
 		 *
 		 * ih_need can only be 0 or 1.  Failed cmpset below
 		 * means that there is no request to execute handlers,
 		 * so a retry of the cmpset is not needed.
 		 */
 		if ((ie->ie_flags & IE_SOFT) != 0 &&
 		    atomic_cmpset_int(&ih->ih_need, 1, 0) == 0)
 			continue;
 
 		/* Execute this handler. */
 		CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x",
 		    __func__, p->p_pid, (void *)ih->ih_handler, 
 		    ih->ih_argument, ih->ih_name, ih->ih_flags);
 
 		/* Handlers not marked MP-safe are serialized by Giant. */
 		if (!(ih->ih_flags & IH_MPSAFE))
 			mtx_lock(&Giant);
 		ih->ih_handler(ih->ih_argument);
 		if (!(ih->ih_flags & IH_MPSAFE))
 			mtx_unlock(&Giant);
 	}
 }
 
 /*
  * One service pass of an interrupt thread over event 'ie': run the
  * handlers, apply interrupt storm throttling for hardware events, and
  * finally call the event's ie_post_ithread hook if one is set.
  */
 static void
 ithread_execute_handlers(struct proc *p, struct intr_event *ie)
 {
 
 	/* Interrupt handlers should not sleep. */
 	if (!(ie->ie_flags & IE_SOFT))
 		THREAD_NO_SLEEPING();
 	intr_event_execute_handlers(p, ie);
 	if (!(ie->ie_flags & IE_SOFT))
 		THREAD_SLEEPING_OK();
 
 	/*
 	 * Interrupt storm handling:
 	 *
 	 * If this interrupt source is currently storming, then throttle
 	 * it to only fire the handler once per clock tick.
 	 *
 	 * If this interrupt source is not currently storming, but the
 	 * number of back to back interrupts exceeds the storm threshold,
 	 * then enter storming mode.
 	 */
 	if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold &&
 	    !(ie->ie_flags & IE_SOFT)) {
 		/* Report the message only once every second. */
 		if (ppsratecheck(&ie->ie_warntm, &ie->ie_warncnt, 1)) {
 			printf(
 	"interrupt storm detected on \"%s\"; throttling interrupt source\n",
 			    ie->ie_name);
 		}
 		pause("istorm", 1);
 	} else
 		ie->ie_count++;	/* counted for soft events too, but never acted on */
 
 	/*
 	 * Now that all the handlers have had a chance to run, reenable
 	 * the interrupt source.
 	 */
 	if (ie->ie_post_ithread != NULL)
 		ie->ie_post_ithread(ie->ie_source);
 }
 
 #ifndef INTR_FILTER
 /*
  * This is the main code for interrupt threads (variant built when
  * INTR_FILTER is not defined).
  *
  * 'arg' is the struct intr_thread describing this thread.  The function
  * loops forever: it services the event while it_need is set, then parks
  * itself in interrupt wait.  It leaves only through kthread_exit() once
  * IT_DEAD is set in it_flags.
  */
 static void
 ithread_loop(void *arg)
 {
 	struct intr_thread *ithd;
 	struct intr_event *ie;
 	struct thread *td;
 	struct proc *p;
 	int wake;
 
 	td = curthread;
 	p = td->td_proc;
 	ithd = (struct intr_thread *)arg;
 	KASSERT(ithd->it_thread == td,
 	    ("%s: ithread and proc linkage out of sync", __func__));
 	ie = ithd->it_event;
 	ie->ie_count = 0;
 	wake = 0;
 
 	/*
 	 * As long as we have interrupts outstanding, go through the
 	 * list of handlers, giving each one a go at it.
 	 */
 	for (;;) {
 		/*
 		 * If we are an orphaned thread, then just die.
 		 */
 		if (ithd->it_flags & IT_DEAD) {
 			CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__,
 			    p->p_pid, td->td_name);
 			free(ithd, M_ITHREAD);
 			kthread_exit();
 		}
 
 		/*
 		 * Service interrupts.  If another interrupt arrives while
 		 * we are running, it will set it_need to note that we
 		 * should make another pass.
 		 *
 		 * The load_acq part of the following cmpset ensures
 		 * that the load of ih_need in ithread_execute_handlers()
 		 * is ordered after the load of it_need here.
 		 */
 		while (atomic_cmpset_acq_int(&ithd->it_need, 1, 0) != 0)
 			ithread_execute_handlers(p, ie);
 		WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread");
 		mtx_assert(&Giant, MA_NOTOWNED);
 
 		/*
 		 * Processed all our interrupts.  Now get the sched
 		 * lock.  This may take a while and it_need may get
 		 * set again, so we have to check it again.
 		 */
 		thread_lock(td);
 		if (atomic_load_acq_int(&ithd->it_need) == 0 &&
 		    (ithd->it_flags & (IT_DEAD | IT_WAIT)) == 0) {
 			/* Nothing pending: park until scheduled again. */
 			TD_SET_IWAIT(td);
 			ie->ie_count = 0;
 			mi_switch(SW_VOL | SWT_IWAIT, NULL);
 		}
 		/*
 		 * IT_WAIT is consumed here; the wakeup itself is issued
 		 * after the thread lock has been dropped.
 		 */
 		if (ithd->it_flags & IT_WAIT) {
 			wake = 1;
 			ithd->it_flags &= ~IT_WAIT;
 		}
 		thread_unlock(td);
 		if (wake) {
 			wakeup(ithd);
 			wake = 0;
 		}
 	}
 }
 
 /*
  * Main interrupt handling body (variant built when INTR_FILTER is not
  * defined).
  *
  * Input:
  * o ie:                        the event connected to this interrupt.
  * o frame:                     some archs (e.g. i386) pass a frame to some
  *                              handlers as their main argument.
  * Return value:
  * o 0:                         everything ok.
  * o EINVAL:                    stray interrupt.
  */
 int
 intr_event_handle(struct intr_event *ie, struct trapframe *frame)
 {
 	struct intr_handler *ih;
 	struct trapframe *oldframe;
 	struct thread *td;
 	int error, ret, thread;
 
 	td = curthread;
 
 #ifdef KSTACK_USAGE_PROF
 	intr_prof_stack_use(td, frame);
 #endif
 
 	/* An interrupt with no event or handlers is a stray interrupt. */
 	if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers))
 		return (EINVAL);
 
 	/*
 	 * Execute fast interrupt handlers directly.
 	 * To support clock handlers, if a handler registers
 	 * with a NULL argument, then we pass it a pointer to
 	 * a trapframe as its argument.
 	 */
 	td->td_intr_nesting_level++;
 	thread = 0;
 	ret = 0;
 	critical_enter();
 	/* Publish the trap frame for the duration of the filters. */
 	oldframe = td->td_intr_frame;
 	td->td_intr_frame = frame;
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		/* A handler without a filter always needs the ithread. */
 		if (ih->ih_filter == NULL) {
 			thread = 1;
 			continue;
 		}
 		CTR4(KTR_INTR, "%s: exec %p(%p) for %s", __func__,
 		    ih->ih_filter, ih->ih_argument == NULL ? frame :
 		    ih->ih_argument, ih->ih_name);
 		if (ih->ih_argument == NULL)
 			ret = ih->ih_filter(frame);
 		else
 			ret = ih->ih_filter(ih->ih_argument);
 		KASSERT(ret == FILTER_STRAY ||
 		    ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 &&
 		    (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0),
 		    ("%s: incorrect return value %#x from %s", __func__, ret,
 		    ih->ih_name));
 
 		/*
 		 * Wrapper handler special handling:
 		 *
 		 * in some particular cases (like pccard and pccbb),
 		 * the _real_ device handler is wrapped in a couple of
 		 * functions - a filter wrapper and an ithread wrapper.
 		 * In this case (and just in this case), the filter wrapper
 		 * could ask the system to schedule the ithread and mask
 		 * the interrupt source if the wrapped handler is composed
 		 * of just an ithread handler.
 		 *
 		 * TODO: write a generic wrapper to avoid people rolling
 		 * their own
 		 */
 		if (!thread) {
 			if (ret == FILTER_SCHEDULE_THREAD)
 				thread = 1;
 		}
 	}
 	td->td_intr_frame = oldframe;
 
 	/* Exactly one of the two source hooks is invoked per interrupt. */
 	if (thread) {
 		if (ie->ie_pre_ithread != NULL)
 			ie->ie_pre_ithread(ie->ie_source);
 	} else {
 		if (ie->ie_post_filter != NULL)
 			ie->ie_post_filter(ie->ie_source);
 	}
 	
 	/* Schedule the ithread if needed. */
 	if (thread) {
 		error = intr_event_schedule_thread(ie);
 		KASSERT(error == 0, ("bad stray interrupt"));
 	}
 	critical_exit();
 	td->td_intr_nesting_level--;
 	return (0);
 }
 #else
 /*
  * This is the main code for interrupt threads (INTR_FILTER variant).
  *
  * 'arg' is a struct intr_handler.  If that handler has its own ih_thread
  * the loop serves only that handler ("private" mode); otherwise it serves
  * every handler of the handler's event through the event's ie_thread.
  * The function loops forever and leaves only through kthread_exit() once
  * IT_DEAD is set in it_flags.
  */
 static void
 ithread_loop(void *arg)
 {
 	struct intr_thread *ithd;
 	struct intr_handler *ih;
 	struct intr_event *ie;
 	struct thread *td;
 	struct proc *p;
 	int priv;
 	int wake;
 
 	td = curthread;
 	p = td->td_proc;
 	ih = (struct intr_handler *)arg;
 	/* priv != 0 means this thread belongs to one handler only. */
 	priv = (ih->ih_thread != NULL) ? 1 : 0;
 	ithd = (priv) ? ih->ih_thread : ih->ih_event->ie_thread;
 	KASSERT(ithd->it_thread == td,
 	    ("%s: ithread and proc linkage out of sync", __func__));
 	ie = ithd->it_event;
 	ie->ie_count = 0;
 	wake = 0;
 
 	/*
 	 * As long as we have interrupts outstanding, go through the
 	 * list of handlers, giving each one a go at it.
 	 */
 	for (;;) {
 		/*
 		 * If we are an orphaned thread, then just die.
 		 */
 		if (ithd->it_flags & IT_DEAD) {
 			CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__,
 			    p->p_pid, td->td_name);
 			free(ithd, M_ITHREAD);
 			kthread_exit();
 		}
 
 		/*
 		 * Service interrupts.  If another interrupt arrives while
 		 * we are running, it will set it_need to note that we
 		 * should make another pass.
 		 *
 		 * The load_acq part of the following cmpset ensures
 		 * that the load of ih_need in ithread_execute_handlers()
 		 * is ordered after the load of it_need here.
 		 */
 		while (atomic_cmpset_acq_int(&ithd->it_need, 1, 0) != 0) {
 			if (priv)
 				priv_ithread_execute_handler(p, ih);
 			else 
 				ithread_execute_handlers(p, ie);
 		}
 		WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread");
 		mtx_assert(&Giant, MA_NOTOWNED);
 
 		/*
 		 * Processed all our interrupts.  Now get the sched
 		 * lock.  This may take a while and it_need may get
 		 * set again, so we have to check it again.
 		 */
 		thread_lock(td);
 		if (atomic_load_acq_int(&ithd->it_need) == 0 &&
 		    (ithd->it_flags & (IT_DEAD | IT_WAIT)) == 0) {
 			/* Nothing pending: park until scheduled again. */
 			TD_SET_IWAIT(td);
 			ie->ie_count = 0;
 			mi_switch(SW_VOL | SWT_IWAIT, NULL);
 		}
 		/*
 		 * IT_WAIT is consumed here; the wakeup itself is issued
 		 * after the thread lock has been dropped.
 		 */
 		if (ithd->it_flags & IT_WAIT) {
 			wake = 1;
 			ithd->it_flags &= ~IT_WAIT;
 		}
 		thread_unlock(td);
 		if (wake) {
 			wakeup(ithd);
 			wake = 0;
 		}
 	}
 }
 
 /*
  * Run the filters attached to an interrupt event.
  *
  * Each handler's filter is called in list order with the handler's
  * ih_argument, or with 'frame' when ih_argument is NULL (some
  * architectures rely on this for fast handlers such as the clock).
  * Handlers that have no filter are only noted.
  *
  * The first filter whose result does not carry FILTER_STRAY ends the walk:
  * its result is returned unchanged and *ithd is set to that handler's
  * ih_thread.  Such a result is FILTER_HANDLED, FILTER_SCHEDULE_THREAD, or
  * both.
  *
  * If no filter claims the interrupt:
  * o FILTER_SCHEDULE_THREAD is returned, with *ithd set to the event's
  *   ie_thread, when at least one filter-less handler is registered;
  * o FILTER_STRAY is returned otherwise, and *ithd is left untouched.
  */
 
 static int
 intr_filter_loop(struct intr_event *ie, struct trapframe *frame,
 		 struct intr_thread **ithd)
 {
 	struct intr_handler *ih;
 	void *filter_arg;
 	int have_threaded, verdict;
 
 	have_threaded = 0;
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		/* A NULL argument means "pass the trap frame instead". */
 		filter_arg = ih->ih_argument;
 		if (filter_arg == NULL)
 			filter_arg = frame;
 
 		CTR5(KTR_INTR, "%s: exec %p/%p(%p) for %s", __func__,
 		     ih->ih_filter, ih->ih_handler, filter_arg, ih->ih_name);
 
 		/* Threaded-only handler: remember it and move on. */
 		if (ih->ih_filter == NULL) {
 			have_threaded = 1;
 			continue;
 		}
 
 		verdict = ih->ih_filter(filter_arg);
 		KASSERT(verdict == FILTER_STRAY ||
 		    ((verdict & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 &&
 		    (verdict & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0),
 		    ("%s: incorrect return value %#x from %s", __func__, verdict,
 		    ih->ih_name));
 
 		/* The first filter that claims the interrupt wins. */
 		if ((verdict & FILTER_STRAY) == 0) {
 			*ithd = ih->ih_thread;
 			return (verdict);
 		}
 	}
 
 	/*
 	 * Nobody claimed it.  If there are filter-less handlers, have the
 	 * event's ithread run all of them.
 	 */
 	if (have_threaded) {
 		*ithd = ie->ie_thread;
 		return (FILTER_SCHEDULE_THREAD);
 	}
 	return (FILTER_STRAY);
 }
 
 /*
  * Main interrupt handling body (INTR_FILTER variant).
  *
  * Input:
  * o ie:                        the event connected to this interrupt.
  * o frame:                     some archs (e.g. i386) pass a frame to some
  *                              handlers as their main argument.
  * Return value:
  * o 0:                         everything ok.
  * o EINVAL:                    stray interrupt (no event or no handlers).
  */
 int
 intr_event_handle(struct intr_event *ie, struct trapframe *frame)
 {
 	struct intr_thread *ithd;
 	struct trapframe *oldframe;
 	struct thread *td;
 	int thread;
 
 	ithd = NULL;
 	td = curthread;
 
 	if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers))
 		return (EINVAL);
 
 	td->td_intr_nesting_level++;
 	thread = 0;	/* overwritten by the intr_filter_loop() result below */
 	critical_enter();
 	/* Publish the trap frame for the duration of the filters. */
 	oldframe = td->td_intr_frame;
 	td->td_intr_frame = frame;
 	thread = intr_filter_loop(ie, frame, &ithd);	
 	/* Exactly one of the two source hooks is invoked per interrupt. */
 	if (thread & FILTER_HANDLED) {
 		if (ie->ie_post_filter != NULL)
 			ie->ie_post_filter(ie->ie_source);
 	} else {
 		if (ie->ie_pre_ithread != NULL)
 			ie->ie_pre_ithread(ie->ie_source);
 	}
 	td->td_intr_frame = oldframe;
 	critical_exit();
 	
 	/*
 	 * Interrupt storm logic
 	 *
 	 * NOTE(review): the message is printed while ie_count is still
 	 * BELOW intr_storm_threshold, i.e. for the first strays rather
 	 * than once the threshold is exceeded -- confirm this is intended.
 	 */
 	if (thread & FILTER_STRAY) {
 		ie->ie_count++;
 		if (ie->ie_count < intr_storm_threshold)
 			printf("Interrupt stray detection not present\n");
 	}
 
 	/* Schedule an ithread if needed. */
 	if (thread & FILTER_SCHEDULE_THREAD) {
 		if (intr_event_schedule_thread(ie, ithd) != 0)
 			panic("%s: impossible stray interrupt", __func__);
 	}
 	td->td_intr_nesting_level--;
 	return (0);
 }
 #endif
 
 #ifdef DDB
 /*
  * DDB helper: print one line describing an interrupt handler -- its name,
  * priority class, filter and/or handler symbols, argument, and a
  * brace-enclosed list of set flags (plus NEED when ih_need is set).
  */
 static void
 db_dump_intrhand(struct intr_handler *ih)
 {
 	static const struct {
 		int		bit;
 		const char	*label;
 	} flagtab[] = {
 		{ IH_EXCLUSIVE,	"EXCL" },
 		{ IH_ENTROPY,	"ENTROPY" },
 		{ IH_DEAD,	"DEAD" },
 		{ IH_MPSAFE,	"MPSAFE" },
 	};
 	const char *pritag;
 	unsigned int idx;
 	int comma;
 
 	db_printf("\t%-10s ", ih->ih_name);
 
 	/* Map the priority to a four-character tag where one exists. */
 	pritag = NULL;
 	switch (ih->ih_pri) {
 	case PI_REALTIME:
 		pritag = "CLK ";
 		break;
 	case PI_AV:
 		pritag = "AV  ";
 		break;
 	case PI_TTY:
 		pritag = "TTY ";
 		break;
 	case PI_NET:
 		pritag = "NET ";
 		break;
 	case PI_DISK:
 		pritag = "DISK";
 		break;
 	case PI_DULL:
 		pritag = "DULL";
 		break;
 	default:
 		if (ih->ih_pri >= PI_SOFT)
 			pritag = "SWI ";
 		break;
 	}
 	if (pritag != NULL)
 		db_printf("%s", pritag);
 	else
 		db_printf("%4u", ih->ih_pri);
 	db_printf(" ");
 
 	/* Filter and handler symbols, comma-separated when both exist. */
 	if (ih->ih_filter != NULL) {
 		db_printf("[F]");
 		db_printsym((uintptr_t)ih->ih_filter, DB_STGY_PROC);
 	}
 	if (ih->ih_handler != NULL) {
 		if (ih->ih_filter != NULL)
 			db_printf(",");
 		db_printf("[H]");
 		db_printsym((uintptr_t)ih->ih_handler, DB_STGY_PROC);
 	}
 	db_printf("(%p)", ih->ih_argument);
 
 	/* Nothing to list: finish the line without braces. */
 	if (!ih->ih_need &&
 	    (ih->ih_flags & (IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD |
 	    IH_MPSAFE)) == 0) {
 		db_printf("\n");
 		return;
 	}
 
 	db_printf(" {");
 	comma = 0;
 	for (idx = 0; idx < sizeof(flagtab) / sizeof(flagtab[0]); idx++) {
 		if ((ih->ih_flags & flagtab[idx].bit) == 0)
 			continue;
 		if (comma)
 			db_printf(", ");
 		db_printf("%s", flagtab[idx].label);
 		comma = 1;
 	}
 	if (ih->ih_need) {
 		if (comma)
 			db_printf(", ");
 		db_printf("NEED");
 	}
 	db_printf("}");
 	db_printf("\n");
 }
 
 /*
  * DDB helper: print a summary line for an interrupt event -- its full
  * name, the pid of its thread (or "(no thread)"), and a brace-enclosed
  * list of event flags.  When 'handlers' is non-zero every attached
  * handler is dumped as well.
  */
 void
 db_dump_intr_event(struct intr_event *ie, int handlers)
 {
 	struct intr_handler *ih;
 	struct intr_thread *ithd;
 	int comma, need;
 
 	ithd = ie->ie_thread;
 	need = (ithd != NULL && ithd->it_need);
 
 	db_printf("%s ", ie->ie_fullname);
 	if (ithd == NULL)
 		db_printf("(no thread)");
 	else
 		db_printf("(pid %d)", ithd->it_thread->td_proc->p_pid);
 
 	if (need ||
 	    (ie->ie_flags & (IE_SOFT | IE_ENTROPY | IE_ADDING_THREAD)) != 0) {
 		comma = 0;
 		db_printf(" {");
 		if ((ie->ie_flags & IE_SOFT) != 0) {
 			db_printf("SOFT");
 			comma = 1;
 		}
 		if ((ie->ie_flags & IE_ENTROPY) != 0) {
 			if (comma)
 				db_printf(", ");
 			db_printf("ENTROPY");
 			comma = 1;
 		}
 		if ((ie->ie_flags & IE_ADDING_THREAD) != 0) {
 			if (comma)
 				db_printf(", ");
 			db_printf("ADDING_THREAD");
 			comma = 1;
 		}
 		if (need) {
 			if (comma)
 				db_printf(", ");
 			db_printf("NEED");
 		}
 		db_printf("}");
 	}
 	db_printf("\n");
 
 	if (!handlers)
 		return;
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next)
 		db_dump_intrhand(ih);
 }
 
 /*
  * DDB "show intr": list interrupt events.  Modifier 'a' includes events
  * that have no handlers; modifier 'v' also dumps each event's handlers.
  */
 DB_SHOW_COMMAND(intr, db_show_intr)
 {
 	struct intr_event *ev;
 	int show_all, show_handlers;
 
 	show_handlers = (strchr(modif, 'v') != NULL);
 	show_all = (strchr(modif, 'a') != NULL);
 
 	TAILQ_FOREACH(ev, &event_list, ie_list) {
 		if (show_all || !TAILQ_EMPTY(&ev->ie_handlers)) {
 			db_dump_intr_event(ev, show_handlers);
 			/* Stop as soon as the pager has been quit. */
 			if (db_pager_quit)
 				break;
 		}
 	}
 }
 #endif /* DDB */
 
 /*
  * Start standard software interrupt threads
  */
 static void
 start_softintr(void *dummy)
 {
 
 	if (swi_add(NULL, "vm", swi_vm, NULL, SWI_VM, INTR_MPSAFE, &vm_ih))
 		panic("died while creating vm swi ithread");
 }
 SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr,
     NULL);
 
 /*
  * Sysctls used by systat and others: hw.intrnames and hw.intrcnt.
  * The data for these is machine dependent, and the declarations are in
  * machine dependent code.  The layout of intrnames and intrcnt, however,
  * is machine independent.
  *
  * We do not know the length of intrcnt and intrnames at compile time, so
  * calculate things at run time.
  */
 static int
 sysctl_intrnames(SYSCTL_HANDLER_ARGS)
 {
 	return (sysctl_handle_opaque(oidp, intrnames, sintrnames, req));
 }
 
 SYSCTL_PROC(_hw, OID_AUTO, intrnames, CTLTYPE_OPAQUE | CTLFLAG_RD,
     NULL, 0, sysctl_intrnames, "", "Interrupt Names");
 
 /*
  * hw.intrcnt: export the intrcnt array (sintrcnt bytes) as an opaque,
  * read-only sysctl.
  *
  * When SCTL_MASK32 is available and set in the request, the counters are
  * narrowed into a temporary uint32_t array of half the size before being
  * copied out.  ENOMEM is returned if that array cannot be allocated.
  */
 static int
 sysctl_intrcnt(SYSCTL_HANDLER_ARGS)
 {
 #ifdef SCTL_MASK32
 	uint32_t *counts32;
 	unsigned idx;
 	int rv;
 
 	/* Requests without SCTL_MASK32 get the array unchanged. */
 	if ((req->flags & SCTL_MASK32) == 0)
 		return (sysctl_handle_opaque(oidp, intrcnt, sintrcnt, req));
 
 	/* Size probe only: no buffer needs to be built. */
 	if (req->oldptr == NULL)
 		return (sysctl_handle_opaque(oidp, NULL, sintrcnt / 2, req));
 
 	counts32 = malloc(sintrcnt / 2, M_TEMP, M_NOWAIT);
 	if (counts32 == NULL)
 		return (ENOMEM);
 	for (idx = 0; idx < sintrcnt / sizeof (u_long); idx++)
 		counts32[idx] = intrcnt[idx];
 	rv = sysctl_handle_opaque(oidp, counts32, sintrcnt / 2, req);
 	free(counts32, M_TEMP);
 	return (rv);
 #else
 	return (sysctl_handle_opaque(oidp, intrcnt, sintrcnt, req));
 #endif
 }
 
 SYSCTL_PROC(_hw, OID_AUTO, intrcnt, CTLTYPE_OPAQUE | CTLFLAG_RD,
     NULL, 0, sysctl_intrcnt, "", "Interrupt Counts");
 
 #ifdef DDB
 /*
  * DDB command to dump the interrupt statistics.
  */
 DB_SHOW_COMMAND(intrcnt, db_show_intrcnt)
 {
 	u_long *i;
 	char *cp;
 	u_int j;
 
 	cp = intrnames;
 	j = 0;
 	for (i = intrcnt; j < (sintrcnt / sizeof(u_long)) && !db_pager_quit;
 	    i++, j++) {
 		if (*cp == '\0')
 			break;
 		if (*i != 0)
 			db_printf("%s\t%lu\n", cp, *i);
 		cp += strlen(cp) + 1;
 	}
 }
 #endif
Index: stable/11/sys/kern/subr_gtaskqueue.c
===================================================================
--- stable/11/sys/kern/subr_gtaskqueue.c	(revision 333337)
+++ stable/11/sys/kern/subr_gtaskqueue.c	(revision 333338)
@@ -1,965 +1,979 @@
 /*-
  * Copyright (c) 2000 Doug Rabson
  * Copyright (c) 2014 Jeff Roberson
  * Copyright (c) 2016 Matthew Macy
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cpuset.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/libkern.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/gtaskqueue.h>
 #include <sys/unistd.h>
 #include <machine/stdarg.h>
 
-static MALLOC_DEFINE(M_GTASKQUEUE, "taskqueue", "Task Queues");
+static MALLOC_DEFINE(M_GTASKQUEUE, "gtaskqueue", "Group Task Queues");
 /*
  * Forward declarations: both functions are referenced further down in
  * this file before their definitions.
  */
 static void	gtaskqueue_thread_enqueue(void *);
 static void	gtaskqueue_thread_loop(void *arg);
 
 /*
  * NOTE(review): presumably instantiates the "softirq" task queue group;
  * TASKQGROUP_DEFINE is defined elsewhere -- confirm the meaning of the
  * mp_ncpus and 1 arguments against its definition.
  */
 TASKQGROUP_DEFINE(softirq, mp_ncpus, 1);
 
 /*
  * Entry on a queue's tq_active list.  gtaskqueue_run_locked() links one
  * while it runs a task, with tb_running pointing at that task.
  * gtaskqueue_drain_tq_active() links one as a marker, with tb_running
  * set to TB_DRAIN_WAITER.
  */
 struct gtaskqueue_busy {
 	struct gtask	*tb_running;
 	TAILQ_ENTRY(gtaskqueue_busy) tb_link;
 };
 
 /* Sentinel tb_running value identifying a drain waiter's marker entry. */
 static struct gtask * const TB_DRAIN_WAITER = (struct gtask *)0x1;
 
 /*
  * A group task queue.
  *
  * tq_queue     pending tasks, in enqueue order.
  * tq_enqueue   hook called with tq_context after a task has been queued.
  * tq_name      allocated copy of the queue name; also names tq_mutex.
  * tq_active    busy records of running tasks, plus drain markers.
  * tq_mutex     protects the queue; a spin mutex when tq_spin is set.
  * tq_threads   array of worker thread pointers.
  * tq_tcount    gtaskqueue_terminate() waits for this to reach zero
  *              (presumably the live worker count -- confirm).
  * tq_flags     TQ_FLAGS_* bits.
  * tq_callouts  raised while a drain waiter is present so that
  *              gtaskqueue_terminate() keeps waiting.
  * tq_callbacks / tq_cb_contexts
  *              NOTE(review): not used in the code visible here.
  */
 struct gtaskqueue {
 	STAILQ_HEAD(, gtask)	tq_queue;
 	gtaskqueue_enqueue_fn	tq_enqueue;
 	void			*tq_context;
 	char			*tq_name;
 	TAILQ_HEAD(, gtaskqueue_busy) tq_active;
 	struct mtx		tq_mutex;
 	struct thread		**tq_threads;
 	int			tq_tcount;
 	int			tq_spin;
 	int			tq_flags;
 	int			tq_callouts;
 	taskqueue_callback_fn	tq_callbacks[TASKQUEUE_NUM_CALLBACKS];
 	void			*tq_cb_contexts[TASKQUEUE_NUM_CALLBACKS];
 };
 
 /*
  * tq_flags bits.
  * ACTIVE:            set at creation, cleared by gtaskqueue_free().
  * BLOCKED:           set/cleared by gtaskqueue_block()/_unblock(); while
  *                    set, enqueueing does not call the tq_enqueue hook.
  * UNLOCKED_ENQUEUE:  set at creation when the enqueue hook is
  *                    gtaskqueue_thread_enqueue.
  */
 #define	TQ_FLAGS_ACTIVE		(1 << 0)
 #define	TQ_FLAGS_BLOCKED	(1 << 1)
 #define	TQ_FLAGS_UNLOCKED_ENQUEUE	(1 << 2)
 
 /* NOTE(review): not referenced in the code visible here. */
 #define	DT_CALLOUT_ARMED	(1 << 0)
 
 /* Acquire the queue mutex, as a spin or a regular mutex per tq_spin. */
 #define	TQ_LOCK(tq)							\
 	do {								\
 		if ((tq)->tq_spin)					\
 			mtx_lock_spin(&(tq)->tq_mutex);			\
 		else							\
 			mtx_lock(&(tq)->tq_mutex);			\
 	} while (0)
 #define	TQ_ASSERT_LOCKED(tq)	mtx_assert(&(tq)->tq_mutex, MA_OWNED)
 
 /* Release the queue mutex, matching the flavour used by TQ_LOCK(). */
 #define	TQ_UNLOCK(tq)							\
 	do {								\
 		if ((tq)->tq_spin)					\
 			mtx_unlock_spin(&(tq)->tq_mutex);		\
 		else							\
 			mtx_unlock(&(tq)->tq_mutex);			\
 	} while (0)
 #define	TQ_ASSERT_UNLOCKED(tq)	mtx_assert(&(tq)->tq_mutex, MA_NOTOWNED)
 
 #ifdef INVARIANTS
 /* Debug aid: print the fields of a gtask on the console. */
 static void
 gtask_dump(struct gtask *gtask)
 {
 
 	printf(
 	    "gtask: %p ta_flags=%x ta_priority=%d ta_func=%p ta_context=%p\n",
 	    gtask,
 	    gtask->ta_flags,
 	    gtask->ta_priority,
 	    gtask->ta_func,
 	    gtask->ta_context);
 }
 #endif
 
 static __inline int
 TQ_SLEEP(struct gtaskqueue *tq, void *p, struct mtx *m, int pri, const char *wm,
     int t)
 {
 	if (tq->tq_spin)
 		return (msleep_spin(p, m, wm, t));
 	return (msleep(p, m, pri, wm, t));
 }
 
 /*
  * Allocate and initialize a group task queue.
  *
  * name      queue name; "taskqueue" is used when NULL.  It is copied into
  *           a TASKQUEUE_NAMELEN buffer and also names the queue mutex.
  * mflags    malloc(9) flags for both allocations (M_ZERO is added).
  * enqueue   hook to call, with 'context', after a task has been queued.
  * mtxflags  flags for mtx_init(); MTX_SPIN selects spin-lock mode.
  * mtxname   unused.
  *
  * Returns the new queue, or NULL if either allocation fails.
  */
 static struct gtaskqueue *
 _gtaskqueue_create(const char *name, int mflags,
 		 taskqueue_enqueue_fn enqueue, void *context,
 		 int mtxflags, const char *mtxname __unused)
 {
 	struct gtaskqueue *queue;
 	char *tq_name;
 
 	tq_name = malloc(TASKQUEUE_NAMELEN, M_GTASKQUEUE, mflags | M_ZERO);
 	if (!tq_name)
 		return (NULL);
 
 	snprintf(tq_name, TASKQUEUE_NAMELEN, "%s", (name) ? name : "taskqueue");
 
 	queue = malloc(sizeof(struct gtaskqueue), M_GTASKQUEUE, mflags | M_ZERO);
-	if (!queue)
+	if (!queue) {
+		free(tq_name, M_GTASKQUEUE);
 		return (NULL);
+	}
 
 	STAILQ_INIT(&queue->tq_queue);
 	TAILQ_INIT(&queue->tq_active);
 	queue->tq_enqueue = enqueue;
 	queue->tq_context = context;
 	queue->tq_name = tq_name;
 	queue->tq_spin = (mtxflags & MTX_SPIN) != 0;
 	queue->tq_flags |= TQ_FLAGS_ACTIVE;
 	/* Remember when the thread-backed enqueue hook is in use. */
 	if (enqueue == gtaskqueue_thread_enqueue)
 		queue->tq_flags |= TQ_FLAGS_UNLOCKED_ENQUEUE;
 	mtx_init(&queue->tq_mutex, tq_name, NULL, mtxflags);
 
 	return (queue);
 }
 
 
 /*
  * Wait for a task queue's threads to terminate.
  *
  * While tq_tcount or tq_callouts is still positive, wake whoever sleeps on
  * the queue and then sleep on 'pp' with the queue mutex as interlock.
  */
 static void
 gtaskqueue_terminate(struct thread **pp, struct gtaskqueue *tq)
 {
 
 	for (;;) {
 		if (tq->tq_tcount <= 0 && tq->tq_callouts <= 0)
 			break;
 		wakeup(tq);
 		TQ_SLEEP(tq, pp, &tq->tq_mutex, PWAIT, "taskqueue_destroy", 0);
 	}
 }
 
 /*
  * Tear down a task queue: mark it inactive, wait for its threads and any
  * drain waiters to go away, then destroy the mutex and release the thread
  * array, the name buffer and the queue itself.
  */
 static void
 gtaskqueue_free(struct gtaskqueue *queue)
 {
 
 	TQ_LOCK(queue);
 	queue->tq_flags &= ~TQ_FLAGS_ACTIVE;
 	gtaskqueue_terminate(queue->tq_threads, queue);
 	KASSERT(TAILQ_EMPTY(&queue->tq_active), ("Tasks still running?"));
 	KASSERT(queue->tq_callouts == 0, ("Armed timeout tasks"));
 	/* The mutex is destroyed while still held; there is no TQ_UNLOCK. */
 	mtx_destroy(&queue->tq_mutex);
 	free(queue->tq_threads, M_GTASKQUEUE);
 	free(queue->tq_name, M_GTASKQUEUE);
 	free(queue, M_GTASKQUEUE);
 }
 
 /*
  * Queue 'gtask' on 'queue' unless it is already queued.
  *
  * A newly queued task is appended to the tail and flagged TASK_ENQUEUED.
  * Once the lock has been dropped, the queue's enqueue hook is called,
  * unless the queue is blocked.  Always returns 0.
  */
 int
 grouptaskqueue_enqueue(struct gtaskqueue *queue, struct gtask *gtask)
 {
 	int already;
 
 #ifdef INVARIANTS
 	if (queue == NULL) {
 		gtask_dump(gtask);
 		panic("queue == NULL");
 	}
 #endif
 	TQ_LOCK(queue);
 	already = ((gtask->ta_flags & TASK_ENQUEUED) != 0);
 	if (!already) {
 		STAILQ_INSERT_TAIL(&queue->tq_queue, gtask, ta_link);
 		gtask->ta_flags |= TASK_ENQUEUED;
 	}
 	TQ_UNLOCK(queue);
 
 	/* Nothing was added, so there is nobody to notify. */
 	if (already)
 		return (0);
 
 	if ((queue->tq_flags & TQ_FLAGS_BLOCKED) == 0)
 		queue->tq_enqueue(queue->tq_context);
 	return (0);
 }
 
 /* Task function that does nothing; used for the drain barrier task. */
 static void
 gtaskqueue_task_nop_fn(void *context)
 {
 
 	(void)context;
 }
 
 /*
  * Block until all currently queued tasks in this taskqueue
  * have begun execution.  Tasks queued during execution of
  * this function are ignored.
  *
  * The queue mutex is used as the sleep interlock; gtaskqueue_drain_all()
  * calls this with the queue locked.
  */
 static void
 gtaskqueue_drain_tq_queue(struct gtaskqueue *queue)
 {
 	struct gtask t_barrier;
 
 	if (STAILQ_EMPTY(&queue->tq_queue))
 		return;
 
 	/*
 	 * Enqueue our barrier after all current tasks, but with
 	 * the highest priority so that newly queued tasks cannot
 	 * pass it.  Because of the high priority, we can not use
 	 * taskqueue_enqueue_locked directly (which drops the lock
 	 * anyway) so just insert it at tail while we have the
 	 * queue lock.
 	 */
 	GTASK_INIT(&t_barrier, 0, USHRT_MAX, gtaskqueue_task_nop_fn, &t_barrier);
 	STAILQ_INSERT_TAIL(&queue->tq_queue, &t_barrier, ta_link);
 	t_barrier.ta_flags |= TASK_ENQUEUED;
 
 	/*
 	 * Once the barrier has executed, all previously queued tasks
 	 * have completed or are currently executing.
 	 *
 	 * gtaskqueue_run_locked() clears TASK_ENQUEUED when it dequeues
 	 * the barrier and calls wakeup() on the task after running it.
 	 */
 	while (t_barrier.ta_flags & TASK_ENQUEUED)
 		TQ_SLEEP(queue, &t_barrier, &queue->tq_mutex, PWAIT, "-", 0);
 }
 
 /*
  * Block until all currently executing tasks for this taskqueue
  * complete.  Tasks that begin execution during the execution
  * of this function are ignored.
  *
  * The queue mutex is used as the sleep interlock; gtaskqueue_drain_all()
  * calls this with the queue locked.
  */
 static void
 gtaskqueue_drain_tq_active(struct gtaskqueue *queue)
 {
 	struct gtaskqueue_busy tb_marker, *tb_first;
 
 	if (TAILQ_EMPTY(&queue->tq_active))
 		return;
 
 	/* Block gtaskqueue_terminate(), which waits for tq_callouts == 0. */
 	queue->tq_callouts++;
 
 	/*
 	 * Wait for all currently executing taskqueue threads
 	 * to go idle.
 	 *
 	 * The marker goes to the tail of the active list; once it has
 	 * become the head, every entry that was ahead of it is gone.
 	 */
 	tb_marker.tb_running = TB_DRAIN_WAITER;
 	TAILQ_INSERT_TAIL(&queue->tq_active, &tb_marker, tb_link);
 	while (TAILQ_FIRST(&queue->tq_active) != &tb_marker)
 		TQ_SLEEP(queue, &tb_marker, &queue->tq_mutex, PWAIT, "-", 0);
 	TAILQ_REMOVE(&queue->tq_active, &tb_marker, tb_link);
 
 	/*
 	 * Wakeup any other drain waiter that happened to queue up
 	 * without any intervening active thread.
 	 */
 	tb_first = TAILQ_FIRST(&queue->tq_active);
 	if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER)
 		wakeup(tb_first);
 
 	/* Release gtaskqueue_terminate(). */
 	queue->tq_callouts--;
 	if ((queue->tq_flags & TQ_FLAGS_ACTIVE) == 0)
 		wakeup_one(queue->tq_threads);
 }
 
 /*
  * Mark the queue as blocked.  While the flag is set,
  * grouptaskqueue_enqueue() still queues tasks but does not call the
  * queue's enqueue hook.
  */
 void
 gtaskqueue_block(struct gtaskqueue *queue)
 {
 
 	TQ_LOCK(queue);
 	queue->tq_flags = queue->tq_flags | TQ_FLAGS_BLOCKED;
 	TQ_UNLOCK(queue);
 }
 
 void
 gtaskqueue_unblock(struct gtaskqueue *queue)
 {
 
 	TQ_LOCK(queue);
 	queue->tq_flags &= ~TQ_FLAGS_BLOCKED;
 	if (!STAILQ_EMPTY(&queue->tq_queue))
 		queue->tq_enqueue(queue->tq_context);
 	TQ_UNLOCK(queue);
 }
 
 /*
  * Run queued tasks until the queue is empty.
  *
  * Must be called with the queue lock held.  The lock is dropped around
  * each task function and re-taken afterwards, so the function also
  * returns with the lock held.  While a task runs, a busy record naming it
  * sits on tq_active so that drain and cancel can see it.
  */
 static void
 gtaskqueue_run_locked(struct gtaskqueue *queue)
 {
 	struct gtaskqueue_busy tb;
 	struct gtaskqueue_busy *tb_first;
 	struct gtask *gtask;
 
 	KASSERT(queue != NULL, ("tq is NULL"));
 	TQ_ASSERT_LOCKED(queue);
 	tb.tb_running = NULL;
 
 	while (STAILQ_FIRST(&queue->tq_queue)) {
 		TAILQ_INSERT_TAIL(&queue->tq_active, &tb, tb_link);
 
 		/*
 		 * Carefully remove the first task from the queue and
 		 * clear its TASK_ENQUEUED flag
 		 */
 		gtask = STAILQ_FIRST(&queue->tq_queue);
 		KASSERT(gtask != NULL, ("task is NULL"));
 		STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link);
 		gtask->ta_flags &= ~TASK_ENQUEUED;
 		tb.tb_running = gtask;
 		TQ_UNLOCK(queue);
 
 		KASSERT(gtask->ta_func != NULL, ("task->ta_func is NULL"));
 		gtask->ta_func(gtask->ta_context);
 
 		TQ_LOCK(queue);
 		tb.tb_running = NULL;
 		/* Wake anyone sleeping on this task (drain, drain barrier). */
 		wakeup(gtask);
 
 		/* If a drain marker is now at the head, wake its waiter. */
 		TAILQ_REMOVE(&queue->tq_active, &tb, tb_link);
 		tb_first = TAILQ_FIRST(&queue->tq_active);
 		if (tb_first != NULL &&
 		    tb_first->tb_running == TB_DRAIN_WAITER)
 			wakeup(tb_first);
 	}
 }
 
 static int
 task_is_running(struct gtaskqueue *queue, struct gtask *gtask)
 {
 	struct gtaskqueue_busy *tb;
 
 	TQ_ASSERT_LOCKED(queue);
 	TAILQ_FOREACH(tb, &queue->tq_active, tb_link) {
 		if (tb->tb_running == gtask)
 			return (1);
 	}
 	return (0);
 }
 
 static int
 gtaskqueue_cancel_locked(struct gtaskqueue *queue, struct gtask *gtask)
 {
 
 	if (gtask->ta_flags & TASK_ENQUEUED)
 		STAILQ_REMOVE(&queue->tq_queue, gtask, gtask, ta_link);
 	gtask->ta_flags &= ~TASK_ENQUEUED;
 	return (task_is_running(queue, gtask) ? EBUSY : 0);
 }
 
 int
 gtaskqueue_cancel(struct gtaskqueue *queue, struct gtask *gtask)
 {
 	int error;
 
 	TQ_LOCK(queue);
 	error = gtaskqueue_cancel_locked(queue, gtask);
 	TQ_UNLOCK(queue);
 
 	return (error);
 }
 
 void
 gtaskqueue_drain(struct gtaskqueue *queue, struct gtask *gtask)
 {
 
 	if (!queue->tq_spin)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
 
 	TQ_LOCK(queue);
 	while ((gtask->ta_flags & TASK_ENQUEUED) || task_is_running(queue, gtask))
 		TQ_SLEEP(queue, gtask, &queue->tq_mutex, PWAIT, "-", 0);
 	TQ_UNLOCK(queue);
 }
 
 void
 gtaskqueue_drain_all(struct gtaskqueue *queue)
 {
 
 	if (!queue->tq_spin)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
 
 	TQ_LOCK(queue);
 	gtaskqueue_drain_tq_queue(queue);
 	gtaskqueue_drain_tq_active(queue);
 	TQ_UNLOCK(queue);
 }
 
 static int
 _gtaskqueue_start_threads(struct gtaskqueue **tqp, int count, int pri,
     cpuset_t *mask, const char *name, va_list ap)
 {
 	char ktname[MAXCOMLEN + 1];
 	struct thread *td;
 	struct gtaskqueue *tq;
 	int i, error;
 
 	if (count <= 0)
 		return (EINVAL);
 
 	vsnprintf(ktname, sizeof(ktname), name, ap);
 	tq = *tqp;
 
 	tq->tq_threads = malloc(sizeof(struct thread *) * count, M_GTASKQUEUE,
 	    M_NOWAIT | M_ZERO);
 	if (tq->tq_threads == NULL) {
 		printf("%s: no memory for %s threads\n", __func__, ktname);
 		return (ENOMEM);
 	}
 
 	for (i = 0; i < count; i++) {
 		if (count == 1)
 			error = kthread_add(gtaskqueue_thread_loop, tqp, NULL,
 			    &tq->tq_threads[i], RFSTOPPED, 0, "%s", ktname);
 		else
 			error = kthread_add(gtaskqueue_thread_loop, tqp, NULL,
 			    &tq->tq_threads[i], RFSTOPPED, 0,
 			    "%s_%d", ktname, i);
 		if (error) {
 			/* should be ok to continue, taskqueue_free will dtrt */
 			printf("%s: kthread_add(%s): error %d", __func__,
 			    ktname, error);
 			tq->tq_threads[i] = NULL;		/* paranoid */
 		} else
 			tq->tq_tcount++;
 	}
 	for (i = 0; i < count; i++) {
 		if (tq->tq_threads[i] == NULL)
 			continue;
 		td = tq->tq_threads[i];
 		if (mask) {
 			error = cpuset_setthread(td->td_tid, mask);
 			/*
 			 * Failing to pin is rarely an actual fatal error;
 			 * it'll just affect performance.
 			 */
 			if (error)
 				printf("%s: curthread=%llu: can't pin; "
 				    "error=%d\n",
 				    __func__,
 				    (unsigned long long) td->td_tid,
 				    error);
 		}
 		thread_lock(td);
 		sched_prio(td, pri);
 		sched_add(td, SRQ_BORING);
 		thread_unlock(td);
 	}
 
 	return (0);
 }
 
 static int
 gtaskqueue_start_threads(struct gtaskqueue **tqp, int count, int pri,
     const char *name, ...)
 {
 	va_list ap;
 	int error;
 
 	va_start(ap, name);
 	error = _gtaskqueue_start_threads(tqp, count, pri, NULL, name, ap);
 	va_end(ap);
 	return (error);
 }
 
 static inline void
 gtaskqueue_run_callback(struct gtaskqueue *tq,
     enum taskqueue_callback_type cb_type)
 {
 	taskqueue_callback_fn tq_callback;
 
 	TQ_ASSERT_UNLOCKED(tq);
 	tq_callback = tq->tq_callbacks[cb_type];
 	if (tq_callback != NULL)
 		tq_callback(tq->tq_cb_contexts[cb_type]);
 }
 
 static void
 gtaskqueue_thread_loop(void *arg)
 {
 	struct gtaskqueue **tqp, *tq;
 
 	tqp = arg;
 	tq = *tqp;
 	gtaskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_INIT);
 	TQ_LOCK(tq);
 	while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0) {
 		/* XXX ? */
 		gtaskqueue_run_locked(tq);
 		/*
 		 * Because taskqueue_run() can drop tq_mutex, we need to
 		 * check if the TQ_FLAGS_ACTIVE flag wasn't removed in the
 		 * meantime, which means we missed a wakeup.
 		 */
 		if ((tq->tq_flags & TQ_FLAGS_ACTIVE) == 0)
 			break;
 		TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0);
 	}
 	gtaskqueue_run_locked(tq);
 	/*
 	 * This thread is on its way out, so just drop the lock temporarily
 	 * in order to call the shutdown callback.  This allows the callback
 	 * to look at the taskqueue, even just before it dies.
 	 */
 	TQ_UNLOCK(tq);
 	gtaskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN);
 	TQ_LOCK(tq);
 
 	/* rendezvous with thread that asked us to terminate */
 	tq->tq_tcount--;
 	wakeup_one(tq->tq_threads);
 	TQ_UNLOCK(tq);
 	kthread_exit();
 }
 
 static void
 gtaskqueue_thread_enqueue(void *context)
 {
 	struct gtaskqueue **tqp, *tq;
 
 	tqp = context;
 	tq = *tqp;
 	wakeup_one(tq);
 }
 
 
 static struct gtaskqueue *
 gtaskqueue_create_fast(const char *name, int mflags,
 		 taskqueue_enqueue_fn enqueue, void *context)
 {
 	return _gtaskqueue_create(name, mflags, enqueue, context,
 			MTX_SPIN, "fast_taskqueue");
 }
 
 
 struct taskqgroup_cpu {
 	LIST_HEAD(, grouptask)	tgc_tasks;
 	struct gtaskqueue	*tgc_taskq;
 	int	tgc_cnt;
 	int	tgc_cpu;
 };
 
 struct taskqgroup {
 	struct taskqgroup_cpu tqg_queue[MAXCPU];
 	struct mtx	tqg_lock;
 	char *		tqg_name;
 	int		tqg_adjusting;
 	int		tqg_stride;
 	int		tqg_cnt;
 };
 
 struct taskq_bind_task {
 	struct gtask bt_task;
 	int	bt_cpuid;
 };
 
 static void
 taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu)
 {
 	struct taskqgroup_cpu *qcpu;
 
 	qcpu = &qgroup->tqg_queue[idx];
 	LIST_INIT(&qcpu->tgc_tasks);
 	qcpu->tgc_taskq = gtaskqueue_create_fast(NULL, M_WAITOK,
 	    taskqueue_thread_enqueue, &qcpu->tgc_taskq);
 	gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, PI_SOFT,
 	    "%s_%d", qgroup->tqg_name, idx);
 	qcpu->tgc_cpu = cpu;
 }
 
 static void
 taskqgroup_cpu_remove(struct taskqgroup *qgroup, int idx)
 {
 
 	gtaskqueue_free(qgroup->tqg_queue[idx].tgc_taskq);
 }
 
 /*
  * Find the taskq with least # of tasks that doesn't currently have any
  * other queues from the uniq identifier.
  */
 static int
 taskqgroup_find(struct taskqgroup *qgroup, void *uniq)
 {
 	struct grouptask *n;
 	int i, idx, mincnt;
 	int strict;
 
 	mtx_assert(&qgroup->tqg_lock, MA_OWNED);
 	if (qgroup->tqg_cnt == 0)
 		return (0);
 	idx = -1;
 	mincnt = INT_MAX;
 	/*
 	 * Two passes;  First scan for a queue with the least tasks that
 	 * does not already service this uniq id.  If that fails simply find
 	 * the queue with the least total tasks;
 	 */
 	for (strict = 1; mincnt == INT_MAX; strict = 0) {
 		for (i = 0; i < qgroup->tqg_cnt; i++) {
 			if (qgroup->tqg_queue[i].tgc_cnt > mincnt)
 				continue;
 			if (strict) {
 				LIST_FOREACH(n,
 				    &qgroup->tqg_queue[i].tgc_tasks, gt_list)
 					if (n->gt_uniq == uniq)
 						break;
 				if (n != NULL)
 					continue;
 			}
 			mincnt = qgroup->tqg_queue[i].tgc_cnt;
 			idx = i;
 		}
 	}
 	if (idx == -1)
 		panic("taskqgroup_find: Failed to pick a qid.");
 
 	return (idx);
 }
 
 /*
  * smp_started is unusable since it is not set for UP kernels or even for
  * SMP kernels when there is 1 CPU.  This is usually handled by adding a
  * (mp_ncpus == 1) test, but that would be broken here since we need to
  * to synchronize with the SI_SUB_SMP ordering.  Even in the pure SMP case
  * smp_started only gives a fuzzy ordering relative to SI_SUB_SMP.
  *
  * So maintain our own flag.  It must be set after all CPUs are started
  * and before SI_SUB_SMP:SI_ORDER_ANY so that the SYSINIT for delayed
  * adjustment is properly delayed.  SI_ORDER_FOURTH is clearly before
  * SI_ORDER_ANY and unclearly after the CPUs are started.  It would be
  * simpler for adjustment to pass a flag indicating if it is delayed.
  */ 
 
 static int tqg_smp_started;
 
 static void
 tqg_record_smp_started(void *arg)
 {
 	tqg_smp_started = 1;
 }
 
 SYSINIT(tqg_record_smp_started, SI_SUB_SMP, SI_ORDER_FOURTH,
 	tqg_record_smp_started, NULL);
 
 void
 taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *gtask,
     void *uniq, int irq, char *name)
 {
 	cpuset_t mask;
-	int qid;
+	int qid, error;
 
 	gtask->gt_uniq = uniq;
-	gtask->gt_name = name;
+	snprintf(gtask->gt_name, GROUPTASK_NAMELEN, "%s", name ? name : "grouptask");
 	gtask->gt_irq = irq;
 	gtask->gt_cpu = -1;
 	mtx_lock(&qgroup->tqg_lock);
 	qid = taskqgroup_find(qgroup, uniq);
 	qgroup->tqg_queue[qid].tgc_cnt++;
 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
 	if (irq != -1 && tqg_smp_started) {
 		gtask->gt_cpu = qgroup->tqg_queue[qid].tgc_cpu;
 		CPU_ZERO(&mask);
 		CPU_SET(qgroup->tqg_queue[qid].tgc_cpu, &mask);
 		mtx_unlock(&qgroup->tqg_lock);
-		intr_setaffinity(irq, &mask);
+		error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask);
+		if (error)
+			printf("%s: setaffinity failed for %s: %d\n", __func__, gtask->gt_name, error);
 	} else
 		mtx_unlock(&qgroup->tqg_lock);
 }
 
 static void
 taskqgroup_attach_deferred(struct taskqgroup *qgroup, struct grouptask *gtask)
 {
 	cpuset_t mask;
-	int qid, cpu;
+	int qid, cpu, error;
 
 	mtx_lock(&qgroup->tqg_lock);
 	qid = taskqgroup_find(qgroup, gtask->gt_uniq);
 	cpu = qgroup->tqg_queue[qid].tgc_cpu;
 	if (gtask->gt_irq != -1) {
 		mtx_unlock(&qgroup->tqg_lock);
 
 		CPU_ZERO(&mask);
 		CPU_SET(cpu, &mask);
-		intr_setaffinity(gtask->gt_irq, &mask);
-
+		error = intr_setaffinity(gtask->gt_irq, CPU_WHICH_IRQ, &mask);
 		mtx_lock(&qgroup->tqg_lock);
+		if (error)
+			printf("%s: %s setaffinity failed: %d\n", __func__, gtask->gt_name, error);
+
 	}
 	qgroup->tqg_queue[qid].tgc_cnt++;
 
 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask,
 			 gt_list);
 	MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL);
 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
 	mtx_unlock(&qgroup->tqg_lock);
 }
 
 int
 taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *gtask,
 	void *uniq, int cpu, int irq, char *name)
 {
 	cpuset_t mask;
-	int i, qid;
+	int i, qid, error;
 
 	qid = -1;
 	gtask->gt_uniq = uniq;
-	gtask->gt_name = name;
+	snprintf(gtask->gt_name, GROUPTASK_NAMELEN, "%s", name ? name : "grouptask");
 	gtask->gt_irq = irq;
 	gtask->gt_cpu = cpu;
 	mtx_lock(&qgroup->tqg_lock);
 	if (tqg_smp_started) {
 		for (i = 0; i < qgroup->tqg_cnt; i++)
 			if (qgroup->tqg_queue[i].tgc_cpu == cpu) {
 				qid = i;
 				break;
 			}
 		if (qid == -1) {
 			mtx_unlock(&qgroup->tqg_lock);
+			printf("%s: qid not found for %s cpu=%d\n", __func__, gtask->gt_name, cpu);
 			return (EINVAL);
 		}
 	} else
 		qid = 0;
 	qgroup->tqg_queue[qid].tgc_cnt++;
 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
 	cpu = qgroup->tqg_queue[qid].tgc_cpu;
 	mtx_unlock(&qgroup->tqg_lock);
 
 	CPU_ZERO(&mask);
 	CPU_SET(cpu, &mask);
-	if (irq != -1 && tqg_smp_started)
-		intr_setaffinity(irq, &mask);
+	if (irq != -1 && tqg_smp_started) {
+		error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask);
+		if (error)
+			printf("%s: setaffinity failed: %d\n", __func__, error);
+	}
 	return (0);
 }
 
 static int
 taskqgroup_attach_cpu_deferred(struct taskqgroup *qgroup, struct grouptask *gtask)
 {
 	cpuset_t mask;
-	int i, qid, irq, cpu;
+	int i, qid, irq, cpu, error;
 
 	qid = -1;
 	irq = gtask->gt_irq;
 	cpu = gtask->gt_cpu;
 	MPASS(tqg_smp_started);
 	mtx_lock(&qgroup->tqg_lock);
 	for (i = 0; i < qgroup->tqg_cnt; i++)
 		if (qgroup->tqg_queue[i].tgc_cpu == cpu) {
 			qid = i;
 			break;
 		}
 	if (qid == -1) {
 		mtx_unlock(&qgroup->tqg_lock);
+		printf("%s: qid not found for %s cpu=%d\n", __func__, gtask->gt_name, cpu);
 		return (EINVAL);
 	}
 	qgroup->tqg_queue[qid].tgc_cnt++;
 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
 	MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL);
 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
 	mtx_unlock(&qgroup->tqg_lock);
 
 	CPU_ZERO(&mask);
 	CPU_SET(cpu, &mask);
 
-	if (irq != -1)
-		intr_setaffinity(irq, &mask);
+	if (irq != -1) {
+		error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask);
+		if (error)
+			printf("%s: setaffinity failed: %d\n", __func__, error);
+	}
 	return (0);
 }
 
 void
 taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask)
 {
 	int i;
 
 	mtx_lock(&qgroup->tqg_lock);
 	for (i = 0; i < qgroup->tqg_cnt; i++)
 		if (qgroup->tqg_queue[i].tgc_taskq == gtask->gt_taskqueue)
 			break;
 	if (i == qgroup->tqg_cnt)
-		panic("taskqgroup_detach: task not in group\n");
+		panic("taskqgroup_detach: task %s not in group\n", gtask->gt_name);
 	qgroup->tqg_queue[i].tgc_cnt--;
 	LIST_REMOVE(gtask, gt_list);
 	mtx_unlock(&qgroup->tqg_lock);
 	gtask->gt_taskqueue = NULL;
 }
 
 static void
 taskqgroup_binder(void *ctx)
 {
 	struct taskq_bind_task *gtask = (struct taskq_bind_task *)ctx;
 	cpuset_t mask;
 	int error;
 
 	CPU_ZERO(&mask);
 	CPU_SET(gtask->bt_cpuid, &mask);
 	error = cpuset_setthread(curthread->td_tid, &mask);
 	thread_lock(curthread);
 	sched_bind(curthread, gtask->bt_cpuid);
 	thread_unlock(curthread);
 
 	if (error)
-		printf("taskqgroup_binder: setaffinity failed: %d\n",
+		printf("%s: setaffinity failed: %d\n", __func__,
 		    error);
 	free(gtask, M_DEVBUF);
 }
 
 static void
 taskqgroup_bind(struct taskqgroup *qgroup)
 {
 	struct taskq_bind_task *gtask;
 	int i;
 
 	/*
 	 * Bind taskqueue threads to specific CPUs, if they have been assigned
 	 * one.
 	 */
 	if (qgroup->tqg_cnt == 1)
 		return;
 
 	for (i = 0; i < qgroup->tqg_cnt; i++) {
 		gtask = malloc(sizeof (*gtask), M_DEVBUF, M_WAITOK);
 		GTASK_INIT(&gtask->bt_task, 0, 0, taskqgroup_binder, gtask);
 		gtask->bt_cpuid = qgroup->tqg_queue[i].tgc_cpu;
 		grouptaskqueue_enqueue(qgroup->tqg_queue[i].tgc_taskq,
 		    &gtask->bt_task);
 	}
 }
 
 static int
 _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
 {
 	LIST_HEAD(, grouptask) gtask_head = LIST_HEAD_INITIALIZER(NULL);
 	struct grouptask *gtask;
 	int i, k, old_cnt, old_cpu, cpu;
 
 	mtx_assert(&qgroup->tqg_lock, MA_OWNED);
 
 	if (cnt < 1 || cnt * stride > mp_ncpus || !tqg_smp_started) {
 		printf("%s: failed cnt: %d stride: %d "
 		    "mp_ncpus: %d tqg_smp_started: %d\n",
 		    __func__, cnt, stride, mp_ncpus, tqg_smp_started);
 		return (EINVAL);
 	}
 	if (qgroup->tqg_adjusting) {
-		printf("taskqgroup_adjust failed: adjusting\n");
+		printf("%s failed: adjusting\n", __func__);
 		return (EBUSY);
 	}
 	qgroup->tqg_adjusting = 1;
 	old_cnt = qgroup->tqg_cnt;
 	old_cpu = 0;
 	if (old_cnt < cnt)
 		old_cpu = qgroup->tqg_queue[old_cnt].tgc_cpu;
 	mtx_unlock(&qgroup->tqg_lock);
 	/*
 	 * Set up queue for tasks added before boot.
 	 */
 	if (old_cnt == 0) {
 		LIST_SWAP(&gtask_head, &qgroup->tqg_queue[0].tgc_tasks,
 		    grouptask, gt_list);
 		qgroup->tqg_queue[0].tgc_cnt = 0;
 	}
 
 	/*
 	 * If new taskq threads have been added.
 	 */
 	cpu = old_cpu;
 	for (i = old_cnt; i < cnt; i++) {
 		taskqgroup_cpu_create(qgroup, i, cpu);
 
 		for (k = 0; k < stride; k++)
 			cpu = CPU_NEXT(cpu);
 	}
 	mtx_lock(&qgroup->tqg_lock);
 	qgroup->tqg_cnt = cnt;
 	qgroup->tqg_stride = stride;
 
 	/*
 	 * Adjust drivers to use new taskqs.
 	 */
 	for (i = 0; i < old_cnt; i++) {
 		while ((gtask = LIST_FIRST(&qgroup->tqg_queue[i].tgc_tasks))) {
 			LIST_REMOVE(gtask, gt_list);
 			qgroup->tqg_queue[i].tgc_cnt--;
 			LIST_INSERT_HEAD(&gtask_head, gtask, gt_list);
 		}
 	}
 	mtx_unlock(&qgroup->tqg_lock);
 
 	while ((gtask = LIST_FIRST(&gtask_head))) {
 		LIST_REMOVE(gtask, gt_list);
 		if (gtask->gt_cpu == -1)
 			taskqgroup_attach_deferred(qgroup, gtask);
 		else if (taskqgroup_attach_cpu_deferred(qgroup, gtask))
 			taskqgroup_attach_deferred(qgroup, gtask);
 	}
 
 #ifdef INVARIANTS
 	mtx_lock(&qgroup->tqg_lock);
 	for (i = 0; i < qgroup->tqg_cnt; i++) {
 		MPASS(qgroup->tqg_queue[i].tgc_taskq != NULL);
 		LIST_FOREACH(gtask, &qgroup->tqg_queue[i].tgc_tasks, gt_list)
 			MPASS(gtask->gt_taskqueue != NULL);
 	}
 	mtx_unlock(&qgroup->tqg_lock);
 #endif
 	/*
 	 * If taskq thread count has been reduced.
 	 */
 	for (i = cnt; i < old_cnt; i++)
 		taskqgroup_cpu_remove(qgroup, i);
 
 	taskqgroup_bind(qgroup);
 
 	mtx_lock(&qgroup->tqg_lock);
 	qgroup->tqg_adjusting = 0;
 
 	return (0);
 }
 
 int
 taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
 {
 	int error;
 
 	mtx_lock(&qgroup->tqg_lock);
 	error = _taskqgroup_adjust(qgroup, cnt, stride);
 	mtx_unlock(&qgroup->tqg_lock);
 
 	return (error);
 }
 
 struct taskqgroup *
 taskqgroup_create(char *name)
 {
 	struct taskqgroup *qgroup;
 
 	qgroup = malloc(sizeof(*qgroup), M_GTASKQUEUE, M_WAITOK | M_ZERO);
 	mtx_init(&qgroup->tqg_lock, "taskqgroup", NULL, MTX_DEF);
 	qgroup->tqg_name = name;
 	LIST_INIT(&qgroup->tqg_queue[0].tgc_tasks);
 
 	return (qgroup);
 }
 
 void
 taskqgroup_destroy(struct taskqgroup *qgroup)
 {
 
 }
Index: stable/11/sys/net/ifdi_if.m
===================================================================
--- stable/11/sys/net/ifdi_if.m	(revision 333337)
+++ stable/11/sys/net/ifdi_if.m	(revision 333338)
@@ -1,335 +1,343 @@
 #
 # Copyright (c) 2014, Matthew Macy (kmacy@freebsd.org)
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #
 #  1. Redistributions of source code must retain the above copyright notice,
 #     this list of conditions and the following disclaimer.
 #
 #  2. Neither the name of Matthew Macy nor the names of its
 #     contributors may be used to endorse or promote products derived from
 #     this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 #
 # $FreeBSD$
 #
 
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/socket.h>
 
 #include <machine/bus.h>
 #include <sys/bus.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/iflib.h>
 
 INTERFACE ifdi;
 
 CODE {
 
 	static void
 	null_void_op(if_ctx_t _ctx __unused)
 	{
 	}
 
 	static void
 	null_timer_op(if_ctx_t _ctx __unused, uint16_t _qsidx __unused)
 	{
 	}
 
 	static int
 	null_int_op(if_ctx_t _ctx __unused)
 	{
 		return (0);
 	}
 
 	static int
 	null_queue_intr_enable(if_ctx_t _ctx __unused, uint16_t _qid __unused)
 	{
 		return (ENOTSUP);
 	}
 
 	static void
 	null_led_func(if_ctx_t _ctx __unused, int _onoff __unused)
 	{
 	}
 
 	static void
 	null_vlan_register_op(if_ctx_t _ctx __unused, uint16_t vtag __unused)
 	{
 	}
 
 	static int
 	null_q_setup(if_ctx_t _ctx __unused, uint32_t _qid __unused)
 	{
 		return (0);
 	}
 
 	static int
 	null_i2c_req(if_ctx_t _sctx __unused, struct ifi2creq *_i2c __unused)
 	{
 		return (ENOTSUP);
 	}
 
 	static int
 	null_sysctl_int_delay(if_ctx_t _sctx __unused, if_int_delay_info_t _iidi __unused)
 	{
 		return (0);
 	}
 
 	static int
 	null_iov_init(if_ctx_t _ctx __unused, uint16_t num_vfs __unused, const nvlist_t *params __unused)
 	{
 		return (ENOTSUP);
 	}
 
 	static int
 	null_vf_add(if_ctx_t _ctx __unused, uint16_t num_vfs __unused, const nvlist_t *params __unused)
 	{
 		return (ENOTSUP);
 	}
 
 	static int
 	null_priv_ioctl(if_ctx_t _ctx __unused, u_long command, caddr_t *data __unused)
 	{
 		return (ENOTSUP);
 	}
 };
 
 #
 # bus interfaces
 #
 
 METHOD int attach_pre {
 	if_ctx_t _ctx;
 };
 
 METHOD int attach_post {
 	if_ctx_t _ctx;
 };
 
 METHOD int detach {
 	if_ctx_t _ctx;
 };
 
 METHOD int suspend {
 	if_ctx_t _ctx;
 } DEFAULT null_int_op;
 
 METHOD int shutdown {
 	if_ctx_t _ctx;
 } DEFAULT null_int_op;
 
 METHOD int resume {
 	if_ctx_t _ctx;
 } DEFAULT null_int_op;
 
 #
 # downcall to driver to allocate its
 # own queue state and tie it to the parent
 #
 
 METHOD int tx_queues_alloc {
 	if_ctx_t _ctx;
 	caddr_t *_vaddrs;
 	uint64_t *_paddrs;
 	int ntxqs;
 	int ntxqsets;
 };
 
 METHOD int rx_queues_alloc {
 	if_ctx_t _ctx;
 	caddr_t *_vaddrs;
 	uint64_t *_paddrs;
 	int nrxqs;
 	int nrxqsets;
 };
 
 METHOD void queues_free {
 	if_ctx_t _ctx;
 };
 
 #
 # interface reset / stop
 #
 
 METHOD void init {
 	if_ctx_t _ctx;
 };
 
 METHOD void stop {
 	if_ctx_t _ctx;
 };
 
 #
 # interrupt setup and manipulation
 #
 
 METHOD int msix_intr_assign {
 	if_ctx_t _sctx;
 	int msix;
 };
 
 METHOD void intr_enable {
 	if_ctx_t _ctx;
 };
 
 METHOD void intr_disable {
 	if_ctx_t _ctx;
 };
 
-METHOD int queue_intr_enable {
+METHOD int rx_queue_intr_enable {
 	if_ctx_t _ctx;
 	uint16_t _qid;
 } DEFAULT null_queue_intr_enable;
 
+METHOD int tx_queue_intr_enable {
+	if_ctx_t _ctx;
+	uint16_t _qid;
+} DEFAULT null_queue_intr_enable;
+
 METHOD void link_intr_enable {
 	if_ctx_t _ctx;
 } DEFAULT null_void_op;
 
 #
 # interface configuration
 #
 
 METHOD void multi_set {
 	if_ctx_t _ctx;
 };
 
 METHOD int mtu_set {
 	if_ctx_t _ctx;
 	uint32_t _mtu;
 };
 
 METHOD void media_set{
 	if_ctx_t _ctx;
 } DEFAULT null_void_op;
 
 METHOD int promisc_set {
 	if_ctx_t _ctx;
 	int _flags;
 };
 
 METHOD void crcstrip_set {
 	if_ctx_t _ctx;
 	int _onoff;
+	int _strip;
 };
 
 #
 # IOV handling
 #
 
 METHOD void vflr_handle {
 	if_ctx_t _ctx;
 } DEFAULT null_void_op;
 
 METHOD int iov_init {
 	if_ctx_t _ctx;
 	uint16_t num_vfs;
 	const nvlist_t * params;
 } DEFAULT null_iov_init;
 
 METHOD void iov_uninit {
 	if_ctx_t _ctx;
 } DEFAULT null_void_op;
 
 METHOD int iov_vf_add {
 	if_ctx_t _ctx;
 	uint16_t num_vfs;
 	const nvlist_t * params;
 } DEFAULT null_vf_add;
 
 
 #
 # Device status
 #
 
 METHOD void update_admin_status {
 	if_ctx_t _ctx;
 };
 
 METHOD void media_status {
 	if_ctx_t _ctx;
 	struct ifmediareq *_ifm;
 };
 
 METHOD int media_change {
 	if_ctx_t _ctx;
 };
 
 METHOD uint64_t get_counter {
 	if_ctx_t _ctx;
 	ift_counter cnt;
 };
 
 METHOD int priv_ioctl {
 	if_ctx_t _ctx;
 	u_long   _cmd;
 	caddr_t _data;
 } DEFAULT null_priv_ioctl;
 
 #
 # optional methods
 #
 
 METHOD int i2c_req {
 	if_ctx_t _ctx;
 	struct ifi2creq *_req;
 } DEFAULT null_i2c_req;
 
 METHOD int txq_setup {
 	if_ctx_t _ctx;
 	uint32_t _txqid;
 } DEFAULT null_q_setup;
 
 METHOD int rxq_setup {
 	if_ctx_t _ctx;
 	uint32_t _txqid;
 } DEFAULT null_q_setup;
 
 METHOD void timer {
 	if_ctx_t _ctx;
 	uint16_t _txqid;
 } DEFAULT null_timer_op;
 
 METHOD void watchdog_reset {
 	if_ctx_t _ctx;
 } DEFAULT null_void_op;
 
 METHOD void led_func {
 	if_ctx_t _ctx;
 	int _onoff;
 } DEFAULT null_led_func;
 
 METHOD void vlan_register {
 	if_ctx_t _ctx;
 	uint16_t _vtag;
 } DEFAULT null_vlan_register_op;
 
 METHOD void vlan_unregister {
 	if_ctx_t _ctx;
 	uint16_t _vtag;
 } DEFAULT null_vlan_register_op;
 
 METHOD int sysctl_int_delay {
 	if_ctx_t _sctx;
 	if_int_delay_info_t _iidi;
 } DEFAULT null_sysctl_int_delay;
 
-
+METHOD void debug {
+	if_ctx_t _ctx;
+} DEFAULT null_void_op;
Index: stable/11/sys/net/iflib.c
===================================================================
--- stable/11/sys/net/iflib.c	(revision 333337)
+++ stable/11/sys/net/iflib.c	(revision 333338)
@@ -1,5012 +1,5982 @@
 /*-
- * Copyright (c) 2014-2016, Matthew Macy <mmacy@nextbsd.org>
+ * Copyright (c) 2014-2017, Matthew Macy <mmacy@nextbsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  *  1. Redistributions of source code must retain the above copyright notice,
  *     this list of conditions and the following disclaimer.
  *
  *  2. Neither the name of Matthew Macy nor the names of its
  *     contributors may be used to endorse or promote products derived from
  *     this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_acpi.h"
+#include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/bus.h>
 #include <sys/eventhandler.h>
 #include <sys/sockio.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/kobj.h>
 #include <sys/rman.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/taskqueue.h>
 #include <sys/limits.h>
 
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_media.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/mp_ring.h>
+#include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_lro.h>
 #include <netinet/in_systm.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
+#include <netinet/ip_var.h>
+#include <netinet6/ip6_var.h>
 
 #include <machine/bus.h>
 #include <machine/in_cksum.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <dev/led/led.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pci_private.h>
 
 #include <net/iflib.h>
 
 #include "ifdi_if.h"
 
 #if defined(__i386__) || defined(__amd64__)
 #include <sys/memdesc.h>
 #include <machine/bus.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 #include <x86/include/busdma_impl.h>
 #include <x86/iommu/busdma_dmar.h>
 #endif
 
-
+#include <sys/bitstring.h>
 /*
- * enable accounting of every mbuf as it comes in to and goes out of iflib's software descriptor references
+ * enable accounting of every mbuf as it comes in to and goes out of
+ * iflib's software descriptor references
  */
 #define MEMORY_LOGGING 0
 /*
  * Enable mbuf vectors for compressing long mbuf chains
  */
 
 /*
  * NB:
  * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead
  *   we prefetch needs to be determined by the time spent in m_free vis a vis
  *   the cost of a prefetch. This will of course vary based on the workload:
  *      - NFLX's m_free path is dominated by vm-based M_EXT manipulation which
  *        is quite expensive, thus suggesting very little prefetch.
  *      - small packet forwarding which is just returning a single mbuf to
  *        UMA will typically be very fast vis a vis the cost of a memory
  *        access.
  */
 
 
 /*
  * File organization:
  *  - private structures
  *  - iflib private utility functions
  *  - ifnet functions
  *  - vlan registry and other exported functions
  *  - iflib public core functions
  *
  *
  */
 static MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library");
 
 struct iflib_txq;
 typedef struct iflib_txq *iflib_txq_t;
 struct iflib_rxq;
 typedef struct iflib_rxq *iflib_rxq_t;
 struct iflib_fl;
 typedef struct iflib_fl *iflib_fl_t;
 
+static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid);
+
 typedef struct iflib_filter_info {
 	driver_filter_t *ifi_filter;
 	void *ifi_filter_arg;
 	struct grouptask *ifi_task;
+	void *ifi_ctx;
 } *iflib_filter_info_t;
 
 struct iflib_ctx {
 	KOBJ_FIELDS;
    /*
    * Pointer to hardware driver's softc
    */
 	void *ifc_softc;
 	device_t ifc_dev;
 	if_t ifc_ifp;
 
 	cpuset_t ifc_cpus;
 	if_shared_ctx_t ifc_sctx;
 	struct if_softc_ctx ifc_softc_ctx;
 
 	struct mtx ifc_mtx;
 
 	uint16_t ifc_nhwtxqs;
-	uint16_t ifc_nhwrxqs;
 
 	iflib_txq_t ifc_txqs;
 	iflib_rxq_t ifc_rxqs;
 	uint32_t ifc_if_flags;
 	uint32_t ifc_flags;
 	uint32_t ifc_max_fl_buf_size;
 	int ifc_in_detach;
 
 	int ifc_link_state;
 	int ifc_link_irq;
-	int ifc_pause_frames;
 	int ifc_watchdog_events;
 	struct cdev *ifc_led_dev;
 	struct resource *ifc_msix_mem;
 
 	struct if_irq ifc_legacy_irq;
 	struct grouptask ifc_admin_task;
 	struct grouptask ifc_vflr_task;
 	struct iflib_filter_info ifc_filter_info;
 	struct ifmedia	ifc_media;
 
 	struct sysctl_oid *ifc_sysctl_node;
 	uint16_t ifc_sysctl_ntxqs;
 	uint16_t ifc_sysctl_nrxqs;
 	uint16_t ifc_sysctl_qs_eq_override;
+	uint16_t ifc_sysctl_rx_budget;
 
-	uint16_t ifc_sysctl_ntxds[8];
-	uint16_t ifc_sysctl_nrxds[8];
+	qidx_t ifc_sysctl_ntxds[8];
+	qidx_t ifc_sysctl_nrxds[8];
 	struct if_txrx ifc_txrx;
 #define isc_txd_encap  ifc_txrx.ift_txd_encap
 #define isc_txd_flush  ifc_txrx.ift_txd_flush
 #define isc_txd_credits_update  ifc_txrx.ift_txd_credits_update
 #define isc_rxd_available ifc_txrx.ift_rxd_available
 #define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get
 #define isc_rxd_refill ifc_txrx.ift_rxd_refill
 #define isc_rxd_flush ifc_txrx.ift_rxd_flush
 #define isc_rxd_refill ifc_txrx.ift_rxd_refill
 #define isc_rxd_refill ifc_txrx.ift_rxd_refill
 #define isc_legacy_intr ifc_txrx.ift_legacy_intr
 	eventhandler_tag ifc_vlan_attach_event;
 	eventhandler_tag ifc_vlan_detach_event;
 	uint8_t ifc_mac[ETHER_ADDR_LEN];
 	char ifc_mtx_name[16];
 };
 
 
 void *
 iflib_get_softc(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_softc);
 }
 
 device_t
 iflib_get_dev(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_dev);
 }
 
 if_t
 iflib_get_ifp(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_ifp);
 }
 
 struct ifmedia *
 iflib_get_media(if_ctx_t ctx)
 {
 
 	return (&ctx->ifc_media);
 }
 
 void
 iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN])
 {
 
 	bcopy(mac, ctx->ifc_mac, ETHER_ADDR_LEN);
 }
 
 if_softc_ctx_t
 iflib_get_softc_ctx(if_ctx_t ctx)
 {
 
 	return (&ctx->ifc_softc_ctx);
 }
 
 if_shared_ctx_t
 iflib_get_sctx(if_ctx_t ctx)
 {
 
 	return (ctx->ifc_sctx);
 }
 
+#define IP_ALIGNED(m) ((((uintptr_t)(m)->m_data) & 0x3) == 0x2)
 #define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*))
+#define CACHE_PTR_NEXT(ptr) ((void *)(((uintptr_t)(ptr)+CACHE_LINE_SIZE-1) & ~((uintptr_t)CACHE_LINE_SIZE-1))) /* round ptr up to a cache-line boundary */
 
 #define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP)
 #define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
 
 #define RX_SW_DESC_MAP_CREATED	(1 << 0)
 #define TX_SW_DESC_MAP_CREATED	(1 << 1)
 #define RX_SW_DESC_INUSE        (1 << 3)
 #define TX_SW_DESC_MAPPED       (1 << 4)
 
-typedef struct iflib_sw_rx_desc {
-	bus_dmamap_t    ifsd_map;         /* bus_dma map for packet */
-	struct mbuf    *ifsd_m;           /* rx: uninitialized mbuf */
-	caddr_t         ifsd_cl;          /* direct cluster pointer for rx */
-	uint16_t	ifsd_flags;
-} *iflib_rxsd_t;
+#define	M_TOOBIG		M_PROTO1
 
-typedef struct iflib_sw_tx_desc_val {
-	bus_dmamap_t    ifsd_map;         /* bus_dma map for packet */
-	struct mbuf    *ifsd_m;           /* pkthdr mbuf */
-	uint8_t		ifsd_flags;
-} *iflib_txsd_val_t;
+typedef struct iflib_sw_rx_desc_array {
+	bus_dmamap_t	*ifsd_map;         /* bus_dma maps for packet */
+	struct mbuf	**ifsd_m;           /* pkthdr mbufs */
+	caddr_t		*ifsd_cl;          /* direct cluster pointer for rx */
+	uint8_t		*ifsd_flags;
+} iflib_rxsd_array_t;
 
 typedef struct iflib_sw_tx_desc_array {
 	bus_dmamap_t    *ifsd_map;         /* bus_dma maps for packet */
 	struct mbuf    **ifsd_m;           /* pkthdr mbufs */
 	uint8_t		*ifsd_flags;
-} iflib_txsd_array_t;
+} if_txsd_vec_t;
 
 
 /* magic number that should be high enough for any hardware */
 #define IFLIB_MAX_TX_SEGS		128
-#define IFLIB_MAX_RX_SEGS		32
+/* bnxt supports 64 with hardware LRO enabled */
+#define IFLIB_MAX_RX_SEGS		64
 #define IFLIB_RX_COPY_THRESH		128
 #define IFLIB_MAX_RX_REFRESH		32
+/* The minimum descriptors per second before we start coalescing */
+#define IFLIB_MIN_DESC_SEC		16384
+#define IFLIB_DEFAULT_TX_UPDATE_FREQ	16
 #define IFLIB_QUEUE_IDLE		0
 #define IFLIB_QUEUE_HUNG		1
 #define IFLIB_QUEUE_WORKING		2
+/* maximum number of txqs that can share an rx interrupt */
+#define IFLIB_MAX_TX_SHARED_INTR	4
 
-/* this should really scale with ring size - 32 is a fairly arbitrary value for this */
-#define TX_BATCH_SIZE			16
+/* this should really scale with ring size - this is a fairly arbitrary value */
+#define TX_BATCH_SIZE			32
 
 #define IFLIB_RESTART_BUDGET		8
 
-#define	IFC_LEGACY		0x01
-#define	IFC_QFLUSH		0x02
-#define	IFC_MULTISEG		0x04
-#define	IFC_DMAR		0x08
-#define	IFC_SC_ALLOCATED	0x10
+#define	IFC_LEGACY		0x001
+#define	IFC_QFLUSH		0x002
+#define	IFC_MULTISEG		0x004
+#define	IFC_DMAR		0x008
+#define	IFC_SC_ALLOCATED	0x010
+#define	IFC_INIT_DONE		0x020
+#define	IFC_PREFETCH		0x040
+#define	IFC_DO_RESET		0x080
+#define	IFC_CHECK_HUNG		0x100
 
 #define CSUM_OFFLOAD		(CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \
 				 CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \
 				 CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP)
 struct iflib_txq {
-	uint16_t	ift_in_use;
-	uint16_t	ift_cidx;
-	uint16_t	ift_cidx_processed;
-	uint16_t	ift_pidx;
+	qidx_t		ift_in_use;
+	qidx_t		ift_cidx;
+	qidx_t		ift_cidx_processed;
+	qidx_t		ift_pidx;
 	uint8_t		ift_gen;
-	uint8_t		ift_db_pending;
-	uint8_t		ift_db_pending_queued;
-	uint8_t		ift_npending;
 	uint8_t		ift_br_offset;
+	uint16_t	ift_npending;
+	uint16_t	ift_db_pending;
+	uint16_t	ift_rs_pending;
 	/* implicit pad */
+	uint8_t		ift_txd_size[8];
 	uint64_t	ift_processed;
 	uint64_t	ift_cleaned;
+	uint64_t	ift_cleaned_prev;
 #if MEMORY_LOGGING
 	uint64_t	ift_enqueued;
 	uint64_t	ift_dequeued;
 #endif
 	uint64_t	ift_no_tx_dma_setup;
 	uint64_t	ift_no_desc_avail;
 	uint64_t	ift_mbuf_defrag_failed;
 	uint64_t	ift_mbuf_defrag;
 	uint64_t	ift_map_failed;
 	uint64_t	ift_txd_encap_efbig;
 	uint64_t	ift_pullups;
 
 	struct mtx	ift_mtx;
 	struct mtx	ift_db_mtx;
 
 	/* constant values */
 	if_ctx_t	ift_ctx;
-	struct ifmp_ring        **ift_br;
+	struct ifmp_ring        *ift_br;
 	struct grouptask	ift_task;
-	uint16_t	ift_size;
+	qidx_t		ift_size;
 	uint16_t	ift_id;
 	struct callout	ift_timer;
-	struct callout	ift_db_check;
 
-	iflib_txsd_array_t	ift_sds;
-	uint8_t			ift_nbr;
-	uint8_t			ift_qstatus;
-	uint8_t			ift_active;
-	uint8_t			ift_closed;
-	int			ift_watchdog_time;
+	if_txsd_vec_t	ift_sds;
+	uint8_t		ift_qstatus;
+	uint8_t		ift_closed;
+	uint8_t		ift_update_freq;
 	struct iflib_filter_info ift_filter_info;
 	bus_dma_tag_t		ift_desc_tag;
 	bus_dma_tag_t		ift_tso_desc_tag;
 	iflib_dma_info_t	ift_ifdi;
 #define MTX_NAME_LEN 16
 	char                    ift_mtx_name[MTX_NAME_LEN];
 	char                    ift_db_mtx_name[MTX_NAME_LEN];
 	bus_dma_segment_t	ift_segs[IFLIB_MAX_TX_SEGS]  __aligned(CACHE_LINE_SIZE);
+#ifdef IFLIB_DIAGNOSTICS
+	uint64_t ift_cpu_exec_count[256];
+#endif
 } __aligned(CACHE_LINE_SIZE);
 
 struct iflib_fl {
-	uint16_t	ifl_cidx;
-	uint16_t	ifl_pidx;
-	uint16_t	ifl_credits;
+	qidx_t		ifl_cidx;
+	qidx_t		ifl_pidx;
+	qidx_t		ifl_credits;
 	uint8_t		ifl_gen;
+	uint8_t		ifl_rxd_size;
 #if MEMORY_LOGGING
 	uint64_t	ifl_m_enqueued;
 	uint64_t	ifl_m_dequeued;
 	uint64_t	ifl_cl_enqueued;
 	uint64_t	ifl_cl_dequeued;
 #endif
 	/* implicit pad */
 
+	bitstr_t 	*ifl_rx_bitmap;
+	qidx_t		ifl_fragidx;
 	/* constant */
-	uint16_t	ifl_size;
+	qidx_t		ifl_size;
 	uint16_t	ifl_buf_size;
 	uint16_t	ifl_cltype;
 	uma_zone_t	ifl_zone;
-	iflib_rxsd_t	ifl_sds;
+	iflib_rxsd_array_t	ifl_sds;
 	iflib_rxq_t	ifl_rxq;
 	uint8_t		ifl_id;
 	bus_dma_tag_t           ifl_desc_tag;
 	iflib_dma_info_t	ifl_ifdi;
 	uint64_t	ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE);
 	caddr_t		ifl_vm_addrs[IFLIB_MAX_RX_REFRESH];
+	qidx_t	ifl_rxd_idxs[IFLIB_MAX_RX_REFRESH];
 }  __aligned(CACHE_LINE_SIZE);
 
-static inline int
-get_inuse(int size, int cidx, int pidx, int gen)
+static inline qidx_t
+get_inuse(int size, qidx_t cidx, qidx_t pidx, uint8_t gen)
 {
-	int used;
+	qidx_t used;
 
 	if (pidx > cidx)
 		used = pidx - cidx;
 	else if (pidx < cidx)
 		used = size - cidx + pidx;
 	else if (gen == 0 && pidx == cidx)
 		used = 0;
 	else if (gen == 1 && pidx == cidx)
 		used = size;
 	else
 		panic("bad state");
 
 	return (used);
 }
 
 #define TXQ_AVAIL(txq) (txq->ift_size - get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen))
 
 #define IDXDIFF(head, tail, wrap) \
 	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
 
 struct iflib_rxq {
 	/* If there is a separate completion queue -
 	 * these are the cq cidx and pidx. Otherwise
 	 * these are unused.
 	 */
-	uint16_t	ifr_size;
-	uint16_t	ifr_cq_cidx;
-	uint16_t	ifr_cq_pidx;
+	qidx_t		ifr_size;
+	qidx_t		ifr_cq_cidx;
+	qidx_t		ifr_cq_pidx;
 	uint8_t		ifr_cq_gen;
 	uint8_t		ifr_fl_offset;
 
 	if_ctx_t	ifr_ctx;
 	iflib_fl_t	ifr_fl;
 	uint64_t	ifr_rx_irq;
 	uint16_t	ifr_id;
 	uint8_t		ifr_lro_enabled;
 	uint8_t		ifr_nfl;
+	uint8_t		ifr_ntxqirq;
+	uint8_t		ifr_txqid[IFLIB_MAX_TX_SHARED_INTR];
 	struct lro_ctrl			ifr_lc;
 	struct grouptask        ifr_task;
 	struct iflib_filter_info ifr_filter_info;
 	iflib_dma_info_t		ifr_ifdi;
+
 	/* dynamically allocate if any drivers need a value substantially larger than this */
 	struct if_rxd_frag	ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE);
+#ifdef IFLIB_DIAGNOSTICS
+	uint64_t ifr_cpu_exec_count[256];
+#endif
 }  __aligned(CACHE_LINE_SIZE);
 
+typedef struct if_rxsd {
+	caddr_t *ifsd_cl;
+	struct mbuf **ifsd_m;
+	iflib_fl_t ifsd_fl;
+	qidx_t ifsd_cidx;
+} *if_rxsd_t;
+
+/* multiple of word size */
+#ifdef __LP64__
+#define PKT_INFO_SIZE	6
+#define RXD_INFO_SIZE	5
+#define PKT_TYPE uint64_t
+#else
+#define PKT_INFO_SIZE	11
+#define RXD_INFO_SIZE	8
+#define PKT_TYPE uint32_t
+#endif
+#define PKT_LOOP_BOUND  ((PKT_INFO_SIZE/3)*3)
+#define RXD_LOOP_BOUND  ((RXD_INFO_SIZE/4)*4)
+
+typedef struct if_pkt_info_pad {
+	PKT_TYPE pkt_val[PKT_INFO_SIZE];
+} *if_pkt_info_pad_t;
+typedef struct if_rxd_info_pad {
+	PKT_TYPE rxd_val[RXD_INFO_SIZE];
+} *if_rxd_info_pad_t;
+
+CTASSERT(sizeof(struct if_pkt_info_pad) == sizeof(struct if_pkt_info));
+CTASSERT(sizeof(struct if_rxd_info_pad) == sizeof(struct if_rxd_info));
+
+
+static inline void
+pkt_info_zero(if_pkt_info_t pi)
+{
+	if_pkt_info_pad_t pi_pad;
+	/* Zero *pi word by word through the if_pkt_info_pad overlay (PKT_INFO_SIZE words: 6 on LP64, 11 otherwise). */
+	pi_pad = (if_pkt_info_pad_t)pi;
+	pi_pad->pkt_val[0] = 0; pi_pad->pkt_val[1] = 0; pi_pad->pkt_val[2] = 0;
+	pi_pad->pkt_val[3] = 0; pi_pad->pkt_val[4] = 0; pi_pad->pkt_val[5] = 0;
+#ifndef __LP64__
+	pi_pad->pkt_val[6] = 0; pi_pad->pkt_val[7] = 0; pi_pad->pkt_val[8] = 0;
+	pi_pad->pkt_val[9] = 0; pi_pad->pkt_val[10] = 0;
+#endif	
+}
+
+static inline void
+rxd_info_zero(if_rxd_info_t ri)
+{
+	if_rxd_info_pad_t ri_pad;
+	int i;
+	/* Zero *ri through the if_rxd_info_pad overlay, four words per iteration up to RXD_LOOP_BOUND. */
+	ri_pad = (if_rxd_info_pad_t)ri;
+	for (i = 0; i < RXD_LOOP_BOUND; i += 4) {
+		ri_pad->rxd_val[i] = 0;
+		ri_pad->rxd_val[i+1] = 0;
+		ri_pad->rxd_val[i+2] = 0;
+		ri_pad->rxd_val[i+3] = 0;
+	}
+#ifdef __LP64__
+	ri_pad->rxd_val[RXD_INFO_SIZE-1] = 0;	/* LP64: RXD_INFO_SIZE is 5, so word 4 lies past RXD_LOOP_BOUND (4) */
+#endif
+}
+
 /*
  * Only allow a single packet to take up most 1/nth of the tx ring
  */
 #define MAX_SINGLE_PACKET_FRACTION 12
 #define IF_BAD_DMA (bus_addr_t)-1
 
-static int enable_msix = 1;
-
-#define mtx_held(m)	(((m)->mtx_lock & ~MTX_FLAGMASK) != (uintptr_t)0)
-
-
-
 #define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING))
 
 #define CTX_LOCK_INIT(_sc, _name)  mtx_init(&(_sc)->ifc_mtx, _name, "iflib ctx lock", MTX_DEF)
 
 #define CTX_LOCK(ctx) mtx_lock(&(ctx)->ifc_mtx)
 #define CTX_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_mtx)
 #define CTX_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_mtx)
 
 
-#define TXDB_LOCK_INIT(txq)  mtx_init(&(txq)->ift_db_mtx, (txq)->ift_db_mtx_name, NULL, MTX_DEF)
-#define TXDB_TRYLOCK(txq) mtx_trylock(&(txq)->ift_db_mtx)
-#define TXDB_LOCK(txq) mtx_lock(&(txq)->ift_db_mtx)
-#define TXDB_UNLOCK(txq) mtx_unlock(&(txq)->ift_db_mtx)
-#define TXDB_LOCK_DESTROY(txq) mtx_destroy(&(txq)->ift_db_mtx)
-
 #define CALLOUT_LOCK(txq)	mtx_lock(&txq->ift_mtx)
 #define CALLOUT_UNLOCK(txq) 	mtx_unlock(&txq->ift_mtx)
 
 
 /* Our boot-time initialization hook */
 static int	iflib_module_event_handler(module_t, int, void *);
 
 static moduledata_t iflib_moduledata = {
 	"iflib",
 	iflib_module_event_handler,
 	NULL
 };
 
 DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY);
 MODULE_VERSION(iflib, 1);
 
 MODULE_DEPEND(iflib, pci, 1, 1, 1);
 MODULE_DEPEND(iflib, ether, 1, 1, 1);
 
+TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
 TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
 
 #ifndef IFLIB_DEBUG_COUNTERS
 #ifdef INVARIANTS
 #define IFLIB_DEBUG_COUNTERS 1
 #else
 #define IFLIB_DEBUG_COUNTERS 0
 #endif /* !INVARIANTS */
 #endif
 
 static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD, 0,
                    "iflib driver parameters");
 
 /*
  * XXX need to ensure that this can't accidentally cause the head to be moved backwards 
  */
 static int iflib_min_tx_latency = 0;
-
 SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW,
-		   &iflib_min_tx_latency, 0, "minimize transmit latency at the possibel expense of throughput");
+		   &iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput");
+static int iflib_no_tx_batch = 0;
+SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW,
+		   &iflib_no_tx_batch, 0, "minimize transmit latency at the possible expense of throughput");
 
 
 #if IFLIB_DEBUG_COUNTERS
 
 static int iflib_tx_seen;
 static int iflib_tx_sent;
 static int iflib_tx_encap;
 static int iflib_rx_allocs;
 static int iflib_fl_refills;
 static int iflib_fl_refills_large;
 static int iflib_tx_frees;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD,
 		   &iflib_tx_seen, 0, "# tx mbufs seen");
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD,
 		   &iflib_tx_sent, 0, "# tx mbufs sent");
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD,
 		   &iflib_tx_encap, 0, "# tx mbufs encapped");
 SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD,
 		   &iflib_tx_frees, 0, "# tx frees");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD,
 		   &iflib_rx_allocs, 0, "# rx allocations");
 SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD,
 		   &iflib_fl_refills, 0, "# refills");
 SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD,
 		   &iflib_fl_refills_large, 0, "# large refills");
 
 
 static int iflib_txq_drain_flushing;
 static int iflib_txq_drain_oactive;
 static int iflib_txq_drain_notready;
 static int iflib_txq_drain_encapfail;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD,
 		   &iflib_txq_drain_flushing, 0, "# drain flushes");
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD,
 		   &iflib_txq_drain_oactive, 0, "# drain oactives");
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD,
 		   &iflib_txq_drain_notready, 0, "# drain notready");
 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_encapfail, CTLFLAG_RD,
 		   &iflib_txq_drain_encapfail, 0, "# drain encap fails");
 
 
 static int iflib_encap_load_mbuf_fail;
+static int iflib_encap_pad_mbuf_fail;
 static int iflib_encap_txq_avail_fail;
 static int iflib_encap_txd_encap_fail;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD,
 		   &iflib_encap_load_mbuf_fail, 0, "# busdma load failures");
+SYSCTL_INT(_net_iflib, OID_AUTO, encap_pad_mbuf_fail, CTLFLAG_RD,
+		   &iflib_encap_pad_mbuf_fail, 0, "# runt frame pad failures");
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD,
 		   &iflib_encap_txq_avail_fail, 0, "# txq avail failures");
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD,
 		   &iflib_encap_txd_encap_fail, 0, "# driver encap failures");
 
 static int iflib_task_fn_rxs;
 static int iflib_rx_intr_enables;
 static int iflib_fast_intrs;
 static int iflib_intr_link;
 static int iflib_intr_msix; 
 static int iflib_rx_unavail;
 static int iflib_rx_ctx_inactive;
 static int iflib_rx_zero_len;
 static int iflib_rx_if_input;
 static int iflib_rx_mbuf_null;
 static int iflib_rxd_flush;
 
 static int iflib_verbose_debug;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, intr_link, CTLFLAG_RD,
 		   &iflib_intr_link, 0, "# intr link calls");
 SYSCTL_INT(_net_iflib, OID_AUTO, intr_msix, CTLFLAG_RD,
 		   &iflib_intr_msix, 0, "# intr msix calls");
 SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD,
 		   &iflib_task_fn_rxs, 0, "# task_fn_rx calls");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD,
 		   &iflib_rx_intr_enables, 0, "# rx intr enables");
 SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD,
 		   &iflib_fast_intrs, 0, "# fast_intr calls");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD,
 		   &iflib_rx_unavail, 0, "# times rxeof called with no available data");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD,
 		   &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_zero_len, CTLFLAG_RD,
 		   &iflib_rx_zero_len, 0, "# times rxeof saw zero len mbuf");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD,
 		   &iflib_rx_if_input, 0, "# times rxeof called if_input");
 SYSCTL_INT(_net_iflib, OID_AUTO, rx_mbuf_null, CTLFLAG_RD,
 		   &iflib_rx_mbuf_null, 0, "# times rxeof got null mbuf");
 SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD,
 	         &iflib_rxd_flush, 0, "# times rxd_flush called");
 SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
 		   &iflib_verbose_debug, 0, "enable verbose debugging");
 
 #define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1)
+static void
+iflib_debug_reset(void)
+{
+	iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs =
+		iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees =
+		iflib_txq_drain_flushing = iflib_txq_drain_oactive =
+		iflib_txq_drain_notready = iflib_txq_drain_encapfail =
+		iflib_encap_load_mbuf_fail = iflib_encap_pad_mbuf_fail =
+		iflib_encap_txq_avail_fail = iflib_encap_txd_encap_fail =
+		iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs =
+		iflib_intr_link = iflib_intr_msix = iflib_rx_unavail =
+		iflib_rx_ctx_inactive = iflib_rx_zero_len = iflib_rx_if_input =
+		iflib_rx_mbuf_null = iflib_rxd_flush = 0;
+}
 
 #else
 #define DBG_COUNTER_INC(name)
-
+static void iflib_debug_reset(void) {}
 #endif
 
 
 
 #define IFLIB_DEBUG 0
 
 static void iflib_tx_structures_free(if_ctx_t ctx);
 static void iflib_rx_structures_free(if_ctx_t ctx);
 static int iflib_queues_alloc(if_ctx_t ctx);
 static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq);
-static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, int cidx, int budget);
+static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget);
 static int iflib_qset_structures_setup(if_ctx_t ctx);
 static int iflib_msix_init(if_ctx_t ctx);
 static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, char *str);
 static void iflib_txq_check_drain(iflib_txq_t txq, int budget);
 static uint32_t iflib_txq_can_drain(struct ifmp_ring *);
 static int iflib_register(if_ctx_t);
 static void iflib_init_locked(if_ctx_t ctx);
 static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
 static void iflib_add_device_sysctl_post(if_ctx_t ctx);
+static void iflib_ifmp_purge(iflib_txq_t txq);
+static void _iflib_pre_assert(if_softc_ctx_t scctx);
+static void iflib_stop(if_ctx_t ctx);
+static void iflib_if_init_locked(if_ctx_t ctx);
+#ifndef __NO_STRICT_ALIGNMENT
+static struct mbuf * iflib_fixup_rx(struct mbuf *m);
+#endif
 
-
 #ifdef DEV_NETMAP
 #include <sys/selinfo.h>
 #include <net/netmap.h>
 #include <dev/netmap/netmap_kern.h>
 
 MODULE_DEPEND(iflib, netmap, 1, 1, 1);
 
+static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, uint32_t nm_i, bool init);
+
 /*
  * device-specific sysctl variables:
  *
  * iflib_crcstrip: 0: keep CRC in rx frames (default), 1: strip it.
  *	During regular operations the CRC is stripped, but on some
  *	hardware reception of frames not multiple of 64 is slower,
  *	so using crcstrip=0 helps in benchmarks.
  *
  * iflib_rx_miss, iflib_rx_miss_bufs:
  *	count packets that might be missed due to lost interrupts.
  */
 SYSCTL_DECL(_dev_netmap);
 /*
  * The xl driver by default strips CRCs and we do not override it.
  */
 
 int iflib_crcstrip = 1;
 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip,
     CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on rx frames");
 
 int iflib_rx_miss, iflib_rx_miss_bufs;
 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss,
     CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed rx intr");
 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs,
     CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed rx intr bufs");
 
 /*
  * Register/unregister. We are already under netmap lock.
  * Only called on the first register or the last unregister.
  */
 static int
 iflib_netmap_register(struct netmap_adapter *na, int onoff)
 {
 	struct ifnet *ifp = na->ifp;
 	if_ctx_t ctx = ifp->if_softc;
+	int status;
 
 	CTX_LOCK(ctx);
 	IFDI_INTR_DISABLE(ctx);
 
 	/* Tell the stack that the interface is no longer active */
 	ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
 
 	if (!CTX_IS_VF(ctx))
-		IFDI_CRCSTRIP_SET(ctx, onoff);
+		IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip);
 
 	/* enable or disable flags and callbacks in na and ifp */
 	if (onoff) {
 		nm_set_native_flags(na);
 	} else {
 		nm_clear_native_flags(na);
 	}
-	IFDI_INIT(ctx);
-	IFDI_CRCSTRIP_SET(ctx, onoff); // XXX why twice ?
+	iflib_stop(ctx);
+	iflib_init_locked(ctx);
+	IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ?
+	status = ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1;
+	if (status)
+		nm_clear_native_flags(na);
 	CTX_UNLOCK(ctx);
-	return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1);
+	return (status);
 }
 
+/* Hand netmap buffers from slot nm_i up to one before rhead to free list 0 via isc_rxd_refill, in batches of at most IFLIB_MAX_RX_REFRESH. */
+static int
+netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, uint32_t nm_i, bool init)
+{
+	struct netmap_adapter *na = kring->na;
+	u_int const lim = kring->nkr_num_slots - 1;
+	u_int head = kring->rhead;
+	struct netmap_ring *ring = kring->ring;
+	bus_dmamap_t *map;
+	struct if_rxd_update iru;
+	if_ctx_t ctx = rxq->ifr_ctx;
+	iflib_fl_t fl = &rxq->ifr_fl[0];
+	uint32_t refill_pidx, nic_i;
+
+	if (nm_i == head && __predict_true(!init))
+		return 0;
+	iru_init(&iru, rxq, 0 /* flid */);
+	map = fl->ifl_sds.ifsd_map;
+	refill_pidx = netmap_idx_k2n(kring, nm_i);
+	/*
+	 * IMPORTANT: we must leave one free slot in the ring,
+	 * so move head back by one unit
+	 */
+	head = nm_prev(head, lim);
+	nic_i = UINT_MAX;	/* sentinel: stays UINT_MAX when the loop below never runs */
+	while (nm_i != head) {
+		for (int tmp_pidx = 0; tmp_pidx < IFLIB_MAX_RX_REFRESH && nm_i != head; tmp_pidx++) {
+			struct netmap_slot *slot = &ring->slot[nm_i];
+			void *addr = PNMB(na, slot, &fl->ifl_bus_addrs[tmp_pidx]);
+			uint32_t nic_i_dma = refill_pidx;
+			nic_i = netmap_idx_k2n(kring, nm_i);
+			MPASS(tmp_pidx < IFLIB_MAX_RX_REFRESH);
+			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
+			        return netmap_ring_reinit(kring);
+
+			fl->ifl_vm_addrs[tmp_pidx] = addr;
+			if (__predict_false(init) && map) {
+				netmap_load_map(na, fl->ifl_ifdi->idi_tag, map[nic_i], addr);
+			} else if (map && (slot->flags & NS_BUF_CHANGED)) {
+				/* buffer has changed, reload map */
+				netmap_reload_map(na, fl->ifl_ifdi->idi_tag, map[nic_i], addr);
+			}
+			slot->flags &= ~NS_BUF_CHANGED;
+
+			nm_i = nm_next(nm_i, lim);
+			fl->ifl_rxd_idxs[tmp_pidx] = nic_i = nm_next(nic_i, lim);
+			if (nm_i != head && tmp_pidx < IFLIB_MAX_RX_REFRESH-1)
+				continue;
+
+			iru.iru_pidx = refill_pidx;
+			iru.iru_count = tmp_pidx+1;
+			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
+
+			refill_pidx = nic_i;
+			if (map == NULL)
+				continue;
+
+			for (int n = 0; n < iru.iru_count; n++) {
+				bus_dmamap_sync(fl->ifl_ifdi->idi_tag, map[nic_i_dma],
+						BUS_DMASYNC_PREREAD);
+				/* XXX - change this to not use the netmap func*/
+				nic_i_dma = nm_next(nic_i_dma, lim);
+			}
+		}
+	}
+	kring->nr_hwcur = head;
+	if (map)
+		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
+				BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+	if (nic_i != UINT_MAX)	/* nothing refilled: do not pass an indeterminate index to the driver */
+		ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i);
+	return (0);
+}
+
 /*
  * Reconcile kernel and user view of the transmit ring.
  *
  * All information is in the kring.
  * Userspace wants to send packets up to the one before kring->rhead,
  * kernel knows kring->nr_hwcur is the first unsent packet.
  *
  * Here we push packets out (as many as possible), and possibly
  * reclaim buffers from previously completed transmission.
  *
  * The caller (netmap) guarantees that there is only one instance
  * running at any time. Any interference with other driver
  * methods should be handled by the individual drivers.
  */
 static int
 iflib_netmap_txsync(struct netmap_kring *kring, int flags)
 {
 	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap ring */
 	u_int nic_i;	/* index into the NIC ring */
 	u_int n;
 	u_int const lim = kring->nkr_num_slots - 1;
 	u_int const head = kring->rhead;
 	struct if_pkt_info pi;
 
 	/*
 	 * interrupts on every tx packet are expensive so request
 	 * them every half ring, or where NS_REPORT is set
 	 */
 	u_int report_frequency = kring->nkr_num_slots >> 1;
 	/* device-specific */
 	if_ctx_t ctx = ifp->if_softc;
 	iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id];
 
-	pi.ipi_segs = txq->ift_segs;
-	pi.ipi_qsidx = kring->ring_id;
-	pi.ipi_ndescs = 0;
+	if (txq->ift_sds.ifsd_map)
+		bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map,
+				BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
-	bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map,
-					BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
-
 	/*
 	 * First part: process new packets to send.
 	 * nm_i is the current index in the netmap ring,
 	 * nic_i is the corresponding index in the NIC ring.
 	 *
 	 * If we have packets to send (nm_i != head)
 	 * iterate over the netmap ring, fetch length and update
 	 * the corresponding slot in the NIC ring. Some drivers also
 	 * need to update the buffer's physical address in the NIC slot
 	 * even NS_BUF_CHANGED is not set (PNMB computes the addresses).
 	 *
 	 * The netmap_reload_map() calls is especially expensive,
 	 * even when (as in this case) the tag is 0, so do only
 	 * when the buffer has actually changed.
 	 *
 	 * If possible do not set the report/intr bit on all slots,
 	 * but only a few times per ring or when NS_REPORT is set.
 	 *
 	 * Finally, on 10G and faster drivers, it might be useful
 	 * to prefetch the next slot and txr entry.
 	 */
 
-	nm_i = kring->nr_hwcur;
+	nm_i = netmap_idx_n2k(kring, kring->nr_hwcur);
+	pkt_info_zero(&pi);
+	pi.ipi_segs = txq->ift_segs;
+	pi.ipi_qsidx = kring->ring_id;
 	if (nm_i != head) {	/* we have new packets to send */
 		nic_i = netmap_idx_k2n(kring, nm_i);
 
 		__builtin_prefetch(&ring->slot[nm_i]);
 		__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]);
-		__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]);
+		if (txq->ift_sds.ifsd_map)
+			__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]);
 
 		for (n = 0; nm_i != head; n++) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			u_int len = slot->len;
 			uint64_t paddr;
 			void *addr = PNMB(na, slot, &paddr);
 			int flags = (slot->flags & NS_REPORT ||
 				nic_i == 0 || nic_i == report_frequency) ?
 				IPI_TX_INTR : 0;
 
 			/* device-specific */
+			pi.ipi_len = len;
+			pi.ipi_segs[0].ds_addr = paddr;
+			pi.ipi_segs[0].ds_len = len;
+			pi.ipi_nsegs = 1;
+			pi.ipi_ndescs = 0;
 			pi.ipi_pidx = nic_i;
 			pi.ipi_flags = flags;
 
 			/* Fill the slot in the NIC ring. */
 			ctx->isc_txd_encap(ctx->ifc_softc, &pi);
 
 			/* prefetch for next round */
 			__builtin_prefetch(&ring->slot[nm_i + 1]);
 			__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]);
-			__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]);
+			if (txq->ift_sds.ifsd_map) {
+				__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]);
 
-			NM_CHECK_ADDR_LEN(na, addr, len);
+				NM_CHECK_ADDR_LEN(na, addr, len);
 
-			if (slot->flags & NS_BUF_CHANGED) {
-				/* buffer has changed, reload map */
-				netmap_reload_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[nic_i], addr);
+				if (slot->flags & NS_BUF_CHANGED) {
+					/* buffer has changed, reload map */
+					netmap_reload_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[nic_i], addr);
+				}
+				/* make sure changes to the buffer are synced */
+				bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_sds.ifsd_map[nic_i],
+						BUS_DMASYNC_PREWRITE);
 			}
 			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
-
-			/* make sure changes to the buffer are synced */
-			bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_sds.ifsd_map[nic_i],
-							BUS_DMASYNC_PREWRITE);
-
 			nm_i = nm_next(nm_i, lim);
 			nic_i = nm_next(nic_i, lim);
 		}
 		kring->nr_hwcur = head;
 
 		/* synchronize the NIC ring */
-		bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map,
+		if (txq->ift_sds.ifsd_map)
+			bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map,
 						BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
 		/* (re)start the tx unit up to slot nic_i (excluded) */
 		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i);
 	}
 
 	/*
 	 * Second part: reclaim buffers for completed transmissions.
 	 */
 	if (iflib_tx_credits_update(ctx, txq)) {
 		/* some tx completed, increment avail */
 		nic_i = txq->ift_cidx_processed;
 		kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
 	}
 	return (0);
 }
 
 /*
  * Reconcile kernel and user view of the receive ring.
  * Same as for the txsync, this routine must be efficient.
  * The caller guarantees a single invocations, but races against
  * the rest of the driver should be handled here.
  *
  * On call, kring->rhead is the first packet that userspace wants
  * to keep, and kring->rcur is the wakeup point.
  * The kernel has previously reported packets up to kring->rtail.
  *
  * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
  * of whether or not we received an interrupt.
  */
 static int
 iflib_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
 	struct netmap_adapter *na = kring->na;
-	struct ifnet *ifp = na->ifp;
 	struct netmap_ring *ring = kring->ring;
-	u_int nm_i;	/* index into the netmap ring */
-	u_int nic_i;	/* index into the NIC ring */
+	uint32_t nm_i;	/* index into the netmap ring */
+	uint32_t nic_i;	/* index into the NIC ring */
 	u_int i, n;
 	u_int const lim = kring->nkr_num_slots - 1;
-	u_int const head = kring->rhead;
+	u_int const head = kring->rhead;	/* rhead is already a netmap ring index; do not run it through n2k */
 	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
 	struct if_rxd_info ri;
-	/* device-specific */
+
+	struct ifnet *ifp = na->ifp;
 	if_ctx_t ctx = ifp->if_softc;
 	iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id];
 	iflib_fl_t fl = rxq->ifr_fl;
 	if (head > lim)
 		return netmap_ring_reinit(kring);
 
-	bzero(&ri, sizeof(ri));
-	ri.iri_qsidx = kring->ring_id;
-	ri.iri_ifp = ctx->ifc_ifp;
 	/* XXX check sync modes */
-	for (i = 0, fl = rxq->ifr_fl; i < rxq->ifr_nfl; i++, fl++)
+	for (i = 0, fl = rxq->ifr_fl; i < rxq->ifr_nfl; i++, fl++) {
+		if (fl->ifl_sds.ifsd_map == NULL)
+			continue;
 		bus_dmamap_sync(rxq->ifr_fl[i].ifl_desc_tag, fl->ifl_ifdi->idi_map,
 				BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
-
+	}
 	/*
 	 * First part: import newly received packets.
 	 *
 	 * nm_i is the index of the next free slot in the netmap ring,
 	 * nic_i is the index of the next received packet in the NIC ring,
 	 * and they may differ in case if_init() has been called while
 	 * in netmap mode. For the receive ring we have
 	 *
 	 *	nic_i = rxr->next_check;
 	 *	nm_i = kring->nr_hwtail (previous)
 	 * and
 	 *	nm_i == (nic_i + kring->nkr_hwofs) % ring_size
 	 *
 	 * rxr->next_check is set to 0 on a ring reinit
 	 */
 	if (netmap_no_pendintr || force_update) {
 		int crclen = iflib_crcstrip ? 0 : 4;
 		int error, avail;
 		uint16_t slot_flags = kring->nkr_slot_flags;
 
-		for (fl = rxq->ifr_fl, i = 0; i < rxq->ifr_nfl; i++, fl++) {
+		for (i = 0; i < rxq->ifr_nfl; i++) {
+			fl = &rxq->ifr_fl[i];
 			nic_i = fl->ifl_cidx;
 			nm_i = netmap_idx_n2k(kring, nic_i);
-			avail = ctx->isc_rxd_available(ctx->ifc_softc, kring->ring_id, nic_i, INT_MAX);
+			avail = iflib_rxd_avail(ctx, rxq, nic_i, USHRT_MAX);
 			for (n = 0; avail > 0; n++, avail--) {
+				rxd_info_zero(&ri);
+				ri.iri_frags = rxq->ifr_frags;
+				ri.iri_qsidx = kring->ring_id;
+				ri.iri_ifp = ctx->ifc_ifp;
+				ri.iri_cidx = nic_i;
+
 				error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
-				if (error)
-					ring->slot[nm_i].len = 0;
-				else
-					ring->slot[nm_i].len = ri.iri_len - crclen;
+				ring->slot[nm_i].len = error ? 0 : ri.iri_len - crclen;
 				ring->slot[nm_i].flags = slot_flags;
-				bus_dmamap_sync(fl->ifl_ifdi->idi_tag,
-								fl->ifl_sds[nic_i].ifsd_map, BUS_DMASYNC_POSTREAD);
+				if (fl->ifl_sds.ifsd_map)
+					bus_dmamap_sync(fl->ifl_ifdi->idi_tag,
+							fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD);
 				nm_i = nm_next(nm_i, lim);
 				nic_i = nm_next(nic_i, lim);
 			}
 			if (n) { /* update the state variables */
 				if (netmap_no_pendintr && !force_update) {
 					/* diagnostics */
 					iflib_rx_miss ++;
 					iflib_rx_miss_bufs += n;
 				}
 				fl->ifl_cidx = nic_i;
-				kring->nr_hwtail = nm_i;
+				kring->nr_hwtail = nm_i;	/* nr_hwtail lives in netmap index space, as nm_i does */
 			}
 			kring->nr_kflags &= ~NKR_PENDINTR;
 		}
 	}
 	/*
 	 * Second part: skip past packets that userspace has released.
 	 * (kring->nr_hwcur to head excluded),
 	 * and make the buffers available for reception.
 	 * As usual nm_i is the index in the netmap ring,
 	 * nic_i is the index in the NIC ring, and
 	 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size
 	 */
 	/* XXX not sure how this will work with multiple free lists */
-	nm_i = kring->nr_hwcur;
-	if (nm_i != head) {
-		nic_i = netmap_idx_k2n(kring, nm_i);
-		for (n = 0; nm_i != head; n++) {
-			struct netmap_slot *slot = &ring->slot[nm_i];
-			uint64_t paddr;
-			caddr_t vaddr;
-			void *addr = PNMB(na, slot, &paddr);
+	nm_i = kring->nr_hwcur;	/* already a netmap ring index; netmap_fl_refill() converts to NIC space itself */
 
-			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
-				goto ring_reset;
+	return (netmap_fl_refill(rxq, kring, nm_i, false));
+}
 
-			vaddr = addr;
-			if (slot->flags & NS_BUF_CHANGED) {
-				/* buffer has changed, reload map */
-				netmap_reload_map(na, fl->ifl_ifdi->idi_tag, fl->ifl_sds[nic_i].ifsd_map, addr);
-				slot->flags &= ~NS_BUF_CHANGED;
-			}
-			/*
-			 * XXX we should be batching this operation - TODO
-			 */
-			ctx->isc_rxd_refill(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i, &paddr, &vaddr, 1, fl->ifl_buf_size);
-			bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_sds[nic_i].ifsd_map,
-			    BUS_DMASYNC_PREREAD);
-			nm_i = nm_next(nm_i, lim);
-			nic_i = nm_next(nic_i, lim);
-		}
-		kring->nr_hwcur = head;
+static void
+iflib_netmap_intr(struct netmap_adapter *na, int onoff)
+{
+	struct ifnet *ifp = na->ifp;
+	if_ctx_t ctx = ifp->if_softc;
 
-		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
-		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
-		/*
-		 * IMPORTANT: we must leave one free slot in the ring,
-		 * so move nic_i back by one unit
-		 */
-		nic_i = nm_prev(nic_i, lim);
-		ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i);
+	CTX_LOCK(ctx);
+	if (onoff) {
+		IFDI_INTR_ENABLE(ctx);
+	} else {
+		IFDI_INTR_DISABLE(ctx);
 	}
-
-	return 0;
-
-ring_reset:
-	return netmap_ring_reinit(kring);
+	CTX_UNLOCK(ctx);
 }
 
+
 static int
 iflib_netmap_attach(if_ctx_t ctx)
 {
 	struct netmap_adapter na;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 
 	bzero(&na, sizeof(na));
 
 	na.ifp = ctx->ifc_ifp;
 	na.na_flags = NAF_BDG_MAYSLEEP;
 	MPASS(ctx->ifc_softc_ctx.isc_ntxqsets);
 	MPASS(ctx->ifc_softc_ctx.isc_nrxqsets);
 
 	na.num_tx_desc = scctx->isc_ntxd[0];
 	na.num_rx_desc = scctx->isc_nrxd[0];
 	na.nm_txsync = iflib_netmap_txsync;
 	na.nm_rxsync = iflib_netmap_rxsync;
 	na.nm_register = iflib_netmap_register;
+	na.nm_intr = iflib_netmap_intr;
 	na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
 	na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
 	return (netmap_attach(&na));
 }
 
 static void
 iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq)
 {
 	struct netmap_adapter *na = NA(ctx->ifc_ifp);
 	struct netmap_slot *slot;
 
 	slot = netmap_reset(na, NR_TX, txq->ift_id, 0);
 	if (slot == NULL)
 		return;
+	if (txq->ift_sds.ifsd_map == NULL)
+		return;
 
 	for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) {
 
 		/*
 		 * In netmap mode, set the map for the packet buffer.
 		 * NOTE: Some drivers (not this one) also need to set
 		 * the physical buffer address in the NIC ring.
 		 * netmap_idx_n2k() maps a nic index, i, into the corresponding
 		 * netmap slot index, si
 		 */
 		int si = netmap_idx_n2k(&na->tx_rings[txq->ift_id], i);
 		netmap_load_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[i], NMB(na, slot + si));
 	}
 }
+
 static void
 iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq)
 {
 	struct netmap_adapter *na = NA(ctx->ifc_ifp);
+	struct netmap_kring *kring = &na->rx_rings[rxq->ifr_id];
 	struct netmap_slot *slot;
-	iflib_rxsd_t sd;
-	int nrxd;
+	uint32_t nm_i;
 
 	slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0);
 	if (slot == NULL)
 		return;
-	sd = rxq->ifr_fl[0].ifl_sds;
-	nrxd = ctx->ifc_softc_ctx.isc_nrxd[0];
-	for (int i = 0; i < nrxd; i++, sd++) {
-			int sj = netmap_idx_n2k(&na->rx_rings[rxq->ifr_id], i);
-			uint64_t paddr;
-			void *addr;
-			caddr_t vaddr;
-
-			vaddr = addr = PNMB(na, slot + sj, &paddr);
-			netmap_load_map(na, rxq->ifr_fl[0].ifl_ifdi->idi_tag, sd->ifsd_map, addr);
-			/* Update descriptor and the cached value */
-			ctx->isc_rxd_refill(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, i, &paddr, &vaddr, 1, rxq->ifr_fl[0].ifl_buf_size);
-	}
-	/* preserve queue */
-	if (ctx->ifc_ifp->if_capenable & IFCAP_NETMAP) {
-		struct netmap_kring *kring = &na->rx_rings[rxq->ifr_id];
-		int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring);
-		ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, t);
-	} else
-		ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, nrxd-1);
+	nm_i = netmap_idx_n2k(kring, 0);
+	netmap_fl_refill(rxq, kring, nm_i, true);
 }
 
 #define iflib_netmap_detach(ifp) netmap_detach(ifp)
 
 #else
 #define iflib_netmap_txq_init(ctx, txq)
 #define iflib_netmap_rxq_init(ctx, rxq)
 #define iflib_netmap_detach(ifp)
 
 #define iflib_netmap_attach(ctx) (0)
 #define netmap_rx_irq(ifp, qid, budget) (0)
+#define netmap_tx_irq(ifp, qid) do {} while (0)
 
 #endif
 
 #if defined(__i386__) || defined(__amd64__)
 static __inline void
 prefetch(void *x)
 {
 	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
 }
+static __inline void
+prefetch2cachelines(void *x)
+{
+	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
+#if (CACHE_LINE_SIZE < 128)
+	__asm volatile("prefetcht0 %0" :: "m" (*(((unsigned long *)x)+CACHE_LINE_SIZE/(sizeof(unsigned long)))));
+#endif
+}
 #else
 #define prefetch(x)
+#define prefetch2cachelines(x)
 #endif
 
 static void
+iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid)
+{
+	iflib_fl_t fl;
+
+	fl = &rxq->ifr_fl[flid];
+	iru->iru_paddrs = fl->ifl_bus_addrs;
+	iru->iru_vaddrs = &fl->ifl_vm_addrs[0];
+	iru->iru_idxs = fl->ifl_rxd_idxs;
+	iru->iru_qsidx = rxq->ifr_id;
+	iru->iru_buf_size = fl->ifl_buf_size;
+	iru->iru_flidx = fl->ifl_id;
+}
+
+static void
 _iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
 {
 	if (err)
 		return;
 	*(bus_addr_t *) arg = segs[0].ds_addr;
 }
 
 int
 iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags)
 {
 	int err;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	device_t dev = ctx->ifc_dev;
 
 	KASSERT(sctx->isc_q_align != 0, ("alignment value not initialized"));
 
 	err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
 				sctx->isc_q_align, 0,	/* alignment, bounds */
 				BUS_SPACE_MAXADDR,	/* lowaddr */
 				BUS_SPACE_MAXADDR,	/* highaddr */
 				NULL, NULL,		/* filter, filterarg */
 				size,			/* maxsize */
 				1,			/* nsegments */
 				size,			/* maxsegsize */
 				BUS_DMA_ALLOCNOW,	/* flags */
 				NULL,			/* lockfunc */
 				NULL,			/* lockarg */
 				&dma->idi_tag);
 	if (err) {
 		device_printf(dev,
 		    "%s: bus_dma_tag_create failed: %d\n",
 		    __func__, err);
 		goto fail_0;
 	}
 
 	err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr,
 	    BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map);
 	if (err) {
 		device_printf(dev,
 		    "%s: bus_dmamem_alloc(%ju) failed: %d\n",
 		    __func__, (uintmax_t)size, err);
 		goto fail_1;
 	}
 
 	dma->idi_paddr = IF_BAD_DMA;
 	err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr,
 	    size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT);
 	if (err || dma->idi_paddr == IF_BAD_DMA) {
 		device_printf(dev,
 		    "%s: bus_dmamap_load failed: %d\n",
 		    __func__, err);
 		goto fail_2;
 	}
 
 	dma->idi_size = size;
 	return (0);
 
 fail_2:
 	bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
 fail_1:
 	bus_dma_tag_destroy(dma->idi_tag);
 fail_0:
 	dma->idi_tag = NULL;
 
 	return (err);
 }
 
 int
 iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count)
 {
 	int i, err;
 	iflib_dma_info_t *dmaiter;
 
 	dmaiter = dmalist;
 	for (i = 0; i < count; i++, dmaiter++) {
 		if ((err = iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0)
 			break;
 	}
 	if (err)
 		iflib_dma_free_multi(dmalist, i);
 	return (err);
 }
 
 void
 iflib_dma_free(iflib_dma_info_t dma)
 {
 	if (dma->idi_tag == NULL)
 		return;
 	if (dma->idi_paddr != IF_BAD_DMA) {
 		bus_dmamap_sync(dma->idi_tag, dma->idi_map,
 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(dma->idi_tag, dma->idi_map);
 		dma->idi_paddr = IF_BAD_DMA;
 	}
 	if (dma->idi_vaddr != NULL) {
 		bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
 		dma->idi_vaddr = NULL;
 	}
 	bus_dma_tag_destroy(dma->idi_tag);
 	dma->idi_tag = NULL;
 }
 
 void
 iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count)
 {
 	int i;
 	iflib_dma_info_t *dmaiter = dmalist;
 
 	for (i = 0; i < count; i++, dmaiter++)
 		iflib_dma_free(*dmaiter);
 }
 
+#ifdef EARLY_AP_STARTUP
+static const int iflib_started = 1;
+#else
+/*
+ * We used to abuse the smp_started flag to decide if the queues have been
+ * fully initialized (by late taskqgroup_adjust() calls in a SYSINIT()).
+ * That gave bad races, since the SYSINIT() runs strictly after smp_started
+ * is set.  Run a SYSINIT() strictly after that to just set a usable
+ * completion flag.
+ */
+
+static int iflib_started;
+
+static void
+iflib_record_started(void *arg)
+{
+	iflib_started = 1;
+}
+
+SYSINIT(iflib_record_started, SI_SUB_SMP + 1, SI_ORDER_FIRST,
+	iflib_record_started, NULL);
+#endif
+
 static int
 iflib_fast_intr(void *arg)
 {
 	iflib_filter_info_t info = arg;
 	struct grouptask *gtask = info->ifi_task;
+	if (!iflib_started)
+		return (FILTER_HANDLED);
 
 	DBG_COUNTER_INC(fast_intrs);
 	if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED)
 		return (FILTER_HANDLED);
 
 	GROUPTASK_ENQUEUE(gtask);
 	return (FILTER_HANDLED);
 }
 
 static int
+iflib_fast_intr_rxtx(void *arg)
+{
+	iflib_filter_info_t info = arg;
+	struct grouptask *gtask = info->ifi_task;
+	iflib_rxq_t rxq = (iflib_rxq_t)info->ifi_ctx;
+	if_ctx_t ctx;
+	int i, cidx;
+
+	if (!iflib_started)
+		return (FILTER_HANDLED);
+
+	DBG_COUNTER_INC(fast_intrs);
+	if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED)
+		return (FILTER_HANDLED);
+
+	for (i = 0; i < rxq->ifr_ntxqirq; i++) {
+		qidx_t txqid = rxq->ifr_txqid[i];
+
+		ctx = rxq->ifr_ctx;
+
+		if (!ctx->isc_txd_credits_update(ctx->ifc_softc, txqid, false)) {
+			IFDI_TX_QUEUE_INTR_ENABLE(ctx, txqid);
+			continue;
+		}
+		GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task);
+	}
+	if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_RXCQ)
+		cidx = rxq->ifr_cq_cidx;
+	else
+		cidx = rxq->ifr_fl[0].ifl_cidx;
+	if (iflib_rxd_avail(ctx, rxq, cidx, 1))
+		GROUPTASK_ENQUEUE(gtask);
+	else
+		IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
+	return (FILTER_HANDLED);
+}
+
+
+static int
+iflib_fast_intr_ctx(void *arg)
+{
+	iflib_filter_info_t info = arg;
+	struct grouptask *gtask = info->ifi_task;
+
+	if (!iflib_started)
+		return (FILTER_HANDLED);
+
+	DBG_COUNTER_INC(fast_intrs);
+	if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED)
+		return (FILTER_HANDLED);
+
+	GROUPTASK_ENQUEUE(gtask);
+	return (FILTER_HANDLED);
+}
+
+static int
 _iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
 	driver_filter_t filter, driver_intr_t handler, void *arg,
 				 char *name)
 {
-	int rc;
+	int rc, flags;
 	struct resource *res;
-	void *tag;
+	void *tag = NULL;
 	device_t dev = ctx->ifc_dev;
 
+	flags = RF_ACTIVE;
+	if (ctx->ifc_flags & IFC_LEGACY)
+		flags |= RF_SHAREABLE;
 	MPASS(rid < 512);
 	irq->ii_rid = rid;
-	res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &irq->ii_rid,
-				     RF_SHAREABLE | RF_ACTIVE);
+	res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &irq->ii_rid, flags);
 	if (res == NULL) {
 		device_printf(dev,
 		    "failed to allocate IRQ for rid %d, name %s.\n", rid, name);
 		return (ENOMEM);
 	}
 	irq->ii_res = res;
 	KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL"));
 	rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET,
 						filter, handler, arg, &tag);
 	if (rc != 0) {
 		device_printf(dev,
 		    "failed to setup interrupt for rid %d, name %s: %d\n",
 					  rid, name ? name : "unknown", rc);
 		return (rc);
 	} else if (name)
 		bus_describe_intr(dev, res, tag, "%s", name);
 
 	irq->ii_tag = tag;
 	return (0);
 }
 
 
 /*********************************************************************
  *
  *  Allocate memory for tx_buffer structures. The tx_buffer stores all
  *  the information needed to transmit a packet on the wire. This is
  *  called only once at attach, setup is done every reset.
  *
  **********************************************************************/
 
 static int
 iflib_txsd_alloc(iflib_txq_t txq)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	int err, nsegments, ntsosegments;
 
 	nsegments = scctx->isc_tx_nsegments;
 	ntsosegments = scctx->isc_tx_tso_segments_max;
 	MPASS(scctx->isc_ntxd[0] > 0);
 	MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0);
 	MPASS(nsegments > 0);
 	MPASS(ntsosegments > 0);
 	/*
 	 * Setup DMA descriptor areas.
 	 */
 	if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
 			       1, 0,			/* alignment, bounds */
 			       BUS_SPACE_MAXADDR,	/* lowaddr */
 			       BUS_SPACE_MAXADDR,	/* highaddr */
 			       NULL, NULL,		/* filter, filterarg */
 			       sctx->isc_tx_maxsize,		/* maxsize */
 			       nsegments,	/* nsegments */
 			       sctx->isc_tx_maxsegsize,	/* maxsegsize */
 			       0,			/* flags */
 			       NULL,			/* lockfunc */
 			       NULL,			/* lockfuncarg */
 			       &txq->ift_desc_tag))) {
 		device_printf(dev,"Unable to allocate TX DMA tag: %d\n", err);
 		device_printf(dev,"maxsize: %ju nsegments: %d maxsegsize: %ju\n",
 		    (uintmax_t)sctx->isc_tx_maxsize, nsegments, (uintmax_t)sctx->isc_tx_maxsegsize);
 		goto fail;
 	}
-#ifdef IFLIB_DIAGNOSTICS
-	device_printf(dev,"maxsize: %zd nsegments: %d maxsegsize: %zd\n",
-		      sctx->isc_tx_maxsize, nsegments, sctx->isc_tx_maxsegsize);
-
-#endif
 	if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
 			       1, 0,			/* alignment, bounds */
 			       BUS_SPACE_MAXADDR,	/* lowaddr */
 			       BUS_SPACE_MAXADDR,	/* highaddr */
 			       NULL, NULL,		/* filter, filterarg */
 			       scctx->isc_tx_tso_size_max,		/* maxsize */
 			       ntsosegments,	/* nsegments */
 			       scctx->isc_tx_tso_segsize_max,	/* maxsegsize */
 			       0,			/* flags */
 			       NULL,			/* lockfunc */
 			       NULL,			/* lockfuncarg */
 			       &txq->ift_tso_desc_tag))) {
 		device_printf(dev,"Unable to allocate TX TSO DMA tag: %d\n", err);
 
 		goto fail;
 	}
-#ifdef IFLIB_DIAGNOSTICS
-	device_printf(dev,"TSO maxsize: %d ntsosegments: %d maxsegsize: %d\n",
-		      scctx->isc_tx_tso_size_max, ntsosegments,
-		      scctx->isc_tx_tso_segsize_max);
-#endif
 	if (!(txq->ift_sds.ifsd_flags =
 	    (uint8_t *) malloc(sizeof(uint8_t) *
 	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate tx_buffer memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 	if (!(txq->ift_sds.ifsd_m =
 	    (struct mbuf **) malloc(sizeof(struct mbuf *) *
 	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate tx_buffer memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 
         /* Create the descriptor buffer dma maps */
-#if defined(ACPI_DMAR) || (!(defined(__i386__) && !defined(__amd64__)))
+#if defined(ACPI_DMAR) || (! (defined(__i386__) || defined(__amd64__)))
 	if ((ctx->ifc_flags & IFC_DMAR) == 0)
 		return (0);
 
 	if (!(txq->ift_sds.ifsd_map =
 	    (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate tx_buffer map memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 
 	for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) {
 		err = bus_dmamap_create(txq->ift_desc_tag, 0, &txq->ift_sds.ifsd_map[i]);
 		if (err != 0) {
 			device_printf(dev, "Unable to create TX DMA map\n");
 			goto fail;
 		}
 	}
 #endif
 	return (0);
 fail:
 	/* We free all, it handles case where we are in the middle */
 	iflib_tx_structures_free(ctx);
 	return (err);
 }
 
 static void
 iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i)
 {
 	bus_dmamap_t map;
 
 	map = NULL;
 	if (txq->ift_sds.ifsd_map != NULL)
 		map = txq->ift_sds.ifsd_map[i];
 	if (map != NULL) {
 		bus_dmamap_unload(txq->ift_desc_tag, map);
 		bus_dmamap_destroy(txq->ift_desc_tag, map);
 		txq->ift_sds.ifsd_map[i] = NULL;
 	}
 }
 
 static void
 iflib_txq_destroy(iflib_txq_t txq)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 
 	for (int i = 0; i < txq->ift_size; i++)
 		iflib_txsd_destroy(ctx, txq, i);
 	if (txq->ift_sds.ifsd_map != NULL) {
 		free(txq->ift_sds.ifsd_map, M_IFLIB);
 		txq->ift_sds.ifsd_map = NULL;
 	}
 	if (txq->ift_sds.ifsd_m != NULL) {
 		free(txq->ift_sds.ifsd_m, M_IFLIB);
 		txq->ift_sds.ifsd_m = NULL;
 	}
 	if (txq->ift_sds.ifsd_flags != NULL) {
 		free(txq->ift_sds.ifsd_flags, M_IFLIB);
 		txq->ift_sds.ifsd_flags = NULL;
 	}
 	if (txq->ift_desc_tag != NULL) {
 		bus_dma_tag_destroy(txq->ift_desc_tag);
 		txq->ift_desc_tag = NULL;
 	}
 	if (txq->ift_tso_desc_tag != NULL) {
 		bus_dma_tag_destroy(txq->ift_tso_desc_tag);
 		txq->ift_tso_desc_tag = NULL;
 	}
 }
 
 static void
 iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i)
 {
 	struct mbuf **mp;
 
 	mp = &txq->ift_sds.ifsd_m[i];
 	if (*mp == NULL)
 		return;
 
 	if (txq->ift_sds.ifsd_map != NULL) {
 		bus_dmamap_sync(txq->ift_desc_tag,
 				txq->ift_sds.ifsd_map[i],
 				BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(txq->ift_desc_tag,
 				  txq->ift_sds.ifsd_map[i]);
 	}
 	m_free(*mp);
 	DBG_COUNTER_INC(tx_frees);
 	*mp = NULL;
 }
 
 static int
 iflib_txq_setup(iflib_txq_t txq)
 {
 	if_ctx_t ctx = txq->ift_ctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	iflib_dma_info_t di;
 	int i;
 
-    /* Set number of descriptors available */
+	/* Set number of descriptors available */
 	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
+	/* XXX make configurable */
+	txq->ift_update_freq = IFLIB_DEFAULT_TX_UPDATE_FREQ;
 
 	/* Reset indices */
-	txq->ift_cidx_processed = txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0;
+	txq->ift_cidx_processed = 0;
+	txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0;
 	txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset];
 
 	for (i = 0, di = txq->ift_ifdi; i < ctx->ifc_nhwtxqs; i++, di++)
 		bzero((void *)di->idi_vaddr, di->idi_size);
 
 	IFDI_TXQ_SETUP(ctx, txq->ift_id);
 	for (i = 0, di = txq->ift_ifdi; i < ctx->ifc_nhwtxqs; i++, di++)
 		bus_dmamap_sync(di->idi_tag, di->idi_map,
 						BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	return (0);
 }
 
 /*********************************************************************
  *
  *  Allocate memory for rx_buffer structures. Since we use one
  *  rx_buffer per received packet, the maximum number of rx_buffer's
  *  that we'll need is equal to the number of receive descriptors
  *  that we've allocated.
  *
  **********************************************************************/
 static int
 iflib_rxsd_alloc(iflib_rxq_t rxq)
 {
 	if_ctx_t ctx = rxq->ifr_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	iflib_fl_t fl;
-	iflib_rxsd_t	rxsd;
 	int			err;
 
 	MPASS(scctx->isc_nrxd[0] > 0);
 	MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0);
 
 	fl = rxq->ifr_fl;
 	for (int i = 0; i <  rxq->ifr_nfl; i++, fl++) {
-		fl->ifl_sds = malloc(sizeof(struct iflib_sw_rx_desc) *
-		    scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB,
-		    M_WAITOK | M_ZERO);
-		if (fl->ifl_sds == NULL) {
-			device_printf(dev, "Unable to allocate rx sw desc memory\n");
-			return (ENOMEM);
-		}
 		fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */
 		err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
 					 1, 0,			/* alignment, bounds */
 					 BUS_SPACE_MAXADDR,	/* lowaddr */
 					 BUS_SPACE_MAXADDR,	/* highaddr */
 					 NULL, NULL,		/* filter, filterarg */
 					 sctx->isc_rx_maxsize,	/* maxsize */
 					 sctx->isc_rx_nsegments,	/* nsegments */
 					 sctx->isc_rx_maxsegsize,	/* maxsegsize */
 					 0,			/* flags */
 					 NULL,			/* lockfunc */
 					 NULL,			/* lockarg */
 					 &fl->ifl_desc_tag);
 		if (err) {
 			device_printf(dev, "%s: bus_dma_tag_create failed %d\n",
 				__func__, err);
 			goto fail;
 		}
+		if (!(fl->ifl_sds.ifsd_flags =
+		      (uint8_t *) malloc(sizeof(uint8_t) *
+					 scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
+			device_printf(dev, "Unable to allocate rx_buffer memory\n");
+			err = ENOMEM;
+			goto fail;
+		}
+		if (!(fl->ifl_sds.ifsd_m =
+		      (struct mbuf **) malloc(sizeof(struct mbuf *) *
+					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
+			device_printf(dev, "Unable to allocate rx_buffer memory\n");
+			err = ENOMEM;
+			goto fail;
+		}
+		if (!(fl->ifl_sds.ifsd_cl =
+		      (caddr_t *) malloc(sizeof(caddr_t) *
+					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
+			device_printf(dev, "Unable to allocate rx_buffer memory\n");
+			err = ENOMEM;
+			goto fail;
+		}
 
-		rxsd = fl->ifl_sds;
-		for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++, rxsd++) {
-			err = bus_dmamap_create(fl->ifl_desc_tag, 0, &rxsd->ifsd_map);
-			if (err) {
-				device_printf(dev, "%s: bus_dmamap_create failed: %d\n",
-					__func__, err);
+		/* Create the descriptor buffer dma maps */
+#if defined(ACPI_DMAR) || (! (defined(__i386__) || defined(__amd64__)))
+		if ((ctx->ifc_flags & IFC_DMAR) == 0)
+			continue;
+
+		if (!(fl->ifl_sds.ifsd_map =
+		      (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
+			device_printf(dev, "Unable to allocate rx_buffer map memory\n");
+			err = ENOMEM;
+			goto fail;
+		}
+
+		for (int j = 0; j < scctx->isc_nrxd[rxq->ifr_fl_offset]; j++) {
+			err = bus_dmamap_create(fl->ifl_desc_tag, 0, &fl->ifl_sds.ifsd_map[j]);
+			if (err != 0) {
+				device_printf(dev, "Unable to create RX buffer DMA map\n");
 				goto fail;
 			}
 		}
+#endif
 	}
 	return (0);
 
 fail:
 	iflib_rx_structures_free(ctx);
 	return (err);
 }
 
 
 /*
  * Internal service routines
  */
 
 struct rxq_refill_cb_arg {
 	int               error;
 	bus_dma_segment_t seg;
 	int               nseg;
 };
 
 static void
 _rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 {
 	struct rxq_refill_cb_arg *cb_arg = arg;
 
 	cb_arg->error = error;
 	cb_arg->seg = segs[0];
 	cb_arg->nseg = nseg;
 }
 
 
 #ifdef ACPI_DMAR
 #define IS_DMAR(ctx) (ctx->ifc_flags & IFC_DMAR)
 #else
 #define IS_DMAR(ctx) (0)
 #endif
 
 /**
  *	rxq_refill - refill an rxq  free-buffer list
  *	@ctx: the iflib context
  *	@rxq: the free-list to refill
  *	@n: the number of new buffers to allocate
  *
  *	(Re)populate an rxq free-buffer list with up to @n new packet buffers.
  *	The caller must assure that @n does not exceed the queue's capacity.
  */
 static void
 _iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count)
 {
 	struct mbuf *m;
-	int pidx = fl->ifl_pidx;
-	iflib_rxsd_t rxsd = &fl->ifl_sds[pidx];
-	caddr_t cl;
+	int idx, frag_idx = fl->ifl_fragidx;
+        int pidx = fl->ifl_pidx;
+	caddr_t cl, *sd_cl;
+	struct mbuf **sd_m;
+	uint8_t *sd_flags;
+	struct if_rxd_update iru;
+	bus_dmamap_t *sd_map;
 	int n, i = 0;
 	uint64_t bus_addr;
 	int err;
+	qidx_t credits;
 
+	sd_m = fl->ifl_sds.ifsd_m;
+	sd_map = fl->ifl_sds.ifsd_map;
+	sd_cl = fl->ifl_sds.ifsd_cl;
+	sd_flags = fl->ifl_sds.ifsd_flags;
+	idx = pidx;
+	credits = fl->ifl_credits;
+
 	n  = count;
 	MPASS(n > 0);
-	MPASS(fl->ifl_credits + n <= fl->ifl_size);
+	MPASS(credits + n <= fl->ifl_size);
 
 	if (pidx < fl->ifl_cidx)
 		MPASS(pidx + n <= fl->ifl_cidx);
-	if (pidx == fl->ifl_cidx && (fl->ifl_credits < fl->ifl_size))
+	if (pidx == fl->ifl_cidx && (credits < fl->ifl_size))
 		MPASS(fl->ifl_gen == 0);
 	if (pidx > fl->ifl_cidx)
 		MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx);
 
 	DBG_COUNTER_INC(fl_refills);
 	if (n > 8)
 		DBG_COUNTER_INC(fl_refills_large);
-
+	iru_init(&iru, fl->ifl_rxq, fl->ifl_id);
 	while (n--) {
 		/*
 		 * We allocate an uninitialized mbuf + cluster, mbuf is
 		 * initialized after rx.
 		 *
 		 * If the cluster is still set then we know a minimum sized packet was received
 		 */
-		if ((cl = rxsd->ifsd_cl) == NULL) {
-			if ((cl = rxsd->ifsd_cl = m_cljget(NULL, M_NOWAIT, fl->ifl_buf_size)) == NULL)
+		bit_ffc_at(fl->ifl_rx_bitmap, frag_idx, fl->ifl_size,  &frag_idx);
+		if ((frag_idx < 0) || (frag_idx >= fl->ifl_size))
+                	bit_ffc(fl->ifl_rx_bitmap, fl->ifl_size, &frag_idx);
+		if ((cl = sd_cl[frag_idx]) == NULL) {
+                       if ((cl = sd_cl[frag_idx] = m_cljget(NULL, M_NOWAIT, fl->ifl_buf_size)) == NULL)
 				break;
 #if MEMORY_LOGGING
 			fl->ifl_cl_enqueued++;
 #endif
 		}
 		if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
 			break;
 		}
 #if MEMORY_LOGGING
 		fl->ifl_m_enqueued++;
 #endif
 
 		DBG_COUNTER_INC(rx_allocs);
-#ifdef notyet
-		if ((rxsd->ifsd_flags & RX_SW_DESC_MAP_CREATED) == 0) {
-			int err;
-
-			if ((err = bus_dmamap_create(fl->ifl_ifdi->idi_tag, 0, &rxsd->ifsd_map))) {
-				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
-				uma_zfree(fl->ifl_zone, cl);
-				n = 0;
-				goto done;
-			}
-			rxsd->ifsd_flags |= RX_SW_DESC_MAP_CREATED;
-		}
-#endif
 #if defined(__i386__) || defined(__amd64__)
 		if (!IS_DMAR(ctx)) {
 			bus_addr = pmap_kextract((vm_offset_t)cl);
 		} else
 #endif
 		{
 			struct rxq_refill_cb_arg cb_arg;
 			iflib_rxq_t q;
 
 			cb_arg.error = 0;
 			q = fl->ifl_rxq;
-			err = bus_dmamap_load(fl->ifl_desc_tag, rxsd->ifsd_map,
+			MPASS(sd_map != NULL);
+			MPASS(sd_map[frag_idx] != NULL);
+			err = bus_dmamap_load(fl->ifl_desc_tag, sd_map[frag_idx],
 		         cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg, 0);
+			bus_dmamap_sync(fl->ifl_desc_tag, sd_map[frag_idx],
+					BUS_DMASYNC_PREREAD);
 
 			if (err != 0 || cb_arg.error) {
 				/*
 				 * !zone_pack ?
 				 */
 				if (fl->ifl_zone == zone_pack)
 					uma_zfree(fl->ifl_zone, cl);
 				m_free(m);
 				n = 0;
 				goto done;
 			}
 			bus_addr = cb_arg.seg.ds_addr;
 		}
-		rxsd->ifsd_flags |= RX_SW_DESC_INUSE;
+                bit_set(fl->ifl_rx_bitmap, frag_idx);
+		sd_flags[frag_idx] |= RX_SW_DESC_INUSE;
 
-		MPASS(rxsd->ifsd_m == NULL);
-		rxsd->ifsd_cl = cl;
-		rxsd->ifsd_m = m;
+		MPASS(sd_m[frag_idx] == NULL);
+		sd_cl[frag_idx] = cl;
+		sd_m[frag_idx] = m;
+		fl->ifl_rxd_idxs[i] = frag_idx;
 		fl->ifl_bus_addrs[i] = bus_addr;
 		fl->ifl_vm_addrs[i] = cl;
-		rxsd++;
-		fl->ifl_credits++;
+		credits++;
 		i++;
-		MPASS(fl->ifl_credits <= fl->ifl_size);
-		if (++fl->ifl_pidx == fl->ifl_size) {
-			fl->ifl_pidx = 0;
+		MPASS(credits <= fl->ifl_size);
+		if (++idx == fl->ifl_size) {
 			fl->ifl_gen = 1;
-			rxsd = fl->ifl_sds;
+			idx = 0;
 		}
 		if (n == 0 || i == IFLIB_MAX_RX_REFRESH) {
-			ctx->isc_rxd_refill(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx,
-								 fl->ifl_bus_addrs, fl->ifl_vm_addrs, i, fl->ifl_buf_size);
+			iru.iru_pidx = pidx;
+			iru.iru_count = i;
+			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
 			i = 0;
-			pidx = fl->ifl_pidx;
+			pidx = idx;
+			fl->ifl_pidx = idx;
+			fl->ifl_credits = credits;
 		}
+
 	}
 done:
+	if (i) {
+		iru.iru_pidx = pidx;
+		iru.iru_count = i;
+		ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
+		fl->ifl_pidx = idx;
+		fl->ifl_credits = credits;
+	}
 	DBG_COUNTER_INC(rxd_flush);
 	if (fl->ifl_pidx == 0)
 		pidx = fl->ifl_size - 1;
 	else
 		pidx = fl->ifl_pidx - 1;
+
+	if (sd_map)
+		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
+				BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx);
+	fl->ifl_fragidx = frag_idx;
 }
 
 static __inline void
 __iflib_fl_refill_lt(if_ctx_t ctx, iflib_fl_t fl, int max)
 {
 	/* we avoid allowing pidx to catch up with cidx as it confuses ixl */
 	int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1;
 #ifdef INVARIANTS
 	int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1;
 #endif
 
 	MPASS(fl->ifl_credits <= fl->ifl_size);
 	MPASS(reclaimable == delta);
 
 	if (reclaimable > 0)
 		_iflib_fl_refill(ctx, fl, min(max, reclaimable));
 }
 
 static void
 iflib_fl_bufs_free(iflib_fl_t fl)
 {
 	iflib_dma_info_t idi = fl->ifl_ifdi;
 	uint32_t i;
 
 	for (i = 0; i < fl->ifl_size; i++) {
-		iflib_rxsd_t d = &fl->ifl_sds[i];
+		struct mbuf **sd_m = &fl->ifl_sds.ifsd_m[i];
+		uint8_t *sd_flags = &fl->ifl_sds.ifsd_flags[i];
+		caddr_t *sd_cl = &fl->ifl_sds.ifsd_cl[i];
 
-		if (d->ifsd_flags & RX_SW_DESC_INUSE) {
-			bus_dmamap_unload(fl->ifl_desc_tag, d->ifsd_map);
-			bus_dmamap_destroy(fl->ifl_desc_tag, d->ifsd_map);
-			if (d->ifsd_m != NULL) {
-				m_init(d->ifsd_m, M_NOWAIT, MT_DATA, 0);
-				uma_zfree(zone_mbuf, d->ifsd_m);
+		if (*sd_flags & RX_SW_DESC_INUSE) {
+			if (fl->ifl_sds.ifsd_map != NULL) {
+				bus_dmamap_t sd_map = fl->ifl_sds.ifsd_map[i];
+				bus_dmamap_unload(fl->ifl_desc_tag, sd_map);
+				if (fl->ifl_rxq->ifr_ctx->ifc_in_detach)
+					bus_dmamap_destroy(fl->ifl_desc_tag, sd_map);
 			}
-			if (d->ifsd_cl != NULL)
-				uma_zfree(fl->ifl_zone, d->ifsd_cl);
-			d->ifsd_flags = 0;
+			if (*sd_m != NULL) {
+				m_init(*sd_m, M_NOWAIT, MT_DATA, 0);
+				uma_zfree(zone_mbuf, *sd_m);
+			}
+			if (*sd_cl != NULL)
+				uma_zfree(fl->ifl_zone, *sd_cl);
+			*sd_flags = 0;
 		} else {
-			MPASS(d->ifsd_cl == NULL);
-			MPASS(d->ifsd_m == NULL);
+			MPASS(*sd_cl == NULL);
+			MPASS(*sd_m == NULL);
 		}
 #if MEMORY_LOGGING
 		fl->ifl_m_dequeued++;
 		fl->ifl_cl_dequeued++;
 #endif
-		d->ifsd_cl = NULL;
-		d->ifsd_m = NULL;
+		*sd_cl = NULL;
+		*sd_m = NULL;
 	}
+#ifdef INVARIANTS
+	for (i = 0; i < fl->ifl_size; i++) {
+		MPASS(fl->ifl_sds.ifsd_flags[i] == 0);
+		MPASS(fl->ifl_sds.ifsd_cl[i] == NULL);
+		MPASS(fl->ifl_sds.ifsd_m[i] == NULL);
+	}
+#endif
 	/*
 	 * Reset free list values
 	 */
-	fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = 0;;
+	fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = fl->ifl_fragidx = 0;
 	bzero(idi->idi_vaddr, idi->idi_size);
 }
 
 /*********************************************************************
  *
  *  Initialize a receive ring and its buffers.
  *
  **********************************************************************/
 static int
 iflib_fl_setup(iflib_fl_t fl)
 {
 	iflib_rxq_t rxq = fl->ifl_rxq;
 	if_ctx_t ctx = rxq->ifr_ctx;
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 
+	bit_nclear(fl->ifl_rx_bitmap, 0, fl->ifl_size - 1);
 	/*
 	** Free current RX buffer structs and their mbufs
 	*/
 	iflib_fl_bufs_free(fl);
 	/* Now replenish the mbufs */
 	MPASS(fl->ifl_credits == 0);
 	/*
 	 * XXX don't set the max_frame_size to larger
 	 * than the hardware can handle
 	 */
 	if (sctx->isc_max_frame_size <= 2048)
 		fl->ifl_buf_size = MCLBYTES;
+#ifndef CONTIGMALLOC_WORKS
+	else
+		fl->ifl_buf_size = MJUMPAGESIZE;
+#else
 	else if (sctx->isc_max_frame_size <= 4096)
 		fl->ifl_buf_size = MJUMPAGESIZE;
 	else if (sctx->isc_max_frame_size <= 9216)
 		fl->ifl_buf_size = MJUM9BYTES;
 	else
 		fl->ifl_buf_size = MJUM16BYTES;
+#endif
 	if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size)
 		ctx->ifc_max_fl_buf_size = fl->ifl_buf_size;
 	fl->ifl_cltype = m_gettype(fl->ifl_buf_size);
 	fl->ifl_zone = m_getzone(fl->ifl_buf_size);
 
 
 	/* avoid pre-allocating zillions of clusters to an idle card
 	 * potentially speeding up attach
 	 */
 	_iflib_fl_refill(ctx, fl, min(128, fl->ifl_size));
 	MPASS(min(128, fl->ifl_size) == fl->ifl_credits);
 	if (min(128, fl->ifl_size) != fl->ifl_credits)
 		return (ENOBUFS);
 	/*
 	 * handle failure
 	 */
 	MPASS(rxq != NULL);
 	MPASS(fl->ifl_ifdi != NULL);
 	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	return (0);
 }
 
 /*********************************************************************
  *
  *  Free receive ring data structures
  *
  **********************************************************************/
 static void
 iflib_rx_sds_free(iflib_rxq_t rxq)
 {
 	iflib_fl_t fl;
 	int i;
 
 	if (rxq->ifr_fl != NULL) {
 		for (i = 0; i < rxq->ifr_nfl; i++) {
 			fl = &rxq->ifr_fl[i];
 			if (fl->ifl_desc_tag != NULL) {
 				bus_dma_tag_destroy(fl->ifl_desc_tag);
 				fl->ifl_desc_tag = NULL;
 			}
+			/* Release every array iflib_rxsd_alloc() set up, ifsd_flags included. */
+			free(fl->ifl_sds.ifsd_flags, M_IFLIB);
+			free(fl->ifl_sds.ifsd_m, M_IFLIB);
+			free(fl->ifl_sds.ifsd_cl, M_IFLIB);
+			/* XXX destroy maps first */
+			free(fl->ifl_sds.ifsd_map, M_IFLIB);
+			bzero(&fl->ifl_sds, sizeof(fl->ifl_sds));
 		}
-		if (rxq->ifr_fl->ifl_sds != NULL)
-			free(rxq->ifr_fl->ifl_sds, M_IFLIB);
-
 		free(rxq->ifr_fl, M_IFLIB);
 		rxq->ifr_fl = NULL;
 		rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0;
 	}
 }
 
 /*
  * MI independent logic
  *
  */
 static void
 iflib_timer(void *arg)
 {
 	iflib_txq_t txq = arg;
 	if_ctx_t ctx = txq->ift_ctx;
-	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
+	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 
 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
 		return;
 	/*
 	** Check on the state of the TX queue(s), this
 	** can be done without the lock because its RO
 	** and the HUNG state will be static if set.
 	*/
 	IFDI_TIMER(ctx, txq->ift_id);
 	if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) &&
-		(ctx->ifc_pause_frames == 0))
+	    ((txq->ift_cleaned_prev == txq->ift_cleaned) ||
+	     (sctx->isc_pause_frames == 0)))
 		goto hung;
 
-	if (TXQ_AVAIL(txq) <= 2*scctx->isc_tx_nsegments ||
-	    ifmp_ring_is_stalled(txq->ift_br[0]))
+	if (ifmp_ring_is_stalled(txq->ift_br))
+		txq->ift_qstatus = IFLIB_QUEUE_HUNG;
+	txq->ift_cleaned_prev = txq->ift_cleaned;
+	/* handle any laggards */
+	if (txq->ift_db_pending)
 		GROUPTASK_ENQUEUE(&txq->ift_task);
 
-	ctx->ifc_pause_frames = 0;
+	sctx->isc_pause_frames = 0;
 	if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) 
 		callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu);
 	return;
 hung:
 	CTX_LOCK(ctx);
-	if_setdrvflagbits(ctx->ifc_ifp, 0, IFF_DRV_RUNNING);
+	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 	device_printf(ctx->ifc_dev,  "TX(%d) desc avail = %d, pidx = %d\n",
 				  txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx);
 
 	IFDI_WATCHDOG_RESET(ctx);
 	ctx->ifc_watchdog_events++;
-	ctx->ifc_pause_frames = 0;
 
-	iflib_init_locked(ctx);
+	ctx->ifc_flags |= IFC_DO_RESET;
+	iflib_admin_intr_deferred(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 static void
 iflib_init_locked(if_ctx_t ctx)
 {
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
+	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	if_t ifp = ctx->ifc_ifp;
 	iflib_fl_t fl;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
-	int i, j;
+	int i, j, tx_ip_csum_flags, tx_ip6_csum_flags;
 
 
 	if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 	IFDI_INTR_DISABLE(ctx);
 
+	tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP);
+	tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP);
 	/* Set hardware offload abilities */
 	if_clearhwassist(ifp);
 	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
-		if_sethwassistbits(ifp, CSUM_IP | CSUM_TCP | CSUM_UDP, 0);
+		if_sethwassistbits(ifp, tx_ip_csum_flags, 0);
 	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
-		if_sethwassistbits(ifp,  (CSUM_TCP_IPV6 | CSUM_UDP_IPV6), 0);
+		if_sethwassistbits(ifp,  tx_ip6_csum_flags, 0);
 	if (if_getcapenable(ifp) & IFCAP_TSO4)
 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
 	if (if_getcapenable(ifp) & IFCAP_TSO6)
 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
 
 	for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) {
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
-		callout_stop(&txq->ift_db_check);
 		CALLOUT_UNLOCK(txq);
 		iflib_netmap_txq_init(ctx, txq);
 	}
-	for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) {
-		iflib_netmap_rxq_init(ctx, rxq);
-	}
 #ifdef INVARIANTS
 	i = if_getdrvflags(ifp);
 #endif
 	IFDI_INIT(ctx);
 	MPASS(if_getdrvflags(ifp) == i);
 	for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) {
+		/* XXX this should really be done on a per-queue basis */
+		if (if_getcapenable(ifp) & IFCAP_NETMAP) {
+			MPASS(rxq->ifr_id == i);
+			iflib_netmap_rxq_init(ctx, rxq);
+			continue;
+		}
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
 			if (iflib_fl_setup(fl)) {
 				device_printf(ctx->ifc_dev, "freelist setup failed - check cluster settings\n");
 				goto done;
 			}
 		}
 	}
 	done:
 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
 	IFDI_INTR_ENABLE(ctx);
 	txq = ctx->ifc_txqs;
 	for (i = 0; i < sctx->isc_ntxqsets; i++, txq++)
 		callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq,
 			txq->ift_timer.c_cpu);
 }
 
 static int
 iflib_media_change(if_t ifp)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	int err;
 
 	CTX_LOCK(ctx);
 	if ((err = IFDI_MEDIA_CHANGE(ctx)) == 0)
 		iflib_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 	return (err);
 }
 
 static void
 iflib_media_status(if_t ifp, struct ifmediareq *ifmr)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 
 	CTX_LOCK(ctx);
 	IFDI_UPDATE_ADMIN_STATUS(ctx);
 	IFDI_MEDIA_STATUS(ctx, ifmr);
 	CTX_UNLOCK(ctx);
 }
 
 static void
 iflib_stop(if_ctx_t ctx)
 {
 	iflib_txq_t txq = ctx->ifc_txqs;
 	iflib_rxq_t rxq = ctx->ifc_rxqs;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	iflib_dma_info_t di;
 	iflib_fl_t fl;
 	int i, j;
 
 	/* Tell the stack that the interface is no longer active */
 	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 
 	IFDI_INTR_DISABLE(ctx);
-	msleep(ctx, &ctx->ifc_mtx, PUSER, "iflib_init", hz);
+	DELAY(1000);
+	IFDI_STOP(ctx);
+	DELAY(1000);
 
+	iflib_debug_reset();
 	/* Wait for current tx queue users to exit to disarm watchdog timer. */
 	for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) {
 		/* make sure all transmitters have completed before proceeding XXX */
 
+		CALLOUT_LOCK(txq);
+		callout_stop(&txq->ift_timer);
+		CALLOUT_UNLOCK(txq);
+
 		/* clean any enqueued buffers */
-		iflib_txq_check_drain(txq, 0);
+		iflib_ifmp_purge(txq);
 		/* Free any existing tx buffers. */
 		for (j = 0; j < txq->ift_size; j++) {
 			iflib_txsd_free(ctx, txq, j);
 		}
 		txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0;
-		txq->ift_in_use = txq->ift_cidx = txq->ift_pidx = txq->ift_no_desc_avail = 0;
+		txq->ift_in_use = txq->ift_gen = txq->ift_cidx = txq->ift_pidx = txq->ift_no_desc_avail = 0;
 		txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0;
 		txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0;
 		txq->ift_pullups = 0;
-		ifmp_ring_reset_stats(txq->ift_br[0]);
+		ifmp_ring_reset_stats(txq->ift_br);
 		for (j = 0, di = txq->ift_ifdi; j < ctx->ifc_nhwtxqs; j++, di++)
 			bzero((void *)di->idi_vaddr, di->idi_size);
 	}
 	for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) {
 		/* make sure all transmitters have completed before proceeding XXX */
 
-		for (j = 0, di = txq->ift_ifdi; j < ctx->ifc_nhwrxqs; j++, di++)
+		for (j = 0, di = rxq->ifr_ifdi; j < rxq->ifr_nfl; j++, di++)
 			bzero((void *)di->idi_vaddr, di->idi_size);
 		/* also resets the free lists pidx/cidx */
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
 			iflib_fl_bufs_free(fl);
 	}
-	IFDI_STOP(ctx);
 }
 
-static iflib_rxsd_t
-rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int *cltype, int unload)
+static inline caddr_t
+calc_next_rxd(iflib_fl_t fl, int cidx)
 {
+	qidx_t size;
+	int nrxd;
+	caddr_t start, end, cur, next;
+
+	nrxd = fl->ifl_size;
+	size = fl->ifl_rxd_size;
+	start = fl->ifl_ifdi->idi_vaddr;
+
+	if (__predict_false(size == 0))
+		return (start);
+	cur = start + size*cidx;
+	end = start + size*nrxd;
+	next = CACHE_PTR_NEXT(cur);
+	return (next < end ? next : start);
+}
+
+static inline void
+prefetch_pkts(iflib_fl_t fl, int cidx)
+{
+	int nextptr;
+	int nrxd = fl->ifl_size;
+	caddr_t next_rxd;
+
+
+	nextptr = (cidx + CACHE_PTR_INCREMENT) & (nrxd-1);
+	prefetch(&fl->ifl_sds.ifsd_m[nextptr]);
+	prefetch(&fl->ifl_sds.ifsd_cl[nextptr]);
+	next_rxd = calc_next_rxd(fl, cidx);
+	prefetch(next_rxd);
+	prefetch(fl->ifl_sds.ifsd_m[(cidx + 1) & (nrxd-1)]);
+	prefetch(fl->ifl_sds.ifsd_m[(cidx + 2) & (nrxd-1)]);
+	prefetch(fl->ifl_sds.ifsd_m[(cidx + 3) & (nrxd-1)]);
+	prefetch(fl->ifl_sds.ifsd_m[(cidx + 4) & (nrxd-1)]);
+	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 1) & (nrxd-1)]);
+	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 2) & (nrxd-1)]);
+	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 3) & (nrxd-1)]);
+	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]);
+}
+
+static void
+rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int unload, if_rxsd_t sd)
+{
 	int flid, cidx;
-	iflib_rxsd_t sd;
+	bus_dmamap_t map;
 	iflib_fl_t fl;
 	iflib_dma_info_t di;
+	int next;
 
+	map = NULL;
 	flid = irf->irf_flid;
 	cidx = irf->irf_idx;
 	fl = &rxq->ifr_fl[flid];
+	sd->ifsd_fl = fl;
+	sd->ifsd_cidx = cidx;
+	sd->ifsd_m = &fl->ifl_sds.ifsd_m[cidx];
+	sd->ifsd_cl = &fl->ifl_sds.ifsd_cl[cidx];
 	fl->ifl_credits--;
 #if MEMORY_LOGGING
 	fl->ifl_m_dequeued++;
-	if (cltype)
-		fl->ifl_cl_dequeued++;
 #endif
-	sd = &fl->ifl_sds[cidx];
-	di = fl->ifl_ifdi;
-	bus_dmamap_sync(di->idi_tag, di->idi_map,
-			BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
+	if (rxq->ifr_ctx->ifc_flags & IFC_PREFETCH)
+		prefetch_pkts(fl, cidx);
+	if (fl->ifl_sds.ifsd_map != NULL) {
+		next = (cidx + CACHE_PTR_INCREMENT) & (fl->ifl_size-1);
+		prefetch(&fl->ifl_sds.ifsd_map[next]);
+		map = fl->ifl_sds.ifsd_map[cidx];
+		di = fl->ifl_ifdi;
+		next = (cidx + CACHE_LINE_SIZE) & (fl->ifl_size-1);
+		prefetch(&fl->ifl_sds.ifsd_flags[next]);
+		bus_dmamap_sync(di->idi_tag, di->idi_map,
+				BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
 	/* not valid assert if bxe really does SGE from non-contiguous elements */
-	MPASS(fl->ifl_cidx == cidx);
-	if (unload)
-		bus_dmamap_unload(fl->ifl_desc_tag, sd->ifsd_map);
-
-	if (__predict_false(++fl->ifl_cidx == fl->ifl_size)) {
-		fl->ifl_cidx = 0;
-		fl->ifl_gen = 0;
+		MPASS(fl->ifl_cidx == cidx);
+		if (unload)
+			bus_dmamap_unload(fl->ifl_desc_tag, map);
 	}
-	/* YES ick */
-	if (cltype)
-		*cltype = fl->ifl_cltype;
-	return (sd);
+	fl->ifl_cidx = (fl->ifl_cidx + 1) & (fl->ifl_size-1);
+	if (__predict_false(fl->ifl_cidx == 0))
+		fl->ifl_gen = 0;
+	if (map != NULL)
+		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
+			BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+        bit_clear(fl->ifl_rx_bitmap, cidx);
 }
 
 static struct mbuf *
-assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri)
+assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri, if_rxsd_t sd)
 {
-	int i, padlen , flags, cltype;
+	int i, padlen , flags;
 	struct mbuf *m, *mh, *mt;
-	iflib_rxsd_t sd;
 	caddr_t cl;
 
 	i = 0;
 	mh = NULL;
 	do {
-		sd = rxd_frag_to_sd(rxq, &ri->iri_frags[i], &cltype, TRUE);
+		rxd_frag_to_sd(rxq, &ri->iri_frags[i], TRUE, sd);
 
-		MPASS(sd->ifsd_cl != NULL);
-		MPASS(sd->ifsd_m != NULL);
+		MPASS(*sd->ifsd_cl != NULL);
+		MPASS(*sd->ifsd_m != NULL);
 
 		/* Don't include zero-length frags */
 		if (ri->iri_frags[i].irf_len == 0) {
 			/* XXX we can save the cluster here, but not the mbuf */
-			m_init(sd->ifsd_m, M_NOWAIT, MT_DATA, 0);
-			m_free(sd->ifsd_m);
-			sd->ifsd_m = NULL;
+			m_init(*sd->ifsd_m, M_NOWAIT, MT_DATA, 0);
+			m_free(*sd->ifsd_m);
+			*sd->ifsd_m = NULL;
 			continue;
 		}
-
-		m = sd->ifsd_m;
+		m = *sd->ifsd_m;
+		*sd->ifsd_m = NULL;
 		if (mh == NULL) {
 			flags = M_PKTHDR|M_EXT;
 			mh = mt = m;
 			padlen = ri->iri_pad;
 		} else {
 			flags = M_EXT;
 			mt->m_next = m;
 			mt = m;
 			/* assuming padding is only on the first fragment */
 			padlen = 0;
 		}
-		sd->ifsd_m = NULL;
-		cl = sd->ifsd_cl;
-		sd->ifsd_cl = NULL;
+		cl = *sd->ifsd_cl;
+		*sd->ifsd_cl = NULL;
 
 		/* Can these two be made one ? */
 		m_init(m, M_NOWAIT, MT_DATA, flags);
-		m_cljset(m, cl, cltype);
+		m_cljset(m, cl, sd->ifsd_fl->ifl_cltype);
 		/*
 		 * These must follow m_init and m_cljset
 		 */
 		m->m_data += padlen;
 		ri->iri_len -= padlen;
 		m->m_len = ri->iri_frags[i].irf_len;
 	} while (++i < ri->iri_nfrags);
 
 	return (mh);
 }
 
 /*
  * Process one software descriptor
  */
 static struct mbuf *
 iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri)
 {
+	struct if_rxsd sd;
 	struct mbuf *m;
-	iflib_rxsd_t sd;
 
 	/* should I merge this back in now that the two paths are basically duplicated? */
 	if (ri->iri_nfrags == 1 &&
 	    ri->iri_frags[0].irf_len <= MIN(IFLIB_RX_COPY_THRESH, MHLEN)) {
-		sd = rxd_frag_to_sd(rxq, &ri->iri_frags[0], NULL, FALSE);
-		m = sd->ifsd_m;
-		sd->ifsd_m = NULL;
+		rxd_frag_to_sd(rxq, &ri->iri_frags[0], FALSE, &sd);
+		m = *sd.ifsd_m;
+		*sd.ifsd_m = NULL;
 		m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);
-		memcpy(m->m_data, sd->ifsd_cl, ri->iri_len);
+#ifndef __NO_STRICT_ALIGNMENT
+		if (!IP_ALIGNED(m))
+			m->m_data += 2;
+#endif
+		memcpy(m->m_data, *sd.ifsd_cl, ri->iri_len);
 		m->m_len = ri->iri_frags[0].irf_len;
        } else {
-		m = assemble_segments(rxq, ri);
+		m = assemble_segments(rxq, ri, &sd);
 	}
 	m->m_pkthdr.len = ri->iri_len;
 	m->m_pkthdr.rcvif = ri->iri_ifp;
 	m->m_flags |= ri->iri_flags;
 	m->m_pkthdr.ether_vtag = ri->iri_vtag;
 	m->m_pkthdr.flowid = ri->iri_flowid;
 	M_HASHTYPE_SET(m, ri->iri_rsstype);
 	m->m_pkthdr.csum_flags = ri->iri_csum_flags;
 	m->m_pkthdr.csum_data = ri->iri_csum_data;
 	return (m);
 }
 
+#if defined(INET6) || defined(INET)
+static void
+iflib_get_ip_forwarding(struct lro_ctrl *lc, bool *v4, bool *v6)
+{
+	CURVNET_SET(lc->ifp->if_vnet);
+#if defined(INET6)
+	*v6 = VNET(ip6_forwarding);
+#endif
+#if defined(INET)
+	*v4 = VNET(ipforwarding);
+#endif
+	CURVNET_RESTORE();
+}
+
+/*
+ * Returns true if it's possible this packet could be LROed.
+ * If it returns false, it is guaranteed that tcp_lro_rx()
+ * would not return zero.
+ */
 static bool
-iflib_rxeof(iflib_rxq_t rxq, int budget)
+iflib_check_lro_possible(struct mbuf *m, bool v4_forwarding, bool v6_forwarding)
 {
+	struct ether_header *eh;
+	uint16_t eh_type;
+
+	eh = mtod(m, struct ether_header *);
+	eh_type = ntohs(eh->ether_type);
+	switch (eh_type) {
+#if defined(INET6)
+		case ETHERTYPE_IPV6:
+			return !v6_forwarding;
+#endif
+#if defined (INET)
+		case ETHERTYPE_IP:
+			return !v4_forwarding;
+#endif
+	}
+
+	return false;
+}
+#else
+static void
+iflib_get_ip_forwarding(struct lro_ctrl *lc __unused, bool *v4 __unused, bool *v6 __unused)
+{
+}
+#endif
+
+static bool
+iflib_rxeof(iflib_rxq_t rxq, qidx_t budget)
+{
 	if_ctx_t ctx = rxq->ifr_ctx;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	int avail, i;
-	uint16_t *cidxp;
+	qidx_t *cidxp;
 	struct if_rxd_info ri;
 	int err, budget_left, rx_bytes, rx_pkts;
 	iflib_fl_t fl;
 	struct ifnet *ifp;
 	int lro_enabled;
+	bool lro_possible = false;
+	bool v4_forwarding, v6_forwarding;
+
 	/*
 	 * XXX early demux data packets so that if_input processing only handles
 	 * acks in interrupt context
 	 */
-	struct mbuf *m, *mh, *mt;
+	struct mbuf *m, *mh, *mt, *mf;
 
-	if (netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &budget)) {
-		return (FALSE);
-	}
-
+	ifp = ctx->ifc_ifp;
 	mh = mt = NULL;
 	MPASS(budget > 0);
 	rx_pkts	= rx_bytes = 0;
 	if (sctx->isc_flags & IFLIB_HAS_RXCQ)
 		cidxp = &rxq->ifr_cq_cidx;
 	else
 		cidxp = &rxq->ifr_fl[0].ifl_cidx;
 	if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) {
 		for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
 			__iflib_fl_refill_lt(ctx, fl, budget + 8);
 		DBG_COUNTER_INC(rx_unavail);
 		return (false);
 	}
 
 	for (budget_left = budget; (budget_left > 0) && (avail > 0); budget_left--, avail--) {
 		if (__predict_false(!CTX_ACTIVE(ctx))) {
 			DBG_COUNTER_INC(rx_ctx_inactive);
 			break;
 		}
 		/*
 		 * Reset client set fields to their default values
 		 */
-		bzero(&ri, sizeof(ri));
+		rxd_info_zero(&ri);
 		ri.iri_qsidx = rxq->ifr_id;
 		ri.iri_cidx = *cidxp;
-		ri.iri_ifp = ctx->ifc_ifp;
+		ri.iri_ifp = ifp;
 		ri.iri_frags = rxq->ifr_frags;
 		err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
 
-		/* in lieu of handling correctly - make sure it isn't being unhandled */
-		MPASS(err == 0);
+		if (err)
+			goto err;
 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 			*cidxp = ri.iri_cidx;
 			/* Update our consumer index */
+			/* XXX NB: shurd - check if this is still safe */
 			while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0]) {
 				rxq->ifr_cq_cidx -= scctx->isc_nrxd[0];
 				rxq->ifr_cq_gen = 0;
 			}
 			/* was this only a completion queue message? */
 			if (__predict_false(ri.iri_nfrags == 0))
 				continue;
 		}
 		MPASS(ri.iri_nfrags != 0);
 		MPASS(ri.iri_len != 0);
 
 		/* will advance the cidx on the corresponding free lists */
 		m = iflib_rxd_pkt_get(rxq, &ri);
 		if (avail == 0 && budget_left)
 			avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left);
 
 		if (__predict_false(m == NULL)) {
 			DBG_COUNTER_INC(rx_mbuf_null);
 			continue;
 		}
 		/* imm_pkt: -- cxgb */
 		if (mh == NULL)
 			mh = mt = m;
 		else {
 			mt->m_nextpkt = m;
 			mt = m;
 		}
 	}
 	/* make sure that we can refill faster than drain */
 	for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
 		__iflib_fl_refill_lt(ctx, fl, budget + 8);
 
-	ifp = ctx->ifc_ifp;
 	lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO);
+	if (lro_enabled)
+		iflib_get_ip_forwarding(&rxq->ifr_lc, &v4_forwarding, &v6_forwarding);
+	mt = mf = NULL;
 	while (mh != NULL) {
 		m = mh;
 		mh = mh->m_nextpkt;
 		m->m_nextpkt = NULL;
+#ifndef __NO_STRICT_ALIGNMENT
+		if (!IP_ALIGNED(m) && (m = iflib_fixup_rx(m)) == NULL)
+			continue;
+#endif
 		rx_bytes += m->m_pkthdr.len;
 		rx_pkts++;
 #if defined(INET6) || defined(INET)
-		if (lro_enabled && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0)
-			continue;
+		if (lro_enabled) {
+			if (!lro_possible) {
+				lro_possible = iflib_check_lro_possible(m, v4_forwarding, v6_forwarding);
+				if (lro_possible && mf != NULL) {
+					ifp->if_input(ifp, mf);
+					DBG_COUNTER_INC(rx_if_input);
+					mt = mf = NULL;
+				}
+			}
+			if ((m->m_pkthdr.csum_flags & (CSUM_L4_CALC|CSUM_L4_VALID)) ==
+			    (CSUM_L4_CALC|CSUM_L4_VALID)) {
+				if (lro_possible && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0)
+					continue;
+			}
+		}
 #endif
+		if (lro_possible) {
+			ifp->if_input(ifp, m);
+			DBG_COUNTER_INC(rx_if_input);
+			continue;
+		}
+
+		if (mf == NULL)
+			mf = m;
+		if (mt != NULL)
+			mt->m_nextpkt = m;
+		mt = m;
+	}
+	if (mf != NULL) {
+		ifp->if_input(ifp, mf);
 		DBG_COUNTER_INC(rx_if_input);
-		ifp->if_input(ifp, m);
 	}
 
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
 
 	/*
 	 * Flush any outstanding LRO work
 	 */
 #if defined(INET6) || defined(INET)
 	tcp_lro_flush_all(&rxq->ifr_lc);
 #endif
 	if (avail)
 		return true;
 	return (iflib_rxd_avail(ctx, rxq, *cidxp, 1));
+err:
+	CTX_LOCK(ctx);
+	ctx->ifc_flags |= IFC_DO_RESET;
+	iflib_admin_intr_deferred(ctx);
+	CTX_UNLOCK(ctx);
+	return (false);
 }
 
+#define TXD_NOTIFY_COUNT(txq) (((txq)->ift_size / (txq)->ift_update_freq)-1)
+static inline qidx_t
+txq_max_db_deferred(iflib_txq_t txq, qidx_t in_use)
+{
+	qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
+	qidx_t minthresh = txq->ift_size / 8;
+	if (in_use > 4*minthresh)
+		return (notify_count);
+	if (in_use > 2*minthresh)
+		return (notify_count >> 1);
+	if (in_use > minthresh)
+		return (notify_count >> 3);
+	return (0);
+}
+
+static inline qidx_t
+txq_max_rs_deferred(iflib_txq_t txq)
+{
+	qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
+	qidx_t minthresh = txq->ift_size / 8;
+	if (txq->ift_in_use > 4*minthresh)
+		return (notify_count);
+	if (txq->ift_in_use > 2*minthresh)
+		return (notify_count >> 1);
+	if (txq->ift_in_use > minthresh)
+		return (notify_count >> 2);
+	return (2);
+}
+
 #define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags)
 #define M_HAS_VLANTAG(m) (m->m_flags & M_VLANTAG)
-#define TXQ_MAX_DB_DEFERRED(size) (size >> 5)
+
+#define TXQ_MAX_DB_DEFERRED(txq, in_use) txq_max_db_deferred((txq), (in_use))
+#define TXQ_MAX_RS_DEFERRED(txq) txq_max_rs_deferred(txq)
 #define TXQ_MAX_DB_CONSUMED(size) (size >> 4)
 
-static __inline void
-iflib_txd_db_check(if_ctx_t ctx, iflib_txq_t txq, int ring)
-{
-	uint32_t dbval;
+/* forward compatibility for cxgb */
+#define FIRST_QSET(ctx) 0
+#define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets)
+#define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets)
+#define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
+#define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))
 
-	if (ring || txq->ift_db_pending >=
-	    TXQ_MAX_DB_DEFERRED(txq->ift_size)) {
+/* XXX we should be setting this to something other than zero */
+#define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
+#define MAX_TX_DESC(ctx) ((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max)
 
-		/* the lock will only ever be contended in the !min_latency case */
-		if (!TXDB_TRYLOCK(txq))
-			return;
+static inline bool
+iflib_txd_db_check(if_ctx_t ctx, iflib_txq_t txq, int ring, qidx_t in_use)
+{
+	qidx_t dbval, max;
+	bool rang;
+
+	rang = false;
+	max = TXQ_MAX_DB_DEFERRED(txq, in_use);
+	if (ring || txq->ift_db_pending >= max) {
 		dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx;
 		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval);
 		txq->ift_db_pending = txq->ift_npending = 0;
-		TXDB_UNLOCK(txq);
+		rang = true;
 	}
+	return (rang);
 }
 
-static void
-iflib_txd_deferred_db_check(void * arg)
-{
-	iflib_txq_t txq = arg;
-
-	/* simple non-zero boolean so use bitwise OR */
-	if ((txq->ift_db_pending | txq->ift_npending) &&
-	    txq->ift_db_pending >= txq->ift_db_pending_queued)
-		iflib_txd_db_check(txq->ift_ctx, txq, TRUE);
-	txq->ift_db_pending_queued = 0;
-	if (ifmp_ring_is_stalled(txq->ift_br[0]))
-		iflib_txq_check_drain(txq, 4);
-}
-
 #ifdef PKT_DEBUG
 static void
 print_pkt(if_pkt_info_t pi)
 {
 	printf("pi len:  %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n",
 	       pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs, pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx);
 	printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n",
 	       pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz, pi->ipi_mflags, pi->ipi_vtag);
 	printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n",
 	       pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto);
 }
 #endif
 
 #define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO)
 #define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO)
 
 static int
 iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
 {
+	if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx;
 	struct ether_vlan_header *eh;
 	struct mbuf *m, *n;
 
 	n = m = *mp;
+	if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) &&
+	    M_WRITABLE(m) == 0) {
+		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
+			return (ENOMEM);
+		} else {
+			m_freem(*mp);
+			n = *mp = m;
+		}
+	}
+
 	/*
 	 * Determine where frame payload starts.
 	 * Jump over vlan headers if already present,
 	 * helpful for QinQ too.
 	 */
 	if (__predict_false(m->m_len < sizeof(*eh))) {
 		txq->ift_pullups++;
 		if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL))
 			return (ENOMEM);
 	}
 	eh = mtod(m, struct ether_vlan_header *);
 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 		pi->ipi_etype = ntohs(eh->evl_proto);
 		pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 	} else {
 		pi->ipi_etype = ntohs(eh->evl_encap_proto);
 		pi->ipi_ehdrlen = ETHER_HDR_LEN;
 	}
 
 	switch (pi->ipi_etype) {
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		struct ip *ip = NULL;
 		struct tcphdr *th = NULL;
 		int minthlen;
 
 		minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th));
 		if (__predict_false(m->m_len < minthlen)) {
 			/*
 			 * if this code bloat is causing too much of a hit
 			 * move it to a separate function and mark it noinline
 			 */
 			if (m->m_len == pi->ipi_ehdrlen) {
 				n = m->m_next;
 				MPASS(n);
 				if (n->m_len >= sizeof(*ip))  {
 					ip = (struct ip *)n->m_data;
 					if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th))
 						th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 				} else {
 					txq->ift_pullups++;
 					if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
 						return (ENOMEM);
 					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 				}
 			} else {
 				txq->ift_pullups++;
 				if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
 					return (ENOMEM);
 				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 				if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
 					th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 			}
 		} else {
 			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 			if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
 				th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 		}
 		pi->ipi_ip_hlen = ip->ip_hl << 2;
 		pi->ipi_ipproto = ip->ip_p;
 		pi->ipi_flags |= IPI_TX_IPV4;
 
-		if (pi->ipi_csum_flags & CSUM_IP)
+		if ((sctx->isc_flags & IFLIB_NEED_ZERO_CSUM) && (pi->ipi_csum_flags & CSUM_IP))
                        ip->ip_sum = 0;
 
-		if (pi->ipi_ipproto == IPPROTO_TCP) {
-			if (__predict_false(th == NULL)) {
-				txq->ift_pullups++;
-				if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL))
-					return (ENOMEM);
-				th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen);
-			}
-			pi->ipi_tcp_hflags = th->th_flags;
-			pi->ipi_tcp_hlen = th->th_off << 2;
-			pi->ipi_tcp_seq = th->th_seq;
-		}
 		if (IS_TSO4(pi)) {
+			if (pi->ipi_ipproto == IPPROTO_TCP) {
+				if (__predict_false(th == NULL)) {
+					txq->ift_pullups++;
+					if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL))
+						return (ENOMEM);
+					th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen);
+				}
+				pi->ipi_tcp_hflags = th->th_flags;
+				pi->ipi_tcp_hlen = th->th_off << 2;
+				pi->ipi_tcp_seq = th->th_seq;
+			}
 			if (__predict_false(ip->ip_p != IPPROTO_TCP))
 				return (ENXIO);
 			th->th_sum = in_pseudo(ip->ip_src.s_addr,
 					       ip->ip_dst.s_addr, htons(IPPROTO_TCP));
 			pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
+			if (sctx->isc_flags & IFLIB_TSO_INIT_IP) {
+				ip->ip_sum = 0;
+				ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz);
+			}
 		}
 		break;
 	}
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 	{
 		struct ip6_hdr *ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
 		struct tcphdr *th;
 		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
 
 		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
 			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
 				return (ENOMEM);
 		}
 		th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
 
 		/* XXX-BZ this will go badly in case of ext hdrs. */
 		pi->ipi_ipproto = ip6->ip6_nxt;
 		pi->ipi_flags |= IPI_TX_IPV6;
 
-		if (pi->ipi_ipproto == IPPROTO_TCP) {
-			if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) {
-				if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL))
-					return (ENOMEM);
-			}
-			pi->ipi_tcp_hflags = th->th_flags;
-			pi->ipi_tcp_hlen = th->th_off << 2;
-		}
 		if (IS_TSO6(pi)) {
+			if (pi->ipi_ipproto == IPPROTO_TCP) {
+				if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) {
+					if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL))
+						return (ENOMEM);
+				}
+				pi->ipi_tcp_hflags = th->th_flags;
+				pi->ipi_tcp_hlen = th->th_off << 2;
+			}
 
 			if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP))
 				return (ENXIO);
 			/*
 			 * The corresponding flag is set by the stack in the IPv4
 			 * TSO case, but not in IPv6 (at least in FreeBSD 10.2).
 			 * So, set it here because the rest of the flow requires it.
 			 */
 			pi->ipi_csum_flags |= CSUM_TCP_IPV6;
 			th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
 			pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
 		}
 		break;
 	}
 #endif
 	default:
 		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
 		pi->ipi_ip_hlen = 0;
 		break;
 	}
 	*mp = m;
+
 	return (0);
 }
 
-
 static  __noinline  struct mbuf *
 collapse_pkthdr(struct mbuf *m0)
 {
 	struct mbuf *m, *m_next, *tmp;
 
 	m = m0;
 	m_next = m->m_next;
 	while (m_next != NULL && m_next->m_len == 0) {
 		m = m_next;
 		m->m_next = NULL;
 		m_free(m);
 		m_next = m_next->m_next;
 	}
 	m = m0;
 	m->m_next = m_next;
 	if ((m_next->m_flags & M_EXT) == 0) {
 		m = m_defrag(m, M_NOWAIT);
 	} else {
 		tmp = m_next->m_next;
 		memcpy(m_next, m, MPKTHSIZE);
 		m = m_next;
 		m->m_next = tmp;
 	}
 	return (m);
 }
 
 /*
  * If dodgy hardware rejects the scatter gather chain we've handed it
  * we'll need to remove the mbuf chain from ifsg_m[] before we can add the
  * m_defrag'd mbufs
  */
 static __noinline struct mbuf *
 iflib_remove_mbuf(iflib_txq_t txq)
 {
 	int ntxd, i, pidx;
 	struct mbuf *m, *mh, **ifsd_m;
 
 	pidx = txq->ift_pidx;
 	ifsd_m = txq->ift_sds.ifsd_m;
 	ntxd = txq->ift_size;
 	mh = m = ifsd_m[pidx];
 	ifsd_m[pidx] = NULL;
 #if MEMORY_LOGGING
 	txq->ift_dequeued++;
 #endif
 	i = 1;
 
 	while (m) {
 		ifsd_m[(pidx + i) & (ntxd -1)] = NULL;
 #if MEMORY_LOGGING
 		txq->ift_dequeued++;
 #endif
 		m = m->m_next;
 		i++;
 	}
 	return (mh);
 }
 
 static int
 iflib_busdma_load_mbuf_sg(iflib_txq_t txq, bus_dma_tag_t tag, bus_dmamap_t map,
 			  struct mbuf **m0, bus_dma_segment_t *segs, int *nsegs,
 			  int max_segs, int flags)
 {
 	if_ctx_t ctx;
 	if_shared_ctx_t		sctx;
 	if_softc_ctx_t		scctx;
-	int i, next, pidx, mask, err, maxsegsz, ntxd, count;
-	struct mbuf *m, *tmp, **ifsd_m, **mp;
+	int i, next, pidx, err, ntxd, count;
+	struct mbuf *m, *tmp, **ifsd_m;
 
 	m = *m0;
 
 	/*
 	 * Please don't ever do this
 	 */
 	if (__predict_false(m->m_len == 0))
 		*m0 = m = collapse_pkthdr(m);
 
 	ctx = txq->ift_ctx;
 	sctx = ctx->ifc_sctx;
 	scctx = &ctx->ifc_softc_ctx;
 	ifsd_m = txq->ift_sds.ifsd_m;
 	ntxd = txq->ift_size;
 	pidx = txq->ift_pidx;
 	if (map != NULL) {
 		uint8_t *ifsd_flags = txq->ift_sds.ifsd_flags;
 
 		err = bus_dmamap_load_mbuf_sg(tag, map,
 					      *m0, segs, nsegs, BUS_DMA_NOWAIT);
 		if (err)
 			return (err);
 		ifsd_flags[pidx] |= TX_SW_DESC_MAPPED;
-		i = 0;
-		next = pidx;
-		mask = (txq->ift_size-1);
+		count = 0;
 		m = *m0;
 		do {
-			mp = &ifsd_m[next];
-			*mp = m;
+			if (__predict_false(m->m_len <= 0)) {
+				tmp = m;
+				m = m->m_next;
+				tmp->m_next = NULL;
+				m_free(tmp);
+				continue;
+			}
 			m = m->m_next;
-			if (__predict_false((*mp)->m_len == 0)) {
-				m_free(*mp);
-				*mp = NULL;
-			} else
-				next = (pidx + i) & (ntxd-1);
+			count++;
 		} while (m != NULL);
+		if (count > *nsegs) {
+			ifsd_m[pidx] = *m0;
+			ifsd_m[pidx]->m_flags |= M_TOOBIG;
+			return (0);
+		}
+		m = *m0;
+		count = 0;
+		do {
+			next = (pidx + count) & (ntxd-1);
+			MPASS(ifsd_m[next] == NULL);
+			ifsd_m[next] = m;
+			count++;
+			tmp = m;
+			m = m->m_next;
+		} while (m != NULL);
 	} else {
-		int buflen, sgsize, max_sgsize;
+		int buflen, sgsize, maxsegsz, max_sgsize;
 		vm_offset_t vaddr;
 		vm_paddr_t curaddr;
 
 		count = i = 0;
-		maxsegsz = sctx->isc_tx_maxsize;
 		m = *m0;
+		if (m->m_pkthdr.csum_flags & CSUM_TSO)
+			maxsegsz = scctx->isc_tx_tso_segsize_max;
+		else
+			maxsegsz = sctx->isc_tx_maxsegsize;
+
 		do {
 			if (__predict_false(m->m_len <= 0)) {
 				tmp = m;
 				m = m->m_next;
 				tmp->m_next = NULL;
 				m_free(tmp);
 				continue;
 			}
 			buflen = m->m_len;
 			vaddr = (vm_offset_t)m->m_data;
 			/*
 			 * see if we can't be smarter about physically
 			 * contiguous mappings
 			 */
 			next = (pidx + count) & (ntxd-1);
 			MPASS(ifsd_m[next] == NULL);
 #if MEMORY_LOGGING
 			txq->ift_enqueued++;
 #endif
 			ifsd_m[next] = m;
 			while (buflen > 0) {
+				if (i >= max_segs)
+					goto err;
 				max_sgsize = MIN(buflen, maxsegsz);
 				curaddr = pmap_kextract(vaddr);
 				sgsize = PAGE_SIZE - (curaddr & PAGE_MASK);
 				sgsize = MIN(sgsize, max_sgsize);
 				segs[i].ds_addr = curaddr;
 				segs[i].ds_len = sgsize;
 				vaddr += sgsize;
 				buflen -= sgsize;
 				i++;
-				if (i >= max_segs)
-					goto err;
 			}
 			count++;
 			tmp = m;
 			m = m->m_next;
 		} while (m != NULL);
 		*nsegs = i;
 	}
 	return (0);
 err:
 	*m0 = iflib_remove_mbuf(txq);
 	return (EFBIG);
 }
 
+static inline caddr_t
+calc_next_txd(iflib_txq_t txq, int cidx, uint8_t qid)
+{
+	qidx_t size;
+	int ntxd;
+	caddr_t start, end, cur, next;
+
+	ntxd = txq->ift_size;
+	size = txq->ift_txd_size[qid];
+	start = txq->ift_ifdi[qid].idi_vaddr;
+
+	if (__predict_false(size == 0))
+		return (start);
+	cur = start + size*cidx;
+	end = start + size*ntxd;
+	next = CACHE_PTR_NEXT(cur);
+	return (next < end ? next : start);
+}
+
+/*
+ * Pad an mbuf to ensure a minimum ethernet frame size.
+ * min_frame_size is the frame size (less CRC) to pad the mbuf to
+ */
+static __noinline int
+iflib_ether_pad(device_t dev, struct mbuf **m_head, uint16_t min_frame_size)
+{
+	/*
+	 * 18 is enough bytes to pad an ARP packet to 46 bytes, and
+	 * an ARP message is the smallest common payload I can think of
+	 */
+	static char pad[18];	/* just zeros */
+	int n;
+	struct mbuf *new_head;
+
+	if (!M_WRITABLE(*m_head)) {
+		new_head = m_dup(*m_head, M_NOWAIT);
+		if (new_head == NULL) {
+			m_freem(*m_head);
+			device_printf(dev, "cannot pad short frame, m_dup() failed");
+			DBG_COUNTER_INC(encap_pad_mbuf_fail);
+			return ENOMEM;
+		}
+		m_freem(*m_head);
+		*m_head = new_head;
+	}
+
+	for (n = min_frame_size - (*m_head)->m_pkthdr.len;
+	     n > 0; n -= sizeof(pad))
+		if (!m_append(*m_head, min(n, sizeof(pad)), pad))
+			break;
+
+	if (n > 0) {
+		m_freem(*m_head);
+		device_printf(dev, "cannot pad short frame\n");
+		DBG_COUNTER_INC(encap_pad_mbuf_fail);
+		return (ENOBUFS);
+	}
+
+	return 0;
+}
+
 static int
 iflib_encap(iflib_txq_t txq, struct mbuf **m_headp)
 {
 	if_ctx_t		ctx;
 	if_shared_ctx_t		sctx;
 	if_softc_ctx_t		scctx;
 	bus_dma_segment_t	*segs;
 	struct mbuf		*m_head;
+	void			*next_txd;
 	bus_dmamap_t		map;
 	struct if_pkt_info	pi;
 	int remap = 0;
 	int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd;
 	bus_dma_tag_t desc_tag;
 
 	segs = txq->ift_segs;
 	ctx = txq->ift_ctx;
 	sctx = ctx->ifc_sctx;
 	scctx = &ctx->ifc_softc_ctx;
 	segs = txq->ift_segs;
 	ntxd = txq->ift_size;
 	m_head = *m_headp;
 	map = NULL;
 
 	/*
 	 * If we're doing TSO the next descriptor to clean may be quite far ahead
 	 */
 	cidx = txq->ift_cidx;
 	pidx = txq->ift_pidx;
-	next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1);
+	if (ctx->ifc_flags & IFC_PREFETCH) {
+		next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1);
+		if (!(ctx->ifc_flags & IFLIB_HAS_TXCQ)) {
+			next_txd = calc_next_txd(txq, cidx, 0);
+			prefetch(next_txd);
+		}
 
-	/* prefetch the next cache line of mbuf pointers and flags */
-	prefetch(&txq->ift_sds.ifsd_m[next]);
-	if (txq->ift_sds.ifsd_map != NULL) {
-		prefetch(&txq->ift_sds.ifsd_map[next]);
+		/* prefetch the next cache line of mbuf pointers and flags */
+		prefetch(&txq->ift_sds.ifsd_m[next]);
+		if (txq->ift_sds.ifsd_map != NULL) {
+			prefetch(&txq->ift_sds.ifsd_map[next]);
+			next = (cidx + CACHE_LINE_SIZE) & (ntxd-1);
+			prefetch(&txq->ift_sds.ifsd_flags[next]);
+		}
+	} else if (txq->ift_sds.ifsd_map != NULL)
 		map = txq->ift_sds.ifsd_map[pidx];
-		next = (cidx + CACHE_LINE_SIZE) & (ntxd-1);
-		prefetch(&txq->ift_sds.ifsd_flags[next]);
-	}
 
-
 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
 		desc_tag = txq->ift_tso_desc_tag;
 		max_segs = scctx->isc_tx_tso_segments_max;
 	} else {
 		desc_tag = txq->ift_desc_tag;
 		max_segs = scctx->isc_tx_nsegments;
 	}
+	if ((sctx->isc_flags & IFLIB_NEED_ETHER_PAD) &&
+	    __predict_false(m_head->m_pkthdr.len < scctx->isc_min_frame_size)) {
+		err = iflib_ether_pad(ctx->ifc_dev, m_headp, scctx->isc_min_frame_size);
+		if (err)
+			return err;
+	}
 	m_head = *m_headp;
-	bzero(&pi, sizeof(pi));
-	pi.ipi_len = m_head->m_pkthdr.len;
+
+	pkt_info_zero(&pi);
 	pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST));
-	pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags;
-	pi.ipi_vtag = (m_head->m_flags & M_VLANTAG) ? m_head->m_pkthdr.ether_vtag : 0;
 	pi.ipi_pidx = pidx;
 	pi.ipi_qsidx = txq->ift_id;
+	pi.ipi_len = m_head->m_pkthdr.len;
+	pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags;
+	pi.ipi_vtag = (m_head->m_flags & M_VLANTAG) ? m_head->m_pkthdr.ether_vtag : 0;
 
 	/* deliberate bitwise OR to make one condition */
 	if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) {
 		if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0))
 			return (err);
 		m_head = *m_headp;
 	}
 
 retry:
 	err = iflib_busdma_load_mbuf_sg(txq, desc_tag, map, m_headp, segs, &nsegs, max_segs, BUS_DMA_NOWAIT);
 defrag:
 	if (__predict_false(err)) {
 		switch (err) {
 		case EFBIG:
 			/* try collapse once and defrag once */
 			if (remap == 0)
 				m_head = m_collapse(*m_headp, M_NOWAIT, max_segs);
 			if (remap == 1)
 				m_head = m_defrag(*m_headp, M_NOWAIT);
 			remap++;
 			if (__predict_false(m_head == NULL))
 				goto defrag_failed;
 			txq->ift_mbuf_defrag++;
 			*m_headp = m_head;
 			goto retry;
 			break;
 		case ENOMEM:
 			txq->ift_no_tx_dma_setup++;
 			break;
 		default:
 			txq->ift_no_tx_dma_setup++;
 			m_freem(*m_headp);
 			DBG_COUNTER_INC(tx_frees);
 			*m_headp = NULL;
 			break;
 		}
 		txq->ift_map_failed++;
 		DBG_COUNTER_INC(encap_load_mbuf_fail);
 		return (err);
 	}
 
 	/*
 	 * XXX assumes a 1 to 1 relationship between segments and
 	 *        descriptors - this does not hold true on all drivers, e.g.
 	 *        cxgb
 	 */
 	if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
 		txq->ift_no_desc_avail++;
 		if (map != NULL)
 			bus_dmamap_unload(desc_tag, map);
 		DBG_COUNTER_INC(encap_txq_avail_fail);
 		if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
 			GROUPTASK_ENQUEUE(&txq->ift_task);
 		return (ENOBUFS);
 	}
+	/*
+	 * On Intel cards we can greatly reduce the number of TX interrupts
+	 * we see by only setting report status on every Nth descriptor.
+	 * However, this also means that the driver will need to keep track
+	 * of the descriptors that RS was set on to check them for the DD bit.
+	 */
+	txq->ift_rs_pending += nsegs + 1;
+	if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) ||
+	     iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs - 1) <= MAX_TX_DESC(ctx)) {
+		pi.ipi_flags |= IPI_TX_INTR;
+		txq->ift_rs_pending = 0;
+	}
+
 	pi.ipi_segs = segs;
 	pi.ipi_nsegs = nsegs;
 
 	MPASS(pidx >= 0 && pidx < txq->ift_size);
 #ifdef PKT_DEBUG
 	print_pkt(&pi);
 #endif
+	if (map != NULL)
+		bus_dmamap_sync(desc_tag, map, BUS_DMASYNC_PREWRITE);
 	if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) {
-		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
-						BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
-
+		if (map != NULL)
+			bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
+					BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 		DBG_COUNTER_INC(tx_encap);
-		MPASS(pi.ipi_new_pidx >= 0 &&
-		    pi.ipi_new_pidx < txq->ift_size);
+		MPASS(pi.ipi_new_pidx < txq->ift_size);
 
 		ndesc = pi.ipi_new_pidx - pi.ipi_pidx;
 		if (pi.ipi_new_pidx < pi.ipi_pidx) {
 			ndesc += txq->ift_size;
 			txq->ift_gen = 1;
 		}
+		/*
+		 * drivers can need as many as 
+		 * two sentinels
+		 */
+		MPASS(ndesc <= pi.ipi_nsegs + 2);
 		MPASS(pi.ipi_new_pidx != pidx);
 		MPASS(ndesc > 0);
 		txq->ift_in_use += ndesc;
+
 		/*
 		 * We update the last software descriptor again here because there may
 		 * be a sentinel and/or there may be more mbufs than segments
 		 */
 		txq->ift_pidx = pi.ipi_new_pidx;
 		txq->ift_npending += pi.ipi_ndescs;
 	} else if (__predict_false(err == EFBIG && remap < 2)) {
 		*m_headp = m_head = iflib_remove_mbuf(txq);
 		remap = 1;
 		txq->ift_txd_encap_efbig++;
 		goto defrag;
 	} else
 		DBG_COUNTER_INC(encap_txd_encap_fail);
 	return (err);
 
 defrag_failed:
 	txq->ift_mbuf_defrag_failed++;
 	txq->ift_map_failed++;
 	m_freem(*m_headp);
 	DBG_COUNTER_INC(tx_frees);
 	*m_headp = NULL;
 	return (ENOMEM);
 }
 
-/* forward compatibility for cxgb */
-#define FIRST_QSET(ctx) 0
-
-#define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets)
-#define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets)
-#define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
-#define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))
-#define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
-#define MAX_TX_DESC(ctx) ((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max)
-
-
-
-/* if there are more than TXQ_MIN_OCCUPANCY packets pending we consider deferring
- * doorbell writes
- *
- * ORing with 2 assures that min occupancy is never less than 2 without any conditional logic
- */
-#define TXQ_MIN_OCCUPANCY(size) ((size >> 6)| 0x2)
-
-static inline int
-iflib_txq_min_occupancy(iflib_txq_t txq)
-{
-	if_ctx_t ctx;
-
-	ctx = txq->ift_ctx;
-	return (get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx,
-	    txq->ift_gen) < TXQ_MIN_OCCUPANCY(txq->ift_size) +
-	    MAX_TX_DESC(ctx));
-}
-
 static void
 iflib_tx_desc_free(iflib_txq_t txq, int n)
 {
 	int hasmap;
 	uint32_t qsize, cidx, mask, gen;
 	struct mbuf *m, **ifsd_m;
 	uint8_t *ifsd_flags;
 	bus_dmamap_t *ifsd_map;
+	bool do_prefetch;
 
 	cidx = txq->ift_cidx;
 	gen = txq->ift_gen;
 	qsize = txq->ift_size;
 	mask = qsize-1;
 	hasmap = txq->ift_sds.ifsd_map != NULL;
 	ifsd_flags = txq->ift_sds.ifsd_flags;
 	ifsd_m = txq->ift_sds.ifsd_m;
 	ifsd_map = txq->ift_sds.ifsd_map;
+	do_prefetch = (txq->ift_ctx->ifc_flags & IFC_PREFETCH);
 
 	while (n--) {
-		prefetch(ifsd_m[(cidx + 3) & mask]);
-		prefetch(ifsd_m[(cidx + 4) & mask]);
-
+		if (do_prefetch) {
+			prefetch(ifsd_m[(cidx + 3) & mask]);
+			prefetch(ifsd_m[(cidx + 4) & mask]);
+		}
 		if (ifsd_m[cidx] != NULL) {
 			prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]);
 			prefetch(&ifsd_flags[(cidx + CACHE_PTR_INCREMENT) & mask]);
 			if (hasmap && (ifsd_flags[cidx] & TX_SW_DESC_MAPPED)) {
 				/*
 				 * does it matter if it's not the TSO tag? If so we'll
 				 * have to add the type to flags
 				 */
 				bus_dmamap_unload(txq->ift_desc_tag, ifsd_map[cidx]);
 				ifsd_flags[cidx] &= ~TX_SW_DESC_MAPPED;
 			}
 			if ((m = ifsd_m[cidx]) != NULL) {
 				/* XXX we don't support any drivers that batch packets yet */
 				MPASS(m->m_nextpkt == NULL);
-
-				m_free(m);
+				/* if the number of clusters exceeds the number of segments
+				 * there won't be space on the ring to save a pointer to each
+				 * cluster so we simply free the list here
+				 */
+				if (m->m_flags & M_TOOBIG) {
+					m_freem(m);
+				} else {
+					m_free(m);
+				}
 				ifsd_m[cidx] = NULL;
 #if MEMORY_LOGGING
 				txq->ift_dequeued++;
 #endif
 				DBG_COUNTER_INC(tx_frees);
 			}
 		}
 		if (__predict_false(++cidx == qsize)) {
 			cidx = 0;
 			gen = 0;
 		}
 	}
 	txq->ift_cidx = cidx;
 	txq->ift_gen = gen;
 }
 
 static __inline int
 iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh)
 {
 	int reclaim;
 	if_ctx_t ctx = txq->ift_ctx;
 
 	KASSERT(thresh >= 0, ("invalid threshold to reclaim"));
 	MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size);
 
 	/*
 	 * Need a rate-limiting check so that this isn't called every time
 	 */
 	iflib_tx_credits_update(ctx, txq);
 	reclaim = DESC_RECLAIMABLE(txq);
 
 	if (reclaim <= thresh /* + MAX_TX_DESC(txq->ift_ctx) */) {
 #ifdef INVARIANTS
 		if (iflib_verbose_debug) {
 			printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__,
 			       txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments,
 			       reclaim, thresh);
 
 		}
 #endif
 		return (0);
 	}
 	iflib_tx_desc_free(txq, reclaim);
 	txq->ift_cleaned += reclaim;
 	txq->ift_in_use -= reclaim;
 
-	if (txq->ift_active == FALSE)
-		txq->ift_active = TRUE;
-
 	return (reclaim);
 }
 
 static struct mbuf **
-_ring_peek_one(struct ifmp_ring *r, int cidx, int offset)
+_ring_peek_one(struct ifmp_ring *r, int cidx, int offset, int remaining)
 {
+	int next, size;
+	struct mbuf **items;
 
-	return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (r->size-1)]));
+	size = r->size;
+	next = (cidx + CACHE_PTR_INCREMENT) & (size-1);
+	items = __DEVOLATILE(struct mbuf **, &r->items[0]);
+
+	prefetch(items[(cidx + offset) & (size-1)]);
+	if (remaining > 1) {
+		prefetch2cachelines(&items[next]);
+		prefetch2cachelines(items[(cidx + offset + 1) & (size-1)]);
+		prefetch2cachelines(items[(cidx + offset + 2) & (size-1)]);
+		prefetch2cachelines(items[(cidx + offset + 3) & (size-1)]);
+	}
+	return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (size-1)]));
 }
 
 static void
 iflib_txq_check_drain(iflib_txq_t txq, int budget)
 {
 
-	ifmp_ring_check_drainage(txq->ift_br[0], budget);
+	ifmp_ring_check_drainage(txq->ift_br, budget);
 }
 
 static uint32_t
 iflib_txq_can_drain(struct ifmp_ring *r)
 {
 	iflib_txq_t txq = r->cookie;
 	if_ctx_t ctx = txq->ift_ctx;
 
-	return ((TXQ_AVAIL(txq) >= MAX_TX_DESC(ctx)) ||
-		ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, txq->ift_cidx_processed, false));
+	return ((TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2) ||
+		ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, false));
 }
 
 static uint32_t
 iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
 {
 	iflib_txq_t txq = r->cookie;
 	if_ctx_t ctx = txq->ift_ctx;
-	if_t ifp = ctx->ifc_ifp;
+	struct ifnet *ifp = ctx->ifc_ifp;
 	struct mbuf **mp, *m;
-	int i, count, consumed, pkt_sent, bytes_sent, mcast_sent, avail, err, in_use_prev, desc_used;
+	int i, count, consumed, pkt_sent, bytes_sent, mcast_sent, avail;
+	int reclaimed, err = 0, in_use_prev, desc_used;
+	bool do_prefetch, ring, rang;
 
 	if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) ||
 			    !LINK_ACTIVE(ctx))) {
 		DBG_COUNTER_INC(txq_drain_notready);
 		return (0);
 	}
-
+	reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+	rang = iflib_txd_db_check(ctx, txq, reclaimed, txq->ift_in_use);
 	avail = IDXDIFF(pidx, cidx, r->size);
 	if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) {
 		DBG_COUNTER_INC(txq_drain_flushing);
 		for (i = 0; i < avail; i++) {
 			m_free(r->items[(cidx + i) & (r->size-1)]);
 			r->items[(cidx + i) & (r->size-1)] = NULL;
 		}
 		return (avail);
 	}
-	iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+
 	if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
 		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
-		callout_stop(&txq->ift_db_check);
 		CALLOUT_UNLOCK(txq);
 		DBG_COUNTER_INC(txq_drain_oactive);
 		return (0);
 	}
+	if (reclaimed)
+		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 	consumed = mcast_sent = bytes_sent = pkt_sent = 0;
 	count = MIN(avail, TX_BATCH_SIZE);
+#ifdef INVARIANTS
+	if (iflib_verbose_debug)
+		printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__,
+		       avail, ctx->ifc_flags, TXQ_AVAIL(txq));
+#endif
+	do_prefetch = (ctx->ifc_flags & IFC_PREFETCH);
+	avail = TXQ_AVAIL(txq);
+	for (desc_used = i = 0; i < count && avail > MAX_TX_DESC(ctx) + 2; i++) {
+		int pidx_prev, rem = do_prefetch ? count - i : 0;
 
-	for (desc_used = i = 0; i < count && TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2; i++) {
-		mp = _ring_peek_one(r, cidx, i);
+		mp = _ring_peek_one(r, cidx, i, rem);
+		MPASS(mp != NULL && *mp != NULL);
+		if (__predict_false(*mp == (struct mbuf *)txq)) {
+			consumed++;
+			reclaimed++;
+			continue;
+		}
 		in_use_prev = txq->ift_in_use;
+		pidx_prev = txq->ift_pidx;
 		err = iflib_encap(txq, mp);
-		/*
-		 * What other errors should we bail out for?
-		 */
-		if (err == ENOBUFS) {
+		if (__predict_false(err)) {
 			DBG_COUNTER_INC(txq_drain_encapfail);
-			break;
+			/* no room - bail out */
+			if (err == ENOBUFS)
+				break;
+			consumed++;
+			DBG_COUNTER_INC(txq_drain_encapfail);
+			/* we can't send this packet - skip it */
+			continue;
 		}
 		consumed++;
-		if (err)
-			continue;
-
 		pkt_sent++;
 		m = *mp;
 		DBG_COUNTER_INC(tx_sent);
 		bytes_sent += m->m_pkthdr.len;
-		if (m->m_flags & M_MCAST)
-			mcast_sent++;
+		mcast_sent += !!(m->m_flags & M_MCAST);
+		avail = TXQ_AVAIL(txq);
 
 		txq->ift_db_pending += (txq->ift_in_use - in_use_prev);
 		desc_used += (txq->ift_in_use - in_use_prev);
-		iflib_txd_db_check(ctx, txq, FALSE);
 		ETHER_BPF_MTAP(ifp, m);
-		if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
+		if (__predict_false(!(ifp->if_drv_flags & IFF_DRV_RUNNING)))
 			break;
-
-		if (desc_used > TXQ_MAX_DB_CONSUMED(txq->ift_size))
-			break;
+		rang = iflib_txd_db_check(ctx, txq, false, in_use_prev);
 	}
 
-	if ((iflib_min_tx_latency || iflib_txq_min_occupancy(txq)) && txq->ift_db_pending)
-		iflib_txd_db_check(ctx, txq, TRUE);
-	else if ((txq->ift_db_pending || TXQ_AVAIL(txq) < MAX_TX_DESC(ctx)) &&
-		 (callout_pending(&txq->ift_db_check) == 0)) {
-		txq->ift_db_pending_queued = txq->ift_db_pending;
-		callout_reset_on(&txq->ift_db_check, 1, iflib_txd_deferred_db_check,
-				 txq, txq->ift_db_check.c_cpu);
-	}
+	/* deliberate use of bitwise or to avoid gratuitous short-circuit */
+	ring = rang ? false  : (iflib_min_tx_latency | err) || (TXQ_AVAIL(txq) < MAX_TX_DESC(ctx));
+	iflib_txd_db_check(ctx, txq, ring, txq->ift_in_use);
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
 	if (mcast_sent)
 		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent);
-
+#ifdef INVARIANTS
+	if (iflib_verbose_debug)
+		printf("consumed=%d\n", consumed);
+#endif
 	return (consumed);
 }
 
+static uint32_t
+iflib_txq_drain_always(struct ifmp_ring *r)
+{
+	return (1);
+}
+
+static uint32_t
+iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
+{
+	int i, avail;
+	struct mbuf **mp;
+	iflib_txq_t txq;
+
+	txq = r->cookie;
+
+	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
+	CALLOUT_LOCK(txq);
+	callout_stop(&txq->ift_timer);
+	CALLOUT_UNLOCK(txq);
+
+	avail = IDXDIFF(pidx, cidx, r->size);
+	for (i = 0; i < avail; i++) {
+		mp = _ring_peek_one(r, cidx, i, avail - i);
+		if (__predict_false(*mp == (struct mbuf *)txq))
+			continue;
+		m_freem(*mp);
+	}
+	MPASS(ifmp_ring_is_stalled(r) == 0);
+	return (avail);
+}
+
 static void
+iflib_ifmp_purge(iflib_txq_t txq)
+{
+	struct ifmp_ring *r;
+
+	r = txq->ift_br;
+	r->drain = iflib_txq_drain_free;
+	r->can_drain = iflib_txq_drain_always;
+
+	ifmp_ring_check_drainage(r, r->size);
+
+	r->drain = iflib_txq_drain;
+	r->can_drain = iflib_txq_can_drain;
+}
+
+static void
 _task_fn_tx(void *context)
 {
 	iflib_txq_t txq = context;
 	if_ctx_t ctx = txq->ift_ctx;
+	struct ifnet *ifp = ctx->ifc_ifp;
+	int rc;
 
+#ifdef IFLIB_DIAGNOSTICS
+	txq->ift_cpu_exec_count[curcpu]++;
+#endif
 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
 		return;
-	ifmp_ring_check_drainage(txq->ift_br[0], TX_BATCH_SIZE);
+	if (if_getcapenable(ifp) & IFCAP_NETMAP) {
+		if (ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, false))
+			netmap_tx_irq(ifp, txq->ift_id);
+		IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id);
+		return;
+	}
+	if (txq->ift_db_pending)
+		ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE);
+	ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
+	if (ctx->ifc_flags & IFC_LEGACY)
+		IFDI_INTR_ENABLE(ctx);
+	else {
+		rc = IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id);
+		KASSERT(rc != ENOTSUP, ("MSI-X support requires queue_intr_enable, but not implemented in driver"));
+	}
 }
 
 static void
 _task_fn_rx(void *context)
 {
 	iflib_rxq_t rxq = context;
 	if_ctx_t ctx = rxq->ifr_ctx;
 	bool more;
 	int rc;
+	uint16_t budget;
 
+#ifdef IFLIB_DIAGNOSTICS
+	rxq->ifr_cpu_exec_count[curcpu]++;
+#endif
 	DBG_COUNTER_INC(task_fn_rxs);
 	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
 		return;
-
-	if ((more = iflib_rxeof(rxq, 16 /* XXX */)) == false) {
+	more = true;
+#ifdef DEV_NETMAP
+	if (if_getcapenable(ctx->ifc_ifp) & IFCAP_NETMAP) {
+		u_int work = 0;
+		if (netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &work)) {
+			more = false;
+		}
+	}
+#endif
+	budget = ctx->ifc_sysctl_rx_budget;
+	if (budget == 0)
+		budget = 16;	/* XXX */
+	if (more == false || (more = iflib_rxeof(rxq, budget)) == false) {
 		if (ctx->ifc_flags & IFC_LEGACY)
 			IFDI_INTR_ENABLE(ctx);
 		else {
 			DBG_COUNTER_INC(rx_intr_enables);
-			rc = IFDI_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
+			rc = IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
 			KASSERT(rc != ENOTSUP, ("MSI-X support requires queue_intr_enable, but not implemented in driver"));
 		}
 	}
 	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
 		return;
 	if (more)
 		GROUPTASK_ENQUEUE(&rxq->ifr_task);
 }
 
 static void
 _task_fn_admin(void *context)
 {
 	if_ctx_t ctx = context;
 	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 	iflib_txq_t txq;
 	int i;
 
-	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
-		return;
+	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) {
+		if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
+			return;
+		}
+	}
 
 	CTX_LOCK(ctx);
 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
 		CALLOUT_LOCK(txq);
 		callout_stop(&txq->ift_timer);
 		CALLOUT_UNLOCK(txq);
 	}
 	IFDI_UPDATE_ADMIN_STATUS(ctx);
 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
 		callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu);
 	IFDI_LINK_INTR_ENABLE(ctx);
+	if (ctx->ifc_flags & IFC_DO_RESET) {
+		ctx->ifc_flags &= ~IFC_DO_RESET;
+		iflib_if_init_locked(ctx);
+	}
 	CTX_UNLOCK(ctx);
 
 	if (LINK_ACTIVE(ctx) == 0)
 		return;
 	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
 		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
 }
 
 
 static void
 _task_fn_iov(void *context)
 {
 	if_ctx_t ctx = context;
 
 	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
 		return;
 
 	CTX_LOCK(ctx);
 	IFDI_VFLR_HANDLE(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 static int
 iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
 {
 	int err;
 	if_int_delay_info_t info;
 	if_ctx_t ctx;
 
 	info = (if_int_delay_info_t)arg1;
 	ctx = info->iidi_ctx;
 	info->iidi_req = req;
 	info->iidi_oidp = oidp;
 	CTX_LOCK(ctx);
 	err = IFDI_SYSCTL_INT_DELAY(ctx, info);
 	CTX_UNLOCK(ctx);
 	return (err);
 }
 
 /*********************************************************************
  *
  *  IFNET FUNCTIONS
  *
  **********************************************************************/
 
 static void
 iflib_if_init_locked(if_ctx_t ctx)
 {
 	iflib_stop(ctx);
 	iflib_init_locked(ctx);
 }
 
 
 static void
 iflib_if_init(void *arg)
 {
 	if_ctx_t ctx = arg;
 
 	CTX_LOCK(ctx);
 	iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 static int
 iflib_if_transmit(if_t ifp, struct mbuf *m)
 {
 	if_ctx_t	ctx = if_getsoftc(ifp);
 
 	iflib_txq_t txq;
 	int err, qidx;
 
 	if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) {
 		DBG_COUNTER_INC(tx_frees);
 		m_freem(m);
-		return (0);
+		return (ENOBUFS);
 	}
 
 	MPASS(m->m_nextpkt == NULL);
 	qidx = 0;
 	if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m))
 		qidx = QIDX(ctx, m);
 	/*
 	 * XXX calculate buf_ring based on flowid (divvy up bits?)
 	 */
 	txq = &ctx->ifc_txqs[qidx];
 
 #ifdef DRIVER_BACKPRESSURE
 	if (txq->ift_closed) {
 		while (m != NULL) {
 			next = m->m_nextpkt;
 			m->m_nextpkt = NULL;
 			m_freem(m);
 			m = next;
 		}
 		return (ENOBUFS);
 	}
 #endif
 #ifdef notyet
 	qidx = count = 0;
 	mp = marr;
 	next = m;
 	do {
 		count++;
 		next = next->m_nextpkt;
 	} while (next != NULL);
 
 	if (count > nitems(marr))
 		if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) {
 			/* XXX check nextpkt */
 			m_freem(m);
 			/* XXX simplify for now */
 			DBG_COUNTER_INC(tx_frees);
 			return (ENOBUFS);
 		}
 	for (next = m, i = 0; next != NULL; i++) {
 		mp[i] = next;
 		next = next->m_nextpkt;
 		mp[i]->m_nextpkt = NULL;
 	}
 #endif
 	DBG_COUNTER_INC(tx_seen);
-	err = ifmp_ring_enqueue(txq->ift_br[0], (void **)&m, 1, TX_BATCH_SIZE);
+	err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE);
 
+	GROUPTASK_ENQUEUE(&txq->ift_task);
 	if (err) {
-		GROUPTASK_ENQUEUE(&txq->ift_task);
 		/* support forthcoming later */
 #ifdef DRIVER_BACKPRESSURE
 		txq->ift_closed = TRUE;
 #endif
-		ifmp_ring_check_drainage(txq->ift_br[0], TX_BATCH_SIZE);
+		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
 		m_freem(m);
-	} else if (TXQ_AVAIL(txq) < (txq->ift_size >> 1)) {
-		GROUPTASK_ENQUEUE(&txq->ift_task);
 	}
 
 	return (err);
 }
 
 static void
 iflib_if_qflush(if_t ifp)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	iflib_txq_t txq = ctx->ifc_txqs;
 	int i;
 
 	CTX_LOCK(ctx);
 	ctx->ifc_flags |= IFC_QFLUSH;
 	CTX_UNLOCK(ctx);
 	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
-		while (!(ifmp_ring_is_idle(txq->ift_br[0]) || ifmp_ring_is_stalled(txq->ift_br[0])))
+		while (!(ifmp_ring_is_idle(txq->ift_br) || ifmp_ring_is_stalled(txq->ift_br)))
 			iflib_txq_check_drain(txq, 0);
 	CTX_LOCK(ctx);
 	ctx->ifc_flags &= ~IFC_QFLUSH;
 	CTX_UNLOCK(ctx);
 
 	if_qflush(ifp);
 }
 
 
 #define IFCAP_FLAGS (IFCAP_TXCSUM_IPV6 | IFCAP_RXCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \
-		     IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_VLAN_HWTAGGING |	\
+		     IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \
 		     IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | IFCAP_VLAN_HWTSO)
 
-#define IFCAP_REINIT IFCAP_FLAGS
-
 static int
 iflib_if_ioctl(if_t ifp, u_long command, caddr_t data)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 	struct ifreq	*ifr = (struct ifreq *)data;
 #if defined(INET) || defined(INET6)
 	struct ifaddr	*ifa = (struct ifaddr *)data;
 #endif
 	bool		avoid_reset = FALSE;
 	int		err = 0, reinit = 0, bits;
 
 	switch (command) {
 	case SIOCSIFADDR:
 #ifdef INET
 		if (ifa->ifa_addr->sa_family == AF_INET)
 			avoid_reset = TRUE;
 #endif
 #ifdef INET6
 		if (ifa->ifa_addr->sa_family == AF_INET6)
 			avoid_reset = TRUE;
 #endif
 		/*
 		** Calling init results in link renegotiation,
 		** so we avoid doing it when possible.
 		*/
 		if (avoid_reset) {
 			if_setflagbits(ifp, IFF_UP,0);
 			if (!(if_getdrvflags(ifp)& IFF_DRV_RUNNING))
 				reinit = 1;
 #ifdef INET
 			if (!(if_getflags(ifp) & IFF_NOARP))
 				arp_ifinit(ifp, ifa);
 #endif
 		} else
 			err = ether_ioctl(ifp, command, data);
 		break;
 	case SIOCSIFMTU:
 		CTX_LOCK(ctx);
 		if (ifr->ifr_mtu == if_getmtu(ifp)) {
 			CTX_UNLOCK(ctx);
 			break;
 		}
 		bits = if_getdrvflags(ifp);
 		/* stop the driver and free any clusters before proceeding */
 		iflib_stop(ctx);
 
 		if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) {
 			if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size)
 				ctx->ifc_flags |= IFC_MULTISEG;
 			else
 				ctx->ifc_flags &= ~IFC_MULTISEG;
 			err = if_setmtu(ifp, ifr->ifr_mtu);
 		}
 		iflib_init_locked(ctx);
 		if_setdrvflags(ifp, bits);
 		CTX_UNLOCK(ctx);
 		break;
 	case SIOCSIFFLAGS:
 		CTX_LOCK(ctx);
 		if (if_getflags(ifp) & IFF_UP) {
 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 				if ((if_getflags(ifp) ^ ctx->ifc_if_flags) &
 				    (IFF_PROMISC | IFF_ALLMULTI)) {
 					err = IFDI_PROMISC_SET(ctx, if_getflags(ifp));
 				}
 			} else
 				reinit = 1;
 		} else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 			iflib_stop(ctx);
 		}
 		ctx->ifc_if_flags = if_getflags(ifp);
 		CTX_UNLOCK(ctx);
 		break;
-
-		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 			CTX_LOCK(ctx);
 			IFDI_INTR_DISABLE(ctx);
 			IFDI_MULTI_SET(ctx);
 			IFDI_INTR_ENABLE(ctx);
 			CTX_UNLOCK(ctx);
 		}
 		break;
 	case SIOCSIFMEDIA:
 		CTX_LOCK(ctx);
 		IFDI_MEDIA_SET(ctx);
 		CTX_UNLOCK(ctx);
 		/* falls thru */
 	case SIOCGIFMEDIA:
+	case SIOCGIFXMEDIA:
 		err = ifmedia_ioctl(ifp, ifr, &ctx->ifc_media, command);
 		break;
 	case SIOCGI2C:
 	{
 		struct ifi2creq i2c;
 
 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
 		if (err != 0)
 			break;
 		if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
 			err = EINVAL;
 			break;
 		}
 		if (i2c.len > sizeof(i2c.data)) {
 			err = EINVAL;
 			break;
 		}
 
 		if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0)
 			err = copyout(&i2c, ifr_data_get_ptr(ifr),
 			    sizeof(i2c));
 		break;
 	}
 	case SIOCSIFCAP:
 	{
 		int mask, setmask;
 
 		mask = ifr->ifr_reqcap ^ if_getcapenable(ifp);
 		setmask = 0;
 #ifdef TCP_OFFLOAD
 		setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6);
 #endif
 		setmask |= (mask & IFCAP_FLAGS);
 
+		if (setmask  & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6))
+			setmask |= (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6);
 		if ((mask & IFCAP_WOL) &&
 		    (if_getcapabilities(ifp) & IFCAP_WOL) != 0)
 			setmask |= (mask & (IFCAP_WOL_MCAST|IFCAP_WOL_MAGIC));
 		if_vlancap(ifp);
 		/*
 		 * want to ensure that traffic has stopped before we change any of the flags
 		 */
 		if (setmask) {
 			CTX_LOCK(ctx);
 			bits = if_getdrvflags(ifp);
-			if (setmask & IFCAP_REINIT)
+			if (bits & IFF_DRV_RUNNING)
 				iflib_stop(ctx);
 			if_togglecapenable(ifp, setmask);
-			if (setmask & IFCAP_REINIT)
+			if (bits & IFF_DRV_RUNNING)
 				iflib_init_locked(ctx);
 			if_setdrvflags(ifp, bits);
 			CTX_UNLOCK(ctx);
 		}
 		break;
 	    }
 	case SIOCGPRIVATE_0:
 	case SIOCSDRVSPEC:
 	case SIOCGDRVSPEC:
 		CTX_LOCK(ctx);
 		err = IFDI_PRIV_IOCTL(ctx, command, data);
 		CTX_UNLOCK(ctx);
 		break;
 	default:
 		err = ether_ioctl(ifp, command, data);
 		break;
 	}
 	if (reinit)
 		iflib_if_init(ctx);
 	return (err);
 }
 
 static uint64_t
 iflib_if_get_counter(if_t ifp, ift_counter cnt)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 
 	return (IFDI_GET_COUNTER(ctx, cnt));
 }
 
 /*********************************************************************
  *
  *  OTHER FUNCTIONS EXPORTED TO THE STACK
  *
  **********************************************************************/
 
 static void
 iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 
 	if ((void *)ctx != arg)
 		return;
 
 	if ((vtag == 0) || (vtag > 4095))
 		return;
 
 	CTX_LOCK(ctx);
 	IFDI_VLAN_REGISTER(ctx, vtag);
 	/* Re-init to load the changes */
 	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER)
-		iflib_init_locked(ctx);
+		iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 static void
 iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag)
 {
 	if_ctx_t ctx = if_getsoftc(ifp);
 
 	if ((void *)ctx != arg)
 		return;
 
 	if ((vtag == 0) || (vtag > 4095))
 		return;
 
 	CTX_LOCK(ctx);
 	IFDI_VLAN_UNREGISTER(ctx, vtag);
 	/* Re-init to load the changes */
 	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER)
-		iflib_init_locked(ctx);
+		iflib_if_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 static void
 iflib_led_func(void *arg, int onoff)
 {
 	if_ctx_t ctx = arg;
 
 	CTX_LOCK(ctx);
 	IFDI_LED_FUNC(ctx, onoff);
 	CTX_UNLOCK(ctx);
 }
 
 /*********************************************************************
  *
  *  BUS FUNCTION DEFINITIONS
  *
  **********************************************************************/
 
 int
 iflib_device_probe(device_t dev)
 {
 	pci_vendor_info_t *ent;
 
 	uint16_t	pci_vendor_id, pci_device_id;
 	uint16_t	pci_subvendor_id, pci_subdevice_id;
 	uint16_t	pci_rev_id;
 	if_shared_ctx_t sctx;
 
 	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
 		return (ENOTSUP);
 
 	pci_vendor_id = pci_get_vendor(dev);
 	pci_device_id = pci_get_device(dev);
 	pci_subvendor_id = pci_get_subvendor(dev);
 	pci_subdevice_id = pci_get_subdevice(dev);
 	pci_rev_id = pci_get_revid(dev);
 	if (sctx->isc_parse_devinfo != NULL)
 		sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id);
 
 	ent = sctx->isc_vendor_info;
 	while (ent->pvi_vendor_id != 0) {
 		if (pci_vendor_id != ent->pvi_vendor_id) {
 			ent++;
 			continue;
 		}
 		if ((pci_device_id == ent->pvi_device_id) &&
 		    ((pci_subvendor_id == ent->pvi_subvendor_id) ||
 		     (ent->pvi_subvendor_id == 0)) &&
 		    ((pci_subdevice_id == ent->pvi_subdevice_id) ||
 		     (ent->pvi_subdevice_id == 0)) &&
 		    ((pci_rev_id == ent->pvi_rev_id) ||
 		     (ent->pvi_rev_id == 0))) {
 
 			device_set_desc_copy(dev, ent->pvi_name);
 			/* this needs to be changed to zero if the bus probing code
 			 * ever stops re-probing on best match because the sctx
 			 * may have its values over written by register calls
 			 * in subsequent probes
 			 */
 			return (BUS_PROBE_DEFAULT);
 		}
 		ent++;
 	}
 	return (ENXIO);
 }
 
 int
 iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp)
 {
 	int err, rid, msix, msix_bar;
 	if_ctx_t ctx;
 	if_t ifp;
 	if_softc_ctx_t scctx;
 	int i;
 	uint16_t main_txq;
 	uint16_t main_rxq;
 
 
 	ctx = malloc(sizeof(* ctx), M_IFLIB, M_WAITOK|M_ZERO);
 
 	if (sc == NULL) {
 		sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
 		device_set_softc(dev, ctx);
 		ctx->ifc_flags |= IFC_SC_ALLOCATED;
 	}
 
 	ctx->ifc_sctx = sctx;
 	ctx->ifc_dev = dev;
-	ctx->ifc_txrx = *sctx->isc_txrx;
 	ctx->ifc_softc = sc;
 
 	if ((err = iflib_register(ctx)) != 0) {
 		device_printf(dev, "iflib_register failed %d\n", err);
 		return (err);
 	}
 	iflib_add_device_sysctl_pre(ctx);
 
 	scctx = &ctx->ifc_softc_ctx;
+	ifp = ctx->ifc_ifp;
+	ctx->ifc_nhwtxqs = sctx->isc_ntxqs;
+
 	/*
 	 * XXX sanity check that ntxd & nrxd are a power of 2
 	 */
 	if (ctx->ifc_sysctl_ntxqs != 0)
 		scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs;
 	if (ctx->ifc_sysctl_nrxqs != 0)
 		scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs;
 
 	for (i = 0; i < sctx->isc_ntxqs; i++) {
 		if (ctx->ifc_sysctl_ntxds[i] != 0)
 			scctx->isc_ntxd[i] = ctx->ifc_sysctl_ntxds[i];
 		else
 			scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
 	}
 
 	for (i = 0; i < sctx->isc_nrxqs; i++) {
 		if (ctx->ifc_sysctl_nrxds[i] != 0)
 			scctx->isc_nrxd[i] = ctx->ifc_sysctl_nrxds[i];
 		else
 			scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
 	}
 
 	for (i = 0; i < sctx->isc_nrxqs; i++) {
 		if (scctx->isc_nrxd[i] < sctx->isc_nrxd_min[i]) {
 			device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n",
 				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_min[i]);
 			scctx->isc_nrxd[i] = sctx->isc_nrxd_min[i];
 		}
 		if (scctx->isc_nrxd[i] > sctx->isc_nrxd_max[i]) {
 			device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n",
 				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_max[i]);
 			scctx->isc_nrxd[i] = sctx->isc_nrxd_max[i];
 		}
 	}
 
 	for (i = 0; i < sctx->isc_ntxqs; i++) {
 		if (scctx->isc_ntxd[i] < sctx->isc_ntxd_min[i]) {
 			device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n",
 				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_min[i]);
 			scctx->isc_ntxd[i] = sctx->isc_ntxd_min[i];
 		}
 		if (scctx->isc_ntxd[i] > sctx->isc_ntxd_max[i]) {
 			device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n",
 				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_max[i]);
 			scctx->isc_ntxd[i] = sctx->isc_ntxd_max[i];
 		}
 	}
 
 	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
 		return (err);
 	}
-	if (scctx->isc_ntxqsets_max)
-		scctx->isc_ntxqsets = min(scctx->isc_ntxqsets, scctx->isc_ntxqsets_max);
-	if (scctx->isc_nrxqsets_max)
-		scctx->isc_nrxqsets = min(scctx->isc_nrxqsets, scctx->isc_nrxqsets_max);
+	_iflib_pre_assert(scctx);
+	ctx->ifc_txrx = *scctx->isc_txrx;
 
+#ifdef INVARIANTS
+	MPASS(scctx->isc_capenable);
+	if (scctx->isc_capenable & IFCAP_TXCSUM)
+		MPASS(scctx->isc_tx_csum_flags);
+#endif
+
+	if_setcapabilities(ifp, scctx->isc_capenable | IFCAP_HWSTATS);
+	if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS);
+
+	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
+		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
+	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
+		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
+
 #ifdef ACPI_DMAR
 	if (dmar_get_dma_tag(device_get_parent(dev), dev) != NULL)
 		ctx->ifc_flags |= IFC_DMAR;
+#elif !(defined(__i386__) || defined(__amd64__))
+	/* set unconditionally for !x86 */
+	ctx->ifc_flags |= IFC_DMAR;
 #endif
 
 	msix_bar = scctx->isc_msix_bar;
+	main_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0;
+	main_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0;
 
-	ifp = ctx->ifc_ifp;
-
-	if(sctx->isc_flags & IFLIB_HAS_TXCQ)
-		main_txq = 1;
-	else
-		main_txq = 0;
-
-	if(sctx->isc_flags & IFLIB_HAS_RXCQ)
-		main_rxq = 1;
-	else
-		main_rxq = 0;
-
 	/* XXX change for per-queue sizes */
 	device_printf(dev, "using %d tx descriptors and %d rx descriptors\n",
 		      scctx->isc_ntxd[main_txq], scctx->isc_nrxd[main_rxq]);
 	for (i = 0; i < sctx->isc_nrxqs; i++) {
 		if (!powerof2(scctx->isc_nrxd[i])) {
 			/* round down instead? */
 			device_printf(dev, "# rx descriptors must be a power of 2\n");
 			err = EINVAL;
 			goto fail;
 		}
 	}
 	for (i = 0; i < sctx->isc_ntxqs; i++) {
 		if (!powerof2(scctx->isc_ntxd[i])) {
 			device_printf(dev,
 			    "# tx descriptors must be a power of 2");
 			err = EINVAL;
 			goto fail;
 		}
 	}
 
 	if (scctx->isc_tx_nsegments > scctx->isc_ntxd[main_txq] /
 	    MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_nsegments = max(1, scctx->isc_ntxd[main_txq] /
 		    MAX_SINGLE_PACKET_FRACTION);
 	if (scctx->isc_tx_tso_segments_max > scctx->isc_ntxd[main_txq] /
 	    MAX_SINGLE_PACKET_FRACTION)
 		scctx->isc_tx_tso_segments_max = max(1,
 		    scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION);
 
 	/*
 	 * Protect the stack against modern hardware
 	 */
 	if (scctx->isc_tx_tso_size_max > FREEBSD_TSO_SIZE_MAX)
 		scctx->isc_tx_tso_size_max = FREEBSD_TSO_SIZE_MAX;
 
 	/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
 	ifp->if_hw_tsomaxsegcount = scctx->isc_tx_tso_segments_max;
 	ifp->if_hw_tsomax = scctx->isc_tx_tso_size_max;
 	ifp->if_hw_tsomaxsegsize = scctx->isc_tx_tso_segsize_max;
 	if (scctx->isc_rss_table_size == 0)
 		scctx->isc_rss_table_size = 64;
 	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
+
+	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
+	/* XXX format name */
+	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx, -1, "admin");
+
+	/* Set up cpu set.  If it fails, use the set of all CPUs. */
+	if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) {
+		device_printf(dev, "Unable to fetch CPU list\n");
+		CPU_COPY(&all_cpus, &ctx->ifc_cpus);
+	}
+	MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0);
+
 	/*
 	** Now setup MSI or MSI/X, should
 	** return us the number of supported
 	** vectors. (Will be 1 for MSI)
 	*/
 	if (sctx->isc_flags & IFLIB_SKIP_MSIX) {
 		msix = scctx->isc_vectors;
 	} else if (scctx->isc_msix_bar != 0)
+	       /*
+		* The simple fact that isc_msix_bar is not 0 does not mean
+		* we have a good value there that is known to work.
+		*/
 		msix = iflib_msix_init(ctx);
 	else {
 		scctx->isc_vectors = 1;
 		scctx->isc_ntxqsets = 1;
 		scctx->isc_nrxqsets = 1;
 		scctx->isc_intr = IFLIB_INTR_LEGACY;
 		msix = 0;
 	}
 	/* Get memory for the station queues */
 	if ((err = iflib_queues_alloc(ctx))) {
 		device_printf(dev, "Unable to allocate queue memory\n");
 		goto fail;
 	}
 
 	if ((err = iflib_qset_structures_setup(ctx))) {
 		device_printf(dev, "qset structure setup failed %d\n", err);
 		goto fail_queues;
 	}
-
+	/*
+	 * Group taskqueues aren't properly set up until SMP is started,
+	 * so we disable interrupts until we can handle them post
+	 * SI_SUB_SMP.
+	 *
+	 * XXX: disabling interrupts doesn't actually work, at least for
+	 * the non-MSI case.  When they occur before SI_SUB_SMP completes,
+	 * we do null handling and depend on this not causing too large an
+	 * interrupt storm.
+	 */
+	IFDI_INTR_DISABLE(ctx);
 	if (msix > 1 && (err = IFDI_MSIX_INTR_ASSIGN(ctx, msix)) != 0) {
 		device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n", err);
 		goto fail_intr_free;
 	}
 	if (msix <= 1) {
 		rid = 0;
 		if (scctx->isc_intr == IFLIB_INTR_MSI) {
 			MPASS(msix == 1);
 			rid = 1;
 		}
 		if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) {
 			device_printf(dev, "iflib_legacy_setup failed %d\n", err);
 			goto fail_intr_free;
 		}
 	}
 	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac);
 	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
 		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
 		goto fail_detach;
 	}
 	if ((err = iflib_netmap_attach(ctx))) {
 		device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err);
 		goto fail_detach;
 	}
 	*ctxp = ctx;
 
 	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
 	iflib_add_device_sysctl_post(ctx);
 	return (0);
 fail_detach:
 	ether_ifdetach(ctx->ifc_ifp);
 fail_intr_free:
 	if (scctx->isc_intr == IFLIB_INTR_MSIX || scctx->isc_intr == IFLIB_INTR_MSI)
 		pci_release_msi(ctx->ifc_dev);
 fail_queues:
 	/* XXX free queues */
 fail:
 	IFDI_DETACH(ctx);
 	return (err);
 }
 
 int
 iflib_device_attach(device_t dev)
 {
 	if_ctx_t ctx;
 	if_shared_ctx_t sctx;
 
 	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
 		return (ENOTSUP);
 
 	pci_enable_busmaster(dev);
 
 	return (iflib_device_register(dev, NULL, sctx, &ctx));
 }
 
 int
 iflib_device_deregister(if_ctx_t ctx)
 {
 	if_t ifp = ctx->ifc_ifp;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	device_t dev = ctx->ifc_dev;
-	int i;
+	int i, j;
 	struct taskqgroup *tqg;
+	iflib_fl_t fl;
 
 	/* Make sure VLANS are not using driver */
 	if (if_vlantrunkinuse(ifp)) {
 		device_printf(dev,"Vlan in use, detach first\n");
 		return (EBUSY);
 	}
 
 	CTX_LOCK(ctx);
 	ctx->ifc_in_detach = 1;
 	iflib_stop(ctx);
 	CTX_UNLOCK(ctx);
 
 	/* Unregister VLAN events */
 	if (ctx->ifc_vlan_attach_event != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event);
 	if (ctx->ifc_vlan_detach_event != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event);
 
 	iflib_netmap_detach(ifp);
 	ether_ifdetach(ifp);
 	/* ether_ifdetach calls if_qflush - lock must be destroy afterwards*/
 	CTX_LOCK_DESTROY(ctx);
 	if (ctx->ifc_led_dev != NULL)
 		led_destroy(ctx->ifc_led_dev);
 	/* XXX drain any dependent tasks */
-	tqg = qgroup_softirq;
+	tqg = qgroup_if_io_tqg;
 	for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
 		callout_drain(&txq->ift_timer);
-		callout_drain(&txq->ift_db_check);
 		if (txq->ift_task.gt_uniq != NULL)
 			taskqgroup_detach(tqg, &txq->ift_task);
 	}
 	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
 		if (rxq->ifr_task.gt_uniq != NULL)
 			taskqgroup_detach(tqg, &rxq->ifr_task);
+
+		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
+			free(fl->ifl_rx_bitmap, M_IFLIB);
+			
 	}
 	tqg = qgroup_if_config_tqg;
 	if (ctx->ifc_admin_task.gt_uniq != NULL)
 		taskqgroup_detach(tqg, &ctx->ifc_admin_task);
 	if (ctx->ifc_vflr_task.gt_uniq != NULL)
 		taskqgroup_detach(tqg, &ctx->ifc_vflr_task);
 
 	IFDI_DETACH(ctx);
 	device_set_softc(ctx->ifc_dev, NULL);
 	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_LEGACY) {
 		pci_release_msi(dev);
 	}
 	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_MSIX) {
 		iflib_irq_free(ctx, &ctx->ifc_legacy_irq);
 	}
 	if (ctx->ifc_msix_mem != NULL) {
 		bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY,
 			ctx->ifc_softc_ctx.isc_msix_bar, ctx->ifc_msix_mem);
 		ctx->ifc_msix_mem = NULL;
 	}
 
 	bus_generic_detach(dev);
 	if_free(ifp);
 
 	iflib_tx_structures_free(ctx);
 	iflib_rx_structures_free(ctx);
 	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
 		free(ctx->ifc_softc, M_IFLIB);
 	free(ctx, M_IFLIB);
 	return (0);
 }
 
 
 int
 iflib_device_detach(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 
 	return (iflib_device_deregister(ctx));
 }
 
 int
 iflib_device_suspend(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	IFDI_SUSPEND(ctx);
 	CTX_UNLOCK(ctx);
 
 	return bus_generic_suspend(dev);
 }
 int
 iflib_device_shutdown(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	IFDI_SHUTDOWN(ctx);
 	CTX_UNLOCK(ctx);
 
 	return bus_generic_suspend(dev);
 }
 
 
 int
 iflib_device_resume(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 	iflib_txq_t txq = ctx->ifc_txqs;
 
 	CTX_LOCK(ctx);
 	IFDI_RESUME(ctx);
 	iflib_init_locked(ctx);
 	CTX_UNLOCK(ctx);
 	for (int i = 0; i < NTXQSETS(ctx); i++, txq++)
 		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
 
 	return (bus_generic_resume(dev));
 }
 
 int
 iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params)
 {
 	int error;
 	if_ctx_t ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	error = IFDI_IOV_INIT(ctx, num_vfs, params);
 	CTX_UNLOCK(ctx);
 
 	return (error);
 }
 
 void
 iflib_device_iov_uninit(device_t dev)
 {
 	if_ctx_t ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	IFDI_IOV_UNINIT(ctx);
 	CTX_UNLOCK(ctx);
 }
 
 int
 iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params)
 {
 	int error;
 	if_ctx_t ctx = device_get_softc(dev);
 
 	CTX_LOCK(ctx);
 	error = IFDI_IOV_VF_ADD(ctx, vfnum, params);
 	CTX_UNLOCK(ctx);
 
 	return (error);
 }
 
 /*********************************************************************
  *
  *  MODULE FUNCTION DEFINITIONS
  *
  **********************************************************************/
 
 /*
  * - Start a fast taskqueue thread for each core
  * - Start a taskqueue for control operations
  */
 static int
 iflib_module_init(void)
 {
 	return (0);
 }
 
 static int
 iflib_module_event_handler(module_t mod, int what, void *arg)
 {
 	int err;
 
 	switch (what) {
 	case MOD_LOAD:
 		if ((err = iflib_module_init()) != 0)
 			return (err);
 		break;
 	case MOD_UNLOAD:
 		return (EBUSY);
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	return (0);
 }
 
 /*********************************************************************
  *
  *  PUBLIC FUNCTION DEFINITIONS
  *     ordered as in iflib.h
  *
  **********************************************************************/
 
 
 static void
 _iflib_assert(if_shared_ctx_t sctx)
 {
 	MPASS(sctx->isc_tx_maxsize);
 	MPASS(sctx->isc_tx_maxsegsize);
 
 	MPASS(sctx->isc_rx_maxsize);
 	MPASS(sctx->isc_rx_nsegments);
 	MPASS(sctx->isc_rx_maxsegsize);
 
-
-	MPASS(sctx->isc_txrx->ift_txd_encap);
-	MPASS(sctx->isc_txrx->ift_txd_flush);
-	MPASS(sctx->isc_txrx->ift_txd_credits_update);
-	MPASS(sctx->isc_txrx->ift_rxd_available);
-	MPASS(sctx->isc_txrx->ift_rxd_pkt_get);
-	MPASS(sctx->isc_txrx->ift_rxd_refill);
-	MPASS(sctx->isc_txrx->ift_rxd_flush);
-
 	MPASS(sctx->isc_nrxd_min[0]);
 	MPASS(sctx->isc_nrxd_max[0]);
 	MPASS(sctx->isc_nrxd_default[0]);
 	MPASS(sctx->isc_ntxd_min[0]);
 	MPASS(sctx->isc_ntxd_max[0]);
 	MPASS(sctx->isc_ntxd_default[0]);
 }
 
+static void
+_iflib_pre_assert(if_softc_ctx_t scctx)
+{
+
+	MPASS(scctx->isc_txrx->ift_txd_encap);
+	MPASS(scctx->isc_txrx->ift_txd_flush);
+	MPASS(scctx->isc_txrx->ift_txd_credits_update);
+	MPASS(scctx->isc_txrx->ift_rxd_available);
+	MPASS(scctx->isc_txrx->ift_rxd_pkt_get);
+	MPASS(scctx->isc_txrx->ift_rxd_refill);
+	MPASS(scctx->isc_txrx->ift_rxd_flush);
+}
+
 static int
 iflib_register(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	driver_t *driver = sctx->isc_driver;
 	device_t dev = ctx->ifc_dev;
 	if_t ifp;
 
 	_iflib_assert(sctx);
 
 	CTX_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev));
 
 	ifp = ctx->ifc_ifp = if_gethandle(IFT_ETHER);
 	if (ifp == NULL) {
 		device_printf(dev, "can not allocate ifnet structure\n");
 		return (ENOMEM);
 	}
 
 	/*
 	 * Initialize our context's device specific methods
 	 */
 	kobj_init((kobj_t) ctx, (kobj_class_t) driver);
 	kobj_class_compile((kobj_class_t) driver);
 	driver->refs++;
 
 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 	if_setsoftc(ifp, ctx);
 	if_setdev(ifp, dev);
 	if_setinitfn(ifp, iflib_if_init);
 	if_setioctlfn(ifp, iflib_if_ioctl);
 	if_settransmitfn(ifp, iflib_if_transmit);
 	if_setqflushfn(ifp, iflib_if_qflush);
 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
 
-	if_setcapabilities(ifp, 0);
-	if_setcapenable(ifp, 0);
-
 	ctx->ifc_vlan_attach_event =
 		EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx,
 							  EVENTHANDLER_PRI_FIRST);
 	ctx->ifc_vlan_detach_event =
 		EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx,
 							  EVENTHANDLER_PRI_FIRST);
 
 	ifmedia_init(&ctx->ifc_media, IFM_IMASK,
 					 iflib_media_change, iflib_media_status);
 
 	return (0);
 }
 
 
 static int
 iflib_queues_alloc(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	device_t dev = ctx->ifc_dev;
 	int nrxqsets = scctx->isc_nrxqsets;
 	int ntxqsets = scctx->isc_ntxqsets;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	iflib_fl_t fl = NULL;
 	int i, j, cpu, err, txconf, rxconf;
 	iflib_dma_info_t ifdip;
 	uint32_t *rxqsizes = scctx->isc_rxqsizes;
 	uint32_t *txqsizes = scctx->isc_txqsizes;
 	uint8_t nrxqs = sctx->isc_nrxqs;
 	uint8_t ntxqs = sctx->isc_ntxqs;
 	int nfree_lists = sctx->isc_nfl ? sctx->isc_nfl : 1;
 	caddr_t *vaddrs;
 	uint64_t *paddrs;
 	struct ifmp_ring **brscp;
-	int nbuf_rings = 1; /* XXX determine dynamically */
 
 	KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1"));
 	KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1"));
 
 	brscp = NULL;
 	txq = NULL;
 	rxq = NULL;
 
 /* Allocate the TX ring struct memory */
 	if (!(txq =
 	    (iflib_txq_t) malloc(sizeof(struct iflib_txq) *
 	    ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate TX ring memory\n");
 		err = ENOMEM;
 		goto fail;
 	}
 
 	/* Now allocate the RX */
 	if (!(rxq =
 	    (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) *
 	    nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate RX ring memory\n");
 		err = ENOMEM;
 		goto rx_fail;
 	}
-	if (!(brscp = malloc(sizeof(void *) * nbuf_rings * nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
-		device_printf(dev, "Unable to buf_ring_sc * memory\n");
-		err = ENOMEM;
-		goto rx_fail;
-	}
 
 	ctx->ifc_txqs = txq;
 	ctx->ifc_rxqs = rxq;
 
 	/*
 	 * XXX handle allocation failure
 	 */
 	for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) {
 		/* Set up some basics */
 
 		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs, M_IFLIB, M_WAITOK|M_ZERO)) == NULL) {
 			device_printf(dev, "failed to allocate iflib_dma_info\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 		txq->ift_ifdi = ifdip;
 		for (j = 0; j < ntxqs; j++, ifdip++) {
 			if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, BUS_DMA_NOWAIT)) {
 				device_printf(dev, "Unable to allocate Descriptor memory\n");
 				err = ENOMEM;
 				goto err_tx_desc;
 			}
+			txq->ift_txd_size[j] = scctx->isc_txd_size[j];
 			bzero((void *)ifdip->idi_vaddr, txqsizes[j]);
 		}
 		txq->ift_ctx = ctx;
 		txq->ift_id = i;
 		if (sctx->isc_flags & IFLIB_HAS_TXCQ) {
 			txq->ift_br_offset = 1;
 		} else {
 			txq->ift_br_offset = 0;
 		}
 		/* XXX fix this */
 		txq->ift_timer.c_cpu = cpu;
-		txq->ift_db_check.c_cpu = cpu;
-		txq->ift_nbr = nbuf_rings;
 
 		if (iflib_txsd_alloc(txq)) {
 			device_printf(dev, "Critical Failure setting up TX buffers\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 
 		/* Initialize the TX lock */
 		snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:tx(%d):callout",
 		    device_get_nameunit(dev), txq->ift_id);
 		mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF);
 		callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0);
-		callout_init_mtx(&txq->ift_db_check, &txq->ift_mtx, 0);
 
 		snprintf(txq->ift_db_mtx_name, MTX_NAME_LEN, "%s:tx(%d):db",
 			 device_get_nameunit(dev), txq->ift_id);
-		TXDB_LOCK_INIT(txq);
 
-		txq->ift_br = brscp + i*nbuf_rings;
-		for (j = 0; j < nbuf_rings; j++) {
-			err = ifmp_ring_alloc(&txq->ift_br[j], 2048, txq, iflib_txq_drain,
-					      iflib_txq_can_drain, M_IFLIB, M_WAITOK);
-			if (err) {
-				/* XXX free any allocated rings */
-				device_printf(dev, "Unable to allocate buf_ring\n");
-				goto err_tx_desc;
-			}
+		err = ifmp_ring_alloc(&txq->ift_br, 2048, txq, iflib_txq_drain,
+				      iflib_txq_can_drain, M_IFLIB, M_WAITOK);
+		if (err) {
+			/* XXX free any allocated rings */
+			device_printf(dev, "Unable to allocate buf_ring\n");
+			goto err_tx_desc;
 		}
 	}
 
 	for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) {
 		/* Set up some basics */
 
 		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs, M_IFLIB, M_WAITOK|M_ZERO)) == NULL) {
 			device_printf(dev, "failed to allocate iflib_dma_info\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 
 		rxq->ifr_ifdi = ifdip;
+		/* XXX this needs to be changed if #rx queues != #tx queues */
+		rxq->ifr_ntxqirq = 1;
+		rxq->ifr_txqid[0] = i;
 		for (j = 0; j < nrxqs; j++, ifdip++) {
 			if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, BUS_DMA_NOWAIT)) {
 				device_printf(dev, "Unable to allocate Descriptor memory\n");
 				err = ENOMEM;
 				goto err_tx_desc;
 			}
 			bzero((void *)ifdip->idi_vaddr, rxqsizes[j]);
 		}
 		rxq->ifr_ctx = ctx;
 		rxq->ifr_id = i;
 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 			rxq->ifr_fl_offset = 1;
 		} else {
 			rxq->ifr_fl_offset = 0;
 		}
 		rxq->ifr_nfl = nfree_lists;
 		if (!(fl =
 			  (iflib_fl_t) malloc(sizeof(struct iflib_fl) * nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) {
 			device_printf(dev, "Unable to allocate free list memory\n");
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
 		rxq->ifr_fl = fl;
 		for (j = 0; j < nfree_lists; j++) {
-			rxq->ifr_fl[j].ifl_rxq = rxq;
-			rxq->ifr_fl[j].ifl_id = j;
-			rxq->ifr_fl[j].ifl_ifdi =
-			    &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
+			fl[j].ifl_rxq = rxq;
+			fl[j].ifl_id = j;
+			fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
+			fl[j].ifl_rxd_size = scctx->isc_rxd_size[j];
 		}
         /* Allocate receive buffers for the ring*/
 		if (iflib_rxsd_alloc(rxq)) {
 			device_printf(dev,
 			    "Critical Failure setting up receive buffers\n");
 			err = ENOMEM;
 			goto err_rx_desc;
 		}
+
+		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) 
+			fl->ifl_rx_bitmap = bit_alloc(fl->ifl_size, M_IFLIB, M_WAITOK|M_ZERO);
 	}
 
 	/* TXQs */
 	vaddrs = malloc(sizeof(caddr_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
 	paddrs = malloc(sizeof(uint64_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
 	for (i = 0; i < ntxqsets; i++) {
 		iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi;
 
 		for (j = 0; j < ntxqs; j++, di++) {
 			vaddrs[i*ntxqs + j] = di->idi_vaddr;
 			paddrs[i*ntxqs + j] = di->idi_paddr;
 		}
 	}
 	if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) {
 		device_printf(ctx->ifc_dev, "device queue allocation failed\n");
 		iflib_tx_structures_free(ctx);
 		free(vaddrs, M_IFLIB);
 		free(paddrs, M_IFLIB);
 		goto err_rx_desc;
 	}
 	free(vaddrs, M_IFLIB);
 	free(paddrs, M_IFLIB);
 
 	/* RXQs */
 	vaddrs = malloc(sizeof(caddr_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
 	paddrs = malloc(sizeof(uint64_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
 	for (i = 0; i < nrxqsets; i++) {
 		iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi;
 
 		for (j = 0; j < nrxqs; j++, di++) {
 			vaddrs[i*nrxqs + j] = di->idi_vaddr;
 			paddrs[i*nrxqs + j] = di->idi_paddr;
 		}
 	}
 	if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) {
 		device_printf(ctx->ifc_dev, "device queue allocation failed\n");
 		iflib_tx_structures_free(ctx);
 		free(vaddrs, M_IFLIB);
 		free(paddrs, M_IFLIB);
 		goto err_rx_desc;
 	}
 	free(vaddrs, M_IFLIB);
 	free(paddrs, M_IFLIB);
 
 	return (0);
 
 /* XXX handle allocation failure changes */
 err_rx_desc:
 err_tx_desc:
 	if (ctx->ifc_rxqs != NULL)
 		free(ctx->ifc_rxqs, M_IFLIB);
 	ctx->ifc_rxqs = NULL;
 	if (ctx->ifc_txqs != NULL)
 		free(ctx->ifc_txqs, M_IFLIB);
 	ctx->ifc_txqs = NULL;
 rx_fail:
 	if (brscp != NULL)
 		free(brscp, M_IFLIB);
 	if (rxq != NULL)
 		free(rxq, M_IFLIB);
 	if (txq != NULL)
 		free(txq, M_IFLIB);
 fail:
 	return (err);
 }
 
 static int
 iflib_tx_structures_setup(if_ctx_t ctx)
 {
 	iflib_txq_t txq = ctx->ifc_txqs;
 	int i;
 
 	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
 		iflib_txq_setup(txq);
 
 	return (0);
 }
 
 static void
 iflib_tx_structures_free(if_ctx_t ctx)
 {
 	iflib_txq_t txq = ctx->ifc_txqs;
 	int i, j;
 
 	for (i = 0; i < NTXQSETS(ctx); i++, txq++) {
 		iflib_txq_destroy(txq);
 		for (j = 0; j < ctx->ifc_nhwtxqs; j++)
 			iflib_dma_free(&txq->ift_ifdi[j]);
 	}
 	free(ctx->ifc_txqs, M_IFLIB);
 	ctx->ifc_txqs = NULL;
 	IFDI_QUEUES_FREE(ctx);
 }
 
 /*********************************************************************
  *
  *  Initialize all receive rings.
  *
  **********************************************************************/
 static int
 iflib_rx_structures_setup(if_ctx_t ctx)
 {
 	iflib_rxq_t rxq = ctx->ifc_rxqs;
 	int q;
 #if defined(INET6) || defined(INET)
 	int i, err;
 #endif
 
 	for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++, rxq++) {
 #if defined(INET6) || defined(INET)
 		tcp_lro_free(&rxq->ifr_lc);
 		if ((err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp,
 		    TCP_LRO_ENTRIES, min(1024,
 		    ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]))) != 0) {
 			device_printf(ctx->ifc_dev, "LRO Initialization failed!\n");
 			goto fail;
 		}
 		rxq->ifr_lro_enabled = TRUE;
 #endif
 		IFDI_RXQ_SETUP(ctx, rxq->ifr_id);
 	}
 	return (0);
 #if defined(INET6) || defined(INET)
 fail:
 	/*
 	 * Free RX software descriptors allocated so far, we will only handle
 	 * the rings that completed, the failing case will have
 	 * cleaned up for itself. 'q' failed, so its the terminus.
 	 */
 	rxq = ctx->ifc_rxqs;
 	for (i = 0; i < q; ++i, rxq++) {
 		iflib_rx_sds_free(rxq);
 		rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0;
 	}
 	return (err);
 #endif
 }
 
 /*********************************************************************
  *
  *  Free all receive rings.
  *
  **********************************************************************/
 static void
 iflib_rx_structures_free(if_ctx_t ctx)
 {
 	iflib_rxq_t rxq = ctx->ifc_rxqs;
 
 	for (int i = 0; i < ctx->ifc_softc_ctx.isc_nrxqsets; i++, rxq++) {
 		iflib_rx_sds_free(rxq);
 	}
 }
 
 static int
 iflib_qset_structures_setup(if_ctx_t ctx)
 {
 	int err;
 
 	if ((err = iflib_tx_structures_setup(ctx)) != 0)
 		return (err);
 
 	if ((err = iflib_rx_structures_setup(ctx)) != 0) {
 		device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err);
 		iflib_tx_structures_free(ctx);
 		iflib_rx_structures_free(ctx);
 	}
 	return (err);
 }
 
 int
 iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
 				driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, char *name)
 {
 
 	return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
 }
 
-static void
-find_nth(if_ctx_t ctx, cpuset_t *cpus, int qid)
+#ifdef SMP
+static int
+find_nth(if_ctx_t ctx, int qid)
 {
-	int i, cpuid;
+	cpuset_t cpus;
+	int i, cpuid, eqid, count;
 
-	CPU_COPY(&ctx->ifc_cpus, cpus);
+	CPU_COPY(&ctx->ifc_cpus, &cpus);
+	count = CPU_COUNT(&cpus);
+	eqid = qid % count;
 	/* clear up to the qid'th bit */
-	for (i = 0; i < qid; i++) {
-		cpuid = CPU_FFS(cpus);
-		CPU_CLR(cpuid, cpus);
+	for (i = 0; i < eqid; i++) {
+		cpuid = CPU_FFS(&cpus);
+		MPASS(cpuid != 0);
+		CPU_CLR(cpuid-1, &cpus);
 	}
+	cpuid = CPU_FFS(&cpus);
+	MPASS(cpuid != 0);
+	return (cpuid-1);
 }
 
+#ifdef SCHED_ULE
+extern struct cpu_group *cpu_top;              /* CPU topology */
+
+static int
+find_child_with_core(int cpu, struct cpu_group *grp)
+{
+	int i;
+
+	if (grp->cg_children == 0)
+		return -1;
+
+	MPASS(grp->cg_child);
+	for (i = 0; i < grp->cg_children; i++) {
+		if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask))
+			return i;
+	}
+
+	return -1;
+}
+
+/*
+ * Find the nth "close" core to the specified core
+ * "close" is defined as the deepest level that shares
+ * at least an L2 cache.  With threads, this will be
+ * threads on the same core.  If the shared cache is L3
+ * or higher, simply returns the same core.
+ */
+static int
+find_close_core(int cpu, int core_offset)
+{
+	struct cpu_group *grp;
+	int i;
+	int fcpu;
+	cpuset_t cs;
+
+	grp = cpu_top;
+	if (grp == NULL)
+		return cpu;
+	i = 0;
+	while ((i = find_child_with_core(cpu, grp)) != -1) {
+		/* If the child only has one cpu, don't descend */
+		if (grp->cg_child[i].cg_count <= 1)
+			break;
+		grp = &grp->cg_child[i];
+	}
+
+	/* If they don't share at least an L2 cache, use the same CPU */
+	if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE)
+		return cpu;
+
+	/* Now pick one */
+	CPU_COPY(&grp->cg_mask, &cs);
+
+	/* Add the selected CPU offset to core offset. */
+	for (i = 0; (fcpu = CPU_FFS(&cs)) != 0; i++) {
+		if (fcpu - 1 == cpu)
+			break;
+		CPU_CLR(fcpu - 1, &cs);
+	}
+	MPASS(fcpu);
+
+	core_offset += i;
+
+	CPU_COPY(&grp->cg_mask, &cs);
+	for (i = core_offset % grp->cg_count; i > 0; i--) {
+		MPASS(CPU_FFS(&cs));
+		CPU_CLR(CPU_FFS(&cs) - 1, &cs);
+	}
+	MPASS(CPU_FFS(&cs));
+	return CPU_FFS(&cs) - 1;
+}
+#else
+static int
+find_close_core(int cpu, int core_offset __unused)
+{
+	return cpu;
+}
+#endif
+
+static int
+get_core_offset(if_ctx_t ctx, iflib_intr_type_t type, int qid)
+{
+	switch (type) {
+	case IFLIB_INTR_TX:
+		/* TX queues get cores which share at least an L2 cache with the corresponding RX queue */
+		/* XXX handle multiple RX threads per core and more than two cores per L2 group */
+		return qid / CPU_COUNT(&ctx->ifc_cpus) + 1;
+	case IFLIB_INTR_RX:
+	case IFLIB_INTR_RXTX:
+		/* RX queues get the specified core */
+		return qid / CPU_COUNT(&ctx->ifc_cpus);
+	default:
+		return -1;
+	}
+}
+#else
+#define get_core_offset(ctx, type, qid)	CPU_FIRST()
+#define find_close_core(cpuid, tid)	CPU_FIRST()
+#define find_nth(ctx, gid)		CPU_FIRST()
+#endif
+
+/* Just to avoid copy/paste */
+static inline int
+iflib_irq_set_affinity(if_ctx_t ctx, int irq, iflib_intr_type_t type, int qid,
+    struct grouptask *gtask, struct taskqgroup *tqg, void *uniq, char *name)
+{
+	int cpuid;
+	int err, tid;
+
+	cpuid = find_nth(ctx, qid);
+	tid = get_core_offset(ctx, type, qid);
+	MPASS(tid >= 0);
+	cpuid = find_close_core(cpuid, tid);
+	err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, irq, name);
+	if (err) {
+		device_printf(ctx->ifc_dev, "taskqgroup_attach_cpu failed %d\n", err);
+		return (err);
+	}
+#ifdef notyet
+	if (cpuid > ctx->ifc_cpuid_highest)
+		ctx->ifc_cpuid_highest = cpuid;
+#endif
+	return 0;
+}
+
 int
 iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
 						iflib_intr_type_t type, driver_filter_t *filter,
 						void *filter_arg, int qid, char *name)
 {
 	struct grouptask *gtask;
 	struct taskqgroup *tqg;
 	iflib_filter_info_t info;
-	cpuset_t cpus;
 	gtask_fn_t *fn;
 	int tqrid, err;
+	driver_filter_t *intr_fast;
 	void *q;
 
 	info = &ctx->ifc_filter_info;
+	tqrid = rid;
 
 	switch (type) {
 	/* XXX merge tx/rx for netmap? */
 	case IFLIB_INTR_TX:
 		q = &ctx->ifc_txqs[qid];
 		info = &ctx->ifc_txqs[qid].ift_filter_info;
 		gtask = &ctx->ifc_txqs[qid].ift_task;
-		tqg = qgroup_softirq;
-		tqrid = irq->ii_rid;
+		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_tx;
+		intr_fast = iflib_fast_intr;
+		GROUPTASK_INIT(gtask, 0, fn, q);
 		break;
 	case IFLIB_INTR_RX:
 		q = &ctx->ifc_rxqs[qid];
 		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
 		gtask = &ctx->ifc_rxqs[qid].ifr_task;
-		tqg = qgroup_softirq;
-		tqrid = irq->ii_rid;
+		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_rx;
+		intr_fast = iflib_fast_intr;
+		GROUPTASK_INIT(gtask, 0, fn, q);
 		break;
+	case IFLIB_INTR_RXTX:
+		q = &ctx->ifc_rxqs[qid];
+		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
+		gtask = &ctx->ifc_rxqs[qid].ifr_task;
+		tqg = qgroup_if_io_tqg;
+		fn = _task_fn_rx;
+		intr_fast = iflib_fast_intr_rxtx;
+		GROUPTASK_INIT(gtask, 0, fn, q);
+		break;
 	case IFLIB_INTR_ADMIN:
 		q = ctx;
+		tqrid = -1;
 		info = &ctx->ifc_filter_info;
 		gtask = &ctx->ifc_admin_task;
 		tqg = qgroup_if_config_tqg;
-		tqrid = -1;
 		fn = _task_fn_admin;
+		intr_fast = iflib_fast_intr_ctx;
 		break;
 	default:
 		panic("unknown net intr type");
 	}
-	GROUPTASK_INIT(gtask, 0, fn, q);
 
 	info->ifi_filter = filter;
 	info->ifi_filter_arg = filter_arg;
 	info->ifi_task = gtask;
+	info->ifi_ctx = q;
 
-	/* XXX query cpu that rid belongs to */
-
-	err = _iflib_irq_alloc(ctx, irq, rid, iflib_fast_intr, NULL, info,  name);
-	if (err != 0)
+	err = _iflib_irq_alloc(ctx, irq, rid, intr_fast, NULL, info,  name);
+	if (err != 0) {
+		device_printf(ctx->ifc_dev, "_iflib_irq_alloc failed %d\n", err);
 		return (err);
+	}
+	if (type == IFLIB_INTR_ADMIN)
+		return (0);
+
 	if (tqrid != -1) {
-		find_nth(ctx, &cpus, qid);
-		taskqgroup_attach_cpu(tqg, gtask, q, CPU_FFS(&cpus), irq->ii_rid, name);
-	} else
-		taskqgroup_attach(tqg, gtask, q, tqrid, name);
+		err = iflib_irq_set_affinity(ctx, rman_get_start(irq->ii_res), type, qid, gtask, tqg, q, name);
+		if (err)
+			return (err);
+	} else {
+		taskqgroup_attach(tqg, gtask, q, rman_get_start(irq->ii_res), name);
+	}
 
-
 	return (0);
 }
 
 void
-iflib_softirq_alloc_generic(if_ctx_t ctx, int rid, iflib_intr_type_t type,  void *arg, int qid, char *name)
+iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,  void *arg, int qid, char *name)
 {
 	struct grouptask *gtask;
 	struct taskqgroup *tqg;
 	gtask_fn_t *fn;
 	void *q;
+	int irq_num = -1;
+	int err;
 
 	switch (type) {
 	case IFLIB_INTR_TX:
 		q = &ctx->ifc_txqs[qid];
 		gtask = &ctx->ifc_txqs[qid].ift_task;
-		tqg = qgroup_softirq;
+		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_tx;
+		if (irq != NULL)
+			irq_num = rman_get_start(irq->ii_res);
 		break;
 	case IFLIB_INTR_RX:
 		q = &ctx->ifc_rxqs[qid];
 		gtask = &ctx->ifc_rxqs[qid].ifr_task;
-		tqg = qgroup_softirq;
+		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_rx;
+		if (irq != NULL)
+			irq_num = rman_get_start(irq->ii_res);
 		break;
-	case IFLIB_INTR_ADMIN:
-		q = ctx;
-		gtask = &ctx->ifc_admin_task;
-		tqg = qgroup_if_config_tqg;
-		rid = -1;
-		fn = _task_fn_admin;
-		break;
 	case IFLIB_INTR_IOV:
 		q = ctx;
 		gtask = &ctx->ifc_vflr_task;
 		tqg = qgroup_if_config_tqg;
-		rid = -1;
 		fn = _task_fn_iov;
 		break;
 	default:
 		panic("unknown net intr type");
 	}
 	GROUPTASK_INIT(gtask, 0, fn, q);
-	taskqgroup_attach(tqg, gtask, q, rid, name);
+	if (irq_num != -1) {
+		err = iflib_irq_set_affinity(ctx, irq_num, type, qid, gtask, tqg, q, name);
+		if (err)
+			taskqgroup_attach(tqg, gtask, q, irq_num, name);
+	}
+	else {
+		taskqgroup_attach(tqg, gtask, q, irq_num, name);
+	}
 }
 
 void
 iflib_irq_free(if_ctx_t ctx, if_irq_t irq)
 {
 	if (irq->ii_tag)
 		bus_teardown_intr(ctx->ifc_dev, irq->ii_res, irq->ii_tag);
 
 	if (irq->ii_res)
 		bus_release_resource(ctx->ifc_dev, SYS_RES_IRQ, irq->ii_rid, irq->ii_res);
 }
 
 static int
 iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, char *name)
 {
 	iflib_txq_t txq = ctx->ifc_txqs;
 	iflib_rxq_t rxq = ctx->ifc_rxqs;
 	if_irq_t irq = &ctx->ifc_legacy_irq;
 	iflib_filter_info_t info;
 	struct grouptask *gtask;
 	struct taskqgroup *tqg;
 	gtask_fn_t *fn;
 	int tqrid;
 	void *q;
 	int err;
 
 	q = &ctx->ifc_rxqs[0];
 	info = &rxq[0].ifr_filter_info;
 	gtask = &rxq[0].ifr_task;
-	tqg = qgroup_softirq;
+	tqg = qgroup_if_io_tqg;
 	tqrid = irq->ii_rid = *rid;
 	fn = _task_fn_rx;
 
 	ctx->ifc_flags |= IFC_LEGACY;
 	info->ifi_filter = filter;
 	info->ifi_filter_arg = filter_arg;
 	info->ifi_task = gtask;
 
 	/* We allocate a single interrupt resource */
-	if ((err = _iflib_irq_alloc(ctx, irq, tqrid, iflib_fast_intr, NULL, info, name)) != 0)
+	if ((err = _iflib_irq_alloc(ctx, irq, tqrid, iflib_fast_intr_ctx, NULL, info, name)) != 0)
 		return (err);
 	GROUPTASK_INIT(gtask, 0, fn, q);
-	taskqgroup_attach(tqg, gtask, q, tqrid, name);
+	taskqgroup_attach(tqg, gtask, q, rman_get_start(irq->ii_res), name);
 
 	GROUPTASK_INIT(&txq->ift_task, 0, _task_fn_tx, txq);
-	taskqgroup_attach(qgroup_softirq, &txq->ift_task, txq, tqrid, "tx");
-	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
-	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx, -1, "admin/link");
-
+	taskqgroup_attach(qgroup_if_io_tqg, &txq->ift_task, txq, rman_get_start(irq->ii_res), "tx");
 	return (0);
 }
 
 void
 iflib_led_create(if_ctx_t ctx)
 {
 
 	ctx->ifc_led_dev = led_create(iflib_led_func, ctx,
-								  device_get_nameunit(ctx->ifc_dev));
+	    device_get_nameunit(ctx->ifc_dev));
 }
 
 void
 iflib_tx_intr_deferred(if_ctx_t ctx, int txqid)
 {
 
 	GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task);
 }
 
 void
 iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid)
 {
 
 	GROUPTASK_ENQUEUE(&ctx->ifc_rxqs[rxqid].ifr_task);
 }
 
 void
 iflib_admin_intr_deferred(if_ctx_t ctx)
 {
+#ifdef INVARIANTS
+	struct grouptask *gtask;
 
+	gtask = &ctx->ifc_admin_task;
+	MPASS(gtask != NULL && gtask->gt_taskqueue != NULL);
+#endif
+
 	GROUPTASK_ENQUEUE(&ctx->ifc_admin_task);
 }
 
 void
 iflib_iov_intr_deferred(if_ctx_t ctx)
 {
 
 	GROUPTASK_ENQUEUE(&ctx->ifc_vflr_task);
 }
 
 void
 iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, char *name)
 {
 
-	taskqgroup_attach_cpu(qgroup_softirq, gt, uniq, cpu, -1, name);
+	taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, -1, name);
 }
 
 void
 iflib_config_gtask_init(if_ctx_t ctx, struct grouptask *gtask, gtask_fn_t *fn,
 	char *name)
 {
 
 	GROUPTASK_INIT(gtask, 0, fn, ctx);
 	taskqgroup_attach(qgroup_if_config_tqg, gtask, gtask, -1, name);
 }
 
 void
 iflib_config_gtask_deinit(struct grouptask *gtask)
 {
 
 	taskqgroup_detach(qgroup_if_config_tqg, gtask);	
 }
 
 void
 iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate)
 {
 	if_t ifp = ctx->ifc_ifp;
 	iflib_txq_t txq = ctx->ifc_txqs;
 
-
 	if_setbaudrate(ifp, baudrate);
+	if (baudrate >= IF_Gbps(10))
+		ctx->ifc_flags |= IFC_PREFETCH;
 
 	/* If link down, disable watchdog */
 	if ((ctx->ifc_link_state == LINK_STATE_UP) && (link_state == LINK_STATE_DOWN)) {
 		for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxqsets; i++, txq++)
 			txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 	}
 	ctx->ifc_link_state = link_state;
 	if_link_state_change(ifp, link_state);
 }
 
 static int
 iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq)
 {
 	int credits;
+#ifdef INVARIANTS
+	int credits_pre = txq->ift_cidx_processed;
+#endif
 
 	if (ctx->isc_txd_credits_update == NULL)
 		return (0);
 
-	if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, txq->ift_cidx_processed, true)) == 0)
+	if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, true)) == 0)
 		return (0);
 
 	txq->ift_processed += credits;
 	txq->ift_cidx_processed += credits;
 
+	MPASS(credits_pre + credits == txq->ift_cidx_processed);
 	if (txq->ift_cidx_processed >= txq->ift_size)
 		txq->ift_cidx_processed -= txq->ift_size;
 	return (credits);
 }
 
 static int
-iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, int cidx, int budget)
+iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget)
 {
 
 	return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx,
 	    budget));
 }
 
 void
 iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name,
 	const char *description, if_int_delay_info_t info,
 	int offset, int value)
 {
 	info->iidi_ctx = ctx;
 	info->iidi_offset = offset;
 	info->iidi_value = value;
 	SYSCTL_ADD_PROC(device_get_sysctl_ctx(ctx->ifc_dev),
 	    SYSCTL_CHILDREN(device_get_sysctl_tree(ctx->ifc_dev)),
 	    OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW,
 	    info, 0, iflib_sysctl_int_delay, "I", description);
 }
 
 struct mtx *
 iflib_ctx_lock_get(if_ctx_t ctx)
 {
 
 	return (&ctx->ifc_mtx);
 }
 
 static int
 iflib_msix_init(if_ctx_t ctx)
 {
 	device_t dev = ctx->ifc_dev;
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	int vectors, queues, rx_queues, tx_queues, queuemsgs, msgs;
 	int iflib_num_tx_queues, iflib_num_rx_queues;
 	int err, admincnt, bar;
 
-	iflib_num_tx_queues = scctx->isc_ntxqsets;
-	iflib_num_rx_queues = scctx->isc_nrxqsets;
+	iflib_num_tx_queues = ctx->ifc_sysctl_ntxqs;
+	iflib_num_rx_queues = ctx->ifc_sysctl_nrxqs;
 
+	device_printf(dev, "msix_init qsets capped at %d\n", imax(scctx->isc_ntxqsets, scctx->isc_nrxqsets));
+
 	bar = ctx->ifc_softc_ctx.isc_msix_bar;
 	admincnt = sctx->isc_admin_intrcnt;
+	/* Honor the global hw.pci.enable_msix tunable: 0 means take the msi path */
+	{
+		int i;
+		size_t len = sizeof(i);
+		err = kernel_sysctlbyname(curthread, "hw.pci.enable_msix", &i, &len, NULL, 0, NULL, 0);
+		if (err == 0) {
+			if (i == 0)
+				goto msi;	/* MSI-X disabled system-wide */
+		}
+		else {	/* read failed: report it and keep going with the MSI-X setup */
+			device_printf(dev, "unable to read hw.pci.enable_msix.\n");
+		}
+	}
 	/* Override by tuneable */
-	if (enable_msix == 0)
+	if (scctx->isc_disable_msix)
 		goto msi;
 
 	/*
 	** When used in a virtualized environment
 	** PCI BUSMASTER capability may not be set
 	** so explicity set it here and rewrite
 	** the ENABLE in the MSIX control register
 	** at this point to cause the host to
 	** successfully initialize us.
 	*/
 	{
-		uint16_t pci_cmd_word;
 		int msix_ctrl, rid;
 
+ 		pci_enable_busmaster(dev);
 		rid = 0;
-		pci_cmd_word = pci_read_config(dev, PCIR_COMMAND, 2);
-		pci_cmd_word |= PCIM_CMD_BUSMASTEREN;
-		pci_write_config(dev, PCIR_COMMAND, pci_cmd_word, 2);
-		pci_find_cap(dev, PCIY_MSIX, &rid);
-		rid += PCIR_MSIX_CTRL;
-		msix_ctrl = pci_read_config(dev, rid, 2);
-		msix_ctrl |= PCIM_MSIXCTRL_MSIX_ENABLE;
-		pci_write_config(dev, rid, msix_ctrl, 2);
+		if (pci_find_cap(dev, PCIY_MSIX, &rid) == 0 && rid != 0) {
+			rid += PCIR_MSIX_CTRL;
+			msix_ctrl = pci_read_config(dev, rid, 2);
+			msix_ctrl |= PCIM_MSIXCTRL_MSIX_ENABLE;
+			pci_write_config(dev, rid, msix_ctrl, 2);
+		} else {
+			device_printf(dev, "PCIY_MSIX capability not found; "
+			                   "or rid %d == 0.\n", rid);
+			goto msi;
+		}
 	}
 
 	/*
 	 * bar == -1 => "trust me I know what I'm doing"
 	 * Some drivers are for hardware that is so shoddily
 	 * documented that no one knows which bars are which
 	 * so the developer has to map all bars. This hack
 	 * allows shoddy garbage to use msix in this framework.
 	 */
 	if (bar != -1) {
 		ctx->ifc_msix_mem = bus_alloc_resource_any(dev,
 	            SYS_RES_MEMORY, &bar, RF_ACTIVE);
 		if (ctx->ifc_msix_mem == NULL) {
 			/* May not be enabled */
 			device_printf(dev, "Unable to map MSIX table \n");
 			goto msi;
 		}
 	}
 	/* First try MSI/X */
 	if ((msgs = pci_msix_count(dev)) == 0) { /* system has msix disabled */
 		device_printf(dev, "System has MSIX disabled \n");
 		bus_release_resource(dev, SYS_RES_MEMORY,
 		    bar, ctx->ifc_msix_mem);
 		ctx->ifc_msix_mem = NULL;
 		goto msi;
 	}
 #if IFLIB_DEBUG
 	/* use only 1 qset in debug mode */
 	queuemsgs = min(msgs - admincnt, 1);
 #else
 	queuemsgs = msgs - admincnt;
 #endif
-	if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) == 0) {
 #ifdef RSS
-		queues = imin(queuemsgs, rss_getnumbuckets());
+	queues = imin(queuemsgs, rss_getnumbuckets());
 #else
-		queues = queuemsgs;
+	queues = queuemsgs;
 #endif
-		queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues);
-		device_printf(dev, "pxm cpus: %d queue msgs: %d admincnt: %d\n",
-					  CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt);
-	} else {
-		device_printf(dev, "Unable to fetch CPU list\n");
-		/* Figure out a reasonable auto config value */
-		queues = min(queuemsgs, mp_ncpus);
-	}
+	queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues);
+	device_printf(dev, "pxm cpus: %d queue msgs: %d admincnt: %d\n",
+				  CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt);
 #ifdef  RSS
 	/* If we're doing RSS, clamp at the number of RSS buckets */
 	if (queues > rss_getnumbuckets())
 		queues = rss_getnumbuckets();
 #endif
 	if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt)
 		rx_queues = iflib_num_rx_queues;
 	else
 		rx_queues = queues;
+
+	if (rx_queues > scctx->isc_nrxqsets)
+		rx_queues = scctx->isc_nrxqsets;
+
 	/*
 	 * We want this to be all logical CPUs by default
 	 */
 	if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues)
 		tx_queues = iflib_num_tx_queues;
 	else
 		tx_queues = mp_ncpus;
 
+	if (tx_queues > scctx->isc_ntxqsets)
+		tx_queues = scctx->isc_ntxqsets;
+
 	if (ctx->ifc_sysctl_qs_eq_override == 0) {
 #ifdef INVARIANTS
 		if (tx_queues != rx_queues)
 			device_printf(dev, "queue equality override not set, capping rx_queues at %d and tx_queues at %d\n",
 				      min(rx_queues, tx_queues), min(rx_queues, tx_queues));
 #endif
 		tx_queues = min(rx_queues, tx_queues);
 		rx_queues = min(rx_queues, tx_queues);
 	}
 
 	device_printf(dev, "using %d rx queues %d tx queues \n", rx_queues, tx_queues);
 
 	vectors = rx_queues + admincnt;
 	if ((err = pci_alloc_msix(dev, &vectors)) == 0) {
 		device_printf(dev,
 					  "Using MSIX interrupts with %d vectors\n", vectors);
 		scctx->isc_vectors = vectors;
 		scctx->isc_nrxqsets = rx_queues;
 		scctx->isc_ntxqsets = tx_queues;
 		scctx->isc_intr = IFLIB_INTR_MSIX;
 
 		return (vectors);
 	} else {
 		device_printf(dev, "failed to allocate %d msix vectors, err: %d - using MSI\n", vectors, err);
 	}
 msi:
 	vectors = pci_msi_count(dev);
 	scctx->isc_nrxqsets = 1;
 	scctx->isc_ntxqsets = 1;
 	scctx->isc_vectors = vectors;
 	if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) {
 		device_printf(dev,"Using an MSI interrupt\n");
 		scctx->isc_intr = IFLIB_INTR_MSI;
 	} else {
 		device_printf(dev,"Using a Legacy interrupt\n");
 		scctx->isc_intr = IFLIB_INTR_LEGACY;
 	}
 
 	return (vectors);
 }
 
 char * ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" };
 
 static int
 mp_ring_state_handler(SYSCTL_HANDLER_ARGS)
 {
 	int rc;
 	uint16_t *state = ((uint16_t *)oidp->oid_arg1);
 	struct sbuf *sb;
 	char *ring_state = "UNKNOWN";
 
 	/* XXX needed ? */
 	rc = sysctl_wire_old_buffer(req, 0);
 	MPASS(rc == 0);
 	if (rc != 0)
 		return (rc);
 	sb = sbuf_new_for_sysctl(NULL, NULL, 80, req);
 	MPASS(sb != NULL);
 	if (sb == NULL)
 		return (ENOMEM);
 	if (state[3] <= 3)
 		ring_state = ring_states[state[3]];
 
 	sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s",
 		    state[0], state[1], state[2], ring_state);
 	rc = sbuf_finish(sb);
 	sbuf_delete(sb);
         return(rc);
 }
 
 enum iflib_ndesc_handler {
 	IFLIB_NTXD_HANDLER,
 	IFLIB_NRXD_HANDLER,
 };
 
 static int
 mp_ndesc_handler(SYSCTL_HANDLER_ARGS)
 {
 	if_ctx_t ctx = (void *)arg1;
 	enum iflib_ndesc_handler type = arg2;
 	char buf[256] = {0};
-	uint16_t *ndesc;
+	qidx_t *ndesc;
 	char *p, *next;
 	int nqs, rc, i;
 
 	MPASS(type == IFLIB_NTXD_HANDLER || type == IFLIB_NRXD_HANDLER);
 
 	nqs = 8;
 	switch(type) {
 	case IFLIB_NTXD_HANDLER:
 		ndesc = ctx->ifc_sysctl_ntxds;
 		if (ctx->ifc_sctx)
 			nqs = ctx->ifc_sctx->isc_ntxqs;
 		break;
 	case IFLIB_NRXD_HANDLER:
 		ndesc = ctx->ifc_sysctl_nrxds;
 		if (ctx->ifc_sctx)
 			nqs = ctx->ifc_sctx->isc_nrxqs;
 		break;
 	}
 	if (nqs == 0)
 		nqs = 8;
 
 	for (i=0; i<8; i++) {
 		if (i >= nqs)
 			break;
 		if (i)
 			strcat(buf, ",");
 		sprintf(strchr(buf, 0), "%d", ndesc[i]);
 	}
 
 	rc = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (rc || req->newptr == NULL)
 		return rc;
 
 	for (i = 0, next = buf, p = strsep(&next, " ,"); i < 8 && p;
 	    i++, p = strsep(&next, " ,")) {
 		ndesc[i] = strtoul(p, NULL, 10);
 	}
 
 	return(rc);
 }
 
 #define NAME_BUFLEN 32
 static void
 iflib_add_device_sysctl_pre(if_ctx_t ctx)
 {
         device_t dev = iflib_get_dev(ctx);
 	struct sysctl_oid_list *child, *oid_list;
 	struct sysctl_ctx_list *ctx_list;
 	struct sysctl_oid *node;
 
 	ctx_list = device_get_sysctl_ctx(dev);
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
 	ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, "iflib",
 						      CTLFLAG_RD, NULL, "IFLIB fields");
 	oid_list = SYSCTL_CHILDREN(node);
 
 	SYSCTL_ADD_STRING(ctx_list, oid_list, OID_AUTO, "driver_version",
 		       CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version, 0,
 		       "driver version");
 
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0,
 			"# of txqs to use, 0 => use default #");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0,
 			"# of rxqs to use, 0 => use default #");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable",
 		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0,
                        "permit #txq != #rxq");
+	SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "disable_msix",
+                      CTLFLAG_RWTUN, &ctx->ifc_softc_ctx.isc_disable_msix, 0,
+                      "disable MSIX (default 0)");
+	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "rx_budget",
+		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_rx_budget, 0,
+                       "set the rx budget");
 
 	/* XXX change for per-queue sizes */
 	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
 		       CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NTXD_HANDLER,
                        mp_ndesc_handler, "A",
                        "list of # of tx descriptors to use, 0 = use default #");
 	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds",
 		       CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NRXD_HANDLER,
                        mp_ndesc_handler, "A",
                        "list of # of rx descriptors to use, 0 = use default #");
 }
 
 static void
 iflib_add_device_sysctl_post(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
         device_t dev = iflib_get_dev(ctx);
 	struct sysctl_oid_list *child;
 	struct sysctl_ctx_list *ctx_list;
 	iflib_fl_t fl;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	int i, j;
 	char namebuf[NAME_BUFLEN];
 	char *qfmt;
 	struct sysctl_oid *queue_node, *fl_node, *node;
 	struct sysctl_oid_list *queue_list, *fl_list;
 	ctx_list = device_get_sysctl_ctx(dev);
 
 	node = ctx->ifc_sysctl_node;
 	child = SYSCTL_CHILDREN(node);
 
 	if (scctx->isc_ntxqsets > 100)
 		qfmt = "txq%03d";
 	else if (scctx->isc_ntxqsets > 10)
 		qfmt = "txq%02d";
 	else
 		qfmt = "txq%d";
 	for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) {
 		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
 		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
 					     CTLFLAG_RD, NULL, "Queue Name");
 		queue_list = SYSCTL_CHILDREN(queue_node);
 #if MEMORY_LOGGING
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued",
 				CTLFLAG_RD,
 				&txq->ift_dequeued, "total mbufs freed");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued",
 				CTLFLAG_RD,
 				&txq->ift_enqueued, "total mbufs enqueued");
 #endif
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag",
 				   CTLFLAG_RD,
 				   &txq->ift_mbuf_defrag, "# of times m_defrag was called");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "m_pullups",
 				   CTLFLAG_RD,
 				   &txq->ift_pullups, "# of times m_pullup was called");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag_failed",
 				   CTLFLAG_RD,
 				   &txq->ift_mbuf_defrag_failed, "# of times m_defrag failed");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_desc_avail",
 				   CTLFLAG_RD,
 				   &txq->ift_no_desc_avail, "# of times no descriptors were available");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "tx_map_failed",
 				   CTLFLAG_RD,
 				   &txq->ift_map_failed, "# of times dma map failed");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txd_encap_efbig",
 				   CTLFLAG_RD,
 				   &txq->ift_txd_encap_efbig, "# of times txd_encap returned EFBIG");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_tx_dma_setup",
 				   CTLFLAG_RD,
 				   &txq->ift_no_tx_dma_setup, "# of times map failed for other than EFBIG");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx",
 				   CTLFLAG_RD,
 				   &txq->ift_pidx, 1, "Producer Index");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx",
 				   CTLFLAG_RD,
 				   &txq->ift_cidx, 1, "Consumer Index");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx_processed",
 				   CTLFLAG_RD,
 				   &txq->ift_cidx_processed, 1, "Consumer Index seen by credit update");
 		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use",
 				   CTLFLAG_RD,
 				   &txq->ift_in_use, 1, "descriptors in use");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_processed",
 				   CTLFLAG_RD,
 				   &txq->ift_processed, "descriptors procesed for clean");
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned",
 				   CTLFLAG_RD,
 				   &txq->ift_cleaned, "total cleaned");
 		SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state",
-				CTLTYPE_STRING | CTLFLAG_RD, __DEVOLATILE(uint64_t *, &txq->ift_br[0]->state),
+				CTLTYPE_STRING | CTLFLAG_RD, __DEVOLATILE(uint64_t *, &txq->ift_br->state),
 				0, mp_ring_state_handler, "A", "soft ring state");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_enqueues",
-				       CTLFLAG_RD, &txq->ift_br[0]->enqueues,
+				       CTLFLAG_RD, &txq->ift_br->enqueues,
 				       "# of enqueues to the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_drops",
-				       CTLFLAG_RD, &txq->ift_br[0]->drops,
+				       CTLFLAG_RD, &txq->ift_br->drops,
 				       "# of drops in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_starts",
-				       CTLFLAG_RD, &txq->ift_br[0]->starts,
+				       CTLFLAG_RD, &txq->ift_br->starts,
 				       "# of normal consumer starts in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_stalls",
-				       CTLFLAG_RD, &txq->ift_br[0]->stalls,
+				       CTLFLAG_RD, &txq->ift_br->stalls,
 					       "# of consumer stalls in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_restarts",
-			       CTLFLAG_RD, &txq->ift_br[0]->restarts,
+			       CTLFLAG_RD, &txq->ift_br->restarts,
 				       "# of consumer restarts in the mp_ring for this queue");
 		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_abdications",
-				       CTLFLAG_RD, &txq->ift_br[0]->abdications,
+				       CTLFLAG_RD, &txq->ift_br->abdications,
 				       "# of consumer abdications in the mp_ring for this queue");
-
 	}
 
 	if (scctx->isc_nrxqsets > 100)
 		qfmt = "rxq%03d";
 	else if (scctx->isc_nrxqsets > 10)
 		qfmt = "rxq%02d";
 	else
 		qfmt = "rxq%d";
 	for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) {
 		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
 		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
 					     CTLFLAG_RD, NULL, "Queue Name");
 		queue_list = SYSCTL_CHILDREN(queue_node);
 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_pidx",
 				       CTLFLAG_RD,
 				       &rxq->ifr_cq_pidx, 1, "Producer Index");
 			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx",
 				       CTLFLAG_RD,
 				       &rxq->ifr_cq_cidx, 1, "Consumer Index");
 		}
+
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
 			snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j);
 			fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list, OID_AUTO, namebuf,
 						     CTLFLAG_RD, NULL, "freelist Name");
 			fl_list = SYSCTL_CHILDREN(fl_node);
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx",
 				       CTLFLAG_RD,
 				       &fl->ifl_pidx, 1, "Producer Index");
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx",
 				       CTLFLAG_RD,
 				       &fl->ifl_cidx, 1, "Consumer Index");
 			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits",
 				       CTLFLAG_RD,
 				       &fl->ifl_credits, 1, "credits available");
 #if MEMORY_LOGGING
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_enqueued",
 					CTLFLAG_RD,
 					&fl->ifl_m_enqueued, "mbufs allocated");
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_dequeued",
 					CTLFLAG_RD,
 					&fl->ifl_m_dequeued, "mbufs freed");
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_enqueued",
 					CTLFLAG_RD,
 					&fl->ifl_cl_enqueued, "clusters allocated");
 			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_dequeued",
 					CTLFLAG_RD,
 					&fl->ifl_cl_dequeued, "clusters freed");
 #endif
 
 		}
 	}
 
 }
+
+#ifndef __NO_STRICT_ALIGNMENT
+static struct mbuf *
+iflib_fixup_rx(struct mbuf *m)
+{
+	struct mbuf *n;
+
+	if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) {
+		bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len);
+		m->m_data += ETHER_HDR_LEN;
+		n = m;
+	} else {
+		MGETHDR(n, M_NOWAIT, MT_DATA);
+		if (n == NULL) {
+			m_freem(m);
+			return (NULL);
+		}
+		bcopy(m->m_data, n->m_data, ETHER_HDR_LEN);
+		m->m_data += ETHER_HDR_LEN;
+		m->m_len -= ETHER_HDR_LEN;
+		n->m_len = ETHER_HDR_LEN;
+		M_MOVE_PKTHDR(n, m);
+		n->m_next = m;
+	}
+	return (n);
+}
+#endif
Index: stable/11/sys/net/iflib.h
===================================================================
--- stable/11/sys/net/iflib.h	(revision 333337)
+++ stable/11/sys/net/iflib.h	(revision 333338)
@@ -1,359 +1,408 @@
 /*-
- * Copyright (c) 2014-2015, Matthew Macy (mmacy@nextbsd.org)
+ * Copyright (c) 2014-2017, Matthew Macy (mmacy@nextbsd.org)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  *  1. Redistributions of source code must retain the above copyright notice,
  *     this list of conditions and the following disclaimer.
  *
  *  2. Neither the name of Matthew Macy nor the names of its
  *     contributors may be used to endorse or promote products derived from
  *     this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #ifndef __IFLIB_H_
 #define __IFLIB_H_
 
 #include <sys/kobj.h>
 #include <sys/bus.h>
 #include <sys/cpuset.h>
 #include <machine/bus.h>
 #include <sys/bus_dma.h>
 #include <sys/nv.h>
 #include <sys/gtaskqueue.h>
 
-
 /*
+ * The value type for indexing.  It limits the maximum number of
+ * descriptors to 65535 and can be conditionally redefined to
+ * uint32_t in the future if the need arises.
+ */
+typedef uint16_t qidx_t;
+#define QIDX_INVALID 0xFFFF
+/*
  * Most cards can handle much larger TSO requests
  * but the FreeBSD TCP stack will break on larger
  * values
  */
 #define FREEBSD_TSO_SIZE_MAX 65518
 
 
 struct iflib_ctx;
 typedef struct iflib_ctx *if_ctx_t;
 struct if_shared_ctx;
 typedef struct if_shared_ctx *if_shared_ctx_t;
 struct if_int_delay_info;
 typedef struct if_int_delay_info  *if_int_delay_info_t;
 
 /*
  * File organization:
  *  - public structures
  *  - iflib accessors
  *  - iflib utility functions
  *  - iflib core functions
  */
 
 typedef struct if_rxd_frag {
 	uint8_t irf_flid;
-	uint16_t irf_idx;
+	qidx_t irf_idx;
 	uint16_t irf_len;
 } *if_rxd_frag_t;
 
 typedef struct if_rxd_info {
 	/* set by iflib */
 	uint16_t iri_qsidx;		/* qset index */
 	uint16_t iri_vtag;		/* vlan tag - if flag set */
 	/* XXX redundant with the new irf_len field */
 	uint16_t iri_len;		/* packet length */
-	uint16_t iri_cidx;		/* consumer index of cq */
+	qidx_t iri_cidx;		/* consumer index of cq */
 	struct ifnet *iri_ifp;		/* some drivers >1 interface per softc */
 
 	/* updated by driver */
-	uint16_t iri_flags;		/* mbuf flags for packet */
+	if_rxd_frag_t iri_frags;
 	uint32_t iri_flowid;		/* RSS hash for packet */
 	uint32_t iri_csum_flags;	/* m_pkthdr csum flags */
+
 	uint32_t iri_csum_data;		/* m_pkthdr csum data */
+	uint8_t iri_flags;		/* mbuf flags for packet */
 	uint8_t	 iri_nfrags;		/* number of fragments in packet */
 	uint8_t	 iri_rsstype;		/* RSS hash type */
 	uint8_t	 iri_pad;		/* any padding in the received data */
-	if_rxd_frag_t iri_frags;
 } *if_rxd_info_t;
 
+typedef struct if_rxd_update {	/* argument block passed to ift_rxd_refill */
+	uint64_t	*iru_paddrs;	/* presumably buffer physical addresses - TODO confirm */
+	caddr_t		*iru_vaddrs;	/* presumably buffer virtual addresses - TODO confirm */
+	qidx_t		*iru_idxs;	/* presumably per-buffer ring indices - TODO confirm */
+	qidx_t		iru_pidx;	/* presumably the starting producer index - TODO confirm */
+	uint16_t	iru_qsidx;	/* presumably the queue set index - TODO confirm */
+	uint16_t	iru_count;	/* presumably the number of entries in the arrays - TODO confirm */
+	uint16_t	iru_buf_size;	/* presumably the size of each buffer - TODO confirm */
+	uint8_t		iru_flidx;	/* presumably the free list index - TODO confirm */
+} *if_rxd_update_t;
+
 #define IPI_TX_INTR	0x1		/* send an interrupt when this packet is sent */
 #define IPI_TX_IPV4	0x2		/* ethertype IPv4 */
 #define IPI_TX_IPV6	0x4		/* ethertype IPv6 */
 
 typedef struct if_pkt_info {
-	uint32_t			ipi_len;	/* packet length */
-	bus_dma_segment_t		*ipi_segs;	/* physical addresses */
-	uint16_t			ipi_qsidx;	/* queue set index */
-	uint16_t			ipi_nsegs;	/* number of segments */
-	uint16_t			ipi_ndescs;	/* number of descriptors used by encap */
-	uint16_t			ipi_flags;	/* iflib per-packet flags */
-	uint32_t			ipi_pidx;	/* start pidx for encap */
-	uint32_t			ipi_new_pidx;	/* next available pidx post-encap */
+	bus_dma_segment_t	*ipi_segs;	/* physical addresses */
+	uint32_t		ipi_len;	/* packet length */
+	uint16_t		ipi_qsidx;	/* queue set index */
+	qidx_t			ipi_nsegs;	/* number of segments */
+
+	qidx_t			ipi_ndescs;	/* number of descriptors used by encap */
+	uint16_t		ipi_flags;	/* iflib per-packet flags */
+	qidx_t			ipi_pidx;	/* start pidx for encap */
+	qidx_t			ipi_new_pidx;	/* next available pidx post-encap */
 	/* offload handling */
-	uint64_t			ipi_csum_flags;	/* packet checksum flags */
-	uint16_t			ipi_tso_segsz;	/* tso segment size */
-	uint16_t			ipi_mflags;	/* packet mbuf flags */
-	uint16_t			ipi_vtag;	/* VLAN tag */
-	uint16_t			ipi_etype;	/* ether header type */
-	uint8_t				ipi_ehdrlen;	/* ether header length */
-	uint8_t				ipi_ip_hlen;	/* ip header length */
-	uint8_t				ipi_tcp_hlen;	/* tcp header length */
-	uint8_t				ipi_tcp_hflags;	/* tcp header flags */
-	uint8_t				ipi_ipproto;	/* ip protocol */
-	/* implied padding */
-	uint32_t			ipi_tcp_seq;	/* tcp seqno */
-	uint32_t			ipi_tcp_sum;	/* tcp csum */
+	uint8_t			ipi_ehdrlen;	/* ether header length */
+	uint8_t			ipi_ip_hlen;	/* ip header length */
+	uint8_t			ipi_tcp_hlen;	/* tcp header length */
+	uint8_t			ipi_ipproto;	/* ip protocol */
+
+	uint32_t		ipi_csum_flags;	/* packet checksum flags */
+	uint16_t		ipi_tso_segsz;	/* tso segment size */
+	uint16_t		ipi_vtag;	/* VLAN tag */
+	uint16_t		ipi_etype;	/* ether header type */
+	uint8_t			ipi_tcp_hflags;	/* tcp header flags */
+	uint8_t			ipi_mflags;	/* packet mbuf flags */
+
+	uint32_t		ipi_tcp_seq;	/* tcp seqno */
+	uint32_t		ipi_tcp_sum;	/* tcp csum */
 } *if_pkt_info_t;
 
 typedef struct if_irq {
 	struct resource  *ii_res;
 	int               ii_rid;
 	void             *ii_tag;
 } *if_irq_t;
 
 struct if_int_delay_info {
 	if_ctx_t iidi_ctx;	/* Back-pointer to the iflib ctx (softc) */
 	int iidi_offset;			/* Register offset to read/write */
 	int iidi_value;			/* Current value in usecs */
 	struct sysctl_oid *iidi_oidp;
 	struct sysctl_req *iidi_req;
 };
 
 typedef enum {
 	IFLIB_INTR_LEGACY,
 	IFLIB_INTR_MSI,
 	IFLIB_INTR_MSIX
 } iflib_intr_mode_t;
 
 /*
  * This really belongs in pciio.h or some place more general
  * but this is the only consumer for now.
  */
 typedef struct pci_vendor_info {
 	uint32_t	pvi_vendor_id;
 	uint32_t	pvi_device_id;
 	uint32_t	pvi_subvendor_id;
 	uint32_t	pvi_subdevice_id;
 	uint32_t	pvi_rev_id;
 	uint32_t	pvi_class_mask;
 	caddr_t		pvi_name;
 } pci_vendor_info_t;
 
 #define PVID(vendor, devid, name) {vendor, devid, 0, 0, 0, 0, name}
 #define PVID_OEM(vendor, devid, svid, sdevid, revid, name) {vendor, devid, svid, sdevid, revid, 0, name}
 #define PVID_END {0, 0, 0, 0, 0, 0, NULL}
 
+#define IFLIB_PNP_DESCR "U32:vendor;U32:device;U32:subvendor;U32:subdevice;" \
+    "U32:revision;U32:class;D:human"
+#define IFLIB_PNP_INFO(b, u, t) \
+    MODULE_PNP_INFO(IFLIB_PNP_DESCR, b, u, t, sizeof(t[0]), nitems(t) - 1)
+
 typedef struct if_txrx {
 	int (*ift_txd_encap) (void *, if_pkt_info_t);
-	void (*ift_txd_flush) (void *, uint16_t, uint32_t);
-	int (*ift_txd_credits_update) (void *, uint16_t, uint32_t, bool);
+	void (*ift_txd_flush) (void *, uint16_t, qidx_t pidx);
+	int (*ift_txd_credits_update) (void *, uint16_t qsidx, bool clear);
 
-	int (*ift_rxd_available) (void *, uint16_t qsidx, uint32_t pidx,
-	    int budget);
+	int (*ift_rxd_available) (void *, uint16_t qsidx, qidx_t pidx, qidx_t budget);
 	int (*ift_rxd_pkt_get) (void *, if_rxd_info_t ri);
-	void (*ift_rxd_refill) (void * , uint16_t qsidx, uint8_t flidx, uint32_t pidx,
-							uint64_t *paddrs, caddr_t *vaddrs, uint16_t count, uint16_t buf_size);
-	void (*ift_rxd_flush) (void *, uint16_t qsidx, uint8_t flidx, uint32_t pidx);
+	void (*ift_rxd_refill) (void * , if_rxd_update_t iru);
+	void (*ift_rxd_flush) (void *, uint16_t qsidx, uint8_t flidx, qidx_t pidx);
 	int (*ift_legacy_intr) (void *);
 } *if_txrx_t;
 
 typedef struct if_softc_ctx {
 	int isc_vectors;
 	int isc_nrxqsets;
 	int isc_ntxqsets;
 	int isc_msix_bar;		/* can be model specific - initialize in attach_pre */
 	int isc_tx_nsegments;		/* can be model specific - initialize in attach_pre */
 	int isc_ntxd[8];
 	int isc_nrxd[8];
 
 	uint32_t isc_txqsizes[8];
 	uint32_t isc_rxqsizes[8];
-	int isc_max_txqsets;
-	int isc_max_rxqsets;
+	/* Is there such a thing as a descriptor that is more than 248 bytes? */
+	uint8_t isc_txd_size[8];
+	uint8_t isc_rxd_size[8];
+
 	int isc_tx_tso_segments_max;
 	int isc_tx_tso_size_max;
 	int isc_tx_tso_segsize_max;
+	int isc_tx_csum_flags;
+	int isc_capenable;
 	int isc_rss_table_size;
 	int isc_rss_table_mask;
 	int isc_nrxqsets_max;
 	int isc_ntxqsets_max;
 
 	iflib_intr_mode_t isc_intr;
 	uint16_t isc_max_frame_size; /* set at init time by driver */
+	uint16_t isc_min_frame_size; /* set at init time by driver, only used if
+					IFLIB_NEED_ETHER_PAD is set. */
+	uint32_t isc_pause_frames;   /* set by driver for iflib_timer to detect */
 	pci_vendor_info_t isc_vendor_info;	/* set by iflib prior to attach_pre */
+	int isc_disable_msix;
+	if_txrx_t isc_txrx;
 } *if_softc_ctx_t;
 
 /*
  * Initialization values for device
  */
 struct if_shared_ctx {
-	int isc_magic;
-	if_txrx_t isc_txrx;
+	unsigned isc_magic;
 	driver_t *isc_driver;
-	int isc_nfl;
-	int isc_flags;
 	bus_size_t isc_q_align;
 	bus_size_t isc_tx_maxsize;
 	bus_size_t isc_tx_maxsegsize;
 	bus_size_t isc_rx_maxsize;
 	bus_size_t isc_rx_maxsegsize;
 	int isc_rx_nsegments;
-	int isc_rx_process_limit;
-	int isc_ntxqs;			/* # of tx queues per tx qset - usually 1 */
-	int isc_nrxqs;			/* # of rx queues per rx qset - intel 1, chelsio 2, broadcom 3 */
 	int isc_admin_intrcnt;		/* # of admin/link interrupts */
 
-
-	int isc_tx_reclaim_thresh;
-
 	/* fields necessary for probe */
 	pci_vendor_info_t *isc_vendor_info;
 	char *isc_driver_version;
 /* optional function to transform the read values to match the table*/
 	void (*isc_parse_devinfo) (uint16_t *device_id, uint16_t *subvendor_id,
 				   uint16_t *subdevice_id, uint16_t *rev_id);
 	int isc_nrxd_min[8];
 	int isc_nrxd_default[8];
 	int isc_nrxd_max[8];
 	int isc_ntxd_min[8];
 	int isc_ntxd_default[8];
 	int isc_ntxd_max[8];
+
+	/* actively used during operation */
+	int isc_nfl __aligned(CACHE_LINE_SIZE);
+	int isc_ntxqs;			/* # of tx queues per tx qset - usually 1 */
+	int isc_nrxqs;			/* # of rx queues per rx qset - intel 1, chelsio 2, broadcom 3 */
+	int isc_rx_process_limit;
+	int isc_tx_reclaim_thresh;
+	int isc_flags;
 };
 
 typedef struct iflib_dma_info {
 	bus_addr_t		idi_paddr;
 	caddr_t			idi_vaddr;
 	bus_dma_tag_t		idi_tag;
 	bus_dmamap_t		idi_map;
 	uint32_t		idi_size;
 } *iflib_dma_info_t;
 
 #define IFLIB_MAGIC 0xCAFEF00D
 
 typedef enum {
-	IFLIB_INTR_TX,
 	IFLIB_INTR_RX,
+	IFLIB_INTR_TX,
+	IFLIB_INTR_RXTX,
 	IFLIB_INTR_ADMIN,
 	IFLIB_INTR_IOV,
 } iflib_intr_type_t;
 
 #ifndef ETH_ADDR_LEN
 #define ETH_ADDR_LEN 6
 #endif
 
 
 /*
  * Interface has a separate command queue for RX
  */
-#define IFLIB_HAS_RXCQ		0x1
+#define IFLIB_HAS_RXCQ		0x01
 /*
  * Driver has already allocated vectors
  */
-#define IFLIB_SKIP_MSIX		0x2
-
+#define IFLIB_SKIP_MSIX		0x02
 /*
  * Interface is a virtual function
  */
-#define IFLIB_IS_VF		0x4
+#define IFLIB_IS_VF		0x04
 /*
  * Interface has a separate command queue for TX
  */
-#define IFLIB_HAS_TXCQ		0x8
+#define IFLIB_HAS_TXCQ		0x08
+/*
+ * Interface does checksum in place
+ */
+#define IFLIB_NEED_SCRATCH	0x10
+/*
+ * Interface doesn't expect in_pseudo for th_sum
+ */
+#define IFLIB_TSO_INIT_IP	0x20
+/*
+ * Interface doesn't align IP header
+ */
+#define IFLIB_DO_RX_FIXUP	0x40
+/*
+ * Driver needs csum zeroed for offloading
+ */
+#define IFLIB_NEED_ZERO_CSUM	0x80
+/*
+ * Driver needs frames padded to some minimum length
+ */
+#define IFLIB_NEED_ETHER_PAD	0x100
 
 
+
 /*
  * field accessors
  */
 void *iflib_get_softc(if_ctx_t ctx);
 
 device_t iflib_get_dev(if_ctx_t ctx);
 
 if_t iflib_get_ifp(if_ctx_t ctx);
 
 struct ifmedia *iflib_get_media(if_ctx_t ctx);
 
 if_softc_ctx_t iflib_get_softc_ctx(if_ctx_t ctx);
 if_shared_ctx_t iflib_get_sctx(if_ctx_t ctx);
 
 void iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN]);
 
-
-
-
 /*
  * If the driver can plug cleanly in to newbus use these
  */
 int iflib_device_probe(device_t);
 int iflib_device_attach(device_t);
 int iflib_device_detach(device_t);
 int iflib_device_suspend(device_t);
 int iflib_device_resume(device_t);
 int iflib_device_shutdown(device_t);
 
 
 int iflib_device_iov_init(device_t, uint16_t, const nvlist_t *);
 void iflib_device_iov_uninit(device_t);
 int iflib_device_iov_add_vf(device_t, uint16_t, const nvlist_t *);
 
 /*
  * If the driver can't plug cleanly in to newbus
  * use these
  */
 int iflib_device_register(device_t dev, void *softc, if_shared_ctx_t sctx, if_ctx_t *ctxp);
 int iflib_device_deregister(if_ctx_t);
 
 
 
 int iflib_irq_alloc(if_ctx_t, if_irq_t, int, driver_filter_t, void *filter_arg, driver_intr_t, void *arg, char *name);
 int iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
 							iflib_intr_type_t type, driver_filter_t *filter,
 							void *filter_arg, int qid, char *name);
-void iflib_softirq_alloc_generic(if_ctx_t ctx, int rid, iflib_intr_type_t type,  void *arg, int qid, char *name);
+void iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,  void *arg, int qid, char *name);
 
 void iflib_irq_free(if_ctx_t ctx, if_irq_t irq);
 
 void iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, char *name);
 
 void iflib_config_gtask_init(if_ctx_t ctx, struct grouptask *gtask,
 			     gtask_fn_t *fn, char *name);
 
 void iflib_config_gtask_deinit(struct grouptask *gtask);
 
 
 
 void iflib_tx_intr_deferred(if_ctx_t ctx, int txqid);
 void iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid);
 void iflib_admin_intr_deferred(if_ctx_t ctx);
 void iflib_iov_intr_deferred(if_ctx_t ctx);
 
 
 void iflib_link_state_change(if_ctx_t ctx, int linkstate, uint64_t baudrate);
 
 int iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags);
 void iflib_dma_free(iflib_dma_info_t dma);
 
 int iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count);
 
 void iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count);
 
 
 struct mtx *iflib_ctx_lock_get(if_ctx_t);
 struct mtx *iflib_qset_lock_get(if_ctx_t, uint16_t);
 
 void iflib_led_create(if_ctx_t ctx);
 
 void iflib_add_int_delay_sysctl(if_ctx_t, const char *, const char *,
 								if_int_delay_info_t, int, int);
 
 #endif /*  __IFLIB_H_ */
Index: stable/11/sys/net/mp_ring.c
===================================================================
--- stable/11/sys/net/mp_ring.c	(revision 333337)
+++ stable/11/sys/net/mp_ring.c	(revision 333338)
@@ -1,544 +1,540 @@
 /*-
  * Copyright (c) 2014 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/counter.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <machine/cpu.h>
 
 #if defined(__powerpc__) || defined(__mips__)
 #define NO_64BIT_ATOMICS
 #endif
 
 #if defined(__i386__)
 #define atomic_cmpset_acq_64 atomic_cmpset_64
 #define atomic_cmpset_rel_64 atomic_cmpset_64
 #endif
 
 #include <net/mp_ring.h>
 
 union ring_state {
 	struct {
 		uint16_t pidx_head;
 		uint16_t pidx_tail;
 		uint16_t cidx;
 		uint16_t flags;
 	};
 	uint64_t state;
 };
 
 enum {
 	IDLE = 0,	/* consumer ran to completion, nothing more to do. */
 	BUSY,		/* consumer is running already, or will be shortly. */
 	STALLED,	/* consumer stopped due to lack of resources. */
 	ABDICATED,	/* consumer stopped even though there was work to be
 			   done because it wants another thread to take over. */
 };
 
 static inline uint16_t
 space_available(struct ifmp_ring *r, union ring_state s)
 {
 	uint16_t x = r->size - 1;
 
 	if (s.cidx == s.pidx_head)
 		return (x);
 	else if (s.cidx > s.pidx_head)
 		return (s.cidx - s.pidx_head - 1);
 	else
 		return (x - s.pidx_head + s.cidx);
 }
 
 static inline uint16_t
 increment_idx(struct ifmp_ring *r, uint16_t idx, uint16_t n)
 {
 	int x = r->size - idx;
 
 	MPASS(x > 0);
 	return (x > n ? idx + n : n - x);
 }
 
 /* Consumer is about to update the ring's state to s */
 static inline uint16_t
 state_to_flags(union ring_state s, int abdicate)
 {
 
 	if (s.cidx == s.pidx_tail)
 		return (IDLE);
 	else if (abdicate && s.pidx_tail != s.pidx_head)
 		return (ABDICATED);
 
 	return (BUSY);
 }
 
 #ifdef NO_64BIT_ATOMICS
 static void
 drain_ring_locked(struct ifmp_ring *r, union ring_state os, uint16_t prev, int budget)
 {
 	union ring_state ns;
 	int n, pending, total;
 	uint16_t cidx = os.cidx;
 	uint16_t pidx = os.pidx_tail;
 
 	MPASS(os.flags == BUSY);
 	MPASS(cidx != pidx);
 
 	if (prev == IDLE)
 		counter_u64_add(r->starts, 1);
 	pending = 0;
 	total = 0;
 
 	while (cidx != pidx) {
 
 		/* Items from cidx to pidx are available for consumption. */
 		n = r->drain(r, cidx, pidx);
 		if (n == 0) {
 			os.state = ns.state = r->state;
 			ns.cidx = cidx;
 			ns.flags = STALLED;
 			r->state = ns.state;
 			if (prev != STALLED)
 				counter_u64_add(r->stalls, 1);
 			else if (total > 0) {
 				counter_u64_add(r->restarts, 1);
 				counter_u64_add(r->stalls, 1);
 			}
 			break;
 		}
 		cidx = increment_idx(r, cidx, n);
 		pending += n;
 		total += n;
 
 		/*
 		 * We update the cidx only if we've caught up with the pidx, the
 		 * real cidx is getting too far ahead of the one visible to
 		 * everyone else, or we have exceeded our budget.
 		 */
 		if (cidx != pidx && pending < 64 && total < budget)
 			continue;
 
 		os.state = ns.state = r->state;
 		ns.cidx = cidx;
 		ns.flags = state_to_flags(ns, total >= budget);
 		r->state = ns.state;
 
 		if (ns.flags == ABDICATED)
 			counter_u64_add(r->abdications, 1);
 		if (ns.flags != BUSY) {
 			/* Wrong loop exit if we're going to stall. */
 			MPASS(ns.flags != STALLED);
 			if (prev == STALLED) {
 				MPASS(total > 0);
 				counter_u64_add(r->restarts, 1);
 			}
 			break;
 		}
 
 		/*
 		 * The acquire style atomic above guarantees visibility of items
 		 * associated with any pidx change that we notice here.
 		 */
 		pidx = ns.pidx_tail;
 		pending = 0;
 	}
 }
 #else
 /*
  * Caller passes in a state, with a guarantee that there is work to do and that
  * all items up to the pidx_tail in the state are visible.
  */
 static void
 drain_ring_lockless(struct ifmp_ring *r, union ring_state os, uint16_t prev, int budget)
 {
 	union ring_state ns;
 	int n, pending, total;
 	uint16_t cidx = os.cidx;
 	uint16_t pidx = os.pidx_tail;
 
 	MPASS(os.flags == BUSY);
 	MPASS(cidx != pidx);
 
 	if (prev == IDLE)
 		counter_u64_add(r->starts, 1);
 	pending = 0;
 	total = 0;
 
 	while (cidx != pidx) {
 
 		/* Items from cidx to pidx are available for consumption. */
 		n = r->drain(r, cidx, pidx);
 		if (n == 0) {
 			critical_enter();
 			do {
 				os.state = ns.state = r->state;
 				ns.cidx = cidx;
 				ns.flags = STALLED;
 			} while (atomic_cmpset_64(&r->state, os.state,
 			    ns.state) == 0);
 			critical_exit();
 			if (prev != STALLED)
 				counter_u64_add(r->stalls, 1);
 			else if (total > 0) {
 				counter_u64_add(r->restarts, 1);
 				counter_u64_add(r->stalls, 1);
 			}
 			break;
 		}
 		cidx = increment_idx(r, cidx, n);
 		pending += n;
 		total += n;
 
 		/*
 		 * We update the cidx only if we've caught up with the pidx, the
 		 * real cidx is getting too far ahead of the one visible to
 		 * everyone else, or we have exceeded our budget.
 		 */
 		if (cidx != pidx && pending < 64 && total < budget)
 			continue;
 		critical_enter();
 		do {
 			os.state = ns.state = r->state;
 			ns.cidx = cidx;
 			ns.flags = state_to_flags(ns, total >= budget);
 		} while (atomic_cmpset_acq_64(&r->state, os.state, ns.state) == 0);
 		critical_exit();
 
 		if (ns.flags == ABDICATED)
 			counter_u64_add(r->abdications, 1);
 		if (ns.flags != BUSY) {
 			/* Wrong loop exit if we're going to stall. */
 			MPASS(ns.flags != STALLED);
 			if (prev == STALLED) {
 				MPASS(total > 0);
 				counter_u64_add(r->restarts, 1);
 			}
 			break;
 		}
 
 		/*
 		 * The acquire style atomic above guarantees visibility of items
 		 * associated with any pidx change that we notice here.
 		 */
 		pidx = ns.pidx_tail;
 		pending = 0;
 	}
 }
 #endif
 
 int
 ifmp_ring_alloc(struct ifmp_ring **pr, int size, void *cookie, mp_ring_drain_t drain,
     mp_ring_can_drain_t can_drain, struct malloc_type *mt, int flags)
 {
 	struct ifmp_ring *r;
 
 	/* All idx are 16b so size can be 65536 at most */
 	if (pr == NULL || size < 2 || size > 65536 || drain == NULL ||
 	    can_drain == NULL)
 		return (EINVAL);
 	*pr = NULL;
 	flags &= M_NOWAIT | M_WAITOK;
 	MPASS(flags != 0);
 
 	r = malloc(__offsetof(struct ifmp_ring, items[size]), mt, flags | M_ZERO);
 	if (r == NULL)
 		return (ENOMEM);
 	r->size = size;
 	r->cookie = cookie;
 	r->mt = mt;
 	r->drain = drain;
 	r->can_drain = can_drain;
 	r->enqueues = counter_u64_alloc(flags);
 	r->drops = counter_u64_alloc(flags);
 	r->starts = counter_u64_alloc(flags);
 	r->stalls = counter_u64_alloc(flags);
 	r->restarts = counter_u64_alloc(flags);
 	r->abdications = counter_u64_alloc(flags);
 	if (r->enqueues == NULL || r->drops == NULL || r->starts == NULL ||
 	    r->stalls == NULL || r->restarts == NULL ||
 	    r->abdications == NULL) {
 		ifmp_ring_free(r);
 		return (ENOMEM);
 	}
 
 	*pr = r;
 #ifdef NO_64BIT_ATOMICS
 	mtx_init(&r->lock, "mp_ring lock", NULL, MTX_DEF);
 #endif
 	return (0);
 }
 
 void
 ifmp_ring_free(struct ifmp_ring *r)
 {
 
 	if (r == NULL)
 		return;
 
 	if (r->enqueues != NULL)
 		counter_u64_free(r->enqueues);
 	if (r->drops != NULL)
 		counter_u64_free(r->drops);
 	if (r->starts != NULL)
 		counter_u64_free(r->starts);
 	if (r->stalls != NULL)
 		counter_u64_free(r->stalls);
 	if (r->restarts != NULL)
 		counter_u64_free(r->restarts);
 	if (r->abdications != NULL)
 		counter_u64_free(r->abdications);
 
 	free(r, r->mt);
 }
 
 /*
  * Enqueue n items and maybe drain the ring for some time.
  *
  * Returns an errno.
  */
 #ifdef NO_64BIT_ATOMICS
 int
 ifmp_ring_enqueue(struct ifmp_ring *r, void **items, int n, int budget)
 {
 	union ring_state os, ns;
 	uint16_t pidx_start, pidx_stop;
 	int i;
 
 	MPASS(items != NULL);
 	MPASS(n > 0);
 
 	mtx_lock(&r->lock);
 	/*
 	 * Reserve room for the new items.  Our reservation, if successful, is
 	 * from 'pidx_start' to 'pidx_stop'.
 	 */
 	os.state = r->state;
 	if (n >= space_available(r, os)) {
 		counter_u64_add(r->drops, n);
 		MPASS(os.flags != IDLE);
 		if (os.flags == STALLED)
 			ifmp_ring_check_drainage(r, 0);
 		return (ENOBUFS);
 	}
 	ns.state = os.state;
 	ns.pidx_head = increment_idx(r, os.pidx_head, n);
 	r->state = ns.state;
 	pidx_start = os.pidx_head;
 	pidx_stop = ns.pidx_head;
 
 	/*
 	 * Wait for other producers who got in ahead of us to enqueue their
 	 * items, one producer at a time.  It is our turn when the ring's
 	 * pidx_tail reaches the beginning of our reservation (pidx_start).
 	 */
 	while (ns.pidx_tail != pidx_start) {
 		cpu_spinwait();
 		ns.state = r->state;
 	}
 
 	/* Now it is our turn to fill up the area we reserved earlier. */
 	i = pidx_start;
 	do {
 		r->items[i] = *items++;
 		if (__predict_false(++i == r->size))
 			i = 0;
 	} while (i != pidx_stop);
 
 	/*
 	 * Update the ring's pidx_tail.  The release style atomic guarantees
 	 * that the items are visible to any thread that sees the updated pidx.
 	 */
 	os.state = ns.state = r->state;
 	ns.pidx_tail = pidx_stop;
 	ns.flags = BUSY;
 	r->state = ns.state;
 	counter_u64_add(r->enqueues, n);
 
 	/*
 	 * Turn into a consumer if some other thread isn't active as a consumer
 	 * already.
 	 */
 	if (os.flags != BUSY)
 		drain_ring_locked(r, ns, os.flags, budget);
 
 	mtx_unlock(&r->lock);
 	return (0);
 }
 
 #else
 int
 ifmp_ring_enqueue(struct ifmp_ring *r, void **items, int n, int budget)
 {
 	union ring_state os, ns;
 	uint16_t pidx_start, pidx_stop;
 	int i;
 
 	MPASS(items != NULL);
 	MPASS(n > 0);
 
 	/*
 	 * Reserve room for the new items.  Our reservation, if successful, is
 	 * from 'pidx_start' to 'pidx_stop'.
 	 */
 	for (;;) {
 		os.state = r->state;
 		if (n >= space_available(r, os)) {
 			counter_u64_add(r->drops, n);
 			MPASS(os.flags != IDLE);
 			if (os.flags == STALLED)
 				ifmp_ring_check_drainage(r, 0);
 			return (ENOBUFS);
 		}
 		ns.state = os.state;
 		ns.pidx_head = increment_idx(r, os.pidx_head, n);
 		critical_enter();
 		if (atomic_cmpset_64(&r->state, os.state, ns.state))
 			break;
 		critical_exit();
 		cpu_spinwait();
 	}
 	pidx_start = os.pidx_head;
 	pidx_stop = ns.pidx_head;
 
 	/*
 	 * Wait for other producers who got in ahead of us to enqueue their
 	 * items, one producer at a time.  It is our turn when the ring's
 	 * pidx_tail reaches the beginning of our reservation (pidx_start).
 	 */
 	while (ns.pidx_tail != pidx_start) {
 		cpu_spinwait();
 		ns.state = r->state;
 	}
 
 	/* Now it is our turn to fill up the area we reserved earlier. */
 	i = pidx_start;
 	do {
 		r->items[i] = *items++;
 		if (__predict_false(++i == r->size))
 			i = 0;
 	} while (i != pidx_stop);
 
 	/*
 	 * Update the ring's pidx_tail.  The release style atomic guarantees
 	 * that the items are visible to any thread that sees the updated pidx.
 	 */
 	do {
 		os.state = ns.state = r->state;
 		ns.pidx_tail = pidx_stop;
-		ns.flags = BUSY;
+		if (os.flags == IDLE)
+			ns.flags = ABDICATED;
 	} while (atomic_cmpset_rel_64(&r->state, os.state, ns.state) == 0);
 	critical_exit();
 	counter_u64_add(r->enqueues, n);
 
-	/*
-	 * Turn into a consumer if some other thread isn't active as a consumer
-	 * already.
-	 */
-	if (os.flags != BUSY)
-		drain_ring_lockless(r, ns, os.flags, budget);
-
 	return (0);
 }
 #endif
 
 void
 ifmp_ring_check_drainage(struct ifmp_ring *r, int budget)
 {
 	union ring_state os, ns;
 
 	os.state = r->state;
-	if (os.flags != STALLED || os.pidx_head != os.pidx_tail || r->can_drain(r) == 0)
+	if ((os.flags != STALLED && os.flags != ABDICATED) ||	// Only restart a STALLED or ABDICATED ring
+	    os.pidx_head != os.pidx_tail ||			// A producer is still mid-enqueue
+	    (os.flags != ABDICATED && r->can_drain(r) == 0))	// STALLED also needs can_drain(); ABDICATED does not
 		return;
 
 	MPASS(os.cidx != os.pidx_tail);	/* implied by STALLED */
 	ns.state = os.state;
 	ns.flags = BUSY;
 
 
 #ifdef NO_64BIT_ATOMICS
 	mtx_lock(&r->lock);
 	if (r->state != os.state) {
 		mtx_unlock(&r->lock);
 		return;
 	}
 	r->state = ns.state;
 	drain_ring_locked(r, ns, os.flags, budget);
 	mtx_unlock(&r->lock);
 #else
 	/*
 	 * The acquire style atomic guarantees visibility of items associated
 	 * with the pidx that we read here.
 	 */
 	if (!atomic_cmpset_acq_64(&r->state, os.state, ns.state))
 		return;
 
 
 	drain_ring_lockless(r, ns, os.flags, budget);
 #endif
 }
 
 void
 ifmp_ring_reset_stats(struct ifmp_ring *r)
 {
 
 	counter_u64_zero(r->enqueues);
 	counter_u64_zero(r->drops);
 	counter_u64_zero(r->starts);
 	counter_u64_zero(r->stalls);
 	counter_u64_zero(r->restarts);
 	counter_u64_zero(r->abdications);
 }
 
 int
 ifmp_ring_is_idle(struct ifmp_ring *r)
 {
 	union ring_state s;
 
 	s.state = r->state;
 	if (s.pidx_head == s.pidx_tail && s.pidx_tail == s.cidx &&
 	    s.flags == IDLE)
 		return (1);
 
 	return (0);
 }
 
 int
 ifmp_ring_is_stalled(struct ifmp_ring *r)
 {
 	union ring_state s;
 
 	s.state = r->state;
 	if (s.pidx_head == s.pidx_tail && s.flags == STALLED)
 		return (1);
 
 	return (0);
 }
Index: stable/11/sys/sys/_task.h
===================================================================
--- stable/11/sys/sys/_task.h	(revision 333337)
+++ stable/11/sys/sys/_task.h	(revision 333338)
@@ -1,73 +1,74 @@
 /*-
  * Copyright (c) 2000 Doug Rabson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _SYS__TASK_H_
 #define _SYS__TASK_H_
 
 #include <sys/queue.h>
 
 /*
  * Each task includes a function which is called from
  * taskqueue_run().  The first argument is taken from the 'ta_context'
  * field of struct task and the second argument is a count of how many
  * times the task was enqueued before the call to taskqueue_run().
  *
  * List of locks	 
  * (c)	const after init	 
  * (q)	taskqueue lock
  */
 typedef void task_fn_t(void *context, int pending);
 typedef void gtask_fn_t(void *context);
 
 struct task {
 	STAILQ_ENTRY(task) ta_link;	/* (q) link for queue */
 	uint16_t ta_pending;		/* (q) count times queued */
 	u_short	ta_priority;		/* (c) Priority */
 	task_fn_t *ta_func;		/* (c) task handler */
 	void	*ta_context;		/* (c) argument for handler */
 };
 
 struct gtask {
 	STAILQ_ENTRY(gtask) ta_link;	/* (q) link for queue */
 	uint16_t ta_flags;		/* (q) state flags */
 	u_short	ta_priority;		/* (c) Priority */
 	gtask_fn_t *ta_func;		/* (c) task handler */
 	void	*ta_context;		/* (c) argument for handler */
 };
 
 struct grouptask {
 	struct	gtask		gt_task;
 	void			*gt_taskqueue;
 	LIST_ENTRY(grouptask)	gt_list;
 	void			*gt_uniq;
-	char			*gt_name;
+#define GROUPTASK_NAMELEN	32
+	char			gt_name[GROUPTASK_NAMELEN];
 	int16_t			gt_irq;
 	int16_t			gt_cpu;
 };
 
 #endif /* !_SYS__TASK_H_ */
Index: stable/11/sys/sys/cpuset.h
===================================================================
--- stable/11/sys/sys/cpuset.h	(revision 333337)
+++ stable/11/sys/sys/cpuset.h	(revision 333338)
@@ -1,153 +1,155 @@
 /*-
  * Copyright (c) 2008,	Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  *
  * Copyright (c) 2008 Nokia Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _SYS_CPUSET_H_
 #define	_SYS_CPUSET_H_
 
 #include <sys/_cpuset.h>
 
 #include <sys/bitset.h>
 
 #define	_NCPUBITS	_BITSET_BITS
 #define	_NCPUWORDS	__bitset_words(CPU_SETSIZE)
 
 #define	CPUSETBUFSIZ	((2 + sizeof(long) * 2) * _NCPUWORDS)
 
 #define	CPU_CLR(n, p)			BIT_CLR(CPU_SETSIZE, n, p)
 #define	CPU_COPY(f, t)			BIT_COPY(CPU_SETSIZE, f, t)
 #define	CPU_ISSET(n, p)			BIT_ISSET(CPU_SETSIZE, n, p)
 #define	CPU_SET(n, p)			BIT_SET(CPU_SETSIZE, n, p)
 #define	CPU_ZERO(p) 			BIT_ZERO(CPU_SETSIZE, p)
 #define	CPU_FILL(p) 			BIT_FILL(CPU_SETSIZE, p)
 #define	CPU_SETOF(n, p)			BIT_SETOF(CPU_SETSIZE, n, p)
 #define	CPU_EMPTY(p)			BIT_EMPTY(CPU_SETSIZE, p)
 #define	CPU_ISFULLSET(p)		BIT_ISFULLSET(CPU_SETSIZE, p)
 #define	CPU_SUBSET(p, c)		BIT_SUBSET(CPU_SETSIZE, p, c)
 #define	CPU_OVERLAP(p, c)		BIT_OVERLAP(CPU_SETSIZE, p, c)
 #define	CPU_CMP(p, c)			BIT_CMP(CPU_SETSIZE, p, c)
 #define	CPU_OR(d, s)			BIT_OR(CPU_SETSIZE, d, s)
 #define	CPU_AND(d, s)			BIT_AND(CPU_SETSIZE, d, s)
 #define	CPU_NAND(d, s)			BIT_NAND(CPU_SETSIZE, d, s)
 #define	CPU_CLR_ATOMIC(n, p)		BIT_CLR_ATOMIC(CPU_SETSIZE, n, p)
 #define	CPU_SET_ATOMIC(n, p)		BIT_SET_ATOMIC(CPU_SETSIZE, n, p)
 #define	CPU_SET_ATOMIC_ACQ(n, p)	BIT_SET_ATOMIC_ACQ(CPU_SETSIZE, n, p)
 #define	CPU_AND_ATOMIC(n, p)		BIT_AND_ATOMIC(CPU_SETSIZE, n, p)
 #define	CPU_OR_ATOMIC(d, s)		BIT_OR_ATOMIC(CPU_SETSIZE, d, s)
 #define	CPU_COPY_STORE_REL(f, t)	BIT_COPY_STORE_REL(CPU_SETSIZE, f, t)
 #define	CPU_FFS(p)			BIT_FFS(CPU_SETSIZE, p)
 #define	CPU_COUNT(p)			BIT_COUNT(CPU_SETSIZE, p)
 #define	CPUSET_FSET			BITSET_FSET(_NCPUWORDS)
 #define	CPUSET_T_INITIALIZER		BITSET_T_INITIALIZER
 
 /*
  * Valid cpulevel_t values.
  */
 #define	CPU_LEVEL_ROOT		1	/* All system cpus. */
 #define	CPU_LEVEL_CPUSET	2	/* Available cpus for which. */
 #define	CPU_LEVEL_WHICH		3	/* Actual mask/id for which. */
 
 /*
  * Valid cpuwhich_t values.
  */
 #define	CPU_WHICH_TID		1	/* Specifies a thread id. */
 #define	CPU_WHICH_PID		2	/* Specifies a process id. */
 #define	CPU_WHICH_CPUSET	3	/* Specifies a set id. */
 #define	CPU_WHICH_IRQ		4	/* Specifies an irq #. */
 #define	CPU_WHICH_JAIL		5	/* Specifies a jail id. */
 #define	CPU_WHICH_DOMAIN	6	/* Specifies a NUMA domain id. */
+#define	CPU_WHICH_INTRHANDLER	7	/* Specifies an irq # (not ithread). */
+#define	CPU_WHICH_ITHREAD	8	/* Specifies an irq's ithread. */
 
 /*
  * Reserved cpuset identifiers.
  */
 #define	CPUSET_INVALID	-1
 #define	CPUSET_DEFAULT	0
 
 #ifdef _KERNEL
 #include <sys/queue.h>
 
 LIST_HEAD(setlist, cpuset);
 
 /*
  * cpusets encapsulate cpu binding information for one or more threads.
  *
  * 	a - Accessed with atomics.
  *	s - Set at creation, never modified.  Only a ref required to read.
  *	c - Locked internally by a cpuset lock.
  *
  * The bitmask is only modified while holding the cpuset lock.  It may be
  * read while only a reference is held but the consumer must be prepared
  * to deal with inconsistent results.
  */
 struct cpuset {
 	cpuset_t		cs_mask;	/* bitmask of valid cpus. */
 	volatile u_int		cs_ref;		/* (a) Reference count. */
 	int			cs_flags;	/* (s) Flags from below. */
 	cpusetid_t		cs_id;		/* (s) Id or INVALID. */
 	struct cpuset		*cs_parent;	/* (s) Pointer to our parent. */
 	LIST_ENTRY(cpuset)	cs_link;	/* (c) All identified sets. */
 	LIST_ENTRY(cpuset)	cs_siblings;	/* (c) Sibling set link. */
 	struct setlist		cs_children;	/* (c) List of children. */
 };
 
 #define CPU_SET_ROOT    0x0001  /* Set is a root set. */
 #define CPU_SET_RDONLY  0x0002  /* No modification allowed. */
 
 extern cpuset_t *cpuset_root;
 struct prison;
 struct proc;
 struct thread;
 
 struct cpuset *cpuset_thread0(void);
 struct cpuset *cpuset_ref(struct cpuset *);
 void	cpuset_rel(struct cpuset *);
 int	cpuset_setthread(lwpid_t id, cpuset_t *);
 int	cpuset_setithread(lwpid_t id, int cpu);
 int	cpuset_create_root(struct prison *, struct cpuset **);
 int	cpuset_setproc_update_set(struct proc *, struct cpuset *);
 int	cpuset_which(cpuwhich_t, id_t, struct proc **,
 	    struct thread **, struct cpuset **);
 
 char	*cpusetobj_strprint(char *, const cpuset_t *);
 int	cpusetobj_strscan(cpuset_t *, const char *);
 #ifdef DDB
 void	ddb_display_cpuset(const cpuset_t *);
 #endif
 
 #else
 __BEGIN_DECLS
 int	cpuset(cpusetid_t *);
 int	cpuset_setid(cpuwhich_t, id_t, cpusetid_t);
 int	cpuset_getid(cpulevel_t, cpuwhich_t, id_t, cpusetid_t *);
 int	cpuset_getaffinity(cpulevel_t, cpuwhich_t, id_t, size_t, cpuset_t *);
 int	cpuset_setaffinity(cpulevel_t, cpuwhich_t, id_t, size_t, const cpuset_t *);
 __END_DECLS
 #endif
 #endif /* !_SYS_CPUSET_H_ */
Index: stable/11/sys/sys/interrupt.h
===================================================================
--- stable/11/sys/sys/interrupt.h	(revision 333337)
+++ stable/11/sys/sys/interrupt.h	(revision 333338)
@@ -1,186 +1,188 @@
 /*-
  * Copyright (c) 1997, Stefan Esser <se@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _SYS_INTERRUPT_H_
 #define _SYS_INTERRUPT_H_
 
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 
 struct intr_event;
 struct intr_thread;
 struct trapframe;
 
 /*
  * Describe a hardware interrupt handler.
  *
  * Multiple interrupt handlers for a specific event can be chained
  * together.
  */
 struct intr_handler {
 	driver_filter_t	*ih_filter;	/* Filter handler function. */
 	driver_intr_t	*ih_handler;	/* Threaded handler function. */
 	void		*ih_argument;	/* Argument to pass to handlers. */
 	int		 ih_flags;	/* IH_* flags defined below. */
 	char		 ih_name[MAXCOMLEN + 1]; /* Name of handler. */
 	struct intr_event *ih_event;	/* Event we are connected to. */
 	int		 ih_need;	/* Needs service. */
 	TAILQ_ENTRY(intr_handler) ih_next; /* Next handler for this event. */
 	u_char		 ih_pri;	/* Priority of this handler. */
 	struct intr_thread *ih_thread;	/* Ithread for filtered handler. */
 };
 
 /* Interrupt handler flags kept in ih_flags. */
 #define	IH_EXCLUSIVE	0x00000002	/* Exclusive interrupt. */
 #define	IH_ENTROPY	0x00000004	/* Device is a good entropy source. */
 #define	IH_DEAD		0x00000008	/* Handler should be removed. */
 #define	IH_MPSAFE	0x80000000	/* Handler does not need Giant. */
 
 /*
  * Describe an interrupt event.  An event holds a list of handlers.
  * The 'pre_ithread', 'post_ithread', 'post_filter', and 'assign_cpu'
  * hooks are used to invoke MD code for certain operations.
  *
  * The 'pre_ithread' hook is called when an interrupt thread for
  * handlers without filters is scheduled.  It is responsible for
  * ensuring that 1) the system won't be swamped with an interrupt
  * storm from the associated source while the ithread runs and 2) the
  * current CPU is able to receive interrupts from other interrupt
  * sources.  The first is usually accomplished by disabling
  * level-triggered interrupts until the ithread completes.  The second
  * is accomplished on some platforms by acknowledging the interrupt
  * via an EOI.
  *
  * The 'post_ithread' hook is invoked when an ithread finishes.  It is
  * responsible for ensuring that the associated interrupt source will
  * trigger an interrupt when it is asserted in the future.  Usually
  * this is implemented by enabling a level-triggered interrupt that
  * was previously disabled via the 'pre_ithread' hook.
  *
  * The 'post_filter' hook is invoked when a filter handles an
  * interrupt.  It is responsible for ensuring that the current CPU is
  * able to receive interrupts again.  On some platforms this is done
  * by acknowledging the interrupts via an EOI.
  *
  * The 'assign_cpu' hook is used to bind an interrupt source to a
  * specific CPU.  If the interrupt cannot be bound, this function may
  * return an error.
  *
  * Note that device drivers may also use interrupt events to manage
  * multiplexing an interrupt handler into handlers for child
  * devices.  In that case, the above hooks are not used.  The device
  * can create an event for its interrupt resource and register child
  * event handlers with that event.  It can then use
  * intr_event_execute_handlers() to execute non-filter handlers.
  * Currently filter handlers are not supported by this, but that can
  * be added by splitting out the filter loop from intr_event_handle()
  * if desired.
  */
 struct intr_event {
 	TAILQ_ENTRY(intr_event) ie_list; /* Linkage on a list of events. */
 	TAILQ_HEAD(, intr_handler) ie_handlers; /* Interrupt handlers. */
 	char		ie_name[MAXCOMLEN + 1]; /* Individual event name. */
 	char		ie_fullname[MAXCOMLEN + 1];
 	struct mtx	ie_lock;
 	void		*ie_source;	/* Cookie used by MD code. */
 	struct intr_thread *ie_thread;	/* Thread we are connected to. */
 	void		(*ie_pre_ithread)(void *);	/* Ithread scheduled. */
 	void		(*ie_post_ithread)(void *);	/* Ithread finished. */
 	void		(*ie_post_filter)(void *);	/* Filter handled intr. */
 	int		(*ie_assign_cpu)(void *, int);	/* Bind source to a CPU. */
 	int		ie_flags;	/* IE_* flags defined below. */
 	int		ie_count;	/* Loop counter. */
 	int		ie_warncnt;	/* Rate-check interrupt storm warns. */
 	struct timeval	ie_warntm;	/* NOTE(review): storm-warn time? Confirm. */
 	int		ie_irq;		/* Physical irq number if !IE_SOFT. */
 	int		ie_cpu;		/* CPU this event is bound to. */
 };
 
 /* Interrupt event flags kept in ie_flags. */
 #define	IE_SOFT		0x000001	/* Software interrupt. */
 #define	IE_ENTROPY	0x000002	/* Interrupt is an entropy source. */
 #define	IE_ADDING_THREAD 0x000004	/* Currently building an ithread. */
 
 /* Flags to pass to swi_sched. */
 #define	SWI_DELAY	0x2
 
 /*
  * Software interrupt numbers in priority order.  The priority determines
  * the priority of the corresponding interrupt thread.
  */
 #define	SWI_TTY		0
 #define	SWI_NET		1
 #define	SWI_CAMBIO	2
 #define	SWI_VM		3
 #define	SWI_CLOCK	4
 #define	SWI_TQ_FAST	5
 #define	SWI_TQ		6
 #define	SWI_TQ_GIANT	6	/* Shares SWI_TQ's number. */
 
 struct proc;
 
 extern struct	intr_event *tty_intr_event;
 extern struct	intr_event *clk_intr_event;
 extern void	*vm_ih;
 
 /* Counts and names for statistics (defined in MD code). */
 extern u_long 	intrcnt[];	/* counts for each device and stray */
 extern char 	intrnames[];	/* string table containing device names */
 extern size_t	sintrcnt;	/* size of intrcnt table */
 extern size_t	sintrnames;	/* size of intrnames table */
 
 #ifdef DDB
 void	db_dump_intr_event(struct intr_event *ie, int handlers);
 #endif
 u_char	intr_priority(enum intr_type flags);
 int	intr_event_add_handler(struct intr_event *ie, const char *name,
 	    driver_filter_t filter, driver_intr_t handler, void *arg, 
 	    u_char pri, enum intr_type flags, void **cookiep);	    
 int	intr_event_bind(struct intr_event *ie, int cpu);
+int	intr_event_bind_irqonly(struct intr_event *ie, int cpu);
+int	intr_event_bind_ithread(struct intr_event *ie, int cpu);
 int	intr_event_create(struct intr_event **event, void *source,
 	    int flags, int irq, void (*pre_ithread)(void *),
 	    void (*post_ithread)(void *), void (*post_filter)(void *),
 	    int (*assign_cpu)(void *, int), const char *fmt, ...)
 	    __printflike(9, 10);
 int	intr_event_describe_handler(struct intr_event *ie, void *cookie,
 	    const char *descr);
 int	intr_event_destroy(struct intr_event *ie);
 void	intr_event_execute_handlers(struct proc *p, struct intr_event *ie);
 int	intr_event_handle(struct intr_event *ie, struct trapframe *frame);
 int	intr_event_remove_handler(void *cookie);
-int	intr_getaffinity(int irq, void *mask);
+int	intr_getaffinity(int irq, int mode, void *mask);
 void	*intr_handler_source(void *cookie);
-int	intr_setaffinity(int irq, void *mask);
+int	intr_setaffinity(int irq, int mode, void *mask);
 void	_intr_drain(int irq);  /* Linux compat only. */
 int	swi_add(struct intr_event **eventp, const char *name,
 	    driver_intr_t handler, void *arg, int pri, enum intr_type flags,
 	    void **cookiep);
 void	swi_sched(void *cookie, int flags);
 int	swi_remove(void *cookie);
 
 #endif
Index: stable/11
===================================================================
--- stable/11	(revision 333337)
+++ stable/11	(revision 333338)

Property changes on: stable/11
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r307560,307562-307563,307568,308792,311039,311837,312755,312903,312905,312924,313248,315217,315245,315288,316278,316281,316502,316596,317756,319917,319921,319984,319989,320059,320609,320611,321253,321629-321630,322337-322338,322823,323077,323825,323876,323879,323887,323941-323944,323954,324038,324937,325166-325168,325201,325241,325245,325487,325494,325901,326033,326369-326370,326432,326577-326578,326702,326706,326775,327013,327017,327052,327072,327098,327242,327244,327247,329651,329742,330289,330715,330721,332419,332422,332729