Index: sys/dev/bnxt/bnxt.h
===================================================================
--- sys/dev/bnxt/bnxt.h
+++ sys/dev/bnxt/bnxt.h
@@ -203,6 +203,8 @@
 /* Chip info */
 #define BNXT_TSO_SIZE	UINT16_MAX
 
+#define BNXT_MIN_FRAME_SIZE	52	/* Frames must be padded to this size for some A0 chips */
+
 /* NVRAM access */
 enum bnxt_nvm_directory_type {
 	BNX_DIR_TYPE_UNUSED = 0,
@@ -427,6 +429,7 @@
 	uint32_t		ring_size;	/* Must be a power of two */
 	uint16_t		id;		/* Logical ID */
 	uint16_t		phys_id;
+	struct bnxt_full_tpa_start *tpa_start;
 };
 
 struct bnxt_cp_ring {
@@ -552,7 +555,6 @@
 	struct sysctl_ctx_list	hw_stats;
 	struct sysctl_oid	*hw_stats_oid;
 
-	struct bnxt_full_tpa_start *tpa_start;
 	struct bnxt_ver_info	*ver_info;
 	struct bnxt_nvram_info	*nvm_info;
 };
Index: sys/dev/bnxt/bnxt_hwrm.c
===================================================================
--- sys/dev/bnxt/bnxt_hwrm.c
+++ sys/dev/bnxt/bnxt_hwrm.c
@@ -931,7 +931,7 @@
 	/* TODO: Calculate this based on ring size? */
 	req.max_agg_segs = htole16(3);
 	/* Base this in the allocated TPA start size... */
-	req.max_aggs = htole16(2);
+	req.max_aggs = htole16(7);
 	/*
 	 * TODO: max_agg_timer?
 	 * req.mag_agg_timer = htole32(XXX);
Index: sys/dev/bnxt/bnxt_txrx.c
===================================================================
--- sys/dev/bnxt/bnxt_txrx.c
+++ sys/dev/bnxt/bnxt_txrx.c
@@ -48,17 +48,19 @@
 */
 
 static int bnxt_isc_txd_encap(void *sc, if_pkt_info_t pi);
-static void bnxt_isc_txd_flush(void *sc, uint16_t txqid, uint32_t pidx);
-static int bnxt_isc_txd_credits_update(void *sc, uint16_t txqid, uint32_t cidx,
-    bool clear);
+static void bnxt_isc_txd_flush(void *sc, uint16_t txqid, qidx_t pidx);
+static int bnxt_isc_txd_credits_update(void *sc, uint16_t txqid, bool clear);
 
-static void bnxt_isc_rxd_refill(void *sc, uint16_t rxqid, uint8_t flid,
+static void bnxt_isc_rxd_refill(void *sc, if_rxd_update_t iru);
+
+/*				uint16_t rxqid, uint8_t flid,
 				uint32_t pidx, uint64_t *paddrs,
 				caddr_t *vaddrs, uint16_t count,
 				uint16_t buf_size);
+*/
 static void bnxt_isc_rxd_flush(void *sc, uint16_t rxqid, uint8_t flid,
-    uint32_t pidx);
-static int bnxt_isc_rxd_available(void *sc, uint16_t rxqid, uint32_t idx,
-    int budget);
+    qidx_t pidx);
+static int bnxt_isc_rxd_available(void *sc, uint16_t rxqid, qidx_t idx,
+    qidx_t budget);
 static int bnxt_isc_rxd_pkt_get(void *sc, if_rxd_info_t ri);
 
 static int bnxt_intr(void *sc);
@@ -172,7 +174,7 @@
 }
 
 static void
-bnxt_isc_txd_flush(void *sc, uint16_t txqid, uint32_t pidx)
+bnxt_isc_txd_flush(void *sc, uint16_t txqid, qidx_t pidx)
 {
 	struct bnxt_softc *softc = (struct bnxt_softc *)sc;
 	struct bnxt_ring *tx_ring = &softc->tx_rings[txqid];
@@ -185,7 +187,7 @@
 }
 
 static int
-bnxt_isc_txd_credits_update(void *sc, uint16_t txqid, uint32_t idx, bool clear)
+bnxt_isc_txd_credits_update(void *sc, uint16_t txqid, bool clear)
 {
 	struct bnxt_softc *softc = (struct bnxt_softc *)sc;
 	struct bnxt_cp_ring *cpr = &softc->tx_cp_rings[txqid];
@@ -249,16 +251,30 @@
 }
 
 static void
-bnxt_isc_rxd_refill(void *sc, uint16_t rxqid, uint8_t flid,
-		    uint32_t pidx, uint64_t *paddrs,
-		    caddr_t *vaddrs, uint16_t count, uint16_t len)
+bnxt_isc_rxd_refill(void *sc, if_rxd_update_t iru)
 {
 	struct bnxt_softc *softc = (struct bnxt_softc *)sc;
 	struct bnxt_ring *rx_ring;
 	struct rx_prod_pkt_bd *rxbd;
 	uint16_t type;
 	uint16_t i;
+	uint16_t rxqid;
+	uint16_t count, len;
+	uint32_t pidx;
+	uint8_t flid;
+	uint64_t *paddrs;
+	caddr_t *vaddrs;
+	qidx_t *frag_idxs;
+
+	rxqid = iru->iru_qsidx;
+	count = iru->iru_count;
+	len = iru->iru_buf_size;
+	pidx = iru->iru_pidx;
+	flid = iru->iru_flidx;
+	vaddrs = iru->iru_vaddrs;
+	paddrs = iru->iru_paddrs;
+	frag_idxs = iru->iru_idxs;
+
 	if (flid == 0) {
 		rx_ring = &softc->rx_rings[rxqid];
 		type = RX_PROD_PKT_BD_TYPE_RX_PROD_PKT;
@@ -273,8 +289,8 @@
 		rxbd[pidx].flags_type = htole16(type);
 		rxbd[pidx].len = htole16(len);
 		/* No need to byte-swap the opaque value */
-		rxbd[pidx].opaque = ((rxqid & 0xff) << 24) | (flid << 16)
-		    | pidx;
+		rxbd[pidx].opaque = (((rxqid & 0xff) << 24) | (flid << 16)
+		    | (frag_idxs[i]));
 		rxbd[pidx].addr = htole64(paddrs[i]);
 		if (++pidx == rx_ring->ring_size)
 			pidx = 0;
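For context, the refill entry point above now takes a single if_rxd_update_t argument in place of the old seven parameters. A minimal sketch of how a driver consumes it (my_ring, my_ring_lookup and my_rxd_write are hypothetical names, not part of this patch):

	static void
	my_isc_rxd_refill(void *sc, if_rxd_update_t iru)
	{
		struct my_ring *r = my_ring_lookup(sc, iru->iru_qsidx, iru->iru_flidx);
		qidx_t pidx = iru->iru_pidx;

		for (uint16_t i = 0; i < iru->iru_count; i++) {
			/* iru_idxs[i] names the free-list slot backing this
			 * buffer, so the completion can be matched back to it */
			my_rxd_write(r, pidx, iru->iru_paddrs[i],
			    iru->iru_buf_size, iru->iru_idxs[i]);
			if (++pidx == r->ring_size)
				pidx = 0;
		}
	}

The key change from the old interface is iru_idxs: refilled buffers are no longer assumed to occupy consecutive free-list slots, so each descriptor carries its own fragment index, which is exactly what the bnxt opaque encoding above now stores.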
@@ -284,7 +300,7 @@
 
 static void
 bnxt_isc_rxd_flush(void *sc, uint16_t rxqid, uint8_t flid,
-    uint32_t pidx)
+    qidx_t pidx)
 {
 	struct bnxt_softc *softc = (struct bnxt_softc *)sc;
 	struct bnxt_ring *rx_ring;
@@ -310,12 +326,11 @@
 }
 
 static int
-bnxt_isc_rxd_available(void *sc, uint16_t rxqid, uint32_t idx, int budget)
+bnxt_isc_rxd_available(void *sc, uint16_t rxqid, qidx_t idx, qidx_t budget)
 {
 	struct bnxt_softc *softc = (struct bnxt_softc *)sc;
 	struct bnxt_cp_ring *cpr = &softc->rx_cp_rings[rxqid];
 	struct rx_pkt_cmpl *rcp;
-	struct rx_tpa_start_cmpl *rtpa;
 	struct rx_tpa_end_cmpl *rtpae;
 	struct cmpl_base *cmp = (struct cmpl_base *)cpr->ring.vaddr;
 	int avail = 0;
@@ -324,7 +339,6 @@
 	uint8_t ags;
 	int i;
 	uint16_t type;
-	uint8_t agg_id;
 
 	for (;;) {
 		NEXT_CP_CONS_V(&cpr->ring, cons, v_bit);
@@ -374,18 +388,11 @@
 			avail++;
 			break;
 		case CMPL_BASE_TYPE_RX_TPA_START:
-			rtpa = (void *)&cmp[cons];
-			agg_id = (rtpa->agg_id &
-			    RX_TPA_START_CMPL_AGG_ID_MASK) >>
-			    RX_TPA_START_CMPL_AGG_ID_SFT;
-			softc->tpa_start[agg_id].low = *rtpa;
 			NEXT_CP_CONS_V(&cpr->ring, cons, v_bit);
 			CMPL_PREFETCH_NEXT(cpr, cons);
 
 			if (!CMP_VALID(&cmp[cons], v_bit))
 				goto cmpl_invalid;
-			softc->tpa_start[agg_id].high =
-			    ((struct rx_tpa_start_cmpl_hi *)cmp)[cons];
 			break;
 		case CMPL_BASE_TYPE_RX_AGG:
 			break;
@@ -510,7 +517,7 @@
 	/* Get the agg_id */
 	agg_id = (agend->agg_id & RX_TPA_END_CMPL_AGG_ID_MASK) >>
 	    RX_TPA_END_CMPL_AGG_ID_SFT;
-	tpas = &softc->tpa_start[agg_id];
+	tpas = &(softc->rx_rings[ri->iri_qsidx].tpa_start[agg_id]);
 
 	/* Extract from the first 16-byte BD */
 	if (le16toh(tpas->low.flags_type) & RX_TPA_START_CMPL_FLAGS_RSS_VALID) {
@@ -530,8 +537,8 @@
 	    RX_TPA_END_CMPL_AGG_BUFS_SFT;
 	ri->iri_nfrags = ags + 1;
 	/* No need to byte-swap the opaque value */
-	ri->iri_frags[0].irf_flid = (tpas->low.opaque >> 16) & 0xff;
-	ri->iri_frags[0].irf_idx = tpas->low.opaque & 0xffff;
+	ri->iri_frags[0].irf_flid = ((tpas->low.opaque >> 16) & 0xff);
+	ri->iri_frags[0].irf_idx = (tpas->low.opaque & 0xffff);
 	ri->iri_frags[0].irf_len = le16toh(tpas->low.len);
 	ri->iri_len = le16toh(tpas->low.len);
 
@@ -567,8 +574,8 @@
 		acp = &((struct rx_abuf_cmpl *)cpr->ring.vaddr)[cpr->cons];
 
 		/* No need to byte-swap the opaque value */
-		ri->iri_frags[i].irf_flid = (acp->opaque >> 16) & 0xff;
-		ri->iri_frags[i].irf_idx = acp->opaque & 0xffff;
+		ri->iri_frags[i].irf_flid = ((acp->opaque >> 16) & 0xff);
+		ri->iri_frags[i].irf_idx = (acp->opaque & 0xffff);
 		ri->iri_frags[i].irf_len = le16toh(acp->len);
 		ri->iri_len += le16toh(acp->len);
 	}
@@ -576,8 +583,8 @@
 	/* And finally, the empty BD at the end... */
 	ri->iri_nfrags++;
 	/* No need to byte-swap the opaque value */
-	ri->iri_frags[i].irf_flid = (agend->opaque >> 16) % 0xff;
-	ri->iri_frags[i].irf_idx = agend->opaque & 0xffff;
+	ri->iri_frags[i].irf_flid = ((agend->opaque >> 16) & 0xff);
+	ri->iri_frags[i].irf_idx = (agend->opaque & 0xffff);
 	ri->iri_frags[i].irf_len = le16toh(agend->len);
 	ri->iri_len += le16toh(agend->len);
 
@@ -590,9 +597,12 @@
 {
 	struct bnxt_softc *softc = (struct bnxt_softc *)sc;
 	struct bnxt_cp_ring *cpr = &softc->rx_cp_rings[ri->iri_qsidx];
+	struct cmpl_base *cmp_q = (struct cmpl_base *)cpr->ring.vaddr;
 	struct cmpl_base *cmp;
+	struct rx_tpa_start_cmpl *rtpa;
 	uint16_t flags_type;
 	uint16_t type;
+	uint8_t agg_id;
 
 	for (;;) {
 		NEXT_CP_CONS_V(&cpr->ring, cpr->cons, cpr->v_bit);
@@ -609,9 +619,18 @@
 		case CMPL_BASE_TYPE_RX_TPA_END:
 			return bnxt_pkt_get_tpa(softc, ri, cpr, flags_type);
 		case CMPL_BASE_TYPE_RX_TPA_START:
+			rtpa = (void *)&cmp_q[cpr->cons];
+			agg_id = (rtpa->agg_id &
+			    RX_TPA_START_CMPL_AGG_ID_MASK) >>
+			    RX_TPA_START_CMPL_AGG_ID_SFT;
+			softc->rx_rings[ri->iri_qsidx].tpa_start[agg_id].low = *rtpa;
+
 			NEXT_CP_CONS_V(&cpr->ring, cpr->cons, cpr->v_bit);
 			ri->iri_cidx = RING_NEXT(&cpr->ring, ri->iri_cidx);
 			CMPL_PREFETCH_NEXT(cpr, cpr->cons);
+
+			softc->rx_rings[ri->iri_qsidx].tpa_start[agg_id].high =
+			    ((struct rx_tpa_start_cmpl_hi *)cmp_q)[cpr->cons];
 			break;
 		default:
 			device_printf(softc->dev,
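The hunks above move the cached TPA_START completions out of the softc and into each RX ring, presumably because the hardware aggregation ID is only meaningful per ring; with one shared array, two rings reusing the same agg_id could overwrite each other's saved low/high completion halves. The lookup at TPA_END time then becomes (names as in this patch):

	struct bnxt_full_tpa_start *tpas =
	    &softc->rx_rings[ri->iri_qsidx].tpa_start[agg_id];

with agg_id still extracted via RX_TPA_END_CMPL_AGG_ID_MASK and _SFT. Capturing the start completion also moves from rxd_available() into rxd_pkt_get(), so the saved state is only written on the packet-processing path.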
Index: sys/dev/bnxt/if_bnxt.c
===================================================================
--- sys/dev/bnxt/if_bnxt.c
+++ sys/dev/bnxt/if_bnxt.c
@@ -235,6 +235,8 @@
 MODULE_DEPEND(bnxt, ether, 1, 1, 1);
 MODULE_DEPEND(bnxt, iflib, 1, 1, 1);
 
+IFLIB_PNP_INFO(pci, bnxt, bnxt_vendor_info_array);
+
 static device_method_t bnxt_iflib_methods[] = {
 	DEVMETHOD(ifdi_tx_queues_alloc, bnxt_tx_queues_alloc),
 	DEVMETHOD(ifdi_rx_queues_alloc, bnxt_rx_queues_alloc),
@@ -255,7 +257,8 @@
 	DEVMETHOD(ifdi_update_admin_status, bnxt_update_admin_status),
 
 	DEVMETHOD(ifdi_intr_enable, bnxt_intr_enable),
-	DEVMETHOD(ifdi_queue_intr_enable, bnxt_queue_intr_enable),
+	DEVMETHOD(ifdi_tx_queue_intr_enable, bnxt_queue_intr_enable),
+	DEVMETHOD(ifdi_rx_queue_intr_enable, bnxt_queue_intr_enable),
 	DEVMETHOD(ifdi_intr_disable, bnxt_disable_intr),
 	DEVMETHOD(ifdi_msix_intr_assign, bnxt_msix_intr_assign),
 
@@ -279,10 +282,9 @@
 extern struct if_txrx bnxt_txrx;
 static struct if_shared_ctx bnxt_sctx_init = {
 	.isc_magic = IFLIB_MAGIC,
-	.isc_txrx = &bnxt_txrx,
 	.isc_driver = &bnxt_iflib_driver,
 	.isc_nfl = 2,				// Number of Free Lists
-	.isc_flags = IFLIB_HAS_RXCQ | IFLIB_HAS_TXCQ,
+	.isc_flags = IFLIB_HAS_RXCQ | IFLIB_HAS_TXCQ | IFLIB_NEED_ETHER_PAD,
 	.isc_q_align = PAGE_SIZE,
 	.isc_tx_maxsize = BNXT_TSO_SIZE,
 	.isc_tx_maxsegsize = BNXT_TSO_SIZE,
@@ -494,6 +496,17 @@
 		softc->rx_rings[i].vaddr = vaddrs[i * nrxqs + 1];
 		softc->rx_rings[i].paddr = paddrs[i * nrxqs + 1];
 
+		/* Allocate the TPA start buffer */
+		softc->rx_rings[i].tpa_start = malloc(sizeof(struct bnxt_full_tpa_start) *
+		    (RX_TPA_START_CMPL_AGG_ID_MASK >> RX_TPA_START_CMPL_AGG_ID_SFT),
+		    M_DEVBUF, M_NOWAIT | M_ZERO);
+		if (softc->rx_rings[i].tpa_start == NULL) {
+			rc = -ENOMEM;
+			device_printf(softc->dev,
+			    "Unable to allocate space for TPA\n");
+			goto tpa_alloc_fail;
+		}
+
 		/* Allocate the AG ring */
 		softc->ag_rings[i].phys_id = (uint16_t)HWRM_NA_SIGNATURE;
 		softc->ag_rings[i].softc = softc;
@@ -559,7 +572,10 @@
 	iflib_dma_free(&softc->vnic_info.rss_hash_key_tbl);
 rss_hash_alloc_fail:
 	iflib_dma_free(&softc->vnic_info.mc_list);
+tpa_alloc_fail:
 mc_list_alloc_fail:
+	for (i = i - 1; i >= 0; i--)
+		free(softc->rx_rings[i].tpa_start, M_DEVBUF);
 	iflib_dma_free(&softc->rx_stats);
hw_stats_alloc_fail:
 	free(softc->grp_info, M_DEVBUF);
@@ -623,16 +639,6 @@
 	if (rc)
 		goto dma_fail;
 
-	/* Allocate the TPA start buffer */
-	softc->tpa_start = malloc(sizeof(struct bnxt_full_tpa_start) *
-	    (RX_TPA_START_CMPL_AGG_ID_MASK >> RX_TPA_START_CMPL_AGG_ID_SFT),
-	    M_DEVBUF, M_NOWAIT | M_ZERO);
-	if (softc->tpa_start == NULL) {
-		rc = ENOMEM;
-		device_printf(softc->dev,
-		    "Unable to allocate space for TPA\n");
-		goto tpa_failed;
-	}
 
 	/* Get firmware version and compare with driver */
 	softc->ver_info = malloc(sizeof(struct bnxt_ver_info),
@@ -681,6 +687,20 @@
 		goto failed;
 	iflib_set_mac(ctx, softc->func.mac_addr);
 
+	scctx->isc_txrx = &bnxt_txrx;
+	scctx->isc_tx_csum_flags = (CSUM_IP | CSUM_TCP | CSUM_UDP |
+	    CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_TSO);
+	scctx->isc_capenable =
+	    /* These are translated to hwassist bits */
+	    IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6 | IFCAP_TSO4 | IFCAP_TSO6 |
+	    /* These are checked by iflib */
+	    IFCAP_LRO | IFCAP_VLAN_HWFILTER |
+	    /* These are part of the iflib mask */
+	    IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_VLAN_MTU |
+	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO |
+	    /* These likely get lost... */
+	    IFCAP_VLAN_HWCSUM | IFCAP_JUMBO_MTU;
+
 	/* Get the queue config */
 	rc = bnxt_hwrm_queue_qportcfg(softc);
 	if (rc) {
@@ -700,6 +720,9 @@
 	scctx->isc_tx_tso_size_max = BNXT_TSO_SIZE;
 	scctx->isc_tx_tso_segsize_max = BNXT_TSO_SIZE;
 	scctx->isc_vectors = softc->func.max_cp_rings;
+	scctx->isc_min_frame_size = BNXT_MIN_FRAME_SIZE;
+	scctx->isc_txrx = &bnxt_txrx;
+
 	if (scctx->isc_nrxd[0] < ((scctx->isc_nrxd[1] * 4) + scctx->isc_nrxd[2]))
 		device_printf(softc->dev,
@@ -717,12 +740,12 @@
 	    scctx->isc_nrxd[1];
 	scctx->isc_rxqsizes[2] = sizeof(struct rx_prod_pkt_bd) *
 	    scctx->isc_nrxd[2];
-	scctx->isc_max_rxqsets = min(pci_msix_count(softc->dev)-1,
+	scctx->isc_nrxqsets_max = min(pci_msix_count(softc->dev)-1,
 	    softc->func.max_cp_rings - 1);
-	scctx->isc_max_rxqsets = min(scctx->isc_max_rxqsets,
+	scctx->isc_nrxqsets_max = min(scctx->isc_nrxqsets_max,
 	    softc->func.max_rx_rings);
-	scctx->isc_max_txqsets = min(softc->func.max_rx_rings,
-	    softc->func.max_cp_rings - scctx->isc_max_rxqsets - 1);
+	scctx->isc_ntxqsets_max = min(softc->func.max_rx_rings,
+	    softc->func.max_cp_rings - scctx->isc_nrxqsets_max - 1);
 	scctx->isc_rss_table_size = HW_HASH_INDEX_SIZE;
 	scctx->isc_rss_table_mask = scctx->isc_rss_table_size - 1;
@@ -780,8 +803,6 @@
ver_fail:
 	free(softc->ver_info, M_DEVBUF);
ver_alloc_fail:
-	free(softc->tpa_start, M_DEVBUF);
-tpa_failed:
 	bnxt_free_hwrm_dma_mem(softc);
dma_fail:
 	BNXT_HWRM_LOCK_DESTROY(softc);
@@ -795,7 +816,6 @@
 {
 	struct bnxt_softc *softc = iflib_get_softc(ctx);
 	if_t ifp = iflib_get_ifp(ctx);
-	int capabilities, enabling;
 	int rc;
 
 	bnxt_create_config_sysctls_post(softc);
@@ -810,26 +830,6 @@
 	bnxt_add_media_types(softc);
 	ifmedia_set(softc->media, IFM_ETHER | IFM_AUTO);
 
-	if_sethwassist(ifp, (CSUM_TCP | CSUM_UDP | CSUM_TCP_IPV6 |
-	    CSUM_UDP_IPV6 | CSUM_TSO));
-
-	capabilities =
-	    /* These are translated to hwassit bits */
-	    IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6 | IFCAP_TSO4 | IFCAP_TSO6 |
-	    /* These are checked by iflib */
-	    IFCAP_LRO | IFCAP_VLAN_HWFILTER |
-	    /* These are part of the iflib mask */
-	    IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_VLAN_MTU |
-	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO |
-	    /* These likely get lost... */
-	    IFCAP_VLAN_HWCSUM | IFCAP_JUMBO_MTU;
-
-	if_setcapabilities(ifp, capabilities);
-
-	enabling = capabilities;
-
-	if_setcapenable(ifp, enabling);
-
 	softc->scctx->isc_max_frame_size = ifp->if_mtu + ETHER_HDR_LEN +
 	    ETHER_CRC_LEN;
 
@@ -863,7 +863,8 @@
 	SLIST_FOREACH_SAFE(tag, &softc->vnic_info.vlan_tags, next, tmp)
 		free(tag, M_DEVBUF);
 	iflib_dma_free(&softc->def_cp_ring_mem);
-	free(softc->tpa_start, M_DEVBUF);
+	for (i = 0; i < softc->nrxqsets; i++)
+		free(softc->rx_rings[i].tpa_start, M_DEVBUF);
 	free(softc->ver_info, M_DEVBUF);
 	free(softc->nvm_info, M_DEVBUF);
 
@@ -995,14 +996,17 @@
 	if (rc)
 		goto fail;
 
-#ifdef notyet
-	/* Enable LRO/TPA/GRO */
+	/*
+	 * Enable LRO/TPA/GRO
+	 * TBD:
+	 *	 Enable / Disable HW_LRO based on
+	 *	 ifconfig lro / ifconfig -lro setting
+	 */
 	rc = bnxt_hwrm_vnic_tpa_cfg(softc, &softc->vnic_info,
 	    (if_getcapenable(iflib_get_ifp(ctx)) & IFCAP_LRO) ?
 	    HWRM_VNIC_TPA_CFG_INPUT_FLAGS_TPA : 0);
 	if (rc)
 		goto fail;
-#endif
 
 	for (i = 0; i < softc->ntxqsets; i++) {
 		/* Allocate the statistics context */
@@ -1489,7 +1493,7 @@
 	for (i = 0; i < softc->scctx->isc_nrxqsets; i++) {
 		rc = iflib_irq_alloc_generic(ctx, &softc->rx_cp_rings[i].irq,
-		    softc->rx_cp_rings[i].ring.id + 1, IFLIB_INTR_RX,
+		    softc->rx_cp_rings[i].ring.id + 1, IFLIB_INTR_RXTX,
 		    bnxt_handle_rx_cp, &softc->rx_cp_rings[i], i, "rx_cp");
 		if (rc) {
 			device_printf(iflib_get_dev(ctx),
@@ -1500,8 +1504,7 @@
 	}
 
 	for (i = 0; i < softc->scctx->isc_ntxqsets; i++)
-		iflib_softirq_alloc_generic(ctx, i + 1, IFLIB_INTR_TX, NULL, i,
-		    "tx_cp");
+		iflib_softirq_alloc_generic(ctx, NULL, IFLIB_INTR_TX, NULL, i, "tx_cp");
 
 	return rc;
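With this change the driver publishes its offload capabilities and its if_txrx vector from attach_pre via the softc context instead of calling if_setcapabilities()/if_setcapenable() itself in attach_post; iflib is expected to apply isc_capenable and isc_tx_csum_flags to the ifnet on the driver's behalf. A hedged sketch of the pattern for another driver (my_attach_pre and my_txrx are hypothetical):

	static int
	my_attach_pre(if_ctx_t ctx)
	{
		if_softc_ctx_t scctx = iflib_get_softc_ctx(ctx);

		scctx->isc_txrx = &my_txrx;	/* per-instance now, no longer in if_shared_ctx */
		scctx->isc_tx_csum_flags = CSUM_TCP | CSUM_UDP;
		scctx->isc_capenable = IFCAP_TXCSUM | IFCAP_RXCSUM;
		scctx->isc_min_frame_size = 60;	/* only consulted with IFLIB_NEED_ETHER_PAD */
		return (0);
	}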
Index: sys/kern/kern_cpuset.c
===================================================================
--- sys/kern/kern_cpuset.c
+++ sys/kern/kern_cpuset.c
@@ -1127,6 +1127,8 @@
 	case CPU_WHICH_JAIL:
 		break;
 	case CPU_WHICH_IRQ:
+	case CPU_WHICH_INTRHANDLER:
+	case CPU_WHICH_ITHREAD:
 	case CPU_WHICH_DOMAIN:
 		error = EINVAL;
 		goto out;
@@ -1157,7 +1159,9 @@
 			CPU_COPY(&set->cs_mask, mask);
 		break;
 	case CPU_WHICH_IRQ:
-		error = intr_getaffinity(id, mask);
+	case CPU_WHICH_INTRHANDLER:
+	case CPU_WHICH_ITHREAD:
+		error = intr_getaffinity(id, which, mask);
 		break;
 	case CPU_WHICH_DOMAIN:
 		if (id < 0 || id >= MAXMEMDOM)
@@ -1260,6 +1264,8 @@
 	case CPU_WHICH_JAIL:
 		break;
 	case CPU_WHICH_IRQ:
+	case CPU_WHICH_INTRHANDLER:
+	case CPU_WHICH_ITHREAD:
 	case CPU_WHICH_DOMAIN:
 		error = EINVAL;
 		goto out;
@@ -1289,7 +1295,9 @@
 		}
 		break;
 	case CPU_WHICH_IRQ:
-		error = intr_setaffinity(id, mask);
+	case CPU_WHICH_INTRHANDLER:
+	case CPU_WHICH_ITHREAD:
+		error = intr_setaffinity(id, which, mask);
 		break;
 	default:
 		error = EINVAL;
Index: sys/kern/kern_intr.c
===================================================================
--- sys/kern/kern_intr.c
+++ sys/kern/kern_intr.c
@@ -287,13 +287,11 @@
 /*
  * Bind an interrupt event to the specified CPU.  Note that not all
  * platforms support binding an interrupt to a CPU.  For those
- * platforms this request will fail.  For supported platforms, any
- * associated ithreads as well as the primary interrupt context will
- * be bound to the specificed CPU.  Using a cpu id of NOCPU unbinds
+ * platforms this request will fail.  Using a cpu id of NOCPU unbinds
  * the interrupt event.
 */
-int
-intr_event_bind(struct intr_event *ie, int cpu)
+static int
+_intr_event_bind(struct intr_event *ie, int cpu, bool bindirq, bool bindithread)
 {
 	lwpid_t id;
 	int error;
@@ -313,35 +311,75 @@
 	 * If we have any ithreads try to set their mask first to verify
 	 * permissions, etc.
 	 */
-	mtx_lock(&ie->ie_lock);
-	if (ie->ie_thread != NULL) {
-		id = ie->ie_thread->it_thread->td_tid;
-		mtx_unlock(&ie->ie_lock);
-		error = cpuset_setithread(id, cpu);
-		if (error)
-			return (error);
-	} else
-		mtx_unlock(&ie->ie_lock);
-	error = ie->ie_assign_cpu(ie->ie_source, cpu);
-	if (error) {
+	if (bindithread) {
 		mtx_lock(&ie->ie_lock);
 		if (ie->ie_thread != NULL) {
-			cpu = ie->ie_cpu;
 			id = ie->ie_thread->it_thread->td_tid;
 			mtx_unlock(&ie->ie_lock);
-			(void)cpuset_setithread(id, cpu);
+			error = cpuset_setithread(id, cpu);
+			if (error)
+				return (error);
 		} else
 			mtx_unlock(&ie->ie_lock);
+	}
+	if (bindirq)
+		error = ie->ie_assign_cpu(ie->ie_source, cpu);
+	if (error) {
+		if (bindithread) {
+			mtx_lock(&ie->ie_lock);
+			if (ie->ie_thread != NULL) {
+				cpu = ie->ie_cpu;
+				id = ie->ie_thread->it_thread->td_tid;
+				mtx_unlock(&ie->ie_lock);
+				(void)cpuset_setithread(id, cpu);
+			} else
+				mtx_unlock(&ie->ie_lock);
+		}
 		return (error);
 	}
 
-	mtx_lock(&ie->ie_lock);
-	ie->ie_cpu = cpu;
-	mtx_unlock(&ie->ie_lock);
+	if (bindirq) {
+		mtx_lock(&ie->ie_lock);
+		ie->ie_cpu = cpu;
+		mtx_unlock(&ie->ie_lock);
+	}
 
 	return (error);
 }
 
+/*
+ * Bind an interrupt event to the specified CPU.  For supported platforms, any
+ * associated ithreads as well as the primary interrupt context will be bound
+ * to the specified CPU.
+ */
+int
+intr_event_bind(struct intr_event *ie, int cpu)
+{
+
+	return (_intr_event_bind(ie, cpu, true, true));
+}
+
+/*
+ * Bind an interrupt event to the specified CPU, but do not bind associated
+ * ithreads.
+ */
+int
+intr_event_bind_irqonly(struct intr_event *ie, int cpu)
+{
+
+	return (_intr_event_bind(ie, cpu, true, false));
+}
+
+/*
+ * Bind an interrupt event's ithread to the specified CPU.
+ */
+int
+intr_event_bind_ithread(struct intr_event *ie, int cpu)
+{
+
+	return (_intr_event_bind(ie, cpu, false, true));
+}
+
 static struct intr_event *
 intr_lookup(int irq)
 {
@@ -358,7 +396,7 @@
 }
 
 int
-intr_setaffinity(int irq, void *m)
+intr_setaffinity(int irq, int mode, void *m)
 {
 	struct intr_event *ie;
 	cpuset_t *mask;
@@ -382,26 +420,62 @@
 	ie = intr_lookup(irq);
 	if (ie == NULL)
 		return (ESRCH);
-	return (intr_event_bind(ie, cpu));
+	switch (mode) {
+	case CPU_WHICH_IRQ:
+		return (intr_event_bind(ie, cpu));
+	case CPU_WHICH_INTRHANDLER:
+		return (intr_event_bind_irqonly(ie, cpu));
+	case CPU_WHICH_ITHREAD:
+		return (intr_event_bind_ithread(ie, cpu));
+	default:
+		return (EINVAL);
+	}
 }
 
 int
-intr_getaffinity(int irq, void *m)
+intr_getaffinity(int irq, int mode, void *m)
 {
 	struct intr_event *ie;
+	struct thread *td;
+	struct proc *p;
 	cpuset_t *mask;
+	lwpid_t id;
+	int error;
 
 	mask = m;
 	ie = intr_lookup(irq);
 	if (ie == NULL)
 		return (ESRCH);
+
+	error = 0;
 	CPU_ZERO(mask);
-	mtx_lock(&ie->ie_lock);
-	if (ie->ie_cpu == NOCPU)
-		CPU_COPY(cpuset_root, mask);
-	else
-		CPU_SET(ie->ie_cpu, mask);
-	mtx_unlock(&ie->ie_lock);
+	switch (mode) {
+	case CPU_WHICH_IRQ:
+	case CPU_WHICH_INTRHANDLER:
+		mtx_lock(&ie->ie_lock);
+		if (ie->ie_cpu == NOCPU)
+			CPU_COPY(cpuset_root, mask);
+		else
+			CPU_SET(ie->ie_cpu, mask);
+		mtx_unlock(&ie->ie_lock);
+		break;
+	case CPU_WHICH_ITHREAD:
+		mtx_lock(&ie->ie_lock);
+		if (ie->ie_thread == NULL) {
+			mtx_unlock(&ie->ie_lock);
+			CPU_COPY(cpuset_root, mask);
+		} else {
+			id = ie->ie_thread->it_thread->td_tid;
+			mtx_unlock(&ie->ie_lock);
+			error = cpuset_which(CPU_WHICH_TID, id, &p, &td, NULL);
+			if (error != 0)
+				return (error);
+			CPU_COPY(&td->td_cpuset->cs_mask, mask);
+			PROC_UNLOCK(p);
+		}
+		break;
+	default:
+		return (EINVAL);
+	}
 	return (0);
 }
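The new CPU_WHICH_INTRHANDLER and CPU_WHICH_ITHREAD selectors let userland steer the hardware interrupt context and its ithread independently through the existing cpuset_getaffinity(2)/cpuset_setaffinity(2) interface. A sketch, with IRQ number 16 and CPU 2 chosen arbitrarily for illustration:

	#include <sys/param.h>
	#include <sys/cpuset.h>

	/* Pin IRQ 16's ithread (but not the interrupt itself) to CPU 2. */
	int
	pin_ithread(void)
	{
		cpuset_t mask;

		CPU_ZERO(&mask);
		CPU_SET(2, &mask);
		return (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_ITHREAD,
		    16 /* IRQ number */, sizeof(mask), &mask));
	}

CPU_WHICH_IRQ keeps its historical behavior of binding both; CPU_WHICH_INTRHANDLER binds only the interrupt context.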
Index: sys/kern/subr_gtaskqueue.c
===================================================================
--- sys/kern/subr_gtaskqueue.c
+++ sys/kern/subr_gtaskqueue.c
@@ -48,7 +48,7 @@
 #include 
 #include 
 
-static MALLOC_DEFINE(M_GTASKQUEUE, "taskqueue", "Task Queues");
+static MALLOC_DEFINE(M_GTASKQUEUE, "gtaskqueue", "Group Task Queues");
 static void	gtaskqueue_thread_enqueue(void *);
 static void	gtaskqueue_thread_loop(void *arg);
 
@@ -134,8 +134,10 @@
 	snprintf(tq_name, TASKQUEUE_NAMELEN, "%s", (name) ? name : "taskqueue");
 
 	queue = malloc(sizeof(struct gtaskqueue), M_GTASKQUEUE, mflags | M_ZERO);
-	if (!queue)
+	if (!queue) {
+		free(tq_name, M_GTASKQUEUE);
 		return (NULL);
+	}
 
 	STAILQ_INIT(&queue->tq_queue);
 	TAILQ_INIT(&queue->tq_active);
@@ -663,10 +665,10 @@
     void *uniq, int irq, char *name)
 {
 	cpuset_t mask;
-	int qid;
+	int qid, error;
 
 	gtask->gt_uniq = uniq;
-	gtask->gt_name = name;
+	snprintf(gtask->gt_name, GROUPTASK_NAMELEN, "%s", name ? name : "grouptask");
 	gtask->gt_irq = irq;
 	gtask->gt_cpu = -1;
 	mtx_lock(&qgroup->tqg_lock);
@@ -679,7 +681,9 @@
 		CPU_ZERO(&mask);
 		CPU_SET(qgroup->tqg_queue[qid].tgc_cpu, &mask);
 		mtx_unlock(&qgroup->tqg_lock);
-		intr_setaffinity(irq, &mask);
+		error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask);
+		if (error)
+			printf("%s: setaffinity failed for %s: %d\n", __func__, gtask->gt_name, error);
 	} else
 		mtx_unlock(&qgroup->tqg_lock);
 }
@@ -688,7 +692,7 @@
 taskqgroup_attach_deferred(struct taskqgroup *qgroup, struct grouptask *gtask)
 {
 	cpuset_t mask;
-	int qid, cpu;
+	int qid, cpu, error;
 
 	mtx_lock(&qgroup->tqg_lock);
 	qid = taskqgroup_find(qgroup, gtask->gt_uniq);
@@ -698,9 +702,11 @@
 
 		CPU_ZERO(&mask);
 		CPU_SET(cpu, &mask);
-		intr_setaffinity(gtask->gt_irq, &mask);
-
+		error = intr_setaffinity(gtask->gt_irq, CPU_WHICH_IRQ, &mask);
 		mtx_lock(&qgroup->tqg_lock);
+		if (error)
+			printf("%s: %s setaffinity failed: %d\n", __func__, gtask->gt_name, error);
+
 	}
 	qgroup->tqg_queue[qid].tgc_cnt++;
@@ -716,11 +722,11 @@
     void *uniq, int cpu, int irq, char *name)
 {
 	cpuset_t mask;
-	int i, qid;
+	int i, qid, error;
 
 	qid = -1;
 	gtask->gt_uniq = uniq;
-	gtask->gt_name = name;
+	snprintf(gtask->gt_name, GROUPTASK_NAMELEN, "%s", name ? name : "grouptask");
 	gtask->gt_irq = irq;
 	gtask->gt_cpu = cpu;
 	mtx_lock(&qgroup->tqg_lock);
@@ -732,6 +738,7 @@
 		}
 		if (qid == -1) {
 			mtx_unlock(&qgroup->tqg_lock);
+			printf("%s: qid not found for %s cpu=%d\n", __func__, gtask->gt_name, cpu);
 			return (EINVAL);
 		}
 	} else
@@ -744,8 +751,11 @@
 
 	CPU_ZERO(&mask);
 	CPU_SET(cpu, &mask);
-	if (irq != -1 && tqg_smp_started)
-		intr_setaffinity(irq, &mask);
+	if (irq != -1 && tqg_smp_started) {
+		error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask);
+		if (error)
+			printf("%s: setaffinity failed: %d\n", __func__, error);
+	}
 	return (0);
 }
 
@@ -753,7 +763,7 @@
 taskqgroup_attach_cpu_deferred(struct taskqgroup *qgroup, struct grouptask *gtask)
 {
 	cpuset_t mask;
-	int i, qid, irq, cpu;
+	int i, qid, irq, cpu, error;
 
 	qid = -1;
 	irq = gtask->gt_irq;
@@ -767,6 +777,7 @@
 		}
 		if (qid == -1) {
 			mtx_unlock(&qgroup->tqg_lock);
+			printf("%s: qid not found for %s cpu=%d\n", __func__, gtask->gt_name, cpu);
 			return (EINVAL);
 		}
 		qgroup->tqg_queue[qid].tgc_cnt++;
@@ -778,8 +789,11 @@
 
 	CPU_ZERO(&mask);
 	CPU_SET(cpu, &mask);
-	if (irq != -1)
-		intr_setaffinity(irq, &mask);
+	if (irq != -1) {
+		error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask);
+		if (error)
+			printf("%s: setaffinity failed: %d\n", __func__, error);
+	}
 	return (0);
 }
 
@@ -793,7 +807,7 @@
 		if (qgroup->tqg_queue[i].tgc_taskq == gtask->gt_taskqueue)
 			break;
 	if (i == qgroup->tqg_cnt)
-		panic("taskqgroup_detach: task not in group\n");
+		panic("taskqgroup_detach: task %s not in group\n", gtask->gt_name);
 	qgroup->tqg_queue[i].tgc_cnt--;
 	LIST_REMOVE(gtask, gt_list);
 	mtx_unlock(&qgroup->tqg_lock);
@@ -815,7 +829,7 @@
 	thread_unlock(curthread);
 
 	if (error)
-		printf("taskqgroup_binder: setaffinity failed: %d\n",
+		printf("%s: setaffinity failed: %d\n", __func__,
 		    error);
 	free(gtask, M_DEVBUF);
 }
@@ -858,7 +872,7 @@
 		return (EINVAL);
 	}
 	if (qgroup->tqg_adjusting) {
-		printf("taskqgroup_adjust failed: adjusting\n");
+		printf("%s failed: adjusting\n", __func__);
 		return (EBUSY);
 	}
 	qgroup->tqg_adjusting = 1;
Index: sys/net/ifdi_if.m
===================================================================
--- sys/net/ifdi_if.m
+++ sys/net/ifdi_if.m
@@ -195,11 +195,16 @@
 	if_ctx_t _ctx;
 };
 
-METHOD int queue_intr_enable {
+METHOD int rx_queue_intr_enable {
 	if_ctx_t _ctx;
 	uint16_t _qid;
 } DEFAULT null_queue_intr_enable;
 
+METHOD int tx_queue_intr_enable {
+	if_ctx_t _ctx;
+	uint16_t _qid;
+} DEFAULT null_queue_intr_enable;
+
 METHOD void link_intr_enable {
 	if_ctx_t _ctx;
 } DEFAULT null_void_op;
@@ -229,6 +234,7 @@
 METHOD void crcstrip_set {
 	if_ctx_t _ctx;
 	int _onoff;
+	int _strip;
 };
 
 #
@@ -332,4 +338,6 @@
 	if_int_delay_info_t _iidi;
 } DEFAULT null_sysctl_int_delay;
 
-
+METHOD void debug {
+	if_ctx_t _ctx;
+} DEFAULT null_void_op;
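Splitting queue_intr_enable into rx_ and tx_ variants is what allows iflib to re-arm a TX queue that shares an RX interrupt (see iflib_fast_intr_rxtx in the iflib.c changes below). A driver with a single enable routine for both directions can simply register it twice, as bnxt does above; with a placeholder handler name:

	DEVMETHOD(ifdi_rx_queue_intr_enable, my_queue_intr_enable),
	DEVMETHOD(ifdi_tx_queue_intr_enable, my_queue_intr_enable),

Drivers that can re-arm TX and RX completion paths separately supply two distinct handlers instead.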
Index: sys/net/iflib.h
===================================================================
--- sys/net/iflib.h
+++ sys/net/iflib.h
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2014-2015, Matthew Macy (mmacy@nextbsd.org)
+ * Copyright (c) 2014-2017, Matthew Macy (mmacy@nextbsd.org)
  * All rights reserved.
 *
  * Redistribution and use in source and binary forms, with or without
@@ -37,8 +37,14 @@
 #include 
 #include 
 
-
 /*
+ * The value type for indexing; it limits the maximum number of descriptors
+ * to 65535, and can be conditionally redefined to uint32_t in the future
+ * if the need arises.
+ */
+typedef uint16_t qidx_t;
+#define QIDX_INVALID 0xFFFF
+/*
  * Most cards can handle much larger TSO requests
  * but the FreeBSD TCP stack will break on larger
  * values
@@ -63,7 +69,7 @@
 
 typedef struct if_rxd_frag {
 	uint8_t irf_flid;
-	uint16_t irf_idx;
+	qidx_t irf_idx;
 	uint16_t irf_len;
 } *if_rxd_frag_t;
 
@@ -73,47 +79,61 @@
 	uint16_t iri_vtag;		/* vlan tag - if flag set */
 	/* XXX redundant with the new irf_len field */
 	uint16_t iri_len;		/* packet length */
-	uint16_t iri_cidx;		/* consumer index of cq */
+	qidx_t iri_cidx;		/* consumer index of cq */
 	struct ifnet *iri_ifp;		/* some drivers >1 interface per softc */
 
 	/* updated by driver */
-	uint16_t iri_flags;		/* mbuf flags for packet */
+	if_rxd_frag_t iri_frags;
 	uint32_t iri_flowid;		/* RSS hash for packet */
 	uint32_t iri_csum_flags;	/* m_pkthdr csum flags */
+
 	uint32_t iri_csum_data;		/* m_pkthdr csum data */
+	uint8_t iri_flags;		/* mbuf flags for packet */
 	uint8_t iri_nfrags;		/* number of fragments in packet */
 	uint8_t iri_rsstype;		/* RSS hash type */
 	uint8_t iri_pad;		/* any padding in the received data */
-	if_rxd_frag_t iri_frags;
 } *if_rxd_info_t;
 
+typedef struct if_rxd_update {
+	uint64_t	*iru_paddrs;
+	caddr_t		*iru_vaddrs;
+	qidx_t		*iru_idxs;
+	qidx_t		iru_pidx;
+	uint16_t	iru_qsidx;
+	uint16_t	iru_count;
+	uint16_t	iru_buf_size;
+	uint8_t		iru_flidx;
+} *if_rxd_update_t;
+
 #define IPI_TX_INTR	0x1		/* send an interrupt when this packet is sent */
 #define IPI_TX_IPV4	0x2		/* ethertype IPv4 */
 #define IPI_TX_IPV6	0x4		/* ethertype IPv6 */
 
 typedef struct if_pkt_info {
-	uint32_t ipi_len;		/* packet length */
-	bus_dma_segment_t *ipi_segs;	/* physical addresses */
-	uint16_t ipi_qsidx;		/* queue set index */
-	uint16_t ipi_nsegs;		/* number of segments */
-	uint16_t ipi_ndescs;		/* number of descriptors used by encap */
-	uint16_t ipi_flags;		/* iflib per-packet flags */
-	uint32_t ipi_pidx;		/* start pidx for encap */
-	uint32_t ipi_new_pidx;		/* next available pidx post-encap */
+	bus_dma_segment_t *ipi_segs;	/* physical addresses */
+	uint32_t ipi_len;		/* packet length */
+	uint16_t ipi_qsidx;		/* queue set index */
+	qidx_t ipi_nsegs;		/* number of segments */
+
+	qidx_t ipi_ndescs;		/* number of descriptors used by encap */
+	uint16_t ipi_flags;		/* iflib per-packet flags */
+	qidx_t ipi_pidx;		/* start pidx for encap */
+	qidx_t ipi_new_pidx;		/* next available pidx post-encap */
 	/* offload handling */
-	uint64_t ipi_csum_flags;	/* packet checksum flags */
-	uint16_t ipi_tso_segsz;		/* tso segment size */
-	uint16_t ipi_mflags;		/* packet mbuf flags */
-	uint16_t ipi_vtag;		/* VLAN tag */
-	uint16_t ipi_etype;		/* ether header type */
-	uint8_t ipi_ehdrlen;		/* ether header length */
-	uint8_t ipi_ip_hlen;		/* ip header length */
-	uint8_t ipi_tcp_hlen;		/* tcp header length */
-	uint8_t ipi_tcp_hflags;		/* tcp header flags */
-	uint8_t ipi_ipproto;		/* ip protocol */
-	/* implied padding */
-	uint32_t ipi_tcp_seq;		/* tcp seqno */
-	uint32_t ipi_tcp_sum;		/* tcp csum */
+	uint8_t ipi_ehdrlen;		/* ether header length */
+	uint8_t ipi_ip_hlen;		/* ip header length */
+	uint8_t ipi_tcp_hlen;		/* tcp header length */
+	uint8_t ipi_ipproto;		/* ip protocol */
+
+	uint32_t ipi_csum_flags;	/* packet checksum flags */
+	uint16_t ipi_tso_segsz;		/* tso segment size */
+	uint16_t ipi_vtag;		/* VLAN tag */
+	uint16_t ipi_etype;		/* ether header type */
+	uint8_t ipi_tcp_hflags;		/* tcp header flags */
+	uint8_t ipi_mflags;		/* packet mbuf flags */
+
+	uint32_t ipi_tcp_seq;		/* tcp seqno */
+	uint32_t ipi_tcp_sum;		/* tcp csum */
 } *if_pkt_info_t;
 
 typedef struct if_irq {
@@ -154,17 +174,20 @@
 #define PVID_OEM(vendor, devid, svid, sdevid, revid, name) {vendor, devid, svid, sdevid, revid, 0, name}
 #define PVID_END {0, 0, 0, 0, 0, 0, NULL}
 
+#define IFLIB_PNP_DESCR "U32:vendor;U32:device;U32:subvendor;U32:subdevice;" \
+    "U32:revision;U32:class;D:human"
+#define IFLIB_PNP_INFO(b, u, t) \
+    MODULE_PNP_INFO(IFLIB_PNP_DESCR, b, u, t, sizeof(t[0]), nitems(t) - 1)
+
 typedef struct if_txrx {
 	int (*ift_txd_encap) (void *, if_pkt_info_t);
-	void (*ift_txd_flush) (void *, uint16_t, uint32_t);
-	int (*ift_txd_credits_update) (void *, uint16_t, uint32_t, bool);
+	void (*ift_txd_flush) (void *, uint16_t, qidx_t pidx);
+	int (*ift_txd_credits_update) (void *, uint16_t qsidx, bool clear);
 
-	int (*ift_rxd_available) (void *, uint16_t qsidx, uint32_t pidx,
-	    int budget);
+	int (*ift_rxd_available) (void *, uint16_t qsidx, qidx_t pidx, qidx_t budget);
 	int (*ift_rxd_pkt_get) (void *, if_rxd_info_t ri);
-	void (*ift_rxd_refill) (void * , uint16_t qsidx, uint8_t flidx, uint32_t pidx,
-	    uint64_t *paddrs, caddr_t *vaddrs, uint16_t count, uint16_t buf_size);
-	void (*ift_rxd_flush) (void *, uint16_t qsidx, uint8_t flidx, uint32_t pidx);
+	void (*ift_rxd_refill) (void * , if_rxd_update_t iru);
+	void (*ift_rxd_flush) (void *, uint16_t qsidx, uint8_t flidx, qidx_t pidx);
 	int (*ift_legacy_intr) (void *);
 } *if_txrx_t;
 
@@ -179,11 +202,15 @@
 	uint32_t isc_txqsizes[8];
 	uint32_t isc_rxqsizes[8];
-	int isc_max_txqsets;
-	int isc_max_rxqsets;
+	/* is there such a thing as a descriptor that is more than 248 bytes? */
+	uint8_t isc_txd_size[8];
+	uint8_t isc_rxd_size[8];
+
 	int isc_tx_tso_segments_max;
 	int isc_tx_tso_size_max;
 	int isc_tx_tso_segsize_max;
+	int isc_tx_csum_flags;
+	int isc_capenable;
 	int isc_rss_table_size;
 	int isc_rss_table_mask;
 	int isc_nrxqsets_max;
@@ -191,32 +218,28 @@
 	iflib_intr_mode_t isc_intr;
 	uint16_t isc_max_frame_size;	/* set at init time by driver */
+	uint16_t isc_min_frame_size;	/* set at init time by driver, only used if
+					   IFLIB_NEED_ETHER_PAD is set. */
+	uint32_t isc_pause_frames;	/* set by driver for iflib_timer to detect */
 	pci_vendor_info_t isc_vendor_info;	/* set by iflib prior to attach_pre */
+	int isc_disable_msix;
+	if_txrx_t isc_txrx;
 } *if_softc_ctx_t;
 
 /*
  * Initialization values for device
 */
 struct if_shared_ctx {
-	int isc_magic;
-	if_txrx_t isc_txrx;
+	unsigned isc_magic;
 	driver_t *isc_driver;
-	int isc_nfl;
-	int isc_flags;
 	bus_size_t isc_q_align;
 	bus_size_t isc_tx_maxsize;
 	bus_size_t isc_tx_maxsegsize;
 	bus_size_t isc_rx_maxsize;
 	bus_size_t isc_rx_maxsegsize;
 	int isc_rx_nsegments;
-	int isc_rx_process_limit;
-	int isc_ntxqs;		/* # of tx queues per tx qset - usually 1 */
-	int isc_nrxqs;		/* # of rx queues per rx qset - intel 1, chelsio 2, broadcom 3 */
 	int isc_admin_intrcnt;	/* # of admin/link interrupts */
-
-	int isc_tx_reclaim_thresh;
-
 	/* fields necessary for probe */
 	pci_vendor_info_t *isc_vendor_info;
 	char *isc_driver_version;
@@ -229,6 +252,14 @@
 	int isc_ntxd_min[8];
 	int isc_ntxd_default[8];
 	int isc_ntxd_max[8];
+
+	/* actively used during operation */
+	int isc_nfl __aligned(CACHE_LINE_SIZE);
+	int isc_ntxqs;		/* # of tx queues per tx qset - usually 1 */
+	int isc_nrxqs;		/* # of rx queues per rx qset - intel 1, chelsio 2, broadcom 3 */
+	int isc_rx_process_limit;
+	int isc_tx_reclaim_thresh;
+	int isc_flags;
 };
 
 typedef struct iflib_dma_info {
@@ -242,8 +273,9 @@
 #define IFLIB_MAGIC 0xCAFEF00D
 
 typedef enum {
-	IFLIB_INTR_TX,
 	IFLIB_INTR_RX,
+	IFLIB_INTR_TX,
+	IFLIB_INTR_RXTX,
 	IFLIB_INTR_ADMIN,
 	IFLIB_INTR_IOV,
 } iflib_intr_type_t;
@@ -256,22 +288,42 @@
 /*
  * Interface has a separate command queue for RX
 */
-#define IFLIB_HAS_RXCQ		0x1
+#define IFLIB_HAS_RXCQ		0x01
 /*
  * Driver has already allocated vectors
 */
-#define IFLIB_SKIP_MSIX		0x2
-
+#define IFLIB_SKIP_MSIX		0x02
 /*
 * Interface is a virtual function
 */
-#define IFLIB_IS_VF		0x4
+#define IFLIB_IS_VF		0x04
 /*
 * Interface has a separate command queue for TX
 */
-#define IFLIB_HAS_TXCQ		0x8
+#define IFLIB_HAS_TXCQ		0x08
+/*
+ * Interface does checksum in place
+ */
+#define IFLIB_NEED_SCRATCH	0x10
+/*
+ * Interface doesn't expect in_pseudo for th_sum
+ */
+#define IFLIB_TSO_INIT_IP	0x20
+/*
+ * Interface doesn't align IP header
+ */
+#define IFLIB_DO_RX_FIXUP	0x40
+/*
+ * Driver needs csum zeroed for offloading
+ */
+#define IFLIB_NEED_ZERO_CSUM	0x80
+/*
+ * Driver needs frames padded to some minimum length
+ */
+#define IFLIB_NEED_ETHER_PAD	0x100
+
 /*
 * field accessors
 */
@@ -288,9 +340,6 @@
 
 void iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN]);
 
-
-
-
 /*
  * If the driver can plug cleanly in to newbus use these
 */
@@ -319,7 +368,7 @@
 int iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
     iflib_intr_type_t type, driver_filter_t *filter,
    void *filter_arg, int qid, char *name);
-void iflib_softirq_alloc_generic(if_ctx_t ctx, int rid, iflib_intr_type_t type, void *arg, int qid, char *name);
+void iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type, void *arg, int qid, char *name);
 
 void iflib_irq_free(if_ctx_t ctx, if_irq_t irq);
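The IFLIB_PNP_DESCR/IFLIB_PNP_INFO pair added above exports a driver's pci_vendor_info table in MODULE_PNP_INFO format so devmatch(8) can autoload the module on device discovery. A sketch for a hypothetical driver (the names and IDs are illustrative only):

	static pci_vendor_info_t my_vendor_info[] = {
		PVID(0x14e4, 0x16c8, "Example 10Gb NIC"),
		PVID_END
	};
	IFLIB_PNP_INFO(pci, mydrv, my_vendor_info);

Note the `nitems(t) - 1` in the macro, which drops the PVID_END sentinel from the exported table so devmatch does not see an all-zero entry.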
Index: sys/net/iflib.c
===================================================================
--- sys/net/iflib.c
+++ sys/net/iflib.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2014-2016, Matthew Macy
+ * Copyright (c) 2014-2017, Matthew Macy
  * All rights reserved.
 *
  * Redistribution and use in source and binary forms, with or without
@@ -31,6 +31,7 @@
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_acpi.h"
+#include "opt_sched.h"
 
 #include 
 #include 
@@ -59,6 +60,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
@@ -68,6 +70,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
@@ -93,9 +97,10 @@
 #include 
 #endif
-
+#include 
 /*
- * enable accounting of every mbuf as it comes in to and goes out of iflib's software descriptor references
+ * enable accounting of every mbuf as it comes in to and goes out of
+ * iflib's software descriptor references
 */
 #define MEMORY_LOGGING 0
 /*
@@ -134,10 +139,13 @@
 struct iflib_fl;
 typedef struct iflib_fl *iflib_fl_t;
 
+static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid);
+
 typedef struct iflib_filter_info {
 	driver_filter_t *ifi_filter;
 	void *ifi_filter_arg;
 	struct grouptask *ifi_task;
+	void *ifi_ctx;
 } *iflib_filter_info_t;
 
 struct iflib_ctx {
@@ -156,7 +164,6 @@
 	struct mtx ifc_mtx;
 
 	uint16_t ifc_nhwtxqs;
-	uint16_t ifc_nhwrxqs;
 
 	iflib_txq_t ifc_txqs;
 	iflib_rxq_t ifc_rxqs;
@@ -167,7 +174,6 @@
 
 	int ifc_link_state;
 	int ifc_link_irq;
-	int ifc_pause_frames;
 	int ifc_watchdog_events;
 	struct cdev *ifc_led_dev;
 	struct resource *ifc_msix_mem;
@@ -182,9 +188,10 @@
 	uint16_t ifc_sysctl_ntxqs;
 	uint16_t ifc_sysctl_nrxqs;
 	uint16_t ifc_sysctl_qs_eq_override;
+	uint16_t ifc_sysctl_rx_budget;
 
-	uint16_t ifc_sysctl_ntxds[8];
-	uint16_t ifc_sysctl_nrxds[8];
+	qidx_t ifc_sysctl_ntxds[8];
+	qidx_t ifc_sysctl_nrxds[8];
 	struct if_txrx ifc_txrx;
#define isc_txd_encap ifc_txrx.ift_txd_encap
#define isc_txd_flush ifc_txrx.ift_txd_flush
@@ -252,7 +259,9 @@
 	return (ctx->ifc_sctx);
 }
 
+#define IP_ALIGNED(m) ((((uintptr_t)(m)->m_data) & 0x3) == 0x2)
 #define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*))
+#define CACHE_PTR_NEXT(ptr) ((void *)(((uintptr_t)(ptr)+CACHE_LINE_SIZE-1) & ~((uintptr_t)CACHE_LINE_SIZE-1)))
 
 #define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP)
 #define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
@@ -262,62 +271,70 @@
 #define RX_SW_DESC_INUSE        (1 << 3)
 #define TX_SW_DESC_MAPPED       (1 << 4)
 
-typedef struct iflib_sw_rx_desc {
-	bus_dmamap_t    ifsd_map;         /* bus_dma map for packet */
-	struct mbuf    *ifsd_m;           /* rx: uninitialized mbuf */
-	caddr_t         ifsd_cl;          /* direct cluster pointer for rx */
-	uint16_t	ifsd_flags;
-} *iflib_rxsd_t;
+#define	M_TOOBIG		M_PROTO1
 
-typedef struct iflib_sw_tx_desc_val {
-	bus_dmamap_t    ifsd_map;         /* bus_dma map for packet */
-	struct mbuf    *ifsd_m;           /* pkthdr mbuf */
-	uint8_t		ifsd_flags;
-} *iflib_txsd_val_t;
+typedef struct iflib_sw_rx_desc_array {
+	bus_dmamap_t	*ifsd_map;         /* bus_dma maps for packet */
+	struct mbuf	**ifsd_m;           /* pkthdr mbufs */
+	caddr_t		*ifsd_cl;          /* direct cluster pointer for rx */
+	uint8_t		*ifsd_flags;
+} iflib_rxsd_array_t;
 
 typedef struct iflib_sw_tx_desc_array {
 	bus_dmamap_t    *ifsd_map;         /* bus_dma maps for packet */
 	struct mbuf    **ifsd_m;           /* pkthdr mbufs */
 	uint8_t		*ifsd_flags;
-} iflib_txsd_array_t;
+} if_txsd_vec_t;
 
 /* magic number that should be high enough for any hardware */
 #define IFLIB_MAX_TX_SEGS		128
-#define IFLIB_MAX_RX_SEGS		32
+/* bnxt supports 64 with hardware LRO enabled */
+#define IFLIB_MAX_RX_SEGS		64
 #define IFLIB_RX_COPY_THRESH		128
 #define IFLIB_MAX_RX_REFRESH		32
+/* The minimum descriptors per second before we start coalescing */
+#define IFLIB_MIN_DESC_SEC		16384
+#define IFLIB_DEFAULT_TX_UPDATE_FREQ	16
 #define IFLIB_QUEUE_IDLE		0
 #define IFLIB_QUEUE_HUNG		1
 #define IFLIB_QUEUE_WORKING		2
+/* maximum number of txqs that can share an rx interrupt */
+#define IFLIB_MAX_TX_SHARED_INTR	4
 
-/* this should really scale with ring size - 32 is a fairly arbitrary value for this */
-#define TX_BATCH_SIZE			16
+/* this should really scale with ring size - this is a fairly arbitrary value */
+#define TX_BATCH_SIZE			32
 
 #define IFLIB_RESTART_BUDGET		8
 
-#define	IFC_LEGACY		0x01
-#define	IFC_QFLUSH		0x02
-#define	IFC_MULTISEG		0x04
-#define	IFC_DMAR		0x08
-#define	IFC_SC_ALLOCATED	0x10
+#define	IFC_LEGACY		0x001
+#define	IFC_QFLUSH		0x002
+#define	IFC_MULTISEG		0x004
+#define	IFC_DMAR		0x008
+#define	IFC_SC_ALLOCATED	0x010
+#define	IFC_INIT_DONE		0x020
+#define	IFC_PREFETCH		0x040
+#define	IFC_DO_RESET		0x080
+#define	IFC_CHECK_HUNG		0x100
 
 #define CSUM_OFFLOAD		(CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \
				 CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \
				 CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP)
 
 struct iflib_txq {
-	uint16_t	ift_in_use;
-	uint16_t	ift_cidx;
-	uint16_t	ift_cidx_processed;
-	uint16_t	ift_pidx;
+	qidx_t		ift_in_use;
+	qidx_t		ift_cidx;
+	qidx_t		ift_cidx_processed;
+	qidx_t		ift_pidx;
 	uint8_t		ift_gen;
-	uint8_t		ift_db_pending;
-	uint8_t		ift_db_pending_queued;
-	uint8_t		ift_npending;
 	uint8_t		ift_br_offset;
+	uint16_t	ift_npending;
+	uint16_t	ift_db_pending;
+	uint16_t	ift_rs_pending;
 	/* implicit pad */
+	uint8_t		ift_txd_size[8];
 	uint64_t	ift_processed;
 	uint64_t	ift_cleaned;
+	uint64_t	ift_cleaned_prev;
 #if MEMORY_LOGGING
 	uint64_t	ift_enqueued;
 	uint64_t	ift_dequeued;
@@ -335,19 +352,16 @@
 
 	/* constant values */
 	if_ctx_t	ift_ctx;
-	struct ifmp_ring        **ift_br;
+	struct ifmp_ring        *ift_br;
 	struct grouptask	ift_task;
-	uint16_t	ift_size;
+	qidx_t		ift_size;
 	uint16_t	ift_id;
 	struct callout	ift_timer;
-	struct callout	ift_db_check;
 
-	iflib_txsd_array_t ift_sds;
-	uint8_t		ift_nbr;
-	uint8_t		ift_qstatus;
-	uint8_t		ift_active;
-	uint8_t		ift_closed;
-	int		ift_watchdog_time;
+	if_txsd_vec_t	ift_sds;
+	uint8_t		ift_qstatus;
+	uint8_t		ift_closed;
+	uint8_t		ift_update_freq;
 	struct iflib_filter_info ift_filter_info;
 	bus_dma_tag_t		ift_desc_tag;
 	bus_dma_tag_t		ift_tso_desc_tag;
@@ -356,13 +370,17 @@
 	char                    ift_mtx_name[MTX_NAME_LEN];
 	char                    ift_db_mtx_name[MTX_NAME_LEN];
 	bus_dma_segment_t	ift_segs[IFLIB_MAX_TX_SEGS]  __aligned(CACHE_LINE_SIZE);
+#ifdef IFLIB_DIAGNOSTICS
+	uint64_t ift_cpu_exec_count[256];
+#endif
 } __aligned(CACHE_LINE_SIZE);
 
 struct iflib_fl {
-	uint16_t	ifl_cidx;
-	uint16_t	ifl_pidx;
-	uint16_t	ifl_credits;
+	qidx_t		ifl_cidx;
+	qidx_t		ifl_pidx;
+	qidx_t		ifl_credits;
 	uint8_t		ifl_gen;
+	uint8_t		ifl_rxd_size;
 #if MEMORY_LOGGING
 	uint64_t	ifl_m_enqueued;
 	uint64_t	ifl_m_dequeued;
@@ -371,24 +389,27 @@
 #endif
 	/* implicit pad */
 
+	bitstr_t	*ifl_rx_bitmap;
+	qidx_t		ifl_fragidx;
 	/* constant */
-	uint16_t	ifl_size;
+	qidx_t		ifl_size;
 	uint16_t	ifl_buf_size;
 	uint16_t	ifl_cltype;
 	uma_zone_t	ifl_zone;
-	iflib_rxsd_t	ifl_sds;
+	iflib_rxsd_array_t	ifl_sds;
 	iflib_rxq_t	ifl_rxq;
 	uint8_t		ifl_id;
 	bus_dma_tag_t           ifl_desc_tag;
 	iflib_dma_info_t	ifl_ifdi;
 	uint64_t	ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE);
 	caddr_t		ifl_vm_addrs[IFLIB_MAX_RX_REFRESH];
+	qidx_t	ifl_rxd_idxs[IFLIB_MAX_RX_REFRESH];
 }  __aligned(CACHE_LINE_SIZE);
 
-static inline int
-get_inuse(int size, int cidx, int pidx, int gen)
+static inline qidx_t
+get_inuse(int size, qidx_t cidx, qidx_t pidx, uint8_t gen)
 {
-	int used;
+	qidx_t used;
 
 	if (pidx > cidx)
 		used = pidx - cidx;
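A worked example of the occupancy math in get_inuse() above: with size = 1024, cidx = 1020 and pidx = 4 the producer has wrapped, so used = size - cidx + pidx = 1024 - 1020 + 4 = 8 descriptors in use. The gen bit matters only in the pidx == cidx case (not shown in this hunk's context), where it is what distinguishes a completely empty ring (gen == 0, used = 0) from a completely full one (gen == 1, used = size):

	/* Illustration only; values chosen arbitrarily. */
	qidx_t used = get_inuse(1024, 1020, 4, 1);	/* == 8 */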
@@ -414,9 +435,9 @@
 	 * these are the cq cidx and pidx.  Otherwise
 	 * these are unused.
 	 */
-	uint16_t	ifr_size;
-	uint16_t	ifr_cq_cidx;
-	uint16_t	ifr_cq_pidx;
+	qidx_t		ifr_size;
+	qidx_t		ifr_cq_cidx;
+	qidx_t		ifr_cq_pidx;
 	uint8_t		ifr_cq_gen;
 	uint8_t		ifr_fl_offset;
 
@@ -426,26 +447,89 @@
 	uint16_t	ifr_id;
 	uint8_t		ifr_lro_enabled;
 	uint8_t		ifr_nfl;
+	uint8_t		ifr_ntxqirq;
+	uint8_t		ifr_txqid[IFLIB_MAX_TX_SHARED_INTR];
 	struct lro_ctrl			ifr_lc;
 	struct grouptask        ifr_task;
 	struct iflib_filter_info ifr_filter_info;
 	iflib_dma_info_t		ifr_ifdi;
+
 	/* dynamically allocate if any drivers need a value substantially larger than this */
 	struct if_rxd_frag	ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE);
+#ifdef IFLIB_DIAGNOSTICS
+	uint64_t ifr_cpu_exec_count[256];
+#endif
 }  __aligned(CACHE_LINE_SIZE);
 
+typedef struct if_rxsd {
+	caddr_t *ifsd_cl;
+	struct mbuf **ifsd_m;
+	iflib_fl_t ifsd_fl;
+	qidx_t ifsd_cidx;
+} *if_rxsd_t;
+
+/* multiple of word size */
+#ifdef __LP64__
+#define PKT_INFO_SIZE	6
+#define RXD_INFO_SIZE	5
+#define PKT_TYPE uint64_t
+#else
+#define PKT_INFO_SIZE	11
+#define RXD_INFO_SIZE	8
+#define PKT_TYPE uint32_t
+#endif
+#define PKT_LOOP_BOUND  ((PKT_INFO_SIZE/3)*3)
+#define RXD_LOOP_BOUND  ((RXD_INFO_SIZE/4)*4)
+
+typedef struct if_pkt_info_pad {
+	PKT_TYPE pkt_val[PKT_INFO_SIZE];
+} *if_pkt_info_pad_t;
+typedef struct if_rxd_info_pad {
+	PKT_TYPE rxd_val[RXD_INFO_SIZE];
+} *if_rxd_info_pad_t;
+
+CTASSERT(sizeof(struct if_pkt_info_pad) == sizeof(struct if_pkt_info));
+CTASSERT(sizeof(struct if_rxd_info_pad) == sizeof(struct if_rxd_info));
+
+
+static inline void
+pkt_info_zero(if_pkt_info_t pi)
+{
+	if_pkt_info_pad_t pi_pad;
+
+	pi_pad = (if_pkt_info_pad_t)pi;
+	pi_pad->pkt_val[0] = 0; pi_pad->pkt_val[1] = 0; pi_pad->pkt_val[2] = 0;
+	pi_pad->pkt_val[3] = 0; pi_pad->pkt_val[4] = 0; pi_pad->pkt_val[5] = 0;
+#ifndef __LP64__
+	pi_pad->pkt_val[6] = 0; pi_pad->pkt_val[7] = 0; pi_pad->pkt_val[8] = 0;
+	pi_pad->pkt_val[9] = 0; pi_pad->pkt_val[10] = 0;
+#endif
+}
+
+static inline void
+rxd_info_zero(if_rxd_info_t ri)
+{
+	if_rxd_info_pad_t ri_pad;
+	int i;
+
+	ri_pad = (if_rxd_info_pad_t)ri;
+	for (i = 0; i < RXD_LOOP_BOUND; i += 4) {
+		ri_pad->rxd_val[i] = 0;
+		ri_pad->rxd_val[i+1] = 0;
+		ri_pad->rxd_val[i+2] = 0;
+		ri_pad->rxd_val[i+3] = 0;
+	}
+#ifdef __LP64__
+	ri_pad->rxd_val[RXD_INFO_SIZE-1] = 0;
+#endif
+}
+
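pkt_info_zero() and rxd_info_zero() use the *_pad overlays and CTASSERTs above so the hot path can zero these structures with a handful of word-sized stores instead of a bzero/memset call. The same pattern for a hypothetical three-word struct (my_info and friends are illustrative names):

	typedef struct my_info_pad { PKT_TYPE val[3]; } *my_info_pad_t;
	CTASSERT(sizeof(struct my_info_pad) == sizeof(struct my_info));

	static inline void
	my_info_zero(struct my_info *mi)
	{
		my_info_pad_t pad = (my_info_pad_t)mi;

		/* three word-sized stores, no function call */
		pad->val[0] = 0; pad->val[1] = 0; pad->val[2] = 0;
	}

If a field is ever added to the real struct without resizing the pad overlay, the CTASSERT fails at compile time rather than silently leaving the tail of the struct uninitialized.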
 /*
  * Only allow a single packet to take up most 1/nth of the tx ring
 */
 #define MAX_SINGLE_PACKET_FRACTION 12
 #define IF_BAD_DMA (bus_addr_t)-1
 
-static int enable_msix = 1;
-
-#define mtx_held(m)	(((m)->mtx_lock & ~MTX_FLAGMASK) != (uintptr_t)0)
-
-
-
 #define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING))
 
 #define CTX_LOCK_INIT(_sc, _name)  mtx_init(&(_sc)->ifc_mtx, _name, "iflib ctx lock", MTX_DEF)
@@ -455,12 +539,6 @@
 
 #define CTX_LOCK_DESTROY(ctx)	mtx_destroy(&(ctx)->ifc_mtx)
 
-#define TXDB_LOCK_INIT(txq)  mtx_init(&(txq)->ift_db_mtx, (txq)->ift_db_mtx_name, NULL, MTX_DEF)
-#define TXDB_TRYLOCK(txq) mtx_trylock(&(txq)->ift_db_mtx)
-#define TXDB_LOCK(txq) mtx_lock(&(txq)->ift_db_mtx)
-#define TXDB_UNLOCK(txq) mtx_unlock(&(txq)->ift_db_mtx)
-#define TXDB_LOCK_DESTROY(txq) mtx_destroy(&(txq)->ift_db_mtx)
-
 #define CALLOUT_LOCK(txq)	mtx_lock(&txq->ift_mtx)
 #define CALLOUT_UNLOCK(txq) 	mtx_unlock(&txq->ift_mtx)
 
@@ -480,6 +558,7 @@
 MODULE_DEPEND(iflib, pci, 1, 1, 1);
 MODULE_DEPEND(iflib, ether, 1, 1, 1);
 
+TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
 TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
 
 #ifndef IFLIB_DEBUG_COUNTERS
@@ -497,9 +576,11 @@
  * XXX need to ensure that this can't accidentally cause the head to be moved backwards
 */
 static int iflib_min_tx_latency = 0;
-
 SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW,
-    &iflib_min_tx_latency, 0, "minimize transmit latency at the possibel expense of throughput");
+    &iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput");
+static int iflib_no_tx_batch = 0;
+SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW,
+    &iflib_no_tx_batch, 0, "minimize transmit latency at the possible expense of throughput");
 
 
 #if IFLIB_DEBUG_COUNTERS
@@ -544,11 +625,14 @@
 
 static int iflib_encap_load_mbuf_fail;
+static int iflib_encap_pad_mbuf_fail;
 static int iflib_encap_txq_avail_fail;
 static int iflib_encap_txd_encap_fail;
 
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD,
		   &iflib_encap_load_mbuf_fail, 0, "# busdma load failures");
+SYSCTL_INT(_net_iflib, OID_AUTO, encap_pad_mbuf_fail, CTLFLAG_RD,
+		   &iflib_encap_pad_mbuf_fail, 0, "# runt frame pad failures");
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD,
		   &iflib_encap_txq_avail_fail, 0, "# txq avail failures");
 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD,
@@ -594,10 +678,24 @@
	   &iflib_verbose_debug, 0, "enable verbose debugging");
 
 #define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1)
+static void
+iflib_debug_reset(void)
+{
+	iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs =
+		iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees =
+		iflib_txq_drain_flushing = iflib_txq_drain_oactive =
+		iflib_txq_drain_notready = iflib_txq_drain_encapfail =
+		iflib_encap_load_mbuf_fail = iflib_encap_pad_mbuf_fail =
+		iflib_encap_txq_avail_fail = iflib_encap_txd_encap_fail =
+		iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs =
+		iflib_intr_link = iflib_intr_msix = iflib_rx_unavail =
+		iflib_rx_ctx_inactive = iflib_rx_zero_len = iflib_rx_if_input =
+		iflib_rx_mbuf_null = iflib_rxd_flush = 0;
+}
 
 #else
 #define DBG_COUNTER_INC(name)
-
+static void iflib_debug_reset(void) {}
 #endif
 
 
@@ -608,7 +706,7 @@
 static void iflib_rx_structures_free(if_ctx_t ctx);
 static int iflib_queues_alloc(if_ctx_t ctx);
 static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq);
-static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, int cidx, int budget);
+static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget);
 static int iflib_qset_structures_setup(if_ctx_t ctx);
 static int iflib_msix_init(if_ctx_t ctx);
 static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, char *str);
@@ -618,8 +716,14 @@
 static void iflib_init_locked(if_ctx_t ctx);
 static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
 static void iflib_add_device_sysctl_post(if_ctx_t ctx);
+static void iflib_ifmp_purge(iflib_txq_t txq);
+static void _iflib_pre_assert(if_softc_ctx_t scctx);
+static void iflib_stop(if_ctx_t ctx);
+static void iflib_if_init_locked(if_ctx_t ctx);
+#ifndef __NO_STRICT_ALIGNMENT
+static struct mbuf * iflib_fixup_rx(struct mbuf *m);
+#endif
 
-
 #ifdef DEV_NETMAP
 #include 
 #include 
 
 MODULE_DEPEND(iflib, netmap, 1, 1, 1);
 
+static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, uint32_t nm_i, bool init);
+
 /*
  * device-specific sysctl variables:
 *
@@ -662,6 +768,7 @@
 {
 	struct ifnet *ifp = na->ifp;
 	if_ctx_t ctx = ifp->if_softc;
+	int status;
 
 	CTX_LOCK(ctx);
 	IFDI_INTR_DISABLE(ctx);
@@ -670,7 +777,7 @@
 	ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
 
 	if (!CTX_IS_VF(ctx))
-		IFDI_CRCSTRIP_SET(ctx, onoff);
+		IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip);
 
 	/* enable or disable flags and callbacks in na and ifp */
 	if (onoff) {
@@ -678,12 +785,90 @@
 	} else {
 		nm_clear_native_flags(na);
 	}
-	IFDI_INIT(ctx);
-	IFDI_CRCSTRIP_SET(ctx, onoff); // XXX why twice ?
+	iflib_stop(ctx);
+	iflib_init_locked(ctx);
+	IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ?
+	status = ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1;
+	if (status)
+		nm_clear_native_flags(na);
 	CTX_UNLOCK(ctx);
-	return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1);
+	return (status);
 }
 
+static int
+netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, uint32_t nm_i, bool init)
+{
+	struct netmap_adapter *na = kring->na;
+	u_int const lim = kring->nkr_num_slots - 1;
+	u_int head = kring->rhead;
+	struct netmap_ring *ring = kring->ring;
+	bus_dmamap_t *map;
+	struct if_rxd_update iru;
+	if_ctx_t ctx = rxq->ifr_ctx;
+	iflib_fl_t fl = &rxq->ifr_fl[0];
+	uint32_t refill_pidx, nic_i;
+
+	if (nm_i == head && __predict_true(!init))
+		return 0;
+	iru_init(&iru, rxq, 0 /* flid */);
+	map = fl->ifl_sds.ifsd_map;
+	refill_pidx = netmap_idx_k2n(kring, nm_i);
+	/*
+	 * IMPORTANT: we must leave one free slot in the ring,
+	 * so move head back by one unit
+	 */
+	head = nm_prev(head, lim);
+	while (nm_i != head) {
+		for (int tmp_pidx = 0; tmp_pidx < IFLIB_MAX_RX_REFRESH && nm_i != head; tmp_pidx++) {
+			struct netmap_slot *slot = &ring->slot[nm_i];
+			void *addr = PNMB(na, slot, &fl->ifl_bus_addrs[tmp_pidx]);
+			uint32_t nic_i_dma = refill_pidx;
+			nic_i = netmap_idx_k2n(kring, nm_i);
+
+			MPASS(tmp_pidx < IFLIB_MAX_RX_REFRESH);
+
+			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
+			        return netmap_ring_reinit(kring);
+
+			fl->ifl_vm_addrs[tmp_pidx] = addr;
+			if (__predict_false(init) && map) {
+				netmap_load_map(na, fl->ifl_ifdi->idi_tag, map[nic_i], addr);
+			} else if (map && (slot->flags & NS_BUF_CHANGED)) {
+				/* buffer has changed, reload map */
+				netmap_reload_map(na, fl->ifl_ifdi->idi_tag, map[nic_i], addr);
+			}
+			slot->flags &= ~NS_BUF_CHANGED;
+
+			nm_i = nm_next(nm_i, lim);
+			fl->ifl_rxd_idxs[tmp_pidx] = nic_i = nm_next(nic_i, lim);
+			if (nm_i != head && tmp_pidx < IFLIB_MAX_RX_REFRESH-1)
+				continue;
+
+			iru.iru_pidx = refill_pidx;
+			iru.iru_count = tmp_pidx+1;
+			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
+
+			refill_pidx = nic_i;
+			if (map == NULL)
+				continue;
+
+			for (int n = 0; n < iru.iru_count; n++) {
+				bus_dmamap_sync(fl->ifl_ifdi->idi_tag, map[nic_i_dma],
+						BUS_DMASYNC_PREREAD);
+				/* XXX - change this to not use the netmap func*/
+				nic_i_dma = nm_next(nic_i_dma, lim);
+			}
+		}
+	}
+	kring->nr_hwcur = head;
+
+	if (map)
+		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
+				BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+	ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i);
+	return (0);
+}
+
* @@ -876,19 +1067,24 @@ int error, avail; uint16_t slot_flags = kring->nkr_slot_flags; - for (fl = rxq->ifr_fl, i = 0; i < rxq->ifr_nfl; i++, fl++) { + for (i = 0; i < rxq->ifr_nfl; i++) { + fl = &rxq->ifr_fl[i]; nic_i = fl->ifl_cidx; nm_i = netmap_idx_n2k(kring, nic_i); - avail = ctx->isc_rxd_available(ctx->ifc_softc, kring->ring_id, nic_i, INT_MAX); + avail = iflib_rxd_avail(ctx, rxq, nic_i, USHRT_MAX); for (n = 0; avail > 0; n++, avail--) { + rxd_info_zero(&ri); + ri.iri_frags = rxq->ifr_frags; + ri.iri_qsidx = kring->ring_id; + ri.iri_ifp = ctx->ifc_ifp; + ri.iri_cidx = nic_i; + error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri); - if (error) - ring->slot[nm_i].len = 0; - else - ring->slot[nm_i].len = ri.iri_len - crclen; + ring->slot[nm_i].len = error ? 0 : ri.iri_len - crclen; ring->slot[nm_i].flags = slot_flags; - bus_dmamap_sync(fl->ifl_ifdi->idi_tag, - fl->ifl_sds[nic_i].ifsd_map, BUS_DMASYNC_POSTREAD); + if (fl->ifl_sds.ifsd_map) + bus_dmamap_sync(fl->ifl_ifdi->idi_tag, + fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD); nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } @@ -899,7 +1095,7 @@ iflib_rx_miss_bufs += n; } fl->ifl_cidx = nic_i; - kring->nr_hwtail = nm_i; + kring->nr_hwtail = netmap_idx_k2n(kring, nm_i); } kring->nr_kflags &= ~NKR_PENDINTR; } @@ -913,51 +1109,27 @@ * nm_i == (nic_i + kring->nkr_hwofs) % ring_size */ /* XXX not sure how this will work with multiple free lists */ - nm_i = kring->nr_hwcur; - if (nm_i != head) { - nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != head; n++) { - struct netmap_slot *slot = &ring->slot[nm_i]; - uint64_t paddr; - caddr_t vaddr; - void *addr = PNMB(na, slot, &paddr); + nm_i = netmap_idx_n2k(kring, kring->nr_hwcur); - if (addr == NETMAP_BUF_BASE(na)) /* bad buf */ - goto ring_reset; + return (netmap_fl_refill(rxq, kring, nm_i, false)); +} - vaddr = addr; - if (slot->flags & NS_BUF_CHANGED) { - /* buffer has changed, reload map */ - netmap_reload_map(na, fl->ifl_ifdi->idi_tag, fl->ifl_sds[nic_i].ifsd_map, addr); - slot->flags &= ~NS_BUF_CHANGED; - } - /* - * XXX we should be batching this operation - TODO - */ - ctx->isc_rxd_refill(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i, &paddr, &vaddr, 1, fl->ifl_buf_size); - bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_sds[nic_i].ifsd_map, - BUS_DMASYNC_PREREAD); - nm_i = nm_next(nm_i, lim); - nic_i = nm_next(nic_i, lim); - } - kring->nr_hwcur = head; +static void +iflib_netmap_intr(struct netmap_adapter *na, int onoff) +{ + struct ifnet *ifp = na->ifp; + if_ctx_t ctx = ifp->if_softc; - bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - /* - * IMPORTANT: we must leave one free slot in the ring, - * so move nic_i back by one unit - */ - nic_i = nm_prev(nic_i, lim); - ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i); + CTX_LOCK(ctx); + if (onoff) { + IFDI_INTR_ENABLE(ctx); + } else { + IFDI_INTR_DISABLE(ctx); } - - return 0; - -ring_reset: - return netmap_ring_reinit(kring); + CTX_UNLOCK(ctx); } + static int iflib_netmap_attach(if_ctx_t ctx) { @@ -976,6 +1148,7 @@ na.nm_txsync = iflib_netmap_txsync; na.nm_rxsync = iflib_netmap_rxsync; na.nm_register = iflib_netmap_register; + na.nm_intr = iflib_netmap_intr; na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets; na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets; return (netmap_attach(&na)); @@ -990,6 +1163,8 @@ slot = netmap_reset(na, NR_TX, txq->ift_id, 0); if (slot == NULL) return; + if (txq->ift_sds.ifsd_map == NULL) + return; for (int i = 0; i 
< ctx->ifc_softc_ctx.isc_ntxd[0]; i++) { @@ -1004,37 +1179,20 @@ netmap_load_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[i], NMB(na, slot + si)); } } + static void iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq) { struct netmap_adapter *na = NA(ctx->ifc_ifp); + struct netmap_kring *kring = &na->rx_rings[rxq->ifr_id]; struct netmap_slot *slot; - iflib_rxsd_t sd; - int nrxd; + uint32_t nm_i; slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0); if (slot == NULL) return; - sd = rxq->ifr_fl[0].ifl_sds; - nrxd = ctx->ifc_softc_ctx.isc_nrxd[0]; - for (int i = 0; i < nrxd; i++, sd++) { - int sj = netmap_idx_n2k(&na->rx_rings[rxq->ifr_id], i); - uint64_t paddr; - void *addr; - caddr_t vaddr; - - vaddr = addr = PNMB(na, slot + sj, &paddr); - netmap_load_map(na, rxq->ifr_fl[0].ifl_ifdi->idi_tag, sd->ifsd_map, addr); - /* Update descriptor and the cached value */ - ctx->isc_rxd_refill(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, i, &paddr, &vaddr, 1, rxq->ifr_fl[0].ifl_buf_size); - } - /* preserve queue */ - if (ctx->ifc_ifp->if_capenable & IFCAP_NETMAP) { - struct netmap_kring *kring = &na->rx_rings[rxq->ifr_id]; - int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring); - ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, t); - } else - ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, nrxd-1); + nm_i = netmap_idx_n2k(kring, 0); + netmap_fl_refill(rxq, kring, nm_i, true); } #define iflib_netmap_detach(ifp) netmap_detach(ifp) @@ -1046,6 +1204,7 @@ #define iflib_netmap_attach(ctx) (0) #define netmap_rx_irq(ifp, qid, budget) (0) +#define netmap_tx_irq(ifp, qid) do {} while (0) #endif @@ -1055,11 +1214,34 @@ { __asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x)); } +static __inline void +prefetch2cachelines(void *x) +{ + __asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x)); +#if (CACHE_LINE_SIZE < 128) + __asm volatile("prefetcht0 %0" :: "m" (*(((unsigned long *)x)+CACHE_LINE_SIZE/(sizeof(unsigned long))))); +#endif +} #else #define prefetch(x) +#define prefetch2cachelines(x) #endif static void +iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid) +{ + iflib_fl_t fl; + + fl = &rxq->ifr_fl[flid]; + iru->iru_paddrs = fl->ifl_bus_addrs; + iru->iru_vaddrs = &fl->ifl_vm_addrs[0]; + iru->iru_idxs = fl->ifl_rxd_idxs; + iru->iru_qsidx = rxq->ifr_id; + iru->iru_buf_size = fl->ifl_buf_size; + iru->iru_flidx = fl->ifl_id; +} + +static void _iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err) { if (err) @@ -1172,11 +1354,36 @@ iflib_dma_free(*dmaiter); } +#ifdef EARLY_AP_STARTUP +static const int iflib_started = 1; +#else +/* + * We used to abuse the smp_started flag to decide if the queues have been + * fully initialized (by late taskqgroup_adjust() calls in a SYSINIT()). + * That gave bad races, since the SYSINIT() runs strictly after smp_started + * is set. Run a SYSINIT() strictly after that to just set a usable + * completion flag. 
+ */ + +static int iflib_started; + +static void +iflib_record_started(void *arg) +{ + iflib_started = 1; +} + +SYSINIT(iflib_record_started, SI_SUB_SMP + 1, SI_ORDER_FIRST, + iflib_record_started, NULL); +#endif + static int iflib_fast_intr(void *arg) { iflib_filter_info_t info = arg; struct grouptask *gtask = info->ifi_task; + if (!iflib_started) + return (FILTER_HANDLED); DBG_COUNTER_INC(fast_intrs); if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED) @@ -1187,19 +1394,77 @@ } static int +iflib_fast_intr_rxtx(void *arg) +{ + iflib_filter_info_t info = arg; + struct grouptask *gtask = info->ifi_task; + iflib_rxq_t rxq = (iflib_rxq_t)info->ifi_ctx; + if_ctx_t ctx; + int i, cidx; + + if (!iflib_started) + return (FILTER_HANDLED); + + DBG_COUNTER_INC(fast_intrs); + if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED) + return (FILTER_HANDLED); + + ctx = rxq->ifr_ctx; + + for (i = 0; i < rxq->ifr_ntxqirq; i++) { + qidx_t txqid = rxq->ifr_txqid[i]; + + if (!ctx->isc_txd_credits_update(ctx->ifc_softc, txqid, false)) { + IFDI_TX_QUEUE_INTR_ENABLE(ctx, txqid); + continue; + } + GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task); + } + if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_RXCQ) + cidx = rxq->ifr_cq_cidx; + else + cidx = rxq->ifr_fl[0].ifl_cidx; + if (iflib_rxd_avail(ctx, rxq, cidx, 1)) + GROUPTASK_ENQUEUE(gtask); + else + IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id); + return (FILTER_HANDLED); +} + + +static int +iflib_fast_intr_ctx(void *arg) +{ + iflib_filter_info_t info = arg; + struct grouptask *gtask = info->ifi_task; + + if (!iflib_started) + return (FILTER_HANDLED); + + DBG_COUNTER_INC(fast_intrs); + if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED) + return (FILTER_HANDLED); + + GROUPTASK_ENQUEUE(gtask); + return (FILTER_HANDLED); +} + +static int _iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid, driver_filter_t filter, driver_intr_t handler, void *arg, char *name) { - int rc; + int rc, flags; struct resource *res; - void *tag; + void *tag = NULL; device_t dev = ctx->ifc_dev; + flags = RF_ACTIVE; + if (ctx->ifc_flags & IFC_LEGACY) + flags |= RF_SHAREABLE; MPASS(rid < 512); irq->ii_rid = rid; - res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &irq->ii_rid, - RF_SHAREABLE | RF_ACTIVE); + res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &irq->ii_rid, flags); if (res == NULL) { device_printf(dev, "failed to allocate IRQ for rid %d, name %s.\n", rid, name); @@ -1265,11 +1530,6 @@ (uintmax_t)sctx->isc_tx_maxsize, nsegments, (uintmax_t)sctx->isc_tx_maxsegsize); goto fail; } -#ifdef IFLIB_DIAGNOSTICS - device_printf(dev,"maxsize: %zd nsegments: %d maxsegsize: %zd\n", - sctx->isc_tx_maxsize, nsegments, sctx->isc_tx_maxsegsize); - -#endif if ((err = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ @@ -1286,11 +1546,6 @@ goto fail; } -#ifdef IFLIB_DIAGNOSTICS - device_printf(dev,"TSO maxsize: %d ntsosegments: %d maxsegsize: %d\n", - scctx->isc_tx_tso_size_max, ntsosegments, - scctx->isc_tx_tso_segsize_max); -#endif if (!(txq->ift_sds.ifsd_flags = (uint8_t *) malloc(sizeof(uint8_t) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); err = ENOMEM; goto fail; } @@ -1307,7 +1562,7 @@ } /* Create the descriptor buffer dma maps */ -#if defined(ACPI_DMAR) || (!(defined(__i386__) && !defined(__amd64__))) +#if defined(ACPI_DMAR) || (!
(defined(__i386__) || defined(__amd64__))) if ((ctx->ifc_flags & IFC_DMAR) == 0) return (0); @@ -1406,11 +1661,14 @@ iflib_dma_info_t di; int i; - /* Set number of descriptors available */ + /* Set number of descriptors available */ txq->ift_qstatus = IFLIB_QUEUE_IDLE; + /* XXX make configurable */ + txq->ift_update_freq = IFLIB_DEFAULT_TX_UPDATE_FREQ; /* Reset indices */ - txq->ift_cidx_processed = txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0; + txq->ift_cidx_processed = 0; + txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0; txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset]; for (i = 0, di = txq->ift_ifdi; i < ctx->ifc_nhwtxqs; i++, di++) @@ -1439,7 +1697,6 @@ if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; device_t dev = ctx->ifc_dev; iflib_fl_t fl; - iflib_rxsd_t rxsd; int err; MPASS(scctx->isc_nrxd[0] > 0); @@ -1447,13 +1704,6 @@ fl = rxq->ifr_fl; for (int i = 0; i < rxq->ifr_nfl; i++, fl++) { - fl->ifl_sds = malloc(sizeof(struct iflib_sw_rx_desc) * - scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, - M_WAITOK | M_ZERO); - if (fl->ifl_sds == NULL) { - device_printf(dev, "Unable to allocate rx sw desc memory\n"); - return (ENOMEM); - } fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */ err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, 0, /* alignment, bounds */ @@ -1472,16 +1722,48 @@ __func__, err); goto fail; } + if (!(fl->ifl_sds.ifsd_flags = + (uint8_t *) malloc(sizeof(uint8_t) * + scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { + device_printf(dev, "Unable to allocate rx buffer memory\n"); + err = ENOMEM; + goto fail; + } + if (!(fl->ifl_sds.ifsd_m = + (struct mbuf **) malloc(sizeof(struct mbuf *) * + scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { + device_printf(dev, "Unable to allocate rx buffer memory\n"); + err = ENOMEM; + goto fail; + } + if (!(fl->ifl_sds.ifsd_cl = + (caddr_t *) malloc(sizeof(caddr_t) * + scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { + device_printf(dev, "Unable to allocate rx buffer memory\n"); + err = ENOMEM; + goto fail; + } - rxsd = fl->ifl_sds; - for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++, rxsd++) { - err = bus_dmamap_create(fl->ifl_desc_tag, 0, &rxsd->ifsd_map); - if (err) { - device_printf(dev, "%s: bus_dmamap_create failed: %d\n", - __func__, err); + /* Create the descriptor buffer dma maps */ +#if defined(ACPI_DMAR) || (!
(defined(__i386__) || defined(__amd64__))) + if ((ctx->ifc_flags & IFC_DMAR) == 0) + continue; + + if (!(fl->ifl_sds.ifsd_map = + (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) { + device_printf(dev, "Unable to allocate rx buffer map memory\n"); + err = ENOMEM; + goto fail; + } + + for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++) { + err = bus_dmamap_create(fl->ifl_desc_tag, 0, &fl->ifl_sds.ifsd_map[i]); + if (err != 0) { + device_printf(dev, "Unable to create RX buffer DMA map\n"); + goto fail; + } + } +#endif } return (0); @@ -1531,20 +1813,32 @@ _iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count) { struct mbuf *m; - int pidx = fl->ifl_pidx; - iflib_rxsd_t rxsd = &fl->ifl_sds[pidx]; - caddr_t cl; + int idx, frag_idx = fl->ifl_fragidx; + int pidx = fl->ifl_pidx; + caddr_t cl, *sd_cl; + struct mbuf **sd_m; + uint8_t *sd_flags; + struct if_rxd_update iru; + bus_dmamap_t *sd_map; int n, i = 0; uint64_t bus_addr; int err; + qidx_t credits; + sd_m = fl->ifl_sds.ifsd_m; + sd_map = fl->ifl_sds.ifsd_map; + sd_cl = fl->ifl_sds.ifsd_cl; + sd_flags = fl->ifl_sds.ifsd_flags; + idx = pidx; + credits = fl->ifl_credits; + n = count; MPASS(n > 0); - MPASS(fl->ifl_credits + n <= fl->ifl_size); + MPASS(credits + n <= fl->ifl_size); if (pidx < fl->ifl_cidx) MPASS(pidx + n <= fl->ifl_cidx); - if (pidx == fl->ifl_cidx && (fl->ifl_credits < fl->ifl_size)) + if (pidx == fl->ifl_cidx && (credits < fl->ifl_size)) MPASS(fl->ifl_gen == 0); if (pidx > fl->ifl_cidx) MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx); @@ -1552,7 +1846,7 @@ DBG_COUNTER_INC(fl_refills); if (n > 8) DBG_COUNTER_INC(fl_refills_large); - + iru_init(&iru, fl->ifl_rxq, fl->ifl_id); while (n--) { /* * We allocate an uninitialized mbuf + cluster, mbuf is * initialized after rx. * * If the cluster is still set then we know a minimum sized packet was received */ - if ((cl = rxsd->ifsd_cl) == NULL) { - if ((cl = rxsd->ifsd_cl = m_cljget(NULL, M_NOWAIT, fl->ifl_buf_size)) == NULL) + bit_ffc_at(fl->ifl_rx_bitmap, frag_idx, fl->ifl_size, &frag_idx); + if ((frag_idx < 0) || (frag_idx >= fl->ifl_size)) + bit_ffc(fl->ifl_rx_bitmap, fl->ifl_size, &frag_idx); + if ((cl = sd_cl[frag_idx]) == NULL) { + if ((cl = sd_cl[frag_idx] = m_cljget(NULL, M_NOWAIT, fl->ifl_buf_size)) == NULL) break; #if MEMORY_LOGGING fl->ifl_cl_enqueued++; @@ -1575,19 +1872,6 @@ #endif DBG_COUNTER_INC(rx_allocs); -#ifdef notyet - if ((rxsd->ifsd_flags & RX_SW_DESC_MAP_CREATED) == 0) { - int err; - - if ((err = bus_dmamap_create(fl->ifl_ifdi->idi_tag, 0, &rxsd->ifsd_map))) { - log(LOG_WARNING, "bus_dmamap_create failed %d\n", err); - uma_zfree(fl->ifl_zone, cl); - n = 0; - goto done; - } - rxsd->ifsd_flags |= RX_SW_DESC_MAP_CREATED; - } -#endif #if defined(__i386__) || defined(__amd64__) if (!IS_DMAR(ctx)) { bus_addr = pmap_kextract((vm_offset_t)cl); @@ -1599,8 +1883,12 @@ cb_arg.error = 0; q = fl->ifl_rxq; - err = bus_dmamap_load(fl->ifl_desc_tag, rxsd->ifsd_map, + MPASS(sd_map != NULL); + MPASS(sd_map[frag_idx] != NULL); + err = bus_dmamap_load(fl->ifl_desc_tag, sd_map[frag_idx], cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg, 0); + bus_dmamap_sync(fl->ifl_desc_tag, sd_map[frag_idx], + BUS_DMASYNC_PREREAD); if (err != 0 || cb_arg.error) { /* @@ -1614,36 +1902,52 @@ } bus_addr = cb_arg.seg.ds_addr; } - rxsd->ifsd_flags |= RX_SW_DESC_INUSE; + bit_set(fl->ifl_rx_bitmap, frag_idx); + sd_flags[frag_idx] |= RX_SW_DESC_INUSE; - MPASS(rxsd->ifsd_m == NULL); - rxsd->ifsd_cl = cl; - rxsd->ifsd_m = m; +
MPASS(sd_m[frag_idx] == NULL); + sd_cl[frag_idx] = cl; + sd_m[frag_idx] = m; + fl->ifl_rxd_idxs[i] = frag_idx; fl->ifl_bus_addrs[i] = bus_addr; fl->ifl_vm_addrs[i] = cl; - rxsd++; - fl->ifl_credits++; + credits++; i++; - MPASS(fl->ifl_credits <= fl->ifl_size); - if (++fl->ifl_pidx == fl->ifl_size) { - fl->ifl_pidx = 0; + MPASS(credits <= fl->ifl_size); + if (++idx == fl->ifl_size) { fl->ifl_gen = 1; - rxsd = fl->ifl_sds; + idx = 0; } if (n == 0 || i == IFLIB_MAX_RX_REFRESH) { - ctx->isc_rxd_refill(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx, - fl->ifl_bus_addrs, fl->ifl_vm_addrs, i, fl->ifl_buf_size); + iru.iru_pidx = pidx; + iru.iru_count = i; + ctx->isc_rxd_refill(ctx->ifc_softc, &iru); i = 0; - pidx = fl->ifl_pidx; + pidx = idx; + fl->ifl_pidx = idx; + fl->ifl_credits = credits; } + } done: + if (i) { + iru.iru_pidx = pidx; + iru.iru_count = i; + ctx->isc_rxd_refill(ctx->ifc_softc, &iru); + fl->ifl_pidx = idx; + fl->ifl_credits = credits; + } DBG_COUNTER_INC(rxd_flush); if (fl->ifl_pidx == 0) pidx = fl->ifl_size - 1; else pidx = fl->ifl_pidx - 1; + + if (sd_map) + bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx); + fl->ifl_fragidx = frag_idx; } static __inline void @@ -1669,33 +1973,46 @@ uint32_t i; for (i = 0; i < fl->ifl_size; i++) { - iflib_rxsd_t d = &fl->ifl_sds[i]; + struct mbuf **sd_m = &fl->ifl_sds.ifsd_m[i]; + uint8_t *sd_flags = &fl->ifl_sds.ifsd_flags[i]; + caddr_t *sd_cl = &fl->ifl_sds.ifsd_cl[i]; - if (d->ifsd_flags & RX_SW_DESC_INUSE) { - bus_dmamap_unload(fl->ifl_desc_tag, d->ifsd_map); - bus_dmamap_destroy(fl->ifl_desc_tag, d->ifsd_map); - if (d->ifsd_m != NULL) { - m_init(d->ifsd_m, M_NOWAIT, MT_DATA, 0); - uma_zfree(zone_mbuf, d->ifsd_m); + if (*sd_flags & RX_SW_DESC_INUSE) { + if (fl->ifl_sds.ifsd_map != NULL) { + bus_dmamap_t sd_map = fl->ifl_sds.ifsd_map[i]; + bus_dmamap_unload(fl->ifl_desc_tag, sd_map); + if (fl->ifl_rxq->ifr_ctx->ifc_in_detach) + bus_dmamap_destroy(fl->ifl_desc_tag, sd_map); } - if (d->ifsd_cl != NULL) - uma_zfree(fl->ifl_zone, d->ifsd_cl); - d->ifsd_flags = 0; + if (*sd_m != NULL) { + m_init(*sd_m, M_NOWAIT, MT_DATA, 0); + uma_zfree(zone_mbuf, *sd_m); + } + if (*sd_cl != NULL) + uma_zfree(fl->ifl_zone, *sd_cl); + *sd_flags = 0; } else { - MPASS(d->ifsd_cl == NULL); - MPASS(d->ifsd_m == NULL); + MPASS(*sd_cl == NULL); + MPASS(*sd_m == NULL); } #if MEMORY_LOGGING fl->ifl_m_dequeued++; fl->ifl_cl_dequeued++; #endif - d->ifsd_cl = NULL; - d->ifsd_m = NULL; + *sd_cl = NULL; + *sd_m = NULL; } +#ifdef INVARIANTS + for (i = 0; i < fl->ifl_size; i++) { + MPASS(fl->ifl_sds.ifsd_flags[i] == 0); + MPASS(fl->ifl_sds.ifsd_cl[i] == NULL); + MPASS(fl->ifl_sds.ifsd_m[i] == NULL); + } +#endif /* * Reset free list values */ - fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = 0;; + fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = fl->ifl_fragidx = 0; bzero(idi->idi_vaddr, idi->idi_size); } @@ -1711,6 +2028,7 @@ if_ctx_t ctx = rxq->ifr_ctx; if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; + bit_nclear(fl->ifl_rx_bitmap, 0, fl->ifl_size - 1); /* ** Free current RX buffer structs and their mbufs */ @@ -1723,12 +2041,17 @@ */ if (sctx->isc_max_frame_size <= 2048) fl->ifl_buf_size = MCLBYTES; +#ifndef CONTIGMALLOC_WORKS + else + fl->ifl_buf_size = MJUMPAGESIZE; +#else else if (sctx->isc_max_frame_size <= 4096) fl->ifl_buf_size = MJUMPAGESIZE; else if (sctx->isc_max_frame_size <= 9216) fl->ifl_buf_size = 
MJUM9BYTES; else fl->ifl_buf_size = MJUM16BYTES; +#endif if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size) ctx->ifc_max_fl_buf_size = fl->ifl_buf_size; fl->ifl_cltype = m_gettype(fl->ifl_buf_size); @@ -1770,10 +2093,14 @@ bus_dma_tag_destroy(fl->ifl_desc_tag); fl->ifl_desc_tag = NULL; } + free(fl->ifl_sds.ifsd_m, M_IFLIB); + free(fl->ifl_sds.ifsd_cl, M_IFLIB); + /* XXX destroy maps first */ + free(fl->ifl_sds.ifsd_map, M_IFLIB); + fl->ifl_sds.ifsd_m = NULL; + fl->ifl_sds.ifsd_cl = NULL; + fl->ifl_sds.ifsd_map = NULL; } - if (rxq->ifr_fl->ifl_sds != NULL) - free(rxq->ifr_fl->ifl_sds, M_IFLIB); - free(rxq->ifr_fl, M_IFLIB); rxq->ifr_fl = NULL; rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0; @@ -1789,7 +2116,7 @@ { iflib_txq_t txq = arg; if_ctx_t ctx = txq->ift_ctx; - if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; + if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) return; @@ -1800,28 +2127,32 @@ */ IFDI_TIMER(ctx, txq->ift_id); if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) && - (ctx->ifc_pause_frames == 0)) + ((txq->ift_cleaned_prev == txq->ift_cleaned) || + (sctx->isc_pause_frames == 0))) goto hung; - if (TXQ_AVAIL(txq) <= 2*scctx->isc_tx_nsegments || - ifmp_ring_is_stalled(txq->ift_br[0])) + if (ifmp_ring_is_stalled(txq->ift_br)) + txq->ift_qstatus = IFLIB_QUEUE_HUNG; + txq->ift_cleaned_prev = txq->ift_cleaned; + /* handle any laggards */ + if (txq->ift_db_pending) GROUPTASK_ENQUEUE(&txq->ift_task); - ctx->ifc_pause_frames = 0; + sctx->isc_pause_frames = 0; if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu); return; hung: CTX_LOCK(ctx); - if_setdrvflagbits(ctx->ifc_ifp, 0, IFF_DRV_RUNNING); + if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); device_printf(ctx->ifc_dev, "TX(%d) desc avail = %d, pidx = %d\n", txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx); IFDI_WATCHDOG_RESET(ctx); ctx->ifc_watchdog_events++; - ctx->ifc_pause_frames = 0; - iflib_init_locked(ctx); + ctx->ifc_flags |= IFC_DO_RESET; + iflib_admin_intr_deferred(ctx); CTX_UNLOCK(ctx); } @@ -1829,22 +2160,25 @@ iflib_init_locked(if_ctx_t ctx) { if_softc_ctx_t sctx = &ctx->ifc_softc_ctx; + if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; if_t ifp = ctx->ifc_ifp; iflib_fl_t fl; iflib_txq_t txq; iflib_rxq_t rxq; - int i, j; + int i, j, tx_ip_csum_flags, tx_ip6_csum_flags; if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); IFDI_INTR_DISABLE(ctx); + tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP); + tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP); /* Set hardware offload abilities */ if_clearhwassist(ifp); if (if_getcapenable(ifp) & IFCAP_TXCSUM) - if_sethwassistbits(ifp, CSUM_IP | CSUM_TCP | CSUM_UDP, 0); + if_sethwassistbits(ifp, tx_ip_csum_flags, 0); if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) - if_sethwassistbits(ifp, (CSUM_TCP_IPV6 | CSUM_UDP_IPV6), 0); + if_sethwassistbits(ifp, tx_ip6_csum_flags, 0); if (if_getcapenable(ifp) & IFCAP_TSO4) if_sethwassistbits(ifp, CSUM_IP_TSO, 0); if (if_getcapenable(ifp) & IFCAP_TSO6) @@ -1853,19 +2187,21 @@ for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) { CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); - callout_stop(&txq->ift_db_check); CALLOUT_UNLOCK(txq); iflib_netmap_txq_init(ctx, txq); } - for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) { - iflib_netmap_rxq_init(ctx, rxq); - } #ifdef INVARIANTS i = 
if_getdrvflags(ifp); #endif IFDI_INIT(ctx); MPASS(if_getdrvflags(ifp) == i); for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) { + /* XXX this should really be done on a per-queue basis */ + if (if_getcapenable(ifp) & IFCAP_NETMAP) { + MPASS(rxq->ifr_id == i); + iflib_netmap_rxq_init(ctx, rxq); + continue; + } for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) { if (iflib_fl_setup(fl)) { device_printf(ctx->ifc_dev, "freelist setup failed - check cluster settings\n"); @@ -1920,102 +2256,159 @@ if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); IFDI_INTR_DISABLE(ctx); - msleep(ctx, &ctx->ifc_mtx, PUSER, "iflib_init", hz); + DELAY(1000); + IFDI_STOP(ctx); + DELAY(1000); + iflib_debug_reset(); /* Wait for current tx queue users to exit to disarm watchdog timer. */ for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) { /* make sure all transmitters have completed before proceeding XXX */ + CALLOUT_LOCK(txq); + callout_stop(&txq->ift_timer); + CALLOUT_UNLOCK(txq); + /* clean any enqueued buffers */ - iflib_txq_check_drain(txq, 0); + iflib_ifmp_purge(txq); /* Free any existing tx buffers. */ for (j = 0; j < txq->ift_size; j++) { iflib_txsd_free(ctx, txq, j); } txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0; - txq->ift_in_use = txq->ift_cidx = txq->ift_pidx = txq->ift_no_desc_avail = 0; + txq->ift_in_use = txq->ift_gen = txq->ift_cidx = txq->ift_pidx = txq->ift_no_desc_avail = 0; txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0; txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0; txq->ift_pullups = 0; - ifmp_ring_reset_stats(txq->ift_br[0]); + ifmp_ring_reset_stats(txq->ift_br); for (j = 0, di = txq->ift_ifdi; j < ctx->ifc_nhwtxqs; j++, di++) bzero((void *)di->idi_vaddr, di->idi_size); } for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) { /* make sure all transmitters have completed before proceeding XXX */ - for (j = 0, di = txq->ift_ifdi; j < ctx->ifc_nhwrxqs; j++, di++) + for (j = 0, di = rxq->ifr_ifdi; j < rxq->ifr_nfl; j++, di++) bzero((void *)di->idi_vaddr, di->idi_size); /* also resets the free lists pidx/cidx */ for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) iflib_fl_bufs_free(fl); } - IFDI_STOP(ctx); } -static iflib_rxsd_t -rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int *cltype, int unload) +static inline caddr_t +calc_next_rxd(iflib_fl_t fl, int cidx) { + qidx_t size; + int nrxd; + caddr_t start, end, cur, next; + + nrxd = fl->ifl_size; + size = fl->ifl_rxd_size; + start = fl->ifl_ifdi->idi_vaddr; + + if (__predict_false(size == 0)) + return (start); + cur = start + size*cidx; + end = start + size*nrxd; + next = CACHE_PTR_NEXT(cur); + return (next < end ? 
next : start); +} + +static inline void +prefetch_pkts(iflib_fl_t fl, int cidx) +{ + int nextptr; + int nrxd = fl->ifl_size; + caddr_t next_rxd; + + + nextptr = (cidx + CACHE_PTR_INCREMENT) & (nrxd-1); + prefetch(&fl->ifl_sds.ifsd_m[nextptr]); + prefetch(&fl->ifl_sds.ifsd_cl[nextptr]); + next_rxd = calc_next_rxd(fl, cidx); + prefetch(next_rxd); + prefetch(fl->ifl_sds.ifsd_m[(cidx + 1) & (nrxd-1)]); + prefetch(fl->ifl_sds.ifsd_m[(cidx + 2) & (nrxd-1)]); + prefetch(fl->ifl_sds.ifsd_m[(cidx + 3) & (nrxd-1)]); + prefetch(fl->ifl_sds.ifsd_m[(cidx + 4) & (nrxd-1)]); + prefetch(fl->ifl_sds.ifsd_cl[(cidx + 1) & (nrxd-1)]); + prefetch(fl->ifl_sds.ifsd_cl[(cidx + 2) & (nrxd-1)]); + prefetch(fl->ifl_sds.ifsd_cl[(cidx + 3) & (nrxd-1)]); + prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]); +} + +static void +rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int unload, if_rxsd_t sd) +{ int flid, cidx; - iflib_rxsd_t sd; + bus_dmamap_t map; iflib_fl_t fl; iflib_dma_info_t di; + int next; + map = NULL; flid = irf->irf_flid; cidx = irf->irf_idx; fl = &rxq->ifr_fl[flid]; + sd->ifsd_fl = fl; + sd->ifsd_cidx = cidx; + sd->ifsd_m = &fl->ifl_sds.ifsd_m[cidx]; + sd->ifsd_cl = &fl->ifl_sds.ifsd_cl[cidx]; fl->ifl_credits--; #if MEMORY_LOGGING fl->ifl_m_dequeued++; - if (cltype) - fl->ifl_cl_dequeued++; #endif - sd = &fl->ifl_sds[cidx]; - di = fl->ifl_ifdi; - bus_dmamap_sync(di->idi_tag, di->idi_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + if (rxq->ifr_ctx->ifc_flags & IFC_PREFETCH) + prefetch_pkts(fl, cidx); + if (fl->ifl_sds.ifsd_map != NULL) { + next = (cidx + CACHE_PTR_INCREMENT) & (fl->ifl_size-1); + prefetch(&fl->ifl_sds.ifsd_map[next]); + map = fl->ifl_sds.ifsd_map[cidx]; + di = fl->ifl_ifdi; + next = (cidx + CACHE_LINE_SIZE) & (fl->ifl_size-1); + prefetch(&fl->ifl_sds.ifsd_flags[next]); + bus_dmamap_sync(di->idi_tag, di->idi_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* not valid assert if bxe really does SGE from non-contiguous elements */ - MPASS(fl->ifl_cidx == cidx); - if (unload) - bus_dmamap_unload(fl->ifl_desc_tag, sd->ifsd_map); - - if (__predict_false(++fl->ifl_cidx == fl->ifl_size)) { - fl->ifl_cidx = 0; - fl->ifl_gen = 0; + MPASS(fl->ifl_cidx == cidx); + if (unload) + bus_dmamap_unload(fl->ifl_desc_tag, map); } - /* YES ick */ - if (cltype) - *cltype = fl->ifl_cltype; - return (sd); + fl->ifl_cidx = (fl->ifl_cidx + 1) & (fl->ifl_size-1); + if (__predict_false(fl->ifl_cidx == 0)) + fl->ifl_gen = 0; + if (map != NULL) + bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + bit_clear(fl->ifl_rx_bitmap, cidx); } static struct mbuf * -assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri) +assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri, if_rxsd_t sd) { - int i, padlen , flags, cltype; + int i, padlen , flags; struct mbuf *m, *mh, *mt; - iflib_rxsd_t sd; caddr_t cl; i = 0; mh = NULL; do { - sd = rxd_frag_to_sd(rxq, &ri->iri_frags[i], &cltype, TRUE); + rxd_frag_to_sd(rxq, &ri->iri_frags[i], TRUE, sd); - MPASS(sd->ifsd_cl != NULL); - MPASS(sd->ifsd_m != NULL); + MPASS(*sd->ifsd_cl != NULL); + MPASS(*sd->ifsd_m != NULL); /* Don't include zero-length frags */ if (ri->iri_frags[i].irf_len == 0) { /* XXX we can save the cluster here, but not the mbuf */ - m_init(sd->ifsd_m, M_NOWAIT, MT_DATA, 0); - m_free(sd->ifsd_m); - sd->ifsd_m = NULL; + m_init(*sd->ifsd_m, M_NOWAIT, MT_DATA, 0); + m_free(*sd->ifsd_m); + *sd->ifsd_m = NULL; continue; } - - m = sd->ifsd_m; + m = *sd->ifsd_m; + *sd->ifsd_m = NULL; if (mh == NULL) { flags 
= M_PKTHDR|M_EXT; mh = mt = m; @@ -2027,13 +2420,12 @@ /* assuming padding is only on the first fragment */ padlen = 0; } - sd->ifsd_m = NULL; - cl = sd->ifsd_cl; - sd->ifsd_cl = NULL; + cl = *sd->ifsd_cl; + *sd->ifsd_cl = NULL; /* Can these two be made one? */ m_init(m, M_NOWAIT, MT_DATA, flags); - m_cljset(m, cl, cltype); + m_cljset(m, cl, sd->ifsd_fl->ifl_cltype); /* * These must follow m_init and m_cljset */ @@ -2051,20 +2443,24 @@ static struct mbuf * iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri) { + struct if_rxsd sd; struct mbuf *m; - iflib_rxsd_t sd; /* should I merge this back in now that the two paths are basically duplicated? */ if (ri->iri_nfrags == 1 && ri->iri_frags[0].irf_len <= MIN(IFLIB_RX_COPY_THRESH, MHLEN)) { - sd = rxd_frag_to_sd(rxq, &ri->iri_frags[0], NULL, FALSE); - m = sd->ifsd_m; - sd->ifsd_m = NULL; + rxd_frag_to_sd(rxq, &ri->iri_frags[0], FALSE, &sd); + m = *sd.ifsd_m; + *sd.ifsd_m = NULL; m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR); - memcpy(m->m_data, sd->ifsd_cl, ri->iri_len); +#ifndef __NO_STRICT_ALIGNMENT + if (!IP_ALIGNED(m)) + m->m_data += 2; +#endif + memcpy(m->m_data, *sd.ifsd_cl, ri->iri_len); m->m_len = ri->iri_frags[0].irf_len; } else { - m = assemble_segments(rxq, ri); + m = assemble_segments(rxq, ri, &sd); } m->m_pkthdr.len = ri->iri_len; m->m_pkthdr.rcvif = ri->iri_ifp; @@ -2077,29 +2473,76 @@ return (m); } +#if defined(INET6) || defined(INET) +static void +iflib_get_ip_forwarding(struct lro_ctrl *lc, bool *v4, bool *v6) +{ + CURVNET_SET(lc->ifp->if_vnet); +#if defined(INET6) + *v6 = VNET(ip6_forwarding); +#endif +#if defined(INET) + *v4 = VNET(ipforwarding); +#endif + CURVNET_RESTORE(); +} + +/* + * Returns true if it's possible this packet could be LROed. + * If it returns false, it is guaranteed that tcp_lro_rx() + * would not return zero.
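 * For illustration only (editor's sketch, not part of the patch): the
 * rxeof loop later in this change uses the check roughly like this,
 * combining it with the hardware L4-checksum bits before handing a
 * packet to LRO:
 *
 *	if (lro_enabled && iflib_check_lro_possible(m, v4_forwarding, v6_forwarding) &&
 *	    (m->m_pkthdr.csum_flags & (CSUM_L4_CALC | CSUM_L4_VALID)) ==
 *	    (CSUM_L4_CALC | CSUM_L4_VALID) &&
 *	    tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0)
 *		continue;	/* packet consumed by LRO */
 *	ifp->if_input(ifp, m);	/* otherwise deliver it directly */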
+ */ static bool -iflib_rxeof(iflib_rxq_t rxq, int budget) +iflib_check_lro_possible(struct mbuf *m, bool v4_forwarding, bool v6_forwarding) { + struct ether_header *eh; + uint16_t eh_type; + + eh = mtod(m, struct ether_header *); + eh_type = ntohs(eh->ether_type); + switch (eh_type) { +#if defined(INET6) + case ETHERTYPE_IPV6: + return !v6_forwarding; +#endif +#if defined (INET) + case ETHERTYPE_IP: + return !v4_forwarding; +#endif + } + + return false; +} +#else +static void +iflib_get_ip_forwarding(struct lro_ctrl *lc __unused, bool *v4 __unused, bool *v6 __unused) +{ +} +#endif + +static bool +iflib_rxeof(iflib_rxq_t rxq, qidx_t budget) +{ if_ctx_t ctx = rxq->ifr_ctx; if_shared_ctx_t sctx = ctx->ifc_sctx; if_softc_ctx_t scctx = &ctx->ifc_softc_ctx; int avail, i; - uint16_t *cidxp; + qidx_t *cidxp; struct if_rxd_info ri; int err, budget_left, rx_bytes, rx_pkts; iflib_fl_t fl; struct ifnet *ifp; int lro_enabled; + bool lro_possible = false; + bool v4_forwarding, v6_forwarding; + /* * XXX early demux data packets so that if_input processing only handles * acks in interrupt context */ - struct mbuf *m, *mh, *mt; + struct mbuf *m, *mh, *mt, *mf; - if (netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &budget)) { - return (FALSE); - } - + ifp = ctx->ifc_ifp; mh = mt = NULL; MPASS(budget > 0); rx_pkts = rx_bytes = 0; @@ -2122,18 +2565,19 @@ /* * Reset client set fields to their default values */ - bzero(&ri, sizeof(ri)); + rxd_info_zero(&ri); ri.iri_qsidx = rxq->ifr_id; ri.iri_cidx = *cidxp; - ri.iri_ifp = ctx->ifc_ifp; + ri.iri_ifp = ifp; ri.iri_frags = rxq->ifr_frags; err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri); - /* in lieu of handling correctly - make sure it isn't being unhandled */ - MPASS(err == 0); + if (err) + goto err; if (sctx->isc_flags & IFLIB_HAS_RXCQ) { *cidxp = ri.iri_cidx; /* Update our consumer index */ + /* XXX NB: shurd - check if this is still safe */ while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0]) { rxq->ifr_cq_cidx -= scctx->isc_nrxd[0]; rxq->ifr_cq_gen = 0; @@ -2166,20 +2610,52 @@ for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++) __iflib_fl_refill_lt(ctx, fl, budget + 8); - ifp = ctx->ifc_ifp; lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO); + if (lro_enabled) + iflib_get_ip_forwarding(&rxq->ifr_lc, &v4_forwarding, &v6_forwarding); + mt = mf = NULL; while (mh != NULL) { m = mh; mh = mh->m_nextpkt; m->m_nextpkt = NULL; +#ifndef __NO_STRICT_ALIGNMENT + if (!IP_ALIGNED(m) && (m = iflib_fixup_rx(m)) == NULL) + continue; +#endif rx_bytes += m->m_pkthdr.len; rx_pkts++; #if defined(INET6) || defined(INET) - if (lro_enabled && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0) - continue; + if (lro_enabled) { + if (!lro_possible) { + lro_possible = iflib_check_lro_possible(m, v4_forwarding, v6_forwarding); + if (lro_possible && mf != NULL) { + ifp->if_input(ifp, mf); + DBG_COUNTER_INC(rx_if_input); + mt = mf = NULL; + } + } + if ((m->m_pkthdr.csum_flags & (CSUM_L4_CALC|CSUM_L4_VALID)) == + (CSUM_L4_CALC|CSUM_L4_VALID)) { + if (lro_possible && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0) + continue; + } + } #endif + if (lro_possible) { + ifp->if_input(ifp, m); + DBG_COUNTER_INC(rx_if_input); + continue; + } + + if (mf == NULL) + mf = m; + if (mt != NULL) + mt->m_nextpkt = m; + mt = m; + } + if (mf != NULL) { + ifp->if_input(ifp, mf); DBG_COUNTER_INC(rx_if_input); - ifp->if_input(ifp, m); } if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes); @@ -2194,45 +2670,78 @@ if (avail) return true; return (iflib_rxd_avail(ctx, rxq, *cidxp, 1)); +err: + CTX_LOCK(ctx); + ctx->ifc_flags |= IFC_DO_RESET; + 
iflib_admin_intr_deferred(ctx); + CTX_UNLOCK(ctx); + return (false); } +#define TXD_NOTIFY_COUNT(txq) (((txq)->ift_size / (txq)->ift_update_freq)-1) +static inline qidx_t +txq_max_db_deferred(iflib_txq_t txq, qidx_t in_use) +{ + qidx_t notify_count = TXD_NOTIFY_COUNT(txq); + qidx_t minthresh = txq->ift_size / 8; + if (in_use > 4*minthresh) + return (notify_count); + if (in_use > 2*minthresh) + return (notify_count >> 1); + if (in_use > minthresh) + return (notify_count >> 3); + return (0); +} + +static inline qidx_t +txq_max_rs_deferred(iflib_txq_t txq) +{ + qidx_t notify_count = TXD_NOTIFY_COUNT(txq); + qidx_t minthresh = txq->ift_size / 8; + if (txq->ift_in_use > 4*minthresh) + return (notify_count); + if (txq->ift_in_use > 2*minthresh) + return (notify_count >> 1); + if (txq->ift_in_use > minthresh) + return (notify_count >> 2); + return (2); +} + #define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags) #define M_HAS_VLANTAG(m) (m->m_flags & M_VLANTAG) -#define TXQ_MAX_DB_DEFERRED(size) (size >> 5) + +#define TXQ_MAX_DB_DEFERRED(txq, in_use) txq_max_db_deferred((txq), (in_use)) +#define TXQ_MAX_RS_DEFERRED(txq) txq_max_rs_deferred(txq) #define TXQ_MAX_DB_CONSUMED(size) (size >> 4) -static __inline void -iflib_txd_db_check(if_ctx_t ctx, iflib_txq_t txq, int ring) -{ - uint32_t dbval; +/* forward compatibility for cxgb */ +#define FIRST_QSET(ctx) 0 +#define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets) +#define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets) +#define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx)) +#define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments)) - if (ring || txq->ift_db_pending >= - TXQ_MAX_DB_DEFERRED(txq->ift_size)) { +/* XXX we should be setting this to something other than zero */ +#define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh) +#define MAX_TX_DESC(ctx) ((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max) - /* the lock will only ever be contended in the !min_latency case */ - if (!TXDB_TRYLOCK(txq)) - return; +static inline bool +iflib_txd_db_check(if_ctx_t ctx, iflib_txq_t txq, int ring, qidx_t in_use) +{ + qidx_t dbval, max; + bool rang; + + rang = false; + max = TXQ_MAX_DB_DEFERRED(txq, in_use); + if (ring || txq->ift_db_pending >= max) { dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx; ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval); txq->ift_db_pending = txq->ift_npending = 0; - TXDB_UNLOCK(txq); + rang = true; } + return (rang); } -static void -iflib_txd_deferred_db_check(void * arg) -{ - iflib_txq_t txq = arg; - - /* simple non-zero boolean so use bitwise OR */ - if ((txq->ift_db_pending | txq->ift_npending) && - txq->ift_db_pending >= txq->ift_db_pending_queued) - iflib_txd_db_check(txq->ift_ctx, txq, TRUE); - txq->ift_db_pending_queued = 0; - if (ifmp_ring_is_stalled(txq->ift_br[0])) - iflib_txq_check_drain(txq, 4); -} - #ifdef PKT_DEBUG static void print_pkt(if_pkt_info_t pi) @@ -2252,10 +2761,21 @@ static int iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp) { + if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx; struct ether_vlan_header *eh; struct mbuf *m, *n; n = m = *mp; + if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) && + M_WRITABLE(m) == 0) { + if ((m = m_dup(m, M_NOWAIT)) == NULL) { + return (ENOMEM); + } else { + m_freem(*mp); + n = *mp = m; + } + } + /* * Determine where frame payload starts. 
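 * (Editor's sketch of the idea, assuming the standard struct
 * ether_vlan_header layout; the real logic follows in the function body:
 *
 *	eh = mtod(m, struct ether_vlan_header *);
 *	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 *		pi->ipi_etype = ntohs(eh->evl_proto);
 *		pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 *	} else {
 *		pi->ipi_etype = ntohs(eh->evl_encap_proto);
 *		pi->ipi_ehdrlen = ETHER_HDR_LEN;
 *	}
 * )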
* Jump over vlan headers if already present, @@ -2319,26 +2839,30 @@ pi->ipi_ipproto = ip->ip_p; pi->ipi_flags |= IPI_TX_IPV4; - if (pi->ipi_csum_flags & CSUM_IP) + if ((sctx->isc_flags & IFLIB_NEED_ZERO_CSUM) && (pi->ipi_csum_flags & CSUM_IP)) ip->ip_sum = 0; - if (pi->ipi_ipproto == IPPROTO_TCP) { - if (__predict_false(th == NULL)) { - txq->ift_pullups++; - if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL)) - return (ENOMEM); - th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen); - } - pi->ipi_tcp_hflags = th->th_flags; - pi->ipi_tcp_hlen = th->th_off << 2; - pi->ipi_tcp_seq = th->th_seq; - } if (IS_TSO4(pi)) { + if (pi->ipi_ipproto == IPPROTO_TCP) { + if (__predict_false(th == NULL)) { + txq->ift_pullups++; + if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL)) + return (ENOMEM); + th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen); + } + pi->ipi_tcp_hflags = th->th_flags; + pi->ipi_tcp_hlen = th->th_off << 2; + pi->ipi_tcp_seq = th->th_seq; + } if (__predict_false(ip->ip_p != IPPROTO_TCP)) return (ENXIO); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz; + if (sctx->isc_flags & IFLIB_TSO_INIT_IP) { + ip->ip_sum = 0; + ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz); + } } break; } @@ -2360,15 +2884,15 @@ pi->ipi_ipproto = ip6->ip6_nxt; pi->ipi_flags |= IPI_TX_IPV6; - if (pi->ipi_ipproto == IPPROTO_TCP) { - if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) { - if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL)) - return (ENOMEM); - } - pi->ipi_tcp_hflags = th->th_flags; - pi->ipi_tcp_hlen = th->th_off << 2; - } if (IS_TSO6(pi)) { + if (pi->ipi_ipproto == IPPROTO_TCP) { + if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) { + if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL)) + return (ENOMEM); + } + pi->ipi_tcp_hflags = th->th_flags; + pi->ipi_tcp_hlen = th->th_off << 2; + } if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP)) return (ENXIO); @@ -2390,10 +2914,10 @@ break; } *mp = m; + return (0); } - static __noinline struct mbuf * collapse_pkthdr(struct mbuf *m0) { @@ -2460,8 +2984,8 @@ if_ctx_t ctx; if_shared_ctx_t sctx; if_softc_ctx_t scctx; - int i, next, pidx, mask, err, maxsegsz, ntxd, count; - struct mbuf *m, *tmp, **ifsd_m, **mp; + int i, next, pidx, err, ntxd, count; + struct mbuf *m, *tmp, **ifsd_m; m = *m0; @@ -2485,28 +3009,46 @@ if (err) return (err); ifsd_flags[pidx] |= TX_SW_DESC_MAPPED; - i = 0; - next = pidx; - mask = (txq->ift_size-1); + count = 0; m = *m0; do { - mp = &ifsd_m[next]; - *mp = m; + if (__predict_false(m->m_len <= 0)) { + tmp = m; + m = m->m_next; + tmp->m_next = NULL; + m_free(tmp); + continue; + } m = m->m_next; - if (__predict_false((*mp)->m_len == 0)) { - m_free(*mp); - *mp = NULL; - } else - next = (pidx + i) & (ntxd-1); + count++; } while (m != NULL); + if (count > *nsegs) { + ifsd_m[pidx] = *m0; + ifsd_m[pidx]->m_flags |= M_TOOBIG; + return (0); + } + m = *m0; + count = 0; + do { + next = (pidx + count) & (ntxd-1); + MPASS(ifsd_m[next] == NULL); + ifsd_m[next] = m; + count++; + tmp = m; + m = m->m_next; + } while (m != NULL); } else { - int buflen, sgsize, max_sgsize; + int buflen, sgsize, maxsegsz, max_sgsize; vm_offset_t vaddr; vm_paddr_t curaddr; count = i = 0; - maxsegsz 
= sctx->isc_tx_maxsize; m = *m0; + if (m->m_pkthdr.csum_flags & CSUM_TSO) + maxsegsz = scctx->isc_tx_tso_segsize_max; + else + maxsegsz = sctx->isc_tx_maxsegsize; + do { if (__predict_false(m->m_len <= 0)) { tmp = m; @@ -2528,6 +3070,8 @@ #endif ifsd_m[next] = m; while (buflen > 0) { + if (i >= max_segs) + goto err; max_sgsize = MIN(buflen, maxsegsz); curaddr = pmap_kextract(vaddr); sgsize = PAGE_SIZE - (curaddr & PAGE_MASK); @@ -2537,8 +3081,6 @@ vaddr += sgsize; buflen -= sgsize; i++; - if (i >= max_segs) - goto err; } count++; tmp = m; @@ -2552,6 +3094,67 @@ return (EFBIG); } +static inline caddr_t +calc_next_txd(iflib_txq_t txq, int cidx, uint8_t qid) +{ + qidx_t size; + int ntxd; + caddr_t start, end, cur, next; + + ntxd = txq->ift_size; + size = txq->ift_txd_size[qid]; + start = txq->ift_ifdi[qid].idi_vaddr; + + if (__predict_false(size == 0)) + return (start); + cur = start + size*cidx; + end = start + size*ntxd; + next = CACHE_PTR_NEXT(cur); + return (next < end ? next : start); } + +/* + * Pad an mbuf to ensure a minimum ethernet frame size. + * min_frame_size is the frame size (less CRC) to pad the mbuf to + */ +static __noinline int +iflib_ether_pad(device_t dev, struct mbuf **m_head, uint16_t min_frame_size) +{ + /* + * 18 is enough bytes to pad an ARP packet to 46 bytes, and + * an ARP message is the smallest common payload I can think of + */ + static char pad[18]; /* just zeros */ + int n; + struct mbuf *new_head; + + if (!M_WRITABLE(*m_head)) { + new_head = m_dup(*m_head, M_NOWAIT); + if (new_head == NULL) { + m_freem(*m_head); + device_printf(dev, "cannot pad short frame, m_dup() failed\n"); + DBG_COUNTER_INC(encap_pad_mbuf_fail); + return ENOMEM; + } + m_freem(*m_head); + *m_head = new_head; + } + + for (n = min_frame_size - (*m_head)->m_pkthdr.len; + n > 0; n -= sizeof(pad)) + if (!m_append(*m_head, min(n, sizeof(pad)), pad)) + break; + + if (n > 0) { + m_freem(*m_head); + device_printf(dev, "cannot pad short frame\n"); + DBG_COUNTER_INC(encap_pad_mbuf_fail); + return (ENOBUFS); + } + + return 0; +} + static int iflib_encap(iflib_txq_t txq, struct mbuf **m_headp) { @@ -2560,6 +3163,7 @@ if_softc_ctx_t scctx; bus_dma_segment_t *segs; struct mbuf *m_head; + void *next_txd; bus_dmamap_t map; struct if_pkt_info pi; int remap = 0; @@ -2580,18 +3184,23 @@ */ cidx = txq->ift_cidx; pidx = txq->ift_pidx; - next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1); + if (ctx->ifc_flags & IFC_PREFETCH) { + next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1); + if (!(ctx->ifc_flags & IFLIB_HAS_TXCQ)) { + next_txd = calc_next_txd(txq, cidx, 0); + prefetch(next_txd); + } - /* prefetch the next cache line of mbuf pointers and flags */ - prefetch(&txq->ift_sds.ifsd_m[next]); - if (txq->ift_sds.ifsd_map != NULL) { - prefetch(&txq->ift_sds.ifsd_map[next]); + /* prefetch the next cache line of mbuf pointers and flags */ + prefetch(&txq->ift_sds.ifsd_m[next]); + if (txq->ift_sds.ifsd_map != NULL) { + prefetch(&txq->ift_sds.ifsd_map[next]); + next = (cidx + CACHE_LINE_SIZE) & (ntxd-1); + prefetch(&txq->ift_sds.ifsd_flags[next]); + } + } else if (txq->ift_sds.ifsd_map != NULL) map = txq->ift_sds.ifsd_map[pidx]; - next = (cidx + CACHE_LINE_SIZE) & (ntxd-1); - prefetch(&txq->ift_sds.ifsd_flags[next]); - } - if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { desc_tag = txq->ift_tso_desc_tag; max_segs = scctx->isc_tx_tso_segments_max; @@ -2599,14 +3208,21 @@ desc_tag = txq->ift_desc_tag; max_segs = scctx->isc_tx_nsegments; } + if ((sctx->isc_flags & IFLIB_NEED_ETHER_PAD) && + __predict_false(m_head->m_pkthdr.len <
scctx->isc_min_frame_size)) { + err = iflib_ether_pad(ctx->ifc_dev, m_headp, scctx->isc_min_frame_size); + if (err) + return err; + } m_head = *m_headp; - bzero(&pi, sizeof(pi)); - pi.ipi_len = m_head->m_pkthdr.len; + + pkt_info_zero(&pi); pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST)); - pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags; - pi.ipi_vtag = (m_head->m_flags & M_VLANTAG) ? m_head->m_pkthdr.ether_vtag : 0; pi.ipi_pidx = pidx; pi.ipi_qsidx = txq->ift_id; + pi.ipi_len = m_head->m_pkthdr.len; + pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags; + pi.ipi_vtag = (m_head->m_flags & M_VLANTAG) ? m_head->m_pkthdr.ether_vtag : 0; /* deliberate bitwise OR to make one condition */ if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) { @@ -2662,6 +3278,19 @@ GROUPTASK_ENQUEUE(&txq->ift_task); return (ENOBUFS); } + /* + * On Intel cards we can greatly reduce the number of TX interrupts + * we see by only setting report status on every Nth descriptor. + * However, this also means that the driver will need to keep track + * of the descriptors that RS was set on to check them for the DD bit. + */ + txq->ift_rs_pending += nsegs + 1; + if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) || + iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs - 1) <= MAX_TX_DESC(ctx)) { + pi.ipi_flags |= IPI_TX_INTR; + txq->ift_rs_pending = 0; + } + pi.ipi_segs = segs; pi.ipi_nsegs = nsegs; @@ -2669,22 +3298,29 @@ #ifdef PKT_DEBUG print_pkt(&pi); #endif + if (map != NULL) + bus_dmamap_sync(desc_tag, map, BUS_DMASYNC_PREWRITE); if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) { - bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - + if (map != NULL) + bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); DBG_COUNTER_INC(tx_encap); - MPASS(pi.ipi_new_pidx >= 0 && - pi.ipi_new_pidx < txq->ift_size); + MPASS(pi.ipi_new_pidx < txq->ift_size); ndesc = pi.ipi_new_pidx - pi.ipi_pidx; if (pi.ipi_new_pidx < pi.ipi_pidx) { ndesc += txq->ift_size; txq->ift_gen = 1; } + /* + * drivers can need as many as + * two sentinels + */ + MPASS(ndesc <= pi.ipi_nsegs + 2); MPASS(pi.ipi_new_pidx != pidx); MPASS(ndesc > 0); txq->ift_in_use += ndesc; + /* * We update the last software descriptor again here because there may * be a sentinel and/or there may be more mbufs than segments @@ -2709,36 +3345,6 @@ return (ENOMEM); } -/* forward compatibility for cxgb */ -#define FIRST_QSET(ctx) 0 - -#define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets) -#define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets) -#define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx)) -#define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments)) -#define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh) -#define MAX_TX_DESC(ctx) ((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max) - - - -/* if there are more than TXQ_MIN_OCCUPANCY packets pending we consider deferring - * doorbell writes - * - * ORing with 2 assures that min occupancy is never less than 2 without any conditional logic - */ -#define TXQ_MIN_OCCUPANCY(size) ((size >> 6)| 0x2) - -static inline int -iflib_txq_min_occupancy(iflib_txq_t txq) -{ - if_ctx_t ctx; - - ctx = txq->ift_ctx; - return (get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, - txq->ift_gen) < TXQ_MIN_OCCUPANCY(txq->ift_size) + - MAX_TX_DESC(ctx)); -} - static void 
iflib_tx_desc_free(iflib_txq_t txq, int n) { @@ -2747,6 +3353,7 @@ struct mbuf *m, **ifsd_m; uint8_t *ifsd_flags; bus_dmamap_t *ifsd_map; + bool do_prefetch; cidx = txq->ift_cidx; gen = txq->ift_gen; @@ -2756,11 +3363,13 @@ ifsd_flags = txq->ift_sds.ifsd_flags; ifsd_m = txq->ift_sds.ifsd_m; ifsd_map = txq->ift_sds.ifsd_map; + do_prefetch = (txq->ift_ctx->ifc_flags & IFC_PREFETCH); while (n--) { - prefetch(ifsd_m[(cidx + 3) & mask]); - prefetch(ifsd_m[(cidx + 4) & mask]); - + if (do_prefetch) { + prefetch(ifsd_m[(cidx + 3) & mask]); + prefetch(ifsd_m[(cidx + 4) & mask]); + } if (ifsd_m[cidx] != NULL) { prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]); prefetch(&ifsd_flags[(cidx + CACHE_PTR_INCREMENT) & mask]); @@ -2775,8 +3384,15 @@ if ((m = ifsd_m[cidx]) != NULL) { /* XXX we don't support any drivers that batch packets yet */ MPASS(m->m_nextpkt == NULL); - - m_free(m); + /* if the number of clusters exceeds the number of segments + * there won't be space on the ring to save a pointer to each + * cluster so we simply free the list here + */ + if (m->m_flags & M_TOOBIG) { + m_freem(m); + } else { + m_free(m); + } ifsd_m[cidx] = NULL; #if MEMORY_LOGGING txq->ift_dequeued++; @@ -2823,24 +3439,34 @@ txq->ift_cleaned += reclaim; txq->ift_in_use -= reclaim; - if (txq->ift_active == FALSE) - txq->ift_active = TRUE; - return (reclaim); } static struct mbuf ** -_ring_peek_one(struct ifmp_ring *r, int cidx, int offset) +_ring_peek_one(struct ifmp_ring *r, int cidx, int offset, int remaining) { + int next, size; + struct mbuf **items; - return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (r->size-1)])); + size = r->size; + next = (cidx + CACHE_PTR_INCREMENT) & (size-1); + items = __DEVOLATILE(struct mbuf **, &r->items[0]); + + prefetch(items[(cidx + offset) & (size-1)]); + if (remaining > 1) { + prefetch2cachelines(&items[next]); + prefetch2cachelines(items[(cidx + offset + 1) & (size-1)]); + prefetch2cachelines(items[(cidx + offset + 2) & (size-1)]); + prefetch2cachelines(items[(cidx + offset + 3) & (size-1)]); + } + return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (size-1)])); } static void iflib_txq_check_drain(iflib_txq_t txq, int budget) { - ifmp_ring_check_drainage(txq->ift_br[0], budget); + ifmp_ring_check_drainage(txq->ift_br, budget); } static uint32_t @@ -2849,8 +3475,8 @@ iflib_txq_t txq = r->cookie; if_ctx_t ctx = txq->ift_ctx; - return ((TXQ_AVAIL(txq) >= MAX_TX_DESC(ctx)) || - ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, txq->ift_cidx_processed, false)); + return ((TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2) || + ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, false)); } static uint32_t @@ -2858,16 +3484,19 @@ { iflib_txq_t txq = r->cookie; if_ctx_t ctx = txq->ift_ctx; - if_t ifp = ctx->ifc_ifp; + struct ifnet *ifp = ctx->ifc_ifp; struct mbuf **mp, *m; - int i, count, consumed, pkt_sent, bytes_sent, mcast_sent, avail, err, in_use_prev, desc_used; + int i, count, consumed, pkt_sent, bytes_sent, mcast_sent, avail; + int reclaimed, err, in_use_prev, desc_used; + bool do_prefetch, ring, rang; if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) || !LINK_ACTIVE(ctx))) { DBG_COUNTER_INC(txq_drain_notready); return (0); } - + reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); + rang = iflib_txd_db_check(ctx, txq, reclaimed, txq->ift_in_use); avail = IDXDIFF(pidx, cidx, r->size); if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) { DBG_COUNTER_INC(txq_drain_flushing); @@ -2877,77 +3506,153 @@ } return (avail); } - 
iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); + if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) { txq->ift_qstatus = IFLIB_QUEUE_IDLE; CALLOUT_LOCK(txq); callout_stop(&txq->ift_timer); - callout_stop(&txq->ift_db_check); CALLOUT_UNLOCK(txq); DBG_COUNTER_INC(txq_drain_oactive); return (0); } + if (reclaimed) + txq->ift_qstatus = IFLIB_QUEUE_IDLE; consumed = mcast_sent = bytes_sent = pkt_sent = 0; count = MIN(avail, TX_BATCH_SIZE); +#ifdef INVARIANTS + if (iflib_verbose_debug) + printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__, + avail, ctx->ifc_flags, TXQ_AVAIL(txq)); +#endif + do_prefetch = (ctx->ifc_flags & IFC_PREFETCH); + avail = TXQ_AVAIL(txq); + for (desc_used = i = 0; i < count && avail > MAX_TX_DESC(ctx) + 2; i++) { + int pidx_prev, rem = do_prefetch ? count - i : 0; - for (desc_used = i = 0; i < count && TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2; i++) { - mp = _ring_peek_one(r, cidx, i); + mp = _ring_peek_one(r, cidx, i, rem); + MPASS(mp != NULL && *mp != NULL); + if (__predict_false(*mp == (struct mbuf *)txq)) { + consumed++; + reclaimed++; + continue; + } in_use_prev = txq->ift_in_use; + pidx_prev = txq->ift_pidx; err = iflib_encap(txq, mp); - /* - * What other errors should we bail out for? - */ - if (err == ENOBUFS) { + if (__predict_false(err)) { DBG_COUNTER_INC(txq_drain_encapfail); - break; + /* no room - bail out */ + if (err == ENOBUFS) + break; + consumed++; + /* we can't send this packet - skip it */ + continue; } consumed++; - if (err) - continue; - pkt_sent++; m = *mp; DBG_COUNTER_INC(tx_sent); bytes_sent += m->m_pkthdr.len; - if (m->m_flags & M_MCAST) - mcast_sent++; + mcast_sent += !!(m->m_flags & M_MCAST); + avail = TXQ_AVAIL(txq); txq->ift_db_pending += (txq->ift_in_use - in_use_prev); desc_used += (txq->ift_in_use - in_use_prev); - iflib_txd_db_check(ctx, txq, FALSE); ETHER_BPF_MTAP(ifp, m); if (__predict_false(!(ifp->if_drv_flags & IFF_DRV_RUNNING))) break; - - if (desc_used > TXQ_MAX_DB_CONSUMED(txq->ift_size)) - break; + rang = iflib_txd_db_check(ctx, txq, false, in_use_prev); } - if ((iflib_min_tx_latency || iflib_txq_min_occupancy(txq)) && txq->ift_db_pending) - iflib_txd_db_check(ctx, txq, TRUE); - else if ((txq->ift_db_pending || TXQ_AVAIL(txq) < MAX_TX_DESC(ctx)) && - (callout_pending(&txq->ift_db_check) == 0)) { - txq->ift_db_pending_queued = txq->ift_db_pending; - callout_reset_on(&txq->ift_db_check, 1, iflib_txd_deferred_db_check, - txq, txq->ift_db_check.c_cpu); - } + /* deliberate use of bitwise or to avoid gratuitous short-circuit */ + ring = rang ?
false : (iflib_min_tx_latency | err) || (TXQ_AVAIL(txq) < MAX_TX_DESC(ctx)); + iflib_txd_db_check(ctx, txq, ring, txq->ift_in_use); if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent); if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent); if (mcast_sent) if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent); - +#ifdef INVARIANTS + if (iflib_verbose_debug) + printf("consumed=%d\n", consumed); +#endif return (consumed); } +static uint32_t +iflib_txq_drain_always(struct ifmp_ring *r) +{ + return (1); +} + +static uint32_t +iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx) +{ + int i, avail; + struct mbuf **mp; + iflib_txq_t txq; + + txq = r->cookie; + + txq->ift_qstatus = IFLIB_QUEUE_IDLE; + CALLOUT_LOCK(txq); + callout_stop(&txq->ift_timer); + CALLOUT_UNLOCK(txq); + + avail = IDXDIFF(pidx, cidx, r->size); + for (i = 0; i < avail; i++) { + mp = _ring_peek_one(r, cidx, i, avail - i); + if (__predict_false(*mp == (struct mbuf *)txq)) + continue; + m_freem(*mp); + } + MPASS(ifmp_ring_is_stalled(r) == 0); + return (avail); +} + static void +iflib_ifmp_purge(iflib_txq_t txq) +{ + struct ifmp_ring *r; + + r = txq->ift_br; + r->drain = iflib_txq_drain_free; + r->can_drain = iflib_txq_drain_always; + + ifmp_ring_check_drainage(r, r->size); + + r->drain = iflib_txq_drain; + r->can_drain = iflib_txq_can_drain; +} + +static void _task_fn_tx(void *context) { iflib_txq_t txq = context; if_ctx_t ctx = txq->ift_ctx; + struct ifnet *ifp = ctx->ifc_ifp; + int rc; +#ifdef IFLIB_DIAGNOSTICS + txq->ift_cpu_exec_count[curcpu]++; +#endif if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) return; - ifmp_ring_check_drainage(txq->ift_br[0], TX_BATCH_SIZE); + if (if_getcapenable(ifp) & IFCAP_NETMAP) { + if (ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, false)) + netmap_tx_irq(ifp, txq->ift_id); + IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id); + return; + } + if (txq->ift_db_pending) + ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE); + ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE); + if (ctx->ifc_flags & IFC_LEGACY) + IFDI_INTR_ENABLE(ctx); + else { + rc = IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id); + KASSERT(rc != ENOTSUP, ("MSI-X support requires queue_intr_enable, but not implemented in driver")); + } } static void @@ -2957,17 +3662,32 @@ if_ctx_t ctx = rxq->ifr_ctx; bool more; int rc; + uint16_t budget; +#ifdef IFLIB_DIAGNOSTICS + rxq->ifr_cpu_exec_count[curcpu]++; +#endif DBG_COUNTER_INC(task_fn_rxs); if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))) return; - - if ((more = iflib_rxeof(rxq, 16 /* XXX */)) == false) { + more = true; +#ifdef DEV_NETMAP + if (if_getcapenable(ctx->ifc_ifp) & IFCAP_NETMAP) { + u_int work = 0; + if (netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &work)) { + more = false; + } + } +#endif + budget = ctx->ifc_sysctl_rx_budget; + if (budget == 0) + budget = 16; /* XXX */ + if (more == false || (more = iflib_rxeof(rxq, budget)) == false) { if (ctx->ifc_flags & IFC_LEGACY) IFDI_INTR_ENABLE(ctx); else { DBG_COUNTER_INC(rx_intr_enables); - rc = IFDI_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id); + rc = IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id); KASSERT(rc != ENOTSUP, ("MSI-X support requires queue_intr_enable, but not implemented in driver")); } } @@ -2985,8 +3705,11 @@ iflib_txq_t txq; int i; - if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) - return; + if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)) { + if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) { + return; + } + } CTX_LOCK(ctx); for (txq = ctx->ifc_txqs, i = 
0; i < sctx->isc_ntxqsets; i++, txq++) { @@ -2998,6 +3721,10 @@ for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu); IFDI_LINK_INTR_ENABLE(ctx); + if (ctx->ifc_flags & IFC_DO_RESET) { + ctx->ifc_flags &= ~IFC_DO_RESET; + iflib_if_init_locked(ctx); + } CTX_UNLOCK(ctx); if (LINK_ACTIVE(ctx) == 0) @@ -3072,7 +3799,7 @@ if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) { DBG_COUNTER_INC(tx_frees); m_freem(m); - return (0); + return (ENOBUFS); } MPASS(m->m_nextpkt == NULL); @@ -3119,18 +3846,16 @@ } #endif DBG_COUNTER_INC(tx_seen); - err = ifmp_ring_enqueue(txq->ift_br[0], (void **)&m, 1, TX_BATCH_SIZE); + err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE); + GROUPTASK_ENQUEUE(&txq->ift_task); if (err) { - GROUPTASK_ENQUEUE(&txq->ift_task); /* support forthcoming later */ #ifdef DRIVER_BACKPRESSURE txq->ift_closed = TRUE; #endif - ifmp_ring_check_drainage(txq->ift_br[0], TX_BATCH_SIZE); + ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE); m_freem(m); - } else if (TXQ_AVAIL(txq) < (txq->ift_size >> 1)) { - GROUPTASK_ENQUEUE(&txq->ift_task); } return (err); @@ -3147,7 +3872,7 @@ ctx->ifc_flags |= IFC_QFLUSH; CTX_UNLOCK(ctx); for (i = 0; i < NTXQSETS(ctx); i++, txq++) - while (!(ifmp_ring_is_idle(txq->ift_br[0]) || ifmp_ring_is_stalled(txq->ift_br[0]))) + while (!(ifmp_ring_is_idle(txq->ift_br) || ifmp_ring_is_stalled(txq->ift_br))) iflib_txq_check_drain(txq, 0); CTX_LOCK(ctx); ctx->ifc_flags &= ~IFC_QFLUSH; @@ -3158,11 +3883,9 @@ #define IFCAP_FLAGS (IFCAP_TXCSUM_IPV6 | IFCAP_RXCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \ - IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_VLAN_HWTAGGING | \ + IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \ IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | IFCAP_VLAN_HWTSO) -#define IFCAP_REINIT IFCAP_FLAGS - static int iflib_if_ioctl(if_t ifp, u_long command, caddr_t data) { @@ -3236,8 +3959,6 @@ ctx->ifc_if_flags = if_getflags(ifp); CTX_UNLOCK(ctx); break; - - break; case SIOCADDMULTI: case SIOCDELMULTI: if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { @@ -3254,6 +3975,7 @@ CTX_UNLOCK(ctx); /* falls thru */ case SIOCGIFMEDIA: + case SIOCGIFXMEDIA: err = ifmedia_ioctl(ifp, ifr, &ctx->ifc_media, command); break; case SIOCGI2C: @@ -3288,6 +4010,8 @@ #endif setmask |= (mask & IFCAP_FLAGS); + if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) + setmask |= (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6); if ((mask & IFCAP_WOL) && (if_getcapabilities(ifp) & IFCAP_WOL) != 0) setmask |= (mask & (IFCAP_WOL_MCAST|IFCAP_WOL_MAGIC)); @@ -3298,10 +4022,10 @@ if (setmask) { CTX_LOCK(ctx); bits = if_getdrvflags(ifp); - if (setmask & IFCAP_REINIT) + if (bits & IFF_DRV_RUNNING) iflib_stop(ctx); if_togglecapenable(ifp, setmask); - if (setmask & IFCAP_REINIT) + if (bits & IFF_DRV_RUNNING) iflib_init_locked(ctx); if_setdrvflags(ifp, bits); CTX_UNLOCK(ctx); @@ -3353,7 +4077,7 @@ IFDI_VLAN_REGISTER(ctx, vtag); /* Re-init to load the changes */ if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) - iflib_init_locked(ctx); + iflib_if_init_locked(ctx); CTX_UNLOCK(ctx); } @@ -3372,7 +4096,7 @@ IFDI_VLAN_UNREGISTER(ctx, vtag); /* Re-init to load the changes */ if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) - iflib_init_locked(ctx); + iflib_if_init_locked(ctx); CTX_UNLOCK(ctx); } @@ -3462,7 +4186,6 @@ ctx->ifc_sctx = sctx; ctx->ifc_dev = dev; - ctx->ifc_txrx = *sctx->isc_txrx; ctx->ifc_softc = sc; if ((err = iflib_register(ctx)) != 0) { @@ -3472,6 +4195,9 @@ 
@@ -3472,6 +4195,9 @@
 	iflib_add_device_sysctl_pre(ctx);
 
 	scctx = &ctx->ifc_softc_ctx;
+	ifp = ctx->ifc_ifp;
+	ctx->ifc_nhwtxqs = sctx->isc_ntxqs;
+
 	/*
 	 * XXX sanity check that ntxd & nrxd are a power of 2
 	 */
@@ -3524,30 +4250,35 @@
 		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
 		return (err);
 	}
-	if (scctx->isc_ntxqsets_max)
-		scctx->isc_ntxqsets = min(scctx->isc_ntxqsets, scctx->isc_ntxqsets_max);
-	if (scctx->isc_nrxqsets_max)
-		scctx->isc_nrxqsets = min(scctx->isc_nrxqsets, scctx->isc_nrxqsets_max);
+	_iflib_pre_assert(scctx);
+	ctx->ifc_txrx = *scctx->isc_txrx;
+#ifdef INVARIANTS
+	MPASS(scctx->isc_capenable);
+	if (scctx->isc_capenable & IFCAP_TXCSUM)
+		MPASS(scctx->isc_tx_csum_flags);
+#endif
+
+	if_setcapabilities(ifp, scctx->isc_capenable | IFCAP_HWSTATS);
+	if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS);
+
+	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
+		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
+	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
+		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
+
 #ifdef ACPI_DMAR
 	if (dmar_get_dma_tag(device_get_parent(dev), dev) != NULL)
 		ctx->ifc_flags |= IFC_DMAR;
+#elif !(defined(__i386__) || defined(__amd64__))
+	/* set unconditionally for !x86 */
+	ctx->ifc_flags |= IFC_DMAR;
 #endif
 
 	msix_bar = scctx->isc_msix_bar;
+	main_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0;
+	main_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0;
 
-	ifp = ctx->ifc_ifp;
-
-	if(sctx->isc_flags & IFLIB_HAS_TXCQ)
-		main_txq = 1;
-	else
-		main_txq = 0;
-
-	if(sctx->isc_flags & IFLIB_HAS_RXCQ)
-		main_rxq = 1;
-	else
-		main_rxq = 0;
-
 	/* XXX change for per-queue sizes */
 	device_printf(dev, "using %d tx descriptors and %d rx descriptors\n",
 	    scctx->isc_ntxd[main_txq], scctx->isc_nrxd[main_rxq]);
@@ -3590,6 +4321,18 @@
 	if (scctx->isc_rss_table_size == 0)
 		scctx->isc_rss_table_size = 64;
 	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
+
+	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
+	/* XXX format name */
+	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx, -1, "admin");
+
+	/* Set up cpu set.  If it fails, use the set of all CPUs. */
+	if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) {
+		device_printf(dev, "Unable to fetch CPU list\n");
+		CPU_COPY(&all_cpus, &ctx->ifc_cpus);
+	}
+	MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0);
+
 	/*
 	** Now setup MSI or MSI/X, should
 	** return us the number of supported
 	** vectors.
@@ -3598,6 +4341,10 @@
 	if (sctx->isc_flags & IFLIB_SKIP_MSIX) {
 		msix = scctx->isc_vectors;
 	} else if (scctx->isc_msix_bar != 0)
+		/*
+		 * The simple fact that isc_msix_bar is not 0 does not
+		 * mean we have a good value there that is known to work.
+		 */
 		msix = iflib_msix_init(ctx);
 	else {
 		scctx->isc_vectors = 1;
@@ -3616,7 +4363,17 @@
 		device_printf(dev, "qset structure setup failed %d\n", err);
 		goto fail_queues;
 	}
-
+	/*
+	 * Group taskqueues aren't properly set up until SMP is started,
+	 * so we disable interrupts until we can handle them post
+	 * SI_SUB_SMP.
+	 *
+	 * XXX: disabling interrupts doesn't actually work, at least for
+	 * the non-MSI case.  When they occur before SI_SUB_SMP completes,
+	 * we do null handling and depend on this not causing too large an
+	 * interrupt storm.
+	 */
+	IFDI_INTR_DISABLE(ctx);
 	if (msix > 1 && (err = IFDI_MSIX_INTR_ASSIGN(ctx, msix)) != 0) {
 		device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n", err);
 		goto fail_intr_free;
 	}
@@ -3679,8 +4436,9 @@
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
 	device_t dev = ctx->ifc_dev;
-	int i;
+	int i, j;
 	struct taskqgroup *tqg;
+	iflib_fl_t fl;
 
 	/* Make sure VLANS are not using driver */
 	if (if_vlantrunkinuse(ifp)) {
@@ -3706,16 +4464,19 @@
 	if (ctx->ifc_led_dev != NULL)
 		led_destroy(ctx->ifc_led_dev);
 	/* XXX drain any dependent tasks */
-	tqg = qgroup_softirq;
+	tqg = qgroup_if_io_tqg;
 	for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
 		callout_drain(&txq->ift_timer);
-		callout_drain(&txq->ift_db_check);
 		if (txq->ift_task.gt_uniq != NULL)
 			taskqgroup_detach(tqg, &txq->ift_task);
 	}
 	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
 		if (rxq->ifr_task.gt_uniq != NULL)
 			taskqgroup_detach(tqg, &rxq->ifr_task);
+
+		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
+			free(fl->ifl_rx_bitmap, M_IFLIB);
+
 	}
 	tqg = qgroup_if_config_tqg;
 	if (ctx->ifc_admin_task.gt_uniq != NULL)
@@ -3886,15 +4647,6 @@
 	MPASS(sctx->isc_rx_nsegments);
 	MPASS(sctx->isc_rx_maxsegsize);
-
-	MPASS(sctx->isc_txrx->ift_txd_encap);
-	MPASS(sctx->isc_txrx->ift_txd_flush);
-	MPASS(sctx->isc_txrx->ift_txd_credits_update);
-	MPASS(sctx->isc_txrx->ift_rxd_available);
-	MPASS(sctx->isc_txrx->ift_rxd_pkt_get);
-	MPASS(sctx->isc_txrx->ift_rxd_refill);
-	MPASS(sctx->isc_txrx->ift_rxd_flush);
-
 	MPASS(sctx->isc_nrxd_min[0]);
 	MPASS(sctx->isc_nrxd_max[0]);
 	MPASS(sctx->isc_nrxd_default[0]);
@@ -3903,6 +4655,19 @@
 	MPASS(sctx->isc_ntxd_min[0]);
 	MPASS(sctx->isc_ntxd_max[0]);
 	MPASS(sctx->isc_ntxd_default[0]);
 }
 
+static void
+_iflib_pre_assert(if_softc_ctx_t scctx)
+{
+
+	MPASS(scctx->isc_txrx->ift_txd_encap);
+	MPASS(scctx->isc_txrx->ift_txd_flush);
+	MPASS(scctx->isc_txrx->ift_txd_credits_update);
+	MPASS(scctx->isc_txrx->ift_rxd_available);
+	MPASS(scctx->isc_txrx->ift_rxd_pkt_get);
+	MPASS(scctx->isc_txrx->ift_rxd_refill);
+	MPASS(scctx->isc_txrx->ift_rxd_flush);
+}
+
 static int
 iflib_register(if_ctx_t ctx)
 {
@@ -3937,9 +4702,6 @@
 	if_setqflushfn(ifp, iflib_if_qflush);
 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
 
-	if_setcapabilities(ifp, 0);
-	if_setcapenable(ifp, 0);
-
 	ctx->ifc_vlan_attach_event =
 		EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx,
 							  EVENTHANDLER_PRI_FIRST);
@@ -3975,7 +4737,6 @@
 	caddr_t *vaddrs;
 	uint64_t *paddrs;
 	struct ifmp_ring **brscp;
-	int nbuf_rings = 1; /* XXX determine dynamically */
 
 	KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1"));
 	KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1"));
@@ -4001,11 +4762,6 @@
 		err = ENOMEM;
 		goto rx_fail;
 	}
-	if (!(brscp = malloc(sizeof(void *) * nbuf_rings * nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
-		device_printf(dev, "Unable to buf_ring_sc * memory\n");
-		err = ENOMEM;
-		goto rx_fail;
-	}
 
 	ctx->ifc_txqs = txq;
 	ctx->ifc_rxqs = rxq;
@@ -4028,6 +4784,7 @@
 			err = ENOMEM;
 			goto err_tx_desc;
 		}
+		txq->ift_txd_size[j] = scctx->isc_txd_size[j];
 		bzero((void *)ifdip->idi_vaddr, txqsizes[j]);
 	}
 	txq->ift_ctx = ctx;
@@ -4039,8 +4796,6 @@
 	}
 	/* XXX fix this */
 	txq->ift_timer.c_cpu = cpu;
-	txq->ift_db_check.c_cpu = cpu;
-	txq->ift_nbr = nbuf_rings;
 
 	if (iflib_txsd_alloc(txq)) {
 		device_printf(dev, "Critical Failure setting up TX buffers\n");
 		err = ENOMEM;
 		goto err_tx_desc;
 	}
@@ -4053,21 +4808,16 @@
 	    device_get_nameunit(dev), txq->ift_id);
 	mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF);
 	callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0);
-	callout_init_mtx(&txq->ift_db_check, &txq->ift_mtx, 0);
 	snprintf(txq->ift_db_mtx_name, MTX_NAME_LEN, "%s:tx(%d):db",
 	    device_get_nameunit(dev), txq->ift_id);
-	TXDB_LOCK_INIT(txq);
-	txq->ift_br = brscp + i*nbuf_rings;
-	for (j = 0; j < nbuf_rings; j++) {
-		err = ifmp_ring_alloc(&txq->ift_br[j], 2048, txq, iflib_txq_drain,
-				      iflib_txq_can_drain, M_IFLIB, M_WAITOK);
-		if (err) {
-			/* XXX free any allocated rings */
-			device_printf(dev, "Unable to allocate buf_ring\n");
-			goto err_tx_desc;
-		}
+	err = ifmp_ring_alloc(&txq->ift_br, 2048, txq, iflib_txq_drain,
+			      iflib_txq_can_drain, M_IFLIB, M_WAITOK);
+	if (err) {
+		/* XXX free any allocated rings */
+		device_printf(dev, "Unable to allocate buf_ring\n");
+		goto err_tx_desc;
 	}
 }
 
@@ -4081,6 +4831,9 @@
 	}
 
 	rxq->ifr_ifdi = ifdip;
+	/* XXX this needs to be changed if #rx queues != #tx queues */
+	rxq->ifr_ntxqirq = 1;
+	rxq->ifr_txqid[0] = i;
 	for (j = 0; j < nrxqs; j++, ifdip++) {
 		if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, BUS_DMA_NOWAIT)) {
 			device_printf(dev, "Unable to allocate Descriptor memory\n");
@@ -4105,10 +4858,10 @@
 	}
 	rxq->ifr_fl = fl;
 	for (j = 0; j < nfree_lists; j++) {
-		rxq->ifr_fl[j].ifl_rxq = rxq;
-		rxq->ifr_fl[j].ifl_id = j;
-		rxq->ifr_fl[j].ifl_ifdi =
-		    &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
+		fl[j].ifl_rxq = rxq;
+		fl[j].ifl_id = j;
+		fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
+		fl[j].ifl_rxd_size = scctx->isc_rxd_size[j];
 	}
 	/* Allocate receive buffers for the ring*/
 	if (iflib_rxsd_alloc(rxq)) {
@@ -4117,6 +4870,9 @@
 		err = ENOMEM;
 		goto err_rx_desc;
 	}
+
+	for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
+		fl->ifl_rx_bitmap = bit_alloc(fl->ifl_size, M_IFLIB, M_WAITOK|M_ZERO);
 }
 
 /* TXQs */
@@ -4294,19 +5050,152 @@
 	return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
 }
 
-static void
-find_nth(if_ctx_t ctx, cpuset_t *cpus, int qid)
+#ifdef SMP
+static int
+find_nth(if_ctx_t ctx, int qid)
 {
-	int i, cpuid;
+	cpuset_t cpus;
+	int i, cpuid, eqid, count;
 
-	CPU_COPY(&ctx->ifc_cpus, cpus);
+	CPU_COPY(&ctx->ifc_cpus, &cpus);
+	count = CPU_COUNT(&cpus);
+	eqid = qid % count;
 	/* clear up to the qid'th bit */
-	for (i = 0; i < qid; i++) {
-		cpuid = CPU_FFS(cpus);
-		CPU_CLR(cpuid, cpus);
+	for (i = 0; i < eqid; i++) {
+		cpuid = CPU_FFS(&cpus);
+		MPASS(cpuid != 0);
+		CPU_CLR(cpuid-1, &cpus);
 	}
+	cpuid = CPU_FFS(&cpus);
+	MPASS(cpuid != 0);
+	return (cpuid-1);
 }
 
+#ifdef SCHED_ULE
+extern struct cpu_group *cpu_top;	/* CPU topology */
+
+static int
+find_child_with_core(int cpu, struct cpu_group *grp)
+{
+	int i;
+
+	if (grp->cg_children == 0)
+		return -1;
+
+	MPASS(grp->cg_child);
+	for (i = 0; i < grp->cg_children; i++) {
+		if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask))
+			return i;
+	}
+
+	return -1;
+}
+
+/*
+ * Find the nth "close" core to the specified core
+ * "close" is defined as the deepest level that shares
+ * at least an L2 cache.  With threads, this will be
+ * threads on the same core.  If the shared cache is L3
+ * or higher, simply returns the same core.
+ */
+static int
+find_close_core(int cpu, int core_offset)
+{
+	struct cpu_group *grp;
+	int i;
+	int fcpu;
+	cpuset_t cs;
+
+	grp = cpu_top;
+	if (grp == NULL)
+		return cpu;
+	i = 0;
+	while ((i = find_child_with_core(cpu, grp)) != -1) {
+		/* If the child only has one cpu, don't descend */
+		if (grp->cg_child[i].cg_count <= 1)
+			break;
+		grp = &grp->cg_child[i];
+	}
+
+	/* If they don't share at least an L2 cache, use the same CPU */
+	if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE)
+		return cpu;
+
+	/* Now pick one */
+	CPU_COPY(&grp->cg_mask, &cs);
+
+	/* Add the selected CPU offset to core offset. */
+	for (i = 0; (fcpu = CPU_FFS(&cs)) != 0; i++) {
+		if (fcpu - 1 == cpu)
+			break;
+		CPU_CLR(fcpu - 1, &cs);
+	}
+	MPASS(fcpu);
+
+	core_offset += i;
+
+	CPU_COPY(&grp->cg_mask, &cs);
+	for (i = core_offset % grp->cg_count; i > 0; i--) {
+		MPASS(CPU_FFS(&cs));
+		CPU_CLR(CPU_FFS(&cs) - 1, &cs);
+	}
+	MPASS(CPU_FFS(&cs));
+	return CPU_FFS(&cs) - 1;
+}
+#else
+static int
+find_close_core(int cpu, int core_offset __unused)
+{
+	return cpu;
+}
+#endif
+
+static int
+get_core_offset(if_ctx_t ctx, iflib_intr_type_t type, int qid)
+{
+	switch (type) {
+	case IFLIB_INTR_TX:
+		/* TX queues get cores which share at least an L2 cache with the corresponding RX queue */
+		/* XXX handle multiple RX threads per core and more than two cores per L2 group */
+		return qid / CPU_COUNT(&ctx->ifc_cpus) + 1;
+	case IFLIB_INTR_RX:
+	case IFLIB_INTR_RXTX:
+		/* RX queues get the specified core */
+		return qid / CPU_COUNT(&ctx->ifc_cpus);
+	default:
+		return -1;
+	}
+}
+#else
+#define get_core_offset(ctx, type, qid)	CPU_FIRST()
+#define find_close_core(cpuid, tid)	CPU_FIRST()
+#define find_nth(ctx, gid)		CPU_FIRST()
+#endif
+
+/* Just to avoid copy/paste */
+static inline int
+iflib_irq_set_affinity(if_ctx_t ctx, int irq, iflib_intr_type_t type, int qid,
+    struct grouptask *gtask, struct taskqgroup *tqg, void *uniq, char *name)
+{
+	int cpuid;
+	int err, tid;
+
+	cpuid = find_nth(ctx, qid);
+	tid = get_core_offset(ctx, type, qid);
+	MPASS(tid >= 0);
+	cpuid = find_close_core(cpuid, tid);
+	err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, irq, name);
+	if (err) {
+		device_printf(ctx->ifc_dev, "taskqgroup_attach_cpu failed %d\n", err);
+		return (err);
+	}
+#ifdef notyet
+	if (cpuid > ctx->ifc_cpuid_highest)
+		ctx->ifc_cpuid_highest = cpuid;
+#endif
+	return 0;
+}
+
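The helpers above reduce to simple index arithmetic: an RX queue lands on the qid'th CPU (wrapping), and its TX partner asks for the next "close" core. A standalone sketch of that arithmetic with a flat CPU list standing in for the real cpu_group topology (the close_core() stand-in is just modular offsetting, not an L2-sibling walk):

	#include <stdio.h>

	#define NCPUS 4

	/* qid'th CPU in the set, wrapping, like find_nth() above. */
	static int nth_cpu(int qid) { return qid % NCPUS; }

	/* Offsets as in get_core_offset(): RX gets the wrap count, TX one past it. */
	static int rx_offset(int qid) { return qid / NCPUS; }
	static int tx_offset(int qid) { return qid / NCPUS + 1; }

	/* Stand-in for find_close_core(): the real one walks cpu_top for a
	 * core sharing at least an L2 cache; here every CPU is "close". */
	static int close_core(int cpu, int offset) { return (cpu + offset) % NCPUS; }

	int main(void)
	{
		for (int qid = 0; qid < 6; qid++)
			printf("qid %d: rx cpu %d, tx cpu %d\n", qid,
			    close_core(nth_cpu(qid), rx_offset(qid)),
			    close_core(nth_cpu(qid), tx_offset(qid)));
		return 0;
	}

With four CPUs this places queue 0's RX on CPU 0 and its TX on CPU 1, queue 1's RX on CPU 1 and TX on CPU 2, and so on, which is the pairing the XXX comment above says still needs refinement for more complex topologies.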
 int
 iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
 			iflib_intr_type_t type, driver_filter_t *filter,
@@ -4315,12 +5204,13 @@
 	struct grouptask *gtask;
 	struct taskqgroup *tqg;
 	iflib_filter_info_t info;
-	cpuset_t cpus;
 	gtask_fn_t *fn;
 	int tqrid, err;
+	driver_filter_t *intr_fast;
 	void *q;
 
 	info = &ctx->ifc_filter_info;
+	tqrid = rid;
 
 	switch (type) {
 	/* XXX merge tx/rx for netmap? */
@@ -4328,90 +5218,111 @@
 		q = &ctx->ifc_txqs[qid];
 		info = &ctx->ifc_txqs[qid].ift_filter_info;
 		gtask = &ctx->ifc_txqs[qid].ift_task;
-		tqg = qgroup_softirq;
-		tqrid = irq->ii_rid;
+		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_tx;
+		intr_fast = iflib_fast_intr;
+		GROUPTASK_INIT(gtask, 0, fn, q);
 		break;
 	case IFLIB_INTR_RX:
 		q = &ctx->ifc_rxqs[qid];
 		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
 		gtask = &ctx->ifc_rxqs[qid].ifr_task;
-		tqg = qgroup_softirq;
-		tqrid = irq->ii_rid;
+		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_rx;
+		intr_fast = iflib_fast_intr;
+		GROUPTASK_INIT(gtask, 0, fn, q);
 		break;
+	case IFLIB_INTR_RXTX:
+		q = &ctx->ifc_rxqs[qid];
+		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
+		gtask = &ctx->ifc_rxqs[qid].ifr_task;
+		tqg = qgroup_if_io_tqg;
+		fn = _task_fn_rx;
+		intr_fast = iflib_fast_intr_rxtx;
+		GROUPTASK_INIT(gtask, 0, fn, q);
+		break;
 	case IFLIB_INTR_ADMIN:
 		q = ctx;
+		tqrid = -1;
 		info = &ctx->ifc_filter_info;
 		gtask = &ctx->ifc_admin_task;
 		tqg = qgroup_if_config_tqg;
-		tqrid = -1;
 		fn = _task_fn_admin;
+		intr_fast = iflib_fast_intr_ctx;
 		break;
 	default:
 		panic("unknown net intr type");
 	}
-	GROUPTASK_INIT(gtask, 0, fn, q);
 
 	info->ifi_filter = filter;
 	info->ifi_filter_arg = filter_arg;
 	info->ifi_task = gtask;
+	info->ifi_ctx = q;
 
-	/* XXX query cpu that rid belongs to */
-
-	err = _iflib_irq_alloc(ctx, irq, rid, iflib_fast_intr, NULL, info, name);
-	if (err != 0)
+	err = _iflib_irq_alloc(ctx, irq, rid, intr_fast, NULL, info, name);
+	if (err != 0) {
+		device_printf(ctx->ifc_dev, "_iflib_irq_alloc failed %d\n", err);
 		return (err);
+	}
+	if (type == IFLIB_INTR_ADMIN)
+		return (0);
+
 	if (tqrid != -1) {
-		find_nth(ctx, &cpus, qid);
-		taskqgroup_attach_cpu(tqg, gtask, q, CPU_FFS(&cpus), irq->ii_rid, name);
-	} else
-		taskqgroup_attach(tqg, gtask, q, tqrid, name);
+		err = iflib_irq_set_affinity(ctx, rman_get_start(irq->ii_res), type, qid, gtask, tqg, q, name);
+		if (err)
+			return (err);
+	} else {
+		taskqgroup_attach(tqg, gtask, q, rman_get_start(irq->ii_res), name);
+	}
-
 	return (0);
 }
 
 void
-iflib_softirq_alloc_generic(if_ctx_t ctx, int rid, iflib_intr_type_t type, void *arg, int qid, char *name)
+iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type, void *arg, int qid, char *name)
 {
 	struct grouptask *gtask;
 	struct taskqgroup *tqg;
 	gtask_fn_t *fn;
 	void *q;
+	int irq_num = -1;
+	int err;
 
 	switch (type) {
 	case IFLIB_INTR_TX:
 		q = &ctx->ifc_txqs[qid];
 		gtask = &ctx->ifc_txqs[qid].ift_task;
-		tqg = qgroup_softirq;
+		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_tx;
+		if (irq != NULL)
+			irq_num = rman_get_start(irq->ii_res);
 		break;
 	case IFLIB_INTR_RX:
 		q = &ctx->ifc_rxqs[qid];
 		gtask = &ctx->ifc_rxqs[qid].ifr_task;
-		tqg = qgroup_softirq;
+		tqg = qgroup_if_io_tqg;
 		fn = _task_fn_rx;
+		if (irq != NULL)
+			irq_num = rman_get_start(irq->ii_res);
 		break;
-	case IFLIB_INTR_ADMIN:
-		q = ctx;
-		gtask = &ctx->ifc_admin_task;
-		tqg = qgroup_if_config_tqg;
-		rid = -1;
-		fn = _task_fn_admin;
-		break;
 	case IFLIB_INTR_IOV:
 		q = ctx;
 		gtask = &ctx->ifc_vflr_task;
 		tqg = qgroup_if_config_tqg;
-		rid = -1;
 		fn = _task_fn_iov;
 		break;
 	default:
 		panic("unknown net intr type");
 	}
 	GROUPTASK_INIT(gtask, 0, fn, q);
-	taskqgroup_attach(tqg, gtask, q, rid, name);
+	if (irq_num != -1) {
+		err = iflib_irq_set_affinity(ctx, irq_num, type, qid, gtask, tqg, q, name);
+		if (err)
+			taskqgroup_attach(tqg, gtask, q, irq_num, name);
+	}
+	else {
+		taskqgroup_attach(tqg, gtask, q, irq_num, name);
	}
 }
 
 void
@@ -4441,7 +5352,7 @@
 	q = &ctx->ifc_rxqs[0];
 	info = &rxq[0].ifr_filter_info;
 	gtask = &rxq[0].ifr_task;
-	tqg = qgroup_softirq;
+	tqg = qgroup_if_io_tqg;
 	tqrid = irq->ii_rid = *rid;
 	fn = _task_fn_rx;
 
@@ -4451,16 +5362,13 @@
 	info->ifi_task = gtask;
 
 	/* We allocate a single interrupt resource */
-	if ((err = _iflib_irq_alloc(ctx, irq, tqrid, iflib_fast_intr, NULL, info, name)) != 0)
+	if ((err = _iflib_irq_alloc(ctx, irq, tqrid, iflib_fast_intr_ctx, NULL, info, name)) != 0)
 		return (err);
 	GROUPTASK_INIT(gtask, 0, fn, q);
-	taskqgroup_attach(tqg, gtask, q, tqrid, name);
+	taskqgroup_attach(tqg, gtask, q, rman_get_start(irq->ii_res), name);
 
 	GROUPTASK_INIT(&txq->ift_task, 0, _task_fn_tx, txq);
-	taskqgroup_attach(qgroup_softirq, &txq->ift_task, txq, tqrid, "tx");
-	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
-	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx, -1, "admin/link");
-
+	taskqgroup_attach(qgroup_if_io_tqg, &txq->ift_task, txq, rman_get_start(irq->ii_res), "tx");
 	return (0);
 }
 
@@ -4469,7 +5377,7 @@
 {
 
 	ctx->ifc_led_dev = led_create(iflib_led_func, ctx,
-						  device_get_nameunit(ctx->ifc_dev));
+	    device_get_nameunit(ctx->ifc_dev));
 }
 
 void
@@ -4489,7 +5397,13 @@
 void
 iflib_admin_intr_deferred(if_ctx_t ctx)
 {
+#ifdef INVARIANTS
+	struct grouptask *gtask;
+	gtask = &ctx->ifc_admin_task;
+	MPASS(gtask != NULL && gtask->gt_taskqueue != NULL);
+#endif
+
 	GROUPTASK_ENQUEUE(&ctx->ifc_admin_task);
 }
 
@@ -4504,7 +5418,7 @@
 iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, char *name)
 {
 
-	taskqgroup_attach_cpu(qgroup_softirq, gt, uniq, cpu, -1, name);
+	taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, -1, name);
 }
 
 void
@@ -4529,8 +5443,9 @@
 	if_t ifp = ctx->ifc_ifp;
 	iflib_txq_t txq = ctx->ifc_txqs;
 
-	if_setbaudrate(ifp, baudrate);
+	if (baudrate >= IF_Gbps(10))
+		ctx->ifc_flags |= IFC_PREFETCH;
 	/* If link down, disable watchdog */
 	if ((ctx->ifc_link_state == LINK_STATE_UP) && (link_state == LINK_STATE_DOWN)) {
@@ -4545,23 +5460,27 @@
 iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq)
 {
 	int credits;
+#ifdef INVARIANTS
+	int credits_pre = txq->ift_cidx_processed;
+#endif
 
 	if (ctx->isc_txd_credits_update == NULL)
 		return (0);
 
-	if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, txq->ift_cidx_processed, true)) == 0)
+	if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, true)) == 0)
 		return (0);
 
 	txq->ift_processed += credits;
 	txq->ift_cidx_processed += credits;
 
+	MPASS(credits_pre + credits == txq->ift_cidx_processed);
 	if (txq->ift_cidx_processed >= txq->ift_size)
 		txq->ift_cidx_processed -= txq->ift_size;
 	return (credits);
 }
 
 static int
-iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, int cidx, int budget)
+iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget)
 {
 
 	return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx,
@@ -4599,13 +5518,28 @@
 	int iflib_num_tx_queues, iflib_num_rx_queues;
 	int err, admincnt, bar;
 
-	iflib_num_tx_queues = scctx->isc_ntxqsets;
-	iflib_num_rx_queues = scctx->isc_nrxqsets;
+	iflib_num_tx_queues = ctx->ifc_sysctl_ntxqs;
+	iflib_num_rx_queues = ctx->ifc_sysctl_nrxqs;
+	device_printf(dev, "msix_init qsets capped at %d\n", imax(scctx->isc_ntxqsets, scctx->isc_nrxqsets));
+
 	bar = ctx->ifc_softc_ctx.isc_msix_bar;
 	admincnt = sctx->isc_admin_intrcnt;
+	/* Override by global tuneable */
+	{
+		int i;
+		size_t len = sizeof(i);
+		err = kernel_sysctlbyname(curthread, "hw.pci.enable_msix", &i, &len, NULL, 0, NULL, 0);
+		if (err == 0) {
+			if (i == 0)
+				goto msi;
+		}
+		else {
+			device_printf(dev, "unable to read hw.pci.enable_msix\n");
+		}
+	}
 	/* Override by tuneable */
-	if (enable_msix == 0)
+	if (scctx->isc_disable_msix)
 		goto msi;
 
 	/*
@@ -4617,18 +5551,20 @@
 	** successfully initialize us.
 	*/
 	{
-		uint16_t pci_cmd_word;
 		int msix_ctrl, rid;
 
+		pci_enable_busmaster(dev);
 		rid = 0;
-		pci_cmd_word = pci_read_config(dev, PCIR_COMMAND, 2);
-		pci_cmd_word |= PCIM_CMD_BUSMASTEREN;
-		pci_write_config(dev, PCIR_COMMAND, pci_cmd_word, 2);
-		pci_find_cap(dev, PCIY_MSIX, &rid);
-		rid += PCIR_MSIX_CTRL;
-		msix_ctrl = pci_read_config(dev, rid, 2);
-		msix_ctrl |= PCIM_MSIXCTRL_MSIX_ENABLE;
-		pci_write_config(dev, rid, msix_ctrl, 2);
+		if (pci_find_cap(dev, PCIY_MSIX, &rid) == 0 && rid != 0) {
+			rid += PCIR_MSIX_CTRL;
+			msix_ctrl = pci_read_config(dev, rid, 2);
+			msix_ctrl |= PCIM_MSIXCTRL_MSIX_ENABLE;
+			pci_write_config(dev, rid, msix_ctrl, 2);
+		} else {
+			device_printf(dev, "PCIY_MSIX capability not found; "
+			    "or rid %d == 0.\n", rid);
+			goto msi;
+		}
 	}
 
@@ -4661,20 +5597,14 @@
 #else
 	queuemsgs = msgs - admincnt;
 #endif
-	if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) == 0) {
 #ifdef RSS
-		queues = imin(queuemsgs, rss_getnumbuckets());
+	queues = imin(queuemsgs, rss_getnumbuckets());
 #else
-		queues = queuemsgs;
+	queues = queuemsgs;
 #endif
-		queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues);
-		device_printf(dev, "pxm cpus: %d queue msgs: %d admincnt: %d\n",
-					  CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt);
-	} else {
-		device_printf(dev, "Unable to fetch CPU list\n");
-		/* Figure out a reasonable auto config value */
-		queues = min(queuemsgs, mp_ncpus);
-	}
+	queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues);
+	device_printf(dev, "pxm cpus: %d queue msgs: %d admincnt: %d\n",
+	    CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt);
 #ifdef RSS
 	/* If we're doing RSS, clamp at the number of RSS buckets */
 	if (queues > rss_getnumbuckets())
 		queues = rss_getnumbuckets();
@@ -4684,6 +5614,10 @@
 		rx_queues = iflib_num_rx_queues;
 	else
 		rx_queues = queues;
+
+	if (rx_queues > scctx->isc_nrxqsets)
+		rx_queues = scctx->isc_nrxqsets;
+
 	/*
 	 * We want this to be all logical CPUs by default
 	 */
@@ -4692,6 +5626,9 @@
 	else
 		tx_queues = mp_ncpus;
 
+	if (tx_queues > scctx->isc_ntxqsets)
+		tx_queues = scctx->isc_ntxqsets;
+
 	if (ctx->ifc_sysctl_qs_eq_override == 0) {
#ifdef INVARIANTS
 		if (tx_queues != rx_queues)
@@ -4773,7 +5710,7 @@
 	if_ctx_t ctx = (void *)arg1;
 	enum iflib_ndesc_handler type = arg2;
 	char buf[256] = {0};
-	uint16_t *ndesc;
+	qidx_t *ndesc;
 	char *p, *next;
 	int nqs, rc, i;
 
@@ -4843,6 +5780,12 @@
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable",
		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0,
		       "permit #txq != #rxq");
+	SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "disable_msix",
+		       CTLFLAG_RWTUN, &ctx->ifc_softc_ctx.isc_disable_msix, 0,
+		       "disable MSIX (default 0)");
+	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "rx_budget",
+		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_rx_budget, 0,
+		       "set the rx budget");
 
 	/* XXX change for per-queue sizes */
 	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
@@ -4935,27 +5878,26 @@
 			   CTLFLAG_RD,
 			   &txq->ift_cleaned, "total cleaned");
 	SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state",
-			CTLTYPE_STRING | CTLFLAG_RD, __DEVOLATILE(uint64_t *, &txq->ift_br[0]->state),
+			CTLTYPE_STRING | CTLFLAG_RD, __DEVOLATILE(uint64_t *, &txq->ift_br->state),
 			0, mp_ring_state_handler, "A", "soft ring state");
 	SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_enqueues",
-			   CTLFLAG_RD, &txq->ift_br[0]->enqueues,
+			   CTLFLAG_RD, &txq->ift_br->enqueues,
			   "# of enqueues to the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_drops",
-			   CTLFLAG_RD, &txq->ift_br[0]->drops,
+			   CTLFLAG_RD, &txq->ift_br->drops,
			   "# of drops in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_starts",
-			   CTLFLAG_RD, &txq->ift_br[0]->starts,
+			   CTLFLAG_RD, &txq->ift_br->starts,
			   "# of normal consumer starts in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_stalls",
-			   CTLFLAG_RD, &txq->ift_br[0]->stalls,
+			   CTLFLAG_RD, &txq->ift_br->stalls,
			   "# of consumer stalls in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_restarts",
-			   CTLFLAG_RD, &txq->ift_br[0]->restarts,
+			   CTLFLAG_RD, &txq->ift_br->restarts,
			   "# of consumer restarts in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_abdications",
-			   CTLFLAG_RD, &txq->ift_br[0]->abdications,
+			   CTLFLAG_RD, &txq->ift_br->abdications,
			   "# of consumer abdications in the mp_ring for this queue");
-
 	}
 	if (scctx->isc_nrxqsets > 100)
@@ -4977,6 +5919,7 @@
			   CTLFLAG_RD, &rxq->ifr_cq_cidx, 1, "Consumer Index");
 		}
+
 		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
 			snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j);
 			fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list, OID_AUTO, namebuf,
@@ -5010,3 +5953,30 @@
 	}
 
 }
+
+#ifndef __NO_STRICT_ALIGNMENT
+static struct mbuf *
+iflib_fixup_rx(struct mbuf *m)
+{
+	struct mbuf *n;
+
+	if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) {
+		bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len);
+		m->m_data += ETHER_HDR_LEN;
+		n = m;
+	} else {
+		MGETHDR(n, M_NOWAIT, MT_DATA);
+		if (n == NULL) {
+			m_freem(m);
+			return (NULL);
+		}
+		bcopy(m->m_data, n->m_data, ETHER_HDR_LEN);
+		m->m_data += ETHER_HDR_LEN;
+		m->m_len -= ETHER_HDR_LEN;
+		n->m_len = ETHER_HDR_LEN;
+		M_MOVE_PKTHDR(n, m);
+		n->m_next = m;
+	}
+	return (n);
+}
+#endif
Index: sys/net/mp_ring.c
===================================================================
--- sys/net/mp_ring.c
+++ sys/net/mp_ring.c
@@ -454,18 +454,12 @@
 	do {
 		os.state = ns.state = r->state;
 		ns.pidx_tail = pidx_stop;
-		ns.flags = BUSY;
+		if (os.flags == IDLE)
+			ns.flags = ABDICATED;
 	} while (atomic_cmpset_rel_64(&r->state, os.state, ns.state) == 0);
 	critical_exit();
 	counter_u64_add(r->enqueues, n);
 
-	/*
-	 * Turn into a consumer if some other thread isn't active as a consumer
-	 * already.
-	 */
-	if (os.flags != BUSY)
-		drain_ring_lockless(r, ns, os.flags, budget);
-
 	return (0);
 }
 #endif
@@ -476,7 +470,9 @@
 	union ring_state os, ns;
 
 	os.state = r->state;
-	if (os.flags != STALLED || os.pidx_head != os.pidx_tail || r->can_drain(r) == 0)
+	if ((os.flags != STALLED && os.flags != ABDICATED) ||	// Only continue in STALLED and ABDICATED
+	    os.pidx_head != os.pidx_tail ||			// Require work to be available
+	    (os.flags != ABDICATED && r->can_drain(r) == 0))	// Can either drain, or everyone left
 		return;
 	MPASS(os.cidx != os.pidx_tail);	/* implied by STALLED */
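The mp_ring change above removes inline draining from the enqueue path: an enqueue against an IDLE ring now just marks it ABDICATED, and the deferred task's ifmp_ring_check_drainage() call picks the work up later. A simplified single-threaded model of that handoff (the real implementation is a lockless 64-bit state word, not shown here):

	#include <stdio.h>

	enum ring_flags { IDLE, BUSY, STALLED, ABDICATED };

	struct ring {
		enum ring_flags flags;
		int pidx, cidx;         /* producer / consumer indexes */
	};

	/* Producer (e.g. if_transmit): never drains, only hands off. */
	static void ring_enqueue(struct ring *r)
	{
		r->pidx++;
		if (r->flags == IDLE)
			r->flags = ABDICATED;   /* tell the task there is work */
	}

	/* Consumer (e.g. _task_fn_tx via ifmp_ring_check_drainage). */
	static void ring_check_drainage(struct ring *r)
	{
		if (r->flags != STALLED && r->flags != ABDICATED)
			return;                 /* someone else is consuming */
		while (r->cidx != r->pidx)
			r->cidx++;              /* "transmit" one entry */
		r->flags = IDLE;
	}

	int main(void)
	{
		struct ring r = { IDLE, 0, 0 };
		ring_enqueue(&r);
		ring_enqueue(&r);
		ring_check_drainage(&r);
		printf("cidx=%d pidx=%d idle=%d\n", r.cidx, r.pidx, r.flags == IDLE);
		return 0;
	}

The upshot is that transmit completion work always runs in the grouptask's context, which is what makes the per-queue CPU pinning elsewhere in this patch meaningful.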
Index: sys/sys/_task.h
===================================================================
--- sys/sys/_task.h
+++ sys/sys/_task.h
@@ -65,7 +65,8 @@
 	void	*gt_taskqueue;
 	LIST_ENTRY(grouptask)	gt_list;
 	void	*gt_uniq;
-	char	*gt_name;
+#define	GROUPTASK_NAMELEN	32
+	char	gt_name[GROUPTASK_NAMELEN];
 	int16_t	gt_irq;
 	int16_t	gt_cpu;
 };
Index: sys/sys/cpuset.h
===================================================================
--- sys/sys/cpuset.h
+++ sys/sys/cpuset.h
@@ -83,6 +83,8 @@
 #define	CPU_WHICH_IRQ		4	/* Specifies an irq #. */
 #define	CPU_WHICH_JAIL		5	/* Specifies a jail id. */
 #define	CPU_WHICH_DOMAIN	6	/* Specifies a NUMA domain id. */
+#define	CPU_WHICH_INTRHANDLER	7	/* Specifies an irq # (not ithread). */
+#define	CPU_WHICH_ITHREAD	8	/* Specifies an irq's ithread. */
 
 /*
  * Reserved cpuset identifiers.
Index: sys/sys/interrupt.h
===================================================================
--- sys/sys/interrupt.h
+++ sys/sys/interrupt.h
@@ -162,6 +162,8 @@
 	    driver_filter_t filter, driver_intr_t handler, void *arg,
 	    u_char pri, enum intr_type flags, void **cookiep);
 int	intr_event_bind(struct intr_event *ie, int cpu);
+int	intr_event_bind_irqonly(struct intr_event *ie, int cpu);
+int	intr_event_bind_ithread(struct intr_event *ie, int cpu);
 int	intr_event_create(struct intr_event **event, void *source,
 	    int flags, int irq, void (*pre_ithread)(void *),
 	    void (*post_ithread)(void *), void (*post_filter)(void *),
@@ -173,9 +175,9 @@
 void	intr_event_execute_handlers(struct proc *p, struct intr_event *ie);
 int	intr_event_handle(struct intr_event *ie, struct trapframe *frame);
 int	intr_event_remove_handler(void *cookie);
-int	intr_getaffinity(int irq, void *mask);
+int	intr_getaffinity(int irq, int mode, void *mask);
 void	*intr_handler_source(void *cookie);
-int	intr_setaffinity(int irq, void *mask);
+int	intr_setaffinity(int irq, int mode, void *mask);
 void	_intr_drain(int irq);	/* Linux compat only. */
 int	swi_add(struct intr_event **eventp, const char *name,
 	    driver_intr_t handler, void *arg, int pri, enum intr_type flags,
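With the two new CPU_WHICH_* values and the mode argument on intr_setaffinity()/intr_getaffinity() above, the filter and the ithread of a single IRQ can be pinned independently. A sketch of driving this from userland through cpuset_setaffinity(2) (FreeBSD-specific; the IRQ number is an arbitrary example and the calls require root):

	#include <sys/param.h>
	#include <sys/cpuset.h>
	#include <err.h>

	int
	main(void)
	{
		cpuset_t mask;
		id_t irq = 16;          /* hypothetical IRQ number */

		/* Pin the interrupt filter (handler) itself to CPU 0. */
		CPU_ZERO(&mask);
		CPU_SET(0, &mask);
		if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_INTRHANDLER,
		    irq, sizeof(mask), &mask) != 0)
			err(1, "CPU_WHICH_INTRHANDLER");

		/* Pin the IRQ's ithread to CPU 1, independently of the filter. */
		CPU_ZERO(&mask);
		CPU_SET(1, &mask);
		if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_ITHREAD,
		    irq, sizeof(mask), &mask) != 0)
			err(1, "CPU_WHICH_ITHREAD");

		return (0);
	}

The existing CPU_WHICH_IRQ value keeps its old behavior of binding both; the new values simply split that operation in two.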