Index: sys/dev/cxgbe/adapter.h =================================================================== --- sys/dev/cxgbe/adapter.h +++ sys/dev/cxgbe/adapter.h @@ -550,6 +550,23 @@ struct mp_ring; +struct txpkts { + uint8_t wr_type; /* type 0 or type 1 */ + uint8_t npkt; /* # of packets in this work request */ + uint8_t len16; /* # of 16B pieces used by this work request */ + uint8_t score; /* 1-10. coalescing attempted if score > 3 */ + uint8_t max_npkt; /* maximum number of packets allowed */ + uint16_t plen; /* total payload (sum of all packets) */ + + /* straight from fw_eth_tx_pkts_vm_wr. */ + __u8 ethmacdst[6]; + __u8 ethmacsrc[6]; + __be16 ethtype; + __be16 vlantci; + + struct mbuf *mb[15]; +}; + /* txq: SGE egress queue + what's needed for Ethernet NIC */ struct sge_txq { struct sge_eq eq; /* MUST be first */ @@ -560,6 +577,7 @@ struct sglist *gl; __be32 cpl_ctrl0; /* for convenience */ int tc_idx; /* traffic class */ + struct txpkts txp; struct task tx_reclaim_task; /* stats for common events first */ Index: sys/dev/cxgbe/common/common.h =================================================================== --- sys/dev/cxgbe/common/common.h +++ sys/dev/cxgbe/common/common.h @@ -389,6 +389,7 @@ bool ulptx_memwrite_dsgl; /* use of T5 DSGL allowed */ bool fr_nsmr_tpte_wr_support; /* FW support for FR_NSMR_TPTE_WR */ bool viid_smt_extn_support; /* FW returns vin, vfvld & smt index? */ + unsigned int max_pkts_per_eth_tx_pkts_wr; }; #define CHELSIO_T4 0x4 Index: sys/dev/cxgbe/t4_main.c =================================================================== --- sys/dev/cxgbe/t4_main.c +++ sys/dev/cxgbe/t4_main.c @@ -2191,7 +2191,7 @@ vi->rsrv_noflowq); items[0] = m; - rc = mp_ring_enqueue(txq->r, items, 1, 4096); + rc = mp_ring_enqueue(txq->r, items, 1, 256); if (__predict_false(rc != 0)) m_freem(m); @@ -2212,7 +2212,7 @@ txq->eq.flags |= EQ_QFLUSH; TXQ_UNLOCK(txq); while (!mp_ring_is_idle(txq->r)) { - mp_ring_check_drainage(txq->r, 0); + mp_ring_check_drainage(txq->r, 4096); pause("qflush", 1); } TXQ_LOCK(txq); @@ -2261,7 +2261,7 @@ struct sge_txq *txq; for_each_txq(vi, i, txq) - drops += counter_u64_fetch(txq->r->drops); + drops += counter_u64_fetch(txq->r->dropped); } return (drops); @@ -2326,7 +2326,7 @@ struct sge_txq *txq; for_each_txq(vi, i, txq) - drops += counter_u64_fetch(txq->r->drops); + drops += counter_u64_fetch(txq->r->dropped); } return (drops); @@ -4457,6 +4457,13 @@ else sc->params.fr_nsmr_tpte_wr_support = false; + param[0] = FW_PARAM_PFVF(MAX_PKTS_PER_ETH_TX_PKTS_WR); + rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); + if (rc == 0) + sc->params.max_pkts_per_eth_tx_pkts_wr = val[0]; + else + sc->params.max_pkts_per_eth_tx_pkts_wr = 15; + /* get capabilites */ bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | @@ -5965,7 +5972,7 @@ /* Wait for the mp_ring to empty. 
*/ while (!mp_ring_is_idle(txq->r)) { - mp_ring_check_drainage(txq->r, 0); + mp_ring_check_drainage(txq->r, 4096); pause("rquiesce", 1); } Index: sys/dev/cxgbe/t4_mp_ring.h =================================================================== --- sys/dev/cxgbe/t4_mp_ring.h +++ sys/dev/cxgbe/t4_mp_ring.h @@ -36,33 +36,38 @@ #endif struct mp_ring; -typedef u_int (*ring_drain_t)(struct mp_ring *, u_int, u_int); +typedef u_int (*ring_drain_t)(struct mp_ring *, u_int, u_int, bool *); typedef u_int (*ring_can_drain_t)(struct mp_ring *); struct mp_ring { volatile uint64_t state __aligned(CACHE_LINE_SIZE); + struct malloc_type * mt; int size __aligned(CACHE_LINE_SIZE); void * cookie; - struct malloc_type * mt; ring_drain_t drain; ring_can_drain_t can_drain; /* cheap, may be unreliable */ - counter_u64_t enqueues; - counter_u64_t drops; - counter_u64_t starts; - counter_u64_t stalls; - counter_u64_t restarts; /* recovered after stalling */ + struct mtx * cons_lock; + counter_u64_t dropped; + counter_u64_t consumer[4]; + counter_u64_t not_consumer; counter_u64_t abdications; + counter_u64_t consumed; + counter_u64_t cons_idle; + counter_u64_t cons_idle2; + counter_u64_t stalls; void * volatile items[] __aligned(CACHE_LINE_SIZE); }; int mp_ring_alloc(struct mp_ring **, int, void *, ring_drain_t, - ring_can_drain_t, struct malloc_type *, int); + ring_can_drain_t, struct malloc_type *, struct mtx *, int); void mp_ring_free(struct mp_ring *); int mp_ring_enqueue(struct mp_ring *, void **, int, int); void mp_ring_check_drainage(struct mp_ring *, int); void mp_ring_reset_stats(struct mp_ring *); -int mp_ring_is_idle(struct mp_ring *); +bool mp_ring_is_idle(struct mp_ring *); +void mp_ring_sysctls(struct mp_ring *, struct sysctl_ctx_list *, + struct sysctl_oid_list *); #endif Index: sys/dev/cxgbe/t4_mp_ring.c =================================================================== --- sys/dev/cxgbe/t4_mp_ring.c +++ sys/dev/cxgbe/t4_mp_ring.c @@ -34,6 +34,8 @@ #include #include #include +#include +#include #include #include "t4_mp_ring.h" @@ -43,6 +45,23 @@ #define atomic_cmpset_rel_64 atomic_cmpset_64 #endif +/* + * mp_ring handles multiple threads (producers) enqueueing data to a tx queue. + * The thread that is writing the hardware descriptors is the consumer and it + * runs with the consumer lock held. A producer becomes the consumer if there + * isn't one already. The consumer runs with the flags set to BUSY and + * consumes everything (IDLE or COALESCING) or gets STALLED. If it is running + * over its budget it sets flags to TOO_BUSY. A producer that observes a + * TOO_BUSY consumer will become the new consumer by setting flags to + * TAKING_OVER. The original consumer stops and sets the flags back to BUSY for + * the new consumer. + * + * COALESCING is the same as IDLE except there are items being held in the hope + * that they can be coalesced with items that follow. The driver must arrange + * for a tx update or some other event that transmits all the held items in a + * timely manner if nothing else is enqueued. + */ + union ring_state { struct { uint16_t pidx_head; @@ -54,13 +73,21 @@ }; enum { - IDLE = 0, /* consumer ran to completion, nothing more to do. */ + IDLE = 0, /* tx is all caught up, nothing to do. */ + COALESCING, /* IDLE, but tx frames are being held for coalescing */ BUSY, /* consumer is running already, or will be shortly. 
*/ + TOO_BUSY, /* consumer is running and is beyond its budget */ + TAKING_OVER, /* new consumer taking over from a TOO_BUSY consumer */ STALLED, /* consumer stopped due to lack of resources. */ - ABDICATED, /* consumer stopped even though there was work to be - done because it wants another thread to take over. */ }; +enum { + C_FAST = 0, + C_2, + C_3, + C_TAKEOVER, +}; + static inline uint16_t space_available(struct mp_ring *r, union ring_state s) { @@ -83,93 +110,104 @@ return (x > n ? idx + n : n - x); } -/* Consumer is about to update the ring's state to s */ -static inline uint16_t -state_to_flags(union ring_state s, int abdicate) -{ - - if (s.cidx == s.pidx_tail) - return (IDLE); - else if (abdicate && s.pidx_tail != s.pidx_head) - return (ABDICATED); - - return (BUSY); -} - /* - * Caller passes in a state, with a guarantee that there is work to do and that - * all items up to the pidx_tail in the state are visible. + * Consumer. Called with the consumer lock held and a guarantee that there is + * work to do. */ static void -drain_ring(struct mp_ring *r, union ring_state os, uint16_t prev, int budget) +drain_ring(struct mp_ring *r, int budget) { - union ring_state ns; + union ring_state os, ns; int n, pending, total; - uint16_t cidx = os.cidx; - uint16_t pidx = os.pidx_tail; + uint16_t cidx; + uint16_t pidx; + bool coalescing; + mtx_assert(r->cons_lock, MA_OWNED); + + os.state = atomic_load_acq_64(&r->state); MPASS(os.flags == BUSY); + + cidx = os.cidx; + pidx = os.pidx_tail; MPASS(cidx != pidx); - if (prev == IDLE) - counter_u64_add(r->starts, 1); pending = 0; total = 0; while (cidx != pidx) { /* Items from cidx to pidx are available for consumption. */ - n = r->drain(r, cidx, pidx); + n = r->drain(r, cidx, pidx, &coalescing); if (n == 0) { critical_enter(); os.state = r->state; do { ns.state = os.state; ns.cidx = cidx; - ns.flags = STALLED; + + MPASS(os.flags == BUSY || + os.flags == TOO_BUSY || + os.flags == TAKING_OVER); + + if (os.flags == TAKING_OVER) + ns.flags = BUSY; + else + ns.flags = STALLED; } while (atomic_fcmpset_64(&r->state, &os.state, ns.state) == 0); critical_exit(); - if (prev != STALLED) + if (os.flags == TAKING_OVER) + counter_u64_add(r->abdications, 1); + else if (ns.flags == STALLED) counter_u64_add(r->stalls, 1); - else if (total > 0) { - counter_u64_add(r->restarts, 1); - counter_u64_add(r->stalls, 1); - } break; } cidx = increment_idx(r, cidx, n); pending += n; total += n; + counter_u64_add(r->consumed, n); - /* - * We update the cidx only if we've caught up with the pidx, the - * real cidx is getting too far ahead of the one visible to - * everyone else, or we have exceeded our budget. - */ - if (cidx != pidx && pending < 64 && total < budget) - continue; - critical_enter(); - os.state = r->state; + os.state = atomic_load_acq_64(&r->state); do { + MPASS(os.flags == BUSY || os.flags == TOO_BUSY || + os.flags == TAKING_OVER); + ns.state = os.state; ns.cidx = cidx; - ns.flags = state_to_flags(ns, total >= budget); + if (__predict_false(os.flags == TAKING_OVER)) { + MPASS(total >= budget); + ns.flags = BUSY; + continue; + } + if (cidx == os.pidx_tail) { + ns.flags = coalescing ? 
COALESCING : IDLE; + continue; + } + if (total >= budget) { + ns.flags = TOO_BUSY; + continue; + } + MPASS(os.flags == BUSY); + if (pending < 32) + break; } while (atomic_fcmpset_acq_64(&r->state, &os.state, ns.state) == 0); - critical_exit(); - if (ns.flags == ABDICATED) + if (__predict_false(os.flags == TAKING_OVER)) { + MPASS(ns.flags == BUSY); counter_u64_add(r->abdications, 1); - if (ns.flags != BUSY) { - /* Wrong loop exit if we're going to stall. */ - MPASS(ns.flags != STALLED); - if (prev == STALLED) { - MPASS(total > 0); - counter_u64_add(r->restarts, 1); - } break; } + if (ns.flags == IDLE || ns.flags == COALESCING) { + MPASS(ns.pidx_tail == cidx); + if (ns.pidx_head != ns.pidx_tail) + counter_u64_add(r->cons_idle2, 1); + else + counter_u64_add(r->cons_idle, 1); + break; + } + /* * The acquire style atomic above guarantees visibility of items * associated with any pidx change that we notice here. @@ -177,13 +215,55 @@ pidx = ns.pidx_tail; pending = 0; } + +#ifdef INVARIANTS + if (os.flags == TAKING_OVER) + MPASS(ns.flags == BUSY); + else { + MPASS(ns.flags == IDLE || ns.flags == COALESCING || + ns.flags == STALLED); + } +#endif } +static void +drain_txpkts(struct mp_ring *r, union ring_state os, int budget) +{ + union ring_state ns; + uint16_t cidx = os.cidx; + uint16_t pidx = os.pidx_tail; + bool coalescing; + + mtx_assert(r->cons_lock, MA_OWNED); + MPASS(os.flags == BUSY); + MPASS(cidx == pidx); + + r->drain(r, cidx, pidx, &coalescing); + MPASS(coalescing == false); + critical_enter(); + os.state = r->state; + do { + ns.state = os.state; + MPASS(os.flags == BUSY); + MPASS(os.cidx == cidx); + if (ns.cidx == ns.pidx_tail) + ns.flags = IDLE; + else + ns.flags = BUSY; + } while (atomic_fcmpset_acq_64(&r->state, &os.state, ns.state) == 0); + critical_exit(); + + if (ns.flags == BUSY) + drain_ring(r, budget); +} + int mp_ring_alloc(struct mp_ring **pr, int size, void *cookie, ring_drain_t drain, - ring_can_drain_t can_drain, struct malloc_type *mt, int flags) + ring_can_drain_t can_drain, struct malloc_type *mt, struct mtx *lck, + int flags) { struct mp_ring *r; + int i; /* All idx are 16b so size can be 65536 at most */ if (pr == NULL || size < 2 || size > 65536 || drain == NULL || @@ -201,43 +281,59 @@ r->mt = mt; r->drain = drain; r->can_drain = can_drain; - r->enqueues = counter_u64_alloc(flags); - r->drops = counter_u64_alloc(flags); - r->starts = counter_u64_alloc(flags); - r->stalls = counter_u64_alloc(flags); - r->restarts = counter_u64_alloc(flags); - r->abdications = counter_u64_alloc(flags); - if (r->enqueues == NULL || r->drops == NULL || r->starts == NULL || - r->stalls == NULL || r->restarts == NULL || - r->abdications == NULL) { - mp_ring_free(r); - return (ENOMEM); + r->cons_lock = lck; + if ((r->dropped = counter_u64_alloc(flags)) == NULL) + goto failed; + for (i = 0; i < nitems(r->consumer); i++) { + if ((r->consumer[i] = counter_u64_alloc(flags)) == NULL) + goto failed; } - + if ((r->not_consumer = counter_u64_alloc(flags)) == NULL) + goto failed; + if ((r->abdications = counter_u64_alloc(flags)) == NULL) + goto failed; + if ((r->stalls = counter_u64_alloc(flags)) == NULL) + goto failed; + if ((r->consumed = counter_u64_alloc(flags)) == NULL) + goto failed; + if ((r->cons_idle = counter_u64_alloc(flags)) == NULL) + goto failed; + if ((r->cons_idle2 = counter_u64_alloc(flags)) == NULL) + goto failed; *pr = r; return (0); +failed: + mp_ring_free(r); + return (ENOMEM); } void mp_ring_free(struct mp_ring *r) { + int i; if (r == NULL) return; - if (r->enqueues != NULL) 
- counter_u64_free(r->enqueues); - if (r->drops != NULL) - counter_u64_free(r->drops); - if (r->starts != NULL) - counter_u64_free(r->starts); - if (r->stalls != NULL) - counter_u64_free(r->stalls); - if (r->restarts != NULL) - counter_u64_free(r->restarts); + if (r->dropped != NULL) + counter_u64_free(r->dropped); + for (i = 0; i < nitems(r->consumer); i++) { + if (r->consumer[i] != NULL) + counter_u64_free(r->consumer[i]); + } + if (r->not_consumer != NULL) + counter_u64_free(r->not_consumer); if (r->abdications != NULL) counter_u64_free(r->abdications); + if (r->stalls != NULL) + counter_u64_free(r->stalls); + if (r->consumed != NULL) + counter_u64_free(r->consumed); + if (r->cons_idle != NULL) + counter_u64_free(r->cons_idle); + if (r->cons_idle2 != NULL) + counter_u64_free(r->cons_idle2); free(r, r->mt); } @@ -252,7 +348,8 @@ { union ring_state os, ns; uint16_t pidx_start, pidx_stop; - int i; + int i, nospc, cons; + bool consumer; MPASS(items != NULL); MPASS(n > 0); @@ -261,26 +358,70 @@ * Reserve room for the new items. Our reservation, if successful, is * from 'pidx_start' to 'pidx_stop'. */ + nospc = 0; os.state = r->state; for (;;) { - if (n >= space_available(r, os)) { - counter_u64_add(r->drops, n); + for (;;) { + if (__predict_true(space_available(r, os) >= n)) + break; + + /* Not enough room in the ring. */ + MPASS(os.flags != IDLE); + MPASS(os.flags != COALESCING); + if (__predict_false(++nospc > 100)) { + counter_u64_add(r->dropped, n); + return (ENOBUFS); + } if (os.flags == STALLED) - mp_ring_check_drainage(r, 0); - return (ENOBUFS); + mp_ring_check_drainage(r, 64); + else + cpu_spinwait(); + os.state = r->state; } + + /* There is room in the ring. */ + + cons = -1; ns.state = os.state; ns.pidx_head = increment_idx(r, os.pidx_head, n); + if (os.flags == IDLE || os.flags == COALESCING) { + MPASS(os.pidx_tail == os.cidx); + if (os.pidx_head == os.pidx_tail) { + cons = C_FAST; + ns.pidx_tail = increment_idx(r, os.pidx_tail, n); + } else + cons = C_2; + ns.flags = BUSY; + } else if (os.flags == TOO_BUSY) { + cons = C_TAKEOVER; + ns.flags = TAKING_OVER; + } critical_enter(); if (atomic_fcmpset_64(&r->state, &os.state, ns.state)) break; critical_exit(); cpu_spinwait(); - } + }; + pidx_start = os.pidx_head; pidx_stop = ns.pidx_head; + if (cons == C_FAST) { + i = pidx_start; + do { + r->items[i] = *items++; + if (__predict_false(++i == r->size)) + i = 0; + } while (i != pidx_stop); + critical_exit(); + counter_u64_add(r->consumer[C_FAST], 1); + mtx_lock(r->cons_lock); + drain_ring(r, budget); + mtx_unlock(r->cons_lock); + return (0); + } + /* * Wait for other producers who got in ahead of us to enqueue their * items, one producer at a time. It is our turn when the ring's @@ -305,19 +446,31 @@ */ os.state = r->state; do { + consumer = false; ns.state = os.state; ns.pidx_tail = pidx_stop; - ns.flags = BUSY; + if (os.flags == IDLE || os.flags == COALESCING || + (os.flags == STALLED && r->can_drain(r))) { + MPASS(cons == -1); + consumer = true; + ns.flags = BUSY; + } } while (atomic_fcmpset_rel_64(&r->state, &os.state, ns.state) == 0); critical_exit(); - counter_u64_add(r->enqueues, n); - /* - * Turn into a consumer if some other thread isn't active as a consumer - * already. 
- */ - if (os.flags != BUSY) - drain_ring(r, ns, os.flags, budget); + if (cons == -1) { + if (consumer) + cons = C_3; + else { + counter_u64_add(r->not_consumer, 1); + return (0); + } + } + MPASS(cons > C_FAST && cons < nitems(r->consumer)); + counter_u64_add(r->consumer[cons], 1); + mtx_lock(r->cons_lock); + drain_ring(r, budget); + mtx_unlock(r->cons_lock); return (0); } @@ -328,37 +481,44 @@ union ring_state os, ns; os.state = r->state; - if (os.flags != STALLED || os.pidx_head != os.pidx_tail || - r->can_drain(r) == 0) - return; - - MPASS(os.cidx != os.pidx_tail); /* implied by STALLED */ - ns.state = os.state; - ns.flags = BUSY; - - /* - * The acquire style atomic guarantees visibility of items associated - * with the pidx that we read here. - */ - if (!atomic_cmpset_acq_64(&r->state, os.state, ns.state)) - return; - - drain_ring(r, ns, os.flags, budget); + if (os.flags == STALLED && r->can_drain(r)) { + MPASS(os.cidx != os.pidx_tail); /* implied by STALLED */ + ns.state = os.state; + ns.flags = BUSY; + if (atomic_cmpset_acq_64(&r->state, os.state, ns.state)) { + mtx_lock(r->cons_lock); + drain_ring(r, budget); + mtx_unlock(r->cons_lock); + } + } else if (os.flags == COALESCING) { + MPASS(os.cidx == os.pidx_tail); + ns.state = os.state; + ns.flags = BUSY; + if (atomic_cmpset_acq_64(&r->state, os.state, ns.state)) { + mtx_lock(r->cons_lock); + drain_txpkts(r, ns, budget); + mtx_unlock(r->cons_lock); + } + } } void mp_ring_reset_stats(struct mp_ring *r) { + int i; - counter_u64_zero(r->enqueues); - counter_u64_zero(r->drops); - counter_u64_zero(r->starts); - counter_u64_zero(r->stalls); - counter_u64_zero(r->restarts); + counter_u64_zero(r->dropped); + for (i = 0; i < nitems(r->consumer); i++) + counter_u64_zero(r->consumer[i]); + counter_u64_zero(r->not_consumer); counter_u64_zero(r->abdications); + counter_u64_zero(r->stalls); + counter_u64_zero(r->consumed); + counter_u64_zero(r->cons_idle); + counter_u64_zero(r->cons_idle2); } -int +bool mp_ring_is_idle(struct mp_ring *r) { union ring_state s; @@ -366,7 +526,50 @@ s.state = r->state; if (s.pidx_head == s.pidx_tail && s.pidx_tail == s.cidx && s.flags == IDLE) - return (1); + return (true); - return (0); + return (false); +} + +void +mp_ring_sysctls(struct mp_ring *r, struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *children) +{ + struct sysctl_oid *oid; + + oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "mp_ring", CTLFLAG_RD | + CTLFLAG_MPSAFE, NULL, "mp_ring statistics"); + children = SYSCTL_CHILDREN(oid); + + SYSCTL_ADD_U64(ctx, children, OID_AUTO, "state", CTLFLAG_RD, + __DEVOLATILE(uint64_t *, &r->state), 0, "ring state"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "dropped", CTLFLAG_RD, + &r->dropped, "# of items dropped"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "consumed", + CTLFLAG_RD, &r->consumed, "# of items consumed"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "fast_consumer", + CTLFLAG_RD, &r->consumer[C_FAST], + "# of times producer became consumer (fast)"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "consumer2", + CTLFLAG_RD, &r->consumer[C_2], + "# of times producer became consumer (2)"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "consumer3", + CTLFLAG_RD, &r->consumer[C_3], + "# of times producer became consumer (3)"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "takeovers", + CTLFLAG_RD, &r->consumer[C_TAKEOVER], + "# of times producer took over from another consumer."); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "not_consumer", + CTLFLAG_RD, 
&r->not_consumer, + "# of times producer did not become consumer"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "abdications", + CTLFLAG_RD, &r->abdications, "# of consumer abdications"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "stalls", + CTLFLAG_RD, &r->stalls, "# of consumer stalls"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "cons_idle", + CTLFLAG_RD, &r->cons_idle, + "# of times consumer ran fully to completion"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "cons_idle2", + CTLFLAG_RD, &r->cons_idle2, + "# of times consumer idled when another enqueue was in progress"); } Index: sys/dev/cxgbe/t4_sge.c =================================================================== --- sys/dev/cxgbe/t4_sge.c +++ sys/dev/cxgbe/t4_sge.c @@ -203,19 +203,6 @@ SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0, "Enable presorting of LRO frames"); -struct txpkts { - u_int wr_type; /* type 0 or type 1 */ - u_int npkt; /* # of packets in this work request */ - u_int plen; /* total payload (sum of all packets) */ - u_int len16; /* # of 16B pieces used by this work request */ -}; - -/* A packet's SGL. This + m_pkthdr has all info needed for tx */ -struct sgl { - struct sglist sg; - struct sglist_seg seg[TX_SGL_SEGS]; -}; - static int service_iq(struct sge_iq *, int); static int service_iq_fl(struct sge_iq *, int); static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t); @@ -284,14 +271,16 @@ static inline u_int txpkts0_len16(u_int); static inline u_int txpkts1_len16(void); static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int); -static u_int write_txpkt_wr(struct adapter *, struct sge_txq *, - struct fw_eth_tx_pkt_wr *, struct mbuf *, u_int); +static u_int write_txpkt_wr(struct adapter *, struct sge_txq *, struct mbuf *, + u_int); static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *, - struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int); -static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int); -static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int); -static u_int write_txpkts_wr(struct adapter *, struct sge_txq *, - struct fw_eth_tx_pkts_wr *, struct mbuf *, const struct txpkts *, u_int); + struct mbuf *); +static int add_to_txpkts_vf(struct adapter *, struct sge_txq *, struct mbuf *, + int, bool *); +static int add_to_txpkts_pf(struct adapter *, struct sge_txq *, struct mbuf *, + int, bool *); +static u_int write_txpkts_wr(struct adapter *, struct sge_txq *); +static u_int write_txpkts_vm_wr(struct adapter *, struct sge_txq *); static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int); static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int); static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int); @@ -2839,7 +2828,7 @@ return (total_available_tx_desc(eq) > eq->sidx / 8); } -static inline int +static inline bool cannot_use_txpkts(struct mbuf *m) { /* maybe put a GL limit too, to avoid silliness? 
*/ @@ -2855,8 +2844,9 @@ } static inline int -wr_can_update_eq(struct fw_eth_tx_pkts_wr *wr) +wr_can_update_eq(void *p) { + struct fw_eth_tx_pkts_wr *wr = p; switch (G_FW_WR_OP(be32toh(wr->op_pkd))) { case FW_ULPTX_WR: @@ -2864,159 +2854,232 @@ case FW_ETH_TX_PKTS_WR: case FW_ETH_TX_PKTS2_WR: case FW_ETH_TX_PKT_VM_WR: + case FW_ETH_TX_PKTS_VM_WR: return (1); default: return (0); } } +static inline void +set_txupdate_flags(struct sge_txq *txq, u_int avail, + struct fw_eth_tx_pkt_wr *wr) +{ + struct sge_eq *eq = &txq->eq; + struct txpkts *txp = &txq->txp; + + if ((txp->npkt > 0 || avail < eq->sidx / 2) && + atomic_cmpset_int(&eq->equiq, 0, 1)) { + wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | F_FW_WR_EQUIQ); + eq->equeqidx = eq->pidx; + } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { + wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); + eq->equeqidx = eq->pidx; + } +} + /* * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to * be consumed. Return the actual number consumed. 0 indicates a stall. */ static u_int -eth_tx(struct mp_ring *r, u_int cidx, u_int pidx) +eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing) { struct sge_txq *txq = r->cookie; - struct sge_eq *eq = &txq->eq; struct ifnet *ifp = txq->ifp; + struct sge_eq *eq = &txq->eq; + struct txpkts *txp = &txq->txp; struct vi_info *vi = ifp->if_softc; struct adapter *sc = vi->adapter; u_int total, remaining; /* # of packets */ - u_int available, dbdiff; /* # of hardware descriptors */ - u_int n, next_cidx; - struct mbuf *m0, *tail; - struct txpkts txp; - struct fw_eth_tx_pkts_wr *wr; /* any fw WR struct will do */ + u_int n, avail, dbdiff; /* # of hardware descriptors */ + int i, rc; + struct mbuf *m0; + bool snd; + void *wr; /* start of the last WR written to the ring */ - remaining = IDXDIFF(pidx, cidx, r->size); - MPASS(remaining > 0); /* Must not be called without work to do. */ - total = 0; + TXQ_LOCK_ASSERT_OWNED(txq); - TXQ_LOCK(txq); + remaining = IDXDIFF(pidx, cidx, r->size); if (__predict_false(discard_tx(eq))) { + for (i = 0; i < txp->npkt; i++) + m_freem(txp->mb[i]); + txp->npkt = 0; while (cidx != pidx) { m0 = r->items[cidx]; m_freem(m0); if (++cidx == r->size) cidx = 0; } - reclaim_tx_descs(txq, 2048); - total = remaining; - goto done; + reclaim_tx_descs(txq, eq->sidx); + *coalescing = false; + return (remaining); /* emptied */ } /* How many hardware descriptors do we have readily available. */ - if (eq->pidx == eq->cidx) - available = eq->sidx - 1; - else - available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; - dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx); + if (eq->pidx == eq->cidx) { + avail = eq->sidx - 1; + if (txp->score++ >= 5) + txp->score = 5; /* tx is completely idle, reset. 
*/ + } else + avail = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; - while (remaining > 0) { + total = 0; + if (remaining == 0) { + if (txp->score-- == 1) /* egr_update had to drain txpkts */ + txp->score = 1; + goto send_txpkts; + } + dbdiff = 0; + MPASS(remaining > 0); + while (remaining > 0) { m0 = r->items[cidx]; M_ASSERTPKTHDR(m0); MPASS(m0->m_nextpkt == NULL); - if (available < tx_len16_to_desc(mbuf_len16(m0))) { - available += reclaim_tx_descs(txq, 64); - if (available < tx_len16_to_desc(mbuf_len16(m0))) - break; /* out of descriptors */ + if (avail < 2 * SGE_MAX_WR_NDESC) + avail += reclaim_tx_descs(txq, 64); + + if (txp->npkt > 0 || remaining > 1 || txp->score > 3 || + atomic_load_int(&txq->eq.equiq) != 0) { + if (sc->flags & IS_VF) + rc = add_to_txpkts_vf(sc, txq, m0, avail, &snd); + else + rc = add_to_txpkts_pf(sc, txq, m0, avail, &snd); + } else { + snd = false; + rc = EINVAL; } + if (snd) { + MPASS(txp->npkt > 0); + for (i = 0; i < txp->npkt; i++) + ETHER_BPF_MTAP(ifp, txp->mb[i]); + if (txp->npkt > 1) { + if (txp->score++ >= 10) + txp->score = 10; + MPASS(avail >= tx_len16_to_desc(txp->len16)); + if (sc->flags & IS_VF) + n = write_txpkts_vm_wr(sc, txq); + else + n = write_txpkts_wr(sc, txq); + } else { + MPASS(avail >= + tx_len16_to_desc(mbuf_len16(txp->mb[0]))); + if (sc->flags & IS_VF) + n = write_txpkt_vm_wr(sc, txq, + txp->mb[0]); + else + n = write_txpkt_wr(sc, txq, txp->mb[0], + avail); + } + MPASS(n <= SGE_MAX_WR_NDESC); + avail -= n; + dbdiff += n; + wr = &eq->desc[eq->pidx]; + IDXINCR(eq->pidx, n, eq->sidx); + txp->npkt = 0; /* emptied */ + } + if (rc == 0) { + /* m0 was coalesced into txq->txpkts. */ + goto next_mbuf; + } + if (rc == EAGAIN) { + /* + * m0 is suitable for tx coalescing but could not be + * combined with the existing txq->txpkts, which has now + * been transmitted. Start a new txpkts with m0. + */ + MPASS(snd); + MPASS(txp->npkt == 0); + continue; + } - next_cidx = cidx + 1; - if (__predict_false(next_cidx == r->size)) - next_cidx = 0; - - wr = (void *)&eq->desc[eq->pidx]; + MPASS(rc != 0 && rc != EAGAIN); + MPASS(txp->npkt == 0); + wr = &eq->desc[eq->pidx]; if (mbuf_cflags(m0) & MC_RAW_WR) { - total++; - remaining--; - n = write_raw_wr(txq, (void *)wr, m0, available); + n = write_raw_wr(txq, wr, m0, avail); #ifdef KERN_TLS } else if (mbuf_cflags(m0) & MC_TLS) { - total++; - remaining--; ETHER_BPF_MTAP(ifp, m0); - n = t6_ktls_write_wr(txq,(void *)wr, m0, - mbuf_nsegs(m0), available); + n = t6_ktls_write_wr(txq, wr, m0, mbuf_nsegs(m0), + avail); #endif - } else if (sc->flags & IS_VF) { - total++; - remaining--; - ETHER_BPF_MTAP(ifp, m0); - n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0, - available); - } else if (remaining > 1 && - try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) { - - /* pkts at cidx, next_cidx should both be in txp. 
*/ - MPASS(txp.npkt == 2); - tail = r->items[next_cidx]; - MPASS(tail->m_nextpkt == NULL); - ETHER_BPF_MTAP(ifp, m0); - ETHER_BPF_MTAP(ifp, tail); - m0->m_nextpkt = tail; - - if (__predict_false(++next_cidx == r->size)) - next_cidx = 0; - - while (next_cidx != pidx) { - if (add_to_txpkts(r->items[next_cidx], &txp, - available) != 0) - break; - tail->m_nextpkt = r->items[next_cidx]; - tail = tail->m_nextpkt; - ETHER_BPF_MTAP(ifp, tail); - if (__predict_false(++next_cidx == r->size)) - next_cidx = 0; - } - - n = write_txpkts_wr(sc, txq, wr, m0, &txp, available); - total += txp.npkt; - remaining -= txp.npkt; } else { - total++; - remaining--; - ETHER_BPF_MTAP(ifp, m0); - n = write_txpkt_wr(sc, txq, (void *)wr, m0, available); + n = tx_len16_to_desc(mbuf_len16(m0)); + if (__predict_false(avail < n)) { + avail += reclaim_tx_descs(txq, 32); + if (avail < n) + break; /* out of descriptors */ + } + if (sc->flags & IS_VF) + n = write_txpkt_vm_wr(sc, txq, m0); + else + n = write_txpkt_wr(sc, txq, m0, avail); } - MPASS(n >= 1 && n <= available); + MPASS(n >= 1 && n <= avail); if (!(mbuf_cflags(m0) & MC_TLS)) MPASS(n <= SGE_MAX_WR_NDESC); - available -= n; + avail -= n; dbdiff += n; IDXINCR(eq->pidx, n, eq->sidx); - if (wr_can_update_eq(wr)) { - if (total_available_tx_desc(eq) < eq->sidx / 4 && - atomic_cmpset_int(&eq->equiq, 0, 1)) { - wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | - F_FW_WR_EQUEQ); - eq->equeqidx = eq->pidx; - } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= - 32) { - wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); - eq->equeqidx = eq->pidx; - } - } - - if (dbdiff >= 16 && remaining >= 4) { + if (dbdiff >= 512 / EQ_ESIZE) { /* X_FETCHBURSTMAX_512B */ + if (wr_can_update_eq(wr)) + set_txupdate_flags(txq, avail, wr); ring_eq_db(sc, eq, dbdiff); - available += reclaim_tx_descs(txq, 4 * dbdiff); + avail += reclaim_tx_descs(txq, 32); dbdiff = 0; } - - cidx = next_cidx; +next_mbuf: + total++; + remaining--; + if (__predict_false(++cidx == r->size)) + cidx = 0; } if (dbdiff != 0) { + if (wr_can_update_eq(wr)) + set_txupdate_flags(txq, avail, wr); ring_eq_db(sc, eq, dbdiff); reclaim_tx_descs(txq, 32); + } else if (eq->pidx == eq->cidx && txp->npkt > 0 && + atomic_load_int(&txq->eq.equiq) == 0) { + /* + * If nothing was submitted to the chip for tx (it was coalesced + * into txpkts instead) and there is no tx update outstanding + * then we need to send txpkts now. 
+ */ +send_txpkts: + MPASS(txp->npkt > 0); + for (i = 0; i < txp->npkt; i++) + ETHER_BPF_MTAP(ifp, txp->mb[i]); + if (txp->npkt > 1) { + MPASS(avail >= tx_len16_to_desc(txp->len16)); + if (sc->flags & IS_VF) + n = write_txpkts_vm_wr(sc, txq); + else + n = write_txpkts_wr(sc, txq); + } else { + MPASS(avail >= + tx_len16_to_desc(mbuf_len16(txp->mb[0]))); + if (sc->flags & IS_VF) + n = write_txpkt_vm_wr(sc, txq, txp->mb[0]); + else + n = write_txpkt_wr(sc, txq, txp->mb[0], avail); + } + MPASS(n <= SGE_MAX_WR_NDESC); + wr = &eq->desc[eq->pidx]; + IDXINCR(eq->pidx, n, eq->sidx); + txp->npkt = 0; /* emptied */ + + MPASS(wr_can_update_eq(wr)); + set_txupdate_flags(txq, avail - n, wr); + ring_eq_db(sc, eq, n); + reclaim_tx_descs(txq, 32); } -done: - TXQ_UNLOCK(txq); + *coalescing = txp->npkt > 0; return (total); } @@ -4106,11 +4169,12 @@ struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct sge_eq *eq = &txq->eq; + struct txpkts *txp; char name[16]; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx, - M_CXGBE, M_WAITOK); + M_CXGBE, &eq->eq_lock, M_WAITOK); if (rc != 0) { device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc); return (rc); @@ -4147,6 +4211,12 @@ txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE, M_ZERO | M_WAITOK); + txp = &txq->txp; + txp->score = 5; + MPASS(nitems(txp->mb) >= sc->params.max_pkts_per_eth_tx_pkts_wr); + txq->txp.max_npkt = min(nitems(txp->mb), + sc->params.max_pkts_per_eth_tx_pkts_wr); + snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queue"); @@ -4242,26 +4312,8 @@ "# of NIC TLS sessions using AES-GCM"); } #endif + mp_ring_sysctls(txq->r, &vi->ctx, children); - SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues", - CTLFLAG_RD, &txq->r->enqueues, - "# of enqueues to the mp_ring for this queue"); - SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops", - CTLFLAG_RD, &txq->r->drops, - "# of drops in the mp_ring for this queue"); - SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts", - CTLFLAG_RD, &txq->r->starts, - "# of normal consumer starts in the mp_ring for this queue"); - SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls", - CTLFLAG_RD, &txq->r->stalls, - "# of consumer stalls in the mp_ring for this queue"); - SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts", - CTLFLAG_RD, &txq->r->restarts, - "# of consumer restarts in the mp_ring for this queue"); - SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications", - CTLFLAG_RD, &txq->r->abdications, - "# of consumer abdications in the mp_ring for this queue"); - return (0); } @@ -4655,10 +4707,10 @@ * The return value is the # of hardware descriptors used. 
*/ static u_int -write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq, - struct fw_eth_tx_pkt_vm_wr *wr, struct mbuf *m0, u_int available) +write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0) { - struct sge_eq *eq = &txq->eq; + struct sge_eq *eq; + struct fw_eth_tx_pkt_vm_wr *wr; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ @@ -4668,7 +4720,6 @@ TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m0); - MPASS(available > 0 && available < eq->sidx); len16 = mbuf_len16(m0); nsegs = mbuf_nsegs(m0); @@ -4677,10 +4728,10 @@ if (needs_tso(m0)) ctrl += sizeof(struct cpl_tx_pkt_lso_core); ndesc = tx_len16_to_desc(len16); - MPASS(ndesc <= available); /* Firmware work request header */ - MPASS(wr == (void *)&eq->desc[eq->pidx]); + eq = &txq->eq; + wr = (void *)&eq->desc[eq->pidx]; wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); @@ -4760,7 +4811,6 @@ } else write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx); txq->sgl_wrs++; - txq->txpkt_wrs++; txsd = &txq->sdesc[eq->pidx]; @@ -4811,10 +4861,11 @@ * The return value is the # of hardware descriptors used. */ static u_int -write_txpkt_wr(struct adapter *sc, struct sge_txq *txq, - struct fw_eth_tx_pkt_wr *wr, struct mbuf *m0, u_int available) +write_txpkt_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0, + u_int available) { - struct sge_eq *eq = &txq->eq; + struct sge_eq *eq; + struct fw_eth_tx_pkt_wr *wr; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ @@ -4824,7 +4875,6 @@ TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m0); - MPASS(available > 0 && available < eq->sidx); len16 = mbuf_len16(m0); nsegs = mbuf_nsegs(m0); @@ -4844,7 +4894,8 @@ MPASS(ndesc <= available); /* Firmware work request header */ - MPASS(wr == (void *)&eq->desc[eq->pidx]); + eq = &txq->eq; + wr = (void *)&eq->desc[eq->pidx]; wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); @@ -4927,71 +4978,151 @@ return (ndesc); } +static inline bool +cmp_l2hdr(struct txpkts *txp, struct mbuf *m) +{ + int len; + + MPASS(txp->npkt > 0); + MPASS(m->m_len >= 16); /* type1 implies 1 GL with all of the frame. */ + + if (txp->ethtype == be16toh(ETHERTYPE_VLAN)) + len = sizeof(struct ether_vlan_header); + else + len = sizeof(struct ether_header); + + return (memcmp(m->m_data, &txp->ethmacdst[0], len) != 0); +} + +static inline void +save_l2hdr(struct txpkts *txp, struct mbuf *m) +{ + MPASS(m->m_len >= 16); /* type1 implies 1 GL with all of the frame. */ + + memcpy(&txp->ethmacdst[0], mtod(m, const void *), 16); +} + static int -try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available) +add_to_txpkts_vf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m, + int avail, bool *send) { - u_int needed, nsegs1, nsegs2, l1, l2; + struct txpkts *txp = &txq->txp; - if (cannot_use_txpkts(m) || cannot_use_txpkts(n)) - return (1); + MPASS(sc->flags & IS_VF); - nsegs1 = mbuf_nsegs(m); - nsegs2 = mbuf_nsegs(n); - if (nsegs1 + nsegs2 == 2) { - txp->wr_type = 1; - l1 = l2 = txpkts1_len16(); - } else { - txp->wr_type = 0; - l1 = txpkts0_len16(nsegs1); - l2 = txpkts0_len16(nsegs2); + /* Cannot have TSO and coalesce at the same time. 
*/ + if (cannot_use_txpkts(m)) { +cannot_coalesce: + *send = txp->npkt > 0; + return (EINVAL); } - txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2; - needed = tx_len16_to_desc(txp->len16); - if (needed > SGE_MAX_WR_NDESC || needed > available) - return (1); - txp->plen = m->m_pkthdr.len + n->m_pkthdr.len; - if (txp->plen > 65535) - return (1); + /* VF allows coalescing of type 1 (1 GL) only */ + if (mbuf_nsegs(m) > 1) + goto cannot_coalesce; - txp->npkt = 2; - set_mbuf_len16(m, l1); - set_mbuf_len16(n, l2); + *send = false; + if (txp->npkt > 0) { + MPASS(tx_len16_to_desc(txp->len16) <= avail); + MPASS(txp->npkt < txp->max_npkt); + MPASS(txp->wr_type == 1); /* VF supports type 1 only */ + if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) > avail) { +retry_after_send: + *send = true; + return (EAGAIN); + } + if (m->m_pkthdr.len + txp->plen > 65535) + goto retry_after_send; + if (cmp_l2hdr(txp, m)) + goto retry_after_send; + + txp->len16 += txpkts1_len16(); + txp->plen += m->m_pkthdr.len; + txp->mb[txp->npkt++] = m; + if (txp->npkt == txp->max_npkt) + *send = true; + } else { + txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_vm_wr), 16) + + txpkts1_len16(); + if (tx_len16_to_desc(txp->len16) > avail) + goto cannot_coalesce; + txp->npkt = 1; + txp->wr_type = 1; + txp->plen = m->m_pkthdr.len; + txp->mb[0] = m; + save_l2hdr(txp, m); + } return (0); } static int -add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available) +add_to_txpkts_pf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m, + int avail, bool *send) { - u_int plen, len16, needed, nsegs; + struct txpkts *txp = &txq->txp; + int nsegs; - MPASS(txp->wr_type == 0 || txp->wr_type == 1); + MPASS(!(sc->flags & IS_VF)); - if (cannot_use_txpkts(m)) - return (1); + /* Cannot have TSO and coalesce at the same time. 
*/ + if (cannot_use_txpkts(m)) { +cannot_coalesce: + *send = txp->npkt > 0; + return (EINVAL); + } + *send = false; nsegs = mbuf_nsegs(m); - if (txp->wr_type == 1 && nsegs != 1) - return (1); + if (txp->npkt == 0) { + if (m->m_pkthdr.len > 65535) + goto cannot_coalesce; + if (nsegs > 1) { + txp->wr_type = 0; + txp->len16 = + howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + + txpkts0_len16(nsegs); + } else { + txp->wr_type = 1; + txp->len16 = + howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + + txpkts1_len16(); + } + if (tx_len16_to_desc(txp->len16) > avail) + goto cannot_coalesce; + txp->npkt = 1; + txp->plen = m->m_pkthdr.len; + txp->mb[0] = m; + } else { + MPASS(tx_len16_to_desc(txp->len16) <= avail); + MPASS(txp->npkt < txp->max_npkt); - plen = txp->plen + m->m_pkthdr.len; - if (plen > 65535) - return (1); + if (m->m_pkthdr.len + txp->plen > 65535) { +retry_after_send: + *send = true; + return (EAGAIN); + } - if (txp->wr_type == 0) - len16 = txpkts0_len16(nsegs); - else - len16 = txpkts1_len16(); - needed = tx_len16_to_desc(txp->len16 + len16); - if (needed > SGE_MAX_WR_NDESC || needed > available) - return (1); + MPASS(txp->wr_type == 0 || txp->wr_type == 1); + if (txp->wr_type == 0) { + if (tx_len16_to_desc(txp->len16 + + txpkts0_len16(nsegs)) > min(avail, SGE_MAX_WR_NDESC)) + goto retry_after_send; + txp->len16 += txpkts0_len16(nsegs); + } else { + if (nsegs != 1) + goto retry_after_send; + if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) > + avail) + goto retry_after_send; + txp->len16 += txpkts1_len16(); + } - txp->npkt++; - txp->plen = plen; - txp->len16 += len16; - set_mbuf_len16(m, len16); - + txp->plen += m->m_pkthdr.len; + txp->mb[txp->npkt++] = m; + if (txp->npkt == txp->max_npkt) + *send = true; + } return (0); } @@ -5003,34 +5134,25 @@ * The return value is the # of hardware descriptors used. */ static u_int -write_txpkts_wr(struct adapter *sc, struct sge_txq *txq, - struct fw_eth_tx_pkts_wr *wr, struct mbuf *m0, const struct txpkts *txp, - u_int available) +write_txpkts_wr(struct adapter *sc, struct sge_txq *txq) { + const struct txpkts *txp = &txq->txp; struct sge_eq *eq = &txq->eq; + struct fw_eth_tx_pkts_wr *wr; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; - uint32_t ctrl; uint64_t ctrl1; - int ndesc, checkwrap; - struct mbuf *m; + int ndesc, i, checkwrap; + struct mbuf *m, *last; void *flitp; TXQ_LOCK_ASSERT_OWNED(txq); MPASS(txp->npkt > 0); - MPASS(txp->plen < 65536); - MPASS(m0 != NULL); - MPASS(m0->m_nextpkt != NULL); MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16)); - MPASS(available > 0 && available < eq->sidx); - ndesc = tx_len16_to_desc(txp->len16); - MPASS(ndesc <= available); - - MPASS(wr == (void *)&eq->desc[eq->pidx]); + wr = (void *)&eq->desc[eq->pidx]; wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)); - ctrl = V_FW_WR_LEN16(txp->len16); - wr->equiq_to_len16 = htobe32(ctrl); + wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16)); wr->plen = htobe16(txp->plen); wr->npkt = txp->npkt; wr->r3 = 0; @@ -5042,8 +5164,11 @@ * set then we know the WR is going to wrap around somewhere. We'll * check for that at appropriate points. 
*/ + ndesc = tx_len16_to_desc(txp->len16); + last = NULL; checkwrap = eq->sidx - ndesc < eq->pidx; - for (m = m0; m != NULL; m = m->m_nextpkt) { + for (i = 0; i < txp->npkt; i++) { + m = txp->mb[i]; if (txp->wr_type == 0) { struct ulp_txpkt *ulpmc; struct ulptx_idata *ulpsc; @@ -5052,7 +5177,7 @@ ulpmc = flitp; ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid)); - ulpmc->len = htobe32(mbuf_len16(m)); + ulpmc->len = htobe32(txpkts0_len16(mbuf_nsegs(m))); /* ULP subcommand */ ulpsc = (void *)(ulpmc + 1); @@ -5093,8 +5218,12 @@ write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap); + if (last != NULL) + last->m_nextpkt = m; + last = m; } + txq->sgl_wrs++; if (txp->wr_type == 0) { txq->txpkts0_pkts += txp->npkt; txq->txpkts0_wrs++; @@ -5104,12 +5233,92 @@ } txsd = &txq->sdesc[eq->pidx]; - txsd->m = m0; + txsd->m = txp->mb[0]; txsd->desc_used = ndesc; return (ndesc); } +static u_int +write_txpkts_vm_wr(struct adapter *sc, struct sge_txq *txq) +{ + const struct txpkts *txp = &txq->txp; + struct sge_eq *eq = &txq->eq; + struct fw_eth_tx_pkts_vm_wr *wr; + struct tx_sdesc *txsd; + struct cpl_tx_pkt_core *cpl; + uint64_t ctrl1; + int ndesc, i; + struct mbuf *m, *last; + void *flitp; + + TXQ_LOCK_ASSERT_OWNED(txq); + MPASS(txp->npkt > 0); + MPASS(txp->wr_type == 1); /* VF supports type 1 only */ + MPASS(txp->mb[0] != NULL); + MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16)); + + wr = (void *)&eq->desc[eq->pidx]; + wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_VM_WR)); + wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16)); + wr->r3 = 0; + wr->plen = htobe16(txp->plen); + wr->npkt = txp->npkt; + wr->r4 = 0; + memcpy(&wr->ethmacdst[0], &txp->ethmacdst[0], 16); + flitp = wr + 1; + + /* + * At this point we are 32B into a hardware descriptor. Each mbuf in + * the WR will take 32B so we check for the end of the descriptor ring + * before writing odd mbufs (mb[1], 3, 5, ..) + */ + ndesc = tx_len16_to_desc(txp->len16); + last = NULL; + for (i = 0; i < txp->npkt; i++) { + m = txp->mb[i]; + if (i & 1 && (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx]) + flitp = &eq->desc[0]; + cpl = flitp; + + /* Checksum offload */ + ctrl1 = csum_to_ctrl(sc, m); + if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) + txq->txcsum++; /* some hardware assistance provided */ + + /* VLAN tag insertion */ + if (needs_vlan_insertion(m)) { + ctrl1 |= F_TXPKT_VLAN_VLD | + V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); + txq->vlan_insertion++; + } + + /* CPL header */ + cpl->ctrl0 = txq->cpl_ctrl0; + cpl->pack = 0; + cpl->len = htobe16(m->m_pkthdr.len); + cpl->ctrl1 = htobe64(ctrl1); + + flitp = cpl + 1; + MPASS(mbuf_nsegs(m) == 1); + write_gl_to_txd(txq, m, (caddr_t *)(&flitp), 0); + + if (last != NULL) + last->m_nextpkt = m; + last = m; + } + + txq->sgl_wrs++; + txq->txpkts1_pkts += txp->npkt; + txq->txpkts1_wrs++; + + txsd = &txq->sdesc[eq->pidx]; + txsd->m = txp->mb[0]; + txsd->desc_used = ndesc; + + return (ndesc); +} + /* * If the SGL ends on an address that is not 16 byte aligned, this function will * add a 0 filled flit at the end. 
@@ -5444,8 +5653,10 @@ MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH); atomic_readandclear_int(&eq->equiq); - mp_ring_check_drainage(txq->r, 0); - taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task); + if (mp_ring_is_idle(txq->r)) + taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task); + else + mp_ring_check_drainage(txq->r, 64); } static int Index: sys/dev/cxgbe/t4_vf.c =================================================================== --- sys/dev/cxgbe/t4_vf.c +++ sys/dev/cxgbe/t4_vf.c @@ -231,6 +231,7 @@ get_params__post_init(struct adapter *sc) { int rc; + uint32_t param, val; rc = -t4vf_get_sge_params(sc); if (rc != 0) { @@ -281,6 +282,13 @@ return (EINVAL); } sc->params.portvec = sc->params.vfres.pmask; + + param = FW_PARAM_PFVF(MAX_PKTS_PER_ETH_TX_PKTS_WR); + rc = -t4vf_query_params(sc, 1, &param, &val); + if (rc == 0) + sc->params.max_pkts_per_eth_tx_pkts_wr = val; + else + sc->params.max_pkts_per_eth_tx_pkts_wr = 14; return (0); }
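
Reviewer note (not part of the patch): the reworked mp_ring keeps all of its producer/consumer bookkeeping in a single 64-bit state word, so a producer can reserve ring slots and, when appropriate, elect itself consumer with one atomic compare-and-swap. The sketch below is a minimal user-space model of that idea for readers following drain_ring() and mp_ring_enqueue(); it is not driver code. It reuses names from the patch (union ring_state, the IDLE/COALESCING/BUSY/TOO_BUSY/TAKING_OVER/STALLED flags, the C_FAST/C_2/C_3/C_TAKEOVER consumer counters) but omits the atomic fcmpset loops, the consumer lock, and the deferred C_3 decision that the real enqueue path makes when it publishes pidx_tail. NOT_CONSUMER and the exact ordering of the fields after pidx_head are assumptions made only for this illustration.

/* Build with: cc -std=c11 -o ring_model ring_model.c (hypothetical file name). */
#include <stdint.h>
#include <stdio.h>

enum { IDLE = 0, COALESCING, BUSY, TOO_BUSY, TAKING_OVER, STALLED };
enum { C_FAST = 0, C_2, C_3, C_TAKEOVER, NOT_CONSUMER };

union ring_state {
	struct {
		uint16_t pidx_head;	/* next slot a producer will reserve */
		uint16_t pidx_tail;	/* slots before this are fully written */
		uint16_t cidx;		/* consumer index */
		uint16_t flags;		/* one of IDLE..STALLED above */
	};
	uint64_t state;			/* the driver CASes this as one word */
};

/*
 * Role a producer takes right after reserving room, mirroring the first CAS
 * loop in mp_ring_enqueue(): an IDLE/COALESCING ring makes this producer the
 * consumer (C_FAST if no other producer is mid-enqueue, C_2 otherwise), and a
 * TOO_BUSY consumer is relieved via TAKING_OVER.  For BUSY or STALLED the real
 * code defers the decision until pidx_tail is published (the producer may
 * still become consumer C_3); this model simply reports NOT_CONSUMER.
 */
static int
producer_role(union ring_state os)
{
	if (os.flags == IDLE || os.flags == COALESCING)
		return (os.pidx_head == os.pidx_tail ? C_FAST : C_2);
	if (os.flags == TOO_BUSY)
		return (C_TAKEOVER);
	return (NOT_CONSUMER);
}

int
main(void)
{
	union ring_state s = { .state = 0 };	/* empty ring, flags == IDLE */

	printf("idle ring         -> %d (expect %d, C_FAST)\n",
	    producer_role(s), C_FAST);
	s.flags = TOO_BUSY;
	printf("too-busy consumer -> %d (expect %d, C_TAKEOVER)\n",
	    producer_role(s), C_TAKEOVER);
	return (0);
}

The same packing is what lets the consumer update cidx and flags together in drain_ring(), so it can hand the ring over (TAKING_OVER back to BUSY) or park it (IDLE, COALESCING, or STALLED) in a single atomic step while holding only the consumer lock.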