D25454.diff

Index: head/sys/dev/cxgbe/adapter.h
===================================================================
--- head/sys/dev/cxgbe/adapter.h
+++ head/sys/dev/cxgbe/adapter.h
@@ -550,6 +550,23 @@
struct mp_ring;
+struct txpkts {
+ uint8_t wr_type; /* type 0 or type 1 */
+ uint8_t npkt; /* # of packets in this work request */
+ uint8_t len16; /* # of 16B pieces used by this work request */
+ uint8_t score; /* 1-10. coalescing attempted if score > 3 */
+ uint8_t max_npkt; /* maximum number of packets allowed */
+ uint16_t plen; /* total payload (sum of all packets) */
+
+ /* straight from fw_eth_tx_pkts_vm_wr. */
+ __u8 ethmacdst[6];
+ __u8 ethmacsrc[6];
+ __be16 ethtype;
+ __be16 vlantci;
+
+ struct mbuf *mb[15];
+};
+
/* txq: SGE egress queue + what's needed for Ethernet NIC */
struct sge_txq {
struct sge_eq eq; /* MUST be first */
@@ -560,6 +577,7 @@
struct sglist *gl;
__be32 cpl_ctrl0; /* for convenience */
int tc_idx; /* traffic class */
+ struct txpkts txp;
struct task tx_reclaim_task;
/* stats for common events first */
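
The new struct txpkts above is a per-txq accumulator: packets are parked in mb[] until they are flushed as a single coalesced work request, bounded by max_npkt (at most the 15 slots in mb[]) and by the 16-bit payload total plen. A minimal userland sketch of those two limits follows; txp_try_add() and main() are illustrative only, not driver code.

/*
 * Minimal userland sketch of the txpkts accumulator added to struct sge_txq.
 * The field names mirror the diff (npkt, max_npkt, plen, mb[]); txp_try_add()
 * and main() are illustrative only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct pkt { uint16_t len; };

struct txpkts_sketch {
	uint8_t npkt;		/* packets currently held */
	uint8_t max_npkt;	/* from the firmware, capped at nitems(mb) */
	uint16_t plen;		/* total payload of the held packets */
	struct pkt *mb[15];	/* held packets, flushed as one work request */
};

/* Hold another packet if the work request limits allow it. */
static bool
txp_try_add(struct txpkts_sketch *txp, struct pkt *m)
{
	if (txp->npkt == txp->max_npkt)
		return (false);			/* mb[] is full */
	if ((uint32_t)txp->plen + m->len > 65535)
		return (false);			/* plen is only 16 bits wide */
	txp->mb[txp->npkt++] = m;
	txp->plen += m->len;
	return (true);
}

int
main(void)
{
	struct txpkts_sketch txp = { .max_npkt = 15 };
	struct pkt a = { .len = 1500 }, b = { .len = 9000 };

	printf("held a: %d\n", txp_try_add(&txp, &a));
	printf("held b: %d (npkt=%d, plen=%d)\n", txp_try_add(&txp, &b),
	    txp.npkt, txp.plen);
	return (0);
}
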
Index: head/sys/dev/cxgbe/common/common.h
===================================================================
--- head/sys/dev/cxgbe/common/common.h
+++ head/sys/dev/cxgbe/common/common.h
@@ -389,6 +389,7 @@
bool ulptx_memwrite_dsgl; /* use of T5 DSGL allowed */
bool fr_nsmr_tpte_wr_support; /* FW support for FR_NSMR_TPTE_WR */
bool viid_smt_extn_support; /* FW returns vin, vfvld & smt index? */
+ unsigned int max_pkts_per_eth_tx_pkts_wr;
};
#define CHELSIO_T4 0x4
Index: head/sys/dev/cxgbe/t4_main.c
===================================================================
--- head/sys/dev/cxgbe/t4_main.c
+++ head/sys/dev/cxgbe/t4_main.c
@@ -2191,7 +2191,7 @@
vi->rsrv_noflowq);
items[0] = m;
- rc = mp_ring_enqueue(txq->r, items, 1, 4096);
+ rc = mp_ring_enqueue(txq->r, items, 1, 256);
if (__predict_false(rc != 0))
m_freem(m);
@@ -2212,7 +2212,7 @@
txq->eq.flags |= EQ_QFLUSH;
TXQ_UNLOCK(txq);
while (!mp_ring_is_idle(txq->r)) {
- mp_ring_check_drainage(txq->r, 0);
+ mp_ring_check_drainage(txq->r, 4096);
pause("qflush", 1);
}
TXQ_LOCK(txq);
@@ -2261,7 +2261,7 @@
struct sge_txq *txq;
for_each_txq(vi, i, txq)
- drops += counter_u64_fetch(txq->r->drops);
+ drops += counter_u64_fetch(txq->r->dropped);
}
return (drops);
@@ -2326,7 +2326,7 @@
struct sge_txq *txq;
for_each_txq(vi, i, txq)
- drops += counter_u64_fetch(txq->r->drops);
+ drops += counter_u64_fetch(txq->r->dropped);
}
return (drops);
@@ -4457,6 +4457,13 @@
else
sc->params.fr_nsmr_tpte_wr_support = false;
+ param[0] = FW_PARAM_PFVF(MAX_PKTS_PER_ETH_TX_PKTS_WR);
+ rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
+ if (rc == 0)
+ sc->params.max_pkts_per_eth_tx_pkts_wr = val[0];
+ else
+ sc->params.max_pkts_per_eth_tx_pkts_wr = 15;
+
/* get capabilites */
bzero(&caps, sizeof(caps));
caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
@@ -5965,7 +5972,7 @@
/* Wait for the mp_ring to empty. */
while (!mp_ring_is_idle(txq->r)) {
- mp_ring_check_drainage(txq->r, 0);
+ mp_ring_check_drainage(txq->r, 4096);
pause("rquiesce", 1);
}
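
Both the PF attach path above and the VF path at the end of this diff query FW_PARAM_PFVF(MAX_PKTS_PER_ETH_TX_PKTS_WR) and fall back to a fixed limit (15 for the PF, 14 for the VF) when the firmware does not answer. A small sketch of that query-with-fallback pattern; fw_query_param() is a stand-in for t4_query_params()/t4vf_query_params() and the mocked failure is an assumption.

/*
 * Sketch of the query-with-fallback pattern used for
 * MAX_PKTS_PER_ETH_TX_PKTS_WR.  fw_query_param() is a stand-in for the real
 * firmware query; here it simply pretends the firmware is too old.
 */
#include <stdint.h>
#include <stdio.h>

static int
fw_query_param(uint32_t param, uint32_t *val)
{
	(void)param;
	(void)val;
	return (-1);	/* older firmware: the query fails */
}

int
main(void)
{
	uint32_t max_pkts, val;

	if (fw_query_param(0 /* MAX_PKTS_PER_ETH_TX_PKTS_WR */, &val) == 0)
		max_pkts = val;
	else
		max_pkts = 15;	/* PF default; the VF path uses 14 */
	printf("max_pkts_per_eth_tx_pkts_wr = %u\n", max_pkts);
	return (0);
}
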
Index: head/sys/dev/cxgbe/t4_mp_ring.h
===================================================================
--- head/sys/dev/cxgbe/t4_mp_ring.h
+++ head/sys/dev/cxgbe/t4_mp_ring.h
@@ -36,33 +36,38 @@
#endif
struct mp_ring;
-typedef u_int (*ring_drain_t)(struct mp_ring *, u_int, u_int);
+typedef u_int (*ring_drain_t)(struct mp_ring *, u_int, u_int, bool *);
typedef u_int (*ring_can_drain_t)(struct mp_ring *);
struct mp_ring {
volatile uint64_t state __aligned(CACHE_LINE_SIZE);
+ struct malloc_type * mt;
int size __aligned(CACHE_LINE_SIZE);
void * cookie;
- struct malloc_type * mt;
ring_drain_t drain;
ring_can_drain_t can_drain; /* cheap, may be unreliable */
- counter_u64_t enqueues;
- counter_u64_t drops;
- counter_u64_t starts;
- counter_u64_t stalls;
- counter_u64_t restarts; /* recovered after stalling */
+ struct mtx * cons_lock;
+ counter_u64_t dropped;
+ counter_u64_t consumer[4];
+ counter_u64_t not_consumer;
counter_u64_t abdications;
+ counter_u64_t consumed;
+ counter_u64_t cons_idle;
+ counter_u64_t cons_idle2;
+ counter_u64_t stalls;
void * volatile items[] __aligned(CACHE_LINE_SIZE);
};
int mp_ring_alloc(struct mp_ring **, int, void *, ring_drain_t,
- ring_can_drain_t, struct malloc_type *, int);
+ ring_can_drain_t, struct malloc_type *, struct mtx *, int);
void mp_ring_free(struct mp_ring *);
int mp_ring_enqueue(struct mp_ring *, void **, int, int);
void mp_ring_check_drainage(struct mp_ring *, int);
void mp_ring_reset_stats(struct mp_ring *);
-int mp_ring_is_idle(struct mp_ring *);
+bool mp_ring_is_idle(struct mp_ring *);
+void mp_ring_sysctls(struct mp_ring *, struct sysctl_ctx_list *,
+ struct sysctl_oid_list *);
#endif
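
The ring's whole state (producer head and tail, consumer index, flags) still lives in a single 64-bit word that is read and updated with compare-and-swap. Below is a userland model of that layout using C11 stdatomic in place of the kernel's atomic_fcmpset_64(); the modulo wraparound in reserve() is a simplification of increment_idx().

/*
 * Userland model of the mp_ring state word: four 16-bit fields packed into
 * one 64-bit value and updated atomically.  The field layout mirrors the
 * diff's union ring_state; everything else is illustrative.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

union ring_state {
	struct {
		uint16_t pidx_head;	/* producer reservation head */
		uint16_t pidx_tail;	/* items up to here are visible */
		uint16_t cidx;		/* consumer index */
		uint16_t flags;		/* IDLE, BUSY, ... */
	};
	uint64_t state;
};

static _Atomic uint64_t ring_state_word;

/* Reserve n slots by advancing pidx_head with a CAS loop. */
static uint16_t
reserve(uint16_t n, uint16_t ring_size)
{
	union ring_state os, ns;

	os.state = atomic_load(&ring_state_word);
	do {
		ns.state = os.state;
		ns.pidx_head = (uint16_t)((os.pidx_head + n) % ring_size);
	} while (!atomic_compare_exchange_weak(&ring_state_word, &os.state,
	    ns.state));
	return (os.pidx_head);	/* start of our reservation */
}

int
main(void)
{
	printf("reserved at %d\n", reserve(4, 1024));
	printf("reserved at %d\n", reserve(4, 1024));
	return (0);
}
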
Index: head/sys/dev/cxgbe/t4_mp_ring.c
===================================================================
--- head/sys/dev/cxgbe/t4_mp_ring.c
+++ head/sys/dev/cxgbe/t4_mp_ring.c
@@ -34,6 +34,8 @@
#include <sys/counter.h>
#include <sys/lock.h>
#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sysctl.h>
#include <machine/cpu.h>
#include "t4_mp_ring.h"
@@ -43,6 +45,23 @@
#define atomic_cmpset_rel_64 atomic_cmpset_64
#endif
+/*
+ * mp_ring handles multiple threads (producers) enqueueing data to a tx queue.
+ * The thread that is writing the hardware descriptors is the consumer and it
+ * runs with the consumer lock held. A producer becomes the consumer if there
+ * isn't one already. The consumer runs with the flags sets to BUSY and
+ * consumes everything (IDLE or COALESCING) or gets STALLED. If it is running
+ * over its budget it sets flags to TOO_BUSY. A producer that observes a
+ * TOO_BUSY consumer will become the new consumer by setting flags to
+ * TAKING_OVER. The original consumer stops and sets the flags back to BUSY for
+ * the new consumer.
+ *
+ * COALESCING is the same as IDLE except there are items being held in the hope
+ * that they can be coalesced with items that follow. The driver must arrange
+ * for a tx update or some other event that transmits all the held items in a
+ * timely manner if nothing else is enqueued.
+ */
+
union ring_state {
struct {
uint16_t pidx_head;
@@ -54,13 +73,21 @@
};
enum {
- IDLE = 0, /* consumer ran to completion, nothing more to do. */
+ IDLE = 0, /* tx is all caught up, nothing to do. */
+ COALESCING, /* IDLE, but tx frames are being held for coalescing */
BUSY, /* consumer is running already, or will be shortly. */
+ TOO_BUSY, /* consumer is running and is beyond its budget */
+ TAKING_OVER, /* new consumer taking over from a TOO_BUSY consumer */
STALLED, /* consumer stopped due to lack of resources. */
- ABDICATED, /* consumer stopped even though there was work to be
- done because it wants another thread to take over. */
};
+enum {
+ C_FAST = 0,
+ C_2,
+ C_3,
+ C_TAKEOVER,
+};
+
static inline uint16_t
space_available(struct mp_ring *r, union ring_state s)
{
@@ -83,93 +110,104 @@
return (x > n ? idx + n : n - x);
}
-/* Consumer is about to update the ring's state to s */
-static inline uint16_t
-state_to_flags(union ring_state s, int abdicate)
-{
-
- if (s.cidx == s.pidx_tail)
- return (IDLE);
- else if (abdicate && s.pidx_tail != s.pidx_head)
- return (ABDICATED);
-
- return (BUSY);
-}
-
/*
- * Caller passes in a state, with a guarantee that there is work to do and that
- * all items up to the pidx_tail in the state are visible.
+ * Consumer. Called with the consumer lock held and a guarantee that there is
+ * work to do.
*/
static void
-drain_ring(struct mp_ring *r, union ring_state os, uint16_t prev, int budget)
+drain_ring(struct mp_ring *r, int budget)
{
- union ring_state ns;
+ union ring_state os, ns;
int n, pending, total;
- uint16_t cidx = os.cidx;
- uint16_t pidx = os.pidx_tail;
+ uint16_t cidx;
+ uint16_t pidx;
+ bool coalescing;
+ mtx_assert(r->cons_lock, MA_OWNED);
+
+ os.state = atomic_load_acq_64(&r->state);
MPASS(os.flags == BUSY);
+
+ cidx = os.cidx;
+ pidx = os.pidx_tail;
MPASS(cidx != pidx);
- if (prev == IDLE)
- counter_u64_add(r->starts, 1);
pending = 0;
total = 0;
while (cidx != pidx) {
/* Items from cidx to pidx are available for consumption. */
- n = r->drain(r, cidx, pidx);
+ n = r->drain(r, cidx, pidx, &coalescing);
if (n == 0) {
critical_enter();
- os.state = r->state;
+ os.state = atomic_load_64(&r->state);
do {
ns.state = os.state;
ns.cidx = cidx;
- ns.flags = STALLED;
+
+ MPASS(os.flags == BUSY ||
+ os.flags == TOO_BUSY ||
+ os.flags == TAKING_OVER);
+
+ if (os.flags == TAKING_OVER)
+ ns.flags = BUSY;
+ else
+ ns.flags = STALLED;
} while (atomic_fcmpset_64(&r->state, &os.state,
ns.state) == 0);
critical_exit();
- if (prev != STALLED)
+ if (os.flags == TAKING_OVER)
+ counter_u64_add(r->abdications, 1);
+ else if (ns.flags == STALLED)
counter_u64_add(r->stalls, 1);
- else if (total > 0) {
- counter_u64_add(r->restarts, 1);
- counter_u64_add(r->stalls, 1);
- }
break;
}
cidx = increment_idx(r, cidx, n);
pending += n;
total += n;
+ counter_u64_add(r->consumed, n);
- /*
- * We update the cidx only if we've caught up with the pidx, the
- * real cidx is getting too far ahead of the one visible to
- * everyone else, or we have exceeded our budget.
- */
- if (cidx != pidx && pending < 64 && total < budget)
- continue;
- critical_enter();
- os.state = r->state;
+ os.state = atomic_load_64(&r->state);
do {
+ MPASS(os.flags == BUSY || os.flags == TOO_BUSY ||
+ os.flags == TAKING_OVER);
+
ns.state = os.state;
ns.cidx = cidx;
- ns.flags = state_to_flags(ns, total >= budget);
+ if (__predict_false(os.flags == TAKING_OVER)) {
+ MPASS(total >= budget);
+ ns.flags = BUSY;
+ continue;
+ }
+ if (cidx == os.pidx_tail) {
+ ns.flags = coalescing ? COALESCING : IDLE;
+ continue;
+ }
+ if (total >= budget) {
+ ns.flags = TOO_BUSY;
+ continue;
+ }
+ MPASS(os.flags == BUSY);
+ if (pending < 32)
+ break;
} while (atomic_fcmpset_acq_64(&r->state, &os.state, ns.state) == 0);
- critical_exit();
- if (ns.flags == ABDICATED)
+ if (__predict_false(os.flags == TAKING_OVER)) {
+ MPASS(ns.flags == BUSY);
counter_u64_add(r->abdications, 1);
- if (ns.flags != BUSY) {
- /* Wrong loop exit if we're going to stall. */
- MPASS(ns.flags != STALLED);
- if (prev == STALLED) {
- MPASS(total > 0);
- counter_u64_add(r->restarts, 1);
- }
break;
}
+ if (ns.flags == IDLE || ns.flags == COALESCING) {
+ MPASS(ns.pidx_tail == cidx);
+ if (ns.pidx_head != ns.pidx_tail)
+ counter_u64_add(r->cons_idle2, 1);
+ else
+ counter_u64_add(r->cons_idle, 1);
+ break;
+ }
+
/*
* The acquire style atomic above guarantees visibility of items
* associated with any pidx change that we notice here.
@@ -177,13 +215,55 @@
pidx = ns.pidx_tail;
pending = 0;
}
+
+#ifdef INVARIANTS
+ if (os.flags == TAKING_OVER)
+ MPASS(ns.flags == BUSY);
+ else {
+ MPASS(ns.flags == IDLE || ns.flags == COALESCING ||
+ ns.flags == STALLED);
+ }
+#endif
}
+static void
+drain_txpkts(struct mp_ring *r, union ring_state os, int budget)
+{
+ union ring_state ns;
+ uint16_t cidx = os.cidx;
+ uint16_t pidx = os.pidx_tail;
+ bool coalescing;
+
+ mtx_assert(r->cons_lock, MA_OWNED);
+ MPASS(os.flags == BUSY);
+ MPASS(cidx == pidx);
+
+ r->drain(r, cidx, pidx, &coalescing);
+ MPASS(coalescing == false);
+ critical_enter();
+ os.state = atomic_load_64(&r->state);
+ do {
+ ns.state = os.state;
+ MPASS(os.flags == BUSY);
+ MPASS(os.cidx == cidx);
+ if (ns.cidx == ns.pidx_tail)
+ ns.flags = IDLE;
+ else
+ ns.flags = BUSY;
+ } while (atomic_fcmpset_acq_64(&r->state, &os.state, ns.state) == 0);
+ critical_exit();
+
+ if (ns.flags == BUSY)
+ drain_ring(r, budget);
+}
+
int
mp_ring_alloc(struct mp_ring **pr, int size, void *cookie, ring_drain_t drain,
- ring_can_drain_t can_drain, struct malloc_type *mt, int flags)
+ ring_can_drain_t can_drain, struct malloc_type *mt, struct mtx *lck,
+ int flags)
{
struct mp_ring *r;
+ int i;
/* All idx are 16b so size can be 65536 at most */
if (pr == NULL || size < 2 || size > 65536 || drain == NULL ||
@@ -201,43 +281,59 @@
r->mt = mt;
r->drain = drain;
r->can_drain = can_drain;
- r->enqueues = counter_u64_alloc(flags);
- r->drops = counter_u64_alloc(flags);
- r->starts = counter_u64_alloc(flags);
- r->stalls = counter_u64_alloc(flags);
- r->restarts = counter_u64_alloc(flags);
- r->abdications = counter_u64_alloc(flags);
- if (r->enqueues == NULL || r->drops == NULL || r->starts == NULL ||
- r->stalls == NULL || r->restarts == NULL ||
- r->abdications == NULL) {
- mp_ring_free(r);
- return (ENOMEM);
+ r->cons_lock = lck;
+ if ((r->dropped = counter_u64_alloc(flags)) == NULL)
+ goto failed;
+ for (i = 0; i < nitems(r->consumer); i++) {
+ if ((r->consumer[i] = counter_u64_alloc(flags)) == NULL)
+ goto failed;
}
-
+ if ((r->not_consumer = counter_u64_alloc(flags)) == NULL)
+ goto failed;
+ if ((r->abdications = counter_u64_alloc(flags)) == NULL)
+ goto failed;
+ if ((r->stalls = counter_u64_alloc(flags)) == NULL)
+ goto failed;
+ if ((r->consumed = counter_u64_alloc(flags)) == NULL)
+ goto failed;
+ if ((r->cons_idle = counter_u64_alloc(flags)) == NULL)
+ goto failed;
+ if ((r->cons_idle2 = counter_u64_alloc(flags)) == NULL)
+ goto failed;
*pr = r;
return (0);
+failed:
+ mp_ring_free(r);
+ return (ENOMEM);
}
void
mp_ring_free(struct mp_ring *r)
{
+ int i;
if (r == NULL)
return;
- if (r->enqueues != NULL)
- counter_u64_free(r->enqueues);
- if (r->drops != NULL)
- counter_u64_free(r->drops);
- if (r->starts != NULL)
- counter_u64_free(r->starts);
- if (r->stalls != NULL)
- counter_u64_free(r->stalls);
- if (r->restarts != NULL)
- counter_u64_free(r->restarts);
+ if (r->dropped != NULL)
+ counter_u64_free(r->dropped);
+ for (i = 0; i < nitems(r->consumer); i++) {
+ if (r->consumer[i] != NULL)
+ counter_u64_free(r->consumer[i]);
+ }
+ if (r->not_consumer != NULL)
+ counter_u64_free(r->not_consumer);
if (r->abdications != NULL)
counter_u64_free(r->abdications);
+ if (r->stalls != NULL)
+ counter_u64_free(r->stalls);
+ if (r->consumed != NULL)
+ counter_u64_free(r->consumed);
+ if (r->cons_idle != NULL)
+ counter_u64_free(r->cons_idle);
+ if (r->cons_idle2 != NULL)
+ counter_u64_free(r->cons_idle2);
free(r, r->mt);
}
@@ -252,7 +348,8 @@
{
union ring_state os, ns;
uint16_t pidx_start, pidx_stop;
- int i;
+ int i, nospc, cons;
+ bool consumer;
MPASS(items != NULL);
MPASS(n > 0);
@@ -261,26 +358,70 @@
* Reserve room for the new items. Our reservation, if successful, is
* from 'pidx_start' to 'pidx_stop'.
*/
- os.state = r->state;
+ nospc = 0;
+ os.state = atomic_load_64(&r->state);
for (;;) {
- if (n >= space_available(r, os)) {
- counter_u64_add(r->drops, n);
+ for (;;) {
+ if (__predict_true(space_available(r, os) >= n))
+ break;
+
+ /* Not enough room in the ring. */
+
MPASS(os.flags != IDLE);
+ MPASS(os.flags != COALESCING);
+ if (__predict_false(++nospc > 100)) {
+ counter_u64_add(r->dropped, n);
+ return (ENOBUFS);
+ }
if (os.flags == STALLED)
- mp_ring_check_drainage(r, 0);
- return (ENOBUFS);
+ mp_ring_check_drainage(r, 64);
+ else
+ cpu_spinwait();
+ os.state = atomic_load_64(&r->state);
}
+
+ /* There is room in the ring. */
+
+ cons = -1;
ns.state = os.state;
ns.pidx_head = increment_idx(r, os.pidx_head, n);
+ if (os.flags == IDLE || os.flags == COALESCING) {
+ MPASS(os.pidx_tail == os.cidx);
+ if (os.pidx_head == os.pidx_tail) {
+ cons = C_FAST;
+ ns.pidx_tail = increment_idx(r, os.pidx_tail, n);
+ } else
+ cons = C_2;
+ ns.flags = BUSY;
+ } else if (os.flags == TOO_BUSY) {
+ cons = C_TAKEOVER;
+ ns.flags = TAKING_OVER;
+ }
critical_enter();
if (atomic_fcmpset_64(&r->state, &os.state, ns.state))
break;
critical_exit();
cpu_spinwait();
- }
+ };
+
pidx_start = os.pidx_head;
pidx_stop = ns.pidx_head;
+ if (cons == C_FAST) {
+ i = pidx_start;
+ do {
+ r->items[i] = *items++;
+ if (__predict_false(++i == r->size))
+ i = 0;
+ } while (i != pidx_stop);
+ critical_exit();
+ counter_u64_add(r->consumer[C_FAST], 1);
+ mtx_lock(r->cons_lock);
+ drain_ring(r, budget);
+ mtx_unlock(r->cons_lock);
+ return (0);
+ }
+
/*
* Wait for other producers who got in ahead of us to enqueue their
* items, one producer at a time. It is our turn when the ring's
@@ -288,7 +429,7 @@
*/
while (ns.pidx_tail != pidx_start) {
cpu_spinwait();
- ns.state = r->state;
+ ns.state = atomic_load_64(&r->state);
}
/* Now it is our turn to fill up the area we reserved earlier. */
@@ -303,21 +444,33 @@
* Update the ring's pidx_tail. The release style atomic guarantees
* that the items are visible to any thread that sees the updated pidx.
*/
- os.state = r->state;
+ os.state = atomic_load_64(&r->state);
do {
+ consumer = false;
ns.state = os.state;
ns.pidx_tail = pidx_stop;
- ns.flags = BUSY;
+ if (os.flags == IDLE || os.flags == COALESCING ||
+ (os.flags == STALLED && r->can_drain(r))) {
+ MPASS(cons == -1);
+ consumer = true;
+ ns.flags = BUSY;
+ }
} while (atomic_fcmpset_rel_64(&r->state, &os.state, ns.state) == 0);
critical_exit();
- counter_u64_add(r->enqueues, n);
- /*
- * Turn into a consumer if some other thread isn't active as a consumer
- * already.
- */
- if (os.flags != BUSY)
- drain_ring(r, ns, os.flags, budget);
+ if (cons == -1) {
+ if (consumer)
+ cons = C_3;
+ else {
+ counter_u64_add(r->not_consumer, 1);
+ return (0);
+ }
+ }
+ MPASS(cons > C_FAST && cons < nitems(r->consumer));
+ counter_u64_add(r->consumer[cons], 1);
+ mtx_lock(r->cons_lock);
+ drain_ring(r, budget);
+ mtx_unlock(r->cons_lock);
return (0);
}
@@ -327,46 +480,96 @@
{
union ring_state os, ns;
- os.state = r->state;
- if (os.flags != STALLED || os.pidx_head != os.pidx_tail ||
- r->can_drain(r) == 0)
- return;
-
- MPASS(os.cidx != os.pidx_tail); /* implied by STALLED */
- ns.state = os.state;
- ns.flags = BUSY;
-
- /*
- * The acquire style atomic guarantees visibility of items associated
- * with the pidx that we read here.
- */
- if (!atomic_cmpset_acq_64(&r->state, os.state, ns.state))
- return;
-
- drain_ring(r, ns, os.flags, budget);
+ os.state = atomic_load_64(&r->state);
+ if (os.flags == STALLED && r->can_drain(r)) {
+ MPASS(os.cidx != os.pidx_tail); /* implied by STALLED */
+ ns.state = os.state;
+ ns.flags = BUSY;
+ if (atomic_cmpset_acq_64(&r->state, os.state, ns.state)) {
+ mtx_lock(r->cons_lock);
+ drain_ring(r, budget);
+ mtx_unlock(r->cons_lock);
+ }
+ } else if (os.flags == COALESCING) {
+ MPASS(os.cidx == os.pidx_tail);
+ ns.state = os.state;
+ ns.flags = BUSY;
+ if (atomic_cmpset_acq_64(&r->state, os.state, ns.state)) {
+ mtx_lock(r->cons_lock);
+ drain_txpkts(r, ns, budget);
+ mtx_unlock(r->cons_lock);
+ }
+ }
}
void
mp_ring_reset_stats(struct mp_ring *r)
{
+ int i;
- counter_u64_zero(r->enqueues);
- counter_u64_zero(r->drops);
- counter_u64_zero(r->starts);
- counter_u64_zero(r->stalls);
- counter_u64_zero(r->restarts);
+ counter_u64_zero(r->dropped);
+ for (i = 0; i < nitems(r->consumer); i++)
+ counter_u64_zero(r->consumer[i]);
+ counter_u64_zero(r->not_consumer);
counter_u64_zero(r->abdications);
+ counter_u64_zero(r->stalls);
+ counter_u64_zero(r->consumed);
+ counter_u64_zero(r->cons_idle);
+ counter_u64_zero(r->cons_idle2);
}
-int
+bool
mp_ring_is_idle(struct mp_ring *r)
{
union ring_state s;
- s.state = r->state;
+ s.state = atomic_load_64(&r->state);
if (s.pidx_head == s.pidx_tail && s.pidx_tail == s.cidx &&
s.flags == IDLE)
- return (1);
+ return (true);
- return (0);
+ return (false);
+}
+
+void
+mp_ring_sysctls(struct mp_ring *r, struct sysctl_ctx_list *ctx,
+ struct sysctl_oid_list *children)
+{
+ struct sysctl_oid *oid;
+
+ oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "mp_ring", CTLFLAG_RD |
+ CTLFLAG_MPSAFE, NULL, "mp_ring statistics");
+ children = SYSCTL_CHILDREN(oid);
+
+ SYSCTL_ADD_U64(ctx, children, OID_AUTO, "state", CTLFLAG_RD,
+ __DEVOLATILE(uint64_t *, &r->state), 0, "ring state");
+ SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "dropped", CTLFLAG_RD,
+ &r->dropped, "# of items dropped");
+ SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "consumed",
+ CTLFLAG_RD, &r->consumed, "# of items consumed");
+ SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "fast_consumer",
+ CTLFLAG_RD, &r->consumer[C_FAST],
+ "# of times producer became consumer (fast)");
+ SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "consumer2",
+ CTLFLAG_RD, &r->consumer[C_2],
+ "# of times producer became consumer (2)");
+ SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "consumer3",
+ CTLFLAG_RD, &r->consumer[C_3],
+ "# of times producer became consumer (3)");
+ SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "takeovers",
+ CTLFLAG_RD, &r->consumer[C_TAKEOVER],
+ "# of times producer took over from another consumer.");
+ SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "not_consumer",
+ CTLFLAG_RD, &r->not_consumer,
+ "# of times producer did not become consumer");
+ SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "abdications",
+ CTLFLAG_RD, &r->abdications, "# of consumer abdications");
+ SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "stalls",
+ CTLFLAG_RD, &r->stalls, "# of consumer stalls");
+ SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "cons_idle",
+ CTLFLAG_RD, &r->cons_idle,
+ "# of times consumer ran fully to completion");
+ SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "cons_idle2",
+ CTLFLAG_RD, &r->cons_idle2,
+ "# of times consumer idled when another enqueue was in progress");
}
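
With the rework above, a producer decides during its enqueue CAS whether it also becomes the consumer, and the per-path counters (C_FAST, C_2, C_3, C_TAKEOVER, not_consumer) record which way it went. A compact model of that decision follows; decide() is an illustrative flattening of logic the kernel performs across its two CAS loops on the state word.

/*
 * Model of the producer-side decision in mp_ring_enqueue(): which consumer
 * path, if any, an enqueuing thread takes based on the flags it observed.
 * The enum values and C_* names follow the diff; decide() is a simplification.
 */
#include <stdbool.h>
#include <stdio.h>

enum flags { IDLE, COALESCING, BUSY, TOO_BUSY, TAKING_OVER, STALLED };
enum consumer { C_NONE = -1, C_FAST, C_2, C_3, C_TAKEOVER };

static enum consumer
decide(enum flags observed, bool ring_was_empty, bool can_drain)
{
	switch (observed) {
	case IDLE:
	case COALESCING:
		/* No consumer active: this producer becomes one. */
		return (ring_was_empty ? C_FAST : C_2);
	case TOO_BUSY:
		/* The running consumer is over budget: take over from it. */
		return (C_TAKEOVER);
	case STALLED:
		/* Restart only if resources are available again. */
		return (can_drain ? C_3 : C_NONE);
	default:
		/* BUSY/TAKING_OVER: someone else will drain our items. */
		return (C_NONE);
	}
}

int
main(void)
{
	printf("idle + empty -> %d (C_FAST)\n", decide(IDLE, true, false));
	printf("too busy     -> %d (C_TAKEOVER)\n", decide(TOO_BUSY, false, false));
	printf("busy         -> %d (none)\n", decide(BUSY, false, false));
	return (0);
}
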
Index: head/sys/dev/cxgbe/t4_sge.c
===================================================================
--- head/sys/dev/cxgbe/t4_sge.c
+++ head/sys/dev/cxgbe/t4_sge.c
@@ -203,19 +203,6 @@
SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0,
"Enable presorting of LRO frames");
-struct txpkts {
- u_int wr_type; /* type 0 or type 1 */
- u_int npkt; /* # of packets in this work request */
- u_int plen; /* total payload (sum of all packets) */
- u_int len16; /* # of 16B pieces used by this work request */
-};
-
-/* A packet's SGL. This + m_pkthdr has all info needed for tx */
-struct sgl {
- struct sglist sg;
- struct sglist_seg seg[TX_SGL_SEGS];
-};
-
static int service_iq(struct sge_iq *, int);
static int service_iq_fl(struct sge_iq *, int);
static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
@@ -284,14 +271,16 @@
static inline u_int txpkts0_len16(u_int);
static inline u_int txpkts1_len16(void);
static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int);
-static u_int write_txpkt_wr(struct adapter *, struct sge_txq *,
- struct fw_eth_tx_pkt_wr *, struct mbuf *, u_int);
+static u_int write_txpkt_wr(struct adapter *, struct sge_txq *, struct mbuf *,
+ u_int);
static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *,
- struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int);
-static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int);
-static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int);
-static u_int write_txpkts_wr(struct adapter *, struct sge_txq *,
- struct fw_eth_tx_pkts_wr *, struct mbuf *, const struct txpkts *, u_int);
+ struct mbuf *);
+static int add_to_txpkts_vf(struct adapter *, struct sge_txq *, struct mbuf *,
+ int, bool *);
+static int add_to_txpkts_pf(struct adapter *, struct sge_txq *, struct mbuf *,
+ int, bool *);
+static u_int write_txpkts_wr(struct adapter *, struct sge_txq *);
+static u_int write_txpkts_vm_wr(struct adapter *, struct sge_txq *);
static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
@@ -2839,7 +2828,7 @@
return (total_available_tx_desc(eq) > eq->sidx / 8);
}
-static inline int
+static inline bool
cannot_use_txpkts(struct mbuf *m)
{
/* maybe put a GL limit too, to avoid silliness? */
@@ -2855,8 +2844,9 @@
}
static inline int
-wr_can_update_eq(struct fw_eth_tx_pkts_wr *wr)
+wr_can_update_eq(void *p)
{
+ struct fw_eth_tx_pkts_wr *wr = p;
switch (G_FW_WR_OP(be32toh(wr->op_pkd))) {
case FW_ULPTX_WR:
@@ -2864,159 +2854,232 @@
case FW_ETH_TX_PKTS_WR:
case FW_ETH_TX_PKTS2_WR:
case FW_ETH_TX_PKT_VM_WR:
+ case FW_ETH_TX_PKTS_VM_WR:
return (1);
default:
return (0);
}
}
+static inline void
+set_txupdate_flags(struct sge_txq *txq, u_int avail,
+ struct fw_eth_tx_pkt_wr *wr)
+{
+ struct sge_eq *eq = &txq->eq;
+ struct txpkts *txp = &txq->txp;
+
+ if ((txp->npkt > 0 || avail < eq->sidx / 2) &&
+ atomic_cmpset_int(&eq->equiq, 0, 1)) {
+ wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | F_FW_WR_EQUIQ);
+ eq->equeqidx = eq->pidx;
+ } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
+ wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
+ eq->equeqidx = eq->pidx;
+ }
+}
+
/*
* r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
* be consumed. Return the actual number consumed. 0 indicates a stall.
*/
static u_int
-eth_tx(struct mp_ring *r, u_int cidx, u_int pidx)
+eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing)
{
struct sge_txq *txq = r->cookie;
- struct sge_eq *eq = &txq->eq;
struct ifnet *ifp = txq->ifp;
+ struct sge_eq *eq = &txq->eq;
+ struct txpkts *txp = &txq->txp;
struct vi_info *vi = ifp->if_softc;
struct adapter *sc = vi->adapter;
u_int total, remaining; /* # of packets */
- u_int available, dbdiff; /* # of hardware descriptors */
- u_int n, next_cidx;
- struct mbuf *m0, *tail;
- struct txpkts txp;
- struct fw_eth_tx_pkts_wr *wr; /* any fw WR struct will do */
+ u_int n, avail, dbdiff; /* # of hardware descriptors */
+ int i, rc;
+ struct mbuf *m0;
+ bool snd;
+ void *wr; /* start of the last WR written to the ring */
- remaining = IDXDIFF(pidx, cidx, r->size);
- MPASS(remaining > 0); /* Must not be called without work to do. */
- total = 0;
+ TXQ_LOCK_ASSERT_OWNED(txq);
- TXQ_LOCK(txq);
+ remaining = IDXDIFF(pidx, cidx, r->size);
if (__predict_false(discard_tx(eq))) {
+ for (i = 0; i < txp->npkt; i++)
+ m_freem(txp->mb[i]);
+ txp->npkt = 0;
while (cidx != pidx) {
m0 = r->items[cidx];
m_freem(m0);
if (++cidx == r->size)
cidx = 0;
}
- reclaim_tx_descs(txq, 2048);
- total = remaining;
- goto done;
+ reclaim_tx_descs(txq, eq->sidx);
+ *coalescing = false;
+ return (remaining); /* emptied */
}
/* How many hardware descriptors do we have readily available. */
- if (eq->pidx == eq->cidx)
- available = eq->sidx - 1;
- else
- available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
- dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
+ if (eq->pidx == eq->cidx) {
+ avail = eq->sidx - 1;
+ if (txp->score++ >= 5)
+ txp->score = 5; /* tx is completely idle, reset. */
+ } else
+ avail = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
- while (remaining > 0) {
+ total = 0;
+ if (remaining == 0) {
+ if (txp->score-- == 1) /* egr_update had to drain txpkts */
+ txp->score = 1;
+ goto send_txpkts;
+ }
+ dbdiff = 0;
+ MPASS(remaining > 0);
+ while (remaining > 0) {
m0 = r->items[cidx];
M_ASSERTPKTHDR(m0);
MPASS(m0->m_nextpkt == NULL);
- if (available < tx_len16_to_desc(mbuf_len16(m0))) {
- available += reclaim_tx_descs(txq, 64);
- if (available < tx_len16_to_desc(mbuf_len16(m0)))
- break; /* out of descriptors */
+ if (avail < 2 * SGE_MAX_WR_NDESC)
+ avail += reclaim_tx_descs(txq, 64);
+
+ if (txp->npkt > 0 || remaining > 1 || txp->score > 3 ||
+ atomic_load_int(&txq->eq.equiq) != 0) {
+ if (sc->flags & IS_VF)
+ rc = add_to_txpkts_vf(sc, txq, m0, avail, &snd);
+ else
+ rc = add_to_txpkts_pf(sc, txq, m0, avail, &snd);
+ } else {
+ snd = false;
+ rc = EINVAL;
}
+ if (snd) {
+ MPASS(txp->npkt > 0);
+ for (i = 0; i < txp->npkt; i++)
+ ETHER_BPF_MTAP(ifp, txp->mb[i]);
+ if (txp->npkt > 1) {
+ if (txp->score++ >= 10)
+ txp->score = 10;
+ MPASS(avail >= tx_len16_to_desc(txp->len16));
+ if (sc->flags & IS_VF)
+ n = write_txpkts_vm_wr(sc, txq);
+ else
+ n = write_txpkts_wr(sc, txq);
+ } else {
+ MPASS(avail >=
+ tx_len16_to_desc(mbuf_len16(txp->mb[0])));
+ if (sc->flags & IS_VF)
+ n = write_txpkt_vm_wr(sc, txq,
+ txp->mb[0]);
+ else
+ n = write_txpkt_wr(sc, txq, txp->mb[0],
+ avail);
+ }
+ MPASS(n <= SGE_MAX_WR_NDESC);
+ avail -= n;
+ dbdiff += n;
+ wr = &eq->desc[eq->pidx];
+ IDXINCR(eq->pidx, n, eq->sidx);
+ txp->npkt = 0; /* emptied */
+ }
+ if (rc == 0) {
+ /* m0 was coalesced into txq->txpkts. */
+ goto next_mbuf;
+ }
+ if (rc == EAGAIN) {
+ /*
+ * m0 is suitable for tx coalescing but could not be
+ * combined with the existing txq->txpkts, which has now
+ * been transmitted. Start a new txpkts with m0.
+ */
+ MPASS(snd);
+ MPASS(txp->npkt == 0);
+ continue;
+ }
- next_cidx = cidx + 1;
- if (__predict_false(next_cidx == r->size))
- next_cidx = 0;
-
- wr = (void *)&eq->desc[eq->pidx];
+ MPASS(rc != 0 && rc != EAGAIN);
+ MPASS(txp->npkt == 0);
+ wr = &eq->desc[eq->pidx];
if (mbuf_cflags(m0) & MC_RAW_WR) {
- total++;
- remaining--;
- n = write_raw_wr(txq, (void *)wr, m0, available);
+ n = write_raw_wr(txq, wr, m0, avail);
#ifdef KERN_TLS
} else if (mbuf_cflags(m0) & MC_TLS) {
- total++;
- remaining--;
ETHER_BPF_MTAP(ifp, m0);
- n = t6_ktls_write_wr(txq,(void *)wr, m0,
- mbuf_nsegs(m0), available);
+ n = t6_ktls_write_wr(txq, wr, m0, mbuf_nsegs(m0),
+ avail);
#endif
- } else if (sc->flags & IS_VF) {
- total++;
- remaining--;
- ETHER_BPF_MTAP(ifp, m0);
- n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0,
- available);
- } else if (remaining > 1 &&
- try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) {
-
- /* pkts at cidx, next_cidx should both be in txp. */
- MPASS(txp.npkt == 2);
- tail = r->items[next_cidx];
- MPASS(tail->m_nextpkt == NULL);
- ETHER_BPF_MTAP(ifp, m0);
- ETHER_BPF_MTAP(ifp, tail);
- m0->m_nextpkt = tail;
-
- if (__predict_false(++next_cidx == r->size))
- next_cidx = 0;
-
- while (next_cidx != pidx) {
- if (add_to_txpkts(r->items[next_cidx], &txp,
- available) != 0)
- break;
- tail->m_nextpkt = r->items[next_cidx];
- tail = tail->m_nextpkt;
- ETHER_BPF_MTAP(ifp, tail);
- if (__predict_false(++next_cidx == r->size))
- next_cidx = 0;
- }
-
- n = write_txpkts_wr(sc, txq, wr, m0, &txp, available);
- total += txp.npkt;
- remaining -= txp.npkt;
} else {
- total++;
- remaining--;
- ETHER_BPF_MTAP(ifp, m0);
- n = write_txpkt_wr(sc, txq, (void *)wr, m0, available);
+ n = tx_len16_to_desc(mbuf_len16(m0));
+ if (__predict_false(avail < n)) {
+ avail += reclaim_tx_descs(txq, 32);
+ if (avail < n)
+ break; /* out of descriptors */
+ }
+ if (sc->flags & IS_VF)
+ n = write_txpkt_vm_wr(sc, txq, m0);
+ else
+ n = write_txpkt_wr(sc, txq, m0, avail);
}
- MPASS(n >= 1 && n <= available);
+ MPASS(n >= 1 && n <= avail);
if (!(mbuf_cflags(m0) & MC_TLS))
MPASS(n <= SGE_MAX_WR_NDESC);
- available -= n;
+ avail -= n;
dbdiff += n;
IDXINCR(eq->pidx, n, eq->sidx);
- if (wr_can_update_eq(wr)) {
- if (total_available_tx_desc(eq) < eq->sidx / 4 &&
- atomic_cmpset_int(&eq->equiq, 0, 1)) {
- wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
- F_FW_WR_EQUEQ);
- eq->equeqidx = eq->pidx;
- } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >=
- 32) {
- wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
- eq->equeqidx = eq->pidx;
- }
- }
-
- if (dbdiff >= 16 && remaining >= 4) {
+ if (dbdiff >= 512 / EQ_ESIZE) { /* X_FETCHBURSTMAX_512B */
+ if (wr_can_update_eq(wr))
+ set_txupdate_flags(txq, avail, wr);
ring_eq_db(sc, eq, dbdiff);
- available += reclaim_tx_descs(txq, 4 * dbdiff);
+ avail += reclaim_tx_descs(txq, 32);
dbdiff = 0;
}
-
- cidx = next_cidx;
+next_mbuf:
+ total++;
+ remaining--;
+ if (__predict_false(++cidx == r->size))
+ cidx = 0;
}
if (dbdiff != 0) {
+ if (wr_can_update_eq(wr))
+ set_txupdate_flags(txq, avail, wr);
ring_eq_db(sc, eq, dbdiff);
reclaim_tx_descs(txq, 32);
+ } else if (eq->pidx == eq->cidx && txp->npkt > 0 &&
+ atomic_load_int(&txq->eq.equiq) == 0) {
+ /*
+ * If nothing was submitted to the chip for tx (it was coalesced
+ * into txpkts instead) and there is no tx update outstanding
+ * then we need to send txpkts now.
+ */
+send_txpkts:
+ MPASS(txp->npkt > 0);
+ for (i = 0; i < txp->npkt; i++)
+ ETHER_BPF_MTAP(ifp, txp->mb[i]);
+ if (txp->npkt > 1) {
+ MPASS(avail >= tx_len16_to_desc(txp->len16));
+ if (sc->flags & IS_VF)
+ n = write_txpkts_vm_wr(sc, txq);
+ else
+ n = write_txpkts_wr(sc, txq);
+ } else {
+ MPASS(avail >=
+ tx_len16_to_desc(mbuf_len16(txp->mb[0])));
+ if (sc->flags & IS_VF)
+ n = write_txpkt_vm_wr(sc, txq, txp->mb[0]);
+ else
+ n = write_txpkt_wr(sc, txq, txp->mb[0], avail);
+ }
+ MPASS(n <= SGE_MAX_WR_NDESC);
+ wr = &eq->desc[eq->pidx];
+ IDXINCR(eq->pidx, n, eq->sidx);
+ txp->npkt = 0; /* emptied */
+
+ MPASS(wr_can_update_eq(wr));
+ set_txupdate_flags(txq, avail - n, wr);
+ ring_eq_db(sc, eq, n);
+ reclaim_tx_descs(txq, 32);
}
-done:
- TXQ_UNLOCK(txq);
+ *coalescing = txp->npkt > 0;
return (total);
}
@@ -4106,11 +4169,12 @@
struct port_info *pi = vi->pi;
struct adapter *sc = pi->adapter;
struct sge_eq *eq = &txq->eq;
+ struct txpkts *txp;
char name[16];
struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
- M_CXGBE, M_WAITOK);
+ M_CXGBE, &eq->eq_lock, M_WAITOK);
if (rc != 0) {
device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc);
return (rc);
@@ -4147,6 +4211,12 @@
txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
M_ZERO | M_WAITOK);
+ txp = &txq->txp;
+ txp->score = 5;
+ MPASS(nitems(txp->mb) >= sc->params.max_pkts_per_eth_tx_pkts_wr);
+ txq->txp.max_npkt = min(nitems(txp->mb),
+ sc->params.max_pkts_per_eth_tx_pkts_wr);
+
snprintf(name, sizeof(name), "%d", idx);
oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name,
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queue");
@@ -4242,26 +4312,8 @@
"# of NIC TLS sessions using AES-GCM");
}
#endif
+ mp_ring_sysctls(txq->r, &vi->ctx, children);
- SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues",
- CTLFLAG_RD, &txq->r->enqueues,
- "# of enqueues to the mp_ring for this queue");
- SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops",
- CTLFLAG_RD, &txq->r->drops,
- "# of drops in the mp_ring for this queue");
- SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts",
- CTLFLAG_RD, &txq->r->starts,
- "# of normal consumer starts in the mp_ring for this queue");
- SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls",
- CTLFLAG_RD, &txq->r->stalls,
- "# of consumer stalls in the mp_ring for this queue");
- SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts",
- CTLFLAG_RD, &txq->r->restarts,
- "# of consumer restarts in the mp_ring for this queue");
- SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications",
- CTLFLAG_RD, &txq->r->abdications,
- "# of consumer abdications in the mp_ring for this queue");
-
return (0);
}
@@ -4655,10 +4707,10 @@
* The return value is the # of hardware descriptors used.
*/
static u_int
-write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq,
- struct fw_eth_tx_pkt_vm_wr *wr, struct mbuf *m0, u_int available)
+write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0)
{
- struct sge_eq *eq = &txq->eq;
+ struct sge_eq *eq;
+ struct fw_eth_tx_pkt_vm_wr *wr;
struct tx_sdesc *txsd;
struct cpl_tx_pkt_core *cpl;
uint32_t ctrl; /* used in many unrelated places */
@@ -4668,7 +4720,6 @@
TXQ_LOCK_ASSERT_OWNED(txq);
M_ASSERTPKTHDR(m0);
- MPASS(available > 0 && available < eq->sidx);
len16 = mbuf_len16(m0);
nsegs = mbuf_nsegs(m0);
@@ -4677,10 +4728,10 @@
if (needs_tso(m0))
ctrl += sizeof(struct cpl_tx_pkt_lso_core);
ndesc = tx_len16_to_desc(len16);
- MPASS(ndesc <= available);
/* Firmware work request header */
- MPASS(wr == (void *)&eq->desc[eq->pidx]);
+ eq = &txq->eq;
+ wr = (void *)&eq->desc[eq->pidx];
wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) |
V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
@@ -4760,7 +4811,6 @@
} else
write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
txq->sgl_wrs++;
-
txq->txpkt_wrs++;
txsd = &txq->sdesc[eq->pidx];
@@ -4811,10 +4861,11 @@
* The return value is the # of hardware descriptors used.
*/
static u_int
-write_txpkt_wr(struct adapter *sc, struct sge_txq *txq,
- struct fw_eth_tx_pkt_wr *wr, struct mbuf *m0, u_int available)
+write_txpkt_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0,
+ u_int available)
{
- struct sge_eq *eq = &txq->eq;
+ struct sge_eq *eq;
+ struct fw_eth_tx_pkt_wr *wr;
struct tx_sdesc *txsd;
struct cpl_tx_pkt_core *cpl;
uint32_t ctrl; /* used in many unrelated places */
@@ -4824,7 +4875,6 @@
TXQ_LOCK_ASSERT_OWNED(txq);
M_ASSERTPKTHDR(m0);
- MPASS(available > 0 && available < eq->sidx);
len16 = mbuf_len16(m0);
nsegs = mbuf_nsegs(m0);
@@ -4844,7 +4894,8 @@
MPASS(ndesc <= available);
/* Firmware work request header */
- MPASS(wr == (void *)&eq->desc[eq->pidx]);
+ eq = &txq->eq;
+ wr = (void *)&eq->desc[eq->pidx];
wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
@@ -4927,71 +4978,151 @@
return (ndesc);
}
+static inline bool
+cmp_l2hdr(struct txpkts *txp, struct mbuf *m)
+{
+ int len;
+
+ MPASS(txp->npkt > 0);
+ MPASS(m->m_len >= 16); /* type1 implies 1 GL with all of the frame. */
+
+ if (txp->ethtype == be16toh(ETHERTYPE_VLAN))
+ len = sizeof(struct ether_vlan_header);
+ else
+ len = sizeof(struct ether_header);
+
+ return (memcmp(m->m_data, &txp->ethmacdst[0], len) != 0);
+}
+
+static inline void
+save_l2hdr(struct txpkts *txp, struct mbuf *m)
+{
+ MPASS(m->m_len >= 16); /* type1 implies 1 GL with all of the frame. */
+
+ memcpy(&txp->ethmacdst[0], mtod(m, const void *), 16);
+}
+
static int
-try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available)
+add_to_txpkts_vf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m,
+ int avail, bool *send)
{
- u_int needed, nsegs1, nsegs2, l1, l2;
+ struct txpkts *txp = &txq->txp;
- if (cannot_use_txpkts(m) || cannot_use_txpkts(n))
- return (1);
+ MPASS(sc->flags & IS_VF);
- nsegs1 = mbuf_nsegs(m);
- nsegs2 = mbuf_nsegs(n);
- if (nsegs1 + nsegs2 == 2) {
- txp->wr_type = 1;
- l1 = l2 = txpkts1_len16();
- } else {
- txp->wr_type = 0;
- l1 = txpkts0_len16(nsegs1);
- l2 = txpkts0_len16(nsegs2);
+ /* Cannot have TSO and coalesce at the same time. */
+ if (cannot_use_txpkts(m)) {
+cannot_coalesce:
+ *send = txp->npkt > 0;
+ return (EINVAL);
}
- txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2;
- needed = tx_len16_to_desc(txp->len16);
- if (needed > SGE_MAX_WR_NDESC || needed > available)
- return (1);
- txp->plen = m->m_pkthdr.len + n->m_pkthdr.len;
- if (txp->plen > 65535)
- return (1);
+ /* VF allows coalescing of type 1 (1 GL) only */
+ if (mbuf_nsegs(m) > 1)
+ goto cannot_coalesce;
- txp->npkt = 2;
- set_mbuf_len16(m, l1);
- set_mbuf_len16(n, l2);
+ *send = false;
+ if (txp->npkt > 0) {
+ MPASS(tx_len16_to_desc(txp->len16) <= avail);
+ MPASS(txp->npkt < txp->max_npkt);
+ MPASS(txp->wr_type == 1); /* VF supports type 1 only */
+ if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) > avail) {
+retry_after_send:
+ *send = true;
+ return (EAGAIN);
+ }
+ if (m->m_pkthdr.len + txp->plen > 65535)
+ goto retry_after_send;
+ if (cmp_l2hdr(txp, m))
+ goto retry_after_send;
+
+ txp->len16 += txpkts1_len16();
+ txp->plen += m->m_pkthdr.len;
+ txp->mb[txp->npkt++] = m;
+ if (txp->npkt == txp->max_npkt)
+ *send = true;
+ } else {
+ txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_vm_wr), 16) +
+ txpkts1_len16();
+ if (tx_len16_to_desc(txp->len16) > avail)
+ goto cannot_coalesce;
+ txp->npkt = 1;
+ txp->wr_type = 1;
+ txp->plen = m->m_pkthdr.len;
+ txp->mb[0] = m;
+ save_l2hdr(txp, m);
+ }
return (0);
}
static int
-add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
+add_to_txpkts_pf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m,
+ int avail, bool *send)
{
- u_int plen, len16, needed, nsegs;
+ struct txpkts *txp = &txq->txp;
+ int nsegs;
- MPASS(txp->wr_type == 0 || txp->wr_type == 1);
+ MPASS(!(sc->flags & IS_VF));
- if (cannot_use_txpkts(m))
- return (1);
+ /* Cannot have TSO and coalesce at the same time. */
+ if (cannot_use_txpkts(m)) {
+cannot_coalesce:
+ *send = txp->npkt > 0;
+ return (EINVAL);
+ }
+ *send = false;
nsegs = mbuf_nsegs(m);
- if (txp->wr_type == 1 && nsegs != 1)
- return (1);
+ if (txp->npkt == 0) {
+ if (m->m_pkthdr.len > 65535)
+ goto cannot_coalesce;
+ if (nsegs > 1) {
+ txp->wr_type = 0;
+ txp->len16 =
+ howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) +
+ txpkts0_len16(nsegs);
+ } else {
+ txp->wr_type = 1;
+ txp->len16 =
+ howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) +
+ txpkts1_len16();
+ }
+ if (tx_len16_to_desc(txp->len16) > avail)
+ goto cannot_coalesce;
+ txp->npkt = 1;
+ txp->plen = m->m_pkthdr.len;
+ txp->mb[0] = m;
+ } else {
+ MPASS(tx_len16_to_desc(txp->len16) <= avail);
+ MPASS(txp->npkt < txp->max_npkt);
- plen = txp->plen + m->m_pkthdr.len;
- if (plen > 65535)
- return (1);
+ if (m->m_pkthdr.len + txp->plen > 65535) {
+retry_after_send:
+ *send = true;
+ return (EAGAIN);
+ }
- if (txp->wr_type == 0)
- len16 = txpkts0_len16(nsegs);
- else
- len16 = txpkts1_len16();
- needed = tx_len16_to_desc(txp->len16 + len16);
- if (needed > SGE_MAX_WR_NDESC || needed > available)
- return (1);
+ MPASS(txp->wr_type == 0 || txp->wr_type == 1);
+ if (txp->wr_type == 0) {
+ if (tx_len16_to_desc(txp->len16 +
+ txpkts0_len16(nsegs)) > min(avail, SGE_MAX_WR_NDESC))
+ goto retry_after_send;
+ txp->len16 += txpkts0_len16(nsegs);
+ } else {
+ if (nsegs != 1)
+ goto retry_after_send;
+ if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) >
+ avail)
+ goto retry_after_send;
+ txp->len16 += txpkts1_len16();
+ }
- txp->npkt++;
- txp->plen = plen;
- txp->len16 += len16;
- set_mbuf_len16(m, len16);
-
+ txp->plen += m->m_pkthdr.len;
+ txp->mb[txp->npkt++] = m;
+ if (txp->npkt == txp->max_npkt)
+ *send = true;
+ }
return (0);
}
@@ -5003,34 +5134,25 @@
* The return value is the # of hardware descriptors used.
*/
static u_int
-write_txpkts_wr(struct adapter *sc, struct sge_txq *txq,
- struct fw_eth_tx_pkts_wr *wr, struct mbuf *m0, const struct txpkts *txp,
- u_int available)
+write_txpkts_wr(struct adapter *sc, struct sge_txq *txq)
{
+ const struct txpkts *txp = &txq->txp;
struct sge_eq *eq = &txq->eq;
+ struct fw_eth_tx_pkts_wr *wr;
struct tx_sdesc *txsd;
struct cpl_tx_pkt_core *cpl;
- uint32_t ctrl;
uint64_t ctrl1;
- int ndesc, checkwrap;
- struct mbuf *m;
+ int ndesc, i, checkwrap;
+ struct mbuf *m, *last;
void *flitp;
TXQ_LOCK_ASSERT_OWNED(txq);
MPASS(txp->npkt > 0);
- MPASS(txp->plen < 65536);
- MPASS(m0 != NULL);
- MPASS(m0->m_nextpkt != NULL);
MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
- MPASS(available > 0 && available < eq->sidx);
- ndesc = tx_len16_to_desc(txp->len16);
- MPASS(ndesc <= available);
-
- MPASS(wr == (void *)&eq->desc[eq->pidx]);
+ wr = (void *)&eq->desc[eq->pidx];
wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
- ctrl = V_FW_WR_LEN16(txp->len16);
- wr->equiq_to_len16 = htobe32(ctrl);
+ wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16));
wr->plen = htobe16(txp->plen);
wr->npkt = txp->npkt;
wr->r3 = 0;
@@ -5042,8 +5164,11 @@
* set then we know the WR is going to wrap around somewhere. We'll
* check for that at appropriate points.
*/
+ ndesc = tx_len16_to_desc(txp->len16);
+ last = NULL;
checkwrap = eq->sidx - ndesc < eq->pidx;
- for (m = m0; m != NULL; m = m->m_nextpkt) {
+ for (i = 0; i < txp->npkt; i++) {
+ m = txp->mb[i];
if (txp->wr_type == 0) {
struct ulp_txpkt *ulpmc;
struct ulptx_idata *ulpsc;
@@ -5052,7 +5177,7 @@
ulpmc = flitp;
ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
- ulpmc->len = htobe32(mbuf_len16(m));
+ ulpmc->len = htobe32(txpkts0_len16(mbuf_nsegs(m)));
/* ULP subcommand */
ulpsc = (void *)(ulpmc + 1);
@@ -5093,8 +5218,12 @@
write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
+ if (last != NULL)
+ last->m_nextpkt = m;
+ last = m;
}
+ txq->sgl_wrs++;
if (txp->wr_type == 0) {
txq->txpkts0_pkts += txp->npkt;
txq->txpkts0_wrs++;
@@ -5104,12 +5233,92 @@
}
txsd = &txq->sdesc[eq->pidx];
- txsd->m = m0;
+ txsd->m = txp->mb[0];
txsd->desc_used = ndesc;
return (ndesc);
}
+static u_int
+write_txpkts_vm_wr(struct adapter *sc, struct sge_txq *txq)
+{
+ const struct txpkts *txp = &txq->txp;
+ struct sge_eq *eq = &txq->eq;
+ struct fw_eth_tx_pkts_vm_wr *wr;
+ struct tx_sdesc *txsd;
+ struct cpl_tx_pkt_core *cpl;
+ uint64_t ctrl1;
+ int ndesc, i;
+ struct mbuf *m, *last;
+ void *flitp;
+
+ TXQ_LOCK_ASSERT_OWNED(txq);
+ MPASS(txp->npkt > 0);
+ MPASS(txp->wr_type == 1); /* VF supports type 1 only */
+ MPASS(txp->mb[0] != NULL);
+ MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
+
+ wr = (void *)&eq->desc[eq->pidx];
+ wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_VM_WR));
+ wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16));
+ wr->r3 = 0;
+ wr->plen = htobe16(txp->plen);
+ wr->npkt = txp->npkt;
+ wr->r4 = 0;
+ memcpy(&wr->ethmacdst[0], &txp->ethmacdst[0], 16);
+ flitp = wr + 1;
+
+ /*
+ * At this point we are 32B into a hardware descriptor. Each mbuf in
+ * the WR will take 32B so we check for the end of the descriptor ring
+ * before writing odd mbufs (mb[1], 3, 5, ..)
+ */
+ ndesc = tx_len16_to_desc(txp->len16);
+ last = NULL;
+ for (i = 0; i < txp->npkt; i++) {
+ m = txp->mb[i];
+ if (i & 1 && (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
+ flitp = &eq->desc[0];
+ cpl = flitp;
+
+ /* Checksum offload */
+ ctrl1 = csum_to_ctrl(sc, m);
+ if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
+ txq->txcsum++; /* some hardware assistance provided */
+
+ /* VLAN tag insertion */
+ if (needs_vlan_insertion(m)) {
+ ctrl1 |= F_TXPKT_VLAN_VLD |
+ V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
+ txq->vlan_insertion++;
+ }
+
+ /* CPL header */
+ cpl->ctrl0 = txq->cpl_ctrl0;
+ cpl->pack = 0;
+ cpl->len = htobe16(m->m_pkthdr.len);
+ cpl->ctrl1 = htobe64(ctrl1);
+
+ flitp = cpl + 1;
+ MPASS(mbuf_nsegs(m) == 1);
+ write_gl_to_txd(txq, m, (caddr_t *)(&flitp), 0);
+
+ if (last != NULL)
+ last->m_nextpkt = m;
+ last = m;
+ }
+
+ txq->sgl_wrs++;
+ txq->txpkts1_pkts += txp->npkt;
+ txq->txpkts1_wrs++;
+
+ txsd = &txq->sdesc[eq->pidx];
+ txsd->m = txp->mb[0];
+ txsd->desc_used = ndesc;
+
+ return (ndesc);
+}
+
/*
* If the SGL ends on an address that is not 16 byte aligned, this function will
* add a 0 filled flit at the end.
@@ -5444,8 +5653,10 @@
MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
atomic_readandclear_int(&eq->equiq);
- mp_ring_check_drainage(txq->r, 0);
- taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
+ if (mp_ring_is_idle(txq->r))
+ taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
+ else
+ mp_ring_check_drainage(txq->r, 64);
}
static int
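
eth_tx() above keeps a small per-queue score (1 to 10) that tunes how eagerly it coalesces: coalescing is attempted when the score is above 3, the score climbs each time a multi-packet work request goes out, drops when an egress update had to drain the held packets, and drifts back toward 5 while the hardware queue is idle. A standalone sketch of that heuristic with the same thresholds; the event enum and helper names are assumptions.

/*
 * Sketch of the per-queue coalescing "score" heuristic from eth_tx().
 * The thresholds (1..10, coalesce when > 3, drift toward 5 when idle) follow
 * the diff; update_score() and should_coalesce() are illustrative names.
 */
#include <stdbool.h>
#include <stdio.h>

enum tx_event {
	HW_QUEUE_EMPTY,		/* eq->pidx == eq->cidx: tx completely idle */
	DRAINED_BY_EGR_UPDATE,	/* called with nothing new to transmit */
	SENT_COALESCED_WR,	/* a multi-packet work request went out */
};

static void
update_score(int *score, enum tx_event ev)
{
	switch (ev) {
	case HW_QUEUE_EMPTY:
		*score = *score >= 5 ? 5 : *score + 1;	/* drift toward 5 */
		break;
	case DRAINED_BY_EGR_UPDATE:
		*score = *score <= 1 ? 1 : *score - 1;	/* back off */
		break;
	case SENT_COALESCED_WR:
		*score = *score >= 10 ? 10 : *score + 1; /* coalescing pays off */
		break;
	}
}

static bool
should_coalesce(int score, int npkt_held, int remaining, bool equiq_pending)
{
	return (npkt_held > 0 || remaining > 1 || score > 3 || equiq_pending);
}

int
main(void)
{
	int score = 5;	/* initial value set when the txq is created */

	update_score(&score, SENT_COALESCED_WR);
	printf("score=%d coalesce(single pkt)=%d\n", score,
	    should_coalesce(score, 0, 1, false));
	update_score(&score, DRAINED_BY_EGR_UPDATE);
	update_score(&score, DRAINED_BY_EGR_UPDATE);
	update_score(&score, DRAINED_BY_EGR_UPDATE);
	printf("score=%d coalesce(single pkt)=%d\n", score,
	    should_coalesce(score, 0, 1, false));
	return (0);
}
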
Index: head/sys/dev/cxgbe/t4_vf.c
===================================================================
--- head/sys/dev/cxgbe/t4_vf.c
+++ head/sys/dev/cxgbe/t4_vf.c
@@ -231,6 +231,7 @@
get_params__post_init(struct adapter *sc)
{
int rc;
+ uint32_t param, val;
rc = -t4vf_get_sge_params(sc);
if (rc != 0) {
@@ -281,6 +282,13 @@
return (EINVAL);
}
sc->params.portvec = sc->params.vfres.pmask;
+
+ param = FW_PARAM_PFVF(MAX_PKTS_PER_ETH_TX_PKTS_WR);
+ rc = -t4vf_query_params(sc, 1, &param, &val);
+ if (rc == 0)
+ sc->params.max_pkts_per_eth_tx_pkts_wr = val;
+ else
+ sc->params.max_pkts_per_eth_tx_pkts_wr = 14;
return (0);
}
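
On the VF, packets may only be coalesced into an FW_ETH_TX_PKTS_VM_WR when they share the Ethernet header that is carried once in the work request itself, which is what cmp_l2hdr()/save_l2hdr() in the t4_sge.c hunk check against the 16 bytes saved in struct txpkts. A simplified standalone model of that check follows; it compares at most the 16 saved bytes, and the names mirror the diff but operate on plain byte buffers here.

/*
 * Simplified model of the VF-only L2 header check used when coalescing into
 * FW_ETH_TX_PKTS_VM_WR: the first 16 bytes of the frame (dst MAC, src MAC,
 * ethertype, VLAN TCI) are saved with the first packet and later packets must
 * match.  Standalone sketch, not driver code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct l2hdr {			/* the 16 bytes kept in struct txpkts */
	uint8_t ethmacdst[6];
	uint8_t ethmacsrc[6];
	uint16_t ethtype;	/* network byte order in the driver */
	uint16_t vlantci;
};

static void
save_l2hdr(struct l2hdr *saved, const uint8_t *frame)
{
	memcpy(saved, frame, 16);
}

/* Returns true when the new frame's header differs and coalescing must stop. */
static bool
l2hdr_differs(const struct l2hdr *saved, const uint8_t *frame)
{
	const uint8_t *p = (const uint8_t *)saved;
	size_t len;

	/* Ethertype bytes 0x81 0x00 mean an 802.1Q tag follows. */
	len = (p[12] == 0x81 && p[13] == 0x00) ? 16 : 14;
	return (memcmp(frame, p, len) != 0);
}

int
main(void)
{
	uint8_t f1[16] = { 1, 2, 3, 4, 5, 6,  7, 8, 9, 10, 11, 12,  0x08, 0x00 };
	uint8_t f2[16] = { 1, 2, 3, 4, 5, 6,  7, 8, 9, 10, 11, 12,  0x86, 0xdd };
	struct l2hdr saved;

	save_l2hdr(&saved, f1);
	printf("same frame differs: %d\n", l2hdr_differs(&saved, f1));
	printf("other type differs: %d\n", l2hdr_differs(&saved, f2));
	return (0);
}
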
