Index: sys/dev/cxgbe/adapter.h =================================================================== --- sys/dev/cxgbe/adapter.h +++ sys/dev/cxgbe/adapter.h @@ -550,6 +550,23 @@ struct mp_ring; +struct txpkts { + uint8_t wr_type; /* type 0 or type 1 */ + uint8_t npkt; /* # of packets in this work request */ + uint8_t len16; /* # of 16B pieces used by this work request */ + uint8_t score; /* 1-10. coalescing attempted if score > 3 */ + uint8_t max_npkt; /* maximum number of packets allowed */ + uint16_t plen; /* total payload (sum of all packets) */ + + /* straight from fw_eth_tx_pkts_vm_wr. */ + __u8 ethmacdst[6]; + __u8 ethmacsrc[6]; + __be16 ethtype; + __be16 vlantci; + + struct mbuf *mb[15]; +}; + /* txq: SGE egress queue + what's needed for Ethernet NIC */ struct sge_txq { struct sge_eq eq; /* MUST be first */ @@ -560,6 +577,7 @@ struct sglist *gl; __be32 cpl_ctrl0; /* for convenience */ int tc_idx; /* traffic class */ + struct txpkts txp; struct task tx_reclaim_task; /* stats for common events first */ Index: sys/dev/cxgbe/common/common.h =================================================================== --- sys/dev/cxgbe/common/common.h +++ sys/dev/cxgbe/common/common.h @@ -389,6 +389,7 @@ bool ulptx_memwrite_dsgl; /* use of T5 DSGL allowed */ bool fr_nsmr_tpte_wr_support; /* FW support for FR_NSMR_TPTE_WR */ bool viid_smt_extn_support; /* FW returns vin, vfvld & smt index? */ + unsigned int max_pkts_per_eth_tx_pkts_wr; }; #define CHELSIO_T4 0x4 Index: sys/dev/cxgbe/t4_main.c =================================================================== --- sys/dev/cxgbe/t4_main.c +++ sys/dev/cxgbe/t4_main.c @@ -2191,7 +2191,7 @@ vi->rsrv_noflowq); items[0] = m; - rc = mp_ring_enqueue(txq->r, items, 1, 4096); + rc = mp_ring_enqueue(txq->r, items, 1, 256); if (__predict_false(rc != 0)) m_freem(m); @@ -2212,7 +2212,7 @@ txq->eq.flags |= EQ_QFLUSH; TXQ_UNLOCK(txq); while (!mp_ring_is_idle(txq->r)) { - mp_ring_check_drainage(txq->r, 0); + mp_ring_check_drainage(txq->r, 4096); pause("qflush", 1); } TXQ_LOCK(txq); @@ -2261,7 +2261,7 @@ struct sge_txq *txq; for_each_txq(vi, i, txq) - drops += counter_u64_fetch(txq->r->drops); + drops += counter_u64_fetch(txq->r->dropped); } return (drops); @@ -2326,7 +2326,7 @@ struct sge_txq *txq; for_each_txq(vi, i, txq) - drops += counter_u64_fetch(txq->r->drops); + drops += counter_u64_fetch(txq->r->dropped); } return (drops); @@ -4457,6 +4457,13 @@ else sc->params.fr_nsmr_tpte_wr_support = false; + param[0] = FW_PARAM_PFVF(MAX_PKTS_PER_ETH_TX_PKTS_WR); + rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); + if (rc == 0) + sc->params.max_pkts_per_eth_tx_pkts_wr = val[0]; + else + sc->params.max_pkts_per_eth_tx_pkts_wr = 15; + /* get capabilites */ bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | @@ -5965,7 +5972,7 @@ /* Wait for the mp_ring to empty. 
*/ while (!mp_ring_is_idle(txq->r)) { - mp_ring_check_drainage(txq->r, 0); + mp_ring_check_drainage(txq->r, 4096); pause("rquiesce", 1); } Index: sys/dev/cxgbe/t4_mp_ring.h =================================================================== --- sys/dev/cxgbe/t4_mp_ring.h +++ sys/dev/cxgbe/t4_mp_ring.h @@ -36,33 +36,38 @@ #endif struct mp_ring; -typedef u_int (*ring_drain_t)(struct mp_ring *, u_int, u_int); +typedef u_int (*ring_drain_t)(struct mp_ring *, u_int, u_int, bool *); typedef u_int (*ring_can_drain_t)(struct mp_ring *); struct mp_ring { volatile uint64_t state __aligned(CACHE_LINE_SIZE); + struct malloc_type * mt; int size __aligned(CACHE_LINE_SIZE); void * cookie; - struct malloc_type * mt; ring_drain_t drain; ring_can_drain_t can_drain; /* cheap, may be unreliable */ - counter_u64_t enqueues; - counter_u64_t drops; - counter_u64_t starts; - counter_u64_t stalls; - counter_u64_t restarts; /* recovered after stalling */ + struct mtx * cons_lock; + counter_u64_t dropped; + counter_u64_t consumer[4]; + counter_u64_t not_consumer; counter_u64_t abdications; + counter_u64_t consumed; + counter_u64_t cons_idle; + counter_u64_t cons_idle2; + counter_u64_t stalls; void * volatile items[] __aligned(CACHE_LINE_SIZE); }; int mp_ring_alloc(struct mp_ring **, int, void *, ring_drain_t, - ring_can_drain_t, struct malloc_type *, int); + ring_can_drain_t, struct malloc_type *, struct mtx *, int); void mp_ring_free(struct mp_ring *); int mp_ring_enqueue(struct mp_ring *, void **, int, int); void mp_ring_check_drainage(struct mp_ring *, int); void mp_ring_reset_stats(struct mp_ring *); -int mp_ring_is_idle(struct mp_ring *); +bool mp_ring_is_idle(struct mp_ring *); +void mp_ring_sysctls(struct mp_ring *, struct sysctl_ctx_list *, + struct sysctl_oid_list *); #endif Index: sys/dev/cxgbe/t4_mp_ring.c =================================================================== --- sys/dev/cxgbe/t4_mp_ring.c +++ sys/dev/cxgbe/t4_mp_ring.c @@ -34,6 +34,8 @@ #include #include #include +#include +#include #include #include "t4_mp_ring.h" @@ -43,6 +45,23 @@ #define atomic_cmpset_rel_64 atomic_cmpset_64 #endif +/* + * mp_ring handles multiple threads (producers) enqueueing data to a tx queue. + * The thread that is writing the hardware descriptors is the consumer and it + * runs with the consumer lock held. A producer becomes the consumer if there + * isn't one already. The consumer runs with the flags set to BUSY and + * consumes everything (IDLE or COALESCING) or gets STALLED. If it is running + * over its budget it sets flags to TOO_BUSY. A producer that observes a + * TOO_BUSY consumer will become the new consumer by setting flags to + * TAKING_OVER. The original consumer stops and sets the flags back to BUSY for + * the new consumer. + * + * COALESCING is the same as IDLE except there are items being held in the hope + * that they can be coalesced with items that follow. The driver must arrange + * for a tx update or some other event that transmits all the held items in a + * timely manner if nothing else is enqueued. + */ + union ring_state { struct { uint16_t pidx_head; @@ -54,13 +73,21 @@ }; enum { - IDLE = 0, /* consumer ran to completion, nothing more to do. */ + IDLE = 0, /* tx is all caught up, nothing to do. */ + COALESCING, /* IDLE, but tx frames are being held for coalescing */ BUSY, /* consumer is running already, or will be shortly. 
*/ + TOO_BUSY, /* consumer is running and is beyond its budget */ + TAKING_OVER, /* new consumer taking over from a TOO_BUSY consumer */ STALLED, /* consumer stopped due to lack of resources. */ - ABDICATED, /* consumer stopped even though there was work to be - done because it wants another thread to take over. */ }; +enum { + C_FAST = 0, + C_2, + C_3, + C_TAKEOVER, +}; + static inline uint16_t space_available(struct mp_ring *r, union ring_state s) { @@ -83,93 +110,104 @@ return (x > n ? idx + n : n - x); } -/* Consumer is about to update the ring's state to s */ -static inline uint16_t -state_to_flags(union ring_state s, int abdicate) -{ - - if (s.cidx == s.pidx_tail) - return (IDLE); - else if (abdicate && s.pidx_tail != s.pidx_head) - return (ABDICATED); - - return (BUSY); -} - /* - * Caller passes in a state, with a guarantee that there is work to do and that - * all items up to the pidx_tail in the state are visible. + * Consumer. Called with the consumer lock held and a guarantee that there is + * work to do. */ static void -drain_ring(struct mp_ring *r, union ring_state os, uint16_t prev, int budget) +drain_ring(struct mp_ring *r, int budget) { - union ring_state ns; + union ring_state os, ns; int n, pending, total; - uint16_t cidx = os.cidx; - uint16_t pidx = os.pidx_tail; + uint16_t cidx; + uint16_t pidx; + bool coalescing; + mtx_assert(r->cons_lock, MA_OWNED); + + os.state = atomic_load_acq_64(&r->state); MPASS(os.flags == BUSY); + + cidx = os.cidx; + pidx = os.pidx_tail; MPASS(cidx != pidx); - if (prev == IDLE) - counter_u64_add(r->starts, 1); pending = 0; total = 0; while (cidx != pidx) { /* Items from cidx to pidx are available for consumption. */ - n = r->drain(r, cidx, pidx); + n = r->drain(r, cidx, pidx, &coalescing); if (n == 0) { critical_enter(); os.state = r->state; do { ns.state = os.state; ns.cidx = cidx; - ns.flags = STALLED; + + MPASS(os.flags == BUSY || + os.flags == TOO_BUSY || + os.flags == TAKING_OVER); + + if (os.flags == TAKING_OVER) + ns.flags = BUSY; + else + ns.flags = STALLED; } while (atomic_fcmpset_64(&r->state, &os.state, ns.state) == 0); critical_exit(); - if (prev != STALLED) + if (os.flags == TAKING_OVER) + counter_u64_add(r->abdications, 1); + else if (ns.flags == STALLED) counter_u64_add(r->stalls, 1); - else if (total > 0) { - counter_u64_add(r->restarts, 1); - counter_u64_add(r->stalls, 1); - } break; } cidx = increment_idx(r, cidx, n); pending += n; total += n; + counter_u64_add(r->consumed, n); - /* - * We update the cidx only if we've caught up with the pidx, the - * real cidx is getting too far ahead of the one visible to - * everyone else, or we have exceeded our budget. - */ - if (cidx != pidx && pending < 64 && total < budget) - continue; - critical_enter(); - os.state = r->state; + os.state = atomic_load_acq_64(&r->state); do { + MPASS(os.flags == BUSY || os.flags == TOO_BUSY || + os.flags == TAKING_OVER); + ns.state = os.state; ns.cidx = cidx; - ns.flags = state_to_flags(ns, total >= budget); + if (__predict_false(os.flags == TAKING_OVER)) { + MPASS(total >= budget); + ns.flags = BUSY; + continue; + } + if (cidx == os.pidx_tail) { + ns.flags = coalescing ? 
COALESCING : IDLE; + continue; + } + if (total >= budget) { + ns.flags = TOO_BUSY; + continue; + } + MPASS(os.flags == BUSY); + if (pending < 32) + break; } while (atomic_fcmpset_acq_64(&r->state, &os.state, ns.state) == 0); - critical_exit(); - if (ns.flags == ABDICATED) + if (__predict_false(os.flags == TAKING_OVER)) { + MPASS(ns.flags == BUSY); counter_u64_add(r->abdications, 1); - if (ns.flags != BUSY) { - /* Wrong loop exit if we're going to stall. */ - MPASS(ns.flags != STALLED); - if (prev == STALLED) { - MPASS(total > 0); - counter_u64_add(r->restarts, 1); - } break; } + if (ns.flags == IDLE || ns.flags == COALESCING) { + MPASS(ns.pidx_tail == cidx); + if (ns.pidx_head != ns.pidx_tail) + counter_u64_add(r->cons_idle2, 1); + else + counter_u64_add(r->cons_idle, 1); + break; + } + /* * The acquire style atomic above guarantees visibility of items * associated with any pidx change that we notice here. @@ -177,13 +215,55 @@ pidx = ns.pidx_tail; pending = 0; } + +#ifdef INVARIANTS + if (os.flags == TAKING_OVER) + MPASS(ns.flags == BUSY); + else { + MPASS(ns.flags == IDLE || ns.flags == COALESCING || + ns.flags == STALLED); + } +#endif } +static void +drain_txpkts(struct mp_ring *r, union ring_state os, int budget) +{ + union ring_state ns; + uint16_t cidx = os.cidx; + uint16_t pidx = os.pidx_tail; + bool coalescing; + + mtx_assert(r->cons_lock, MA_OWNED); + MPASS(os.flags == BUSY); + MPASS(cidx == pidx); + + r->drain(r, cidx, pidx, &coalescing); + MPASS(coalescing == false); + critical_enter(); + os.state = r->state; + do { + ns.state = os.state; + MPASS(os.flags == BUSY); + MPASS(os.cidx == cidx); + if (ns.cidx == ns.pidx_tail) + ns.flags = IDLE; + else + ns.flags = BUSY; + } while (atomic_fcmpset_acq_64(&r->state, &os.state, ns.state) == 0); + critical_exit(); + + if (ns.flags == BUSY) + drain_ring(r, budget); +} + int mp_ring_alloc(struct mp_ring **pr, int size, void *cookie, ring_drain_t drain, - ring_can_drain_t can_drain, struct malloc_type *mt, int flags) + ring_can_drain_t can_drain, struct malloc_type *mt, struct mtx *lck, + int flags) { struct mp_ring *r; + int i; /* All idx are 16b so size can be 65536 at most */ if (pr == NULL || size < 2 || size > 65536 || drain == NULL || @@ -201,43 +281,59 @@ r->mt = mt; r->drain = drain; r->can_drain = can_drain; - r->enqueues = counter_u64_alloc(flags); - r->drops = counter_u64_alloc(flags); - r->starts = counter_u64_alloc(flags); - r->stalls = counter_u64_alloc(flags); - r->restarts = counter_u64_alloc(flags); - r->abdications = counter_u64_alloc(flags); - if (r->enqueues == NULL || r->drops == NULL || r->starts == NULL || - r->stalls == NULL || r->restarts == NULL || - r->abdications == NULL) { - mp_ring_free(r); - return (ENOMEM); + r->cons_lock = lck; + if ((r->dropped = counter_u64_alloc(flags)) == NULL) + goto failed; + for (i = 0; i < nitems(r->consumer); i++) { + if ((r->consumer[i] = counter_u64_alloc(flags)) == NULL) + goto failed; } - + if ((r->not_consumer = counter_u64_alloc(flags)) == NULL) + goto failed; + if ((r->abdications = counter_u64_alloc(flags)) == NULL) + goto failed; + if ((r->stalls = counter_u64_alloc(flags)) == NULL) + goto failed; + if ((r->consumed = counter_u64_alloc(flags)) == NULL) + goto failed; + if ((r->cons_idle = counter_u64_alloc(flags)) == NULL) + goto failed; + if ((r->cons_idle2 = counter_u64_alloc(flags)) == NULL) + goto failed; *pr = r; return (0); +failed: + mp_ring_free(r); + return (ENOMEM); } void mp_ring_free(struct mp_ring *r) { + int i; if (r == NULL) return; - if (r->enqueues != NULL) 
- counter_u64_free(r->enqueues); - if (r->drops != NULL) - counter_u64_free(r->drops); - if (r->starts != NULL) - counter_u64_free(r->starts); - if (r->stalls != NULL) - counter_u64_free(r->stalls); - if (r->restarts != NULL) - counter_u64_free(r->restarts); + if (r->dropped != NULL) + counter_u64_free(r->dropped); + for (i = 0; i < nitems(r->consumer); i++) { + if (r->consumer[i] != NULL) + counter_u64_free(r->consumer[i]); + } + if (r->not_consumer != NULL) + counter_u64_free(r->not_consumer); if (r->abdications != NULL) counter_u64_free(r->abdications); + if (r->stalls != NULL) + counter_u64_free(r->stalls); + if (r->consumed != NULL) + counter_u64_free(r->consumed); + if (r->cons_idle != NULL) + counter_u64_free(r->cons_idle); + if (r->cons_idle2 != NULL) + counter_u64_free(r->cons_idle2); free(r, r->mt); } @@ -252,7 +348,8 @@ { union ring_state os, ns; uint16_t pidx_start, pidx_stop; - int i; + int i, nospc, cons; + bool consumer; MPASS(items != NULL); MPASS(n > 0); @@ -261,26 +358,70 @@ * Reserve room for the new items. Our reservation, if successful, is * from 'pidx_start' to 'pidx_stop'. */ + nospc = 0; os.state = r->state; for (;;) { - if (n >= space_available(r, os)) { - counter_u64_add(r->drops, n); + for (;;) { + if (__predict_true(space_available(r, os) >= n)) + break; + + /* Not enough room in the ring. */ + MPASS(os.flags != IDLE); + MPASS(os.flags != COALESCING); + if (__predict_false(++nospc > 100)) { + counter_u64_add(r->dropped, n); + return (ENOBUFS); + } if (os.flags == STALLED) - mp_ring_check_drainage(r, 0); - return (ENOBUFS); + mp_ring_check_drainage(r, 64); + else + cpu_spinwait(); + os.state = r->state; } + + /* There is room in the ring. */ + + cons = -1; ns.state = os.state; ns.pidx_head = increment_idx(r, os.pidx_head, n); + if (os.flags == IDLE || os.flags == COALESCING) { + MPASS(os.pidx_tail == os.cidx); + if (os.pidx_head == os.pidx_tail) { + cons = C_FAST; + ns.pidx_tail = increment_idx(r, os.pidx_tail, n); + } else + cons = C_2; + ns.flags = BUSY; + } else if (os.flags == TOO_BUSY) { + cons = C_TAKEOVER; + ns.flags = TAKING_OVER; + } critical_enter(); if (atomic_fcmpset_64(&r->state, &os.state, ns.state)) break; critical_exit(); cpu_spinwait(); - } + }; + pidx_start = os.pidx_head; pidx_stop = ns.pidx_head; + if (cons == C_FAST) { + i = pidx_start; + do { + r->items[i] = *items++; + if (__predict_false(++i == r->size)) + i = 0; + } while (i != pidx_stop); + critical_exit(); + counter_u64_add(r->consumer[C_FAST], 1); + mtx_lock(r->cons_lock); + drain_ring(r, budget); + mtx_unlock(r->cons_lock); + return (0); + } + /* * Wait for other producers who got in ahead of us to enqueue their * items, one producer at a time. It is our turn when the ring's @@ -305,19 +446,31 @@ */ os.state = r->state; do { + consumer = false; ns.state = os.state; ns.pidx_tail = pidx_stop; - ns.flags = BUSY; + if (os.flags == IDLE || os.flags == COALESCING || + (os.flags == STALLED && r->can_drain(r))) { + MPASS(cons == -1); + consumer = true; + ns.flags = BUSY; + } } while (atomic_fcmpset_rel_64(&r->state, &os.state, ns.state) == 0); critical_exit(); - counter_u64_add(r->enqueues, n); - /* - * Turn into a consumer if some other thread isn't active as a consumer - * already. 
- */ - if (os.flags != BUSY) - drain_ring(r, ns, os.flags, budget); + if (cons == -1) { + if (consumer) + cons = C_3; + else { + counter_u64_add(r->not_consumer, 1); + return (0); + } + } + MPASS(cons > C_FAST && cons < nitems(r->consumer)); + counter_u64_add(r->consumer[cons], 1); + mtx_lock(r->cons_lock); + drain_ring(r, budget); + mtx_unlock(r->cons_lock); return (0); } @@ -328,37 +481,44 @@ union ring_state os, ns; os.state = r->state; - if (os.flags != STALLED || os.pidx_head != os.pidx_tail || - r->can_drain(r) == 0) - return; - - MPASS(os.cidx != os.pidx_tail); /* implied by STALLED */ - ns.state = os.state; - ns.flags = BUSY; - - /* - * The acquire style atomic guarantees visibility of items associated - * with the pidx that we read here. - */ - if (!atomic_cmpset_acq_64(&r->state, os.state, ns.state)) - return; - - drain_ring(r, ns, os.flags, budget); + if (os.flags == STALLED && r->can_drain(r)) { + MPASS(os.cidx != os.pidx_tail); /* implied by STALLED */ + ns.state = os.state; + ns.flags = BUSY; + if (atomic_cmpset_acq_64(&r->state, os.state, ns.state)) { + mtx_lock(r->cons_lock); + drain_ring(r, budget); + mtx_unlock(r->cons_lock); + } + } else if (os.flags == COALESCING) { + MPASS(os.cidx == os.pidx_tail); + ns.state = os.state; + ns.flags = BUSY; + if (atomic_cmpset_acq_64(&r->state, os.state, ns.state)) { + mtx_lock(r->cons_lock); + drain_txpkts(r, ns, budget); + mtx_unlock(r->cons_lock); + } + } } void mp_ring_reset_stats(struct mp_ring *r) { + int i; - counter_u64_zero(r->enqueues); - counter_u64_zero(r->drops); - counter_u64_zero(r->starts); - counter_u64_zero(r->stalls); - counter_u64_zero(r->restarts); + counter_u64_zero(r->dropped); + for (i = 0; i < nitems(r->consumer); i++) + counter_u64_zero(r->consumer[i]); + counter_u64_zero(r->not_consumer); counter_u64_zero(r->abdications); + counter_u64_zero(r->stalls); + counter_u64_zero(r->consumed); + counter_u64_zero(r->cons_idle); + counter_u64_zero(r->cons_idle2); } -int +bool mp_ring_is_idle(struct mp_ring *r) { union ring_state s; @@ -366,7 +526,50 @@ s.state = r->state; if (s.pidx_head == s.pidx_tail && s.pidx_tail == s.cidx && s.flags == IDLE) - return (1); + return (true); - return (0); + return (false); +} + +void +mp_ring_sysctls(struct mp_ring *r, struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *children) +{ + struct sysctl_oid *oid; + + oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "mp_ring", CTLFLAG_RD | + CTLFLAG_MPSAFE, NULL, "mp_ring statistics"); + children = SYSCTL_CHILDREN(oid); + + SYSCTL_ADD_U64(ctx, children, OID_AUTO, "state", CTLFLAG_RD, + __DEVOLATILE(uint64_t *, &r->state), 0, "ring state"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "dropped", CTLFLAG_RD, + &r->dropped, "# of items dropped"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "consumed", + CTLFLAG_RD, &r->consumed, "# of items consumed"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "fast_consumer", + CTLFLAG_RD, &r->consumer[C_FAST], + "# of times producer became consumer (fast)"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "consumer2", + CTLFLAG_RD, &r->consumer[C_2], + "# of times producer became consumer (2)"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "consumer3", + CTLFLAG_RD, &r->consumer[C_3], + "# of times producer became consumer (3)"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "takeovers", + CTLFLAG_RD, &r->consumer[C_TAKEOVER], + "# of times producer took over from another consumer."); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "not_consumer", + CTLFLAG_RD, 
&r->not_consumer, + "# of times producer did not become consumer"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "abdications", + CTLFLAG_RD, &r->abdications, "# of consumer abdications"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "stalls", + CTLFLAG_RD, &r->stalls, "# of consumer stalls"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "cons_idle", + CTLFLAG_RD, &r->cons_idle, + "# of times consumer ran fully to completion"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "cons_idle2", + CTLFLAG_RD, &r->cons_idle2, + "# of times consumer idled when another enqueue was in progress"); } Index: sys/dev/cxgbe/t4_sge.c =================================================================== --- sys/dev/cxgbe/t4_sge.c +++ sys/dev/cxgbe/t4_sge.c @@ -203,19 +203,6 @@ SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0, "Enable presorting of LRO frames"); -struct txpkts { - u_int wr_type; /* type 0 or type 1 */ - u_int npkt; /* # of packets in this work request */ - u_int plen; /* total payload (sum of all packets) */ - u_int len16; /* # of 16B pieces used by this work request */ -}; - -/* A packet's SGL. This + m_pkthdr has all info needed for tx */ -struct sgl { - struct sglist sg; - struct sglist_seg seg[TX_SGL_SEGS]; -}; - static int service_iq(struct sge_iq *, int); static int service_iq_fl(struct sge_iq *, int); static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t); @@ -284,14 +271,16 @@ static inline u_int txpkts0_len16(u_int); static inline u_int txpkts1_len16(void); static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int); -static u_int write_txpkt_wr(struct adapter *, struct sge_txq *, - struct fw_eth_tx_pkt_wr *, struct mbuf *, u_int); +static u_int write_txpkt_wr(struct adapter *, struct sge_txq *, struct mbuf *, + u_int); static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *, - struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int); -static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int); -static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int); -static u_int write_txpkts_wr(struct adapter *, struct sge_txq *, - struct fw_eth_tx_pkts_wr *, struct mbuf *, const struct txpkts *, u_int); + struct mbuf *); +static int add_to_txpkts_vf(struct adapter *, struct sge_txq *, struct mbuf *, + int, bool *); +static int add_to_txpkts_pf(struct adapter *, struct sge_txq *, struct mbuf *, + int, bool *); +static u_int write_txpkts_wr(struct adapter *, struct sge_txq *); +static u_int write_txpkts_vm_wr(struct adapter *, struct sge_txq *); static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int); static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int); static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int); @@ -2839,7 +2828,7 @@ return (total_available_tx_desc(eq) > eq->sidx / 8); } -static inline int +static inline bool cannot_use_txpkts(struct mbuf *m) { /* maybe put a GL limit too, to avoid silliness? 
*/ @@ -2855,8 +2844,9 @@ } static inline int -wr_can_update_eq(struct fw_eth_tx_pkts_wr *wr) +wr_can_update_eq(void *p) { + struct fw_eth_tx_pkts_wr *wr = p; switch (G_FW_WR_OP(be32toh(wr->op_pkd))) { case FW_ULPTX_WR: @@ -2864,159 +2854,232 @@ case FW_ETH_TX_PKTS_WR: case FW_ETH_TX_PKTS2_WR: case FW_ETH_TX_PKT_VM_WR: + case FW_ETH_TX_PKTS_VM_WR: return (1); default: return (0); } } +static inline void +set_txupdate_flags(struct sge_txq *txq, u_int avail, + struct fw_eth_tx_pkt_wr *wr) +{ + struct sge_eq *eq = &txq->eq; + struct txpkts *txp = &txq->txp; + + if ((txp->npkt > 0 || avail < eq->sidx / 2) && + atomic_cmpset_int(&eq->equiq, 0, 1)) { + wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | F_FW_WR_EQUIQ); + eq->equeqidx = eq->pidx; + } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { + wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); + eq->equeqidx = eq->pidx; + } +} + /* * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to * be consumed. Return the actual number consumed. 0 indicates a stall. */ static u_int -eth_tx(struct mp_ring *r, u_int cidx, u_int pidx) +eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing) { struct sge_txq *txq = r->cookie; - struct sge_eq *eq = &txq->eq; struct ifnet *ifp = txq->ifp; + struct sge_eq *eq = &txq->eq; + struct txpkts *txp = &txq->txp; struct vi_info *vi = ifp->if_softc; struct adapter *sc = vi->adapter; u_int total, remaining; /* # of packets */ - u_int available, dbdiff; /* # of hardware descriptors */ - u_int n, next_cidx; - struct mbuf *m0, *tail; - struct txpkts txp; - struct fw_eth_tx_pkts_wr *wr; /* any fw WR struct will do */ + u_int n, avail, dbdiff; /* # of hardware descriptors */ + int i, rc; + struct mbuf *m0; + bool snd; + void *wr; /* start of the last WR written to the ring */ - remaining = IDXDIFF(pidx, cidx, r->size); - MPASS(remaining > 0); /* Must not be called without work to do. */ - total = 0; + TXQ_LOCK_ASSERT_OWNED(txq); - TXQ_LOCK(txq); + remaining = IDXDIFF(pidx, cidx, r->size); if (__predict_false(discard_tx(eq))) { + for (i = 0; i < txp->npkt; i++) + m_freem(txp->mb[i]); + txp->npkt = 0; while (cidx != pidx) { m0 = r->items[cidx]; m_freem(m0); if (++cidx == r->size) cidx = 0; } - reclaim_tx_descs(txq, 2048); - total = remaining; - goto done; + reclaim_tx_descs(txq, eq->sidx); + *coalescing = false; + return (remaining); /* emptied */ } /* How many hardware descriptors do we have readily available. */ - if (eq->pidx == eq->cidx) - available = eq->sidx - 1; - else - available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; - dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx); + if (eq->pidx == eq->cidx) { + avail = eq->sidx - 1; + if (txp->score++ >= 5) + txp->score = 5; /* tx is completely idle, reset. 
*/ + } else + avail = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; - while (remaining > 0) { + total = 0; + if (remaining == 0) { + if (txp->score-- == 1) /* egr_update had to drain txpkts */ + txp->score = 1; + goto send_txpkts; + } + dbdiff = 0; + MPASS(remaining > 0); + while (remaining > 0) { m0 = r->items[cidx]; M_ASSERTPKTHDR(m0); MPASS(m0->m_nextpkt == NULL); - if (available < tx_len16_to_desc(mbuf_len16(m0))) { - available += reclaim_tx_descs(txq, 64); - if (available < tx_len16_to_desc(mbuf_len16(m0))) - break; /* out of descriptors */ + if (avail < 2 * SGE_MAX_WR_NDESC) + avail += reclaim_tx_descs(txq, 64); + + if (txp->npkt > 0 || remaining > 1 || txp->score > 3 || + atomic_load_int(&txq->eq.equiq) != 0) { + if (sc->flags & IS_VF) + rc = add_to_txpkts_vf(sc, txq, m0, avail, &snd); + else + rc = add_to_txpkts_pf(sc, txq, m0, avail, &snd); + } else { + snd = false; + rc = EINVAL; } + if (snd) { + MPASS(txp->npkt > 0); + for (i = 0; i < txp->npkt; i++) + ETHER_BPF_MTAP(ifp, txp->mb[i]); + if (txp->npkt > 1) { + if (txp->score++ >= 10) + txp->score = 10; + MPASS(avail >= tx_len16_to_desc(txp->len16)); + if (sc->flags & IS_VF) + n = write_txpkts_vm_wr(sc, txq); + else + n = write_txpkts_wr(sc, txq); + } else { + MPASS(avail >= + tx_len16_to_desc(mbuf_len16(txp->mb[0]))); + if (sc->flags & IS_VF) + n = write_txpkt_vm_wr(sc, txq, + txp->mb[0]); + else + n = write_txpkt_wr(sc, txq, txp->mb[0], + avail); + } + MPASS(n <= SGE_MAX_WR_NDESC); + avail -= n; + dbdiff += n; + wr = &eq->desc[eq->pidx]; + IDXINCR(eq->pidx, n, eq->sidx); + txp->npkt = 0; /* emptied */ + } + if (rc == 0) { + /* m0 was coalesced into txq->txpkts. */ + goto next_mbuf; + } + if (rc == EAGAIN) { + /* + * m0 is suitable for tx coalescing but could not be + * combined with the existing txq->txpkts, which has now + * been transmitted. Start a new txpkts with m0. + */ + MPASS(snd); + MPASS(txp->npkt == 0); + continue; + } - next_cidx = cidx + 1; - if (__predict_false(next_cidx == r->size)) - next_cidx = 0; - - wr = (void *)&eq->desc[eq->pidx]; + MPASS(rc != 0 && rc != EAGAIN); + MPASS(txp->npkt == 0); + wr = &eq->desc[eq->pidx]; if (mbuf_cflags(m0) & MC_RAW_WR) { - total++; - remaining--; - n = write_raw_wr(txq, (void *)wr, m0, available); + n = write_raw_wr(txq, wr, m0, avail); #ifdef KERN_TLS } else if (mbuf_cflags(m0) & MC_TLS) { - total++; - remaining--; ETHER_BPF_MTAP(ifp, m0); - n = t6_ktls_write_wr(txq,(void *)wr, m0, - mbuf_nsegs(m0), available); + n = t6_ktls_write_wr(txq, wr, m0, mbuf_nsegs(m0), + avail); #endif - } else if (sc->flags & IS_VF) { - total++; - remaining--; - ETHER_BPF_MTAP(ifp, m0); - n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0, - available); - } else if (remaining > 1 && - try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) { - - /* pkts at cidx, next_cidx should both be in txp. 
*/ - MPASS(txp.npkt == 2); - tail = r->items[next_cidx]; - MPASS(tail->m_nextpkt == NULL); - ETHER_BPF_MTAP(ifp, m0); - ETHER_BPF_MTAP(ifp, tail); - m0->m_nextpkt = tail; - - if (__predict_false(++next_cidx == r->size)) - next_cidx = 0; - - while (next_cidx != pidx) { - if (add_to_txpkts(r->items[next_cidx], &txp, - available) != 0) - break; - tail->m_nextpkt = r->items[next_cidx]; - tail = tail->m_nextpkt; - ETHER_BPF_MTAP(ifp, tail); - if (__predict_false(++next_cidx == r->size)) - next_cidx = 0; - } - - n = write_txpkts_wr(sc, txq, wr, m0, &txp, available); - total += txp.npkt; - remaining -= txp.npkt; } else { - total++; - remaining--; - ETHER_BPF_MTAP(ifp, m0); - n = write_txpkt_wr(sc, txq, (void *)wr, m0, available); + n = tx_len16_to_desc(mbuf_len16(m0)); + if (__predict_false(avail < n)) { + avail += reclaim_tx_descs(txq, 32); + if (avail < n) + break; /* out of descriptors */ + } + if (sc->flags & IS_VF) + n = write_txpkt_vm_wr(sc, txq, m0); + else + n = write_txpkt_wr(sc, txq, m0, avail); } - MPASS(n >= 1 && n <= available); + MPASS(n >= 1 && n <= avail); if (!(mbuf_cflags(m0) & MC_TLS)) MPASS(n <= SGE_MAX_WR_NDESC); - available -= n; + avail -= n; dbdiff += n; IDXINCR(eq->pidx, n, eq->sidx); - if (wr_can_update_eq(wr)) { - if (total_available_tx_desc(eq) < eq->sidx / 4 && - atomic_cmpset_int(&eq->equiq, 0, 1)) { - wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | - F_FW_WR_EQUEQ); - eq->equeqidx = eq->pidx; - } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= - 32) { - wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); - eq->equeqidx = eq->pidx; - } - } - - if (dbdiff >= 16 && remaining >= 4) { + if (dbdiff >= 512 / EQ_ESIZE) { /* X_FETCHBURSTMAX_512B */ + if (wr_can_update_eq(wr)) + set_txupdate_flags(txq, avail, wr); ring_eq_db(sc, eq, dbdiff); - available += reclaim_tx_descs(txq, 4 * dbdiff); + avail += reclaim_tx_descs(txq, 32); dbdiff = 0; } - - cidx = next_cidx; +next_mbuf: + total++; + remaining--; + if (__predict_false(++cidx == r->size)) + cidx = 0; } if (dbdiff != 0) { + if (wr_can_update_eq(wr)) + set_txupdate_flags(txq, avail, wr); ring_eq_db(sc, eq, dbdiff); reclaim_tx_descs(txq, 32); + } else if (eq->pidx == eq->cidx && txp->npkt > 0 && + atomic_load_int(&txq->eq.equiq) == 0) { + /* + * If nothing was submitted to the chip for tx (it was coalesced + * into txpkts instead) and there is no tx update outstanding + * then we need to send txpkts now. 
+ */ +send_txpkts: + MPASS(txp->npkt > 0); + for (i = 0; i < txp->npkt; i++) + ETHER_BPF_MTAP(ifp, txp->mb[i]); + if (txp->npkt > 1) { + MPASS(avail >= tx_len16_to_desc(txp->len16)); + if (sc->flags & IS_VF) + n = write_txpkts_vm_wr(sc, txq); + else + n = write_txpkts_wr(sc, txq); + } else { + MPASS(avail >= + tx_len16_to_desc(mbuf_len16(txp->mb[0]))); + if (sc->flags & IS_VF) + n = write_txpkt_vm_wr(sc, txq, txp->mb[0]); + else + n = write_txpkt_wr(sc, txq, txp->mb[0], avail); + } + MPASS(n <= SGE_MAX_WR_NDESC); + wr = &eq->desc[eq->pidx]; + IDXINCR(eq->pidx, n, eq->sidx); + txp->npkt = 0; /* emptied */ + + MPASS(wr_can_update_eq(wr)); + set_txupdate_flags(txq, avail - n, wr); + ring_eq_db(sc, eq, n); + reclaim_tx_descs(txq, 32); } -done: - TXQ_UNLOCK(txq); + *coalescing = txp->npkt > 0; return (total); } @@ -4106,11 +4169,12 @@ struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct sge_eq *eq = &txq->eq; + struct txpkts *txp; char name[16]; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx, - M_CXGBE, M_WAITOK); + M_CXGBE, &eq->eq_lock, M_WAITOK); if (rc != 0) { device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc); return (rc); @@ -4147,6 +4211,12 @@ txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE, M_ZERO | M_WAITOK); + txp = &txq->txp; + txp->score = 5; + MPASS(nitems(txp->mb) >= sc->params.max_pkts_per_eth_tx_pkts_wr); + txq->txp.max_npkt = min(nitems(txp->mb), + sc->params.max_pkts_per_eth_tx_pkts_wr); + snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queue"); @@ -4242,26 +4312,8 @@ "# of NIC TLS sessions using AES-GCM"); } #endif + mp_ring_sysctls(txq->r, &vi->ctx, children); - SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues", - CTLFLAG_RD, &txq->r->enqueues, - "# of enqueues to the mp_ring for this queue"); - SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops", - CTLFLAG_RD, &txq->r->drops, - "# of drops in the mp_ring for this queue"); - SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts", - CTLFLAG_RD, &txq->r->starts, - "# of normal consumer starts in the mp_ring for this queue"); - SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls", - CTLFLAG_RD, &txq->r->stalls, - "# of consumer stalls in the mp_ring for this queue"); - SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts", - CTLFLAG_RD, &txq->r->restarts, - "# of consumer restarts in the mp_ring for this queue"); - SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications", - CTLFLAG_RD, &txq->r->abdications, - "# of consumer abdications in the mp_ring for this queue"); - return (0); } @@ -4655,10 +4707,10 @@ * The return value is the # of hardware descriptors used. 
*/ static u_int -write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq, - struct fw_eth_tx_pkt_vm_wr *wr, struct mbuf *m0, u_int available) +write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0) { - struct sge_eq *eq = &txq->eq; + struct sge_eq *eq; + struct fw_eth_tx_pkt_vm_wr *wr; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ @@ -4668,7 +4720,6 @@ TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m0); - MPASS(available > 0 && available < eq->sidx); len16 = mbuf_len16(m0); nsegs = mbuf_nsegs(m0); @@ -4677,10 +4728,10 @@ if (needs_tso(m0)) ctrl += sizeof(struct cpl_tx_pkt_lso_core); ndesc = tx_len16_to_desc(len16); - MPASS(ndesc <= available); /* Firmware work request header */ - MPASS(wr == (void *)&eq->desc[eq->pidx]); + eq = &txq->eq; + wr = (void *)&eq->desc[eq->pidx]; wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); @@ -4760,7 +4811,6 @@ } else write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx); txq->sgl_wrs++; - txq->txpkt_wrs++; txsd = &txq->sdesc[eq->pidx]; @@ -4811,10 +4861,11 @@ * The return value is the # of hardware descriptors used. */ static u_int -write_txpkt_wr(struct adapter *sc, struct sge_txq *txq, - struct fw_eth_tx_pkt_wr *wr, struct mbuf *m0, u_int available) +write_txpkt_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0, + u_int available) { - struct sge_eq *eq = &txq->eq; + struct sge_eq *eq; + struct fw_eth_tx_pkt_wr *wr; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ @@ -4824,7 +4875,6 @@ TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m0); - MPASS(available > 0 && available < eq->sidx); len16 = mbuf_len16(m0); nsegs = mbuf_nsegs(m0); @@ -4844,7 +4894,8 @@ MPASS(ndesc <= available); /* Firmware work request header */ - MPASS(wr == (void *)&eq->desc[eq->pidx]); + eq = &txq->eq; + wr = (void *)&eq->desc[eq->pidx]; wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); @@ -4927,71 +4978,151 @@ return (ndesc); } +static inline bool +cmp_l2hdr(struct txpkts *txp, struct mbuf *m) +{ + int len; + + MPASS(txp->npkt > 0); + MPASS(m->m_len >= 16); /* type1 implies 1 GL with all of the frame. */ + + if (txp->ethtype == be16toh(ETHERTYPE_VLAN)) + len = sizeof(struct ether_vlan_header); + else + len = sizeof(struct ether_header); + + return (memcmp(m->m_data, &txp->ethmacdst[0], len) != 0); +} + +static inline void +save_l2hdr(struct txpkts *txp, struct mbuf *m) +{ + MPASS(m->m_len >= 16); /* type1 implies 1 GL with all of the frame. */ + + memcpy(&txp->ethmacdst[0], mtod(m, const void *), 16); +} + static int -try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available) +add_to_txpkts_vf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m, + int avail, bool *send) { - u_int needed, nsegs1, nsegs2, l1, l2; + struct txpkts *txp = &txq->txp; - if (cannot_use_txpkts(m) || cannot_use_txpkts(n)) - return (1); + MPASS(sc->flags & IS_VF); - nsegs1 = mbuf_nsegs(m); - nsegs2 = mbuf_nsegs(n); - if (nsegs1 + nsegs2 == 2) { - txp->wr_type = 1; - l1 = l2 = txpkts1_len16(); - } else { - txp->wr_type = 0; - l1 = txpkts0_len16(nsegs1); - l2 = txpkts0_len16(nsegs2); + /* Cannot have TSO and coalesce at the same time. 
*/ + if (cannot_use_txpkts(m)) { +cannot_coalesce: + *send = txp->npkt > 0; + return (EINVAL); } - txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2; - needed = tx_len16_to_desc(txp->len16); - if (needed > SGE_MAX_WR_NDESC || needed > available) - return (1); - txp->plen = m->m_pkthdr.len + n->m_pkthdr.len; - if (txp->plen > 65535) - return (1); + /* VF allows coalescing of type 1 (1 GL) only */ + if (mbuf_nsegs(m) > 1) + goto cannot_coalesce; - txp->npkt = 2; - set_mbuf_len16(m, l1); - set_mbuf_len16(n, l2); + *send = false; + if (txp->npkt > 0) { + MPASS(tx_len16_to_desc(txp->len16) <= avail); + MPASS(txp->npkt < txp->max_npkt); + MPASS(txp->wr_type == 1); /* VF supports type 1 only */ + if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) > avail) { +retry_after_send: + *send = true; + return (EAGAIN); + } + if (m->m_pkthdr.len + txp->plen > 65535) + goto retry_after_send; + if (cmp_l2hdr(txp, m)) + goto retry_after_send; + + txp->len16 += txpkts1_len16(); + txp->plen += m->m_pkthdr.len; + txp->mb[txp->npkt++] = m; + if (txp->npkt == txp->max_npkt) + *send = true; + } else { + txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_vm_wr), 16) + + txpkts1_len16(); + if (tx_len16_to_desc(txp->len16) > avail) + goto cannot_coalesce; + txp->npkt = 1; + txp->wr_type = 1; + txp->plen = m->m_pkthdr.len; + txp->mb[0] = m; + save_l2hdr(txp, m); + } return (0); } static int -add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available) +add_to_txpkts_pf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m, + int avail, bool *send) { - u_int plen, len16, needed, nsegs; + struct txpkts *txp = &txq->txp; + int nsegs; - MPASS(txp->wr_type == 0 || txp->wr_type == 1); + MPASS(!(sc->flags & IS_VF)); - if (cannot_use_txpkts(m)) - return (1); + /* Cannot have TSO and coalesce at the same time. 
*/ + if (cannot_use_txpkts(m)) { +cannot_coalesce: + *send = txp->npkt > 0; + return (EINVAL); + } + *send = false; nsegs = mbuf_nsegs(m); - if (txp->wr_type == 1 && nsegs != 1) - return (1); + if (txp->npkt == 0) { + if (m->m_pkthdr.len > 65535) + goto cannot_coalesce; + if (nsegs > 1) { + txp->wr_type = 0; + txp->len16 = + howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + + txpkts0_len16(nsegs); + } else { + txp->wr_type = 1; + txp->len16 = + howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + + txpkts1_len16(); + } + if (tx_len16_to_desc(txp->len16) > avail) + goto cannot_coalesce; + txp->npkt = 1; + txp->plen = m->m_pkthdr.len; + txp->mb[0] = m; + } else { + MPASS(tx_len16_to_desc(txp->len16) <= avail); + MPASS(txp->npkt < txp->max_npkt); - plen = txp->plen + m->m_pkthdr.len; - if (plen > 65535) - return (1); + if (m->m_pkthdr.len + txp->plen > 65535) { +retry_after_send: + *send = true; + return (EAGAIN); + } - if (txp->wr_type == 0) - len16 = txpkts0_len16(nsegs); - else - len16 = txpkts1_len16(); - needed = tx_len16_to_desc(txp->len16 + len16); - if (needed > SGE_MAX_WR_NDESC || needed > available) - return (1); + MPASS(txp->wr_type == 0 || txp->wr_type == 1); + if (txp->wr_type == 0) { + if (tx_len16_to_desc(txp->len16 + + txpkts0_len16(nsegs)) > min(avail, SGE_MAX_WR_NDESC)) + goto retry_after_send; + txp->len16 += txpkts0_len16(nsegs); + } else { + if (nsegs != 1) + goto retry_after_send; + if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) > + avail) + goto retry_after_send; + txp->len16 += txpkts1_len16(); + } - txp->npkt++; - txp->plen = plen; - txp->len16 += len16; - set_mbuf_len16(m, len16); - + txp->plen += m->m_pkthdr.len; + txp->mb[txp->npkt++] = m; + if (txp->npkt == txp->max_npkt) + *send = true; + } return (0); } @@ -5003,34 +5134,25 @@ * The return value is the # of hardware descriptors used. */ static u_int -write_txpkts_wr(struct adapter *sc, struct sge_txq *txq, - struct fw_eth_tx_pkts_wr *wr, struct mbuf *m0, const struct txpkts *txp, - u_int available) +write_txpkts_wr(struct adapter *sc, struct sge_txq *txq) { + const struct txpkts *txp = &txq->txp; struct sge_eq *eq = &txq->eq; + struct fw_eth_tx_pkts_wr *wr; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; - uint32_t ctrl; uint64_t ctrl1; - int ndesc, checkwrap; - struct mbuf *m; + int ndesc, i, checkwrap; + struct mbuf *m, *last; void *flitp; TXQ_LOCK_ASSERT_OWNED(txq); MPASS(txp->npkt > 0); - MPASS(txp->plen < 65536); - MPASS(m0 != NULL); - MPASS(m0->m_nextpkt != NULL); MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16)); - MPASS(available > 0 && available < eq->sidx); - ndesc = tx_len16_to_desc(txp->len16); - MPASS(ndesc <= available); - - MPASS(wr == (void *)&eq->desc[eq->pidx]); + wr = (void *)&eq->desc[eq->pidx]; wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)); - ctrl = V_FW_WR_LEN16(txp->len16); - wr->equiq_to_len16 = htobe32(ctrl); + wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16)); wr->plen = htobe16(txp->plen); wr->npkt = txp->npkt; wr->r3 = 0; @@ -5042,8 +5164,11 @@ * set then we know the WR is going to wrap around somewhere. We'll * check for that at appropriate points. 
*/ + ndesc = tx_len16_to_desc(txp->len16); + last = NULL; checkwrap = eq->sidx - ndesc < eq->pidx; - for (m = m0; m != NULL; m = m->m_nextpkt) { + for (i = 0; i < txp->npkt; i++) { + m = txp->mb[i]; if (txp->wr_type == 0) { struct ulp_txpkt *ulpmc; struct ulptx_idata *ulpsc; @@ -5052,7 +5177,7 @@ ulpmc = flitp; ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid)); - ulpmc->len = htobe32(mbuf_len16(m)); + ulpmc->len = htobe32(txpkts0_len16(mbuf_nsegs(m))); /* ULP subcommand */ ulpsc = (void *)(ulpmc + 1); @@ -5093,8 +5218,12 @@ write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap); + if (last != NULL) + last->m_nextpkt = m; + last = m; } + txq->sgl_wrs++; if (txp->wr_type == 0) { txq->txpkts0_pkts += txp->npkt; txq->txpkts0_wrs++; @@ -5104,12 +5233,92 @@ } txsd = &txq->sdesc[eq->pidx]; - txsd->m = m0; + txsd->m = txp->mb[0]; txsd->desc_used = ndesc; return (ndesc); } +static u_int +write_txpkts_vm_wr(struct adapter *sc, struct sge_txq *txq) +{ + const struct txpkts *txp = &txq->txp; + struct sge_eq *eq = &txq->eq; + struct fw_eth_tx_pkts_vm_wr *wr; + struct tx_sdesc *txsd; + struct cpl_tx_pkt_core *cpl; + uint64_t ctrl1; + int ndesc, i; + struct mbuf *m, *last; + void *flitp; + + TXQ_LOCK_ASSERT_OWNED(txq); + MPASS(txp->npkt > 0); + MPASS(txp->wr_type == 1); /* VF supports type 1 only */ + MPASS(txp->mb[0] != NULL); + MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16)); + + wr = (void *)&eq->desc[eq->pidx]; + wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_VM_WR)); + wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16)); + wr->r3 = 0; + wr->plen = htobe16(txp->plen); + wr->npkt = txp->npkt; + wr->r4 = 0; + memcpy(&wr->ethmacdst[0], &txp->ethmacdst[0], 16); + flitp = wr + 1; + + /* + * At this point we are 32B into a hardware descriptor. Each mbuf in + * the WR will take 32B so we check for the end of the descriptor ring + * before writing odd mbufs (mb[1], 3, 5, ..) + */ + ndesc = tx_len16_to_desc(txp->len16); + last = NULL; + for (i = 0; i < txp->npkt; i++) { + m = txp->mb[i]; + if (i & 1 && (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx]) + flitp = &eq->desc[0]; + cpl = flitp; + + /* Checksum offload */ + ctrl1 = csum_to_ctrl(sc, m); + if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) + txq->txcsum++; /* some hardware assistance provided */ + + /* VLAN tag insertion */ + if (needs_vlan_insertion(m)) { + ctrl1 |= F_TXPKT_VLAN_VLD | + V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); + txq->vlan_insertion++; + } + + /* CPL header */ + cpl->ctrl0 = txq->cpl_ctrl0; + cpl->pack = 0; + cpl->len = htobe16(m->m_pkthdr.len); + cpl->ctrl1 = htobe64(ctrl1); + + flitp = cpl + 1; + MPASS(mbuf_nsegs(m) == 1); + write_gl_to_txd(txq, m, (caddr_t *)(&flitp), 0); + + if (last != NULL) + last->m_nextpkt = m; + last = m; + } + + txq->sgl_wrs++; + txq->txpkts1_pkts += txp->npkt; + txq->txpkts1_wrs++; + + txsd = &txq->sdesc[eq->pidx]; + txsd->m = txp->mb[0]; + txsd->desc_used = ndesc; + + return (ndesc); +} + /* * If the SGL ends on an address that is not 16 byte aligned, this function will * add a 0 filled flit at the end. 
@@ -5444,8 +5653,10 @@ MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH); atomic_readandclear_int(&eq->equiq); - mp_ring_check_drainage(txq->r, 0); - taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task); + if (mp_ring_is_idle(txq->r)) + taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task); + else + mp_ring_check_drainage(txq->r, 64); } static int Index: sys/dev/cxgbe/t4_vf.c =================================================================== --- sys/dev/cxgbe/t4_vf.c +++ sys/dev/cxgbe/t4_vf.c @@ -231,6 +231,7 @@ get_params__post_init(struct adapter *sc) { int rc; + uint32_t param, val; rc = -t4vf_get_sge_params(sc); if (rc != 0) { @@ -281,6 +282,13 @@ return (EINVAL); } sc->params.portvec = sc->params.vfres.pmask; + + param = FW_PARAM_PFVF(MAX_PKTS_PER_ETH_TX_PKTS_WR); + rc = -t4vf_query_params(sc, 1, &param, &val); + if (rc == 0) + sc->params.max_pkts_per_eth_tx_pkts_wr = val; + else + sc->params.max_pkts_per_eth_tx_pkts_wr = 14; return (0); }
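
Reviewer note (not part of the patch): the reworked mp_ring keeps all of its producer/consumer bookkeeping in a single 64-bit state word, so a producer can reserve ring slots and, when appropriate, elect itself consumer with one atomic compare-and-swap. The sketch below is a minimal user-space model of that idea for readers following drain_ring() and mp_ring_enqueue(); it is not driver code. It reuses names from the patch (union ring_state, the IDLE/COALESCING/BUSY/TOO_BUSY/TAKING_OVER/STALLED flags, the C_FAST/C_2/C_3/C_TAKEOVER consumer counters) but omits the atomic fcmpset loops, the consumer lock, and the deferred C_3 decision that the real enqueue path makes when it publishes pidx_tail. NOT_CONSUMER and the exact ordering of the fields after pidx_head are assumptions made only for this illustration.

/* Build with: cc -std=c11 -o ring_model ring_model.c (hypothetical file name). */
#include <stdint.h>
#include <stdio.h>

enum { IDLE = 0, COALESCING, BUSY, TOO_BUSY, TAKING_OVER, STALLED };
enum { C_FAST = 0, C_2, C_3, C_TAKEOVER, NOT_CONSUMER };

union ring_state {
	struct {
		uint16_t pidx_head;	/* next slot a producer will reserve */
		uint16_t pidx_tail;	/* slots before this are fully written */
		uint16_t cidx;		/* consumer index */
		uint16_t flags;		/* one of IDLE..STALLED above */
	};
	uint64_t state;			/* the driver CASes this as one word */
};

/*
 * Role a producer takes right after reserving room, mirroring the first CAS
 * loop in mp_ring_enqueue(): an IDLE/COALESCING ring makes this producer the
 * consumer (C_FAST if no other producer is mid-enqueue, C_2 otherwise), and a
 * TOO_BUSY consumer is relieved via TAKING_OVER.  For BUSY or STALLED the real
 * code defers the decision until pidx_tail is published (the producer may
 * still become consumer C_3); this model simply reports NOT_CONSUMER.
 */
static int
producer_role(union ring_state os)
{
	if (os.flags == IDLE || os.flags == COALESCING)
		return (os.pidx_head == os.pidx_tail ? C_FAST : C_2);
	if (os.flags == TOO_BUSY)
		return (C_TAKEOVER);
	return (NOT_CONSUMER);
}

int
main(void)
{
	union ring_state s = { .state = 0 };	/* empty ring, flags == IDLE */

	printf("idle ring         -> %d (expect %d, C_FAST)\n",
	    producer_role(s), C_FAST);
	s.flags = TOO_BUSY;
	printf("too-busy consumer -> %d (expect %d, C_TAKEOVER)\n",
	    producer_role(s), C_TAKEOVER);
	return (0);
}

The same packing is what lets the consumer update cidx and flags together in drain_ring(), so it can hand the ring over (TAKING_OVER back to BUSY) or park it (IDLE, COALESCING, or STALLED) in a single atomic step while holding only the consumer lock.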