Index: head/sys/dev/cxgb/cxgb_sge.c =================================================================== --- head/sys/dev/cxgb/cxgb_sge.c (revision 294326) +++ head/sys/dev/cxgb/cxgb_sge.c (revision 294327) @@ -1,3712 +1,3712 @@ /************************************************************************** Copyright (c) 2007-2009, Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ***************************************************************************/ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int txq_fills = 0; int multiq_tx_enable = 1; #ifdef TCP_OFFLOAD CTASSERT(NUM_CPL_HANDLERS >= NUM_CPL_CMDS); #endif extern struct sysctl_oid_list sysctl__hw_cxgb_children; int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE; SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0, "size of per-queue mbuf ring"); static int cxgb_tx_coalesce_force = 0; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RWTUN, &cxgb_tx_coalesce_force, 0, "coalesce small packets into a single work request regardless of ring state"); #define COALESCE_START_DEFAULT TX_ETH_Q_SIZE>>1 #define COALESCE_START_MAX (TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3)) #define COALESCE_STOP_DEFAULT TX_ETH_Q_SIZE>>2 #define COALESCE_STOP_MIN TX_ETH_Q_SIZE>>5 #define TX_RECLAIM_DEFAULT TX_ETH_Q_SIZE>>5 #define TX_RECLAIM_MAX TX_ETH_Q_SIZE>>2 #define TX_RECLAIM_MIN TX_ETH_Q_SIZE>>6 static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RWTUN, &cxgb_tx_coalesce_enable_start, 0, "coalesce enable threshold"); static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RWTUN, &cxgb_tx_coalesce_enable_stop, 0, "coalesce disable threshold"); static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RWTUN, &cxgb_tx_reclaim_threshold, 0, "tx cleaning minimum threshold"); /* * XXX don't re-enable this until TOE stops assuming * we have an m_ext */ static int recycle_enable = 0; extern int cxgb_use_16k_clusters; extern int nmbjumbop; extern int nmbjumbo9; extern int nmbjumbo16; #define USE_GTS 0 #define SGE_RX_SM_BUF_SIZE 1536 #define SGE_RX_DROP_THRES 16 #define SGE_RX_COPY_THRES 128 /* * Period of the Tx buffer reclaim timer. This timer does not need to run * frequently as Tx buffers are usually reclaimed by new Tx packets. */ #define TX_RECLAIM_PERIOD (hz >> 1) /* * Values for sge_txq.flags */ enum { TXQ_RUNNING = 1 << 0, /* fetch engine is running */ TXQ_LAST_PKT_DB = 1 << 1, /* last packet rang the doorbell */ }; struct tx_desc { uint64_t flit[TX_DESC_FLITS]; } __packed; struct rx_desc { uint32_t addr_lo; uint32_t len_gen; uint32_t gen2; uint32_t addr_hi; } __packed; struct rsp_desc { /* response queue descriptor */ struct rss_header rss_hdr; uint32_t flags; uint32_t len_cq; uint8_t imm_data[47]; uint8_t intr_gen; } __packed; #define RX_SW_DESC_MAP_CREATED (1 << 0) #define TX_SW_DESC_MAP_CREATED (1 << 1) #define RX_SW_DESC_INUSE (1 << 3) #define TX_SW_DESC_MAPPED (1 << 4) #define RSPQ_NSOP_NEOP G_RSPD_SOP_EOP(0) #define RSPQ_EOP G_RSPD_SOP_EOP(F_RSPD_EOP) #define RSPQ_SOP G_RSPD_SOP_EOP(F_RSPD_SOP) #define RSPQ_SOP_EOP G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP) struct tx_sw_desc { /* SW state per Tx descriptor */ struct mbuf *m; bus_dmamap_t map; int flags; }; struct rx_sw_desc { /* SW state per Rx descriptor */ caddr_t rxsd_cl; struct mbuf *m; bus_dmamap_t map; int flags; }; struct txq_state { unsigned int compl; unsigned int gen; unsigned int pidx; }; struct refill_fl_cb_arg { int error; bus_dma_segment_t seg; int nseg; }; /* * Maps a number of flits to the number of Tx descriptors that can hold them. * The formula is * * desc = 1 + (flits - 2) / (WR_FLITS - 1). * * HW allows up to 4 descriptors to be combined into a WR. */ static uint8_t flit_desc_map[] = { 0, #if SGE_NUM_GENBITS == 1 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 #elif SGE_NUM_GENBITS == 2 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, #else # error "SGE_NUM_GENBITS must be 1 or 2" #endif }; #define TXQ_LOCK_ASSERT(qs) mtx_assert(&(qs)->lock, MA_OWNED) #define TXQ_TRYLOCK(qs) mtx_trylock(&(qs)->lock) #define TXQ_LOCK(qs) mtx_lock(&(qs)->lock) #define TXQ_UNLOCK(qs) mtx_unlock(&(qs)->lock) #define TXQ_RING_EMPTY(qs) drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) #define TXQ_RING_NEEDS_ENQUEUE(qs) \ drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) #define TXQ_RING_FLUSH(qs) drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) #define TXQ_RING_DEQUEUE_COND(qs, func, arg) \ drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg) #define TXQ_RING_DEQUEUE(qs) \ drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) int cxgb_debug = 0; static void sge_timer_cb(void *arg); static void sge_timer_reclaim(void *arg, int ncount); static void sge_txq_reclaim_handler(void *arg, int ncount); static void cxgb_start_locked(struct sge_qset *qs); /* * XXX need to cope with bursty scheduling by looking at a wider * window than we are now for determining the need for coalescing * */ static __inline uint64_t check_pkt_coalesce(struct sge_qset *qs) { struct adapter *sc; struct sge_txq *txq; uint8_t *fill; if (__predict_false(cxgb_tx_coalesce_force)) return (1); txq = &qs->txq[TXQ_ETH]; sc = qs->port->adapter; fill = &sc->tunq_fill[qs->idx]; if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX) cxgb_tx_coalesce_enable_start = COALESCE_START_MAX; if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN) cxgb_tx_coalesce_enable_start = COALESCE_STOP_MIN; /* * if the hardware transmit queue is more than 1/8 full * we mark it as coalescing - we drop back from coalescing * when we go below 1/32 full and there are no packets enqueued, * this provides us with some degree of hysteresis */ if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) && TXQ_RING_EMPTY(qs) && (qs->coalescing == 0)) *fill = 0; else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start)) *fill = 1; return (sc->tunq_coalesce); } #ifdef __LP64__ static void set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo) { uint64_t wr_hilo; #if _BYTE_ORDER == _LITTLE_ENDIAN wr_hilo = wr_hi; wr_hilo |= (((uint64_t)wr_lo)<<32); #else wr_hilo = wr_lo; wr_hilo |= (((uint64_t)wr_hi)<<32); #endif wrp->wrh_hilo = wr_hilo; } #else static void set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo) { wrp->wrh_hi = wr_hi; wmb(); wrp->wrh_lo = wr_lo; } #endif struct coalesce_info { int count; int nbytes; }; static int coalesce_check(struct mbuf *m, void *arg) { struct coalesce_info *ci = arg; int *count = &ci->count; int *nbytes = &ci->nbytes; if ((*nbytes == 0) || ((*nbytes + m->m_len <= 10500) && (*count < 7) && (m->m_next == NULL))) { *count += 1; *nbytes += m->m_len; return (1); } return (0); } static struct mbuf * cxgb_dequeue(struct sge_qset *qs) { struct mbuf *m, *m_head, *m_tail; struct coalesce_info ci; if (check_pkt_coalesce(qs) == 0) return TXQ_RING_DEQUEUE(qs); m_head = m_tail = NULL; ci.count = ci.nbytes = 0; do { m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci); if (m_head == NULL) { m_tail = m_head = m; } else if (m != NULL) { m_tail->m_nextpkt = m; m_tail = m; } } while (m != NULL); if (ci.count > 7) panic("trying to coalesce %d packets in to one WR", ci.count); return (m_head); } /** * reclaim_completed_tx - reclaims completed Tx descriptors * @adapter: the adapter * @q: the Tx queue to reclaim completed descriptors from * * Reclaims Tx descriptors that the SGE has indicated it has processed, * and frees the associated buffers if possible. Called with the Tx * queue's lock held. */ static __inline int reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue) { struct sge_txq *q = &qs->txq[queue]; int reclaim = desc_reclaimable(q); if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) || (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN)) cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT; if (reclaim < reclaim_min) return (0); mtx_assert(&qs->lock, MA_OWNED); if (reclaim > 0) { t3_free_tx_desc(qs, reclaim, queue); q->cleaned += reclaim; q->in_use -= reclaim; } if (isset(&qs->txq_stopped, TXQ_ETH)) clrbit(&qs->txq_stopped, TXQ_ETH); return (reclaim); } /** * should_restart_tx - are there enough resources to restart a Tx queue? * @q: the Tx queue * * Checks if there are enough descriptors to restart a suspended Tx queue. */ static __inline int should_restart_tx(const struct sge_txq *q) { unsigned int r = q->processed - q->cleaned; return q->in_use - r < (q->size >> 1); } /** * t3_sge_init - initialize SGE * @adap: the adapter * @p: the SGE parameters * * Performs SGE initialization needed every time after a chip reset. * We do not initialize any of the queue sets here, instead the driver * top-level must request those individually. We also do not enable DMA * here, that should be done after the queues have been set up. */ void t3_sge_init(adapter_t *adap, struct sge_params *p) { u_int ctrl, ups; ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */ ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL | F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN | V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS | V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING; #if SGE_NUM_GENBITS == 1 ctrl |= F_EGRGENCTRL; #endif if (adap->params.rev > 0) { if (!(adap->flags & (USING_MSIX | USING_MSI))) ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ; } t3_write_reg(adap, A_SG_CONTROL, ctrl); t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) | V_LORCQDRBTHRSH(512)); t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10); t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) | V_TIMEOUT(200 * core_ticks_per_usec(adap))); t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, adap->params.rev < T3_REV_C ? 1000 : 500); t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256); t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000); t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256); t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff)); t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024); } /** * sgl_len - calculates the size of an SGL of the given capacity * @n: the number of SGL entries * * Calculates the number of flits needed for a scatter/gather list that * can hold the given number of entries. */ static __inline unsigned int sgl_len(unsigned int n) { return ((3 * n) / 2 + (n & 1)); } /** * get_imm_packet - return the next ingress packet buffer from a response * @resp: the response descriptor containing the packet data * * Return a packet containing the immediate data of the given response. */ static int get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m) { if (resp->rss_hdr.opcode == CPL_RX_DATA) { const struct cpl_rx_data *cpl = (const void *)&resp->imm_data[0]; m->m_len = sizeof(*cpl) + ntohs(cpl->len); } else if (resp->rss_hdr.opcode == CPL_RX_PKT) { const struct cpl_rx_pkt *cpl = (const void *)&resp->imm_data[0]; m->m_len = sizeof(*cpl) + ntohs(cpl->len); } else m->m_len = IMMED_PKT_SIZE; m->m_ext.ext_buf = NULL; m->m_ext.ext_type = 0; memcpy(mtod(m, uint8_t *), resp->imm_data, m->m_len); return (0); } static __inline u_int flits_to_desc(u_int n) { return (flit_desc_map[n]); } #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \ F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \ V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \ F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \ F_HIRCQPARITYERROR) #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR) #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \ F_RSPQDISABLED) /** * t3_sge_err_intr_handler - SGE async event interrupt handler * @adapter: the adapter * * Interrupt handler for SGE asynchronous (non-data) events. */ void t3_sge_err_intr_handler(adapter_t *adapter) { unsigned int v, status; status = t3_read_reg(adapter, A_SG_INT_CAUSE); if (status & SGE_PARERR) CH_ALERT(adapter, "SGE parity error (0x%x)\n", status & SGE_PARERR); if (status & SGE_FRAMINGERR) CH_ALERT(adapter, "SGE framing error (0x%x)\n", status & SGE_FRAMINGERR); if (status & F_RSPQCREDITOVERFOW) CH_ALERT(adapter, "SGE response queue credit overflow\n"); if (status & F_RSPQDISABLED) { v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS); CH_ALERT(adapter, "packet delivered to disabled response queue (0x%x)\n", (v >> S_RSPQ0DISABLED) & 0xff); } t3_write_reg(adapter, A_SG_INT_CAUSE, status); if (status & SGE_FATALERR) t3_fatal_err(adapter); } void t3_sge_prep(adapter_t *adap, struct sge_params *p) { int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size; nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus); nqsets *= adap->params.nports; fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE); while (!powerof2(fl_q_size)) fl_q_size--; use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters : is_offload(adap); #if __FreeBSD_version >= 700111 if (use_16k) { jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE); jumbo_buf_size = MJUM16BYTES; } else { jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE); jumbo_buf_size = MJUM9BYTES; } #else jumbo_q_size = min(nmbjumbop/(3*nqsets), JUMBO_Q_SIZE); jumbo_buf_size = MJUMPAGESIZE; #endif while (!powerof2(jumbo_q_size)) jumbo_q_size--; if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2)) device_printf(adap->dev, "Insufficient clusters and/or jumbo buffers.\n"); p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data); for (i = 0; i < SGE_QSETS; ++i) { struct qset_params *q = p->qset + i; if (adap->params.nports > 2) { q->coalesce_usecs = 50; } else { #ifdef INVARIANTS q->coalesce_usecs = 10; #else q->coalesce_usecs = 5; #endif } q->polling = 0; q->rspq_size = RSPQ_Q_SIZE; q->fl_size = fl_q_size; q->jumbo_size = jumbo_q_size; q->jumbo_buf_size = jumbo_buf_size; q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE; q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16; q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE; q->cong_thres = 0; } } int t3_sge_alloc(adapter_t *sc) { /* The parent tag. */ if (bus_dma_tag_create( bus_get_dma_tag(sc->dev),/* PCI parent */ 1, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ BUS_SPACE_MAXSIZE_32BIT,/* maxsize */ BUS_SPACE_UNRESTRICTED, /* nsegments */ BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */ 0, /* flags */ NULL, NULL, /* lock, lockarg */ &sc->parent_dmat)) { device_printf(sc->dev, "Cannot allocate parent DMA tag\n"); return (ENOMEM); } /* * DMA tag for normal sized RX frames */ if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1, MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) { device_printf(sc->dev, "Cannot allocate RX DMA tag\n"); return (ENOMEM); } /* * DMA tag for jumbo sized RX frames. */ if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) { device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n"); return (ENOMEM); } /* * DMA tag for TX frames. */ if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS, TX_MAX_SIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->tx_dmat)) { device_printf(sc->dev, "Cannot allocate TX DMA tag\n"); return (ENOMEM); } return (0); } int t3_sge_free(struct adapter * sc) { if (sc->tx_dmat != NULL) bus_dma_tag_destroy(sc->tx_dmat); if (sc->rx_jumbo_dmat != NULL) bus_dma_tag_destroy(sc->rx_jumbo_dmat); if (sc->rx_dmat != NULL) bus_dma_tag_destroy(sc->rx_dmat); if (sc->parent_dmat != NULL) bus_dma_tag_destroy(sc->parent_dmat); return (0); } void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p) { qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U); qs->rspq.polling = 0 /* p->polling */; } #if !defined(__i386__) && !defined(__amd64__) static void refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { struct refill_fl_cb_arg *cb_arg = arg; cb_arg->error = error; cb_arg->seg = segs[0]; cb_arg->nseg = nseg; } #endif /** * refill_fl - refill an SGE free-buffer list * @sc: the controller softc * @q: the free-list to refill * @n: the number of new buffers to allocate * * (Re)populate an SGE free-buffer list with up to @n new packet buffers. * The caller must assure that @n does not exceed the queue's capacity. */ static void refill_fl(adapter_t *sc, struct sge_fl *q, int n) { struct rx_sw_desc *sd = &q->sdesc[q->pidx]; struct rx_desc *d = &q->desc[q->pidx]; struct refill_fl_cb_arg cb_arg; struct mbuf *m; caddr_t cl; int err; cb_arg.error = 0; while (n--) { /* * We allocate an uninitialized mbuf + cluster, mbuf is * initialized after rx. */ if (q->zone == zone_pack) { if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL) break; cl = m->m_ext.ext_buf; } else { if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL) break; if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) { uma_zfree(q->zone, cl); break; } } if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) { if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) { log(LOG_WARNING, "bus_dmamap_create failed %d\n", err); uma_zfree(q->zone, cl); goto done; } sd->flags |= RX_SW_DESC_MAP_CREATED; } #if !defined(__i386__) && !defined(__amd64__) err = bus_dmamap_load(q->entry_tag, sd->map, cl, q->buf_size, refill_fl_cb, &cb_arg, 0); if (err != 0 || cb_arg.error) { if (q->zone == zone_pack) uma_zfree(q->zone, cl); m_free(m); goto done; } #else cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl); #endif sd->flags |= RX_SW_DESC_INUSE; sd->rxsd_cl = cl; sd->m = m; d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff); d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff); d->len_gen = htobe32(V_FLD_GEN1(q->gen)); d->gen2 = htobe32(V_FLD_GEN2(q->gen)); d++; sd++; if (++q->pidx == q->size) { q->pidx = 0; q->gen ^= 1; sd = q->sdesc; d = q->desc; } q->credits++; q->db_pending++; } done: if (q->db_pending >= 32) { q->db_pending = 0; t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id)); } } /** * free_rx_bufs - free the Rx buffers on an SGE free list * @sc: the controle softc * @q: the SGE free list to clean up * * Release the buffers on an SGE free-buffer Rx queue. HW fetching from * this queue should be stopped before calling this function. */ static void free_rx_bufs(adapter_t *sc, struct sge_fl *q) { u_int cidx = q->cidx; while (q->credits--) { struct rx_sw_desc *d = &q->sdesc[cidx]; if (d->flags & RX_SW_DESC_INUSE) { bus_dmamap_unload(q->entry_tag, d->map); bus_dmamap_destroy(q->entry_tag, d->map); if (q->zone == zone_pack) { m_init(d->m, zone_pack, MCLBYTES, M_NOWAIT, MT_DATA, M_EXT); uma_zfree(zone_pack, d->m); } else { m_init(d->m, zone_mbuf, MLEN, M_NOWAIT, MT_DATA, 0); uma_zfree(zone_mbuf, d->m); uma_zfree(q->zone, d->rxsd_cl); } } d->rxsd_cl = NULL; d->m = NULL; if (++cidx == q->size) cidx = 0; } } static __inline void __refill_fl(adapter_t *adap, struct sge_fl *fl) { refill_fl(adap, fl, min(16U, fl->size - fl->credits)); } static __inline void __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max) { uint32_t reclaimable = fl->size - fl->credits; if (reclaimable > 0) refill_fl(adap, fl, min(max, reclaimable)); } /** * recycle_rx_buf - recycle a receive buffer * @adapter: the adapter * @q: the SGE free list * @idx: index of buffer to recycle * * Recycles the specified buffer on the given free list by adding it at * the next available slot on the list. */ static void recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx) { struct rx_desc *from = &q->desc[idx]; struct rx_desc *to = &q->desc[q->pidx]; q->sdesc[q->pidx] = q->sdesc[idx]; to->addr_lo = from->addr_lo; // already big endian to->addr_hi = from->addr_hi; // likewise wmb(); /* necessary ? */ to->len_gen = htobe32(V_FLD_GEN1(q->gen)); to->gen2 = htobe32(V_FLD_GEN2(q->gen)); q->credits++; if (++q->pidx == q->size) { q->pidx = 0; q->gen ^= 1; } t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id)); } static void alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { uint32_t *addr; addr = arg; *addr = segs[0].ds_addr; } static int alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size, bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag, bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag) { size_t len = nelem * elem_size; void *s = NULL; void *p = NULL; int err; if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0, BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag)) != 0) { device_printf(sc->dev, "Cannot allocate descriptor tag\n"); return (ENOMEM); } if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT, map)) != 0) { device_printf(sc->dev, "Cannot allocate descriptor memory\n"); return (ENOMEM); } bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0); bzero(p, len); *(void **)desc = p; if (sw_size) { len = nelem * sw_size; s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO); *(void **)sdesc = s; } if (parent_entry_tag == NULL) return (0); if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS, TX_MAX_SIZE, BUS_DMA_ALLOCNOW, NULL, NULL, entry_tag)) != 0) { device_printf(sc->dev, "Cannot allocate descriptor entry tag\n"); return (ENOMEM); } return (0); } static void sge_slow_intr_handler(void *arg, int ncount) { adapter_t *sc = arg; t3_slow_intr_handler(sc); t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask); (void) t3_read_reg(sc, A_PL_INT_ENABLE0); } /** * sge_timer_cb - perform periodic maintenance of an SGE qset * @data: the SGE queue set to maintain * * Runs periodically from a timer to perform maintenance of an SGE queue * set. It performs two tasks: * * a) Cleans up any completed Tx descriptors that may still be pending. * Normal descriptor cleanup happens when new packets are added to a Tx * queue so this timer is relatively infrequent and does any cleanup only * if the Tx queue has not seen any new packets in a while. We make a * best effort attempt to reclaim descriptors, in that we don't wait * around if we cannot get a queue's lock (which most likely is because * someone else is queueing new packets and so will also handle the clean * up). Since control queues use immediate data exclusively we don't * bother cleaning them up here. * * b) Replenishes Rx queues that have run out due to memory shortage. * Normally new Rx buffers are added when existing ones are consumed but * when out of memory a queue can become empty. We try to add only a few * buffers here, the queue will be replenished fully as these new buffers * are used up if memory shortage has subsided. * * c) Return coalesced response queue credits in case a response queue is * starved. * * d) Ring doorbells for T304 tunnel queues since we have seen doorbell * fifo overflows and the FW doesn't implement any recovery scheme yet. */ static void sge_timer_cb(void *arg) { adapter_t *sc = arg; if ((sc->flags & USING_MSIX) == 0) { struct port_info *pi; struct sge_qset *qs; struct sge_txq *txq; int i, j; int reclaim_ofl, refill_rx; if (sc->open_device_map == 0) return; for (i = 0; i < sc->params.nports; i++) { pi = &sc->port[i]; for (j = 0; j < pi->nqsets; j++) { qs = &sc->sge.qs[pi->first_qset + j]; txq = &qs->txq[0]; reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned; refill_rx = ((qs->fl[0].credits < qs->fl[0].size) || (qs->fl[1].credits < qs->fl[1].size)); if (reclaim_ofl || refill_rx) { taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task); break; } } } } if (sc->params.nports > 2) { int i; for_each_port(sc, i) { struct port_info *pi = &sc->port[i]; t3_write_reg(sc, A_SG_KDOORBELL, F_SELEGRCNTX | (FW_TUNNEL_SGEEC_START + pi->first_qset)); } } if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) && sc->open_device_map != 0) callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc); } /* * This is meant to be a catch-all function to keep sge state private * to sge.c * */ int t3_sge_init_adapter(adapter_t *sc) { callout_init(&sc->sge_timer_ch, 1); callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc); TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc); return (0); } int t3_sge_reset_adapter(adapter_t *sc) { callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc); return (0); } int t3_sge_init_port(struct port_info *pi) { TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi); return (0); } /** * refill_rspq - replenish an SGE response queue * @adapter: the adapter * @q: the response queue to replenish * @credits: how many new responses to make available * * Replenishes a response queue by making the supplied number of responses * available to HW. */ static __inline void refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits) { /* mbufs are allocated on demand when a rspq entry is processed. */ t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN, V_RSPQ(q->cntxt_id) | V_CREDITS(credits)); } static void sge_txq_reclaim_handler(void *arg, int ncount) { struct sge_qset *qs = arg; int i; for (i = 0; i < 3; i++) reclaim_completed_tx(qs, 16, i); } static void sge_timer_reclaim(void *arg, int ncount) { struct port_info *pi = arg; int i, nqsets = pi->nqsets; adapter_t *sc = pi->adapter; struct sge_qset *qs; struct mtx *lock; KASSERT((sc->flags & USING_MSIX) == 0, ("can't call timer reclaim for msi-x")); for (i = 0; i < nqsets; i++) { qs = &sc->sge.qs[pi->first_qset + i]; reclaim_completed_tx(qs, 16, TXQ_OFLD); lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock : &sc->sge.qs[0].rspq.lock; if (mtx_trylock(lock)) { /* XXX currently assume that we are *NOT* polling */ uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS); if (qs->fl[0].credits < qs->fl[0].size - 16) __refill_fl(sc, &qs->fl[0]); if (qs->fl[1].credits < qs->fl[1].size - 16) __refill_fl(sc, &qs->fl[1]); if (status & (1 << qs->rspq.cntxt_id)) { if (qs->rspq.credits) { refill_rspq(sc, &qs->rspq, 1); qs->rspq.credits--; t3_write_reg(sc, A_SG_RSPQ_FL_STATUS, 1 << qs->rspq.cntxt_id); } } mtx_unlock(lock); } } } /** * init_qset_cntxt - initialize an SGE queue set context info * @qs: the queue set * @id: the queue set id * * Initializes the TIDs and context ids for the queues of a queue set. */ static void init_qset_cntxt(struct sge_qset *qs, u_int id) { qs->rspq.cntxt_id = id; qs->fl[0].cntxt_id = 2 * id; qs->fl[1].cntxt_id = 2 * id + 1; qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id; qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id; qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id; qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id; qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id; /* XXX: a sane limit is needed instead of INT_MAX */ mbufq_init(&qs->txq[TXQ_ETH].sendq, INT_MAX); mbufq_init(&qs->txq[TXQ_OFLD].sendq, INT_MAX); mbufq_init(&qs->txq[TXQ_CTRL].sendq, INT_MAX); } static void txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs) { txq->in_use += ndesc; /* * XXX we don't handle stopping of queue * presumably start handles this when we bump against the end */ txqs->gen = txq->gen; txq->unacked += ndesc; txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5); txq->unacked &= 31; txqs->pidx = txq->pidx; txq->pidx += ndesc; #ifdef INVARIANTS if (((txqs->pidx > txq->cidx) && (txq->pidx < txqs->pidx) && (txq->pidx >= txq->cidx)) || ((txqs->pidx < txq->cidx) && (txq->pidx >= txq-> cidx)) || ((txqs->pidx < txq->cidx) && (txq->cidx < txqs->pidx))) panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d", txqs->pidx, txq->pidx, txq->cidx); #endif if (txq->pidx >= txq->size) { txq->pidx -= txq->size; txq->gen ^= 1; } } /** * calc_tx_descs - calculate the number of Tx descriptors for a packet * @m: the packet mbufs * @nsegs: the number of segments * * Returns the number of Tx descriptors needed for the given Ethernet * packet. Ethernet packets require addition of WR and CPL headers. */ static __inline unsigned int calc_tx_descs(const struct mbuf *m, int nsegs) { unsigned int flits; if (m->m_pkthdr.len <= PIO_LEN) return 1; flits = sgl_len(nsegs) + 2; if (m->m_pkthdr.csum_flags & CSUM_TSO) flits++; return flits_to_desc(flits); } /** * make_sgl - populate a scatter/gather list for a packet * @sgp: the SGL to populate * @segs: the packet dma segments * @nsegs: the number of segments * * Generates a scatter/gather list for the buffers that make up a packet * and returns the SGL size in 8-byte words. The caller must size the SGL * appropriately. */ static __inline void make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs) { int i, idx; for (idx = 0, i = 0; i < nsegs; i++) { /* * firmware doesn't like empty segments */ if (segs[i].ds_len == 0) continue; if (i && idx == 0) ++sgp; sgp->len[idx] = htobe32(segs[i].ds_len); sgp->addr[idx] = htobe64(segs[i].ds_addr); idx ^= 1; } if (idx) { sgp->len[idx] = 0; sgp->addr[idx] = 0; } } /** * check_ring_tx_db - check and potentially ring a Tx queue's doorbell * @adap: the adapter * @q: the Tx queue * * Ring the doorbell if a Tx queue is asleep. There is a natural race, * where the HW is going to sleep just after we checked, however, * then the interrupt handler will detect the outstanding TX packet * and ring the doorbell for us. * * When GTS is disabled we unconditionally ring the doorbell. */ static __inline void check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring) { #if USE_GTS clear_bit(TXQ_LAST_PKT_DB, &q->flags); if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) { set_bit(TXQ_LAST_PKT_DB, &q->flags); #ifdef T3_TRACE T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d", q->cntxt_id); #endif t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); } #else if (mustring || ++q->db_pending >= 32) { wmb(); /* write descriptors before telling HW */ t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); q->db_pending = 0; } #endif } static __inline void wr_gen2(struct tx_desc *d, unsigned int gen) { #if SGE_NUM_GENBITS == 2 d->flit[TX_DESC_FLITS - 1] = htobe64(gen); #endif } /** * write_wr_hdr_sgl - write a WR header and, optionally, SGL * @ndesc: number of Tx descriptors spanned by the SGL * @txd: first Tx descriptor to be written * @txqs: txq state (generation and producer index) * @txq: the SGE Tx queue * @sgl: the SGL * @flits: number of flits to the start of the SGL in the first descriptor * @sgl_flits: the SGL size in flits * @wr_hi: top 32 bits of WR header based on WR type (big endian) * @wr_lo: low 32 bits of WR header based on WR type (big endian) * * Write a work request header and an associated SGL. If the SGL is * small enough to fit into one Tx descriptor it has already been written * and we just need to write the WR header. Otherwise we distribute the * SGL across the number of descriptors it spans. */ static void write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs, const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits, unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo) { struct work_request_hdr *wrp = (struct work_request_hdr *)txd; struct tx_sw_desc *txsd = &txq->sdesc[txqs->pidx]; if (__predict_true(ndesc == 1)) { set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) | V_WR_SGLSFLT(flits)) | wr_hi, htonl(V_WR_LEN(flits + sgl_flits) | V_WR_GEN(txqs->gen)) | wr_lo); wr_gen2(txd, txqs->gen); } else { unsigned int ogen = txqs->gen; const uint64_t *fp = (const uint64_t *)sgl; struct work_request_hdr *wp = wrp; wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) | V_WR_SGLSFLT(flits)) | wr_hi; while (sgl_flits) { unsigned int avail = WR_FLITS - flits; if (avail > sgl_flits) avail = sgl_flits; memcpy(&txd->flit[flits], fp, avail * sizeof(*fp)); sgl_flits -= avail; ndesc--; if (!sgl_flits) break; fp += avail; txd++; txsd++; if (++txqs->pidx == txq->size) { txqs->pidx = 0; txqs->gen ^= 1; txd = txq->desc; txsd = txq->sdesc; } /* * when the head of the mbuf chain * is freed all clusters will be freed * with it */ wrp = (struct work_request_hdr *)txd; wrp->wrh_hi = htonl(V_WR_DATATYPE(1) | V_WR_SGLSFLT(1)) | wr_hi; wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS, sgl_flits + 1)) | V_WR_GEN(txqs->gen)) | wr_lo; wr_gen2(txd, txqs->gen); flits = 1; } wrp->wrh_hi |= htonl(F_WR_EOP); wmb(); wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo; wr_gen2((struct tx_desc *)wp, ogen); } } /* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */ #define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20) #define GET_VTAG(cntrl, m) \ do { \ if ((m)->m_flags & M_VLANTAG) \ cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \ } while (0) static int t3_encap(struct sge_qset *qs, struct mbuf **m) { adapter_t *sc; struct mbuf *m0; struct sge_txq *txq; struct txq_state txqs; struct port_info *pi; unsigned int ndesc, flits, cntrl, mlen; int err, nsegs, tso_info = 0; struct work_request_hdr *wrp; struct tx_sw_desc *txsd; struct sg_ent *sgp, *sgl; uint32_t wr_hi, wr_lo, sgl_flits; bus_dma_segment_t segs[TX_MAX_SEGS]; struct tx_desc *txd; pi = qs->port; sc = pi->adapter; txq = &qs->txq[TXQ_ETH]; txd = &txq->desc[txq->pidx]; txsd = &txq->sdesc[txq->pidx]; sgl = txq->txq_sgl; prefetch(txd); m0 = *m; mtx_assert(&qs->lock, MA_OWNED); cntrl = V_TXPKT_INTF(pi->txpkt_intf); KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n")); if (m0->m_nextpkt == NULL && m0->m_next != NULL && m0->m_pkthdr.csum_flags & (CSUM_TSO)) tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz); if (m0->m_nextpkt != NULL) { busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs); ndesc = 1; mlen = 0; } else { if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map, &m0, segs, &nsegs))) { if (cxgb_debug) printf("failed ... err=%d\n", err); return (err); } mlen = m0->m_pkthdr.len; ndesc = calc_tx_descs(m0, nsegs); } txq_prod(txq, ndesc, &txqs); KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs)); txsd->m = m0; if (m0->m_nextpkt != NULL) { struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd; int i, fidx; if (nsegs > 7) panic("trying to coalesce %d packets in to one WR", nsegs); txq->txq_coalesced += nsegs; wrp = (struct work_request_hdr *)txd; flits = nsegs*2 + 1; for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) { struct cpl_tx_pkt_batch_entry *cbe; uint64_t flit; uint32_t *hflit = (uint32_t *)&flit; int cflags = m0->m_pkthdr.csum_flags; cntrl = V_TXPKT_INTF(pi->txpkt_intf); GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); if (__predict_false(!(cflags & CSUM_IP))) cntrl |= F_TXPKT_IPCSUM_DIS; if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6)))) cntrl |= F_TXPKT_L4CSUM_DIS; hflit[0] = htonl(cntrl); hflit[1] = htonl(segs[i].ds_len | 0x80000000); flit |= htobe64(1 << 24); cbe = &cpl_batch->pkt_entry[i]; cbe->cntrl = hflit[0]; cbe->len = hflit[1]; cbe->addr = htobe64(segs[i].ds_addr); } wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) | V_WR_SGLSFLT(flits)) | htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl); wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token)); set_wr_hdr(wrp, wr_hi, wr_lo); wmb(); ETHER_BPF_MTAP(pi->ifp, m0); wr_gen2(txd, txqs.gen); check_ring_tx_db(sc, txq, 0); return (0); } else if (tso_info) { uint16_t eth_type; struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd; struct ether_header *eh; void *l3hdr; struct tcphdr *tcp; txd->flit[2] = 0; GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO); hdr->cntrl = htonl(cntrl); hdr->len = htonl(mlen | 0x80000000); if (__predict_false(mlen < TCPPKTHDRSIZE)) { printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%b,flags=%#x", m0, mlen, m0->m_pkthdr.tso_segsz, (int)m0->m_pkthdr.csum_flags, CSUM_BITS, m0->m_flags); panic("tx tso packet too small"); } /* Make sure that ether, ip, tcp headers are all in m0 */ if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) { m0 = m_pullup(m0, TCPPKTHDRSIZE); if (__predict_false(m0 == NULL)) { /* XXX panic probably an overreaction */ panic("couldn't fit header into mbuf"); } } eh = mtod(m0, struct ether_header *); eth_type = eh->ether_type; if (eth_type == htons(ETHERTYPE_VLAN)) { struct ether_vlan_header *evh = (void *)eh; tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II_VLAN); l3hdr = evh + 1; eth_type = evh->evl_proto; } else { tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II); l3hdr = eh + 1; } if (eth_type == htons(ETHERTYPE_IP)) { struct ip *ip = l3hdr; tso_info |= V_LSO_IPHDR_WORDS(ip->ip_hl); tcp = (struct tcphdr *)(ip + 1); } else if (eth_type == htons(ETHERTYPE_IPV6)) { struct ip6_hdr *ip6 = l3hdr; KASSERT(ip6->ip6_nxt == IPPROTO_TCP, ("%s: CSUM_TSO with ip6_nxt %d", __func__, ip6->ip6_nxt)); tso_info |= F_LSO_IPV6; tso_info |= V_LSO_IPHDR_WORDS(sizeof(*ip6) >> 2); tcp = (struct tcphdr *)(ip6 + 1); } else panic("%s: CSUM_TSO but neither ip nor ip6", __func__); tso_info |= V_LSO_TCPHDR_WORDS(tcp->th_off); hdr->lso_info = htonl(tso_info); if (__predict_false(mlen <= PIO_LEN)) { /* * pkt not undersized but fits in PIO_LEN * Indicates a TSO bug at the higher levels. */ txsd->m = NULL; m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]); flits = (mlen + 7) / 8 + 3; wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) | V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | F_WR_SOP | F_WR_EOP | txqs.compl); wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(txqs.gen) | V_WR_TID(txq->token)); set_wr_hdr(&hdr->wr, wr_hi, wr_lo); wmb(); ETHER_BPF_MTAP(pi->ifp, m0); wr_gen2(txd, txqs.gen); check_ring_tx_db(sc, txq, 0); m_freem(m0); return (0); } flits = 3; } else { struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd; GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP))) cntrl |= F_TXPKT_IPCSUM_DIS; if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6)))) cntrl |= F_TXPKT_L4CSUM_DIS; cpl->cntrl = htonl(cntrl); cpl->len = htonl(mlen | 0x80000000); if (mlen <= PIO_LEN) { txsd->m = NULL; m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]); flits = (mlen + 7) / 8 + 2; wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) | V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | F_WR_SOP | F_WR_EOP | txqs.compl); wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(txqs.gen) | V_WR_TID(txq->token)); set_wr_hdr(&cpl->wr, wr_hi, wr_lo); wmb(); ETHER_BPF_MTAP(pi->ifp, m0); wr_gen2(txd, txqs.gen); check_ring_tx_db(sc, txq, 0); m_freem(m0); return (0); } flits = 2; } wrp = (struct work_request_hdr *)txd; sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl; make_sgl(sgp, segs, nsegs); sgl_flits = sgl_len(nsegs); ETHER_BPF_MTAP(pi->ifp, m0); KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc)); wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl); wr_lo = htonl(V_WR_TID(txq->token)); write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo); check_ring_tx_db(sc, txq, 0); return (0); } void cxgb_tx_watchdog(void *arg) { struct sge_qset *qs = arg; struct sge_txq *txq = &qs->txq[TXQ_ETH]; if (qs->coalescing != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) && TXQ_RING_EMPTY(qs)) qs->coalescing = 0; else if (qs->coalescing == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start)) qs->coalescing = 1; if (TXQ_TRYLOCK(qs)) { qs->qs_flags |= QS_FLUSHING; cxgb_start_locked(qs); qs->qs_flags &= ~QS_FLUSHING; TXQ_UNLOCK(qs); } if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING) callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog, qs, txq->txq_watchdog.c_cpu); } static void cxgb_tx_timeout(void *arg) { struct sge_qset *qs = arg; struct sge_txq *txq = &qs->txq[TXQ_ETH]; if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3))) qs->coalescing = 1; if (TXQ_TRYLOCK(qs)) { qs->qs_flags |= QS_TIMEOUT; cxgb_start_locked(qs); qs->qs_flags &= ~QS_TIMEOUT; TXQ_UNLOCK(qs); } } static void cxgb_start_locked(struct sge_qset *qs) { struct mbuf *m_head = NULL; struct sge_txq *txq = &qs->txq[TXQ_ETH]; struct port_info *pi = qs->port; struct ifnet *ifp = pi->ifp; if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT)) reclaim_completed_tx(qs, 0, TXQ_ETH); if (!pi->link_config.link_ok) { TXQ_RING_FLUSH(qs); return; } TXQ_LOCK_ASSERT(qs); while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) && pi->link_config.link_ok) { reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH); if (txq->size - txq->in_use <= TX_MAX_DESC) break; if ((m_head = cxgb_dequeue(qs)) == NULL) break; /* * Encapsulation can modify our pointer, and or make it * NULL on failure. In that event, we can't requeue. */ if (t3_encap(qs, &m_head) || m_head == NULL) break; m_head = NULL; } if (txq->db_pending) check_ring_tx_db(pi->adapter, txq, 1); if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 && pi->link_config.link_ok) callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout, qs, txq->txq_timer.c_cpu); if (m_head != NULL) m_freem(m_head); } static int cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m) { struct port_info *pi = qs->port; struct sge_txq *txq = &qs->txq[TXQ_ETH]; struct buf_ring *br = txq->txq_mr; int error, avail; avail = txq->size - txq->in_use; TXQ_LOCK_ASSERT(qs); /* * We can only do a direct transmit if the following are true: * - we aren't coalescing (ring < 3/4 full) * - the link is up -- checked in caller * - there are no packets enqueued already * - there is space in hardware transmit queue */ if (check_pkt_coalesce(qs) == 0 && !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) { if (t3_encap(qs, &m)) { if (m != NULL && (error = drbr_enqueue(ifp, br, m)) != 0) return (error); } else { if (txq->db_pending) check_ring_tx_db(pi->adapter, txq, 1); /* * We've bypassed the buf ring so we need to update * the stats directly */ txq->txq_direct_packets++; txq->txq_direct_bytes += m->m_pkthdr.len; } } else if ((error = drbr_enqueue(ifp, br, m)) != 0) return (error); reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH); if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok && (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7))) cxgb_start_locked(qs); else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer)) callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout, qs, txq->txq_timer.c_cpu); return (0); } int cxgb_transmit(struct ifnet *ifp, struct mbuf *m) { struct sge_qset *qs; struct port_info *pi = ifp->if_softc; int error, qidx = pi->first_qset; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||(!pi->link_config.link_ok)) { m_freem(m); return (0); } /* check if flowid is set */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset; qs = &pi->adapter->sge.qs[qidx]; if (TXQ_TRYLOCK(qs)) { /* XXX running */ error = cxgb_transmit_locked(ifp, qs, m); TXQ_UNLOCK(qs); } else error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m); return (error); } void cxgb_qflush(struct ifnet *ifp) { /* * flush any enqueued mbufs in the buf_rings * and in the transmit queues * no-op for now */ return; } /** * write_imm - write a packet into a Tx descriptor as immediate data * @d: the Tx descriptor to write * @m: the packet * @len: the length of packet data to write as immediate data * @gen: the generation bit value to write * * Writes a packet as immediate data into a Tx descriptor. The packet * contains a work request at its beginning. We must write the packet * carefully so the SGE doesn't read accidentally before it's written in * its entirety. */ static __inline void write_imm(struct tx_desc *d, caddr_t src, unsigned int len, unsigned int gen) { struct work_request_hdr *from = (struct work_request_hdr *)src; struct work_request_hdr *to = (struct work_request_hdr *)d; uint32_t wr_hi, wr_lo; KASSERT(len <= WR_LEN && len >= sizeof(*from), ("%s: invalid len %d", __func__, len)); memcpy(&to[1], &from[1], len - sizeof(*from)); wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP | V_WR_BCNTLFLT(len & 7)); wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) | V_WR_LEN((len + 7) / 8)); set_wr_hdr(to, wr_hi, wr_lo); wmb(); wr_gen2(d, gen); } /** * check_desc_avail - check descriptor availability on a send queue * @adap: the adapter * @q: the TX queue * @m: the packet needing the descriptors * @ndesc: the number of Tx descriptors needed * @qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL) * * Checks if the requested number of Tx descriptors is available on an * SGE send queue. If the queue is already suspended or not enough * descriptors are available the packet is queued for later transmission. * Must be called with the Tx queue locked. * * Returns 0 if enough descriptors are available, 1 if there aren't * enough descriptors and the packet has been queued, and 2 if the caller * needs to retry because there weren't enough descriptors at the * beginning of the call but some freed up in the mean time. */ static __inline int check_desc_avail(adapter_t *adap, struct sge_txq *q, struct mbuf *m, unsigned int ndesc, unsigned int qid) { /* * XXX We currently only use this for checking the control queue * the control queue is only used for binding qsets which happens * at init time so we are guaranteed enough descriptors */ if (__predict_false(mbufq_len(&q->sendq))) { addq_exit: (void )mbufq_enqueue(&q->sendq, m); return 1; } if (__predict_false(q->size - q->in_use < ndesc)) { struct sge_qset *qs = txq_to_qset(q, qid); setbit(&qs->txq_stopped, qid); if (should_restart_tx(q) && test_and_clear_bit(qid, &qs->txq_stopped)) return 2; q->stops++; goto addq_exit; } return 0; } /** * reclaim_completed_tx_imm - reclaim completed control-queue Tx descs * @q: the SGE control Tx queue * * This is a variant of reclaim_completed_tx() that is used for Tx queues * that send only immediate data (presently just the control queues) and * thus do not have any mbufs */ static __inline void reclaim_completed_tx_imm(struct sge_txq *q) { unsigned int reclaim = q->processed - q->cleaned; q->in_use -= reclaim; q->cleaned += reclaim; } /** * ctrl_xmit - send a packet through an SGE control Tx queue * @adap: the adapter * @q: the control queue * @m: the packet * * Send a packet through an SGE control Tx queue. Packets sent through * a control queue must fit entirely as immediate data in a single Tx * descriptor and have no page fragments. */ static int ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m) { int ret; struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *); struct sge_txq *q = &qs->txq[TXQ_CTRL]; KASSERT(m->m_len <= WR_LEN, ("%s: bad tx data", __func__)); wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP); wrp->wrh_lo = htonl(V_WR_TID(q->token)); TXQ_LOCK(qs); again: reclaim_completed_tx_imm(q); ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL); if (__predict_false(ret)) { if (ret == 1) { TXQ_UNLOCK(qs); return (ENOSPC); } goto again; } write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen); q->in_use++; if (++q->pidx >= q->size) { q->pidx = 0; q->gen ^= 1; } TXQ_UNLOCK(qs); wmb(); t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); m_free(m); return (0); } /** * restart_ctrlq - restart a suspended control queue * @qs: the queue set cotaining the control queue * * Resumes transmission on a suspended Tx control queue. */ static void restart_ctrlq(void *data, int npending) { struct mbuf *m; struct sge_qset *qs = (struct sge_qset *)data; struct sge_txq *q = &qs->txq[TXQ_CTRL]; adapter_t *adap = qs->port->adapter; TXQ_LOCK(qs); again: reclaim_completed_tx_imm(q); while (q->in_use < q->size && (m = mbufq_dequeue(&q->sendq)) != NULL) { write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen); m_free(m); if (++q->pidx >= q->size) { q->pidx = 0; q->gen ^= 1; } q->in_use++; } if (mbufq_len(&q->sendq)) { setbit(&qs->txq_stopped, TXQ_CTRL); if (should_restart_tx(q) && test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) goto again; q->stops++; } TXQ_UNLOCK(qs); t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); } /* * Send a management message through control queue 0 */ int t3_mgmt_tx(struct adapter *adap, struct mbuf *m) { return ctrl_xmit(adap, &adap->sge.qs[0], m); } /** * free_qset - free the resources of an SGE queue set * @sc: the controller owning the queue set * @q: the queue set * * Release the HW and SW resources associated with an SGE queue set, such * as HW contexts, packet buffers, and descriptor rings. Traffic to the * queue set must be quiesced prior to calling this. */ static void t3_free_qset(adapter_t *sc, struct sge_qset *q) { int i; reclaim_completed_tx(q, 0, TXQ_ETH); if (q->txq[TXQ_ETH].txq_mr != NULL) buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF); if (q->txq[TXQ_ETH].txq_ifq != NULL) { ifq_delete(q->txq[TXQ_ETH].txq_ifq); free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF); } for (i = 0; i < SGE_RXQ_PER_SET; ++i) { if (q->fl[i].desc) { mtx_lock_spin(&sc->sge.reg_lock); t3_sge_disable_fl(sc, q->fl[i].cntxt_id); mtx_unlock_spin(&sc->sge.reg_lock); bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map); bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc, q->fl[i].desc_map); bus_dma_tag_destroy(q->fl[i].desc_tag); bus_dma_tag_destroy(q->fl[i].entry_tag); } if (q->fl[i].sdesc) { free_rx_bufs(sc, &q->fl[i]); free(q->fl[i].sdesc, M_DEVBUF); } } mtx_unlock(&q->lock); MTX_DESTROY(&q->lock); for (i = 0; i < SGE_TXQ_PER_SET; i++) { if (q->txq[i].desc) { mtx_lock_spin(&sc->sge.reg_lock); t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0); mtx_unlock_spin(&sc->sge.reg_lock); bus_dmamap_unload(q->txq[i].desc_tag, q->txq[i].desc_map); bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc, q->txq[i].desc_map); bus_dma_tag_destroy(q->txq[i].desc_tag); bus_dma_tag_destroy(q->txq[i].entry_tag); } if (q->txq[i].sdesc) { free(q->txq[i].sdesc, M_DEVBUF); } } if (q->rspq.desc) { mtx_lock_spin(&sc->sge.reg_lock); t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id); mtx_unlock_spin(&sc->sge.reg_lock); bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map); bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc, q->rspq.desc_map); bus_dma_tag_destroy(q->rspq.desc_tag); MTX_DESTROY(&q->rspq.lock); } #if defined(INET6) || defined(INET) tcp_lro_free(&q->lro.ctrl); #endif bzero(q, sizeof(*q)); } /** * t3_free_sge_resources - free SGE resources * @sc: the adapter softc * * Frees resources used by the SGE queue sets. */ void t3_free_sge_resources(adapter_t *sc, int nqsets) { int i; for (i = 0; i < nqsets; ++i) { TXQ_LOCK(&sc->sge.qs[i]); t3_free_qset(sc, &sc->sge.qs[i]); } } /** * t3_sge_start - enable SGE * @sc: the controller softc * * Enables the SGE for DMAs. This is the last step in starting packet * transfers. */ void t3_sge_start(adapter_t *sc) { t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE); } /** * t3_sge_stop - disable SGE operation * @sc: the adapter * * Disables the DMA engine. This can be called in emeregencies (e.g., * from error interrupts) or from normal process context. In the latter * case it also disables any pending queue restart tasklets. Note that * if it is called in interrupt context it cannot disable the restart * tasklets as it cannot wait, however the tasklets will have no effect * since the doorbells are disabled and the driver will call this again * later from process context, at which time the tasklets will be stopped * if they are still running. */ void t3_sge_stop(adapter_t *sc) { int i, nqsets; t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0); if (sc->tq == NULL) return; for (nqsets = i = 0; i < (sc)->params.nports; i++) nqsets += sc->port[i].nqsets; #ifdef notyet /* * * XXX */ for (i = 0; i < nqsets; ++i) { struct sge_qset *qs = &sc->sge.qs[i]; taskqueue_drain(sc->tq, &qs->txq[TXQ_OFLD].qresume_task); taskqueue_drain(sc->tq, &qs->txq[TXQ_CTRL].qresume_task); } #endif } /** * t3_free_tx_desc - reclaims Tx descriptors and their buffers * @adapter: the adapter * @q: the Tx queue to reclaim descriptors from * @reclaimable: the number of descriptors to reclaim * @m_vec_size: maximum number of buffers to reclaim * @desc_reclaimed: returns the number of descriptors reclaimed * * Reclaims Tx descriptors from an SGE Tx queue and frees the associated * Tx buffers. Called with the Tx queue lock held. * * Returns number of buffers of reclaimed */ void t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue) { struct tx_sw_desc *txsd; unsigned int cidx, mask; struct sge_txq *q = &qs->txq[queue]; #ifdef T3_TRACE T3_TRACE2(sc->tb[q->cntxt_id & 7], "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx); #endif cidx = q->cidx; mask = q->size - 1; txsd = &q->sdesc[cidx]; mtx_assert(&qs->lock, MA_OWNED); while (reclaimable--) { prefetch(q->sdesc[(cidx + 1) & mask].m); prefetch(q->sdesc[(cidx + 2) & mask].m); if (txsd->m != NULL) { if (txsd->flags & TX_SW_DESC_MAPPED) { bus_dmamap_unload(q->entry_tag, txsd->map); txsd->flags &= ~TX_SW_DESC_MAPPED; } m_freem_list(txsd->m); txsd->m = NULL; } else q->txq_skipped++; ++txsd; if (++cidx == q->size) { cidx = 0; txsd = q->sdesc; } } q->cidx = cidx; } /** * is_new_response - check if a response is newly written * @r: the response descriptor * @q: the response queue * * Returns true if a response descriptor contains a yet unprocessed * response. */ static __inline int is_new_response(const struct rsp_desc *r, const struct sge_rspq *q) { return (r->intr_gen & F_RSPD_GEN2) == q->gen; } #define RSPD_GTS_MASK (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS) #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \ V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \ V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \ V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR)) /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */ #define NOMEM_INTR_DELAY 2500 #ifdef TCP_OFFLOAD /** * write_ofld_wr - write an offload work request * @adap: the adapter * @m: the packet to send * @q: the Tx queue * @pidx: index of the first Tx descriptor to write * @gen: the generation value to use * @ndesc: number of descriptors the packet will occupy * * Write an offload work request to send the supplied packet. The packet * data already carry the work request with most fields populated. */ static void write_ofld_wr(adapter_t *adap, struct mbuf *m, struct sge_txq *q, unsigned int pidx, unsigned int gen, unsigned int ndesc) { unsigned int sgl_flits, flits; int i, idx, nsegs, wrlen; struct work_request_hdr *from; struct sg_ent *sgp, t3sgl[TX_MAX_SEGS / 2 + 1]; struct tx_desc *d = &q->desc[pidx]; struct txq_state txqs; struct sglist_seg *segs; struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); struct sglist *sgl; from = (void *)(oh + 1); /* Start of WR within mbuf */ wrlen = m->m_len - sizeof(*oh); if (!(oh->flags & F_HDR_SGL)) { write_imm(d, (caddr_t)from, wrlen, gen); /* * mbuf with "real" immediate tx data will be enqueue_wr'd by * t3_push_frames and freed in wr_ack. Others, like those sent * down by close_conn, t3_send_reset, etc. should be freed here. */ if (!(oh->flags & F_HDR_DF)) m_free(m); return; } memcpy(&d->flit[1], &from[1], wrlen - sizeof(*from)); sgl = oh->sgl; flits = wrlen / 8; sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : t3sgl; nsegs = sgl->sg_nseg; segs = sgl->sg_segs; for (idx = 0, i = 0; i < nsegs; i++) { KASSERT(segs[i].ss_len, ("%s: 0 len in sgl", __func__)); if (i && idx == 0) ++sgp; sgp->len[idx] = htobe32(segs[i].ss_len); sgp->addr[idx] = htobe64(segs[i].ss_paddr); idx ^= 1; } if (idx) { sgp->len[idx] = 0; sgp->addr[idx] = 0; } sgl_flits = sgl_len(nsegs); txqs.gen = gen; txqs.pidx = pidx; txqs.compl = 0; write_wr_hdr_sgl(ndesc, d, &txqs, q, t3sgl, flits, sgl_flits, from->wrh_hi, from->wrh_lo); } /** * ofld_xmit - send a packet through an offload queue * @adap: the adapter * @q: the Tx offload queue * @m: the packet * * Send an offload packet through an SGE offload queue. */ static int ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m) { int ret; unsigned int ndesc; unsigned int pidx, gen; struct sge_txq *q = &qs->txq[TXQ_OFLD]; struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); ndesc = G_HDR_NDESC(oh->flags); TXQ_LOCK(qs); again: reclaim_completed_tx(qs, 16, TXQ_OFLD); ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD); if (__predict_false(ret)) { if (ret == 1) { TXQ_UNLOCK(qs); return (EINTR); } goto again; } gen = q->gen; q->in_use += ndesc; pidx = q->pidx; q->pidx += ndesc; if (q->pidx >= q->size) { q->pidx -= q->size; q->gen ^= 1; } write_ofld_wr(adap, m, q, pidx, gen, ndesc); check_ring_tx_db(adap, q, 1); TXQ_UNLOCK(qs); return (0); } /** * restart_offloadq - restart a suspended offload queue * @qs: the queue set cotaining the offload queue * * Resumes transmission on a suspended Tx offload queue. */ static void restart_offloadq(void *data, int npending) { struct mbuf *m; struct sge_qset *qs = data; struct sge_txq *q = &qs->txq[TXQ_OFLD]; adapter_t *adap = qs->port->adapter; int cleaned; TXQ_LOCK(qs); again: cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD); while ((m = mbufq_first(&q->sendq)) != NULL) { unsigned int gen, pidx; struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); unsigned int ndesc = G_HDR_NDESC(oh->flags); if (__predict_false(q->size - q->in_use < ndesc)) { setbit(&qs->txq_stopped, TXQ_OFLD); if (should_restart_tx(q) && test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) goto again; q->stops++; break; } gen = q->gen; q->in_use += ndesc; pidx = q->pidx; q->pidx += ndesc; if (q->pidx >= q->size) { q->pidx -= q->size; q->gen ^= 1; } (void)mbufq_dequeue(&q->sendq); TXQ_UNLOCK(qs); write_ofld_wr(adap, m, q, pidx, gen, ndesc); TXQ_LOCK(qs); } #if USE_GTS set_bit(TXQ_RUNNING, &q->flags); set_bit(TXQ_LAST_PKT_DB, &q->flags); #endif TXQ_UNLOCK(qs); wmb(); t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); } /** * t3_offload_tx - send an offload packet * @m: the packet * * Sends an offload packet. We use the packet priority to select the * appropriate Tx queue as follows: bit 0 indicates whether the packet * should be sent as regular or control, bits 1-3 select the queue set. */ int t3_offload_tx(struct adapter *sc, struct mbuf *m) { struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); struct sge_qset *qs = &sc->sge.qs[G_HDR_QSET(oh->flags)]; if (oh->flags & F_HDR_CTRL) { m_adj(m, sizeof (*oh)); /* trim ofld_hdr off */ return (ctrl_xmit(sc, qs, m)); } else return (ofld_xmit(sc, qs, m)); } #endif static void restart_tx(struct sge_qset *qs) { struct adapter *sc = qs->port->adapter; if (isset(&qs->txq_stopped, TXQ_OFLD) && should_restart_tx(&qs->txq[TXQ_OFLD]) && test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) { qs->txq[TXQ_OFLD].restarts++; taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task); } if (isset(&qs->txq_stopped, TXQ_CTRL) && should_restart_tx(&qs->txq[TXQ_CTRL]) && test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) { qs->txq[TXQ_CTRL].restarts++; taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task); } } /** * t3_sge_alloc_qset - initialize an SGE queue set * @sc: the controller softc * @id: the queue set id * @nports: how many Ethernet ports will be using this queue set * @irq_vec_idx: the IRQ vector index for response queue interrupts * @p: configuration parameters for this queue set * @ntxq: number of Tx queues for the queue set * @pi: port info for queue set * * Allocate resources and initialize an SGE queue set. A queue set * comprises a response queue, two Rx free-buffer queues, and up to 3 * Tx queues. The Tx queues are assigned roles in the order Ethernet * queue, offload queue, and control queue. */ int t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx, const struct qset_params *p, int ntxq, struct port_info *pi) { struct sge_qset *q = &sc->sge.qs[id]; int i, ret = 0; MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF); q->port = pi; q->adap = sc; if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size, M_DEVBUF, M_WAITOK, &q->lock)) == NULL) { device_printf(sc->dev, "failed to allocate mbuf ring\n"); goto err; } if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF, M_NOWAIT | M_ZERO)) == NULL) { device_printf(sc->dev, "failed to allocate ifq\n"); goto err; } ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp); callout_init(&q->txq[TXQ_ETH].txq_timer, 1); callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1); q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus; q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus; init_qset_cntxt(q, id); q->idx = id; if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc), sizeof(struct rx_sw_desc), &q->fl[0].phys_addr, &q->fl[0].desc, &q->fl[0].sdesc, &q->fl[0].desc_tag, &q->fl[0].desc_map, sc->rx_dmat, &q->fl[0].entry_tag)) != 0) { printf("error %d from alloc ring fl0\n", ret); goto err; } if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc), sizeof(struct rx_sw_desc), &q->fl[1].phys_addr, &q->fl[1].desc, &q->fl[1].sdesc, &q->fl[1].desc_tag, &q->fl[1].desc_map, sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) { printf("error %d from alloc ring fl1\n", ret); goto err; } if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0, &q->rspq.phys_addr, &q->rspq.desc, NULL, &q->rspq.desc_tag, &q->rspq.desc_map, NULL, NULL)) != 0) { printf("error %d from alloc ring rspq\n", ret); goto err; } snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d", device_get_unit(sc->dev), irq_vec_idx); MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF); for (i = 0; i < ntxq; ++i) { size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc); if ((ret = alloc_ring(sc, p->txq_size[i], sizeof(struct tx_desc), sz, &q->txq[i].phys_addr, &q->txq[i].desc, &q->txq[i].sdesc, &q->txq[i].desc_tag, &q->txq[i].desc_map, sc->tx_dmat, &q->txq[i].entry_tag)) != 0) { printf("error %d from alloc ring tx %i\n", ret, i); goto err; } mbufq_init(&q->txq[i].sendq, INT_MAX); q->txq[i].gen = 1; q->txq[i].size = p->txq_size[i]; } #ifdef TCP_OFFLOAD TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q); #endif TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q); TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q); TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q); q->fl[0].gen = q->fl[1].gen = 1; q->fl[0].size = p->fl_size; q->fl[1].size = p->jumbo_size; q->rspq.gen = 1; q->rspq.cidx = 0; q->rspq.size = p->rspq_size; q->txq[TXQ_ETH].stop_thres = nports * flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3); q->fl[0].buf_size = MCLBYTES; q->fl[0].zone = zone_pack; q->fl[0].type = EXT_PACKET; if (p->jumbo_buf_size == MJUM16BYTES) { q->fl[1].zone = zone_jumbo16; q->fl[1].type = EXT_JUMBO16; } else if (p->jumbo_buf_size == MJUM9BYTES) { q->fl[1].zone = zone_jumbo9; q->fl[1].type = EXT_JUMBO9; } else if (p->jumbo_buf_size == MJUMPAGESIZE) { q->fl[1].zone = zone_jumbop; q->fl[1].type = EXT_JUMBOP; } else { KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size)); ret = EDOOFUS; goto err; } q->fl[1].buf_size = p->jumbo_buf_size; /* Allocate and setup the lro_ctrl structure */ q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO); #if defined(INET6) || defined(INET) ret = tcp_lro_init(&q->lro.ctrl); if (ret) { printf("error %d from tcp_lro_init\n", ret); goto err; } #endif q->lro.ctrl.ifp = pi->ifp; mtx_lock_spin(&sc->sge.reg_lock); ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx, q->rspq.phys_addr, q->rspq.size, q->fl[0].buf_size, 1, 0); if (ret) { printf("error %d from t3_sge_init_rspcntxt\n", ret); goto err_unlock; } for (i = 0; i < SGE_RXQ_PER_SET; ++i) { ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0, q->fl[i].phys_addr, q->fl[i].size, q->fl[i].buf_size, p->cong_thres, 1, 0); if (ret) { printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i); goto err_unlock; } } ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS, SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr, q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token, 1, 0); if (ret) { printf("error %d from t3_sge_init_ecntxt\n", ret); goto err_unlock; } if (ntxq > 1) { ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id, USE_GTS, SGE_CNTXT_OFLD, id, q->txq[TXQ_OFLD].phys_addr, q->txq[TXQ_OFLD].size, 0, 1, 0); if (ret) { printf("error %d from t3_sge_init_ecntxt\n", ret); goto err_unlock; } } if (ntxq > 2) { ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0, SGE_CNTXT_CTRL, id, q->txq[TXQ_CTRL].phys_addr, q->txq[TXQ_CTRL].size, q->txq[TXQ_CTRL].token, 1, 0); if (ret) { printf("error %d from t3_sge_init_ecntxt\n", ret); goto err_unlock; } } mtx_unlock_spin(&sc->sge.reg_lock); t3_update_qset_coalesce(q, p); refill_fl(sc, &q->fl[0], q->fl[0].size); refill_fl(sc, &q->fl[1], q->fl[1].size); refill_rspq(sc, &q->rspq, q->rspq.size - 1); t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) | V_NEWTIMER(q->rspq.holdoff_tmr)); return (0); err_unlock: mtx_unlock_spin(&sc->sge.reg_lock); err: TXQ_LOCK(q); t3_free_qset(sc, q); return (ret); } /* * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular mbuf with * ethernet data. Hardware assistance with various checksums and any vlan tag * will also be taken into account here. */ void t3_rx_eth(struct adapter *adap, struct mbuf *m, int ethpad) { struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad); struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]]; struct ifnet *ifp = pi->ifp; if (cpl->vlan_valid) { m->m_pkthdr.ether_vtag = ntohs(cpl->vlan); m->m_flags |= M_VLANTAG; } m->m_pkthdr.rcvif = ifp; /* * adjust after conversion to mbuf chain */ m->m_pkthdr.len -= (sizeof(*cpl) + ethpad); m->m_len -= (sizeof(*cpl) + ethpad); m->m_data += (sizeof(*cpl) + ethpad); if (!cpl->fragment && cpl->csum_valid && cpl->csum == 0xffff) { struct ether_header *eh = mtod(m, void *); uint16_t eh_type; if (eh->ether_type == htons(ETHERTYPE_VLAN)) { struct ether_vlan_header *evh = mtod(m, void *); eh_type = evh->evl_proto; } else eh_type = eh->ether_type; if (ifp->if_capenable & IFCAP_RXCSUM && eh_type == htons(ETHERTYPE_IP)) { m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; } else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 && eh_type == htons(ETHERTYPE_IPV6)) { m->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; } } } /** * get_packet - return the next ingress packet buffer from a free list * @adap: the adapter that received the packet * @drop_thres: # of remaining buffers before we start dropping packets * @qs: the qset that the SGE free list holding the packet belongs to * @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain * @r: response descriptor * * Get the next packet from a free list and complete setup of the * sk_buff. If the packet is small we make a copy and recycle the * original buffer, otherwise we use the original buffer itself. If a * positive drop threshold is supplied packets are dropped and their * buffers recycled if (a) the number of remaining buffers is under the * threshold and the packet is too big to copy, or (b) the packet should * be copied but there is no memory for the copy. */ static int get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs, struct t3_mbuf_hdr *mh, struct rsp_desc *r) { unsigned int len_cq = ntohl(r->len_cq); struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0]; int mask, cidx = fl->cidx; struct rx_sw_desc *sd = &fl->sdesc[cidx]; uint32_t len = G_RSPD_LEN(len_cq); uint32_t flags = M_EXT; uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags)); caddr_t cl; struct mbuf *m; int ret = 0; mask = fl->size - 1; prefetch(fl->sdesc[(cidx + 1) & mask].m); prefetch(fl->sdesc[(cidx + 2) & mask].m); prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl); prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl); fl->credits--; bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD); if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) { if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) goto skip_recycle; cl = mtod(m, void *); memcpy(cl, sd->rxsd_cl, len); recycle_rx_buf(adap, fl, fl->cidx); m->m_pkthdr.len = m->m_len = len; m->m_flags = 0; mh->mh_head = mh->mh_tail = m; ret = 1; goto done; } else { skip_recycle: bus_dmamap_unload(fl->entry_tag, sd->map); cl = sd->rxsd_cl; m = sd->m; if ((sopeop == RSPQ_SOP_EOP) || (sopeop == RSPQ_SOP)) flags |= M_PKTHDR; m_init(m, fl->zone, fl->buf_size, M_NOWAIT, MT_DATA, flags); if (fl->zone == zone_pack) { /* * restore clobbered data pointer */ m->m_data = m->m_ext.ext_buf; } else { m_cljset(m, cl, fl->type); } m->m_len = len; } switch(sopeop) { case RSPQ_SOP_EOP: ret = 1; /* FALLTHROUGH */ case RSPQ_SOP: mh->mh_head = mh->mh_tail = m; m->m_pkthdr.len = len; break; case RSPQ_EOP: ret = 1; /* FALLTHROUGH */ case RSPQ_NSOP_NEOP: if (mh->mh_tail == NULL) { log(LOG_ERR, "discarding intermediate descriptor entry\n"); m_freem(m); break; } mh->mh_tail->m_next = m; mh->mh_tail = m; mh->mh_head->m_pkthdr.len += len; break; } if (cxgb_debug) printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len); done: if (++fl->cidx == fl->size) fl->cidx = 0; return (ret); } /** * handle_rsp_cntrl_info - handles control information in a response * @qs: the queue set corresponding to the response * @flags: the response control flags * * Handles the control information of an SGE response, such as GTS * indications and completion credits for the queue set's Tx queues. * HW coalesces credits, we don't do any extra SW coalescing. */ static __inline void handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags) { unsigned int credits; #if USE_GTS if (flags & F_RSPD_TXQ0_GTS) clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags); #endif credits = G_RSPD_TXQ0_CR(flags); if (credits) qs->txq[TXQ_ETH].processed += credits; credits = G_RSPD_TXQ2_CR(flags); if (credits) qs->txq[TXQ_CTRL].processed += credits; # if USE_GTS if (flags & F_RSPD_TXQ1_GTS) clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags); # endif credits = G_RSPD_TXQ1_CR(flags); if (credits) qs->txq[TXQ_OFLD].processed += credits; } static void check_ring_db(adapter_t *adap, struct sge_qset *qs, unsigned int sleeping) { ; } /** * process_responses - process responses from an SGE response queue * @adap: the adapter * @qs: the queue set to which the response queue belongs * @budget: how many responses can be processed in this round * * Process responses from an SGE response queue up to the supplied budget. * Responses include received packets as well as credits and other events * for the queues that belong to the response queue's queue set. * A negative budget is effectively unlimited. * * Additionally choose the interrupt holdoff time for the next interrupt * on this queue. If the system is under memory shortage use a fairly * long delay to help recovery. */ static int process_responses(adapter_t *adap, struct sge_qset *qs, int budget) { struct sge_rspq *rspq = &qs->rspq; struct rsp_desc *r = &rspq->desc[rspq->cidx]; int budget_left = budget; unsigned int sleeping = 0; #if defined(INET6) || defined(INET) int lro_enabled = qs->lro.enabled; int skip_lro; struct lro_ctrl *lro_ctrl = &qs->lro.ctrl; #endif struct t3_mbuf_hdr *mh = &rspq->rspq_mh; #ifdef DEBUG static int last_holdoff = 0; if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) { printf("next_holdoff=%d\n", rspq->holdoff_tmr); last_holdoff = rspq->holdoff_tmr; } #endif rspq->next_holdoff = rspq->holdoff_tmr; while (__predict_true(budget_left && is_new_response(r, rspq))) { int eth, eop = 0, ethpad = 0; uint32_t flags = ntohl(r->flags); uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val); uint8_t opcode = r->rss_hdr.opcode; eth = (opcode == CPL_RX_PKT); if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) { struct mbuf *m; if (cxgb_debug) printf("async notification\n"); if (mh->mh_head == NULL) { mh->mh_head = m_gethdr(M_NOWAIT, MT_DATA); m = mh->mh_head; } else { m = m_gethdr(M_NOWAIT, MT_DATA); } if (m == NULL) goto no_mem; memcpy(mtod(m, char *), r, AN_PKT_SIZE); m->m_len = m->m_pkthdr.len = AN_PKT_SIZE; *mtod(m, char *) = CPL_ASYNC_NOTIF; opcode = CPL_ASYNC_NOTIF; eop = 1; rspq->async_notif++; goto skip; } else if (flags & F_RSPD_IMM_DATA_VALID) { struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { no_mem: rspq->next_holdoff = NOMEM_INTR_DELAY; budget_left--; break; } if (mh->mh_head == NULL) mh->mh_head = m; else mh->mh_tail->m_next = m; mh->mh_tail = m; get_imm_packet(adap, r, m); mh->mh_head->m_pkthdr.len += m->m_len; eop = 1; rspq->imm_data++; } else if (r->len_cq) { int drop_thresh = eth ? SGE_RX_DROP_THRES : 0; eop = get_packet(adap, drop_thresh, qs, mh, r); if (eop) { if (r->rss_hdr.hash_type && !adap->timestamp) { M_HASHTYPE_SET(mh->mh_head, M_HASHTYPE_OPAQUE); mh->mh_head->m_pkthdr.flowid = rss_hash; } } ethpad = 2; } else { rspq->pure_rsps++; } skip: if (flags & RSPD_CTRL_MASK) { sleeping |= flags & RSPD_GTS_MASK; handle_rsp_cntrl_info(qs, flags); } if (!eth && eop) { rspq->offload_pkts++; #ifdef TCP_OFFLOAD adap->cpl_handler[opcode](qs, r, mh->mh_head); #else m_freem(mh->mh_head); #endif mh->mh_head = NULL; } else if (eth && eop) { struct mbuf *m = mh->mh_head; t3_rx_eth(adap, m, ethpad); /* * The T304 sends incoming packets on any qset. If LRO * is also enabled, we could end up sending packet up * lro_ctrl->ifp's input. That is incorrect. * * The mbuf's rcvif was derived from the cpl header and * is accurate. Skip LRO and just use that. */ #if defined(INET6) || defined(INET) skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif); if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro && (tcp_lro_rx(lro_ctrl, m, 0) == 0) ) { /* successfully queue'd for LRO */ } else #endif { /* * LRO not enabled, packet unsuitable for LRO, * or unable to queue. Pass it up right now in * either case. */ struct ifnet *ifp = m->m_pkthdr.rcvif; (*ifp->if_input)(ifp, m); } mh->mh_head = NULL; } r++; if (__predict_false(++rspq->cidx == rspq->size)) { rspq->cidx = 0; rspq->gen ^= 1; r = rspq->desc; } if (++rspq->credits >= 64) { refill_rspq(adap, rspq, rspq->credits); rspq->credits = 0; } __refill_fl_lt(adap, &qs->fl[0], 32); __refill_fl_lt(adap, &qs->fl[1], 32); --budget_left; } #if defined(INET6) || defined(INET) /* Flush LRO */ while (!SLIST_EMPTY(&lro_ctrl->lro_active)) { struct lro_entry *queued = SLIST_FIRST(&lro_ctrl->lro_active); SLIST_REMOVE_HEAD(&lro_ctrl->lro_active, next); tcp_lro_flush(lro_ctrl, queued); } #endif if (sleeping) check_ring_db(adap, qs, sleeping); mb(); /* commit Tx queue processed updates */ if (__predict_false(qs->txq_stopped > 1)) restart_tx(qs); __refill_fl_lt(adap, &qs->fl[0], 512); __refill_fl_lt(adap, &qs->fl[1], 512); budget -= budget_left; return (budget); } /* * A helper function that processes responses and issues GTS. */ static __inline int process_responses_gts(adapter_t *adap, struct sge_rspq *rq) { int work; static int last_holdoff = 0; work = process_responses(adap, rspq_to_qset(rq), -1); if (cxgb_debug && (rq->next_holdoff != last_holdoff)) { printf("next_holdoff=%d\n", rq->next_holdoff); last_holdoff = rq->next_holdoff; } t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) | V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx)); return (work); } /* * Interrupt handler for legacy INTx interrupts for T3B-based cards. * Handles data events from SGE response queues as well as error and other * async events as they all use the same interrupt pin. We use one SGE * response queue per port in this mode and protect all response queues with * queue 0's lock. */ void t3b_intr(void *data) { uint32_t i, map; adapter_t *adap = data; struct sge_rspq *q0 = &adap->sge.qs[0].rspq; t3_write_reg(adap, A_PL_CLI, 0); map = t3_read_reg(adap, A_SG_DATA_INTR); if (!map) return; if (__predict_false(map & F_ERRINTR)) { t3_write_reg(adap, A_PL_INT_ENABLE0, 0); (void) t3_read_reg(adap, A_PL_INT_ENABLE0); taskqueue_enqueue(adap->tq, &adap->slow_intr_task); } mtx_lock(&q0->lock); for_each_port(adap, i) if (map & (1 << i)) process_responses_gts(adap, &adap->sge.qs[i].rspq); mtx_unlock(&q0->lock); } /* * The MSI interrupt handler. This needs to handle data events from SGE * response queues as well as error and other async events as they all use * the same MSI vector. We use one SGE response queue per port in this mode * and protect all response queues with queue 0's lock. */ void t3_intr_msi(void *data) { adapter_t *adap = data; struct sge_rspq *q0 = &adap->sge.qs[0].rspq; int i, new_packets = 0; mtx_lock(&q0->lock); for_each_port(adap, i) if (process_responses_gts(adap, &adap->sge.qs[i].rspq)) new_packets = 1; mtx_unlock(&q0->lock); if (new_packets == 0) { t3_write_reg(adap, A_PL_INT_ENABLE0, 0); (void) t3_read_reg(adap, A_PL_INT_ENABLE0); taskqueue_enqueue(adap->tq, &adap->slow_intr_task); } } void t3_intr_msix(void *data) { struct sge_qset *qs = data; adapter_t *adap = qs->port->adapter; struct sge_rspq *rspq = &qs->rspq; if (process_responses_gts(adap, rspq) == 0) rspq->unhandled_irqs++; } #define QDUMP_SBUF_SIZE 32 * 400 static int t3_dump_rspq(SYSCTL_HANDLER_ARGS) { struct sge_rspq *rspq; struct sge_qset *qs; int i, err, dump_end, idx; struct sbuf *sb; struct rsp_desc *rspd; uint32_t data[4]; rspq = arg1; qs = rspq_to_qset(rspq); if (rspq->rspq_dump_count == 0) return (0); if (rspq->rspq_dump_count > RSPQ_Q_SIZE) { log(LOG_WARNING, "dump count is too large %d\n", rspq->rspq_dump_count); rspq->rspq_dump_count = 0; return (EINVAL); } if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) { log(LOG_WARNING, "dump start of %d is greater than queue size\n", rspq->rspq_dump_start); rspq->rspq_dump_start = 0; return (EINVAL); } err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data); if (err) return (err); err = sysctl_wire_old_buffer(req, 0); if (err) return (err); sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req); sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n", (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f), ((data[2] >> 26) & 1), ((data[2] >> 27) & 1)); sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n", ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]); sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start, (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1)); dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count; for (i = rspq->rspq_dump_start; i < dump_end; i++) { idx = i & (RSPQ_Q_SIZE-1); rspd = &rspq->desc[idx]; sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n", idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx, rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx)); sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n", rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags), be32toh(rspd->len_cq), rspd->intr_gen); } err = sbuf_finish(sb); sbuf_delete(sb); return (err); } static int t3_dump_txq_eth(SYSCTL_HANDLER_ARGS) { struct sge_txq *txq; struct sge_qset *qs; int i, j, err, dump_end; struct sbuf *sb; struct tx_desc *txd; uint32_t *WR, wr_hi, wr_lo, gen; uint32_t data[4]; txq = arg1; qs = txq_to_qset(txq, TXQ_ETH); if (txq->txq_dump_count == 0) { return (0); } if (txq->txq_dump_count > TX_ETH_Q_SIZE) { log(LOG_WARNING, "dump count is too large %d\n", txq->txq_dump_count); txq->txq_dump_count = 1; return (EINVAL); } if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) { log(LOG_WARNING, "dump start of %d is greater than queue size\n", txq->txq_dump_start); txq->txq_dump_start = 0; return (EINVAL); } err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data); if (err) return (err); err = sysctl_wire_old_buffer(req, 0); if (err) return (err); sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req); sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n", (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16), (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1)); sbuf_printf(sb, " TUN=%u TOE=%u generation%u uP token=%u valid=%u\n", ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1), ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1)); sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx, txq->txq_dump_start, (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1)); dump_end = txq->txq_dump_start + txq->txq_dump_count; for (i = txq->txq_dump_start; i < dump_end; i++) { txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)]; WR = (uint32_t *)txd->flit; wr_hi = ntohl(WR[0]); wr_lo = ntohl(WR[1]); gen = G_WR_GEN(wr_lo); sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n", wr_hi, wr_lo, gen); for (j = 2; j < 30; j += 4) sbuf_printf(sb, "\t%08x %08x %08x %08x \n", WR[j], WR[j + 1], WR[j + 2], WR[j + 3]); } err = sbuf_finish(sb); sbuf_delete(sb); return (err); } static int t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS) { struct sge_txq *txq; struct sge_qset *qs; int i, j, err, dump_end; struct sbuf *sb; struct tx_desc *txd; uint32_t *WR, wr_hi, wr_lo, gen; txq = arg1; qs = txq_to_qset(txq, TXQ_CTRL); if (txq->txq_dump_count == 0) { return (0); } if (txq->txq_dump_count > 256) { log(LOG_WARNING, "dump count is too large %d\n", txq->txq_dump_count); txq->txq_dump_count = 1; return (EINVAL); } if (txq->txq_dump_start > 255) { log(LOG_WARNING, "dump start of %d is greater than queue size\n", txq->txq_dump_start); txq->txq_dump_start = 0; return (EINVAL); } err = sysctl_wire_old_buffer(req, 0); if (err != 0) return (err); sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req); sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx, txq->txq_dump_start, (txq->txq_dump_start + txq->txq_dump_count) & 255); dump_end = txq->txq_dump_start + txq->txq_dump_count; for (i = txq->txq_dump_start; i < dump_end; i++) { txd = &txq->desc[i & (255)]; WR = (uint32_t *)txd->flit; wr_hi = ntohl(WR[0]); wr_lo = ntohl(WR[1]); gen = G_WR_GEN(wr_lo); sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n", wr_hi, wr_lo, gen); for (j = 2; j < 30; j += 4) sbuf_printf(sb, "\t%08x %08x %08x %08x \n", WR[j], WR[j + 1], WR[j + 2], WR[j + 3]); } err = sbuf_finish(sb); sbuf_delete(sb); return (err); } static int t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS) { adapter_t *sc = arg1; struct qset_params *qsp = &sc->params.sge.qset[0]; int coalesce_usecs; struct sge_qset *qs; int i, j, err, nqsets = 0; struct mtx *lock; if ((sc->flags & FULL_INIT_DONE) == 0) return (ENXIO); coalesce_usecs = qsp->coalesce_usecs; err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req); if (err != 0) { return (err); } if (coalesce_usecs == qsp->coalesce_usecs) return (0); for (i = 0; i < sc->params.nports; i++) for (j = 0; j < sc->port[i].nqsets; j++) nqsets++; coalesce_usecs = max(1, coalesce_usecs); for (i = 0; i < nqsets; i++) { qs = &sc->sge.qs[i]; qsp = &sc->params.sge.qset[i]; qsp->coalesce_usecs = coalesce_usecs; lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock : &sc->sge.qs[0].rspq.lock; mtx_lock(lock); t3_update_qset_coalesce(qs, qsp); t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) | V_NEWTIMER(qs->rspq.holdoff_tmr)); mtx_unlock(lock); } return (0); } static int t3_pkt_timestamp(SYSCTL_HANDLER_ARGS) { adapter_t *sc = arg1; int rc, timestamp; if ((sc->flags & FULL_INIT_DONE) == 0) return (ENXIO); timestamp = sc->timestamp; rc = sysctl_handle_int(oidp, ×tamp, arg2, req); if (rc != 0) return (rc); if (timestamp != sc->timestamp) { t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS, timestamp ? F_ENABLERXPKTTMSTPRSS : 0); sc->timestamp = timestamp; } return (0); } void t3_add_attach_sysctls(adapter_t *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; ctx = device_get_sysctl_ctx(sc->dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); /* random information */ SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version, 0, "firmware version"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "hw_revision", CTLFLAG_RD, &sc->params.rev, 0, "chip model"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "port_types", CTLFLAG_RD, sc->port_types, 0, "type of ports"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "enable_debug", CTLFLAG_RW, &cxgb_debug, 0, "enable verbose debugging output"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce", CTLFLAG_RD, &sc->tunq_coalesce, "#tunneled packets freed"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "txq_overrun", CTLFLAG_RD, &txq_fills, 0, "#times txq overrun"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "core_clock", CTLFLAG_RD, &sc->params.vpd.cclk, 0, "core clock frequency (in KHz)"); } static const char *rspq_name = "rspq"; static const char *txq_names[] = { "txq_eth", "txq_ofld", "txq_ctrl" }; static int sysctl_handle_macstat(SYSCTL_HANDLER_ARGS) { struct port_info *p = arg1; uint64_t *parg; if (!p) return (EINVAL); cxgb_refresh_stats(p); parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2); return (sysctl_handle_64(oidp, parg, 0, req)); } void t3_add_configured_sysctls(adapter_t *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; int i, j; ctx = device_get_sysctl_ctx(sc->dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal", CTLTYPE_INT|CTLFLAG_RW, sc, 0, t3_set_coalesce_usecs, "I", "interrupt coalescing timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pkt_timestamp", CTLTYPE_INT | CTLFLAG_RW, sc, 0, t3_pkt_timestamp, "I", "provide packet timestamp instead of connection hash"); for (i = 0; i < sc->params.nports; i++) { struct port_info *pi = &sc->port[i]; struct sysctl_oid *poid; struct sysctl_oid_list *poidlist; struct mac_stats *mstats = &pi->mac.stats; snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i); poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, pi->namebuf, CTLFLAG_RD, NULL, "port statistics"); poidlist = SYSCTL_CHILDREN(poid); SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO, "nqsets", CTLFLAG_RD, &pi->nqsets, 0, "#queue sets"); for (j = 0; j < pi->nqsets; j++) { struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j]; struct sysctl_oid *qspoid, *rspqpoid, *txqpoid, *ctrlqpoid, *lropoid; struct sysctl_oid_list *qspoidlist, *rspqpoidlist, *txqpoidlist, *ctrlqpoidlist, *lropoidlist; struct sge_txq *txq = &qs->txq[TXQ_ETH]; snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j); qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, qs->namebuf, CTLFLAG_RD, NULL, "qset statistics"); qspoidlist = SYSCTL_CHILDREN(qspoid); SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty", CTLFLAG_RD, &qs->fl[0].empty, 0, "freelist #0 empty"); SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty", CTLFLAG_RD, &qs->fl[1].empty, 0, "freelist #1 empty"); rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, rspq_name, CTLFLAG_RD, NULL, "rspq statistics"); rspqpoidlist = SYSCTL_CHILDREN(rspqpoid); txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, txq_names[0], CTLFLAG_RD, NULL, "txq statistics"); txqpoidlist = SYSCTL_CHILDREN(txqpoid); ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, txq_names[2], CTLFLAG_RD, NULL, "ctrlq statistics"); ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid); lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, "lro_stats", CTLFLAG_RD, NULL, "LRO statistics"); lropoidlist = SYSCTL_CHILDREN(lropoid); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size", CTLFLAG_RD, &qs->rspq.size, 0, "#entries in response queue"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx", CTLFLAG_RD, &qs->rspq.cidx, 0, "consumer index"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits", CTLFLAG_RD, &qs->rspq.credits, 0, "#credits"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved", CTLFLAG_RD, &qs->rspq.starved, 0, "#times starved"); SYSCTL_ADD_UAUTO(ctx, rspqpoidlist, OID_AUTO, "phys_addr", CTLFLAG_RD, &qs->rspq.phys_addr, "physical_address_of the queue"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start", CTLFLAG_RW, &qs->rspq.rspq_dump_start, 0, "start rspq dump entry"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count", CTLFLAG_RW, &qs->rspq.rspq_dump_count, 0, "#rspq entries to dump"); SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump", CTLTYPE_STRING | CTLFLAG_RD, &qs->rspq, 0, t3_dump_rspq, "A", "dump of the response queue"); SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped", CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops, "#tunneled packets dropped"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen", CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.mq_len, 0, "#tunneled packets waiting to be sent"); #if 0 SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx", CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod, 0, "#tunneled packets queue producer index"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx", CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons, 0, "#tunneled packets queue consumer index"); #endif SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed", CTLFLAG_RD, &qs->txq[TXQ_ETH].processed, 0, "#tunneled packets processed by the card"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned", CTLFLAG_RD, &txq->cleaned, 0, "#tunneled packets cleaned"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use", CTLFLAG_RD, &txq->in_use, 0, "#tunneled packet slots in use"); SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "frees", CTLFLAG_RD, &txq->txq_frees, "#tunneled packets freed"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped", CTLFLAG_RD, &txq->txq_skipped, 0, "#tunneled packet descriptors skipped"); SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced", CTLFLAG_RD, &txq->txq_coalesced, "#tunneled packets coalesced"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued", CTLFLAG_RD, &txq->txq_enqueued, 0, "#tunneled packets enqueued to hardware"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags", CTLFLAG_RD, &qs->txq_stopped, 0, "tx queues stopped"); SYSCTL_ADD_UAUTO(ctx, txqpoidlist, OID_AUTO, "phys_addr", CTLFLAG_RD, &txq->phys_addr, "physical_address_of the queue"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen", CTLFLAG_RW, &qs->txq[TXQ_ETH].gen, 0, "txq generation"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx", CTLFLAG_RD, &txq->cidx, 0, "hardware queue cidx"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx", CTLFLAG_RD, &txq->pidx, 0, "hardware queue pidx"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start", CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start, 0, "txq start idx for dump"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count", CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count, 0, "txq #entries to dump"); SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump", CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_ETH], 0, t3_dump_txq_eth, "A", "dump of the transmit queue"); SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start", CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start, 0, "ctrlq start idx for dump"); SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count", CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count, 0, "ctrl #entries to dump"); SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump", CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL], 0, t3_dump_txq_ctrl, "A", "dump of the transmit queue"); - SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued", + SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_queued", CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL); - SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed", + SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_flushed", CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL); - SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum", + SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_bad_csum", CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL); SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt", CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL); } /* Now add a node for mac stats. */ poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats", CTLFLAG_RD, NULL, "MAC statistics"); poidlist = SYSCTL_CHILDREN(poid); /* * We (ab)use the length argument (arg2) to pass on the offset * of the data that we are interested in. This is only required * for the quad counters that are updated from the hardware (we * make sure that we return the latest value). * sysctl_handle_macstat first updates *all* the counters from * the hardware, and then returns the latest value of the * requested counter. Best would be to update only the * requested counter from hardware, but t3_mac_update_stats() * hides all the register details and we don't want to dive into * all that here. */ #define CXGB_SYSCTL_ADD_QUAD(a) SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \ (CTLTYPE_U64 | CTLFLAG_RD), pi, offsetof(struct mac_stats, a), \ sysctl_handle_macstat, "QU", 0) CXGB_SYSCTL_ADD_QUAD(tx_octets); CXGB_SYSCTL_ADD_QUAD(tx_octets_bad); CXGB_SYSCTL_ADD_QUAD(tx_frames); CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames); CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames); CXGB_SYSCTL_ADD_QUAD(tx_pause); CXGB_SYSCTL_ADD_QUAD(tx_deferred); CXGB_SYSCTL_ADD_QUAD(tx_late_collisions); CXGB_SYSCTL_ADD_QUAD(tx_total_collisions); CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions); CXGB_SYSCTL_ADD_QUAD(tx_underrun); CXGB_SYSCTL_ADD_QUAD(tx_len_errs); CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs); CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral); CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs); CXGB_SYSCTL_ADD_QUAD(tx_frames_64); CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127); CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255); CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511); CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023); CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518); CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max); CXGB_SYSCTL_ADD_QUAD(rx_octets); CXGB_SYSCTL_ADD_QUAD(rx_octets_bad); CXGB_SYSCTL_ADD_QUAD(rx_frames); CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames); CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames); CXGB_SYSCTL_ADD_QUAD(rx_pause); CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs); CXGB_SYSCTL_ADD_QUAD(rx_align_errs); CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs); CXGB_SYSCTL_ADD_QUAD(rx_data_errs); CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs); CXGB_SYSCTL_ADD_QUAD(rx_runt); CXGB_SYSCTL_ADD_QUAD(rx_jabber); CXGB_SYSCTL_ADD_QUAD(rx_short); CXGB_SYSCTL_ADD_QUAD(rx_too_long); CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs); CXGB_SYSCTL_ADD_QUAD(rx_cong_drops); CXGB_SYSCTL_ADD_QUAD(rx_frames_64); CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127); CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255); CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511); CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023); CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518); CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max); #undef CXGB_SYSCTL_ADD_QUAD #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \ CTLFLAG_RD, &mstats->a, 0) CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err); CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err); CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun); CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl); CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss); CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err); CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change); CXGB_SYSCTL_ADD_ULONG(num_toggled); CXGB_SYSCTL_ADD_ULONG(num_resets); CXGB_SYSCTL_ADD_ULONG(link_faults); #undef CXGB_SYSCTL_ADD_ULONG } } /** * t3_get_desc - dump an SGE descriptor for debugging purposes * @qs: the queue set * @qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx) * @idx: the descriptor index in the queue * @data: where to dump the descriptor contents * * Dumps the contents of a HW descriptor of an SGE queue. Returns the * size of the descriptor. */ int t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx, unsigned char *data) { if (qnum >= 6) return (EINVAL); if (qnum < 3) { if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size) return -EINVAL; memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc)); return sizeof(struct tx_desc); } if (qnum == 3) { if (!qs->rspq.desc || idx >= qs->rspq.size) return (EINVAL); memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc)); return sizeof(struct rsp_desc); } qnum -= 4; if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size) return (EINVAL); memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc)); return sizeof(struct rx_desc); } Index: head/sys/dev/cxgbe/t4_sge.c =================================================================== --- head/sys/dev/cxgbe/t4_sge.c (revision 294326) +++ head/sys/dev/cxgbe/t4_sge.c (revision 294327) @@ -1,4738 +1,4738 @@ /*- * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_NETMAP #include #include #include #include #include #endif #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_msg.h" #include "t4_mp_ring.h" #ifdef T4_PKT_TIMESTAMP #define RX_COPY_THRESHOLD (MINCLSIZE - 8) #else #define RX_COPY_THRESHOLD MINCLSIZE #endif /* * Ethernet frames are DMA'd at this byte offset into the freelist buffer. * 0-7 are valid values. */ int fl_pktshift = 2; TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift); /* * Pad ethernet payload up to this boundary. * -1: driver should figure out a good value. * 0: disable padding. * Any power of 2 from 32 to 4096 (both inclusive) is also a valid value. */ int fl_pad = -1; TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad); /* * Status page length. * -1: driver should figure out a good value. * 64 or 128 are the only other valid values. */ int spg_len = -1; TUNABLE_INT("hw.cxgbe.spg_len", &spg_len); /* * Congestion drops. * -1: no congestion feedback (not recommended). * 0: backpressure the channel instead of dropping packets right away. * 1: no backpressure, drop packets for the congested queue immediately. */ static int cong_drop = 0; TUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop); /* * Deliver multiple frames in the same free list buffer if they fit. * -1: let the driver decide whether to enable buffer packing or not. * 0: disable buffer packing. * 1: enable buffer packing. */ static int buffer_packing = -1; TUNABLE_INT("hw.cxgbe.buffer_packing", &buffer_packing); /* * Start next frame in a packed buffer at this boundary. * -1: driver should figure out a good value. * T4: driver will ignore this and use the same value as fl_pad above. * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value. */ static int fl_pack = -1; TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack); /* * Allow the driver to create mbuf(s) in a cluster allocated for rx. * 0: never; always allocate mbufs from the zone_mbuf UMA zone. * 1: ok to create mbuf(s) within a cluster if there is room. */ static int allow_mbufs_in_cluster = 1; TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster); /* * Largest rx cluster size that the driver is allowed to allocate. */ static int largest_rx_cluster = MJUM16BYTES; TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster); /* * Size of cluster allocation that's most likely to succeed. The driver will * fall back to this size if it fails to allocate clusters larger than this. */ static int safest_rx_cluster = PAGE_SIZE; TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster); struct txpkts { u_int wr_type; /* type 0 or type 1 */ u_int npkt; /* # of packets in this work request */ u_int plen; /* total payload (sum of all packets) */ u_int len16; /* # of 16B pieces used by this work request */ }; /* A packet's SGL. This + m_pkthdr has all info needed for tx */ struct sgl { struct sglist sg; struct sglist_seg seg[TX_SGL_SEGS]; }; static int service_iq(struct sge_iq *, int); static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t); static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *); static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int); static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *); static inline void init_eq(struct sge_eq *, int, int, uint8_t, uint16_t, char *); static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *, bus_addr_t *, void **); static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t, void *); static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *, int, int); static int free_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *); static void add_fl_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_fl *); static int alloc_fwq(struct adapter *); static int free_fwq(struct adapter *); static int alloc_mgmtq(struct adapter *); static int free_mgmtq(struct adapter *); static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int, struct sysctl_oid *); static int free_rxq(struct vi_info *, struct sge_rxq *); #ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int, struct sysctl_oid *); static int free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *); #endif #ifdef DEV_NETMAP static int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int, struct sysctl_oid *); static int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *); static int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int, struct sysctl_oid *); static int free_nm_txq(struct vi_info *, struct sge_nm_txq *); #endif static int ctrl_eq_alloc(struct adapter *, struct sge_eq *); static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *); #ifdef TCP_OFFLOAD static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *); #endif static int alloc_eq(struct adapter *, struct vi_info *, struct sge_eq *); static int free_eq(struct adapter *, struct sge_eq *); static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *, struct sysctl_oid *); static int free_wrq(struct adapter *, struct sge_wrq *); static int alloc_txq(struct vi_info *, struct sge_txq *, int, struct sysctl_oid *); static int free_txq(struct vi_info *, struct sge_txq *); static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int); static inline void ring_fl_db(struct adapter *, struct sge_fl *); static int refill_fl(struct adapter *, struct sge_fl *, int); static void refill_sfl(void *); static int alloc_fl_sdesc(struct sge_fl *); static void free_fl_sdesc(struct adapter *, struct sge_fl *); static void find_best_refill_source(struct adapter *, struct sge_fl *, int); static void find_safe_refill_source(struct adapter *, struct sge_fl *); static void add_fl_to_sfl(struct adapter *, struct sge_fl *); static inline void get_pkt_gl(struct mbuf *, struct sglist *); static inline u_int txpkt_len16(u_int, u_int); static inline u_int txpkts0_len16(u_int); static inline u_int txpkts1_len16(void); static u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *, struct mbuf *, u_int); static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int); static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int); static u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *, struct mbuf *, const struct txpkts *, u_int); static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int); static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int); static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int); static inline uint16_t read_hw_cidx(struct sge_eq *); static inline u_int reclaimable_tx_desc(struct sge_eq *); static inline u_int total_available_tx_desc(struct sge_eq *); static u_int reclaim_tx_descs(struct sge_txq *, u_int); static void tx_reclaim(void *, int); static __be64 get_flit(struct sglist_seg *, int, int); static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *, struct mbuf *); static int handle_fw_msg(struct sge_iq *, const struct rss_header *, struct mbuf *); static void wrq_tx_drain(void *, int); static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *); static int sysctl_uint16(SYSCTL_HANDLER_ARGS); static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS); static counter_u64_t extfree_refs; static counter_u64_t extfree_rels; /* * Called on MOD_LOAD. Validates and calculates the SGE tunables. */ void t4_sge_modload(void) { if (fl_pktshift < 0 || fl_pktshift > 7) { printf("Invalid hw.cxgbe.fl_pktshift value (%d)," " using 2 instead.\n", fl_pktshift); fl_pktshift = 2; } if (spg_len != 64 && spg_len != 128) { int len; #if defined(__i386__) || defined(__amd64__) len = cpu_clflush_line_size > 64 ? 128 : 64; #else len = 64; #endif if (spg_len != -1) { printf("Invalid hw.cxgbe.spg_len value (%d)," " using %d instead.\n", spg_len, len); } spg_len = len; } if (cong_drop < -1 || cong_drop > 1) { printf("Invalid hw.cxgbe.cong_drop value (%d)," " using 0 instead.\n", cong_drop); cong_drop = 0; } extfree_refs = counter_u64_alloc(M_WAITOK); extfree_rels = counter_u64_alloc(M_WAITOK); counter_u64_zero(extfree_refs); counter_u64_zero(extfree_rels); } void t4_sge_modunload(void) { counter_u64_free(extfree_refs); counter_u64_free(extfree_rels); } uint64_t t4_sge_extfree_refs(void) { uint64_t refs, rels; rels = counter_u64_fetch(extfree_rels); refs = counter_u64_fetch(extfree_refs); return (refs - rels); } void t4_init_sge_cpl_handlers(struct adapter *sc) { t4_register_cpl_handler(sc, CPL_FW4_MSG, handle_fw_msg); t4_register_cpl_handler(sc, CPL_FW6_MSG, handle_fw_msg); t4_register_cpl_handler(sc, CPL_SGE_EGR_UPDATE, handle_sge_egr_update); t4_register_cpl_handler(sc, CPL_RX_PKT, t4_eth_rx); t4_register_fw_msg_handler(sc, FW6_TYPE_CMD_RPL, t4_handle_fw_rpl); } static inline void setup_pad_and_pack_boundaries(struct adapter *sc) { uint32_t v, m; int pad, pack; pad = fl_pad; if (fl_pad < 32 || fl_pad > 4096 || !powerof2(fl_pad)) { /* * If there is any chance that we might use buffer packing and * the chip is a T4, then pick 64 as the pad/pack boundary. Set * it to 32 in all other cases. */ pad = is_t4(sc) && buffer_packing ? 64 : 32; /* * For fl_pad = 0 we'll still write a reasonable value to the * register but all the freelists will opt out of padding. * We'll complain here only if the user tried to set it to a * value greater than 0 that was invalid. */ if (fl_pad > 0) { device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value" " (%d), using %d instead.\n", fl_pad, pad); } } m = V_INGPADBOUNDARY(M_INGPADBOUNDARY); v = V_INGPADBOUNDARY(ilog2(pad) - 5); t4_set_reg_field(sc, A_SGE_CONTROL, m, v); if (is_t4(sc)) { if (fl_pack != -1 && fl_pack != pad) { /* Complain but carry on. */ device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored," " using %d instead.\n", fl_pack, pad); } return; } pack = fl_pack; if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 || !powerof2(fl_pack)) { pack = max(sc->params.pci.mps, CACHE_LINE_SIZE); MPASS(powerof2(pack)); if (pack < 16) pack = 16; if (pack == 32) pack = 64; if (pack > 4096) pack = 4096; if (fl_pack != -1) { device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value" " (%d), using %d instead.\n", fl_pack, pack); } } m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY); if (pack == 16) v = V_INGPACKBOUNDARY(0); else v = V_INGPACKBOUNDARY(ilog2(pack) - 5); MPASS(!is_t4(sc)); /* T4 doesn't have SGE_CONTROL2 */ t4_set_reg_field(sc, A_SGE_CONTROL2, m, v); } /* * adap->params.vpd.cclk must be set up before this is called. */ void t4_tweak_chip_settings(struct adapter *sc) { int i; uint32_t v, m; int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200}; int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk; int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */ uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); static int sge_flbuf_sizes[] = { MCLBYTES, #if MJUMPAGESIZE != MCLBYTES MJUMPAGESIZE, MJUMPAGESIZE - CL_METADATA_SIZE, MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE, #endif MJUM9BYTES, MJUM16BYTES, MCLBYTES - MSIZE - CL_METADATA_SIZE, MJUM9BYTES - CL_METADATA_SIZE, MJUM16BYTES - CL_METADATA_SIZE, }; KASSERT(sc->flags & MASTER_PF, ("%s: trying to change chip settings when not master.", __func__)); m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE; v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE | V_EGRSTATUSPAGESIZE(spg_len == 128); t4_set_reg_field(sc, A_SGE_CONTROL, m, v); setup_pad_and_pack_boundaries(sc); v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10); t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v); KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES, ("%s: hw buffer size table too big", __func__)); for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) { t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i), sge_flbuf_sizes[i]); } v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) | V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]); t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v); KASSERT(intr_timer[0] <= timer_max, ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0], timer_max)); for (i = 1; i < nitems(intr_timer); i++) { KASSERT(intr_timer[i] >= intr_timer[i - 1], ("%s: timers not listed in increasing order (%d)", __func__, i)); while (intr_timer[i] > timer_max) { if (i == nitems(intr_timer) - 1) { intr_timer[i] = timer_max; break; } intr_timer[i] += intr_timer[i - 1]; intr_timer[i] /= 2; } } v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) | V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1])); t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v); v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) | V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3])); t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v); v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) | V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5])); t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v); /* 4K, 16K, 64K, 256K DDP "page sizes" */ v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v); m = v = F_TDDPTAGTCB; t4_set_reg_field(sc, A_ULP_RX_CTL, m, v); m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; t4_set_reg_field(sc, A_TP_PARA_REG5, m, v); } /* * SGE wants the buffer to be at least 64B and then a multiple of 16. If * padding is is use the buffer's start and end need to be aligned to the pad * boundary as well. We'll just make sure that the size is a multiple of the * boundary here, it is up to the buffer allocation code to make sure the start * of the buffer is aligned as well. */ static inline int hwsz_ok(struct adapter *sc, int hwsz) { int mask = fl_pad ? sc->sge.pad_boundary - 1 : 16 - 1; return (hwsz >= 64 && (hwsz & mask) == 0); } /* * XXX: driver really should be able to deal with unexpected settings. */ int t4_read_chip_settings(struct adapter *sc) { struct sge *s = &sc->sge; int i, j, n, rc = 0; uint32_t m, v, r; uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); static int sw_buf_sizes[] = { /* Sorted by size */ MCLBYTES, #if MJUMPAGESIZE != MCLBYTES MJUMPAGESIZE, #endif MJUM9BYTES, MJUM16BYTES }; struct sw_zone_info *swz, *safe_swz; struct hw_buf_info *hwb; m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE; v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE | V_EGRSTATUSPAGESIZE(spg_len == 128); r = t4_read_reg(sc, A_SGE_CONTROL); if ((r & m) != v) { device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r); rc = EINVAL; } s->pad_boundary = 1 << (G_INGPADBOUNDARY(r) + 5); if (is_t4(sc)) s->pack_boundary = s->pad_boundary; else { r = t4_read_reg(sc, A_SGE_CONTROL2); if (G_INGPACKBOUNDARY(r) == 0) s->pack_boundary = 16; else s->pack_boundary = 1 << (G_INGPACKBOUNDARY(r) + 5); } v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10); r = t4_read_reg(sc, A_SGE_HOST_PAGE_SIZE); if (r != v) { device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r); rc = EINVAL; } /* Filter out unusable hw buffer sizes entirely (mark with -2). */ hwb = &s->hw_buf_info[0]; for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) { r = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i)); hwb->size = r; hwb->zidx = hwsz_ok(sc, r) ? -1 : -2; hwb->next = -1; } /* * Create a sorted list in decreasing order of hw buffer sizes (and so * increasing order of spare area) for each software zone. * * If padding is enabled then the start and end of the buffer must align * to the pad boundary; if packing is enabled then they must align with * the pack boundary as well. Allocations from the cluster zones are * aligned to min(size, 4K), so the buffer starts at that alignment and * ends at hwb->size alignment. If mbuf inlining is allowed the * starting alignment will be reduced to MSIZE and the driver will * exercise appropriate caution when deciding on the best buffer layout * to use. */ n = 0; /* no usable buffer size to begin with */ swz = &s->sw_zone_info[0]; safe_swz = NULL; for (i = 0; i < SW_ZONE_SIZES; i++, swz++) { int8_t head = -1, tail = -1; swz->size = sw_buf_sizes[i]; swz->zone = m_getzone(swz->size); swz->type = m_gettype(swz->size); if (swz->size < PAGE_SIZE) { MPASS(powerof2(swz->size)); if (fl_pad && (swz->size % sc->sge.pad_boundary != 0)) continue; } if (swz->size == safest_rx_cluster) safe_swz = swz; hwb = &s->hw_buf_info[0]; for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) { if (hwb->zidx != -1 || hwb->size > swz->size) continue; #ifdef INVARIANTS if (fl_pad) MPASS(hwb->size % sc->sge.pad_boundary == 0); #endif hwb->zidx = i; if (head == -1) head = tail = j; else if (hwb->size < s->hw_buf_info[tail].size) { s->hw_buf_info[tail].next = j; tail = j; } else { int8_t *cur; struct hw_buf_info *t; for (cur = &head; *cur != -1; cur = &t->next) { t = &s->hw_buf_info[*cur]; if (hwb->size == t->size) { hwb->zidx = -2; break; } if (hwb->size > t->size) { hwb->next = *cur; *cur = j; break; } } } } swz->head_hwidx = head; swz->tail_hwidx = tail; if (tail != -1) { n++; if (swz->size - s->hw_buf_info[tail].size >= CL_METADATA_SIZE) sc->flags |= BUF_PACKING_OK; } } if (n == 0) { device_printf(sc->dev, "no usable SGE FL buffer size.\n"); rc = EINVAL; } s->safe_hwidx1 = -1; s->safe_hwidx2 = -1; if (safe_swz != NULL) { s->safe_hwidx1 = safe_swz->head_hwidx; for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) { int spare; hwb = &s->hw_buf_info[i]; #ifdef INVARIANTS if (fl_pad) MPASS(hwb->size % sc->sge.pad_boundary == 0); #endif spare = safe_swz->size - hwb->size; if (spare >= CL_METADATA_SIZE) { s->safe_hwidx2 = i; break; } } } r = t4_read_reg(sc, A_SGE_INGRESS_RX_THRESHOLD); s->counter_val[0] = G_THRESHOLD_0(r); s->counter_val[1] = G_THRESHOLD_1(r); s->counter_val[2] = G_THRESHOLD_2(r); s->counter_val[3] = G_THRESHOLD_3(r); r = t4_read_reg(sc, A_SGE_TIMER_VALUE_0_AND_1); s->timer_val[0] = G_TIMERVALUE0(r) / core_ticks_per_usec(sc); s->timer_val[1] = G_TIMERVALUE1(r) / core_ticks_per_usec(sc); r = t4_read_reg(sc, A_SGE_TIMER_VALUE_2_AND_3); s->timer_val[2] = G_TIMERVALUE2(r) / core_ticks_per_usec(sc); s->timer_val[3] = G_TIMERVALUE3(r) / core_ticks_per_usec(sc); r = t4_read_reg(sc, A_SGE_TIMER_VALUE_4_AND_5); s->timer_val[4] = G_TIMERVALUE4(r) / core_ticks_per_usec(sc); s->timer_val[5] = G_TIMERVALUE5(r) / core_ticks_per_usec(sc); v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ); if (r != v) { device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r); rc = EINVAL; } m = v = F_TDDPTAGTCB; r = t4_read_reg(sc, A_ULP_RX_CTL); if ((r & m) != v) { device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r); rc = EINVAL; } m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; r = t4_read_reg(sc, A_TP_PARA_REG5); if ((r & m) != v) { device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r); rc = EINVAL; } r = t4_read_reg(sc, A_SGE_CONM_CTRL); s->fl_starve_threshold = G_EGRTHRESHOLD(r) * 2 + 1; if (is_t4(sc)) s->fl_starve_threshold2 = s->fl_starve_threshold; else s->fl_starve_threshold2 = G_EGRTHRESHOLDPACKING(r) * 2 + 1; /* egress queues: log2 of # of doorbells per BAR2 page */ r = t4_read_reg(sc, A_SGE_EGRESS_QUEUES_PER_PAGE_PF); r >>= S_QUEUESPERPAGEPF0 + (S_QUEUESPERPAGEPF1 - S_QUEUESPERPAGEPF0) * sc->pf; s->eq_s_qpp = r & M_QUEUESPERPAGEPF0; /* ingress queues: log2 of # of doorbells per BAR2 page */ r = t4_read_reg(sc, A_SGE_INGRESS_QUEUES_PER_PAGE_PF); r >>= S_QUEUESPERPAGEPF0 + (S_QUEUESPERPAGEPF1 - S_QUEUESPERPAGEPF0) * sc->pf; s->iq_s_qpp = r & M_QUEUESPERPAGEPF0; t4_init_tp_params(sc); t4_read_mtu_tbl(sc, sc->params.mtus, NULL); t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd); return (rc); } int t4_create_dma_tag(struct adapter *sc) { int rc; rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE, BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->dmat); if (rc != 0) { device_printf(sc->dev, "failed to create main DMA tag: %d\n", rc); } return (rc); } void t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *children) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes", CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A", "freelist buffer sizes"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD, NULL, fl_pktshift, "payload DMA offset in rx buffer (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD, NULL, sc->sge.pad_boundary, "payload pad boundary (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD, NULL, spg_len, "status page size (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD, NULL, cong_drop, "congestion drop setting"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD, NULL, sc->sge.pack_boundary, "payload pack boundary (bytes)"); } int t4_destroy_dma_tag(struct adapter *sc) { if (sc->dmat) bus_dma_tag_destroy(sc->dmat); return (0); } /* * Allocate and initialize the firmware event queue and the management queue. * * Returns errno on failure. Resources allocated up to that point may still be * allocated. Caller is responsible for cleanup in case this function fails. */ int t4_setup_adapter_queues(struct adapter *sc) { int rc; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); sysctl_ctx_init(&sc->ctx); sc->flags |= ADAP_SYSCTL_CTX; /* * Firmware event queue */ rc = alloc_fwq(sc); if (rc != 0) return (rc); /* * Management queue. This is just a control queue that uses the fwq as * its associated iq. */ rc = alloc_mgmtq(sc); return (rc); } /* * Idempotent */ int t4_teardown_adapter_queues(struct adapter *sc) { ADAPTER_LOCK_ASSERT_NOTOWNED(sc); /* Do this before freeing the queue */ if (sc->flags & ADAP_SYSCTL_CTX) { sysctl_ctx_free(&sc->ctx); sc->flags &= ~ADAP_SYSCTL_CTX; } free_mgmtq(sc); free_fwq(sc); return (0); } static inline int first_vector(struct vi_info *vi) { struct adapter *sc = vi->pi->adapter; if (sc->intr_count == 1) return (0); return (vi->first_intr); } /* * Given an arbitrary "index," come up with an iq that can be used by other * queues (of this VI) for interrupt forwarding, SGE egress updates, etc. * The iq returned is guaranteed to be something that takes direct interrupts. */ static struct sge_iq * vi_intr_iq(struct vi_info *vi, int idx) { struct adapter *sc = vi->pi->adapter; struct sge *s = &sc->sge; struct sge_iq *iq = NULL; int nintr, i; if (sc->intr_count == 1) return (&sc->sge.fwq); KASSERT(!(vi->flags & VI_NETMAP), ("%s: called on netmap VI", __func__)); nintr = vi->nintr; KASSERT(nintr != 0, ("%s: vi %p has no exclusive interrupts, total interrupts = %d", __func__, vi, sc->intr_count)); i = idx % nintr; if (vi->flags & INTR_RXQ) { if (i < vi->nrxq) { iq = &s->rxq[vi->first_rxq + i].iq; goto done; } i -= vi->nrxq; } #ifdef TCP_OFFLOAD if (vi->flags & INTR_OFLD_RXQ) { if (i < vi->nofldrxq) { iq = &s->ofld_rxq[vi->first_ofld_rxq + i].iq; goto done; } i -= vi->nofldrxq; } #endif panic("%s: vi %p, intr_flags 0x%lx, idx %d, total intr %d\n", __func__, vi, vi->flags & INTR_ALL, idx, nintr); done: MPASS(iq != NULL); KASSERT(iq->flags & IQ_INTR, ("%s: iq %p (vi %p, intr_flags 0x%lx, idx %d)", __func__, iq, vi, vi->flags & INTR_ALL, idx)); return (iq); } /* Maximum payload that can be delivered with a single iq descriptor */ static inline int mtu_to_max_payload(struct adapter *sc, int mtu, const int toe) { int payload; #ifdef TCP_OFFLOAD if (toe) { payload = sc->tt.rx_coalesce ? G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu; } else { #endif /* large enough even when hw VLAN extraction is disabled */ payload = fl_pktshift + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + mtu; #ifdef TCP_OFFLOAD } #endif return (payload); } int t4_setup_vi_queues(struct vi_info *vi) { int rc = 0, i, j, intr_idx, iqid; struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *ctrlq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif #ifdef DEV_NETMAP struct sge_nm_rxq *nm_rxq; struct sge_nm_txq *nm_txq; #endif char name[16]; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct ifnet *ifp = vi->ifp; struct sysctl_oid *oid = device_get_sysctl_tree(vi->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); int maxp, mtu = ifp->if_mtu; /* Interrupt vector to start from (when using multiple vectors) */ intr_idx = first_vector(vi); #ifdef DEV_NETMAP if (vi->flags & VI_NETMAP) { /* * We don't have buffers to back the netmap rx queues * right now so we create the queues in a way that * doesn't set off any congestion signal in the chip. */ oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); for_each_nm_rxq(vi, i, nm_rxq) { rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; } oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD, NULL, "tx queues"); for_each_nm_txq(vi, i, nm_txq) { iqid = vi->first_rxq + (i % vi->nrxq); rc = alloc_nm_txq(vi, nm_txq, iqid, i, oid); if (rc != 0) goto done; } goto done; } #endif /* * First pass over all NIC and TOE rx queues: * a) initialize iq and fl * b) allocate queue iff it will take direct interrupts. */ maxp = mtu_to_max_payload(sc, mtu, 0); if (vi->flags & INTR_RXQ) { oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); } for_each_rxq(vi, i, rxq) { init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq); snprintf(name, sizeof(name), "%s rxq%d-fl", device_get_nameunit(vi->dev), i); init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name); if (vi->flags & INTR_RXQ) { rxq->iq.flags |= IQ_INTR; rc = alloc_rxq(vi, rxq, intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; } } #ifdef TCP_OFFLOAD maxp = mtu_to_max_payload(sc, mtu, 1); if (vi->flags & INTR_OFLD_RXQ) { oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq", CTLFLAG_RD, NULL, "rx queues for offloaded TCP connections"); } for_each_ofld_rxq(vi, i, ofld_rxq) { init_iq(&ofld_rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq); snprintf(name, sizeof(name), "%s ofld_rxq%d-fl", device_get_nameunit(vi->dev), i); init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name); if (vi->flags & INTR_OFLD_RXQ) { ofld_rxq->iq.flags |= IQ_INTR; rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; } } #endif /* * Second pass over all NIC and TOE rx queues. The queues forwarding * their interrupts are allocated now. */ j = 0; if (!(vi->flags & INTR_RXQ)) { oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); for_each_rxq(vi, i, rxq) { MPASS(!(rxq->iq.flags & IQ_INTR)); intr_idx = vi_intr_iq(vi, j)->abs_id; rc = alloc_rxq(vi, rxq, intr_idx, i, oid); if (rc != 0) goto done; j++; } } #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0 && !(vi->flags & INTR_OFLD_RXQ)) { oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq", CTLFLAG_RD, NULL, "rx queues for offloaded TCP connections"); for_each_ofld_rxq(vi, i, ofld_rxq) { MPASS(!(ofld_rxq->iq.flags & IQ_INTR)); intr_idx = vi_intr_iq(vi, j)->abs_id; rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid); if (rc != 0) goto done; j++; } } #endif /* * Now the tx queues. Only one pass needed. */ oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD, NULL, "tx queues"); j = 0; for_each_txq(vi, i, txq) { iqid = vi_intr_iq(vi, j)->cntxt_id; snprintf(name, sizeof(name), "%s txq%d", device_get_nameunit(vi->dev), i); init_eq(&txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan, iqid, name); rc = alloc_txq(vi, txq, i, oid); if (rc != 0) goto done; j++; } #ifdef TCP_OFFLOAD oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_txq", CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections"); for_each_ofld_txq(vi, i, ofld_txq) { struct sysctl_oid *oid2; iqid = vi_intr_iq(vi, j)->cntxt_id; snprintf(name, sizeof(name), "%s ofld_txq%d", device_get_nameunit(vi->dev), i); init_eq(&ofld_txq->eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan, iqid, name); snprintf(name, sizeof(name), "%d", i); oid2 = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, name, CTLFLAG_RD, NULL, "offload tx queue"); rc = alloc_wrq(sc, vi, ofld_txq, oid2); if (rc != 0) goto done; j++; } #endif /* * Finally, the control queue. */ if (!IS_MAIN_VI(vi)) goto done; oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ctrlq", CTLFLAG_RD, NULL, "ctrl queue"); ctrlq = &sc->sge.ctrlq[pi->port_id]; iqid = vi_intr_iq(vi, 0)->cntxt_id; snprintf(name, sizeof(name), "%s ctrlq", device_get_nameunit(vi->dev)); init_eq(&ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, pi->tx_chan, iqid, name); rc = alloc_wrq(sc, vi, ctrlq, oid); done: if (rc) t4_teardown_vi_queues(vi); return (rc); } /* * Idempotent */ int t4_teardown_vi_queues(struct vi_info *vi) { int i; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct sge_rxq *rxq; struct sge_txq *txq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif #ifdef DEV_NETMAP struct sge_nm_rxq *nm_rxq; struct sge_nm_txq *nm_txq; #endif /* Do this before freeing the queues */ if (vi->flags & VI_SYSCTL_CTX) { sysctl_ctx_free(&vi->ctx); vi->flags &= ~VI_SYSCTL_CTX; } #ifdef DEV_NETMAP if (vi->flags & VI_NETMAP) { for_each_nm_txq(vi, i, nm_txq) { free_nm_txq(vi, nm_txq); } for_each_nm_rxq(vi, i, nm_rxq) { free_nm_rxq(vi, nm_rxq); } return (0); } #endif /* * Take down all the tx queues first, as they reference the rx queues * (for egress updates, etc.). */ if (IS_MAIN_VI(vi)) free_wrq(sc, &sc->sge.ctrlq[pi->port_id]); for_each_txq(vi, i, txq) { free_txq(vi, txq); } #ifdef TCP_OFFLOAD for_each_ofld_txq(vi, i, ofld_txq) { free_wrq(sc, ofld_txq); } #endif /* * Then take down the rx queues that forward their interrupts, as they * reference other rx queues. */ for_each_rxq(vi, i, rxq) { if ((rxq->iq.flags & IQ_INTR) == 0) free_rxq(vi, rxq); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { if ((ofld_rxq->iq.flags & IQ_INTR) == 0) free_ofld_rxq(vi, ofld_rxq); } #endif /* * Then take down the rx queues that take direct interrupts. */ for_each_rxq(vi, i, rxq) { if (rxq->iq.flags & IQ_INTR) free_rxq(vi, rxq); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { if (ofld_rxq->iq.flags & IQ_INTR) free_ofld_rxq(vi, ofld_rxq); } #endif return (0); } /* * Deals with errors and the firmware event queue. All data rx queues forward * their interrupt to the firmware event queue. */ void t4_intr_all(void *arg) { struct adapter *sc = arg; struct sge_iq *fwq = &sc->sge.fwq; t4_intr_err(arg); if (atomic_cmpset_int(&fwq->state, IQS_IDLE, IQS_BUSY)) { service_iq(fwq, 0); atomic_cmpset_int(&fwq->state, IQS_BUSY, IQS_IDLE); } } /* Deals with error interrupts */ void t4_intr_err(void *arg) { struct adapter *sc = arg; t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0); t4_slow_intr_handler(sc); } void t4_intr_evt(void *arg) { struct sge_iq *iq = arg; if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { service_iq(iq, 0); atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); } } void t4_intr(void *arg) { struct sge_iq *iq = arg; if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { service_iq(iq, 0); atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); } } /* * Deals with anything and everything on the given ingress queue. */ static int service_iq(struct sge_iq *iq, int budget) { struct sge_iq *q; struct sge_rxq *rxq = iq_to_rxq(iq); /* Use iff iq is part of rxq */ struct sge_fl *fl; /* Use iff IQ_HAS_FL */ struct adapter *sc = iq->adapter; struct iq_desc *d = &iq->desc[iq->cidx]; int ndescs = 0, limit; int rsp_type, refill; uint32_t lq; uint16_t fl_hw_cidx; struct mbuf *m0; STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql); #if defined(INET) || defined(INET6) const struct timeval lro_timeout = {0, sc->lro_timeout}; #endif KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq)); limit = budget ? budget : iq->qsize / 16; if (iq->flags & IQ_HAS_FL) { fl = &rxq->fl; fl_hw_cidx = fl->hw_cidx; /* stable snapshot */ } else { fl = NULL; fl_hw_cidx = 0; /* to silence gcc warning */ } /* * We always come back and check the descriptor ring for new indirect * interrupts and other responses after running a single handler. */ for (;;) { while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) { rmb(); refill = 0; m0 = NULL; rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen); lq = be32toh(d->rsp.pldbuflen_qid); switch (rsp_type) { case X_RSPD_TYPE_FLBUF: KASSERT(iq->flags & IQ_HAS_FL, ("%s: data for an iq (%p) with no freelist", __func__, iq)); m0 = get_fl_payload(sc, fl, lq); if (__predict_false(m0 == NULL)) goto process_iql; refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2; #ifdef T4_PKT_TIMESTAMP /* * 60 bit timestamp for the payload is * *(uint64_t *)m0->m_pktdat. Note that it is * in the leading free-space in the mbuf. The * kernel can clobber it during a pullup, * m_copymdata, etc. You need to make sure that * the mbuf reaches you unmolested if you care * about the timestamp. */ *(uint64_t *)m0->m_pktdat = be64toh(ctrl->u.last_flit) & 0xfffffffffffffff; #endif /* fall through */ case X_RSPD_TYPE_CPL: KASSERT(d->rss.opcode < NUM_CPL_CMDS, ("%s: bad opcode %02x.", __func__, d->rss.opcode)); sc->cpl_handler[d->rss.opcode](iq, &d->rss, m0); break; case X_RSPD_TYPE_INTR: /* * Interrupts should be forwarded only to queues * that are not forwarding their interrupts. * This means service_iq can recurse but only 1 * level deep. */ KASSERT(budget == 0, ("%s: budget %u, rsp_type %u", __func__, budget, rsp_type)); /* * There are 1K interrupt-capable queues (qids 0 * through 1023). A response type indicating a * forwarded interrupt with a qid >= 1K is an * iWARP async notification. */ if (lq >= 1024) { sc->an_handler(iq, &d->rsp); break; } q = sc->sge.iqmap[lq - sc->sge.iq_start]; if (atomic_cmpset_int(&q->state, IQS_IDLE, IQS_BUSY)) { if (service_iq(q, q->qsize / 16) == 0) { atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE); } else { STAILQ_INSERT_TAIL(&iql, q, link); } } break; default: KASSERT(0, ("%s: illegal response type %d on iq %p", __func__, rsp_type, iq)); log(LOG_ERR, "%s: illegal response type %d on iq %p", device_get_nameunit(sc->dev), rsp_type, iq); break; } d++; if (__predict_false(++iq->cidx == iq->sidx)) { iq->cidx = 0; iq->gen ^= F_RSPD_GEN; d = &iq->desc[0]; } if (__predict_false(++ndescs == limit)) { t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_CIDXINC(ndescs) | V_INGRESSQID(iq->cntxt_id) | V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX))); ndescs = 0; #if defined(INET) || defined(INET6) if (iq->flags & IQ_LRO_ENABLED && sc->lro_timeout != 0) { tcp_lro_flush_inactive(&rxq->lro, &lro_timeout); } #endif if (budget) { if (iq->flags & IQ_HAS_FL) { FL_LOCK(fl); refill_fl(sc, fl, 32); FL_UNLOCK(fl); } return (EINPROGRESS); } } if (refill) { FL_LOCK(fl); refill_fl(sc, fl, 32); FL_UNLOCK(fl); fl_hw_cidx = fl->hw_cidx; } } process_iql: if (STAILQ_EMPTY(&iql)) break; /* * Process the head only, and send it to the back of the list if * it's still not done. */ q = STAILQ_FIRST(&iql); STAILQ_REMOVE_HEAD(&iql, link); if (service_iq(q, q->qsize / 8) == 0) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE); else STAILQ_INSERT_TAIL(&iql, q, link); } #if defined(INET) || defined(INET6) if (iq->flags & IQ_LRO_ENABLED) { struct lro_ctrl *lro = &rxq->lro; struct lro_entry *l; while (!SLIST_EMPTY(&lro->lro_active)) { l = SLIST_FIRST(&lro->lro_active); SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, l); } } #endif t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_CIDXINC(ndescs) | V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params)); if (iq->flags & IQ_HAS_FL) { int starved; FL_LOCK(fl); starved = refill_fl(sc, fl, 64); FL_UNLOCK(fl); if (__predict_false(starved != 0)) add_fl_to_sfl(sc, fl); } return (0); } static inline int cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll) { int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0; if (rc) MPASS(cll->region3 >= CL_METADATA_SIZE); return (rc); } static inline struct cluster_metadata * cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll, caddr_t cl) { if (cl_has_metadata(fl, cll)) { struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; return ((struct cluster_metadata *)(cl + swz->size) - 1); } return (NULL); } static void rxb_free(struct mbuf *m, void *arg1, void *arg2) { uma_zone_t zone = arg1; caddr_t cl = arg2; uma_zfree(zone, cl); counter_u64_add(extfree_rels, 1); } /* * The mbuf returned by this function could be allocated from zone_mbuf or * constructed in spare room in the cluster. * * The mbuf carries the payload in one of these ways * a) frame inside the mbuf (mbuf from zone_mbuf) * b) m_cljset (for clusters without metadata) zone_mbuf * c) m_extaddref (cluster with metadata) inline mbuf * d) m_extaddref (cluster with metadata) zone_mbuf */ static struct mbuf * get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset, int remaining) { struct mbuf *m; struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; struct cluster_layout *cll = &sd->cll; struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx]; struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl); int len, blen; caddr_t payload; blen = hwb->size - fl->rx_offset; /* max possible in this buf */ len = min(remaining, blen); payload = sd->cl + cll->region1 + fl->rx_offset; if (fl->flags & FL_BUF_PACKING) { const u_int l = fr_offset + len; const u_int pad = roundup2(l, fl->buf_boundary) - l; if (fl->rx_offset + len + pad < hwb->size) blen = len + pad; MPASS(fl->rx_offset + blen <= hwb->size); } else { MPASS(fl->rx_offset == 0); /* not packing */ } if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) { /* * Copy payload into a freshly allocated mbuf. */ m = fr_offset == 0 ? m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); fl->mbuf_allocated++; #ifdef T4_PKT_TIMESTAMP /* Leave room for a timestamp */ m->m_data += 8; #endif /* copy data to mbuf */ bcopy(payload, mtod(m, caddr_t), len); } else if (sd->nmbuf * MSIZE < cll->region1) { /* * There's spare room in the cluster for an mbuf. Create one * and associate it with the payload that's in the cluster. */ MPASS(clm != NULL); m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE); /* No bzero required */ if (m_init(m, NULL, 0, M_NOWAIT, MT_DATA, fr_offset == 0 ? M_PKTHDR | M_NOFREE : M_NOFREE)) return (NULL); fl->mbuf_inlined++; m_extaddref(m, payload, blen, &clm->refcount, rxb_free, swz->zone, sd->cl); if (sd->nmbuf++ == 0) counter_u64_add(extfree_refs, 1); } else { /* * Grab an mbuf from zone_mbuf and associate it with the * payload in the cluster. */ m = fr_offset == 0 ? m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); fl->mbuf_allocated++; if (clm != NULL) { m_extaddref(m, payload, blen, &clm->refcount, rxb_free, swz->zone, sd->cl); if (sd->nmbuf++ == 0) counter_u64_add(extfree_refs, 1); } else { m_cljset(m, sd->cl, swz->type); sd->cl = NULL; /* consumed, not a recycle candidate */ } } if (fr_offset == 0) m->m_pkthdr.len = remaining; m->m_len = len; if (fl->flags & FL_BUF_PACKING) { fl->rx_offset += blen; MPASS(fl->rx_offset <= hwb->size); if (fl->rx_offset < hwb->size) return (m); /* without advancing the cidx */ } if (__predict_false(++fl->cidx % 8 == 0)) { uint16_t cidx = fl->cidx / 8; if (__predict_false(cidx == fl->sidx)) fl->cidx = cidx = 0; fl->hw_cidx = cidx; } fl->rx_offset = 0; return (m); } static struct mbuf * get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf) { struct mbuf *m0, *m, **pnext; u_int remaining; const u_int total = G_RSPD_LEN(len_newbuf); if (__predict_false(fl->flags & FL_BUF_RESUME)) { M_ASSERTPKTHDR(fl->m0); MPASS(fl->m0->m_pkthdr.len == total); MPASS(fl->remaining < total); m0 = fl->m0; pnext = fl->pnext; remaining = fl->remaining; fl->flags &= ~FL_BUF_RESUME; goto get_segment; } if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) { fl->rx_offset = 0; if (__predict_false(++fl->cidx % 8 == 0)) { uint16_t cidx = fl->cidx / 8; if (__predict_false(cidx == fl->sidx)) fl->cidx = cidx = 0; fl->hw_cidx = cidx; } } /* * Payload starts at rx_offset in the current hw buffer. Its length is * 'len' and it may span multiple hw buffers. */ m0 = get_scatter_segment(sc, fl, 0, total); if (m0 == NULL) return (NULL); remaining = total - m0->m_len; pnext = &m0->m_next; while (remaining > 0) { get_segment: MPASS(fl->rx_offset == 0); m = get_scatter_segment(sc, fl, total - remaining, remaining); if (__predict_false(m == NULL)) { fl->m0 = m0; fl->pnext = pnext; fl->remaining = remaining; fl->flags |= FL_BUF_RESUME; return (NULL); } *pnext = m; pnext = &m->m_next; remaining -= m->m_len; } *pnext = NULL; M_ASSERTPKTHDR(m0); return (m0); } static int t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0) { struct sge_rxq *rxq = iq_to_rxq(iq); struct ifnet *ifp = rxq->ifp; const struct cpl_rx_pkt *cpl = (const void *)(rss + 1); #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxq->lro; #endif static const int sw_hashtype[4][2] = { {M_HASHTYPE_NONE, M_HASHTYPE_NONE}, {M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6}, {M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6}, {M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6}, }; KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__, rss->opcode)); m0->m_pkthdr.len -= fl_pktshift; m0->m_len -= fl_pktshift; m0->m_data += fl_pktshift; m0->m_pkthdr.rcvif = ifp; M_HASHTYPE_SET(m0, sw_hashtype[rss->hash_type][rss->ipv6]); m0->m_pkthdr.flowid = be32toh(rss->hash_val); if (cpl->csum_calc && !cpl->err_vec) { if (ifp->if_capenable & IFCAP_RXCSUM && cpl->l2info & htobe32(F_RXF_IP)) { m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); rxq->rxcsum++; } else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 && cpl->l2info & htobe32(F_RXF_IP6)) { m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR); rxq->rxcsum++; } if (__predict_false(cpl->ip_frag)) m0->m_pkthdr.csum_data = be16toh(cpl->csum); else m0->m_pkthdr.csum_data = 0xffff; } if (cpl->vlan_ex) { m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan); m0->m_flags |= M_VLANTAG; rxq->vlan_extraction++; } #if defined(INET) || defined(INET6) if (cpl->l2info & htobe32(F_RXF_LRO) && iq->flags & IQ_LRO_ENABLED && tcp_lro_rx(lro, m0, 0) == 0) { /* queued for LRO */ } else #endif ifp->if_input(ifp, m0); return (0); } /* * Must drain the wrq or make sure that someone else will. */ static void wrq_tx_drain(void *arg, int n) { struct sge_wrq *wrq = arg; struct sge_eq *eq = &wrq->eq; EQ_LOCK(eq); if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(wrq->adapter, wrq); EQ_UNLOCK(eq); } static void drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq) { struct sge_eq *eq = &wrq->eq; u_int available, dbdiff; /* # of hardware descriptors */ u_int n; struct wrqe *wr; struct fw_eth_tx_pkt_wr *dst; /* any fw WR struct will do */ EQ_LOCK_ASSERT_OWNED(eq); MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs)); wr = STAILQ_FIRST(&wrq->wr_list); MPASS(wr != NULL); /* Must be called with something useful to do */ dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx); do { eq->cidx = read_hw_cidx(eq); if (eq->pidx == eq->cidx) available = eq->sidx - 1; else available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; MPASS(wr->wrq == wrq); n = howmany(wr->wr_len, EQ_ESIZE); if (available < n) return; dst = (void *)&eq->desc[eq->pidx]; if (__predict_true(eq->sidx - eq->pidx > n)) { /* Won't wrap, won't end exactly at the status page. */ bcopy(&wr->wr[0], dst, wr->wr_len); eq->pidx += n; } else { int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE; bcopy(&wr->wr[0], dst, first_portion); if (wr->wr_len > first_portion) { bcopy(&wr->wr[first_portion], &eq->desc[0], wr->wr_len - first_portion); } eq->pidx = n - (eq->sidx - eq->pidx); } if (available < eq->sidx / 4 && atomic_cmpset_int(&eq->equiq, 0, 1)) { dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } dbdiff += n; if (dbdiff >= 16) { ring_eq_db(sc, eq, dbdiff); dbdiff = 0; } STAILQ_REMOVE_HEAD(&wrq->wr_list, link); free_wrqe(wr); MPASS(wrq->nwr_pending > 0); wrq->nwr_pending--; MPASS(wrq->ndesc_needed >= n); wrq->ndesc_needed -= n; } while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL); if (dbdiff) ring_eq_db(sc, eq, dbdiff); } /* * Doesn't fail. Holds on to work requests it can't send right away. */ void t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr) { #ifdef INVARIANTS struct sge_eq *eq = &wrq->eq; #endif EQ_LOCK_ASSERT_OWNED(eq); MPASS(wr != NULL); MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN); MPASS((wr->wr_len & 0x7) == 0); STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link); wrq->nwr_pending++; wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE); if (!TAILQ_EMPTY(&wrq->incomplete_wrs)) return; /* commit_wrq_wr will drain wr_list as well. */ drain_wrq_wr_list(sc, wrq); /* Doorbell must have caught up to the pidx. */ MPASS(eq->pidx == eq->dbidx); } void t4_update_fl_bufsize(struct ifnet *ifp) { struct vi_info *vi = ifp->if_softc; struct adapter *sc = vi->pi->adapter; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif struct sge_fl *fl; int i, maxp, mtu = ifp->if_mtu; maxp = mtu_to_max_payload(sc, mtu, 0); for_each_rxq(vi, i, rxq) { fl = &rxq->fl; FL_LOCK(fl); find_best_refill_source(sc, fl, maxp); FL_UNLOCK(fl); } #ifdef TCP_OFFLOAD maxp = mtu_to_max_payload(sc, mtu, 1); for_each_ofld_rxq(vi, i, ofld_rxq) { fl = &ofld_rxq->fl; FL_LOCK(fl); find_best_refill_source(sc, fl, maxp); FL_UNLOCK(fl); } #endif } static inline int mbuf_nsegs(struct mbuf *m) { M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.l5hlen > 0, ("%s: mbuf %p missing information on # of segments.", __func__, m)); return (m->m_pkthdr.l5hlen); } static inline void set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs) { M_ASSERTPKTHDR(m); m->m_pkthdr.l5hlen = nsegs; } static inline int mbuf_len16(struct mbuf *m) { int n; M_ASSERTPKTHDR(m); n = m->m_pkthdr.PH_loc.eight[0]; MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16); return (n); } static inline void set_mbuf_len16(struct mbuf *m, uint8_t len16) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_loc.eight[0] = len16; } static inline int needs_tso(struct mbuf *m) { M_ASSERTPKTHDR(m); if (m->m_pkthdr.csum_flags & CSUM_TSO) { KASSERT(m->m_pkthdr.tso_segsz > 0, ("%s: TSO requested in mbuf %p but MSS not provided", __func__, m)); return (1); } return (0); } static inline int needs_l3_csum(struct mbuf *m) { M_ASSERTPKTHDR(m); if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) return (1); return (0); } static inline int needs_l4_csum(struct mbuf *m) { M_ASSERTPKTHDR(m); if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) return (1); return (0); } static inline int needs_vlan_insertion(struct mbuf *m) { M_ASSERTPKTHDR(m); if (m->m_flags & M_VLANTAG) { KASSERT(m->m_pkthdr.ether_vtag != 0, ("%s: HWVLAN requested in mbuf %p but tag not provided", __func__, m)); return (1); } return (0); } static void * m_advance(struct mbuf **pm, int *poffset, int len) { struct mbuf *m = *pm; int offset = *poffset; uintptr_t p = 0; MPASS(len > 0); while (len) { if (offset + len < m->m_len) { offset += len; p = mtod(m, uintptr_t) + offset; break; } len -= m->m_len - offset; m = m->m_next; offset = 0; MPASS(m != NULL); } *poffset = offset; *pm = m; return ((void *)p); } static inline int same_paddr(char *a, char *b) { if (a == b) return (1); else if (a != NULL && b != NULL) { vm_offset_t x = (vm_offset_t)a; vm_offset_t y = (vm_offset_t)b; if ((x & PAGE_MASK) == (y & PAGE_MASK) && pmap_kextract(x) == pmap_kextract(y)) return (1); } return (0); } /* * Can deal with empty mbufs in the chain that have m_len = 0, but the chain * must have at least one mbuf that's not empty. */ static inline int count_mbuf_nsegs(struct mbuf *m) { char *prev_end, *start; int len, nsegs; MPASS(m != NULL); nsegs = 0; prev_end = NULL; for (; m; m = m->m_next) { len = m->m_len; if (__predict_false(len == 0)) continue; start = mtod(m, char *); nsegs += sglist_count(start, len); if (same_paddr(prev_end, start)) nsegs--; prev_end = start + len; } MPASS(nsegs > 0); return (nsegs); } /* * Analyze the mbuf to determine its tx needs. The mbuf passed in may change: * a) caller can assume it's been freed if this function returns with an error. * b) it may get defragged up if the gather list is too long for the hardware. */ int parse_pkt(struct mbuf **mp) { struct mbuf *m0 = *mp, *m; int rc, nsegs, defragged = 0, offset; struct ether_header *eh; void *l3hdr; #if defined(INET) || defined(INET6) struct tcphdr *tcp; #endif uint16_t eh_type; M_ASSERTPKTHDR(m0); if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) { rc = EINVAL; fail: m_freem(m0); *mp = NULL; return (rc); } restart: /* * First count the number of gather list segments in the payload. * Defrag the mbuf if nsegs exceeds the hardware limit. */ M_ASSERTPKTHDR(m0); MPASS(m0->m_pkthdr.len > 0); nsegs = count_mbuf_nsegs(m0); if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) { if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) { rc = EFBIG; goto fail; } *mp = m0 = m; /* update caller's copy after defrag */ goto restart; } if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) { m0 = m_pullup(m0, m0->m_pkthdr.len); if (m0 == NULL) { /* Should have left well enough alone. */ rc = EFBIG; goto fail; } *mp = m0; /* update caller's copy after pullup */ goto restart; } set_mbuf_nsegs(m0, nsegs); set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0))); if (!needs_tso(m0)) return (0); m = m0; eh = mtod(m, struct ether_header *); eh_type = ntohs(eh->ether_type); if (eh_type == ETHERTYPE_VLAN) { struct ether_vlan_header *evh = (void *)eh; eh_type = ntohs(evh->evl_proto); m0->m_pkthdr.l2hlen = sizeof(*evh); } else m0->m_pkthdr.l2hlen = sizeof(*eh); offset = 0; l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen); switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6 = l3hdr; MPASS(ip6->ip6_nxt == IPPROTO_TCP); m0->m_pkthdr.l3hlen = sizeof(*ip6); break; } #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip = l3hdr; m0->m_pkthdr.l3hlen = ip->ip_hl * 4; break; } #endif default: panic("%s: ethertype 0x%04x unknown. if_cxgbe must be compiled" " with the same INET/INET6 options as the kernel.", __func__, eh_type); } #if defined(INET) || defined(INET6) tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen); m0->m_pkthdr.l4hlen = tcp->th_off * 4; #endif MPASS(m0 == *mp); return (0); } void * start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie) { struct sge_eq *eq = &wrq->eq; struct adapter *sc = wrq->adapter; int ndesc, available; struct wrqe *wr; void *w; MPASS(len16 > 0); ndesc = howmany(len16, EQ_ESIZE / 16); MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC); EQ_LOCK(eq); if (!STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(sc, wrq); if (!STAILQ_EMPTY(&wrq->wr_list)) { slowpath: EQ_UNLOCK(eq); wr = alloc_wrqe(len16 * 16, wrq); if (__predict_false(wr == NULL)) return (NULL); cookie->pidx = -1; cookie->ndesc = ndesc; return (&wr->wr); } eq->cidx = read_hw_cidx(eq); if (eq->pidx == eq->cidx) available = eq->sidx - 1; else available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; if (available < ndesc) goto slowpath; cookie->pidx = eq->pidx; cookie->ndesc = ndesc; TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link); w = &eq->desc[eq->pidx]; IDXINCR(eq->pidx, ndesc, eq->sidx); if (__predict_false(eq->pidx < ndesc - 1)) { w = &wrq->ss[0]; wrq->ss_pidx = cookie->pidx; wrq->ss_len = len16 * 16; } EQ_UNLOCK(eq); return (w); } void commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie) { struct sge_eq *eq = &wrq->eq; struct adapter *sc = wrq->adapter; int ndesc, pidx; struct wrq_cookie *prev, *next; if (cookie->pidx == -1) { struct wrqe *wr = __containerof(w, struct wrqe, wr); t4_wrq_tx(sc, wr); return; } ndesc = cookie->ndesc; /* Can be more than SGE_MAX_WR_NDESC here. */ pidx = cookie->pidx; MPASS(pidx >= 0 && pidx < eq->sidx); if (__predict_false(w == &wrq->ss[0])) { int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE; MPASS(wrq->ss_len > n); /* WR had better wrap around. */ bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n); bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n); wrq->tx_wrs_ss++; } else wrq->tx_wrs_direct++; EQ_LOCK(eq); prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link); next = TAILQ_NEXT(cookie, link); if (prev == NULL) { MPASS(pidx == eq->dbidx); if (next == NULL || ndesc >= 16) ring_eq_db(wrq->adapter, eq, ndesc); else { MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc); next->pidx = pidx; next->ndesc += ndesc; } } else { MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc); prev->ndesc += ndesc; } TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link); if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(sc, wrq); #ifdef INVARIANTS if (TAILQ_EMPTY(&wrq->incomplete_wrs)) { /* Doorbell must have caught up to the pidx. */ MPASS(wrq->eq.pidx == wrq->eq.dbidx); } #endif EQ_UNLOCK(eq); } static u_int can_resume_eth_tx(struct mp_ring *r) { struct sge_eq *eq = r->cookie; return (total_available_tx_desc(eq) > eq->sidx / 8); } static inline int cannot_use_txpkts(struct mbuf *m) { /* maybe put a GL limit too, to avoid silliness? */ return (needs_tso(m)); } /* * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to * be consumed. Return the actual number consumed. 0 indicates a stall. */ static u_int eth_tx(struct mp_ring *r, u_int cidx, u_int pidx) { struct sge_txq *txq = r->cookie; struct sge_eq *eq = &txq->eq; struct ifnet *ifp = txq->ifp; struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; u_int total, remaining; /* # of packets */ u_int available, dbdiff; /* # of hardware descriptors */ u_int n, next_cidx; struct mbuf *m0, *tail; struct txpkts txp; struct fw_eth_tx_pkts_wr *wr; /* any fw WR struct will do */ remaining = IDXDIFF(pidx, cidx, r->size); MPASS(remaining > 0); /* Must not be called without work to do. */ total = 0; TXQ_LOCK(txq); if (__predict_false((eq->flags & EQ_ENABLED) == 0)) { while (cidx != pidx) { m0 = r->items[cidx]; m_freem(m0); if (++cidx == r->size) cidx = 0; } reclaim_tx_descs(txq, 2048); total = remaining; goto done; } /* How many hardware descriptors do we have readily available. */ if (eq->pidx == eq->cidx) available = eq->sidx - 1; else available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx); while (remaining > 0) { m0 = r->items[cidx]; M_ASSERTPKTHDR(m0); MPASS(m0->m_nextpkt == NULL); if (available < SGE_MAX_WR_NDESC) { available += reclaim_tx_descs(txq, 64); if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16)) break; /* out of descriptors */ } next_cidx = cidx + 1; if (__predict_false(next_cidx == r->size)) next_cidx = 0; wr = (void *)&eq->desc[eq->pidx]; if (remaining > 1 && try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) { /* pkts at cidx, next_cidx should both be in txp. */ MPASS(txp.npkt == 2); tail = r->items[next_cidx]; MPASS(tail->m_nextpkt == NULL); ETHER_BPF_MTAP(ifp, m0); ETHER_BPF_MTAP(ifp, tail); m0->m_nextpkt = tail; if (__predict_false(++next_cidx == r->size)) next_cidx = 0; while (next_cidx != pidx) { if (add_to_txpkts(r->items[next_cidx], &txp, available) != 0) break; tail->m_nextpkt = r->items[next_cidx]; tail = tail->m_nextpkt; ETHER_BPF_MTAP(ifp, tail); if (__predict_false(++next_cidx == r->size)) next_cidx = 0; } n = write_txpkts_wr(txq, wr, m0, &txp, available); total += txp.npkt; remaining -= txp.npkt; } else { total++; remaining--; n = write_txpkt_wr(txq, (void *)wr, m0, available); ETHER_BPF_MTAP(ifp, m0); } MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC); available -= n; dbdiff += n; IDXINCR(eq->pidx, n, eq->sidx); if (total_available_tx_desc(eq) < eq->sidx / 4 && atomic_cmpset_int(&eq->equiq, 0, 1)) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } if (dbdiff >= 16 && remaining >= 4) { ring_eq_db(sc, eq, dbdiff); available += reclaim_tx_descs(txq, 4 * dbdiff); dbdiff = 0; } cidx = next_cidx; } if (dbdiff != 0) { ring_eq_db(sc, eq, dbdiff); reclaim_tx_descs(txq, 32); } done: TXQ_UNLOCK(txq); return (total); } static inline void init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx, int qsize) { KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS, ("%s: bad tmr_idx %d", __func__, tmr_idx)); KASSERT(pktc_idx < SGE_NCOUNTERS, /* -ve is ok, means don't use */ ("%s: bad pktc_idx %d", __func__, pktc_idx)); iq->flags = 0; iq->adapter = sc; iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx); iq->intr_pktc_idx = SGE_NCOUNTERS - 1; if (pktc_idx >= 0) { iq->intr_params |= F_QINTR_CNT_EN; iq->intr_pktc_idx = pktc_idx; } iq->qsize = roundup2(qsize, 16); /* See FW_IQ_CMD/iqsize */ iq->sidx = iq->qsize - spg_len / IQ_ESIZE; } static inline void init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name) { fl->qsize = qsize; fl->sidx = qsize - spg_len / EQ_ESIZE; strlcpy(fl->lockname, name, sizeof(fl->lockname)); if (sc->flags & BUF_PACKING_OK && ((!is_t4(sc) && buffer_packing) || /* T5+: enabled unless 0 */ (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */ fl->flags |= FL_BUF_PACKING; find_best_refill_source(sc, fl, maxp); find_safe_refill_source(sc, fl); } static inline void init_eq(struct sge_eq *eq, int eqtype, int qsize, uint8_t tx_chan, uint16_t iqid, char *name) { KASSERT(tx_chan < NCHAN, ("%s: bad tx channel %d", __func__, tx_chan)); KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype)); eq->flags = eqtype & EQ_TYPEMASK; eq->tx_chan = tx_chan; eq->iqid = iqid; eq->sidx = qsize - spg_len / EQ_ESIZE; strlcpy(eq->lockname, name, sizeof(eq->lockname)); } static int alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag, bus_dmamap_t *map, bus_addr_t *pa, void **va) { int rc; rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag); if (rc != 0) { device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc); goto done; } rc = bus_dmamem_alloc(*tag, va, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map); if (rc != 0) { device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc); goto done; } rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0); if (rc != 0) { device_printf(sc->dev, "cannot load DMA map: %d\n", rc); goto done; } done: if (rc) free_ring(sc, *tag, *map, *pa, *va); return (rc); } static int free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map, bus_addr_t pa, void *va) { if (pa) bus_dmamap_unload(tag, map); if (va) bus_dmamem_free(tag, va, map); if (tag) bus_dma_tag_destroy(tag); return (0); } /* * Allocates the ring for an ingress queue and an optional freelist. If the * freelist is specified it will be allocated and then associated with the * ingress queue. * * Returns errno on failure. Resources allocated up to that point may still be * allocated. Caller is responsible for cleanup in case this function fails. * * If the ingress queue will take interrupts directly (iq->flags & IQ_INTR) then * the intr_idx specifies the vector, starting from 0. Otherwise it specifies * the abs_id of the ingress queue to which its interrupts should be forwarded. */ static int alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl, int intr_idx, int cong) { int rc, i, cntxt_id; size_t len; struct fw_iq_cmd c; struct port_info *pi = vi->pi; struct adapter *sc = iq->adapter; __be32 v = 0; len = iq->qsize * IQ_ESIZE; rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba, (void **)&iq->desc); if (rc != 0) return (rc); bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) | V_FW_IQ_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART | FW_LEN16(c)); /* Special handling for firmware event queue */ if (iq == &sc->sge.fwq) v |= F_FW_IQ_CMD_IQASYNCH; if (iq->flags & IQ_INTR) { KASSERT(intr_idx < sc->intr_count, ("%s: invalid direct intr_idx %d", __func__, intr_idx)); } else v |= F_FW_IQ_CMD_IQANDST; v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx); c.type_to_iqandstindex = htobe32(v | V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) | V_FW_IQ_CMD_VIID(vi->viid) | V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT)); c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) | F_FW_IQ_CMD_IQGTSMODE | V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) | V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4)); c.iqsize = htobe16(iq->qsize); c.iqaddr = htobe64(iq->ba); if (cong >= 0) c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN); if (fl) { mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF); len = fl->qsize * EQ_ESIZE; rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map, &fl->ba, (void **)&fl->desc); if (rc) return (rc); /* Allocate space for one software descriptor per buffer. */ rc = alloc_fl_sdesc(fl); if (rc != 0) { device_printf(sc->dev, "failed to setup fl software descriptors: %d\n", rc); return (rc); } if (fl->flags & FL_BUF_PACKING) { fl->lowat = roundup2(sc->sge.fl_starve_threshold2, 8); fl->buf_boundary = sc->sge.pack_boundary; } else { fl->lowat = roundup2(sc->sge.fl_starve_threshold, 8); fl->buf_boundary = 16; } if (fl_pad && fl->buf_boundary < sc->sge.pad_boundary) fl->buf_boundary = sc->sge.pad_boundary; c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) | F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO | (fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) | (fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN : 0)); if (cong >= 0) { c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) | F_FW_IQ_CMD_FL0CONGCIF | F_FW_IQ_CMD_FL0CONGEN); } c.fl0dcaen_to_fl0cidxfthresh = htobe16(V_FW_IQ_CMD_FL0FBMIN(X_FETCHBURSTMIN_128B) | V_FW_IQ_CMD_FL0FBMAX(X_FETCHBURSTMAX_512B)); c.fl0size = htobe16(fl->qsize); c.fl0addr = htobe64(fl->ba); } rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(sc->dev, "failed to create ingress queue: %d\n", rc); return (rc); } iq->cidx = 0; iq->gen = F_RSPD_GEN; iq->intr_next = iq->intr_params; iq->cntxt_id = be16toh(c.iqid); iq->abs_id = be16toh(c.physiqid); iq->flags |= IQ_ALLOCATED; cntxt_id = iq->cntxt_id - sc->sge.iq_start; if (cntxt_id >= sc->sge.niq) { panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.niq - 1); } sc->sge.iqmap[cntxt_id] = iq; if (fl) { u_int qid; iq->flags |= IQ_HAS_FL; fl->cntxt_id = be16toh(c.fl0id); fl->pidx = fl->cidx = 0; cntxt_id = fl->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) { panic("%s: fl->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); } sc->sge.eqmap[cntxt_id] = (void *)fl; qid = fl->cntxt_id; if (isset(&sc->doorbells, DOORBELL_UDB)) { uint32_t s_qpp = sc->sge.eq_s_qpp; uint32_t mask = (1 << s_qpp) - 1; volatile uint8_t *udb; udb = sc->udbs_base + UDBS_DB_OFFSET; udb += (qid >> s_qpp) << PAGE_SHIFT; qid &= mask; if (qid < PAGE_SIZE / UDBS_SEG_SIZE) { udb += qid << UDBS_SEG_SHIFT; qid = 0; } fl->udb = (volatile void *)udb; } fl->dbval = F_DBPRIO | V_QID(qid); if (is_t5(sc)) fl->dbval |= F_DBTYPE; FL_LOCK(fl); /* Enough to make sure the SGE doesn't think it's starved */ refill_fl(sc, fl, fl->lowat); FL_UNLOCK(fl); } if (is_t5(sc) && cong >= 0) { uint32_t param, val; param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) | V_FW_PARAMS_PARAM_YZ(iq->cntxt_id); if (cong == 0) val = 1 << 19; else { val = 2 << 19; for (i = 0; i < 4; i++) { if (cong & (1 << i)) val |= 1 << (i << 2); } } rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc != 0) { /* report error but carry on */ device_printf(sc->dev, "failed to set congestion manager context for " "ingress queue %d: %d\n", iq->cntxt_id, rc); } } /* Enable IQ interrupts */ atomic_store_rel_int(&iq->state, IQS_IDLE); t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_SEINTARM(iq->intr_params) | V_INGRESSQID(iq->cntxt_id)); return (0); } static int free_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl) { int rc; struct adapter *sc = iq->adapter; device_t dev; if (sc == NULL) return (0); /* nothing to do */ dev = vi ? vi->dev : sc->dev; if (iq->flags & IQ_ALLOCATED) { rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id, fl ? fl->cntxt_id : 0xffff, 0xffff); if (rc != 0) { device_printf(dev, "failed to free queue %p: %d\n", iq, rc); return (rc); } iq->flags &= ~IQ_ALLOCATED; } free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc); bzero(iq, sizeof(*iq)); if (fl) { free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba, fl->desc); if (fl->sdesc) free_fl_sdesc(sc, fl); if (mtx_initialized(&fl->fl_lock)) mtx_destroy(&fl->fl_lock); bzero(fl, sizeof(*fl)); } return (0); } static void add_fl_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, struct sge_fl *fl) { struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL, "freelist"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the freelist"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL, fl_pad ? 1 : 0, "padding enabled"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL, fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx, 0, "consumer index"); if (fl->flags & FL_BUF_PACKING) { SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset", CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset"); } SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx, 0, "producer index"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated", CTLFLAG_RD, &fl->mbuf_allocated, "# of mbuf allocated"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined", CTLFLAG_RD, &fl->mbuf_inlined, "# of mbuf inlined in clusters"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated", CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled", CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled", CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)"); } static int alloc_fwq(struct adapter *sc) { int rc, intr_idx; struct sge_iq *fwq = &sc->sge.fwq; struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE); fwq->flags |= IQ_INTR; /* always */ intr_idx = sc->intr_count > 1 ? 1 : 0; rc = alloc_iq_fl(&sc->port[0]->vi[0], fwq, NULL, intr_idx, -1); if (rc != 0) { device_printf(sc->dev, "failed to create firmware event queue: %d\n", rc); return (rc); } oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD, NULL, "firmware event queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &fwq->abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &fwq->cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &fwq->cidx, 0, sysctl_uint16, "I", "consumer index"); return (0); } static int free_fwq(struct adapter *sc) { return free_iq_fl(NULL, &sc->sge.fwq, NULL); } static int alloc_mgmtq(struct adapter *sc) { int rc; struct sge_wrq *mgmtq = &sc->sge.mgmtq; char name[16]; struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "mgmtq", CTLFLAG_RD, NULL, "management queue"); snprintf(name, sizeof(name), "%s mgmtq", device_get_nameunit(sc->dev)); init_eq(&mgmtq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[0]->tx_chan, sc->sge.fwq.cntxt_id, name); rc = alloc_wrq(sc, NULL, mgmtq, oid); if (rc != 0) { device_printf(sc->dev, "failed to create management queue: %d\n", rc); return (rc); } return (0); } static int free_mgmtq(struct adapter *sc) { return free_wrq(sc, &sc->sge.mgmtq); } int tnl_cong(struct port_info *pi, int drop) { if (drop == -1) return (-1); else if (drop == 1) return (0); else return (pi->rx_chan_map); } static int alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int intr_idx, int idx, struct sysctl_oid *oid) { int rc; struct sysctl_oid_list *children; char name[16]; rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx, tnl_cong(vi->pi, cong_drop)); if (rc != 0) return (rc); /* * The freelist is just barely above the starvation threshold right now, * fill it up a bit more. */ FL_LOCK(&rxq->fl); refill_fl(vi->pi->adapter, &rxq->fl, 128); FL_UNLOCK(&rxq->fl); #if defined(INET) || defined(INET6) rc = tcp_lro_init(&rxq->lro); if (rc != 0) return (rc); rxq->lro.ifp = vi->ifp; /* also indicates LRO init'ed */ if (vi->ifp->if_capenable & IFCAP_LRO) rxq->iq.flags |= IQ_LRO_ENABLED; #endif rxq->ifp = vi->ifp; children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "rx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cidx, 0, sysctl_uint16, "I", "consumer index"); #if defined(INET) || defined(INET6) - SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD, + SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD, &rxq->lro.lro_queued, 0, NULL); - SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD, + SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD, &rxq->lro.lro_flushed, 0, NULL); #endif SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD, &rxq->rxcsum, "# of times hardware assisted with checksum"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction", CTLFLAG_RD, &rxq->vlan_extraction, "# of times hardware extracted 802.1Q tag"); add_fl_sysctls(&vi->ctx, oid, &rxq->fl); return (rc); } static int free_rxq(struct vi_info *vi, struct sge_rxq *rxq) { int rc; #if defined(INET) || defined(INET6) if (rxq->lro.ifp) { tcp_lro_free(&rxq->lro); rxq->lro.ifp = NULL; } #endif rc = free_iq_fl(vi, &rxq->iq, &rxq->fl); if (rc == 0) bzero(rxq, sizeof(*rxq)); return (rc); } #ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq, int intr_idx, int idx, struct sysctl_oid *oid) { int rc; struct sysctl_oid_list *children; char name[16]; rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx, vi->pi->rx_chan_map); if (rc != 0) return (rc); children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "rx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cidx, 0, sysctl_uint16, "I", "consumer index"); add_fl_sysctls(&vi->ctx, oid, &ofld_rxq->fl); return (rc); } static int free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq) { int rc; rc = free_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl); if (rc == 0) bzero(ofld_rxq, sizeof(*ofld_rxq)); return (rc); } #endif #ifdef DEV_NETMAP static int alloc_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int intr_idx, int idx, struct sysctl_oid *oid) { int rc; struct sysctl_oid_list *children; struct sysctl_ctx_list *ctx; char name[16]; size_t len; struct adapter *sc = vi->pi->adapter; struct netmap_adapter *na = NA(vi->ifp); MPASS(na != NULL); len = vi->qsize_rxq * IQ_ESIZE; rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map, &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc); if (rc != 0) return (rc); len = na->num_rx_desc * EQ_ESIZE + spg_len; rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map, &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc); if (rc != 0) return (rc); nm_rxq->vi = vi; nm_rxq->nid = idx; nm_rxq->iq_cidx = 0; nm_rxq->iq_sidx = vi->qsize_rxq - spg_len / IQ_ESIZE; nm_rxq->iq_gen = F_RSPD_GEN; nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0; nm_rxq->fl_sidx = na->num_rx_desc; nm_rxq->intr_idx = intr_idx; ctx = &vi->ctx; children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "rx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I", "consumer index"); children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL, "freelist"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the freelist"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &nm_rxq->fl_cidx, 0, "consumer index"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &nm_rxq->fl_pidx, 0, "producer index"); return (rc); } static int free_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq) { struct adapter *sc = vi->pi->adapter; free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba, nm_rxq->iq_desc); free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba, nm_rxq->fl_desc); return (0); } static int alloc_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq, int iqidx, int idx, struct sysctl_oid *oid) { int rc; size_t len; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct netmap_adapter *na = NA(vi->ifp); char name[16]; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); len = na->num_tx_desc * EQ_ESIZE + spg_len; rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map, &nm_txq->ba, (void **)&nm_txq->desc); if (rc) return (rc); nm_txq->pidx = nm_txq->cidx = 0; nm_txq->sidx = na->num_tx_desc; nm_txq->nid = idx; nm_txq->iqidx = iqidx; nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_VF_VLD(1) | V_TXPKT_VF(vi->viid)); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "netmap tx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &nm_txq->cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I", "consumer index"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I", "producer index"); return (rc); } static int free_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq) { struct adapter *sc = vi->pi->adapter; free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba, nm_txq->desc); return (0); } #endif static int ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ctrl_cmd c; int qsize = eq->sidx + spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) | V_FW_EQ_CTRL_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC | F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c)); c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid)); c.physeqid_pkd = htobe32(0); c.fetchszm_to_iqid = htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_CTRL_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(sc->dev, "failed to create control queue %d: %d\n", eq->tx_chan, rc); return (rc); } eq->flags |= EQ_ALLOCATED; eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } static int eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_eth_cmd c; int qsize = eq->sidx + spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) | V_FW_EQ_ETH_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC | F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c)); c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE | F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid)); c.fetchszm_to_iqid = htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO | V_FW_EQ_ETH_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_ETH_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(vi->dev, "failed to create Ethernet egress queue: %d\n", rc); return (rc); } eq->flags |= EQ_ALLOCATED; eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } #ifdef TCP_OFFLOAD static int ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ofld_cmd c; int qsize = eq->sidx + spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) | V_FW_EQ_OFLD_CMD_VFN(0)); c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC | F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c)); c.fetchszm_to_iqid = htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_OFLD_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(vi->dev, "failed to create egress queue for TCP offload: %d\n", rc); return (rc); } eq->flags |= EQ_ALLOCATED; eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } #endif static int alloc_eq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) { int rc, qsize; size_t len; mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF); qsize = eq->sidx + spg_len / EQ_ESIZE; len = qsize * EQ_ESIZE; rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map, &eq->ba, (void **)&eq->desc); if (rc) return (rc); eq->pidx = eq->cidx = 0; eq->equeqidx = eq->dbidx = 0; eq->doorbells = sc->doorbells; switch (eq->flags & EQ_TYPEMASK) { case EQ_CTRL: rc = ctrl_eq_alloc(sc, eq); break; case EQ_ETH: rc = eth_eq_alloc(sc, vi, eq); break; #ifdef TCP_OFFLOAD case EQ_OFLD: rc = ofld_eq_alloc(sc, vi, eq); break; #endif default: panic("%s: invalid eq type %d.", __func__, eq->flags & EQ_TYPEMASK); } if (rc != 0) { device_printf(sc->dev, "failed to allocate egress queue(%d): %d\n", eq->flags & EQ_TYPEMASK, rc); } if (isset(&eq->doorbells, DOORBELL_UDB) || isset(&eq->doorbells, DOORBELL_UDBWC) || isset(&eq->doorbells, DOORBELL_WCWR)) { uint32_t s_qpp = sc->sge.eq_s_qpp; uint32_t mask = (1 << s_qpp) - 1; volatile uint8_t *udb; udb = sc->udbs_base + UDBS_DB_OFFSET; udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT; /* pg offset */ eq->udb_qid = eq->cntxt_id & mask; /* id in page */ if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE) clrbit(&eq->doorbells, DOORBELL_WCWR); else { udb += eq->udb_qid << UDBS_SEG_SHIFT; /* seg offset */ eq->udb_qid = 0; } eq->udb = (volatile void *)udb; } return (rc); } static int free_eq(struct adapter *sc, struct sge_eq *eq) { int rc; if (eq->flags & EQ_ALLOCATED) { switch (eq->flags & EQ_TYPEMASK) { case EQ_CTRL: rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; case EQ_ETH: rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; #ifdef TCP_OFFLOAD case EQ_OFLD: rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; #endif default: panic("%s: invalid eq type %d.", __func__, eq->flags & EQ_TYPEMASK); } if (rc != 0) { device_printf(sc->dev, "failed to free egress queue (%d): %d\n", eq->flags & EQ_TYPEMASK, rc); return (rc); } eq->flags &= ~EQ_ALLOCATED; } free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc); if (mtx_initialized(&eq->eq_lock)) mtx_destroy(&eq->eq_lock); bzero(eq, sizeof(*eq)); return (0); } static int alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq, struct sysctl_oid *oid) { int rc; struct sysctl_ctx_list *ctx = vi ? &vi->ctx : &sc->ctx; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); rc = alloc_eq(sc, vi, &wrq->eq); if (rc) return (rc); wrq->adapter = sc; TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq); TAILQ_INIT(&wrq->incomplete_wrs); STAILQ_INIT(&wrq->wr_list); wrq->nwr_pending = 0; wrq->ndesc_needed = 0; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &wrq->eq.cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I", "consumer index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I", "producer index"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD, &wrq->tx_wrs_direct, "# of work requests (direct)"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD, &wrq->tx_wrs_copied, "# of work requests (copied)"); return (rc); } static int free_wrq(struct adapter *sc, struct sge_wrq *wrq) { int rc; rc = free_eq(sc, &wrq->eq); if (rc) return (rc); bzero(wrq, sizeof(*wrq)); return (0); } static int alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx, struct sysctl_oid *oid) { int rc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct sge_eq *eq = &txq->eq; char name[16]; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx, M_CXGBE, M_WAITOK); if (rc != 0) { device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc); return (rc); } rc = alloc_eq(sc, vi, eq); if (rc != 0) { mp_ring_free(txq->r); txq->r = NULL; return (rc); } /* Can't fail after this point. */ TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq); txq->ifp = vi->ifp; txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK); txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_VF_VLD(1) | V_TXPKT_VF(vi->viid)); txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE, M_ZERO | M_WAITOK); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "tx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &eq->cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I", "consumer index"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I", "producer index"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD, &txq->txcsum, "# of times hardware assisted with checksum"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_insertion", CTLFLAG_RD, &txq->vlan_insertion, "# of times hardware inserted 802.1Q tag"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD, &txq->tso_wrs, "# of TSO work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD, &txq->imm_wrs, "# of work requests with immediate data"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD, &txq->sgl_wrs, "# of work requests with direct SGL"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD, &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_wrs", CTLFLAG_RD, &txq->txpkts0_wrs, "# of txpkts (type 0) work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_wrs", CTLFLAG_RD, &txq->txpkts1_wrs, "# of txpkts (type 1) work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_pkts", CTLFLAG_RD, &txq->txpkts0_pkts, "# of frames tx'd using type0 txpkts work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_pkts", CTLFLAG_RD, &txq->txpkts1_pkts, "# of frames tx'd using type1 txpkts work requests"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues", CTLFLAG_RD, &txq->r->enqueues, "# of enqueues to the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops", CTLFLAG_RD, &txq->r->drops, "# of drops in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts", CTLFLAG_RD, &txq->r->starts, "# of normal consumer starts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls", CTLFLAG_RD, &txq->r->stalls, "# of consumer stalls in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts", CTLFLAG_RD, &txq->r->restarts, "# of consumer restarts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications", CTLFLAG_RD, &txq->r->abdications, "# of consumer abdications in the mp_ring for this queue"); return (0); } static int free_txq(struct vi_info *vi, struct sge_txq *txq) { int rc; struct adapter *sc = vi->pi->adapter; struct sge_eq *eq = &txq->eq; rc = free_eq(sc, eq); if (rc) return (rc); sglist_free(txq->gl); free(txq->sdesc, M_CXGBE); mp_ring_free(txq->r); bzero(txq, sizeof(*txq)); return (0); } static void oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error) { bus_addr_t *ba = arg; KASSERT(nseg == 1, ("%s meant for single segment mappings only.", __func__)); *ba = error ? 0 : segs->ds_addr; } static inline void ring_fl_db(struct adapter *sc, struct sge_fl *fl) { uint32_t n, v; n = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx); MPASS(n > 0); wmb(); v = fl->dbval | V_PIDX(n); if (fl->udb) *fl->udb = htole32(v); else t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), v); IDXINCR(fl->dbidx, n, fl->sidx); } /* * Fills up the freelist by allocating upto 'n' buffers. Buffers that are * recycled do not count towards this allocation budget. * * Returns non-zero to indicate that this freelist should be added to the list * of starving freelists. */ static int refill_fl(struct adapter *sc, struct sge_fl *fl, int n) { __be64 *d; struct fl_sdesc *sd; uintptr_t pa; caddr_t cl; struct cluster_layout *cll; struct sw_zone_info *swz; struct cluster_metadata *clm; uint16_t max_pidx; uint16_t hw_cidx = fl->hw_cidx; /* stable snapshot */ FL_LOCK_ASSERT_OWNED(fl); /* * We always stop at the begining of the hardware descriptor that's just * before the one with the hw cidx. This is to avoid hw pidx = hw cidx, * which would mean an empty freelist to the chip. */ max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1; if (fl->pidx == max_pidx * 8) return (0); d = &fl->desc[fl->pidx]; sd = &fl->sdesc[fl->pidx]; cll = &fl->cll_def; /* default layout */ swz = &sc->sge.sw_zone_info[cll->zidx]; while (n > 0) { if (sd->cl != NULL) { if (sd->nmbuf == 0) { /* * Fast recycle without involving any atomics on * the cluster's metadata (if the cluster has * metadata). This happens when all frames * received in the cluster were small enough to * fit within a single mbuf each. */ fl->cl_fast_recycled++; #ifdef INVARIANTS clm = cl_metadata(sc, fl, &sd->cll, sd->cl); if (clm != NULL) MPASS(clm->refcount == 1); #endif goto recycled_fast; } /* * Cluster is guaranteed to have metadata. Clusters * without metadata always take the fast recycle path * when they're recycled. */ clm = cl_metadata(sc, fl, &sd->cll, sd->cl); MPASS(clm != NULL); if (atomic_fetchadd_int(&clm->refcount, -1) == 1) { fl->cl_recycled++; counter_u64_add(extfree_rels, 1); goto recycled; } sd->cl = NULL; /* gave up my reference */ } MPASS(sd->cl == NULL); alloc: cl = uma_zalloc(swz->zone, M_NOWAIT); if (__predict_false(cl == NULL)) { if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 || fl->cll_def.zidx == fl->cll_alt.zidx) break; /* fall back to the safe zone */ cll = &fl->cll_alt; swz = &sc->sge.sw_zone_info[cll->zidx]; goto alloc; } fl->cl_allocated++; n--; pa = pmap_kextract((vm_offset_t)cl); pa += cll->region1; sd->cl = cl; sd->cll = *cll; *d = htobe64(pa | cll->hwidx); clm = cl_metadata(sc, fl, cll, cl); if (clm != NULL) { recycled: #ifdef INVARIANTS clm->sd = sd; #endif clm->refcount = 1; } sd->nmbuf = 0; recycled_fast: d++; sd++; if (__predict_false(++fl->pidx % 8 == 0)) { uint16_t pidx = fl->pidx / 8; if (__predict_false(pidx == fl->sidx)) { fl->pidx = 0; pidx = 0; sd = fl->sdesc; d = fl->desc; } if (pidx == max_pidx) break; if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4) ring_fl_db(sc, fl); } } if (fl->pidx / 8 != fl->dbidx) ring_fl_db(sc, fl); return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING)); } /* * Attempt to refill all starving freelists. */ static void refill_sfl(void *arg) { struct adapter *sc = arg; struct sge_fl *fl, *fl_temp; mtx_assert(&sc->sfl_lock, MA_OWNED); TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) { FL_LOCK(fl); refill_fl(sc, fl, 64); if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) { TAILQ_REMOVE(&sc->sfl, fl, link); fl->flags &= ~FL_STARVING; } FL_UNLOCK(fl); } if (!TAILQ_EMPTY(&sc->sfl)) callout_schedule(&sc->sfl_callout, hz / 5); } static int alloc_fl_sdesc(struct sge_fl *fl) { fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE, M_ZERO | M_WAITOK); return (0); } static void free_fl_sdesc(struct adapter *sc, struct sge_fl *fl) { struct fl_sdesc *sd; struct cluster_metadata *clm; struct cluster_layout *cll; int i; sd = fl->sdesc; for (i = 0; i < fl->sidx * 8; i++, sd++) { if (sd->cl == NULL) continue; cll = &sd->cll; clm = cl_metadata(sc, fl, cll, sd->cl); if (sd->nmbuf == 0) uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl); else if (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1) { uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl); counter_u64_add(extfree_rels, 1); } sd->cl = NULL; } free(fl->sdesc, M_CXGBE); fl->sdesc = NULL; } static inline void get_pkt_gl(struct mbuf *m, struct sglist *gl) { int rc; M_ASSERTPKTHDR(m); sglist_reset(gl); rc = sglist_append_mbuf(gl, m); if (__predict_false(rc != 0)) { panic("%s: mbuf %p (%d segs) was vetted earlier but now fails " "with %d.", __func__, m, mbuf_nsegs(m), rc); } KASSERT(gl->sg_nseg == mbuf_nsegs(m), ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m, mbuf_nsegs(m), gl->sg_nseg)); KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS), ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__, gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)); } /* * len16 for a txpkt WR with a GL. Includes the firmware work request header. */ static inline u_int txpkt_len16(u_int nsegs, u_int tso) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); if (tso) n += sizeof(struct cpl_tx_pkt_lso_core); return (howmany(n, 16)); } /* * len16 for a txpkts type 0 WR with a GL. Does not include the firmware work * request header. */ static inline u_int txpkts0_len16(u_int nsegs) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); return (howmany(n, 16)); } /* * len16 for a txpkts type 1 WR with a GL. Does not include the firmware work * request header. */ static inline u_int txpkts1_len16(void) { u_int n; n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl); return (howmany(n, 16)); } static inline u_int imm_payload(u_int ndesc) { u_int n; n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) - sizeof(struct cpl_tx_pkt_core); return (n); } /* * Write a txpkt WR for this packet to the hardware descriptors, update the * software descriptor, and advance the pidx. It is guaranteed that enough * descriptors are available. * * The return value is the # of hardware descriptors used. */ static u_int write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr, struct mbuf *m0, u_int available) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ uint64_t ctrl1; int len16, ndesc, pktlen, nsegs; caddr_t dst; TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m0); MPASS(available > 0 && available < eq->sidx); len16 = mbuf_len16(m0); nsegs = mbuf_nsegs(m0); pktlen = m0->m_pkthdr.len; ctrl = sizeof(struct cpl_tx_pkt_core); if (needs_tso(m0)) ctrl += sizeof(struct cpl_tx_pkt_lso_core); else if (pktlen <= imm_payload(2) && available >= 2) { /* Immediate data. Recalculate len16 and set nsegs to 0. */ ctrl += pktlen; len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + pktlen, 16); nsegs = 0; } ndesc = howmany(len16, EQ_ESIZE / 16); MPASS(ndesc <= available); /* Firmware work request header */ MPASS(wr == (void *)&eq->desc[eq->pidx]); wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3 = 0; if (needs_tso(m0)) { struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1); KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && m0->m_pkthdr.l4hlen > 0, ("%s: mbuf %p needs TSO but missing header lengths", __func__, m0)); ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2); if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header)) ctrl |= V_LSO_ETHHDR_LEN(1); if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) ctrl |= F_LSO_IPV6; lso->lso_ctrl = htobe32(ctrl); lso->ipid_ofst = htobe16(0); lso->mss = htobe16(m0->m_pkthdr.tso_segsz); lso->seqno_offset = htobe32(0); lso->len = htobe32(pktlen); cpl = (void *)(lso + 1); txq->tso_wrs++; } else cpl = (void *)(wr + 1); /* Checksum offload */ ctrl1 = 0; if (needs_l3_csum(m0) == 0) ctrl1 |= F_TXPKT_IPCSUM_DIS; if (needs_l4_csum(m0) == 0) ctrl1 |= F_TXPKT_L4CSUM_DIS; if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ if (needs_vlan_insertion(m0)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); cpl->ctrl1 = htobe64(ctrl1); /* SGL */ dst = (void *)(cpl + 1); if (nsegs > 0) { write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx); txq->sgl_wrs++; } else { struct mbuf *m; for (m = m0; m != NULL; m = m->m_next) { copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len); #ifdef INVARIANTS pktlen -= m->m_len; #endif } #ifdef INVARIANTS KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen)); #endif txq->imm_wrs++; } txq->txpkt_wrs++; txsd = &txq->sdesc[eq->pidx]; txsd->m = m0; txsd->desc_used = ndesc; return (ndesc); } static int try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available) { u_int needed, nsegs1, nsegs2, l1, l2; if (cannot_use_txpkts(m) || cannot_use_txpkts(n)) return (1); nsegs1 = mbuf_nsegs(m); nsegs2 = mbuf_nsegs(n); if (nsegs1 + nsegs2 == 2) { txp->wr_type = 1; l1 = l2 = txpkts1_len16(); } else { txp->wr_type = 0; l1 = txpkts0_len16(nsegs1); l2 = txpkts0_len16(nsegs2); } txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2; needed = howmany(txp->len16, EQ_ESIZE / 16); if (needed > SGE_MAX_WR_NDESC || needed > available) return (1); txp->plen = m->m_pkthdr.len + n->m_pkthdr.len; if (txp->plen > 65535) return (1); txp->npkt = 2; set_mbuf_len16(m, l1); set_mbuf_len16(n, l2); return (0); } static int add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available) { u_int plen, len16, needed, nsegs; MPASS(txp->wr_type == 0 || txp->wr_type == 1); nsegs = mbuf_nsegs(m); if (needs_tso(m) || (txp->wr_type == 1 && nsegs != 1)) return (1); plen = txp->plen + m->m_pkthdr.len; if (plen > 65535) return (1); if (txp->wr_type == 0) len16 = txpkts0_len16(nsegs); else len16 = txpkts1_len16(); needed = howmany(txp->len16 + len16, EQ_ESIZE / 16); if (needed > SGE_MAX_WR_NDESC || needed > available) return (1); txp->npkt++; txp->plen = plen; txp->len16 += len16; set_mbuf_len16(m, len16); return (0); } /* * Write a txpkts WR for the packets in txp to the hardware descriptors, update * the software descriptor, and advance the pidx. It is guaranteed that enough * descriptors are available. * * The return value is the # of hardware descriptors used. */ static u_int write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr, struct mbuf *m0, const struct txpkts *txp, u_int available) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; uint64_t ctrl1; int ndesc, checkwrap; struct mbuf *m; void *flitp; TXQ_LOCK_ASSERT_OWNED(txq); MPASS(txp->npkt > 0); MPASS(txp->plen < 65536); MPASS(m0 != NULL); MPASS(m0->m_nextpkt != NULL); MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16)); MPASS(available > 0 && available < eq->sidx); ndesc = howmany(txp->len16, EQ_ESIZE / 16); MPASS(ndesc <= available); MPASS(wr == (void *)&eq->desc[eq->pidx]); wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)); ctrl = V_FW_WR_LEN16(txp->len16); wr->equiq_to_len16 = htobe32(ctrl); wr->plen = htobe16(txp->plen); wr->npkt = txp->npkt; wr->r3 = 0; wr->type = txp->wr_type; flitp = wr + 1; /* * At this point we are 16B into a hardware descriptor. If checkwrap is * set then we know the WR is going to wrap around somewhere. We'll * check for that at appropriate points. */ checkwrap = eq->sidx - ndesc < eq->pidx; for (m = m0; m != NULL; m = m->m_nextpkt) { if (txp->wr_type == 0) { struct ulp_txpkt *ulpmc; struct ulptx_idata *ulpsc; /* ULP master command */ ulpmc = flitp; ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid)); ulpmc->len = htobe32(mbuf_len16(m)); /* ULP subcommand */ ulpsc = (void *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) | F_ULP_TX_SC_MORE); ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core)); cpl = (void *)(ulpsc + 1); if (checkwrap && (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx]) cpl = (void *)&eq->desc[0]; txq->txpkts0_pkts += txp->npkt; txq->txpkts0_wrs++; } else { cpl = flitp; txq->txpkts1_pkts += txp->npkt; txq->txpkts1_wrs++; } /* Checksum offload */ ctrl1 = 0; if (needs_l3_csum(m) == 0) ctrl1 |= F_TXPKT_IPCSUM_DIS; if (needs_l4_csum(m) == 0) ctrl1 |= F_TXPKT_L4CSUM_DIS; if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ if (needs_vlan_insertion(m)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(m->m_pkthdr.len); cpl->ctrl1 = htobe64(ctrl1); flitp = cpl + 1; if (checkwrap && (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx]) flitp = (void *)&eq->desc[0]; write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap); } txsd = &txq->sdesc[eq->pidx]; txsd->m = m0; txsd->desc_used = ndesc; return (ndesc); } /* * If the SGL ends on an address that is not 16 byte aligned, this function will * add a 0 filled flit at the end. */ static void write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap) { struct sge_eq *eq = &txq->eq; struct sglist *gl = txq->gl; struct sglist_seg *seg; __be64 *flitp, *wrap; struct ulptx_sgl *usgl; int i, nflits, nsegs; KASSERT(((uintptr_t)(*to) & 0xf) == 0, ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to)); MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); get_pkt_gl(m, gl); nsegs = gl->sg_nseg; MPASS(nsegs > 0); nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2; flitp = (__be64 *)(*to); wrap = (__be64 *)(&eq->desc[eq->sidx]); seg = &gl->sg_segs[0]; usgl = (void *)flitp; /* * We start at a 16 byte boundary somewhere inside the tx descriptor * ring, so we're at least 16 bytes away from the status page. There is * no chance of a wrap around in the middle of usgl (which is 16 bytes). */ usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); usgl->len0 = htobe32(seg->ss_len); usgl->addr0 = htobe64(seg->ss_paddr); seg++; if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) { /* Won't wrap around at all */ for (i = 0; i < nsegs - 1; i++, seg++) { usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr); } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); flitp += nflits; } else { /* Will wrap somewhere in the rest of the SGL */ /* 2 flits already written, write the rest flit by flit */ flitp = (void *)(usgl + 1); for (i = 0; i < nflits - 2; i++) { if (flitp == wrap) flitp = (void *)eq->desc; *flitp++ = get_flit(seg, nsegs - 1, i); } } if (nflits & 1) { MPASS(((uintptr_t)flitp) & 0xf); *flitp++ = 0; } MPASS((((uintptr_t)flitp) & 0xf) == 0); if (__predict_false(flitp == wrap)) *to = (void *)eq->desc; else *to = (void *)flitp; } static inline void copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) { MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)&eq->desc[eq->sidx])) { bcopy(from, *to, len); (*to) += len; } else { int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to); bcopy(from, *to, portion); from += portion; portion = len - portion; /* remaining */ bcopy(from, (void *)eq->desc, portion); (*to) = (caddr_t)eq->desc + portion; } } static inline void ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n) { u_int db; MPASS(n > 0); db = eq->doorbells; if (n > 1) clrbit(&db, DOORBELL_WCWR); wmb(); switch (ffs(db) - 1) { case DOORBELL_UDB: *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); break; case DOORBELL_WCWR: { volatile uint64_t *dst, *src; int i; /* * Queues whose 128B doorbell segment fits in the page do not * use relative qid (udb_qid is always 0). Only queues with * doorbell segments can do WCWR. */ KASSERT(eq->udb_qid == 0 && n == 1, ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p", __func__, eq->doorbells, n, eq->dbidx, eq)); dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET - UDBS_DB_OFFSET); i = eq->dbidx; src = (void *)&eq->desc[i]; while (src != (void *)&eq->desc[i + 1]) *dst++ = *src++; wmb(); break; } case DOORBELL_UDBWC: *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); wmb(); break; case DOORBELL_KDB: t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), V_QID(eq->cntxt_id) | V_PIDX(n)); break; } IDXINCR(eq->dbidx, n, eq->sidx); } static inline u_int reclaimable_tx_desc(struct sge_eq *eq) { uint16_t hw_cidx; hw_cidx = read_hw_cidx(eq); return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx)); } static inline u_int total_available_tx_desc(struct sge_eq *eq) { uint16_t hw_cidx, pidx; hw_cidx = read_hw_cidx(eq); pidx = eq->pidx; if (pidx == hw_cidx) return (eq->sidx - 1); else return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1); } static inline uint16_t read_hw_cidx(struct sge_eq *eq) { struct sge_qstat *spg = (void *)&eq->desc[eq->sidx]; uint16_t cidx = spg->cidx; /* stable snapshot */ return (be16toh(cidx)); } /* * Reclaim 'n' descriptors approximately. */ static u_int reclaim_tx_descs(struct sge_txq *txq, u_int n) { struct tx_sdesc *txsd; struct sge_eq *eq = &txq->eq; u_int can_reclaim, reclaimed; TXQ_LOCK_ASSERT_OWNED(txq); MPASS(n > 0); reclaimed = 0; can_reclaim = reclaimable_tx_desc(eq); while (can_reclaim && reclaimed < n) { int ndesc; struct mbuf *m, *nextpkt; txsd = &txq->sdesc[eq->cidx]; ndesc = txsd->desc_used; /* Firmware doesn't return "partial" credits. */ KASSERT(can_reclaim >= ndesc, ("%s: unexpected number of credits: %d, %d", __func__, can_reclaim, ndesc)); for (m = txsd->m; m != NULL; m = nextpkt) { nextpkt = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); } reclaimed += ndesc; can_reclaim -= ndesc; IDXINCR(eq->cidx, ndesc, eq->sidx); } return (reclaimed); } static void tx_reclaim(void *arg, int n) { struct sge_txq *txq = arg; struct sge_eq *eq = &txq->eq; do { if (TXQ_TRYLOCK(txq) == 0) break; n = reclaim_tx_descs(txq, 32); if (eq->cidx == eq->pidx) eq->equeqidx = eq->pidx; TXQ_UNLOCK(txq); } while (n > 0); } static __be64 get_flit(struct sglist_seg *segs, int nsegs, int idx) { int i = (idx / 3) * 2; switch (idx % 3) { case 0: { __be64 rc; rc = htobe32(segs[i].ss_len); if (i + 1 < nsegs) rc |= (uint64_t)htobe32(segs[i + 1].ss_len) << 32; return (rc); } case 1: return (htobe64(segs[i].ss_paddr)); case 2: return (htobe64(segs[i + 1].ss_paddr)); } return (0); } static void find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp) { int8_t zidx, hwidx, idx; uint16_t region1, region3; int spare, spare_needed, n; struct sw_zone_info *swz; struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0]; /* * Buffer Packing: Look for PAGE_SIZE or larger zone which has a bufsize * large enough for the max payload and cluster metadata. Otherwise * settle for the largest bufsize that leaves enough room in the cluster * for metadata. * * Without buffer packing: Look for the smallest zone which has a * bufsize large enough for the max payload. Settle for the largest * bufsize available if there's nothing big enough for max payload. */ spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0; swz = &sc->sge.sw_zone_info[0]; hwidx = -1; for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) { if (swz->size > largest_rx_cluster) { if (__predict_true(hwidx != -1)) break; /* * This is a misconfiguration. largest_rx_cluster is * preventing us from finding a refill source. See * dev.t5nex..buffer_sizes to figure out why. */ device_printf(sc->dev, "largest_rx_cluster=%u leaves no" " refill source for fl %p (dma %u). Ignored.\n", largest_rx_cluster, fl, maxp); } for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) { hwb = &hwb_list[idx]; spare = swz->size - hwb->size; if (spare < spare_needed) continue; hwidx = idx; /* best option so far */ if (hwb->size >= maxp) { if ((fl->flags & FL_BUF_PACKING) == 0) goto done; /* stop looking (not packing) */ if (swz->size >= safest_rx_cluster) goto done; /* stop looking (packing) */ } break; /* keep looking, next zone */ } } done: /* A usable hwidx has been located. */ MPASS(hwidx != -1); hwb = &hwb_list[hwidx]; zidx = hwb->zidx; swz = &sc->sge.sw_zone_info[zidx]; region1 = 0; region3 = swz->size - hwb->size; /* * Stay within this zone and see if there is a better match when mbuf * inlining is allowed. Remember that the hwidx's are sorted in * decreasing order of size (so in increasing order of spare area). */ for (idx = hwidx; idx != -1; idx = hwb->next) { hwb = &hwb_list[idx]; spare = swz->size - hwb->size; if (allow_mbufs_in_cluster == 0 || hwb->size < maxp) break; /* * Do not inline mbufs if doing so would violate the pad/pack * boundary alignment requirement. */ if (fl_pad && (MSIZE % sc->sge.pad_boundary) != 0) continue; if (fl->flags & FL_BUF_PACKING && (MSIZE % sc->sge.pack_boundary) != 0) continue; if (spare < CL_METADATA_SIZE + MSIZE) continue; n = (spare - CL_METADATA_SIZE) / MSIZE; if (n > howmany(hwb->size, maxp)) break; hwidx = idx; if (fl->flags & FL_BUF_PACKING) { region1 = n * MSIZE; region3 = spare - region1; } else { region1 = MSIZE; region3 = spare - region1; break; } } KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES, ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp)); KASSERT(hwidx >= 0 && hwidx <= SGE_FLBUF_SIZES, ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp)); KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 == sc->sge.sw_zone_info[zidx].size, ("%s: bad buffer layout for fl %p, maxp %d. " "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, sc->sge.sw_zone_info[zidx].size, region1, sc->sge.hw_buf_info[hwidx].size, region3)); if (fl->flags & FL_BUF_PACKING || region1 > 0) { KASSERT(region3 >= CL_METADATA_SIZE, ("%s: no room for metadata. fl %p, maxp %d; " "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, sc->sge.sw_zone_info[zidx].size, region1, sc->sge.hw_buf_info[hwidx].size, region3)); KASSERT(region1 % MSIZE == 0, ("%s: bad mbuf region for fl %p, maxp %d. " "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, sc->sge.sw_zone_info[zidx].size, region1, sc->sge.hw_buf_info[hwidx].size, region3)); } fl->cll_def.zidx = zidx; fl->cll_def.hwidx = hwidx; fl->cll_def.region1 = region1; fl->cll_def.region3 = region3; } static void find_safe_refill_source(struct adapter *sc, struct sge_fl *fl) { struct sge *s = &sc->sge; struct hw_buf_info *hwb; struct sw_zone_info *swz; int spare; int8_t hwidx; if (fl->flags & FL_BUF_PACKING) hwidx = s->safe_hwidx2; /* with room for metadata */ else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) { hwidx = s->safe_hwidx2; hwb = &s->hw_buf_info[hwidx]; swz = &s->sw_zone_info[hwb->zidx]; spare = swz->size - hwb->size; /* no good if there isn't room for an mbuf as well */ if (spare < CL_METADATA_SIZE + MSIZE) hwidx = s->safe_hwidx1; } else hwidx = s->safe_hwidx1; if (hwidx == -1) { /* No fallback source */ fl->cll_alt.hwidx = -1; fl->cll_alt.zidx = -1; return; } hwb = &s->hw_buf_info[hwidx]; swz = &s->sw_zone_info[hwb->zidx]; spare = swz->size - hwb->size; fl->cll_alt.hwidx = hwidx; fl->cll_alt.zidx = hwb->zidx; if (allow_mbufs_in_cluster && (fl_pad == 0 || (MSIZE % sc->sge.pad_boundary) == 0)) fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE; else fl->cll_alt.region1 = 0; fl->cll_alt.region3 = spare - fl->cll_alt.region1; } static void add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl) { mtx_lock(&sc->sfl_lock); FL_LOCK(fl); if ((fl->flags & FL_DOOMED) == 0) { fl->flags |= FL_STARVING; TAILQ_INSERT_TAIL(&sc->sfl, fl, link); callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc); } FL_UNLOCK(fl); mtx_unlock(&sc->sfl_lock); } static void handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq) { struct sge_wrq *wrq = (void *)eq; atomic_readandclear_int(&eq->equiq); taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task); } static void handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq) { struct sge_txq *txq = (void *)eq; MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH); atomic_readandclear_int(&eq->equiq); mp_ring_check_drainage(txq->r, 0); taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task); } static int handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1); unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid)); struct adapter *sc = iq->adapter; struct sge *s = &sc->sge; struct sge_eq *eq; static void (*h[])(struct adapter *, struct sge_eq *) = {NULL, &handle_wrq_egr_update, &handle_eth_egr_update, &handle_wrq_egr_update}; KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); eq = s->eqmap[qid - s->eq_start]; (*h[eq->flags & EQ_TYPEMASK])(sc, eq); return (0); } /* handle_fw_msg works for both fw4_msg and fw6_msg because this is valid */ CTASSERT(offsetof(struct cpl_fw4_msg, data) == \ offsetof(struct cpl_fw6_msg, data)); static int handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_fw6_msg *cpl = (const void *)(rss + 1); KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) { const struct rss_header *rss2; rss2 = (const struct rss_header *)&cpl->data[0]; return (sc->cpl_handler[rss2->opcode](iq, rss2, m)); } return (sc->fw_msg_handler[cpl->type](sc, &cpl->data[0])); } static int sysctl_uint16(SYSCTL_HANDLER_ARGS) { uint16_t *id = arg1; int i = *id; return sysctl_handle_int(oidp, &i, 0, req); } static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS) { struct sge *s = arg1; struct hw_buf_info *hwb = &s->hw_buf_info[0]; struct sw_zone_info *swz = &s->sw_zone_info[0]; int i, rc; struct sbuf sb; char c; sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND); for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) { if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster) c = '*'; else c = '\0'; sbuf_printf(&sb, "%u%c ", hwb->size, c); } sbuf_trim(&sb); sbuf_finish(&sb); rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (rc); } Index: head/sys/dev/e1000/if_igb.c =================================================================== --- head/sys/dev/e1000/if_igb.c (revision 294326) +++ head/sys/dev/e1000/if_igb.c (revision 294327) @@ -1,6393 +1,6393 @@ /****************************************************************************** Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ /*$FreeBSD$*/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_rss.h" #ifdef HAVE_KERNEL_OPTION_HEADERS #include "opt_device_polling.h" #include "opt_altq.h" #endif #include "if_igb.h" /********************************************************************* * Driver version: *********************************************************************/ char igb_driver_version[] = "2.5.2"; /********************************************************************* * PCI Device ID Table * * Used by probe to select devices to load on * Last field stores an index into e1000_strings * Last entry must be all 0s * * { Vendor ID, Device ID, SubVendor ID, SubDevice ID, String Index } *********************************************************************/ static igb_vendor_info_t igb_vendor_info_array[] = { {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82575EB_COPPER, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82575EB_FIBER_SERDES, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82575GB_QUAD_COPPER, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82576, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82576_NS, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82576_NS_SERDES, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82576_FIBER, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82576_SERDES, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82576_SERDES_QUAD, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82576_QUAD_COPPER, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82576_QUAD_COPPER_ET2, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82576_VF, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82580_COPPER, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82580_FIBER, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82580_SERDES, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82580_SGMII, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82580_COPPER_DUAL, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_82580_QUAD_FIBER, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_DH89XXCC_SERDES, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_DH89XXCC_SGMII, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_DH89XXCC_SFP, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_DH89XXCC_BACKPLANE, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I350_COPPER, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I350_FIBER, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I350_SERDES, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I350_SGMII, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I350_VF, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I210_COPPER, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I210_COPPER_IT, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I210_COPPER_OEM1, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I210_COPPER_FLASHLESS, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I210_SERDES_FLASHLESS, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I210_FIBER, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I210_SERDES, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I210_SGMII, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I211_COPPER, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I354_BACKPLANE_1GBPS, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I354_BACKPLANE_2_5GBPS, 0, 0, 0}, {IGB_INTEL_VENDOR_ID, E1000_DEV_ID_I354_SGMII, 0, 0, 0}, /* required last entry */ {0, 0, 0, 0, 0} }; /********************************************************************* * Table of branding strings for all supported NICs. *********************************************************************/ static char *igb_strings[] = { "Intel(R) PRO/1000 Network Connection" }; /********************************************************************* * Function prototypes *********************************************************************/ static int igb_probe(device_t); static int igb_attach(device_t); static int igb_detach(device_t); static int igb_shutdown(device_t); static int igb_suspend(device_t); static int igb_resume(device_t); #ifndef IGB_LEGACY_TX static int igb_mq_start(struct ifnet *, struct mbuf *); static int igb_mq_start_locked(struct ifnet *, struct tx_ring *); static void igb_qflush(struct ifnet *); static void igb_deferred_mq_start(void *, int); #else static void igb_start(struct ifnet *); static void igb_start_locked(struct tx_ring *, struct ifnet *ifp); #endif static int igb_ioctl(struct ifnet *, u_long, caddr_t); static uint64_t igb_get_counter(if_t, ift_counter); static void igb_init(void *); static void igb_init_locked(struct adapter *); static void igb_stop(void *); static void igb_media_status(struct ifnet *, struct ifmediareq *); static int igb_media_change(struct ifnet *); static void igb_identify_hardware(struct adapter *); static int igb_allocate_pci_resources(struct adapter *); static int igb_allocate_msix(struct adapter *); static int igb_allocate_legacy(struct adapter *); static int igb_setup_msix(struct adapter *); static void igb_free_pci_resources(struct adapter *); static void igb_local_timer(void *); static void igb_reset(struct adapter *); static int igb_setup_interface(device_t, struct adapter *); static int igb_allocate_queues(struct adapter *); static void igb_configure_queues(struct adapter *); static int igb_allocate_transmit_buffers(struct tx_ring *); static void igb_setup_transmit_structures(struct adapter *); static void igb_setup_transmit_ring(struct tx_ring *); static void igb_initialize_transmit_units(struct adapter *); static void igb_free_transmit_structures(struct adapter *); static void igb_free_transmit_buffers(struct tx_ring *); static int igb_allocate_receive_buffers(struct rx_ring *); static int igb_setup_receive_structures(struct adapter *); static int igb_setup_receive_ring(struct rx_ring *); static void igb_initialize_receive_units(struct adapter *); static void igb_free_receive_structures(struct adapter *); static void igb_free_receive_buffers(struct rx_ring *); static void igb_free_receive_ring(struct rx_ring *); static void igb_enable_intr(struct adapter *); static void igb_disable_intr(struct adapter *); static void igb_update_stats_counters(struct adapter *); static bool igb_txeof(struct tx_ring *); static __inline void igb_rx_discard(struct rx_ring *, int); static __inline void igb_rx_input(struct rx_ring *, struct ifnet *, struct mbuf *, u32); static bool igb_rxeof(struct igb_queue *, int, int *); static void igb_rx_checksum(u32, struct mbuf *, u32); static int igb_tx_ctx_setup(struct tx_ring *, struct mbuf *, u32 *, u32 *); static int igb_tso_setup(struct tx_ring *, struct mbuf *, u32 *, u32 *); static void igb_set_promisc(struct adapter *); static void igb_disable_promisc(struct adapter *); static void igb_set_multi(struct adapter *); static void igb_update_link_status(struct adapter *); static void igb_refresh_mbufs(struct rx_ring *, int); static void igb_register_vlan(void *, struct ifnet *, u16); static void igb_unregister_vlan(void *, struct ifnet *, u16); static void igb_setup_vlan_hw_support(struct adapter *); static int igb_xmit(struct tx_ring *, struct mbuf **); static int igb_dma_malloc(struct adapter *, bus_size_t, struct igb_dma_alloc *, int); static void igb_dma_free(struct adapter *, struct igb_dma_alloc *); static int igb_sysctl_nvm_info(SYSCTL_HANDLER_ARGS); static void igb_print_nvm_info(struct adapter *); static int igb_is_valid_ether_addr(u8 *); static void igb_add_hw_stats(struct adapter *); static void igb_vf_init_stats(struct adapter *); static void igb_update_vf_stats_counters(struct adapter *); /* Management and WOL Support */ static void igb_init_manageability(struct adapter *); static void igb_release_manageability(struct adapter *); static void igb_get_hw_control(struct adapter *); static void igb_release_hw_control(struct adapter *); static void igb_enable_wakeup(device_t); static void igb_led_func(void *, int); static int igb_irq_fast(void *); static void igb_msix_que(void *); static void igb_msix_link(void *); static void igb_handle_que(void *context, int pending); static void igb_handle_link(void *context, int pending); static void igb_handle_link_locked(struct adapter *); static void igb_set_sysctl_value(struct adapter *, const char *, const char *, int *, int); static int igb_set_flowcntl(SYSCTL_HANDLER_ARGS); static int igb_sysctl_dmac(SYSCTL_HANDLER_ARGS); static int igb_sysctl_eee(SYSCTL_HANDLER_ARGS); #ifdef DEVICE_POLLING static poll_handler_t igb_poll; #endif /* POLLING */ /********************************************************************* * FreeBSD Device Interface Entry Points *********************************************************************/ static device_method_t igb_methods[] = { /* Device interface */ DEVMETHOD(device_probe, igb_probe), DEVMETHOD(device_attach, igb_attach), DEVMETHOD(device_detach, igb_detach), DEVMETHOD(device_shutdown, igb_shutdown), DEVMETHOD(device_suspend, igb_suspend), DEVMETHOD(device_resume, igb_resume), DEVMETHOD_END }; static driver_t igb_driver = { "igb", igb_methods, sizeof(struct adapter), }; static devclass_t igb_devclass; DRIVER_MODULE(igb, pci, igb_driver, igb_devclass, 0, 0); MODULE_DEPEND(igb, pci, 1, 1, 1); MODULE_DEPEND(igb, ether, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(igb, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ /********************************************************************* * Tunable default values. *********************************************************************/ static SYSCTL_NODE(_hw, OID_AUTO, igb, CTLFLAG_RD, 0, "IGB driver parameters"); /* Descriptor defaults */ static int igb_rxd = IGB_DEFAULT_RXD; static int igb_txd = IGB_DEFAULT_TXD; SYSCTL_INT(_hw_igb, OID_AUTO, rxd, CTLFLAG_RDTUN, &igb_rxd, 0, "Number of receive descriptors per queue"); SYSCTL_INT(_hw_igb, OID_AUTO, txd, CTLFLAG_RDTUN, &igb_txd, 0, "Number of transmit descriptors per queue"); /* ** AIM: Adaptive Interrupt Moderation ** which means that the interrupt rate ** is varied over time based on the ** traffic for that interrupt vector */ static int igb_enable_aim = TRUE; SYSCTL_INT(_hw_igb, OID_AUTO, enable_aim, CTLFLAG_RWTUN, &igb_enable_aim, 0, "Enable adaptive interrupt moderation"); /* * MSIX should be the default for best performance, * but this allows it to be forced off for testing. */ static int igb_enable_msix = 1; SYSCTL_INT(_hw_igb, OID_AUTO, enable_msix, CTLFLAG_RDTUN, &igb_enable_msix, 0, "Enable MSI-X interrupts"); /* ** Tuneable Interrupt rate */ static int igb_max_interrupt_rate = 8000; SYSCTL_INT(_hw_igb, OID_AUTO, max_interrupt_rate, CTLFLAG_RDTUN, &igb_max_interrupt_rate, 0, "Maximum interrupts per second"); #ifndef IGB_LEGACY_TX /* ** Tuneable number of buffers in the buf-ring (drbr_xxx) */ static int igb_buf_ring_size = IGB_BR_SIZE; SYSCTL_INT(_hw_igb, OID_AUTO, buf_ring_size, CTLFLAG_RDTUN, &igb_buf_ring_size, 0, "Size of the bufring"); #endif /* ** Header split causes the packet header to ** be dma'd to a seperate mbuf from the payload. ** this can have memory alignment benefits. But ** another plus is that small packets often fit ** into the header and thus use no cluster. Its ** a very workload dependent type feature. */ static int igb_header_split = FALSE; SYSCTL_INT(_hw_igb, OID_AUTO, header_split, CTLFLAG_RDTUN, &igb_header_split, 0, "Enable receive mbuf header split"); /* ** This will autoconfigure based on the ** number of CPUs and max supported ** MSIX messages if left at 0. */ static int igb_num_queues = 0; SYSCTL_INT(_hw_igb, OID_AUTO, num_queues, CTLFLAG_RDTUN, &igb_num_queues, 0, "Number of queues to configure, 0 indicates autoconfigure"); /* ** Global variable to store last used CPU when binding queues ** to CPUs in igb_allocate_msix. Starts at CPU_FIRST and increments when a ** queue is bound to a cpu. */ static int igb_last_bind_cpu = -1; /* How many packets rxeof tries to clean at a time */ static int igb_rx_process_limit = 100; SYSCTL_INT(_hw_igb, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN, &igb_rx_process_limit, 0, "Maximum number of received packets to process at a time, -1 means unlimited"); /* How many packets txeof tries to clean at a time */ static int igb_tx_process_limit = -1; SYSCTL_INT(_hw_igb, OID_AUTO, tx_process_limit, CTLFLAG_RDTUN, &igb_tx_process_limit, 0, "Maximum number of sent packets to process at a time, -1 means unlimited"); #ifdef DEV_NETMAP /* see ixgbe.c for details */ #include #endif /* DEV_NETMAP */ /********************************************************************* * Device identification routine * * igb_probe determines if the driver should be loaded on * adapter based on PCI vendor/device id of the adapter. * * return BUS_PROBE_DEFAULT on success, positive on failure *********************************************************************/ static int igb_probe(device_t dev) { char adapter_name[256]; uint16_t pci_vendor_id = 0; uint16_t pci_device_id = 0; uint16_t pci_subvendor_id = 0; uint16_t pci_subdevice_id = 0; igb_vendor_info_t *ent; INIT_DEBUGOUT("igb_probe: begin"); pci_vendor_id = pci_get_vendor(dev); if (pci_vendor_id != IGB_INTEL_VENDOR_ID) return (ENXIO); pci_device_id = pci_get_device(dev); pci_subvendor_id = pci_get_subvendor(dev); pci_subdevice_id = pci_get_subdevice(dev); ent = igb_vendor_info_array; while (ent->vendor_id != 0) { if ((pci_vendor_id == ent->vendor_id) && (pci_device_id == ent->device_id) && ((pci_subvendor_id == ent->subvendor_id) || (ent->subvendor_id == 0)) && ((pci_subdevice_id == ent->subdevice_id) || (ent->subdevice_id == 0))) { sprintf(adapter_name, "%s, Version - %s", igb_strings[ent->index], igb_driver_version); device_set_desc_copy(dev, adapter_name); return (BUS_PROBE_DEFAULT); } ent++; } return (ENXIO); } /********************************************************************* * Device initialization routine * * The attach entry point is called when the driver is being loaded. * This routine identifies the type of hardware, allocates all resources * and initializes the hardware. * * return 0 on success, positive on failure *********************************************************************/ static int igb_attach(device_t dev) { struct adapter *adapter; int error = 0; u16 eeprom_data; INIT_DEBUGOUT("igb_attach: begin"); if (resource_disabled("igb", device_get_unit(dev))) { device_printf(dev, "Disabled by device hint\n"); return (ENXIO); } adapter = device_get_softc(dev); adapter->dev = adapter->osdep.dev = dev; IGB_CORE_LOCK_INIT(adapter, device_get_nameunit(dev)); /* SYSCTLs */ SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "nvm", CTLTYPE_INT|CTLFLAG_RW, adapter, 0, igb_sysctl_nvm_info, "I", "NVM Information"); igb_set_sysctl_value(adapter, "enable_aim", "Interrupt Moderation", &adapter->enable_aim, igb_enable_aim); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "fc", CTLTYPE_INT|CTLFLAG_RW, adapter, 0, igb_set_flowcntl, "I", "Flow Control"); callout_init_mtx(&adapter->timer, &adapter->core_mtx, 0); /* Determine hardware and mac info */ igb_identify_hardware(adapter); /* Setup PCI resources */ if (igb_allocate_pci_resources(adapter)) { device_printf(dev, "Allocation of PCI resources failed\n"); error = ENXIO; goto err_pci; } /* Do Shared Code initialization */ if (e1000_setup_init_funcs(&adapter->hw, TRUE)) { device_printf(dev, "Setup of Shared code failed\n"); error = ENXIO; goto err_pci; } e1000_get_bus_info(&adapter->hw); /* Sysctls for limiting the amount of work done in the taskqueues */ igb_set_sysctl_value(adapter, "rx_processing_limit", "max number of rx packets to process", &adapter->rx_process_limit, igb_rx_process_limit); igb_set_sysctl_value(adapter, "tx_processing_limit", "max number of tx packets to process", &adapter->tx_process_limit, igb_tx_process_limit); /* * Validate number of transmit and receive descriptors. It * must not exceed hardware maximum, and must be multiple * of E1000_DBA_ALIGN. */ if (((igb_txd * sizeof(struct e1000_tx_desc)) % IGB_DBA_ALIGN) != 0 || (igb_txd > IGB_MAX_TXD) || (igb_txd < IGB_MIN_TXD)) { device_printf(dev, "Using %d TX descriptors instead of %d!\n", IGB_DEFAULT_TXD, igb_txd); adapter->num_tx_desc = IGB_DEFAULT_TXD; } else adapter->num_tx_desc = igb_txd; if (((igb_rxd * sizeof(struct e1000_rx_desc)) % IGB_DBA_ALIGN) != 0 || (igb_rxd > IGB_MAX_RXD) || (igb_rxd < IGB_MIN_RXD)) { device_printf(dev, "Using %d RX descriptors instead of %d!\n", IGB_DEFAULT_RXD, igb_rxd); adapter->num_rx_desc = IGB_DEFAULT_RXD; } else adapter->num_rx_desc = igb_rxd; adapter->hw.mac.autoneg = DO_AUTO_NEG; adapter->hw.phy.autoneg_wait_to_complete = FALSE; adapter->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT; /* Copper options */ if (adapter->hw.phy.media_type == e1000_media_type_copper) { adapter->hw.phy.mdix = AUTO_ALL_MODES; adapter->hw.phy.disable_polarity_correction = FALSE; adapter->hw.phy.ms_type = IGB_MASTER_SLAVE; } /* * Set the frame limits assuming * standard ethernet sized frames. */ adapter->max_frame_size = ETHERMTU + ETHER_HDR_LEN + ETHERNET_FCS_SIZE; /* ** Allocate and Setup Queues */ if (igb_allocate_queues(adapter)) { error = ENOMEM; goto err_pci; } /* Allocate the appropriate stats memory */ if (adapter->vf_ifp) { adapter->stats = (struct e1000_vf_stats *)malloc(sizeof \ (struct e1000_vf_stats), M_DEVBUF, M_NOWAIT | M_ZERO); igb_vf_init_stats(adapter); } else adapter->stats = (struct e1000_hw_stats *)malloc(sizeof \ (struct e1000_hw_stats), M_DEVBUF, M_NOWAIT | M_ZERO); if (adapter->stats == NULL) { device_printf(dev, "Can not allocate stats memory\n"); error = ENOMEM; goto err_late; } /* Allocate multicast array memory. */ adapter->mta = malloc(sizeof(u8) * ETH_ADDR_LEN * MAX_NUM_MULTICAST_ADDRESSES, M_DEVBUF, M_NOWAIT); if (adapter->mta == NULL) { device_printf(dev, "Can not allocate multicast setup array\n"); error = ENOMEM; goto err_late; } /* Some adapter-specific advanced features */ if (adapter->hw.mac.type >= e1000_i350) { SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "dmac", CTLTYPE_INT|CTLFLAG_RW, adapter, 0, igb_sysctl_dmac, "I", "DMA Coalesce"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "eee_disabled", CTLTYPE_INT|CTLFLAG_RW, adapter, 0, igb_sysctl_eee, "I", "Disable Energy Efficient Ethernet"); if (adapter->hw.phy.media_type == e1000_media_type_copper) { if (adapter->hw.mac.type == e1000_i354) e1000_set_eee_i354(&adapter->hw); else e1000_set_eee_i350(&adapter->hw); } } /* ** Start from a known state, this is ** important in reading the nvm and ** mac from that. */ e1000_reset_hw(&adapter->hw); /* Make sure we have a good EEPROM before we read from it */ if (((adapter->hw.mac.type != e1000_i210) && (adapter->hw.mac.type != e1000_i211)) && (e1000_validate_nvm_checksum(&adapter->hw) < 0)) { /* ** Some PCI-E parts fail the first check due to ** the link being in sleep state, call it again, ** if it fails a second time its a real issue. */ if (e1000_validate_nvm_checksum(&adapter->hw) < 0) { device_printf(dev, "The EEPROM Checksum Is Not Valid\n"); error = EIO; goto err_late; } } /* ** Copy the permanent MAC address out of the EEPROM */ if (e1000_read_mac_addr(&adapter->hw) < 0) { device_printf(dev, "EEPROM read error while reading MAC" " address\n"); error = EIO; goto err_late; } /* Check its sanity */ if (!igb_is_valid_ether_addr(adapter->hw.mac.addr)) { device_printf(dev, "Invalid MAC address\n"); error = EIO; goto err_late; } /* Setup OS specific network interface */ if (igb_setup_interface(dev, adapter) != 0) goto err_late; /* Now get a good starting state */ igb_reset(adapter); /* Initialize statistics */ igb_update_stats_counters(adapter); adapter->hw.mac.get_link_status = 1; igb_update_link_status(adapter); /* Indicate SOL/IDER usage */ if (e1000_check_reset_block(&adapter->hw)) device_printf(dev, "PHY reset is blocked due to SOL/IDER session.\n"); /* Determine if we have to control management hardware */ adapter->has_manage = e1000_enable_mng_pass_thru(&adapter->hw); /* * Setup Wake-on-Lan */ /* APME bit in EEPROM is mapped to WUC.APME */ eeprom_data = E1000_READ_REG(&adapter->hw, E1000_WUC) & E1000_WUC_APME; if (eeprom_data) adapter->wol = E1000_WUFC_MAG; /* Register for VLAN events */ adapter->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, igb_register_vlan, adapter, EVENTHANDLER_PRI_FIRST); adapter->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, igb_unregister_vlan, adapter, EVENTHANDLER_PRI_FIRST); igb_add_hw_stats(adapter); /* Tell the stack that the interface is not active */ adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; adapter->ifp->if_drv_flags |= IFF_DRV_OACTIVE; adapter->led_dev = led_create(igb_led_func, adapter, device_get_nameunit(dev)); /* ** Configure Interrupts */ if ((adapter->msix > 1) && (igb_enable_msix)) error = igb_allocate_msix(adapter); else /* MSI or Legacy */ error = igb_allocate_legacy(adapter); if (error) goto err_late; #ifdef DEV_NETMAP igb_netmap_attach(adapter); #endif /* DEV_NETMAP */ INIT_DEBUGOUT("igb_attach: end"); return (0); err_late: igb_detach(dev); igb_free_transmit_structures(adapter); igb_free_receive_structures(adapter); igb_release_hw_control(adapter); err_pci: igb_free_pci_resources(adapter); if (adapter->ifp != NULL) if_free(adapter->ifp); free(adapter->mta, M_DEVBUF); IGB_CORE_LOCK_DESTROY(adapter); return (error); } /********************************************************************* * Device removal routine * * The detach entry point is called when the driver is being removed. * This routine stops the adapter and deallocates all the resources * that were allocated for driver operation. * * return 0 on success, positive on failure *********************************************************************/ static int igb_detach(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct ifnet *ifp = adapter->ifp; INIT_DEBUGOUT("igb_detach: begin"); /* Make sure VLANS are not using driver */ if (adapter->ifp->if_vlantrunk != NULL) { device_printf(dev,"Vlan in use, detach first\n"); return (EBUSY); } ether_ifdetach(adapter->ifp); if (adapter->led_dev != NULL) led_destroy(adapter->led_dev); #ifdef DEVICE_POLLING if (ifp->if_capenable & IFCAP_POLLING) ether_poll_deregister(ifp); #endif IGB_CORE_LOCK(adapter); adapter->in_detach = 1; igb_stop(adapter); IGB_CORE_UNLOCK(adapter); e1000_phy_hw_reset(&adapter->hw); /* Give control back to firmware */ igb_release_manageability(adapter); igb_release_hw_control(adapter); if (adapter->wol) { E1000_WRITE_REG(&adapter->hw, E1000_WUC, E1000_WUC_PME_EN); E1000_WRITE_REG(&adapter->hw, E1000_WUFC, adapter->wol); igb_enable_wakeup(dev); } /* Unregister VLAN events */ if (adapter->vlan_attach != NULL) EVENTHANDLER_DEREGISTER(vlan_config, adapter->vlan_attach); if (adapter->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, adapter->vlan_detach); callout_drain(&adapter->timer); #ifdef DEV_NETMAP netmap_detach(adapter->ifp); #endif /* DEV_NETMAP */ igb_free_pci_resources(adapter); bus_generic_detach(dev); if_free(ifp); igb_free_transmit_structures(adapter); igb_free_receive_structures(adapter); if (adapter->mta != NULL) free(adapter->mta, M_DEVBUF); IGB_CORE_LOCK_DESTROY(adapter); return (0); } /********************************************************************* * * Shutdown entry point * **********************************************************************/ static int igb_shutdown(device_t dev) { return igb_suspend(dev); } /* * Suspend/resume device methods. */ static int igb_suspend(device_t dev) { struct adapter *adapter = device_get_softc(dev); IGB_CORE_LOCK(adapter); igb_stop(adapter); igb_release_manageability(adapter); igb_release_hw_control(adapter); if (adapter->wol) { E1000_WRITE_REG(&adapter->hw, E1000_WUC, E1000_WUC_PME_EN); E1000_WRITE_REG(&adapter->hw, E1000_WUFC, adapter->wol); igb_enable_wakeup(dev); } IGB_CORE_UNLOCK(adapter); return bus_generic_suspend(dev); } static int igb_resume(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct tx_ring *txr = adapter->tx_rings; struct ifnet *ifp = adapter->ifp; IGB_CORE_LOCK(adapter); igb_init_locked(adapter); igb_init_manageability(adapter); if ((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING) && adapter->link_active) { for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); #ifndef IGB_LEGACY_TX /* Process the stack queue only if not depleted */ if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && !drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); } } IGB_CORE_UNLOCK(adapter); return bus_generic_resume(dev); } #ifdef IGB_LEGACY_TX /********************************************************************* * Transmit entry point * * igb_start is called by the stack to initiate a transmit. * The driver will remain in this routine as long as there are * packets to transmit and transmit resources are available. * In case resources are not available stack is notified and * the packet is requeued. **********************************************************************/ static void igb_start_locked(struct tx_ring *txr, struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct mbuf *m_head; IGB_TX_LOCK_ASSERT(txr); if ((ifp->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; if (!adapter->link_active) return; /* Call cleanup if number of TX descriptors low */ if (txr->tx_avail <= IGB_TX_CLEANUP_THRESHOLD) igb_txeof(txr); while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { if (txr->tx_avail <= IGB_MAX_SCATTER) { txr->queue_status |= IGB_QUEUE_DEPLETED; break; } IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; /* * Encapsulation can modify our pointer, and or make it * NULL on failure. In that event, we can't requeue. */ if (igb_xmit(txr, &m_head)) { if (m_head != NULL) IFQ_DRV_PREPEND(&ifp->if_snd, m_head); if (txr->tx_avail <= IGB_MAX_SCATTER) txr->queue_status |= IGB_QUEUE_DEPLETED; break; } /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, m_head); /* Set watchdog on */ txr->watchdog_time = ticks; txr->queue_status |= IGB_QUEUE_WORKING; } } /* * Legacy TX driver routine, called from the * stack, always uses tx[0], and spins for it. * Should not be used with multiqueue tx */ static void igb_start(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = adapter->tx_rings; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IGB_TX_LOCK(txr); igb_start_locked(txr, ifp); IGB_TX_UNLOCK(txr); } return; } #else /* ~IGB_LEGACY_TX */ /* ** Multiqueue Transmit Entry: ** quick turnaround to the stack ** */ static int igb_mq_start(struct ifnet *ifp, struct mbuf *m) { struct adapter *adapter = ifp->if_softc; struct igb_queue *que; struct tx_ring *txr; int i, err = 0; #ifdef RSS uint32_t bucket_id; #endif /* Which queue to use */ /* * When doing RSS, map it to the same outbound queue * as the incoming flow would be mapped to. * * If everything is setup correctly, it should be the * same bucket that the current CPU we're on is. */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { #ifdef RSS if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), &bucket_id) == 0) { /* XXX TODO: spit out something if bucket_id > num_queues? */ i = bucket_id % adapter->num_queues; } else { #endif i = m->m_pkthdr.flowid % adapter->num_queues; #ifdef RSS } #endif } else { i = curcpu % adapter->num_queues; } txr = &adapter->tx_rings[i]; que = &adapter->queues[i]; err = drbr_enqueue(ifp, txr->br, m); if (err) return (err); if (IGB_TX_TRYLOCK(txr)) { igb_mq_start_locked(ifp, txr); IGB_TX_UNLOCK(txr); } else taskqueue_enqueue(que->tq, &txr->txq_task); return (0); } static int igb_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct mbuf *next; int err = 0, enq = 0; IGB_TX_LOCK_ASSERT(txr); if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) || adapter->link_active == 0) return (ENETDOWN); /* Process the queue */ while ((next = drbr_peek(ifp, txr->br)) != NULL) { if ((err = igb_xmit(txr, &next)) != 0) { if (next == NULL) { /* It was freed, move forward */ drbr_advance(ifp, txr->br); } else { /* * Still have one left, it may not be * the same since the transmit function * may have changed it. */ drbr_putback(ifp, txr->br, next); } break; } drbr_advance(ifp, txr->br); enq++; if (next->m_flags & M_MCAST && adapter->vf_ifp) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); ETHER_BPF_MTAP(ifp, next); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; } if (enq > 0) { /* Set the watchdog */ txr->queue_status |= IGB_QUEUE_WORKING; txr->watchdog_time = ticks; } if (txr->tx_avail <= IGB_TX_CLEANUP_THRESHOLD) igb_txeof(txr); if (txr->tx_avail <= IGB_MAX_SCATTER) txr->queue_status |= IGB_QUEUE_DEPLETED; return (err); } /* * Called from a taskqueue to drain queued transmit packets. */ static void igb_deferred_mq_start(void *arg, int pending) { struct tx_ring *txr = arg; struct adapter *adapter = txr->adapter; struct ifnet *ifp = adapter->ifp; IGB_TX_LOCK(txr); if (!drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); IGB_TX_UNLOCK(txr); } /* ** Flush all ring buffers */ static void igb_qflush(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = adapter->tx_rings; struct mbuf *m; for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); while ((m = buf_ring_dequeue_sc(txr->br)) != NULL) m_freem(m); IGB_TX_UNLOCK(txr); } if_qflush(ifp); } #endif /* ~IGB_LEGACY_TX */ /********************************************************************* * Ioctl entry point * * igb_ioctl is called when the user wants to configure the * interface. * * return 0 on success, positive on failure **********************************************************************/ static int igb_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct adapter *adapter = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; #if defined(INET) || defined(INET6) struct ifaddr *ifa = (struct ifaddr *)data; #endif bool avoid_reset = FALSE; int error = 0; if (adapter->in_detach) return (error); switch (command) { case SIOCSIFADDR: #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) avoid_reset = TRUE; #endif #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) avoid_reset = TRUE; #endif /* ** Calling init results in link renegotiation, ** so we avoid doing it when possible. */ if (avoid_reset) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) igb_init(adapter); #ifdef INET if (!(ifp->if_flags & IFF_NOARP)) arp_ifinit(ifp, ifa); #endif } else error = ether_ioctl(ifp, command, data); break; case SIOCSIFMTU: { int max_frame_size; IOCTL_DEBUGOUT("ioctl rcv'd: SIOCSIFMTU (Set Interface MTU)"); IGB_CORE_LOCK(adapter); max_frame_size = 9234; if (ifr->ifr_mtu > max_frame_size - ETHER_HDR_LEN - ETHER_CRC_LEN) { IGB_CORE_UNLOCK(adapter); error = EINVAL; break; } ifp->if_mtu = ifr->ifr_mtu; adapter->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; igb_init_locked(adapter); IGB_CORE_UNLOCK(adapter); break; } case SIOCSIFFLAGS: IOCTL_DEBUGOUT("ioctl rcv'd:\ SIOCSIFFLAGS (Set Interface Flags)"); IGB_CORE_LOCK(adapter); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { if ((ifp->if_flags ^ adapter->if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { igb_disable_promisc(adapter); igb_set_promisc(adapter); } } else igb_init_locked(adapter); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) igb_stop(adapter); adapter->if_flags = ifp->if_flags; IGB_CORE_UNLOCK(adapter); break; case SIOCADDMULTI: case SIOCDELMULTI: IOCTL_DEBUGOUT("ioctl rcv'd: SIOC(ADD|DEL)MULTI"); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IGB_CORE_LOCK(adapter); igb_disable_intr(adapter); igb_set_multi(adapter); #ifdef DEVICE_POLLING if (!(ifp->if_capenable & IFCAP_POLLING)) #endif igb_enable_intr(adapter); IGB_CORE_UNLOCK(adapter); } break; case SIOCSIFMEDIA: /* Check SOL/IDER usage */ IGB_CORE_LOCK(adapter); if (e1000_check_reset_block(&adapter->hw)) { IGB_CORE_UNLOCK(adapter); device_printf(adapter->dev, "Media change is" " blocked due to SOL/IDER session.\n"); break; } IGB_CORE_UNLOCK(adapter); case SIOCGIFMEDIA: IOCTL_DEBUGOUT("ioctl rcv'd: \ SIOCxIFMEDIA (Get/Set Interface Media)"); error = ifmedia_ioctl(ifp, ifr, &adapter->media, command); break; case SIOCSIFCAP: { int mask, reinit; IOCTL_DEBUGOUT("ioctl rcv'd: SIOCSIFCAP (Set Capabilities)"); reinit = 0; mask = ifr->ifr_reqcap ^ ifp->if_capenable; #ifdef DEVICE_POLLING if (mask & IFCAP_POLLING) { if (ifr->ifr_reqcap & IFCAP_POLLING) { error = ether_poll_register(igb_poll, ifp); if (error) return (error); IGB_CORE_LOCK(adapter); igb_disable_intr(adapter); ifp->if_capenable |= IFCAP_POLLING; IGB_CORE_UNLOCK(adapter); } else { error = ether_poll_deregister(ifp); /* Enable interrupt even in error case */ IGB_CORE_LOCK(adapter); igb_enable_intr(adapter); ifp->if_capenable &= ~IFCAP_POLLING; IGB_CORE_UNLOCK(adapter); } } #endif if (mask & IFCAP_HWCSUM) { ifp->if_capenable ^= IFCAP_HWCSUM; reinit = 1; } if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; reinit = 1; } if (mask & IFCAP_TSO6) { ifp->if_capenable ^= IFCAP_TSO6; reinit = 1; } if (mask & IFCAP_VLAN_HWTAGGING) { ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; reinit = 1; } if (mask & IFCAP_VLAN_HWFILTER) { ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; reinit = 1; } if (mask & IFCAP_VLAN_HWTSO) { ifp->if_capenable ^= IFCAP_VLAN_HWTSO; reinit = 1; } if (mask & IFCAP_LRO) { ifp->if_capenable ^= IFCAP_LRO; reinit = 1; } if (reinit && (ifp->if_drv_flags & IFF_DRV_RUNNING)) igb_init(adapter); VLAN_CAPABILITIES(ifp); break; } default: error = ether_ioctl(ifp, command, data); break; } return (error); } /********************************************************************* * Init entry point * * This routine is used in two ways. It is used by the stack as * init entry point in network interface structure. It is also used * by the driver as a hw/sw initialization routine to get to a * consistent state. * * return 0 on success, positive on failure **********************************************************************/ static void igb_init_locked(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; INIT_DEBUGOUT("igb_init: begin"); IGB_CORE_LOCK_ASSERT(adapter); igb_disable_intr(adapter); callout_stop(&adapter->timer); /* Get the latest mac address, User can use a LAA */ bcopy(IF_LLADDR(adapter->ifp), adapter->hw.mac.addr, ETHER_ADDR_LEN); /* Put the address into the Receive Address Array */ e1000_rar_set(&adapter->hw, adapter->hw.mac.addr, 0); igb_reset(adapter); igb_update_link_status(adapter); E1000_WRITE_REG(&adapter->hw, E1000_VET, ETHERTYPE_VLAN); /* Set hardware offload abilities */ ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TXCSUM) { ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); #if __FreeBSD_version >= 800000 if ((adapter->hw.mac.type == e1000_82576) || (adapter->hw.mac.type == e1000_82580)) ifp->if_hwassist |= CSUM_SCTP; #endif } if (ifp->if_capenable & IFCAP_TSO) ifp->if_hwassist |= CSUM_TSO; /* Configure for OS presence */ igb_init_manageability(adapter); /* Prepare transmit descriptors and buffers */ igb_setup_transmit_structures(adapter); igb_initialize_transmit_units(adapter); /* Setup Multicast table */ igb_set_multi(adapter); /* ** Figure out the desired mbuf pool ** for doing jumbo/packetsplit */ if (adapter->max_frame_size <= 2048) adapter->rx_mbuf_sz = MCLBYTES; else if (adapter->max_frame_size <= 4096) adapter->rx_mbuf_sz = MJUMPAGESIZE; else adapter->rx_mbuf_sz = MJUM9BYTES; /* Prepare receive descriptors and buffers */ if (igb_setup_receive_structures(adapter)) { device_printf(dev, "Could not setup receive structures\n"); return; } igb_initialize_receive_units(adapter); e1000_rx_fifo_flush_82575(&adapter->hw); /* Enable VLAN support */ if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) igb_setup_vlan_hw_support(adapter); /* Don't lose promiscuous settings */ igb_set_promisc(adapter); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; callout_reset(&adapter->timer, hz, igb_local_timer, adapter); e1000_clear_hw_cntrs_base_generic(&adapter->hw); if (adapter->msix > 1) /* Set up queue routing */ igb_configure_queues(adapter); /* this clears any pending interrupts */ E1000_READ_REG(&adapter->hw, E1000_ICR); #ifdef DEVICE_POLLING /* * Only enable interrupts if we are not polling, make sure * they are off otherwise. */ if (ifp->if_capenable & IFCAP_POLLING) igb_disable_intr(adapter); else #endif /* DEVICE_POLLING */ { igb_enable_intr(adapter); E1000_WRITE_REG(&adapter->hw, E1000_ICS, E1000_ICS_LSC); } /* Set Energy Efficient Ethernet */ if (adapter->hw.phy.media_type == e1000_media_type_copper) { if (adapter->hw.mac.type == e1000_i354) e1000_set_eee_i354(&adapter->hw); else e1000_set_eee_i350(&adapter->hw); } } static void igb_init(void *arg) { struct adapter *adapter = arg; IGB_CORE_LOCK(adapter); igb_init_locked(adapter); IGB_CORE_UNLOCK(adapter); } static void igb_handle_que(void *context, int pending) { struct igb_queue *que = context; struct adapter *adapter = que->adapter; struct tx_ring *txr = que->txr; struct ifnet *ifp = adapter->ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { bool more; more = igb_rxeof(que, adapter->rx_process_limit, NULL); IGB_TX_LOCK(txr); igb_txeof(txr); #ifndef IGB_LEGACY_TX /* Process the stack queue only if not depleted */ if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && !drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); /* Do we need another? */ if (more) { taskqueue_enqueue(que->tq, &que->que_task); return; } } #ifdef DEVICE_POLLING if (ifp->if_capenable & IFCAP_POLLING) return; #endif /* Reenable this interrupt */ if (que->eims) E1000_WRITE_REG(&adapter->hw, E1000_EIMS, que->eims); else igb_enable_intr(adapter); } /* Deal with link in a sleepable context */ static void igb_handle_link(void *context, int pending) { struct adapter *adapter = context; IGB_CORE_LOCK(adapter); igb_handle_link_locked(adapter); IGB_CORE_UNLOCK(adapter); } static void igb_handle_link_locked(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; struct ifnet *ifp = adapter->ifp; IGB_CORE_LOCK_ASSERT(adapter); adapter->hw.mac.get_link_status = 1; igb_update_link_status(adapter); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) && adapter->link_active) { for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); #ifndef IGB_LEGACY_TX /* Process the stack queue only if not depleted */ if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && !drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); } } } /********************************************************************* * * MSI/Legacy Deferred * Interrupt Service routine * *********************************************************************/ static int igb_irq_fast(void *arg) { struct adapter *adapter = arg; struct igb_queue *que = adapter->queues; u32 reg_icr; reg_icr = E1000_READ_REG(&adapter->hw, E1000_ICR); /* Hot eject? */ if (reg_icr == 0xffffffff) return FILTER_STRAY; /* Definitely not our interrupt. */ if (reg_icr == 0x0) return FILTER_STRAY; if ((reg_icr & E1000_ICR_INT_ASSERTED) == 0) return FILTER_STRAY; /* * Mask interrupts until the taskqueue is finished running. This is * cheap, just assume that it is needed. This also works around the * MSI message reordering errata on certain systems. */ igb_disable_intr(adapter); taskqueue_enqueue(que->tq, &que->que_task); /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) taskqueue_enqueue(que->tq, &adapter->link_task); if (reg_icr & E1000_ICR_RXO) adapter->rx_overruns++; return FILTER_HANDLED; } #ifdef DEVICE_POLLING #if __FreeBSD_version >= 800000 #define POLL_RETURN_COUNT(a) (a) static int #else #define POLL_RETURN_COUNT(a) static void #endif igb_poll(struct ifnet *ifp, enum poll_cmd cmd, int count) { struct adapter *adapter = ifp->if_softc; struct igb_queue *que; struct tx_ring *txr; u32 reg_icr, rx_done = 0; u32 loop = IGB_MAX_LOOP; bool more; IGB_CORE_LOCK(adapter); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { IGB_CORE_UNLOCK(adapter); return POLL_RETURN_COUNT(rx_done); } if (cmd == POLL_AND_CHECK_STATUS) { reg_icr = E1000_READ_REG(&adapter->hw, E1000_ICR); /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) igb_handle_link_locked(adapter); if (reg_icr & E1000_ICR_RXO) adapter->rx_overruns++; } IGB_CORE_UNLOCK(adapter); for (int i = 0; i < adapter->num_queues; i++) { que = &adapter->queues[i]; txr = que->txr; igb_rxeof(que, count, &rx_done); IGB_TX_LOCK(txr); do { more = igb_txeof(txr); } while (loop-- && more); #ifndef IGB_LEGACY_TX if (!drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); } return POLL_RETURN_COUNT(rx_done); } #endif /* DEVICE_POLLING */ /********************************************************************* * * MSIX Que Interrupt Service routine * **********************************************************************/ static void igb_msix_que(void *arg) { struct igb_queue *que = arg; struct adapter *adapter = que->adapter; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = que->txr; struct rx_ring *rxr = que->rxr; u32 newitr = 0; bool more_rx; /* Ignore spurious interrupts */ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; E1000_WRITE_REG(&adapter->hw, E1000_EIMC, que->eims); ++que->irqs; IGB_TX_LOCK(txr); igb_txeof(txr); #ifndef IGB_LEGACY_TX /* Process the stack queue only if not depleted */ if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && !drbr_empty(ifp, txr->br)) igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); more_rx = igb_rxeof(que, adapter->rx_process_limit, NULL); if (adapter->enable_aim == FALSE) goto no_calc; /* ** Do Adaptive Interrupt Moderation: ** - Write out last calculated setting ** - Calculate based on average size over ** the last interval. */ if (que->eitr_setting) E1000_WRITE_REG(&adapter->hw, E1000_EITR(que->msix), que->eitr_setting); que->eitr_setting = 0; /* Idle, do nothing */ if ((txr->bytes == 0) && (rxr->bytes == 0)) goto no_calc; /* Used half Default if sub-gig */ if (adapter->link_speed != 1000) newitr = IGB_DEFAULT_ITR / 2; else { if ((txr->bytes) && (txr->packets)) newitr = txr->bytes/txr->packets; if ((rxr->bytes) && (rxr->packets)) newitr = max(newitr, (rxr->bytes / rxr->packets)); newitr += 24; /* account for hardware frame, crc */ /* set an upper boundary */ newitr = min(newitr, 3000); /* Be nice to the mid range */ if ((newitr > 300) && (newitr < 1200)) newitr = (newitr / 3); else newitr = (newitr / 2); } newitr &= 0x7FFC; /* Mask invalid bits */ if (adapter->hw.mac.type == e1000_82575) newitr |= newitr << 16; else newitr |= E1000_EITR_CNT_IGNR; /* save for next interrupt */ que->eitr_setting = newitr; /* Reset state */ txr->bytes = 0; txr->packets = 0; rxr->bytes = 0; rxr->packets = 0; no_calc: /* Schedule a clean task if needed*/ if (more_rx) taskqueue_enqueue(que->tq, &que->que_task); else /* Reenable this interrupt */ E1000_WRITE_REG(&adapter->hw, E1000_EIMS, que->eims); return; } /********************************************************************* * * MSIX Link Interrupt Service routine * **********************************************************************/ static void igb_msix_link(void *arg) { struct adapter *adapter = arg; u32 icr; ++adapter->link_irq; icr = E1000_READ_REG(&adapter->hw, E1000_ICR); if (!(icr & E1000_ICR_LSC)) goto spurious; igb_handle_link(adapter, 0); spurious: /* Rearm */ E1000_WRITE_REG(&adapter->hw, E1000_IMS, E1000_IMS_LSC); E1000_WRITE_REG(&adapter->hw, E1000_EIMS, adapter->link_mask); return; } /********************************************************************* * * Media Ioctl callback * * This routine is called whenever the user queries the status of * the interface using ifconfig. * **********************************************************************/ static void igb_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { struct adapter *adapter = ifp->if_softc; INIT_DEBUGOUT("igb_media_status: begin"); IGB_CORE_LOCK(adapter); igb_update_link_status(adapter); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (!adapter->link_active) { IGB_CORE_UNLOCK(adapter); return; } ifmr->ifm_status |= IFM_ACTIVE; switch (adapter->link_speed) { case 10: ifmr->ifm_active |= IFM_10_T; break; case 100: /* ** Support for 100Mb SFP - these are Fiber ** but the media type appears as serdes */ if (adapter->hw.phy.media_type == e1000_media_type_internal_serdes) ifmr->ifm_active |= IFM_100_FX; else ifmr->ifm_active |= IFM_100_TX; break; case 1000: ifmr->ifm_active |= IFM_1000_T; break; case 2500: ifmr->ifm_active |= IFM_2500_SX; break; } if (adapter->link_duplex == FULL_DUPLEX) ifmr->ifm_active |= IFM_FDX; else ifmr->ifm_active |= IFM_HDX; IGB_CORE_UNLOCK(adapter); } /********************************************************************* * * Media Ioctl callback * * This routine is called when the user changes speed/duplex using * media/mediopt option with ifconfig. * **********************************************************************/ static int igb_media_change(struct ifnet *ifp) { struct adapter *adapter = ifp->if_softc; struct ifmedia *ifm = &adapter->media; INIT_DEBUGOUT("igb_media_change: begin"); if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); IGB_CORE_LOCK(adapter); switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: adapter->hw.mac.autoneg = DO_AUTO_NEG; adapter->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT; break; case IFM_1000_LX: case IFM_1000_SX: case IFM_1000_T: adapter->hw.mac.autoneg = DO_AUTO_NEG; adapter->hw.phy.autoneg_advertised = ADVERTISE_1000_FULL; break; case IFM_100_TX: adapter->hw.mac.autoneg = FALSE; adapter->hw.phy.autoneg_advertised = 0; if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX) adapter->hw.mac.forced_speed_duplex = ADVERTISE_100_FULL; else adapter->hw.mac.forced_speed_duplex = ADVERTISE_100_HALF; break; case IFM_10_T: adapter->hw.mac.autoneg = FALSE; adapter->hw.phy.autoneg_advertised = 0; if ((ifm->ifm_media & IFM_GMASK) == IFM_FDX) adapter->hw.mac.forced_speed_duplex = ADVERTISE_10_FULL; else adapter->hw.mac.forced_speed_duplex = ADVERTISE_10_HALF; break; default: device_printf(adapter->dev, "Unsupported media type\n"); } igb_init_locked(adapter); IGB_CORE_UNLOCK(adapter); return (0); } /********************************************************************* * * This routine maps the mbufs to Advanced TX descriptors. * **********************************************************************/ static int igb_xmit(struct tx_ring *txr, struct mbuf **m_headp) { struct adapter *adapter = txr->adapter; u32 olinfo_status = 0, cmd_type_len; int i, j, error, nsegs; int first; bool remap = TRUE; struct mbuf *m_head; bus_dma_segment_t segs[IGB_MAX_SCATTER]; bus_dmamap_t map; struct igb_tx_buf *txbuf; union e1000_adv_tx_desc *txd = NULL; m_head = *m_headp; /* Basic descriptor defines */ cmd_type_len = (E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT); if (m_head->m_flags & M_VLANTAG) cmd_type_len |= E1000_ADVTXD_DCMD_VLE; /* * Important to capture the first descriptor * used because it will contain the index of * the one we tell the hardware to report back */ first = txr->next_avail_desc; txbuf = &txr->tx_buffers[first]; map = txbuf->map; /* * Map the packet for DMA. */ retry: error = bus_dmamap_load_mbuf_sg(txr->txtag, map, *m_headp, segs, &nsegs, BUS_DMA_NOWAIT); if (__predict_false(error)) { struct mbuf *m; switch (error) { case EFBIG: /* Try it again? - one try */ if (remap == TRUE) { remap = FALSE; m = m_collapse(*m_headp, M_NOWAIT, IGB_MAX_SCATTER); if (m == NULL) { adapter->mbuf_defrag_failed++; m_freem(*m_headp); *m_headp = NULL; return (ENOBUFS); } *m_headp = m; goto retry; } else return (error); default: txr->no_tx_dma_setup++; m_freem(*m_headp); *m_headp = NULL; return (error); } } /* Make certain there are enough descriptors */ if (nsegs > txr->tx_avail - 2) { txr->no_desc_avail++; bus_dmamap_unload(txr->txtag, map); return (ENOBUFS); } m_head = *m_headp; /* ** Set up the appropriate offload context ** this will consume the first descriptor */ error = igb_tx_ctx_setup(txr, m_head, &cmd_type_len, &olinfo_status); if (__predict_false(error)) { m_freem(*m_headp); *m_headp = NULL; return (error); } /* 82575 needs the queue index added */ if (adapter->hw.mac.type == e1000_82575) olinfo_status |= txr->me << 4; i = txr->next_avail_desc; for (j = 0; j < nsegs; j++) { bus_size_t seglen; bus_addr_t segaddr; txbuf = &txr->tx_buffers[i]; txd = &txr->tx_base[i]; seglen = segs[j].ds_len; segaddr = htole64(segs[j].ds_addr); txd->read.buffer_addr = segaddr; txd->read.cmd_type_len = htole32(E1000_TXD_CMD_IFCS | cmd_type_len | seglen); txd->read.olinfo_status = htole32(olinfo_status); if (++i == txr->num_desc) i = 0; } txd->read.cmd_type_len |= htole32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS); txr->tx_avail -= nsegs; txr->next_avail_desc = i; txbuf->m_head = m_head; /* ** Here we swap the map so the last descriptor, ** which gets the completion interrupt has the ** real map, and the first descriptor gets the ** unused map from this descriptor. */ txr->tx_buffers[first].map = txbuf->map; txbuf->map = map; bus_dmamap_sync(txr->txtag, map, BUS_DMASYNC_PREWRITE); /* Set the EOP descriptor that will be marked done */ txbuf = &txr->tx_buffers[first]; txbuf->eop = txd; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * Advance the Transmit Descriptor Tail (Tdt), this tells the * hardware that this frame is available to transmit. */ ++txr->total_packets; E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), i); return (0); } static void igb_set_promisc(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; struct e1000_hw *hw = &adapter->hw; u32 reg; if (adapter->vf_ifp) { e1000_promisc_set_vf(hw, e1000_promisc_enabled); return; } reg = E1000_READ_REG(hw, E1000_RCTL); if (ifp->if_flags & IFF_PROMISC) { reg |= (E1000_RCTL_UPE | E1000_RCTL_MPE); E1000_WRITE_REG(hw, E1000_RCTL, reg); } else if (ifp->if_flags & IFF_ALLMULTI) { reg |= E1000_RCTL_MPE; reg &= ~E1000_RCTL_UPE; E1000_WRITE_REG(hw, E1000_RCTL, reg); } } static void igb_disable_promisc(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; u32 reg; int mcnt = 0; if (adapter->vf_ifp) { e1000_promisc_set_vf(hw, e1000_promisc_disabled); return; } reg = E1000_READ_REG(hw, E1000_RCTL); reg &= (~E1000_RCTL_UPE); if (ifp->if_flags & IFF_ALLMULTI) mcnt = MAX_NUM_MULTICAST_ADDRESSES; else { struct ifmultiaddr *ifma; #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif } /* Don't disable if in MAX groups */ if (mcnt < MAX_NUM_MULTICAST_ADDRESSES) reg &= (~E1000_RCTL_MPE); E1000_WRITE_REG(hw, E1000_RCTL, reg); } /********************************************************************* * Multicast Update * * This routine is called whenever multicast address list is updated. * **********************************************************************/ static void igb_set_multi(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; struct ifmultiaddr *ifma; u32 reg_rctl = 0; u8 *mta; int mcnt = 0; IOCTL_DEBUGOUT("igb_set_multi: begin"); mta = adapter->mta; bzero(mta, sizeof(uint8_t) * ETH_ADDR_LEN * MAX_NUM_MULTICAST_ADDRESSES); #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &mta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif if (mcnt >= MAX_NUM_MULTICAST_ADDRESSES) { reg_rctl = E1000_READ_REG(&adapter->hw, E1000_RCTL); reg_rctl |= E1000_RCTL_MPE; E1000_WRITE_REG(&adapter->hw, E1000_RCTL, reg_rctl); } else e1000_update_mc_addr_list(&adapter->hw, mta, mcnt); } /********************************************************************* * Timer routine: * This routine checks for link status, * updates statistics, and does the watchdog. * **********************************************************************/ static void igb_local_timer(void *arg) { struct adapter *adapter = arg; device_t dev = adapter->dev; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; struct igb_queue *que = adapter->queues; int hung = 0, busy = 0; IGB_CORE_LOCK_ASSERT(adapter); igb_update_link_status(adapter); igb_update_stats_counters(adapter); /* ** Check the TX queues status ** - central locked handling of OACTIVE ** - watchdog only if all queues show hung */ for (int i = 0; i < adapter->num_queues; i++, que++, txr++) { if ((txr->queue_status & IGB_QUEUE_HUNG) && (adapter->pause_frames == 0)) ++hung; if (txr->queue_status & IGB_QUEUE_DEPLETED) ++busy; if ((txr->queue_status & IGB_QUEUE_IDLE) == 0) taskqueue_enqueue(que->tq, &que->que_task); } if (hung == adapter->num_queues) goto timeout; if (busy == adapter->num_queues) ifp->if_drv_flags |= IFF_DRV_OACTIVE; else if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) && (busy < adapter->num_queues)) ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; adapter->pause_frames = 0; callout_reset(&adapter->timer, hz, igb_local_timer, adapter); #ifndef DEVICE_POLLING /* Schedule all queue interrupts - deadlock protection */ E1000_WRITE_REG(&adapter->hw, E1000_EICS, adapter->que_mask); #endif return; timeout: device_printf(adapter->dev, "Watchdog timeout -- resetting\n"); device_printf(dev,"Queue(%d) tdh = %d, hw tdt = %d\n", txr->me, E1000_READ_REG(&adapter->hw, E1000_TDH(txr->me)), E1000_READ_REG(&adapter->hw, E1000_TDT(txr->me))); device_printf(dev,"TX(%d) desc avail = %d," "Next TX to Clean = %d\n", txr->me, txr->tx_avail, txr->next_to_clean); adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; adapter->watchdog_events++; igb_init_locked(adapter); } static void igb_update_link_status(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct e1000_fc_info *fc = &hw->fc; struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; struct tx_ring *txr = adapter->tx_rings; u32 link_check, thstat, ctrl; char *flowctl = NULL; link_check = thstat = ctrl = 0; /* Get the cached link value or read for real */ switch (hw->phy.media_type) { case e1000_media_type_copper: if (hw->mac.get_link_status) { /* Do the work to read phy */ e1000_check_for_link(hw); link_check = !hw->mac.get_link_status; } else link_check = TRUE; break; case e1000_media_type_fiber: e1000_check_for_link(hw); link_check = (E1000_READ_REG(hw, E1000_STATUS) & E1000_STATUS_LU); break; case e1000_media_type_internal_serdes: e1000_check_for_link(hw); link_check = adapter->hw.mac.serdes_has_link; break; /* VF device is type_unknown */ case e1000_media_type_unknown: e1000_check_for_link(hw); link_check = !hw->mac.get_link_status; /* Fall thru */ default: break; } /* Check for thermal downshift or shutdown */ if (hw->mac.type == e1000_i350) { thstat = E1000_READ_REG(hw, E1000_THSTAT); ctrl = E1000_READ_REG(hw, E1000_CTRL_EXT); } /* Get the flow control for display */ switch (fc->current_mode) { case e1000_fc_rx_pause: flowctl = "RX"; break; case e1000_fc_tx_pause: flowctl = "TX"; break; case e1000_fc_full: flowctl = "Full"; break; case e1000_fc_none: default: flowctl = "None"; break; } /* Now we check if a transition has happened */ if (link_check && (adapter->link_active == 0)) { e1000_get_speed_and_duplex(&adapter->hw, &adapter->link_speed, &adapter->link_duplex); if (bootverbose) device_printf(dev, "Link is up %d Mbps %s," " Flow Control: %s\n", adapter->link_speed, ((adapter->link_duplex == FULL_DUPLEX) ? "Full Duplex" : "Half Duplex"), flowctl); adapter->link_active = 1; ifp->if_baudrate = adapter->link_speed * 1000000; if ((ctrl & E1000_CTRL_EXT_LINK_MODE_GMII) && (thstat & E1000_THSTAT_LINK_THROTTLE)) device_printf(dev, "Link: thermal downshift\n"); /* Delay Link Up for Phy update */ if (((hw->mac.type == e1000_i210) || (hw->mac.type == e1000_i211)) && (hw->phy.id == I210_I_PHY_ID)) msec_delay(I210_LINK_DELAY); /* Reset if the media type changed. */ if (hw->dev_spec._82575.media_changed) { hw->dev_spec._82575.media_changed = false; adapter->flags |= IGB_MEDIA_RESET; igb_reset(adapter); } /* This can sleep */ if_link_state_change(ifp, LINK_STATE_UP); } else if (!link_check && (adapter->link_active == 1)) { ifp->if_baudrate = adapter->link_speed = 0; adapter->link_duplex = 0; if (bootverbose) device_printf(dev, "Link is Down\n"); if ((ctrl & E1000_CTRL_EXT_LINK_MODE_GMII) && (thstat & E1000_THSTAT_PWR_DOWN)) device_printf(dev, "Link: thermal shutdown\n"); adapter->link_active = 0; /* This can sleep */ if_link_state_change(ifp, LINK_STATE_DOWN); /* Reset queue state */ for (int i = 0; i < adapter->num_queues; i++, txr++) txr->queue_status = IGB_QUEUE_IDLE; } } /********************************************************************* * * This routine disables all traffic on the adapter by issuing a * global reset on the MAC and deallocates TX/RX buffers. * **********************************************************************/ static void igb_stop(void *arg) { struct adapter *adapter = arg; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; IGB_CORE_LOCK_ASSERT(adapter); INIT_DEBUGOUT("igb_stop: begin"); igb_disable_intr(adapter); callout_stop(&adapter->timer); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; ifp->if_drv_flags |= IFF_DRV_OACTIVE; /* Disarm watchdog timer. */ for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); txr->queue_status = IGB_QUEUE_IDLE; IGB_TX_UNLOCK(txr); } e1000_reset_hw(&adapter->hw); E1000_WRITE_REG(&adapter->hw, E1000_WUC, 0); e1000_led_off(&adapter->hw); e1000_cleanup_led(&adapter->hw); } /********************************************************************* * * Determine hardware revision. * **********************************************************************/ static void igb_identify_hardware(struct adapter *adapter) { device_t dev = adapter->dev; /* Make sure our PCI config space has the necessary stuff set */ pci_enable_busmaster(dev); adapter->hw.bus.pci_cmd_word = pci_read_config(dev, PCIR_COMMAND, 2); /* Save off the information about this board */ adapter->hw.vendor_id = pci_get_vendor(dev); adapter->hw.device_id = pci_get_device(dev); adapter->hw.revision_id = pci_read_config(dev, PCIR_REVID, 1); adapter->hw.subsystem_vendor_id = pci_read_config(dev, PCIR_SUBVEND_0, 2); adapter->hw.subsystem_device_id = pci_read_config(dev, PCIR_SUBDEV_0, 2); /* Set MAC type early for PCI setup */ e1000_set_mac_type(&adapter->hw); /* Are we a VF device? */ if ((adapter->hw.mac.type == e1000_vfadapt) || (adapter->hw.mac.type == e1000_vfadapt_i350)) adapter->vf_ifp = 1; else adapter->vf_ifp = 0; } static int igb_allocate_pci_resources(struct adapter *adapter) { device_t dev = adapter->dev; int rid; rid = PCIR_BAR(0); adapter->pci_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (adapter->pci_mem == NULL) { device_printf(dev, "Unable to allocate bus resource: memory\n"); return (ENXIO); } adapter->osdep.mem_bus_space_tag = rman_get_bustag(adapter->pci_mem); adapter->osdep.mem_bus_space_handle = rman_get_bushandle(adapter->pci_mem); adapter->hw.hw_addr = (u8 *)&adapter->osdep.mem_bus_space_handle; adapter->num_queues = 1; /* Defaults for Legacy or MSI */ /* This will setup either MSI/X or MSI */ adapter->msix = igb_setup_msix(adapter); adapter->hw.back = &adapter->osdep; return (0); } /********************************************************************* * * Setup the Legacy or MSI Interrupt handler * **********************************************************************/ static int igb_allocate_legacy(struct adapter *adapter) { device_t dev = adapter->dev; struct igb_queue *que = adapter->queues; #ifndef IGB_LEGACY_TX struct tx_ring *txr = adapter->tx_rings; #endif int error, rid = 0; /* Turn off all interrupts */ E1000_WRITE_REG(&adapter->hw, E1000_IMC, 0xffffffff); /* MSI RID is 1 */ if (adapter->msix == 1) rid = 1; /* We allocate a single interrupt resource */ adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (adapter->res == NULL) { device_printf(dev, "Unable to allocate bus resource: " "interrupt\n"); return (ENXIO); } #ifndef IGB_LEGACY_TX TASK_INIT(&txr->txq_task, 0, igb_deferred_mq_start, txr); #endif /* * Try allocating a fast interrupt and the associated deferred * processing contexts. */ TASK_INIT(&que->que_task, 0, igb_handle_que, que); /* Make tasklet for deferred link handling */ TASK_INIT(&adapter->link_task, 0, igb_handle_link, adapter); que->tq = taskqueue_create_fast("igb_taskq", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); taskqueue_start_threads(&que->tq, 1, PI_NET, "%s taskq", device_get_nameunit(adapter->dev)); if ((error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, igb_irq_fast, NULL, adapter, &adapter->tag)) != 0) { device_printf(dev, "Failed to register fast interrupt " "handler: %d\n", error); taskqueue_free(que->tq); que->tq = NULL; return (error); } return (0); } /********************************************************************* * * Setup the MSIX Queue Interrupt handlers: * **********************************************************************/ static int igb_allocate_msix(struct adapter *adapter) { device_t dev = adapter->dev; struct igb_queue *que = adapter->queues; int error, rid, vector = 0; int cpu_id = 0; #ifdef RSS cpuset_t cpu_mask; #endif /* Be sure to start with all interrupts disabled */ E1000_WRITE_REG(&adapter->hw, E1000_IMC, ~0); E1000_WRITE_FLUSH(&adapter->hw); #ifdef RSS /* * If we're doing RSS, the number of queues needs to * match the number of RSS buckets that are configured. * * + If there's more queues than RSS buckets, we'll end * up with queues that get no traffic. * * + If there's more RSS buckets than queues, we'll end * up having multiple RSS buckets map to the same queue, * so there'll be some contention. */ if (adapter->num_queues != rss_getnumbuckets()) { device_printf(dev, "%s: number of queues (%d) != number of RSS buckets (%d)" "; performance will be impacted.\n", __func__, adapter->num_queues, rss_getnumbuckets()); } #endif for (int i = 0; i < adapter->num_queues; i++, vector++, que++) { rid = vector +1; que->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (que->res == NULL) { device_printf(dev, "Unable to allocate bus resource: " "MSIX Queue Interrupt\n"); return (ENXIO); } error = bus_setup_intr(dev, que->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, igb_msix_que, que, &que->tag); if (error) { que->res = NULL; device_printf(dev, "Failed to register Queue handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, que->res, que->tag, "que %d", i); #endif que->msix = vector; if (adapter->hw.mac.type == e1000_82575) que->eims = E1000_EICR_TX_QUEUE0 << i; else que->eims = 1 << vector; #ifdef RSS /* * The queue ID is used as the RSS layer bucket ID. * We look up the queue ID -> RSS CPU ID and select * that. */ cpu_id = rss_getcpu(i % rss_getnumbuckets()); #else /* * Bind the msix vector, and thus the * rings to the corresponding cpu. * * This just happens to match the default RSS round-robin * bucket -> queue -> CPU allocation. */ if (adapter->num_queues > 1) { if (igb_last_bind_cpu < 0) igb_last_bind_cpu = CPU_FIRST(); cpu_id = igb_last_bind_cpu; } #endif if (adapter->num_queues > 1) { bus_bind_intr(dev, que->res, cpu_id); #ifdef RSS device_printf(dev, "Bound queue %d to RSS bucket %d\n", i, cpu_id); #else device_printf(dev, "Bound queue %d to cpu %d\n", i, cpu_id); #endif } #ifndef IGB_LEGACY_TX TASK_INIT(&que->txr->txq_task, 0, igb_deferred_mq_start, que->txr); #endif /* Make tasklet for deferred handling */ TASK_INIT(&que->que_task, 0, igb_handle_que, que); que->tq = taskqueue_create("igb_que", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); if (adapter->num_queues > 1) { /* * Only pin the taskqueue thread to a CPU if * RSS is in use. * * This again just happens to match the default RSS * round-robin bucket -> queue -> CPU allocation. */ #ifdef RSS CPU_SETOF(cpu_id, &cpu_mask); taskqueue_start_threads_cpuset(&que->tq, 1, PI_NET, &cpu_mask, "%s que (bucket %d)", device_get_nameunit(adapter->dev), cpu_id); #else taskqueue_start_threads(&que->tq, 1, PI_NET, "%s que (qid %d)", device_get_nameunit(adapter->dev), cpu_id); #endif } else { taskqueue_start_threads(&que->tq, 1, PI_NET, "%s que", device_get_nameunit(adapter->dev)); } /* Finally update the last bound CPU id */ if (adapter->num_queues > 1) igb_last_bind_cpu = CPU_NEXT(igb_last_bind_cpu); } /* And Link */ rid = vector + 1; adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (adapter->res == NULL) { device_printf(dev, "Unable to allocate bus resource: " "MSIX Link Interrupt\n"); return (ENXIO); } if ((error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, igb_msix_link, adapter, &adapter->tag)) != 0) { device_printf(dev, "Failed to register Link handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, adapter->res, adapter->tag, "link"); #endif adapter->linkvec = vector; return (0); } static void igb_configure_queues(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct igb_queue *que; u32 tmp, ivar = 0, newitr = 0; /* First turn on RSS capability */ if (adapter->hw.mac.type != e1000_82575) E1000_WRITE_REG(hw, E1000_GPIE, E1000_GPIE_MSIX_MODE | E1000_GPIE_EIAME | E1000_GPIE_PBA | E1000_GPIE_NSICR); /* Turn on MSIX */ switch (adapter->hw.mac.type) { case e1000_82580: case e1000_i350: case e1000_i354: case e1000_i210: case e1000_i211: case e1000_vfadapt: case e1000_vfadapt_i350: /* RX entries */ for (int i = 0; i < adapter->num_queues; i++) { u32 index = i >> 1; ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index); que = &adapter->queues[i]; if (i & 1) { ivar &= 0xFF00FFFF; ivar |= (que->msix | E1000_IVAR_VALID) << 16; } else { ivar &= 0xFFFFFF00; ivar |= que->msix | E1000_IVAR_VALID; } E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar); } /* TX entries */ for (int i = 0; i < adapter->num_queues; i++) { u32 index = i >> 1; ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index); que = &adapter->queues[i]; if (i & 1) { ivar &= 0x00FFFFFF; ivar |= (que->msix | E1000_IVAR_VALID) << 24; } else { ivar &= 0xFFFF00FF; ivar |= (que->msix | E1000_IVAR_VALID) << 8; } E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar); adapter->que_mask |= que->eims; } /* And for the link interrupt */ ivar = (adapter->linkvec | E1000_IVAR_VALID) << 8; adapter->link_mask = 1 << adapter->linkvec; E1000_WRITE_REG(hw, E1000_IVAR_MISC, ivar); break; case e1000_82576: /* RX entries */ for (int i = 0; i < adapter->num_queues; i++) { u32 index = i & 0x7; /* Each IVAR has two entries */ ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index); que = &adapter->queues[i]; if (i < 8) { ivar &= 0xFFFFFF00; ivar |= que->msix | E1000_IVAR_VALID; } else { ivar &= 0xFF00FFFF; ivar |= (que->msix | E1000_IVAR_VALID) << 16; } E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar); adapter->que_mask |= que->eims; } /* TX entries */ for (int i = 0; i < adapter->num_queues; i++) { u32 index = i & 0x7; /* Each IVAR has two entries */ ivar = E1000_READ_REG_ARRAY(hw, E1000_IVAR0, index); que = &adapter->queues[i]; if (i < 8) { ivar &= 0xFFFF00FF; ivar |= (que->msix | E1000_IVAR_VALID) << 8; } else { ivar &= 0x00FFFFFF; ivar |= (que->msix | E1000_IVAR_VALID) << 24; } E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar); adapter->que_mask |= que->eims; } /* And for the link interrupt */ ivar = (adapter->linkvec | E1000_IVAR_VALID) << 8; adapter->link_mask = 1 << adapter->linkvec; E1000_WRITE_REG(hw, E1000_IVAR_MISC, ivar); break; case e1000_82575: /* enable MSI-X support*/ tmp = E1000_READ_REG(hw, E1000_CTRL_EXT); tmp |= E1000_CTRL_EXT_PBA_CLR; /* Auto-Mask interrupts upon ICR read. */ tmp |= E1000_CTRL_EXT_EIAME; tmp |= E1000_CTRL_EXT_IRCA; E1000_WRITE_REG(hw, E1000_CTRL_EXT, tmp); /* Queues */ for (int i = 0; i < adapter->num_queues; i++) { que = &adapter->queues[i]; tmp = E1000_EICR_RX_QUEUE0 << i; tmp |= E1000_EICR_TX_QUEUE0 << i; que->eims = tmp; E1000_WRITE_REG_ARRAY(hw, E1000_MSIXBM(0), i, que->eims); adapter->que_mask |= que->eims; } /* Link */ E1000_WRITE_REG(hw, E1000_MSIXBM(adapter->linkvec), E1000_EIMS_OTHER); adapter->link_mask |= E1000_EIMS_OTHER; default: break; } /* Set the starting interrupt rate */ if (igb_max_interrupt_rate > 0) newitr = (4000000 / igb_max_interrupt_rate) & 0x7FFC; if (hw->mac.type == e1000_82575) newitr |= newitr << 16; else newitr |= E1000_EITR_CNT_IGNR; for (int i = 0; i < adapter->num_queues; i++) { que = &adapter->queues[i]; E1000_WRITE_REG(hw, E1000_EITR(que->msix), newitr); } return; } static void igb_free_pci_resources(struct adapter *adapter) { struct igb_queue *que = adapter->queues; device_t dev = adapter->dev; int rid; /* ** There is a slight possibility of a failure mode ** in attach that will result in entering this function ** before interrupt resources have been initialized, and ** in that case we do not want to execute the loops below ** We can detect this reliably by the state of the adapter ** res pointer. */ if (adapter->res == NULL) goto mem; /* * First release all the interrupt resources: */ for (int i = 0; i < adapter->num_queues; i++, que++) { rid = que->msix + 1; if (que->tag != NULL) { bus_teardown_intr(dev, que->res, que->tag); que->tag = NULL; } if (que->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, que->res); } /* Clean the Legacy or Link interrupt last */ if (adapter->linkvec) /* we are doing MSIX */ rid = adapter->linkvec + 1; else (adapter->msix != 0) ? (rid = 1):(rid = 0); que = adapter->queues; if (adapter->tag != NULL) { taskqueue_drain(que->tq, &adapter->link_task); bus_teardown_intr(dev, adapter->res, adapter->tag); adapter->tag = NULL; } if (adapter->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, adapter->res); for (int i = 0; i < adapter->num_queues; i++, que++) { if (que->tq != NULL) { #ifndef IGB_LEGACY_TX taskqueue_drain(que->tq, &que->txr->txq_task); #endif taskqueue_drain(que->tq, &que->que_task); taskqueue_free(que->tq); } } mem: if (adapter->msix) pci_release_msi(dev); if (adapter->msix_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, adapter->memrid, adapter->msix_mem); if (adapter->pci_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(0), adapter->pci_mem); } /* * Setup Either MSI/X or MSI */ static int igb_setup_msix(struct adapter *adapter) { device_t dev = adapter->dev; int bar, want, queues, msgs, maxqueues; /* tuneable override */ if (igb_enable_msix == 0) goto msi; /* First try MSI/X */ msgs = pci_msix_count(dev); if (msgs == 0) goto msi; /* ** Some new devices, as with ixgbe, now may ** use a different BAR, so we need to keep ** track of which is used. */ adapter->memrid = PCIR_BAR(IGB_MSIX_BAR); bar = pci_read_config(dev, adapter->memrid, 4); if (bar == 0) /* use next bar */ adapter->memrid += 4; adapter->msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &adapter->memrid, RF_ACTIVE); if (adapter->msix_mem == NULL) { /* May not be enabled */ device_printf(adapter->dev, "Unable to map MSIX table \n"); goto msi; } queues = (mp_ncpus > (msgs-1)) ? (msgs-1) : mp_ncpus; /* Override via tuneable */ if (igb_num_queues != 0) queues = igb_num_queues; #ifdef RSS /* If we're doing RSS, clamp at the number of RSS buckets */ if (queues > rss_getnumbuckets()) queues = rss_getnumbuckets(); #endif /* Sanity check based on HW */ switch (adapter->hw.mac.type) { case e1000_82575: maxqueues = 4; break; case e1000_82576: case e1000_82580: case e1000_i350: case e1000_i354: maxqueues = 8; break; case e1000_i210: maxqueues = 4; break; case e1000_i211: maxqueues = 2; break; default: /* VF interfaces */ maxqueues = 1; break; } /* Final clamp on the actual hardware capability */ if (queues > maxqueues) queues = maxqueues; /* ** One vector (RX/TX pair) per queue ** plus an additional for Link interrupt */ want = queues + 1; if (msgs >= want) msgs = want; else { device_printf(adapter->dev, "MSIX Configuration Problem, " "%d vectors configured, but %d queues wanted!\n", msgs, want); goto msi; } if ((pci_alloc_msix(dev, &msgs) == 0) && (msgs == want)) { device_printf(adapter->dev, "Using MSIX interrupts with %d vectors\n", msgs); adapter->num_queues = queues; return (msgs); } /* ** If MSIX alloc failed or provided us with ** less than needed, free and fall through to MSI */ pci_release_msi(dev); msi: if (adapter->msix_mem != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(IGB_MSIX_BAR), adapter->msix_mem); adapter->msix_mem = NULL; } msgs = 1; if (pci_alloc_msi(dev, &msgs) == 0) { device_printf(adapter->dev," Using an MSI interrupt\n"); return (msgs); } device_printf(adapter->dev," Using a Legacy interrupt\n"); return (0); } /********************************************************************* * * Initialize the DMA Coalescing feature * **********************************************************************/ static void igb_init_dmac(struct adapter *adapter, u32 pba) { device_t dev = adapter->dev; struct e1000_hw *hw = &adapter->hw; u32 dmac, reg = ~E1000_DMACR_DMAC_EN; u16 hwm; if (hw->mac.type == e1000_i211) return; if (hw->mac.type > e1000_82580) { if (adapter->dmac == 0) { /* Disabling it */ E1000_WRITE_REG(hw, E1000_DMACR, reg); return; } else device_printf(dev, "DMA Coalescing enabled\n"); /* Set starting threshold */ E1000_WRITE_REG(hw, E1000_DMCTXTH, 0); hwm = 64 * pba - adapter->max_frame_size / 16; if (hwm < 64 * (pba - 6)) hwm = 64 * (pba - 6); reg = E1000_READ_REG(hw, E1000_FCRTC); reg &= ~E1000_FCRTC_RTH_COAL_MASK; reg |= ((hwm << E1000_FCRTC_RTH_COAL_SHIFT) & E1000_FCRTC_RTH_COAL_MASK); E1000_WRITE_REG(hw, E1000_FCRTC, reg); dmac = pba - adapter->max_frame_size / 512; if (dmac < pba - 10) dmac = pba - 10; reg = E1000_READ_REG(hw, E1000_DMACR); reg &= ~E1000_DMACR_DMACTHR_MASK; reg = ((dmac << E1000_DMACR_DMACTHR_SHIFT) & E1000_DMACR_DMACTHR_MASK); /* transition to L0x or L1 if available..*/ reg |= (E1000_DMACR_DMAC_EN | E1000_DMACR_DMAC_LX_MASK); /* Check if status is 2.5Gb backplane connection * before configuration of watchdog timer, which is * in msec values in 12.8usec intervals * watchdog timer= msec values in 32usec intervals * for non 2.5Gb connection */ if (hw->mac.type == e1000_i354) { int status = E1000_READ_REG(hw, E1000_STATUS); if ((status & E1000_STATUS_2P5_SKU) && (!(status & E1000_STATUS_2P5_SKU_OVER))) reg |= ((adapter->dmac * 5) >> 6); else reg |= (adapter->dmac >> 5); } else { reg |= (adapter->dmac >> 5); } E1000_WRITE_REG(hw, E1000_DMACR, reg); E1000_WRITE_REG(hw, E1000_DMCRTRH, 0); /* Set the interval before transition */ reg = E1000_READ_REG(hw, E1000_DMCTLX); if (hw->mac.type == e1000_i350) reg |= IGB_DMCTLX_DCFLUSH_DIS; /* ** in 2.5Gb connection, TTLX unit is 0.4 usec ** which is 0x4*2 = 0xA. But delay is still 4 usec */ if (hw->mac.type == e1000_i354) { int status = E1000_READ_REG(hw, E1000_STATUS); if ((status & E1000_STATUS_2P5_SKU) && (!(status & E1000_STATUS_2P5_SKU_OVER))) reg |= 0xA; else reg |= 0x4; } else { reg |= 0x4; } E1000_WRITE_REG(hw, E1000_DMCTLX, reg); /* free space in tx packet buffer to wake from DMA coal */ E1000_WRITE_REG(hw, E1000_DMCTXTH, (IGB_TXPBSIZE - (2 * adapter->max_frame_size)) >> 6); /* make low power state decision controlled by DMA coal */ reg = E1000_READ_REG(hw, E1000_PCIEMISC); reg &= ~E1000_PCIEMISC_LX_DECISION; E1000_WRITE_REG(hw, E1000_PCIEMISC, reg); } else if (hw->mac.type == e1000_82580) { u32 reg = E1000_READ_REG(hw, E1000_PCIEMISC); E1000_WRITE_REG(hw, E1000_PCIEMISC, reg & ~E1000_PCIEMISC_LX_DECISION); E1000_WRITE_REG(hw, E1000_DMACR, 0); } } /********************************************************************* * * Set up an fresh starting state * **********************************************************************/ static void igb_reset(struct adapter *adapter) { device_t dev = adapter->dev; struct e1000_hw *hw = &adapter->hw; struct e1000_fc_info *fc = &hw->fc; struct ifnet *ifp = adapter->ifp; u32 pba = 0; u16 hwm; INIT_DEBUGOUT("igb_reset: begin"); /* Let the firmware know the OS is in control */ igb_get_hw_control(adapter); /* * Packet Buffer Allocation (PBA) * Writing PBA sets the receive portion of the buffer * the remainder is used for the transmit buffer. */ switch (hw->mac.type) { case e1000_82575: pba = E1000_PBA_32K; break; case e1000_82576: case e1000_vfadapt: pba = E1000_READ_REG(hw, E1000_RXPBS); pba &= E1000_RXPBS_SIZE_MASK_82576; break; case e1000_82580: case e1000_i350: case e1000_i354: case e1000_vfadapt_i350: pba = E1000_READ_REG(hw, E1000_RXPBS); pba = e1000_rxpbs_adjust_82580(pba); break; case e1000_i210: case e1000_i211: pba = E1000_PBA_34K; default: break; } /* Special needs in case of Jumbo frames */ if ((hw->mac.type == e1000_82575) && (ifp->if_mtu > ETHERMTU)) { u32 tx_space, min_tx, min_rx; pba = E1000_READ_REG(hw, E1000_PBA); tx_space = pba >> 16; pba &= 0xffff; min_tx = (adapter->max_frame_size + sizeof(struct e1000_tx_desc) - ETHERNET_FCS_SIZE) * 2; min_tx = roundup2(min_tx, 1024); min_tx >>= 10; min_rx = adapter->max_frame_size; min_rx = roundup2(min_rx, 1024); min_rx >>= 10; if (tx_space < min_tx && ((min_tx - tx_space) < pba)) { pba = pba - (min_tx - tx_space); /* * if short on rx space, rx wins * and must trump tx adjustment */ if (pba < min_rx) pba = min_rx; } E1000_WRITE_REG(hw, E1000_PBA, pba); } INIT_DEBUGOUT1("igb_init: pba=%dK",pba); /* * These parameters control the automatic generation (Tx) and * response (Rx) to Ethernet PAUSE frames. * - High water mark should allow for at least two frames to be * received after sending an XOFF. * - Low water mark works best when it is very near the high water mark. * This allows the receiver to restart by sending XON when it has * drained a bit. */ hwm = min(((pba << 10) * 9 / 10), ((pba << 10) - 2 * adapter->max_frame_size)); if (hw->mac.type < e1000_82576) { fc->high_water = hwm & 0xFFF8; /* 8-byte granularity */ fc->low_water = fc->high_water - 8; } else { fc->high_water = hwm & 0xFFF0; /* 16-byte granularity */ fc->low_water = fc->high_water - 16; } fc->pause_time = IGB_FC_PAUSE_TIME; fc->send_xon = TRUE; if (adapter->fc) fc->requested_mode = adapter->fc; else fc->requested_mode = e1000_fc_default; /* Issue a global reset */ e1000_reset_hw(hw); E1000_WRITE_REG(hw, E1000_WUC, 0); /* Reset for AutoMediaDetect */ if (adapter->flags & IGB_MEDIA_RESET) { e1000_setup_init_funcs(hw, TRUE); e1000_get_bus_info(hw); adapter->flags &= ~IGB_MEDIA_RESET; } if (e1000_init_hw(hw) < 0) device_printf(dev, "Hardware Initialization Failed\n"); /* Setup DMA Coalescing */ igb_init_dmac(adapter, pba); E1000_WRITE_REG(&adapter->hw, E1000_VET, ETHERTYPE_VLAN); e1000_get_phy_info(hw); e1000_check_for_link(hw); return; } /********************************************************************* * * Setup networking device structure and register an interface. * **********************************************************************/ static int igb_setup_interface(device_t dev, struct adapter *adapter) { struct ifnet *ifp; INIT_DEBUGOUT("igb_setup_interface: begin"); ifp = adapter->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not allocate ifnet structure\n"); return (-1); } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_init = igb_init; ifp->if_softc = adapter; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = igb_ioctl; ifp->if_get_counter = igb_get_counter; #ifndef IGB_LEGACY_TX ifp->if_transmit = igb_mq_start; ifp->if_qflush = igb_qflush; #else ifp->if_start = igb_start; IFQ_SET_MAXLEN(&ifp->if_snd, adapter->num_tx_desc - 1); ifp->if_snd.ifq_drv_maxlen = adapter->num_tx_desc - 1; IFQ_SET_READY(&ifp->if_snd); #endif ether_ifattach(ifp, adapter->hw.mac.addr); ifp->if_capabilities = ifp->if_capenable = 0; ifp->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM; ifp->if_capabilities |= IFCAP_TSO; ifp->if_capabilities |= IFCAP_JUMBO_MTU; ifp->if_capenable = ifp->if_capabilities; /* Don't enable LRO by default */ ifp->if_capabilities |= IFCAP_LRO; #ifdef DEVICE_POLLING ifp->if_capabilities |= IFCAP_POLLING; #endif /* * Tell the upper layer(s) we * support full VLAN capability. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO | IFCAP_VLAN_MTU; ifp->if_capenable |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO | IFCAP_VLAN_MTU; /* ** Don't turn this on by default, if vlans are ** created on another pseudo device (eg. lagg) ** then vlan events are not passed thru, breaking ** operation, but with HW FILTER off it works. If ** using vlans directly on the igb driver you can ** enable this and get full hardware tag filtering. */ ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; /* * Specify the media types supported by this adapter and register * callbacks to update media and link information */ ifmedia_init(&adapter->media, IFM_IMASK, igb_media_change, igb_media_status); if ((adapter->hw.phy.media_type == e1000_media_type_fiber) || (adapter->hw.phy.media_type == e1000_media_type_internal_serdes)) { ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_SX | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_SX, 0, NULL); } else { ifmedia_add(&adapter->media, IFM_ETHER | IFM_10_T, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_10_T | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_100_TX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_100_TX | IFM_FDX, 0, NULL); if (adapter->hw.phy.type != e1000_phy_ife) { ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T, 0, NULL); } } ifmedia_add(&adapter->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&adapter->media, IFM_ETHER | IFM_AUTO); return (0); } /* * Manage DMA'able memory. */ static void igb_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { if (error) return; *(bus_addr_t *) arg = segs[0].ds_addr; } static int igb_dma_malloc(struct adapter *adapter, bus_size_t size, struct igb_dma_alloc *dma, int mapflags) { int error; error = bus_dma_tag_create(bus_get_dma_tag(adapter->dev), /* parent */ IGB_DBA_ALIGN, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &dma->dma_tag); if (error) { device_printf(adapter->dev, "%s: bus_dma_tag_create failed: %d\n", __func__, error); goto fail_0; } error = bus_dmamem_alloc(dma->dma_tag, (void**) &dma->dma_vaddr, BUS_DMA_NOWAIT | BUS_DMA_COHERENT, &dma->dma_map); if (error) { device_printf(adapter->dev, "%s: bus_dmamem_alloc(%ju) failed: %d\n", __func__, (uintmax_t)size, error); goto fail_2; } dma->dma_paddr = 0; error = bus_dmamap_load(dma->dma_tag, dma->dma_map, dma->dma_vaddr, size, igb_dmamap_cb, &dma->dma_paddr, mapflags | BUS_DMA_NOWAIT); if (error || dma->dma_paddr == 0) { device_printf(adapter->dev, "%s: bus_dmamap_load failed: %d\n", __func__, error); goto fail_3; } return (0); fail_3: bus_dmamap_unload(dma->dma_tag, dma->dma_map); fail_2: bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); bus_dma_tag_destroy(dma->dma_tag); fail_0: dma->dma_tag = NULL; return (error); } static void igb_dma_free(struct adapter *adapter, struct igb_dma_alloc *dma) { if (dma->dma_tag == NULL) return; if (dma->dma_paddr != 0) { bus_dmamap_sync(dma->dma_tag, dma->dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->dma_tag, dma->dma_map); dma->dma_paddr = 0; } if (dma->dma_vaddr != NULL) { bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); dma->dma_vaddr = NULL; } bus_dma_tag_destroy(dma->dma_tag); dma->dma_tag = NULL; } /********************************************************************* * * Allocate memory for the transmit and receive rings, and then * the descriptors associated with each, called only once at attach. * **********************************************************************/ static int igb_allocate_queues(struct adapter *adapter) { device_t dev = adapter->dev; struct igb_queue *que = NULL; struct tx_ring *txr = NULL; struct rx_ring *rxr = NULL; int rsize, tsize, error = E1000_SUCCESS; int txconf = 0, rxconf = 0; /* First allocate the top level queue structs */ if (!(adapter->queues = (struct igb_queue *) malloc(sizeof(struct igb_queue) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate queue memory\n"); error = ENOMEM; goto fail; } /* Next allocate the TX ring struct memory */ if (!(adapter->tx_rings = (struct tx_ring *) malloc(sizeof(struct tx_ring) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate TX ring memory\n"); error = ENOMEM; goto tx_fail; } /* Now allocate the RX */ if (!(adapter->rx_rings = (struct rx_ring *) malloc(sizeof(struct rx_ring) * adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate RX ring memory\n"); error = ENOMEM; goto rx_fail; } tsize = roundup2(adapter->num_tx_desc * sizeof(union e1000_adv_tx_desc), IGB_DBA_ALIGN); /* * Now set up the TX queues, txconf is needed to handle the * possibility that things fail midcourse and we need to * undo memory gracefully */ for (int i = 0; i < adapter->num_queues; i++, txconf++) { /* Set up some basics */ txr = &adapter->tx_rings[i]; txr->adapter = adapter; txr->me = i; txr->num_desc = adapter->num_tx_desc; /* Initialize the TX lock */ snprintf(txr->mtx_name, sizeof(txr->mtx_name), "%s:tx(%d)", device_get_nameunit(dev), txr->me); mtx_init(&txr->tx_mtx, txr->mtx_name, NULL, MTX_DEF); if (igb_dma_malloc(adapter, tsize, &txr->txdma, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate TX Descriptor memory\n"); error = ENOMEM; goto err_tx_desc; } txr->tx_base = (union e1000_adv_tx_desc *)txr->txdma.dma_vaddr; bzero((void *)txr->tx_base, tsize); /* Now allocate transmit buffers for the ring */ if (igb_allocate_transmit_buffers(txr)) { device_printf(dev, "Critical Failure setting up transmit buffers\n"); error = ENOMEM; goto err_tx_desc; } #ifndef IGB_LEGACY_TX /* Allocate a buf ring */ txr->br = buf_ring_alloc(igb_buf_ring_size, M_DEVBUF, M_WAITOK, &txr->tx_mtx); #endif } /* * Next the RX queues... */ rsize = roundup2(adapter->num_rx_desc * sizeof(union e1000_adv_rx_desc), IGB_DBA_ALIGN); for (int i = 0; i < adapter->num_queues; i++, rxconf++) { rxr = &adapter->rx_rings[i]; rxr->adapter = adapter; rxr->me = i; /* Initialize the RX lock */ snprintf(rxr->mtx_name, sizeof(rxr->mtx_name), "%s:rx(%d)", device_get_nameunit(dev), txr->me); mtx_init(&rxr->rx_mtx, rxr->mtx_name, NULL, MTX_DEF); if (igb_dma_malloc(adapter, rsize, &rxr->rxdma, BUS_DMA_NOWAIT)) { device_printf(dev, "Unable to allocate RxDescriptor memory\n"); error = ENOMEM; goto err_rx_desc; } rxr->rx_base = (union e1000_adv_rx_desc *)rxr->rxdma.dma_vaddr; bzero((void *)rxr->rx_base, rsize); /* Allocate receive buffers for the ring*/ if (igb_allocate_receive_buffers(rxr)) { device_printf(dev, "Critical Failure setting up receive buffers\n"); error = ENOMEM; goto err_rx_desc; } } /* ** Finally set up the queue holding structs */ for (int i = 0; i < adapter->num_queues; i++) { que = &adapter->queues[i]; que->adapter = adapter; que->txr = &adapter->tx_rings[i]; que->rxr = &adapter->rx_rings[i]; } return (0); err_rx_desc: for (rxr = adapter->rx_rings; rxconf > 0; rxr++, rxconf--) igb_dma_free(adapter, &rxr->rxdma); err_tx_desc: for (txr = adapter->tx_rings; txconf > 0; txr++, txconf--) igb_dma_free(adapter, &txr->txdma); free(adapter->rx_rings, M_DEVBUF); rx_fail: #ifndef IGB_LEGACY_TX buf_ring_free(txr->br, M_DEVBUF); #endif free(adapter->tx_rings, M_DEVBUF); tx_fail: free(adapter->queues, M_DEVBUF); fail: return (error); } /********************************************************************* * * Allocate memory for tx_buffer structures. The tx_buffer stores all * the information needed to transmit a packet on the wire. This is * called only once at attach, setup is done every reset. * **********************************************************************/ static int igb_allocate_transmit_buffers(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; device_t dev = adapter->dev; struct igb_tx_buf *txbuf; int error, i; /* * Setup DMA descriptor areas. */ if ((error = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ IGB_TSO_SIZE, /* maxsize */ IGB_MAX_SCATTER, /* nsegments */ PAGE_SIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->txtag))) { device_printf(dev,"Unable to allocate TX DMA tag\n"); goto fail; } if (!(txr->tx_buffers = (struct igb_tx_buf *) malloc(sizeof(struct igb_tx_buf) * adapter->num_tx_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); error = ENOMEM; goto fail; } /* Create the descriptor buffer dma maps */ txbuf = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, txbuf++) { error = bus_dmamap_create(txr->txtag, 0, &txbuf->map); if (error != 0) { device_printf(dev, "Unable to create TX DMA map\n"); goto fail; } } return 0; fail: /* We free all, it handles case where we are in the middle */ igb_free_transmit_structures(adapter); return (error); } /********************************************************************* * * Initialize a transmit ring. * **********************************************************************/ static void igb_setup_transmit_ring(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct igb_tx_buf *txbuf; int i; #ifdef DEV_NETMAP struct netmap_adapter *na = NA(adapter->ifp); struct netmap_slot *slot; #endif /* DEV_NETMAP */ /* Clear the old descriptor contents */ IGB_TX_LOCK(txr); #ifdef DEV_NETMAP slot = netmap_reset(na, NR_TX, txr->me, 0); #endif /* DEV_NETMAP */ bzero((void *)txr->tx_base, (sizeof(union e1000_adv_tx_desc)) * adapter->num_tx_desc); /* Reset indices */ txr->next_avail_desc = 0; txr->next_to_clean = 0; /* Free any existing tx buffers. */ txbuf = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, txbuf++) { if (txbuf->m_head != NULL) { bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, txbuf->map); m_freem(txbuf->m_head); txbuf->m_head = NULL; } #ifdef DEV_NETMAP if (slot) { int si = netmap_idx_n2k(&na->tx_rings[txr->me], i); /* no need to set the address */ netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si)); } #endif /* DEV_NETMAP */ /* clear the watch index */ txbuf->eop = NULL; } /* Set number of descriptors available */ txr->tx_avail = adapter->num_tx_desc; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); IGB_TX_UNLOCK(txr); } /********************************************************************* * * Initialize all transmit rings. * **********************************************************************/ static void igb_setup_transmit_structures(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) igb_setup_transmit_ring(txr); return; } /********************************************************************* * * Enable transmit unit. * **********************************************************************/ static void igb_initialize_transmit_units(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; struct e1000_hw *hw = &adapter->hw; u32 tctl, txdctl; INIT_DEBUGOUT("igb_initialize_transmit_units: begin"); tctl = txdctl = 0; /* Setup the Tx Descriptor Rings */ for (int i = 0; i < adapter->num_queues; i++, txr++) { u64 bus_addr = txr->txdma.dma_paddr; E1000_WRITE_REG(hw, E1000_TDLEN(i), adapter->num_tx_desc * sizeof(struct e1000_tx_desc)); E1000_WRITE_REG(hw, E1000_TDBAH(i), (uint32_t)(bus_addr >> 32)); E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr); /* Setup the HW Tx Head and Tail descriptor pointers */ E1000_WRITE_REG(hw, E1000_TDT(i), 0); E1000_WRITE_REG(hw, E1000_TDH(i), 0); HW_DEBUGOUT2("Base = %x, Length = %x\n", E1000_READ_REG(hw, E1000_TDBAL(i)), E1000_READ_REG(hw, E1000_TDLEN(i))); txr->queue_status = IGB_QUEUE_IDLE; txdctl |= IGB_TX_PTHRESH; txdctl |= IGB_TX_HTHRESH << 8; txdctl |= IGB_TX_WTHRESH << 16; txdctl |= E1000_TXDCTL_QUEUE_ENABLE; E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl); } if (adapter->vf_ifp) return; e1000_config_collision_dist(hw); /* Program the Transmit Control Register */ tctl = E1000_READ_REG(hw, E1000_TCTL); tctl &= ~E1000_TCTL_CT; tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN | (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT)); /* This write will effectively turn on the transmit unit. */ E1000_WRITE_REG(hw, E1000_TCTL, tctl); } /********************************************************************* * * Free all transmit rings. * **********************************************************************/ static void igb_free_transmit_structures(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); igb_free_transmit_buffers(txr); igb_dma_free(adapter, &txr->txdma); IGB_TX_UNLOCK(txr); IGB_TX_LOCK_DESTROY(txr); } free(adapter->tx_rings, M_DEVBUF); } /********************************************************************* * * Free transmit ring related data structures. * **********************************************************************/ static void igb_free_transmit_buffers(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; struct igb_tx_buf *tx_buffer; int i; INIT_DEBUGOUT("free_transmit_ring: begin"); if (txr->tx_buffers == NULL) return; tx_buffer = txr->tx_buffers; for (i = 0; i < adapter->num_tx_desc; i++, tx_buffer++) { if (tx_buffer->m_head != NULL) { bus_dmamap_sync(txr->txtag, tx_buffer->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, tx_buffer->map); m_freem(tx_buffer->m_head); tx_buffer->m_head = NULL; if (tx_buffer->map != NULL) { bus_dmamap_destroy(txr->txtag, tx_buffer->map); tx_buffer->map = NULL; } } else if (tx_buffer->map != NULL) { bus_dmamap_unload(txr->txtag, tx_buffer->map); bus_dmamap_destroy(txr->txtag, tx_buffer->map); tx_buffer->map = NULL; } } #ifndef IGB_LEGACY_TX if (txr->br != NULL) buf_ring_free(txr->br, M_DEVBUF); #endif if (txr->tx_buffers != NULL) { free(txr->tx_buffers, M_DEVBUF); txr->tx_buffers = NULL; } if (txr->txtag != NULL) { bus_dma_tag_destroy(txr->txtag); txr->txtag = NULL; } return; } /********************************************************************** * * Setup work for hardware segmentation offload (TSO) on * adapters using advanced tx descriptors * **********************************************************************/ static int igb_tso_setup(struct tx_ring *txr, struct mbuf *mp, u32 *cmd_type_len, u32 *olinfo_status) { struct adapter *adapter = txr->adapter; struct e1000_adv_tx_context_desc *TXD; u32 vlan_macip_lens = 0, type_tucmd_mlhl = 0; u32 mss_l4len_idx = 0, paylen; u16 vtag = 0, eh_type; int ctxd, ehdrlen, ip_hlen, tcp_hlen; struct ether_vlan_header *eh; #ifdef INET6 struct ip6_hdr *ip6; #endif #ifdef INET struct ip *ip; #endif struct tcphdr *th; /* * Determine where frame payload starts. * Jump over vlan headers if already present */ eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; eh_type = eh->evl_proto; } else { ehdrlen = ETHER_HDR_LEN; eh_type = eh->evl_encap_proto; } switch (ntohs(eh_type)) { #ifdef INET6 case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); /* XXX-BZ For now we do not pretend to support ext. hdrs. */ if (ip6->ip6_nxt != IPPROTO_TCP) return (ENXIO); ip_hlen = sizeof(struct ip6_hdr); ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen); th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV6; break; #endif #ifdef INET case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + ehdrlen); if (ip->ip_p != IPPROTO_TCP) return (ENXIO); ip->ip_sum = 0; ip_hlen = ip->ip_hl << 2; th = (struct tcphdr *)((caddr_t)ip + ip_hlen); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4; /* Tell transmit desc to also do IPv4 checksum. */ *olinfo_status |= E1000_TXD_POPTS_IXSM << 8; break; #endif default: panic("%s: CSUM_TSO but no supported IP version (0x%04x)", __func__, ntohs(eh_type)); break; } ctxd = txr->next_avail_desc; TXD = (struct e1000_adv_tx_context_desc *) &txr->tx_base[ctxd]; tcp_hlen = th->th_off << 2; /* This is used in the transmit desc in encap */ paylen = mp->m_pkthdr.len - ehdrlen - ip_hlen - tcp_hlen; /* VLAN MACLEN IPLEN */ if (mp->m_flags & M_VLANTAG) { vtag = htole16(mp->m_pkthdr.ether_vtag); vlan_macip_lens |= (vtag << E1000_ADVTXD_VLAN_SHIFT); } vlan_macip_lens |= ehdrlen << E1000_ADVTXD_MACLEN_SHIFT; vlan_macip_lens |= ip_hlen; TXD->vlan_macip_lens = htole32(vlan_macip_lens); /* ADV DTYPE TUCMD */ type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT; type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP; TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl); /* MSS L4LEN IDX */ mss_l4len_idx |= (mp->m_pkthdr.tso_segsz << E1000_ADVTXD_MSS_SHIFT); mss_l4len_idx |= (tcp_hlen << E1000_ADVTXD_L4LEN_SHIFT); /* 82575 needs the queue index added */ if (adapter->hw.mac.type == e1000_82575) mss_l4len_idx |= txr->me << 4; TXD->mss_l4len_idx = htole32(mss_l4len_idx); TXD->seqnum_seed = htole32(0); if (++ctxd == txr->num_desc) ctxd = 0; txr->tx_avail--; txr->next_avail_desc = ctxd; *cmd_type_len |= E1000_ADVTXD_DCMD_TSE; *olinfo_status |= E1000_TXD_POPTS_TXSM << 8; *olinfo_status |= paylen << E1000_ADVTXD_PAYLEN_SHIFT; ++txr->tso_tx; return (0); } /********************************************************************* * * Advanced Context Descriptor setup for VLAN, CSUM or TSO * **********************************************************************/ static int igb_tx_ctx_setup(struct tx_ring *txr, struct mbuf *mp, u32 *cmd_type_len, u32 *olinfo_status) { struct e1000_adv_tx_context_desc *TXD; struct adapter *adapter = txr->adapter; struct ether_vlan_header *eh; struct ip *ip; struct ip6_hdr *ip6; u32 vlan_macip_lens = 0, type_tucmd_mlhl = 0, mss_l4len_idx = 0; int ehdrlen, ip_hlen = 0; u16 etype; u8 ipproto = 0; int offload = TRUE; int ctxd = txr->next_avail_desc; u16 vtag = 0; /* First check if TSO is to be used */ if (mp->m_pkthdr.csum_flags & CSUM_TSO) return (igb_tso_setup(txr, mp, cmd_type_len, olinfo_status)); if ((mp->m_pkthdr.csum_flags & CSUM_OFFLOAD) == 0) offload = FALSE; /* Indicate the whole packet as payload when not doing TSO */ *olinfo_status |= mp->m_pkthdr.len << E1000_ADVTXD_PAYLEN_SHIFT; /* Now ready a context descriptor */ TXD = (struct e1000_adv_tx_context_desc *) &txr->tx_base[ctxd]; /* ** In advanced descriptors the vlan tag must ** be placed into the context descriptor. Hence ** we need to make one even if not doing offloads. */ if (mp->m_flags & M_VLANTAG) { vtag = htole16(mp->m_pkthdr.ether_vtag); vlan_macip_lens |= (vtag << E1000_ADVTXD_VLAN_SHIFT); } else if (offload == FALSE) /* ... no offload to do */ return (0); /* * Determine where frame payload starts. * Jump over vlan headers if already present, * helpful for QinQ too. */ eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { etype = ntohs(eh->evl_proto); ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { etype = ntohs(eh->evl_encap_proto); ehdrlen = ETHER_HDR_LEN; } /* Set the ether header length */ vlan_macip_lens |= ehdrlen << E1000_ADVTXD_MACLEN_SHIFT; switch (etype) { case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + ehdrlen); ip_hlen = ip->ip_hl << 2; ipproto = ip->ip_p; type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4; break; case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); ip_hlen = sizeof(struct ip6_hdr); /* XXX-BZ this will go badly in case of ext hdrs. */ ipproto = ip6->ip6_nxt; type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV6; break; default: offload = FALSE; break; } vlan_macip_lens |= ip_hlen; type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT; switch (ipproto) { case IPPROTO_TCP: if (mp->m_pkthdr.csum_flags & CSUM_TCP) type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP; break; case IPPROTO_UDP: if (mp->m_pkthdr.csum_flags & CSUM_UDP) type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP; break; #if __FreeBSD_version >= 800000 case IPPROTO_SCTP: if (mp->m_pkthdr.csum_flags & CSUM_SCTP) type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP; break; #endif default: offload = FALSE; break; } if (offload) /* For the TX descriptor setup */ *olinfo_status |= E1000_TXD_POPTS_TXSM << 8; /* 82575 needs the queue index added */ if (adapter->hw.mac.type == e1000_82575) mss_l4len_idx = txr->me << 4; /* Now copy bits into descriptor */ TXD->vlan_macip_lens = htole32(vlan_macip_lens); TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl); TXD->seqnum_seed = htole32(0); TXD->mss_l4len_idx = htole32(mss_l4len_idx); /* We've consumed the first desc, adjust counters */ if (++ctxd == txr->num_desc) ctxd = 0; txr->next_avail_desc = ctxd; --txr->tx_avail; return (0); } /********************************************************************** * * Examine each tx_buffer in the used queue. If the hardware is done * processing the packet then free associated resources. The * tx_buffer is put back on the free queue. * * TRUE return means there's work in the ring to clean, FALSE its empty. **********************************************************************/ static bool igb_txeof(struct tx_ring *txr) { struct adapter *adapter = txr->adapter; #ifdef DEV_NETMAP struct ifnet *ifp = adapter->ifp; #endif /* DEV_NETMAP */ u32 work, processed = 0; int limit = adapter->tx_process_limit; struct igb_tx_buf *buf; union e1000_adv_tx_desc *txd; mtx_assert(&txr->tx_mtx, MA_OWNED); #ifdef DEV_NETMAP if (netmap_tx_irq(ifp, txr->me)) return (FALSE); #endif /* DEV_NETMAP */ if (txr->tx_avail == txr->num_desc) { txr->queue_status = IGB_QUEUE_IDLE; return FALSE; } /* Get work starting point */ work = txr->next_to_clean; buf = &txr->tx_buffers[work]; txd = &txr->tx_base[work]; work -= txr->num_desc; /* The distance to ring end */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); do { union e1000_adv_tx_desc *eop = buf->eop; if (eop == NULL) /* No work */ break; if ((eop->wb.status & E1000_TXD_STAT_DD) == 0) break; /* I/O not complete */ if (buf->m_head) { txr->bytes += buf->m_head->m_pkthdr.len; bus_dmamap_sync(txr->txtag, buf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, buf->map); m_freem(buf->m_head); buf->m_head = NULL; } buf->eop = NULL; ++txr->tx_avail; /* We clean the range if multi segment */ while (txd != eop) { ++txd; ++buf; ++work; /* wrap the ring? */ if (__predict_false(!work)) { work -= txr->num_desc; buf = txr->tx_buffers; txd = txr->tx_base; } if (buf->m_head) { txr->bytes += buf->m_head->m_pkthdr.len; bus_dmamap_sync(txr->txtag, buf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txtag, buf->map); m_freem(buf->m_head); buf->m_head = NULL; } ++txr->tx_avail; buf->eop = NULL; } ++txr->packets; ++processed; txr->watchdog_time = ticks; /* Try the next packet */ ++txd; ++buf; ++work; /* reset with a wrap */ if (__predict_false(!work)) { work -= txr->num_desc; buf = txr->tx_buffers; txd = txr->tx_base; } prefetch(txd); } while (__predict_true(--limit)); bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); work += txr->num_desc; txr->next_to_clean = work; /* ** Watchdog calculation, we know there's ** work outstanding or the first return ** would have been taken, so none processed ** for too long indicates a hang. */ if ((!processed) && ((ticks - txr->watchdog_time) > IGB_WATCHDOG)) txr->queue_status |= IGB_QUEUE_HUNG; if (txr->tx_avail >= IGB_QUEUE_THRESHOLD) txr->queue_status &= ~IGB_QUEUE_DEPLETED; if (txr->tx_avail == txr->num_desc) { txr->queue_status = IGB_QUEUE_IDLE; return (FALSE); } return (TRUE); } /********************************************************************* * * Refresh mbuf buffers for RX descriptor rings * - now keeps its own state so discards due to resource * exhaustion are unnecessary, if an mbuf cannot be obtained * it just returns, keeping its placeholder, thus it can simply * be recalled to try again. * **********************************************************************/ static void igb_refresh_mbufs(struct rx_ring *rxr, int limit) { struct adapter *adapter = rxr->adapter; bus_dma_segment_t hseg[1]; bus_dma_segment_t pseg[1]; struct igb_rx_buf *rxbuf; struct mbuf *mh, *mp; int i, j, nsegs, error; bool refreshed = FALSE; i = j = rxr->next_to_refresh; /* ** Get one descriptor beyond ** our work mark to control ** the loop. */ if (++j == adapter->num_rx_desc) j = 0; while (j != limit) { rxbuf = &rxr->rx_buffers[i]; /* No hdr mbuf used with header split off */ if (rxr->hdr_split == FALSE) goto no_split; if (rxbuf->m_head == NULL) { mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) goto update; } else mh = rxbuf->m_head; mh->m_pkthdr.len = mh->m_len = MHLEN; mh->m_len = MHLEN; mh->m_flags |= M_PKTHDR; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->htag, rxbuf->hmap, mh, hseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("Refresh mbufs: hdr dmamap load" " failure - %d\n", error); m_free(mh); rxbuf->m_head = NULL; goto update; } rxbuf->m_head = mh; bus_dmamap_sync(rxr->htag, rxbuf->hmap, BUS_DMASYNC_PREREAD); rxr->rx_base[i].read.hdr_addr = htole64(hseg[0].ds_addr); no_split: if (rxbuf->m_pack == NULL) { mp = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, adapter->rx_mbuf_sz); if (mp == NULL) goto update; } else mp = rxbuf->m_pack; mp->m_pkthdr.len = mp->m_len = adapter->rx_mbuf_sz; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->ptag, rxbuf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("Refresh mbufs: payload dmamap load" " failure - %d\n", error); m_free(mp); rxbuf->m_pack = NULL; goto update; } rxbuf->m_pack = mp; bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_PREREAD); rxr->rx_base[i].read.pkt_addr = htole64(pseg[0].ds_addr); refreshed = TRUE; /* I feel wefreshed :) */ i = j; /* our next is precalculated */ rxr->next_to_refresh = i; if (++j == adapter->num_rx_desc) j = 0; } update: if (refreshed) /* update tail */ E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), rxr->next_to_refresh); return; } /********************************************************************* * * Allocate memory for rx_buffer structures. Since we use one * rx_buffer per received packet, the maximum number of rx_buffer's * that we'll need is equal to the number of receive descriptors * that we've allocated. * **********************************************************************/ static int igb_allocate_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; device_t dev = adapter->dev; struct igb_rx_buf *rxbuf; int i, bsize, error; bsize = sizeof(struct igb_rx_buf) * adapter->num_rx_desc; if (!(rxr->rx_buffers = (struct igb_rx_buf *) malloc(bsize, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate rx_buffer memory\n"); error = ENOMEM; goto fail; } if ((error = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MSIZE, /* maxsize */ 1, /* nsegments */ MSIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &rxr->htag))) { device_printf(dev, "Unable to create RX DMA tag\n"); goto fail; } if ((error = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MJUM9BYTES, /* maxsize */ 1, /* nsegments */ MJUM9BYTES, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &rxr->ptag))) { device_printf(dev, "Unable to create RX payload DMA tag\n"); goto fail; } for (i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; error = bus_dmamap_create(rxr->htag, 0, &rxbuf->hmap); if (error) { device_printf(dev, "Unable to create RX head DMA maps\n"); goto fail; } error = bus_dmamap_create(rxr->ptag, 0, &rxbuf->pmap); if (error) { device_printf(dev, "Unable to create RX packet DMA maps\n"); goto fail; } } return (0); fail: /* Frees all, but can handle partial completion */ igb_free_receive_structures(adapter); return (error); } static void igb_free_receive_ring(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; struct igb_rx_buf *rxbuf; for (int i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->m_head != NULL) { bus_dmamap_sync(rxr->htag, rxbuf->hmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->htag, rxbuf->hmap); rxbuf->m_head->m_flags |= M_PKTHDR; m_freem(rxbuf->m_head); } if (rxbuf->m_pack != NULL) { bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->ptag, rxbuf->pmap); rxbuf->m_pack->m_flags |= M_PKTHDR; m_freem(rxbuf->m_pack); } rxbuf->m_head = NULL; rxbuf->m_pack = NULL; } } /********************************************************************* * * Initialize a receive ring and its buffers. * **********************************************************************/ static int igb_setup_receive_ring(struct rx_ring *rxr) { struct adapter *adapter; struct ifnet *ifp; device_t dev; struct igb_rx_buf *rxbuf; bus_dma_segment_t pseg[1], hseg[1]; struct lro_ctrl *lro = &rxr->lro; int rsize, nsegs, error = 0; #ifdef DEV_NETMAP struct netmap_adapter *na = NA(rxr->adapter->ifp); struct netmap_slot *slot; #endif /* DEV_NETMAP */ adapter = rxr->adapter; dev = adapter->dev; ifp = adapter->ifp; /* Clear the ring contents */ IGB_RX_LOCK(rxr); #ifdef DEV_NETMAP slot = netmap_reset(na, NR_RX, rxr->me, 0); #endif /* DEV_NETMAP */ rsize = roundup2(adapter->num_rx_desc * sizeof(union e1000_adv_rx_desc), IGB_DBA_ALIGN); bzero((void *)rxr->rx_base, rsize); /* ** Free current RX buffer structures and their mbufs */ igb_free_receive_ring(rxr); /* Configure for header split? */ if (igb_header_split) rxr->hdr_split = TRUE; /* Now replenish the ring mbufs */ for (int j = 0; j < adapter->num_rx_desc; ++j) { struct mbuf *mh, *mp; rxbuf = &rxr->rx_buffers[j]; #ifdef DEV_NETMAP if (slot) { /* slot sj is mapped to the j-th NIC-ring entry */ int sj = netmap_idx_n2k(&na->rx_rings[rxr->me], j); uint64_t paddr; void *addr; addr = PNMB(na, slot + sj, &paddr); netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr); /* Update descriptor */ rxr->rx_base[j].read.pkt_addr = htole64(paddr); continue; } #endif /* DEV_NETMAP */ if (rxr->hdr_split == FALSE) goto skip_head; /* First the header */ rxbuf->m_head = m_gethdr(M_NOWAIT, MT_DATA); if (rxbuf->m_head == NULL) { error = ENOBUFS; goto fail; } m_adj(rxbuf->m_head, ETHER_ALIGN); mh = rxbuf->m_head; mh->m_len = mh->m_pkthdr.len = MHLEN; mh->m_flags |= M_PKTHDR; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->htag, rxbuf->hmap, rxbuf->m_head, hseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) /* Nothing elegant to do here */ goto fail; bus_dmamap_sync(rxr->htag, rxbuf->hmap, BUS_DMASYNC_PREREAD); /* Update descriptor */ rxr->rx_base[j].read.hdr_addr = htole64(hseg[0].ds_addr); skip_head: /* Now the payload cluster */ rxbuf->m_pack = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, adapter->rx_mbuf_sz); if (rxbuf->m_pack == NULL) { error = ENOBUFS; goto fail; } mp = rxbuf->m_pack; mp->m_pkthdr.len = mp->m_len = adapter->rx_mbuf_sz; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->ptag, rxbuf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) goto fail; bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_PREREAD); /* Update descriptor */ rxr->rx_base[j].read.pkt_addr = htole64(pseg[0].ds_addr); } /* Setup our descriptor indices */ rxr->next_to_check = 0; rxr->next_to_refresh = adapter->num_rx_desc - 1; rxr->lro_enabled = FALSE; rxr->rx_split_packets = 0; rxr->rx_bytes = 0; rxr->fmp = NULL; rxr->lmp = NULL; bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* ** Now set up the LRO interface, we ** also only do head split when LRO ** is enabled, since so often they ** are undesireable in similar setups. */ if (ifp->if_capenable & IFCAP_LRO) { error = tcp_lro_init(lro); if (error) { device_printf(dev, "LRO Initialization failed!\n"); goto fail; } INIT_DEBUGOUT("RX LRO Initialized\n"); rxr->lro_enabled = TRUE; lro->ifp = adapter->ifp; } IGB_RX_UNLOCK(rxr); return (0); fail: igb_free_receive_ring(rxr); IGB_RX_UNLOCK(rxr); return (error); } /********************************************************************* * * Initialize all receive rings. * **********************************************************************/ static int igb_setup_receive_structures(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; int i; for (i = 0; i < adapter->num_queues; i++, rxr++) if (igb_setup_receive_ring(rxr)) goto fail; return (0); fail: /* * Free RX buffers allocated so far, we will only handle * the rings that completed, the failing case will have * cleaned up for itself. 'i' is the endpoint. */ for (int j = 0; j < i; ++j) { rxr = &adapter->rx_rings[j]; IGB_RX_LOCK(rxr); igb_free_receive_ring(rxr); IGB_RX_UNLOCK(rxr); } return (ENOBUFS); } /* * Initialise the RSS mapping for NICs that support multiple transmit/ * receive rings. */ static void igb_initialise_rss_mapping(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; int i; int queue_id; u32 reta; u32 rss_key[10], mrqc, shift = 0; /* XXX? */ if (adapter->hw.mac.type == e1000_82575) shift = 6; /* * The redirection table controls which destination * queue each bucket redirects traffic to. * Each DWORD represents four queues, with the LSB * being the first queue in the DWORD. * * This just allocates buckets to queues using round-robin * allocation. * * NOTE: It Just Happens to line up with the default * RSS allocation method. */ /* Warning FM follows */ reta = 0; for (i = 0; i < 128; i++) { #ifdef RSS queue_id = rss_get_indirection_to_bucket(i); /* * If we have more queues than buckets, we'll * end up mapping buckets to a subset of the * queues. * * If we have more buckets than queues, we'll * end up instead assigning multiple buckets * to queues. * * Both are suboptimal, but we need to handle * the case so we don't go out of bounds * indexing arrays and such. */ queue_id = queue_id % adapter->num_queues; #else queue_id = (i % adapter->num_queues); #endif /* Adjust if required */ queue_id = queue_id << shift; /* * The low 8 bits are for hash value (n+0); * The next 8 bits are for hash value (n+1), etc. */ reta = reta >> 8; reta = reta | ( ((uint32_t) queue_id) << 24); if ((i & 3) == 3) { E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta); reta = 0; } } /* Now fill in hash table */ /* * MRQC: Multiple Receive Queues Command * Set queuing to RSS control, number depends on the device. */ mrqc = E1000_MRQC_ENABLE_RSS_8Q; #ifdef RSS /* XXX ew typecasting */ rss_getkey((uint8_t *) &rss_key); #else arc4rand(&rss_key, sizeof(rss_key), 0); #endif for (i = 0; i < 10; i++) E1000_WRITE_REG_ARRAY(hw, E1000_RSSRK(0), i, rss_key[i]); /* * Configure the RSS fields to hash upon. */ mrqc |= (E1000_MRQC_RSS_FIELD_IPV4 | E1000_MRQC_RSS_FIELD_IPV4_TCP); mrqc |= (E1000_MRQC_RSS_FIELD_IPV6 | E1000_MRQC_RSS_FIELD_IPV6_TCP); mrqc |=( E1000_MRQC_RSS_FIELD_IPV4_UDP | E1000_MRQC_RSS_FIELD_IPV6_UDP); mrqc |=( E1000_MRQC_RSS_FIELD_IPV6_UDP_EX | E1000_MRQC_RSS_FIELD_IPV6_TCP_EX); E1000_WRITE_REG(hw, E1000_MRQC, mrqc); } /********************************************************************* * * Enable receive unit. * **********************************************************************/ static void igb_initialize_receive_units(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; struct ifnet *ifp = adapter->ifp; struct e1000_hw *hw = &adapter->hw; u32 rctl, rxcsum, psize, srrctl = 0; INIT_DEBUGOUT("igb_initialize_receive_unit: begin"); /* * Make sure receives are disabled while setting * up the descriptor ring */ rctl = E1000_READ_REG(hw, E1000_RCTL); E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN); /* ** Set up for header split */ if (igb_header_split) { /* Use a standard mbuf for the header */ srrctl |= IGB_HDR_BUF << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT; srrctl |= E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS; } else srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF; /* ** Set up for jumbo frames */ if (ifp->if_mtu > ETHERMTU) { rctl |= E1000_RCTL_LPE; if (adapter->rx_mbuf_sz == MJUMPAGESIZE) { srrctl |= 4096 >> E1000_SRRCTL_BSIZEPKT_SHIFT; rctl |= E1000_RCTL_SZ_4096 | E1000_RCTL_BSEX; } else if (adapter->rx_mbuf_sz > MJUMPAGESIZE) { srrctl |= 8192 >> E1000_SRRCTL_BSIZEPKT_SHIFT; rctl |= E1000_RCTL_SZ_8192 | E1000_RCTL_BSEX; } /* Set maximum packet len */ psize = adapter->max_frame_size; /* are we on a vlan? */ if (adapter->ifp->if_vlantrunk != NULL) psize += VLAN_TAG_SIZE; E1000_WRITE_REG(&adapter->hw, E1000_RLPML, psize); } else { rctl &= ~E1000_RCTL_LPE; srrctl |= 2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT; rctl |= E1000_RCTL_SZ_2048; } /* * If TX flow control is disabled and there's >1 queue defined, * enable DROP. * * This drops frames rather than hanging the RX MAC for all queues. */ if ((adapter->num_queues > 1) && (adapter->fc == e1000_fc_none || adapter->fc == e1000_fc_rx_pause)) { srrctl |= E1000_SRRCTL_DROP_EN; } /* Setup the Base and Length of the Rx Descriptor Rings */ for (int i = 0; i < adapter->num_queues; i++, rxr++) { u64 bus_addr = rxr->rxdma.dma_paddr; u32 rxdctl; E1000_WRITE_REG(hw, E1000_RDLEN(i), adapter->num_rx_desc * sizeof(struct e1000_rx_desc)); E1000_WRITE_REG(hw, E1000_RDBAH(i), (uint32_t)(bus_addr >> 32)); E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr); E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl); /* Enable this Queue */ rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i)); rxdctl |= E1000_RXDCTL_QUEUE_ENABLE; rxdctl &= 0xFFF00000; rxdctl |= IGB_RX_PTHRESH; rxdctl |= IGB_RX_HTHRESH << 8; rxdctl |= IGB_RX_WTHRESH << 16; E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl); } /* ** Setup for RX MultiQueue */ rxcsum = E1000_READ_REG(hw, E1000_RXCSUM); if (adapter->num_queues >1) { /* rss setup */ igb_initialise_rss_mapping(adapter); /* ** NOTE: Receive Full-Packet Checksum Offload ** is mutually exclusive with Multiqueue. However ** this is not the same as TCP/IP checksums which ** still work. */ rxcsum |= E1000_RXCSUM_PCSD; #if __FreeBSD_version >= 800000 /* For SCTP Offload */ if (((hw->mac.type == e1000_82576) || (hw->mac.type == e1000_82580)) && (ifp->if_capenable & IFCAP_RXCSUM)) rxcsum |= E1000_RXCSUM_CRCOFL; #endif } else { /* Non RSS setup */ if (ifp->if_capenable & IFCAP_RXCSUM) { rxcsum |= E1000_RXCSUM_IPPCSE; #if __FreeBSD_version >= 800000 if ((adapter->hw.mac.type == e1000_82576) || (adapter->hw.mac.type == e1000_82580)) rxcsum |= E1000_RXCSUM_CRCOFL; #endif } else rxcsum &= ~E1000_RXCSUM_TUOFL; } E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum); /* Setup the Receive Control Register */ rctl &= ~(3 << E1000_RCTL_MO_SHIFT); rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO | E1000_RCTL_RDMTS_HALF | (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT); /* Strip CRC bytes. */ rctl |= E1000_RCTL_SECRC; /* Make sure VLAN Filters are off */ rctl &= ~E1000_RCTL_VFE; /* Don't store bad packets */ rctl &= ~E1000_RCTL_SBP; /* Enable Receives */ E1000_WRITE_REG(hw, E1000_RCTL, rctl); /* * Setup the HW Rx Head and Tail Descriptor Pointers * - needs to be after enable */ for (int i = 0; i < adapter->num_queues; i++) { rxr = &adapter->rx_rings[i]; E1000_WRITE_REG(hw, E1000_RDH(i), rxr->next_to_check); #ifdef DEV_NETMAP /* * an init() while a netmap client is active must * preserve the rx buffers passed to userspace. * In this driver it means we adjust RDT to * something different from next_to_refresh * (which is not used in netmap mode). */ if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(adapter->ifp); struct netmap_kring *kring = &na->rx_rings[i]; int t = rxr->next_to_refresh - nm_kr_rxspace(kring); if (t >= adapter->num_rx_desc) t -= adapter->num_rx_desc; else if (t < 0) t += adapter->num_rx_desc; E1000_WRITE_REG(hw, E1000_RDT(i), t); } else #endif /* DEV_NETMAP */ E1000_WRITE_REG(hw, E1000_RDT(i), rxr->next_to_refresh); } return; } /********************************************************************* * * Free receive rings. * **********************************************************************/ static void igb_free_receive_structures(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; for (int i = 0; i < adapter->num_queues; i++, rxr++) { struct lro_ctrl *lro = &rxr->lro; igb_free_receive_buffers(rxr); tcp_lro_free(lro); igb_dma_free(adapter, &rxr->rxdma); } free(adapter->rx_rings, M_DEVBUF); } /********************************************************************* * * Free receive ring data structures. * **********************************************************************/ static void igb_free_receive_buffers(struct rx_ring *rxr) { struct adapter *adapter = rxr->adapter; struct igb_rx_buf *rxbuf; int i; INIT_DEBUGOUT("free_receive_structures: begin"); /* Cleanup any existing buffers */ if (rxr->rx_buffers != NULL) { for (i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->m_head != NULL) { bus_dmamap_sync(rxr->htag, rxbuf->hmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->htag, rxbuf->hmap); rxbuf->m_head->m_flags |= M_PKTHDR; m_freem(rxbuf->m_head); } if (rxbuf->m_pack != NULL) { bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->ptag, rxbuf->pmap); rxbuf->m_pack->m_flags |= M_PKTHDR; m_freem(rxbuf->m_pack); } rxbuf->m_head = NULL; rxbuf->m_pack = NULL; if (rxbuf->hmap != NULL) { bus_dmamap_destroy(rxr->htag, rxbuf->hmap); rxbuf->hmap = NULL; } if (rxbuf->pmap != NULL) { bus_dmamap_destroy(rxr->ptag, rxbuf->pmap); rxbuf->pmap = NULL; } } if (rxr->rx_buffers != NULL) { free(rxr->rx_buffers, M_DEVBUF); rxr->rx_buffers = NULL; } } if (rxr->htag != NULL) { bus_dma_tag_destroy(rxr->htag); rxr->htag = NULL; } if (rxr->ptag != NULL) { bus_dma_tag_destroy(rxr->ptag); rxr->ptag = NULL; } } static __inline void igb_rx_discard(struct rx_ring *rxr, int i) { struct igb_rx_buf *rbuf; rbuf = &rxr->rx_buffers[i]; /* Partially received? Free the chain */ if (rxr->fmp != NULL) { rxr->fmp->m_flags |= M_PKTHDR; m_freem(rxr->fmp); rxr->fmp = NULL; rxr->lmp = NULL; } /* ** With advanced descriptors the writeback ** clobbers the buffer addrs, so its easier ** to just free the existing mbufs and take ** the normal refresh path to get new buffers ** and mapping. */ if (rbuf->m_head) { m_free(rbuf->m_head); rbuf->m_head = NULL; bus_dmamap_unload(rxr->htag, rbuf->hmap); } if (rbuf->m_pack) { m_free(rbuf->m_pack); rbuf->m_pack = NULL; bus_dmamap_unload(rxr->ptag, rbuf->pmap); } return; } static __inline void igb_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u32 ptype) { /* * ATM LRO is only for IPv4/TCP packets and TCP checksum of the packet * should be computed by hardware. Also it should not have VLAN tag in * ethernet header. */ if (rxr->lro_enabled && (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 && (ptype & E1000_RXDADV_PKTTYPE_ETQF) == 0 && (ptype & (E1000_RXDADV_PKTTYPE_IPV4 | E1000_RXDADV_PKTTYPE_TCP)) == (E1000_RXDADV_PKTTYPE_IPV4 | E1000_RXDADV_PKTTYPE_TCP) && (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) == (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) { /* * Send to the stack if: ** - LRO not enabled, or ** - no LRO resources, or ** - lro enqueue fails */ if (rxr->lro.lro_cnt != 0) if (tcp_lro_rx(&rxr->lro, m, 0) == 0) return; } IGB_RX_UNLOCK(rxr); (*ifp->if_input)(ifp, m); IGB_RX_LOCK(rxr); } /********************************************************************* * * This routine executes in interrupt context. It replenishes * the mbufs in the descriptor and sends data which has been * dma'ed into host memory to upper layer. * * We loop at most count times if count is > 0, or until done if * count < 0. * * Return TRUE if more to clean, FALSE otherwise *********************************************************************/ static bool igb_rxeof(struct igb_queue *que, int count, int *done) { struct adapter *adapter = que->adapter; struct rx_ring *rxr = que->rxr; struct ifnet *ifp = adapter->ifp; struct lro_ctrl *lro = &rxr->lro; struct lro_entry *queued; int i, processed = 0, rxdone = 0; u32 ptype, staterr = 0; union e1000_adv_rx_desc *cur; IGB_RX_LOCK(rxr); /* Sync the ring. */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); #ifdef DEV_NETMAP if (netmap_rx_irq(ifp, rxr->me, &processed)) { IGB_RX_UNLOCK(rxr); return (FALSE); } #endif /* DEV_NETMAP */ /* Main clean loop */ for (i = rxr->next_to_check; count != 0;) { struct mbuf *sendmp, *mh, *mp; struct igb_rx_buf *rxbuf; u16 hlen, plen, hdr, vtag, pkt_info; bool eop = FALSE; cur = &rxr->rx_base[i]; staterr = le32toh(cur->wb.upper.status_error); if ((staterr & E1000_RXD_STAT_DD) == 0) break; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; count--; sendmp = mh = mp = NULL; cur->wb.upper.status_error = 0; rxbuf = &rxr->rx_buffers[i]; plen = le16toh(cur->wb.upper.length); ptype = le32toh(cur->wb.lower.lo_dword.data) & IGB_PKTTYPE_MASK; if (((adapter->hw.mac.type == e1000_i350) || (adapter->hw.mac.type == e1000_i354)) && (staterr & E1000_RXDEXT_STATERR_LB)) vtag = be16toh(cur->wb.upper.vlan); else vtag = le16toh(cur->wb.upper.vlan); hdr = le16toh(cur->wb.lower.lo_dword.hs_rss.hdr_info); pkt_info = le16toh(cur->wb.lower.lo_dword.hs_rss.pkt_info); eop = ((staterr & E1000_RXD_STAT_EOP) == E1000_RXD_STAT_EOP); /* * Free the frame (all segments) if we're at EOP and * it's an error. * * The datasheet states that EOP + status is only valid for * the final segment in a multi-segment frame. */ if (eop && ((staterr & E1000_RXDEXT_ERR_FRAME_ERR_MASK) != 0)) { adapter->dropped_pkts++; ++rxr->rx_discarded; igb_rx_discard(rxr, i); goto next_desc; } /* ** The way the hardware is configured to ** split, it will ONLY use the header buffer ** when header split is enabled, otherwise we ** get normal behavior, ie, both header and ** payload are DMA'd into the payload buffer. ** ** The fmp test is to catch the case where a ** packet spans multiple descriptors, in that ** case only the first header is valid. */ if (rxr->hdr_split && rxr->fmp == NULL) { bus_dmamap_unload(rxr->htag, rxbuf->hmap); hlen = (hdr & E1000_RXDADV_HDRBUFLEN_MASK) >> E1000_RXDADV_HDRBUFLEN_SHIFT; if (hlen > IGB_HDR_BUF) hlen = IGB_HDR_BUF; mh = rxr->rx_buffers[i].m_head; mh->m_len = hlen; /* clear buf pointer for refresh */ rxbuf->m_head = NULL; /* ** Get the payload length, this ** could be zero if its a small ** packet. */ if (plen > 0) { mp = rxr->rx_buffers[i].m_pack; mp->m_len = plen; mh->m_next = mp; /* clear buf pointer */ rxbuf->m_pack = NULL; rxr->rx_split_packets++; } } else { /* ** Either no header split, or a ** secondary piece of a fragmented ** split packet. */ mh = rxr->rx_buffers[i].m_pack; mh->m_len = plen; /* clear buf info for refresh */ rxbuf->m_pack = NULL; } bus_dmamap_unload(rxr->ptag, rxbuf->pmap); ++processed; /* So we know when to refresh */ /* Initial frame - setup */ if (rxr->fmp == NULL) { mh->m_pkthdr.len = mh->m_len; /* Save the head of the chain */ rxr->fmp = mh; rxr->lmp = mh; if (mp != NULL) { /* Add payload if split */ mh->m_pkthdr.len += mp->m_len; rxr->lmp = mh->m_next; } } else { /* Chain mbuf's together */ rxr->lmp->m_next = mh; rxr->lmp = rxr->lmp->m_next; rxr->fmp->m_pkthdr.len += mh->m_len; } if (eop) { rxr->fmp->m_pkthdr.rcvif = ifp; rxr->rx_packets++; /* capture data for AIM */ rxr->packets++; rxr->bytes += rxr->fmp->m_pkthdr.len; rxr->rx_bytes += rxr->fmp->m_pkthdr.len; if ((ifp->if_capenable & IFCAP_RXCSUM) != 0) igb_rx_checksum(staterr, rxr->fmp, ptype); if ((ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 && (staterr & E1000_RXD_STAT_VP) != 0) { rxr->fmp->m_pkthdr.ether_vtag = vtag; rxr->fmp->m_flags |= M_VLANTAG; } /* * In case of multiqueue, we have RXCSUM.PCSD bit set * and never cleared. This means we have RSS hash * available to be used. */ if (adapter->num_queues > 1) { rxr->fmp->m_pkthdr.flowid = le32toh(cur->wb.lower.hi_dword.rss); switch (pkt_info & E1000_RXDADV_RSSTYPE_MASK) { case E1000_RXDADV_RSSTYPE_IPV4_TCP: M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_TCP_IPV4); break; case E1000_RXDADV_RSSTYPE_IPV4: M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_IPV4); break; case E1000_RXDADV_RSSTYPE_IPV6_TCP: M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_TCP_IPV6); break; case E1000_RXDADV_RSSTYPE_IPV6_EX: M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_IPV6_EX); break; case E1000_RXDADV_RSSTYPE_IPV6: M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_IPV6); break; case E1000_RXDADV_RSSTYPE_IPV6_TCP_EX: M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_TCP_IPV6_EX); break; default: /* XXX fallthrough */ M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_OPAQUE); } } else { #ifndef IGB_LEGACY_TX rxr->fmp->m_pkthdr.flowid = que->msix; M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_OPAQUE); #endif } sendmp = rxr->fmp; /* Make sure to set M_PKTHDR. */ sendmp->m_flags |= M_PKTHDR; rxr->fmp = NULL; rxr->lmp = NULL; } next_desc: bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* Advance our pointers to the next descriptor. */ if (++i == adapter->num_rx_desc) i = 0; /* ** Send to the stack or LRO */ if (sendmp != NULL) { rxr->next_to_check = i; igb_rx_input(rxr, ifp, sendmp, ptype); i = rxr->next_to_check; rxdone++; } /* Every 8 descriptors we go to refresh mbufs */ if (processed == 8) { igb_refresh_mbufs(rxr, i); processed = 0; } } /* Catch any remainders */ if (igb_rx_unrefreshed(rxr)) igb_refresh_mbufs(rxr, i); rxr->next_to_check = i; /* * Flush any outstanding LRO work */ while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } if (done != NULL) *done += rxdone; IGB_RX_UNLOCK(rxr); return ((staterr & E1000_RXD_STAT_DD) ? TRUE : FALSE); } /********************************************************************* * * Verify that the hardware indicated that the checksum is valid. * Inform the stack about the status of checksum so that stack * doesn't spend time verifying the checksum. * *********************************************************************/ static void igb_rx_checksum(u32 staterr, struct mbuf *mp, u32 ptype) { u16 status = (u16)staterr; u8 errors = (u8) (staterr >> 24); int sctp; /* Ignore Checksum bit is set */ if (status & E1000_RXD_STAT_IXSM) { mp->m_pkthdr.csum_flags = 0; return; } if ((ptype & E1000_RXDADV_PKTTYPE_ETQF) == 0 && (ptype & E1000_RXDADV_PKTTYPE_SCTP) != 0) sctp = 1; else sctp = 0; if (status & E1000_RXD_STAT_IPCS) { /* Did it pass? */ if (!(errors & E1000_RXD_ERR_IPE)) { /* IP Checksum Good */ mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED; mp->m_pkthdr.csum_flags |= CSUM_IP_VALID; } else mp->m_pkthdr.csum_flags = 0; } if (status & (E1000_RXD_STAT_TCPCS | E1000_RXD_STAT_UDPCS)) { u64 type = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); #if __FreeBSD_version >= 800000 if (sctp) /* reassign */ type = CSUM_SCTP_VALID; #endif /* Did it pass? */ if (!(errors & E1000_RXD_ERR_TCPE)) { mp->m_pkthdr.csum_flags |= type; if (sctp == 0) mp->m_pkthdr.csum_data = htons(0xffff); } } return; } /* * This routine is run via an vlan * config EVENT */ static void igb_register_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u32 index, bit; if (ifp->if_softc != arg) /* Not our event */ return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IGB_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; adapter->shadow_vfta[index] |= (1 << bit); ++adapter->num_vlans; /* Change hw filter setting */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) igb_setup_vlan_hw_support(adapter); IGB_CORE_UNLOCK(adapter); } /* * This routine is run via an vlan * unconfig EVENT */ static void igb_unregister_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u32 index, bit; if (ifp->if_softc != arg) return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IGB_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; adapter->shadow_vfta[index] &= ~(1 << bit); --adapter->num_vlans; /* Change hw filter setting */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) igb_setup_vlan_hw_support(adapter); IGB_CORE_UNLOCK(adapter); } static void igb_setup_vlan_hw_support(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; u32 reg; if (adapter->vf_ifp) { e1000_rlpml_set_vf(hw, adapter->max_frame_size + VLAN_TAG_SIZE); return; } reg = E1000_READ_REG(hw, E1000_CTRL); reg |= E1000_CTRL_VME; E1000_WRITE_REG(hw, E1000_CTRL, reg); /* Enable the Filter Table */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) { reg = E1000_READ_REG(hw, E1000_RCTL); reg &= ~E1000_RCTL_CFIEN; reg |= E1000_RCTL_VFE; E1000_WRITE_REG(hw, E1000_RCTL, reg); } /* Update the frame size */ E1000_WRITE_REG(&adapter->hw, E1000_RLPML, adapter->max_frame_size + VLAN_TAG_SIZE); /* Don't bother with table if no vlans */ if ((adapter->num_vlans == 0) || ((ifp->if_capenable & IFCAP_VLAN_HWFILTER) == 0)) return; /* ** A soft reset zero's out the VFTA, so ** we need to repopulate it now. */ for (int i = 0; i < IGB_VFTA_SIZE; i++) if (adapter->shadow_vfta[i] != 0) { if (adapter->vf_ifp) e1000_vfta_set_vf(hw, adapter->shadow_vfta[i], TRUE); else e1000_write_vfta(hw, i, adapter->shadow_vfta[i]); } } static void igb_enable_intr(struct adapter *adapter) { /* With RSS set up what to auto clear */ if (adapter->msix_mem) { u32 mask = (adapter->que_mask | adapter->link_mask); E1000_WRITE_REG(&adapter->hw, E1000_EIAC, mask); E1000_WRITE_REG(&adapter->hw, E1000_EIAM, mask); E1000_WRITE_REG(&adapter->hw, E1000_EIMS, mask); E1000_WRITE_REG(&adapter->hw, E1000_IMS, E1000_IMS_LSC); } else { E1000_WRITE_REG(&adapter->hw, E1000_IMS, IMS_ENABLE_MASK); } E1000_WRITE_FLUSH(&adapter->hw); return; } static void igb_disable_intr(struct adapter *adapter) { if (adapter->msix_mem) { E1000_WRITE_REG(&adapter->hw, E1000_EIMC, ~0); E1000_WRITE_REG(&adapter->hw, E1000_EIAC, 0); } E1000_WRITE_REG(&adapter->hw, E1000_IMC, ~0); E1000_WRITE_FLUSH(&adapter->hw); return; } /* * Bit of a misnomer, what this really means is * to enable OS management of the system... aka * to disable special hardware management features */ static void igb_init_manageability(struct adapter *adapter) { if (adapter->has_manage) { int manc2h = E1000_READ_REG(&adapter->hw, E1000_MANC2H); int manc = E1000_READ_REG(&adapter->hw, E1000_MANC); /* disable hardware interception of ARP */ manc &= ~(E1000_MANC_ARP_EN); /* enable receiving management packets to the host */ manc |= E1000_MANC_EN_MNG2HOST; manc2h |= 1 << 5; /* Mng Port 623 */ manc2h |= 1 << 6; /* Mng Port 664 */ E1000_WRITE_REG(&adapter->hw, E1000_MANC2H, manc2h); E1000_WRITE_REG(&adapter->hw, E1000_MANC, manc); } } /* * Give control back to hardware management * controller if there is one. */ static void igb_release_manageability(struct adapter *adapter) { if (adapter->has_manage) { int manc = E1000_READ_REG(&adapter->hw, E1000_MANC); /* re-enable hardware interception of ARP */ manc |= E1000_MANC_ARP_EN; manc &= ~E1000_MANC_EN_MNG2HOST; E1000_WRITE_REG(&adapter->hw, E1000_MANC, manc); } } /* * igb_get_hw_control sets CTRL_EXT:DRV_LOAD bit. * For ASF and Pass Through versions of f/w this means that * the driver is loaded. * */ static void igb_get_hw_control(struct adapter *adapter) { u32 ctrl_ext; if (adapter->vf_ifp) return; /* Let firmware know the driver has taken over */ ctrl_ext = E1000_READ_REG(&adapter->hw, E1000_CTRL_EXT); E1000_WRITE_REG(&adapter->hw, E1000_CTRL_EXT, ctrl_ext | E1000_CTRL_EXT_DRV_LOAD); } /* * igb_release_hw_control resets CTRL_EXT:DRV_LOAD bit. * For ASF and Pass Through versions of f/w this means that the * driver is no longer loaded. * */ static void igb_release_hw_control(struct adapter *adapter) { u32 ctrl_ext; if (adapter->vf_ifp) return; /* Let firmware taken over control of h/w */ ctrl_ext = E1000_READ_REG(&adapter->hw, E1000_CTRL_EXT); E1000_WRITE_REG(&adapter->hw, E1000_CTRL_EXT, ctrl_ext & ~E1000_CTRL_EXT_DRV_LOAD); } static int igb_is_valid_ether_addr(uint8_t *addr) { char zero_addr[6] = { 0, 0, 0, 0, 0, 0 }; if ((addr[0] & 1) || (!bcmp(addr, zero_addr, ETHER_ADDR_LEN))) { return (FALSE); } return (TRUE); } /* * Enable PCI Wake On Lan capability */ static void igb_enable_wakeup(device_t dev) { u16 cap, status; u8 id; /* First find the capabilities pointer*/ cap = pci_read_config(dev, PCIR_CAP_PTR, 2); /* Read the PM Capabilities */ id = pci_read_config(dev, cap, 1); if (id != PCIY_PMG) /* Something wrong */ return; /* OK, we have the power capabilities, so now get the status register */ cap += PCIR_POWER_STATUS; status = pci_read_config(dev, cap, 2); status |= PCIM_PSTAT_PME | PCIM_PSTAT_PMEENABLE; pci_write_config(dev, cap, status, 2); return; } static void igb_led_func(void *arg, int onoff) { struct adapter *adapter = arg; IGB_CORE_LOCK(adapter); if (onoff) { e1000_setup_led(&adapter->hw); e1000_led_on(&adapter->hw); } else { e1000_led_off(&adapter->hw); e1000_cleanup_led(&adapter->hw); } IGB_CORE_UNLOCK(adapter); } static uint64_t igb_get_vf_counter(if_t ifp, ift_counter cnt) { struct adapter *adapter; struct e1000_vf_stats *stats; #ifndef IGB_LEGACY_TX struct tx_ring *txr; uint64_t rv; #endif adapter = if_getsoftc(ifp); stats = (struct e1000_vf_stats *)adapter->stats; switch (cnt) { case IFCOUNTER_IPACKETS: return (stats->gprc); case IFCOUNTER_OPACKETS: return (stats->gptc); case IFCOUNTER_IBYTES: return (stats->gorc); case IFCOUNTER_OBYTES: return (stats->gotc); case IFCOUNTER_IMCASTS: return (stats->mprc); case IFCOUNTER_IERRORS: return (adapter->dropped_pkts); case IFCOUNTER_OERRORS: return (adapter->watchdog_events); #ifndef IGB_LEGACY_TX case IFCOUNTER_OQDROPS: rv = 0; txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) rv += txr->br->br_drops; return (rv); #endif default: return (if_get_counter_default(ifp, cnt)); } } static uint64_t igb_get_counter(if_t ifp, ift_counter cnt) { struct adapter *adapter; struct e1000_hw_stats *stats; #ifndef IGB_LEGACY_TX struct tx_ring *txr; uint64_t rv; #endif adapter = if_getsoftc(ifp); if (adapter->vf_ifp) return (igb_get_vf_counter(ifp, cnt)); stats = (struct e1000_hw_stats *)adapter->stats; switch (cnt) { case IFCOUNTER_IPACKETS: return (stats->gprc); case IFCOUNTER_OPACKETS: return (stats->gptc); case IFCOUNTER_IBYTES: return (stats->gorc); case IFCOUNTER_OBYTES: return (stats->gotc); case IFCOUNTER_IMCASTS: return (stats->mprc); case IFCOUNTER_OMCASTS: return (stats->mptc); case IFCOUNTER_IERRORS: return (adapter->dropped_pkts + stats->rxerrc + stats->crcerrs + stats->algnerrc + stats->ruc + stats->roc + stats->cexterr); case IFCOUNTER_OERRORS: return (stats->ecol + stats->latecol + adapter->watchdog_events); case IFCOUNTER_COLLISIONS: return (stats->colc); case IFCOUNTER_IQDROPS: return (stats->mpc); #ifndef IGB_LEGACY_TX case IFCOUNTER_OQDROPS: rv = 0; txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) rv += txr->br->br_drops; return (rv); #endif default: return (if_get_counter_default(ifp, cnt)); } } /********************************************************************** * * Update the board statistics counters. * **********************************************************************/ static void igb_update_stats_counters(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct e1000_hw_stats *stats; /* ** The virtual function adapter has only a ** small controlled set of stats, do only ** those and return. */ if (adapter->vf_ifp) { igb_update_vf_stats_counters(adapter); return; } stats = (struct e1000_hw_stats *)adapter->stats; if (adapter->hw.phy.media_type == e1000_media_type_copper || (E1000_READ_REG(hw, E1000_STATUS) & E1000_STATUS_LU)) { stats->symerrs += E1000_READ_REG(hw,E1000_SYMERRS); stats->sec += E1000_READ_REG(hw, E1000_SEC); } stats->crcerrs += E1000_READ_REG(hw, E1000_CRCERRS); stats->mpc += E1000_READ_REG(hw, E1000_MPC); stats->scc += E1000_READ_REG(hw, E1000_SCC); stats->ecol += E1000_READ_REG(hw, E1000_ECOL); stats->mcc += E1000_READ_REG(hw, E1000_MCC); stats->latecol += E1000_READ_REG(hw, E1000_LATECOL); stats->colc += E1000_READ_REG(hw, E1000_COLC); stats->dc += E1000_READ_REG(hw, E1000_DC); stats->rlec += E1000_READ_REG(hw, E1000_RLEC); stats->xonrxc += E1000_READ_REG(hw, E1000_XONRXC); stats->xontxc += E1000_READ_REG(hw, E1000_XONTXC); /* ** For watchdog management we need to know if we have been ** paused during the last interval, so capture that here. */ adapter->pause_frames = E1000_READ_REG(&adapter->hw, E1000_XOFFRXC); stats->xoffrxc += adapter->pause_frames; stats->xofftxc += E1000_READ_REG(hw, E1000_XOFFTXC); stats->fcruc += E1000_READ_REG(hw, E1000_FCRUC); stats->prc64 += E1000_READ_REG(hw, E1000_PRC64); stats->prc127 += E1000_READ_REG(hw, E1000_PRC127); stats->prc255 += E1000_READ_REG(hw, E1000_PRC255); stats->prc511 += E1000_READ_REG(hw, E1000_PRC511); stats->prc1023 += E1000_READ_REG(hw, E1000_PRC1023); stats->prc1522 += E1000_READ_REG(hw, E1000_PRC1522); stats->gprc += E1000_READ_REG(hw, E1000_GPRC); stats->bprc += E1000_READ_REG(hw, E1000_BPRC); stats->mprc += E1000_READ_REG(hw, E1000_MPRC); stats->gptc += E1000_READ_REG(hw, E1000_GPTC); /* For the 64-bit byte counters the low dword must be read first. */ /* Both registers clear on the read of the high dword */ stats->gorc += E1000_READ_REG(hw, E1000_GORCL) + ((u64)E1000_READ_REG(hw, E1000_GORCH) << 32); stats->gotc += E1000_READ_REG(hw, E1000_GOTCL) + ((u64)E1000_READ_REG(hw, E1000_GOTCH) << 32); stats->rnbc += E1000_READ_REG(hw, E1000_RNBC); stats->ruc += E1000_READ_REG(hw, E1000_RUC); stats->rfc += E1000_READ_REG(hw, E1000_RFC); stats->roc += E1000_READ_REG(hw, E1000_ROC); stats->rjc += E1000_READ_REG(hw, E1000_RJC); stats->mgprc += E1000_READ_REG(hw, E1000_MGTPRC); stats->mgpdc += E1000_READ_REG(hw, E1000_MGTPDC); stats->mgptc += E1000_READ_REG(hw, E1000_MGTPTC); stats->tor += E1000_READ_REG(hw, E1000_TORL) + ((u64)E1000_READ_REG(hw, E1000_TORH) << 32); stats->tot += E1000_READ_REG(hw, E1000_TOTL) + ((u64)E1000_READ_REG(hw, E1000_TOTH) << 32); stats->tpr += E1000_READ_REG(hw, E1000_TPR); stats->tpt += E1000_READ_REG(hw, E1000_TPT); stats->ptc64 += E1000_READ_REG(hw, E1000_PTC64); stats->ptc127 += E1000_READ_REG(hw, E1000_PTC127); stats->ptc255 += E1000_READ_REG(hw, E1000_PTC255); stats->ptc511 += E1000_READ_REG(hw, E1000_PTC511); stats->ptc1023 += E1000_READ_REG(hw, E1000_PTC1023); stats->ptc1522 += E1000_READ_REG(hw, E1000_PTC1522); stats->mptc += E1000_READ_REG(hw, E1000_MPTC); stats->bptc += E1000_READ_REG(hw, E1000_BPTC); /* Interrupt Counts */ stats->iac += E1000_READ_REG(hw, E1000_IAC); stats->icrxptc += E1000_READ_REG(hw, E1000_ICRXPTC); stats->icrxatc += E1000_READ_REG(hw, E1000_ICRXATC); stats->ictxptc += E1000_READ_REG(hw, E1000_ICTXPTC); stats->ictxatc += E1000_READ_REG(hw, E1000_ICTXATC); stats->ictxqec += E1000_READ_REG(hw, E1000_ICTXQEC); stats->ictxqmtc += E1000_READ_REG(hw, E1000_ICTXQMTC); stats->icrxdmtc += E1000_READ_REG(hw, E1000_ICRXDMTC); stats->icrxoc += E1000_READ_REG(hw, E1000_ICRXOC); /* Host to Card Statistics */ stats->cbtmpc += E1000_READ_REG(hw, E1000_CBTMPC); stats->htdpmc += E1000_READ_REG(hw, E1000_HTDPMC); stats->cbrdpc += E1000_READ_REG(hw, E1000_CBRDPC); stats->cbrmpc += E1000_READ_REG(hw, E1000_CBRMPC); stats->rpthc += E1000_READ_REG(hw, E1000_RPTHC); stats->hgptc += E1000_READ_REG(hw, E1000_HGPTC); stats->htcbdpc += E1000_READ_REG(hw, E1000_HTCBDPC); stats->hgorc += (E1000_READ_REG(hw, E1000_HGORCL) + ((u64)E1000_READ_REG(hw, E1000_HGORCH) << 32)); stats->hgotc += (E1000_READ_REG(hw, E1000_HGOTCL) + ((u64)E1000_READ_REG(hw, E1000_HGOTCH) << 32)); stats->lenerrs += E1000_READ_REG(hw, E1000_LENERRS); stats->scvpc += E1000_READ_REG(hw, E1000_SCVPC); stats->hrmpc += E1000_READ_REG(hw, E1000_HRMPC); stats->algnerrc += E1000_READ_REG(hw, E1000_ALGNERRC); stats->rxerrc += E1000_READ_REG(hw, E1000_RXERRC); stats->tncrs += E1000_READ_REG(hw, E1000_TNCRS); stats->cexterr += E1000_READ_REG(hw, E1000_CEXTERR); stats->tsctc += E1000_READ_REG(hw, E1000_TSCTC); stats->tsctfc += E1000_READ_REG(hw, E1000_TSCTFC); /* Driver specific counters */ adapter->device_control = E1000_READ_REG(hw, E1000_CTRL); adapter->rx_control = E1000_READ_REG(hw, E1000_RCTL); adapter->int_mask = E1000_READ_REG(hw, E1000_IMS); adapter->eint_mask = E1000_READ_REG(hw, E1000_EIMS); adapter->packet_buf_alloc_tx = ((E1000_READ_REG(hw, E1000_PBA) & 0xffff0000) >> 16); adapter->packet_buf_alloc_rx = (E1000_READ_REG(hw, E1000_PBA) & 0xffff); } /********************************************************************** * * Initialize the VF board statistics counters. * **********************************************************************/ static void igb_vf_init_stats(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct e1000_vf_stats *stats; stats = (struct e1000_vf_stats *)adapter->stats; if (stats == NULL) return; stats->last_gprc = E1000_READ_REG(hw, E1000_VFGPRC); stats->last_gorc = E1000_READ_REG(hw, E1000_VFGORC); stats->last_gptc = E1000_READ_REG(hw, E1000_VFGPTC); stats->last_gotc = E1000_READ_REG(hw, E1000_VFGOTC); stats->last_mprc = E1000_READ_REG(hw, E1000_VFMPRC); } /********************************************************************** * * Update the VF board statistics counters. * **********************************************************************/ static void igb_update_vf_stats_counters(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct e1000_vf_stats *stats; if (adapter->link_speed == 0) return; stats = (struct e1000_vf_stats *)adapter->stats; UPDATE_VF_REG(E1000_VFGPRC, stats->last_gprc, stats->gprc); UPDATE_VF_REG(E1000_VFGORC, stats->last_gorc, stats->gorc); UPDATE_VF_REG(E1000_VFGPTC, stats->last_gptc, stats->gptc); UPDATE_VF_REG(E1000_VFGOTC, stats->last_gotc, stats->gotc); UPDATE_VF_REG(E1000_VFMPRC, stats->last_mprc, stats->mprc); } /* Export a single 32-bit register via a read-only sysctl. */ static int igb_sysctl_reg_handler(SYSCTL_HANDLER_ARGS) { struct adapter *adapter; u_int val; adapter = oidp->oid_arg1; val = E1000_READ_REG(&adapter->hw, oidp->oid_arg2); return (sysctl_handle_int(oidp, &val, 0, req)); } /* ** Tuneable interrupt rate handler */ static int igb_sysctl_interrupt_rate_handler(SYSCTL_HANDLER_ARGS) { struct igb_queue *que = ((struct igb_queue *)oidp->oid_arg1); int error; u32 reg, usec, rate; reg = E1000_READ_REG(&que->adapter->hw, E1000_EITR(que->msix)); usec = ((reg & 0x7FFC) >> 2); if (usec > 0) rate = 1000000 / usec; else rate = 0; error = sysctl_handle_int(oidp, &rate, 0, req); if (error || !req->newptr) return error; return 0; } /* * Add sysctl variables, one per statistic, to the system. */ static void igb_add_hw_stats(struct adapter *adapter) { device_t dev = adapter->dev; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev); struct sysctl_oid *tree = device_get_sysctl_tree(dev); struct sysctl_oid_list *child = SYSCTL_CHILDREN(tree); struct e1000_hw_stats *stats = adapter->stats; struct sysctl_oid *stat_node, *queue_node, *int_node, *host_node; struct sysctl_oid_list *stat_list, *queue_list, *int_list, *host_list; #define QUEUE_NAME_LEN 32 char namebuf[QUEUE_NAME_LEN]; /* Driver Statistics */ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "dropped", CTLFLAG_RD, &adapter->dropped_pkts, "Driver dropped packets"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "link_irq", CTLFLAG_RD, &adapter->link_irq, "Link MSIX IRQ Handled"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "mbuf_defrag_fail", CTLFLAG_RD, &adapter->mbuf_defrag_failed, "Defragmenting mbuf chain failed"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_dma_fail", CTLFLAG_RD, &adapter->no_tx_dma_setup, "Driver tx dma failure in xmit"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_overruns", CTLFLAG_RD, &adapter->rx_overruns, "RX overruns"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "watchdog_timeouts", CTLFLAG_RD, &adapter->watchdog_events, "Watchdog timeouts"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "device_control", CTLFLAG_RD, &adapter->device_control, "Device Control Register"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_control", CTLFLAG_RD, &adapter->rx_control, "Receiver Control Register"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "interrupt_mask", CTLFLAG_RD, &adapter->int_mask, "Interrupt Mask"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "extended_int_mask", CTLFLAG_RD, &adapter->eint_mask, "Extended Interrupt Mask"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_buf_alloc", CTLFLAG_RD, &adapter->packet_buf_alloc_tx, "Transmit Buffer Packet Allocation"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_buf_alloc", CTLFLAG_RD, &adapter->packet_buf_alloc_rx, "Receive Buffer Packet Allocation"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "fc_high_water", CTLFLAG_RD, &adapter->hw.fc.high_water, 0, "Flow Control High Watermark"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "fc_low_water", CTLFLAG_RD, &adapter->hw.fc.low_water, 0, "Flow Control Low Watermark"); for (int i = 0; i < adapter->num_queues; i++, rxr++, txr++) { struct lro_ctrl *lro = &rxr->lro; snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate", CTLTYPE_UINT | CTLFLAG_RD, &adapter->queues[i], sizeof(&adapter->queues[i]), igb_sysctl_interrupt_rate_handler, "IU", "Interrupt Rate"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_head", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_TDH(txr->me), igb_sysctl_reg_handler, "IU", "Transmit Descriptor Head"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_tail", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_TDT(txr->me), igb_sysctl_reg_handler, "IU", "Transmit Descriptor Tail"); SYSCTL_ADD_QUAD(ctx, queue_list, OID_AUTO, "no_desc_avail", CTLFLAG_RD, &txr->no_desc_avail, "Queue Descriptors Unavailable"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "tx_packets", CTLFLAG_RD, &txr->total_packets, "Queue Packets Transmitted"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_head", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_RDH(rxr->me), igb_sysctl_reg_handler, "IU", "Receive Descriptor Head"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_tail", CTLTYPE_UINT | CTLFLAG_RD, adapter, E1000_RDT(rxr->me), igb_sysctl_reg_handler, "IU", "Receive Descriptor Tail"); SYSCTL_ADD_QUAD(ctx, queue_list, OID_AUTO, "rx_packets", CTLFLAG_RD, &rxr->rx_packets, "Queue Packets Received"); SYSCTL_ADD_QUAD(ctx, queue_list, OID_AUTO, "rx_bytes", CTLFLAG_RD, &rxr->rx_bytes, "Queue Bytes Received"); - SYSCTL_ADD_UINT(ctx, queue_list, OID_AUTO, "lro_queued", + SYSCTL_ADD_U64(ctx, queue_list, OID_AUTO, "lro_queued", CTLFLAG_RD, &lro->lro_queued, 0, "LRO Queued"); - SYSCTL_ADD_UINT(ctx, queue_list, OID_AUTO, "lro_flushed", + SYSCTL_ADD_U64(ctx, queue_list, OID_AUTO, "lro_flushed", CTLFLAG_RD, &lro->lro_flushed, 0, "LRO Flushed"); } /* MAC stats get their own sub node */ stat_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "mac_stats", CTLFLAG_RD, NULL, "MAC Statistics"); stat_list = SYSCTL_CHILDREN(stat_node); /* ** VF adapter has a very limited set of stats ** since its not managing the metal, so to speak. */ if (adapter->vf_ifp) { SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_pkts_recvd", CTLFLAG_RD, &stats->gprc, "Good Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_pkts_txd", CTLFLAG_RD, &stats->gptc, "Good Packets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_octets_recvd", CTLFLAG_RD, &stats->gorc, "Good Octets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_octets_txd", CTLFLAG_RD, &stats->gotc, "Good Octets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_recvd", CTLFLAG_RD, &stats->mprc, "Multicast Packets Received"); return; } SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "excess_coll", CTLFLAG_RD, &stats->ecol, "Excessive collisions"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "single_coll", CTLFLAG_RD, &stats->scc, "Single collisions"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "multiple_coll", CTLFLAG_RD, &stats->mcc, "Multiple collisions"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "late_coll", CTLFLAG_RD, &stats->latecol, "Late collisions"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "collision_count", CTLFLAG_RD, &stats->colc, "Collision Count"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "symbol_errors", CTLFLAG_RD, &stats->symerrs, "Symbol Errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "sequence_errors", CTLFLAG_RD, &stats->sec, "Sequence Errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "defer_count", CTLFLAG_RD, &stats->dc, "Defer Count"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "missed_packets", CTLFLAG_RD, &stats->mpc, "Missed Packets"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_length_errors", CTLFLAG_RD, &stats->rlec, "Receive Length Errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_no_buff", CTLFLAG_RD, &stats->rnbc, "Receive No Buffers"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_undersize", CTLFLAG_RD, &stats->ruc, "Receive Undersize"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_fragmented", CTLFLAG_RD, &stats->rfc, "Fragmented Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_oversize", CTLFLAG_RD, &stats->roc, "Oversized Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_jabber", CTLFLAG_RD, &stats->rjc, "Recevied Jabber"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "recv_errs", CTLFLAG_RD, &stats->rxerrc, "Receive Errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "crc_errs", CTLFLAG_RD, &stats->crcerrs, "CRC errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "alignment_errs", CTLFLAG_RD, &stats->algnerrc, "Alignment Errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_no_crs", CTLFLAG_RD, &stats->tncrs, "Transmit with No CRS"); /* On 82575 these are collision counts */ SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "coll_ext_errs", CTLFLAG_RD, &stats->cexterr, "Collision/Carrier extension errors"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "xon_recvd", CTLFLAG_RD, &stats->xonrxc, "XON Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "xon_txd", CTLFLAG_RD, &stats->xontxc, "XON Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "xoff_recvd", CTLFLAG_RD, &stats->xoffrxc, "XOFF Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "xoff_txd", CTLFLAG_RD, &stats->xofftxc, "XOFF Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "unsupported_fc_recvd", CTLFLAG_RD, &stats->fcruc, "Unsupported Flow Control Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "mgmt_pkts_recvd", CTLFLAG_RD, &stats->mgprc, "Management Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "mgmt_pkts_drop", CTLFLAG_RD, &stats->mgpdc, "Management Packets Dropped"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "mgmt_pkts_txd", CTLFLAG_RD, &stats->mgptc, "Management Packets Transmitted"); /* Packet Reception Stats */ SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "total_pkts_recvd", CTLFLAG_RD, &stats->tpr, "Total Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_pkts_recvd", CTLFLAG_RD, &stats->gprc, "Good Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_recvd", CTLFLAG_RD, &stats->bprc, "Broadcast Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_recvd", CTLFLAG_RD, &stats->mprc, "Multicast Packets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_64", CTLFLAG_RD, &stats->prc64, "64 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_65_127", CTLFLAG_RD, &stats->prc127, "65-127 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_128_255", CTLFLAG_RD, &stats->prc255, "128-255 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_256_511", CTLFLAG_RD, &stats->prc511, "256-511 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_512_1023", CTLFLAG_RD, &stats->prc1023, "512-1023 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "rx_frames_1024_1522", CTLFLAG_RD, &stats->prc1522, "1023-1522 byte frames received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_octets_recvd", CTLFLAG_RD, &stats->gorc, "Good Octets Received"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "total_octets_recvd", CTLFLAG_RD, &stats->tor, "Total Octets Received"); /* Packet Transmission Stats */ SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_octets_txd", CTLFLAG_RD, &stats->gotc, "Good Octets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "total_octets_txd", CTLFLAG_RD, &stats->tot, "Total Octets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "total_pkts_txd", CTLFLAG_RD, &stats->tpt, "Total Packets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_pkts_txd", CTLFLAG_RD, &stats->gptc, "Good Packets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_txd", CTLFLAG_RD, &stats->bptc, "Broadcast Packets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_txd", CTLFLAG_RD, &stats->mptc, "Multicast Packets Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_64", CTLFLAG_RD, &stats->ptc64, "64 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_65_127", CTLFLAG_RD, &stats->ptc127, "65-127 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_128_255", CTLFLAG_RD, &stats->ptc255, "128-255 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_256_511", CTLFLAG_RD, &stats->ptc511, "256-511 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_512_1023", CTLFLAG_RD, &stats->ptc1023, "512-1023 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tx_frames_1024_1522", CTLFLAG_RD, &stats->ptc1522, "1024-1522 byte frames transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tso_txd", CTLFLAG_RD, &stats->tsctc, "TSO Contexts Transmitted"); SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "tso_ctx_fail", CTLFLAG_RD, &stats->tsctfc, "TSO Contexts Failed"); /* Interrupt Stats */ int_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "interrupts", CTLFLAG_RD, NULL, "Interrupt Statistics"); int_list = SYSCTL_CHILDREN(int_node); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "asserts", CTLFLAG_RD, &stats->iac, "Interrupt Assertion Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "rx_pkt_timer", CTLFLAG_RD, &stats->icrxptc, "Interrupt Cause Rx Pkt Timer Expire Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "rx_abs_timer", CTLFLAG_RD, &stats->icrxatc, "Interrupt Cause Rx Abs Timer Expire Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "tx_pkt_timer", CTLFLAG_RD, &stats->ictxptc, "Interrupt Cause Tx Pkt Timer Expire Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "tx_abs_timer", CTLFLAG_RD, &stats->ictxatc, "Interrupt Cause Tx Abs Timer Expire Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "tx_queue_empty", CTLFLAG_RD, &stats->ictxqec, "Interrupt Cause Tx Queue Empty Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "tx_queue_min_thresh", CTLFLAG_RD, &stats->ictxqmtc, "Interrupt Cause Tx Queue Min Thresh Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "rx_desc_min_thresh", CTLFLAG_RD, &stats->icrxdmtc, "Interrupt Cause Rx Desc Min Thresh Count"); SYSCTL_ADD_QUAD(ctx, int_list, OID_AUTO, "rx_overrun", CTLFLAG_RD, &stats->icrxoc, "Interrupt Cause Receiver Overrun Count"); /* Host to Card Stats */ host_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "host", CTLFLAG_RD, NULL, "Host to Card Statistics"); host_list = SYSCTL_CHILDREN(host_node); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "breaker_tx_pkt", CTLFLAG_RD, &stats->cbtmpc, "Circuit Breaker Tx Packet Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "host_tx_pkt_discard", CTLFLAG_RD, &stats->htdpmc, "Host Transmit Discarded Packets"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "rx_pkt", CTLFLAG_RD, &stats->rpthc, "Rx Packets To Host"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "breaker_rx_pkts", CTLFLAG_RD, &stats->cbrmpc, "Circuit Breaker Rx Packet Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "breaker_rx_pkt_drop", CTLFLAG_RD, &stats->cbrdpc, "Circuit Breaker Rx Dropped Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "tx_good_pkt", CTLFLAG_RD, &stats->hgptc, "Host Good Packets Tx Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "breaker_tx_pkt_drop", CTLFLAG_RD, &stats->htcbdpc, "Host Tx Circuit Breaker Dropped Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "rx_good_bytes", CTLFLAG_RD, &stats->hgorc, "Host Good Octets Received Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "tx_good_bytes", CTLFLAG_RD, &stats->hgotc, "Host Good Octets Transmit Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "length_errors", CTLFLAG_RD, &stats->lenerrs, "Length Errors"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "serdes_violation_pkt", CTLFLAG_RD, &stats->scvpc, "SerDes/SGMII Code Violation Pkt Count"); SYSCTL_ADD_QUAD(ctx, host_list, OID_AUTO, "header_redir_missed", CTLFLAG_RD, &stats->hrmpc, "Header Redirection Missed Packet Count"); } /********************************************************************** * * This routine provides a way to dump out the adapter eeprom, * often a useful debug/service tool. This only dumps the first * 32 words, stuff that matters is in that extent. * **********************************************************************/ static int igb_sysctl_nvm_info(SYSCTL_HANDLER_ARGS) { struct adapter *adapter; int error; int result; result = -1; error = sysctl_handle_int(oidp, &result, 0, req); if (error || !req->newptr) return (error); /* * This value will cause a hex dump of the * first 32 16-bit words of the EEPROM to * the screen. */ if (result == 1) { adapter = (struct adapter *)arg1; igb_print_nvm_info(adapter); } return (error); } static void igb_print_nvm_info(struct adapter *adapter) { u16 eeprom_data; int i, j, row = 0; /* Its a bit crude, but it gets the job done */ printf("\nInterface EEPROM Dump:\n"); printf("Offset\n0x0000 "); for (i = 0, j = 0; i < 32; i++, j++) { if (j == 8) { /* Make the offset block */ j = 0; ++row; printf("\n0x00%x0 ",row); } e1000_read_nvm(&adapter->hw, i, 1, &eeprom_data); printf("%04x ", eeprom_data); } printf("\n"); } static void igb_set_sysctl_value(struct adapter *adapter, const char *name, const char *description, int *limit, int value) { *limit = value; SYSCTL_ADD_INT(device_get_sysctl_ctx(adapter->dev), SYSCTL_CHILDREN(device_get_sysctl_tree(adapter->dev)), OID_AUTO, name, CTLFLAG_RW, limit, value, description); } /* ** Set flow control using sysctl: ** Flow control values: ** 0 - off ** 1 - rx pause ** 2 - tx pause ** 3 - full */ static int igb_set_flowcntl(SYSCTL_HANDLER_ARGS) { int error; static int input = 3; /* default is full */ struct adapter *adapter = (struct adapter *) arg1; error = sysctl_handle_int(oidp, &input, 0, req); if ((error) || (req->newptr == NULL)) return (error); switch (input) { case e1000_fc_rx_pause: case e1000_fc_tx_pause: case e1000_fc_full: case e1000_fc_none: adapter->hw.fc.requested_mode = input; adapter->fc = input; break; default: /* Do nothing */ return (error); } adapter->hw.fc.current_mode = adapter->hw.fc.requested_mode; e1000_force_mac_fc(&adapter->hw); /* XXX TODO: update DROP_EN on each RX queue if appropriate */ return (error); } /* ** Manage DMA Coalesce: ** Control values: ** 0/1 - off/on ** Legal timer values are: ** 250,500,1000-10000 in thousands */ static int igb_sysctl_dmac(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; int error; error = sysctl_handle_int(oidp, &adapter->dmac, 0, req); if ((error) || (req->newptr == NULL)) return (error); switch (adapter->dmac) { case 0: /* Disabling */ break; case 1: /* Just enable and use default */ adapter->dmac = 1000; break; case 250: case 500: case 1000: case 2000: case 3000: case 4000: case 5000: case 6000: case 7000: case 8000: case 9000: case 10000: /* Legal values - allow */ break; default: /* Do nothing, illegal value */ adapter->dmac = 0; return (EINVAL); } /* Reinit the interface */ igb_init(adapter); return (error); } /* ** Manage Energy Efficient Ethernet: ** Control values: ** 0/1 - enabled/disabled */ static int igb_sysctl_eee(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; int error, value; value = adapter->hw.dev_spec._82575.eee_disable; error = sysctl_handle_int(oidp, &value, 0, req); if (error || req->newptr == NULL) return (error); IGB_CORE_LOCK(adapter); adapter->hw.dev_spec._82575.eee_disable = (value != 0); igb_init_locked(adapter); IGB_CORE_UNLOCK(adapter); return (0); } Index: head/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c =================================================================== --- head/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c (revision 294326) +++ head/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c (revision 294327) @@ -1,1574 +1,1574 @@ /*- * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2009-2012 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2004-2006 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "hv_net_vsc.h" #include "hv_rndis.h" #include "hv_rndis_filter.h" /* Short for Hyper-V network interface */ #define NETVSC_DEVNAME "hn" /* * It looks like offset 0 of buf is reserved to hold the softc pointer. * The sc pointer evidently not needed, and is not presently populated. * The packet offset is where the netvsc_packet starts in the buffer. */ #define HV_NV_SC_PTR_OFFSET_IN_BUF 0 #define HV_NV_PACKET_OFFSET_IN_BUF 16 /* * A unified flag for all outbound check sum flags is useful, * and it helps avoiding unnecessary check sum calculation in * network forwarding scenario. */ #define HV_CSUM_FOR_OUTBOUND \ (CSUM_IP|CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP|CSUM_IP_TSO| \ CSUM_IP_ISCSI|CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP| \ CSUM_IP6_TSO|CSUM_IP6_ISCSI) /* XXX move to netinet/tcp_lro.h */ #define HN_LRO_HIWAT_MAX 65535 #define HN_LRO_HIWAT_DEF HN_LRO_HIWAT_MAX /* YYY 2*MTU is a bit rough, but should be good enough. */ #define HN_LRO_HIWAT_MTULIM(ifp) (2 * (ifp)->if_mtu) #define HN_LRO_HIWAT_ISVALID(sc, hiwat) \ ((hiwat) >= HN_LRO_HIWAT_MTULIM((sc)->hn_ifp) || \ (hiwat) <= HN_LRO_HIWAT_MAX) /* * Be aware that this sleepable mutex will exhibit WITNESS errors when * certain TCP and ARP code paths are taken. This appears to be a * well-known condition, as all other drivers checked use a sleeping * mutex to protect their transmit paths. * Also Be aware that mutexes do not play well with semaphores, and there * is a conflicting semaphore in a certain channel code path. */ #define NV_LOCK_INIT(_sc, _name) \ mtx_init(&(_sc)->hn_lock, _name, MTX_NETWORK_LOCK, MTX_DEF) #define NV_LOCK(_sc) mtx_lock(&(_sc)->hn_lock) #define NV_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->hn_lock, MA_OWNED) #define NV_UNLOCK(_sc) mtx_unlock(&(_sc)->hn_lock) #define NV_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->hn_lock) /* * Globals */ int hv_promisc_mode = 0; /* normal mode by default */ /* Trust tcp segements verification on host side. */ static int hn_trust_hosttcp = 0; TUNABLE_INT("dev.hn.trust_hosttcp", &hn_trust_hosttcp); /* * Forward declarations */ static void hn_stop(hn_softc_t *sc); static void hn_ifinit_locked(hn_softc_t *sc); static void hn_ifinit(void *xsc); static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int hn_start_locked(struct ifnet *ifp); static void hn_start(struct ifnet *ifp); static int hn_ifmedia_upd(struct ifnet *ifp); static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr); #ifdef HN_LRO_HIWAT static int hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_check_iplen(const struct mbuf *, int); static __inline void hn_set_lro_hiwat(struct hn_softc *sc, int hiwat) { sc->hn_lro_hiwat = hiwat; #ifdef HN_LRO_HIWAT sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat; #endif } /* * NetVsc get message transport protocol type */ static uint32_t get_transport_proto_type(struct mbuf *m_head) { uint32_t ret_val = TRANSPORT_TYPE_NOT_IP; uint16_t ether_type = 0; int ether_len = 0; struct ether_vlan_header *eh; #ifdef INET struct ip *iph; #endif #ifdef INET6 struct ip6_hdr *ip6; #endif eh = mtod(m_head, struct ether_vlan_header*); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; ether_type = eh->evl_proto; } else { ether_len = ETHER_HDR_LEN; ether_type = eh->evl_encap_proto; } switch (ntohs(ether_type)) { #ifdef INET6 case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(m_head->m_data + ether_len); if (IPPROTO_TCP == ip6->ip6_nxt) { ret_val = TRANSPORT_TYPE_IPV6_TCP; } else if (IPPROTO_UDP == ip6->ip6_nxt) { ret_val = TRANSPORT_TYPE_IPV6_UDP; } break; #endif #ifdef INET case ETHERTYPE_IP: iph = (struct ip *)(m_head->m_data + ether_len); if (IPPROTO_TCP == iph->ip_p) { ret_val = TRANSPORT_TYPE_IPV4_TCP; } else if (IPPROTO_UDP == iph->ip_p) { ret_val = TRANSPORT_TYPE_IPV4_UDP; } break; #endif default: ret_val = TRANSPORT_TYPE_NOT_IP; break; } return (ret_val); } static int hn_ifmedia_upd(struct ifnet *ifp __unused) { return EOPNOTSUPP; } static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct hn_softc *sc = ifp->if_softc; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (!sc->hn_carrier) { ifmr->ifm_active |= IFM_NONE; return; } ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= IFM_10G_T | IFM_FDX; } /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */ static const hv_guid g_net_vsc_device_type = { .data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E} }; /* * Standard probe entry point. * */ static int netvsc_probe(device_t dev) { const char *p; p = vmbus_get_type(dev); if (!memcmp(p, &g_net_vsc_device_type.data, sizeof(hv_guid))) { device_set_desc(dev, "Synthetic Network Interface"); if (bootverbose) printf("Netvsc probe... DONE \n"); return (BUS_PROBE_DEFAULT); } return (ENXIO); } /* * Standard attach entry point. * * Called when the driver is loaded. It allocates needed resources, * and initializes the "hardware" and software. */ static int netvsc_attach(device_t dev) { struct hv_device *device_ctx = vmbus_get_devctx(dev); netvsc_device_info device_info; hn_softc_t *sc; int unit = device_get_unit(dev); struct ifnet *ifp; struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; int ret; sc = device_get_softc(dev); if (sc == NULL) { return (ENOMEM); } bzero(sc, sizeof(hn_softc_t)); sc->hn_unit = unit; sc->hn_dev = dev; sc->hn_lro_hiwat = HN_LRO_HIWAT_DEF; sc->hn_trust_hosttcp = hn_trust_hosttcp; NV_LOCK_INIT(sc, "NetVSCLock"); sc->hn_dev_obj = device_ctx; ifp = sc->hn_ifp = if_alloc(IFT_ETHER); ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_dunit = unit; ifp->if_dname = NETVSC_DEVNAME; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = hn_ioctl; ifp->if_start = hn_start; ifp->if_init = hn_ifinit; /* needed by hv_rf_on_device_add() code */ ifp->if_mtu = ETHERMTU; IFQ_SET_MAXLEN(&ifp->if_snd, 512); ifp->if_snd.ifq_drv_maxlen = 511; IFQ_SET_READY(&ifp->if_snd); ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); /* XXX ifmedia_set really should do this for us */ sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; /* * Tell upper layers that we support full VLAN capability. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO | IFCAP_LRO; ifp->if_capenable |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO | IFCAP_LRO; /* * Only enable UDP checksum offloading when it is on 2012R2 or * later. UDP checksum offloading doesn't work on earlier * Windows releases. */ if (hv_vmbus_protocal_version >= HV_VMBUS_VERSION_WIN8_1) ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO; else ifp->if_hwassist = CSUM_TCP | CSUM_TSO; ret = hv_rf_on_device_add(device_ctx, &device_info); if (ret != 0) { if_free(ifp); return (ret); } if (device_info.link_state == 0) { sc->hn_carrier = 1; } #if defined(INET) || defined(INET6) tcp_lro_init(&sc->hn_lro); /* Driver private LRO settings */ sc->hn_lro.ifp = ifp; #ifdef HN_LRO_HIWAT sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat; #endif #endif /* INET || INET6 */ ether_ifattach(ifp, device_info.mac_addr); ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); - SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_queued", + SYSCTL_ADD_U64(ctx, child, OID_AUTO, "lro_queued", CTLFLAG_RW, &sc->hn_lro.lro_queued, 0, "LRO queued"); - SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_flushed", + SYSCTL_ADD_U64(ctx, child, OID_AUTO, "lro_flushed", CTLFLAG_RW, &sc->hn_lro.lro_flushed, 0, "LRO flushed"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "lro_tried", CTLFLAG_RW, &sc->hn_lro_tried, "# of LRO tries"); #ifdef HN_LRO_HIWAT SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_hiwat", CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_hiwat_sysctl, "I", "LRO high watermark"); #endif SYSCTL_ADD_INT(ctx, child, OID_AUTO, "trust_hosttcp", CTLFLAG_RW, &sc->hn_trust_hosttcp, 0, "Trust tcp segement verification on host side, " "when csum info is missing"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_ip", CTLFLAG_RW, &sc->hn_csum_ip, "RXCSUM IP"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_tcp", CTLFLAG_RW, &sc->hn_csum_tcp, "RXCSUM TCP"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_trusted", CTLFLAG_RW, &sc->hn_csum_trusted, "# of TCP segements that we trust host's csum verification"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "small_pkts", CTLFLAG_RW, &sc->hn_small_pkts, "# of small packets received"); if (unit == 0) { struct sysctl_ctx_list *dc_ctx; struct sysctl_oid_list *dc_child; devclass_t dc; /* * Add sysctl nodes for devclass */ dc = device_get_devclass(dev); dc_ctx = devclass_get_sysctl_ctx(dc); dc_child = SYSCTL_CHILDREN(devclass_get_sysctl_tree(dc)); SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "trust_hosttcp", CTLFLAG_RD, &hn_trust_hosttcp, 0, "Trust tcp segement verification on host side, " "when csum info is missing (global setting)"); } return (0); } /* * Standard detach entry point */ static int netvsc_detach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); struct hv_device *hv_device = vmbus_get_devctx(dev); if (bootverbose) printf("netvsc_detach\n"); /* * XXXKYS: Need to clean up all our * driver state; this is the driver * unloading. */ /* * XXXKYS: Need to stop outgoing traffic and unregister * the netdevice. */ hv_rf_on_device_remove(hv_device, HV_RF_NV_DESTROY_CHANNEL); ifmedia_removeall(&sc->hn_media); #if defined(INET) || defined(INET6) tcp_lro_free(&sc->hn_lro); #endif return (0); } /* * Standard shutdown entry point */ static int netvsc_shutdown(device_t dev) { return (0); } /* * Send completion processing * * Note: It looks like offset 0 of buf is reserved to hold the softc * pointer. The sc pointer is not currently needed in this function, and * it is not presently populated by the TX function. */ void netvsc_xmit_completion(void *context) { netvsc_packet *packet = (netvsc_packet *)context; struct mbuf *mb; uint8_t *buf; mb = (struct mbuf *)(uintptr_t)packet->compl.send.send_completion_tid; buf = ((uint8_t *)packet) - HV_NV_PACKET_OFFSET_IN_BUF; free(buf, M_NETVSC); if (mb != NULL) { m_freem(mb); } } /* * Start a transmit of one or more packets */ static int hn_start_locked(struct ifnet *ifp) { hn_softc_t *sc = ifp->if_softc; struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); netvsc_dev *net_dev = sc->net_dev; device_t dev = device_ctx->device; uint8_t *buf; netvsc_packet *packet; struct mbuf *m_head, *m; struct mbuf *mc_head = NULL; struct ether_vlan_header *eh; rndis_msg *rndis_mesg; rndis_packet *rndis_pkt; rndis_per_packet_info *rppi; ndis_8021q_info *rppi_vlan_info; rndis_tcp_ip_csum_info *csum_info; rndis_tcp_tso_info *tso_info; int ether_len; int i; int num_frags; int len; int retries = 0; int ret = 0; uint32_t rndis_msg_size = 0; uint32_t trans_proto_type; uint32_t send_buf_section_idx = NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; while (!IFQ_DRV_IS_EMPTY(&sc->hn_ifp->if_snd)) { IFQ_DRV_DEQUEUE(&sc->hn_ifp->if_snd, m_head); if (m_head == NULL) { break; } len = 0; num_frags = 0; /* Walk the mbuf list computing total length and num frags */ for (m = m_head; m != NULL; m = m->m_next) { if (m->m_len != 0) { num_frags++; len += m->m_len; } } /* * Reserve the number of pages requested. Currently, * one page is reserved for the message in the RNDIS * filter packet */ num_frags += HV_RF_NUM_TX_RESERVED_PAGE_BUFS; /* If exceeds # page_buffers in netvsc_packet */ if (num_frags > NETVSC_PACKET_MAXPAGE) { device_printf(dev, "exceed max page buffers,%d,%d\n", num_frags, NETVSC_PACKET_MAXPAGE); m_freem(m_head); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (EINVAL); } /* * Allocate a buffer with space for a netvsc packet plus a * number of reserved areas. First comes a (currently 16 * bytes, currently unused) reserved data area. Second is * the netvsc_packet. Third is an area reserved for an * rndis_filter_packet struct. Fourth (optional) is a * rndis_per_packet_info struct. * Changed malloc to M_NOWAIT to avoid sleep under spin lock. * No longer reserving extra space for page buffers, as they * are already part of the netvsc_packet. */ buf = malloc(HV_NV_PACKET_OFFSET_IN_BUF + sizeof(netvsc_packet) + sizeof(rndis_msg) + RNDIS_VLAN_PPI_SIZE + RNDIS_TSO_PPI_SIZE + RNDIS_CSUM_PPI_SIZE, M_NETVSC, M_ZERO | M_NOWAIT); if (buf == NULL) { device_printf(dev, "hn:malloc packet failed\n"); m_freem(m_head); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOMEM); } packet = (netvsc_packet *)(buf + HV_NV_PACKET_OFFSET_IN_BUF); *(vm_offset_t *)buf = HV_NV_SC_PTR_OFFSET_IN_BUF; packet->is_data_pkt = TRUE; /* Set up the rndis header */ packet->page_buf_count = num_frags; /* Initialize it from the mbuf */ packet->tot_data_buf_len = len; /* * extension points to the area reserved for the * rndis_filter_packet, which is placed just after * the netvsc_packet (and rppi struct, if present; * length is updated later). */ packet->rndis_mesg = packet + 1; rndis_mesg = (rndis_msg *)packet->rndis_mesg; rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG; rndis_pkt = &rndis_mesg->msg.packet; rndis_pkt->data_offset = sizeof(rndis_packet); rndis_pkt->data_length = packet->tot_data_buf_len; rndis_pkt->per_pkt_info_offset = sizeof(rndis_packet); rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet); /* * If the Hyper-V infrastructure needs to embed a VLAN tag, * initialize netvsc_packet and rppi struct values as needed. */ if (m_head->m_flags & M_VLANTAG) { /* * set up some additional fields so the Hyper-V infrastructure will stuff the VLAN tag * into the frame. */ packet->vlan_tci = m_head->m_pkthdr.ether_vtag; rndis_msg_size += RNDIS_VLAN_PPI_SIZE; rppi = hv_set_rppi_data(rndis_mesg, RNDIS_VLAN_PPI_SIZE, ieee_8021q_info); /* VLAN info immediately follows rppi struct */ rppi_vlan_info = (ndis_8021q_info *)((char*)rppi + rppi->per_packet_info_offset); /* FreeBSD does not support CFI or priority */ rppi_vlan_info->u1.s1.vlan_id = packet->vlan_tci & 0xfff; } /* Only check the flags for outbound and ignore the ones for inbound */ if (0 == (m_head->m_pkthdr.csum_flags & HV_CSUM_FOR_OUTBOUND)) { goto pre_send; } eh = mtod(m_head, struct ether_vlan_header*); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { ether_len = ETHER_HDR_LEN; } trans_proto_type = get_transport_proto_type(m_head); if (TRANSPORT_TYPE_NOT_IP == trans_proto_type) { goto pre_send; } /* * TSO packet needless to setup the send side checksum * offload. */ if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { goto do_tso; } /* setup checksum offload */ rndis_msg_size += RNDIS_CSUM_PPI_SIZE; rppi = hv_set_rppi_data(rndis_mesg, RNDIS_CSUM_PPI_SIZE, tcpip_chksum_info); csum_info = (rndis_tcp_ip_csum_info *)((char*)rppi + rppi->per_packet_info_offset); if (trans_proto_type & (TYPE_IPV4 << 16)) { csum_info->xmit.is_ipv4 = 1; } else { csum_info->xmit.is_ipv6 = 1; } if (trans_proto_type & TYPE_TCP) { csum_info->xmit.tcp_csum = 1; csum_info->xmit.tcp_header_offset = 0; } else if (trans_proto_type & TYPE_UDP) { csum_info->xmit.udp_csum = 1; } goto pre_send; do_tso: /* setup TCP segmentation offload */ rndis_msg_size += RNDIS_TSO_PPI_SIZE; rppi = hv_set_rppi_data(rndis_mesg, RNDIS_TSO_PPI_SIZE, tcp_large_send_info); tso_info = (rndis_tcp_tso_info *)((char *)rppi + rppi->per_packet_info_offset); tso_info->lso_v2_xmit.type = RNDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; #ifdef INET if (trans_proto_type & (TYPE_IPV4 << 16)) { struct ip *ip = (struct ip *)(m_head->m_data + ether_len); unsigned long iph_len = ip->ip_hl << 2; struct tcphdr *th = (struct tcphdr *)((caddr_t)ip + iph_len); tso_info->lso_v2_xmit.ip_version = RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV4; ip->ip_len = 0; ip->ip_sum = 0; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { struct ip6_hdr *ip6 = (struct ip6_hdr *)(m_head->m_data + ether_len); struct tcphdr *th = (struct tcphdr *)(ip6 + 1); tso_info->lso_v2_xmit.ip_version = RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV6; ip6->ip6_plen = 0; th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); } #endif tso_info->lso_v2_xmit.tcp_header_offset = 0; tso_info->lso_v2_xmit.mss = m_head->m_pkthdr.tso_segsz; pre_send: rndis_mesg->msg_len = packet->tot_data_buf_len + rndis_msg_size; packet->tot_data_buf_len = rndis_mesg->msg_len; /* send packet with send buffer */ if (packet->tot_data_buf_len < net_dev->send_section_size) { send_buf_section_idx = hv_nv_get_next_send_section(net_dev); if (send_buf_section_idx != NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) { char *dest = ((char *)net_dev->send_buf + send_buf_section_idx * net_dev->send_section_size); memcpy(dest, rndis_mesg, rndis_msg_size); dest += rndis_msg_size; for (m = m_head; m != NULL; m = m->m_next) { if (m->m_len) { memcpy(dest, (void *)mtod(m, vm_offset_t), m->m_len); dest += m->m_len; } } packet->send_buf_section_idx = send_buf_section_idx; packet->send_buf_section_size = packet->tot_data_buf_len; packet->page_buf_count = 0; goto do_send; } } /* send packet with page buffer */ packet->page_buffers[0].pfn = atop(hv_get_phys_addr(rndis_mesg)); packet->page_buffers[0].offset = (unsigned long)rndis_mesg & PAGE_MASK; packet->page_buffers[0].length = rndis_msg_size; /* * Fill the page buffers with mbuf info starting at index * HV_RF_NUM_TX_RESERVED_PAGE_BUFS. */ i = HV_RF_NUM_TX_RESERVED_PAGE_BUFS; for (m = m_head; m != NULL; m = m->m_next) { if (m->m_len) { vm_offset_t paddr = vtophys(mtod(m, vm_offset_t)); packet->page_buffers[i].pfn = paddr >> PAGE_SHIFT; packet->page_buffers[i].offset = paddr & (PAGE_SIZE - 1); packet->page_buffers[i].length = m->m_len; i++; } } packet->send_buf_section_idx = NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; packet->send_buf_section_size = 0; do_send: /* * If bpf, copy the mbuf chain. This is less expensive than * it appears; the mbuf clusters are not copied, only their * reference counts are incremented. * Needed to avoid a race condition where the completion * callback is invoked, freeing the mbuf chain, before the * bpf_mtap code has a chance to run. */ if (ifp->if_bpf) { mc_head = m_copypacket(m_head, M_NOWAIT); } retry_send: /* Set the completion routine */ packet->compl.send.on_send_completion = netvsc_xmit_completion; packet->compl.send.send_completion_context = packet; packet->compl.send.send_completion_tid = (uint64_t)(uintptr_t)m_head; /* Removed critical_enter(), does not appear necessary */ ret = hv_nv_on_send(device_ctx, packet); if (ret == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* if bpf && mc_head, call bpf_mtap code */ if (mc_head) { ETHER_BPF_MTAP(ifp, mc_head); } } else { retries++; if (retries < 4) { goto retry_send; } IF_PREPEND(&ifp->if_snd, m_head); ifp->if_drv_flags |= IFF_DRV_OACTIVE; /* * Null the mbuf pointer so the completion function * does not free the mbuf chain. We just pushed the * mbuf chain back on the if_snd queue. */ packet->compl.send.send_completion_tid = 0; /* * Release the resources since we will not get any * send completion */ netvsc_xmit_completion(packet); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } /* if bpf && mc_head, free the mbuf chain copy */ if (mc_head) { m_freem(mc_head); } } return (ret); } /* * Link up/down notification */ void netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status) { hn_softc_t *sc = device_get_softc(device_obj->device); if (sc == NULL) { return; } if (status == 1) { sc->hn_carrier = 1; } else { sc->hn_carrier = 0; } } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. * There should be an equivalent in the kernel mbuf code, * but there does not appear to be one yet. * * Differs from m_append() in that additional mbufs are * allocated with cluster size MJUMPAGESIZE, and filled * accordingly. * * Return 1 if able to complete the job; otherwise 0. */ static int hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space; remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. */ n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); if (n == NULL) break; n->m_len = min(MJUMPAGESIZE, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len; remainder -= n->m_len; m->m_next = n; m = n; } if (m0->m_flags & M_PKTHDR) m0->m_pkthdr.len += len - remainder; return (remainder == 0); } /* * Called when we receive a data packet from the "wire" on the * specified device * * Note: This is no longer used as a callback */ int netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, rndis_tcp_ip_csum_info *csum_info) { hn_softc_t *sc = (hn_softc_t *)device_get_softc(device_ctx->device); struct mbuf *m_new; struct ifnet *ifp; device_t dev = device_ctx->device; int size, do_lro = 0; if (sc == NULL) { return (0); /* TODO: KYS how can this be! */ } ifp = sc->hn_ifp; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { return (0); } /* * Bail out if packet contains more data than configured MTU. */ if (packet->tot_data_buf_len > (ifp->if_mtu + ETHER_HDR_LEN)) { return (0); } else if (packet->tot_data_buf_len <= MHLEN) { m_new = m_gethdr(M_NOWAIT, MT_DATA); if (m_new == NULL) return (0); memcpy(mtod(m_new, void *), packet->data, packet->tot_data_buf_len); m_new->m_pkthdr.len = m_new->m_len = packet->tot_data_buf_len; sc->hn_small_pkts++; } else { /* * Get an mbuf with a cluster. For packets 2K or less, * get a standard 2K cluster. For anything larger, get a * 4K cluster. Any buffers larger than 4K can cause problems * if looped around to the Hyper-V TX channel, so avoid them. */ size = MCLBYTES; if (packet->tot_data_buf_len > MCLBYTES) { /* 4096 */ size = MJUMPAGESIZE; } m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); if (m_new == NULL) { device_printf(dev, "alloc mbuf failed.\n"); return (0); } hv_m_append(m_new, packet->tot_data_buf_len, packet->data); } m_new->m_pkthdr.rcvif = ifp; /* receive side checksum offload */ if (NULL != csum_info) { /* IP csum offload */ if (csum_info->receive.ip_csum_succeeded) { m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); sc->hn_csum_ip++; } /* TCP csum offload */ if (csum_info->receive.tcp_csum_succeeded) { m_new->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; sc->hn_csum_tcp++; } if (csum_info->receive.ip_csum_succeeded && csum_info->receive.tcp_csum_succeeded) do_lro = 1; } else { const struct ether_header *eh; uint16_t etype; int hoff; hoff = sizeof(*eh); if (m_new->m_len < hoff) goto skip; eh = mtod(m_new, struct ether_header *); etype = ntohs(eh->ether_type); if (etype == ETHERTYPE_VLAN) { const struct ether_vlan_header *evl; hoff = sizeof(*evl); if (m_new->m_len < hoff) goto skip; evl = mtod(m_new, struct ether_vlan_header *); etype = ntohs(evl->evl_proto); } if (etype == ETHERTYPE_IP) { int pr; pr = hn_check_iplen(m_new, hoff); if (pr == IPPROTO_TCP) { if (sc->hn_trust_hosttcp) { sc->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } /* Rely on SW csum verification though... */ do_lro = 1; } } } skip: if ((packet->vlan_tci != 0) && (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0) { m_new->m_pkthdr.ether_vtag = packet->vlan_tci; m_new->m_flags |= M_VLANTAG; } /* * Note: Moved RX completion back to hv_nv_on_receive() so all * messages (not just data messages) will trigger a response. */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &sc->hn_lro; if (lro->lro_cnt) { sc->hn_lro_tried++; if (tcp_lro_rx(lro, m_new, 0) == 0) { /* DONE! */ return 0; } } #endif } /* We're not holding the lock here, so don't release it */ (*ifp->if_input)(ifp, m_new); return (0); } void netvsc_recv_rollup(struct hv_device *device_ctx) { #if defined(INET) || defined(INET6) hn_softc_t *sc = device_get_softc(device_ctx->device); struct lro_ctrl *lro = &sc->hn_lro; struct lro_entry *queued; while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } #endif } /* * Rules for using sc->temp_unusable: * 1. sc->temp_unusable can only be read or written while holding NV_LOCK() * 2. code reading sc->temp_unusable under NV_LOCK(), and finding * sc->temp_unusable set, must release NV_LOCK() and exit * 3. to retain exclusive control of the interface, * sc->temp_unusable must be set by code before releasing NV_LOCK() * 4. only code setting sc->temp_unusable can clear sc->temp_unusable * 5. code setting sc->temp_unusable must eventually clear sc->temp_unusable */ /* * Standard ioctl entry point. Called when the user wants to configure * the interface. */ static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { hn_softc_t *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; #ifdef INET struct ifaddr *ifa = (struct ifaddr *)data; #endif netvsc_device_info device_info; struct hv_device *hn_dev; int mask, error = 0; int retry_cnt = 500; switch(cmd) { case SIOCSIFADDR: #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) hn_ifinit(sc); arp_ifinit(ifp, ifa); } else #endif error = ether_ioctl(ifp, cmd, data); break; case SIOCSIFMTU: hn_dev = vmbus_get_devctx(sc->hn_dev); /* Check MTU value change */ if (ifp->if_mtu == ifr->ifr_mtu) break; if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) { error = EINVAL; break; } /* Obtain and record requested MTU */ ifp->if_mtu = ifr->ifr_mtu; /* * Make sure that LRO high watermark is still valid, * after MTU change (the 2*MTU limit). */ if (!HN_LRO_HIWAT_ISVALID(sc, sc->hn_lro_hiwat)) hn_set_lro_hiwat(sc, HN_LRO_HIWAT_MTULIM(ifp)); do { NV_LOCK(sc); if (!sc->temp_unusable) { sc->temp_unusable = TRUE; retry_cnt = -1; } NV_UNLOCK(sc); if (retry_cnt > 0) { retry_cnt--; DELAY(5 * 1000); } } while (retry_cnt > 0); if (retry_cnt == 0) { error = EINVAL; break; } /* We must remove and add back the device to cause the new * MTU to take effect. This includes tearing down, but not * deleting the channel, then bringing it back up. */ error = hv_rf_on_device_remove(hn_dev, HV_RF_NV_RETAIN_CHANNEL); if (error) { NV_LOCK(sc); sc->temp_unusable = FALSE; NV_UNLOCK(sc); break; } error = hv_rf_on_device_add(hn_dev, &device_info); if (error) { NV_LOCK(sc); sc->temp_unusable = FALSE; NV_UNLOCK(sc); break; } hn_ifinit_locked(sc); NV_LOCK(sc); sc->temp_unusable = FALSE; NV_UNLOCK(sc); break; case SIOCSIFFLAGS: do { NV_LOCK(sc); if (!sc->temp_unusable) { sc->temp_unusable = TRUE; retry_cnt = -1; } NV_UNLOCK(sc); if (retry_cnt > 0) { retry_cnt--; DELAY(5 * 1000); } } while (retry_cnt > 0); if (retry_cnt == 0) { error = EINVAL; break; } if (ifp->if_flags & IFF_UP) { /* * If only the state of the PROMISC flag changed, * then just use the 'set promisc mode' command * instead of reinitializing the entire NIC. Doing * a full re-init means reloading the firmware and * waiting for it to start up, which may take a * second or two. */ #ifdef notyet /* Fixme: Promiscuous mode? */ if (ifp->if_drv_flags & IFF_DRV_RUNNING && ifp->if_flags & IFF_PROMISC && !(sc->hn_if_flags & IFF_PROMISC)) { /* do something here for Hyper-V */ } else if (ifp->if_drv_flags & IFF_DRV_RUNNING && !(ifp->if_flags & IFF_PROMISC) && sc->hn_if_flags & IFF_PROMISC) { /* do something here for Hyper-V */ } else #endif hn_ifinit_locked(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { hn_stop(sc); } } NV_LOCK(sc); sc->temp_unusable = FALSE; NV_UNLOCK(sc); sc->hn_if_flags = ifp->if_flags; error = 0; break; case SIOCSIFCAP: mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { if (IFCAP_TXCSUM & ifp->if_capenable) { ifp->if_capenable &= ~IFCAP_TXCSUM; ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP); } else { ifp->if_capenable |= IFCAP_TXCSUM; /* * Only enable UDP checksum offloading on * Windows Server 2012R2 or later releases. */ if (hv_vmbus_protocal_version >= HV_VMBUS_VERSION_WIN8_1) { ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); } else { ifp->if_hwassist |= CSUM_TCP; } } } if (mask & IFCAP_RXCSUM) { if (IFCAP_RXCSUM & ifp->if_capenable) { ifp->if_capenable &= ~IFCAP_RXCSUM; } else { ifp->if_capenable |= IFCAP_RXCSUM; } } if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; ifp->if_hwassist ^= CSUM_IP_TSO; } if (mask & IFCAP_TSO6) { ifp->if_capenable ^= IFCAP_TSO6; ifp->if_hwassist ^= CSUM_IP6_TSO; } error = 0; break; case SIOCADDMULTI: case SIOCDELMULTI: #ifdef notyet /* Fixme: Multicast mode? */ if (ifp->if_drv_flags & IFF_DRV_RUNNING) { NV_LOCK(sc); netvsc_setmulti(sc); NV_UNLOCK(sc); error = 0; } #endif error = EINVAL; break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } /* * */ static void hn_stop(hn_softc_t *sc) { struct ifnet *ifp; int ret; struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); ifp = sc->hn_ifp; if (bootverbose) printf(" Closing Device ...\n"); ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); if_link_state_change(ifp, LINK_STATE_DOWN); sc->hn_initdone = 0; ret = hv_rf_on_close(device_ctx); } /* * FreeBSD transmit entry point */ static void hn_start(struct ifnet *ifp) { hn_softc_t *sc; sc = ifp->if_softc; NV_LOCK(sc); if (sc->temp_unusable) { NV_UNLOCK(sc); return; } hn_start_locked(ifp); NV_UNLOCK(sc); } /* * */ static void hn_ifinit_locked(hn_softc_t *sc) { struct ifnet *ifp; struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); int ret; ifp = sc->hn_ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { return; } hv_promisc_mode = 1; ret = hv_rf_on_open(device_ctx); if (ret != 0) { return; } else { sc->hn_initdone = 1; } ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if_link_state_change(ifp, LINK_STATE_UP); } /* * */ static void hn_ifinit(void *xsc) { hn_softc_t *sc = xsc; NV_LOCK(sc); if (sc->temp_unusable) { NV_UNLOCK(sc); return; } sc->temp_unusable = TRUE; NV_UNLOCK(sc); hn_ifinit_locked(sc); NV_LOCK(sc); sc->temp_unusable = FALSE; NV_UNLOCK(sc); } #ifdef LATER /* * */ static void hn_watchdog(struct ifnet *ifp) { hn_softc_t *sc; sc = ifp->if_softc; printf("hn%d: watchdog timeout -- resetting\n", sc->hn_unit); hn_ifinit(sc); /*???*/ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } #endif #ifdef HN_LRO_HIWAT static int hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int hiwat, error; hiwat = sc->hn_lro_hiwat; error = sysctl_handle_int(oidp, &hiwat, 0, req); if (error || req->newptr == NULL) return error; if (!HN_LRO_HIWAT_ISVALID(sc, hiwat)) return EINVAL; if (sc->hn_lro_hiwat != hiwat) hn_set_lro_hiwat(sc, hiwat); return 0; } #endif /* HN_LRO_HIWAT */ static int hn_check_iplen(const struct mbuf *m, int hoff) { const struct ip *ip; int len, iphlen, iplen; const struct tcphdr *th; int thoff; /* TCP data offset */ len = hoff + sizeof(struct ip); /* The packet must be at least the size of an IP header. */ if (m->m_pkthdr.len < len) return IPPROTO_DONE; /* The fixed IP header must reside completely in the first mbuf. */ if (m->m_len < len) return IPPROTO_DONE; ip = mtodo(m, hoff); /* Bound check the packet's stated IP header length. */ iphlen = ip->ip_hl << 2; if (iphlen < sizeof(struct ip)) /* minimum header length */ return IPPROTO_DONE; /* The full IP header must reside completely in the one mbuf. */ if (m->m_len < hoff + iphlen) return IPPROTO_DONE; iplen = ntohs(ip->ip_len); /* * Check that the amount of data in the buffers is as * at least much as the IP header would have us expect. */ if (m->m_pkthdr.len < hoff + iplen) return IPPROTO_DONE; /* * Ignore IP fragments. */ if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) return IPPROTO_DONE; /* * The TCP/IP or UDP/IP header must be entirely contained within * the first fragment of a packet. */ switch (ip->ip_p) { case IPPROTO_TCP: if (iplen < iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); thoff = th->th_off << 2; if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + thoff) return IPPROTO_DONE; break; case IPPROTO_UDP: if (iplen < iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; break; default: if (iplen < iphlen) return IPPROTO_DONE; break; } return ip->ip_p; } static device_method_t netvsc_methods[] = { /* Device interface */ DEVMETHOD(device_probe, netvsc_probe), DEVMETHOD(device_attach, netvsc_attach), DEVMETHOD(device_detach, netvsc_detach), DEVMETHOD(device_shutdown, netvsc_shutdown), { 0, 0 } }; static driver_t netvsc_driver = { NETVSC_DEVNAME, netvsc_methods, sizeof(hn_softc_t) }; static devclass_t netvsc_devclass; DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0); MODULE_VERSION(hn, 1); MODULE_DEPEND(hn, vmbus, 1, 1, 1); Index: head/sys/dev/ixgbe/if_ix.c =================================================================== --- head/sys/dev/ixgbe/if_ix.c (revision 294326) +++ head/sys/dev/ixgbe/if_ix.c (revision 294327) @@ -1,5942 +1,5942 @@ /****************************************************************************** Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ /*$FreeBSD$*/ #ifndef IXGBE_STANDALONE_BUILD #include "opt_inet.h" #include "opt_inet6.h" #include "opt_rss.h" #endif #include "ixgbe.h" #ifdef RSS #include #include #endif /********************************************************************* * Driver version *********************************************************************/ char ixgbe_driver_version[] = "3.1.13-k"; /********************************************************************* * PCI Device ID Table * * Used by probe to select devices to load on * Last field stores an index into ixgbe_strings * Last entry must be all 0s * * { Vendor ID, Device ID, SubVendor ID, SubDevice ID, String Index } *********************************************************************/ static ixgbe_vendor_info_t ixgbe_vendor_info_array[] = { {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598AF_DUAL_PORT, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598AF_SINGLE_PORT, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598EB_CX4, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598AT, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598AT2, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598_DA_DUAL_PORT, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598_CX4_DUAL_PORT, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598EB_XF_LR, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598_SR_DUAL_PORT_EM, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82598EB_SFP_LOM, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_KX4, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_KX4_MEZZ, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_SFP, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_XAUI_LOM, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_CX4, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_T3_LOM, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_COMBO_BACKPLANE, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_BACKPLANE_FCOE, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_SFP_SF2, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_SFP_FCOE, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599EN_SFP, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_SFP_SF_QP, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_QSFP_SF_QP, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X540T, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X540T1, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X550T, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X550T1, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X550EM_X_KR, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X550EM_X_KX4, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X550EM_X_10G_T, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X550EM_X_SFP, 0, 0, 0}, /* required last entry */ {0, 0, 0, 0, 0} }; /********************************************************************* * Table of branding strings *********************************************************************/ static char *ixgbe_strings[] = { "Intel(R) PRO/10GbE PCI-Express Network Driver" }; /********************************************************************* * Function prototypes *********************************************************************/ static int ixgbe_probe(device_t); static int ixgbe_attach(device_t); static int ixgbe_detach(device_t); static int ixgbe_shutdown(device_t); static int ixgbe_suspend(device_t); static int ixgbe_resume(device_t); static int ixgbe_ioctl(struct ifnet *, u_long, caddr_t); static void ixgbe_init(void *); static void ixgbe_init_locked(struct adapter *); static void ixgbe_stop(void *); #if __FreeBSD_version >= 1100036 static uint64_t ixgbe_get_counter(struct ifnet *, ift_counter); #endif static void ixgbe_add_media_types(struct adapter *); static void ixgbe_media_status(struct ifnet *, struct ifmediareq *); static int ixgbe_media_change(struct ifnet *); static void ixgbe_identify_hardware(struct adapter *); static int ixgbe_allocate_pci_resources(struct adapter *); static void ixgbe_get_slot_info(struct adapter *); static int ixgbe_allocate_msix(struct adapter *); static int ixgbe_allocate_legacy(struct adapter *); static int ixgbe_setup_msix(struct adapter *); static void ixgbe_free_pci_resources(struct adapter *); static void ixgbe_local_timer(void *); static int ixgbe_setup_interface(device_t, struct adapter *); static void ixgbe_config_gpie(struct adapter *); static void ixgbe_config_dmac(struct adapter *); static void ixgbe_config_delay_values(struct adapter *); static void ixgbe_config_link(struct adapter *); static void ixgbe_check_wol_support(struct adapter *); static int ixgbe_setup_low_power_mode(struct adapter *); static void ixgbe_rearm_queues(struct adapter *, u64); static void ixgbe_initialize_transmit_units(struct adapter *); static void ixgbe_initialize_receive_units(struct adapter *); static void ixgbe_enable_rx_drop(struct adapter *); static void ixgbe_disable_rx_drop(struct adapter *); static void ixgbe_initialize_rss_mapping(struct adapter *); static void ixgbe_enable_intr(struct adapter *); static void ixgbe_disable_intr(struct adapter *); static void ixgbe_update_stats_counters(struct adapter *); static void ixgbe_set_promisc(struct adapter *); static void ixgbe_set_multi(struct adapter *); static void ixgbe_update_link_status(struct adapter *); static void ixgbe_set_ivar(struct adapter *, u8, u8, s8); static void ixgbe_configure_ivars(struct adapter *); static u8 * ixgbe_mc_array_itr(struct ixgbe_hw *, u8 **, u32 *); static void ixgbe_setup_vlan_hw_support(struct adapter *); static void ixgbe_register_vlan(void *, struct ifnet *, u16); static void ixgbe_unregister_vlan(void *, struct ifnet *, u16); static void ixgbe_add_device_sysctls(struct adapter *); static void ixgbe_add_hw_stats(struct adapter *); /* Sysctl handlers */ static void ixgbe_set_sysctl_value(struct adapter *, const char *, const char *, int *, int); static int ixgbe_set_flowcntl(SYSCTL_HANDLER_ARGS); static int ixgbe_set_advertise(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_thermal_test(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_dmac(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_phy_temp(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_phy_overtemp_occurred(SYSCTL_HANDLER_ARGS); #ifdef IXGBE_DEBUG static int ixgbe_sysctl_power_state(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_print_rss_config(SYSCTL_HANDLER_ARGS); #endif static int ixgbe_sysctl_wol_enable(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_wufc(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_eee_enable(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_eee_negotiated(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_eee_rx_lpi_status(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_eee_tx_lpi_status(SYSCTL_HANDLER_ARGS); static int ixgbe_sysctl_eee_tx_lpi_delay(SYSCTL_HANDLER_ARGS); /* Support for pluggable optic modules */ static bool ixgbe_sfp_probe(struct adapter *); static void ixgbe_setup_optics(struct adapter *); /* Legacy (single vector interrupt handler */ static void ixgbe_legacy_irq(void *); /* The MSI/X Interrupt handlers */ static void ixgbe_msix_que(void *); static void ixgbe_msix_link(void *); /* Deferred interrupt tasklets */ static void ixgbe_handle_que(void *, int); static void ixgbe_handle_link(void *, int); static void ixgbe_handle_msf(void *, int); static void ixgbe_handle_mod(void *, int); static void ixgbe_handle_phy(void *, int); #ifdef IXGBE_FDIR static void ixgbe_reinit_fdir(void *, int); #endif #ifdef PCI_IOV static void ixgbe_ping_all_vfs(struct adapter *); static void ixgbe_handle_mbx(void *, int); static int ixgbe_init_iov(device_t, u16, const nvlist_t *); static void ixgbe_uninit_iov(device_t); static int ixgbe_add_vf(device_t, u16, const nvlist_t *); static void ixgbe_initialize_iov(struct adapter *); static void ixgbe_recalculate_max_frame(struct adapter *); static void ixgbe_init_vf(struct adapter *, struct ixgbe_vf *); #endif /* PCI_IOV */ /********************************************************************* * FreeBSD Device Interface Entry Points *********************************************************************/ static device_method_t ix_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ixgbe_probe), DEVMETHOD(device_attach, ixgbe_attach), DEVMETHOD(device_detach, ixgbe_detach), DEVMETHOD(device_shutdown, ixgbe_shutdown), DEVMETHOD(device_suspend, ixgbe_suspend), DEVMETHOD(device_resume, ixgbe_resume), #ifdef PCI_IOV DEVMETHOD(pci_iov_init, ixgbe_init_iov), DEVMETHOD(pci_iov_uninit, ixgbe_uninit_iov), DEVMETHOD(pci_iov_add_vf, ixgbe_add_vf), #endif /* PCI_IOV */ DEVMETHOD_END }; static driver_t ix_driver = { "ix", ix_methods, sizeof(struct adapter), }; devclass_t ix_devclass; DRIVER_MODULE(ix, pci, ix_driver, ix_devclass, 0, 0); MODULE_DEPEND(ix, pci, 1, 1, 1); MODULE_DEPEND(ix, ether, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(ix, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ /* ** TUNEABLE PARAMETERS: */ static SYSCTL_NODE(_hw, OID_AUTO, ix, CTLFLAG_RD, 0, "IXGBE driver parameters"); /* ** AIM: Adaptive Interrupt Moderation ** which means that the interrupt rate ** is varied over time based on the ** traffic for that interrupt vector */ static int ixgbe_enable_aim = TRUE; SYSCTL_INT(_hw_ix, OID_AUTO, enable_aim, CTLFLAG_RWTUN, &ixgbe_enable_aim, 0, "Enable adaptive interrupt moderation"); static int ixgbe_max_interrupt_rate = (4000000 / IXGBE_LOW_LATENCY); SYSCTL_INT(_hw_ix, OID_AUTO, max_interrupt_rate, CTLFLAG_RDTUN, &ixgbe_max_interrupt_rate, 0, "Maximum interrupts per second"); /* How many packets rxeof tries to clean at a time */ static int ixgbe_rx_process_limit = 256; SYSCTL_INT(_hw_ix, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN, &ixgbe_rx_process_limit, 0, "Maximum number of received packets to process at a time," "-1 means unlimited"); /* How many packets txeof tries to clean at a time */ static int ixgbe_tx_process_limit = 256; SYSCTL_INT(_hw_ix, OID_AUTO, tx_process_limit, CTLFLAG_RDTUN, &ixgbe_tx_process_limit, 0, "Maximum number of sent packets to process at a time," "-1 means unlimited"); /* ** Smart speed setting, default to on ** this only works as a compile option ** right now as its during attach, set ** this to 'ixgbe_smart_speed_off' to ** disable. */ static int ixgbe_smart_speed = ixgbe_smart_speed_on; /* * MSIX should be the default for best performance, * but this allows it to be forced off for testing. */ static int ixgbe_enable_msix = 1; SYSCTL_INT(_hw_ix, OID_AUTO, enable_msix, CTLFLAG_RDTUN, &ixgbe_enable_msix, 0, "Enable MSI-X interrupts"); /* * Number of Queues, can be set to 0, * it then autoconfigures based on the * number of cpus with a max of 8. This * can be overriden manually here. */ static int ixgbe_num_queues = 0; SYSCTL_INT(_hw_ix, OID_AUTO, num_queues, CTLFLAG_RDTUN, &ixgbe_num_queues, 0, "Number of queues to configure, 0 indicates autoconfigure"); /* ** Number of TX descriptors per ring, ** setting higher than RX as this seems ** the better performing choice. */ static int ixgbe_txd = PERFORM_TXD; SYSCTL_INT(_hw_ix, OID_AUTO, txd, CTLFLAG_RDTUN, &ixgbe_txd, 0, "Number of transmit descriptors per queue"); /* Number of RX descriptors per ring */ static int ixgbe_rxd = PERFORM_RXD; SYSCTL_INT(_hw_ix, OID_AUTO, rxd, CTLFLAG_RDTUN, &ixgbe_rxd, 0, "Number of receive descriptors per queue"); /* ** Defining this on will allow the use ** of unsupported SFP+ modules, note that ** doing so you are on your own :) */ static int allow_unsupported_sfp = FALSE; TUNABLE_INT("hw.ix.unsupported_sfp", &allow_unsupported_sfp); /* Keep running tab on them for sanity check */ static int ixgbe_total_ports; #ifdef IXGBE_FDIR /* ** Flow Director actually 'steals' ** part of the packet buffer as its ** filter pool, this variable controls ** how much it uses: ** 0 = 64K, 1 = 128K, 2 = 256K */ static int fdir_pballoc = 1; #endif #ifdef DEV_NETMAP /* * The #ifdef DEV_NETMAP / #endif blocks in this file are meant to * be a reference on how to implement netmap support in a driver. * Additional comments are in ixgbe_netmap.h . * * contains functions for netmap support * that extend the standard driver. */ #include #endif /* DEV_NETMAP */ static MALLOC_DEFINE(M_IXGBE, "ix", "ix driver allocations"); /********************************************************************* * Device identification routine * * ixgbe_probe determines if the driver should be loaded on * adapter based on PCI vendor/device id of the adapter. * * return BUS_PROBE_DEFAULT on success, positive on failure *********************************************************************/ static int ixgbe_probe(device_t dev) { ixgbe_vendor_info_t *ent; u16 pci_vendor_id = 0; u16 pci_device_id = 0; u16 pci_subvendor_id = 0; u16 pci_subdevice_id = 0; char adapter_name[256]; INIT_DEBUGOUT("ixgbe_probe: begin"); pci_vendor_id = pci_get_vendor(dev); if (pci_vendor_id != IXGBE_INTEL_VENDOR_ID) return (ENXIO); pci_device_id = pci_get_device(dev); pci_subvendor_id = pci_get_subvendor(dev); pci_subdevice_id = pci_get_subdevice(dev); ent = ixgbe_vendor_info_array; while (ent->vendor_id != 0) { if ((pci_vendor_id == ent->vendor_id) && (pci_device_id == ent->device_id) && ((pci_subvendor_id == ent->subvendor_id) || (ent->subvendor_id == 0)) && ((pci_subdevice_id == ent->subdevice_id) || (ent->subdevice_id == 0))) { sprintf(adapter_name, "%s, Version - %s", ixgbe_strings[ent->index], ixgbe_driver_version); device_set_desc_copy(dev, adapter_name); ++ixgbe_total_ports; return (BUS_PROBE_DEFAULT); } ent++; } return (ENXIO); } /********************************************************************* * Device initialization routine * * The attach entry point is called when the driver is being loaded. * This routine identifies the type of hardware, allocates all resources * and initializes the hardware. * * return 0 on success, positive on failure *********************************************************************/ static int ixgbe_attach(device_t dev) { struct adapter *adapter; struct ixgbe_hw *hw; int error = 0; u16 csum; u32 ctrl_ext; INIT_DEBUGOUT("ixgbe_attach: begin"); /* Allocate, clear, and link in our adapter structure */ adapter = device_get_softc(dev); adapter->dev = dev; hw = &adapter->hw; #ifdef DEV_NETMAP adapter->init_locked = ixgbe_init_locked; adapter->stop_locked = ixgbe_stop; #endif /* Core Lock Init*/ IXGBE_CORE_LOCK_INIT(adapter, device_get_nameunit(dev)); /* Set up the timer callout */ callout_init_mtx(&adapter->timer, &adapter->core_mtx, 0); /* Determine hardware revision */ ixgbe_identify_hardware(adapter); /* Do base PCI setup - map BAR0 */ if (ixgbe_allocate_pci_resources(adapter)) { device_printf(dev, "Allocation of PCI resources failed\n"); error = ENXIO; goto err_out; } /* Sysctls for limiting the amount of work done in the taskqueues */ ixgbe_set_sysctl_value(adapter, "rx_processing_limit", "max number of rx packets to process", &adapter->rx_process_limit, ixgbe_rx_process_limit); ixgbe_set_sysctl_value(adapter, "tx_processing_limit", "max number of tx packets to process", &adapter->tx_process_limit, ixgbe_tx_process_limit); /* Do descriptor calc and sanity checks */ if (((ixgbe_txd * sizeof(union ixgbe_adv_tx_desc)) % DBA_ALIGN) != 0 || ixgbe_txd < MIN_TXD || ixgbe_txd > MAX_TXD) { device_printf(dev, "TXD config issue, using default!\n"); adapter->num_tx_desc = DEFAULT_TXD; } else adapter->num_tx_desc = ixgbe_txd; /* ** With many RX rings it is easy to exceed the ** system mbuf allocation. Tuning nmbclusters ** can alleviate this. */ if (nmbclusters > 0) { int s; s = (ixgbe_rxd * adapter->num_queues) * ixgbe_total_ports; if (s > nmbclusters) { device_printf(dev, "RX Descriptors exceed " "system mbuf max, using default instead!\n"); ixgbe_rxd = DEFAULT_RXD; } } if (((ixgbe_rxd * sizeof(union ixgbe_adv_rx_desc)) % DBA_ALIGN) != 0 || ixgbe_rxd < MIN_RXD || ixgbe_rxd > MAX_RXD) { device_printf(dev, "RXD config issue, using default!\n"); adapter->num_rx_desc = DEFAULT_RXD; } else adapter->num_rx_desc = ixgbe_rxd; /* Allocate our TX/RX Queues */ if (ixgbe_allocate_queues(adapter)) { error = ENOMEM; goto err_out; } /* Allocate multicast array memory. */ adapter->mta = malloc(sizeof(*adapter->mta) * MAX_NUM_MULTICAST_ADDRESSES, M_DEVBUF, M_NOWAIT); if (adapter->mta == NULL) { device_printf(dev, "Can not allocate multicast setup array\n"); error = ENOMEM; goto err_late; } /* Initialize the shared code */ hw->allow_unsupported_sfp = allow_unsupported_sfp; error = ixgbe_init_shared_code(hw); if (error == IXGBE_ERR_SFP_NOT_PRESENT) { /* ** No optics in this port, set up ** so the timer routine will probe ** for later insertion. */ adapter->sfp_probe = TRUE; error = 0; } else if (error == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev, "Unsupported SFP+ module detected!\n"); error = EIO; goto err_late; } else if (error) { device_printf(dev, "Unable to initialize the shared code\n"); error = EIO; goto err_late; } /* Make sure we have a good EEPROM before we read from it */ if (ixgbe_validate_eeprom_checksum(&adapter->hw, &csum) < 0) { device_printf(dev, "The EEPROM Checksum Is Not Valid\n"); error = EIO; goto err_late; } error = ixgbe_init_hw(hw); switch (error) { case IXGBE_ERR_EEPROM_VERSION: device_printf(dev, "This device is a pre-production adapter/" "LOM. Please be aware there may be issues associated " "with your hardware.\nIf you are experiencing problems " "please contact your Intel or hardware representative " "who provided you with this hardware.\n"); break; case IXGBE_ERR_SFP_NOT_SUPPORTED: device_printf(dev, "Unsupported SFP+ Module\n"); error = EIO; goto err_late; case IXGBE_ERR_SFP_NOT_PRESENT: device_printf(dev, "No SFP+ Module found\n"); /* falls thru */ default: break; } if ((adapter->msix > 1) && (ixgbe_enable_msix)) error = ixgbe_allocate_msix(adapter); else error = ixgbe_allocate_legacy(adapter); if (error) goto err_late; /* Setup OS specific network interface */ if (ixgbe_setup_interface(dev, adapter) != 0) goto err_late; /* Initialize statistics */ ixgbe_update_stats_counters(adapter); /* Register for VLAN events */ adapter->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, ixgbe_register_vlan, adapter, EVENTHANDLER_PRI_FIRST); adapter->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, ixgbe_unregister_vlan, adapter, EVENTHANDLER_PRI_FIRST); /* Check PCIE slot type/speed/width */ ixgbe_get_slot_info(adapter); /* Set an initial default flow control & dmac value */ adapter->fc = ixgbe_fc_full; adapter->dmac = 0; adapter->eee_enabled = 0; #ifdef PCI_IOV if ((hw->mac.type != ixgbe_mac_82598EB) && (adapter->msix > 1)) { nvlist_t *pf_schema, *vf_schema; hw->mbx.ops.init_params(hw); pf_schema = pci_iov_schema_alloc_node(); vf_schema = pci_iov_schema_alloc_node(); pci_iov_schema_add_unicast_mac(vf_schema, "mac-addr", 0, NULL); pci_iov_schema_add_bool(vf_schema, "mac-anti-spoof", IOV_SCHEMA_HASDEFAULT, TRUE); pci_iov_schema_add_bool(vf_schema, "allow-set-mac", IOV_SCHEMA_HASDEFAULT, FALSE); pci_iov_schema_add_bool(vf_schema, "allow-promisc", IOV_SCHEMA_HASDEFAULT, FALSE); error = pci_iov_attach(dev, pf_schema, vf_schema); if (error != 0) { device_printf(dev, "Error %d setting up SR-IOV\n", error); } } #endif /* PCI_IOV */ /* Check for certain supported features */ ixgbe_check_wol_support(adapter); /* Add sysctls */ ixgbe_add_device_sysctls(adapter); ixgbe_add_hw_stats(adapter); /* let hardware know driver is loaded */ ctrl_ext = IXGBE_READ_REG(hw, IXGBE_CTRL_EXT); ctrl_ext |= IXGBE_CTRL_EXT_DRV_LOAD; IXGBE_WRITE_REG(hw, IXGBE_CTRL_EXT, ctrl_ext); #ifdef DEV_NETMAP ixgbe_netmap_attach(adapter); #endif /* DEV_NETMAP */ INIT_DEBUGOUT("ixgbe_attach: end"); return (0); err_late: ixgbe_free_transmit_structures(adapter); ixgbe_free_receive_structures(adapter); err_out: if (adapter->ifp != NULL) if_free(adapter->ifp); ixgbe_free_pci_resources(adapter); free(adapter->mta, M_DEVBUF); return (error); } /********************************************************************* * Device removal routine * * The detach entry point is called when the driver is being removed. * This routine stops the adapter and deallocates all the resources * that were allocated for driver operation. * * return 0 on success, positive on failure *********************************************************************/ static int ixgbe_detach(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct ix_queue *que = adapter->queues; struct tx_ring *txr = adapter->tx_rings; u32 ctrl_ext; INIT_DEBUGOUT("ixgbe_detach: begin"); /* Make sure VLANS are not using driver */ if (adapter->ifp->if_vlantrunk != NULL) { device_printf(dev,"Vlan in use, detach first\n"); return (EBUSY); } #ifdef PCI_IOV if (pci_iov_detach(dev) != 0) { device_printf(dev, "SR-IOV in use; detach first.\n"); return (EBUSY); } #endif /* PCI_IOV */ ether_ifdetach(adapter->ifp); /* Stop the adapter */ IXGBE_CORE_LOCK(adapter); ixgbe_setup_low_power_mode(adapter); IXGBE_CORE_UNLOCK(adapter); for (int i = 0; i < adapter->num_queues; i++, que++, txr++) { if (que->tq) { #ifndef IXGBE_LEGACY_TX taskqueue_drain(que->tq, &txr->txq_task); #endif taskqueue_drain(que->tq, &que->que_task); taskqueue_free(que->tq); } } /* Drain the Link queue */ if (adapter->tq) { taskqueue_drain(adapter->tq, &adapter->link_task); taskqueue_drain(adapter->tq, &adapter->mod_task); taskqueue_drain(adapter->tq, &adapter->msf_task); #ifdef PCI_IOV taskqueue_drain(adapter->tq, &adapter->mbx_task); #endif taskqueue_drain(adapter->tq, &adapter->phy_task); #ifdef IXGBE_FDIR taskqueue_drain(adapter->tq, &adapter->fdir_task); #endif taskqueue_free(adapter->tq); } /* let hardware know driver is unloading */ ctrl_ext = IXGBE_READ_REG(&adapter->hw, IXGBE_CTRL_EXT); ctrl_ext &= ~IXGBE_CTRL_EXT_DRV_LOAD; IXGBE_WRITE_REG(&adapter->hw, IXGBE_CTRL_EXT, ctrl_ext); /* Unregister VLAN events */ if (adapter->vlan_attach != NULL) EVENTHANDLER_DEREGISTER(vlan_config, adapter->vlan_attach); if (adapter->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, adapter->vlan_detach); callout_drain(&adapter->timer); #ifdef DEV_NETMAP netmap_detach(adapter->ifp); #endif /* DEV_NETMAP */ ixgbe_free_pci_resources(adapter); bus_generic_detach(dev); if_free(adapter->ifp); ixgbe_free_transmit_structures(adapter); ixgbe_free_receive_structures(adapter); free(adapter->mta, M_DEVBUF); IXGBE_CORE_LOCK_DESTROY(adapter); return (0); } /********************************************************************* * * Shutdown entry point * **********************************************************************/ static int ixgbe_shutdown(device_t dev) { struct adapter *adapter = device_get_softc(dev); int error = 0; INIT_DEBUGOUT("ixgbe_shutdown: begin"); IXGBE_CORE_LOCK(adapter); error = ixgbe_setup_low_power_mode(adapter); IXGBE_CORE_UNLOCK(adapter); return (error); } /** * Methods for going from: * D0 -> D3: ixgbe_suspend * D3 -> D0: ixgbe_resume */ static int ixgbe_suspend(device_t dev) { struct adapter *adapter = device_get_softc(dev); int error = 0; INIT_DEBUGOUT("ixgbe_suspend: begin"); IXGBE_CORE_LOCK(adapter); error = ixgbe_setup_low_power_mode(adapter); IXGBE_CORE_UNLOCK(adapter); return (error); } static int ixgbe_resume(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct ifnet *ifp = adapter->ifp; struct ixgbe_hw *hw = &adapter->hw; u32 wus; INIT_DEBUGOUT("ixgbe_resume: begin"); IXGBE_CORE_LOCK(adapter); /* Read & clear WUS register */ wus = IXGBE_READ_REG(hw, IXGBE_WUS); if (wus) device_printf(dev, "Woken up by (WUS): %#010x\n", IXGBE_READ_REG(hw, IXGBE_WUS)); IXGBE_WRITE_REG(hw, IXGBE_WUS, 0xffffffff); /* And clear WUFC until next low-power transition */ IXGBE_WRITE_REG(hw, IXGBE_WUFC, 0); /* * Required after D3->D0 transition; * will re-advertise all previous advertised speeds */ if (ifp->if_flags & IFF_UP) ixgbe_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); return (0); } /********************************************************************* * Ioctl entry point * * ixgbe_ioctl is called when the user wants to configure the * interface. * * return 0 on success, positive on failure **********************************************************************/ static int ixgbe_ioctl(struct ifnet * ifp, u_long command, caddr_t data) { struct adapter *adapter = ifp->if_softc; struct ifreq *ifr = (struct ifreq *) data; #if defined(INET) || defined(INET6) struct ifaddr *ifa = (struct ifaddr *)data; #endif int error = 0; bool avoid_reset = FALSE; switch (command) { case SIOCSIFADDR: #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) avoid_reset = TRUE; #endif #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) avoid_reset = TRUE; #endif /* ** Calling init results in link renegotiation, ** so we avoid doing it when possible. */ if (avoid_reset) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) ixgbe_init(adapter); #ifdef INET if (!(ifp->if_flags & IFF_NOARP)) arp_ifinit(ifp, ifa); #endif } else error = ether_ioctl(ifp, command, data); break; case SIOCSIFMTU: IOCTL_DEBUGOUT("ioctl: SIOCSIFMTU (Set Interface MTU)"); if (ifr->ifr_mtu > IXGBE_MAX_MTU) { error = EINVAL; } else { IXGBE_CORE_LOCK(adapter); ifp->if_mtu = ifr->ifr_mtu; adapter->max_frame_size = ifp->if_mtu + IXGBE_MTU_HDR; ixgbe_init_locked(adapter); #ifdef PCI_IOV ixgbe_recalculate_max_frame(adapter); #endif IXGBE_CORE_UNLOCK(adapter); } break; case SIOCSIFFLAGS: IOCTL_DEBUGOUT("ioctl: SIOCSIFFLAGS (Set Interface Flags)"); IXGBE_CORE_LOCK(adapter); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { if ((ifp->if_flags ^ adapter->if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) { ixgbe_set_promisc(adapter); } } else ixgbe_init_locked(adapter); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) ixgbe_stop(adapter); adapter->if_flags = ifp->if_flags; IXGBE_CORE_UNLOCK(adapter); break; case SIOCADDMULTI: case SIOCDELMULTI: IOCTL_DEBUGOUT("ioctl: SIOC(ADD|DEL)MULTI"); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IXGBE_CORE_LOCK(adapter); ixgbe_disable_intr(adapter); ixgbe_set_multi(adapter); ixgbe_enable_intr(adapter); IXGBE_CORE_UNLOCK(adapter); } break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: IOCTL_DEBUGOUT("ioctl: SIOCxIFMEDIA (Get/Set Interface Media)"); error = ifmedia_ioctl(ifp, ifr, &adapter->media, command); break; case SIOCSIFCAP: { IOCTL_DEBUGOUT("ioctl: SIOCSIFCAP (Set Capabilities)"); int mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (!mask) break; /* HW cannot turn these on/off separately */ if (mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) { ifp->if_capenable ^= IFCAP_RXCSUM; ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; } if (mask & IFCAP_TXCSUM) ifp->if_capenable ^= IFCAP_TXCSUM; if (mask & IFCAP_TXCSUM_IPV6) ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; if (mask & IFCAP_TSO4) ifp->if_capenable ^= IFCAP_TSO4; if (mask & IFCAP_TSO6) ifp->if_capenable ^= IFCAP_TSO6; if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_VLAN_HWFILTER) ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IXGBE_CORE_LOCK(adapter); ixgbe_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); } VLAN_CAPABILITIES(ifp); break; } #if __FreeBSD_version >= 1100036 case SIOCGI2C: { struct ixgbe_hw *hw = &adapter->hw; struct ifi2creq i2c; int i; IOCTL_DEBUGOUT("ioctl: SIOCGI2C (Get I2C Data)"); error = copyin(ifr->ifr_data, &i2c, sizeof(i2c)); if (error != 0) break; if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) { error = EINVAL; break; } if (i2c.len > sizeof(i2c.data)) { error = EINVAL; break; } for (i = 0; i < i2c.len; i++) hw->phy.ops.read_i2c_byte(hw, i2c.offset + i, i2c.dev_addr, &i2c.data[i]); error = copyout(&i2c, ifr->ifr_data, sizeof(i2c)); break; } #endif default: IOCTL_DEBUGOUT1("ioctl: UNKNOWN (0x%X)\n", (int)command); error = ether_ioctl(ifp, command, data); break; } return (error); } /* * Set the various hardware offload abilities. * * This takes the ifnet's if_capenable flags (e.g. set by the user using * ifconfig) and indicates to the OS via the ifnet's if_hwassist field what * mbuf offload flags the driver will understand. */ static void ixgbe_set_if_hwassist(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; ifp->if_hwassist = 0; #if __FreeBSD_version >= 1000000 if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_IP_TSO; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= CSUM_IP6_TSO; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= (CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_SCTP); if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= (CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_SCTP); #else if (ifp->if_capenable & IFCAP_TSO) ifp->if_hwassist |= CSUM_TSO; if (ifp->if_capenable & IFCAP_TXCSUM) { ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); struct ixgbe_hw *hw = &adapter->hw; if (hw->mac.type != ixgbe_mac_82598EB) ifp->if_hwassist |= CSUM_SCTP; } #endif } /********************************************************************* * Init entry point * * This routine is used in two ways. It is used by the stack as * init entry point in network interface structure. It is also used * by the driver as a hw/sw initialization routine to get to a * consistent state. * * return 0 on success, positive on failure **********************************************************************/ #define IXGBE_MHADD_MFS_SHIFT 16 static void ixgbe_init_locked(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; struct ixgbe_hw *hw = &adapter->hw; struct tx_ring *txr; struct rx_ring *rxr; u32 txdctl, mhadd; u32 rxdctl, rxctrl; int err = 0; #ifdef PCI_IOV enum ixgbe_iov_mode mode; #endif mtx_assert(&adapter->core_mtx, MA_OWNED); INIT_DEBUGOUT("ixgbe_init_locked: begin"); hw->adapter_stopped = FALSE; ixgbe_stop_adapter(hw); callout_stop(&adapter->timer); #ifdef PCI_IOV mode = ixgbe_get_iov_mode(adapter); adapter->pool = ixgbe_max_vfs(mode); /* Queue indices may change with IOV mode */ for (int i = 0; i < adapter->num_queues; i++) { adapter->rx_rings[i].me = ixgbe_pf_que_index(mode, i); adapter->tx_rings[i].me = ixgbe_pf_que_index(mode, i); } #endif /* reprogram the RAR[0] in case user changed it. */ ixgbe_set_rar(hw, 0, hw->mac.addr, adapter->pool, IXGBE_RAH_AV); /* Get the latest mac address, User can use a LAA */ bcopy(IF_LLADDR(ifp), hw->mac.addr, IXGBE_ETH_LENGTH_OF_ADDRESS); ixgbe_set_rar(hw, 0, hw->mac.addr, adapter->pool, 1); hw->addr_ctrl.rar_used_count = 1; /* Set hardware offload abilities from ifnet flags */ ixgbe_set_if_hwassist(adapter); /* Prepare transmit descriptors and buffers */ if (ixgbe_setup_transmit_structures(adapter)) { device_printf(dev, "Could not setup transmit structures\n"); ixgbe_stop(adapter); return; } ixgbe_init_hw(hw); #ifdef PCI_IOV ixgbe_initialize_iov(adapter); #endif ixgbe_initialize_transmit_units(adapter); /* Setup Multicast table */ ixgbe_set_multi(adapter); /* Determine the correct mbuf pool, based on frame size */ if (adapter->max_frame_size <= MCLBYTES) adapter->rx_mbuf_sz = MCLBYTES; else adapter->rx_mbuf_sz = MJUMPAGESIZE; /* Prepare receive descriptors and buffers */ if (ixgbe_setup_receive_structures(adapter)) { device_printf(dev, "Could not setup receive structures\n"); ixgbe_stop(adapter); return; } /* Configure RX settings */ ixgbe_initialize_receive_units(adapter); /* Enable SDP & MSIX interrupts based on adapter */ ixgbe_config_gpie(adapter); /* Set MTU size */ if (ifp->if_mtu > ETHERMTU) { /* aka IXGBE_MAXFRS on 82599 and newer */ mhadd = IXGBE_READ_REG(hw, IXGBE_MHADD); mhadd &= ~IXGBE_MHADD_MFS_MASK; mhadd |= adapter->max_frame_size << IXGBE_MHADD_MFS_SHIFT; IXGBE_WRITE_REG(hw, IXGBE_MHADD, mhadd); } /* Now enable all the queues */ for (int i = 0; i < adapter->num_queues; i++) { txr = &adapter->tx_rings[i]; txdctl = IXGBE_READ_REG(hw, IXGBE_TXDCTL(txr->me)); txdctl |= IXGBE_TXDCTL_ENABLE; /* Set WTHRESH to 8, burst writeback */ txdctl |= (8 << 16); /* * When the internal queue falls below PTHRESH (32), * start prefetching as long as there are at least * HTHRESH (1) buffers ready. The values are taken * from the Intel linux driver 3.8.21. * Prefetching enables tx line rate even with 1 queue. */ txdctl |= (32 << 0) | (1 << 8); IXGBE_WRITE_REG(hw, IXGBE_TXDCTL(txr->me), txdctl); } for (int i = 0, j = 0; i < adapter->num_queues; i++) { rxr = &adapter->rx_rings[i]; rxdctl = IXGBE_READ_REG(hw, IXGBE_RXDCTL(rxr->me)); if (hw->mac.type == ixgbe_mac_82598EB) { /* ** PTHRESH = 21 ** HTHRESH = 4 ** WTHRESH = 8 */ rxdctl &= ~0x3FFFFF; rxdctl |= 0x080420; } rxdctl |= IXGBE_RXDCTL_ENABLE; IXGBE_WRITE_REG(hw, IXGBE_RXDCTL(rxr->me), rxdctl); for (; j < 10; j++) { if (IXGBE_READ_REG(hw, IXGBE_RXDCTL(rxr->me)) & IXGBE_RXDCTL_ENABLE) break; else msec_delay(1); } wmb(); #ifdef DEV_NETMAP /* * In netmap mode, we must preserve the buffers made * available to userspace before the if_init() * (this is true by default on the TX side, because * init makes all buffers available to userspace). * * netmap_reset() and the device specific routines * (e.g. ixgbe_setup_receive_rings()) map these * buffers at the end of the NIC ring, so here we * must set the RDT (tail) register to make sure * they are not overwritten. * * In this driver the NIC ring starts at RDH = 0, * RDT points to the last slot available for reception (?), * so RDT = num_rx_desc - 1 means the whole ring is available. */ if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(adapter->ifp); struct netmap_kring *kring = &na->rx_rings[i]; int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring); IXGBE_WRITE_REG(hw, IXGBE_RDT(rxr->me), t); } else #endif /* DEV_NETMAP */ IXGBE_WRITE_REG(hw, IXGBE_RDT(rxr->me), adapter->num_rx_desc - 1); } /* Enable Receive engine */ rxctrl = IXGBE_READ_REG(hw, IXGBE_RXCTRL); if (hw->mac.type == ixgbe_mac_82598EB) rxctrl |= IXGBE_RXCTRL_DMBYPS; rxctrl |= IXGBE_RXCTRL_RXEN; ixgbe_enable_rx_dma(hw, rxctrl); callout_reset(&adapter->timer, hz, ixgbe_local_timer, adapter); /* Set up MSI/X routing */ if (ixgbe_enable_msix) { ixgbe_configure_ivars(adapter); /* Set up auto-mask */ if (hw->mac.type == ixgbe_mac_82598EB) IXGBE_WRITE_REG(hw, IXGBE_EIAM, IXGBE_EICS_RTX_QUEUE); else { IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(0), 0xFFFFFFFF); IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(1), 0xFFFFFFFF); } } else { /* Simple settings for Legacy/MSI */ ixgbe_set_ivar(adapter, 0, 0, 0); ixgbe_set_ivar(adapter, 0, 0, 1); IXGBE_WRITE_REG(hw, IXGBE_EIAM, IXGBE_EICS_RTX_QUEUE); } #ifdef IXGBE_FDIR /* Init Flow director */ if (hw->mac.type != ixgbe_mac_82598EB) { u32 hdrm = 32 << fdir_pballoc; hw->mac.ops.setup_rxpba(hw, 0, hdrm, PBA_STRATEGY_EQUAL); ixgbe_init_fdir_signature_82599(&adapter->hw, fdir_pballoc); } #endif /* * Check on any SFP devices that * need to be kick-started */ if (hw->phy.type == ixgbe_phy_none) { err = hw->phy.ops.identify(hw); if (err == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev, "Unsupported SFP+ module type was detected.\n"); return; } } /* Set moderation on the Link interrupt */ IXGBE_WRITE_REG(hw, IXGBE_EITR(adapter->vector), IXGBE_LINK_ITR); /* Configure Energy Efficient Ethernet for supported devices */ if (hw->mac.ops.setup_eee) { err = hw->mac.ops.setup_eee(hw, adapter->eee_enabled); if (err) device_printf(dev, "Error setting up EEE: %d\n", err); } /* Config/Enable Link */ ixgbe_config_link(adapter); /* Hardware Packet Buffer & Flow Control setup */ ixgbe_config_delay_values(adapter); /* Initialize the FC settings */ ixgbe_start_hw(hw); /* Set up VLAN support and filter */ ixgbe_setup_vlan_hw_support(adapter); /* Setup DMA Coalescing */ ixgbe_config_dmac(adapter); /* And now turn on interrupts */ ixgbe_enable_intr(adapter); #ifdef PCI_IOV /* Enable the use of the MBX by the VF's */ { u32 reg = IXGBE_READ_REG(hw, IXGBE_CTRL_EXT); reg |= IXGBE_CTRL_EXT_PFRSTD; IXGBE_WRITE_REG(hw, IXGBE_CTRL_EXT, reg); } #endif /* Now inform the stack we're ready */ ifp->if_drv_flags |= IFF_DRV_RUNNING; return; } static void ixgbe_init(void *arg) { struct adapter *adapter = arg; IXGBE_CORE_LOCK(adapter); ixgbe_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); return; } static void ixgbe_config_gpie(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; u32 gpie; gpie = IXGBE_READ_REG(hw, IXGBE_GPIE); /* Fan Failure Interrupt */ if (hw->device_id == IXGBE_DEV_ID_82598AT) gpie |= IXGBE_SDP1_GPIEN; /* * Module detection (SDP2) * Media ready (SDP1) */ if (hw->mac.type == ixgbe_mac_82599EB) { gpie |= IXGBE_SDP2_GPIEN; if (hw->device_id != IXGBE_DEV_ID_82599_QSFP_SF_QP) gpie |= IXGBE_SDP1_GPIEN; } /* * Thermal Failure Detection (X540) * Link Detection (X552 SFP+, X552/X557-AT) */ if (hw->mac.type == ixgbe_mac_X540 || hw->device_id == IXGBE_DEV_ID_X550EM_X_SFP || hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T) gpie |= IXGBE_SDP0_GPIEN_X540; if (adapter->msix > 1) { /* Enable Enhanced MSIX mode */ gpie |= IXGBE_GPIE_MSIX_MODE; gpie |= IXGBE_GPIE_EIAME | IXGBE_GPIE_PBA_SUPPORT | IXGBE_GPIE_OCD; } IXGBE_WRITE_REG(hw, IXGBE_GPIE, gpie); return; } /* * Requires adapter->max_frame_size to be set. */ static void ixgbe_config_delay_values(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; u32 rxpb, frame, size, tmp; frame = adapter->max_frame_size; /* Calculate High Water */ switch (hw->mac.type) { case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: tmp = IXGBE_DV_X540(frame, frame); break; default: tmp = IXGBE_DV(frame, frame); break; } size = IXGBE_BT2KB(tmp); rxpb = IXGBE_READ_REG(hw, IXGBE_RXPBSIZE(0)) >> 10; hw->fc.high_water[0] = rxpb - size; /* Now calculate Low Water */ switch (hw->mac.type) { case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: tmp = IXGBE_LOW_DV_X540(frame); break; default: tmp = IXGBE_LOW_DV(frame); break; } hw->fc.low_water[0] = IXGBE_BT2KB(tmp); hw->fc.requested_mode = adapter->fc; hw->fc.pause_time = IXGBE_FC_PAUSE; hw->fc.send_xon = TRUE; } /* ** ** MSIX Interrupt Handlers and Tasklets ** */ static inline void ixgbe_enable_queue(struct adapter *adapter, u32 vector) { struct ixgbe_hw *hw = &adapter->hw; u64 queue = (u64)(1 << vector); u32 mask; if (hw->mac.type == ixgbe_mac_82598EB) { mask = (IXGBE_EIMS_RTX_QUEUE & queue); IXGBE_WRITE_REG(hw, IXGBE_EIMS, mask); } else { mask = (queue & 0xFFFFFFFF); if (mask) IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask); mask = (queue >> 32); if (mask) IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask); } } static inline void ixgbe_disable_queue(struct adapter *adapter, u32 vector) { struct ixgbe_hw *hw = &adapter->hw; u64 queue = (u64)(1 << vector); u32 mask; if (hw->mac.type == ixgbe_mac_82598EB) { mask = (IXGBE_EIMS_RTX_QUEUE & queue); IXGBE_WRITE_REG(hw, IXGBE_EIMC, mask); } else { mask = (queue & 0xFFFFFFFF); if (mask) IXGBE_WRITE_REG(hw, IXGBE_EIMC_EX(0), mask); mask = (queue >> 32); if (mask) IXGBE_WRITE_REG(hw, IXGBE_EIMC_EX(1), mask); } } static void ixgbe_handle_que(void *context, int pending) { struct ix_queue *que = context; struct adapter *adapter = que->adapter; struct tx_ring *txr = que->txr; struct ifnet *ifp = adapter->ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ixgbe_rxeof(que); IXGBE_TX_LOCK(txr); ixgbe_txeof(txr); #ifndef IXGBE_LEGACY_TX if (!drbr_empty(ifp, txr->br)) ixgbe_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) ixgbe_start_locked(txr, ifp); #endif IXGBE_TX_UNLOCK(txr); } /* Reenable this interrupt */ if (que->res != NULL) ixgbe_enable_queue(adapter, que->msix); else ixgbe_enable_intr(adapter); return; } /********************************************************************* * * Legacy Interrupt Service routine * **********************************************************************/ static void ixgbe_legacy_irq(void *arg) { struct ix_queue *que = arg; struct adapter *adapter = que->adapter; struct ixgbe_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; bool more; u32 reg_eicr; reg_eicr = IXGBE_READ_REG(hw, IXGBE_EICR); ++que->irqs; if (reg_eicr == 0) { ixgbe_enable_intr(adapter); return; } more = ixgbe_rxeof(que); IXGBE_TX_LOCK(txr); ixgbe_txeof(txr); #ifdef IXGBE_LEGACY_TX if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) ixgbe_start_locked(txr, ifp); #else if (!drbr_empty(ifp, txr->br)) ixgbe_mq_start_locked(ifp, txr); #endif IXGBE_TX_UNLOCK(txr); /* Check for fan failure */ if ((hw->device_id == IXGBE_DEV_ID_82598AT) && (reg_eicr & IXGBE_EICR_GPI_SDP1)) { device_printf(adapter->dev, "\nCRITICAL: FAN FAILURE!! " "REPLACE IMMEDIATELY!!\n"); IXGBE_WRITE_REG(hw, IXGBE_EIMS, IXGBE_EICR_GPI_SDP1_BY_MAC(hw)); } /* Link status change */ if (reg_eicr & IXGBE_EICR_LSC) taskqueue_enqueue(adapter->tq, &adapter->link_task); /* External PHY interrupt */ if (hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T && (reg_eicr & IXGBE_EICR_GPI_SDP0_X540)) taskqueue_enqueue(adapter->tq, &adapter->phy_task); if (more) taskqueue_enqueue(que->tq, &que->que_task); else ixgbe_enable_intr(adapter); return; } /********************************************************************* * * MSIX Queue Interrupt Service routine * **********************************************************************/ void ixgbe_msix_que(void *arg) { struct ix_queue *que = arg; struct adapter *adapter = que->adapter; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = que->txr; struct rx_ring *rxr = que->rxr; bool more; u32 newitr = 0; /* Protect against spurious interrupts */ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; ixgbe_disable_queue(adapter, que->msix); ++que->irqs; more = ixgbe_rxeof(que); IXGBE_TX_LOCK(txr); ixgbe_txeof(txr); #ifdef IXGBE_LEGACY_TX if (!IFQ_DRV_IS_EMPTY(ifp->if_snd)) ixgbe_start_locked(txr, ifp); #else if (!drbr_empty(ifp, txr->br)) ixgbe_mq_start_locked(ifp, txr); #endif IXGBE_TX_UNLOCK(txr); /* Do AIM now? */ if (ixgbe_enable_aim == FALSE) goto no_calc; /* ** Do Adaptive Interrupt Moderation: ** - Write out last calculated setting ** - Calculate based on average size over ** the last interval. */ if (que->eitr_setting) IXGBE_WRITE_REG(&adapter->hw, IXGBE_EITR(que->msix), que->eitr_setting); que->eitr_setting = 0; /* Idle, do nothing */ if ((txr->bytes == 0) && (rxr->bytes == 0)) goto no_calc; if ((txr->bytes) && (txr->packets)) newitr = txr->bytes/txr->packets; if ((rxr->bytes) && (rxr->packets)) newitr = max(newitr, (rxr->bytes / rxr->packets)); newitr += 24; /* account for hardware frame, crc */ /* set an upper boundary */ newitr = min(newitr, 3000); /* Be nice to the mid range */ if ((newitr > 300) && (newitr < 1200)) newitr = (newitr / 3); else newitr = (newitr / 2); if (adapter->hw.mac.type == ixgbe_mac_82598EB) newitr |= newitr << 16; else newitr |= IXGBE_EITR_CNT_WDIS; /* save for next interrupt */ que->eitr_setting = newitr; /* Reset state */ txr->bytes = 0; txr->packets = 0; rxr->bytes = 0; rxr->packets = 0; no_calc: if (more) taskqueue_enqueue(que->tq, &que->que_task); else ixgbe_enable_queue(adapter, que->msix); return; } static void ixgbe_msix_link(void *arg) { struct adapter *adapter = arg; struct ixgbe_hw *hw = &adapter->hw; u32 reg_eicr, mod_mask; ++adapter->link_irq; /* Pause other interrupts */ IXGBE_WRITE_REG(hw, IXGBE_EIMC, IXGBE_EIMC_OTHER); /* First get the cause */ reg_eicr = IXGBE_READ_REG(hw, IXGBE_EICS); /* Be sure the queue bits are not cleared */ reg_eicr &= ~IXGBE_EICR_RTX_QUEUE; /* Clear interrupt with write */ IXGBE_WRITE_REG(hw, IXGBE_EICR, reg_eicr); /* Link status change */ if (reg_eicr & IXGBE_EICR_LSC) { IXGBE_WRITE_REG(hw, IXGBE_EIMC, IXGBE_EIMC_LSC); taskqueue_enqueue(adapter->tq, &adapter->link_task); } if (adapter->hw.mac.type != ixgbe_mac_82598EB) { #ifdef IXGBE_FDIR if (reg_eicr & IXGBE_EICR_FLOW_DIR) { /* This is probably overkill :) */ if (!atomic_cmpset_int(&adapter->fdir_reinit, 0, 1)) return; /* Disable the interrupt */ IXGBE_WRITE_REG(hw, IXGBE_EIMC, IXGBE_EICR_FLOW_DIR); taskqueue_enqueue(adapter->tq, &adapter->fdir_task); } else #endif if (reg_eicr & IXGBE_EICR_ECC) { device_printf(adapter->dev, "CRITICAL: ECC ERROR!! " "Please Reboot!!\n"); IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_ECC); } /* Check for over temp condition */ if (reg_eicr & IXGBE_EICR_TS) { device_printf(adapter->dev, "CRITICAL: OVER TEMP!! " "PHY IS SHUT DOWN!!\n"); device_printf(adapter->dev, "System shutdown required!\n"); IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_TS); } #ifdef PCI_IOV if (reg_eicr & IXGBE_EICR_MAILBOX) taskqueue_enqueue(adapter->tq, &adapter->mbx_task); #endif } /* Pluggable optics-related interrupt */ if (hw->device_id == IXGBE_DEV_ID_X550EM_X_SFP) mod_mask = IXGBE_EICR_GPI_SDP0_X540; else mod_mask = IXGBE_EICR_GPI_SDP2_BY_MAC(hw); if (ixgbe_is_sfp(hw)) { if (reg_eicr & IXGBE_EICR_GPI_SDP1_BY_MAC(hw)) { IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP1_BY_MAC(hw)); taskqueue_enqueue(adapter->tq, &adapter->msf_task); } else if (reg_eicr & mod_mask) { IXGBE_WRITE_REG(hw, IXGBE_EICR, mod_mask); taskqueue_enqueue(adapter->tq, &adapter->mod_task); } } /* Check for fan failure */ if ((hw->device_id == IXGBE_DEV_ID_82598AT) && (reg_eicr & IXGBE_EICR_GPI_SDP1)) { IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP1); device_printf(adapter->dev, "\nCRITICAL: FAN FAILURE!! " "REPLACE IMMEDIATELY!!\n"); } /* External PHY interrupt */ if (hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T && (reg_eicr & IXGBE_EICR_GPI_SDP0_X540)) { IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP0_X540); taskqueue_enqueue(adapter->tq, &adapter->phy_task); } /* Re-enable other interrupts */ IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMS, IXGBE_EIMS_OTHER); return; } /********************************************************************* * * Media Ioctl callback * * This routine is called whenever the user queries the status of * the interface using ifconfig. * **********************************************************************/ static void ixgbe_media_status(struct ifnet * ifp, struct ifmediareq * ifmr) { struct adapter *adapter = ifp->if_softc; struct ixgbe_hw *hw = &adapter->hw; int layer; INIT_DEBUGOUT("ixgbe_media_status: begin"); IXGBE_CORE_LOCK(adapter); ixgbe_update_link_status(adapter); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (!adapter->link_active) { IXGBE_CORE_UNLOCK(adapter); return; } ifmr->ifm_status |= IFM_ACTIVE; layer = adapter->phy_layer; if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_T || layer & IXGBE_PHYSICAL_LAYER_1000BASE_T || layer & IXGBE_PHYSICAL_LAYER_100BASE_TX) switch (adapter->link_speed) { case IXGBE_LINK_SPEED_10GB_FULL: ifmr->ifm_active |= IFM_10G_T | IFM_FDX; break; case IXGBE_LINK_SPEED_1GB_FULL: ifmr->ifm_active |= IFM_1000_T | IFM_FDX; break; case IXGBE_LINK_SPEED_100_FULL: ifmr->ifm_active |= IFM_100_TX | IFM_FDX; break; } if (layer & IXGBE_PHYSICAL_LAYER_SFP_PLUS_CU || layer & IXGBE_PHYSICAL_LAYER_SFP_ACTIVE_DA) switch (adapter->link_speed) { case IXGBE_LINK_SPEED_10GB_FULL: ifmr->ifm_active |= IFM_10G_TWINAX | IFM_FDX; break; } if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_LR) switch (adapter->link_speed) { case IXGBE_LINK_SPEED_10GB_FULL: ifmr->ifm_active |= IFM_10G_LR | IFM_FDX; break; case IXGBE_LINK_SPEED_1GB_FULL: ifmr->ifm_active |= IFM_1000_LX | IFM_FDX; break; } if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_LRM) switch (adapter->link_speed) { case IXGBE_LINK_SPEED_10GB_FULL: ifmr->ifm_active |= IFM_10G_LRM | IFM_FDX; break; case IXGBE_LINK_SPEED_1GB_FULL: ifmr->ifm_active |= IFM_1000_LX | IFM_FDX; break; } if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_SR || layer & IXGBE_PHYSICAL_LAYER_1000BASE_SX) switch (adapter->link_speed) { case IXGBE_LINK_SPEED_10GB_FULL: ifmr->ifm_active |= IFM_10G_SR | IFM_FDX; break; case IXGBE_LINK_SPEED_1GB_FULL: ifmr->ifm_active |= IFM_1000_SX | IFM_FDX; break; } if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_CX4) switch (adapter->link_speed) { case IXGBE_LINK_SPEED_10GB_FULL: ifmr->ifm_active |= IFM_10G_CX4 | IFM_FDX; break; } /* ** XXX: These need to use the proper media types once ** they're added. */ #ifndef IFM_ETH_XTYPE if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_KR) switch (adapter->link_speed) { case IXGBE_LINK_SPEED_10GB_FULL: ifmr->ifm_active |= IFM_10G_SR | IFM_FDX; break; case IXGBE_LINK_SPEED_2_5GB_FULL: ifmr->ifm_active |= IFM_2500_SX | IFM_FDX; break; case IXGBE_LINK_SPEED_1GB_FULL: ifmr->ifm_active |= IFM_1000_CX | IFM_FDX; break; } else if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_KX4 || layer & IXGBE_PHYSICAL_LAYER_1000BASE_KX) switch (adapter->link_speed) { case IXGBE_LINK_SPEED_10GB_FULL: ifmr->ifm_active |= IFM_10G_CX4 | IFM_FDX; break; case IXGBE_LINK_SPEED_2_5GB_FULL: ifmr->ifm_active |= IFM_2500_SX | IFM_FDX; break; case IXGBE_LINK_SPEED_1GB_FULL: ifmr->ifm_active |= IFM_1000_CX | IFM_FDX; break; } #else if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_KR) switch (adapter->link_speed) { case IXGBE_LINK_SPEED_10GB_FULL: ifmr->ifm_active |= IFM_10G_KR | IFM_FDX; break; case IXGBE_LINK_SPEED_2_5GB_FULL: ifmr->ifm_active |= IFM_2500_KX | IFM_FDX; break; case IXGBE_LINK_SPEED_1GB_FULL: ifmr->ifm_active |= IFM_1000_KX | IFM_FDX; break; } else if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_KX4 || layer & IXGBE_PHYSICAL_LAYER_1000BASE_KX) switch (adapter->link_speed) { case IXGBE_LINK_SPEED_10GB_FULL: ifmr->ifm_active |= IFM_10G_KX4 | IFM_FDX; break; case IXGBE_LINK_SPEED_2_5GB_FULL: ifmr->ifm_active |= IFM_2500_KX | IFM_FDX; break; case IXGBE_LINK_SPEED_1GB_FULL: ifmr->ifm_active |= IFM_1000_KX | IFM_FDX; break; } #endif /* If nothing is recognized... */ if (IFM_SUBTYPE(ifmr->ifm_active) == 0) ifmr->ifm_active |= IFM_UNKNOWN; #if __FreeBSD_version >= 900025 /* Display current flow control setting used on link */ if (hw->fc.current_mode == ixgbe_fc_rx_pause || hw->fc.current_mode == ixgbe_fc_full) ifmr->ifm_active |= IFM_ETH_RXPAUSE; if (hw->fc.current_mode == ixgbe_fc_tx_pause || hw->fc.current_mode == ixgbe_fc_full) ifmr->ifm_active |= IFM_ETH_TXPAUSE; #endif IXGBE_CORE_UNLOCK(adapter); return; } /********************************************************************* * * Media Ioctl callback * * This routine is called when the user changes speed/duplex using * media/mediopt option with ifconfig. * **********************************************************************/ static int ixgbe_media_change(struct ifnet * ifp) { struct adapter *adapter = ifp->if_softc; struct ifmedia *ifm = &adapter->media; struct ixgbe_hw *hw = &adapter->hw; ixgbe_link_speed speed = 0; INIT_DEBUGOUT("ixgbe_media_change: begin"); if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); if (hw->phy.media_type == ixgbe_media_type_backplane) return (ENODEV); /* ** We don't actually need to check against the supported ** media types of the adapter; ifmedia will take care of ** that for us. */ #ifndef IFM_ETH_XTYPE switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: case IFM_10G_T: speed |= IXGBE_LINK_SPEED_100_FULL; case IFM_10G_LRM: case IFM_10G_SR: /* KR, too */ case IFM_10G_LR: case IFM_10G_CX4: /* KX4 */ speed |= IXGBE_LINK_SPEED_1GB_FULL; case IFM_10G_TWINAX: speed |= IXGBE_LINK_SPEED_10GB_FULL; break; case IFM_1000_T: speed |= IXGBE_LINK_SPEED_100_FULL; case IFM_1000_LX: case IFM_1000_SX: case IFM_1000_CX: /* KX */ speed |= IXGBE_LINK_SPEED_1GB_FULL; break; case IFM_100_TX: speed |= IXGBE_LINK_SPEED_100_FULL; break; default: goto invalid; } #else switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: case IFM_10G_T: speed |= IXGBE_LINK_SPEED_100_FULL; case IFM_10G_LRM: case IFM_10G_KR: case IFM_10G_LR: case IFM_10G_KX4: speed |= IXGBE_LINK_SPEED_1GB_FULL; case IFM_10G_TWINAX: speed |= IXGBE_LINK_SPEED_10GB_FULL; break; case IFM_1000_T: speed |= IXGBE_LINK_SPEED_100_FULL; case IFM_1000_LX: case IFM_1000_SX: case IFM_1000_KX: speed |= IXGBE_LINK_SPEED_1GB_FULL; break; case IFM_100_TX: speed |= IXGBE_LINK_SPEED_100_FULL; break; default: goto invalid; } #endif hw->mac.autotry_restart = TRUE; hw->mac.ops.setup_link(hw, speed, TRUE); adapter->advertise = ((speed & IXGBE_LINK_SPEED_10GB_FULL) << 2) | ((speed & IXGBE_LINK_SPEED_1GB_FULL) << 1) | ((speed & IXGBE_LINK_SPEED_100_FULL) << 0); return (0); invalid: device_printf(adapter->dev, "Invalid media type!\n"); return (EINVAL); } static void ixgbe_set_promisc(struct adapter *adapter) { u_int32_t reg_rctl; struct ifnet *ifp = adapter->ifp; int mcnt = 0; reg_rctl = IXGBE_READ_REG(&adapter->hw, IXGBE_FCTRL); reg_rctl &= (~IXGBE_FCTRL_UPE); if (ifp->if_flags & IFF_ALLMULTI) mcnt = MAX_NUM_MULTICAST_ADDRESSES; else { struct ifmultiaddr *ifma; #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif } if (mcnt < MAX_NUM_MULTICAST_ADDRESSES) reg_rctl &= (~IXGBE_FCTRL_MPE); IXGBE_WRITE_REG(&adapter->hw, IXGBE_FCTRL, reg_rctl); if (ifp->if_flags & IFF_PROMISC) { reg_rctl |= (IXGBE_FCTRL_UPE | IXGBE_FCTRL_MPE); IXGBE_WRITE_REG(&adapter->hw, IXGBE_FCTRL, reg_rctl); } else if (ifp->if_flags & IFF_ALLMULTI) { reg_rctl |= IXGBE_FCTRL_MPE; reg_rctl &= ~IXGBE_FCTRL_UPE; IXGBE_WRITE_REG(&adapter->hw, IXGBE_FCTRL, reg_rctl); } return; } /********************************************************************* * Multicast Update * * This routine is called whenever multicast address list is updated. * **********************************************************************/ #define IXGBE_RAR_ENTRIES 16 static void ixgbe_set_multi(struct adapter *adapter) { u32 fctrl; u8 *update_ptr; struct ifmultiaddr *ifma; struct ixgbe_mc_addr *mta; int mcnt = 0; struct ifnet *ifp = adapter->ifp; IOCTL_DEBUGOUT("ixgbe_set_multi: begin"); mta = adapter->mta; bzero(mta, sizeof(*mta) * MAX_NUM_MULTICAST_ADDRESSES); #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == MAX_NUM_MULTICAST_ADDRESSES) break; bcopy(LLADDR((struct sockaddr_dl *) ifma->ifma_addr), mta[mcnt].addr, IXGBE_ETH_LENGTH_OF_ADDRESS); mta[mcnt].vmdq = adapter->pool; mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif fctrl = IXGBE_READ_REG(&adapter->hw, IXGBE_FCTRL); fctrl |= (IXGBE_FCTRL_UPE | IXGBE_FCTRL_MPE); if (ifp->if_flags & IFF_PROMISC) fctrl |= (IXGBE_FCTRL_UPE | IXGBE_FCTRL_MPE); else if (mcnt >= MAX_NUM_MULTICAST_ADDRESSES || ifp->if_flags & IFF_ALLMULTI) { fctrl |= IXGBE_FCTRL_MPE; fctrl &= ~IXGBE_FCTRL_UPE; } else fctrl &= ~(IXGBE_FCTRL_UPE | IXGBE_FCTRL_MPE); IXGBE_WRITE_REG(&adapter->hw, IXGBE_FCTRL, fctrl); if (mcnt < MAX_NUM_MULTICAST_ADDRESSES) { update_ptr = (u8 *)mta; ixgbe_update_mc_addr_list(&adapter->hw, update_ptr, mcnt, ixgbe_mc_array_itr, TRUE); } return; } /* * This is an iterator function now needed by the multicast * shared code. It simply feeds the shared code routine the * addresses in the array of ixgbe_set_multi() one by one. */ static u8 * ixgbe_mc_array_itr(struct ixgbe_hw *hw, u8 **update_ptr, u32 *vmdq) { struct ixgbe_mc_addr *mta; mta = (struct ixgbe_mc_addr *)*update_ptr; *vmdq = mta->vmdq; *update_ptr = (u8*)(mta + 1);; return (mta->addr); } /********************************************************************* * Timer routine * * This routine checks for link status,updates statistics, * and runs the watchdog check. * **********************************************************************/ static void ixgbe_local_timer(void *arg) { struct adapter *adapter = arg; device_t dev = adapter->dev; struct ix_queue *que = adapter->queues; u64 queues = 0; int hung = 0; mtx_assert(&adapter->core_mtx, MA_OWNED); /* Check for pluggable optics */ if (adapter->sfp_probe) if (!ixgbe_sfp_probe(adapter)) goto out; /* Nothing to do */ ixgbe_update_link_status(adapter); ixgbe_update_stats_counters(adapter); /* ** Check the TX queues status ** - mark hung queues so we don't schedule on them ** - watchdog only if all queues show hung */ for (int i = 0; i < adapter->num_queues; i++, que++) { /* Keep track of queues with work for soft irq */ if (que->txr->busy) queues |= ((u64)1 << que->me); /* ** Each time txeof runs without cleaning, but there ** are uncleaned descriptors it increments busy. If ** we get to the MAX we declare it hung. */ if (que->busy == IXGBE_QUEUE_HUNG) { ++hung; /* Mark the queue as inactive */ adapter->active_queues &= ~((u64)1 << que->me); continue; } else { /* Check if we've come back from hung */ if ((adapter->active_queues & ((u64)1 << que->me)) == 0) adapter->active_queues |= ((u64)1 << que->me); } if (que->busy >= IXGBE_MAX_TX_BUSY) { device_printf(dev,"Warning queue %d " "appears to be hung!\n", i); que->txr->busy = IXGBE_QUEUE_HUNG; ++hung; } } /* Only truly watchdog if all queues show hung */ if (hung == adapter->num_queues) goto watchdog; else if (queues != 0) { /* Force an IRQ on queues with work */ ixgbe_rearm_queues(adapter, queues); } out: callout_reset(&adapter->timer, hz, ixgbe_local_timer, adapter); return; watchdog: device_printf(adapter->dev, "Watchdog timeout -- resetting\n"); adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; adapter->watchdog_events++; ixgbe_init_locked(adapter); } /* ** Note: this routine updates the OS on the link state ** the real check of the hardware only happens with ** a link interrupt. */ static void ixgbe_update_link_status(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; if (adapter->link_up){ if (adapter->link_active == FALSE) { if (bootverbose) device_printf(dev,"Link is up %d Gbps %s \n", ((adapter->link_speed == 128)? 10:1), "Full Duplex"); adapter->link_active = TRUE; /* Update any Flow Control changes */ ixgbe_fc_enable(&adapter->hw); /* Update DMA coalescing config */ ixgbe_config_dmac(adapter); if_link_state_change(ifp, LINK_STATE_UP); #ifdef PCI_IOV ixgbe_ping_all_vfs(adapter); #endif } } else { /* Link down */ if (adapter->link_active == TRUE) { if (bootverbose) device_printf(dev,"Link is Down\n"); if_link_state_change(ifp, LINK_STATE_DOWN); adapter->link_active = FALSE; #ifdef PCI_IOV ixgbe_ping_all_vfs(adapter); #endif } } return; } /********************************************************************* * * This routine disables all traffic on the adapter by issuing a * global reset on the MAC and deallocates TX/RX buffers. * **********************************************************************/ static void ixgbe_stop(void *arg) { struct ifnet *ifp; struct adapter *adapter = arg; struct ixgbe_hw *hw = &adapter->hw; ifp = adapter->ifp; mtx_assert(&adapter->core_mtx, MA_OWNED); INIT_DEBUGOUT("ixgbe_stop: begin\n"); ixgbe_disable_intr(adapter); callout_stop(&adapter->timer); /* Let the stack know...*/ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; ixgbe_reset_hw(hw); hw->adapter_stopped = FALSE; ixgbe_stop_adapter(hw); if (hw->mac.type == ixgbe_mac_82599EB) ixgbe_stop_mac_link_on_d3_82599(hw); /* Turn off the laser - noop with no optics */ ixgbe_disable_tx_laser(hw); /* Update the stack */ adapter->link_up = FALSE; ixgbe_update_link_status(adapter); /* reprogram the RAR[0] in case user changed it. */ ixgbe_set_rar(&adapter->hw, 0, adapter->hw.mac.addr, 0, IXGBE_RAH_AV); return; } /********************************************************************* * * Determine hardware revision. * **********************************************************************/ static void ixgbe_identify_hardware(struct adapter *adapter) { device_t dev = adapter->dev; struct ixgbe_hw *hw = &adapter->hw; /* Save off the information about this board */ hw->vendor_id = pci_get_vendor(dev); hw->device_id = pci_get_device(dev); hw->revision_id = pci_read_config(dev, PCIR_REVID, 1); hw->subsystem_vendor_id = pci_read_config(dev, PCIR_SUBVEND_0, 2); hw->subsystem_device_id = pci_read_config(dev, PCIR_SUBDEV_0, 2); /* ** Make sure BUSMASTER is set */ pci_enable_busmaster(dev); /* We need this here to set the num_segs below */ ixgbe_set_mac_type(hw); /* Pick up the 82599 settings */ if (hw->mac.type != ixgbe_mac_82598EB) { hw->phy.smart_speed = ixgbe_smart_speed; adapter->num_segs = IXGBE_82599_SCATTER; } else adapter->num_segs = IXGBE_82598_SCATTER; return; } /********************************************************************* * * Determine optic type * **********************************************************************/ static void ixgbe_setup_optics(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; int layer; layer = adapter->phy_layer = ixgbe_get_supported_physical_layer(hw); if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_T) { adapter->optics = IFM_10G_T; return; } if (layer & IXGBE_PHYSICAL_LAYER_1000BASE_T) { adapter->optics = IFM_1000_T; return; } if (layer & IXGBE_PHYSICAL_LAYER_1000BASE_SX) { adapter->optics = IFM_1000_SX; return; } if (layer & (IXGBE_PHYSICAL_LAYER_10GBASE_LR | IXGBE_PHYSICAL_LAYER_10GBASE_LRM)) { adapter->optics = IFM_10G_LR; return; } if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_SR) { adapter->optics = IFM_10G_SR; return; } if (layer & IXGBE_PHYSICAL_LAYER_SFP_PLUS_CU) { adapter->optics = IFM_10G_TWINAX; return; } if (layer & (IXGBE_PHYSICAL_LAYER_10GBASE_KX4 | IXGBE_PHYSICAL_LAYER_10GBASE_CX4)) { adapter->optics = IFM_10G_CX4; return; } /* If we get here just set the default */ adapter->optics = IFM_ETHER | IFM_AUTO; return; } /********************************************************************* * * Setup the Legacy or MSI Interrupt handler * **********************************************************************/ static int ixgbe_allocate_legacy(struct adapter *adapter) { device_t dev = adapter->dev; struct ix_queue *que = adapter->queues; #ifndef IXGBE_LEGACY_TX struct tx_ring *txr = adapter->tx_rings; #endif int error, rid = 0; /* MSI RID at 1 */ if (adapter->msix == 1) rid = 1; /* We allocate a single interrupt resource */ adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (adapter->res == NULL) { device_printf(dev, "Unable to allocate bus resource: " "interrupt\n"); return (ENXIO); } /* * Try allocating a fast interrupt and the associated deferred * processing contexts. */ #ifndef IXGBE_LEGACY_TX TASK_INIT(&txr->txq_task, 0, ixgbe_deferred_mq_start, txr); #endif TASK_INIT(&que->que_task, 0, ixgbe_handle_que, que); que->tq = taskqueue_create_fast("ixgbe_que", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); taskqueue_start_threads(&que->tq, 1, PI_NET, "%s ixq", device_get_nameunit(adapter->dev)); /* Tasklets for Link, SFP and Multispeed Fiber */ TASK_INIT(&adapter->link_task, 0, ixgbe_handle_link, adapter); TASK_INIT(&adapter->mod_task, 0, ixgbe_handle_mod, adapter); TASK_INIT(&adapter->msf_task, 0, ixgbe_handle_msf, adapter); TASK_INIT(&adapter->phy_task, 0, ixgbe_handle_phy, adapter); #ifdef IXGBE_FDIR TASK_INIT(&adapter->fdir_task, 0, ixgbe_reinit_fdir, adapter); #endif adapter->tq = taskqueue_create_fast("ixgbe_link", M_NOWAIT, taskqueue_thread_enqueue, &adapter->tq); taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s linkq", device_get_nameunit(adapter->dev)); if ((error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, ixgbe_legacy_irq, que, &adapter->tag)) != 0) { device_printf(dev, "Failed to register fast interrupt " "handler: %d\n", error); taskqueue_free(que->tq); taskqueue_free(adapter->tq); que->tq = NULL; adapter->tq = NULL; return (error); } /* For simplicity in the handlers */ adapter->active_queues = IXGBE_EIMS_ENABLE_MASK; return (0); } /********************************************************************* * * Setup MSIX Interrupt resources and handlers * **********************************************************************/ static int ixgbe_allocate_msix(struct adapter *adapter) { device_t dev = adapter->dev; struct ix_queue *que = adapter->queues; struct tx_ring *txr = adapter->tx_rings; int error, rid, vector = 0; int cpu_id = 0; #ifdef RSS cpuset_t cpu_mask; #endif #ifdef RSS /* * If we're doing RSS, the number of queues needs to * match the number of RSS buckets that are configured. * * + If there's more queues than RSS buckets, we'll end * up with queues that get no traffic. * * + If there's more RSS buckets than queues, we'll end * up having multiple RSS buckets map to the same queue, * so there'll be some contention. */ if (adapter->num_queues != rss_getnumbuckets()) { device_printf(dev, "%s: number of queues (%d) != number of RSS buckets (%d)" "; performance will be impacted.\n", __func__, adapter->num_queues, rss_getnumbuckets()); } #endif for (int i = 0; i < adapter->num_queues; i++, vector++, que++, txr++) { rid = vector + 1; que->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (que->res == NULL) { device_printf(dev,"Unable to allocate" " bus resource: que interrupt [%d]\n", vector); return (ENXIO); } /* Set the handler function */ error = bus_setup_intr(dev, que->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, ixgbe_msix_que, que, &que->tag); if (error) { que->res = NULL; device_printf(dev, "Failed to register QUE handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, que->res, que->tag, "q%d", i); #endif que->msix = vector; adapter->active_queues |= (u64)(1 << que->msix); #ifdef RSS /* * The queue ID is used as the RSS layer bucket ID. * We look up the queue ID -> RSS CPU ID and select * that. */ cpu_id = rss_getcpu(i % rss_getnumbuckets()); #else /* * Bind the msix vector, and thus the * rings to the corresponding cpu. * * This just happens to match the default RSS round-robin * bucket -> queue -> CPU allocation. */ if (adapter->num_queues > 1) cpu_id = i; #endif if (adapter->num_queues > 1) bus_bind_intr(dev, que->res, cpu_id); #ifdef IXGBE_DEBUG #ifdef RSS device_printf(dev, "Bound RSS bucket %d to CPU %d\n", i, cpu_id); #else device_printf(dev, "Bound queue %d to cpu %d\n", i, cpu_id); #endif #endif /* IXGBE_DEBUG */ #ifndef IXGBE_LEGACY_TX TASK_INIT(&txr->txq_task, 0, ixgbe_deferred_mq_start, txr); #endif TASK_INIT(&que->que_task, 0, ixgbe_handle_que, que); que->tq = taskqueue_create_fast("ixgbe_que", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); #ifdef RSS CPU_SETOF(cpu_id, &cpu_mask); taskqueue_start_threads_cpuset(&que->tq, 1, PI_NET, &cpu_mask, "%s (bucket %d)", device_get_nameunit(adapter->dev), cpu_id); #else taskqueue_start_threads(&que->tq, 1, PI_NET, "%s:q%d", device_get_nameunit(adapter->dev), i); #endif } /* and Link */ rid = vector + 1; adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (!adapter->res) { device_printf(dev,"Unable to allocate" " bus resource: Link interrupt [%d]\n", rid); return (ENXIO); } /* Set the link handler function */ error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, ixgbe_msix_link, adapter, &adapter->tag); if (error) { adapter->res = NULL; device_printf(dev, "Failed to register LINK handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, adapter->res, adapter->tag, "link"); #endif adapter->vector = vector; /* Tasklets for Link, SFP and Multispeed Fiber */ TASK_INIT(&adapter->link_task, 0, ixgbe_handle_link, adapter); TASK_INIT(&adapter->mod_task, 0, ixgbe_handle_mod, adapter); TASK_INIT(&adapter->msf_task, 0, ixgbe_handle_msf, adapter); #ifdef PCI_IOV TASK_INIT(&adapter->mbx_task, 0, ixgbe_handle_mbx, adapter); #endif TASK_INIT(&adapter->phy_task, 0, ixgbe_handle_phy, adapter); #ifdef IXGBE_FDIR TASK_INIT(&adapter->fdir_task, 0, ixgbe_reinit_fdir, adapter); #endif adapter->tq = taskqueue_create_fast("ixgbe_link", M_NOWAIT, taskqueue_thread_enqueue, &adapter->tq); taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s linkq", device_get_nameunit(adapter->dev)); return (0); } /* * Setup Either MSI/X or MSI */ static int ixgbe_setup_msix(struct adapter *adapter) { device_t dev = adapter->dev; int rid, want, queues, msgs; /* Override by tuneable */ if (ixgbe_enable_msix == 0) goto msi; /* First try MSI/X */ msgs = pci_msix_count(dev); if (msgs == 0) goto msi; rid = PCIR_BAR(MSIX_82598_BAR); adapter->msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (adapter->msix_mem == NULL) { rid += 4; /* 82599 maps in higher BAR */ adapter->msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); } if (adapter->msix_mem == NULL) { /* May not be enabled */ device_printf(adapter->dev, "Unable to map MSIX table \n"); goto msi; } /* Figure out a reasonable auto config value */ queues = (mp_ncpus > (msgs - 1)) ? (msgs - 1) : mp_ncpus; #ifdef RSS /* If we're doing RSS, clamp at the number of RSS buckets */ if (queues > rss_getnumbuckets()) queues = rss_getnumbuckets(); #endif if (ixgbe_num_queues != 0) queues = ixgbe_num_queues; /* Set max queues to 8 when autoconfiguring */ else if ((ixgbe_num_queues == 0) && (queues > 8)) queues = 8; /* reflect correct sysctl value */ ixgbe_num_queues = queues; /* ** Want one vector (RX/TX pair) per queue ** plus an additional for Link. */ want = queues + 1; if (msgs >= want) msgs = want; else { device_printf(adapter->dev, "MSIX Configuration Problem, " "%d vectors but %d queues wanted!\n", msgs, want); goto msi; } if ((pci_alloc_msix(dev, &msgs) == 0) && (msgs == want)) { device_printf(adapter->dev, "Using MSIX interrupts with %d vectors\n", msgs); adapter->num_queues = queues; return (msgs); } /* ** If MSIX alloc failed or provided us with ** less than needed, free and fall through to MSI */ pci_release_msi(dev); msi: if (adapter->msix_mem != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, rid, adapter->msix_mem); adapter->msix_mem = NULL; } msgs = 1; if (pci_alloc_msi(dev, &msgs) == 0) { device_printf(adapter->dev, "Using an MSI interrupt\n"); return (msgs); } device_printf(adapter->dev, "Using a Legacy interrupt\n"); return (0); } static int ixgbe_allocate_pci_resources(struct adapter *adapter) { int rid; device_t dev = adapter->dev; rid = PCIR_BAR(0); adapter->pci_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (!(adapter->pci_mem)) { device_printf(dev, "Unable to allocate bus resource: memory\n"); return (ENXIO); } /* Save bus_space values for READ/WRITE_REG macros */ adapter->osdep.mem_bus_space_tag = rman_get_bustag(adapter->pci_mem); adapter->osdep.mem_bus_space_handle = rman_get_bushandle(adapter->pci_mem); /* Set hw values for shared code */ adapter->hw.hw_addr = (u8 *) &adapter->osdep.mem_bus_space_handle; adapter->hw.back = adapter; /* Default to 1 queue if MSI-X setup fails */ adapter->num_queues = 1; /* ** Now setup MSI or MSI-X, should ** return us the number of supported ** vectors. (Will be 1 for MSI) */ adapter->msix = ixgbe_setup_msix(adapter); return (0); } static void ixgbe_free_pci_resources(struct adapter * adapter) { struct ix_queue *que = adapter->queues; device_t dev = adapter->dev; int rid, memrid; if (adapter->hw.mac.type == ixgbe_mac_82598EB) memrid = PCIR_BAR(MSIX_82598_BAR); else memrid = PCIR_BAR(MSIX_82599_BAR); /* ** There is a slight possibility of a failure mode ** in attach that will result in entering this function ** before interrupt resources have been initialized, and ** in that case we do not want to execute the loops below ** We can detect this reliably by the state of the adapter ** res pointer. */ if (adapter->res == NULL) goto mem; /* ** Release all msix queue resources: */ for (int i = 0; i < adapter->num_queues; i++, que++) { rid = que->msix + 1; if (que->tag != NULL) { bus_teardown_intr(dev, que->res, que->tag); que->tag = NULL; } if (que->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, que->res); } /* Clean the Legacy or Link interrupt last */ if (adapter->vector) /* we are doing MSIX */ rid = adapter->vector + 1; else (adapter->msix != 0) ? (rid = 1):(rid = 0); if (adapter->tag != NULL) { bus_teardown_intr(dev, adapter->res, adapter->tag); adapter->tag = NULL; } if (adapter->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, adapter->res); mem: if (adapter->msix) pci_release_msi(dev); if (adapter->msix_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, memrid, adapter->msix_mem); if (adapter->pci_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(0), adapter->pci_mem); return; } /********************************************************************* * * Setup networking device structure and register an interface. * **********************************************************************/ static int ixgbe_setup_interface(device_t dev, struct adapter *adapter) { struct ifnet *ifp; INIT_DEBUGOUT("ixgbe_setup_interface: begin"); ifp = adapter->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not allocate ifnet structure\n"); return (-1); } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_baudrate = IF_Gbps(10); ifp->if_init = ixgbe_init; ifp->if_softc = adapter; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = ixgbe_ioctl; #if __FreeBSD_version >= 1100036 if_setgetcounterfn(ifp, ixgbe_get_counter); #endif #if __FreeBSD_version >= 1100045 /* TSO parameters */ ifp->if_hw_tsomax = 65518; ifp->if_hw_tsomaxsegcount = IXGBE_82599_SCATTER; ifp->if_hw_tsomaxsegsize = 2048; #endif #ifndef IXGBE_LEGACY_TX ifp->if_transmit = ixgbe_mq_start; ifp->if_qflush = ixgbe_qflush; #else ifp->if_start = ixgbe_start; IFQ_SET_MAXLEN(&ifp->if_snd, adapter->num_tx_desc - 2); ifp->if_snd.ifq_drv_maxlen = adapter->num_tx_desc - 2; IFQ_SET_READY(&ifp->if_snd); #endif ether_ifattach(ifp, adapter->hw.mac.addr); adapter->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; /* * Tell the upper layer(s) we support long frames. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); /* Set capability flags */ ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6 | IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_LRO | IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWCSUM | IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU | IFCAP_HWSTATS; /* Enable the above capabilities by default */ ifp->if_capenable = ifp->if_capabilities; /* ** Don't turn this on by default, if vlans are ** created on another pseudo device (eg. lagg) ** then vlan events are not passed thru, breaking ** operation, but with HW FILTER off it works. If ** using vlans directly on the ixgbe driver you can ** enable this and get full hardware tag filtering. */ ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; /* * Specify the media types supported by this adapter and register * callbacks to update media and link information */ ifmedia_init(&adapter->media, IFM_IMASK, ixgbe_media_change, ixgbe_media_status); adapter->phy_layer = ixgbe_get_supported_physical_layer(&adapter->hw); ixgbe_add_media_types(adapter); /* Set autoselect media by default */ ifmedia_set(&adapter->media, IFM_ETHER | IFM_AUTO); return (0); } static void ixgbe_add_media_types(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; device_t dev = adapter->dev; int layer; layer = adapter->phy_layer; /* Media types with matching FreeBSD media defines */ if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_T) ifmedia_add(&adapter->media, IFM_ETHER | IFM_10G_T, 0, NULL); if (layer & IXGBE_PHYSICAL_LAYER_1000BASE_T) ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T, 0, NULL); if (layer & IXGBE_PHYSICAL_LAYER_100BASE_TX) ifmedia_add(&adapter->media, IFM_ETHER | IFM_100_TX, 0, NULL); if (layer & IXGBE_PHYSICAL_LAYER_SFP_PLUS_CU || layer & IXGBE_PHYSICAL_LAYER_SFP_ACTIVE_DA) ifmedia_add(&adapter->media, IFM_ETHER | IFM_10G_TWINAX, 0, NULL); if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_LR) { ifmedia_add(&adapter->media, IFM_ETHER | IFM_10G_LR, 0, NULL); if (hw->phy.multispeed_fiber) ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_LX, 0, NULL); } if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_SR) { ifmedia_add(&adapter->media, IFM_ETHER | IFM_10G_SR, 0, NULL); if (hw->phy.multispeed_fiber) ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_SX, 0, NULL); } else if (layer & IXGBE_PHYSICAL_LAYER_1000BASE_SX) ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_SX, 0, NULL); if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_CX4) ifmedia_add(&adapter->media, IFM_ETHER | IFM_10G_CX4, 0, NULL); #ifdef IFM_ETH_XTYPE if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_KR) ifmedia_add(&adapter->media, IFM_ETHER | IFM_10G_KR, 0, NULL); if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_KX4) ifmedia_add(&adapter->media, IFM_ETHER | IFM_10G_KX4, 0, NULL); if (layer & IXGBE_PHYSICAL_LAYER_1000BASE_KX) ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_KX, 0, NULL); #else if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_KR) { device_printf(dev, "Media supported: 10GbaseKR\n"); device_printf(dev, "10GbaseKR mapped to 10GbaseSR\n"); ifmedia_add(&adapter->media, IFM_ETHER | IFM_10G_SR, 0, NULL); } if (layer & IXGBE_PHYSICAL_LAYER_10GBASE_KX4) { device_printf(dev, "Media supported: 10GbaseKX4\n"); device_printf(dev, "10GbaseKX4 mapped to 10GbaseCX4\n"); ifmedia_add(&adapter->media, IFM_ETHER | IFM_10G_CX4, 0, NULL); } if (layer & IXGBE_PHYSICAL_LAYER_1000BASE_KX) { device_printf(dev, "Media supported: 1000baseKX\n"); device_printf(dev, "1000baseKX mapped to 1000baseCX\n"); ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_CX, 0, NULL); } #endif if (layer & IXGBE_PHYSICAL_LAYER_1000BASE_BX) device_printf(dev, "Media supported: 1000baseBX\n"); if (hw->device_id == IXGBE_DEV_ID_82598AT) { ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL); ifmedia_add(&adapter->media, IFM_ETHER | IFM_1000_T, 0, NULL); } ifmedia_add(&adapter->media, IFM_ETHER | IFM_AUTO, 0, NULL); } static void ixgbe_config_link(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; u32 autoneg, err = 0; bool sfp, negotiate; sfp = ixgbe_is_sfp(hw); if (sfp) { taskqueue_enqueue(adapter->tq, &adapter->mod_task); } else { if (hw->mac.ops.check_link) err = ixgbe_check_link(hw, &adapter->link_speed, &adapter->link_up, FALSE); if (err) goto out; autoneg = hw->phy.autoneg_advertised; if ((!autoneg) && (hw->mac.ops.get_link_capabilities)) err = hw->mac.ops.get_link_capabilities(hw, &autoneg, &negotiate); if (err) goto out; if (hw->mac.ops.setup_link) err = hw->mac.ops.setup_link(hw, autoneg, adapter->link_up); } out: return; } /********************************************************************* * * Enable transmit units. * **********************************************************************/ static void ixgbe_initialize_transmit_units(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; struct ixgbe_hw *hw = &adapter->hw; /* Setup the Base and Length of the Tx Descriptor Ring */ for (int i = 0; i < adapter->num_queues; i++, txr++) { u64 tdba = txr->txdma.dma_paddr; u32 txctrl = 0; int j = txr->me; IXGBE_WRITE_REG(hw, IXGBE_TDBAL(j), (tdba & 0x00000000ffffffffULL)); IXGBE_WRITE_REG(hw, IXGBE_TDBAH(j), (tdba >> 32)); IXGBE_WRITE_REG(hw, IXGBE_TDLEN(j), adapter->num_tx_desc * sizeof(union ixgbe_adv_tx_desc)); /* Setup the HW Tx Head and Tail descriptor pointers */ IXGBE_WRITE_REG(hw, IXGBE_TDH(j), 0); IXGBE_WRITE_REG(hw, IXGBE_TDT(j), 0); /* Cache the tail address */ txr->tail = IXGBE_TDT(j); /* Disable Head Writeback */ /* * Note: for X550 series devices, these registers are actually * prefixed with TPH_ isntead of DCA_, but the addresses and * fields remain the same. */ switch (hw->mac.type) { case ixgbe_mac_82598EB: txctrl = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(j)); break; default: txctrl = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(j)); break; } txctrl &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN; switch (hw->mac.type) { case ixgbe_mac_82598EB: IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(j), txctrl); break; default: IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(j), txctrl); break; } } if (hw->mac.type != ixgbe_mac_82598EB) { u32 dmatxctl, rttdcs; #ifdef PCI_IOV enum ixgbe_iov_mode mode = ixgbe_get_iov_mode(adapter); #endif dmatxctl = IXGBE_READ_REG(hw, IXGBE_DMATXCTL); dmatxctl |= IXGBE_DMATXCTL_TE; IXGBE_WRITE_REG(hw, IXGBE_DMATXCTL, dmatxctl); /* Disable arbiter to set MTQC */ rttdcs = IXGBE_READ_REG(hw, IXGBE_RTTDCS); rttdcs |= IXGBE_RTTDCS_ARBDIS; IXGBE_WRITE_REG(hw, IXGBE_RTTDCS, rttdcs); #ifdef PCI_IOV IXGBE_WRITE_REG(hw, IXGBE_MTQC, ixgbe_get_mtqc(mode)); #else IXGBE_WRITE_REG(hw, IXGBE_MTQC, IXGBE_MTQC_64Q_1PB); #endif rttdcs &= ~IXGBE_RTTDCS_ARBDIS; IXGBE_WRITE_REG(hw, IXGBE_RTTDCS, rttdcs); } return; } static void ixgbe_initialize_rss_mapping(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; u32 reta = 0, mrqc, rss_key[10]; int queue_id, table_size, index_mult; #ifdef RSS u32 rss_hash_config; #endif #ifdef PCI_IOV enum ixgbe_iov_mode mode; #endif #ifdef RSS /* Fetch the configured RSS key */ rss_getkey((uint8_t *) &rss_key); #else /* set up random bits */ arc4rand(&rss_key, sizeof(rss_key), 0); #endif /* Set multiplier for RETA setup and table size based on MAC */ index_mult = 0x1; table_size = 128; switch (adapter->hw.mac.type) { case ixgbe_mac_82598EB: index_mult = 0x11; break; case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: table_size = 512; break; default: break; } /* Set up the redirection table */ for (int i = 0, j = 0; i < table_size; i++, j++) { if (j == adapter->num_queues) j = 0; #ifdef RSS /* * Fetch the RSS bucket id for the given indirection entry. * Cap it at the number of configured buckets (which is * num_queues.) */ queue_id = rss_get_indirection_to_bucket(i); queue_id = queue_id % adapter->num_queues; #else queue_id = (j * index_mult); #endif /* * The low 8 bits are for hash value (n+0); * The next 8 bits are for hash value (n+1), etc. */ reta = reta >> 8; reta = reta | ( ((uint32_t) queue_id) << 24); if ((i & 3) == 3) { if (i < 128) IXGBE_WRITE_REG(hw, IXGBE_RETA(i >> 2), reta); else IXGBE_WRITE_REG(hw, IXGBE_ERETA((i >> 2) - 32), reta); reta = 0; } } /* Now fill our hash function seeds */ for (int i = 0; i < 10; i++) IXGBE_WRITE_REG(hw, IXGBE_RSSRK(i), rss_key[i]); /* Perform hash on these packet types */ #ifdef RSS mrqc = IXGBE_MRQC_RSSEN; rss_hash_config = rss_gethashconfig(); if (rss_hash_config & RSS_HASHTYPE_RSS_IPV4) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV4; if (rss_hash_config & RSS_HASHTYPE_RSS_TCP_IPV4) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV4_TCP; if (rss_hash_config & RSS_HASHTYPE_RSS_IPV6) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6; if (rss_hash_config & RSS_HASHTYPE_RSS_TCP_IPV6) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6_TCP; if (rss_hash_config & RSS_HASHTYPE_RSS_IPV6_EX) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6_EX; if (rss_hash_config & RSS_HASHTYPE_RSS_TCP_IPV6_EX) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6_EX_TCP; if (rss_hash_config & RSS_HASHTYPE_RSS_UDP_IPV4) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV4_UDP; if (rss_hash_config & RSS_HASHTYPE_RSS_UDP_IPV4_EX) device_printf(adapter->dev, "%s: RSS_HASHTYPE_RSS_UDP_IPV4_EX defined, " "but not supported\n", __func__); if (rss_hash_config & RSS_HASHTYPE_RSS_UDP_IPV6) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6_UDP; if (rss_hash_config & RSS_HASHTYPE_RSS_UDP_IPV6_EX) mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6_EX_UDP; #else /* * Disable UDP - IP fragments aren't currently being handled * and so we end up with a mix of 2-tuple and 4-tuple * traffic. */ mrqc = IXGBE_MRQC_RSSEN | IXGBE_MRQC_RSS_FIELD_IPV4 | IXGBE_MRQC_RSS_FIELD_IPV4_TCP | IXGBE_MRQC_RSS_FIELD_IPV6_EX_TCP | IXGBE_MRQC_RSS_FIELD_IPV6_EX | IXGBE_MRQC_RSS_FIELD_IPV6 | IXGBE_MRQC_RSS_FIELD_IPV6_TCP ; #endif /* RSS */ #ifdef PCI_IOV mode = ixgbe_get_iov_mode(adapter); mrqc |= ixgbe_get_mrqc(mode); #endif IXGBE_WRITE_REG(hw, IXGBE_MRQC, mrqc); } /********************************************************************* * * Setup receive registers and features. * **********************************************************************/ #define IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT 2 #define BSIZEPKT_ROUNDUP ((1<rx_rings; struct ixgbe_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; u32 bufsz, fctrl, srrctl, rxcsum; u32 hlreg; /* * Make sure receives are disabled while * setting up the descriptor ring */ ixgbe_disable_rx(hw); /* Enable broadcasts */ fctrl = IXGBE_READ_REG(hw, IXGBE_FCTRL); fctrl |= IXGBE_FCTRL_BAM; if (adapter->hw.mac.type == ixgbe_mac_82598EB) { fctrl |= IXGBE_FCTRL_DPF; fctrl |= IXGBE_FCTRL_PMCF; } IXGBE_WRITE_REG(hw, IXGBE_FCTRL, fctrl); /* Set for Jumbo Frames? */ hlreg = IXGBE_READ_REG(hw, IXGBE_HLREG0); if (ifp->if_mtu > ETHERMTU) hlreg |= IXGBE_HLREG0_JUMBOEN; else hlreg &= ~IXGBE_HLREG0_JUMBOEN; #ifdef DEV_NETMAP /* crcstrip is conditional in netmap (in RDRXCTL too ?) */ if (ifp->if_capenable & IFCAP_NETMAP && !ix_crcstrip) hlreg &= ~IXGBE_HLREG0_RXCRCSTRP; else hlreg |= IXGBE_HLREG0_RXCRCSTRP; #endif /* DEV_NETMAP */ IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg); bufsz = (adapter->rx_mbuf_sz + BSIZEPKT_ROUNDUP) >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; for (int i = 0; i < adapter->num_queues; i++, rxr++) { u64 rdba = rxr->rxdma.dma_paddr; int j = rxr->me; /* Setup the Base and Length of the Rx Descriptor Ring */ IXGBE_WRITE_REG(hw, IXGBE_RDBAL(j), (rdba & 0x00000000ffffffffULL)); IXGBE_WRITE_REG(hw, IXGBE_RDBAH(j), (rdba >> 32)); IXGBE_WRITE_REG(hw, IXGBE_RDLEN(j), adapter->num_rx_desc * sizeof(union ixgbe_adv_rx_desc)); /* Set up the SRRCTL register */ srrctl = IXGBE_READ_REG(hw, IXGBE_SRRCTL(j)); srrctl &= ~IXGBE_SRRCTL_BSIZEHDR_MASK; srrctl &= ~IXGBE_SRRCTL_BSIZEPKT_MASK; srrctl |= bufsz; srrctl |= IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF; /* * Set DROP_EN iff we have no flow control and >1 queue. * Note that srrctl was cleared shortly before during reset, * so we do not need to clear the bit, but do it just in case * this code is moved elsewhere. */ if (adapter->num_queues > 1 && adapter->hw.fc.requested_mode == ixgbe_fc_none) { srrctl |= IXGBE_SRRCTL_DROP_EN; } else { srrctl &= ~IXGBE_SRRCTL_DROP_EN; } IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(j), srrctl); /* Setup the HW Rx Head and Tail Descriptor Pointers */ IXGBE_WRITE_REG(hw, IXGBE_RDH(j), 0); IXGBE_WRITE_REG(hw, IXGBE_RDT(j), 0); /* Set the driver rx tail address */ rxr->tail = IXGBE_RDT(rxr->me); } if (adapter->hw.mac.type != ixgbe_mac_82598EB) { u32 psrtype = IXGBE_PSRTYPE_TCPHDR | IXGBE_PSRTYPE_UDPHDR | IXGBE_PSRTYPE_IPV4HDR | IXGBE_PSRTYPE_IPV6HDR; IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(0), psrtype); } rxcsum = IXGBE_READ_REG(hw, IXGBE_RXCSUM); ixgbe_initialize_rss_mapping(adapter); if (adapter->num_queues > 1) { /* RSS and RX IPP Checksum are mutually exclusive */ rxcsum |= IXGBE_RXCSUM_PCSD; } if (ifp->if_capenable & IFCAP_RXCSUM) rxcsum |= IXGBE_RXCSUM_PCSD; /* This is useful for calculating UDP/IP fragment checksums */ if (!(rxcsum & IXGBE_RXCSUM_PCSD)) rxcsum |= IXGBE_RXCSUM_IPPCSE; IXGBE_WRITE_REG(hw, IXGBE_RXCSUM, rxcsum); return; } /* ** This routine is run via an vlan config EVENT, ** it enables us to use the HW Filter table since ** we can get the vlan id. This just creates the ** entry in the soft version of the VFTA, init will ** repopulate the real table. */ static void ixgbe_register_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u16 index, bit; if (ifp->if_softc != arg) /* Not our event */ return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IXGBE_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; adapter->shadow_vfta[index] |= (1 << bit); ++adapter->num_vlans; ixgbe_setup_vlan_hw_support(adapter); IXGBE_CORE_UNLOCK(adapter); } /* ** This routine is run via an vlan ** unconfig EVENT, remove our entry ** in the soft vfta. */ static void ixgbe_unregister_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u16 index, bit; if (ifp->if_softc != arg) return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IXGBE_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; adapter->shadow_vfta[index] &= ~(1 << bit); --adapter->num_vlans; /* Re-init to load the changes */ ixgbe_setup_vlan_hw_support(adapter); IXGBE_CORE_UNLOCK(adapter); } static void ixgbe_setup_vlan_hw_support(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; struct ixgbe_hw *hw = &adapter->hw; struct rx_ring *rxr; u32 ctrl; /* ** We get here thru init_locked, meaning ** a soft reset, this has already cleared ** the VFTA and other state, so if there ** have been no vlan's registered do nothing. */ if (adapter->num_vlans == 0) return; /* Setup the queues for vlans */ for (int i = 0; i < adapter->num_queues; i++) { rxr = &adapter->rx_rings[i]; /* On 82599 the VLAN enable is per/queue in RXDCTL */ if (hw->mac.type != ixgbe_mac_82598EB) { ctrl = IXGBE_READ_REG(hw, IXGBE_RXDCTL(rxr->me)); ctrl |= IXGBE_RXDCTL_VME; IXGBE_WRITE_REG(hw, IXGBE_RXDCTL(rxr->me), ctrl); } rxr->vtag_strip = TRUE; } if ((ifp->if_capenable & IFCAP_VLAN_HWFILTER) == 0) return; /* ** A soft reset zero's out the VFTA, so ** we need to repopulate it now. */ for (int i = 0; i < IXGBE_VFTA_SIZE; i++) if (adapter->shadow_vfta[i] != 0) IXGBE_WRITE_REG(hw, IXGBE_VFTA(i), adapter->shadow_vfta[i]); ctrl = IXGBE_READ_REG(hw, IXGBE_VLNCTRL); /* Enable the Filter Table if enabled */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) { ctrl &= ~IXGBE_VLNCTRL_CFIEN; ctrl |= IXGBE_VLNCTRL_VFE; } if (hw->mac.type == ixgbe_mac_82598EB) ctrl |= IXGBE_VLNCTRL_VME; IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, ctrl); } static void ixgbe_enable_intr(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; struct ix_queue *que = adapter->queues; u32 mask, fwsm; mask = (IXGBE_EIMS_ENABLE_MASK & ~IXGBE_EIMS_RTX_QUEUE); /* Enable Fan Failure detection */ if (hw->device_id == IXGBE_DEV_ID_82598AT) mask |= IXGBE_EIMS_GPI_SDP1; switch (adapter->hw.mac.type) { case ixgbe_mac_82599EB: mask |= IXGBE_EIMS_ECC; /* Temperature sensor on some adapters */ mask |= IXGBE_EIMS_GPI_SDP0; /* SFP+ (RX_LOS_N & MOD_ABS_N) */ mask |= IXGBE_EIMS_GPI_SDP1; mask |= IXGBE_EIMS_GPI_SDP2; #ifdef IXGBE_FDIR mask |= IXGBE_EIMS_FLOW_DIR; #endif #ifdef PCI_IOV mask |= IXGBE_EIMS_MAILBOX; #endif break; case ixgbe_mac_X540: /* Detect if Thermal Sensor is enabled */ fwsm = IXGBE_READ_REG(hw, IXGBE_FWSM); if (fwsm & IXGBE_FWSM_TS_ENABLED) mask |= IXGBE_EIMS_TS; mask |= IXGBE_EIMS_ECC; #ifdef IXGBE_FDIR mask |= IXGBE_EIMS_FLOW_DIR; #endif break; case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: /* MAC thermal sensor is automatically enabled */ mask |= IXGBE_EIMS_TS; /* Some devices use SDP0 for important information */ if (hw->device_id == IXGBE_DEV_ID_X550EM_X_SFP || hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T) mask |= IXGBE_EIMS_GPI_SDP0_BY_MAC(hw); mask |= IXGBE_EIMS_ECC; #ifdef IXGBE_FDIR mask |= IXGBE_EIMS_FLOW_DIR; #endif #ifdef PCI_IOV mask |= IXGBE_EIMS_MAILBOX; #endif /* falls through */ default: break; } IXGBE_WRITE_REG(hw, IXGBE_EIMS, mask); /* With MSI-X we use auto clear */ if (adapter->msix_mem) { mask = IXGBE_EIMS_ENABLE_MASK; /* Don't autoclear Link */ mask &= ~IXGBE_EIMS_OTHER; mask &= ~IXGBE_EIMS_LSC; #ifdef PCI_IOV mask &= ~IXGBE_EIMS_MAILBOX; #endif IXGBE_WRITE_REG(hw, IXGBE_EIAC, mask); } /* ** Now enable all queues, this is done separately to ** allow for handling the extended (beyond 32) MSIX ** vectors that can be used by 82599 */ for (int i = 0; i < adapter->num_queues; i++, que++) ixgbe_enable_queue(adapter, que->msix); IXGBE_WRITE_FLUSH(hw); return; } static void ixgbe_disable_intr(struct adapter *adapter) { if (adapter->msix_mem) IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIAC, 0); if (adapter->hw.mac.type == ixgbe_mac_82598EB) { IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC, ~0); } else { IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC, 0xFFFF0000); IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC_EX(0), ~0); IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC_EX(1), ~0); } IXGBE_WRITE_FLUSH(&adapter->hw); return; } /* ** Get the width and transaction speed of ** the slot this adapter is plugged into. */ static void ixgbe_get_slot_info(struct adapter *adapter) { device_t dev = adapter->dev; struct ixgbe_hw *hw = &adapter->hw; struct ixgbe_mac_info *mac = &hw->mac; u16 link; u32 offset; /* For most devices simply call the shared code routine */ if (hw->device_id != IXGBE_DEV_ID_82599_SFP_SF_QP) { ixgbe_get_bus_info(hw); /* These devices don't use PCI-E */ switch (hw->mac.type) { case ixgbe_mac_X550EM_x: return; default: goto display; } } /* ** For the Quad port adapter we need to parse back ** up the PCI tree to find the speed of the expansion ** slot into which this adapter is plugged. A bit more work. */ dev = device_get_parent(device_get_parent(dev)); #ifdef IXGBE_DEBUG device_printf(dev, "parent pcib = %x,%x,%x\n", pci_get_bus(dev), pci_get_slot(dev), pci_get_function(dev)); #endif dev = device_get_parent(device_get_parent(dev)); #ifdef IXGBE_DEBUG device_printf(dev, "slot pcib = %x,%x,%x\n", pci_get_bus(dev), pci_get_slot(dev), pci_get_function(dev)); #endif /* Now get the PCI Express Capabilities offset */ pci_find_cap(dev, PCIY_EXPRESS, &offset); /* ...and read the Link Status Register */ link = pci_read_config(dev, offset + PCIER_LINK_STA, 2); switch (link & IXGBE_PCI_LINK_WIDTH) { case IXGBE_PCI_LINK_WIDTH_1: hw->bus.width = ixgbe_bus_width_pcie_x1; break; case IXGBE_PCI_LINK_WIDTH_2: hw->bus.width = ixgbe_bus_width_pcie_x2; break; case IXGBE_PCI_LINK_WIDTH_4: hw->bus.width = ixgbe_bus_width_pcie_x4; break; case IXGBE_PCI_LINK_WIDTH_8: hw->bus.width = ixgbe_bus_width_pcie_x8; break; default: hw->bus.width = ixgbe_bus_width_unknown; break; } switch (link & IXGBE_PCI_LINK_SPEED) { case IXGBE_PCI_LINK_SPEED_2500: hw->bus.speed = ixgbe_bus_speed_2500; break; case IXGBE_PCI_LINK_SPEED_5000: hw->bus.speed = ixgbe_bus_speed_5000; break; case IXGBE_PCI_LINK_SPEED_8000: hw->bus.speed = ixgbe_bus_speed_8000; break; default: hw->bus.speed = ixgbe_bus_speed_unknown; break; } mac->ops.set_lan_id(hw); display: device_printf(dev,"PCI Express Bus: Speed %s %s\n", ((hw->bus.speed == ixgbe_bus_speed_8000) ? "8.0GT/s": (hw->bus.speed == ixgbe_bus_speed_5000) ? "5.0GT/s": (hw->bus.speed == ixgbe_bus_speed_2500) ? "2.5GT/s":"Unknown"), (hw->bus.width == ixgbe_bus_width_pcie_x8) ? "Width x8" : (hw->bus.width == ixgbe_bus_width_pcie_x4) ? "Width x4" : (hw->bus.width == ixgbe_bus_width_pcie_x1) ? "Width x1" : ("Unknown")); if ((hw->device_id != IXGBE_DEV_ID_82599_SFP_SF_QP) && ((hw->bus.width <= ixgbe_bus_width_pcie_x4) && (hw->bus.speed == ixgbe_bus_speed_2500))) { device_printf(dev, "PCI-Express bandwidth available" " for this card\n is not sufficient for" " optimal performance.\n"); device_printf(dev, "For optimal performance a x8 " "PCIE, or x4 PCIE Gen2 slot is required.\n"); } if ((hw->device_id == IXGBE_DEV_ID_82599_SFP_SF_QP) && ((hw->bus.width <= ixgbe_bus_width_pcie_x8) && (hw->bus.speed < ixgbe_bus_speed_8000))) { device_printf(dev, "PCI-Express bandwidth available" " for this card\n is not sufficient for" " optimal performance.\n"); device_printf(dev, "For optimal performance a x8 " "PCIE Gen3 slot is required.\n"); } return; } /* ** Setup the correct IVAR register for a particular MSIX interrupt ** (yes this is all very magic and confusing :) ** - entry is the register array entry ** - vector is the MSIX vector for this queue ** - type is RX/TX/MISC */ static void ixgbe_set_ivar(struct adapter *adapter, u8 entry, u8 vector, s8 type) { struct ixgbe_hw *hw = &adapter->hw; u32 ivar, index; vector |= IXGBE_IVAR_ALLOC_VAL; switch (hw->mac.type) { case ixgbe_mac_82598EB: if (type == -1) entry = IXGBE_IVAR_OTHER_CAUSES_INDEX; else entry += (type * 64); index = (entry >> 2) & 0x1F; ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(index)); ivar &= ~(0xFF << (8 * (entry & 0x3))); ivar |= (vector << (8 * (entry & 0x3))); IXGBE_WRITE_REG(&adapter->hw, IXGBE_IVAR(index), ivar); break; case ixgbe_mac_82599EB: case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: if (type == -1) { /* MISC IVAR */ index = (entry & 1) * 8; ivar = IXGBE_READ_REG(hw, IXGBE_IVAR_MISC); ivar &= ~(0xFF << index); ivar |= (vector << index); IXGBE_WRITE_REG(hw, IXGBE_IVAR_MISC, ivar); } else { /* RX/TX IVARS */ index = (16 * (entry & 1)) + (8 * type); ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(entry >> 1)); ivar &= ~(0xFF << index); ivar |= (vector << index); IXGBE_WRITE_REG(hw, IXGBE_IVAR(entry >> 1), ivar); } default: break; } } static void ixgbe_configure_ivars(struct adapter *adapter) { struct ix_queue *que = adapter->queues; u32 newitr; if (ixgbe_max_interrupt_rate > 0) newitr = (4000000 / ixgbe_max_interrupt_rate) & 0x0FF8; else { /* ** Disable DMA coalescing if interrupt moderation is ** disabled. */ adapter->dmac = 0; newitr = 0; } for (int i = 0; i < adapter->num_queues; i++, que++) { struct rx_ring *rxr = &adapter->rx_rings[i]; struct tx_ring *txr = &adapter->tx_rings[i]; /* First the RX queue entry */ ixgbe_set_ivar(adapter, rxr->me, que->msix, 0); /* ... and the TX */ ixgbe_set_ivar(adapter, txr->me, que->msix, 1); /* Set an Initial EITR value */ IXGBE_WRITE_REG(&adapter->hw, IXGBE_EITR(que->msix), newitr); } /* For the Link interrupt */ ixgbe_set_ivar(adapter, 1, adapter->vector, -1); } /* ** ixgbe_sfp_probe - called in the local timer to ** determine if a port had optics inserted. */ static bool ixgbe_sfp_probe(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; device_t dev = adapter->dev; bool result = FALSE; if ((hw->phy.type == ixgbe_phy_nl) && (hw->phy.sfp_type == ixgbe_sfp_type_not_present)) { s32 ret = hw->phy.ops.identify_sfp(hw); if (ret) goto out; ret = hw->phy.ops.reset(hw); if (ret == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev, "Unsupported SFP+ module detected!"); device_printf(dev, "Reload driver with supported module.\n"); adapter->sfp_probe = FALSE; goto out; } else device_printf(dev, "SFP+ module detected!\n"); /* We now have supported optics */ adapter->sfp_probe = FALSE; /* Set the optics type so system reports correctly */ ixgbe_setup_optics(adapter); result = TRUE; } out: return (result); } /* ** Tasklet handler for MSIX Link interrupts ** - do outside interrupt since it might sleep */ static void ixgbe_handle_link(void *context, int pending) { struct adapter *adapter = context; struct ixgbe_hw *hw = &adapter->hw; ixgbe_check_link(hw, &adapter->link_speed, &adapter->link_up, 0); ixgbe_update_link_status(adapter); /* Re-enable link interrupts */ IXGBE_WRITE_REG(hw, IXGBE_EIMS, IXGBE_EIMS_LSC); } /* ** Tasklet for handling SFP module interrupts */ static void ixgbe_handle_mod(void *context, int pending) { struct adapter *adapter = context; struct ixgbe_hw *hw = &adapter->hw; enum ixgbe_phy_type orig_type = hw->phy.type; device_t dev = adapter->dev; u32 err; IXGBE_CORE_LOCK(adapter); /* Check to see if the PHY type changed */ if (hw->phy.ops.identify) { hw->phy.type = ixgbe_phy_unknown; hw->phy.ops.identify(hw); } if (hw->phy.type != orig_type) { device_printf(dev, "Detected phy_type %d\n", hw->phy.type); if (hw->phy.type == ixgbe_phy_none) { hw->phy.sfp_type = ixgbe_sfp_type_unknown; goto out; } /* Try to do the initialization that was skipped before */ if (hw->phy.ops.init) hw->phy.ops.init(hw); if (hw->phy.ops.reset) hw->phy.ops.reset(hw); } err = hw->phy.ops.identify_sfp(hw); if (err == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev, "Unsupported SFP+ module type was detected.\n"); goto out; } err = hw->mac.ops.setup_sfp(hw); if (err == IXGBE_ERR_SFP_NOT_SUPPORTED) { device_printf(dev, "Setup failure - unsupported SFP+ module type.\n"); goto out; } if (hw->phy.multispeed_fiber) taskqueue_enqueue(adapter->tq, &adapter->msf_task); out: /* Update media type */ switch (hw->mac.ops.get_media_type(hw)) { case ixgbe_media_type_fiber: adapter->optics = IFM_10G_SR; break; case ixgbe_media_type_copper: adapter->optics = IFM_10G_TWINAX; break; case ixgbe_media_type_cx4: adapter->optics = IFM_10G_CX4; break; default: adapter->optics = 0; break; } IXGBE_CORE_UNLOCK(adapter); return; } /* ** Tasklet for handling MSF (multispeed fiber) interrupts */ static void ixgbe_handle_msf(void *context, int pending) { struct adapter *adapter = context; struct ixgbe_hw *hw = &adapter->hw; u32 autoneg; bool negotiate; IXGBE_CORE_LOCK(adapter); /* get_supported_phy_layer will call hw->phy.ops.identify_sfp() */ adapter->phy_layer = ixgbe_get_supported_physical_layer(hw); autoneg = hw->phy.autoneg_advertised; if ((!autoneg) && (hw->mac.ops.get_link_capabilities)) hw->mac.ops.get_link_capabilities(hw, &autoneg, &negotiate); if (hw->mac.ops.setup_link) hw->mac.ops.setup_link(hw, autoneg, TRUE); /* Adjust media types shown in ifconfig */ ifmedia_removeall(&adapter->media); ixgbe_add_media_types(adapter); IXGBE_CORE_UNLOCK(adapter); return; } /* ** Tasklet for handling interrupts from an external PHY */ static void ixgbe_handle_phy(void *context, int pending) { struct adapter *adapter = context; struct ixgbe_hw *hw = &adapter->hw; int error; error = hw->phy.ops.handle_lasi(hw); if (error == IXGBE_ERR_OVERTEMP) device_printf(adapter->dev, "CRITICAL: EXTERNAL PHY OVER TEMP!! " " PHY will downshift to lower power state!\n"); else if (error) device_printf(adapter->dev, "Error handling LASI interrupt: %d\n", error); return; } #ifdef IXGBE_FDIR /* ** Tasklet for reinitializing the Flow Director filter table */ static void ixgbe_reinit_fdir(void *context, int pending) { struct adapter *adapter = context; struct ifnet *ifp = adapter->ifp; if (adapter->fdir_reinit != 1) /* Shouldn't happen */ return; ixgbe_reinit_fdir_tables_82599(&adapter->hw); adapter->fdir_reinit = 0; /* re-enable flow director interrupts */ IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMS, IXGBE_EIMS_FLOW_DIR); /* Restart the interface */ ifp->if_drv_flags |= IFF_DRV_RUNNING; return; } #endif /********************************************************************* * * Configure DMA Coalescing * **********************************************************************/ static void ixgbe_config_dmac(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; struct ixgbe_dmac_config *dcfg = &hw->mac.dmac_config; if (hw->mac.type < ixgbe_mac_X550 || !hw->mac.ops.dmac_config) return; if (dcfg->watchdog_timer ^ adapter->dmac || dcfg->link_speed ^ adapter->link_speed) { dcfg->watchdog_timer = adapter->dmac; dcfg->fcoe_en = false; dcfg->link_speed = adapter->link_speed; dcfg->num_tcs = 1; INIT_DEBUGOUT2("dmac settings: watchdog %d, link speed %d\n", dcfg->watchdog_timer, dcfg->link_speed); hw->mac.ops.dmac_config(hw); } } /* * Checks whether the adapter's ports are capable of * Wake On LAN by reading the adapter's NVM. * * Sets each port's hw->wol_enabled value depending * on the value read here. */ static void ixgbe_check_wol_support(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; u16 dev_caps = 0; /* Find out WoL support for port */ adapter->wol_support = hw->wol_enabled = 0; ixgbe_get_device_caps(hw, &dev_caps); if ((dev_caps & IXGBE_DEVICE_CAPS_WOL_PORT0_1) || ((dev_caps & IXGBE_DEVICE_CAPS_WOL_PORT0) && hw->bus.func == 0)) adapter->wol_support = hw->wol_enabled = 1; /* Save initial wake up filter configuration */ adapter->wufc = IXGBE_READ_REG(hw, IXGBE_WUFC); return; } /* * Prepare the adapter/port for LPLU and/or WoL */ static int ixgbe_setup_low_power_mode(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; device_t dev = adapter->dev; s32 error = 0; mtx_assert(&adapter->core_mtx, MA_OWNED); /* Limit power management flow to X550EM baseT */ if (hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T && hw->phy.ops.enter_lplu) { /* Turn off support for APM wakeup. (Using ACPI instead) */ IXGBE_WRITE_REG(hw, IXGBE_GRC, IXGBE_READ_REG(hw, IXGBE_GRC) & ~(u32)2); /* * Clear Wake Up Status register to prevent any previous wakeup * events from waking us up immediately after we suspend. */ IXGBE_WRITE_REG(hw, IXGBE_WUS, 0xffffffff); /* * Program the Wakeup Filter Control register with user filter * settings */ IXGBE_WRITE_REG(hw, IXGBE_WUFC, adapter->wufc); /* Enable wakeups and power management in Wakeup Control */ IXGBE_WRITE_REG(hw, IXGBE_WUC, IXGBE_WUC_WKEN | IXGBE_WUC_PME_EN); /* X550EM baseT adapters need a special LPLU flow */ hw->phy.reset_disable = true; ixgbe_stop(adapter); error = hw->phy.ops.enter_lplu(hw); if (error) device_printf(dev, "Error entering LPLU: %d\n", error); hw->phy.reset_disable = false; } else { /* Just stop for other adapters */ ixgbe_stop(adapter); } return error; } /********************************************************************** * * Update the board statistics counters. * **********************************************************************/ static void ixgbe_update_stats_counters(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; u32 missed_rx = 0, bprc, lxon, lxoff, total; u64 total_missed_rx = 0; adapter->stats.pf.crcerrs += IXGBE_READ_REG(hw, IXGBE_CRCERRS); adapter->stats.pf.illerrc += IXGBE_READ_REG(hw, IXGBE_ILLERRC); adapter->stats.pf.errbc += IXGBE_READ_REG(hw, IXGBE_ERRBC); adapter->stats.pf.mspdc += IXGBE_READ_REG(hw, IXGBE_MSPDC); for (int i = 0; i < 16; i++) { adapter->stats.pf.qprc[i] += IXGBE_READ_REG(hw, IXGBE_QPRC(i)); adapter->stats.pf.qptc[i] += IXGBE_READ_REG(hw, IXGBE_QPTC(i)); adapter->stats.pf.qprdc[i] += IXGBE_READ_REG(hw, IXGBE_QPRDC(i)); } adapter->stats.pf.mlfc += IXGBE_READ_REG(hw, IXGBE_MLFC); adapter->stats.pf.mrfc += IXGBE_READ_REG(hw, IXGBE_MRFC); adapter->stats.pf.rlec += IXGBE_READ_REG(hw, IXGBE_RLEC); /* Hardware workaround, gprc counts missed packets */ adapter->stats.pf.gprc += IXGBE_READ_REG(hw, IXGBE_GPRC); adapter->stats.pf.gprc -= missed_rx; if (hw->mac.type != ixgbe_mac_82598EB) { adapter->stats.pf.gorc += IXGBE_READ_REG(hw, IXGBE_GORCL) + ((u64)IXGBE_READ_REG(hw, IXGBE_GORCH) << 32); adapter->stats.pf.gotc += IXGBE_READ_REG(hw, IXGBE_GOTCL) + ((u64)IXGBE_READ_REG(hw, IXGBE_GOTCH) << 32); adapter->stats.pf.tor += IXGBE_READ_REG(hw, IXGBE_TORL) + ((u64)IXGBE_READ_REG(hw, IXGBE_TORH) << 32); adapter->stats.pf.lxonrxc += IXGBE_READ_REG(hw, IXGBE_LXONRXCNT); adapter->stats.pf.lxoffrxc += IXGBE_READ_REG(hw, IXGBE_LXOFFRXCNT); } else { adapter->stats.pf.lxonrxc += IXGBE_READ_REG(hw, IXGBE_LXONRXC); adapter->stats.pf.lxoffrxc += IXGBE_READ_REG(hw, IXGBE_LXOFFRXC); /* 82598 only has a counter in the high register */ adapter->stats.pf.gorc += IXGBE_READ_REG(hw, IXGBE_GORCH); adapter->stats.pf.gotc += IXGBE_READ_REG(hw, IXGBE_GOTCH); adapter->stats.pf.tor += IXGBE_READ_REG(hw, IXGBE_TORH); } /* * Workaround: mprc hardware is incorrectly counting * broadcasts, so for now we subtract those. */ bprc = IXGBE_READ_REG(hw, IXGBE_BPRC); adapter->stats.pf.bprc += bprc; adapter->stats.pf.mprc += IXGBE_READ_REG(hw, IXGBE_MPRC); if (hw->mac.type == ixgbe_mac_82598EB) adapter->stats.pf.mprc -= bprc; adapter->stats.pf.prc64 += IXGBE_READ_REG(hw, IXGBE_PRC64); adapter->stats.pf.prc127 += IXGBE_READ_REG(hw, IXGBE_PRC127); adapter->stats.pf.prc255 += IXGBE_READ_REG(hw, IXGBE_PRC255); adapter->stats.pf.prc511 += IXGBE_READ_REG(hw, IXGBE_PRC511); adapter->stats.pf.prc1023 += IXGBE_READ_REG(hw, IXGBE_PRC1023); adapter->stats.pf.prc1522 += IXGBE_READ_REG(hw, IXGBE_PRC1522); lxon = IXGBE_READ_REG(hw, IXGBE_LXONTXC); adapter->stats.pf.lxontxc += lxon; lxoff = IXGBE_READ_REG(hw, IXGBE_LXOFFTXC); adapter->stats.pf.lxofftxc += lxoff; total = lxon + lxoff; adapter->stats.pf.gptc += IXGBE_READ_REG(hw, IXGBE_GPTC); adapter->stats.pf.mptc += IXGBE_READ_REG(hw, IXGBE_MPTC); adapter->stats.pf.ptc64 += IXGBE_READ_REG(hw, IXGBE_PTC64); adapter->stats.pf.gptc -= total; adapter->stats.pf.mptc -= total; adapter->stats.pf.ptc64 -= total; adapter->stats.pf.gotc -= total * ETHER_MIN_LEN; adapter->stats.pf.ruc += IXGBE_READ_REG(hw, IXGBE_RUC); adapter->stats.pf.rfc += IXGBE_READ_REG(hw, IXGBE_RFC); adapter->stats.pf.roc += IXGBE_READ_REG(hw, IXGBE_ROC); adapter->stats.pf.rjc += IXGBE_READ_REG(hw, IXGBE_RJC); adapter->stats.pf.mngprc += IXGBE_READ_REG(hw, IXGBE_MNGPRC); adapter->stats.pf.mngpdc += IXGBE_READ_REG(hw, IXGBE_MNGPDC); adapter->stats.pf.mngptc += IXGBE_READ_REG(hw, IXGBE_MNGPTC); adapter->stats.pf.tpr += IXGBE_READ_REG(hw, IXGBE_TPR); adapter->stats.pf.tpt += IXGBE_READ_REG(hw, IXGBE_TPT); adapter->stats.pf.ptc127 += IXGBE_READ_REG(hw, IXGBE_PTC127); adapter->stats.pf.ptc255 += IXGBE_READ_REG(hw, IXGBE_PTC255); adapter->stats.pf.ptc511 += IXGBE_READ_REG(hw, IXGBE_PTC511); adapter->stats.pf.ptc1023 += IXGBE_READ_REG(hw, IXGBE_PTC1023); adapter->stats.pf.ptc1522 += IXGBE_READ_REG(hw, IXGBE_PTC1522); adapter->stats.pf.bptc += IXGBE_READ_REG(hw, IXGBE_BPTC); adapter->stats.pf.xec += IXGBE_READ_REG(hw, IXGBE_XEC); adapter->stats.pf.fccrc += IXGBE_READ_REG(hw, IXGBE_FCCRC); adapter->stats.pf.fclast += IXGBE_READ_REG(hw, IXGBE_FCLAST); /* Only read FCOE on 82599 */ if (hw->mac.type != ixgbe_mac_82598EB) { adapter->stats.pf.fcoerpdc += IXGBE_READ_REG(hw, IXGBE_FCOERPDC); adapter->stats.pf.fcoeprc += IXGBE_READ_REG(hw, IXGBE_FCOEPRC); adapter->stats.pf.fcoeptc += IXGBE_READ_REG(hw, IXGBE_FCOEPTC); adapter->stats.pf.fcoedwrc += IXGBE_READ_REG(hw, IXGBE_FCOEDWRC); adapter->stats.pf.fcoedwtc += IXGBE_READ_REG(hw, IXGBE_FCOEDWTC); } /* Fill out the OS statistics structure */ IXGBE_SET_IPACKETS(adapter, adapter->stats.pf.gprc); IXGBE_SET_OPACKETS(adapter, adapter->stats.pf.gptc); IXGBE_SET_IBYTES(adapter, adapter->stats.pf.gorc); IXGBE_SET_OBYTES(adapter, adapter->stats.pf.gotc); IXGBE_SET_IMCASTS(adapter, adapter->stats.pf.mprc); IXGBE_SET_OMCASTS(adapter, adapter->stats.pf.mptc); IXGBE_SET_COLLISIONS(adapter, 0); IXGBE_SET_IQDROPS(adapter, total_missed_rx); IXGBE_SET_IERRORS(adapter, adapter->stats.pf.crcerrs + adapter->stats.pf.rlec); } #if __FreeBSD_version >= 1100036 static uint64_t ixgbe_get_counter(struct ifnet *ifp, ift_counter cnt) { struct adapter *adapter; struct tx_ring *txr; uint64_t rv; adapter = if_getsoftc(ifp); switch (cnt) { case IFCOUNTER_IPACKETS: return (adapter->ipackets); case IFCOUNTER_OPACKETS: return (adapter->opackets); case IFCOUNTER_IBYTES: return (adapter->ibytes); case IFCOUNTER_OBYTES: return (adapter->obytes); case IFCOUNTER_IMCASTS: return (adapter->imcasts); case IFCOUNTER_OMCASTS: return (adapter->omcasts); case IFCOUNTER_COLLISIONS: return (0); case IFCOUNTER_IQDROPS: return (adapter->iqdrops); case IFCOUNTER_OQDROPS: rv = 0; txr = adapter->tx_rings; for (int i = 0; i < adapter->num_queues; i++, txr++) rv += txr->br->br_drops; return (rv); case IFCOUNTER_IERRORS: return (adapter->ierrors); default: return (if_get_counter_default(ifp, cnt)); } } #endif /** ixgbe_sysctl_tdh_handler - Handler function * Retrieves the TDH value from the hardware */ static int ixgbe_sysctl_tdh_handler(SYSCTL_HANDLER_ARGS) { int error; struct tx_ring *txr = ((struct tx_ring *)oidp->oid_arg1); if (!txr) return 0; unsigned val = IXGBE_READ_REG(&txr->adapter->hw, IXGBE_TDH(txr->me)); error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return error; return 0; } /** ixgbe_sysctl_tdt_handler - Handler function * Retrieves the TDT value from the hardware */ static int ixgbe_sysctl_tdt_handler(SYSCTL_HANDLER_ARGS) { int error; struct tx_ring *txr = ((struct tx_ring *)oidp->oid_arg1); if (!txr) return 0; unsigned val = IXGBE_READ_REG(&txr->adapter->hw, IXGBE_TDT(txr->me)); error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return error; return 0; } /** ixgbe_sysctl_rdh_handler - Handler function * Retrieves the RDH value from the hardware */ static int ixgbe_sysctl_rdh_handler(SYSCTL_HANDLER_ARGS) { int error; struct rx_ring *rxr = ((struct rx_ring *)oidp->oid_arg1); if (!rxr) return 0; unsigned val = IXGBE_READ_REG(&rxr->adapter->hw, IXGBE_RDH(rxr->me)); error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return error; return 0; } /** ixgbe_sysctl_rdt_handler - Handler function * Retrieves the RDT value from the hardware */ static int ixgbe_sysctl_rdt_handler(SYSCTL_HANDLER_ARGS) { int error; struct rx_ring *rxr = ((struct rx_ring *)oidp->oid_arg1); if (!rxr) return 0; unsigned val = IXGBE_READ_REG(&rxr->adapter->hw, IXGBE_RDT(rxr->me)); error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return error; return 0; } static int ixgbe_sysctl_interrupt_rate_handler(SYSCTL_HANDLER_ARGS) { int error; struct ix_queue *que = ((struct ix_queue *)oidp->oid_arg1); unsigned int reg, usec, rate; reg = IXGBE_READ_REG(&que->adapter->hw, IXGBE_EITR(que->msix)); usec = ((reg & 0x0FF8) >> 3); if (usec > 0) rate = 500000 / usec; else rate = 0; error = sysctl_handle_int(oidp, &rate, 0, req); if (error || !req->newptr) return error; reg &= ~0xfff; /* default, no limitation */ ixgbe_max_interrupt_rate = 0; if (rate > 0 && rate < 500000) { if (rate < 1000) rate = 1000; ixgbe_max_interrupt_rate = rate; reg |= ((4000000/rate) & 0xff8 ); } IXGBE_WRITE_REG(&que->adapter->hw, IXGBE_EITR(que->msix), reg); return 0; } static void ixgbe_add_device_sysctls(struct adapter *adapter) { device_t dev = adapter->dev; struct ixgbe_hw *hw = &adapter->hw; struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); /* Sysctls for all devices */ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "fc", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixgbe_set_flowcntl, "I", IXGBE_SYSCTL_DESC_SET_FC); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "enable_aim", CTLFLAG_RW, &ixgbe_enable_aim, 1, "Interrupt Moderation"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "advertise_speed", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixgbe_set_advertise, "I", IXGBE_SYSCTL_DESC_ADV_SPEED); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "thermal_test", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixgbe_sysctl_thermal_test, "I", "Thermal Test"); #ifdef IXGBE_DEBUG /* testing sysctls (for all devices) */ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "power_state", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixgbe_sysctl_power_state, "I", "PCI Power State"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "print_rss_config", CTLTYPE_STRING | CTLFLAG_RD, adapter, 0, ixgbe_sysctl_print_rss_config, "A", "Prints RSS Configuration"); #endif /* for X550 series devices */ if (hw->mac.type >= ixgbe_mac_X550) SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "dmac", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixgbe_sysctl_dmac, "I", "DMA Coalesce"); /* for X552 backplane devices */ if (hw->device_id == IXGBE_DEV_ID_X550EM_X_KR) { struct sysctl_oid *eee_node; struct sysctl_oid_list *eee_list; eee_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "eee", CTLFLAG_RD, NULL, "Energy Efficient Ethernet sysctls"); eee_list = SYSCTL_CHILDREN(eee_node); SYSCTL_ADD_PROC(ctx, eee_list, OID_AUTO, "enable", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixgbe_sysctl_eee_enable, "I", "Enable or Disable EEE"); SYSCTL_ADD_PROC(ctx, eee_list, OID_AUTO, "negotiated", CTLTYPE_INT | CTLFLAG_RD, adapter, 0, ixgbe_sysctl_eee_negotiated, "I", "EEE negotiated on link"); SYSCTL_ADD_PROC(ctx, eee_list, OID_AUTO, "tx_lpi_status", CTLTYPE_INT | CTLFLAG_RD, adapter, 0, ixgbe_sysctl_eee_tx_lpi_status, "I", "Whether or not TX link is in LPI state"); SYSCTL_ADD_PROC(ctx, eee_list, OID_AUTO, "rx_lpi_status", CTLTYPE_INT | CTLFLAG_RD, adapter, 0, ixgbe_sysctl_eee_rx_lpi_status, "I", "Whether or not RX link is in LPI state"); SYSCTL_ADD_PROC(ctx, eee_list, OID_AUTO, "tx_lpi_delay", CTLTYPE_INT | CTLFLAG_RD, adapter, 0, ixgbe_sysctl_eee_tx_lpi_delay, "I", "TX LPI entry delay in microseconds"); } /* for WoL-capable devices */ if (hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T) { SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "wol_enable", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixgbe_sysctl_wol_enable, "I", "Enable/Disable Wake on LAN"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "wufc", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixgbe_sysctl_wufc, "I", "Enable/Disable Wake Up Filters"); } /* for X552/X557-AT devices */ if (hw->device_id == IXGBE_DEV_ID_X550EM_X_10G_T) { struct sysctl_oid *phy_node; struct sysctl_oid_list *phy_list; phy_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "phy", CTLFLAG_RD, NULL, "External PHY sysctls"); phy_list = SYSCTL_CHILDREN(phy_node); SYSCTL_ADD_PROC(ctx, phy_list, OID_AUTO, "temp", CTLTYPE_INT | CTLFLAG_RD, adapter, 0, ixgbe_sysctl_phy_temp, "I", "Current External PHY Temperature (Celsius)"); SYSCTL_ADD_PROC(ctx, phy_list, OID_AUTO, "overtemp_occurred", CTLTYPE_INT | CTLFLAG_RD, adapter, 0, ixgbe_sysctl_phy_overtemp_occurred, "I", "External PHY High Temperature Event Occurred"); } } /* * Add sysctl variables, one per statistic, to the system. */ static void ixgbe_add_hw_stats(struct adapter *adapter) { device_t dev = adapter->dev; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev); struct sysctl_oid *tree = device_get_sysctl_tree(dev); struct sysctl_oid_list *child = SYSCTL_CHILDREN(tree); struct ixgbe_hw_stats *stats = &adapter->stats.pf; struct sysctl_oid *stat_node, *queue_node; struct sysctl_oid_list *stat_list, *queue_list; #define QUEUE_NAME_LEN 32 char namebuf[QUEUE_NAME_LEN]; /* Driver Statistics */ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "dropped", CTLFLAG_RD, &adapter->dropped_pkts, "Driver dropped packets"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "mbuf_defrag_failed", CTLFLAG_RD, &adapter->mbuf_defrag_failed, "m_defrag() failed"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "watchdog_events", CTLFLAG_RD, &adapter->watchdog_events, "Watchdog timeouts"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "link_irq", CTLFLAG_RD, &adapter->link_irq, "Link MSIX IRQ Handled"); for (int i = 0; i < adapter->num_queues; i++, txr++) { snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate", CTLTYPE_UINT | CTLFLAG_RW, &adapter->queues[i], sizeof(&adapter->queues[i]), ixgbe_sysctl_interrupt_rate_handler, "IU", "Interrupt Rate"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "irqs", CTLFLAG_RD, &(adapter->queues[i].irqs), "irqs on this queue"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_head", CTLTYPE_UINT | CTLFLAG_RD, txr, sizeof(txr), ixgbe_sysctl_tdh_handler, "IU", "Transmit Descriptor Head"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_tail", CTLTYPE_UINT | CTLFLAG_RD, txr, sizeof(txr), ixgbe_sysctl_tdt_handler, "IU", "Transmit Descriptor Tail"); SYSCTL_ADD_ULONG(ctx, queue_list, OID_AUTO, "tso_tx", CTLFLAG_RD, &txr->tso_tx, "TSO"); SYSCTL_ADD_ULONG(ctx, queue_list, OID_AUTO, "no_tx_dma_setup", CTLFLAG_RD, &txr->no_tx_dma_setup, "Driver tx dma failure in xmit"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "no_desc_avail", CTLFLAG_RD, &txr->no_desc_avail, "Queue No Descriptor Available"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "tx_packets", CTLFLAG_RD, &txr->total_packets, "Queue Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "br_drops", CTLFLAG_RD, &txr->br->br_drops, "Packets dropped in buf_ring"); } for (int i = 0; i < adapter->num_queues; i++, rxr++) { snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); struct lro_ctrl *lro = &rxr->lro; snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_head", CTLTYPE_UINT | CTLFLAG_RD, rxr, sizeof(rxr), ixgbe_sysctl_rdh_handler, "IU", "Receive Descriptor Head"); SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_tail", CTLTYPE_UINT | CTLFLAG_RD, rxr, sizeof(rxr), ixgbe_sysctl_rdt_handler, "IU", "Receive Descriptor Tail"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_packets", CTLFLAG_RD, &rxr->rx_packets, "Queue Packets Received"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_bytes", CTLFLAG_RD, &rxr->rx_bytes, "Queue Bytes Received"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_copies", CTLFLAG_RD, &rxr->rx_copies, "Copied RX Frames"); - SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "lro_queued", + SYSCTL_ADD_U64(ctx, queue_list, OID_AUTO, "lro_queued", CTLFLAG_RD, &lro->lro_queued, 0, "LRO Queued"); - SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "lro_flushed", + SYSCTL_ADD_U64(ctx, queue_list, OID_AUTO, "lro_flushed", CTLFLAG_RD, &lro->lro_flushed, 0, "LRO Flushed"); } /* MAC stats get the own sub node */ stat_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "mac_stats", CTLFLAG_RD, NULL, "MAC Statistics"); stat_list = SYSCTL_CHILDREN(stat_node); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "crc_errs", CTLFLAG_RD, &stats->crcerrs, "CRC Errors"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "ill_errs", CTLFLAG_RD, &stats->illerrc, "Illegal Byte Errors"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "byte_errs", CTLFLAG_RD, &stats->errbc, "Byte Errors"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "short_discards", CTLFLAG_RD, &stats->mspdc, "MAC Short Packets Discarded"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "local_faults", CTLFLAG_RD, &stats->mlfc, "MAC Local Faults"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "remote_faults", CTLFLAG_RD, &stats->mrfc, "MAC Remote Faults"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rec_len_errs", CTLFLAG_RD, &stats->rlec, "Receive Length Errors"); /* Flow Control stats */ SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xon_txd", CTLFLAG_RD, &stats->lxontxc, "Link XON Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xon_recvd", CTLFLAG_RD, &stats->lxonrxc, "Link XON Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xoff_txd", CTLFLAG_RD, &stats->lxofftxc, "Link XOFF Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "xoff_recvd", CTLFLAG_RD, &stats->lxoffrxc, "Link XOFF Received"); /* Packet Reception Stats */ SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "total_octets_rcvd", CTLFLAG_RD, &stats->tor, "Total Octets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_octets_rcvd", CTLFLAG_RD, &stats->gorc, "Good Octets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "total_pkts_rcvd", CTLFLAG_RD, &stats->tpr, "Total Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_pkts_rcvd", CTLFLAG_RD, &stats->gprc, "Good Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_rcvd", CTLFLAG_RD, &stats->mprc, "Multicast Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_rcvd", CTLFLAG_RD, &stats->bprc, "Broadcast Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_64", CTLFLAG_RD, &stats->prc64, "64 byte frames received "); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_65_127", CTLFLAG_RD, &stats->prc127, "65-127 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_128_255", CTLFLAG_RD, &stats->prc255, "128-255 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_256_511", CTLFLAG_RD, &stats->prc511, "256-511 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_512_1023", CTLFLAG_RD, &stats->prc1023, "512-1023 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "rx_frames_1024_1522", CTLFLAG_RD, &stats->prc1522, "1023-1522 byte frames received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_undersized", CTLFLAG_RD, &stats->ruc, "Receive Undersized"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_fragmented", CTLFLAG_RD, &stats->rfc, "Fragmented Packets Received "); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_oversized", CTLFLAG_RD, &stats->roc, "Oversized Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "recv_jabberd", CTLFLAG_RD, &stats->rjc, "Received Jabber"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "management_pkts_rcvd", CTLFLAG_RD, &stats->mngprc, "Management Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "management_pkts_drpd", CTLFLAG_RD, &stats->mngptc, "Management Packets Dropped"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "checksum_errs", CTLFLAG_RD, &stats->xec, "Checksum Errors"); /* Packet Transmission Stats */ SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_octets_txd", CTLFLAG_RD, &stats->gotc, "Good Octets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "total_pkts_txd", CTLFLAG_RD, &stats->tpt, "Total Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_pkts_txd", CTLFLAG_RD, &stats->gptc, "Good Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "bcast_pkts_txd", CTLFLAG_RD, &stats->bptc, "Broadcast Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_txd", CTLFLAG_RD, &stats->mptc, "Multicast Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "management_pkts_txd", CTLFLAG_RD, &stats->mngptc, "Management Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_64", CTLFLAG_RD, &stats->ptc64, "64 byte frames transmitted "); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_65_127", CTLFLAG_RD, &stats->ptc127, "65-127 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_128_255", CTLFLAG_RD, &stats->ptc255, "128-255 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_256_511", CTLFLAG_RD, &stats->ptc511, "256-511 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_512_1023", CTLFLAG_RD, &stats->ptc1023, "512-1023 byte frames transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "tx_frames_1024_1522", CTLFLAG_RD, &stats->ptc1522, "1024-1522 byte frames transmitted"); } static void ixgbe_set_sysctl_value(struct adapter *adapter, const char *name, const char *description, int *limit, int value) { *limit = value; SYSCTL_ADD_INT(device_get_sysctl_ctx(adapter->dev), SYSCTL_CHILDREN(device_get_sysctl_tree(adapter->dev)), OID_AUTO, name, CTLFLAG_RW, limit, value, description); } /* ** Set flow control using sysctl: ** Flow control values: ** 0 - off ** 1 - rx pause ** 2 - tx pause ** 3 - full */ static int ixgbe_set_flowcntl(SYSCTL_HANDLER_ARGS) { int error, last; struct adapter *adapter = (struct adapter *) arg1; last = adapter->fc; error = sysctl_handle_int(oidp, &adapter->fc, 0, req); if ((error) || (req->newptr == NULL)) return (error); /* Don't bother if it's not changed */ if (adapter->fc == last) return (0); switch (adapter->fc) { case ixgbe_fc_rx_pause: case ixgbe_fc_tx_pause: case ixgbe_fc_full: adapter->hw.fc.requested_mode = adapter->fc; if (adapter->num_queues > 1) ixgbe_disable_rx_drop(adapter); break; case ixgbe_fc_none: adapter->hw.fc.requested_mode = ixgbe_fc_none; if (adapter->num_queues > 1) ixgbe_enable_rx_drop(adapter); break; default: adapter->fc = last; return (EINVAL); } /* Don't autoneg if forcing a value */ adapter->hw.fc.disable_fc_autoneg = TRUE; ixgbe_fc_enable(&adapter->hw); return error; } /* ** Control advertised link speed: ** Flags: ** 0x1 - advertise 100 Mb ** 0x2 - advertise 1G ** 0x4 - advertise 10G */ static int ixgbe_set_advertise(SYSCTL_HANDLER_ARGS) { int error = 0, requested; struct adapter *adapter; device_t dev; struct ixgbe_hw *hw; ixgbe_link_speed speed = 0; adapter = (struct adapter *) arg1; dev = adapter->dev; hw = &adapter->hw; requested = adapter->advertise; error = sysctl_handle_int(oidp, &requested, 0, req); if ((error) || (req->newptr == NULL)) return (error); /* No speed changes for backplane media */ if (hw->phy.media_type == ixgbe_media_type_backplane) return (ENODEV); /* Checks to validate new value */ if (adapter->advertise == requested) /* no change */ return (0); if (!((hw->phy.media_type == ixgbe_media_type_copper) || (hw->phy.multispeed_fiber))) { device_printf(dev, "Advertised speed can only be set on copper or " "multispeed fiber media types.\n"); return (EINVAL); } if (requested < 0x1 || requested > 0x7) { device_printf(dev, "Invalid advertised speed; valid modes are 0x1 through 0x7\n"); return (EINVAL); } if ((requested & 0x1) && (hw->mac.type != ixgbe_mac_X540) && (hw->mac.type != ixgbe_mac_X550)) { device_printf(dev, "Set Advertise: 100Mb on X540/X550 only\n"); return (EINVAL); } /* Set new value and report new advertised mode */ if (requested & 0x1) speed |= IXGBE_LINK_SPEED_100_FULL; if (requested & 0x2) speed |= IXGBE_LINK_SPEED_1GB_FULL; if (requested & 0x4) speed |= IXGBE_LINK_SPEED_10GB_FULL; hw->mac.autotry_restart = TRUE; hw->mac.ops.setup_link(hw, speed, TRUE); adapter->advertise = requested; return (error); } /* * The following two sysctls are for X552/X557-AT devices; * they deal with the external PHY used in them. */ static int ixgbe_sysctl_phy_temp(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; struct ixgbe_hw *hw = &adapter->hw; u16 reg; if (hw->device_id != IXGBE_DEV_ID_X550EM_X_10G_T) { device_printf(adapter->dev, "Device has no supported external thermal sensor.\n"); return (ENODEV); } if (hw->phy.ops.read_reg(hw, IXGBE_PHY_CURRENT_TEMP, IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, ®)) { device_printf(adapter->dev, "Error reading from PHY's current temperature register\n"); return (EAGAIN); } /* Shift temp for output */ reg = reg >> 8; return (sysctl_handle_int(oidp, NULL, reg, req)); } /* * Reports whether the current PHY temperature is over * the overtemp threshold. * - This is reported directly from the PHY */ static int ixgbe_sysctl_phy_overtemp_occurred(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; struct ixgbe_hw *hw = &adapter->hw; u16 reg; if (hw->device_id != IXGBE_DEV_ID_X550EM_X_10G_T) { device_printf(adapter->dev, "Device has no supported external thermal sensor.\n"); return (ENODEV); } if (hw->phy.ops.read_reg(hw, IXGBE_PHY_OVERTEMP_STATUS, IXGBE_MDIO_VENDOR_SPECIFIC_1_DEV_TYPE, ®)) { device_printf(adapter->dev, "Error reading from PHY's temperature status register\n"); return (EAGAIN); } /* Get occurrence bit */ reg = !!(reg & 0x4000); return (sysctl_handle_int(oidp, 0, reg, req)); } /* ** Thermal Shutdown Trigger (internal MAC) ** - Set this to 1 to cause an overtemp event to occur */ static int ixgbe_sysctl_thermal_test(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; struct ixgbe_hw *hw = &adapter->hw; int error, fire = 0; error = sysctl_handle_int(oidp, &fire, 0, req); if ((error) || (req->newptr == NULL)) return (error); if (fire) { u32 reg = IXGBE_READ_REG(hw, IXGBE_EICS); reg |= IXGBE_EICR_TS; IXGBE_WRITE_REG(hw, IXGBE_EICS, reg); } return (0); } /* ** Manage DMA Coalescing. ** Control values: ** 0/1 - off / on (use default value of 1000) ** ** Legal timer values are: ** 50,100,250,500,1000,2000,5000,10000 ** ** Turning off interrupt moderation will also turn this off. */ static int ixgbe_sysctl_dmac(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; struct ifnet *ifp = adapter->ifp; int error; u32 newval; newval = adapter->dmac; error = sysctl_handle_int(oidp, &newval, 0, req); if ((error) || (req->newptr == NULL)) return (error); switch (newval) { case 0: /* Disabled */ adapter->dmac = 0; break; case 1: /* Enable and use default */ adapter->dmac = 1000; break; case 50: case 100: case 250: case 500: case 1000: case 2000: case 5000: case 10000: /* Legal values - allow */ adapter->dmac = newval; break; default: /* Do nothing, illegal value */ return (EINVAL); } /* Re-initialize hardware if it's already running */ if (ifp->if_drv_flags & IFF_DRV_RUNNING) ixgbe_init(adapter); return (0); } #ifdef IXGBE_DEBUG /** * Sysctl to test power states * Values: * 0 - set device to D0 * 3 - set device to D3 * (none) - get current device power state */ static int ixgbe_sysctl_power_state(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; device_t dev = adapter->dev; int curr_ps, new_ps, error = 0; curr_ps = new_ps = pci_get_powerstate(dev); error = sysctl_handle_int(oidp, &new_ps, 0, req); if ((error) || (req->newptr == NULL)) return (error); if (new_ps == curr_ps) return (0); if (new_ps == 3 && curr_ps == 0) error = DEVICE_SUSPEND(dev); else if (new_ps == 0 && curr_ps == 3) error = DEVICE_RESUME(dev); else return (EINVAL); device_printf(dev, "New state: %d\n", pci_get_powerstate(dev)); return (error); } #endif /* * Sysctl to enable/disable the WoL capability, if supported by the adapter. * Values: * 0 - disabled * 1 - enabled */ static int ixgbe_sysctl_wol_enable(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; struct ixgbe_hw *hw = &adapter->hw; int new_wol_enabled; int error = 0; new_wol_enabled = hw->wol_enabled; error = sysctl_handle_int(oidp, &new_wol_enabled, 0, req); if ((error) || (req->newptr == NULL)) return (error); new_wol_enabled = !!(new_wol_enabled); if (new_wol_enabled == hw->wol_enabled) return (0); if (new_wol_enabled > 0 && !adapter->wol_support) return (ENODEV); else hw->wol_enabled = new_wol_enabled; return (0); } /* * Sysctl to enable/disable the Energy Efficient Ethernet capability, * if supported by the adapter. * Values: * 0 - disabled * 1 - enabled */ static int ixgbe_sysctl_eee_enable(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; struct ixgbe_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; int new_eee_enabled, error = 0; new_eee_enabled = adapter->eee_enabled; error = sysctl_handle_int(oidp, &new_eee_enabled, 0, req); if ((error) || (req->newptr == NULL)) return (error); new_eee_enabled = !!(new_eee_enabled); if (new_eee_enabled == adapter->eee_enabled) return (0); if (new_eee_enabled > 0 && !hw->mac.ops.setup_eee) return (ENODEV); else adapter->eee_enabled = new_eee_enabled; /* Re-initialize hardware if it's already running */ if (ifp->if_drv_flags & IFF_DRV_RUNNING) ixgbe_init(adapter); return (0); } /* * Read-only sysctl indicating whether EEE support was negotiated * on the link. */ static int ixgbe_sysctl_eee_negotiated(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; struct ixgbe_hw *hw = &adapter->hw; bool status; status = !!(IXGBE_READ_REG(hw, IXGBE_EEE_STAT) & IXGBE_EEE_STAT_NEG); return (sysctl_handle_int(oidp, 0, status, req)); } /* * Read-only sysctl indicating whether RX Link is in LPI state. */ static int ixgbe_sysctl_eee_rx_lpi_status(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; struct ixgbe_hw *hw = &adapter->hw; bool status; status = !!(IXGBE_READ_REG(hw, IXGBE_EEE_STAT) & IXGBE_EEE_RX_LPI_STATUS); return (sysctl_handle_int(oidp, 0, status, req)); } /* * Read-only sysctl indicating whether TX Link is in LPI state. */ static int ixgbe_sysctl_eee_tx_lpi_status(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; struct ixgbe_hw *hw = &adapter->hw; bool status; status = !!(IXGBE_READ_REG(hw, IXGBE_EEE_STAT) & IXGBE_EEE_TX_LPI_STATUS); return (sysctl_handle_int(oidp, 0, status, req)); } /* * Read-only sysctl indicating TX Link LPI delay */ static int ixgbe_sysctl_eee_tx_lpi_delay(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; struct ixgbe_hw *hw = &adapter->hw; u32 reg; reg = IXGBE_READ_REG(hw, IXGBE_EEE_SU); return (sysctl_handle_int(oidp, 0, reg >> 26, req)); } /* * Sysctl to enable/disable the types of packets that the * adapter will wake up on upon receipt. * WUFC - Wake Up Filter Control * Flags: * 0x1 - Link Status Change * 0x2 - Magic Packet * 0x4 - Direct Exact * 0x8 - Directed Multicast * 0x10 - Broadcast * 0x20 - ARP/IPv4 Request Packet * 0x40 - Direct IPv4 Packet * 0x80 - Direct IPv6 Packet * * Setting another flag will cause the sysctl to return an * error. */ static int ixgbe_sysctl_wufc(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *) arg1; int error = 0; u32 new_wufc; new_wufc = adapter->wufc; error = sysctl_handle_int(oidp, &new_wufc, 0, req); if ((error) || (req->newptr == NULL)) return (error); if (new_wufc == adapter->wufc) return (0); if (new_wufc & 0xffffff00) return (EINVAL); else { new_wufc &= 0xff; new_wufc |= (0xffffff & adapter->wufc); adapter->wufc = new_wufc; } return (0); } #ifdef IXGBE_DEBUG static int ixgbe_sysctl_print_rss_config(SYSCTL_HANDLER_ARGS) { struct adapter *adapter = (struct adapter *)arg1; struct ixgbe_hw *hw = &adapter->hw; device_t dev = adapter->dev; int error = 0, reta_size; struct sbuf *buf; u32 reg; buf = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (!buf) { device_printf(dev, "Could not allocate sbuf for output.\n"); return (ENOMEM); } // TODO: use sbufs to make a string to print out /* Set multiplier for RETA setup and table size based on MAC */ switch (adapter->hw.mac.type) { case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: reta_size = 128; break; default: reta_size = 32; break; } /* Print out the redirection table */ sbuf_cat(buf, "\n"); for (int i = 0; i < reta_size; i++) { if (i < 32) { reg = IXGBE_READ_REG(hw, IXGBE_RETA(i)); sbuf_printf(buf, "RETA(%2d): 0x%08x\n", i, reg); } else { reg = IXGBE_READ_REG(hw, IXGBE_ERETA(i - 32)); sbuf_printf(buf, "ERETA(%2d): 0x%08x\n", i - 32, reg); } } // TODO: print more config error = sbuf_finish(buf); if (error) device_printf(dev, "Error finishing sbuf: %d\n", error); sbuf_delete(buf); return (0); } #endif /* IXGBE_DEBUG */ /* ** Enable the hardware to drop packets when the buffer is ** full. This is useful when multiqueue,so that no single ** queue being full stalls the entire RX engine. We only ** enable this when Multiqueue AND when Flow Control is ** disabled. */ static void ixgbe_enable_rx_drop(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; for (int i = 0; i < adapter->num_queues; i++) { struct rx_ring *rxr = &adapter->rx_rings[i]; u32 srrctl = IXGBE_READ_REG(hw, IXGBE_SRRCTL(rxr->me)); srrctl |= IXGBE_SRRCTL_DROP_EN; IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(rxr->me), srrctl); } #ifdef PCI_IOV /* enable drop for each vf */ for (int i = 0; i < adapter->num_vfs; i++) { IXGBE_WRITE_REG(hw, IXGBE_QDE, (IXGBE_QDE_WRITE | (i << IXGBE_QDE_IDX_SHIFT) | IXGBE_QDE_ENABLE)); } #endif } static void ixgbe_disable_rx_drop(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; for (int i = 0; i < adapter->num_queues; i++) { struct rx_ring *rxr = &adapter->rx_rings[i]; u32 srrctl = IXGBE_READ_REG(hw, IXGBE_SRRCTL(rxr->me)); srrctl &= ~IXGBE_SRRCTL_DROP_EN; IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(rxr->me), srrctl); } #ifdef PCI_IOV /* disable drop for each vf */ for (int i = 0; i < adapter->num_vfs; i++) { IXGBE_WRITE_REG(hw, IXGBE_QDE, (IXGBE_QDE_WRITE | (i << IXGBE_QDE_IDX_SHIFT))); } #endif } static void ixgbe_rearm_queues(struct adapter *adapter, u64 queues) { u32 mask; switch (adapter->hw.mac.type) { case ixgbe_mac_82598EB: mask = (IXGBE_EIMS_RTX_QUEUE & queues); IXGBE_WRITE_REG(&adapter->hw, IXGBE_EICS, mask); break; case ixgbe_mac_82599EB: case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: mask = (queues & 0xFFFFFFFF); IXGBE_WRITE_REG(&adapter->hw, IXGBE_EICS_EX(0), mask); mask = (queues >> 32); IXGBE_WRITE_REG(&adapter->hw, IXGBE_EICS_EX(1), mask); break; default: break; } } #ifdef PCI_IOV /* ** Support functions for SRIOV/VF management */ static void ixgbe_ping_all_vfs(struct adapter *adapter) { struct ixgbe_vf *vf; for (int i = 0; i < adapter->num_vfs; i++) { vf = &adapter->vfs[i]; if (vf->flags & IXGBE_VF_ACTIVE) ixgbe_send_vf_msg(adapter, vf, IXGBE_PF_CONTROL_MSG); } } static void ixgbe_vf_set_default_vlan(struct adapter *adapter, struct ixgbe_vf *vf, uint16_t tag) { struct ixgbe_hw *hw; uint32_t vmolr, vmvir; hw = &adapter->hw; vf->vlan_tag = tag; vmolr = IXGBE_READ_REG(hw, IXGBE_VMOLR(vf->pool)); /* Do not receive packets that pass inexact filters. */ vmolr &= ~(IXGBE_VMOLR_ROMPE | IXGBE_VMOLR_ROPE); /* Disable Multicast Promicuous Mode. */ vmolr &= ~IXGBE_VMOLR_MPE; /* Accept broadcasts. */ vmolr |= IXGBE_VMOLR_BAM; if (tag == 0) { /* Accept non-vlan tagged traffic. */ //vmolr |= IXGBE_VMOLR_AUPE; /* Allow VM to tag outgoing traffic; no default tag. */ vmvir = 0; } else { /* Require vlan-tagged traffic. */ vmolr &= ~IXGBE_VMOLR_AUPE; /* Tag all traffic with provided vlan tag. */ vmvir = (tag | IXGBE_VMVIR_VLANA_DEFAULT); } IXGBE_WRITE_REG(hw, IXGBE_VMOLR(vf->pool), vmolr); IXGBE_WRITE_REG(hw, IXGBE_VMVIR(vf->pool), vmvir); } static boolean_t ixgbe_vf_frame_size_compatible(struct adapter *adapter, struct ixgbe_vf *vf) { /* * Frame size compatibility between PF and VF is only a problem on * 82599-based cards. X540 and later support any combination of jumbo * frames on PFs and VFs. */ if (adapter->hw.mac.type != ixgbe_mac_82599EB) return (TRUE); switch (vf->api_ver) { case IXGBE_API_VER_1_0: case IXGBE_API_VER_UNKNOWN: /* * On legacy (1.0 and older) VF versions, we don't support jumbo * frames on either the PF or the VF. */ if (adapter->max_frame_size > ETHER_MAX_LEN || vf->max_frame_size > ETHER_MAX_LEN) return (FALSE); return (TRUE); break; case IXGBE_API_VER_1_1: default: /* * 1.1 or later VF versions always work if they aren't using * jumbo frames. */ if (vf->max_frame_size <= ETHER_MAX_LEN) return (TRUE); /* * Jumbo frames only work with VFs if the PF is also using jumbo * frames. */ if (adapter->max_frame_size <= ETHER_MAX_LEN) return (TRUE); return (FALSE); } } static void ixgbe_process_vf_reset(struct adapter *adapter, struct ixgbe_vf *vf) { ixgbe_vf_set_default_vlan(adapter, vf, vf->default_vlan); // XXX clear multicast addresses ixgbe_clear_rar(&adapter->hw, vf->rar_index); vf->api_ver = IXGBE_API_VER_UNKNOWN; } static void ixgbe_vf_enable_transmit(struct adapter *adapter, struct ixgbe_vf *vf) { struct ixgbe_hw *hw; uint32_t vf_index, vfte; hw = &adapter->hw; vf_index = IXGBE_VF_INDEX(vf->pool); vfte = IXGBE_READ_REG(hw, IXGBE_VFTE(vf_index)); vfte |= IXGBE_VF_BIT(vf->pool); IXGBE_WRITE_REG(hw, IXGBE_VFTE(vf_index), vfte); } static void ixgbe_vf_enable_receive(struct adapter *adapter, struct ixgbe_vf *vf) { struct ixgbe_hw *hw; uint32_t vf_index, vfre; hw = &adapter->hw; vf_index = IXGBE_VF_INDEX(vf->pool); vfre = IXGBE_READ_REG(hw, IXGBE_VFRE(vf_index)); if (ixgbe_vf_frame_size_compatible(adapter, vf)) vfre |= IXGBE_VF_BIT(vf->pool); else vfre &= ~IXGBE_VF_BIT(vf->pool); IXGBE_WRITE_REG(hw, IXGBE_VFRE(vf_index), vfre); } static void ixgbe_vf_reset_msg(struct adapter *adapter, struct ixgbe_vf *vf, uint32_t *msg) { struct ixgbe_hw *hw; uint32_t ack; uint32_t resp[IXGBE_VF_PERMADDR_MSG_LEN]; hw = &adapter->hw; ixgbe_process_vf_reset(adapter, vf); if (ixgbe_validate_mac_addr(vf->ether_addr) == 0) { ixgbe_set_rar(&adapter->hw, vf->rar_index, vf->ether_addr, vf->pool, TRUE); ack = IXGBE_VT_MSGTYPE_ACK; } else ack = IXGBE_VT_MSGTYPE_NACK; ixgbe_vf_enable_transmit(adapter, vf); ixgbe_vf_enable_receive(adapter, vf); vf->flags |= IXGBE_VF_CTS; resp[0] = IXGBE_VF_RESET | ack | IXGBE_VT_MSGTYPE_CTS; bcopy(vf->ether_addr, &resp[1], ETHER_ADDR_LEN); resp[3] = hw->mac.mc_filter_type; ixgbe_write_mbx(hw, resp, IXGBE_VF_PERMADDR_MSG_LEN, vf->pool); } static void ixgbe_vf_set_mac(struct adapter *adapter, struct ixgbe_vf *vf, uint32_t *msg) { uint8_t *mac; mac = (uint8_t*)&msg[1]; /* Check that the VF has permission to change the MAC address. */ if (!(vf->flags & IXGBE_VF_CAP_MAC) && ixgbe_vf_mac_changed(vf, mac)) { ixgbe_send_vf_nack(adapter, vf, msg[0]); return; } if (ixgbe_validate_mac_addr(mac) != 0) { ixgbe_send_vf_nack(adapter, vf, msg[0]); return; } bcopy(mac, vf->ether_addr, ETHER_ADDR_LEN); ixgbe_set_rar(&adapter->hw, vf->rar_index, vf->ether_addr, vf->pool, TRUE); ixgbe_send_vf_ack(adapter, vf, msg[0]); } /* ** VF multicast addresses are set by using the appropriate bit in ** 1 of 128 32 bit addresses (4096 possible). */ static void ixgbe_vf_set_mc_addr(struct adapter *adapter, struct ixgbe_vf *vf, u32 *msg) { u16 *list = (u16*)&msg[1]; int entries; u32 vmolr, vec_bit, vec_reg, mta_reg; entries = (msg[0] & IXGBE_VT_MSGINFO_MASK) >> IXGBE_VT_MSGINFO_SHIFT; entries = min(entries, IXGBE_MAX_VF_MC); vmolr = IXGBE_READ_REG(&adapter->hw, IXGBE_VMOLR(vf->pool)); vf->num_mc_hashes = entries; /* Set the appropriate MTA bit */ for (int i = 0; i < entries; i++) { vf->mc_hash[i] = list[i]; vec_reg = (vf->mc_hash[i] >> 5) & 0x7F; vec_bit = vf->mc_hash[i] & 0x1F; mta_reg = IXGBE_READ_REG(&adapter->hw, IXGBE_MTA(vec_reg)); mta_reg |= (1 << vec_bit); IXGBE_WRITE_REG(&adapter->hw, IXGBE_MTA(vec_reg), mta_reg); } vmolr |= IXGBE_VMOLR_ROMPE; IXGBE_WRITE_REG(&adapter->hw, IXGBE_VMOLR(vf->pool), vmolr); ixgbe_send_vf_ack(adapter, vf, msg[0]); return; } static void ixgbe_vf_set_vlan(struct adapter *adapter, struct ixgbe_vf *vf, uint32_t *msg) { struct ixgbe_hw *hw; int enable; uint16_t tag; hw = &adapter->hw; enable = IXGBE_VT_MSGINFO(msg[0]); tag = msg[1] & IXGBE_VLVF_VLANID_MASK; if (!(vf->flags & IXGBE_VF_CAP_VLAN)) { ixgbe_send_vf_nack(adapter, vf, msg[0]); return; } /* It is illegal to enable vlan tag 0. */ if (tag == 0 && enable != 0){ ixgbe_send_vf_nack(adapter, vf, msg[0]); return; } ixgbe_set_vfta(hw, tag, vf->pool, enable); ixgbe_send_vf_ack(adapter, vf, msg[0]); } static void ixgbe_vf_set_lpe(struct adapter *adapter, struct ixgbe_vf *vf, uint32_t *msg) { struct ixgbe_hw *hw; uint32_t vf_max_size, pf_max_size, mhadd; hw = &adapter->hw; vf_max_size = msg[1]; if (vf_max_size < ETHER_CRC_LEN) { /* We intentionally ACK invalid LPE requests. */ ixgbe_send_vf_ack(adapter, vf, msg[0]); return; } vf_max_size -= ETHER_CRC_LEN; if (vf_max_size > IXGBE_MAX_FRAME_SIZE) { /* We intentionally ACK invalid LPE requests. */ ixgbe_send_vf_ack(adapter, vf, msg[0]); return; } vf->max_frame_size = vf_max_size; ixgbe_update_max_frame(adapter, vf->max_frame_size); /* * We might have to disable reception to this VF if the frame size is * not compatible with the config on the PF. */ ixgbe_vf_enable_receive(adapter, vf); mhadd = IXGBE_READ_REG(hw, IXGBE_MHADD); pf_max_size = (mhadd & IXGBE_MHADD_MFS_MASK) >> IXGBE_MHADD_MFS_SHIFT; if (pf_max_size < adapter->max_frame_size) { mhadd &= ~IXGBE_MHADD_MFS_MASK; mhadd |= adapter->max_frame_size << IXGBE_MHADD_MFS_SHIFT; IXGBE_WRITE_REG(hw, IXGBE_MHADD, mhadd); } ixgbe_send_vf_ack(adapter, vf, msg[0]); } static void ixgbe_vf_set_macvlan(struct adapter *adapter, struct ixgbe_vf *vf, uint32_t *msg) { //XXX implement this ixgbe_send_vf_nack(adapter, vf, msg[0]); } static void ixgbe_vf_api_negotiate(struct adapter *adapter, struct ixgbe_vf *vf, uint32_t *msg) { switch (msg[1]) { case IXGBE_API_VER_1_0: case IXGBE_API_VER_1_1: vf->api_ver = msg[1]; ixgbe_send_vf_ack(adapter, vf, msg[0]); break; default: vf->api_ver = IXGBE_API_VER_UNKNOWN; ixgbe_send_vf_nack(adapter, vf, msg[0]); break; } } static void ixgbe_vf_get_queues(struct adapter *adapter, struct ixgbe_vf *vf, uint32_t *msg) { struct ixgbe_hw *hw; uint32_t resp[IXGBE_VF_GET_QUEUES_RESP_LEN]; int num_queues; hw = &adapter->hw; /* GET_QUEUES is not supported on pre-1.1 APIs. */ switch (msg[0]) { case IXGBE_API_VER_1_0: case IXGBE_API_VER_UNKNOWN: ixgbe_send_vf_nack(adapter, vf, msg[0]); return; } resp[0] = IXGBE_VF_GET_QUEUES | IXGBE_VT_MSGTYPE_ACK | IXGBE_VT_MSGTYPE_CTS; num_queues = ixgbe_vf_queues(ixgbe_get_iov_mode(adapter)); resp[IXGBE_VF_TX_QUEUES] = num_queues; resp[IXGBE_VF_RX_QUEUES] = num_queues; resp[IXGBE_VF_TRANS_VLAN] = (vf->default_vlan != 0); resp[IXGBE_VF_DEF_QUEUE] = 0; ixgbe_write_mbx(hw, resp, IXGBE_VF_GET_QUEUES_RESP_LEN, vf->pool); } static void ixgbe_process_vf_msg(struct adapter *adapter, struct ixgbe_vf *vf) { struct ixgbe_hw *hw; uint32_t msg[IXGBE_VFMAILBOX_SIZE]; int error; hw = &adapter->hw; error = ixgbe_read_mbx(hw, msg, IXGBE_VFMAILBOX_SIZE, vf->pool); if (error != 0) return; CTR3(KTR_MALLOC, "%s: received msg %x from %d", adapter->ifp->if_xname, msg[0], vf->pool); if (msg[0] == IXGBE_VF_RESET) { ixgbe_vf_reset_msg(adapter, vf, msg); return; } if (!(vf->flags & IXGBE_VF_CTS)) { ixgbe_send_vf_nack(adapter, vf, msg[0]); return; } switch (msg[0] & IXGBE_VT_MSG_MASK) { case IXGBE_VF_SET_MAC_ADDR: ixgbe_vf_set_mac(adapter, vf, msg); break; case IXGBE_VF_SET_MULTICAST: ixgbe_vf_set_mc_addr(adapter, vf, msg); break; case IXGBE_VF_SET_VLAN: ixgbe_vf_set_vlan(adapter, vf, msg); break; case IXGBE_VF_SET_LPE: ixgbe_vf_set_lpe(adapter, vf, msg); break; case IXGBE_VF_SET_MACVLAN: ixgbe_vf_set_macvlan(adapter, vf, msg); break; case IXGBE_VF_API_NEGOTIATE: ixgbe_vf_api_negotiate(adapter, vf, msg); break; case IXGBE_VF_GET_QUEUES: ixgbe_vf_get_queues(adapter, vf, msg); break; default: ixgbe_send_vf_nack(adapter, vf, msg[0]); } } /* * Tasklet for handling VF -> PF mailbox messages. */ static void ixgbe_handle_mbx(void *context, int pending) { struct adapter *adapter; struct ixgbe_hw *hw; struct ixgbe_vf *vf; int i; adapter = context; hw = &adapter->hw; IXGBE_CORE_LOCK(adapter); for (i = 0; i < adapter->num_vfs; i++) { vf = &adapter->vfs[i]; if (vf->flags & IXGBE_VF_ACTIVE) { if (ixgbe_check_for_rst(hw, vf->pool) == 0) ixgbe_process_vf_reset(adapter, vf); if (ixgbe_check_for_msg(hw, vf->pool) == 0) ixgbe_process_vf_msg(adapter, vf); if (ixgbe_check_for_ack(hw, vf->pool) == 0) ixgbe_process_vf_ack(adapter, vf); } } IXGBE_CORE_UNLOCK(adapter); } static int ixgbe_init_iov(device_t dev, u16 num_vfs, const nvlist_t *config) { struct adapter *adapter; enum ixgbe_iov_mode mode; adapter = device_get_softc(dev); adapter->num_vfs = num_vfs; mode = ixgbe_get_iov_mode(adapter); if (num_vfs > ixgbe_max_vfs(mode)) { adapter->num_vfs = 0; return (ENOSPC); } IXGBE_CORE_LOCK(adapter); adapter->vfs = malloc(sizeof(*adapter->vfs) * num_vfs, M_IXGBE, M_NOWAIT | M_ZERO); if (adapter->vfs == NULL) { adapter->num_vfs = 0; IXGBE_CORE_UNLOCK(adapter); return (ENOMEM); } ixgbe_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); return (0); } static void ixgbe_uninit_iov(device_t dev) { struct ixgbe_hw *hw; struct adapter *adapter; uint32_t pf_reg, vf_reg; adapter = device_get_softc(dev); hw = &adapter->hw; IXGBE_CORE_LOCK(adapter); /* Enable rx/tx for the PF and disable it for all VFs. */ pf_reg = IXGBE_VF_INDEX(adapter->pool); IXGBE_WRITE_REG(hw, IXGBE_VFRE(pf_reg), IXGBE_VF_BIT(adapter->pool)); IXGBE_WRITE_REG(hw, IXGBE_VFTE(pf_reg), IXGBE_VF_BIT(adapter->pool)); if (pf_reg == 0) vf_reg = 1; else vf_reg = 0; IXGBE_WRITE_REG(hw, IXGBE_VFRE(vf_reg), 0); IXGBE_WRITE_REG(hw, IXGBE_VFTE(vf_reg), 0); IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, 0); free(adapter->vfs, M_IXGBE); adapter->vfs = NULL; adapter->num_vfs = 0; IXGBE_CORE_UNLOCK(adapter); } static void ixgbe_initialize_iov(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; uint32_t mrqc, mtqc, vt_ctl, vf_reg, gcr_ext, gpie; enum ixgbe_iov_mode mode; int i; mode = ixgbe_get_iov_mode(adapter); if (mode == IXGBE_NO_VM) return; IXGBE_CORE_LOCK_ASSERT(adapter); mrqc = IXGBE_READ_REG(hw, IXGBE_MRQC); mrqc &= ~IXGBE_MRQC_MRQE_MASK; switch (mode) { case IXGBE_64_VM: mrqc |= IXGBE_MRQC_VMDQRSS64EN; break; case IXGBE_32_VM: mrqc |= IXGBE_MRQC_VMDQRSS32EN; break; default: panic("Unexpected SR-IOV mode %d", mode); } IXGBE_WRITE_REG(hw, IXGBE_MRQC, mrqc); mtqc = IXGBE_MTQC_VT_ENA; switch (mode) { case IXGBE_64_VM: mtqc |= IXGBE_MTQC_64VF; break; case IXGBE_32_VM: mtqc |= IXGBE_MTQC_32VF; break; default: panic("Unexpected SR-IOV mode %d", mode); } IXGBE_WRITE_REG(hw, IXGBE_MTQC, mtqc); gcr_ext = IXGBE_READ_REG(hw, IXGBE_GCR_EXT); gcr_ext |= IXGBE_GCR_EXT_MSIX_EN; gcr_ext &= ~IXGBE_GCR_EXT_VT_MODE_MASK; switch (mode) { case IXGBE_64_VM: gcr_ext |= IXGBE_GCR_EXT_VT_MODE_64; break; case IXGBE_32_VM: gcr_ext |= IXGBE_GCR_EXT_VT_MODE_32; break; default: panic("Unexpected SR-IOV mode %d", mode); } IXGBE_WRITE_REG(hw, IXGBE_GCR_EXT, gcr_ext); gpie = IXGBE_READ_REG(hw, IXGBE_GPIE); gcr_ext &= ~IXGBE_GPIE_VTMODE_MASK; switch (mode) { case IXGBE_64_VM: gpie |= IXGBE_GPIE_VTMODE_64; break; case IXGBE_32_VM: gpie |= IXGBE_GPIE_VTMODE_32; break; default: panic("Unexpected SR-IOV mode %d", mode); } IXGBE_WRITE_REG(hw, IXGBE_GPIE, gpie); /* Enable rx/tx for the PF. */ vf_reg = IXGBE_VF_INDEX(adapter->pool); IXGBE_WRITE_REG(hw, IXGBE_VFRE(vf_reg), IXGBE_VF_BIT(adapter->pool)); IXGBE_WRITE_REG(hw, IXGBE_VFTE(vf_reg), IXGBE_VF_BIT(adapter->pool)); /* Allow VM-to-VM communication. */ IXGBE_WRITE_REG(hw, IXGBE_PFDTXGSWC, IXGBE_PFDTXGSWC_VT_LBEN); vt_ctl = IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN; vt_ctl |= (adapter->pool << IXGBE_VT_CTL_POOL_SHIFT); IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, vt_ctl); for (i = 0; i < adapter->num_vfs; i++) ixgbe_init_vf(adapter, &adapter->vfs[i]); } /* ** Check the max frame setting of all active VF's */ static void ixgbe_recalculate_max_frame(struct adapter *adapter) { struct ixgbe_vf *vf; IXGBE_CORE_LOCK_ASSERT(adapter); for (int i = 0; i < adapter->num_vfs; i++) { vf = &adapter->vfs[i]; if (vf->flags & IXGBE_VF_ACTIVE) ixgbe_update_max_frame(adapter, vf->max_frame_size); } } static void ixgbe_init_vf(struct adapter *adapter, struct ixgbe_vf *vf) { struct ixgbe_hw *hw; uint32_t vf_index, pfmbimr; IXGBE_CORE_LOCK_ASSERT(adapter); hw = &adapter->hw; if (!(vf->flags & IXGBE_VF_ACTIVE)) return; vf_index = IXGBE_VF_INDEX(vf->pool); pfmbimr = IXGBE_READ_REG(hw, IXGBE_PFMBIMR(vf_index)); pfmbimr |= IXGBE_VF_BIT(vf->pool); IXGBE_WRITE_REG(hw, IXGBE_PFMBIMR(vf_index), pfmbimr); ixgbe_vf_set_default_vlan(adapter, vf, vf->vlan_tag); // XXX multicast addresses if (ixgbe_validate_mac_addr(vf->ether_addr) == 0) { ixgbe_set_rar(&adapter->hw, vf->rar_index, vf->ether_addr, vf->pool, TRUE); } ixgbe_vf_enable_transmit(adapter, vf); ixgbe_vf_enable_receive(adapter, vf); ixgbe_send_vf_msg(adapter, vf, IXGBE_PF_CONTROL_MSG); } static int ixgbe_add_vf(device_t dev, u16 vfnum, const nvlist_t *config) { struct adapter *adapter; struct ixgbe_vf *vf; const void *mac; adapter = device_get_softc(dev); KASSERT(vfnum < adapter->num_vfs, ("VF index %d is out of range %d", vfnum, adapter->num_vfs)); IXGBE_CORE_LOCK(adapter); vf = &adapter->vfs[vfnum]; vf->pool= vfnum; /* RAR[0] is used by the PF so use vfnum + 1 for VF RAR. */ vf->rar_index = vfnum + 1; vf->default_vlan = 0; vf->max_frame_size = ETHER_MAX_LEN; ixgbe_update_max_frame(adapter, vf->max_frame_size); if (nvlist_exists_binary(config, "mac-addr")) { mac = nvlist_get_binary(config, "mac-addr", NULL); bcopy(mac, vf->ether_addr, ETHER_ADDR_LEN); if (nvlist_get_bool(config, "allow-set-mac")) vf->flags |= IXGBE_VF_CAP_MAC; } else /* * If the administrator has not specified a MAC address then * we must allow the VF to choose one. */ vf->flags |= IXGBE_VF_CAP_MAC; vf->flags = IXGBE_VF_ACTIVE; ixgbe_init_vf(adapter, vf); IXGBE_CORE_UNLOCK(adapter); return (0); } #endif /* PCI_IOV */ Index: head/sys/dev/ixgbe/if_ixv.c =================================================================== --- head/sys/dev/ixgbe/if_ixv.c (revision 294326) +++ head/sys/dev/ixgbe/if_ixv.c (revision 294327) @@ -1,2203 +1,2203 @@ /****************************************************************************** Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ /*$FreeBSD$*/ #ifndef IXGBE_STANDALONE_BUILD #include "opt_inet.h" #include "opt_inet6.h" #endif #include "ixgbe.h" /********************************************************************* * Driver version *********************************************************************/ char ixv_driver_version[] = "1.4.6-k"; /********************************************************************* * PCI Device ID Table * * Used by probe to select devices to load on * Last field stores an index into ixv_strings * Last entry must be all 0s * * { Vendor ID, Device ID, SubVendor ID, SubDevice ID, String Index } *********************************************************************/ static ixgbe_vendor_info_t ixv_vendor_info_array[] = { {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_82599_VF, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X540_VF, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X550_VF, 0, 0, 0}, {IXGBE_INTEL_VENDOR_ID, IXGBE_DEV_ID_X550EM_X_VF, 0, 0, 0}, /* required last entry */ {0, 0, 0, 0, 0} }; /********************************************************************* * Table of branding strings *********************************************************************/ static char *ixv_strings[] = { "Intel(R) PRO/10GbE Virtual Function Network Driver" }; /********************************************************************* * Function prototypes *********************************************************************/ static int ixv_probe(device_t); static int ixv_attach(device_t); static int ixv_detach(device_t); static int ixv_shutdown(device_t); static int ixv_ioctl(struct ifnet *, u_long, caddr_t); static void ixv_init(void *); static void ixv_init_locked(struct adapter *); static void ixv_stop(void *); static void ixv_media_status(struct ifnet *, struct ifmediareq *); static int ixv_media_change(struct ifnet *); static void ixv_identify_hardware(struct adapter *); static int ixv_allocate_pci_resources(struct adapter *); static int ixv_allocate_msix(struct adapter *); static int ixv_setup_msix(struct adapter *); static void ixv_free_pci_resources(struct adapter *); static void ixv_local_timer(void *); static void ixv_setup_interface(device_t, struct adapter *); static void ixv_config_link(struct adapter *); static void ixv_initialize_transmit_units(struct adapter *); static void ixv_initialize_receive_units(struct adapter *); static void ixv_enable_intr(struct adapter *); static void ixv_disable_intr(struct adapter *); static void ixv_set_multi(struct adapter *); static void ixv_update_link_status(struct adapter *); static int ixv_sysctl_debug(SYSCTL_HANDLER_ARGS); static void ixv_set_ivar(struct adapter *, u8, u8, s8); static void ixv_configure_ivars(struct adapter *); static u8 * ixv_mc_array_itr(struct ixgbe_hw *, u8 **, u32 *); static void ixv_setup_vlan_support(struct adapter *); static void ixv_register_vlan(void *, struct ifnet *, u16); static void ixv_unregister_vlan(void *, struct ifnet *, u16); static void ixv_save_stats(struct adapter *); static void ixv_init_stats(struct adapter *); static void ixv_update_stats(struct adapter *); static void ixv_add_stats_sysctls(struct adapter *); static void ixv_set_sysctl_value(struct adapter *, const char *, const char *, int *, int); /* The MSI/X Interrupt handlers */ static void ixv_msix_que(void *); static void ixv_msix_mbx(void *); /* Deferred interrupt tasklets */ static void ixv_handle_que(void *, int); static void ixv_handle_mbx(void *, int); #ifdef DEV_NETMAP /* * This is defined in , which is included by * if_ix.c. */ extern void ixgbe_netmap_attach(struct adapter *adapter); #include #include #include #endif /* DEV_NETMAP */ /********************************************************************* * FreeBSD Device Interface Entry Points *********************************************************************/ static device_method_t ixv_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ixv_probe), DEVMETHOD(device_attach, ixv_attach), DEVMETHOD(device_detach, ixv_detach), DEVMETHOD(device_shutdown, ixv_shutdown), DEVMETHOD_END }; static driver_t ixv_driver = { "ixv", ixv_methods, sizeof(struct adapter), }; devclass_t ixv_devclass; DRIVER_MODULE(ixv, pci, ixv_driver, ixv_devclass, 0, 0); MODULE_DEPEND(ixv, pci, 1, 1, 1); MODULE_DEPEND(ixv, ether, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(ix, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ /* XXX depend on 'ix' ? */ /* ** TUNEABLE PARAMETERS: */ /* Number of Queues - do not exceed MSIX vectors - 1 */ static int ixv_num_queues = 1; TUNABLE_INT("hw.ixv.num_queues", &ixv_num_queues); /* ** AIM: Adaptive Interrupt Moderation ** which means that the interrupt rate ** is varied over time based on the ** traffic for that interrupt vector */ static int ixv_enable_aim = FALSE; TUNABLE_INT("hw.ixv.enable_aim", &ixv_enable_aim); /* How many packets rxeof tries to clean at a time */ static int ixv_rx_process_limit = 256; TUNABLE_INT("hw.ixv.rx_process_limit", &ixv_rx_process_limit); /* How many packets txeof tries to clean at a time */ static int ixv_tx_process_limit = 256; TUNABLE_INT("hw.ixv.tx_process_limit", &ixv_tx_process_limit); /* Flow control setting, default to full */ static int ixv_flow_control = ixgbe_fc_full; TUNABLE_INT("hw.ixv.flow_control", &ixv_flow_control); /* * Header split: this causes the hardware to DMA * the header into a seperate mbuf from the payload, * it can be a performance win in some workloads, but * in others it actually hurts, its off by default. */ static int ixv_header_split = FALSE; TUNABLE_INT("hw.ixv.hdr_split", &ixv_header_split); /* ** Number of TX descriptors per ring, ** setting higher than RX as this seems ** the better performing choice. */ static int ixv_txd = DEFAULT_TXD; TUNABLE_INT("hw.ixv.txd", &ixv_txd); /* Number of RX descriptors per ring */ static int ixv_rxd = DEFAULT_RXD; TUNABLE_INT("hw.ixv.rxd", &ixv_rxd); /* ** Shadow VFTA table, this is needed because ** the real filter table gets cleared during ** a soft reset and we need to repopulate it. */ static u32 ixv_shadow_vfta[IXGBE_VFTA_SIZE]; /********************************************************************* * Device identification routine * * ixv_probe determines if the driver should be loaded on * adapter based on PCI vendor/device id of the adapter. * * return BUS_PROBE_DEFAULT on success, positive on failure *********************************************************************/ static int ixv_probe(device_t dev) { ixgbe_vendor_info_t *ent; u16 pci_vendor_id = 0; u16 pci_device_id = 0; u16 pci_subvendor_id = 0; u16 pci_subdevice_id = 0; char adapter_name[256]; pci_vendor_id = pci_get_vendor(dev); if (pci_vendor_id != IXGBE_INTEL_VENDOR_ID) return (ENXIO); pci_device_id = pci_get_device(dev); pci_subvendor_id = pci_get_subvendor(dev); pci_subdevice_id = pci_get_subdevice(dev); ent = ixv_vendor_info_array; while (ent->vendor_id != 0) { if ((pci_vendor_id == ent->vendor_id) && (pci_device_id == ent->device_id) && ((pci_subvendor_id == ent->subvendor_id) || (ent->subvendor_id == 0)) && ((pci_subdevice_id == ent->subdevice_id) || (ent->subdevice_id == 0))) { sprintf(adapter_name, "%s, Version - %s", ixv_strings[ent->index], ixv_driver_version); device_set_desc_copy(dev, adapter_name); return (BUS_PROBE_DEFAULT); } ent++; } return (ENXIO); } /********************************************************************* * Device initialization routine * * The attach entry point is called when the driver is being loaded. * This routine identifies the type of hardware, allocates all resources * and initializes the hardware. * * return 0 on success, positive on failure *********************************************************************/ static int ixv_attach(device_t dev) { struct adapter *adapter; struct ixgbe_hw *hw; int error = 0; INIT_DEBUGOUT("ixv_attach: begin"); /* Allocate, clear, and link in our adapter structure */ adapter = device_get_softc(dev); adapter->dev = dev; hw = &adapter->hw; #ifdef DEV_NETMAP adapter->init_locked = ixv_init_locked; adapter->stop_locked = ixv_stop; #endif /* Core Lock Init*/ IXGBE_CORE_LOCK_INIT(adapter, device_get_nameunit(dev)); /* SYSCTL APIs */ SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "debug", CTLTYPE_INT | CTLFLAG_RW, adapter, 0, ixv_sysctl_debug, "I", "Debug Info"); SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "enable_aim", CTLFLAG_RW, &ixv_enable_aim, 1, "Interrupt Moderation"); /* Set up the timer callout */ callout_init_mtx(&adapter->timer, &adapter->core_mtx, 0); /* Determine hardware revision */ ixv_identify_hardware(adapter); /* Do base PCI setup - map BAR0 */ if (ixv_allocate_pci_resources(adapter)) { device_printf(dev, "ixv_allocate_pci_resources() failed!\n"); error = ENXIO; goto err_out; } /* Sysctls for limiting the amount of work done in the taskqueues */ ixv_set_sysctl_value(adapter, "rx_processing_limit", "max number of rx packets to process", &adapter->rx_process_limit, ixv_rx_process_limit); ixv_set_sysctl_value(adapter, "tx_processing_limit", "max number of tx packets to process", &adapter->tx_process_limit, ixv_tx_process_limit); /* Do descriptor calc and sanity checks */ if (((ixv_txd * sizeof(union ixgbe_adv_tx_desc)) % DBA_ALIGN) != 0 || ixv_txd < MIN_TXD || ixv_txd > MAX_TXD) { device_printf(dev, "TXD config issue, using default!\n"); adapter->num_tx_desc = DEFAULT_TXD; } else adapter->num_tx_desc = ixv_txd; if (((ixv_rxd * sizeof(union ixgbe_adv_rx_desc)) % DBA_ALIGN) != 0 || ixv_rxd < MIN_RXD || ixv_rxd > MAX_RXD) { device_printf(dev, "RXD config issue, using default!\n"); adapter->num_rx_desc = DEFAULT_RXD; } else adapter->num_rx_desc = ixv_rxd; /* Allocate our TX/RX Queues */ if (ixgbe_allocate_queues(adapter)) { device_printf(dev, "ixgbe_allocate_queues() failed!\n"); error = ENOMEM; goto err_out; } /* ** Initialize the shared code: its ** at this point the mac type is set. */ error = ixgbe_init_shared_code(hw); if (error) { device_printf(dev, "ixgbe_init_shared_code() failed!\n"); error = EIO; goto err_late; } /* Setup the mailbox */ ixgbe_init_mbx_params_vf(hw); /* Reset mbox api to 1.0 */ error = ixgbe_reset_hw(hw); if (error == IXGBE_ERR_RESET_FAILED) device_printf(dev, "ixgbe_reset_hw() failure: Reset Failed!\n"); else if (error) device_printf(dev, "ixgbe_reset_hw() failed with error %d\n", error); if (error) { error = EIO; goto err_late; } /* Negotiate mailbox API version */ error = ixgbevf_negotiate_api_version(hw, ixgbe_mbox_api_11); if (error) { device_printf(dev, "MBX API 1.1 negotiation failed! Error %d\n", error); error = EIO; goto err_late; } error = ixgbe_init_hw(hw); if (error) { device_printf(dev, "ixgbe_init_hw() failed!\n"); error = EIO; goto err_late; } error = ixv_allocate_msix(adapter); if (error) { device_printf(dev, "ixv_allocate_msix() failed!\n"); goto err_late; } /* If no mac address was assigned, make a random one */ if (!ixv_check_ether_addr(hw->mac.addr)) { u8 addr[ETHER_ADDR_LEN]; arc4rand(&addr, sizeof(addr), 0); addr[0] &= 0xFE; addr[0] |= 0x02; bcopy(addr, hw->mac.addr, sizeof(addr)); } /* Setup OS specific network interface */ ixv_setup_interface(dev, adapter); /* Do the stats setup */ ixv_save_stats(adapter); ixv_init_stats(adapter); ixv_add_stats_sysctls(adapter); /* Register for VLAN events */ adapter->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, ixv_register_vlan, adapter, EVENTHANDLER_PRI_FIRST); adapter->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, ixv_unregister_vlan, adapter, EVENTHANDLER_PRI_FIRST); #ifdef DEV_NETMAP ixgbe_netmap_attach(adapter); #endif /* DEV_NETMAP */ INIT_DEBUGOUT("ixv_attach: end"); return (0); err_late: ixgbe_free_transmit_structures(adapter); ixgbe_free_receive_structures(adapter); err_out: ixv_free_pci_resources(adapter); return (error); } /********************************************************************* * Device removal routine * * The detach entry point is called when the driver is being removed. * This routine stops the adapter and deallocates all the resources * that were allocated for driver operation. * * return 0 on success, positive on failure *********************************************************************/ static int ixv_detach(device_t dev) { struct adapter *adapter = device_get_softc(dev); struct ix_queue *que = adapter->queues; INIT_DEBUGOUT("ixv_detach: begin"); /* Make sure VLANS are not using driver */ if (adapter->ifp->if_vlantrunk != NULL) { device_printf(dev, "Vlan in use, detach first\n"); return (EBUSY); } IXGBE_CORE_LOCK(adapter); ixv_stop(adapter); IXGBE_CORE_UNLOCK(adapter); for (int i = 0; i < adapter->num_queues; i++, que++) { if (que->tq) { struct tx_ring *txr = que->txr; taskqueue_drain(que->tq, &txr->txq_task); taskqueue_drain(que->tq, &que->que_task); taskqueue_free(que->tq); } } /* Drain the Mailbox(link) queue */ if (adapter->tq) { taskqueue_drain(adapter->tq, &adapter->link_task); taskqueue_free(adapter->tq); } /* Unregister VLAN events */ if (adapter->vlan_attach != NULL) EVENTHANDLER_DEREGISTER(vlan_config, adapter->vlan_attach); if (adapter->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, adapter->vlan_detach); ether_ifdetach(adapter->ifp); callout_drain(&adapter->timer); #ifdef DEV_NETMAP netmap_detach(adapter->ifp); #endif /* DEV_NETMAP */ ixv_free_pci_resources(adapter); bus_generic_detach(dev); if_free(adapter->ifp); ixgbe_free_transmit_structures(adapter); ixgbe_free_receive_structures(adapter); IXGBE_CORE_LOCK_DESTROY(adapter); return (0); } /********************************************************************* * * Shutdown entry point * **********************************************************************/ static int ixv_shutdown(device_t dev) { struct adapter *adapter = device_get_softc(dev); IXGBE_CORE_LOCK(adapter); ixv_stop(adapter); IXGBE_CORE_UNLOCK(adapter); return (0); } /********************************************************************* * Ioctl entry point * * ixv_ioctl is called when the user wants to configure the * interface. * * return 0 on success, positive on failure **********************************************************************/ static int ixv_ioctl(struct ifnet * ifp, u_long command, caddr_t data) { struct adapter *adapter = ifp->if_softc; struct ifreq *ifr = (struct ifreq *) data; #if defined(INET) || defined(INET6) struct ifaddr *ifa = (struct ifaddr *) data; bool avoid_reset = FALSE; #endif int error = 0; switch (command) { case SIOCSIFADDR: #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) avoid_reset = TRUE; #endif #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) avoid_reset = TRUE; #endif #if defined(INET) || defined(INET6) /* ** Calling init results in link renegotiation, ** so we avoid doing it when possible. */ if (avoid_reset) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) ixv_init(adapter); if (!(ifp->if_flags & IFF_NOARP)) arp_ifinit(ifp, ifa); } else error = ether_ioctl(ifp, command, data); break; #endif case SIOCSIFMTU: IOCTL_DEBUGOUT("ioctl: SIOCSIFMTU (Set Interface MTU)"); if (ifr->ifr_mtu > IXGBE_MAX_FRAME_SIZE - IXGBE_MTU_HDR) { error = EINVAL; } else { IXGBE_CORE_LOCK(adapter); ifp->if_mtu = ifr->ifr_mtu; adapter->max_frame_size = ifp->if_mtu + IXGBE_MTU_HDR; ixv_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); } break; case SIOCSIFFLAGS: IOCTL_DEBUGOUT("ioctl: SIOCSIFFLAGS (Set Interface Flags)"); IXGBE_CORE_LOCK(adapter); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) ixv_init_locked(adapter); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) ixv_stop(adapter); adapter->if_flags = ifp->if_flags; IXGBE_CORE_UNLOCK(adapter); break; case SIOCADDMULTI: case SIOCDELMULTI: IOCTL_DEBUGOUT("ioctl: SIOC(ADD|DEL)MULTI"); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IXGBE_CORE_LOCK(adapter); ixv_disable_intr(adapter); ixv_set_multi(adapter); ixv_enable_intr(adapter); IXGBE_CORE_UNLOCK(adapter); } break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: IOCTL_DEBUGOUT("ioctl: SIOCxIFMEDIA (Get/Set Interface Media)"); error = ifmedia_ioctl(ifp, ifr, &adapter->media, command); break; case SIOCSIFCAP: { int mask = ifr->ifr_reqcap ^ ifp->if_capenable; IOCTL_DEBUGOUT("ioctl: SIOCSIFCAP (Set Capabilities)"); if (mask & IFCAP_HWCSUM) ifp->if_capenable ^= IFCAP_HWCSUM; if (mask & IFCAP_TSO4) ifp->if_capenable ^= IFCAP_TSO4; if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { IXGBE_CORE_LOCK(adapter); ixv_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); } VLAN_CAPABILITIES(ifp); break; } default: IOCTL_DEBUGOUT1("ioctl: UNKNOWN (0x%X)\n", (int)command); error = ether_ioctl(ifp, command, data); break; } return (error); } /********************************************************************* * Init entry point * * This routine is used in two ways. It is used by the stack as * init entry point in network interface structure. It is also used * by the driver as a hw/sw initialization routine to get to a * consistent state. * * return 0 on success, positive on failure **********************************************************************/ #define IXGBE_MHADD_MFS_SHIFT 16 static void ixv_init_locked(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; struct ixgbe_hw *hw = &adapter->hw; int error = 0; INIT_DEBUGOUT("ixv_init_locked: begin"); mtx_assert(&adapter->core_mtx, MA_OWNED); hw->adapter_stopped = FALSE; ixgbe_stop_adapter(hw); callout_stop(&adapter->timer); /* reprogram the RAR[0] in case user changed it. */ ixgbe_set_rar(hw, 0, hw->mac.addr, 0, IXGBE_RAH_AV); /* Get the latest mac address, User can use a LAA */ bcopy(IF_LLADDR(adapter->ifp), hw->mac.addr, IXGBE_ETH_LENGTH_OF_ADDRESS); ixgbe_set_rar(hw, 0, hw->mac.addr, 0, 1); hw->addr_ctrl.rar_used_count = 1; /* Prepare transmit descriptors and buffers */ if (ixgbe_setup_transmit_structures(adapter)) { device_printf(dev, "Could not setup transmit structures\n"); ixv_stop(adapter); return; } /* Reset VF and renegotiate mailbox API version */ ixgbe_reset_hw(hw); error = ixgbevf_negotiate_api_version(hw, ixgbe_mbox_api_11); if (error) device_printf(dev, "MBX API 1.1 negotiation failed! Error %d\n", error); ixv_initialize_transmit_units(adapter); /* Setup Multicast table */ ixv_set_multi(adapter); /* ** Determine the correct mbuf pool ** for doing jumbo/headersplit */ if (ifp->if_mtu > ETHERMTU) adapter->rx_mbuf_sz = MJUMPAGESIZE; else adapter->rx_mbuf_sz = MCLBYTES; /* Prepare receive descriptors and buffers */ if (ixgbe_setup_receive_structures(adapter)) { device_printf(dev, "Could not setup receive structures\n"); ixv_stop(adapter); return; } /* Configure RX settings */ ixv_initialize_receive_units(adapter); /* Set the various hardware offload abilities */ ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_TSO; if (ifp->if_capenable & IFCAP_TXCSUM) { ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); #if __FreeBSD_version >= 800000 ifp->if_hwassist |= CSUM_SCTP; #endif } /* Set up VLAN offload and filter */ ixv_setup_vlan_support(adapter); /* Set up MSI/X routing */ ixv_configure_ivars(adapter); /* Set up auto-mask */ IXGBE_WRITE_REG(hw, IXGBE_VTEIAM, IXGBE_EICS_RTX_QUEUE); /* Set moderation on the Link interrupt */ IXGBE_WRITE_REG(hw, IXGBE_VTEITR(adapter->vector), IXGBE_LINK_ITR); /* Stats init */ ixv_init_stats(adapter); /* Config/Enable Link */ ixv_config_link(adapter); /* Start watchdog */ callout_reset(&adapter->timer, hz, ixv_local_timer, adapter); /* And now turn on interrupts */ ixv_enable_intr(adapter); /* Now inform the stack we're ready */ ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; return; } static void ixv_init(void *arg) { struct adapter *adapter = arg; IXGBE_CORE_LOCK(adapter); ixv_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); return; } /* ** ** MSIX Interrupt Handlers and Tasklets ** */ static inline void ixv_enable_queue(struct adapter *adapter, u32 vector) { struct ixgbe_hw *hw = &adapter->hw; u32 queue = 1 << vector; u32 mask; mask = (IXGBE_EIMS_RTX_QUEUE & queue); IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, mask); } static inline void ixv_disable_queue(struct adapter *adapter, u32 vector) { struct ixgbe_hw *hw = &adapter->hw; u64 queue = (u64)(1 << vector); u32 mask; mask = (IXGBE_EIMS_RTX_QUEUE & queue); IXGBE_WRITE_REG(hw, IXGBE_VTEIMC, mask); } static inline void ixv_rearm_queues(struct adapter *adapter, u64 queues) { u32 mask = (IXGBE_EIMS_RTX_QUEUE & queues); IXGBE_WRITE_REG(&adapter->hw, IXGBE_VTEICS, mask); } static void ixv_handle_que(void *context, int pending) { struct ix_queue *que = context; struct adapter *adapter = que->adapter; struct tx_ring *txr = que->txr; struct ifnet *ifp = adapter->ifp; bool more; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { more = ixgbe_rxeof(que); IXGBE_TX_LOCK(txr); ixgbe_txeof(txr); #if __FreeBSD_version >= 800000 if (!drbr_empty(ifp, txr->br)) ixgbe_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) ixgbe_start_locked(txr, ifp); #endif IXGBE_TX_UNLOCK(txr); if (more) { taskqueue_enqueue(que->tq, &que->que_task); return; } } /* Reenable this interrupt */ ixv_enable_queue(adapter, que->msix); return; } /********************************************************************* * * MSI Queue Interrupt Service routine * **********************************************************************/ void ixv_msix_que(void *arg) { struct ix_queue *que = arg; struct adapter *adapter = que->adapter; struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = que->txr; struct rx_ring *rxr = que->rxr; bool more; u32 newitr = 0; ixv_disable_queue(adapter, que->msix); ++que->irqs; more = ixgbe_rxeof(que); IXGBE_TX_LOCK(txr); ixgbe_txeof(txr); /* ** Make certain that if the stack ** has anything queued the task gets ** scheduled to handle it. */ #ifdef IXGBE_LEGACY_TX if (!IFQ_DRV_IS_EMPTY(&adapter->ifp->if_snd)) ixgbe_start_locked(txr, ifp); #else if (!drbr_empty(adapter->ifp, txr->br)) ixgbe_mq_start_locked(ifp, txr); #endif IXGBE_TX_UNLOCK(txr); /* Do AIM now? */ if (ixv_enable_aim == FALSE) goto no_calc; /* ** Do Adaptive Interrupt Moderation: ** - Write out last calculated setting ** - Calculate based on average size over ** the last interval. */ if (que->eitr_setting) IXGBE_WRITE_REG(&adapter->hw, IXGBE_VTEITR(que->msix), que->eitr_setting); que->eitr_setting = 0; /* Idle, do nothing */ if ((txr->bytes == 0) && (rxr->bytes == 0)) goto no_calc; if ((txr->bytes) && (txr->packets)) newitr = txr->bytes/txr->packets; if ((rxr->bytes) && (rxr->packets)) newitr = max(newitr, (rxr->bytes / rxr->packets)); newitr += 24; /* account for hardware frame, crc */ /* set an upper boundary */ newitr = min(newitr, 3000); /* Be nice to the mid range */ if ((newitr > 300) && (newitr < 1200)) newitr = (newitr / 3); else newitr = (newitr / 2); newitr |= newitr << 16; /* save for next interrupt */ que->eitr_setting = newitr; /* Reset state */ txr->bytes = 0; txr->packets = 0; rxr->bytes = 0; rxr->packets = 0; no_calc: if (more) taskqueue_enqueue(que->tq, &que->que_task); else /* Reenable this interrupt */ ixv_enable_queue(adapter, que->msix); return; } static void ixv_msix_mbx(void *arg) { struct adapter *adapter = arg; struct ixgbe_hw *hw = &adapter->hw; u32 reg; ++adapter->link_irq; /* First get the cause */ reg = IXGBE_READ_REG(hw, IXGBE_VTEICS); /* Clear interrupt with write */ IXGBE_WRITE_REG(hw, IXGBE_VTEICR, reg); /* Link status change */ if (reg & IXGBE_EICR_LSC) taskqueue_enqueue(adapter->tq, &adapter->link_task); IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, IXGBE_EIMS_OTHER); return; } /********************************************************************* * * Media Ioctl callback * * This routine is called whenever the user queries the status of * the interface using ifconfig. * **********************************************************************/ static void ixv_media_status(struct ifnet * ifp, struct ifmediareq * ifmr) { struct adapter *adapter = ifp->if_softc; INIT_DEBUGOUT("ixv_media_status: begin"); IXGBE_CORE_LOCK(adapter); ixv_update_link_status(adapter); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (!adapter->link_active) { IXGBE_CORE_UNLOCK(adapter); return; } ifmr->ifm_status |= IFM_ACTIVE; switch (adapter->link_speed) { case IXGBE_LINK_SPEED_1GB_FULL: ifmr->ifm_active |= IFM_1000_T | IFM_FDX; break; case IXGBE_LINK_SPEED_10GB_FULL: ifmr->ifm_active |= IFM_FDX; break; } IXGBE_CORE_UNLOCK(adapter); return; } /********************************************************************* * * Media Ioctl callback * * This routine is called when the user changes speed/duplex using * media/mediopt option with ifconfig. * **********************************************************************/ static int ixv_media_change(struct ifnet * ifp) { struct adapter *adapter = ifp->if_softc; struct ifmedia *ifm = &adapter->media; INIT_DEBUGOUT("ixv_media_change: begin"); if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: break; default: device_printf(adapter->dev, "Only auto media type\n"); return (EINVAL); } return (0); } /********************************************************************* * Multicast Update * * This routine is called whenever multicast address list is updated. * **********************************************************************/ #define IXGBE_RAR_ENTRIES 16 static void ixv_set_multi(struct adapter *adapter) { u8 mta[MAX_NUM_MULTICAST_ADDRESSES * IXGBE_ETH_LENGTH_OF_ADDRESS]; u8 *update_ptr; struct ifmultiaddr *ifma; int mcnt = 0; struct ifnet *ifp = adapter->ifp; IOCTL_DEBUGOUT("ixv_set_multi: begin"); #if __FreeBSD_version < 800000 IF_ADDR_LOCK(ifp); #else if_maddr_rlock(ifp); #endif TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; bcopy(LLADDR((struct sockaddr_dl *) ifma->ifma_addr), &mta[mcnt * IXGBE_ETH_LENGTH_OF_ADDRESS], IXGBE_ETH_LENGTH_OF_ADDRESS); mcnt++; } #if __FreeBSD_version < 800000 IF_ADDR_UNLOCK(ifp); #else if_maddr_runlock(ifp); #endif update_ptr = mta; ixgbe_update_mc_addr_list(&adapter->hw, update_ptr, mcnt, ixv_mc_array_itr, TRUE); return; } /* * This is an iterator function now needed by the multicast * shared code. It simply feeds the shared code routine the * addresses in the array of ixv_set_multi() one by one. */ static u8 * ixv_mc_array_itr(struct ixgbe_hw *hw, u8 **update_ptr, u32 *vmdq) { u8 *addr = *update_ptr; u8 *newptr; *vmdq = 0; newptr = addr + IXGBE_ETH_LENGTH_OF_ADDRESS; *update_ptr = newptr; return addr; } /********************************************************************* * Timer routine * * This routine checks for link status,updates statistics, * and runs the watchdog check. * **********************************************************************/ static void ixv_local_timer(void *arg) { struct adapter *adapter = arg; device_t dev = adapter->dev; struct ix_queue *que = adapter->queues; u64 queues = 0; int hung = 0; mtx_assert(&adapter->core_mtx, MA_OWNED); ixv_update_link_status(adapter); /* Stats Update */ ixv_update_stats(adapter); /* ** Check the TX queues status ** - mark hung queues so we don't schedule on them ** - watchdog only if all queues show hung */ for (int i = 0; i < adapter->num_queues; i++, que++) { /* Keep track of queues with work for soft irq */ if (que->txr->busy) queues |= ((u64)1 << que->me); /* ** Each time txeof runs without cleaning, but there ** are uncleaned descriptors it increments busy. If ** we get to the MAX we declare it hung. */ if (que->busy == IXGBE_QUEUE_HUNG) { ++hung; /* Mark the queue as inactive */ adapter->active_queues &= ~((u64)1 << que->me); continue; } else { /* Check if we've come back from hung */ if ((adapter->active_queues & ((u64)1 << que->me)) == 0) adapter->active_queues |= ((u64)1 << que->me); } if (que->busy >= IXGBE_MAX_TX_BUSY) { device_printf(dev,"Warning queue %d " "appears to be hung!\n", i); que->txr->busy = IXGBE_QUEUE_HUNG; ++hung; } } /* Only truely watchdog if all queues show hung */ if (hung == adapter->num_queues) goto watchdog; else if (queues != 0) { /* Force an IRQ on queues with work */ ixv_rearm_queues(adapter, queues); } callout_reset(&adapter->timer, hz, ixv_local_timer, adapter); return; watchdog: device_printf(adapter->dev, "Watchdog timeout -- resetting\n"); adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; adapter->watchdog_events++; ixv_init_locked(adapter); } /* ** Note: this routine updates the OS on the link state ** the real check of the hardware only happens with ** a link interrupt. */ static void ixv_update_link_status(struct adapter *adapter) { struct ifnet *ifp = adapter->ifp; device_t dev = adapter->dev; if (adapter->link_up){ if (adapter->link_active == FALSE) { if (bootverbose) device_printf(dev,"Link is up %d Gbps %s \n", ((adapter->link_speed == 128)? 10:1), "Full Duplex"); adapter->link_active = TRUE; if_link_state_change(ifp, LINK_STATE_UP); } } else { /* Link down */ if (adapter->link_active == TRUE) { if (bootverbose) device_printf(dev,"Link is Down\n"); if_link_state_change(ifp, LINK_STATE_DOWN); adapter->link_active = FALSE; } } return; } /********************************************************************* * * This routine disables all traffic on the adapter by issuing a * global reset on the MAC and deallocates TX/RX buffers. * **********************************************************************/ static void ixv_stop(void *arg) { struct ifnet *ifp; struct adapter *adapter = arg; struct ixgbe_hw *hw = &adapter->hw; ifp = adapter->ifp; mtx_assert(&adapter->core_mtx, MA_OWNED); INIT_DEBUGOUT("ixv_stop: begin\n"); ixv_disable_intr(adapter); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); ixgbe_reset_hw(hw); adapter->hw.adapter_stopped = FALSE; ixgbe_stop_adapter(hw); callout_stop(&adapter->timer); /* reprogram the RAR[0] in case user changed it. */ ixgbe_set_rar(hw, 0, hw->mac.addr, 0, IXGBE_RAH_AV); return; } /********************************************************************* * * Determine hardware revision. * **********************************************************************/ static void ixv_identify_hardware(struct adapter *adapter) { device_t dev = adapter->dev; struct ixgbe_hw *hw = &adapter->hw; /* ** Make sure BUSMASTER is set, on a VM under ** KVM it may not be and will break things. */ pci_enable_busmaster(dev); /* Save off the information about this board */ hw->vendor_id = pci_get_vendor(dev); hw->device_id = pci_get_device(dev); hw->revision_id = pci_read_config(dev, PCIR_REVID, 1); hw->subsystem_vendor_id = pci_read_config(dev, PCIR_SUBVEND_0, 2); hw->subsystem_device_id = pci_read_config(dev, PCIR_SUBDEV_0, 2); /* We need this to determine device-specific things */ ixgbe_set_mac_type(hw); /* Set the right number of segments */ adapter->num_segs = IXGBE_82599_SCATTER; return; } /********************************************************************* * * Setup MSIX Interrupt resources and handlers * **********************************************************************/ static int ixv_allocate_msix(struct adapter *adapter) { device_t dev = adapter->dev; struct ix_queue *que = adapter->queues; struct tx_ring *txr = adapter->tx_rings; int error, rid, vector = 0; for (int i = 0; i < adapter->num_queues; i++, vector++, que++, txr++) { rid = vector + 1; que->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (que->res == NULL) { device_printf(dev,"Unable to allocate" " bus resource: que interrupt [%d]\n", vector); return (ENXIO); } /* Set the handler function */ error = bus_setup_intr(dev, que->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, ixv_msix_que, que, &que->tag); if (error) { que->res = NULL; device_printf(dev, "Failed to register QUE handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, que->res, que->tag, "que %d", i); #endif que->msix = vector; adapter->active_queues |= (u64)(1 << que->msix); /* ** Bind the msix vector, and thus the ** ring to the corresponding cpu. */ if (adapter->num_queues > 1) bus_bind_intr(dev, que->res, i); TASK_INIT(&txr->txq_task, 0, ixgbe_deferred_mq_start, txr); TASK_INIT(&que->que_task, 0, ixv_handle_que, que); que->tq = taskqueue_create_fast("ixv_que", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); taskqueue_start_threads(&que->tq, 1, PI_NET, "%s que", device_get_nameunit(adapter->dev)); } /* and Mailbox */ rid = vector + 1; adapter->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (!adapter->res) { device_printf(dev,"Unable to allocate" " bus resource: MBX interrupt [%d]\n", rid); return (ENXIO); } /* Set the mbx handler function */ error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, NULL, ixv_msix_mbx, adapter, &adapter->tag); if (error) { adapter->res = NULL; device_printf(dev, "Failed to register LINK handler"); return (error); } #if __FreeBSD_version >= 800504 bus_describe_intr(dev, adapter->res, adapter->tag, "mbx"); #endif adapter->vector = vector; /* Tasklets for Mailbox */ TASK_INIT(&adapter->link_task, 0, ixv_handle_mbx, adapter); adapter->tq = taskqueue_create_fast("ixv_mbx", M_NOWAIT, taskqueue_thread_enqueue, &adapter->tq); taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s mbxq", device_get_nameunit(adapter->dev)); /* ** Due to a broken design QEMU will fail to properly ** enable the guest for MSIX unless the vectors in ** the table are all set up, so we must rewrite the ** ENABLE in the MSIX control register again at this ** point to cause it to successfully initialize us. */ if (adapter->hw.mac.type == ixgbe_mac_82599_vf) { int msix_ctrl; pci_find_cap(dev, PCIY_MSIX, &rid); rid += PCIR_MSIX_CTRL; msix_ctrl = pci_read_config(dev, rid, 2); msix_ctrl |= PCIM_MSIXCTRL_MSIX_ENABLE; pci_write_config(dev, rid, msix_ctrl, 2); } return (0); } /* * Setup MSIX resources, note that the VF * device MUST use MSIX, there is no fallback. */ static int ixv_setup_msix(struct adapter *adapter) { device_t dev = adapter->dev; int rid, want, msgs; /* Must have at least 2 MSIX vectors */ msgs = pci_msix_count(dev); if (msgs < 2) goto out; rid = PCIR_BAR(3); adapter->msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (adapter->msix_mem == NULL) { device_printf(adapter->dev, "Unable to map MSIX table \n"); goto out; } /* ** Want vectors for the queues, ** plus an additional for mailbox. */ want = adapter->num_queues + 1; if (want > msgs) { want = msgs; adapter->num_queues = msgs - 1; } else msgs = want; if ((pci_alloc_msix(dev, &msgs) == 0) && (msgs == want)) { device_printf(adapter->dev, "Using MSIX interrupts with %d vectors\n", want); return (want); } /* Release in case alloc was insufficient */ pci_release_msi(dev); out: if (adapter->msix_mem != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, rid, adapter->msix_mem); adapter->msix_mem = NULL; } device_printf(adapter->dev,"MSIX config error\n"); return (ENXIO); } static int ixv_allocate_pci_resources(struct adapter *adapter) { int rid; device_t dev = adapter->dev; rid = PCIR_BAR(0); adapter->pci_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (!(adapter->pci_mem)) { device_printf(dev, "Unable to allocate bus resource: memory\n"); return (ENXIO); } adapter->osdep.mem_bus_space_tag = rman_get_bustag(adapter->pci_mem); adapter->osdep.mem_bus_space_handle = rman_get_bushandle(adapter->pci_mem); adapter->hw.hw_addr = (u8 *)&adapter->osdep.mem_bus_space_handle; /* Pick up the tuneable queues */ adapter->num_queues = ixv_num_queues; adapter->hw.back = adapter; /* ** Now setup MSI/X, should ** return us the number of ** configured vectors. */ adapter->msix = ixv_setup_msix(adapter); if (adapter->msix == ENXIO) return (ENXIO); else return (0); } static void ixv_free_pci_resources(struct adapter * adapter) { struct ix_queue *que = adapter->queues; device_t dev = adapter->dev; int rid, memrid; memrid = PCIR_BAR(MSIX_82598_BAR); /* ** There is a slight possibility of a failure mode ** in attach that will result in entering this function ** before interrupt resources have been initialized, and ** in that case we do not want to execute the loops below ** We can detect this reliably by the state of the adapter ** res pointer. */ if (adapter->res == NULL) goto mem; /* ** Release all msix queue resources: */ for (int i = 0; i < adapter->num_queues; i++, que++) { rid = que->msix + 1; if (que->tag != NULL) { bus_teardown_intr(dev, que->res, que->tag); que->tag = NULL; } if (que->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, que->res); } /* Clean the Legacy or Link interrupt last */ if (adapter->vector) /* we are doing MSIX */ rid = adapter->vector + 1; else (adapter->msix != 0) ? (rid = 1):(rid = 0); if (adapter->tag != NULL) { bus_teardown_intr(dev, adapter->res, adapter->tag); adapter->tag = NULL; } if (adapter->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, adapter->res); mem: if (adapter->msix) pci_release_msi(dev); if (adapter->msix_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, memrid, adapter->msix_mem); if (adapter->pci_mem != NULL) bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(0), adapter->pci_mem); return; } /********************************************************************* * * Setup networking device structure and register an interface. * **********************************************************************/ static void ixv_setup_interface(device_t dev, struct adapter *adapter) { struct ifnet *ifp; INIT_DEBUGOUT("ixv_setup_interface: begin"); ifp = adapter->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) panic("%s: can not if_alloc()\n", device_get_nameunit(dev)); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_baudrate = 1000000000; ifp->if_init = ixv_init; ifp->if_softc = adapter; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = ixv_ioctl; #if __FreeBSD_version >= 800000 ifp->if_transmit = ixgbe_mq_start; ifp->if_qflush = ixgbe_qflush; #else ifp->if_start = ixgbe_start; #endif ifp->if_snd.ifq_maxlen = adapter->num_tx_desc - 2; ether_ifattach(ifp, adapter->hw.mac.addr); adapter->max_frame_size = ifp->if_mtu + IXGBE_MTU_HDR_VLAN; /* * Tell the upper layer(s) we support long frames. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_TSO4 | IFCAP_VLAN_HWCSUM; ifp->if_capabilities |= IFCAP_JUMBO_MTU; ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWTSO | IFCAP_VLAN_MTU; ifp->if_capabilities |= IFCAP_LRO; ifp->if_capenable = ifp->if_capabilities; /* * Specify the media types supported by this adapter and register * callbacks to update media and link information */ ifmedia_init(&adapter->media, IFM_IMASK, ixv_media_change, ixv_media_status); ifmedia_add(&adapter->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&adapter->media, IFM_ETHER | IFM_AUTO); return; } static void ixv_config_link(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; u32 autoneg; if (hw->mac.ops.check_link) hw->mac.ops.check_link(hw, &autoneg, &adapter->link_up, FALSE); } /********************************************************************* * * Enable transmit unit. * **********************************************************************/ static void ixv_initialize_transmit_units(struct adapter *adapter) { struct tx_ring *txr = adapter->tx_rings; struct ixgbe_hw *hw = &adapter->hw; for (int i = 0; i < adapter->num_queues; i++, txr++) { u64 tdba = txr->txdma.dma_paddr; u32 txctrl, txdctl; /* Set WTHRESH to 8, burst writeback */ txdctl = IXGBE_READ_REG(hw, IXGBE_VFTXDCTL(i)); txdctl |= (8 << 16); IXGBE_WRITE_REG(hw, IXGBE_VFTXDCTL(i), txdctl); /* Set the HW Tx Head and Tail indices */ IXGBE_WRITE_REG(&adapter->hw, IXGBE_VFTDH(i), 0); IXGBE_WRITE_REG(&adapter->hw, IXGBE_VFTDT(i), 0); /* Set Tx Tail register */ txr->tail = IXGBE_VFTDT(i); /* Set Ring parameters */ IXGBE_WRITE_REG(hw, IXGBE_VFTDBAL(i), (tdba & 0x00000000ffffffffULL)); IXGBE_WRITE_REG(hw, IXGBE_VFTDBAH(i), (tdba >> 32)); IXGBE_WRITE_REG(hw, IXGBE_VFTDLEN(i), adapter->num_tx_desc * sizeof(struct ixgbe_legacy_tx_desc)); txctrl = IXGBE_READ_REG(hw, IXGBE_VFDCA_TXCTRL(i)); txctrl &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN; IXGBE_WRITE_REG(hw, IXGBE_VFDCA_TXCTRL(i), txctrl); /* Now enable */ txdctl = IXGBE_READ_REG(hw, IXGBE_VFTXDCTL(i)); txdctl |= IXGBE_TXDCTL_ENABLE; IXGBE_WRITE_REG(hw, IXGBE_VFTXDCTL(i), txdctl); } return; } /********************************************************************* * * Setup receive registers and features. * **********************************************************************/ #define IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT 2 static void ixv_initialize_receive_units(struct adapter *adapter) { struct rx_ring *rxr = adapter->rx_rings; struct ixgbe_hw *hw = &adapter->hw; struct ifnet *ifp = adapter->ifp; u32 bufsz, rxcsum, psrtype; if (ifp->if_mtu > ETHERMTU) bufsz = 4096 >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; else bufsz = 2048 >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; psrtype = IXGBE_PSRTYPE_TCPHDR | IXGBE_PSRTYPE_UDPHDR | IXGBE_PSRTYPE_IPV4HDR | IXGBE_PSRTYPE_IPV6HDR | IXGBE_PSRTYPE_L2HDR; IXGBE_WRITE_REG(hw, IXGBE_VFPSRTYPE, psrtype); /* Tell PF our max_frame size */ ixgbevf_rlpml_set_vf(hw, adapter->max_frame_size); for (int i = 0; i < adapter->num_queues; i++, rxr++) { u64 rdba = rxr->rxdma.dma_paddr; u32 reg, rxdctl; /* Disable the queue */ rxdctl = IXGBE_READ_REG(hw, IXGBE_VFRXDCTL(i)); rxdctl &= ~IXGBE_RXDCTL_ENABLE; IXGBE_WRITE_REG(hw, IXGBE_VFRXDCTL(i), rxdctl); for (int j = 0; j < 10; j++) { if (IXGBE_READ_REG(hw, IXGBE_VFRXDCTL(i)) & IXGBE_RXDCTL_ENABLE) msec_delay(1); else break; } wmb(); /* Setup the Base and Length of the Rx Descriptor Ring */ IXGBE_WRITE_REG(hw, IXGBE_VFRDBAL(i), (rdba & 0x00000000ffffffffULL)); IXGBE_WRITE_REG(hw, IXGBE_VFRDBAH(i), (rdba >> 32)); IXGBE_WRITE_REG(hw, IXGBE_VFRDLEN(i), adapter->num_rx_desc * sizeof(union ixgbe_adv_rx_desc)); /* Reset the ring indices */ IXGBE_WRITE_REG(hw, IXGBE_VFRDH(rxr->me), 0); IXGBE_WRITE_REG(hw, IXGBE_VFRDT(rxr->me), 0); /* Set up the SRRCTL register */ reg = IXGBE_READ_REG(hw, IXGBE_VFSRRCTL(i)); reg &= ~IXGBE_SRRCTL_BSIZEHDR_MASK; reg &= ~IXGBE_SRRCTL_BSIZEPKT_MASK; reg |= bufsz; reg |= IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF; IXGBE_WRITE_REG(hw, IXGBE_VFSRRCTL(i), reg); /* Capture Rx Tail index */ rxr->tail = IXGBE_VFRDT(rxr->me); /* Do the queue enabling last */ rxdctl |= IXGBE_RXDCTL_ENABLE | IXGBE_RXDCTL_VME; IXGBE_WRITE_REG(hw, IXGBE_VFRXDCTL(i), rxdctl); for (int k = 0; k < 10; k++) { if (IXGBE_READ_REG(hw, IXGBE_VFRXDCTL(i)) & IXGBE_RXDCTL_ENABLE) break; else msec_delay(1); } wmb(); /* Set the Tail Pointer */ #ifdef DEV_NETMAP /* * In netmap mode, we must preserve the buffers made * available to userspace before the if_init() * (this is true by default on the TX side, because * init makes all buffers available to userspace). * * netmap_reset() and the device specific routines * (e.g. ixgbe_setup_receive_rings()) map these * buffers at the end of the NIC ring, so here we * must set the RDT (tail) register to make sure * they are not overwritten. * * In this driver the NIC ring starts at RDH = 0, * RDT points to the last slot available for reception (?), * so RDT = num_rx_desc - 1 means the whole ring is available. */ if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(adapter->ifp); struct netmap_kring *kring = &na->rx_rings[i]; int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring); IXGBE_WRITE_REG(hw, IXGBE_VFRDT(rxr->me), t); } else #endif /* DEV_NETMAP */ IXGBE_WRITE_REG(hw, IXGBE_VFRDT(rxr->me), adapter->num_rx_desc - 1); } rxcsum = IXGBE_READ_REG(hw, IXGBE_RXCSUM); if (ifp->if_capenable & IFCAP_RXCSUM) rxcsum |= IXGBE_RXCSUM_PCSD; if (!(rxcsum & IXGBE_RXCSUM_PCSD)) rxcsum |= IXGBE_RXCSUM_IPPCSE; IXGBE_WRITE_REG(hw, IXGBE_RXCSUM, rxcsum); return; } static void ixv_setup_vlan_support(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; u32 ctrl, vid, vfta, retry; struct rx_ring *rxr; /* ** We get here thru init_locked, meaning ** a soft reset, this has already cleared ** the VFTA and other state, so if there ** have been no vlan's registered do nothing. */ if (adapter->num_vlans == 0) return; /* Enable the queues */ for (int i = 0; i < adapter->num_queues; i++) { ctrl = IXGBE_READ_REG(hw, IXGBE_VFRXDCTL(i)); ctrl |= IXGBE_RXDCTL_VME; IXGBE_WRITE_REG(hw, IXGBE_VFRXDCTL(i), ctrl); /* * Let Rx path know that it needs to store VLAN tag * as part of extra mbuf info. */ rxr = &adapter->rx_rings[i]; rxr->vtag_strip = TRUE; } /* ** A soft reset zero's out the VFTA, so ** we need to repopulate it now. */ for (int i = 0; i < IXGBE_VFTA_SIZE; i++) { if (ixv_shadow_vfta[i] == 0) continue; vfta = ixv_shadow_vfta[i]; /* ** Reconstruct the vlan id's ** based on the bits set in each ** of the array ints. */ for (int j = 0; j < 32; j++) { retry = 0; if ((vfta & (1 << j)) == 0) continue; vid = (i * 32) + j; /* Call the shared code mailbox routine */ while (ixgbe_set_vfta(hw, vid, 0, TRUE)) { if (++retry > 5) break; } } } } /* ** This routine is run via an vlan config EVENT, ** it enables us to use the HW Filter table since ** we can get the vlan id. This just creates the ** entry in the soft version of the VFTA, init will ** repopulate the real table. */ static void ixv_register_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u16 index, bit; if (ifp->if_softc != arg) /* Not our event */ return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IXGBE_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; ixv_shadow_vfta[index] |= (1 << bit); ++adapter->num_vlans; /* Re-init to load the changes */ ixv_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); } /* ** This routine is run via an vlan ** unconfig EVENT, remove our entry ** in the soft vfta. */ static void ixv_unregister_vlan(void *arg, struct ifnet *ifp, u16 vtag) { struct adapter *adapter = ifp->if_softc; u16 index, bit; if (ifp->if_softc != arg) return; if ((vtag == 0) || (vtag > 4095)) /* Invalid */ return; IXGBE_CORE_LOCK(adapter); index = (vtag >> 5) & 0x7F; bit = vtag & 0x1F; ixv_shadow_vfta[index] &= ~(1 << bit); --adapter->num_vlans; /* Re-init to load the changes */ ixv_init_locked(adapter); IXGBE_CORE_UNLOCK(adapter); } static void ixv_enable_intr(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; struct ix_queue *que = adapter->queues; u32 mask = (IXGBE_EIMS_ENABLE_MASK & ~IXGBE_EIMS_RTX_QUEUE); IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, mask); mask = IXGBE_EIMS_ENABLE_MASK; mask &= ~(IXGBE_EIMS_OTHER | IXGBE_EIMS_LSC); IXGBE_WRITE_REG(hw, IXGBE_VTEIAC, mask); for (int i = 0; i < adapter->num_queues; i++, que++) ixv_enable_queue(adapter, que->msix); IXGBE_WRITE_FLUSH(hw); return; } static void ixv_disable_intr(struct adapter *adapter) { IXGBE_WRITE_REG(&adapter->hw, IXGBE_VTEIAC, 0); IXGBE_WRITE_REG(&adapter->hw, IXGBE_VTEIMC, ~0); IXGBE_WRITE_FLUSH(&adapter->hw); return; } /* ** Setup the correct IVAR register for a particular MSIX interrupt ** - entry is the register array entry ** - vector is the MSIX vector for this queue ** - type is RX/TX/MISC */ static void ixv_set_ivar(struct adapter *adapter, u8 entry, u8 vector, s8 type) { struct ixgbe_hw *hw = &adapter->hw; u32 ivar, index; vector |= IXGBE_IVAR_ALLOC_VAL; if (type == -1) { /* MISC IVAR */ ivar = IXGBE_READ_REG(hw, IXGBE_VTIVAR_MISC); ivar &= ~0xFF; ivar |= vector; IXGBE_WRITE_REG(hw, IXGBE_VTIVAR_MISC, ivar); } else { /* RX/TX IVARS */ index = (16 * (entry & 1)) + (8 * type); ivar = IXGBE_READ_REG(hw, IXGBE_VTIVAR(entry >> 1)); ivar &= ~(0xFF << index); ivar |= (vector << index); IXGBE_WRITE_REG(hw, IXGBE_VTIVAR(entry >> 1), ivar); } } static void ixv_configure_ivars(struct adapter *adapter) { struct ix_queue *que = adapter->queues; for (int i = 0; i < adapter->num_queues; i++, que++) { /* First the RX queue entry */ ixv_set_ivar(adapter, i, que->msix, 0); /* ... and the TX */ ixv_set_ivar(adapter, i, que->msix, 1); /* Set an initial value in EITR */ IXGBE_WRITE_REG(&adapter->hw, IXGBE_VTEITR(que->msix), IXV_EITR_DEFAULT); } /* For the mailbox interrupt */ ixv_set_ivar(adapter, 1, adapter->vector, -1); } /* ** Tasklet handler for MSIX MBX interrupts ** - do outside interrupt since it might sleep */ static void ixv_handle_mbx(void *context, int pending) { struct adapter *adapter = context; ixgbe_check_link(&adapter->hw, &adapter->link_speed, &adapter->link_up, 0); ixv_update_link_status(adapter); } /* ** The VF stats registers never have a truely virgin ** starting point, so this routine tries to make an ** artificial one, marking ground zero on attach as ** it were. */ static void ixv_save_stats(struct adapter *adapter) { if (adapter->stats.vf.vfgprc || adapter->stats.vf.vfgptc) { adapter->stats.vf.saved_reset_vfgprc += adapter->stats.vf.vfgprc - adapter->stats.vf.base_vfgprc; adapter->stats.vf.saved_reset_vfgptc += adapter->stats.vf.vfgptc - adapter->stats.vf.base_vfgptc; adapter->stats.vf.saved_reset_vfgorc += adapter->stats.vf.vfgorc - adapter->stats.vf.base_vfgorc; adapter->stats.vf.saved_reset_vfgotc += adapter->stats.vf.vfgotc - adapter->stats.vf.base_vfgotc; adapter->stats.vf.saved_reset_vfmprc += adapter->stats.vf.vfmprc - adapter->stats.vf.base_vfmprc; } } static void ixv_init_stats(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; adapter->stats.vf.last_vfgprc = IXGBE_READ_REG(hw, IXGBE_VFGPRC); adapter->stats.vf.last_vfgorc = IXGBE_READ_REG(hw, IXGBE_VFGORC_LSB); adapter->stats.vf.last_vfgorc |= (((u64)(IXGBE_READ_REG(hw, IXGBE_VFGORC_MSB))) << 32); adapter->stats.vf.last_vfgptc = IXGBE_READ_REG(hw, IXGBE_VFGPTC); adapter->stats.vf.last_vfgotc = IXGBE_READ_REG(hw, IXGBE_VFGOTC_LSB); adapter->stats.vf.last_vfgotc |= (((u64)(IXGBE_READ_REG(hw, IXGBE_VFGOTC_MSB))) << 32); adapter->stats.vf.last_vfmprc = IXGBE_READ_REG(hw, IXGBE_VFMPRC); adapter->stats.vf.base_vfgprc = adapter->stats.vf.last_vfgprc; adapter->stats.vf.base_vfgorc = adapter->stats.vf.last_vfgorc; adapter->stats.vf.base_vfgptc = adapter->stats.vf.last_vfgptc; adapter->stats.vf.base_vfgotc = adapter->stats.vf.last_vfgotc; adapter->stats.vf.base_vfmprc = adapter->stats.vf.last_vfmprc; } #define UPDATE_STAT_32(reg, last, count) \ { \ u32 current = IXGBE_READ_REG(hw, reg); \ if (current < last) \ count += 0x100000000LL; \ last = current; \ count &= 0xFFFFFFFF00000000LL; \ count |= current; \ } #define UPDATE_STAT_36(lsb, msb, last, count) \ { \ u64 cur_lsb = IXGBE_READ_REG(hw, lsb); \ u64 cur_msb = IXGBE_READ_REG(hw, msb); \ u64 current = ((cur_msb << 32) | cur_lsb); \ if (current < last) \ count += 0x1000000000LL; \ last = current; \ count &= 0xFFFFFFF000000000LL; \ count |= current; \ } /* ** ixv_update_stats - Update the board statistics counters. */ void ixv_update_stats(struct adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; UPDATE_STAT_32(IXGBE_VFGPRC, adapter->stats.vf.last_vfgprc, adapter->stats.vf.vfgprc); UPDATE_STAT_32(IXGBE_VFGPTC, adapter->stats.vf.last_vfgptc, adapter->stats.vf.vfgptc); UPDATE_STAT_36(IXGBE_VFGORC_LSB, IXGBE_VFGORC_MSB, adapter->stats.vf.last_vfgorc, adapter->stats.vf.vfgorc); UPDATE_STAT_36(IXGBE_VFGOTC_LSB, IXGBE_VFGOTC_MSB, adapter->stats.vf.last_vfgotc, adapter->stats.vf.vfgotc); UPDATE_STAT_32(IXGBE_VFMPRC, adapter->stats.vf.last_vfmprc, adapter->stats.vf.vfmprc); } /* * Add statistic sysctls for the VF. */ static void ixv_add_stats_sysctls(struct adapter *adapter) { device_t dev = adapter->dev; struct ix_queue *que = &adapter->queues[0]; struct tx_ring *txr = que->txr; struct rx_ring *rxr = que->rxr; struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev); struct sysctl_oid *tree = device_get_sysctl_tree(dev); struct sysctl_oid_list *child = SYSCTL_CHILDREN(tree); struct ixgbevf_hw_stats *stats = &adapter->stats.vf; struct sysctl_oid *stat_node, *queue_node; struct sysctl_oid_list *stat_list, *queue_list; /* Driver Statistics */ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "dropped", CTLFLAG_RD, &adapter->dropped_pkts, "Driver dropped packets"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "mbuf_defrag_failed", CTLFLAG_RD, &adapter->mbuf_defrag_failed, "m_defrag() failed"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "watchdog_events", CTLFLAG_RD, &adapter->watchdog_events, "Watchdog timeouts"); stat_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "mac", CTLFLAG_RD, NULL, "VF Statistics (read from HW registers)"); stat_list = SYSCTL_CHILDREN(stat_node); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_pkts_rcvd", CTLFLAG_RD, &stats->vfgprc, "Good Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_octets_rcvd", CTLFLAG_RD, &stats->vfgorc, "Good Octets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "mcast_pkts_rcvd", CTLFLAG_RD, &stats->vfmprc, "Multicast Packets Received"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_pkts_txd", CTLFLAG_RD, &stats->vfgptc, "Good Packets Transmitted"); SYSCTL_ADD_UQUAD(ctx, stat_list, OID_AUTO, "good_octets_txd", CTLFLAG_RD, &stats->vfgotc, "Good Octets Transmitted"); queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "que", CTLFLAG_RD, NULL, "Queue Statistics (collected by SW)"); queue_list = SYSCTL_CHILDREN(queue_node); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "irqs", CTLFLAG_RD, &(que->irqs), "IRQs on queue"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_irqs", CTLFLAG_RD, &(rxr->rx_irq), "RX irqs on queue"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_packets", CTLFLAG_RD, &(rxr->rx_packets), "RX packets"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_bytes", CTLFLAG_RD, &(rxr->rx_bytes), "RX bytes"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_discarded", CTLFLAG_RD, &(rxr->rx_discarded), "Discarded RX packets"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "tx_packets", CTLFLAG_RD, &(txr->total_packets), "TX Packets"); SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "tx_no_desc", CTLFLAG_RD, &(txr->no_desc_avail), "# of times not enough descriptors were available during TX"); } static void ixv_set_sysctl_value(struct adapter *adapter, const char *name, const char *description, int *limit, int value) { *limit = value; SYSCTL_ADD_INT(device_get_sysctl_ctx(adapter->dev), SYSCTL_CHILDREN(device_get_sysctl_tree(adapter->dev)), OID_AUTO, name, CTLFLAG_RW, limit, value, description); } /********************************************************************** * * This routine is called only when em_display_debug_stats is enabled. * This routine provides a way to take a look at important statistics * maintained by the driver and hardware. * **********************************************************************/ static void ixv_print_debug_info(struct adapter *adapter) { device_t dev = adapter->dev; struct ixgbe_hw *hw = &adapter->hw; struct ix_queue *que = adapter->queues; struct rx_ring *rxr; struct tx_ring *txr; struct lro_ctrl *lro; device_printf(dev,"Error Byte Count = %u \n", IXGBE_READ_REG(hw, IXGBE_ERRBC)); for (int i = 0; i < adapter->num_queues; i++, que++) { txr = que->txr; rxr = que->rxr; lro = &rxr->lro; device_printf(dev,"QUE(%d) IRQs Handled: %lu\n", que->msix, (long)que->irqs); device_printf(dev,"RX(%d) Packets Received: %lld\n", rxr->me, (long long)rxr->rx_packets); device_printf(dev,"RX(%d) Bytes Received: %lu\n", rxr->me, (long)rxr->rx_bytes); - device_printf(dev,"RX(%d) LRO Queued= %d\n", - rxr->me, lro->lro_queued); - device_printf(dev,"RX(%d) LRO Flushed= %d\n", - rxr->me, lro->lro_flushed); + device_printf(dev,"RX(%d) LRO Queued= %lld\n", + rxr->me, (long long)lro->lro_queued); + device_printf(dev,"RX(%d) LRO Flushed= %lld\n", + rxr->me, (long long)lro->lro_flushed); device_printf(dev,"TX(%d) Packets Sent: %lu\n", txr->me, (long)txr->total_packets); device_printf(dev,"TX(%d) NO Desc Avail: %lu\n", txr->me, (long)txr->no_desc_avail); } device_printf(dev,"MBX IRQ Handled: %lu\n", (long)adapter->link_irq); return; } static int ixv_sysctl_debug(SYSCTL_HANDLER_ARGS) { int error, result; struct adapter *adapter; result = -1; error = sysctl_handle_int(oidp, &result, 0, req); if (error || !req->newptr) return (error); if (result == 1) { adapter = (struct adapter *) arg1; ixv_print_debug_info(adapter); } return error; } Index: head/sys/dev/mxge/if_mxge.c =================================================================== --- head/sys/dev/mxge/if_mxge.c (revision 294326) +++ head/sys/dev/mxge/if_mxge.c (revision 294327) @@ -1,5032 +1,5032 @@ /****************************************************************************** Copyright (c) 2006-2013, Myricom Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Myricom Inc, nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ***************************************************************************/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX for pci_cfg_restore */ #include /* for pmap_mapdev() */ #include #if defined(__i386) || defined(__amd64) #include #endif #include #include /*#define MXGE_FAKE_IFP*/ #include #ifdef IFNET_BUF_RING #include #endif #include "opt_inet.h" #include "opt_inet6.h" /* tunable params */ static int mxge_nvidia_ecrc_enable = 1; static int mxge_force_firmware = 0; static int mxge_intr_coal_delay = 30; static int mxge_deassert_wait = 1; static int mxge_flow_control = 1; static int mxge_verbose = 0; static int mxge_ticks; static int mxge_max_slices = 1; static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT; static int mxge_always_promisc = 0; static int mxge_initial_mtu = ETHERMTU_JUMBO; static int mxge_throttle = 0; static char *mxge_fw_unaligned = "mxge_ethp_z8e"; static char *mxge_fw_aligned = "mxge_eth_z8e"; static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e"; static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e"; static int mxge_probe(device_t dev); static int mxge_attach(device_t dev); static int mxge_detach(device_t dev); static int mxge_shutdown(device_t dev); static void mxge_intr(void *arg); static device_method_t mxge_methods[] = { /* Device interface */ DEVMETHOD(device_probe, mxge_probe), DEVMETHOD(device_attach, mxge_attach), DEVMETHOD(device_detach, mxge_detach), DEVMETHOD(device_shutdown, mxge_shutdown), DEVMETHOD_END }; static driver_t mxge_driver = { "mxge", mxge_methods, sizeof(mxge_softc_t), }; static devclass_t mxge_devclass; /* Declare ourselves to be a child of the PCI bus.*/ DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0); MODULE_DEPEND(mxge, firmware, 1, 1, 1); MODULE_DEPEND(mxge, zlib, 1, 1, 1); static int mxge_load_firmware(mxge_softc_t *sc, int adopt); static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data); static int mxge_close(mxge_softc_t *sc, int down); static int mxge_open(mxge_softc_t *sc); static void mxge_tick(void *arg); static int mxge_probe(device_t dev) { int rev; if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) && ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) || (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) { rev = pci_get_revid(dev); switch (rev) { case MXGE_PCI_REV_Z8E: device_set_desc(dev, "Myri10G-PCIE-8A"); break; case MXGE_PCI_REV_Z8ES: device_set_desc(dev, "Myri10G-PCIE-8B"); break; default: device_set_desc(dev, "Myri10G-PCIE-8??"); device_printf(dev, "Unrecognized rev %d NIC\n", rev); break; } return 0; } return ENXIO; } static void mxge_enable_wc(mxge_softc_t *sc) { #if defined(__i386) || defined(__amd64) vm_offset_t len; int err; sc->wc = 1; len = rman_get_size(sc->mem_res); err = pmap_change_attr((vm_offset_t) sc->sram, len, PAT_WRITE_COMBINING); if (err != 0) { device_printf(sc->dev, "pmap_change_attr failed, %d\n", err); sc->wc = 0; } #endif } /* callback to get our DMA address */ static void mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { if (error == 0) { *(bus_addr_t *) arg = segs->ds_addr; } } static int mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes, bus_size_t alignment) { int err; device_t dev = sc->dev; bus_size_t boundary, maxsegsize; if (bytes > 4096 && alignment == 4096) { boundary = 0; maxsegsize = bytes; } else { boundary = 4096; maxsegsize = 4096; } /* allocate DMAable memory tags */ err = bus_dma_tag_create(sc->parent_dmat, /* parent */ alignment, /* alignment */ boundary, /* boundary */ BUS_SPACE_MAXADDR, /* low */ BUS_SPACE_MAXADDR, /* high */ NULL, NULL, /* filter */ bytes, /* maxsize */ 1, /* num segs */ maxsegsize, /* maxsegsize */ BUS_DMA_COHERENT, /* flags */ NULL, NULL, /* lock */ &dma->dmat); /* tag */ if (err != 0) { device_printf(dev, "couldn't alloc tag (err = %d)\n", err); return err; } /* allocate DMAable memory & map */ err = bus_dmamem_alloc(dma->dmat, &dma->addr, (BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO), &dma->map); if (err != 0) { device_printf(dev, "couldn't alloc mem (err = %d)\n", err); goto abort_with_dmat; } /* load the memory */ err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes, mxge_dmamap_callback, (void *)&dma->bus_addr, 0); if (err != 0) { device_printf(dev, "couldn't load map (err = %d)\n", err); goto abort_with_mem; } return 0; abort_with_mem: bus_dmamem_free(dma->dmat, dma->addr, dma->map); abort_with_dmat: (void)bus_dma_tag_destroy(dma->dmat); return err; } static void mxge_dma_free(mxge_dma_t *dma) { bus_dmamap_unload(dma->dmat, dma->map); bus_dmamem_free(dma->dmat, dma->addr, dma->map); (void)bus_dma_tag_destroy(dma->dmat); } /* * The eeprom strings on the lanaiX have the format * SN=x\0 * MAC=x:x:x:x:x:x\0 * PC=text\0 */ static int mxge_parse_strings(mxge_softc_t *sc) { char *ptr; int i, found_mac, found_sn2; char *endptr; ptr = sc->eeprom_strings; found_mac = 0; found_sn2 = 0; while (*ptr != '\0') { if (strncmp(ptr, "MAC=", 4) == 0) { ptr += 4; for (i = 0;;) { sc->mac_addr[i] = strtoul(ptr, &endptr, 16); if (endptr - ptr != 2) goto abort; ptr = endptr; if (++i == 6) break; if (*ptr++ != ':') goto abort; } found_mac = 1; } else if (strncmp(ptr, "PC=", 3) == 0) { ptr += 3; strlcpy(sc->product_code_string, ptr, sizeof(sc->product_code_string)); } else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) { ptr += 3; strlcpy(sc->serial_number_string, ptr, sizeof(sc->serial_number_string)); } else if (strncmp(ptr, "SN2=", 4) == 0) { /* SN2 takes precedence over SN */ ptr += 4; found_sn2 = 1; strlcpy(sc->serial_number_string, ptr, sizeof(sc->serial_number_string)); } while (*ptr++ != '\0') {} } if (found_mac) return 0; abort: device_printf(sc->dev, "failed to parse eeprom_strings\n"); return ENXIO; } #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__ static void mxge_enable_nvidia_ecrc(mxge_softc_t *sc) { uint32_t val; unsigned long base, off; char *va, *cfgptr; device_t pdev, mcp55; uint16_t vendor_id, device_id, word; uintptr_t bus, slot, func, ivend, idev; uint32_t *ptr32; if (!mxge_nvidia_ecrc_enable) return; pdev = device_get_parent(device_get_parent(sc->dev)); if (pdev == NULL) { device_printf(sc->dev, "could not find parent?\n"); return; } vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2); device_id = pci_read_config(pdev, PCIR_DEVICE, 2); if (vendor_id != 0x10de) return; base = 0; if (device_id == 0x005d) { /* ck804, base address is magic */ base = 0xe0000000UL; } else if (device_id >= 0x0374 && device_id <= 0x378) { /* mcp55, base address stored in chipset */ mcp55 = pci_find_bsf(0, 0, 0); if (mcp55 && 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) && 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) { word = pci_read_config(mcp55, 0x90, 2); base = ((unsigned long)word & 0x7ffeU) << 25; } } if (!base) return; /* XXXX Test below is commented because it is believed that doing config read/write beyond 0xff will access the config space for the next larger function. Uncomment this and remove the hacky pmap_mapdev() way of accessing config space when FreeBSD grows support for extended pcie config space access */ #if 0 /* See if we can, by some miracle, access the extended config space */ val = pci_read_config(pdev, 0x178, 4); if (val != 0xffffffff) { val |= 0x40; pci_write_config(pdev, 0x178, val, 4); return; } #endif /* Rather than using normal pci config space writes, we must * map the Nvidia config space ourselves. This is because on * opteron/nvidia class machine the 0xe000000 mapping is * handled by the nvidia chipset, that means the internal PCI * device (the on-chip northbridge), or the amd-8131 bridge * and things behind them are not visible by this method. */ BUS_READ_IVAR(device_get_parent(pdev), pdev, PCI_IVAR_BUS, &bus); BUS_READ_IVAR(device_get_parent(pdev), pdev, PCI_IVAR_SLOT, &slot); BUS_READ_IVAR(device_get_parent(pdev), pdev, PCI_IVAR_FUNCTION, &func); BUS_READ_IVAR(device_get_parent(pdev), pdev, PCI_IVAR_VENDOR, &ivend); BUS_READ_IVAR(device_get_parent(pdev), pdev, PCI_IVAR_DEVICE, &idev); off = base + 0x00100000UL * (unsigned long)bus + 0x00001000UL * (unsigned long)(func + 8 * slot); /* map it into the kernel */ va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE); if (va == NULL) { device_printf(sc->dev, "pmap_kenter_temporary didn't\n"); return; } /* get a pointer to the config space mapped into the kernel */ cfgptr = va + (off & PAGE_MASK); /* make sure that we can really access it */ vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR); device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE); if (! (vendor_id == ivend && device_id == idev)) { device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n", vendor_id, device_id); pmap_unmapdev((vm_offset_t)va, PAGE_SIZE); return; } ptr32 = (uint32_t*)(cfgptr + 0x178); val = *ptr32; if (val == 0xffffffff) { device_printf(sc->dev, "extended mapping failed\n"); pmap_unmapdev((vm_offset_t)va, PAGE_SIZE); return; } *ptr32 = val | 0x40; pmap_unmapdev((vm_offset_t)va, PAGE_SIZE); if (mxge_verbose) device_printf(sc->dev, "Enabled ECRC on upstream Nvidia bridge " "at %d:%d:%d\n", (int)bus, (int)slot, (int)func); return; } #else static void mxge_enable_nvidia_ecrc(mxge_softc_t *sc) { device_printf(sc->dev, "Nforce 4 chipset on non-x86/amd64!?!?!\n"); return; } #endif static int mxge_dma_test(mxge_softc_t *sc, int test_type) { mxge_cmd_t cmd; bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr; int status; uint32_t len; char *test = " "; /* Run a small DMA test. * The magic multipliers to the length tell the firmware * to do DMA read, write, or read+write tests. The * results are returned in cmd.data0. The upper 16 * bits of the return is the number of transfers completed. * The lower 16 bits is the time in 0.5us ticks that the * transfers took to complete. */ len = sc->tx_boundary; cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus); cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus); cmd.data2 = len * 0x10000; status = mxge_send_cmd(sc, test_type, &cmd); if (status != 0) { test = "read"; goto abort; } sc->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff); cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus); cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus); cmd.data2 = len * 0x1; status = mxge_send_cmd(sc, test_type, &cmd); if (status != 0) { test = "write"; goto abort; } sc->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff); cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus); cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus); cmd.data2 = len * 0x10001; status = mxge_send_cmd(sc, test_type, &cmd); if (status != 0) { test = "read/write"; goto abort; } sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) / (cmd.data0 & 0xffff); abort: if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST) device_printf(sc->dev, "DMA %s benchmark failed: %d\n", test, status); return status; } /* * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput * when the PCI-E Completion packets are aligned on an 8-byte * boundary. Some PCI-E chip sets always align Completion packets; on * the ones that do not, the alignment can be enforced by enabling * ECRC generation (if supported). * * When PCI-E Completion packets are not aligned, it is actually more * efficient to limit Read-DMA transactions to 2KB, rather than 4KB. * * If the driver can neither enable ECRC nor verify that it has * already been enabled, then it must use a firmware image which works * around unaligned completion packets (ethp_z8e.dat), and it should * also ensure that it never gives the device a Read-DMA which is * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is * enabled, then the driver should use the aligned (eth_z8e.dat) * firmware image, and set tx_boundary to 4KB. */ static int mxge_firmware_probe(mxge_softc_t *sc) { device_t dev = sc->dev; int reg, status; uint16_t pectl; sc->tx_boundary = 4096; /* * Verify the max read request size was set to 4KB * before trying the test with 4KB. */ if (pci_find_cap(dev, PCIY_EXPRESS, ®) == 0) { pectl = pci_read_config(dev, reg + 0x8, 2); if ((pectl & (5 << 12)) != (5 << 12)) { device_printf(dev, "Max Read Req. size != 4k (0x%x\n", pectl); sc->tx_boundary = 2048; } } /* * load the optimized firmware (which assumes aligned PCIe * completions) in order to see if it works on this host. */ sc->fw_name = mxge_fw_aligned; status = mxge_load_firmware(sc, 1); if (status != 0) { return status; } /* * Enable ECRC if possible */ mxge_enable_nvidia_ecrc(sc); /* * Run a DMA test which watches for unaligned completions and * aborts on the first one seen. Not required on Z8ES or newer. */ if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES) return 0; status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST); if (status == 0) return 0; /* keep the aligned firmware */ if (status != E2BIG) device_printf(dev, "DMA test failed: %d\n", status); if (status == ENOSYS) device_printf(dev, "Falling back to ethp! " "Please install up to date fw\n"); return status; } static int mxge_select_firmware(mxge_softc_t *sc) { int aligned = 0; int force_firmware = mxge_force_firmware; if (sc->throttle) force_firmware = sc->throttle; if (force_firmware != 0) { if (force_firmware == 1) aligned = 1; else aligned = 0; if (mxge_verbose) device_printf(sc->dev, "Assuming %s completions (forced)\n", aligned ? "aligned" : "unaligned"); goto abort; } /* if the PCIe link width is 4 or less, we can use the aligned firmware and skip any checks */ if (sc->link_width != 0 && sc->link_width <= 4) { device_printf(sc->dev, "PCIe x%d Link, expect reduced performance\n", sc->link_width); aligned = 1; goto abort; } if (0 == mxge_firmware_probe(sc)) return 0; abort: if (aligned) { sc->fw_name = mxge_fw_aligned; sc->tx_boundary = 4096; } else { sc->fw_name = mxge_fw_unaligned; sc->tx_boundary = 2048; } return (mxge_load_firmware(sc, 0)); } static int mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr) { if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) { device_printf(sc->dev, "Bad firmware type: 0x%x\n", be32toh(hdr->mcp_type)); return EIO; } /* save firmware version for sysctl */ strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version)); if (mxge_verbose) device_printf(sc->dev, "firmware id: %s\n", hdr->version); sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major, &sc->fw_ver_minor, &sc->fw_ver_tiny); if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) { device_printf(sc->dev, "Found firmware version %s\n", sc->fw_version); device_printf(sc->dev, "Driver needs %d.%d\n", MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR); return EINVAL; } return 0; } static void * z_alloc(void *nil, u_int items, u_int size) { void *ptr; ptr = malloc(items * size, M_TEMP, M_NOWAIT); return ptr; } static void z_free(void *nil, void *ptr) { free(ptr, M_TEMP); } static int mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit) { z_stream zs; char *inflate_buffer; const struct firmware *fw; const mcp_gen_header_t *hdr; unsigned hdr_offset; int status; unsigned int i; char dummy; size_t fw_len; fw = firmware_get(sc->fw_name); if (fw == NULL) { device_printf(sc->dev, "Could not find firmware image %s\n", sc->fw_name); return ENOENT; } /* setup zlib and decompress f/w */ bzero(&zs, sizeof (zs)); zs.zalloc = z_alloc; zs.zfree = z_free; status = inflateInit(&zs); if (status != Z_OK) { status = EIO; goto abort_with_fw; } /* the uncompressed size is stored as the firmware version, which would otherwise go unused */ fw_len = (size_t) fw->version; inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT); if (inflate_buffer == NULL) goto abort_with_zs; zs.avail_in = fw->datasize; zs.next_in = __DECONST(char *, fw->data); zs.avail_out = fw_len; zs.next_out = inflate_buffer; status = inflate(&zs, Z_FINISH); if (status != Z_STREAM_END) { device_printf(sc->dev, "zlib %d\n", status); status = EIO; goto abort_with_buffer; } /* check id */ hdr_offset = htobe32(*(const uint32_t *) (inflate_buffer + MCP_HEADER_PTR_OFFSET)); if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) { device_printf(sc->dev, "Bad firmware file"); status = EIO; goto abort_with_buffer; } hdr = (const void*)(inflate_buffer + hdr_offset); status = mxge_validate_firmware(sc, hdr); if (status != 0) goto abort_with_buffer; /* Copy the inflated firmware to NIC SRAM. */ for (i = 0; i < fw_len; i += 256) { mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i, inflate_buffer + i, min(256U, (unsigned)(fw_len - i))); wmb(); dummy = *sc->sram; wmb(); } *limit = fw_len; status = 0; abort_with_buffer: free(inflate_buffer, M_TEMP); abort_with_zs: inflateEnd(&zs); abort_with_fw: firmware_put(fw, FIRMWARE_UNLOAD); return status; } /* * Enable or disable periodic RDMAs from the host to make certain * chipsets resend dropped PCIe messages */ static void mxge_dummy_rdma(mxge_softc_t *sc, int enable) { char buf_bytes[72]; volatile uint32_t *confirm; volatile char *submit; uint32_t *buf, dma_low, dma_high; int i; buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL); /* clear confirmation addr */ confirm = (volatile uint32_t *)sc->cmd; *confirm = 0; wmb(); /* send an rdma command to the PCIe engine, and wait for the response in the confirmation address. The firmware should write a -1 there to indicate it is alive and well */ dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr); dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr); buf[0] = htobe32(dma_high); /* confirm addr MSW */ buf[1] = htobe32(dma_low); /* confirm addr LSW */ buf[2] = htobe32(0xffffffff); /* confirm data */ dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr); dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr); buf[3] = htobe32(dma_high); /* dummy addr MSW */ buf[4] = htobe32(dma_low); /* dummy addr LSW */ buf[5] = htobe32(enable); /* enable? */ submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA); mxge_pio_copy(submit, buf, 64); wmb(); DELAY(1000); wmb(); i = 0; while (*confirm != 0xffffffff && i < 20) { DELAY(1000); i++; } if (*confirm != 0xffffffff) { device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)", (enable ? "enable" : "disable"), confirm, *confirm); } return; } static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data) { mcp_cmd_t *buf; char buf_bytes[sizeof(*buf) + 8]; volatile mcp_cmd_response_t *response = sc->cmd; volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD; uint32_t dma_low, dma_high; int err, sleep_total = 0; /* ensure buf is aligned to 8 bytes */ buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL); buf->data0 = htobe32(data->data0); buf->data1 = htobe32(data->data1); buf->data2 = htobe32(data->data2); buf->cmd = htobe32(cmd); dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr); dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr); buf->response_addr.low = htobe32(dma_low); buf->response_addr.high = htobe32(dma_high); mtx_lock(&sc->cmd_mtx); response->result = 0xffffffff; wmb(); mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf)); /* wait up to 20ms */ err = EAGAIN; for (sleep_total = 0; sleep_total < 20; sleep_total++) { bus_dmamap_sync(sc->cmd_dma.dmat, sc->cmd_dma.map, BUS_DMASYNC_POSTREAD); wmb(); switch (be32toh(response->result)) { case 0: data->data0 = be32toh(response->data); err = 0; break; case 0xffffffff: DELAY(1000); break; case MXGEFW_CMD_UNKNOWN: err = ENOSYS; break; case MXGEFW_CMD_ERROR_UNALIGNED: err = E2BIG; break; case MXGEFW_CMD_ERROR_BUSY: err = EBUSY; break; case MXGEFW_CMD_ERROR_I2C_ABSENT: err = ENXIO; break; default: device_printf(sc->dev, "mxge: command %d " "failed, result = %d\n", cmd, be32toh(response->result)); err = ENXIO; break; } if (err != EAGAIN) break; } if (err == EAGAIN) device_printf(sc->dev, "mxge: command %d timed out" "result = %d\n", cmd, be32toh(response->result)); mtx_unlock(&sc->cmd_mtx); return err; } static int mxge_adopt_running_firmware(mxge_softc_t *sc) { struct mcp_gen_header *hdr; const size_t bytes = sizeof (struct mcp_gen_header); size_t hdr_offset; int status; /* find running firmware header */ hdr_offset = htobe32(*(volatile uint32_t *) (sc->sram + MCP_HEADER_PTR_OFFSET)); if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) { device_printf(sc->dev, "Running firmware has bad header offset (%d)\n", (int)hdr_offset); return EIO; } /* copy header of running firmware from SRAM to host memory to * validate firmware */ hdr = malloc(bytes, M_DEVBUF, M_NOWAIT); if (hdr == NULL) { device_printf(sc->dev, "could not malloc firmware hdr\n"); return ENOMEM; } bus_space_read_region_1(rman_get_bustag(sc->mem_res), rman_get_bushandle(sc->mem_res), hdr_offset, (char *)hdr, bytes); status = mxge_validate_firmware(sc, hdr); free(hdr, M_DEVBUF); /* * check to see if adopted firmware has bug where adopting * it will cause broadcasts to be filtered unless the NIC * is kept in ALLMULTI mode */ if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 && sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) { sc->adopted_rx_filter_bug = 1; device_printf(sc->dev, "Adopting fw %d.%d.%d: " "working around rx filter bug\n", sc->fw_ver_major, sc->fw_ver_minor, sc->fw_ver_tiny); } return status; } static int mxge_load_firmware(mxge_softc_t *sc, int adopt) { volatile uint32_t *confirm; volatile char *submit; char buf_bytes[72]; uint32_t *buf, size, dma_low, dma_high; int status, i; buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL); size = sc->sram_size; status = mxge_load_firmware_helper(sc, &size); if (status) { if (!adopt) return status; /* Try to use the currently running firmware, if it is new enough */ status = mxge_adopt_running_firmware(sc); if (status) { device_printf(sc->dev, "failed to adopt running firmware\n"); return status; } device_printf(sc->dev, "Successfully adopted running firmware\n"); if (sc->tx_boundary == 4096) { device_printf(sc->dev, "Using firmware currently running on NIC" ". For optimal\n"); device_printf(sc->dev, "performance consider loading optimized " "firmware\n"); } sc->fw_name = mxge_fw_unaligned; sc->tx_boundary = 2048; return 0; } /* clear confirmation addr */ confirm = (volatile uint32_t *)sc->cmd; *confirm = 0; wmb(); /* send a reload command to the bootstrap MCP, and wait for the response in the confirmation address. The firmware should write a -1 there to indicate it is alive and well */ dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr); dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr); buf[0] = htobe32(dma_high); /* confirm addr MSW */ buf[1] = htobe32(dma_low); /* confirm addr LSW */ buf[2] = htobe32(0xffffffff); /* confirm data */ /* FIX: All newest firmware should un-protect the bottom of the sram before handoff. However, the very first interfaces do not. Therefore the handoff copy must skip the first 8 bytes */ /* where the code starts*/ buf[3] = htobe32(MXGE_FW_OFFSET + 8); buf[4] = htobe32(size - 8); /* length of code */ buf[5] = htobe32(8); /* where to copy to */ buf[6] = htobe32(0); /* where to jump to */ submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF); mxge_pio_copy(submit, buf, 64); wmb(); DELAY(1000); wmb(); i = 0; while (*confirm != 0xffffffff && i < 20) { DELAY(1000*10); i++; bus_dmamap_sync(sc->cmd_dma.dmat, sc->cmd_dma.map, BUS_DMASYNC_POSTREAD); } if (*confirm != 0xffffffff) { device_printf(sc->dev,"handoff failed (%p = 0x%x)", confirm, *confirm); return ENXIO; } return 0; } static int mxge_update_mac_address(mxge_softc_t *sc) { mxge_cmd_t cmd; uint8_t *addr = sc->mac_addr; int status; cmd.data0 = ((addr[0] << 24) | (addr[1] << 16) | (addr[2] << 8) | addr[3]); cmd.data1 = ((addr[4] << 8) | (addr[5])); status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd); return status; } static int mxge_change_pause(mxge_softc_t *sc, int pause) { mxge_cmd_t cmd; int status; if (pause) status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL, &cmd); else status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL, &cmd); if (status) { device_printf(sc->dev, "Failed to set flow control mode\n"); return ENXIO; } sc->pause = pause; return 0; } static void mxge_change_promisc(mxge_softc_t *sc, int promisc) { mxge_cmd_t cmd; int status; if (mxge_always_promisc) promisc = 1; if (promisc) status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC, &cmd); else status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC, &cmd); if (status) { device_printf(sc->dev, "Failed to set promisc mode\n"); } } static void mxge_set_multicast_list(mxge_softc_t *sc) { mxge_cmd_t cmd; struct ifmultiaddr *ifma; struct ifnet *ifp = sc->ifp; int err; /* This firmware is known to not support multicast */ if (!sc->fw_multicast_support) return; /* Disable multicast filtering while we play with the lists*/ err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd); if (err != 0) { device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI," " error status: %d\n", err); return; } if (sc->adopted_rx_filter_bug) return; if (ifp->if_flags & IFF_ALLMULTI) /* request to disable multicast filtering, so quit here */ return; /* Flush all the filters */ err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd); if (err != 0) { device_printf(sc->dev, "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS" ", error status: %d\n", err); return; } /* Walk the multicast list, and add each address */ if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &cmd.data0, 4); bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4, &cmd.data1, 2); cmd.data0 = htonl(cmd.data0); cmd.data1 = htonl(cmd.data1); err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd); if (err != 0) { device_printf(sc->dev, "Failed " "MXGEFW_JOIN_MULTICAST_GROUP, error status:" "%d\t", err); /* abort, leaving multicast filtering off */ if_maddr_runlock(ifp); return; } } if_maddr_runlock(ifp); /* Enable multicast filtering */ err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd); if (err != 0) { device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI" ", error status: %d\n", err); } } static int mxge_max_mtu(mxge_softc_t *sc) { mxge_cmd_t cmd; int status; if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU) return MXGEFW_MAX_MTU - MXGEFW_PAD; /* try to set nbufs to see if it we can use virtually contiguous jumbos */ cmd.data0 = 0; status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd); if (status == 0) return MXGEFW_MAX_MTU - MXGEFW_PAD; /* otherwise, we're limited to MJUMPAGESIZE */ return MJUMPAGESIZE - MXGEFW_PAD; } static int mxge_reset(mxge_softc_t *sc, int interrupts_setup) { struct mxge_slice_state *ss; mxge_rx_done_t *rx_done; volatile uint32_t *irq_claim; mxge_cmd_t cmd; int slice, status; /* try to send a reset command to the card to see if it is alive */ memset(&cmd, 0, sizeof (cmd)); status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd); if (status != 0) { device_printf(sc->dev, "failed reset\n"); return ENXIO; } mxge_dummy_rdma(sc, 1); /* set the intrq size */ cmd.data0 = sc->rx_ring_size; status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd); /* * Even though we already know how many slices are supported * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES * has magic side effects, and must be called after a reset. * It must be called prior to calling any RSS related cmds, * including assigning an interrupt queue for anything but * slice 0. It must also be called *after* * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by * the firmware to compute offsets. */ if (sc->num_slices > 1) { /* ask the maximum number of slices it supports */ status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd); if (status != 0) { device_printf(sc->dev, "failed to get number of slices\n"); return status; } /* * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior * to setting up the interrupt queue DMA */ cmd.data0 = sc->num_slices; cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE; #ifdef IFNET_BUF_RING cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES; #endif status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES, &cmd); if (status != 0) { device_printf(sc->dev, "failed to set number of slices\n"); return status; } } if (interrupts_setup) { /* Now exchange information about interrupts */ for (slice = 0; slice < sc->num_slices; slice++) { rx_done = &sc->ss[slice].rx_done; memset(rx_done->entry, 0, sc->rx_ring_size); cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr); cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr); cmd.data2 = slice; status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd); } } status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd); sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0); status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd); irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0); status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd); sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0); if (status != 0) { device_printf(sc->dev, "failed set interrupt parameters\n"); return status; } *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay); /* run a DMA benchmark */ (void) mxge_dma_test(sc, MXGEFW_DMA_TEST); for (slice = 0; slice < sc->num_slices; slice++) { ss = &sc->ss[slice]; ss->irq_claim = irq_claim + (2 * slice); /* reset mcp/driver shared state back to 0 */ ss->rx_done.idx = 0; ss->rx_done.cnt = 0; ss->tx.req = 0; ss->tx.done = 0; ss->tx.pkt_done = 0; ss->tx.queue_active = 0; ss->tx.activate = 0; ss->tx.deactivate = 0; ss->tx.wake = 0; ss->tx.defrag = 0; ss->tx.stall = 0; ss->rx_big.cnt = 0; ss->rx_small.cnt = 0; ss->lc.lro_bad_csum = 0; ss->lc.lro_queued = 0; ss->lc.lro_flushed = 0; if (ss->fw_stats != NULL) { bzero(ss->fw_stats, sizeof *ss->fw_stats); } } sc->rdma_tags_available = 15; status = mxge_update_mac_address(sc); mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC); mxge_change_pause(sc, sc->pause); mxge_set_multicast_list(sc); if (sc->throttle) { cmd.data0 = sc->throttle; if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd)) { device_printf(sc->dev, "can't enable throttle\n"); } } return status; } static int mxge_change_throttle(SYSCTL_HANDLER_ARGS) { mxge_cmd_t cmd; mxge_softc_t *sc; int err; unsigned int throttle; sc = arg1; throttle = sc->throttle; err = sysctl_handle_int(oidp, &throttle, arg2, req); if (err != 0) { return err; } if (throttle == sc->throttle) return 0; if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE) return EINVAL; mtx_lock(&sc->driver_mtx); cmd.data0 = throttle; err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd); if (err == 0) sc->throttle = throttle; mtx_unlock(&sc->driver_mtx); return err; } static int mxge_change_intr_coal(SYSCTL_HANDLER_ARGS) { mxge_softc_t *sc; unsigned int intr_coal_delay; int err; sc = arg1; intr_coal_delay = sc->intr_coal_delay; err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req); if (err != 0) { return err; } if (intr_coal_delay == sc->intr_coal_delay) return 0; if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000) return EINVAL; mtx_lock(&sc->driver_mtx); *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay); sc->intr_coal_delay = intr_coal_delay; mtx_unlock(&sc->driver_mtx); return err; } static int mxge_change_flow_control(SYSCTL_HANDLER_ARGS) { mxge_softc_t *sc; unsigned int enabled; int err; sc = arg1; enabled = sc->pause; err = sysctl_handle_int(oidp, &enabled, arg2, req); if (err != 0) { return err; } if (enabled == sc->pause) return 0; mtx_lock(&sc->driver_mtx); err = mxge_change_pause(sc, enabled); mtx_unlock(&sc->driver_mtx); return err; } static int mxge_handle_be32(SYSCTL_HANDLER_ARGS) { int err; if (arg1 == NULL) return EFAULT; arg2 = be32toh(*(int *)arg1); arg1 = NULL; err = sysctl_handle_int(oidp, arg1, arg2, req); return err; } static void mxge_rem_sysctls(mxge_softc_t *sc) { struct mxge_slice_state *ss; int slice; if (sc->slice_sysctl_tree == NULL) return; for (slice = 0; slice < sc->num_slices; slice++) { ss = &sc->ss[slice]; if (ss == NULL || ss->sysctl_tree == NULL) continue; sysctl_ctx_free(&ss->sysctl_ctx); ss->sysctl_tree = NULL; } sysctl_ctx_free(&sc->slice_sysctl_ctx); sc->slice_sysctl_tree = NULL; } static void mxge_add_sysctls(mxge_softc_t *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; mcp_irq_data_t *fw; struct mxge_slice_state *ss; int slice; char slice_num[8]; ctx = device_get_sysctl_ctx(sc->dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); fw = sc->ss[0].fw_stats; /* random information */ SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version, 0, "firmware version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "serial_number", CTLFLAG_RD, sc->serial_number_string, 0, "serial number"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "product_code", CTLFLAG_RD, sc->product_code_string, 0, "product_code"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcie_link_width", CTLFLAG_RD, &sc->link_width, 0, "tx_boundary"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_boundary", CTLFLAG_RD, &sc->tx_boundary, 0, "tx_boundary"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_combine", CTLFLAG_RD, &sc->wc, 0, "write combining PIO?"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_dma_MBs", CTLFLAG_RD, &sc->read_dma, 0, "DMA Read speed in MB/s"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_dma_MBs", CTLFLAG_RD, &sc->write_dma, 0, "DMA Write speed in MB/s"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_write_dma_MBs", CTLFLAG_RD, &sc->read_write_dma, 0, "DMA concurrent Read/Write speed in MB/s"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "watchdog_resets", CTLFLAG_RD, &sc->watchdog_resets, 0, "Number of times NIC was reset"); /* performance related tunables */ SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal_delay", CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_intr_coal, "I", "interrupt coalescing delay in usecs"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "throttle", CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_throttle, "I", "transmit throttling"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "flow_control_enabled", CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_flow_control, "I", "interrupt coalescing delay in usecs"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "deassert_wait", CTLFLAG_RW, &mxge_deassert_wait, 0, "Wait for IRQ line to go low in ihandler"); /* stats block from firmware is in network byte order. Need to swap it */ SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_up", CTLTYPE_INT|CTLFLAG_RD, &fw->link_up, 0, mxge_handle_be32, "I", "link up"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_tags_available", CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available, 0, mxge_handle_be32, "I", "rdma_tags_available"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_crc32", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_crc32, 0, mxge_handle_be32, "I", "dropped_bad_crc32"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_phy", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_phy, 0, mxge_handle_be32, "I", "dropped_bad_phy"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_error_or_filtered", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_error_or_filtered, 0, mxge_handle_be32, "I", "dropped_link_error_or_filtered"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_overflow", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow, 0, mxge_handle_be32, "I", "dropped_link_overflow"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_multicast_filtered", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_multicast_filtered, 0, mxge_handle_be32, "I", "dropped_multicast_filtered"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_big_buffer", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer, 0, mxge_handle_be32, "I", "dropped_no_big_buffer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_small_buffer", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_small_buffer, 0, mxge_handle_be32, "I", "dropped_no_small_buffer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_overrun", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun, 0, mxge_handle_be32, "I", "dropped_overrun"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_pause", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_pause, 0, mxge_handle_be32, "I", "dropped_pause"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_runt", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt, 0, mxge_handle_be32, "I", "dropped_runt"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_unicast_filtered", CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered, 0, mxge_handle_be32, "I", "dropped_unicast_filtered"); /* verbose printing? */ SYSCTL_ADD_INT(ctx, children, OID_AUTO, "verbose", CTLFLAG_RW, &mxge_verbose, 0, "verbose printing"); /* add counters exported for debugging from all slices */ sysctl_ctx_init(&sc->slice_sysctl_ctx); sc->slice_sysctl_tree = SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO, "slice", CTLFLAG_RD, 0, ""); for (slice = 0; slice < sc->num_slices; slice++) { ss = &sc->ss[slice]; sysctl_ctx_init(&ss->sysctl_ctx); ctx = &ss->sysctl_ctx; children = SYSCTL_CHILDREN(sc->slice_sysctl_tree); sprintf(slice_num, "%d", slice); ss->sysctl_tree = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num, CTLFLAG_RD, 0, ""); children = SYSCTL_CHILDREN(ss->sysctl_tree); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_small_cnt", CTLFLAG_RD, &ss->rx_small.cnt, 0, "rx_small_cnt"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_big_cnt", CTLFLAG_RD, &ss->rx_big.cnt, 0, "rx_small_cnt"); - SYSCTL_ADD_INT(ctx, children, OID_AUTO, + SYSCTL_ADD_U64(ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed, 0, "number of lro merge queues flushed"); - SYSCTL_ADD_INT(ctx, children, OID_AUTO, + SYSCTL_ADD_U64(ctx, children, OID_AUTO, "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum, 0, "number of bad csums preventing LRO"); - SYSCTL_ADD_INT(ctx, children, OID_AUTO, + SYSCTL_ADD_U64(ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued, 0, "number of frames appended to lro merge" "queues"); #ifndef IFNET_BUF_RING /* only transmit from slice 0 for now */ if (slice > 0) continue; #endif SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_req", CTLFLAG_RD, &ss->tx.req, 0, "tx_req"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_done", CTLFLAG_RD, &ss->tx.done, 0, "tx_done"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pkt_done", CTLFLAG_RD, &ss->tx.pkt_done, 0, "tx_done"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_stall", CTLFLAG_RD, &ss->tx.stall, 0, "tx_stall"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_wake", CTLFLAG_RD, &ss->tx.wake, 0, "tx_wake"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_defrag", CTLFLAG_RD, &ss->tx.defrag, 0, "tx_defrag"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_queue_active", CTLFLAG_RD, &ss->tx.queue_active, 0, "tx_queue_active"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_activate", CTLFLAG_RD, &ss->tx.activate, 0, "tx_activate"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_deactivate", CTLFLAG_RD, &ss->tx.deactivate, 0, "tx_deactivate"); } } /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy backwards one at a time and handle ring wraps */ static inline void mxge_submit_req_backwards(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt) { int idx, starting_slot; starting_slot = tx->req; while (cnt > 1) { cnt--; idx = (starting_slot + cnt) & tx->mask; mxge_pio_copy(&tx->lanai[idx], &src[cnt], sizeof(*src)); wmb(); } } /* * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy * at most 32 bytes at a time, so as to avoid involving the software * pio handler in the nic. We re-write the first segment's flags * to mark them valid only after writing the entire chain */ static inline void mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt) { int idx, i; uint32_t *src_ints; volatile uint32_t *dst_ints; mcp_kreq_ether_send_t *srcp; volatile mcp_kreq_ether_send_t *dstp, *dst; uint8_t last_flags; idx = tx->req & tx->mask; last_flags = src->flags; src->flags = 0; wmb(); dst = dstp = &tx->lanai[idx]; srcp = src; if ((idx + cnt) < tx->mask) { for (i = 0; i < (cnt - 1); i += 2) { mxge_pio_copy(dstp, srcp, 2 * sizeof(*src)); wmb(); /* force write every 32 bytes */ srcp += 2; dstp += 2; } } else { /* submit all but the first request, and ensure that it is submitted below */ mxge_submit_req_backwards(tx, src, cnt); i = 0; } if (i < cnt) { /* submit the first request */ mxge_pio_copy(dstp, srcp, sizeof(*src)); wmb(); /* barrier before setting valid flag */ } /* re-write the last 32-bits with the valid flags */ src->flags = last_flags; src_ints = (uint32_t *)src; src_ints+=3; dst_ints = (volatile uint32_t *)dst; dst_ints+=3; *dst_ints = *src_ints; tx->req += cnt; wmb(); } static int mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m, struct mxge_pkt_info *pi) { struct ether_vlan_header *eh; uint16_t etype; int tso = m->m_pkthdr.csum_flags & (CSUM_TSO); #if IFCAP_TSO6 && defined(INET6) int nxt; #endif eh = mtod(m, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { etype = ntohs(eh->evl_proto); pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { etype = ntohs(eh->evl_encap_proto); pi->ip_off = ETHER_HDR_LEN; } switch (etype) { case ETHERTYPE_IP: /* * ensure ip header is in first mbuf, copy it to a * scratch buffer if not */ pi->ip = (struct ip *)(m->m_data + pi->ip_off); pi->ip6 = NULL; if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) { m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip), ss->scratch); pi->ip = (struct ip *)(ss->scratch + pi->ip_off); } pi->ip_hlen = pi->ip->ip_hl << 2; if (!tso) return 0; if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen + sizeof(struct tcphdr))) { m_copydata(m, 0, pi->ip_off + pi->ip_hlen + sizeof(struct tcphdr), ss->scratch); pi->ip = (struct ip *)(ss->scratch + pi->ip_off); } pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen); break; #if IFCAP_TSO6 && defined(INET6) case ETHERTYPE_IPV6: pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off); if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) { m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6), ss->scratch); pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off); } nxt = 0; pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt); pi->ip_hlen -= pi->ip_off; if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) return EINVAL; if (!tso) return 0; if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen) return EINVAL; if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen + sizeof(struct tcphdr))) { m_copydata(m, 0, pi->ip_off + pi->ip_hlen + sizeof(struct tcphdr), ss->scratch); pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off); } pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen); break; #endif default: return EINVAL; } return 0; } #if IFCAP_TSO4 static void mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m, int busdma_seg_cnt, struct mxge_pkt_info *pi) { mxge_tx_ring_t *tx; mcp_kreq_ether_send_t *req; bus_dma_segment_t *seg; uint32_t low, high_swapped; int len, seglen, cum_len, cum_len_next; int next_is_first, chop, cnt, rdma_count, small; uint16_t pseudo_hdr_offset, cksum_offset, mss, sum; uint8_t flags, flags_next; static int once; mss = m->m_pkthdr.tso_segsz; /* negative cum_len signifies to the * send loop that we are still in the * header portion of the TSO packet. */ cksum_offset = pi->ip_off + pi->ip_hlen; cum_len = -(cksum_offset + (pi->tcp->th_off << 2)); /* TSO implies checksum offload on this hardware */ if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) { /* * If packet has full TCP csum, replace it with pseudo hdr * sum that the NIC expects, otherwise the NIC will emit * packets with bad TCP checksums. */ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); if (pi->ip6) { #if (CSUM_TCP_IPV6 != 0) && defined(INET6) m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6; sum = in6_cksum_pseudo(pi->ip6, m->m_pkthdr.len - cksum_offset, IPPROTO_TCP, 0); #endif } else { #ifdef INET m->m_pkthdr.csum_flags |= CSUM_TCP; sum = in_pseudo(pi->ip->ip_src.s_addr, pi->ip->ip_dst.s_addr, htons(IPPROTO_TCP + (m->m_pkthdr.len - cksum_offset))); #endif } m_copyback(m, offsetof(struct tcphdr, th_sum) + cksum_offset, sizeof(sum), (caddr_t)&sum); } flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST; /* for TSO, pseudo_hdr_offset holds mss. * The firmware figures out where to put * the checksum by parsing the header. */ pseudo_hdr_offset = htobe16(mss); if (pi->ip6) { /* * for IPv6 TSO, the "checksum offset" is re-purposed * to store the TCP header len */ cksum_offset = (pi->tcp->th_off << 2); } tx = &ss->tx; req = tx->req_list; seg = tx->seg_list; cnt = 0; rdma_count = 0; /* "rdma_count" is the number of RDMAs belonging to the * current packet BEFORE the current send request. For * non-TSO packets, this is equal to "count". * For TSO packets, rdma_count needs to be reset * to 0 after a segment cut. * * The rdma_count field of the send request is * the number of RDMAs of the packet starting at * that request. For TSO send requests with one ore more cuts * in the middle, this is the number of RDMAs starting * after the last cut in the request. All previous * segments before the last cut implicitly have 1 RDMA. * * Since the number of RDMAs is not known beforehand, * it must be filled-in retroactively - after each * segmentation cut or at the end of the entire packet. */ while (busdma_seg_cnt) { /* Break the busdma segment up into pieces*/ low = MXGE_LOWPART_TO_U32(seg->ds_addr); high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr)); len = seg->ds_len; while (len) { flags_next = flags & ~MXGEFW_FLAGS_FIRST; seglen = len; cum_len_next = cum_len + seglen; (req-rdma_count)->rdma_count = rdma_count + 1; if (__predict_true(cum_len >= 0)) { /* payload */ chop = (cum_len_next > mss); cum_len_next = cum_len_next % mss; next_is_first = (cum_len_next == 0); flags |= chop * MXGEFW_FLAGS_TSO_CHOP; flags_next |= next_is_first * MXGEFW_FLAGS_FIRST; rdma_count |= -(chop | next_is_first); rdma_count += chop & !next_is_first; } else if (cum_len_next >= 0) { /* header ends */ rdma_count = -1; cum_len_next = 0; seglen = -cum_len; small = (mss <= MXGEFW_SEND_SMALL_SIZE); flags_next = MXGEFW_FLAGS_TSO_PLD | MXGEFW_FLAGS_FIRST | (small * MXGEFW_FLAGS_SMALL); } req->addr_high = high_swapped; req->addr_low = htobe32(low); req->pseudo_hdr_offset = pseudo_hdr_offset; req->pad = 0; req->rdma_count = 1; req->length = htobe16(seglen); req->cksum_offset = cksum_offset; req->flags = flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD); low += seglen; len -= seglen; cum_len = cum_len_next; flags = flags_next; req++; cnt++; rdma_count++; if (cksum_offset != 0 && !pi->ip6) { if (__predict_false(cksum_offset > seglen)) cksum_offset -= seglen; else cksum_offset = 0; } if (__predict_false(cnt > tx->max_desc)) goto drop; } busdma_seg_cnt--; seg++; } (req-rdma_count)->rdma_count = rdma_count; do { req--; req->flags |= MXGEFW_FLAGS_TSO_LAST; } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST))); tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1; mxge_submit_req(tx, tx->req_list, cnt); #ifdef IFNET_BUF_RING if ((ss->sc->num_slices > 1) && tx->queue_active == 0) { /* tell the NIC to start polling this slice */ *tx->send_go = 1; tx->queue_active = 1; tx->activate++; wmb(); } #endif return; drop: bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map); m_freem(m); ss->oerrors++; if (!once) { printf("tx->max_desc exceeded via TSO!\n"); printf("mss = %d, %ld, %d!\n", mss, (long)seg - (long)tx->seg_list, tx->max_desc); once = 1; } return; } #endif /* IFCAP_TSO4 */ #ifdef MXGE_NEW_VLAN_API /* * We reproduce the software vlan tag insertion from * net/if_vlan.c:vlan_start() here so that we can advertise "hardware" * vlan tag insertion. We need to advertise this in order to have the * vlan interface respect our csum offload flags. */ static struct mbuf * mxge_vlan_tag_insert(struct mbuf *m) { struct ether_vlan_header *evl; M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT); if (__predict_false(m == NULL)) return NULL; if (m->m_len < sizeof(*evl)) { m = m_pullup(m, sizeof(*evl)); if (__predict_false(m == NULL)) return NULL; } /* * Transform the Ethernet header into an Ethernet header * with 802.1Q encapsulation. */ evl = mtod(m, struct ether_vlan_header *); bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN, (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN); evl->evl_encap_proto = htons(ETHERTYPE_VLAN); evl->evl_tag = htons(m->m_pkthdr.ether_vtag); m->m_flags &= ~M_VLANTAG; return m; } #endif /* MXGE_NEW_VLAN_API */ static void mxge_encap(struct mxge_slice_state *ss, struct mbuf *m) { struct mxge_pkt_info pi = {0,0,0,0}; mxge_softc_t *sc; mcp_kreq_ether_send_t *req; bus_dma_segment_t *seg; struct mbuf *m_tmp; struct ifnet *ifp; mxge_tx_ring_t *tx; int cnt, cum_len, err, i, idx, odd_flag; uint16_t pseudo_hdr_offset; uint8_t flags, cksum_offset; sc = ss->sc; ifp = sc->ifp; tx = &ss->tx; #ifdef MXGE_NEW_VLAN_API if (m->m_flags & M_VLANTAG) { m = mxge_vlan_tag_insert(m); if (__predict_false(m == NULL)) goto drop_without_m; } #endif if (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) { if (mxge_parse_tx(ss, m, &pi)) goto drop; } /* (try to) map the frame for DMA */ idx = tx->req & tx->mask; err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map, m, tx->seg_list, &cnt, BUS_DMA_NOWAIT); if (__predict_false(err == EFBIG)) { /* Too many segments in the chain. Try to defrag */ m_tmp = m_defrag(m, M_NOWAIT); if (m_tmp == NULL) { goto drop; } ss->tx.defrag++; m = m_tmp; err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map, m, tx->seg_list, &cnt, BUS_DMA_NOWAIT); } if (__predict_false(err != 0)) { device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d" " packet len = %d\n", err, m->m_pkthdr.len); goto drop; } bus_dmamap_sync(tx->dmat, tx->info[idx].map, BUS_DMASYNC_PREWRITE); tx->info[idx].m = m; #if IFCAP_TSO4 /* TSO is different enough, we handle it in another routine */ if (m->m_pkthdr.csum_flags & (CSUM_TSO)) { mxge_encap_tso(ss, m, cnt, &pi); return; } #endif req = tx->req_list; cksum_offset = 0; pseudo_hdr_offset = 0; flags = MXGEFW_FLAGS_NO_TSO; /* checksum offloading? */ if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) { /* ensure ip header is in first mbuf, copy it to a scratch buffer if not */ cksum_offset = pi.ip_off + pi.ip_hlen; pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data; pseudo_hdr_offset = htobe16(pseudo_hdr_offset); req->cksum_offset = cksum_offset; flags |= MXGEFW_FLAGS_CKSUM; odd_flag = MXGEFW_FLAGS_ALIGN_ODD; } else { odd_flag = 0; } if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE) flags |= MXGEFW_FLAGS_SMALL; /* convert segments into a request list */ cum_len = 0; seg = tx->seg_list; req->flags = MXGEFW_FLAGS_FIRST; for (i = 0; i < cnt; i++) { req->addr_low = htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr)); req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr)); req->length = htobe16(seg->ds_len); req->cksum_offset = cksum_offset; if (cksum_offset > seg->ds_len) cksum_offset -= seg->ds_len; else cksum_offset = 0; req->pseudo_hdr_offset = pseudo_hdr_offset; req->pad = 0; /* complete solid 16-byte block */ req->rdma_count = 1; req->flags |= flags | ((cum_len & 1) * odd_flag); cum_len += seg->ds_len; seg++; req++; req->flags = 0; } req--; /* pad runts to 60 bytes */ if (cum_len < 60) { req++; req->addr_low = htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr)); req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr)); req->length = htobe16(60 - cum_len); req->cksum_offset = 0; req->pseudo_hdr_offset = pseudo_hdr_offset; req->pad = 0; /* complete solid 16-byte block */ req->rdma_count = 1; req->flags |= flags | ((cum_len & 1) * odd_flag); cnt++; } tx->req_list[0].rdma_count = cnt; #if 0 /* print what the firmware will see */ for (i = 0; i < cnt; i++) { printf("%d: addr: 0x%x 0x%x len:%d pso%d," "cso:%d, flags:0x%x, rdma:%d\n", i, (int)ntohl(tx->req_list[i].addr_high), (int)ntohl(tx->req_list[i].addr_low), (int)ntohs(tx->req_list[i].length), (int)ntohs(tx->req_list[i].pseudo_hdr_offset), tx->req_list[i].cksum_offset, tx->req_list[i].flags, tx->req_list[i].rdma_count); } printf("--------------\n"); #endif tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1; mxge_submit_req(tx, tx->req_list, cnt); #ifdef IFNET_BUF_RING if ((ss->sc->num_slices > 1) && tx->queue_active == 0) { /* tell the NIC to start polling this slice */ *tx->send_go = 1; tx->queue_active = 1; tx->activate++; wmb(); } #endif return; drop: m_freem(m); drop_without_m: ss->oerrors++; return; } #ifdef IFNET_BUF_RING static void mxge_qflush(struct ifnet *ifp) { mxge_softc_t *sc = ifp->if_softc; mxge_tx_ring_t *tx; struct mbuf *m; int slice; for (slice = 0; slice < sc->num_slices; slice++) { tx = &sc->ss[slice].tx; mtx_lock(&tx->mtx); while ((m = buf_ring_dequeue_sc(tx->br)) != NULL) m_freem(m); mtx_unlock(&tx->mtx); } if_qflush(ifp); } static inline void mxge_start_locked(struct mxge_slice_state *ss) { mxge_softc_t *sc; struct mbuf *m; struct ifnet *ifp; mxge_tx_ring_t *tx; sc = ss->sc; ifp = sc->ifp; tx = &ss->tx; while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) { m = drbr_dequeue(ifp, tx->br); if (m == NULL) { return; } /* let BPF see it */ BPF_MTAP(ifp, m); /* give it to the nic */ mxge_encap(ss, m); } /* ran out of transmit slots */ if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0) && (!drbr_empty(ifp, tx->br))) { ss->if_drv_flags |= IFF_DRV_OACTIVE; tx->stall++; } } static int mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m) { mxge_softc_t *sc; struct ifnet *ifp; mxge_tx_ring_t *tx; int err; sc = ss->sc; ifp = sc->ifp; tx = &ss->tx; if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) { err = drbr_enqueue(ifp, tx->br, m); return (err); } if (!drbr_needs_enqueue(ifp, tx->br) && ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) { /* let BPF see it */ BPF_MTAP(ifp, m); /* give it to the nic */ mxge_encap(ss, m); } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) { return (err); } if (!drbr_empty(ifp, tx->br)) mxge_start_locked(ss); return (0); } static int mxge_transmit(struct ifnet *ifp, struct mbuf *m) { mxge_softc_t *sc = ifp->if_softc; struct mxge_slice_state *ss; mxge_tx_ring_t *tx; int err = 0; int slice; slice = m->m_pkthdr.flowid; slice &= (sc->num_slices - 1); /* num_slices always power of 2 */ ss = &sc->ss[slice]; tx = &ss->tx; if (mtx_trylock(&tx->mtx)) { err = mxge_transmit_locked(ss, m); mtx_unlock(&tx->mtx); } else { err = drbr_enqueue(ifp, tx->br, m); } return (err); } #else static inline void mxge_start_locked(struct mxge_slice_state *ss) { mxge_softc_t *sc; struct mbuf *m; struct ifnet *ifp; mxge_tx_ring_t *tx; sc = ss->sc; ifp = sc->ifp; tx = &ss->tx; while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) { IFQ_DRV_DEQUEUE(&ifp->if_snd, m); if (m == NULL) { return; } /* let BPF see it */ BPF_MTAP(ifp, m); /* give it to the nic */ mxge_encap(ss, m); } /* ran out of transmit slots */ if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) { sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE; tx->stall++; } } #endif static void mxge_start(struct ifnet *ifp) { mxge_softc_t *sc = ifp->if_softc; struct mxge_slice_state *ss; /* only use the first slice for now */ ss = &sc->ss[0]; mtx_lock(&ss->tx.mtx); mxge_start_locked(ss); mtx_unlock(&ss->tx.mtx); } /* * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy * at most 32 bytes at a time, so as to avoid involving the software * pio handler in the nic. We re-write the first segment's low * DMA address to mark it valid only after we write the entire chunk * in a burst */ static inline void mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst, mcp_kreq_ether_recv_t *src) { uint32_t low; low = src->addr_low; src->addr_low = 0xffffffff; mxge_pio_copy(dst, src, 4 * sizeof (*src)); wmb(); mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src)); wmb(); src->addr_low = low; dst->addr_low = low; wmb(); } static int mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx) { bus_dma_segment_t seg; struct mbuf *m; mxge_rx_ring_t *rx = &ss->rx_small; int cnt, err; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { rx->alloc_fail++; err = ENOBUFS; goto done; } m->m_len = MHLEN; err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m, &seg, &cnt, BUS_DMA_NOWAIT); if (err != 0) { m_free(m); goto done; } rx->info[idx].m = m; rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr)); rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr)); done: if ((idx & 7) == 7) mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]); return err; } static int mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx) { bus_dma_segment_t seg[3]; struct mbuf *m; mxge_rx_ring_t *rx = &ss->rx_big; int cnt, err, i; m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size); if (m == NULL) { rx->alloc_fail++; err = ENOBUFS; goto done; } m->m_len = rx->mlen; err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m, seg, &cnt, BUS_DMA_NOWAIT); if (err != 0) { m_free(m); goto done; } rx->info[idx].m = m; rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr)); rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr)); #if MXGE_VIRT_JUMBOS for (i = 1; i < cnt; i++) { rx->shadow[idx + i].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr)); rx->shadow[idx + i].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr)); } #endif done: for (i = 0; i < rx->nbufs; i++) { if ((idx & 7) == 7) { mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]); } idx++; } return err; } #ifdef INET6 static uint16_t mxge_csum_generic(uint16_t *raw, int len) { uint32_t csum; csum = 0; while (len > 0) { csum += *raw; raw++; len -= 2; } csum = (csum >> 16) + (csum & 0xffff); csum = (csum >> 16) + (csum & 0xffff); return (uint16_t)csum; } static inline uint16_t mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum) { uint32_t partial; int nxt, cksum_offset; struct ip6_hdr *ip6 = p; uint16_t c; nxt = ip6->ip6_nxt; cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN; if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) { cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN, IPPROTO_IPV6, &nxt); if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) return (1); } /* * IPv6 headers do not contain a checksum, and hence * do not checksum to zero, so they don't "fall out" * of the partial checksum calculation like IPv4 * headers do. We need to fix the partial checksum by * subtracting the checksum of the IPv6 header. */ partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset - ETHER_HDR_LEN); csum += ~partial; csum += (csum < ~partial); csum = (csum >> 16) + (csum & 0xFFFF); csum = (csum >> 16) + (csum & 0xFFFF); c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt, csum); c ^= 0xffff; return (c); } #endif /* INET6 */ /* * Myri10GE hardware checksums are not valid if the sender * padded the frame with non-zero padding. This is because * the firmware just does a simple 16-bit 1s complement * checksum across the entire frame, excluding the first 14 * bytes. It is best to simply to check the checksum and * tell the stack about it only if the checksum is good */ static inline uint16_t mxge_rx_csum(struct mbuf *m, int csum) { struct ether_header *eh; #ifdef INET struct ip *ip; #endif #if defined(INET) || defined(INET6) int cap = m->m_pkthdr.rcvif->if_capenable; #endif uint16_t c, etype; eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); switch (etype) { #ifdef INET case ETHERTYPE_IP: if ((cap & IFCAP_RXCSUM) == 0) return (1); ip = (struct ip *)(eh + 1); if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP) return (1); c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(ntohs(csum) + ntohs(ip->ip_len) - (ip->ip_hl << 2) + ip->ip_p)); c ^= 0xffff; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: if ((cap & IFCAP_RXCSUM_IPV6) == 0) return (1); c = mxge_rx_csum6((eh + 1), m, csum); break; #endif default: c = 1; } return (c); } static void mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum) { struct ether_vlan_header *evl; struct ether_header *eh; uint32_t partial; evl = mtod(m, struct ether_vlan_header *); eh = mtod(m, struct ether_header *); /* * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes * after what the firmware thought was the end of the ethernet * header. */ /* put checksum into host byte order */ *csum = ntohs(*csum); partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN)); (*csum) += ~partial; (*csum) += ((*csum) < ~partial); (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF); (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF); /* restore checksum to network byte order; later consumers expect this */ *csum = htons(*csum); /* save the tag */ #ifdef MXGE_NEW_VLAN_API m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); #else { struct m_tag *mtag; mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int), M_NOWAIT); if (mtag == NULL) return; VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag); m_tag_prepend(m, mtag); } #endif m->m_flags |= M_VLANTAG; /* * Remove the 802.1q header by copying the Ethernet * addresses over it and adjusting the beginning of * the data in the mbuf. The encapsulated Ethernet * type field is already in place. */ bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); } static inline void mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum, int lro) { mxge_softc_t *sc; struct ifnet *ifp; struct mbuf *m; struct ether_header *eh; mxge_rx_ring_t *rx; bus_dmamap_t old_map; int idx; sc = ss->sc; ifp = sc->ifp; rx = &ss->rx_big; idx = rx->cnt & rx->mask; rx->cnt += rx->nbufs; /* save a pointer to the received mbuf */ m = rx->info[idx].m; /* try to replace the received mbuf */ if (mxge_get_buf_big(ss, rx->extra_map, idx)) { /* drop the frame -- the old mbuf is re-cycled */ if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return; } /* unmap the received buffer */ old_map = rx->info[idx].map; bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rx->dmat, old_map); /* swap the bus_dmamap_t's */ rx->info[idx].map = rx->extra_map; rx->extra_map = old_map; /* mcp implicitly skips 1st 2 bytes so that packet is properly * aligned */ m->m_data += MXGEFW_PAD; m->m_pkthdr.rcvif = ifp; m->m_len = m->m_pkthdr.len = len; ss->ipackets++; eh = mtod(m, struct ether_header *); if (eh->ether_type == htons(ETHERTYPE_VLAN)) { mxge_vlan_tag_remove(m, &csum); } /* if the checksum is valid, mark it in the mbuf header */ if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) && (0 == mxge_rx_csum(m, csum))) { /* Tell the stack that the checksum is good */ m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID; #if defined(INET) || defined (INET6) if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0))) return; #endif } /* flowid only valid if RSS hashing is enabled */ if (sc->num_slices > 1) { m->m_pkthdr.flowid = (ss - sc->ss); M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); } /* pass the frame up the stack */ (*ifp->if_input)(ifp, m); } static inline void mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum, int lro) { mxge_softc_t *sc; struct ifnet *ifp; struct ether_header *eh; struct mbuf *m; mxge_rx_ring_t *rx; bus_dmamap_t old_map; int idx; sc = ss->sc; ifp = sc->ifp; rx = &ss->rx_small; idx = rx->cnt & rx->mask; rx->cnt++; /* save a pointer to the received mbuf */ m = rx->info[idx].m; /* try to replace the received mbuf */ if (mxge_get_buf_small(ss, rx->extra_map, idx)) { /* drop the frame -- the old mbuf is re-cycled */ if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return; } /* unmap the received buffer */ old_map = rx->info[idx].map; bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rx->dmat, old_map); /* swap the bus_dmamap_t's */ rx->info[idx].map = rx->extra_map; rx->extra_map = old_map; /* mcp implicitly skips 1st 2 bytes so that packet is properly * aligned */ m->m_data += MXGEFW_PAD; m->m_pkthdr.rcvif = ifp; m->m_len = m->m_pkthdr.len = len; ss->ipackets++; eh = mtod(m, struct ether_header *); if (eh->ether_type == htons(ETHERTYPE_VLAN)) { mxge_vlan_tag_remove(m, &csum); } /* if the checksum is valid, mark it in the mbuf header */ if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) && (0 == mxge_rx_csum(m, csum))) { /* Tell the stack that the checksum is good */ m->m_pkthdr.csum_data = 0xffff; m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID; #if defined(INET) || defined (INET6) if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum))) return; #endif } /* flowid only valid if RSS hashing is enabled */ if (sc->num_slices > 1) { m->m_pkthdr.flowid = (ss - sc->ss); M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); } /* pass the frame up the stack */ (*ifp->if_input)(ifp, m); } static inline void mxge_clean_rx_done(struct mxge_slice_state *ss) { mxge_rx_done_t *rx_done = &ss->rx_done; int limit = 0; uint16_t length; uint16_t checksum; int lro; lro = ss->sc->ifp->if_capenable & IFCAP_LRO; while (rx_done->entry[rx_done->idx].length != 0) { length = ntohs(rx_done->entry[rx_done->idx].length); rx_done->entry[rx_done->idx].length = 0; checksum = rx_done->entry[rx_done->idx].checksum; if (length <= (MHLEN - MXGEFW_PAD)) mxge_rx_done_small(ss, length, checksum, lro); else mxge_rx_done_big(ss, length, checksum, lro); rx_done->cnt++; rx_done->idx = rx_done->cnt & rx_done->mask; /* limit potential for livelock */ if (__predict_false(++limit > rx_done->mask / 2)) break; } #if defined(INET) || defined (INET6) while (!SLIST_EMPTY(&ss->lc.lro_active)) { struct lro_entry *lro = SLIST_FIRST(&ss->lc.lro_active); SLIST_REMOVE_HEAD(&ss->lc.lro_active, next); tcp_lro_flush(&ss->lc, lro); } #endif } static inline void mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx) { struct ifnet *ifp; mxge_tx_ring_t *tx; struct mbuf *m; bus_dmamap_t map; int idx; int *flags; tx = &ss->tx; ifp = ss->sc->ifp; while (tx->pkt_done != mcp_idx) { idx = tx->done & tx->mask; tx->done++; m = tx->info[idx].m; /* mbuf and DMA map only attached to the first segment per-mbuf */ if (m != NULL) { ss->obytes += m->m_pkthdr.len; if (m->m_flags & M_MCAST) ss->omcasts++; ss->opackets++; tx->info[idx].m = NULL; map = tx->info[idx].map; bus_dmamap_unload(tx->dmat, map); m_freem(m); } if (tx->info[idx].flag) { tx->info[idx].flag = 0; tx->pkt_done++; } } /* If we have space, clear IFF_OACTIVE to tell the stack that its OK to send packets */ #ifdef IFNET_BUF_RING flags = &ss->if_drv_flags; #else flags = &ifp->if_drv_flags; #endif mtx_lock(&ss->tx.mtx); if ((*flags) & IFF_DRV_OACTIVE && tx->req - tx->done < (tx->mask + 1)/4) { *(flags) &= ~IFF_DRV_OACTIVE; ss->tx.wake++; mxge_start_locked(ss); } #ifdef IFNET_BUF_RING if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) { /* let the NIC stop polling this queue, since there * are no more transmits pending */ if (tx->req == tx->done) { *tx->send_stop = 1; tx->queue_active = 0; tx->deactivate++; wmb(); } } #endif mtx_unlock(&ss->tx.mtx); } static struct mxge_media_type mxge_xfp_media_types[] = { {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"}, {IFM_10G_SR, (1 << 7), "10GBASE-SR"}, {IFM_10G_LR, (1 << 6), "10GBASE-LR"}, {0, (1 << 5), "10GBASE-ER"}, {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"}, {0, (1 << 3), "10GBASE-SW"}, {0, (1 << 2), "10GBASE-LW"}, {0, (1 << 1), "10GBASE-EW"}, {0, (1 << 0), "Reserved"} }; static struct mxge_media_type mxge_sfp_media_types[] = { {IFM_10G_TWINAX, 0, "10GBASE-Twinax"}, {0, (1 << 7), "Reserved"}, {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"}, {IFM_10G_LR, (1 << 5), "10GBASE-LR"}, {IFM_10G_SR, (1 << 4), "10GBASE-SR"}, {IFM_10G_TWINAX,(1 << 0), "10GBASE-Twinax"} }; static void mxge_media_set(mxge_softc_t *sc, int media_type) { ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type, 0, NULL); ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type); sc->current_media = media_type; sc->media.ifm_media = sc->media.ifm_cur->ifm_media; } static void mxge_media_init(mxge_softc_t *sc) { char *ptr; int i; ifmedia_removeall(&sc->media); mxge_media_set(sc, IFM_AUTO); /* * parse the product code to deterimine the interface type * (CX4, XFP, Quad Ribbon Fiber) by looking at the character * after the 3rd dash in the driver's cached copy of the * EEPROM's product code string. */ ptr = sc->product_code_string; if (ptr == NULL) { device_printf(sc->dev, "Missing product code\n"); return; } for (i = 0; i < 3; i++, ptr++) { ptr = strchr(ptr, '-'); if (ptr == NULL) { device_printf(sc->dev, "only %d dashes in PC?!?\n", i); return; } } if (*ptr == 'C' || *(ptr +1) == 'C') { /* -C is CX4 */ sc->connector = MXGE_CX4; mxge_media_set(sc, IFM_10G_CX4); } else if (*ptr == 'Q') { /* -Q is Quad Ribbon Fiber */ sc->connector = MXGE_QRF; device_printf(sc->dev, "Quad Ribbon Fiber Media\n"); /* FreeBSD has no media type for Quad ribbon fiber */ } else if (*ptr == 'R') { /* -R is XFP */ sc->connector = MXGE_XFP; } else if (*ptr == 'S' || *(ptr +1) == 'S') { /* -S or -2S is SFP+ */ sc->connector = MXGE_SFP; } else { device_printf(sc->dev, "Unknown media type: %c\n", *ptr); } } /* * Determine the media type for a NIC. Some XFPs will identify * themselves only when their link is up, so this is initiated via a * link up interrupt. However, this can potentially take up to * several milliseconds, so it is run via the watchdog routine, rather * than in the interrupt handler itself. */ static void mxge_media_probe(mxge_softc_t *sc) { mxge_cmd_t cmd; char *cage_type; struct mxge_media_type *mxge_media_types = NULL; int i, err, ms, mxge_media_type_entries; uint32_t byte; sc->need_media_probe = 0; if (sc->connector == MXGE_XFP) { /* -R is XFP */ mxge_media_types = mxge_xfp_media_types; mxge_media_type_entries = sizeof (mxge_xfp_media_types) / sizeof (mxge_xfp_media_types[0]); byte = MXGE_XFP_COMPLIANCE_BYTE; cage_type = "XFP"; } else if (sc->connector == MXGE_SFP) { /* -S or -2S is SFP+ */ mxge_media_types = mxge_sfp_media_types; mxge_media_type_entries = sizeof (mxge_sfp_media_types) / sizeof (mxge_sfp_media_types[0]); cage_type = "SFP+"; byte = 3; } else { /* nothing to do; media type cannot change */ return; } /* * At this point we know the NIC has an XFP cage, so now we * try to determine what is in the cage by using the * firmware's XFP I2C commands to read the XFP 10GbE compilance * register. We read just one byte, which may take over * a millisecond */ cmd.data0 = 0; /* just fetch 1 byte, not all 256 */ cmd.data1 = byte; err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd); if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) { device_printf(sc->dev, "failed to read XFP\n"); } if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) { device_printf(sc->dev, "Type R/S with no XFP!?!?\n"); } if (err != MXGEFW_CMD_OK) { return; } /* now we wait for the data to be cached */ cmd.data0 = byte; err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd); for (ms = 0; (err == EBUSY) && (ms < 50); ms++) { DELAY(1000); cmd.data0 = byte; err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd); } if (err != MXGEFW_CMD_OK) { device_printf(sc->dev, "failed to read %s (%d, %dms)\n", cage_type, err, ms); return; } if (cmd.data0 == mxge_media_types[0].bitmask) { if (mxge_verbose) device_printf(sc->dev, "%s:%s\n", cage_type, mxge_media_types[0].name); if (sc->current_media != mxge_media_types[0].flag) { mxge_media_init(sc); mxge_media_set(sc, mxge_media_types[0].flag); } return; } for (i = 1; i < mxge_media_type_entries; i++) { if (cmd.data0 & mxge_media_types[i].bitmask) { if (mxge_verbose) device_printf(sc->dev, "%s:%s\n", cage_type, mxge_media_types[i].name); if (sc->current_media != mxge_media_types[i].flag) { mxge_media_init(sc); mxge_media_set(sc, mxge_media_types[i].flag); } return; } } if (mxge_verbose) device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type, cmd.data0); return; } static void mxge_intr(void *arg) { struct mxge_slice_state *ss = arg; mxge_softc_t *sc = ss->sc; mcp_irq_data_t *stats = ss->fw_stats; mxge_tx_ring_t *tx = &ss->tx; mxge_rx_done_t *rx_done = &ss->rx_done; uint32_t send_done_count; uint8_t valid; #ifndef IFNET_BUF_RING /* an interrupt on a non-zero slice is implicitly valid since MSI-X irqs are not shared */ if (ss != sc->ss) { mxge_clean_rx_done(ss); *ss->irq_claim = be32toh(3); return; } #endif /* make sure the DMA has finished */ if (!stats->valid) { return; } valid = stats->valid; if (sc->legacy_irq) { /* lower legacy IRQ */ *sc->irq_deassert = 0; if (!mxge_deassert_wait) /* don't wait for conf. that irq is low */ stats->valid = 0; } else { stats->valid = 0; } /* loop while waiting for legacy irq deassertion */ do { /* check for transmit completes and receives */ send_done_count = be32toh(stats->send_done_count); while ((send_done_count != tx->pkt_done) || (rx_done->entry[rx_done->idx].length != 0)) { if (send_done_count != tx->pkt_done) mxge_tx_done(ss, (int)send_done_count); mxge_clean_rx_done(ss); send_done_count = be32toh(stats->send_done_count); } if (sc->legacy_irq && mxge_deassert_wait) wmb(); } while (*((volatile uint8_t *) &stats->valid)); /* fw link & error stats meaningful only on the first slice */ if (__predict_false((ss == sc->ss) && stats->stats_updated)) { if (sc->link_state != stats->link_up) { sc->link_state = stats->link_up; if (sc->link_state) { if_link_state_change(sc->ifp, LINK_STATE_UP); if (mxge_verbose) device_printf(sc->dev, "link up\n"); } else { if_link_state_change(sc->ifp, LINK_STATE_DOWN); if (mxge_verbose) device_printf(sc->dev, "link down\n"); } sc->need_media_probe = 1; } if (sc->rdma_tags_available != be32toh(stats->rdma_tags_available)) { sc->rdma_tags_available = be32toh(stats->rdma_tags_available); device_printf(sc->dev, "RDMA timed out! %d tags " "left\n", sc->rdma_tags_available); } if (stats->link_down) { sc->down_cnt += stats->link_down; sc->link_state = 0; if_link_state_change(sc->ifp, LINK_STATE_DOWN); } } /* check to see if we have rx token to pass back */ if (valid & 0x1) *ss->irq_claim = be32toh(3); *(ss->irq_claim + 1) = be32toh(3); } static void mxge_init(void *arg) { mxge_softc_t *sc = arg; struct ifnet *ifp = sc->ifp; mtx_lock(&sc->driver_mtx); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) (void) mxge_open(sc); mtx_unlock(&sc->driver_mtx); } static void mxge_free_slice_mbufs(struct mxge_slice_state *ss) { int i; #if defined(INET) || defined(INET6) tcp_lro_free(&ss->lc); #endif for (i = 0; i <= ss->rx_big.mask; i++) { if (ss->rx_big.info[i].m == NULL) continue; bus_dmamap_unload(ss->rx_big.dmat, ss->rx_big.info[i].map); m_freem(ss->rx_big.info[i].m); ss->rx_big.info[i].m = NULL; } for (i = 0; i <= ss->rx_small.mask; i++) { if (ss->rx_small.info[i].m == NULL) continue; bus_dmamap_unload(ss->rx_small.dmat, ss->rx_small.info[i].map); m_freem(ss->rx_small.info[i].m); ss->rx_small.info[i].m = NULL; } /* transmit ring used only on the first slice */ if (ss->tx.info == NULL) return; for (i = 0; i <= ss->tx.mask; i++) { ss->tx.info[i].flag = 0; if (ss->tx.info[i].m == NULL) continue; bus_dmamap_unload(ss->tx.dmat, ss->tx.info[i].map); m_freem(ss->tx.info[i].m); ss->tx.info[i].m = NULL; } } static void mxge_free_mbufs(mxge_softc_t *sc) { int slice; for (slice = 0; slice < sc->num_slices; slice++) mxge_free_slice_mbufs(&sc->ss[slice]); } static void mxge_free_slice_rings(struct mxge_slice_state *ss) { int i; if (ss->rx_done.entry != NULL) mxge_dma_free(&ss->rx_done.dma); ss->rx_done.entry = NULL; if (ss->tx.req_bytes != NULL) free(ss->tx.req_bytes, M_DEVBUF); ss->tx.req_bytes = NULL; if (ss->tx.seg_list != NULL) free(ss->tx.seg_list, M_DEVBUF); ss->tx.seg_list = NULL; if (ss->rx_small.shadow != NULL) free(ss->rx_small.shadow, M_DEVBUF); ss->rx_small.shadow = NULL; if (ss->rx_big.shadow != NULL) free(ss->rx_big.shadow, M_DEVBUF); ss->rx_big.shadow = NULL; if (ss->tx.info != NULL) { if (ss->tx.dmat != NULL) { for (i = 0; i <= ss->tx.mask; i++) { bus_dmamap_destroy(ss->tx.dmat, ss->tx.info[i].map); } bus_dma_tag_destroy(ss->tx.dmat); } free(ss->tx.info, M_DEVBUF); } ss->tx.info = NULL; if (ss->rx_small.info != NULL) { if (ss->rx_small.dmat != NULL) { for (i = 0; i <= ss->rx_small.mask; i++) { bus_dmamap_destroy(ss->rx_small.dmat, ss->rx_small.info[i].map); } bus_dmamap_destroy(ss->rx_small.dmat, ss->rx_small.extra_map); bus_dma_tag_destroy(ss->rx_small.dmat); } free(ss->rx_small.info, M_DEVBUF); } ss->rx_small.info = NULL; if (ss->rx_big.info != NULL) { if (ss->rx_big.dmat != NULL) { for (i = 0; i <= ss->rx_big.mask; i++) { bus_dmamap_destroy(ss->rx_big.dmat, ss->rx_big.info[i].map); } bus_dmamap_destroy(ss->rx_big.dmat, ss->rx_big.extra_map); bus_dma_tag_destroy(ss->rx_big.dmat); } free(ss->rx_big.info, M_DEVBUF); } ss->rx_big.info = NULL; } static void mxge_free_rings(mxge_softc_t *sc) { int slice; for (slice = 0; slice < sc->num_slices; slice++) mxge_free_slice_rings(&sc->ss[slice]); } static int mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries, int tx_ring_entries) { mxge_softc_t *sc = ss->sc; size_t bytes; int err, i; /* allocate per-slice receive resources */ ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1; ss->rx_done.mask = (2 * rx_ring_entries) - 1; /* allocate the rx shadow rings */ bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow); ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK); bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow); ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK); /* allocate the rx host info rings */ bytes = rx_ring_entries * sizeof (*ss->rx_small.info); ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK); bytes = rx_ring_entries * sizeof (*ss->rx_big.info); ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK); /* allocate the rx busdma resources */ err = bus_dma_tag_create(sc->parent_dmat, /* parent */ 1, /* alignment */ 4096, /* boundary */ BUS_SPACE_MAXADDR, /* low */ BUS_SPACE_MAXADDR, /* high */ NULL, NULL, /* filter */ MHLEN, /* maxsize */ 1, /* num segs */ MHLEN, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, NULL, /* lock */ &ss->rx_small.dmat); /* tag */ if (err != 0) { device_printf(sc->dev, "Err %d allocating rx_small dmat\n", err); return err; } err = bus_dma_tag_create(sc->parent_dmat, /* parent */ 1, /* alignment */ #if MXGE_VIRT_JUMBOS 4096, /* boundary */ #else 0, /* boundary */ #endif BUS_SPACE_MAXADDR, /* low */ BUS_SPACE_MAXADDR, /* high */ NULL, NULL, /* filter */ 3*4096, /* maxsize */ #if MXGE_VIRT_JUMBOS 3, /* num segs */ 4096, /* maxsegsize*/ #else 1, /* num segs */ MJUM9BYTES, /* maxsegsize*/ #endif BUS_DMA_ALLOCNOW, /* flags */ NULL, NULL, /* lock */ &ss->rx_big.dmat); /* tag */ if (err != 0) { device_printf(sc->dev, "Err %d allocating rx_big dmat\n", err); return err; } for (i = 0; i <= ss->rx_small.mask; i++) { err = bus_dmamap_create(ss->rx_small.dmat, 0, &ss->rx_small.info[i].map); if (err != 0) { device_printf(sc->dev, "Err %d rx_small dmamap\n", err); return err; } } err = bus_dmamap_create(ss->rx_small.dmat, 0, &ss->rx_small.extra_map); if (err != 0) { device_printf(sc->dev, "Err %d extra rx_small dmamap\n", err); return err; } for (i = 0; i <= ss->rx_big.mask; i++) { err = bus_dmamap_create(ss->rx_big.dmat, 0, &ss->rx_big.info[i].map); if (err != 0) { device_printf(sc->dev, "Err %d rx_big dmamap\n", err); return err; } } err = bus_dmamap_create(ss->rx_big.dmat, 0, &ss->rx_big.extra_map); if (err != 0) { device_printf(sc->dev, "Err %d extra rx_big dmamap\n", err); return err; } /* now allocate TX resources */ #ifndef IFNET_BUF_RING /* only use a single TX ring for now */ if (ss != ss->sc->ss) return 0; #endif ss->tx.mask = tx_ring_entries - 1; ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4); /* allocate the tx request copy block */ bytes = 8 + sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4); ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK); /* ensure req_list entries are aligned to 8 bytes */ ss->tx.req_list = (mcp_kreq_ether_send_t *) ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL); /* allocate the tx busdma segment list */ bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc; ss->tx.seg_list = (bus_dma_segment_t *) malloc(bytes, M_DEVBUF, M_WAITOK); /* allocate the tx host info ring */ bytes = tx_ring_entries * sizeof (*ss->tx.info); ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK); /* allocate the tx busdma resources */ err = bus_dma_tag_create(sc->parent_dmat, /* parent */ 1, /* alignment */ sc->tx_boundary, /* boundary */ BUS_SPACE_MAXADDR, /* low */ BUS_SPACE_MAXADDR, /* high */ NULL, NULL, /* filter */ 65536 + 256, /* maxsize */ ss->tx.max_desc - 2, /* num segs */ sc->tx_boundary, /* maxsegsz */ BUS_DMA_ALLOCNOW, /* flags */ NULL, NULL, /* lock */ &ss->tx.dmat); /* tag */ if (err != 0) { device_printf(sc->dev, "Err %d allocating tx dmat\n", err); return err; } /* now use these tags to setup dmamaps for each slot in the ring */ for (i = 0; i <= ss->tx.mask; i++) { err = bus_dmamap_create(ss->tx.dmat, 0, &ss->tx.info[i].map); if (err != 0) { device_printf(sc->dev, "Err %d tx dmamap\n", err); return err; } } return 0; } static int mxge_alloc_rings(mxge_softc_t *sc) { mxge_cmd_t cmd; int tx_ring_size; int tx_ring_entries, rx_ring_entries; int err, slice; /* get ring sizes */ err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd); tx_ring_size = cmd.data0; if (err != 0) { device_printf(sc->dev, "Cannot determine tx ring sizes\n"); goto abort; } tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t); rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t); IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1); sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen; IFQ_SET_READY(&sc->ifp->if_snd); for (slice = 0; slice < sc->num_slices; slice++) { err = mxge_alloc_slice_rings(&sc->ss[slice], rx_ring_entries, tx_ring_entries); if (err != 0) goto abort; } return 0; abort: mxge_free_rings(sc); return err; } static void mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs) { int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD; if (bufsize < MCLBYTES) { /* easy, everything fits in a single buffer */ *big_buf_size = MCLBYTES; *cl_size = MCLBYTES; *nbufs = 1; return; } if (bufsize < MJUMPAGESIZE) { /* still easy, everything still fits in a single buffer */ *big_buf_size = MJUMPAGESIZE; *cl_size = MJUMPAGESIZE; *nbufs = 1; return; } #if MXGE_VIRT_JUMBOS /* now we need to use virtually contiguous buffers */ *cl_size = MJUM9BYTES; *big_buf_size = 4096; *nbufs = mtu / 4096 + 1; /* needs to be a power of two, so round up */ if (*nbufs == 3) *nbufs = 4; #else *cl_size = MJUM9BYTES; *big_buf_size = MJUM9BYTES; *nbufs = 1; #endif } static int mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size) { mxge_softc_t *sc; mxge_cmd_t cmd; bus_dmamap_t map; int err, i, slice; sc = ss->sc; slice = ss - sc->ss; #if defined(INET) || defined(INET6) (void)tcp_lro_init(&ss->lc); #endif ss->lc.ifp = sc->ifp; /* get the lanai pointers to the send and receive rings */ err = 0; #ifndef IFNET_BUF_RING /* We currently only send from the first slice */ if (slice == 0) { #endif cmd.data0 = slice; err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd); ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0); ss->tx.send_go = (volatile uint32_t *) (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice); ss->tx.send_stop = (volatile uint32_t *) (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice); #ifndef IFNET_BUF_RING } #endif cmd.data0 = slice; err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd); ss->rx_small.lanai = (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0); cmd.data0 = slice; err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd); ss->rx_big.lanai = (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0); if (err != 0) { device_printf(sc->dev, "failed to get ring sizes or locations\n"); return EIO; } /* stock receive rings */ for (i = 0; i <= ss->rx_small.mask; i++) { map = ss->rx_small.info[i].map; err = mxge_get_buf_small(ss, map, i); if (err) { device_printf(sc->dev, "alloced %d/%d smalls\n", i, ss->rx_small.mask + 1); return ENOMEM; } } for (i = 0; i <= ss->rx_big.mask; i++) { ss->rx_big.shadow[i].addr_low = 0xffffffff; ss->rx_big.shadow[i].addr_high = 0xffffffff; } ss->rx_big.nbufs = nbufs; ss->rx_big.cl_size = cl_size; ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD; for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) { map = ss->rx_big.info[i].map; err = mxge_get_buf_big(ss, map, i); if (err) { device_printf(sc->dev, "alloced %d/%d bigs\n", i, ss->rx_big.mask + 1); return ENOMEM; } } return 0; } static int mxge_open(mxge_softc_t *sc) { mxge_cmd_t cmd; int err, big_bytes, nbufs, slice, cl_size, i; bus_addr_t bus; volatile uint8_t *itable; struct mxge_slice_state *ss; /* Copy the MAC address in case it was overridden */ bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN); err = mxge_reset(sc, 1); if (err != 0) { device_printf(sc->dev, "failed to reset\n"); return EIO; } if (sc->num_slices > 1) { /* setup the indirection table */ cmd.data0 = sc->num_slices; err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd); err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd); if (err != 0) { device_printf(sc->dev, "failed to setup rss tables\n"); return err; } /* just enable an identity mapping */ itable = sc->sram + cmd.data0; for (i = 0; i < sc->num_slices; i++) itable[i] = (uint8_t)i; cmd.data0 = 1; cmd.data1 = mxge_rss_hash_type; err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd); if (err != 0) { device_printf(sc->dev, "failed to enable slices\n"); return err; } } mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs); cmd.data0 = nbufs; err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd); /* error is only meaningful if we're trying to set MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */ if (err && nbufs > 1) { device_printf(sc->dev, "Failed to set alway-use-n to %d\n", nbufs); return EIO; } /* Give the firmware the mtu and the big and small buffer sizes. The firmware wants the big buf size to be a power of two. Luckily, FreeBSD's clusters are powers of two */ cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd); cmd.data0 = MHLEN - MXGEFW_PAD; err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd); cmd.data0 = big_bytes; err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd); if (err != 0) { device_printf(sc->dev, "failed to setup params\n"); goto abort; } /* Now give him the pointer to the stats block */ for (slice = 0; #ifdef IFNET_BUF_RING slice < sc->num_slices; #else slice < 1; #endif slice++) { ss = &sc->ss[slice]; cmd.data0 = MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr); cmd.data1 = MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr); cmd.data2 = sizeof(struct mcp_irq_data); cmd.data2 |= (slice << 16); err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd); } if (err != 0) { bus = sc->ss->fw_stats_dma.bus_addr; bus += offsetof(struct mcp_irq_data, send_done_count); cmd.data0 = MXGE_LOWPART_TO_U32(bus); cmd.data1 = MXGE_HIGHPART_TO_U32(bus); err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_OBSOLETE, &cmd); /* Firmware cannot support multicast without STATS_DMA_V2 */ sc->fw_multicast_support = 0; } else { sc->fw_multicast_support = 1; } if (err != 0) { device_printf(sc->dev, "failed to setup params\n"); goto abort; } for (slice = 0; slice < sc->num_slices; slice++) { err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size); if (err != 0) { device_printf(sc->dev, "couldn't open slice %d\n", slice); goto abort; } } /* Finally, start the firmware running */ err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd); if (err) { device_printf(sc->dev, "Couldn't bring up link\n"); goto abort; } #ifdef IFNET_BUF_RING for (slice = 0; slice < sc->num_slices; slice++) { ss = &sc->ss[slice]; ss->if_drv_flags |= IFF_DRV_RUNNING; ss->if_drv_flags &= ~IFF_DRV_OACTIVE; } #endif sc->ifp->if_drv_flags |= IFF_DRV_RUNNING; sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; return 0; abort: mxge_free_mbufs(sc); return err; } static int mxge_close(mxge_softc_t *sc, int down) { mxge_cmd_t cmd; int err, old_down_cnt; #ifdef IFNET_BUF_RING struct mxge_slice_state *ss; int slice; #endif #ifdef IFNET_BUF_RING for (slice = 0; slice < sc->num_slices; slice++) { ss = &sc->ss[slice]; ss->if_drv_flags &= ~IFF_DRV_RUNNING; } #endif sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; if (!down) { old_down_cnt = sc->down_cnt; wmb(); err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd); if (err) { device_printf(sc->dev, "Couldn't bring down link\n"); } if (old_down_cnt == sc->down_cnt) { /* wait for down irq */ DELAY(10 * sc->intr_coal_delay); } wmb(); if (old_down_cnt == sc->down_cnt) { device_printf(sc->dev, "never got down irq\n"); } } mxge_free_mbufs(sc); return 0; } static void mxge_setup_cfg_space(mxge_softc_t *sc) { device_t dev = sc->dev; int reg; uint16_t lnk, pectl; /* find the PCIe link width and set max read request to 4KB*/ if (pci_find_cap(dev, PCIY_EXPRESS, ®) == 0) { lnk = pci_read_config(dev, reg + 0x12, 2); sc->link_width = (lnk >> 4) & 0x3f; if (sc->pectl == 0) { pectl = pci_read_config(dev, reg + 0x8, 2); pectl = (pectl & ~0x7000) | (5 << 12); pci_write_config(dev, reg + 0x8, pectl, 2); sc->pectl = pectl; } else { /* restore saved pectl after watchdog reset */ pci_write_config(dev, reg + 0x8, sc->pectl, 2); } } /* Enable DMA and Memory space access */ pci_enable_busmaster(dev); } static uint32_t mxge_read_reboot(mxge_softc_t *sc) { device_t dev = sc->dev; uint32_t vs; /* find the vendor specific offset */ if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) { device_printf(sc->dev, "could not find vendor specific offset\n"); return (uint32_t)-1; } /* enable read32 mode */ pci_write_config(dev, vs + 0x10, 0x3, 1); /* tell NIC which register to read */ pci_write_config(dev, vs + 0x18, 0xfffffff0, 4); return (pci_read_config(dev, vs + 0x14, 4)); } static void mxge_watchdog_reset(mxge_softc_t *sc) { struct pci_devinfo *dinfo; struct mxge_slice_state *ss; int err, running, s, num_tx_slices = 1; uint32_t reboot; uint16_t cmd; err = ENXIO; device_printf(sc->dev, "Watchdog reset!\n"); /* * check to see if the NIC rebooted. If it did, then all of * PCI config space has been reset, and things like the * busmaster bit will be zero. If this is the case, then we * must restore PCI config space before the NIC can be used * again */ cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2); if (cmd == 0xffff) { /* * maybe the watchdog caught the NIC rebooting; wait * up to 100ms for it to finish. If it does not come * back, then give up */ DELAY(1000*100); cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2); if (cmd == 0xffff) { device_printf(sc->dev, "NIC disappeared!\n"); } } if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) { /* print the reboot status */ reboot = mxge_read_reboot(sc); device_printf(sc->dev, "NIC rebooted, status = 0x%x\n", reboot); running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING; if (running) { /* * quiesce NIC so that TX routines will not try to * xmit after restoration of BAR */ /* Mark the link as down */ if (sc->link_state) { sc->link_state = 0; if_link_state_change(sc->ifp, LINK_STATE_DOWN); } #ifdef IFNET_BUF_RING num_tx_slices = sc->num_slices; #endif /* grab all TX locks to ensure no tx */ for (s = 0; s < num_tx_slices; s++) { ss = &sc->ss[s]; mtx_lock(&ss->tx.mtx); } mxge_close(sc, 1); } /* restore PCI configuration space */ dinfo = device_get_ivars(sc->dev); pci_cfg_restore(sc->dev, dinfo); /* and redo any changes we made to our config space */ mxge_setup_cfg_space(sc); /* reload f/w */ err = mxge_load_firmware(sc, 0); if (err) { device_printf(sc->dev, "Unable to re-load f/w\n"); } if (running) { if (!err) err = mxge_open(sc); /* release all TX locks */ for (s = 0; s < num_tx_slices; s++) { ss = &sc->ss[s]; #ifdef IFNET_BUF_RING mxge_start_locked(ss); #endif mtx_unlock(&ss->tx.mtx); } } sc->watchdog_resets++; } else { device_printf(sc->dev, "NIC did not reboot, not resetting\n"); err = 0; } if (err) { device_printf(sc->dev, "watchdog reset failed\n"); } else { if (sc->dying == 2) sc->dying = 0; callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc); } } static void mxge_watchdog_task(void *arg, int pending) { mxge_softc_t *sc = arg; mtx_lock(&sc->driver_mtx); mxge_watchdog_reset(sc); mtx_unlock(&sc->driver_mtx); } static void mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice) { tx = &sc->ss[slice].tx; device_printf(sc->dev, "slice %d struck? ring state:\n", slice); device_printf(sc->dev, "tx.req=%d tx.done=%d, tx.queue_active=%d\n", tx->req, tx->done, tx->queue_active); device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n", tx->activate, tx->deactivate); device_printf(sc->dev, "pkt_done=%d fw=%d\n", tx->pkt_done, be32toh(sc->ss->fw_stats->send_done_count)); } static int mxge_watchdog(mxge_softc_t *sc) { mxge_tx_ring_t *tx; uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause); int i, err = 0; /* see if we have outstanding transmits, which have been pending for more than mxge_ticks */ for (i = 0; #ifdef IFNET_BUF_RING (i < sc->num_slices) && (err == 0); #else (i < 1) && (err == 0); #endif i++) { tx = &sc->ss[i].tx; if (tx->req != tx->done && tx->watchdog_req != tx->watchdog_done && tx->done == tx->watchdog_done) { /* check for pause blocking before resetting */ if (tx->watchdog_rx_pause == rx_pause) { mxge_warn_stuck(sc, tx, i); taskqueue_enqueue(sc->tq, &sc->watchdog_task); return (ENXIO); } else device_printf(sc->dev, "Flow control blocking " "xmits, check link partner\n"); } tx->watchdog_req = tx->req; tx->watchdog_done = tx->done; tx->watchdog_rx_pause = rx_pause; } if (sc->need_media_probe) mxge_media_probe(sc); return (err); } static uint64_t mxge_get_counter(struct ifnet *ifp, ift_counter cnt) { struct mxge_softc *sc; uint64_t rv; sc = if_getsoftc(ifp); rv = 0; switch (cnt) { case IFCOUNTER_IPACKETS: for (int s = 0; s < sc->num_slices; s++) rv += sc->ss[s].ipackets; return (rv); case IFCOUNTER_OPACKETS: for (int s = 0; s < sc->num_slices; s++) rv += sc->ss[s].opackets; return (rv); case IFCOUNTER_OERRORS: for (int s = 0; s < sc->num_slices; s++) rv += sc->ss[s].oerrors; return (rv); #ifdef IFNET_BUF_RING case IFCOUNTER_OBYTES: for (int s = 0; s < sc->num_slices; s++) rv += sc->ss[s].obytes; return (rv); case IFCOUNTER_OMCASTS: for (int s = 0; s < sc->num_slices; s++) rv += sc->ss[s].omcasts; return (rv); case IFCOUNTER_OQDROPS: for (int s = 0; s < sc->num_slices; s++) rv += sc->ss[s].tx.br->br_drops; return (rv); #endif default: return (if_get_counter_default(ifp, cnt)); } } static void mxge_tick(void *arg) { mxge_softc_t *sc = arg; u_long pkts = 0; int err = 0; int running, ticks; uint16_t cmd; ticks = mxge_ticks; running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING; if (running) { if (!sc->watchdog_countdown) { err = mxge_watchdog(sc); sc->watchdog_countdown = 4; } sc->watchdog_countdown--; } if (pkts == 0) { /* ensure NIC did not suffer h/w fault while idle */ cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2); if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) { sc->dying = 2; taskqueue_enqueue(sc->tq, &sc->watchdog_task); err = ENXIO; } /* look less often if NIC is idle */ ticks *= 4; } if (err == 0) callout_reset(&sc->co_hdl, ticks, mxge_tick, sc); } static int mxge_media_change(struct ifnet *ifp) { return EINVAL; } static int mxge_change_mtu(mxge_softc_t *sc, int mtu) { struct ifnet *ifp = sc->ifp; int real_mtu, old_mtu; int err = 0; real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; if ((real_mtu > sc->max_mtu) || real_mtu < 60) return EINVAL; mtx_lock(&sc->driver_mtx); old_mtu = ifp->if_mtu; ifp->if_mtu = mtu; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { mxge_close(sc, 0); err = mxge_open(sc); if (err != 0) { ifp->if_mtu = old_mtu; mxge_close(sc, 0); (void) mxge_open(sc); } } mtx_unlock(&sc->driver_mtx); return err; } static void mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { mxge_softc_t *sc = ifp->if_softc; if (sc == NULL) return; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER | IFM_FDX; ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0; ifmr->ifm_active |= sc->current_media; } static int mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { mxge_softc_t *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; int err, mask; err = 0; switch (command) { case SIOCSIFADDR: case SIOCGIFADDR: err = ether_ioctl(ifp, command, data); break; case SIOCSIFMTU: err = mxge_change_mtu(sc, ifr->ifr_mtu); break; case SIOCSIFFLAGS: mtx_lock(&sc->driver_mtx); if (sc->dying) { mtx_unlock(&sc->driver_mtx); return EINVAL; } if (ifp->if_flags & IFF_UP) { if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { err = mxge_open(sc); } else { /* take care of promis can allmulti flag chages */ mxge_change_promisc(sc, ifp->if_flags & IFF_PROMISC); mxge_set_multicast_list(sc); } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { mxge_close(sc, 0); } } mtx_unlock(&sc->driver_mtx); break; case SIOCADDMULTI: case SIOCDELMULTI: mtx_lock(&sc->driver_mtx); mxge_set_multicast_list(sc); mtx_unlock(&sc->driver_mtx); break; case SIOCSIFCAP: mtx_lock(&sc->driver_mtx); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { if (IFCAP_TXCSUM & ifp->if_capenable) { ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4); ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP); } else { ifp->if_capenable |= IFCAP_TXCSUM; ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); } } else if (mask & IFCAP_RXCSUM) { if (IFCAP_RXCSUM & ifp->if_capenable) { ifp->if_capenable &= ~IFCAP_RXCSUM; } else { ifp->if_capenable |= IFCAP_RXCSUM; } } if (mask & IFCAP_TSO4) { if (IFCAP_TSO4 & ifp->if_capenable) { ifp->if_capenable &= ~IFCAP_TSO4; } else if (IFCAP_TXCSUM & ifp->if_capenable) { ifp->if_capenable |= IFCAP_TSO4; ifp->if_hwassist |= CSUM_TSO; } else { printf("mxge requires tx checksum offload" " be enabled to use TSO\n"); err = EINVAL; } } #if IFCAP_TSO6 if (mask & IFCAP_TXCSUM_IPV6) { if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) { ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); ifp->if_hwassist &= ~(CSUM_TCP_IPV6 | CSUM_UDP); } else { ifp->if_capenable |= IFCAP_TXCSUM_IPV6; ifp->if_hwassist |= (CSUM_TCP_IPV6 | CSUM_UDP_IPV6); } } else if (mask & IFCAP_RXCSUM_IPV6) { if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) { ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6; } else { ifp->if_capenable |= IFCAP_RXCSUM_IPV6; } } if (mask & IFCAP_TSO6) { if (IFCAP_TSO6 & ifp->if_capenable) { ifp->if_capenable &= ~IFCAP_TSO6; } else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) { ifp->if_capenable |= IFCAP_TSO6; ifp->if_hwassist |= CSUM_TSO; } else { printf("mxge requires tx checksum offload" " be enabled to use TSO\n"); err = EINVAL; } } #endif /*IFCAP_TSO6 */ if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) || !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING)) ifp->if_capenable &= ~IFCAP_VLAN_HWTSO; mtx_unlock(&sc->driver_mtx); VLAN_CAPABILITIES(ifp); break; case SIOCGIFMEDIA: mtx_lock(&sc->driver_mtx); mxge_media_probe(sc); mtx_unlock(&sc->driver_mtx); err = ifmedia_ioctl(ifp, (struct ifreq *)data, &sc->media, command); break; default: err = ENOTTY; } return err; } static void mxge_fetch_tunables(mxge_softc_t *sc) { TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices); TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled", &mxge_flow_control); TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay); TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable", &mxge_nvidia_ecrc_enable); TUNABLE_INT_FETCH("hw.mxge.force_firmware", &mxge_force_firmware); TUNABLE_INT_FETCH("hw.mxge.deassert_wait", &mxge_deassert_wait); TUNABLE_INT_FETCH("hw.mxge.verbose", &mxge_verbose); TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks); TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc); TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type); TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type); TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu); TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle); if (bootverbose) mxge_verbose = 1; if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000) mxge_intr_coal_delay = 30; if (mxge_ticks == 0) mxge_ticks = hz / 2; sc->pause = mxge_flow_control; if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) { mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT; } if (mxge_initial_mtu > ETHERMTU_JUMBO || mxge_initial_mtu < ETHER_MIN_LEN) mxge_initial_mtu = ETHERMTU_JUMBO; if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE) mxge_throttle = MXGE_MAX_THROTTLE; if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE) mxge_throttle = MXGE_MIN_THROTTLE; sc->throttle = mxge_throttle; } static void mxge_free_slices(mxge_softc_t *sc) { struct mxge_slice_state *ss; int i; if (sc->ss == NULL) return; for (i = 0; i < sc->num_slices; i++) { ss = &sc->ss[i]; if (ss->fw_stats != NULL) { mxge_dma_free(&ss->fw_stats_dma); ss->fw_stats = NULL; #ifdef IFNET_BUF_RING if (ss->tx.br != NULL) { drbr_free(ss->tx.br, M_DEVBUF); ss->tx.br = NULL; } #endif mtx_destroy(&ss->tx.mtx); } if (ss->rx_done.entry != NULL) { mxge_dma_free(&ss->rx_done.dma); ss->rx_done.entry = NULL; } } free(sc->ss, M_DEVBUF); sc->ss = NULL; } static int mxge_alloc_slices(mxge_softc_t *sc) { mxge_cmd_t cmd; struct mxge_slice_state *ss; size_t bytes; int err, i, max_intr_slots; err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd); if (err != 0) { device_printf(sc->dev, "Cannot determine rx ring size\n"); return err; } sc->rx_ring_size = cmd.data0; max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t)); bytes = sizeof (*sc->ss) * sc->num_slices; sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->ss == NULL) return (ENOMEM); for (i = 0; i < sc->num_slices; i++) { ss = &sc->ss[i]; ss->sc = sc; /* allocate per-slice rx interrupt queues */ bytes = max_intr_slots * sizeof (*ss->rx_done.entry); err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096); if (err != 0) goto abort; ss->rx_done.entry = ss->rx_done.dma.addr; bzero(ss->rx_done.entry, bytes); /* * allocate the per-slice firmware stats; stats * (including tx) are used used only on the first * slice for now */ #ifndef IFNET_BUF_RING if (i > 0) continue; #endif bytes = sizeof (*ss->fw_stats); err = mxge_dma_alloc(sc, &ss->fw_stats_dma, sizeof (*ss->fw_stats), 64); if (err != 0) goto abort; ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr; snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name), "%s:tx(%d)", device_get_nameunit(sc->dev), i); mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF); #ifdef IFNET_BUF_RING ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK, &ss->tx.mtx); #endif } return (0); abort: mxge_free_slices(sc); return (ENOMEM); } static void mxge_slice_probe(mxge_softc_t *sc) { mxge_cmd_t cmd; char *old_fw; int msix_cnt, status, max_intr_slots; sc->num_slices = 1; /* * don't enable multiple slices if they are not enabled, * or if this is not an SMP system */ if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2) return; /* see how many MSI-X interrupts are available */ msix_cnt = pci_msix_count(sc->dev); if (msix_cnt < 2) return; /* now load the slice aware firmware see what it supports */ old_fw = sc->fw_name; if (old_fw == mxge_fw_aligned) sc->fw_name = mxge_fw_rss_aligned; else sc->fw_name = mxge_fw_rss_unaligned; status = mxge_load_firmware(sc, 0); if (status != 0) { device_printf(sc->dev, "Falling back to a single slice\n"); return; } /* try to send a reset command to the card to see if it is alive */ memset(&cmd, 0, sizeof (cmd)); status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd); if (status != 0) { device_printf(sc->dev, "failed reset\n"); goto abort_with_fw; } /* get rx ring size */ status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd); if (status != 0) { device_printf(sc->dev, "Cannot determine rx ring size\n"); goto abort_with_fw; } max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t)); /* tell it the size of the interrupt queues */ cmd.data0 = max_intr_slots * sizeof (struct mcp_slot); status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd); if (status != 0) { device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n"); goto abort_with_fw; } /* ask the maximum number of slices it supports */ status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd); if (status != 0) { device_printf(sc->dev, "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n"); goto abort_with_fw; } sc->num_slices = cmd.data0; if (sc->num_slices > msix_cnt) sc->num_slices = msix_cnt; if (mxge_max_slices == -1) { /* cap to number of CPUs in system */ if (sc->num_slices > mp_ncpus) sc->num_slices = mp_ncpus; } else { if (sc->num_slices > mxge_max_slices) sc->num_slices = mxge_max_slices; } /* make sure it is a power of two */ while (sc->num_slices & (sc->num_slices - 1)) sc->num_slices--; if (mxge_verbose) device_printf(sc->dev, "using %d slices\n", sc->num_slices); return; abort_with_fw: sc->fw_name = old_fw; (void) mxge_load_firmware(sc, 0); } static int mxge_add_msix_irqs(mxge_softc_t *sc) { size_t bytes; int count, err, i, rid; rid = PCIR_BAR(2); sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (sc->msix_table_res == NULL) { device_printf(sc->dev, "couldn't alloc MSIX table res\n"); return ENXIO; } count = sc->num_slices; err = pci_alloc_msix(sc->dev, &count); if (err != 0) { device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d" "err = %d \n", sc->num_slices, err); goto abort_with_msix_table; } if (count < sc->num_slices) { device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n", count, sc->num_slices); device_printf(sc->dev, "Try setting hw.mxge.max_slices to %d\n", count); err = ENOSPC; goto abort_with_msix; } bytes = sizeof (*sc->msix_irq_res) * sc->num_slices; sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO); if (sc->msix_irq_res == NULL) { err = ENOMEM; goto abort_with_msix; } for (i = 0; i < sc->num_slices; i++) { rid = i + 1; sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (sc->msix_irq_res[i] == NULL) { device_printf(sc->dev, "couldn't allocate IRQ res" " for message %d\n", i); err = ENXIO; goto abort_with_res; } } bytes = sizeof (*sc->msix_ih) * sc->num_slices; sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO); for (i = 0; i < sc->num_slices; i++) { err = bus_setup_intr(sc->dev, sc->msix_irq_res[i], INTR_TYPE_NET | INTR_MPSAFE, #if __FreeBSD_version > 700030 NULL, #endif mxge_intr, &sc->ss[i], &sc->msix_ih[i]); if (err != 0) { device_printf(sc->dev, "couldn't setup intr for " "message %d\n", i); goto abort_with_intr; } bus_describe_intr(sc->dev, sc->msix_irq_res[i], sc->msix_ih[i], "s%d", i); } if (mxge_verbose) { device_printf(sc->dev, "using %d msix IRQs:", sc->num_slices); for (i = 0; i < sc->num_slices; i++) printf(" %ld", rman_get_start(sc->msix_irq_res[i])); printf("\n"); } return (0); abort_with_intr: for (i = 0; i < sc->num_slices; i++) { if (sc->msix_ih[i] != NULL) { bus_teardown_intr(sc->dev, sc->msix_irq_res[i], sc->msix_ih[i]); sc->msix_ih[i] = NULL; } } free(sc->msix_ih, M_DEVBUF); abort_with_res: for (i = 0; i < sc->num_slices; i++) { rid = i + 1; if (sc->msix_irq_res[i] != NULL) bus_release_resource(sc->dev, SYS_RES_IRQ, rid, sc->msix_irq_res[i]); sc->msix_irq_res[i] = NULL; } free(sc->msix_irq_res, M_DEVBUF); abort_with_msix: pci_release_msi(sc->dev); abort_with_msix_table: bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2), sc->msix_table_res); return err; } static int mxge_add_single_irq(mxge_softc_t *sc) { int count, err, rid; count = pci_msi_count(sc->dev); if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) { rid = 1; } else { rid = 0; sc->legacy_irq = 1; } sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0, 1, RF_SHAREABLE | RF_ACTIVE); if (sc->irq_res == NULL) { device_printf(sc->dev, "could not alloc interrupt\n"); return ENXIO; } if (mxge_verbose) device_printf(sc->dev, "using %s irq %ld\n", sc->legacy_irq ? "INTx" : "MSI", rman_get_start(sc->irq_res)); err = bus_setup_intr(sc->dev, sc->irq_res, INTR_TYPE_NET | INTR_MPSAFE, #if __FreeBSD_version > 700030 NULL, #endif mxge_intr, &sc->ss[0], &sc->ih); if (err != 0) { bus_release_resource(sc->dev, SYS_RES_IRQ, sc->legacy_irq ? 0 : 1, sc->irq_res); if (!sc->legacy_irq) pci_release_msi(sc->dev); } return err; } static void mxge_rem_msix_irqs(mxge_softc_t *sc) { int i, rid; for (i = 0; i < sc->num_slices; i++) { if (sc->msix_ih[i] != NULL) { bus_teardown_intr(sc->dev, sc->msix_irq_res[i], sc->msix_ih[i]); sc->msix_ih[i] = NULL; } } free(sc->msix_ih, M_DEVBUF); for (i = 0; i < sc->num_slices; i++) { rid = i + 1; if (sc->msix_irq_res[i] != NULL) bus_release_resource(sc->dev, SYS_RES_IRQ, rid, sc->msix_irq_res[i]); sc->msix_irq_res[i] = NULL; } free(sc->msix_irq_res, M_DEVBUF); bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2), sc->msix_table_res); pci_release_msi(sc->dev); return; } static void mxge_rem_single_irq(mxge_softc_t *sc) { bus_teardown_intr(sc->dev, sc->irq_res, sc->ih); bus_release_resource(sc->dev, SYS_RES_IRQ, sc->legacy_irq ? 0 : 1, sc->irq_res); if (!sc->legacy_irq) pci_release_msi(sc->dev); } static void mxge_rem_irq(mxge_softc_t *sc) { if (sc->num_slices > 1) mxge_rem_msix_irqs(sc); else mxge_rem_single_irq(sc); } static int mxge_add_irq(mxge_softc_t *sc) { int err; if (sc->num_slices > 1) err = mxge_add_msix_irqs(sc); else err = mxge_add_single_irq(sc); if (0 && err == 0 && sc->num_slices > 1) { mxge_rem_msix_irqs(sc); err = mxge_add_msix_irqs(sc); } return err; } static int mxge_attach(device_t dev) { mxge_cmd_t cmd; mxge_softc_t *sc = device_get_softc(dev); struct ifnet *ifp; int err, rid; sc->dev = dev; mxge_fetch_tunables(sc); TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc); sc->tq = taskqueue_create("mxge_taskq", M_WAITOK, taskqueue_thread_enqueue, &sc->tq); if (sc->tq == NULL) { err = ENOMEM; goto abort_with_nothing; } err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ 1, /* alignment */ 0, /* boundary */ BUS_SPACE_MAXADDR, /* low */ BUS_SPACE_MAXADDR, /* high */ NULL, NULL, /* filter */ 65536 + 256, /* maxsize */ MXGE_MAX_SEND_DESC, /* num segs */ 65536, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lock */ &sc->parent_dmat); /* tag */ if (err != 0) { device_printf(sc->dev, "Err %d allocating parent dmat\n", err); goto abort_with_tq; } ifp = sc->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not if_alloc()\n"); err = ENOSPC; goto abort_with_parent_dmat; } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd", device_get_nameunit(dev)); mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF); snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name), "%s:drv", device_get_nameunit(dev)); mtx_init(&sc->driver_mtx, sc->driver_mtx_name, MTX_NETWORK_LOCK, MTX_DEF); callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0); mxge_setup_cfg_space(sc); /* Map the board into the kernel */ rid = PCIR_BARS; sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0, ~0, 1, RF_ACTIVE); if (sc->mem_res == NULL) { device_printf(dev, "could not map memory\n"); err = ENXIO; goto abort_with_lock; } sc->sram = rman_get_virtual(sc->mem_res); sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100; if (sc->sram_size > rman_get_size(sc->mem_res)) { device_printf(dev, "impossible memory region size %ld\n", rman_get_size(sc->mem_res)); err = ENXIO; goto abort_with_mem_res; } /* make NULL terminated copy of the EEPROM strings section of lanai SRAM */ bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE); bus_space_read_region_1(rman_get_bustag(sc->mem_res), rman_get_bushandle(sc->mem_res), sc->sram_size - MXGE_EEPROM_STRINGS_SIZE, sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE - 2); err = mxge_parse_strings(sc); if (err != 0) goto abort_with_mem_res; /* Enable write combining for efficient use of PCIe bus */ mxge_enable_wc(sc); /* Allocate the out of band dma memory */ err = mxge_dma_alloc(sc, &sc->cmd_dma, sizeof (mxge_cmd_t), 64); if (err != 0) goto abort_with_mem_res; sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr; err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64); if (err != 0) goto abort_with_cmd_dma; err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096); if (err != 0) goto abort_with_zeropad_dma; /* select & load the firmware */ err = mxge_select_firmware(sc); if (err != 0) goto abort_with_dmabench; sc->intr_coal_delay = mxge_intr_coal_delay; mxge_slice_probe(sc); err = mxge_alloc_slices(sc); if (err != 0) goto abort_with_dmabench; err = mxge_reset(sc, 0); if (err != 0) goto abort_with_slices; err = mxge_alloc_rings(sc); if (err != 0) { device_printf(sc->dev, "failed to allocate rings\n"); goto abort_with_slices; } err = mxge_add_irq(sc); if (err != 0) { device_printf(sc->dev, "failed to add irq\n"); goto abort_with_rings; } ifp->if_baudrate = IF_Gbps(10); ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 | IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 | IFCAP_RXCSUM_IPV6; #if defined(INET) || defined(INET6) ifp->if_capabilities |= IFCAP_LRO; #endif #ifdef MXGE_NEW_VLAN_API ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM; /* Only FW 1.4.32 and newer can do TSO over vlans */ if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 && sc->fw_ver_tiny >= 32) ifp->if_capabilities |= IFCAP_VLAN_HWTSO; #endif sc->max_mtu = mxge_max_mtu(sc); if (sc->max_mtu >= 9000) ifp->if_capabilities |= IFCAP_JUMBO_MTU; else device_printf(dev, "MTU limited to %d. Install " "latest firmware for 9000 byte jumbo support\n", sc->max_mtu - ETHER_HDR_LEN); ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO; ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6; /* check to see if f/w supports TSO for IPv6 */ if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) { if (CSUM_TCP_IPV6) ifp->if_capabilities |= IFCAP_TSO6; sc->max_tso6_hlen = min(cmd.data0, sizeof (sc->ss[0].scratch)); } ifp->if_capenable = ifp->if_capabilities; if (sc->lro_cnt == 0) ifp->if_capenable &= ~IFCAP_LRO; ifp->if_init = mxge_init; ifp->if_softc = sc; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = mxge_ioctl; ifp->if_start = mxge_start; ifp->if_get_counter = mxge_get_counter; /* Initialise the ifmedia structure */ ifmedia_init(&sc->media, 0, mxge_media_change, mxge_media_status); mxge_media_init(sc); mxge_media_probe(sc); sc->dying = 0; ether_ifattach(ifp, sc->mac_addr); /* ether_ifattach sets mtu to ETHERMTU */ if (mxge_initial_mtu != ETHERMTU) mxge_change_mtu(sc, mxge_initial_mtu); mxge_add_sysctls(sc); #ifdef IFNET_BUF_RING ifp->if_transmit = mxge_transmit; ifp->if_qflush = mxge_qflush; #endif taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq", device_get_nameunit(sc->dev)); callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc); return 0; abort_with_rings: mxge_free_rings(sc); abort_with_slices: mxge_free_slices(sc); abort_with_dmabench: mxge_dma_free(&sc->dmabench_dma); abort_with_zeropad_dma: mxge_dma_free(&sc->zeropad_dma); abort_with_cmd_dma: mxge_dma_free(&sc->cmd_dma); abort_with_mem_res: bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res); abort_with_lock: pci_disable_busmaster(dev); mtx_destroy(&sc->cmd_mtx); mtx_destroy(&sc->driver_mtx); if_free(ifp); abort_with_parent_dmat: bus_dma_tag_destroy(sc->parent_dmat); abort_with_tq: if (sc->tq != NULL) { taskqueue_drain(sc->tq, &sc->watchdog_task); taskqueue_free(sc->tq); sc->tq = NULL; } abort_with_nothing: return err; } static int mxge_detach(device_t dev) { mxge_softc_t *sc = device_get_softc(dev); if (mxge_vlans_active(sc)) { device_printf(sc->dev, "Detach vlans before removing module\n"); return EBUSY; } mtx_lock(&sc->driver_mtx); sc->dying = 1; if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) mxge_close(sc, 0); mtx_unlock(&sc->driver_mtx); ether_ifdetach(sc->ifp); if (sc->tq != NULL) { taskqueue_drain(sc->tq, &sc->watchdog_task); taskqueue_free(sc->tq); sc->tq = NULL; } callout_drain(&sc->co_hdl); ifmedia_removeall(&sc->media); mxge_dummy_rdma(sc, 0); mxge_rem_sysctls(sc); mxge_rem_irq(sc); mxge_free_rings(sc); mxge_free_slices(sc); mxge_dma_free(&sc->dmabench_dma); mxge_dma_free(&sc->zeropad_dma); mxge_dma_free(&sc->cmd_dma); bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res); pci_disable_busmaster(dev); mtx_destroy(&sc->cmd_mtx); mtx_destroy(&sc->driver_mtx); if_free(sc->ifp); bus_dma_tag_destroy(sc->parent_dmat); return 0; } static int mxge_shutdown(device_t dev) { return 0; } /* This file uses Myri10GE driver indentation. Local Variables: c-file-style:"linux" tab-width:8 End: */ Index: head/sys/netinet/tcp_lro.c =================================================================== --- head/sys/netinet/tcp_lro.c (revision 294326) +++ head/sys/netinet/tcp_lro.c (revision 294327) @@ -1,636 +1,776 @@ /*- * Copyright (c) 2007, Myricom Inc. * Copyright (c) 2008, Intel Corporation. * Copyright (c) 2012 The FreeBSD Foundation + * Copyright (c) 2016 Mellanox Technologies. * All rights reserved. * * Portions of this software were developed by Bjoern Zeeb * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#ifndef LRO_ENTRIES -#define LRO_ENTRIES 8 /* # of LRO entries per RX queue. */ -#endif +static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures"); #define TCP_LRO_UPDATE_CSUM 1 #ifndef TCP_LRO_UPDATE_CSUM #define TCP_LRO_INVALID_CSUM 0x0000 #endif int tcp_lro_init(struct lro_ctrl *lc) { + return (tcp_lro_init_args(lc, NULL, TCP_LRO_ENTRIES, 0)); +} + +int +tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp, + unsigned lro_entries, unsigned lro_mbufs) +{ struct lro_entry *le; - int error, i; + size_t size; + unsigned i; lc->lro_bad_csum = 0; lc->lro_queued = 0; lc->lro_flushed = 0; lc->lro_cnt = 0; + lc->lro_mbuf_count = 0; + lc->lro_mbuf_max = lro_mbufs; + lc->lro_cnt = lro_entries; + lc->ifp = ifp; SLIST_INIT(&lc->lro_free); SLIST_INIT(&lc->lro_active); - error = 0; - for (i = 0; i < LRO_ENTRIES; i++) { - le = (struct lro_entry *)malloc(sizeof(*le), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (le == NULL) { - if (i == 0) - error = ENOMEM; - break; - } - lc->lro_cnt = i + 1; - SLIST_INSERT_HEAD(&lc->lro_free, le, next); - } + /* compute size to allocate */ + size = (lro_mbufs * sizeof(struct mbuf *)) + + (lro_entries * sizeof(*le)); + lc->lro_mbuf_data = (struct mbuf **) + malloc(size, M_LRO, M_NOWAIT | M_ZERO); - return (error); + /* check for out of memory */ + if (lc->lro_mbuf_data == NULL) { + memset(lc, 0, sizeof(*lc)); + return (ENOMEM); + } + /* compute offset for LRO entries */ + le = (struct lro_entry *) + (lc->lro_mbuf_data + lro_mbufs); + + /* setup linked list */ + for (i = 0; i != lro_entries; i++) + SLIST_INSERT_HEAD(&lc->lro_free, le + i, next); + + return (0); } void tcp_lro_free(struct lro_ctrl *lc) { struct lro_entry *le; + unsigned x; - while (!SLIST_EMPTY(&lc->lro_free)) { - le = SLIST_FIRST(&lc->lro_free); - SLIST_REMOVE_HEAD(&lc->lro_free, next); - free(le, M_DEVBUF); + /* reset LRO free list */ + SLIST_INIT(&lc->lro_free); + + /* free active mbufs, if any */ + while ((le = SLIST_FIRST(&lc->lro_active)) != NULL) { + SLIST_REMOVE_HEAD(&lc->lro_active, next); + m_freem(le->m_head); } + + /* free mbuf array, if any */ + for (x = 0; x != lc->lro_mbuf_count; x++) + m_freem(lc->lro_mbuf_data[x]); + lc->lro_mbuf_count = 0; + + /* free allocated memory, if any */ + free(lc->lro_mbuf_data, M_LRO); + lc->lro_mbuf_data = NULL; } #ifdef TCP_LRO_UPDATE_CSUM static uint16_t tcp_lro_csum_th(struct tcphdr *th) { uint32_t ch; uint16_t *p, l; ch = th->th_sum = 0x0000; l = th->th_off; p = (uint16_t *)th; while (l > 0) { ch += *p; p++; ch += *p; p++; l--; } while (ch > 0xffff) ch = (ch >> 16) + (ch & 0xffff); return (ch & 0xffff); } static uint16_t tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th, uint16_t tcp_data_len, uint16_t csum) { uint32_t c; uint16_t cs; c = csum; /* Remove length from checksum. */ switch (le->eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)l3hdr; if (le->append_cnt == 0) cs = ip6->ip6_plen; else { uint32_t cx; cx = ntohs(ip6->ip6_plen); cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0); } break; } #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip4; ip4 = (struct ip *)l3hdr; if (le->append_cnt == 0) cs = ip4->ip_len; else { cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4), IPPROTO_TCP); cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr, htons(cs)); } break; } #endif default: cs = 0; /* Keep compiler happy. */ } cs = ~cs; c += cs; /* Remove TCP header csum. */ cs = ~tcp_lro_csum_th(th); c += cs; while (c > 0xffff) c = (c >> 16) + (c & 0xffff); return (c & 0xffff); } #endif void tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout) { struct lro_entry *le, *le_tmp; struct timeval tv; if (SLIST_EMPTY(&lc->lro_active)) return; getmicrotime(&tv); timevalsub(&tv, timeout); SLIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { if (timevalcmp(&tv, &le->mtime, >=)) { SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); tcp_lro_flush(lc, le); } } } void tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) { if (le->append_cnt > 0) { struct tcphdr *th; uint16_t p_len; p_len = htons(le->p_len); switch (le->eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6; ip6 = le->le_ip6; ip6->ip6_plen = p_len; th = (struct tcphdr *)(ip6 + 1); le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; le->p_len += ETHER_HDR_LEN + sizeof(*ip6); break; } #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip4; #ifdef TCP_LRO_UPDATE_CSUM uint32_t cl; uint16_t c; #endif ip4 = le->le_ip4; #ifdef TCP_LRO_UPDATE_CSUM /* Fix IP header checksum for new length. */ c = ~ip4->ip_sum; cl = c; c = ~ip4->ip_len; cl += c + p_len; while (cl > 0xffff) cl = (cl >> 16) + (cl & 0xffff); c = cl; ip4->ip_sum = ~c; #else ip4->ip_sum = TCP_LRO_INVALID_CSUM; #endif ip4->ip_len = p_len; th = (struct tcphdr *)(ip4 + 1); le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID; le->p_len += ETHER_HDR_LEN; break; } #endif default: th = NULL; /* Keep compiler happy. */ } le->m_head->m_pkthdr.csum_data = 0xffff; le->m_head->m_pkthdr.len = le->p_len; /* Incorporate the latest ACK into the TCP header. */ th->th_ack = le->ack_seq; th->th_win = le->window; /* Incorporate latest timestamp into the TCP header. */ if (le->timestamp != 0) { uint32_t *ts_ptr; ts_ptr = (uint32_t *)(th + 1); ts_ptr[1] = htonl(le->tsval); ts_ptr[2] = le->tsecr; } #ifdef TCP_LRO_UPDATE_CSUM /* Update the TCP header checksum. */ le->ulp_csum += p_len; le->ulp_csum += tcp_lro_csum_th(th); while (le->ulp_csum > 0xffff) le->ulp_csum = (le->ulp_csum >> 16) + (le->ulp_csum & 0xffff); th->th_sum = (le->ulp_csum & 0xffff); th->th_sum = ~th->th_sum; #else th->th_sum = TCP_LRO_INVALID_CSUM; #endif } (*lc->ifp->if_input)(lc->ifp, le->m_head); lc->lro_queued += le->append_cnt + 1; lc->lro_flushed++; bzero(le, sizeof(*le)); SLIST_INSERT_HEAD(&lc->lro_free, le, next); } +static int +tcp_lro_mbuf_compare_header(const void *ppa, const void *ppb) +{ + const struct mbuf *ma = *((const struct mbuf * const *)ppa); + const struct mbuf *mb = *((const struct mbuf * const *)ppb); + int ret; + + ret = M_HASHTYPE_GET(ma) - M_HASHTYPE_GET(mb); + if (ret != 0) + goto done; + + ret = ma->m_pkthdr.flowid - mb->m_pkthdr.flowid; + if (ret != 0) + goto done; + + ret = TCP_LRO_SEQUENCE(ma) - TCP_LRO_SEQUENCE(mb); +done: + return (ret); +} + +void +tcp_lro_flush_all(struct lro_ctrl *lc) +{ + struct lro_entry *le; + uint32_t hashtype; + uint32_t flowid; + unsigned x; + + /* check if no mbufs to flush */ + if (__predict_false(lc->lro_mbuf_count == 0)) + goto done; + + /* sort all mbufs according to stream */ + qsort(lc->lro_mbuf_data, lc->lro_mbuf_count, sizeof(struct mbuf *), + &tcp_lro_mbuf_compare_header); + + /* input data into LRO engine, stream by stream */ + flowid = 0; + hashtype = M_HASHTYPE_NONE; + for (x = 0; x != lc->lro_mbuf_count; x++) { + struct mbuf *mb; + + mb = lc->lro_mbuf_data[x]; + + /* check for new stream */ + if (mb->m_pkthdr.flowid != flowid || + M_HASHTYPE_GET(mb) != hashtype) { + flowid = mb->m_pkthdr.flowid; + hashtype = M_HASHTYPE_GET(mb); + + /* flush active streams */ + while ((le = SLIST_FIRST(&lc->lro_active)) != NULL) { + SLIST_REMOVE_HEAD(&lc->lro_active, next); + tcp_lro_flush(lc, le); + } + } +#ifdef TCP_LRO_RESET_SEQUENCE + /* reset sequence number */ + TCP_LRO_SEQUENCE(mb) = 0; +#endif + /* add packet to LRO engine */ + if (tcp_lro_rx(lc, mb, 0) != 0) { + /* input packet to network layer */ + (*lc->ifp->if_input)(lc->ifp, mb); + lc->lro_queued++; + lc->lro_flushed++; + } + } +done: + /* flush active streams */ + while ((le = SLIST_FIRST(&lc->lro_active)) != NULL) { + SLIST_REMOVE_HEAD(&lc->lro_active, next); + tcp_lro_flush(lc, le); + } + lc->lro_mbuf_count = 0; +} + #ifdef INET6 static int tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6, struct tcphdr **th) { /* XXX-BZ we should check the flow-label. */ /* XXX-BZ We do not yet support ext. hdrs. */ if (ip6->ip6_nxt != IPPROTO_TCP) return (TCP_LRO_NOT_SUPPORTED); /* Find the TCP header. */ *th = (struct tcphdr *)(ip6 + 1); return (0); } #endif #ifdef INET static int tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4, struct tcphdr **th) { int csum_flags; uint16_t csum; if (ip4->ip_p != IPPROTO_TCP) return (TCP_LRO_NOT_SUPPORTED); /* Ensure there are no options. */ if ((ip4->ip_hl << 2) != sizeof (*ip4)) return (TCP_LRO_CANNOT); /* .. and the packet is not fragmented. */ if (ip4->ip_off & htons(IP_MF|IP_OFFMASK)) return (TCP_LRO_CANNOT); /* Legacy IP has a header checksum that needs to be correct. */ csum_flags = m->m_pkthdr.csum_flags; if (csum_flags & CSUM_IP_CHECKED) { if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { lc->lro_bad_csum++; return (TCP_LRO_CANNOT); } } else { csum = in_cksum_hdr(ip4); if (__predict_false((csum) != 0)) { lc->lro_bad_csum++; return (TCP_LRO_CANNOT); } } /* Find the TCP header (we assured there are no IP options). */ *th = (struct tcphdr *)(ip4 + 1); return (0); } #endif int tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) { struct lro_entry *le; struct ether_header *eh; #ifdef INET6 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ #endif #ifdef INET struct ip *ip4 = NULL; /* Keep compiler happy. */ #endif struct tcphdr *th; void *l3hdr = NULL; /* Keep compiler happy. */ uint32_t *ts_ptr; tcp_seq seq; int error, ip_len, l; uint16_t eh_type, tcp_data_len; /* We expect a contiguous header [eh, ip, tcp]. */ eh = mtod(m, struct ether_header *); eh_type = ntohs(eh->ether_type); switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { CURVNET_SET(lc->ifp->if_vnet); if (V_ip6_forwarding != 0) { /* XXX-BZ stats but changing lro_ctrl is a problem. */ CURVNET_RESTORE(); return (TCP_LRO_CANNOT); } CURVNET_RESTORE(); l3hdr = ip6 = (struct ip6_hdr *)(eh + 1); error = tcp_lro_rx_ipv6(lc, m, ip6, &th); if (error != 0) return (error); tcp_data_len = ntohs(ip6->ip6_plen); ip_len = sizeof(*ip6) + tcp_data_len; break; } #endif #ifdef INET case ETHERTYPE_IP: { CURVNET_SET(lc->ifp->if_vnet); if (V_ipforwarding != 0) { /* XXX-BZ stats but changing lro_ctrl is a problem. */ CURVNET_RESTORE(); return (TCP_LRO_CANNOT); } CURVNET_RESTORE(); l3hdr = ip4 = (struct ip *)(eh + 1); error = tcp_lro_rx_ipv4(lc, m, ip4, &th); if (error != 0) return (error); ip_len = ntohs(ip4->ip_len); tcp_data_len = ip_len - sizeof(*ip4); break; } #endif /* XXX-BZ what happens in case of VLAN(s)? */ default: return (TCP_LRO_NOT_SUPPORTED); } /* * If the frame is padded beyond the end of the IP packet, then we must * trim the extra bytes off. */ l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len); if (l != 0) { if (l < 0) /* Truncated packet. */ return (TCP_LRO_CANNOT); m_adj(m, -l); } /* * Check TCP header constraints. */ /* Ensure no bits set besides ACK or PSH. */ if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) return (TCP_LRO_CANNOT); /* XXX-BZ We lose a AKC|PUSH flag concatinating multiple segments. */ /* XXX-BZ Ideally we'd flush on PUSH? */ /* * Check for timestamps. * Since the only option we handle are timestamps, we only have to * handle the simple case of aligned timestamps. */ l = (th->th_off << 2); tcp_data_len -= l; l -= sizeof(*th); ts_ptr = (uint32_t *)(th + 1); if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) return (TCP_LRO_CANNOT); /* If the driver did not pass in the checksum, set it now. */ if (csum == 0x0000) csum = th->th_sum; seq = ntohl(th->th_seq); /* Try to find a matching previous segment. */ SLIST_FOREACH(le, &lc->lro_active, next) { if (le->eh_type != eh_type) continue; if (le->source_port != th->th_sport || le->dest_port != th->th_dport) continue; switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: if (bcmp(&le->source_ip6, &ip6->ip6_src, sizeof(struct in6_addr)) != 0 || bcmp(&le->dest_ip6, &ip6->ip6_dst, sizeof(struct in6_addr)) != 0) continue; break; #endif #ifdef INET case ETHERTYPE_IP: if (le->source_ip4 != ip4->ip_src.s_addr || le->dest_ip4 != ip4->ip_dst.s_addr) continue; break; #endif } /* Flush now if appending will result in overflow. */ if (le->p_len > (65535 - tcp_data_len)) { SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); tcp_lro_flush(lc, le); break; } /* Try to append the new segment. */ if (__predict_false(seq != le->next_seq || (tcp_data_len == 0 && le->ack_seq == th->th_ack))) { /* Out of order packet or duplicate ACK. */ SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); tcp_lro_flush(lc, le); return (TCP_LRO_CANNOT); } if (l != 0) { uint32_t tsval = ntohl(*(ts_ptr + 1)); /* Make sure timestamp values are increasing. */ /* XXX-BZ flip and use TSTMP_GEQ macro for this? */ if (__predict_false(le->tsval > tsval || *(ts_ptr + 2) == 0)) return (TCP_LRO_CANNOT); le->tsval = tsval; le->tsecr = *(ts_ptr + 2); } le->next_seq += tcp_data_len; le->ack_seq = th->th_ack; le->window = th->th_win; le->append_cnt++; #ifdef TCP_LRO_UPDATE_CSUM le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, ~csum); #endif if (tcp_data_len == 0) { m_freem(m); return (0); } le->p_len += tcp_data_len; /* * Adjust the mbuf so that m_data points to the first byte of * the ULP payload. Adjust the mbuf to avoid complications and * append new segment to existing mbuf chain. */ m_adj(m, m->m_pkthdr.len - tcp_data_len); m_demote_pkthdr(m); le->m_tail->m_next = m; le->m_tail = m_last(m); /* * If a possible next full length packet would cause an * overflow, pro-actively flush now. */ if (le->p_len > (65535 - lc->ifp->if_mtu)) { SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); tcp_lro_flush(lc, le); } else getmicrotime(&le->mtime); return (0); } /* Try to find an empty slot. */ if (SLIST_EMPTY(&lc->lro_free)) return (TCP_LRO_CANNOT); /* Start a new segment chain. */ le = SLIST_FIRST(&lc->lro_free); SLIST_REMOVE_HEAD(&lc->lro_free, next); SLIST_INSERT_HEAD(&lc->lro_active, le, next); getmicrotime(&le->mtime); /* Start filling in details. */ switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: le->le_ip6 = ip6; le->source_ip6 = ip6->ip6_src; le->dest_ip6 = ip6->ip6_dst; le->eh_type = eh_type; le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6); break; #endif #ifdef INET case ETHERTYPE_IP: le->le_ip4 = ip4; le->source_ip4 = ip4->ip_src.s_addr; le->dest_ip4 = ip4->ip_dst.s_addr; le->eh_type = eh_type; le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN; break; #endif } le->source_port = th->th_sport; le->dest_port = th->th_dport; le->next_seq = seq + tcp_data_len; le->ack_seq = th->th_ack; le->window = th->th_win; if (l != 0) { le->timestamp = 1; le->tsval = ntohl(*(ts_ptr + 1)); le->tsecr = *(ts_ptr + 2); } #ifdef TCP_LRO_UPDATE_CSUM /* * Do not touch the csum of the first packet. However save the * "adjusted" checksum of just the source and destination addresses, * the next header and the TCP payload. The length and TCP header * parts may change, so we remove those from the saved checksum and * re-add with final values on tcp_lro_flush() if needed. */ KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n", __func__, le, le->ulp_csum)); le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, ~csum); th->th_sum = csum; /* Restore checksum on first packet. */ #endif le->m_head = m; le->m_tail = m_last(m); return (0); +} + +void +tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) +{ + /* sanity checks */ + if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL || + lc->lro_mbuf_max == 0)) { + /* packet drop */ + m_freem(mb); + return; + } + + /* check if packet is not LRO capable */ + if (__predict_false(mb->m_pkthdr.csum_flags == 0 || + (lc->ifp->if_capenable & IFCAP_LRO) == 0)) { + lc->lro_flushed++; + lc->lro_queued++; + + /* input packet to network layer */ + (*lc->ifp->if_input) (lc->ifp, mb); + return; + } + + /* check if array is full */ + if (__predict_false(lc->lro_mbuf_count == lc->lro_mbuf_max)) + tcp_lro_flush_all(lc); + + /* store sequence number */ + TCP_LRO_SEQUENCE(mb) = lc->lro_mbuf_count; + + /* enter mbuf */ + lc->lro_mbuf_data[lc->lro_mbuf_count++] = mb; } /* end */ Index: head/sys/netinet/tcp_lro.h =================================================================== --- head/sys/netinet/tcp_lro.h (revision 294326) +++ head/sys/netinet/tcp_lro.h (revision 294327) @@ -1,96 +1,111 @@ /*- * Copyright (c) 2006, Myricom Inc. * Copyright (c) 2008, Intel Corporation. + * Copyright (c) 2016 Mellanox Technologies. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _TCP_LRO_H_ #define _TCP_LRO_H_ #include +#ifndef TCP_LRO_ENTRIES +/* Define default number of LRO entries per RX queue */ +#define TCP_LRO_ENTRIES 8 +#endif + +#define TCP_LRO_SEQUENCE(mb) \ + (mb)->m_pkthdr.PH_loc.thirtytwo[0] + struct lro_entry { SLIST_ENTRY(lro_entry) next; struct mbuf *m_head; struct mbuf *m_tail; union { struct ip *ip4; struct ip6_hdr *ip6; } leip; union { in_addr_t s_ip4; struct in6_addr s_ip6; } lesource; union { in_addr_t d_ip4; struct in6_addr d_ip6; } ledest; uint16_t source_port; uint16_t dest_port; uint16_t eh_type; /* EthernetHeader type. */ uint16_t append_cnt; uint32_t p_len; /* IP header payload length. */ uint32_t ulp_csum; /* TCP, etc. checksum. */ uint32_t next_seq; /* tcp_seq */ uint32_t ack_seq; /* tcp_seq */ uint32_t tsval; uint32_t tsecr; uint16_t window; uint16_t timestamp; /* flag, not a TCP hdr field. */ struct timeval mtime; }; SLIST_HEAD(lro_head, lro_entry); #define le_ip4 leip.ip4 #define le_ip6 leip.ip6 #define source_ip4 lesource.s_ip4 #define dest_ip4 ledest.d_ip4 #define source_ip6 lesource.s_ip6 #define dest_ip6 ledest.d_ip6 /* NB: This is part of driver structs. */ struct lro_ctrl { struct ifnet *ifp; - int lro_queued; - int lro_flushed; - int lro_bad_csum; - int lro_cnt; + struct mbuf **lro_mbuf_data; + uint64_t lro_queued; + uint64_t lro_flushed; + uint64_t lro_bad_csum; + unsigned lro_cnt; + unsigned lro_mbuf_count; + unsigned lro_mbuf_max; struct lro_head lro_active; struct lro_head lro_free; }; int tcp_lro_init(struct lro_ctrl *); +int tcp_lro_init_args(struct lro_ctrl *, struct ifnet *, unsigned, unsigned); void tcp_lro_free(struct lro_ctrl *); void tcp_lro_flush_inactive(struct lro_ctrl *, const struct timeval *); void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *); +void tcp_lro_flush_all(struct lro_ctrl *); int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t); +void tcp_lro_queue_mbuf(struct lro_ctrl *, struct mbuf *); #define TCP_LRO_CANNOT -1 #define TCP_LRO_NOT_SUPPORTED 1 #endif /* _TCP_LRO_H_ */ Index: head/sys/sys/param.h =================================================================== --- head/sys/sys/param.h (revision 294326) +++ head/sys/sys/param.h (revision 294327) @@ -1,363 +1,363 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml * * scheme is: Rxx * 'R' is in the range 0 to 4 if this is a release branch or * x.0-CURRENT before RELENG_*_0 is created, otherwise 'R' is * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1100094 /* Master, propagated to newvers */ +#define __FreeBSD_version 1100095 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative. */ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #ifdef _KERNEL #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 63 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL /* Signals. */ #include #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must * be >= MAXBSIZE and can be set differently for different * architectures by defining it in . * Making this larger allows NFS to do larger reads/writes. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * The default value here can be overridden on a per-architecture * basis by defining it in . This should * probably be done to increase its value, when MAXBCACHEBUF is * defined as a larger value in . * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. */ #define MAXBSIZE 65536 /* must be power of 2 */ #ifndef MAXBCACHEBUF #define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */ #endif #ifndef BKVASIZE #define BKVASIZE 16384 /* must be power of 2 */ #endif #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. */ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef lint #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* lint */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof(). */ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array. */ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */