diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index b3b214ce3c96..27655ec2fe59 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -1,1449 +1,1461 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef __T4_ADAPTER_H__ #define __T4_ADAPTER_H__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include "offload.h" #include "t4_ioctl.h" #include "common/t4_msg.h" #include "firmware/t4fw_interface.h" #define KTR_CXGBE KTR_SPARE3 MALLOC_DECLARE(M_CXGBE); #define CXGBE_UNIMPLEMENTED(s) \ panic("%s (%s, line %d) not implemented yet.", s, __FILE__, __LINE__) +/* + * Same as LIST_HEAD from queue.h. This is to avoid conflict with LinuxKPI's + * LIST_HEAD when building iw_cxgbe. + */ +#define CXGBE_LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + #ifndef SYSCTL_ADD_UQUAD #define SYSCTL_ADD_UQUAD SYSCTL_ADD_QUAD #define sysctl_handle_64 sysctl_handle_quad #define CTLTYPE_U64 CTLTYPE_QUAD #endif SYSCTL_DECL(_hw_cxgbe); struct adapter; typedef struct adapter adapter_t; enum { /* * All ingress queues use this entry size. Note that the firmware event * queue and any iq expecting CPL_RX_PKT in the descriptor needs this to * be at least 64. */ IQ_ESIZE = 64, /* Default queue sizes for all kinds of ingress queues */ FW_IQ_QSIZE = 256, RX_IQ_QSIZE = 1024, /* All egress queues use this entry size */ EQ_ESIZE = 64, /* Default queue sizes for all kinds of egress queues */ CTRL_EQ_QSIZE = 1024, TX_EQ_QSIZE = 1024, #if MJUMPAGESIZE != MCLBYTES SW_ZONE_SIZES = 4, /* cluster, jumbop, jumbo9k, jumbo16k */ #else SW_ZONE_SIZES = 3, /* cluster, jumbo9k, jumbo16k */ #endif CL_METADATA_SIZE = CACHE_LINE_SIZE, SGE_MAX_WR_NDESC = SGE_MAX_WR_LEN / EQ_ESIZE, /* max WR size in desc */ TX_SGL_SEGS = 39, TX_SGL_SEGS_TSO = 38, TX_SGL_SEGS_VM = 38, TX_SGL_SEGS_VM_TSO = 37, TX_SGL_SEGS_EO_TSO = 30, /* XXX: lower for IPv6. 
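 * A flit is 8 bytes (struct tx_desc below is 8 flits), so TX_WR_FLITS
 * is the maximum WR size expressed in flits rather than in hardware
 * descriptors (compare SGE_MAX_WR_NDESC above).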
*/ TX_SGL_SEGS_VXLAN_TSO = 37, TX_WR_FLITS = SGE_MAX_WR_LEN / 8 }; enum { /* adapter intr_type */ INTR_INTX = (1 << 0), INTR_MSI = (1 << 1), INTR_MSIX = (1 << 2) }; enum { XGMAC_MTU = (1 << 0), XGMAC_PROMISC = (1 << 1), XGMAC_ALLMULTI = (1 << 2), XGMAC_VLANEX = (1 << 3), XGMAC_UCADDR = (1 << 4), XGMAC_MCADDRS = (1 << 5), XGMAC_ALL = 0xffff }; enum { /* flags understood by begin_synchronized_op */ HOLD_LOCK = (1 << 0), SLEEP_OK = (1 << 1), INTR_OK = (1 << 2), /* flags understood by end_synchronized_op */ LOCK_HELD = HOLD_LOCK, }; enum { /* adapter flags */ FULL_INIT_DONE = (1 << 0), FW_OK = (1 << 1), CHK_MBOX_ACCESS = (1 << 2), MASTER_PF = (1 << 3), ADAP_SYSCTL_CTX = (1 << 4), ADAP_ERR = (1 << 5), BUF_PACKING_OK = (1 << 6), IS_VF = (1 << 7), KERN_TLS_ON = (1 << 8), /* HW is configured for KERN_TLS */ CXGBE_BUSY = (1 << 9), HW_OFF_LIMITS = (1 << 10), /* off limits to all except reset_thread */ /* port flags */ HAS_TRACEQ = (1 << 3), FIXED_IFMEDIA = (1 << 4), /* ifmedia list doesn't change. */ /* VI flags */ DOOMED = (1 << 0), VI_INIT_DONE = (1 << 1), VI_SYSCTL_CTX = (1 << 2), TX_USES_VM_WR = (1 << 3), VI_SKIP_STATS = (1 << 4), /* adapter debug_flags */ DF_DUMP_MBOX = (1 << 0), /* Log all mbox cmd/rpl. */ DF_LOAD_FW_ANYTIME = (1 << 1), /* Allow LOAD_FW after init */ DF_DISABLE_TCB_CACHE = (1 << 2), /* Disable TCB cache (T6+) */ DF_DISABLE_CFG_RETRY = (1 << 3), /* Disable fallback config */ DF_VERBOSE_SLOWINTR = (1 << 4), /* Chatty slow intr handler */ }; #define IS_DOOMED(vi) ((vi)->flags & DOOMED) #define SET_DOOMED(vi) do {(vi)->flags |= DOOMED;} while (0) #define IS_BUSY(sc) ((sc)->flags & CXGBE_BUSY) #define SET_BUSY(sc) do {(sc)->flags |= CXGBE_BUSY;} while (0) #define CLR_BUSY(sc) do {(sc)->flags &= ~CXGBE_BUSY;} while (0) struct vi_info { device_t dev; struct port_info *pi; struct adapter *adapter; struct ifnet *ifp; struct pfil_head *pfil; unsigned long flags; int if_flags; uint16_t *rss, *nm_rss; uint16_t viid; /* opaque VI identifier */ uint16_t smt_idx; uint16_t vin; uint8_t vfvld; int16_t xact_addr_filt;/* index of exact MAC address filter */ uint16_t rss_size; /* size of VI's RSS table slice */ uint16_t rss_base; /* start of VI's RSS table slice */ int hashen; int nintr; int first_intr; /* These need to be int as they are used in sysctl */ int ntxq; /* # of tx queues */ int first_txq; /* index of first tx queue */ int rsrv_noflowq; /* Reserve queue 0 for non-flowid packets */ int nrxq; /* # of rx queues */ int first_rxq; /* index of first rx queue */ int nofldtxq; /* # of offload tx queues */ int first_ofld_txq; /* index of first offload tx queue */ int nofldrxq; /* # of offload rx queues */ int first_ofld_rxq; /* index of first offload rx queue */ int nnmtxq; int first_nm_txq; int nnmrxq; int first_nm_rxq; int tmr_idx; int ofld_tmr_idx; int pktc_idx; int ofld_pktc_idx; int qsize_rxq; int qsize_txq; struct timeval last_refreshed; struct fw_vi_stats_vf stats; struct mtx tick_mtx; struct callout tick; struct sysctl_ctx_list ctx; struct sysctl_oid *rxq_oid; struct sysctl_oid *txq_oid; struct sysctl_oid *nm_rxq_oid; struct sysctl_oid *nm_txq_oid; struct sysctl_oid *ofld_rxq_oid; struct sysctl_oid *ofld_txq_oid; uint8_t hw_addr[ETHER_ADDR_LEN]; /* factory MAC address, won't change */ }; struct tx_ch_rl_params { enum fw_sched_params_rate ratemode; /* %port (REL) or kbps (ABS) */ uint32_t maxrate; }; enum { CLRL_USER = (1 << 0), /* allocated manually. */ CLRL_SYNC = (1 << 1), /* sync hw update in progress. */ CLRL_ASYNC = (1 << 2), /* async hw update requested. 
*/ CLRL_ERR = (1 << 3), /* last hw setup ended in error. */ }; struct tx_cl_rl_params { int refcount; uint8_t flags; enum fw_sched_params_rate ratemode; /* %port REL or ABS value */ enum fw_sched_params_unit rateunit; /* kbps or pps (when ABS) */ enum fw_sched_params_mode mode; /* aggr or per-flow */ uint32_t maxrate; uint16_t pktsize; uint16_t burstsize; }; /* Tx scheduler parameters for a channel/port */ struct tx_sched_params { /* Channel Rate Limiter */ struct tx_ch_rl_params ch_rl; /* Class WRR */ /* XXX */ /* Class Rate Limiter (including the default pktsize and burstsize). */ int pktsize; int burstsize; struct tx_cl_rl_params cl_rl[]; }; struct port_info { device_t dev; struct adapter *adapter; struct vi_info *vi; int nvi; int up_vis; int uld_vis; bool vxlan_tcam_entry; struct tx_sched_params *sched_params; struct mtx pi_lock; char lockname[16]; unsigned long flags; uint8_t lport; /* associated offload logical port */ int8_t mdio_addr; uint8_t port_type; uint8_t mod_type; uint8_t port_id; uint8_t tx_chan; uint8_t mps_bg_map; /* rx MPS buffer group bitmap */ uint8_t rx_e_chan_map; /* rx TP e-channel bitmap */ uint8_t rx_c_chan; /* rx TP c-channel */ struct link_config link_cfg; struct ifmedia media; struct port_stats stats; u_int tnl_cong_drops; u_int tx_parse_error; int fcs_reg; uint64_t fcs_base; }; #define IS_MAIN_VI(vi) ((vi) == &((vi)->pi->vi[0])) struct cluster_metadata { uma_zone_t zone; caddr_t cl; u_int refcount; }; struct fl_sdesc { caddr_t cl; uint16_t nmbuf; /* # of driver originated mbufs with ref on cluster */ int16_t moff; /* offset of metadata from cl */ uint8_t zidx; }; struct tx_desc { __be64 flit[8]; }; struct tx_sdesc { struct mbuf *m; /* m_nextpkt linked chain of frames */ uint8_t desc_used; /* # of hardware descriptors used by the WR */ }; #define IQ_PAD (IQ_ESIZE - sizeof(struct rsp_ctrl) - sizeof(struct rss_header)) struct iq_desc { struct rss_header rss; uint8_t cpl[IQ_PAD]; struct rsp_ctrl rsp; }; #undef IQ_PAD CTASSERT(sizeof(struct iq_desc) == IQ_ESIZE); enum { /* iq flags */ IQ_SW_ALLOCATED = (1 << 0), /* sw resources allocated */ IQ_HAS_FL = (1 << 1), /* iq associated with a freelist */ IQ_RX_TIMESTAMP = (1 << 2), /* provide the SGE rx timestamp */ IQ_LRO_ENABLED = (1 << 3), /* iq is an eth rxq with LRO enabled */ IQ_ADJ_CREDIT = (1 << 4), /* hw is off by 1 credit for this iq */ IQ_HW_ALLOCATED = (1 << 5), /* fw/hw resources allocated */ /* iq state */ IQS_DISABLED = 0, IQS_BUSY = 1, IQS_IDLE = 2, /* netmap related flags */ NM_OFF = 0, NM_ON = 1, NM_BUSY = 2, }; enum { CPL_COOKIE_RESERVED = 0, CPL_COOKIE_FILTER, CPL_COOKIE_DDP0, CPL_COOKIE_DDP1, CPL_COOKIE_TOM, CPL_COOKIE_HASHFILTER, CPL_COOKIE_ETHOFLD, CPL_COOKIE_KERN_TLS, NUM_CPL_COOKIES = 8 /* Limited by M_COOKIE. Do not increase. */ }; struct sge_iq; struct rss_header; typedef int (*cpl_handler_t)(struct sge_iq *, const struct rss_header *, struct mbuf *); typedef int (*an_handler_t)(struct sge_iq *, const struct rsp_ctrl *); typedef int (*fw_msg_handler_t)(struct adapter *, const __be64 *); /* * Ingress Queue: T4 is producer, driver is consumer. 
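 * Each entry in the ring is a struct iq_desc (IQ_ESIZE bytes, see the
 * CTASSERT above): the RSS header, the CPL, and a trailing rsp_ctrl.
 * The driver spots new entries by checking whether the generation bit
 * in rsp_ctrl still matches iq->gen as it walks the ring.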
*/ struct sge_iq { uint32_t flags; volatile int state; struct adapter *adapter; struct iq_desc *desc; /* KVA of descriptor ring */ int8_t intr_pktc_idx; /* packet count threshold index */ uint8_t gen; /* generation bit */ uint8_t intr_params; /* interrupt holdoff parameters */ int8_t cong; /* congestion settings */ uint16_t qsize; /* size (# of entries) of the queue */ uint16_t sidx; /* index of the entry with the status page */ uint16_t cidx; /* consumer index */ uint16_t cntxt_id; /* SGE context id for the iq */ uint16_t abs_id; /* absolute SGE id for the iq */ int16_t intr_idx; /* interrupt used by the queue */ STAILQ_ENTRY(sge_iq) link; bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_addr_t ba; /* bus address of descriptor ring */ }; enum { /* eq type */ EQ_CTRL = 1, EQ_ETH = 2, EQ_OFLD = 3, /* eq flags */ EQ_SW_ALLOCATED = (1 << 0), /* sw resources allocated */ EQ_HW_ALLOCATED = (1 << 1), /* hw/fw resources allocated */ EQ_ENABLED = (1 << 3), /* open for business */ EQ_QFLUSH = (1 << 4), /* if_qflush in progress */ }; /* Listed in order of preference. Update t4_sysctls too if you change these */ enum {DOORBELL_UDB, DOORBELL_WCWR, DOORBELL_UDBWC, DOORBELL_KDB}; /* * Egress Queue: driver is producer, T4 is consumer. * * Note: A free list is an egress queue (driver produces the buffers and T4 * consumes them) but it's special enough to have its own struct (see sge_fl). */ struct sge_eq { unsigned int flags; /* MUST be first */ unsigned int cntxt_id; /* SGE context id for the eq */ unsigned int abs_id; /* absolute SGE id for the eq */ uint8_t type; /* EQ_CTRL/EQ_ETH/EQ_OFLD */ uint8_t doorbells; uint8_t tx_chan; /* tx channel used by the eq */ struct mtx eq_lock; struct tx_desc *desc; /* KVA of descriptor ring */ volatile uint32_t *udb; /* KVA of doorbell (lies within BAR2) */ u_int udb_qid; /* relative qid within the doorbell page */ uint16_t sidx; /* index of the entry with the status page */ uint16_t cidx; /* consumer idx (desc idx) */ uint16_t pidx; /* producer idx (desc idx) */ uint16_t equeqidx; /* EQUEQ last requested at this pidx */ uint16_t dbidx; /* pidx of the most recent doorbell */ uint16_t iqid; /* cached iq->cntxt_id (see iq below) */ volatile u_int equiq; /* EQUIQ outstanding */ struct sge_iq *iq; /* iq that receives egr_update for the eq */ bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_addr_t ba; /* bus address of descriptor ring */ char lockname[16]; }; struct rx_buf_info { uma_zone_t zone; /* zone that this cluster comes from */ uint16_t size1; /* same as size of cluster: 2K/4K/9K/16K. * hwsize[hwidx1] = size1. No spare. */ uint16_t size2; /* hwsize[hwidx2] = size2. * spare in cluster = size1 - size2. 
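 * The spare gives the driver room for its per-cluster state, notably
 * the struct cluster_metadata defined earlier (fl_sdesc.moff records
 * where the metadata sits within the cluster).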
*/ int8_t hwidx1; /* SGE bufsize idx for size1 */ int8_t hwidx2; /* SGE bufsize idx for size2 */ uint8_t type; /* EXT_xxx type of the cluster */ }; enum { NUM_MEMWIN = 3, MEMWIN0_APERTURE = 2048, MEMWIN0_BASE = 0x1b800, MEMWIN1_APERTURE = 32768, MEMWIN1_BASE = 0x28000, MEMWIN2_APERTURE_T4 = 65536, MEMWIN2_BASE_T4 = 0x30000, MEMWIN2_APERTURE_T5 = 128 * 1024, MEMWIN2_BASE_T5 = 0x60000, }; struct memwin { struct rwlock mw_lock __aligned(CACHE_LINE_SIZE); uint32_t mw_base; /* constant after setup_memwin */ uint32_t mw_aperture; /* ditto */ uint32_t mw_curpos; /* protected by mw_lock */ }; enum { FL_STARVING = (1 << 0), /* on the adapter's list of starving fl's */ FL_DOOMED = (1 << 1), /* about to be destroyed */ FL_BUF_PACKING = (1 << 2), /* buffer packing enabled */ FL_BUF_RESUME = (1 << 3), /* resume from the middle of the frame */ }; #define FL_RUNNING_LOW(fl) \ (IDXDIFF(fl->dbidx * 8, fl->cidx, fl->sidx * 8) <= fl->lowat) #define FL_NOT_RUNNING_LOW(fl) \ (IDXDIFF(fl->dbidx * 8, fl->cidx, fl->sidx * 8) >= 2 * fl->lowat) struct sge_fl { struct mtx fl_lock; __be64 *desc; /* KVA of descriptor ring, ptr to addresses */ struct fl_sdesc *sdesc; /* KVA of software descriptor ring */ uint16_t zidx; /* refill zone idx */ uint16_t safe_zidx; uint16_t lowat; /* # of buffers <= this means fl needs help */ int flags; uint16_t buf_boundary; /* The 16b idx all deal with hw descriptors */ uint16_t dbidx; /* hw pidx after last doorbell */ uint16_t sidx; /* index of status page */ volatile uint16_t hw_cidx; /* The 32b idx are all buffer idx, not hardware descriptor idx */ uint32_t cidx; /* consumer index */ uint32_t pidx; /* producer index */ uint32_t dbval; u_int rx_offset; /* offset in fl buf (when buffer packing) */ volatile uint32_t *udb; uint64_t cl_allocated; /* # of clusters allocated */ uint64_t cl_recycled; /* # of clusters recycled */ uint64_t cl_fast_recycled; /* # of clusters recycled (fast) */ /* These 3 are valid when FL_BUF_RESUME is set, stale otherwise. */ struct mbuf *m0; struct mbuf **pnext; u_int remaining; uint16_t qsize; /* # of hw descriptors (status page included) */ uint16_t cntxt_id; /* SGE context id for the freelist */ TAILQ_ENTRY(sge_fl) link; /* All starving freelists */ bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; char lockname[16]; bus_addr_t ba; /* bus address of descriptor ring */ }; struct mp_ring; struct txpkts { uint8_t wr_type; /* type 0 or type 1 */ uint8_t npkt; /* # of packets in this work request */ uint8_t len16; /* # of 16B pieces used by this work request */ uint8_t score; uint8_t max_npkt; /* maximum number of packets allowed */ uint16_t plen; /* total payload (sum of all packets) */ /* straight from fw_eth_tx_pkts_vm_wr. 
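 * These cache the Ethernet/VLAN header of the first frame in the
 * batch.  A VM work request carries exactly one copy of this header,
 * so a later frame can be added to the batch only if its header
 * matches these fields.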
*/ __u8 ethmacdst[6]; __u8 ethmacsrc[6]; __be16 ethtype; __be16 vlantci; struct mbuf *mb[15]; }; /* txq: SGE egress queue + what's needed for Ethernet NIC */ struct sge_txq { struct sge_eq eq; /* MUST be first */ struct ifnet *ifp; /* the interface this txq belongs to */ struct mp_ring *r; /* tx software ring */ struct tx_sdesc *sdesc; /* KVA of software descriptor ring */ struct sglist *gl; __be32 cpl_ctrl0; /* for convenience */ int tc_idx; /* traffic class */ uint64_t last_tx; /* cycle count when eth_tx was last called */ struct txpkts txp; struct task tx_reclaim_task; /* stats for common events first */ uint64_t txcsum; /* # of times hardware assisted with checksum */ uint64_t tso_wrs; /* # of TSO work requests */ uint64_t vlan_insertion;/* # of times VLAN tag was inserted */ uint64_t imm_wrs; /* # of work requests with immediate data */ uint64_t sgl_wrs; /* # of work requests with direct SGL */ uint64_t txpkt_wrs; /* # of txpkt work requests (not coalesced) */ uint64_t txpkts0_wrs; /* # of type0 coalesced tx work requests */ uint64_t txpkts1_wrs; /* # of type1 coalesced tx work requests */ uint64_t txpkts0_pkts; /* # of frames in type0 coalesced tx WRs */ uint64_t txpkts1_pkts; /* # of frames in type1 coalesced tx WRs */ uint64_t txpkts_flush; /* # of times txp had to be sent by tx_update */ uint64_t raw_wrs; /* # of raw work requests (alloc_wr_mbuf) */ uint64_t vxlan_tso_wrs; /* # of VXLAN TSO work requests */ uint64_t vxlan_txcsum; uint64_t kern_tls_records; uint64_t kern_tls_short; uint64_t kern_tls_partial; uint64_t kern_tls_full; uint64_t kern_tls_octets; uint64_t kern_tls_waste; uint64_t kern_tls_options; uint64_t kern_tls_header; uint64_t kern_tls_fin; uint64_t kern_tls_fin_short; uint64_t kern_tls_cbc; uint64_t kern_tls_gcm; /* stats for not-that-common events */ /* Optional scratch space for constructing work requests. */ uint8_t ss[SGE_MAX_WR_LEN] __aligned(16); } __aligned(CACHE_LINE_SIZE); /* rxq: SGE ingress queue + SGE free list + miscellaneous items */ struct sge_rxq { struct sge_iq iq; /* MUST be first */ struct sge_fl fl; /* MUST follow iq */ struct ifnet *ifp; /* the interface this rxq belongs to */ struct lro_ctrl lro; /* LRO state */ /* stats for common events first */ uint64_t rxcsum; /* # of times hardware assisted with checksum */ uint64_t vlan_extraction;/* # of times VLAN tag was extracted */ uint64_t vxlan_rxcsum; /* stats for not-that-common events */ } __aligned(CACHE_LINE_SIZE); static inline struct sge_rxq * iq_to_rxq(struct sge_iq *iq) { return (__containerof(iq, struct sge_rxq, iq)); } /* ofld_rxq: SGE ingress queue + SGE free list + miscellaneous items */ struct sge_ofld_rxq { struct sge_iq iq; /* MUST be first */ struct sge_fl fl; /* MUST follow iq */ counter_u64_t rx_iscsi_ddp_setup_ok; counter_u64_t rx_iscsi_ddp_setup_error; uint64_t rx_iscsi_ddp_pdus; uint64_t rx_iscsi_ddp_octets; uint64_t rx_iscsi_fl_pdus; uint64_t rx_iscsi_fl_octets; u_long rx_toe_tls_records; u_long rx_toe_tls_octets; } __aligned(CACHE_LINE_SIZE); static inline struct sge_ofld_rxq * iq_to_ofld_rxq(struct sge_iq *iq) { return (__containerof(iq, struct sge_ofld_rxq, iq)); } struct wrqe { STAILQ_ENTRY(wrqe) link; struct sge_wrq *wrq; int wr_len; char wr[] __aligned(16); }; struct wrq_cookie { TAILQ_ENTRY(wrq_cookie) link; int ndesc; int pidx; }; /* * wrq: SGE egress queue that is given prebuilt work requests. Control queues * are of this type. 
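 * A typical producer looks like this (alloc_wrqe(), wrtod(), and
 * t4_wrq_tx() are defined near the end of this file; the body of the
 * WR is up to the caller):
 *
 *	wr = alloc_wrqe(wr_len, wrq);
 *	if (wr == NULL)
 *		return (ENOMEM);
 *	... build the work request at wrtod(wr) ...
 *	t4_wrq_tx(sc, wr);
 *
 * Callers that want to build a WR in place instead reserve descriptors
 * with start_wrq_wr() and hand them back with commit_wrq_wr(), using a
 * struct wrq_cookie (declared above).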
*/ struct sge_wrq { struct sge_eq eq; /* MUST be first */ struct adapter *adapter; struct task wrq_tx_task; /* Tx desc reserved but WR not "committed" yet. */ TAILQ_HEAD(wrq_incomplete_wrs , wrq_cookie) incomplete_wrs; /* List of WRs ready to go out as soon as descriptors are available. */ STAILQ_HEAD(, wrqe) wr_list; u_int nwr_pending; u_int ndesc_needed; /* stats for common events first */ uint64_t tx_wrs_direct; /* # of WRs written directly to desc ring. */ uint64_t tx_wrs_ss; /* # of WRs copied from scratch space. */ uint64_t tx_wrs_copied; /* # of WRs queued and copied to desc ring. */ /* stats for not-that-common events */ /* * Scratch space for work requests that wrap around after reaching the * status page, and some information about the last WR that used it. */ uint16_t ss_pidx; uint16_t ss_len; uint8_t ss[SGE_MAX_WR_LEN]; } __aligned(CACHE_LINE_SIZE); /* ofld_txq: SGE egress queue + miscellaneous items */ struct sge_ofld_txq { struct sge_wrq wrq; counter_u64_t tx_iscsi_pdus; counter_u64_t tx_iscsi_octets; counter_u64_t tx_toe_tls_records; counter_u64_t tx_toe_tls_octets; } __aligned(CACHE_LINE_SIZE); #define INVALID_NM_RXQ_CNTXT_ID ((uint16_t)(-1)) struct sge_nm_rxq { /* Items used by the driver rx ithread are in this cacheline. */ volatile int nm_state __aligned(CACHE_LINE_SIZE); /* NM_OFF, NM_ON, or NM_BUSY */ u_int nid; /* netmap ring # for this queue */ struct vi_info *vi; struct iq_desc *iq_desc; uint16_t iq_abs_id; uint16_t iq_cntxt_id; uint16_t iq_cidx; uint16_t iq_sidx; uint8_t iq_gen; uint32_t fl_sidx; /* Items used by netmap rxsync are in this cacheline. */ __be64 *fl_desc __aligned(CACHE_LINE_SIZE); uint16_t fl_cntxt_id; uint32_t fl_pidx; uint32_t fl_sidx2; /* copy of fl_sidx */ uint32_t fl_db_val; u_int fl_db_saved; u_int fl_db_threshold; /* in descriptors */ u_int fl_hwidx:4; /* * fl_cidx is used by both the ithread and rxsync, the rest are not used * in the rx fast path. 
*/ uint32_t fl_cidx __aligned(CACHE_LINE_SIZE); bus_dma_tag_t iq_desc_tag; bus_dmamap_t iq_desc_map; bus_addr_t iq_ba; int intr_idx; bus_dma_tag_t fl_desc_tag; bus_dmamap_t fl_desc_map; bus_addr_t fl_ba; }; #define INVALID_NM_TXQ_CNTXT_ID ((u_int)(-1)) struct sge_nm_txq { struct tx_desc *desc; uint16_t cidx; uint16_t pidx; uint16_t sidx; uint16_t equiqidx; /* EQUIQ last requested at this pidx */ uint16_t equeqidx; /* EQUEQ last requested at this pidx */ uint16_t dbidx; /* pidx of the most recent doorbell */ uint8_t doorbells; volatile uint32_t *udb; u_int udb_qid; u_int cntxt_id; __be32 cpl_ctrl0; /* for convenience */ __be32 op_pkd; /* ditto */ u_int nid; /* netmap ring # for this queue */ /* infrequently used items after this */ bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_addr_t ba; int iqidx; } __aligned(CACHE_LINE_SIZE); struct sge { int nrxq; /* total # of Ethernet rx queues */ int ntxq; /* total # of Ethernet tx queues */ int nofldrxq; /* total # of TOE rx queues */ int nofldtxq; /* total # of TOE tx queues */ int nnmrxq; /* total # of netmap rx queues */ int nnmtxq; /* total # of netmap tx queues */ int niq; /* total # of ingress queues */ int neq; /* total # of egress queues */ struct sge_iq fwq; /* Firmware event queue */ struct sge_wrq *ctrlq; /* Control queues */ struct sge_txq *txq; /* NIC tx queues */ struct sge_rxq *rxq; /* NIC rx queues */ struct sge_ofld_txq *ofld_txq; /* TOE tx queues */ struct sge_ofld_rxq *ofld_rxq; /* TOE rx queues */ struct sge_nm_txq *nm_txq; /* netmap tx queues */ struct sge_nm_rxq *nm_rxq; /* netmap rx queues */ uint16_t iq_start; /* first cntxt_id */ uint16_t iq_base; /* first abs_id */ int eq_start; /* first cntxt_id */ int eq_base; /* first abs_id */ int iqmap_sz; int eqmap_sz; struct sge_iq **iqmap; /* iq->cntxt_id to iq mapping */ struct sge_eq **eqmap; /* eq->cntxt_id to eq mapping */ int8_t safe_zidx; struct rx_buf_info rx_buf_info[SW_ZONE_SIZES]; }; struct devnames { const char *nexus_name; const char *ifnet_name; const char *vi_ifnet_name; const char *pf03_drv_name; const char *vf_nexus_name; const char *vf_ifnet_name; }; struct clip_entry; struct adapter { SLIST_ENTRY(adapter) link; device_t dev; struct cdev *cdev; const struct devnames *names; /* PCIe register resources */ int regs_rid; struct resource *regs_res; int msix_rid; struct resource *msix_res; bus_space_handle_t bh; bus_space_tag_t bt; bus_size_t mmio_len; int udbs_rid; struct resource *udbs_res; volatile uint8_t *udbs_base; unsigned int pf; unsigned int mbox; unsigned int vpd_busy; unsigned int vpd_flag; /* Interrupt information */ int intr_type; int intr_count; struct irq { struct resource *res; int rid; void *tag; struct sge_rxq *rxq; struct sge_nm_rxq *nm_rxq; } __aligned(CACHE_LINE_SIZE) *irq; int sge_gts_reg; int sge_kdoorbell_reg; bus_dma_tag_t dmat; /* Parent DMA tag */ struct sge sge; int lro_timeout; int sc_do_rxcopy; int vxlan_port; u_int vxlan_refcount; int rawf_base; int nrawf; struct taskqueue *tq[MAX_NCHAN]; /* General purpose taskqueues */ struct task async_event_task; struct port_info *port[MAX_NPORTS]; uint8_t chan_map[MAX_NCHAN]; /* channel -> port */ - struct mtx clip_table_lock; - TAILQ_HEAD(, clip_entry) clip_table; + CXGBE_LIST_HEAD(, clip_entry) *clip_table; + TAILQ_HEAD(, clip_entry) clip_pending; /* these need hw update. 
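+ * clip_task presumably flushes the pending entries out to the
+ * hardware CLIP region.  The table itself is now an array of hash
+ * buckets; assuming it is sized hashinit(9)-style, so that clip_mask
+ * is nbuckets - 1, a lookup would be roughly:
+ *
+ *	head = &sc->clip_table[hash_of(addr) & sc->clip_mask];
+ *	then walk that CXGBE_LIST_HEAD comparing addresses.
+ *
+ * (hash_of() is a placeholder here, not a function in this driver.)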
*/ + u_long clip_mask; int clip_gen; + struct timeout_task clip_task; void *tom_softc; /* (struct tom_data *) */ struct tom_tunables tt; struct t4_offload_policy *policy; struct rwlock policy_lock; void *iwarp_softc; /* (struct c4iw_dev *) */ struct iw_tunables iwt; void *iscsi_ulp_softc; /* (struct cxgbei_data *) */ void *ccr_softc; /* (struct ccr_softc *) */ struct l2t_data *l2t; /* L2 table */ struct smt_data *smt; /* Source MAC Table */ struct tid_info tids; vmem_t *key_map; struct tls_tunables tlst; uint8_t doorbells; int offload_map; /* ports with IFCAP_TOE enabled */ int active_ulds; /* ULDs activated on this adapter */ int flags; int debug_flags; char ifp_lockname[16]; struct mtx ifp_lock; struct ifnet *ifp; /* tracer ifp */ struct ifmedia media; int traceq; /* iq used by all tracers, -1 if none */ int tracer_valid; /* bitmap of valid tracers */ int tracer_enabled; /* bitmap of enabled tracers */ char fw_version[16]; char tp_version[16]; char er_version[16]; char bs_version[16]; char cfg_file[32]; u_int cfcsum; struct adapter_params params; const struct chip_params *chip_params; struct t4_virt_res vres; uint16_t nbmcaps; uint16_t linkcaps; uint16_t switchcaps; uint16_t niccaps; uint16_t toecaps; uint16_t rdmacaps; uint16_t cryptocaps; uint16_t iscsicaps; uint16_t fcoecaps; struct sysctl_ctx_list ctx; struct sysctl_oid *ctrlq_oid; struct sysctl_oid *fwq_oid; struct mtx sc_lock; char lockname[16]; /* Starving free lists */ struct mtx sfl_lock; /* same cache-line as sc_lock? but that's ok */ TAILQ_HEAD(, sge_fl) sfl; struct callout sfl_callout; /* * Driver code that can run when the adapter is suspended must use this * lock or a synchronized_op and check for HW_OFF_LIMITS before * accessing hardware. * * XXX: could be changed to rwlock. wlock in suspend/resume and for * indirect register access, rlock everywhere else. 
*/ struct mtx reg_lock; struct memwin memwin[NUM_MEMWIN]; /* memory windows */ struct mtx tc_lock; struct task tc_task; struct task reset_task; const void *reset_thread; int num_resets; int incarnation; const char *last_op; const void *last_op_thr; int last_op_flags; int swintr; int sensor_resets; struct callout ktls_tick; }; #define ADAPTER_LOCK(sc) mtx_lock(&(sc)->sc_lock) #define ADAPTER_UNLOCK(sc) mtx_unlock(&(sc)->sc_lock) #define ADAPTER_LOCK_ASSERT_OWNED(sc) mtx_assert(&(sc)->sc_lock, MA_OWNED) #define ADAPTER_LOCK_ASSERT_NOTOWNED(sc) mtx_assert(&(sc)->sc_lock, MA_NOTOWNED) #define ASSERT_SYNCHRONIZED_OP(sc) \ KASSERT(IS_BUSY(sc) && \ (mtx_owned(&(sc)->sc_lock) || sc->last_op_thr == curthread), \ ("%s: operation not synchronized.", __func__)) #define PORT_LOCK(pi) mtx_lock(&(pi)->pi_lock) #define PORT_UNLOCK(pi) mtx_unlock(&(pi)->pi_lock) #define PORT_LOCK_ASSERT_OWNED(pi) mtx_assert(&(pi)->pi_lock, MA_OWNED) #define PORT_LOCK_ASSERT_NOTOWNED(pi) mtx_assert(&(pi)->pi_lock, MA_NOTOWNED) #define FL_LOCK(fl) mtx_lock(&(fl)->fl_lock) #define FL_TRYLOCK(fl) mtx_trylock(&(fl)->fl_lock) #define FL_UNLOCK(fl) mtx_unlock(&(fl)->fl_lock) #define FL_LOCK_ASSERT_OWNED(fl) mtx_assert(&(fl)->fl_lock, MA_OWNED) #define FL_LOCK_ASSERT_NOTOWNED(fl) mtx_assert(&(fl)->fl_lock, MA_NOTOWNED) #define RXQ_FL_LOCK(rxq) FL_LOCK(&(rxq)->fl) #define RXQ_FL_UNLOCK(rxq) FL_UNLOCK(&(rxq)->fl) #define RXQ_FL_LOCK_ASSERT_OWNED(rxq) FL_LOCK_ASSERT_OWNED(&(rxq)->fl) #define RXQ_FL_LOCK_ASSERT_NOTOWNED(rxq) FL_LOCK_ASSERT_NOTOWNED(&(rxq)->fl) #define EQ_LOCK(eq) mtx_lock(&(eq)->eq_lock) #define EQ_TRYLOCK(eq) mtx_trylock(&(eq)->eq_lock) #define EQ_UNLOCK(eq) mtx_unlock(&(eq)->eq_lock) #define EQ_LOCK_ASSERT_OWNED(eq) mtx_assert(&(eq)->eq_lock, MA_OWNED) #define EQ_LOCK_ASSERT_NOTOWNED(eq) mtx_assert(&(eq)->eq_lock, MA_NOTOWNED) #define TXQ_LOCK(txq) EQ_LOCK(&(txq)->eq) #define TXQ_TRYLOCK(txq) EQ_TRYLOCK(&(txq)->eq) #define TXQ_UNLOCK(txq) EQ_UNLOCK(&(txq)->eq) #define TXQ_LOCK_ASSERT_OWNED(txq) EQ_LOCK_ASSERT_OWNED(&(txq)->eq) #define TXQ_LOCK_ASSERT_NOTOWNED(txq) EQ_LOCK_ASSERT_NOTOWNED(&(txq)->eq) #define for_each_txq(vi, iter, q) \ for (q = &vi->adapter->sge.txq[vi->first_txq], iter = 0; \ iter < vi->ntxq; ++iter, ++q) #define for_each_rxq(vi, iter, q) \ for (q = &vi->adapter->sge.rxq[vi->first_rxq], iter = 0; \ iter < vi->nrxq; ++iter, ++q) #define for_each_ofld_txq(vi, iter, q) \ for (q = &vi->adapter->sge.ofld_txq[vi->first_ofld_txq], iter = 0; \ iter < vi->nofldtxq; ++iter, ++q) #define for_each_ofld_rxq(vi, iter, q) \ for (q = &vi->adapter->sge.ofld_rxq[vi->first_ofld_rxq], iter = 0; \ iter < vi->nofldrxq; ++iter, ++q) #define for_each_nm_txq(vi, iter, q) \ for (q = &vi->adapter->sge.nm_txq[vi->first_nm_txq], iter = 0; \ iter < vi->nnmtxq; ++iter, ++q) #define for_each_nm_rxq(vi, iter, q) \ for (q = &vi->adapter->sge.nm_rxq[vi->first_nm_rxq], iter = 0; \ iter < vi->nnmrxq; ++iter, ++q) #define for_each_vi(_pi, _iter, _vi) \ for ((_vi) = (_pi)->vi, (_iter) = 0; (_iter) < (_pi)->nvi; \ ++(_iter), ++(_vi)) #define IDXINCR(idx, incr, wrap) do { \ idx = wrap - idx > incr ? idx + incr : incr - (wrap - idx); \ } while (0) #define IDXDIFF(head, tail, wrap) \ ((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head)) /* One for errors, one for firmware events */ #define T4_EXTRA_INTR 2 /* One for firmware events */ #define T4VF_EXTRA_INTR 1 static inline int forwarding_intr_to_fwq(struct adapter *sc) { return (sc->intr_count == 1); } /* Works reliably inside a sync_op or with reg_lock held. 
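 * A minimal sketch of the expected calling pattern for code that can
 * run while the adapter is suspended (the error handling shown is
 * illustrative):
 *
 *	mtx_lock(&sc->reg_lock);
 *	if (hw_off_limits(sc))
 *		rc = ENXIO;	(registers are off limits)
 *	else
 *		val = t4_read_reg(sc, reg);
 *	mtx_unlock(&sc->reg_lock);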
*/ static inline bool hw_off_limits(struct adapter *sc) { return (__predict_false(sc->flags & HW_OFF_LIMITS)); } static inline uint32_t t4_read_reg(struct adapter *sc, uint32_t reg) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); return bus_space_read_4(sc->bt, sc->bh, reg); } static inline void t4_write_reg(struct adapter *sc, uint32_t reg, uint32_t val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); bus_space_write_4(sc->bt, sc->bh, reg, val); } static inline uint64_t t4_read_reg64(struct adapter *sc, uint32_t reg) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); #ifdef __LP64__ return bus_space_read_8(sc->bt, sc->bh, reg); #else return (uint64_t)bus_space_read_4(sc->bt, sc->bh, reg) + ((uint64_t)bus_space_read_4(sc->bt, sc->bh, reg + 4) << 32); #endif } static inline void t4_write_reg64(struct adapter *sc, uint32_t reg, uint64_t val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); #ifdef __LP64__ bus_space_write_8(sc->bt, sc->bh, reg, val); #else bus_space_write_4(sc->bt, sc->bh, reg, val); bus_space_write_4(sc->bt, sc->bh, reg + 4, val>> 32); #endif } static inline void t4_os_pci_read_cfg1(struct adapter *sc, int reg, uint8_t *val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); *val = pci_read_config(sc->dev, reg, 1); } static inline void t4_os_pci_write_cfg1(struct adapter *sc, int reg, uint8_t val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); pci_write_config(sc->dev, reg, val, 1); } static inline void t4_os_pci_read_cfg2(struct adapter *sc, int reg, uint16_t *val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); *val = pci_read_config(sc->dev, reg, 2); } static inline void t4_os_pci_write_cfg2(struct adapter *sc, int reg, uint16_t val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); pci_write_config(sc->dev, reg, val, 2); } static inline void t4_os_pci_read_cfg4(struct adapter *sc, int reg, uint32_t *val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); *val = pci_read_config(sc->dev, reg, 4); } static inline void t4_os_pci_write_cfg4(struct adapter *sc, int reg, uint32_t val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); pci_write_config(sc->dev, reg, val, 4); } static inline struct port_info * adap2pinfo(struct adapter *sc, int idx) { return (sc->port[idx]); } static inline void t4_os_set_hw_addr(struct port_info *pi, uint8_t hw_addr[]) { bcopy(hw_addr, pi->vi[0].hw_addr, ETHER_ADDR_LEN); } static inline int tx_resume_threshold(struct sge_eq *eq) { /* not quite the same as qsize / 4, but this will do. */ return (eq->sidx / 4); } static inline int t4_use_ldst(struct adapter *sc) { #ifdef notyet return (sc->flags & FW_OK || !sc->use_bd); #else return (0); #endif } static inline void CH_DUMP_MBOX(struct adapter *sc, int mbox, const int reg, const char *msg, const __be64 *const p, const bool err) { if (!(sc->debug_flags & DF_DUMP_MBOX) && !err) return; if (p != NULL) { log(err ? LOG_ERR : LOG_DEBUG, "%s: mbox %u %s %016llx %016llx %016llx %016llx " "%016llx %016llx %016llx %016llx\n", device_get_nameunit(sc->dev), mbox, msg, (long long)be64_to_cpu(p[0]), (long long)be64_to_cpu(p[1]), (long long)be64_to_cpu(p[2]), (long long)be64_to_cpu(p[3]), (long long)be64_to_cpu(p[4]), (long long)be64_to_cpu(p[5]), (long long)be64_to_cpu(p[6]), (long long)be64_to_cpu(p[7])); } else { log(err ? 
LOG_ERR : LOG_DEBUG, "%s: mbox %u %s %016llx %016llx %016llx %016llx " "%016llx %016llx %016llx %016llx\n", device_get_nameunit(sc->dev), mbox, msg, (long long)t4_read_reg64(sc, reg), (long long)t4_read_reg64(sc, reg + 8), (long long)t4_read_reg64(sc, reg + 16), (long long)t4_read_reg64(sc, reg + 24), (long long)t4_read_reg64(sc, reg + 32), (long long)t4_read_reg64(sc, reg + 40), (long long)t4_read_reg64(sc, reg + 48), (long long)t4_read_reg64(sc, reg + 56)); } } /* t4_main.c */ extern int t4_ntxq; extern int t4_nrxq; extern int t4_intr_types; extern int t4_tmr_idx; extern int t4_pktc_idx; extern unsigned int t4_qsize_rxq; extern unsigned int t4_qsize_txq; extern device_method_t cxgbe_methods[]; int t4_os_find_pci_capability(struct adapter *, int); int t4_os_pci_save_state(struct adapter *); int t4_os_pci_restore_state(struct adapter *); void t4_os_portmod_changed(struct port_info *); void t4_os_link_changed(struct port_info *); void t4_iterate(void (*)(struct adapter *, void *), void *); void t4_init_devnames(struct adapter *); void t4_add_adapter(struct adapter *); int t4_detach_common(device_t); int t4_map_bars_0_and_4(struct adapter *); int t4_map_bar_2(struct adapter *); int t4_setup_intr_handlers(struct adapter *); void t4_sysctls(struct adapter *); int begin_synchronized_op(struct adapter *, struct vi_info *, int, char *); void doom_vi(struct adapter *, struct vi_info *); void end_synchronized_op(struct adapter *, int); int update_mac_settings(struct ifnet *, int); int adapter_init(struct adapter *); int vi_init(struct vi_info *); void vi_sysctls(struct vi_info *); int rw_via_memwin(struct adapter *, int, uint32_t, uint32_t *, int, int); int alloc_atid(struct adapter *, void *); void *lookup_atid(struct adapter *, int); void free_atid(struct adapter *, int); void release_tid(struct adapter *, int, struct sge_wrq *); int cxgbe_media_change(struct ifnet *); void cxgbe_media_status(struct ifnet *, struct ifmediareq *); bool t4_os_dump_cimla(struct adapter *, int, bool); void t4_os_dump_devlog(struct adapter *); #ifdef KERN_TLS /* t4_kern_tls.c */ int cxgbe_tls_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); void cxgbe_tls_tag_free(struct m_snd_tag *); void t6_ktls_modload(void); void t6_ktls_modunload(void); int t6_ktls_try(struct ifnet *, struct socket *, struct ktls_session *); int t6_ktls_parse_pkt(struct mbuf *, int *, int *); int t6_ktls_write_wr(struct sge_txq *, void *, struct mbuf *, u_int, u_int); #endif /* t4_keyctx.c */ struct auth_hash; union authctx; void t4_aes_getdeckey(void *, const void *, unsigned int); void t4_copy_partial_hash(int, union authctx *, void *); void t4_init_gmac_hash(const char *, int, char *); void t4_init_hmac_digest(struct auth_hash *, u_int, const char *, int, char *); #ifdef DEV_NETMAP /* t4_netmap.c */ struct sge_nm_rxq; void cxgbe_nm_attach(struct vi_info *); void cxgbe_nm_detach(struct vi_info *); void service_nm_rxq(struct sge_nm_rxq *); int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int); int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *); int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int); int free_nm_txq(struct vi_info *, struct sge_nm_txq *); #endif /* t4_sge.c */ void t4_sge_modload(void); void t4_sge_modunload(void); uint64_t t4_sge_extfree_refs(void); void t4_tweak_chip_settings(struct adapter *); int t4_verify_chip_settings(struct adapter *); void t4_init_rx_buf_info(struct adapter *); int t4_create_dma_tag(struct adapter *); void t4_sge_sysctls(struct adapter *, struct 
sysctl_ctx_list *, struct sysctl_oid_list *); int t4_destroy_dma_tag(struct adapter *); int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *, bus_addr_t *, void **); int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t, void *); void free_fl_buffers(struct adapter *, struct sge_fl *); int t4_setup_adapter_queues(struct adapter *); int t4_teardown_adapter_queues(struct adapter *); int t4_setup_vi_queues(struct vi_info *); int t4_teardown_vi_queues(struct vi_info *); void t4_intr_all(void *); void t4_intr(void *); #ifdef DEV_NETMAP void t4_nm_intr(void *); void t4_vi_intr(void *); #endif void t4_intr_err(void *); void t4_intr_evt(void *); void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *); void t4_update_fl_bufsize(struct ifnet *); struct mbuf *alloc_wr_mbuf(int, int); int parse_pkt(struct mbuf **, bool); void *start_wrq_wr(struct sge_wrq *, int, struct wrq_cookie *); void commit_wrq_wr(struct sge_wrq *, void *, struct wrq_cookie *); int tnl_cong(struct port_info *, int); void t4_register_an_handler(an_handler_t); void t4_register_fw_msg_handler(int, fw_msg_handler_t); void t4_register_cpl_handler(int, cpl_handler_t); void t4_register_shared_cpl_handler(int, cpl_handler_t, int); #ifdef RATELIMIT int ethofld_transmit(struct ifnet *, struct mbuf *); void send_etid_flush_wr(struct cxgbe_rate_tag *); #endif /* t4_tracer.c */ struct t4_tracer; void t4_tracer_modload(void); void t4_tracer_modunload(void); void t4_tracer_port_detach(struct adapter *); int t4_get_tracer(struct adapter *, struct t4_tracer *); int t4_set_tracer(struct adapter *, struct t4_tracer *); int t4_trace_pkt(struct sge_iq *, const struct rss_header *, struct mbuf *); int t5_trace_pkt(struct sge_iq *, const struct rss_header *, struct mbuf *); /* t4_sched.c */ int t4_set_sched_class(struct adapter *, struct t4_sched_params *); int t4_set_sched_queue(struct adapter *, struct t4_sched_queue *); int t4_init_tx_sched(struct adapter *); int t4_free_tx_sched(struct adapter *); void t4_update_tx_sched(struct adapter *); int t4_reserve_cl_rl_kbps(struct adapter *, int, u_int, int *); void t4_release_cl_rl(struct adapter *, int, int); int sysctl_tc(SYSCTL_HANDLER_ARGS); int sysctl_tc_params(SYSCTL_HANDLER_ARGS); #ifdef RATELIMIT void t4_init_etid_table(struct adapter *); void t4_free_etid_table(struct adapter *); struct cxgbe_rate_tag *lookup_etid(struct adapter *, int); int cxgbe_rate_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); int cxgbe_rate_tag_modify(struct m_snd_tag *, union if_snd_tag_modify_params *); int cxgbe_rate_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *); void cxgbe_rate_tag_free(struct m_snd_tag *); void cxgbe_rate_tag_free_locked(struct cxgbe_rate_tag *); void cxgbe_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *); #endif /* t4_filter.c */ int get_filter_mode(struct adapter *, uint32_t *); int set_filter_mode(struct adapter *, uint32_t); int set_filter_mask(struct adapter *, uint32_t); int get_filter(struct adapter *, struct t4_filter *); int set_filter(struct adapter *, struct t4_filter *); int del_filter(struct adapter *, struct t4_filter *); int t4_filter_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); int t4_hashfilter_ao_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); int t4_hashfilter_tcb_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); int t4_del_hashfilter_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); void 
free_hftid_hash(struct tid_info *); static inline struct wrqe * alloc_wrqe(int wr_len, struct sge_wrq *wrq) { int len = offsetof(struct wrqe, wr) + wr_len; struct wrqe *wr; wr = malloc(len, M_CXGBE, M_NOWAIT); if (__predict_false(wr == NULL)) return (NULL); wr->wr_len = wr_len; wr->wrq = wrq; return (wr); } static inline void * wrtod(struct wrqe *wr) { return (&wr->wr[0]); } static inline void free_wrqe(struct wrqe *wr) { free(wr, M_CXGBE); } static inline void t4_wrq_tx(struct adapter *sc, struct wrqe *wr) { struct sge_wrq *wrq = wr->wrq; TXQ_LOCK(wrq); t4_wrq_tx_locked(sc, wrq, wr); TXQ_UNLOCK(wrq); } static inline int read_via_memwin(struct adapter *sc, int idx, uint32_t addr, uint32_t *val, int len) { return (rw_via_memwin(sc, idx, addr, val, len, 0)); } static inline int write_via_memwin(struct adapter *sc, int idx, uint32_t addr, const uint32_t *val, int len) { return (rw_via_memwin(sc, idx, addr, (void *)(uintptr_t)val, len, 1)); } /* Number of len16 -> number of descriptors */ static inline int tx_len16_to_desc(int len16) { return (howmany(len16, EQ_ESIZE / 16)); } #endif diff --git a/sys/dev/cxgbe/crypto/t4_kern_tls.c b/sys/dev/cxgbe/crypto/t4_kern_tls.c index 957d0202fa3f..99d0d33cf128 100644 --- a/sys/dev/cxgbe/crypto/t4_kern_tls.c +++ b/sys/dev/cxgbe/crypto/t4_kern_tls.c @@ -1,2397 +1,2397 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2018-2019 Chelsio Communications, Inc. * All rights reserved. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_tcb.h" #include "t4_l2t.h" #include "t4_clip.h" #include "t4_mp_ring.h" #include "crypto/t4_crypto.h" #if defined(INET) || defined(INET6) #define SALT_SIZE 4 #define GCM_TAG_SIZE 16 #define TLS_HEADER_LENGTH 5 #define TLS_KEY_CONTEXT_SZ roundup2(sizeof(struct tls_keyctx), 32) struct tls_scmd { __be32 seqno_numivs; __be32 ivgen_hdrlen; }; struct tls_key_req { /* FW_ULPTX_WR */ __be32 wr_hi; __be32 wr_mid; __be32 ftid; __u8 reneg_to_write_rx; __u8 protocol; __be16 mfs; /* master command */ __be32 cmd; __be32 len16; /* command length */ __be32 dlen; /* data length in 32-byte units */ __be32 kaddr; /* sub-command */ __be32 sc_more; __be32 sc_len; }__packed; struct tls_keyctx { struct tx_keyctx_hdr { __u8 ctxlen; __u8 r2; __be16 dualck_to_txvalid; __u8 txsalt[4]; __be64 r5; } txhdr; struct keys { __u8 edkey[32]; __u8 ipad[64]; __u8 opad[64]; } keys; }; #define S_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT 11 #define M_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT 0x1 #define V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(x) \ ((x) << S_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT) #define G_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT) & \ M_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT) #define F_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT \ V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1U) #define S_TLS_KEYCTX_TX_WR_SALT_PRESENT 10 #define M_TLS_KEYCTX_TX_WR_SALT_PRESENT 0x1 #define V_TLS_KEYCTX_TX_WR_SALT_PRESENT(x) \ ((x) << S_TLS_KEYCTX_TX_WR_SALT_PRESENT) #define G_TLS_KEYCTX_TX_WR_SALT_PRESENT(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_SALT_PRESENT) & \ M_TLS_KEYCTX_TX_WR_SALT_PRESENT) #define F_TLS_KEYCTX_TX_WR_SALT_PRESENT \ V_TLS_KEYCTX_TX_WR_SALT_PRESENT(1U) #define S_TLS_KEYCTX_TX_WR_TXCK_SIZE 6 #define M_TLS_KEYCTX_TX_WR_TXCK_SIZE 0xf #define V_TLS_KEYCTX_TX_WR_TXCK_SIZE(x) \ ((x) << S_TLS_KEYCTX_TX_WR_TXCK_SIZE) #define G_TLS_KEYCTX_TX_WR_TXCK_SIZE(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_TXCK_SIZE) & \ M_TLS_KEYCTX_TX_WR_TXCK_SIZE) #define S_TLS_KEYCTX_TX_WR_TXMK_SIZE 2 #define M_TLS_KEYCTX_TX_WR_TXMK_SIZE 0xf #define V_TLS_KEYCTX_TX_WR_TXMK_SIZE(x) \ ((x) << S_TLS_KEYCTX_TX_WR_TXMK_SIZE) #define G_TLS_KEYCTX_TX_WR_TXMK_SIZE(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_TXMK_SIZE) & \ M_TLS_KEYCTX_TX_WR_TXMK_SIZE) #define S_TLS_KEYCTX_TX_WR_TXVALID 0 #define M_TLS_KEYCTX_TX_WR_TXVALID 0x1 #define V_TLS_KEYCTX_TX_WR_TXVALID(x) \ ((x) << S_TLS_KEYCTX_TX_WR_TXVALID) #define G_TLS_KEYCTX_TX_WR_TXVALID(x) \ (((x) >> S_TLS_KEYCTX_TX_WR_TXVALID) & M_TLS_KEYCTX_TX_WR_TXVALID) #define F_TLS_KEYCTX_TX_WR_TXVALID V_TLS_KEYCTX_TX_WR_TXVALID(1U) /* Key Context Programming Operation type */ #define KEY_WRITE_RX 0x1 #define KEY_WRITE_TX 0x2 #define KEY_DELETE_RX 0x4 #define KEY_DELETE_TX 0x8 struct tlspcb { struct m_snd_tag com; struct vi_info *vi; /* virtual interface */ struct adapter *sc; struct l2t_entry *l2te; /* L2 table entry used by this connection */ int tid; /* Connection identifier */ int tx_key_addr; bool inline_key; bool using_timestamps; unsigned char enc_mode; struct tls_scmd scmd0; struct tls_scmd scmd0_short; unsigned int tx_key_info_size; uint32_t prev_seq; uint32_t prev_ack; uint32_t prev_tsecr; uint16_t prev_win; uint16_t prev_mss; /* Only used outside of setup and teardown when using inline keys. 
*/ struct tls_keyctx keyctx; /* Fields only used during setup and teardown. */ struct inpcb *inp; /* backpointer to host stack's PCB */ struct sge_txq *txq; struct sge_wrq *ctrlq; struct clip_entry *ce; /* CLIP table entry used by this tid */ unsigned char auth_mode; unsigned char hmac_ctrl; unsigned char mac_first; unsigned char iv_size; unsigned int frag_size; unsigned int cipher_secret_size; int proto_ver; bool open_pending; }; static int ktls_setup_keys(struct tlspcb *tlsp, const struct ktls_session *tls, struct sge_txq *txq); static inline struct tlspcb * mst_to_tls(struct m_snd_tag *t) { return (__containerof(t, struct tlspcb, com)); } /* XXX: There are similar versions of these two in tom/t4_tls.c. */ static int get_new_keyid(struct tlspcb *tlsp) { vmem_addr_t addr; if (vmem_alloc(tlsp->sc->key_map, TLS_KEY_CONTEXT_SZ, M_NOWAIT | M_FIRSTFIT, &addr) != 0) return (-1); return (addr); } static void free_keyid(struct tlspcb *tlsp, int keyid) { CTR3(KTR_CXGBE, "%s: tid %d key addr %#x", __func__, tlsp->tid, keyid); vmem_free(tlsp->sc->key_map, keyid, TLS_KEY_CONTEXT_SZ); } static struct tlspcb * alloc_tlspcb(struct ifnet *ifp, struct vi_info *vi, int flags) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct tlspcb *tlsp; tlsp = malloc(sizeof(*tlsp), M_CXGBE, M_ZERO | flags); if (tlsp == NULL) return (NULL); m_snd_tag_init(&tlsp->com, ifp, IF_SND_TAG_TYPE_TLS); tlsp->vi = vi; tlsp->sc = sc; tlsp->ctrlq = &sc->sge.ctrlq[pi->port_id]; tlsp->tid = -1; tlsp->tx_key_addr = -1; return (tlsp); } static void init_ktls_key_params(struct tlspcb *tlsp, const struct ktls_session *tls) { int mac_key_size; if (tls->params.tls_vminor == TLS_MINOR_VER_ONE) tlsp->proto_ver = SCMD_PROTO_VERSION_TLS_1_1; else tlsp->proto_ver = SCMD_PROTO_VERSION_TLS_1_2; tlsp->cipher_secret_size = tls->params.cipher_key_len; tlsp->tx_key_info_size = sizeof(struct tx_keyctx_hdr) + tlsp->cipher_secret_size; if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16) { tlsp->auth_mode = SCMD_AUTH_MODE_GHASH; tlsp->enc_mode = SCMD_CIPH_MODE_AES_GCM; tlsp->iv_size = 4; tlsp->mac_first = 0; tlsp->hmac_ctrl = SCMD_HMAC_CTRL_NOP; tlsp->tx_key_info_size += GMAC_BLOCK_LEN; } else { switch (tls->params.auth_algorithm) { case CRYPTO_SHA1_HMAC: mac_key_size = roundup2(SHA1_HASH_LEN, 16); tlsp->auth_mode = SCMD_AUTH_MODE_SHA1; break; case CRYPTO_SHA2_256_HMAC: mac_key_size = SHA2_256_HASH_LEN; tlsp->auth_mode = SCMD_AUTH_MODE_SHA256; break; case CRYPTO_SHA2_384_HMAC: mac_key_size = SHA2_512_HASH_LEN; tlsp->auth_mode = SCMD_AUTH_MODE_SHA512_384; break; } tlsp->enc_mode = SCMD_CIPH_MODE_AES_CBC; tlsp->iv_size = 8; /* for CBC, iv is 16B, unit of 2B */ tlsp->mac_first = 1; tlsp->hmac_ctrl = SCMD_HMAC_CTRL_NO_TRUNC; tlsp->tx_key_info_size += mac_key_size * 2; } tlsp->frag_size = tls->params.max_frame_len; } static int ktls_act_open_cpl_size(bool isipv6) { if (isipv6) return (sizeof(struct cpl_t6_act_open_req6)); else return (sizeof(struct cpl_t6_act_open_req)); } static void mk_ktls_act_open_req(struct adapter *sc, struct vi_info *vi, struct inpcb *inp, struct tlspcb *tlsp, int atid, void *dst) { struct tcpcb *tp = intotcpcb(inp); struct cpl_t6_act_open_req *cpl6; struct cpl_act_open_req *cpl; uint64_t options; int qid_atid; cpl6 = dst; cpl = (struct cpl_act_open_req *)cpl6; INIT_TP_WR(cpl6, 0); qid_atid = V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) | V_TID_COOKIE(CPL_COOKIE_KERN_TLS); OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_atid)); inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, 
&cpl->peer_ip, &cpl->peer_port); options = F_TCAM_BYPASS | V_ULP_MODE(ULP_MODE_NONE); options |= V_SMAC_SEL(vi->smt_idx) | V_TX_CHAN(vi->pi->tx_chan); options |= F_NON_OFFLOAD; cpl->opt0 = htobe64(options); options = V_TX_QUEUE(sc->params.tp.tx_modq[vi->pi->tx_chan]); if (tp->t_flags & TF_REQ_TSTMP) options |= F_TSTAMPS_EN; cpl->opt2 = htobe32(options); } static void mk_ktls_act_open_req6(struct adapter *sc, struct vi_info *vi, struct inpcb *inp, struct tlspcb *tlsp, int atid, void *dst) { struct tcpcb *tp = intotcpcb(inp); struct cpl_t6_act_open_req6 *cpl6; struct cpl_act_open_req6 *cpl; uint64_t options; int qid_atid; cpl6 = dst; cpl = (struct cpl_act_open_req6 *)cpl6; INIT_TP_WR(cpl6, 0); qid_atid = V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) | V_TID_COOKIE(CPL_COOKIE_KERN_TLS); OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, qid_atid)); cpl->local_port = inp->inp_lport; cpl->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; cpl->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; cpl->peer_port = inp->inp_fport; cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0]; cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8]; options = F_TCAM_BYPASS | V_ULP_MODE(ULP_MODE_NONE); options |= V_SMAC_SEL(vi->smt_idx) | V_TX_CHAN(vi->pi->tx_chan); options |= F_NON_OFFLOAD; cpl->opt0 = htobe64(options); options = V_TX_QUEUE(sc->params.tp.tx_modq[vi->pi->tx_chan]); if (tp->t_flags & TF_REQ_TSTMP) options |= F_TSTAMPS_EN; cpl->opt2 = htobe32(options); } static int send_ktls_act_open_req(struct adapter *sc, struct vi_info *vi, struct inpcb *inp, struct tlspcb *tlsp, int atid) { struct wrqe *wr; bool isipv6; isipv6 = (inp->inp_vflag & INP_IPV6) != 0; if (isipv6) { - tlsp->ce = t4_hold_lip(sc, &inp->in6p_laddr, NULL); + tlsp->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true); if (tlsp->ce == NULL) return (ENOENT); } wr = alloc_wrqe(ktls_act_open_cpl_size(isipv6), tlsp->ctrlq); if (wr == NULL) { CTR2(KTR_CXGBE, "%s: atid %d failed to alloc WR", __func__, atid); return (ENOMEM); } if (isipv6) mk_ktls_act_open_req6(sc, vi, inp, tlsp, atid, wrtod(wr)); else mk_ktls_act_open_req(sc, vi, inp, tlsp, atid, wrtod(wr)); tlsp->open_pending = true; t4_wrq_tx(sc, wr); return (0); } static int ktls_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); u_int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status))); u_int status = G_AOPEN_STATUS(be32toh(cpl->atid_status)); struct tlspcb *tlsp = lookup_atid(sc, atid); struct inpcb *inp = tlsp->inp; CTR3(KTR_CXGBE, "%s: atid %d status %d", __func__, atid, status); free_atid(sc, atid); if (status == 0) tlsp->tid = GET_TID(cpl); INP_WLOCK(inp); tlsp->open_pending = false; wakeup(tlsp); INP_WUNLOCK(inp); return (0); } /* SET_TCB_FIELD sent as a ULP command looks like this */ #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \ sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core)) _Static_assert((LEN__SET_TCB_FIELD_ULP + sizeof(struct ulptx_idata)) % 16 == 0, "CPL_SET_TCB_FIELD ULP command not 16-byte aligned"); static void write_set_tcb_field_ulp(struct tlspcb *tlsp, void *dst, struct sge_txq *txq, uint16_t word, uint64_t mask, uint64_t val) { struct ulp_txpkt *txpkt; struct ulptx_idata *idata; struct cpl_set_tcb_field_core *cpl; /* ULP_TXPKT */ txpkt = dst; txpkt->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DATAMODIFY(0) | V_ULP_TXPKT_CHANNELID(tlsp->vi->pi->port_id) | 
V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(txq->eq.cntxt_id) | V_ULP_TXPKT_RO(1)); txpkt->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16)); /* ULPTX_IDATA sub-command */ idata = (struct ulptx_idata *)(txpkt + 1); idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); idata->len = htobe32(sizeof(*cpl)); /* CPL_SET_TCB_FIELD */ cpl = (struct cpl_set_tcb_field_core *)(idata + 1); OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tlsp->tid)); cpl->reply_ctrl = htobe16(F_NO_REPLY); cpl->word_cookie = htobe16(V_WORD(word)); cpl->mask = htobe64(mask); cpl->val = htobe64(val); /* ULPTX_NOOP */ idata = (struct ulptx_idata *)(cpl + 1); idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); idata->len = htobe32(0); } static int ktls_set_tcb_fields(struct tlspcb *tlsp, struct tcpcb *tp, struct sge_txq *txq) { struct fw_ulptx_wr *wr; struct mbuf *m; char *dst; void *items[1]; int error, len; len = sizeof(*wr) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16); if (tp->t_flags & TF_REQ_TSTMP) len += roundup2(LEN__SET_TCB_FIELD_ULP, 16); m = alloc_wr_mbuf(len, M_NOWAIT); if (m == NULL) { CTR2(KTR_CXGBE, "%s: tid %d failed to alloc WR mbuf", __func__, tlsp->tid); return (ENOMEM); } m->m_pkthdr.snd_tag = m_snd_tag_ref(&tlsp->com); m->m_pkthdr.csum_flags |= CSUM_SND_TAG; /* FW_ULPTX_WR */ wr = mtod(m, void *); wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR)); wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA | V_FW_WR_LEN16(len / 16)); wr->cookie = 0; dst = (char *)(wr + 1); /* Clear TF_NON_OFFLOAD and set TF_CORE_BYPASS */ write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_T_FLAGS, V_TCB_T_FLAGS(V_TF_CORE_BYPASS(1) | V_TF_NON_OFFLOAD(1)), V_TCB_T_FLAGS(V_TF_CORE_BYPASS(1))); dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16); /* Clear the SND_UNA_RAW, SND_NXT_RAW, and SND_MAX_RAW offsets. */ write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_SND_UNA_RAW, V_TCB_SND_NXT_RAW(M_TCB_SND_NXT_RAW) | V_TCB_SND_UNA_RAW(M_TCB_SND_UNA_RAW), V_TCB_SND_NXT_RAW(0) | V_TCB_SND_UNA_RAW(0)); dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16); write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_SND_MAX_RAW, V_TCB_SND_MAX_RAW(M_TCB_SND_MAX_RAW), V_TCB_SND_MAX_RAW(0)); dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16); if (tp->t_flags & TF_REQ_TSTMP) { write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_TIMESTAMP_OFFSET, V_TCB_TIMESTAMP_OFFSET(M_TCB_TIMESTAMP_OFFSET), V_TCB_TIMESTAMP_OFFSET(tp->ts_offset >> 28)); dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16); } KASSERT(dst - (char *)wr == len, ("%s: length mismatch", __func__)); items[0] = m; error = mp_ring_enqueue(txq->r, items, 1, 1); if (error) m_free(m); return (error); } int cxgbe_tls_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **pt) { const struct ktls_session *tls; struct tlspcb *tlsp; struct adapter *sc; struct vi_info *vi; struct inpcb *inp; struct tcpcb *tp; struct sge_txq *txq; int atid, error, keyid; tls = params->tls.tls; /* Only TLS 1.1 and TLS 1.2 are currently supported. */ if (tls->params.tls_vmajor != TLS_MAJOR_VER_ONE || tls->params.tls_vminor < TLS_MINOR_VER_ONE || tls->params.tls_vminor > TLS_MINOR_VER_TWO) return (EPROTONOSUPPORT); /* Sanity check values in *tls. */ switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: /* XXX: Explicitly ignore any provided IV. 
*/ switch (tls->params.cipher_key_len) { case 128 / 8: case 192 / 8: case 256 / 8: break; default: return (EINVAL); } switch (tls->params.auth_algorithm) { case CRYPTO_SHA1_HMAC: case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: break; default: return (EPROTONOSUPPORT); } break; case CRYPTO_AES_NIST_GCM_16: if (tls->params.iv_len != SALT_SIZE) return (EINVAL); switch (tls->params.cipher_key_len) { case 128 / 8: case 192 / 8: case 256 / 8: break; default: return (EINVAL); } break; default: return (EPROTONOSUPPORT); } vi = ifp->if_softc; sc = vi->adapter; tlsp = alloc_tlspcb(ifp, vi, M_WAITOK); atid = alloc_atid(sc, tlsp); if (atid < 0) { error = ENOMEM; goto failed; } if (sc->tlst.inline_keys) keyid = -1; else keyid = get_new_keyid(tlsp); if (keyid < 0) { CTR2(KTR_CXGBE, "%s: atid %d using immediate key ctx", __func__, atid); tlsp->inline_key = true; } else { tlsp->tx_key_addr = keyid; CTR3(KTR_CXGBE, "%s: atid %d allocated TX key addr %#x", __func__, atid, tlsp->tx_key_addr); } inp = params->tls.inp; INP_RLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_RUNLOCK(inp); error = ECONNRESET; goto failed; } tlsp->inp = inp; tp = inp->inp_ppcb; if (tp->t_flags & TF_REQ_TSTMP) { tlsp->using_timestamps = true; if ((tp->ts_offset & 0xfffffff) != 0) { INP_RUNLOCK(inp); error = EINVAL; goto failed; } } else tlsp->using_timestamps = false; error = send_ktls_act_open_req(sc, vi, inp, tlsp, atid); if (error) { INP_RUNLOCK(inp); goto failed; } /* Wait for reply to active open. */ CTR2(KTR_CXGBE, "%s: atid %d sent CPL_ACT_OPEN_REQ", __func__, atid); while (tlsp->open_pending) { /* * XXX: PCATCH? We would then have to discard the PCB * when the completion CPL arrived. */ error = rw_sleep(tlsp, &inp->inp_lock, 0, "t6tlsop", 0); } atid = -1; if (tlsp->tid < 0) { INP_RUNLOCK(inp); error = ENOMEM; goto failed; } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_RUNLOCK(inp); error = ECONNRESET; goto failed; } txq = &sc->sge.txq[vi->first_txq]; if (inp->inp_flowtype != M_HASHTYPE_NONE) txq += ((inp->inp_flowid % (vi->ntxq - vi->rsrv_noflowq)) + vi->rsrv_noflowq); tlsp->txq = txq; error = ktls_set_tcb_fields(tlsp, tp, txq); INP_RUNLOCK(inp); if (error) goto failed; init_ktls_key_params(tlsp, tls); error = ktls_setup_keys(tlsp, tls, txq); if (error) goto failed; /* The SCMD fields used when encrypting a full TLS record. */ tlsp->scmd0.seqno_numivs = htobe32(V_SCMD_SEQ_NO_CTRL(3) | V_SCMD_PROTO_VERSION(tlsp->proto_ver) | V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) | V_SCMD_CIPH_AUTH_SEQ_CTRL((tlsp->mac_first == 0)) | V_SCMD_CIPH_MODE(tlsp->enc_mode) | V_SCMD_AUTH_MODE(tlsp->auth_mode) | V_SCMD_HMAC_CTRL(tlsp->hmac_ctrl) | V_SCMD_IV_SIZE(tlsp->iv_size) | V_SCMD_NUM_IVS(1)); tlsp->scmd0.ivgen_hdrlen = V_SCMD_IV_GEN_CTRL(0) | V_SCMD_TLS_FRAG_ENABLE(0); if (tlsp->inline_key) tlsp->scmd0.ivgen_hdrlen |= V_SCMD_KEY_CTX_INLINE(1); tlsp->scmd0.ivgen_hdrlen = htobe32(tlsp->scmd0.ivgen_hdrlen); /* * The SCMD fields used when encrypting a partial TLS record * (no trailer and possibly a truncated payload). 
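 * For GCM the short path switches the cipher to plain AES-CTR (set up
 * just below): the hardware cannot emit a GCM tag for a partial
 * record, so a short WR produces only encrypted payload bytes and the
 * trailer is never sent this way.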
*/ tlsp->scmd0_short.seqno_numivs = V_SCMD_SEQ_NO_CTRL(0) | V_SCMD_PROTO_VERSION(SCMD_PROTO_VERSION_GENERIC) | V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) | V_SCMD_CIPH_AUTH_SEQ_CTRL((tlsp->mac_first == 0)) | V_SCMD_AUTH_MODE(SCMD_AUTH_MODE_NOP) | V_SCMD_HMAC_CTRL(SCMD_HMAC_CTRL_NOP) | V_SCMD_IV_SIZE(AES_BLOCK_LEN / 2) | V_SCMD_NUM_IVS(0); if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) tlsp->scmd0_short.seqno_numivs |= V_SCMD_CIPH_MODE(SCMD_CIPH_MODE_AES_CTR); else tlsp->scmd0_short.seqno_numivs |= V_SCMD_CIPH_MODE(tlsp->enc_mode); tlsp->scmd0_short.seqno_numivs = htobe32(tlsp->scmd0_short.seqno_numivs); tlsp->scmd0_short.ivgen_hdrlen = V_SCMD_IV_GEN_CTRL(0) | V_SCMD_TLS_FRAG_ENABLE(0) | V_SCMD_AADIVDROP(1); if (tlsp->inline_key) tlsp->scmd0_short.ivgen_hdrlen |= V_SCMD_KEY_CTX_INLINE(1); TXQ_LOCK(txq); if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) txq->kern_tls_gcm++; else txq->kern_tls_cbc++; TXQ_UNLOCK(txq); *pt = &tlsp->com; return (0); failed: if (atid >= 0) free_atid(sc, atid); m_snd_tag_rele(&tlsp->com); return (error); } static int ktls_setup_keys(struct tlspcb *tlsp, const struct ktls_session *tls, struct sge_txq *txq) { struct auth_hash *axf; int error, keyid, kwrlen, kctxlen, len; struct tls_key_req *kwr; struct tls_keyctx *kctx; void *items[1], *key; struct tx_keyctx_hdr *khdr; unsigned int ck_size, mk_size, partial_digest_len; struct mbuf *m; /* * Store the salt and keys in the key context. For * connections with an inline key, this key context is passed * as immediate data in each work request. For connections * storing the key in DDR, a work request is used to store a * copy of the key context in DDR. */ kctx = &tlsp->keyctx; khdr = &kctx->txhdr; switch (tlsp->cipher_secret_size) { case 128 / 8: ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_128; break; case 192 / 8: ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_192; break; case 256 / 8: ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_256; break; default: panic("bad key size"); } axf = NULL; partial_digest_len = 0; if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) mk_size = CHCR_KEYCTX_MAC_KEY_SIZE_512; else { switch (tlsp->auth_mode) { case SCMD_AUTH_MODE_SHA1: axf = &auth_hash_hmac_sha1; mk_size = CHCR_KEYCTX_MAC_KEY_SIZE_160; partial_digest_len = SHA1_HASH_LEN; break; case SCMD_AUTH_MODE_SHA256: axf = &auth_hash_hmac_sha2_256; mk_size = CHCR_KEYCTX_MAC_KEY_SIZE_256; partial_digest_len = SHA2_256_HASH_LEN; break; case SCMD_AUTH_MODE_SHA512_384: axf = &auth_hash_hmac_sha2_384; mk_size = CHCR_KEYCTX_MAC_KEY_SIZE_512; partial_digest_len = SHA2_512_HASH_LEN; break; default: panic("bad auth mode"); } } khdr->ctxlen = (tlsp->tx_key_info_size >> 4); khdr->dualck_to_txvalid = V_TLS_KEYCTX_TX_WR_SALT_PRESENT(1) | V_TLS_KEYCTX_TX_WR_TXCK_SIZE(ck_size) | V_TLS_KEYCTX_TX_WR_TXMK_SIZE(mk_size) | V_TLS_KEYCTX_TX_WR_TXVALID(1); if (tlsp->enc_mode != SCMD_CIPH_MODE_AES_GCM) khdr->dualck_to_txvalid |= V_TLS_KEYCTX_TX_WR_TXOPAD_PRESENT(1); khdr->dualck_to_txvalid = htobe16(khdr->dualck_to_txvalid); key = kctx->keys.edkey; memcpy(key, tls->params.cipher_key, tls->params.cipher_key_len); if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) { memcpy(khdr->txsalt, tls->params.iv, SALT_SIZE); t4_init_gmac_hash(tls->params.cipher_key, tls->params.cipher_key_len, (char *)key + tls->params.cipher_key_len); } else { t4_init_hmac_digest(axf, partial_digest_len, tls->params.auth_key, tls->params.auth_key_len, (char *)key + tls->params.cipher_key_len); } if (tlsp->inline_key) return (0); keyid = tlsp->tx_key_addr; /* Populate key work request. 
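* The request built below is a FW_ULPTX_WR wrapping a ULP_TX_MEM_WRITE that copies this connection's key context into its assigned slot (tx_key_addr) in adapter memory.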
*/ kwrlen = sizeof(*kwr); kctxlen = roundup2(sizeof(*kctx), 32); len = kwrlen + kctxlen; m = alloc_wr_mbuf(len, M_NOWAIT); if (m == NULL) { CTR2(KTR_CXGBE, "%s: tid %d failed to alloc WR mbuf", __func__, tlsp->tid); return (ENOMEM); } m->m_pkthdr.snd_tag = m_snd_tag_ref(&tlsp->com); m->m_pkthdr.csum_flags |= CSUM_SND_TAG; kwr = mtod(m, void *); memset(kwr, 0, len); kwr->wr_hi = htobe32(V_FW_WR_OP(FW_ULPTX_WR) | F_FW_WR_ATOMIC); kwr->wr_mid = htobe32(V_FW_WR_LEN16(DIV_ROUND_UP(len, 16))); kwr->protocol = tlsp->proto_ver; kwr->mfs = htons(tlsp->frag_size); kwr->reneg_to_write_rx = KEY_WRITE_TX; /* master command */ kwr->cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) | V_T5_ULP_MEMIO_ORDER(1) | V_T5_ULP_MEMIO_IMM(1)); kwr->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(kctxlen >> 5)); kwr->len16 = htobe32((tlsp->tid << 8) | DIV_ROUND_UP(len - sizeof(struct work_request_hdr), 16)); kwr->kaddr = htobe32(V_ULP_MEMIO_ADDR(keyid >> 5)); /* sub command */ kwr->sc_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); kwr->sc_len = htobe32(kctxlen); kctx = (struct tls_keyctx *)(kwr + 1); memcpy(kctx, &tlsp->keyctx, sizeof(*kctx)); /* * Place the key work request in the transmit queue. It * should be sent to the NIC before any TLS packets using this * session. */ items[0] = m; error = mp_ring_enqueue(txq->r, items, 1, 1); if (error) m_free(m); else CTR2(KTR_CXGBE, "%s: tid %d sent key WR", __func__, tlsp->tid); return (error); } static u_int ktls_base_wr_size(struct tlspcb *tlsp) { u_int wr_len; wr_len = sizeof(struct fw_ulptx_wr); // 16 wr_len += sizeof(struct ulp_txpkt); // 8 wr_len += sizeof(struct ulptx_idata); // 8 wr_len += sizeof(struct cpl_tx_sec_pdu);// 32 if (tlsp->inline_key) wr_len += tlsp->tx_key_info_size; else { wr_len += sizeof(struct ulptx_sc_memrd);// 8 wr_len += sizeof(struct ulptx_idata); // 8 } wr_len += sizeof(struct cpl_tx_data); // 16 return (wr_len); } /* How many bytes of TCP payload to send for a given TLS record. */ static u_int ktls_tcp_payload_length(struct tlspcb *tlsp, struct mbuf *m_tls) { struct tls_record_layer *hdr; u_int plen, mlen; M_ASSERTEXTPG(m_tls); hdr = (void *)m_tls->m_epg_hdr; plen = ntohs(hdr->tls_length); /* * What range of the TLS record is the mbuf requesting to be * sent. */ mlen = mtod(m_tls, vm_offset_t) + m_tls->m_len; /* Always send complete records. */ if (mlen == TLS_HEADER_LENGTH + plen) return (mlen); /* * If the host stack has asked to send part of the trailer, * trim the length to avoid sending any of the trailer. There * is no way to send a partial trailer currently. */ if (mlen > TLS_HEADER_LENGTH + plen - m_tls->m_epg_trllen) mlen = TLS_HEADER_LENGTH + plen - m_tls->m_epg_trllen; /* * For AES-CBC adjust the ciphertext length for the block * size. */ if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_CBC && mlen > TLS_HEADER_LENGTH) { mlen = TLS_HEADER_LENGTH + rounddown(mlen - TLS_HEADER_LENGTH, AES_BLOCK_LEN); } #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d short TLS record (%u vs %u)", __func__, tlsp->tid, mlen, TLS_HEADER_LENGTH + plen); #endif return (mlen); } /* * For a "short" TLS record, determine the offset into the TLS record * payload to send. This offset does not include the TLS header, but * a non-zero offset implies that a header will not be sent. 
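* Illustrative example: a request that resumes 100 bytes into the payload of an AES-GCM record is given offset rounddown(100, AES_BLOCK_LEN) = 96, so transmission restarts on a cipher-block boundary.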
*/ static u_int ktls_payload_offset(struct tlspcb *tlsp, struct mbuf *m_tls) { struct tls_record_layer *hdr; u_int offset, plen; #ifdef INVARIANTS u_int mlen; #endif M_ASSERTEXTPG(m_tls); hdr = (void *)m_tls->m_epg_hdr; plen = ntohs(hdr->tls_length); #ifdef INVARIANTS mlen = mtod(m_tls, vm_offset_t) + m_tls->m_len; MPASS(mlen < TLS_HEADER_LENGTH + plen); #endif if (mtod(m_tls, vm_offset_t) <= m_tls->m_epg_hdrlen) return (0); if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) { /* * Always send something. This function is only called * if we aren't sending the tag at all, but if the * request starts in the tag then we are in an odd * state where we would effectively send nothing. Cap * the offset at the last byte of the record payload * to send the last cipher block. */ offset = min(mtod(m_tls, vm_offset_t) - m_tls->m_epg_hdrlen, (plen - TLS_HEADER_LENGTH - m_tls->m_epg_trllen) - 1); return (rounddown(offset, AES_BLOCK_LEN)); } return (0); } static u_int ktls_sgl_size(u_int nsegs) { u_int wr_len; /* First segment is part of ulptx_sgl. */ nsegs--; wr_len = sizeof(struct ulptx_sgl); wr_len += 8 * ((3 * nsegs) / 2 + (nsegs & 1)); return (wr_len); } static int ktls_wr_len(struct tlspcb *tlsp, struct mbuf *m, struct mbuf *m_tls, int *nsegsp) { struct tls_record_layer *hdr; u_int imm_len, offset, plen, wr_len, tlen; M_ASSERTEXTPG(m_tls); /* * Determine the size of the TLS record payload to send * excluding header and trailer. */ tlen = ktls_tcp_payload_length(tlsp, m_tls); if (tlen <= m_tls->m_epg_hdrlen) { /* * For requests that only want to send the TLS header, * send a tunnelled packet as immediate data. */ wr_len = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + roundup2(m->m_len + m_tls->m_len, 16); if (wr_len > SGE_MAX_WR_LEN) { CTR3(KTR_CXGBE, "%s: tid %d TLS header-only packet too long (len %d)", __func__, tlsp->tid, m->m_len + m_tls->m_len); } /* This should always be the last TLS record in a chain. */ MPASS(m_tls->m_next == NULL); /* * XXX: Set a bogus 'nsegs' value to avoid tripping an * assertion in mbuf_nsegs() in t4_sge.c. */ *nsegsp = 1; return (wr_len); } hdr = (void *)m_tls->m_epg_hdr; plen = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - m_tls->m_epg_trllen; if (tlen < plen) { plen = tlen; offset = ktls_payload_offset(tlsp, m_tls); } else offset = 0; /* Calculate the size of the work request. */ wr_len = ktls_base_wr_size(tlsp); /* * Full records and short records with an offset of 0 include * the TLS header as immediate data. Short records include a * raw AES IV as immediate data. */ imm_len = 0; if (offset == 0) imm_len += m_tls->m_epg_hdrlen; if (plen == tlen) imm_len += AES_BLOCK_LEN; wr_len += roundup2(imm_len, 16); /* TLS record payload via DSGL. */ *nsegsp = sglist_count_mbuf_epg(m_tls, m_tls->m_epg_hdrlen + offset, plen - (m_tls->m_epg_hdrlen + offset)); wr_len += ktls_sgl_size(*nsegsp); wr_len = roundup2(wr_len, 16); return (wr_len); } /* * See if we have any TCP options requiring a dedicated options-only * packet. */ static int ktls_has_tcp_options(struct tcphdr *tcp) { u_char *cp; int cnt, opt, optlen; cp = (u_char *)(tcp + 1); cnt = tcp->th_off * 4 - sizeof(struct tcphdr); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_NOP: case TCPOPT_TIMESTAMP: break; default: return (1); } } return (0); } /* * Find the TCP timestamp option. 
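* Returns a pointer to the two 32-bit timestamp values (TSval, TSecr) within the option, or NULL if no well-formed timestamp option is present.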
*/ static void * ktls_find_tcp_timestamps(struct tcphdr *tcp) { u_char *cp; int cnt, opt, optlen; cp = (u_char *)(tcp + 1); cnt = tcp->th_off * 4 - sizeof(struct tcphdr); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } if (opt == TCPOPT_TIMESTAMP && optlen == TCPOLEN_TIMESTAMP) return (cp + 2); } return (NULL); } int t6_ktls_parse_pkt(struct mbuf *m, int *nsegsp, int *len16p) { struct tlspcb *tlsp; struct ether_header *eh; struct ip *ip; struct ip6_hdr *ip6; struct tcphdr *tcp; struct mbuf *m_tls; int nsegs; u_int wr_len, tot_len; /* * Locate headers in initial mbuf. * * XXX: This assumes all of the headers are in the initial mbuf. * Could perhaps use m_advance() like parse_pkt() if that turns * out to not be true. */ M_ASSERTPKTHDR(m); MPASS(m->m_pkthdr.snd_tag != NULL); tlsp = mst_to_tls(m->m_pkthdr.snd_tag); if (m->m_len <= sizeof(*eh) + sizeof(*ip)) { CTR2(KTR_CXGBE, "%s: tid %d header mbuf too short", __func__, tlsp->tid); return (EINVAL); } eh = mtod(m, struct ether_header *); if (ntohs(eh->ether_type) != ETHERTYPE_IP && ntohs(eh->ether_type) != ETHERTYPE_IPV6) { CTR2(KTR_CXGBE, "%s: tid %d mbuf not ETHERTYPE_IP{,V6}", __func__, tlsp->tid); return (EINVAL); } m->m_pkthdr.l2hlen = sizeof(*eh); /* XXX: Reject unsupported IP options? */ if (ntohs(eh->ether_type) == ETHERTYPE_IP) { ip = (struct ip *)(eh + 1); if (ip->ip_p != IPPROTO_TCP) { CTR2(KTR_CXGBE, "%s: tid %d mbuf not IPPROTO_TCP", __func__, tlsp->tid); return (EINVAL); } m->m_pkthdr.l3hlen = ip->ip_hl * 4; } else { ip6 = (struct ip6_hdr *)(eh + 1); if (ip6->ip6_nxt != IPPROTO_TCP) { CTR3(KTR_CXGBE, "%s: tid %d mbuf not IPPROTO_TCP (%u)", __func__, tlsp->tid, ip6->ip6_nxt); return (EINVAL); } m->m_pkthdr.l3hlen = sizeof(struct ip6_hdr); } if (m->m_len < m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp)) { CTR2(KTR_CXGBE, "%s: tid %d header mbuf too short (2)", __func__, tlsp->tid); return (EINVAL); } tcp = (struct tcphdr *)((char *)(eh + 1) + m->m_pkthdr.l3hlen); m->m_pkthdr.l4hlen = tcp->th_off * 4; /* Bail if there is TCP payload before the TLS record. */ if (m->m_len != m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + m->m_pkthdr.l4hlen) { CTR6(KTR_CXGBE, "%s: tid %d header mbuf bad length (%d + %d + %d != %d)", __func__, tlsp->tid, m->m_pkthdr.l2hlen, m->m_pkthdr.l3hlen, m->m_pkthdr.l4hlen, m->m_len); return (EINVAL); } /* Assume all headers are in 'm' for now. */ MPASS(m->m_next != NULL); MPASS(m->m_next->m_flags & M_EXTPG); tot_len = 0; /* * Each of the remaining mbufs in the chain should reference a * TLS record. */ *nsegsp = 0; for (m_tls = m->m_next; m_tls != NULL; m_tls = m_tls->m_next) { MPASS(m_tls->m_flags & M_EXTPG); wr_len = ktls_wr_len(tlsp, m, m_tls, &nsegs); #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d wr_len %d nsegs %d", __func__, tlsp->tid, wr_len, nsegs); #endif if (wr_len > SGE_MAX_WR_LEN || nsegs > TX_SGL_SEGS) return (EFBIG); tot_len += roundup2(wr_len, EQ_ESIZE); /* * Store 'nsegs' for the first TLS record in the * header mbuf's metadata. */ if (*nsegsp == 0) *nsegsp = nsegs; } MPASS(tot_len != 0); /* * See if we have any TCP options or a FIN requiring a * dedicated packet. 
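* Only NOP and timestamp options can be carried in the TLS work requests themselves (see ktls_has_tcp_options() above); any other option forces a dedicated options-only packet, and a FIN is sent as its own packet as well.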
*/ if ((tcp->th_flags & TH_FIN) != 0 || ktls_has_tcp_options(tcp)) { wr_len = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + roundup2(m->m_len, 16); if (wr_len > SGE_MAX_WR_LEN) { CTR3(KTR_CXGBE, "%s: tid %d options-only packet too long (len %d)", __func__, tlsp->tid, m->m_len); return (EINVAL); } tot_len += roundup2(wr_len, EQ_ESIZE); } /* Include room for a TP work request to program an L2T entry. */ tot_len += EQ_ESIZE; /* * Include room for a ULPTX work request including up to 5 * CPL_SET_TCB_FIELD commands before the first TLS work * request. */ wr_len = sizeof(struct fw_ulptx_wr) + 5 * roundup2(LEN__SET_TCB_FIELD_ULP, 16); /* * If timestamps are present, reserve 1 more command for * setting the echoed timestamp. */ if (tlsp->using_timestamps) wr_len += roundup2(LEN__SET_TCB_FIELD_ULP, 16); tot_len += roundup2(wr_len, EQ_ESIZE); *len16p = tot_len / 16; #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d len16 %d nsegs %d", __func__, tlsp->tid, *len16p, *nsegsp); #endif return (0); } /* * If the SGL ends on an address that is not 16 byte aligned, this function will * add a 0 filled flit at the end. */ static void write_gl_to_buf(struct sglist *gl, caddr_t to) { struct sglist_seg *seg; __be64 *flitp; struct ulptx_sgl *usgl; int i, nflits, nsegs; KASSERT(((uintptr_t)to & 0xf) == 0, ("%s: SGL must start at a 16 byte boundary: %p", __func__, to)); nsegs = gl->sg_nseg; MPASS(nsegs > 0); nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2; flitp = (__be64 *)to; seg = &gl->sg_segs[0]; usgl = (void *)flitp; usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); usgl->len0 = htobe32(seg->ss_len); usgl->addr0 = htobe64(seg->ss_paddr); seg++; for (i = 0; i < nsegs - 1; i++, seg++) { usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr); } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); flitp += nflits; if (nflits & 1) { MPASS(((uintptr_t)flitp) & 0xf); *flitp++ = 0; } MPASS((((uintptr_t)flitp) & 0xf) == 0); } static inline void copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) { MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)&eq->desc[eq->sidx])) { bcopy(from, *to, len); (*to) += len; if ((uintptr_t)(*to) == (uintptr_t)&eq->desc[eq->sidx]) (*to) = (caddr_t)eq->desc; } else { int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to); bcopy(from, *to, portion); from += portion; portion = len - portion; /* remaining */ bcopy(from, (void *)eq->desc, portion); (*to) = (caddr_t)eq->desc + portion; } } static int ktls_write_tcp_options(struct sge_txq *txq, void *dst, struct mbuf *m, u_int available, u_int pidx) { struct tx_sdesc *txsd; struct fw_eth_tx_pkt_wr *wr; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; uint64_t ctrl1; int len16, ndesc, pktlen; struct ether_header *eh; struct ip *ip, newip; struct ip6_hdr *ip6, newip6; struct tcphdr *tcp, newtcp; caddr_t out; TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m); wr = dst; pktlen = m->m_len; ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen; len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16); ndesc = tx_len16_to_desc(len16); MPASS(ndesc <= available); /* Firmware work request header */ wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3 = 0; cpl = (void *)(wr + 1); /* CPL header */ cpl->ctrl0 = 
txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); out = (void *)(cpl + 1); /* Copy over Ethernet header. */ eh = mtod(m, struct ether_header *); copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen); /* Fixup length in IP header and copy out. */ if (ntohs(eh->ether_type) == ETHERTYPE_IP) { ip = (void *)((char *)eh + m->m_pkthdr.l2hlen); newip = *ip; newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen); copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip)); if (m->m_pkthdr.l3hlen > sizeof(*ip)) copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out, m->m_pkthdr.l3hlen - sizeof(*ip)); ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) | V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) | V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen); } else { ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen); newip6 = *ip6; newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen); copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6)); MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6)); ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) | V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) | V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen); } cpl->ctrl1 = htobe64(ctrl1); txq->txcsum++; /* Clear PUSH and FIN in the TCP header if present. */ tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen); newtcp = *tcp; newtcp.th_flags &= ~(TH_PUSH | TH_FIN); copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp)); /* Copy rest of packet. */ copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, pktlen - (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp))); txq->imm_wrs++; txq->txpkt_wrs++; txq->kern_tls_options++; txsd = &txq->sdesc[pidx]; txsd->m = NULL; txsd->desc_used = ndesc; return (ndesc); } static int ktls_write_tunnel_packet(struct sge_txq *txq, void *dst, struct mbuf *m, struct mbuf *m_tls, u_int available, tcp_seq tcp_seqno, u_int pidx) { struct tx_sdesc *txsd; struct fw_eth_tx_pkt_wr *wr; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; uint64_t ctrl1; int len16, ndesc, pktlen; struct ether_header *eh; struct ip *ip, newip; struct ip6_hdr *ip6, newip6; struct tcphdr *tcp, newtcp; caddr_t out; TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m); /* Locate the template TLS header. */ M_ASSERTEXTPG(m_tls); /* This should always be the last TLS record in a chain. */ MPASS(m_tls->m_next == NULL); wr = dst; pktlen = m->m_len + m_tls->m_len; ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen; len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16); ndesc = tx_len16_to_desc(len16); MPASS(ndesc <= available); /* Firmware work request header */ wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3 = 0; cpl = (void *)(wr + 1); /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); out = (void *)(cpl + 1); /* Copy over Ethernet header. */ eh = mtod(m, struct ether_header *); copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen); /* Fixup length in IP header and copy out. 
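* The length is patched in a stack copy (newip/newip6) that is written to the descriptor ring, leaving the headers in the original mbuf untouched.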
*/ if (ntohs(eh->ether_type) == ETHERTYPE_IP) { ip = (void *)((char *)eh + m->m_pkthdr.l2hlen); newip = *ip; newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen); copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip)); if (m->m_pkthdr.l3hlen > sizeof(*ip)) copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out, m->m_pkthdr.l3hlen - sizeof(*ip)); ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) | V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) | V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen); } else { ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen); newip6 = *ip6; newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen); copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6)); MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6)); ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) | V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) | V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen); } cpl->ctrl1 = htobe64(ctrl1); txq->txcsum++; /* Set sequence number in TCP header. */ tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen); newtcp = *tcp; newtcp.th_seq = htonl(tcp_seqno + mtod(m_tls, vm_offset_t)); copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp)); /* Copy rest of TCP header. */ copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, m->m_len - (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp))); /* Copy the subset of the TLS header requested. */ copy_to_txd(&txq->eq, (char *)m_tls->m_epg_hdr + mtod(m_tls, vm_offset_t), &out, m_tls->m_len); txq->imm_wrs++; txq->txpkt_wrs++; txq->kern_tls_header++; txsd = &txq->sdesc[pidx]; txsd->m = m; txsd->desc_used = ndesc; return (ndesc); } _Static_assert(sizeof(struct cpl_set_tcb_field) <= EQ_ESIZE, "CPL_SET_TCB_FIELD must be smaller than a single TX descriptor"); _Static_assert(W_TCB_SND_UNA_RAW == W_TCB_SND_NXT_RAW, "SND_NXT_RAW and SND_UNA_RAW are in different words"); static int ktls_write_tls_wr(struct tlspcb *tlsp, struct sge_txq *txq, void *dst, struct mbuf *m, struct tcphdr *tcp, struct mbuf *m_tls, u_int nsegs, u_int available, tcp_seq tcp_seqno, uint32_t *tsopt, u_int pidx, bool set_l2t_idx) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct fw_ulptx_wr *wr; struct ulp_txpkt *txpkt; struct ulptx_sc_memrd *memrd; struct ulptx_idata *idata; struct cpl_tx_sec_pdu *sec_pdu; struct cpl_tx_data *tx_data; struct tls_record_layer *hdr; char *iv, *out; u_int aad_start, aad_stop; u_int auth_start, auth_stop, auth_insert; u_int cipher_start, cipher_stop, iv_offset; u_int imm_len, mss, ndesc, offset, plen, tlen, twr_len, wr_len; u_int fields, tx_max_offset, tx_max; bool first_wr, last_wr, using_scratch; ndesc = 0; MPASS(tlsp->txq == txq); first_wr = (tlsp->prev_seq == 0 && tlsp->prev_ack == 0 && tlsp->prev_win == 0); /* * Use the per-txq scratch pad if near the end of the ring to * simplify handling of wrap-around. This uses a simple but * not quite perfect test of using the scratch buffer if we * can't fit a maximal work request in without wrapping. */ using_scratch = (eq->sidx - pidx < SGE_MAX_WR_LEN / EQ_ESIZE); /* Locate the TLS header. */ M_ASSERTEXTPG(m_tls); hdr = (void *)m_tls->m_epg_hdr; plen = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - m_tls->m_epg_trllen; /* Determine how much of the TLS record to send. */ tlen = ktls_tcp_payload_length(tlsp, m_tls); if (tlen <= m_tls->m_epg_hdrlen) { /* * For requests that only want to send the TLS header, * send a tunnelled packet as immediate data. 
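* (tlen <= m_epg_hdrlen means none of the record payload is being sent, so there is nothing for the crypto engine to do.)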
*/ #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: tid %d header-only TLS record %u", __func__, tlsp->tid, (u_int)m_tls->m_epg_seqno); #endif return (ktls_write_tunnel_packet(txq, dst, m, m_tls, available, tcp_seqno, pidx)); } if (tlen < plen) { plen = tlen; offset = ktls_payload_offset(tlsp, m_tls); #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d short TLS record %u with offset %u", __func__, tlsp->tid, (u_int)m_tls->m_epg_seqno, offset); #endif if (m_tls->m_next == NULL && (tcp->th_flags & TH_FIN) != 0) { txq->kern_tls_fin_short++; #ifdef INVARIANTS panic("%s: FIN on short TLS record", __func__); #endif } } else offset = 0; /* * This is the last work request for a given TLS mbuf chain if * it is the last mbuf in the chain and FIN is not set. If * FIN is set, then ktls_write_tcp_fin() will write out the * last work request. */ last_wr = m_tls->m_next == NULL && (tcp->th_flags & TH_FIN) == 0; /* * The host stack may ask us to not send part of the start of * a TLS record. (For example, the stack might have * previously sent a "short" TLS record and might later send * down an mbuf that requests to send the remainder of the TLS * record.) The crypto engine must process a TLS record from * the beginning if computing a GCM tag or HMAC, so we always * send the TLS record from the beginning as input to the * crypto engine and via CPL_TX_DATA to TP. However, TP will * drop individual packets after they have been chopped up * into MSS-sized chunks if the entire sequence range of those * packets is less than SND_UNA. SND_UNA is computed as * TX_MAX - SND_UNA_RAW. Thus, use the offset stored in * m_data to set TX_MAX to the first byte in the TCP sequence * space the host actually wants us to send and set * SND_UNA_RAW to 0. * * If the host sends us back to back requests that span the * trailer of a single TLS record (first request ends "in" the * trailer and second request starts at the next byte but * still "in" the trailer), the initial bytes of the trailer * that the first request drops will not be retransmitted. If * the host uses the same requests when retransmitting the * connection will hang. To handle this, always transmit the * full trailer for a request that begins "in" the trailer * (the second request in the example above). This should * also help to avoid retransmits for the common case. * * A similar condition exists when using CBC for back to back * requests that span a single AES block. The first request * will be truncated to end at the end of the previous AES * block. To handle this, always begin transmission at the * start of the current AES block. */ tx_max_offset = mtod(m_tls, vm_offset_t); if (tx_max_offset > TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - m_tls->m_epg_trllen) { /* Always send the full trailer. */ tx_max_offset = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - m_tls->m_epg_trllen; } if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_CBC && tx_max_offset > TLS_HEADER_LENGTH) { /* Always send all of the first AES block. */ tx_max_offset = TLS_HEADER_LENGTH + rounddown(tx_max_offset - TLS_HEADER_LENGTH, AES_BLOCK_LEN); } tx_max = tcp_seqno + tx_max_offset; /* * Update TCB fields. Reserve space for the FW_ULPTX_WR header * but don't populate it until we know how many field updates * are required. 
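* The possible updates are L2T_IX, TX_MAX, SND_UNA_RAW, RCV_NXT, and RCV_WND, plus T_RTSEQ_RECENT when timestamps are in use, matching the five (or six) commands t6_ktls_parse_pkt() reserved space for.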
*/ if (using_scratch) wr = (void *)txq->ss; else wr = dst; out = (void *)(wr + 1); fields = 0; if (set_l2t_idx) { KASSERT(nsegs != 0, ("trying to set L2T_IX for subsequent TLS WR")); #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: tid %d set L2T_IX to %d", __func__, tlsp->tid, tlsp->l2te->idx); #endif write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_L2T_IX, V_TCB_L2T_IX(M_TCB_L2T_IX), V_TCB_L2T_IX(tlsp->l2te->idx)); out += roundup2(LEN__SET_TCB_FIELD_ULP, 16); fields++; } if (tsopt != NULL && tlsp->prev_tsecr != ntohl(tsopt[1])) { KASSERT(nsegs != 0, ("trying to set T_RTSEQ_RECENT for subsequent TLS WR")); #ifdef VERBOSE_TRACES CTR2(KTR_CXGBE, "%s: tid %d wrote updated T_RTSEQ_RECENT", __func__, tlsp->tid); #endif write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_T_RTSEQ_RECENT, V_TCB_T_RTSEQ_RECENT(M_TCB_T_RTSEQ_RECENT), V_TCB_T_RTSEQ_RECENT(ntohl(tsopt[1]))); out += roundup2(LEN__SET_TCB_FIELD_ULP, 16); fields++; tlsp->prev_tsecr = ntohl(tsopt[1]); } if (first_wr || tlsp->prev_seq != tx_max) { KASSERT(nsegs != 0, ("trying to set TX_MAX for subsequent TLS WR")); #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d setting TX_MAX to %u (tcp_seqno %u)", __func__, tlsp->tid, tx_max, tcp_seqno); #endif write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_TX_MAX, V_TCB_TX_MAX(M_TCB_TX_MAX), V_TCB_TX_MAX(tx_max)); out += roundup2(LEN__SET_TCB_FIELD_ULP, 16); fields++; } /* * If there is data to drop at the beginning of this TLS * record or if this is a retransmit, * reset SND_UNA_RAW to 0 so that SND_UNA == TX_MAX. */ if (tlsp->prev_seq != tx_max || mtod(m_tls, vm_offset_t) != 0) { KASSERT(nsegs != 0, ("trying to clear SND_UNA_RAW for subsequent TLS WR")); #ifdef VERBOSE_TRACES CTR2(KTR_CXGBE, "%s: tid %d clearing SND_UNA_RAW", __func__, tlsp->tid); #endif write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_SND_UNA_RAW, V_TCB_SND_UNA_RAW(M_TCB_SND_UNA_RAW), V_TCB_SND_UNA_RAW(0)); out += roundup2(LEN__SET_TCB_FIELD_ULP, 16); fields++; } /* * Store the expected sequence number of the next byte after * this record. */ tlsp->prev_seq = tcp_seqno + tlen; if (first_wr || tlsp->prev_ack != ntohl(tcp->th_ack)) { KASSERT(nsegs != 0, ("trying to set RCV_NXT for subsequent TLS WR")); write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_RCV_NXT, V_TCB_RCV_NXT(M_TCB_RCV_NXT), V_TCB_RCV_NXT(ntohl(tcp->th_ack))); out += roundup2(LEN__SET_TCB_FIELD_ULP, 16); fields++; tlsp->prev_ack = ntohl(tcp->th_ack); } if (first_wr || tlsp->prev_win != ntohs(tcp->th_win)) { KASSERT(nsegs != 0, ("trying to set RCV_WND for subsequent TLS WR")); write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_RCV_WND, V_TCB_RCV_WND(M_TCB_RCV_WND), V_TCB_RCV_WND(ntohs(tcp->th_win))); out += roundup2(LEN__SET_TCB_FIELD_ULP, 16); fields++; tlsp->prev_win = ntohs(tcp->th_win); } /* Recalculate 'nsegs' if cached value is not available. */ if (nsegs == 0) nsegs = sglist_count_mbuf_epg(m_tls, m_tls->m_epg_hdrlen + offset, plen - (m_tls->m_epg_hdrlen + offset)); /* Calculate the size of the TLS work request. */ twr_len = ktls_base_wr_size(tlsp); imm_len = 0; if (offset == 0) imm_len += m_tls->m_epg_hdrlen; if (plen == tlen) imm_len += AES_BLOCK_LEN; twr_len += roundup2(imm_len, 16); twr_len += ktls_sgl_size(nsegs); /* * If any field updates were required, determine if they can * be included in the TLS work request. If not, use the * FW_ULPTX_WR work request header at 'wr' as a dedicated work * request for the field updates and start a new work request * for the TLS work request afterward. 
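* The combined form is used only when tlst.combo_wrs is enabled and the result fits in SGE_MAX_WR_LEN; otherwise the field updates are flushed in their own FW_ULPTX_WR first.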
*/ if (fields != 0) { wr_len = fields * roundup2(LEN__SET_TCB_FIELD_ULP, 16); if (twr_len + wr_len <= SGE_MAX_WR_LEN && tlsp->sc->tlst.combo_wrs) { wr_len += twr_len; txpkt = (void *)out; } else { wr_len += sizeof(*wr); wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR)); wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA | V_FW_WR_LEN16(wr_len / 16)); wr->cookie = 0; /* * If we were using scratch space, copy the * field updates work request to the ring. */ if (using_scratch) { out = dst; copy_to_txd(eq, txq->ss, &out, wr_len); } ndesc = howmany(wr_len, EQ_ESIZE); MPASS(ndesc <= available); txq->raw_wrs++; txsd = &txq->sdesc[pidx]; txsd->m = NULL; txsd->desc_used = ndesc; IDXINCR(pidx, ndesc, eq->sidx); dst = &eq->desc[pidx]; /* * Determine if we should use scratch space * for the TLS work request based on the * available space after advancing pidx for * the field updates work request. */ wr_len = twr_len; using_scratch = (eq->sidx - pidx < howmany(wr_len, EQ_ESIZE)); if (using_scratch) wr = (void *)txq->ss; else wr = dst; txpkt = (void *)(wr + 1); } } else { wr_len = twr_len; txpkt = (void *)out; } wr_len = roundup2(wr_len, 16); MPASS(ndesc + howmany(wr_len, EQ_ESIZE) <= available); /* FW_ULPTX_WR */ wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR)); wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA | V_FW_WR_LEN16(wr_len / 16)); wr->cookie = 0; /* ULP_TXPKT */ txpkt->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DATAMODIFY(0) | V_ULP_TXPKT_CHANNELID(tlsp->vi->pi->port_id) | V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(txq->eq.cntxt_id) | V_ULP_TXPKT_RO(1)); txpkt->len = htobe32(howmany(twr_len - sizeof(*wr), 16)); /* ULPTX_IDATA sub-command */ idata = (void *)(txpkt + 1); idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) | V_ULP_TX_SC_MORE(1)); idata->len = sizeof(struct cpl_tx_sec_pdu); /* * The key context, CPL_TX_DATA, and immediate data are part * of this ULPTX_IDATA when using an inline key. When reading * the key from memory, the CPL_TX_DATA and immediate data are * part of a separate ULPTX_IDATA. */ if (tlsp->inline_key) idata->len += tlsp->tx_key_info_size + sizeof(struct cpl_tx_data) + imm_len; idata->len = htobe32(idata->len); /* CPL_TX_SEC_PDU */ sec_pdu = (void *)(idata + 1); /* * For short records, AAD is counted as header data in SCMD0, * the IV is next followed by a cipher region for the payload. */ if (plen == tlen) { aad_start = 0; aad_stop = 0; iv_offset = 1; auth_start = 0; auth_stop = 0; auth_insert = 0; cipher_start = AES_BLOCK_LEN + 1; cipher_stop = 0; sec_pdu->pldlen = htobe32(16 + plen - (m_tls->m_epg_hdrlen + offset)); /* These two flits are actually a CPL_TLS_TX_SCMD_FMT. */ sec_pdu->seqno_numivs = tlsp->scmd0_short.seqno_numivs; sec_pdu->ivgen_hdrlen = htobe32( tlsp->scmd0_short.ivgen_hdrlen | V_SCMD_HDR_LEN(offset == 0 ? m_tls->m_epg_hdrlen : 0)); txq->kern_tls_short++; } else { /* * AAD is TLS header. IV is after AAD. The cipher region * starts after the IV. See comments in ccr_authenc() and * ccr_gmac() in t4_crypto.c regarding cipher and auth * start/stop values. */ aad_start = 1; aad_stop = TLS_HEADER_LENGTH; iv_offset = TLS_HEADER_LENGTH + 1; cipher_start = m_tls->m_epg_hdrlen + 1; if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) { cipher_stop = 0; auth_start = cipher_start; auth_stop = 0; auth_insert = 0; } else { cipher_stop = 0; auth_start = cipher_start; auth_stop = 0; auth_insert = 0; } sec_pdu->pldlen = htobe32(plen); /* These two flits are actually a CPL_TLS_TX_SCMD_FMT. 
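* (The precomputed scmd0 values from cxgbe_tls_tag_alloc() are dropped into the seqno_numivs and ivgen_hdrlen fields of the CPL_TX_SEC_PDU unchanged.)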
*/ sec_pdu->seqno_numivs = tlsp->scmd0.seqno_numivs; sec_pdu->ivgen_hdrlen = tlsp->scmd0.ivgen_hdrlen; if (mtod(m_tls, vm_offset_t) == 0) txq->kern_tls_full++; else txq->kern_tls_partial++; } sec_pdu->op_ivinsrtofst = htobe32( V_CPL_TX_SEC_PDU_OPCODE(CPL_TX_SEC_PDU) | V_CPL_TX_SEC_PDU_CPLLEN(2) | V_CPL_TX_SEC_PDU_PLACEHOLDER(0) | V_CPL_TX_SEC_PDU_IVINSRTOFST(iv_offset)); sec_pdu->aadstart_cipherstop_hi = htobe32( V_CPL_TX_SEC_PDU_AADSTART(aad_start) | V_CPL_TX_SEC_PDU_AADSTOP(aad_stop) | V_CPL_TX_SEC_PDU_CIPHERSTART(cipher_start) | V_CPL_TX_SEC_PDU_CIPHERSTOP_HI(cipher_stop >> 4)); sec_pdu->cipherstop_lo_authinsert = htobe32( V_CPL_TX_SEC_PDU_CIPHERSTOP_LO(cipher_stop & 0xf) | V_CPL_TX_SEC_PDU_AUTHSTART(auth_start) | V_CPL_TX_SEC_PDU_AUTHSTOP(auth_stop) | V_CPL_TX_SEC_PDU_AUTHINSERT(auth_insert)); sec_pdu->scmd1 = htobe64(m_tls->m_epg_seqno); /* Key context */ out = (void *)(sec_pdu + 1); if (tlsp->inline_key) { memcpy(out, &tlsp->keyctx, tlsp->tx_key_info_size); out += tlsp->tx_key_info_size; } else { /* ULPTX_SC_MEMRD to read key context. */ memrd = (void *)out; memrd->cmd_to_len = htobe32(V_ULPTX_CMD(ULP_TX_SC_MEMRD) | V_ULP_TX_SC_MORE(1) | V_ULPTX_LEN16(tlsp->tx_key_info_size >> 4)); memrd->addr = htobe32(tlsp->tx_key_addr >> 5); /* ULPTX_IDATA for CPL_TX_DATA and TLS header. */ idata = (void *)(memrd + 1); idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) | V_ULP_TX_SC_MORE(1)); idata->len = htobe32(sizeof(struct cpl_tx_data) + imm_len); out = (void *)(idata + 1); } /* CPL_TX_DATA */ tx_data = (void *)out; OPCODE_TID(tx_data) = htonl(MK_OPCODE_TID(CPL_TX_DATA, tlsp->tid)); if (m->m_pkthdr.csum_flags & CSUM_TSO) { mss = m->m_pkthdr.tso_segsz; tlsp->prev_mss = mss; } else if (tlsp->prev_mss != 0) mss = tlsp->prev_mss; else mss = tlsp->vi->ifp->if_mtu - (m->m_pkthdr.l3hlen + m->m_pkthdr.l4hlen); if (offset == 0) { tx_data->len = htobe32(V_TX_DATA_MSS(mss) | V_TX_LENGTH(tlen)); tx_data->rsvd = htobe32(tcp_seqno); } else { tx_data->len = htobe32(V_TX_DATA_MSS(mss) | V_TX_LENGTH(tlen - (m_tls->m_epg_hdrlen + offset))); tx_data->rsvd = htobe32(tcp_seqno + m_tls->m_epg_hdrlen + offset); } tx_data->flags = htobe32(F_TX_BYPASS); if (last_wr && tcp->th_flags & TH_PUSH) tx_data->flags |= htobe32(F_TX_PUSH | F_TX_SHOVE); /* Populate the TLS header */ out = (void *)(tx_data + 1); if (offset == 0) { memcpy(out, m_tls->m_epg_hdr, m_tls->m_epg_hdrlen); out += m_tls->m_epg_hdrlen; } /* AES IV for a short record. */ if (plen == tlen) { iv = out; if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) { memcpy(iv, tlsp->keyctx.txhdr.txsalt, SALT_SIZE); memcpy(iv + 4, hdr + 1, 8); *(uint32_t *)(iv + 12) = htobe32(2 + offset / AES_BLOCK_LEN); } else memcpy(iv, hdr + 1, AES_BLOCK_LEN); out += AES_BLOCK_LEN; } if (imm_len % 16 != 0) { /* Zero pad to an 8-byte boundary. */ memset(out, 0, 8 - (imm_len % 8)); out += 8 - (imm_len % 8); /* * Insert a ULP_TX_SC_NOOP if needed so the SGL is * 16-byte aligned. 
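* Illustrative example: a 21-byte immediate section is padded with 3 zero bytes to 24, and since 21 % 16 = 5 <= 8 an 8-byte NOOP follows, giving 32 bytes total, a multiple of 16.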
*/ if (imm_len % 16 <= 8) { idata = (void *)out; idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); idata->len = htobe32(0); out = (void *)(idata + 1); } } /* SGL for record payload */ sglist_reset(txq->gl); if (sglist_append_mbuf_epg(txq->gl, m_tls, m_tls->m_epg_hdrlen + offset, plen - (m_tls->m_epg_hdrlen + offset)) != 0) { #ifdef INVARIANTS panic("%s: failed to append sglist", __func__); #endif } write_gl_to_buf(txq->gl, out); if (using_scratch) { out = dst; copy_to_txd(eq, txq->ss, &out, wr_len); } ndesc += howmany(wr_len, EQ_ESIZE); MPASS(ndesc <= available); txq->kern_tls_records++; txq->kern_tls_octets += tlen - mtod(m_tls, vm_offset_t); if (mtod(m_tls, vm_offset_t) != 0) { if (offset == 0) txq->kern_tls_waste += mtod(m_tls, vm_offset_t); else txq->kern_tls_waste += mtod(m_tls, vm_offset_t) - (m_tls->m_epg_hdrlen + offset); } txsd = &txq->sdesc[pidx]; if (last_wr) txsd->m = m; else txsd->m = NULL; txsd->desc_used = howmany(wr_len, EQ_ESIZE); return (ndesc); } static int ktls_write_tcp_fin(struct sge_txq *txq, void *dst, struct mbuf *m, u_int available, tcp_seq tcp_seqno, u_int pidx) { struct tx_sdesc *txsd; struct fw_eth_tx_pkt_wr *wr; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; uint64_t ctrl1; int len16, ndesc, pktlen; struct ether_header *eh; struct ip *ip, newip; struct ip6_hdr *ip6, newip6; struct tcphdr *tcp, newtcp; caddr_t out; TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m); wr = dst; pktlen = m->m_len; ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen; len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16); ndesc = tx_len16_to_desc(len16); MPASS(ndesc <= available); /* Firmware work request header */ wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3 = 0; cpl = (void *)(wr + 1); /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); out = (void *)(cpl + 1); /* Copy over Ethernet header. */ eh = mtod(m, struct ether_header *); copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen); /* Fixup length in IP header and copy out. */ if (ntohs(eh->ether_type) == ETHERTYPE_IP) { ip = (void *)((char *)eh + m->m_pkthdr.l2hlen); newip = *ip; newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen); copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip)); if (m->m_pkthdr.l3hlen > sizeof(*ip)) copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out, m->m_pkthdr.l3hlen - sizeof(*ip)); ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) | V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) | V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen); } else { ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen); newip6 = *ip6; newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen); copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6)); MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6)); ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) | V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) | V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen); } cpl->ctrl1 = htobe64(ctrl1); txq->txcsum++; /* Set sequence number in TCP header. */ tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen); newtcp = *tcp; newtcp.th_seq = htonl(tcp_seqno); copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp)); /* Copy rest of packet. 
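* (Everything past the fixed TCP header, i.e. any TCP options.)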
*/ copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, m->m_len - (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp))); txq->imm_wrs++; txq->txpkt_wrs++; txq->kern_tls_fin++; txsd = &txq->sdesc[pidx]; txsd->m = m; txsd->desc_used = ndesc; return (ndesc); } int t6_ktls_write_wr(struct sge_txq *txq, void *dst, struct mbuf *m, u_int nsegs, u_int available) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct tlspcb *tlsp; struct tcphdr *tcp; struct mbuf *m_tls; struct ether_header *eh; tcp_seq tcp_seqno; u_int ndesc, pidx, totdesc; uint16_t vlan_tag; bool has_fin, set_l2t_idx; void *tsopt; M_ASSERTPKTHDR(m); MPASS(m->m_pkthdr.snd_tag != NULL); tlsp = mst_to_tls(m->m_pkthdr.snd_tag); totdesc = 0; eh = mtod(m, struct ether_header *); tcp = (struct tcphdr *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen); pidx = eq->pidx; has_fin = (tcp->th_flags & TH_FIN) != 0; /* * If this TLS record has a FIN, then we will send any * requested options as part of the FIN packet. */ if (!has_fin && ktls_has_tcp_options(tcp)) { ndesc = ktls_write_tcp_options(txq, dst, m, available, pidx); totdesc += ndesc; IDXINCR(pidx, ndesc, eq->sidx); dst = &eq->desc[pidx]; #ifdef VERBOSE_TRACES CTR2(KTR_CXGBE, "%s: tid %d wrote TCP options packet", __func__, tlsp->tid); #endif } /* * Allocate a new L2T entry if necessary. This may write out * a work request to the txq. */ if (m->m_flags & M_VLANTAG) vlan_tag = m->m_pkthdr.ether_vtag; else vlan_tag = 0xfff; set_l2t_idx = false; if (tlsp->l2te == NULL || tlsp->l2te->vlan != vlan_tag || memcmp(tlsp->l2te->dmac, eh->ether_dhost, ETHER_ADDR_LEN) != 0) { set_l2t_idx = true; if (tlsp->l2te) t4_l2t_release(tlsp->l2te); tlsp->l2te = t4_l2t_alloc_tls(tlsp->sc, txq, dst, &ndesc, vlan_tag, tlsp->vi->pi->lport, eh->ether_dhost); if (tlsp->l2te == NULL) CXGBE_UNIMPLEMENTED("failed to allocate TLS L2TE"); if (ndesc != 0) { MPASS(ndesc <= available - totdesc); txq->raw_wrs++; txsd = &txq->sdesc[pidx]; txsd->m = NULL; txsd->desc_used = ndesc; totdesc += ndesc; IDXINCR(pidx, ndesc, eq->sidx); dst = &eq->desc[pidx]; } } /* * Iterate over each TLS record constructing a work request * for that record. */ for (m_tls = m->m_next; m_tls != NULL; m_tls = m_tls->m_next) { MPASS(m_tls->m_flags & M_EXTPG); /* * Determine the initial TCP sequence number for this * record. */ tsopt = NULL; if (m_tls == m->m_next) { tcp_seqno = ntohl(tcp->th_seq) - mtod(m_tls, vm_offset_t); if (tlsp->using_timestamps) tsopt = ktls_find_tcp_timestamps(tcp); } else { MPASS(mtod(m_tls, vm_offset_t) == 0); tcp_seqno = tlsp->prev_seq; } ndesc = ktls_write_tls_wr(tlsp, txq, dst, m, tcp, m_tls, nsegs, available - totdesc, tcp_seqno, tsopt, pidx, set_l2t_idx); totdesc += ndesc; IDXINCR(pidx, ndesc, eq->sidx); dst = &eq->desc[pidx]; /* * The value of nsegs from the header mbuf's metadata * is only valid for the first TLS record. */ nsegs = 0; /* Only need to set the L2T index once. */ set_l2t_idx = false; } if (has_fin) { /* * If the TCP header for this chain has FIN set, then * explicitly send a packet that has FIN set. This * will also have PUSH set if requested. This assumes * we sent at least one TLS record work request and * uses the TCP sequence number after that request as * the sequence number for the FIN packet. 
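* (tlsp->prev_seq, which ktls_write_tls_wr() advanced past the end of the last record.)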
*/ ndesc = ktls_write_tcp_fin(txq, dst, m, available, tlsp->prev_seq, pidx); totdesc += ndesc; } MPASS(totdesc <= available); return (totdesc); } void cxgbe_tls_tag_free(struct m_snd_tag *mst) { struct adapter *sc; struct tlspcb *tlsp; tlsp = mst_to_tls(mst); sc = tlsp->sc; CTR2(KTR_CXGBE, "%s: tid %d", __func__, tlsp->tid); if (tlsp->l2te) t4_l2t_release(tlsp->l2te); if (tlsp->tid >= 0) release_tid(sc, tlsp->tid, tlsp->ctrlq); if (tlsp->ce) - t4_release_lip(sc, tlsp->ce); + t4_release_clip_entry(sc, tlsp->ce); if (tlsp->tx_key_addr >= 0) free_keyid(tlsp, tlsp->tx_key_addr); zfree(tlsp, M_CXGBE); } void t6_ktls_modload(void) { t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, ktls_act_open_rpl, CPL_COOKIE_KERN_TLS); } void t6_ktls_modunload(void) { t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, NULL, CPL_COOKIE_KERN_TLS); } #else int cxgbe_tls_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **pt) { return (ENXIO); } int t6_ktls_parse_pkt(struct mbuf *m, int *nsegsp, int *len16p) { return (EINVAL); } int t6_ktls_write_wr(struct sge_txq *txq, void *dst, struct mbuf *m, u_int nsegs, u_int available) { panic("can't happen"); } void cxgbe_tls_tag_free(struct m_snd_tag *mst) { panic("can't happen"); } void t6_ktls_modload(void) { } void t6_ktls_modunload(void) { } #endif diff --git a/sys/dev/cxgbe/t4_clip.c b/sys/dev/cxgbe/t4_clip.c index ad26d212315e..18d78a9e830b 100644 --- a/sys/dev/cxgbe/t4_clip.c +++ b/sys/dev/cxgbe/t4_clip.c @@ -1,408 +1,862 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2012 Chelsio Communications, Inc. + * Copyright (c) 2012-2021 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "t4_clip.h" +/* + * Code to deal with the Compressed Local IPv6 (CLIP) table in the ASIC. + * + * The driver maintains a global CLIP database (clip_db) of IPv6 addresses and a + * per-adapter CLIP table (sc->clip_table) with entries that point to an IPv6 in + * the clip_db. All access is protected by a single global lock (clip_db_lock). 
+ * The correct lock order is clip lock before synchronized op. + * + * By default (hw.cxgbe.clip_db_auto=1) all local IPv6 addresses are added to + * the db. Addresses are also added on-demand when the driver allocates an + * entry for a filter, TOE tid, etc. krn_ref counts the number of times an + * address appears in the system. adp_ref counts the number of adapters that + * have that address in their CLIP table. If both are 0 then the entry is + * evicted from the db. Consumers of the CLIP table entry (filters, TOE tids) + * are tracked in ce->refcount. Driver ioctls let external consumers add/remove + * addresses from the CLIP table. + */ + #if defined(INET6) -static int add_lip(struct adapter *, struct in6_addr *); -static int delete_lip(struct adapter *, struct in6_addr *); -static struct clip_entry *search_lip(struct adapter *, struct in6_addr *); -static void update_clip(struct adapter *, void *); -static void t4_clip_task(void *, int); -static void update_clip_table(struct adapter *); +struct clip_db_entry { + LIST_ENTRY(clip_db_entry) link; /* clip_db hash linkage */ + struct in6_addr lip; + u_int krn_ref; /* # of times this IP6 appears in list of all IP6 */ + u_int adp_ref; /* # of adapters with this IP6 in their CLIP */ + u_int tmp_ref; /* Used only during refresh */ +}; + +struct clip_entry { + LIST_ENTRY(clip_entry) link; /* clip_table hash linkage */ + TAILQ_ENTRY(clip_entry) plink; /* clip_pending linkage */ + struct clip_db_entry *cde; + int16_t clip_idx; /* index in the hw table */ + bool pending; /* in clip_pending list */ + int refcount; +}; -static int in6_ifaddr_gen; static eventhandler_tag ifaddr_evhandler; -static struct timeout_task clip_task; +static struct mtx clip_db_lock; +static LIST_HEAD(, clip_db_entry) *clip_db; +static u_long clip_db_mask; +static int clip_db_gen; +static struct task clip_db_task; + +static int add_lip(struct adapter *, struct in6_addr *, int16_t *); +static int del_lip(struct adapter *, struct in6_addr *); +static void t4_clip_db_task(void *, int); +static void t4_clip_task(void *, int); +static void update_clip_db(void); +static int update_sw_clip_table(struct adapter *); +static int update_hw_clip_table(struct adapter *); +static void update_clip_table(struct adapter *, void *); +static int sysctl_clip_db(SYSCTL_HANDLER_ARGS); +static int sysctl_clip_db_auto(SYSCTL_HANDLER_ARGS); +static struct clip_db_entry *lookup_clip_db_entry(struct in6_addr *, bool); +static struct clip_entry *lookup_clip_entry(struct adapter *, struct in6_addr *, + bool); + +SYSCTL_PROC(_hw_cxgbe, OID_AUTO, clip_db, CTLTYPE_STRING | CTLFLAG_RD | + CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_clip_db, "A", + "CLIP database"); + +int t4_clip_db_auto = 1; +SYSCTL_PROC(_hw_cxgbe, OID_AUTO, clip_db_auto, CTLTYPE_INT | CTLFLAG_RWTUN | + CTLFLAG_MPSAFE, NULL, 0, sysctl_clip_db_auto, "I", + "Add local IPs to CLIP db automatically (0 = no, 1 = yes)"); + +static inline uint32_t +clip_hashfn(struct in6_addr *addr) +{ + return (fnv_32_buf(addr, sizeof(*addr), FNV1_32_INIT) & clip_db_mask); +} + +static inline struct clip_db_entry * +alloc_clip_db_entry(struct in6_addr *in6) +{ + struct clip_db_entry *cde; + + cde = malloc(sizeof(*cde), M_CXGBE, M_NOWAIT | M_ZERO); + if (__predict_true(cde != NULL)) + memcpy(&cde->lip, in6, sizeof(cde->lip)); + + return (cde); +} + +static inline struct clip_entry * +alloc_clip_entry(struct clip_db_entry *cde) +{ + struct clip_entry *ce; + + mtx_assert(&clip_db_lock, MA_OWNED); + + ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT | M_ZERO); + if 
(__predict_true(ce != NULL)) { + ce->cde = cde; + cde->adp_ref++; + ce->clip_idx = -1; + } + + return (ce); +} + +/* + * Look up the IP6 address in the CLIP db. If add is set then an entry for the + * IP6 will be added to the db. + */ +static struct clip_db_entry * +lookup_clip_db_entry(struct in6_addr *in6, bool add) +{ + struct clip_db_entry *cde; + const int bucket = clip_hashfn(in6); + + mtx_assert(&clip_db_lock, MA_OWNED); + + LIST_FOREACH(cde, &clip_db[bucket], link) { + if (IN6_ARE_ADDR_EQUAL(&cde->lip, in6)) + return (cde); + } + + /* Not found. Create a new entry if requested. */ + if (add) { + cde = alloc_clip_db_entry(in6); + if (cde != NULL) + LIST_INSERT_HEAD(&clip_db[bucket], cde, link); + } + + return (cde); +} + +/* + * Look up the IP6 address in the CLIP db. If add is set then an entry for the + * IP6 will be added to the db. + */ +static struct clip_entry * +lookup_clip_entry(struct adapter *sc, struct in6_addr *in6, bool add) +{ + struct clip_db_entry *cde; + struct clip_entry *ce; + const int bucket = clip_hashfn(in6); + + mtx_assert(&clip_db_lock, MA_OWNED); + + cde = lookup_clip_db_entry(in6, add); + if (cde == NULL) + return (NULL); + + LIST_FOREACH(ce, &sc->clip_table[bucket], link) { + if (ce->cde == cde) + return (ce); + } + + /* Not found. Create a new entry if requested. */ + if (add) { + ce = alloc_clip_entry(cde); + if (ce != NULL) { + LIST_INSERT_HEAD(&sc->clip_table[bucket], ce, link); + TAILQ_INSERT_TAIL(&sc->clip_pending, ce, plink); + ce->pending = true; + } + } + + return (ce); +} static int -add_lip(struct adapter *sc, struct in6_addr *lip) +add_lip(struct adapter *sc, struct in6_addr *lip, int16_t *idx) { - struct fw_clip_cmd c; + struct fw_clip_cmd c; + int rc; ASSERT_SYNCHRONIZED_OP(sc); - mtx_assert(&sc->clip_table_lock, MA_OWNED); - memset(&c, 0, sizeof(c)); + memset(&c, 0, sizeof(c)); c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE); - c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c)); - c.ip_hi = *(uint64_t *)&lip->s6_addr[0]; - c.ip_lo = *(uint64_t *)&lip->s6_addr[8]; + c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c)); + c.ip_hi = *(uint64_t *)&lip->s6_addr[0]; + c.ip_lo = *(uint64_t *)&lip->s6_addr[8]; - return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c)); + rc = -t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c); + if (rc == 0 && idx != NULL) + *idx = G_FW_CLIP_CMD_INDEX(ntohl(c.alloc_to_len16)); + return (rc); } static int -delete_lip(struct adapter *sc, struct in6_addr *lip) +del_lip(struct adapter *sc, struct in6_addr *lip) { struct fw_clip_cmd c; ASSERT_SYNCHRONIZED_OP(sc); - mtx_assert(&sc->clip_table_lock, MA_OWNED); memset(&c, 0, sizeof(c)); c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); - c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c)); - c.ip_hi = *(uint64_t *)&lip->s6_addr[0]; - c.ip_lo = *(uint64_t *)&lip->s6_addr[8]; + c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c)); + c.ip_hi = *(uint64_t *)&lip->s6_addr[0]; + c.ip_lo = *(uint64_t *)&lip->s6_addr[8]; return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c)); } +#endif -static struct clip_entry * -search_lip(struct adapter *sc, struct in6_addr *lip) +struct clip_entry * +t4_get_clip_entry(struct adapter *sc, struct in6_addr *in6, bool add) { +#ifdef INET6 struct clip_entry *ce; + bool schedule = false; - mtx_assert(&sc->clip_table_lock, MA_OWNED); - - TAILQ_FOREACH(ce, &sc->clip_table, link) { - if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) - return (ce); + 
mtx_lock(&clip_db_lock); + ce = lookup_clip_entry(sc, in6, add); + if (ce != NULL) { + MPASS(ce->cde->adp_ref > 0); + if (++ce->refcount == 1 && ce->pending && ce->clip_idx != -1) { + /* + * Valid entry that was waiting to be deleted. It is in + * use now so take it off the pending list. + */ + TAILQ_REMOVE(&sc->clip_pending, ce, plink); + ce->pending = false; + } + if (ce->clip_idx == -1 && update_hw_clip_table(sc) != 0) + schedule = true; } + mtx_unlock(&clip_db_lock); + if (schedule) + taskqueue_enqueue_timeout(taskqueue_thread, &sc->clip_task, 0); + return (ce); +#else return (NULL); -} #endif +} -struct clip_entry * -t4_hold_lip(struct adapter *sc, struct in6_addr *lip, struct clip_entry *ce) +void +t4_hold_clip_entry(struct adapter *sc, struct clip_entry *ce) { - #ifdef INET6 - mtx_lock(&sc->clip_table_lock); - if (ce == NULL) - ce = search_lip(sc, lip); - if (ce != NULL) - ce->refcount++; - mtx_unlock(&sc->clip_table_lock); + MPASS(ce != NULL); + MPASS(ce->cde->adp_ref > 0); - return (ce); -#else - return (NULL); + mtx_lock(&clip_db_lock); + MPASS(ce->refcount > 0); /* Caller should already have a reference */ + ce->refcount++; + mtx_unlock(&clip_db_lock); #endif } +#ifdef INET6 +static void +release_clip_entry_locked(struct adapter *sc, struct clip_entry *ce) +{ + struct clip_db_entry *cde; + + mtx_assert(&clip_db_lock, MA_OWNED); + MPASS(ce->refcount > 0); + cde = ce->cde; + MPASS(cde->adp_ref > 0); + if (--ce->refcount == 0 && cde->krn_ref == 0) { + if (ce->clip_idx == -1) { + /* Was never written to the hardware. */ + MPASS(ce->pending); + TAILQ_REMOVE(&sc->clip_pending, ce, plink); + LIST_REMOVE(ce, link); + free(ce, M_CXGBE); + if (--cde->adp_ref == 0) { + LIST_REMOVE(cde, link); + free(cde, M_CXGBE); + } + } else { + /* + * Valid entry is now unused, add to the pending list + * for deletion. Its refcount was 1 on entry so it + * can't already be pending. + */ + MPASS(!ce->pending); + TAILQ_INSERT_HEAD(&sc->clip_pending, ce, plink); + ce->pending = true; + } + } +} +#endif + void -t4_release_lip(struct adapter *sc, struct clip_entry *ce) +t4_release_clip_entry(struct adapter *sc, struct clip_entry *ce) { +#ifdef INET6 + MPASS(ce != NULL); + + mtx_lock(&clip_db_lock); + release_clip_entry_locked(sc, ce); + /* + * This isn't a manual release via the ioctl. No need to update the + * hw right now even if the release resulted in the entry being queued + * for deletion. 
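+ * The entry stays on clip_pending and is flushed to the hardware the next time the table is updated (e.g. by the CLIP task or a subsequent CLIP operation).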
+ */ + mtx_unlock(&clip_db_lock); +#endif +} +int +t4_release_clip_addr(struct adapter *sc, struct in6_addr *in6) +{ + int rc = ENOTSUP; #ifdef INET6 - mtx_lock(&sc->clip_table_lock); - KASSERT(search_lip(sc, &ce->lip) == ce, - ("%s: CLIP entry %p p not in CLIP table.", __func__, ce)); - KASSERT(ce->refcount > 0, - ("%s: CLIP entry %p has refcount 0", __func__, ce)); - --ce->refcount; - mtx_unlock(&sc->clip_table_lock); + struct clip_entry *ce; + bool schedule = false; + + mtx_lock(&clip_db_lock); + ce = lookup_clip_entry(sc, in6, false); + if (ce == NULL) + rc = ENOENT; + else if (ce->refcount == 0) + rc = EIO; + else { + release_clip_entry_locked(sc, ce); + if (update_hw_clip_table(sc) != 0) + schedule = true; + rc = 0; + } + mtx_unlock(&clip_db_lock); + if (schedule) + taskqueue_enqueue_timeout(taskqueue_thread, &sc->clip_task, 0); #endif + return (rc); } #ifdef INET6 void t4_init_clip_table(struct adapter *sc) { - - mtx_init(&sc->clip_table_lock, "CLIP table lock", NULL, MTX_DEF); - TAILQ_INIT(&sc->clip_table); + TAILQ_INIT(&sc->clip_pending); + TIMEOUT_TASK_INIT(taskqueue_thread, &sc->clip_task, 0, t4_clip_task, sc); sc->clip_gen = -1; + sc->clip_table = hashinit(CLIP_HASH_SIZE, M_CXGBE, &sc->clip_mask); + /* Both the hashes must use the same bucket for the same key. */ + if (sc->clip_table != NULL) + MPASS(sc->clip_mask == clip_db_mask); /* * Don't bother forcing an update of the clip table when the * adapter is initialized. Before an interface can be used it * must be assigned an address which will trigger the event * handler to update the table. */ } +/* + * Rebuild the CLIP db from the kernel's lists of local IPv6 addresses and + * bump clip_db_gen if any additions or deletions were made. + */ static void -update_clip(struct adapter *sc, void *arg __unused) +update_clip_db(void) { + VNET_ITERATOR_DECL(vnet_iter); + struct rm_priotracker in6_ifa_tracker; + struct in6_addr *in6, tin6; + struct in6_ifaddr *ia; + struct clip_db_entry *cde, *cde_tmp; + int i, addel; - if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4clip")) - return; + VNET_LIST_RLOCK(); + IN6_IFADDR_RLOCK(&in6_ifa_tracker); + mtx_lock(&clip_db_lock); + VNET_FOREACH(vnet_iter) { + CURVNET_SET_QUIET(vnet_iter); + CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { + if (ia->ia_ifp->if_flags & IFF_LOOPBACK) + continue; + in6 = &ia->ia_addr.sin6_addr; + KASSERT(!IN6_IS_ADDR_MULTICAST(in6), + ("%s: mcast address in in6_ifaddr list", __func__)); + if (IN6_IS_ADDR_LOOPBACK(in6)) + continue; - if (mtx_initialized(&sc->clip_table_lock) && !hw_off_limits(sc)) - update_clip_table(sc); + if (IN6_IS_SCOPE_EMBED(in6)) { + tin6 = *in6; + in6 = &tin6; + in6_clearscope(in6); + } + cde = lookup_clip_db_entry(in6, true); + if (cde == NULL) + continue; + cde->tmp_ref++; + } + CURVNET_RESTORE(); + } + + addel = 0; + for (i = 0; i <= clip_db_mask; i++) { + LIST_FOREACH_SAFE(cde, &clip_db[i], link, cde_tmp) { + if (cde->krn_ref == 0 && cde->tmp_ref > 0) { + addel++; /* IP6 addr added. */ + } else if (cde->krn_ref > 0 && cde->tmp_ref == 0) { + if (cde->adp_ref == 0) { + LIST_REMOVE(cde, link); + free(cde, M_CXGBE); + continue; + } + addel++; /* IP6 addr deleted. */ + } + cde->krn_ref = cde->tmp_ref; + cde->tmp_ref = 0; + } + } + if (addel > 0) + clip_db_gen++; + mtx_unlock(&clip_db_lock); + IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); + VNET_LIST_RUNLOCK(); - } /* * Update the CLIP db and then update the CLIP tables on all the adapters. 
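+ * This is the handler for clip_db_task, which is expected to be scheduled via the ifaddr event handler whenever a local IPv6 address is added or removed.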
+ */
static void
-t4_clip_task(void *arg, int count)
+t4_clip_db_task(void *arg, int count)
{
-
-	t4_iterate(update_clip, NULL);
+	update_clip_db();
+	t4_iterate(update_clip_table, NULL);
}

-static void
-update_clip_table(struct adapter *sc)
+/*
+ * Refresh the sw CLIP table for this adapter from the global CLIP db.  Entries
+ * that need to be added to or deleted from the hardware CLIP table are placed
+ * on a pending list but the hardware is not touched.  The pending list is left
+ * in a reasonable state even if this fails, so it is ok to apply it to the
+ * hardware.
+ */
+static int
+update_sw_clip_table(struct adapter *sc)
{
-	struct rm_priotracker in6_ifa_tracker;
-	struct in6_ifaddr *ia;
-	struct in6_addr *lip, tlip;
-	TAILQ_HEAD(, clip_entry) stale;
+	struct clip_db_entry *cde;
	struct clip_entry *ce, *ce_temp;
-	struct vi_info *vi;
-	int rc, gen, i, j;
-	uintptr_t last_vnet;
-
-	ASSERT_SYNCHRONIZED_OP(sc);
+	int i;
+	bool found;

-	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
-	mtx_lock(&sc->clip_table_lock);
-
-	gen = atomic_load_acq_int(&in6_ifaddr_gen);
-	if (gen == sc->clip_gen)
-		goto done;
-
-	TAILQ_INIT(&stale);
-	TAILQ_CONCAT(&stale, &sc->clip_table, link);
+	mtx_assert(&clip_db_lock, MA_OWNED);

	/*
-	 * last_vnet optimizes the common cases where all if_vnet = NULL (no
-	 * VIMAGE) or all if_vnet = vnet0.
+	 * We are about to rebuild the pending list from scratch.  Deletions are
+	 * placed before additions because that's how we want to submit them to
+	 * the hardware.
	 */
-	last_vnet = (uintptr_t)(-1);
-	for_each_port(sc, i)
-	for_each_vi(sc->port[i], j, vi) {
-		if (IS_DOOMED(vi))
-			continue;
-
-		if (last_vnet == (uintptr_t)vi->ifp->if_vnet)
-			continue;
+	TAILQ_INIT(&sc->clip_pending);

-		/* XXX: races with if_vmove */
-		CURVNET_SET(vi->ifp->if_vnet);
-		CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
-			lip = &ia->ia_addr.sin6_addr;
-
-			KASSERT(!IN6_IS_ADDR_MULTICAST(lip),
-			    ("%s: mcast address in in6_ifaddr list", __func__));
-
-			if (IN6_IS_ADDR_LOOPBACK(lip))
+	/*
+	 * Walk the sw CLIP table first.  We want to reset every entry's pending
+	 * status as we're rebuilding the pending list.
+	 */
+	for (i = 0; i <= clip_db_mask; i++) {
+		LIST_FOREACH_SAFE(ce, &sc->clip_table[i], link, ce_temp) {
+			cde = ce->cde;
+			MPASS(cde->adp_ref > 0);
+			if (ce->refcount != 0 || cde->krn_ref != 0) {
+				/*
+				 * Entry should stay in the CLIP.
+				 */
+
+				if (ce->clip_idx != -1) {
+					ce->pending = false;
+				} else {
+					/* Was never added, carry forward. */
+					MPASS(ce->pending);
+					TAILQ_INSERT_TAIL(&sc->clip_pending, ce,
+					    plink);
+				}
				continue;
-			if (IN6_IS_SCOPE_EMBED(lip)) {
-				/* Remove the embedded scope */
-				tlip = *lip;
-				lip = &tlip;
-				in6_clearscope(lip);
+			}
-			/*
-			 * XXX: how to weed out the link local address for the
-			 * loopback interface? It's fe80::1 usually (always?).
-			 */

			/*
-			 * If it's in the main list then we already know it's
-			 * not stale.
+			 * Entry should be removed from the CLIP.
			 */
-			TAILQ_FOREACH(ce, &sc->clip_table, link) {
-				if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip))
-					goto next;
-			}
-			/*
-			 * If it's in the stale list we should move it to the
-			 * main list.
-			 */
-			TAILQ_FOREACH(ce, &stale, link) {
-				if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) {
-					TAILQ_REMOVE(&stale, ce, link);
-					TAILQ_INSERT_TAIL(&sc->clip_table, ce,
-					    link);
-					goto next;
+			if (ce->clip_idx != -1) {
+				ce->pending = true;
+				TAILQ_INSERT_HEAD(&sc->clip_pending, ce, plink);
+			} else {
+				/* Was never added, free right now.
*/ + MPASS(ce->pending); + LIST_REMOVE(ce, link); + free(ce, M_CXGBE); + if (--cde->adp_ref == 0) { + LIST_REMOVE(cde, link); + free(cde, M_CXGBE); } } + } + } + + for (i = 0; i <= clip_db_mask; i++) { + LIST_FOREACH(cde, &clip_db[i], link) { + if (cde->krn_ref == 0) + continue; - /* A new IP6 address; add it to the CLIP table */ - ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT); - memcpy(&ce->lip, lip, sizeof(ce->lip)); - ce->refcount = 0; - rc = add_lip(sc, lip); - if (rc == 0) - TAILQ_INSERT_TAIL(&sc->clip_table, ce, link); - else { - char ip[INET6_ADDRSTRLEN]; - - inet_ntop(AF_INET6, &ce->lip, &ip[0], - sizeof(ip)); - if (sc->flags & KERN_TLS_ON || - sc->active_ulds != 0) { - log(LOG_ERR, - "%s: could not add %s (%d)\n", - __func__, ip, rc); + found = false; + LIST_FOREACH(ce, &sc->clip_table[i], link) { + if (ce->cde == cde) { + found = true; + break; } - free(ce, M_CXGBE); } -next: - continue; + if (found) + continue; + ce = alloc_clip_entry(cde); + if (ce == NULL) + return (ENOMEM); + LIST_INSERT_HEAD(&sc->clip_table[i], ce, link); + TAILQ_INSERT_TAIL(&sc->clip_pending, ce, plink); + ce->pending = true; } - CURVNET_RESTORE(); - last_vnet = (uintptr_t)vi->ifp->if_vnet; } - /* - * Remove stale addresses (those no longer in V_in6_ifaddrhead) that are - * no longer referenced by the driver. - */ - TAILQ_FOREACH_SAFE(ce, &stale, link, ce_temp) { - if (ce->refcount == 0) { - rc = delete_lip(sc, &ce->lip); - if (rc == 0) { - TAILQ_REMOVE(&stale, ce, link); + sc->clip_gen = clip_db_gen; + return (0); +} + +static int +update_hw_clip_table(struct adapter *sc) +{ + struct clip_db_entry *cde; + struct clip_entry *ce; + int rc; + char ip[INET6_ADDRSTRLEN]; + + mtx_assert(&clip_db_lock, MA_OWNED); + rc = begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4clip"); + if (rc != 0) + return (rc); + if (hw_off_limits(sc)) + goto done; /* with rc = 0, we don't want to reschedule. */ + while (!TAILQ_EMPTY(&sc->clip_pending)) { + ce = TAILQ_FIRST(&sc->clip_pending); + MPASS(ce->pending); + cde = ce->cde; + MPASS(cde->adp_ref > 0); + + if (ce->clip_idx == -1) { + /* + * Entry was queued for addition to the HW CLIP. + */ + + if (ce->refcount == 0 && cde->krn_ref == 0) { + /* No need to add to HW CLIP. */ + TAILQ_REMOVE(&sc->clip_pending, ce, plink); + LIST_REMOVE(ce, link); free(ce, M_CXGBE); + if (--cde->adp_ref == 0) { + LIST_REMOVE(cde, link); + free(cde, M_CXGBE); + } } else { - char ip[INET6_ADDRSTRLEN]; + /* Add to the HW CLIP. */ + rc = add_lip(sc, &cde->lip, &ce->clip_idx); + if (rc == FW_ENOMEM) { + /* CLIP full, no point in retrying. */ + rc = 0; + goto done; + } + if (rc != 0) { + inet_ntop(AF_INET6, &cde->lip, &ip[0], + sizeof(ip)); + CH_ERR(sc, "add_lip(%s) failed: %d\n", + ip, rc); + goto done; + } + MPASS(ce->clip_idx != -1); + TAILQ_REMOVE(&sc->clip_pending, ce, plink); + ce->pending = false; + } + } else { + /* + * Entry was queued for deletion from the HW CLIP. + */ - inet_ntop(AF_INET6, &ce->lip, &ip[0], - sizeof(ip)); - log(LOG_ERR, "%s: could not delete %s (%d)\n", - __func__, ip, rc); + if (ce->refcount == 0 && cde->krn_ref == 0) { + /* + * Delete from the HW CLIP. Delete should never + * fail so we always log an error. But if the + * failure is that the entry wasn't found in the + * CLIP then we carry on as if it was deleted. 
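+				 * (A del_lip() failure of FW_EPROTO is taken
+				 * to mean "not found" and is mapped to 0
+				 * below.)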
+				 */
+				rc = del_lip(sc, &cde->lip);
+				if (rc != 0) {
+					inet_ntop(AF_INET6, &cde->lip, &ip[0],
+					    sizeof(ip));
+					CH_ERR(sc, "del_lip(%s) failed: %d\n",
+					    ip, rc);
+				}
+				if (rc == FW_EPROTO)
+					rc = 0;
+				if (rc != 0)
+					goto done;
+
+				TAILQ_REMOVE(&sc->clip_pending, ce, plink);
+				LIST_REMOVE(ce, link);
+				free(ce, M_CXGBE);
+				if (--cde->adp_ref == 0) {
+					LIST_REMOVE(cde, link);
+					free(cde, M_CXGBE);
+				}
+			} else {
+				/* No need to delete from HW CLIP. */
+				TAILQ_REMOVE(&sc->clip_pending, ce, plink);
+				ce->pending = false;
+			}
		}
	}

-	/* The ones that are still referenced need to stay in the CLIP table */
-	TAILQ_CONCAT(&sc->clip_table, &stale, link);
-
-	sc->clip_gen = gen;
done:
-	mtx_unlock(&sc->clip_table_lock);
-	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
+	end_synchronized_op(sc, LOCK_HELD);
+	return (rc);
+}
+
+static void
+update_clip_table(struct adapter *sc, void *arg __unused)
+{
+	bool reschedule;
+
+	if (sc->clip_table == NULL)
+		return;
+
+	reschedule = false;
+	mtx_lock(&clip_db_lock);
+	if (sc->clip_gen != clip_db_gen && update_sw_clip_table(sc) != 0)
+		reschedule = true;
+	if (!TAILQ_EMPTY(&sc->clip_pending) && update_hw_clip_table(sc) != 0)
+		reschedule = true;
+	mtx_unlock(&clip_db_lock);
+	if (reschedule)
+		taskqueue_enqueue_timeout(taskqueue_thread, &sc->clip_task,
+		    -hz / 4);
+}
+
+/*
+ * Update the CLIP table of the specified adapter.
+ */
+static void
+t4_clip_task(void *sc, int count)
+{
+	update_clip_table(sc, NULL);
}

void
t4_destroy_clip_table(struct adapter *sc)
{
	struct clip_entry *ce, *ce_temp;
-
-	if (mtx_initialized(&sc->clip_table_lock)) {
-		mtx_lock(&sc->clip_table_lock);
-		TAILQ_FOREACH_SAFE(ce, &sc->clip_table, link, ce_temp) {
-			KASSERT(ce->refcount == 0,
-			    ("%s: CLIP entry %p still in use (%d)", __func__,
-			    ce, ce->refcount));
-			TAILQ_REMOVE(&sc->clip_table, ce, link);
+	int i;
+
+	mtx_lock(&clip_db_lock);
+	if (sc->clip_table == NULL)
+		goto done;	/* CLIP was never initialized. */
+	for (i = 0; i <= sc->clip_mask; i++) {
+		LIST_FOREACH_SAFE(ce, &sc->clip_table[i], link, ce_temp) {
+			MPASS(ce->refcount == 0);
+			MPASS(ce->cde->adp_ref > 0);
#if 0
-			delete_lip(sc, &ce->lip);
+			del_lip(sc, &ce->lip);
#endif
+			LIST_REMOVE(ce, link);
+			if (--ce->cde->adp_ref == 0 && ce->cde->krn_ref == 0) {
+				LIST_REMOVE(ce->cde, link);
+				free(ce->cde, M_CXGBE);
+			}
			free(ce, M_CXGBE);
		}
-		mtx_unlock(&sc->clip_table_lock);
-		mtx_destroy(&sc->clip_table_lock);
	}
+	hashdestroy(sc->clip_table, M_CXGBE, sc->clip_mask);
+	sc->clip_table = NULL;
+done:
+	mtx_unlock(&clip_db_lock);
}

static void
t4_ifaddr_event(void *arg __unused, struct ifnet *ifp, struct ifaddr *ifa,
    int event)
{
+	struct in6_addr *in6;
+
+	if (t4_clip_db_auto == 0)
+		return;	/* Automatic updates not allowed.
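+			   Manual holds and releases via the
+			   CHELSIO_T4_HOLD_CLIP_ADDR and
+			   CHELSIO_T4_RELEASE_CLIP_ADDR ioctls still work.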
*/ if (ifa->ifa_addr->sa_family != AF_INET6) return; + if (ifp->if_flags & IFF_LOOPBACK) + return; + in6 = &((struct in6_ifaddr *)ifa)->ia_addr.sin6_addr; + if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_MULTICAST(in6)) + return; - atomic_add_rel_int(&in6_ifaddr_gen, 1); - taskqueue_enqueue_timeout(taskqueue_thread, &clip_task, -hz / 4); + taskqueue_enqueue(taskqueue_thread, &clip_db_task); } int sysctl_clip(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct clip_entry *ce; struct sbuf *sb; - int rc, header = 0; + int i, rc, header = 0; char ip[INET6_ADDRSTRLEN]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); - mtx_lock(&sc->clip_table_lock); - TAILQ_FOREACH(ce, &sc->clip_table, link) { - if (header == 0) { - sbuf_printf(sb, "%-40s %-5s", "IP address", "Users"); - header = 1; + mtx_lock(&clip_db_lock); + for (i = 0; i <= sc->clip_mask; i++) { + LIST_FOREACH(ce, &sc->clip_table[i], link) { + if (header == 0) { + sbuf_printf(sb, "%-4s %-4s %s", "Indx", "Refs", + "IP address"); + header = 1; + } + inet_ntop(AF_INET6, &ce->cde->lip, &ip[0], sizeof(ip)); + if (ce->clip_idx == -1) { + sbuf_printf(sb, "\n%-4s %-4d %s", "-", + ce->refcount, ip); + } else { + sbuf_printf(sb, "\n%-4d %-4d %s", ce->clip_idx, + ce->refcount, ip); + } } - inet_ntop(AF_INET6, &ce->lip, &ip[0], sizeof(ip)); + } + mtx_unlock(&clip_db_lock); + + rc = sbuf_finish(sb); + sbuf_delete(sb); - sbuf_printf(sb, "\n%-40s %5u", ip, ce->refcount); + return (rc); +} + +static int +sysctl_clip_db(SYSCTL_HANDLER_ARGS) +{ + struct clip_db_entry *cde; + struct sbuf *sb; + int i, rc, header = 0; + char ip[INET6_ADDRSTRLEN]; + + rc = sysctl_wire_old_buffer(req, 0); + if (rc != 0) + return (rc); + + sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); + if (sb == NULL) + return (ENOMEM); + + mtx_lock(&clip_db_lock); + for (i = 0; i <= clip_db_mask; i++) { + LIST_FOREACH(cde, &clip_db[i], link) { + MPASS(cde->tmp_ref == 0); + if (header == 0) { + sbuf_printf(sb, "%-4s %-4s %s", "Kref", "Aref", + "IP address"); + header = 1; + } + inet_ntop(AF_INET6, &cde->lip, &ip[0], sizeof(ip)); + sbuf_printf(sb, "\n%-4d %-4d %s", cde->krn_ref, + cde->adp_ref, ip); + } } - mtx_unlock(&sc->clip_table_lock); + mtx_unlock(&clip_db_lock); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } +static int +sysctl_clip_db_auto(SYSCTL_HANDLER_ARGS) +{ + int rc, val; + + val = t4_clip_db_auto; + rc = sysctl_handle_int(oidp, &val, 0, req); + if (rc != 0 || req->newptr == NULL) + return (rc); + + if (val == 0 || val == 1) + t4_clip_db_auto = val; + else { + /* + * Writing a value other than 0 or 1 forces a one-time update of + * the clip_db directly in the sysctl and not in some taskqueue. 
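+		 * e.g. (illustrative; assuming the knob is exposed as
+		 * hw.cxgbe.clip_db_auto):
+		 *	sysctl hw.cxgbe.clip_db_auto=0	(automatic updates off)
+		 *	sysctl hw.cxgbe.clip_db_auto=2	(force a one-time update)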
+		 */
+		t4_clip_db_task(NULL, 0);
+	}
+
+	return (0);
+}
+
void
t4_clip_modload(void)
{
-
-	TIMEOUT_TASK_INIT(taskqueue_thread, &clip_task, 0, t4_clip_task, NULL);
+	mtx_init(&clip_db_lock, "clip_db", NULL, MTX_DEF);
+	clip_db = hashinit(CLIP_HASH_SIZE, M_CXGBE, &clip_db_mask);
+	TASK_INIT(&clip_db_task, 0, t4_clip_db_task, NULL);
	ifaddr_evhandler = EVENTHANDLER_REGISTER(ifaddr_event_ext,
	    t4_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY);
}

void
t4_clip_modunload(void)
{
-
	EVENTHANDLER_DEREGISTER(ifaddr_event_ext, ifaddr_evhandler);
-	taskqueue_cancel_timeout(taskqueue_thread, &clip_task, NULL);
+	taskqueue_drain(taskqueue_thread, &clip_db_task);
+	hashdestroy(clip_db, M_CXGBE, clip_db_mask);
+	mtx_destroy(&clip_db_lock);
}
#endif
diff --git a/sys/dev/cxgbe/t4_clip.h b/sys/dev/cxgbe/t4_clip.h
index 8d9acdb86fa5..9dc3d39f3266 100644
--- a/sys/dev/cxgbe/t4_clip.h
+++ b/sys/dev/cxgbe/t4_clip.h
@@ -1,51 +1,50 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef __T4_CLIP_H
#define __T4_CLIP_H

-struct clip_entry {
-	TAILQ_ENTRY(clip_entry) link;
-	struct in6_addr lip;	/* local IPv6 address */
-	u_int refcount;
-};
+#define CLIP_HASH_SIZE 32
+struct clip_entry;
+struct in6_addr;

void t4_clip_modload(void);
void t4_clip_modunload(void);
void t4_init_clip_table(struct adapter *);
void t4_destroy_clip_table(struct adapter *);
-struct clip_entry *t4_hold_lip(struct adapter *, struct in6_addr *,
-    struct clip_entry *);
-void t4_release_lip(struct adapter *, struct clip_entry *);
+struct clip_entry *t4_get_clip_entry(struct adapter *, struct in6_addr *, bool);
+void t4_hold_clip_entry(struct adapter *, struct clip_entry *);
+void t4_release_clip_entry(struct adapter *, struct clip_entry *);
+int t4_release_clip_addr(struct adapter *, struct in6_addr *);
int sysctl_clip(SYSCTL_HANDLER_ARGS);

#endif /* __T4_CLIP_H */
diff --git a/sys/dev/cxgbe/t4_ioctl.h b/sys/dev/cxgbe/t4_ioctl.h
index ff2c5ef80a14..f3bb7d8b4aa4 100644
--- a/sys/dev/cxgbe/t4_ioctl.h
+++ b/sys/dev/cxgbe/t4_ioctl.h
@@ -1,434 +1,444 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 *
 */

#ifndef __T4_IOCTL_H__
#define __T4_IOCTL_H__

#include <sys/types.h>
#include <net/ethernet.h>
#include <net/bpf.h>

/*
 * Ioctl commands specific to this driver.
 */
enum {
	T4_GETREG = 0x40,	/* read register */
	T4_SETREG,		/* write register */
	T4_REGDUMP,		/* dump of all registers */
	T4_GET_FILTER_MODE,	/* get global filter mode */
	T4_SET_FILTER_MODE,	/* set global filter mode */
	T4_GET_FILTER,		/* get information about a filter */
	T4_SET_FILTER,		/* program a filter */
	T4_DEL_FILTER,		/* delete a filter */
	T4_GET_SGE_CONTEXT,	/* get SGE context for a queue */
	T4_LOAD_FW,		/* flash firmware */
	T4_GET_MEM,		/* read memory */
	T4_GET_I2C,		/* read from i2c addressable device */
	T4_CLEAR_STATS,		/* clear a port's MAC statistics */
	T4_SET_OFLD_POLICY,	/* Set offload policy */
	T4_SET_SCHED_CLASS,	/* set sched class */
	T4_SET_SCHED_QUEUE,	/* set queue class */
	T4_GET_TRACER,		/* get information about a tracer */
	T4_SET_TRACER,		/* program a tracer */
	T4_LOAD_CFG,		/* copy a config file to card's flash */
	T4_LOAD_BOOT,		/* flash boot rom */
	T4_LOAD_BOOTCFG,	/* flash bootcfg */
	T4_CUDBG_DUMP,		/* debug dump of chip state */
	T4_SET_FILTER_MASK,	/* set filter mask (hashfilter mode) */
+	T4_HOLD_CLIP_ADDR,	/* add ref on an IP in the CLIP */
+	T4_RELEASE_CLIP_ADDR,	/* remove ref from an IP in the CLIP */
};

struct t4_reg {
	uint32_t addr;
	uint32_t size;
	uint64_t val;
};

#define T4_REGDUMP_SIZE  (160 * 1024)
#define T5_REGDUMP_SIZE  (332 * 1024)
struct t4_regdump {
	uint32_t version;
	uint32_t len; /* bytes */
	uint32_t *data;
};

struct t4_data {
	uint32_t len;
	uint8_t *data;
};

struct t4_bootrom {
	uint32_t pf_offset;
	uint32_t pfidx_addr;
	uint32_t len;
	uint8_t *data;
};

struct t4_i2c_data {
	uint8_t port_id;
	uint8_t dev_addr;
	uint8_t offset;
	uint8_t len;
	uint8_t data[8];
};

/*
 * A hardware filter is some valid combination of these.
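+ * e.g. (illustrative) an exact IPv4 5-tuple filter would combine
+ * T4_FILTER_IPv4 | T4_FILTER_IP_SADDR | T4_FILTER_IP_DADDR |
+ * T4_FILTER_IP_SPORT | T4_FILTER_IP_DPORT.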
 */
#define T4_FILTER_IPv4		0x1	/* IPv4 packet */
#define T4_FILTER_IPv6		0x2	/* IPv6 packet */
#define T4_FILTER_IP_SADDR	0x4	/* Source IP address or network */
#define T4_FILTER_IP_DADDR	0x8	/* Destination IP address or network */
#define T4_FILTER_IP_SPORT	0x10	/* Source IP port */
#define T4_FILTER_IP_DPORT	0x20	/* Destination IP port */
#define T4_FILTER_FCoE		0x40	/* Fibre Channel over Ethernet packet */
#define T4_FILTER_PORT		0x80	/* Physical ingress port */
#define T4_FILTER_VNIC		0x100	/* See the IC_* bits towards the end */
#define T4_FILTER_VLAN		0x200	/* VLAN ID */
#define T4_FILTER_IP_TOS	0x400	/* IPv4 TOS/IPv6 Traffic Class */
#define T4_FILTER_IP_PROTO	0x800	/* IP protocol */
#define T4_FILTER_ETH_TYPE	0x1000	/* Ethernet Type */
#define T4_FILTER_MAC_IDX	0x2000	/* MPS MAC address match index */
#define T4_FILTER_MPS_HIT_TYPE	0x4000	/* MPS match type */
#define T4_FILTER_IP_FRAGMENT	0x8000	/* IP fragment */

/*
 * T4_FILTER_VNIC's real meaning depends on the ingress config.
 */
#define T4_FILTER_IC_OVLAN	0		/* outer VLAN */
#define T4_FILTER_IC_VNIC	0x80000000	/* VNIC id (PF/VF) */
#define T4_FILTER_IC_ENCAP	0x40000000

/* Filter action */
enum {
	FILTER_PASS = 0,	/* default */
	FILTER_DROP,
	FILTER_SWITCH
};

/* 802.1q manipulation on FILTER_SWITCH */
enum {
	VLAN_NOCHANGE = 0,	/* default */
	VLAN_REMOVE,
	VLAN_INSERT,
	VLAN_REWRITE
};

/* MPS match type */
enum {
	UCAST_EXACT = 0,	/* exact unicast match */
	UCAST_HASH = 1,		/* inexact (hashed) unicast match */
	MCAST_EXACT = 2,	/* exact multicast match */
	MCAST_HASH = 3,		/* inexact (hashed) multicast match */
	PROMISC = 4,		/* no match but port is promiscuous */
	HYPPROMISC = 5,		/* port is hypervisor-promiscuous + not bcast */
	BCAST = 6,		/* broadcast packet */
};

/* Rx steering */
enum {
	DST_MODE_QUEUE,		/* queue is directly specified by filter */
	DST_MODE_RSS_QUEUE,	/* filter specifies RSS entry containing queue */
	DST_MODE_RSS,		/* queue selected by default RSS hash lookup */
	DST_MODE_FILT_RSS	/* queue selected by hashing in filter-specified RSS subtable */
};

enum {
	NAT_MODE_NONE = 0,	/* No NAT performed */
	NAT_MODE_DIP,		/* NAT on Dst IP */
	NAT_MODE_DIP_DP,	/* NAT on Dst IP, Dst Port */
	NAT_MODE_DIP_DP_SIP,	/* NAT on Dst IP, Dst Port and Src IP */
	NAT_MODE_DIP_DP_SP,	/* NAT on Dst IP, Dst Port and Src Port */
	NAT_MODE_SIP_SP,	/* NAT on Src IP and Src Port */
	NAT_MODE_DIP_SIP_SP,	/* NAT on Dst IP, Src IP and Src Port */
	NAT_MODE_ALL		/* NAT on entire 4-tuple */
};

struct t4_filter_tuple {
	/*
	 * These are always available.
	 */
	uint8_t sip[16];	/* source IP address (IPv4 in [3:0]) */
	uint8_t dip[16];	/* destination IP address (IPv4 in [3:0]) */
	uint16_t sport;		/* source port */
	uint16_t dport;		/* destination port */

	/*
	 * A combination of these (up to 36 bits) is available.  TP_VLAN_PRI_MAP
	 * is used to select the global mode and all filters are limited to the
	 * set of fields allowed by the global mode.
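+ * e.g. (illustrative) if the global mode selects VLAN and protocol, a filter
+ * may match on vlan and proto, but the other optional fields must be left
+ * clear in both val and mask.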
*/ uint16_t vnic; /* VNIC id (PF/VF) or outer VLAN tag */ uint16_t vlan; /* VLAN tag */ uint16_t ethtype; /* Ethernet type */ uint8_t tos; /* TOS/Traffic Type */ uint8_t proto; /* protocol type */ uint32_t fcoe:1; /* FCoE packet */ uint32_t iport:3; /* ingress port */ uint32_t matchtype:3; /* MPS match type */ uint32_t frag:1; /* fragmentation extension header */ uint32_t macidx:9; /* exact match MAC index */ uint32_t vlan_vld:1; /* VLAN valid */ uint32_t ovlan_vld:1; /* outer VLAN tag valid, value in "vnic" */ uint32_t pfvf_vld:1; /* VNIC id (PF/VF) valid, value in "vnic" */ }; struct t4_filter_specification { uint32_t hitcnts:1; /* count filter hits in TCB */ uint32_t prio:1; /* filter has priority over active/server */ uint32_t type:1; /* 0 => IPv4, 1 => IPv6 */ uint32_t hash:1; /* 0 => LE TCAM, 1 => Hash */ uint32_t action:2; /* drop, pass, switch */ uint32_t rpttid:1; /* report TID in RSS hash field */ uint32_t dirsteer:1; /* 0 => RSS, 1 => steer to iq */ uint32_t iq:10; /* ingress queue */ uint32_t maskhash:1; /* dirsteer=0: steer to an RSS sub-region */ uint32_t dirsteerhash:1;/* dirsteer=1: 0 => TCB contains RSS hash */ /* 1 => TCB contains IQ ID */ /* * Switch proxy/rewrite fields. An ingress packet which matches a * filter with "switch" set will be looped back out as an egress * packet -- potentially with some Ethernet header rewriting. */ uint32_t eport:2; /* egress port to switch packet out */ uint32_t newdmac:1; /* rewrite destination MAC address */ uint32_t newsmac:1; /* rewrite source MAC address */ uint32_t swapmac:1; /* swap SMAC/DMAC for loopback packet */ uint32_t newvlan:2; /* rewrite VLAN Tag */ uint32_t nat_mode:3; /* NAT operation mode */ uint32_t nat_flag_chk:1;/* check TCP flags before NAT'ing */ uint32_t nat_seq_chk; /* sequence value to use for NAT check*/ uint8_t dmac[ETHER_ADDR_LEN]; /* new destination MAC address */ uint8_t smac[ETHER_ADDR_LEN]; /* new source MAC address */ uint16_t vlan; /* VLAN Tag to insert */ uint8_t nat_dip[16]; /* destination IP to use after NAT'ing */ uint8_t nat_sip[16]; /* source IP to use after NAT'ing */ uint16_t nat_dport; /* destination port to use after NAT'ing */ uint16_t nat_sport; /* source port to use after NAT'ing */ /* * Filter rule value/mask pairs. */ struct t4_filter_tuple val; struct t4_filter_tuple mask; }; struct t4_filter { uint32_t idx; uint16_t l2tidx; uint16_t smtidx; uint64_t hits; struct t4_filter_specification fs; }; /* Tx Scheduling Class parameters */ struct t4_sched_class_params { int8_t level; /* scheduler hierarchy level */ int8_t mode; /* per-class or per-flow */ int8_t rateunit; /* bit or packet rate */ int8_t ratemode; /* %port relative or kbps absolute */ int8_t channel; /* scheduler channel [0..N] */ int8_t cl; /* scheduler class [0..N] */ int32_t minrate; /* minimum rate */ int32_t maxrate; /* maximum rate */ int16_t weight; /* percent weight */ int16_t pktsize; /* average packet size */ }; /* * Support for "sched-class" command to allow a TX Scheduling Class to be * programmed with various parameters. 
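+ * e.g. (illustrative) a 100 Mb/s absolute-rate class would use
+ * level = SCHED_CLASS_LEVEL_CL_RL, mode = SCHED_CLASS_MODE_CLASS,
+ * rateunit = SCHED_CLASS_RATEUNIT_BITS, ratemode = SCHED_CLASS_RATEMODE_ABS,
+ * and maxrate = 100000 (Kb/s).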
*/ struct t4_sched_params { int8_t subcmd; /* sub-command */ int8_t type; /* packet or flow */ union { struct { /* sub-command SCHED_CLASS_CONFIG */ int8_t minmax; /* minmax enable */ } config; struct t4_sched_class_params params; uint8_t reserved[6 + 8 * 8]; } u; }; enum { SCHED_CLASS_SUBCMD_CONFIG, /* config sub-command */ SCHED_CLASS_SUBCMD_PARAMS, /* params sub-command */ }; enum { SCHED_CLASS_TYPE_PACKET, }; enum { SCHED_CLASS_LEVEL_CL_RL, /* class rate limiter */ SCHED_CLASS_LEVEL_CL_WRR, /* class weighted round robin */ SCHED_CLASS_LEVEL_CH_RL, /* channel rate limiter */ }; enum { SCHED_CLASS_MODE_CLASS, /* per-class scheduling */ SCHED_CLASS_MODE_FLOW, /* per-flow scheduling */ }; enum { SCHED_CLASS_RATEUNIT_BITS, /* bit rate scheduling */ SCHED_CLASS_RATEUNIT_PKTS, /* packet rate scheduling */ }; enum { SCHED_CLASS_RATEMODE_REL, /* percent of port bandwidth */ SCHED_CLASS_RATEMODE_ABS, /* Kb/s */ }; /* * Support for "sched_queue" command to allow one or more NIC TX Queues to be * bound to a TX Scheduling Class. */ struct t4_sched_queue { uint8_t port; int8_t queue; /* queue index; -1 => all queues */ int8_t cl; /* class index; -1 => unbind */ }; #define T4_SGE_CONTEXT_SIZE 24 enum { SGE_CONTEXT_EGRESS, SGE_CONTEXT_INGRESS, SGE_CONTEXT_FLM, SGE_CONTEXT_CNM }; struct t4_sge_context { uint32_t mem_id; uint32_t cid; uint32_t data[T4_SGE_CONTEXT_SIZE / 4]; }; struct t4_mem_range { uint32_t addr; uint32_t len; uint32_t *data; }; #define T4_TRACE_LEN 112 struct t4_trace_params { uint32_t data[T4_TRACE_LEN / 4]; uint32_t mask[T4_TRACE_LEN / 4]; uint16_t snap_len; uint16_t min_len; uint8_t skip_ofst; uint8_t skip_len; uint8_t invert; uint8_t port; }; struct t4_tracer { uint8_t idx; uint8_t enabled; uint8_t valid; struct t4_trace_params tp; }; struct t4_cudbg_dump { uint8_t wr_flash; uint8_t bitmap[16]; uint32_t len; uint8_t *data; }; enum { OPEN_TYPE_LISTEN = 'L', OPEN_TYPE_ACTIVE = 'A', OPEN_TYPE_PASSIVE = 'P', OPEN_TYPE_DONTCARE = 'D', }; struct offload_settings { int8_t offload; int8_t rx_coalesce; int8_t cong_algo; int8_t sched_class; int8_t tstamp; int8_t sack; int8_t nagle; int8_t ecn; int8_t ddp; int8_t tls; int16_t txq; int16_t rxq; int16_t mss; }; struct offload_rule { char open_type; struct offload_settings settings; struct bpf_program bpf_prog; /* compiled program/filter */ }; /* * An offload policy consists of a set of rules matched in sequence. The * settings of the first rule that matches are applied to that connection. */ struct t4_offload_policy { uint32_t nrules; struct offload_rule *rule; }; +/* Address/mask entry in the CLIP. FW_CLIP2_CMD is aware of the mask. 
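+ * Userland usage sketch (hypothetical; fd and sin6 assumed, error handling
+ * omitted):
+ *	struct t4_clip_addr ca;
+ *	memset(&ca, 0, sizeof(ca));
+ *	memcpy(ca.addr, &sin6->sin6_addr, sizeof(ca.addr));
+ *	memset(ca.mask, 0xff, sizeof(ca.mask));	/* exact /128 match */
+ *	ioctl(fd, CHELSIO_T4_HOLD_CLIP_ADDR, &ca);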
*/ +struct t4_clip_addr { + uint8_t addr[16]; + uint8_t mask[16]; +}; + #define CHELSIO_T4_GETREG _IOWR('f', T4_GETREG, struct t4_reg) #define CHELSIO_T4_SETREG _IOW('f', T4_SETREG, struct t4_reg) #define CHELSIO_T4_REGDUMP _IOWR('f', T4_REGDUMP, struct t4_regdump) #define CHELSIO_T4_GET_FILTER_MODE _IOWR('f', T4_GET_FILTER_MODE, uint32_t) #define CHELSIO_T4_SET_FILTER_MODE _IOW('f', T4_SET_FILTER_MODE, uint32_t) #define CHELSIO_T4_GET_FILTER _IOWR('f', T4_GET_FILTER, struct t4_filter) #define CHELSIO_T4_SET_FILTER _IOWR('f', T4_SET_FILTER, struct t4_filter) #define CHELSIO_T4_DEL_FILTER _IOW('f', T4_DEL_FILTER, struct t4_filter) #define CHELSIO_T4_GET_SGE_CONTEXT _IOWR('f', T4_GET_SGE_CONTEXT, \ struct t4_sge_context) #define CHELSIO_T4_LOAD_FW _IOW('f', T4_LOAD_FW, struct t4_data) #define CHELSIO_T4_GET_MEM _IOW('f', T4_GET_MEM, struct t4_mem_range) #define CHELSIO_T4_GET_I2C _IOWR('f', T4_GET_I2C, struct t4_i2c_data) #define CHELSIO_T4_CLEAR_STATS _IOW('f', T4_CLEAR_STATS, uint32_t) #define CHELSIO_T4_SCHED_CLASS _IOW('f', T4_SET_SCHED_CLASS, \ struct t4_sched_params) #define CHELSIO_T4_SCHED_QUEUE _IOW('f', T4_SET_SCHED_QUEUE, \ struct t4_sched_queue) #define CHELSIO_T4_GET_TRACER _IOWR('f', T4_GET_TRACER, struct t4_tracer) #define CHELSIO_T4_SET_TRACER _IOW('f', T4_SET_TRACER, struct t4_tracer) #define CHELSIO_T4_LOAD_CFG _IOW('f', T4_LOAD_CFG, struct t4_data) #define CHELSIO_T4_LOAD_BOOT _IOW('f', T4_LOAD_BOOT, struct t4_bootrom) #define CHELSIO_T4_LOAD_BOOTCFG _IOW('f', T4_LOAD_BOOTCFG, struct t4_data) #define CHELSIO_T4_CUDBG_DUMP _IOWR('f', T4_CUDBG_DUMP, struct t4_cudbg_dump) #define CHELSIO_T4_SET_OFLD_POLICY _IOW('f', T4_SET_OFLD_POLICY, struct t4_offload_policy) #define CHELSIO_T4_SET_FILTER_MASK _IOW('f', T4_SET_FILTER_MASK, uint32_t) +#define CHELSIO_T4_HOLD_CLIP_ADDR _IOW('f', T4_HOLD_CLIP_ADDR, struct t4_clip_addr) +#define CHELSIO_T4_RELEASE_CLIP_ADDR _IOW('f', T4_RELEASE_CLIP_ADDR, struct t4_clip_addr) #endif diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 3cfab1ef04e2..51fc6504e5c2 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -1,13063 +1,13100 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RSS #include #endif #include #include #ifdef KERN_TLS #include #endif #if defined(__i386__) || defined(__amd64__) #include #include #include #include #endif #ifdef DDB #include #include #endif #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "cudbg/cudbg.h" #include "t4_clip.h" #include "t4_ioctl.h" #include "t4_l2t.h" #include "t4_mp_ring.h" #include "t4_if.h" #include "t4_smt.h" /* T4 bus driver interface */ static int t4_probe(device_t); static int t4_attach(device_t); static int t4_detach(device_t); static int t4_child_location_str(device_t, device_t, char *, size_t); static int t4_ready(device_t); static int t4_read_port_device(device_t, int, device_t *); static int t4_suspend(device_t); static int t4_resume(device_t); static int t4_reset_prepare(device_t, device_t); static int t4_reset_post(device_t, device_t); static device_method_t t4_methods[] = { DEVMETHOD(device_probe, t4_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD(device_suspend, t4_suspend), DEVMETHOD(device_resume, t4_resume), DEVMETHOD(bus_child_location_str, t4_child_location_str), DEVMETHOD(bus_reset_prepare, t4_reset_prepare), DEVMETHOD(bus_reset_post, t4_reset_post), DEVMETHOD(t4_is_main_ready, t4_ready), DEVMETHOD(t4_read_port_device, t4_read_port_device), DEVMETHOD_END }; static driver_t t4_driver = { "t4nex", t4_methods, sizeof(struct adapter) }; /* T4 port (cxgbe) interface */ static int cxgbe_probe(device_t); static int cxgbe_attach(device_t); static int cxgbe_detach(device_t); device_method_t cxgbe_methods[] = { DEVMETHOD(device_probe, cxgbe_probe), DEVMETHOD(device_attach, cxgbe_attach), DEVMETHOD(device_detach, cxgbe_detach), { 0, 0 } }; static driver_t cxgbe_driver = { "cxgbe", cxgbe_methods, sizeof(struct port_info) }; /* T4 VI (vcxgbe) interface */ static int vcxgbe_probe(device_t); static int vcxgbe_attach(device_t); static int vcxgbe_detach(device_t); static device_method_t vcxgbe_methods[] = { DEVMETHOD(device_probe, vcxgbe_probe), DEVMETHOD(device_attach, vcxgbe_attach), DEVMETHOD(device_detach, vcxgbe_detach), { 0, 0 } }; static driver_t vcxgbe_driver = { "vcxgbe", vcxgbe_methods, sizeof(struct vi_info) }; static d_ioctl_t t4_ioctl; static struct cdevsw t4_cdevsw = { .d_version = D_VERSION, .d_ioctl = t4_ioctl, .d_name = "t4nex", }; /* T5 bus driver interface */ static int t5_probe(device_t); static device_method_t t5_methods[] = { DEVMETHOD(device_probe, t5_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD(device_suspend, 
t4_suspend),
	DEVMETHOD(device_resume,	t4_resume),
	DEVMETHOD(bus_child_location_str, t4_child_location_str),
	DEVMETHOD(bus_reset_prepare,	t4_reset_prepare),
	DEVMETHOD(bus_reset_post,	t4_reset_post),
	DEVMETHOD(t4_is_main_ready,	t4_ready),
	DEVMETHOD(t4_read_port_device,	t4_read_port_device),
	DEVMETHOD_END
};
static driver_t t5_driver = {
	"t5nex",
	t5_methods,
	sizeof(struct adapter)
};

/* T5 port (cxl) interface */
static driver_t cxl_driver = {
	"cxl",
	cxgbe_methods,
	sizeof(struct port_info)
};

/* T5 VI (vcxl) interface */
static driver_t vcxl_driver = {
	"vcxl",
	vcxgbe_methods,
	sizeof(struct vi_info)
};

/* T6 bus driver interface */
static int t6_probe(device_t);
static device_method_t t6_methods[] = {
	DEVMETHOD(device_probe,		t6_probe),
	DEVMETHOD(device_attach,	t4_attach),
	DEVMETHOD(device_detach,	t4_detach),
	DEVMETHOD(device_suspend,	t4_suspend),
	DEVMETHOD(device_resume,	t4_resume),
	DEVMETHOD(bus_child_location_str, t4_child_location_str),
	DEVMETHOD(bus_reset_prepare,	t4_reset_prepare),
	DEVMETHOD(bus_reset_post,	t4_reset_post),
	DEVMETHOD(t4_is_main_ready,	t4_ready),
	DEVMETHOD(t4_read_port_device,	t4_read_port_device),
	DEVMETHOD_END
};
static driver_t t6_driver = {
	"t6nex",
	t6_methods,
	sizeof(struct adapter)
};

/* T6 port (cc) interface */
static driver_t cc_driver = {
	"cc",
	cxgbe_methods,
	sizeof(struct port_info)
};

/* T6 VI (vcc) interface */
static driver_t vcc_driver = {
	"vcc",
	vcxgbe_methods,
	sizeof(struct vi_info)
};

/* ifnet interface */
static void cxgbe_init(void *);
static int cxgbe_ioctl(struct ifnet *, unsigned long, caddr_t);
static int cxgbe_transmit(struct ifnet *, struct mbuf *);
static void cxgbe_qflush(struct ifnet *);
#if defined(KERN_TLS) || defined(RATELIMIT)
static int cxgbe_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *,
    struct m_snd_tag **);
static int cxgbe_snd_tag_modify(struct m_snd_tag *,
    union if_snd_tag_modify_params *);
static int cxgbe_snd_tag_query(struct m_snd_tag *,
    union if_snd_tag_query_params *);
static void cxgbe_snd_tag_free(struct m_snd_tag *);
#endif

MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4/T5 Ethernet driver and services");

/*
 * Correct lock order when you need to acquire multiple locks is t4_list_lock,
 * then ADAPTER_LOCK, then t4_uld_list_lock.
 */
static struct sx t4_list_lock;
SLIST_HEAD(, adapter) t4_list;
#ifdef TCP_OFFLOAD
static struct sx t4_uld_list_lock;
SLIST_HEAD(, uld_info) t4_uld_list;
#endif

/*
 * Tunables.  See tweak_tunables() too.
 *
 * Each tunable is set to a default value here if it's known at compile-time.
 * Otherwise it is set to -n as an indication to tweak_tunables() that it should
 * provide a reasonable default (up to n) when the driver is loaded.
 *
 * Tunables applicable to both T4 and T5 are under hw.cxgbe.  Those specific to
 * T5 are under hw.cxl.
 */
SYSCTL_NODE(_hw, OID_AUTO, cxgbe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "cxgbe(4) parameters");
SYSCTL_NODE(_hw, OID_AUTO, cxl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "cxgbe(4) T5+ parameters");
SYSCTL_NODE(_hw_cxgbe, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "cxgbe(4) TOE parameters");

/*
 * Number of queues for tx and rx, NIC and offload.
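+ * (The defaults below follow the -n convention described above: e.g. t4_ntxq
+ * starting out as -NTXQ asks tweak_tunables() to pick a value up to NTXQ.)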
*/ #define NTXQ 16 int t4_ntxq = -NTXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, ntxq, CTLFLAG_RDTUN, &t4_ntxq, 0, "Number of TX queues per port"); TUNABLE_INT("hw.cxgbe.ntxq10g", &t4_ntxq); /* Old name, undocumented */ #define NRXQ 8 int t4_nrxq = -NRXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nrxq, CTLFLAG_RDTUN, &t4_nrxq, 0, "Number of RX queues per port"); TUNABLE_INT("hw.cxgbe.nrxq10g", &t4_nrxq); /* Old name, undocumented */ #define NTXQ_VI 1 static int t4_ntxq_vi = -NTXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, ntxq_vi, CTLFLAG_RDTUN, &t4_ntxq_vi, 0, "Number of TX queues per VI"); #define NRXQ_VI 1 static int t4_nrxq_vi = -NRXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nrxq_vi, CTLFLAG_RDTUN, &t4_nrxq_vi, 0, "Number of RX queues per VI"); static int t4_rsrv_noflowq = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, rsrv_noflowq, CTLFLAG_RDTUN, &t4_rsrv_noflowq, 0, "Reserve TX queue 0 of each VI for non-flowid packets"); #if defined(TCP_OFFLOAD) || defined(RATELIMIT) #define NOFLDTXQ 8 static int t4_nofldtxq = -NOFLDTXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq, CTLFLAG_RDTUN, &t4_nofldtxq, 0, "Number of offload TX queues per port"); #define NOFLDRXQ 2 static int t4_nofldrxq = -NOFLDRXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq, CTLFLAG_RDTUN, &t4_nofldrxq, 0, "Number of offload RX queues per port"); #define NOFLDTXQ_VI 1 static int t4_nofldtxq_vi = -NOFLDTXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq_vi, CTLFLAG_RDTUN, &t4_nofldtxq_vi, 0, "Number of offload TX queues per VI"); #define NOFLDRXQ_VI 1 static int t4_nofldrxq_vi = -NOFLDRXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq_vi, CTLFLAG_RDTUN, &t4_nofldrxq_vi, 0, "Number of offload RX queues per VI"); #define TMR_IDX_OFLD 1 int t4_tmr_idx_ofld = TMR_IDX_OFLD; SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_timer_idx_ofld, CTLFLAG_RDTUN, &t4_tmr_idx_ofld, 0, "Holdoff timer index for offload queues"); #define PKTC_IDX_OFLD (-1) int t4_pktc_idx_ofld = PKTC_IDX_OFLD; SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_pktc_idx_ofld, CTLFLAG_RDTUN, &t4_pktc_idx_ofld, 0, "holdoff packet counter index for offload queues"); /* 0 means chip/fw default, non-zero number is value in microseconds */ static u_long t4_toe_keepalive_idle = 0; SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, keepalive_idle, CTLFLAG_RDTUN, &t4_toe_keepalive_idle, 0, "TOE keepalive idle timer (us)"); /* 0 means chip/fw default, non-zero number is value in microseconds */ static u_long t4_toe_keepalive_interval = 0; SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, keepalive_interval, CTLFLAG_RDTUN, &t4_toe_keepalive_interval, 0, "TOE keepalive interval timer (us)"); /* 0 means chip/fw default, non-zero number is # of keepalives before abort */ static int t4_toe_keepalive_count = 0; SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, keepalive_count, CTLFLAG_RDTUN, &t4_toe_keepalive_count, 0, "Number of TOE keepalive probes before abort"); /* 0 means chip/fw default, non-zero number is value in microseconds */ static u_long t4_toe_rexmt_min = 0; SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, rexmt_min, CTLFLAG_RDTUN, &t4_toe_rexmt_min, 0, "Minimum TOE retransmit interval (us)"); /* 0 means chip/fw default, non-zero number is value in microseconds */ static u_long t4_toe_rexmt_max = 0; SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, rexmt_max, CTLFLAG_RDTUN, &t4_toe_rexmt_max, 0, "Maximum TOE retransmit interval (us)"); /* 0 means chip/fw default, non-zero number is # of rexmt before abort */ static int t4_toe_rexmt_count = 0; SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, rexmt_count, CTLFLAG_RDTUN, &t4_toe_rexmt_count, 0, "Number of TOE retransmissions before abort"); /* -1 means chip/fw 
default, other values are raw backoff values to use */ static int t4_toe_rexmt_backoff[16] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; SYSCTL_NODE(_hw_cxgbe_toe, OID_AUTO, rexmt_backoff, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "cxgbe(4) TOE retransmit backoff values"); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 0, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[0], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 1, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[1], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 2, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[2], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 3, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[3], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 4, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[4], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 5, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[5], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 6, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[6], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 7, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[7], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 8, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[8], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 9, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[9], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 10, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[10], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 11, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[11], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 12, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[12], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 13, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[13], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 14, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[14], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 15, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[15], 0, ""); static int t4_toe_tls_rx_timeout = 5; SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, tls_rx_timeout, CTLFLAG_RDTUN, &t4_toe_tls_rx_timeout, 0, "Timeout in seconds to downgrade TLS sockets to plain TOE"); #endif #ifdef DEV_NETMAP #define NN_MAIN_VI (1 << 0) /* Native netmap on the main VI */ #define NN_EXTRA_VI (1 << 1) /* Native netmap on the extra VI(s) */ static int t4_native_netmap = NN_EXTRA_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, native_netmap, CTLFLAG_RDTUN, &t4_native_netmap, 0, "Native netmap support. bit 0 = main VI, bit 1 = extra VIs"); #define NNMTXQ 8 static int t4_nnmtxq = -NNMTXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmtxq, CTLFLAG_RDTUN, &t4_nnmtxq, 0, "Number of netmap TX queues"); #define NNMRXQ 8 static int t4_nnmrxq = -NNMRXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmrxq, CTLFLAG_RDTUN, &t4_nnmrxq, 0, "Number of netmap RX queues"); #define NNMTXQ_VI 2 static int t4_nnmtxq_vi = -NNMTXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmtxq_vi, CTLFLAG_RDTUN, &t4_nnmtxq_vi, 0, "Number of netmap TX queues per VI"); #define NNMRXQ_VI 2 static int t4_nnmrxq_vi = -NNMRXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmrxq_vi, CTLFLAG_RDTUN, &t4_nnmrxq_vi, 0, "Number of netmap RX queues per VI"); #endif /* * Holdoff parameters for ports. 
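+ * (tmr_idx selects one of the SGE holdoff timers and pktc_idx a packet-count
+ * threshold; the pktc default of -1 below leaves packet counting disabled.)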
*/ #define TMR_IDX 1 int t4_tmr_idx = TMR_IDX; SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_timer_idx, CTLFLAG_RDTUN, &t4_tmr_idx, 0, "Holdoff timer index"); TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_10G", &t4_tmr_idx); /* Old name */ #define PKTC_IDX (-1) int t4_pktc_idx = PKTC_IDX; SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_pktc_idx, CTLFLAG_RDTUN, &t4_pktc_idx, 0, "Holdoff packet counter index"); TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_10G", &t4_pktc_idx); /* Old name */ /* * Size (# of entries) of each tx and rx queue. */ unsigned int t4_qsize_txq = TX_EQ_QSIZE; SYSCTL_INT(_hw_cxgbe, OID_AUTO, qsize_txq, CTLFLAG_RDTUN, &t4_qsize_txq, 0, "Number of descriptors in each TX queue"); unsigned int t4_qsize_rxq = RX_IQ_QSIZE; SYSCTL_INT(_hw_cxgbe, OID_AUTO, qsize_rxq, CTLFLAG_RDTUN, &t4_qsize_rxq, 0, "Number of descriptors in each RX queue"); /* * Interrupt types allowed (bits 0, 1, 2 = INTx, MSI, MSI-X respectively). */ int t4_intr_types = INTR_MSIX | INTR_MSI | INTR_INTX; SYSCTL_INT(_hw_cxgbe, OID_AUTO, interrupt_types, CTLFLAG_RDTUN, &t4_intr_types, 0, "Interrupt types allowed (bit 0 = INTx, 1 = MSI, 2 = MSI-X)"); /* * Configuration file. All the _CF names here are special. */ #define DEFAULT_CF "default" #define BUILTIN_CF "built-in" #define FLASH_CF "flash" #define UWIRE_CF "uwire" #define FPGA_CF "fpga" static char t4_cfg_file[32] = DEFAULT_CF; SYSCTL_STRING(_hw_cxgbe, OID_AUTO, config_file, CTLFLAG_RDTUN, t4_cfg_file, sizeof(t4_cfg_file), "Firmware configuration file"); /* * PAUSE settings (bit 0, 1, 2 = rx_pause, tx_pause, pause_autoneg respectively). * rx_pause = 1 to heed incoming PAUSE frames, 0 to ignore them. * tx_pause = 1 to emit PAUSE frames when the rx FIFO reaches its high water * mark or when signalled to do so, 0 to never emit PAUSE. * pause_autoneg = 1 means PAUSE will be negotiated if possible and the * negotiated settings will override rx_pause/tx_pause. * Otherwise rx_pause/tx_pause are applied forcibly. */ static int t4_pause_settings = PAUSE_RX | PAUSE_TX | PAUSE_AUTONEG; SYSCTL_INT(_hw_cxgbe, OID_AUTO, pause_settings, CTLFLAG_RDTUN, &t4_pause_settings, 0, "PAUSE settings (bit 0 = rx_pause, 1 = tx_pause, 2 = pause_autoneg)"); /* * Forward Error Correction settings (bit 0, 1 = RS, BASER respectively). * -1 to run with the firmware default. Same as FEC_AUTO (bit 5) * 0 to disable FEC. */ static int t4_fec = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fec, CTLFLAG_RDTUN, &t4_fec, 0, "Forward Error Correction (bit 0 = RS, bit 1 = BASER_RS)"); /* * Link autonegotiation. * -1 to run with the firmware default. * 0 to disable. * 1 to enable. */ static int t4_autoneg = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, autoneg, CTLFLAG_RDTUN, &t4_autoneg, 0, "Link autonegotiation"); /* * Firmware auto-install by driver during attach (0, 1, 2 = prohibited, allowed, * encouraged respectively). '-n' is the same as 'n' except the firmware * version used in the checks is read from the firmware bundled with the driver. */ static int t4_fw_install = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fw_install, CTLFLAG_RDTUN, &t4_fw_install, 0, "Firmware auto-install (0 = prohibited, 1 = allowed, 2 = encouraged)"); /* * ASIC features that will be used. Disable the ones you don't want so that the * chip resources aren't wasted on features that will not be used. */ static int t4_nbmcaps_allowed = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nbmcaps_allowed, CTLFLAG_RDTUN, &t4_nbmcaps_allowed, 0, "Default NBM capabilities"); static int t4_linkcaps_allowed = 0; /* No DCBX, PPP, etc. 
by default */ SYSCTL_INT(_hw_cxgbe, OID_AUTO, linkcaps_allowed, CTLFLAG_RDTUN, &t4_linkcaps_allowed, 0, "Default link capabilities"); static int t4_switchcaps_allowed = FW_CAPS_CONFIG_SWITCH_INGRESS | FW_CAPS_CONFIG_SWITCH_EGRESS; SYSCTL_INT(_hw_cxgbe, OID_AUTO, switchcaps_allowed, CTLFLAG_RDTUN, &t4_switchcaps_allowed, 0, "Default switch capabilities"); #ifdef RATELIMIT static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC | FW_CAPS_CONFIG_NIC_HASHFILTER | FW_CAPS_CONFIG_NIC_ETHOFLD; #else static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC | FW_CAPS_CONFIG_NIC_HASHFILTER; #endif SYSCTL_INT(_hw_cxgbe, OID_AUTO, niccaps_allowed, CTLFLAG_RDTUN, &t4_niccaps_allowed, 0, "Default NIC capabilities"); static int t4_toecaps_allowed = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, toecaps_allowed, CTLFLAG_RDTUN, &t4_toecaps_allowed, 0, "Default TCP offload capabilities"); static int t4_rdmacaps_allowed = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, rdmacaps_allowed, CTLFLAG_RDTUN, &t4_rdmacaps_allowed, 0, "Default RDMA capabilities"); static int t4_cryptocaps_allowed = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, cryptocaps_allowed, CTLFLAG_RDTUN, &t4_cryptocaps_allowed, 0, "Default crypto capabilities"); static int t4_iscsicaps_allowed = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, iscsicaps_allowed, CTLFLAG_RDTUN, &t4_iscsicaps_allowed, 0, "Default iSCSI capabilities"); static int t4_fcoecaps_allowed = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fcoecaps_allowed, CTLFLAG_RDTUN, &t4_fcoecaps_allowed, 0, "Default FCoE capabilities"); static int t5_write_combine = 0; SYSCTL_INT(_hw_cxl, OID_AUTO, write_combine, CTLFLAG_RDTUN, &t5_write_combine, 0, "Use WC instead of UC for BAR2"); static int t4_num_vis = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, num_vis, CTLFLAG_RDTUN, &t4_num_vis, 0, "Number of VIs per port"); /* * PCIe Relaxed Ordering. * -1: driver should figure out a good value. * 0: disable RO. * 1: enable RO. * 2: leave RO alone. */ static int pcie_relaxed_ordering = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, pcie_relaxed_ordering, CTLFLAG_RDTUN, &pcie_relaxed_ordering, 0, "PCIe Relaxed Ordering: 0 = disable, 1 = enable, 2 = leave alone"); static int t4_panic_on_fatal_err = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, panic_on_fatal_err, CTLFLAG_RWTUN, &t4_panic_on_fatal_err, 0, "panic on fatal errors"); static int t4_reset_on_fatal_err = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, reset_on_fatal_err, CTLFLAG_RWTUN, &t4_reset_on_fatal_err, 0, "reset adapter on fatal errors"); static int t4_tx_vm_wr = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_vm_wr, CTLFLAG_RWTUN, &t4_tx_vm_wr, 0, "Use VM work requests to transmit packets."); /* * Set to non-zero to enable the attack filter. A packet that matches any of * these conditions will get dropped on ingress: * 1) IP && source address == destination address. * 2) TCP/IP && source address is not a unicast address. * 3) TCP/IP && destination address is not a unicast address. * 4) IP && source address is loopback (127.x.y.z). * 5) IP && destination address is loopback (127.x.y.z). * 6) IPv6 && source address == destination address. * 7) IPv6 && source address is not a unicast address. * 8) IPv6 && source address is loopback (::1/128). * 9) IPv6 && destination address is loopback (::1/128). * 10) IPv6 && source address is unspecified (::/128). * 11) IPv6 && destination address is unspecified (::/128). * 12) TCP/IPv6 && source address is multicast (ff00::/8). * 13) TCP/IPv6 && destination address is multicast (ff00::/8). 
*/ static int t4_attack_filter = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, attack_filter, CTLFLAG_RDTUN, &t4_attack_filter, 0, "Drop suspicious traffic"); static int t4_drop_ip_fragments = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, drop_ip_fragments, CTLFLAG_RDTUN, &t4_drop_ip_fragments, 0, "Drop IP fragments"); static int t4_drop_pkts_with_l2_errors = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, drop_pkts_with_l2_errors, CTLFLAG_RDTUN, &t4_drop_pkts_with_l2_errors, 0, "Drop all frames with Layer 2 length or checksum errors"); static int t4_drop_pkts_with_l3_errors = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, drop_pkts_with_l3_errors, CTLFLAG_RDTUN, &t4_drop_pkts_with_l3_errors, 0, "Drop all frames with IP version, length, or checksum errors"); static int t4_drop_pkts_with_l4_errors = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, drop_pkts_with_l4_errors, CTLFLAG_RDTUN, &t4_drop_pkts_with_l4_errors, 0, "Drop all frames with Layer 4 length, checksum, or other errors"); #ifdef TCP_OFFLOAD /* * TOE tunables. */ static int t4_cop_managed_offloading = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, cop_managed_offloading, CTLFLAG_RDTUN, &t4_cop_managed_offloading, 0, "COP (Connection Offload Policy) controls all TOE offload"); #endif #ifdef KERN_TLS /* * This enables KERN_TLS for all adapters if set. */ static int t4_kern_tls = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, kern_tls, CTLFLAG_RDTUN, &t4_kern_tls, 0, "Enable KERN_TLS mode for all supported adapters"); SYSCTL_NODE(_hw_cxgbe, OID_AUTO, tls, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "cxgbe(4) KERN_TLS parameters"); static int t4_tls_inline_keys = 0; SYSCTL_INT(_hw_cxgbe_tls, OID_AUTO, inline_keys, CTLFLAG_RDTUN, &t4_tls_inline_keys, 0, "Always pass TLS keys in work requests (1) or attempt to store TLS keys " "in card memory."); static int t4_tls_combo_wrs = 0; SYSCTL_INT(_hw_cxgbe_tls, OID_AUTO, combo_wrs, CTLFLAG_RDTUN, &t4_tls_combo_wrs, 0, "Attempt to combine TCB field updates with TLS record work requests."); #endif /* Functions used by VIs to obtain unique MAC addresses for each VI. */ static int vi_mac_funcs[] = { FW_VI_FUNC_ETH, FW_VI_FUNC_OFLD, FW_VI_FUNC_IWARP, FW_VI_FUNC_OPENISCSI, FW_VI_FUNC_OPENFCOE, FW_VI_FUNC_FOISCSI, FW_VI_FUNC_FOFCOE, }; struct intrs_and_queues { uint16_t intr_type; /* INTx, MSI, or MSI-X */ uint16_t num_vis; /* number of VIs for each port */ uint16_t nirq; /* Total # of vectors */ uint16_t ntxq; /* # of NIC txq's for each port */ uint16_t nrxq; /* # of NIC rxq's for each port */ uint16_t nofldtxq; /* # of TOE/ETHOFLD txq's for each port */ uint16_t nofldrxq; /* # of TOE rxq's for each port */ uint16_t nnmtxq; /* # of netmap txq's */ uint16_t nnmrxq; /* # of netmap rxq's */ /* The vcxgbe/vcxl interfaces use these and not the ones above. 
*/ uint16_t ntxq_vi; /* # of NIC txq's */ uint16_t nrxq_vi; /* # of NIC rxq's */ uint16_t nofldtxq_vi; /* # of TOE txq's */ uint16_t nofldrxq_vi; /* # of TOE rxq's */ uint16_t nnmtxq_vi; /* # of netmap txq's */ uint16_t nnmrxq_vi; /* # of netmap rxq's */ }; static void setup_memwin(struct adapter *); static void position_memwin(struct adapter *, int, uint32_t); static int validate_mem_range(struct adapter *, uint32_t, uint32_t); static int fwmtype_to_hwmtype(int); static int validate_mt_off_len(struct adapter *, int, uint32_t, uint32_t, uint32_t *); static int fixup_devlog_params(struct adapter *); static int cfg_itype_and_nqueues(struct adapter *, struct intrs_and_queues *); static int contact_firmware(struct adapter *); static int partition_resources(struct adapter *); static int get_params__pre_init(struct adapter *); static int set_params__pre_init(struct adapter *); static int get_params__post_init(struct adapter *); static int set_params__post_init(struct adapter *); static void t4_set_desc(struct adapter *); static bool fixed_ifmedia(struct port_info *); static void build_medialist(struct port_info *); static void init_link_config(struct port_info *); static int fixup_link_config(struct port_info *); static int apply_link_config(struct port_info *); static int cxgbe_init_synchronized(struct vi_info *); static int cxgbe_uninit_synchronized(struct vi_info *); static int adapter_full_init(struct adapter *); static void adapter_full_uninit(struct adapter *); static int vi_full_init(struct vi_info *); static void vi_full_uninit(struct vi_info *); static int alloc_extra_vi(struct adapter *, struct port_info *, struct vi_info *); static void quiesce_txq(struct sge_txq *); static void quiesce_wrq(struct sge_wrq *); static void quiesce_iq_fl(struct adapter *, struct sge_iq *, struct sge_fl *); static void quiesce_vi(struct vi_info *); static int t4_alloc_irq(struct adapter *, struct irq *, int rid, driver_intr_t *, void *, char *); static int t4_free_irq(struct adapter *, struct irq *); static void t4_init_atid_table(struct adapter *); static void t4_free_atid_table(struct adapter *); static void get_regs(struct adapter *, struct t4_regdump *, uint8_t *); static void vi_refresh_stats(struct vi_info *); static void cxgbe_refresh_stats(struct vi_info *); static void cxgbe_tick(void *); static void vi_tick(void *); static void cxgbe_sysctls(struct port_info *); static int sysctl_int_array(SYSCTL_HANDLER_ARGS); static int sysctl_bitfield_8b(SYSCTL_HANDLER_ARGS); static int sysctl_bitfield_16b(SYSCTL_HANDLER_ARGS); static int sysctl_btphy(SYSCTL_HANDLER_ARGS); static int sysctl_noflowq(SYSCTL_HANDLER_ARGS); static int sysctl_tx_vm_wr(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS); static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS); static int sysctl_fec(SYSCTL_HANDLER_ARGS); static int sysctl_module_fec(SYSCTL_HANDLER_ARGS); static int sysctl_autoneg(SYSCTL_HANDLER_ARGS); static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS); static int sysctl_temperature(SYSCTL_HANDLER_ARGS); static int sysctl_vdd(SYSCTL_HANDLER_ARGS); static int sysctl_reset_sensor(SYSCTL_HANDLER_ARGS); static int sysctl_loadavg(SYSCTL_HANDLER_ARGS); static int sysctl_cctrl(SYSCTL_HANDLER_ARGS); static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS); static int sysctl_cim_la(SYSCTL_HANDLER_ARGS); static int 
sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS); static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS); static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tid_stats(SYSCTL_HANDLER_ARGS); static int sysctl_devlog(SYSCTL_HANDLER_ARGS); static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS); static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS); static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS); static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS); static int sysctl_meminfo(SYSCTL_HANDLER_ARGS); static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS); static int sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS); static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS); static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS); static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tids(SYSCTL_HANDLER_ARGS); static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tnl_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS); static int sysctl_tp_la(SYSCTL_HANDLER_ARGS); static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS); static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS); static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS); static int sysctl_cpus(SYSCTL_HANDLER_ARGS); static int sysctl_reset(SYSCTL_HANDLER_ARGS); #ifdef TCP_OFFLOAD static int sysctl_tls(SYSCTL_HANDLER_ARGS); static int sysctl_tls_rx_ports(SYSCTL_HANDLER_ARGS); static int sysctl_tls_rx_timeout(SYSCTL_HANDLER_ARGS); static int sysctl_tp_tick(SYSCTL_HANDLER_ARGS); static int sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS); static int sysctl_tp_timer(SYSCTL_HANDLER_ARGS); static int sysctl_tp_shift_cnt(SYSCTL_HANDLER_ARGS); static int sysctl_tp_backoff(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_tmr_idx_ofld(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_pktc_idx_ofld(SYSCTL_HANDLER_ARGS); #endif static int get_sge_context(struct adapter *, struct t4_sge_context *); static int load_fw(struct adapter *, struct t4_data *); static int load_cfg(struct adapter *, struct t4_data *); static int load_boot(struct adapter *, struct t4_bootrom *); static int load_bootcfg(struct adapter *, struct t4_data *); static int cudbg_dump(struct adapter *, struct t4_cudbg_dump *); static void free_offload_policy(struct t4_offload_policy *); static int set_offload_policy(struct adapter *, struct t4_offload_policy *); static int read_card_mem(struct adapter *, int, struct t4_mem_range *); static int read_i2c(struct adapter *, struct t4_i2c_data *); static int clear_stats(struct adapter *, u_int); +static int hold_clip_addr(struct adapter *, struct t4_clip_addr *); +static int release_clip_addr(struct adapter *, struct t4_clip_addr *); #ifdef TCP_OFFLOAD static int toe_capability(struct vi_info *, bool); static void t4_async_event(void *, int); #endif #ifdef KERN_TLS static int ktls_capability(struct adapter *, bool); #endif static int mod_event(module_t, int, void *); static int notify_siblings(device_t, int); static uint64_t vi_get_counter(struct ifnet *, ift_counter); static uint64_t cxgbe_get_counter(struct ifnet *, ift_counter); static void enable_vxlan_rx(struct adapter *); static void reset_adapter(void *, int); struct { uint16_t device; char *desc; } t4_pciids[] = { {0xa000, "Chelsio Terminator 4 FPGA"}, {0x4400, "Chelsio T440-dbg"}, {0x4401, "Chelsio T420-CR"}, {0x4402, "Chelsio T422-CR"}, {0x4403, "Chelsio T440-CR"}, {0x4404, "Chelsio T420-BCH"}, {0x4405, "Chelsio T440-BCH"}, {0x4406, "Chelsio T440-CH"}, 
{0x4407, "Chelsio T420-SO"}, {0x4408, "Chelsio T420-CX"}, {0x4409, "Chelsio T420-BT"}, {0x440a, "Chelsio T404-BT"}, {0x440e, "Chelsio T440-LP-CR"}, }, t5_pciids[] = { {0xb000, "Chelsio Terminator 5 FPGA"}, {0x5400, "Chelsio T580-dbg"}, {0x5401, "Chelsio T520-CR"}, /* 2 x 10G */ {0x5402, "Chelsio T522-CR"}, /* 2 x 10G, 2 X 1G */ {0x5403, "Chelsio T540-CR"}, /* 4 x 10G */ {0x5407, "Chelsio T520-SO"}, /* 2 x 10G, nomem */ {0x5409, "Chelsio T520-BT"}, /* 2 x 10GBaseT */ {0x540a, "Chelsio T504-BT"}, /* 4 x 1G */ {0x540d, "Chelsio T580-CR"}, /* 2 x 40G */ {0x540e, "Chelsio T540-LP-CR"}, /* 4 x 10G */ {0x5410, "Chelsio T580-LP-CR"}, /* 2 x 40G */ {0x5411, "Chelsio T520-LL-CR"}, /* 2 x 10G */ {0x5412, "Chelsio T560-CR"}, /* 1 x 40G, 2 x 10G */ {0x5414, "Chelsio T580-LP-SO-CR"}, /* 2 x 40G, nomem */ {0x5415, "Chelsio T502-BT"}, /* 2 x 1G */ {0x5418, "Chelsio T540-BT"}, /* 4 x 10GBaseT */ {0x5419, "Chelsio T540-LP-BT"}, /* 4 x 10GBaseT */ {0x541a, "Chelsio T540-SO-BT"}, /* 4 x 10GBaseT, nomem */ {0x541b, "Chelsio T540-SO-CR"}, /* 4 x 10G, nomem */ /* Custom */ {0x5483, "Custom T540-CR"}, {0x5484, "Custom T540-BT"}, }, t6_pciids[] = { {0xc006, "Chelsio Terminator 6 FPGA"}, /* T6 PE10K6 FPGA (PF0) */ {0x6400, "Chelsio T6-DBG-25"}, /* 2 x 10/25G, debug */ {0x6401, "Chelsio T6225-CR"}, /* 2 x 10/25G */ {0x6402, "Chelsio T6225-SO-CR"}, /* 2 x 10/25G, nomem */ {0x6403, "Chelsio T6425-CR"}, /* 4 x 10/25G */ {0x6404, "Chelsio T6425-SO-CR"}, /* 4 x 10/25G, nomem */ {0x6405, "Chelsio T6225-OCP-SO"}, /* 2 x 10/25G, nomem */ {0x6406, "Chelsio T62100-OCP-SO"}, /* 2 x 40/50/100G, nomem */ {0x6407, "Chelsio T62100-LP-CR"}, /* 2 x 40/50/100G */ {0x6408, "Chelsio T62100-SO-CR"}, /* 2 x 40/50/100G, nomem */ {0x6409, "Chelsio T6210-BT"}, /* 2 x 10GBASE-T */ {0x640d, "Chelsio T62100-CR"}, /* 2 x 40/50/100G */ {0x6410, "Chelsio T6-DBG-100"}, /* 2 x 40/50/100G, debug */ {0x6411, "Chelsio T6225-LL-CR"}, /* 2 x 10/25G */ {0x6414, "Chelsio T61100-OCP-SO"}, /* 1 x 40/50/100G, nomem */ {0x6415, "Chelsio T6201-BT"}, /* 2 x 1000BASE-T */ /* Custom */ {0x6480, "Custom T6225-CR"}, {0x6481, "Custom T62100-CR"}, {0x6482, "Custom T6225-CR"}, {0x6483, "Custom T62100-CR"}, {0x6484, "Custom T64100-CR"}, {0x6485, "Custom T6240-SO"}, {0x6486, "Custom T6225-SO-CR"}, {0x6487, "Custom T6225-CR"}, }; #ifdef TCP_OFFLOAD /* * service_iq_fl() has an iq and needs the fl. Offset of fl from the iq should * be exactly the same for both rxq and ofld_rxq. 
*/ CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq)); CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl)); #endif CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE); static int t4_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); uint8_t f = pci_get_function(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); /* Attach only to PF0 of the FPGA */ if (d == 0xa000 && f != 0) return (ENXIO); for (i = 0; i < nitems(t4_pciids); i++) { if (d == t4_pciids[i].device) { device_set_desc(dev, t4_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static int t5_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); uint8_t f = pci_get_function(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); /* Attach only to PF0 of the FPGA */ if (d == 0xb000 && f != 0) return (ENXIO); for (i = 0; i < nitems(t5_pciids); i++) { if (d == t5_pciids[i].device) { device_set_desc(dev, t5_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static int t6_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); for (i = 0; i < nitems(t6_pciids); i++) { if (d == t6_pciids[i].device) { device_set_desc(dev, t6_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static void t5_attribute_workaround(device_t dev) { device_t root_port; uint32_t v; /* * The T5 chips do not properly echo the No Snoop and Relaxed * Ordering attributes when replying to a TLP from a Root * Port. As a workaround, find the parent Root Port and * disable No Snoop and Relaxed Ordering. Note that this * affects all devices under this root port. 
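 *
 * Concretely (a summary of the code below, not additional behavior):
 * PCIEM_CTL_RELAXED_ORD_ENABLE and PCIEM_CTL_NOSNOOP_ENABLE are cleared in
 * the root port's PCIER_DEVICE_CTL register, and the value returned by
 * pcie_adjust_config() (the pre-update contents) is used to decide whether
 * anything actually changed and is worth logging.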
*/ root_port = pci_find_pcie_root_port(dev); if (root_port == NULL) { device_printf(dev, "Unable to find parent root port\n"); return; } v = pcie_adjust_config(root_port, PCIER_DEVICE_CTL, PCIEM_CTL_RELAXED_ORD_ENABLE | PCIEM_CTL_NOSNOOP_ENABLE, 0, 2); if ((v & (PCIEM_CTL_RELAXED_ORD_ENABLE | PCIEM_CTL_NOSNOOP_ENABLE)) != 0) device_printf(dev, "Disabled No Snoop/Relaxed Ordering on %s\n", device_get_nameunit(root_port)); } static const struct devnames devnames[] = { { .nexus_name = "t4nex", .ifnet_name = "cxgbe", .vi_ifnet_name = "vcxgbe", .pf03_drv_name = "t4iov", .vf_nexus_name = "t4vf", .vf_ifnet_name = "cxgbev" }, { .nexus_name = "t5nex", .ifnet_name = "cxl", .vi_ifnet_name = "vcxl", .pf03_drv_name = "t5iov", .vf_nexus_name = "t5vf", .vf_ifnet_name = "cxlv" }, { .nexus_name = "t6nex", .ifnet_name = "cc", .vi_ifnet_name = "vcc", .pf03_drv_name = "t6iov", .vf_nexus_name = "t6vf", .vf_ifnet_name = "ccv" } }; void t4_init_devnames(struct adapter *sc) { int id; id = chip_id(sc); if (id >= CHELSIO_T4 && id - CHELSIO_T4 < nitems(devnames)) sc->names = &devnames[id - CHELSIO_T4]; else { device_printf(sc->dev, "chip id %d is not supported.\n", id); sc->names = NULL; } } static int t4_ifnet_unit(struct adapter *sc, struct port_info *pi) { const char *parent, *name; long value; int line, unit; line = 0; parent = device_get_nameunit(sc->dev); name = sc->names->ifnet_name; while (resource_find_dev(&line, name, &unit, "at", parent) == 0) { if (resource_long_value(name, unit, "port", &value) == 0 && value == pi->port_id) return (unit); } return (-1); } static int t4_attach(device_t dev) { struct adapter *sc; int rc = 0, i, j, rqidx, tqidx, nports; struct make_dev_args mda; struct intrs_and_queues iaq; struct sge *s; uint32_t *buf; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) int ofld_tqidx; #endif #ifdef TCP_OFFLOAD int ofld_rqidx; #endif #ifdef DEV_NETMAP int nm_rqidx, nm_tqidx; #endif int num_vis; sc = device_get_softc(dev); sc->dev = dev; TUNABLE_INT_FETCH("hw.cxgbe.dflags", &sc->debug_flags); if ((pci_get_device(dev) & 0xff00) == 0x5400) t5_attribute_workaround(dev); pci_enable_busmaster(dev); if (pci_find_cap(dev, PCIY_EXPRESS, &i) == 0) { uint32_t v; pci_set_max_read_req(dev, 4096); v = pci_read_config(dev, i + PCIER_DEVICE_CTL, 2); sc->params.pci.mps = 128 << ((v & PCIEM_CTL_MAX_PAYLOAD) >> 5); if (pcie_relaxed_ordering == 0 && (v & PCIEM_CTL_RELAXED_ORD_ENABLE) != 0) { v &= ~PCIEM_CTL_RELAXED_ORD_ENABLE; pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2); } else if (pcie_relaxed_ordering == 1 && (v & PCIEM_CTL_RELAXED_ORD_ENABLE) == 0) { v |= PCIEM_CTL_RELAXED_ORD_ENABLE; pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2); } } sc->sge_gts_reg = MYPF_REG(A_SGE_PF_GTS); sc->sge_kdoorbell_reg = MYPF_REG(A_SGE_PF_KDOORBELL); sc->traceq = -1; mtx_init(&sc->ifp_lock, sc->ifp_lockname, 0, MTX_DEF); snprintf(sc->ifp_lockname, sizeof(sc->ifp_lockname), "%s tracer", device_get_nameunit(dev)); snprintf(sc->lockname, sizeof(sc->lockname), "%s", device_get_nameunit(dev)); mtx_init(&sc->sc_lock, sc->lockname, 0, MTX_DEF); t4_add_adapter(sc); mtx_init(&sc->sfl_lock, "starving freelists", 0, MTX_DEF); TAILQ_INIT(&sc->sfl); callout_init_mtx(&sc->sfl_callout, &sc->sfl_lock, 0); mtx_init(&sc->reg_lock, "indirect register access", 0, MTX_DEF); sc->policy = NULL; rw_init(&sc->policy_lock, "connection offload policy"); callout_init(&sc->ktls_tick, 1); #ifdef TCP_OFFLOAD TASK_INIT(&sc->async_event_task, 0, t4_async_event, sc); #endif refcount_init(&sc->vxlan_refcount, 0); TASK_INIT(&sc->reset_task, 0, 
reset_adapter, sc); sc->ctrlq_oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(sc->dev), SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO, "ctrlq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "control queues"); sc->fwq_oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(sc->dev), SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO, "fwq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "firmware event queue"); rc = t4_map_bars_0_and_4(sc); if (rc != 0) goto done; /* error message displayed already */ memset(sc->chan_map, 0xff, sizeof(sc->chan_map)); /* Prepare the adapter for operation. */ buf = malloc(PAGE_SIZE, M_CXGBE, M_ZERO | M_WAITOK); rc = -t4_prep_adapter(sc, buf); free(buf, M_CXGBE); if (rc != 0) { device_printf(dev, "failed to prepare adapter: %d.\n", rc); goto done; } /* * This is the real PF# to which we're attaching. Works from within PCI * passthrough environments too, where pci_get_function() could return a * different PF# depending on the passthrough configuration. We need to * use the real PF# in all our communication with the firmware. */ j = t4_read_reg(sc, A_PL_WHOAMI); sc->pf = chip_id(sc) <= CHELSIO_T5 ? G_SOURCEPF(j) : G_T6_SOURCEPF(j); sc->mbox = sc->pf; t4_init_devnames(sc); if (sc->names == NULL) { rc = ENOTSUP; goto done; /* error message displayed already */ } /* * Do this really early, with the memory windows set up even before the * character device. The userland tool's register i/o and mem read * will work even in "recovery mode". */ setup_memwin(sc); if (t4_init_devlog_params(sc, 0) == 0) fixup_devlog_params(sc); make_dev_args_init(&mda); mda.mda_devsw = &t4_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = sc; rc = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev)); if (rc != 0) device_printf(dev, "failed to create nexus char device: %d.\n", rc); /* Go no further if recovery mode has been requested. */ if (TUNABLE_INT_FETCH("hw.cxgbe.sos", &i) && i != 0) { device_printf(dev, "recovery mode.\n"); goto done; } #if defined(__i386__) if ((cpu_feature & CPUID_CX8) == 0) { device_printf(dev, "64 bit atomics not available.\n"); rc = ENOTSUP; goto done; } #endif /* Contact the firmware and try to become the master driver. */ rc = contact_firmware(sc); if (rc != 0) goto done; /* error message displayed already */ MPASS(sc->flags & FW_OK); rc = get_params__pre_init(sc); if (rc != 0) goto done; /* error message displayed already */ if (sc->flags & MASTER_PF) { rc = partition_resources(sc); if (rc != 0) goto done; /* error message displayed already */ t4_intr_clear(sc); } rc = get_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = set_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = t4_map_bar_2(sc); if (rc != 0) goto done; /* error message displayed already */ rc = t4_create_dma_tag(sc); if (rc != 0) goto done; /* error message displayed already */ /* * First pass over all the ports - allocate VIs and initialize some * basic parameters like mac address, port type, etc. */ for_each_port(sc, i) { struct port_info *pi; pi = malloc(sizeof(*pi), M_CXGBE, M_ZERO | M_WAITOK); sc->port[i] = pi; /* These must be set before t4_port_init */ pi->adapter = sc; pi->port_id = i; /* * XXX: vi[0] is special so we can't delay this allocation until * pi->nvi's final value is known. */ pi->vi = malloc(sizeof(struct vi_info) * t4_num_vis, M_CXGBE, M_ZERO | M_WAITOK); /* * Allocate the "main" VI and initialize parameters * like mac addr. 
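 * (vi[0] is the "main" VI; any extra VIs are allocated later, in
 * alloc_extra_vi() via vcxgbe_attach().)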
*/ rc = -t4_port_init(sc, sc->mbox, sc->pf, 0, i); if (rc != 0) { device_printf(dev, "unable to initialize port %d: %d\n", i, rc); free(pi->vi, M_CXGBE); free(pi, M_CXGBE); sc->port[i] = NULL; goto done; } snprintf(pi->lockname, sizeof(pi->lockname), "%sp%d", device_get_nameunit(dev), i); mtx_init(&pi->pi_lock, pi->lockname, 0, MTX_DEF); sc->chan_map[pi->tx_chan] = i; /* * The MPS counter for FCS errors doesn't work correctly on the * T6 so we use the MAC counter here. Which MAC is in use * depends on the link settings which will be known when the * link comes up. */ if (is_t6(sc)) { pi->fcs_reg = -1; } else if (is_t4(sc)) { pi->fcs_reg = PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_CRC_ERROR_L); } else { pi->fcs_reg = T5_PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_CRC_ERROR_L); } pi->fcs_base = 0; /* All VIs on this port share this media. */ ifmedia_init(&pi->media, IFM_IMASK, cxgbe_media_change, cxgbe_media_status); PORT_LOCK(pi); init_link_config(pi); fixup_link_config(pi); build_medialist(pi); if (fixed_ifmedia(pi)) pi->flags |= FIXED_IFMEDIA; PORT_UNLOCK(pi); pi->dev = device_add_child(dev, sc->names->ifnet_name, t4_ifnet_unit(sc, pi)); if (pi->dev == NULL) { device_printf(dev, "failed to add device for port %d.\n", i); rc = ENXIO; goto done; } pi->vi[0].dev = pi->dev; device_set_softc(pi->dev, pi); } /* * Interrupt type, # of interrupts, # of rx/tx queues, etc. */ nports = sc->params.nports; rc = cfg_itype_and_nqueues(sc, &iaq); if (rc != 0) goto done; /* error message displayed already */ num_vis = iaq.num_vis; sc->intr_type = iaq.intr_type; sc->intr_count = iaq.nirq; s = &sc->sge; s->nrxq = nports * iaq.nrxq; s->ntxq = nports * iaq.ntxq; if (num_vis > 1) { s->nrxq += nports * (num_vis - 1) * iaq.nrxq_vi; s->ntxq += nports * (num_vis - 1) * iaq.ntxq_vi; } s->neq = s->ntxq + s->nrxq; /* the free list in an rxq is an eq */ s->neq += nports; /* ctrl queues: 1 per port */ s->niq = s->nrxq + 1; /* 1 extra for firmware event queue */ #if defined(TCP_OFFLOAD) || defined(RATELIMIT) if (is_offload(sc) || is_ethoffload(sc)) { s->nofldtxq = nports * iaq.nofldtxq; if (num_vis > 1) s->nofldtxq += nports * (num_vis - 1) * iaq.nofldtxq_vi; s->neq += s->nofldtxq; s->ofld_txq = malloc(s->nofldtxq * sizeof(struct sge_ofld_txq), M_CXGBE, M_ZERO | M_WAITOK); } #endif #ifdef TCP_OFFLOAD if (is_offload(sc)) { s->nofldrxq = nports * iaq.nofldrxq; if (num_vis > 1) s->nofldrxq += nports * (num_vis - 1) * iaq.nofldrxq_vi; s->neq += s->nofldrxq; /* free list */ s->niq += s->nofldrxq; s->ofld_rxq = malloc(s->nofldrxq * sizeof(struct sge_ofld_rxq), M_CXGBE, M_ZERO | M_WAITOK); } #endif #ifdef DEV_NETMAP s->nnmrxq = 0; s->nnmtxq = 0; if (t4_native_netmap & NN_MAIN_VI) { s->nnmrxq += nports * iaq.nnmrxq; s->nnmtxq += nports * iaq.nnmtxq; } if (num_vis > 1 && t4_native_netmap & NN_EXTRA_VI) { s->nnmrxq += nports * (num_vis - 1) * iaq.nnmrxq_vi; s->nnmtxq += nports * (num_vis - 1) * iaq.nnmtxq_vi; } s->neq += s->nnmtxq + s->nnmrxq; s->niq += s->nnmrxq; s->nm_rxq = malloc(s->nnmrxq * sizeof(struct sge_nm_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->nm_txq = malloc(s->nnmtxq * sizeof(struct sge_nm_txq), M_CXGBE, M_ZERO | M_WAITOK); #endif MPASS(s->niq <= s->iqmap_sz); MPASS(s->neq <= s->eqmap_sz); s->ctrlq = malloc(nports * sizeof(struct sge_wrq), M_CXGBE, M_ZERO | M_WAITOK); s->rxq = malloc(s->nrxq * sizeof(struct sge_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->txq = malloc(s->ntxq * sizeof(struct sge_txq), M_CXGBE, M_ZERO | M_WAITOK); s->iqmap = malloc(s->iqmap_sz * sizeof(struct sge_iq *), M_CXGBE, M_ZERO | M_WAITOK); 
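/*
 * eqmap below is the egress-side counterpart of iqmap just above: both map
 * absolute (hardware) queue ids back to the driver's software state, and
 * both are sized from the iqmap_sz/eqmap_sz values derived from firmware
 * parameters earlier in attach.
 */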
s->eqmap = malloc(s->eqmap_sz * sizeof(struct sge_eq *), M_CXGBE, M_ZERO | M_WAITOK); sc->irq = malloc(sc->intr_count * sizeof(struct irq), M_CXGBE, M_ZERO | M_WAITOK); t4_init_l2t(sc, M_WAITOK); t4_init_smt(sc, M_WAITOK); t4_init_tx_sched(sc); t4_init_atid_table(sc); #ifdef RATELIMIT t4_init_etid_table(sc); #endif #ifdef INET6 t4_init_clip_table(sc); #endif if (sc->vres.key.size != 0) sc->key_map = vmem_create("T4TLS key map", sc->vres.key.start, sc->vres.key.size, 32, 0, M_FIRSTFIT | M_WAITOK); /* * Second pass over the ports. This time we know the number of rx and * tx queues that each port should get. */ rqidx = tqidx = 0; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) ofld_tqidx = 0; #endif #ifdef TCP_OFFLOAD ofld_rqidx = 0; #endif #ifdef DEV_NETMAP nm_rqidx = nm_tqidx = 0; #endif for_each_port(sc, i) { struct port_info *pi = sc->port[i]; struct vi_info *vi; if (pi == NULL) continue; pi->nvi = num_vis; for_each_vi(pi, j, vi) { vi->pi = pi; vi->adapter = sc; vi->first_intr = -1; vi->qsize_rxq = t4_qsize_rxq; vi->qsize_txq = t4_qsize_txq; vi->first_rxq = rqidx; vi->first_txq = tqidx; vi->tmr_idx = t4_tmr_idx; vi->pktc_idx = t4_pktc_idx; vi->nrxq = j == 0 ? iaq.nrxq : iaq.nrxq_vi; vi->ntxq = j == 0 ? iaq.ntxq : iaq.ntxq_vi; rqidx += vi->nrxq; tqidx += vi->ntxq; if (j == 0 && vi->ntxq > 1) vi->rsrv_noflowq = t4_rsrv_noflowq ? 1 : 0; else vi->rsrv_noflowq = 0; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) vi->first_ofld_txq = ofld_tqidx; vi->nofldtxq = j == 0 ? iaq.nofldtxq : iaq.nofldtxq_vi; ofld_tqidx += vi->nofldtxq; #endif #ifdef TCP_OFFLOAD vi->ofld_tmr_idx = t4_tmr_idx_ofld; vi->ofld_pktc_idx = t4_pktc_idx_ofld; vi->first_ofld_rxq = ofld_rqidx; vi->nofldrxq = j == 0 ? iaq.nofldrxq : iaq.nofldrxq_vi; ofld_rqidx += vi->nofldrxq; #endif #ifdef DEV_NETMAP vi->first_nm_rxq = nm_rqidx; vi->first_nm_txq = nm_tqidx; if (j == 0) { vi->nnmrxq = iaq.nnmrxq; vi->nnmtxq = iaq.nnmtxq; } else { vi->nnmrxq = iaq.nnmrxq_vi; vi->nnmtxq = iaq.nnmtxq_vi; } nm_rqidx += vi->nnmrxq; nm_tqidx += vi->nnmtxq; #endif } } rc = t4_setup_intr_handlers(sc); if (rc != 0) { device_printf(dev, "failed to setup interrupt handlers: %d\n", rc); goto done; } rc = bus_generic_probe(dev); if (rc != 0) { device_printf(dev, "failed to probe child drivers: %d\n", rc); goto done; } /* * Ensure thread-safe mailbox access (in debug builds). * * So far this was the only thread accessing the mailbox but various * ifnets and sysctls are about to be created and their handlers/ioctls * will access the mailbox from different threads. */ sc->flags |= CHK_MBOX_ACCESS; rc = bus_generic_attach(dev); if (rc != 0) { device_printf(dev, "failed to attach all child ports: %d\n", rc); goto done; } device_printf(dev, "PCIe gen%d x%d, %d ports, %d %s interrupt%s, %d eq, %d iq\n", sc->params.pci.speed, sc->params.pci.width, sc->params.nports, sc->intr_count, sc->intr_type == INTR_MSIX ? "MSI-X" : (sc->intr_type == INTR_MSI ? "MSI" : "INTx"), sc->intr_count > 1 ? "s" : "", sc->sge.neq, sc->sge.niq); t4_set_desc(sc); notify_siblings(dev, 0); done: if (rc != 0 && sc->cdev) { /* cdev was created and so cxgbetool works; recover that way. 
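 * (Illustrative example of such a recovery, assuming a T4 nexus named
 * t4nex0 and a firmware image at /tmp/fw.bin:
 *	cxgbetool t4nex0 loadfw /tmp/fw.bin
 * followed by a reload of the driver.)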
*/ device_printf(dev, "error during attach, adapter is now in recovery mode.\n"); rc = 0; } if (rc != 0) t4_detach_common(dev); else t4_sysctls(sc); return (rc); } static int t4_child_location_str(device_t bus, device_t dev, char *buf, size_t buflen) { struct adapter *sc; struct port_info *pi; int i; sc = device_get_softc(bus); buf[0] = '\0'; for_each_port(sc, i) { pi = sc->port[i]; if (pi != NULL && pi->dev == dev) { snprintf(buf, buflen, "port=%d", pi->port_id); break; } } return (0); } static int t4_ready(device_t dev) { struct adapter *sc; sc = device_get_softc(dev); if (sc->flags & FW_OK) return (0); return (ENXIO); } static int t4_read_port_device(device_t dev, int port, device_t *child) { struct adapter *sc; struct port_info *pi; sc = device_get_softc(dev); if (port < 0 || port >= MAX_NPORTS) return (EINVAL); pi = sc->port[port]; if (pi == NULL || pi->dev == NULL) return (ENXIO); *child = pi->dev; return (0); } static int notify_siblings(device_t dev, int detaching) { device_t sibling; int error, i; error = 0; for (i = 0; i < PCI_FUNCMAX; i++) { if (i == pci_get_function(dev)) continue; sibling = pci_find_dbsf(pci_get_domain(dev), pci_get_bus(dev), pci_get_slot(dev), i); if (sibling == NULL || !device_is_attached(sibling)) continue; if (detaching) error = T4_DETACH_CHILD(sibling); else (void)T4_ATTACH_CHILD(sibling); if (error) break; } return (error); } /* * Idempotent */ static int t4_detach(device_t dev) { struct adapter *sc; int rc; sc = device_get_softc(dev); rc = notify_siblings(dev, 1); if (rc) { device_printf(dev, "failed to detach sibling devices: %d\n", rc); return (rc); } return (t4_detach_common(dev)); } int t4_detach_common(device_t dev) { struct adapter *sc; struct port_info *pi; int i, rc; sc = device_get_softc(dev); if (sc->cdev) { destroy_dev(sc->cdev); sc->cdev = NULL; } sx_xlock(&t4_list_lock); SLIST_REMOVE(&t4_list, sc, adapter, link); sx_xunlock(&t4_list_lock); sc->flags &= ~CHK_MBOX_ACCESS; if (sc->flags & FULL_INIT_DONE) { if (!(sc->flags & IS_VF)) t4_intr_disable(sc); } if (device_is_attached(dev)) { rc = bus_generic_detach(dev); if (rc) { device_printf(dev, "failed to detach child devices: %d\n", rc); return (rc); } } #ifdef TCP_OFFLOAD taskqueue_drain(taskqueue_thread, &sc->async_event_task); #endif for (i = 0; i < sc->intr_count; i++) t4_free_irq(sc, &sc->irq[i]); if ((sc->flags & (IS_VF | FW_OK)) == FW_OK) t4_free_tx_sched(sc); for (i = 0; i < MAX_NPORTS; i++) { pi = sc->port[i]; if (pi) { t4_free_vi(sc, sc->mbox, sc->pf, 0, pi->vi[0].viid); if (pi->dev) device_delete_child(dev, pi->dev); mtx_destroy(&pi->pi_lock); free(pi->vi, M_CXGBE); free(pi, M_CXGBE); } } device_delete_children(dev); adapter_full_uninit(sc); if ((sc->flags & (IS_VF | FW_OK)) == FW_OK) t4_fw_bye(sc, sc->mbox); if (sc->intr_type == INTR_MSI || sc->intr_type == INTR_MSIX) pci_release_msi(dev); if (sc->regs_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->regs_rid, sc->regs_res); if (sc->udbs_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->udbs_rid, sc->udbs_res); if (sc->msix_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->msix_rid, sc->msix_res); if (sc->l2t) t4_free_l2t(sc->l2t); if (sc->smt) t4_free_smt(sc->smt); t4_free_atid_table(sc); #ifdef RATELIMIT t4_free_etid_table(sc); #endif if (sc->key_map) vmem_destroy(sc->key_map); #ifdef INET6 t4_destroy_clip_table(sc); #endif #if defined(TCP_OFFLOAD) || defined(RATELIMIT) free(sc->sge.ofld_txq, M_CXGBE); #endif #ifdef TCP_OFFLOAD free(sc->sge.ofld_rxq, M_CXGBE); #endif #ifdef DEV_NETMAP free(sc->sge.nm_rxq, M_CXGBE); 
free(sc->sge.nm_txq, M_CXGBE); #endif free(sc->irq, M_CXGBE); free(sc->sge.rxq, M_CXGBE); free(sc->sge.txq, M_CXGBE); free(sc->sge.ctrlq, M_CXGBE); free(sc->sge.iqmap, M_CXGBE); free(sc->sge.eqmap, M_CXGBE); free(sc->tids.ftid_tab, M_CXGBE); free(sc->tids.hpftid_tab, M_CXGBE); free_hftid_hash(&sc->tids); free(sc->tids.tid_tab, M_CXGBE); free(sc->tt.tls_rx_ports, M_CXGBE); t4_destroy_dma_tag(sc); callout_drain(&sc->ktls_tick); callout_drain(&sc->sfl_callout); if (mtx_initialized(&sc->tids.ftid_lock)) { mtx_destroy(&sc->tids.ftid_lock); cv_destroy(&sc->tids.ftid_cv); } if (mtx_initialized(&sc->tids.atid_lock)) mtx_destroy(&sc->tids.atid_lock); if (mtx_initialized(&sc->ifp_lock)) mtx_destroy(&sc->ifp_lock); if (rw_initialized(&sc->policy_lock)) { rw_destroy(&sc->policy_lock); #ifdef TCP_OFFLOAD if (sc->policy != NULL) free_offload_policy(sc->policy); #endif } for (i = 0; i < NUM_MEMWIN; i++) { struct memwin *mw = &sc->memwin[i]; if (rw_initialized(&mw->mw_lock)) rw_destroy(&mw->mw_lock); } mtx_destroy(&sc->sfl_lock); mtx_destroy(&sc->reg_lock); mtx_destroy(&sc->sc_lock); bzero(sc, sizeof(*sc)); return (0); } static inline bool ok_to_reset(struct adapter *sc) { struct tid_info *t = &sc->tids; struct port_info *pi; struct vi_info *vi; int i, j; const int caps = IFCAP_TOE | IFCAP_TXTLS | IFCAP_NETMAP | IFCAP_TXRTLMT; ASSERT_SYNCHRONIZED_OP(sc); MPASS(!(sc->flags & IS_VF)); for_each_port(sc, i) { pi = sc->port[i]; for_each_vi(pi, j, vi) { if (vi->ifp->if_capenable & caps) return (false); } } if (atomic_load_int(&t->tids_in_use) > 0) return (false); if (atomic_load_int(&t->stids_in_use) > 0) return (false); if (atomic_load_int(&t->atids_in_use) > 0) return (false); if (atomic_load_int(&t->ftids_in_use) > 0) return (false); if (atomic_load_int(&t->hpftids_in_use) > 0) return (false); if (atomic_load_int(&t->etids_in_use) > 0) return (false); return (true); } static int t4_suspend(device_t dev) { struct adapter *sc = device_get_softc(dev); struct port_info *pi; struct vi_info *vi; struct ifnet *ifp; struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *wrq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #if defined(TCP_OFFLOAD) || defined(RATELIMIT) struct sge_ofld_txq *ofld_txq; #endif int rc, i, j, k; CH_ALERT(sc, "suspend requested\n"); rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4sus"); if (rc != 0) return (ENXIO); /* XXX: Can the kernel call suspend repeatedly without resume? */ MPASS(!hw_off_limits(sc)); if (!ok_to_reset(sc)) { /* XXX: should list what resource is preventing suspend. */ CH_ERR(sc, "not safe to suspend.\n"); rc = EBUSY; goto done; } /* No more DMA or interrupts. */ t4_shutdown_adapter(sc); /* Quiesce all activity. */ for_each_port(sc, i) { pi = sc->port[i]; pi->vxlan_tcam_entry = false; PORT_LOCK(pi); if (pi->up_vis > 0) { /* * t4_shutdown_adapter has already shut down all the * PHYs but it also disables interrupts and DMA so there * won't be a link interrupt. So we update the state * manually and inform the kernel. */ pi->link_cfg.link_ok = false; t4_os_link_changed(pi); } PORT_UNLOCK(pi); for_each_vi(pi, j, vi) { vi->xact_addr_filt = -1; if (!(vi->flags & VI_INIT_DONE)) continue; ifp = vi->ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { mtx_lock(&vi->tick_mtx); vi->flags |= VI_SKIP_STATS; callout_stop(&vi->tick); mtx_unlock(&vi->tick_mtx); callout_drain(&vi->tick); } /* * Note that the HW is not available. 
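 * The flag manipulation below is how the rest of the driver finds out: each
 * queue carries an *_HW_ALLOCATED flag so that code paths can distinguish
 * "software state exists" from "hardware context exists". A sketch of the
 * kind of guard a consumer might use (illustrative, not driver code):
 *
 *	if (!(eq->flags & EQ_HW_ALLOCATED))
 *		return (ENXIO);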
*/ for_each_txq(vi, k, txq) { TXQ_LOCK(txq); txq->eq.flags &= ~(EQ_ENABLED | EQ_HW_ALLOCATED); TXQ_UNLOCK(txq); } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) for_each_ofld_txq(vi, k, ofld_txq) { ofld_txq->wrq.eq.flags &= ~EQ_HW_ALLOCATED; } #endif for_each_rxq(vi, k, rxq) { rxq->iq.flags &= ~IQ_HW_ALLOCATED; } #if defined(TCP_OFFLOAD) for_each_ofld_rxq(vi, k, ofld_rxq) { ofld_rxq->iq.flags &= ~IQ_HW_ALLOCATED; } #endif quiesce_vi(vi); } if (sc->flags & FULL_INIT_DONE) { /* Control queue */ wrq = &sc->sge.ctrlq[i]; wrq->eq.flags &= ~EQ_HW_ALLOCATED; quiesce_wrq(wrq); } } if (sc->flags & FULL_INIT_DONE) { /* Firmware event queue */ sc->sge.fwq.flags &= ~IQ_HW_ALLOCATED; quiesce_iq_fl(sc, &sc->sge.fwq, NULL); } /* Mark the adapter totally off limits. */ mtx_lock(&sc->reg_lock); sc->flags |= HW_OFF_LIMITS; sc->flags &= ~(FW_OK | MASTER_PF); sc->reset_thread = NULL; mtx_unlock(&sc->reg_lock); sc->num_resets++; CH_ALERT(sc, "suspend completed.\n"); done: end_synchronized_op(sc, 0); return (rc); } struct adapter_pre_reset_state { u_int flags; uint16_t nbmcaps; uint16_t linkcaps; uint16_t switchcaps; uint16_t niccaps; uint16_t toecaps; uint16_t rdmacaps; uint16_t cryptocaps; uint16_t iscsicaps; uint16_t fcoecaps; u_int cfcsum; char cfg_file[32]; struct adapter_params params; struct t4_virt_res vres; struct tid_info tids; struct sge sge; int rawf_base; int nrawf; }; static void save_caps_and_params(struct adapter *sc, struct adapter_pre_reset_state *o) { ASSERT_SYNCHRONIZED_OP(sc); o->flags = sc->flags; o->nbmcaps = sc->nbmcaps; o->linkcaps = sc->linkcaps; o->switchcaps = sc->switchcaps; o->niccaps = sc->niccaps; o->toecaps = sc->toecaps; o->rdmacaps = sc->rdmacaps; o->cryptocaps = sc->cryptocaps; o->iscsicaps = sc->iscsicaps; o->fcoecaps = sc->fcoecaps; o->cfcsum = sc->cfcsum; MPASS(sizeof(o->cfg_file) == sizeof(sc->cfg_file)); memcpy(o->cfg_file, sc->cfg_file, sizeof(o->cfg_file)); o->params = sc->params; o->vres = sc->vres; o->tids = sc->tids; o->sge = sc->sge; o->rawf_base = sc->rawf_base; o->nrawf = sc->nrawf; } static int compare_caps_and_params(struct adapter *sc, struct adapter_pre_reset_state *o) { int rc = 0; ASSERT_SYNCHRONIZED_OP(sc); /* Capabilities */ #define COMPARE_CAPS(c) do { \ if (o->c##caps != sc->c##caps) { \ CH_ERR(sc, "%scaps 0x%04x -> 0x%04x.\n", #c, o->c##caps, \ sc->c##caps); \ rc = EINVAL; \ } \ } while (0) COMPARE_CAPS(nbm); COMPARE_CAPS(link); COMPARE_CAPS(switch); COMPARE_CAPS(nic); COMPARE_CAPS(toe); COMPARE_CAPS(rdma); COMPARE_CAPS(crypto); COMPARE_CAPS(iscsi); COMPARE_CAPS(fcoe); #undef COMPARE_CAPS /* Firmware config file */ if (o->cfcsum != sc->cfcsum) { CH_ERR(sc, "config file %s (0x%x) -> %s (0x%x)\n", o->cfg_file, o->cfcsum, sc->cfg_file, sc->cfcsum); rc = EINVAL; } #define COMPARE_PARAM(p, name) do { \ if (o->p != sc->p) { \ CH_ERR(sc, #name " %d -> %d\n", o->p, sc->p); \ rc = EINVAL; \ } \ } while (0) COMPARE_PARAM(sge.iq_start, iq_start); COMPARE_PARAM(sge.eq_start, eq_start); COMPARE_PARAM(tids.ftid_base, ftid_base); COMPARE_PARAM(tids.ftid_end, ftid_end); COMPARE_PARAM(tids.nftids, nftids); COMPARE_PARAM(vres.l2t.start, l2t_start); COMPARE_PARAM(vres.l2t.size, l2t_size); COMPARE_PARAM(sge.iqmap_sz, iqmap_sz); COMPARE_PARAM(sge.eqmap_sz, eqmap_sz); COMPARE_PARAM(tids.tid_base, tid_base); COMPARE_PARAM(tids.hpftid_base, hpftid_base); COMPARE_PARAM(tids.hpftid_end, hpftid_end); COMPARE_PARAM(tids.nhpftids, nhpftids); COMPARE_PARAM(rawf_base, rawf_base); COMPARE_PARAM(nrawf, nrawf); COMPARE_PARAM(params.mps_bg_map, mps_bg_map); 
COMPARE_PARAM(params.filter2_wr_support, filter2_wr_support); COMPARE_PARAM(params.ulptx_memwrite_dsgl, ulptx_memwrite_dsgl); COMPARE_PARAM(params.fr_nsmr_tpte_wr_support, fr_nsmr_tpte_wr_support); COMPARE_PARAM(params.max_pkts_per_eth_tx_pkts_wr, max_pkts_per_eth_tx_pkts_wr); COMPARE_PARAM(tids.ntids, ntids); COMPARE_PARAM(tids.etid_base, etid_base); COMPARE_PARAM(tids.etid_end, etid_end); COMPARE_PARAM(tids.netids, netids); COMPARE_PARAM(params.eo_wr_cred, eo_wr_cred); COMPARE_PARAM(params.ethoffload, ethoffload); COMPARE_PARAM(tids.natids, natids); COMPARE_PARAM(tids.stid_base, stid_base); COMPARE_PARAM(vres.ddp.start, ddp_start); COMPARE_PARAM(vres.ddp.size, ddp_size); COMPARE_PARAM(params.ofldq_wr_cred, ofldq_wr_cred); COMPARE_PARAM(vres.stag.start, stag_start); COMPARE_PARAM(vres.stag.size, stag_size); COMPARE_PARAM(vres.rq.start, rq_start); COMPARE_PARAM(vres.rq.size, rq_size); COMPARE_PARAM(vres.pbl.start, pbl_start); COMPARE_PARAM(vres.pbl.size, pbl_size); COMPARE_PARAM(vres.qp.start, qp_start); COMPARE_PARAM(vres.qp.size, qp_size); COMPARE_PARAM(vres.cq.start, cq_start); COMPARE_PARAM(vres.cq.size, cq_size); COMPARE_PARAM(vres.ocq.start, ocq_start); COMPARE_PARAM(vres.ocq.size, ocq_size); COMPARE_PARAM(vres.srq.start, srq_start); COMPARE_PARAM(vres.srq.size, srq_size); COMPARE_PARAM(params.max_ordird_qp, max_ordird_qp); COMPARE_PARAM(params.max_ird_adapter, max_ird_adapter); COMPARE_PARAM(vres.iscsi.start, iscsi_start); COMPARE_PARAM(vres.iscsi.size, iscsi_size); COMPARE_PARAM(vres.key.start, key_start); COMPARE_PARAM(vres.key.size, key_size); #undef COMPARE_PARAM return (rc); } static int t4_resume(device_t dev) { struct adapter *sc = device_get_softc(dev); struct adapter_pre_reset_state *old_state = NULL; struct port_info *pi; struct vi_info *vi; struct ifnet *ifp; struct sge_txq *txq; int rc, i, j, k; CH_ALERT(sc, "resume requested.\n"); rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4res"); if (rc != 0) return (ENXIO); MPASS(hw_off_limits(sc)); MPASS((sc->flags & FW_OK) == 0); MPASS((sc->flags & MASTER_PF) == 0); MPASS(sc->reset_thread == NULL); sc->reset_thread = curthread; /* Register access is expected to work by the time we're here. */ if (t4_read_reg(sc, A_PL_WHOAMI) == 0xffffffff) { CH_ERR(sc, "%s: can't read device registers\n", __func__); rc = ENXIO; goto done; } /* Restore memory window. */ setup_memwin(sc); /* Go no further if recovery mode has been requested. */ if (TUNABLE_INT_FETCH("hw.cxgbe.sos", &i) && i != 0) { CH_ALERT(sc, "recovery mode on resume.\n"); rc = 0; mtx_lock(&sc->reg_lock); sc->flags &= ~HW_OFF_LIMITS; mtx_unlock(&sc->reg_lock); goto done; } old_state = malloc(sizeof(*old_state), M_CXGBE, M_ZERO | M_WAITOK); save_caps_and_params(sc, old_state); /* Reestablish contact with firmware and become the primary PF. 
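 * This mirrors the attach-time sequence: contact_firmware(), then
 * partition_resources() for the master PF, then the
 * get/set_params__post_init() pair.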
*/ rc = contact_firmware(sc); if (rc != 0) goto done; /* error message displayed already */ MPASS(sc->flags & FW_OK); if (sc->flags & MASTER_PF) { rc = partition_resources(sc); if (rc != 0) goto done; /* error message displayed already */ t4_intr_clear(sc); } rc = get_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = set_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = compare_caps_and_params(sc, old_state); if (rc != 0) goto done; /* error message displayed already */ for_each_port(sc, i) { pi = sc->port[i]; MPASS(pi != NULL); MPASS(pi->vi != NULL); MPASS(pi->vi[0].dev == pi->dev); rc = -t4_port_init(sc, sc->mbox, sc->pf, 0, i); if (rc != 0) { CH_ERR(sc, "failed to re-initialize port %d: %d\n", i, rc); goto done; } MPASS(sc->chan_map[pi->tx_chan] == i); PORT_LOCK(pi); fixup_link_config(pi); build_medialist(pi); PORT_UNLOCK(pi); for_each_vi(pi, j, vi) { if (IS_MAIN_VI(vi)) continue; rc = alloc_extra_vi(sc, pi, vi); if (rc != 0) { CH_ERR(vi, "failed to re-allocate extra VI: %d\n", rc); goto done; } } } /* * Interrupts and queues are about to be enabled and other threads will * want to access the hardware too. It is safe to do so. Note that * this thread is still in the middle of a synchronized_op. */ mtx_lock(&sc->reg_lock); sc->flags &= ~HW_OFF_LIMITS; mtx_unlock(&sc->reg_lock); if (sc->flags & FULL_INIT_DONE) { rc = adapter_full_init(sc); if (rc != 0) { CH_ERR(sc, "failed to re-initialize adapter: %d\n", rc); goto done; } if (sc->vxlan_refcount > 0) enable_vxlan_rx(sc); for_each_port(sc, i) { pi = sc->port[i]; for_each_vi(pi, j, vi) { if (!(vi->flags & VI_INIT_DONE)) continue; rc = vi_full_init(vi); if (rc != 0) { CH_ERR(vi, "failed to re-initialize " "interface: %d\n", rc); goto done; } ifp = vi->ifp; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) continue; /* * Note that we do not setup multicast addresses * in the first pass. This ensures that the * unicast DMACs for all VIs on all ports get an * MPS TCAM entry. */ rc = update_mac_settings(ifp, XGMAC_ALL & ~XGMAC_MCADDRS); if (rc != 0) { CH_ERR(vi, "failed to re-configure MAC: %d\n", rc); goto done; } rc = -t4_enable_vi(sc, sc->mbox, vi->viid, true, true); if (rc != 0) { CH_ERR(vi, "failed to re-enable VI: %d\n", rc); goto done; } for_each_txq(vi, k, txq) { TXQ_LOCK(txq); txq->eq.flags |= EQ_ENABLED; TXQ_UNLOCK(txq); } mtx_lock(&vi->tick_mtx); vi->flags &= ~VI_SKIP_STATS; callout_schedule(&vi->tick, hz); mtx_unlock(&vi->tick_mtx); } PORT_LOCK(pi); if (pi->up_vis > 0) { t4_update_port_info(pi); fixup_link_config(pi); build_medialist(pi); apply_link_config(pi); if (pi->link_cfg.link_ok) t4_os_link_changed(pi); } PORT_UNLOCK(pi); } /* Now reprogram the L2 multicast addresses. 
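 * (Second pass; see the comment above for why only the unicast DMACs were
 * programmed in the first pass.)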
*/ for_each_port(sc, i) { pi = sc->port[i]; for_each_vi(pi, j, vi) { if (!(vi->flags & VI_INIT_DONE)) continue; ifp = vi->ifp; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) continue; rc = update_mac_settings(ifp, XGMAC_MCADDRS); if (rc != 0) { CH_ERR(vi, "failed to re-configure MCAST MACs: %d\n", rc); rc = 0; /* carry on */ } } } } done: if (rc == 0) { sc->incarnation++; CH_ALERT(sc, "resume completed.\n"); } end_synchronized_op(sc, 0); free(old_state, M_CXGBE); return (rc); } static int t4_reset_prepare(device_t dev, device_t child) { struct adapter *sc = device_get_softc(dev); CH_ALERT(sc, "reset_prepare.\n"); return (0); } static int t4_reset_post(device_t dev, device_t child) { struct adapter *sc = device_get_softc(dev); CH_ALERT(sc, "reset_post.\n"); return (0); } static void reset_adapter(void *arg, int pending) { struct adapter *sc = arg; int rc; CH_ALERT(sc, "reset requested.\n"); rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4rst1"); if (rc != 0) return; if (hw_off_limits(sc)) { CH_ERR(sc, "adapter is suspended, use resume (not reset).\n"); rc = ENXIO; goto done; } if (!ok_to_reset(sc)) { /* XXX: should list what resource is preventing reset. */ CH_ERR(sc, "not safe to reset.\n"); rc = EBUSY; goto done; } done: end_synchronized_op(sc, 0); if (rc != 0) return; /* Error logged already. */ mtx_lock(&Giant); rc = BUS_RESET_CHILD(device_get_parent(sc->dev), sc->dev, 0); mtx_unlock(&Giant); if (rc != 0) CH_ERR(sc, "bus_reset_child failed: %d.\n", rc); else CH_ALERT(sc, "bus_reset_child succeeded.\n"); } static int cxgbe_probe(device_t dev) { char buf[128]; struct port_info *pi = device_get_softc(dev); snprintf(buf, sizeof(buf), "port %d", pi->port_id); device_set_desc_copy(dev, buf); return (BUS_PROBE_DEFAULT); } #define T4_CAP (IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | \ IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_JUMBO_MTU | IFCAP_LRO | \ IFCAP_VLAN_HWTSO | IFCAP_LINKSTATE | IFCAP_HWCSUM_IPV6 | IFCAP_HWSTATS | \ IFCAP_HWRXTSTMP | IFCAP_MEXTPG) #define T4_CAP_ENABLE (T4_CAP) static int cxgbe_vi_attach(device_t dev, struct vi_info *vi) { struct ifnet *ifp; struct sbuf *sb; struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; struct pfil_head_args pa; struct adapter *sc = vi->adapter; ctx = device_get_sysctl_ctx(vi->dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(vi->dev)); vi->rxq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "rxq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "NIC rx queues"); vi->txq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "txq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "NIC tx queues"); #ifdef DEV_NETMAP vi->nm_rxq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "nm_rxq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "netmap rx queues"); vi->nm_txq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "nm_txq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "netmap tx queues"); #endif #ifdef TCP_OFFLOAD vi->ofld_rxq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "ofld_rxq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE rx queues"); #endif #if defined(TCP_OFFLOAD) || defined(RATELIMIT) vi->ofld_txq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "ofld_txq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE/ETHOFLD tx queues"); #endif vi->xact_addr_filt = -1; mtx_init(&vi->tick_mtx, "vi tick", NULL, MTX_DEF); callout_init_mtx(&vi->tick, &vi->tick_mtx, 0); if (sc->flags & IS_VF || t4_tx_vm_wr != 0) vi->flags |= TX_USES_VM_WR; /* Allocate an ifnet and set it up */ ifp = if_alloc_dev(IFT_ETHER, dev); if (ifp == NULL) { device_printf(dev, "Cannot allocate ifnet\n"); return (ENOMEM); } vi->ifp = ifp; 
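/*
 * Standard ifnet wiring: everything from here until ether_ifattach() fills
 * in the methods, capabilities, and offload limits advertised to the
 * network stack.
 */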
ifp->if_softc = vi; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = cxgbe_init; ifp->if_ioctl = cxgbe_ioctl; ifp->if_transmit = cxgbe_transmit; ifp->if_qflush = cxgbe_qflush; if (vi->pi->nvi > 1 || sc->flags & IS_VF) ifp->if_get_counter = vi_get_counter; else ifp->if_get_counter = cxgbe_get_counter; #if defined(KERN_TLS) || defined(RATELIMIT) ifp->if_snd_tag_alloc = cxgbe_snd_tag_alloc; ifp->if_snd_tag_modify = cxgbe_snd_tag_modify; ifp->if_snd_tag_query = cxgbe_snd_tag_query; ifp->if_snd_tag_free = cxgbe_snd_tag_free; #endif #ifdef RATELIMIT ifp->if_ratelimit_query = cxgbe_ratelimit_query; #endif ifp->if_capabilities = T4_CAP; ifp->if_capenable = T4_CAP_ENABLE; ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO | CSUM_UDP_IPV6 | CSUM_TCP_IPV6; if (chip_id(sc) >= CHELSIO_T6) { ifp->if_capabilities |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO; ifp->if_capenable |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO; ifp->if_hwassist |= CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN; } #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0) ifp->if_capabilities |= IFCAP_TOE; #endif #ifdef RATELIMIT if (is_ethoffload(sc) && vi->nofldtxq != 0) { ifp->if_capabilities |= IFCAP_TXRTLMT; ifp->if_capenable |= IFCAP_TXRTLMT; } #endif ifp->if_hw_tsomax = IP_MAXPACKET; if (vi->flags & TX_USES_VM_WR) ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_VM_TSO; else ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO; #ifdef RATELIMIT if (is_ethoffload(sc) && vi->nofldtxq != 0) ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_EO_TSO; #endif ifp->if_hw_tsomaxsegsize = 65536; #ifdef KERN_TLS if (is_ktls(sc)) { ifp->if_capabilities |= IFCAP_TXTLS; if (sc->flags & KERN_TLS_ON) ifp->if_capenable |= IFCAP_TXTLS; } #endif ether_ifattach(ifp, vi->hw_addr); #ifdef DEV_NETMAP if (vi->nnmrxq != 0) cxgbe_nm_attach(vi); #endif sb = sbuf_new_auto(); sbuf_printf(sb, "%d txq, %d rxq (NIC)", vi->ntxq, vi->nrxq); #if defined(TCP_OFFLOAD) || defined(RATELIMIT) switch (ifp->if_capabilities & (IFCAP_TOE | IFCAP_TXRTLMT)) { case IFCAP_TOE: sbuf_printf(sb, "; %d txq (TOE)", vi->nofldtxq); break; case IFCAP_TOE | IFCAP_TXRTLMT: sbuf_printf(sb, "; %d txq (TOE/ETHOFLD)", vi->nofldtxq); break; case IFCAP_TXRTLMT: sbuf_printf(sb, "; %d txq (ETHOFLD)", vi->nofldtxq); break; } #endif #ifdef TCP_OFFLOAD if (ifp->if_capabilities & IFCAP_TOE) sbuf_printf(sb, ", %d rxq (TOE)", vi->nofldrxq); #endif #ifdef DEV_NETMAP if (ifp->if_capabilities & IFCAP_NETMAP) sbuf_printf(sb, "; %d txq, %d rxq (netmap)", vi->nnmtxq, vi->nnmrxq); #endif sbuf_finish(sb); device_printf(dev, "%s\n", sbuf_data(sb)); sbuf_delete(sb); vi_sysctls(vi); pa.pa_version = PFIL_VERSION; pa.pa_flags = PFIL_IN; pa.pa_type = PFIL_TYPE_ETHERNET; pa.pa_headname = ifp->if_xname; vi->pfil = pfil_head_register(&pa); return (0); } static int cxgbe_attach(device_t dev) { struct port_info *pi = device_get_softc(dev); struct adapter *sc = pi->adapter; struct vi_info *vi; int i, rc; rc = cxgbe_vi_attach(dev, &pi->vi[0]); if (rc) return (rc); for_each_vi(pi, i, vi) { if (i == 0) continue; vi->dev = device_add_child(dev, sc->names->vi_ifnet_name, -1); if (vi->dev == NULL) { device_printf(dev, "failed to add VI %d\n", i); continue; } device_set_softc(vi->dev, vi); } cxgbe_sysctls(pi); bus_generic_attach(dev); return (0); } static void cxgbe_vi_detach(struct vi_info *vi) { struct ifnet *ifp = vi->ifp; if (vi->pfil != NULL) { 
pfil_head_unregister(vi->pfil); vi->pfil = NULL; } ether_ifdetach(ifp); /* Let detach proceed even if these fail. */ #ifdef DEV_NETMAP if (ifp->if_capabilities & IFCAP_NETMAP) cxgbe_nm_detach(vi); #endif cxgbe_uninit_synchronized(vi); callout_drain(&vi->tick); vi_full_uninit(vi); if_free(vi->ifp); vi->ifp = NULL; } static int cxgbe_detach(device_t dev) { struct port_info *pi = device_get_softc(dev); struct adapter *sc = pi->adapter; int rc; /* Detach the extra VIs first. */ rc = bus_generic_detach(dev); if (rc) return (rc); device_delete_children(dev); doom_vi(sc, &pi->vi[0]); if (pi->flags & HAS_TRACEQ) { sc->traceq = -1; /* cloner should not create ifnet */ t4_tracer_port_detach(sc); } cxgbe_vi_detach(&pi->vi[0]); ifmedia_removeall(&pi->media); end_synchronized_op(sc, 0); return (0); } static void cxgbe_init(void *arg) { struct vi_info *vi = arg; struct adapter *sc = vi->adapter; if (begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4init") != 0) return; cxgbe_init_synchronized(vi); end_synchronized_op(sc, 0); } static int cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data) { int rc = 0, mtu, flags; struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct ifreq *ifr = (struct ifreq *)data; uint32_t mask; switch (cmd) { case SIOCSIFMTU: mtu = ifr->ifr_mtu; if (mtu < ETHERMIN || mtu > MAX_MTU) return (EINVAL); rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4mtu"); if (rc) return (rc); ifp->if_mtu = mtu; if (vi->flags & VI_INIT_DONE) { t4_update_fl_bufsize(ifp); if (!hw_off_limits(sc) && ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_MTU); } end_synchronized_op(sc, 0); break; case SIOCSIFFLAGS: rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4flg"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto fail; } if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { flags = vi->if_flags; if ((ifp->if_flags ^ flags) & (IFF_PROMISC | IFF_ALLMULTI)) { rc = update_mac_settings(ifp, XGMAC_PROMISC | XGMAC_ALLMULTI); } } else { rc = cxgbe_init_synchronized(vi); } vi->if_flags = ifp->if_flags; } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) { rc = cxgbe_uninit_synchronized(vi); } end_synchronized_op(sc, 0); break; case SIOCADDMULTI: case SIOCDELMULTI: rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4multi"); if (rc) return (rc); if (!hw_off_limits(sc) && ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_MCADDRS); end_synchronized_op(sc, 0); break; case SIOCSIFCAP: rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4cap"); if (rc) return (rc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); if (IFCAP_TSO4 & ifp->if_capenable && !(IFCAP_TXCSUM & ifp->if_capenable)) { mask &= ~IFCAP_TSO4; ifp->if_capenable &= ~IFCAP_TSO4; if_printf(ifp, "tso4 disabled due to -txcsum.\n"); } } if (mask & IFCAP_TXCSUM_IPV6) { ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); if (IFCAP_TSO6 & ifp->if_capenable && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { mask &= ~IFCAP_TSO6; ifp->if_capenable &= ~IFCAP_TSO6; if_printf(ifp, "tso6 disabled due to -txcsum6.\n"); } } if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; /* * Note that we leave CSUM_TSO alone (it is always set). 
The * kernel takes both IFCAP_TSOx and CSUM_TSO into account before * sending a TSO request our way, so it's sufficient to toggle * IFCAP_TSOx only. */ if (mask & IFCAP_TSO4) { if (!(IFCAP_TSO4 & ifp->if_capenable) && !(IFCAP_TXCSUM & ifp->if_capenable)) { if_printf(ifp, "enable txcsum first.\n"); rc = EAGAIN; goto fail; } ifp->if_capenable ^= IFCAP_TSO4; } if (mask & IFCAP_TSO6) { if (!(IFCAP_TSO6 & ifp->if_capenable) && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { if_printf(ifp, "enable txcsum6 first.\n"); rc = EAGAIN; goto fail; } ifp->if_capenable ^= IFCAP_TSO6; } if (mask & IFCAP_LRO) { #if defined(INET) || defined(INET6) int i; struct sge_rxq *rxq; ifp->if_capenable ^= IFCAP_LRO; for_each_rxq(vi, i, rxq) { if (ifp->if_capenable & IFCAP_LRO) rxq->iq.flags |= IQ_LRO_ENABLED; else rxq->iq.flags &= ~IQ_LRO_ENABLED; } #endif } #ifdef TCP_OFFLOAD if (mask & IFCAP_TOE) { int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE; rc = toe_capability(vi, enable); if (rc != 0) goto fail; ifp->if_capenable ^= mask; } #endif if (mask & IFCAP_VLAN_HWTAGGING) { ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_VLANEX); } if (mask & IFCAP_VLAN_MTU) { ifp->if_capenable ^= IFCAP_VLAN_MTU; /* Need to find out how to disable auto-mtu-inflation */ } if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (mask & IFCAP_VLAN_HWCSUM) ifp->if_capenable ^= IFCAP_VLAN_HWCSUM; #ifdef RATELIMIT if (mask & IFCAP_TXRTLMT) ifp->if_capenable ^= IFCAP_TXRTLMT; #endif if (mask & IFCAP_HWRXTSTMP) { int i; struct sge_rxq *rxq; ifp->if_capenable ^= IFCAP_HWRXTSTMP; for_each_rxq(vi, i, rxq) { if (ifp->if_capenable & IFCAP_HWRXTSTMP) rxq->iq.flags |= IQ_RX_TIMESTAMP; else rxq->iq.flags &= ~IQ_RX_TIMESTAMP; } } if (mask & IFCAP_MEXTPG) ifp->if_capenable ^= IFCAP_MEXTPG; #ifdef KERN_TLS if (mask & IFCAP_TXTLS) { int enable = (ifp->if_capenable ^ mask) & IFCAP_TXTLS; rc = ktls_capability(sc, enable); if (rc != 0) goto fail; ifp->if_capenable ^= (mask & IFCAP_TXTLS); } #endif if (mask & IFCAP_VXLAN_HWCSUM) { ifp->if_capenable ^= IFCAP_VXLAN_HWCSUM; ifp->if_hwassist ^= CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP; } if (mask & IFCAP_VXLAN_HWTSO) { ifp->if_capenable ^= IFCAP_VXLAN_HWTSO; ifp->if_hwassist ^= CSUM_INNER_IP6_TSO | CSUM_INNER_IP_TSO; } #ifdef VLAN_CAPABILITIES VLAN_CAPABILITIES(ifp); #endif fail: end_synchronized_op(sc, 0); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: ifmedia_ioctl(ifp, ifr, &pi->media, cmd); break; case SIOCGI2C: { struct ifi2creq i2c; rc = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c)); if (rc != 0) break; if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) { rc = EPERM; break; } if (i2c.len > sizeof(i2c.data)) { rc = EINVAL; break; } rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4i2c"); if (rc) return (rc); if (hw_off_limits(sc)) rc = ENXIO; else rc = -t4_i2c_rd(sc, sc->mbox, pi->port_id, i2c.dev_addr, i2c.offset, i2c.len, &i2c.data[0]); end_synchronized_op(sc, 0); if (rc == 0) rc = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c)); break; } default: rc = ether_ioctl(ifp, cmd, data); } return (rc); } static int cxgbe_transmit(struct ifnet *ifp, struct mbuf *m) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc; struct sge_txq *txq; void *items[1]; int rc; M_ASSERTPKTHDR(m); MPASS(m->m_nextpkt == NULL); /* not quite ready for this yet */ #if defined(KERN_TLS) || defined(RATELIMIT) if 
(m->m_pkthdr.csum_flags & CSUM_SND_TAG) MPASS(m->m_pkthdr.snd_tag->ifp == ifp); #endif if (__predict_false(pi->link_cfg.link_ok == false)) { m_freem(m); return (ENETDOWN); } rc = parse_pkt(&m, vi->flags & TX_USES_VM_WR); if (__predict_false(rc != 0)) { MPASS(m == NULL); /* was freed already */ atomic_add_int(&pi->tx_parse_error, 1); /* rare, atomic is ok */ return (rc); } #ifdef RATELIMIT if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { if (m->m_pkthdr.snd_tag->type == IF_SND_TAG_TYPE_RATE_LIMIT) return (ethofld_transmit(ifp, m)); } #endif /* Select a txq. */ sc = vi->adapter; txq = &sc->sge.txq[vi->first_txq]; if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) txq += ((m->m_pkthdr.flowid % (vi->ntxq - vi->rsrv_noflowq)) + vi->rsrv_noflowq); items[0] = m; rc = mp_ring_enqueue(txq->r, items, 1, 256); if (__predict_false(rc != 0)) m_freem(m); return (rc); } static void cxgbe_qflush(struct ifnet *ifp) { struct vi_info *vi = ifp->if_softc; struct sge_txq *txq; int i; /* queues do not exist if !VI_INIT_DONE. */ if (vi->flags & VI_INIT_DONE) { for_each_txq(vi, i, txq) { TXQ_LOCK(txq); txq->eq.flags |= EQ_QFLUSH; TXQ_UNLOCK(txq); while (!mp_ring_is_idle(txq->r)) { mp_ring_check_drainage(txq->r, 4096); pause("qflush", 1); } TXQ_LOCK(txq); txq->eq.flags &= ~EQ_QFLUSH; TXQ_UNLOCK(txq); } } if_qflush(ifp); } static uint64_t vi_get_counter(struct ifnet *ifp, ift_counter c) { struct vi_info *vi = ifp->if_softc; struct fw_vi_stats_vf *s = &vi->stats; mtx_lock(&vi->tick_mtx); vi_refresh_stats(vi); mtx_unlock(&vi->tick_mtx); switch (c) { case IFCOUNTER_IPACKETS: return (s->rx_bcast_frames + s->rx_mcast_frames + s->rx_ucast_frames); case IFCOUNTER_IERRORS: return (s->rx_err_frames); case IFCOUNTER_OPACKETS: return (s->tx_bcast_frames + s->tx_mcast_frames + s->tx_ucast_frames + s->tx_offload_frames); case IFCOUNTER_OERRORS: return (s->tx_drop_frames); case IFCOUNTER_IBYTES: return (s->rx_bcast_bytes + s->rx_mcast_bytes + s->rx_ucast_bytes); case IFCOUNTER_OBYTES: return (s->tx_bcast_bytes + s->tx_mcast_bytes + s->tx_ucast_bytes + s->tx_offload_bytes); case IFCOUNTER_IMCASTS: return (s->rx_mcast_frames); case IFCOUNTER_OMCASTS: return (s->tx_mcast_frames); case IFCOUNTER_OQDROPS: { uint64_t drops; drops = 0; if (vi->flags & VI_INIT_DONE) { int i; struct sge_txq *txq; for_each_txq(vi, i, txq) drops += counter_u64_fetch(txq->r->dropped); } return (drops); } default: return (if_get_counter_default(ifp, c)); } } static uint64_t cxgbe_get_counter(struct ifnet *ifp, ift_counter c) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct port_stats *s = &pi->stats; mtx_lock(&vi->tick_mtx); cxgbe_refresh_stats(vi); mtx_unlock(&vi->tick_mtx); switch (c) { case IFCOUNTER_IPACKETS: return (s->rx_frames); case IFCOUNTER_IERRORS: return (s->rx_jabber + s->rx_runt + s->rx_too_long + s->rx_fcs_err + s->rx_len_err); case IFCOUNTER_OPACKETS: return (s->tx_frames); case IFCOUNTER_OERRORS: return (s->tx_error_frames); case IFCOUNTER_IBYTES: return (s->rx_octets); case IFCOUNTER_OBYTES: return (s->tx_octets); case IFCOUNTER_IMCASTS: return (s->rx_mcast_frames); case IFCOUNTER_OMCASTS: return (s->tx_mcast_frames); case IFCOUNTER_IQDROPS: return (s->rx_ovflow0 + s->rx_ovflow1 + s->rx_ovflow2 + s->rx_ovflow3 + s->rx_trunc0 + s->rx_trunc1 + s->rx_trunc2 + s->rx_trunc3 + pi->tnl_cong_drops); case IFCOUNTER_OQDROPS: { uint64_t drops; drops = s->tx_drop; if (vi->flags & VI_INIT_DONE) { int i; struct sge_txq *txq; for_each_txq(vi, i, txq) drops += counter_u64_fetch(txq->r->dropped); } return (drops); } default: return 
(if_get_counter_default(ifp, c)); } } #if defined(KERN_TLS) || defined(RATELIMIT) static int cxgbe_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **pt) { int error; switch (params->hdr.type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: error = cxgbe_rate_tag_alloc(ifp, params, pt); break; #endif #ifdef KERN_TLS case IF_SND_TAG_TYPE_TLS: error = cxgbe_tls_tag_alloc(ifp, params, pt); break; #endif default: error = EOPNOTSUPP; } return (error); } static int cxgbe_snd_tag_modify(struct m_snd_tag *mst, union if_snd_tag_modify_params *params) { switch (mst->type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: return (cxgbe_rate_tag_modify(mst, params)); #endif default: return (EOPNOTSUPP); } } static int cxgbe_snd_tag_query(struct m_snd_tag *mst, union if_snd_tag_query_params *params) { switch (mst->type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: return (cxgbe_rate_tag_query(mst, params)); #endif default: return (EOPNOTSUPP); } } static void cxgbe_snd_tag_free(struct m_snd_tag *mst) { switch (mst->type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: cxgbe_rate_tag_free(mst); return; #endif #ifdef KERN_TLS case IF_SND_TAG_TYPE_TLS: cxgbe_tls_tag_free(mst); return; #endif default: panic("shouldn't get here"); } } #endif /* * The kernel picks a media from the list we have provided, but we still * validate the requested media. */ int cxgbe_media_change(struct ifnet *ifp) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct ifmedia *ifm = &pi->media; struct link_config *lc = &pi->link_cfg; struct adapter *sc = pi->adapter; int rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mec"); if (rc != 0) return (rc); PORT_LOCK(pi); if (IFM_SUBTYPE(ifm->ifm_media) == IFM_AUTO) { /* ifconfig .. media autoselect */ if (!(lc->pcaps & FW_PORT_CAP32_ANEG)) { rc = ENOTSUP; /* AN not supported by transceiver */ goto done; } lc->requested_aneg = AUTONEG_ENABLE; lc->requested_speed = 0; lc->requested_fc |= PAUSE_AUTONEG; } else { lc->requested_aneg = AUTONEG_DISABLE; lc->requested_speed = ifmedia_baudrate(ifm->ifm_media) / 1000000; lc->requested_fc = 0; if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_RXPAUSE) lc->requested_fc |= PAUSE_RX; if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_TXPAUSE) lc->requested_fc |= PAUSE_TX; } if (pi->up_vis > 0) { fixup_link_config(pi); rc = apply_link_config(pi); } done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); return (rc); } /* * Base media word (without ETHER, pause, link active, etc.) for the port at the * given speed.
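 *
 * Example (illustrative): on an SFP28 port with an SR module,
 * port_mword(pi, FW_PORT_CAP32_SPEED_25G) returns IFM_25G_SR; the caller
 * (see cxgbe_media_status()) ORs in IFM_ETHER, duplex, and pause bits
 * itself.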
*/ static int port_mword(struct port_info *pi, uint32_t speed) { MPASS(speed & M_FW_PORT_CAP32_SPEED); MPASS(powerof2(speed)); switch(pi->port_type) { case FW_PORT_TYPE_BT_SGMII: case FW_PORT_TYPE_BT_XFI: case FW_PORT_TYPE_BT_XAUI: /* BaseT */ switch (speed) { case FW_PORT_CAP32_SPEED_100M: return (IFM_100_T); case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_T); case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_T); } break; case FW_PORT_TYPE_KX4: if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_KX4); break; case FW_PORT_TYPE_CX4: if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_CX4); break; case FW_PORT_TYPE_KX: if (speed == FW_PORT_CAP32_SPEED_1G) return (IFM_1000_KX); break; case FW_PORT_TYPE_KR: case FW_PORT_TYPE_BP_AP: case FW_PORT_TYPE_BP4_AP: case FW_PORT_TYPE_BP40_BA: case FW_PORT_TYPE_KR4_100G: case FW_PORT_TYPE_KR_SFP28: case FW_PORT_TYPE_KR_XLAUI: switch (speed) { case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_KX); case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_KR); case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_KR); case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_KR4); case FW_PORT_CAP32_SPEED_50G: return (IFM_50G_KR2); case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_KR4); } break; case FW_PORT_TYPE_FIBER_XFI: case FW_PORT_TYPE_FIBER_XAUI: case FW_PORT_TYPE_SFP: case FW_PORT_TYPE_QSFP_10G: case FW_PORT_TYPE_QSA: case FW_PORT_TYPE_QSFP: case FW_PORT_TYPE_CR4_QSFP: case FW_PORT_TYPE_CR_QSFP: case FW_PORT_TYPE_CR2_QSFP: case FW_PORT_TYPE_SFP28: /* Pluggable transceiver */ switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: switch (speed) { case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_LX); case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_LR); case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_LR); case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_LR4); case FW_PORT_CAP32_SPEED_50G: return (IFM_50G_LR2); case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_LR4); } break; case FW_PORT_MOD_TYPE_SR: switch (speed) { case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_SX); case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_SR); case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_SR); case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_SR4); case FW_PORT_CAP32_SPEED_50G: return (IFM_50G_SR2); case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_SR4); } break; case FW_PORT_MOD_TYPE_ER: if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_ER); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: switch (speed) { case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_CX); case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_TWINAX); case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_CR); case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_CR4); case FW_PORT_CAP32_SPEED_50G: return (IFM_50G_CR2); case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_CR4); } break; case FW_PORT_MOD_TYPE_LRM: if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_LRM); break; case FW_PORT_MOD_TYPE_NA: MPASS(0); /* Not pluggable? 
*/ /* fall through */ case FW_PORT_MOD_TYPE_ERROR: case FW_PORT_MOD_TYPE_UNKNOWN: case FW_PORT_MOD_TYPE_NOTSUPPORTED: break; case FW_PORT_MOD_TYPE_NONE: return (IFM_NONE); } break; case FW_PORT_TYPE_NONE: return (IFM_NONE); } return (IFM_UNKNOWN); } void cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4med") != 0) return; PORT_LOCK(pi); if (pi->up_vis == 0) { /* * If all the interfaces are administratively down, the firmware * does not report transceiver changes. Refresh port info here * so that ifconfig displays accurate ifmedia at all times. * This is the only reason we have a synchronized op in this * function. Just PORT_LOCK would have been enough otherwise. */ t4_update_port_info(pi); build_medialist(pi); } /* ifm_status */ ifmr->ifm_status = IFM_AVALID; if (lc->link_ok == false) goto done; ifmr->ifm_status |= IFM_ACTIVE; /* ifm_active */ ifmr->ifm_active = IFM_ETHER | IFM_FDX; ifmr->ifm_active &= ~(IFM_ETH_TXPAUSE | IFM_ETH_RXPAUSE); if (lc->fc & PAUSE_RX) ifmr->ifm_active |= IFM_ETH_RXPAUSE; if (lc->fc & PAUSE_TX) ifmr->ifm_active |= IFM_ETH_TXPAUSE; ifmr->ifm_active |= port_mword(pi, speed_to_fwcap(lc->speed)); done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); } static int vcxgbe_probe(device_t dev) { char buf[128]; struct vi_info *vi = device_get_softc(dev); snprintf(buf, sizeof(buf), "port %d vi %td", vi->pi->port_id, vi - vi->pi->vi); device_set_desc_copy(dev, buf); return (BUS_PROBE_DEFAULT); } static int alloc_extra_vi(struct adapter *sc, struct port_info *pi, struct vi_info *vi) { int func, index, rc; uint32_t param, val; ASSERT_SYNCHRONIZED_OP(sc); index = vi - pi->vi; MPASS(index > 0); /* This function deals with _extra_ VIs only */ KASSERT(index < nitems(vi_mac_funcs), ("%s: VI %s doesn't have a MAC func", __func__, device_get_nameunit(vi->dev))); func = vi_mac_funcs[index]; rc = t4_alloc_vi_func(sc, sc->mbox, pi->tx_chan, sc->pf, 0, 1, vi->hw_addr, &vi->rss_size, &vi->vfvld, &vi->vin, func, 0); if (rc < 0) { CH_ERR(vi, "failed to allocate virtual interface %d " "for port %d: %d\n", index, pi->port_id, -rc); return (-rc); } vi->viid = rc; if (vi->rss_size == 1) { /* * This VI didn't get a slice of the RSS table. Reduce the * number of VIs being created (hw.cxgbe.num_vis) or modify the * configuration file (nvi, rssnvi for this PF) if this is a * problem.
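* (The VI is still created and usable; it simply has no RSS slice of its own.)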
*/ device_printf(vi->dev, "RSS table not available.\n"); vi->rss_base = 0xffff; return (0); } param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_RSSINFO) | V_FW_PARAMS_PARAM_YZ(vi->viid); rc = t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val); if (rc) vi->rss_base = 0xffff; else { MPASS((val >> 16) == vi->rss_size); vi->rss_base = val & 0xffff; } return (0); } static int vcxgbe_attach(device_t dev) { struct vi_info *vi; struct port_info *pi; struct adapter *sc; int rc; vi = device_get_softc(dev); pi = vi->pi; sc = pi->adapter; rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4via"); if (rc) return (rc); rc = alloc_extra_vi(sc, pi, vi); end_synchronized_op(sc, 0); if (rc) return (rc); rc = cxgbe_vi_attach(dev, vi); if (rc) { t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid); return (rc); } return (0); } static int vcxgbe_detach(device_t dev) { struct vi_info *vi; struct adapter *sc; vi = device_get_softc(dev); sc = vi->adapter; doom_vi(sc, vi); cxgbe_vi_detach(vi); t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid); end_synchronized_op(sc, 0); return (0); } static struct callout fatal_callout; static struct taskqueue *reset_tq; static void delayed_panic(void *arg) { struct adapter *sc = arg; panic("%s: panic on fatal error", device_get_nameunit(sc->dev)); } void t4_fatal_err(struct adapter *sc, bool fw_error) { t4_shutdown_adapter(sc); log(LOG_ALERT, "%s: encountered fatal error, adapter stopped.\n", device_get_nameunit(sc->dev)); if (fw_error) { if (sc->flags & CHK_MBOX_ACCESS) ASSERT_SYNCHRONIZED_OP(sc); sc->flags |= ADAP_ERR; } else { ADAPTER_LOCK(sc); sc->flags |= ADAP_ERR; ADAPTER_UNLOCK(sc); } #ifdef TCP_OFFLOAD taskqueue_enqueue(taskqueue_thread, &sc->async_event_task); #endif if (t4_panic_on_fatal_err) { CH_ALERT(sc, "panicking on fatal error (after 30s).\n"); callout_reset(&fatal_callout, hz * 30, delayed_panic, sc); } else if (t4_reset_on_fatal_err) { CH_ALERT(sc, "resetting on fatal error.\n"); taskqueue_enqueue(reset_tq, &sc->reset_task); } } void t4_add_adapter(struct adapter *sc) { sx_xlock(&t4_list_lock); SLIST_INSERT_HEAD(&t4_list, sc, link); sx_xunlock(&t4_list_lock); } int t4_map_bars_0_and_4(struct adapter *sc) { sc->regs_rid = PCIR_BAR(0); sc->regs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->regs_rid, RF_ACTIVE); if (sc->regs_res == NULL) { device_printf(sc->dev, "cannot map registers.\n"); return (ENXIO); } sc->bt = rman_get_bustag(sc->regs_res); sc->bh = rman_get_bushandle(sc->regs_res); sc->mmio_len = rman_get_size(sc->regs_res); setbit(&sc->doorbells, DOORBELL_KDB); sc->msix_rid = PCIR_BAR(4); sc->msix_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->msix_rid, RF_ACTIVE); if (sc->msix_res == NULL) { device_printf(sc->dev, "cannot map MSI-X BAR.\n"); return (ENXIO); } return (0); } int t4_map_bar_2(struct adapter *sc) { /* * T4: only the iWARP driver uses the userspace doorbells. There is no * need to map the BAR if RDMA is disabled. */ if (is_t4(sc) && sc->rdmacaps == 0) return (0); sc->udbs_rid = PCIR_BAR(2); sc->udbs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->udbs_rid, RF_ACTIVE); if (sc->udbs_res == NULL) { device_printf(sc->dev, "cannot map doorbell BAR.\n"); return (ENXIO); } sc->udbs_base = rman_get_virtual(sc->udbs_res); if (chip_id(sc) >= CHELSIO_T5) { setbit(&sc->doorbells, DOORBELL_UDB); #if defined(__i386__) || defined(__amd64__) if (t5_write_combine) { int rc, mode; /* * Enable write combining on BAR2.
This is the * userspace doorbell BAR and is split into 128B * (UDBS_SEG_SIZE) doorbell regions, each associated * with an egress queue. The first 64B has the doorbell * and the second 64B can be used to submit a tx work * request with an implicit doorbell. */ rc = pmap_change_attr((vm_offset_t)sc->udbs_base, rman_get_size(sc->udbs_res), PAT_WRITE_COMBINING); if (rc == 0) { clrbit(&sc->doorbells, DOORBELL_UDB); setbit(&sc->doorbells, DOORBELL_WCWR); setbit(&sc->doorbells, DOORBELL_UDBWC); } else { device_printf(sc->dev, "couldn't enable write combining: %d\n", rc); } mode = is_t5(sc) ? V_STATMODE(0) : V_T6_STATMODE(0); t4_write_reg(sc, A_SGE_STAT_CFG, V_STATSOURCE_T5(7) | mode); } #endif } sc->iwt.wc_en = isset(&sc->doorbells, DOORBELL_UDBWC) ? 1 : 0; return (0); } struct memwin_init { uint32_t base; uint32_t aperture; }; static const struct memwin_init t4_memwin[NUM_MEMWIN] = { { MEMWIN0_BASE, MEMWIN0_APERTURE }, { MEMWIN1_BASE, MEMWIN1_APERTURE }, { MEMWIN2_BASE_T4, MEMWIN2_APERTURE_T4 } }; static const struct memwin_init t5_memwin[NUM_MEMWIN] = { { MEMWIN0_BASE, MEMWIN0_APERTURE }, { MEMWIN1_BASE, MEMWIN1_APERTURE }, { MEMWIN2_BASE_T5, MEMWIN2_APERTURE_T5 }, }; static void setup_memwin(struct adapter *sc) { const struct memwin_init *mw_init; struct memwin *mw; int i; uint32_t bar0; if (is_t4(sc)) { /* * Read low 32b of bar0 indirectly via the hardware backdoor * mechanism. Works from within PCI passthrough environments * too, where rman_get_start() can return a different value. We * need to program the T4 memory window decoders with the actual * addresses that will be coming across the PCIe link. */ bar0 = t4_hw_pci_read_cfg4(sc, PCIR_BAR(0)); bar0 &= (uint32_t) PCIM_BAR_MEM_BASE; mw_init = &t4_memwin[0]; } else { /* T5+ use the relative offset inside the PCIe BAR */ bar0 = 0; mw_init = &t5_memwin[0]; } for (i = 0, mw = &sc->memwin[0]; i < NUM_MEMWIN; i++, mw_init++, mw++) { if (!rw_initialized(&mw->mw_lock)) { rw_init(&mw->mw_lock, "memory window access"); mw->mw_base = mw_init->base; mw->mw_aperture = mw_init->aperture; mw->mw_curpos = 0; } t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, i), (mw->mw_base + bar0) | V_BIR(0) | V_WINDOW(ilog2(mw->mw_aperture) - 10)); rw_wlock(&mw->mw_lock); position_memwin(sc, i, mw->mw_curpos); rw_wunlock(&mw->mw_lock); } /* flush */ t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, 2)); } /* * Positions the memory window at the given address in the card's address space. * There are some alignment requirements and the actual position may be at an * address prior to the requested address. mw->mw_curpos always has the actual * position of the window. 
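* For example, on T5+ the start must be 128B aligned, so a request for * address 0x12345 puts the window at 0x12300 and the caller works with the * remaining 0x45 as an offset into the aperture.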
*/ static void position_memwin(struct adapter *sc, int idx, uint32_t addr) { struct memwin *mw; uint32_t pf; uint32_t reg; MPASS(idx >= 0 && idx < NUM_MEMWIN); mw = &sc->memwin[idx]; rw_assert(&mw->mw_lock, RA_WLOCKED); if (is_t4(sc)) { pf = 0; mw->mw_curpos = addr & ~0xf; /* start must be 16B aligned */ } else { pf = V_PFNUM(sc->pf); mw->mw_curpos = addr & ~0x7f; /* start must be 128B aligned */ } reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, idx); t4_write_reg(sc, reg, mw->mw_curpos | pf); t4_read_reg(sc, reg); /* flush */ } int rw_via_memwin(struct adapter *sc, int idx, uint32_t addr, uint32_t *val, int len, int rw) { struct memwin *mw; uint32_t mw_end, v; MPASS(idx >= 0 && idx < NUM_MEMWIN); /* Memory can only be accessed in naturally aligned 4 byte units */ if (addr & 3 || len & 3 || len <= 0) return (EINVAL); mw = &sc->memwin[idx]; while (len > 0) { rw_rlock(&mw->mw_lock); mw_end = mw->mw_curpos + mw->mw_aperture; if (addr >= mw_end || addr < mw->mw_curpos) { /* Will need to reposition the window */ if (!rw_try_upgrade(&mw->mw_lock)) { rw_runlock(&mw->mw_lock); rw_wlock(&mw->mw_lock); } rw_assert(&mw->mw_lock, RA_WLOCKED); position_memwin(sc, idx, addr); rw_downgrade(&mw->mw_lock); mw_end = mw->mw_curpos + mw->mw_aperture; } rw_assert(&mw->mw_lock, RA_RLOCKED); while (addr < mw_end && len > 0) { if (rw == 0) { v = t4_read_reg(sc, mw->mw_base + addr - mw->mw_curpos); *val++ = le32toh(v); } else { v = *val++; t4_write_reg(sc, mw->mw_base + addr - mw->mw_curpos, htole32(v)); } addr += 4; len -= 4; } rw_runlock(&mw->mw_lock); } return (0); } static void t4_init_atid_table(struct adapter *sc) { struct tid_info *t; int i; t = &sc->tids; if (t->natids == 0) return; MPASS(t->atid_tab == NULL); t->atid_tab = malloc(t->natids * sizeof(*t->atid_tab), M_CXGBE, M_ZERO | M_WAITOK); mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF); t->afree = t->atid_tab; t->atids_in_use = 0; for (i = 1; i < t->natids; i++) t->atid_tab[i - 1].next = &t->atid_tab[i]; t->atid_tab[t->natids - 1].next = NULL; } static void t4_free_atid_table(struct adapter *sc) { struct tid_info *t; t = &sc->tids; KASSERT(t->atids_in_use == 0, ("%s: %d atids still in use.", __func__, t->atids_in_use)); if (mtx_initialized(&t->atid_lock)) mtx_destroy(&t->atid_lock); free(t->atid_tab, M_CXGBE); t->atid_tab = NULL; } int alloc_atid(struct adapter *sc, void *ctx) { struct tid_info *t = &sc->tids; int atid = -1; mtx_lock(&t->atid_lock); if (t->afree) { union aopen_entry *p = t->afree; atid = p - t->atid_tab; MPASS(atid <= M_TID_TID); t->afree = p->next; p->data = ctx; t->atids_in_use++; } mtx_unlock(&t->atid_lock); return (atid); } void * lookup_atid(struct adapter *sc, int atid) { struct tid_info *t = &sc->tids; return (t->atid_tab[atid].data); } void free_atid(struct adapter *sc, int atid) { struct tid_info *t = &sc->tids; union aopen_entry *p = &t->atid_tab[atid]; mtx_lock(&t->atid_lock); p->next = t->afree; t->afree = p; t->atids_in_use--; mtx_unlock(&t->atid_lock); } static void queue_tid_release(struct adapter *sc, int tid) { CXGBE_UNIMPLEMENTED("deferred tid release"); } void release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq) { struct wrqe *wr; struct cpl_tid_release *req; wr = alloc_wrqe(sizeof(*req), ctrlq); if (wr == NULL) { queue_tid_release(sc, tid); /* defer */ return; } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid); t4_wrq_tx(sc, wr); } static int t4_range_cmp(const void *a, const void *b) { return ((const struct t4_range *)a)->start - ((const struct t4_range *)b)->start; } /* * Verify 
that the memory range specified by the addr/len pair is valid within * the card's address space. */ static int validate_mem_range(struct adapter *sc, uint32_t addr, uint32_t len) { struct t4_range mem_ranges[4], *r, *next; uint32_t em, addr_len; int i, n, remaining; /* Memory can only be accessed in naturally aligned 4 byte units */ if (addr & 3 || len & 3 || len == 0) return (EINVAL); /* Enabled memories */ em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); r = &mem_ranges[0]; n = 0; bzero(r, sizeof(mem_ranges)); if (em & F_EDRAM0_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR); r->size = G_EDRAM0_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EDRAM0_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } if (em & F_EDRAM1_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR); r->size = G_EDRAM1_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EDRAM1_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } if (em & F_EXT_MEM_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); r->size = G_EXT_MEM_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EXT_MEM_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } if (is_t5(sc) && em & F_EXT_MEM1_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); r->size = G_EXT_MEM1_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EXT_MEM1_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } MPASS(n <= nitems(mem_ranges)); if (n > 1) { /* Sort and merge the ranges. */ qsort(mem_ranges, n, sizeof(struct t4_range), t4_range_cmp); /* Start from index 0 and examine the next n - 1 entries. */ r = &mem_ranges[0]; for (remaining = n - 1; remaining > 0; remaining--, r++) { MPASS(r->size > 0); /* r is a valid entry. */ next = r + 1; MPASS(next->size > 0); /* and so is the next one. */ while (r->start + r->size >= next->start) { /* Merge the next one into the current entry. */ r->size = max(r->start + r->size, next->start + next->size) - r->start; n--; /* One fewer entry in total. */ if (--remaining == 0) goto done; /* short circuit */ next++; } if (next != r + 1) { /* * Some entries were merged into r and next * points to the first valid entry that couldn't * be merged. */ MPASS(next->size > 0); /* must be valid */ memcpy(r + 1, next, remaining * sizeof(*r)); #ifdef INVARIANTS /* * This is so that the foo->size assertion in the * next iteration of the loop does the right * thing for entries that were pulled up and are * no longer valid. */ MPASS(n < nitems(mem_ranges)); bzero(&mem_ranges[n], (nitems(mem_ranges) - n) * sizeof(struct t4_range)); #endif } } done: /* Done merging the ranges. */ MPASS(n > 0); r = &mem_ranges[0]; for (i = 0; i < n; i++, r++) { if (addr >= r->start && addr + len <= r->start + r->size) return (0); } } return (EFAULT); } static int fwmtype_to_hwmtype(int mtype) { switch (mtype) { case FW_MEMTYPE_EDC0: return (MEM_EDC0); case FW_MEMTYPE_EDC1: return (MEM_EDC1); case FW_MEMTYPE_EXTMEM: return (MEM_MC0); case FW_MEMTYPE_EXTMEM1: return (MEM_MC1); default: panic("%s: cannot translate fw mtype %d.", __func__, mtype); } } /* * Verify that the memory range specified by the memtype/offset/len pair is * valid and lies entirely within the memtype specified. The global address of * the start of the range is returned in addr.
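* e.g., (FW_MEMTYPE_EDC0, off) resolves to the global address * (G_EDRAM0_BASE << 20) + off, which is then checked with validate_mem_range().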
*/ static int validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, uint32_t len, uint32_t *addr) { uint32_t em, addr_len, maddr; /* Memory can only be accessed in naturally aligned 4 byte units */ if (off & 3 || len & 3 || len == 0) return (EINVAL); em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); switch (fwmtype_to_hwmtype(mtype)) { case MEM_EDC0: if (!(em & F_EDRAM0_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR); maddr = G_EDRAM0_BASE(addr_len) << 20; break; case MEM_EDC1: if (!(em & F_EDRAM1_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR); maddr = G_EDRAM1_BASE(addr_len) << 20; break; case MEM_MC: if (!(em & F_EXT_MEM_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); maddr = G_EXT_MEM_BASE(addr_len) << 20; break; case MEM_MC1: if (!is_t5(sc) || !(em & F_EXT_MEM1_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); maddr = G_EXT_MEM1_BASE(addr_len) << 20; break; default: return (EINVAL); } *addr = maddr + off; /* global address */ return (validate_mem_range(sc, *addr, len)); } static int fixup_devlog_params(struct adapter *sc) { struct devlog_params *dparams = &sc->params.devlog; int rc; rc = validate_mt_off_len(sc, dparams->memtype, dparams->start, dparams->size, &dparams->addr); return (rc); } static void update_nirq(struct intrs_and_queues *iaq, int nports) { iaq->nirq = T4_EXTRA_INTR; iaq->nirq += nports * max(iaq->nrxq, iaq->nnmrxq); iaq->nirq += nports * iaq->nofldrxq; iaq->nirq += nports * (iaq->num_vis - 1) * max(iaq->nrxq_vi, iaq->nnmrxq_vi); iaq->nirq += nports * (iaq->num_vis - 1) * iaq->nofldrxq_vi; } /* * Adjust requirements to fit the number of interrupts available. */ static void calculate_iaq(struct adapter *sc, struct intrs_and_queues *iaq, int itype, int navail) { int old_nirq; const int nports = sc->params.nports; MPASS(nports > 0); MPASS(navail > 0); bzero(iaq, sizeof(*iaq)); iaq->intr_type = itype; iaq->num_vis = t4_num_vis; iaq->ntxq = t4_ntxq; iaq->ntxq_vi = t4_ntxq_vi; iaq->nrxq = t4_nrxq; iaq->nrxq_vi = t4_nrxq_vi; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) if (is_offload(sc) || is_ethoffload(sc)) { iaq->nofldtxq = t4_nofldtxq; iaq->nofldtxq_vi = t4_nofldtxq_vi; } #endif #ifdef TCP_OFFLOAD if (is_offload(sc)) { iaq->nofldrxq = t4_nofldrxq; iaq->nofldrxq_vi = t4_nofldrxq_vi; } #endif #ifdef DEV_NETMAP if (t4_native_netmap & NN_MAIN_VI) { iaq->nnmtxq = t4_nnmtxq; iaq->nnmrxq = t4_nnmrxq; } if (t4_native_netmap & NN_EXTRA_VI) { iaq->nnmtxq_vi = t4_nnmtxq_vi; iaq->nnmrxq_vi = t4_nnmrxq_vi; } #endif update_nirq(iaq, nports); if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) { /* * This is the normal case -- there are enough interrupts for * everything. */ goto done; } /* * If extra VIs have been configured try reducing their count and see if * that works. */ while (iaq->num_vis > 1) { iaq->num_vis--; update_nirq(iaq, nports); if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) { device_printf(sc->dev, "virtual interfaces per port " "reduced to %d from %d. nrxq=%u, nofldrxq=%u, " "nrxq_vi=%u nofldrxq_vi=%u, nnmrxq_vi=%u. " "itype %d, navail %u, nirq %d.\n", iaq->num_vis, t4_num_vis, iaq->nrxq, iaq->nofldrxq, iaq->nrxq_vi, iaq->nofldrxq_vi, iaq->nnmrxq_vi, itype, navail, iaq->nirq); goto done; } } /* * Extra VIs will not be created. Log a message if they were requested. 
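* (Each extra VI would have cost nports * max(nrxq_vi, nnmrxq_vi) plus * nports * nofldrxq_vi vectors out of the budget computed by update_nirq().)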
*/ MPASS(iaq->num_vis == 1); iaq->ntxq_vi = iaq->nrxq_vi = 0; iaq->nofldtxq_vi = iaq->nofldrxq_vi = 0; iaq->nnmtxq_vi = iaq->nnmrxq_vi = 0; if (iaq->num_vis != t4_num_vis) { device_printf(sc->dev, "extra virtual interfaces disabled. " "nrxq=%u, nofldrxq=%u, nrxq_vi=%u nofldrxq_vi=%u, " "nnmrxq_vi=%u. itype %d, navail %u, nirq %d.\n", iaq->nrxq, iaq->nofldrxq, iaq->nrxq_vi, iaq->nofldrxq_vi, iaq->nnmrxq_vi, itype, navail, iaq->nirq); } /* * Keep reducing the number of NIC rx queues to the next lower power of * 2 (for even RSS distribution) and halving the TOE rx queues, and see * if that works. */ do { if (iaq->nrxq > 1) { do { iaq->nrxq--; } while (!powerof2(iaq->nrxq)); if (iaq->nnmrxq > iaq->nrxq) iaq->nnmrxq = iaq->nrxq; } if (iaq->nofldrxq > 1) iaq->nofldrxq >>= 1; old_nirq = iaq->nirq; update_nirq(iaq, nports); if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) { device_printf(sc->dev, "running with reduced number of " "rx queues because of shortage of interrupts. " "nrxq=%u, nofldrxq=%u. " "itype %d, navail %u, nirq %d.\n", iaq->nrxq, iaq->nofldrxq, itype, navail, iaq->nirq); goto done; } } while (old_nirq != iaq->nirq); /* One interrupt for everything. Ugh. */ device_printf(sc->dev, "running with minimal number of queues. " "itype %d, navail %u.\n", itype, navail); iaq->nirq = 1; iaq->nrxq = 1; iaq->ntxq = 1; if (iaq->nofldrxq > 0) { iaq->nofldrxq = 1; iaq->nofldtxq = 1; } iaq->nnmtxq = 0; iaq->nnmrxq = 0; done: MPASS(iaq->num_vis > 0); if (iaq->num_vis > 1) { MPASS(iaq->nrxq_vi > 0); MPASS(iaq->ntxq_vi > 0); } MPASS(iaq->nirq > 0); MPASS(iaq->nrxq > 0); MPASS(iaq->ntxq > 0); if (itype == INTR_MSI) { MPASS(powerof2(iaq->nirq)); } } static int cfg_itype_and_nqueues(struct adapter *sc, struct intrs_and_queues *iaq) { int rc, itype, navail, nalloc; for (itype = INTR_MSIX; itype; itype >>= 1) { if ((itype & t4_intr_types) == 0) continue; /* not allowed */ if (itype == INTR_MSIX) navail = pci_msix_count(sc->dev); else if (itype == INTR_MSI) navail = pci_msi_count(sc->dev); else navail = 1; restart: if (navail == 0) continue; calculate_iaq(sc, iaq, itype, navail); nalloc = iaq->nirq; rc = 0; if (itype == INTR_MSIX) rc = pci_alloc_msix(sc->dev, &nalloc); else if (itype == INTR_MSI) rc = pci_alloc_msi(sc->dev, &nalloc); if (rc == 0 && nalloc > 0) { if (nalloc == iaq->nirq) return (0); /* * Didn't get the number requested. Use whatever number * the kernel is willing to allocate. */ device_printf(sc->dev, "fewer vectors than requested, " "type=%d, req=%d, rcvd=%d; will downshift req.\n", itype, iaq->nirq, nalloc); pci_release_msi(sc->dev); navail = nalloc; goto restart; } device_printf(sc->dev, "failed to allocate vectors: itype=%d, rc=%d, req=%d, rcvd=%d\n", itype, rc, iaq->nirq, nalloc); } device_printf(sc->dev, "failed to find a usable interrupt type. " "allowed=%d, msi-x=%d, msi=%d, intx=1\n", t4_intr_types, pci_msix_count(sc->dev), pci_msi_count(sc->dev)); return (ENXIO); } #define FW_VERSION(chip) ( \ V_FW_HDR_FW_VER_MAJOR(chip##FW_VERSION_MAJOR) | \ V_FW_HDR_FW_VER_MINOR(chip##FW_VERSION_MINOR) | \ V_FW_HDR_FW_VER_MICRO(chip##FW_VERSION_MICRO) | \ V_FW_HDR_FW_VER_BUILD(chip##FW_VERSION_BUILD)) #define FW_INTFVER(chip, intf) (chip##FW_HDR_INTFVER_##intf) /* Just enough of fw_hdr to cover all version info.
*/ struct fw_h { __u8 ver; __u8 chip; __be16 len512; __be32 fw_ver; __be32 tp_microcode_ver; __u8 intfver_nic; __u8 intfver_vnic; __u8 intfver_ofld; __u8 intfver_ri; __u8 intfver_iscsipdu; __u8 intfver_iscsi; __u8 intfver_fcoepdu; __u8 intfver_fcoe; }; /* Spot check a couple of fields. */ CTASSERT(offsetof(struct fw_h, fw_ver) == offsetof(struct fw_hdr, fw_ver)); CTASSERT(offsetof(struct fw_h, intfver_nic) == offsetof(struct fw_hdr, intfver_nic)); CTASSERT(offsetof(struct fw_h, intfver_fcoe) == offsetof(struct fw_hdr, intfver_fcoe)); struct fw_info { uint8_t chip; char *kld_name; char *fw_mod_name; struct fw_h fw_h; } fw_info[] = { { .chip = CHELSIO_T4, .kld_name = "t4fw_cfg", .fw_mod_name = "t4fw", .fw_h = { .chip = FW_HDR_CHIP_T4, .fw_ver = htobe32(FW_VERSION(T4)), .intfver_nic = FW_INTFVER(T4, NIC), .intfver_vnic = FW_INTFVER(T4, VNIC), .intfver_ofld = FW_INTFVER(T4, OFLD), .intfver_ri = FW_INTFVER(T4, RI), .intfver_iscsipdu = FW_INTFVER(T4, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T4, ISCSI), .intfver_fcoepdu = FW_INTFVER(T4, FCOEPDU), .intfver_fcoe = FW_INTFVER(T4, FCOE), }, }, { .chip = CHELSIO_T5, .kld_name = "t5fw_cfg", .fw_mod_name = "t5fw", .fw_h = { .chip = FW_HDR_CHIP_T5, .fw_ver = htobe32(FW_VERSION(T5)), .intfver_nic = FW_INTFVER(T5, NIC), .intfver_vnic = FW_INTFVER(T5, VNIC), .intfver_ofld = FW_INTFVER(T5, OFLD), .intfver_ri = FW_INTFVER(T5, RI), .intfver_iscsipdu = FW_INTFVER(T5, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T5, ISCSI), .intfver_fcoepdu = FW_INTFVER(T5, FCOEPDU), .intfver_fcoe = FW_INTFVER(T5, FCOE), }, }, { .chip = CHELSIO_T6, .kld_name = "t6fw_cfg", .fw_mod_name = "t6fw", .fw_h = { .chip = FW_HDR_CHIP_T6, .fw_ver = htobe32(FW_VERSION(T6)), .intfver_nic = FW_INTFVER(T6, NIC), .intfver_vnic = FW_INTFVER(T6, VNIC), .intfver_ofld = FW_INTFVER(T6, OFLD), .intfver_ri = FW_INTFVER(T6, RI), .intfver_iscsipdu = FW_INTFVER(T6, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T6, ISCSI), .intfver_fcoepdu = FW_INTFVER(T6, FCOEPDU), .intfver_fcoe = FW_INTFVER(T6, FCOE), }, } }; static struct fw_info * find_fw_info(int chip) { int i; for (i = 0; i < nitems(fw_info); i++) { if (fw_info[i].chip == chip) return (&fw_info[i]); } return (NULL); } /* * Is the given firmware API compatible with the one the driver was compiled * with? */ static int fw_compatible(const struct fw_h *hdr1, const struct fw_h *hdr2) { /* short circuit if it's the exact same firmware version */ if (hdr1->chip == hdr2->chip && hdr1->fw_ver == hdr2->fw_ver) return (1); /* * XXX: Is this too conservative? Perhaps I should limit this to the * features that are supported in the driver. 
#define SAME_INTF(x) (hdr1->intfver_##x == hdr2->intfver_##x) if (hdr1->chip == hdr2->chip && SAME_INTF(nic) && SAME_INTF(vnic) && SAME_INTF(ofld) && SAME_INTF(ri) && SAME_INTF(iscsipdu) && SAME_INTF(iscsi) && SAME_INTF(fcoepdu) && SAME_INTF(fcoe)) return (1); #undef SAME_INTF return (0); } static int load_fw_module(struct adapter *sc, const struct firmware **dcfg, const struct firmware **fw) { struct fw_info *fw_info; *dcfg = NULL; if (fw != NULL) *fw = NULL; fw_info = find_fw_info(chip_id(sc)); if (fw_info == NULL) { device_printf(sc->dev, "unable to look up firmware information for chip %d.\n", chip_id(sc)); return (EINVAL); } *dcfg = firmware_get(fw_info->kld_name); if (*dcfg != NULL) { if (fw != NULL) *fw = firmware_get(fw_info->fw_mod_name); return (0); } return (ENOENT); } static void unload_fw_module(struct adapter *sc, const struct firmware *dcfg, const struct firmware *fw) { if (fw != NULL) firmware_put(fw, FIRMWARE_UNLOAD); if (dcfg != NULL) firmware_put(dcfg, FIRMWARE_UNLOAD); } /* * Return values: * 0 means no firmware install attempted. * ERESTART means a firmware install was attempted and was successful. * +ve errno means a firmware install was attempted but failed. */ static int install_kld_firmware(struct adapter *sc, struct fw_h *card_fw, const struct fw_h *drv_fw, const char *reason, int *already) { const struct firmware *cfg, *fw; const uint32_t c = be32toh(card_fw->fw_ver); uint32_t d, k; int rc, fw_install; struct fw_h bundled_fw; bool load_attempted; cfg = fw = NULL; load_attempted = false; fw_install = t4_fw_install < 0 ? -t4_fw_install : t4_fw_install; memcpy(&bundled_fw, drv_fw, sizeof(bundled_fw)); if (t4_fw_install < 0) { rc = load_fw_module(sc, &cfg, &fw); if (rc != 0 || fw == NULL) { device_printf(sc->dev, "failed to load firmware module: %d. cfg %p, fw %p;" " will use compiled-in firmware version for " "hw.cxgbe.fw_install checks.\n", rc, cfg, fw); } else { memcpy(&bundled_fw, fw->data, sizeof(bundled_fw)); } load_attempted = true; } d = be32toh(bundled_fw.fw_ver); if (reason != NULL) goto install; if ((sc->flags & FW_OK) == 0) { if (c == 0xffffffff) { reason = "missing"; goto install; } rc = 0; goto done; } if (!fw_compatible(card_fw, &bundled_fw)) { reason = "incompatible or unusable"; goto install; } if (d > c) { reason = "older than the version bundled with this driver"; goto install; } if (fw_install == 2 && d != c) { reason = "different than the version bundled with this driver"; goto install; } /* No reason to do anything to the firmware already on the card. */ rc = 0; goto done; install: rc = 0; if ((*already)++) goto done; if (fw_install == 0) { device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "but the driver is prohibited from installing a firmware " "on the card.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason); goto done; } /* * We'll attempt to install a firmware. Load the module first (if it * hasn't been loaded already). */ if (!load_attempted) { rc = load_fw_module(sc, &cfg, &fw); if (rc != 0 || fw == NULL) { device_printf(sc->dev, "failed to load firmware module: %d. cfg %p, fw %p\n", rc, cfg, fw); /* carry on */ } } if (fw == NULL) { device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "but the driver cannot take corrective action because it " "is unable to load the firmware module.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason); rc = sc->flags & FW_OK ?
0 : ENOENT; goto done; } k = be32toh(((const struct fw_hdr *)fw->data)->fw_ver); if (k != d) { MPASS(t4_fw_install > 0); device_printf(sc->dev, "firmware in KLD (%u.%u.%u.%u) is not what the driver was " "expecting (%u.%u.%u.%u) and will not be used.\n", G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k), G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k), G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d), G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d)); rc = sc->flags & FW_OK ? 0 : EINVAL; goto done; } device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "installing firmware %u.%u.%u.%u on card.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason, G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d), G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d)); rc = -t4_fw_upgrade(sc, sc->mbox, fw->data, fw->datasize, 0); if (rc != 0) { device_printf(sc->dev, "failed to install firmware: %d\n", rc); } else { /* Installed successfully, update the cached header too. */ rc = ERESTART; memcpy(card_fw, fw->data, sizeof(*card_fw)); } done: unload_fw_module(sc, cfg, fw); return (rc); } /* * Establish contact with the firmware and attempt to become the master driver. * * A firmware will be installed to the card if needed (if the driver is allowed * to do so). */ static int contact_firmware(struct adapter *sc) { int rc, already = 0; enum dev_state state; struct fw_info *fw_info; struct fw_hdr *card_fw; /* fw on the card */ const struct fw_h *drv_fw; fw_info = find_fw_info(chip_id(sc)); if (fw_info == NULL) { device_printf(sc->dev, "unable to look up firmware information for chip %d.\n", chip_id(sc)); return (EINVAL); } drv_fw = &fw_info->fw_h; /* Read the header of the firmware on the card */ card_fw = malloc(sizeof(*card_fw), M_CXGBE, M_ZERO | M_WAITOK); restart: rc = -t4_get_fw_hdr(sc, card_fw); if (rc != 0) { device_printf(sc->dev, "unable to read firmware header from card's flash: %d\n", rc); goto done; } rc = install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw, NULL, &already); if (rc == ERESTART) goto restart; if (rc != 0) goto done; rc = t4_fw_hello(sc, sc->mbox, sc->mbox, MASTER_MAY, &state); if (rc < 0 || state == DEV_STATE_ERR) { rc = -rc; device_printf(sc->dev, "failed to connect to the firmware: %d, %d. " "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW)); #if 0 if (install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw, "not responding properly to HELLO", &already) == ERESTART) goto restart; #endif goto done; } MPASS(be32toh(card_fw->flags) & FW_HDR_FLAGS_RESET_HALT); sc->flags |= FW_OK; /* The firmware responded to the FW_HELLO. */ if (rc == sc->pf) { sc->flags |= MASTER_PF; rc = install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw, NULL, &already); if (rc == ERESTART) rc = 0; else if (rc != 0) goto done; } else if (state == DEV_STATE_UNINIT) { /* * We didn't get to be the master so we definitely won't be * configuring the chip. It's a bug if someone else hasn't * configured it already. */ device_printf(sc->dev, "couldn't be master(%d), " "device not already initialized either(%d). " "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW)); rc = EPROTO; goto done; } else { /* * Some other PF is the master and has configured the chip. * This is allowed but untested. */ device_printf(sc->dev, "PF%d is master, device state %d. 
" "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW)); snprintf(sc->cfg_file, sizeof(sc->cfg_file), "pf%d", rc); sc->cfcsum = 0; rc = 0; } done: if (rc != 0 && sc->flags & FW_OK) { t4_fw_bye(sc, sc->mbox); sc->flags &= ~FW_OK; } free(card_fw, M_CXGBE); return (rc); } static int copy_cfg_file_to_card(struct adapter *sc, char *cfg_file, uint32_t mtype, uint32_t moff) { struct fw_info *fw_info; const struct firmware *dcfg, *rcfg = NULL; const uint32_t *cfdata; uint32_t cflen, addr; int rc; load_fw_module(sc, &dcfg, NULL); /* Card specific interpretation of "default". */ if (strncmp(cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) { if (pci_get_device(sc->dev) == 0x440a) snprintf(cfg_file, sizeof(t4_cfg_file), UWIRE_CF); if (is_fpga(sc)) snprintf(cfg_file, sizeof(t4_cfg_file), FPGA_CF); } if (strncmp(cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) { if (dcfg == NULL) { device_printf(sc->dev, "KLD with default config is not available.\n"); rc = ENOENT; goto done; } cfdata = dcfg->data; cflen = dcfg->datasize & ~3; } else { char s[32]; fw_info = find_fw_info(chip_id(sc)); if (fw_info == NULL) { device_printf(sc->dev, "unable to look up firmware information for chip %d.\n", chip_id(sc)); rc = EINVAL; goto done; } snprintf(s, sizeof(s), "%s_%s", fw_info->kld_name, cfg_file); rcfg = firmware_get(s); if (rcfg == NULL) { device_printf(sc->dev, "unable to load module \"%s\" for configuration " "profile \"%s\".\n", s, cfg_file); rc = ENOENT; goto done; } cfdata = rcfg->data; cflen = rcfg->datasize & ~3; } if (cflen > FLASH_CFG_MAX_SIZE) { device_printf(sc->dev, "config file too long (%d, max allowed is %d).\n", cflen, FLASH_CFG_MAX_SIZE); rc = EINVAL; goto done; } rc = validate_mt_off_len(sc, mtype, moff, cflen, &addr); if (rc != 0) { device_printf(sc->dev, "%s: addr (%d/0x%x) or len %d is not valid: %d.\n", __func__, mtype, moff, cflen, rc); rc = EINVAL; goto done; } write_via_memwin(sc, 2, addr, cfdata, cflen); done: if (rcfg != NULL) firmware_put(rcfg, FIRMWARE_UNLOAD); unload_fw_module(sc, dcfg, NULL); return (rc); } struct caps_allowed { uint16_t nbmcaps; uint16_t linkcaps; uint16_t switchcaps; uint16_t niccaps; uint16_t toecaps; uint16_t rdmacaps; uint16_t cryptocaps; uint16_t iscsicaps; uint16_t fcoecaps; }; #define FW_PARAM_DEV(param) \ (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | \ V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_##param)) #define FW_PARAM_PFVF(param) \ (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_PFVF) | \ V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_PFVF_##param)) /* * Provide a configuration profile to the firmware and have it initialize the * chip accordingly. This may involve uploading a configuration file to the * card. 
*/ static int apply_cfg_and_initialize(struct adapter *sc, char *cfg_file, const struct caps_allowed *caps_allowed) { int rc; struct fw_caps_config_cmd caps; uint32_t mtype, moff, finicsum, cfcsum, param, val; rc = -t4_fw_reset(sc, sc->mbox, F_PIORSTMODE | F_PIORST); if (rc != 0) { device_printf(sc->dev, "firmware reset failed: %d.\n", rc); return (rc); } bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); if (strncmp(cfg_file, BUILTIN_CF, sizeof(t4_cfg_file)) == 0) { mtype = 0; moff = 0; caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); } else if (strncmp(cfg_file, FLASH_CF, sizeof(t4_cfg_file)) == 0) { mtype = FW_MEMTYPE_FLASH; moff = t4_flash_cfg_addr(sc); caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID | V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) | V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) | FW_LEN16(caps)); } else { /* * Ask the firmware where it wants us to upload the config file. */ param = FW_PARAM_DEV(CF); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val); if (rc != 0) { /* No support for config file? Shouldn't happen. */ device_printf(sc->dev, "failed to query config file location: %d.\n", rc); goto done; } mtype = G_FW_PARAMS_PARAM_Y(val); moff = G_FW_PARAMS_PARAM_Z(val) << 16; caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID | V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) | V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) | FW_LEN16(caps)); rc = copy_cfg_file_to_card(sc, cfg_file, mtype, moff); if (rc != 0) { device_printf(sc->dev, "failed to upload config file to card: %d.\n", rc); goto done; } } rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps); if (rc != 0) { device_printf(sc->dev, "failed to pre-process config file: %d " "(mtype %d, moff 0x%x).\n", rc, mtype, moff); goto done; } finicsum = be32toh(caps.finicsum); cfcsum = be32toh(caps.cfcsum); /* actual */ if (finicsum != cfcsum) { device_printf(sc->dev, "WARNING: config file checksum mismatch: %08x %08x\n", finicsum, cfcsum); } sc->cfcsum = cfcsum; snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", cfg_file); /* * Let the firmware know what features will (not) be used so it can tune * things accordingly. */ #define LIMIT_CAPS(x) do { \ caps.x##caps &= htobe16(caps_allowed->x##caps); \ } while (0) LIMIT_CAPS(nbm); LIMIT_CAPS(link); LIMIT_CAPS(switch); LIMIT_CAPS(nic); LIMIT_CAPS(toe); LIMIT_CAPS(rdma); LIMIT_CAPS(crypto); LIMIT_CAPS(iscsi); LIMIT_CAPS(fcoe); #undef LIMIT_CAPS if (caps.niccaps & htobe16(FW_CAPS_CONFIG_NIC_HASHFILTER)) { /* * TOE and hashfilters are mutually exclusive. It is a config * file or firmware bug if both are reported as available. Try * to cope with the situation in non-debug builds by disabling * TOE. */ MPASS(caps.toecaps == 0); caps.toecaps = 0; caps.rdmacaps = 0; caps.iscsicaps = 0; } caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE); caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), NULL); if (rc != 0) { device_printf(sc->dev, "failed to process config file: %d.\n", rc); goto done; } t4_tweak_chip_settings(sc); set_params__pre_init(sc); /* get basic stuff going */ rc = -t4_fw_initialize(sc, sc->mbox); if (rc != 0) { device_printf(sc->dev, "fw_initialize failed: %d.\n", rc); goto done; } done: return (rc); } /* * Partition chip resources for use between various PFs, VFs, etc.
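* If the requested profile fails, the driver retries once with the built-in * config and NIC-only capabilities, unless DF_DISABLE_CFG_RETRY is set.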
*/ static int partition_resources(struct adapter *sc) { char cfg_file[sizeof(t4_cfg_file)]; struct caps_allowed caps_allowed; int rc; bool fallback; /* Only the master driver gets to configure the chip resources. */ MPASS(sc->flags & MASTER_PF); #define COPY_CAPS(x) do { \ caps_allowed.x##caps = t4_##x##caps_allowed; \ } while (0) bzero(&caps_allowed, sizeof(caps_allowed)); COPY_CAPS(nbm); COPY_CAPS(link); COPY_CAPS(switch); COPY_CAPS(nic); COPY_CAPS(toe); COPY_CAPS(rdma); COPY_CAPS(crypto); COPY_CAPS(iscsi); COPY_CAPS(fcoe); fallback = sc->debug_flags & DF_DISABLE_CFG_RETRY ? false : true; snprintf(cfg_file, sizeof(cfg_file), "%s", t4_cfg_file); retry: rc = apply_cfg_and_initialize(sc, cfg_file, &caps_allowed); if (rc != 0 && fallback) { device_printf(sc->dev, "failed (%d) to configure card with \"%s\" profile, " "will fall back to a basic configuration and retry.\n", rc, cfg_file); snprintf(cfg_file, sizeof(cfg_file), "%s", BUILTIN_CF); bzero(&caps_allowed, sizeof(caps_allowed)); COPY_CAPS(switch); caps_allowed.niccaps = FW_CAPS_CONFIG_NIC; fallback = false; goto retry; } #undef COPY_CAPS return (rc); } /* * Retrieve parameters that are needed (or nice to have) very early. */ static int get_params__pre_init(struct adapter *sc) { int rc; uint32_t param[2], val[2]; t4_get_version_info(sc); snprintf(sc->fw_version, sizeof(sc->fw_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MINOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MICRO(sc->params.fw_vers), G_FW_HDR_FW_VER_BUILD(sc->params.fw_vers)); snprintf(sc->bs_version, sizeof(sc->bs_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.bs_vers), G_FW_HDR_FW_VER_MINOR(sc->params.bs_vers), G_FW_HDR_FW_VER_MICRO(sc->params.bs_vers), G_FW_HDR_FW_VER_BUILD(sc->params.bs_vers)); snprintf(sc->tp_version, sizeof(sc->tp_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.tp_vers), G_FW_HDR_FW_VER_MINOR(sc->params.tp_vers), G_FW_HDR_FW_VER_MICRO(sc->params.tp_vers), G_FW_HDR_FW_VER_BUILD(sc->params.tp_vers)); snprintf(sc->er_version, sizeof(sc->er_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.er_vers), G_FW_HDR_FW_VER_MINOR(sc->params.er_vers), G_FW_HDR_FW_VER_MICRO(sc->params.er_vers), G_FW_HDR_FW_VER_BUILD(sc->params.er_vers)); param[0] = FW_PARAM_DEV(PORTVEC); param[1] = FW_PARAM_DEV(CCLK); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (pre_init): %d.\n", rc); return (rc); } sc->params.portvec = val[0]; sc->params.nports = bitcount32(val[0]); sc->params.vpd.cclk = val[1]; /* Read device log parameters. */ rc = -t4_init_devlog_params(sc, 1); if (rc == 0) fixup_devlog_params(sc); else { device_printf(sc->dev, "failed to get devlog parameters: %d.\n", rc); rc = 0; /* devlog isn't critical for device operation */ } return (rc); } /* * Any params that need to be set before FW_INITIALIZE. */ static int set_params__pre_init(struct adapter *sc) { int rc = 0; uint32_t param, val; if (chip_id(sc) >= CHELSIO_T6) { param = FW_PARAM_DEV(HPFILTER_REGION_SUPPORT); val = 1; rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val); /* firmwares < 1.20.1.0 do not have this param. */ if (rc == FW_EINVAL && sc->params.fw_vers < FW_VERSION32(1, 20, 1, 0)) { rc = 0; } if (rc != 0) { device_printf(sc->dev, "failed to enable high priority filters: %d.\n", rc); } } /* Enable opaque VIIDs with firmwares that support it.
*/ param = FW_PARAM_DEV(OPAQUE_VIID_SMT_EXTN); val = 1; rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val); if (rc == 0 && val == 1) sc->params.viid_smt_extn_support = true; else sc->params.viid_smt_extn_support = false; return (rc); } /* * Retrieve various parameters that are of interest to the driver. The device * has been initialized by the firmware at this point. */ static int get_params__post_init(struct adapter *sc) { int rc; uint32_t param[7], val[7]; struct fw_caps_config_cmd caps; param[0] = FW_PARAM_PFVF(IQFLINT_START); param[1] = FW_PARAM_PFVF(EQ_START); param[2] = FW_PARAM_PFVF(FILTER_START); param[3] = FW_PARAM_PFVF(FILTER_END); param[4] = FW_PARAM_PFVF(L2T_START); param[5] = FW_PARAM_PFVF(L2T_END); param[6] = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) | V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_VDD); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 7, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (post_init): %d.\n", rc); return (rc); } sc->sge.iq_start = val[0]; sc->sge.eq_start = val[1]; if ((int)val[3] > (int)val[2]) { sc->tids.ftid_base = val[2]; sc->tids.ftid_end = val[3]; sc->tids.nftids = val[3] - val[2] + 1; } sc->vres.l2t.start = val[4]; sc->vres.l2t.size = val[5] - val[4] + 1; KASSERT(sc->vres.l2t.size <= L2T_SIZE, ("%s: L2 table size (%u) larger than expected (%u)", __func__, sc->vres.l2t.size, L2T_SIZE)); sc->params.core_vdd = val[6]; param[0] = FW_PARAM_PFVF(IQFLINT_END); param[1] = FW_PARAM_PFVF(EQ_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (post_init2): %d.\n", rc); return (rc); } MPASS((int)val[0] >= sc->sge.iq_start); sc->sge.iqmap_sz = val[0] - sc->sge.iq_start + 1; MPASS((int)val[1] >= sc->sge.eq_start); sc->sge.eqmap_sz = val[1] - sc->sge.eq_start + 1; if (chip_id(sc) >= CHELSIO_T6) { sc->tids.tid_base = t4_read_reg(sc, A_LE_DB_ACTIVE_TABLE_START_INDEX); param[0] = FW_PARAM_PFVF(HPFILTER_START); param[1] = FW_PARAM_PFVF(HPFILTER_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query hpfilter parameters: %d.\n", rc); return (rc); } if ((int)val[1] > (int)val[0]) { sc->tids.hpftid_base = val[0]; sc->tids.hpftid_end = val[1]; sc->tids.nhpftids = val[1] - val[0] + 1; /* * These should go off if the layout changes and the * driver needs to catch up. */ MPASS(sc->tids.hpftid_base == 0); MPASS(sc->tids.tid_base == sc->tids.nhpftids); } param[0] = FW_PARAM_PFVF(RAWF_START); param[1] = FW_PARAM_PFVF(RAWF_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query rawf parameters: %d.\n", rc); return (rc); } if ((int)val[1] > (int)val[0]) { sc->rawf_base = val[0]; sc->nrawf = val[1] - val[0] + 1; } } /* * MPSBGMAP is queried separately because only recent firmwares support * it as a parameter and we don't want the compound query above to fail * on older firmwares. */ param[0] = FW_PARAM_DEV(MPSBGMAP); val[0] = 0; rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc == 0) sc->params.mps_bg_map = val[0]; else sc->params.mps_bg_map = 0; /* * Determine whether the firmware supports the filter2 work request. * This is queried separately for the same reason as MPSBGMAP above.
*/ param[0] = FW_PARAM_DEV(FILTER2_WR); val[0] = 0; rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc == 0) sc->params.filter2_wr_support = val[0] != 0; else sc->params.filter2_wr_support = 0; /* * Find out whether we're allowed to use the ULPTX MEMWRITE DSGL. * This is queried separately for the same reason as other params above. */ param[0] = FW_PARAM_DEV(ULPTX_MEMWRITE_DSGL); val[0] = 0; rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc == 0) sc->params.ulptx_memwrite_dsgl = val[0] != 0; else sc->params.ulptx_memwrite_dsgl = false; /* FW_RI_FR_NSMR_TPTE_WR support */ param[0] = FW_PARAM_DEV(RI_FR_NSMR_TPTE_WR); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc == 0) sc->params.fr_nsmr_tpte_wr_support = val[0] != 0; else sc->params.fr_nsmr_tpte_wr_support = false; param[0] = FW_PARAM_PFVF(MAX_PKTS_PER_ETH_TX_PKTS_WR); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc == 0) sc->params.max_pkts_per_eth_tx_pkts_wr = val[0]; else sc->params.max_pkts_per_eth_tx_pkts_wr = 15; /* get capabilities */ bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps); if (rc != 0) { device_printf(sc->dev, "failed to get card capabilities: %d.\n", rc); return (rc); } #define READ_CAPS(x) do { \ sc->x = htobe16(caps.x); \ } while (0) READ_CAPS(nbmcaps); READ_CAPS(linkcaps); READ_CAPS(switchcaps); READ_CAPS(niccaps); READ_CAPS(toecaps); READ_CAPS(rdmacaps); READ_CAPS(cryptocaps); READ_CAPS(iscsicaps); READ_CAPS(fcoecaps); if (sc->niccaps & FW_CAPS_CONFIG_NIC_HASHFILTER) { MPASS(chip_id(sc) > CHELSIO_T4); MPASS(sc->toecaps == 0); sc->toecaps = 0; param[0] = FW_PARAM_DEV(NTID); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query HASHFILTER parameters: %d.\n", rc); return (rc); } sc->tids.ntids = val[0]; if (sc->params.fw_vers < FW_VERSION32(1, 20, 5, 0)) { MPASS(sc->tids.ntids >= sc->tids.nhpftids); sc->tids.ntids -= sc->tids.nhpftids; } sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS); sc->params.hash_filter = 1; } if (sc->niccaps & FW_CAPS_CONFIG_NIC_ETHOFLD) { param[0] = FW_PARAM_PFVF(ETHOFLD_START); param[1] = FW_PARAM_PFVF(ETHOFLD_END); param[2] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 3, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query NIC parameters: %d.\n", rc); return (rc); } if ((int)val[1] > (int)val[0]) { sc->tids.etid_base = val[0]; sc->tids.etid_end = val[1]; sc->tids.netids = val[1] - val[0] + 1; sc->params.eo_wr_cred = val[2]; sc->params.ethoffload = 1; } } if (sc->toecaps) { /* query offload-related parameters */ param[0] = FW_PARAM_DEV(NTID); param[1] = FW_PARAM_PFVF(SERVER_START); param[2] = FW_PARAM_PFVF(SERVER_END); param[3] = FW_PARAM_PFVF(TDDP_START); param[4] = FW_PARAM_PFVF(TDDP_END); param[5] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query TOE parameters: %d.\n", rc); return (rc); } sc->tids.ntids = val[0]; if (sc->params.fw_vers < FW_VERSION32(1, 20, 5, 0)) { MPASS(sc->tids.ntids >= sc->tids.nhpftids); sc->tids.ntids -= sc->tids.nhpftids; } sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS); if ((int)val[2] > (int)val[1]) { sc->tids.stid_base = val[1]; sc->tids.nstids = val[2] - val[1] + 1; }
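/* val[3]/val[4] bound the TDDP region and val[5] is the WR credit for the offload queue. */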
sc->vres.ddp.start = val[3]; sc->vres.ddp.size = val[4] - val[3] + 1; sc->params.ofldq_wr_cred = val[5]; sc->params.offload = 1; } else { /* * The firmware attempts memfree TOE configuration for -SO cards * and will report toecaps=0 if it runs out of resources (this * depends on the config file). It may not report 0 for other * capabilities dependent on the TOE in this case. Set them to * 0 here so that the driver doesn't bother tracking resources * that will never be used. */ sc->iscsicaps = 0; sc->rdmacaps = 0; } if (sc->rdmacaps) { param[0] = FW_PARAM_PFVF(STAG_START); param[1] = FW_PARAM_PFVF(STAG_END); param[2] = FW_PARAM_PFVF(RQ_START); param[3] = FW_PARAM_PFVF(RQ_END); param[4] = FW_PARAM_PFVF(PBL_START); param[5] = FW_PARAM_PFVF(PBL_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(1): %d.\n", rc); return (rc); } sc->vres.stag.start = val[0]; sc->vres.stag.size = val[1] - val[0] + 1; sc->vres.rq.start = val[2]; sc->vres.rq.size = val[3] - val[2] + 1; sc->vres.pbl.start = val[4]; sc->vres.pbl.size = val[5] - val[4] + 1; param[0] = FW_PARAM_PFVF(SQRQ_START); param[1] = FW_PARAM_PFVF(SQRQ_END); param[2] = FW_PARAM_PFVF(CQ_START); param[3] = FW_PARAM_PFVF(CQ_END); param[4] = FW_PARAM_PFVF(OCQ_START); param[5] = FW_PARAM_PFVF(OCQ_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(2): %d.\n", rc); return (rc); } sc->vres.qp.start = val[0]; sc->vres.qp.size = val[1] - val[0] + 1; sc->vres.cq.start = val[2]; sc->vres.cq.size = val[3] - val[2] + 1; sc->vres.ocq.start = val[4]; sc->vres.ocq.size = val[5] - val[4] + 1; param[0] = FW_PARAM_PFVF(SRQ_START); param[1] = FW_PARAM_PFVF(SRQ_END); param[2] = FW_PARAM_DEV(MAXORDIRD_QP); param[3] = FW_PARAM_DEV(MAXIRD_ADAPTER); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 4, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(3): %d.\n", rc); return (rc); } sc->vres.srq.start = val[0]; sc->vres.srq.size = val[1] - val[0] + 1; sc->params.max_ordird_qp = val[2]; sc->params.max_ird_adapter = val[3]; } if (sc->iscsicaps) { param[0] = FW_PARAM_PFVF(ISCSI_START); param[1] = FW_PARAM_PFVF(ISCSI_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query iSCSI parameters: %d.\n", rc); return (rc); } sc->vres.iscsi.start = val[0]; sc->vres.iscsi.size = val[1] - val[0] + 1; } if (sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS) { param[0] = FW_PARAM_PFVF(TLS_START); param[1] = FW_PARAM_PFVF(TLS_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query TLS parameters: %d.\n", rc); return (rc); } sc->vres.key.start = val[0]; sc->vres.key.size = val[1] - val[0] + 1; } /* * We've got the params we wanted to query directly from the firmware. * Grab some others via other means. 
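* (SGE geometry, TP settings, and the MTU table are read below with direct * and indirect register accesses rather than mailbox queries.)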
*/ t4_init_sge_params(sc); t4_init_tp_params(sc); t4_read_mtu_tbl(sc, sc->params.mtus, NULL); t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd); rc = t4_verify_chip_settings(sc); if (rc != 0) return (rc); t4_init_rx_buf_info(sc); return (rc); } #ifdef KERN_TLS static void ktls_tick(void *arg) { struct adapter *sc; uint32_t tstamp; sc = arg; if (sc->flags & KERN_TLS_ON) { tstamp = tcp_ts_getticks(); t4_write_reg(sc, A_TP_SYNC_TIME_HI, tstamp >> 1); t4_write_reg(sc, A_TP_SYNC_TIME_LO, tstamp << 31); } callout_schedule_sbt(&sc->ktls_tick, SBT_1MS, 0, C_HARDCLOCK); } static int t4_config_kern_tls(struct adapter *sc, bool enable) { int rc; uint32_t param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_KTLS_HW) | V_FW_PARAMS_PARAM_Y(enable ? 1 : 0) | V_FW_PARAMS_PARAM_Z(FW_PARAMS_PARAM_DEV_KTLS_HW_USER_ENABLE); rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &param); if (rc != 0) { CH_ERR(sc, "failed to %s NIC TLS: %d\n", enable ? "enable" : "disable", rc); return (rc); } if (enable) sc->flags |= KERN_TLS_ON; else sc->flags &= ~KERN_TLS_ON; return (rc); } #endif static int set_params__post_init(struct adapter *sc) { uint32_t mask, param, val; #ifdef TCP_OFFLOAD int i, v, shift; #endif /* ask for encapsulated CPLs */ param = FW_PARAM_PFVF(CPLFW4MSG_ENCAP); val = 1; (void)t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val); /* Enable 32b port caps if the firmware supports it. */ param = FW_PARAM_PFVF(PORT_CAPS32); val = 1; if (t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val) == 0) sc->params.port_caps32 = 1; /* Let filter + maskhash steer to a part of the VI's RSS region. */ val = 1 << (G_MASKSIZE(t4_read_reg(sc, A_TP_RSS_CONFIG_TNL)) - 1); t4_set_reg_field(sc, A_TP_RSS_CONFIG_TNL, V_MASKFILTER(M_MASKFILTER), V_MASKFILTER(val - 1)); mask = F_DROPERRORANY | F_DROPERRORMAC | F_DROPERRORIPVER | F_DROPERRORFRAG | F_DROPERRORATTACK | F_DROPERRORETHHDRLEN | F_DROPERRORIPHDRLEN | F_DROPERRORTCPHDRLEN | F_DROPERRORPKTLEN | F_DROPERRORTCPOPT | F_DROPERRORCSUMIP | F_DROPERRORCSUM; val = 0; if (chip_id(sc) < CHELSIO_T6 && t4_attack_filter != 0) { t4_set_reg_field(sc, A_TP_GLOBAL_CONFIG, F_ATTACKFILTERENABLE, F_ATTACKFILTERENABLE); val |= F_DROPERRORATTACK; } if (t4_drop_ip_fragments != 0) { t4_set_reg_field(sc, A_TP_GLOBAL_CONFIG, F_FRAGMENTDROP, F_FRAGMENTDROP); val |= F_DROPERRORFRAG; } if (t4_drop_pkts_with_l2_errors != 0) val |= F_DROPERRORMAC | F_DROPERRORETHHDRLEN; if (t4_drop_pkts_with_l3_errors != 0) { val |= F_DROPERRORIPVER | F_DROPERRORIPHDRLEN | F_DROPERRORCSUMIP; } if (t4_drop_pkts_with_l4_errors != 0) { val |= F_DROPERRORTCPHDRLEN | F_DROPERRORPKTLEN | F_DROPERRORTCPOPT | F_DROPERRORCSUM; } t4_set_reg_field(sc, A_TP_ERR_CONFIG, mask, val); #ifdef TCP_OFFLOAD /* * Override the TOE timers with user-provided tunables. This is not the * recommended way to change the timers (the firmware config file is), so * these tunables are not documented. * * All the timer tunables are in microseconds.
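* Each nonzero time value is converted to hardware TCP ticks with * us_to_tcp_ticks() and masked to the width of its register field before * being written; the retry counts are masked only.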
*/ if (t4_toe_keepalive_idle != 0) { v = us_to_tcp_ticks(sc, t4_toe_keepalive_idle); v &= M_KEEPALIVEIDLE; t4_set_reg_field(sc, A_TP_KEEP_IDLE, V_KEEPALIVEIDLE(M_KEEPALIVEIDLE), V_KEEPALIVEIDLE(v)); } if (t4_toe_keepalive_interval != 0) { v = us_to_tcp_ticks(sc, t4_toe_keepalive_interval); v &= M_KEEPALIVEINTVL; t4_set_reg_field(sc, A_TP_KEEP_INTVL, V_KEEPALIVEINTVL(M_KEEPALIVEINTVL), V_KEEPALIVEINTVL(v)); } if (t4_toe_keepalive_count != 0) { v = t4_toe_keepalive_count & M_KEEPALIVEMAXR2; t4_set_reg_field(sc, A_TP_SHIFT_CNT, V_KEEPALIVEMAXR1(M_KEEPALIVEMAXR1) | V_KEEPALIVEMAXR2(M_KEEPALIVEMAXR2), V_KEEPALIVEMAXR1(1) | V_KEEPALIVEMAXR2(v)); } if (t4_toe_rexmt_min != 0) { v = us_to_tcp_ticks(sc, t4_toe_rexmt_min); v &= M_RXTMIN; t4_set_reg_field(sc, A_TP_RXT_MIN, V_RXTMIN(M_RXTMIN), V_RXTMIN(v)); } if (t4_toe_rexmt_max != 0) { v = us_to_tcp_ticks(sc, t4_toe_rexmt_max); v &= M_RXTMAX; t4_set_reg_field(sc, A_TP_RXT_MAX, V_RXTMAX(M_RXTMAX), V_RXTMAX(v)); } if (t4_toe_rexmt_count != 0) { v = t4_toe_rexmt_count & M_RXTSHIFTMAXR2; t4_set_reg_field(sc, A_TP_SHIFT_CNT, V_RXTSHIFTMAXR1(M_RXTSHIFTMAXR1) | V_RXTSHIFTMAXR2(M_RXTSHIFTMAXR2), V_RXTSHIFTMAXR1(1) | V_RXTSHIFTMAXR2(v)); } for (i = 0; i < nitems(t4_toe_rexmt_backoff); i++) { if (t4_toe_rexmt_backoff[i] != -1) { v = t4_toe_rexmt_backoff[i] & M_TIMERBACKOFFINDEX0; shift = (i & 3) << 3; t4_set_reg_field(sc, A_TP_TCP_BACKOFF_REG0 + (i & ~3), M_TIMERBACKOFFINDEX0 << shift, v << shift); } } #endif #ifdef KERN_TLS if (sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS && sc->toecaps & FW_CAPS_CONFIG_TOE) { /* * Limit TOE connections to 2 reassembly "islands". This is * required for TOE TLS connections to downgrade to plain TOE * connections if an unsupported TLS version or ciphersuite is * used. */ t4_tp_wr_bits_indirect(sc, A_TP_FRAG_CONFIG, V_PASSMODE(M_PASSMODE), V_PASSMODE(2)); if (is_ktls(sc)) { sc->tlst.inline_keys = t4_tls_inline_keys; sc->tlst.combo_wrs = t4_tls_combo_wrs; if (t4_kern_tls != 0) t4_config_kern_tls(sc, true); } } #endif return (0); } #undef FW_PARAM_PFVF #undef FW_PARAM_DEV static void t4_set_desc(struct adapter *sc) { char buf[128]; struct adapter_params *p = &sc->params; snprintf(buf, sizeof(buf), "Chelsio %s", p->vpd.id); device_set_desc_copy(sc->dev, buf); } static inline void ifmedia_add4(struct ifmedia *ifm, int m) { ifmedia_add(ifm, m, 0, NULL); ifmedia_add(ifm, m | IFM_ETH_TXPAUSE, 0, NULL); ifmedia_add(ifm, m | IFM_ETH_RXPAUSE, 0, NULL); ifmedia_add(ifm, m | IFM_ETH_TXPAUSE | IFM_ETH_RXPAUSE, 0, NULL); } /* * This is the selected media, which is not quite the same as the active media. * The media line in ifconfig is "media: Ethernet selected (active)" if selected * and active are not the same, and "media: Ethernet selected" otherwise. */ static void set_current_media(struct port_info *pi) { struct link_config *lc; struct ifmedia *ifm; int mword; u_int speed; PORT_LOCK_ASSERT_OWNED(pi); /* Leave current media alone if it's already set to IFM_NONE. 
*/ ifm = &pi->media; if (ifm->ifm_cur != NULL && IFM_SUBTYPE(ifm->ifm_cur->ifm_media) == IFM_NONE) return; lc = &pi->link_cfg; if (lc->requested_aneg != AUTONEG_DISABLE && lc->pcaps & FW_PORT_CAP32_ANEG) { ifmedia_set(ifm, IFM_ETHER | IFM_AUTO); return; } mword = IFM_ETHER | IFM_FDX; if (lc->requested_fc & PAUSE_TX) mword |= IFM_ETH_TXPAUSE; if (lc->requested_fc & PAUSE_RX) mword |= IFM_ETH_RXPAUSE; if (lc->requested_speed == 0) speed = port_top_speed(pi) * 1000; /* Gbps -> Mbps */ else speed = lc->requested_speed; mword |= port_mword(pi, speed_to_fwcap(speed)); ifmedia_set(ifm, mword); } /* * Returns true if the ifmedia list for the port cannot change. */ static bool fixed_ifmedia(struct port_info *pi) { return (pi->port_type == FW_PORT_TYPE_BT_SGMII || pi->port_type == FW_PORT_TYPE_BT_XFI || pi->port_type == FW_PORT_TYPE_BT_XAUI || pi->port_type == FW_PORT_TYPE_KX4 || pi->port_type == FW_PORT_TYPE_KX || pi->port_type == FW_PORT_TYPE_KR || pi->port_type == FW_PORT_TYPE_BP_AP || pi->port_type == FW_PORT_TYPE_BP4_AP || pi->port_type == FW_PORT_TYPE_BP40_BA || pi->port_type == FW_PORT_TYPE_KR4_100G || pi->port_type == FW_PORT_TYPE_KR_SFP28 || pi->port_type == FW_PORT_TYPE_KR_XLAUI); } static void build_medialist(struct port_info *pi) { uint32_t ss, speed; int unknown, mword, bit; struct link_config *lc; struct ifmedia *ifm; PORT_LOCK_ASSERT_OWNED(pi); if (pi->flags & FIXED_IFMEDIA) return; /* * Rebuild the ifmedia list. */ ifm = &pi->media; ifmedia_removeall(ifm); lc = &pi->link_cfg; ss = G_FW_PORT_CAP32_SPEED(lc->pcaps); /* Supported Speeds */ if (__predict_false(ss == 0)) { /* not supposed to happen. */ MPASS(ss != 0); no_media: MPASS(LIST_EMPTY(&ifm->ifm_list)); ifmedia_add(ifm, IFM_ETHER | IFM_NONE, 0, NULL); ifmedia_set(ifm, IFM_ETHER | IFM_NONE); return; } unknown = 0; for (bit = S_FW_PORT_CAP32_SPEED; bit < fls(ss); bit++) { speed = 1 << bit; MPASS(speed & M_FW_PORT_CAP32_SPEED); if (ss & speed) { mword = port_mword(pi, speed); if (mword == IFM_NONE) { goto no_media; } else if (mword == IFM_UNKNOWN) unknown++; else ifmedia_add4(ifm, IFM_ETHER | IFM_FDX | mword); } } if (unknown > 0) /* Add one unknown for all unknown media types. */ ifmedia_add4(ifm, IFM_ETHER | IFM_FDX | IFM_UNKNOWN); if (lc->pcaps & FW_PORT_CAP32_ANEG) ifmedia_add(ifm, IFM_ETHER | IFM_AUTO, 0, NULL); set_current_media(pi); } /* * Initialize the requested fields in the link config based on driver tunables. */ static void init_link_config(struct port_info *pi) { struct link_config *lc = &pi->link_cfg; PORT_LOCK_ASSERT_OWNED(pi); lc->requested_speed = 0; if (t4_autoneg == 0) lc->requested_aneg = AUTONEG_DISABLE; else if (t4_autoneg == 1) lc->requested_aneg = AUTONEG_ENABLE; else lc->requested_aneg = AUTONEG_AUTO; lc->requested_fc = t4_pause_settings & (PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG); if (t4_fec & FEC_AUTO) lc->requested_fec = FEC_AUTO; else if (t4_fec == 0) lc->requested_fec = FEC_NONE; else { /* -1 is handled by the FEC_AUTO block above and not here. */ lc->requested_fec = t4_fec & (FEC_RS | FEC_BASER_RS | FEC_NONE | FEC_MODULE); if (lc->requested_fec == 0) lc->requested_fec = FEC_AUTO; } } /* * Makes sure that all requested settings comply with what's supported by the * port. Returns the number of settings that were invalid and had to be fixed. 
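 *
 * For example, a requested fixed speed whose fwcap bit is not in pcaps is
 * reset to 0 (the driver default) and counted as one fixup.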
*/ static int fixup_link_config(struct port_info *pi) { int n = 0; struct link_config *lc = &pi->link_cfg; uint32_t fwspeed; PORT_LOCK_ASSERT_OWNED(pi); /* Speed (when not autonegotiating) */ if (lc->requested_speed != 0) { fwspeed = speed_to_fwcap(lc->requested_speed); if ((fwspeed & lc->pcaps) == 0) { n++; lc->requested_speed = 0; } } /* Link autonegotiation */ MPASS(lc->requested_aneg == AUTONEG_ENABLE || lc->requested_aneg == AUTONEG_DISABLE || lc->requested_aneg == AUTONEG_AUTO); if (lc->requested_aneg == AUTONEG_ENABLE && !(lc->pcaps & FW_PORT_CAP32_ANEG)) { n++; lc->requested_aneg = AUTONEG_AUTO; } /* Flow control */ MPASS((lc->requested_fc & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)) == 0); if (lc->requested_fc & PAUSE_TX && !(lc->pcaps & FW_PORT_CAP32_FC_TX)) { n++; lc->requested_fc &= ~PAUSE_TX; } if (lc->requested_fc & PAUSE_RX && !(lc->pcaps & FW_PORT_CAP32_FC_RX)) { n++; lc->requested_fc &= ~PAUSE_RX; } if (!(lc->requested_fc & PAUSE_AUTONEG) && !(lc->pcaps & FW_PORT_CAP32_FORCE_PAUSE)) { n++; lc->requested_fc |= PAUSE_AUTONEG; } /* FEC */ if ((lc->requested_fec & FEC_RS && !(lc->pcaps & FW_PORT_CAP32_FEC_RS)) || (lc->requested_fec & FEC_BASER_RS && !(lc->pcaps & FW_PORT_CAP32_FEC_BASER_RS))) { n++; lc->requested_fec = FEC_AUTO; } return (n); } /* * Apply the requested L1 settings, which are expected to be valid, to the * hardware. */ static int apply_link_config(struct port_info *pi) { struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; #ifdef INVARIANTS ASSERT_SYNCHRONIZED_OP(sc); PORT_LOCK_ASSERT_OWNED(pi); if (lc->requested_aneg == AUTONEG_ENABLE) MPASS(lc->pcaps & FW_PORT_CAP32_ANEG); if (!(lc->requested_fc & PAUSE_AUTONEG)) MPASS(lc->pcaps & FW_PORT_CAP32_FORCE_PAUSE); if (lc->requested_fc & PAUSE_TX) MPASS(lc->pcaps & FW_PORT_CAP32_FC_TX); if (lc->requested_fc & PAUSE_RX) MPASS(lc->pcaps & FW_PORT_CAP32_FC_RX); if (lc->requested_fec & FEC_RS) MPASS(lc->pcaps & FW_PORT_CAP32_FEC_RS); if (lc->requested_fec & FEC_BASER_RS) MPASS(lc->pcaps & FW_PORT_CAP32_FEC_BASER_RS); #endif rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc); if (rc != 0) { /* Don't complain if the VF driver gets back an EPERM. */ if (!(sc->flags & IS_VF) || rc != FW_EPERM) device_printf(pi->dev, "l1cfg failed: %d\n", rc); } else { /* * An L1_CFG will almost always result in a link-change event if * the link is up, and the driver will refresh the actual * fec/fc/etc. when the notification is processed. If the link * is down then the actual settings are meaningless. * * This takes care of the case where a change in the L1 settings * may not result in a notification. 
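 * (A sketch of such a case: with PAUSE_AUTONEG off, new tx/rx pause
 * settings can take effect without the link bouncing, so lc->fc is synced
 * from the requested values directly below.)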
*/ if (lc->link_ok && !(lc->requested_fc & PAUSE_AUTONEG)) lc->fc = lc->requested_fc & (PAUSE_TX | PAUSE_RX); } return (rc); } #define FW_MAC_EXACT_CHUNK 7 struct mcaddr_ctx { struct ifnet *ifp; const uint8_t *mcaddr[FW_MAC_EXACT_CHUNK]; uint64_t hash; int i; int del; int rc; }; static u_int add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt) { struct mcaddr_ctx *ctx = arg; struct vi_info *vi = ctx->ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; if (ctx->rc < 0) return (0); ctx->mcaddr[ctx->i] = LLADDR(sdl); MPASS(ETHER_IS_MULTICAST(ctx->mcaddr[ctx->i])); ctx->i++; if (ctx->i == FW_MAC_EXACT_CHUNK) { ctx->rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid, ctx->del, ctx->i, ctx->mcaddr, NULL, &ctx->hash, 0); if (ctx->rc < 0) { int j; for (j = 0; j < ctx->i; j++) { if_printf(ctx->ifp, "failed to add mc address" " %02x:%02x:%02x:" "%02x:%02x:%02x rc=%d\n", ctx->mcaddr[j][0], ctx->mcaddr[j][1], ctx->mcaddr[j][2], ctx->mcaddr[j][3], ctx->mcaddr[j][4], ctx->mcaddr[j][5], -ctx->rc); } return (0); } ctx->del = 0; ctx->i = 0; } return (1); } /* * Program the port's XGMAC based on parameters in ifnet. The caller also * indicates which parameters should be programmed (the rest are left alone). */ int update_mac_settings(struct ifnet *ifp, int flags) { int rc = 0; struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1; uint8_t match_all_mac[ETHER_ADDR_LEN] = {0}; ASSERT_SYNCHRONIZED_OP(sc); KASSERT(flags, ("%s: not told what to update.", __func__)); if (flags & XGMAC_MTU) mtu = ifp->if_mtu; if (flags & XGMAC_PROMISC) promisc = ifp->if_flags & IFF_PROMISC ? 1 : 0; if (flags & XGMAC_ALLMULTI) allmulti = ifp->if_flags & IFF_ALLMULTI ? 1 : 0; if (flags & XGMAC_VLANEX) vlanex = ifp->if_capenable & IFCAP_VLAN_HWTAGGING ? 1 : 0; if (flags & (XGMAC_MTU|XGMAC_PROMISC|XGMAC_ALLMULTI|XGMAC_VLANEX)) { rc = -t4_set_rxmode(sc, sc->mbox, vi->viid, mtu, promisc, allmulti, 1, vlanex, false); if (rc) { if_printf(ifp, "set_rxmode (%x) failed: %d\n", flags, rc); return (rc); } } if (flags & XGMAC_UCADDR) { uint8_t ucaddr[ETHER_ADDR_LEN]; bcopy(IF_LLADDR(ifp), ucaddr, sizeof(ucaddr)); rc = t4_change_mac(sc, sc->mbox, vi->viid, vi->xact_addr_filt, ucaddr, true, &vi->smt_idx); if (rc < 0) { rc = -rc; if_printf(ifp, "change_mac failed: %d\n", rc); return (rc); } else { vi->xact_addr_filt = rc; rc = 0; } } if (flags & XGMAC_MCADDRS) { struct epoch_tracker et; struct mcaddr_ctx ctx; int j; ctx.ifp = ifp; ctx.hash = 0; ctx.i = 0; ctx.del = 1; ctx.rc = 0; /* * Unlike other drivers, we accumulate a list of pointers into * interface address lists and we need to keep it safe even * after if_foreach_llmaddr() returns, thus we must enter the * network epoch.
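 * The LLADDR() pointers stashed in ctx.mcaddr[] point into the interface's
 * address list; the epoch keeps them valid until they have been handed to
 * t4_alloc_mac_filt().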
*/ NET_EPOCH_ENTER(et); if_foreach_llmaddr(ifp, add_maddr, &ctx); if (ctx.rc < 0) { NET_EPOCH_EXIT(et); rc = -ctx.rc; return (rc); } if (ctx.i > 0) { rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid, ctx.del, ctx.i, ctx.mcaddr, NULL, &ctx.hash, 0); NET_EPOCH_EXIT(et); if (rc < 0) { rc = -rc; for (j = 0; j < ctx.i; j++) { if_printf(ifp, "failed to add mcast address" " %02x:%02x:%02x:" "%02x:%02x:%02x rc=%d\n", ctx.mcaddr[j][0], ctx.mcaddr[j][1], ctx.mcaddr[j][2], ctx.mcaddr[j][3], ctx.mcaddr[j][4], ctx.mcaddr[j][5], rc); } return (rc); } ctx.del = 0; } else NET_EPOCH_EXIT(et); rc = -t4_set_addr_hash(sc, sc->mbox, vi->viid, 0, ctx.hash, 0); if (rc != 0) if_printf(ifp, "failed to set mcast address hash: %d\n", rc); if (ctx.del == 0) { /* We clobbered the VXLAN entry if there was one. */ pi->vxlan_tcam_entry = false; } } if (IS_MAIN_VI(vi) && sc->vxlan_refcount > 0 && pi->vxlan_tcam_entry == false) { rc = t4_alloc_raw_mac_filt(sc, vi->viid, match_all_mac, match_all_mac, sc->rawf_base + pi->port_id, 1, pi->port_id, true); if (rc < 0) { rc = -rc; if_printf(ifp, "failed to add VXLAN TCAM entry: %d.\n", rc); } else { MPASS(rc == sc->rawf_base + pi->port_id); rc = 0; pi->vxlan_tcam_entry = true; } } return (rc); } /* * {begin|end}_synchronized_op must be called from the same thread. */ int begin_synchronized_op(struct adapter *sc, struct vi_info *vi, int flags, char *wmesg) { int rc, pri; #ifdef WITNESS /* the caller thinks it's ok to sleep, but is it really? */ if (flags & SLEEP_OK) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "begin_synchronized_op"); #endif if (flags & INTR_OK) pri = PCATCH; else pri = 0; ADAPTER_LOCK(sc); for (;;) { if (vi && IS_DOOMED(vi)) { rc = ENXIO; goto done; } if (!IS_BUSY(sc)) { rc = 0; break; } if (!(flags & SLEEP_OK)) { rc = EBUSY; goto done; } if (mtx_sleep(&sc->flags, &sc->sc_lock, pri, wmesg, 0)) { rc = EINTR; goto done; } } KASSERT(!IS_BUSY(sc), ("%s: controller busy.", __func__)); SET_BUSY(sc); #ifdef INVARIANTS sc->last_op = wmesg; sc->last_op_thr = curthread; sc->last_op_flags = flags; #endif done: if (!(flags & HOLD_LOCK) || rc) ADAPTER_UNLOCK(sc); return (rc); } /* * Tell if_ioctl and if_init that the VI is going away. This is a * special variant of begin_synchronized_op and must be paired with a * call to end_synchronized_op. */ void doom_vi(struct adapter *sc, struct vi_info *vi) { ADAPTER_LOCK(sc); SET_DOOMED(vi); wakeup(&sc->flags); while (IS_BUSY(sc)) mtx_sleep(&sc->flags, &sc->sc_lock, 0, "t4detach", 0); SET_BUSY(sc); #ifdef INVARIANTS sc->last_op = "t4detach"; sc->last_op_thr = curthread; sc->last_op_flags = 0; #endif ADAPTER_UNLOCK(sc); } /* * {begin|end}_synchronized_op must be called from the same thread.
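 *
 * Typical usage (a sketch; "t4xxxx" is a made-up wait message):
 *
 *	rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4xxxx");
 *	if (rc == 0) {
 *		... interact with the hardware ...
 *		end_synchronized_op(sc, 0);
 *	}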
*/ void end_synchronized_op(struct adapter *sc, int flags) { if (flags & LOCK_HELD) ADAPTER_LOCK_ASSERT_OWNED(sc); else ADAPTER_LOCK(sc); KASSERT(IS_BUSY(sc), ("%s: controller not busy.", __func__)); CLR_BUSY(sc); wakeup(&sc->flags); ADAPTER_UNLOCK(sc); } static int cxgbe_init_synchronized(struct vi_info *vi) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct ifnet *ifp = vi->ifp; int rc = 0, i; struct sge_txq *txq; ASSERT_SYNCHRONIZED_OP(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) return (0); /* already running */ if (!(sc->flags & FULL_INIT_DONE) && ((rc = adapter_init(sc)) != 0)) return (rc); /* error message displayed already */ if (!(vi->flags & VI_INIT_DONE) && ((rc = vi_init(vi)) != 0)) return (rc); /* error message displayed already */ rc = update_mac_settings(ifp, XGMAC_ALL); if (rc) goto done; /* error message displayed already */ PORT_LOCK(pi); if (pi->up_vis == 0) { t4_update_port_info(pi); fixup_link_config(pi); build_medialist(pi); apply_link_config(pi); } rc = -t4_enable_vi(sc, sc->mbox, vi->viid, true, true); if (rc != 0) { if_printf(ifp, "enable_vi failed: %d\n", rc); PORT_UNLOCK(pi); goto done; } /* * Can't fail from this point onwards. Review cxgbe_uninit_synchronized * if this changes. */ for_each_txq(vi, i, txq) { TXQ_LOCK(txq); txq->eq.flags |= EQ_ENABLED; TXQ_UNLOCK(txq); } /* * The first iq of the first port to come up is used for tracing. */ if (sc->traceq < 0 && IS_MAIN_VI(vi)) { sc->traceq = sc->sge.rxq[vi->first_rxq].iq.abs_id; t4_write_reg(sc, is_t4(sc) ? A_MPS_TRC_RSS_CONTROL : A_MPS_T5_TRC_RSS_CONTROL, V_RSSCONTROL(pi->tx_chan) | V_QUEUENUMBER(sc->traceq)); pi->flags |= HAS_TRACEQ; } /* all ok */ pi->up_vis++; ifp->if_drv_flags |= IFF_DRV_RUNNING; if (pi->link_cfg.link_ok) t4_os_link_changed(pi); PORT_UNLOCK(pi); mtx_lock(&vi->tick_mtx); if (ifp->if_get_counter == vi_get_counter) callout_reset(&vi->tick, hz, vi_tick, vi); else callout_reset(&vi->tick, hz, cxgbe_tick, vi); mtx_unlock(&vi->tick_mtx); done: if (rc != 0) cxgbe_uninit_synchronized(vi); return (rc); } /* * Idempotent. */ static int cxgbe_uninit_synchronized(struct vi_info *vi) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct ifnet *ifp = vi->ifp; int rc, i; struct sge_txq *txq; ASSERT_SYNCHRONIZED_OP(sc); if (!(vi->flags & VI_INIT_DONE)) { if (__predict_false(ifp->if_drv_flags & IFF_DRV_RUNNING)) { KASSERT(0, ("uninited VI is running")); if_printf(ifp, "uninited VI with running ifnet. " "vi->flags 0x%016lx, if_flags 0x%08x, " "if_drv_flags 0x%08x\n", vi->flags, ifp->if_flags, ifp->if_drv_flags); } return (0); } /* * Disable the VI so that all its data in either direction is discarded * by the MPS. Leave everything else (the queues, interrupts, and 1Hz * tick) intact as the TP can deliver negative advice or data that it's * holding in its RAM (for an offloaded connection) even after the VI is * disabled. 
*/ rc = -t4_enable_vi(sc, sc->mbox, vi->viid, false, false); if (rc) { if_printf(ifp, "disable_vi failed: %d\n", rc); return (rc); } for_each_txq(vi, i, txq) { TXQ_LOCK(txq); txq->eq.flags &= ~EQ_ENABLED; TXQ_UNLOCK(txq); } mtx_lock(&vi->tick_mtx); callout_stop(&vi->tick); mtx_unlock(&vi->tick_mtx); PORT_LOCK(pi); if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { PORT_UNLOCK(pi); return (0); } ifp->if_drv_flags &= ~IFF_DRV_RUNNING; pi->up_vis--; if (pi->up_vis > 0) { PORT_UNLOCK(pi); return (0); } pi->link_cfg.link_ok = false; pi->link_cfg.speed = 0; pi->link_cfg.link_down_rc = 255; t4_os_link_changed(pi); PORT_UNLOCK(pi); return (0); } /* * It is ok for this function to fail midway and return right away. t4_detach * will walk the entire sc->irq list and clean up whatever is valid. */ int t4_setup_intr_handlers(struct adapter *sc) { int rc, rid, p, q, v; char s[8]; struct irq *irq; struct port_info *pi; struct vi_info *vi; struct sge *sge = &sc->sge; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #ifdef DEV_NETMAP struct sge_nm_rxq *nm_rxq; #endif #ifdef RSS int nbuckets = rss_getnumbuckets(); #endif /* * Setup interrupts. */ irq = &sc->irq[0]; rid = sc->intr_type == INTR_INTX ? 0 : 1; if (forwarding_intr_to_fwq(sc)) return (t4_alloc_irq(sc, irq, rid, t4_intr_all, sc, "all")); /* Multiple interrupts. */ if (sc->flags & IS_VF) KASSERT(sc->intr_count >= T4VF_EXTRA_INTR + sc->params.nports, ("%s: too few intr.", __func__)); else KASSERT(sc->intr_count >= T4_EXTRA_INTR + sc->params.nports, ("%s: too few intr.", __func__)); /* The first one is always error intr on PFs */ if (!(sc->flags & IS_VF)) { rc = t4_alloc_irq(sc, irq, rid, t4_intr_err, sc, "err"); if (rc != 0) return (rc); irq++; rid++; } /* The second one is always the firmware event queue (first on VFs) */ rc = t4_alloc_irq(sc, irq, rid, t4_intr_evt, &sge->fwq, "evt"); if (rc != 0) return (rc); irq++; rid++; for_each_port(sc, p) { pi = sc->port[p]; for_each_vi(pi, v, vi) { vi->first_intr = rid - 1; if (vi->nnmrxq > 0) { int n = max(vi->nrxq, vi->nnmrxq); rxq = &sge->rxq[vi->first_rxq]; #ifdef DEV_NETMAP nm_rxq = &sge->nm_rxq[vi->first_nm_rxq]; #endif for (q = 0; q < n; q++) { snprintf(s, sizeof(s), "%x%c%x", p, 'a' + v, q); if (q < vi->nrxq) irq->rxq = rxq++; #ifdef DEV_NETMAP if (q < vi->nnmrxq) irq->nm_rxq = nm_rxq++; if (irq->nm_rxq != NULL && irq->rxq == NULL) { /* Netmap rx only */ rc = t4_alloc_irq(sc, irq, rid, t4_nm_intr, irq->nm_rxq, s); } if (irq->nm_rxq != NULL && irq->rxq != NULL) { /* NIC and Netmap rx */ rc = t4_alloc_irq(sc, irq, rid, t4_vi_intr, irq, s); } #endif if (irq->rxq != NULL && irq->nm_rxq == NULL) { /* NIC rx only */ rc = t4_alloc_irq(sc, irq, rid, t4_intr, irq->rxq, s); } if (rc != 0) return (rc); #ifdef RSS if (q < vi->nrxq) { bus_bind_intr(sc->dev, irq->res, rss_getcpu(q % nbuckets)); } #endif irq++; rid++; vi->nintr++; } } else { for_each_rxq(vi, q, rxq) { snprintf(s, sizeof(s), "%x%c%x", p, 'a' + v, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, rxq, s); if (rc != 0) return (rc); #ifdef RSS bus_bind_intr(sc->dev, irq->res, rss_getcpu(q % nbuckets)); #endif irq++; rid++; vi->nintr++; } } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, q, ofld_rxq) { snprintf(s, sizeof(s), "%x%c%x", p, 'A' + v, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, ofld_rxq, s); if (rc != 0) return (rc); irq++; rid++; vi->nintr++; } #endif } } MPASS(irq == &sc->irq[sc->intr_count]); return (0); } static void write_global_rss_key(struct adapter *sc) { #ifdef RSS int i; uint32_t raw_rss_key[RSS_KEYSIZE / 
sizeof(uint32_t)]; uint32_t rss_key[RSS_KEYSIZE / sizeof(uint32_t)]; CTASSERT(RSS_KEYSIZE == 40); rss_getkey((void *)&raw_rss_key[0]); for (i = 0; i < nitems(rss_key); i++) { rss_key[i] = htobe32(raw_rss_key[nitems(rss_key) - 1 - i]); } t4_write_rss_key(sc, &rss_key[0], -1, 1); #endif } /* * Idempotent. */ static int adapter_full_init(struct adapter *sc) { int rc, i; ASSERT_SYNCHRONIZED_OP(sc); if (!(sc->flags & ADAP_SYSCTL_CTX)) { sysctl_ctx_init(&sc->ctx); sc->flags |= ADAP_SYSCTL_CTX; } /* * queues that belong to the adapter (not any particular port). */ rc = t4_setup_adapter_queues(sc); if (rc != 0) return (rc); for (i = 0; i < nitems(sc->tq); i++) { if (sc->tq[i] != NULL) continue; sc->tq[i] = taskqueue_create("t4 taskq", M_NOWAIT, taskqueue_thread_enqueue, &sc->tq[i]); if (sc->tq[i] == NULL) { CH_ERR(sc, "failed to allocate task queue %d\n", i); return (ENOMEM); } taskqueue_start_threads(&sc->tq[i], 1, PI_NET, "%s tq%d", device_get_nameunit(sc->dev), i); } if (!(sc->flags & IS_VF)) { write_global_rss_key(sc); t4_intr_enable(sc); } #ifdef KERN_TLS if (is_ktls(sc)) callout_reset_sbt(&sc->ktls_tick, SBT_1MS, 0, ktls_tick, sc, C_HARDCLOCK); #endif return (0); } int adapter_init(struct adapter *sc) { int rc; ASSERT_SYNCHRONIZED_OP(sc); ADAPTER_LOCK_ASSERT_NOTOWNED(sc); KASSERT((sc->flags & FULL_INIT_DONE) == 0, ("%s: FULL_INIT_DONE already", __func__)); rc = adapter_full_init(sc); if (rc != 0) adapter_full_uninit(sc); else sc->flags |= FULL_INIT_DONE; return (rc); } /* * Idempotent. */ static void adapter_full_uninit(struct adapter *sc) { int i; /* Do this before freeing the adapter queues. */ if (sc->flags & ADAP_SYSCTL_CTX) { sysctl_ctx_free(&sc->ctx); sc->flags &= ~ADAP_SYSCTL_CTX; } t4_teardown_adapter_queues(sc); for (i = 0; i < nitems(sc->tq) && sc->tq[i]; i++) { taskqueue_free(sc->tq[i]); sc->tq[i] = NULL; } sc->flags &= ~FULL_INIT_DONE; } #ifdef RSS #define SUPPORTED_RSS_HASHTYPES (RSS_HASHTYPE_RSS_IPV4 | \ RSS_HASHTYPE_RSS_TCP_IPV4 | RSS_HASHTYPE_RSS_IPV6 | \ RSS_HASHTYPE_RSS_TCP_IPV6 | RSS_HASHTYPE_RSS_UDP_IPV4 | \ RSS_HASHTYPE_RSS_UDP_IPV6) /* Translates kernel hash types to hardware. */ static int hashconfig_to_hashen(int hashconfig) { int hashen = 0; if (hashconfig & RSS_HASHTYPE_RSS_IPV4) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN; if (hashconfig & RSS_HASHTYPE_RSS_IPV6) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN; if (hashconfig & RSS_HASHTYPE_RSS_UDP_IPV4) { hashen |= F_FW_RSS_VI_CONFIG_CMD_UDPEN | F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN; } if (hashconfig & RSS_HASHTYPE_RSS_UDP_IPV6) { hashen |= F_FW_RSS_VI_CONFIG_CMD_UDPEN | F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN; } if (hashconfig & RSS_HASHTYPE_RSS_TCP_IPV4) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN; if (hashconfig & RSS_HASHTYPE_RSS_TCP_IPV6) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN; return (hashen); } /* Translates hardware hash types to kernel. */ static int hashen_to_hashconfig(int hashen) { int hashconfig = 0; if (hashen & F_FW_RSS_VI_CONFIG_CMD_UDPEN) { /* * If UDP hashing was enabled it must have been enabled for * either IPv4 or IPv6 (inclusive or). Enabling UDP without * enabling any 4-tuple hash is nonsense configuration. 
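 * E.g. UDPEN | IP4FOURTUPEN decodes to RSS_HASHTYPE_RSS_UDP_IPV4 below
 * (and the 4-tuple bit on its own also implies RSS_HASHTYPE_RSS_TCP_IPV4).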
*/ MPASS(hashen & (F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN)); if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_UDP_IPV4; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_UDP_IPV6; } if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_TCP_IPV4; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_TCP_IPV6; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN) hashconfig |= RSS_HASHTYPE_RSS_IPV4; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN) hashconfig |= RSS_HASHTYPE_RSS_IPV6; return (hashconfig); } #endif /* * Idempotent. */ static int vi_full_init(struct vi_info *vi) { struct adapter *sc = vi->adapter; struct sge_rxq *rxq; int rc, i, j; #ifdef RSS int nbuckets = rss_getnumbuckets(); int hashconfig = rss_gethashconfig(); int extra; #endif ASSERT_SYNCHRONIZED_OP(sc); if (!(vi->flags & VI_SYSCTL_CTX)) { sysctl_ctx_init(&vi->ctx); vi->flags |= VI_SYSCTL_CTX; } /* * Allocate tx/rx/fl queues for this VI. */ rc = t4_setup_vi_queues(vi); if (rc != 0) return (rc); /* * Setup RSS for this VI. Save a copy of the RSS table for later use. */ if (vi->nrxq > vi->rss_size) { CH_ALERT(vi, "nrxq (%d) > hw RSS table size (%d); " "some queues will never receive traffic.\n", vi->nrxq, vi->rss_size); } else if (vi->rss_size % vi->nrxq) { CH_ALERT(vi, "nrxq (%d), hw RSS table size (%d); " "expect uneven traffic distribution.\n", vi->nrxq, vi->rss_size); } #ifdef RSS if (vi->nrxq != nbuckets) { CH_ALERT(vi, "nrxq (%d) != kernel RSS buckets (%d);" "performance will be impacted.\n", vi->nrxq, nbuckets); } #endif if (vi->rss == NULL) vi->rss = malloc(vi->rss_size * sizeof (*vi->rss), M_CXGBE, M_ZERO | M_WAITOK); for (i = 0; i < vi->rss_size;) { #ifdef RSS j = rss_get_indirection_to_bucket(i); j %= vi->nrxq; rxq = &sc->sge.rxq[vi->first_rxq + j]; vi->rss[i++] = rxq->iq.abs_id; #else for_each_rxq(vi, j, rxq) { vi->rss[i++] = rxq->iq.abs_id; if (i == vi->rss_size) break; } #endif } rc = -t4_config_rss_range(sc, sc->mbox, vi->viid, 0, vi->rss_size, vi->rss, vi->rss_size); if (rc != 0) { CH_ERR(vi, "rss_config failed: %d\n", rc); return (rc); } #ifdef RSS vi->hashen = hashconfig_to_hashen(hashconfig); /* * We may have had to enable some hashes even though the global config * wants them disabled. This is a potential problem that must be * reported to the user. */ extra = hashen_to_hashconfig(vi->hashen) ^ hashconfig; /* * If we consider only the supported hash types, then the enabled hashes * are a superset of the requested hashes. In other words, there cannot * be any supported hash that was requested but not enabled, but there * can be hashes that were not requested but had to be enabled. 
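 *
 * Example: requesting only RSS_HASHTYPE_RSS_UDP_IPV4 forces IP4FOURTUPEN
 * on, which also enables TCP/IPv4 4-tuple hashing; the surplus shows up in
 * 'extra' and is reported via CH_ALERT below.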
*/ extra &= SUPPORTED_RSS_HASHTYPES; MPASS((extra & hashconfig) == 0); if (extra) { CH_ALERT(vi, "global RSS config (0x%x) cannot be accommodated.\n", hashconfig); } if (extra & RSS_HASHTYPE_RSS_IPV4) CH_ALERT(vi, "IPv4 2-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_TCP_IPV4) CH_ALERT(vi, "TCP/IPv4 4-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_IPV6) CH_ALERT(vi, "IPv6 2-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_TCP_IPV6) CH_ALERT(vi, "TCP/IPv6 4-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_UDP_IPV4) CH_ALERT(vi, "UDP/IPv4 4-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_UDP_IPV6) CH_ALERT(vi, "UDP/IPv6 4-tuple hashing forced on.\n"); #else vi->hashen = F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN | F_FW_RSS_VI_CONFIG_CMD_UDPEN; #endif rc = -t4_config_vi_rss(sc, sc->mbox, vi->viid, vi->hashen, vi->rss[0], 0, 0); if (rc != 0) { CH_ERR(vi, "rss hash/defaultq config failed: %d\n", rc); return (rc); } return (0); } int vi_init(struct vi_info *vi) { int rc; ASSERT_SYNCHRONIZED_OP(vi->adapter); KASSERT((vi->flags & VI_INIT_DONE) == 0, ("%s: VI_INIT_DONE already", __func__)); rc = vi_full_init(vi); if (rc != 0) vi_full_uninit(vi); else vi->flags |= VI_INIT_DONE; return (rc); } /* * Idempotent. */ static void vi_full_uninit(struct vi_info *vi) { if (vi->flags & VI_INIT_DONE) { quiesce_vi(vi); free(vi->rss, M_CXGBE); free(vi->nm_rss, M_CXGBE); } /* Do this before freeing the VI queues. */ if (vi->flags & VI_SYSCTL_CTX) { sysctl_ctx_free(&vi->ctx); vi->flags &= ~VI_SYSCTL_CTX; } t4_teardown_vi_queues(vi); vi->flags &= ~VI_INIT_DONE; } static void quiesce_txq(struct sge_txq *txq) { struct sge_eq *eq = &txq->eq; struct sge_qstat *spg = (void *)&eq->desc[eq->sidx]; MPASS(eq->flags & EQ_SW_ALLOCATED); MPASS(!(eq->flags & EQ_ENABLED)); /* Wait for the mp_ring to empty. */ while (!mp_ring_is_idle(txq->r)) { mp_ring_check_drainage(txq->r, 4096); pause("rquiesce", 1); } MPASS(txq->txp.npkt == 0); if (eq->flags & EQ_HW_ALLOCATED) { /* * Hardware is alive and working normally. Wait for it to * finish and then wait for the driver to catch up and reclaim * all descriptors. */ while (spg->cidx != htobe16(eq->pidx)) pause("equiesce", 1); while (eq->cidx != eq->pidx) pause("dquiesce", 1); } else { /* * Hardware is unavailable. Discard all pending tx and reclaim * descriptors directly. */ TXQ_LOCK(txq); while (eq->cidx != eq->pidx) { struct mbuf *m, *nextpkt; struct tx_sdesc *txsd; txsd = &txq->sdesc[eq->cidx]; for (m = txsd->m; m != NULL; m = nextpkt) { nextpkt = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); } IDXINCR(eq->cidx, txsd->desc_used, eq->sidx); } spg->pidx = spg->cidx = htobe16(eq->cidx); TXQ_UNLOCK(txq); } } static void quiesce_wrq(struct sge_wrq *wrq) { /* XXXTX */ } static void quiesce_iq_fl(struct adapter *sc, struct sge_iq *iq, struct sge_fl *fl) { /* Synchronize with the interrupt handler */ while (!atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_DISABLED)) pause("iqfree", 1); if (fl != NULL) { MPASS(iq->flags & IQ_HAS_FL); mtx_lock(&sc->sfl_lock); FL_LOCK(fl); fl->flags |= FL_DOOMED; FL_UNLOCK(fl); callout_stop(&sc->sfl_callout); mtx_unlock(&sc->sfl_lock); KASSERT((fl->flags & FL_STARVING) == 0, ("%s: still starving", __func__)); /* Release all buffers if hardware is no longer available. */ if (!(iq->flags & IQ_HW_ALLOCATED)) free_fl_buffers(sc, fl); } } /* * Wait for all activity on all the queues of the VI to complete. 
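 * (cxgbe_uninit_synchronized(), for instance, disables the VI and clears
 * EQ_ENABLED on every txq before this runs.)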
It is assumed * that no new work is being enqueued by the hardware or the driver. That part * should be arranged before calling this function. */ static void quiesce_vi(struct vi_info *vi) { int i; struct adapter *sc = vi->adapter; struct sge_rxq *rxq; struct sge_txq *txq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #if defined(TCP_OFFLOAD) || defined(RATELIMIT) struct sge_ofld_txq *ofld_txq; #endif if (!(vi->flags & VI_INIT_DONE)) return; for_each_txq(vi, i, txq) { quiesce_txq(txq); } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) for_each_ofld_txq(vi, i, ofld_txq) { quiesce_wrq(&ofld_txq->wrq); } #endif for_each_rxq(vi, i, rxq) { quiesce_iq_fl(sc, &rxq->iq, &rxq->fl); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { quiesce_iq_fl(sc, &ofld_rxq->iq, &ofld_rxq->fl); } #endif } static int t4_alloc_irq(struct adapter *sc, struct irq *irq, int rid, driver_intr_t *handler, void *arg, char *name) { int rc; irq->rid = rid; irq->res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &irq->rid, RF_SHAREABLE | RF_ACTIVE); if (irq->res == NULL) { device_printf(sc->dev, "failed to allocate IRQ for rid %d, name %s.\n", rid, name); return (ENOMEM); } rc = bus_setup_intr(sc->dev, irq->res, INTR_MPSAFE | INTR_TYPE_NET, NULL, handler, arg, &irq->tag); if (rc != 0) { device_printf(sc->dev, "failed to setup interrupt for rid %d, name %s: %d\n", rid, name, rc); } else if (name) bus_describe_intr(sc->dev, irq->res, irq->tag, "%s", name); return (rc); } static int t4_free_irq(struct adapter *sc, struct irq *irq) { if (irq->tag) bus_teardown_intr(sc->dev, irq->res, irq->tag); if (irq->res) bus_release_resource(sc->dev, SYS_RES_IRQ, irq->rid, irq->res); bzero(irq, sizeof(*irq)); return (0); } static void get_regs(struct adapter *sc, struct t4_regdump *regs, uint8_t *buf) { regs->version = chip_id(sc) | chip_rev(sc) << 10; t4_get_regs(sc, buf, regs->len); } #define A_PL_INDIR_CMD 0x1f8 #define S_PL_AUTOINC 31 #define M_PL_AUTOINC 0x1U #define V_PL_AUTOINC(x) ((x) << S_PL_AUTOINC) #define G_PL_AUTOINC(x) (((x) >> S_PL_AUTOINC) & M_PL_AUTOINC) #define S_PL_VFID 20 #define M_PL_VFID 0xffU #define V_PL_VFID(x) ((x) << S_PL_VFID) #define G_PL_VFID(x) (((x) >> S_PL_VFID) & M_PL_VFID) #define S_PL_ADDR 0 #define M_PL_ADDR 0xfffffU #define V_PL_ADDR(x) ((x) << S_PL_ADDR) #define G_PL_ADDR(x) (((x) >> S_PL_ADDR) & M_PL_ADDR) #define A_PL_INDIR_DATA 0x1fc static uint64_t read_vf_stat(struct adapter *sc, u_int vin, int reg) { u32 stats[2]; if (sc->flags & IS_VF) { stats[0] = t4_read_reg(sc, VF_MPS_REG(reg)); stats[1] = t4_read_reg(sc, VF_MPS_REG(reg + 4)); } else { mtx_assert(&sc->reg_lock, MA_OWNED); t4_write_reg(sc, A_PL_INDIR_CMD, V_PL_AUTOINC(1) | V_PL_VFID(vin) | V_PL_ADDR(VF_MPS_REG(reg))); stats[0] = t4_read_reg(sc, A_PL_INDIR_DATA); stats[1] = t4_read_reg(sc, A_PL_INDIR_DATA); } return (((uint64_t)stats[1]) << 32 | stats[0]); } static void t4_get_vi_stats(struct adapter *sc, u_int vin, struct fw_vi_stats_vf *stats) { #define GET_STAT(name) \ read_vf_stat(sc, vin, A_MPS_VF_STAT_##name##_L) if (!(sc->flags & IS_VF)) mtx_lock(&sc->reg_lock); stats->tx_bcast_bytes = GET_STAT(TX_VF_BCAST_BYTES); stats->tx_bcast_frames = GET_STAT(TX_VF_BCAST_FRAMES); stats->tx_mcast_bytes = GET_STAT(TX_VF_MCAST_BYTES); stats->tx_mcast_frames = GET_STAT(TX_VF_MCAST_FRAMES); stats->tx_ucast_bytes = GET_STAT(TX_VF_UCAST_BYTES); stats->tx_ucast_frames = GET_STAT(TX_VF_UCAST_FRAMES); stats->tx_drop_frames = GET_STAT(TX_VF_DROP_FRAMES); stats->tx_offload_bytes = GET_STAT(TX_VF_OFFLOAD_BYTES); stats->tx_offload_frames = 
GET_STAT(TX_VF_OFFLOAD_FRAMES); stats->rx_bcast_bytes = GET_STAT(RX_VF_BCAST_BYTES); stats->rx_bcast_frames = GET_STAT(RX_VF_BCAST_FRAMES); stats->rx_mcast_bytes = GET_STAT(RX_VF_MCAST_BYTES); stats->rx_mcast_frames = GET_STAT(RX_VF_MCAST_FRAMES); stats->rx_ucast_bytes = GET_STAT(RX_VF_UCAST_BYTES); stats->rx_ucast_frames = GET_STAT(RX_VF_UCAST_FRAMES); stats->rx_err_frames = GET_STAT(RX_VF_ERR_FRAMES); if (!(sc->flags & IS_VF)) mtx_unlock(&sc->reg_lock); #undef GET_STAT } static void t4_clr_vi_stats(struct adapter *sc, u_int vin) { int reg; t4_write_reg(sc, A_PL_INDIR_CMD, V_PL_AUTOINC(1) | V_PL_VFID(vin) | V_PL_ADDR(VF_MPS_REG(A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L))); for (reg = A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L; reg <= A_MPS_VF_STAT_RX_VF_ERR_FRAMES_H; reg += 4) t4_write_reg(sc, A_PL_INDIR_DATA, 0); } static void vi_refresh_stats(struct vi_info *vi) { struct timeval tv; const struct timeval interval = {0, 250000}; /* 250ms */ mtx_assert(&vi->tick_mtx, MA_OWNED); if (!(vi->flags & VI_INIT_DONE) || vi->flags & VI_SKIP_STATS) return; getmicrotime(&tv); timevalsub(&tv, &interval); if (timevalcmp(&tv, &vi->last_refreshed, <)) return; t4_get_vi_stats(vi->adapter, vi->vin, &vi->stats); getmicrotime(&vi->last_refreshed); } static void cxgbe_refresh_stats(struct vi_info *vi) { u_int i, v, tnl_cong_drops, chan_map; struct timeval tv; const struct timeval interval = {0, 250000}; /* 250ms */ struct port_info *pi; struct adapter *sc; mtx_assert(&vi->tick_mtx, MA_OWNED); if (vi->flags & VI_SKIP_STATS) return; getmicrotime(&tv); timevalsub(&tv, &interval); if (timevalcmp(&tv, &vi->last_refreshed, <)) return; pi = vi->pi; sc = vi->adapter; tnl_cong_drops = 0; t4_get_port_stats(sc, pi->tx_chan, &pi->stats); chan_map = pi->rx_e_chan_map; while (chan_map) { i = ffs(chan_map) - 1; mtx_lock(&sc->reg_lock); t4_read_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v, 1, A_TP_MIB_TNL_CNG_DROP_0 + i); mtx_unlock(&sc->reg_lock); tnl_cong_drops += v; chan_map &= ~(1 << i); } pi->tnl_cong_drops = tnl_cong_drops; getmicrotime(&vi->last_refreshed); } static void cxgbe_tick(void *arg) { struct vi_info *vi = arg; MPASS(IS_MAIN_VI(vi)); mtx_assert(&vi->tick_mtx, MA_OWNED); cxgbe_refresh_stats(vi); callout_schedule(&vi->tick, hz); } static void vi_tick(void *arg) { struct vi_info *vi = arg; mtx_assert(&vi->tick_mtx, MA_OWNED); vi_refresh_stats(vi); callout_schedule(&vi->tick, hz); } /* * Should match fw_caps_config_ enums in t4fw_interface.h */ static char *caps_decoder[] = { "\20\001IPMI\002NCSI", /* 0: NBM */ "\20\001PPP\002QFC\003DCBX", /* 1: link */ "\20\001INGRESS\002EGRESS", /* 2: switch */ "\20\001NIC\002VM\003IDS\004UM\005UM_ISGL" /* 3: NIC */ "\006HASHFILTER\007ETHOFLD", "\20\001TOE", /* 4: TOE */ "\20\001RDDP\002RDMAC", /* 5: RDMA */ "\20\001INITIATOR_PDU\002TARGET_PDU" /* 6: iSCSI */ "\003INITIATOR_CNXOFLD\004TARGET_CNXOFLD" "\005INITIATOR_SSNOFLD\006TARGET_SSNOFLD" "\007T10DIF" "\010INITIATOR_CMDOFLD\011TARGET_CMDOFLD", "\20\001LOOKASIDE\002TLSKEYS\003IPSEC_INLINE" /* 7: Crypto */ "\004TLS_HW", "\20\001INITIATOR\002TARGET\003CTRL_OFLD" /* 8: FCoE */ "\004PO_INITIATOR\005PO_TARGET", }; void t4_sysctls(struct adapter *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children, *c0; static char *doorbells = {"\20\1UDB\2WCWR\3UDBWC\4KDB"}; ctx = device_get_sysctl_ctx(sc->dev); /* * dev.t4nex.X. 
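 * (X is the adapter instance; e.g. "sysctl dev.t4nex.0.firmware_version"
 * reads the firmware version of the first adapter.)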
*/ oid = device_get_sysctl_tree(sc->dev); c0 = children = SYSCTL_CHILDREN(oid); sc->sc_do_rxcopy = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "do_rx_copy", CTLFLAG_RW, &sc->sc_do_rxcopy, 1, "Do RX copy of small frames"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nports", CTLFLAG_RD, NULL, sc->params.nports, "# of ports"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "doorbells", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, doorbells, (uintptr_t)&sc->doorbells, sysctl_bitfield_8b, "A", "available doorbells"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "core_clock", CTLFLAG_RD, NULL, sc->params.vpd.cclk, "core clock frequency (in KHz)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_timers", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc->params.sge.timer_val, sizeof(sc->params.sge.timer_val), sysctl_int_array, "A", "interrupt holdoff timer values (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pkt_counts", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc->params.sge.counter_val, sizeof(sc->params.sge.counter_val), sysctl_int_array, "A", "interrupt holdoff packet counter values"); t4_sge_sysctls(sc, ctx, children); sc->lro_timeout = 100; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lro_timeout", CTLFLAG_RW, &sc->lro_timeout, 0, "lro inactive-flush timeout (in us)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dflags", CTLFLAG_RW, &sc->debug_flags, 0, "flags to enable runtime debugging"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "tp_version", CTLFLAG_RD, sc->tp_version, 0, "TP microcode version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version, 0, "firmware version"); if (sc->flags & IS_VF) return; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "hw_revision", CTLFLAG_RD, NULL, chip_rev(sc), "chip hardware revision"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "sn", CTLFLAG_RD, sc->params.vpd.sn, 0, "serial number"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "pn", CTLFLAG_RD, sc->params.vpd.pn, 0, "part number"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "ec", CTLFLAG_RD, sc->params.vpd.ec, 0, "engineering change"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "md_version", CTLFLAG_RD, sc->params.vpd.md, 0, "manufacturing diags version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "na", CTLFLAG_RD, sc->params.vpd.na, 0, "network address"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "er_version", CTLFLAG_RD, sc->er_version, 0, "expansion ROM version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "bs_version", CTLFLAG_RD, sc->bs_version, 0, "bootstrap firmware version"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "scfg_version", CTLFLAG_RD, NULL, sc->params.scfg_vers, "serial config version"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "vpd_version", CTLFLAG_RD, NULL, sc->params.vpd_vers, "VPD version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "cf", CTLFLAG_RD, sc->cfg_file, 0, "configuration file"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cfcsum", CTLFLAG_RD, NULL, sc->cfcsum, "config file checksum"); #define SYSCTL_CAP(name, n, text) \ SYSCTL_ADD_PROC(ctx, children, OID_AUTO, #name, \ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, caps_decoder[n], \ (uintptr_t)&sc->name, sysctl_bitfield_16b, "A", \ "available " text " capabilities") SYSCTL_CAP(nbmcaps, 0, "NBM"); SYSCTL_CAP(linkcaps, 1, "link"); SYSCTL_CAP(switchcaps, 2, "switch"); SYSCTL_CAP(niccaps, 3, "NIC"); SYSCTL_CAP(toecaps, 4, "TCP offload"); SYSCTL_CAP(rdmacaps, 5, "RDMA"); SYSCTL_CAP(iscsicaps, 6, "iSCSI"); SYSCTL_CAP(cryptocaps, 7, "crypto"); SYSCTL_CAP(fcoecaps, 8, "FCoE"); #undef SYSCTL_CAP 
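/*
 * For reference: SYSCTL_CAP(toecaps, 4, "TCP offload") above expands to a
 * read-only string node that renders sc->toecaps through caps_decoder[4]
 * and sysctl_bitfield_16b, printing something like "1<TOE>" when only the
 * TOE bit is set.
 */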
SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nfilters", CTLFLAG_RD, NULL, sc->tids.nftids, "number of filters"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_temperature, "I", "chip temperature (in Celsius)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "reset_sensor", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, sysctl_reset_sensor, "I", "reset the chip's temperature sensor."); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "loadavg", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_loadavg, "A", "microprocessor load averages (debug firmwares only)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "core_vdd", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_vdd, "I", "core Vdd (in mV)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "local_cpus", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, LOCAL_CPUS, sysctl_cpus, "A", "local CPUs"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_cpus", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, INTR_CPUS, sysctl_cpus, "A", "preferred CPUs for interrupts"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "swintr", CTLFLAG_RW, &sc->swintr, 0, "software triggered interrupts"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "reset", CTLTYPE_INT | CTLFLAG_RW, sc, 0, sysctl_reset, "I", "1 = reset adapter, 0 = zero reset counter"); /* * dev.t4nex.X.misc. Marked CTLFLAG_SKIP to avoid information overload. */ oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "misc", CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, "logs and miscellaneous information"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cctrl", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_cctrl, "A", "congestion control"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp0", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_cim_ibq_obq, "A", "CIM IBQ 0 (TP0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp1", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 1, sysctl_cim_ibq_obq, "A", "CIM IBQ 1 (TP1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ulp", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 2, sysctl_cim_ibq_obq, "A", "CIM IBQ 2 (ULP)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge0", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 3, sysctl_cim_ibq_obq, "A", "CIM IBQ 3 (SGE0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge1", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 4, sysctl_cim_ibq_obq, "A", "CIM IBQ 4 (SGE1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ncsi", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 5, sysctl_cim_ibq_obq, "A", "CIM IBQ 5 (NCSI)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_la", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_cim_la, "A", "CIM logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ma_la", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_cim_ma_la, "A", "CIM MA logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp0", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 0 (ULP0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp1", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 1 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 1 (ULP1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp2", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 2 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 2 (ULP2)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp3", 
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 3 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 3 (ULP3)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 4 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 4 (SGE)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ncsi", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 5 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 5 (NCSI)"); if (chip_id(sc) > CHELSIO_T4) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge0_rx", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 6 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 6 (SGE0-RX)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge1_rx", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 7 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 7 (SGE1-RX)"); } SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_pif_la", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_cim_pif_la, "A", "CIM PIF logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_qcfg", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_cim_qcfg, "A", "CIM queue configuration"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cpl_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_cpl_stats, "A", "CPL statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ddp_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_ddp_stats, "A", "non-TCP DDP statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tid_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tid_stats, "A", "tid stats"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "devlog", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_devlog, "A", "firmware's device log"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fcoe_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_fcoe_stats, "A", "FCoE statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "hw_sched", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_hw_sched, "A", "hardware scheduler "); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "l2t", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_l2t, "A", "hardware L2 table"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "smt", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_smt, "A", "hardware source MAC table"); #ifdef INET6 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "clip", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_clip, "A", "active CLIP table entries"); #endif SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "lb_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_lb_stats, "A", "loopback statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "meminfo", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_meminfo, "A", "memory regions"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "mps_tcam", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, chip_id(sc) <= CHELSIO_T5 ? 
sysctl_mps_tcam : sysctl_mps_tcam_t6, "A", "MPS TCAM entries"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "path_mtus", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_path_mtus, "A", "path MTUs"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pm_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_pm_stats, "A", "PM statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_rdma_stats, "A", "RDMA statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tcp_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tcp_stats, "A", "TCP statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tids", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tids, "A", "TID information"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_err_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tp_err_stats, "A", "TP error statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tnl_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tnl_stats, "A", "TP tunnel statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la_mask", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, sysctl_tp_la_mask, "I", "TP logic analyzer event capture mask"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tp_la, "A", "TP logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tx_rate", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tx_rate, "A", "Tx rate"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ulprx_la", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_ulprx_la, "A", "ULPRX logic analyzer"); if (chip_id(sc) >= CHELSIO_T5) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "wcwr_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_wcwr_stats, "A", "write combined work requests"); } #ifdef KERN_TLS if (is_ktls(sc)) { /* * dev.t4nex.0.tls. */ oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "tls", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "KERN_TLS parameters"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "inline_keys", CTLFLAG_RW, &sc->tlst.inline_keys, 0, "Always pass TLS " "keys in work requests (1) or attempt to store TLS keys " "in card memory."); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "combo_wrs", CTLFLAG_RW, &sc->tlst.combo_wrs, 0, "Attempt to combine " "TCB field updates with TLS record work requests."); } #endif #ifdef TCP_OFFLOAD if (is_offload(sc)) { int i; char s[4]; /* * dev.t4nex.X.toe. 
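 * (TOE tunables for this adapter; e.g. dev.t4nex.0.toe.rx_coalesce is
 * backed by sc->tt.rx_coalesce below.)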
*/ oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "toe", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE parameters"); children = SYSCTL_CHILDREN(oid); sc->tt.cong_algorithm = -1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_algorithm", CTLFLAG_RW, &sc->tt.cong_algorithm, 0, "congestion control " "(-1 = default, 0 = reno, 1 = tahoe, 2 = newreno, " "3 = highspeed)"); sc->tt.sndbuf = -1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sndbuf", CTLFLAG_RW, &sc->tt.sndbuf, 0, "hardware send buffer"); sc->tt.ddp = 0; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp", CTLFLAG_RW | CTLFLAG_SKIP, &sc->tt.ddp, 0, ""); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_zcopy", CTLFLAG_RW, &sc->tt.ddp, 0, "Enable zero-copy aio_read(2)"); sc->tt.rx_coalesce = -1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_coalesce", CTLFLAG_RW, &sc->tt.rx_coalesce, 0, "receive coalescing"); sc->tt.tls = 0; SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tls", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, sysctl_tls, "I", "Inline TLS allowed"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tls_rx_ports", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, sysctl_tls_rx_ports, "I", "TCP ports that use inline TLS+TOE RX"); sc->tt.tls_rx_timeout = t4_toe_tls_rx_timeout; SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tls_rx_timeout", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, sysctl_tls_rx_timeout, "I", "Timeout in seconds to downgrade TLS sockets to plain TOE"); sc->tt.tx_align = -1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_align", CTLFLAG_RW, &sc->tt.tx_align, 0, "chop and align payload"); sc->tt.tx_zcopy = 0; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_zcopy", CTLFLAG_RW, &sc->tt.tx_zcopy, 0, "Enable zero-copy aio_write(2)"); sc->tt.cop_managed_offloading = !!t4_cop_managed_offloading; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cop_managed_offloading", CTLFLAG_RW, &sc->tt.cop_managed_offloading, 0, "COP (Connection Offload Policy) controls all TOE offload"); sc->tt.autorcvbuf_inc = 16 * 1024; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "autorcvbuf_inc", CTLFLAG_RW, &sc->tt.autorcvbuf_inc, 0, "autorcvbuf increment"); sc->tt.update_hc_on_pmtu_change = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "update_hc_on_pmtu_change", CTLFLAG_RW, &sc->tt.update_hc_on_pmtu_change, 0, "Update hostcache entry if the PMTU changes"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timer_tick", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tp_tick, "A", "TP timer tick (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timestamp_tick", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 1, sysctl_tp_tick, "A", "TCP timestamp tick (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_tick", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 2, sysctl_tp_tick, "A", "DACK tick (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_timer", CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tp_dack_timer, "IU", "DACK timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_min", CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_RXT_MIN, sysctl_tp_timer, "LU", "Minimum retransmit interval (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_max", CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_RXT_MAX, sysctl_tp_timer, "LU", "Maximum retransmit interval (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_min", CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_PERS_MIN, sysctl_tp_timer, "LU", "Persist timer min (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_max", CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_PERS_MAX, 
sysctl_tp_timer, "LU", "Persist timer max (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_idle", CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_KEEP_IDLE, sysctl_tp_timer, "LU", "Keepalive idle timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_interval", CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_KEEP_INTVL, sysctl_tp_timer, "LU", "Keepalive interval timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "initial_srtt", CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_INIT_SRTT, sysctl_tp_timer, "LU", "Initial SRTT (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "finwait2_timer", CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_FINWAIT2_TIMER, sysctl_tp_timer, "LU", "FINWAIT2 timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "syn_rexmt_count", CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, S_SYNSHIFTMAX, sysctl_tp_shift_cnt, "IU", "Number of SYN retransmissions before abort"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_count", CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, S_RXTSHIFTMAXR2, sysctl_tp_shift_cnt, "IU", "Number of retransmissions before abort"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_count", CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, S_KEEPALIVEMAXR2, sysctl_tp_shift_cnt, "IU", "Number of keepalive probes before abort"); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "rexmt_backoff", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE retransmit backoffs"); children = SYSCTL_CHILDREN(oid); for (i = 0; i < 16; i++) { snprintf(s, sizeof(s), "%u", i); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, s, CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, i, sysctl_tp_backoff, "IU", "TOE retransmit backoff"); } } #endif } void vi_sysctls(struct vi_info *vi) { struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children; ctx = device_get_sysctl_ctx(vi->dev); /* * dev.v?(cxgbe|cxl).X. 
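 * (One tree per VI. The main VI uses the port's cxgbe/cxl device and any
 * extra VIs show up as vcxgbe/vcxl, so e.g. dev.cxgbe.0.nrxq and
 * dev.vcxgbe.0.nrxq are both instances of this node.)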
*/ oid = device_get_sysctl_tree(vi->dev); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "viid", CTLFLAG_RD, NULL, vi->viid, "VI identifier"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nrxq", CTLFLAG_RD, &vi->nrxq, 0, "# of rx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ntxq", CTLFLAG_RD, &vi->ntxq, 0, "# of tx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_rxq", CTLFLAG_RD, &vi->first_rxq, 0, "index of first rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD, &vi->first_txq, 0, "index of first tx queue"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rss_base", CTLFLAG_RD, NULL, vi->rss_base, "start of RSS indirection table"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rss_size", CTLFLAG_RD, NULL, vi->rss_size, "size of RSS indirection table"); if (IS_MAIN_VI(vi)) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rsrv_noflowq", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0, sysctl_noflowq, "IU", "Reserve queue 0 for non-flowid packets"); } if (vi->adapter->flags & IS_VF) { MPASS(vi->flags & TX_USES_VM_WR); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_vm_wr", CTLFLAG_RD, NULL, 1, "use VM work requests for transmit"); } else { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tx_vm_wr", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0, sysctl_tx_vm_wr, "I", "use VM work requests for transmit"); } #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD, &vi->nofldrxq, 0, "# of rx queues for offloaded TCP connections"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_rxq", CTLFLAG_RD, &vi->first_ofld_rxq, 0, "index of first TOE rx queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx_ofld", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0, sysctl_holdoff_tmr_idx_ofld, "I", "holdoff timer index for TOE queues"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx_ofld", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0, sysctl_holdoff_pktc_idx_ofld, "I", "holdoff packet counter index for TOE queues"); } #endif #if defined(TCP_OFFLOAD) || defined(RATELIMIT) if (vi->nofldtxq != 0) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldtxq", CTLFLAG_RD, &vi->nofldtxq, 0, "# of tx queues for TOE/ETHOFLD"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_txq", CTLFLAG_RD, &vi->first_ofld_txq, 0, "index of first TOE/ETHOFLD tx queue"); } #endif #ifdef DEV_NETMAP if (vi->nnmrxq != 0) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmrxq", CTLFLAG_RD, &vi->nnmrxq, 0, "# of netmap rx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmtxq", CTLFLAG_RD, &vi->nnmtxq, 0, "# of netmap tx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_rxq", CTLFLAG_RD, &vi->first_nm_rxq, 0, "index of first netmap rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_txq", CTLFLAG_RD, &vi->first_nm_txq, 0, "index of first netmap tx queue"); } #endif SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0, sysctl_holdoff_tmr_idx, "I", "holdoff timer index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0, sysctl_holdoff_pktc_idx, "I", "holdoff packet counter index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_rxq", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0, sysctl_qsize_rxq, "I", "rx queue size"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_txq", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0, sysctl_qsize_txq, "I", "tx queue size"); } static void
cxgbe_sysctls(struct port_info *pi) { struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children, *children2; struct adapter *sc = pi->adapter; int i; char name[16]; static char *tc_flags = {"\20\1USER\2SYNC\3ASYNC\4ERR"}; ctx = device_get_sysctl_ctx(pi->dev); /* * dev.cxgbe.X. */ oid = device_get_sysctl_tree(pi->dev); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "linkdnrc", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, pi, 0, sysctl_linkdnrc, "A", "reason why link is down"); if (pi->port_type == FW_PORT_TYPE_BT_XAUI) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, pi, 0, sysctl_btphy, "I", "PHY temperature (in Celsius)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fw_version", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, pi, 1, sysctl_btphy, "I", "PHY firmware version"); } SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pause_settings", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, pi, 0, sysctl_pause_settings, "A", "PAUSE settings (bit 0 = rx_pause, 1 = tx_pause, 2 = pause_autoneg)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fec", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, pi, 0, sysctl_fec, "A", "FECs to use (bit 0 = RS, 1 = FC, 2 = none, 5 = auto, 6 = module)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "module_fec", CTLTYPE_STRING | CTLFLAG_MPSAFE, pi, 0, sysctl_module_fec, "A", "FEC recommended by the cable/transceiver"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "autoneg", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, pi, 0, sysctl_autoneg, "I", "autonegotiation (-1 = not supported)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcaps", CTLFLAG_RD, &pi->link_cfg.pcaps, 0, "port capabilities"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "acaps", CTLFLAG_RD, &pi->link_cfg.acaps, 0, "advertised capabilities"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lpacaps", CTLFLAG_RD, &pi->link_cfg.lpacaps, 0, "link partner advertised capabilities"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "max_speed", CTLFLAG_RD, NULL, port_top_speed(pi), "max speed (in Gbps)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "mps_bg_map", CTLFLAG_RD, NULL, pi->mps_bg_map, "MPS buffer group map"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_e_chan_map", CTLFLAG_RD, NULL, pi->rx_e_chan_map, "TP rx e-channel map"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_c_chan", CTLFLAG_RD, NULL, pi->rx_c_chan, "TP rx c-channel"); if (sc->flags & IS_VF) return; /* * dev.(cxgbe|cxl).X.tc. 
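 * (Tx scheduler traffic classes; e.g. dev.cxgbe.0.tc.0.params prints the
 * parameters of class 0 on port 0 via sysctl_tc_params below.)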
	 */
	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "tc",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
	    "Tx scheduler traffic classes (cl_rl)");
	children2 = SYSCTL_CHILDREN(oid);
	SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "pktsize",
	    CTLFLAG_RW, &pi->sched_params->pktsize, 0,
	    "pktsize for per-flow cl-rl (0 means up to the driver)");
	SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "burstsize",
	    CTLFLAG_RW, &pi->sched_params->burstsize, 0,
	    "burstsize for per-flow cl-rl (0 means up to the driver)");
	for (i = 0; i < sc->chip_params->nsched_cls; i++) {
		struct tx_cl_rl_params *tc = &pi->sched_params->cl_rl[i];

		snprintf(name, sizeof(name), "%d", i);
		children2 = SYSCTL_CHILDREN(SYSCTL_ADD_NODE(ctx,
		    SYSCTL_CHILDREN(oid), OID_AUTO, name,
		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "traffic class"));
		SYSCTL_ADD_PROC(ctx, children2, OID_AUTO, "flags",
		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, tc_flags,
		    (uintptr_t)&tc->flags, sysctl_bitfield_8b, "A", "flags");
		SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "refcount",
		    CTLFLAG_RD, &tc->refcount, 0, "references to this class");
		SYSCTL_ADD_PROC(ctx, children2, OID_AUTO, "params",
		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc,
		    (pi->port_id << 16) | i, sysctl_tc_params, "A",
		    "traffic class parameters");
	}

	/*
	 * dev.cxgbe.X.stats.
	 */
	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "stats",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "port statistics");
	children = SYSCTL_CHILDREN(oid);
	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_parse_error", CTLFLAG_RD,
	    &pi->tx_parse_error, 0,
	    "# of tx packets with invalid length or # of segments");

#define T4_REGSTAT(name, stat, desc) \
	SYSCTL_ADD_OID(ctx, children, OID_AUTO, #name, \
	    CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, \
	    (is_t4(sc) ? PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_##stat##_L) : \
	    T5_PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_##stat##_L)), \
	    sysctl_handle_t4_reg64, "QU", desc)

/* We get these from port_stats and they may be stale by up to 1s */
#define T4_PORTSTAT(name, desc) \
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, #name, CTLFLAG_RD, \
	    &pi->stats.name, desc)

	T4_REGSTAT(tx_octets, TX_PORT_BYTES, "# of octets in good frames");
	T4_REGSTAT(tx_frames, TX_PORT_FRAMES, "total # of good frames");
	T4_REGSTAT(tx_bcast_frames, TX_PORT_BCAST, "# of broadcast frames");
	T4_REGSTAT(tx_mcast_frames, TX_PORT_MCAST, "# of multicast frames");
	T4_REGSTAT(tx_ucast_frames, TX_PORT_UCAST, "# of unicast frames");
	T4_REGSTAT(tx_error_frames, TX_PORT_ERROR, "# of error frames");
	T4_REGSTAT(tx_frames_64, TX_PORT_64B, "# of tx frames in this range");
	T4_REGSTAT(tx_frames_65_127, TX_PORT_65B_127B,
	    "# of tx frames in this range");
	T4_REGSTAT(tx_frames_128_255, TX_PORT_128B_255B,
	    "# of tx frames in this range");
	T4_REGSTAT(tx_frames_256_511, TX_PORT_256B_511B,
	    "# of tx frames in this range");
	T4_REGSTAT(tx_frames_512_1023, TX_PORT_512B_1023B,
	    "# of tx frames in this range");
	T4_REGSTAT(tx_frames_1024_1518, TX_PORT_1024B_1518B,
	    "# of tx frames in this range");
	T4_REGSTAT(tx_frames_1519_max, TX_PORT_1519B_MAX,
	    "# of tx frames in this range");
	T4_REGSTAT(tx_drop, TX_PORT_DROP, "# of dropped tx frames");
	T4_REGSTAT(tx_pause, TX_PORT_PAUSE, "# of pause frames transmitted");
	T4_REGSTAT(tx_ppp0, TX_PORT_PPP0, "# of PPP prio 0 frames transmitted");
	T4_REGSTAT(tx_ppp1, TX_PORT_PPP1, "# of PPP prio 1 frames transmitted");
	T4_REGSTAT(tx_ppp2, TX_PORT_PPP2, "# of PPP prio 2 frames transmitted");
	T4_REGSTAT(tx_ppp3, TX_PORT_PPP3, "# of PPP prio 3 frames transmitted");
	T4_REGSTAT(tx_ppp4, TX_PORT_PPP4, "# of PPP prio 4 frames transmitted");
	T4_REGSTAT(tx_ppp5, TX_PORT_PPP5, "# of PPP prio 5 frames transmitted");
	T4_REGSTAT(tx_ppp6, TX_PORT_PPP6, "# of PPP prio 6 frames transmitted");
	T4_REGSTAT(tx_ppp7, TX_PORT_PPP7, "# of PPP prio 7 frames transmitted");
	T4_REGSTAT(rx_octets, RX_PORT_BYTES, "# of octets in good frames");
	T4_REGSTAT(rx_frames, RX_PORT_FRAMES, "total # of good frames");
	T4_REGSTAT(rx_bcast_frames, RX_PORT_BCAST, "# of broadcast frames");
	T4_REGSTAT(rx_mcast_frames, RX_PORT_MCAST, "# of multicast frames");
	T4_REGSTAT(rx_ucast_frames, RX_PORT_UCAST, "# of unicast frames");
	T4_REGSTAT(rx_too_long, RX_PORT_MTU_ERROR, "# of frames exceeding MTU");
	T4_REGSTAT(rx_jabber, RX_PORT_MTU_CRC_ERROR, "# of jabber frames");
	if (is_t6(sc)) {
		T4_PORTSTAT(rx_fcs_err,
		    "# of frames received with bad FCS since last link up");
	} else {
		T4_REGSTAT(rx_fcs_err, RX_PORT_CRC_ERROR,
		    "# of frames received with bad FCS");
	}
	T4_REGSTAT(rx_len_err, RX_PORT_LEN_ERROR,
	    "# of frames received with length error");
	T4_REGSTAT(rx_symbol_err, RX_PORT_SYM_ERROR, "symbol errors");
	T4_REGSTAT(rx_runt, RX_PORT_LESS_64B, "# of short frames received");
	T4_REGSTAT(rx_frames_64, RX_PORT_64B, "# of rx frames in this range");
	T4_REGSTAT(rx_frames_65_127, RX_PORT_65B_127B,
	    "# of rx frames in this range");
	T4_REGSTAT(rx_frames_128_255, RX_PORT_128B_255B,
	    "# of rx frames in this range");
	T4_REGSTAT(rx_frames_256_511, RX_PORT_256B_511B,
	    "# of rx frames in this range");
	T4_REGSTAT(rx_frames_512_1023, RX_PORT_512B_1023B,
	    "# of rx frames in this range");
	T4_REGSTAT(rx_frames_1024_1518, RX_PORT_1024B_1518B,
	    "# of rx frames in this range");
	T4_REGSTAT(rx_frames_1519_max, RX_PORT_1519B_MAX,
	    "# of rx frames in this range");
	T4_REGSTAT(rx_pause, RX_PORT_PAUSE, "# of pause frames received");
	T4_REGSTAT(rx_ppp0, RX_PORT_PPP0, "# of PPP prio 0 frames received");
	T4_REGSTAT(rx_ppp1, RX_PORT_PPP1, "# of PPP prio 1 frames received");
	T4_REGSTAT(rx_ppp2, RX_PORT_PPP2, "# of PPP prio 2 frames received");
	T4_REGSTAT(rx_ppp3, RX_PORT_PPP3, "# of PPP prio 3 frames received");
	T4_REGSTAT(rx_ppp4, RX_PORT_PPP4, "# of PPP prio 4 frames received");
	T4_REGSTAT(rx_ppp5, RX_PORT_PPP5, "# of PPP prio 5 frames received");
	T4_REGSTAT(rx_ppp6, RX_PORT_PPP6, "# of PPP prio 6 frames received");
	T4_REGSTAT(rx_ppp7, RX_PORT_PPP7, "# of PPP prio 7 frames received");

	T4_PORTSTAT(rx_ovflow0, "# drops due to buffer-group 0 overflows");
	T4_PORTSTAT(rx_ovflow1, "# drops due to buffer-group 1 overflows");
	T4_PORTSTAT(rx_ovflow2, "# drops due to buffer-group 2 overflows");
	T4_PORTSTAT(rx_ovflow3, "# drops due to buffer-group 3 overflows");
	T4_PORTSTAT(rx_trunc0, "# of buffer-group 0 truncated packets");
	T4_PORTSTAT(rx_trunc1, "# of buffer-group 1 truncated packets");
	T4_PORTSTAT(rx_trunc2, "# of buffer-group 2 truncated packets");
	T4_PORTSTAT(rx_trunc3, "# of buffer-group 3 truncated packets");

#undef T4_REGSTAT
#undef T4_PORTSTAT
}

static int
sysctl_int_array(SYSCTL_HANDLER_ARGS)
{
	int rc, *i, space = 0;
	struct sbuf sb;

	sbuf_new_for_sysctl(&sb, NULL, 64, req);
	for (i = arg1; arg2; arg2 -= sizeof(int), i++) {
		if (space)
			sbuf_printf(&sb, " ");
		sbuf_printf(&sb, "%d", *i);
		space = 1;
	}
	rc = sbuf_finish(&sb);
	sbuf_delete(&sb);
	return (rc);
}

static int
sysctl_bitfield_8b(SYSCTL_HANDLER_ARGS)
{
	int rc;
	struct sbuf *sb;

	rc = sysctl_wire_old_buffer(req, 0);
	if (rc != 0)
		return (rc);

	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
	if (sb == NULL)
		return (ENOMEM);

	sbuf_printf(sb, "%b", *(uint8_t *)(uintptr_t)arg2, (char *)arg1);
	rc = sbuf_finish(sb);
	sbuf_delete(sb);
	return (rc);
}

static int
sysctl_bitfield_16b(SYSCTL_HANDLER_ARGS)
{
	int rc;
	struct sbuf *sb;

	rc
= sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "%b", *(uint16_t *)(uintptr_t)arg2, (char *)arg1); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_btphy(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; int op = arg2; struct adapter *sc = pi->adapter; u_int v; int rc; rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4btt"); if (rc) return (rc); if (hw_off_limits(sc)) rc = ENXIO; else { /* XXX: magic numbers */ rc = -t4_mdio_rd(sc, sc->mbox, pi->mdio_addr, 0x1e, op ? 0x20 : 0xc820, &v); } end_synchronized_op(sc, 0); if (rc) return (rc); if (op == 0) v /= 256; rc = sysctl_handle_int(oidp, &v, 0, req); return (rc); } static int sysctl_noflowq(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; int rc, val; val = vi->rsrv_noflowq; rc = sysctl_handle_int(oidp, &val, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if ((val >= 1) && (vi->ntxq > 1)) vi->rsrv_noflowq = 1; else vi->rsrv_noflowq = 0; return (rc); } static int sysctl_tx_vm_wr(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->adapter; int rc, val, i; MPASS(!(sc->flags & IS_VF)); val = vi->flags & TX_USES_VM_WR ? 1 : 0; rc = sysctl_handle_int(oidp, &val, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (val != 0 && val != 1) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4txvm"); if (rc) return (rc); if (hw_off_limits(sc)) rc = ENXIO; else if (vi->ifp->if_drv_flags & IFF_DRV_RUNNING) { /* * We don't want parse_pkt to run with one setting (VF or PF) * and then eth_tx to see a different setting but still use * stale information calculated by parse_pkt. */ rc = EBUSY; } else { struct port_info *pi = vi->pi; struct sge_txq *txq; uint32_t ctrl0; uint8_t npkt = sc->params.max_pkts_per_eth_tx_pkts_wr; if (val) { vi->flags |= TX_USES_VM_WR; vi->ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_VM_TSO; ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) | V_TXPKT_INTF(pi->tx_chan)); if (!(sc->flags & IS_VF)) npkt--; } else { vi->flags &= ~TX_USES_VM_WR; vi->ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO; ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) | V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld)); } for_each_txq(vi, i, txq) { txq->cpl_ctrl0 = ctrl0; txq->txp.max_npkt = npkt; } } end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->adapter; int idx, rc, i; struct sge_rxq *rxq; uint8_t v; idx = vi->tmr_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < 0 || idx >= SGE_NTIMERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4tmr"); if (rc) return (rc); v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(vi->pktc_idx != -1); for_each_rxq(vi, i, rxq) { #ifdef atomic_store_rel_8 atomic_store_rel_8(&rxq->iq.intr_params, v); #else rxq->iq.intr_params = v; #endif } vi->tmr_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (0); } static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->adapter; int idx, rc; idx = vi->pktc_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < -1 || idx >= SGE_NCOUNTERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, 
"t4pktc"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->pktc_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->adapter; int qsize, rc; qsize = vi->qsize_rxq; rc = sysctl_handle_int(oidp, &qsize, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (qsize < 128 || (qsize & 7)) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4rxqs"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->qsize_rxq = qsize; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->adapter; int qsize, rc; qsize = vi->qsize_txq; rc = sysctl_handle_int(oidp, &qsize, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (qsize < 128 || qsize > 65536) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4txqs"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->qsize_txq = qsize; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; if (req->newptr == NULL) { struct sbuf *sb; static char *bits = "\20\1RX\2TX\3AUTO"; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); if (lc->link_ok) { sbuf_printf(sb, "%b", (lc->fc & (PAUSE_TX | PAUSE_RX)) | (lc->requested_fc & PAUSE_AUTONEG), bits); } else { sbuf_printf(sb, "%b", lc->requested_fc & (PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG), bits); } rc = sbuf_finish(sb); sbuf_delete(sb); } else { char s[2]; int n; s[0] = '0' + (lc->requested_fc & (PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)); s[1] = 0; rc = sysctl_handle_string(oidp, s, sizeof(s), req); if (rc != 0) return(rc); if (s[1] != 0) return (EINVAL); if (s[0] < '0' || s[0] > '9') return (EINVAL); /* not a number */ n = s[0] - '0'; if (n & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)) return (EINVAL); /* some other bit is set too */ rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4PAUSE"); if (rc) return (rc); if (!hw_off_limits(sc)) { PORT_LOCK(pi); lc->requested_fc = n; fixup_link_config(pi); if (pi->up_vis > 0) rc = apply_link_config(pi); set_current_media(pi); PORT_UNLOCK(pi); } end_synchronized_op(sc, 0); } return (rc); } static int sysctl_fec(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; int8_t old; if (req->newptr == NULL) { struct sbuf *sb; static char *bits = "\20\1RS-FEC\2FC-FEC\3NO-FEC\4RSVD2" "\5RSVD3\6auto\7module"; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); /* * Display the requested_fec when the link is down -- the actual * FEC makes sense only when the link is up. 
	 */
		if (lc->link_ok) {
			sbuf_printf(sb, "%b", (lc->fec & M_FW_PORT_CAP32_FEC) |
			    (lc->requested_fec & (FEC_AUTO | FEC_MODULE)),
			    bits);
		} else {
			sbuf_printf(sb, "%b", lc->requested_fec, bits);
		}
		rc = sbuf_finish(sb);
		sbuf_delete(sb);
	} else {
		char s[8];
		int n;

		snprintf(s, sizeof(s), "%d",
		    lc->requested_fec == FEC_AUTO ? -1 :
		    lc->requested_fec & (M_FW_PORT_CAP32_FEC | FEC_MODULE));

		rc = sysctl_handle_string(oidp, s, sizeof(s), req);
		if (rc != 0)
			return (rc);

		n = strtol(&s[0], NULL, 0);
		if (n < 0 || n & FEC_AUTO)
			n = FEC_AUTO;
		else if (n & ~(M_FW_PORT_CAP32_FEC | FEC_MODULE))
			return (EINVAL);	/* some other bit is set too */

		rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK,
		    "t4fec");
		if (rc)
			return (rc);
		PORT_LOCK(pi);
		old = lc->requested_fec;
		if (n == FEC_AUTO)
			lc->requested_fec = FEC_AUTO;
		else if (n == 0 || n == FEC_NONE)
			lc->requested_fec = FEC_NONE;
		else {
			if ((lc->pcaps |
			    V_FW_PORT_CAP32_FEC(n & M_FW_PORT_CAP32_FEC)) !=
			    lc->pcaps) {
				rc = ENOTSUP;
				goto done;
			}
			lc->requested_fec = n &
			    (M_FW_PORT_CAP32_FEC | FEC_MODULE);
		}
		if (!hw_off_limits(sc)) {
			fixup_link_config(pi);
			if (pi->up_vis > 0) {
				rc = apply_link_config(pi);
				if (rc != 0) {
					lc->requested_fec = old;
					if (rc == FW_EPROTO)
						rc = ENOTSUP;
				}
			}
		}
done:
		PORT_UNLOCK(pi);
		end_synchronized_op(sc, 0);
	}

	return (rc);
}

static int
sysctl_module_fec(SYSCTL_HANDLER_ARGS)
{
	struct port_info *pi = arg1;
	struct adapter *sc = pi->adapter;
	struct link_config *lc = &pi->link_cfg;
	int rc;
	int8_t fec;
	struct sbuf *sb;
	static char *bits = "\20\1RS-FEC\2FC-FEC\3NO-FEC\4RSVD2\5RSVD3";

	rc = sysctl_wire_old_buffer(req, 0);
	if (rc != 0)
		return (rc);

	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
	if (sb == NULL)
		return (ENOMEM);

	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mfec") != 0) {
		rc = EBUSY;
		goto done;
	}
	if (hw_off_limits(sc)) {
		rc = ENXIO;
		goto out;
	}
	PORT_LOCK(pi);
	if (pi->up_vis == 0) {
		/*
		 * If all the interfaces are administratively down the firmware
		 * does not report transceiver changes.  Refresh port info here.
		 * This is the only reason we have a synchronized op in this
		 * function.  Just PORT_LOCK would have been enough otherwise.
		 */
		t4_update_port_info(pi);
	}

	fec = lc->fec_hint;
	if (pi->mod_type == FW_PORT_MOD_TYPE_NONE ||
	    !fec_supported(lc->pcaps)) {
		sbuf_printf(sb, "n/a");
	} else {
		if (fec == 0)
			fec = FEC_NONE;
		sbuf_printf(sb, "%b", fec & M_FW_PORT_CAP32_FEC, bits);
	}
	rc = sbuf_finish(sb);
	PORT_UNLOCK(pi);
out:
	end_synchronized_op(sc, 0);
done:
	sbuf_delete(sb);

	return (rc);
}

static int
sysctl_autoneg(SYSCTL_HANDLER_ARGS)
{
	struct port_info *pi = arg1;
	struct adapter *sc = pi->adapter;
	struct link_config *lc = &pi->link_cfg;
	int rc, val;

	if (lc->pcaps & FW_PORT_CAP32_ANEG)
		val = lc->requested_aneg == AUTONEG_DISABLE ?
		    0 : 1;
	else
		val = -1;
	rc = sysctl_handle_int(oidp, &val, 0, req);
	if (rc != 0 || req->newptr == NULL)
		return (rc);
	if (val == 0)
		val = AUTONEG_DISABLE;
	else if (val == 1)
		val = AUTONEG_ENABLE;
	else
		val = AUTONEG_AUTO;

	rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK,
	    "t4aneg");
	if (rc)
		return (rc);
	PORT_LOCK(pi);
	if (val == AUTONEG_ENABLE && !(lc->pcaps & FW_PORT_CAP32_ANEG)) {
		rc = ENOTSUP;
		goto done;
	}
	lc->requested_aneg = val;
	if (!hw_off_limits(sc)) {
		fixup_link_config(pi);
		if (pi->up_vis > 0)
			rc = apply_link_config(pi);
		set_current_media(pi);
	}
done:
	PORT_UNLOCK(pi);
	end_synchronized_op(sc, 0);
	return (rc);
}

static int
sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS)
{
	struct adapter *sc = arg1;
	int rc, reg = arg2;
	uint64_t val;

	mtx_lock(&sc->reg_lock);
	if (hw_off_limits(sc))
		rc = ENXIO;
	else {
		rc = 0;
		val = t4_read_reg64(sc, reg);
	}
	mtx_unlock(&sc->reg_lock);
	if (rc == 0)
		rc = sysctl_handle_64(oidp, &val, 0, req);
	return (rc);
}

static int
sysctl_temperature(SYSCTL_HANDLER_ARGS)
{
	struct adapter *sc = arg1;
	int rc, t;
	uint32_t param, val;

	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4temp");
	if (rc)
		return (rc);
	if (hw_off_limits(sc))
		rc = ENXIO;
	else {
		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) |
		    V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_TMP);
		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
	}
	end_synchronized_op(sc, 0);
	if (rc)
		return (rc);

	/* unknown is returned as 0 but we display -1 in that case */
	t = val == 0 ? -1 : val;

	rc = sysctl_handle_int(oidp, &t, 0, req);
	return (rc);
}

static int
sysctl_vdd(SYSCTL_HANDLER_ARGS)
{
	struct adapter *sc = arg1;
	int rc;
	uint32_t param, val;

	if (sc->params.core_vdd == 0) {
		rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
		    "t4vdd");
		if (rc)
			return (rc);
		if (hw_off_limits(sc))
			rc = ENXIO;
		else {
			param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
			    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) |
			    V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_VDD);
			rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1,
			    &param, &val);
		}
		end_synchronized_op(sc, 0);
		if (rc)
			return (rc);
		sc->params.core_vdd = val;
	}

	return (sysctl_handle_int(oidp, &sc->params.core_vdd, 0, req));
}

static int
sysctl_reset_sensor(SYSCTL_HANDLER_ARGS)
{
	struct adapter *sc = arg1;
	int rc, v;
	uint32_t param, val;

	v = sc->sensor_resets;
	rc = sysctl_handle_int(oidp, &v, 0, req);
	if (rc != 0 || req->newptr == NULL || v <= 0)
		return (rc);

	if (sc->params.fw_vers < FW_VERSION32(1, 24, 7, 0) ||
	    chip_id(sc) < CHELSIO_T5)
		return (ENOTSUP);

	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4srst");
	if (rc)
		return (rc);
	if (hw_off_limits(sc))
		rc = ENXIO;
	else {
		param = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) |
		    V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_RESET_TMP_SENSOR));
		val = 1;
		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
	}
	end_synchronized_op(sc, 0);
	if (rc == 0)
		sc->sensor_resets++;
	return (rc);
}

static int
sysctl_loadavg(SYSCTL_HANDLER_ARGS)
{
	struct adapter *sc = arg1;
	struct sbuf *sb;
	int rc;
	uint32_t param, val;

	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4lavg");
	if (rc)
		return (rc);
	if (hw_off_limits(sc))
		rc = ENXIO;
	else {
		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_LOAD);
		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
	}
	end_synchronized_op(sc, 0);
	if (rc)
		return (rc);

	rc = sysctl_wire_old_buffer(req, 0);
	if (rc != 0)
		return (rc);

	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
	if (sb ==
NULL) return (ENOMEM); if (val == 0xffffffff) { /* Only debug and custom firmwares report load averages. */ sbuf_printf(sb, "not available"); } else { sbuf_printf(sb, "%d %d %d", val & 0xff, (val >> 8) & 0xff, (val >> 16) & 0xff); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_cctrl(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint16_t incr[NMTUS][NCCTRL_WIN]; static const char *dec_fac[] = { "0.5", "0.5625", "0.625", "0.6875", "0.75", "0.8125", "0.875", "0.9375" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_read_cong_tbl(sc, incr); mtx_unlock(&sc->reg_lock); if (rc) goto done; for (i = 0; i < NCCTRL_WIN; ++i) { sbuf_printf(sb, "%2d: %4u %4u %4u %4u %4u %4u %4u %4u\n", i, incr[0][i], incr[1][i], incr[2][i], incr[3][i], incr[4][i], incr[5][i], incr[6][i], incr[7][i]); sbuf_printf(sb, "%8u %4u %4u %4u %4u %4u %4u %4u %5u %s\n", incr[8][i], incr[9][i], incr[10][i], incr[11][i], incr[12][i], incr[13][i], incr[14][i], incr[15][i], sc->params.a_wnd[i], dec_fac[sc->params.b_wnd[i]]); } rc = sbuf_finish(sb); done: sbuf_delete(sb); return (rc); } static const char *qname[CIM_NUM_IBQ + CIM_NUM_OBQ_T5] = { "TP0", "TP1", "ULP", "SGE0", "SGE1", "NC-SI", /* ibq's */ "ULP0", "ULP1", "ULP2", "ULP3", "SGE", "NC-SI", /* obq's */ "SGE0-RX", "SGE1-RX" /* additional obq's (T5 onwards) */ }; static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n, qid = arg2; uint32_t *buf, *p; char *qtype; u_int cim_num_obq = sc->chip_params->cim_num_obq; KASSERT(qid >= 0 && qid < CIM_NUM_IBQ + cim_num_obq, ("%s: bad qid %d\n", __func__, qid)); if (qid < CIM_NUM_IBQ) { /* inbound queue */ qtype = "IBQ"; n = 4 * CIM_IBQ_SIZE; buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = -ENXIO; else rc = t4_read_cim_ibq(sc, qid, buf, n); mtx_unlock(&sc->reg_lock); } else { /* outbound queue */ qtype = "OBQ"; qid -= CIM_NUM_IBQ; n = 4 * cim_num_obq * CIM_OBQ_SIZE; buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = -ENXIO; else rc = t4_read_cim_obq(sc, qid, buf, n); mtx_unlock(&sc->reg_lock); } if (rc < 0) { rc = -rc; goto done; } n = rc * sizeof(uint32_t); /* rc has # of words actually read */ rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) goto done; sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req); if (sb == NULL) { rc = ENOMEM; goto done; } sbuf_printf(sb, "%s%d %s", qtype , qid, qname[arg2]); for (i = 0, p = buf; i < n; i += 16, p += 4) sbuf_printf(sb, "\n%#06x: %08x %08x %08x %08x", i, p[0], p[1], p[2], p[3]); rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static void sbuf_cim_la4(struct adapter *sc, struct sbuf *sb, uint32_t *buf, uint32_t cfg) { uint32_t *p; sbuf_printf(sb, "Status Data PC%s", cfg & F_UPDBGLACAPTPCONLY ? 
"" : " LS0Stat LS0Addr LS0Data"); for (p = buf; p <= &buf[sc->params.cim_la_size - 8]; p += 8) { if (cfg & F_UPDBGLACAPTPCONLY) { sbuf_printf(sb, "\n %02x %08x %08x", p[5] & 0xff, p[6], p[7]); sbuf_printf(sb, "\n %02x %02x%06x %02x%06x", (p[3] >> 8) & 0xff, p[3] & 0xff, p[4] >> 8, p[4] & 0xff, p[5] >> 8); sbuf_printf(sb, "\n %02x %x%07x %x%07x", (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4, p[1] & 0xf, p[2] >> 4); } else { sbuf_printf(sb, "\n %02x %x%07x %x%07x %08x %08x " "%08x%08x%08x%08x", (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4, p[1] & 0xf, p[2] >> 4, p[2] & 0xf, p[3], p[4], p[5], p[6], p[7]); } } } static void sbuf_cim_la6(struct adapter *sc, struct sbuf *sb, uint32_t *buf, uint32_t cfg) { uint32_t *p; sbuf_printf(sb, "Status Inst Data PC%s", cfg & F_UPDBGLACAPTPCONLY ? "" : " LS0Stat LS0Addr LS0Data LS1Stat LS1Addr LS1Data"); for (p = buf; p <= &buf[sc->params.cim_la_size - 10]; p += 10) { if (cfg & F_UPDBGLACAPTPCONLY) { sbuf_printf(sb, "\n %02x %08x %08x %08x", p[3] & 0xff, p[2], p[1], p[0]); sbuf_printf(sb, "\n %02x %02x%06x %02x%06x %02x%06x", (p[6] >> 8) & 0xff, p[6] & 0xff, p[5] >> 8, p[5] & 0xff, p[4] >> 8, p[4] & 0xff, p[3] >> 8); sbuf_printf(sb, "\n %02x %04x%04x %04x%04x %04x%04x", (p[9] >> 16) & 0xff, p[9] & 0xffff, p[8] >> 16, p[8] & 0xffff, p[7] >> 16, p[7] & 0xffff, p[6] >> 16); } else { sbuf_printf(sb, "\n %02x %04x%04x %04x%04x %04x%04x " "%08x %08x %08x %08x %08x %08x", (p[9] >> 16) & 0xff, p[9] & 0xffff, p[8] >> 16, p[8] & 0xffff, p[7] >> 16, p[7] & 0xffff, p[6] >> 16, p[2], p[1], p[0], p[5], p[4], p[3]); } } } static int sbuf_cim_la(struct adapter *sc, struct sbuf *sb, int flags) { uint32_t cfg, *buf; int rc; MPASS(flags == M_WAITOK || flags == M_NOWAIT); buf = malloc(sc->params.cim_la_size * sizeof(uint32_t), M_CXGBE, M_ZERO | flags); if (buf == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { rc = -t4_cim_read(sc, A_UP_UP_DBG_LA_CFG, 1, &cfg); if (rc == 0) rc = -t4_cim_read_la(sc, buf, NULL); } mtx_unlock(&sc->reg_lock); if (rc == 0) { if (chip_id(sc) < CHELSIO_T6) sbuf_cim_la4(sc, sb, buf, cfg); else sbuf_cim_la6(sc, sb, buf, cfg); } free(buf, M_CXGBE); return (rc); } static int sysctl_cim_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); rc = sbuf_cim_la(sc, sb, M_WAITOK); if (rc == 0) rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } bool t4_os_dump_cimla(struct adapter *sc, int arg, bool verbose) { struct sbuf sb; int rc; if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb) return (false); rc = sbuf_cim_la(sc, &sb, M_NOWAIT); if (rc == 0) { rc = sbuf_finish(&sb); if (rc == 0) { log(LOG_DEBUG, "%s: CIM LA dump follows.\n%s", device_get_nameunit(sc->dev), sbuf_data(&sb)); } } sbuf_delete(&sb); return (false); } static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int i; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(2 * CIM_MALA_SIZE * 5 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_cim_read_ma_la(sc, buf, buf + 5 * CIM_MALA_SIZE); mtx_unlock(&sc->reg_lock); if (rc) goto done; p = buf; for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) { sbuf_printf(sb, "\n%02x%08x%08x%08x%08x", p[4], p[3], p[2], p[1], 
p[0]); } sbuf_printf(sb, "\n\nCnt ID Tag UE Data RDY VLD"); for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) { sbuf_printf(sb, "\n%3u %2u %x %u %08x%08x %u %u", (p[2] >> 10) & 0xff, (p[2] >> 7) & 7, (p[2] >> 3) & 0xf, (p[2] >> 2) & 1, (p[1] >> 2) | ((p[2] & 3) << 30), (p[0] >> 2) | ((p[1] & 3) << 30), (p[0] >> 1) & 1, p[0] & 1); } rc = sbuf_finish(sb); done: sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int i; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(2 * CIM_PIFLA_SIZE * 6 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_cim_read_pif_la(sc, buf, buf + 6 * CIM_PIFLA_SIZE, NULL, NULL); mtx_unlock(&sc->reg_lock); if (rc) goto done; p = buf; sbuf_printf(sb, "Cntl ID DataBE Addr Data"); for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) { sbuf_printf(sb, "\n %02x %02x %04x %08x %08x%08x%08x%08x", (p[5] >> 22) & 0xff, (p[5] >> 16) & 0x3f, p[5] & 0xffff, p[4], p[3], p[2], p[1], p[0]); } sbuf_printf(sb, "\n\nCntl ID Data"); for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) { sbuf_printf(sb, "\n %02x %02x %08x%08x%08x%08x", (p[4] >> 6) & 0xff, p[4] & 0x3f, p[3], p[2], p[1], p[0]); } rc = sbuf_finish(sb); done: sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint16_t base[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; uint16_t size[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; uint16_t thres[CIM_NUM_IBQ]; uint32_t obq_wr[2 * CIM_NUM_OBQ_T5], *wr = obq_wr; uint32_t stat[4 * (CIM_NUM_IBQ + CIM_NUM_OBQ_T5)], *p = stat; u_int cim_num_obq, ibq_rdaddr, obq_rdaddr, nq; cim_num_obq = sc->chip_params->cim_num_obq; if (is_t4(sc)) { ibq_rdaddr = A_UP_IBQ_0_RDADDR; obq_rdaddr = A_UP_OBQ_0_REALADDR; } else { ibq_rdaddr = A_UP_IBQ_0_SHADOW_RDADDR; obq_rdaddr = A_UP_OBQ_0_SHADOW_REALADDR; } nq = CIM_NUM_IBQ + cim_num_obq; mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { rc = -t4_cim_read(sc, ibq_rdaddr, 4 * nq, stat); if (rc == 0) { rc = -t4_cim_read(sc, obq_rdaddr, 2 * cim_num_obq, obq_wr); if (rc == 0) t4_read_cimq_cfg(sc, base, size, thres); } } mtx_unlock(&sc->reg_lock); if (rc) return (rc); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, " Queue Base Size Thres RdPtr WrPtr SOP EOP Avail"); for (i = 0; i < CIM_NUM_IBQ; i++, p += 4) sbuf_printf(sb, "\n%7s %5x %5u %5u %6x %4x %4u %4u %5u", qname[i], base[i], size[i], thres[i], G_IBQRDADDR(p[0]), G_IBQWRADDR(p[1]), G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]), G_QUEREMFLITS(p[2]) * 16); for ( ; i < nq; i++, p += 4, wr += 2) sbuf_printf(sb, "\n%7s %5x %5u %12x %4x %4u %4u %5u", qname[i], base[i], size[i], G_QUERDADDR(p[0]) & 0x3fff, wr[0] - base[i], G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]), G_QUEREMFLITS(p[2]) * 16); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_cpl_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_tp_get_cpl_stats(sc, &stats, 0); mtx_unlock(&sc->reg_lock); if (rc) 
goto done; if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3"); sbuf_printf(sb, "\nCPL requests: %10u %10u %10u %10u", stats.req[0], stats.req[1], stats.req[2], stats.req[3]); sbuf_printf(sb, "\nCPL responses: %10u %10u %10u %10u", stats.rsp[0], stats.rsp[1], stats.rsp[2], stats.rsp[3]); } else { sbuf_printf(sb, " channel 0 channel 1"); sbuf_printf(sb, "\nCPL requests: %10u %10u", stats.req[0], stats.req[1]); sbuf_printf(sb, "\nCPL responses: %10u %10u", stats.rsp[0], stats.rsp[1]); } rc = sbuf_finish(sb); done: sbuf_delete(sb); return (rc); } static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_usm_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_get_usm_stats(sc, &stats, 1); mtx_unlock(&sc->reg_lock); if (rc == 0) { sbuf_printf(sb, "Frames: %u\n", stats.frames); sbuf_printf(sb, "Octets: %ju\n", stats.octets); sbuf_printf(sb, "Drops: %u", stats.drops); rc = sbuf_finish(sb); } sbuf_delete(sb); return (rc); } static int sysctl_tid_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_tid_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_tp_get_tid_stats(sc, &stats, 1); mtx_unlock(&sc->reg_lock); if (rc == 0) { sbuf_printf(sb, "Delete: %u\n", stats.del); sbuf_printf(sb, "Invalidate: %u\n", stats.inv); sbuf_printf(sb, "Active: %u\n", stats.act); sbuf_printf(sb, "Passive: %u", stats.pas); rc = sbuf_finish(sb); } sbuf_delete(sb); return (rc); } static const char * const devlog_level_strings[] = { [FW_DEVLOG_LEVEL_EMERG] = "EMERG", [FW_DEVLOG_LEVEL_CRIT] = "CRIT", [FW_DEVLOG_LEVEL_ERR] = "ERR", [FW_DEVLOG_LEVEL_NOTICE] = "NOTICE", [FW_DEVLOG_LEVEL_INFO] = "INFO", [FW_DEVLOG_LEVEL_DEBUG] = "DEBUG" }; static const char * const devlog_facility_strings[] = { [FW_DEVLOG_FACILITY_CORE] = "CORE", [FW_DEVLOG_FACILITY_CF] = "CF", [FW_DEVLOG_FACILITY_SCHED] = "SCHED", [FW_DEVLOG_FACILITY_TIMER] = "TIMER", [FW_DEVLOG_FACILITY_RES] = "RES", [FW_DEVLOG_FACILITY_HW] = "HW", [FW_DEVLOG_FACILITY_FLR] = "FLR", [FW_DEVLOG_FACILITY_DMAQ] = "DMAQ", [FW_DEVLOG_FACILITY_PHY] = "PHY", [FW_DEVLOG_FACILITY_MAC] = "MAC", [FW_DEVLOG_FACILITY_PORT] = "PORT", [FW_DEVLOG_FACILITY_VI] = "VI", [FW_DEVLOG_FACILITY_FILTER] = "FILTER", [FW_DEVLOG_FACILITY_ACL] = "ACL", [FW_DEVLOG_FACILITY_TM] = "TM", [FW_DEVLOG_FACILITY_QFC] = "QFC", [FW_DEVLOG_FACILITY_DCB] = "DCB", [FW_DEVLOG_FACILITY_ETH] = "ETH", [FW_DEVLOG_FACILITY_OFLD] = "OFLD", [FW_DEVLOG_FACILITY_RI] = "RI", [FW_DEVLOG_FACILITY_ISCSI] = "ISCSI", [FW_DEVLOG_FACILITY_FCOE] = "FCOE", [FW_DEVLOG_FACILITY_FOISCSI] = "FOISCSI", [FW_DEVLOG_FACILITY_FOFCOE] = "FOFCOE", [FW_DEVLOG_FACILITY_CHNET] = "CHNET", }; static int sbuf_devlog(struct adapter *sc, struct sbuf *sb, int flags) { int i, j, rc, nentries, first = 0; struct devlog_params *dparams = &sc->params.devlog; struct fw_devlog_e *buf, *e; uint64_t ftstamp = UINT64_MAX; if (dparams->addr == 0) return (ENXIO); MPASS(flags == M_WAITOK || flags == M_NOWAIT); buf = malloc(dparams->size, M_CXGBE, M_ZERO | flags); if (buf == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else rc = read_via_memwin(sc, 1, 
dparams->addr, (void *)buf, dparams->size); mtx_unlock(&sc->reg_lock); if (rc != 0) goto done; nentries = dparams->size / sizeof(struct fw_devlog_e); for (i = 0; i < nentries; i++) { e = &buf[i]; if (e->timestamp == 0) break; /* end */ e->timestamp = be64toh(e->timestamp); e->seqno = be32toh(e->seqno); for (j = 0; j < 8; j++) e->params[j] = be32toh(e->params[j]); if (e->timestamp < ftstamp) { ftstamp = e->timestamp; first = i; } } if (buf[first].timestamp == 0) goto done; /* nothing in the log */ sbuf_printf(sb, "%10s %15s %8s %8s %s\n", "Seq#", "Tstamp", "Level", "Facility", "Message"); i = first; do { e = &buf[i]; if (e->timestamp == 0) break; /* end */ sbuf_printf(sb, "%10d %15ju %8s %8s ", e->seqno, e->timestamp, (e->level < nitems(devlog_level_strings) ? devlog_level_strings[e->level] : "UNKNOWN"), (e->facility < nitems(devlog_facility_strings) ? devlog_facility_strings[e->facility] : "UNKNOWN")); sbuf_printf(sb, e->fmt, e->params[0], e->params[1], e->params[2], e->params[3], e->params[4], e->params[5], e->params[6], e->params[7]); if (++i == nentries) i = 0; } while (i != first); done: free(buf, M_CXGBE); return (rc); } static int sysctl_devlog(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc; struct sbuf *sb; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); rc = sbuf_devlog(sc, sb, M_WAITOK); if (rc == 0) rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } void t4_os_dump_devlog(struct adapter *sc) { int rc; struct sbuf sb; if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb) return; rc = sbuf_devlog(sc, &sb, M_NOWAIT); if (rc == 0) { rc = sbuf_finish(&sb); if (rc == 0) { log(LOG_DEBUG, "%s: device log follows.\n%s", device_get_nameunit(sc->dev), sbuf_data(&sb)); } } sbuf_delete(&sb); } static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_fcoe_stats stats[MAX_NCHAN]; int i, nchan = sc->chip_params->nchan; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { for (i = 0; i < nchan; i++) t4_get_fcoe_stats(sc, i, &stats[i], 1); } mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); if (nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3"); sbuf_printf(sb, "\noctetsDDP: %16ju %16ju %16ju %16ju", stats[0].octets_ddp, stats[1].octets_ddp, stats[2].octets_ddp, stats[3].octets_ddp); sbuf_printf(sb, "\nframesDDP: %16u %16u %16u %16u", stats[0].frames_ddp, stats[1].frames_ddp, stats[2].frames_ddp, stats[3].frames_ddp); sbuf_printf(sb, "\nframesDrop: %16u %16u %16u %16u", stats[0].frames_drop, stats[1].frames_drop, stats[2].frames_drop, stats[3].frames_drop); } else { sbuf_printf(sb, " channel 0 channel 1"); sbuf_printf(sb, "\noctetsDDP: %16ju %16ju", stats[0].octets_ddp, stats[1].octets_ddp); sbuf_printf(sb, "\nframesDDP: %16u %16u", stats[0].frames_ddp, stats[1].frames_ddp); sbuf_printf(sb, "\nframesDrop: %16u %16u", stats[0].frames_drop, stats[1].frames_drop); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; unsigned int map, kbps, ipg, mode; unsigned int pace_tab[NTX_SCHED]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 512, req); if (sb == NULL) return (ENOMEM); 
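	/*
	 * The register reads below are done under reg_lock and are skipped
	 * with ENXIO if the hardware is off limits (e.g. during a reset),
	 * the same pattern the other handlers in this file follow.
	 */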
mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } map = t4_read_reg(sc, A_TP_TX_MOD_QUEUE_REQ_MAP); mode = G_TIMERMODE(t4_read_reg(sc, A_TP_MOD_CONFIG)); t4_read_pace_tbl(sc, pace_tab); sbuf_printf(sb, "Scheduler Mode Channel Rate (Kbps) " "Class IPG (0.1 ns) Flow IPG (us)"); for (i = 0; i < NTX_SCHED; ++i, map >>= 2) { t4_get_tx_sched(sc, i, &kbps, &ipg, 1); sbuf_printf(sb, "\n %u %-5s %u ", i, (mode & (1 << i)) ? "flow" : "class", map & 3); if (kbps) sbuf_printf(sb, "%9u ", kbps); else sbuf_printf(sb, " disabled "); if (ipg) sbuf_printf(sb, "%13u ", ipg); else sbuf_printf(sb, " disabled "); if (pace_tab[i]) sbuf_printf(sb, "%10u", pace_tab[i]); else sbuf_printf(sb, " disabled"); } rc = sbuf_finish(sb); done: mtx_unlock(&sc->reg_lock); sbuf_delete(sb); return (rc); } static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, j; uint64_t *p0, *p1; struct lb_port_stats s[2]; static const char *stat_name[] = { "OctetsOK:", "FramesOK:", "BcastFrames:", "McastFrames:", "UcastFrames:", "ErrorFrames:", "Frames64:", "Frames65To127:", "Frames128To255:", "Frames256To511:", "Frames512To1023:", "Frames1024To1518:", "Frames1519ToMax:", "FramesDropped:", "BG0FramesDropped:", "BG1FramesDropped:", "BG2FramesDropped:", "BG3FramesDropped:", "BG0FramesTrunc:", "BG1FramesTrunc:", "BG2FramesTrunc:", "BG3FramesTrunc:" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); memset(s, 0, sizeof(s)); for (i = 0; i < sc->chip_params->nchan; i += 2) { mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { t4_get_lb_stats(sc, i, &s[0]); t4_get_lb_stats(sc, i + 1, &s[1]); } mtx_unlock(&sc->reg_lock); if (rc != 0) break; p0 = &s[0].octets; p1 = &s[1].octets; sbuf_printf(sb, "%s Loopback %u" " Loopback %u", i == 0 ? 
"" : "\n", i, i + 1); for (j = 0; j < nitems(stat_name); j++) sbuf_printf(sb, "\n%-17s %20ju %20ju", stat_name[j], *p0++, *p1++); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS) { int rc = 0; struct port_info *pi = arg1; struct link_config *lc = &pi->link_cfg; struct sbuf *sb; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 64, req); if (sb == NULL) return (ENOMEM); if (lc->link_ok || lc->link_down_rc == 255) sbuf_printf(sb, "n/a"); else sbuf_printf(sb, "%s", t4_link_down_rc_str(lc->link_down_rc)); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } struct mem_desc { unsigned int base; unsigned int limit; unsigned int idx; }; static int mem_desc_cmp(const void *a, const void *b) { return ((const struct mem_desc *)a)->base - ((const struct mem_desc *)b)->base; } static void mem_region_show(struct sbuf *sb, const char *name, unsigned int from, unsigned int to) { unsigned int size; if (from == to) return; size = to - from + 1; if (size == 0) return; /* XXX: need humanize_number(3) in libkern for a more readable 'size' */ sbuf_printf(sb, "%-15s %#x-%#x [%u]\n", name, from, to, size); } static int sysctl_meminfo(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n; uint32_t lo, hi, used, alloc; static const char *memory[] = {"EDC0:", "EDC1:", "MC:", "MC0:", "MC1:"}; static const char *region[] = { "DBQ contexts:", "IMSG contexts:", "FLM cache:", "TCBs:", "Pstructs:", "Timers:", "Rx FL:", "Tx FL:", "Pstruct FL:", "Tx payload:", "Rx payload:", "LE hash:", "iSCSI region:", "TDDP region:", "TPT region:", "STAG region:", "RQ region:", "RQUDP region:", "PBL region:", "TXPBL region:", "DBVFIFO region:", "ULPRX state:", "ULPTX state:", "On-chip queues:", "TLS keys:", }; struct mem_desc avail[4]; struct mem_desc mem[nitems(region) + 3]; /* up to 3 holes */ struct mem_desc *md = mem; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); for (i = 0; i < nitems(mem); i++) { mem[i].limit = 0; mem[i].idx = i; } mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } /* Find and sort the populated memory ranges */ i = 0; lo = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); if (lo & F_EDRAM0_ENABLE) { hi = t4_read_reg(sc, A_MA_EDRAM0_BAR); avail[i].base = G_EDRAM0_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EDRAM0_SIZE(hi) << 20); avail[i].idx = 0; i++; } if (lo & F_EDRAM1_ENABLE) { hi = t4_read_reg(sc, A_MA_EDRAM1_BAR); avail[i].base = G_EDRAM1_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EDRAM1_SIZE(hi) << 20); avail[i].idx = 1; i++; } if (lo & F_EXT_MEM_ENABLE) { hi = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); avail[i].base = G_EXT_MEM_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EXT_MEM_SIZE(hi) << 20); avail[i].idx = is_t5(sc) ? 
3 : 2; /* Call it MC0 for T5 */ i++; } if (is_t5(sc) && lo & F_EXT_MEM1_ENABLE) { hi = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); avail[i].base = G_EXT_MEM1_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EXT_MEM1_SIZE(hi) << 20); avail[i].idx = 4; i++; } if (!i) /* no memory available */ goto done; qsort(avail, i, sizeof(struct mem_desc), mem_desc_cmp); (md++)->base = t4_read_reg(sc, A_SGE_DBQ_CTXT_BADDR); (md++)->base = t4_read_reg(sc, A_SGE_IMSG_CTXT_BADDR); (md++)->base = t4_read_reg(sc, A_SGE_FLM_CACHE_BADDR); (md++)->base = t4_read_reg(sc, A_TP_CMM_TCB_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_TIMER_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_RX_FLST_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_TX_FLST_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_PS_FLST_BASE); /* the next few have explicit upper bounds */ md->base = t4_read_reg(sc, A_TP_PMM_TX_BASE); md->limit = md->base - 1 + t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE) * G_PMTXMAXPAGE(t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE)); md++; md->base = t4_read_reg(sc, A_TP_PMM_RX_BASE); md->limit = md->base - 1 + t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) * G_PMRXMAXPAGE(t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE)); md++; if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) { if (chip_id(sc) <= CHELSIO_T5) md->base = t4_read_reg(sc, A_LE_DB_HASH_TID_BASE); else md->base = t4_read_reg(sc, A_LE_DB_HASH_TBL_BASE_ADDR); md->limit = 0; } else { md->base = 0; md->idx = nitems(region); /* hide it */ } md++; #define ulp_region(reg) \ md->base = t4_read_reg(sc, A_ULP_ ## reg ## _LLIMIT);\ (md++)->limit = t4_read_reg(sc, A_ULP_ ## reg ## _ULIMIT) ulp_region(RX_ISCSI); ulp_region(RX_TDDP); ulp_region(TX_TPT); ulp_region(RX_STAG); ulp_region(RX_RQ); ulp_region(RX_RQUDP); ulp_region(RX_PBL); ulp_region(TX_PBL); #undef ulp_region md->base = 0; md->idx = nitems(region); if (!is_t4(sc)) { uint32_t size = 0; uint32_t sge_ctrl = t4_read_reg(sc, A_SGE_CONTROL2); uint32_t fifo_size = t4_read_reg(sc, A_SGE_DBVFIFO_SIZE); if (is_t5(sc)) { if (sge_ctrl & F_VFIFO_ENABLE) size = G_DBVFIFO_SIZE(fifo_size); } else size = G_T6_DBVFIFO_SIZE(fifo_size); if (size) { md->base = G_BASEADDR(t4_read_reg(sc, A_SGE_DBVFIFO_BADDR)); md->limit = md->base + (size << 2) - 1; } } md++; md->base = t4_read_reg(sc, A_ULP_RX_CTX_BASE); md->limit = 0; md++; md->base = t4_read_reg(sc, A_ULP_TX_ERR_TABLE_BASE); md->limit = 0; md++; md->base = sc->vres.ocq.start; if (sc->vres.ocq.size) md->limit = md->base + sc->vres.ocq.size - 1; else md->idx = nitems(region); /* hide it */ md++; md->base = sc->vres.key.start; if (sc->vres.key.size) md->limit = md->base + sc->vres.key.size - 1; else md->idx = nitems(region); /* hide it */ md++; /* add any address-space holes, there can be up to 3 */ for (n = 0; n < i - 1; n++) if (avail[n].limit < avail[n + 1].base) (md++)->base = avail[n].limit; if (avail[n].limit) (md++)->base = avail[n].limit; n = md - mem; qsort(mem, n, sizeof(struct mem_desc), mem_desc_cmp); for (lo = 0; lo < i; lo++) mem_region_show(sb, memory[avail[lo].idx], avail[lo].base, avail[lo].limit - 1); sbuf_printf(sb, "\n"); for (i = 0; i < n; i++) { if (mem[i].idx >= nitems(region)) continue; /* skip holes */ if (!mem[i].limit) mem[i].limit = i < n - 1 ? 
mem[i + 1].base - 1 : ~0; mem_region_show(sb, region[mem[i].idx], mem[i].base, mem[i].limit); } sbuf_printf(sb, "\n"); lo = t4_read_reg(sc, A_CIM_SDRAM_BASE_ADDR); hi = t4_read_reg(sc, A_CIM_SDRAM_ADDR_SIZE) + lo - 1; mem_region_show(sb, "uP RAM:", lo, hi); lo = t4_read_reg(sc, A_CIM_EXTMEM2_BASE_ADDR); hi = t4_read_reg(sc, A_CIM_EXTMEM2_ADDR_SIZE) + lo - 1; mem_region_show(sb, "uP Extmem2:", lo, hi); lo = t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE); sbuf_printf(sb, "\n%u Rx pages of size %uKiB for %u channels\n", G_PMRXMAXPAGE(lo), t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) >> 10, (lo & F_PMRXNUMCHN) ? 2 : 1); lo = t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE); hi = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE); sbuf_printf(sb, "%u Tx pages of size %u%ciB for %u channels\n", G_PMTXMAXPAGE(lo), hi >= (1 << 20) ? (hi >> 20) : (hi >> 10), hi >= (1 << 20) ? 'M' : 'K', 1 << G_PMTXNUMCHN(lo)); sbuf_printf(sb, "%u p-structs\n", t4_read_reg(sc, A_TP_CMM_MM_MAX_PSTRUCT)); for (i = 0; i < 4; i++) { if (chip_id(sc) > CHELSIO_T5) lo = t4_read_reg(sc, A_MPS_RX_MAC_BG_PG_CNT0 + i * 4); else lo = t4_read_reg(sc, A_MPS_RX_PG_RSV0 + i * 4); if (is_t5(sc)) { used = G_T5_USED(lo); alloc = G_T5_ALLOC(lo); } else { used = G_USED(lo); alloc = G_ALLOC(lo); } /* For T6 these are MAC buffer groups */ sbuf_printf(sb, "\nPort %d using %u pages out of %u allocated", i, used, alloc); } for (i = 0; i < sc->chip_params->nchan; i++) { if (chip_id(sc) > CHELSIO_T5) lo = t4_read_reg(sc, A_MPS_RX_LPBK_BG_PG_CNT0 + i * 4); else lo = t4_read_reg(sc, A_MPS_RX_PG_RSV4 + i * 4); if (is_t5(sc)) { used = G_T5_USED(lo); alloc = G_T5_ALLOC(lo); } else { used = G_USED(lo); alloc = G_ALLOC(lo); } /* For T6 these are MAC buffer groups */ sbuf_printf(sb, "\nLoopback %d using %u pages out of %u allocated", i, used, alloc); } done: mtx_unlock(&sc->reg_lock); if (rc == 0) rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static inline void tcamxy2valmask(uint64_t x, uint64_t y, uint8_t *addr, uint64_t *mask) { *mask = x | y; y = htobe64(y); memcpy(addr, (char *)&y + 2, ETHER_ADDR_LEN); } static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; MPASS(chip_id(sc) <= CHELSIO_T5); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Idx Ethernet address Mask Vld Ports PF" " VF Replication P0 P1 P2 P3 ML"); for (i = 0; i < sc->chip_params->mps_tcam_size; i++) { uint64_t tcamx, tcamy, mask; uint32_t cls_lo, cls_hi; uint8_t addr[ETHER_ADDR_LEN]; mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { tcamy = t4_read_reg64(sc, MPS_CLS_TCAM_Y_L(i)); tcamx = t4_read_reg64(sc, MPS_CLS_TCAM_X_L(i)); } mtx_unlock(&sc->reg_lock); if (rc != 0) break; if (tcamx & tcamy) continue; tcamxy2valmask(tcamx, tcamy, addr, &mask); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i)); cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i)); } mtx_unlock(&sc->reg_lock); if (rc != 0) break; sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x %012jx" " %c %#x%4u%4d", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask, (cls_lo & F_SRAM_VLD) ? 'Y' : 'N', G_PORTMAP(cls_hi), G_PF(cls_lo), (cls_lo & F_VF_VALID) ? 
G_VF(cls_lo) : -1); if (cls_lo & F_REPLICATE) { struct fw_ldst_cmd ldst_cmd; memset(&ldst_cmd, 0, sizeof(ldst_cmd)); ldst_cmd.op_to_addrspace = htobe32(V_FW_CMD_OP(FW_LDST_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ | V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS)); ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd)); ldst_cmd.u.mps.rplc.fid_idx = htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) | V_FW_LDST_CMD_IDX(i)); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mps"); if (rc) break; if (hw_off_limits(sc)) rc = ENXIO; else rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd, sizeof(ldst_cmd), &ldst_cmd); end_synchronized_op(sc, 0); if (rc != 0) break; else { sbuf_printf(sb, " %08x %08x %08x %08x", be32toh(ldst_cmd.u.mps.rplc.rplc127_96), be32toh(ldst_cmd.u.mps.rplc.rplc95_64), be32toh(ldst_cmd.u.mps.rplc.rplc63_32), be32toh(ldst_cmd.u.mps.rplc.rplc31_0)); } } else sbuf_printf(sb, "%36s", ""); sbuf_printf(sb, "%4u%3u%3u%3u %#3x", G_SRAM_PRIO0(cls_lo), G_SRAM_PRIO1(cls_lo), G_SRAM_PRIO2(cls_lo), G_SRAM_PRIO3(cls_lo), (cls_lo >> S_MULTILISTEN0) & 0xf); } if (rc) (void) sbuf_finish(sb); else rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; MPASS(chip_id(sc) > CHELSIO_T5); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Idx Ethernet address Mask VNI Mask" " IVLAN Vld DIP_Hit Lookup Port Vld Ports PF VF" " Replication" " P0 P1 P2 P3 ML\n"); for (i = 0; i < sc->chip_params->mps_tcam_size; i++) { uint8_t dip_hit, vlan_vld, lookup_type, port_num; uint16_t ivlan; uint64_t tcamx, tcamy, val, mask; uint32_t cls_lo, cls_hi, ctl, data2, vnix, vniy; uint8_t addr[ETHER_ADDR_LEN]; ctl = V_CTLREQID(1) | V_CTLCMDTYPE(0) | V_CTLXYBITSEL(0); if (i < 256) ctl |= V_CTLTCAMINDEX(i) | V_CTLTCAMSEL(0); else ctl |= V_CTLTCAMINDEX(i - 256) | V_CTLTCAMSEL(1); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { t4_write_reg(sc, A_MPS_CLS_TCAM_DATA2_CTL, ctl); val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1); tcamy = G_DMACH(val) << 32; tcamy |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1); data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1); } mtx_unlock(&sc->reg_lock); if (rc != 0) break; lookup_type = G_DATALKPTYPE(data2); port_num = G_DATAPORTNUM(data2); if (lookup_type && lookup_type != M_DATALKPTYPE) { /* Inner header VNI */ vniy = ((data2 & F_DATAVIDH2) << 23) | (G_DATAVIDH1(data2) << 16) | G_VIDL(val); dip_hit = data2 & F_DATADIPHIT; vlan_vld = 0; } else { vniy = 0; dip_hit = 0; vlan_vld = data2 & F_DATAVIDH2; ivlan = G_VIDL(val); } ctl |= V_CTLXYBITSEL(1); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { t4_write_reg(sc, A_MPS_CLS_TCAM_DATA2_CTL, ctl); val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1); tcamx = G_DMACH(val) << 32; tcamx |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1); data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1); } mtx_unlock(&sc->reg_lock); if (rc != 0) break; if (lookup_type && lookup_type != M_DATALKPTYPE) { /* Inner header VNI mask */ vnix = ((data2 & F_DATAVIDH2) << 23) | (G_DATAVIDH1(data2) << 16) | G_VIDL(val); } else vnix = 0; if (tcamx & tcamy) continue; tcamxy2valmask(tcamx, tcamy, addr, &mask); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i)); cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i)); } mtx_unlock(&sc->reg_lock); if (rc != 0) break; if 
(lookup_type && lookup_type != M_DATALKPTYPE) { sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x " "%012jx %06x %06x - - %3c" " I %4x %3c %#x%4u%4d", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask, vniy, vnix, dip_hit ? 'Y' : 'N', port_num, cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N', G_PORTMAP(cls_hi), G_T6_PF(cls_lo), cls_lo & F_T6_VF_VALID ? G_T6_VF(cls_lo) : -1); } else { sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x " "%012jx - - ", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask); if (vlan_vld) sbuf_printf(sb, "%4u Y ", ivlan); else sbuf_printf(sb, " - N "); sbuf_printf(sb, "- %3c %4x %3c %#x%4u%4d", lookup_type ? 'I' : 'O', port_num, cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N', G_PORTMAP(cls_hi), G_T6_PF(cls_lo), cls_lo & F_T6_VF_VALID ? G_T6_VF(cls_lo) : -1); } if (cls_lo & F_T6_REPLICATE) { struct fw_ldst_cmd ldst_cmd; memset(&ldst_cmd, 0, sizeof(ldst_cmd)); ldst_cmd.op_to_addrspace = htobe32(V_FW_CMD_OP(FW_LDST_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ | V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS)); ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd)); ldst_cmd.u.mps.rplc.fid_idx = htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) | V_FW_LDST_CMD_IDX(i)); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t6mps"); if (rc) break; if (hw_off_limits(sc)) rc = ENXIO; else rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd, sizeof(ldst_cmd), &ldst_cmd); end_synchronized_op(sc, 0); if (rc != 0) break; else { sbuf_printf(sb, " %08x %08x %08x %08x" " %08x %08x %08x %08x", be32toh(ldst_cmd.u.mps.rplc.rplc255_224), be32toh(ldst_cmd.u.mps.rplc.rplc223_192), be32toh(ldst_cmd.u.mps.rplc.rplc191_160), be32toh(ldst_cmd.u.mps.rplc.rplc159_128), be32toh(ldst_cmd.u.mps.rplc.rplc127_96), be32toh(ldst_cmd.u.mps.rplc.rplc95_64), be32toh(ldst_cmd.u.mps.rplc.rplc63_32), be32toh(ldst_cmd.u.mps.rplc.rplc31_0)); } } else sbuf_printf(sb, "%72s", ""); sbuf_printf(sb, "%4u%3u%3u%3u %#x", G_T6_SRAM_PRIO0(cls_lo), G_T6_SRAM_PRIO1(cls_lo), G_T6_SRAM_PRIO2(cls_lo), G_T6_SRAM_PRIO3(cls_lo), (cls_lo >> S_T6_MULTILISTEN0) & 0xf); } if (rc) (void) sbuf_finish(sb); else rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; uint16_t mtus[NMTUS]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_read_mtu_tbl(sc, mtus, NULL); mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "%u %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u", mtus[0], mtus[1], mtus[2], mtus[3], mtus[4], mtus[5], mtus[6], mtus[7], mtus[8], mtus[9], mtus[10], mtus[11], mtus[12], mtus[13], mtus[14], mtus[15]); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint32_t tx_cnt[MAX_PM_NSTATS], rx_cnt[MAX_PM_NSTATS]; uint64_t tx_cyc[MAX_PM_NSTATS], rx_cyc[MAX_PM_NSTATS]; static const char *tx_stats[MAX_PM_NSTATS] = { "Read:", "Write bypass:", "Write mem:", "Bypass + mem:", "Tx FIFO wait", NULL, "Tx latency" }; static const char *rx_stats[MAX_PM_NSTATS] = { "Read:", "Write bypass:", "Write mem:", "Flush:", "Rx FIFO wait", NULL, "Rx latency" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { t4_pmtx_get_stats(sc, tx_cnt, tx_cyc); t4_pmrx_get_stats(sc, rx_cnt, 
		    rx_cyc);
	}
	mtx_unlock(&sc->reg_lock);
	if (rc != 0)
		return (rc);

	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
	if (sb == NULL)
		return (ENOMEM);

	sbuf_printf(sb, " Tx pcmds Tx bytes");
	for (i = 0; i < 4; i++) {
		sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i],
		    tx_cyc[i]);
	}

	sbuf_printf(sb, "\n Rx pcmds Rx bytes");
	for (i = 0; i < 4; i++) {
		sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i],
		    rx_cyc[i]);
	}

	if (chip_id(sc) > CHELSIO_T5) {
		sbuf_printf(sb, "\n Total wait Total occupancy");
		sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i],
		    tx_cyc[i]);
		sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i],
		    rx_cyc[i]);

		i += 2;
		MPASS(i < nitems(tx_stats));

		sbuf_printf(sb, "\n Reads Total wait");
		sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i],
		    tx_cyc[i]);
		sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i],
		    rx_cyc[i]);
	}

	rc = sbuf_finish(sb);
	sbuf_delete(sb);

	return (rc);
}

static int
sysctl_rdma_stats(SYSCTL_HANDLER_ARGS)
{
	struct adapter *sc = arg1;
	struct sbuf *sb;
	int rc;
	struct tp_rdma_stats stats;

	rc = sysctl_wire_old_buffer(req, 0);
	if (rc != 0)
		return (rc);

	mtx_lock(&sc->reg_lock);
	if (hw_off_limits(sc))
		rc = ENXIO;
	else
		t4_tp_get_rdma_stats(sc, &stats, 0);
	mtx_unlock(&sc->reg_lock);
	if (rc != 0)
		return (rc);

	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
	if (sb == NULL)
		return (ENOMEM);

	sbuf_printf(sb, "NoRQEModDeferrals: %u\n", stats.rqe_dfr_mod);
	sbuf_printf(sb, "NoRQEPktDeferrals: %u", stats.rqe_dfr_pkt);
	rc = sbuf_finish(sb);
	sbuf_delete(sb);

	return (rc);
}

static int
sysctl_tcp_stats(SYSCTL_HANDLER_ARGS)
{
	struct adapter *sc = arg1;
	struct sbuf *sb;
	int rc;
	struct tp_tcp_stats v4, v6;

	rc = sysctl_wire_old_buffer(req, 0);
	if (rc != 0)
		return (rc);

	mtx_lock(&sc->reg_lock);
	if (hw_off_limits(sc))
		rc = ENXIO;
	else
		t4_tp_get_tcp_stats(sc, &v4, &v6, 0);
	mtx_unlock(&sc->reg_lock);
	if (rc != 0)
		return (rc);

	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
	if (sb == NULL)
		return (ENOMEM);

	sbuf_printf(sb, " IP IPv6\n");
	sbuf_printf(sb, "OutRsts: %20u %20u\n",
	    v4.tcp_out_rsts, v6.tcp_out_rsts);
	sbuf_printf(sb, "InSegs: %20ju %20ju\n",
	    v4.tcp_in_segs, v6.tcp_in_segs);
	sbuf_printf(sb, "OutSegs: %20ju %20ju\n",
	    v4.tcp_out_segs, v6.tcp_out_segs);
	sbuf_printf(sb, "RetransSegs: %20ju %20ju",
	    v4.tcp_retrans_segs, v6.tcp_retrans_segs);
	rc = sbuf_finish(sb);
	sbuf_delete(sb);

	return (rc);
}

static int
sysctl_tids(SYSCTL_HANDLER_ARGS)
{
	struct adapter *sc = arg1;
	struct sbuf *sb;
	int rc;
	uint32_t x, y;
	struct tid_info *t = &sc->tids;

	rc = sysctl_wire_old_buffer(req, 0);
	if (rc != 0)
		return (rc);

	sb = sbuf_new_for_sysctl(NULL, NULL, 256, req);
	if (sb == NULL)
		return (ENOMEM);

	if (t->natids) {
		sbuf_printf(sb, "ATID range: 0-%u, in use: %u\n",
		    t->natids - 1, t->atids_in_use);
	}

	if (t->nhpftids) {
		sbuf_printf(sb, "HPFTID range: %u-%u, in use: %u\n",
		    t->hpftid_base, t->hpftid_end, t->hpftids_in_use);
	}

	if (t->ntids) {
		bool hashen = false;

		mtx_lock(&sc->reg_lock);
		if (hw_off_limits(sc))
			rc = ENXIO;
		else if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) {
			hashen = true;
			if (chip_id(sc) <= CHELSIO_T5) {
				x = t4_read_reg(sc, A_LE_DB_SERVER_INDEX) / 4;
				y = t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4;
			} else {
				x = t4_read_reg(sc, A_LE_DB_SRVR_START_INDEX);
				y = t4_read_reg(sc, A_T6_LE_DB_HASH_TID_BASE);
			}
		}
		mtx_unlock(&sc->reg_lock);
		if (rc != 0)
			goto done;

		sbuf_printf(sb, "TID range: ");
		if (hashen) {
			if (x)
				sbuf_printf(sb, "%u-%u, ", t->tid_base, x - 1);
			sbuf_printf(sb, "%u-%u", y, t->ntids - 1);
		} else {
			sbuf_printf(sb, "%u-%u", t->tid_base, t->tid_base +
t->ntids - 1); } sbuf_printf(sb, ", in use: %u\n", atomic_load_acq_int(&t->tids_in_use)); } if (t->nstids) { sbuf_printf(sb, "STID range: %u-%u, in use: %u\n", t->stid_base, t->stid_base + t->nstids - 1, t->stids_in_use); } if (t->nftids) { sbuf_printf(sb, "FTID range: %u-%u, in use: %u\n", t->ftid_base, t->ftid_end, t->ftids_in_use); } if (t->netids) { sbuf_printf(sb, "ETID range: %u-%u, in use: %u\n", t->etid_base, t->etid_base + t->netids - 1, t->etids_in_use); } mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { x = t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV4); y = t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV6); } mtx_unlock(&sc->reg_lock); if (rc != 0) goto done; sbuf_printf(sb, "HW TID usage: %u IP users, %u IPv6 users", x, y); done: if (rc == 0) rc = sbuf_finish(sb); else (void)sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_err_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_tp_get_err_stats(sc, &stats, 0); mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3\n"); sbuf_printf(sb, "macInErrs: %10u %10u %10u %10u\n", stats.mac_in_errs[0], stats.mac_in_errs[1], stats.mac_in_errs[2], stats.mac_in_errs[3]); sbuf_printf(sb, "hdrInErrs: %10u %10u %10u %10u\n", stats.hdr_in_errs[0], stats.hdr_in_errs[1], stats.hdr_in_errs[2], stats.hdr_in_errs[3]); sbuf_printf(sb, "tcpInErrs: %10u %10u %10u %10u\n", stats.tcp_in_errs[0], stats.tcp_in_errs[1], stats.tcp_in_errs[2], stats.tcp_in_errs[3]); sbuf_printf(sb, "tcp6InErrs: %10u %10u %10u %10u\n", stats.tcp6_in_errs[0], stats.tcp6_in_errs[1], stats.tcp6_in_errs[2], stats.tcp6_in_errs[3]); sbuf_printf(sb, "tnlCongDrops: %10u %10u %10u %10u\n", stats.tnl_cong_drops[0], stats.tnl_cong_drops[1], stats.tnl_cong_drops[2], stats.tnl_cong_drops[3]); sbuf_printf(sb, "tnlTxDrops: %10u %10u %10u %10u\n", stats.tnl_tx_drops[0], stats.tnl_tx_drops[1], stats.tnl_tx_drops[2], stats.tnl_tx_drops[3]); sbuf_printf(sb, "ofldVlanDrops: %10u %10u %10u %10u\n", stats.ofld_vlan_drops[0], stats.ofld_vlan_drops[1], stats.ofld_vlan_drops[2], stats.ofld_vlan_drops[3]); sbuf_printf(sb, "ofldChanDrops: %10u %10u %10u %10u\n\n", stats.ofld_chan_drops[0], stats.ofld_chan_drops[1], stats.ofld_chan_drops[2], stats.ofld_chan_drops[3]); } else { sbuf_printf(sb, " channel 0 channel 1\n"); sbuf_printf(sb, "macInErrs: %10u %10u\n", stats.mac_in_errs[0], stats.mac_in_errs[1]); sbuf_printf(sb, "hdrInErrs: %10u %10u\n", stats.hdr_in_errs[0], stats.hdr_in_errs[1]); sbuf_printf(sb, "tcpInErrs: %10u %10u\n", stats.tcp_in_errs[0], stats.tcp_in_errs[1]); sbuf_printf(sb, "tcp6InErrs: %10u %10u\n", stats.tcp6_in_errs[0], stats.tcp6_in_errs[1]); sbuf_printf(sb, "tnlCongDrops: %10u %10u\n", stats.tnl_cong_drops[0], stats.tnl_cong_drops[1]); sbuf_printf(sb, "tnlTxDrops: %10u %10u\n", stats.tnl_tx_drops[0], stats.tnl_tx_drops[1]); sbuf_printf(sb, "ofldVlanDrops: %10u %10u\n", stats.ofld_vlan_drops[0], stats.ofld_vlan_drops[1]); sbuf_printf(sb, "ofldChanDrops: %10u %10u\n\n", stats.ofld_chan_drops[0], stats.ofld_chan_drops[1]); } sbuf_printf(sb, "ofldNoNeigh: %u\nofldCongDefer: %u", stats.ofld_no_neigh, stats.ofld_cong_defer); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int 
sysctl_tnl_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_tnl_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_tp_get_tnl_stats(sc, &stats, 1); mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3\n"); sbuf_printf(sb, "OutPkts: %10u %10u %10u %10u\n", stats.out_pkt[0], stats.out_pkt[1], stats.out_pkt[2], stats.out_pkt[3]); sbuf_printf(sb, "InPkts: %10u %10u %10u %10u", stats.in_pkt[0], stats.in_pkt[1], stats.in_pkt[2], stats.in_pkt[3]); } else { sbuf_printf(sb, " channel 0 channel 1\n"); sbuf_printf(sb, "OutPkts: %10u %10u\n", stats.out_pkt[0], stats.out_pkt[1]); sbuf_printf(sb, "InPkts: %10u %10u", stats.in_pkt[0], stats.in_pkt[1]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct tp_params *tpp = &sc->params.tp; u_int mask; int rc; mask = tpp->la_mask >> 16; rc = sysctl_handle_int(oidp, &mask, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (mask > 0xffff) return (EINVAL); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { tpp->la_mask = mask << 16; t4_set_reg_field(sc, A_TP_DBG_LA_CONFIG, 0xffff0000U, tpp->la_mask); } mtx_unlock(&sc->reg_lock); return (rc); } struct field_desc { const char *name; u_int start; u_int width; }; static void field_desc_show(struct sbuf *sb, uint64_t v, const struct field_desc *f) { char buf[32]; int line_size = 0; while (f->name) { uint64_t mask = (1ULL << f->width) - 1; int len = snprintf(buf, sizeof(buf), "%s: %ju", f->name, ((uintmax_t)v >> f->start) & mask); if (line_size + len >= 79) { line_size = 8; sbuf_printf(sb, "\n "); } sbuf_printf(sb, "%s ", buf); line_size += len + 1; f++; } sbuf_printf(sb, "\n"); } static const struct field_desc tp_la0[] = { { "RcfOpCodeOut", 60, 4 }, { "State", 56, 4 }, { "WcfState", 52, 4 }, { "RcfOpcSrcOut", 50, 2 }, { "CRxError", 49, 1 }, { "ERxError", 48, 1 }, { "SanityFailed", 47, 1 }, { "SpuriousMsg", 46, 1 }, { "FlushInputMsg", 45, 1 }, { "FlushInputCpl", 44, 1 }, { "RssUpBit", 43, 1 }, { "RssFilterHit", 42, 1 }, { "Tid", 32, 10 }, { "InitTcb", 31, 1 }, { "LineNumber", 24, 7 }, { "Emsg", 23, 1 }, { "EdataOut", 22, 1 }, { "Cmsg", 21, 1 }, { "CdataOut", 20, 1 }, { "EreadPdu", 19, 1 }, { "CreadPdu", 18, 1 }, { "TunnelPkt", 17, 1 }, { "RcfPeerFin", 16, 1 }, { "RcfReasonOut", 12, 4 }, { "TxCchannel", 10, 2 }, { "RcfTxChannel", 8, 2 }, { "RxEchannel", 6, 2 }, { "RcfRxChannel", 5, 1 }, { "RcfDataOutSrdy", 4, 1 }, { "RxDvld", 3, 1 }, { "RxOoDvld", 2, 1 }, { "RxCongestion", 1, 1 }, { "TxCongestion", 0, 1 }, { NULL } }; static const struct field_desc tp_la1[] = { { "CplCmdIn", 56, 8 }, { "CplCmdOut", 48, 8 }, { "ESynOut", 47, 1 }, { "EAckOut", 46, 1 }, { "EFinOut", 45, 1 }, { "ERstOut", 44, 1 }, { "SynIn", 43, 1 }, { "AckIn", 42, 1 }, { "FinIn", 41, 1 }, { "RstIn", 40, 1 }, { "DataIn", 39, 1 }, { "DataInVld", 38, 1 }, { "PadIn", 37, 1 }, { "RxBufEmpty", 36, 1 }, { "RxDdp", 35, 1 }, { "RxFbCongestion", 34, 1 }, { "TxFbCongestion", 33, 1 }, { "TxPktSumSrdy", 32, 1 }, { "RcfUlpType", 28, 4 }, { "Eread", 27, 1 }, { "Ebypass", 26, 1 }, { "Esave", 25, 1 }, { "Static0", 24, 1 }, { "Cread", 23, 1 }, { "Cbypass", 22, 1 }, { "Csave", 21, 1 }, { "CPktOut", 20, 1 }, { "RxPagePoolFull", 18, 2 }, { "RxLpbkPkt", 17, 
1 }, { "TxLpbkPkt", 16, 1 }, { "RxVfValid", 15, 1 }, { "SynLearned", 14, 1 }, { "SetDelEntry", 13, 1 }, { "SetInvEntry", 12, 1 }, { "CpcmdDvld", 11, 1 }, { "CpcmdSave", 10, 1 }, { "RxPstructsFull", 8, 2 }, { "EpcmdDvld", 7, 1 }, { "EpcmdFlush", 6, 1 }, { "EpcmdTrimPrefix", 5, 1 }, { "EpcmdTrimPostfix", 4, 1 }, { "ERssIp4Pkt", 3, 1 }, { "ERssIp6Pkt", 2, 1 }, { "ERssTcpUdpPkt", 1, 1 }, { "ERssFceFipPkt", 0, 1 }, { NULL } }; static const struct field_desc tp_la2[] = { { "CplCmdIn", 56, 8 }, { "MpsVfVld", 55, 1 }, { "MpsPf", 52, 3 }, { "MpsVf", 44, 8 }, { "SynIn", 43, 1 }, { "AckIn", 42, 1 }, { "FinIn", 41, 1 }, { "RstIn", 40, 1 }, { "DataIn", 39, 1 }, { "DataInVld", 38, 1 }, { "PadIn", 37, 1 }, { "RxBufEmpty", 36, 1 }, { "RxDdp", 35, 1 }, { "RxFbCongestion", 34, 1 }, { "TxFbCongestion", 33, 1 }, { "TxPktSumSrdy", 32, 1 }, { "RcfUlpType", 28, 4 }, { "Eread", 27, 1 }, { "Ebypass", 26, 1 }, { "Esave", 25, 1 }, { "Static0", 24, 1 }, { "Cread", 23, 1 }, { "Cbypass", 22, 1 }, { "Csave", 21, 1 }, { "CPktOut", 20, 1 }, { "RxPagePoolFull", 18, 2 }, { "RxLpbkPkt", 17, 1 }, { "TxLpbkPkt", 16, 1 }, { "RxVfValid", 15, 1 }, { "SynLearned", 14, 1 }, { "SetDelEntry", 13, 1 }, { "SetInvEntry", 12, 1 }, { "CpcmdDvld", 11, 1 }, { "CpcmdSave", 10, 1 }, { "RxPstructsFull", 8, 2 }, { "EpcmdDvld", 7, 1 }, { "EpcmdFlush", 6, 1 }, { "EpcmdTrimPrefix", 5, 1 }, { "EpcmdTrimPostfix", 4, 1 }, { "ERssIp4Pkt", 3, 1 }, { "ERssIp6Pkt", 2, 1 }, { "ERssTcpUdpPkt", 1, 1 }, { "ERssFceFipPkt", 0, 1 }, { NULL } }; static void tp_la_show(struct sbuf *sb, uint64_t *p, int idx) { field_desc_show(sb, *p, tp_la0); } static void tp_la_show2(struct sbuf *sb, uint64_t *p, int idx) { if (idx) sbuf_printf(sb, "\n"); field_desc_show(sb, p[0], tp_la0); if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) field_desc_show(sb, p[1], tp_la0); } static void tp_la_show3(struct sbuf *sb, uint64_t *p, int idx) { if (idx) sbuf_printf(sb, "\n"); field_desc_show(sb, p[0], tp_la0); if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) field_desc_show(sb, p[1], (p[0] & (1 << 17)) ? 
tp_la2 : tp_la1); } static int sysctl_tp_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; uint64_t *buf, *p; int rc; u_int i, inc; void (*show_func)(struct sbuf *, uint64_t *, int); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(TPLA_SIZE * sizeof(uint64_t), M_CXGBE, M_ZERO | M_WAITOK); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { t4_tp_read_la(sc, buf, NULL); switch (G_DBGLAMODE(t4_read_reg(sc, A_TP_DBG_LA_CONFIG))) { case 2: inc = 2; show_func = tp_la_show2; break; case 3: inc = 2; show_func = tp_la_show3; break; default: inc = 1; show_func = tp_la_show; } } mtx_unlock(&sc->reg_lock); if (rc != 0) goto done; p = buf; for (i = 0; i < TPLA_SIZE / inc; i++, p += inc) (*show_func)(sb, p, i); rc = sbuf_finish(sb); done: sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; u64 nrate[MAX_NCHAN], orate[MAX_NCHAN]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_get_chan_txrate(sc, nrate, orate); mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3\n"); sbuf_printf(sb, "NIC B/s: %10ju %10ju %10ju %10ju\n", nrate[0], nrate[1], nrate[2], nrate[3]); sbuf_printf(sb, "Offload B/s: %10ju %10ju %10ju %10ju", orate[0], orate[1], orate[2], orate[3]); } else { sbuf_printf(sb, " channel 0 channel 1\n"); sbuf_printf(sb, "NIC B/s: %10ju %10ju\n", nrate[0], nrate[1]); sbuf_printf(sb, "Offload B/s: %10ju %10ju", orate[0], orate[1]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; uint32_t *buf, *p; int rc, i; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(ULPRX_LA_SIZE * 8 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_ulprx_read_la(sc, buf); mtx_unlock(&sc->reg_lock); if (rc != 0) goto done; p = buf; sbuf_printf(sb, " Pcmd Type Message" " Data"); for (i = 0; i < ULPRX_LA_SIZE; i++, p += 8) { sbuf_printf(sb, "\n%08x%08x %4x %08x %08x%08x%08x%08x", p[1], p[0], p[2], p[3], p[7], p[6], p[5], p[4]); } rc = sbuf_finish(sb); done: sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; uint32_t cfg, s1, s2; MPASS(chip_id(sc) >= CHELSIO_T5); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { cfg = t4_read_reg(sc, A_SGE_STAT_CFG); s1 = t4_read_reg(sc, A_SGE_STAT_TOTAL); s2 = t4_read_reg(sc, A_SGE_STAT_MATCH); } mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); if (G_STATSOURCE_T5(cfg) == 7) { int mode; mode = is_t5(sc) ? 
G_STATMODE(cfg) : G_T6_STATMODE(cfg); if (mode == 0) sbuf_printf(sb, "total %d, incomplete %d", s1, s2); else if (mode == 1) sbuf_printf(sb, "total %d, data overflow %d", s1, s2); else sbuf_printf(sb, "unknown mode %d", mode); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_cpus(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; enum cpu_sets op = arg2; cpuset_t cpuset; struct sbuf *sb; int i, rc; MPASS(op == LOCAL_CPUS || op == INTR_CPUS); CPU_ZERO(&cpuset); rc = bus_get_cpus(sc->dev, op, sizeof(cpuset), &cpuset); if (rc != 0) return (rc); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); CPU_FOREACH(i) sbuf_printf(sb, "%d ", i); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_reset(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int val; int rc; val = sc->num_resets; rc = sysctl_handle_int(oidp, &val, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (val == 0) { /* Zero out the counter that tracks reset. */ sc->num_resets = 0; return (0); } if (val != 1) return (EINVAL); /* 0 or 1 are the only legal values */ if (hw_off_limits(sc)) /* harmless race */ return (EALREADY); taskqueue_enqueue(reset_tq, &sc->reset_task); return (0); } #ifdef TCP_OFFLOAD static int sysctl_tls(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int i, j, v, rc; struct vi_info *vi; v = sc->tt.tls; rc = sysctl_handle_int(oidp, &v, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (v != 0 && !(sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS)) return (ENOTSUP); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4stls"); if (rc) return (rc); if (hw_off_limits(sc)) rc = ENXIO; else { sc->tt.tls = !!v; for_each_port(sc, i) { for_each_vi(sc->port[i], j, vi) { if (vi->flags & VI_INIT_DONE) t4_update_fl_bufsize(vi->ifp); } } } end_synchronized_op(sc, 0); return (rc); } static int sysctl_tls_rx_ports(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int *old_ports, *new_ports; int i, new_count, rc; if (req->newptr == NULL && req->oldptr == NULL) return (SYSCTL_OUT(req, NULL, imax(sc->tt.num_tls_rx_ports, 1) * sizeof(sc->tt.tls_rx_ports[0]))); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tlsrx"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } if (sc->tt.num_tls_rx_ports == 0) { i = -1; rc = SYSCTL_OUT(req, &i, sizeof(i)); } else rc = SYSCTL_OUT(req, sc->tt.tls_rx_ports, sc->tt.num_tls_rx_ports * sizeof(sc->tt.tls_rx_ports[0])); if (rc == 0 && req->newptr != NULL) { new_count = req->newlen / sizeof(new_ports[0]); new_ports = malloc(new_count * sizeof(new_ports[0]), M_CXGBE, M_WAITOK); rc = SYSCTL_IN(req, new_ports, new_count * sizeof(new_ports[0])); if (rc) goto err; /* Allow setting to a single '-1' to clear the list. 
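A hypothetical invocation (the sysctl node path is assumed here for illustration, not taken from this patch): sysctl dev.t4nex.0.toe.tls_rx_ports=-1. 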
*/ if (new_count == 1 && new_ports[0] == -1) { ADAPTER_LOCK(sc); old_ports = sc->tt.tls_rx_ports; sc->tt.tls_rx_ports = NULL; sc->tt.num_tls_rx_ports = 0; ADAPTER_UNLOCK(sc); free(old_ports, M_CXGBE); } else { for (i = 0; i < new_count; i++) { if (new_ports[i] < 1 || new_ports[i] > IPPORT_MAX) { rc = EINVAL; goto err; } } ADAPTER_LOCK(sc); old_ports = sc->tt.tls_rx_ports; sc->tt.tls_rx_ports = new_ports; sc->tt.num_tls_rx_ports = new_count; ADAPTER_UNLOCK(sc); free(old_ports, M_CXGBE); new_ports = NULL; } err: free(new_ports, M_CXGBE); } done: end_synchronized_op(sc, 0); return (rc); } static int sysctl_tls_rx_timeout(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int v, rc; v = sc->tt.tls_rx_timeout; rc = sysctl_handle_int(oidp, &v, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (v < 0) return (EINVAL); if (v != 0 && !(sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS)) return (ENOTSUP); sc->tt.tls_rx_timeout = v; return (0); } static void unit_conv(char *buf, size_t len, u_int val, u_int factor) { u_int rem = val % factor; if (rem == 0) snprintf(buf, len, "%u", val / factor); else { while (rem % 10 == 0) rem /= 10; snprintf(buf, len, "%u.%u", val / factor, rem); } } static int sysctl_tp_tick(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; char buf[16]; u_int res, re; u_int cclk_ps = 1000000000 / sc->params.vpd.cclk; mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) res = (u_int)-1; else res = t4_read_reg(sc, A_TP_TIMER_RESOLUTION); mtx_unlock(&sc->reg_lock); if (res == (u_int)-1) return (ENXIO); switch (arg2) { case 0: /* timer_tick */ re = G_TIMERRESOLUTION(res); break; case 1: /* TCP timestamp tick */ re = G_TIMESTAMPRESOLUTION(res); break; case 2: /* DACK tick */ re = G_DELAYEDACKRESOLUTION(res); break; default: return (EDOOFUS); } unit_conv(buf, sizeof(buf), (cclk_ps << re), 1000000); return (sysctl_handle_string(oidp, buf, sizeof(buf), req)); } static int sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc; u_int dack_tmr, dack_re, v; u_int cclk_ps = 1000000000 / sc->params.vpd.cclk; mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { rc = 0; dack_re = G_DELAYEDACKRESOLUTION(t4_read_reg(sc, A_TP_TIMER_RESOLUTION)); dack_tmr = t4_read_reg(sc, A_TP_DACK_TIMER); } mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); v = ((cclk_ps << dack_re) / 1000000) * dack_tmr; return (sysctl_handle_int(oidp, &v, 0, req)); } static int sysctl_tp_timer(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc, reg = arg2; u_int tre; u_long tp_tick_us, v; u_int cclk_ps = 1000000000 / sc->params.vpd.cclk; MPASS(reg == A_TP_RXT_MIN || reg == A_TP_RXT_MAX || reg == A_TP_PERS_MIN || reg == A_TP_PERS_MAX || reg == A_TP_KEEP_IDLE || reg == A_TP_KEEP_INTVL || reg == A_TP_INIT_SRTT || reg == A_TP_FINWAIT2_TIMER); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { rc = 0; tre = G_TIMERRESOLUTION(t4_read_reg(sc, A_TP_TIMER_RESOLUTION)); tp_tick_us = (cclk_ps << tre) / 1000000; if (reg == A_TP_INIT_SRTT) v = tp_tick_us * G_INITSRTT(t4_read_reg(sc, reg)); else v = tp_tick_us * t4_read_reg(sc, reg); } mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); else return (sysctl_handle_long(oidp, &v, 0, req)); } /* * All fields in TP_SHIFT_CNT are 4b and the starting location of the field is * passed to this function. 
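* The handler below therefore just extracts one nibble: * (t4_read_reg(sc, A_TP_SHIFT_CNT) >> idx) & 0xf. 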
*/ static int sysctl_tp_shift_cnt(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc, idx = arg2; u_int v; MPASS(idx >= 0 && idx <= 24); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { rc = 0; v = (t4_read_reg(sc, A_TP_SHIFT_CNT) >> idx) & 0xf; } mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); else return (sysctl_handle_int(oidp, &v, 0, req)); } static int sysctl_tp_backoff(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc, idx = arg2; u_int shift, v, r; MPASS(idx >= 0 && idx < 16); r = A_TP_TCP_BACKOFF_REG0 + (idx & ~3); shift = (idx & 3) << 3; mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { rc = 0; v = (t4_read_reg(sc, r) >> shift) & M_TIMERBACKOFFINDEX0; } mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); else return (sysctl_handle_int(oidp, &v, 0, req)); } static int sysctl_holdoff_tmr_idx_ofld(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->adapter; int idx, rc, i; struct sge_ofld_rxq *ofld_rxq; uint8_t v; idx = vi->ofld_tmr_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < 0 || idx >= SGE_NTIMERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4otmr"); if (rc) return (rc); v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(vi->ofld_pktc_idx != -1); for_each_ofld_rxq(vi, i, ofld_rxq) { #ifdef atomic_store_rel_8 atomic_store_rel_8(&ofld_rxq->iq.intr_params, v); #else ofld_rxq->iq.intr_params = v; #endif } vi->ofld_tmr_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (0); } static int sysctl_holdoff_pktc_idx_ofld(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->adapter; int idx, rc; idx = vi->ofld_pktc_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < -1 || idx >= SGE_NCOUNTERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4opktc"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->ofld_pktc_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (rc); } #endif static int get_sge_context(struct adapter *sc, struct t4_sge_context *cntxt) { int rc; if (cntxt->cid > M_CTXTQID) return (EINVAL); if (cntxt->mem_id != CTXT_EGRESS && cntxt->mem_id != CTXT_INGRESS && cntxt->mem_id != CTXT_FLM && cntxt->mem_id != CTXT_CNM) return (EINVAL); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ctxt"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } if (sc->flags & FW_OK) { rc = -t4_sge_ctxt_rd(sc, sc->mbox, cntxt->cid, cntxt->mem_id, &cntxt->data[0]); if (rc == 0) goto done; } /* * Read via firmware failed or wasn't even attempted. Read directly via * the backdoor. */ rc = -t4_sge_ctxt_rd_bd(sc, cntxt->cid, cntxt->mem_id, &cntxt->data[0]); done: end_synchronized_op(sc, 0); return (rc); } static int load_fw(struct adapter *sc, struct t4_data *fw) { int rc; uint8_t *fw_data; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldfw"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } /* * The firmware, with the sole exception of the memory parity error * handler, runs from memory and not flash. It is almost always safe to * install a new firmware on a running system. Just set bit 1 in * hw.cxgbe.dflags or dev...dflags first. 
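* (Bit 1 is the DF_LOAD_FW_ANYTIME debug flag tested just below.) 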
*/ if (sc->flags & FULL_INIT_DONE && (sc->debug_flags & DF_LOAD_FW_ANYTIME) == 0) { rc = EBUSY; goto done; } fw_data = malloc(fw->len, M_CXGBE, M_WAITOK); rc = copyin(fw->data, fw_data, fw->len); if (rc == 0) rc = -t4_load_fw(sc, fw_data, fw->len); free(fw_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } static int load_cfg(struct adapter *sc, struct t4_data *cfg) { int rc; uint8_t *cfg_data = NULL; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldcf"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } if (cfg->len == 0) { /* clear */ rc = -t4_load_cfg(sc, NULL, 0); goto done; } cfg_data = malloc(cfg->len, M_CXGBE, M_WAITOK); rc = copyin(cfg->data, cfg_data, cfg->len); if (rc == 0) rc = -t4_load_cfg(sc, cfg_data, cfg->len); free(cfg_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } static int load_boot(struct adapter *sc, struct t4_bootrom *br) { int rc; uint8_t *br_data = NULL; u_int offset; if (br->len > 1024 * 1024) return (EFBIG); if (br->pf_offset == 0) { /* pfidx */ if (br->pfidx_addr > 7) return (EINVAL); offset = G_OFFSET(t4_read_reg(sc, PF_REG(br->pfidx_addr, A_PCIE_PF_EXPROM_OFST))); } else if (br->pf_offset == 1) { /* offset */ offset = G_OFFSET(br->pfidx_addr); } else { return (EINVAL); } rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldbr"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } if (br->len == 0) { /* clear */ rc = -t4_load_boot(sc, NULL, offset, 0); goto done; } br_data = malloc(br->len, M_CXGBE, M_WAITOK); rc = copyin(br->data, br_data, br->len); if (rc == 0) rc = -t4_load_boot(sc, br_data, offset, br->len); free(br_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } static int load_bootcfg(struct adapter *sc, struct t4_data *bc) { int rc; uint8_t *bc_data = NULL; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldcf"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } if (bc->len == 0) { /* clear */ rc = -t4_load_bootcfg(sc, NULL, 0); goto done; } bc_data = malloc(bc->len, M_CXGBE, M_WAITOK); rc = copyin(bc->data, bc_data, bc->len); if (rc == 0) rc = -t4_load_bootcfg(sc, bc_data, bc->len); free(bc_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } static int cudbg_dump(struct adapter *sc, struct t4_cudbg_dump *dump) { int rc; struct cudbg_init *cudbg; void *handle, *buf; /* buf is large, don't block if no memory is available */ buf = malloc(dump->len, M_CXGBE, M_NOWAIT | M_ZERO); if (buf == NULL) return (ENOMEM); handle = cudbg_alloc_handle(); if (handle == NULL) { rc = ENOMEM; goto done; } cudbg = cudbg_get_init(handle); cudbg->adap = sc; cudbg->print = (cudbg_print_cb)printf; #ifndef notyet device_printf(sc->dev, "%s: wr_flash %u, len %u, data %p.\n", __func__, dump->wr_flash, dump->len, dump->data); #endif if (dump->wr_flash) cudbg->use_flash = 1; MPASS(sizeof(cudbg->dbg_bitmap) == sizeof(dump->bitmap)); memcpy(cudbg->dbg_bitmap, dump->bitmap, sizeof(cudbg->dbg_bitmap)); rc = cudbg_collect(handle, buf, &dump->len); if (rc != 0) goto done; rc = copyout(buf, dump->data, dump->len); done: cudbg_free_handle(handle); free(buf, M_CXGBE); return (rc); } static void free_offload_policy(struct t4_offload_policy *op) { struct offload_rule *r; int i; if (op == NULL) return; r = &op->rule[0]; for (i = 0; i < op->nrules; i++, r++) { free(r->bpf_prog.bf_insns, M_CXGBE); } free(op->rule, M_CXGBE); free(op, M_CXGBE); } static int set_offload_policy(struct adapter *sc, struct t4_offload_policy *uop) { int i, rc, len; 
struct t4_offload_policy *op, *old; struct bpf_program *bf; const struct offload_settings *s; struct offload_rule *r; void *u; if (!is_offload(sc)) return (ENODEV); if (uop->nrules == 0) { /* Delete installed policies. */ op = NULL; goto set_policy; } else if (uop->nrules > 256) { /* arbitrary */ return (E2BIG); } /* Copy userspace offload policy to kernel */ op = malloc(sizeof(*op), M_CXGBE, M_ZERO | M_WAITOK); op->nrules = uop->nrules; len = op->nrules * sizeof(struct offload_rule); op->rule = malloc(len, M_CXGBE, M_ZERO | M_WAITOK); rc = copyin(uop->rule, op->rule, len); if (rc) { free(op->rule, M_CXGBE); free(op, M_CXGBE); return (rc); } r = &op->rule[0]; for (i = 0; i < op->nrules; i++, r++) { /* Validate open_type */ if (r->open_type != OPEN_TYPE_LISTEN && r->open_type != OPEN_TYPE_ACTIVE && r->open_type != OPEN_TYPE_PASSIVE && r->open_type != OPEN_TYPE_DONTCARE) { error: /* * Rules 0 to i have malloc'd filters that need to be * freed. Rules i+1 to nrules have userspace pointers * and should be left alone. */ op->nrules = i; free_offload_policy(op); return (rc); } /* Validate settings */ s = &r->settings; if ((s->offload != 0 && s->offload != 1) || s->cong_algo < -1 || s->cong_algo > CONG_ALG_HIGHSPEED || s->sched_class < -1 || s->sched_class >= sc->chip_params->nsched_cls) { rc = EINVAL; goto error; } bf = &r->bpf_prog; u = bf->bf_insns; /* userspace ptr */ bf->bf_insns = NULL; if (bf->bf_len == 0) { /* legal, matches everything */ continue; } len = bf->bf_len * sizeof(*bf->bf_insns); bf->bf_insns = malloc(len, M_CXGBE, M_ZERO | M_WAITOK); rc = copyin(u, bf->bf_insns, len); if (rc != 0) goto error; if (!bpf_validate(bf->bf_insns, bf->bf_len)) { rc = EINVAL; goto error; } } set_policy: rw_wlock(&sc->policy_lock); old = sc->policy; sc->policy = op; rw_wunlock(&sc->policy_lock); free_offload_policy(old); return (0); } #define MAX_READ_BUF_SIZE (128 * 1024) static int read_card_mem(struct adapter *sc, int win, struct t4_mem_range *mr) { uint32_t addr, remaining, n; uint32_t *buf; int rc; uint8_t *dst; mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else rc = validate_mem_range(sc, mr->addr, mr->len); mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); buf = malloc(min(mr->len, MAX_READ_BUF_SIZE), M_CXGBE, M_WAITOK); addr = mr->addr; remaining = mr->len; dst = (void *)mr->data; while (remaining) { n = min(remaining, MAX_READ_BUF_SIZE); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else read_via_memwin(sc, 2, addr, buf, n); mtx_unlock(&sc->reg_lock); if (rc != 0) break; rc = copyout(buf, dst, n); if (rc != 0) break; dst += n; remaining -= n; addr += n; } free(buf, M_CXGBE); return (rc); } #undef MAX_READ_BUF_SIZE static int read_i2c(struct adapter *sc, struct t4_i2c_data *i2cd) { int rc; if (i2cd->len == 0 || i2cd->port_id >= sc->params.nports) return (EINVAL); if (i2cd->len > sizeof(i2cd->data)) return (EFBIG); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4i2crd"); if (rc) return (rc); if (hw_off_limits(sc)) rc = ENXIO; else rc = -t4_i2c_rd(sc, sc->mbox, i2cd->port_id, i2cd->dev_addr, i2cd->offset, i2cd->len, &i2cd->data[0]); end_synchronized_op(sc, 0); return (rc); } static int clear_stats(struct adapter *sc, u_int port_id) { int i, v, chan_map; struct port_info *pi; struct vi_info *vi; struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *wrq; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) struct sge_ofld_txq *ofld_txq; #endif #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif if (port_id >= sc->params.nports) return (EINVAL); pi = 
sc->port[port_id]; if (pi == NULL) return (EIO); mtx_lock(&sc->reg_lock); if (!hw_off_limits(sc)) { /* MAC stats */ t4_clr_port_stats(sc, pi->tx_chan); if (is_t6(sc)) { if (pi->fcs_reg != -1) pi->fcs_base = t4_read_reg64(sc, pi->fcs_reg); else pi->stats.rx_fcs_err = 0; } for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE) t4_clr_vi_stats(sc, vi->vin); } chan_map = pi->rx_e_chan_map; v = 0; /* reuse */ while (chan_map) { i = ffs(chan_map) - 1; t4_write_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v, 1, A_TP_MIB_TNL_CNG_DROP_0 + i); chan_map &= ~(1 << i); } } mtx_unlock(&sc->reg_lock); pi->tx_parse_error = 0; pi->tnl_cong_drops = 0; /* * Since this command accepts a port, clear stats for * all VIs on this port. */ for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE) { for_each_rxq(vi, i, rxq) { #if defined(INET) || defined(INET6) rxq->lro.lro_queued = 0; rxq->lro.lro_flushed = 0; #endif rxq->rxcsum = 0; rxq->vlan_extraction = 0; rxq->vxlan_rxcsum = 0; rxq->fl.cl_allocated = 0; rxq->fl.cl_recycled = 0; rxq->fl.cl_fast_recycled = 0; } for_each_txq(vi, i, txq) { txq->txcsum = 0; txq->tso_wrs = 0; txq->vlan_insertion = 0; txq->imm_wrs = 0; txq->sgl_wrs = 0; txq->txpkt_wrs = 0; txq->txpkts0_wrs = 0; txq->txpkts1_wrs = 0; txq->txpkts0_pkts = 0; txq->txpkts1_pkts = 0; txq->txpkts_flush = 0; txq->raw_wrs = 0; txq->vxlan_tso_wrs = 0; txq->vxlan_txcsum = 0; txq->kern_tls_records = 0; txq->kern_tls_short = 0; txq->kern_tls_partial = 0; txq->kern_tls_full = 0; txq->kern_tls_octets = 0; txq->kern_tls_waste = 0; txq->kern_tls_options = 0; txq->kern_tls_header = 0; txq->kern_tls_fin = 0; txq->kern_tls_fin_short = 0; txq->kern_tls_cbc = 0; txq->kern_tls_gcm = 0; mp_ring_reset_stats(txq->r); } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) for_each_ofld_txq(vi, i, ofld_txq) { ofld_txq->wrq.tx_wrs_direct = 0; ofld_txq->wrq.tx_wrs_copied = 0; counter_u64_zero(ofld_txq->tx_iscsi_pdus); counter_u64_zero(ofld_txq->tx_iscsi_octets); counter_u64_zero(ofld_txq->tx_toe_tls_records); counter_u64_zero(ofld_txq->tx_toe_tls_octets); } #endif #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { ofld_rxq->fl.cl_allocated = 0; ofld_rxq->fl.cl_recycled = 0; ofld_rxq->fl.cl_fast_recycled = 0; counter_u64_zero( ofld_rxq->rx_iscsi_ddp_setup_ok); counter_u64_zero( ofld_rxq->rx_iscsi_ddp_setup_error); ofld_rxq->rx_iscsi_ddp_pdus = 0; ofld_rxq->rx_iscsi_ddp_octets = 0; ofld_rxq->rx_iscsi_fl_pdus = 0; ofld_rxq->rx_iscsi_fl_octets = 0; ofld_rxq->rx_toe_tls_records = 0; ofld_rxq->rx_toe_tls_octets = 0; } #endif if (IS_MAIN_VI(vi)) { wrq = &sc->sge.ctrlq[pi->port_id]; wrq->tx_wrs_direct = 0; wrq->tx_wrs_copied = 0; } } } return (0); } +static int +hold_clip_addr(struct adapter *sc, struct t4_clip_addr *ca) +{ +#ifdef INET6 + struct in6_addr in6; + + bcopy(&ca->addr[0], &in6.s6_addr[0], sizeof(in6.s6_addr)); + if (t4_get_clip_entry(sc, &in6, true) != NULL) + return (0); + else + return (EIO); +#else + return (ENOTSUP); +#endif +} + +static int +release_clip_addr(struct adapter *sc, struct t4_clip_addr *ca) +{ +#ifdef INET6 + struct in6_addr in6; + + bcopy(&ca->addr[0], &in6.s6_addr[0], sizeof(in6.s6_addr)); + return (t4_release_clip_addr(sc, &in6)); +#else + return (ENOTSUP); +#endif +} + int t4_os_find_pci_capability(struct adapter *sc, int cap) { int i; return (pci_find_cap(sc->dev, cap, &i) == 0 ? 
i : 0); } int t4_os_pci_save_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_save(dev, dinfo, 0); return (0); } int t4_os_pci_restore_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_restore(dev, dinfo); return (0); } void t4_os_portmod_changed(struct port_info *pi) { struct adapter *sc = pi->adapter; struct vi_info *vi; struct ifnet *ifp; static const char *mod_str[] = { NULL, "LR", "SR", "ER", "TWINAX", "active TWINAX", "LRM" }; KASSERT((pi->flags & FIXED_IFMEDIA) == 0, ("%s: port_type %u", __func__, pi->port_type)); vi = &pi->vi[0]; if (begin_synchronized_op(sc, vi, HOLD_LOCK, "t4mod") == 0) { PORT_LOCK(pi); build_medialist(pi); if (pi->mod_type != FW_PORT_MOD_TYPE_NONE) { fixup_link_config(pi); apply_link_config(pi); } PORT_UNLOCK(pi); end_synchronized_op(sc, LOCK_HELD); } ifp = vi->ifp; if (pi->mod_type == FW_PORT_MOD_TYPE_NONE) if_printf(ifp, "transceiver unplugged.\n"); else if (pi->mod_type == FW_PORT_MOD_TYPE_UNKNOWN) if_printf(ifp, "unknown transceiver inserted.\n"); else if (pi->mod_type == FW_PORT_MOD_TYPE_NOTSUPPORTED) if_printf(ifp, "unsupported transceiver inserted.\n"); else if (pi->mod_type > 0 && pi->mod_type < nitems(mod_str)) { if_printf(ifp, "%dGbps %s transceiver inserted.\n", port_top_speed(pi), mod_str[pi->mod_type]); } else { if_printf(ifp, "transceiver (type %d) inserted.\n", pi->mod_type); } } void t4_os_link_changed(struct port_info *pi) { struct vi_info *vi; struct ifnet *ifp; struct link_config *lc = &pi->link_cfg; struct adapter *sc = pi->adapter; int v; PORT_LOCK_ASSERT_OWNED(pi); if (is_t6(sc)) { if (lc->link_ok) { if (lc->speed > 25000 || (lc->speed == 25000 && lc->fec == FEC_RS)) { pi->fcs_reg = T5_PORT_REG(pi->tx_chan, A_MAC_PORT_AFRAMECHECKSEQUENCEERRORS); } else { pi->fcs_reg = T5_PORT_REG(pi->tx_chan, A_MAC_PORT_MTIP_1G10G_RX_CRCERRORS); } pi->fcs_base = t4_read_reg64(sc, pi->fcs_reg); pi->stats.rx_fcs_err = 0; } else { pi->fcs_reg = -1; } } else { MPASS(pi->fcs_reg != -1); MPASS(pi->fcs_base == 0); } for_each_vi(pi, v, vi) { ifp = vi->ifp; if (ifp == NULL) continue; if (lc->link_ok) { ifp->if_baudrate = IF_Mbps(lc->speed); if_link_state_change(ifp, LINK_STATE_UP); } else { if_link_state_change(ifp, LINK_STATE_DOWN); } } } void t4_iterate(void (*func)(struct adapter *, void *), void *arg) { struct adapter *sc; sx_slock(&t4_list_lock); SLIST_FOREACH(sc, &t4_list, link) { /* * func should not make any assumptions about what state sc is * in - the only guarantee is that sc->sc_lock is a valid lock. 
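* Note, too, that t4_list_lock is held shared across the callback, so * func must not try to acquire that lock itself. */ 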
*/ func(sc, arg); } sx_sunlock(&t4_list_lock); } static int t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, struct thread *td) { int rc; struct adapter *sc = dev->si_drv1; rc = priv_check(td, PRIV_DRIVER); if (rc != 0) return (rc); switch (cmd) { case CHELSIO_T4_GETREG: { struct t4_reg *edata = (struct t4_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else if (edata->size == 4) edata->val = t4_read_reg(sc, edata->addr); else if (edata->size == 8) edata->val = t4_read_reg64(sc, edata->addr); else rc = EINVAL; mtx_unlock(&sc->reg_lock); break; } case CHELSIO_T4_SETREG: { struct t4_reg *edata = (struct t4_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else if (edata->size == 4) { if (edata->val & 0xffffffff00000000) rc = EINVAL; t4_write_reg(sc, edata->addr, (uint32_t) edata->val); } else if (edata->size == 8) t4_write_reg64(sc, edata->addr, edata->val); else rc = EINVAL; mtx_unlock(&sc->reg_lock); break; } case CHELSIO_T4_REGDUMP: { struct t4_regdump *regs = (struct t4_regdump *)data; int reglen = t4_get_regs_len(sc); uint8_t *buf; if (regs->len < reglen) { regs->len = reglen; /* hint to the caller */ return (ENOBUFS); } regs->len = reglen; buf = malloc(reglen, M_CXGBE, M_WAITOK | M_ZERO); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else get_regs(sc, regs, buf); mtx_unlock(&sc->reg_lock); if (rc == 0) rc = copyout(buf, regs->data, reglen); free(buf, M_CXGBE); break; } case CHELSIO_T4_GET_FILTER_MODE: rc = get_filter_mode(sc, (uint32_t *)data); break; case CHELSIO_T4_SET_FILTER_MODE: rc = set_filter_mode(sc, *(uint32_t *)data); break; case CHELSIO_T4_SET_FILTER_MASK: rc = set_filter_mask(sc, *(uint32_t *)data); break; case CHELSIO_T4_GET_FILTER: rc = get_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_SET_FILTER: rc = set_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_DEL_FILTER: rc = del_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_GET_SGE_CONTEXT: rc = get_sge_context(sc, (struct t4_sge_context *)data); break; case CHELSIO_T4_LOAD_FW: rc = load_fw(sc, (struct t4_data *)data); break; case CHELSIO_T4_GET_MEM: rc = read_card_mem(sc, 2, (struct t4_mem_range *)data); break; case CHELSIO_T4_GET_I2C: rc = read_i2c(sc, (struct t4_i2c_data *)data); break; case CHELSIO_T4_CLEAR_STATS: rc = clear_stats(sc, *(uint32_t *)data); break; case CHELSIO_T4_SCHED_CLASS: rc = t4_set_sched_class(sc, (struct t4_sched_params *)data); break; case CHELSIO_T4_SCHED_QUEUE: rc = t4_set_sched_queue(sc, (struct t4_sched_queue *)data); break; case CHELSIO_T4_GET_TRACER: rc = t4_get_tracer(sc, (struct t4_tracer *)data); break; case CHELSIO_T4_SET_TRACER: rc = t4_set_tracer(sc, (struct t4_tracer *)data); break; case CHELSIO_T4_LOAD_CFG: rc = load_cfg(sc, (struct t4_data *)data); break; case CHELSIO_T4_LOAD_BOOT: rc = load_boot(sc, (struct t4_bootrom *)data); break; case CHELSIO_T4_LOAD_BOOTCFG: rc = load_bootcfg(sc, (struct t4_data *)data); break; case CHELSIO_T4_CUDBG_DUMP: rc = cudbg_dump(sc, (struct t4_cudbg_dump *)data); break; case CHELSIO_T4_SET_OFLD_POLICY: rc = set_offload_policy(sc, (struct t4_offload_policy *)data); break; + case CHELSIO_T4_HOLD_CLIP_ADDR: + rc = hold_clip_addr(sc, (struct t4_clip_addr *)data); + break; + case CHELSIO_T4_RELEASE_CLIP_ADDR: + rc = release_clip_addr(sc, (struct t4_clip_addr *)data); + break; default: 
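/* Unrecognized command: ENOTTY is the conventional errno for an ioctl the device does not implement. */ 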
rc = ENOTTY; } return (rc); } #ifdef TCP_OFFLOAD static int toe_capability(struct vi_info *vi, bool enable) { int rc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; ASSERT_SYNCHRONIZED_OP(sc); if (!is_offload(sc)) return (ENODEV); if (hw_off_limits(sc)) return (ENXIO); if (enable) { #ifdef KERN_TLS if (sc->flags & KERN_TLS_ON) { int i, j, n; struct port_info *p; struct vi_info *v; /* * Reconfigure hardware for TOE if TXTLS is not enabled * on any ifnet. */ n = 0; for_each_port(sc, i) { p = sc->port[i]; for_each_vi(p, j, v) { if (v->ifp->if_capenable & IFCAP_TXTLS) { CH_WARN(sc, "%s has NIC TLS enabled.\n", device_get_nameunit(v->dev)); n++; } } } if (n > 0) { CH_WARN(sc, "Disable NIC TLS on all interfaces " "associated with this adapter before " "trying to enable TOE.\n"); return (EAGAIN); } rc = t4_config_kern_tls(sc, false); if (rc) return (rc); } #endif if ((vi->ifp->if_capenable & IFCAP_TOE) != 0) { /* TOE is already enabled. */ return (0); } /* * We need the port's queues around so that we're able to send * and receive CPLs to/from the TOE even if the ifnet for this * port has never been UP'd administratively. */ if (!(vi->flags & VI_INIT_DONE) && ((rc = vi_init(vi)) != 0)) return (rc); if (!(pi->vi[0].flags & VI_INIT_DONE) && ((rc = vi_init(&pi->vi[0])) != 0)) return (rc); if (isset(&sc->offload_map, pi->port_id)) { /* TOE is enabled on another VI of this port. */ pi->uld_vis++; return (0); } if (!uld_active(sc, ULD_TOM)) { rc = t4_activate_uld(sc, ULD_TOM); if (rc == EAGAIN) { log(LOG_WARNING, "You must kldload t4_tom.ko before trying " "to enable TOE on a cxgbe interface.\n"); } if (rc != 0) return (rc); KASSERT(sc->tom_softc != NULL, ("%s: TOM activated but softc NULL", __func__)); KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM activated but flag not set", __func__)); } /* Activate iWARP and iSCSI too, if the modules are loaded. */ if (!uld_active(sc, ULD_IWARP)) (void) t4_activate_uld(sc, ULD_IWARP); if (!uld_active(sc, ULD_ISCSI)) (void) t4_activate_uld(sc, ULD_ISCSI); pi->uld_vis++; setbit(&sc->offload_map, pi->port_id); } else { pi->uld_vis--; if (!isset(&sc->offload_map, pi->port_id) || pi->uld_vis > 0) return (0); KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM never initialized?", __func__)); clrbit(&sc->offload_map, pi->port_id); } return (0); } /* * Add an upper layer driver to the global list. */ int t4_register_uld(struct uld_info *ui) { int rc = 0; struct uld_info *u; sx_xlock(&t4_uld_list_lock); SLIST_FOREACH(u, &t4_uld_list, link) { if (u->uld_id == ui->uld_id) { rc = EEXIST; goto done; } } SLIST_INSERT_HEAD(&t4_uld_list, ui, link); ui->refcount = 0; done: sx_xunlock(&t4_uld_list_lock); return (rc); } int t4_unregister_uld(struct uld_info *ui) { int rc = EINVAL; struct uld_info *u; sx_xlock(&t4_uld_list_lock); SLIST_FOREACH(u, &t4_uld_list, link) { if (u == ui) { if (ui->refcount > 0) { rc = EBUSY; goto done; } SLIST_REMOVE(&t4_uld_list, ui, uld_info, link); rc = 0; goto done; } } done: sx_xunlock(&t4_uld_list_lock); return (rc); } int t4_activate_uld(struct adapter *sc, int id) { int rc; struct uld_info *ui; ASSERT_SYNCHRONIZED_OP(sc); if (id < 0 || id > ULD_MAX) return (EINVAL); rc = EAGAIN; /* kldload the module with this ULD and try again. 
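(For ULD_TOM that module is t4_tom.ko, as the toe_capability() warning above spells out.) 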
*/ sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { if (!(sc->flags & FULL_INIT_DONE)) { rc = adapter_init(sc); if (rc != 0) break; } rc = ui->activate(sc); if (rc == 0) { setbit(&sc->active_ulds, id); ui->refcount++; } break; } } sx_sunlock(&t4_uld_list_lock); return (rc); } int t4_deactivate_uld(struct adapter *sc, int id) { int rc; struct uld_info *ui; ASSERT_SYNCHRONIZED_OP(sc); if (id < 0 || id > ULD_MAX) return (EINVAL); rc = ENXIO; sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { rc = ui->deactivate(sc); if (rc == 0) { clrbit(&sc->active_ulds, id); ui->refcount--; } break; } } sx_sunlock(&t4_uld_list_lock); return (rc); } static void t4_async_event(void *arg, int n) { struct uld_info *ui; struct adapter *sc = (struct adapter *)arg; if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4async") != 0) return; sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == ULD_IWARP) { ui->async_event(sc); break; } } sx_sunlock(&t4_uld_list_lock); end_synchronized_op(sc, 0); } int uld_active(struct adapter *sc, int uld_id) { MPASS(uld_id >= 0 && uld_id <= ULD_MAX); return (isset(&sc->active_ulds, uld_id)); } #endif #ifdef KERN_TLS static int ktls_capability(struct adapter *sc, bool enable) { ASSERT_SYNCHRONIZED_OP(sc); if (!is_ktls(sc)) return (ENODEV); if (hw_off_limits(sc)) return (ENXIO); if (enable) { if (sc->flags & KERN_TLS_ON) return (0); /* already on */ if (sc->offload_map != 0) { CH_WARN(sc, "Disable TOE on all interfaces associated with " "this adapter before trying to enable NIC TLS.\n"); return (EAGAIN); } return (t4_config_kern_tls(sc, true)); } else { /* * Nothing to do for disable. If TOE is enabled sometime later * then toe_capability will reconfigure the hardware. */ return (0); } } #endif /* * t = ptr to tunable. * nc = number of CPUs. * c = compiled in default for that tunable. */ static void calculate_nqueues(int *t, int nc, const int c) { int nq; if (*t > 0) return; nq = *t < 0 ? -*t : c; *t = min(nc, nq); } /* * Come up with reasonable defaults for some of the tunables, provided they're * not set by the user (in which case we'll use the values as is). 
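* calculate_nqueues() above encodes the convention: a positive tunable is * used as-is, a negative value -N means min(ncpus, N), and 0 selects * min(ncpus, compiled-in default). */ 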
*/ static void tweak_tunables(void) { int nc = mp_ncpus; /* our snapshot of the number of CPUs */ if (t4_ntxq < 1) { #ifdef RSS t4_ntxq = rss_getnumbuckets(); #else calculate_nqueues(&t4_ntxq, nc, NTXQ); #endif } calculate_nqueues(&t4_ntxq_vi, nc, NTXQ_VI); if (t4_nrxq < 1) { #ifdef RSS t4_nrxq = rss_getnumbuckets(); #else calculate_nqueues(&t4_nrxq, nc, NRXQ); #endif } calculate_nqueues(&t4_nrxq_vi, nc, NRXQ_VI); #if defined(TCP_OFFLOAD) || defined(RATELIMIT) calculate_nqueues(&t4_nofldtxq, nc, NOFLDTXQ); calculate_nqueues(&t4_nofldtxq_vi, nc, NOFLDTXQ_VI); #endif #ifdef TCP_OFFLOAD calculate_nqueues(&t4_nofldrxq, nc, NOFLDRXQ); calculate_nqueues(&t4_nofldrxq_vi, nc, NOFLDRXQ_VI); #endif #if defined(TCP_OFFLOAD) || defined(KERN_TLS) if (t4_toecaps_allowed == -1) t4_toecaps_allowed = FW_CAPS_CONFIG_TOE; #else if (t4_toecaps_allowed == -1) t4_toecaps_allowed = 0; #endif #ifdef TCP_OFFLOAD if (t4_rdmacaps_allowed == -1) { t4_rdmacaps_allowed = FW_CAPS_CONFIG_RDMA_RDDP | FW_CAPS_CONFIG_RDMA_RDMAC; } if (t4_iscsicaps_allowed == -1) { t4_iscsicaps_allowed = FW_CAPS_CONFIG_ISCSI_INITIATOR_PDU | FW_CAPS_CONFIG_ISCSI_TARGET_PDU | FW_CAPS_CONFIG_ISCSI_T10DIF; } if (t4_tmr_idx_ofld < 0 || t4_tmr_idx_ofld >= SGE_NTIMERS) t4_tmr_idx_ofld = TMR_IDX_OFLD; if (t4_pktc_idx_ofld < -1 || t4_pktc_idx_ofld >= SGE_NCOUNTERS) t4_pktc_idx_ofld = PKTC_IDX_OFLD; if (t4_toe_tls_rx_timeout < 0) t4_toe_tls_rx_timeout = 0; #else if (t4_rdmacaps_allowed == -1) t4_rdmacaps_allowed = 0; if (t4_iscsicaps_allowed == -1) t4_iscsicaps_allowed = 0; #endif #ifdef DEV_NETMAP calculate_nqueues(&t4_nnmtxq, nc, NNMTXQ); calculate_nqueues(&t4_nnmrxq, nc, NNMRXQ); calculate_nqueues(&t4_nnmtxq_vi, nc, NNMTXQ_VI); calculate_nqueues(&t4_nnmrxq_vi, nc, NNMRXQ_VI); #endif if (t4_tmr_idx < 0 || t4_tmr_idx >= SGE_NTIMERS) t4_tmr_idx = TMR_IDX; if (t4_pktc_idx < -1 || t4_pktc_idx >= SGE_NCOUNTERS) t4_pktc_idx = PKTC_IDX; if (t4_qsize_txq < 128) t4_qsize_txq = 128; if (t4_qsize_rxq < 128) t4_qsize_rxq = 128; while (t4_qsize_rxq & 7) t4_qsize_rxq++; t4_intr_types &= INTR_MSIX | INTR_MSI | INTR_INTX; /* * Number of VIs to create per-port. The first VI is the "main" regular * VI for the port. The rest are additional virtual interfaces on the * same physical port. Note that the main VI does not have native * netmap support but the extra VIs do. * * Limit the number of VIs per port to the number of available * MAC addresses per port. 
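* (That limit is nitems(vi_mac_funcs), enforced just below.) */ 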
*/ if (t4_num_vis < 1) t4_num_vis = 1; if (t4_num_vis > nitems(vi_mac_funcs)) { t4_num_vis = nitems(vi_mac_funcs); printf("cxgbe: number of VIs limited to %d\n", t4_num_vis); } if (pcie_relaxed_ordering < 0 || pcie_relaxed_ordering > 2) { pcie_relaxed_ordering = 1; #if defined(__i386__) || defined(__amd64__) if (cpu_vendor_id == CPU_VENDOR_INTEL) pcie_relaxed_ordering = 0; #endif } } #ifdef DDB static void t4_dump_tcb(struct adapter *sc, int tid) { uint32_t base, i, j, off, pf, reg, save, tcb_addr, win_pos; reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2); save = t4_read_reg(sc, reg); base = sc->memwin[2].mw_base; /* Dump TCB for the tid */ tcb_addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE); tcb_addr += tid * TCB_SIZE; if (is_t4(sc)) { pf = 0; win_pos = tcb_addr & ~0xf; /* start must be 16B aligned */ } else { pf = V_PFNUM(sc->pf); win_pos = tcb_addr & ~0x7f; /* start must be 128B aligned */ } t4_write_reg(sc, reg, win_pos | pf); t4_read_reg(sc, reg); off = tcb_addr - win_pos; for (i = 0; i < 4; i++) { uint32_t buf[8]; for (j = 0; j < 8; j++, off += 4) buf[j] = htonl(t4_read_reg(sc, base + off)); db_printf("%08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); } t4_write_reg(sc, reg, save); t4_read_reg(sc, reg); } static void t4_dump_devlog(struct adapter *sc) { struct devlog_params *dparams = &sc->params.devlog; struct fw_devlog_e e; int i, first, j, m, nentries, rc; uint64_t ftstamp = UINT64_MAX; if (dparams->start == 0) { db_printf("devlog params not valid\n"); return; } nentries = dparams->size / sizeof(struct fw_devlog_e); m = fwmtype_to_hwmtype(dparams->memtype); /* Find the first entry. */ first = -1; for (i = 0; i < nentries && !db_pager_quit; i++) { rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e), sizeof(e), (void *)&e); if (rc != 0) break; if (e.timestamp == 0) break; e.timestamp = be64toh(e.timestamp); if (e.timestamp < ftstamp) { ftstamp = e.timestamp; first = i; } } if (first == -1) return; i = first; do { rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e), sizeof(e), (void *)&e); if (rc != 0) return; if (e.timestamp == 0) return; e.timestamp = be64toh(e.timestamp); e.seqno = be32toh(e.seqno); for (j = 0; j < 8; j++) e.params[j] = be32toh(e.params[j]); db_printf("%10d %15ju %8s %8s ", e.seqno, e.timestamp, (e.level < nitems(devlog_level_strings) ? devlog_level_strings[e.level] : "UNKNOWN"), (e.facility < nitems(devlog_facility_strings) ? 
devlog_facility_strings[e.facility] : "UNKNOWN")); db_printf(e.fmt, e.params[0], e.params[1], e.params[2], e.params[3], e.params[4], e.params[5], e.params[6], e.params[7]); if (++i == nentries) i = 0; } while (i != first && !db_pager_quit); } static struct command_table db_t4_table = LIST_HEAD_INITIALIZER(db_t4_table); _DB_SET(_show, t4, NULL, db_show_table, 0, &db_t4_table); DB_FUNC(devlog, db_show_devlog, db_t4_table, CS_OWN, NULL) { device_t dev; int t; bool valid; valid = false; t = db_read_token(); if (t == tIDENT) { dev = device_lookup_by_name(db_tok_string); valid = true; } db_skip_to_eol(); if (!valid) { db_printf("usage: show t4 devlog <nexus>\n"); return; } if (dev == NULL) { db_printf("device not found\n"); return; } t4_dump_devlog(device_get_softc(dev)); } DB_FUNC(tcb, db_show_t4tcb, db_t4_table, CS_OWN, NULL) { device_t dev; int radix, tid, t; bool valid; valid = false; radix = db_radix; db_radix = 10; t = db_read_token(); if (t == tIDENT) { dev = device_lookup_by_name(db_tok_string); t = db_read_token(); if (t == tNUMBER) { tid = db_tok_number; valid = true; } } db_radix = radix; db_skip_to_eol(); if (!valid) { db_printf("usage: show t4 tcb <nexus> <tid>\n"); return; } if (dev == NULL) { db_printf("device not found\n"); return; } if (tid < 0) { db_printf("invalid tid\n"); return; } t4_dump_tcb(device_get_softc(dev), tid); } #endif static eventhandler_tag vxlan_start_evtag; static eventhandler_tag vxlan_stop_evtag; struct vxlan_evargs { struct ifnet *ifp; uint16_t port; }; static void enable_vxlan_rx(struct adapter *sc) { int i, rc; struct port_info *pi; uint8_t match_all_mac[ETHER_ADDR_LEN] = {0}; ASSERT_SYNCHRONIZED_OP(sc); t4_write_reg(sc, A_MPS_RX_VXLAN_TYPE, V_VXLAN(sc->vxlan_port) | F_VXLAN_EN); for_each_port(sc, i) { pi = sc->port[i]; if (pi->vxlan_tcam_entry == true) continue; rc = t4_alloc_raw_mac_filt(sc, pi->vi[0].viid, match_all_mac, match_all_mac, sc->rawf_base + pi->port_id, 1, pi->port_id, true); if (rc < 0) { rc = -rc; CH_ERR(&pi->vi[0], "failed to add VXLAN TCAM entry: %d.\n", rc); } else { MPASS(rc == sc->rawf_base + pi->port_id); pi->vxlan_tcam_entry = true; } } } static void t4_vxlan_start(struct adapter *sc, void *arg) { struct vxlan_evargs *v = arg; if (sc->nrawf == 0 || chip_id(sc) <= CHELSIO_T5) return; if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxst") != 0) return; if (sc->vxlan_refcount == 0) { sc->vxlan_port = v->port; sc->vxlan_refcount = 1; if (!hw_off_limits(sc)) enable_vxlan_rx(sc); } else if (sc->vxlan_port == v->port) { sc->vxlan_refcount++; } else { CH_ERR(sc, "VXLAN already configured on port %d; " "ignoring attempt to configure it on port %d\n", sc->vxlan_port, v->port); } end_synchronized_op(sc, 0); } static void t4_vxlan_stop(struct adapter *sc, void *arg) { struct vxlan_evargs *v = arg; if (sc->nrawf == 0 || chip_id(sc) <= CHELSIO_T5) return; if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxsp") != 0) return; /* * VXLANs may have been configured before the driver was loaded so we * may see more stops than starts. This is not handled cleanly but at * least we keep the refcount sane. 
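* A stop for some other port is ignored outright, and a second stop for * the active port just draws a warning; only the stop that takes the * refcount to zero clears F_VXLAN_EN. */ 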
if (sc->vxlan_port != v->port) goto done; if (sc->vxlan_refcount == 0) { CH_ERR(sc, "VXLAN operation on port %d was stopped earlier; " "ignoring attempt to stop it again.\n", sc->vxlan_port); } else if (--sc->vxlan_refcount == 0 && !hw_off_limits(sc)) t4_set_reg_field(sc, A_MPS_RX_VXLAN_TYPE, F_VXLAN_EN, 0); done: end_synchronized_op(sc, 0); } static void t4_vxlan_start_handler(void *arg __unused, struct ifnet *ifp, sa_family_t family, u_int port) { struct vxlan_evargs v; MPASS(family == AF_INET || family == AF_INET6); v.ifp = ifp; v.port = port; t4_iterate(t4_vxlan_start, &v); } static void t4_vxlan_stop_handler(void *arg __unused, struct ifnet *ifp, sa_family_t family, u_int port) { struct vxlan_evargs v; MPASS(family == AF_INET || family == AF_INET6); v.ifp = ifp; v.port = port; t4_iterate(t4_vxlan_stop, &v); } static struct sx mlu; /* mod load unload */ SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload"); static int mod_event(module_t mod, int cmd, void *arg) { int rc = 0; static int loaded = 0; switch (cmd) { case MOD_LOAD: sx_xlock(&mlu); if (loaded++ == 0) { t4_sge_modload(); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, t4_filter_rpl, CPL_COOKIE_FILTER); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl, CPL_COOKIE_FILTER); t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, t4_hashfilter_ao_rpl, CPL_COOKIE_HASHFILTER); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, t4_hashfilter_tcb_rpl, CPL_COOKIE_HASHFILTER); t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, t4_del_hashfilter_rpl, CPL_COOKIE_HASHFILTER); t4_register_cpl_handler(CPL_TRACE_PKT, t4_trace_pkt); t4_register_cpl_handler(CPL_T5_TRACE_PKT, t5_trace_pkt); t4_register_cpl_handler(CPL_SMT_WRITE_RPL, do_smt_write_rpl); sx_init(&t4_list_lock, "T4/T5 adapters"); SLIST_INIT(&t4_list); callout_init(&fatal_callout, 1); #ifdef TCP_OFFLOAD sx_init(&t4_uld_list_lock, "T4/T5 ULDs"); SLIST_INIT(&t4_uld_list); #endif #ifdef INET6 t4_clip_modload(); #endif #ifdef KERN_TLS t6_ktls_modload(); #endif t4_tracer_modload(); tweak_tunables(); vxlan_start_evtag = EVENTHANDLER_REGISTER(vxlan_start, t4_vxlan_start_handler, NULL, EVENTHANDLER_PRI_ANY); vxlan_stop_evtag = EVENTHANDLER_REGISTER(vxlan_stop, t4_vxlan_stop_handler, NULL, EVENTHANDLER_PRI_ANY); reset_tq = taskqueue_create("t4_rst_tq", M_WAITOK, taskqueue_thread_enqueue, &reset_tq); taskqueue_start_threads(&reset_tq, 1, PI_SOFT, "t4_rst_thr"); } sx_xunlock(&mlu); break; case MOD_UNLOAD: sx_xlock(&mlu); if (--loaded == 0) { int tries; taskqueue_free(reset_tq); sx_slock(&t4_list_lock); if (!SLIST_EMPTY(&t4_list)) { rc = EBUSY; sx_sunlock(&t4_list_lock); goto done_unload; } #ifdef TCP_OFFLOAD sx_slock(&t4_uld_list_lock); if (!SLIST_EMPTY(&t4_uld_list)) { rc = EBUSY; sx_sunlock(&t4_uld_list_lock); sx_sunlock(&t4_list_lock); goto done_unload; } #endif tries = 0; while (tries++ < 5 && t4_sge_extfree_refs() != 0) { uprintf("%ju clusters with custom free routine " "still in use.\n", t4_sge_extfree_refs()); pause("t4unload", 2 * hz); } #ifdef TCP_OFFLOAD sx_sunlock(&t4_uld_list_lock); #endif sx_sunlock(&t4_list_lock); if (t4_sge_extfree_refs() == 0) { EVENTHANDLER_DEREGISTER(vxlan_start, vxlan_start_evtag); EVENTHANDLER_DEREGISTER(vxlan_stop, vxlan_stop_evtag); t4_tracer_modunload(); #ifdef KERN_TLS t6_ktls_modunload(); #endif #ifdef INET6 t4_clip_modunload(); #endif #ifdef TCP_OFFLOAD sx_destroy(&t4_uld_list_lock); #endif sx_destroy(&t4_list_lock); t4_sge_modunload(); loaded = 0; } else { rc = EBUSY; loaded++; /* undo earlier decrement */ } } done_unload: sx_xunlock(&mlu); 
break; } return (rc); } static devclass_t t4_devclass, t5_devclass, t6_devclass; static devclass_t cxgbe_devclass, cxl_devclass, cc_devclass; static devclass_t vcxgbe_devclass, vcxl_devclass, vcc_devclass; DRIVER_MODULE(t4nex, pci, t4_driver, t4_devclass, mod_event, 0); MODULE_VERSION(t4nex, 1); MODULE_DEPEND(t4nex, firmware, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(t4nex, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ DRIVER_MODULE(t5nex, pci, t5_driver, t5_devclass, mod_event, 0); MODULE_VERSION(t5nex, 1); MODULE_DEPEND(t5nex, firmware, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(t5nex, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ DRIVER_MODULE(t6nex, pci, t6_driver, t6_devclass, mod_event, 0); MODULE_VERSION(t6nex, 1); MODULE_DEPEND(t6nex, firmware, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(t6nex, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ DRIVER_MODULE(cxgbe, t4nex, cxgbe_driver, cxgbe_devclass, 0, 0); MODULE_VERSION(cxgbe, 1); DRIVER_MODULE(cxl, t5nex, cxl_driver, cxl_devclass, 0, 0); MODULE_VERSION(cxl, 1); DRIVER_MODULE(cc, t6nex, cc_driver, cc_devclass, 0, 0); MODULE_VERSION(cc, 1); DRIVER_MODULE(vcxgbe, cxgbe, vcxgbe_driver, vcxgbe_devclass, 0, 0); MODULE_VERSION(vcxgbe, 1); DRIVER_MODULE(vcxl, cxl, vcxl_driver, vcxl_devclass, 0, 0); MODULE_VERSION(vcxl, 1); DRIVER_MODULE(vcc, cc, vcc_driver, vcc_devclass, 0, 0); MODULE_VERSION(vcc, 1); diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c index c71b9694bd3b..f4aa84d6514f 100644 --- a/sys/dev/cxgbe/tom/t4_connect.c +++ b/sys/dev/cxgbe/tom/t4_connect.c @@ -1,403 +1,403 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" /* * Active open succeeded. 
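* The hardware has handed back a real tid: free the atid, install the * toepcb under the new tid, and complete the handshake via * make_established(). 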
*/ static int do_act_establish(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_act_establish *cpl = (const void *)(rss + 1); u_int tid = GET_TID(cpl); u_int atid = G_TID_TID(ntohl(cpl->tos_atid)); struct toepcb *toep = lookup_atid(sc, atid); struct inpcb *inp = toep->inp; KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: atid %u, tid %u", __func__, atid, tid); free_atid(sc, atid); CURVNET_SET(toep->vnet); INP_WLOCK(inp); toep->tid = tid; insert_tid(sc, tid, toep, inp->inp_vflag & INP_IPV6 ? 2 : 1); if (inp->inp_flags & INP_DROPPED) { /* socket closed by the kernel before hw told us it connected */ send_flowc_wr(toep, NULL); send_reset(sc, toep, be32toh(cpl->snd_isn)); goto done; } make_established(toep, be32toh(cpl->snd_isn) - 1, be32toh(cpl->rcv_isn) - 1, cpl->tcp_opt); inp->inp_flowtype = M_HASHTYPE_OPAQUE; inp->inp_flowid = tid; done: INP_WUNLOCK(inp); CURVNET_RESTORE(); return (0); } void act_open_failure_cleanup(struct adapter *sc, u_int atid, u_int status) { struct toepcb *toep = lookup_atid(sc, atid); struct inpcb *inp = toep->inp; struct toedev *tod = &toep->td->tod; struct epoch_tracker et; free_atid(sc, atid); toep->tid = -1; CURVNET_SET(toep->vnet); if (status != EAGAIN) NET_EPOCH_ENTER(et); INP_WLOCK(inp); toe_connect_failed(tod, inp, status); final_cpl_received(toep); /* unlocks inp */ if (status != EAGAIN) NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } /* * Active open failed. */ static int do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); u_int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status))); u_int status = G_AOPEN_STATUS(be32toh(cpl->atid_status)); struct toepcb *toep = lookup_atid(sc, atid); int rc; KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: atid %u, status %u ", __func__, atid, status); /* Ignore negative advice */ if (negative_advice(status)) return (0); if (status && act_open_has_tid(status)) release_tid(sc, GET_TID(cpl), toep->ctrlq); rc = act_open_rpl_status_to_errno(status); act_open_failure_cleanup(sc, atid, rc); return (0); } void t4_init_connect_cpl_handlers(void) { t4_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl, CPL_COOKIE_TOM); } void t4_uninit_connect_cpl_handlers(void) { t4_register_cpl_handler(CPL_ACT_ESTABLISH, NULL); t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, NULL, CPL_COOKIE_TOM); } #define DONT_OFFLOAD_ACTIVE_OPEN(x) do { \ reason = __LINE__; \ rc = (x); \ goto failed; \ } while (0) static inline int act_open_cpl_size(struct adapter *sc, int isipv6) { int idx; static const int sz_table[3][2] = { { sizeof (struct cpl_act_open_req), sizeof (struct cpl_act_open_req6) }, { sizeof (struct cpl_t5_act_open_req), sizeof (struct cpl_t5_act_open_req6) }, { sizeof (struct cpl_t6_act_open_req), sizeof (struct cpl_t6_act_open_req6) }, }; MPASS(chip_id(sc) >= CHELSIO_T4); idx = min(chip_id(sc) - CHELSIO_T4, 2); return (sz_table[idx][!!isipv6]); } /* * active open (soconnect). 
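* (The request CPL is sized per chip generation; in act_open_cpl_size()
* above, idx = min(chip_id - CHELSIO_T4, 2) selects the sz_table row and
* !!isipv6 the column, so e.g. a T6 IPv6 open allocates a
* struct cpl_t6_act_open_req6.)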
* * State of affairs on entry: * soisconnecting (so_state |= SS_ISCONNECTING) * tcbinfo not locked (This has changed - used to be WLOCKed) * inp WLOCKed * tp->t_state = TCPS_SYN_SENT * rtalloc1, RT_UNLOCK on rt. */ int t4_connect(struct toedev *tod, struct socket *so, struct nhop_object *nh, struct sockaddr *nam) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = NULL; struct wrqe *wr = NULL; struct ifnet *rt_ifp = nh->nh_ifp; struct vi_info *vi; int qid_atid, rc, isipv6; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); int reason; struct offload_settings settings; struct epoch_tracker et; uint16_t vid = 0xfff, pcp = 0; INP_WLOCK_ASSERT(inp); KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6, ("%s: dest addr %p has family %u", __func__, nam, nam->sa_family)); if (rt_ifp->if_type == IFT_ETHER) vi = rt_ifp->if_softc; else if (rt_ifp->if_type == IFT_L2VLAN) { struct ifnet *ifp = VLAN_TRUNKDEV(rt_ifp); vi = ifp->if_softc; VLAN_TAG(rt_ifp, &vid); VLAN_PCP(rt_ifp, &pcp); } else if (rt_ifp->if_type == IFT_IEEE8023ADLAG) DONT_OFFLOAD_ACTIVE_OPEN(ENOSYS); /* XXX: implement lagg+TOE */ else DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); if (sc->flags & KERN_TLS_ON) DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_ACTIVE, NULL, EVL_MAKETAG(vid, pcp, 0), inp); rw_runlock(&sc->policy_lock); if (!settings.offload) DONT_OFFLOAD_ACTIVE_OPEN(EPERM); toep = alloc_toepcb(vi, M_NOWAIT); if (toep == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->tid = alloc_atid(sc, toep); if (toep->tid < 0) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->l2te = t4_l2t_get(vi->pi, rt_ifp, nh->nh_flags & NHF_GATEWAY ? &nh->gw_sa : nam); if (toep->l2te == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->vnet = so->so_vnet; init_conn_params(vi, &settings, &inp->inp_inc, so, NULL, toep->l2te->idx, &toep->params); init_toepcb(vi, toep); isipv6 = nam->sa_family == AF_INET6; wr = alloc_wrqe(act_open_cpl_size(sc, isipv6), toep->ctrlq); if (wr == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); qid_atid = V_TID_QID(toep->ofld_rxq->iq.abs_id) | V_TID_TID(toep->tid) | V_TID_COOKIE(CPL_COOKIE_TOM); if (isipv6) { struct cpl_act_open_req6 *cpl = wrtod(wr); struct cpl_t5_act_open_req6 *cpl5 = (void *)cpl; struct cpl_t6_act_open_req6 *cpl6 = (void *)cpl; if ((inp->inp_vflag & INP_IPV6) == 0) DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); - toep->ce = t4_hold_lip(sc, &inp->in6p_laddr, NULL); + toep->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true); if (toep->ce == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOENT); switch (chip_id(sc)) { case CHELSIO_T4: INIT_TP_WR(cpl, 0); cpl->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T5: INIT_TP_WR(cpl5, 0); cpl5->iss = htobe32(tp->iss); cpl5->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T6: default: INIT_TP_WR(cpl6, 0); cpl6->iss = htobe32(tp->iss); cpl6->params = select_ntuple(vi, toep->l2te); break; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, qid_atid)); cpl->local_port = inp->inp_lport; cpl->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; cpl->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; cpl->peer_port = inp->inp_fport; cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0]; cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8]; cpl->opt0 = calc_options0(vi, &toep->params); cpl->opt2 = calc_options2(vi, &toep->params); CTR6(KTR_CXGBE, "%s: atid %u, toep %p, inp %p, opt0 %#016lx, opt2 %#08x", __func__, toep->tid, toep, inp, be64toh(cpl->opt0), be32toh(cpl->opt2)); } else { 
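/*
 * IPv4 flavor of the same request.  Note that qid_atid, used by both
 * flavors, packs three fields into one word: the rx queue that should
 * receive the reply, the atid, and a cookie that steers the shared
 * CPL_ACT_OPEN_RPL to TOM.  A sketch of the idea with made-up field
 * positions (the real layout comes from the V_TID_* macros in the hw
 * headers):
 *
 *	word = (cookie << 24) | (qid << 14) | atid;
 */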
struct cpl_act_open_req *cpl = wrtod(wr); struct cpl_t5_act_open_req *cpl5 = (void *)cpl; struct cpl_t6_act_open_req *cpl6 = (void *)cpl; switch (chip_id(sc)) { case CHELSIO_T4: INIT_TP_WR(cpl, 0); cpl->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T5: INIT_TP_WR(cpl5, 0); cpl5->iss = htobe32(tp->iss); cpl5->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T6: default: INIT_TP_WR(cpl6, 0); cpl6->iss = htobe32(tp->iss); cpl6->params = select_ntuple(vi, toep->l2te); break; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_atid)); inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, &cpl->peer_port); cpl->opt0 = calc_options0(vi, &toep->params); cpl->opt2 = calc_options2(vi, &toep->params); CTR6(KTR_CXGBE, "%s: atid %u, toep %p, inp %p, opt0 %#016lx, opt2 %#08x", __func__, toep->tid, toep, inp, be64toh(cpl->opt0), be32toh(cpl->opt2)); } offload_socket(so, toep); NET_EPOCH_ENTER(et); rc = t4_l2t_send(sc, wr, toep->l2te); NET_EPOCH_EXIT(et); if (rc == 0) { toep->flags |= TPF_CPL_PENDING; return (0); } undo_offload_socket(so); reason = __LINE__; failed: CTR3(KTR_CXGBE, "%s: not offloading (%d), rc %d", __func__, reason, rc); if (wr) free_wrqe(wr); if (toep) { if (toep->tid >= 0) free_atid(sc, toep->tid); if (toep->l2te) t4_l2t_release(toep->l2te); if (toep->ce) - t4_release_lip(sc, toep->ce); + t4_release_clip_entry(sc, toep->ce); free_toepcb(toep); } return (rc); } #endif diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c index 0245acfe005b..8623079fe429 100644 --- a/sys/dev/cxgbe/tom/t4_listen.c +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -1,1591 +1,1601 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" /* stid services */ static int alloc_stid(struct adapter *, struct listen_ctx *, int); static struct listen_ctx *lookup_stid(struct adapter *, int); static void free_stid(struct adapter *, struct listen_ctx *); /* lctx services */ static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, struct vi_info *); static int free_lctx(struct adapter *, struct listen_ctx *); static void hold_lctx(struct listen_ctx *); static void listen_hash_add(struct adapter *, struct listen_ctx *); static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int); static int alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6) { struct tid_info *t = &sc->tids; u_int stid, n, f, mask; struct stid_region *sr = &lctx->stid_region; /* * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in * the TCAM. The start of the stid region is properly aligned (the chip * requires each region to be 128-cell aligned). */ n = isipv6 ? 2 : 1; mask = n - 1; KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0, ("%s: stid region (%u, %u) not properly aligned. n = %u", __func__, t->stid_base, t->nstids, n)); mtx_lock(&t->stid_lock); if (n > t->nstids - t->stids_in_use) { mtx_unlock(&t->stid_lock); return (-1); } if (t->nstids_free_head >= n) { /* * This allocation will definitely succeed because the region * starts at a good alignment and we just checked we have enough * stids free. */ f = t->nstids_free_head & mask; t->nstids_free_head -= n + f; stid = t->nstids_free_head; TAILQ_INSERT_HEAD(&t->stids, sr, link); } else { struct stid_region *s; stid = t->nstids_free_head; TAILQ_FOREACH(s, &t->stids, link) { stid += s->used + s->free; f = stid & mask; if (s->free >= n + f) { stid -= n + f; s->free -= n + f; TAILQ_INSERT_AFTER(&t->stids, s, sr, link); goto allocated; } } if (__predict_false(stid != t->nstids)) { panic("%s: stids TAILQ (%p) corrupt." 
" At %d instead of %d at the end of the queue.", __func__, &t->stids, stid, t->nstids); } mtx_unlock(&t->stid_lock); return (-1); } allocated: sr->used = n; sr->free = f; t->stids_in_use += n; t->stid_tab[stid] = lctx; mtx_unlock(&t->stid_lock); KASSERT(((stid + t->stid_base) & mask) == 0, ("%s: EDOOFUS.", __func__)); return (stid + t->stid_base); } static struct listen_ctx * lookup_stid(struct adapter *sc, int stid) { struct tid_info *t = &sc->tids; return (t->stid_tab[stid - t->stid_base]); } static void free_stid(struct adapter *sc, struct listen_ctx *lctx) { struct tid_info *t = &sc->tids; struct stid_region *sr = &lctx->stid_region; struct stid_region *s; KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used)); mtx_lock(&t->stid_lock); s = TAILQ_PREV(sr, stid_head, link); if (s != NULL) s->free += sr->used + sr->free; else t->nstids_free_head += sr->used + sr->free; KASSERT(t->stids_in_use >= sr->used, ("%s: stids_in_use (%u) < stids being freed (%u)", __func__, t->stids_in_use, sr->used)); t->stids_in_use -= sr->used; TAILQ_REMOVE(&t->stids, sr, link); mtx_unlock(&t->stid_lock); } static struct listen_ctx * alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi) { struct listen_ctx *lctx; INP_WLOCK_ASSERT(inp); lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); if (lctx == NULL) return (NULL); lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6); if (lctx->stid < 0) { free(lctx, M_CXGBE); return (NULL); } if (inp->inp_vflag & INP_IPV6 && !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) { - lctx->ce = t4_hold_lip(sc, &inp->in6p_laddr, NULL); + lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true); if (lctx->ce == NULL) { free(lctx, M_CXGBE); return (NULL); } } lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id]; lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq]; refcount_init(&lctx->refcount, 1); lctx->inp = inp; lctx->vnet = inp->inp_socket->so_vnet; in_pcbref(inp); return (lctx); } /* Don't call this directly, use release_lctx instead */ static int free_lctx(struct adapter *sc, struct listen_ctx *lctx) { struct inpcb *inp = lctx->inp; INP_WLOCK_ASSERT(inp); KASSERT(lctx->refcount == 0, ("%s: refcount %d", __func__, lctx->refcount)); KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", __func__, lctx->stid, lctx, lctx->inp); if (lctx->ce) - t4_release_lip(sc, lctx->ce); + t4_release_clip_entry(sc, lctx->ce); free_stid(sc, lctx); free(lctx, M_CXGBE); return (in_pcbrele_wlocked(inp)); } static void hold_lctx(struct listen_ctx *lctx) { refcount_acquire(&lctx->refcount); } static inline uint32_t listen_hashfn(void *key, u_long mask) { return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); } /* * Add a listen_ctx entry to the listen hash table. */ static void listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) { struct tom_data *td = sc->tom_softc; int bucket = listen_hashfn(lctx->inp, td->listen_mask); mtx_lock(&td->lctx_hash_lock); LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); td->lctx_count++; mtx_unlock(&td->lctx_hash_lock); } /* * Look for the listening socket's context entry in the hash and return it. 
*/ static struct listen_ctx * listen_hash_find(struct adapter *sc, struct inpcb *inp) { struct tom_data *td = sc->tom_softc; int bucket = listen_hashfn(inp, td->listen_mask); struct listen_ctx *lctx; mtx_lock(&td->lctx_hash_lock); LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { if (lctx->inp == inp) break; } mtx_unlock(&td->lctx_hash_lock); return (lctx); } /* * Removes the listen_ctx structure for inp from the hash and returns it. */ static struct listen_ctx * listen_hash_del(struct adapter *sc, struct inpcb *inp) { struct tom_data *td = sc->tom_softc; int bucket = listen_hashfn(inp, td->listen_mask); struct listen_ctx *lctx, *l; mtx_lock(&td->lctx_hash_lock); LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { if (lctx->inp == inp) { LIST_REMOVE(lctx, link); td->lctx_count--; break; } } mtx_unlock(&td->lctx_hash_lock); return (lctx); } /* * Releases a hold on the lctx. Must be called with the listening socket's inp * locked. The inp may be freed by this function and it returns NULL to * indicate this. */ static struct inpcb * release_lctx(struct adapter *sc, struct listen_ctx *lctx) { struct inpcb *inp = lctx->inp; int inp_freed = 0; INP_WLOCK_ASSERT(inp); if (refcount_release(&lctx->refcount)) inp_freed = free_lctx(sc, lctx); return (inp_freed ? NULL : inp); } static void send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe) { struct mbuf *m = synqe->syn; struct ifnet *ifp = m->m_pkthdr.rcvif; struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct wrqe *wr; struct fw_flowc_wr *flowc; struct sge_ofld_txq *ofld_txq; struct sge_ofld_rxq *ofld_rxq; const int nparams = 6; const int flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); const u_int pfvf = sc->pf << S_FW_VIID_PFN; INP_WLOCK_ASSERT(synqe->lctx->inp); MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0); ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx]; wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } flowc = wrtod(wr); memset(flowc, 0, wr->wr_len); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(nparams)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | V_FW_WR_FLOWID(synqe->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; flowc->mnemval[0].val = htobe32(pfvf); flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; flowc->mnemval[1].val = htobe32(pi->tx_chan); flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; flowc->mnemval[2].val = htobe32(pi->tx_chan); flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id); flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF; flowc->mnemval[4].val = htobe32(512); flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS; flowc->mnemval[5].val = htobe32(512); synqe->flags |= TPF_FLOWC_WR_SENT; t4_wrq_tx(sc, wr); } static void send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe, int rst_status) { struct adapter *sc = tod->tod_softc; struct wrqe *wr; struct cpl_abort_req *req; INP_WLOCK_ASSERT(synqe->lctx->inp); CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s", __func__, synqe, synqe->flags, synqe->tid, synqe->flags & TPF_ABORT_SHUTDOWN ? 
" (abort already in progress)" : ""); if (synqe->flags & TPF_ABORT_SHUTDOWN) return; /* abort already in progress */ synqe->flags |= TPF_ABORT_SHUTDOWN; if (!(synqe->flags & TPF_FLOWC_WR_SENT)) send_flowc_wr_synqe(sc, synqe); wr = alloc_wrqe(sizeof(*req), &sc->sge.ofld_txq[synqe->params.txq_idx].wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid); req->rsvd0 = 0; /* don't have a snd_nxt */ req->rsvd1 = 1; /* no data sent yet */ req->cmd = rst_status; t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]); } static int create_server(struct adapter *sc, struct listen_ctx *lctx) { struct wrqe *wr; struct cpl_pass_open_req *req; struct inpcb *inp = lctx->inp; wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); if (wr == NULL) { log(LOG_ERR, "%s: allocation failure", __func__); return (ENOMEM); } req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); req->local_port = inp->inp_lport; req->peer_port = 0; req->local_ip = inp->inp_laddr.s_addr; req->peer_ip = 0; req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); t4_wrq_tx(sc, wr); return (0); } static int create_server6(struct adapter *sc, struct listen_ctx *lctx) { struct wrqe *wr; struct cpl_pass_open_req6 *req; struct inpcb *inp = lctx->inp; wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); if (wr == NULL) { log(LOG_ERR, "%s: allocation failure", __func__); return (ENOMEM); } req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid)); req->local_port = inp->inp_lport; req->peer_port = 0; req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; req->peer_ip_hi = 0; req->peer_ip_lo = 0; req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); t4_wrq_tx(sc, wr); return (0); } static int destroy_server(struct adapter *sc, struct listen_ctx *lctx) { struct wrqe *wr; struct cpl_close_listsvr_req *req; wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, lctx->stid)); req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); req->rsvd = htobe16(0); t4_wrq_tx(sc, wr); return (0); } /* * Start a listening server by sending a passive open request to HW. * * Can't take adapter lock here and access to sc->flags, * sc->offload_map, if_capenable are all race prone. */ int t4_listen_start(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; struct vi_info *vi; struct port_info *pi; struct inpcb *inp = tp->t_inpcb; struct listen_ctx *lctx; int i, rc, v; struct offload_settings settings; INP_WLOCK_ASSERT(inp); rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, EVL_MAKETAG(0xfff, 0, 0), inp); rw_runlock(&sc->policy_lock); if (!settings.offload) return (0); /* Don't start a hardware listener for any loopback address. 
*/ if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) return (0); if (!(inp->inp_vflag & INP_IPV6) && IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr))) return (0); if (sc->flags & KERN_TLS_ON) return (0); #if 0 ADAPTER_LOCK(sc); if (IS_BUSY(sc)) { log(LOG_ERR, "%s: listen request ignored, %s is busy", __func__, device_get_nameunit(sc->dev)); goto done; } KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM not initialized", __func__)); #endif /* * Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first * such VI's queues to send the passive open and receive the reply to * it. * * XXX: need a way to mark a port in use by offload. if_cxgbe should * then reject any attempt to bring down such a port (and maybe reject * attempts to disable IFCAP_TOE on that port too?). */ for_each_port(sc, i) { pi = sc->port[i]; for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE && vi->ifp->if_capenable & IFCAP_TOE) goto found; } } goto done; /* no port that's UP with IFCAP_TOE enabled */ found: if (listen_hash_find(sc, inp) != NULL) goto done; /* already setup */ lctx = alloc_lctx(sc, inp, vi); if (lctx == NULL) { log(LOG_ERR, "%s: listen request ignored, %s couldn't allocate lctx\n", __func__, device_get_nameunit(sc->dev)); goto done; } listen_hash_add(sc, lctx); CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x", __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp, inp->inp_vflag); if (inp->inp_vflag & INP_IPV6) rc = create_server6(sc, lctx); else rc = create_server(sc, lctx); if (rc != 0) { log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n", __func__, device_get_nameunit(sc->dev), rc); (void) listen_hash_del(sc, inp); inp = release_lctx(sc, lctx); /* can't be freed, host stack has a reference */ KASSERT(inp != NULL, ("%s: inp freed", __func__)); goto done; } lctx->flags |= LCTX_RPL_PENDING; done: #if 0 ADAPTER_UNLOCK(sc); #endif return (0); } int t4_listen_stop(struct toedev *tod, struct tcpcb *tp) { struct listen_ctx *lctx; struct adapter *sc = tod->tod_softc; struct inpcb *inp = tp->t_inpcb; INP_WLOCK_ASSERT(inp); lctx = listen_hash_del(sc, inp); if (lctx == NULL) return (ENOENT); /* no hardware listener for this inp */ CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, lctx, lctx->flags); /* * If the reply to the PASS_OPEN is still pending we'll wait for it to * arrive and clean up when it does. */ if (lctx->flags & LCTX_RPL_PENDING) { return (EINPROGRESS); } destroy_server(sc, lctx); return (0); } static inline struct synq_entry * alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags) { struct synq_entry *synqe; INP_RLOCK_ASSERT(lctx->inp); MPASS(flags == M_WAITOK || flags == M_NOWAIT); synqe = malloc(sizeof(*synqe), M_CXGBE, flags); if (__predict_true(synqe != NULL)) { synqe->flags = TPF_SYNQE; refcount_init(&synqe->refcnt, 1); synqe->lctx = lctx; hold_lctx(lctx); /* Every synqe has a ref on its lctx. 
*/ synqe->syn = NULL; } return (synqe); } static inline void hold_synqe(struct synq_entry *synqe) { refcount_acquire(&synqe->refcnt); } static inline struct inpcb * release_synqe(struct adapter *sc, struct synq_entry *synqe) { struct inpcb *inp; MPASS(synqe->flags & TPF_SYNQE); MPASS(synqe->lctx != NULL); inp = synqe->lctx->inp; MPASS(inp != NULL); INP_WLOCK_ASSERT(inp); if (refcount_release(&synqe->refcnt)) { inp = release_lctx(sc, synqe->lctx); m_freem(synqe->syn); free(synqe, M_CXGBE); } return (inp); } void t4_syncache_added(struct toedev *tod __unused, void *arg) { struct synq_entry *synqe = arg; hold_synqe(synqe); } void t4_syncache_removed(struct toedev *tod, void *arg) { struct adapter *sc = tod->tod_softc; struct synq_entry *synqe = arg; struct inpcb *inp = synqe->lctx->inp; /* * XXX: this is a LOR but harmless when running from the softclock. */ INP_WLOCK(inp); inp = release_synqe(sc, synqe); if (inp != NULL) INP_WUNLOCK(inp); } int t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) { struct synq_entry *synqe = arg; if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) { struct tcpopt to; struct ip *ip = mtod(m, struct ip *); struct tcphdr *th; if (ip->ip_v == IPVERSION) th = (void *)(ip + 1); else th = (void *)((struct ip6_hdr *)ip + 1); bzero(&to, sizeof(to)); tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th), TO_SYN); /* save these for later */ synqe->iss = be32toh(th->th_seq); synqe->irs = be32toh(th->th_ack) - 1; synqe->ts = to.to_tsval; } m_freem(m); /* don't need this any more */ return (0); } static int do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); int stid = GET_TID(cpl); unsigned int status = cpl->status; struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_PASS_OPEN_RPL, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); INP_WLOCK(inp); CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", __func__, stid, status, lctx->flags); lctx->flags &= ~LCTX_RPL_PENDING; if (status != CPL_ERR_NONE) log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status); #ifdef INVARIANTS /* * If the inp has been dropped (listening socket closed) then * listen_stop must have run and taken the inp out of the hash. */ if (inp->inp_flags & INP_DROPPED) { KASSERT(listen_hash_del(sc, inp) == NULL, ("%s: inp %p still in listen hash", __func__, inp)); } #endif if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { if (release_lctx(sc, lctx) != NULL) INP_WUNLOCK(inp); return (status); } /* * Listening socket stopped listening earlier and now the chip tells us * it has started the hardware listener. Stop it; the lctx will be * released in do_close_server_rpl. */ if (inp->inp_flags & INP_DROPPED) { destroy_server(sc, lctx); INP_WUNLOCK(inp); return (status); } /* * Failed to start hardware listener. Take inp out of the hash and * release our reference on it. An error message has been logged * already. 
*/ if (status != CPL_ERR_NONE) { listen_hash_del(sc, inp); if (release_lctx(sc, lctx) != NULL) INP_WUNLOCK(inp); return (status); } /* hardware listener open for business */ INP_WUNLOCK(inp); return (status); } static int do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); int stid = GET_TID(cpl); unsigned int status = cpl->status; struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); if (status != CPL_ERR_NONE) { log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n", __func__, status, stid); return (status); } INP_WLOCK(inp); inp = release_lctx(sc, lctx); if (inp != NULL) INP_WUNLOCK(inp); return (status); } static void done_with_synqe(struct adapter *sc, struct synq_entry *synqe) { struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; int ntids; INP_WLOCK_ASSERT(inp); ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1; remove_tid(sc, synqe->tid, ntids); release_tid(sc, synqe->tid, lctx->ctrlq); t4_l2t_release(e); inp = release_synqe(sc, synqe); if (inp) INP_WUNLOCK(inp); } void synack_failure_cleanup(struct adapter *sc, int tid) { struct synq_entry *synqe = lookup_tid(sc, tid); INP_WLOCK(synqe->lctx->inp); done_with_synqe(sc, synqe); } int do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; struct sge_ofld_txq *ofld_txq; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_REQ_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); if (negative_advice(cpl->status)) return (0); /* Ignore negative advice */ INP_WLOCK(inp); ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; if (!(synqe->flags & TPF_FLOWC_WR_SENT)) send_flowc_wr_synqe(sc, synqe); /* * If we'd initiated an abort earlier the reply to it is responsible for * cleaning up resources. Otherwise we tear everything down right here * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 
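* Hence the unconditional send_abort_rpl() at the done label below, issued
* with CPL_ABORT_NO_RST so that the hw does not generate an RST of its own.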
*/ if (synqe->flags & TPF_ABORT_SHUTDOWN) { INP_WUNLOCK(inp); goto done; } done_with_synqe(sc, synqe); /* inp lock released by done_with_synqe */ done: send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); return (0); } int do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_RPL_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); INP_WLOCK(inp); KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, ("%s: wasn't expecting abort reply for synqe %p (0x%x)", __func__, synqe, synqe->flags)); done_with_synqe(sc, synqe); /* inp lock released by done_with_synqe */ return (0); } void t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) { struct adapter *sc = tod->tod_softc; struct synq_entry *synqe = arg; struct inpcb *inp = sotoinpcb(so); struct toepcb *toep = synqe->toep; NET_EPOCH_ASSERT(); /* prevents bad race with accept() */ INP_WLOCK_ASSERT(inp); KASSERT(synqe->flags & TPF_SYNQE, ("%s: %p not a synq_entry?", __func__, arg)); MPASS(toep->tid == synqe->tid); offload_socket(so, toep); make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt); toep->flags |= TPF_CPL_PENDING; update_tid(sc, synqe->tid, toep); synqe->flags |= TPF_SYNQE_EXPANDED; inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ? 
M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4; inp->inp_flowid = synqe->rss_hash; } static void t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) { bzero(to, sizeof(*to)); if (t4opt->mss) { to->to_flags |= TOF_MSS; to->to_mss = be16toh(t4opt->mss); } if (t4opt->wsf > 0 && t4opt->wsf < 15) { to->to_flags |= TOF_SCALE; to->to_wscale = t4opt->wsf; } if (t4opt->tstamp) to->to_flags |= TOF_TS; if (t4opt->sack) to->to_flags |= TOF_SACKPERM; } static bool encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl) { u_int hlen = be32toh(cpl->hdr_len); if (chip_id(sc) >= CHELSIO_T6) return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); else return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); } static void pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m, struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos) { const struct cpl_pass_accept_req *cpl = mtod(m, const void *); const struct ether_header *eh; unsigned int hlen = be32toh(cpl->hdr_len); uintptr_t l3hdr; const struct tcphdr *tcp; eh = (const void *)(cpl + 1); if (chip_id(sc) >= CHELSIO_T6) { l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen)); tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen)); } else { l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen)); } /* extract TOS (DiffServ + ECN) byte for AccECN */ if (iptos) { if (((struct ip *)l3hdr)->ip_v == IPVERSION) { const struct ip *ip = (const void *)l3hdr; *iptos = ip->ip_tos; } #ifdef INET6 else if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) { const struct ip6_hdr *ip6 = (const void *)l3hdr; *iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; } #endif /* INET */ } if (inc) { bzero(inc, sizeof(*inc)); inc->inc_fport = tcp->th_sport; inc->inc_lport = tcp->th_dport; if (((struct ip *)l3hdr)->ip_v == IPVERSION) { const struct ip *ip = (const void *)l3hdr; inc->inc_faddr = ip->ip_src; inc->inc_laddr = ip->ip_dst; } else { const struct ip6_hdr *ip6 = (const void *)l3hdr; inc->inc_flags |= INC_ISIPV6; inc->inc6_faddr = ip6->ip6_src; inc->inc6_laddr = ip6->ip6_dst; } } if (th) { bcopy(tcp, th, sizeof(*th)); tcp_fields_to_host(th); /* just like tcp_input */ } } static struct l2t_entry * get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp, struct in_conninfo *inc) { struct l2t_entry *e; struct sockaddr_in6 sin6; struct sockaddr *dst = (void *)&sin6; struct nhop_object *nh; if (inc->inc_flags & INC_ISIPV6) { bzero(dst, sizeof(struct sockaddr_in6)); dst->sa_len = sizeof(struct sockaddr_in6); dst->sa_family = AF_INET6; if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) { /* no need for route lookup */ e = t4_l2t_get(pi, ifp, dst); return (e); } nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (NULL); if (nh->nh_ifp != ifp) return (NULL); if (nh->nh_flags & NHF_GATEWAY) ((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr; else ((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr; } else { dst->sa_len = sizeof(struct sockaddr_in); dst->sa_family = AF_INET; nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (NULL); if (nh->nh_ifp != ifp) return (NULL); if (nh->nh_flags & NHF_GATEWAY) ((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr; else ((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr; } e = t4_l2t_get(pi, ifp, dst); return (e); } static int send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0, uint32_t opt2, int tid) { struct wrqe *wr; 
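/*
 * T5 and later use an extended CPL_PASS_ACCEPT_RPL that also carries the
 * initial send sequence number; the allocation and the is_t4() branch
 * below size and fill the reply accordingly.
 */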
struct cpl_pass_accept_rpl *rpl; struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) : sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]); if (wr == NULL) return (ENOMEM); rpl = wrtod(wr); if (is_t4(sc)) INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid); else { struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl; INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid); rpl5->iss = htobe32(synqe->iss); } rpl->opt0 = opt0; rpl->opt2 = opt2; return (t4_l2t_send(sc, wr, e)); } #define REJECT_PASS_ACCEPT_REQ(tunnel) do { \ if (!tunnel) { \ m_freem(m); \ m = NULL; \ } \ reject_reason = __LINE__; \ goto reject; \ } while (0) /* * The context associated with a tid entry via insert_tid could be a synq_entry * or a toepcb. The only way CPL handlers can tell is via a bit in these flags. */ CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags)); /* * Incoming SYN on a listening socket. * * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe, * etc. */ static int do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct toedev *tod; const struct cpl_pass_accept_req *cpl = mtod(m, const void *); unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); unsigned int tid = GET_TID(cpl); struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp; struct socket *so; struct in_conninfo inc; struct tcphdr th; struct tcpopt to; struct port_info *pi; struct vi_info *vi; struct ifnet *hw_ifp, *ifp; struct l2t_entry *e = NULL; struct synq_entry *synqe = NULL; int reject_reason, v, ntids; uint16_t vid, l2info; struct epoch_tracker et; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif struct offload_settings settings; uint8_t iptos; KASSERT(opcode == CPL_PASS_ACCEPT_REQ, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid, lctx); /* * Figure out the port the SYN arrived on. We'll look for an exact VI * match in a bit but in case we don't find any we'll use the main VI as * the incoming ifnet. */ l2info = be16toh(cpl->l2info); pi = sc->port[G_SYN_INTF(l2info)]; hw_ifp = pi->vi[0].ifp; m->m_pkthdr.rcvif = hw_ifp; CURVNET_SET(lctx->vnet); /* before any potential REJECT */ /* * If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will * also hit the listener. We don't want to offload those. */ if (encapsulated_syn(sc, cpl)) { REJECT_PASS_ACCEPT_REQ(true); } /* * Use the MAC index to lookup the associated VI. If this SYN didn't * match a perfect MAC filter, punt. */ if (!(l2info & F_SYN_XACT_MATCH)) { REJECT_PASS_ACCEPT_REQ(true); } for_each_vi(pi, v, vi) { if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info)) goto found; } REJECT_PASS_ACCEPT_REQ(true); found: hw_ifp = vi->ifp; /* the cxgbe ifnet */ m->m_pkthdr.rcvif = hw_ifp; tod = TOEDEV(hw_ifp); /* * Don't offload if the peer requested a TCP option that's not known to * the silicon. Send the SYN to the kernel instead. */ if (__predict_false(cpl->tcpopt.unknown)) REJECT_PASS_ACCEPT_REQ(true); /* * Figure out if there is a pseudo interface (vlan, lagg, etc.) * involved. Don't offload if the SYN had a VLAN tag and the vid * doesn't match anything on this interface. * * XXX: lagg support, lagg + vlan support. 
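* In the vid check below, 0xfff is the hw's no-VLAN-tag sentinel and 0 a
* priority-tagged frame; both are treated as untagged and stay on hw_ifp.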
*/ vid = EVL_VLANOFTAG(be16toh(cpl->vlan)); if (vid != 0xfff && vid != 0) { ifp = VLAN_DEVAT(hw_ifp, vid); if (ifp == NULL) REJECT_PASS_ACCEPT_REQ(true); } else ifp = hw_ifp; /* * Don't offload if the ifnet that the SYN came in on is not in the same * vnet as the listening socket. */ if (lctx->vnet != ifp->if_vnet) REJECT_PASS_ACCEPT_REQ(true); pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos); if (inc.inc_flags & INC_ISIPV6) { /* Don't offload if the ifcap isn't enabled */ if ((ifp->if_capenable & IFCAP_TOE6) == 0) REJECT_PASS_ACCEPT_REQ(true); /* * SYN must be directed to an IP6 address on this ifnet. This * is more restrictive than in6_localip. */ NET_EPOCH_ENTER(et); if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } ntids = 2; } else { /* Don't offload if the ifcap isn't enabled */ if ((ifp->if_capenable & IFCAP_TOE4) == 0) REJECT_PASS_ACCEPT_REQ(true); /* * SYN must be directed to an IP address on this ifnet. This * is more restrictive than in_localip. */ NET_EPOCH_ENTER(et); if (!in_ifhasaddr(ifp, inc.inc_laddr)) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } ntids = 1; } e = get_l2te_for_nexthop(pi, ifp, &inc); if (e == NULL) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } /* Don't offload if the 4-tuple is already in use */ if (toe_4tuple_check(&inc, &th, ifp) != 0) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } inp = lctx->inp; /* listening socket, not owned by TOE */ INP_RLOCK(inp); /* Don't offload if the listening socket has closed */ if (__predict_false(inp->inp_flags & INP_DROPPED)) { INP_RUNLOCK(inp); NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } so = inp->inp_socket; rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, EVL_MAKETAG(0xfff, 0, 0), inp); rw_runlock(&sc->policy_lock); if (!settings.offload) { INP_RUNLOCK(inp); NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); /* Rejected by COP. */ } synqe = alloc_synqe(sc, lctx, M_NOWAIT); if (synqe == NULL) { INP_RUNLOCK(inp); NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } MPASS(rss->hash_type == RSS_HASH_TCP); synqe->rss_hash = be32toh(rss->hash_val); atomic_store_int(&synqe->ok_to_respond, 0); init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx, &synqe->params); /* * If all goes well t4_syncache_respond will get called during * syncache_add. Note that syncache_add releases the pcb lock. 
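* t4_syncache_respond() does not transmit anything itself; it only bumps
* synqe->ok_to_respond, and the check below turns that into a single hw
* SYN|ACK via send_synack() once the tid has been installed.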
*/ t4opt_to_tcpopt(&cpl->tcpopt, &to); toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos); if (atomic_load_int(&synqe->ok_to_respond) > 0) { uint64_t opt0; uint32_t opt2; opt0 = calc_options0(vi, &synqe->params); opt2 = calc_options2(vi, &synqe->params); insert_tid(sc, tid, synqe, ntids); synqe->tid = tid; synqe->syn = m; m = NULL; if (send_synack(sc, synqe, opt0, opt2, tid) != 0) { remove_tid(sc, tid, ntids); m = synqe->syn; synqe->syn = NULL; NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } CTR6(KTR_CXGBE, "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x", __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2)); } else { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); reject: CURVNET_RESTORE(); CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid, reject_reason); if (e) t4_l2t_release(e); release_tid(sc, tid, lctx->ctrlq); if (synqe) { inp = synqe->lctx->inp; INP_WLOCK(inp); inp = release_synqe(sc, synqe); if (inp) INP_WUNLOCK(inp); } if (m) { /* * The connection request hit a TOE listener but is being passed * on to the kernel sw stack instead of getting offloaded. */ m_adj(m, sizeof(*cpl)); m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; hw_ifp->if_input(hw_ifp, m); } return (reject_reason); } static void synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe, const struct cpl_pass_establish *cpl, struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to) { uint16_t tcp_opt = be16toh(cpl->tcp_opt); uint8_t iptos; /* start off with the original SYN */ pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos); /* modify parts to make it look like the ACK to our SYN|ACK */ th->th_flags = TH_ACK; th->th_ack = synqe->iss + 1; th->th_seq = be32toh(cpl->rcv_isn); bzero(to, sizeof(*to)); if (G_TCPOPT_TSTAMP(tcp_opt)) { to->to_flags |= TOF_TS; to->to_tsecr = synqe->ts; } } static int do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct vi_info *vi; struct ifnet *ifp; const struct cpl_pass_establish *cpl = (const void *)(rss + 1); #if defined(KTR) || defined(INVARIANTS) unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); #endif unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp, *new_inp; struct socket *so; struct tcphdr th; struct tcpopt to; struct in_conninfo inc; struct toepcb *toep; struct epoch_tracker et; int rstreason; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_PASS_ESTABLISH, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); KASSERT(synqe->flags & TPF_SYNQE, ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); CURVNET_SET(lctx->vnet); NET_EPOCH_ENTER(et); /* for syncache_expand */ INP_WLOCK(inp); CTR6(KTR_CXGBE, "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x", __func__, stid, tid, synqe, synqe->flags, inp->inp_flags); ifp = synqe->syn->m_pkthdr.rcvif; vi = ifp->if_softc; KASSERT(vi->adapter == sc, ("%s: vi %p, sc %p mismatch", __func__, vi, sc)); if (__predict_false(inp->inp_flags & INP_DROPPED)) { reset: send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_SEND_RST); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); 
CURVNET_RESTORE(); return (0); } KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0], ("%s: CPL arrived on unexpected rxq. %d %d", __func__, synqe->params.rxq_idx, (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0]))); toep = alloc_toepcb(vi, M_NOWAIT); if (toep == NULL) goto reset; toep->tid = tid; toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx]; toep->vnet = lctx->vnet; bcopy(&synqe->params, &toep->params, sizeof(toep->params)); init_toepcb(vi, toep); MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss); MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs); synqe->tcp_opt = cpl->tcp_opt; synqe->toep = toep; /* Come up with something that syncache_expand should be ok with. */ synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to); - if (inc.inc_flags & INC_ISIPV6) - toep->ce = t4_hold_lip(sc, &inc.inc6_laddr, lctx->ce); + if (inc.inc_flags & INC_ISIPV6) { + if (lctx->ce == NULL) { + toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true); + if (toep->ce == NULL) { + free_toepcb(toep); + goto reset; /* RST without a CLIP entry? */ + } + } else { + t4_hold_clip_entry(sc, lctx->ce); + toep->ce = lctx->ce; + } + } so = inp->inp_socket; KASSERT(so != NULL, ("%s: socket is NULL", __func__)); rstreason = toe_syncache_expand(&inc, &to, &th, &so); if (rstreason < 0) { free_toepcb(toep); send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } else if (rstreason == 0 || so == NULL) { free_toepcb(toep); goto reset; } /* New connection inpcb is already locked by syncache_expand(). */ new_inp = sotoinpcb(so); INP_WLOCK_ASSERT(new_inp); MPASS(so->so_vnet == lctx->vnet); /* * This is for expansion from syncookies. * * XXX: we've held the tcbinfo lock throughout so there's no risk of * anyone accept'ing a connection before we've installed our hooks, but * this somewhat defeats the purpose of having a tod_offload_socket :-( */ if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) { tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0); t4_offload_socket(TOEDEV(ifp), synqe, so); } INP_WUNLOCK(new_inp); /* Done with the synqe */ inp = release_synqe(sc, synqe); if (inp != NULL) INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } void t4_init_listen_cpl_handlers(void) { t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl); t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl); t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); } void t4_uninit_listen_cpl_handlers(void) { t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL); t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL); t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL); t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL); } #endif diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c index 173357404ebe..97693ab74000 100644 --- a/sys/dev/cxgbe/tom/t4_tom.c +++ b/sys/dev/cxgbe/tom/t4_tom.c @@ -1,2113 +1,2113 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_tcb.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" #include "tom/t4_tls.h" static struct protosw *tcp_protosw; static struct protosw toe_protosw; static struct pr_usrreqs toe_usrreqs; static struct protosw *tcp6_protosw; static struct protosw toe6_protosw; static struct pr_usrreqs toe6_usrreqs; /* Module ops */ static int t4_tom_mod_load(void); static int t4_tom_mod_unload(void); static int t4_tom_modevent(module_t, int, void *); /* ULD ops and helpers */ static int t4_tom_activate(struct adapter *); static int t4_tom_deactivate(struct adapter *); static struct uld_info tom_uld_info = { .uld_id = ULD_TOM, .activate = t4_tom_activate, .deactivate = t4_tom_deactivate, }; static void release_offload_resources(struct toepcb *); static int alloc_tid_tabs(struct tid_info *); static void free_tid_tabs(struct tid_info *); static void free_tom_data(struct adapter *, struct tom_data *); static void reclaim_wr_resources(void *, int); struct toepcb * alloc_toepcb(struct vi_info *vi, int flags) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct toepcb *toep; int tx_credits, txsd_total, len; /* * The firmware counts tx work request credits in units of 16 bytes * each. Reserve room for an ABORT_REQ so the driver never has to worry * about tx credits if it wants to abort a connection. */ tx_credits = sc->params.ofldq_wr_cred; tx_credits -= howmany(sizeof(struct cpl_abort_req), 16); /* * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte * immediate payload, and firmware counts tx work request credits in * units of 16 byte. Calculate the maximum work requests possible. 
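* For example, with purely illustrative numbers: if ofldq_wr_cred were 512
* credits and CPL_ABORT_REQ took one credit, tx_credits would be 511; a
* shortest work request of 3 credits would then allow
* txsd_total = 511 / 3 = 170 outstanding tx descriptors.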
*/ txsd_total = tx_credits / howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16); len = offsetof(struct toepcb, txsd) + txsd_total * sizeof(struct ofld_tx_sdesc); toep = malloc(len, M_CXGBE, M_ZERO | flags); if (toep == NULL) return (NULL); refcount_init(&toep->refcount, 1); toep->td = sc->tom_softc; toep->vi = vi; toep->tid = -1; toep->tx_total = tx_credits; toep->tx_credits = tx_credits; mbufq_init(&toep->ulp_pduq, INT_MAX); mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX); toep->txsd_total = txsd_total; toep->txsd_avail = txsd_total; toep->txsd_pidx = 0; toep->txsd_cidx = 0; aiotx_init_toep(toep); return (toep); } /* * Initialize a toepcb after its params have been filled out. */ int init_toepcb(struct vi_info *vi, struct toepcb *toep) { struct conn_params *cp = &toep->params; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct tx_cl_rl_params *tc; if (cp->tc_idx >= 0 && cp->tc_idx < sc->chip_params->nsched_cls) { tc = &pi->sched_params->cl_rl[cp->tc_idx]; mtx_lock(&sc->tc_lock); if (tc->flags & CLRL_ERR) { log(LOG_ERR, "%s: failed to associate traffic class %u with tid %u\n", device_get_nameunit(vi->dev), cp->tc_idx, toep->tid); cp->tc_idx = -1; } else { tc->refcount++; } mtx_unlock(&sc->tc_lock); } toep->ofld_txq = &sc->sge.ofld_txq[cp->txq_idx]; toep->ofld_rxq = &sc->sge.ofld_rxq[cp->rxq_idx]; toep->ctrlq = &sc->sge.ctrlq[pi->port_id]; tls_init_toep(toep); if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_init_toep(toep); toep->flags |= TPF_INITIALIZED; return (0); } struct toepcb * hold_toepcb(struct toepcb *toep) { refcount_acquire(&toep->refcount); return (toep); } void free_toepcb(struct toepcb *toep) { if (refcount_release(&toep->refcount) == 0) return; KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: attached to an inpcb", __func__)); KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: CPL pending", __func__)); if (toep->flags & TPF_INITIALIZED) { if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_uninit_toep(toep); tls_uninit_toep(toep); } free(toep, M_CXGBE); } /* * Set up the socket for TCP offload. */ void offload_socket(struct socket *so, struct toepcb *toep) { struct tom_data *td = toep->td; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct sockbuf *sb; INP_WLOCK_ASSERT(inp); /* Update socket */ sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; if (inp->inp_vflag & INP_IPV6) so->so_proto = &toe6_protosw; else so->so_proto = &toe_protosw; SOCKBUF_UNLOCK(sb); /* Update TCP PCB */ tp->tod = &td->tod; tp->t_toe = toep; tp->t_flags |= TF_TOE; /* Install an extra hold on inp */ toep->inp = inp; toep->flags |= TPF_ATTACHED; in_pcbref(inp); /* Add the TOE PCB to the active list */ mtx_lock(&td->toep_list_lock); TAILQ_INSERT_HEAD(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } void restore_so_proto(struct socket *so, bool v6) { if (v6) so->so_proto = tcp6_protosw; else so->so_proto = tcp_protosw; } /* This is _not_ the normal way to "unoffload" a socket. 
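* It is only used to back out of a failed active open before the hw has
* taken over: the function below restores the plain TCP protosw, clears
* the TOE hooks from the tcpcb, and drops the extra inpcb reference that
* offload_socket() took.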
*/ void undo_offload_socket(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; struct tom_data *td = toep->td; struct sockbuf *sb; INP_WLOCK_ASSERT(inp); sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; restore_so_proto(so, inp->inp_vflag & INP_IPV6); SOCKBUF_UNLOCK(sb); tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->inp = NULL; toep->flags &= ~TPF_ATTACHED; if (in_pcbrele_wlocked(inp)) panic("%s: inp freed.", __func__); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } static void release_offload_resources(struct toepcb *toep) { struct tom_data *td = toep->td; struct adapter *sc = td_adapter(td); int tid = toep->tid; KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: %p has CPL pending.", __func__, toep)); KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: %p is still attached.", __func__, toep)); CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)", __func__, toep, tid, toep->l2te, toep->ce); /* * These queues should have been emptied at approximately the same time * that a normal connection's socket's so_snd would have been purged or * drained. Do _not_ clean up here. */ MPASS(mbufq_len(&toep->ulp_pduq) == 0); MPASS(mbufq_len(&toep->ulp_pdu_reclaimq) == 0); #ifdef INVARIANTS if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_assert_empty(toep); #endif MPASS(TAILQ_EMPTY(&toep->aiotx_jobq)); if (toep->l2te) t4_l2t_release(toep->l2te); if (tid >= 0) { remove_tid(sc, tid, toep->ce ? 2 : 1); release_tid(sc, tid, toep->ctrlq); } if (toep->ce) - t4_release_lip(sc, toep->ce); + t4_release_clip_entry(sc, toep->ce); if (toep->params.tc_idx != -1) t4_release_cl_rl(sc, toep->vi->pi->port_id, toep->params.tc_idx); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); free_toepcb(toep); } /* * The kernel is done with the TCP PCB and this is our opportunity to unhook the * toepcb hanging off of it. If the TOE driver is also done with the toepcb (no * pending CPL) then it is time to release all resources tied to the toepcb. * * Also gets called when an offloaded active open fails and the TOM wants the * kernel to take the TCP PCB back. */ static void t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) { #if defined(KTR) || defined(INVARIANTS) struct inpcb *inp = tp->t_inpcb; #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); KASSERT(toep->flags & TPF_ATTACHED, ("%s: not attached", __func__)); #ifdef KTR if (tp->t_state == TCPS_SYN_SENT) { CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); } else { CTR6(KTR_CXGBE, "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)", toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp, inp->inp_flags); } #endif if (ulp_mode(toep) == ULP_MODE_TLS) tls_detach(toep); tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->flags &= ~TPF_ATTACHED; if (!(toep->flags & TPF_CPL_PENDING)) release_offload_resources(toep); } /* * setsockopt handler. 
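* Reached via the toedev's tod_ctloutput hook when a socket option changes
* on an offloaded connection.  E.g. the standard
*
*	int one = 1;
*	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
*
* lands here, and the TCP_NODELAY case below pushes the new Nagle setting
* into the connection's TCB with a SET_TCB_FIELD.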
*/ static void t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; if (dir == SOPT_GET) return; CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name); switch (name) { case TCP_NODELAY: if (tp->t_state != TCPS_ESTABLISHED) break; toep->params.nagle = tp->t_flags & TF_NODELAY ? 0 : 1; t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, V_TF_NAGLE(1), V_TF_NAGLE(toep->params.nagle), 0, 0); break; default: break; } } static inline uint64_t get_tcb_tflags(const uint64_t *tcb) { return ((be64toh(tcb[14]) << 32) | (be64toh(tcb[15]) >> 32)); } static inline uint32_t get_tcb_field(const uint64_t *tcb, u_int word, uint32_t mask, u_int shift) { #define LAST_WORD ((TCB_SIZE / 4) - 1) uint64_t t1, t2; int flit_idx; MPASS(mask != 0); MPASS(word <= LAST_WORD); MPASS(shift < 32); flit_idx = (LAST_WORD - word) / 2; if (word & 0x1) shift += 32; t1 = be64toh(tcb[flit_idx]) >> shift; t2 = 0; if (fls(mask) > 64 - shift) { /* * Will spill over into the next logical flit, which is the flit * before this one. The flit_idx before this one must be valid. */ MPASS(flit_idx > 0); t2 = be64toh(tcb[flit_idx - 1]) << (64 - shift); } return ((t2 | t1) & mask); #undef LAST_WORD } #define GET_TCB_FIELD(tcb, F) \ get_tcb_field(tcb, W_TCB_##F, M_TCB_##F, S_TCB_##F) /* * Issues a CPL_GET_TCB to read the entire TCB for the tid. */ static int send_get_tcb(struct adapter *sc, u_int tid) { struct cpl_get_tcb *cpl; struct wrq_cookie cookie; MPASS(tid < sc->tids.ntids); cpl = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*cpl), 16), &cookie); if (__predict_false(cpl == NULL)) return (ENOMEM); bzero(cpl, sizeof(*cpl)); INIT_TP_WR(cpl, tid); OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_GET_TCB, tid)); cpl->reply_ctrl = htobe16(V_REPLY_CHAN(0) | V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id)); cpl->cookie = 0xff; commit_wrq_wr(&sc->sge.ctrlq[0], cpl, &cookie); return (0); } static struct tcb_histent * alloc_tcb_histent(struct adapter *sc, u_int tid, int flags) { struct tcb_histent *te; MPASS(flags == M_NOWAIT || flags == M_WAITOK); te = malloc(sizeof(*te), M_CXGBE, M_ZERO | flags); if (te == NULL) return (NULL); mtx_init(&te->te_lock, "TCB entry", NULL, MTX_DEF); callout_init_mtx(&te->te_callout, &te->te_lock, 0); te->te_adapter = sc; te->te_tid = tid; return (te); } static void free_tcb_histent(struct tcb_histent *te) { mtx_destroy(&te->te_lock); free(te, M_CXGBE); } /* * Start tracking the tid in the TCB history. 
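 */

/*
 * get_tcb_field() above digs a field out of a raw TCB, which the hardware
 * returns as big-endian 64-bit flits with word 0 in the *last* flit.  A
 * userspace sketch of the same index/shift/spill logic; the TCB size and
 * planted field here are assumptions for the demo, not chip constants:
 */
#if 0	/* illustration only */
#include <sys/endian.h>
#include <stdint.h>
#include <stdio.h>
#include <strings.h>		/* fls() */

#define TCB_BYTES	128	/* assumed: 32 words, 16 flits */

static uint32_t
tcb_field(const uint64_t *tcb, unsigned word, uint32_t mask, unsigned shift)
{
	const unsigned last_word = TCB_BYTES / 4 - 1;
	unsigned flit_idx = (last_word - word) / 2;
	uint64_t t1, t2 = 0;

	if (word & 1)
		shift += 32;	/* odd words live in the high half */
	t1 = be64toh(tcb[flit_idx]) >> shift;
	if (fls(mask) > (int)(64 - shift)) {
		/* Field spills into the preceding flit. */
		t2 = be64toh(tcb[flit_idx - 1]) << (64 - shift);
	}
	return ((t2 | t1) & mask);
}

int
main(void)
{
	uint64_t tcb[TCB_BYTES / 8] = { 0 };

	/* Plant 0x5 in word 3 at bit offset 4 (4-bit field), read it back. */
	tcb[(TCB_BYTES / 4 - 1 - 3) / 2] = htobe64((uint64_t)0x5 << (4 + 32));
	printf("0x%x\n", tcb_field(tcb, 3, 0xf, 4));
	return (0);
}
#endif

/*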
*/ int add_tid_to_history(struct adapter *sc, u_int tid) { struct tcb_histent *te = NULL; struct tom_data *td = sc->tom_softc; int rc; MPASS(tid < sc->tids.ntids); if (td->tcb_history == NULL) return (ENXIO); rw_wlock(&td->tcb_history_lock); if (td->tcb_history[tid] != NULL) { rc = EEXIST; goto done; } te = alloc_tcb_histent(sc, tid, M_NOWAIT); if (te == NULL) { rc = ENOMEM; goto done; } mtx_lock(&te->te_lock); rc = send_get_tcb(sc, tid); if (rc == 0) { te->te_flags |= TE_RPL_PENDING; td->tcb_history[tid] = te; } else { free(te, M_CXGBE); } mtx_unlock(&te->te_lock); done: rw_wunlock(&td->tcb_history_lock); return (rc); } static void remove_tcb_histent(struct tcb_histent *te) { struct adapter *sc = te->te_adapter; struct tom_data *td = sc->tom_softc; rw_assert(&td->tcb_history_lock, RA_WLOCKED); mtx_assert(&te->te_lock, MA_OWNED); MPASS(td->tcb_history[te->te_tid] == te); td->tcb_history[te->te_tid] = NULL; free_tcb_histent(te); rw_wunlock(&td->tcb_history_lock); } static inline struct tcb_histent * lookup_tcb_histent(struct adapter *sc, u_int tid, bool addrem) { struct tcb_histent *te; struct tom_data *td = sc->tom_softc; MPASS(tid < sc->tids.ntids); if (td->tcb_history == NULL) return (NULL); if (addrem) rw_wlock(&td->tcb_history_lock); else rw_rlock(&td->tcb_history_lock); te = td->tcb_history[tid]; if (te != NULL) { mtx_lock(&te->te_lock); return (te); /* with both locks held */ } if (addrem) rw_wunlock(&td->tcb_history_lock); else rw_runlock(&td->tcb_history_lock); return (te); } static inline void release_tcb_histent(struct tcb_histent *te) { struct adapter *sc = te->te_adapter; struct tom_data *td = sc->tom_softc; mtx_assert(&te->te_lock, MA_OWNED); mtx_unlock(&te->te_lock); rw_assert(&td->tcb_history_lock, RA_RLOCKED); rw_runlock(&td->tcb_history_lock); } static void request_tcb(void *arg) { struct tcb_histent *te = arg; mtx_assert(&te->te_lock, MA_OWNED); /* Noone else is supposed to update the histent. */ MPASS(!(te->te_flags & TE_RPL_PENDING)); if (send_get_tcb(te->te_adapter, te->te_tid) == 0) te->te_flags |= TE_RPL_PENDING; else callout_schedule(&te->te_callout, hz / 100); } static void update_tcb_histent(struct tcb_histent *te, const uint64_t *tcb) { struct tom_data *td = te->te_adapter->tom_softc; uint64_t tflags = get_tcb_tflags(tcb); uint8_t sample = 0; if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != GET_TCB_FIELD(tcb, SND_UNA_RAW)) { if (GET_TCB_FIELD(tcb, T_RXTSHIFT) != 0) sample |= TS_RTO; if (GET_TCB_FIELD(tcb, T_DUPACKS) != 0) sample |= TS_DUPACKS; if (GET_TCB_FIELD(tcb, T_DUPACKS) >= td->dupack_threshold) sample |= TS_FASTREXMT; } if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != 0) { uint32_t snd_wnd; sample |= TS_SND_BACKLOGGED; /* for whatever reason. */ snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); if (tflags & V_TF_RECV_SCALE(1)) snd_wnd <<= GET_TCB_FIELD(tcb, RCV_SCALE); if (GET_TCB_FIELD(tcb, SND_CWND) < snd_wnd) sample |= TS_CWND_LIMITED; /* maybe due to CWND */ } if (tflags & V_TF_CCTRL_ECN(1)) { /* * CE marker on incoming IP hdr, echoing ECE back in the TCP * hdr. Indicates congestion somewhere on the way from the peer * to this node. */ if (tflags & V_TF_CCTRL_ECE(1)) sample |= TS_ECN_ECE; /* * ECE seen and CWR sent (or about to be sent). Might indicate * congestion on the way to the peer. This node is reducing its * congestion window in response. 
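 */

/*
 * Each pass through this function folds the current TCB state into a
 * single sample byte; the code just below stores it in the te_sample[]
 * ring and advances te_pidx with wraparound, so the entry always holds
 * the most recent snapshots of the connection's behavior.
 */

/*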
*/ if (tflags & (V_TF_CCTRL_CWR(1) | V_TF_CCTRL_RFR(1))) sample |= TS_ECN_CWR; } te->te_sample[te->te_pidx] = sample; if (++te->te_pidx == nitems(te->te_sample)) te->te_pidx = 0; memcpy(te->te_tcb, tcb, TCB_SIZE); te->te_flags |= TE_ACTIVE; } static int do_get_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_get_tcb_rpl *cpl = mtod(m, const void *); const uint64_t *tcb = (const uint64_t *)(const void *)(cpl + 1); struct tcb_histent *te; const u_int tid = GET_TID(cpl); bool remove; remove = GET_TCB_FIELD(tcb, T_STATE) == TCPS_CLOSED; te = lookup_tcb_histent(sc, tid, remove); if (te == NULL) { /* Not in the history. Who issued the GET_TCB for this? */ device_printf(sc->dev, "tcb %u: flags 0x%016jx, state %u, " "srtt %u, sscale %u, rscale %u, cookie 0x%x\n", tid, (uintmax_t)get_tcb_tflags(tcb), GET_TCB_FIELD(tcb, T_STATE), GET_TCB_FIELD(tcb, T_SRTT), GET_TCB_FIELD(tcb, SND_SCALE), GET_TCB_FIELD(tcb, RCV_SCALE), cpl->cookie); goto done; } MPASS(te->te_flags & TE_RPL_PENDING); te->te_flags &= ~TE_RPL_PENDING; if (remove) { remove_tcb_histent(te); } else { update_tcb_histent(te, tcb); callout_reset(&te->te_callout, hz / 10, request_tcb, te); release_tcb_histent(te); } done: m_freem(m); return (0); } static void fill_tcp_info_from_tcb(struct adapter *sc, uint64_t *tcb, struct tcp_info *ti) { uint32_t v; ti->tcpi_state = GET_TCB_FIELD(tcb, T_STATE); v = GET_TCB_FIELD(tcb, T_SRTT); ti->tcpi_rtt = tcp_ticks_to_us(sc, v); v = GET_TCB_FIELD(tcb, T_RTTVAR); ti->tcpi_rttvar = tcp_ticks_to_us(sc, v); ti->tcpi_snd_ssthresh = GET_TCB_FIELD(tcb, SND_SSTHRESH); ti->tcpi_snd_cwnd = GET_TCB_FIELD(tcb, SND_CWND); ti->tcpi_rcv_nxt = GET_TCB_FIELD(tcb, RCV_NXT); v = GET_TCB_FIELD(tcb, TX_MAX); ti->tcpi_snd_nxt = v - GET_TCB_FIELD(tcb, SND_NXT_RAW); /* Receive window being advertised by us. */ ti->tcpi_rcv_wscale = GET_TCB_FIELD(tcb, SND_SCALE); /* Yes, SND. */ ti->tcpi_rcv_space = GET_TCB_FIELD(tcb, RCV_WND); /* Send window */ ti->tcpi_snd_wscale = GET_TCB_FIELD(tcb, RCV_SCALE); /* Yes, RCV. */ ti->tcpi_snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); if (get_tcb_tflags(tcb) & V_TF_RECV_SCALE(1)) ti->tcpi_snd_wnd <<= ti->tcpi_snd_wscale; else ti->tcpi_snd_wscale = 0; } static void fill_tcp_info_from_history(struct adapter *sc, struct tcb_histent *te, struct tcp_info *ti) { fill_tcp_info_from_tcb(sc, te->te_tcb, ti); } /* * Reads the TCB for the given tid using a memory window and copies it to 'buf' * in the same format as CPL_GET_TCB_RPL. */ static void read_tcb_using_memwin(struct adapter *sc, u_int tid, uint64_t *buf) { int i, j, k, rc; uint32_t addr; u_char *tcb, tmp; MPASS(tid < sc->tids.ntids); addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE; rc = read_via_memwin(sc, 2, addr, (uint32_t *)buf, TCB_SIZE); if (rc != 0) return; tcb = (u_char *)buf; for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) { for (k = 0; k < 16; k++) { tmp = tcb[i + k]; tcb[i + k] = tcb[j + k]; tcb[j + k] = tmp; } } } static void fill_tcp_info(struct adapter *sc, u_int tid, struct tcp_info *ti) { uint64_t tcb[TCB_SIZE / sizeof(uint64_t)]; struct tcb_histent *te; ti->tcpi_toe_tid = tid; te = lookup_tcb_histent(sc, tid, false); if (te != NULL) { fill_tcp_info_from_history(sc, te, ti); release_tcb_histent(te); } else { if (!(sc->debug_flags & DF_DISABLE_TCB_CACHE)) { /* XXX: tell firmware to flush TCB cache. 
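 */

/*
 * Presumably the concern: with the on-chip TCB cache enabled, the
 * memory-window read below could return a stale copy of the TCB, so the
 * cached copy would need to be flushed first for an accurate snapshot.
 * read_tcb_using_memwin() also reverses the buffer in 16-byte chunks so
 * the result matches the CPL_GET_TCB_RPL layout.
 */

/*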
*/ } read_tcb_using_memwin(sc, tid, tcb); fill_tcp_info_from_tcb(sc, tcb, ti); } } /* * Called by the kernel to allow the TOE driver to "refine" values filled up in * the tcp_info for an offloaded connection. */ static void t4_tcp_info(struct toedev *tod, struct tcpcb *tp, struct tcp_info *ti) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(tp->t_inpcb); MPASS(ti != NULL); fill_tcp_info(sc, toep->tid, ti); } #ifdef KERN_TLS static int t4_alloc_tls_session(struct toedev *tod, struct tcpcb *tp, struct ktls_session *tls, int direction) { struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(tp->t_inpcb); MPASS(tls != NULL); return (tls_alloc_ktls(toep, tls, direction)); } #endif /* SET_TCB_FIELD sent as a ULP command looks like this */ #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \ sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core)) static void * mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, uint64_t word, uint64_t mask, uint64_t val, uint32_t tid) { struct ulptx_idata *ulpsc; struct cpl_set_tcb_field_core *req; ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(sizeof(*req)); req = (struct cpl_set_tcb_field_core *)(ulpsc + 1); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); req->reply_ctrl = htobe16(V_NO_REPLY(1)); req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0)); req->mask = htobe64(mask); req->val = htobe64(val); ulpsc = (struct ulptx_idata *)(req + 1); if (LEN__SET_TCB_FIELD_ULP % 16) { ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); ulpsc->len = htobe32(0); return (ulpsc + 1); } return (ulpsc); } static void send_mss_flowc_wr(struct adapter *sc, struct toepcb *toep) { struct wrq_cookie cookie; struct fw_flowc_wr *flowc; struct ofld_tx_sdesc *txsd; const int flowclen = sizeof(*flowc) + sizeof(struct fw_flowc_mnemval); const int flowclen16 = howmany(flowclen, 16); if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0) { CH_ERR(sc, "%s: tid %u out of tx credits (%d, %d).\n", __func__, toep->tid, toep->tx_credits, toep->txsd_avail); return; } flowc = start_wrq_wr(&toep->ofld_txq->wrq, flowclen16, &cookie); if (__predict_false(flowc == NULL)) { CH_ERR(sc, "ENOMEM in %s for tid %u.\n", __func__, toep->tid); return; } flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(1)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | V_FW_WR_FLOWID(toep->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_MSS; flowc->mnemval[0].val = htobe32(toep->params.emss); txsd = &toep->txsd[toep->txsd_pidx]; txsd->tx_credits = flowclen16; txsd->plen = 0; toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; commit_wrq_wr(&toep->ofld_txq->wrq, flowc, &cookie); } static void t4_pmtu_update(struct toedev *tod, struct tcpcb *tp, tcp_seq seq, int mtu) { struct work_request_hdr *wrh; struct ulp_txpkt *ulpmc; int idx, len; struct wrq_cookie cookie; struct inpcb *inp = tp->t_inpcb; struct toepcb *toep = tp->t_toe; struct adapter *sc = td_adapter(toep->td); unsigned short *mtus = &sc->params.mtus[0]; INP_WLOCK_ASSERT(inp); MPASS(mtu > 0); /* kernel is supposed to provide something usable. */ /* tp->snd_una and snd_max are in host byte order too. 
*/ seq = be32toh(seq); CTR6(KTR_CXGBE, "%s: tid %d, seq 0x%08x, mtu %u, mtu_idx %u (%d)", __func__, toep->tid, seq, mtu, toep->params.mtu_idx, mtus[toep->params.mtu_idx]); if (ulp_mode(toep) == ULP_MODE_NONE && /* XXX: Read TCB otherwise? */ (SEQ_LT(seq, tp->snd_una) || SEQ_GEQ(seq, tp->snd_max))) { CTR5(KTR_CXGBE, "%s: tid %d, seq 0x%08x not in range [0x%08x, 0x%08x).", __func__, toep->tid, seq, tp->snd_una, tp->snd_max); return; } /* Find the best mtu_idx for the suggested MTU. */ for (idx = 0; idx < NMTUS - 1 && mtus[idx + 1] <= mtu; idx++) continue; if (idx >= toep->params.mtu_idx) return; /* Never increase the PMTU (just like the kernel). */ /* * We'll send a compound work request with 2 SET_TCB_FIELDs -- the first * one updates the mtu_idx and the second one triggers a retransmit. */ len = sizeof(*wrh) + 2 * roundup2(LEN__SET_TCB_FIELD_ULP, 16); wrh = start_wrq_wr(toep->ctrlq, howmany(len, 16), &cookie); if (wrh == NULL) { CH_ERR(sc, "failed to change mtu_idx of tid %d (%u -> %u).\n", toep->tid, toep->params.mtu_idx, idx); return; } INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */ ulpmc = (struct ulp_txpkt *)(wrh + 1); ulpmc = mk_set_tcb_field_ulp(ulpmc, W_TCB_T_MAXSEG, V_TCB_T_MAXSEG(M_TCB_T_MAXSEG), V_TCB_T_MAXSEG(idx), toep->tid); ulpmc = mk_set_tcb_field_ulp(ulpmc, W_TCB_TIMESTAMP, V_TCB_TIMESTAMP(0x7FFFFULL << 11), 0, toep->tid); commit_wrq_wr(toep->ctrlq, wrh, &cookie); /* Update the software toepcb and tcpcb. */ toep->params.mtu_idx = idx; tp->t_maxseg = mtus[toep->params.mtu_idx]; if (inp->inp_inc.inc_flags & INC_ISIPV6) tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); toep->params.emss = tp->t_maxseg; if (tp->t_flags & TF_RCVD_TSTMP) toep->params.emss -= TCPOLEN_TSTAMP_APPA; /* Update the firmware flowc. */ send_mss_flowc_wr(sc, toep); /* Update the MTU in the kernel's hostcache. */ if (sc->tt.update_hc_on_pmtu_change != 0) { struct in_conninfo inc = {0}; inc.inc_fibnum = inp->inp_inc.inc_fibnum; if (inp->inp_inc.inc_flags & INC_ISIPV6) { inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = inp->inp_inc.inc6_faddr; } else { inc.inc_faddr = inp->inp_inc.inc_faddr; } tcp_hc_updatemtu(&inc, mtu); } CTR6(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), t_maxseg %u, emss %u", __func__, toep->tid, toep->params.mtu_idx, mtus[toep->params.mtu_idx], tp->t_maxseg, toep->params.emss); } /* * The TOE driver will not receive any more CPLs for the tid associated with the * toepcb; release the hold on the inpcb. 
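 */

/*
 * Both t4_pmtu_update() above and find_best_mtu_idx() below walk the
 * adapter's sorted MTU table for the largest entry that still fits.  A
 * standalone sketch of that search plus the header-deduction step that
 * turns an MTU into a TCP maxseg; the table values here are made up:
 */
#if 0	/* illustration only */
#include <stdio.h>

#define NMTUS	16

int
main(void)
{
	/* Hypothetical sorted MTU table, like sc->params.mtus[]. */
	static const unsigned short mtus[NMTUS] = {
		88, 256, 512, 576, 808, 1024, 1280, 1488,
		1500, 2002, 2048, 4096, 4352, 8192, 9000, 9600
	};
	unsigned mtu = 1400;	/* suggested by an ICMP "frag needed" */
	int idx;

	for (idx = 0; idx < NMTUS - 1 && mtus[idx + 1] <= mtu; idx++)
		continue;
	/* IPv4: MTU minus IP and TCP headers gives the usable maxseg. */
	printf("mtu_idx %d -> mtu %u, maxseg %u\n",
	    idx, mtus[idx], mtus[idx] - 20 - 20);
	return (0);
}
#endif

/*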
*/ void final_cpl_received(struct toepcb *toep) { struct inpcb *inp = toep->inp; KASSERT(inp != NULL, ("%s: inp is NULL", __func__)); INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_CPL_PENDING, ("%s: CPL not pending already?", __func__)); CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); if (ulp_mode(toep) == ULP_MODE_TCPDDP) release_ddp_resources(toep); else if (ulp_mode(toep) == ULP_MODE_TLS) tls_detach(toep); toep->inp = NULL; toep->flags &= ~TPF_CPL_PENDING; mbufq_drain(&toep->ulp_pdu_reclaimq); if (!(toep->flags & TPF_ATTACHED)) release_offload_resources(toep); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); } void insert_tid(struct adapter *sc, int tid, void *ctx, int ntids) { struct tid_info *t = &sc->tids; MPASS(tid >= t->tid_base); MPASS(tid - t->tid_base < t->ntids); t->tid_tab[tid - t->tid_base] = ctx; atomic_add_int(&t->tids_in_use, ntids); } void * lookup_tid(struct adapter *sc, int tid) { struct tid_info *t = &sc->tids; return (t->tid_tab[tid - t->tid_base]); } void update_tid(struct adapter *sc, int tid, void *ctx) { struct tid_info *t = &sc->tids; t->tid_tab[tid - t->tid_base] = ctx; } void remove_tid(struct adapter *sc, int tid, int ntids) { struct tid_info *t = &sc->tids; t->tid_tab[tid - t->tid_base] = NULL; atomic_subtract_int(&t->tids_in_use, ntids); } /* * What mtu_idx to use, given a 4-tuple. Note that both s->mss and tcp_mssopt * have the MSS that we should advertise in our SYN. Advertised MSS doesn't * account for any TCP options so the effective MSS (only payload, no headers or * options) could be different. */ static int find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, struct offload_settings *s) { unsigned short *mtus = &sc->params.mtus[0]; int i, mss, mtu; MPASS(inc != NULL); mss = s->mss > 0 ? s->mss : tcp_mssopt(inc); if (inc->inc_flags & INC_ISIPV6) mtu = mss + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else mtu = mss + sizeof(struct ip) + sizeof(struct tcphdr); for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++) continue; return (i); } /* * Determine the receive window size for a socket. 
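 */

/*
 * select_rcv_wnd() below clamps the socket's receive space into the
 * [MIN_RCV_WND, MAX_RCV_WND] range, and select_rcv_wscale() picks the
 * smallest shift that lets a 16-bit window advertise the buffer.  A
 * runnable sketch of the wscale computation; the cap is an assumed value:
 */
#if 0	/* illustration only */
#include <stdio.h>

#define TCP_MAXWIN		65535
#define TCP_MAX_WINSHIFT	14
#define MAX_RCV_WND_DEMO	(1024 * 1024)	/* assumed 1MB cap */

static int
rcv_wscale(unsigned long space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND_DEMO)
		space = MAX_RCV_WND_DEMO;
	while (wscale < TCP_MAX_WINSHIFT &&
	    ((unsigned long)TCP_MAXWIN << wscale) < space)
		wscale++;
	return (wscale);
}

int
main(void)
{
	/* 200KB of buffer needs a shift of 2: 65535 << 2 >= 204800. */
	printf("wscale for 200KB: %d\n", rcv_wscale(200 * 1024));
	return (0);
}
#endif

/*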
*/ u_long select_rcv_wnd(struct socket *so) { unsigned long wnd; SOCKBUF_LOCK_ASSERT(&so->so_rcv); wnd = sbspace(&so->so_rcv); if (wnd < MIN_RCV_WND) wnd = MIN_RCV_WND; return min(wnd, MAX_RCV_WND); } int select_rcv_wscale(void) { int wscale = 0; unsigned long space = sb_max; if (space > MAX_RCV_WND) space = MAX_RCV_WND; while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) wscale++; return (wscale); } __be64 calc_options0(struct vi_info *vi, struct conn_params *cp) { uint64_t opt0 = 0; opt0 |= F_TCAM_BYPASS; MPASS(cp->wscale >= 0 && cp->wscale <= M_WND_SCALE); opt0 |= V_WND_SCALE(cp->wscale); MPASS(cp->mtu_idx >= 0 && cp->mtu_idx < NMTUS); opt0 |= V_MSS_IDX(cp->mtu_idx); MPASS(cp->ulp_mode >= 0 && cp->ulp_mode <= M_ULP_MODE); opt0 |= V_ULP_MODE(cp->ulp_mode); MPASS(cp->opt0_bufsize >= 0 && cp->opt0_bufsize <= M_RCV_BUFSIZ); opt0 |= V_RCV_BUFSIZ(cp->opt0_bufsize); MPASS(cp->l2t_idx >= 0 && cp->l2t_idx < vi->adapter->vres.l2t.size); opt0 |= V_L2T_IDX(cp->l2t_idx); opt0 |= V_SMAC_SEL(vi->smt_idx); opt0 |= V_TX_CHAN(vi->pi->tx_chan); MPASS(cp->keepalive == 0 || cp->keepalive == 1); opt0 |= V_KEEP_ALIVE(cp->keepalive); MPASS(cp->nagle == 0 || cp->nagle == 1); opt0 |= V_NAGLE(cp->nagle); return (htobe64(opt0)); } __be32 calc_options2(struct vi_info *vi, struct conn_params *cp) { uint32_t opt2 = 0; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; /* * rx flow control, rx coalesce, congestion control, and tx pace are all * explicitly set by the driver. On T5+ the ISS is also set by the * driver to the value picked by the kernel. */ if (is_t4(sc)) { opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID; opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID; } else { opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */ opt2 |= F_T5_ISS; /* ISS provided in CPL */ } MPASS(cp->sack == 0 || cp->sack == 1); opt2 |= V_SACK_EN(cp->sack); MPASS(cp->tstamp == 0 || cp->tstamp == 1); opt2 |= V_TSTAMPS_EN(cp->tstamp); if (cp->wscale > 0) opt2 |= F_WND_SCALE_EN; MPASS(cp->ecn == 0 || cp->ecn == 1); opt2 |= V_CCTRL_ECN(cp->ecn); /* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */ opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); opt2 |= V_PACE(0); opt2 |= F_RSS_QUEUE_VALID; opt2 |= V_RSS_QUEUE(sc->sge.ofld_rxq[cp->rxq_idx].iq.abs_id); MPASS(cp->cong_algo >= 0 && cp->cong_algo <= M_CONG_CNTRL); opt2 |= V_CONG_CNTRL(cp->cong_algo); MPASS(cp->rx_coalesce == 0 || cp->rx_coalesce == 1); if (cp->rx_coalesce == 1) opt2 |= V_RX_COALESCE(M_RX_COALESCE); opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0); #ifdef USE_DDP_RX_FLOW_CONTROL if (cp->ulp_mode == ULP_MODE_TCPDDP) opt2 |= F_RX_FC_DDP; #endif return (htobe32(opt2)); } uint64_t select_ntuple(struct vi_info *vi, struct l2t_entry *e) { struct adapter *sc = vi->adapter; struct tp_params *tp = &sc->params.tp; uint64_t ntuple = 0; /* * Initialize each of the fields which we care about which are present * in the Compressed Filter Tuple. 
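 */

/*
 * The shifts consulted below (vlan_shift, port_shift, protocol_shift,
 * vnic_shift) come from the per-chip TP configuration; a negative shift
 * means the field is not part of this adapter's compressed tuple and is
 * simply skipped.  T4 returns the tuple as a 32-bit value, later chips
 * as a 64-bit FILTER_TUPLE.
 */

/*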
*/ if (tp->vlan_shift >= 0 && EVL_VLANOFTAG(e->vlan) != CPL_L2T_VLAN_NONE) ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift; if (tp->port_shift >= 0) ntuple |= (uint64_t)e->lport << tp->port_shift; if (tp->protocol_shift >= 0) ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift; if (tp->vnic_shift >= 0 && tp->vnic_mode == FW_VNIC_MODE_PF_VF) { ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vi->vin) | V_FT_VNID_ID_PF(sc->pf) | V_FT_VNID_ID_VLD(vi->vfvld)) << tp->vnic_shift; } if (is_t4(sc)) return (htobe32((uint32_t)ntuple)); else return (htobe64(V_FILTER_TUPLE(ntuple))); } static int is_tls_sock(struct socket *so, struct adapter *sc) { struct inpcb *inp = sotoinpcb(so); int i, rc; /* XXX: Eventually add a SO_WANT_TLS socket option perhaps? */ rc = 0; ADAPTER_LOCK(sc); for (i = 0; i < sc->tt.num_tls_rx_ports; i++) { if (inp->inp_lport == htons(sc->tt.tls_rx_ports[i]) || inp->inp_fport == htons(sc->tt.tls_rx_ports[i])) { rc = 1; break; } } ADAPTER_UNLOCK(sc); return (rc); } /* * Initialize various connection parameters. */ void init_conn_params(struct vi_info *vi , struct offload_settings *s, struct in_conninfo *inc, struct socket *so, const struct tcp_options *tcpopt, int16_t l2t_idx, struct conn_params *cp) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct tom_tunables *tt = &sc->tt; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); u_long wnd; MPASS(s->offload != 0); /* Congestion control algorithm */ if (s->cong_algo >= 0) cp->cong_algo = s->cong_algo & M_CONG_CNTRL; else if (sc->tt.cong_algorithm >= 0) cp->cong_algo = tt->cong_algorithm & M_CONG_CNTRL; else { struct cc_algo *cc = CC_ALGO(tp); if (strcasecmp(cc->name, "reno") == 0) cp->cong_algo = CONG_ALG_RENO; else if (strcasecmp(cc->name, "tahoe") == 0) cp->cong_algo = CONG_ALG_TAHOE; if (strcasecmp(cc->name, "newreno") == 0) cp->cong_algo = CONG_ALG_NEWRENO; if (strcasecmp(cc->name, "highspeed") == 0) cp->cong_algo = CONG_ALG_HIGHSPEED; else { /* * Use newreno in case the algorithm selected by the * host stack is not supported by the hardware. */ cp->cong_algo = CONG_ALG_NEWRENO; } } /* Tx traffic scheduling class. */ if (s->sched_class >= 0 && s->sched_class < sc->chip_params->nsched_cls) { cp->tc_idx = s->sched_class; } else cp->tc_idx = -1; /* Nagle's algorithm. */ if (s->nagle >= 0) cp->nagle = s->nagle > 0 ? 1 : 0; else cp->nagle = tp->t_flags & TF_NODELAY ? 0 : 1; /* TCP Keepalive. */ if (V_tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE) cp->keepalive = 1; else cp->keepalive = 0; /* Optimization that's specific to T5 @ 40G. */ if (tt->tx_align >= 0) cp->tx_align = tt->tx_align > 0 ? 1 : 0; else if (chip_id(sc) == CHELSIO_T5 && (port_top_speed(pi) > 10 || sc->params.nports > 2)) cp->tx_align = 1; else cp->tx_align = 0; /* ULP mode. */ if (can_tls_offload(sc) && (s->tls > 0 || (s->tls < 0 && is_tls_sock(so, sc)))) cp->ulp_mode = ULP_MODE_TLS; else if (s->ddp > 0 || (s->ddp < 0 && sc->tt.ddp && (so_options_get(so) & SO_NO_DDP) == 0)) cp->ulp_mode = ULP_MODE_TCPDDP; else cp->ulp_mode = ULP_MODE_NONE; /* Rx coalescing. */ if (s->rx_coalesce >= 0) cp->rx_coalesce = s->rx_coalesce > 0 ? 1 : 0; else if (cp->ulp_mode == ULP_MODE_TLS) cp->rx_coalesce = 0; else if (tt->rx_coalesce >= 0) cp->rx_coalesce = tt->rx_coalesce > 0 ? 1 : 0; else cp->rx_coalesce = 1; /* default */ /* * Index in the PMTU table. This controls the MSS that we announce in * our SYN initially, but after ESTABLISHED it controls the MSS that we * use to send data. 
*/ cp->mtu_idx = find_best_mtu_idx(sc, inc, s); /* Tx queue for this connection. */ if (s->txq >= 0 && s->txq < vi->nofldtxq) cp->txq_idx = s->txq; else cp->txq_idx = arc4random() % vi->nofldtxq; cp->txq_idx += vi->first_ofld_txq; /* Rx queue for this connection. */ if (s->rxq >= 0 && s->rxq < vi->nofldrxq) cp->rxq_idx = s->rxq; else cp->rxq_idx = arc4random() % vi->nofldrxq; cp->rxq_idx += vi->first_ofld_rxq; if (SOLISTENING(so)) { /* Passive open */ MPASS(tcpopt != NULL); /* TCP timestamp option */ if (tcpopt->tstamp && (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323))) cp->tstamp = 1; else cp->tstamp = 0; /* SACK */ if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_sack))) cp->sack = 1; else cp->sack = 0; /* Receive window scaling. */ if (tcpopt->wsf > 0 && tcpopt->wsf < 15 && V_tcp_do_rfc1323) cp->wscale = select_rcv_wscale(); else cp->wscale = 0; /* ECN */ if (tcpopt->ecn && /* XXX: review. */ (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn))) cp->ecn = 1; else cp->ecn = 0; wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND); cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ); if (tt->sndbuf > 0) cp->sndbuf = tt->sndbuf; else if (so->sol_sbsnd_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) cp->sndbuf = 256 * 1024; else cp->sndbuf = so->sol_sbsnd_hiwat; } else { /* Active open */ /* TCP timestamp option */ if (s->tstamp > 0 || (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP))) cp->tstamp = 1; else cp->tstamp = 0; /* SACK */ if (s->sack > 0 || (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT))) cp->sack = 1; else cp->sack = 0; /* Receive window scaling */ if (tp->t_flags & TF_REQ_SCALE) cp->wscale = select_rcv_wscale(); else cp->wscale = 0; /* ECN */ if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1)) cp->ecn = 1; else cp->ecn = 0; SOCKBUF_LOCK(&so->so_rcv); wnd = max(select_rcv_wnd(so), MIN_RCV_WND); SOCKBUF_UNLOCK(&so->so_rcv); cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ); if (tt->sndbuf > 0) cp->sndbuf = tt->sndbuf; else { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) cp->sndbuf = 256 * 1024; else cp->sndbuf = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); } } cp->l2t_idx = l2t_idx; /* This will be initialized on ESTABLISHED. 
*/ cp->emss = 0; } int negative_advice(int status) { return (status == CPL_ERR_RTX_NEG_ADVICE || status == CPL_ERR_PERSIST_NEG_ADVICE || status == CPL_ERR_KEEPALV_NEG_ADVICE); } static int alloc_tid_tab(struct tid_info *t, int flags) { MPASS(t->ntids > 0); MPASS(t->tid_tab == NULL); t->tid_tab = malloc(t->ntids * sizeof(*t->tid_tab), M_CXGBE, M_ZERO | flags); if (t->tid_tab == NULL) return (ENOMEM); atomic_store_rel_int(&t->tids_in_use, 0); return (0); } static void free_tid_tab(struct tid_info *t) { KASSERT(t->tids_in_use == 0, ("%s: %d tids still in use.", __func__, t->tids_in_use)); free(t->tid_tab, M_CXGBE); t->tid_tab = NULL; } static int alloc_stid_tab(struct tid_info *t, int flags) { MPASS(t->nstids > 0); MPASS(t->stid_tab == NULL); t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE, M_ZERO | flags); if (t->stid_tab == NULL) return (ENOMEM); mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF); t->stids_in_use = 0; TAILQ_INIT(&t->stids); t->nstids_free_head = t->nstids; return (0); } static void free_stid_tab(struct tid_info *t) { KASSERT(t->stids_in_use == 0, ("%s: %d tids still in use.", __func__, t->stids_in_use)); if (mtx_initialized(&t->stid_lock)) mtx_destroy(&t->stid_lock); free(t->stid_tab, M_CXGBE); t->stid_tab = NULL; } static void free_tid_tabs(struct tid_info *t) { free_tid_tab(t); free_stid_tab(t); } static int alloc_tid_tabs(struct tid_info *t) { int rc; rc = alloc_tid_tab(t, M_NOWAIT); if (rc != 0) goto failed; rc = alloc_stid_tab(t, M_NOWAIT); if (rc != 0) goto failed; return (0); failed: free_tid_tabs(t); return (rc); } static inline void alloc_tcb_history(struct adapter *sc, struct tom_data *td) { if (sc->tids.ntids == 0 || sc->tids.ntids > 1024) return; rw_init(&td->tcb_history_lock, "TCB history"); td->tcb_history = malloc(sc->tids.ntids * sizeof(*td->tcb_history), M_CXGBE, M_ZERO | M_NOWAIT); td->dupack_threshold = G_DUPACKTHRESH(t4_read_reg(sc, A_TP_PARA_REG0)); } static inline void free_tcb_history(struct adapter *sc, struct tom_data *td) { #ifdef INVARIANTS int i; if (td->tcb_history != NULL) { for (i = 0; i < sc->tids.ntids; i++) { MPASS(td->tcb_history[i] == NULL); } } #endif free(td->tcb_history, M_CXGBE); if (rw_initialized(&td->tcb_history_lock)) rw_destroy(&td->tcb_history_lock); } static void free_tom_data(struct adapter *sc, struct tom_data *td) { ASSERT_SYNCHRONIZED_OP(sc); KASSERT(TAILQ_EMPTY(&td->toep_list), ("%s: TOE PCB list is not empty.", __func__)); KASSERT(td->lctx_count == 0, ("%s: lctx hash table is not empty.", __func__)); t4_free_ppod_region(&td->pr); if (td->listen_mask != 0) hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask); if (mtx_initialized(&td->unsent_wr_lock)) mtx_destroy(&td->unsent_wr_lock); if (mtx_initialized(&td->lctx_hash_lock)) mtx_destroy(&td->lctx_hash_lock); if (mtx_initialized(&td->toep_list_lock)) mtx_destroy(&td->toep_list_lock); free_tcb_history(sc, td); free_tid_tabs(&sc->tids); free(td, M_CXGBE); } static char * prepare_pkt(int open_type, uint16_t vtag, struct inpcb *inp, int *pktlen, int *buflen) { char *pkt; struct tcphdr *th; int ipv6, len; const int maxlen = max(sizeof(struct ether_header), sizeof(struct ether_vlan_header)) + max(sizeof(struct ip), sizeof(struct ip6_hdr)) + sizeof(struct tcphdr); MPASS(open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN); pkt = malloc(maxlen, M_CXGBE, M_ZERO | M_NOWAIT); if (pkt == NULL) return (NULL); ipv6 = inp->inp_vflag & INP_IPV6; len = 0; if (EVL_VLANOFTAG(vtag) == 0xfff) { struct ether_header *eh = (void *)pkt; if (ipv6) eh->ether_type = 
htons(ETHERTYPE_IPV6); else eh->ether_type = htons(ETHERTYPE_IP); len += sizeof(*eh); } else { struct ether_vlan_header *evh = (void *)pkt; evh->evl_encap_proto = htons(ETHERTYPE_VLAN); evh->evl_tag = htons(vtag); if (ipv6) evh->evl_proto = htons(ETHERTYPE_IPV6); else evh->evl_proto = htons(ETHERTYPE_IP); len += sizeof(*evh); } if (ipv6) { struct ip6_hdr *ip6 = (void *)&pkt[len]; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_nxt = IPPROTO_TCP; if (open_type == OPEN_TYPE_ACTIVE) { ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } else if (open_type == OPEN_TYPE_LISTEN) { ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = ip6->ip6_src; } len += sizeof(*ip6); } else { struct ip *ip = (void *)&pkt[len]; ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr)); ip->ip_ttl = inp->inp_ip_ttl; ip->ip_p = IPPROTO_TCP; if (open_type == OPEN_TYPE_ACTIVE) { ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } else if (open_type == OPEN_TYPE_LISTEN) { ip->ip_src = inp->inp_laddr; ip->ip_dst = ip->ip_src; } len += sizeof(*ip); } th = (void *)&pkt[len]; if (open_type == OPEN_TYPE_ACTIVE) { th->th_sport = inp->inp_lport; /* network byte order already */ th->th_dport = inp->inp_fport; /* ditto */ } else if (open_type == OPEN_TYPE_LISTEN) { th->th_sport = inp->inp_lport; /* network byte order already */ th->th_dport = th->th_sport; } len += sizeof(th); *pktlen = *buflen = len; return (pkt); } const struct offload_settings * lookup_offload_policy(struct adapter *sc, int open_type, struct mbuf *m, uint16_t vtag, struct inpcb *inp) { const struct t4_offload_policy *op; char *pkt; struct offload_rule *r; int i, matched, pktlen, buflen; static const struct offload_settings allow_offloading_settings = { .offload = 1, .rx_coalesce = -1, .cong_algo = -1, .sched_class = -1, .tstamp = -1, .sack = -1, .nagle = -1, .ecn = -1, .ddp = -1, .tls = -1, .txq = -1, .rxq = -1, .mss = -1, }; static const struct offload_settings disallow_offloading_settings = { .offload = 0, /* rest is irrelevant when offload is off. */ }; rw_assert(&sc->policy_lock, RA_LOCKED); /* * If there's no Connection Offloading Policy attached to the device * then we need to return a default static policy. If * "cop_managed_offloading" is true, then we need to disallow * offloading until a COP is attached to the device. Otherwise we * allow offloading ... */ op = sc->policy; if (op == NULL) { if (sc->tt.cop_managed_offloading) return (&disallow_offloading_settings); else return (&allow_offloading_settings); } switch (open_type) { case OPEN_TYPE_ACTIVE: case OPEN_TYPE_LISTEN: pkt = prepare_pkt(open_type, vtag, inp, &pktlen, &buflen); break; case OPEN_TYPE_PASSIVE: MPASS(m != NULL); pkt = mtod(m, char *); MPASS(*pkt == CPL_PASS_ACCEPT_REQ); pkt += sizeof(struct cpl_pass_accept_req); pktlen = m->m_pkthdr.len - sizeof(struct cpl_pass_accept_req); buflen = m->m_len - sizeof(struct cpl_pass_accept_req); break; default: MPASS(0); return (&disallow_offloading_settings); } if (pkt == NULL || pktlen == 0 || buflen == 0) return (&disallow_offloading_settings); matched = 0; r = &op->rule[0]; for (i = 0; i < op->nrules; i++, r++) { if (r->open_type != open_type && r->open_type != OPEN_TYPE_DONTCARE) { continue; } matched = bpf_filter(r->bpf_prog.bf_insns, pkt, pktlen, buflen); if (matched) break; } if (open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN) free(pkt, M_CXGBE); return (matched ? 
&r->settings : &disallow_offloading_settings); } static void reclaim_wr_resources(void *arg, int count) { struct tom_data *td = arg; STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list); struct cpl_act_open_req *cpl; u_int opcode, atid, tid; struct wrqe *wr; struct adapter *sc = td_adapter(td); mtx_lock(&td->unsent_wr_lock); STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe); mtx_unlock(&td->unsent_wr_lock); while ((wr = STAILQ_FIRST(&twr_list)) != NULL) { STAILQ_REMOVE_HEAD(&twr_list, link); cpl = wrtod(wr); opcode = GET_OPCODE(cpl); switch (opcode) { case CPL_ACT_OPEN_REQ: case CPL_ACT_OPEN_REQ6: atid = G_TID_TID(be32toh(OPCODE_TID(cpl))); CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid); act_open_failure_cleanup(sc, atid, EHOSTUNREACH); free(wr, M_CXGBE); break; case CPL_PASS_ACCEPT_RPL: tid = GET_TID(cpl); CTR2(KTR_CXGBE, "%s: tid %u ", __func__, tid); synack_failure_cleanup(sc, tid); free(wr, M_CXGBE); break; default: log(LOG_ERR, "%s: leaked work request %p, wr_len %d, " "opcode %x\n", __func__, wr, wr->wr_len, opcode); /* WR not freed here; go look at it with a debugger. */ } } } /* * Ground control to Major TOM * Commencing countdown, engines on */ static int t4_tom_activate(struct adapter *sc) { struct tom_data *td; struct toedev *tod; struct vi_info *vi; int i, rc, v; ASSERT_SYNCHRONIZED_OP(sc); /* per-adapter softc for TOM */ td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT); if (td == NULL) return (ENOMEM); /* List of TOE PCBs and associated lock */ mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF); TAILQ_INIT(&td->toep_list); /* Listen context */ mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF); td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE, &td->listen_mask, HASH_NOWAIT); /* List of WRs for which L2 resolution failed */ mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF); STAILQ_INIT(&td->unsent_wr_list); TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td); /* TID tables */ rc = alloc_tid_tabs(&sc->tids); if (rc != 0) goto done; rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp, t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods"); if (rc != 0) goto done; t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK, V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask); alloc_tcb_history(sc, td); /* toedev ops */ tod = &td->tod; init_toedev(tod); tod->tod_softc = sc; tod->tod_connect = t4_connect; tod->tod_listen_start = t4_listen_start; tod->tod_listen_stop = t4_listen_stop; tod->tod_rcvd = t4_rcvd; tod->tod_output = t4_tod_output; tod->tod_send_rst = t4_send_rst; tod->tod_send_fin = t4_send_fin; tod->tod_pcb_detach = t4_pcb_detach; tod->tod_l2_update = t4_l2_update; tod->tod_syncache_added = t4_syncache_added; tod->tod_syncache_removed = t4_syncache_removed; tod->tod_syncache_respond = t4_syncache_respond; tod->tod_offload_socket = t4_offload_socket; tod->tod_ctloutput = t4_ctloutput; tod->tod_tcp_info = t4_tcp_info; #ifdef KERN_TLS tod->tod_alloc_tls_session = t4_alloc_tls_session; #endif tod->tod_pmtu_update = t4_pmtu_update; for_each_port(sc, i) { for_each_vi(sc->port[i], v, vi) { TOEDEV(vi->ifp) = &td->tod; } } sc->tom_softc = td; register_toedev(sc->tom_softc); done: if (rc != 0) free_tom_data(sc, td); return (rc); } static int t4_tom_deactivate(struct adapter *sc) { int rc = 0; struct tom_data *td = sc->tom_softc; ASSERT_SYNCHRONIZED_OP(sc); if (td == NULL) return (0); /* XXX. KASSERT? 
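 */

/*
 * Deactivation is all-or-nothing: any port with TOE still enabled, any
 * dependent ULD (iWARP or iSCSI), any live offloaded connection or
 * listener, or any unsent work request leaves the TOM in place and
 * returns EBUSY.
 */

/*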
*/ if (sc->offload_map != 0) return (EBUSY); /* at least one port has IFCAP_TOE enabled */ if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI)) return (EBUSY); /* both iWARP and iSCSI rely on the TOE. */ mtx_lock(&td->toep_list_lock); if (!TAILQ_EMPTY(&td->toep_list)) rc = EBUSY; mtx_unlock(&td->toep_list_lock); mtx_lock(&td->lctx_hash_lock); if (td->lctx_count > 0) rc = EBUSY; mtx_unlock(&td->lctx_hash_lock); taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources); mtx_lock(&td->unsent_wr_lock); if (!STAILQ_EMPTY(&td->unsent_wr_list)) rc = EBUSY; mtx_unlock(&td->unsent_wr_lock); if (rc == 0) { unregister_toedev(sc->tom_softc); free_tom_data(sc, td); sc->tom_softc = NULL; } return (rc); } static int t4_aio_queue_tom(struct socket *so, struct kaiocb *job) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; int error; if (ulp_mode(toep) == ULP_MODE_TCPDDP) { error = t4_aio_queue_ddp(so, job); if (error != EOPNOTSUPP) return (error); } return (t4_aio_queue_aiotx(so, job)); } static int t4_ctloutput_tom(struct socket *so, struct sockopt *sopt) { if (sopt->sopt_level != IPPROTO_TCP) return (tcp_ctloutput(so, sopt)); switch (sopt->sopt_name) { case TCP_TLSOM_SET_TLS_CONTEXT: case TCP_TLSOM_GET_TLS_TOM: case TCP_TLSOM_CLR_TLS_TOM: case TCP_TLSOM_CLR_QUIES: return (t4_ctloutput_tls(so, sopt)); default: return (tcp_ctloutput(so, sopt)); } } static int t4_tom_mod_load(void) { /* CPL handlers */ t4_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl2, CPL_COOKIE_TOM); t4_init_connect_cpl_handlers(); t4_init_listen_cpl_handlers(); t4_init_cpl_io_handlers(); t4_ddp_mod_load(); t4_tls_mod_load(); tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM); if (tcp_protosw == NULL) return (ENOPROTOOPT); bcopy(tcp_protosw, &toe_protosw, sizeof(toe_protosw)); bcopy(tcp_protosw->pr_usrreqs, &toe_usrreqs, sizeof(toe_usrreqs)); toe_usrreqs.pru_aio_queue = t4_aio_queue_tom; toe_protosw.pr_ctloutput = t4_ctloutput_tom; toe_protosw.pr_usrreqs = &toe_usrreqs; tcp6_protosw = pffindproto(PF_INET6, IPPROTO_TCP, SOCK_STREAM); if (tcp6_protosw == NULL) return (ENOPROTOOPT); bcopy(tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw)); bcopy(tcp6_protosw->pr_usrreqs, &toe6_usrreqs, sizeof(toe6_usrreqs)); toe6_usrreqs.pru_aio_queue = t4_aio_queue_tom; toe6_protosw.pr_ctloutput = t4_ctloutput_tom; toe6_protosw.pr_usrreqs = &toe6_usrreqs; return (t4_register_uld(&tom_uld_info)); } static void tom_uninit(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun")) return; /* Try to free resources (works only if no port has IFCAP_TOE) */ if (uld_active(sc, ULD_TOM)) t4_deactivate_uld(sc, ULD_TOM); end_synchronized_op(sc, 0); } static int t4_tom_mod_unload(void) { t4_iterate(tom_uninit, NULL); if (t4_unregister_uld(&tom_uld_info) == EBUSY) return (EBUSY); t4_tls_mod_unload(); t4_ddp_mod_unload(); t4_uninit_connect_cpl_handlers(); t4_uninit_listen_cpl_handlers(); t4_uninit_cpl_io_handlers(); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, NULL, CPL_COOKIE_TOM); t4_register_cpl_handler(CPL_GET_TCB_RPL, NULL); return (0); } #endif /* TCP_OFFLOAD */ static int t4_tom_modevent(module_t mod, int cmd, void *arg) { int rc = 0; #ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: rc = t4_tom_mod_load(); break; case MOD_UNLOAD: rc = t4_tom_mod_unload(); break; default: rc = EINVAL; } #else printf("t4_tom: compiled without TCP_OFFLOAD support.\n"); rc = EOPNOTSUPP; #endif return (rc); } 
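
/*
 * t4_tom_mod_load() above hooks into the socket layer by cloning the stock
 * TCP protosw/usrreqs tables and overriding just two entries.  A compact
 * sketch of that copy-then-override pattern with a hypothetical ops table:
 */
#if 0	/* illustration only */
#include <stdio.h>
#include <string.h>

struct ops {
	int	(*ctloutput)(void);
	int	(*aio_queue)(void);
};

static int stock_ctloutput(void) { return (printf("stock ctloutput\n")); }
static int stock_aio(void) { return (printf("stock aio\n")); }
static int tom_aio(void) { return (printf("TOM aio_queue\n")); }

static const struct ops stock_ops = { stock_ctloutput, stock_aio };
static struct ops toe_ops;

int
main(void)
{
	/* Start from the stock table, then override selected entries. */
	memcpy(&toe_ops, &stock_ops, sizeof(toe_ops));
	toe_ops.aio_queue = tom_aio;

	toe_ops.ctloutput();	/* falls through to the stock handler */
	toe_ops.aio_queue();	/* TOM-specific handler */
	return (0);
}
#endif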
static moduledata_t t4_tom_moddata = {
	"t4_tom",
	t4_tom_modevent,
	0
};

MODULE_VERSION(t4_tom, 1);
MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
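
/*
 * Note: assuming standard kld semantics, the MODULE_DEPEND declarations
 * above let the kernel linker pull in toecore and t4nex automatically
 * when t4_tom is loaded, and keep them pinned while t4_tom remains
 * loaded.
 */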