diff --git a/sys/dev/cxgb/common/cxgb_ctl_defs.h b/sys/dev/cxgb/common/cxgb_ctl_defs.h index 188e54fb0fc2..369debe34e29 100644 --- a/sys/dev/cxgb/common/cxgb_ctl_defs.h +++ b/sys/dev/cxgb/common/cxgb_ctl_defs.h @@ -1,158 +1,158 @@ /* * Copyright (C) 2003-2006 Chelsio Communications. All rights reserved. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE file included in this * release for licensing terms and conditions. * * $FreeBSD$ */ #ifndef _CXGB3_OFFLOAD_CTL_DEFS_H #define _CXGB3_OFFLOAD_CTL_DEFS_H enum { GET_MAX_OUTSTANDING_WR, GET_TX_MAX_CHUNK, GET_TID_RANGE, GET_STID_RANGE, GET_RTBL_RANGE, GET_L2T_CAPACITY, GET_MTUS, GET_WR_LEN, GET_IFF_FROM_MAC, GET_DDP_PARAMS, GET_PORTS, ULP_ISCSI_GET_PARAMS, ULP_ISCSI_SET_PARAMS, RDMA_GET_PARAMS, RDMA_CQ_OP, RDMA_CQ_SETUP, RDMA_CQ_DISABLE, RDMA_CTRL_QP_SETUP, RDMA_GET_MEM, FAILOVER = 30, FAILOVER_DONE = 31, FAILOVER_CLEAR = 32, GET_CPUIDX_OF_QSET = 40, GET_RX_PAGE_INFO = 50, }; /* * Structure used to describe a TID range. Valid TIDs are [base, base+num). */ struct tid_range { unsigned int base; /* first TID */ unsigned int num; /* number of TIDs in range */ }; /* * Structure used to request the size and contents of the MTU table. */ struct mtutab { unsigned int size; /* # of entries in the MTU table */ const unsigned short *mtus; /* the MTU table values */ }; /* * Structure used to request the ifnet that owns a given MAC address. */ struct iff_mac { - struct ifnet *dev; + if_t dev; const unsigned char *mac_addr; u16 vlan_tag; }; struct pci_dev; /* * Structure used to request the TCP DDP parameters. */ struct ddp_params { unsigned int llimit; /* TDDP region start address */ unsigned int ulimit; /* TDDP region end address */ unsigned int tag_mask; /* TDDP tag mask */ struct pci_dev *pdev; }; struct adap_ports { unsigned int nports; /* number of ports on this adapter */ - struct ifnet *lldevs[MAX_NPORTS]; + if_t devs[MAX_NPORTS]; }; /* * Structure used to return information to the iscsi layer. */ struct ulp_iscsi_info { unsigned int offset; unsigned int llimit; unsigned int ulimit; unsigned int tagmask; unsigned int pgsz3; unsigned int pgsz2; unsigned int pgsz1; unsigned int pgsz0; unsigned int max_rxsz; unsigned int max_txsz; struct pci_dev *pdev; }; /* * Offload TX/RX page information. */ struct ofld_page_info { unsigned int page_size; /* Page size, should be a power of 2 */ unsigned int num; /* Number of pages */ }; /* * Structure used to return information to the RDMA layer. */ struct rdma_info { unsigned int tpt_base; /* TPT base address */ unsigned int tpt_top; /* TPT last entry address */ unsigned int pbl_base; /* PBL base address */ unsigned int pbl_top; /* PBL last entry address */ unsigned int rqt_base; /* RQT base address */ unsigned int rqt_top; /* RQT last entry address */ unsigned int udbell_len; /* user doorbell region length */ unsigned long udbell_physbase; /* user doorbell physical start addr */ void *kdb_addr; /* kernel doorbell register address */ device_t pdev; /* associated PCI device */ }; /* * Structure used to request an operation on an RDMA completion queue. */ struct rdma_cq_op { unsigned int id; unsigned int op; unsigned int credits; }; /* * Structure used to setup RDMA completion queues. 
*/ struct rdma_cq_setup { unsigned int id; unsigned long long base_addr; unsigned int size; unsigned int credits; unsigned int credit_thres; unsigned int ovfl_mode; }; /* * Structure used to setup the RDMA control egress context. */ struct rdma_ctrlqp_setup { unsigned long long base_addr; unsigned int size; }; #endif /* _CXGB3_OFFLOAD_CTL_DEFS_H */ diff --git a/sys/dev/cxgb/cxgb_adapter.h b/sys/dev/cxgb/cxgb_adapter.h index ae5fa9f357c8..04ba7e056b31 100644 --- a/sys/dev/cxgb/cxgb_adapter.h +++ b/sys/dev/cxgb/cxgb_adapter.h @@ -1,563 +1,563 @@ /************************************************************************** SPDX-License-Identifier: BSD-2-Clause-FreeBSD Copyright (c) 2007-2009, Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
$FreeBSD$ ***************************************************************************/ #ifndef _CXGB_ADAPTER_H_ #define _CXGB_ADAPTER_H_ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct adapter; struct sge_qset; extern int cxgb_debug; #ifdef DEBUG_LOCKING #define MTX_INIT(lock, lockname, class, flags) \ do { \ printf("initializing %s at %s:%d\n", lockname, __FILE__, __LINE__); \ mtx_init((lock), lockname, class, flags); \ } while (0) #define MTX_DESTROY(lock) \ do { \ printf("destroying %s at %s:%d\n", (lock)->lock_object.lo_name, __FILE__, __LINE__); \ mtx_destroy((lock)); \ } while (0) #else #define MTX_INIT mtx_init #define MTX_DESTROY mtx_destroy #endif enum { LF_NO = 0, LF_MAYBE, LF_YES }; struct port_info { struct adapter *adapter; - struct ifnet *ifp; + if_t ifp; int if_flags; int flags; const struct port_type_info *port_type; struct cphy phy; struct cmac mac; struct timeval last_refreshed; struct link_config link_config; struct ifmedia media; struct mtx lock; uint32_t port_id; uint32_t tx_chan; uint32_t txpkt_intf; uint32_t first_qset; uint32_t nqsets; int link_fault; uint8_t hw_addr[ETHER_ADDR_LEN]; struct callout link_check_ch; struct task link_check_task; struct task timer_reclaim_task; struct cdev *port_cdev; #define PORT_LOCK_NAME_LEN 32 #define PORT_NAME_LEN 32 char lockbuf[PORT_LOCK_NAME_LEN]; char namebuf[PORT_NAME_LEN]; } __aligned(CACHE_LINE_SIZE); enum { /* adapter flags */ FULL_INIT_DONE = (1 << 0), USING_MSI = (1 << 1), USING_MSIX = (1 << 2), QUEUES_BOUND = (1 << 3), FW_UPTODATE = (1 << 4), TPS_UPTODATE = (1 << 5), CXGB_SHUTDOWN = (1 << 6), CXGB_OFLD_INIT = (1 << 7), TP_PARITY_INIT = (1 << 8), CXGB_BUSY = (1 << 9), TOM_INIT_DONE = (1 << 10), /* port flags */ DOOMED = (1 << 0), }; #define IS_DOOMED(p) (p->flags & DOOMED) #define SET_DOOMED(p) do {p->flags |= DOOMED;} while (0) #define IS_BUSY(sc) (sc->flags & CXGB_BUSY) #define SET_BUSY(sc) do {sc->flags |= CXGB_BUSY;} while (0) #define CLR_BUSY(sc) do {sc->flags &= ~CXGB_BUSY;} while (0) #define FL_Q_SIZE 4096 #define JUMBO_Q_SIZE 1024 #define RSPQ_Q_SIZE 2048 #define TX_ETH_Q_SIZE 1024 #define TX_OFLD_Q_SIZE 1024 #define TX_CTRL_Q_SIZE 256 enum { TXQ_ETH = 0, TXQ_OFLD = 1, TXQ_CTRL = 2, }; /* * work request size in bytes */ #define WR_LEN (WR_FLITS * 8) #define PIO_LEN (WR_LEN - sizeof(struct cpl_tx_pkt_lso)) struct lro_state { unsigned short enabled; struct lro_ctrl ctrl; }; #define RX_BUNDLE_SIZE 8 struct rsp_desc; struct sge_rspq { uint32_t credits; uint32_t size; uint32_t cidx; uint32_t gen; uint32_t polling; uint32_t holdoff_tmr; uint32_t next_holdoff; uint32_t imm_data; uint32_t async_notif; uint32_t cntxt_id; uint32_t offload_pkts; uint32_t pure_rsps; uint32_t unhandled_irqs; uint32_t starved; bus_addr_t phys_addr; bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; struct t3_mbuf_hdr rspq_mh; struct rsp_desc *desc; struct mtx lock; #define RSPQ_NAME_LEN 32 char lockbuf[RSPQ_NAME_LEN]; uint32_t rspq_dump_start; uint32_t rspq_dump_count; }; struct rx_desc; struct rx_sw_desc; struct sge_fl { uint32_t buf_size; uint32_t credits; uint32_t size; uint32_t cidx; uint32_t pidx; uint32_t gen; uint32_t db_pending; bus_addr_t phys_addr; uint32_t cntxt_id; uint32_t empty; bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_dma_tag_t entry_tag; uma_zone_t zone; struct rx_desc *desc; struct rx_sw_desc *sdesc; int type; }; struct tx_desc; struct tx_sw_desc; #define TXQ_TRANSMITTING 0x1 
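The structures changed above (struct iff_mac, struct adap_ports, struct port_info) now carry the opaque if_t handle instead of a bare struct ifnet pointer, so driver code reads interface state through accessor functions rather than dereferencing ifnet members. A minimal sketch of that pattern, assuming a hypothetical helper name; the accessors themselves are the ones this patch already uses:

#include <sys/param.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_var.h>

/*
 * Illustrative only: query interface state through the if_t accessors
 * instead of touching struct ifnet fields directly.
 */
static int
example_port_is_up(if_t ifp)
{
        /* if_getdrvflags() replaces ifp->if_drv_flags */
        if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)
                return (0);
        /* if_getmtu() replaces ifp->if_mtu */
        return (if_getmtu(ifp) > 0);
}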
struct sge_txq { uint64_t flags; uint32_t in_use; uint32_t size; uint32_t processed; uint32_t cleaned; uint32_t stop_thres; uint32_t cidx; uint32_t pidx; uint32_t gen; uint32_t unacked; uint32_t db_pending; struct tx_desc *desc; struct tx_sw_desc *sdesc; uint32_t token; bus_addr_t phys_addr; struct task qresume_task; struct task qreclaim_task; uint32_t cntxt_id; uint64_t stops; uint64_t restarts; bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_dma_tag_t entry_tag; struct mbufq sendq; struct buf_ring *txq_mr; struct ifaltq *txq_ifq; struct callout txq_timer; struct callout txq_watchdog; uint64_t txq_coalesced; uint32_t txq_skipped; uint32_t txq_enqueued; uint32_t txq_dump_start; uint32_t txq_dump_count; uint64_t txq_direct_packets; uint64_t txq_direct_bytes; uint64_t txq_frees; struct sg_ent txq_sgl[TX_MAX_SEGS / 2 + 1]; }; #define SGE_PSTAT_MAX (SGE_PSTAT_VLANINS+1) #define QS_EXITING 0x1 #define QS_RUNNING 0x2 #define QS_BOUND 0x4 #define QS_FLUSHING 0x8 #define QS_TIMEOUT 0x10 struct sge_qset { struct sge_rspq rspq; struct sge_fl fl[SGE_RXQ_PER_SET]; struct lro_state lro; struct sge_txq txq[SGE_TXQ_PER_SET]; uint32_t txq_stopped; /* which Tx queues are stopped */ struct port_info *port; struct adapter *adap; int idx; /* qset # */ int qs_flags; int coalescing; struct cv qs_cv; struct mtx lock; #define QS_NAME_LEN 32 char namebuf[QS_NAME_LEN]; }; struct sge { struct sge_qset qs[SGE_QSETS]; struct mtx reg_lock; }; struct filter_info; typedef int (*cpl_handler_t)(struct sge_qset *, struct rsp_desc *, struct mbuf *); struct adapter { SLIST_ENTRY(adapter) link; device_t dev; int flags; /* PCI register resources */ int regs_rid; struct resource *regs_res; int udbs_rid; struct resource *udbs_res; bus_space_handle_t bh; bus_space_tag_t bt; bus_size_t mmio_len; uint32_t link_width; /* DMA resources */ bus_dma_tag_t parent_dmat; bus_dma_tag_t rx_dmat; bus_dma_tag_t rx_jumbo_dmat; bus_dma_tag_t tx_dmat; /* Interrupt resources */ struct resource *irq_res; int irq_rid; void *intr_tag; uint32_t msix_regs_rid; struct resource *msix_regs_res; struct resource *msix_irq_res[SGE_QSETS]; int msix_irq_rid[SGE_QSETS]; void *msix_intr_tag[SGE_QSETS]; uint8_t rxpkt_map[8]; /* maps RX_PKT interface values to port ids */ uint8_t rrss_map[SGE_QSETS]; /* revers RSS map table */ uint16_t rspq_map[RSS_TABLE_SIZE]; /* maps 7-bit cookie to qidx */ union { uint8_t fill[SGE_QSETS]; uint64_t coalesce; } u; #define tunq_fill u.fill #define tunq_coalesce u.coalesce struct filter_info *filters; /* Tasks */ struct task slow_intr_task; struct task tick_task; struct taskqueue *tq; struct callout cxgb_tick_ch; struct callout sge_timer_ch; /* Register lock for use by the hardware layer */ struct mtx mdio_lock; struct mtx elmer_lock; /* Bookkeeping for the hardware layer */ struct adapter_params params; unsigned int slow_intr_mask; unsigned long irq_stats[IRQ_NUM_STATS]; unsigned nqsets; struct sge sge; struct mc7 pmrx; struct mc7 pmtx; struct mc7 cm; struct mc5 mc5; struct port_info port[MAX_NPORTS]; device_t portdev[MAX_NPORTS]; #ifdef TCP_OFFLOAD void *tom_softc; void *iwarp_softc; #endif char fw_version[64]; char port_types[MAX_NPORTS + 1]; uint32_t open_device_map; #ifdef TCP_OFFLOAD int offload_map; #endif struct mtx lock; driver_intr_t *cxgb_intr; int msi_count; #define ADAPTER_LOCK_NAME_LEN 32 char lockbuf[ADAPTER_LOCK_NAME_LEN]; char reglockbuf[ADAPTER_LOCK_NAME_LEN]; char mdiolockbuf[ADAPTER_LOCK_NAME_LEN]; char elmerlockbuf[ADAPTER_LOCK_NAME_LEN]; int timestamp; #ifdef TCP_OFFLOAD #define NUM_CPL_HANDLERS 0xa7 
cpl_handler_t cpl_handler[NUM_CPL_HANDLERS] __aligned(CACHE_LINE_SIZE); #endif }; struct t3_rx_mode { uint32_t idx; struct port_info *port; }; #define MDIO_LOCK(adapter) mtx_lock(&(adapter)->mdio_lock) #define MDIO_UNLOCK(adapter) mtx_unlock(&(adapter)->mdio_lock) #define ELMR_LOCK(adapter) mtx_lock(&(adapter)->elmer_lock) #define ELMR_UNLOCK(adapter) mtx_unlock(&(adapter)->elmer_lock) #define PORT_LOCK(port) mtx_lock(&(port)->lock); #define PORT_UNLOCK(port) mtx_unlock(&(port)->lock); #define PORT_LOCK_INIT(port, name) mtx_init(&(port)->lock, name, 0, MTX_DEF) #define PORT_LOCK_DEINIT(port) mtx_destroy(&(port)->lock) #define PORT_LOCK_ASSERT_NOTOWNED(port) mtx_assert(&(port)->lock, MA_NOTOWNED) #define PORT_LOCK_ASSERT_OWNED(port) mtx_assert(&(port)->lock, MA_OWNED) #define ADAPTER_LOCK(adap) mtx_lock(&(adap)->lock); #define ADAPTER_UNLOCK(adap) mtx_unlock(&(adap)->lock); #define ADAPTER_LOCK_INIT(adap, name) mtx_init(&(adap)->lock, name, 0, MTX_DEF) #define ADAPTER_LOCK_DEINIT(adap) mtx_destroy(&(adap)->lock) #define ADAPTER_LOCK_ASSERT_NOTOWNED(adap) mtx_assert(&(adap)->lock, MA_NOTOWNED) #define ADAPTER_LOCK_ASSERT_OWNED(adap) mtx_assert(&(adap)->lock, MA_OWNED) static __inline uint32_t t3_read_reg(adapter_t *adapter, uint32_t reg_addr) { return (bus_space_read_4(adapter->bt, adapter->bh, reg_addr)); } static __inline void t3_write_reg(adapter_t *adapter, uint32_t reg_addr, uint32_t val) { bus_space_write_4(adapter->bt, adapter->bh, reg_addr, val); } static __inline void t3_os_pci_read_config_4(adapter_t *adapter, int reg, uint32_t *val) { *val = pci_read_config(adapter->dev, reg, 4); } static __inline void t3_os_pci_write_config_4(adapter_t *adapter, int reg, uint32_t val) { pci_write_config(adapter->dev, reg, val, 4); } static __inline void t3_os_pci_read_config_2(adapter_t *adapter, int reg, uint16_t *val) { *val = pci_read_config(adapter->dev, reg, 2); } static __inline void t3_os_pci_write_config_2(adapter_t *adapter, int reg, uint16_t val) { pci_write_config(adapter->dev, reg, val, 2); } static __inline void t3_init_rx_mode(struct t3_rx_mode *rm, struct port_info *port) { rm->idx = 0; rm->port = port; } static __inline struct port_info * adap2pinfo(struct adapter *adap, int idx) { return &adap->port[idx]; } int t3_os_find_pci_capability(adapter_t *adapter, int cap); int t3_os_pci_save_state(struct adapter *adapter); int t3_os_pci_restore_state(struct adapter *adapter); void t3_os_link_intr(struct port_info *); void t3_os_link_changed(adapter_t *adapter, int port_id, int link_status, int speed, int duplex, int fc, int mac_was_reset); void t3_os_phymod_changed(struct adapter *adap, int port_id); void t3_sge_err_intr_handler(adapter_t *adapter); #ifdef TCP_OFFLOAD int t3_offload_tx(struct adapter *, struct mbuf *); #endif void t3_os_set_hw_addr(adapter_t *adapter, int port_idx, u8 hw_addr[]); int t3_mgmt_tx(adapter_t *adap, struct mbuf *m); int t3_register_cpl_handler(struct adapter *, int, cpl_handler_t); int t3_sge_alloc(struct adapter *); int t3_sge_free(struct adapter *); int t3_sge_alloc_qset(adapter_t *, uint32_t, int, int, const struct qset_params *, int, struct port_info *); void t3_free_sge_resources(adapter_t *, int); void t3_sge_start(adapter_t *); void t3_sge_stop(adapter_t *); void t3b_intr(void *data); void t3_intr_msi(void *data); void t3_intr_msix(void *data); int t3_sge_init_adapter(adapter_t *); int t3_sge_reset_adapter(adapter_t *); int t3_sge_init_port(struct port_info *); void t3_free_tx_desc(struct sge_qset *qs, int n, int qid); void t3_rx_eth(struct adapter 
*adap, struct mbuf *m, int ethpad); void t3_add_attach_sysctls(adapter_t *sc); void t3_add_configured_sysctls(adapter_t *sc); int t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx, unsigned char *data); void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p); /* * XXX figure out how we can return this to being private to sge */ #define desc_reclaimable(q) ((int)((q)->processed - (q)->cleaned - TX_MAX_DESC)) #define container_of(p, stype, field) ((stype *)(((uint8_t *)(p)) - offsetof(stype, field))) static __inline struct sge_qset * fl_to_qset(struct sge_fl *q, int qidx) { return container_of(q, struct sge_qset, fl[qidx]); } static __inline struct sge_qset * rspq_to_qset(struct sge_rspq *q) { return container_of(q, struct sge_qset, rspq); } static __inline struct sge_qset * txq_to_qset(struct sge_txq *q, int qidx) { return container_of(q, struct sge_qset, txq[qidx]); } #undef container_of #define OFFLOAD_DEVMAP_BIT (1 << MAX_NPORTS) static inline int offload_running(adapter_t *adapter) { return isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT); } void cxgb_tx_watchdog(void *arg); -int cxgb_transmit(struct ifnet *ifp, struct mbuf *m); -void cxgb_qflush(struct ifnet *ifp); +int cxgb_transmit(if_t ifp, struct mbuf *m); +void cxgb_qflush(if_t ifp); void t3_iterate(void (*)(struct adapter *, void *), void *); void cxgb_refresh_stats(struct port_info *); #ifdef DEBUGNET int cxgb_debugnet_encap(struct sge_qset *qs, struct mbuf **m); int cxgb_debugnet_poll_rx(adapter_t *adap, struct sge_qset *qs); int cxgb_debugnet_poll_tx(struct sge_qset *qs); #endif #endif diff --git a/sys/dev/cxgb/cxgb_main.c b/sys/dev/cxgb/cxgb_main.c index e31050a2f946..a2cf9d6b5a17 100644 --- a/sys/dev/cxgb/cxgb_main.c +++ b/sys/dev/cxgb/cxgb_main.c @@ -1,3664 +1,3663 @@ /************************************************************************** SPDX-License-Identifier: BSD-2-Clause-FreeBSD Copyright (c) 2007-2009, Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
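cxgb_transmit() and cxgb_qflush() now take the opaque if_t handle. A hedged sketch of how such a handler recovers its per-port softc, assuming the softc was registered with if_setsoftc() as cxgb_port_attach() does later in this patch; the function and structure names are hypothetical:

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_var.h>

struct example_port;                    /* stand-in for struct port_info */

static int
example_transmit(if_t ifp, struct mbuf *m)
{
        /* if_getsoftc() returns the pointer stored with if_setsoftc() */
        struct example_port *p = if_getsoftc(ifp);

        if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
                m_freem(m);
                return (ENETDOWN);
        }
        (void)p;                        /* a real driver would enqueue here */
        return (0);
}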
***************************************************************************/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PRIV_SUPPORTED #include #endif static int cxgb_setup_interrupts(adapter_t *); static void cxgb_teardown_interrupts(adapter_t *); static void cxgb_init(void *); static int cxgb_init_locked(struct port_info *); static int cxgb_uninit_locked(struct port_info *); static int cxgb_uninit_synchronized(struct port_info *); -static int cxgb_ioctl(struct ifnet *, unsigned long, caddr_t); -static int cxgb_media_change(struct ifnet *); +static int cxgb_ioctl(if_t, unsigned long, caddr_t); +static int cxgb_media_change(if_t); static int cxgb_ifm_type(int); static void cxgb_build_medialist(struct port_info *); -static void cxgb_media_status(struct ifnet *, struct ifmediareq *); -static uint64_t cxgb_get_counter(struct ifnet *, ift_counter); +static void cxgb_media_status(if_t, struct ifmediareq *); +static uint64_t cxgb_get_counter(if_t, ift_counter); static int setup_sge_qsets(adapter_t *); static void cxgb_async_intr(void *); static void cxgb_tick_handler(void *, int); static void cxgb_tick(void *); static void link_check_callout(void *); static void check_link_status(void *, int); static void setup_rss(adapter_t *sc); static int alloc_filters(struct adapter *); static int setup_hw_filters(struct adapter *); static int set_filter(struct adapter *, int, const struct filter_info *); static inline void mk_set_tcb_field(struct cpl_set_tcb_field *, unsigned int, unsigned int, u64, u64); static inline void set_tcb_field_ulp(struct cpl_set_tcb_field *, unsigned int, unsigned int, u64, u64); #ifdef TCP_OFFLOAD static int cpl_not_handled(struct sge_qset *, struct rsp_desc *, struct mbuf *); #endif /* Attachment glue for the PCI controller end of the device. Each port of * the device is attached separately, as defined later. */ static int cxgb_controller_probe(device_t); static int cxgb_controller_attach(device_t); static int cxgb_controller_detach(device_t); static void cxgb_free(struct adapter *); static __inline void reg_block_dump(struct adapter *ap, uint8_t *buf, unsigned int start, unsigned int end); static void cxgb_get_regs(adapter_t *sc, struct ch_ifconf_regs *regs, uint8_t *buf); static int cxgb_get_regs_len(void); static void touch_bars(device_t dev); static void cxgb_update_mac_settings(struct port_info *p); #ifdef TCP_OFFLOAD static int toe_capability(struct port_info *, int); #endif /* Table for probing the cards. 
The desc field isn't actually used */ struct cxgb_ident { uint16_t vendor; uint16_t device; int index; char *desc; } cxgb_identifiers[] = { {PCI_VENDOR_ID_CHELSIO, 0x0020, 0, "PE9000"}, {PCI_VENDOR_ID_CHELSIO, 0x0021, 1, "T302E"}, {PCI_VENDOR_ID_CHELSIO, 0x0022, 2, "T310E"}, {PCI_VENDOR_ID_CHELSIO, 0x0023, 3, "T320X"}, {PCI_VENDOR_ID_CHELSIO, 0x0024, 1, "T302X"}, {PCI_VENDOR_ID_CHELSIO, 0x0025, 3, "T320E"}, {PCI_VENDOR_ID_CHELSIO, 0x0026, 2, "T310X"}, {PCI_VENDOR_ID_CHELSIO, 0x0030, 2, "T3B10"}, {PCI_VENDOR_ID_CHELSIO, 0x0031, 3, "T3B20"}, {PCI_VENDOR_ID_CHELSIO, 0x0032, 1, "T3B02"}, {PCI_VENDOR_ID_CHELSIO, 0x0033, 4, "T3B04"}, {PCI_VENDOR_ID_CHELSIO, 0x0035, 6, "T3C10"}, {PCI_VENDOR_ID_CHELSIO, 0x0036, 3, "S320E-CR"}, {PCI_VENDOR_ID_CHELSIO, 0x0037, 7, "N320E-G2"}, {0, 0, 0, NULL} }; static device_method_t cxgb_controller_methods[] = { DEVMETHOD(device_probe, cxgb_controller_probe), DEVMETHOD(device_attach, cxgb_controller_attach), DEVMETHOD(device_detach, cxgb_controller_detach), DEVMETHOD_END }; static driver_t cxgb_controller_driver = { "cxgbc", cxgb_controller_methods, sizeof(struct adapter) }; static int cxgbc_mod_event(module_t, int, void *); DRIVER_MODULE(cxgbc, pci, cxgb_controller_driver, cxgbc_mod_event, NULL); MODULE_PNP_INFO("U16:vendor;U16:device", pci, cxgbc, cxgb_identifiers, nitems(cxgb_identifiers) - 1); MODULE_VERSION(cxgbc, 1); MODULE_DEPEND(cxgbc, firmware, 1, 1, 1); /* * Attachment glue for the ports. Attachment is done directly to the * controller device. */ static int cxgb_port_probe(device_t); static int cxgb_port_attach(device_t); static int cxgb_port_detach(device_t); static device_method_t cxgb_port_methods[] = { DEVMETHOD(device_probe, cxgb_port_probe), DEVMETHOD(device_attach, cxgb_port_attach), DEVMETHOD(device_detach, cxgb_port_detach), { 0, 0 } }; static driver_t cxgb_port_driver = { "cxgb", cxgb_port_methods, 0 }; static d_ioctl_t cxgb_extension_ioctl; static d_open_t cxgb_extension_open; static d_close_t cxgb_extension_close; static struct cdevsw cxgb_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_open = cxgb_extension_open, .d_close = cxgb_extension_close, .d_ioctl = cxgb_extension_ioctl, .d_name = "cxgb", }; DRIVER_MODULE(cxgb, cxgbc, cxgb_port_driver, 0, 0); MODULE_VERSION(cxgb, 1); DEBUGNET_DEFINE(cxgb); static struct mtx t3_list_lock; static SLIST_HEAD(, adapter) t3_list; #ifdef TCP_OFFLOAD static struct mtx t3_uld_list_lock; static SLIST_HEAD(, uld_info) t3_uld_list; #endif /* * The driver uses the best interrupt scheme available on a platform in the * order MSI-X, MSI, legacy pin interrupts. This parameter determines which * of these schemes the driver may consider as follows: * * msi = 2: choose from among all three options * msi = 1 : only consider MSI and pin interrupts * msi = 0: force pin interrupts */ static int msi_allowed = 2; SYSCTL_NODE(_hw, OID_AUTO, cxgb, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "CXGB driver parameters"); SYSCTL_INT(_hw_cxgb, OID_AUTO, msi_allowed, CTLFLAG_RDTUN, &msi_allowed, 0, "MSI-X, MSI, INTx selector"); /* * The driver uses an auto-queue algorithm by default. 
* To disable it and force a single queue-set per port, use multiq = 0 */ static int multiq = 1; SYSCTL_INT(_hw_cxgb, OID_AUTO, multiq, CTLFLAG_RDTUN, &multiq, 0, "use min(ncpus/ports, 8) queue-sets per port"); /* * By default the driver will not update the firmware unless * it was compiled against a newer version * */ static int force_fw_update = 0; SYSCTL_INT(_hw_cxgb, OID_AUTO, force_fw_update, CTLFLAG_RDTUN, &force_fw_update, 0, "update firmware even if up to date"); int cxgb_use_16k_clusters = -1; SYSCTL_INT(_hw_cxgb, OID_AUTO, use_16k_clusters, CTLFLAG_RDTUN, &cxgb_use_16k_clusters, 0, "use 16kB clusters for the jumbo queue "); static int nfilters = -1; SYSCTL_INT(_hw_cxgb, OID_AUTO, nfilters, CTLFLAG_RDTUN, &nfilters, 0, "max number of entries in the filter table"); enum { MAX_TXQ_ENTRIES = 16384, MAX_CTRL_TXQ_ENTRIES = 1024, MAX_RSPQ_ENTRIES = 16384, MAX_RX_BUFFERS = 16384, MAX_RX_JUMBO_BUFFERS = 16384, MIN_TXQ_ENTRIES = 4, MIN_CTRL_TXQ_ENTRIES = 4, MIN_RSPQ_ENTRIES = 32, MIN_FL_ENTRIES = 32, MIN_FL_JUMBO_ENTRIES = 32 }; struct filter_info { u32 sip; u32 sip_mask; u32 dip; u16 sport; u16 dport; u32 vlan:12; u32 vlan_prio:3; u32 mac_hit:1; u32 mac_idx:4; u32 mac_vld:1; u32 pkt_type:2; u32 report_filter_id:1; u32 pass:1; u32 rss:1; u32 qset:3; u32 locked:1; u32 valid:1; }; enum { FILTER_NO_VLAN_PRI = 7 }; #define EEPROM_MAGIC 0x38E2F10C #define PORT_MASK ((1 << MAX_NPORTS) - 1) static int set_eeprom(struct port_info *pi, const uint8_t *data, int len, int offset); static __inline char t3rev2char(struct adapter *adapter) { char rev = 'z'; switch(adapter->params.rev) { case T3_REV_A: rev = 'a'; break; case T3_REV_B: case T3_REV_B2: rev = 'b'; break; case T3_REV_C: rev = 'c'; break; } return rev; } static struct cxgb_ident * cxgb_get_ident(device_t dev) { struct cxgb_ident *id; for (id = cxgb_identifiers; id->desc != NULL; id++) { if ((id->vendor == pci_get_vendor(dev)) && (id->device == pci_get_device(dev))) { return (id); } } return (NULL); } static const struct adapter_info * cxgb_get_adapter_info(device_t dev) { struct cxgb_ident *id; const struct adapter_info *ai; id = cxgb_get_ident(dev); if (id == NULL) return (NULL); ai = t3_get_adapter_info(id->index); return (ai); } static int cxgb_controller_probe(device_t dev) { const struct adapter_info *ai; char *ports, buf[80]; int nports; ai = cxgb_get_adapter_info(dev); if (ai == NULL) return (ENXIO); nports = ai->nports0 + ai->nports1; if (nports == 1) ports = "port"; else ports = "ports"; snprintf(buf, sizeof(buf), "%s, %d %s", ai->desc, nports, ports); device_set_desc_copy(dev, buf); return (BUS_PROBE_DEFAULT); } #define FW_FNAME "cxgb_t3fw" #define TPEEPROM_NAME "cxgb_t3%c_tp_eeprom" #define TPSRAM_NAME "cxgb_t3%c_protocol_sram" static int upgrade_fw(adapter_t *sc) { const struct firmware *fw; int status; u32 vers; if ((fw = firmware_get(FW_FNAME)) == NULL) { device_printf(sc->dev, "Could not find firmware image %s\n", FW_FNAME); return (ENOENT); } else device_printf(sc->dev, "installing firmware on card\n"); status = t3_load_fw(sc, (const uint8_t *)fw->data, fw->datasize); if (status != 0) { device_printf(sc->dev, "failed to install firmware: %d\n", status); } else { t3_get_fw_version(sc, &vers); snprintf(&sc->fw_version[0], sizeof(sc->fw_version), "%d.%d.%d", G_FW_VERSION_MAJOR(vers), G_FW_VERSION_MINOR(vers), G_FW_VERSION_MICRO(vers)); } firmware_put(fw, FIRMWARE_UNLOAD); return (status); } /* * The cxgb_controller_attach function is responsible for the initial * bringup of the device. Its responsibilities include: * * 1. 
Determine if the device supports MSI or MSI-X. * 2. Allocate bus resources so that we can access the Base Address Register * 3. Create and initialize mutexes for the controller and its control * logic such as SGE and MDIO. * 4. Call hardware specific setup routine for the adapter as a whole. * 5. Allocate the BAR for doing MSI-X. * 6. Setup the line interrupt iff MSI-X is not supported. * 7. Create the driver's taskq. * 8. Start one task queue service thread. * 9. Check if the firmware and SRAM are up-to-date. They will be * auto-updated later (before FULL_INIT_DONE), if required. * 10. Create a child device for each MAC (port) * 11. Initialize T3 private state. * 12. Trigger the LED * 13. Setup offload iff supported. * 14. Reset/restart the tick callout. * 15. Attach sysctls * * NOTE: Any modification or deviation from this list MUST be reflected in * the above comment. Failure to do so will result in problems on various * error conditions including link flapping. */ static int cxgb_controller_attach(device_t dev) { device_t child; const struct adapter_info *ai; struct adapter *sc; int i, error = 0; uint32_t vers; int port_qsets = 1; int msi_needed, reg; char buf[80]; sc = device_get_softc(dev); sc->dev = dev; sc->msi_count = 0; ai = cxgb_get_adapter_info(dev); snprintf(sc->lockbuf, ADAPTER_LOCK_NAME_LEN, "cxgb controller lock %d", device_get_unit(dev)); ADAPTER_LOCK_INIT(sc, sc->lockbuf); snprintf(sc->reglockbuf, ADAPTER_LOCK_NAME_LEN, "SGE reg lock %d", device_get_unit(dev)); snprintf(sc->mdiolockbuf, ADAPTER_LOCK_NAME_LEN, "cxgb mdio lock %d", device_get_unit(dev)); snprintf(sc->elmerlockbuf, ADAPTER_LOCK_NAME_LEN, "cxgb elmer lock %d", device_get_unit(dev)); MTX_INIT(&sc->sge.reg_lock, sc->reglockbuf, NULL, MTX_SPIN); MTX_INIT(&sc->mdio_lock, sc->mdiolockbuf, NULL, MTX_DEF); MTX_INIT(&sc->elmer_lock, sc->elmerlockbuf, NULL, MTX_DEF); mtx_lock(&t3_list_lock); SLIST_INSERT_HEAD(&t3_list, sc, link); mtx_unlock(&t3_list_lock); /* find the PCIe link width and set max read request to 4KB*/ if (pci_find_cap(dev, PCIY_EXPRESS, ®) == 0) { uint16_t lnk; lnk = pci_read_config(dev, reg + PCIER_LINK_STA, 2); sc->link_width = (lnk & PCIEM_LINK_STA_WIDTH) >> 4; if (sc->link_width < 8 && (ai->caps & SUPPORTED_10000baseT_Full)) { device_printf(sc->dev, "PCIe x%d Link, expect reduced performance\n", sc->link_width); } pci_set_max_read_req(dev, 4096); } touch_bars(dev); pci_enable_busmaster(dev); /* * Allocate the registers and make them available to the driver. * The registers that we care about for NIC mode are in BAR 0 */ sc->regs_rid = PCIR_BAR(0); if ((sc->regs_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &sc->regs_rid, RF_ACTIVE)) == NULL) { device_printf(dev, "Cannot allocate BAR region 0\n"); error = ENXIO; goto out; } sc->bt = rman_get_bustag(sc->regs_res); sc->bh = rman_get_bushandle(sc->regs_res); sc->mmio_len = rman_get_size(sc->regs_res); for (i = 0; i < MAX_NPORTS; i++) sc->port[i].adapter = sc; if (t3_prep_adapter(sc, ai, 1) < 0) { printf("prep adapter failed\n"); error = ENODEV; goto out; } sc->udbs_rid = PCIR_BAR(2); sc->udbs_res = NULL; if (is_offload(sc) && ((sc->udbs_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &sc->udbs_rid, RF_ACTIVE)) == NULL)) { device_printf(dev, "Cannot allocate BAR region 1\n"); error = ENXIO; goto out; } /* Allocate the BAR for doing MSI-X. If it succeeds, try to allocate * enough messages for the queue sets. If that fails, try falling * back to MSI. If that fails, then try falling back to the legacy * interrupt pin model. 
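The comment above describes the interrupt fallback order (MSI-X, then MSI, then the legacy pin) that the attach code implements next. A simplified sketch of that pattern under the assumption that the desired vector count has already been computed; the function name is illustrative:

#include <sys/param.h>
#include <sys/bus.h>
#include <dev/pci/pcivar.h>

/* Illustrative fallback: try MSI-X, then MSI, else settle for INTx. */
static int
example_alloc_vectors(device_t dev, int want)
{
        int n;

        if (pci_msix_count(dev) >= want) {
                n = want;
                if (pci_alloc_msix(dev, &n) == 0) {
                        if (n == want)
                                return (n);     /* MSI-X, all vectors */
                        pci_release_msi(dev);   /* partial grant: give it back */
                }
        }

        n = 1;
        if (pci_alloc_msi(dev, &n) == 0)
                return (n);                     /* single MSI message */

        return (0);                             /* caller uses INTx */
}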
*/ sc->msix_regs_rid = 0x20; if ((msi_allowed >= 2) && (sc->msix_regs_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &sc->msix_regs_rid, RF_ACTIVE)) != NULL) { if (multiq) port_qsets = min(SGE_QSETS/sc->params.nports, mp_ncpus); msi_needed = sc->msi_count = sc->params.nports * port_qsets + 1; if (pci_msix_count(dev) == 0 || (error = pci_alloc_msix(dev, &sc->msi_count)) != 0 || sc->msi_count != msi_needed) { device_printf(dev, "alloc msix failed - " "msi_count=%d, msi_needed=%d, err=%d; " "will try MSI\n", sc->msi_count, msi_needed, error); sc->msi_count = 0; port_qsets = 1; pci_release_msi(dev); bus_release_resource(dev, SYS_RES_MEMORY, sc->msix_regs_rid, sc->msix_regs_res); sc->msix_regs_res = NULL; } else { sc->flags |= USING_MSIX; sc->cxgb_intr = cxgb_async_intr; device_printf(dev, "using MSI-X interrupts (%u vectors)\n", sc->msi_count); } } if ((msi_allowed >= 1) && (sc->msi_count == 0)) { sc->msi_count = 1; if ((error = pci_alloc_msi(dev, &sc->msi_count)) != 0) { device_printf(dev, "alloc msi failed - " "err=%d; will try INTx\n", error); sc->msi_count = 0; port_qsets = 1; pci_release_msi(dev); } else { sc->flags |= USING_MSI; sc->cxgb_intr = t3_intr_msi; device_printf(dev, "using MSI interrupts\n"); } } if (sc->msi_count == 0) { device_printf(dev, "using line interrupts\n"); sc->cxgb_intr = t3b_intr; } /* Create a private taskqueue thread for handling driver events */ sc->tq = taskqueue_create("cxgb_taskq", M_NOWAIT, taskqueue_thread_enqueue, &sc->tq); if (sc->tq == NULL) { device_printf(dev, "failed to allocate controller task queue\n"); goto out; } taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq", device_get_nameunit(dev)); TASK_INIT(&sc->tick_task, 0, cxgb_tick_handler, sc); /* Create a periodic callout for checking adapter status */ callout_init(&sc->cxgb_tick_ch, 1); if (t3_check_fw_version(sc) < 0 || force_fw_update) { /* * Warn user that a firmware update will be attempted in init. */ device_printf(dev, "firmware needs to be updated to version %d.%d.%d\n", FW_VERSION_MAJOR, FW_VERSION_MINOR, FW_VERSION_MICRO); sc->flags &= ~FW_UPTODATE; } else { sc->flags |= FW_UPTODATE; } if (t3_check_tpsram_version(sc) < 0) { /* * Warn user that a firmware update will be attempted in init. */ device_printf(dev, "SRAM needs to be updated to version %c-%d.%d.%d\n", t3rev2char(sc), TP_VERSION_MAJOR, TP_VERSION_MINOR, TP_VERSION_MICRO); sc->flags &= ~TPS_UPTODATE; } else { sc->flags |= TPS_UPTODATE; } /* * Create a child device for each MAC. The ethernet attachment * will be done in these children. */ for (i = 0; i < (sc)->params.nports; i++) { struct port_info *pi; if ((child = device_add_child(dev, "cxgb", -1)) == NULL) { device_printf(dev, "failed to add child port\n"); error = EINVAL; goto out; } pi = &sc->port[i]; pi->adapter = sc; pi->nqsets = port_qsets; pi->first_qset = i*port_qsets; pi->port_id = i; pi->tx_chan = i >= ai->nports0; pi->txpkt_intf = pi->tx_chan ? 2 * (i - ai->nports0) + 1 : 2 * i; sc->rxpkt_map[pi->txpkt_intf] = i; sc->port[i].tx_chan = i >= ai->nports0; sc->portdev[i] = child; device_set_softc(child, pi); } if ((error = bus_generic_attach(dev)) != 0) goto out; /* initialize sge private state */ t3_sge_init_adapter(sc); t3_led_ready(sc); error = t3_get_fw_version(sc, &vers); if (error) goto out; snprintf(&sc->fw_version[0], sizeof(sc->fw_version), "%d.%d.%d", G_FW_VERSION_MAJOR(vers), G_FW_VERSION_MINOR(vers), G_FW_VERSION_MICRO(vers)); snprintf(buf, sizeof(buf), "%s %sNIC\t E/C: %s S/N: %s", ai->desc, is_offload(sc) ? 
"R" : "", sc->params.vpd.ec, sc->params.vpd.sn); device_set_desc_copy(dev, buf); snprintf(&sc->port_types[0], sizeof(sc->port_types), "%x%x%x%x", sc->params.vpd.port_type[0], sc->params.vpd.port_type[1], sc->params.vpd.port_type[2], sc->params.vpd.port_type[3]); device_printf(sc->dev, "Firmware Version %s\n", &sc->fw_version[0]); callout_reset(&sc->cxgb_tick_ch, hz, cxgb_tick, sc); t3_add_attach_sysctls(sc); #ifdef TCP_OFFLOAD for (i = 0; i < NUM_CPL_HANDLERS; i++) sc->cpl_handler[i] = cpl_not_handled; #endif t3_intr_clear(sc); error = cxgb_setup_interrupts(sc); out: if (error) cxgb_free(sc); return (error); } /* * The cxgb_controller_detach routine is called with the device is * unloaded from the system. */ static int cxgb_controller_detach(device_t dev) { struct adapter *sc; sc = device_get_softc(dev); cxgb_free(sc); return (0); } /* * The cxgb_free() is called by the cxgb_controller_detach() routine * to tear down the structures that were built up in * cxgb_controller_attach(), and should be the final piece of work * done when fully unloading the driver. * * * 1. Shutting down the threads started by the cxgb_controller_attach() * routine. * 2. Stopping the lower level device and all callouts (cxgb_down_locked()). * 3. Detaching all of the port devices created during the * cxgb_controller_attach() routine. * 4. Removing the device children created via cxgb_controller_attach(). * 5. Releasing PCI resources associated with the device. * 6. Turning off the offload support, iff it was turned on. * 7. Destroying the mutexes created in cxgb_controller_attach(). * */ static void cxgb_free(struct adapter *sc) { int i, nqsets = 0; ADAPTER_LOCK(sc); sc->flags |= CXGB_SHUTDOWN; ADAPTER_UNLOCK(sc); /* * Make sure all child devices are gone. */ bus_generic_detach(sc->dev); for (i = 0; i < (sc)->params.nports; i++) { if (sc->portdev[i] && device_delete_child(sc->dev, sc->portdev[i]) != 0) device_printf(sc->dev, "failed to delete child port\n"); nqsets += sc->port[i].nqsets; } /* * At this point, it is as if cxgb_port_detach has run on all ports, and * cxgb_down has run on the adapter. All interrupts have been silenced, * all open devices have been closed. */ KASSERT(sc->open_device_map == 0, ("%s: device(s) still open (%x)", __func__, sc->open_device_map)); for (i = 0; i < sc->params.nports; i++) { KASSERT(sc->port[i].ifp == NULL, ("%s: port %i undead!", __func__, i)); } /* * Finish off the adapter's callouts. */ callout_drain(&sc->cxgb_tick_ch); callout_drain(&sc->sge_timer_ch); /* * Release resources grabbed under FULL_INIT_DONE by cxgb_up. The * sysctls are cleaned up by the kernel linker. */ if (sc->flags & FULL_INIT_DONE) { t3_free_sge_resources(sc, nqsets); sc->flags &= ~FULL_INIT_DONE; } /* * Release all interrupt resources. */ cxgb_teardown_interrupts(sc); if (sc->flags & (USING_MSI | USING_MSIX)) { device_printf(sc->dev, "releasing msi message(s)\n"); pci_release_msi(sc->dev); } else { device_printf(sc->dev, "no msi message to release\n"); } if (sc->msix_regs_res != NULL) { bus_release_resource(sc->dev, SYS_RES_MEMORY, sc->msix_regs_rid, sc->msix_regs_res); } /* * Free the adapter's taskqueue. 
*/ if (sc->tq != NULL) { taskqueue_free(sc->tq); sc->tq = NULL; } free(sc->filters, M_DEVBUF); t3_sge_free(sc); if (sc->udbs_res != NULL) bus_release_resource(sc->dev, SYS_RES_MEMORY, sc->udbs_rid, sc->udbs_res); if (sc->regs_res != NULL) bus_release_resource(sc->dev, SYS_RES_MEMORY, sc->regs_rid, sc->regs_res); MTX_DESTROY(&sc->mdio_lock); MTX_DESTROY(&sc->sge.reg_lock); MTX_DESTROY(&sc->elmer_lock); mtx_lock(&t3_list_lock); SLIST_REMOVE(&t3_list, sc, adapter, link); mtx_unlock(&t3_list_lock); ADAPTER_LOCK_DEINIT(sc); } /** * setup_sge_qsets - configure SGE Tx/Rx/response queues * @sc: the controller softc * * Determines how many sets of SGE queues to use and initializes them. * We support multiple queue sets per port if we have MSI-X, otherwise * just one queue set per port. */ static int setup_sge_qsets(adapter_t *sc) { int i, j, err, irq_idx = 0, qset_idx = 0; u_int ntxq = SGE_TXQ_PER_SET; if ((err = t3_sge_alloc(sc)) != 0) { device_printf(sc->dev, "t3_sge_alloc returned %d\n", err); return (err); } if (sc->params.rev > 0 && !(sc->flags & USING_MSI)) irq_idx = -1; for (i = 0; i < (sc)->params.nports; i++) { struct port_info *pi = &sc->port[i]; for (j = 0; j < pi->nqsets; j++, qset_idx++) { err = t3_sge_alloc_qset(sc, qset_idx, (sc)->params.nports, (sc->flags & USING_MSIX) ? qset_idx + 1 : irq_idx, &sc->params.sge.qset[qset_idx], ntxq, pi); if (err) { t3_free_sge_resources(sc, qset_idx); device_printf(sc->dev, "t3_sge_alloc_qset failed with %d\n", err); return (err); } } } sc->nqsets = qset_idx; return (0); } static void cxgb_teardown_interrupts(adapter_t *sc) { int i; for (i = 0; i < SGE_QSETS; i++) { if (sc->msix_intr_tag[i] == NULL) { /* Should have been setup fully or not at all */ KASSERT(sc->msix_irq_res[i] == NULL && sc->msix_irq_rid[i] == 0, ("%s: half-done interrupt (%d).", __func__, i)); continue; } bus_teardown_intr(sc->dev, sc->msix_irq_res[i], sc->msix_intr_tag[i]); bus_release_resource(sc->dev, SYS_RES_IRQ, sc->msix_irq_rid[i], sc->msix_irq_res[i]); sc->msix_irq_res[i] = sc->msix_intr_tag[i] = NULL; sc->msix_irq_rid[i] = 0; } if (sc->intr_tag) { KASSERT(sc->irq_res != NULL, ("%s: half-done interrupt.", __func__)); bus_teardown_intr(sc->dev, sc->irq_res, sc->intr_tag); bus_release_resource(sc->dev, SYS_RES_IRQ, sc->irq_rid, sc->irq_res); sc->irq_res = sc->intr_tag = NULL; sc->irq_rid = 0; } } static int cxgb_setup_interrupts(adapter_t *sc) { struct resource *res; void *tag; int i, rid, err, intr_flag = sc->flags & (USING_MSI | USING_MSIX); sc->irq_rid = intr_flag ? 
1 : 0; sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &sc->irq_rid, RF_SHAREABLE | RF_ACTIVE); if (sc->irq_res == NULL) { device_printf(sc->dev, "Cannot allocate interrupt (%x, %u)\n", intr_flag, sc->irq_rid); err = EINVAL; sc->irq_rid = 0; } else { err = bus_setup_intr(sc->dev, sc->irq_res, INTR_MPSAFE | INTR_TYPE_NET, NULL, sc->cxgb_intr, sc, &sc->intr_tag); if (err) { device_printf(sc->dev, "Cannot set up interrupt (%x, %u, %d)\n", intr_flag, sc->irq_rid, err); bus_release_resource(sc->dev, SYS_RES_IRQ, sc->irq_rid, sc->irq_res); sc->irq_res = sc->intr_tag = NULL; sc->irq_rid = 0; } } /* That's all for INTx or MSI */ if (!(intr_flag & USING_MSIX) || err) return (err); bus_describe_intr(sc->dev, sc->irq_res, sc->intr_tag, "err"); for (i = 0; i < sc->msi_count - 1; i++) { rid = i + 2; res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (res == NULL) { device_printf(sc->dev, "Cannot allocate interrupt " "for message %d\n", rid); err = EINVAL; break; } err = bus_setup_intr(sc->dev, res, INTR_MPSAFE | INTR_TYPE_NET, NULL, t3_intr_msix, &sc->sge.qs[i], &tag); if (err) { device_printf(sc->dev, "Cannot set up interrupt " "for message %d (%d)\n", rid, err); bus_release_resource(sc->dev, SYS_RES_IRQ, rid, res); break; } sc->msix_irq_rid[i] = rid; sc->msix_irq_res[i] = res; sc->msix_intr_tag[i] = tag; bus_describe_intr(sc->dev, res, tag, "qs%d", i); } if (err) cxgb_teardown_interrupts(sc); return (err); } static int cxgb_port_probe(device_t dev) { struct port_info *p; char buf[80]; const char *desc; p = device_get_softc(dev); desc = p->phy.desc; snprintf(buf, sizeof(buf), "Port %d %s", p->port_id, desc); device_set_desc_copy(dev, buf); return (0); } static int cxgb_makedev(struct port_info *pi) { - pi->port_cdev = make_dev(&cxgb_cdevsw, pi->ifp->if_dunit, + pi->port_cdev = make_dev(&cxgb_cdevsw, if_getdunit(pi->ifp), UID_ROOT, GID_WHEEL, 0600, "%s", if_name(pi->ifp)); if (pi->port_cdev == NULL) return (ENOMEM); pi->port_cdev->si_drv1 = (void *)pi; return (0); } #define CXGB_CAP (IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | \ IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_JUMBO_MTU | IFCAP_LRO | \ IFCAP_VLAN_HWTSO | IFCAP_LINKSTATE | IFCAP_HWCSUM_IPV6) #define CXGB_CAP_ENABLE CXGB_CAP static int cxgb_port_attach(device_t dev) { struct port_info *p; - struct ifnet *ifp; + if_t ifp; int err; struct adapter *sc; p = device_get_softc(dev); sc = p->adapter; snprintf(p->lockbuf, PORT_NAME_LEN, "cxgb port lock %d:%d", device_get_unit(device_get_parent(dev)), p->port_id); PORT_LOCK_INIT(p, p->lockbuf); callout_init(&p->link_check_ch, 1); TASK_INIT(&p->link_check_task, 0, check_link_status, p); /* Allocate an ifnet object and set it up */ ifp = p->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "Cannot allocate ifnet\n"); return (ENOMEM); } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); - ifp->if_init = cxgb_init; - ifp->if_softc = p; - ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; - ifp->if_ioctl = cxgb_ioctl; - ifp->if_transmit = cxgb_transmit; - ifp->if_qflush = cxgb_qflush; - ifp->if_get_counter = cxgb_get_counter; - - ifp->if_capabilities = CXGB_CAP; + if_setinitfn(ifp, cxgb_init); + if_setsoftc(ifp, p); + if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); + if_setioctlfn(ifp, cxgb_ioctl); + if_settransmitfn(ifp, cxgb_transmit); + if_setqflushfn(ifp, cxgb_qflush); + if_setgetcounterfn(ifp, cxgb_get_counter); + + if_setcapabilities(ifp, CXGB_CAP); #ifdef TCP_OFFLOAD if (is_offload(sc)) - 
ifp->if_capabilities |= IFCAP_TOE4; + if_setcapabilitiesbit(ifp, IFCAP_TOE4, 0); #endif - ifp->if_capenable = CXGB_CAP_ENABLE; - ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO | - CSUM_UDP_IPV6 | CSUM_TCP_IPV6; - ifp->if_hw_tsomax = IP_MAXPACKET; - ifp->if_hw_tsomaxsegcount = 36; - ifp->if_hw_tsomaxsegsize = 65536; + if_setcapenable(ifp, CXGB_CAP_ENABLE); + if_sethwassist(ifp, CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO | + CSUM_UDP_IPV6 | CSUM_TCP_IPV6); + if_sethwtsomax(ifp, IP_MAXPACKET); + if_sethwtsomaxsegcount(ifp, 36); + if_sethwtsomaxsegsize(ifp, 65536); /* * Disable TSO on 4-port - it isn't supported by the firmware. */ if (sc->params.nports > 2) { - ifp->if_capabilities &= ~(IFCAP_TSO | IFCAP_VLAN_HWTSO); - ifp->if_capenable &= ~(IFCAP_TSO | IFCAP_VLAN_HWTSO); - ifp->if_hwassist &= ~CSUM_TSO; + if_setcapabilitiesbit(ifp, 0, IFCAP_TSO | IFCAP_VLAN_HWTSO); + if_setcapenablebit(ifp, 0, IFCAP_TSO | IFCAP_VLAN_HWTSO); + if_sethwassistbits(ifp, 0, CSUM_TSO); } ether_ifattach(ifp, p->hw_addr); /* Attach driver debugnet methods. */ DEBUGNET_SET(ifp, cxgb); #ifdef DEFAULT_JUMBO if (sc->params.nports <= 2) - ifp->if_mtu = ETHERMTU_JUMBO; + if_setmtu(ifp, ETHERMTU_JUMBO); #endif if ((err = cxgb_makedev(p)) != 0) { printf("makedev failed %d\n", err); return (err); } /* Create a list of media supported by this port */ ifmedia_init(&p->media, IFM_IMASK, cxgb_media_change, cxgb_media_status); cxgb_build_medialist(p); t3_sge_init_port(p); return (err); } /* * cxgb_port_detach() is called via the device_detach methods when * cxgb_free() calls the bus_generic_detach. It is responsible for * removing the device from the view of the kernel, i.e. from all * interfaces lists etc. This routine is only called when the driver is * being unloaded, not when the link goes down. 
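In cxgb_port_attach() above, capability updates move to the bit-oriented accessors. As used in this patch, the first mask argument turns bits on and the second turns them off, which is why if_setcapabilitiesbit(ifp, 0, IFCAP_TSO | IFCAP_VLAN_HWTSO) clears TSO on 4-port cards. A small sketch of toggling a capability with these accessors; the helper name is hypothetical:

#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_var.h>

/* Enable or disable TSO on an if_t using the set/clear-bit accessors. */
static void
example_set_tso(if_t ifp, int enable)
{
        if (enable) {
                if_setcapenablebit(ifp, IFCAP_TSO, 0);
                if_sethwassistbits(ifp, CSUM_TSO, 0);
        } else {
                if_setcapenablebit(ifp, 0, IFCAP_TSO);
                if_sethwassistbits(ifp, 0, CSUM_TSO);
        }
}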
*/ static int cxgb_port_detach(device_t dev) { struct port_info *p; struct adapter *sc; int i; p = device_get_softc(dev); sc = p->adapter; /* Tell cxgb_ioctl and if_init that the port is going away */ ADAPTER_LOCK(sc); SET_DOOMED(p); wakeup(&sc->flags); while (IS_BUSY(sc)) mtx_sleep(&sc->flags, &sc->lock, 0, "cxgbdtch", 0); SET_BUSY(sc); ADAPTER_UNLOCK(sc); if (p->port_cdev != NULL) destroy_dev(p->port_cdev); cxgb_uninit_synchronized(p); ether_ifdetach(p->ifp); for (i = p->first_qset; i < p->first_qset + p->nqsets; i++) { struct sge_qset *qs = &sc->sge.qs[i]; struct sge_txq *txq = &qs->txq[TXQ_ETH]; callout_drain(&txq->txq_watchdog); callout_drain(&txq->txq_timer); } PORT_LOCK_DEINIT(p); if_free(p->ifp); p->ifp = NULL; ADAPTER_LOCK(sc); CLR_BUSY(sc); wakeup_one(&sc->flags); ADAPTER_UNLOCK(sc); return (0); } void t3_fatal_err(struct adapter *sc) { u_int fw_status[4]; if (sc->flags & FULL_INIT_DONE) { t3_sge_stop(sc); t3_write_reg(sc, A_XGM_TX_CTRL, 0); t3_write_reg(sc, A_XGM_RX_CTRL, 0); t3_write_reg(sc, XGM_REG(A_XGM_TX_CTRL, 1), 0); t3_write_reg(sc, XGM_REG(A_XGM_RX_CTRL, 1), 0); t3_intr_disable(sc); } device_printf(sc->dev,"encountered fatal error, operation suspended\n"); if (!t3_cim_ctl_blk_read(sc, 0xa0, 4, fw_status)) device_printf(sc->dev, "FW_ status: 0x%x, 0x%x, 0x%x, 0x%x\n", fw_status[0], fw_status[1], fw_status[2], fw_status[3]); } int t3_os_find_pci_capability(adapter_t *sc, int cap) { device_t dev; struct pci_devinfo *dinfo; pcicfgregs *cfg; uint32_t status; uint8_t ptr; dev = sc->dev; dinfo = device_get_ivars(dev); cfg = &dinfo->cfg; status = pci_read_config(dev, PCIR_STATUS, 2); if (!(status & PCIM_STATUS_CAPPRESENT)) return (0); switch (cfg->hdrtype & PCIM_HDRTYPE) { case 0: case 1: ptr = PCIR_CAP_PTR; break; case 2: ptr = PCIR_CAP_PTR_2; break; default: return (0); break; } ptr = pci_read_config(dev, ptr, 1); while (ptr != 0) { if (pci_read_config(dev, ptr + PCICAP_ID, 1) == cap) return (ptr); ptr = pci_read_config(dev, ptr + PCICAP_NEXTPTR, 1); } return (0); } int t3_os_pci_save_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_save(dev, dinfo, 0); return (0); } int t3_os_pci_restore_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_restore(dev, dinfo); return (0); } /** * t3_os_link_changed - handle link status changes * @sc: the adapter associated with the link change * @port_id: the port index whose link status has changed * @link_status: the new status of the link * @speed: the new speed setting * @duplex: the new duplex setting * @fc: the new flow-control setting * * This is the OS-dependent handler for link status changes. The OS * neutral handler takes care of most of the processing for these events, * then calls this handler for any OS-specific processing. 
*/ void t3_os_link_changed(adapter_t *adapter, int port_id, int link_status, int speed, int duplex, int fc, int mac_was_reset) { struct port_info *pi = &adapter->port[port_id]; - struct ifnet *ifp = pi->ifp; + if_t ifp = pi->ifp; /* no race with detach, so ifp should always be good */ KASSERT(ifp, ("%s: if detached.", __func__)); /* Reapply mac settings if they were lost due to a reset */ if (mac_was_reset) { PORT_LOCK(pi); cxgb_update_mac_settings(pi); PORT_UNLOCK(pi); } if (link_status) { - ifp->if_baudrate = IF_Mbps(speed); + if_setbaudrate(ifp, IF_Mbps(speed)); if_link_state_change(ifp, LINK_STATE_UP); } else if_link_state_change(ifp, LINK_STATE_DOWN); } /** * t3_os_phymod_changed - handle PHY module changes * @phy: the PHY reporting the module change * @mod_type: new module type * * This is the OS-dependent handler for PHY module changes. It is * invoked when a PHY module is removed or inserted for any OS-specific * processing. */ void t3_os_phymod_changed(struct adapter *adap, int port_id) { static const char *mod_str[] = { NULL, "SR", "LR", "LRM", "TWINAX", "TWINAX-L", "unknown" }; struct port_info *pi = &adap->port[port_id]; int mod = pi->phy.modtype; if (mod != pi->media.ifm_cur->ifm_data) cxgb_build_medialist(pi); if (mod == phy_modtype_none) if_printf(pi->ifp, "PHY module unplugged\n"); else { KASSERT(mod < ARRAY_SIZE(mod_str), ("invalid PHY module type %d", mod)); if_printf(pi->ifp, "%s PHY module inserted\n", mod_str[mod]); } } void t3_os_set_hw_addr(adapter_t *adapter, int port_idx, u8 hw_addr[]) { /* * The ifnet might not be allocated before this gets called, * as this is called early on in attach by t3_prep_adapter * save the address off in the port structure */ if (cxgb_debug) printf("set_hw_addr on idx %d addr %6D\n", port_idx, hw_addr, ":"); bcopy(hw_addr, adapter->port[port_idx].hw_addr, ETHER_ADDR_LEN); } /* * Programs the XGMAC based on the settings in the ifnet. These settings * include MTU, MAC address, mcast addresses, etc. 
*/ static void cxgb_update_mac_settings(struct port_info *p) { - struct ifnet *ifp = p->ifp; + if_t ifp = p->ifp; struct t3_rx_mode rm; struct cmac *mac = &p->mac; int mtu, hwtagging; PORT_LOCK_ASSERT_OWNED(p); - bcopy(IF_LLADDR(ifp), p->hw_addr, ETHER_ADDR_LEN); + bcopy(if_getlladdr(ifp), p->hw_addr, ETHER_ADDR_LEN); - mtu = ifp->if_mtu; - if (ifp->if_capenable & IFCAP_VLAN_MTU) + mtu = if_getmtu(ifp); + if (if_getcapenable(ifp) & IFCAP_VLAN_MTU) mtu += ETHER_VLAN_ENCAP_LEN; - hwtagging = (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0; + hwtagging = (if_getcapenable(ifp) & IFCAP_VLAN_HWTAGGING) != 0; t3_mac_set_mtu(mac, mtu); t3_set_vlan_accel(p->adapter, 1 << p->tx_chan, hwtagging); t3_mac_set_address(mac, 0, p->hw_addr); t3_init_rx_mode(&rm, p); t3_mac_set_rx_mode(mac, &rm); } static int await_mgmt_replies(struct adapter *adap, unsigned long init_cnt, unsigned long n) { int attempts = 5; while (adap->sge.qs[0].rspq.offload_pkts < init_cnt + n) { if (!--attempts) return (ETIMEDOUT); t3_os_sleep(10); } return 0; } static int init_tp_parity(struct adapter *adap) { int i; struct mbuf *m; struct cpl_set_tcb_field *greq; unsigned long cnt = adap->sge.qs[0].rspq.offload_pkts; t3_tp_set_offload_mode(adap, 1); for (i = 0; i < 16; i++) { struct cpl_smt_write_req *req; m = m_gethdr(M_WAITOK, MT_DATA); req = mtod(m, struct cpl_smt_write_req *); m->m_len = m->m_pkthdr.len = sizeof(*req); memset(req, 0, sizeof(*req)); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SMT_WRITE_REQ, i)); req->iff = i; t3_mgmt_tx(adap, m); } for (i = 0; i < 2048; i++) { struct cpl_l2t_write_req *req; m = m_gethdr(M_WAITOK, MT_DATA); req = mtod(m, struct cpl_l2t_write_req *); m->m_len = m->m_pkthdr.len = sizeof(*req); memset(req, 0, sizeof(*req)); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, i)); req->params = htonl(V_L2T_W_IDX(i)); t3_mgmt_tx(adap, m); } for (i = 0; i < 2048; i++) { struct cpl_rte_write_req *req; m = m_gethdr(M_WAITOK, MT_DATA); req = mtod(m, struct cpl_rte_write_req *); m->m_len = m->m_pkthdr.len = sizeof(*req); memset(req, 0, sizeof(*req)); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RTE_WRITE_REQ, i)); req->l2t_idx = htonl(V_L2T_W_IDX(i)); t3_mgmt_tx(adap, m); } m = m_gethdr(M_WAITOK, MT_DATA); greq = mtod(m, struct cpl_set_tcb_field *); m->m_len = m->m_pkthdr.len = sizeof(*greq); memset(greq, 0, sizeof(*greq)); greq->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); OPCODE_TID(greq) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, 0)); greq->mask = htobe64(1); t3_mgmt_tx(adap, m); i = await_mgmt_replies(adap, cnt, 16 + 2048 + 2048 + 1); t3_tp_set_offload_mode(adap, 0); return (i); } /** * setup_rss - configure Receive Side Steering (per-queue connection demux) * @adap: the adapter * * Sets up RSS to distribute packets to multiple receive queues. We * configure the RSS CPU lookup table to distribute to the number of HW * receive queues, and the response queue lookup table to narrow that * down to the response queues actually configured for each port. * We always configure the RSS mapping for two ports since the mapping * table has plenty of entries. 
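The setup_rss() comment above explains how the RSS lookup table is spread across the configured response queues and then inverted into a reverse map. A standalone sketch of the same index arithmetic follows, with the table and queue-set counts shrunk to illustrative values (the driver uses RSS_TABLE_SIZE and SGE_QSETS); it mirrors the loops in setup_rss() and can be compiled in userland:

#include <stdint.h>
#include <stdio.h>

#define EX_TABLE_SIZE   8       /* illustrative; the driver uses RSS_TABLE_SIZE */
#define EX_NQSETS       3       /* illustrative; the driver uses SGE_QSETS */

int
main(void)
{
        uint16_t rspq_map[EX_TABLE_SIZE];
        uint8_t rrss_map[EX_NQSETS];
        int i, nq0 = 2, nq1 = 1;        /* queue sets behind channel 0 and 1 */

        /* First half of the table feeds channel 0, second half channel 1. */
        for (i = 0; i < EX_TABLE_SIZE / 2; i++) {
                rspq_map[i] = nq0 ? i % nq0 : 0;
                rspq_map[i + EX_TABLE_SIZE / 2] = nq1 ? i % nq1 + nq0 : 0;
        }

        /* Reverse map: first table index that lands on each queue set. */
        for (i = 0; i < EX_NQSETS; i++)
                rrss_map[i] = 0xff;
        for (i = 0; i < EX_TABLE_SIZE; i++)
                if (rrss_map[rspq_map[i]] == 0xff)
                        rrss_map[rspq_map[i]] = i;

        for (i = 0; i < EX_TABLE_SIZE; i++)
                printf("cookie %d -> qset %d\n", i, rspq_map[i]);
        for (i = 0; i < EX_NQSETS; i++)
                printf("qset %d first seen at index %d\n", i, rrss_map[i]);
        return (0);
}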
*/ static void setup_rss(adapter_t *adap) { int i; u_int nq[2]; uint8_t cpus[SGE_QSETS + 1]; uint16_t rspq_map[RSS_TABLE_SIZE]; for (i = 0; i < SGE_QSETS; ++i) cpus[i] = i; cpus[SGE_QSETS] = 0xff; nq[0] = nq[1] = 0; for_each_port(adap, i) { const struct port_info *pi = adap2pinfo(adap, i); nq[pi->tx_chan] += pi->nqsets; } for (i = 0; i < RSS_TABLE_SIZE / 2; ++i) { rspq_map[i] = nq[0] ? i % nq[0] : 0; rspq_map[i + RSS_TABLE_SIZE / 2] = nq[1] ? i % nq[1] + nq[0] : 0; } /* Calculate the reverse RSS map table */ for (i = 0; i < SGE_QSETS; ++i) adap->rrss_map[i] = 0xff; for (i = 0; i < RSS_TABLE_SIZE; ++i) if (adap->rrss_map[rspq_map[i]] == 0xff) adap->rrss_map[rspq_map[i]] = i; t3_config_rss(adap, F_RQFEEDBACKENABLE | F_TNLLKPEN | F_TNLMAPEN | F_TNLPRTEN | F_TNL2TUPEN | F_TNL4TUPEN | F_OFDMAPEN | F_RRCPLMAPEN | V_RRCPLCPUSIZE(6) | F_HASHTOEPLITZ, cpus, rspq_map); } static void send_pktsched_cmd(struct adapter *adap, int sched, int qidx, int lo, int hi, int port) { struct mbuf *m; struct mngt_pktsched_wr *req; m = m_gethdr(M_NOWAIT, MT_DATA); if (m) { req = mtod(m, struct mngt_pktsched_wr *); req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_MNGT)); req->mngt_opcode = FW_MNGTOPCODE_PKTSCHED_SET; req->sched = sched; req->idx = qidx; req->min = lo; req->max = hi; req->binding = port; m->m_len = m->m_pkthdr.len = sizeof(*req); t3_mgmt_tx(adap, m); } } static void bind_qsets(adapter_t *sc) { int i, j; for (i = 0; i < (sc)->params.nports; ++i) { const struct port_info *pi = adap2pinfo(sc, i); for (j = 0; j < pi->nqsets; ++j) { send_pktsched_cmd(sc, 1, pi->first_qset + j, -1, -1, pi->tx_chan); } } } static void update_tpeeprom(struct adapter *adap) { const struct firmware *tpeeprom; uint32_t version; unsigned int major, minor; int ret, len; char rev, name[32]; t3_seeprom_read(adap, TP_SRAM_OFFSET, &version); major = G_TP_VERSION_MAJOR(version); minor = G_TP_VERSION_MINOR(version); if (major == TP_VERSION_MAJOR && minor == TP_VERSION_MINOR) return; rev = t3rev2char(adap); snprintf(name, sizeof(name), TPEEPROM_NAME, rev); tpeeprom = firmware_get(name); if (tpeeprom == NULL) { device_printf(adap->dev, "could not load TP EEPROM: unable to load %s\n", name); return; } len = tpeeprom->datasize - 4; ret = t3_check_tpsram(adap, tpeeprom->data, tpeeprom->datasize); if (ret) goto release_tpeeprom; if (len != TP_SRAM_LEN) { device_printf(adap->dev, "%s length is wrong len=%d expected=%d\n", name, len, TP_SRAM_LEN); return; } ret = set_eeprom(&adap->port[0], tpeeprom->data, tpeeprom->datasize, TP_SRAM_OFFSET); if (!ret) { device_printf(adap->dev, "Protocol SRAM image updated in EEPROM to %d.%d.%d\n", TP_VERSION_MAJOR, TP_VERSION_MINOR, TP_VERSION_MICRO); } else device_printf(adap->dev, "Protocol SRAM image update in EEPROM failed\n"); release_tpeeprom: firmware_put(tpeeprom, FIRMWARE_UNLOAD); return; } static int update_tpsram(struct adapter *adap) { const struct firmware *tpsram; int ret; char rev, name[32]; rev = t3rev2char(adap); snprintf(name, sizeof(name), TPSRAM_NAME, rev); update_tpeeprom(adap); tpsram = firmware_get(name); if (tpsram == NULL){ device_printf(adap->dev, "could not load TP SRAM\n"); return (EINVAL); } else device_printf(adap->dev, "updating TP SRAM\n"); ret = t3_check_tpsram(adap, tpsram->data, tpsram->datasize); if (ret) goto release_tpsram; ret = t3_set_proto_sram(adap, tpsram->data); if (ret) device_printf(adap->dev, "loading protocol SRAM failed\n"); release_tpsram: firmware_put(tpsram, FIRMWARE_UNLOAD); return ret; } /** * cxgb_up - enable the adapter * @adap: adapter being enabled * * Called 
when the first port is enabled, this function performs the * actions necessary to make an adapter operational, such as completing * the initialization of HW modules, and enabling interrupts. */ static int cxgb_up(struct adapter *sc) { int err = 0; unsigned int mxf = t3_mc5_size(&sc->mc5) - MC5_MIN_TIDS; KASSERT(sc->open_device_map == 0, ("%s: device(s) already open (%x)", __func__, sc->open_device_map)); if ((sc->flags & FULL_INIT_DONE) == 0) { ADAPTER_LOCK_ASSERT_NOTOWNED(sc); if ((sc->flags & FW_UPTODATE) == 0) if ((err = upgrade_fw(sc))) goto out; if ((sc->flags & TPS_UPTODATE) == 0) if ((err = update_tpsram(sc))) goto out; if (is_offload(sc) && nfilters != 0) { sc->params.mc5.nservers = 0; if (nfilters < 0) sc->params.mc5.nfilters = mxf; else sc->params.mc5.nfilters = min(nfilters, mxf); } err = t3_init_hw(sc, 0); if (err) goto out; t3_set_reg_field(sc, A_TP_PARA_REG5, 0, F_RXDDPOFFINIT); t3_write_reg(sc, A_ULPRX_TDDP_PSZ, V_HPZ0(PAGE_SHIFT - 12)); err = setup_sge_qsets(sc); if (err) goto out; alloc_filters(sc); setup_rss(sc); t3_add_configured_sysctls(sc); sc->flags |= FULL_INIT_DONE; } t3_intr_clear(sc); t3_sge_start(sc); t3_intr_enable(sc); if (sc->params.rev >= T3_REV_C && !(sc->flags & TP_PARITY_INIT) && is_offload(sc) && init_tp_parity(sc) == 0) sc->flags |= TP_PARITY_INIT; if (sc->flags & TP_PARITY_INIT) { t3_write_reg(sc, A_TP_INT_CAUSE, F_CMCACHEPERR | F_ARPLUTPERR); t3_write_reg(sc, A_TP_INT_ENABLE, 0x7fbfffff); } if (!(sc->flags & QUEUES_BOUND)) { bind_qsets(sc); setup_hw_filters(sc); sc->flags |= QUEUES_BOUND; } t3_sge_reset_adapter(sc); out: return (err); } /* * Called when the last open device is closed. Does NOT undo all of cxgb_up's * work. Specifically, the resources grabbed under FULL_INIT_DONE are released * during controller_detach, not here. */ static void cxgb_down(struct adapter *sc) { t3_sge_stop(sc); t3_intr_disable(sc); } /* * if_init for cxgb ports. */ static void cxgb_init(void *arg) { struct port_info *p = arg; struct adapter *sc = p->adapter; ADAPTER_LOCK(sc); cxgb_init_locked(p); /* releases adapter lock */ ADAPTER_LOCK_ASSERT_NOTOWNED(sc); } static int cxgb_init_locked(struct port_info *p) { struct adapter *sc = p->adapter; - struct ifnet *ifp = p->ifp; + if_t ifp = p->ifp; struct cmac *mac = &p->mac; int i, rc = 0, may_sleep = 0, gave_up_lock = 0; ADAPTER_LOCK_ASSERT_OWNED(sc); while (!IS_DOOMED(p) && IS_BUSY(sc)) { gave_up_lock = 1; if (mtx_sleep(&sc->flags, &sc->lock, PCATCH, "cxgbinit", 0)) { rc = EINTR; goto done; } } if (IS_DOOMED(p)) { rc = ENXIO; goto done; } KASSERT(!IS_BUSY(sc), ("%s: controller busy.", __func__)); /* * The code that runs during one-time adapter initialization can sleep * so it's important not to hold any locks across it. */ may_sleep = sc->flags & FULL_INIT_DONE ? 
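The filter accounting in cxgb_up() above clamps the nfilters tunable to whatever TCAM space is left after the minimum TID reservation, with a negative value meaning "as many as fit". A small stand-alone sketch of that clamp, using invented capacity numbers rather than the real t3_mc5_size()/MC5_MIN_TIDS values:

/* Sketch only; capacity and reservation values are made up. */
#include <stdio.h>

static unsigned int
clamp_nfilters(int tunable, unsigned int mc5_size, unsigned int min_tids)
{
        unsigned int mxf = mc5_size - min_tids;

        if (tunable == 0)
                return (0);             /* filters disabled, TCAM left to TOE */
        if (tunable < 0)
                return (mxf);           /* "as many as the TCAM allows" */
        return ((unsigned int)tunable < mxf ? (unsigned int)tunable : mxf);
}

int
main(void)
{
        printf("%u %u %u\n",
            clamp_nfilters(-1, 16384, 16),
            clamp_nfilters(100, 16384, 16),
            clamp_nfilters(100000, 16384, 16));
        return (0);
}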
0 : 1; if (may_sleep) { SET_BUSY(sc); gave_up_lock = 1; ADAPTER_UNLOCK(sc); } if (sc->open_device_map == 0 && ((rc = cxgb_up(sc)) != 0)) goto done; PORT_LOCK(p); if (isset(&sc->open_device_map, p->port_id) && - (ifp->if_drv_flags & IFF_DRV_RUNNING)) { + (if_getdrvflags(ifp) & IFF_DRV_RUNNING)) { PORT_UNLOCK(p); goto done; } t3_port_intr_enable(sc, p->port_id); if (!mac->multiport) t3_mac_init(mac); cxgb_update_mac_settings(p); t3_link_start(&p->phy, mac, &p->link_config); t3_mac_enable(mac, MAC_DIRECTION_RX | MAC_DIRECTION_TX); - ifp->if_drv_flags |= IFF_DRV_RUNNING; - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); PORT_UNLOCK(p); for (i = p->first_qset; i < p->first_qset + p->nqsets; i++) { struct sge_qset *qs = &sc->sge.qs[i]; struct sge_txq *txq = &qs->txq[TXQ_ETH]; callout_reset_on(&txq->txq_watchdog, hz, cxgb_tx_watchdog, qs, txq->txq_watchdog.c_cpu); } /* all ok */ setbit(&sc->open_device_map, p->port_id); callout_reset(&p->link_check_ch, p->phy.caps & SUPPORTED_LINK_IRQ ? hz * 3 : hz / 4, link_check_callout, p); done: if (may_sleep) { ADAPTER_LOCK(sc); KASSERT(IS_BUSY(sc), ("%s: controller not busy.", __func__)); CLR_BUSY(sc); } if (gave_up_lock) wakeup_one(&sc->flags); ADAPTER_UNLOCK(sc); return (rc); } static int cxgb_uninit_locked(struct port_info *p) { struct adapter *sc = p->adapter; int rc; ADAPTER_LOCK_ASSERT_OWNED(sc); while (!IS_DOOMED(p) && IS_BUSY(sc)) { if (mtx_sleep(&sc->flags, &sc->lock, PCATCH, "cxgbunin", 0)) { rc = EINTR; goto done; } } if (IS_DOOMED(p)) { rc = ENXIO; goto done; } KASSERT(!IS_BUSY(sc), ("%s: controller busy.", __func__)); SET_BUSY(sc); ADAPTER_UNLOCK(sc); rc = cxgb_uninit_synchronized(p); ADAPTER_LOCK(sc); KASSERT(IS_BUSY(sc), ("%s: controller not busy.", __func__)); CLR_BUSY(sc); wakeup_one(&sc->flags); done: ADAPTER_UNLOCK(sc); return (rc); } /* * Called on "ifconfig down", and from port_detach */ static int cxgb_uninit_synchronized(struct port_info *pi) { struct adapter *sc = pi->adapter; - struct ifnet *ifp = pi->ifp; + if_t ifp = pi->ifp; /* * taskqueue_drain may cause a deadlock if the adapter lock is held. */ ADAPTER_LOCK_ASSERT_NOTOWNED(sc); /* * Clear this port's bit from the open device map, and then drain all * the tasks that can access/manipulate this port's port_info or ifp. * We disable this port's interrupts here and so the slow/ext * interrupt tasks won't be enqueued. The tick task will continue to * be enqueued every second but the runs after this drain will not see * this port in the open device map. * * A well behaved task must take open_device_map into account and ignore * ports that are not open. 
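cxgb_init_locked() and cxgb_up() coordinate through open_device_map: the first port to open brings the adapter up, later opens only set their bit, and (as the uninit path below shows) the last close takes the adapter down. A rough sketch of that reference-by-bitmask pattern, with plain bit operations standing in for setbit()/clrbit()/isset():

/* Sketch only; no locking, ports identified by bit position. */
#include <stdio.h>
#include <stdint.h>

static uint32_t open_device_map;

static void
port_open(int port_id)
{
        if (open_device_map == 0)
                printf("first open: bring adapter up\n");
        open_device_map |= 1u << port_id;
}

static void
port_close(int port_id)
{
        open_device_map &= ~(1u << port_id);
        if (open_device_map == 0)
                printf("last close: take adapter down\n");
}

int
main(void)
{
        port_open(0);
        port_open(1);
        port_close(0);
        port_close(1);
        return (0);
}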
*/ clrbit(&sc->open_device_map, pi->port_id); t3_port_intr_disable(sc, pi->port_id); taskqueue_drain(sc->tq, &sc->slow_intr_task); taskqueue_drain(sc->tq, &sc->tick_task); callout_drain(&pi->link_check_ch); taskqueue_drain(sc->tq, &pi->link_check_task); PORT_LOCK(pi); - ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING | IFF_DRV_OACTIVE); /* disable pause frames */ t3_set_reg_field(sc, A_XGM_TX_CFG + pi->mac.offset, F_TXPAUSEEN, 0); /* Reset RX FIFO HWM */ t3_set_reg_field(sc, A_XGM_RXFIFO_CFG + pi->mac.offset, V_RXFIFOPAUSEHWM(M_RXFIFOPAUSEHWM), 0); DELAY(100 * 1000); /* Wait for TXFIFO empty */ t3_wait_op_done(sc, A_XGM_TXFIFO_CFG + pi->mac.offset, F_TXFIFO_EMPTY, 1, 20, 5); DELAY(100 * 1000); t3_mac_disable(&pi->mac, MAC_DIRECTION_RX); pi->phy.ops->power_down(&pi->phy, 1); PORT_UNLOCK(pi); pi->link_config.link_ok = 0; t3_os_link_changed(sc, pi->port_id, 0, 0, 0, 0, 0); if (sc->open_device_map == 0) cxgb_down(pi->adapter); return (0); } /* * Mark lro enabled or disabled in all qsets for this port */ static int cxgb_set_lro(struct port_info *p, int enabled) { int i; struct adapter *adp = p->adapter; struct sge_qset *q; for (i = 0; i < p->nqsets; i++) { q = &adp->sge.qs[p->first_qset + i]; q->lro.enabled = (enabled != 0); } return (0); } static int -cxgb_ioctl(struct ifnet *ifp, unsigned long command, caddr_t data) +cxgb_ioctl(if_t ifp, unsigned long command, caddr_t data) { - struct port_info *p = ifp->if_softc; + struct port_info *p = if_getsoftc(ifp); struct adapter *sc = p->adapter; struct ifreq *ifr = (struct ifreq *)data; int flags, error = 0, mtu; uint32_t mask; switch (command) { case SIOCSIFMTU: ADAPTER_LOCK(sc); error = IS_DOOMED(p) ? ENXIO : (IS_BUSY(sc) ? EBUSY : 0); if (error) { fail: ADAPTER_UNLOCK(sc); return (error); } mtu = ifr->ifr_mtu; if ((mtu < ETHERMIN) || (mtu > ETHERMTU_JUMBO)) { error = EINVAL; } else { - ifp->if_mtu = mtu; + if_setmtu(ifp, mtu); PORT_LOCK(p); cxgb_update_mac_settings(p); PORT_UNLOCK(p); } ADAPTER_UNLOCK(sc); break; case SIOCSIFFLAGS: ADAPTER_LOCK(sc); if (IS_DOOMED(p)) { error = ENXIO; goto fail; } - if (ifp->if_flags & IFF_UP) { - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + if (if_getflags(ifp) & IFF_UP) { + if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { flags = p->if_flags; - if (((ifp->if_flags ^ flags) & IFF_PROMISC) || - ((ifp->if_flags ^ flags) & IFF_ALLMULTI)) { + if (((if_getflags(ifp) ^ flags) & IFF_PROMISC) || + ((if_getflags(ifp) ^ flags) & IFF_ALLMULTI)) { if (IS_BUSY(sc)) { error = EBUSY; goto fail; } PORT_LOCK(p); cxgb_update_mac_settings(p); PORT_UNLOCK(p); } ADAPTER_UNLOCK(sc); } else error = cxgb_init_locked(p); - p->if_flags = ifp->if_flags; - } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) + p->if_flags = if_getflags(ifp); + } else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) error = cxgb_uninit_locked(p); else ADAPTER_UNLOCK(sc); ADAPTER_LOCK_ASSERT_NOTOWNED(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: ADAPTER_LOCK(sc); error = IS_DOOMED(p) ? ENXIO : (IS_BUSY(sc) ? EBUSY : 0); if (error) goto fail; - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { PORT_LOCK(p); cxgb_update_mac_settings(p); PORT_UNLOCK(p); } ADAPTER_UNLOCK(sc); break; case SIOCSIFCAP: ADAPTER_LOCK(sc); error = IS_DOOMED(p) ? ENXIO : (IS_BUSY(sc) ? 
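In the SIOCSIFFLAGS case further down, the MAC is only reprogrammed when the PROMISC or ALLMULTI bits actually changed; XORing the new interface flags against the cached copy in p->if_flags isolates the bits that differ. Stand-alone illustration of that test with invented flag values:

/* Sketch only; flag values are arbitrary stand-ins for IFF_*. */
#include <stdio.h>

#define DEMO_IFF_PROMISC  0x100
#define DEMO_IFF_ALLMULTI 0x200

static int
rx_mode_changed(int new_flags, int old_flags)
{
        int diff = new_flags ^ old_flags;       /* bits that changed */

        return ((diff & (DEMO_IFF_PROMISC | DEMO_IFF_ALLMULTI)) != 0);
}

int
main(void)
{
        printf("%d\n", rx_mode_changed(DEMO_IFF_PROMISC, 0));                  /* 1 */
        printf("%d\n", rx_mode_changed(DEMO_IFF_PROMISC, DEMO_IFF_PROMISC));   /* 0 */
        return (0);
}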
EBUSY : 0); if (error) goto fail; - mask = ifr->ifr_reqcap ^ ifp->if_capenable; + mask = ifr->ifr_reqcap ^ if_getcapenable(ifp); if (mask & IFCAP_TXCSUM) { - ifp->if_capenable ^= IFCAP_TXCSUM; - ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); + if_togglecapenable(ifp, IFCAP_TXCSUM); + if_togglehwassist(ifp, CSUM_TCP | CSUM_UDP | CSUM_IP); - if (IFCAP_TSO4 & ifp->if_capenable && - !(IFCAP_TXCSUM & ifp->if_capenable)) { + if (IFCAP_TSO4 & if_getcapenable(ifp) && + !(IFCAP_TXCSUM & if_getcapenable(ifp))) { mask &= ~IFCAP_TSO4; - ifp->if_capenable &= ~IFCAP_TSO4; + if_setcapenablebit(ifp, 0, IFCAP_TSO4); if_printf(ifp, "tso4 disabled due to -txcsum.\n"); } } if (mask & IFCAP_TXCSUM_IPV6) { - ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; - ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); + if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6); + if_togglehwassist(ifp, CSUM_UDP_IPV6 | CSUM_TCP_IPV6); - if (IFCAP_TSO6 & ifp->if_capenable && - !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { + if (IFCAP_TSO6 & if_getcapenable(ifp) && + !(IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp))) { mask &= ~IFCAP_TSO6; - ifp->if_capenable &= ~IFCAP_TSO6; + if_setcapenablebit(ifp, 0, IFCAP_TSO6); if_printf(ifp, "tso6 disabled due to -txcsum6.\n"); } } if (mask & IFCAP_RXCSUM) - ifp->if_capenable ^= IFCAP_RXCSUM; + if_togglecapenable(ifp, IFCAP_RXCSUM); if (mask & IFCAP_RXCSUM_IPV6) - ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; + if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6); /* * Note that we leave CSUM_TSO alone (it is always set). The * kernel takes both IFCAP_TSOx and CSUM_TSO into account before * sending a TSO request our way, so it's sufficient to toggle * IFCAP_TSOx only. */ if (mask & IFCAP_TSO4) { - if (!(IFCAP_TSO4 & ifp->if_capenable) && - !(IFCAP_TXCSUM & ifp->if_capenable)) { + if (!(IFCAP_TSO4 & if_getcapenable(ifp)) && + !(IFCAP_TXCSUM & if_getcapenable(ifp))) { if_printf(ifp, "enable txcsum first.\n"); error = EAGAIN; goto fail; } - ifp->if_capenable ^= IFCAP_TSO4; + if_togglecapenable(ifp, IFCAP_TSO4); } if (mask & IFCAP_TSO6) { - if (!(IFCAP_TSO6 & ifp->if_capenable) && - !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { + if (!(IFCAP_TSO6 & if_getcapenable(ifp)) && + !(IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp))) { if_printf(ifp, "enable txcsum6 first.\n"); error = EAGAIN; goto fail; } - ifp->if_capenable ^= IFCAP_TSO6; + if_togglecapenable(ifp, IFCAP_TSO6); } if (mask & IFCAP_LRO) { - ifp->if_capenable ^= IFCAP_LRO; + if_togglecapenable(ifp, IFCAP_LRO); /* Safe to do this even if cxgb_up not called yet */ - cxgb_set_lro(p, ifp->if_capenable & IFCAP_LRO); + cxgb_set_lro(p, if_getcapenable(ifp) & IFCAP_LRO); } #ifdef TCP_OFFLOAD if (mask & IFCAP_TOE4) { - int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE4; + int enable = (if_getcapenable(ifp) ^ mask) & IFCAP_TOE4; error = toe_capability(p, enable); if (error == 0) - ifp->if_capenable ^= mask; + if_togglecapenable(ifp, mask); } #endif if (mask & IFCAP_VLAN_HWTAGGING) { - ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + if_togglecapenable(ifp, IFCAP_VLAN_HWTAGGING); + if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { PORT_LOCK(p); cxgb_update_mac_settings(p); PORT_UNLOCK(p); } } if (mask & IFCAP_VLAN_MTU) { - ifp->if_capenable ^= IFCAP_VLAN_MTU; - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + if_togglecapenable(ifp, IFCAP_VLAN_MTU); + if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { PORT_LOCK(p); cxgb_update_mac_settings(p); PORT_UNLOCK(p); } } if (mask & IFCAP_VLAN_HWTSO) - ifp->if_capenable ^= IFCAP_VLAN_HWTSO; + if_togglecapenable(ifp, 
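The SIOCSIFCAP handling above enforces a dependency between capabilities: TSO4 can only be enabled while TXCSUM is on, and clearing TXCSUM drags TSO4 down with it (the IPv6 pair works the same way). A compressed sketch of that rule with made-up capability bits:

/* Sketch only; bit values do not match the real IFCAP_* constants. */
#include <stdio.h>

#define DEMO_CAP_TXCSUM 0x01
#define DEMO_CAP_TSO4   0x02

static int
toggle_caps(int *capenable, int mask)
{
        if (mask & DEMO_CAP_TXCSUM) {
                *capenable ^= DEMO_CAP_TXCSUM;
                if ((*capenable & DEMO_CAP_TSO4) &&
                    !(*capenable & DEMO_CAP_TXCSUM))
                        *capenable &= ~DEMO_CAP_TSO4;   /* tso4 off due to -txcsum */
        }
        if (mask & DEMO_CAP_TSO4) {
                if (!(*capenable & DEMO_CAP_TSO4) &&
                    !(*capenable & DEMO_CAP_TXCSUM))
                        return (-1);                    /* enable txcsum first */
                *capenable ^= DEMO_CAP_TSO4;
        }
        return (0);
}

int
main(void)
{
        int caps = 0, rc;

        rc = toggle_caps(&caps, DEMO_CAP_TSO4);         /* rejected: txcsum off */
        printf("rc=%d caps=%#x\n", rc, caps);
        toggle_caps(&caps, DEMO_CAP_TXCSUM);
        rc = toggle_caps(&caps, DEMO_CAP_TSO4);         /* accepted */
        printf("rc=%d caps=%#x\n", rc, caps);
        return (0);
}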
IFCAP_VLAN_HWTSO); if (mask & IFCAP_VLAN_HWCSUM) - ifp->if_capenable ^= IFCAP_VLAN_HWCSUM; + if_togglecapenable(ifp, IFCAP_VLAN_HWCSUM); #ifdef VLAN_CAPABILITIES VLAN_CAPABILITIES(ifp); #endif ADAPTER_UNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &p->media, command); break; default: error = ether_ioctl(ifp, command, data); } return (error); } static int -cxgb_media_change(struct ifnet *ifp) +cxgb_media_change(if_t ifp) { return (EOPNOTSUPP); } /* * Translates phy->modtype to the correct Ethernet media subtype. */ static int cxgb_ifm_type(int mod) { switch (mod) { case phy_modtype_sr: return (IFM_10G_SR); case phy_modtype_lr: return (IFM_10G_LR); case phy_modtype_lrm: return (IFM_10G_LRM); case phy_modtype_twinax: return (IFM_10G_TWINAX); case phy_modtype_twinax_long: return (IFM_10G_TWINAX_LONG); case phy_modtype_none: return (IFM_NONE); case phy_modtype_unknown: return (IFM_UNKNOWN); } KASSERT(0, ("%s: modtype %d unknown", __func__, mod)); return (IFM_UNKNOWN); } /* * Rebuilds the ifmedia list for this port, and sets the current media. */ static void cxgb_build_medialist(struct port_info *p) { struct cphy *phy = &p->phy; struct ifmedia *media = &p->media; int mod = phy->modtype; int m = IFM_ETHER | IFM_FDX; PORT_LOCK(p); ifmedia_removeall(media); if (phy->caps & SUPPORTED_TP && phy->caps & SUPPORTED_Autoneg) { /* Copper (RJ45) */ if (phy->caps & SUPPORTED_10000baseT_Full) ifmedia_add(media, m | IFM_10G_T, mod, NULL); if (phy->caps & SUPPORTED_1000baseT_Full) ifmedia_add(media, m | IFM_1000_T, mod, NULL); if (phy->caps & SUPPORTED_100baseT_Full) ifmedia_add(media, m | IFM_100_TX, mod, NULL); if (phy->caps & SUPPORTED_10baseT_Full) ifmedia_add(media, m | IFM_10_T, mod, NULL); ifmedia_add(media, IFM_ETHER | IFM_AUTO, mod, NULL); ifmedia_set(media, IFM_ETHER | IFM_AUTO); } else if (phy->caps & SUPPORTED_TP) { /* Copper (CX4) */ KASSERT(phy->caps & SUPPORTED_10000baseT_Full, ("%s: unexpected cap 0x%x", __func__, phy->caps)); ifmedia_add(media, m | IFM_10G_CX4, mod, NULL); ifmedia_set(media, m | IFM_10G_CX4); } else if (phy->caps & SUPPORTED_FIBRE && phy->caps & SUPPORTED_10000baseT_Full) { /* 10G optical (but includes SFP+ twinax) */ m |= cxgb_ifm_type(mod); if (IFM_SUBTYPE(m) == IFM_NONE) m &= ~IFM_FDX; ifmedia_add(media, m, mod, NULL); ifmedia_set(media, m); } else if (phy->caps & SUPPORTED_FIBRE && phy->caps & SUPPORTED_1000baseT_Full) { /* 1G optical */ /* XXX: Lie and claim to be SX, could actually be any 1G-X */ ifmedia_add(media, m | IFM_1000_SX, mod, NULL); ifmedia_set(media, m | IFM_1000_SX); } else { KASSERT(0, ("%s: don't know how to handle 0x%x.", __func__, phy->caps)); } PORT_UNLOCK(p); } static void -cxgb_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) +cxgb_media_status(if_t ifp, struct ifmediareq *ifmr) { - struct port_info *p = ifp->if_softc; + struct port_info *p = if_getsoftc(ifp); struct ifmedia_entry *cur = p->media.ifm_cur; int speed = p->link_config.speed; if (cur->ifm_data != p->phy.modtype) { cxgb_build_medialist(p); cur = p->media.ifm_cur; } ifmr->ifm_status = IFM_AVALID; if (!p->link_config.link_ok) return; ifmr->ifm_status |= IFM_ACTIVE; /* * active and current will differ iff current media is autoselect. That * can happen only for copper RJ45. 
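cxgb_build_medialist() classifies the port from its PHY capability bits before deciding which ifmedia entries to publish. A compressed sketch of that decision tree, returning a label instead of calling ifmedia_add(); the capability flags below are invented stand-ins for the SUPPORTED_* bits:

/* Sketch only; flag values and labels are illustrative. */
#include <stdio.h>

#define DEMO_SUP_TP      0x01
#define DEMO_SUP_AUTONEG 0x02
#define DEMO_SUP_FIBRE   0x04
#define DEMO_SUP_10G     0x08
#define DEMO_SUP_1G      0x10

static const char *
classify_port(unsigned int caps)
{
        if ((caps & DEMO_SUP_TP) && (caps & DEMO_SUP_AUTONEG))
                return ("copper RJ45, autoselect");
        if (caps & DEMO_SUP_TP)
                return ("copper CX4, fixed 10G");
        if ((caps & DEMO_SUP_FIBRE) && (caps & DEMO_SUP_10G))
                return ("10G optical / SFP+ twinax, subtype from module");
        if ((caps & DEMO_SUP_FIBRE) && (caps & DEMO_SUP_1G))
                return ("1G optical, reported as 1000-SX");
        return ("unknown");
}

int
main(void)
{
        printf("%s\n", classify_port(DEMO_SUP_TP | DEMO_SUP_AUTONEG));
        printf("%s\n", classify_port(DEMO_SUP_FIBRE | DEMO_SUP_10G));
        return (0);
}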
*/ if (IFM_SUBTYPE(cur->ifm_media) != IFM_AUTO) return; KASSERT(p->phy.caps & SUPPORTED_TP && p->phy.caps & SUPPORTED_Autoneg, ("%s: unexpected PHY caps 0x%x", __func__, p->phy.caps)); ifmr->ifm_active = IFM_ETHER | IFM_FDX; if (speed == SPEED_10000) ifmr->ifm_active |= IFM_10G_T; else if (speed == SPEED_1000) ifmr->ifm_active |= IFM_1000_T; else if (speed == SPEED_100) ifmr->ifm_active |= IFM_100_TX; else if (speed == SPEED_10) ifmr->ifm_active |= IFM_10_T; else KASSERT(0, ("%s: link up but speed unknown (%u)", __func__, speed)); } static uint64_t -cxgb_get_counter(struct ifnet *ifp, ift_counter c) +cxgb_get_counter(if_t ifp, ift_counter c) { - struct port_info *pi = ifp->if_softc; + struct port_info *pi = if_getsoftc(ifp); struct adapter *sc = pi->adapter; struct cmac *mac = &pi->mac; struct mac_stats *mstats = &mac->stats; cxgb_refresh_stats(pi); switch (c) { case IFCOUNTER_IPACKETS: return (mstats->rx_frames); case IFCOUNTER_IERRORS: return (mstats->rx_jabber + mstats->rx_data_errs + mstats->rx_sequence_errs + mstats->rx_runt + mstats->rx_too_long + mstats->rx_mac_internal_errs + mstats->rx_short + mstats->rx_fcs_errs); case IFCOUNTER_OPACKETS: return (mstats->tx_frames); case IFCOUNTER_OERRORS: return (mstats->tx_excess_collisions + mstats->tx_underrun + mstats->tx_len_errs + mstats->tx_mac_internal_errs + mstats->tx_excess_deferral + mstats->tx_fcs_errs); case IFCOUNTER_COLLISIONS: return (mstats->tx_total_collisions); case IFCOUNTER_IBYTES: return (mstats->rx_octets); case IFCOUNTER_OBYTES: return (mstats->tx_octets); case IFCOUNTER_IMCASTS: return (mstats->rx_mcast_frames); case IFCOUNTER_OMCASTS: return (mstats->tx_mcast_frames); case IFCOUNTER_IQDROPS: return (mstats->rx_cong_drops); case IFCOUNTER_OQDROPS: { int i; uint64_t drops; drops = 0; if (sc->flags & FULL_INIT_DONE) { for (i = pi->first_qset; i < pi->first_qset + pi->nqsets; i++) drops += sc->sge.qs[i].txq[TXQ_ETH].txq_mr->br_drops; } return (drops); } default: return (if_get_counter_default(ifp, c)); } } static void cxgb_async_intr(void *data) { adapter_t *sc = data; t3_write_reg(sc, A_PL_INT_ENABLE0, 0); (void) t3_read_reg(sc, A_PL_INT_ENABLE0); taskqueue_enqueue(sc->tq, &sc->slow_intr_task); } static void link_check_callout(void *arg) { struct port_info *pi = arg; struct adapter *sc = pi->adapter; if (!isset(&sc->open_device_map, pi->port_id)) return; taskqueue_enqueue(sc->tq, &pi->link_check_task); } static void check_link_status(void *arg, int pending) { struct port_info *pi = arg; struct adapter *sc = pi->adapter; if (!isset(&sc->open_device_map, pi->port_id)) return; t3_link_changed(sc, pi->port_id); if (pi->link_fault || !(pi->phy.caps & SUPPORTED_LINK_IRQ) || pi->link_config.link_ok == 0) callout_reset(&pi->link_check_ch, hz, link_check_callout, pi); } void t3_os_link_intr(struct port_info *pi) { /* * Schedule a link check in the near future. If the link is flapping * rapidly we'll keep resetting the callout and delaying the check until * things stabilize a bit. 
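The debounce described in the t3_os_link_intr() comment above works by re-arming a short callout on every link interrupt, so the actual check only runs once the link has been quiet for the whole interval. A stand-alone sketch of that re-arming pattern driven by a simulated interrupt timeline (times in milliseconds, purely illustrative):

/* Sketch only; a timestamp array stands in for callout_reset(). */
#include <stdio.h>

#define DEMO_QUIET_MS 250       /* stand-in for hz / 4 */

int
main(void)
{
        /* interrupt timestamps: a burst of flaps, then silence */
        const int events[] = { 0, 50, 90, 130, 1000 };
        const int nevents = sizeof(events) / sizeof(events[0]);
        int i, deadline = -1;

        for (i = 0; i < nevents; i++) {
                if (deadline >= 0 && events[i] >= deadline)
                        printf("link check ran at %d ms\n", deadline);
                deadline = events[i] + DEMO_QUIET_MS;   /* re-arm the timer */
        }
        printf("final link check at %d ms\n", deadline);
        return (0);
}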
*/ callout_reset(&pi->link_check_ch, hz / 4, link_check_callout, pi); } static void check_t3b2_mac(struct adapter *sc) { int i; if (sc->flags & CXGB_SHUTDOWN) return; for_each_port(sc, i) { struct port_info *p = &sc->port[i]; int status; #ifdef INVARIANTS - struct ifnet *ifp = p->ifp; + if_t ifp = p->ifp; #endif if (!isset(&sc->open_device_map, p->port_id) || p->link_fault || !p->link_config.link_ok) continue; - KASSERT(ifp->if_drv_flags & IFF_DRV_RUNNING, + KASSERT(if_getdrvflags(ifp) & IFF_DRV_RUNNING, ("%s: state mismatch (drv_flags %x, device_map %x)", - __func__, ifp->if_drv_flags, sc->open_device_map)); + __func__, if_getdrvflags(ifp), sc->open_device_map)); PORT_LOCK(p); status = t3b2_mac_watchdog_task(&p->mac); if (status == 1) p->mac.stats.num_toggled++; else if (status == 2) { struct cmac *mac = &p->mac; cxgb_update_mac_settings(p); t3_link_start(&p->phy, mac, &p->link_config); t3_mac_enable(mac, MAC_DIRECTION_RX | MAC_DIRECTION_TX); t3_port_intr_enable(sc, p->port_id); p->mac.stats.num_resets++; } PORT_UNLOCK(p); } } static void cxgb_tick(void *arg) { adapter_t *sc = (adapter_t *)arg; if (sc->flags & CXGB_SHUTDOWN) return; taskqueue_enqueue(sc->tq, &sc->tick_task); callout_reset(&sc->cxgb_tick_ch, hz, cxgb_tick, sc); } void cxgb_refresh_stats(struct port_info *pi) { struct timeval tv; const struct timeval interval = {0, 250000}; /* 250ms */ getmicrotime(&tv); timevalsub(&tv, &interval); if (timevalcmp(&tv, &pi->last_refreshed, <)) return; PORT_LOCK(pi); t3_mac_update_stats(&pi->mac); PORT_UNLOCK(pi); getmicrotime(&pi->last_refreshed); } static void cxgb_tick_handler(void *arg, int count) { adapter_t *sc = (adapter_t *)arg; const struct adapter_params *p = &sc->params; int i; uint32_t cause, reset; if (sc->flags & CXGB_SHUTDOWN || !(sc->flags & FULL_INIT_DONE)) return; if (p->rev == T3_REV_B2 && p->nports < 4 && sc->open_device_map) check_t3b2_mac(sc); cause = t3_read_reg(sc, A_SG_INT_CAUSE) & (F_RSPQSTARVE | F_FLEMPTY); if (cause) { struct sge_qset *qs = &sc->sge.qs[0]; uint32_t mask, v; v = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS) & ~0xff00; mask = 1; for (i = 0; i < SGE_QSETS; i++) { if (v & mask) qs[i].rspq.starved++; mask <<= 1; } mask <<= SGE_QSETS; /* skip RSPQXDISABLED */ for (i = 0; i < SGE_QSETS * 2; i++) { if (v & mask) { qs[i / 2].fl[i % 2].empty++; } mask <<= 1; } /* clear */ t3_write_reg(sc, A_SG_RSPQ_FL_STATUS, v); t3_write_reg(sc, A_SG_INT_CAUSE, cause); } for (i = 0; i < sc->params.nports; i++) { struct port_info *pi = &sc->port[i]; struct cmac *mac = &pi->mac; if (!isset(&sc->open_device_map, pi->port_id)) continue; cxgb_refresh_stats(pi); if (mac->multiport) continue; /* Count rx fifo overflows, once per second */ cause = t3_read_reg(sc, A_XGM_INT_CAUSE + mac->offset); reset = 0; if (cause & F_RXFIFO_OVERFLOW) { mac->stats.rx_fifo_ovfl++; reset |= F_RXFIFO_OVERFLOW; } t3_write_reg(sc, A_XGM_INT_CAUSE + mac->offset, reset); } } static void touch_bars(device_t dev) { /* * Don't enable yet */ #if !defined(__LP64__) && 0 u32 v; pci_read_config_dword(pdev, PCI_BASE_ADDRESS_1, &v); pci_write_config_dword(pdev, PCI_BASE_ADDRESS_1, v); pci_read_config_dword(pdev, PCI_BASE_ADDRESS_3, &v); pci_write_config_dword(pdev, PCI_BASE_ADDRESS_3, v); pci_read_config_dword(pdev, PCI_BASE_ADDRESS_5, &v); pci_write_config_dword(pdev, PCI_BASE_ADDRESS_5, v); #endif } static int set_eeprom(struct port_info *pi, const uint8_t *data, int len, int offset) { uint8_t *buf; int err = 0; u32 aligned_offset, aligned_len, *p; struct adapter *adapter = pi->adapter; aligned_offset = offset & ~3; 
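cxgb_refresh_stats() above rate-limits the MAC statistics read to once per 250 ms by comparing "now minus interval" with the time of the last refresh. A rough userland equivalent using CLOCK_MONOTONIC in place of getmicrotime()/timevalcmp():

/* Sketch only; nanosecond arithmetic replaces the timeval helpers. */
#include <stdio.h>
#include <time.h>

#define DEMO_REFRESH_NS (250 * 1000 * 1000LL)   /* 250 ms */

static struct timespec last_refreshed;

static int
refresh_stats(void)
{
        struct timespec now;
        long long elapsed_ns;

        clock_gettime(CLOCK_MONOTONIC, &now);
        elapsed_ns = (now.tv_sec - last_refreshed.tv_sec) * 1000000000LL +
            (now.tv_nsec - last_refreshed.tv_nsec);
        if (elapsed_ns < DEMO_REFRESH_NS)
                return (0);             /* too soon, keep cached counters */
        /* ... read the hardware counters here ... */
        last_refreshed = now;
        return (1);
}

int
main(void)
{
        int first, second;

        first = refresh_stats();        /* refreshes */
        second = refresh_stats();       /* skipped: under 250 ms since last */
        printf("%d %d\n", first, second);
        return (0);
}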
aligned_len = (len + (offset & 3) + 3) & ~3; if (aligned_offset != offset || aligned_len != len) { buf = malloc(aligned_len, M_DEVBUF, M_WAITOK|M_ZERO); if (!buf) return (ENOMEM); err = t3_seeprom_read(adapter, aligned_offset, (u32 *)buf); if (!err && aligned_len > 4) err = t3_seeprom_read(adapter, aligned_offset + aligned_len - 4, (u32 *)&buf[aligned_len - 4]); if (err) goto out; memcpy(buf + (offset & 3), data, len); } else buf = (uint8_t *)(uintptr_t)data; err = t3_seeprom_wp(adapter, 0); if (err) goto out; for (p = (u32 *)buf; !err && aligned_len; aligned_len -= 4, p++) { err = t3_seeprom_write(adapter, aligned_offset, *p); aligned_offset += 4; } if (!err) err = t3_seeprom_wp(adapter, 1); out: if (buf != data) free(buf, M_DEVBUF); return err; } static int in_range(int val, int lo, int hi) { return val < 0 || (val <= hi && val >= lo); } static int cxgb_extension_open(struct cdev *dev, int flags, int fmp, struct thread *td) { return (0); } static int cxgb_extension_close(struct cdev *dev, int flags, int fmt, struct thread *td) { return (0); } static int cxgb_extension_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, struct thread *td) { int mmd, error = 0; struct port_info *pi = dev->si_drv1; adapter_t *sc = pi->adapter; #ifdef PRIV_SUPPORTED if (priv_check(td, PRIV_DRIVER)) { if (cxgb_debug) printf("user does not have access to privileged ioctls\n"); return (EPERM); } #else if (suser(td)) { if (cxgb_debug) printf("user does not have access to privileged ioctls\n"); return (EPERM); } #endif switch (cmd) { case CHELSIO_GET_MIIREG: { uint32_t val; struct cphy *phy = &pi->phy; struct ch_mii_data *mid = (struct ch_mii_data *)data; if (!phy->mdio_read) return (EOPNOTSUPP); if (is_10G(sc)) { mmd = mid->phy_id >> 8; if (!mmd) mmd = MDIO_DEV_PCS; else if (mmd > MDIO_DEV_VEND2) return (EINVAL); error = phy->mdio_read(sc, mid->phy_id & 0x1f, mmd, mid->reg_num, &val); } else error = phy->mdio_read(sc, mid->phy_id & 0x1f, 0, mid->reg_num & 0x1f, &val); if (error == 0) mid->val_out = val; break; } case CHELSIO_SET_MIIREG: { struct cphy *phy = &pi->phy; struct ch_mii_data *mid = (struct ch_mii_data *)data; if (!phy->mdio_write) return (EOPNOTSUPP); if (is_10G(sc)) { mmd = mid->phy_id >> 8; if (!mmd) mmd = MDIO_DEV_PCS; else if (mmd > MDIO_DEV_VEND2) return (EINVAL); error = phy->mdio_write(sc, mid->phy_id & 0x1f, mmd, mid->reg_num, mid->val_in); } else error = phy->mdio_write(sc, mid->phy_id & 0x1f, 0, mid->reg_num & 0x1f, mid->val_in); break; } case CHELSIO_SETREG: { struct ch_reg *edata = (struct ch_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); t3_write_reg(sc, edata->addr, edata->val); break; } case CHELSIO_GETREG: { struct ch_reg *edata = (struct ch_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); edata->val = t3_read_reg(sc, edata->addr); break; } case CHELSIO_GET_SGE_CONTEXT: { struct ch_cntxt *ecntxt = (struct ch_cntxt *)data; mtx_lock_spin(&sc->sge.reg_lock); switch (ecntxt->cntxt_type) { case CNTXT_TYPE_EGRESS: error = -t3_sge_read_ecntxt(sc, ecntxt->cntxt_id, ecntxt->data); break; case CNTXT_TYPE_FL: error = -t3_sge_read_fl(sc, ecntxt->cntxt_id, ecntxt->data); break; case CNTXT_TYPE_RSP: error = -t3_sge_read_rspq(sc, ecntxt->cntxt_id, ecntxt->data); break; case CNTXT_TYPE_CQ: error = -t3_sge_read_cq(sc, ecntxt->cntxt_id, ecntxt->data); break; default: error = EINVAL; break; } mtx_unlock_spin(&sc->sge.reg_lock); break; } case CHELSIO_GET_SGE_DESC: { struct ch_desc *edesc = (struct ch_desc 
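set_eeprom() can only write whole 32-bit words, so an unaligned update is widened to word boundaries and the partial words at each end are read back first (read-modify-write). A sketch of the same alignment arithmetic against an in-memory byte array standing in for the EEPROM; the real code goes through t3_seeprom_read()/t3_seeprom_write():

/* Sketch only; a byte array plays the role of the serial EEPROM. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

static uint8_t eeprom[32] = "0123456789abcdefghijklmnopqrstu";

static void
eeprom_write(const uint8_t *data, int len, int offset)
{
        uint8_t buf[32];
        unsigned int aligned_offset = offset & ~3;
        unsigned int aligned_len = (len + (offset & 3) + 3) & ~3;

        /* preserve the bytes around the update in the edge words */
        memcpy(buf, &eeprom[aligned_offset], aligned_len);
        memcpy(buf + (offset & 3), data, len);
        /* write back whole words only */
        memcpy(&eeprom[aligned_offset], buf, aligned_len);
}

int
main(void)
{
        eeprom_write((const uint8_t *)"XYZ", 3, 5);
        printf("%s\n", eeprom);         /* 01234XYZ89abc... */
        return (0);
}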
*)data; int ret; if (edesc->queue_num >= SGE_QSETS * 6) return (EINVAL); ret = t3_get_desc(&sc->sge.qs[edesc->queue_num / 6], edesc->queue_num % 6, edesc->idx, edesc->data); if (ret < 0) return (EINVAL); edesc->size = ret; break; } case CHELSIO_GET_QSET_PARAMS: { struct qset_params *q; struct ch_qset_params *t = (struct ch_qset_params *)data; int q1 = pi->first_qset; int nqsets = pi->nqsets; int i; if (t->qset_idx >= nqsets) return EINVAL; i = q1 + t->qset_idx; q = &sc->params.sge.qset[i]; t->rspq_size = q->rspq_size; t->txq_size[0] = q->txq_size[0]; t->txq_size[1] = q->txq_size[1]; t->txq_size[2] = q->txq_size[2]; t->fl_size[0] = q->fl_size; t->fl_size[1] = q->jumbo_size; t->polling = q->polling; t->lro = q->lro; t->intr_lat = q->coalesce_usecs; t->cong_thres = q->cong_thres; t->qnum = i; if ((sc->flags & FULL_INIT_DONE) == 0) t->vector = 0; else if (sc->flags & USING_MSIX) t->vector = rman_get_start(sc->msix_irq_res[i]); else t->vector = rman_get_start(sc->irq_res); break; } case CHELSIO_GET_QSET_NUM: { struct ch_reg *edata = (struct ch_reg *)data; edata->val = pi->nqsets; break; } case CHELSIO_LOAD_FW: { uint8_t *fw_data; uint32_t vers; struct ch_mem_range *t = (struct ch_mem_range *)data; /* * You're allowed to load a firmware only before FULL_INIT_DONE * * FW_UPTODATE is also set so the rest of the initialization * will not overwrite what was loaded here. This gives you the * flexibility to load any firmware (and maybe shoot yourself in * the foot). */ ADAPTER_LOCK(sc); if (sc->open_device_map || sc->flags & FULL_INIT_DONE) { ADAPTER_UNLOCK(sc); return (EBUSY); } fw_data = malloc(t->len, M_DEVBUF, M_NOWAIT); if (!fw_data) error = ENOMEM; else error = copyin(t->buf, fw_data, t->len); if (!error) error = -t3_load_fw(sc, fw_data, t->len); if (t3_get_fw_version(sc, &vers) == 0) { snprintf(&sc->fw_version[0], sizeof(sc->fw_version), "%d.%d.%d", G_FW_VERSION_MAJOR(vers), G_FW_VERSION_MINOR(vers), G_FW_VERSION_MICRO(vers)); } if (!error) sc->flags |= FW_UPTODATE; free(fw_data, M_DEVBUF); ADAPTER_UNLOCK(sc); break; } case CHELSIO_LOAD_BOOT: { uint8_t *boot_data; struct ch_mem_range *t = (struct ch_mem_range *)data; boot_data = malloc(t->len, M_DEVBUF, M_NOWAIT); if (!boot_data) return ENOMEM; error = copyin(t->buf, boot_data, t->len); if (!error) error = -t3_load_boot(sc, boot_data, t->len); free(boot_data, M_DEVBUF); break; } case CHELSIO_GET_PM: { struct ch_pm *m = (struct ch_pm *)data; struct tp_params *p = &sc->params.tp; if (!is_offload(sc)) return (EOPNOTSUPP); m->tx_pg_sz = p->tx_pg_size; m->tx_num_pg = p->tx_num_pgs; m->rx_pg_sz = p->rx_pg_size; m->rx_num_pg = p->rx_num_pgs; m->pm_total = p->pmtx_size + p->chan_rx_size * p->nchan; break; } case CHELSIO_SET_PM: { struct ch_pm *m = (struct ch_pm *)data; struct tp_params *p = &sc->params.tp; if (!is_offload(sc)) return (EOPNOTSUPP); if (sc->flags & FULL_INIT_DONE) return (EBUSY); if (!m->rx_pg_sz || (m->rx_pg_sz & (m->rx_pg_sz - 1)) || !m->tx_pg_sz || (m->tx_pg_sz & (m->tx_pg_sz - 1))) return (EINVAL); /* not power of 2 */ if (!(m->rx_pg_sz & 0x14000)) return (EINVAL); /* not 16KB or 64KB */ if (!(m->tx_pg_sz & 0x1554000)) return (EINVAL); if (m->tx_num_pg == -1) m->tx_num_pg = p->tx_num_pgs; if (m->rx_num_pg == -1) m->rx_num_pg = p->rx_num_pgs; if (m->tx_num_pg % 24 || m->rx_num_pg % 24) return (EINVAL); if (m->rx_num_pg * m->rx_pg_sz > p->chan_rx_size || m->tx_num_pg * m->tx_pg_sz > p->chan_tx_size) return (EINVAL); p->rx_pg_size = m->rx_pg_sz; p->tx_pg_size = m->tx_pg_sz; p->rx_num_pgs = m->rx_num_pg; p->tx_num_pgs = m->tx_num_pg; 
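The CHELSIO_SET_PM validation below checks page sizes in two steps: the value must be a power of two (x & (x - 1) == 0), and it must be one of the sizes the hardware supports, tested by ANDing against a mask of allowed size bits (0x14000 covers 16 KB and 64 KB on the RX side). Stand-alone version of the RX check:

/* Sketch only; mirrors the power-of-two and allowed-size tests. */
#include <stdio.h>

static int
rx_pg_sz_ok(unsigned int sz)
{
        if (sz == 0 || (sz & (sz - 1)) != 0)
                return (0);             /* not a power of 2 */
        if ((sz & 0x14000) == 0)
                return (0);             /* not 16 KB or 64 KB */
        return (1);
}

int
main(void)
{
        printf("%d %d %d\n",
            rx_pg_sz_ok(16 * 1024),     /* 1 */
            rx_pg_sz_ok(64 * 1024),     /* 1 */
            rx_pg_sz_ok(32 * 1024));    /* 0 */
        return (0);
}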
break; } case CHELSIO_SETMTUTAB: { struct ch_mtus *m = (struct ch_mtus *)data; int i; if (!is_offload(sc)) return (EOPNOTSUPP); if (offload_running(sc)) return (EBUSY); if (m->nmtus != NMTUS) return (EINVAL); if (m->mtus[0] < 81) /* accommodate SACK */ return (EINVAL); /* * MTUs must be in ascending order */ for (i = 1; i < NMTUS; ++i) if (m->mtus[i] < m->mtus[i - 1]) return (EINVAL); memcpy(sc->params.mtus, m->mtus, sizeof(sc->params.mtus)); break; } case CHELSIO_GETMTUTAB: { struct ch_mtus *m = (struct ch_mtus *)data; if (!is_offload(sc)) return (EOPNOTSUPP); memcpy(m->mtus, sc->params.mtus, sizeof(m->mtus)); m->nmtus = NMTUS; break; } case CHELSIO_GET_MEM: { struct ch_mem_range *t = (struct ch_mem_range *)data; struct mc7 *mem; uint8_t *useraddr; u64 buf[32]; /* * Use these to avoid modifying len/addr in the return * struct */ uint32_t len = t->len, addr = t->addr; if (!is_offload(sc)) return (EOPNOTSUPP); if (!(sc->flags & FULL_INIT_DONE)) return (EIO); /* need the memory controllers */ if ((addr & 0x7) || (len & 0x7)) return (EINVAL); if (t->mem_id == MEM_CM) mem = &sc->cm; else if (t->mem_id == MEM_PMRX) mem = &sc->pmrx; else if (t->mem_id == MEM_PMTX) mem = &sc->pmtx; else return (EINVAL); /* * Version scheme: * bits 0..9: chip version * bits 10..15: chip revision */ t->version = 3 | (sc->params.rev << 10); /* * Read 256 bytes at a time as len can be large and we don't * want to use huge intermediate buffers. */ useraddr = (uint8_t *)t->buf; while (len) { unsigned int chunk = min(len, sizeof(buf)); error = t3_mc7_bd_read(mem, addr / 8, chunk / 8, buf); if (error) return (-error); if (copyout(buf, useraddr, chunk)) return (EFAULT); useraddr += chunk; addr += chunk; len -= chunk; } break; } case CHELSIO_READ_TCAM_WORD: { struct ch_tcam_word *t = (struct ch_tcam_word *)data; if (!is_offload(sc)) return (EOPNOTSUPP); if (!(sc->flags & FULL_INIT_DONE)) return (EIO); /* need MC5 */ return -t3_read_mc5_range(&sc->mc5, t->addr, 1, t->buf); break; } case CHELSIO_SET_TRACE_FILTER: { struct ch_trace *t = (struct ch_trace *)data; const struct trace_params *tp; tp = (const struct trace_params *)&t->sip; if (t->config_tx) t3_config_trace_filter(sc, tp, 0, t->invert_match, t->trace_tx); if (t->config_rx) t3_config_trace_filter(sc, tp, 1, t->invert_match, t->trace_rx); break; } case CHELSIO_SET_PKTSCHED: { struct ch_pktsched_params *p = (struct ch_pktsched_params *)data; if (sc->open_device_map == 0) return (EAGAIN); send_pktsched_cmd(sc, p->sched, p->idx, p->min, p->max, p->binding); break; } case CHELSIO_IFCONF_GETREGS: { struct ch_ifconf_regs *regs = (struct ch_ifconf_regs *)data; int reglen = cxgb_get_regs_len(); uint8_t *buf = malloc(reglen, M_DEVBUF, M_NOWAIT); if (buf == NULL) { return (ENOMEM); } if (regs->len > reglen) regs->len = reglen; else if (regs->len < reglen) error = ENOBUFS; if (!error) { cxgb_get_regs(sc, regs, buf); error = copyout(buf, regs->data, reglen); } free(buf, M_DEVBUF); break; } case CHELSIO_SET_HW_SCHED: { struct ch_hw_sched *t = (struct ch_hw_sched *)data; unsigned int ticks_per_usec = core_ticks_per_usec(sc); if ((sc->flags & FULL_INIT_DONE) == 0) return (EAGAIN); /* need TP to be initialized */ if (t->sched >= NTX_SCHED || !in_range(t->mode, 0, 1) || !in_range(t->channel, 0, 1) || !in_range(t->kbps, 0, 10000000) || !in_range(t->class_ipg, 0, 10000 * 65535 / ticks_per_usec) || !in_range(t->flow_ipg, 0, dack_ticks_to_usec(sc, 0x7ff))) return (EINVAL); if (t->kbps >= 0) { error = t3_config_sched(sc, t->kbps, t->sched); if (error < 0) return (-error); } if 
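CHELSIO_GET_MEM above copies controller memory out through a fixed 256-byte bounce buffer so that an arbitrarily large request never needs a large intermediate allocation. Sketch of that chunking loop, copying between two plain arrays in place of t3_mc7_bd_read() and copyout():

/* Sketch only; in-memory arrays stand in for adapter and user memory. */
#include <stdio.h>
#include <string.h>

#define DEMO_CHUNK 256

static unsigned char source[1000];      /* stands in for adapter memory */
static unsigned char dest[1000];        /* stands in for the user buffer */

int
main(void)
{
        unsigned char buf[DEMO_CHUNK];
        unsigned int len = sizeof(source), off = 0;

        memset(source, 0xab, sizeof(source));
        while (len) {
                unsigned int chunk =
                    len < (unsigned int)sizeof(buf) ? len : (unsigned int)sizeof(buf);

                memcpy(buf, &source[off], chunk);       /* "t3_mc7_bd_read" */
                memcpy(&dest[off], buf, chunk);         /* "copyout" */
                off += chunk;
                len -= chunk;
        }
        printf("copied, last byte %#x\n", dest[sizeof(dest) - 1]);
        return (0);
}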
(t->class_ipg >= 0) t3_set_sched_ipg(sc, t->sched, t->class_ipg); if (t->flow_ipg >= 0) { t->flow_ipg *= 1000; /* us -> ns */ t3_set_pace_tbl(sc, &t->flow_ipg, t->sched, 1); } if (t->mode >= 0) { int bit = 1 << (S_TX_MOD_TIMER_MODE + t->sched); t3_set_reg_field(sc, A_TP_TX_MOD_QUEUE_REQ_MAP, bit, t->mode ? bit : 0); } if (t->channel >= 0) t3_set_reg_field(sc, A_TP_TX_MOD_QUEUE_REQ_MAP, 1 << t->sched, t->channel << t->sched); break; } case CHELSIO_GET_EEPROM: { int i; struct ch_eeprom *e = (struct ch_eeprom *)data; uint8_t *buf; if (e->offset & 3 || e->offset >= EEPROMSIZE || e->len > EEPROMSIZE || e->offset + e->len > EEPROMSIZE) { return (EINVAL); } buf = malloc(EEPROMSIZE, M_DEVBUF, M_NOWAIT); if (buf == NULL) { return (ENOMEM); } e->magic = EEPROM_MAGIC; for (i = e->offset & ~3; !error && i < e->offset + e->len; i += 4) error = -t3_seeprom_read(sc, i, (uint32_t *)&buf[i]); if (!error) error = copyout(buf + e->offset, e->data, e->len); free(buf, M_DEVBUF); break; } case CHELSIO_CLEAR_STATS: { if (!(sc->flags & FULL_INIT_DONE)) return EAGAIN; PORT_LOCK(pi); t3_mac_update_stats(&pi->mac); memset(&pi->mac.stats, 0, sizeof(pi->mac.stats)); PORT_UNLOCK(pi); break; } case CHELSIO_GET_UP_LA: { struct ch_up_la *la = (struct ch_up_la *)data; uint8_t *buf = malloc(LA_BUFSIZE, M_DEVBUF, M_NOWAIT); if (buf == NULL) { return (ENOMEM); } if (la->bufsize < LA_BUFSIZE) error = ENOBUFS; if (!error) error = -t3_get_up_la(sc, &la->stopped, &la->idx, &la->bufsize, buf); if (!error) error = copyout(buf, la->data, la->bufsize); free(buf, M_DEVBUF); break; } case CHELSIO_GET_UP_IOQS: { struct ch_up_ioqs *ioqs = (struct ch_up_ioqs *)data; uint8_t *buf = malloc(IOQS_BUFSIZE, M_DEVBUF, M_NOWAIT); uint32_t *v; if (buf == NULL) { return (ENOMEM); } if (ioqs->bufsize < IOQS_BUFSIZE) error = ENOBUFS; if (!error) error = -t3_get_up_ioqs(sc, &ioqs->bufsize, buf); if (!error) { v = (uint32_t *)buf; ioqs->ioq_rx_enable = *v++; ioqs->ioq_tx_enable = *v++; ioqs->ioq_rx_status = *v++; ioqs->ioq_tx_status = *v++; error = copyout(v, ioqs->data, ioqs->bufsize); } free(buf, M_DEVBUF); break; } case CHELSIO_SET_FILTER: { struct ch_filter *f = (struct ch_filter *)data; struct filter_info *p; unsigned int nfilters = sc->params.mc5.nfilters; if (!is_offload(sc)) return (EOPNOTSUPP); /* No TCAM */ if (!(sc->flags & FULL_INIT_DONE)) return (EAGAIN); /* mc5 not setup yet */ if (nfilters == 0) return (EBUSY); /* TOE will use TCAM */ /* sanity checks */ if (f->filter_id >= nfilters || (f->val.dip && f->mask.dip != 0xffffffff) || (f->val.sport && f->mask.sport != 0xffff) || (f->val.dport && f->mask.dport != 0xffff) || (f->val.vlan && f->mask.vlan != 0xfff) || (f->val.vlan_prio && f->mask.vlan_prio != FILTER_NO_VLAN_PRI) || (f->mac_addr_idx != 0xffff && f->mac_addr_idx > 15) || f->qset >= SGE_QSETS || sc->rrss_map[f->qset] >= RSS_TABLE_SIZE) return (EINVAL); /* Was allocated with M_WAITOK */ KASSERT(sc->filters, ("filter table NULL\n")); p = &sc->filters[f->filter_id]; if (p->locked) return (EPERM); bzero(p, sizeof(*p)); p->sip = f->val.sip; p->sip_mask = f->mask.sip; p->dip = f->val.dip; p->sport = f->val.sport; p->dport = f->val.dport; p->vlan = f->mask.vlan ? f->val.vlan : 0xfff; p->vlan_prio = f->mask.vlan_prio ? 
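The CHELSIO_SET_FILTER sanity checks further down only accept exact-match fields: whenever a value is supplied, its mask must be the all-ones mask for that field. A compressed sketch of the idea with a reduced filter structure (only a few of the real fields):

/* Sketch only; the real check covers more fields and the qset/RSS map. */
#include <stdio.h>
#include <stdint.h>

struct demo_filter {
        uint32_t dip, dip_mask;
        uint16_t dport, dport_mask;
        uint16_t vlan, vlan_mask;
};

static int
filter_ok(const struct demo_filter *f)
{
        if (f->dip != 0 && f->dip_mask != 0xffffffff)
                return (0);
        if (f->dport != 0 && f->dport_mask != 0xffff)
                return (0);
        if (f->vlan != 0 && f->vlan_mask != 0xfff)
                return (0);
        return (1);
}

int
main(void)
{
        struct demo_filter good = { 0x0a000001, 0xffffffff, 80, 0xffff, 0, 0 };
        struct demo_filter bad  = { 0x0a000001, 0xffffff00, 80, 0xffff, 0, 0 };

        printf("%d %d\n", filter_ok(&good), filter_ok(&bad));   /* 1 0 */
        return (0);
}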
(f->val.vlan_prio & 6) : FILTER_NO_VLAN_PRI; p->mac_hit = f->mac_hit; p->mac_vld = f->mac_addr_idx != 0xffff; p->mac_idx = f->mac_addr_idx; p->pkt_type = f->proto; p->report_filter_id = f->want_filter_id; p->pass = f->pass; p->rss = f->rss; p->qset = f->qset; error = set_filter(sc, f->filter_id, p); if (error == 0) p->valid = 1; break; } case CHELSIO_DEL_FILTER: { struct ch_filter *f = (struct ch_filter *)data; struct filter_info *p; unsigned int nfilters = sc->params.mc5.nfilters; if (!is_offload(sc)) return (EOPNOTSUPP); if (!(sc->flags & FULL_INIT_DONE)) return (EAGAIN); if (nfilters == 0 || sc->filters == NULL) return (EINVAL); if (f->filter_id >= nfilters) return (EINVAL); p = &sc->filters[f->filter_id]; if (p->locked) return (EPERM); if (!p->valid) return (EFAULT); /* Read "Bad address" as "Bad index" */ bzero(p, sizeof(*p)); p->sip = p->sip_mask = 0xffffffff; p->vlan = 0xfff; p->vlan_prio = FILTER_NO_VLAN_PRI; p->pkt_type = 1; error = set_filter(sc, f->filter_id, p); break; } case CHELSIO_GET_FILTER: { struct ch_filter *f = (struct ch_filter *)data; struct filter_info *p; unsigned int i, nfilters = sc->params.mc5.nfilters; if (!is_offload(sc)) return (EOPNOTSUPP); if (!(sc->flags & FULL_INIT_DONE)) return (EAGAIN); if (nfilters == 0 || sc->filters == NULL) return (EINVAL); i = f->filter_id == 0xffffffff ? 0 : f->filter_id + 1; for (; i < nfilters; i++) { p = &sc->filters[i]; if (!p->valid) continue; bzero(f, sizeof(*f)); f->filter_id = i; f->val.sip = p->sip; f->mask.sip = p->sip_mask; f->val.dip = p->dip; f->mask.dip = p->dip ? 0xffffffff : 0; f->val.sport = p->sport; f->mask.sport = p->sport ? 0xffff : 0; f->val.dport = p->dport; f->mask.dport = p->dport ? 0xffff : 0; f->val.vlan = p->vlan == 0xfff ? 0 : p->vlan; f->mask.vlan = p->vlan == 0xfff ? 0 : 0xfff; f->val.vlan_prio = p->vlan_prio == FILTER_NO_VLAN_PRI ? 0 : p->vlan_prio; f->mask.vlan_prio = p->vlan_prio == FILTER_NO_VLAN_PRI ? 0 : FILTER_NO_VLAN_PRI; f->mac_hit = p->mac_hit; f->mac_addr_idx = p->mac_vld ? p->mac_idx : 0xffff; f->proto = p->pkt_type; f->want_filter_id = p->report_filter_id; f->pass = p->pass; f->rss = p->rss; f->qset = p->qset; break; } if (i == nfilters) f->filter_id = 0xffffffff; break; } default: return (EOPNOTSUPP); break; } return (error); } static __inline void reg_block_dump(struct adapter *ap, uint8_t *buf, unsigned int start, unsigned int end) { uint32_t *p = (uint32_t *)(buf + start); for ( ; start <= end; start += sizeof(uint32_t)) *p++ = t3_read_reg(ap, start); } #define T3_REGMAP_SIZE (3 * 1024) static int cxgb_get_regs_len(void) { return T3_REGMAP_SIZE; } static void cxgb_get_regs(adapter_t *sc, struct ch_ifconf_regs *regs, uint8_t *buf) { /* * Version scheme: * bits 0..9: chip version * bits 10..15: chip revision * bit 31: set for PCIe cards */ regs->version = 3 | (sc->params.rev << 10) | (is_pcie(sc) << 31); /* * We skip the MAC statistics registers because they are clear-on-read. * Also reading multi-register stats would need to synchronize with the * periodic mac stats accumulation. Hard to justify the complexity. 
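cxgb_get_regs() below packs its version word as chip version in bits 0..9, chip revision in bits 10..15 and a PCIe flag in bit 31, per its comment. Encode/decode sketch of that layout with illustrative values:

/* Sketch only; field widths follow the comment in cxgb_get_regs(). */
#include <stdio.h>
#include <stdint.h>

static uint32_t
encode_version(unsigned int chip, unsigned int rev, int is_pcie)
{
        return (chip | (rev << 10) | ((uint32_t)(is_pcie != 0) << 31));
}

int
main(void)
{
        uint32_t v = encode_version(3, 2, 1);   /* illustrative values */

        printf("chip %u rev %u pcie %u\n",
            v & 0x3ff, (v >> 10) & 0x3f, v >> 31);
        return (0);
}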
*/ memset(buf, 0, cxgb_get_regs_len()); reg_block_dump(sc, buf, 0, A_SG_RSPQ_CREDIT_RETURN); reg_block_dump(sc, buf, A_SG_HI_DRB_HI_THRSH, A_ULPRX_PBL_ULIMIT); reg_block_dump(sc, buf, A_ULPTX_CONFIG, A_MPS_INT_CAUSE); reg_block_dump(sc, buf, A_CPL_SWITCH_CNTRL, A_CPL_MAP_TBL_DATA); reg_block_dump(sc, buf, A_SMB_GLOBAL_TIME_CFG, A_XGM_SERDES_STAT3); reg_block_dump(sc, buf, A_XGM_SERDES_STATUS0, XGM_REG(A_XGM_SERDES_STAT3, 1)); reg_block_dump(sc, buf, XGM_REG(A_XGM_SERDES_STATUS0, 1), XGM_REG(A_XGM_RX_SPI4_SOP_EOP_CNT, 1)); } static int alloc_filters(struct adapter *sc) { struct filter_info *p; unsigned int nfilters = sc->params.mc5.nfilters; if (nfilters == 0) return (0); p = malloc(sizeof(*p) * nfilters, M_DEVBUF, M_WAITOK | M_ZERO); sc->filters = p; p = &sc->filters[nfilters - 1]; p->vlan = 0xfff; p->vlan_prio = FILTER_NO_VLAN_PRI; p->pass = p->rss = p->valid = p->locked = 1; return (0); } static int setup_hw_filters(struct adapter *sc) { int i, rc; unsigned int nfilters = sc->params.mc5.nfilters; if (!sc->filters) return (0); t3_enable_filters(sc); for (i = rc = 0; i < nfilters && !rc; i++) { if (sc->filters[i].locked) rc = set_filter(sc, i, &sc->filters[i]); } return (rc); } static int set_filter(struct adapter *sc, int id, const struct filter_info *f) { int len; struct mbuf *m; struct ulp_txpkt *txpkt; struct work_request_hdr *wr; struct cpl_pass_open_req *oreq; struct cpl_set_tcb_field *sreq; len = sizeof(*wr) + sizeof(*oreq) + 2 * sizeof(*sreq); KASSERT(len <= MHLEN, ("filter request too big for an mbuf")); id += t3_mc5_size(&sc->mc5) - sc->params.mc5.nroutes - sc->params.mc5.nfilters; m = m_gethdr(M_WAITOK, MT_DATA); m->m_len = m->m_pkthdr.len = len; bzero(mtod(m, char *), len); wr = mtod(m, struct work_request_hdr *); wr->wrh_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC); oreq = (struct cpl_pass_open_req *)(wr + 1); txpkt = (struct ulp_txpkt *)oreq; txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*oreq) / 8)); OPCODE_TID(oreq) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, id)); oreq->local_port = htons(f->dport); oreq->peer_port = htons(f->sport); oreq->local_ip = htonl(f->dip); oreq->peer_ip = htonl(f->sip); oreq->peer_netmask = htonl(f->sip_mask); oreq->opt0h = 0; oreq->opt0l = htonl(F_NO_OFFLOAD); oreq->opt1 = htonl(V_MAC_MATCH_VALID(f->mac_vld) | V_CONN_POLICY(CPL_CONN_POLICY_FILTER) | V_VLAN_PRI(f->vlan_prio >> 1) | V_VLAN_PRI_VALID(f->vlan_prio != FILTER_NO_VLAN_PRI) | V_PKT_TYPE(f->pkt_type) | V_OPT1_VLAN(f->vlan) | V_MAC_MATCH(f->mac_idx | (f->mac_hit << 4))); sreq = (struct cpl_set_tcb_field *)(oreq + 1); set_tcb_field_ulp(sreq, id, 1, 0x1800808000ULL, (f->report_filter_id << 15) | (1 << 23) | ((u64)f->pass << 35) | ((u64)!f->rss << 36)); set_tcb_field_ulp(sreq + 1, id, 0, 0xffffffff, (2 << 19) | 1); t3_mgmt_tx(sc, m); if (f->pass && !f->rss) { len = sizeof(*sreq); m = m_gethdr(M_WAITOK, MT_DATA); m->m_len = m->m_pkthdr.len = len; bzero(mtod(m, char *), len); sreq = mtod(m, struct cpl_set_tcb_field *); sreq->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); mk_set_tcb_field(sreq, id, 25, 0x3f80000, (u64)sc->rrss_map[f->qset] << 19); t3_mgmt_tx(sc, m); } return 0; } static inline void mk_set_tcb_field(struct cpl_set_tcb_field *req, unsigned int tid, unsigned int word, u64 mask, u64 val) { OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); req->reply = V_NO_REPLY(1); req->cpu_idx = 0; req->word = htons(word); req->mask = htobe64(mask); req->val = htobe64(val); } static inline void set_tcb_field_ulp(struct 
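set_filter() below stitches one atomic work request out of a WR header, a CPL_PASS_OPEN_REQ and two CPL_SET_TCB_FIELDs, and each embedded ULP_TXPKT advertises its length in 8-byte flits (sizeof(req) / 8). A tiny sketch of that size bookkeeping; the structure sizes here are invented, flit-aligned stand-ins, not the real CPL sizes:

/* Sketch only; byte counts are illustrative. */
#include <stdio.h>

#define FLIT_SIZE 8     /* one flit = 8 bytes */

int
main(void)
{
        unsigned int wr_hdr = 8, pass_open_req = 40, set_tcb_field = 24;
        unsigned int len = wr_hdr + pass_open_req + 2 * set_tcb_field;

        printf("request bytes %u, pass_open flits %u, set_tcb flits %u\n",
            len, pass_open_req / FLIT_SIZE, set_tcb_field / FLIT_SIZE);
        return (0);
}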
cpl_set_tcb_field *req, unsigned int tid, unsigned int word, u64 mask, u64 val) { struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); mk_set_tcb_field(req, tid, word, mask, val); } void t3_iterate(void (*func)(struct adapter *, void *), void *arg) { struct adapter *sc; mtx_lock(&t3_list_lock); SLIST_FOREACH(sc, &t3_list, link) { /* * func should not make any assumptions about what state sc is * in - the only guarantee is that sc->sc_lock is a valid lock. */ func(sc, arg); } mtx_unlock(&t3_list_lock); } #ifdef TCP_OFFLOAD static int toe_capability(struct port_info *pi, int enable) { int rc; struct adapter *sc = pi->adapter; ADAPTER_LOCK_ASSERT_OWNED(sc); if (!is_offload(sc)) return (ENODEV); if (enable) { if (!(sc->flags & FULL_INIT_DONE)) { log(LOG_WARNING, "You must enable a cxgb interface first\n"); return (EAGAIN); } if (isset(&sc->offload_map, pi->port_id)) return (0); if (!(sc->flags & TOM_INIT_DONE)) { rc = t3_activate_uld(sc, ULD_TOM); if (rc == EAGAIN) { log(LOG_WARNING, "You must kldload t3_tom.ko before trying " "to enable TOE on a cxgb interface.\n"); } if (rc != 0) return (rc); KASSERT(sc->tom_softc != NULL, ("%s: TOM activated but softc NULL", __func__)); KASSERT(sc->flags & TOM_INIT_DONE, ("%s: TOM activated but flag not set", __func__)); } setbit(&sc->offload_map, pi->port_id); /* * XXX: Temporary code to allow iWARP to be enabled when TOE is * enabled on any port. Need to figure out how to enable, * disable, load, and unload iWARP cleanly. */ if (!isset(&sc->offload_map, MAX_NPORTS) && t3_activate_uld(sc, ULD_IWARP) == 0) setbit(&sc->offload_map, MAX_NPORTS); } else { if (!isset(&sc->offload_map, pi->port_id)) return (0); KASSERT(sc->flags & TOM_INIT_DONE, ("%s: TOM never initialized?", __func__)); clrbit(&sc->offload_map, pi->port_id); } return (0); } /* * Add an upper layer driver to the global list. */ int t3_register_uld(struct uld_info *ui) { int rc = 0; struct uld_info *u; mtx_lock(&t3_uld_list_lock); SLIST_FOREACH(u, &t3_uld_list, link) { if (u->uld_id == ui->uld_id) { rc = EEXIST; goto done; } } SLIST_INSERT_HEAD(&t3_uld_list, ui, link); ui->refcount = 0; done: mtx_unlock(&t3_uld_list_lock); return (rc); } int t3_unregister_uld(struct uld_info *ui) { int rc = EINVAL; struct uld_info *u; mtx_lock(&t3_uld_list_lock); SLIST_FOREACH(u, &t3_uld_list, link) { if (u == ui) { if (ui->refcount > 0) { rc = EBUSY; goto done; } SLIST_REMOVE(&t3_uld_list, ui, uld_info, link); rc = 0; goto done; } } done: mtx_unlock(&t3_uld_list_lock); return (rc); } int t3_activate_uld(struct adapter *sc, int id) { int rc = EAGAIN; struct uld_info *ui; mtx_lock(&t3_uld_list_lock); SLIST_FOREACH(ui, &t3_uld_list, link) { if (ui->uld_id == id) { rc = ui->activate(sc); if (rc == 0) ui->refcount++; goto done; } } done: mtx_unlock(&t3_uld_list_lock); return (rc); } int t3_deactivate_uld(struct adapter *sc, int id) { int rc = EINVAL; struct uld_info *ui; mtx_lock(&t3_uld_list_lock); SLIST_FOREACH(ui, &t3_uld_list, link) { if (ui->uld_id == id) { rc = ui->deactivate(sc); if (rc == 0) ui->refcount--; goto done; } } done: mtx_unlock(&t3_uld_list_lock); return (rc); } static int cpl_not_handled(struct sge_qset *qs __unused, struct rsp_desc *r __unused, struct mbuf *m) { m_freem(m); return (EDOOFUS); } int t3_register_cpl_handler(struct adapter *sc, int opcode, cpl_handler_t h) { uintptr_t *loc, new; if (opcode >= NUM_CPL_HANDLERS) return (EINVAL); new = h ? 
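The upper-layer-driver hooks below keep a global list of registered ULDs whose refcount counts activations; unregistering is refused with EBUSY while the count is non-zero, and activating an unregistered ULD fails with EAGAIN (the module is not loaded). Minimal single-threaded sketch of that registry pattern, with no locking and unlinking omitted:

/* Sketch only; a plain list and error codes stand in for the driver's. */
#include <stdio.h>
#include <errno.h>

struct demo_uld {
        int id;
        int refcount;
        struct demo_uld *next;
};

static struct demo_uld *uld_list;

static int
uld_register(struct demo_uld *u)
{
        struct demo_uld *p;

        for (p = uld_list; p != NULL; p = p->next)
                if (p->id == u->id)
                        return (EEXIST);
        u->refcount = 0;
        u->next = uld_list;
        uld_list = u;
        return (0);
}

static int
uld_activate(int id)
{
        struct demo_uld *p;

        for (p = uld_list; p != NULL; p = p->next)
                if (p->id == id) {
                        p->refcount++;
                        return (0);
                }
        return (EAGAIN);        /* not registered */
}

static int
uld_unregister(struct demo_uld *u)
{
        if (u->refcount > 0)
                return (EBUSY);
        /* unlinking omitted for brevity */
        return (0);
}

int
main(void)
{
        struct demo_uld tom = { .id = 1 };

        uld_register(&tom);
        uld_activate(1);
        printf("unregister while active: %d (EBUSY=%d)\n",
            uld_unregister(&tom), EBUSY);
        return (0);
}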
(uintptr_t)h : (uintptr_t)cpl_not_handled; loc = (uintptr_t *) &sc->cpl_handler[opcode]; atomic_store_rel_ptr(loc, new); return (0); } #endif static int cxgbc_mod_event(module_t mod, int cmd, void *arg) { int rc = 0; switch (cmd) { case MOD_LOAD: mtx_init(&t3_list_lock, "T3 adapters", 0, MTX_DEF); SLIST_INIT(&t3_list); #ifdef TCP_OFFLOAD mtx_init(&t3_uld_list_lock, "T3 ULDs", 0, MTX_DEF); SLIST_INIT(&t3_uld_list); #endif break; case MOD_UNLOAD: #ifdef TCP_OFFLOAD mtx_lock(&t3_uld_list_lock); if (!SLIST_EMPTY(&t3_uld_list)) { rc = EBUSY; mtx_unlock(&t3_uld_list_lock); break; } mtx_unlock(&t3_uld_list_lock); mtx_destroy(&t3_uld_list_lock); #endif mtx_lock(&t3_list_lock); if (!SLIST_EMPTY(&t3_list)) { rc = EBUSY; mtx_unlock(&t3_list_lock); break; } mtx_unlock(&t3_list_lock); mtx_destroy(&t3_list_lock); break; } return (rc); } #ifdef DEBUGNET static void -cxgb_debugnet_init(struct ifnet *ifp, int *nrxr, int *ncl, int *clsize) +cxgb_debugnet_init(if_t ifp, int *nrxr, int *ncl, int *clsize) { struct port_info *pi; adapter_t *adap; pi = if_getsoftc(ifp); adap = pi->adapter; ADAPTER_LOCK(adap); *nrxr = adap->nqsets; *ncl = adap->sge.qs[0].fl[1].size; *clsize = adap->sge.qs[0].fl[1].buf_size; ADAPTER_UNLOCK(adap); } static void -cxgb_debugnet_event(struct ifnet *ifp, enum debugnet_ev event) +cxgb_debugnet_event(if_t ifp, enum debugnet_ev event) { struct port_info *pi; struct sge_qset *qs; int i; pi = if_getsoftc(ifp); if (event == DEBUGNET_START) for (i = 0; i < pi->adapter->nqsets; i++) { qs = &pi->adapter->sge.qs[i]; /* Need to reinit after debugnet_mbuf_start(). */ qs->fl[0].zone = zone_pack; qs->fl[1].zone = zone_clust; qs->lro.enabled = 0; } } static int -cxgb_debugnet_transmit(struct ifnet *ifp, struct mbuf *m) +cxgb_debugnet_transmit(if_t ifp, struct mbuf *m) { struct port_info *pi; struct sge_qset *qs; pi = if_getsoftc(ifp); if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return (ENOENT); qs = &pi->adapter->sge.qs[pi->first_qset]; return (cxgb_debugnet_encap(qs, &m)); } static int -cxgb_debugnet_poll(struct ifnet *ifp, int count) +cxgb_debugnet_poll(if_t ifp, int count) { struct port_info *pi; adapter_t *adap; int i; pi = if_getsoftc(ifp); if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) return (ENOENT); adap = pi->adapter; for (i = 0; i < adap->nqsets; i++) (void)cxgb_debugnet_poll_rx(adap, &adap->sge.qs[i]); (void)cxgb_debugnet_poll_tx(&adap->sge.qs[pi->first_qset]); return (0); } #endif /* DEBUGNET */ diff --git a/sys/dev/cxgb/cxgb_osdep.h b/sys/dev/cxgb/cxgb_osdep.h index e3f745fee020..d5e7f84d523d 100644 --- a/sys/dev/cxgb/cxgb_osdep.h +++ b/sys/dev/cxgb/cxgb_osdep.h @@ -1,276 +1,276 @@ /************************************************************************** SPDX-License-Identifier: BSD-2-Clause-FreeBSD Copyright (c) 2007, Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
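t3_register_cpl_handler() above never leaves a NULL slot: passing a NULL handler swaps the cpl_not_handled stub back in, so the dispatch path can always call through the table without a NULL check. Function-pointer sketch of that pattern (single-threaded; the driver uses an atomic pointer store):

/* Sketch only; opcodes and handlers are illustrative. */
#include <stdio.h>

#define DEMO_NUM_OPCODES 4

typedef int (*demo_handler_t)(int opcode);

static int
not_handled(int opcode)
{
        printf("opcode %d: dropped (no handler)\n", opcode);
        return (-1);
}

static int
rx_data(int opcode)
{
        printf("opcode %d: handled\n", opcode);
        return (0);
}

static demo_handler_t handlers[DEMO_NUM_OPCODES] = {
        not_handled, not_handled, not_handled, not_handled
};

static void
register_handler(int opcode, demo_handler_t h)
{
        handlers[opcode] = h != NULL ? h : not_handled; /* never NULL */
}

int
main(void)
{
        handlers[2](2);                 /* falls through to the stub */
        register_handler(2, rx_data);
        handlers[2](2);                 /* now dispatched */
        register_handler(2, NULL);      /* back to the stub */
        handlers[2](2);
        return (0);
}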
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. $FreeBSD$ ***************************************************************************/ #include #include #include #include #include #include #include #include #include #ifndef _CXGB_OSDEP_H_ #define _CXGB_OSDEP_H_ typedef struct adapter adapter_t; typedef struct port_info pinfo_t; struct sge_rspq; enum { TP_TMR_RES = 200, /* TP timer resolution in usec */ MAX_NPORTS = 4, /* max # of ports */ TP_SRAM_OFFSET = 4096, /* TP SRAM content offset in eeprom */ TP_SRAM_LEN = 2112, /* TP SRAM content offset in eeprom */ }; struct t3_mbuf_hdr { struct mbuf *mh_head; struct mbuf *mh_tail; }; #ifndef PANIC_IF #define PANIC_IF(exp) do { \ if (exp) \ panic("BUG: %s", #exp); \ } while (0) #endif /* * Workaround for weird Chelsio issue */ #define PRIV_SUPPORTED #define CXGB_TX_CLEANUP_THRESHOLD 32 #define TX_MAX_SIZE (1 << 16) /* 64KB */ #define TX_MAX_SEGS 36 /* maximum supported by card */ #define TX_MAX_DESC 4 /* max descriptors per packet */ #define TX_START_MAX_DESC (TX_MAX_DESC << 2) /* maximum number of descriptors * call to start used per */ #define TX_CLEAN_MAX_DESC (TX_MAX_DESC << 4) /* maximum tx descriptors * to clean per iteration */ #define TX_WR_SIZE_MAX 11*1024 /* the maximum total size of packets aggregated into a single * TX WR */ #define TX_WR_COUNT_MAX 7 /* the maximum total number of packets that can be * aggregated into a single TX WR */ #define prefetch(x) __builtin_prefetch(x) #if defined(__i386__) || defined(__amd64__) #define smp_mb() mb() #define WARN_ON(condition) do { \ if (__predict_false((condition)!=0)) { \ log(LOG_WARNING, "BUG: warning at %s:%d/%s()\n", __FILE__, __LINE__, __FUNCTION__); \ kdb_backtrace(); \ } \ } while (0) #else #define smp_mb() #endif #define DBG_RX (1 << 0) static const int debug_flags = DBG_RX; #ifdef DEBUG_PRINT #define DBG(flag, msg) do { \ if ((flag & debug_flags)) \ printf msg; \ } while (0) #else #define DBG(...) #endif #include -#define promisc_rx_mode(rm) ((rm)->port->ifp->if_flags & IFF_PROMISC) -#define allmulti_rx_mode(rm) ((rm)->port->ifp->if_flags & IFF_ALLMULTI) +#define promisc_rx_mode(rm) (if_getflags((rm)->port->ifp) & IFF_PROMISC) +#define allmulti_rx_mode(rm) (if_getflags((rm)->port->ifp) & IFF_ALLMULTI) #define CH_ERR(adap, fmt, ...) log(LOG_ERR, fmt, ##__VA_ARGS__) #define CH_WARN(adap, fmt, ...) log(LOG_WARNING, fmt, ##__VA_ARGS__) #define CH_ALERT(adap, fmt, ...) 
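The WARN_ON() macro defined above logs and backtraces on an unexpected condition without panicking, wrapping the test in an "unlikely" branch hint. A portable userland approximation, with fprintf() standing in for log() and kdb_backtrace():

/* Sketch only; __builtin_expect plays the role of __predict_false. */
#include <stdio.h>

#define DEMO_WARN_ON(condition) do {                                    \
        if (__builtin_expect((condition) != 0, 0)) {                    \
                fprintf(stderr, "BUG: warning at %s:%d/%s()\n",         \
                    __FILE__, __LINE__, __func__);                      \
        }                                                               \
} while (0)

int
main(void)
{
        int credits = -1;

        DEMO_WARN_ON(credits < 0);      /* fires, but execution continues */
        return (0);
}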
log(LOG_ALERT, fmt, ##__VA_ARGS__) #define t3_os_sleep(x) DELAY((x) * 1000) #define test_and_clear_bit(bit, p) atomic_cmpset_int((p), ((*(p)) | (1< __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int txq_fills = 0; int multiq_tx_enable = 1; #ifdef TCP_OFFLOAD CTASSERT(NUM_CPL_HANDLERS >= NUM_CPL_CMDS); #endif extern struct sysctl_oid_list sysctl__hw_cxgb_children; int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE; SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0, "size of per-queue mbuf ring"); static int cxgb_tx_coalesce_force = 0; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_force, CTLFLAG_RWTUN, &cxgb_tx_coalesce_force, 0, "coalesce small packets into a single work request regardless of ring state"); #define COALESCE_START_DEFAULT TX_ETH_Q_SIZE>>1 #define COALESCE_START_MAX (TX_ETH_Q_SIZE-(TX_ETH_Q_SIZE>>3)) #define COALESCE_STOP_DEFAULT TX_ETH_Q_SIZE>>2 #define COALESCE_STOP_MIN TX_ETH_Q_SIZE>>5 #define TX_RECLAIM_DEFAULT TX_ETH_Q_SIZE>>5 #define TX_RECLAIM_MAX TX_ETH_Q_SIZE>>2 #define TX_RECLAIM_MIN TX_ETH_Q_SIZE>>6 static int cxgb_tx_coalesce_enable_start = COALESCE_START_DEFAULT; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_start, CTLFLAG_RWTUN, &cxgb_tx_coalesce_enable_start, 0, "coalesce enable threshold"); static int cxgb_tx_coalesce_enable_stop = COALESCE_STOP_DEFAULT; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce_enable_stop, CTLFLAG_RWTUN, &cxgb_tx_coalesce_enable_stop, 0, "coalesce disable threshold"); static int cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT; SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_reclaim_threshold, CTLFLAG_RWTUN, &cxgb_tx_reclaim_threshold, 0, "tx cleaning minimum threshold"); /* * XXX don't re-enable this until TOE stops assuming * we have an m_ext */ static int recycle_enable = 0; extern int cxgb_use_16k_clusters; extern int nmbjumbop; extern int nmbjumbo9; extern int nmbjumbo16; #define USE_GTS 0 #define SGE_RX_SM_BUF_SIZE 1536 #define SGE_RX_DROP_THRES 16 #define SGE_RX_COPY_THRES 128 /* * Period of the Tx buffer reclaim timer. This timer does not need to run * frequently as Tx buffers are usually reclaimed by new Tx packets. 
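The TX coalescing and reclaim tunables defined below are all fixed fractions of TX_ETH_Q_SIZE expressed as shifts. A quick table of what those shifts evaluate to for a hypothetical 1024-entry ethernet TX queue (the real queue size comes from the driver headers):

/* Sketch only; DEMO_TX_ETH_Q_SIZE is an illustrative stand-in. */
#include <stdio.h>

#define DEMO_TX_ETH_Q_SIZE 1024

int
main(void)
{
        printf("coalesce start default %d (1/2)\n", DEMO_TX_ETH_Q_SIZE >> 1);
        printf("coalesce start max     %d (7/8)\n",
            DEMO_TX_ETH_Q_SIZE - (DEMO_TX_ETH_Q_SIZE >> 3));
        printf("coalesce stop default  %d (1/4)\n", DEMO_TX_ETH_Q_SIZE >> 2);
        printf("coalesce stop min      %d (1/32)\n", DEMO_TX_ETH_Q_SIZE >> 5);
        printf("reclaim default        %d (1/32)\n", DEMO_TX_ETH_Q_SIZE >> 5);
        printf("reclaim min/max        %d / %d\n",
            DEMO_TX_ETH_Q_SIZE >> 6, DEMO_TX_ETH_Q_SIZE >> 2);
        return (0);
}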
*/ #define TX_RECLAIM_PERIOD (hz >> 1) /* * Values for sge_txq.flags */ enum { TXQ_RUNNING = 1 << 0, /* fetch engine is running */ TXQ_LAST_PKT_DB = 1 << 1, /* last packet rang the doorbell */ }; struct tx_desc { uint64_t flit[TX_DESC_FLITS]; } __packed; struct rx_desc { uint32_t addr_lo; uint32_t len_gen; uint32_t gen2; uint32_t addr_hi; } __packed; struct rsp_desc { /* response queue descriptor */ struct rss_header rss_hdr; uint32_t flags; uint32_t len_cq; uint8_t imm_data[47]; uint8_t intr_gen; } __packed; #define RX_SW_DESC_MAP_CREATED (1 << 0) #define TX_SW_DESC_MAP_CREATED (1 << 1) #define RX_SW_DESC_INUSE (1 << 3) #define TX_SW_DESC_MAPPED (1 << 4) #define RSPQ_NSOP_NEOP G_RSPD_SOP_EOP(0) #define RSPQ_EOP G_RSPD_SOP_EOP(F_RSPD_EOP) #define RSPQ_SOP G_RSPD_SOP_EOP(F_RSPD_SOP) #define RSPQ_SOP_EOP G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP) struct tx_sw_desc { /* SW state per Tx descriptor */ struct mbuf *m; bus_dmamap_t map; int flags; }; struct rx_sw_desc { /* SW state per Rx descriptor */ caddr_t rxsd_cl; struct mbuf *m; bus_dmamap_t map; int flags; }; struct txq_state { unsigned int compl; unsigned int gen; unsigned int pidx; }; struct refill_fl_cb_arg { int error; bus_dma_segment_t seg; int nseg; }; /* * Maps a number of flits to the number of Tx descriptors that can hold them. * The formula is * * desc = 1 + (flits - 2) / (WR_FLITS - 1). * * HW allows up to 4 descriptors to be combined into a WR. */ static uint8_t flit_desc_map[] = { 0, #if SGE_NUM_GENBITS == 1 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 #elif SGE_NUM_GENBITS == 2 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, #else # error "SGE_NUM_GENBITS must be 1 or 2" #endif }; #define TXQ_LOCK_ASSERT(qs) mtx_assert(&(qs)->lock, MA_OWNED) #define TXQ_TRYLOCK(qs) mtx_trylock(&(qs)->lock) #define TXQ_LOCK(qs) mtx_lock(&(qs)->lock) #define TXQ_UNLOCK(qs) mtx_unlock(&(qs)->lock) #define TXQ_RING_EMPTY(qs) drbr_empty((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) #define TXQ_RING_NEEDS_ENQUEUE(qs) \ drbr_needs_enqueue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) #define TXQ_RING_FLUSH(qs) drbr_flush((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) #define TXQ_RING_DEQUEUE_COND(qs, func, arg) \ drbr_dequeue_cond((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr, func, arg) #define TXQ_RING_DEQUEUE(qs) \ drbr_dequeue((qs)->port->ifp, (qs)->txq[TXQ_ETH].txq_mr) int cxgb_debug = 0; static void sge_timer_cb(void *arg); static void sge_timer_reclaim(void *arg, int ncount); static void sge_txq_reclaim_handler(void *arg, int ncount); static void cxgb_start_locked(struct sge_qset *qs); /* * XXX need to cope with bursty scheduling by looking at a wider * window than we are now for determining the need for coalescing * */ static __inline uint64_t check_pkt_coalesce(struct sge_qset *qs) { struct adapter *sc; struct sge_txq *txq; uint8_t *fill; if (__predict_false(cxgb_tx_coalesce_force)) return (1); txq = &qs->txq[TXQ_ETH]; sc = qs->port->adapter; fill = &sc->tunq_fill[qs->idx]; if (cxgb_tx_coalesce_enable_start > COALESCE_START_MAX) cxgb_tx_coalesce_enable_start = COALESCE_START_MAX; if (cxgb_tx_coalesce_enable_stop < COALESCE_STOP_MIN) cxgb_tx_coalesce_enable_start = COALESCE_STOP_MIN; /* * if the hardware transmit queue is more than 1/8 full * we mark it as coalescing - we drop back from coalescing * 
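The flit_desc_map[] table below is a precomputed form of the formula quoted in its comment, desc = 1 + (flits - 2) / (WR_FLITS - 1). The check here regenerates the mapping from the formula; WR_FLITS is taken as 15, which appears to match the SGE_NUM_GENBITS == 2 table but is an assumption of this sketch:

/* Sketch only; DEMO_WR_FLITS is an assumed value, see lead-in. */
#include <stdio.h>

#define DEMO_WR_FLITS 15

static unsigned int
flits_to_desc_formula(unsigned int flits)
{
        if (flits < 2)
                return (flits);         /* degenerate cases, straight from the table */
        return (1 + (flits - 2) / (DEMO_WR_FLITS - 1));
}

int
main(void)
{
        unsigned int flits;

        for (flits = 1; flits <= 56; flits++)
                printf("%u flits -> %u descriptors\n",
                    flits, flits_to_desc_formula(flits));
        return (0);
}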
when we go below 1/32 full and there are no packets enqueued, * this provides us with some degree of hysteresis */ if (*fill != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) && TXQ_RING_EMPTY(qs) && (qs->coalescing == 0)) *fill = 0; else if (*fill == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start)) *fill = 1; return (sc->tunq_coalesce); } #ifdef __LP64__ static void set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo) { uint64_t wr_hilo; #if _BYTE_ORDER == _LITTLE_ENDIAN wr_hilo = wr_hi; wr_hilo |= (((uint64_t)wr_lo)<<32); #else wr_hilo = wr_lo; wr_hilo |= (((uint64_t)wr_hi)<<32); #endif wrp->wrh_hilo = wr_hilo; } #else static void set_wr_hdr(struct work_request_hdr *wrp, uint32_t wr_hi, uint32_t wr_lo) { wrp->wrh_hi = wr_hi; wmb(); wrp->wrh_lo = wr_lo; } #endif struct coalesce_info { int count; int nbytes; int noncoal; }; static int coalesce_check(struct mbuf *m, void *arg) { struct coalesce_info *ci = arg; if ((m->m_next != NULL) || ((mtod(m, vm_offset_t) & PAGE_MASK) + m->m_len > PAGE_SIZE)) ci->noncoal = 1; if ((ci->count == 0) || (ci->noncoal == 0 && (ci->count < 7) && (ci->nbytes + m->m_len <= 10500))) { ci->count++; ci->nbytes += m->m_len; return (1); } return (0); } static struct mbuf * cxgb_dequeue(struct sge_qset *qs) { struct mbuf *m, *m_head, *m_tail; struct coalesce_info ci; if (check_pkt_coalesce(qs) == 0) return TXQ_RING_DEQUEUE(qs); m_head = m_tail = NULL; ci.count = ci.nbytes = ci.noncoal = 0; do { m = TXQ_RING_DEQUEUE_COND(qs, coalesce_check, &ci); if (m_head == NULL) { m_tail = m_head = m; } else if (m != NULL) { m_tail->m_nextpkt = m; m_tail = m; } } while (m != NULL); if (ci.count > 7) panic("trying to coalesce %d packets in to one WR", ci.count); return (m_head); } /** * reclaim_completed_tx - reclaims completed Tx descriptors * @adapter: the adapter * @q: the Tx queue to reclaim completed descriptors from * * Reclaims Tx descriptors that the SGE has indicated it has processed, * and frees the associated buffers if possible. Called with the Tx * queue's lock held. */ static __inline int reclaim_completed_tx(struct sge_qset *qs, int reclaim_min, int queue) { struct sge_txq *q = &qs->txq[queue]; int reclaim = desc_reclaimable(q); if ((cxgb_tx_reclaim_threshold > TX_RECLAIM_MAX) || (cxgb_tx_reclaim_threshold < TX_RECLAIM_MIN)) cxgb_tx_reclaim_threshold = TX_RECLAIM_DEFAULT; if (reclaim < reclaim_min) return (0); mtx_assert(&qs->lock, MA_OWNED); if (reclaim > 0) { t3_free_tx_desc(qs, reclaim, queue); q->cleaned += reclaim; q->in_use -= reclaim; } if (isset(&qs->txq_stopped, TXQ_ETH)) clrbit(&qs->txq_stopped, TXQ_ETH); return (reclaim); } #ifdef DEBUGNET int cxgb_debugnet_poll_tx(struct sge_qset *qs) { return (reclaim_completed_tx(qs, TX_RECLAIM_MAX, TXQ_ETH)); } #endif /** * should_restart_tx - are there enough resources to restart a Tx queue? * @q: the Tx queue * * Checks if there are enough descriptors to restart a suspended Tx queue. */ static __inline int should_restart_tx(const struct sge_txq *q) { unsigned int r = q->processed - q->cleaned; return q->in_use - r < (q->size >> 1); } /** * t3_sge_init - initialize SGE * @adap: the adapter * @p: the SGE parameters * * Performs SGE initialization needed every time after a chip reset. * We do not initialize any of the queue sets here, instead the driver * top-level must request those individually. We also do not enable DMA * here, that should be done after the queues have been set up. 
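* For example, with 4 KB pages (PAGE_SHIFT == 12) the host page size field of SG_CONTROL below is programmed as V_HOSTPAGESIZE(1), and rev > 0 adapters running without MSI or MSI-X additionally get F_ONEINTMULTQ | F_OPTONEINTMULTQ.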
*/ void t3_sge_init(adapter_t *adap, struct sge_params *p) { u_int ctrl, ups; ups = 0; /* = ffs(pci_resource_len(adap->pdev, 2) >> 12); */ ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL | F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN | V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS | V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING; #if SGE_NUM_GENBITS == 1 ctrl |= F_EGRGENCTRL; #endif if (adap->params.rev > 0) { if (!(adap->flags & (USING_MSIX | USING_MSI))) ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ; } t3_write_reg(adap, A_SG_CONTROL, ctrl); t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) | V_LORCQDRBTHRSH(512)); t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10); t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) | V_TIMEOUT(200 * core_ticks_per_usec(adap))); t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, adap->params.rev < T3_REV_C ? 1000 : 500); t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256); t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000); t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256); t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff)); t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024); } /** * sgl_len - calculates the size of an SGL of the given capacity * @n: the number of SGL entries * * Calculates the number of flits needed for a scatter/gather list that * can hold the given number of entries. */ static __inline unsigned int sgl_len(unsigned int n) { return ((3 * n) / 2 + (n & 1)); } /** * get_imm_packet - return the next ingress packet buffer from a response * @resp: the response descriptor containing the packet data * * Return a packet containing the immediate data of the given response. */ static int get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m) { if (resp->rss_hdr.opcode == CPL_RX_DATA) { const struct cpl_rx_data *cpl = (const void *)&resp->imm_data[0]; m->m_len = sizeof(*cpl) + ntohs(cpl->len); } else if (resp->rss_hdr.opcode == CPL_RX_PKT) { const struct cpl_rx_pkt *cpl = (const void *)&resp->imm_data[0]; m->m_len = sizeof(*cpl) + ntohs(cpl->len); } else m->m_len = IMMED_PKT_SIZE; m->m_ext.ext_buf = NULL; m->m_ext.ext_type = 0; memcpy(mtod(m, uint8_t *), resp->imm_data, m->m_len); return (0); } static __inline u_int flits_to_desc(u_int n) { return (flit_desc_map[n]); } #define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \ F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \ V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \ F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \ F_HIRCQPARITYERROR) #define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR) #define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \ F_RSPQDISABLED) /** * t3_sge_err_intr_handler - SGE async event interrupt handler * @adapter: the adapter * * Interrupt handler for SGE asynchronous (non-data) events. 
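* Parity and framing errors, response queue credit overflow, and delivery to a disabled response queue are logged, the interrupt cause is acknowledged, and any SGE_FATALERR condition escalates to t3_fatal_err().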
*/ void t3_sge_err_intr_handler(adapter_t *adapter) { unsigned int v, status; status = t3_read_reg(adapter, A_SG_INT_CAUSE); if (status & SGE_PARERR) CH_ALERT(adapter, "SGE parity error (0x%x)\n", status & SGE_PARERR); if (status & SGE_FRAMINGERR) CH_ALERT(adapter, "SGE framing error (0x%x)\n", status & SGE_FRAMINGERR); if (status & F_RSPQCREDITOVERFOW) CH_ALERT(adapter, "SGE response queue credit overflow\n"); if (status & F_RSPQDISABLED) { v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS); CH_ALERT(adapter, "packet delivered to disabled response queue (0x%x)\n", (v >> S_RSPQ0DISABLED) & 0xff); } t3_write_reg(adapter, A_SG_INT_CAUSE, status); if (status & SGE_FATALERR) t3_fatal_err(adapter); } void t3_sge_prep(adapter_t *adap, struct sge_params *p) { int i, nqsets, fl_q_size, jumbo_q_size, use_16k, jumbo_buf_size; nqsets = min(SGE_QSETS / adap->params.nports, mp_ncpus); nqsets *= adap->params.nports; fl_q_size = min(nmbclusters/(3*nqsets), FL_Q_SIZE); while (!powerof2(fl_q_size)) fl_q_size--; use_16k = cxgb_use_16k_clusters != -1 ? cxgb_use_16k_clusters : is_offload(adap); if (use_16k) { jumbo_q_size = min(nmbjumbo16/(3*nqsets), JUMBO_Q_SIZE); jumbo_buf_size = MJUM16BYTES; } else { jumbo_q_size = min(nmbjumbo9/(3*nqsets), JUMBO_Q_SIZE); jumbo_buf_size = MJUM9BYTES; } while (!powerof2(jumbo_q_size)) jumbo_q_size--; if (fl_q_size < (FL_Q_SIZE / 4) || jumbo_q_size < (JUMBO_Q_SIZE / 2)) device_printf(adap->dev, "Insufficient clusters and/or jumbo buffers.\n"); p->max_pkt_size = jumbo_buf_size - sizeof(struct cpl_rx_data); for (i = 0; i < SGE_QSETS; ++i) { struct qset_params *q = p->qset + i; if (adap->params.nports > 2) { q->coalesce_usecs = 50; } else { #ifdef INVARIANTS q->coalesce_usecs = 10; #else q->coalesce_usecs = 5; #endif } q->polling = 0; q->rspq_size = RSPQ_Q_SIZE; q->fl_size = fl_q_size; q->jumbo_size = jumbo_q_size; q->jumbo_buf_size = jumbo_buf_size; q->txq_size[TXQ_ETH] = TX_ETH_Q_SIZE; q->txq_size[TXQ_OFLD] = is_offload(adap) ? TX_OFLD_Q_SIZE : 16; q->txq_size[TXQ_CTRL] = TX_CTRL_Q_SIZE; q->cong_thres = 0; } } int t3_sge_alloc(adapter_t *sc) { /* The parent tag. */ if (bus_dma_tag_create( bus_get_dma_tag(sc->dev),/* PCI parent */ 1, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ BUS_SPACE_MAXSIZE_32BIT,/* maxsize */ BUS_SPACE_UNRESTRICTED, /* nsegments */ BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */ 0, /* flags */ NULL, NULL, /* lock, lockarg */ &sc->parent_dmat)) { device_printf(sc->dev, "Cannot allocate parent DMA tag\n"); return (ENOMEM); } /* * DMA tag for normal sized RX frames */ if (bus_dma_tag_create(sc->parent_dmat, MCLBYTES, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1, MCLBYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_dmat)) { device_printf(sc->dev, "Cannot allocate RX DMA tag\n"); return (ENOMEM); } /* * DMA tag for jumbo sized RX frames. */ if (bus_dma_tag_create(sc->parent_dmat, MJUM16BYTES, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MJUM16BYTES, 1, MJUM16BYTES, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->rx_jumbo_dmat)) { device_printf(sc->dev, "Cannot allocate RX jumbo DMA tag\n"); return (ENOMEM); } /* * DMA tag for TX frames. 
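* Transmit chains may use up to TX_MAX_SEGS segments and TX_MAX_SIZE bytes in total, so the tag below is created with those limits and no alignment constraint.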
*/ if (bus_dma_tag_create(sc->parent_dmat, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS, TX_MAX_SIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->tx_dmat)) { device_printf(sc->dev, "Cannot allocate TX DMA tag\n"); return (ENOMEM); } return (0); } int t3_sge_free(struct adapter * sc) { if (sc->tx_dmat != NULL) bus_dma_tag_destroy(sc->tx_dmat); if (sc->rx_jumbo_dmat != NULL) bus_dma_tag_destroy(sc->rx_jumbo_dmat); if (sc->rx_dmat != NULL) bus_dma_tag_destroy(sc->rx_dmat); if (sc->parent_dmat != NULL) bus_dma_tag_destroy(sc->parent_dmat); return (0); } void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p) { qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U); qs->rspq.polling = 0 /* p->polling */; } #if !defined(__i386__) && !defined(__amd64__) static void refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { struct refill_fl_cb_arg *cb_arg = arg; cb_arg->error = error; cb_arg->seg = segs[0]; cb_arg->nseg = nseg; } #endif /** * refill_fl - refill an SGE free-buffer list * @sc: the controller softc * @q: the free-list to refill * @n: the number of new buffers to allocate * * (Re)populate an SGE free-buffer list with up to @n new packet buffers. * The caller must assure that @n does not exceed the queue's capacity. */ static void refill_fl(adapter_t *sc, struct sge_fl *q, int n) { struct rx_sw_desc *sd = &q->sdesc[q->pidx]; struct rx_desc *d = &q->desc[q->pidx]; struct refill_fl_cb_arg cb_arg; struct mbuf *m; caddr_t cl; int err; cb_arg.error = 0; while (n--) { /* * We allocate an uninitialized mbuf + cluster, mbuf is * initialized after rx. */ if (q->zone == zone_pack) { if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL) break; cl = m->m_ext.ext_buf; } else { if ((cl = m_cljget(NULL, M_NOWAIT, q->buf_size)) == NULL) break; if ((m = m_gethdr_raw(M_NOWAIT, 0)) == NULL) { uma_zfree(q->zone, cl); break; } } if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) { if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) { log(LOG_WARNING, "bus_dmamap_create failed %d\n", err); uma_zfree(q->zone, cl); goto done; } sd->flags |= RX_SW_DESC_MAP_CREATED; } #if !defined(__i386__) && !defined(__amd64__) err = bus_dmamap_load(q->entry_tag, sd->map, cl, q->buf_size, refill_fl_cb, &cb_arg, 0); if (err != 0 || cb_arg.error) { if (q->zone != zone_pack) uma_zfree(q->zone, cl); m_free(m); goto done; } #else cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)cl); #endif sd->flags |= RX_SW_DESC_INUSE; sd->rxsd_cl = cl; sd->m = m; d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff); d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff); d->len_gen = htobe32(V_FLD_GEN1(q->gen)); d->gen2 = htobe32(V_FLD_GEN2(q->gen)); d++; sd++; if (++q->pidx == q->size) { q->pidx = 0; q->gen ^= 1; sd = q->sdesc; d = q->desc; } q->credits++; q->db_pending++; } done: if (q->db_pending >= 32) { q->db_pending = 0; t3_write_reg(sc, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id)); } } /** * free_rx_bufs - free the Rx buffers on an SGE free list * @sc: the controle softc * @q: the SGE free list to clean up * * Release the buffers on an SGE free-buffer Rx queue. HW fetching from * this queue should be stopped before calling this function. 
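* Buffers still marked RX_SW_DESC_INUSE have their DMA maps unloaded and destroyed and are returned to their zones; the software descriptor pointers are cleared either way.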
*/ static void free_rx_bufs(adapter_t *sc, struct sge_fl *q) { u_int cidx = q->cidx; while (q->credits--) { struct rx_sw_desc *d = &q->sdesc[cidx]; if (d->flags & RX_SW_DESC_INUSE) { bus_dmamap_unload(q->entry_tag, d->map); bus_dmamap_destroy(q->entry_tag, d->map); if (q->zone == zone_pack) { m_init(d->m, M_NOWAIT, MT_DATA, M_EXT); uma_zfree(zone_pack, d->m); } else { m_init(d->m, M_NOWAIT, MT_DATA, 0); m_free_raw(d->m); uma_zfree(q->zone, d->rxsd_cl); } } d->rxsd_cl = NULL; d->m = NULL; if (++cidx == q->size) cidx = 0; } } static __inline void __refill_fl(adapter_t *adap, struct sge_fl *fl) { refill_fl(adap, fl, min(16U, fl->size - fl->credits)); } static __inline void __refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max) { uint32_t reclaimable = fl->size - fl->credits; if (reclaimable > 0) refill_fl(adap, fl, min(max, reclaimable)); } /** * recycle_rx_buf - recycle a receive buffer * @adapter: the adapter * @q: the SGE free list * @idx: index of buffer to recycle * * Recycles the specified buffer on the given free list by adding it at * the next available slot on the list. */ static void recycle_rx_buf(adapter_t *adap, struct sge_fl *q, unsigned int idx) { struct rx_desc *from = &q->desc[idx]; struct rx_desc *to = &q->desc[q->pidx]; q->sdesc[q->pidx] = q->sdesc[idx]; to->addr_lo = from->addr_lo; // already big endian to->addr_hi = from->addr_hi; // likewise wmb(); /* necessary ? */ to->len_gen = htobe32(V_FLD_GEN1(q->gen)); to->gen2 = htobe32(V_FLD_GEN2(q->gen)); q->credits++; if (++q->pidx == q->size) { q->pidx = 0; q->gen ^= 1; } t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id)); } static void alloc_ring_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { uint32_t *addr; addr = arg; *addr = segs[0].ds_addr; } static int alloc_ring(adapter_t *sc, size_t nelem, size_t elem_size, size_t sw_size, bus_addr_t *phys, void *desc, void *sdesc, bus_dma_tag_t *tag, bus_dmamap_t *map, bus_dma_tag_t parent_entry_tag, bus_dma_tag_t *entry_tag) { size_t len = nelem * elem_size; void *s = NULL; void *p = NULL; int err; if ((err = bus_dma_tag_create(sc->parent_dmat, PAGE_SIZE, 0, BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag)) != 0) { device_printf(sc->dev, "Cannot allocate descriptor tag\n"); return (ENOMEM); } if ((err = bus_dmamem_alloc(*tag, (void **)&p, BUS_DMA_NOWAIT, map)) != 0) { device_printf(sc->dev, "Cannot allocate descriptor memory\n"); return (ENOMEM); } bus_dmamap_load(*tag, *map, p, len, alloc_ring_cb, phys, 0); bzero(p, len); *(void **)desc = p; if (sw_size) { len = nelem * sw_size; s = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO); *(void **)sdesc = s; } if (parent_entry_tag == NULL) return (0); if ((err = bus_dma_tag_create(parent_entry_tag, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, TX_MAX_SIZE, TX_MAX_SEGS, TX_MAX_SIZE, BUS_DMA_ALLOCNOW, NULL, NULL, entry_tag)) != 0) { device_printf(sc->dev, "Cannot allocate descriptor entry tag\n"); return (ENOMEM); } return (0); } static void sge_slow_intr_handler(void *arg, int ncount) { adapter_t *sc = arg; t3_slow_intr_handler(sc); t3_write_reg(sc, A_PL_INT_ENABLE0, sc->slow_intr_mask); (void) t3_read_reg(sc, A_PL_INT_ENABLE0); } /** * sge_timer_cb - perform periodic maintenance of an SGE qset * @data: the SGE queue set to maintain * * Runs periodically from a timer to perform maintenance of an SGE queue * set. It performs two tasks: * * a) Cleans up any completed Tx descriptors that may still be pending. 
* Normal descriptor cleanup happens when new packets are added to a Tx * queue so this timer is relatively infrequent and does any cleanup only * if the Tx queue has not seen any new packets in a while. We make a * best effort attempt to reclaim descriptors, in that we don't wait * around if we cannot get a queue's lock (which most likely is because * someone else is queueing new packets and so will also handle the clean * up). Since control queues use immediate data exclusively we don't * bother cleaning them up here. * * b) Replenishes Rx queues that have run out due to memory shortage. * Normally new Rx buffers are added when existing ones are consumed but * when out of memory a queue can become empty. We try to add only a few * buffers here, the queue will be replenished fully as these new buffers * are used up if memory shortage has subsided. * * c) Return coalesced response queue credits in case a response queue is * starved. * * d) Ring doorbells for T304 tunnel queues since we have seen doorbell * fifo overflows and the FW doesn't implement any recovery scheme yet. */ static void sge_timer_cb(void *arg) { adapter_t *sc = arg; if ((sc->flags & USING_MSIX) == 0) { struct port_info *pi; struct sge_qset *qs; struct sge_txq *txq; int i, j; int reclaim_ofl, refill_rx; if (sc->open_device_map == 0) return; for (i = 0; i < sc->params.nports; i++) { pi = &sc->port[i]; for (j = 0; j < pi->nqsets; j++) { qs = &sc->sge.qs[pi->first_qset + j]; txq = &qs->txq[0]; reclaim_ofl = txq[TXQ_OFLD].processed - txq[TXQ_OFLD].cleaned; refill_rx = ((qs->fl[0].credits < qs->fl[0].size) || (qs->fl[1].credits < qs->fl[1].size)); if (reclaim_ofl || refill_rx) { taskqueue_enqueue(sc->tq, &pi->timer_reclaim_task); break; } } } } if (sc->params.nports > 2) { int i; for_each_port(sc, i) { struct port_info *pi = &sc->port[i]; t3_write_reg(sc, A_SG_KDOORBELL, F_SELEGRCNTX | (FW_TUNNEL_SGEEC_START + pi->first_qset)); } } if (((sc->flags & USING_MSIX) == 0 || sc->params.nports > 2) && sc->open_device_map != 0) callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc); } /* * This is meant to be a catch-all function to keep sge state private * to sge.c * */ int t3_sge_init_adapter(adapter_t *sc) { callout_init(&sc->sge_timer_ch, 1); callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc); TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc); return (0); } int t3_sge_reset_adapter(adapter_t *sc) { callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc); return (0); } int t3_sge_init_port(struct port_info *pi) { TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi); return (0); } /** * refill_rspq - replenish an SGE response queue * @adapter: the adapter * @q: the response queue to replenish * @credits: how many new responses to make available * * Replenishes a response queue by making the supplied number of responses * available to HW. */ static __inline void refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits) { /* mbufs are allocated on demand when a rspq entry is processed. 
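* Returning credits is a single write of V_RSPQ(q->cntxt_id) | V_CREDITS(credits) to A_SG_RSPQ_CREDIT_RETURN; no descriptor memory is touched here.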
*/ t3_write_reg(sc, A_SG_RSPQ_CREDIT_RETURN, V_RSPQ(q->cntxt_id) | V_CREDITS(credits)); } static void sge_txq_reclaim_handler(void *arg, int ncount) { struct sge_qset *qs = arg; int i; for (i = 0; i < 3; i++) reclaim_completed_tx(qs, 16, i); } static void sge_timer_reclaim(void *arg, int ncount) { struct port_info *pi = arg; int i, nqsets = pi->nqsets; adapter_t *sc = pi->adapter; struct sge_qset *qs; struct mtx *lock; KASSERT((sc->flags & USING_MSIX) == 0, ("can't call timer reclaim for msi-x")); for (i = 0; i < nqsets; i++) { qs = &sc->sge.qs[pi->first_qset + i]; reclaim_completed_tx(qs, 16, TXQ_OFLD); lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock : &sc->sge.qs[0].rspq.lock; if (mtx_trylock(lock)) { /* XXX currently assume that we are *NOT* polling */ uint32_t status = t3_read_reg(sc, A_SG_RSPQ_FL_STATUS); if (qs->fl[0].credits < qs->fl[0].size - 16) __refill_fl(sc, &qs->fl[0]); if (qs->fl[1].credits < qs->fl[1].size - 16) __refill_fl(sc, &qs->fl[1]); if (status & (1 << qs->rspq.cntxt_id)) { if (qs->rspq.credits) { refill_rspq(sc, &qs->rspq, 1); qs->rspq.credits--; t3_write_reg(sc, A_SG_RSPQ_FL_STATUS, 1 << qs->rspq.cntxt_id); } } mtx_unlock(lock); } } } /** * init_qset_cntxt - initialize an SGE queue set context info * @qs: the queue set * @id: the queue set id * * Initializes the TIDs and context ids for the queues of a queue set. */ static void init_qset_cntxt(struct sge_qset *qs, u_int id) { qs->rspq.cntxt_id = id; qs->fl[0].cntxt_id = 2 * id; qs->fl[1].cntxt_id = 2 * id + 1; qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id; qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id; qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id; qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id; qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id; /* XXX: a sane limit is needed instead of INT_MAX */ mbufq_init(&qs->txq[TXQ_ETH].sendq, INT_MAX); mbufq_init(&qs->txq[TXQ_OFLD].sendq, INT_MAX); mbufq_init(&qs->txq[TXQ_CTRL].sendq, INT_MAX); } static void txq_prod(struct sge_txq *txq, unsigned int ndesc, struct txq_state *txqs) { txq->in_use += ndesc; /* * XXX we don't handle stopping of queue * presumably start handles this when we bump against the end */ txqs->gen = txq->gen; txq->unacked += ndesc; txqs->compl = (txq->unacked & 32) << (S_WR_COMPL - 5); txq->unacked &= 31; txqs->pidx = txq->pidx; txq->pidx += ndesc; #ifdef INVARIANTS if (((txqs->pidx > txq->cidx) && (txq->pidx < txqs->pidx) && (txq->pidx >= txq->cidx)) || ((txqs->pidx < txq->cidx) && (txq->pidx >= txq-> cidx)) || ((txqs->pidx < txq->cidx) && (txq->cidx < txqs->pidx))) panic("txqs->pidx=%d txq->pidx=%d txq->cidx=%d", txqs->pidx, txq->pidx, txq->cidx); #endif if (txq->pidx >= txq->size) { txq->pidx -= txq->size; txq->gen ^= 1; } } /** * calc_tx_descs - calculate the number of Tx descriptors for a packet * @m: the packet mbufs * @nsegs: the number of segments * * Returns the number of Tx descriptors needed for the given Ethernet * packet. Ethernet packets require addition of WR and CPL headers. */ static __inline unsigned int calc_tx_descs(const struct mbuf *m, int nsegs) { unsigned int flits; if (m->m_pkthdr.len <= PIO_LEN) return 1; flits = sgl_len(nsegs) + 2; if (m->m_pkthdr.csum_flags & CSUM_TSO) flits++; return flits_to_desc(flits); } /** * make_sgl - populate a scatter/gather list for a packet * @sgp: the SGL to populate * @segs: the packet dma segments * @nsegs: the number of segments * * Generates a scatter/gather list for the buffers that make up a packet * and returns the SGL size in 8-byte words. 
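* Each sg_ent packs two address/length pairs into three flits, so a list of n segments takes sgl_len(n) = 3n/2 + (n & 1) flits; five segments, for example, need 3 * 5 / 2 + 1 = 8 flits, with the unused half of the last entry zeroed.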
The caller must size the SGL * appropriately. */ static __inline void make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs) { int i, idx; for (idx = 0, i = 0; i < nsegs; i++) { /* * firmware doesn't like empty segments */ if (segs[i].ds_len == 0) continue; if (i && idx == 0) ++sgp; sgp->len[idx] = htobe32(segs[i].ds_len); sgp->addr[idx] = htobe64(segs[i].ds_addr); idx ^= 1; } if (idx) { sgp->len[idx] = 0; sgp->addr[idx] = 0; } } /** * check_ring_tx_db - check and potentially ring a Tx queue's doorbell * @adap: the adapter * @q: the Tx queue * * Ring the doorbell if a Tx queue is asleep. There is a natural race, * where the HW is going to sleep just after we checked, however, * then the interrupt handler will detect the outstanding TX packet * and ring the doorbell for us. * * When GTS is disabled we unconditionally ring the doorbell. */ static __inline void check_ring_tx_db(adapter_t *adap, struct sge_txq *q, int mustring) { #if USE_GTS clear_bit(TXQ_LAST_PKT_DB, &q->flags); if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) { set_bit(TXQ_LAST_PKT_DB, &q->flags); #ifdef T3_TRACE T3_TRACE1(adap->tb[q->cntxt_id & 7], "doorbell Tx, cntxt %d", q->cntxt_id); #endif t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); } #else if (mustring || ++q->db_pending >= 32) { wmb(); /* write descriptors before telling HW */ t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); q->db_pending = 0; } #endif } static __inline void wr_gen2(struct tx_desc *d, unsigned int gen) { #if SGE_NUM_GENBITS == 2 d->flit[TX_DESC_FLITS - 1] = htobe64(gen); #endif } /** * write_wr_hdr_sgl - write a WR header and, optionally, SGL * @ndesc: number of Tx descriptors spanned by the SGL * @txd: first Tx descriptor to be written * @txqs: txq state (generation and producer index) * @txq: the SGE Tx queue * @sgl: the SGL * @flits: number of flits to the start of the SGL in the first descriptor * @sgl_flits: the SGL size in flits * @wr_hi: top 32 bits of WR header based on WR type (big endian) * @wr_lo: low 32 bits of WR header based on WR type (big endian) * * Write a work request header and an associated SGL. If the SGL is * small enough to fit into one Tx descriptor it has already been written * and we just need to write the WR header. Otherwise we distribute the * SGL across the number of descriptors it spans. 
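* In the multi-descriptor case each continuation descriptor carries its own header (V_WR_SGLSFLT(1)) and generation, F_WR_EOP is set on the last one, and the first descriptor's wrh_lo, which holds the original generation, is written only after a wmb() so the SGE never sees a partially written request.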
*/ static void write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs, const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits, unsigned int sgl_flits, unsigned int wr_hi, unsigned int wr_lo) { struct work_request_hdr *wrp = (struct work_request_hdr *)txd; if (__predict_true(ndesc == 1)) { set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) | V_WR_SGLSFLT(flits)) | wr_hi, htonl(V_WR_LEN(flits + sgl_flits) | V_WR_GEN(txqs->gen)) | wr_lo); wr_gen2(txd, txqs->gen); } else { unsigned int ogen = txqs->gen; const uint64_t *fp = (const uint64_t *)sgl; struct work_request_hdr *wp = wrp; wrp->wrh_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) | V_WR_SGLSFLT(flits)) | wr_hi; while (sgl_flits) { unsigned int avail = WR_FLITS - flits; if (avail > sgl_flits) avail = sgl_flits; memcpy(&txd->flit[flits], fp, avail * sizeof(*fp)); sgl_flits -= avail; ndesc--; if (!sgl_flits) break; fp += avail; txd++; if (++txqs->pidx == txq->size) { txqs->pidx = 0; txqs->gen ^= 1; txd = txq->desc; } /* * when the head of the mbuf chain * is freed all clusters will be freed * with it */ wrp = (struct work_request_hdr *)txd; wrp->wrh_hi = htonl(V_WR_DATATYPE(1) | V_WR_SGLSFLT(1)) | wr_hi; wrp->wrh_lo = htonl(V_WR_LEN(min(WR_FLITS, sgl_flits + 1)) | V_WR_GEN(txqs->gen)) | wr_lo; wr_gen2(txd, txqs->gen); flits = 1; } wrp->wrh_hi |= htonl(F_WR_EOP); wmb(); wp->wrh_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo; wr_gen2((struct tx_desc *)wp, ogen); } } /* sizeof(*eh) + sizeof(*ip) + sizeof(*tcp) */ #define TCPPKTHDRSIZE (ETHER_HDR_LEN + 20 + 20) #define GET_VTAG(cntrl, m) \ do { \ if ((m)->m_flags & M_VLANTAG) \ cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \ } while (0) static int t3_encap(struct sge_qset *qs, struct mbuf **m) { adapter_t *sc; struct mbuf *m0; struct sge_txq *txq; struct txq_state txqs; struct port_info *pi; unsigned int ndesc, flits, cntrl, mlen; int err, nsegs, tso_info = 0; struct work_request_hdr *wrp; struct tx_sw_desc *txsd; struct sg_ent *sgp, *sgl; uint32_t wr_hi, wr_lo, sgl_flits; bus_dma_segment_t segs[TX_MAX_SEGS]; struct tx_desc *txd; pi = qs->port; sc = pi->adapter; txq = &qs->txq[TXQ_ETH]; txd = &txq->desc[txq->pidx]; txsd = &txq->sdesc[txq->pidx]; sgl = txq->txq_sgl; prefetch(txd); m0 = *m; mtx_assert(&qs->lock, MA_OWNED); cntrl = V_TXPKT_INTF(pi->txpkt_intf); KASSERT(m0->m_flags & M_PKTHDR, ("not packet header\n")); if (m0->m_nextpkt == NULL && m0->m_next != NULL && m0->m_pkthdr.csum_flags & (CSUM_TSO)) tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz); if (m0->m_nextpkt != NULL) { busdma_map_sg_vec(txq->entry_tag, txsd->map, m0, segs, &nsegs); ndesc = 1; mlen = 0; } else { if ((err = busdma_map_sg_collapse(txq->entry_tag, txsd->map, &m0, segs, &nsegs))) { if (cxgb_debug) printf("failed ... 
err=%d\n", err); return (err); } mlen = m0->m_pkthdr.len; ndesc = calc_tx_descs(m0, nsegs); } txq_prod(txq, ndesc, &txqs); KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d", nsegs)); txsd->m = m0; if (m0->m_nextpkt != NULL) { struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd; int i, fidx; if (nsegs > 7) panic("trying to coalesce %d packets in to one WR", nsegs); txq->txq_coalesced += nsegs; wrp = (struct work_request_hdr *)txd; flits = nsegs*2 + 1; for (fidx = 1, i = 0; i < nsegs; i++, fidx += 2) { struct cpl_tx_pkt_batch_entry *cbe; uint64_t flit; uint32_t *hflit = (uint32_t *)&flit; int cflags = m0->m_pkthdr.csum_flags; cntrl = V_TXPKT_INTF(pi->txpkt_intf); GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); if (__predict_false(!(cflags & CSUM_IP))) cntrl |= F_TXPKT_IPCSUM_DIS; if (__predict_false(!(cflags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6)))) cntrl |= F_TXPKT_L4CSUM_DIS; hflit[0] = htonl(cntrl); hflit[1] = htonl(segs[i].ds_len | 0x80000000); flit |= htobe64(1 << 24); cbe = &cpl_batch->pkt_entry[i]; cbe->cntrl = hflit[0]; cbe->len = hflit[1]; cbe->addr = htobe64(segs[i].ds_addr); } wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) | V_WR_SGLSFLT(flits)) | htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl); wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token)); set_wr_hdr(wrp, wr_hi, wr_lo); wmb(); ETHER_BPF_MTAP(pi->ifp, m0); wr_gen2(txd, txqs.gen); check_ring_tx_db(sc, txq, 0); return (0); } else if (tso_info) { uint16_t eth_type; struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd; struct ether_header *eh; void *l3hdr; struct tcphdr *tcp; txd->flit[2] = 0; GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO); hdr->cntrl = htonl(cntrl); hdr->len = htonl(mlen | 0x80000000); if (__predict_false(mlen < TCPPKTHDRSIZE)) { printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%b,flags=%#x", m0, mlen, m0->m_pkthdr.tso_segsz, (int)m0->m_pkthdr.csum_flags, CSUM_BITS, m0->m_flags); panic("tx tso packet too small"); } /* Make sure that ether, ip, tcp headers are all in m0 */ if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) { m0 = m_pullup(m0, TCPPKTHDRSIZE); if (__predict_false(m0 == NULL)) { /* XXX panic probably an overreaction */ panic("couldn't fit header into mbuf"); } } eh = mtod(m0, struct ether_header *); eth_type = eh->ether_type; if (eth_type == htons(ETHERTYPE_VLAN)) { struct ether_vlan_header *evh = (void *)eh; tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II_VLAN); l3hdr = evh + 1; eth_type = evh->evl_proto; } else { tso_info |= V_LSO_ETH_TYPE(CPL_ETH_II); l3hdr = eh + 1; } if (eth_type == htons(ETHERTYPE_IP)) { struct ip *ip = l3hdr; tso_info |= V_LSO_IPHDR_WORDS(ip->ip_hl); tcp = (struct tcphdr *)(ip + 1); } else if (eth_type == htons(ETHERTYPE_IPV6)) { struct ip6_hdr *ip6 = l3hdr; KASSERT(ip6->ip6_nxt == IPPROTO_TCP, ("%s: CSUM_TSO with ip6_nxt %d", __func__, ip6->ip6_nxt)); tso_info |= F_LSO_IPV6; tso_info |= V_LSO_IPHDR_WORDS(sizeof(*ip6) >> 2); tcp = (struct tcphdr *)(ip6 + 1); } else panic("%s: CSUM_TSO but neither ip nor ip6", __func__); tso_info |= V_LSO_TCPHDR_WORDS(tcp->th_off); hdr->lso_info = htonl(tso_info); if (__predict_false(mlen <= PIO_LEN)) { /* * pkt not undersized but fits in PIO_LEN * Indicates a TSO bug at the higher levels. 
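* In that case the whole frame is copied into the descriptor as immediate data starting at flit 3 and the mbuf chain is freed before returning.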
*/ txsd->m = NULL; m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]); flits = (mlen + 7) / 8 + 3; wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) | V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | F_WR_SOP | F_WR_EOP | txqs.compl); wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(txqs.gen) | V_WR_TID(txq->token)); set_wr_hdr(&hdr->wr, wr_hi, wr_lo); wmb(); ETHER_BPF_MTAP(pi->ifp, m0); wr_gen2(txd, txqs.gen); check_ring_tx_db(sc, txq, 0); m_freem(m0); return (0); } flits = 3; } else { struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd; GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP))) cntrl |= F_TXPKT_IPCSUM_DIS; if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6)))) cntrl |= F_TXPKT_L4CSUM_DIS; cpl->cntrl = htonl(cntrl); cpl->len = htonl(mlen | 0x80000000); if (mlen <= PIO_LEN) { txsd->m = NULL; m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]); flits = (mlen + 7) / 8 + 2; wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) | V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | F_WR_SOP | F_WR_EOP | txqs.compl); wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(txqs.gen) | V_WR_TID(txq->token)); set_wr_hdr(&cpl->wr, wr_hi, wr_lo); wmb(); ETHER_BPF_MTAP(pi->ifp, m0); wr_gen2(txd, txqs.gen); check_ring_tx_db(sc, txq, 0); m_freem(m0); return (0); } flits = 2; } wrp = (struct work_request_hdr *)txd; sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl; make_sgl(sgp, segs, nsegs); sgl_flits = sgl_len(nsegs); ETHER_BPF_MTAP(pi->ifp, m0); KASSERT(ndesc <= 4, ("ndesc too large %d", ndesc)); wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl); wr_lo = htonl(V_WR_TID(txq->token)); write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo); check_ring_tx_db(sc, txq, 0); return (0); } #ifdef DEBUGNET int cxgb_debugnet_encap(struct sge_qset *qs, struct mbuf **m) { int error; error = t3_encap(qs, m); if (error == 0) check_ring_tx_db(qs->port->adapter, &qs->txq[TXQ_ETH], 1); else if (*m != NULL) { m_freem(*m); *m = NULL; } return (error); } #endif void cxgb_tx_watchdog(void *arg) { struct sge_qset *qs = arg; struct sge_txq *txq = &qs->txq[TXQ_ETH]; if (qs->coalescing != 0 && (txq->in_use <= cxgb_tx_coalesce_enable_stop) && TXQ_RING_EMPTY(qs)) qs->coalescing = 0; else if (qs->coalescing == 0 && (txq->in_use >= cxgb_tx_coalesce_enable_start)) qs->coalescing = 1; if (TXQ_TRYLOCK(qs)) { qs->qs_flags |= QS_FLUSHING; cxgb_start_locked(qs); qs->qs_flags &= ~QS_FLUSHING; TXQ_UNLOCK(qs); } - if (qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING) + if (if_getdrvflags(qs->port->ifp) & IFF_DRV_RUNNING) callout_reset_on(&txq->txq_watchdog, hz/4, cxgb_tx_watchdog, qs, txq->txq_watchdog.c_cpu); } static void cxgb_tx_timeout(void *arg) { struct sge_qset *qs = arg; struct sge_txq *txq = &qs->txq[TXQ_ETH]; if (qs->coalescing == 0 && (txq->in_use >= (txq->size>>3))) qs->coalescing = 1; if (TXQ_TRYLOCK(qs)) { qs->qs_flags |= QS_TIMEOUT; cxgb_start_locked(qs); qs->qs_flags &= ~QS_TIMEOUT; TXQ_UNLOCK(qs); } } static void cxgb_start_locked(struct sge_qset *qs) { struct mbuf *m_head = NULL; struct sge_txq *txq = &qs->txq[TXQ_ETH]; struct port_info *pi = qs->port; - struct ifnet *ifp = pi->ifp; + if_t ifp = pi->ifp; if (qs->qs_flags & (QS_FLUSHING|QS_TIMEOUT)) reclaim_completed_tx(qs, 0, TXQ_ETH); if (!pi->link_config.link_ok) { TXQ_RING_FLUSH(qs); return; } TXQ_LOCK_ASSERT(qs); - while (!TXQ_RING_EMPTY(qs) && (ifp->if_drv_flags & IFF_DRV_RUNNING) && + while (!TXQ_RING_EMPTY(qs) && (if_getdrvflags(ifp) & IFF_DRV_RUNNING) && pi->link_config.link_ok) { 
reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH); if (txq->size - txq->in_use <= TX_MAX_DESC) break; if ((m_head = cxgb_dequeue(qs)) == NULL) break; /* * Encapsulation can modify our pointer, and or make it * NULL on failure. In that event, we can't requeue. */ if (t3_encap(qs, &m_head) || m_head == NULL) break; m_head = NULL; } if (txq->db_pending) check_ring_tx_db(pi->adapter, txq, 1); if (!TXQ_RING_EMPTY(qs) && callout_pending(&txq->txq_timer) == 0 && pi->link_config.link_ok) callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout, qs, txq->txq_timer.c_cpu); if (m_head != NULL) m_freem(m_head); } static int -cxgb_transmit_locked(struct ifnet *ifp, struct sge_qset *qs, struct mbuf *m) +cxgb_transmit_locked(if_t ifp, struct sge_qset *qs, struct mbuf *m) { struct port_info *pi = qs->port; struct sge_txq *txq = &qs->txq[TXQ_ETH]; struct buf_ring *br = txq->txq_mr; int error, avail; avail = txq->size - txq->in_use; TXQ_LOCK_ASSERT(qs); /* * We can only do a direct transmit if the following are true: * - we aren't coalescing (ring < 3/4 full) * - the link is up -- checked in caller * - there are no packets enqueued already * - there is space in hardware transmit queue */ if (check_pkt_coalesce(qs) == 0 && !TXQ_RING_NEEDS_ENQUEUE(qs) && avail > TX_MAX_DESC) { if (t3_encap(qs, &m)) { if (m != NULL && (error = drbr_enqueue(ifp, br, m)) != 0) return (error); } else { if (txq->db_pending) check_ring_tx_db(pi->adapter, txq, 1); /* * We've bypassed the buf ring so we need to update * the stats directly */ txq->txq_direct_packets++; txq->txq_direct_bytes += m->m_pkthdr.len; } } else if ((error = drbr_enqueue(ifp, br, m)) != 0) return (error); reclaim_completed_tx(qs, cxgb_tx_reclaim_threshold, TXQ_ETH); if (!TXQ_RING_EMPTY(qs) && pi->link_config.link_ok && (!check_pkt_coalesce(qs) || (drbr_inuse(ifp, br) >= 7))) cxgb_start_locked(qs); else if (!TXQ_RING_EMPTY(qs) && !callout_pending(&txq->txq_timer)) callout_reset_on(&txq->txq_timer, 1, cxgb_tx_timeout, qs, txq->txq_timer.c_cpu); return (0); } int -cxgb_transmit(struct ifnet *ifp, struct mbuf *m) +cxgb_transmit(if_t ifp, struct mbuf *m) { struct sge_qset *qs; - struct port_info *pi = ifp->if_softc; + struct port_info *pi = if_getsoftc(ifp); int error, qidx = pi->first_qset; - if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 + if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 ||(!pi->link_config.link_ok)) { m_freem(m); return (0); } /* check if flowid is set */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) qidx = (m->m_pkthdr.flowid % pi->nqsets) + pi->first_qset; qs = &pi->adapter->sge.qs[qidx]; if (TXQ_TRYLOCK(qs)) { /* XXX running */ error = cxgb_transmit_locked(ifp, qs, m); TXQ_UNLOCK(qs); } else error = drbr_enqueue(ifp, qs->txq[TXQ_ETH].txq_mr, m); return (error); } void -cxgb_qflush(struct ifnet *ifp) +cxgb_qflush(if_t ifp) { /* * flush any enqueued mbufs in the buf_rings * and in the transmit queues * no-op for now */ return; } /** * write_imm - write a packet into a Tx descriptor as immediate data * @d: the Tx descriptor to write * @m: the packet * @len: the length of packet data to write as immediate data * @gen: the generation bit value to write * * Writes a packet as immediate data into a Tx descriptor. The packet * contains a work request at its beginning. We must write the packet * carefully so the SGE doesn't read accidentally before it's written in * its entirety. 
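* The body past the WR header is copied first; the header itself, with F_WR_SOP, F_WR_EOP, V_WR_BCNTLFLT(len & 7) and V_WR_LEN((len + 7) / 8) merged in, is written last, followed by a wmb() and the generation flit.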
*/ static __inline void write_imm(struct tx_desc *d, caddr_t src, unsigned int len, unsigned int gen) { struct work_request_hdr *from = (struct work_request_hdr *)src; struct work_request_hdr *to = (struct work_request_hdr *)d; uint32_t wr_hi, wr_lo; KASSERT(len <= WR_LEN && len >= sizeof(*from), ("%s: invalid len %d", __func__, len)); memcpy(&to[1], &from[1], len - sizeof(*from)); wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP | V_WR_BCNTLFLT(len & 7)); wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) | V_WR_LEN((len + 7) / 8)); set_wr_hdr(to, wr_hi, wr_lo); wmb(); wr_gen2(d, gen); } /** * check_desc_avail - check descriptor availability on a send queue * @adap: the adapter * @q: the TX queue * @m: the packet needing the descriptors * @ndesc: the number of Tx descriptors needed * @qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL) * * Checks if the requested number of Tx descriptors is available on an * SGE send queue. If the queue is already suspended or not enough * descriptors are available the packet is queued for later transmission. * Must be called with the Tx queue locked. * * Returns 0 if enough descriptors are available, 1 if there aren't * enough descriptors and the packet has been queued, and 2 if the caller * needs to retry because there weren't enough descriptors at the * beginning of the call but some freed up in the mean time. */ static __inline int check_desc_avail(adapter_t *adap, struct sge_txq *q, struct mbuf *m, unsigned int ndesc, unsigned int qid) { /* * XXX We currently only use this for checking the control queue * the control queue is only used for binding qsets which happens * at init time so we are guaranteed enough descriptors */ if (__predict_false(mbufq_len(&q->sendq))) { addq_exit: (void )mbufq_enqueue(&q->sendq, m); return 1; } if (__predict_false(q->size - q->in_use < ndesc)) { struct sge_qset *qs = txq_to_qset(q, qid); setbit(&qs->txq_stopped, qid); if (should_restart_tx(q) && test_and_clear_bit(qid, &qs->txq_stopped)) return 2; q->stops++; goto addq_exit; } return 0; } /** * reclaim_completed_tx_imm - reclaim completed control-queue Tx descs * @q: the SGE control Tx queue * * This is a variant of reclaim_completed_tx() that is used for Tx queues * that send only immediate data (presently just the control queues) and * thus do not have any mbufs */ static __inline void reclaim_completed_tx_imm(struct sge_txq *q) { unsigned int reclaim = q->processed - q->cleaned; q->in_use -= reclaim; q->cleaned += reclaim; } /** * ctrl_xmit - send a packet through an SGE control Tx queue * @adap: the adapter * @q: the control queue * @m: the packet * * Send a packet through an SGE control Tx queue. Packets sent through * a control queue must fit entirely as immediate data in a single Tx * descriptor and have no page fragments. 
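* If the queue is out of descriptors the packet is left on the sendq and ENOSPC is returned; if descriptors freed up in the meantime (check_desc_avail() returns 2) the send is simply retried.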
*/ static int ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m) { int ret; struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *); struct sge_txq *q = &qs->txq[TXQ_CTRL]; KASSERT(m->m_len <= WR_LEN, ("%s: bad tx data", __func__)); wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP); wrp->wrh_lo = htonl(V_WR_TID(q->token)); TXQ_LOCK(qs); again: reclaim_completed_tx_imm(q); ret = check_desc_avail(adap, q, m, 1, TXQ_CTRL); if (__predict_false(ret)) { if (ret == 1) { TXQ_UNLOCK(qs); return (ENOSPC); } goto again; } write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen); q->in_use++; if (++q->pidx >= q->size) { q->pidx = 0; q->gen ^= 1; } TXQ_UNLOCK(qs); wmb(); t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); m_free(m); return (0); } /** * restart_ctrlq - restart a suspended control queue * @qs: the queue set cotaining the control queue * * Resumes transmission on a suspended Tx control queue. */ static void restart_ctrlq(void *data, int npending) { struct mbuf *m; struct sge_qset *qs = (struct sge_qset *)data; struct sge_txq *q = &qs->txq[TXQ_CTRL]; adapter_t *adap = qs->port->adapter; TXQ_LOCK(qs); again: reclaim_completed_tx_imm(q); while (q->in_use < q->size && (m = mbufq_dequeue(&q->sendq)) != NULL) { write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen); m_free(m); if (++q->pidx >= q->size) { q->pidx = 0; q->gen ^= 1; } q->in_use++; } if (mbufq_len(&q->sendq)) { setbit(&qs->txq_stopped, TXQ_CTRL); if (should_restart_tx(q) && test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) goto again; q->stops++; } TXQ_UNLOCK(qs); t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); } /* * Send a management message through control queue 0 */ int t3_mgmt_tx(struct adapter *adap, struct mbuf *m) { return ctrl_xmit(adap, &adap->sge.qs[0], m); } /** * free_qset - free the resources of an SGE queue set * @sc: the controller owning the queue set * @q: the queue set * * Release the HW and SW resources associated with an SGE queue set, such * as HW contexts, packet buffers, and descriptor rings. Traffic to the * queue set must be quiesced prior to calling this. 
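* The queue set's lock must be held on entry (see t3_free_sge_resources()); it is dropped and destroyed in the course of the teardown.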
*/ static void t3_free_qset(adapter_t *sc, struct sge_qset *q) { int i; reclaim_completed_tx(q, 0, TXQ_ETH); if (q->txq[TXQ_ETH].txq_mr != NULL) buf_ring_free(q->txq[TXQ_ETH].txq_mr, M_DEVBUF); if (q->txq[TXQ_ETH].txq_ifq != NULL) { ifq_delete(q->txq[TXQ_ETH].txq_ifq); free(q->txq[TXQ_ETH].txq_ifq, M_DEVBUF); } for (i = 0; i < SGE_RXQ_PER_SET; ++i) { if (q->fl[i].desc) { mtx_lock_spin(&sc->sge.reg_lock); t3_sge_disable_fl(sc, q->fl[i].cntxt_id); mtx_unlock_spin(&sc->sge.reg_lock); bus_dmamap_unload(q->fl[i].desc_tag, q->fl[i].desc_map); bus_dmamem_free(q->fl[i].desc_tag, q->fl[i].desc, q->fl[i].desc_map); bus_dma_tag_destroy(q->fl[i].desc_tag); bus_dma_tag_destroy(q->fl[i].entry_tag); } if (q->fl[i].sdesc) { free_rx_bufs(sc, &q->fl[i]); free(q->fl[i].sdesc, M_DEVBUF); } } mtx_unlock(&q->lock); MTX_DESTROY(&q->lock); for (i = 0; i < SGE_TXQ_PER_SET; i++) { if (q->txq[i].desc) { mtx_lock_spin(&sc->sge.reg_lock); t3_sge_enable_ecntxt(sc, q->txq[i].cntxt_id, 0); mtx_unlock_spin(&sc->sge.reg_lock); bus_dmamap_unload(q->txq[i].desc_tag, q->txq[i].desc_map); bus_dmamem_free(q->txq[i].desc_tag, q->txq[i].desc, q->txq[i].desc_map); bus_dma_tag_destroy(q->txq[i].desc_tag); bus_dma_tag_destroy(q->txq[i].entry_tag); } if (q->txq[i].sdesc) { free(q->txq[i].sdesc, M_DEVBUF); } } if (q->rspq.desc) { mtx_lock_spin(&sc->sge.reg_lock); t3_sge_disable_rspcntxt(sc, q->rspq.cntxt_id); mtx_unlock_spin(&sc->sge.reg_lock); bus_dmamap_unload(q->rspq.desc_tag, q->rspq.desc_map); bus_dmamem_free(q->rspq.desc_tag, q->rspq.desc, q->rspq.desc_map); bus_dma_tag_destroy(q->rspq.desc_tag); MTX_DESTROY(&q->rspq.lock); } #if defined(INET6) || defined(INET) tcp_lro_free(&q->lro.ctrl); #endif bzero(q, sizeof(*q)); } /** * t3_free_sge_resources - free SGE resources * @sc: the adapter softc * * Frees resources used by the SGE queue sets. */ void t3_free_sge_resources(adapter_t *sc, int nqsets) { int i; for (i = 0; i < nqsets; ++i) { TXQ_LOCK(&sc->sge.qs[i]); t3_free_qset(sc, &sc->sge.qs[i]); } } /** * t3_sge_start - enable SGE * @sc: the controller softc * * Enables the SGE for DMAs. This is the last step in starting packet * transfers. */ void t3_sge_start(adapter_t *sc) { t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE); } /** * t3_sge_stop - disable SGE operation * @sc: the adapter * * Disables the DMA engine. This can be called in emeregencies (e.g., * from error interrupts) or from normal process context. In the latter * case it also disables any pending queue restart tasklets. Note that * if it is called in interrupt context it cannot disable the restart * tasklets as it cannot wait, however the tasklets will have no effect * since the doorbells are disabled and the driver will call this again * later from process context, at which time the tasklets will be stopped * if they are still running. */ void t3_sge_stop(adapter_t *sc) { t3_set_reg_field(sc, A_SG_CONTROL, F_GLOBALENABLE, 0); } /** * t3_free_tx_desc - reclaims Tx descriptors and their buffers * @adapter: the adapter * @q: the Tx queue to reclaim descriptors from * @reclaimable: the number of descriptors to reclaim * @m_vec_size: maximum number of buffers to reclaim * @desc_reclaimed: returns the number of descriptors reclaimed * * Reclaims Tx descriptors from an SGE Tx queue and frees the associated * Tx buffers. Called with the Tx queue lock held. 
* * Returns number of buffers of reclaimed */ void t3_free_tx_desc(struct sge_qset *qs, int reclaimable, int queue) { struct tx_sw_desc *txsd; unsigned int cidx, mask; struct sge_txq *q = &qs->txq[queue]; #ifdef T3_TRACE T3_TRACE2(sc->tb[q->cntxt_id & 7], "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx); #endif cidx = q->cidx; mask = q->size - 1; txsd = &q->sdesc[cidx]; mtx_assert(&qs->lock, MA_OWNED); while (reclaimable--) { prefetch(q->sdesc[(cidx + 1) & mask].m); prefetch(q->sdesc[(cidx + 2) & mask].m); if (txsd->m != NULL) { if (txsd->flags & TX_SW_DESC_MAPPED) { bus_dmamap_unload(q->entry_tag, txsd->map); txsd->flags &= ~TX_SW_DESC_MAPPED; } m_freem_list(txsd->m); txsd->m = NULL; } else q->txq_skipped++; ++txsd; if (++cidx == q->size) { cidx = 0; txsd = q->sdesc; } } q->cidx = cidx; } /** * is_new_response - check if a response is newly written * @r: the response descriptor * @q: the response queue * * Returns true if a response descriptor contains a yet unprocessed * response. */ static __inline int is_new_response(const struct rsp_desc *r, const struct sge_rspq *q) { return (r->intr_gen & F_RSPD_GEN2) == q->gen; } #define RSPD_GTS_MASK (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS) #define RSPD_CTRL_MASK (RSPD_GTS_MASK | \ V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \ V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \ V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR)) /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */ #define NOMEM_INTR_DELAY 2500 #ifdef TCP_OFFLOAD /** * write_ofld_wr - write an offload work request * @adap: the adapter * @m: the packet to send * @q: the Tx queue * @pidx: index of the first Tx descriptor to write * @gen: the generation value to use * @ndesc: number of descriptors the packet will occupy * * Write an offload work request to send the supplied packet. The packet * data already carry the work request with most fields populated. */ static void write_ofld_wr(adapter_t *adap, struct mbuf *m, struct sge_txq *q, unsigned int pidx, unsigned int gen, unsigned int ndesc) { unsigned int sgl_flits, flits; int i, idx, nsegs, wrlen; struct work_request_hdr *from; struct sg_ent *sgp, t3sgl[TX_MAX_SEGS / 2 + 1]; struct tx_desc *d = &q->desc[pidx]; struct txq_state txqs; struct sglist_seg *segs; struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); struct sglist *sgl; from = (void *)(oh + 1); /* Start of WR within mbuf */ wrlen = m->m_len - sizeof(*oh); if (!(oh->flags & F_HDR_SGL)) { write_imm(d, (caddr_t)from, wrlen, gen); /* * mbuf with "real" immediate tx data will be enqueue_wr'd by * t3_push_frames and freed in wr_ack. Others, like those sent * down by close_conn, t3_send_reset, etc. should be freed here. */ if (!(oh->flags & F_HDR_DF)) m_free(m); return; } memcpy(&d->flit[1], &from[1], wrlen - sizeof(*from)); sgl = oh->sgl; flits = wrlen / 8; sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : t3sgl; nsegs = sgl->sg_nseg; segs = sgl->sg_segs; for (idx = 0, i = 0; i < nsegs; i++) { KASSERT(segs[i].ss_len, ("%s: 0 len in sgl", __func__)); if (i && idx == 0) ++sgp; sgp->len[idx] = htobe32(segs[i].ss_len); sgp->addr[idx] = htobe64(segs[i].ss_paddr); idx ^= 1; } if (idx) { sgp->len[idx] = 0; sgp->addr[idx] = 0; } sgl_flits = sgl_len(nsegs); txqs.gen = gen; txqs.pidx = pidx; txqs.compl = 0; write_wr_hdr_sgl(ndesc, d, &txqs, q, t3sgl, flits, sgl_flits, from->wrh_hi, from->wrh_lo); } /** * ofld_xmit - send a packet through an offload queue * @adap: the adapter * @q: the Tx offload queue * @m: the packet * * Send an offload packet through an SGE offload queue. 
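* The descriptor count is taken from the ofld_hdr flags (G_HDR_NDESC()); if the queue is full the packet is left on the sendq and EINTR is returned.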
*/ static int ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m) { int ret; unsigned int ndesc; unsigned int pidx, gen; struct sge_txq *q = &qs->txq[TXQ_OFLD]; struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); ndesc = G_HDR_NDESC(oh->flags); TXQ_LOCK(qs); again: reclaim_completed_tx(qs, 16, TXQ_OFLD); ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD); if (__predict_false(ret)) { if (ret == 1) { TXQ_UNLOCK(qs); return (EINTR); } goto again; } gen = q->gen; q->in_use += ndesc; pidx = q->pidx; q->pidx += ndesc; if (q->pidx >= q->size) { q->pidx -= q->size; q->gen ^= 1; } write_ofld_wr(adap, m, q, pidx, gen, ndesc); check_ring_tx_db(adap, q, 1); TXQ_UNLOCK(qs); return (0); } /** * restart_offloadq - restart a suspended offload queue * @qs: the queue set cotaining the offload queue * * Resumes transmission on a suspended Tx offload queue. */ static void restart_offloadq(void *data, int npending) { struct mbuf *m; struct sge_qset *qs = data; struct sge_txq *q = &qs->txq[TXQ_OFLD]; adapter_t *adap = qs->port->adapter; TXQ_LOCK(qs); again: while ((m = mbufq_first(&q->sendq)) != NULL) { unsigned int gen, pidx; struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); unsigned int ndesc = G_HDR_NDESC(oh->flags); if (__predict_false(q->size - q->in_use < ndesc)) { setbit(&qs->txq_stopped, TXQ_OFLD); if (should_restart_tx(q) && test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) goto again; q->stops++; break; } gen = q->gen; q->in_use += ndesc; pidx = q->pidx; q->pidx += ndesc; if (q->pidx >= q->size) { q->pidx -= q->size; q->gen ^= 1; } (void)mbufq_dequeue(&q->sendq); TXQ_UNLOCK(qs); write_ofld_wr(adap, m, q, pidx, gen, ndesc); TXQ_LOCK(qs); } #if USE_GTS set_bit(TXQ_RUNNING, &q->flags); set_bit(TXQ_LAST_PKT_DB, &q->flags); #endif TXQ_UNLOCK(qs); wmb(); t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); } /** * t3_offload_tx - send an offload packet * @m: the packet * * Sends an offload packet. We use the packet priority to select the * appropriate Tx queue as follows: bit 0 indicates whether the packet * should be sent as regular or control, bits 1-3 select the queue set. */ int t3_offload_tx(struct adapter *sc, struct mbuf *m) { struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); struct sge_qset *qs = &sc->sge.qs[G_HDR_QSET(oh->flags)]; if (oh->flags & F_HDR_CTRL) { m_adj(m, sizeof (*oh)); /* trim ofld_hdr off */ return (ctrl_xmit(sc, qs, m)); } else return (ofld_xmit(sc, qs, m)); } #endif static void restart_tx(struct sge_qset *qs) { struct adapter *sc = qs->port->adapter; if (isset(&qs->txq_stopped, TXQ_OFLD) && should_restart_tx(&qs->txq[TXQ_OFLD]) && test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) { qs->txq[TXQ_OFLD].restarts++; taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task); } if (isset(&qs->txq_stopped, TXQ_CTRL) && should_restart_tx(&qs->txq[TXQ_CTRL]) && test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) { qs->txq[TXQ_CTRL].restarts++; taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task); } } /** * t3_sge_alloc_qset - initialize an SGE queue set * @sc: the controller softc * @id: the queue set id * @nports: how many Ethernet ports will be using this queue set * @irq_vec_idx: the IRQ vector index for response queue interrupts * @p: configuration parameters for this queue set * @ntxq: number of Tx queues for the queue set * @pi: port info for queue set * * Allocate resources and initialize an SGE queue set. A queue set * comprises a response queue, two Rx free-buffer queues, and up to 3 * Tx queues. 
The Tx queues are assigned roles in the order Ethernet * queue, offload queue, and control queue. */ int t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx, const struct qset_params *p, int ntxq, struct port_info *pi) { struct sge_qset *q = &sc->sge.qs[id]; int i, ret = 0; MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF); q->port = pi; q->adap = sc; if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size, M_DEVBUF, M_WAITOK, &q->lock)) == NULL) { device_printf(sc->dev, "failed to allocate mbuf ring\n"); goto err; } if ((q->txq[TXQ_ETH].txq_ifq = malloc(sizeof(struct ifaltq), M_DEVBUF, M_NOWAIT | M_ZERO)) == NULL) { device_printf(sc->dev, "failed to allocate ifq\n"); goto err; } ifq_init(q->txq[TXQ_ETH].txq_ifq, pi->ifp); callout_init(&q->txq[TXQ_ETH].txq_timer, 1); callout_init(&q->txq[TXQ_ETH].txq_watchdog, 1); q->txq[TXQ_ETH].txq_timer.c_cpu = id % mp_ncpus; q->txq[TXQ_ETH].txq_watchdog.c_cpu = id % mp_ncpus; init_qset_cntxt(q, id); q->idx = id; if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc), sizeof(struct rx_sw_desc), &q->fl[0].phys_addr, &q->fl[0].desc, &q->fl[0].sdesc, &q->fl[0].desc_tag, &q->fl[0].desc_map, sc->rx_dmat, &q->fl[0].entry_tag)) != 0) { printf("error %d from alloc ring fl0\n", ret); goto err; } if ((ret = alloc_ring(sc, p->jumbo_size, sizeof(struct rx_desc), sizeof(struct rx_sw_desc), &q->fl[1].phys_addr, &q->fl[1].desc, &q->fl[1].sdesc, &q->fl[1].desc_tag, &q->fl[1].desc_map, sc->rx_jumbo_dmat, &q->fl[1].entry_tag)) != 0) { printf("error %d from alloc ring fl1\n", ret); goto err; } if ((ret = alloc_ring(sc, p->rspq_size, sizeof(struct rsp_desc), 0, &q->rspq.phys_addr, &q->rspq.desc, NULL, &q->rspq.desc_tag, &q->rspq.desc_map, NULL, NULL)) != 0) { printf("error %d from alloc ring rspq\n", ret); goto err; } snprintf(q->rspq.lockbuf, RSPQ_NAME_LEN, "t3 rspq lock %d:%d", device_get_unit(sc->dev), irq_vec_idx); MTX_INIT(&q->rspq.lock, q->rspq.lockbuf, NULL, MTX_DEF); for (i = 0; i < ntxq; ++i) { size_t sz = i == TXQ_CTRL ? 
0 : sizeof(struct tx_sw_desc); if ((ret = alloc_ring(sc, p->txq_size[i], sizeof(struct tx_desc), sz, &q->txq[i].phys_addr, &q->txq[i].desc, &q->txq[i].sdesc, &q->txq[i].desc_tag, &q->txq[i].desc_map, sc->tx_dmat, &q->txq[i].entry_tag)) != 0) { printf("error %d from alloc ring tx %i\n", ret, i); goto err; } mbufq_init(&q->txq[i].sendq, INT_MAX); q->txq[i].gen = 1; q->txq[i].size = p->txq_size[i]; } #ifdef TCP_OFFLOAD TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q); #endif TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q); TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q); TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q); q->fl[0].gen = q->fl[1].gen = 1; q->fl[0].size = p->fl_size; q->fl[1].size = p->jumbo_size; q->rspq.gen = 1; q->rspq.cidx = 0; q->rspq.size = p->rspq_size; q->txq[TXQ_ETH].stop_thres = nports * flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3); q->fl[0].buf_size = MCLBYTES; q->fl[0].zone = zone_pack; q->fl[0].type = EXT_PACKET; if (p->jumbo_buf_size == MJUM16BYTES) { q->fl[1].zone = zone_jumbo16; q->fl[1].type = EXT_JUMBO16; } else if (p->jumbo_buf_size == MJUM9BYTES) { q->fl[1].zone = zone_jumbo9; q->fl[1].type = EXT_JUMBO9; } else if (p->jumbo_buf_size == MJUMPAGESIZE) { q->fl[1].zone = zone_jumbop; q->fl[1].type = EXT_JUMBOP; } else { KASSERT(0, ("can't deal with jumbo_buf_size %d.", p->jumbo_buf_size)); ret = EDOOFUS; goto err; } q->fl[1].buf_size = p->jumbo_buf_size; /* Allocate and setup the lro_ctrl structure */ - q->lro.enabled = !!(pi->ifp->if_capenable & IFCAP_LRO); + q->lro.enabled = !!(if_getcapenable(pi->ifp) & IFCAP_LRO); #if defined(INET6) || defined(INET) ret = tcp_lro_init(&q->lro.ctrl); if (ret) { printf("error %d from tcp_lro_init\n", ret); goto err; } #endif q->lro.ctrl.ifp = pi->ifp; mtx_lock_spin(&sc->sge.reg_lock); ret = -t3_sge_init_rspcntxt(sc, q->rspq.cntxt_id, irq_vec_idx, q->rspq.phys_addr, q->rspq.size, q->fl[0].buf_size, 1, 0); if (ret) { printf("error %d from t3_sge_init_rspcntxt\n", ret); goto err_unlock; } for (i = 0; i < SGE_RXQ_PER_SET; ++i) { ret = -t3_sge_init_flcntxt(sc, q->fl[i].cntxt_id, 0, q->fl[i].phys_addr, q->fl[i].size, q->fl[i].buf_size, p->cong_thres, 1, 0); if (ret) { printf("error %d from t3_sge_init_flcntxt for index i=%d\n", ret, i); goto err_unlock; } } ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_ETH].cntxt_id, USE_GTS, SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr, q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token, 1, 0); if (ret) { printf("error %d from t3_sge_init_ecntxt\n", ret); goto err_unlock; } if (ntxq > 1) { ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_OFLD].cntxt_id, USE_GTS, SGE_CNTXT_OFLD, id, q->txq[TXQ_OFLD].phys_addr, q->txq[TXQ_OFLD].size, 0, 1, 0); if (ret) { printf("error %d from t3_sge_init_ecntxt\n", ret); goto err_unlock; } } if (ntxq > 2) { ret = -t3_sge_init_ecntxt(sc, q->txq[TXQ_CTRL].cntxt_id, 0, SGE_CNTXT_CTRL, id, q->txq[TXQ_CTRL].phys_addr, q->txq[TXQ_CTRL].size, q->txq[TXQ_CTRL].token, 1, 0); if (ret) { printf("error %d from t3_sge_init_ecntxt\n", ret); goto err_unlock; } } mtx_unlock_spin(&sc->sge.reg_lock); t3_update_qset_coalesce(q, p); refill_fl(sc, &q->fl[0], q->fl[0].size); refill_fl(sc, &q->fl[1], q->fl[1].size); refill_rspq(sc, &q->rspq, q->rspq.size - 1); t3_write_reg(sc, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) | V_NEWTIMER(q->rspq.holdoff_tmr)); return (0); err_unlock: mtx_unlock_spin(&sc->sge.reg_lock); err: TXQ_LOCK(q); t3_free_qset(sc, q); return (ret); } /* * Remove CPL_RX_PKT headers from the mbuf and reduce it to a regular 
mbuf with * ethernet data. Hardware assistance with various checksums and any vlan tag * will also be taken into account here. */ void t3_rx_eth(struct adapter *adap, struct mbuf *m, int ethpad) { struct cpl_rx_pkt *cpl = (struct cpl_rx_pkt *)(mtod(m, uint8_t *) + ethpad); struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]]; - struct ifnet *ifp = pi->ifp; + if_t ifp = pi->ifp; if (cpl->vlan_valid) { m->m_pkthdr.ether_vtag = ntohs(cpl->vlan); m->m_flags |= M_VLANTAG; } m->m_pkthdr.rcvif = ifp; /* * adjust after conversion to mbuf chain */ m->m_pkthdr.len -= (sizeof(*cpl) + ethpad); m->m_len -= (sizeof(*cpl) + ethpad); m->m_data += (sizeof(*cpl) + ethpad); if (!cpl->fragment && cpl->csum_valid && cpl->csum == 0xffff) { struct ether_header *eh = mtod(m, void *); uint16_t eh_type; if (eh->ether_type == htons(ETHERTYPE_VLAN)) { struct ether_vlan_header *evh = mtod(m, void *); eh_type = evh->evl_proto; } else eh_type = eh->ether_type; - if (ifp->if_capenable & IFCAP_RXCSUM && + if (if_getcapenable(ifp) & IFCAP_RXCSUM && eh_type == htons(ETHERTYPE_IP)) { m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; - } else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 && + } else if (if_getcapenable(ifp) & IFCAP_RXCSUM_IPV6 && eh_type == htons(ETHERTYPE_IPV6)) { m->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; } } } /** * get_packet - return the next ingress packet buffer from a free list * @adap: the adapter that received the packet * @drop_thres: # of remaining buffers before we start dropping packets * @qs: the qset that the SGE free list holding the packet belongs to * @mh: the mbuf header, contains a pointer to the head and tail of the mbuf chain * @r: response descriptor * * Get the next packet from a free list and complete setup of the * sk_buff. If the packet is small we make a copy and recycle the * original buffer, otherwise we use the original buffer itself. If a * positive drop threshold is supplied packets are dropped and their * buffers recycled if (a) the number of remaining buffers is under the * threshold and the packet is too big to copy, or (b) the packet should * be copied but there is no memory for the copy. */ static int get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs, struct t3_mbuf_hdr *mh, struct rsp_desc *r) { unsigned int len_cq = ntohl(r->len_cq); struct sge_fl *fl = (len_cq & F_RSPD_FLQ) ? 
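/* F_RSPD_FLQ set: the buffer came from the jumbo free list */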
&qs->fl[1] : &qs->fl[0]; int mask, cidx = fl->cidx; struct rx_sw_desc *sd = &fl->sdesc[cidx]; uint32_t len = G_RSPD_LEN(len_cq); uint32_t flags = M_EXT; uint8_t sopeop = G_RSPD_SOP_EOP(ntohl(r->flags)); caddr_t cl; struct mbuf *m; int ret = 0; mask = fl->size - 1; prefetch(fl->sdesc[(cidx + 1) & mask].m); prefetch(fl->sdesc[(cidx + 2) & mask].m); prefetch(fl->sdesc[(cidx + 1) & mask].rxsd_cl); prefetch(fl->sdesc[(cidx + 2) & mask].rxsd_cl); fl->credits--; bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD); if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) { if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) goto skip_recycle; cl = mtod(m, void *); memcpy(cl, sd->rxsd_cl, len); recycle_rx_buf(adap, fl, fl->cidx); m->m_pkthdr.len = m->m_len = len; m->m_flags = 0; mh->mh_head = mh->mh_tail = m; ret = 1; goto done; } else { skip_recycle: bus_dmamap_unload(fl->entry_tag, sd->map); cl = sd->rxsd_cl; m = sd->m; if ((sopeop == RSPQ_SOP_EOP) || (sopeop == RSPQ_SOP)) flags |= M_PKTHDR; m_init(m, M_NOWAIT, MT_DATA, flags); if (fl->zone == zone_pack) { /* * restore clobbered data pointer */ m->m_data = m->m_ext.ext_buf; } else { m_cljset(m, cl, fl->type); } m->m_len = len; } switch(sopeop) { case RSPQ_SOP_EOP: ret = 1; /* FALLTHROUGH */ case RSPQ_SOP: mh->mh_head = mh->mh_tail = m; m->m_pkthdr.len = len; break; case RSPQ_EOP: ret = 1; /* FALLTHROUGH */ case RSPQ_NSOP_NEOP: if (mh->mh_tail == NULL) { log(LOG_ERR, "discarding intermediate descriptor entry\n"); m_freem(m); m = NULL; break; } mh->mh_tail->m_next = m; mh->mh_tail = m; mh->mh_head->m_pkthdr.len += len; break; } if (cxgb_debug && m != NULL) printf("len=%d pktlen=%d\n", m->m_len, m->m_pkthdr.len); done: if (++fl->cidx == fl->size) fl->cidx = 0; return (ret); } /** * handle_rsp_cntrl_info - handles control information in a response * @qs: the queue set corresponding to the response * @flags: the response control flags * * Handles the control information of an SGE response, such as GTS * indications and completion credits for the queue set's Tx queues. * HW coalesces credits, we don't do any extra SW coalescing. */ static __inline void handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags) { unsigned int credits; #if USE_GTS if (flags & F_RSPD_TXQ0_GTS) clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags); #endif credits = G_RSPD_TXQ0_CR(flags); if (credits) qs->txq[TXQ_ETH].processed += credits; credits = G_RSPD_TXQ2_CR(flags); if (credits) qs->txq[TXQ_CTRL].processed += credits; # if USE_GTS if (flags & F_RSPD_TXQ1_GTS) clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags); # endif credits = G_RSPD_TXQ1_CR(flags); if (credits) qs->txq[TXQ_OFLD].processed += credits; } static void check_ring_db(adapter_t *adap, struct sge_qset *qs, unsigned int sleeping) { ; } /** * process_responses - process responses from an SGE response queue * @adap: the adapter * @qs: the queue set to which the response queue belongs * @budget: how many responses can be processed in this round * * Process responses from an SGE response queue up to the supplied budget. * Responses include received packets as well as credits and other events * for the queues that belong to the response queue's queue set. * A negative budget is effectively unlimited. * * Additionally choose the interrupt holdoff time for the next interrupt * on this queue. If the system is under memory shortage use a fairly * long delay to help recovery. 
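 * (For example, a queue that fails an mbuf allocation switches its
 * holdoff to NOMEM_INTR_DELAY until buffers are available again.)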
*/ static int process_responses(adapter_t *adap, struct sge_qset *qs, int budget) { struct sge_rspq *rspq = &qs->rspq; struct rsp_desc *r = &rspq->desc[rspq->cidx]; int budget_left = budget; unsigned int sleeping = 0; #if defined(INET6) || defined(INET) int lro_enabled = qs->lro.enabled; int skip_lro; struct lro_ctrl *lro_ctrl = &qs->lro.ctrl; #endif struct t3_mbuf_hdr *mh = &rspq->rspq_mh; #ifdef DEBUG static int last_holdoff = 0; if (cxgb_debug && rspq->holdoff_tmr != last_holdoff) { printf("next_holdoff=%d\n", rspq->holdoff_tmr); last_holdoff = rspq->holdoff_tmr; } #endif rspq->next_holdoff = rspq->holdoff_tmr; while (__predict_true(budget_left && is_new_response(r, rspq))) { int eth, eop = 0, ethpad = 0; uint32_t flags = ntohl(r->flags); uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val); uint8_t opcode = r->rss_hdr.opcode; eth = (opcode == CPL_RX_PKT); if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) { struct mbuf *m; if (cxgb_debug) printf("async notification\n"); if (mh->mh_head == NULL) { mh->mh_head = m_gethdr(M_NOWAIT, MT_DATA); m = mh->mh_head; } else { m = m_gethdr(M_NOWAIT, MT_DATA); } if (m == NULL) goto no_mem; memcpy(mtod(m, char *), r, AN_PKT_SIZE); m->m_len = m->m_pkthdr.len = AN_PKT_SIZE; *mtod(m, uint8_t *) = CPL_ASYNC_NOTIF; opcode = CPL_ASYNC_NOTIF; eop = 1; rspq->async_notif++; goto skip; } else if (flags & F_RSPD_IMM_DATA_VALID) { struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { no_mem: rspq->next_holdoff = NOMEM_INTR_DELAY; budget_left--; break; } if (mh->mh_head == NULL) mh->mh_head = m; else mh->mh_tail->m_next = m; mh->mh_tail = m; get_imm_packet(adap, r, m); mh->mh_head->m_pkthdr.len += m->m_len; eop = 1; rspq->imm_data++; } else if (r->len_cq) { int drop_thresh = eth ? SGE_RX_DROP_THRES : 0; eop = get_packet(adap, drop_thresh, qs, mh, r); if (eop) { if (r->rss_hdr.hash_type && !adap->timestamp) { M_HASHTYPE_SET(mh->mh_head, M_HASHTYPE_OPAQUE_HASH); mh->mh_head->m_pkthdr.flowid = rss_hash; } } ethpad = 2; } else { rspq->pure_rsps++; } skip: if (flags & RSPD_CTRL_MASK) { sleeping |= flags & RSPD_GTS_MASK; handle_rsp_cntrl_info(qs, flags); } if (!eth && eop) { rspq->offload_pkts++; #ifdef TCP_OFFLOAD adap->cpl_handler[opcode](qs, r, mh->mh_head); #else m_freem(mh->mh_head); #endif mh->mh_head = NULL; } else if (eth && eop) { struct mbuf *m = mh->mh_head; t3_rx_eth(adap, m, ethpad); /* * The T304 sends incoming packets on any qset. If LRO * is also enabled, we could end up sending packet up * lro_ctrl->ifp's input. That is incorrect. * * The mbuf's rcvif was derived from the cpl header and * is accurate. Skip LRO and just use that. */ #if defined(INET6) || defined(INET) skip_lro = __predict_false(qs->port->ifp != m->m_pkthdr.rcvif); if (lro_enabled && lro_ctrl->lro_cnt && !skip_lro && (tcp_lro_rx(lro_ctrl, m, 0) == 0) ) { /* successfully queue'd for LRO */ } else #endif { /* * LRO not enabled, packet unsuitable for LRO, * or unable to queue. Pass it up right now in * either case. 
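 * The packet is handed to the stack through the if_input()
 * accessor on the opaque if_t.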
*/ - struct ifnet *ifp = m->m_pkthdr.rcvif; - (*ifp->if_input)(ifp, m); + if_t ifp = m->m_pkthdr.rcvif; + if_input(ifp, m); } mh->mh_head = NULL; } r++; if (__predict_false(++rspq->cidx == rspq->size)) { rspq->cidx = 0; rspq->gen ^= 1; r = rspq->desc; } if (++rspq->credits >= 64) { refill_rspq(adap, rspq, rspq->credits); rspq->credits = 0; } __refill_fl_lt(adap, &qs->fl[0], 32); __refill_fl_lt(adap, &qs->fl[1], 32); --budget_left; } #if defined(INET6) || defined(INET) /* Flush LRO */ tcp_lro_flush_all(lro_ctrl); #endif if (sleeping) check_ring_db(adap, qs, sleeping); mb(); /* commit Tx queue processed updates */ if (__predict_false(qs->txq_stopped > 1)) restart_tx(qs); __refill_fl_lt(adap, &qs->fl[0], 512); __refill_fl_lt(adap, &qs->fl[1], 512); budget -= budget_left; return (budget); } /* * A helper function that processes responses and issues GTS. */ static __inline int process_responses_gts(adapter_t *adap, struct sge_rspq *rq) { int work; static int last_holdoff = 0; work = process_responses(adap, rspq_to_qset(rq), -1); if (cxgb_debug && (rq->next_holdoff != last_holdoff)) { printf("next_holdoff=%d\n", rq->next_holdoff); last_holdoff = rq->next_holdoff; } t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) | V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx)); return (work); } #ifdef DEBUGNET int cxgb_debugnet_poll_rx(adapter_t *adap, struct sge_qset *qs) { return (process_responses_gts(adap, &qs->rspq)); } #endif /* * Interrupt handler for legacy INTx interrupts for T3B-based cards. * Handles data events from SGE response queues as well as error and other * async events as they all use the same interrupt pin. We use one SGE * response queue per port in this mode and protect all response queues with * queue 0's lock. */ void t3b_intr(void *data) { uint32_t i, map; adapter_t *adap = data; struct sge_rspq *q0 = &adap->sge.qs[0].rspq; t3_write_reg(adap, A_PL_CLI, 0); map = t3_read_reg(adap, A_SG_DATA_INTR); if (!map) return; if (__predict_false(map & F_ERRINTR)) { t3_write_reg(adap, A_PL_INT_ENABLE0, 0); (void) t3_read_reg(adap, A_PL_INT_ENABLE0); taskqueue_enqueue(adap->tq, &adap->slow_intr_task); } mtx_lock(&q0->lock); for_each_port(adap, i) if (map & (1 << i)) process_responses_gts(adap, &adap->sge.qs[i].rspq); mtx_unlock(&q0->lock); } /* * The MSI interrupt handler. This needs to handle data events from SGE * response queues as well as error and other async events as they all use * the same MSI vector. We use one SGE response queue per port in this mode * and protect all response queues with queue 0's lock. 
*/ void t3_intr_msi(void *data) { adapter_t *adap = data; struct sge_rspq *q0 = &adap->sge.qs[0].rspq; int i, new_packets = 0; mtx_lock(&q0->lock); for_each_port(adap, i) if (process_responses_gts(adap, &adap->sge.qs[i].rspq)) new_packets = 1; mtx_unlock(&q0->lock); if (new_packets == 0) { t3_write_reg(adap, A_PL_INT_ENABLE0, 0); (void) t3_read_reg(adap, A_PL_INT_ENABLE0); taskqueue_enqueue(adap->tq, &adap->slow_intr_task); } } void t3_intr_msix(void *data) { struct sge_qset *qs = data; adapter_t *adap = qs->port->adapter; struct sge_rspq *rspq = &qs->rspq; if (process_responses_gts(adap, rspq) == 0) rspq->unhandled_irqs++; } #define QDUMP_SBUF_SIZE 32 * 400 static int t3_dump_rspq(SYSCTL_HANDLER_ARGS) { struct sge_rspq *rspq; struct sge_qset *qs; int i, err, dump_end, idx; struct sbuf *sb; struct rsp_desc *rspd; uint32_t data[4]; rspq = arg1; qs = rspq_to_qset(rspq); if (rspq->rspq_dump_count == 0) return (0); if (rspq->rspq_dump_count > RSPQ_Q_SIZE) { log(LOG_WARNING, "dump count is too large %d\n", rspq->rspq_dump_count); rspq->rspq_dump_count = 0; return (EINVAL); } if (rspq->rspq_dump_start > (RSPQ_Q_SIZE-1)) { log(LOG_WARNING, "dump start of %d is greater than queue size\n", rspq->rspq_dump_start); rspq->rspq_dump_start = 0; return (EINVAL); } err = t3_sge_read_rspq(qs->port->adapter, rspq->cntxt_id, data); if (err) return (err); err = sysctl_wire_old_buffer(req, 0); if (err) return (err); sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req); sbuf_printf(sb, " \n index=%u size=%u MSI-X/RspQ=%u intr enable=%u intr armed=%u\n", (data[0] & 0xffff), data[0] >> 16, ((data[2] >> 20) & 0x3f), ((data[2] >> 26) & 1), ((data[2] >> 27) & 1)); sbuf_printf(sb, " generation=%u CQ mode=%u FL threshold=%u\n", ((data[2] >> 28) & 1), ((data[2] >> 31) & 1), data[3]); sbuf_printf(sb, " start=%d -> end=%d\n", rspq->rspq_dump_start, (rspq->rspq_dump_start + rspq->rspq_dump_count) & (RSPQ_Q_SIZE-1)); dump_end = rspq->rspq_dump_start + rspq->rspq_dump_count; for (i = rspq->rspq_dump_start; i < dump_end; i++) { idx = i & (RSPQ_Q_SIZE-1); rspd = &rspq->desc[idx]; sbuf_printf(sb, "\tidx=%04d opcode=%02x cpu_idx=%x hash_type=%x cq_idx=%x\n", idx, rspd->rss_hdr.opcode, rspd->rss_hdr.cpu_idx, rspd->rss_hdr.hash_type, be16toh(rspd->rss_hdr.cq_idx)); sbuf_printf(sb, "\trss_hash_val=%x flags=%08x len_cq=%x intr_gen=%x\n", rspd->rss_hdr.rss_hash_val, be32toh(rspd->flags), be32toh(rspd->len_cq), rspd->intr_gen); } err = sbuf_finish(sb); sbuf_delete(sb); return (err); } static int t3_dump_txq_eth(SYSCTL_HANDLER_ARGS) { struct sge_txq *txq; struct sge_qset *qs; int i, j, err, dump_end; struct sbuf *sb; struct tx_desc *txd; uint32_t *WR, wr_hi, wr_lo, gen; uint32_t data[4]; txq = arg1; qs = txq_to_qset(txq, TXQ_ETH); if (txq->txq_dump_count == 0) { return (0); } if (txq->txq_dump_count > TX_ETH_Q_SIZE) { log(LOG_WARNING, "dump count is too large %d\n", txq->txq_dump_count); txq->txq_dump_count = 1; return (EINVAL); } if (txq->txq_dump_start > (TX_ETH_Q_SIZE-1)) { log(LOG_WARNING, "dump start of %d is greater than queue size\n", txq->txq_dump_start); txq->txq_dump_start = 0; return (EINVAL); } err = t3_sge_read_ecntxt(qs->port->adapter, qs->rspq.cntxt_id, data); if (err) return (err); err = sysctl_wire_old_buffer(req, 0); if (err) return (err); sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req); sbuf_printf(sb, " \n credits=%u GTS=%u index=%u size=%u rspq#=%u cmdq#=%u\n", (data[0] & 0x7fff), ((data[0] >> 15) & 1), (data[0] >> 16), (data[1] & 0xffff), ((data[3] >> 4) & 7), ((data[3] >> 7) & 1)); 
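/*
 * The remaining fields of interest (TUN/TOE flags, generation, uP
 * token, valid bit) are all packed into data[3].
 */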
sbuf_printf(sb, " TUN=%u TOE=%u generation%u uP token=%u valid=%u\n", ((data[3] >> 8) & 1), ((data[3] >> 9) & 1), ((data[3] >> 10) & 1), ((data[3] >> 11) & 0xfffff), ((data[3] >> 31) & 1)); sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx, txq->txq_dump_start, (txq->txq_dump_start + txq->txq_dump_count) & (TX_ETH_Q_SIZE-1)); dump_end = txq->txq_dump_start + txq->txq_dump_count; for (i = txq->txq_dump_start; i < dump_end; i++) { txd = &txq->desc[i & (TX_ETH_Q_SIZE-1)]; WR = (uint32_t *)txd->flit; wr_hi = ntohl(WR[0]); wr_lo = ntohl(WR[1]); gen = G_WR_GEN(wr_lo); sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n", wr_hi, wr_lo, gen); for (j = 2; j < 30; j += 4) sbuf_printf(sb, "\t%08x %08x %08x %08x \n", WR[j], WR[j + 1], WR[j + 2], WR[j + 3]); } err = sbuf_finish(sb); sbuf_delete(sb); return (err); } static int t3_dump_txq_ctrl(SYSCTL_HANDLER_ARGS) { struct sge_txq *txq; struct sge_qset *qs; int i, j, err, dump_end; struct sbuf *sb; struct tx_desc *txd; uint32_t *WR, wr_hi, wr_lo, gen; txq = arg1; qs = txq_to_qset(txq, TXQ_CTRL); if (txq->txq_dump_count == 0) { return (0); } if (txq->txq_dump_count > 256) { log(LOG_WARNING, "dump count is too large %d\n", txq->txq_dump_count); txq->txq_dump_count = 1; return (EINVAL); } if (txq->txq_dump_start > 255) { log(LOG_WARNING, "dump start of %d is greater than queue size\n", txq->txq_dump_start); txq->txq_dump_start = 0; return (EINVAL); } err = sysctl_wire_old_buffer(req, 0); if (err != 0) return (err); sb = sbuf_new_for_sysctl(NULL, NULL, QDUMP_SBUF_SIZE, req); sbuf_printf(sb, " qid=%d start=%d -> end=%d\n", qs->idx, txq->txq_dump_start, (txq->txq_dump_start + txq->txq_dump_count) & 255); dump_end = txq->txq_dump_start + txq->txq_dump_count; for (i = txq->txq_dump_start; i < dump_end; i++) { txd = &txq->desc[i & (255)]; WR = (uint32_t *)txd->flit; wr_hi = ntohl(WR[0]); wr_lo = ntohl(WR[1]); gen = G_WR_GEN(wr_lo); sbuf_printf(sb," wr_hi %08x wr_lo %08x gen %d\n", wr_hi, wr_lo, gen); for (j = 2; j < 30; j += 4) sbuf_printf(sb, "\t%08x %08x %08x %08x \n", WR[j], WR[j + 1], WR[j + 2], WR[j + 3]); } err = sbuf_finish(sb); sbuf_delete(sb); return (err); } static int t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS) { adapter_t *sc = arg1; struct qset_params *qsp = &sc->params.sge.qset[0]; int coalesce_usecs; struct sge_qset *qs; int i, j, err, nqsets = 0; struct mtx *lock; if ((sc->flags & FULL_INIT_DONE) == 0) return (ENXIO); coalesce_usecs = qsp->coalesce_usecs; err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req); if (err != 0) { return (err); } if (coalesce_usecs == qsp->coalesce_usecs) return (0); for (i = 0; i < sc->params.nports; i++) for (j = 0; j < sc->port[i].nqsets; j++) nqsets++; coalesce_usecs = max(1, coalesce_usecs); for (i = 0; i < nqsets; i++) { qs = &sc->sge.qs[i]; qsp = &sc->params.sge.qset[i]; qsp->coalesce_usecs = coalesce_usecs; lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock : &sc->sge.qs[0].rspq.lock; mtx_lock(lock); t3_update_qset_coalesce(qs, qsp); t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) | V_NEWTIMER(qs->rspq.holdoff_tmr)); mtx_unlock(lock); } return (0); } static int t3_pkt_timestamp(SYSCTL_HANDLER_ARGS) { adapter_t *sc = arg1; int rc, timestamp; if ((sc->flags & FULL_INIT_DONE) == 0) return (ENXIO); timestamp = sc->timestamp; rc = sysctl_handle_int(oidp, ×tamp, arg2, req); if (rc != 0) return (rc); if (timestamp != sc->timestamp) { t3_set_reg_field(sc, A_TP_PC_CONFIG2, F_ENABLERXPKTTMSTPRSS, timestamp ? 
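/* nonzero: report a timestamp in place of the RSS hash */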
F_ENABLERXPKTTMSTPRSS : 0); sc->timestamp = timestamp; } return (0); } void t3_add_attach_sysctls(adapter_t *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; ctx = device_get_sysctl_ctx(sc->dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); /* random information */ SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version, 0, "firmware version"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "hw_revision", CTLFLAG_RD, &sc->params.rev, 0, "chip model"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "port_types", CTLFLAG_RD, sc->port_types, 0, "type of ports"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "enable_debug", CTLFLAG_RW, &cxgb_debug, 0, "enable verbose debugging output"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tunq_coalesce", CTLFLAG_RD, &sc->tunq_coalesce, "#tunneled packets freed"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "txq_overrun", CTLFLAG_RD, &txq_fills, 0, "#times txq overrun"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "core_clock", CTLFLAG_RD, &sc->params.vpd.cclk, 0, "core clock frequency (in KHz)"); } static const char *rspq_name = "rspq"; static const char *txq_names[] = { "txq_eth", "txq_ofld", "txq_ctrl" }; static int sysctl_handle_macstat(SYSCTL_HANDLER_ARGS) { struct port_info *p = arg1; uint64_t *parg; if (!p) return (EINVAL); cxgb_refresh_stats(p); parg = (uint64_t *) ((uint8_t *)&p->mac.stats + arg2); return (sysctl_handle_64(oidp, parg, 0, req)); } void t3_add_configured_sysctls(adapter_t *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; int i, j; ctx = device_get_sysctl_ctx(sc->dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, sc, 0, t3_set_coalesce_usecs, "I", "interrupt coalescing timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pkt_timestamp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, sc, 0, t3_pkt_timestamp, "I", "provide packet timestamp instead of connection hash"); for (i = 0; i < sc->params.nports; i++) { struct port_info *pi = &sc->port[i]; struct sysctl_oid *poid; struct sysctl_oid_list *poidlist; struct mac_stats *mstats = &pi->mac.stats; snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i); poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, pi->namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "port statistics"); poidlist = SYSCTL_CHILDREN(poid); SYSCTL_ADD_UINT(ctx, poidlist, OID_AUTO, "nqsets", CTLFLAG_RD, &pi->nqsets, 0, "#queue sets"); for (j = 0; j < pi->nqsets; j++) { struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j]; struct sysctl_oid *qspoid, *rspqpoid, *txqpoid, *ctrlqpoid, *lropoid; struct sysctl_oid_list *qspoidlist, *rspqpoidlist, *txqpoidlist, *ctrlqpoidlist, *lropoidlist; struct sge_txq *txq = &qs->txq[TXQ_ETH]; snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j); qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, qs->namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "qset statistics"); qspoidlist = SYSCTL_CHILDREN(qspoid); SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl0_empty", CTLFLAG_RD, &qs->fl[0].empty, 0, "freelist #0 empty"); SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "fl1_empty", CTLFLAG_RD, &qs->fl[1].empty, 0, "freelist #1 empty"); rspqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, rspq_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rspq statistics"); rspqpoidlist = SYSCTL_CHILDREN(rspqpoid); txqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, txq_names[0], CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "txq statistics"); txqpoidlist = 
SYSCTL_CHILDREN(txqpoid); ctrlqpoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, txq_names[2], CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ctrlq statistics"); ctrlqpoidlist = SYSCTL_CHILDREN(ctrlqpoid); lropoid = SYSCTL_ADD_NODE(ctx, qspoidlist, OID_AUTO, "lro_stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "LRO statistics"); lropoidlist = SYSCTL_CHILDREN(lropoid); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "size", CTLFLAG_RD, &qs->rspq.size, 0, "#entries in response queue"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "cidx", CTLFLAG_RD, &qs->rspq.cidx, 0, "consumer index"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "credits", CTLFLAG_RD, &qs->rspq.credits, 0, "#credits"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "starved", CTLFLAG_RD, &qs->rspq.starved, 0, "#times starved"); SYSCTL_ADD_UAUTO(ctx, rspqpoidlist, OID_AUTO, "phys_addr", CTLFLAG_RD, &qs->rspq.phys_addr, "physical_address_of the queue"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_start", CTLFLAG_RW, &qs->rspq.rspq_dump_start, 0, "start rspq dump entry"); SYSCTL_ADD_UINT(ctx, rspqpoidlist, OID_AUTO, "dump_count", CTLFLAG_RW, &qs->rspq.rspq_dump_count, 0, "#rspq entries to dump"); SYSCTL_ADD_PROC(ctx, rspqpoidlist, OID_AUTO, "qdump", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &qs->rspq, 0, t3_dump_rspq, "A", "dump of the response queue"); SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "dropped", CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_mr->br_drops, "#tunneled packets dropped"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "sendqlen", CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.mq_len, 0, "#tunneled packets waiting to be sent"); #if 0 SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_pidx", CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod, 0, "#tunneled packets queue producer index"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "queue_cidx", CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons, 0, "#tunneled packets queue consumer index"); #endif SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "processed", CTLFLAG_RD, &qs->txq[TXQ_ETH].processed, 0, "#tunneled packets processed by the card"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "cleaned", CTLFLAG_RD, &txq->cleaned, 0, "#tunneled packets cleaned"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "in_use", CTLFLAG_RD, &txq->in_use, 0, "#tunneled packet slots in use"); SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "frees", CTLFLAG_RD, &txq->txq_frees, "#tunneled packets freed"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "skipped", CTLFLAG_RD, &txq->txq_skipped, 0, "#tunneled packet descriptors skipped"); SYSCTL_ADD_UQUAD(ctx, txqpoidlist, OID_AUTO, "coalesced", CTLFLAG_RD, &txq->txq_coalesced, "#tunneled packets coalesced"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "enqueued", CTLFLAG_RD, &txq->txq_enqueued, 0, "#tunneled packets enqueued to hardware"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "stopped_flags", CTLFLAG_RD, &qs->txq_stopped, 0, "tx queues stopped"); SYSCTL_ADD_UAUTO(ctx, txqpoidlist, OID_AUTO, "phys_addr", CTLFLAG_RD, &txq->phys_addr, "physical_address_of the queue"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "qgen", CTLFLAG_RW, &qs->txq[TXQ_ETH].gen, 0, "txq generation"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_cidx", CTLFLAG_RD, &txq->cidx, 0, "hardware queue cidx"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "hw_pidx", CTLFLAG_RD, &txq->pidx, 0, "hardware queue pidx"); SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_start", CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_start, 0, "txq start idx for dump"); 
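/*
 * dump_start and dump_count select the window of descriptors that the
 * qdump handler below will format.  A typical sequence (the sysctl
 * node names are illustrative and depend on the device unit):
 *
 *	sysctl dev.cxgbc.0.port0.qs0.txq_eth.dump_count=32
 *	sysctl dev.cxgbc.0.port0.qs0.txq_eth.qdump
 */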
SYSCTL_ADD_UINT(ctx, txqpoidlist, OID_AUTO, "dump_count", CTLFLAG_RW, &qs->txq[TXQ_ETH].txq_dump_count, 0, "txq #entries to dump"); SYSCTL_ADD_PROC(ctx, txqpoidlist, OID_AUTO, "qdump", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &qs->txq[TXQ_ETH], 0, t3_dump_txq_eth, "A", "dump of the transmit queue"); SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_start", CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_start, 0, "ctrlq start idx for dump"); SYSCTL_ADD_UINT(ctx, ctrlqpoidlist, OID_AUTO, "dump_count", CTLFLAG_RW, &qs->txq[TXQ_CTRL].txq_dump_count, 0, "ctrl #entries to dump"); SYSCTL_ADD_PROC(ctx, ctrlqpoidlist, OID_AUTO, "qdump", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &qs->txq[TXQ_CTRL], 0, t3_dump_txq_ctrl, "A", "dump of the transmit queue"); SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_queued", CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL); SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_flushed", CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL); SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_bad_csum", CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL); SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt", CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL); } /* Now add a node for mac stats. */ poid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, "mac_stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "MAC statistics"); poidlist = SYSCTL_CHILDREN(poid); /* * We (ab)use the length argument (arg2) to pass on the offset * of the data that we are interested in. This is only required * for the quad counters that are updated from the hardware (we * make sure that we return the latest value). * sysctl_handle_macstat first updates *all* the counters from * the hardware, and then returns the latest value of the * requested counter. Best would be to update only the * requested counter from hardware, but t3_mac_update_stats() * hides all the register details and we don't want to dive into * all that here. 
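 * In effect the handler computes
 *
 *	parg = (uint64_t *)((uint8_t *)&p->mac.stats + arg2);
 *
 * after refreshing the stats, with arg2 set to
 * offsetof(struct mac_stats, counter) by the CXGB_SYSCTL_ADD_QUAD()
 * macro below.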
*/ #define CXGB_SYSCTL_ADD_QUAD(a) SYSCTL_ADD_OID(ctx, poidlist, OID_AUTO, #a, \ CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_NEEDGIANT, pi, \ offsetof(struct mac_stats, a), sysctl_handle_macstat, "QU", 0) CXGB_SYSCTL_ADD_QUAD(tx_octets); CXGB_SYSCTL_ADD_QUAD(tx_octets_bad); CXGB_SYSCTL_ADD_QUAD(tx_frames); CXGB_SYSCTL_ADD_QUAD(tx_mcast_frames); CXGB_SYSCTL_ADD_QUAD(tx_bcast_frames); CXGB_SYSCTL_ADD_QUAD(tx_pause); CXGB_SYSCTL_ADD_QUAD(tx_deferred); CXGB_SYSCTL_ADD_QUAD(tx_late_collisions); CXGB_SYSCTL_ADD_QUAD(tx_total_collisions); CXGB_SYSCTL_ADD_QUAD(tx_excess_collisions); CXGB_SYSCTL_ADD_QUAD(tx_underrun); CXGB_SYSCTL_ADD_QUAD(tx_len_errs); CXGB_SYSCTL_ADD_QUAD(tx_mac_internal_errs); CXGB_SYSCTL_ADD_QUAD(tx_excess_deferral); CXGB_SYSCTL_ADD_QUAD(tx_fcs_errs); CXGB_SYSCTL_ADD_QUAD(tx_frames_64); CXGB_SYSCTL_ADD_QUAD(tx_frames_65_127); CXGB_SYSCTL_ADD_QUAD(tx_frames_128_255); CXGB_SYSCTL_ADD_QUAD(tx_frames_256_511); CXGB_SYSCTL_ADD_QUAD(tx_frames_512_1023); CXGB_SYSCTL_ADD_QUAD(tx_frames_1024_1518); CXGB_SYSCTL_ADD_QUAD(tx_frames_1519_max); CXGB_SYSCTL_ADD_QUAD(rx_octets); CXGB_SYSCTL_ADD_QUAD(rx_octets_bad); CXGB_SYSCTL_ADD_QUAD(rx_frames); CXGB_SYSCTL_ADD_QUAD(rx_mcast_frames); CXGB_SYSCTL_ADD_QUAD(rx_bcast_frames); CXGB_SYSCTL_ADD_QUAD(rx_pause); CXGB_SYSCTL_ADD_QUAD(rx_fcs_errs); CXGB_SYSCTL_ADD_QUAD(rx_align_errs); CXGB_SYSCTL_ADD_QUAD(rx_symbol_errs); CXGB_SYSCTL_ADD_QUAD(rx_data_errs); CXGB_SYSCTL_ADD_QUAD(rx_sequence_errs); CXGB_SYSCTL_ADD_QUAD(rx_runt); CXGB_SYSCTL_ADD_QUAD(rx_jabber); CXGB_SYSCTL_ADD_QUAD(rx_short); CXGB_SYSCTL_ADD_QUAD(rx_too_long); CXGB_SYSCTL_ADD_QUAD(rx_mac_internal_errs); CXGB_SYSCTL_ADD_QUAD(rx_cong_drops); CXGB_SYSCTL_ADD_QUAD(rx_frames_64); CXGB_SYSCTL_ADD_QUAD(rx_frames_65_127); CXGB_SYSCTL_ADD_QUAD(rx_frames_128_255); CXGB_SYSCTL_ADD_QUAD(rx_frames_256_511); CXGB_SYSCTL_ADD_QUAD(rx_frames_512_1023); CXGB_SYSCTL_ADD_QUAD(rx_frames_1024_1518); CXGB_SYSCTL_ADD_QUAD(rx_frames_1519_max); #undef CXGB_SYSCTL_ADD_QUAD #define CXGB_SYSCTL_ADD_ULONG(a) SYSCTL_ADD_ULONG(ctx, poidlist, OID_AUTO, #a, \ CTLFLAG_RD, &mstats->a, 0) CXGB_SYSCTL_ADD_ULONG(tx_fifo_parity_err); CXGB_SYSCTL_ADD_ULONG(rx_fifo_parity_err); CXGB_SYSCTL_ADD_ULONG(tx_fifo_urun); CXGB_SYSCTL_ADD_ULONG(rx_fifo_ovfl); CXGB_SYSCTL_ADD_ULONG(serdes_signal_loss); CXGB_SYSCTL_ADD_ULONG(xaui_pcs_ctc_err); CXGB_SYSCTL_ADD_ULONG(xaui_pcs_align_change); CXGB_SYSCTL_ADD_ULONG(num_toggled); CXGB_SYSCTL_ADD_ULONG(num_resets); CXGB_SYSCTL_ADD_ULONG(link_faults); #undef CXGB_SYSCTL_ADD_ULONG } } /** * t3_get_desc - dump an SGE descriptor for debugging purposes * @qs: the queue set * @qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx) * @idx: the descriptor index in the queue * @data: where to dump the descriptor contents * * Dumps the contents of a HW descriptor of an SGE queue. Returns the * size of the descriptor. 
*/ int t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx, unsigned char *data) { if (qnum >= 6) return (EINVAL); if (qnum < 3) { if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size) return -EINVAL; memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc)); return sizeof(struct tx_desc); } if (qnum == 3) { if (!qs->rspq.desc || idx >= qs->rspq.size) return (EINVAL); memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc)); return sizeof(struct rsp_desc); } qnum -= 4; if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size) return (EINVAL); memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc)); return sizeof(struct rx_desc); } diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index 4206bca95cdf..2a5f50f51d13 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -1,1570 +1,1570 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef __T4_ADAPTER_H__ #define __T4_ADAPTER_H__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "offload.h" #include "t4_ioctl.h" #include "common/t4_msg.h" #include "firmware/t4fw_interface.h" #define KTR_CXGBE KTR_SPARE3 MALLOC_DECLARE(M_CXGBE); #define CXGBE_UNIMPLEMENTED(s) \ panic("%s (%s, line %d) not implemented yet.", s, __FILE__, __LINE__) /* * Same as LIST_HEAD from queue.h. This is to avoid conflict with LinuxKPI's * LIST_HEAD when building iw_cxgbe. */ #define CXGBE_LIST_HEAD(name, type) \ struct name { \ struct type *lh_first; /* first element */ \ } #ifndef SYSCTL_ADD_UQUAD #define SYSCTL_ADD_UQUAD SYSCTL_ADD_QUAD #define sysctl_handle_64 sysctl_handle_quad #define CTLTYPE_U64 CTLTYPE_QUAD #endif SYSCTL_DECL(_hw_cxgbe); struct adapter; typedef struct adapter adapter_t; enum { /* * All ingress queues use this entry size. Note that the firmware event * queue and any iq expecting CPL_RX_PKT in the descriptor needs this to * be at least 64. 
*/ IQ_ESIZE = 64, /* Default queue sizes for all kinds of ingress queues */ FW_IQ_QSIZE = 256, RX_IQ_QSIZE = 1024, /* All egress queues use this entry size */ EQ_ESIZE = 64, /* Default queue sizes for all kinds of egress queues */ CTRL_EQ_QSIZE = 1024, TX_EQ_QSIZE = 1024, SW_ZONE_SIZES = 4, /* cluster, jumbop, jumbo9k, jumbo16k */ CL_METADATA_SIZE = CACHE_LINE_SIZE, SGE_MAX_WR_NDESC = SGE_MAX_WR_LEN / EQ_ESIZE, /* max WR size in desc */ TX_SGL_SEGS = 39, TX_SGL_SEGS_TSO = 38, TX_SGL_SEGS_VM = 38, TX_SGL_SEGS_VM_TSO = 37, TX_SGL_SEGS_EO_TSO = 30, /* XXX: lower for IPv6. */ TX_SGL_SEGS_VXLAN_TSO = 37, TX_WR_FLITS = SGE_MAX_WR_LEN / 8 }; enum { /* adapter intr_type */ INTR_INTX = (1 << 0), INTR_MSI = (1 << 1), INTR_MSIX = (1 << 2) }; enum { XGMAC_MTU = (1 << 0), XGMAC_PROMISC = (1 << 1), XGMAC_ALLMULTI = (1 << 2), XGMAC_VLANEX = (1 << 3), XGMAC_UCADDR = (1 << 4), XGMAC_MCADDRS = (1 << 5), XGMAC_ALL = 0xffff }; enum { /* flags understood by begin_synchronized_op */ HOLD_LOCK = (1 << 0), SLEEP_OK = (1 << 1), INTR_OK = (1 << 2), /* flags understood by end_synchronized_op */ LOCK_HELD = HOLD_LOCK, }; enum { /* adapter flags. synch_op or adapter_lock. */ FULL_INIT_DONE = (1 << 0), FW_OK = (1 << 1), CHK_MBOX_ACCESS = (1 << 2), MASTER_PF = (1 << 3), BUF_PACKING_OK = (1 << 6), IS_VF = (1 << 7), KERN_TLS_ON = (1 << 8), /* HW is configured for KERN_TLS */ CXGBE_BUSY = (1 << 9), /* adapter error_flags. reg_lock for HW_OFF_LIMITS, atomics for the rest. */ ADAP_STOPPED = (1 << 0), /* Adapter has been stopped. */ ADAP_FATAL_ERR = (1 << 1), /* Encountered a fatal error. */ HW_OFF_LIMITS = (1 << 2), /* off limits to all except reset_thread */ ADAP_CIM_ERR = (1 << 3), /* Error was related to FW/CIM. */ /* port flags */ HAS_TRACEQ = (1 << 3), FIXED_IFMEDIA = (1 << 4), /* ifmedia list doesn't change. */ /* VI flags */ DOOMED = (1 << 0), VI_INIT_DONE = (1 << 1), /* 1 << 2 is unused, was VI_SYSCTL_CTX */ TX_USES_VM_WR = (1 << 3), VI_SKIP_STATS = (1 << 4), /* adapter debug_flags */ DF_DUMP_MBOX = (1 << 0), /* Log all mbox cmd/rpl. 
*/ DF_LOAD_FW_ANYTIME = (1 << 1), /* Allow LOAD_FW after init */ DF_DISABLE_TCB_CACHE = (1 << 2), /* Disable TCB cache (T6+) */ DF_DISABLE_CFG_RETRY = (1 << 3), /* Disable fallback config */ DF_VERBOSE_SLOWINTR = (1 << 4), /* Chatty slow intr handler */ }; #define IS_DOOMED(vi) ((vi)->flags & DOOMED) #define SET_DOOMED(vi) do {(vi)->flags |= DOOMED;} while (0) #define IS_BUSY(sc) ((sc)->flags & CXGBE_BUSY) #define SET_BUSY(sc) do {(sc)->flags |= CXGBE_BUSY;} while (0) #define CLR_BUSY(sc) do {(sc)->flags &= ~CXGBE_BUSY;} while (0) struct vi_info { device_t dev; struct port_info *pi; struct adapter *adapter; - struct ifnet *ifp; + if_t ifp; struct pfil_head *pfil; unsigned long flags; int if_flags; uint16_t *rss, *nm_rss; uint16_t viid; /* opaque VI identifier */ uint16_t smt_idx; uint16_t vin; uint8_t vfvld; int16_t xact_addr_filt;/* index of exact MAC address filter */ uint16_t rss_size; /* size of VI's RSS table slice */ uint16_t rss_base; /* start of VI's RSS table slice */ int hashen; int nintr; int first_intr; /* These need to be int as they are used in sysctl */ int ntxq; /* # of tx queues */ int first_txq; /* index of first tx queue */ int rsrv_noflowq; /* Reserve queue 0 for non-flowid packets */ int nrxq; /* # of rx queues */ int first_rxq; /* index of first rx queue */ int nofldtxq; /* # of offload tx queues */ int first_ofld_txq; /* index of first offload tx queue */ int nofldrxq; /* # of offload rx queues */ int first_ofld_rxq; /* index of first offload rx queue */ int nnmtxq; int first_nm_txq; int nnmrxq; int first_nm_rxq; int tmr_idx; int ofld_tmr_idx; int pktc_idx; int ofld_pktc_idx; int qsize_rxq; int qsize_txq; struct timeval last_refreshed; struct fw_vi_stats_vf stats; struct mtx tick_mtx; struct callout tick; struct sysctl_ctx_list ctx; struct sysctl_oid *rxq_oid; struct sysctl_oid *txq_oid; struct sysctl_oid *nm_rxq_oid; struct sysctl_oid *nm_txq_oid; struct sysctl_oid *ofld_rxq_oid; struct sysctl_oid *ofld_txq_oid; uint8_t hw_addr[ETHER_ADDR_LEN]; /* factory MAC address, won't change */ u_int txq_rr; u_int rxq_rr; }; struct tx_ch_rl_params { enum fw_sched_params_rate ratemode; /* %port (REL) or kbps (ABS) */ uint32_t maxrate; }; /* CLRL state */ enum clrl_state { CS_UNINITIALIZED = 0, CS_PARAMS_SET, /* sw parameters have been set. */ CS_HW_UPDATE_REQUESTED, /* async HW update requested. */ CS_HW_UPDATE_IN_PROGRESS, /* sync hw update in progress. */ CS_HW_CONFIGURED /* configured in the hardware. */ }; /* CLRL flags */ enum { CF_USER = (1 << 0), /* was configured by driver ioctl. */ }; struct tx_cl_rl_params { enum clrl_state state; int refcount; uint8_t flags; enum fw_sched_params_rate ratemode; /* %port REL or ABS value */ enum fw_sched_params_unit rateunit; /* kbps or pps (when ABS) */ enum fw_sched_params_mode mode; /* aggr or per-flow */ uint32_t maxrate; uint16_t pktsize; uint16_t burstsize; }; /* Tx scheduler parameters for a channel/port */ struct tx_sched_params { /* Channel Rate Limiter */ struct tx_ch_rl_params ch_rl; /* Class WRR */ /* XXX */ /* Class Rate Limiter (including the default pktsize and burstsize). 
*/ int pktsize; int burstsize; struct tx_cl_rl_params cl_rl[]; }; struct port_info { device_t dev; struct adapter *adapter; struct vi_info *vi; int nvi; int up_vis; int uld_vis; bool vxlan_tcam_entry; struct tx_sched_params *sched_params; struct mtx pi_lock; char lockname[16]; unsigned long flags; uint8_t lport; /* associated offload logical port */ int8_t mdio_addr; uint8_t port_type; uint8_t mod_type; uint8_t port_id; uint8_t tx_chan; uint8_t mps_bg_map; /* rx MPS buffer group bitmap */ uint8_t rx_e_chan_map; /* rx TP e-channel bitmap */ uint8_t rx_c_chan; /* rx TP c-channel */ struct link_config link_cfg; struct ifmedia media; struct port_stats stats; u_int tnl_cong_drops; u_int tx_parse_error; int fcs_reg; uint64_t fcs_base; struct sysctl_ctx_list ctx; }; #define IS_MAIN_VI(vi) ((vi) == &((vi)->pi->vi[0])) struct cluster_metadata { uma_zone_t zone; caddr_t cl; u_int refcount; }; struct fl_sdesc { caddr_t cl; uint16_t nmbuf; /* # of driver originated mbufs with ref on cluster */ int16_t moff; /* offset of metadata from cl */ uint8_t zidx; }; struct tx_desc { __be64 flit[8]; }; struct tx_sdesc { struct mbuf *m; /* m_nextpkt linked chain of frames */ uint8_t desc_used; /* # of hardware descriptors used by the WR */ }; #define IQ_PAD (IQ_ESIZE - sizeof(struct rsp_ctrl) - sizeof(struct rss_header)) struct iq_desc { struct rss_header rss; uint8_t cpl[IQ_PAD]; struct rsp_ctrl rsp; }; #undef IQ_PAD CTASSERT(sizeof(struct iq_desc) == IQ_ESIZE); enum { /* iq type */ IQ_OTHER = FW_IQ_IQTYPE_OTHER, IQ_ETH = FW_IQ_IQTYPE_NIC, IQ_OFLD = FW_IQ_IQTYPE_OFLD, /* iq flags */ IQ_SW_ALLOCATED = (1 << 0), /* sw resources allocated */ IQ_HAS_FL = (1 << 1), /* iq associated with a freelist */ IQ_RX_TIMESTAMP = (1 << 2), /* provide the SGE rx timestamp */ IQ_LRO_ENABLED = (1 << 3), /* iq is an eth rxq with LRO enabled */ IQ_ADJ_CREDIT = (1 << 4), /* hw is off by 1 credit for this iq */ IQ_HW_ALLOCATED = (1 << 5), /* fw/hw resources allocated */ /* iq state */ IQS_DISABLED = 0, IQS_BUSY = 1, IQS_IDLE = 2, /* netmap related flags */ NM_OFF = 0, NM_ON = 1, NM_BUSY = 2, }; enum { CPL_COOKIE_RESERVED = 0, CPL_COOKIE_FILTER, CPL_COOKIE_DDP0, CPL_COOKIE_DDP1, CPL_COOKIE_TOM, CPL_COOKIE_HASHFILTER, CPL_COOKIE_ETHOFLD, CPL_COOKIE_KERN_TLS, NUM_CPL_COOKIES = 8 /* Limited by M_COOKIE. Do not increase. */ }; struct sge_iq; struct rss_header; typedef int (*cpl_handler_t)(struct sge_iq *, const struct rss_header *, struct mbuf *); typedef int (*an_handler_t)(struct sge_iq *, const struct rsp_ctrl *); typedef int (*fw_msg_handler_t)(struct adapter *, const __be64 *); /* * Ingress Queue: T4 is producer, driver is consumer. 
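 * Each entry is IQ_ESIZE bytes and is laid out as struct iq_desc above.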
*/ struct sge_iq { uint16_t flags; uint8_t qtype; volatile int state; struct adapter *adapter; struct iq_desc *desc; /* KVA of descriptor ring */ int8_t intr_pktc_idx; /* packet count threshold index */ uint8_t gen; /* generation bit */ uint8_t intr_params; /* interrupt holdoff parameters */ int8_t cong_drop; /* congestion drop settings for the queue */ uint16_t qsize; /* size (# of entries) of the queue */ uint16_t sidx; /* index of the entry with the status page */ uint16_t cidx; /* consumer index */ uint16_t cntxt_id; /* SGE context id for the iq */ uint16_t abs_id; /* absolute SGE id for the iq */ int16_t intr_idx; /* interrupt used by the queue */ STAILQ_ENTRY(sge_iq) link; bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_addr_t ba; /* bus address of descriptor ring */ }; enum { /* eq type */ EQ_CTRL = 1, EQ_ETH = 2, EQ_OFLD = 3, /* eq flags */ EQ_SW_ALLOCATED = (1 << 0), /* sw resources allocated */ EQ_HW_ALLOCATED = (1 << 1), /* hw/fw resources allocated */ EQ_ENABLED = (1 << 3), /* open for business */ EQ_QFLUSH = (1 << 4), /* if_qflush in progress */ }; /* Listed in order of preference. Update t4_sysctls too if you change these */ enum {DOORBELL_UDB, DOORBELL_WCWR, DOORBELL_UDBWC, DOORBELL_KDB}; /* * Egress Queue: driver is producer, T4 is consumer. * * Note: A free list is an egress queue (driver produces the buffers and T4 * consumes them) but it's special enough to have its own struct (see sge_fl). */ struct sge_eq { unsigned int flags; /* MUST be first */ unsigned int cntxt_id; /* SGE context id for the eq */ unsigned int abs_id; /* absolute SGE id for the eq */ uint8_t type; /* EQ_CTRL/EQ_ETH/EQ_OFLD */ uint8_t doorbells; uint8_t tx_chan; /* tx channel used by the eq */ struct mtx eq_lock; struct tx_desc *desc; /* KVA of descriptor ring */ volatile uint32_t *udb; /* KVA of doorbell (lies within BAR2) */ u_int udb_qid; /* relative qid within the doorbell page */ uint16_t sidx; /* index of the entry with the status page */ uint16_t cidx; /* consumer idx (desc idx) */ uint16_t pidx; /* producer idx (desc idx) */ uint16_t equeqidx; /* EQUEQ last requested at this pidx */ uint16_t dbidx; /* pidx of the most recent doorbell */ uint16_t iqid; /* cached iq->cntxt_id (see iq below) */ volatile u_int equiq; /* EQUIQ outstanding */ struct sge_iq *iq; /* iq that receives egr_update for the eq */ bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_addr_t ba; /* bus address of descriptor ring */ char lockname[16]; }; struct rx_buf_info { uma_zone_t zone; /* zone that this cluster comes from */ uint16_t size1; /* same as size of cluster: 2K/4K/9K/16K. * hwsize[hwidx1] = size1. No spare. */ uint16_t size2; /* hwsize[hwidx2] = size2. * spare in cluster = size1 - size2. 
*/ int8_t hwidx1; /* SGE bufsize idx for size1 */ int8_t hwidx2; /* SGE bufsize idx for size2 */ uint8_t type; /* EXT_xxx type of the cluster */ }; enum { NUM_MEMWIN = 3, MEMWIN0_APERTURE = 2048, MEMWIN0_BASE = 0x1b800, MEMWIN1_APERTURE = 32768, MEMWIN1_BASE = 0x28000, MEMWIN2_APERTURE_T4 = 65536, MEMWIN2_BASE_T4 = 0x30000, MEMWIN2_APERTURE_T5 = 128 * 1024, MEMWIN2_BASE_T5 = 0x60000, }; struct memwin { struct rwlock mw_lock __aligned(CACHE_LINE_SIZE); uint32_t mw_base; /* constant after setup_memwin */ uint32_t mw_aperture; /* ditto */ uint32_t mw_curpos; /* protected by mw_lock */ }; enum { FL_STARVING = (1 << 0), /* on the adapter's list of starving fl's */ FL_DOOMED = (1 << 1), /* about to be destroyed */ FL_BUF_PACKING = (1 << 2), /* buffer packing enabled */ FL_BUF_RESUME = (1 << 3), /* resume from the middle of the frame */ }; #define FL_RUNNING_LOW(fl) \ (IDXDIFF(fl->dbidx * 8, fl->cidx, fl->sidx * 8) <= fl->lowat) #define FL_NOT_RUNNING_LOW(fl) \ (IDXDIFF(fl->dbidx * 8, fl->cidx, fl->sidx * 8) >= 2 * fl->lowat) struct sge_fl { struct mtx fl_lock; __be64 *desc; /* KVA of descriptor ring, ptr to addresses */ struct fl_sdesc *sdesc; /* KVA of software descriptor ring */ uint16_t zidx; /* refill zone idx */ uint16_t safe_zidx; uint16_t lowat; /* # of buffers <= this means fl needs help */ int flags; uint16_t buf_boundary; /* The 16b idx all deal with hw descriptors */ uint16_t dbidx; /* hw pidx after last doorbell */ uint16_t sidx; /* index of status page */ volatile uint16_t hw_cidx; /* The 32b idx are all buffer idx, not hardware descriptor idx */ uint32_t cidx; /* consumer index */ uint32_t pidx; /* producer index */ uint32_t dbval; u_int rx_offset; /* offset in fl buf (when buffer packing) */ volatile uint32_t *udb; uint64_t cl_allocated; /* # of clusters allocated */ uint64_t cl_recycled; /* # of clusters recycled */ uint64_t cl_fast_recycled; /* # of clusters recycled (fast) */ /* These 3 are valid when FL_BUF_RESUME is set, stale otherwise. */ struct mbuf *m0; struct mbuf **pnext; u_int remaining; uint16_t qsize; /* # of hw descriptors (status page included) */ uint16_t cntxt_id; /* SGE context id for the freelist */ TAILQ_ENTRY(sge_fl) link; /* All starving freelists */ bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; char lockname[16]; bus_addr_t ba; /* bus address of descriptor ring */ }; struct mp_ring; struct txpkts { uint8_t wr_type; /* type 0 or type 1 */ uint8_t npkt; /* # of packets in this work request */ uint8_t len16; /* # of 16B pieces used by this work request */ uint8_t score; uint8_t max_npkt; /* maximum number of packets allowed */ uint16_t plen; /* total payload (sum of all packets) */ /* straight from fw_eth_tx_pkts_vm_wr. 
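 * (These cached header fields are presumably used to decide whether
 * the next packet can be coalesced into the same VM work request.)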
*/ __u8 ethmacdst[6]; __u8 ethmacsrc[6]; __be16 ethtype; __be16 vlantci; struct mbuf *mb[15]; }; /* txq: SGE egress queue + what's needed for Ethernet NIC */ struct sge_txq { struct sge_eq eq; /* MUST be first */ - struct ifnet *ifp; /* the interface this txq belongs to */ + if_t ifp; /* the interface this txq belongs to */ struct mp_ring *r; /* tx software ring */ struct tx_sdesc *sdesc; /* KVA of software descriptor ring */ struct sglist *gl; __be32 cpl_ctrl0; /* for convenience */ int tc_idx; /* traffic class */ uint64_t last_tx; /* cycle count when eth_tx was last called */ struct txpkts txp; struct task tx_reclaim_task; /* stats for common events first */ uint64_t txcsum; /* # of times hardware assisted with checksum */ uint64_t tso_wrs; /* # of TSO work requests */ uint64_t vlan_insertion;/* # of times VLAN tag was inserted */ uint64_t imm_wrs; /* # of work requests with immediate data */ uint64_t sgl_wrs; /* # of work requests with direct SGL */ uint64_t txpkt_wrs; /* # of txpkt work requests (not coalesced) */ uint64_t txpkts0_wrs; /* # of type0 coalesced tx work requests */ uint64_t txpkts1_wrs; /* # of type1 coalesced tx work requests */ uint64_t txpkts0_pkts; /* # of frames in type0 coalesced tx WRs */ uint64_t txpkts1_pkts; /* # of frames in type1 coalesced tx WRs */ uint64_t txpkts_flush; /* # of times txp had to be sent by tx_update */ uint64_t raw_wrs; /* # of raw work requests (alloc_wr_mbuf) */ uint64_t vxlan_tso_wrs; /* # of VXLAN TSO work requests */ uint64_t vxlan_txcsum; uint64_t kern_tls_records; uint64_t kern_tls_short; uint64_t kern_tls_partial; uint64_t kern_tls_full; uint64_t kern_tls_octets; uint64_t kern_tls_waste; uint64_t kern_tls_options; uint64_t kern_tls_header; uint64_t kern_tls_fin; uint64_t kern_tls_fin_short; uint64_t kern_tls_cbc; uint64_t kern_tls_gcm; /* stats for not-that-common events */ /* Optional scratch space for constructing work requests. 
*/ uint8_t ss[SGE_MAX_WR_LEN] __aligned(16); } __aligned(CACHE_LINE_SIZE); /* rxq: SGE ingress queue + SGE free list + miscellaneous items */ struct sge_rxq { struct sge_iq iq; /* MUST be first */ struct sge_fl fl; /* MUST follow iq */ - struct ifnet *ifp; /* the interface this rxq belongs to */ + if_t ifp; /* the interface this rxq belongs to */ struct lro_ctrl lro; /* LRO state */ /* stats for common events first */ uint64_t rxcsum; /* # of times hardware assisted with checksum */ uint64_t vlan_extraction;/* # of times VLAN tag was extracted */ uint64_t vxlan_rxcsum; /* stats for not-that-common events */ } __aligned(CACHE_LINE_SIZE); static inline struct sge_rxq * iq_to_rxq(struct sge_iq *iq) { return (__containerof(iq, struct sge_rxq, iq)); } /* ofld_rxq: SGE ingress queue + SGE free list + miscellaneous items */ struct sge_ofld_rxq { struct sge_iq iq; /* MUST be first */ struct sge_fl fl; /* MUST follow iq */ counter_u64_t rx_iscsi_ddp_setup_ok; counter_u64_t rx_iscsi_ddp_setup_error; uint64_t rx_iscsi_ddp_pdus; uint64_t rx_iscsi_ddp_octets; uint64_t rx_iscsi_fl_pdus; uint64_t rx_iscsi_fl_octets; uint64_t rx_iscsi_padding_errors; uint64_t rx_iscsi_header_digest_errors; uint64_t rx_iscsi_data_digest_errors; u_long rx_toe_tls_records; u_long rx_toe_tls_octets; } __aligned(CACHE_LINE_SIZE); static inline struct sge_ofld_rxq * iq_to_ofld_rxq(struct sge_iq *iq) { return (__containerof(iq, struct sge_ofld_rxq, iq)); } struct wrqe { STAILQ_ENTRY(wrqe) link; struct sge_wrq *wrq; int wr_len; char wr[] __aligned(16); }; struct wrq_cookie { TAILQ_ENTRY(wrq_cookie) link; int ndesc; int pidx; }; /* * wrq: SGE egress queue that is given prebuilt work requests. Control queues * are of this type. */ struct sge_wrq { struct sge_eq eq; /* MUST be first */ struct adapter *adapter; struct task wrq_tx_task; /* Tx desc reserved but WR not "committed" yet. */ TAILQ_HEAD(wrq_incomplete_wrs , wrq_cookie) incomplete_wrs; /* List of WRs ready to go out as soon as descriptors are available. */ STAILQ_HEAD(, wrqe) wr_list; u_int nwr_pending; u_int ndesc_needed; /* stats for common events first */ uint64_t tx_wrs_direct; /* # of WRs written directly to desc ring. */ uint64_t tx_wrs_ss; /* # of WRs copied from scratch space. */ uint64_t tx_wrs_copied; /* # of WRs queued and copied to desc ring. */ /* stats for not-that-common events */ /* * Scratch space for work requests that wrap around after reaching the * status page, and some information about the last WR that used it. */ uint16_t ss_pidx; uint16_t ss_len; uint8_t ss[SGE_MAX_WR_LEN]; } __aligned(CACHE_LINE_SIZE); /* ofld_txq: SGE egress queue + miscellaneous items */ struct sge_ofld_txq { struct sge_wrq wrq; counter_u64_t tx_iscsi_pdus; counter_u64_t tx_iscsi_octets; counter_u64_t tx_iscsi_iso_wrs; counter_u64_t tx_toe_tls_records; counter_u64_t tx_toe_tls_octets; } __aligned(CACHE_LINE_SIZE); #define INVALID_NM_RXQ_CNTXT_ID ((uint16_t)(-1)) struct sge_nm_rxq { /* Items used by the driver rx ithread are in this cacheline. */ volatile int nm_state __aligned(CACHE_LINE_SIZE); /* NM_OFF, NM_ON, or NM_BUSY */ u_int nid; /* netmap ring # for this queue */ struct vi_info *vi; struct iq_desc *iq_desc; uint16_t iq_abs_id; uint16_t iq_cntxt_id; uint16_t iq_cidx; uint16_t iq_sidx; uint8_t iq_gen; uint32_t fl_sidx; /* Items used by netmap rxsync are in this cacheline. 
*/ __be64 *fl_desc __aligned(CACHE_LINE_SIZE); uint16_t fl_cntxt_id; uint32_t fl_pidx; uint32_t fl_sidx2; /* copy of fl_sidx */ uint32_t fl_db_val; u_int fl_db_saved; u_int fl_db_threshold; /* in descriptors */ u_int fl_hwidx:4; /* * fl_cidx is used by both the ithread and rxsync, the rest are not used * in the rx fast path. */ uint32_t fl_cidx __aligned(CACHE_LINE_SIZE); bus_dma_tag_t iq_desc_tag; bus_dmamap_t iq_desc_map; bus_addr_t iq_ba; int intr_idx; bus_dma_tag_t fl_desc_tag; bus_dmamap_t fl_desc_map; bus_addr_t fl_ba; }; #define INVALID_NM_TXQ_CNTXT_ID ((u_int)(-1)) struct sge_nm_txq { struct tx_desc *desc; uint16_t cidx; uint16_t pidx; uint16_t sidx; uint16_t equiqidx; /* EQUIQ last requested at this pidx */ uint16_t equeqidx; /* EQUEQ last requested at this pidx */ uint16_t dbidx; /* pidx of the most recent doorbell */ uint8_t doorbells; volatile uint32_t *udb; u_int udb_qid; u_int cntxt_id; __be32 cpl_ctrl0; /* for convenience */ __be32 op_pkd; /* ditto */ u_int nid; /* netmap ring # for this queue */ /* infrequently used items after this */ bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_addr_t ba; int iqidx; } __aligned(CACHE_LINE_SIZE); struct sge { int nrxq; /* total # of Ethernet rx queues */ int ntxq; /* total # of Ethernet tx queues */ int nofldrxq; /* total # of TOE rx queues */ int nofldtxq; /* total # of TOE tx queues */ int nnmrxq; /* total # of netmap rx queues */ int nnmtxq; /* total # of netmap tx queues */ int niq; /* total # of ingress queues */ int neq; /* total # of egress queues */ struct sge_iq fwq; /* Firmware event queue */ struct sge_wrq *ctrlq; /* Control queues */ struct sge_txq *txq; /* NIC tx queues */ struct sge_rxq *rxq; /* NIC rx queues */ struct sge_ofld_txq *ofld_txq; /* TOE tx queues */ struct sge_ofld_rxq *ofld_rxq; /* TOE rx queues */ struct sge_nm_txq *nm_txq; /* netmap tx queues */ struct sge_nm_rxq *nm_rxq; /* netmap rx queues */ uint16_t iq_start; /* first cntxt_id */ uint16_t iq_base; /* first abs_id */ int eq_start; /* first cntxt_id */ int eq_base; /* first abs_id */ int iqmap_sz; int eqmap_sz; struct sge_iq **iqmap; /* iq->cntxt_id to iq mapping */ struct sge_eq **eqmap; /* eq->cntxt_id to eq mapping */ int8_t safe_zidx; struct rx_buf_info rx_buf_info[SW_ZONE_SIZES]; }; struct devnames { const char *nexus_name; const char *ifnet_name; const char *vi_ifnet_name; const char *pf03_drv_name; const char *vf_nexus_name; const char *vf_ifnet_name; }; struct clip_entry; #define CNT_CAL_INFO 3 struct clock_sync { uint64_t hw_cur; uint64_t hw_prev; sbintime_t sbt_cur; sbintime_t sbt_prev; seqc_t gen; }; struct adapter { SLIST_ENTRY(adapter) link; device_t dev; struct cdev *cdev; const struct devnames *names; /* PCIe register resources */ int regs_rid; struct resource *regs_res; int msix_rid; struct resource *msix_res; bus_space_handle_t bh; bus_space_tag_t bt; bus_size_t mmio_len; int udbs_rid; struct resource *udbs_res; volatile uint8_t *udbs_base; unsigned int pf; unsigned int mbox; unsigned int vpd_busy; unsigned int vpd_flag; /* Interrupt information */ int intr_type; int intr_count; struct irq { struct resource *res; int rid; void *tag; struct sge_rxq *rxq; struct sge_nm_rxq *nm_rxq; } __aligned(CACHE_LINE_SIZE) *irq; int sge_gts_reg; int sge_kdoorbell_reg; bus_dma_tag_t dmat; /* Parent DMA tag */ struct sge sge; int lro_timeout; int sc_do_rxcopy; int vxlan_port; u_int vxlan_refcount; int rawf_base; int nrawf; struct taskqueue *tq[MAX_NCHAN]; /* General purpose taskqueues */ struct port_info *port[MAX_NPORTS]; uint8_t chan_map[MAX_NCHAN]; 
/* channel -> port */ CXGBE_LIST_HEAD(, clip_entry) *clip_table; TAILQ_HEAD(, clip_entry) clip_pending; /* these need hw update. */ u_long clip_mask; int clip_gen; struct timeout_task clip_task; void *tom_softc; /* (struct tom_data *) */ struct tom_tunables tt; struct t4_offload_policy *policy; struct rwlock policy_lock; void *iwarp_softc; /* (struct c4iw_dev *) */ struct iw_tunables iwt; void *iscsi_ulp_softc; /* (struct cxgbei_data *) */ struct l2t_data *l2t; /* L2 table */ struct smt_data *smt; /* Source MAC Table */ struct tid_info tids; vmem_t *key_map; struct tls_tunables tlst; uint8_t doorbells; int offload_map; /* port_id's with IFCAP_TOE enabled */ int bt_map; /* tx_chan's with BASE-T */ int active_ulds; /* ULDs activated on this adapter */ int flags; int debug_flags; int error_flags; /* Used by error handler and live reset. */ char ifp_lockname[16]; struct mtx ifp_lock; - struct ifnet *ifp; /* tracer ifp */ + if_t ifp; /* tracer ifp */ struct ifmedia media; int traceq; /* iq used by all tracers, -1 if none */ int tracer_valid; /* bitmap of valid tracers */ int tracer_enabled; /* bitmap of enabled tracers */ char fw_version[16]; char tp_version[16]; char er_version[16]; char bs_version[16]; char cfg_file[32]; u_int cfcsum; struct adapter_params params; const struct chip_params *chip_params; struct t4_virt_res vres; uint16_t nbmcaps; uint16_t linkcaps; uint16_t switchcaps; uint16_t niccaps; uint16_t toecaps; uint16_t rdmacaps; uint16_t cryptocaps; uint16_t iscsicaps; uint16_t fcoecaps; struct sysctl_ctx_list ctx; struct sysctl_oid *ctrlq_oid; struct sysctl_oid *fwq_oid; struct mtx sc_lock; char lockname[16]; /* Starving free lists */ struct mtx sfl_lock; /* same cache-line as sc_lock? but that's ok */ TAILQ_HEAD(, sge_fl) sfl; struct callout sfl_callout; struct callout cal_callout; struct clock_sync cal_info[CNT_CAL_INFO]; int cal_current; int cal_count; uint32_t cal_gen; /* * Driver code that can run when the adapter is suspended must use this * lock or a synchronized_op and check for HW_OFF_LIMITS before * accessing hardware. * * XXX: could be changed to rwlock. wlock in suspend/resume and for * indirect register access, rlock everywhere else. 
*/ struct mtx reg_lock; struct memwin memwin[NUM_MEMWIN]; /* memory windows */ struct mtx tc_lock; struct task tc_task; struct task fatal_error_task; struct task reset_task; const void *reset_thread; int num_resets; int incarnation; const char *last_op; const void *last_op_thr; int last_op_flags; int swintr; int sensor_resets; struct callout ktls_tick; }; #define ADAPTER_LOCK(sc) mtx_lock(&(sc)->sc_lock) #define ADAPTER_UNLOCK(sc) mtx_unlock(&(sc)->sc_lock) #define ADAPTER_LOCK_ASSERT_OWNED(sc) mtx_assert(&(sc)->sc_lock, MA_OWNED) #define ADAPTER_LOCK_ASSERT_NOTOWNED(sc) mtx_assert(&(sc)->sc_lock, MA_NOTOWNED) #define ASSERT_SYNCHRONIZED_OP(sc) \ KASSERT(IS_BUSY(sc) && \ (mtx_owned(&(sc)->sc_lock) || sc->last_op_thr == curthread), \ ("%s: operation not synchronized.", __func__)) #define PORT_LOCK(pi) mtx_lock(&(pi)->pi_lock) #define PORT_UNLOCK(pi) mtx_unlock(&(pi)->pi_lock) #define PORT_LOCK_ASSERT_OWNED(pi) mtx_assert(&(pi)->pi_lock, MA_OWNED) #define PORT_LOCK_ASSERT_NOTOWNED(pi) mtx_assert(&(pi)->pi_lock, MA_NOTOWNED) #define FL_LOCK(fl) mtx_lock(&(fl)->fl_lock) #define FL_TRYLOCK(fl) mtx_trylock(&(fl)->fl_lock) #define FL_UNLOCK(fl) mtx_unlock(&(fl)->fl_lock) #define FL_LOCK_ASSERT_OWNED(fl) mtx_assert(&(fl)->fl_lock, MA_OWNED) #define FL_LOCK_ASSERT_NOTOWNED(fl) mtx_assert(&(fl)->fl_lock, MA_NOTOWNED) #define RXQ_FL_LOCK(rxq) FL_LOCK(&(rxq)->fl) #define RXQ_FL_UNLOCK(rxq) FL_UNLOCK(&(rxq)->fl) #define RXQ_FL_LOCK_ASSERT_OWNED(rxq) FL_LOCK_ASSERT_OWNED(&(rxq)->fl) #define RXQ_FL_LOCK_ASSERT_NOTOWNED(rxq) FL_LOCK_ASSERT_NOTOWNED(&(rxq)->fl) #define EQ_LOCK(eq) mtx_lock(&(eq)->eq_lock) #define EQ_TRYLOCK(eq) mtx_trylock(&(eq)->eq_lock) #define EQ_UNLOCK(eq) mtx_unlock(&(eq)->eq_lock) #define EQ_LOCK_ASSERT_OWNED(eq) mtx_assert(&(eq)->eq_lock, MA_OWNED) #define EQ_LOCK_ASSERT_NOTOWNED(eq) mtx_assert(&(eq)->eq_lock, MA_NOTOWNED) #define TXQ_LOCK(txq) EQ_LOCK(&(txq)->eq) #define TXQ_TRYLOCK(txq) EQ_TRYLOCK(&(txq)->eq) #define TXQ_UNLOCK(txq) EQ_UNLOCK(&(txq)->eq) #define TXQ_LOCK_ASSERT_OWNED(txq) EQ_LOCK_ASSERT_OWNED(&(txq)->eq) #define TXQ_LOCK_ASSERT_NOTOWNED(txq) EQ_LOCK_ASSERT_NOTOWNED(&(txq)->eq) #define for_each_txq(vi, iter, q) \ for (q = &vi->adapter->sge.txq[vi->first_txq], iter = 0; \ iter < vi->ntxq; ++iter, ++q) #define for_each_rxq(vi, iter, q) \ for (q = &vi->adapter->sge.rxq[vi->first_rxq], iter = 0; \ iter < vi->nrxq; ++iter, ++q) #define for_each_ofld_txq(vi, iter, q) \ for (q = &vi->adapter->sge.ofld_txq[vi->first_ofld_txq], iter = 0; \ iter < vi->nofldtxq; ++iter, ++q) #define for_each_ofld_rxq(vi, iter, q) \ for (q = &vi->adapter->sge.ofld_rxq[vi->first_ofld_rxq], iter = 0; \ iter < vi->nofldrxq; ++iter, ++q) #define for_each_nm_txq(vi, iter, q) \ for (q = &vi->adapter->sge.nm_txq[vi->first_nm_txq], iter = 0; \ iter < vi->nnmtxq; ++iter, ++q) #define for_each_nm_rxq(vi, iter, q) \ for (q = &vi->adapter->sge.nm_rxq[vi->first_nm_rxq], iter = 0; \ iter < vi->nnmrxq; ++iter, ++q) #define for_each_vi(_pi, _iter, _vi) \ for ((_vi) = (_pi)->vi, (_iter) = 0; (_iter) < (_pi)->nvi; \ ++(_iter), ++(_vi)) #define IDXINCR(idx, incr, wrap) do { \ idx = wrap - idx > incr ? idx + incr : incr - (wrap - idx); \ } while (0) #define IDXDIFF(head, tail, wrap) \ ((head) >= (tail) ? 
(head) - (tail) : (wrap) - (tail) + (head)) /* One for errors, one for firmware events */ #define T4_EXTRA_INTR 2 /* One for firmware events */ #define T4VF_EXTRA_INTR 1 static inline int forwarding_intr_to_fwq(struct adapter *sc) { return (sc->intr_count == 1); } /* Works reliably inside a sync_op or with reg_lock held. */ static inline bool hw_off_limits(struct adapter *sc) { int off_limits = atomic_load_int(&sc->error_flags) & HW_OFF_LIMITS; return (__predict_false(off_limits != 0)); } static inline int mbuf_nsegs(struct mbuf *m) { M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.inner_l5hlen > 0, ("%s: mbuf %p missing information on # of segments.", __func__, m)); return (m->m_pkthdr.inner_l5hlen); } static inline void set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs) { M_ASSERTPKTHDR(m); m->m_pkthdr.inner_l5hlen = nsegs; } /* Internal mbuf flags stored in PH_loc.eight[1]. */ #define MC_NOMAP 0x01 #define MC_RAW_WR 0x02 #define MC_TLS 0x04 static inline int mbuf_cflags(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_loc.eight[4]); } static inline void set_mbuf_cflags(struct mbuf *m, uint8_t flags) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_loc.eight[4] = flags; } static inline int mbuf_len16(struct mbuf *m) { int n; M_ASSERTPKTHDR(m); n = m->m_pkthdr.PH_loc.eight[0]; if (!(mbuf_cflags(m) & MC_TLS)) MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16); return (n); } static inline void set_mbuf_len16(struct mbuf *m, uint8_t len16) { M_ASSERTPKTHDR(m); if (!(mbuf_cflags(m) & MC_TLS)) MPASS(len16 > 0 && len16 <= SGE_MAX_WR_LEN / 16); m->m_pkthdr.PH_loc.eight[0] = len16; } static inline uint32_t t4_read_reg(struct adapter *sc, uint32_t reg) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); return bus_space_read_4(sc->bt, sc->bh, reg); } static inline void t4_write_reg(struct adapter *sc, uint32_t reg, uint32_t val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); bus_space_write_4(sc->bt, sc->bh, reg, val); } static inline uint64_t t4_read_reg64(struct adapter *sc, uint32_t reg) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); #ifdef __LP64__ return bus_space_read_8(sc->bt, sc->bh, reg); #else return (uint64_t)bus_space_read_4(sc->bt, sc->bh, reg) + ((uint64_t)bus_space_read_4(sc->bt, sc->bh, reg + 4) << 32); #endif } static inline void t4_write_reg64(struct adapter *sc, uint32_t reg, uint64_t val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); #ifdef __LP64__ bus_space_write_8(sc->bt, sc->bh, reg, val); #else bus_space_write_4(sc->bt, sc->bh, reg, val); bus_space_write_4(sc->bt, sc->bh, reg + 4, val>> 32); #endif } static inline void t4_os_pci_read_cfg1(struct adapter *sc, int reg, uint8_t *val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); *val = pci_read_config(sc->dev, reg, 1); } static inline void t4_os_pci_write_cfg1(struct adapter *sc, int reg, uint8_t val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); pci_write_config(sc->dev, reg, val, 1); } static inline void t4_os_pci_read_cfg2(struct adapter *sc, int reg, uint16_t *val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); *val = pci_read_config(sc->dev, reg, 2); } static inline void t4_os_pci_write_cfg2(struct adapter *sc, int reg, uint16_t val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); pci_write_config(sc->dev, reg, val, 2); } static inline void t4_os_pci_read_cfg4(struct adapter *sc, int reg, uint32_t *val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); *val = pci_read_config(sc->dev, reg, 4); } static inline 
void t4_os_pci_write_cfg4(struct adapter *sc, int reg, uint32_t val) { if (hw_off_limits(sc)) MPASS(curthread == sc->reset_thread); pci_write_config(sc->dev, reg, val, 4); } static inline struct port_info * adap2pinfo(struct adapter *sc, int idx) { return (sc->port[idx]); } static inline void t4_os_set_hw_addr(struct port_info *pi, uint8_t hw_addr[]) { bcopy(hw_addr, pi->vi[0].hw_addr, ETHER_ADDR_LEN); } static inline int tx_resume_threshold(struct sge_eq *eq) { /* not quite the same as qsize / 4, but this will do. */ return (eq->sidx / 4); } static inline int t4_use_ldst(struct adapter *sc) { #ifdef notyet return (sc->flags & FW_OK || !sc->use_bd); #else return (0); #endif } static inline void CH_DUMP_MBOX(struct adapter *sc, int mbox, const int reg, const char *msg, const __be64 *const p, const bool err) { if (!(sc->debug_flags & DF_DUMP_MBOX) && !err) return; if (p != NULL) { log(err ? LOG_ERR : LOG_DEBUG, "%s: mbox %u %s %016llx %016llx %016llx %016llx " "%016llx %016llx %016llx %016llx\n", device_get_nameunit(sc->dev), mbox, msg, (long long)be64_to_cpu(p[0]), (long long)be64_to_cpu(p[1]), (long long)be64_to_cpu(p[2]), (long long)be64_to_cpu(p[3]), (long long)be64_to_cpu(p[4]), (long long)be64_to_cpu(p[5]), (long long)be64_to_cpu(p[6]), (long long)be64_to_cpu(p[7])); } else { log(err ? LOG_ERR : LOG_DEBUG, "%s: mbox %u %s %016llx %016llx %016llx %016llx " "%016llx %016llx %016llx %016llx\n", device_get_nameunit(sc->dev), mbox, msg, (long long)t4_read_reg64(sc, reg), (long long)t4_read_reg64(sc, reg + 8), (long long)t4_read_reg64(sc, reg + 16), (long long)t4_read_reg64(sc, reg + 24), (long long)t4_read_reg64(sc, reg + 32), (long long)t4_read_reg64(sc, reg + 40), (long long)t4_read_reg64(sc, reg + 48), (long long)t4_read_reg64(sc, reg + 56)); } } /* t4_main.c */ extern int t4_ntxq; extern int t4_nrxq; extern int t4_intr_types; extern int t4_tmr_idx; extern int t4_pktc_idx; extern unsigned int t4_qsize_rxq; extern unsigned int t4_qsize_txq; extern device_method_t cxgbe_methods[]; int t4_os_find_pci_capability(struct adapter *, int); int t4_os_pci_save_state(struct adapter *); int t4_os_pci_restore_state(struct adapter *); void t4_os_portmod_changed(struct port_info *); void t4_os_link_changed(struct port_info *); void t4_iterate(void (*)(struct adapter *, void *), void *); void t4_init_devnames(struct adapter *); void t4_add_adapter(struct adapter *); int t4_detach_common(device_t); int t4_map_bars_0_and_4(struct adapter *); int t4_map_bar_2(struct adapter *); int t4_setup_intr_handlers(struct adapter *); void t4_sysctls(struct adapter *); int begin_synchronized_op(struct adapter *, struct vi_info *, int, char *); void doom_vi(struct adapter *, struct vi_info *); void end_synchronized_op(struct adapter *, int); -int update_mac_settings(struct ifnet *, int); +int update_mac_settings(if_t, int); int adapter_init(struct adapter *); int vi_init(struct vi_info *); void vi_sysctls(struct vi_info *); int rw_via_memwin(struct adapter *, int, uint32_t, uint32_t *, int, int); int alloc_atid(struct adapter *, void *); void *lookup_atid(struct adapter *, int); void free_atid(struct adapter *, int); void release_tid(struct adapter *, int, struct sge_wrq *); -int cxgbe_media_change(struct ifnet *); -void cxgbe_media_status(struct ifnet *, struct ifmediareq *); +int cxgbe_media_change(if_t); +void cxgbe_media_status(if_t, struct ifmediareq *); void t4_os_cim_err(struct adapter *); #ifdef KERN_TLS /* t6_kern_tls.c */ -int t6_tls_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, +int 
t6_tls_tag_alloc(if_t, union if_snd_tag_alloc_params *, struct m_snd_tag **); void t6_ktls_modload(void); void t6_ktls_modunload(void); -int t6_ktls_try(struct ifnet *, struct socket *, struct ktls_session *); +int t6_ktls_try(if_t, struct socket *, struct ktls_session *); int t6_ktls_parse_pkt(struct mbuf *); int t6_ktls_write_wr(struct sge_txq *, void *, struct mbuf *, u_int); #endif /* t4_keyctx.c */ struct auth_hash; union authctx; #ifdef KERN_TLS struct ktls_session; struct tls_key_req; struct tls_keyctx; #endif void t4_aes_getdeckey(void *, const void *, unsigned int); void t4_copy_partial_hash(int, union authctx *, void *); void t4_init_gmac_hash(const char *, int, char *); void t4_init_hmac_digest(const struct auth_hash *, u_int, const char *, int, char *); #ifdef KERN_TLS u_int t4_tls_key_info_size(const struct ktls_session *); int t4_tls_proto_ver(const struct ktls_session *); int t4_tls_cipher_mode(const struct ktls_session *); int t4_tls_auth_mode(const struct ktls_session *); int t4_tls_hmac_ctrl(const struct ktls_session *); void t4_tls_key_ctx(const struct ktls_session *, int, struct tls_keyctx *); int t4_alloc_tls_keyid(struct adapter *); void t4_free_tls_keyid(struct adapter *, int); void t4_write_tlskey_wr(const struct ktls_session *, int, int, int, int, struct tls_key_req *); #endif #ifdef DEV_NETMAP /* t4_netmap.c */ struct sge_nm_rxq; void cxgbe_nm_attach(struct vi_info *); void cxgbe_nm_detach(struct vi_info *); void service_nm_rxq(struct sge_nm_rxq *); int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int); int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *); int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int); int free_nm_txq(struct vi_info *, struct sge_nm_txq *); #endif /* t4_sge.c */ void t4_sge_modload(void); void t4_sge_modunload(void); uint64_t t4_sge_extfree_refs(void); void t4_tweak_chip_settings(struct adapter *); int t4_verify_chip_settings(struct adapter *); void t4_init_rx_buf_info(struct adapter *); int t4_create_dma_tag(struct adapter *); void t4_sge_sysctls(struct adapter *, struct sysctl_ctx_list *, struct sysctl_oid_list *); int t4_destroy_dma_tag(struct adapter *); int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *, bus_addr_t *, void **); int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t, void *); void free_fl_buffers(struct adapter *, struct sge_fl *); int t4_setup_adapter_queues(struct adapter *); int t4_teardown_adapter_queues(struct adapter *); int t4_setup_vi_queues(struct vi_info *); int t4_teardown_vi_queues(struct vi_info *); void t4_intr_all(void *); void t4_intr(void *); #ifdef DEV_NETMAP void t4_nm_intr(void *); void t4_vi_intr(void *); #endif void t4_intr_err(void *); void t4_intr_evt(void *); void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *); -void t4_update_fl_bufsize(struct ifnet *); +void t4_update_fl_bufsize(if_t); struct mbuf *alloc_wr_mbuf(int, int); int parse_pkt(struct mbuf **, bool); void *start_wrq_wr(struct sge_wrq *, int, struct wrq_cookie *); void commit_wrq_wr(struct sge_wrq *, void *, struct wrq_cookie *); int t4_sge_set_conm_context(struct adapter *, int, int, int); void t4_register_an_handler(an_handler_t); void t4_register_fw_msg_handler(int, fw_msg_handler_t); void t4_register_cpl_handler(int, cpl_handler_t); void t4_register_shared_cpl_handler(int, cpl_handler_t, int); #ifdef RATELIMIT void send_etid_flush_wr(struct cxgbe_rate_tag *); #endif /* t4_tracer.c */ struct t4_tracer; void t4_tracer_modload(void); void 
t4_tracer_modunload(void); void t4_tracer_port_detach(struct adapter *); int t4_get_tracer(struct adapter *, struct t4_tracer *); int t4_set_tracer(struct adapter *, struct t4_tracer *); int t4_trace_pkt(struct sge_iq *, const struct rss_header *, struct mbuf *); int t5_trace_pkt(struct sge_iq *, const struct rss_header *, struct mbuf *); /* t4_sched.c */ int t4_set_sched_class(struct adapter *, struct t4_sched_params *); int t4_set_sched_queue(struct adapter *, struct t4_sched_queue *); int t4_init_tx_sched(struct adapter *); int t4_free_tx_sched(struct adapter *); void t4_update_tx_sched(struct adapter *); int t4_reserve_cl_rl_kbps(struct adapter *, int, u_int, int *); void t4_release_cl_rl(struct adapter *, int, int); int sysctl_tc(SYSCTL_HANDLER_ARGS); int sysctl_tc_params(SYSCTL_HANDLER_ARGS); #ifdef RATELIMIT void t4_init_etid_table(struct adapter *); void t4_free_etid_table(struct adapter *); struct cxgbe_rate_tag *lookup_etid(struct adapter *, int); -int cxgbe_rate_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, +int cxgbe_rate_tag_alloc(if_t, union if_snd_tag_alloc_params *, struct m_snd_tag **); void cxgbe_rate_tag_free_locked(struct cxgbe_rate_tag *); -void cxgbe_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *); +void cxgbe_ratelimit_query(if_t, struct if_ratelimit_query_results *); #endif /* t4_filter.c */ int get_filter_mode(struct adapter *, uint32_t *); int set_filter_mode(struct adapter *, uint32_t); int set_filter_mask(struct adapter *, uint32_t); int get_filter(struct adapter *, struct t4_filter *); int set_filter(struct adapter *, struct t4_filter *); int del_filter(struct adapter *, struct t4_filter *); int t4_filter_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); int t4_hashfilter_ao_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); int t4_hashfilter_tcb_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); int t4_del_hashfilter_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); void free_hftid_hash(struct tid_info *); static inline struct wrqe * alloc_wrqe(int wr_len, struct sge_wrq *wrq) { int len = offsetof(struct wrqe, wr) + wr_len; struct wrqe *wr; wr = malloc(len, M_CXGBE, M_NOWAIT); if (__predict_false(wr == NULL)) return (NULL); wr->wr_len = wr_len; wr->wrq = wrq; return (wr); } static inline void * wrtod(struct wrqe *wr) { return (&wr->wr[0]); } static inline void free_wrqe(struct wrqe *wr) { free(wr, M_CXGBE); } static inline void t4_wrq_tx(struct adapter *sc, struct wrqe *wr) { struct sge_wrq *wrq = wr->wrq; TXQ_LOCK(wrq); t4_wrq_tx_locked(sc, wrq, wr); TXQ_UNLOCK(wrq); } static inline int read_via_memwin(struct adapter *sc, int idx, uint32_t addr, uint32_t *val, int len) { return (rw_via_memwin(sc, idx, addr, val, len, 0)); } static inline int write_via_memwin(struct adapter *sc, int idx, uint32_t addr, const uint32_t *val, int len) { return (rw_via_memwin(sc, idx, addr, (void *)(uintptr_t)val, len, 1)); } /* Number of len16 -> number of descriptors */ static inline int tx_len16_to_desc(int len16) { return (howmany(len16, EQ_ESIZE / 16)); } #endif diff --git a/sys/dev/cxgbe/crypto/t6_kern_tls.c b/sys/dev/cxgbe/crypto/t6_kern_tls.c index f1302d063051..2b50899ba17a 100644 --- a/sys/dev/cxgbe/crypto/t6_kern_tls.c +++ b/sys/dev/cxgbe/crypto/t6_kern_tls.c @@ -1,2141 +1,2141 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2018-2019 Chelsio Communications, Inc. * All rights reserved. 
* Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_tcb.h" #include "t4_l2t.h" #include "t4_clip.h" #include "t4_mp_ring.h" #include "crypto/t4_crypto.h" #if defined(INET) || defined(INET6) #define TLS_HEADER_LENGTH 5 struct tls_scmd { __be32 seqno_numivs; __be32 ivgen_hdrlen; }; struct tlspcb { struct m_snd_tag com; struct vi_info *vi; /* virtual interface */ struct adapter *sc; struct l2t_entry *l2te; /* L2 table entry used by this connection */ struct sge_txq *txq; int tid; /* Connection identifier */ int tx_key_addr; bool inline_key; bool using_timestamps; unsigned char enc_mode; struct tls_scmd scmd0; struct tls_scmd scmd0_short; unsigned int tx_key_info_size; uint32_t prev_seq; uint32_t prev_ack; uint32_t prev_tsecr; uint16_t prev_win; uint16_t prev_mss; /* Only used outside of setup and teardown when using inline keys. */ struct tls_keyctx keyctx; /* Fields only used during setup and teardown. 
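 * They are touched during tag allocation, the active open handshake,
 * and final teardown, but not on the per-packet transmit path.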
*/ struct inpcb *inp; /* backpointer to host stack's PCB */ struct sge_wrq *ctrlq; struct clip_entry *ce; /* CLIP table entry used by this tid */ bool open_pending; }; static void t6_tls_tag_free(struct m_snd_tag *mst); static int ktls_setup_keys(struct tlspcb *tlsp, const struct ktls_session *tls, struct sge_txq *txq); static const struct if_snd_tag_sw t6_tls_tag_sw = { .snd_tag_free = t6_tls_tag_free, .type = IF_SND_TAG_TYPE_TLS }; static inline struct tlspcb * mst_to_tls(struct m_snd_tag *t) { return (__containerof(t, struct tlspcb, com)); } static struct tlspcb * -alloc_tlspcb(struct ifnet *ifp, struct vi_info *vi, int flags) +alloc_tlspcb(if_t ifp, struct vi_info *vi, int flags) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct tlspcb *tlsp; tlsp = malloc(sizeof(*tlsp), M_CXGBE, M_ZERO | flags); if (tlsp == NULL) return (NULL); m_snd_tag_init(&tlsp->com, ifp, &t6_tls_tag_sw); tlsp->vi = vi; tlsp->sc = sc; tlsp->ctrlq = &sc->sge.ctrlq[pi->port_id]; tlsp->tid = -1; tlsp->tx_key_addr = -1; return (tlsp); } static int ktls_act_open_cpl_size(bool isipv6) { if (isipv6) return (sizeof(struct cpl_t6_act_open_req6)); else return (sizeof(struct cpl_t6_act_open_req)); } static void mk_ktls_act_open_req(struct adapter *sc, struct vi_info *vi, struct inpcb *inp, struct tlspcb *tlsp, int atid, void *dst) { struct tcpcb *tp = intotcpcb(inp); struct cpl_t6_act_open_req *cpl6; struct cpl_act_open_req *cpl; uint64_t options; int qid_atid; cpl6 = dst; cpl = (struct cpl_act_open_req *)cpl6; INIT_TP_WR(cpl6, 0); qid_atid = V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) | V_TID_COOKIE(CPL_COOKIE_KERN_TLS); OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_atid)); inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, &cpl->peer_port); options = F_TCAM_BYPASS | V_ULP_MODE(ULP_MODE_NONE); options |= V_SMAC_SEL(vi->smt_idx) | V_TX_CHAN(vi->pi->tx_chan); options |= F_NON_OFFLOAD; cpl->opt0 = htobe64(options); options = V_TX_QUEUE(sc->params.tp.tx_modq[vi->pi->tx_chan]); if (tp->t_flags & TF_REQ_TSTMP) options |= F_TSTAMPS_EN; cpl->opt2 = htobe32(options); } static void mk_ktls_act_open_req6(struct adapter *sc, struct vi_info *vi, struct inpcb *inp, struct tlspcb *tlsp, int atid, void *dst) { struct tcpcb *tp = intotcpcb(inp); struct cpl_t6_act_open_req6 *cpl6; struct cpl_act_open_req6 *cpl; uint64_t options; int qid_atid; cpl6 = dst; cpl = (struct cpl_act_open_req6 *)cpl6; INIT_TP_WR(cpl6, 0); qid_atid = V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) | V_TID_COOKIE(CPL_COOKIE_KERN_TLS); OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, qid_atid)); cpl->local_port = inp->inp_lport; cpl->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; cpl->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; cpl->peer_port = inp->inp_fport; cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0]; cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8]; options = F_TCAM_BYPASS | V_ULP_MODE(ULP_MODE_NONE); options |= V_SMAC_SEL(vi->smt_idx) | V_TX_CHAN(vi->pi->tx_chan); options |= F_NON_OFFLOAD; cpl->opt0 = htobe64(options); options = V_TX_QUEUE(sc->params.tp.tx_modq[vi->pi->tx_chan]); if (tp->t_flags & TF_REQ_TSTMP) options |= F_TSTAMPS_EN; cpl->opt2 = htobe32(options); } static int send_ktls_act_open_req(struct adapter *sc, struct vi_info *vi, struct inpcb *inp, struct tlspcb *tlsp, int atid) { struct wrqe *wr; bool isipv6; isipv6 = (inp->inp_vflag & INP_IPV6) != 0; if (isipv6) { tlsp->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true); if (tlsp->ce == 
NULL) return (ENOENT); } wr = alloc_wrqe(ktls_act_open_cpl_size(isipv6), tlsp->ctrlq); if (wr == NULL) { CTR2(KTR_CXGBE, "%s: atid %d failed to alloc WR", __func__, atid); return (ENOMEM); } if (isipv6) mk_ktls_act_open_req6(sc, vi, inp, tlsp, atid, wrtod(wr)); else mk_ktls_act_open_req(sc, vi, inp, tlsp, atid, wrtod(wr)); tlsp->open_pending = true; t4_wrq_tx(sc, wr); return (0); } static int ktls_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); u_int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status))); u_int status = G_AOPEN_STATUS(be32toh(cpl->atid_status)); struct tlspcb *tlsp = lookup_atid(sc, atid); struct inpcb *inp = tlsp->inp; CTR3(KTR_CXGBE, "%s: atid %d status %d", __func__, atid, status); free_atid(sc, atid); if (status == 0) tlsp->tid = GET_TID(cpl); INP_WLOCK(inp); tlsp->open_pending = false; wakeup(tlsp); INP_WUNLOCK(inp); return (0); } /* SET_TCB_FIELD sent as a ULP command looks like this */ #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \ sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core)) _Static_assert((LEN__SET_TCB_FIELD_ULP + sizeof(struct ulptx_idata)) % 16 == 0, "CPL_SET_TCB_FIELD ULP command not 16-byte aligned"); static void write_set_tcb_field_ulp(struct tlspcb *tlsp, void *dst, struct sge_txq *txq, uint16_t word, uint64_t mask, uint64_t val) { struct ulp_txpkt *txpkt; struct ulptx_idata *idata; struct cpl_set_tcb_field_core *cpl; /* ULP_TXPKT */ txpkt = dst; txpkt->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DATAMODIFY(0) | V_ULP_TXPKT_CHANNELID(tlsp->vi->pi->port_id) | V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(txq->eq.cntxt_id) | V_ULP_TXPKT_RO(1)); txpkt->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16)); /* ULPTX_IDATA sub-command */ idata = (struct ulptx_idata *)(txpkt + 1); idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); idata->len = htobe32(sizeof(*cpl)); /* CPL_SET_TCB_FIELD */ cpl = (struct cpl_set_tcb_field_core *)(idata + 1); OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tlsp->tid)); cpl->reply_ctrl = htobe16(F_NO_REPLY); cpl->word_cookie = htobe16(V_WORD(word)); cpl->mask = htobe64(mask); cpl->val = htobe64(val); /* ULPTX_NOOP */ idata = (struct ulptx_idata *)(cpl + 1); idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); idata->len = htobe32(0); } static int ktls_set_tcb_fields(struct tlspcb *tlsp, struct tcpcb *tp, struct sge_txq *txq) { struct fw_ulptx_wr *wr; struct mbuf *m; char *dst; void *items[1]; int error, len; len = sizeof(*wr) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16); if (tp->t_flags & TF_REQ_TSTMP) len += roundup2(LEN__SET_TCB_FIELD_ULP, 16); m = alloc_wr_mbuf(len, M_NOWAIT); if (m == NULL) { CTR2(KTR_CXGBE, "%s: tid %d failed to alloc WR mbuf", __func__, tlsp->tid); return (ENOMEM); } m->m_pkthdr.snd_tag = m_snd_tag_ref(&tlsp->com); m->m_pkthdr.csum_flags |= CSUM_SND_TAG; /* FW_ULPTX_WR */ wr = mtod(m, void *); wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR)); wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA | V_FW_WR_LEN16(len / 16)); wr->cookie = 0; dst = (char *)(wr + 1); /* Clear TF_NON_OFFLOAD and set TF_CORE_BYPASS */ write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_T_FLAGS, V_TCB_T_FLAGS(V_TF_CORE_BYPASS(1) | V_TF_NON_OFFLOAD(1)), V_TCB_T_FLAGS(V_TF_CORE_BYPASS(1))); dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16); /* Clear the SND_UNA_RAW, SND_NXT_RAW, and SND_MAX_RAW offsets. 
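 * SND_UNA is computed by the hardware as TX_MAX - SND_UNA_RAW (see the
 * longer comment in ktls_write_tls_wr()), so zeroing the raw offsets
 * leaves the connection's sequence state tracking TX_MAX.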
*/ write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_SND_UNA_RAW, V_TCB_SND_NXT_RAW(M_TCB_SND_NXT_RAW) | V_TCB_SND_UNA_RAW(M_TCB_SND_UNA_RAW), V_TCB_SND_NXT_RAW(0) | V_TCB_SND_UNA_RAW(0)); dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16); write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_SND_MAX_RAW, V_TCB_SND_MAX_RAW(M_TCB_SND_MAX_RAW), V_TCB_SND_MAX_RAW(0)); dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16); if (tp->t_flags & TF_REQ_TSTMP) { write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_TIMESTAMP_OFFSET, V_TCB_TIMESTAMP_OFFSET(M_TCB_TIMESTAMP_OFFSET), V_TCB_TIMESTAMP_OFFSET(tp->ts_offset >> 28)); dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16); } KASSERT(dst - (char *)wr == len, ("%s: length mismatch", __func__)); items[0] = m; error = mp_ring_enqueue(txq->r, items, 1, 1); if (error) m_free(m); return (error); } int -t6_tls_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, +t6_tls_tag_alloc(if_t ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **pt) { const struct ktls_session *tls; struct tlspcb *tlsp; struct adapter *sc; struct vi_info *vi; struct inpcb *inp; struct tcpcb *tp; struct sge_txq *txq; int atid, error, explicit_iv_size, keyid, mac_first; tls = params->tls.tls; /* Only TLS 1.1 and TLS 1.2 are currently supported. */ if (tls->params.tls_vmajor != TLS_MAJOR_VER_ONE || tls->params.tls_vminor < TLS_MINOR_VER_ONE || tls->params.tls_vminor > TLS_MINOR_VER_TWO) return (EPROTONOSUPPORT); /* Sanity check values in *tls. */ switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: /* XXX: Explicitly ignore any provided IV. */ switch (tls->params.cipher_key_len) { case 128 / 8: case 192 / 8: case 256 / 8: break; default: return (EINVAL); } switch (tls->params.auth_algorithm) { case CRYPTO_SHA1_HMAC: case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: break; default: return (EPROTONOSUPPORT); } explicit_iv_size = AES_BLOCK_LEN; mac_first = 1; break; case CRYPTO_AES_NIST_GCM_16: if (tls->params.iv_len != SALT_SIZE) return (EINVAL); switch (tls->params.cipher_key_len) { case 128 / 8: case 192 / 8: case 256 / 8: break; default: return (EINVAL); } explicit_iv_size = 8; mac_first = 0; break; default: return (EPROTONOSUPPORT); } - vi = ifp->if_softc; + vi = if_getsoftc(ifp); sc = vi->adapter; tlsp = alloc_tlspcb(ifp, vi, M_WAITOK); atid = alloc_atid(sc, tlsp); if (atid < 0) { error = ENOMEM; goto failed; } if (sc->tlst.inline_keys) keyid = -1; else keyid = t4_alloc_tls_keyid(sc); if (keyid < 0) { CTR2(KTR_CXGBE, "%s: atid %d using immediate key ctx", __func__, atid); tlsp->inline_key = true; } else { tlsp->tx_key_addr = keyid; CTR3(KTR_CXGBE, "%s: atid %d allocated TX key addr %#x", __func__, atid, tlsp->tx_key_addr); } inp = params->tls.inp; INP_RLOCK(inp); if (inp->inp_flags & INP_DROPPED) { INP_RUNLOCK(inp); error = ECONNRESET; goto failed; } tlsp->inp = inp; tp = intotcpcb(inp); if (tp->t_flags & TF_REQ_TSTMP) { tlsp->using_timestamps = true; if ((tp->ts_offset & 0xfffffff) != 0) { INP_RUNLOCK(inp); error = EINVAL; goto failed; } } else tlsp->using_timestamps = false; error = send_ktls_act_open_req(sc, vi, inp, tlsp, atid); if (error) { INP_RUNLOCK(inp); goto failed; } /* Wait for reply to active open. */ CTR2(KTR_CXGBE, "%s: atid %d sent CPL_ACT_OPEN_REQ", __func__, atid); while (tlsp->open_pending) { /* * XXX: PCATCH? We would then have to discard the PCB * when the completion CPL arrived. 
*/ error = rw_sleep(tlsp, &inp->inp_lock, 0, "t6tlsop", 0); } atid = -1; if (tlsp->tid < 0) { INP_RUNLOCK(inp); error = ENOMEM; goto failed; } if (inp->inp_flags & INP_DROPPED) { INP_RUNLOCK(inp); error = ECONNRESET; goto failed; } txq = &sc->sge.txq[vi->first_txq]; if (inp->inp_flowtype != M_HASHTYPE_NONE) txq += ((inp->inp_flowid % (vi->ntxq - vi->rsrv_noflowq)) + vi->rsrv_noflowq); tlsp->txq = txq; error = ktls_set_tcb_fields(tlsp, tp, txq); INP_RUNLOCK(inp); if (error) goto failed; error = ktls_setup_keys(tlsp, tls, txq); if (error) goto failed; tlsp->enc_mode = t4_tls_cipher_mode(tls); tlsp->tx_key_info_size = t4_tls_key_info_size(tls); /* The SCMD fields used when encrypting a full TLS record. */ tlsp->scmd0.seqno_numivs = htobe32(V_SCMD_SEQ_NO_CTRL(3) | V_SCMD_PROTO_VERSION(t4_tls_proto_ver(tls)) | V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) | V_SCMD_CIPH_AUTH_SEQ_CTRL((mac_first == 0)) | V_SCMD_CIPH_MODE(tlsp->enc_mode) | V_SCMD_AUTH_MODE(t4_tls_auth_mode(tls)) | V_SCMD_HMAC_CTRL(t4_tls_hmac_ctrl(tls)) | V_SCMD_IV_SIZE(explicit_iv_size / 2) | V_SCMD_NUM_IVS(1)); tlsp->scmd0.ivgen_hdrlen = V_SCMD_IV_GEN_CTRL(0) | V_SCMD_TLS_FRAG_ENABLE(0); if (tlsp->inline_key) tlsp->scmd0.ivgen_hdrlen |= V_SCMD_KEY_CTX_INLINE(1); tlsp->scmd0.ivgen_hdrlen = htobe32(tlsp->scmd0.ivgen_hdrlen); /* * The SCMD fields used when encrypting a partial TLS record * (no trailer and possibly a truncated payload). */ tlsp->scmd0_short.seqno_numivs = V_SCMD_SEQ_NO_CTRL(0) | V_SCMD_PROTO_VERSION(SCMD_PROTO_VERSION_GENERIC) | V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) | V_SCMD_CIPH_AUTH_SEQ_CTRL((mac_first == 0)) | V_SCMD_AUTH_MODE(SCMD_AUTH_MODE_NOP) | V_SCMD_HMAC_CTRL(SCMD_HMAC_CTRL_NOP) | V_SCMD_IV_SIZE(AES_BLOCK_LEN / 2) | V_SCMD_NUM_IVS(0); if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) tlsp->scmd0_short.seqno_numivs |= V_SCMD_CIPH_MODE(SCMD_CIPH_MODE_AES_CTR); else tlsp->scmd0_short.seqno_numivs |= V_SCMD_CIPH_MODE(tlsp->enc_mode); tlsp->scmd0_short.seqno_numivs = htobe32(tlsp->scmd0_short.seqno_numivs); tlsp->scmd0_short.ivgen_hdrlen = V_SCMD_IV_GEN_CTRL(0) | V_SCMD_TLS_FRAG_ENABLE(0) | V_SCMD_AADIVDROP(1); if (tlsp->inline_key) tlsp->scmd0_short.ivgen_hdrlen |= V_SCMD_KEY_CTX_INLINE(1); TXQ_LOCK(txq); if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) txq->kern_tls_gcm++; else txq->kern_tls_cbc++; TXQ_UNLOCK(txq); *pt = &tlsp->com; return (0); failed: if (atid >= 0) free_atid(sc, atid); m_snd_tag_rele(&tlsp->com); return (error); } static int ktls_setup_keys(struct tlspcb *tlsp, const struct ktls_session *tls, struct sge_txq *txq) { struct tls_key_req *kwr; struct tls_keyctx *kctx; void *items[1]; struct mbuf *m; int error; /* * Store the salt and keys in the key context. For * connections with an inline key, this key context is passed * as immediate data in each work request. For connections * storing the key in DDR, a work request is used to store a * copy of the key context in DDR. */ t4_tls_key_ctx(tls, KTLS_TX, &tlsp->keyctx); if (tlsp->inline_key) return (0); /* Populate key work request. */ m = alloc_wr_mbuf(TLS_KEY_WR_SZ, M_NOWAIT); if (m == NULL) { CTR2(KTR_CXGBE, "%s: tid %d failed to alloc WR mbuf", __func__, tlsp->tid); return (ENOMEM); } m->m_pkthdr.snd_tag = m_snd_tag_ref(&tlsp->com); m->m_pkthdr.csum_flags |= CSUM_SND_TAG; kwr = mtod(m, void *); memset(kwr, 0, TLS_KEY_WR_SZ); t4_write_tlskey_wr(tls, KTLS_TX, tlsp->tid, 0, tlsp->tx_key_addr, kwr); kctx = (struct tls_keyctx *)(kwr + 1); memcpy(kctx, &tlsp->keyctx, sizeof(*kctx)); /* * Place the key work request in the transmit queue. 
It * should be sent to the NIC before any TLS packets using this * session. */ items[0] = m; error = mp_ring_enqueue(txq->r, items, 1, 1); if (error) m_free(m); else CTR2(KTR_CXGBE, "%s: tid %d sent key WR", __func__, tlsp->tid); return (error); } static u_int ktls_base_wr_size(struct tlspcb *tlsp) { u_int wr_len; wr_len = sizeof(struct fw_ulptx_wr); // 16 wr_len += sizeof(struct ulp_txpkt); // 8 wr_len += sizeof(struct ulptx_idata); // 8 wr_len += sizeof(struct cpl_tx_sec_pdu);// 32 if (tlsp->inline_key) wr_len += tlsp->tx_key_info_size; else { wr_len += sizeof(struct ulptx_sc_memrd);// 8 wr_len += sizeof(struct ulptx_idata); // 8 } wr_len += sizeof(struct cpl_tx_data); // 16 return (wr_len); } /* How many bytes of TCP payload to send for a given TLS record. */ static u_int ktls_tcp_payload_length(struct tlspcb *tlsp, struct mbuf *m_tls) { struct tls_record_layer *hdr; u_int plen, mlen; M_ASSERTEXTPG(m_tls); hdr = (void *)m_tls->m_epg_hdr; plen = ntohs(hdr->tls_length); /* * What range of the TLS record is the mbuf requesting to be * sent. */ mlen = mtod(m_tls, vm_offset_t) + m_tls->m_len; /* Always send complete records. */ if (mlen == TLS_HEADER_LENGTH + plen) return (mlen); /* * If the host stack has asked to send part of the trailer, * trim the length to avoid sending any of the trailer. There * is no way to send a partial trailer currently. */ if (mlen > TLS_HEADER_LENGTH + plen - m_tls->m_epg_trllen) mlen = TLS_HEADER_LENGTH + plen - m_tls->m_epg_trllen; /* * For AES-CBC adjust the ciphertext length for the block * size. */ if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_CBC && mlen > TLS_HEADER_LENGTH) { mlen = TLS_HEADER_LENGTH + rounddown(mlen - TLS_HEADER_LENGTH, AES_BLOCK_LEN); } #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d short TLS record (%u vs %u)", __func__, tlsp->tid, mlen, TLS_HEADER_LENGTH + plen); #endif return (mlen); } /* * For a "short" TLS record, determine the offset into the TLS record * payload to send. This offset does not include the TLS header, but * a non-zero offset implies that a header will not be sent. */ static u_int ktls_payload_offset(struct tlspcb *tlsp, struct mbuf *m_tls) { struct tls_record_layer *hdr; u_int offset, plen; #ifdef INVARIANTS u_int mlen; #endif M_ASSERTEXTPG(m_tls); hdr = (void *)m_tls->m_epg_hdr; plen = ntohs(hdr->tls_length); #ifdef INVARIANTS mlen = mtod(m_tls, vm_offset_t) + m_tls->m_len; MPASS(mlen < TLS_HEADER_LENGTH + plen); #endif if (mtod(m_tls, vm_offset_t) <= m_tls->m_epg_hdrlen) return (0); if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) { /* * Always send something. This function is only called * if we aren't sending the tag at all, but if the * request starts in the tag then we are in an odd * state where would effectively send nothing. Cap * the offset at the last byte of the record payload * to send the last cipher block. */ offset = min(mtod(m_tls, vm_offset_t) - m_tls->m_epg_hdrlen, (plen - TLS_HEADER_LENGTH - m_tls->m_epg_trllen) - 1); return (rounddown(offset, AES_BLOCK_LEN)); } return (0); } static u_int ktls_sgl_size(u_int nsegs) { u_int wr_len; /* First segment is part of ulptx_sgl. */ nsegs--; wr_len = sizeof(struct ulptx_sgl); wr_len += 8 * ((3 * nsegs) / 2 + (nsegs & 1)); return (wr_len); } static int ktls_wr_len(struct tlspcb *tlsp, struct mbuf *m, struct mbuf *m_tls, int *nsegsp) { struct tls_record_layer *hdr; u_int imm_len, offset, plen, wr_len, tlen; M_ASSERTEXTPG(m_tls); /* * Determine the size of the TLS record payload to send * excluding header and trailer. 
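 * Requests that cover no more than the TLS header are costed below as
 * a plain tunnelled packet rather than a crypto work request.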
*/ tlen = ktls_tcp_payload_length(tlsp, m_tls); if (tlen <= m_tls->m_epg_hdrlen) { /* * For requests that only want to send the TLS header, * send a tunnelled packet as immediate data. */ wr_len = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + roundup2(m->m_len + m_tls->m_len, 16); if (wr_len > SGE_MAX_WR_LEN) { CTR3(KTR_CXGBE, "%s: tid %d TLS header-only packet too long (len %d)", __func__, tlsp->tid, m->m_len + m_tls->m_len); } /* This should always be the last TLS record in a chain. */ MPASS(m_tls->m_next == NULL); *nsegsp = 0; return (wr_len); } hdr = (void *)m_tls->m_epg_hdr; plen = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - m_tls->m_epg_trllen; if (tlen < plen) { plen = tlen; offset = ktls_payload_offset(tlsp, m_tls); } else offset = 0; /* Calculate the size of the work request. */ wr_len = ktls_base_wr_size(tlsp); /* * Full records and short records with an offset of 0 include * the TLS header as immediate data. Short records include a * raw AES IV as immediate data. */ imm_len = 0; if (offset == 0) imm_len += m_tls->m_epg_hdrlen; if (plen == tlen) imm_len += AES_BLOCK_LEN; wr_len += roundup2(imm_len, 16); /* TLS record payload via DSGL. */ *nsegsp = sglist_count_mbuf_epg(m_tls, m_tls->m_epg_hdrlen + offset, plen - (m_tls->m_epg_hdrlen + offset)); wr_len += ktls_sgl_size(*nsegsp); wr_len = roundup2(wr_len, 16); return (wr_len); } /* * See if we have any TCP options requiring a dedicated options-only * packet. */ static int ktls_has_tcp_options(struct tcphdr *tcp) { u_char *cp; int cnt, opt, optlen; cp = (u_char *)(tcp + 1); cnt = tcp->th_off * 4 - sizeof(struct tcphdr); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_NOP: case TCPOPT_TIMESTAMP: break; default: return (1); } } return (0); } /* * Find the TCP timestamp option. */ static void * ktls_find_tcp_timestamps(struct tcphdr *tcp) { u_char *cp; int cnt, opt, optlen; cp = (u_char *)(tcp + 1); cnt = tcp->th_off * 4 - sizeof(struct tcphdr); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } if (opt == TCPOPT_TIMESTAMP && optlen == TCPOLEN_TIMESTAMP) return (cp + 2); } return (NULL); } int t6_ktls_parse_pkt(struct mbuf *m) { struct tlspcb *tlsp; struct ether_header *eh; struct ip *ip; struct ip6_hdr *ip6; struct tcphdr *tcp; struct mbuf *m_tls; void *items[1]; int nsegs; u_int wr_len, tot_len; /* * Locate headers in initial mbuf. * * XXX: This assumes all of the headers are in the initial mbuf. * Could perhaps use m_advance() like parse_pkt() if that turns * out to not be true. */ M_ASSERTPKTHDR(m); MPASS(m->m_pkthdr.snd_tag != NULL); tlsp = mst_to_tls(m->m_pkthdr.snd_tag); if (m->m_len <= sizeof(*eh) + sizeof(*ip)) { CTR2(KTR_CXGBE, "%s: tid %d header mbuf too short", __func__, tlsp->tid); return (EINVAL); } eh = mtod(m, struct ether_header *); if (ntohs(eh->ether_type) != ETHERTYPE_IP && ntohs(eh->ether_type) != ETHERTYPE_IPV6) { CTR2(KTR_CXGBE, "%s: tid %d mbuf not ETHERTYPE_IP{,V6}", __func__, tlsp->tid); return (EINVAL); } m->m_pkthdr.l2hlen = sizeof(*eh); /* XXX: Reject unsupported IP options? 
*/ if (ntohs(eh->ether_type) == ETHERTYPE_IP) { ip = (struct ip *)(eh + 1); if (ip->ip_p != IPPROTO_TCP) { CTR2(KTR_CXGBE, "%s: tid %d mbuf not IPPROTO_TCP", __func__, tlsp->tid); return (EINVAL); } m->m_pkthdr.l3hlen = ip->ip_hl * 4; } else { ip6 = (struct ip6_hdr *)(eh + 1); if (ip6->ip6_nxt != IPPROTO_TCP) { CTR3(KTR_CXGBE, "%s: tid %d mbuf not IPPROTO_TCP (%u)", __func__, tlsp->tid, ip6->ip6_nxt); return (EINVAL); } m->m_pkthdr.l3hlen = sizeof(struct ip6_hdr); } if (m->m_len < m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp)) { CTR2(KTR_CXGBE, "%s: tid %d header mbuf too short (2)", __func__, tlsp->tid); return (EINVAL); } tcp = (struct tcphdr *)((char *)(eh + 1) + m->m_pkthdr.l3hlen); m->m_pkthdr.l4hlen = tcp->th_off * 4; /* Bail if there is TCP payload before the TLS record. */ if (m->m_len != m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + m->m_pkthdr.l4hlen) { CTR6(KTR_CXGBE, "%s: tid %d header mbuf bad length (%d + %d + %d != %d)", __func__, tlsp->tid, m->m_pkthdr.l2hlen, m->m_pkthdr.l3hlen, m->m_pkthdr.l4hlen, m->m_len); return (EINVAL); } /* Assume all headers are in 'm' for now. */ MPASS(m->m_next != NULL); MPASS(m->m_next->m_flags & M_EXTPG); tot_len = 0; /* * Each of the remaining mbufs in the chain should reference a * TLS record. */ for (m_tls = m->m_next; m_tls != NULL; m_tls = m_tls->m_next) { MPASS(m_tls->m_flags & M_EXTPG); wr_len = ktls_wr_len(tlsp, m, m_tls, &nsegs); #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d wr_len %d nsegs %d", __func__, tlsp->tid, wr_len, nsegs); #endif if (wr_len > SGE_MAX_WR_LEN || nsegs > TX_SGL_SEGS) return (EFBIG); tot_len += roundup2(wr_len, EQ_ESIZE); /* * Store 'nsegs' for the first TLS record in the * header mbuf's metadata. */ if (m_tls == m->m_next) set_mbuf_nsegs(m, nsegs); } MPASS(tot_len != 0); /* * See if we have any TCP options or a FIN requiring a * dedicated packet. */ if ((tcp->th_flags & TH_FIN) != 0 || ktls_has_tcp_options(tcp)) { wr_len = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + roundup2(m->m_len, 16); if (wr_len > SGE_MAX_WR_LEN) { CTR3(KTR_CXGBE, "%s: tid %d options-only packet too long (len %d)", __func__, tlsp->tid, m->m_len); return (EINVAL); } tot_len += roundup2(wr_len, EQ_ESIZE); } /* Include room for a TP work request to program an L2T entry. */ tot_len += EQ_ESIZE; /* * Include room for a ULPTX work request including up to 5 * CPL_SET_TCB_FIELD commands before the first TLS work * request. */ wr_len = sizeof(struct fw_ulptx_wr) + 5 * roundup2(LEN__SET_TCB_FIELD_ULP, 16); /* * If timestamps are present, reserve 1 more command for * setting the echoed timestamp. */ if (tlsp->using_timestamps) wr_len += roundup2(LEN__SET_TCB_FIELD_ULP, 16); tot_len += roundup2(wr_len, EQ_ESIZE); set_mbuf_len16(m, tot_len / 16); #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d len16 %d nsegs %d", __func__, tlsp->tid, mbuf_len16(m), mbuf_nsegs(m)); #endif items[0] = m; return (mp_ring_enqueue(tlsp->txq->r, items, 1, 256)); } /* * If the SGL ends on an address that is not 16 byte aligned, this function will * add a 0 filled flit at the end. 
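 * For example, a 3-segment list needs 2 flits for the ulptx_sgl header
 * (cmd/len0/addr0) plus 3 flits for the remaining two len/addr pairs,
 * i.e. nflits = 5; the odd flit count is then padded with one zero
 * flit so the next object starts 16-byte aligned.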
*/ static void write_gl_to_buf(struct sglist *gl, caddr_t to) { struct sglist_seg *seg; __be64 *flitp; struct ulptx_sgl *usgl; int i, nflits, nsegs; KASSERT(((uintptr_t)to & 0xf) == 0, ("%s: SGL must start at a 16 byte boundary: %p", __func__, to)); nsegs = gl->sg_nseg; MPASS(nsegs > 0); nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2; flitp = (__be64 *)to; seg = &gl->sg_segs[0]; usgl = (void *)flitp; usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); usgl->len0 = htobe32(seg->ss_len); usgl->addr0 = htobe64(seg->ss_paddr); seg++; for (i = 0; i < nsegs - 1; i++, seg++) { usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr); } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); flitp += nflits; if (nflits & 1) { MPASS(((uintptr_t)flitp) & 0xf); *flitp++ = 0; } MPASS((((uintptr_t)flitp) & 0xf) == 0); } static inline void copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) { MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)&eq->desc[eq->sidx])) { bcopy(from, *to, len); (*to) += len; if ((uintptr_t)(*to) == (uintptr_t)&eq->desc[eq->sidx]) (*to) = (caddr_t)eq->desc; } else { int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to); bcopy(from, *to, portion); from += portion; portion = len - portion; /* remaining */ bcopy(from, (void *)eq->desc, portion); (*to) = (caddr_t)eq->desc + portion; } } static int ktls_write_tcp_options(struct sge_txq *txq, void *dst, struct mbuf *m, u_int available, u_int pidx) { struct tx_sdesc *txsd; struct fw_eth_tx_pkt_wr *wr; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; uint64_t ctrl1; int len16, ndesc, pktlen; struct ether_header *eh; struct ip *ip, newip; struct ip6_hdr *ip6, newip6; struct tcphdr *tcp, newtcp; caddr_t out; TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m); wr = dst; pktlen = m->m_len; ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen; len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16); ndesc = tx_len16_to_desc(len16); MPASS(ndesc <= available); /* Firmware work request header */ wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3 = 0; cpl = (void *)(wr + 1); /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); out = (void *)(cpl + 1); /* Copy over Ethernet header. */ eh = mtod(m, struct ether_header *); copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen); /* Fixup length in IP header and copy out. 
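 * ip_len / ip6_plen are rewritten to describe only the bytes emitted
 * by this work request.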
*/ if (ntohs(eh->ether_type) == ETHERTYPE_IP) { ip = (void *)((char *)eh + m->m_pkthdr.l2hlen); newip = *ip; newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen); copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip)); if (m->m_pkthdr.l3hlen > sizeof(*ip)) copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out, m->m_pkthdr.l3hlen - sizeof(*ip)); ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) | V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) | V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen); } else { ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen); newip6 = *ip6; newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen); copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6)); MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6)); ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) | V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) | V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen); } cpl->ctrl1 = htobe64(ctrl1); txq->txcsum++; /* Clear PUSH and FIN in the TCP header if present. */ tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen); newtcp = *tcp; newtcp.th_flags &= ~(TH_PUSH | TH_FIN); copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp)); /* Copy rest of packet. */ copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, pktlen - (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp))); txq->imm_wrs++; txq->txpkt_wrs++; txq->kern_tls_options++; txsd = &txq->sdesc[pidx]; txsd->m = NULL; txsd->desc_used = ndesc; return (ndesc); } static int ktls_write_tunnel_packet(struct sge_txq *txq, void *dst, struct mbuf *m, struct mbuf *m_tls, u_int available, tcp_seq tcp_seqno, u_int pidx) { struct tx_sdesc *txsd; struct fw_eth_tx_pkt_wr *wr; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; uint64_t ctrl1; int len16, ndesc, pktlen; struct ether_header *eh; struct ip *ip, newip; struct ip6_hdr *ip6, newip6; struct tcphdr *tcp, newtcp; caddr_t out; TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m); /* Locate the template TLS header. */ M_ASSERTEXTPG(m_tls); /* This should always be the last TLS record in a chain. */ MPASS(m_tls->m_next == NULL); wr = dst; pktlen = m->m_len + m_tls->m_len; ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen; len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16); ndesc = tx_len16_to_desc(len16); MPASS(ndesc <= available); /* Firmware work request header */ wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3 = 0; cpl = (void *)(wr + 1); /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); out = (void *)(cpl + 1); /* Copy over Ethernet header. */ eh = mtod(m, struct ether_header *); copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen); /* Fixup length in IP header and copy out. 
*/ if (ntohs(eh->ether_type) == ETHERTYPE_IP) { ip = (void *)((char *)eh + m->m_pkthdr.l2hlen); newip = *ip; newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen); copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip)); if (m->m_pkthdr.l3hlen > sizeof(*ip)) copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out, m->m_pkthdr.l3hlen - sizeof(*ip)); ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) | V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) | V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen); } else { ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen); newip6 = *ip6; newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen); copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6)); MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6)); ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) | V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) | V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen); } cpl->ctrl1 = htobe64(ctrl1); txq->txcsum++; /* Set sequence number in TCP header. */ tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen); newtcp = *tcp; newtcp.th_seq = htonl(tcp_seqno + mtod(m_tls, vm_offset_t)); copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp)); /* Copy rest of TCP header. */ copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, m->m_len - (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp))); /* Copy the subset of the TLS header requested. */ copy_to_txd(&txq->eq, (char *)m_tls->m_epg_hdr + mtod(m_tls, vm_offset_t), &out, m_tls->m_len); txq->imm_wrs++; txq->txpkt_wrs++; txq->kern_tls_header++; txsd = &txq->sdesc[pidx]; txsd->m = m; txsd->desc_used = ndesc; return (ndesc); } _Static_assert(sizeof(struct cpl_set_tcb_field) <= EQ_ESIZE, "CPL_SET_TCB_FIELD must be smaller than a single TX descriptor"); _Static_assert(W_TCB_SND_UNA_RAW == W_TCB_SND_NXT_RAW, "SND_NXT_RAW and SND_UNA_RAW are in different words"); static int ktls_write_tls_wr(struct tlspcb *tlsp, struct sge_txq *txq, void *dst, struct mbuf *m, struct tcphdr *tcp, struct mbuf *m_tls, u_int available, tcp_seq tcp_seqno, uint32_t *tsopt, u_int pidx, bool set_l2t_idx) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct fw_ulptx_wr *wr; struct ulp_txpkt *txpkt; struct ulptx_sc_memrd *memrd; struct ulptx_idata *idata; struct cpl_tx_sec_pdu *sec_pdu; struct cpl_tx_data *tx_data; struct tls_record_layer *hdr; char *iv, *out; u_int aad_start, aad_stop; u_int auth_start, auth_stop, auth_insert; u_int cipher_start, cipher_stop, iv_offset; u_int imm_len, mss, ndesc, offset, plen, tlen, twr_len, wr_len; u_int fields, nsegs, tx_max_offset, tx_max; bool first_wr, last_wr, using_scratch; ndesc = 0; MPASS(tlsp->txq == txq); first_wr = (tlsp->prev_seq == 0 && tlsp->prev_ack == 0 && tlsp->prev_win == 0); /* * Use the per-txq scratch pad if near the end of the ring to * simplify handling of wrap-around. This uses a simple but * not quite perfect test of using the scratch buffer if we * can't fit a maximal work request in without wrapping. */ using_scratch = (eq->sidx - pidx < SGE_MAX_WR_LEN / EQ_ESIZE); /* Locate the TLS header. */ M_ASSERTEXTPG(m_tls); hdr = (void *)m_tls->m_epg_hdr; plen = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - m_tls->m_epg_trllen; /* Determine how much of the TLS record to send. */ tlen = ktls_tcp_payload_length(tlsp, m_tls); if (tlen <= m_tls->m_epg_hdrlen) { /* * For requests that only want to send the TLS header, * send a tunnelled packet as immediate data. 
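 * ktls_write_tunnel_packet() builds that packet.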
*/ #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: tid %d header-only TLS record %u", __func__, tlsp->tid, (u_int)m_tls->m_epg_seqno); #endif return (ktls_write_tunnel_packet(txq, dst, m, m_tls, available, tcp_seqno, pidx)); } if (tlen < plen) { plen = tlen; offset = ktls_payload_offset(tlsp, m_tls); #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d short TLS record %u with offset %u", __func__, tlsp->tid, (u_int)m_tls->m_epg_seqno, offset); #endif if (m_tls->m_next == NULL && (tcp->th_flags & TH_FIN) != 0) { txq->kern_tls_fin_short++; #ifdef INVARIANTS panic("%s: FIN on short TLS record", __func__); #endif } } else offset = 0; /* * This is the last work request for a given TLS mbuf chain if * it is the last mbuf in the chain and FIN is not set. If * FIN is set, then ktls_write_tcp_fin() will write out the * last work request. */ last_wr = m_tls->m_next == NULL && (tcp->th_flags & TH_FIN) == 0; /* * The host stack may ask us to not send part of the start of * a TLS record. (For example, the stack might have * previously sent a "short" TLS record and might later send * down an mbuf that requests to send the remainder of the TLS * record.) The crypto engine must process a TLS record from * the beginning if computing a GCM tag or HMAC, so we always * send the TLS record from the beginning as input to the * crypto engine and via CPL_TX_DATA to TP. However, TP will * drop individual packets after they have been chopped up * into MSS-sized chunks if the entire sequence range of those * packets is less than SND_UNA. SND_UNA is computed as * TX_MAX - SND_UNA_RAW. Thus, use the offset stored in * m_data to set TX_MAX to the first byte in the TCP sequence * space the host actually wants us to send and set * SND_UNA_RAW to 0. * * If the host sends us back to back requests that span the * trailer of a single TLS record (first request ends "in" the * trailer and second request starts at the next byte but * still "in" the trailer), the initial bytes of the trailer * that the first request drops will not be retransmitted. If * the host uses the same requests when retransmitting the * connection will hang. To handle this, always transmit the * full trailer for a request that begins "in" the trailer * (the second request in the example above). This should * also help to avoid retransmits for the common case. * * A similar condition exists when using CBC for back to back * requests that span a single AES block. The first request * will be truncated to end at the end of the previous AES * block. To handle this, always begin transmission at the * start of the current AES block. */ tx_max_offset = mtod(m_tls, vm_offset_t); if (tx_max_offset > TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - m_tls->m_epg_trllen) { /* Always send the full trailer. */ tx_max_offset = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - m_tls->m_epg_trllen; } if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_CBC && tx_max_offset > TLS_HEADER_LENGTH) { /* Always send all of the first AES block. */ tx_max_offset = TLS_HEADER_LENGTH + rounddown(tx_max_offset - TLS_HEADER_LENGTH, AES_BLOCK_LEN); } tx_max = tcp_seqno + tx_max_offset; /* * Update TCB fields. Reserve space for the FW_ULPTX_WR header * but don't populate it until we know how many field updates * are required. 
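 * Each update is a CPL_SET_TCB_FIELD wrapped in a ULP_TXPKT, emitted
 * by write_set_tcb_field_ulp() and counted in 'fields'.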
*/ if (using_scratch) wr = (void *)txq->ss; else wr = dst; out = (void *)(wr + 1); fields = 0; if (set_l2t_idx) { KASSERT(m->m_next == m_tls, ("trying to set L2T_IX for subsequent TLS WR")); #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: tid %d set L2T_IX to %d", __func__, tlsp->tid, tlsp->l2te->idx); #endif write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_L2T_IX, V_TCB_L2T_IX(M_TCB_L2T_IX), V_TCB_L2T_IX(tlsp->l2te->idx)); out += roundup2(LEN__SET_TCB_FIELD_ULP, 16); fields++; } if (tsopt != NULL && tlsp->prev_tsecr != ntohl(tsopt[1])) { KASSERT(m->m_next == m_tls, ("trying to set T_RTSEQ_RECENT for subsequent TLS WR")); #ifdef VERBOSE_TRACES CTR2(KTR_CXGBE, "%s: tid %d wrote updated T_RTSEQ_RECENT", __func__, tlsp->tid); #endif write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_T_RTSEQ_RECENT, V_TCB_T_RTSEQ_RECENT(M_TCB_T_RTSEQ_RECENT), V_TCB_T_RTSEQ_RECENT(ntohl(tsopt[1]))); out += roundup2(LEN__SET_TCB_FIELD_ULP, 16); fields++; tlsp->prev_tsecr = ntohl(tsopt[1]); } if (first_wr || tlsp->prev_seq != tx_max) { KASSERT(m->m_next == m_tls, ("trying to set TX_MAX for subsequent TLS WR")); #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d setting TX_MAX to %u (tcp_seqno %u)", __func__, tlsp->tid, tx_max, tcp_seqno); #endif write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_TX_MAX, V_TCB_TX_MAX(M_TCB_TX_MAX), V_TCB_TX_MAX(tx_max)); out += roundup2(LEN__SET_TCB_FIELD_ULP, 16); fields++; } /* * If there is data to drop at the beginning of this TLS * record or if this is a retransmit, * reset SND_UNA_RAW to 0 so that SND_UNA == TX_MAX. */ if (tlsp->prev_seq != tx_max || mtod(m_tls, vm_offset_t) != 0) { KASSERT(m->m_next == m_tls, ("trying to clear SND_UNA_RAW for subsequent TLS WR")); #ifdef VERBOSE_TRACES CTR2(KTR_CXGBE, "%s: tid %d clearing SND_UNA_RAW", __func__, tlsp->tid); #endif write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_SND_UNA_RAW, V_TCB_SND_UNA_RAW(M_TCB_SND_UNA_RAW), V_TCB_SND_UNA_RAW(0)); out += roundup2(LEN__SET_TCB_FIELD_ULP, 16); fields++; } /* * Store the expected sequence number of the next byte after * this record. */ tlsp->prev_seq = tcp_seqno + tlen; if (first_wr || tlsp->prev_ack != ntohl(tcp->th_ack)) { KASSERT(m->m_next == m_tls, ("trying to set RCV_NXT for subsequent TLS WR")); write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_RCV_NXT, V_TCB_RCV_NXT(M_TCB_RCV_NXT), V_TCB_RCV_NXT(ntohl(tcp->th_ack))); out += roundup2(LEN__SET_TCB_FIELD_ULP, 16); fields++; tlsp->prev_ack = ntohl(tcp->th_ack); } if (first_wr || tlsp->prev_win != ntohs(tcp->th_win)) { KASSERT(m->m_next == m_tls, ("trying to set RCV_WND for subsequent TLS WR")); write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_RCV_WND, V_TCB_RCV_WND(M_TCB_RCV_WND), V_TCB_RCV_WND(ntohs(tcp->th_win))); out += roundup2(LEN__SET_TCB_FIELD_ULP, 16); fields++; tlsp->prev_win = ntohs(tcp->th_win); } /* Use cached value for first record in chain. */ if (m->m_next == m_tls) nsegs = mbuf_nsegs(m); else nsegs = sglist_count_mbuf_epg(m_tls, m_tls->m_epg_hdrlen + offset, plen - (m_tls->m_epg_hdrlen + offset)); /* Calculate the size of the TLS work request. */ twr_len = ktls_base_wr_size(tlsp); imm_len = 0; if (offset == 0) imm_len += m_tls->m_epg_hdrlen; if (plen == tlen) imm_len += AES_BLOCK_LEN; twr_len += roundup2(imm_len, 16); twr_len += ktls_sgl_size(nsegs); /* * If any field updates were required, determine if they can * be included in the TLS work request. If not, use the * FW_ULPTX_WR work request header at 'wr' as a dedicated work * request for the field updates and start a new work request * for the TLS work request afterward. 
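 * The combined layout is used only when the total fits in
 * SGE_MAX_WR_LEN and the tlst.combo_wrs tunable is enabled.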
*/ if (fields != 0) { wr_len = fields * roundup2(LEN__SET_TCB_FIELD_ULP, 16); if (twr_len + wr_len <= SGE_MAX_WR_LEN && tlsp->sc->tlst.combo_wrs) { wr_len += twr_len; txpkt = (void *)out; } else { wr_len += sizeof(*wr); wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR)); wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA | V_FW_WR_LEN16(wr_len / 16)); wr->cookie = 0; /* * If we were using scratch space, copy the * field updates work request to the ring. */ if (using_scratch) { out = dst; copy_to_txd(eq, txq->ss, &out, wr_len); } ndesc = howmany(wr_len, EQ_ESIZE); MPASS(ndesc <= available); txq->raw_wrs++; txsd = &txq->sdesc[pidx]; txsd->m = NULL; txsd->desc_used = ndesc; IDXINCR(pidx, ndesc, eq->sidx); dst = &eq->desc[pidx]; /* * Determine if we should use scratch space * for the TLS work request based on the * available space after advancing pidx for * the field updates work request. */ wr_len = twr_len; using_scratch = (eq->sidx - pidx < howmany(wr_len, EQ_ESIZE)); if (using_scratch) wr = (void *)txq->ss; else wr = dst; txpkt = (void *)(wr + 1); } } else { wr_len = twr_len; txpkt = (void *)out; } wr_len = roundup2(wr_len, 16); MPASS(ndesc + howmany(wr_len, EQ_ESIZE) <= available); /* FW_ULPTX_WR */ wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR)); wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA | V_FW_WR_LEN16(wr_len / 16)); wr->cookie = 0; /* ULP_TXPKT */ txpkt->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DATAMODIFY(0) | V_ULP_TXPKT_CHANNELID(tlsp->vi->pi->port_id) | V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(txq->eq.cntxt_id) | V_ULP_TXPKT_RO(1)); txpkt->len = htobe32(howmany(twr_len - sizeof(*wr), 16)); /* ULPTX_IDATA sub-command */ idata = (void *)(txpkt + 1); idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) | V_ULP_TX_SC_MORE(1)); idata->len = sizeof(struct cpl_tx_sec_pdu); /* * The key context, CPL_TX_DATA, and immediate data are part * of this ULPTX_IDATA when using an inline key. When reading * the key from memory, the CPL_TX_DATA and immediate data are * part of a separate ULPTX_IDATA. */ if (tlsp->inline_key) idata->len += tlsp->tx_key_info_size + sizeof(struct cpl_tx_data) + imm_len; idata->len = htobe32(idata->len); /* CPL_TX_SEC_PDU */ sec_pdu = (void *)(idata + 1); /* * For short records, AAD is counted as header data in SCMD0, * the IV is next followed by a cipher region for the payload. */ if (plen == tlen) { aad_start = 0; aad_stop = 0; iv_offset = 1; auth_start = 0; auth_stop = 0; auth_insert = 0; cipher_start = AES_BLOCK_LEN + 1; cipher_stop = 0; sec_pdu->pldlen = htobe32(16 + plen - (m_tls->m_epg_hdrlen + offset)); /* These two flits are actually a CPL_TLS_TX_SCMD_FMT. */ sec_pdu->seqno_numivs = tlsp->scmd0_short.seqno_numivs; sec_pdu->ivgen_hdrlen = htobe32( tlsp->scmd0_short.ivgen_hdrlen | V_SCMD_HDR_LEN(offset == 0 ? m_tls->m_epg_hdrlen : 0)); txq->kern_tls_short++; } else { /* * AAD is TLS header. IV is after AAD. The cipher region * starts after the IV. See comments in ccr_authenc() and * ccr_gmac() in t4_crypto.c regarding cipher and auth * start/stop values. */ aad_start = 1; aad_stop = TLS_HEADER_LENGTH; iv_offset = TLS_HEADER_LENGTH + 1; cipher_start = m_tls->m_epg_hdrlen + 1; if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) { cipher_stop = 0; auth_start = cipher_start; auth_stop = 0; auth_insert = 0; } else { cipher_stop = 0; auth_start = cipher_start; auth_stop = 0; auth_insert = 0; } sec_pdu->pldlen = htobe32(plen); /* These two flits are actually a CPL_TLS_TX_SCMD_FMT. 
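 *
 * As a concrete (illustrative) picture for TLS 1.2 AES-GCM, where
 * m_epg_hdrlen is 13 (5-byte record header plus 8-byte explicit
 * nonce), the values chosen above lay out the regions as:
 *
 *	AAD:         bytes 1-5 (the record header)
 *	IV:          inserted at offset 6 (the explicit nonce)
 *	cipher/auth: start at byte 14, right after the record header
 *
 * The zero stop/insert values are taken by the engine to mean
 * "through the end of the payload" / "append the tag at the end";
 * the referenced comments in t4_crypto.c spell out the details.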
*/ sec_pdu->seqno_numivs = tlsp->scmd0.seqno_numivs; sec_pdu->ivgen_hdrlen = tlsp->scmd0.ivgen_hdrlen; if (mtod(m_tls, vm_offset_t) == 0) txq->kern_tls_full++; else txq->kern_tls_partial++; } sec_pdu->op_ivinsrtofst = htobe32( V_CPL_TX_SEC_PDU_OPCODE(CPL_TX_SEC_PDU) | V_CPL_TX_SEC_PDU_CPLLEN(2) | V_CPL_TX_SEC_PDU_PLACEHOLDER(0) | V_CPL_TX_SEC_PDU_IVINSRTOFST(iv_offset)); sec_pdu->aadstart_cipherstop_hi = htobe32( V_CPL_TX_SEC_PDU_AADSTART(aad_start) | V_CPL_TX_SEC_PDU_AADSTOP(aad_stop) | V_CPL_TX_SEC_PDU_CIPHERSTART(cipher_start) | V_CPL_TX_SEC_PDU_CIPHERSTOP_HI(cipher_stop >> 4)); sec_pdu->cipherstop_lo_authinsert = htobe32( V_CPL_TX_SEC_PDU_CIPHERSTOP_LO(cipher_stop & 0xf) | V_CPL_TX_SEC_PDU_AUTHSTART(auth_start) | V_CPL_TX_SEC_PDU_AUTHSTOP(auth_stop) | V_CPL_TX_SEC_PDU_AUTHINSERT(auth_insert)); sec_pdu->scmd1 = htobe64(m_tls->m_epg_seqno); /* Key context */ out = (void *)(sec_pdu + 1); if (tlsp->inline_key) { memcpy(out, &tlsp->keyctx, tlsp->tx_key_info_size); out += tlsp->tx_key_info_size; } else { /* ULPTX_SC_MEMRD to read key context. */ memrd = (void *)out; memrd->cmd_to_len = htobe32(V_ULPTX_CMD(ULP_TX_SC_MEMRD) | V_ULP_TX_SC_MORE(1) | V_ULPTX_LEN16(tlsp->tx_key_info_size >> 4)); memrd->addr = htobe32(tlsp->tx_key_addr >> 5); /* ULPTX_IDATA for CPL_TX_DATA and TLS header. */ idata = (void *)(memrd + 1); idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) | V_ULP_TX_SC_MORE(1)); idata->len = htobe32(sizeof(struct cpl_tx_data) + imm_len); out = (void *)(idata + 1); } /* CPL_TX_DATA */ tx_data = (void *)out; OPCODE_TID(tx_data) = htonl(MK_OPCODE_TID(CPL_TX_DATA, tlsp->tid)); if (m->m_pkthdr.csum_flags & CSUM_TSO) { mss = m->m_pkthdr.tso_segsz; tlsp->prev_mss = mss; } else if (tlsp->prev_mss != 0) mss = tlsp->prev_mss; else - mss = tlsp->vi->ifp->if_mtu - + mss = if_getmtu(tlsp->vi->ifp) - (m->m_pkthdr.l3hlen + m->m_pkthdr.l4hlen); if (offset == 0) { tx_data->len = htobe32(V_TX_DATA_MSS(mss) | V_TX_LENGTH(tlen)); tx_data->rsvd = htobe32(tcp_seqno); } else { tx_data->len = htobe32(V_TX_DATA_MSS(mss) | V_TX_LENGTH(tlen - (m_tls->m_epg_hdrlen + offset))); tx_data->rsvd = htobe32(tcp_seqno + m_tls->m_epg_hdrlen + offset); } tx_data->flags = htobe32(F_TX_BYPASS); if (last_wr && tcp->th_flags & TH_PUSH) tx_data->flags |= htobe32(F_TX_PUSH | F_TX_SHOVE); /* Populate the TLS header */ out = (void *)(tx_data + 1); if (offset == 0) { memcpy(out, m_tls->m_epg_hdr, m_tls->m_epg_hdrlen); out += m_tls->m_epg_hdrlen; } /* AES IV for a short record. */ if (plen == tlen) { iv = out; if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) { memcpy(iv, tlsp->keyctx.u.txhdr.txsalt, SALT_SIZE); memcpy(iv + 4, hdr + 1, 8); *(uint32_t *)(iv + 12) = htobe32(2 + offset / AES_BLOCK_LEN); } else memcpy(iv, hdr + 1, AES_BLOCK_LEN); out += AES_BLOCK_LEN; } if (imm_len % 16 != 0) { /* Zero pad to an 8-byte boundary. */ memset(out, 0, 8 - (imm_len % 8)); out += 8 - (imm_len % 8); /* * Insert a ULP_TX_SC_NOOP if needed so the SGL is * 16-byte aligned. 
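 *
 * Worked examples (hypothetical sizes): with imm_len = 13 (a TLS 1.2
 * header sent as immediate data), the memset above pads 3 bytes to
 * reach 16, 13 % 16 = 13 > 8, and no NOOP is needed.  With
 * imm_len = 5 (a header-only 5-byte TLS header), the pad brings us
 * to 8, 5 % 16 = 5 <= 8, and the 8-byte NOOP sub-command below
 * restores 16-byte alignment ahead of the SGL.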
*/ if (imm_len % 16 <= 8) { idata = (void *)out; idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); idata->len = htobe32(0); out = (void *)(idata + 1); } } /* SGL for record payload */ sglist_reset(txq->gl); if (sglist_append_mbuf_epg(txq->gl, m_tls, m_tls->m_epg_hdrlen + offset, plen - (m_tls->m_epg_hdrlen + offset)) != 0) { #ifdef INVARIANTS panic("%s: failed to append sglist", __func__); #endif } write_gl_to_buf(txq->gl, out); if (using_scratch) { out = dst; copy_to_txd(eq, txq->ss, &out, wr_len); } ndesc += howmany(wr_len, EQ_ESIZE); MPASS(ndesc <= available); txq->kern_tls_records++; txq->kern_tls_octets += tlen - mtod(m_tls, vm_offset_t); if (mtod(m_tls, vm_offset_t) != 0) { if (offset == 0) txq->kern_tls_waste += mtod(m_tls, vm_offset_t); else txq->kern_tls_waste += mtod(m_tls, vm_offset_t) - (m_tls->m_epg_hdrlen + offset); } txsd = &txq->sdesc[pidx]; if (last_wr) txsd->m = m; else txsd->m = NULL; txsd->desc_used = howmany(wr_len, EQ_ESIZE); return (ndesc); } static int ktls_write_tcp_fin(struct sge_txq *txq, void *dst, struct mbuf *m, u_int available, tcp_seq tcp_seqno, u_int pidx) { struct tx_sdesc *txsd; struct fw_eth_tx_pkt_wr *wr; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; uint64_t ctrl1; int len16, ndesc, pktlen; struct ether_header *eh; struct ip *ip, newip; struct ip6_hdr *ip6, newip6; struct tcphdr *tcp, newtcp; caddr_t out; TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m); wr = dst; pktlen = m->m_len; ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen; len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16); ndesc = tx_len16_to_desc(len16); MPASS(ndesc <= available); /* Firmware work request header */ wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3 = 0; cpl = (void *)(wr + 1); /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); out = (void *)(cpl + 1); /* Copy over Ethernet header. */ eh = mtod(m, struct ether_header *); copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen); /* Fixup length in IP header and copy out. */ if (ntohs(eh->ether_type) == ETHERTYPE_IP) { ip = (void *)((char *)eh + m->m_pkthdr.l2hlen); newip = *ip; newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen); copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip)); if (m->m_pkthdr.l3hlen > sizeof(*ip)) copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out, m->m_pkthdr.l3hlen - sizeof(*ip)); ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) | V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) | V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen); } else { ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen); newip6 = *ip6; newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen); copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6)); MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6)); ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) | V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) | V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen); } cpl->ctrl1 = htobe64(ctrl1); txq->txcsum++; /* Set sequence number in TCP header. */ tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen); newtcp = *tcp; newtcp.th_seq = htonl(tcp_seqno); copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp)); /* Copy rest of packet. 
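 * The Ethernet, IP (length fixed up), and TCP (sequence number fixed
 * up) headers have already been copied at this point, so the
 * remainder is just whatever TCP options follow the fixed TCP header
 * in this header-only FIN segment.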
*/ copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, m->m_len - (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp))); txq->imm_wrs++; txq->txpkt_wrs++; txq->kern_tls_fin++; txsd = &txq->sdesc[pidx]; txsd->m = m; txsd->desc_used = ndesc; return (ndesc); } int t6_ktls_write_wr(struct sge_txq *txq, void *dst, struct mbuf *m, u_int available) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct tlspcb *tlsp; struct tcphdr *tcp; struct mbuf *m_tls; struct ether_header *eh; tcp_seq tcp_seqno; u_int ndesc, pidx, totdesc; uint16_t vlan_tag; bool has_fin, set_l2t_idx; void *tsopt; M_ASSERTPKTHDR(m); MPASS(m->m_pkthdr.snd_tag != NULL); tlsp = mst_to_tls(m->m_pkthdr.snd_tag); totdesc = 0; eh = mtod(m, struct ether_header *); tcp = (struct tcphdr *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen); pidx = eq->pidx; has_fin = (tcp->th_flags & TH_FIN) != 0; /* * If this TLS record has a FIN, then we will send any * requested options as part of the FIN packet. */ if (!has_fin && ktls_has_tcp_options(tcp)) { ndesc = ktls_write_tcp_options(txq, dst, m, available, pidx); totdesc += ndesc; IDXINCR(pidx, ndesc, eq->sidx); dst = &eq->desc[pidx]; #ifdef VERBOSE_TRACES CTR2(KTR_CXGBE, "%s: tid %d wrote TCP options packet", __func__, tlsp->tid); #endif } /* * Allocate a new L2T entry if necessary. This may write out * a work request to the txq. */ if (m->m_flags & M_VLANTAG) vlan_tag = m->m_pkthdr.ether_vtag; else vlan_tag = 0xfff; set_l2t_idx = false; if (tlsp->l2te == NULL || tlsp->l2te->vlan != vlan_tag || memcmp(tlsp->l2te->dmac, eh->ether_dhost, ETHER_ADDR_LEN) != 0) { set_l2t_idx = true; if (tlsp->l2te) t4_l2t_release(tlsp->l2te); tlsp->l2te = t4_l2t_alloc_tls(tlsp->sc, txq, dst, &ndesc, vlan_tag, tlsp->vi->pi->lport, eh->ether_dhost); if (tlsp->l2te == NULL) CXGBE_UNIMPLEMENTED("failed to allocate TLS L2TE"); if (ndesc != 0) { MPASS(ndesc <= available - totdesc); txq->raw_wrs++; txsd = &txq->sdesc[pidx]; txsd->m = NULL; txsd->desc_used = ndesc; totdesc += ndesc; IDXINCR(pidx, ndesc, eq->sidx); dst = &eq->desc[pidx]; } } /* * Iterate over each TLS record constructing a work request * for that record. */ for (m_tls = m->m_next; m_tls != NULL; m_tls = m_tls->m_next) { MPASS(m_tls->m_flags & M_EXTPG); /* * Determine the initial TCP sequence number for this * record. */ tsopt = NULL; if (m_tls == m->m_next) { tcp_seqno = ntohl(tcp->th_seq) - mtod(m_tls, vm_offset_t); if (tlsp->using_timestamps) tsopt = ktls_find_tcp_timestamps(tcp); } else { MPASS(mtod(m_tls, vm_offset_t) == 0); tcp_seqno = tlsp->prev_seq; } ndesc = ktls_write_tls_wr(tlsp, txq, dst, m, tcp, m_tls, available - totdesc, tcp_seqno, tsopt, pidx, set_l2t_idx); totdesc += ndesc; IDXINCR(pidx, ndesc, eq->sidx); dst = &eq->desc[pidx]; /* Only need to set the L2T index once. */ set_l2t_idx = false; } if (has_fin) { /* * If the TCP header for this chain has FIN sent, then * explicitly send a packet that has FIN set. This * will also have PUSH set if requested. This assumes * we sent at least one TLS record work request and * uses the TCP sequence number after that reqeust as * the sequence number for the FIN packet. 
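 * (Each record work request above stored tcp_seqno + tlen in
 * tlsp->prev_seq, so the FIN segment built by ktls_write_tcp_fin()
 * is stamped with the sequence number immediately following the
 * last byte handed to the crypto engine.)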
*/ ndesc = ktls_write_tcp_fin(txq, dst, m, available, tlsp->prev_seq, pidx); totdesc += ndesc; } MPASS(totdesc <= available); return (totdesc); } static void t6_tls_tag_free(struct m_snd_tag *mst) { struct adapter *sc; struct tlspcb *tlsp; tlsp = mst_to_tls(mst); sc = tlsp->sc; CTR2(KTR_CXGBE, "%s: tid %d", __func__, tlsp->tid); if (tlsp->l2te) t4_l2t_release(tlsp->l2te); if (tlsp->tid >= 0) release_tid(sc, tlsp->tid, tlsp->ctrlq); if (tlsp->ce) t4_release_clip_entry(sc, tlsp->ce); if (tlsp->tx_key_addr >= 0) t4_free_tls_keyid(sc, tlsp->tx_key_addr); zfree(tlsp, M_CXGBE); } void t6_ktls_modload(void) { t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, ktls_act_open_rpl, CPL_COOKIE_KERN_TLS); } void t6_ktls_modunload(void) { t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, NULL, CPL_COOKIE_KERN_TLS); } #else int -t6_tls_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, +t6_tls_tag_alloc(if_t ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **pt) { return (ENXIO); } int t6_ktls_parse_pkt(struct mbuf *m) { return (EINVAL); } int t6_ktls_write_wr(struct sge_txq *txq, void *dst, struct mbuf *m, u_int available) { panic("can't happen"); } void t6_ktls_modload(void) { } void t6_ktls_modunload(void) { } #endif diff --git a/sys/dev/cxgbe/iw_cxgbe/cm.c b/sys/dev/cxgbe/iw_cxgbe/cm.c index 08d507524c01..457290875583 100644 --- a/sys/dev/cxgbe/iw_cxgbe/cm.c +++ b/sys/dev/cxgbe/iw_cxgbe/cm.c @@ -1,3058 +1,3058 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009-2013, 2016 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct sge_iq; struct rss_header; struct cpl_set_tcb_rpl; #include #include "offload.h" #include "tom/t4_tom.h" #define TOEPCB(so) ((struct toepcb *)(sototcpcb((so))->t_toe)) #include "iw_cxgbe.h" #include #include #include #include #include static spinlock_t req_lock; static TAILQ_HEAD(c4iw_ep_list, c4iw_ep_common) req_list; static struct work_struct c4iw_task; static struct workqueue_struct *c4iw_taskq; static LIST_HEAD(err_cqe_list); static spinlock_t err_cqe_lock; static LIST_HEAD(listen_port_list); static DEFINE_MUTEX(listen_port_mutex); static void process_req(struct work_struct *ctx); static void start_ep_timer(struct c4iw_ep *ep); static int stop_ep_timer(struct c4iw_ep *ep); static int set_tcpinfo(struct c4iw_ep *ep); static void process_timeout(struct c4iw_ep *ep); static void process_err_cqes(void); static void *alloc_ep(int size, gfp_t flags); static void close_socket(struct socket *so); static int send_mpa_req(struct c4iw_ep *ep); static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen); static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen); static void close_complete_upcall(struct c4iw_ep *ep, int status); static int send_abort(struct c4iw_ep *ep); static void peer_close_upcall(struct c4iw_ep *ep); static void peer_abort_upcall(struct c4iw_ep *ep); static void connect_reply_upcall(struct c4iw_ep *ep, int status); static int connect_request_upcall(struct c4iw_ep *ep); static void established_upcall(struct c4iw_ep *ep); static int process_mpa_reply(struct c4iw_ep *ep); static int process_mpa_request(struct c4iw_ep *ep); static void process_peer_close(struct c4iw_ep *ep); static void process_conn_error(struct c4iw_ep *ep); static void process_close_complete(struct c4iw_ep *ep); static void ep_timeout(unsigned long arg); static void setiwsockopt(struct socket *so); static void init_iwarp_socket(struct socket *so, void *arg); static void uninit_iwarp_socket(struct socket *so); static void process_data(struct c4iw_ep *ep); static void process_connected(struct c4iw_ep *ep); static int c4iw_so_upcall(struct socket *so, void *arg, int waitflag); static void process_socket_event(struct c4iw_ep *ep); static void release_ep_resources(struct c4iw_ep *ep); static int process_terminate(struct c4iw_ep *ep); static int terminate(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m); static int add_ep_to_req_list(struct c4iw_ep *ep, int ep_events); static struct listen_port_info * add_ep_to_listenlist(struct c4iw_listen_ep *lep); static int rem_ep_from_listenlist(struct c4iw_listen_ep *lep); static struct c4iw_listen_ep * find_real_listen_ep(struct c4iw_listen_ep *master_lep, struct socket *so); static int get_ifnet_from_raddr(struct sockaddr_storage *raddr, - struct ifnet **ifp); + if_t *ifp); static void process_newconn(struct c4iw_listen_ep *master_lep, struct socket *new_so); #define START_EP_TIMER(ep) \ do { \ CTR3(KTR_IW_CXGBE, "start_ep_timer (%s:%d) ep %p", \ __func__, __LINE__, (ep)); \ start_ep_timer(ep); \ } while (0) #define STOP_EP_TIMER(ep) \ ({ \ CTR3(KTR_IW_CXGBE, "stop_ep_timer (%s:%d) ep %p", \ __func__, __LINE__, (ep)); \ stop_ep_timer(ep); \ }) #define GET_LOCAL_ADDR(pladdr, so) \ do { \ struct sockaddr_storage *__a = NULL; \ struct inpcb *__inp = sotoinpcb(so); \ 
KASSERT(__inp != NULL, \ ("GET_LOCAL_ADDR(%s):so:%p, inp = NULL", __func__, so)); \ if (__inp->inp_vflag & INP_IPV4) \ in_getsockaddr(so, (struct sockaddr **)&__a); \ else \ in6_getsockaddr(so, (struct sockaddr **)&__a); \ *(pladdr) = *__a; \ free(__a, M_SONAME); \ } while (0) #define GET_REMOTE_ADDR(praddr, so) \ do { \ struct sockaddr_storage *__a = NULL; \ struct inpcb *__inp = sotoinpcb(so); \ KASSERT(__inp != NULL, \ ("GET_REMOTE_ADDR(%s):so:%p, inp = NULL", __func__, so)); \ if (__inp->inp_vflag & INP_IPV4) \ in_getpeeraddr(so, (struct sockaddr **)&__a); \ else \ in6_getpeeraddr(so, (struct sockaddr **)&__a); \ *(praddr) = *__a; \ free(__a, M_SONAME); \ } while (0) static char *states[] = { "idle", "listen", "connecting", "mpa_wait_req", "mpa_req_sent", "mpa_req_rcvd", "mpa_rep_sent", "fpdu_mode", "aborting", "closing", "moribund", "dead", NULL, }; static void deref_cm_id(struct c4iw_ep_common *epc) { epc->cm_id->rem_ref(epc->cm_id); epc->cm_id = NULL; set_bit(CM_ID_DEREFED, &epc->history); } static void ref_cm_id(struct c4iw_ep_common *epc) { set_bit(CM_ID_REFED, &epc->history); epc->cm_id->add_ref(epc->cm_id); } static void deref_qp(struct c4iw_ep *ep) { c4iw_qp_rem_ref(&ep->com.qp->ibqp); clear_bit(QP_REFERENCED, &ep->com.flags); set_bit(QP_DEREFED, &ep->com.history); } static void ref_qp(struct c4iw_ep *ep) { set_bit(QP_REFERENCED, &ep->com.flags); set_bit(QP_REFED, &ep->com.history); c4iw_qp_add_ref(&ep->com.qp->ibqp); } /* allocated per TCP port while listening */ struct listen_port_info { uint16_t port_num; /* TCP port address */ struct list_head list; /* belongs to listen_port_list */ struct list_head lep_list; /* per port lep list */ uint32_t refcnt; /* number of lep's listening */ }; /* * Following two lists are used to manage INADDR_ANY listeners: * 1)listen_port_list * 2)lep_list * * Below is the INADDR_ANY listener lists overview on a system with a two port * adapter: * |------------------| * |listen_port_list | * |------------------| * | * | |-----------| |-----------| * | | port_num:X| | port_num:X| * |--------------|-list------|-------|-list------|-------.... * | lep_list----| | lep_list----| * | refcnt | | | refcnt | | * | | | | | | * | | | | | | * |-----------| | |-----------| | * | | * | | * | | * | | lep1 lep2 * | | |----------------| |----------------| * | |----| listen_ep_list |----| listen_ep_list | * | |----------------| |----------------| * | * | * | lep1 lep2 * | |----------------| |----------------| * |---| listen_ep_list |----| listen_ep_list | * |----------------| |----------------| * * Because of two port adapter, the number of lep's are two(lep1 & lep2) for * each TCP port number. * * Here 'lep1' is always marked as Master lep, because solisten() is always * called through first lep. * */ static struct listen_port_info * add_ep_to_listenlist(struct c4iw_listen_ep *lep) { uint16_t port; struct listen_port_info *port_info = NULL; struct sockaddr_storage *laddr = &lep->com.local_addr; port = (laddr->ss_family == AF_INET) ? 
((struct sockaddr_in *)laddr)->sin_port : ((struct sockaddr_in6 *)laddr)->sin6_port; mutex_lock(&listen_port_mutex); list_for_each_entry(port_info, &listen_port_list, list) if (port_info->port_num == port) goto found_port; port_info = malloc(sizeof(*port_info), M_CXGBE, M_WAITOK); port_info->port_num = port; port_info->refcnt = 0; list_add_tail(&port_info->list, &listen_port_list); INIT_LIST_HEAD(&port_info->lep_list); found_port: port_info->refcnt++; list_add_tail(&lep->listen_ep_list, &port_info->lep_list); mutex_unlock(&listen_port_mutex); return port_info; } static int rem_ep_from_listenlist(struct c4iw_listen_ep *lep) { uint16_t port; struct listen_port_info *port_info = NULL; struct sockaddr_storage *laddr = &lep->com.local_addr; int refcnt = 0; port = (laddr->ss_family == AF_INET) ? ((struct sockaddr_in *)laddr)->sin_port : ((struct sockaddr_in6 *)laddr)->sin6_port; mutex_lock(&listen_port_mutex); /* get the port_info structure based on the lep's port address */ list_for_each_entry(port_info, &listen_port_list, list) { if (port_info->port_num == port) { port_info->refcnt--; refcnt = port_info->refcnt; /* remove the current lep from the listen list */ list_del(&lep->listen_ep_list); if (port_info->refcnt == 0) { /* Remove this entry from the list as there * are no more listeners for this port_num. */ list_del(&port_info->list); kfree(port_info); } break; } } mutex_unlock(&listen_port_mutex); return refcnt; } /* * Find the lep that belongs to the ifnet on which the SYN frame was received. */ struct c4iw_listen_ep * find_real_listen_ep(struct c4iw_listen_ep *master_lep, struct socket *so) { struct adapter *adap = NULL; struct c4iw_listen_ep *lep = NULL; - struct ifnet *ifp = NULL, *hw_ifp = NULL; + if_t ifp = NULL, hw_ifp = NULL; struct listen_port_info *port_info = NULL; int i = 0, found_portinfo = 0, found_lep = 0; uint16_t port; /* * STEP 1: Figure out 'ifp' of the physical interface, not pseudo * interfaces like vlan, lagg, etc.. * TBD: lagg support, lagg + vlan support. */ ifp = TOEPCB(so)->l2te->ifp; - if (ifp->if_type == IFT_L2VLAN) { + if (if_gettype(ifp) == IFT_L2VLAN) { hw_ifp = VLAN_TRUNKDEV(ifp); if (hw_ifp == NULL) { CTR4(KTR_IW_CXGBE, "%s: Failed to get parent ifnet of " "vlan ifnet %p, sock %p, master_lep %p", __func__, ifp, so, master_lep); return (NULL); } } else hw_ifp = ifp; /* STEP 2: Find 'port_info' with listener local port address. */ port = (master_lep->com.local_addr.ss_family == AF_INET) ? ((struct sockaddr_in *)&master_lep->com.local_addr)->sin_port : ((struct sockaddr_in6 *)&master_lep->com.local_addr)->sin6_port; mutex_lock(&listen_port_mutex); list_for_each_entry(port_info, &listen_port_list, list) if (port_info->port_num == port) { found_portinfo =1; break; } if (!found_portinfo) goto out; /* STEP 3: Traverse through list of lep's that are bound to the current * TCP port address and find the lep that belongs to the ifnet on which * the SYN frame was received. */ list_for_each_entry(lep, &port_info->lep_list, listen_ep_list) { adap = lep->com.dev->rdev.adap; for_each_port(adap, i) { if (hw_ifp == adap->port[i]->vi[0].ifp) { found_lep =1; goto out; } } } out: mutex_unlock(&listen_port_mutex); return found_lep ? 
lep : (NULL); } static void process_timeout(struct c4iw_ep *ep) { struct c4iw_qp_attributes attrs = {0}; int abort = 1; CTR4(KTR_IW_CXGBE, "%s ep :%p, tid:%u, state %d", __func__, ep, ep->hwtid, ep->com.state); set_bit(TIMEDOUT, &ep->com.history); switch (ep->com.state) { case MPA_REQ_SENT: connect_reply_upcall(ep, -ETIMEDOUT); break; case MPA_REQ_WAIT: case MPA_REQ_RCVD: case MPA_REP_SENT: case FPDU_MODE: break; case CLOSING: case MORIBUND: if (ep->com.cm_id && ep->com.qp) { attrs.next_state = C4IW_QP_STATE_ERROR; c4iw_modify_qp(ep->com.dev, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } close_complete_upcall(ep, -ETIMEDOUT); break; case ABORTING: case DEAD: /* * These states are expected if the ep timed out at the same * time as another thread was calling stop_ep_timer(). * So we silently do nothing for these states. */ abort = 0; break; default: CTR4(KTR_IW_CXGBE, "%s unexpected state ep %p tid %u state %u" , __func__, ep, ep->hwtid, ep->com.state); abort = 0; } if (abort) c4iw_ep_disconnect(ep, 1, GFP_KERNEL); c4iw_put_ep(&ep->com); return; } struct cqe_list_entry { struct list_head entry; struct c4iw_dev *rhp; struct t4_cqe err_cqe; }; static void process_err_cqes(void) { unsigned long flag; struct cqe_list_entry *cle; spin_lock_irqsave(&err_cqe_lock, flag); while (!list_empty(&err_cqe_list)) { struct list_head *tmp; tmp = err_cqe_list.next; list_del(tmp); tmp->next = tmp->prev = NULL; spin_unlock_irqrestore(&err_cqe_lock, flag); cle = list_entry(tmp, struct cqe_list_entry, entry); c4iw_ev_dispatch(cle->rhp, &cle->err_cqe); free(cle, M_CXGBE); spin_lock_irqsave(&err_cqe_lock, flag); } spin_unlock_irqrestore(&err_cqe_lock, flag); return; } static void process_req(struct work_struct *ctx) { struct c4iw_ep_common *epc; unsigned long flag; int ep_events; process_err_cqes(); spin_lock_irqsave(&req_lock, flag); while (!TAILQ_EMPTY(&req_list)) { epc = TAILQ_FIRST(&req_list); TAILQ_REMOVE(&req_list, epc, entry); epc->entry.tqe_prev = NULL; ep_events = epc->ep_events; epc->ep_events = 0; spin_unlock_irqrestore(&req_lock, flag); mutex_lock(&epc->mutex); CTR5(KTR_IW_CXGBE, "%s: so %p, ep %p, ep_state %s events 0x%x", __func__, epc->so, epc, states[epc->state], ep_events); if (ep_events & C4IW_EVENT_TERM) process_terminate((struct c4iw_ep *)epc); if (ep_events & C4IW_EVENT_TIMEOUT) process_timeout((struct c4iw_ep *)epc); if (ep_events & C4IW_EVENT_SOCKET) process_socket_event((struct c4iw_ep *)epc); mutex_unlock(&epc->mutex); c4iw_put_ep(epc); process_err_cqes(); spin_lock_irqsave(&req_lock, flag); } spin_unlock_irqrestore(&req_lock, flag); } /* * XXX: doesn't belong here in the iWARP driver. * XXX: assumes that the connection was offloaded by cxgbe/t4_tom if TF_TOE is * set. Is this a valid assumption for active open? 
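 * In practice the TF_TOE check below, performed under the inpcb
 * write lock, is what enforces that assumption: if the connection is
 * not offloaded we return EINVAL instead of dereferencing a bogus
 * toepcb.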
*/ static int set_tcpinfo(struct c4iw_ep *ep) { struct socket *so = ep->com.so; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; struct toepcb *toep; int rc = 0; INP_WLOCK(inp); tp = intotcpcb(inp); if ((tp->t_flags & TF_TOE) == 0) { rc = EINVAL; log(LOG_ERR, "%s: connection not offloaded (so %p, ep %p)\n", __func__, so, ep); goto done; } toep = TOEPCB(so); ep->hwtid = toep->tid; ep->snd_seq = tp->snd_nxt; ep->rcv_seq = tp->rcv_nxt; done: INP_WUNLOCK(inp); return (rc); } static int -get_ifnet_from_raddr(struct sockaddr_storage *raddr, struct ifnet **ifp) +get_ifnet_from_raddr(struct sockaddr_storage *raddr, if_t *ifp) { int err = 0; struct nhop_object *nh; if (raddr->ss_family == AF_INET) { struct sockaddr_in *raddr4 = (struct sockaddr_in *)raddr; nh = fib4_lookup(RT_DEFAULT_FIB, raddr4->sin_addr, 0, NHR_NONE, 0); } else { struct sockaddr_in6 *raddr6 = (struct sockaddr_in6 *)raddr; struct in6_addr addr6; uint32_t scopeid; memset(&addr6, 0, sizeof(addr6)); in6_splitscope((struct in6_addr *)&raddr6->sin6_addr, &addr6, &scopeid); nh = fib6_lookup(RT_DEFAULT_FIB, &addr6, scopeid, NHR_NONE, 0); } if (nh == NULL) err = EHOSTUNREACH; else *ifp = nh->nh_ifp; CTR2(KTR_IW_CXGBE, "%s: return: %d", __func__, err); return err; } static void close_socket(struct socket *so) { uninit_iwarp_socket(so); soclose(so); } static void process_peer_close(struct c4iw_ep *ep) { struct c4iw_qp_attributes attrs = {0}; int disconnect = 1; int release = 0; CTR4(KTR_IW_CXGBE, "%s:ppcB ep %p so %p state %s", __func__, ep, ep->com.so, states[ep->com.state]); switch (ep->com.state) { case MPA_REQ_WAIT: CTR2(KTR_IW_CXGBE, "%s:ppc1 %p MPA_REQ_WAIT DEAD", __func__, ep); /* Fallthrough */ case MPA_REQ_SENT: CTR2(KTR_IW_CXGBE, "%s:ppc2 %p MPA_REQ_SENT DEAD", __func__, ep); ep->com.state = DEAD; connect_reply_upcall(ep, -ECONNABORTED); disconnect = 0; STOP_EP_TIMER(ep); close_socket(ep->com.so); deref_cm_id(&ep->com); release = 1; break; case MPA_REQ_RCVD: /* * We're gonna mark this puppy DEAD, but keep * the reference on it until the ULP accepts or * rejects the CR. 
*/ CTR2(KTR_IW_CXGBE, "%s:ppc3 %p MPA_REQ_RCVD CLOSING", __func__, ep); ep->com.state = CLOSING; break; case MPA_REP_SENT: CTR2(KTR_IW_CXGBE, "%s:ppc4 %p MPA_REP_SENT CLOSING", __func__, ep); ep->com.state = CLOSING; break; case FPDU_MODE: CTR2(KTR_IW_CXGBE, "%s:ppc5 %p FPDU_MODE CLOSING", __func__, ep); START_EP_TIMER(ep); ep->com.state = CLOSING; attrs.next_state = C4IW_QP_STATE_CLOSING; c4iw_modify_qp(ep->com.dev, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); peer_close_upcall(ep); break; case ABORTING: CTR2(KTR_IW_CXGBE, "%s:ppc6 %p ABORTING (disconn)", __func__, ep); disconnect = 0; break; case CLOSING: CTR2(KTR_IW_CXGBE, "%s:ppc7 %p CLOSING MORIBUND", __func__, ep); ep->com.state = MORIBUND; disconnect = 0; break; case MORIBUND: CTR2(KTR_IW_CXGBE, "%s:ppc8 %p MORIBUND DEAD", __func__, ep); STOP_EP_TIMER(ep); if (ep->com.cm_id && ep->com.qp) { attrs.next_state = C4IW_QP_STATE_IDLE; c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } close_socket(ep->com.so); close_complete_upcall(ep, 0); ep->com.state = DEAD; release = 1; disconnect = 0; break; case DEAD: CTR2(KTR_IW_CXGBE, "%s:ppc9 %p DEAD (disconn)", __func__, ep); disconnect = 0; break; default: panic("%s: ep %p state %d", __func__, ep, ep->com.state); break; } if (disconnect) { CTR2(KTR_IW_CXGBE, "%s:ppca %p", __func__, ep); c4iw_ep_disconnect(ep, 0, M_NOWAIT); } if (release) { CTR2(KTR_IW_CXGBE, "%s:ppcb %p", __func__, ep); c4iw_put_ep(&ep->com); } CTR2(KTR_IW_CXGBE, "%s:ppcE %p", __func__, ep); return; } static void process_conn_error(struct c4iw_ep *ep) { struct c4iw_qp_attributes attrs = {0}; int ret; int state; state = ep->com.state; CTR5(KTR_IW_CXGBE, "%s:pceB ep %p so %p so->so_error %u state %s", __func__, ep, ep->com.so, ep->com.so->so_error, states[ep->com.state]); switch (state) { case MPA_REQ_WAIT: STOP_EP_TIMER(ep); c4iw_put_ep(&ep->parent_ep->com); break; case MPA_REQ_SENT: STOP_EP_TIMER(ep); connect_reply_upcall(ep, -ECONNRESET); break; case MPA_REP_SENT: ep->com.rpl_err = ECONNRESET; CTR1(KTR_IW_CXGBE, "waking up ep %p", ep); break; case MPA_REQ_RCVD: break; case MORIBUND: case CLOSING: STOP_EP_TIMER(ep); /*FALLTHROUGH*/ case FPDU_MODE: if (ep->com.cm_id && ep->com.qp) { attrs.next_state = C4IW_QP_STATE_ERROR; ret = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); if (ret) log(LOG_ERR, "%s - qp <- error failed!\n", __func__); } peer_abort_upcall(ep); break; case ABORTING: break; case DEAD: CTR2(KTR_IW_CXGBE, "%s so_error %d IN DEAD STATE!!!!", __func__, ep->com.so->so_error); return; default: panic("%s: ep %p state %d", __func__, ep, state); break; } if (state != ABORTING) { close_socket(ep->com.so); ep->com.state = DEAD; c4iw_put_ep(&ep->com); } CTR2(KTR_IW_CXGBE, "%s:pceE %p", __func__, ep); return; } static void process_close_complete(struct c4iw_ep *ep) { struct c4iw_qp_attributes attrs = {0}; int release = 0; CTR4(KTR_IW_CXGBE, "%s:pccB ep %p so %p state %s", __func__, ep, ep->com.so, states[ep->com.state]); /* The cm_id may be null if we failed to connect */ set_bit(CLOSE_CON_RPL, &ep->com.history); switch (ep->com.state) { case CLOSING: CTR2(KTR_IW_CXGBE, "%s:pcc1 %p CLOSING MORIBUND", __func__, ep); ep->com.state = MORIBUND; break; case MORIBUND: CTR2(KTR_IW_CXGBE, "%s:pcc1 %p MORIBUND DEAD", __func__, ep); STOP_EP_TIMER(ep); if ((ep->com.cm_id) && (ep->com.qp)) { CTR2(KTR_IW_CXGBE, "%s:pcc2 %p QP_STATE_IDLE", __func__, ep); attrs.next_state = C4IW_QP_STATE_IDLE; c4iw_modify_qp(ep->com.dev, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } 
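		/*
		 * Both half-closes are complete at this point: the QP (if
		 * any) was just moved to IDLE, so the socket can be torn
		 * down, IW_CM_EVENT_CLOSE delivered through
		 * close_complete_upcall(), and the final reference dropped
		 * via release_ep_resources() once 'release' is acted on
		 * below.
		 */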
close_socket(ep->com.so); close_complete_upcall(ep, 0); ep->com.state = DEAD; release = 1; break; case ABORTING: CTR2(KTR_IW_CXGBE, "%s:pcc5 %p ABORTING", __func__, ep); break; case DEAD: CTR2(KTR_IW_CXGBE, "%s:pcc6 %p DEAD", __func__, ep); break; default: CTR2(KTR_IW_CXGBE, "%s:pcc7 %p unknown ep state", __func__, ep); panic("%s:pcc6 %p unknown ep state", __func__, ep); break; } if (release) { CTR2(KTR_IW_CXGBE, "%s:pcc8 %p", __func__, ep); release_ep_resources(ep); } CTR2(KTR_IW_CXGBE, "%s:pccE %p", __func__, ep); return; } static void setiwsockopt(struct socket *so) { int rc; struct sockopt sopt; int on = 1; sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_TCP; sopt.sopt_name = TCP_NODELAY; sopt.sopt_val = (caddr_t)&on; sopt.sopt_valsize = sizeof on; sopt.sopt_td = NULL; rc = -sosetopt(so, &sopt); if (rc) { log(LOG_ERR, "%s: can't set TCP_NODELAY on so %p (%d)\n", __func__, so, rc); } } static void init_iwarp_socket(struct socket *so, void *arg) { if (SOLISTENING(so)) { SOLISTEN_LOCK(so); solisten_upcall_set(so, c4iw_so_upcall, arg); so->so_state |= SS_NBIO; SOLISTEN_UNLOCK(so); } else { SOCKBUF_LOCK(&so->so_rcv); soupcall_set(so, SO_RCV, c4iw_so_upcall, arg); so->so_state |= SS_NBIO; SOCKBUF_UNLOCK(&so->so_rcv); } } static void uninit_iwarp_socket(struct socket *so) { if (SOLISTENING(so)) { SOLISTEN_LOCK(so); solisten_upcall_set(so, NULL, NULL); SOLISTEN_UNLOCK(so); } else { SOCKBUF_LOCK(&so->so_rcv); soupcall_clear(so, SO_RCV); SOCKBUF_UNLOCK(&so->so_rcv); } } static void process_data(struct c4iw_ep *ep) { int ret = 0; int disconnect = 0; struct c4iw_qp_attributes attrs = {0}; CTR5(KTR_IW_CXGBE, "%s: so %p, ep %p, state %s, sbused %d", __func__, ep->com.so, ep, states[ep->com.state], sbused(&ep->com.so->so_rcv)); switch (ep->com.state) { case MPA_REQ_SENT: disconnect = process_mpa_reply(ep); break; case MPA_REQ_WAIT: disconnect = process_mpa_request(ep); if (disconnect) /* Refered in process_newconn() */ c4iw_put_ep(&ep->parent_ep->com); break; case FPDU_MODE: MPASS(ep->com.qp != NULL); attrs.next_state = C4IW_QP_STATE_TERMINATE; ret = c4iw_modify_qp(ep->com.dev, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); if (ret != -EINPROGRESS) disconnect = 1; break; default: log(LOG_ERR, "%s: Unexpected streaming data. 
ep %p, " "state %d, so %p, so_state 0x%x, sbused %u\n", __func__, ep, ep->com.state, ep->com.so, ep->com.so->so_state, sbused(&ep->com.so->so_rcv)); break; } if (disconnect) c4iw_ep_disconnect(ep, disconnect == 2, GFP_KERNEL); } static void process_connected(struct c4iw_ep *ep) { struct socket *so = ep->com.so; if ((so->so_state & SS_ISCONNECTED) && !so->so_error) { if (send_mpa_req(ep)) goto err; } else { connect_reply_upcall(ep, -so->so_error); goto err; } return; err: close_socket(so); ep->com.state = DEAD; c4iw_put_ep(&ep->com); return; } static inline int c4iw_zero_addr(struct sockaddr *addr) { struct in6_addr *ip6; if (addr->sa_family == AF_INET) return (((struct sockaddr_in *)addr)->sin_addr.s_addr == 0); else { ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr; return (ip6->s6_addr32[0] | ip6->s6_addr32[1] | ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0; } } static inline int c4iw_loopback_addr(struct sockaddr *addr) { if (addr->sa_family == AF_INET) return IN_LOOPBACK( ntohl(((struct sockaddr_in *) addr)->sin_addr.s_addr)); else return IN6_IS_ADDR_LOOPBACK( &((struct sockaddr_in6 *) addr)->sin6_addr); } static inline int c4iw_any_addr(struct sockaddr *addr) { return c4iw_zero_addr(addr) || c4iw_loopback_addr(addr); } static void process_newconn(struct c4iw_listen_ep *master_lep, struct socket *new_so) { struct c4iw_listen_ep *real_lep = NULL; struct c4iw_ep *new_ep = NULL; struct sockaddr_in *remote = NULL; int ret = 0; MPASS(new_so != NULL); if (c4iw_any_addr((struct sockaddr *)&master_lep->com.local_addr)) { /* Here we need to find the 'real_lep' that belongs to the * incomming socket's network interface, such that the newly * created 'ep' can be attached to the real 'lep'. */ real_lep = find_real_listen_ep(master_lep, new_so); if (real_lep == NULL) { CTR2(KTR_IW_CXGBE, "%s: Could not find the real listen " "ep for sock: %p", __func__, new_so); log(LOG_ERR,"%s: Could not find the real listen ep for " "sock: %p\n", __func__, new_so); /* FIXME: properly free the 'new_so' in failure case. * Use of soabort() and soclose() are not legal * here(before soaccept()). */ return; } } else /* for Non-Wildcard address, master_lep is always the real_lep */ real_lep = master_lep; new_ep = alloc_ep(sizeof(*new_ep), GFP_KERNEL); CTR6(KTR_IW_CXGBE, "%s: master_lep %p, real_lep: %p, new ep %p, " "listening so %p, new so %p", __func__, master_lep, real_lep, new_ep, master_lep->com.so, new_so); new_ep->com.dev = real_lep->com.dev; new_ep->com.so = new_so; new_ep->com.cm_id = NULL; new_ep->com.thread = real_lep->com.thread; new_ep->parent_ep = real_lep; GET_LOCAL_ADDR(&new_ep->com.local_addr, new_so); GET_REMOTE_ADDR(&new_ep->com.remote_addr, new_so); c4iw_get_ep(&real_lep->com); init_timer(&new_ep->timer); new_ep->com.state = MPA_REQ_WAIT; setiwsockopt(new_so); ret = soaccept(new_so, (struct sockaddr **)&remote); if (ret != 0) { CTR4(KTR_IW_CXGBE, "%s:listen sock:%p, new sock:%p, ret:%d", __func__, master_lep->com.so, new_so, ret); if (remote != NULL) free(remote, M_SONAME); soclose(new_so); c4iw_put_ep(&new_ep->com); c4iw_put_ep(&real_lep->com); return; } free(remote, M_SONAME); START_EP_TIMER(new_ep); /* MPA request might have been queued up on the socket already, so we * initialize the socket/upcall_handler under lock to prevent processing * MPA request on another thread(via process_req()) simultaneously. */ c4iw_get_ep(&new_ep->com); /* Dereferenced at the end below, this is to avoid freeing of ep before ep unlock. 
*/ mutex_lock(&new_ep->com.mutex); init_iwarp_socket(new_so, &new_ep->com); ret = process_mpa_request(new_ep); if (ret) { /* ABORT */ c4iw_ep_disconnect(new_ep, 1, GFP_KERNEL); c4iw_put_ep(&real_lep->com); } mutex_unlock(&new_ep->com.mutex); c4iw_put_ep(&new_ep->com); return; } static int add_ep_to_req_list(struct c4iw_ep *ep, int new_ep_event) { unsigned long flag; spin_lock_irqsave(&req_lock, flag); if (ep && ep->com.so) { ep->com.ep_events |= new_ep_event; if (!ep->com.entry.tqe_prev) { c4iw_get_ep(&ep->com); TAILQ_INSERT_TAIL(&req_list, &ep->com, entry); queue_work(c4iw_taskq, &c4iw_task); } } spin_unlock_irqrestore(&req_lock, flag); return (0); } static int c4iw_so_upcall(struct socket *so, void *arg, int waitflag) { struct c4iw_ep *ep = arg; CTR6(KTR_IW_CXGBE, "%s: so %p, so_state 0x%x, ep %p, ep_state %s, tqe_prev %p", __func__, so, so->so_state, ep, states[ep->com.state], ep->com.entry.tqe_prev); MPASS(ep->com.so == so); /* * Wake up any threads waiting in rdma_init()/rdma_fini(), * with locks held. */ if (so->so_error || (ep->com.dev->rdev.flags & T4_FATAL_ERROR)) c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); add_ep_to_req_list(ep, C4IW_EVENT_SOCKET); return (SU_OK); } static int terminate(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rdma_terminate *cpl = mtod(m, const void *); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct socket *so; struct c4iw_ep *ep; INP_WLOCK(toep->inp); so = inp_inpcbtosocket(toep->inp); ep = so->so_rcv.sb_upcallarg; INP_WUNLOCK(toep->inp); CTR3(KTR_IW_CXGBE, "%s: so %p, ep %p", __func__, so, ep); add_ep_to_req_list(ep, C4IW_EVENT_TERM); return 0; } static void process_socket_event(struct c4iw_ep *ep) { int state = ep->com.state; struct socket *so = ep->com.so; if (ep->com.state == DEAD) { CTR3(KTR_IW_CXGBE, "%s: Pending socket event discarded " "ep %p ep_state %s", __func__, ep, states[state]); return; } CTR6(KTR_IW_CXGBE, "process_socket_event: so %p, so_state 0x%x, " "so_err %d, sb_state 0x%x, ep %p, ep_state %s", so, so->so_state, so->so_error, so->so_rcv.sb_state, ep, states[state]); if (state == CONNECTING) { process_connected(ep); return; } if (state == LISTEN) { struct c4iw_listen_ep *lep = (struct c4iw_listen_ep *)ep; struct socket *listen_so = so, *new_so = NULL; int error = 0; SOLISTEN_LOCK(listen_so); do { error = solisten_dequeue(listen_so, &new_so, SOCK_NONBLOCK); if (error) { CTR4(KTR_IW_CXGBE, "%s: lep %p listen_so %p " "error %d", __func__, lep, listen_so, error); return; } process_newconn(lep, new_so); /* solisten_dequeue() unlocks while return, so aquire * lock again for sol_qlen and also for next iteration. */ SOLISTEN_LOCK(listen_so); } while (listen_so->sol_qlen); SOLISTEN_UNLOCK(listen_so); return; } /* connection error */ if (so->so_error) { process_conn_error(ep); return; } /* peer close */ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) && state <= CLOSING) { process_peer_close(ep); /* * check whether socket disconnect event is pending before * returning. Fallthrough if yes. */ if (!(so->so_state & SS_ISDISCONNECTED)) return; } /* close complete */ if (so->so_state & SS_ISDISCONNECTED) { process_close_complete(ep); return; } /* rx data */ if (sbused(&ep->com.so->so_rcv)) { process_data(ep); return; } /* Socket events for 'MPA Request Received' and 'Close Complete' * were already processed earlier in their previous events handlers. * Hence, these socket events are skipped. * And any other socket events must have handled above. 
*/ MPASS((ep->com.state == MPA_REQ_RCVD) || (ep->com.state == MORIBUND)); if ((ep->com.state != MPA_REQ_RCVD) && (ep->com.state != MORIBUND)) log(LOG_ERR, "%s: Unprocessed socket event so %p, " "so_state 0x%x, so_err %d, sb_state 0x%x, ep %p, ep_state %s\n", __func__, so, so->so_state, so->so_error, so->so_rcv.sb_state, ep, states[state]); } SYSCTL_NODE(_hw, OID_AUTO, iw_cxgbe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "iw_cxgbe driver parameters"); static int dack_mode = 0; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, dack_mode, CTLFLAG_RWTUN, &dack_mode, 0, "Delayed ack mode (default = 0)"); int c4iw_max_read_depth = 8; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, c4iw_max_read_depth, CTLFLAG_RWTUN, &c4iw_max_read_depth, 0, "Per-connection max ORD/IRD (default = 8)"); static int enable_tcp_timestamps; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, enable_tcp_timestamps, CTLFLAG_RWTUN, &enable_tcp_timestamps, 0, "Enable tcp timestamps (default = 0)"); static int enable_tcp_sack; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, enable_tcp_sack, CTLFLAG_RWTUN, &enable_tcp_sack, 0, "Enable tcp SACK (default = 0)"); static int enable_tcp_window_scaling = 1; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, enable_tcp_window_scaling, CTLFLAG_RWTUN, &enable_tcp_window_scaling, 0, "Enable tcp window scaling (default = 1)"); int c4iw_debug = 0; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, c4iw_debug, CTLFLAG_RWTUN, &c4iw_debug, 0, "Enable debug logging (default = 0)"); static int peer2peer = 1; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, peer2peer, CTLFLAG_RWTUN, &peer2peer, 0, "Support peer2peer ULPs (default = 1)"); static int p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, p2p_type, CTLFLAG_RWTUN, &p2p_type, 0, "RDMAP opcode to use for the RTR message: 1 = RDMA_READ 0 = RDMA_WRITE (default 1)"); static int ep_timeout_secs = 60; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, ep_timeout_secs, CTLFLAG_RWTUN, &ep_timeout_secs, 0, "CM Endpoint operation timeout in seconds (default = 60)"); static int mpa_rev = 1; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, mpa_rev, CTLFLAG_RWTUN, &mpa_rev, 0, "MPA Revision, 0 supports amso1100, 1 is RFC5044 spec compliant, 2 is IETF MPA Peer Connect Draft compliant (default = 1)"); static int markers_enabled; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, markers_enabled, CTLFLAG_RWTUN, &markers_enabled, 0, "Enable MPA MARKERS (default(0) = disabled)"); static int crc_enabled = 1; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, crc_enabled, CTLFLAG_RWTUN, &crc_enabled, 0, "Enable MPA CRC (default(1) = enabled)"); static int rcv_win = 256 * 1024; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, rcv_win, CTLFLAG_RWTUN, &rcv_win, 0, "TCP receive window in bytes (default = 256KB)"); static int snd_win = 128 * 1024; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, snd_win, CTLFLAG_RWTUN, &snd_win, 0, "TCP send window in bytes (default = 128KB)"); int use_dsgl = 1; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, use_dsgl, CTLFLAG_RWTUN, &use_dsgl, 0, "Use DSGL for PBL/FastReg (default=1)"); int inline_threshold = 128; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, inline_threshold, CTLFLAG_RWTUN, &inline_threshold, 0, "inline vs dsgl threshold (default=128)"); static int reuseaddr = 0; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, reuseaddr, CTLFLAG_RWTUN, &reuseaddr, 0, "Enable SO_REUSEADDR & SO_REUSEPORT socket options on all iWARP client connections(default = 0)"); static void start_ep_timer(struct c4iw_ep *ep) { if (timer_pending(&ep->timer)) { CTR2(KTR_IW_CXGBE, "%s: ep %p, already started", __func__, ep); printk(KERN_ERR "%s timer already started! 
ep %p\n", __func__, ep); return; } clear_bit(TIMEOUT, &ep->com.flags); c4iw_get_ep(&ep->com); ep->timer.expires = jiffies + ep_timeout_secs * HZ; ep->timer.data = (unsigned long)ep; ep->timer.function = ep_timeout; add_timer(&ep->timer); } static int stop_ep_timer(struct c4iw_ep *ep) { del_timer_sync(&ep->timer); if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { c4iw_put_ep(&ep->com); return 0; } return 1; } static void * alloc_ep(int size, gfp_t gfp) { struct c4iw_ep_common *epc; epc = kzalloc(size, gfp); if (epc == NULL) return (NULL); kref_init(&epc->kref); mutex_init(&epc->mutex); c4iw_init_wr_wait(&epc->wr_wait); return (epc); } void _c4iw_free_ep(struct kref *kref) { struct c4iw_ep *ep; #if defined(KTR) || defined(INVARIANTS) struct c4iw_ep_common *epc; #endif ep = container_of(kref, struct c4iw_ep, com.kref); #if defined(KTR) || defined(INVARIANTS) epc = &ep->com; #endif KASSERT(!epc->entry.tqe_prev, ("%s epc %p still on req list", __func__, epc)); if (test_bit(QP_REFERENCED, &ep->com.flags)) deref_qp(ep); CTR4(KTR_IW_CXGBE, "%s: ep %p, history 0x%lx, flags 0x%lx", __func__, ep, epc->history, epc->flags); kfree(ep); } static void release_ep_resources(struct c4iw_ep *ep) { CTR2(KTR_IW_CXGBE, "%s:rerB %p", __func__, ep); set_bit(RELEASE_RESOURCES, &ep->com.flags); c4iw_put_ep(&ep->com); CTR2(KTR_IW_CXGBE, "%s:rerE %p", __func__, ep); } static int send_mpa_req(struct c4iw_ep *ep) { int mpalen; struct mpa_message *mpa; struct mpa_v2_conn_params mpa_v2_params; struct mbuf *m; char mpa_rev_to_use = mpa_rev; int err = 0; if (ep->retry_with_mpa_v1) mpa_rev_to_use = 1; mpalen = sizeof(*mpa) + ep->plen; if (mpa_rev_to_use == 2) mpalen += sizeof(struct mpa_v2_conn_params); mpa = malloc(mpalen, M_CXGBE, M_NOWAIT); if (mpa == NULL) { err = -ENOMEM; CTR3(KTR_IW_CXGBE, "%s:smr1 ep: %p , error: %d", __func__, ep, err); goto err; } memset(mpa, 0, mpalen); memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); mpa->flags = (crc_enabled ? MPA_CRC : 0) | (markers_enabled ? MPA_MARKERS : 0) | (mpa_rev_to_use == 2 ? 
MPA_ENHANCED_RDMA_CONN : 0); mpa->private_data_size = htons(ep->plen); mpa->revision = mpa_rev_to_use; if (mpa_rev_to_use == 1) { ep->tried_with_mpa_v1 = 1; ep->retry_with_mpa_v1 = 0; } if (mpa_rev_to_use == 2) { mpa->private_data_size = htons(ntohs(mpa->private_data_size) + sizeof(struct mpa_v2_conn_params)); mpa_v2_params.ird = htons((u16)ep->ird); mpa_v2_params.ord = htons((u16)ep->ord); if (peer2peer) { mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL); if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE) { mpa_v2_params.ord |= htons(MPA_V2_RDMA_WRITE_RTR); } else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) { mpa_v2_params.ord |= htons(MPA_V2_RDMA_READ_RTR); } } memcpy(mpa->private_data, &mpa_v2_params, sizeof(struct mpa_v2_conn_params)); if (ep->plen) { memcpy(mpa->private_data + sizeof(struct mpa_v2_conn_params), ep->mpa_pkt + sizeof(*mpa), ep->plen); } } else { if (ep->plen) memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen); CTR2(KTR_IW_CXGBE, "%s:smr7 %p", __func__, ep); } m = m_getm(NULL, mpalen, M_NOWAIT, MT_DATA); if (m == NULL) { err = -ENOMEM; CTR3(KTR_IW_CXGBE, "%s:smr2 ep: %p , error: %d", __func__, ep, err); free(mpa, M_CXGBE); goto err; } m_copyback(m, 0, mpalen, (void *)mpa); free(mpa, M_CXGBE); err = -sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); if (err) { CTR3(KTR_IW_CXGBE, "%s:smr3 ep: %p , error: %d", __func__, ep, err); goto err; } START_EP_TIMER(ep); ep->com.state = MPA_REQ_SENT; ep->mpa_attr.initiator = 1; CTR3(KTR_IW_CXGBE, "%s:smrE %p, error: %d", __func__, ep, err); return 0; err: connect_reply_upcall(ep, err); CTR3(KTR_IW_CXGBE, "%s:smrE %p, error: %d", __func__, ep, err); return err; } static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen) { int mpalen ; struct mpa_message *mpa; struct mpa_v2_conn_params mpa_v2_params; struct mbuf *m; int err; CTR4(KTR_IW_CXGBE, "%s:smrejB %p %u %d", __func__, ep, ep->hwtid, ep->plen); mpalen = sizeof(*mpa) + plen; if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { mpalen += sizeof(struct mpa_v2_conn_params); CTR4(KTR_IW_CXGBE, "%s:smrej1 %p %u %d", __func__, ep, ep->mpa_attr.version, mpalen); } mpa = malloc(mpalen, M_CXGBE, M_NOWAIT); if (mpa == NULL) return (-ENOMEM); memset(mpa, 0, mpalen); memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); mpa->flags = MPA_REJECT; mpa->revision = mpa_rev; mpa->private_data_size = htons(plen); if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { mpa->flags |= MPA_ENHANCED_RDMA_CONN; mpa->private_data_size = htons(ntohs(mpa->private_data_size) + sizeof(struct mpa_v2_conn_params)); mpa_v2_params.ird = htons(((u16)ep->ird) | (peer2peer ? MPA_V2_PEER2PEER_MODEL : 0)); mpa_v2_params.ord = htons(((u16)ep->ord) | (peer2peer ? (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE ? MPA_V2_RDMA_WRITE_RTR : p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ ? 
MPA_V2_RDMA_READ_RTR : 0) : 0)); memcpy(mpa->private_data, &mpa_v2_params, sizeof(struct mpa_v2_conn_params)); if (ep->plen) memcpy(mpa->private_data + sizeof(struct mpa_v2_conn_params), pdata, plen); CTR5(KTR_IW_CXGBE, "%s:smrej3 %p %d %d %d", __func__, ep, mpa_v2_params.ird, mpa_v2_params.ord, ep->plen); } else if (plen) memcpy(mpa->private_data, pdata, plen); m = m_getm(NULL, mpalen, M_NOWAIT, MT_DATA); if (m == NULL) { free(mpa, M_CXGBE); return (-ENOMEM); } m_copyback(m, 0, mpalen, (void *)mpa); free(mpa, M_CXGBE); err = -sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); if (!err) ep->snd_seq += mpalen; CTR4(KTR_IW_CXGBE, "%s:smrejE %p %u %d", __func__, ep, ep->hwtid, err); return err; } static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen) { int mpalen; struct mpa_message *mpa; struct mbuf *m; struct mpa_v2_conn_params mpa_v2_params; int err; CTR2(KTR_IW_CXGBE, "%s:smrepB %p", __func__, ep); mpalen = sizeof(*mpa) + plen; if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { CTR3(KTR_IW_CXGBE, "%s:smrep1 %p %d", __func__, ep, ep->mpa_attr.version); mpalen += sizeof(struct mpa_v2_conn_params); } mpa = malloc(mpalen, M_CXGBE, M_NOWAIT); if (mpa == NULL) return (-ENOMEM); memset(mpa, 0, sizeof(*mpa)); memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) | (markers_enabled ? MPA_MARKERS : 0); mpa->revision = ep->mpa_attr.version; mpa->private_data_size = htons(plen); if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { mpa->flags |= MPA_ENHANCED_RDMA_CONN; mpa->private_data_size += htons(sizeof(struct mpa_v2_conn_params)); mpa_v2_params.ird = htons((u16)ep->ird); mpa_v2_params.ord = htons((u16)ep->ord); CTR5(KTR_IW_CXGBE, "%s:smrep3 %p %d %d %d", __func__, ep, ep->mpa_attr.version, mpa_v2_params.ird, mpa_v2_params.ord); if (peer2peer && (ep->mpa_attr.p2p_type != FW_RI_INIT_P2PTYPE_DISABLED)) { mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL); if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE) { mpa_v2_params.ord |= htons(MPA_V2_RDMA_WRITE_RTR); CTR5(KTR_IW_CXGBE, "%s:smrep4 %p %d %d %d", __func__, ep, p2p_type, mpa_v2_params.ird, mpa_v2_params.ord); } else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) { mpa_v2_params.ord |= htons(MPA_V2_RDMA_READ_RTR); CTR5(KTR_IW_CXGBE, "%s:smrep5 %p %d %d %d", __func__, ep, p2p_type, mpa_v2_params.ird, mpa_v2_params.ord); } } memcpy(mpa->private_data, &mpa_v2_params, sizeof(struct mpa_v2_conn_params)); if (ep->plen) memcpy(mpa->private_data + sizeof(struct mpa_v2_conn_params), pdata, plen); } else if (plen) memcpy(mpa->private_data, pdata, plen); m = m_getm(NULL, mpalen, M_NOWAIT, MT_DATA); if (m == NULL) { free(mpa, M_CXGBE); return (-ENOMEM); } m_copyback(m, 0, mpalen, (void *)mpa); free(mpa, M_CXGBE); ep->com.state = MPA_REP_SENT; ep->snd_seq += mpalen; err = -sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); CTR3(KTR_IW_CXGBE, "%s:smrepE %p %d", __func__, ep, err); return err; } static void close_complete_upcall(struct c4iw_ep *ep, int status) { struct iw_cm_event event; CTR2(KTR_IW_CXGBE, "%s:ccuB %p", __func__, ep); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CLOSE; event.status = status; if (ep->com.cm_id) { CTR2(KTR_IW_CXGBE, "%s:ccu1 %1", __func__, ep); ep->com.cm_id->event_handler(ep->com.cm_id, &event); deref_cm_id(&ep->com); set_bit(CLOSE_UPCALL, &ep->com.history); } CTR2(KTR_IW_CXGBE, "%s:ccuE %p", __func__, ep); } static int send_abort(struct c4iw_ep *ep) { struct socket *so = ep->com.so; 
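	/*
	 * The code below forces a TCP RST rather than a graceful close:
	 * SO_LINGER with a zero linger time makes the subsequent
	 * soclose() drop the connection immediately.  For illustration
	 * only, the userland equivalent would look roughly like:
	 *
	 *	struct linger l = { .l_onoff = 1, .l_linger = 0 };
	 *	setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
	 *	close(s);	(the peer sees RST instead of FIN)
	 */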
struct sockopt sopt; int rc; struct linger l; CTR5(KTR_IW_CXGBE, "%s ep %p so %p state %s tid %d", __func__, ep, so, states[ep->com.state], ep->hwtid); l.l_onoff = 1; l.l_linger = 0; /* linger_time of 0 forces RST to be sent */ sopt.sopt_dir = SOPT_SET; sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_LINGER; sopt.sopt_val = (caddr_t)&l; sopt.sopt_valsize = sizeof l; sopt.sopt_td = NULL; rc = -sosetopt(so, &sopt); if (rc != 0) { log(LOG_ERR, "%s: sosetopt(%p, linger = 0) failed with %d.\n", __func__, so, rc); } uninit_iwarp_socket(so); soclose(so); set_bit(ABORT_CONN, &ep->com.history); /* * TBD: iw_cxgbe driver should receive ABORT reply for every ABORT * request it has sent. But the current TOE driver is not propagating * this ABORT reply event (via do_abort_rpl) to iw_cxgbe. So as a work- * around de-refererece 'ep' here instead of doing it in abort_rpl() * handler(not yet implemented) of iw_cxgbe driver. */ release_ep_resources(ep); ep->com.state = DEAD; return (0); } static void peer_close_upcall(struct c4iw_ep *ep) { struct iw_cm_event event; CTR2(KTR_IW_CXGBE, "%s:pcuB %p", __func__, ep); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_DISCONNECT; if (ep->com.cm_id) { CTR2(KTR_IW_CXGBE, "%s:pcu1 %p", __func__, ep); ep->com.cm_id->event_handler(ep->com.cm_id, &event); set_bit(DISCONN_UPCALL, &ep->com.history); } CTR2(KTR_IW_CXGBE, "%s:pcuE %p", __func__, ep); } static void peer_abort_upcall(struct c4iw_ep *ep) { struct iw_cm_event event; CTR2(KTR_IW_CXGBE, "%s:pauB %p", __func__, ep); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CLOSE; event.status = -ECONNRESET; if (ep->com.cm_id) { CTR2(KTR_IW_CXGBE, "%s:pau1 %p", __func__, ep); ep->com.cm_id->event_handler(ep->com.cm_id, &event); deref_cm_id(&ep->com); set_bit(ABORT_UPCALL, &ep->com.history); } CTR2(KTR_IW_CXGBE, "%s:pauE %p", __func__, ep); } static void connect_reply_upcall(struct c4iw_ep *ep, int status) { struct iw_cm_event event; CTR3(KTR_IW_CXGBE, "%s:cruB %p, status: %d", __func__, ep, status); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CONNECT_REPLY; event.status = ((status == -ECONNABORTED) || (status == -EPIPE)) ? 
-ECONNRESET : status; event.local_addr = ep->com.local_addr; event.remote_addr = ep->com.remote_addr; if ((status == 0) || (status == -ECONNREFUSED)) { if (!ep->tried_with_mpa_v1) { CTR2(KTR_IW_CXGBE, "%s:cru1 %p", __func__, ep); /* this means MPA_v2 is used */ event.ord = ep->ird; event.ird = ep->ord; event.private_data_len = ep->plen - sizeof(struct mpa_v2_conn_params); event.private_data = ep->mpa_pkt + sizeof(struct mpa_message) + sizeof(struct mpa_v2_conn_params); } else { CTR2(KTR_IW_CXGBE, "%s:cru2 %p", __func__, ep); /* this means MPA_v1 is used */ event.ord = c4iw_max_read_depth; event.ird = c4iw_max_read_depth; event.private_data_len = ep->plen; event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); } } if (ep->com.cm_id) { CTR2(KTR_IW_CXGBE, "%s:cru3 %p", __func__, ep); set_bit(CONN_RPL_UPCALL, &ep->com.history); ep->com.cm_id->event_handler(ep->com.cm_id, &event); } if(status == -ECONNABORTED) { CTR3(KTR_IW_CXGBE, "%s:cruE %p %d", __func__, ep, status); return; } if (status < 0) { CTR3(KTR_IW_CXGBE, "%s:cru4 %p %d", __func__, ep, status); deref_cm_id(&ep->com); } CTR2(KTR_IW_CXGBE, "%s:cruE %p", __func__, ep); } static int connect_request_upcall(struct c4iw_ep *ep) { struct iw_cm_event event; int ret; CTR3(KTR_IW_CXGBE, "%s: ep %p, mpa_v1 %d", __func__, ep, ep->tried_with_mpa_v1); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CONNECT_REQUEST; event.local_addr = ep->com.local_addr; event.remote_addr = ep->com.remote_addr; event.provider_data = ep; if (!ep->tried_with_mpa_v1) { /* this means MPA_v2 is used */ event.ord = ep->ord; event.ird = ep->ird; event.private_data_len = ep->plen - sizeof(struct mpa_v2_conn_params); event.private_data = ep->mpa_pkt + sizeof(struct mpa_message) + sizeof(struct mpa_v2_conn_params); } else { /* this means MPA_v1 is used. Send max supported */ event.ord = c4iw_max_read_depth; event.ird = c4iw_max_read_depth; event.private_data_len = ep->plen; event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); } c4iw_get_ep(&ep->com); ret = ep->parent_ep->com.cm_id->event_handler(ep->parent_ep->com.cm_id, &event); if(ret) { CTR3(KTR_IW_CXGBE, "%s: ep %p, Failure while notifying event to" " IWCM, err:%d", __func__, ep, ret); c4iw_put_ep(&ep->com); } else /* Dereference parent_ep only in success case. * In case of failure, parent_ep is dereferenced by the caller * of process_mpa_request(). */ c4iw_put_ep(&ep->parent_ep->com); set_bit(CONNREQ_UPCALL, &ep->com.history); return ret; } static void established_upcall(struct c4iw_ep *ep) { struct iw_cm_event event; CTR2(KTR_IW_CXGBE, "%s:euB %p", __func__, ep); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_ESTABLISHED; event.ird = ep->ord; event.ord = ep->ird; if (ep->com.cm_id) { CTR2(KTR_IW_CXGBE, "%s:eu1 %p", __func__, ep); ep->com.cm_id->event_handler(ep->com.cm_id, &event); set_bit(ESTAB_UPCALL, &ep->com.history); } CTR2(KTR_IW_CXGBE, "%s:euE %p", __func__, ep); } #define RELAXED_IRD_NEGOTIATION 1 /* * process_mpa_reply - process streaming mode MPA reply * * Returns: * * 0 upon success indicating a connect request was delivered to the ULP * or the mpa request is incomplete but valid so far. * * 1 if a failure requires the caller to close the connection. * * 2 if a failure requires the caller to abort the connection. 
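 *
 * The reply bytes are drained from the socket into ep->mpa_pkt, the MPA
 * header and the negotiated ird/ord values are validated, and on success
 * the QP is bound to the TID and moved to RTS via c4iw_modify_qp().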
*/ static int process_mpa_reply(struct c4iw_ep *ep) { struct mpa_message *mpa; struct mpa_v2_conn_params *mpa_v2_params; u16 plen; u16 resp_ird, resp_ord; u8 rtr_mismatch = 0, insuff_ird = 0; struct c4iw_qp_attributes attrs = {0}; enum c4iw_qp_attr_mask mask; int err; struct mbuf *top, *m; int flags = MSG_DONTWAIT; struct uio uio; int disconnect = 0; CTR2(KTR_IW_CXGBE, "%s:pmrB %p", __func__, ep); /* * Stop mpa timer. If it expired, then * we ignore the MPA reply. process_timeout() * will abort the connection. */ if (STOP_EP_TIMER(ep)) return 0; uio.uio_resid = 1000000; uio.uio_td = ep->com.thread; err = soreceive(ep->com.so, NULL, &uio, &top, NULL, &flags); if (err) { if (err == EWOULDBLOCK) { CTR2(KTR_IW_CXGBE, "%s:pmr1 %p", __func__, ep); START_EP_TIMER(ep); return 0; } err = -err; CTR2(KTR_IW_CXGBE, "%s:pmr2 %p", __func__, ep); goto err; } if (ep->com.so->so_rcv.sb_mb) { CTR2(KTR_IW_CXGBE, "%s:pmr3 %p", __func__, ep); printf("%s data after soreceive called! so %p sb_mb %p top %p\n", __func__, ep->com.so, ep->com.so->so_rcv.sb_mb, top); } m = top; do { CTR2(KTR_IW_CXGBE, "%s:pmr4 %p", __func__, ep); /* * If we get more than the supported amount of private data * then we must fail this connection. */ if (ep->mpa_pkt_len + m->m_len > sizeof(ep->mpa_pkt)) { CTR3(KTR_IW_CXGBE, "%s:pmr5 %p %d", __func__, ep, ep->mpa_pkt_len + m->m_len); err = (-EINVAL); goto err_stop_timer; } /* * copy the new data into our accumulation buffer. */ m_copydata(m, 0, m->m_len, &(ep->mpa_pkt[ep->mpa_pkt_len])); ep->mpa_pkt_len += m->m_len; if (!m->m_next) m = m->m_nextpkt; else m = m->m_next; } while (m); m_freem(top); /* * if we don't even have the mpa message, then bail. */ if (ep->mpa_pkt_len < sizeof(*mpa)) { return 0; } mpa = (struct mpa_message *) ep->mpa_pkt; /* Validate MPA header. */ if (mpa->revision > mpa_rev) { CTR4(KTR_IW_CXGBE, "%s:pmr6 %p %d %d", __func__, ep, mpa->revision, mpa_rev); printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d, " " Received = %d\n", __func__, mpa_rev, mpa->revision); err = -EPROTO; goto err_stop_timer; } if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) { CTR2(KTR_IW_CXGBE, "%s:pmr7 %p", __func__, ep); err = -EPROTO; goto err_stop_timer; } plen = ntohs(mpa->private_data_size); /* * Fail if there's too much private data. */ if (plen > MPA_MAX_PRIVATE_DATA) { CTR2(KTR_IW_CXGBE, "%s:pmr8 %p", __func__, ep); err = -EPROTO; goto err_stop_timer; } /* * If plen does not account for pkt size */ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { CTR2(KTR_IW_CXGBE, "%s:pmr9 %p", __func__, ep); STOP_EP_TIMER(ep); err = -EPROTO; goto err_stop_timer; } ep->plen = (u8) plen; /* * If we don't have all the pdata yet, then bail. * We'll continue process when more data arrives. */ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) { CTR2(KTR_IW_CXGBE, "%s:pmra %p", __func__, ep); return 0; } if (mpa->flags & MPA_REJECT) { CTR2(KTR_IW_CXGBE, "%s:pmrb %p", __func__, ep); err = -ECONNREFUSED; goto err_stop_timer; } /* * If we get here we have accumulated the entire mpa * start reply message including private data. And * the MPA header is valid. */ ep->com.state = FPDU_MODE; ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 
1 : 0; ep->mpa_attr.version = mpa->revision; ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; if (mpa->revision == 2) { CTR2(KTR_IW_CXGBE, "%s:pmrc %p", __func__, ep); ep->mpa_attr.enhanced_rdma_conn = mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0; if (ep->mpa_attr.enhanced_rdma_conn) { CTR2(KTR_IW_CXGBE, "%s:pmrd %p", __func__, ep); mpa_v2_params = (struct mpa_v2_conn_params *) (ep->mpa_pkt + sizeof(*mpa)); resp_ird = ntohs(mpa_v2_params->ird) & MPA_V2_IRD_ORD_MASK; resp_ord = ntohs(mpa_v2_params->ord) & MPA_V2_IRD_ORD_MASK; /* * This is a double-check. Ideally, below checks are * not required since ird/ord stuff has been taken * care of in c4iw_accept_cr */ if (ep->ird < resp_ord) { if (RELAXED_IRD_NEGOTIATION && resp_ord <= ep->com.dev->rdev.adap->params.max_ordird_qp) ep->ird = resp_ord; else insuff_ird = 1; } else if (ep->ird > resp_ord) { ep->ird = resp_ord; } if (ep->ord > resp_ird) { if (RELAXED_IRD_NEGOTIATION) ep->ord = resp_ird; else insuff_ird = 1; } if (insuff_ird) { err = -ENOMEM; ep->ird = resp_ord; ep->ord = resp_ird; } if (ntohs(mpa_v2_params->ird) & MPA_V2_PEER2PEER_MODEL) { CTR2(KTR_IW_CXGBE, "%s:pmrf %p", __func__, ep); if (ntohs(mpa_v2_params->ord) & MPA_V2_RDMA_WRITE_RTR) { CTR2(KTR_IW_CXGBE, "%s:pmrg %p", __func__, ep); ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_RDMA_WRITE; } else if (ntohs(mpa_v2_params->ord) & MPA_V2_RDMA_READ_RTR) { CTR2(KTR_IW_CXGBE, "%s:pmrh %p", __func__, ep); ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ; } } } } else { CTR2(KTR_IW_CXGBE, "%s:pmri %p", __func__, ep); if (mpa->revision == 1) { CTR2(KTR_IW_CXGBE, "%s:pmrj %p", __func__, ep); if (peer2peer) { CTR2(KTR_IW_CXGBE, "%s:pmrk %p", __func__, ep); ep->mpa_attr.p2p_type = p2p_type; } } } if (set_tcpinfo(ep)) { CTR2(KTR_IW_CXGBE, "%s:pmrl %p", __func__, ep); printf("%s set_tcpinfo error\n", __func__); err = -ECONNRESET; goto err; } CTR6(KTR_IW_CXGBE, "%s - crc_enabled = %d, recv_marker_enabled = %d, " "xmit_marker_enabled = %d, version = %d p2p_type = %d", __func__, ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version, ep->mpa_attr.p2p_type); /* * If responder's RTR does not match with that of initiator, assign * FW_RI_INIT_P2PTYPE_DISABLED in mpa attributes so that RTR is not * generated when moving QP to RTS state. 
* A TERM message will be sent after QP has moved to RTS state */ if ((ep->mpa_attr.version == 2) && peer2peer && (ep->mpa_attr.p2p_type != p2p_type)) { CTR2(KTR_IW_CXGBE, "%s:pmrm %p", __func__, ep); ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; rtr_mismatch = 1; } //ep->ofld_txq = TOEPCB(ep->com.so)->ofld_txq; attrs.mpa_attr = ep->mpa_attr; attrs.max_ird = ep->ird; attrs.max_ord = ep->ord; attrs.llp_stream_handle = ep; attrs.next_state = C4IW_QP_STATE_RTS; mask = C4IW_QP_ATTR_NEXT_STATE | C4IW_QP_ATTR_LLP_STREAM_HANDLE | C4IW_QP_ATTR_MPA_ATTR | C4IW_QP_ATTR_MAX_IRD | C4IW_QP_ATTR_MAX_ORD; /* bind QP and TID with INIT_WR */ err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, mask, &attrs, 1); if (err) { CTR2(KTR_IW_CXGBE, "%s:pmrn %p", __func__, ep); goto err; } /* * If responder's RTR requirement did not match with what initiator * supports, generate TERM message */ if (rtr_mismatch) { CTR2(KTR_IW_CXGBE, "%s:pmro %p", __func__, ep); printk(KERN_ERR "%s: RTR mismatch, sending TERM\n", __func__); attrs.layer_etype = LAYER_MPA | DDP_LLP; attrs.ecode = MPA_NOMATCH_RTR; attrs.next_state = C4IW_QP_STATE_TERMINATE; attrs.send_term = 1; err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); err = -ENOMEM; disconnect = 1; goto out; } /* * Generate TERM if initiator IRD is not sufficient for responder * provided ORD. Currently, we do the same behaviour even when * responder provided IRD is also not sufficient as regards to * initiator ORD. */ if (insuff_ird) { CTR2(KTR_IW_CXGBE, "%s:pmrp %p", __func__, ep); printk(KERN_ERR "%s: Insufficient IRD, sending TERM\n", __func__); attrs.layer_etype = LAYER_MPA | DDP_LLP; attrs.ecode = MPA_INSUFF_IRD; attrs.next_state = C4IW_QP_STATE_TERMINATE; attrs.send_term = 1; err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); err = -ENOMEM; disconnect = 1; goto out; } goto out; err_stop_timer: STOP_EP_TIMER(ep); err: disconnect = 2; out: connect_reply_upcall(ep, err); CTR2(KTR_IW_CXGBE, "%s:pmrE %p", __func__, ep); return disconnect; } /* * process_mpa_request - process streaming mode MPA request * * Returns: * * 0 upon success indicating a connect request was delivered to the ULP * or the mpa request is incomplete but valid so far. * * 1 if a failure requires the caller to close the connection. * * 2 if a failure requires the caller to abort the connection. */ static int process_mpa_request(struct c4iw_ep *ep) { struct mpa_message *mpa; struct mpa_v2_conn_params *mpa_v2_params; u16 plen; int flags = MSG_DONTWAIT; int rc; struct iovec iov; struct uio uio; enum c4iw_ep_state state = ep->com.state; CTR3(KTR_IW_CXGBE, "%s: ep %p, state %s", __func__, ep, states[state]); if (state != MPA_REQ_WAIT) return 0; iov.iov_base = &ep->mpa_pkt[ep->mpa_pkt_len]; iov.iov_len = sizeof(ep->mpa_pkt) - ep->mpa_pkt_len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = sizeof(ep->mpa_pkt) - ep->mpa_pkt_len; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = NULL; /* uio.uio_td = ep->com.thread; */ rc = soreceive(ep->com.so, NULL, &uio, NULL, NULL, &flags); if (rc == EAGAIN) return 0; else if (rc) goto err_stop_timer; KASSERT(uio.uio_offset > 0, ("%s: sorecieve on so %p read no data", __func__, ep->com.so)); ep->mpa_pkt_len += uio.uio_offset; /* * If we get more than the supported amount of private data then we must * fail this connection. 
XXX: check so_rcv->sb_cc, or peek with another * soreceive, or increase the size of mpa_pkt by 1 and abort if the last * byte is filled by the soreceive above. */ /* Don't even have the MPA message. Wait for more data to arrive. */ if (ep->mpa_pkt_len < sizeof(*mpa)) return 0; mpa = (struct mpa_message *) ep->mpa_pkt; /* * Validate MPA Header. */ if (mpa->revision > mpa_rev) { log(LOG_ERR, "%s: MPA version mismatch. Local = %d," " Received = %d\n", __func__, mpa_rev, mpa->revision); goto err_stop_timer; } if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) goto err_stop_timer; /* * Fail if there's too much private data. */ plen = ntohs(mpa->private_data_size); if (plen > MPA_MAX_PRIVATE_DATA) goto err_stop_timer; /* * If plen does not account for pkt size */ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) goto err_stop_timer; ep->plen = (u8) plen; /* * If we don't have all the pdata yet, then bail. */ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) return 0; /* * If we get here we have accumulated the entire mpa * start reply message including private data. */ ep->mpa_attr.initiator = 0; ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; ep->mpa_attr.version = mpa->revision; if (mpa->revision == 1) ep->tried_with_mpa_v1 = 1; ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; if (mpa->revision == 2) { ep->mpa_attr.enhanced_rdma_conn = mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0; if (ep->mpa_attr.enhanced_rdma_conn) { mpa_v2_params = (struct mpa_v2_conn_params *) (ep->mpa_pkt + sizeof(*mpa)); ep->ird = ntohs(mpa_v2_params->ird) & MPA_V2_IRD_ORD_MASK; ep->ird = min_t(u32, ep->ird, cur_max_read_depth(ep->com.dev)); ep->ord = ntohs(mpa_v2_params->ord) & MPA_V2_IRD_ORD_MASK; ep->ord = min_t(u32, ep->ord, cur_max_read_depth(ep->com.dev)); CTR3(KTR_IW_CXGBE, "%s initiator ird %u ord %u", __func__, ep->ird, ep->ord); if (ntohs(mpa_v2_params->ird) & MPA_V2_PEER2PEER_MODEL) if (peer2peer) { if (ntohs(mpa_v2_params->ord) & MPA_V2_RDMA_WRITE_RTR) ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_RDMA_WRITE; else if (ntohs(mpa_v2_params->ord) & MPA_V2_RDMA_READ_RTR) ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ; } } } else if (mpa->revision == 1 && peer2peer) ep->mpa_attr.p2p_type = p2p_type; if (set_tcpinfo(ep)) goto err_stop_timer; CTR5(KTR_IW_CXGBE, "%s: crc_enabled = %d, recv_marker_enabled = %d, " "xmit_marker_enabled = %d, version = %d", __func__, ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); ep->com.state = MPA_REQ_RCVD; STOP_EP_TIMER(ep); /* drive upcall */ if (ep->parent_ep->com.state != DEAD) if (connect_request_upcall(ep)) goto err_out; return 0; err_stop_timer: STOP_EP_TIMER(ep); err_out: return 2; } /* * Upcall from the adapter indicating data has been transmitted. * For us its just the single MPA request or reply. We can now free * the skb holding the mpa message. 
*/ int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) { #ifdef KTR int err; #endif struct c4iw_ep *ep = to_ep(cm_id); int abort = 0; mutex_lock(&ep->com.mutex); CTR2(KTR_IW_CXGBE, "%s:crcB %p", __func__, ep); if ((ep->com.state == DEAD) || (ep->com.state != MPA_REQ_RCVD)) { CTR2(KTR_IW_CXGBE, "%s:crc1 %p", __func__, ep); mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); return -ECONNRESET; } set_bit(ULP_REJECT, &ep->com.history); if (mpa_rev == 0) { CTR2(KTR_IW_CXGBE, "%s:crc2 %p", __func__, ep); abort = 1; } else { CTR2(KTR_IW_CXGBE, "%s:crc3 %p", __func__, ep); abort = send_mpa_reject(ep, pdata, pdata_len); } STOP_EP_TIMER(ep); #ifdef KTR err = c4iw_ep_disconnect(ep, abort != 0, GFP_KERNEL); #else c4iw_ep_disconnect(ep, abort != 0, GFP_KERNEL); #endif mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); CTR3(KTR_IW_CXGBE, "%s:crc4 %p, err: %d", __func__, ep, err); return 0; } int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) { int err; struct c4iw_qp_attributes attrs = {0}; enum c4iw_qp_attr_mask mask; struct c4iw_ep *ep = to_ep(cm_id); struct c4iw_dev *h = to_c4iw_dev(cm_id->device); struct c4iw_qp *qp = get_qhp(h, conn_param->qpn); int abort = 0; mutex_lock(&ep->com.mutex); CTR2(KTR_IW_CXGBE, "%s:cacB %p", __func__, ep); if ((ep->com.state == DEAD) || (ep->com.state != MPA_REQ_RCVD)) { CTR2(KTR_IW_CXGBE, "%s:cac1 %p", __func__, ep); err = -ECONNRESET; goto err_out; } BUG_ON(!qp); set_bit(ULP_ACCEPT, &ep->com.history); if ((conn_param->ord > c4iw_max_read_depth) || (conn_param->ird > c4iw_max_read_depth)) { CTR2(KTR_IW_CXGBE, "%s:cac2 %p", __func__, ep); err = -EINVAL; goto err_abort; } if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { CTR2(KTR_IW_CXGBE, "%s:cac3 %p", __func__, ep); if (conn_param->ord > ep->ird) { if (RELAXED_IRD_NEGOTIATION) { conn_param->ord = ep->ird; } else { ep->ird = conn_param->ird; ep->ord = conn_param->ord; send_mpa_reject(ep, conn_param->private_data, conn_param->private_data_len); err = -ENOMEM; goto err_abort; } } if (conn_param->ird < ep->ord) { if (RELAXED_IRD_NEGOTIATION && ep->ord <= h->rdev.adap->params.max_ordird_qp) { conn_param->ird = ep->ord; } else { err = -ENOMEM; goto err_abort; } } } ep->ird = conn_param->ird; ep->ord = conn_param->ord; if (ep->mpa_attr.version == 1) { if (peer2peer && ep->ird == 0) ep->ird = 1; } else { if (peer2peer && (ep->mpa_attr.p2p_type != FW_RI_INIT_P2PTYPE_DISABLED) && (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) && ep->ird == 0) ep->ird = 1; } CTR4(KTR_IW_CXGBE, "%s %d ird %d ord %d", __func__, __LINE__, ep->ird, ep->ord); ep->com.cm_id = cm_id; ref_cm_id(&ep->com); ep->com.qp = qp; ref_qp(ep); //ep->ofld_txq = TOEPCB(ep->com.so)->ofld_txq; /* bind QP to EP and move to RTS */ attrs.mpa_attr = ep->mpa_attr; attrs.max_ird = ep->ird; attrs.max_ord = ep->ord; attrs.llp_stream_handle = ep; attrs.next_state = C4IW_QP_STATE_RTS; /* bind QP and TID with INIT_WR */ mask = C4IW_QP_ATTR_NEXT_STATE | C4IW_QP_ATTR_LLP_STREAM_HANDLE | C4IW_QP_ATTR_MPA_ATTR | C4IW_QP_ATTR_MAX_IRD | C4IW_QP_ATTR_MAX_ORD; err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, mask, &attrs, 1); if (err) { CTR3(KTR_IW_CXGBE, "%s:caca %p, err: %d", __func__, ep, err); goto err_defef_cm_id; } err = send_mpa_reply(ep, conn_param->private_data, conn_param->private_data_len); if (err) { CTR3(KTR_IW_CXGBE, "%s:cacb %p, err: %d", __func__, ep, err); goto err_defef_cm_id; } ep->com.state = FPDU_MODE; established_upcall(ep); mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); CTR2(KTR_IW_CXGBE, 
"%s:cacE %p", __func__, ep); return 0; err_defef_cm_id: deref_cm_id(&ep->com); err_abort: abort = 1; err_out: if (abort) c4iw_ep_disconnect(ep, 1, GFP_KERNEL); mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); CTR2(KTR_IW_CXGBE, "%s:cacE err %p", __func__, ep); return err; } static int c4iw_sock_create(struct sockaddr_storage *laddr, struct socket **so) { int ret; int size, on; struct socket *sock = NULL; struct sockopt sopt; ret = sock_create_kern(laddr->ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret) { CTR2(KTR_IW_CXGBE, "%s:Failed to create TCP socket. err %d", __func__, ret); return ret; } if (reuseaddr) { bzero(&sopt, sizeof(struct sockopt)); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_REUSEADDR; on = 1; sopt.sopt_val = &on; sopt.sopt_valsize = sizeof(on); ret = -sosetopt(sock, &sopt); if (ret != 0) { log(LOG_ERR, "%s: sosetopt(%p, SO_REUSEADDR) " "failed with %d.\n", __func__, sock, ret); } bzero(&sopt, sizeof(struct sockopt)); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_REUSEPORT; on = 1; sopt.sopt_val = &on; sopt.sopt_valsize = sizeof(on); ret = -sosetopt(sock, &sopt); if (ret != 0) { log(LOG_ERR, "%s: sosetopt(%p, SO_REUSEPORT) " "failed with %d.\n", __func__, sock, ret); } } ret = -sobind(sock, (struct sockaddr *)laddr, curthread); if (ret) { CTR2(KTR_IW_CXGBE, "%s:Failed to bind socket. err %p", __func__, ret); sock_release(sock); return ret; } size = laddr->ss_family == AF_INET6 ? sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in); ret = sock_getname(sock, (struct sockaddr *)laddr, &size, 0); if (ret) { CTR2(KTR_IW_CXGBE, "%s:sock_getname failed. err %p", __func__, ret); sock_release(sock); return ret; } *so = sock; return 0; } int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) { int err = 0; struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); struct c4iw_ep *ep = NULL; - struct ifnet *nh_ifp; /* Logical egress interface */ + if_t nh_ifp; /* Logical egress interface */ struct epoch_tracker et; #ifdef VIMAGE struct rdma_cm_id *rdma_id = (struct rdma_cm_id*)cm_id->context; struct vnet *vnet = rdma_id->route.addr.dev_addr.net; #endif CTR2(KTR_IW_CXGBE, "%s:ccB %p", __func__, cm_id); if ((conn_param->ord > c4iw_max_read_depth) || (conn_param->ird > c4iw_max_read_depth)) { CTR2(KTR_IW_CXGBE, "%s:cc1 %p", __func__, cm_id); err = -EINVAL; goto out; } ep = alloc_ep(sizeof(*ep), GFP_KERNEL); cm_id->provider_data = ep; init_timer(&ep->timer); ep->plen = conn_param->private_data_len; if (ep->plen) { CTR2(KTR_IW_CXGBE, "%s:cc3 %p", __func__, ep); memcpy(ep->mpa_pkt + sizeof(struct mpa_message), conn_param->private_data, ep->plen); } ep->ird = conn_param->ird; ep->ord = conn_param->ord; if (peer2peer && ep->ord == 0) { CTR2(KTR_IW_CXGBE, "%s:cc4 %p", __func__, ep); ep->ord = 1; } ep->com.dev = dev; ep->com.cm_id = cm_id; ref_cm_id(&ep->com); ep->com.qp = get_qhp(dev, conn_param->qpn); if (!ep->com.qp) { CTR2(KTR_IW_CXGBE, "%s:cc5 %p", __func__, ep); err = -EINVAL; goto fail; } ref_qp(ep); ep->com.thread = curthread; NET_EPOCH_ENTER(et); CURVNET_SET(vnet); err = get_ifnet_from_raddr(&cm_id->remote_addr, &nh_ifp); CURVNET_RESTORE(); NET_EPOCH_EXIT(et); if (err) { CTR2(KTR_IW_CXGBE, "%s:cc7 %p", __func__, ep); printk(KERN_ERR MOD "%s - cannot find route.\n", __func__); err = EHOSTUNREACH; return err; } - if (!(nh_ifp->if_capenable & IFCAP_TOE) || + if (!(if_getcapenable(nh_ifp) & IFCAP_TOE) || TOEDEV(nh_ifp) == NULL) { err = -ENOPROTOOPT; goto fail; } ep->com.state = CONNECTING; ep->tos = 
0; ep->com.local_addr = cm_id->local_addr; ep->com.remote_addr = cm_id->remote_addr; err = c4iw_sock_create(&cm_id->local_addr, &ep->com.so); if (err) goto fail; setiwsockopt(ep->com.so); init_iwarp_socket(ep->com.so, &ep->com); err = -soconnect(ep->com.so, (struct sockaddr *)&ep->com.remote_addr, ep->com.thread); if (err) goto fail_free_so; CTR2(KTR_IW_CXGBE, "%s:ccE, ep %p", __func__, ep); return 0; fail_free_so: uninit_iwarp_socket(ep->com.so); ep->com.state = DEAD; sock_release(ep->com.so); fail: deref_cm_id(&ep->com); c4iw_put_ep(&ep->com); ep = NULL; out: CTR2(KTR_IW_CXGBE, "%s:ccE Error %d", __func__, err); return err; } /* * iwcm->create_listen. Returns -errno on failure. */ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) { struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); struct c4iw_listen_ep *lep = NULL; struct listen_port_info *port_info = NULL; int rc = 0; CTR3(KTR_IW_CXGBE, "%s: cm_id %p, backlog %d", __func__, cm_id, backlog); if (c4iw_fatal_error(&dev->rdev)) { CTR2(KTR_IW_CXGBE, "%s: cm_id %p, fatal error", __func__, cm_id); return -EIO; } lep = alloc_ep(sizeof(*lep), GFP_KERNEL); lep->com.cm_id = cm_id; ref_cm_id(&lep->com); lep->com.dev = dev; lep->backlog = backlog; lep->com.local_addr = cm_id->local_addr; lep->com.thread = curthread; cm_id->provider_data = lep; lep->com.state = LISTEN; /* In case of INADDR_ANY, ibcore creates a cm_id for each device and * invokes the iw_cxgbe listener callbacks assuming that iw_cxgbe creates * HW listeners for each device separately. But toecore expects a single * solisten() call with the INADDR_ANY address to create HW listeners on * all devices for a given port number. So the iw_cxgbe driver calls * solisten() only once for INADDR_ANY (usually at the first * listener callback from ibcore). All subsequent INADDR_ANY * listener callbacks from ibcore (for the same port address) do not * invoke solisten(), as the first listener callback has already created * listeners for all other devices (via solisten). */ if (c4iw_any_addr((struct sockaddr *)&lep->com.local_addr)) { port_info = add_ep_to_listenlist(lep); /* skip solisten() if refcnt > 1, as the listeners were * already created by the 'Master lep' */ if (port_info->refcnt > 1) { /* As there will be only one listener socket for a TCP * port, copy the Master lep's socket pointer to the other * leps that belong to the same TCP port. */ struct c4iw_listen_ep *head_lep = container_of(port_info->lep_list.next, struct c4iw_listen_ep, listen_ep_list); lep->com.so = head_lep->com.so; goto out; } } rc = c4iw_sock_create(&cm_id->local_addr, &lep->com.so); if (rc) { CTR2(KTR_IW_CXGBE, "%s:Failed to create socket. err %d", __func__, rc); goto fail; } rc = -solisten(lep->com.so, backlog, curthread); if (rc) { CTR3(KTR_IW_CXGBE, "%s:Failed to listen on sock:%p.
err %d", __func__, lep->com.so, rc); goto fail_free_so; } init_iwarp_socket(lep->com.so, &lep->com); out: return 0; fail_free_so: sock_release(lep->com.so); fail: if (port_info) rem_ep_from_listenlist(lep); deref_cm_id(&lep->com); c4iw_put_ep(&lep->com); return rc; } int c4iw_destroy_listen(struct iw_cm_id *cm_id) { struct c4iw_listen_ep *lep = to_listen_ep(cm_id); mutex_lock(&lep->com.mutex); CTR3(KTR_IW_CXGBE, "%s: cm_id %p, state %s", __func__, cm_id, states[lep->com.state]); lep->com.state = DEAD; if (c4iw_any_addr((struct sockaddr *)&lep->com.local_addr)) { /* if no refcount then close listen socket */ if (!rem_ep_from_listenlist(lep)) close_socket(lep->com.so); } else close_socket(lep->com.so); deref_cm_id(&lep->com); mutex_unlock(&lep->com.mutex); c4iw_put_ep(&lep->com); return 0; } int __c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) { int ret; mutex_lock(&ep->com.mutex); ret = c4iw_ep_disconnect(ep, abrupt, gfp); mutex_unlock(&ep->com.mutex); return ret; } int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) { int ret = 0; int close = 0; struct c4iw_rdev *rdev; CTR2(KTR_IW_CXGBE, "%s:cedB %p", __func__, ep); rdev = &ep->com.dev->rdev; if (c4iw_fatal_error(rdev)) { CTR3(KTR_IW_CXGBE, "%s:ced1 fatal error %p %s", __func__, ep, states[ep->com.state]); if (ep->com.state != DEAD) { send_abort(ep); ep->com.state = DEAD; } close_complete_upcall(ep, -ECONNRESET); return ECONNRESET; } CTR3(KTR_IW_CXGBE, "%s:ced2 %p %s", __func__, ep, states[ep->com.state]); /* * Ref the ep here in case we have fatal errors causing the * ep to be released and freed. */ c4iw_get_ep(&ep->com); switch (ep->com.state) { case MPA_REQ_WAIT: case MPA_REQ_SENT: case MPA_REQ_RCVD: case MPA_REP_SENT: case FPDU_MODE: close = 1; if (abrupt) ep->com.state = ABORTING; else { ep->com.state = CLOSING; START_EP_TIMER(ep); } set_bit(CLOSE_SENT, &ep->com.flags); break; case CLOSING: if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) { close = 1; if (abrupt) { STOP_EP_TIMER(ep); ep->com.state = ABORTING; } else ep->com.state = MORIBUND; } break; case MORIBUND: case ABORTING: case DEAD: CTR3(KTR_IW_CXGBE, "%s ignoring disconnect ep %p state %u", __func__, ep, ep->com.state); break; default: BUG(); break; } if (close) { CTR2(KTR_IW_CXGBE, "%s:ced3 %p", __func__, ep); if (abrupt) { CTR2(KTR_IW_CXGBE, "%s:ced4 %p", __func__, ep); set_bit(EP_DISC_ABORT, &ep->com.history); close_complete_upcall(ep, -ECONNRESET); send_abort(ep); } else { CTR2(KTR_IW_CXGBE, "%s:ced5 %p", __func__, ep); set_bit(EP_DISC_CLOSE, &ep->com.history); if (!ep->parent_ep) ep->com.state = MORIBUND; CURVNET_SET(ep->com.so->so_vnet); ret = sodisconnect(ep->com.so); CURVNET_RESTORE(); if (ret) { CTR2(KTR_IW_CXGBE, "%s:ced6 %p", __func__, ep); STOP_EP_TIMER(ep); send_abort(ep); ep->com.state = DEAD; close_complete_upcall(ep, -ECONNRESET); set_bit(EP_DISC_FAIL, &ep->com.history); if (ep->com.qp) { struct c4iw_qp_attributes attrs = {0}; attrs.next_state = C4IW_QP_STATE_ERROR; ret = c4iw_modify_qp( ep->com.dev, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); CTR3(KTR_IW_CXGBE, "%s:ced7 %p ret %d", __func__, ep, ret); } } } } c4iw_put_ep(&ep->com); CTR2(KTR_IW_CXGBE, "%s:cedE %p", __func__, ep); return ret; } #ifdef C4IW_EP_REDIRECT int c4iw_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, struct l2t_entry *l2t) { struct c4iw_ep *ep = ctx; if (ep->dst != old) return 0; PDBG("%s ep %p redirect to dst %p l2t %p\n", __func__, ep, new, l2t); dst_hold(new); cxgb4_l2t_release(ep->l2t); ep->l2t = l2t; dst_release(old); ep->dst 
= new; return 1; } #endif static void ep_timeout(unsigned long arg) { struct c4iw_ep *ep = (struct c4iw_ep *)arg; if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { /* * Only insert if it is not already on the list. */ if (!(ep->com.ep_events & C4IW_EVENT_TIMEOUT)) { CTR2(KTR_IW_CXGBE, "%s:et1 %p", __func__, ep); add_ep_to_req_list(ep, C4IW_EVENT_TIMEOUT); } } } static int fw6_wr_rpl(struct adapter *sc, const __be64 *rpl) { uint64_t val = be64toh(*rpl); int ret; struct c4iw_wr_wait *wr_waitp; ret = (int)((val >> 8) & 0xff); wr_waitp = (struct c4iw_wr_wait *)rpl[1]; CTR3(KTR_IW_CXGBE, "%s wr_waitp %p ret %u", __func__, wr_waitp, ret); if (wr_waitp) c4iw_wake_up(wr_waitp, ret ? -ret : 0); return (0); } static int fw6_cqe_handler(struct adapter *sc, const __be64 *rpl) { struct cqe_list_entry *cle; unsigned long flag; cle = malloc(sizeof(*cle), M_CXGBE, M_NOWAIT); cle->rhp = sc->iwarp_softc; cle->err_cqe = *(const struct t4_cqe *)(&rpl[0]); spin_lock_irqsave(&err_cqe_lock, flag); list_add_tail(&cle->entry, &err_cqe_list); queue_work(c4iw_taskq, &c4iw_task); spin_unlock_irqrestore(&err_cqe_lock, flag); return (0); } static int process_terminate(struct c4iw_ep *ep) { struct c4iw_qp_attributes attrs = {0}; CTR2(KTR_IW_CXGBE, "%s:tB %p", __func__, ep); if (ep && ep->com.qp) { printk(KERN_WARNING MOD "TERM received tid %u qpid %u\n", ep->hwtid, ep->com.qp->wq.sq.qid); attrs.next_state = C4IW_QP_STATE_TERMINATE; c4iw_modify_qp(ep->com.dev, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } else printk(KERN_WARNING MOD "TERM received tid %u no ep/qp\n", ep->hwtid); CTR2(KTR_IW_CXGBE, "%s:tE %p", __func__, ep); return 0; } int __init c4iw_cm_init(void) { t4_register_cpl_handler(CPL_RDMA_TERMINATE, terminate); t4_register_fw_msg_handler(FW6_TYPE_WR_RPL, fw6_wr_rpl); t4_register_fw_msg_handler(FW6_TYPE_CQE, fw6_cqe_handler); t4_register_an_handler(c4iw_ev_handler); TAILQ_INIT(&req_list); spin_lock_init(&req_lock); INIT_LIST_HEAD(&err_cqe_list); spin_lock_init(&err_cqe_lock); INIT_WORK(&c4iw_task, process_req); c4iw_taskq = create_singlethread_workqueue("iw_cxgbe"); if (!c4iw_taskq) return -ENOMEM; return 0; } void __exit c4iw_cm_term(void) { WARN_ON(!TAILQ_EMPTY(&req_list)); WARN_ON(!list_empty(&err_cqe_list)); flush_workqueue(c4iw_taskq); destroy_workqueue(c4iw_taskq); t4_register_cpl_handler(CPL_RDMA_TERMINATE, NULL); t4_register_fw_msg_handler(FW6_TYPE_WR_RPL, NULL); t4_register_fw_msg_handler(FW6_TYPE_CQE, NULL); t4_register_an_handler(NULL); } #endif diff --git a/sys/dev/cxgbe/iw_cxgbe/provider.c b/sys/dev/cxgbe/iw_cxgbe/provider.c index bfa00bb4c658..15f9cbe47062 100644 --- a/sys/dev/cxgbe/iw_cxgbe/provider.c +++ b/sys/dev/cxgbe/iw_cxgbe/provider.c @@ -1,523 +1,523 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009-2013, 2016 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer.
* * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #define LINUXKPI_PARAM_PREFIX iw_cxgbe_ #include "opt_inet.h" #ifdef TCP_OFFLOAD #include #include #include #include #include "iw_cxgbe.h" #include "user.h" static int fastreg_support = 1; module_param(fastreg_support, int, 0644); MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default = 1)"); static int c4iw_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask, struct ib_port_modify *props) { return -ENOSYS; } static int c4iw_ah_create(struct ib_ah *ah, struct ib_ah_attr *ah_attr, u32 flags, struct ib_udata *udata) { return -ENOSYS; } static void c4iw_ah_destroy(struct ib_ah *ah, u32 flags) { } static int c4iw_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { return -ENOSYS; } static int c4iw_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { return -ENOSYS; } static int c4iw_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *in_mad, size_t in_mad_size, struct ib_mad_hdr *out_mad, size_t *out_mad_size, u16 *out_mad_pkey_index) { return -ENOSYS; } static void c4iw_dealloc_ucontext(struct ib_ucontext *context) { struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context); struct c4iw_dev *rhp; struct c4iw_mm_entry *mm, *tmp; pr_debug("context %p\n", context); rhp = to_c4iw_dev(ucontext->ibucontext.device); CTR2(KTR_IW_CXGBE, "%s ucontext %p", __func__, ucontext); list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry) kfree(mm); c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx); } static int c4iw_alloc_ucontext(struct ib_ucontext *ucontext, struct ib_udata *udata) { struct ib_device *ibdev = ucontext->device; struct c4iw_ucontext *context = to_c4iw_ucontext(ucontext); struct c4iw_dev *rhp = to_c4iw_dev(ibdev); static int warned; struct c4iw_alloc_ucontext_resp uresp; int ret = 0; struct c4iw_mm_entry *mm = NULL; PDBG("%s ibdev %p\n", __func__, ibdev); c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx); INIT_LIST_HEAD(&context->mmaps); spin_lock_init(&context->mmap_lock); if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) { if (!warned++) log(LOG_ERR, "%s Warning - downlevel libcxgb4 " "(non-fatal), device status page disabled.\n", __func__); rhp->rdev.flags |= T4_STATUS_PAGE_DISABLED; } else { mm = kmalloc(sizeof *mm, GFP_KERNEL); if (!mm) goto err; uresp.status_page_size = PAGE_SIZE; spin_lock(&context->mmap_lock); uresp.status_page_key = context->key; context->key += PAGE_SIZE; spin_unlock(&context->mmap_lock); ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp) - sizeof(uresp.reserved)); if (ret) goto err_mm; mm->key = uresp.status_page_key; mm->addr = vtophys(rhp->rdev.status_page); mm->len = PAGE_SIZE; insert_mmap(context, mm); } return 0; err_mm: kfree(mm); err: return ret; } static int 
c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { int len = vma->vm_end - vma->vm_start; u32 key = vma->vm_pgoff << PAGE_SHIFT; struct c4iw_rdev *rdev; int ret = 0; struct c4iw_mm_entry *mm; struct c4iw_ucontext *ucontext; u64 addr = 0; CTR4(KTR_IW_CXGBE, "%s:1 ctx %p vma %p, vm_start %u", __func__, context, vma, vma->vm_start); CTR4(KTR_IW_CXGBE, "%s:1a pgoff 0x%lx key 0x%x len %d", __func__, vma->vm_pgoff, key, len); if (vma->vm_start & (PAGE_SIZE-1)) { CTR3(KTR_IW_CXGBE, "%s:2 unaligned vm_start %u vma %p", __func__, vma->vm_start, vma); return -EINVAL; } rdev = &(to_c4iw_dev(context->device)->rdev); ucontext = to_c4iw_ucontext(context); mm = remove_mmap(ucontext, key, len); if (!mm) { CTR4(KTR_IW_CXGBE, "%s:3 ucontext %p key %u len %u", __func__, ucontext, key, len); return -EINVAL; } addr = mm->addr; kfree(mm); /* user DB-GTS registers if addr in udbs_res range, * else WQ or CQ memory. * */ if (rdev->adap->iwt.wc_en && addr >= rdev->bar2_pa && addr < rdev->bar2_pa + rdev->bar2_len) vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot); ret = rdma_user_mmap_io(context, vma, addr >> PAGE_SHIFT, len, vma->vm_page_prot, NULL); CTR4(KTR_IW_CXGBE, "%s:4 ctx %p vma %p ret %u", __func__, context, vma, ret); return ret; } static void c4iw_deallocate_pd(struct ib_pd *pd, struct ib_udata *udata) { struct c4iw_pd *php = to_c4iw_pd(pd); struct c4iw_dev *rhp = php->rhp; CTR3(KTR_IW_CXGBE, "%s: pd %p, pdid 0x%x", __func__, pd, php->pdid); c4iw_put_resource(&rhp->rdev.resource.pdid_table, php->pdid); mutex_lock(&rhp->rdev.stats.lock); rhp->rdev.stats.pd.cur--; mutex_unlock(&rhp->rdev.stats.lock); } static int c4iw_allocate_pd(struct ib_pd *pd, struct ib_udata *udata) { struct c4iw_pd *php = to_c4iw_pd(pd); struct ib_device *ibdev = pd->device; u32 pdid; struct c4iw_dev *rhp; CTR4(KTR_IW_CXGBE, "%s: ibdev %p, pd %p, data %p", __func__, ibdev, pd, udata); rhp = (struct c4iw_dev *) ibdev; pdid = c4iw_get_resource(&rhp->rdev.resource.pdid_table); if (!pdid) return -EINVAL; php->pdid = pdid; php->rhp = rhp; if (udata) { if (ib_copy_to_udata(udata, &php->pdid, sizeof(u32))) { c4iw_deallocate_pd(&php->ibpd, udata); return -EFAULT; } } mutex_lock(&rhp->rdev.stats.lock); rhp->rdev.stats.pd.cur++; if (rhp->rdev.stats.pd.cur > rhp->rdev.stats.pd.max) rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur; mutex_unlock(&rhp->rdev.stats.lock); CTR5(KTR_IW_CXGBE, "%s: ibdev %p, context %p, data %p, pddid 0x%x, pd %p", __func__, ibdev, udata, pdid, php); return (0); } static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { CTR5(KTR_IW_CXGBE, "%s ibdev %p, port %d, index %d, pkey %p", __func__, ibdev, port, index, pkey); *pkey = 0; return (0); } static int c4iw_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct c4iw_dev *dev; struct port_info *pi; struct adapter *sc; CTR5(KTR_IW_CXGBE, "%s ibdev %p, port %d, index %d, gid %p", __func__, ibdev, port, index, gid); memset(&gid->raw[0], 0, sizeof(gid->raw)); dev = to_c4iw_dev(ibdev); sc = dev->rdev.adap; if (port == 0 || port > sc->params.nports) return (-EINVAL); pi = sc->port[port - 1]; memcpy(&gid->raw[0], pi->vi[0].hw_addr, ETHER_ADDR_LEN); return (0); } static int c4iw_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) { struct c4iw_dev *dev = to_c4iw_dev(ibdev); struct adapter *sc = dev->rdev.adap; CTR3(KTR_IW_CXGBE, "%s ibdev %p, props %p", __func__, ibdev, props); if (uhw->inlen || uhw->outlen) return -EINVAL; memset(props, 0, sizeof *props); 
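/* Report the port 0 MAC address as the system image GUID and derive the
 * limits below from the adapter's parameters and virtualized resource ranges. */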
memcpy(&props->sys_image_guid, sc->port[0]->vi[0].hw_addr, ETHER_ADDR_LEN); props->hw_ver = sc->params.chipid; props->fw_ver = sc->params.fw_vers; props->device_cap_flags = dev->device_cap_flags; props->page_size_cap = T4_PAGESIZE_MASK; props->vendor_id = pci_get_vendor(sc->dev); props->vendor_part_id = pci_get_device(sc->dev); props->max_mr_size = T4_MAX_MR_SIZE; props->max_qp = sc->vres.qp.size / 2; props->max_qp_wr = dev->rdev.hw_queue.t4_max_qp_depth; props->max_sge = T4_MAX_RECV_SGE; props->max_sge_rd = 1; props->max_res_rd_atom = sc->params.max_ird_adapter; props->max_qp_rd_atom = min(sc->params.max_ordird_qp, c4iw_max_read_depth); props->max_qp_init_rd_atom = props->max_qp_rd_atom; props->max_cq = sc->vres.qp.size; props->max_cqe = dev->rdev.hw_queue.t4_max_cq_depth; props->max_mr = c4iw_num_stags(&dev->rdev); props->max_pd = T4_MAX_NUM_PD; props->local_ca_ack_delay = 0; props->max_fast_reg_page_list_len = t4_max_fr_depth(&dev->rdev, use_dsgl); return (0); } /* * Returns -errno on failure. */ static int c4iw_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { struct c4iw_dev *dev; struct adapter *sc; struct port_info *pi; - struct ifnet *ifp; + if_t ifp; CTR4(KTR_IW_CXGBE, "%s ibdev %p, port %d, props %p", __func__, ibdev, port, props); dev = to_c4iw_dev(ibdev); sc = dev->rdev.adap; if (port > sc->params.nports) return (-EINVAL); pi = sc->port[port - 1]; ifp = pi->vi[0].ifp; memset(props, 0, sizeof(struct ib_port_attr)); props->max_mtu = IB_MTU_4096; - if (ifp->if_mtu >= 4096) + if (if_getmtu(ifp) >= 4096) props->active_mtu = IB_MTU_4096; - else if (ifp->if_mtu >= 2048) + else if (if_getmtu(ifp) >= 2048) props->active_mtu = IB_MTU_2048; - else if (ifp->if_mtu >= 1024) + else if (if_getmtu(ifp) >= 1024) props->active_mtu = IB_MTU_1024; - else if (ifp->if_mtu >= 512) + else if (if_getmtu(ifp) >= 512) props->active_mtu = IB_MTU_512; else props->active_mtu = IB_MTU_256; props->state = pi->link_cfg.link_ok ? IB_PORT_ACTIVE : IB_PORT_DOWN; props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_REINIT_SUP | IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; props->gid_tbl_len = 1; props->pkey_tbl_len = 1; props->active_width = 2; props->active_speed = 2; props->max_msg_sz = -1; return 0; } static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { struct ib_port_attr attr; int err; immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; err = ib_query_port(ibdev, port_num, &attr); if (err) return err; immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; return 0; } /* * Returns -errno on error. 
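 * The linuxkpi PCI device is attached for DMA, the ib_device callbacks and
 * the iWARP CM ops are filled in, and the device is registered with ibcore.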
*/ int c4iw_register_device(struct c4iw_dev *dev) { struct adapter *sc = dev->rdev.adap; struct ib_device *ibdev = &dev->ibdev; struct iw_cm_verbs *iwcm; int ret; CTR3(KTR_IW_CXGBE, "%s c4iw_dev %p, adapter %p", __func__, dev, sc); BUG_ON(!sc->port[0]); ret = linux_pci_attach_device(sc->dev, NULL, NULL, &dev->pdev); if (ret) return (ret); #define c4iw_ib_cq c4iw_cq #define c4iw_ib_pd c4iw_pd #define c4iw_ib_qp c4iw_qp #define c4iw_ib_ucontext c4iw_ucontext INIT_IB_DEVICE_OPS(&ibdev->ops, c4iw, CXGB4); strlcpy(ibdev->name, device_get_nameunit(sc->dev), sizeof(ibdev->name)); memset(&ibdev->node_guid, 0, sizeof(ibdev->node_guid)); memcpy(&ibdev->node_guid, sc->port[0]->vi[0].hw_addr, ETHER_ADDR_LEN); ibdev->owner = THIS_MODULE; dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW; if (fastreg_support) dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; ibdev->local_dma_lkey = 0; ibdev->uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_POLL_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_POST_SEND) | (1ull << IB_USER_VERBS_CMD_POST_RECV); ibdev->node_type = RDMA_NODE_RNIC; strlcpy(ibdev->node_desc, C4IW_NODE_DESC, sizeof(ibdev->node_desc)); ibdev->phys_port_cnt = sc->params.nports; ibdev->num_comp_vectors = 1; ibdev->dma_device = &dev->pdev.dev; ibdev->query_device = c4iw_query_device; ibdev->query_port = c4iw_query_port; ibdev->modify_port = c4iw_modify_port; ibdev->query_pkey = c4iw_query_pkey; ibdev->query_gid = c4iw_query_gid; ibdev->alloc_ucontext = c4iw_alloc_ucontext; ibdev->dealloc_ucontext = c4iw_dealloc_ucontext; ibdev->mmap = c4iw_mmap; ibdev->alloc_pd = c4iw_allocate_pd; ibdev->dealloc_pd = c4iw_deallocate_pd; ibdev->create_ah = c4iw_ah_create; ibdev->destroy_ah = c4iw_ah_destroy; ibdev->create_qp = c4iw_create_qp; ibdev->modify_qp = c4iw_ib_modify_qp; ibdev->query_qp = c4iw_ib_query_qp; ibdev->destroy_qp = c4iw_destroy_qp; ibdev->create_cq = c4iw_create_cq; ibdev->destroy_cq = c4iw_destroy_cq; ibdev->resize_cq = c4iw_resize_cq; ibdev->poll_cq = c4iw_poll_cq; ibdev->get_dma_mr = c4iw_get_dma_mr; ibdev->reg_user_mr = c4iw_reg_user_mr; ibdev->dereg_mr = c4iw_dereg_mr; ibdev->alloc_mw = c4iw_alloc_mw; ibdev->dealloc_mw = c4iw_dealloc_mw; ibdev->alloc_mr = c4iw_alloc_mr; ibdev->map_mr_sg = c4iw_map_mr_sg; ibdev->attach_mcast = c4iw_multicast_attach; ibdev->detach_mcast = c4iw_multicast_detach; ibdev->process_mad = c4iw_process_mad; ibdev->req_notify_cq = c4iw_arm_cq; ibdev->post_send = c4iw_post_send; ibdev->post_recv = c4iw_post_receive; ibdev->uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; ibdev->get_port_immutable = c4iw_port_immutable; iwcm = kmalloc(sizeof(*iwcm), GFP_KERNEL); if (iwcm == NULL) return (-ENOMEM); iwcm->connect = c4iw_connect; iwcm->accept = c4iw_accept_cr; iwcm->reject = c4iw_reject_cr; iwcm->create_listen = c4iw_create_listen; iwcm->destroy_listen = c4iw_destroy_listen; iwcm->add_ref = c4iw_qp_add_ref; iwcm->rem_ref = c4iw_qp_rem_ref; iwcm->get_qp = c4iw_get_qp; 
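/* Hand the iWARP CM ops to ibcore and register. On a failed registration the
 * iwcm allocation and the compat PCI device are released below. */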
ibdev->iwcm = iwcm; ret = ib_register_device(&dev->ibdev, NULL); if (ret) { kfree(iwcm); linux_pci_detach_device(&dev->pdev); } return (ret); } void c4iw_unregister_device(struct c4iw_dev *dev) { CTR3(KTR_IW_CXGBE, "%s c4iw_dev %p, adapter %p", __func__, dev, dev->rdev.adap); ib_unregister_device(&dev->ibdev); kfree(dev->ibdev.iwcm); linux_pci_detach_device(&dev->pdev); return; } #endif diff --git a/sys/dev/cxgbe/t4_clip.c b/sys/dev/cxgbe/t4_clip.c index 5f4fbd0f07a6..72147cd9c48d 100644 --- a/sys/dev/cxgbe/t4_clip.c +++ b/sys/dev/cxgbe/t4_clip.c @@ -1,875 +1,875 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012-2021 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "t4_clip.h" /* * Code to deal with the Compressed Local IPv6 (CLIP) table in the ASIC. * * The driver maintains a global CLIP database (clip_db) of IPv6 addresses and a * per-adapter CLIP table (sc->clip_table) with entries that point to an IPv6 in * the clip_db. All access is protected by a single global lock (clip_db_lock). * The correct lock order is clip lock before synchronized op. * * By default (hw.cxgbe.clip_db_auto=1) all local IPv6 addresses are added to * the db. Addresses are also added on-demand when the driver allocates an * entry for a filter, TOE tid, etc. krn_ref counts the number of times an * address appears in the system. adp_ref counts the number of adapters that * have that address in their CLIP table. If both are 0 then the entry is * evicted from the db. Consumers of the CLIP table entry (filters, TOE tids) * are tracked in ce->refcount. Driver ioctls let external consumers add/remove * addresses from the CLIP table. 
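 *
 * Hardware updates are batched: entries that need to be written to or removed
 * from the ASIC sit on sc->clip_pending until update_hw_clip_table() runs.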
*/ #if defined(INET6) struct clip_db_entry { LIST_ENTRY(clip_db_entry) link; /* clip_db hash linkage */ struct in6_addr lip; u_int krn_ref; /* # of times this IP6 appears in list of all IP6 */ u_int adp_ref; /* # of adapters with this IP6 in their CLIP */ u_int tmp_ref; /* Used only during refresh */ }; struct clip_entry { LIST_ENTRY(clip_entry) link; /* clip_table hash linkage */ TAILQ_ENTRY(clip_entry) plink; /* clip_pending linkage */ struct clip_db_entry *cde; int16_t clip_idx; /* index in the hw table */ bool pending; /* in clip_pending list */ int refcount; }; static eventhandler_tag ifaddr_evhandler; static struct mtx clip_db_lock; static LIST_HEAD(, clip_db_entry) *clip_db; static u_long clip_db_mask; static int clip_db_gen; static struct task clip_db_task; static int add_lip(struct adapter *, struct in6_addr *, int16_t *); static int del_lip(struct adapter *, struct in6_addr *); static void t4_clip_db_task(void *, int); static void t4_clip_task(void *, int); static void update_clip_db(void); static int update_sw_clip_table(struct adapter *); static int update_hw_clip_table(struct adapter *); static void update_clip_table(struct adapter *, void *); static int sysctl_clip_db(SYSCTL_HANDLER_ARGS); static int sysctl_clip_db_auto(SYSCTL_HANDLER_ARGS); static struct clip_db_entry *lookup_clip_db_entry(struct in6_addr *, bool); static struct clip_entry *lookup_clip_entry(struct adapter *, struct in6_addr *, bool); SYSCTL_PROC(_hw_cxgbe, OID_AUTO, clip_db, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_clip_db, "A", "CLIP database"); int t4_clip_db_auto = 1; SYSCTL_PROC(_hw_cxgbe, OID_AUTO, clip_db_auto, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, sysctl_clip_db_auto, "I", "Add local IPs to CLIP db automatically (0 = no, 1 = yes)"); static inline uint32_t clip_hashfn(struct in6_addr *addr) { return (fnv_32_buf(addr, sizeof(*addr), FNV1_32_INIT) & clip_db_mask); } static inline struct clip_db_entry * alloc_clip_db_entry(struct in6_addr *in6) { struct clip_db_entry *cde; cde = malloc(sizeof(*cde), M_CXGBE, M_NOWAIT | M_ZERO); if (__predict_true(cde != NULL)) memcpy(&cde->lip, in6, sizeof(cde->lip)); return (cde); } static inline struct clip_entry * alloc_clip_entry(struct clip_db_entry *cde) { struct clip_entry *ce; mtx_assert(&clip_db_lock, MA_OWNED); ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT | M_ZERO); if (__predict_true(ce != NULL)) { ce->cde = cde; cde->adp_ref++; ce->clip_idx = -1; } return (ce); } /* * Look up the IP6 address in the CLIP db. If add is set then an entry for the * IP6 will be added to the db. */ static struct clip_db_entry * lookup_clip_db_entry(struct in6_addr *in6, bool add) { struct clip_db_entry *cde; const int bucket = clip_hashfn(in6); mtx_assert(&clip_db_lock, MA_OWNED); LIST_FOREACH(cde, &clip_db[bucket], link) { if (IN6_ARE_ADDR_EQUAL(&cde->lip, in6)) return (cde); } /* Not found. Create a new entry if requested. */ if (add) { cde = alloc_clip_db_entry(in6); if (cde != NULL) LIST_INSERT_HEAD(&clip_db[bucket], cde, link); } return (cde); } /* * Look up the IP6 address in the CLIP db. If add is set then an entry for the * IP6 will be added to the db. 
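 * The adapter's sw CLIP table is then searched for an entry that points at the
 * db entry; when add is set a missing entry is created and queued on
 * sc->clip_pending.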
*/ static struct clip_entry * lookup_clip_entry(struct adapter *sc, struct in6_addr *in6, bool add) { struct clip_db_entry *cde; struct clip_entry *ce; const int bucket = clip_hashfn(in6); mtx_assert(&clip_db_lock, MA_OWNED); cde = lookup_clip_db_entry(in6, add); if (cde == NULL) return (NULL); LIST_FOREACH(ce, &sc->clip_table[bucket], link) { if (ce->cde == cde) return (ce); } /* Not found. Create a new entry if requested. */ if (add) { ce = alloc_clip_entry(cde); if (ce != NULL) { LIST_INSERT_HEAD(&sc->clip_table[bucket], ce, link); TAILQ_INSERT_TAIL(&sc->clip_pending, ce, plink); ce->pending = true; } } return (ce); } static int add_lip(struct adapter *sc, struct in6_addr *lip, int16_t *idx) { struct fw_clip_cmd c; int rc; ASSERT_SYNCHRONIZED_OP(sc); memset(&c, 0, sizeof(c)); c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE); c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c)); c.ip_hi = *(uint64_t *)&lip->s6_addr[0]; c.ip_lo = *(uint64_t *)&lip->s6_addr[8]; rc = -t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c); if (rc == 0 && idx != NULL) *idx = G_FW_CLIP_CMD_INDEX(ntohl(c.alloc_to_len16)); return (rc); } static int del_lip(struct adapter *sc, struct in6_addr *lip) { struct fw_clip_cmd c; ASSERT_SYNCHRONIZED_OP(sc); memset(&c, 0, sizeof(c)); c.op_to_write = htonl(V_FW_CMD_OP(FW_CLIP_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c)); c.ip_hi = *(uint64_t *)&lip->s6_addr[0]; c.ip_lo = *(uint64_t *)&lip->s6_addr[8]; return (-t4_wr_mbox_ns(sc, sc->mbox, &c, sizeof(c), &c)); } #endif struct clip_entry * t4_get_clip_entry(struct adapter *sc, struct in6_addr *in6, bool add) { #ifdef INET6 struct clip_entry *ce; bool schedule = false; mtx_lock(&clip_db_lock); ce = lookup_clip_entry(sc, in6, add); if (ce != NULL) { MPASS(ce->cde->adp_ref > 0); if (++ce->refcount == 1 && ce->pending && ce->clip_idx != -1) { /* * Valid entry that was waiting to be deleted. It is in * use now so take it off the pending list. */ TAILQ_REMOVE(&sc->clip_pending, ce, plink); ce->pending = false; } if (ce->clip_idx == -1 && update_hw_clip_table(sc) != 0) schedule = true; } mtx_unlock(&clip_db_lock); if (schedule) taskqueue_enqueue_timeout(taskqueue_thread, &sc->clip_task, 0); return (ce); #else return (NULL); #endif } void t4_hold_clip_entry(struct adapter *sc, struct clip_entry *ce) { #ifdef INET6 MPASS(ce != NULL); MPASS(ce->cde->adp_ref > 0); mtx_lock(&clip_db_lock); MPASS(ce->refcount > 0); /* Caller should already have a reference */ ce->refcount++; mtx_unlock(&clip_db_lock); #endif } #ifdef INET6 static void release_clip_entry_locked(struct adapter *sc, struct clip_entry *ce) { struct clip_db_entry *cde; mtx_assert(&clip_db_lock, MA_OWNED); MPASS(ce->refcount > 0); cde = ce->cde; MPASS(cde->adp_ref > 0); if (--ce->refcount == 0 && cde->krn_ref == 0) { if (ce->clip_idx == -1) { /* Was never written to the hardware. */ MPASS(ce->pending); TAILQ_REMOVE(&sc->clip_pending, ce, plink); LIST_REMOVE(ce, link); free(ce, M_CXGBE); if (--cde->adp_ref == 0) { LIST_REMOVE(cde, link); free(cde, M_CXGBE); } } else { /* * Valid entry is now unused, add to the pending list * for deletion. Its refcount was 1 on entry so it * can't already be pending. 
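 * The actual del_lip() happens later, from update_hw_clip_table().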
*/ MPASS(!ce->pending); TAILQ_INSERT_HEAD(&sc->clip_pending, ce, plink); ce->pending = true; } } } #endif void t4_release_clip_entry(struct adapter *sc, struct clip_entry *ce) { #ifdef INET6 MPASS(ce != NULL); mtx_lock(&clip_db_lock); release_clip_entry_locked(sc, ce); /* * This isn't a manual release via the ioctl. No need to update the * hw right now even if the release resulted in the entry being queued * for deletion. */ mtx_unlock(&clip_db_lock); #endif } int t4_release_clip_addr(struct adapter *sc, struct in6_addr *in6) { int rc = ENOTSUP; #ifdef INET6 struct clip_entry *ce; bool schedule = false; mtx_lock(&clip_db_lock); ce = lookup_clip_entry(sc, in6, false); if (ce == NULL) rc = ENOENT; else if (ce->refcount == 0) rc = EIO; else { release_clip_entry_locked(sc, ce); if (update_hw_clip_table(sc) != 0) schedule = true; rc = 0; } mtx_unlock(&clip_db_lock); if (schedule) taskqueue_enqueue_timeout(taskqueue_thread, &sc->clip_task, 0); #endif return (rc); } #ifdef INET6 void t4_init_clip_table(struct adapter *sc) { TAILQ_INIT(&sc->clip_pending); TIMEOUT_TASK_INIT(taskqueue_thread, &sc->clip_task, 0, t4_clip_task, sc); sc->clip_gen = -1; sc->clip_table = hashinit(CLIP_HASH_SIZE, M_CXGBE, &sc->clip_mask); /* Both the hashes must use the same bucket for the same key. */ if (sc->clip_table != NULL) MPASS(sc->clip_mask == clip_db_mask); /* * Don't bother forcing an update of the clip table when the * adapter is initialized. Before an interface can be used it * must be assigned an address which will trigger the event * handler to update the table. */ } /* * Returns true if any additions or deletions were made to the CLIP DB. */ static void update_clip_db(void) { VNET_ITERATOR_DECL(vnet_iter); struct rm_priotracker in6_ifa_tracker; struct in6_addr *in6, tin6; struct in6_ifaddr *ia; struct clip_db_entry *cde, *cde_tmp; int i, addel; VNET_LIST_RLOCK(); IN6_IFADDR_RLOCK(&in6_ifa_tracker); mtx_lock(&clip_db_lock); VNET_FOREACH(vnet_iter) { CURVNET_SET_QUIET(vnet_iter); CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { - if (ia->ia_ifp->if_flags & IFF_LOOPBACK) + if (if_getflags(ia->ia_ifp) & IFF_LOOPBACK) continue; in6 = &ia->ia_addr.sin6_addr; KASSERT(!IN6_IS_ADDR_MULTICAST(in6), ("%s: mcast address in in6_ifaddr list", __func__)); if (IN6_IS_ADDR_LOOPBACK(in6)) continue; if (IN6_IS_SCOPE_EMBED(in6)) { tin6 = *in6; in6 = &tin6; in6_clearscope(in6); } cde = lookup_clip_db_entry(in6, true); if (cde == NULL) continue; cde->tmp_ref++; } CURVNET_RESTORE(); } addel = 0; for (i = 0; i <= clip_db_mask; i++) { LIST_FOREACH_SAFE(cde, &clip_db[i], link, cde_tmp) { if (cde->krn_ref == 0 && cde->tmp_ref > 0) { addel++; /* IP6 addr added. */ } else if (cde->krn_ref > 0 && cde->tmp_ref == 0) { if (cde->adp_ref == 0) { LIST_REMOVE(cde, link); free(cde, M_CXGBE); continue; } addel++; /* IP6 addr deleted. */ } cde->krn_ref = cde->tmp_ref; cde->tmp_ref = 0; } } if (addel > 0) clip_db_gen++; mtx_unlock(&clip_db_lock); IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); VNET_LIST_RUNLOCK(); } /* * Update the CLIP db and then update the CLIP tables on all the adapters. */ static void t4_clip_db_task(void *arg, int count) { update_clip_db(); t4_iterate(update_clip_table, NULL); } /* * Refresh the sw CLIP table for this adapter from the global CLIP db. Entries * that need to be added or deleted from the hardware CLIP table are placed on a * pending list but the hardware is not touched. The pending list is something * reasonable even if this fails so it's ok to apply that to the hardware. 
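 * sc->clip_gen is synced to clip_db_gen once the sw table matches the db.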
*/ static int update_sw_clip_table(struct adapter *sc) { struct clip_db_entry *cde; struct clip_entry *ce, *ce_temp; int i; bool found; mtx_assert(&clip_db_lock, MA_OWNED); /* * We are about to rebuild the pending list from scratch. Deletions are * placed before additions because that's how we want to submit them to * the hardware. */ TAILQ_INIT(&sc->clip_pending); /* * Walk the sw CLIP table first. We want to reset every entry's pending * status as we're rebuilding the pending list. */ for (i = 0; i <= clip_db_mask; i++) { LIST_FOREACH_SAFE(ce, &sc->clip_table[i], link, ce_temp) { cde = ce->cde; MPASS(cde->adp_ref > 0); if (ce->refcount != 0 || cde->krn_ref != 0) { /* * Entry should stay in the CLIP. */ if (ce->clip_idx != -1) { ce->pending = false; } else { /* Was never added, carry forward. */ MPASS(ce->pending); TAILQ_INSERT_TAIL(&sc->clip_pending, ce, plink); } continue; } /* * Entry should be removed from the CLIP. */ if (ce->clip_idx != -1) { ce->pending = true; TAILQ_INSERT_HEAD(&sc->clip_pending, ce, plink); } else { /* Was never added, free right now. */ MPASS(ce->pending); LIST_REMOVE(ce, link); free(ce, M_CXGBE); if (--cde->adp_ref == 0) { LIST_REMOVE(cde, link); free(cde, M_CXGBE); } } } } for (i = 0; i <= clip_db_mask; i++) { LIST_FOREACH(cde, &clip_db[i], link) { if (cde->krn_ref == 0) continue; found = false; LIST_FOREACH(ce, &sc->clip_table[i], link) { if (ce->cde == cde) { found = true; break; } } if (found) continue; ce = alloc_clip_entry(cde); if (ce == NULL) return (ENOMEM); LIST_INSERT_HEAD(&sc->clip_table[i], ce, link); TAILQ_INSERT_TAIL(&sc->clip_pending, ce, plink); ce->pending = true; } } sc->clip_gen = clip_db_gen; return (0); } static int update_hw_clip_table(struct adapter *sc) { struct clip_db_entry *cde; struct clip_entry *ce; int rc; char ip[INET6_ADDRSTRLEN]; mtx_assert(&clip_db_lock, MA_OWNED); rc = begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4clip"); if (rc != 0) return (rc); if (hw_off_limits(sc)) goto done; /* with rc = 0, we don't want to reschedule. */ while (!TAILQ_EMPTY(&sc->clip_pending)) { ce = TAILQ_FIRST(&sc->clip_pending); MPASS(ce->pending); cde = ce->cde; MPASS(cde->adp_ref > 0); if (ce->clip_idx == -1) { /* * Entry was queued for addition to the HW CLIP. */ if (ce->refcount == 0 && cde->krn_ref == 0) { /* No need to add to HW CLIP. */ TAILQ_REMOVE(&sc->clip_pending, ce, plink); LIST_REMOVE(ce, link); free(ce, M_CXGBE); if (--cde->adp_ref == 0) { LIST_REMOVE(cde, link); free(cde, M_CXGBE); } } else { /* Add to the HW CLIP. */ rc = add_lip(sc, &cde->lip, &ce->clip_idx); if (rc == FW_ENOMEM) { /* CLIP full, no point in retrying. */ rc = 0; goto done; } if (rc != 0) { inet_ntop(AF_INET6, &cde->lip, &ip[0], sizeof(ip)); CH_ERR(sc, "add_lip(%s) failed: %d\n", ip, rc); goto done; } MPASS(ce->clip_idx != -1); TAILQ_REMOVE(&sc->clip_pending, ce, plink); ce->pending = false; } } else { /* * Entry was queued for deletion from the HW CLIP. */ if (ce->refcount == 0 && cde->krn_ref == 0) { /* * Delete from the HW CLIP. Delete should never * fail so we always log an error. But if the * failure is that the entry wasn't found in the * CLIP then we carry on as if it was deleted. */ rc = del_lip(sc, &cde->lip); if (rc != 0) { inet_ntop(AF_INET6, &cde->lip, &ip[0], sizeof(ip)); CH_ERR(sc, "del_lip(%s) failed: %d\n", ip, rc); } if (rc == FW_EPROTO) rc = 0; if (rc != 0) goto done; TAILQ_REMOVE(&sc->clip_pending, ce, plink); LIST_REMOVE(ce, link); free(ce, M_CXGBE); if (--cde->adp_ref == 0) { LIST_REMOVE(cde, link); free(cde, M_CXGBE); } } else { /* No need to delete from HW CLIP.
*/ TAILQ_REMOVE(&sc->clip_pending, ce, plink); ce->pending = false; } } } done: end_synchronized_op(sc, LOCK_HELD); return (rc); } static void update_clip_table(struct adapter *sc, void *arg __unused) { bool reschedule; if (sc->clip_table == NULL) return; reschedule = false; mtx_lock(&clip_db_lock); if (sc->clip_gen != clip_db_gen && update_sw_clip_table(sc) != 0) reschedule = true; if (!TAILQ_EMPTY(&sc->clip_pending) && update_hw_clip_table(sc) != 0) reschedule = true; mtx_unlock(&clip_db_lock); if (reschedule) taskqueue_enqueue_timeout(taskqueue_thread, &sc->clip_task, -hz / 4); } /* * Update the CLIP table of the specified adapter. */ static void t4_clip_task(void *sc, int count) { update_clip_table(sc, NULL); } void t4_destroy_clip_table(struct adapter *sc) { struct clip_entry *ce, *ce_temp; int i; mtx_lock(&clip_db_lock); if (sc->clip_table == NULL) goto done; /* CLIP was never initialized. */ for (i = 0; i <= sc->clip_mask; i++) { LIST_FOREACH_SAFE(ce, &sc->clip_table[i], link, ce_temp) { MPASS(ce->refcount == 0); MPASS(ce->cde->adp_ref > 0); #if 0 del_lip(sc, &ce->lip); #endif LIST_REMOVE(ce, link); if (--ce->cde->adp_ref == 0 && ce->cde->krn_ref == 0) { LIST_REMOVE(ce->cde, link); free(ce->cde, M_CXGBE); } free(ce, M_CXGBE); } } hashdestroy(sc->clip_table, M_CXGBE, sc->clip_mask); sc->clip_table = NULL; done: mtx_unlock(&clip_db_lock); } static void -t4_ifaddr_event(void *arg __unused, struct ifnet *ifp, struct ifaddr *ifa, +t4_ifaddr_event(void *arg __unused, if_t ifp, struct ifaddr *ifa, int event) { struct in6_addr *in6; if (t4_clip_db_auto == 0) return; /* Automatic updates not allowed. */ if (ifa->ifa_addr->sa_family != AF_INET6) return; - if (ifp->if_flags & IFF_LOOPBACK) + if (if_getflags(ifp) & IFF_LOOPBACK) return; in6 = &((struct in6_ifaddr *)ifa)->ia_addr.sin6_addr; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_MULTICAST(in6)) return; taskqueue_enqueue(taskqueue_thread, &clip_db_task); } int sysctl_clip(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct clip_entry *ce; struct sbuf *sb; int i, rc, header = 0; char ip[INET6_ADDRSTRLEN]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); mtx_lock(&clip_db_lock); for (i = 0; i <= sc->clip_mask; i++) { LIST_FOREACH(ce, &sc->clip_table[i], link) { if (header == 0) { sbuf_printf(sb, "%-4s %-4s %s", "Indx", "Refs", "IP address"); header = 1; } inet_ntop(AF_INET6, &ce->cde->lip, &ip[0], sizeof(ip)); if (ce->clip_idx == -1) { sbuf_printf(sb, "\n%-4s %-4d %s", "-", ce->refcount, ip); } else { sbuf_printf(sb, "\n%-4d %-4d %s", ce->clip_idx, ce->refcount, ip); } } } mtx_unlock(&clip_db_lock); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_clip_db(SYSCTL_HANDLER_ARGS) { struct clip_db_entry *cde; struct sbuf *sb; int i, rc, header = 0; char ip[INET6_ADDRSTRLEN]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); mtx_lock(&clip_db_lock); for (i = 0; i <= clip_db_mask; i++) { LIST_FOREACH(cde, &clip_db[i], link) { MPASS(cde->tmp_ref == 0); if (header == 0) { sbuf_printf(sb, "%-4s %-4s %s", "Kref", "Aref", "IP address"); header = 1; } inet_ntop(AF_INET6, &cde->lip, &ip[0], sizeof(ip)); sbuf_printf(sb, "\n%-4d %-4d %s", cde->krn_ref, cde->adp_ref, ip); } } mtx_unlock(&clip_db_lock); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_clip_db_auto(SYSCTL_HANDLER_ARGS) { int rc, val; val = t4_clip_db_auto; 
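	/*
	 * 0 disables and 1 enables automatic CLIP DB updates driven by ifaddr
	 * events (see t4_ifaddr_event); any other value written here leaves
	 * the setting alone and instead forces an immediate rebuild of the
	 * CLIP DB in the context of this sysctl.
	 */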
rc = sysctl_handle_int(oidp, &val, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (val == 0 || val == 1) t4_clip_db_auto = val; else { /* * Writing a value other than 0 or 1 forces a one-time update of * the clip_db directly in the sysctl and not in some taskqueue. */ t4_clip_db_task(NULL, 0); } return (0); } void t4_clip_modload(void) { mtx_init(&clip_db_lock, "clip_db", NULL, MTX_DEF); clip_db = hashinit(CLIP_HASH_SIZE, M_CXGBE, &clip_db_mask); TASK_INIT(&clip_db_task, 0, t4_clip_db_task, NULL); ifaddr_evhandler = EVENTHANDLER_REGISTER(ifaddr_event_ext, t4_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY); } void t4_clip_modunload(void) { struct clip_db_entry *cde; int i; EVENTHANDLER_DEREGISTER(ifaddr_event_ext, ifaddr_evhandler); taskqueue_drain(taskqueue_thread, &clip_db_task); mtx_lock(&clip_db_lock); for (i = 0; i <= clip_db_mask; i++) { while ((cde = LIST_FIRST(&clip_db[i])) != NULL) { MPASS(cde->tmp_ref == 0); MPASS(cde->adp_ref == 0); LIST_REMOVE(cde, link); free(cde, M_CXGBE); } } mtx_unlock(&clip_db_lock); hashdestroy(clip_db, M_CXGBE, clip_db_mask); mtx_destroy(&clip_db_lock); } #endif diff --git a/sys/dev/cxgbe/t4_l2t.c b/sys/dev/cxgbe/t4_l2t.c index 47c995932a21..517d39a32be4 100644 --- a/sys/dev/cxgbe/t4_l2t.c +++ b/sys/dev/cxgbe/t4_l2t.c @@ -1,465 +1,465 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "t4_l2t.h" /* * Module locking notes: There is a RW lock protecting the L2 table as a * whole plus a spinlock per L2T entry. Entry lookups and allocations happen * under the protection of the table lock, individual entry changes happen * while holding that entry's spinlock. The table lock nests outside the * entry locks. Allocations of new entries take the table lock as writers so * no other lookups can happen while allocating new entries. Entry updates * take the table lock as readers so multiple entries can be updated in * parallel. 
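 *
 * An allocation therefore looks roughly like this (sketch only; see
 * t4_l2t_alloc_switching() for a real example):
 *
 *	rw_wlock(&d->lock);
 *	e = t4_alloc_l2e(d);
 *	mtx_lock(&e->lock);
 *	... initialize the entry, t4_write_l2e(e, 0) ...
 *	mtx_unlock(&e->lock);
 *	rw_wunlock(&d->lock);
 *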
An L2T entry can be dropped by decrementing its reference count * and therefore can happen in parallel with entry allocation but no entry * can change state or increment its ref count during allocation as both of * these perform lookups. * * Note: We do not take references to ifnets in this module because both * the TOE and the sockets already hold references to the interfaces and the * lifetime of an L2T entry is fully contained in the lifetime of the TOE. */ /* * Allocate a free L2T entry. Must be called with l2t_data.lock held. */ struct l2t_entry * t4_alloc_l2e(struct l2t_data *d) { struct l2t_entry *end, *e, **p; rw_assert(&d->lock, RA_WLOCKED); if (!atomic_load_acq_int(&d->nfree)) return (NULL); /* there's definitely a free entry */ for (e = d->rover, end = &d->l2tab[d->l2t_size]; e != end; ++e) if (atomic_load_acq_int(&e->refcnt) == 0) goto found; for (e = d->l2tab; atomic_load_acq_int(&e->refcnt); ++e) continue; found: d->rover = e + 1; atomic_subtract_int(&d->nfree, 1); /* * The entry we found may be an inactive entry that is * presently in the hash table. We need to remove it. */ if (e->state < L2T_STATE_SWITCHING) { for (p = &d->l2tab[e->hash].first; *p; p = &(*p)->next) { if (*p == e) { *p = e->next; e->next = NULL; break; } } } e->state = L2T_STATE_UNUSED; return (e); } static struct l2t_entry * find_or_alloc_l2e(struct l2t_data *d, uint16_t vlan, uint8_t port, uint8_t *dmac) { struct l2t_entry *end, *e, **p; struct l2t_entry *first_free = NULL; for (e = &d->l2tab[0], end = &d->l2tab[d->l2t_size]; e != end; ++e) { if (atomic_load_acq_int(&e->refcnt) == 0) { if (!first_free) first_free = e; } else if (e->state == L2T_STATE_SWITCHING && memcmp(e->dmac, dmac, ETHER_ADDR_LEN) == 0 && e->vlan == vlan && e->lport == port) return (e); /* Found existing entry that matches. */ } if (first_free == NULL) return (NULL); /* No match and no room for a new entry. */ /* * The entry we found may be an inactive entry that is * presently in the hash table. We need to remove it. */ e = first_free; if (e->state < L2T_STATE_SWITCHING) { for (p = &d->l2tab[e->hash].first; *p; p = &(*p)->next) { if (*p == e) { *p = e->next; e->next = NULL; break; } } } e->state = L2T_STATE_UNUSED; return (e); } static void mk_write_l2e(struct adapter *sc, struct l2t_entry *e, int sync, int reply, void *dst) { struct cpl_l2t_write_req *req; int idx; req = dst; idx = e->idx + sc->vres.l2t.start; INIT_TP_WR(req, 0); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, idx | V_SYNC_WR(sync) | V_TID_QID(e->iqid))); req->params = htons(V_L2T_W_PORT(e->lport) | V_L2T_W_NOREPLY(!reply)); req->l2t_idx = htons(idx); req->vlan = htons(e->vlan); memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac)); } /* * Write an L2T entry. Must be called with the entry locked. * The write may be synchronous or asynchronous. */ int t4_write_l2e(struct l2t_entry *e, int sync) { struct sge_wrq *wrq; struct adapter *sc; struct wrq_cookie cookie; struct cpl_l2t_write_req *req; mtx_assert(&e->lock, MA_OWNED); MPASS(e->wrq != NULL); wrq = e->wrq; sc = wrq->adapter; req = start_wrq_wr(wrq, howmany(sizeof(*req), 16), &cookie); if (req == NULL) return (ENOMEM); mk_write_l2e(sc, e, sync, sync, req); commit_wrq_wr(wrq, req, &cookie); if (sync && e->state != L2T_STATE_SWITCHING) e->state = L2T_STATE_SYNC_WRITE; return (0); } /* * Allocate an L2T entry for use by a TLS connection. These entries are * associated with a specific VLAN and destination MAC that never changes. * However, multiple TLS connections might share a single entry. 
* * If a new L2T entry is allocated, a work request to initialize it is * written to 'txq' and 'ndesc' will be set to 1. Otherwise, 'ndesc' * will be set to 0. * * To avoid races, separate L2T entries are reserved for individual * queues since the L2T entry update is written to a txq just prior to * TLS work requests that will depend on it being written. */ struct l2t_entry * t4_l2t_alloc_tls(struct adapter *sc, struct sge_txq *txq, void *dst, int *ndesc, uint16_t vlan, uint8_t port, uint8_t *eth_addr) { struct l2t_data *d; struct l2t_entry *e; int i; TXQ_LOCK_ASSERT_OWNED(txq); d = sc->l2t; *ndesc = 0; rw_rlock(&d->lock); /* First, try to find an existing entry. */ for (i = 0; i < d->l2t_size; i++) { e = &d->l2tab[i]; if (e->state != L2T_STATE_TLS) continue; if (e->vlan == vlan && e->lport == port && e->wrq == (struct sge_wrq *)txq && memcmp(e->dmac, eth_addr, ETHER_ADDR_LEN) == 0) { if (atomic_fetchadd_int(&e->refcnt, 1) == 0) { /* * This entry wasn't held but is still * valid, so decrement nfree. */ atomic_subtract_int(&d->nfree, 1); } KASSERT(e->refcnt > 0, ("%s: refcount overflow", __func__)); rw_runlock(&d->lock); return (e); } } /* * Don't bother rechecking if the upgrade fails since the txq is * already locked. */ if (!rw_try_upgrade(&d->lock)) { rw_runlock(&d->lock); rw_wlock(&d->lock); } /* Match not found, allocate a new entry. */ e = t4_alloc_l2e(d); if (e == NULL) { rw_wunlock(&d->lock); return (e); } /* Initialize the entry. */ e->state = L2T_STATE_TLS; e->vlan = vlan; e->lport = port; e->iqid = sc->sge.fwq.abs_id; e->wrq = (struct sge_wrq *)txq; memcpy(e->dmac, eth_addr, ETHER_ADDR_LEN); atomic_store_rel_int(&e->refcnt, 1); rw_wunlock(&d->lock); /* Write out the work request. */ *ndesc = howmany(sizeof(struct cpl_l2t_write_req), EQ_ESIZE); MPASS(*ndesc == 1); mk_write_l2e(sc, e, 1, 0, dst); return (e); } /* * Allocate an L2T entry for use by a switching rule. Such need to be * explicitly freed and while busy they are not on any hash chain, so normal * address resolution updates do not see them. 
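 *
 * Typical usage is along these lines (illustrative sketch only):
 *
 *	e = t4_l2t_alloc_switching(sc, vlan, port, dmac);
 *	if (e == NULL)
 *		return (ENOMEM);
 *	... use e->idx when programming the switching rule ...
 *	t4_l2t_release(e);	(once the rule is destroyed)
 *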
*/ struct l2t_entry * t4_l2t_alloc_switching(struct adapter *sc, uint16_t vlan, uint8_t port, uint8_t *eth_addr) { struct l2t_data *d = sc->l2t; struct l2t_entry *e; int rc; rw_wlock(&d->lock); e = find_or_alloc_l2e(d, vlan, port, eth_addr); if (e) { if (atomic_load_acq_int(&e->refcnt) == 0) { mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ e->wrq = &sc->sge.ctrlq[0]; e->iqid = sc->sge.fwq.abs_id; e->state = L2T_STATE_SWITCHING; e->vlan = vlan; e->lport = port; memcpy(e->dmac, eth_addr, ETHER_ADDR_LEN); atomic_store_rel_int(&e->refcnt, 1); atomic_subtract_int(&d->nfree, 1); rc = t4_write_l2e(e, 0); mtx_unlock(&e->lock); if (rc != 0) e = NULL; } else { MPASS(e->vlan == vlan); MPASS(e->lport == port); atomic_add_int(&e->refcnt, 1); } } rw_wunlock(&d->lock); return (e); } int t4_init_l2t(struct adapter *sc, int flags) { int i, l2t_size; struct l2t_data *d; l2t_size = sc->vres.l2t.size; if (l2t_size < 2) /* At least 1 bucket for IP and 1 for IPv6 */ return (EINVAL); d = malloc(sizeof(*d) + l2t_size * sizeof (struct l2t_entry), M_CXGBE, M_ZERO | flags); if (!d) return (ENOMEM); d->l2t_size = l2t_size; d->rover = d->l2tab; atomic_store_rel_int(&d->nfree, l2t_size); rw_init(&d->lock, "L2T"); for (i = 0; i < l2t_size; i++) { struct l2t_entry *e = &d->l2tab[i]; e->idx = i; e->state = L2T_STATE_UNUSED; mtx_init(&e->lock, "L2T_E", NULL, MTX_DEF); STAILQ_INIT(&e->wr_list); atomic_store_rel_int(&e->refcnt, 0); } sc->l2t = d; return (0); } int t4_free_l2t(struct l2t_data *d) { int i; for (i = 0; i < d->l2t_size; i++) mtx_destroy(&d->l2tab[i].lock); rw_destroy(&d->lock); free(d, M_CXGBE); return (0); } int do_l2t_write_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1); unsigned int tid = GET_TID(rpl); unsigned int idx = tid % L2T_SIZE; if (__predict_false(rpl->status != CPL_ERR_NONE)) { log(LOG_ERR, "Unexpected L2T_WRITE_RPL (%u) for entry at hw_idx %u\n", rpl->status, idx); return (EINVAL); } return (0); } static inline unsigned int vlan_prio(const struct l2t_entry *e) { return e->vlan >> 13; } static char l2e_state(const struct l2t_entry *e) { switch (e->state) { case L2T_STATE_VALID: return 'V'; /* valid, fast-path entry */ case L2T_STATE_STALE: return 'S'; /* needs revalidation, but usable */ case L2T_STATE_SYNC_WRITE: return 'W'; case L2T_STATE_RESOLVING: return STAILQ_EMPTY(&e->wr_list) ? 'R' : 'A'; case L2T_STATE_SWITCHING: return 'X'; case L2T_STATE_TLS: return 'T'; default: return 'U'; } } int sysctl_l2t(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct l2t_data *l2t = sc->l2t; struct l2t_entry *e; struct sbuf *sb; int rc, i, header = 0; char ip[INET6_ADDRSTRLEN]; if (l2t == NULL) return (ENXIO); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); e = &l2t->l2tab[0]; for (i = 0; i < l2t->l2t_size; i++, e++) { mtx_lock(&e->lock); if (e->state == L2T_STATE_UNUSED) goto skip; if (header == 0) { sbuf_printf(sb, " Idx IP address " "Ethernet address VLAN/P LP State Users Port"); header = 1; } if (e->state >= L2T_STATE_SWITCHING) ip[0] = 0; else { inet_ntop(e->ipv6 ? AF_INET6 : AF_INET, &e->addr[0], &ip[0], sizeof(ip)); } /* * XXX: IPv6 addresses may not align properly in the output. 
*/ sbuf_printf(sb, "\n%4u %-15s %02x:%02x:%02x:%02x:%02x:%02x %4d" " %u %2u %c %5u %s", e->idx, ip, e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5], e->vlan & 0xfff, vlan_prio(e), e->lport, l2e_state(e), atomic_load_acq_int(&e->refcnt), - e->ifp ? e->ifp->if_xname : "-"); + e->ifp ? if_name(e->ifp) : "-"); skip: mtx_unlock(&e->lock); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } diff --git a/sys/dev/cxgbe/t4_l2t.h b/sys/dev/cxgbe/t4_l2t.h index 19b738752c2d..f638c55381fa 100644 --- a/sys/dev/cxgbe/t4_l2t.h +++ b/sys/dev/cxgbe/t4_l2t.h @@ -1,115 +1,115 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef __T4_L2T_H #define __T4_L2T_H /* identifies sync vs async L2T_WRITE_REQs */ #define S_SYNC_WR 12 #define V_SYNC_WR(x) ((x) << S_SYNC_WR) #define F_SYNC_WR V_SYNC_WR(1) enum { L2T_SIZE = 4096 }; /* # of L2T entries */ enum { L2T_STATE_VALID, /* entry is up to date */ L2T_STATE_STALE, /* entry may be used but needs revalidation */ L2T_STATE_RESOLVING, /* entry needs address resolution */ L2T_STATE_FAILED, /* failed to resolve */ L2T_STATE_SYNC_WRITE, /* synchronous write of entry underway */ /* when state is one of the below the entry is not hashed */ L2T_STATE_SWITCHING, /* entry is being used by a switching filter */ L2T_STATE_TLS, /* entry is being used by TLS sessions */ L2T_STATE_UNUSED /* entry not in use */ }; /* * Each L2T entry plays multiple roles. First of all, it keeps state for the * corresponding entry of the HW L2 table and maintains a queue of offload * packets awaiting address resolution. Second, it is a node of a hash table * chain, where the nodes of the chain are linked together through their next * pointer. Finally, each node is a bucket of a hash table, pointing to the * first element in its chain through its first pointer. 
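 *
 * A lookup therefore walks the chain hanging off a bucket's 'first' pointer,
 * roughly (sketch; the actual match condition depends on the caller):
 *
 *	for (e = d->l2tab[hash].first; e != NULL; e = e->next)
 *		if (e matches the address and interface)
 *			break;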
*/ struct l2t_entry { uint16_t state; /* entry state */ uint16_t idx; /* entry index */ uint32_t addr[4]; /* next hop IP or IPv6 address */ uint32_t iqid; /* iqid for reply to write_l2e */ struct sge_wrq *wrq; /* queue to use for write_l2e */ - struct ifnet *ifp; /* outgoing interface */ + if_t ifp; /* outgoing interface */ uint16_t smt_idx; /* SMT index */ uint16_t vlan; /* VLAN TCI (id: 0-11, prio: 13-15) */ struct l2t_entry *first; /* start of hash chain */ struct l2t_entry *next; /* next l2t_entry on chain */ STAILQ_HEAD(, wrqe) wr_list; /* list of WRs awaiting resolution */ struct mtx lock; volatile int refcnt; /* entry reference count */ uint16_t hash; /* hash bucket the entry is on */ uint8_t ipv6; /* entry is for an IPv6 address */ uint8_t lport; /* associated offload logical port */ uint8_t dmac[ETHER_ADDR_LEN]; /* next hop's MAC address */ }; struct l2t_data { struct rwlock lock; u_int l2t_size; volatile int nfree; /* number of free entries */ struct l2t_entry *rover;/* starting point for next allocation */ struct l2t_entry l2tab[]; }; int t4_init_l2t(struct adapter *, int); int t4_free_l2t(struct l2t_data *); struct l2t_entry *t4_alloc_l2e(struct l2t_data *); struct l2t_entry *t4_l2t_alloc_switching(struct adapter *, uint16_t, uint8_t, uint8_t *); struct l2t_entry *t4_l2t_alloc_tls(struct adapter *, struct sge_txq *, void *, int *, uint16_t, uint8_t, uint8_t *); int t4_l2t_set_switching(struct adapter *, struct l2t_entry *, uint16_t, uint8_t, uint8_t *); int t4_write_l2e(struct l2t_entry *, int); int do_l2t_write_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); static inline void t4_l2t_release(struct l2t_entry *e) { struct l2t_data *d = __containerof(e, struct l2t_data, l2tab[e->idx]); if (atomic_fetchadd_int(&e->refcnt, -1) == 1) atomic_add_int(&d->nfree, 1); } int sysctl_l2t(SYSCTL_HANDLER_ARGS); #endif /* __T4_L2T_H */ diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 71ee366e04e0..18698ff468f3 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -1,13265 +1,13265 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RSS #include #endif #include #include #ifdef KERN_TLS #include #endif #if defined(__i386__) || defined(__amd64__) #include #include #include #include #endif #ifdef DDB #include #include #endif #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "cudbg/cudbg.h" #include "t4_clip.h" #include "t4_ioctl.h" #include "t4_l2t.h" #include "t4_mp_ring.h" #include "t4_if.h" #include "t4_smt.h" /* T4 bus driver interface */ static int t4_probe(device_t); static int t4_attach(device_t); static int t4_detach(device_t); static int t4_child_location(device_t, device_t, struct sbuf *); static int t4_ready(device_t); static int t4_read_port_device(device_t, int, device_t *); static int t4_suspend(device_t); static int t4_resume(device_t); static int t4_reset_prepare(device_t, device_t); static int t4_reset_post(device_t, device_t); static device_method_t t4_methods[] = { DEVMETHOD(device_probe, t4_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD(device_suspend, t4_suspend), DEVMETHOD(device_resume, t4_resume), DEVMETHOD(bus_child_location, t4_child_location), DEVMETHOD(bus_reset_prepare, t4_reset_prepare), DEVMETHOD(bus_reset_post, t4_reset_post), DEVMETHOD(t4_is_main_ready, t4_ready), DEVMETHOD(t4_read_port_device, t4_read_port_device), DEVMETHOD_END }; static driver_t t4_driver = { "t4nex", t4_methods, sizeof(struct adapter) }; /* T4 port (cxgbe) interface */ static int cxgbe_probe(device_t); static int cxgbe_attach(device_t); static int cxgbe_detach(device_t); device_method_t cxgbe_methods[] = { DEVMETHOD(device_probe, cxgbe_probe), DEVMETHOD(device_attach, cxgbe_attach), DEVMETHOD(device_detach, cxgbe_detach), { 0, 0 } }; static driver_t cxgbe_driver = { "cxgbe", cxgbe_methods, sizeof(struct port_info) }; /* T4 VI (vcxgbe) interface */ static int vcxgbe_probe(device_t); static int vcxgbe_attach(device_t); static int vcxgbe_detach(device_t); static device_method_t vcxgbe_methods[] = { DEVMETHOD(device_probe, vcxgbe_probe), DEVMETHOD(device_attach, vcxgbe_attach), DEVMETHOD(device_detach, vcxgbe_detach), { 0, 0 } }; static driver_t vcxgbe_driver = { "vcxgbe", vcxgbe_methods, sizeof(struct vi_info) }; static d_ioctl_t t4_ioctl; static struct cdevsw t4_cdevsw = { .d_version = D_VERSION, .d_ioctl = t4_ioctl, .d_name = "t4nex", }; /* T5 bus driver interface */ static int t5_probe(device_t); static device_method_t t5_methods[] = { DEVMETHOD(device_probe, t5_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD(device_suspend, t4_suspend), DEVMETHOD(device_resume, t4_resume), DEVMETHOD(bus_child_location, t4_child_location), DEVMETHOD(bus_reset_prepare, t4_reset_prepare), DEVMETHOD(bus_reset_post, t4_reset_post), DEVMETHOD(t4_is_main_ready, t4_ready), DEVMETHOD(t4_read_port_device, t4_read_port_device), DEVMETHOD_END }; static driver_t t5_driver = { "t5nex", t5_methods, sizeof(struct adapter) }; /* T5 port (cxl) interface */ static driver_t cxl_driver = { "cxl", cxgbe_methods, sizeof(struct port_info) }; /* T5 VI (vcxl) interface */ static driver_t 
vcxl_driver = { "vcxl", vcxgbe_methods, sizeof(struct vi_info) }; /* T6 bus driver interface */ static int t6_probe(device_t); static device_method_t t6_methods[] = { DEVMETHOD(device_probe, t6_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD(device_suspend, t4_suspend), DEVMETHOD(device_resume, t4_resume), DEVMETHOD(bus_child_location, t4_child_location), DEVMETHOD(bus_reset_prepare, t4_reset_prepare), DEVMETHOD(bus_reset_post, t4_reset_post), DEVMETHOD(t4_is_main_ready, t4_ready), DEVMETHOD(t4_read_port_device, t4_read_port_device), DEVMETHOD_END }; static driver_t t6_driver = { "t6nex", t6_methods, sizeof(struct adapter) }; /* T6 port (cc) interface */ static driver_t cc_driver = { "cc", cxgbe_methods, sizeof(struct port_info) }; /* T6 VI (vcc) interface */ static driver_t vcc_driver = { "vcc", vcxgbe_methods, sizeof(struct vi_info) }; /* ifnet interface */ static void cxgbe_init(void *); -static int cxgbe_ioctl(struct ifnet *, unsigned long, caddr_t); -static int cxgbe_transmit(struct ifnet *, struct mbuf *); -static void cxgbe_qflush(struct ifnet *); +static int cxgbe_ioctl(if_t, unsigned long, caddr_t); +static int cxgbe_transmit(if_t, struct mbuf *); +static void cxgbe_qflush(if_t); #if defined(KERN_TLS) || defined(RATELIMIT) -static int cxgbe_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, +static int cxgbe_snd_tag_alloc(if_t, union if_snd_tag_alloc_params *, struct m_snd_tag **); #endif MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4/T5 Ethernet driver and services"); /* * Correct lock order when you need to acquire multiple locks is t4_list_lock, * then ADAPTER_LOCK, then t4_uld_list_lock. */ static struct sx t4_list_lock; SLIST_HEAD(, adapter) t4_list; #ifdef TCP_OFFLOAD static struct sx t4_uld_list_lock; SLIST_HEAD(, uld_info) t4_uld_list; #endif /* * Tunables. See tweak_tunables() too. * * Each tunable is set to a default value here if it's known at compile-time. * Otherwise it is set to -n as an indication to tweak_tunables() that it should * provide a reasonable default (upto n) when the driver is loaded. * * Tunables applicable to both T4 and T5 are under hw.cxgbe. Those specific to * T5 are under hw.cxl. */ SYSCTL_NODE(_hw, OID_AUTO, cxgbe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "cxgbe(4) parameters"); SYSCTL_NODE(_hw, OID_AUTO, cxl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "cxgbe(4) T5+ parameters"); SYSCTL_NODE(_hw_cxgbe, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "cxgbe(4) TOE parameters"); /* * Number of queues for tx and rx, NIC and offload. 
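 *
 * A negative default below (e.g. -NTXQ) means "let tweak_tunables() pick a
 * reasonable value, but no more than NTXQ"; the corresponding loader tunable
 * (e.g. hw.cxgbe.ntxq="8" in loader.conf) can be used to set an explicit
 * value instead.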
*/ #define NTXQ 16 int t4_ntxq = -NTXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, ntxq, CTLFLAG_RDTUN, &t4_ntxq, 0, "Number of TX queues per port"); TUNABLE_INT("hw.cxgbe.ntxq10g", &t4_ntxq); /* Old name, undocumented */ #define NRXQ 8 int t4_nrxq = -NRXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nrxq, CTLFLAG_RDTUN, &t4_nrxq, 0, "Number of RX queues per port"); TUNABLE_INT("hw.cxgbe.nrxq10g", &t4_nrxq); /* Old name, undocumented */ #define NTXQ_VI 1 static int t4_ntxq_vi = -NTXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, ntxq_vi, CTLFLAG_RDTUN, &t4_ntxq_vi, 0, "Number of TX queues per VI"); #define NRXQ_VI 1 static int t4_nrxq_vi = -NRXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nrxq_vi, CTLFLAG_RDTUN, &t4_nrxq_vi, 0, "Number of RX queues per VI"); static int t4_rsrv_noflowq = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, rsrv_noflowq, CTLFLAG_RDTUN, &t4_rsrv_noflowq, 0, "Reserve TX queue 0 of each VI for non-flowid packets"); #if defined(TCP_OFFLOAD) || defined(RATELIMIT) #define NOFLDTXQ 8 static int t4_nofldtxq = -NOFLDTXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq, CTLFLAG_RDTUN, &t4_nofldtxq, 0, "Number of offload TX queues per port"); #define NOFLDRXQ 2 static int t4_nofldrxq = -NOFLDRXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq, CTLFLAG_RDTUN, &t4_nofldrxq, 0, "Number of offload RX queues per port"); #define NOFLDTXQ_VI 1 static int t4_nofldtxq_vi = -NOFLDTXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq_vi, CTLFLAG_RDTUN, &t4_nofldtxq_vi, 0, "Number of offload TX queues per VI"); #define NOFLDRXQ_VI 1 static int t4_nofldrxq_vi = -NOFLDRXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq_vi, CTLFLAG_RDTUN, &t4_nofldrxq_vi, 0, "Number of offload RX queues per VI"); #define TMR_IDX_OFLD 1 int t4_tmr_idx_ofld = TMR_IDX_OFLD; SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_timer_idx_ofld, CTLFLAG_RDTUN, &t4_tmr_idx_ofld, 0, "Holdoff timer index for offload queues"); #define PKTC_IDX_OFLD (-1) int t4_pktc_idx_ofld = PKTC_IDX_OFLD; SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_pktc_idx_ofld, CTLFLAG_RDTUN, &t4_pktc_idx_ofld, 0, "holdoff packet counter index for offload queues"); /* 0 means chip/fw default, non-zero number is value in microseconds */ static u_long t4_toe_keepalive_idle = 0; SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, keepalive_idle, CTLFLAG_RDTUN, &t4_toe_keepalive_idle, 0, "TOE keepalive idle timer (us)"); /* 0 means chip/fw default, non-zero number is value in microseconds */ static u_long t4_toe_keepalive_interval = 0; SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, keepalive_interval, CTLFLAG_RDTUN, &t4_toe_keepalive_interval, 0, "TOE keepalive interval timer (us)"); /* 0 means chip/fw default, non-zero number is # of keepalives before abort */ static int t4_toe_keepalive_count = 0; SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, keepalive_count, CTLFLAG_RDTUN, &t4_toe_keepalive_count, 0, "Number of TOE keepalive probes before abort"); /* 0 means chip/fw default, non-zero number is value in microseconds */ static u_long t4_toe_rexmt_min = 0; SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, rexmt_min, CTLFLAG_RDTUN, &t4_toe_rexmt_min, 0, "Minimum TOE retransmit interval (us)"); /* 0 means chip/fw default, non-zero number is value in microseconds */ static u_long t4_toe_rexmt_max = 0; SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, rexmt_max, CTLFLAG_RDTUN, &t4_toe_rexmt_max, 0, "Maximum TOE retransmit interval (us)"); /* 0 means chip/fw default, non-zero number is # of rexmt before abort */ static int t4_toe_rexmt_count = 0; SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, rexmt_count, CTLFLAG_RDTUN, &t4_toe_rexmt_count, 0, "Number of TOE retransmissions before abort"); /* -1 means chip/fw 
default, other values are raw backoff values to use */ static int t4_toe_rexmt_backoff[16] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; SYSCTL_NODE(_hw_cxgbe_toe, OID_AUTO, rexmt_backoff, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "cxgbe(4) TOE retransmit backoff values"); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 0, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[0], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 1, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[1], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 2, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[2], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 3, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[3], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 4, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[4], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 5, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[5], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 6, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[6], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 7, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[7], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 8, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[8], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 9, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[9], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 10, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[10], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 11, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[11], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 12, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[12], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 13, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[13], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 14, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[14], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 15, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[15], 0, ""); #endif #ifdef DEV_NETMAP #define NN_MAIN_VI (1 << 0) /* Native netmap on the main VI */ #define NN_EXTRA_VI (1 << 1) /* Native netmap on the extra VI(s) */ static int t4_native_netmap = NN_EXTRA_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, native_netmap, CTLFLAG_RDTUN, &t4_native_netmap, 0, "Native netmap support. bit 0 = main VI, bit 1 = extra VIs"); #define NNMTXQ 8 static int t4_nnmtxq = -NNMTXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmtxq, CTLFLAG_RDTUN, &t4_nnmtxq, 0, "Number of netmap TX queues"); #define NNMRXQ 8 static int t4_nnmrxq = -NNMRXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmrxq, CTLFLAG_RDTUN, &t4_nnmrxq, 0, "Number of netmap RX queues"); #define NNMTXQ_VI 2 static int t4_nnmtxq_vi = -NNMTXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmtxq_vi, CTLFLAG_RDTUN, &t4_nnmtxq_vi, 0, "Number of netmap TX queues per VI"); #define NNMRXQ_VI 2 static int t4_nnmrxq_vi = -NNMRXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmrxq_vi, CTLFLAG_RDTUN, &t4_nnmrxq_vi, 0, "Number of netmap RX queues per VI"); #endif /* * Holdoff parameters for ports. */ #define TMR_IDX 1 int t4_tmr_idx = TMR_IDX; SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_timer_idx, CTLFLAG_RDTUN, &t4_tmr_idx, 0, "Holdoff timer index"); TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_10G", &t4_tmr_idx); /* Old name */ #define PKTC_IDX (-1) int t4_pktc_idx = PKTC_IDX; SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_pktc_idx, CTLFLAG_RDTUN, &t4_pktc_idx, 0, "Holdoff packet counter index"); TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_10G", &t4_pktc_idx); /* Old name */ /* * Size (# of entries) of each tx and rx queue. 
*/ unsigned int t4_qsize_txq = TX_EQ_QSIZE; SYSCTL_INT(_hw_cxgbe, OID_AUTO, qsize_txq, CTLFLAG_RDTUN, &t4_qsize_txq, 0, "Number of descriptors in each TX queue"); unsigned int t4_qsize_rxq = RX_IQ_QSIZE; SYSCTL_INT(_hw_cxgbe, OID_AUTO, qsize_rxq, CTLFLAG_RDTUN, &t4_qsize_rxq, 0, "Number of descriptors in each RX queue"); /* * Interrupt types allowed (bits 0, 1, 2 = INTx, MSI, MSI-X respectively). */ int t4_intr_types = INTR_MSIX | INTR_MSI | INTR_INTX; SYSCTL_INT(_hw_cxgbe, OID_AUTO, interrupt_types, CTLFLAG_RDTUN, &t4_intr_types, 0, "Interrupt types allowed (bit 0 = INTx, 1 = MSI, 2 = MSI-X)"); /* * Configuration file. All the _CF names here are special. */ #define DEFAULT_CF "default" #define BUILTIN_CF "built-in" #define FLASH_CF "flash" #define UWIRE_CF "uwire" #define FPGA_CF "fpga" static char t4_cfg_file[32] = DEFAULT_CF; SYSCTL_STRING(_hw_cxgbe, OID_AUTO, config_file, CTLFLAG_RDTUN, t4_cfg_file, sizeof(t4_cfg_file), "Firmware configuration file"); /* * PAUSE settings (bit 0, 1, 2 = rx_pause, tx_pause, pause_autoneg respectively). * rx_pause = 1 to heed incoming PAUSE frames, 0 to ignore them. * tx_pause = 1 to emit PAUSE frames when the rx FIFO reaches its high water * mark or when signalled to do so, 0 to never emit PAUSE. * pause_autoneg = 1 means PAUSE will be negotiated if possible and the * negotiated settings will override rx_pause/tx_pause. * Otherwise rx_pause/tx_pause are applied forcibly. */ static int t4_pause_settings = PAUSE_RX | PAUSE_TX | PAUSE_AUTONEG; SYSCTL_INT(_hw_cxgbe, OID_AUTO, pause_settings, CTLFLAG_RDTUN, &t4_pause_settings, 0, "PAUSE settings (bit 0 = rx_pause, 1 = tx_pause, 2 = pause_autoneg)"); /* * Forward Error Correction settings (bit 0, 1 = RS, BASER respectively). * -1 to run with the firmware default. Same as FEC_AUTO (bit 5) * 0 to disable FEC. */ static int t4_fec = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fec, CTLFLAG_RDTUN, &t4_fec, 0, "Forward Error Correction (bit 0 = RS, bit 1 = BASER_RS)"); /* * Controls when the driver sets the FORCE_FEC bit in the L1_CFG32 that it * issues to the firmware. If the firmware doesn't support FORCE_FEC then the * driver runs as if this is set to 0. * -1 to set FORCE_FEC iff requested_fec != AUTO. Multiple FEC bits are okay. * 0 to never set FORCE_FEC. requested_fec = AUTO means use the hint from the * transceiver. Multiple FEC bits may not be okay but will be passed on to * the firmware anyway (may result in l1cfg errors with old firmwares). * 1 to always set FORCE_FEC. Multiple FEC bits are okay. requested_fec = AUTO * means set all FEC bits that are valid for the speed. */ static int t4_force_fec = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, force_fec, CTLFLAG_RDTUN, &t4_force_fec, 0, "Controls the use of FORCE_FEC bit in L1 configuration."); /* * Link autonegotiation. * -1 to run with the firmware default. * 0 to disable. * 1 to enable. */ static int t4_autoneg = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, autoneg, CTLFLAG_RDTUN, &t4_autoneg, 0, "Link autonegotiation"); /* * Firmware auto-install by driver during attach (0, 1, 2 = prohibited, allowed, * encouraged respectively). '-n' is the same as 'n' except the firmware * version used in the checks is read from the firmware bundled with the driver. */ static int t4_fw_install = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fw_install, CTLFLAG_RDTUN, &t4_fw_install, 0, "Firmware auto-install (0 = prohibited, 1 = allowed, 2 = encouraged)"); /* * ASIC features that will be used. Disable the ones you don't want so that the * chip resources aren't wasted on features that will not be used. 
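 *
 * These are all loader tunables; for example, setting
 * hw.cxgbe.toecaps_allowed="0" in /boot/loader.conf leaves the adapter
 * without TOE capabilities so no chip resources are set aside for TOE.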
*/ static int t4_nbmcaps_allowed = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nbmcaps_allowed, CTLFLAG_RDTUN, &t4_nbmcaps_allowed, 0, "Default NBM capabilities"); static int t4_linkcaps_allowed = 0; /* No DCBX, PPP, etc. by default */ SYSCTL_INT(_hw_cxgbe, OID_AUTO, linkcaps_allowed, CTLFLAG_RDTUN, &t4_linkcaps_allowed, 0, "Default link capabilities"); static int t4_switchcaps_allowed = FW_CAPS_CONFIG_SWITCH_INGRESS | FW_CAPS_CONFIG_SWITCH_EGRESS; SYSCTL_INT(_hw_cxgbe, OID_AUTO, switchcaps_allowed, CTLFLAG_RDTUN, &t4_switchcaps_allowed, 0, "Default switch capabilities"); #ifdef RATELIMIT static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC | FW_CAPS_CONFIG_NIC_HASHFILTER | FW_CAPS_CONFIG_NIC_ETHOFLD; #else static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC | FW_CAPS_CONFIG_NIC_HASHFILTER; #endif SYSCTL_INT(_hw_cxgbe, OID_AUTO, niccaps_allowed, CTLFLAG_RDTUN, &t4_niccaps_allowed, 0, "Default NIC capabilities"); static int t4_toecaps_allowed = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, toecaps_allowed, CTLFLAG_RDTUN, &t4_toecaps_allowed, 0, "Default TCP offload capabilities"); static int t4_rdmacaps_allowed = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, rdmacaps_allowed, CTLFLAG_RDTUN, &t4_rdmacaps_allowed, 0, "Default RDMA capabilities"); static int t4_cryptocaps_allowed = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, cryptocaps_allowed, CTLFLAG_RDTUN, &t4_cryptocaps_allowed, 0, "Default crypto capabilities"); static int t4_iscsicaps_allowed = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, iscsicaps_allowed, CTLFLAG_RDTUN, &t4_iscsicaps_allowed, 0, "Default iSCSI capabilities"); static int t4_fcoecaps_allowed = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fcoecaps_allowed, CTLFLAG_RDTUN, &t4_fcoecaps_allowed, 0, "Default FCoE capabilities"); static int t5_write_combine = 0; SYSCTL_INT(_hw_cxl, OID_AUTO, write_combine, CTLFLAG_RDTUN, &t5_write_combine, 0, "Use WC instead of UC for BAR2"); static int t4_num_vis = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, num_vis, CTLFLAG_RDTUN, &t4_num_vis, 0, "Number of VIs per port"); /* * PCIe Relaxed Ordering. * -1: driver should figure out a good value. * 0: disable RO. * 1: enable RO. * 2: leave RO alone. */ static int pcie_relaxed_ordering = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, pcie_relaxed_ordering, CTLFLAG_RDTUN, &pcie_relaxed_ordering, 0, "PCIe Relaxed Ordering: 0 = disable, 1 = enable, 2 = leave alone"); static int t4_panic_on_fatal_err = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, panic_on_fatal_err, CTLFLAG_RWTUN, &t4_panic_on_fatal_err, 0, "panic on fatal errors"); static int t4_reset_on_fatal_err = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, reset_on_fatal_err, CTLFLAG_RWTUN, &t4_reset_on_fatal_err, 0, "reset adapter on fatal errors"); static int t4_clock_gate_on_suspend = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, clock_gate_on_suspend, CTLFLAG_RWTUN, &t4_clock_gate_on_suspend, 0, "gate the clock on suspend"); static int t4_tx_vm_wr = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_vm_wr, CTLFLAG_RWTUN, &t4_tx_vm_wr, 0, "Use VM work requests to transmit packets."); /* * Set to non-zero to enable the attack filter. A packet that matches any of * these conditions will get dropped on ingress: * 1) IP && source address == destination address. * 2) TCP/IP && source address is not a unicast address. * 3) TCP/IP && destination address is not a unicast address. * 4) IP && source address is loopback (127.x.y.z). * 5) IP && destination address is loopback (127.x.y.z). * 6) IPv6 && source address == destination address. * 7) IPv6 && source address is not a unicast address. * 8) IPv6 && source address is loopback (::1/128). 
* 9) IPv6 && destination address is loopback (::1/128). * 10) IPv6 && source address is unspecified (::/128). * 11) IPv6 && destination address is unspecified (::/128). * 12) TCP/IPv6 && source address is multicast (ff00::/8). * 13) TCP/IPv6 && destination address is multicast (ff00::/8). */ static int t4_attack_filter = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, attack_filter, CTLFLAG_RDTUN, &t4_attack_filter, 0, "Drop suspicious traffic"); static int t4_drop_ip_fragments = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, drop_ip_fragments, CTLFLAG_RDTUN, &t4_drop_ip_fragments, 0, "Drop IP fragments"); static int t4_drop_pkts_with_l2_errors = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, drop_pkts_with_l2_errors, CTLFLAG_RDTUN, &t4_drop_pkts_with_l2_errors, 0, "Drop all frames with Layer 2 length or checksum errors"); static int t4_drop_pkts_with_l3_errors = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, drop_pkts_with_l3_errors, CTLFLAG_RDTUN, &t4_drop_pkts_with_l3_errors, 0, "Drop all frames with IP version, length, or checksum errors"); static int t4_drop_pkts_with_l4_errors = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, drop_pkts_with_l4_errors, CTLFLAG_RDTUN, &t4_drop_pkts_with_l4_errors, 0, "Drop all frames with Layer 4 length, checksum, or other errors"); #ifdef TCP_OFFLOAD /* * TOE tunables. */ static int t4_cop_managed_offloading = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, cop_managed_offloading, CTLFLAG_RDTUN, &t4_cop_managed_offloading, 0, "COP (Connection Offload Policy) controls all TOE offload"); #endif #ifdef KERN_TLS /* * This enables KERN_TLS for all adapters if set. */ static int t4_kern_tls = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, kern_tls, CTLFLAG_RDTUN, &t4_kern_tls, 0, "Enable KERN_TLS mode for T6 adapters"); SYSCTL_NODE(_hw_cxgbe, OID_AUTO, tls, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "cxgbe(4) KERN_TLS parameters"); static int t4_tls_inline_keys = 0; SYSCTL_INT(_hw_cxgbe_tls, OID_AUTO, inline_keys, CTLFLAG_RDTUN, &t4_tls_inline_keys, 0, "Always pass TLS keys in work requests (1) or attempt to store TLS keys " "in card memory."); static int t4_tls_combo_wrs = 0; SYSCTL_INT(_hw_cxgbe_tls, OID_AUTO, combo_wrs, CTLFLAG_RDTUN, &t4_tls_combo_wrs, 0, "Attempt to combine TCB field updates with TLS record work requests."); #endif /* Functions used by VIs to obtain unique MAC addresses for each VI. */ static int vi_mac_funcs[] = { FW_VI_FUNC_ETH, FW_VI_FUNC_OFLD, FW_VI_FUNC_IWARP, FW_VI_FUNC_OPENISCSI, FW_VI_FUNC_OPENFCOE, FW_VI_FUNC_FOISCSI, FW_VI_FUNC_FOFCOE, }; struct intrs_and_queues { uint16_t intr_type; /* INTx, MSI, or MSI-X */ uint16_t num_vis; /* number of VIs for each port */ uint16_t nirq; /* Total # of vectors */ uint16_t ntxq; /* # of NIC txq's for each port */ uint16_t nrxq; /* # of NIC rxq's for each port */ uint16_t nofldtxq; /* # of TOE/ETHOFLD txq's for each port */ uint16_t nofldrxq; /* # of TOE rxq's for each port */ uint16_t nnmtxq; /* # of netmap txq's */ uint16_t nnmrxq; /* # of netmap rxq's */ /* The vcxgbe/vcxl interfaces use these and not the ones above. 
*/ uint16_t ntxq_vi; /* # of NIC txq's */ uint16_t nrxq_vi; /* # of NIC rxq's */ uint16_t nofldtxq_vi; /* # of TOE txq's */ uint16_t nofldrxq_vi; /* # of TOE rxq's */ uint16_t nnmtxq_vi; /* # of netmap txq's */ uint16_t nnmrxq_vi; /* # of netmap rxq's */ }; static void setup_memwin(struct adapter *); static void position_memwin(struct adapter *, int, uint32_t); static int validate_mem_range(struct adapter *, uint32_t, uint32_t); static int fwmtype_to_hwmtype(int); static int validate_mt_off_len(struct adapter *, int, uint32_t, uint32_t, uint32_t *); static int fixup_devlog_params(struct adapter *); static int cfg_itype_and_nqueues(struct adapter *, struct intrs_and_queues *); static int contact_firmware(struct adapter *); static int partition_resources(struct adapter *); static int get_params__pre_init(struct adapter *); static int set_params__pre_init(struct adapter *); static int get_params__post_init(struct adapter *); static int set_params__post_init(struct adapter *); static void t4_set_desc(struct adapter *); static bool fixed_ifmedia(struct port_info *); static void build_medialist(struct port_info *); static void init_link_config(struct port_info *); static int fixup_link_config(struct port_info *); static int apply_link_config(struct port_info *); static int cxgbe_init_synchronized(struct vi_info *); static int cxgbe_uninit_synchronized(struct vi_info *); static int adapter_full_init(struct adapter *); static void adapter_full_uninit(struct adapter *); static int vi_full_init(struct vi_info *); static void vi_full_uninit(struct vi_info *); static int alloc_extra_vi(struct adapter *, struct port_info *, struct vi_info *); static void quiesce_txq(struct sge_txq *); static void quiesce_wrq(struct sge_wrq *); static void quiesce_iq_fl(struct adapter *, struct sge_iq *, struct sge_fl *); static void quiesce_vi(struct vi_info *); static int t4_alloc_irq(struct adapter *, struct irq *, int rid, driver_intr_t *, void *, char *); static int t4_free_irq(struct adapter *, struct irq *); static void t4_init_atid_table(struct adapter *); static void t4_free_atid_table(struct adapter *); static void get_regs(struct adapter *, struct t4_regdump *, uint8_t *); static void vi_refresh_stats(struct vi_info *); static void cxgbe_refresh_stats(struct vi_info *); static void cxgbe_tick(void *); static void vi_tick(void *); static void cxgbe_sysctls(struct port_info *); static int sysctl_int_array(SYSCTL_HANDLER_ARGS); static int sysctl_bitfield_8b(SYSCTL_HANDLER_ARGS); static int sysctl_bitfield_16b(SYSCTL_HANDLER_ARGS); static int sysctl_btphy(SYSCTL_HANDLER_ARGS); static int sysctl_noflowq(SYSCTL_HANDLER_ARGS); static int sysctl_tx_vm_wr(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS); static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS); static int sysctl_link_fec(SYSCTL_HANDLER_ARGS); static int sysctl_requested_fec(SYSCTL_HANDLER_ARGS); static int sysctl_module_fec(SYSCTL_HANDLER_ARGS); static int sysctl_autoneg(SYSCTL_HANDLER_ARGS); static int sysctl_force_fec(SYSCTL_HANDLER_ARGS); static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS); static int sysctl_temperature(SYSCTL_HANDLER_ARGS); static int sysctl_vdd(SYSCTL_HANDLER_ARGS); static int sysctl_reset_sensor(SYSCTL_HANDLER_ARGS); static int sysctl_loadavg(SYSCTL_HANDLER_ARGS); static int sysctl_cctrl(SYSCTL_HANDLER_ARGS); static int 
sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS); static int sysctl_cim_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS); static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS); static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tid_stats(SYSCTL_HANDLER_ARGS); static int sysctl_devlog(SYSCTL_HANDLER_ARGS); static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS); static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS); static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS); static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS); static int sysctl_meminfo(SYSCTL_HANDLER_ARGS); static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS); static int sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS); static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS); static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS); static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tids(SYSCTL_HANDLER_ARGS); static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tnl_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS); static int sysctl_tp_la(SYSCTL_HANDLER_ARGS); static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS); static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS); static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS); static int sysctl_cpus(SYSCTL_HANDLER_ARGS); static int sysctl_reset(SYSCTL_HANDLER_ARGS); #ifdef TCP_OFFLOAD static int sysctl_tls(SYSCTL_HANDLER_ARGS); static int sysctl_tp_tick(SYSCTL_HANDLER_ARGS); static int sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS); static int sysctl_tp_timer(SYSCTL_HANDLER_ARGS); static int sysctl_tp_shift_cnt(SYSCTL_HANDLER_ARGS); static int sysctl_tp_backoff(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_tmr_idx_ofld(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_pktc_idx_ofld(SYSCTL_HANDLER_ARGS); #endif static int get_sge_context(struct adapter *, struct t4_sge_context *); static int load_fw(struct adapter *, struct t4_data *); static int load_cfg(struct adapter *, struct t4_data *); static int load_boot(struct adapter *, struct t4_bootrom *); static int load_bootcfg(struct adapter *, struct t4_data *); static int cudbg_dump(struct adapter *, struct t4_cudbg_dump *); static void free_offload_policy(struct t4_offload_policy *); static int set_offload_policy(struct adapter *, struct t4_offload_policy *); static int read_card_mem(struct adapter *, int, struct t4_mem_range *); static int read_i2c(struct adapter *, struct t4_i2c_data *); static int clear_stats(struct adapter *, u_int); static int hold_clip_addr(struct adapter *, struct t4_clip_addr *); static int release_clip_addr(struct adapter *, struct t4_clip_addr *); #ifdef TCP_OFFLOAD static int toe_capability(struct vi_info *, bool); static int t4_deactivate_all_uld(struct adapter *); static void t4_async_event(struct adapter *); #endif #ifdef KERN_TLS static int ktls_capability(struct adapter *, bool); #endif static int mod_event(module_t, int, void *); static int notify_siblings(device_t, int); -static uint64_t vi_get_counter(struct ifnet *, ift_counter); -static uint64_t cxgbe_get_counter(struct ifnet *, ift_counter); +static uint64_t vi_get_counter(if_t, ift_counter); +static uint64_t cxgbe_get_counter(if_t, ift_counter); static void enable_vxlan_rx(struct adapter *); static void reset_adapter_task(void *, int); static void fatal_error_task(void *, int); static void dump_devlog(struct adapter *); static void dump_cim_regs(struct adapter *); static void 
dump_cimla(struct adapter *); struct { uint16_t device; char *desc; } t4_pciids[] = { {0xa000, "Chelsio Terminator 4 FPGA"}, {0x4400, "Chelsio T440-dbg"}, {0x4401, "Chelsio T420-CR"}, {0x4402, "Chelsio T422-CR"}, {0x4403, "Chelsio T440-CR"}, {0x4404, "Chelsio T420-BCH"}, {0x4405, "Chelsio T440-BCH"}, {0x4406, "Chelsio T440-CH"}, {0x4407, "Chelsio T420-SO"}, {0x4408, "Chelsio T420-CX"}, {0x4409, "Chelsio T420-BT"}, {0x440a, "Chelsio T404-BT"}, {0x440e, "Chelsio T440-LP-CR"}, }, t5_pciids[] = { {0xb000, "Chelsio Terminator 5 FPGA"}, {0x5400, "Chelsio T580-dbg"}, {0x5401, "Chelsio T520-CR"}, /* 2 x 10G */ {0x5402, "Chelsio T522-CR"}, /* 2 x 10G, 2 X 1G */ {0x5403, "Chelsio T540-CR"}, /* 4 x 10G */ {0x5407, "Chelsio T520-SO"}, /* 2 x 10G, nomem */ {0x5409, "Chelsio T520-BT"}, /* 2 x 10GBaseT */ {0x540a, "Chelsio T504-BT"}, /* 4 x 1G */ {0x540d, "Chelsio T580-CR"}, /* 2 x 40G */ {0x540e, "Chelsio T540-LP-CR"}, /* 4 x 10G */ {0x5410, "Chelsio T580-LP-CR"}, /* 2 x 40G */ {0x5411, "Chelsio T520-LL-CR"}, /* 2 x 10G */ {0x5412, "Chelsio T560-CR"}, /* 1 x 40G, 2 x 10G */ {0x5414, "Chelsio T580-LP-SO-CR"}, /* 2 x 40G, nomem */ {0x5415, "Chelsio T502-BT"}, /* 2 x 1G */ {0x5418, "Chelsio T540-BT"}, /* 4 x 10GBaseT */ {0x5419, "Chelsio T540-LP-BT"}, /* 4 x 10GBaseT */ {0x541a, "Chelsio T540-SO-BT"}, /* 4 x 10GBaseT, nomem */ {0x541b, "Chelsio T540-SO-CR"}, /* 4 x 10G, nomem */ /* Custom */ {0x5483, "Custom T540-CR"}, {0x5484, "Custom T540-BT"}, }, t6_pciids[] = { {0xc006, "Chelsio Terminator 6 FPGA"}, /* T6 PE10K6 FPGA (PF0) */ {0x6400, "Chelsio T6-DBG-25"}, /* 2 x 10/25G, debug */ {0x6401, "Chelsio T6225-CR"}, /* 2 x 10/25G */ {0x6402, "Chelsio T6225-SO-CR"}, /* 2 x 10/25G, nomem */ {0x6403, "Chelsio T6425-CR"}, /* 4 x 10/25G */ {0x6404, "Chelsio T6425-SO-CR"}, /* 4 x 10/25G, nomem */ {0x6405, "Chelsio T6225-OCP-SO"}, /* 2 x 10/25G, nomem */ {0x6406, "Chelsio T62100-OCP-SO"}, /* 2 x 40/50/100G, nomem */ {0x6407, "Chelsio T62100-LP-CR"}, /* 2 x 40/50/100G */ {0x6408, "Chelsio T62100-SO-CR"}, /* 2 x 40/50/100G, nomem */ {0x6409, "Chelsio T6210-BT"}, /* 2 x 10GBASE-T */ {0x640d, "Chelsio T62100-CR"}, /* 2 x 40/50/100G */ {0x6410, "Chelsio T6-DBG-100"}, /* 2 x 40/50/100G, debug */ {0x6411, "Chelsio T6225-LL-CR"}, /* 2 x 10/25G */ {0x6414, "Chelsio T61100-OCP-SO"}, /* 1 x 40/50/100G, nomem */ {0x6415, "Chelsio T6201-BT"}, /* 2 x 1000BASE-T */ /* Custom */ {0x6480, "Custom T6225-CR"}, {0x6481, "Custom T62100-CR"}, {0x6482, "Custom T6225-CR"}, {0x6483, "Custom T62100-CR"}, {0x6484, "Custom T64100-CR"}, {0x6485, "Custom T6240-SO"}, {0x6486, "Custom T6225-SO-CR"}, {0x6487, "Custom T6225-CR"}, }; #ifdef TCP_OFFLOAD /* * service_iq_fl() has an iq and needs the fl. Offset of fl from the iq should * be exactly the same for both rxq and ofld_rxq. 
*/ CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq)); CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl)); #endif CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE); static int t4_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); uint8_t f = pci_get_function(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); /* Attach only to PF0 of the FPGA */ if (d == 0xa000 && f != 0) return (ENXIO); for (i = 0; i < nitems(t4_pciids); i++) { if (d == t4_pciids[i].device) { device_set_desc(dev, t4_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static int t5_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); uint8_t f = pci_get_function(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); /* Attach only to PF0 of the FPGA */ if (d == 0xb000 && f != 0) return (ENXIO); for (i = 0; i < nitems(t5_pciids); i++) { if (d == t5_pciids[i].device) { device_set_desc(dev, t5_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static int t6_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); for (i = 0; i < nitems(t6_pciids); i++) { if (d == t6_pciids[i].device) { device_set_desc(dev, t6_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static void t5_attribute_workaround(device_t dev) { device_t root_port; uint32_t v; /* * The T5 chips do not properly echo the No Snoop and Relaxed * Ordering attributes when replying to a TLP from a Root * Port. As a workaround, find the parent Root Port and * disable No Snoop and Relaxed Ordering. Note that this * affects all devices under this root port. 
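(Editorial sketch: pcie_adjust_config() used below performs a masked read-modify-write of the root port's PCIe Device Control register. A simplified equivalent with the plain config-space accessors, assuming the PCIe capability offset is located with pci_find_cap(), would look roughly like this; illustrative only.)

int pos;
uint16_t ctl;

if (pci_find_cap(root_port, PCIY_EXPRESS, &pos) == 0) {
	ctl = pci_read_config(root_port, pos + PCIER_DEVICE_CTL, 2);
	ctl &= ~(PCIEM_CTL_RELAXED_ORD_ENABLE | PCIEM_CTL_NOSNOOP_ENABLE);
	pci_write_config(root_port, pos + PCIER_DEVICE_CTL, ctl, 2);
}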
*/ root_port = pci_find_pcie_root_port(dev); if (root_port == NULL) { device_printf(dev, "Unable to find parent root port\n"); return; } v = pcie_adjust_config(root_port, PCIER_DEVICE_CTL, PCIEM_CTL_RELAXED_ORD_ENABLE | PCIEM_CTL_NOSNOOP_ENABLE, 0, 2); if ((v & (PCIEM_CTL_RELAXED_ORD_ENABLE | PCIEM_CTL_NOSNOOP_ENABLE)) != 0) device_printf(dev, "Disabled No Snoop/Relaxed Ordering on %s\n", device_get_nameunit(root_port)); } static const struct devnames devnames[] = { { .nexus_name = "t4nex", .ifnet_name = "cxgbe", .vi_ifnet_name = "vcxgbe", .pf03_drv_name = "t4iov", .vf_nexus_name = "t4vf", .vf_ifnet_name = "cxgbev" }, { .nexus_name = "t5nex", .ifnet_name = "cxl", .vi_ifnet_name = "vcxl", .pf03_drv_name = "t5iov", .vf_nexus_name = "t5vf", .vf_ifnet_name = "cxlv" }, { .nexus_name = "t6nex", .ifnet_name = "cc", .vi_ifnet_name = "vcc", .pf03_drv_name = "t6iov", .vf_nexus_name = "t6vf", .vf_ifnet_name = "ccv" } }; void t4_init_devnames(struct adapter *sc) { int id; id = chip_id(sc); if (id >= CHELSIO_T4 && id - CHELSIO_T4 < nitems(devnames)) sc->names = &devnames[id - CHELSIO_T4]; else { device_printf(sc->dev, "chip id %d is not supported.\n", id); sc->names = NULL; } } static int t4_ifnet_unit(struct adapter *sc, struct port_info *pi) { const char *parent, *name; long value; int line, unit; line = 0; parent = device_get_nameunit(sc->dev); name = sc->names->ifnet_name; while (resource_find_dev(&line, name, &unit, "at", parent) == 0) { if (resource_long_value(name, unit, "port", &value) == 0 && value == pi->port_id) return (unit); } return (-1); } static void t4_calibration(void *arg) { struct adapter *sc; struct clock_sync *cur, *nex; uint64_t hw; sbintime_t sbt; int next_up; sc = (struct adapter *)arg; KASSERT((hw_off_limits(sc) == 0), ("hw_off_limits at t4_calibration")); hw = t4_read_reg64(sc, A_SGE_TIMESTAMP_LO); sbt = sbinuptime(); cur = &sc->cal_info[sc->cal_current]; next_up = (sc->cal_current + 1) % CNT_CAL_INFO; nex = &sc->cal_info[next_up]; if (__predict_false(sc->cal_count == 0)) { /* First time in, just get the values in */ cur->hw_cur = hw; cur->sbt_cur = sbt; sc->cal_count++; goto done; } if (cur->hw_cur == hw) { /* The clock is not advancing? */ sc->cal_count = 0; atomic_store_rel_int(&cur->gen, 0); goto done; } seqc_write_begin(&nex->gen); nex->hw_prev = cur->hw_cur; nex->sbt_prev = cur->sbt_cur; nex->hw_cur = hw; nex->sbt_cur = sbt; seqc_write_end(&nex->gen); sc->cal_current = next_up; done: callout_reset_sbt_curcpu(&sc->cal_callout, SBT_1S, 0, t4_calibration, sc, C_DIRECT_EXEC); } static void t4_calibration_start(struct adapter *sc) { /* * Here if we have not done a calibration * then do so otherwise start the appropriate * timer. 
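(Editorial sketch: t4_calibration() above publishes each (hw, sbintime) sample pair under a seqc(9) generation counter. A consumer would typically use the standard lockless reader loop, along the lines of the hypothetical helper below; the driver's real timestamp-conversion code lives elsewhere, and this sketch glosses over synchronizing the read of cal_current.)

static bool
read_clock_sync_sketch(struct adapter *sc, struct clock_sync *out)
{
	struct clock_sync *cs = &sc->cal_info[sc->cal_current];
	seqc_t gen;

	do {
		gen = seqc_read(&cs->gen);
		if (gen == 0)
			return (false);	/* no calibration data yet */
		*out = *cs;		/* hw_prev/sbt_prev/hw_cur/sbt_cur */
	} while (!seqc_consistent(&cs->gen, gen));

	return (true);
}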
*/ int i; for (i = 0; i < CNT_CAL_INFO; i++) { sc->cal_info[i].gen = 0; } sc->cal_current = 0; sc->cal_count = 0; sc->cal_gen = 0; t4_calibration(sc); } static int t4_attach(device_t dev) { struct adapter *sc; int rc = 0, i, j, rqidx, tqidx, nports; struct make_dev_args mda; struct intrs_and_queues iaq; struct sge *s; uint32_t *buf; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) int ofld_tqidx; #endif #ifdef TCP_OFFLOAD int ofld_rqidx; #endif #ifdef DEV_NETMAP int nm_rqidx, nm_tqidx; #endif int num_vis; sc = device_get_softc(dev); sc->dev = dev; sysctl_ctx_init(&sc->ctx); TUNABLE_INT_FETCH("hw.cxgbe.dflags", &sc->debug_flags); if ((pci_get_device(dev) & 0xff00) == 0x5400) t5_attribute_workaround(dev); pci_enable_busmaster(dev); if (pci_find_cap(dev, PCIY_EXPRESS, &i) == 0) { uint32_t v; pci_set_max_read_req(dev, 4096); v = pci_read_config(dev, i + PCIER_DEVICE_CTL, 2); sc->params.pci.mps = 128 << ((v & PCIEM_CTL_MAX_PAYLOAD) >> 5); if (pcie_relaxed_ordering == 0 && (v & PCIEM_CTL_RELAXED_ORD_ENABLE) != 0) { v &= ~PCIEM_CTL_RELAXED_ORD_ENABLE; pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2); } else if (pcie_relaxed_ordering == 1 && (v & PCIEM_CTL_RELAXED_ORD_ENABLE) == 0) { v |= PCIEM_CTL_RELAXED_ORD_ENABLE; pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2); } } sc->sge_gts_reg = MYPF_REG(A_SGE_PF_GTS); sc->sge_kdoorbell_reg = MYPF_REG(A_SGE_PF_KDOORBELL); sc->traceq = -1; mtx_init(&sc->ifp_lock, sc->ifp_lockname, 0, MTX_DEF); snprintf(sc->ifp_lockname, sizeof(sc->ifp_lockname), "%s tracer", device_get_nameunit(dev)); snprintf(sc->lockname, sizeof(sc->lockname), "%s", device_get_nameunit(dev)); mtx_init(&sc->sc_lock, sc->lockname, 0, MTX_DEF); t4_add_adapter(sc); mtx_init(&sc->sfl_lock, "starving freelists", 0, MTX_DEF); TAILQ_INIT(&sc->sfl); callout_init_mtx(&sc->sfl_callout, &sc->sfl_lock, 0); mtx_init(&sc->reg_lock, "indirect register access", 0, MTX_DEF); sc->policy = NULL; rw_init(&sc->policy_lock, "connection offload policy"); callout_init(&sc->ktls_tick, 1); callout_init(&sc->cal_callout, 1); refcount_init(&sc->vxlan_refcount, 0); TASK_INIT(&sc->reset_task, 0, reset_adapter_task, sc); TASK_INIT(&sc->fatal_error_task, 0, fatal_error_task, sc); sc->ctrlq_oid = SYSCTL_ADD_NODE(&sc->ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO, "ctrlq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "control queues"); sc->fwq_oid = SYSCTL_ADD_NODE(&sc->ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO, "fwq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "firmware event queue"); rc = t4_map_bars_0_and_4(sc); if (rc != 0) goto done; /* error message displayed already */ memset(sc->chan_map, 0xff, sizeof(sc->chan_map)); /* Prepare the adapter for operation. */ buf = malloc(PAGE_SIZE, M_CXGBE, M_ZERO | M_WAITOK); rc = -t4_prep_adapter(sc, buf); free(buf, M_CXGBE); if (rc != 0) { device_printf(dev, "failed to prepare adapter: %d.\n", rc); goto done; } /* * This is the real PF# to which we're attaching. Works from within PCI * passthrough environments too, where pci_get_function() could return a * different PF# depending on the passthrough configuration. We need to * use the real PF# in all our communication with the firmware. */ j = t4_read_reg(sc, A_PL_WHOAMI); sc->pf = chip_id(sc) <= CHELSIO_T5 ? G_SOURCEPF(j) : G_T6_SOURCEPF(j); sc->mbox = sc->pf; t4_init_devnames(sc); if (sc->names == NULL) { rc = ENOTSUP; goto done; /* error message displayed already */ } /* * Do this really early, with the memory windows set up even before the * character device. 
The userland tool's register i/o and mem read * will work even in "recovery mode". */ setup_memwin(sc); if (t4_init_devlog_params(sc, 0) == 0) fixup_devlog_params(sc); make_dev_args_init(&mda); mda.mda_devsw = &t4_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = sc; rc = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev)); if (rc != 0) device_printf(dev, "failed to create nexus char device: %d.\n", rc); /* Go no further if recovery mode has been requested. */ if (TUNABLE_INT_FETCH("hw.cxgbe.sos", &i) && i != 0) { device_printf(dev, "recovery mode.\n"); goto done; } #if defined(__i386__) if ((cpu_feature & CPUID_CX8) == 0) { device_printf(dev, "64 bit atomics not available.\n"); rc = ENOTSUP; goto done; } #endif /* Contact the firmware and try to become the master driver. */ rc = contact_firmware(sc); if (rc != 0) goto done; /* error message displayed already */ MPASS(sc->flags & FW_OK); rc = get_params__pre_init(sc); if (rc != 0) goto done; /* error message displayed already */ if (sc->flags & MASTER_PF) { rc = partition_resources(sc); if (rc != 0) goto done; /* error message displayed already */ t4_intr_clear(sc); } rc = get_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = set_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = t4_map_bar_2(sc); if (rc != 0) goto done; /* error message displayed already */ rc = t4_create_dma_tag(sc); if (rc != 0) goto done; /* error message displayed already */ /* * First pass over all the ports - allocate VIs and initialize some * basic parameters like mac address, port type, etc. */ for_each_port(sc, i) { struct port_info *pi; pi = malloc(sizeof(*pi), M_CXGBE, M_ZERO | M_WAITOK); sc->port[i] = pi; /* These must be set before t4_port_init */ pi->adapter = sc; pi->port_id = i; /* * XXX: vi[0] is special so we can't delay this allocation until * pi->nvi's final value is known. */ pi->vi = malloc(sizeof(struct vi_info) * t4_num_vis, M_CXGBE, M_ZERO | M_WAITOK); /* * Allocate the "main" VI and initialize parameters * like mac addr. */ rc = -t4_port_init(sc, sc->mbox, sc->pf, 0, i); if (rc != 0) { device_printf(dev, "unable to initialize port %d: %d\n", i, rc); free(pi->vi, M_CXGBE); free(pi, M_CXGBE); sc->port[i] = NULL; goto done; } if (is_bt(pi->port_type)) setbit(&sc->bt_map, pi->tx_chan); else MPASS(!isset(&sc->bt_map, pi->tx_chan)); snprintf(pi->lockname, sizeof(pi->lockname), "%sp%d", device_get_nameunit(dev), i); mtx_init(&pi->pi_lock, pi->lockname, 0, MTX_DEF); sc->chan_map[pi->tx_chan] = i; /* * The MPS counter for FCS errors doesn't work correctly on the * T6 so we use the MAC counter here. Which MAC is in use * depends on the link settings which will be known when the * link comes up. */ if (is_t6(sc)) { pi->fcs_reg = -1; } else if (is_t4(sc)) { pi->fcs_reg = PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_CRC_ERROR_L); } else { pi->fcs_reg = T5_PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_CRC_ERROR_L); } pi->fcs_base = 0; /* All VIs on this port share this media. 
*/ ifmedia_init(&pi->media, IFM_IMASK, cxgbe_media_change, cxgbe_media_status); PORT_LOCK(pi); init_link_config(pi); fixup_link_config(pi); build_medialist(pi); if (fixed_ifmedia(pi)) pi->flags |= FIXED_IFMEDIA; PORT_UNLOCK(pi); pi->dev = device_add_child(dev, sc->names->ifnet_name, t4_ifnet_unit(sc, pi)); if (pi->dev == NULL) { device_printf(dev, "failed to add device for port %d.\n", i); rc = ENXIO; goto done; } pi->vi[0].dev = pi->dev; device_set_softc(pi->dev, pi); } /* * Interrupt type, # of interrupts, # of rx/tx queues, etc. */ nports = sc->params.nports; rc = cfg_itype_and_nqueues(sc, &iaq); if (rc != 0) goto done; /* error message displayed already */ num_vis = iaq.num_vis; sc->intr_type = iaq.intr_type; sc->intr_count = iaq.nirq; s = &sc->sge; s->nrxq = nports * iaq.nrxq; s->ntxq = nports * iaq.ntxq; if (num_vis > 1) { s->nrxq += nports * (num_vis - 1) * iaq.nrxq_vi; s->ntxq += nports * (num_vis - 1) * iaq.ntxq_vi; } s->neq = s->ntxq + s->nrxq; /* the free list in an rxq is an eq */ s->neq += nports; /* ctrl queues: 1 per port */ s->niq = s->nrxq + 1; /* 1 extra for firmware event queue */ #if defined(TCP_OFFLOAD) || defined(RATELIMIT) if (is_offload(sc) || is_ethoffload(sc)) { s->nofldtxq = nports * iaq.nofldtxq; if (num_vis > 1) s->nofldtxq += nports * (num_vis - 1) * iaq.nofldtxq_vi; s->neq += s->nofldtxq; s->ofld_txq = malloc(s->nofldtxq * sizeof(struct sge_ofld_txq), M_CXGBE, M_ZERO | M_WAITOK); } #endif #ifdef TCP_OFFLOAD if (is_offload(sc)) { s->nofldrxq = nports * iaq.nofldrxq; if (num_vis > 1) s->nofldrxq += nports * (num_vis - 1) * iaq.nofldrxq_vi; s->neq += s->nofldrxq; /* free list */ s->niq += s->nofldrxq; s->ofld_rxq = malloc(s->nofldrxq * sizeof(struct sge_ofld_rxq), M_CXGBE, M_ZERO | M_WAITOK); } #endif #ifdef DEV_NETMAP s->nnmrxq = 0; s->nnmtxq = 0; if (t4_native_netmap & NN_MAIN_VI) { s->nnmrxq += nports * iaq.nnmrxq; s->nnmtxq += nports * iaq.nnmtxq; } if (num_vis > 1 && t4_native_netmap & NN_EXTRA_VI) { s->nnmrxq += nports * (num_vis - 1) * iaq.nnmrxq_vi; s->nnmtxq += nports * (num_vis - 1) * iaq.nnmtxq_vi; } s->neq += s->nnmtxq + s->nnmrxq; s->niq += s->nnmrxq; s->nm_rxq = malloc(s->nnmrxq * sizeof(struct sge_nm_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->nm_txq = malloc(s->nnmtxq * sizeof(struct sge_nm_txq), M_CXGBE, M_ZERO | M_WAITOK); #endif MPASS(s->niq <= s->iqmap_sz); MPASS(s->neq <= s->eqmap_sz); s->ctrlq = malloc(nports * sizeof(struct sge_wrq), M_CXGBE, M_ZERO | M_WAITOK); s->rxq = malloc(s->nrxq * sizeof(struct sge_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->txq = malloc(s->ntxq * sizeof(struct sge_txq), M_CXGBE, M_ZERO | M_WAITOK); s->iqmap = malloc(s->iqmap_sz * sizeof(struct sge_iq *), M_CXGBE, M_ZERO | M_WAITOK); s->eqmap = malloc(s->eqmap_sz * sizeof(struct sge_eq *), M_CXGBE, M_ZERO | M_WAITOK); sc->irq = malloc(sc->intr_count * sizeof(struct irq), M_CXGBE, M_ZERO | M_WAITOK); t4_init_l2t(sc, M_WAITOK); t4_init_smt(sc, M_WAITOK); t4_init_tx_sched(sc); t4_init_atid_table(sc); #ifdef RATELIMIT t4_init_etid_table(sc); #endif #ifdef INET6 t4_init_clip_table(sc); #endif if (sc->vres.key.size != 0) sc->key_map = vmem_create("T4TLS key map", sc->vres.key.start, sc->vres.key.size, 32, 0, M_FIRSTFIT | M_WAITOK); /* * Second pass over the ports. This time we know the number of rx and * tx queues that each port should get. 
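(Editorial note: a worked example of the index bookkeeping in this second pass, with made-up values of 2 ports, num_vis = 2, iaq.nrxq = 4 and iaq.nrxq_vi = 2; the tx and offload counters follow the same pattern.)

/*
 * Illustrative numbers only:
 *   port 0, vi 0: first_rxq = 0,  nrxq = 4
 *   port 0, vi 1: first_rxq = 4,  nrxq = 2
 *   port 1, vi 0: first_rxq = 6,  nrxq = 4
 *   port 1, vi 1: first_rxq = 10, nrxq = 2
 * rqidx ends at 12, matching s->nrxq = 2*4 + 2*(2-1)*2 from the first pass.
 */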
*/ rqidx = tqidx = 0; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) ofld_tqidx = 0; #endif #ifdef TCP_OFFLOAD ofld_rqidx = 0; #endif #ifdef DEV_NETMAP nm_rqidx = nm_tqidx = 0; #endif for_each_port(sc, i) { struct port_info *pi = sc->port[i]; struct vi_info *vi; if (pi == NULL) continue; pi->nvi = num_vis; for_each_vi(pi, j, vi) { vi->pi = pi; vi->adapter = sc; vi->first_intr = -1; vi->qsize_rxq = t4_qsize_rxq; vi->qsize_txq = t4_qsize_txq; vi->first_rxq = rqidx; vi->first_txq = tqidx; vi->tmr_idx = t4_tmr_idx; vi->pktc_idx = t4_pktc_idx; vi->nrxq = j == 0 ? iaq.nrxq : iaq.nrxq_vi; vi->ntxq = j == 0 ? iaq.ntxq : iaq.ntxq_vi; rqidx += vi->nrxq; tqidx += vi->ntxq; if (j == 0 && vi->ntxq > 1) vi->rsrv_noflowq = t4_rsrv_noflowq ? 1 : 0; else vi->rsrv_noflowq = 0; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) vi->first_ofld_txq = ofld_tqidx; vi->nofldtxq = j == 0 ? iaq.nofldtxq : iaq.nofldtxq_vi; ofld_tqidx += vi->nofldtxq; #endif #ifdef TCP_OFFLOAD vi->ofld_tmr_idx = t4_tmr_idx_ofld; vi->ofld_pktc_idx = t4_pktc_idx_ofld; vi->first_ofld_rxq = ofld_rqidx; vi->nofldrxq = j == 0 ? iaq.nofldrxq : iaq.nofldrxq_vi; ofld_rqidx += vi->nofldrxq; #endif #ifdef DEV_NETMAP vi->first_nm_rxq = nm_rqidx; vi->first_nm_txq = nm_tqidx; if (j == 0) { vi->nnmrxq = iaq.nnmrxq; vi->nnmtxq = iaq.nnmtxq; } else { vi->nnmrxq = iaq.nnmrxq_vi; vi->nnmtxq = iaq.nnmtxq_vi; } nm_rqidx += vi->nnmrxq; nm_tqidx += vi->nnmtxq; #endif } } rc = t4_setup_intr_handlers(sc); if (rc != 0) { device_printf(dev, "failed to setup interrupt handlers: %d\n", rc); goto done; } rc = bus_generic_probe(dev); if (rc != 0) { device_printf(dev, "failed to probe child drivers: %d\n", rc); goto done; } /* * Ensure thread-safe mailbox access (in debug builds). * * So far this was the only thread accessing the mailbox but various * ifnets and sysctls are about to be created and their handlers/ioctls * will access the mailbox from different threads. */ sc->flags |= CHK_MBOX_ACCESS; rc = bus_generic_attach(dev); if (rc != 0) { device_printf(dev, "failed to attach all child ports: %d\n", rc); goto done; } t4_calibration_start(sc); device_printf(dev, "PCIe gen%d x%d, %d ports, %d %s interrupt%s, %d eq, %d iq\n", sc->params.pci.speed, sc->params.pci.width, sc->params.nports, sc->intr_count, sc->intr_type == INTR_MSIX ? "MSI-X" : (sc->intr_type == INTR_MSI ? "MSI" : "INTx"), sc->intr_count > 1 ? "s" : "", sc->sge.neq, sc->sge.niq); t4_set_desc(sc); notify_siblings(dev, 0); done: if (rc != 0 && sc->cdev) { /* cdev was created and so cxgbetool works; recover that way. 
*/ device_printf(dev, "error during attach, adapter is now in recovery mode.\n"); rc = 0; } if (rc != 0) t4_detach_common(dev); else t4_sysctls(sc); return (rc); } static int t4_child_location(device_t bus, device_t dev, struct sbuf *sb) { struct adapter *sc; struct port_info *pi; int i; sc = device_get_softc(bus); for_each_port(sc, i) { pi = sc->port[i]; if (pi != NULL && pi->dev == dev) { sbuf_printf(sb, "port=%d", pi->port_id); break; } } return (0); } static int t4_ready(device_t dev) { struct adapter *sc; sc = device_get_softc(dev); if (sc->flags & FW_OK) return (0); return (ENXIO); } static int t4_read_port_device(device_t dev, int port, device_t *child) { struct adapter *sc; struct port_info *pi; sc = device_get_softc(dev); if (port < 0 || port >= MAX_NPORTS) return (EINVAL); pi = sc->port[port]; if (pi == NULL || pi->dev == NULL) return (ENXIO); *child = pi->dev; return (0); } static int notify_siblings(device_t dev, int detaching) { device_t sibling; int error, i; error = 0; for (i = 0; i < PCI_FUNCMAX; i++) { if (i == pci_get_function(dev)) continue; sibling = pci_find_dbsf(pci_get_domain(dev), pci_get_bus(dev), pci_get_slot(dev), i); if (sibling == NULL || !device_is_attached(sibling)) continue; if (detaching) error = T4_DETACH_CHILD(sibling); else (void)T4_ATTACH_CHILD(sibling); if (error) break; } return (error); } /* * Idempotent */ static int t4_detach(device_t dev) { int rc; rc = notify_siblings(dev, 1); if (rc) { device_printf(dev, "failed to detach sibling devices: %d\n", rc); return (rc); } return (t4_detach_common(dev)); } int t4_detach_common(device_t dev) { struct adapter *sc; struct port_info *pi; int i, rc; sc = device_get_softc(dev); #ifdef TCP_OFFLOAD rc = t4_deactivate_all_uld(sc); if (rc) { device_printf(dev, "failed to detach upper layer drivers: %d\n", rc); return (rc); } #endif if (sc->cdev) { destroy_dev(sc->cdev); sc->cdev = NULL; } sx_xlock(&t4_list_lock); SLIST_REMOVE(&t4_list, sc, adapter, link); sx_xunlock(&t4_list_lock); sc->flags &= ~CHK_MBOX_ACCESS; if (sc->flags & FULL_INIT_DONE) { if (!(sc->flags & IS_VF)) t4_intr_disable(sc); } if (device_is_attached(dev)) { rc = bus_generic_detach(dev); if (rc) { device_printf(dev, "failed to detach child devices: %d\n", rc); return (rc); } } for (i = 0; i < sc->intr_count; i++) t4_free_irq(sc, &sc->irq[i]); if ((sc->flags & (IS_VF | FW_OK)) == FW_OK) t4_free_tx_sched(sc); for (i = 0; i < MAX_NPORTS; i++) { pi = sc->port[i]; if (pi) { t4_free_vi(sc, sc->mbox, sc->pf, 0, pi->vi[0].viid); if (pi->dev) device_delete_child(dev, pi->dev); mtx_destroy(&pi->pi_lock); free(pi->vi, M_CXGBE); free(pi, M_CXGBE); } } callout_stop(&sc->cal_callout); callout_drain(&sc->cal_callout); device_delete_children(dev); sysctl_ctx_free(&sc->ctx); adapter_full_uninit(sc); if ((sc->flags & (IS_VF | FW_OK)) == FW_OK) t4_fw_bye(sc, sc->mbox); if (sc->intr_type == INTR_MSI || sc->intr_type == INTR_MSIX) pci_release_msi(dev); if (sc->regs_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->regs_rid, sc->regs_res); if (sc->udbs_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->udbs_rid, sc->udbs_res); if (sc->msix_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->msix_rid, sc->msix_res); if (sc->l2t) t4_free_l2t(sc->l2t); if (sc->smt) t4_free_smt(sc->smt); t4_free_atid_table(sc); #ifdef RATELIMIT t4_free_etid_table(sc); #endif if (sc->key_map) vmem_destroy(sc->key_map); #ifdef INET6 t4_destroy_clip_table(sc); #endif #if defined(TCP_OFFLOAD) || defined(RATELIMIT) free(sc->sge.ofld_txq, M_CXGBE); #endif #ifdef TCP_OFFLOAD 
free(sc->sge.ofld_rxq, M_CXGBE); #endif #ifdef DEV_NETMAP free(sc->sge.nm_rxq, M_CXGBE); free(sc->sge.nm_txq, M_CXGBE); #endif free(sc->irq, M_CXGBE); free(sc->sge.rxq, M_CXGBE); free(sc->sge.txq, M_CXGBE); free(sc->sge.ctrlq, M_CXGBE); free(sc->sge.iqmap, M_CXGBE); free(sc->sge.eqmap, M_CXGBE); free(sc->tids.ftid_tab, M_CXGBE); free(sc->tids.hpftid_tab, M_CXGBE); free_hftid_hash(&sc->tids); free(sc->tids.tid_tab, M_CXGBE); t4_destroy_dma_tag(sc); callout_drain(&sc->ktls_tick); callout_drain(&sc->sfl_callout); if (mtx_initialized(&sc->tids.ftid_lock)) { mtx_destroy(&sc->tids.ftid_lock); cv_destroy(&sc->tids.ftid_cv); } if (mtx_initialized(&sc->tids.atid_lock)) mtx_destroy(&sc->tids.atid_lock); if (mtx_initialized(&sc->ifp_lock)) mtx_destroy(&sc->ifp_lock); if (rw_initialized(&sc->policy_lock)) { rw_destroy(&sc->policy_lock); #ifdef TCP_OFFLOAD if (sc->policy != NULL) free_offload_policy(sc->policy); #endif } for (i = 0; i < NUM_MEMWIN; i++) { struct memwin *mw = &sc->memwin[i]; if (rw_initialized(&mw->mw_lock)) rw_destroy(&mw->mw_lock); } mtx_destroy(&sc->sfl_lock); mtx_destroy(&sc->reg_lock); mtx_destroy(&sc->sc_lock); bzero(sc, sizeof(*sc)); return (0); } static inline bool ok_to_reset(struct adapter *sc) { struct tid_info *t = &sc->tids; struct port_info *pi; struct vi_info *vi; int i, j; int caps = IFCAP_TOE | IFCAP_NETMAP | IFCAP_TXRTLMT; if (is_t6(sc)) caps |= IFCAP_TXTLS; ASSERT_SYNCHRONIZED_OP(sc); MPASS(!(sc->flags & IS_VF)); for_each_port(sc, i) { pi = sc->port[i]; for_each_vi(pi, j, vi) { - if (vi->ifp->if_capenable & caps) + if (if_getcapenable(vi->ifp) & caps) return (false); } } if (atomic_load_int(&t->tids_in_use) > 0) return (false); if (atomic_load_int(&t->stids_in_use) > 0) return (false); if (atomic_load_int(&t->atids_in_use) > 0) return (false); if (atomic_load_int(&t->ftids_in_use) > 0) return (false); if (atomic_load_int(&t->hpftids_in_use) > 0) return (false); if (atomic_load_int(&t->etids_in_use) > 0) return (false); return (true); } static inline int stop_adapter(struct adapter *sc) { if (atomic_testandset_int(&sc->error_flags, ilog2(ADAP_STOPPED))) return (1); /* Already stopped. */ return (t4_shutdown_adapter(sc)); } static int t4_suspend(device_t dev) { struct adapter *sc = device_get_softc(dev); struct port_info *pi; struct vi_info *vi; - struct ifnet *ifp; + if_t ifp; struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *wrq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #if defined(TCP_OFFLOAD) || defined(RATELIMIT) struct sge_ofld_txq *ofld_txq; #endif int rc, i, j, k; CH_ALERT(sc, "suspend requested\n"); rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4sus"); if (rc != 0) return (ENXIO); /* XXX: Can the kernel call suspend repeatedly without resume? */ MPASS(!hw_off_limits(sc)); if (!ok_to_reset(sc)) { /* XXX: should list what resource is preventing suspend. */ CH_ERR(sc, "not safe to suspend.\n"); rc = EBUSY; goto done; } /* No more DMA or interrupts. */ stop_adapter(sc); /* Quiesce all activity. */ for_each_port(sc, i) { pi = sc->port[i]; pi->vxlan_tcam_entry = false; PORT_LOCK(pi); if (pi->up_vis > 0) { /* * t4_shutdown_adapter has already shut down all the * PHYs but it also disables interrupts and DMA so there * won't be a link interrupt. So we update the state * manually and inform the kernel. 
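(Editorial sketch: the manual update mentioned above is delivered to the network stack by t4_os_link_changed(), whose effect is conceptually equivalent to the loop below; the real routine also refreshes baudrate and other per-VI state, so treat this as an approximation.)

for_each_vi(pi, j, vi) {
	if (vi->ifp != NULL)
		if_link_state_change(vi->ifp,
		    pi->link_cfg.link_ok ? LINK_STATE_UP : LINK_STATE_DOWN);
}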
*/ pi->link_cfg.link_ok = false; t4_os_link_changed(pi); } PORT_UNLOCK(pi); for_each_vi(pi, j, vi) { vi->xact_addr_filt = -1; mtx_lock(&vi->tick_mtx); vi->flags |= VI_SKIP_STATS; mtx_unlock(&vi->tick_mtx); if (!(vi->flags & VI_INIT_DONE)) continue; ifp = vi->ifp; - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { mtx_lock(&vi->tick_mtx); callout_stop(&vi->tick); mtx_unlock(&vi->tick_mtx); callout_drain(&vi->tick); } /* * Note that the HW is not available. */ for_each_txq(vi, k, txq) { TXQ_LOCK(txq); txq->eq.flags &= ~(EQ_ENABLED | EQ_HW_ALLOCATED); TXQ_UNLOCK(txq); } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) for_each_ofld_txq(vi, k, ofld_txq) { ofld_txq->wrq.eq.flags &= ~EQ_HW_ALLOCATED; } #endif for_each_rxq(vi, k, rxq) { rxq->iq.flags &= ~IQ_HW_ALLOCATED; } #if defined(TCP_OFFLOAD) for_each_ofld_rxq(vi, k, ofld_rxq) { ofld_rxq->iq.flags &= ~IQ_HW_ALLOCATED; } #endif quiesce_vi(vi); } if (sc->flags & FULL_INIT_DONE) { /* Control queue */ wrq = &sc->sge.ctrlq[i]; wrq->eq.flags &= ~EQ_HW_ALLOCATED; quiesce_wrq(wrq); } } if (sc->flags & FULL_INIT_DONE) { /* Firmware event queue */ sc->sge.fwq.flags &= ~IQ_HW_ALLOCATED; quiesce_iq_fl(sc, &sc->sge.fwq, NULL); } /* Stop calibration */ callout_stop(&sc->cal_callout); callout_drain(&sc->cal_callout); /* Mark the adapter totally off limits. */ mtx_lock(&sc->reg_lock); atomic_set_int(&sc->error_flags, HW_OFF_LIMITS); sc->flags &= ~(FW_OK | MASTER_PF); sc->reset_thread = NULL; mtx_unlock(&sc->reg_lock); if (t4_clock_gate_on_suspend) { t4_set_reg_field(sc, A_PMU_PART_CG_PWRMODE, F_MA_PART_CGEN | F_LE_PART_CGEN | F_EDC1_PART_CGEN | F_EDC0_PART_CGEN | F_TP_PART_CGEN | F_PDP_PART_CGEN | F_SGE_PART_CGEN, 0); } CH_ALERT(sc, "suspend completed.\n"); done: end_synchronized_op(sc, 0); return (rc); } struct adapter_pre_reset_state { u_int flags; uint16_t nbmcaps; uint16_t linkcaps; uint16_t switchcaps; uint16_t niccaps; uint16_t toecaps; uint16_t rdmacaps; uint16_t cryptocaps; uint16_t iscsicaps; uint16_t fcoecaps; u_int cfcsum; char cfg_file[32]; struct adapter_params params; struct t4_virt_res vres; struct tid_info tids; struct sge sge; int rawf_base; int nrawf; }; static void save_caps_and_params(struct adapter *sc, struct adapter_pre_reset_state *o) { ASSERT_SYNCHRONIZED_OP(sc); o->flags = sc->flags; o->nbmcaps = sc->nbmcaps; o->linkcaps = sc->linkcaps; o->switchcaps = sc->switchcaps; o->niccaps = sc->niccaps; o->toecaps = sc->toecaps; o->rdmacaps = sc->rdmacaps; o->cryptocaps = sc->cryptocaps; o->iscsicaps = sc->iscsicaps; o->fcoecaps = sc->fcoecaps; o->cfcsum = sc->cfcsum; MPASS(sizeof(o->cfg_file) == sizeof(sc->cfg_file)); memcpy(o->cfg_file, sc->cfg_file, sizeof(o->cfg_file)); o->params = sc->params; o->vres = sc->vres; o->tids = sc->tids; o->sge = sc->sge; o->rawf_base = sc->rawf_base; o->nrawf = sc->nrawf; } static int compare_caps_and_params(struct adapter *sc, struct adapter_pre_reset_state *o) { int rc = 0; ASSERT_SYNCHRONIZED_OP(sc); /* Capabilities */ #define COMPARE_CAPS(c) do { \ if (o->c##caps != sc->c##caps) { \ CH_ERR(sc, "%scaps 0x%04x -> 0x%04x.\n", #c, o->c##caps, \ sc->c##caps); \ rc = EINVAL; \ } \ } while (0) COMPARE_CAPS(nbm); COMPARE_CAPS(link); COMPARE_CAPS(switch); COMPARE_CAPS(nic); COMPARE_CAPS(toe); COMPARE_CAPS(rdma); COMPARE_CAPS(crypto); COMPARE_CAPS(iscsi); COMPARE_CAPS(fcoe); #undef COMPARE_CAPS /* Firmware config file */ if (o->cfcsum != sc->cfcsum) { CH_ERR(sc, "config file %s (0x%x) -> %s (0x%x)\n", o->cfg_file, o->cfcsum, sc->cfg_file, sc->cfcsum); rc = EINVAL; } #define 
COMPARE_PARAM(p, name) do { \ if (o->p != sc->p) { \ CH_ERR(sc, #name " %d -> %d\n", o->p, sc->p); \ rc = EINVAL; \ } \ } while (0) COMPARE_PARAM(sge.iq_start, iq_start); COMPARE_PARAM(sge.eq_start, eq_start); COMPARE_PARAM(tids.ftid_base, ftid_base); COMPARE_PARAM(tids.ftid_end, ftid_end); COMPARE_PARAM(tids.nftids, nftids); COMPARE_PARAM(vres.l2t.start, l2t_start); COMPARE_PARAM(vres.l2t.size, l2t_size); COMPARE_PARAM(sge.iqmap_sz, iqmap_sz); COMPARE_PARAM(sge.eqmap_sz, eqmap_sz); COMPARE_PARAM(tids.tid_base, tid_base); COMPARE_PARAM(tids.hpftid_base, hpftid_base); COMPARE_PARAM(tids.hpftid_end, hpftid_end); COMPARE_PARAM(tids.nhpftids, nhpftids); COMPARE_PARAM(rawf_base, rawf_base); COMPARE_PARAM(nrawf, nrawf); COMPARE_PARAM(params.mps_bg_map, mps_bg_map); COMPARE_PARAM(params.filter2_wr_support, filter2_wr_support); COMPARE_PARAM(params.ulptx_memwrite_dsgl, ulptx_memwrite_dsgl); COMPARE_PARAM(params.fr_nsmr_tpte_wr_support, fr_nsmr_tpte_wr_support); COMPARE_PARAM(params.max_pkts_per_eth_tx_pkts_wr, max_pkts_per_eth_tx_pkts_wr); COMPARE_PARAM(tids.ntids, ntids); COMPARE_PARAM(tids.etid_base, etid_base); COMPARE_PARAM(tids.etid_end, etid_end); COMPARE_PARAM(tids.netids, netids); COMPARE_PARAM(params.eo_wr_cred, eo_wr_cred); COMPARE_PARAM(params.ethoffload, ethoffload); COMPARE_PARAM(tids.natids, natids); COMPARE_PARAM(tids.stid_base, stid_base); COMPARE_PARAM(vres.ddp.start, ddp_start); COMPARE_PARAM(vres.ddp.size, ddp_size); COMPARE_PARAM(params.ofldq_wr_cred, ofldq_wr_cred); COMPARE_PARAM(vres.stag.start, stag_start); COMPARE_PARAM(vres.stag.size, stag_size); COMPARE_PARAM(vres.rq.start, rq_start); COMPARE_PARAM(vres.rq.size, rq_size); COMPARE_PARAM(vres.pbl.start, pbl_start); COMPARE_PARAM(vres.pbl.size, pbl_size); COMPARE_PARAM(vres.qp.start, qp_start); COMPARE_PARAM(vres.qp.size, qp_size); COMPARE_PARAM(vres.cq.start, cq_start); COMPARE_PARAM(vres.cq.size, cq_size); COMPARE_PARAM(vres.ocq.start, ocq_start); COMPARE_PARAM(vres.ocq.size, ocq_size); COMPARE_PARAM(vres.srq.start, srq_start); COMPARE_PARAM(vres.srq.size, srq_size); COMPARE_PARAM(params.max_ordird_qp, max_ordird_qp); COMPARE_PARAM(params.max_ird_adapter, max_ird_adapter); COMPARE_PARAM(vres.iscsi.start, iscsi_start); COMPARE_PARAM(vres.iscsi.size, iscsi_size); COMPARE_PARAM(vres.key.start, key_start); COMPARE_PARAM(vres.key.size, key_size); #undef COMPARE_PARAM return (rc); } static int t4_resume(device_t dev) { struct adapter *sc = device_get_softc(dev); struct adapter_pre_reset_state *old_state = NULL; struct port_info *pi; struct vi_info *vi; - struct ifnet *ifp; + if_t ifp; struct sge_txq *txq; int rc, i, j, k; CH_ALERT(sc, "resume requested.\n"); rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4res"); if (rc != 0) return (ENXIO); MPASS(hw_off_limits(sc)); MPASS((sc->flags & FW_OK) == 0); MPASS((sc->flags & MASTER_PF) == 0); MPASS(sc->reset_thread == NULL); sc->reset_thread = curthread; /* Register access is expected to work by the time we're here. */ if (t4_read_reg(sc, A_PL_WHOAMI) == 0xffffffff) { CH_ERR(sc, "%s: can't read device registers\n", __func__); rc = ENXIO; goto done; } /* Note that HW_OFF_LIMITS is cleared a bit later. */ atomic_clear_int(&sc->error_flags, ADAP_FATAL_ERR | ADAP_STOPPED); /* Restore memory window. */ setup_memwin(sc); /* Go no further if recovery mode has been requested. 
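(Editorial note: the recovery-mode check below, like the one in t4_attach(), reads a kernel environment variable. Assuming the usual tunable mechanics, an administrator would request it as follows.)

/*
 * Illustration: set hw.cxgbe.sos="1" in /boot/loader.conf, or run
 * "kenv hw.cxgbe.sos=1" before suspend/resume, since TUNABLE_INT_FETCH
 * consults the live kernel environment.
 */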
*/ if (TUNABLE_INT_FETCH("hw.cxgbe.sos", &i) && i != 0) { CH_ALERT(sc, "recovery mode on resume.\n"); rc = 0; mtx_lock(&sc->reg_lock); atomic_clear_int(&sc->error_flags, HW_OFF_LIMITS); mtx_unlock(&sc->reg_lock); goto done; } old_state = malloc(sizeof(*old_state), M_CXGBE, M_ZERO | M_WAITOK); save_caps_and_params(sc, old_state); /* Reestablish contact with firmware and become the primary PF. */ rc = contact_firmware(sc); if (rc != 0) goto done; /* error message displayed already */ MPASS(sc->flags & FW_OK); if (sc->flags & MASTER_PF) { rc = partition_resources(sc); if (rc != 0) goto done; /* error message displayed already */ t4_intr_clear(sc); } rc = get_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = set_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = compare_caps_and_params(sc, old_state); if (rc != 0) goto done; /* error message displayed already */ for_each_port(sc, i) { pi = sc->port[i]; MPASS(pi != NULL); MPASS(pi->vi != NULL); MPASS(pi->vi[0].dev == pi->dev); rc = -t4_port_init(sc, sc->mbox, sc->pf, 0, i); if (rc != 0) { CH_ERR(sc, "failed to re-initialize port %d: %d\n", i, rc); goto done; } MPASS(sc->chan_map[pi->tx_chan] == i); PORT_LOCK(pi); fixup_link_config(pi); build_medialist(pi); PORT_UNLOCK(pi); for_each_vi(pi, j, vi) { if (IS_MAIN_VI(vi)) continue; rc = alloc_extra_vi(sc, pi, vi); if (rc != 0) { CH_ERR(vi, "failed to re-allocate extra VI: %d\n", rc); goto done; } } } /* * Interrupts and queues are about to be enabled and other threads will * want to access the hardware too. It is safe to do so. Note that * this thread is still in the middle of a synchronized_op. */ mtx_lock(&sc->reg_lock); atomic_clear_int(&sc->error_flags, HW_OFF_LIMITS); mtx_unlock(&sc->reg_lock); if (sc->flags & FULL_INIT_DONE) { rc = adapter_full_init(sc); if (rc != 0) { CH_ERR(sc, "failed to re-initialize adapter: %d\n", rc); goto done; } if (sc->vxlan_refcount > 0) enable_vxlan_rx(sc); for_each_port(sc, i) { pi = sc->port[i]; for_each_vi(pi, j, vi) { mtx_lock(&vi->tick_mtx); vi->flags &= ~VI_SKIP_STATS; mtx_unlock(&vi->tick_mtx); if (!(vi->flags & VI_INIT_DONE)) continue; rc = vi_full_init(vi); if (rc != 0) { CH_ERR(vi, "failed to re-initialize " "interface: %d\n", rc); goto done; } ifp = vi->ifp; - if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) + if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)) continue; /* * Note that we do not setup multicast addresses * in the first pass. This ensures that the * unicast DMACs for all VIs on all ports get an * MPS TCAM entry. */ rc = update_mac_settings(ifp, XGMAC_ALL & ~XGMAC_MCADDRS); if (rc != 0) { CH_ERR(vi, "failed to re-configure MAC: %d\n", rc); goto done; } rc = -t4_enable_vi(sc, sc->mbox, vi->viid, true, true); if (rc != 0) { CH_ERR(vi, "failed to re-enable VI: %d\n", rc); goto done; } for_each_txq(vi, k, txq) { TXQ_LOCK(txq); txq->eq.flags |= EQ_ENABLED; TXQ_UNLOCK(txq); } mtx_lock(&vi->tick_mtx); callout_schedule(&vi->tick, hz); mtx_unlock(&vi->tick_mtx); } PORT_LOCK(pi); if (pi->up_vis > 0) { t4_update_port_info(pi); fixup_link_config(pi); build_medialist(pi); apply_link_config(pi); if (pi->link_cfg.link_ok) t4_os_link_changed(pi); } PORT_UNLOCK(pi); } /* Now reprogram the L2 multicast addresses. 
*/ for_each_port(sc, i) { pi = sc->port[i]; for_each_vi(pi, j, vi) { if (!(vi->flags & VI_INIT_DONE)) continue; ifp = vi->ifp; - if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) + if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)) continue; rc = update_mac_settings(ifp, XGMAC_MCADDRS); if (rc != 0) { CH_ERR(vi, "failed to re-configure MCAST MACs: %d\n", rc); rc = 0; /* carry on */ } } } } /* Reset all calibration */ t4_calibration_start(sc); done: if (rc == 0) { sc->incarnation++; CH_ALERT(sc, "resume completed.\n"); } end_synchronized_op(sc, 0); free(old_state, M_CXGBE); return (rc); } static int t4_reset_prepare(device_t dev, device_t child) { struct adapter *sc = device_get_softc(dev); CH_ALERT(sc, "reset_prepare.\n"); return (0); } static int t4_reset_post(device_t dev, device_t child) { struct adapter *sc = device_get_softc(dev); CH_ALERT(sc, "reset_post.\n"); return (0); } static int reset_adapter(struct adapter *sc) { int rc, oldinc, error_flags; CH_ALERT(sc, "reset requested.\n"); rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4rst1"); if (rc != 0) return (EBUSY); if (hw_off_limits(sc)) { CH_ERR(sc, "adapter is suspended, use resume (not reset).\n"); rc = ENXIO; goto done; } if (!ok_to_reset(sc)) { /* XXX: should list what resource is preventing reset. */ CH_ERR(sc, "not safe to reset.\n"); rc = EBUSY; goto done; } done: oldinc = sc->incarnation; end_synchronized_op(sc, 0); if (rc != 0) return (rc); /* Error logged already. */ atomic_add_int(&sc->num_resets, 1); mtx_lock(&Giant); rc = BUS_RESET_CHILD(device_get_parent(sc->dev), sc->dev, 0); mtx_unlock(&Giant); if (rc != 0) CH_ERR(sc, "bus_reset_child failed: %d.\n", rc); else { rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4rst2"); if (rc != 0) return (EBUSY); error_flags = atomic_load_int(&sc->error_flags); if (sc->incarnation > oldinc && error_flags == 0) { CH_ALERT(sc, "bus_reset_child succeeded.\n"); } else { CH_ERR(sc, "adapter did not reset properly, flags " "0x%08x, error_flags 0x%08x.\n", sc->flags, error_flags); rc = ENXIO; } end_synchronized_op(sc, 0); } return (rc); } static void reset_adapter_task(void *arg, int pending) { /* XXX: t4_async_event here? 
*/ reset_adapter(arg); } static int cxgbe_probe(device_t dev) { char buf[128]; struct port_info *pi = device_get_softc(dev); snprintf(buf, sizeof(buf), "port %d", pi->port_id); device_set_desc_copy(dev, buf); return (BUS_PROBE_DEFAULT); } #define T4_CAP (IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | \ IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_JUMBO_MTU | IFCAP_LRO | \ IFCAP_VLAN_HWTSO | IFCAP_LINKSTATE | IFCAP_HWCSUM_IPV6 | IFCAP_HWSTATS | \ IFCAP_HWRXTSTMP | IFCAP_MEXTPG) #define T4_CAP_ENABLE (T4_CAP) static int cxgbe_vi_attach(device_t dev, struct vi_info *vi) { - struct ifnet *ifp; + if_t ifp; struct sbuf *sb; struct sysctl_ctx_list *ctx = &vi->ctx; struct sysctl_oid_list *children; struct pfil_head_args pa; struct adapter *sc = vi->adapter; sysctl_ctx_init(ctx); children = SYSCTL_CHILDREN(device_get_sysctl_tree(vi->dev)); vi->rxq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "rxq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "NIC rx queues"); vi->txq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "txq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "NIC tx queues"); #ifdef DEV_NETMAP vi->nm_rxq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "nm_rxq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "netmap rx queues"); vi->nm_txq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "nm_txq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "netmap tx queues"); #endif #ifdef TCP_OFFLOAD vi->ofld_rxq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "ofld_rxq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE rx queues"); #endif #if defined(TCP_OFFLOAD) || defined(RATELIMIT) vi->ofld_txq_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "ofld_txq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE/ETHOFLD tx queues"); #endif vi->xact_addr_filt = -1; mtx_init(&vi->tick_mtx, "vi tick", NULL, MTX_DEF); callout_init_mtx(&vi->tick, &vi->tick_mtx, 0); if (sc->flags & IS_VF || t4_tx_vm_wr != 0) vi->flags |= TX_USES_VM_WR; /* Allocate an ifnet and set it up */ ifp = if_alloc_dev(IFT_ETHER, dev); if (ifp == NULL) { device_printf(dev, "Cannot allocate ifnet\n"); return (ENOMEM); } vi->ifp = ifp; - ifp->if_softc = vi; + if_setsoftc(ifp, vi); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); - ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); - ifp->if_init = cxgbe_init; - ifp->if_ioctl = cxgbe_ioctl; - ifp->if_transmit = cxgbe_transmit; - ifp->if_qflush = cxgbe_qflush; + if_setinitfn(ifp, cxgbe_init); + if_setioctlfn(ifp, cxgbe_ioctl); + if_settransmitfn(ifp, cxgbe_transmit); + if_setqflushfn(ifp, cxgbe_qflush); if (vi->pi->nvi > 1 || sc->flags & IS_VF) - ifp->if_get_counter = vi_get_counter; + if_setgetcounterfn(ifp, vi_get_counter); else - ifp->if_get_counter = cxgbe_get_counter; + if_setgetcounterfn(ifp, cxgbe_get_counter); #if defined(KERN_TLS) || defined(RATELIMIT) - ifp->if_snd_tag_alloc = cxgbe_snd_tag_alloc; + if_setsndtagallocfn(ifp, cxgbe_snd_tag_alloc); #endif #ifdef RATELIMIT - ifp->if_ratelimit_query = cxgbe_ratelimit_query; + if_setratelimitqueryfn(ifp, cxgbe_ratelimit_query); #endif - ifp->if_capabilities = T4_CAP; - ifp->if_capenable = T4_CAP_ENABLE; - ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO | - CSUM_UDP_IPV6 | CSUM_TCP_IPV6; + if_setcapabilities(ifp, T4_CAP); + if_setcapenable(ifp, T4_CAP_ENABLE); + if_sethwassist(ifp, CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO | + CSUM_UDP_IPV6 | CSUM_TCP_IPV6); if (chip_id(sc) >= CHELSIO_T6) { - ifp->if_capabilities |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO; - ifp->if_capenable |= IFCAP_VXLAN_HWCSUM | 
IFCAP_VXLAN_HWTSO; - ifp->if_hwassist |= CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | + if_setcapabilitiesbit(ifp, IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO, 0); + if_setcapenablebit(ifp, IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO, 0); + if_sethwassistbits(ifp, CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP | - CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN; + CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN, 0); } #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0) - ifp->if_capabilities |= IFCAP_TOE; + if_setcapabilitiesbit(ifp, IFCAP_TOE, 0); #endif #ifdef RATELIMIT if (is_ethoffload(sc) && vi->nofldtxq != 0) { - ifp->if_capabilities |= IFCAP_TXRTLMT; - ifp->if_capenable |= IFCAP_TXRTLMT; + if_setcapabilitiesbit(ifp, IFCAP_TXRTLMT, 0); + if_setcapenablebit(ifp, IFCAP_TXRTLMT, 0); } #endif - ifp->if_hw_tsomax = IP_MAXPACKET; + if_sethwtsomax(ifp, IP_MAXPACKET); if (vi->flags & TX_USES_VM_WR) - ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_VM_TSO; + if_sethwtsomaxsegcount(ifp, TX_SGL_SEGS_VM_TSO); else - ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO; + if_sethwtsomaxsegcount(ifp, TX_SGL_SEGS_TSO); #ifdef RATELIMIT if (is_ethoffload(sc) && vi->nofldtxq != 0) - ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_EO_TSO; + if_sethwtsomaxsegcount(ifp, TX_SGL_SEGS_EO_TSO); #endif - ifp->if_hw_tsomaxsegsize = 65536; + if_sethwtsomaxsegsize(ifp, 65536); #ifdef KERN_TLS if (is_ktls(sc)) { - ifp->if_capabilities |= IFCAP_TXTLS; + if_setcapabilitiesbit(ifp, IFCAP_TXTLS, 0); if (sc->flags & KERN_TLS_ON || !is_t6(sc)) - ifp->if_capenable |= IFCAP_TXTLS; + if_setcapenablebit(ifp, IFCAP_TXTLS, 0); } #endif ether_ifattach(ifp, vi->hw_addr); #ifdef DEV_NETMAP if (vi->nnmrxq != 0) cxgbe_nm_attach(vi); #endif sb = sbuf_new_auto(); sbuf_printf(sb, "%d txq, %d rxq (NIC)", vi->ntxq, vi->nrxq); #if defined(TCP_OFFLOAD) || defined(RATELIMIT) - switch (ifp->if_capabilities & (IFCAP_TOE | IFCAP_TXRTLMT)) { + switch (if_getcapabilities(ifp) & (IFCAP_TOE | IFCAP_TXRTLMT)) { case IFCAP_TOE: sbuf_printf(sb, "; %d txq (TOE)", vi->nofldtxq); break; case IFCAP_TOE | IFCAP_TXRTLMT: sbuf_printf(sb, "; %d txq (TOE/ETHOFLD)", vi->nofldtxq); break; case IFCAP_TXRTLMT: sbuf_printf(sb, "; %d txq (ETHOFLD)", vi->nofldtxq); break; } #endif #ifdef TCP_OFFLOAD - if (ifp->if_capabilities & IFCAP_TOE) + if (if_getcapabilities(ifp) & IFCAP_TOE) sbuf_printf(sb, ", %d rxq (TOE)", vi->nofldrxq); #endif #ifdef DEV_NETMAP - if (ifp->if_capabilities & IFCAP_NETMAP) + if (if_getcapabilities(ifp) & IFCAP_NETMAP) sbuf_printf(sb, "; %d txq, %d rxq (netmap)", vi->nnmtxq, vi->nnmrxq); #endif sbuf_finish(sb); device_printf(dev, "%s\n", sbuf_data(sb)); sbuf_delete(sb); vi_sysctls(vi); pa.pa_version = PFIL_VERSION; pa.pa_flags = PFIL_IN; pa.pa_type = PFIL_TYPE_ETHERNET; - pa.pa_headname = ifp->if_xname; + pa.pa_headname = if_name(ifp); vi->pfil = pfil_head_register(&pa); return (0); } static int cxgbe_attach(device_t dev) { struct port_info *pi = device_get_softc(dev); struct adapter *sc = pi->adapter; struct vi_info *vi; int i, rc; sysctl_ctx_init(&pi->ctx); rc = cxgbe_vi_attach(dev, &pi->vi[0]); if (rc) return (rc); for_each_vi(pi, i, vi) { if (i == 0) continue; vi->dev = device_add_child(dev, sc->names->vi_ifnet_name, -1); if (vi->dev == NULL) { device_printf(dev, "failed to add VI %d\n", i); continue; } device_set_softc(vi->dev, vi); } cxgbe_sysctls(pi); bus_generic_attach(dev); return (0); } static void cxgbe_vi_detach(struct vi_info *vi) { - struct ifnet *ifp = vi->ifp; + if_t ifp = vi->ifp; if (vi->pfil != NULL) { 
pfil_head_unregister(vi->pfil); vi->pfil = NULL; } ether_ifdetach(ifp); /* Let detach proceed even if these fail. */ #ifdef DEV_NETMAP - if (ifp->if_capabilities & IFCAP_NETMAP) + if (if_getcapabilities(ifp) & IFCAP_NETMAP) cxgbe_nm_detach(vi); #endif cxgbe_uninit_synchronized(vi); callout_drain(&vi->tick); sysctl_ctx_free(&vi->ctx); vi_full_uninit(vi); if_free(vi->ifp); vi->ifp = NULL; } static int cxgbe_detach(device_t dev) { struct port_info *pi = device_get_softc(dev); struct adapter *sc = pi->adapter; int rc; /* Detach the extra VIs first. */ rc = bus_generic_detach(dev); if (rc) return (rc); device_delete_children(dev); sysctl_ctx_free(&pi->ctx); doom_vi(sc, &pi->vi[0]); if (pi->flags & HAS_TRACEQ) { sc->traceq = -1; /* cloner should not create ifnet */ t4_tracer_port_detach(sc); } cxgbe_vi_detach(&pi->vi[0]); ifmedia_removeall(&pi->media); end_synchronized_op(sc, 0); return (0); } static void cxgbe_init(void *arg) { struct vi_info *vi = arg; struct adapter *sc = vi->adapter; if (begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4init") != 0) return; cxgbe_init_synchronized(vi); end_synchronized_op(sc, 0); } static int -cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data) +cxgbe_ioctl(if_t ifp, unsigned long cmd, caddr_t data) { int rc = 0, mtu, flags; - struct vi_info *vi = ifp->if_softc; + struct vi_info *vi = if_getsoftc(ifp); struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct ifreq *ifr = (struct ifreq *)data; uint32_t mask; switch (cmd) { case SIOCSIFMTU: mtu = ifr->ifr_mtu; if (mtu < ETHERMIN || mtu > MAX_MTU) return (EINVAL); rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4mtu"); if (rc) return (rc); - ifp->if_mtu = mtu; + if_setmtu(ifp, mtu); if (vi->flags & VI_INIT_DONE) { t4_update_fl_bufsize(ifp); if (!hw_off_limits(sc) && - ifp->if_drv_flags & IFF_DRV_RUNNING) + if_getdrvflags(ifp) & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_MTU); } end_synchronized_op(sc, 0); break; case SIOCSIFFLAGS: rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4flg"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto fail; } - if (ifp->if_flags & IFF_UP) { - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + if (if_getflags(ifp) & IFF_UP) { + if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { flags = vi->if_flags; - if ((ifp->if_flags ^ flags) & + if ((if_getflags(ifp) ^ flags) & (IFF_PROMISC | IFF_ALLMULTI)) { rc = update_mac_settings(ifp, XGMAC_PROMISC | XGMAC_ALLMULTI); } } else { rc = cxgbe_init_synchronized(vi); } - vi->if_flags = ifp->if_flags; - } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + vi->if_flags = if_getflags(ifp); + } else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { rc = cxgbe_uninit_synchronized(vi); } end_synchronized_op(sc, 0); break; case SIOCADDMULTI: case SIOCDELMULTI: rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4multi"); if (rc) return (rc); - if (!hw_off_limits(sc) && ifp->if_drv_flags & IFF_DRV_RUNNING) + if (!hw_off_limits(sc) && if_getdrvflags(ifp) & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_MCADDRS); end_synchronized_op(sc, 0); break; case SIOCSIFCAP: rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4cap"); if (rc) return (rc); - mask = ifr->ifr_reqcap ^ ifp->if_capenable; + mask = ifr->ifr_reqcap ^ if_getcapenable(ifp); if (mask & IFCAP_TXCSUM) { - ifp->if_capenable ^= IFCAP_TXCSUM; - ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); + if_togglecapenable(ifp, IFCAP_TXCSUM); + if_togglehwassist(ifp, CSUM_TCP | CSUM_UDP | CSUM_IP); - if (IFCAP_TSO4 & ifp->if_capenable 
&& - !(IFCAP_TXCSUM & ifp->if_capenable)) { + if (IFCAP_TSO4 & if_getcapenable(ifp) && + !(IFCAP_TXCSUM & if_getcapenable(ifp))) { mask &= ~IFCAP_TSO4; - ifp->if_capenable &= ~IFCAP_TSO4; + if_setcapenablebit(ifp, 0, IFCAP_TSO4); if_printf(ifp, "tso4 disabled due to -txcsum.\n"); } } if (mask & IFCAP_TXCSUM_IPV6) { - ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; - ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); + if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6); + if_togglehwassist(ifp, CSUM_UDP_IPV6 | CSUM_TCP_IPV6); - if (IFCAP_TSO6 & ifp->if_capenable && - !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { + if (IFCAP_TSO6 & if_getcapenable(ifp) && + !(IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp))) { mask &= ~IFCAP_TSO6; - ifp->if_capenable &= ~IFCAP_TSO6; + if_setcapenablebit(ifp, 0, IFCAP_TSO6); if_printf(ifp, "tso6 disabled due to -txcsum6.\n"); } } if (mask & IFCAP_RXCSUM) - ifp->if_capenable ^= IFCAP_RXCSUM; + if_togglecapenable(ifp, IFCAP_RXCSUM); if (mask & IFCAP_RXCSUM_IPV6) - ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; + if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6); /* * Note that we leave CSUM_TSO alone (it is always set). The * kernel takes both IFCAP_TSOx and CSUM_TSO into account before * sending a TSO request our way, so it's sufficient to toggle * IFCAP_TSOx only. */ if (mask & IFCAP_TSO4) { - if (!(IFCAP_TSO4 & ifp->if_capenable) && - !(IFCAP_TXCSUM & ifp->if_capenable)) { + if (!(IFCAP_TSO4 & if_getcapenable(ifp)) && + !(IFCAP_TXCSUM & if_getcapenable(ifp))) { if_printf(ifp, "enable txcsum first.\n"); rc = EAGAIN; goto fail; } - ifp->if_capenable ^= IFCAP_TSO4; + if_togglecapenable(ifp, IFCAP_TSO4); } if (mask & IFCAP_TSO6) { - if (!(IFCAP_TSO6 & ifp->if_capenable) && - !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { + if (!(IFCAP_TSO6 & if_getcapenable(ifp)) && + !(IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp))) { if_printf(ifp, "enable txcsum6 first.\n"); rc = EAGAIN; goto fail; } - ifp->if_capenable ^= IFCAP_TSO6; + if_togglecapenable(ifp, IFCAP_TSO6); } if (mask & IFCAP_LRO) { #if defined(INET) || defined(INET6) int i; struct sge_rxq *rxq; - ifp->if_capenable ^= IFCAP_LRO; + if_togglecapenable(ifp, IFCAP_LRO); for_each_rxq(vi, i, rxq) { - if (ifp->if_capenable & IFCAP_LRO) + if (if_getcapenable(ifp) & IFCAP_LRO) rxq->iq.flags |= IQ_LRO_ENABLED; else rxq->iq.flags &= ~IQ_LRO_ENABLED; } #endif } #ifdef TCP_OFFLOAD if (mask & IFCAP_TOE) { - int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE; + int enable = (if_getcapenable(ifp) ^ mask) & IFCAP_TOE; rc = toe_capability(vi, enable); if (rc != 0) goto fail; - ifp->if_capenable ^= mask; + if_togglecapenable(ifp, mask); } #endif if (mask & IFCAP_VLAN_HWTAGGING) { - ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; - if (ifp->if_drv_flags & IFF_DRV_RUNNING) + if_togglecapenable(ifp, IFCAP_VLAN_HWTAGGING); + if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_VLANEX); } if (mask & IFCAP_VLAN_MTU) { - ifp->if_capenable ^= IFCAP_VLAN_MTU; + if_togglecapenable(ifp, IFCAP_VLAN_MTU); /* Need to find out how to disable auto-mtu-inflation */ } if (mask & IFCAP_VLAN_HWTSO) - ifp->if_capenable ^= IFCAP_VLAN_HWTSO; + if_togglecapenable(ifp, IFCAP_VLAN_HWTSO); if (mask & IFCAP_VLAN_HWCSUM) - ifp->if_capenable ^= IFCAP_VLAN_HWCSUM; + if_togglecapenable(ifp, IFCAP_VLAN_HWCSUM); #ifdef RATELIMIT if (mask & IFCAP_TXRTLMT) - ifp->if_capenable ^= IFCAP_TXRTLMT; + if_togglecapenable(ifp, IFCAP_TXRTLMT); #endif if (mask & IFCAP_HWRXTSTMP) { int i; struct sge_rxq *rxq; - ifp->if_capenable ^= IFCAP_HWRXTSTMP; + if_togglecapenable(ifp, IFCAP_HWRXTSTMP); 
for_each_rxq(vi, i, rxq) { - if (ifp->if_capenable & IFCAP_HWRXTSTMP) + if (if_getcapenable(ifp) & IFCAP_HWRXTSTMP) rxq->iq.flags |= IQ_RX_TIMESTAMP; else rxq->iq.flags &= ~IQ_RX_TIMESTAMP; } } if (mask & IFCAP_MEXTPG) - ifp->if_capenable ^= IFCAP_MEXTPG; + if_togglecapenable(ifp, IFCAP_MEXTPG); #ifdef KERN_TLS if (mask & IFCAP_TXTLS) { - int enable = (ifp->if_capenable ^ mask) & IFCAP_TXTLS; + int enable = (if_getcapenable(ifp) ^ mask) & IFCAP_TXTLS; rc = ktls_capability(sc, enable); if (rc != 0) goto fail; - ifp->if_capenable ^= (mask & IFCAP_TXTLS); + if_togglecapenable(ifp, mask & IFCAP_TXTLS); } #endif if (mask & IFCAP_VXLAN_HWCSUM) { - ifp->if_capenable ^= IFCAP_VXLAN_HWCSUM; - ifp->if_hwassist ^= CSUM_INNER_IP6_UDP | + if_togglecapenable(ifp, IFCAP_VXLAN_HWCSUM); + if_togglehwassist(ifp, CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | CSUM_INNER_IP | - CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP; + CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP); } if (mask & IFCAP_VXLAN_HWTSO) { - ifp->if_capenable ^= IFCAP_VXLAN_HWTSO; - ifp->if_hwassist ^= CSUM_INNER_IP6_TSO | - CSUM_INNER_IP_TSO; + if_togglecapenable(ifp, IFCAP_VXLAN_HWTSO); + if_togglehwassist(ifp, CSUM_INNER_IP6_TSO | + CSUM_INNER_IP_TSO); } #ifdef VLAN_CAPABILITIES VLAN_CAPABILITIES(ifp); #endif fail: end_synchronized_op(sc, 0); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: rc = ifmedia_ioctl(ifp, ifr, &pi->media, cmd); break; case SIOCGI2C: { struct ifi2creq i2c; rc = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c)); if (rc != 0) break; if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) { rc = EPERM; break; } if (i2c.len > sizeof(i2c.data)) { rc = EINVAL; break; } rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4i2c"); if (rc) return (rc); if (hw_off_limits(sc)) rc = ENXIO; else rc = -t4_i2c_rd(sc, sc->mbox, pi->port_id, i2c.dev_addr, i2c.offset, i2c.len, &i2c.data[0]); end_synchronized_op(sc, 0); if (rc == 0) rc = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c)); break; } default: rc = ether_ioctl(ifp, cmd, data); } return (rc); } static int -cxgbe_transmit(struct ifnet *ifp, struct mbuf *m) +cxgbe_transmit(if_t ifp, struct mbuf *m) { - struct vi_info *vi = ifp->if_softc; + struct vi_info *vi = if_getsoftc(ifp); struct port_info *pi = vi->pi; struct adapter *sc; struct sge_txq *txq; void *items[1]; int rc; M_ASSERTPKTHDR(m); MPASS(m->m_nextpkt == NULL); /* not quite ready for this yet */ #if defined(KERN_TLS) || defined(RATELIMIT) if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) MPASS(m->m_pkthdr.snd_tag->ifp == ifp); #endif if (__predict_false(pi->link_cfg.link_ok == false)) { m_freem(m); return (ENETDOWN); } rc = parse_pkt(&m, vi->flags & TX_USES_VM_WR); if (__predict_false(rc != 0)) { if (__predict_true(rc == EINPROGRESS)) { /* queued by parse_pkt */ MPASS(m != NULL); return (0); } MPASS(m == NULL); /* was freed already */ atomic_add_int(&pi->tx_parse_error, 1); /* rare, atomic is ok */ return (rc); } /* Select a txq. */ sc = vi->adapter; txq = &sc->sge.txq[vi->first_txq]; if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) txq += ((m->m_pkthdr.flowid % (vi->ntxq - vi->rsrv_noflowq)) + vi->rsrv_noflowq); items[0] = m; rc = mp_ring_enqueue(txq->r, items, 1, 256); if (__predict_false(rc != 0)) m_freem(m); return (rc); } static void -cxgbe_qflush(struct ifnet *ifp) +cxgbe_qflush(if_t ifp) { - struct vi_info *vi = ifp->if_softc; + struct vi_info *vi = if_getsoftc(ifp); struct sge_txq *txq; int i; /* queues do not exist if !VI_INIT_DONE. 
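(Editorial note: a worked example for the txq selection in cxgbe_transmit() above, with made-up values vi->ntxq = 8 and vi->rsrv_noflowq = 1: packets without a flowid stay on txq[first_txq], while hashed traffic lands on txq[first_txq + 1 + (flowid % 7)], keeping the first queue reserved for non-flow traffic.)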
*/ if (vi->flags & VI_INIT_DONE) { for_each_txq(vi, i, txq) { TXQ_LOCK(txq); txq->eq.flags |= EQ_QFLUSH; TXQ_UNLOCK(txq); while (!mp_ring_is_idle(txq->r)) { mp_ring_check_drainage(txq->r, 4096); pause("qflush", 1); } TXQ_LOCK(txq); txq->eq.flags &= ~EQ_QFLUSH; TXQ_UNLOCK(txq); } } if_qflush(ifp); } static uint64_t -vi_get_counter(struct ifnet *ifp, ift_counter c) +vi_get_counter(if_t ifp, ift_counter c) { - struct vi_info *vi = ifp->if_softc; + struct vi_info *vi = if_getsoftc(ifp); struct fw_vi_stats_vf *s = &vi->stats; mtx_lock(&vi->tick_mtx); vi_refresh_stats(vi); mtx_unlock(&vi->tick_mtx); switch (c) { case IFCOUNTER_IPACKETS: return (s->rx_bcast_frames + s->rx_mcast_frames + s->rx_ucast_frames); case IFCOUNTER_IERRORS: return (s->rx_err_frames); case IFCOUNTER_OPACKETS: return (s->tx_bcast_frames + s->tx_mcast_frames + s->tx_ucast_frames + s->tx_offload_frames); case IFCOUNTER_OERRORS: return (s->tx_drop_frames); case IFCOUNTER_IBYTES: return (s->rx_bcast_bytes + s->rx_mcast_bytes + s->rx_ucast_bytes); case IFCOUNTER_OBYTES: return (s->tx_bcast_bytes + s->tx_mcast_bytes + s->tx_ucast_bytes + s->tx_offload_bytes); case IFCOUNTER_IMCASTS: return (s->rx_mcast_frames); case IFCOUNTER_OMCASTS: return (s->tx_mcast_frames); case IFCOUNTER_OQDROPS: { uint64_t drops; drops = 0; if (vi->flags & VI_INIT_DONE) { int i; struct sge_txq *txq; for_each_txq(vi, i, txq) drops += counter_u64_fetch(txq->r->dropped); } return (drops); } default: return (if_get_counter_default(ifp, c)); } } static uint64_t -cxgbe_get_counter(struct ifnet *ifp, ift_counter c) +cxgbe_get_counter(if_t ifp, ift_counter c) { - struct vi_info *vi = ifp->if_softc; + struct vi_info *vi = if_getsoftc(ifp); struct port_info *pi = vi->pi; struct port_stats *s = &pi->stats; mtx_lock(&vi->tick_mtx); cxgbe_refresh_stats(vi); mtx_unlock(&vi->tick_mtx); switch (c) { case IFCOUNTER_IPACKETS: return (s->rx_frames); case IFCOUNTER_IERRORS: return (s->rx_jabber + s->rx_runt + s->rx_too_long + s->rx_fcs_err + s->rx_len_err); case IFCOUNTER_OPACKETS: return (s->tx_frames); case IFCOUNTER_OERRORS: return (s->tx_error_frames); case IFCOUNTER_IBYTES: return (s->rx_octets); case IFCOUNTER_OBYTES: return (s->tx_octets); case IFCOUNTER_IMCASTS: return (s->rx_mcast_frames); case IFCOUNTER_OMCASTS: return (s->tx_mcast_frames); case IFCOUNTER_IQDROPS: return (s->rx_ovflow0 + s->rx_ovflow1 + s->rx_ovflow2 + s->rx_ovflow3 + s->rx_trunc0 + s->rx_trunc1 + s->rx_trunc2 + s->rx_trunc3 + pi->tnl_cong_drops); case IFCOUNTER_OQDROPS: { uint64_t drops; drops = s->tx_drop; if (vi->flags & VI_INIT_DONE) { int i; struct sge_txq *txq; for_each_txq(vi, i, txq) drops += counter_u64_fetch(txq->r->dropped); } return (drops); } default: return (if_get_counter_default(ifp, c)); } } #if defined(KERN_TLS) || defined(RATELIMIT) static int -cxgbe_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, +cxgbe_snd_tag_alloc(if_t ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **pt) { int error; switch (params->hdr.type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: error = cxgbe_rate_tag_alloc(ifp, params, pt); break; #endif #ifdef KERN_TLS case IF_SND_TAG_TYPE_TLS: { - struct vi_info *vi = ifp->if_softc; + struct vi_info *vi = if_getsoftc(ifp); if (is_t6(vi->pi->adapter)) error = t6_tls_tag_alloc(ifp, params, pt); else error = EOPNOTSUPP; break; } #endif default: error = EOPNOTSUPP; } return (error); } #endif /* * The kernel picks a media from the list we had provided but we still validate * the request.
*/ int -cxgbe_media_change(struct ifnet *ifp) +cxgbe_media_change(if_t ifp) { - struct vi_info *vi = ifp->if_softc; + struct vi_info *vi = if_getsoftc(ifp); struct port_info *pi = vi->pi; struct ifmedia *ifm = &pi->media; struct link_config *lc = &pi->link_cfg; struct adapter *sc = pi->adapter; int rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mec"); if (rc != 0) return (rc); PORT_LOCK(pi); if (IFM_SUBTYPE(ifm->ifm_media) == IFM_AUTO) { /* ifconfig .. media autoselect */ if (!(lc->pcaps & FW_PORT_CAP32_ANEG)) { rc = ENOTSUP; /* AN not supported by transceiver */ goto done; } lc->requested_aneg = AUTONEG_ENABLE; lc->requested_speed = 0; lc->requested_fc |= PAUSE_AUTONEG; } else { lc->requested_aneg = AUTONEG_DISABLE; lc->requested_speed = ifmedia_baudrate(ifm->ifm_media) / 1000000; lc->requested_fc = 0; if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_RXPAUSE) lc->requested_fc |= PAUSE_RX; if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_TXPAUSE) lc->requested_fc |= PAUSE_TX; } if (pi->up_vis > 0 && !hw_off_limits(sc)) { fixup_link_config(pi); rc = apply_link_config(pi); } done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); return (rc); } /* * Base media word (without ETHER, pause, link active, etc.) for the port at the * given speed. */ static int port_mword(struct port_info *pi, uint32_t speed) { MPASS(speed & M_FW_PORT_CAP32_SPEED); MPASS(powerof2(speed)); switch(pi->port_type) { case FW_PORT_TYPE_BT_SGMII: case FW_PORT_TYPE_BT_XFI: case FW_PORT_TYPE_BT_XAUI: /* BaseT */ switch (speed) { case FW_PORT_CAP32_SPEED_100M: return (IFM_100_T); case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_T); case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_T); } break; case FW_PORT_TYPE_KX4: if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_KX4); break; case FW_PORT_TYPE_CX4: if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_CX4); break; case FW_PORT_TYPE_KX: if (speed == FW_PORT_CAP32_SPEED_1G) return (IFM_1000_KX); break; case FW_PORT_TYPE_KR: case FW_PORT_TYPE_BP_AP: case FW_PORT_TYPE_BP4_AP: case FW_PORT_TYPE_BP40_BA: case FW_PORT_TYPE_KR4_100G: case FW_PORT_TYPE_KR_SFP28: case FW_PORT_TYPE_KR_XLAUI: switch (speed) { case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_KX); case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_KR); case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_KR); case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_KR4); case FW_PORT_CAP32_SPEED_50G: return (IFM_50G_KR2); case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_KR4); } break; case FW_PORT_TYPE_FIBER_XFI: case FW_PORT_TYPE_FIBER_XAUI: case FW_PORT_TYPE_SFP: case FW_PORT_TYPE_QSFP_10G: case FW_PORT_TYPE_QSA: case FW_PORT_TYPE_QSFP: case FW_PORT_TYPE_CR4_QSFP: case FW_PORT_TYPE_CR_QSFP: case FW_PORT_TYPE_CR2_QSFP: case FW_PORT_TYPE_SFP28: /* Pluggable transceiver */ switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: switch (speed) { case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_LX); case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_LR); case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_LR); case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_LR4); case FW_PORT_CAP32_SPEED_50G: return (IFM_50G_LR2); case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_LR4); } break; case FW_PORT_MOD_TYPE_SR: switch (speed) { case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_SX); case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_SR); case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_SR); case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_SR4); case FW_PORT_CAP32_SPEED_50G: return (IFM_50G_SR2); case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_SR4); } break; case FW_PORT_MOD_TYPE_ER: if (speed == 
FW_PORT_CAP32_SPEED_10G) return (IFM_10G_ER); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: switch (speed) { case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_CX); case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_TWINAX); case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_CR); case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_CR4); case FW_PORT_CAP32_SPEED_50G: return (IFM_50G_CR2); case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_CR4); } break; case FW_PORT_MOD_TYPE_LRM: if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_LRM); break; case FW_PORT_MOD_TYPE_NA: MPASS(0); /* Not pluggable? */ /* fall throough */ case FW_PORT_MOD_TYPE_ERROR: case FW_PORT_MOD_TYPE_UNKNOWN: case FW_PORT_MOD_TYPE_NOTSUPPORTED: break; case FW_PORT_MOD_TYPE_NONE: return (IFM_NONE); } break; case FW_PORT_TYPE_NONE: return (IFM_NONE); } return (IFM_UNKNOWN); } void -cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) +cxgbe_media_status(if_t ifp, struct ifmediareq *ifmr) { - struct vi_info *vi = ifp->if_softc; + struct vi_info *vi = if_getsoftc(ifp); struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4med") != 0) return; PORT_LOCK(pi); if (pi->up_vis == 0 && !hw_off_limits(sc)) { /* * If all the interfaces are administratively down the firmware * does not report transceiver changes. Refresh port info here * so that ifconfig displays accurate ifmedia at all times. * This is the only reason we have a synchronized op in this * function. Just PORT_LOCK would have been enough otherwise. */ t4_update_port_info(pi); build_medialist(pi); } /* ifm_status */ ifmr->ifm_status = IFM_AVALID; if (lc->link_ok == false) goto done; ifmr->ifm_status |= IFM_ACTIVE; /* ifm_active */ ifmr->ifm_active = IFM_ETHER | IFM_FDX; ifmr->ifm_active &= ~(IFM_ETH_TXPAUSE | IFM_ETH_RXPAUSE); if (lc->fc & PAUSE_RX) ifmr->ifm_active |= IFM_ETH_RXPAUSE; if (lc->fc & PAUSE_TX) ifmr->ifm_active |= IFM_ETH_TXPAUSE; ifmr->ifm_active |= port_mword(pi, speed_to_fwcap(lc->speed)); done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); } static int vcxgbe_probe(device_t dev) { char buf[128]; struct vi_info *vi = device_get_softc(dev); snprintf(buf, sizeof(buf), "port %d vi %td", vi->pi->port_id, vi - vi->pi->vi); device_set_desc_copy(dev, buf); return (BUS_PROBE_DEFAULT); } static int alloc_extra_vi(struct adapter *sc, struct port_info *pi, struct vi_info *vi) { int func, index, rc; uint32_t param, val; ASSERT_SYNCHRONIZED_OP(sc); index = vi - pi->vi; MPASS(index > 0); /* This function deals with _extra_ VIs only */ KASSERT(index < nitems(vi_mac_funcs), ("%s: VI %s doesn't have a MAC func", __func__, device_get_nameunit(vi->dev))); func = vi_mac_funcs[index]; rc = t4_alloc_vi_func(sc, sc->mbox, pi->tx_chan, sc->pf, 0, 1, vi->hw_addr, &vi->rss_size, &vi->vfvld, &vi->vin, func, 0); if (rc < 0) { CH_ERR(vi, "failed to allocate virtual interface %d" "for port %d: %d\n", index, pi->port_id, -rc); return (-rc); } vi->viid = rc; if (vi->rss_size == 1) { /* * This VI didn't get a slice of the RSS table. Reduce the * number of VIs being created (hw.cxgbe.num_vis) or modify the * configuration file (nvi, rssnvi for this PF) if this is a * problem. 
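Note for the RSSINFO query a few lines below (layout inferred from the checks in this function, nothing new): the firmware packs its answer into a single 32-bit value,

	/*
	 * val[31:16]  size of this VI's slice of the RSS table (checked against
	 *             vi->rss_size)
	 * val[15:0]   base index of the slice, kept in vi->rss_base
	 */

which is why the code asserts on the high half and masks off the low half.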
*/ device_printf(vi->dev, "RSS table not available.\n"); vi->rss_base = 0xffff; return (0); } param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_RSSINFO) | V_FW_PARAMS_PARAM_YZ(vi->viid); rc = t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc) vi->rss_base = 0xffff; else { MPASS((val >> 16) == vi->rss_size); vi->rss_base = val & 0xffff; } return (0); } static int vcxgbe_attach(device_t dev) { struct vi_info *vi; struct port_info *pi; struct adapter *sc; int rc; vi = device_get_softc(dev); pi = vi->pi; sc = pi->adapter; rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4via"); if (rc) return (rc); rc = alloc_extra_vi(sc, pi, vi); end_synchronized_op(sc, 0); if (rc) return (rc); rc = cxgbe_vi_attach(dev, vi); if (rc) { t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid); return (rc); } return (0); } static int vcxgbe_detach(device_t dev) { struct vi_info *vi; struct adapter *sc; vi = device_get_softc(dev); sc = vi->adapter; doom_vi(sc, vi); cxgbe_vi_detach(vi); t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid); end_synchronized_op(sc, 0); return (0); } static struct callout fatal_callout; static struct taskqueue *reset_tq; static void delayed_panic(void *arg) { struct adapter *sc = arg; panic("%s: panic on fatal error", device_get_nameunit(sc->dev)); } static void fatal_error_task(void *arg, int pending) { struct adapter *sc = arg; int rc; #ifdef TCP_OFFLOAD t4_async_event(sc); #endif if (atomic_testandclear_int(&sc->error_flags, ilog2(ADAP_CIM_ERR))) { dump_cim_regs(sc); dump_cimla(sc); dump_devlog(sc); } if (t4_reset_on_fatal_err) { CH_ALERT(sc, "resetting on fatal error.\n"); rc = reset_adapter(sc); if (rc == 0 && t4_panic_on_fatal_err) { CH_ALERT(sc, "reset was successful, " "system will NOT panic.\n"); return; } } if (t4_panic_on_fatal_err) { CH_ALERT(sc, "panicking on fatal error (after 30s).\n"); callout_reset(&fatal_callout, hz * 30, delayed_panic, sc); } } void t4_fatal_err(struct adapter *sc, bool fw_error) { const bool verbose = (sc->debug_flags & DF_VERBOSE_SLOWINTR) != 0; stop_adapter(sc); if (atomic_testandset_int(&sc->error_flags, ilog2(ADAP_FATAL_ERR))) return; if (fw_error) { /* * We are here because of a firmware error/timeout and not * because of a hardware interrupt. It is possible (although * not very likely) that an error interrupt was also raised but * this thread ran first and inhibited t4_intr_err. We walk the * main INT_CAUSE registers here to make sure we haven't missed * anything interesting. 
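For review convenience, the combined effect of the two tunables consulted by fatal_error_task() above, as coded (no new policy):

	/*
	 * reset enabled and reset_adapter() succeeds -> no panic, driver carries on
	 * otherwise, if the panic tunable is set     -> delayed_panic() ~30s later
	 * otherwise                                  -> adapter stays stopped
	 */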
*/ t4_slow_intr_handler(sc, verbose); atomic_set_int(&sc->error_flags, ADAP_CIM_ERR); } t4_report_fw_error(sc); log(LOG_ALERT, "%s: encountered fatal error, adapter stopped (%d).\n", device_get_nameunit(sc->dev), fw_error); taskqueue_enqueue(reset_tq, &sc->fatal_error_task); } void t4_add_adapter(struct adapter *sc) { sx_xlock(&t4_list_lock); SLIST_INSERT_HEAD(&t4_list, sc, link); sx_xunlock(&t4_list_lock); } int t4_map_bars_0_and_4(struct adapter *sc) { sc->regs_rid = PCIR_BAR(0); sc->regs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->regs_rid, RF_ACTIVE); if (sc->regs_res == NULL) { device_printf(sc->dev, "cannot map registers.\n"); return (ENXIO); } sc->bt = rman_get_bustag(sc->regs_res); sc->bh = rman_get_bushandle(sc->regs_res); sc->mmio_len = rman_get_size(sc->regs_res); setbit(&sc->doorbells, DOORBELL_KDB); sc->msix_rid = PCIR_BAR(4); sc->msix_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->msix_rid, RF_ACTIVE); if (sc->msix_res == NULL) { device_printf(sc->dev, "cannot map MSI-X BAR.\n"); return (ENXIO); } return (0); } int t4_map_bar_2(struct adapter *sc) { /* * T4: only iWARP driver uses the userspace doorbells. There is no need * to map it if RDMA is disabled. */ if (is_t4(sc) && sc->rdmacaps == 0) return (0); sc->udbs_rid = PCIR_BAR(2); sc->udbs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->udbs_rid, RF_ACTIVE); if (sc->udbs_res == NULL) { device_printf(sc->dev, "cannot map doorbell BAR.\n"); return (ENXIO); } sc->udbs_base = rman_get_virtual(sc->udbs_res); if (chip_id(sc) >= CHELSIO_T5) { setbit(&sc->doorbells, DOORBELL_UDB); #if defined(__i386__) || defined(__amd64__) if (t5_write_combine) { int rc, mode; /* * Enable write combining on BAR2. This is the * userspace doorbell BAR and is split into 128B * (UDBS_SEG_SIZE) doorbell regions, each associated * with an egress queue. The first 64B has the doorbell * and the second 64B can be used to submit a tx work * request with an implicit doorbell. */ rc = pmap_change_attr((vm_offset_t)sc->udbs_base, rman_get_size(sc->udbs_res), PAT_WRITE_COMBINING); if (rc == 0) { clrbit(&sc->doorbells, DOORBELL_UDB); setbit(&sc->doorbells, DOORBELL_WCWR); setbit(&sc->doorbells, DOORBELL_UDBWC); } else { device_printf(sc->dev, "couldn't enable write combining: %d\n", rc); } mode = is_t5(sc) ? V_STATMODE(0) : V_T6_STATMODE(0); t4_write_reg(sc, A_SGE_STAT_CFG, V_STATSOURCE_T5(7) | mode); } #endif } sc->iwt.wc_en = isset(&sc->doorbells, DOORBELL_UDBWC) ? 1 : 0; return (0); } struct memwin_init { uint32_t base; uint32_t aperture; }; static const struct memwin_init t4_memwin[NUM_MEMWIN] = { { MEMWIN0_BASE, MEMWIN0_APERTURE }, { MEMWIN1_BASE, MEMWIN1_APERTURE }, { MEMWIN2_BASE_T4, MEMWIN2_APERTURE_T4 } }; static const struct memwin_init t5_memwin[NUM_MEMWIN] = { { MEMWIN0_BASE, MEMWIN0_APERTURE }, { MEMWIN1_BASE, MEMWIN1_APERTURE }, { MEMWIN2_BASE_T5, MEMWIN2_APERTURE_T5 }, }; static void setup_memwin(struct adapter *sc) { const struct memwin_init *mw_init; struct memwin *mw; int i; uint32_t bar0; if (is_t4(sc)) { /* * Read low 32b of bar0 indirectly via the hardware backdoor * mechanism. Works from within PCI passthrough environments * too, where rman_get_start() can return a different value. We * need to program the T4 memory window decoders with the actual * addresses that will be coming across the PCIe link. 
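A note on the memory-window programming just below: the WINDOW field encodes log2 of the aperture minus 10, and on T4 the programmed base must be the address seen on the PCIe link, hence the bar0 addition. Illustrative values:

	/*
	 * 64KB aperture -> V_WINDOW(ilog2(65536) - 10) = V_WINDOW(6)
	 *  4KB aperture -> V_WINDOW(ilog2(4096)  - 10) = V_WINDOW(2)
	 * T4:  base = mw_base + bar0 (absolute bus address)
	 * T5+: base = mw_base        (offset within the BAR, bar0 stays 0)
	 */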
*/ bar0 = t4_hw_pci_read_cfg4(sc, PCIR_BAR(0)); bar0 &= (uint32_t) PCIM_BAR_MEM_BASE; mw_init = &t4_memwin[0]; } else { /* T5+ use the relative offset inside the PCIe BAR */ bar0 = 0; mw_init = &t5_memwin[0]; } for (i = 0, mw = &sc->memwin[0]; i < NUM_MEMWIN; i++, mw_init++, mw++) { if (!rw_initialized(&mw->mw_lock)) { rw_init(&mw->mw_lock, "memory window access"); mw->mw_base = mw_init->base; mw->mw_aperture = mw_init->aperture; mw->mw_curpos = 0; } t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, i), (mw->mw_base + bar0) | V_BIR(0) | V_WINDOW(ilog2(mw->mw_aperture) - 10)); rw_wlock(&mw->mw_lock); position_memwin(sc, i, mw->mw_curpos); rw_wunlock(&mw->mw_lock); } /* flush */ t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, 2)); } /* * Positions the memory window at the given address in the card's address space. * There are some alignment requirements and the actual position may be at an * address prior to the requested address. mw->mw_curpos always has the actual * position of the window. */ static void position_memwin(struct adapter *sc, int idx, uint32_t addr) { struct memwin *mw; uint32_t pf; uint32_t reg; MPASS(idx >= 0 && idx < NUM_MEMWIN); mw = &sc->memwin[idx]; rw_assert(&mw->mw_lock, RA_WLOCKED); if (is_t4(sc)) { pf = 0; mw->mw_curpos = addr & ~0xf; /* start must be 16B aligned */ } else { pf = V_PFNUM(sc->pf); mw->mw_curpos = addr & ~0x7f; /* start must be 128B aligned */ } reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, idx); t4_write_reg(sc, reg, mw->mw_curpos | pf); t4_read_reg(sc, reg); /* flush */ } int rw_via_memwin(struct adapter *sc, int idx, uint32_t addr, uint32_t *val, int len, int rw) { struct memwin *mw; uint32_t mw_end, v; MPASS(idx >= 0 && idx < NUM_MEMWIN); /* Memory can only be accessed in naturally aligned 4 byte units */ if (addr & 3 || len & 3 || len <= 0) return (EINVAL); mw = &sc->memwin[idx]; while (len > 0) { rw_rlock(&mw->mw_lock); mw_end = mw->mw_curpos + mw->mw_aperture; if (addr >= mw_end || addr < mw->mw_curpos) { /* Will need to reposition the window */ if (!rw_try_upgrade(&mw->mw_lock)) { rw_runlock(&mw->mw_lock); rw_wlock(&mw->mw_lock); } rw_assert(&mw->mw_lock, RA_WLOCKED); position_memwin(sc, idx, addr); rw_downgrade(&mw->mw_lock); mw_end = mw->mw_curpos + mw->mw_aperture; } rw_assert(&mw->mw_lock, RA_RLOCKED); while (addr < mw_end && len > 0) { if (rw == 0) { v = t4_read_reg(sc, mw->mw_base + addr - mw->mw_curpos); *val++ = le32toh(v); } else { v = *val++; t4_write_reg(sc, mw->mw_base + addr - mw->mw_curpos, htole32(v)); } addr += 4; len -= 4; } rw_runlock(&mw->mw_lock); } return (0); } static void t4_init_atid_table(struct adapter *sc) { struct tid_info *t; int i; t = &sc->tids; if (t->natids == 0) return; MPASS(t->atid_tab == NULL); t->atid_tab = malloc(t->natids * sizeof(*t->atid_tab), M_CXGBE, M_ZERO | M_WAITOK); mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF); t->afree = t->atid_tab; t->atids_in_use = 0; for (i = 1; i < t->natids; i++) t->atid_tab[i - 1].next = &t->atid_tab[i]; t->atid_tab[t->natids - 1].next = NULL; } static void t4_free_atid_table(struct adapter *sc) { struct tid_info *t; t = &sc->tids; KASSERT(t->atids_in_use == 0, ("%s: %d atids still in use.", __func__, t->atids_in_use)); if (mtx_initialized(&t->atid_lock)) mtx_destroy(&t->atid_lock); free(t->atid_tab, M_CXGBE); t->atid_tab = NULL; } int alloc_atid(struct adapter *sc, void *ctx) { struct tid_info *t = &sc->tids; int atid = -1; mtx_lock(&t->atid_lock); if (t->afree) { union aopen_entry *p = t->afree; atid = p - t->atid_tab; 
MPASS(atid <= M_TID_TID); t->afree = p->next; p->data = ctx; t->atids_in_use++; } mtx_unlock(&t->atid_lock); return (atid); } void * lookup_atid(struct adapter *sc, int atid) { struct tid_info *t = &sc->tids; return (t->atid_tab[atid].data); } void free_atid(struct adapter *sc, int atid) { struct tid_info *t = &sc->tids; union aopen_entry *p = &t->atid_tab[atid]; mtx_lock(&t->atid_lock); p->next = t->afree; t->afree = p; t->atids_in_use--; mtx_unlock(&t->atid_lock); } static void queue_tid_release(struct adapter *sc, int tid) { CXGBE_UNIMPLEMENTED("deferred tid release"); } void release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq) { struct wrqe *wr; struct cpl_tid_release *req; wr = alloc_wrqe(sizeof(*req), ctrlq); if (wr == NULL) { queue_tid_release(sc, tid); /* defer */ return; } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid); t4_wrq_tx(sc, wr); } static int t4_range_cmp(const void *a, const void *b) { return ((const struct t4_range *)a)->start - ((const struct t4_range *)b)->start; } /* * Verify that the memory range specified by the addr/len pair is valid within * the card's address space. */ static int validate_mem_range(struct adapter *sc, uint32_t addr, uint32_t len) { struct t4_range mem_ranges[4], *r, *next; uint32_t em, addr_len; int i, n, remaining; /* Memory can only be accessed in naturally aligned 4 byte units */ if (addr & 3 || len & 3 || len == 0) return (EINVAL); /* Enabled memories */ em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); r = &mem_ranges[0]; n = 0; bzero(r, sizeof(mem_ranges)); if (em & F_EDRAM0_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR); r->size = G_EDRAM0_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EDRAM0_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } if (em & F_EDRAM1_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR); r->size = G_EDRAM1_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EDRAM1_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } if (em & F_EXT_MEM_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); r->size = G_EXT_MEM_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EXT_MEM_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } if (is_t5(sc) && em & F_EXT_MEM1_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); r->size = G_EXT_MEM1_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EXT_MEM1_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } MPASS(n <= nitems(mem_ranges)); if (n > 1) { /* Sort and merge the ranges. */ qsort(mem_ranges, n, sizeof(struct t4_range), t4_range_cmp); /* Start from index 0 and examine the next n - 1 entries. */ r = &mem_ranges[0]; for (remaining = n - 1; remaining > 0; remaining--, r++) { MPASS(r->size > 0); /* r is a valid entry. */ next = r + 1; MPASS(next->size > 0); /* and so is the next one. */ while (r->start + r->size >= next->start) { /* Merge the next one into the current entry. */ r->size = max(r->start + r->size, next->start + next->size) - r->start; n--; /* One fewer entry in total. */ if (--remaining == 0) goto done; /* short circuit */ next++; } if (next != r + 1) { /* * Some entries were merged into r and next * points to the first valid entry that couldn't * be merged. 
*/ MPASS(next->size > 0); /* must be valid */ memcpy(r + 1, next, remaining * sizeof(*r)); #ifdef INVARIANTS /* * This so that the foo->size assertion in the * next iteration of the loop do the right * thing for entries that were pulled up and are * no longer valid. */ MPASS(n < nitems(mem_ranges)); bzero(&mem_ranges[n], (nitems(mem_ranges) - n) * sizeof(struct t4_range)); #endif } } done: /* Done merging the ranges. */ MPASS(n > 0); r = &mem_ranges[0]; for (i = 0; i < n; i++, r++) { if (addr >= r->start && addr + len <= r->start + r->size) return (0); } } return (EFAULT); } static int fwmtype_to_hwmtype(int mtype) { switch (mtype) { case FW_MEMTYPE_EDC0: return (MEM_EDC0); case FW_MEMTYPE_EDC1: return (MEM_EDC1); case FW_MEMTYPE_EXTMEM: return (MEM_MC0); case FW_MEMTYPE_EXTMEM1: return (MEM_MC1); default: panic("%s: cannot translate fw mtype %d.", __func__, mtype); } } /* * Verify that the memory range specified by the memtype/offset/len pair is * valid and lies entirely within the memtype specified. The global address of * the start of the range is returned in addr. */ static int validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, uint32_t len, uint32_t *addr) { uint32_t em, addr_len, maddr; /* Memory can only be accessed in naturally aligned 4 byte units */ if (off & 3 || len & 3 || len == 0) return (EINVAL); em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); switch (fwmtype_to_hwmtype(mtype)) { case MEM_EDC0: if (!(em & F_EDRAM0_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR); maddr = G_EDRAM0_BASE(addr_len) << 20; break; case MEM_EDC1: if (!(em & F_EDRAM1_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR); maddr = G_EDRAM1_BASE(addr_len) << 20; break; case MEM_MC: if (!(em & F_EXT_MEM_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); maddr = G_EXT_MEM_BASE(addr_len) << 20; break; case MEM_MC1: if (!is_t5(sc) || !(em & F_EXT_MEM1_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); maddr = G_EXT_MEM1_BASE(addr_len) << 20; break; default: return (EINVAL); } *addr = maddr + off; /* global address */ return (validate_mem_range(sc, *addr, len)); } static int fixup_devlog_params(struct adapter *sc) { struct devlog_params *dparams = &sc->params.devlog; int rc; rc = validate_mt_off_len(sc, dparams->memtype, dparams->start, dparams->size, &dparams->addr); return (rc); } static void update_nirq(struct intrs_and_queues *iaq, int nports) { iaq->nirq = T4_EXTRA_INTR; iaq->nirq += nports * max(iaq->nrxq, iaq->nnmrxq); iaq->nirq += nports * iaq->nofldrxq; iaq->nirq += nports * (iaq->num_vis - 1) * max(iaq->nrxq_vi, iaq->nnmrxq_vi); iaq->nirq += nports * (iaq->num_vis - 1) * iaq->nofldrxq_vi; } /* * Adjust requirements to fit the number of interrupts available. 
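A worked example of the update_nirq() arithmetic above (queue counts are illustrative, T4_EXTRA_INTR left symbolic): a 2-port adapter with nrxq=8, nofldrxq=2, num_vis=2, nrxq_vi=2, nofldrxq_vi=1 and no netmap queues needs

	/*
	 * nirq = T4_EXTRA_INTR
	 *      + 2 * max(8, 0)             NIC rx queues, both ports
	 *      + 2 * 2                     TOE rx queues, both ports
	 *      + 2 * (2 - 1) * max(2, 0)   extra-VI NIC rx queues
	 *      + 2 * (2 - 1) * 1           extra-VI TOE rx queues
	 *      = T4_EXTRA_INTR + 26
	 */

vectors, which calculate_iaq() then checks against navail (and against powerof2() when MSI is used).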
*/ static void calculate_iaq(struct adapter *sc, struct intrs_and_queues *iaq, int itype, int navail) { int old_nirq; const int nports = sc->params.nports; MPASS(nports > 0); MPASS(navail > 0); bzero(iaq, sizeof(*iaq)); iaq->intr_type = itype; iaq->num_vis = t4_num_vis; iaq->ntxq = t4_ntxq; iaq->ntxq_vi = t4_ntxq_vi; iaq->nrxq = t4_nrxq; iaq->nrxq_vi = t4_nrxq_vi; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) if (is_offload(sc) || is_ethoffload(sc)) { iaq->nofldtxq = t4_nofldtxq; iaq->nofldtxq_vi = t4_nofldtxq_vi; } #endif #ifdef TCP_OFFLOAD if (is_offload(sc)) { iaq->nofldrxq = t4_nofldrxq; iaq->nofldrxq_vi = t4_nofldrxq_vi; } #endif #ifdef DEV_NETMAP if (t4_native_netmap & NN_MAIN_VI) { iaq->nnmtxq = t4_nnmtxq; iaq->nnmrxq = t4_nnmrxq; } if (t4_native_netmap & NN_EXTRA_VI) { iaq->nnmtxq_vi = t4_nnmtxq_vi; iaq->nnmrxq_vi = t4_nnmrxq_vi; } #endif update_nirq(iaq, nports); if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) { /* * This is the normal case -- there are enough interrupts for * everything. */ goto done; } /* * If extra VIs have been configured try reducing their count and see if * that works. */ while (iaq->num_vis > 1) { iaq->num_vis--; update_nirq(iaq, nports); if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) { device_printf(sc->dev, "virtual interfaces per port " "reduced to %d from %d. nrxq=%u, nofldrxq=%u, " "nrxq_vi=%u nofldrxq_vi=%u, nnmrxq_vi=%u. " "itype %d, navail %u, nirq %d.\n", iaq->num_vis, t4_num_vis, iaq->nrxq, iaq->nofldrxq, iaq->nrxq_vi, iaq->nofldrxq_vi, iaq->nnmrxq_vi, itype, navail, iaq->nirq); goto done; } } /* * Extra VIs will not be created. Log a message if they were requested. */ MPASS(iaq->num_vis == 1); iaq->ntxq_vi = iaq->nrxq_vi = 0; iaq->nofldtxq_vi = iaq->nofldrxq_vi = 0; iaq->nnmtxq_vi = iaq->nnmrxq_vi = 0; if (iaq->num_vis != t4_num_vis) { device_printf(sc->dev, "extra virtual interfaces disabled. " "nrxq=%u, nofldrxq=%u, nrxq_vi=%u nofldrxq_vi=%u, " "nnmrxq_vi=%u. itype %d, navail %u, nirq %d.\n", iaq->nrxq, iaq->nofldrxq, iaq->nrxq_vi, iaq->nofldrxq_vi, iaq->nnmrxq_vi, itype, navail, iaq->nirq); } /* * Keep reducing the number of NIC rx queues to the next lower power of * 2 (for even RSS distribution) and halving the TOE rx queues and see * if that works. */ do { if (iaq->nrxq > 1) { do { iaq->nrxq--; } while (!powerof2(iaq->nrxq)); if (iaq->nnmrxq > iaq->nrxq) iaq->nnmrxq = iaq->nrxq; } if (iaq->nofldrxq > 1) iaq->nofldrxq >>= 1; old_nirq = iaq->nirq; update_nirq(iaq, nports); if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) { device_printf(sc->dev, "running with reduced number of " "rx queues because of shortage of interrupts. " "nrxq=%u, nofldrxq=%u. " "itype %d, navail %u, nirq %d.\n", iaq->nrxq, iaq->nofldrxq, itype, navail, iaq->nirq); goto done; } } while (old_nirq != iaq->nirq); /* One interrupt for everything. Ugh. */ device_printf(sc->dev, "running with minimal number of queues. 
" "itype %d, navail %u.\n", itype, navail); iaq->nirq = 1; iaq->nrxq = 1; iaq->ntxq = 1; if (iaq->nofldrxq > 0) { iaq->nofldrxq = 1; iaq->nofldtxq = 1; } iaq->nnmtxq = 0; iaq->nnmrxq = 0; done: MPASS(iaq->num_vis > 0); if (iaq->num_vis > 1) { MPASS(iaq->nrxq_vi > 0); MPASS(iaq->ntxq_vi > 0); } MPASS(iaq->nirq > 0); MPASS(iaq->nrxq > 0); MPASS(iaq->ntxq > 0); if (itype == INTR_MSI) { MPASS(powerof2(iaq->nirq)); } } static int cfg_itype_and_nqueues(struct adapter *sc, struct intrs_and_queues *iaq) { int rc, itype, navail, nalloc; for (itype = INTR_MSIX; itype; itype >>= 1) { if ((itype & t4_intr_types) == 0) continue; /* not allowed */ if (itype == INTR_MSIX) navail = pci_msix_count(sc->dev); else if (itype == INTR_MSI) navail = pci_msi_count(sc->dev); else navail = 1; restart: if (navail == 0) continue; calculate_iaq(sc, iaq, itype, navail); nalloc = iaq->nirq; rc = 0; if (itype == INTR_MSIX) rc = pci_alloc_msix(sc->dev, &nalloc); else if (itype == INTR_MSI) rc = pci_alloc_msi(sc->dev, &nalloc); if (rc == 0 && nalloc > 0) { if (nalloc == iaq->nirq) return (0); /* * Didn't get the number requested. Use whatever number * the kernel is willing to allocate. */ device_printf(sc->dev, "fewer vectors than requested, " "type=%d, req=%d, rcvd=%d; will downshift req.\n", itype, iaq->nirq, nalloc); pci_release_msi(sc->dev); navail = nalloc; goto restart; } device_printf(sc->dev, "failed to allocate vectors:%d, type=%d, req=%d, rcvd=%d\n", itype, rc, iaq->nirq, nalloc); } device_printf(sc->dev, "failed to find a usable interrupt type. " "allowed=%d, msi-x=%d, msi=%d, intx=1", t4_intr_types, pci_msix_count(sc->dev), pci_msi_count(sc->dev)); return (ENXIO); } #define FW_VERSION(chip) ( \ V_FW_HDR_FW_VER_MAJOR(chip##FW_VERSION_MAJOR) | \ V_FW_HDR_FW_VER_MINOR(chip##FW_VERSION_MINOR) | \ V_FW_HDR_FW_VER_MICRO(chip##FW_VERSION_MICRO) | \ V_FW_HDR_FW_VER_BUILD(chip##FW_VERSION_BUILD)) #define FW_INTFVER(chip, intf) (chip##FW_HDR_INTFVER_##intf) /* Just enough of fw_hdr to cover all version info. */ struct fw_h { __u8 ver; __u8 chip; __be16 len512; __be32 fw_ver; __be32 tp_microcode_ver; __u8 intfver_nic; __u8 intfver_vnic; __u8 intfver_ofld; __u8 intfver_ri; __u8 intfver_iscsipdu; __u8 intfver_iscsi; __u8 intfver_fcoepdu; __u8 intfver_fcoe; }; /* Spot check a couple of fields. 
*/ CTASSERT(offsetof(struct fw_h, fw_ver) == offsetof(struct fw_hdr, fw_ver)); CTASSERT(offsetof(struct fw_h, intfver_nic) == offsetof(struct fw_hdr, intfver_nic)); CTASSERT(offsetof(struct fw_h, intfver_fcoe) == offsetof(struct fw_hdr, intfver_fcoe)); struct fw_info { uint8_t chip; char *kld_name; char *fw_mod_name; struct fw_h fw_h; } fw_info[] = { { .chip = CHELSIO_T4, .kld_name = "t4fw_cfg", .fw_mod_name = "t4fw", .fw_h = { .chip = FW_HDR_CHIP_T4, .fw_ver = htobe32(FW_VERSION(T4)), .intfver_nic = FW_INTFVER(T4, NIC), .intfver_vnic = FW_INTFVER(T4, VNIC), .intfver_ofld = FW_INTFVER(T4, OFLD), .intfver_ri = FW_INTFVER(T4, RI), .intfver_iscsipdu = FW_INTFVER(T4, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T4, ISCSI), .intfver_fcoepdu = FW_INTFVER(T4, FCOEPDU), .intfver_fcoe = FW_INTFVER(T4, FCOE), }, }, { .chip = CHELSIO_T5, .kld_name = "t5fw_cfg", .fw_mod_name = "t5fw", .fw_h = { .chip = FW_HDR_CHIP_T5, .fw_ver = htobe32(FW_VERSION(T5)), .intfver_nic = FW_INTFVER(T5, NIC), .intfver_vnic = FW_INTFVER(T5, VNIC), .intfver_ofld = FW_INTFVER(T5, OFLD), .intfver_ri = FW_INTFVER(T5, RI), .intfver_iscsipdu = FW_INTFVER(T5, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T5, ISCSI), .intfver_fcoepdu = FW_INTFVER(T5, FCOEPDU), .intfver_fcoe = FW_INTFVER(T5, FCOE), }, }, { .chip = CHELSIO_T6, .kld_name = "t6fw_cfg", .fw_mod_name = "t6fw", .fw_h = { .chip = FW_HDR_CHIP_T6, .fw_ver = htobe32(FW_VERSION(T6)), .intfver_nic = FW_INTFVER(T6, NIC), .intfver_vnic = FW_INTFVER(T6, VNIC), .intfver_ofld = FW_INTFVER(T6, OFLD), .intfver_ri = FW_INTFVER(T6, RI), .intfver_iscsipdu = FW_INTFVER(T6, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T6, ISCSI), .intfver_fcoepdu = FW_INTFVER(T6, FCOEPDU), .intfver_fcoe = FW_INTFVER(T6, FCOE), }, } }; static struct fw_info * find_fw_info(int chip) { int i; for (i = 0; i < nitems(fw_info); i++) { if (fw_info[i].chip == chip) return (&fw_info[i]); } return (NULL); } /* * Is the given firmware API compatible with the one the driver was compiled * with? */ static int fw_compatible(const struct fw_h *hdr1, const struct fw_h *hdr2) { /* short circuit if it's the exact same firmware version */ if (hdr1->chip == hdr2->chip && hdr1->fw_ver == hdr2->fw_ver) return (1); /* * XXX: Is this too conservative? Perhaps I should limit this to the * features that are supported in the driver. */ #define SAME_INTF(x) (hdr1->intfver_##x == hdr2->intfver_##x) if (hdr1->chip == hdr2->chip && SAME_INTF(nic) && SAME_INTF(vnic) && SAME_INTF(ofld) && SAME_INTF(ri) && SAME_INTF(iscsipdu) && SAME_INTF(iscsi) && SAME_INTF(fcoepdu) && SAME_INTF(fcoe)) return (1); #undef SAME_INTF return (0); } static int load_fw_module(struct adapter *sc, const struct firmware **dcfg, const struct firmware **fw) { struct fw_info *fw_info; *dcfg = NULL; if (fw != NULL) *fw = NULL; fw_info = find_fw_info(chip_id(sc)); if (fw_info == NULL) { device_printf(sc->dev, "unable to look up firmware information for chip %d.\n", chip_id(sc)); return (EINVAL); } *dcfg = firmware_get(fw_info->kld_name); if (*dcfg != NULL) { if (fw != NULL) *fw = firmware_get(fw_info->fw_mod_name); return (0); } return (ENOENT); } static void unload_fw_module(struct adapter *sc, const struct firmware *dcfg, const struct firmware *fw) { if (fw != NULL) firmware_put(fw, FIRMWARE_UNLOAD); if (dcfg != NULL) firmware_put(dcfg, FIRMWARE_UNLOAD); } /* * Return values: * 0 means no firmware install attempted. * ERESTART means a firmware install was attempted and was successful. * +ve errno means a firmware install was attempted but failed. 
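Caller-side sketch of the return contract described above; this mirrors how contact_firmware() consumes it further down and is not new code:

	rc = install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw, NULL, &already);
	if (rc == ERESTART)
		goto restart;	/* firmware was flashed, re-read the card's header */
	if (rc != 0)
		goto done;	/* an install was attempted and failed */
	/* rc == 0: nothing was installed, keep using the firmware on the card */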
*/ static int install_kld_firmware(struct adapter *sc, struct fw_h *card_fw, const struct fw_h *drv_fw, const char *reason, int *already) { const struct firmware *cfg, *fw; const uint32_t c = be32toh(card_fw->fw_ver); uint32_t d, k; int rc, fw_install; struct fw_h bundled_fw; bool load_attempted; cfg = fw = NULL; load_attempted = false; fw_install = t4_fw_install < 0 ? -t4_fw_install : t4_fw_install; memcpy(&bundled_fw, drv_fw, sizeof(bundled_fw)); if (t4_fw_install < 0) { rc = load_fw_module(sc, &cfg, &fw); if (rc != 0 || fw == NULL) { device_printf(sc->dev, "failed to load firmware module: %d. cfg %p, fw %p;" " will use compiled-in firmware version for" "hw.cxgbe.fw_install checks.\n", rc, cfg, fw); } else { memcpy(&bundled_fw, fw->data, sizeof(bundled_fw)); } load_attempted = true; } d = be32toh(bundled_fw.fw_ver); if (reason != NULL) goto install; if ((sc->flags & FW_OK) == 0) { if (c == 0xffffffff) { reason = "missing"; goto install; } rc = 0; goto done; } if (!fw_compatible(card_fw, &bundled_fw)) { reason = "incompatible or unusable"; goto install; } if (d > c) { reason = "older than the version bundled with this driver"; goto install; } if (fw_install == 2 && d != c) { reason = "different than the version bundled with this driver"; goto install; } /* No reason to do anything to the firmware already on the card. */ rc = 0; goto done; install: rc = 0; if ((*already)++) goto done; if (fw_install == 0) { device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "but the driver is prohibited from installing a firmware " "on the card.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason); goto done; } /* * We'll attempt to install a firmware. Load the module first (if it * hasn't been loaded already). */ if (!load_attempted) { rc = load_fw_module(sc, &cfg, &fw); if (rc != 0 || fw == NULL) { device_printf(sc->dev, "failed to load firmware module: %d. cfg %p, fw %p\n", rc, cfg, fw); /* carry on */ } } if (fw == NULL) { device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "but the driver cannot take corrective action because it " "is unable to load the firmware module.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason); rc = sc->flags & FW_OK ? 0 : ENOENT; goto done; } k = be32toh(((const struct fw_hdr *)fw->data)->fw_ver); if (k != d) { MPASS(t4_fw_install > 0); device_printf(sc->dev, "firmware in KLD (%u.%u.%u.%u) is not what the driver was " "expecting (%u.%u.%u.%u) and will not be used.\n", G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k), G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k), G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d), G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d)); rc = sc->flags & FW_OK ? 0 : EINVAL; goto done; } device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "installing firmware %u.%u.%u.%u on card.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason, G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d), G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d)); rc = -t4_fw_upgrade(sc, sc->mbox, fw->data, fw->datasize, 0); if (rc != 0) { device_printf(sc->dev, "failed to install firmware: %d\n", rc); } else { /* Installed successfully, update the cached header too. 
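Summary of how the absolute value of hw.cxgbe.fw_install steers the logic above (a negative setting additionally makes the firmware KLD, rather than the compiled-in header, the reference for the comparison):

	/*
	 * 0  never write firmware to the card, only complain
	 * 1  install when the card's firmware is missing, incompatible, or older
	 *    than the reference version
	 * 2  install whenever the two versions differ at all
	 */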
*/ rc = ERESTART; memcpy(card_fw, fw->data, sizeof(*card_fw)); } done: unload_fw_module(sc, cfg, fw); return (rc); } /* * Establish contact with the firmware and attempt to become the master driver. * * A firmware will be installed to the card if needed (if the driver is allowed * to do so). */ static int contact_firmware(struct adapter *sc) { int rc, already = 0; enum dev_state state; struct fw_info *fw_info; struct fw_hdr *card_fw; /* fw on the card */ const struct fw_h *drv_fw; fw_info = find_fw_info(chip_id(sc)); if (fw_info == NULL) { device_printf(sc->dev, "unable to look up firmware information for chip %d.\n", chip_id(sc)); return (EINVAL); } drv_fw = &fw_info->fw_h; /* Read the header of the firmware on the card */ card_fw = malloc(sizeof(*card_fw), M_CXGBE, M_ZERO | M_WAITOK); restart: rc = -t4_get_fw_hdr(sc, card_fw); if (rc != 0) { device_printf(sc->dev, "unable to read firmware header from card's flash: %d\n", rc); goto done; } rc = install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw, NULL, &already); if (rc == ERESTART) goto restart; if (rc != 0) goto done; rc = t4_fw_hello(sc, sc->mbox, sc->mbox, MASTER_MAY, &state); if (rc < 0 || state == DEV_STATE_ERR) { rc = -rc; device_printf(sc->dev, "failed to connect to the firmware: %d, %d. " "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW)); #if 0 if (install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw, "not responding properly to HELLO", &already) == ERESTART) goto restart; #endif goto done; } MPASS(be32toh(card_fw->flags) & FW_HDR_FLAGS_RESET_HALT); sc->flags |= FW_OK; /* The firmware responded to the FW_HELLO. */ if (rc == sc->pf) { sc->flags |= MASTER_PF; rc = install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw, NULL, &already); if (rc == ERESTART) rc = 0; else if (rc != 0) goto done; } else if (state == DEV_STATE_UNINIT) { /* * We didn't get to be the master so we definitely won't be * configuring the chip. It's a bug if someone else hasn't * configured it already. */ device_printf(sc->dev, "couldn't be master(%d), " "device not already initialized either(%d). " "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW)); rc = EPROTO; goto done; } else { /* * Some other PF is the master and has configured the chip. * This is allowed but untested. */ device_printf(sc->dev, "PF%d is master, device state %d. " "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW)); snprintf(sc->cfg_file, sizeof(sc->cfg_file), "pf%d", rc); sc->cfcsum = 0; rc = 0; } done: if (rc != 0 && sc->flags & FW_OK) { t4_fw_bye(sc, sc->mbox); sc->flags &= ~FW_OK; } free(card_fw, M_CXGBE); return (rc); } static int copy_cfg_file_to_card(struct adapter *sc, char *cfg_file, uint32_t mtype, uint32_t moff) { struct fw_info *fw_info; const struct firmware *dcfg, *rcfg = NULL; const uint32_t *cfdata; uint32_t cflen, addr; int rc; load_fw_module(sc, &dcfg, NULL); /* Card specific interpretation of "default". 
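Usage note for the profile lookup performed further down in copy_cfg_file_to_card(): a non-default hw.cxgbe.config_file is mapped to a firmware(9) module named "<kld_name>_<profile>". For example, with a hypothetical profile name "rdma" on a T6 the driver would attempt

	rcfg = firmware_get("t6fw_cfg_rdma");

and fail with ENOENT if no such module is available.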
*/ if (strncmp(cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) { if (pci_get_device(sc->dev) == 0x440a) snprintf(cfg_file, sizeof(t4_cfg_file), UWIRE_CF); if (is_fpga(sc)) snprintf(cfg_file, sizeof(t4_cfg_file), FPGA_CF); } if (strncmp(cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) { if (dcfg == NULL) { device_printf(sc->dev, "KLD with default config is not available.\n"); rc = ENOENT; goto done; } cfdata = dcfg->data; cflen = dcfg->datasize & ~3; } else { char s[32]; fw_info = find_fw_info(chip_id(sc)); if (fw_info == NULL) { device_printf(sc->dev, "unable to look up firmware information for chip %d.\n", chip_id(sc)); rc = EINVAL; goto done; } snprintf(s, sizeof(s), "%s_%s", fw_info->kld_name, cfg_file); rcfg = firmware_get(s); if (rcfg == NULL) { device_printf(sc->dev, "unable to load module \"%s\" for configuration " "profile \"%s\".\n", s, cfg_file); rc = ENOENT; goto done; } cfdata = rcfg->data; cflen = rcfg->datasize & ~3; } if (cflen > FLASH_CFG_MAX_SIZE) { device_printf(sc->dev, "config file too long (%d, max allowed is %d).\n", cflen, FLASH_CFG_MAX_SIZE); rc = EINVAL; goto done; } rc = validate_mt_off_len(sc, mtype, moff, cflen, &addr); if (rc != 0) { device_printf(sc->dev, "%s: addr (%d/0x%x) or len %d is not valid: %d.\n", __func__, mtype, moff, cflen, rc); rc = EINVAL; goto done; } write_via_memwin(sc, 2, addr, cfdata, cflen); done: if (rcfg != NULL) firmware_put(rcfg, FIRMWARE_UNLOAD); unload_fw_module(sc, dcfg, NULL); return (rc); } struct caps_allowed { uint16_t nbmcaps; uint16_t linkcaps; uint16_t switchcaps; uint16_t niccaps; uint16_t toecaps; uint16_t rdmacaps; uint16_t cryptocaps; uint16_t iscsicaps; uint16_t fcoecaps; }; #define FW_PARAM_DEV(param) \ (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | \ V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_##param)) #define FW_PARAM_PFVF(param) \ (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_PFVF) | \ V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_PFVF_##param)) /* * Provide a configuration profile to the firmware and have it initialize the * chip accordingly. This may involve uploading a configuration file to the * card. */ static int apply_cfg_and_initialize(struct adapter *sc, char *cfg_file, const struct caps_allowed *caps_allowed) { int rc; struct fw_caps_config_cmd caps; uint32_t mtype, moff, finicsum, cfcsum, param, val; rc = -t4_fw_reset(sc, sc->mbox, F_PIORSTMODE | F_PIORST); if (rc != 0) { device_printf(sc->dev, "firmware reset failed: %d.\n", rc); return (rc); } bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); if (strncmp(cfg_file, BUILTIN_CF, sizeof(t4_cfg_file)) == 0) { mtype = 0; moff = 0; caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); } else if (strncmp(cfg_file, FLASH_CF, sizeof(t4_cfg_file)) == 0) { mtype = FW_MEMTYPE_FLASH; moff = t4_flash_cfg_addr(sc); caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID | V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) | V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) | FW_LEN16(caps)); } else { /* * Ask the firmware where it wants us to upload the config file. */ param = FW_PARAM_DEV(CF); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc != 0) { /* No support for config file? Shouldn't happen. 
*/ device_printf(sc->dev, "failed to query config file location: %d.\n", rc); goto done; } mtype = G_FW_PARAMS_PARAM_Y(val); moff = G_FW_PARAMS_PARAM_Z(val) << 16; caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID | V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) | V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) | FW_LEN16(caps)); rc = copy_cfg_file_to_card(sc, cfg_file, mtype, moff); if (rc != 0) { device_printf(sc->dev, "failed to upload config file to card: %d.\n", rc); goto done; } } rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps); if (rc != 0) { device_printf(sc->dev, "failed to pre-process config file: %d " "(mtype %d, moff 0x%x).\n", rc, mtype, moff); goto done; } finicsum = be32toh(caps.finicsum); cfcsum = be32toh(caps.cfcsum); /* actual */ if (finicsum != cfcsum) { device_printf(sc->dev, "WARNING: config file checksum mismatch: %08x %08x\n", finicsum, cfcsum); } sc->cfcsum = cfcsum; snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", cfg_file); /* * Let the firmware know what features will (not) be used so it can tune * things accordingly. */ #define LIMIT_CAPS(x) do { \ caps.x##caps &= htobe16(caps_allowed->x##caps); \ } while (0) LIMIT_CAPS(nbm); LIMIT_CAPS(link); LIMIT_CAPS(switch); LIMIT_CAPS(nic); LIMIT_CAPS(toe); LIMIT_CAPS(rdma); LIMIT_CAPS(crypto); LIMIT_CAPS(iscsi); LIMIT_CAPS(fcoe); #undef LIMIT_CAPS if (caps.niccaps & htobe16(FW_CAPS_CONFIG_NIC_HASHFILTER)) { /* * TOE and hashfilters are mutually exclusive. It is a config * file or firmware bug if both are reported as available. Try * to cope with the situation in non-debug builds by disabling * TOE. */ MPASS(caps.toecaps == 0); caps.toecaps = 0; caps.rdmacaps = 0; caps.iscsicaps = 0; } caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE); caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), NULL); if (rc != 0) { device_printf(sc->dev, "failed to process config file: %d.\n", rc); goto done; } t4_tweak_chip_settings(sc); set_params__pre_init(sc); /* get basic stuff going */ rc = -t4_fw_initialize(sc, sc->mbox); if (rc != 0) { device_printf(sc->dev, "fw_initialize failed: %d.\n", rc); goto done; } done: return (rc); } /* * Partition chip resources for use between various PFs, VFs, etc. */ static int partition_resources(struct adapter *sc) { char cfg_file[sizeof(t4_cfg_file)]; struct caps_allowed caps_allowed; int rc; bool fallback; /* Only the master driver gets to configure the chip resources. */ MPASS(sc->flags & MASTER_PF); #define COPY_CAPS(x) do { \ caps_allowed.x##caps = t4_##x##caps_allowed; \ } while (0) bzero(&caps_allowed, sizeof(caps_allowed)); COPY_CAPS(nbm); COPY_CAPS(link); COPY_CAPS(switch); COPY_CAPS(nic); COPY_CAPS(toe); COPY_CAPS(rdma); COPY_CAPS(crypto); COPY_CAPS(iscsi); COPY_CAPS(fcoe); fallback = sc->debug_flags & DF_DISABLE_CFG_RETRY ? false : true; snprintf(cfg_file, sizeof(cfg_file), "%s", t4_cfg_file); retry: rc = apply_cfg_and_initialize(sc, cfg_file, &caps_allowed); if (rc != 0 && fallback) { device_printf(sc->dev, "failed (%d) to configure card with \"%s\" profile, " "will fall back to a basic configuration and retry.\n", rc, cfg_file); snprintf(cfg_file, sizeof(cfg_file), "%s", BUILTIN_CF); bzero(&caps_allowed, sizeof(caps_allowed)); COPY_CAPS(switch); caps_allowed.niccaps = FW_CAPS_CONFIG_NIC; fallback = false; goto retry; } #undef COPY_CAPS return (rc); } /* * Retrieve parameters that are needed (or nice to have) very early. 
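Small worked example for the PORTVEC query in get_params__pre_init() below (the value is illustrative): portvec is a bitmap of the ports available to this function, so a value of 0x5 (ports 0 and 2) gives

	sc->params.portvec = 0x5;
	sc->params.nports = bitcount32(0x5);	/* == 2 */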
*/ static int get_params__pre_init(struct adapter *sc) { int rc; uint32_t param[2], val[2]; t4_get_version_info(sc); snprintf(sc->fw_version, sizeof(sc->fw_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MINOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MICRO(sc->params.fw_vers), G_FW_HDR_FW_VER_BUILD(sc->params.fw_vers)); snprintf(sc->bs_version, sizeof(sc->bs_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.bs_vers), G_FW_HDR_FW_VER_MINOR(sc->params.bs_vers), G_FW_HDR_FW_VER_MICRO(sc->params.bs_vers), G_FW_HDR_FW_VER_BUILD(sc->params.bs_vers)); snprintf(sc->tp_version, sizeof(sc->tp_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.tp_vers), G_FW_HDR_FW_VER_MINOR(sc->params.tp_vers), G_FW_HDR_FW_VER_MICRO(sc->params.tp_vers), G_FW_HDR_FW_VER_BUILD(sc->params.tp_vers)); snprintf(sc->er_version, sizeof(sc->er_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.er_vers), G_FW_HDR_FW_VER_MINOR(sc->params.er_vers), G_FW_HDR_FW_VER_MICRO(sc->params.er_vers), G_FW_HDR_FW_VER_BUILD(sc->params.er_vers)); param[0] = FW_PARAM_DEV(PORTVEC); param[1] = FW_PARAM_DEV(CCLK); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (pre_init): %d.\n", rc); return (rc); } sc->params.portvec = val[0]; sc->params.nports = bitcount32(val[0]); sc->params.vpd.cclk = val[1]; /* Read device log parameters. */ rc = -t4_init_devlog_params(sc, 1); if (rc == 0) fixup_devlog_params(sc); else { device_printf(sc->dev, "failed to get devlog parameters: %d.\n", rc); rc = 0; /* devlog isn't critical for device operation */ } return (rc); } /* * Any params that need to be set before FW_INITIALIZE. */ static int set_params__pre_init(struct adapter *sc) { int rc = 0; uint32_t param, val; if (chip_id(sc) >= CHELSIO_T6) { param = FW_PARAM_DEV(HPFILTER_REGION_SUPPORT); val = 1; rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); /* firmwares < 1.20.1.0 do not have this param. */ if (rc == FW_EINVAL && sc->params.fw_vers < FW_VERSION32(1, 20, 1, 0)) { rc = 0; } if (rc != 0) { device_printf(sc->dev, "failed to enable high priority filters :%d.\n", rc); } param = FW_PARAM_DEV(PPOD_EDRAM); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc == 0 && val == 1) { rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc != 0) { device_printf(sc->dev, "failed to set PPOD_EDRAM: %d.\n", rc); } } } /* Enable opaque VIIDs with firmwares that support it. */ param = FW_PARAM_DEV(OPAQUE_VIID_SMT_EXTN); val = 1; rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc == 0 && val == 1) sc->params.viid_smt_extn_support = true; else sc->params.viid_smt_extn_support = false; return (rc); } /* * Retrieve various parameters that are of interest to the driver. The device * has been initialized by the firmware at this point. 
*/ static int get_params__post_init(struct adapter *sc) { int rc; uint32_t param[7], val[7]; struct fw_caps_config_cmd caps; param[0] = FW_PARAM_PFVF(IQFLINT_START); param[1] = FW_PARAM_PFVF(EQ_START); param[2] = FW_PARAM_PFVF(FILTER_START); param[3] = FW_PARAM_PFVF(FILTER_END); param[4] = FW_PARAM_PFVF(L2T_START); param[5] = FW_PARAM_PFVF(L2T_END); param[6] = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) | V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_VDD); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 7, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (post_init): %d.\n", rc); return (rc); } sc->sge.iq_start = val[0]; sc->sge.eq_start = val[1]; if ((int)val[3] > (int)val[2]) { sc->tids.ftid_base = val[2]; sc->tids.ftid_end = val[3]; sc->tids.nftids = val[3] - val[2] + 1; } sc->vres.l2t.start = val[4]; sc->vres.l2t.size = val[5] - val[4] + 1; KASSERT(sc->vres.l2t.size <= L2T_SIZE, ("%s: L2 table size (%u) larger than expected (%u)", __func__, sc->vres.l2t.size, L2T_SIZE)); sc->params.core_vdd = val[6]; param[0] = FW_PARAM_PFVF(IQFLINT_END); param[1] = FW_PARAM_PFVF(EQ_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (post_init2): %d.\n", rc); return (rc); } MPASS((int)val[0] >= sc->sge.iq_start); sc->sge.iqmap_sz = val[0] - sc->sge.iq_start + 1; MPASS((int)val[1] >= sc->sge.eq_start); sc->sge.eqmap_sz = val[1] - sc->sge.eq_start + 1; if (chip_id(sc) >= CHELSIO_T6) { sc->tids.tid_base = t4_read_reg(sc, A_LE_DB_ACTIVE_TABLE_START_INDEX); param[0] = FW_PARAM_PFVF(HPFILTER_START); param[1] = FW_PARAM_PFVF(HPFILTER_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query hpfilter parameters: %d.\n", rc); return (rc); } if ((int)val[1] > (int)val[0]) { sc->tids.hpftid_base = val[0]; sc->tids.hpftid_end = val[1]; sc->tids.nhpftids = val[1] - val[0] + 1; /* * These should go off if the layout changes and the * driver needs to catch up. */ MPASS(sc->tids.hpftid_base == 0); MPASS(sc->tids.tid_base == sc->tids.nhpftids); } param[0] = FW_PARAM_PFVF(RAWF_START); param[1] = FW_PARAM_PFVF(RAWF_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query rawf parameters: %d.\n", rc); return (rc); } if ((int)val[1] > (int)val[0]) { sc->rawf_base = val[0]; sc->nrawf = val[1] - val[0] + 1; } } /* * MPSBGMAP is queried separately because only recent firmwares support * it as a parameter and we don't want the compound query above to fail * on older firmwares. */ param[0] = FW_PARAM_DEV(MPSBGMAP); val[0] = 0; rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc == 0) sc->params.mps_bg_map = val[0]; else sc->params.mps_bg_map = 0; /* * Determine whether the firmware supports the filter2 work request. * This is queried separately for the same reason as MPSBGMAP above. */ param[0] = FW_PARAM_DEV(FILTER2_WR); val[0] = 0; rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc == 0) sc->params.filter2_wr_support = val[0] != 0; else sc->params.filter2_wr_support = 0; /* * Find out whether we're allowed to use the ULPTX MEMWRITE DSGL. * This is queried separately for the same reason as other params above. 
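The single-parameter probe pattern used above and below, shown once in isolation (FW_PARAM_DEV(SOME_FEATURE) and params.some_feature are hypothetical placeholders): optional features are queried one at a time so that a firmware that does not recognize one parameter only fails that query instead of the whole compound request.

	param[0] = FW_PARAM_DEV(SOME_FEATURE);	/* hypothetical */
	val[0] = 0;
	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
	sc->params.some_feature = (rc == 0 && val[0] != 0);	/* hypothetical field */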
*/ param[0] = FW_PARAM_DEV(ULPTX_MEMWRITE_DSGL); val[0] = 0; rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc == 0) sc->params.ulptx_memwrite_dsgl = val[0] != 0; else sc->params.ulptx_memwrite_dsgl = false; /* FW_RI_FR_NSMR_TPTE_WR support */ param[0] = FW_PARAM_DEV(RI_FR_NSMR_TPTE_WR); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc == 0) sc->params.fr_nsmr_tpte_wr_support = val[0] != 0; else sc->params.fr_nsmr_tpte_wr_support = false; /* Support for 512 SGL entries per FR MR. */ param[0] = FW_PARAM_DEV(DEV_512SGL_MR); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc == 0) sc->params.dev_512sgl_mr = val[0] != 0; else sc->params.dev_512sgl_mr = false; param[0] = FW_PARAM_PFVF(MAX_PKTS_PER_ETH_TX_PKTS_WR); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc == 0) sc->params.max_pkts_per_eth_tx_pkts_wr = val[0]; else sc->params.max_pkts_per_eth_tx_pkts_wr = 15; param[0] = FW_PARAM_DEV(NUM_TM_CLASS); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc == 0) { MPASS(val[0] > 0 && val[0] < 256); /* nsched_cls is 8b */ sc->params.nsched_cls = val[0]; } else sc->params.nsched_cls = sc->chip_params->nsched_cls; /* get capabilites */ bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps); if (rc != 0) { device_printf(sc->dev, "failed to get card capabilities: %d.\n", rc); return (rc); } #define READ_CAPS(x) do { \ sc->x = htobe16(caps.x); \ } while (0) READ_CAPS(nbmcaps); READ_CAPS(linkcaps); READ_CAPS(switchcaps); READ_CAPS(niccaps); READ_CAPS(toecaps); READ_CAPS(rdmacaps); READ_CAPS(cryptocaps); READ_CAPS(iscsicaps); READ_CAPS(fcoecaps); if (sc->niccaps & FW_CAPS_CONFIG_NIC_HASHFILTER) { MPASS(chip_id(sc) > CHELSIO_T4); MPASS(sc->toecaps == 0); sc->toecaps = 0; param[0] = FW_PARAM_DEV(NTID); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query HASHFILTER parameters: %d.\n", rc); return (rc); } sc->tids.ntids = val[0]; if (sc->params.fw_vers < FW_VERSION32(1, 20, 5, 0)) { MPASS(sc->tids.ntids >= sc->tids.nhpftids); sc->tids.ntids -= sc->tids.nhpftids; } sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS); sc->params.hash_filter = 1; } if (sc->niccaps & FW_CAPS_CONFIG_NIC_ETHOFLD) { param[0] = FW_PARAM_PFVF(ETHOFLD_START); param[1] = FW_PARAM_PFVF(ETHOFLD_END); param[2] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 3, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query NIC parameters: %d.\n", rc); return (rc); } if ((int)val[1] > (int)val[0]) { sc->tids.etid_base = val[0]; sc->tids.etid_end = val[1]; sc->tids.netids = val[1] - val[0] + 1; sc->params.eo_wr_cred = val[2]; sc->params.ethoffload = 1; } } if (sc->toecaps) { /* query offload-related parameters */ param[0] = FW_PARAM_DEV(NTID); param[1] = FW_PARAM_PFVF(SERVER_START); param[2] = FW_PARAM_PFVF(SERVER_END); param[3] = FW_PARAM_PFVF(TDDP_START); param[4] = FW_PARAM_PFVF(TDDP_END); param[5] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query TOE parameters: %d.\n", rc); return (rc); } sc->tids.ntids = val[0]; if (sc->params.fw_vers < FW_VERSION32(1, 20, 5, 0)) { MPASS(sc->tids.ntids >= sc->tids.nhpftids); sc->tids.ntids -= sc->tids.nhpftids; } sc->tids.natids 
= min(sc->tids.ntids / 2, MAX_ATIDS); if ((int)val[2] > (int)val[1]) { sc->tids.stid_base = val[1]; sc->tids.nstids = val[2] - val[1] + 1; } sc->vres.ddp.start = val[3]; sc->vres.ddp.size = val[4] - val[3] + 1; sc->params.ofldq_wr_cred = val[5]; sc->params.offload = 1; } else { /* * The firmware attempts memfree TOE configuration for -SO cards * and will report toecaps=0 if it runs out of resources (this * depends on the config file). It may not report 0 for other * capabilities dependent on the TOE in this case. Set them to * 0 here so that the driver doesn't bother tracking resources * that will never be used. */ sc->iscsicaps = 0; sc->rdmacaps = 0; } if (sc->rdmacaps) { param[0] = FW_PARAM_PFVF(STAG_START); param[1] = FW_PARAM_PFVF(STAG_END); param[2] = FW_PARAM_PFVF(RQ_START); param[3] = FW_PARAM_PFVF(RQ_END); param[4] = FW_PARAM_PFVF(PBL_START); param[5] = FW_PARAM_PFVF(PBL_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(1): %d.\n", rc); return (rc); } sc->vres.stag.start = val[0]; sc->vres.stag.size = val[1] - val[0] + 1; sc->vres.rq.start = val[2]; sc->vres.rq.size = val[3] - val[2] + 1; sc->vres.pbl.start = val[4]; sc->vres.pbl.size = val[5] - val[4] + 1; param[0] = FW_PARAM_PFVF(SQRQ_START); param[1] = FW_PARAM_PFVF(SQRQ_END); param[2] = FW_PARAM_PFVF(CQ_START); param[3] = FW_PARAM_PFVF(CQ_END); param[4] = FW_PARAM_PFVF(OCQ_START); param[5] = FW_PARAM_PFVF(OCQ_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(2): %d.\n", rc); return (rc); } sc->vres.qp.start = val[0]; sc->vres.qp.size = val[1] - val[0] + 1; sc->vres.cq.start = val[2]; sc->vres.cq.size = val[3] - val[2] + 1; sc->vres.ocq.start = val[4]; sc->vres.ocq.size = val[5] - val[4] + 1; param[0] = FW_PARAM_PFVF(SRQ_START); param[1] = FW_PARAM_PFVF(SRQ_END); param[2] = FW_PARAM_DEV(MAXORDIRD_QP); param[3] = FW_PARAM_DEV(MAXIRD_ADAPTER); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 4, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(3): %d.\n", rc); return (rc); } sc->vres.srq.start = val[0]; sc->vres.srq.size = val[1] - val[0] + 1; sc->params.max_ordird_qp = val[2]; sc->params.max_ird_adapter = val[3]; } if (sc->iscsicaps) { param[0] = FW_PARAM_PFVF(ISCSI_START); param[1] = FW_PARAM_PFVF(ISCSI_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query iSCSI parameters: %d.\n", rc); return (rc); } sc->vres.iscsi.start = val[0]; sc->vres.iscsi.size = val[1] - val[0] + 1; } if (sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS) { param[0] = FW_PARAM_PFVF(TLS_START); param[1] = FW_PARAM_PFVF(TLS_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query TLS parameters: %d.\n", rc); return (rc); } sc->vres.key.start = val[0]; sc->vres.key.size = val[1] - val[0] + 1; } /* * We've got the params we wanted to query directly from the firmware. * Grab some others via other means. 
*/ t4_init_sge_params(sc); t4_init_tp_params(sc); t4_read_mtu_tbl(sc, sc->params.mtus, NULL); t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd); rc = t4_verify_chip_settings(sc); if (rc != 0) return (rc); t4_init_rx_buf_info(sc); return (rc); } #ifdef KERN_TLS static void ktls_tick(void *arg) { struct adapter *sc; uint32_t tstamp; sc = arg; tstamp = tcp_ts_getticks(); t4_write_reg(sc, A_TP_SYNC_TIME_HI, tstamp >> 1); t4_write_reg(sc, A_TP_SYNC_TIME_LO, tstamp << 31); callout_schedule_sbt(&sc->ktls_tick, SBT_1MS, 0, C_HARDCLOCK); } static int t6_config_kern_tls(struct adapter *sc, bool enable) { int rc; uint32_t param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_KTLS_HW) | V_FW_PARAMS_PARAM_Y(enable ? 1 : 0) | V_FW_PARAMS_PARAM_Z(FW_PARAMS_PARAM_DEV_KTLS_HW_USER_ENABLE); rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &param); if (rc != 0) { CH_ERR(sc, "failed to %s NIC TLS: %d\n", enable ? "enable" : "disable", rc); return (rc); } if (enable) { sc->flags |= KERN_TLS_ON; callout_reset_sbt(&sc->ktls_tick, SBT_1MS, 0, ktls_tick, sc, C_HARDCLOCK); } else { sc->flags &= ~KERN_TLS_ON; callout_stop(&sc->ktls_tick); } return (rc); } #endif static int set_params__post_init(struct adapter *sc) { uint32_t mask, param, val; #ifdef TCP_OFFLOAD int i, v, shift; #endif /* ask for encapsulated CPLs */ param = FW_PARAM_PFVF(CPLFW4MSG_ENCAP); val = 1; (void)t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val); /* Enable 32b port caps if the firmware supports it. */ param = FW_PARAM_PFVF(PORT_CAPS32); val = 1; if (t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val) == 0) sc->params.port_caps32 = 1; /* Let filter + maskhash steer to a part of the VI's RSS region. */ val = 1 << (G_MASKSIZE(t4_read_reg(sc, A_TP_RSS_CONFIG_TNL)) - 1); t4_set_reg_field(sc, A_TP_RSS_CONFIG_TNL, V_MASKFILTER(M_MASKFILTER), V_MASKFILTER(val - 1)); mask = F_DROPERRORANY | F_DROPERRORMAC | F_DROPERRORIPVER | F_DROPERRORFRAG | F_DROPERRORATTACK | F_DROPERRORETHHDRLEN | F_DROPERRORIPHDRLEN | F_DROPERRORTCPHDRLEN | F_DROPERRORPKTLEN | F_DROPERRORTCPOPT | F_DROPERRORCSUMIP | F_DROPERRORCSUM; val = 0; if (chip_id(sc) < CHELSIO_T6 && t4_attack_filter != 0) { t4_set_reg_field(sc, A_TP_GLOBAL_CONFIG, F_ATTACKFILTERENABLE, F_ATTACKFILTERENABLE); val |= F_DROPERRORATTACK; } if (t4_drop_ip_fragments != 0) { t4_set_reg_field(sc, A_TP_GLOBAL_CONFIG, F_FRAGMENTDROP, F_FRAGMENTDROP); val |= F_DROPERRORFRAG; } if (t4_drop_pkts_with_l2_errors != 0) val |= F_DROPERRORMAC | F_DROPERRORETHHDRLEN; if (t4_drop_pkts_with_l3_errors != 0) { val |= F_DROPERRORIPVER | F_DROPERRORIPHDRLEN | F_DROPERRORCSUMIP; } if (t4_drop_pkts_with_l4_errors != 0) { val |= F_DROPERRORTCPHDRLEN | F_DROPERRORPKTLEN | F_DROPERRORTCPOPT | F_DROPERRORCSUM; } t4_set_reg_field(sc, A_TP_ERR_CONFIG, mask, val); #ifdef TCP_OFFLOAD /* * Override the TOE timers with user provided tunables. This is not the * recommended way to change the timers (the firmware config file is) so * these tunables are not documented. * * All the timer tunables are in microseconds.
*/ if (t4_toe_keepalive_idle != 0) { v = us_to_tcp_ticks(sc, t4_toe_keepalive_idle); v &= M_KEEPALIVEIDLE; t4_set_reg_field(sc, A_TP_KEEP_IDLE, V_KEEPALIVEIDLE(M_KEEPALIVEIDLE), V_KEEPALIVEIDLE(v)); } if (t4_toe_keepalive_interval != 0) { v = us_to_tcp_ticks(sc, t4_toe_keepalive_interval); v &= M_KEEPALIVEINTVL; t4_set_reg_field(sc, A_TP_KEEP_INTVL, V_KEEPALIVEINTVL(M_KEEPALIVEINTVL), V_KEEPALIVEINTVL(v)); } if (t4_toe_keepalive_count != 0) { v = t4_toe_keepalive_count & M_KEEPALIVEMAXR2; t4_set_reg_field(sc, A_TP_SHIFT_CNT, V_KEEPALIVEMAXR1(M_KEEPALIVEMAXR1) | V_KEEPALIVEMAXR2(M_KEEPALIVEMAXR2), V_KEEPALIVEMAXR1(1) | V_KEEPALIVEMAXR2(v)); } if (t4_toe_rexmt_min != 0) { v = us_to_tcp_ticks(sc, t4_toe_rexmt_min); v &= M_RXTMIN; t4_set_reg_field(sc, A_TP_RXT_MIN, V_RXTMIN(M_RXTMIN), V_RXTMIN(v)); } if (t4_toe_rexmt_max != 0) { v = us_to_tcp_ticks(sc, t4_toe_rexmt_max); v &= M_RXTMAX; t4_set_reg_field(sc, A_TP_RXT_MAX, V_RXTMAX(M_RXTMAX), V_RXTMAX(v)); } if (t4_toe_rexmt_count != 0) { v = t4_toe_rexmt_count & M_RXTSHIFTMAXR2; t4_set_reg_field(sc, A_TP_SHIFT_CNT, V_RXTSHIFTMAXR1(M_RXTSHIFTMAXR1) | V_RXTSHIFTMAXR2(M_RXTSHIFTMAXR2), V_RXTSHIFTMAXR1(1) | V_RXTSHIFTMAXR2(v)); } for (i = 0; i < nitems(t4_toe_rexmt_backoff); i++) { if (t4_toe_rexmt_backoff[i] != -1) { v = t4_toe_rexmt_backoff[i] & M_TIMERBACKOFFINDEX0; shift = (i & 3) << 3; t4_set_reg_field(sc, A_TP_TCP_BACKOFF_REG0 + (i & ~3), M_TIMERBACKOFFINDEX0 << shift, v << shift); } } #endif #ifdef KERN_TLS if (sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS && sc->toecaps & FW_CAPS_CONFIG_TOE) { /* * Limit TOE connections to 2 reassembly "islands". * This is required to permit migrating TOE * connections to UPL_MODE_TLS. */ t4_tp_wr_bits_indirect(sc, A_TP_FRAG_CONFIG, V_PASSMODE(M_PASSMODE), V_PASSMODE(2)); } if (is_ktls(sc)) { sc->tlst.inline_keys = t4_tls_inline_keys; sc->tlst.combo_wrs = t4_tls_combo_wrs; if (t4_kern_tls != 0 && is_t6(sc)) t6_config_kern_tls(sc, true); } #endif return (0); } #undef FW_PARAM_PFVF #undef FW_PARAM_DEV static void t4_set_desc(struct adapter *sc) { char buf[128]; struct adapter_params *p = &sc->params; snprintf(buf, sizeof(buf), "Chelsio %s", p->vpd.id); device_set_desc_copy(sc->dev, buf); } static inline void ifmedia_add4(struct ifmedia *ifm, int m) { ifmedia_add(ifm, m, 0, NULL); ifmedia_add(ifm, m | IFM_ETH_TXPAUSE, 0, NULL); ifmedia_add(ifm, m | IFM_ETH_RXPAUSE, 0, NULL); ifmedia_add(ifm, m | IFM_ETH_TXPAUSE | IFM_ETH_RXPAUSE, 0, NULL); } /* * This is the selected media, which is not quite the same as the active media. * The media line in ifconfig is "media: Ethernet selected (active)" if selected * and active are not the same, and "media: Ethernet selected" otherwise. */ static void set_current_media(struct port_info *pi) { struct link_config *lc; struct ifmedia *ifm; int mword; u_int speed; PORT_LOCK_ASSERT_OWNED(pi); /* Leave current media alone if it's already set to IFM_NONE. 
*/ ifm = &pi->media; if (ifm->ifm_cur != NULL && IFM_SUBTYPE(ifm->ifm_cur->ifm_media) == IFM_NONE) return; lc = &pi->link_cfg; if (lc->requested_aneg != AUTONEG_DISABLE && lc->pcaps & FW_PORT_CAP32_ANEG) { ifmedia_set(ifm, IFM_ETHER | IFM_AUTO); return; } mword = IFM_ETHER | IFM_FDX; if (lc->requested_fc & PAUSE_TX) mword |= IFM_ETH_TXPAUSE; if (lc->requested_fc & PAUSE_RX) mword |= IFM_ETH_RXPAUSE; if (lc->requested_speed == 0) speed = port_top_speed(pi) * 1000; /* Gbps -> Mbps */ else speed = lc->requested_speed; mword |= port_mword(pi, speed_to_fwcap(speed)); ifmedia_set(ifm, mword); } /* * Returns true if the ifmedia list for the port cannot change. */ static bool fixed_ifmedia(struct port_info *pi) { return (pi->port_type == FW_PORT_TYPE_BT_SGMII || pi->port_type == FW_PORT_TYPE_BT_XFI || pi->port_type == FW_PORT_TYPE_BT_XAUI || pi->port_type == FW_PORT_TYPE_KX4 || pi->port_type == FW_PORT_TYPE_KX || pi->port_type == FW_PORT_TYPE_KR || pi->port_type == FW_PORT_TYPE_BP_AP || pi->port_type == FW_PORT_TYPE_BP4_AP || pi->port_type == FW_PORT_TYPE_BP40_BA || pi->port_type == FW_PORT_TYPE_KR4_100G || pi->port_type == FW_PORT_TYPE_KR_SFP28 || pi->port_type == FW_PORT_TYPE_KR_XLAUI); } static void build_medialist(struct port_info *pi) { uint32_t ss, speed; int unknown, mword, bit; struct link_config *lc; struct ifmedia *ifm; PORT_LOCK_ASSERT_OWNED(pi); if (pi->flags & FIXED_IFMEDIA) return; /* * Rebuild the ifmedia list. */ ifm = &pi->media; ifmedia_removeall(ifm); lc = &pi->link_cfg; ss = G_FW_PORT_CAP32_SPEED(lc->pcaps); /* Supported Speeds */ if (__predict_false(ss == 0)) { /* not supposed to happen. */ MPASS(ss != 0); no_media: MPASS(LIST_EMPTY(&ifm->ifm_list)); ifmedia_add(ifm, IFM_ETHER | IFM_NONE, 0, NULL); ifmedia_set(ifm, IFM_ETHER | IFM_NONE); return; } unknown = 0; for (bit = S_FW_PORT_CAP32_SPEED; bit < fls(ss); bit++) { speed = 1 << bit; MPASS(speed & M_FW_PORT_CAP32_SPEED); if (ss & speed) { mword = port_mword(pi, speed); if (mword == IFM_NONE) { goto no_media; } else if (mword == IFM_UNKNOWN) unknown++; else ifmedia_add4(ifm, IFM_ETHER | IFM_FDX | mword); } } if (unknown > 0) /* Add one unknown for all unknown media types. */ ifmedia_add4(ifm, IFM_ETHER | IFM_FDX | IFM_UNKNOWN); if (lc->pcaps & FW_PORT_CAP32_ANEG) ifmedia_add(ifm, IFM_ETHER | IFM_AUTO, 0, NULL); set_current_media(pi); } /* * Initialize the requested fields in the link config based on driver tunables. */ static void init_link_config(struct port_info *pi) { struct link_config *lc = &pi->link_cfg; PORT_LOCK_ASSERT_OWNED(pi); lc->requested_caps = 0; lc->requested_speed = 0; if (t4_autoneg == 0) lc->requested_aneg = AUTONEG_DISABLE; else if (t4_autoneg == 1) lc->requested_aneg = AUTONEG_ENABLE; else lc->requested_aneg = AUTONEG_AUTO; lc->requested_fc = t4_pause_settings & (PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG); if (t4_fec & FEC_AUTO) lc->requested_fec = FEC_AUTO; else if (t4_fec == 0) lc->requested_fec = FEC_NONE; else { /* -1 is handled by the FEC_AUTO block above and not here. */ lc->requested_fec = t4_fec & (FEC_RS | FEC_BASER_RS | FEC_NONE | FEC_MODULE); if (lc->requested_fec == 0) lc->requested_fec = FEC_AUTO; } if (t4_force_fec < 0) lc->force_fec = -1; else if (t4_force_fec > 0) lc->force_fec = 1; else lc->force_fec = 0; } /* * Makes sure that all requested settings comply with what's supported by the * port. Returns the number of settings that were invalid and had to be fixed. 
*/ static int fixup_link_config(struct port_info *pi) { int n = 0; struct link_config *lc = &pi->link_cfg; uint32_t fwspeed; PORT_LOCK_ASSERT_OWNED(pi); /* Speed (when not autonegotiating) */ if (lc->requested_speed != 0) { fwspeed = speed_to_fwcap(lc->requested_speed); if ((fwspeed & lc->pcaps) == 0) { n++; lc->requested_speed = 0; } } /* Link autonegotiation */ MPASS(lc->requested_aneg == AUTONEG_ENABLE || lc->requested_aneg == AUTONEG_DISABLE || lc->requested_aneg == AUTONEG_AUTO); if (lc->requested_aneg == AUTONEG_ENABLE && !(lc->pcaps & FW_PORT_CAP32_ANEG)) { n++; lc->requested_aneg = AUTONEG_AUTO; } /* Flow control */ MPASS((lc->requested_fc & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)) == 0); if (lc->requested_fc & PAUSE_TX && !(lc->pcaps & FW_PORT_CAP32_FC_TX)) { n++; lc->requested_fc &= ~PAUSE_TX; } if (lc->requested_fc & PAUSE_RX && !(lc->pcaps & FW_PORT_CAP32_FC_RX)) { n++; lc->requested_fc &= ~PAUSE_RX; } if (!(lc->requested_fc & PAUSE_AUTONEG) && !(lc->pcaps & FW_PORT_CAP32_FORCE_PAUSE)) { n++; lc->requested_fc |= PAUSE_AUTONEG; } /* FEC */ if ((lc->requested_fec & FEC_RS && !(lc->pcaps & FW_PORT_CAP32_FEC_RS)) || (lc->requested_fec & FEC_BASER_RS && !(lc->pcaps & FW_PORT_CAP32_FEC_BASER_RS))) { n++; lc->requested_fec = FEC_AUTO; } return (n); } /* * Apply the requested L1 settings, which are expected to be valid, to the * hardware. */ static int apply_link_config(struct port_info *pi) { struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; #ifdef INVARIANTS ASSERT_SYNCHRONIZED_OP(sc); PORT_LOCK_ASSERT_OWNED(pi); if (lc->requested_aneg == AUTONEG_ENABLE) MPASS(lc->pcaps & FW_PORT_CAP32_ANEG); if (!(lc->requested_fc & PAUSE_AUTONEG)) MPASS(lc->pcaps & FW_PORT_CAP32_FORCE_PAUSE); if (lc->requested_fc & PAUSE_TX) MPASS(lc->pcaps & FW_PORT_CAP32_FC_TX); if (lc->requested_fc & PAUSE_RX) MPASS(lc->pcaps & FW_PORT_CAP32_FC_RX); if (lc->requested_fec & FEC_RS) MPASS(lc->pcaps & FW_PORT_CAP32_FEC_RS); if (lc->requested_fec & FEC_BASER_RS) MPASS(lc->pcaps & FW_PORT_CAP32_FEC_BASER_RS); #endif rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc); if (rc != 0) { /* Don't complain if the VF driver gets back an EPERM. */ if (!(sc->flags & IS_VF) || rc != FW_EPERM) device_printf(pi->dev, "l1cfg failed: %d\n", rc); } else { /* * An L1_CFG will almost always result in a link-change event if * the link is up, and the driver will refresh the actual * fec/fc/etc. when the notification is processed. If the link * is down then the actual settings are meaningless. * * This takes care of the case where a change in the L1 settings * may not result in a notification. 
*/ if (lc->link_ok && !(lc->requested_fc & PAUSE_AUTONEG)) lc->fc = lc->requested_fc & (PAUSE_TX | PAUSE_RX); } return (rc); } #define FW_MAC_EXACT_CHUNK 7 struct mcaddr_ctx { - struct ifnet *ifp; + if_t ifp; const uint8_t *mcaddr[FW_MAC_EXACT_CHUNK]; uint64_t hash; int i; int del; int rc; }; static u_int add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt) { struct mcaddr_ctx *ctx = arg; - struct vi_info *vi = ctx->ifp->if_softc; + struct vi_info *vi = if_getsoftc(ctx->ifp); struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; if (ctx->rc < 0) return (0); ctx->mcaddr[ctx->i] = LLADDR(sdl); MPASS(ETHER_IS_MULTICAST(ctx->mcaddr[ctx->i])); ctx->i++; if (ctx->i == FW_MAC_EXACT_CHUNK) { ctx->rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid, ctx->del, ctx->i, ctx->mcaddr, NULL, &ctx->hash, 0); if (ctx->rc < 0) { int j; for (j = 0; j < ctx->i; j++) { if_printf(ctx->ifp, "failed to add mc address" " %02x:%02x:%02x:" "%02x:%02x:%02x rc=%d\n", ctx->mcaddr[j][0], ctx->mcaddr[j][1], ctx->mcaddr[j][2], ctx->mcaddr[j][3], ctx->mcaddr[j][4], ctx->mcaddr[j][5], -ctx->rc); } return (0); } ctx->del = 0; ctx->i = 0; } return (1); } /* * Program the port's XGMAC based on parameters in ifnet. The caller also * indicates which parameters should be programmed (the rest are left alone). */ int -update_mac_settings(struct ifnet *ifp, int flags) +update_mac_settings(if_t ifp, int flags) { int rc = 0; - struct vi_info *vi = ifp->if_softc; + struct vi_info *vi = if_getsoftc(ifp); struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1; uint8_t match_all_mac[ETHER_ADDR_LEN] = {0}; ASSERT_SYNCHRONIZED_OP(sc); KASSERT(flags, ("%s: not told what to update.", __func__)); if (flags & XGMAC_MTU) - mtu = ifp->if_mtu; + mtu = if_getmtu(ifp); if (flags & XGMAC_PROMISC) - promisc = ifp->if_flags & IFF_PROMISC ? 1 : 0; + promisc = if_getflags(ifp) & IFF_PROMISC ? 1 : 0; if (flags & XGMAC_ALLMULTI) - allmulti = ifp->if_flags & IFF_ALLMULTI ? 1 : 0; + allmulti = if_getflags(ifp) & IFF_ALLMULTI ? 1 : 0; if (flags & XGMAC_VLANEX) - vlanex = ifp->if_capenable & IFCAP_VLAN_HWTAGGING ? 1 : 0; + vlanex = if_getcapenable(ifp) & IFCAP_VLAN_HWTAGGING ? 1 : 0; if (flags & (XGMAC_MTU|XGMAC_PROMISC|XGMAC_ALLMULTI|XGMAC_VLANEX)) { rc = -t4_set_rxmode(sc, sc->mbox, vi->viid, mtu, promisc, allmulti, 1, vlanex, false); if (rc) { if_printf(ifp, "set_rxmode (%x) failed: %d\n", flags, rc); return (rc); } } if (flags & XGMAC_UCADDR) { uint8_t ucaddr[ETHER_ADDR_LEN]; - bcopy(IF_LLADDR(ifp), ucaddr, sizeof(ucaddr)); + bcopy(if_getlladdr(ifp), ucaddr, sizeof(ucaddr)); rc = t4_change_mac(sc, sc->mbox, vi->viid, vi->xact_addr_filt, ucaddr, true, &vi->smt_idx); if (rc < 0) { rc = -rc; if_printf(ifp, "change_mac failed: %d\n", rc); return (rc); } else { vi->xact_addr_filt = rc; rc = 0; } } if (flags & XGMAC_MCADDRS) { struct epoch_tracker et; struct mcaddr_ctx ctx; int j; ctx.ifp = ifp; ctx.hash = 0; ctx.i = 0; ctx.del = 1; ctx.rc = 0; /* * Unlike other drivers, we accumulate list of pointers into * interface address lists and we need to keep it safe even * after if_foreach_llmaddr() returns, thus we must enter the * network epoch. 
*/ NET_EPOCH_ENTER(et); if_foreach_llmaddr(ifp, add_maddr, &ctx); if (ctx.rc < 0) { NET_EPOCH_EXIT(et); rc = -ctx.rc; return (rc); } if (ctx.i > 0) { rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid, ctx.del, ctx.i, ctx.mcaddr, NULL, &ctx.hash, 0); NET_EPOCH_EXIT(et); if (rc < 0) { rc = -rc; for (j = 0; j < ctx.i; j++) { if_printf(ifp, "failed to add mcast address" " %02x:%02x:%02x:" "%02x:%02x:%02x rc=%d\n", ctx.mcaddr[j][0], ctx.mcaddr[j][1], ctx.mcaddr[j][2], ctx.mcaddr[j][3], ctx.mcaddr[j][4], ctx.mcaddr[j][5], rc); } return (rc); } ctx.del = 0; } else NET_EPOCH_EXIT(et); rc = -t4_set_addr_hash(sc, sc->mbox, vi->viid, 0, ctx.hash, 0); if (rc != 0) if_printf(ifp, "failed to set mcast address hash: %d\n", rc); if (ctx.del == 0) { /* We clobbered the VXLAN entry if there was one. */ pi->vxlan_tcam_entry = false; } } if (IS_MAIN_VI(vi) && sc->vxlan_refcount > 0 && pi->vxlan_tcam_entry == false) { rc = t4_alloc_raw_mac_filt(sc, vi->viid, match_all_mac, match_all_mac, sc->rawf_base + pi->port_id, 1, pi->port_id, true); if (rc < 0) { rc = -rc; if_printf(ifp, "failed to add VXLAN TCAM entry: %d.\n", rc); } else { MPASS(rc == sc->rawf_base + pi->port_id); rc = 0; pi->vxlan_tcam_entry = true; } } return (rc); } /* * {begin|end}_synchronized_op must be called from the same thread. */ int begin_synchronized_op(struct adapter *sc, struct vi_info *vi, int flags, char *wmesg) { int rc, pri; #ifdef WITNESS /* the caller thinks it's ok to sleep, but is it really? */ if (flags & SLEEP_OK) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "begin_synchronized_op"); #endif if (INTR_OK) pri = PCATCH; else pri = 0; ADAPTER_LOCK(sc); for (;;) { if (vi && IS_DOOMED(vi)) { rc = ENXIO; goto done; } if (!IS_BUSY(sc)) { rc = 0; break; } if (!(flags & SLEEP_OK)) { rc = EBUSY; goto done; } if (mtx_sleep(&sc->flags, &sc->sc_lock, pri, wmesg, 0)) { rc = EINTR; goto done; } } KASSERT(!IS_BUSY(sc), ("%s: controller busy.", __func__)); SET_BUSY(sc); #ifdef INVARIANTS sc->last_op = wmesg; sc->last_op_thr = curthread; sc->last_op_flags = flags; #endif done: if (!(flags & HOLD_LOCK) || rc) ADAPTER_UNLOCK(sc); return (rc); } /* * Tell if_ioctl and if_init that the VI is going away. This is * special variant of begin_synchronized_op and must be paired with a * call to end_synchronized_op. */ void doom_vi(struct adapter *sc, struct vi_info *vi) { ADAPTER_LOCK(sc); SET_DOOMED(vi); wakeup(&sc->flags); while (IS_BUSY(sc)) mtx_sleep(&sc->flags, &sc->sc_lock, 0, "t4detach", 0); SET_BUSY(sc); #ifdef INVARIANTS sc->last_op = "t4detach"; sc->last_op_thr = curthread; sc->last_op_flags = 0; #endif ADAPTER_UNLOCK(sc); } /* * {begin|end}_synchronized_op must be called from the same thread. 
*/ void end_synchronized_op(struct adapter *sc, int flags) { if (flags & LOCK_HELD) ADAPTER_LOCK_ASSERT_OWNED(sc); else ADAPTER_LOCK(sc); KASSERT(IS_BUSY(sc), ("%s: controller not busy.", __func__)); CLR_BUSY(sc); wakeup(&sc->flags); ADAPTER_UNLOCK(sc); } static int cxgbe_init_synchronized(struct vi_info *vi) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; - struct ifnet *ifp = vi->ifp; + if_t ifp = vi->ifp; int rc = 0, i; struct sge_txq *txq; ASSERT_SYNCHRONIZED_OP(sc); - if (ifp->if_drv_flags & IFF_DRV_RUNNING) + if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) return (0); /* already running */ if (!(sc->flags & FULL_INIT_DONE) && ((rc = adapter_init(sc)) != 0)) return (rc); /* error message displayed already */ if (!(vi->flags & VI_INIT_DONE) && ((rc = vi_init(vi)) != 0)) return (rc); /* error message displayed already */ rc = update_mac_settings(ifp, XGMAC_ALL); if (rc) goto done; /* error message displayed already */ PORT_LOCK(pi); if (pi->up_vis == 0) { t4_update_port_info(pi); fixup_link_config(pi); build_medialist(pi); apply_link_config(pi); } rc = -t4_enable_vi(sc, sc->mbox, vi->viid, true, true); if (rc != 0) { if_printf(ifp, "enable_vi failed: %d\n", rc); PORT_UNLOCK(pi); goto done; } /* * Can't fail from this point onwards. Review cxgbe_uninit_synchronized * if this changes. */ for_each_txq(vi, i, txq) { TXQ_LOCK(txq); txq->eq.flags |= EQ_ENABLED; TXQ_UNLOCK(txq); } /* * The first iq of the first port to come up is used for tracing. */ if (sc->traceq < 0 && IS_MAIN_VI(vi)) { sc->traceq = sc->sge.rxq[vi->first_rxq].iq.abs_id; t4_write_reg(sc, is_t4(sc) ? A_MPS_TRC_RSS_CONTROL : A_MPS_T5_TRC_RSS_CONTROL, V_RSSCONTROL(pi->tx_chan) | V_QUEUENUMBER(sc->traceq)); pi->flags |= HAS_TRACEQ; } /* all ok */ pi->up_vis++; - ifp->if_drv_flags |= IFF_DRV_RUNNING; + if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0); if (pi->link_cfg.link_ok) t4_os_link_changed(pi); PORT_UNLOCK(pi); mtx_lock(&vi->tick_mtx); - if (ifp->if_get_counter == vi_get_counter) + if (vi->pi->nvi > 1 || sc->flags & IS_VF) callout_reset(&vi->tick, hz, vi_tick, vi); else callout_reset(&vi->tick, hz, cxgbe_tick, vi); mtx_unlock(&vi->tick_mtx); done: if (rc != 0) cxgbe_uninit_synchronized(vi); return (rc); } /* * Idempotent. */ static int cxgbe_uninit_synchronized(struct vi_info *vi) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; - struct ifnet *ifp = vi->ifp; + if_t ifp = vi->ifp; int rc, i; struct sge_txq *txq; ASSERT_SYNCHRONIZED_OP(sc); if (!(vi->flags & VI_INIT_DONE)) { - if (__predict_false(ifp->if_drv_flags & IFF_DRV_RUNNING)) { + if (__predict_false(if_getdrvflags(ifp) & IFF_DRV_RUNNING)) { KASSERT(0, ("uninited VI is running")); if_printf(ifp, "uninited VI with running ifnet. " "vi->flags 0x%016lx, if_flags 0x%08x, " - "if_drv_flags 0x%08x\n", vi->flags, ifp->if_flags, - ifp->if_drv_flags); + "if_drv_flags 0x%08x\n", vi->flags, if_getflags(ifp), + if_getdrvflags(ifp)); } return (0); } /* * Disable the VI so that all its data in either direction is discarded * by the MPS. Leave everything else (the queues, interrupts, and 1Hz * tick) intact as the TP can deliver negative advice or data that it's * holding in its RAM (for an offloaded connection) even after the VI is * disabled. 
*/ rc = -t4_enable_vi(sc, sc->mbox, vi->viid, false, false); if (rc) { if_printf(ifp, "disable_vi failed: %d\n", rc); return (rc); } for_each_txq(vi, i, txq) { TXQ_LOCK(txq); txq->eq.flags &= ~EQ_ENABLED; TXQ_UNLOCK(txq); } mtx_lock(&vi->tick_mtx); callout_stop(&vi->tick); mtx_unlock(&vi->tick_mtx); PORT_LOCK(pi); - if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { + if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)) { PORT_UNLOCK(pi); return (0); } - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); pi->up_vis--; if (pi->up_vis > 0) { PORT_UNLOCK(pi); return (0); } pi->link_cfg.link_ok = false; pi->link_cfg.speed = 0; pi->link_cfg.link_down_rc = 255; t4_os_link_changed(pi); PORT_UNLOCK(pi); return (0); } /* * It is ok for this function to fail midway and return right away. t4_detach * will walk the entire sc->irq list and clean up whatever is valid. */ int t4_setup_intr_handlers(struct adapter *sc) { int rc, rid, p, q, v; char s[8]; struct irq *irq; struct port_info *pi; struct vi_info *vi; struct sge *sge = &sc->sge; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #ifdef DEV_NETMAP struct sge_nm_rxq *nm_rxq; #endif #ifdef RSS int nbuckets = rss_getnumbuckets(); #endif /* * Setup interrupts. */ irq = &sc->irq[0]; rid = sc->intr_type == INTR_INTX ? 0 : 1; if (forwarding_intr_to_fwq(sc)) return (t4_alloc_irq(sc, irq, rid, t4_intr_all, sc, "all")); /* Multiple interrupts. */ if (sc->flags & IS_VF) KASSERT(sc->intr_count >= T4VF_EXTRA_INTR + sc->params.nports, ("%s: too few intr.", __func__)); else KASSERT(sc->intr_count >= T4_EXTRA_INTR + sc->params.nports, ("%s: too few intr.", __func__)); /* The first one is always error intr on PFs */ if (!(sc->flags & IS_VF)) { rc = t4_alloc_irq(sc, irq, rid, t4_intr_err, sc, "err"); if (rc != 0) return (rc); irq++; rid++; } /* The second one is always the firmware event queue (first on VFs) */ rc = t4_alloc_irq(sc, irq, rid, t4_intr_evt, &sge->fwq, "evt"); if (rc != 0) return (rc); irq++; rid++; for_each_port(sc, p) { pi = sc->port[p]; for_each_vi(pi, v, vi) { vi->first_intr = rid - 1; if (vi->nnmrxq > 0) { int n = max(vi->nrxq, vi->nnmrxq); rxq = &sge->rxq[vi->first_rxq]; #ifdef DEV_NETMAP nm_rxq = &sge->nm_rxq[vi->first_nm_rxq]; #endif for (q = 0; q < n; q++) { snprintf(s, sizeof(s), "%x%c%x", p, 'a' + v, q); if (q < vi->nrxq) irq->rxq = rxq++; #ifdef DEV_NETMAP if (q < vi->nnmrxq) irq->nm_rxq = nm_rxq++; if (irq->nm_rxq != NULL && irq->rxq == NULL) { /* Netmap rx only */ rc = t4_alloc_irq(sc, irq, rid, t4_nm_intr, irq->nm_rxq, s); } if (irq->nm_rxq != NULL && irq->rxq != NULL) { /* NIC and Netmap rx */ rc = t4_alloc_irq(sc, irq, rid, t4_vi_intr, irq, s); } #endif if (irq->rxq != NULL && irq->nm_rxq == NULL) { /* NIC rx only */ rc = t4_alloc_irq(sc, irq, rid, t4_intr, irq->rxq, s); } if (rc != 0) return (rc); #ifdef RSS if (q < vi->nrxq) { bus_bind_intr(sc->dev, irq->res, rss_getcpu(q % nbuckets)); } #endif irq++; rid++; vi->nintr++; } } else { for_each_rxq(vi, q, rxq) { snprintf(s, sizeof(s), "%x%c%x", p, 'a' + v, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, rxq, s); if (rc != 0) return (rc); #ifdef RSS bus_bind_intr(sc->dev, irq->res, rss_getcpu(q % nbuckets)); #endif irq++; rid++; vi->nintr++; } } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, q, ofld_rxq) { snprintf(s, sizeof(s), "%x%c%x", p, 'A' + v, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, ofld_rxq, s); if (rc != 0) return (rc); irq++; rid++; vi->nintr++; } #endif } } MPASS(irq == &sc->irq[sc->intr_count]); return (0); } static void 
write_global_rss_key(struct adapter *sc) { #ifdef RSS int i; uint32_t raw_rss_key[RSS_KEYSIZE / sizeof(uint32_t)]; uint32_t rss_key[RSS_KEYSIZE / sizeof(uint32_t)]; CTASSERT(RSS_KEYSIZE == 40); rss_getkey((void *)&raw_rss_key[0]); for (i = 0; i < nitems(rss_key); i++) { rss_key[i] = htobe32(raw_rss_key[nitems(rss_key) - 1 - i]); } t4_write_rss_key(sc, &rss_key[0], -1, 1); #endif } /* * Idempotent. */ static int adapter_full_init(struct adapter *sc) { int rc, i; ASSERT_SYNCHRONIZED_OP(sc); /* * queues that belong to the adapter (not any particular port). */ rc = t4_setup_adapter_queues(sc); if (rc != 0) return (rc); for (i = 0; i < nitems(sc->tq); i++) { if (sc->tq[i] != NULL) continue; sc->tq[i] = taskqueue_create("t4 taskq", M_NOWAIT, taskqueue_thread_enqueue, &sc->tq[i]); if (sc->tq[i] == NULL) { CH_ERR(sc, "failed to allocate task queue %d\n", i); return (ENOMEM); } taskqueue_start_threads(&sc->tq[i], 1, PI_NET, "%s tq%d", device_get_nameunit(sc->dev), i); } if (!(sc->flags & IS_VF)) { write_global_rss_key(sc); t4_intr_enable(sc); } return (0); } int adapter_init(struct adapter *sc) { int rc; ASSERT_SYNCHRONIZED_OP(sc); ADAPTER_LOCK_ASSERT_NOTOWNED(sc); KASSERT((sc->flags & FULL_INIT_DONE) == 0, ("%s: FULL_INIT_DONE already", __func__)); rc = adapter_full_init(sc); if (rc != 0) adapter_full_uninit(sc); else sc->flags |= FULL_INIT_DONE; return (rc); } /* * Idempotent. */ static void adapter_full_uninit(struct adapter *sc) { int i; t4_teardown_adapter_queues(sc); for (i = 0; i < nitems(sc->tq) && sc->tq[i]; i++) { taskqueue_free(sc->tq[i]); sc->tq[i] = NULL; } sc->flags &= ~FULL_INIT_DONE; } #ifdef RSS #define SUPPORTED_RSS_HASHTYPES (RSS_HASHTYPE_RSS_IPV4 | \ RSS_HASHTYPE_RSS_TCP_IPV4 | RSS_HASHTYPE_RSS_IPV6 | \ RSS_HASHTYPE_RSS_TCP_IPV6 | RSS_HASHTYPE_RSS_UDP_IPV4 | \ RSS_HASHTYPE_RSS_UDP_IPV6) /* Translates kernel hash types to hardware. */ static int hashconfig_to_hashen(int hashconfig) { int hashen = 0; if (hashconfig & RSS_HASHTYPE_RSS_IPV4) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN; if (hashconfig & RSS_HASHTYPE_RSS_IPV6) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN; if (hashconfig & RSS_HASHTYPE_RSS_UDP_IPV4) { hashen |= F_FW_RSS_VI_CONFIG_CMD_UDPEN | F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN; } if (hashconfig & RSS_HASHTYPE_RSS_UDP_IPV6) { hashen |= F_FW_RSS_VI_CONFIG_CMD_UDPEN | F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN; } if (hashconfig & RSS_HASHTYPE_RSS_TCP_IPV4) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN; if (hashconfig & RSS_HASHTYPE_RSS_TCP_IPV6) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN; return (hashen); } /* Translates hardware hash types to kernel. */ static int hashen_to_hashconfig(int hashen) { int hashconfig = 0; if (hashen & F_FW_RSS_VI_CONFIG_CMD_UDPEN) { /* * If UDP hashing was enabled it must have been enabled for * either IPv4 or IPv6 (inclusive or). Enabling UDP without * enabling any 4-tuple hash is nonsense configuration. 
*/ MPASS(hashen & (F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN)); if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_UDP_IPV4; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_UDP_IPV6; } if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_TCP_IPV4; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_TCP_IPV6; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN) hashconfig |= RSS_HASHTYPE_RSS_IPV4; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN) hashconfig |= RSS_HASHTYPE_RSS_IPV6; return (hashconfig); } #endif /* * Idempotent. */ static int vi_full_init(struct vi_info *vi) { struct adapter *sc = vi->adapter; struct sge_rxq *rxq; int rc, i, j; #ifdef RSS int nbuckets = rss_getnumbuckets(); int hashconfig = rss_gethashconfig(); int extra; #endif ASSERT_SYNCHRONIZED_OP(sc); /* * Allocate tx/rx/fl queues for this VI. */ rc = t4_setup_vi_queues(vi); if (rc != 0) return (rc); /* * Setup RSS for this VI. Save a copy of the RSS table for later use. */ if (vi->nrxq > vi->rss_size) { CH_ALERT(vi, "nrxq (%d) > hw RSS table size (%d); " "some queues will never receive traffic.\n", vi->nrxq, vi->rss_size); } else if (vi->rss_size % vi->nrxq) { CH_ALERT(vi, "nrxq (%d), hw RSS table size (%d); " "expect uneven traffic distribution.\n", vi->nrxq, vi->rss_size); } #ifdef RSS if (vi->nrxq != nbuckets) { CH_ALERT(vi, "nrxq (%d) != kernel RSS buckets (%d);" "performance will be impacted.\n", vi->nrxq, nbuckets); } #endif if (vi->rss == NULL) vi->rss = malloc(vi->rss_size * sizeof (*vi->rss), M_CXGBE, M_ZERO | M_WAITOK); for (i = 0; i < vi->rss_size;) { #ifdef RSS j = rss_get_indirection_to_bucket(i); j %= vi->nrxq; rxq = &sc->sge.rxq[vi->first_rxq + j]; vi->rss[i++] = rxq->iq.abs_id; #else for_each_rxq(vi, j, rxq) { vi->rss[i++] = rxq->iq.abs_id; if (i == vi->rss_size) break; } #endif } rc = -t4_config_rss_range(sc, sc->mbox, vi->viid, 0, vi->rss_size, vi->rss, vi->rss_size); if (rc != 0) { CH_ERR(vi, "rss_config failed: %d\n", rc); return (rc); } #ifdef RSS vi->hashen = hashconfig_to_hashen(hashconfig); /* * We may have had to enable some hashes even though the global config * wants them disabled. This is a potential problem that must be * reported to the user. */ extra = hashen_to_hashconfig(vi->hashen) ^ hashconfig; /* * If we consider only the supported hash types, then the enabled hashes * are a superset of the requested hashes. In other words, there cannot * be any supported hash that was requested but not enabled, but there * can be hashes that were not requested but had to be enabled. 
*/ extra &= SUPPORTED_RSS_HASHTYPES; MPASS((extra & hashconfig) == 0); if (extra) { CH_ALERT(vi, "global RSS config (0x%x) cannot be accommodated.\n", hashconfig); } if (extra & RSS_HASHTYPE_RSS_IPV4) CH_ALERT(vi, "IPv4 2-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_TCP_IPV4) CH_ALERT(vi, "TCP/IPv4 4-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_IPV6) CH_ALERT(vi, "IPv6 2-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_TCP_IPV6) CH_ALERT(vi, "TCP/IPv6 4-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_UDP_IPV4) CH_ALERT(vi, "UDP/IPv4 4-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_UDP_IPV6) CH_ALERT(vi, "UDP/IPv6 4-tuple hashing forced on.\n"); #else vi->hashen = F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN | F_FW_RSS_VI_CONFIG_CMD_UDPEN; #endif rc = -t4_config_vi_rss(sc, sc->mbox, vi->viid, vi->hashen, vi->rss[0], 0, 0); if (rc != 0) { CH_ERR(vi, "rss hash/defaultq config failed: %d\n", rc); return (rc); } return (0); } int vi_init(struct vi_info *vi) { int rc; ASSERT_SYNCHRONIZED_OP(vi->adapter); KASSERT((vi->flags & VI_INIT_DONE) == 0, ("%s: VI_INIT_DONE already", __func__)); rc = vi_full_init(vi); if (rc != 0) vi_full_uninit(vi); else vi->flags |= VI_INIT_DONE; return (rc); } /* * Idempotent. */ static void vi_full_uninit(struct vi_info *vi) { if (vi->flags & VI_INIT_DONE) { quiesce_vi(vi); free(vi->rss, M_CXGBE); free(vi->nm_rss, M_CXGBE); } t4_teardown_vi_queues(vi); vi->flags &= ~VI_INIT_DONE; } static void quiesce_txq(struct sge_txq *txq) { struct sge_eq *eq = &txq->eq; struct sge_qstat *spg = (void *)&eq->desc[eq->sidx]; MPASS(eq->flags & EQ_SW_ALLOCATED); MPASS(!(eq->flags & EQ_ENABLED)); /* Wait for the mp_ring to empty. */ while (!mp_ring_is_idle(txq->r)) { mp_ring_check_drainage(txq->r, 4096); pause("rquiesce", 1); } MPASS(txq->txp.npkt == 0); if (eq->flags & EQ_HW_ALLOCATED) { /* * Hardware is alive and working normally. Wait for it to * finish and then wait for the driver to catch up and reclaim * all descriptors. */ while (spg->cidx != htobe16(eq->pidx)) pause("equiesce", 1); while (eq->cidx != eq->pidx) pause("dquiesce", 1); } else { /* * Hardware is unavailable. Discard all pending tx and reclaim * descriptors directly. */ TXQ_LOCK(txq); while (eq->cidx != eq->pidx) { struct mbuf *m, *nextpkt; struct tx_sdesc *txsd; txsd = &txq->sdesc[eq->cidx]; for (m = txsd->m; m != NULL; m = nextpkt) { nextpkt = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); } IDXINCR(eq->cidx, txsd->desc_used, eq->sidx); } spg->pidx = spg->cidx = htobe16(eq->cidx); TXQ_UNLOCK(txq); } } static void quiesce_wrq(struct sge_wrq *wrq) { /* XXXTX */ } static void quiesce_iq_fl(struct adapter *sc, struct sge_iq *iq, struct sge_fl *fl) { /* Synchronize with the interrupt handler */ while (!atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_DISABLED)) pause("iqfree", 1); if (fl != NULL) { MPASS(iq->flags & IQ_HAS_FL); mtx_lock(&sc->sfl_lock); FL_LOCK(fl); fl->flags |= FL_DOOMED; FL_UNLOCK(fl); callout_stop(&sc->sfl_callout); mtx_unlock(&sc->sfl_lock); KASSERT((fl->flags & FL_STARVING) == 0, ("%s: still starving", __func__)); /* Release all buffers if hardware is no longer available. */ if (!(iq->flags & IQ_HW_ALLOCATED)) free_fl_buffers(sc, fl); } } /* * Wait for all activity on all the queues of the VI to complete. It is assumed * that no new work is being enqueued by the hardware or the driver. That part * should be arranged before calling this function. 
*/ static void quiesce_vi(struct vi_info *vi) { int i; struct adapter *sc = vi->adapter; struct sge_rxq *rxq; struct sge_txq *txq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #if defined(TCP_OFFLOAD) || defined(RATELIMIT) struct sge_ofld_txq *ofld_txq; #endif if (!(vi->flags & VI_INIT_DONE)) return; for_each_txq(vi, i, txq) { quiesce_txq(txq); } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) for_each_ofld_txq(vi, i, ofld_txq) { quiesce_wrq(&ofld_txq->wrq); } #endif for_each_rxq(vi, i, rxq) { quiesce_iq_fl(sc, &rxq->iq, &rxq->fl); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { quiesce_iq_fl(sc, &ofld_rxq->iq, &ofld_rxq->fl); } #endif } static int t4_alloc_irq(struct adapter *sc, struct irq *irq, int rid, driver_intr_t *handler, void *arg, char *name) { int rc; irq->rid = rid; irq->res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &irq->rid, RF_SHAREABLE | RF_ACTIVE); if (irq->res == NULL) { device_printf(sc->dev, "failed to allocate IRQ for rid %d, name %s.\n", rid, name); return (ENOMEM); } rc = bus_setup_intr(sc->dev, irq->res, INTR_MPSAFE | INTR_TYPE_NET, NULL, handler, arg, &irq->tag); if (rc != 0) { device_printf(sc->dev, "failed to setup interrupt for rid %d, name %s: %d\n", rid, name, rc); } else if (name) bus_describe_intr(sc->dev, irq->res, irq->tag, "%s", name); return (rc); } static int t4_free_irq(struct adapter *sc, struct irq *irq) { if (irq->tag) bus_teardown_intr(sc->dev, irq->res, irq->tag); if (irq->res) bus_release_resource(sc->dev, SYS_RES_IRQ, irq->rid, irq->res); bzero(irq, sizeof(*irq)); return (0); } static void get_regs(struct adapter *sc, struct t4_regdump *regs, uint8_t *buf) { regs->version = chip_id(sc) | chip_rev(sc) << 10; t4_get_regs(sc, buf, regs->len); } #define A_PL_INDIR_CMD 0x1f8 #define S_PL_AUTOINC 31 #define M_PL_AUTOINC 0x1U #define V_PL_AUTOINC(x) ((x) << S_PL_AUTOINC) #define G_PL_AUTOINC(x) (((x) >> S_PL_AUTOINC) & M_PL_AUTOINC) #define S_PL_VFID 20 #define M_PL_VFID 0xffU #define V_PL_VFID(x) ((x) << S_PL_VFID) #define G_PL_VFID(x) (((x) >> S_PL_VFID) & M_PL_VFID) #define S_PL_ADDR 0 #define M_PL_ADDR 0xfffffU #define V_PL_ADDR(x) ((x) << S_PL_ADDR) #define G_PL_ADDR(x) (((x) >> S_PL_ADDR) & M_PL_ADDR) #define A_PL_INDIR_DATA 0x1fc static uint64_t read_vf_stat(struct adapter *sc, u_int vin, int reg) { u32 stats[2]; if (sc->flags & IS_VF) { stats[0] = t4_read_reg(sc, VF_MPS_REG(reg)); stats[1] = t4_read_reg(sc, VF_MPS_REG(reg + 4)); } else { mtx_assert(&sc->reg_lock, MA_OWNED); t4_write_reg(sc, A_PL_INDIR_CMD, V_PL_AUTOINC(1) | V_PL_VFID(vin) | V_PL_ADDR(VF_MPS_REG(reg))); stats[0] = t4_read_reg(sc, A_PL_INDIR_DATA); stats[1] = t4_read_reg(sc, A_PL_INDIR_DATA); } return (((uint64_t)stats[1]) << 32 | stats[0]); } static void t4_get_vi_stats(struct adapter *sc, u_int vin, struct fw_vi_stats_vf *stats) { #define GET_STAT(name) \ read_vf_stat(sc, vin, A_MPS_VF_STAT_##name##_L) if (!(sc->flags & IS_VF)) mtx_lock(&sc->reg_lock); stats->tx_bcast_bytes = GET_STAT(TX_VF_BCAST_BYTES); stats->tx_bcast_frames = GET_STAT(TX_VF_BCAST_FRAMES); stats->tx_mcast_bytes = GET_STAT(TX_VF_MCAST_BYTES); stats->tx_mcast_frames = GET_STAT(TX_VF_MCAST_FRAMES); stats->tx_ucast_bytes = GET_STAT(TX_VF_UCAST_BYTES); stats->tx_ucast_frames = GET_STAT(TX_VF_UCAST_FRAMES); stats->tx_drop_frames = GET_STAT(TX_VF_DROP_FRAMES); stats->tx_offload_bytes = GET_STAT(TX_VF_OFFLOAD_BYTES); stats->tx_offload_frames = GET_STAT(TX_VF_OFFLOAD_FRAMES); stats->rx_bcast_bytes = GET_STAT(RX_VF_BCAST_BYTES); stats->rx_bcast_frames = GET_STAT(RX_VF_BCAST_FRAMES); 
stats->rx_mcast_bytes = GET_STAT(RX_VF_MCAST_BYTES); stats->rx_mcast_frames = GET_STAT(RX_VF_MCAST_FRAMES); stats->rx_ucast_bytes = GET_STAT(RX_VF_UCAST_BYTES); stats->rx_ucast_frames = GET_STAT(RX_VF_UCAST_FRAMES); stats->rx_err_frames = GET_STAT(RX_VF_ERR_FRAMES); if (!(sc->flags & IS_VF)) mtx_unlock(&sc->reg_lock); #undef GET_STAT } static void t4_clr_vi_stats(struct adapter *sc, u_int vin) { int reg; t4_write_reg(sc, A_PL_INDIR_CMD, V_PL_AUTOINC(1) | V_PL_VFID(vin) | V_PL_ADDR(VF_MPS_REG(A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L))); for (reg = A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L; reg <= A_MPS_VF_STAT_RX_VF_ERR_FRAMES_H; reg += 4) t4_write_reg(sc, A_PL_INDIR_DATA, 0); } static void vi_refresh_stats(struct vi_info *vi) { struct timeval tv; const struct timeval interval = {0, 250000}; /* 250ms */ mtx_assert(&vi->tick_mtx, MA_OWNED); if (vi->flags & VI_SKIP_STATS) return; getmicrotime(&tv); timevalsub(&tv, &interval); if (timevalcmp(&tv, &vi->last_refreshed, <)) return; t4_get_vi_stats(vi->adapter, vi->vin, &vi->stats); getmicrotime(&vi->last_refreshed); } static void cxgbe_refresh_stats(struct vi_info *vi) { u_int i, v, tnl_cong_drops, chan_map; struct timeval tv; const struct timeval interval = {0, 250000}; /* 250ms */ struct port_info *pi; struct adapter *sc; mtx_assert(&vi->tick_mtx, MA_OWNED); if (vi->flags & VI_SKIP_STATS) return; getmicrotime(&tv); timevalsub(&tv, &interval); if (timevalcmp(&tv, &vi->last_refreshed, <)) return; pi = vi->pi; sc = vi->adapter; tnl_cong_drops = 0; t4_get_port_stats(sc, pi->port_id, &pi->stats); chan_map = pi->rx_e_chan_map; while (chan_map) { i = ffs(chan_map) - 1; mtx_lock(&sc->reg_lock); t4_read_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v, 1, A_TP_MIB_TNL_CNG_DROP_0 + i); mtx_unlock(&sc->reg_lock); tnl_cong_drops += v; chan_map &= ~(1 << i); } pi->tnl_cong_drops = tnl_cong_drops; getmicrotime(&vi->last_refreshed); } static void cxgbe_tick(void *arg) { struct vi_info *vi = arg; MPASS(IS_MAIN_VI(vi)); mtx_assert(&vi->tick_mtx, MA_OWNED); cxgbe_refresh_stats(vi); callout_schedule(&vi->tick, hz); } static void vi_tick(void *arg) { struct vi_info *vi = arg; mtx_assert(&vi->tick_mtx, MA_OWNED); vi_refresh_stats(vi); callout_schedule(&vi->tick, hz); } /* * Should match fw_caps_config_ enums in t4fw_interface.h */ static char *caps_decoder[] = { "\20\001IPMI\002NCSI", /* 0: NBM */ "\20\001PPP\002QFC\003DCBX", /* 1: link */ "\20\001INGRESS\002EGRESS", /* 2: switch */ "\20\001NIC\002VM\003IDS\004UM\005UM_ISGL" /* 3: NIC */ "\006HASHFILTER\007ETHOFLD", "\20\001TOE", /* 4: TOE */ "\20\001RDDP\002RDMAC", /* 5: RDMA */ "\20\001INITIATOR_PDU\002TARGET_PDU" /* 6: iSCSI */ "\003INITIATOR_CNXOFLD\004TARGET_CNXOFLD" "\005INITIATOR_SSNOFLD\006TARGET_SSNOFLD" "\007T10DIF" "\010INITIATOR_CMDOFLD\011TARGET_CMDOFLD", "\20\001LOOKASIDE\002TLSKEYS\003IPSEC_INLINE" /* 7: Crypto */ "\004TLS_HW", "\20\001INITIATOR\002TARGET\003CTRL_OFLD" /* 8: FCoE */ "\004PO_INITIATOR\005PO_TARGET", }; void t4_sysctls(struct adapter *sc) { struct sysctl_ctx_list *ctx = &sc->ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children, *c0; static char *doorbells = {"\20\1UDB\2WCWR\3UDBWC\4KDB"}; /* * dev.t4nex.X. 
*/ oid = device_get_sysctl_tree(sc->dev); c0 = children = SYSCTL_CHILDREN(oid); sc->sc_do_rxcopy = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "do_rx_copy", CTLFLAG_RW, &sc->sc_do_rxcopy, 1, "Do RX copy of small frames"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nports", CTLFLAG_RD, NULL, sc->params.nports, "# of ports"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "doorbells", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, doorbells, (uintptr_t)&sc->doorbells, sysctl_bitfield_8b, "A", "available doorbells"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "core_clock", CTLFLAG_RD, NULL, sc->params.vpd.cclk, "core clock frequency (in KHz)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_timers", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc->params.sge.timer_val, sizeof(sc->params.sge.timer_val), sysctl_int_array, "A", "interrupt holdoff timer values (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pkt_counts", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc->params.sge.counter_val, sizeof(sc->params.sge.counter_val), sysctl_int_array, "A", "interrupt holdoff packet counter values"); t4_sge_sysctls(sc, ctx, children); sc->lro_timeout = 100; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lro_timeout", CTLFLAG_RW, &sc->lro_timeout, 0, "lro inactive-flush timeout (in us)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dflags", CTLFLAG_RW, &sc->debug_flags, 0, "flags to enable runtime debugging"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "tp_version", CTLFLAG_RD, sc->tp_version, 0, "TP microcode version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version, 0, "firmware version"); if (sc->flags & IS_VF) return; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "hw_revision", CTLFLAG_RD, NULL, chip_rev(sc), "chip hardware revision"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "sn", CTLFLAG_RD, sc->params.vpd.sn, 0, "serial number"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "pn", CTLFLAG_RD, sc->params.vpd.pn, 0, "part number"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "ec", CTLFLAG_RD, sc->params.vpd.ec, 0, "engineering change"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "md_version", CTLFLAG_RD, sc->params.vpd.md, 0, "manufacturing diags version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "na", CTLFLAG_RD, sc->params.vpd.na, 0, "network address"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "er_version", CTLFLAG_RD, sc->er_version, 0, "expansion ROM version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "bs_version", CTLFLAG_RD, sc->bs_version, 0, "bootstrap firmware version"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "scfg_version", CTLFLAG_RD, NULL, sc->params.scfg_vers, "serial config version"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "vpd_version", CTLFLAG_RD, NULL, sc->params.vpd_vers, "VPD version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "cf", CTLFLAG_RD, sc->cfg_file, 0, "configuration file"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cfcsum", CTLFLAG_RD, NULL, sc->cfcsum, "config file checksum"); #define SYSCTL_CAP(name, n, text) \ SYSCTL_ADD_PROC(ctx, children, OID_AUTO, #name, \ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, caps_decoder[n], \ (uintptr_t)&sc->name, sysctl_bitfield_16b, "A", \ "available " text " capabilities") SYSCTL_CAP(nbmcaps, 0, "NBM"); SYSCTL_CAP(linkcaps, 1, "link"); SYSCTL_CAP(switchcaps, 2, "switch"); SYSCTL_CAP(niccaps, 3, "NIC"); SYSCTL_CAP(toecaps, 4, "TCP offload"); SYSCTL_CAP(rdmacaps, 5, "RDMA"); SYSCTL_CAP(iscsicaps, 6, "iSCSI"); SYSCTL_CAP(cryptocaps, 7, "crypto"); SYSCTL_CAP(fcoecaps, 8, "FCoE"); #undef SYSCTL_CAP 
SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nfilters", CTLFLAG_RD, NULL, sc->tids.nftids, "number of filters"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_temperature, "I", "chip temperature (in Celsius)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "reset_sensor", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, sysctl_reset_sensor, "I", "reset the chip's temperature sensor."); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "loadavg", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_loadavg, "A", "microprocessor load averages (debug firmwares only)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "core_vdd", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_vdd, "I", "core Vdd (in mV)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "local_cpus", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, LOCAL_CPUS, sysctl_cpus, "A", "local CPUs"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_cpus", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, INTR_CPUS, sysctl_cpus, "A", "preferred CPUs for interrupts"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "swintr", CTLFLAG_RW, &sc->swintr, 0, "software triggered interrupts"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "reset", CTLTYPE_INT | CTLFLAG_RW, sc, 0, sysctl_reset, "I", "1 = reset adapter, 0 = zero reset counter"); /* * dev.t4nex.X.misc. Marked CTLFLAG_SKIP to avoid information overload. */ oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "misc", CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, "logs and miscellaneous information"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cctrl", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_cctrl, "A", "congestion control"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp0", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_cim_ibq_obq, "A", "CIM IBQ 0 (TP0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp1", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 1, sysctl_cim_ibq_obq, "A", "CIM IBQ 1 (TP1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ulp", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 2, sysctl_cim_ibq_obq, "A", "CIM IBQ 2 (ULP)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge0", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 3, sysctl_cim_ibq_obq, "A", "CIM IBQ 3 (SGE0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge1", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 4, sysctl_cim_ibq_obq, "A", "CIM IBQ 4 (SGE1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ncsi", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 5, sysctl_cim_ibq_obq, "A", "CIM IBQ 5 (NCSI)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_la", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_cim_la, "A", "CIM logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ma_la", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_cim_ma_la, "A", "CIM MA logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp0", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 0 (ULP0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp1", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 1 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 1 (ULP1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp2", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 2 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 2 (ULP2)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp3", 
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 3 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 3 (ULP3)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 4 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 4 (SGE)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ncsi", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 5 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 5 (NCSI)"); if (chip_id(sc) > CHELSIO_T4) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge0_rx", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 6 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 6 (SGE0-RX)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge1_rx", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 7 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 7 (SGE1-RX)"); } SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_pif_la", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_cim_pif_la, "A", "CIM PIF logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_qcfg", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_cim_qcfg, "A", "CIM queue configuration"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cpl_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_cpl_stats, "A", "CPL statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ddp_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_ddp_stats, "A", "non-TCP DDP statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tid_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tid_stats, "A", "tid stats"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "devlog", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_devlog, "A", "firmware's device log"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fcoe_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_fcoe_stats, "A", "FCoE statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "hw_sched", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_hw_sched, "A", "hardware scheduler "); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "l2t", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_l2t, "A", "hardware L2 table"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "smt", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_smt, "A", "hardware source MAC table"); #ifdef INET6 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "clip", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_clip, "A", "active CLIP table entries"); #endif SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "lb_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_lb_stats, "A", "loopback statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "meminfo", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_meminfo, "A", "memory regions"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "mps_tcam", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, chip_id(sc) <= CHELSIO_T5 ? 
sysctl_mps_tcam : sysctl_mps_tcam_t6, "A", "MPS TCAM entries"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "path_mtus", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_path_mtus, "A", "path MTUs"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pm_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_pm_stats, "A", "PM statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_rdma_stats, "A", "RDMA statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tcp_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tcp_stats, "A", "TCP statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tids", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tids, "A", "TID information"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_err_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tp_err_stats, "A", "TP error statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tnl_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tnl_stats, "A", "TP tunnel statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la_mask", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, sysctl_tp_la_mask, "I", "TP logic analyzer event capture mask"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tp_la, "A", "TP logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tx_rate", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tx_rate, "A", "Tx rate"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ulprx_la", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_ulprx_la, "A", "ULPRX logic analyzer"); if (chip_id(sc) >= CHELSIO_T5) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "wcwr_stats", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_wcwr_stats, "A", "write combined work requests"); } #ifdef KERN_TLS if (is_ktls(sc)) { /* * dev.t4nex.0.tls. */ oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "tls", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "KERN_TLS parameters"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "inline_keys", CTLFLAG_RW, &sc->tlst.inline_keys, 0, "Always pass TLS " "keys in work requests (1) or attempt to store TLS keys " "in card memory."); if (is_t6(sc)) SYSCTL_ADD_INT(ctx, children, OID_AUTO, "combo_wrs", CTLFLAG_RW, &sc->tlst.combo_wrs, 0, "Attempt to " "combine TCB field updates with TLS record work " "requests."); } #endif #ifdef TCP_OFFLOAD if (is_offload(sc)) { int i; char s[4]; /* * dev.t4nex.X.toe. 
*/ oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "toe", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE parameters"); children = SYSCTL_CHILDREN(oid); sc->tt.cong_algorithm = -1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_algorithm", CTLFLAG_RW, &sc->tt.cong_algorithm, 0, "congestion control " "(-1 = default, 0 = reno, 1 = tahoe, 2 = newreno, " "3 = highspeed)"); sc->tt.sndbuf = -1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sndbuf", CTLFLAG_RW, &sc->tt.sndbuf, 0, "hardware send buffer"); sc->tt.ddp = 0; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp", CTLFLAG_RW | CTLFLAG_SKIP, &sc->tt.ddp, 0, ""); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_zcopy", CTLFLAG_RW, &sc->tt.ddp, 0, "Enable zero-copy aio_read(2)"); sc->tt.rx_coalesce = -1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_coalesce", CTLFLAG_RW, &sc->tt.rx_coalesce, 0, "receive coalescing"); sc->tt.tls = 0; SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tls", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, sysctl_tls, "I", "Inline TLS allowed"); sc->tt.tx_align = -1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_align", CTLFLAG_RW, &sc->tt.tx_align, 0, "chop and align payload"); sc->tt.tx_zcopy = 0; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_zcopy", CTLFLAG_RW, &sc->tt.tx_zcopy, 0, "Enable zero-copy aio_write(2)"); sc->tt.cop_managed_offloading = !!t4_cop_managed_offloading; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cop_managed_offloading", CTLFLAG_RW, &sc->tt.cop_managed_offloading, 0, "COP (Connection Offload Policy) controls all TOE offload"); sc->tt.autorcvbuf_inc = 16 * 1024; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "autorcvbuf_inc", CTLFLAG_RW, &sc->tt.autorcvbuf_inc, 0, "autorcvbuf increment"); sc->tt.update_hc_on_pmtu_change = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "update_hc_on_pmtu_change", CTLFLAG_RW, &sc->tt.update_hc_on_pmtu_change, 0, "Update hostcache entry if the PMTU changes"); sc->tt.iso = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "iso", CTLFLAG_RW, &sc->tt.iso, 0, "Enable iSCSI segmentation offload"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timer_tick", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tp_tick, "A", "TP timer tick (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timestamp_tick", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 1, sysctl_tp_tick, "A", "TCP timestamp tick (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_tick", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 2, sysctl_tp_tick, "A", "DACK tick (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_timer", CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_tp_dack_timer, "IU", "DACK timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_min", CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_RXT_MIN, sysctl_tp_timer, "LU", "Minimum retransmit interval (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_max", CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_RXT_MAX, sysctl_tp_timer, "LU", "Maximum retransmit interval (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_min", CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_PERS_MIN, sysctl_tp_timer, "LU", "Persist timer min (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_max", CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_PERS_MAX, sysctl_tp_timer, "LU", "Persist timer max (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_idle", CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_KEEP_IDLE, sysctl_tp_timer, "LU", "Keepalive idle timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_interval", 
CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_KEEP_INTVL, sysctl_tp_timer, "LU", "Keepalive interval timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "initial_srtt", CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_INIT_SRTT, sysctl_tp_timer, "LU", "Initial SRTT (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "finwait2_timer", CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, A_TP_FINWAIT2_TIMER, sysctl_tp_timer, "LU", "FINWAIT2 timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "syn_rexmt_count", CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, S_SYNSHIFTMAX, sysctl_tp_shift_cnt, "IU", "Number of SYN retransmissions before abort"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_count", CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, S_RXTSHIFTMAXR2, sysctl_tp_shift_cnt, "IU", "Number of retransmissions before abort"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_count", CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, S_KEEPALIVEMAXR2, sysctl_tp_shift_cnt, "IU", "Number of keepalive probes before abort"); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "rexmt_backoff", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE retransmit backoffs"); children = SYSCTL_CHILDREN(oid); for (i = 0; i < 16; i++) { snprintf(s, sizeof(s), "%u", i); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, s, CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, i, sysctl_tp_backoff, "IU", "TOE retransmit backoff"); } } #endif } void vi_sysctls(struct vi_info *vi) { struct sysctl_ctx_list *ctx = &vi->ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children; /* * dev.v?(cxgbe|cxl).X. */ oid = device_get_sysctl_tree(vi->dev); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "viid", CTLFLAG_RD, NULL, vi->viid, "VI identifer"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nrxq", CTLFLAG_RD, &vi->nrxq, 0, "# of rx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ntxq", CTLFLAG_RD, &vi->ntxq, 0, "# of tx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_rxq", CTLFLAG_RD, &vi->first_rxq, 0, "index of first rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD, &vi->first_txq, 0, "index of first tx queue"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rss_base", CTLFLAG_RD, NULL, vi->rss_base, "start of RSS indirection table"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rss_size", CTLFLAG_RD, NULL, vi->rss_size, "size of RSS indirection table"); if (IS_MAIN_VI(vi)) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rsrv_noflowq", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0, sysctl_noflowq, "IU", "Reserve queue 0 for non-flowid packets"); } if (vi->adapter->flags & IS_VF) { MPASS(vi->flags & TX_USES_VM_WR); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_vm_wr", CTLFLAG_RD, NULL, 1, "use VM work requests for transmit"); } else { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tx_vm_wr", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0, sysctl_tx_vm_wr, "I", "use VM work requestes for transmit"); } #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD, &vi->nofldrxq, 0, "# of rx queues for offloaded TCP connections"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_rxq", CTLFLAG_RD, &vi->first_ofld_rxq, 0, "index of first TOE rx queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx_ofld", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0, sysctl_holdoff_tmr_idx_ofld, "I", "holdoff timer index for TOE queues"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx_ofld", CTLTYPE_INT | CTLFLAG_RW | 
CTLFLAG_MPSAFE, vi, 0, sysctl_holdoff_pktc_idx_ofld, "I", "holdoff packet counter index for TOE queues"); } #endif #if defined(TCP_OFFLOAD) || defined(RATELIMIT) if (vi->nofldtxq != 0) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldtxq", CTLFLAG_RD, &vi->nofldtxq, 0, "# of tx queues for TOE/ETHOFLD"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_txq", CTLFLAG_RD, &vi->first_ofld_txq, 0, "index of first TOE/ETHOFLD tx queue"); } #endif #ifdef DEV_NETMAP if (vi->nnmrxq != 0) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmrxq", CTLFLAG_RD, &vi->nnmrxq, 0, "# of netmap rx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmtxq", CTLFLAG_RD, &vi->nnmtxq, 0, "# of netmap tx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_rxq", CTLFLAG_RD, &vi->first_nm_rxq, 0, "index of first netmap rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_txq", CTLFLAG_RD, &vi->first_nm_txq, 0, "index of first netmap tx queue"); } #endif SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0, sysctl_holdoff_tmr_idx, "I", "holdoff timer index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0, sysctl_holdoff_pktc_idx, "I", "holdoff packet counter index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_rxq", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0, sysctl_qsize_rxq, "I", "rx queue size"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_txq", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, 0, sysctl_qsize_txq, "I", "tx queue size"); } static void cxgbe_sysctls(struct port_info *pi) { struct sysctl_ctx_list *ctx = &pi->ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children, *children2; struct adapter *sc = pi->adapter; int i; char name[16]; static char *tc_flags = {"\20\1USER"}; /* * dev.cxgbe.X. 
*/ oid = device_get_sysctl_tree(pi->dev); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "linkdnrc", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, pi, 0, sysctl_linkdnrc, "A", "reason why link is down"); if (pi->port_type == FW_PORT_TYPE_BT_XAUI) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, pi, 0, sysctl_btphy, "I", "PHY temperature (in Celsius)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fw_version", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, pi, 1, sysctl_btphy, "I", "PHY firmware version"); } SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pause_settings", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, pi, 0, sysctl_pause_settings, "A", "PAUSE settings (bit 0 = rx_pause, 1 = tx_pause, 2 = pause_autoneg)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_fec", CTLTYPE_STRING | CTLFLAG_MPSAFE, pi, 0, sysctl_link_fec, "A", "FEC in use on the link"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "requested_fec", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, pi, 0, sysctl_requested_fec, "A", "FECs to use (bit 0 = RS, 1 = FC, 2 = none, 5 = auto, 6 = module)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "module_fec", CTLTYPE_STRING | CTLFLAG_MPSAFE, pi, 0, sysctl_module_fec, "A", "FEC recommended by the cable/transceiver"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "autoneg", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, pi, 0, sysctl_autoneg, "I", "autonegotiation (-1 = not supported)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "force_fec", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, pi, 0, sysctl_force_fec, "I", "when to use FORCE_FEC bit for link config"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rcaps", CTLFLAG_RD, &pi->link_cfg.requested_caps, 0, "L1 config requested by driver"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcaps", CTLFLAG_RD, &pi->link_cfg.pcaps, 0, "port capabilities"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "acaps", CTLFLAG_RD, &pi->link_cfg.acaps, 0, "advertised capabilities"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lpacaps", CTLFLAG_RD, &pi->link_cfg.lpacaps, 0, "link partner advertised capabilities"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "max_speed", CTLFLAG_RD, NULL, port_top_speed(pi), "max speed (in Gbps)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "mps_bg_map", CTLFLAG_RD, NULL, pi->mps_bg_map, "MPS buffer group map"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_e_chan_map", CTLFLAG_RD, NULL, pi->rx_e_chan_map, "TP rx e-channel map"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_c_chan", CTLFLAG_RD, NULL, pi->rx_c_chan, "TP rx c-channel"); if (sc->flags & IS_VF) return; /* * dev.(cxgbe|cxl).X.tc. 
*/ oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "tc", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Tx scheduler traffic classes (cl_rl)"); children2 = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "pktsize", CTLFLAG_RW, &pi->sched_params->pktsize, 0, "pktsize for per-flow cl-rl (0 means up to the driver )"); SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "burstsize", CTLFLAG_RW, &pi->sched_params->burstsize, 0, "burstsize for per-flow cl-rl (0 means up to the driver)"); for (i = 0; i < sc->params.nsched_cls; i++) { struct tx_cl_rl_params *tc = &pi->sched_params->cl_rl[i]; snprintf(name, sizeof(name), "%d", i); children2 = SYSCTL_CHILDREN(SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "traffic class")); SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "state", CTLFLAG_RD, &tc->state, 0, "current state"); SYSCTL_ADD_PROC(ctx, children2, OID_AUTO, "flags", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, tc_flags, (uintptr_t)&tc->flags, sysctl_bitfield_8b, "A", "flags"); SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "refcount", CTLFLAG_RD, &tc->refcount, 0, "references to this class"); SYSCTL_ADD_PROC(ctx, children2, OID_AUTO, "params", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, (pi->port_id << 16) | i, sysctl_tc_params, "A", "traffic class parameters"); } /* * dev.cxgbe.X.stats. */ oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "port statistics"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_parse_error", CTLFLAG_RD, &pi->tx_parse_error, 0, "# of tx packets with invalid length or # of segments"); #define T4_REGSTAT(name, stat, desc) \ SYSCTL_ADD_OID(ctx, children, OID_AUTO, #name, \ CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, \ (is_t4(sc) ? 
PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_##stat##_L) : \ T5_PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_##stat##_L)), \ sysctl_handle_t4_reg64, "QU", desc) /* We get these from port_stats and they may be stale by up to 1s */ #define T4_PORTSTAT(name, desc) \ SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, #name, CTLFLAG_RD, \ &pi->stats.name, desc) T4_REGSTAT(tx_octets, TX_PORT_BYTES, "# of octets in good frames"); T4_REGSTAT(tx_frames, TX_PORT_FRAMES, "total # of good frames"); T4_REGSTAT(tx_bcast_frames, TX_PORT_BCAST, "# of broadcast frames"); T4_REGSTAT(tx_mcast_frames, TX_PORT_MCAST, "# of multicast frames"); T4_REGSTAT(tx_ucast_frames, TX_PORT_UCAST, "# of unicast frames"); T4_REGSTAT(tx_error_frames, TX_PORT_ERROR, "# of error frames"); T4_REGSTAT(tx_frames_64, TX_PORT_64B, "# of tx frames in this range"); T4_REGSTAT(tx_frames_65_127, TX_PORT_65B_127B, "# of tx frames in this range"); T4_REGSTAT(tx_frames_128_255, TX_PORT_128B_255B, "# of tx frames in this range"); T4_REGSTAT(tx_frames_256_511, TX_PORT_256B_511B, "# of tx frames in this range"); T4_REGSTAT(tx_frames_512_1023, TX_PORT_512B_1023B, "# of tx frames in this range"); T4_REGSTAT(tx_frames_1024_1518, TX_PORT_1024B_1518B, "# of tx frames in this range"); T4_REGSTAT(tx_frames_1519_max, TX_PORT_1519B_MAX, "# of tx frames in this range"); T4_REGSTAT(tx_drop, TX_PORT_DROP, "# of dropped tx frames"); T4_REGSTAT(tx_pause, TX_PORT_PAUSE, "# of pause frames transmitted"); T4_REGSTAT(tx_ppp0, TX_PORT_PPP0, "# of PPP prio 0 frames transmitted"); T4_REGSTAT(tx_ppp1, TX_PORT_PPP1, "# of PPP prio 1 frames transmitted"); T4_REGSTAT(tx_ppp2, TX_PORT_PPP2, "# of PPP prio 2 frames transmitted"); T4_REGSTAT(tx_ppp3, TX_PORT_PPP3, "# of PPP prio 3 frames transmitted"); T4_REGSTAT(tx_ppp4, TX_PORT_PPP4, "# of PPP prio 4 frames transmitted"); T4_REGSTAT(tx_ppp5, TX_PORT_PPP5, "# of PPP prio 5 frames transmitted"); T4_REGSTAT(tx_ppp6, TX_PORT_PPP6, "# of PPP prio 6 frames transmitted"); T4_REGSTAT(tx_ppp7, TX_PORT_PPP7, "# of PPP prio 7 frames transmitted"); T4_REGSTAT(rx_octets, RX_PORT_BYTES, "# of octets in good frames"); T4_REGSTAT(rx_frames, RX_PORT_FRAMES, "total # of good frames"); T4_REGSTAT(rx_bcast_frames, RX_PORT_BCAST, "# of broadcast frames"); T4_REGSTAT(rx_mcast_frames, RX_PORT_MCAST, "# of multicast frames"); T4_REGSTAT(rx_ucast_frames, RX_PORT_UCAST, "# of unicast frames"); T4_REGSTAT(rx_too_long, RX_PORT_MTU_ERROR, "# of frames exceeding MTU"); T4_REGSTAT(rx_jabber, RX_PORT_MTU_CRC_ERROR, "# of jabber frames"); if (is_t6(sc)) { T4_PORTSTAT(rx_fcs_err, "# of frames received with bad FCS since last link up"); } else { T4_REGSTAT(rx_fcs_err, RX_PORT_CRC_ERROR, "# of frames received with bad FCS"); } T4_REGSTAT(rx_len_err, RX_PORT_LEN_ERROR, "# of frames received with length error"); T4_REGSTAT(rx_symbol_err, RX_PORT_SYM_ERROR, "symbol errors"); T4_REGSTAT(rx_runt, RX_PORT_LESS_64B, "# of short frames received"); T4_REGSTAT(rx_frames_64, RX_PORT_64B, "# of rx frames in this range"); T4_REGSTAT(rx_frames_65_127, RX_PORT_65B_127B, "# of rx frames in this range"); T4_REGSTAT(rx_frames_128_255, RX_PORT_128B_255B, "# of rx frames in this range"); T4_REGSTAT(rx_frames_256_511, RX_PORT_256B_511B, "# of rx frames in this range"); T4_REGSTAT(rx_frames_512_1023, RX_PORT_512B_1023B, "# of rx frames in this range"); T4_REGSTAT(rx_frames_1024_1518, RX_PORT_1024B_1518B, "# of rx frames in this range"); T4_REGSTAT(rx_frames_1519_max, RX_PORT_1519B_MAX, "# of rx frames in this range"); T4_REGSTAT(rx_pause, RX_PORT_PAUSE, "# of pause frames received"); 
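/*
 * The T4_REGSTAT()/T4_PORTSTAT() macros above rely on two preprocessor
 * features: #name stringifies the first argument into the sysctl OID name,
 * and A_MPS_PORT_STAT_##stat##_L pastes the second argument into the
 * register symbol, so one macro use yields the OID, the register and the
 * description.  A minimal userland sketch of the same technique follows;
 * SHOW_REGSTAT, the REG_* values and read_reg64() are made-up stand-ins,
 * not driver API.
 */
#if 0
#include <stdio.h>

#define REG_TX_PORT_BYTES	0x100
#define REG_TX_PORT_FRAMES	0x108

static unsigned long long
read_reg64(unsigned int addr)
{
	return (addr * 2ULL);	/* stand-in for a hardware register read */
}

/* #name gives the label string, REG_##stat the register symbol. */
#define SHOW_REGSTAT(name, stat, desc)	\
	printf("%-12s %8llu  %s\n", #name, read_reg64(REG_##stat), desc)

int
main(void)
{
	SHOW_REGSTAT(tx_octets, TX_PORT_BYTES, "# of octets in good frames");
	SHOW_REGSTAT(tx_frames, TX_PORT_FRAMES, "total # of good frames");
	return (0);
}
#endif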
T4_REGSTAT(rx_ppp0, RX_PORT_PPP0, "# of PPP prio 0 frames received"); T4_REGSTAT(rx_ppp1, RX_PORT_PPP1, "# of PPP prio 1 frames received"); T4_REGSTAT(rx_ppp2, RX_PORT_PPP2, "# of PPP prio 2 frames received"); T4_REGSTAT(rx_ppp3, RX_PORT_PPP3, "# of PPP prio 3 frames received"); T4_REGSTAT(rx_ppp4, RX_PORT_PPP4, "# of PPP prio 4 frames received"); T4_REGSTAT(rx_ppp5, RX_PORT_PPP5, "# of PPP prio 5 frames received"); T4_REGSTAT(rx_ppp6, RX_PORT_PPP6, "# of PPP prio 6 frames received"); T4_REGSTAT(rx_ppp7, RX_PORT_PPP7, "# of PPP prio 7 frames received"); T4_PORTSTAT(rx_ovflow0, "# drops due to buffer-group 0 overflows"); T4_PORTSTAT(rx_ovflow1, "# drops due to buffer-group 1 overflows"); T4_PORTSTAT(rx_ovflow2, "# drops due to buffer-group 2 overflows"); T4_PORTSTAT(rx_ovflow3, "# drops due to buffer-group 3 overflows"); T4_PORTSTAT(rx_trunc0, "# of buffer-group 0 truncated packets"); T4_PORTSTAT(rx_trunc1, "# of buffer-group 1 truncated packets"); T4_PORTSTAT(rx_trunc2, "# of buffer-group 2 truncated packets"); T4_PORTSTAT(rx_trunc3, "# of buffer-group 3 truncated packets"); #undef T4_REGSTAT #undef T4_PORTSTAT } static int sysctl_int_array(SYSCTL_HANDLER_ARGS) { int rc, *i, space = 0; struct sbuf sb; sbuf_new_for_sysctl(&sb, NULL, 64, req); for (i = arg1; arg2; arg2 -= sizeof(int), i++) { if (space) sbuf_printf(&sb, " "); sbuf_printf(&sb, "%d", *i); space = 1; } rc = sbuf_finish(&sb); sbuf_delete(&sb); return (rc); } static int sysctl_bitfield_8b(SYSCTL_HANDLER_ARGS) { int rc; struct sbuf *sb; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "%b", *(uint8_t *)(uintptr_t)arg2, (char *)arg1); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_bitfield_16b(SYSCTL_HANDLER_ARGS) { int rc; struct sbuf *sb; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "%b", *(uint16_t *)(uintptr_t)arg2, (char *)arg1); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_btphy(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; int op = arg2; struct adapter *sc = pi->adapter; u_int v; int rc; rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4btt"); if (rc) return (rc); if (hw_off_limits(sc)) rc = ENXIO; else { /* XXX: magic numbers */ rc = -t4_mdio_rd(sc, sc->mbox, pi->mdio_addr, 0x1e, op ? 0x20 : 0xc820, &v); } end_synchronized_op(sc, 0); if (rc) return (rc); if (op == 0) v /= 256; rc = sysctl_handle_int(oidp, &v, 0, req); return (rc); } static int sysctl_noflowq(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; int rc, val; val = vi->rsrv_noflowq; rc = sysctl_handle_int(oidp, &val, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if ((val >= 1) && (vi->ntxq > 1)) vi->rsrv_noflowq = 1; else vi->rsrv_noflowq = 0; return (rc); } static int sysctl_tx_vm_wr(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->adapter; int rc, val, i; MPASS(!(sc->flags & IS_VF)); val = vi->flags & TX_USES_VM_WR ? 
1 : 0; rc = sysctl_handle_int(oidp, &val, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (val != 0 && val != 1) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4txvm"); if (rc) return (rc); if (hw_off_limits(sc)) rc = ENXIO; - else if (vi->ifp->if_drv_flags & IFF_DRV_RUNNING) { + else if (if_getdrvflags(vi->ifp) & IFF_DRV_RUNNING) { /* * We don't want parse_pkt to run with one setting (VF or PF) * and then eth_tx to see a different setting but still use * stale information calculated by parse_pkt. */ rc = EBUSY; } else { struct port_info *pi = vi->pi; struct sge_txq *txq; uint32_t ctrl0; uint8_t npkt = sc->params.max_pkts_per_eth_tx_pkts_wr; if (val) { vi->flags |= TX_USES_VM_WR; - vi->ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_VM_TSO; + if_sethwtsomaxsegcount(vi->ifp, TX_SGL_SEGS_VM_TSO); ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) | V_TXPKT_INTF(pi->tx_chan)); if (!(sc->flags & IS_VF)) npkt--; } else { vi->flags &= ~TX_USES_VM_WR; - vi->ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO; + if_sethwtsomaxsegcount(vi->ifp, TX_SGL_SEGS_TSO); ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) | V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld)); } for_each_txq(vi, i, txq) { txq->cpl_ctrl0 = ctrl0; txq->txp.max_npkt = npkt; } } end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->adapter; int idx, rc, i; struct sge_rxq *rxq; uint8_t v; idx = vi->tmr_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < 0 || idx >= SGE_NTIMERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4tmr"); if (rc) return (rc); v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(vi->pktc_idx != -1); for_each_rxq(vi, i, rxq) { #ifdef atomic_store_rel_8 atomic_store_rel_8(&rxq->iq.intr_params, v); #else rxq->iq.intr_params = v; #endif } vi->tmr_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (0); } static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->adapter; int idx, rc; idx = vi->pktc_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < -1 || idx >= SGE_NCOUNTERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4pktc"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->pktc_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->adapter; int qsize, rc; qsize = vi->qsize_rxq; rc = sysctl_handle_int(oidp, &qsize, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (qsize < 128 || (qsize & 7)) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4rxqs"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->qsize_rxq = qsize; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->adapter; int qsize, rc; qsize = vi->qsize_txq; rc = sysctl_handle_int(oidp, &qsize, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (qsize < 128 || qsize > 65536) return (EINVAL); rc = begin_synchronized_op(sc, vi, 
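/*
 * The +/- lines in this hunk convert direct struct ifnet field accesses to
 * the if_t accessor KPI (if_getdrvflags(), if_sethwtsomaxsegcount(), ...),
 * which lets the driver treat the ifnet as an opaque handle.  A sketch of
 * the before/after shape, using a hypothetical if_t variable ifp and an
 * int variable flags:
 */
#if 0
	/* old style: reach into struct ifnet directly */
	flags = ifp->if_drv_flags;
	ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_VM_TSO;

	/* new style: the same operations through the accessor KPI */
	flags = if_getdrvflags(ifp);
	if_sethwtsomaxsegcount(ifp, TX_SGL_SEGS_VM_TSO);
#endif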
HOLD_LOCK | SLEEP_OK | INTR_OK, "t4txqs"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->qsize_txq = qsize; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; if (req->newptr == NULL) { struct sbuf *sb; static char *bits = "\20\1RX\2TX\3AUTO"; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); if (lc->link_ok) { sbuf_printf(sb, "%b", (lc->fc & (PAUSE_TX | PAUSE_RX)) | (lc->requested_fc & PAUSE_AUTONEG), bits); } else { sbuf_printf(sb, "%b", lc->requested_fc & (PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG), bits); } rc = sbuf_finish(sb); sbuf_delete(sb); } else { char s[2]; int n; s[0] = '0' + (lc->requested_fc & (PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)); s[1] = 0; rc = sysctl_handle_string(oidp, s, sizeof(s), req); if (rc != 0) return(rc); if (s[1] != 0) return (EINVAL); if (s[0] < '0' || s[0] > '9') return (EINVAL); /* not a number */ n = s[0] - '0'; if (n & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)) return (EINVAL); /* some other bit is set too */ rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4PAUSE"); if (rc) return (rc); if (!hw_off_limits(sc)) { PORT_LOCK(pi); lc->requested_fc = n; fixup_link_config(pi); if (pi->up_vis > 0) rc = apply_link_config(pi); set_current_media(pi); PORT_UNLOCK(pi); } end_synchronized_op(sc, 0); } return (rc); } static int sysctl_link_fec(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct link_config *lc = &pi->link_cfg; int rc; struct sbuf *sb; static char *bits = "\20\1RS-FEC\2FC-FEC\3NO-FEC\4RSVD1\5RSVD2"; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); if (lc->link_ok) sbuf_printf(sb, "%b", lc->fec, bits); else sbuf_printf(sb, "no link"); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_requested_fec(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; int8_t old; if (req->newptr == NULL) { struct sbuf *sb; static char *bits = "\20\1RS-FEC\2FC-FEC\3NO-FEC\4RSVD2" "\5RSVD3\6auto\7module"; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "%b", lc->requested_fec, bits); rc = sbuf_finish(sb); sbuf_delete(sb); } else { char s[8]; int n; snprintf(s, sizeof(s), "%d", lc->requested_fec == FEC_AUTO ? 
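/*
 * The pause/FEC handlers above and the sysctl_bitfield_*() helpers format
 * flag words with the kernel's "%b" conversion.  In a bit-description
 * string such as "\20\1RX\2TX\3AUTO", the first byte is the output base
 * (\20 == 16) and each following control byte is a 1-based bit number
 * followed by that bit's label.  A simplified userland decoder sketch
 * (print_bits() is hypothetical, not the kernel implementation):
 */
#if 0
#include <stdio.h>

static void
print_bits(unsigned int val, const char *bits)
{
	unsigned char c;
	int any = 0;

	/* Simplified: treat base 16 as hex and anything else as octal. */
	printf((unsigned char)*bits++ == 16 ? "%x" : "%o", val);
	while ((c = (unsigned char)*bits++) != 0) {
		if (val & (1u << (c - 1))) {
			putchar(any ? ',' : '<');
			any = 1;
			while ((unsigned char)*bits > 32)
				putchar(*bits++);
		} else {
			while ((unsigned char)*bits > 32)
				bits++;
		}
	}
	if (any)
		putchar('>');
	putchar('\n');
}

int
main(void)
{
	/* prints "3<RX,TX>": value 3 in hex, bits 1 and 2 are set */
	print_bits(0x3, "\20\1RX\2TX\3AUTO");
	return (0);
}
#endif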
-1 : lc->requested_fec & (M_FW_PORT_CAP32_FEC | FEC_MODULE)); rc = sysctl_handle_string(oidp, s, sizeof(s), req); if (rc != 0) return(rc); n = strtol(&s[0], NULL, 0); if (n < 0 || n & FEC_AUTO) n = FEC_AUTO; else if (n & ~(M_FW_PORT_CAP32_FEC | FEC_MODULE)) return (EINVAL);/* some other bit is set too */ rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4reqf"); if (rc) return (rc); PORT_LOCK(pi); old = lc->requested_fec; if (n == FEC_AUTO) lc->requested_fec = FEC_AUTO; else if (n == 0 || n == FEC_NONE) lc->requested_fec = FEC_NONE; else { if ((lc->pcaps | V_FW_PORT_CAP32_FEC(n & M_FW_PORT_CAP32_FEC)) != lc->pcaps) { rc = ENOTSUP; goto done; } lc->requested_fec = n & (M_FW_PORT_CAP32_FEC | FEC_MODULE); } if (!hw_off_limits(sc)) { fixup_link_config(pi); if (pi->up_vis > 0) { rc = apply_link_config(pi); if (rc != 0) { lc->requested_fec = old; if (rc == FW_EPROTO) rc = ENOTSUP; } } } done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); } return (rc); } static int sysctl_module_fec(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; int8_t fec; struct sbuf *sb; static char *bits = "\20\1RS-FEC\2FC-FEC\3NO-FEC\4RSVD2\5RSVD3"; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mfec") != 0) { rc = EBUSY; goto done; } if (hw_off_limits(sc)) { rc = ENXIO; goto done; } PORT_LOCK(pi); if (pi->up_vis == 0) { /* * If all the interfaces are administratively down the firmware * does not report transceiver changes. Refresh port info here. * This is the only reason we have a synchronized op in this * function. Just PORT_LOCK would have been enough otherwise. */ t4_update_port_info(pi); } fec = lc->fec_hint; if (pi->mod_type == FW_PORT_MOD_TYPE_NONE || !fec_supported(lc->pcaps)) { sbuf_printf(sb, "n/a"); } else { if (fec == 0) fec = FEC_NONE; sbuf_printf(sb, "%b", fec & M_FW_PORT_CAP32_FEC, bits); } rc = sbuf_finish(sb); PORT_UNLOCK(pi); done: sbuf_delete(sb); end_synchronized_op(sc, 0); return (rc); } static int sysctl_autoneg(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc, val; if (lc->pcaps & FW_PORT_CAP32_ANEG) val = lc->requested_aneg == AUTONEG_DISABLE ? 
0 : 1; else val = -1; rc = sysctl_handle_int(oidp, &val, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (val == 0) val = AUTONEG_DISABLE; else if (val == 1) val = AUTONEG_ENABLE; else val = AUTONEG_AUTO; rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4aneg"); if (rc) return (rc); PORT_LOCK(pi); if (val == AUTONEG_ENABLE && !(lc->pcaps & FW_PORT_CAP32_ANEG)) { rc = ENOTSUP; goto done; } lc->requested_aneg = val; if (!hw_off_limits(sc)) { fixup_link_config(pi); if (pi->up_vis > 0) rc = apply_link_config(pi); set_current_media(pi); } done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); return (rc); } static int sysctl_force_fec(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc, val; val = lc->force_fec; MPASS(val >= -1 && val <= 1); rc = sysctl_handle_int(oidp, &val, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (!(lc->pcaps & FW_PORT_CAP32_FORCE_FEC)) return (ENOTSUP); if (val < -1 || val > 1) return (EINVAL); rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4ff"); if (rc) return (rc); PORT_LOCK(pi); lc->force_fec = val; if (!hw_off_limits(sc)) { fixup_link_config(pi); if (pi->up_vis > 0) rc = apply_link_config(pi); } PORT_UNLOCK(pi); end_synchronized_op(sc, 0); return (rc); } static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc, reg = arg2; uint64_t val; mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { rc = 0; val = t4_read_reg64(sc, reg); } mtx_unlock(&sc->reg_lock); if (rc == 0) rc = sysctl_handle_64(oidp, &val, 0, req); return (rc); } static int sysctl_temperature(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc, t; uint32_t param, val; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4temp"); if (rc) return (rc); if (hw_off_limits(sc)) rc = ENXIO; else { param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) | V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_TMP); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val); } end_synchronized_op(sc, 0); if (rc) return (rc); /* unknown is returned as 0 but we display -1 in that case */ t = val == 0 ?
-1 : val; rc = sysctl_handle_int(oidp, &t, 0, req); return (rc); } static int sysctl_vdd(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc; uint32_t param, val; if (sc->params.core_vdd == 0) { rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vdd"); if (rc) return (rc); if (hw_off_limits(sc)) rc = ENXIO; else { param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) | V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_VDD); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val); } end_synchronized_op(sc, 0); if (rc) return (rc); sc->params.core_vdd = val; } return (sysctl_handle_int(oidp, &sc->params.core_vdd, 0, req)); } static int sysctl_reset_sensor(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc, v; uint32_t param, val; v = sc->sensor_resets; rc = sysctl_handle_int(oidp, &v, 0, req); if (rc != 0 || req->newptr == NULL || v <= 0) return (rc); if (sc->params.fw_vers < FW_VERSION32(1, 24, 7, 0) || chip_id(sc) < CHELSIO_T5) return (ENOTSUP); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4srst"); if (rc) return (rc); if (hw_off_limits(sc)) rc = ENXIO; else { param = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) | V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_RESET_TMP_SENSOR)); val = 1; rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val); } end_synchronized_op(sc, 0); if (rc == 0) sc->sensor_resets++; return (rc); } static int sysctl_loadavg(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; uint32_t param, val; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4lavg"); if (rc) return (rc); if (hw_off_limits(sc)) rc = ENXIO; else { param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_LOAD); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val); } end_synchronized_op(sc, 0); if (rc) return (rc); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); if (val == 0xffffffff) { /* Only debug and custom firmwares report load averages.
*/ sbuf_printf(sb, "not available"); } else { sbuf_printf(sb, "%d %d %d", val & 0xff, (val >> 8) & 0xff, (val >> 16) & 0xff); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_cctrl(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint16_t incr[NMTUS][NCCTRL_WIN]; static const char *dec_fac[] = { "0.5", "0.5625", "0.625", "0.6875", "0.75", "0.8125", "0.875", "0.9375" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_read_cong_tbl(sc, incr); mtx_unlock(&sc->reg_lock); if (rc) goto done; for (i = 0; i < NCCTRL_WIN; ++i) { sbuf_printf(sb, "%2d: %4u %4u %4u %4u %4u %4u %4u %4u\n", i, incr[0][i], incr[1][i], incr[2][i], incr[3][i], incr[4][i], incr[5][i], incr[6][i], incr[7][i]); sbuf_printf(sb, "%8u %4u %4u %4u %4u %4u %4u %4u %5u %s\n", incr[8][i], incr[9][i], incr[10][i], incr[11][i], incr[12][i], incr[13][i], incr[14][i], incr[15][i], sc->params.a_wnd[i], dec_fac[sc->params.b_wnd[i]]); } rc = sbuf_finish(sb); done: sbuf_delete(sb); return (rc); } static const char *qname[CIM_NUM_IBQ + CIM_NUM_OBQ_T5] = { "TP0", "TP1", "ULP", "SGE0", "SGE1", "NC-SI", /* ibq's */ "ULP0", "ULP1", "ULP2", "ULP3", "SGE", "NC-SI", /* obq's */ "SGE0-RX", "SGE1-RX" /* additional obq's (T5 onwards) */ }; static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n, qid = arg2; uint32_t *buf, *p; char *qtype; u_int cim_num_obq = sc->chip_params->cim_num_obq; KASSERT(qid >= 0 && qid < CIM_NUM_IBQ + cim_num_obq, ("%s: bad qid %d\n", __func__, qid)); if (qid < CIM_NUM_IBQ) { /* inbound queue */ qtype = "IBQ"; n = 4 * CIM_IBQ_SIZE; buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = -ENXIO; else rc = t4_read_cim_ibq(sc, qid, buf, n); mtx_unlock(&sc->reg_lock); } else { /* outbound queue */ qtype = "OBQ"; qid -= CIM_NUM_IBQ; n = 4 * cim_num_obq * CIM_OBQ_SIZE; buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = -ENXIO; else rc = t4_read_cim_obq(sc, qid, buf, n); mtx_unlock(&sc->reg_lock); } if (rc < 0) { rc = -rc; goto done; } n = rc * sizeof(uint32_t); /* rc has # of words actually read */ rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) goto done; sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req); if (sb == NULL) { rc = ENOMEM; goto done; } sbuf_printf(sb, "%s%d %s", qtype , qid, qname[arg2]); for (i = 0, p = buf; i < n; i += 16, p += 4) sbuf_printf(sb, "\n%#06x: %08x %08x %08x %08x", i, p[0], p[1], p[2], p[3]); rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static void sbuf_cim_la4(struct adapter *sc, struct sbuf *sb, uint32_t *buf, uint32_t cfg) { uint32_t *p; sbuf_printf(sb, "Status Data PC%s", cfg & F_UPDBGLACAPTPCONLY ? 
"" : " LS0Stat LS0Addr LS0Data"); for (p = buf; p <= &buf[sc->params.cim_la_size - 8]; p += 8) { if (cfg & F_UPDBGLACAPTPCONLY) { sbuf_printf(sb, "\n %02x %08x %08x", p[5] & 0xff, p[6], p[7]); sbuf_printf(sb, "\n %02x %02x%06x %02x%06x", (p[3] >> 8) & 0xff, p[3] & 0xff, p[4] >> 8, p[4] & 0xff, p[5] >> 8); sbuf_printf(sb, "\n %02x %x%07x %x%07x", (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4, p[1] & 0xf, p[2] >> 4); } else { sbuf_printf(sb, "\n %02x %x%07x %x%07x %08x %08x " "%08x%08x%08x%08x", (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4, p[1] & 0xf, p[2] >> 4, p[2] & 0xf, p[3], p[4], p[5], p[6], p[7]); } } } static void sbuf_cim_la6(struct adapter *sc, struct sbuf *sb, uint32_t *buf, uint32_t cfg) { uint32_t *p; sbuf_printf(sb, "Status Inst Data PC%s", cfg & F_UPDBGLACAPTPCONLY ? "" : " LS0Stat LS0Addr LS0Data LS1Stat LS1Addr LS1Data"); for (p = buf; p <= &buf[sc->params.cim_la_size - 10]; p += 10) { if (cfg & F_UPDBGLACAPTPCONLY) { sbuf_printf(sb, "\n %02x %08x %08x %08x", p[3] & 0xff, p[2], p[1], p[0]); sbuf_printf(sb, "\n %02x %02x%06x %02x%06x %02x%06x", (p[6] >> 8) & 0xff, p[6] & 0xff, p[5] >> 8, p[5] & 0xff, p[4] >> 8, p[4] & 0xff, p[3] >> 8); sbuf_printf(sb, "\n %02x %04x%04x %04x%04x %04x%04x", (p[9] >> 16) & 0xff, p[9] & 0xffff, p[8] >> 16, p[8] & 0xffff, p[7] >> 16, p[7] & 0xffff, p[6] >> 16); } else { sbuf_printf(sb, "\n %02x %04x%04x %04x%04x %04x%04x " "%08x %08x %08x %08x %08x %08x", (p[9] >> 16) & 0xff, p[9] & 0xffff, p[8] >> 16, p[8] & 0xffff, p[7] >> 16, p[7] & 0xffff, p[6] >> 16, p[2], p[1], p[0], p[5], p[4], p[3]); } } } static int sbuf_cim_la(struct adapter *sc, struct sbuf *sb, int flags) { uint32_t cfg, *buf; int rc; MPASS(flags == M_WAITOK || flags == M_NOWAIT); buf = malloc(sc->params.cim_la_size * sizeof(uint32_t), M_CXGBE, M_ZERO | flags); if (buf == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { rc = -t4_cim_read(sc, A_UP_UP_DBG_LA_CFG, 1, &cfg); if (rc == 0) rc = -t4_cim_read_la(sc, buf, NULL); } mtx_unlock(&sc->reg_lock); if (rc == 0) { if (chip_id(sc) < CHELSIO_T6) sbuf_cim_la4(sc, sb, buf, cfg); else sbuf_cim_la6(sc, sb, buf, cfg); } free(buf, M_CXGBE); return (rc); } static int sysctl_cim_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); rc = sbuf_cim_la(sc, sb, M_WAITOK); if (rc == 0) rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static void dump_cim_regs(struct adapter *sc) { log(LOG_DEBUG, "%s: CIM debug regs1 %08x %08x %08x %08x %08x\n", device_get_nameunit(sc->dev), t4_read_reg(sc, A_EDC_H_BIST_USER_WDATA0), t4_read_reg(sc, A_EDC_H_BIST_USER_WDATA1), t4_read_reg(sc, A_EDC_H_BIST_USER_WDATA2), t4_read_reg(sc, A_EDC_H_BIST_DATA_PATTERN), t4_read_reg(sc, A_EDC_H_BIST_STATUS_RDATA)); log(LOG_DEBUG, "%s: CIM debug regs2 %08x %08x %08x %08x %08x\n", device_get_nameunit(sc->dev), t4_read_reg(sc, A_EDC_H_BIST_USER_WDATA0), t4_read_reg(sc, A_EDC_H_BIST_USER_WDATA1), t4_read_reg(sc, A_EDC_H_BIST_USER_WDATA0 + 0x800), t4_read_reg(sc, A_EDC_H_BIST_USER_WDATA1 + 0x800), t4_read_reg(sc, A_EDC_H_BIST_CMD_LEN)); } static void dump_cimla(struct adapter *sc) { struct sbuf sb; int rc; if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb) { log(LOG_DEBUG, "%s: failed to generate CIM LA dump.\n", device_get_nameunit(sc->dev)); return; } rc = sbuf_cim_la(sc, &sb, M_WAITOK); if (rc == 0) { rc = sbuf_finish(&sb); if (rc == 0) { log(LOG_DEBUG, "%s: CIM LA dump 
follows.\n%s\n", device_get_nameunit(sc->dev), sbuf_data(&sb)); } } sbuf_delete(&sb); } void t4_os_cim_err(struct adapter *sc) { atomic_set_int(&sc->error_flags, ADAP_CIM_ERR); } static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int i; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(2 * CIM_MALA_SIZE * 5 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_cim_read_ma_la(sc, buf, buf + 5 * CIM_MALA_SIZE); mtx_unlock(&sc->reg_lock); if (rc) goto done; p = buf; for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) { sbuf_printf(sb, "\n%02x%08x%08x%08x%08x", p[4], p[3], p[2], p[1], p[0]); } sbuf_printf(sb, "\n\nCnt ID Tag UE Data RDY VLD"); for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) { sbuf_printf(sb, "\n%3u %2u %x %u %08x%08x %u %u", (p[2] >> 10) & 0xff, (p[2] >> 7) & 7, (p[2] >> 3) & 0xf, (p[2] >> 2) & 1, (p[1] >> 2) | ((p[2] & 3) << 30), (p[0] >> 2) | ((p[1] & 3) << 30), (p[0] >> 1) & 1, p[0] & 1); } rc = sbuf_finish(sb); done: sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int i; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(2 * CIM_PIFLA_SIZE * 6 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_cim_read_pif_la(sc, buf, buf + 6 * CIM_PIFLA_SIZE, NULL, NULL); mtx_unlock(&sc->reg_lock); if (rc) goto done; p = buf; sbuf_printf(sb, "Cntl ID DataBE Addr Data"); for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) { sbuf_printf(sb, "\n %02x %02x %04x %08x %08x%08x%08x%08x", (p[5] >> 22) & 0xff, (p[5] >> 16) & 0x3f, p[5] & 0xffff, p[4], p[3], p[2], p[1], p[0]); } sbuf_printf(sb, "\n\nCntl ID Data"); for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) { sbuf_printf(sb, "\n %02x %02x %08x%08x%08x%08x", (p[4] >> 6) & 0xff, p[4] & 0x3f, p[3], p[2], p[1], p[0]); } rc = sbuf_finish(sb); done: sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint16_t base[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; uint16_t size[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; uint16_t thres[CIM_NUM_IBQ]; uint32_t obq_wr[2 * CIM_NUM_OBQ_T5], *wr = obq_wr; uint32_t stat[4 * (CIM_NUM_IBQ + CIM_NUM_OBQ_T5)], *p = stat; u_int cim_num_obq, ibq_rdaddr, obq_rdaddr, nq; cim_num_obq = sc->chip_params->cim_num_obq; if (is_t4(sc)) { ibq_rdaddr = A_UP_IBQ_0_RDADDR; obq_rdaddr = A_UP_OBQ_0_REALADDR; } else { ibq_rdaddr = A_UP_IBQ_0_SHADOW_RDADDR; obq_rdaddr = A_UP_OBQ_0_SHADOW_REALADDR; } nq = CIM_NUM_IBQ + cim_num_obq; mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { rc = -t4_cim_read(sc, ibq_rdaddr, 4 * nq, stat); if (rc == 0) { rc = -t4_cim_read(sc, obq_rdaddr, 2 * cim_num_obq, obq_wr); if (rc == 0) t4_read_cimq_cfg(sc, base, size, thres); } } mtx_unlock(&sc->reg_lock); if (rc) return (rc); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, " Queue Base Size Thres RdPtr WrPtr SOP EOP Avail"); for (i = 0; i < CIM_NUM_IBQ; i++, p += 4) sbuf_printf(sb, "\n%7s %5x %5u %5u %6x %4x %4u %4u %5u", qname[i], 
base[i], size[i], thres[i], G_IBQRDADDR(p[0]), G_IBQWRADDR(p[1]), G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]), G_QUEREMFLITS(p[2]) * 16); for ( ; i < nq; i++, p += 4, wr += 2) sbuf_printf(sb, "\n%7s %5x %5u %12x %4x %4u %4u %5u", qname[i], base[i], size[i], G_QUERDADDR(p[0]) & 0x3fff, wr[0] - base[i], G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]), G_QUEREMFLITS(p[2]) * 16); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_cpl_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_tp_get_cpl_stats(sc, &stats, 0); mtx_unlock(&sc->reg_lock); if (rc) goto done; if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3"); sbuf_printf(sb, "\nCPL requests: %10u %10u %10u %10u", stats.req[0], stats.req[1], stats.req[2], stats.req[3]); sbuf_printf(sb, "\nCPL responses: %10u %10u %10u %10u", stats.rsp[0], stats.rsp[1], stats.rsp[2], stats.rsp[3]); } else { sbuf_printf(sb, " channel 0 channel 1"); sbuf_printf(sb, "\nCPL requests: %10u %10u", stats.req[0], stats.req[1]); sbuf_printf(sb, "\nCPL responses: %10u %10u", stats.rsp[0], stats.rsp[1]); } rc = sbuf_finish(sb); done: sbuf_delete(sb); return (rc); } static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_usm_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_get_usm_stats(sc, &stats, 1); mtx_unlock(&sc->reg_lock); if (rc == 0) { sbuf_printf(sb, "Frames: %u\n", stats.frames); sbuf_printf(sb, "Octets: %ju\n", stats.octets); sbuf_printf(sb, "Drops: %u", stats.drops); rc = sbuf_finish(sb); } sbuf_delete(sb); return (rc); } static int sysctl_tid_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_tid_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_tp_get_tid_stats(sc, &stats, 1); mtx_unlock(&sc->reg_lock); if (rc == 0) { sbuf_printf(sb, "Delete: %u\n", stats.del); sbuf_printf(sb, "Invalidate: %u\n", stats.inv); sbuf_printf(sb, "Active: %u\n", stats.act); sbuf_printf(sb, "Passive: %u", stats.pas); rc = sbuf_finish(sb); } sbuf_delete(sb); return (rc); } static const char * const devlog_level_strings[] = { [FW_DEVLOG_LEVEL_EMERG] = "EMERG", [FW_DEVLOG_LEVEL_CRIT] = "CRIT", [FW_DEVLOG_LEVEL_ERR] = "ERR", [FW_DEVLOG_LEVEL_NOTICE] = "NOTICE", [FW_DEVLOG_LEVEL_INFO] = "INFO", [FW_DEVLOG_LEVEL_DEBUG] = "DEBUG" }; static const char * const devlog_facility_strings[] = { [FW_DEVLOG_FACILITY_CORE] = "CORE", [FW_DEVLOG_FACILITY_CF] = "CF", [FW_DEVLOG_FACILITY_SCHED] = "SCHED", [FW_DEVLOG_FACILITY_TIMER] = "TIMER", [FW_DEVLOG_FACILITY_RES] = "RES", [FW_DEVLOG_FACILITY_HW] = "HW", [FW_DEVLOG_FACILITY_FLR] = "FLR", [FW_DEVLOG_FACILITY_DMAQ] = "DMAQ", [FW_DEVLOG_FACILITY_PHY] = "PHY", [FW_DEVLOG_FACILITY_MAC] = "MAC", [FW_DEVLOG_FACILITY_PORT] = "PORT", [FW_DEVLOG_FACILITY_VI] = "VI", [FW_DEVLOG_FACILITY_FILTER] = "FILTER", [FW_DEVLOG_FACILITY_ACL] = "ACL", [FW_DEVLOG_FACILITY_TM] = "TM", [FW_DEVLOG_FACILITY_QFC] = "QFC", 
[FW_DEVLOG_FACILITY_DCB] = "DCB", [FW_DEVLOG_FACILITY_ETH] = "ETH", [FW_DEVLOG_FACILITY_OFLD] = "OFLD", [FW_DEVLOG_FACILITY_RI] = "RI", [FW_DEVLOG_FACILITY_ISCSI] = "ISCSI", [FW_DEVLOG_FACILITY_FCOE] = "FCOE", [FW_DEVLOG_FACILITY_FOISCSI] = "FOISCSI", [FW_DEVLOG_FACILITY_FOFCOE] = "FOFCOE", [FW_DEVLOG_FACILITY_CHNET] = "CHNET", }; static int sbuf_devlog(struct adapter *sc, struct sbuf *sb, int flags) { int i, j, rc, nentries, first = 0; struct devlog_params *dparams = &sc->params.devlog; struct fw_devlog_e *buf, *e; uint64_t ftstamp = UINT64_MAX; if (dparams->addr == 0) return (ENXIO); MPASS(flags == M_WAITOK || flags == M_NOWAIT); buf = malloc(dparams->size, M_CXGBE, M_ZERO | flags); if (buf == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else rc = read_via_memwin(sc, 1, dparams->addr, (void *)buf, dparams->size); mtx_unlock(&sc->reg_lock); if (rc != 0) goto done; nentries = dparams->size / sizeof(struct fw_devlog_e); for (i = 0; i < nentries; i++) { e = &buf[i]; if (e->timestamp == 0) break; /* end */ e->timestamp = be64toh(e->timestamp); e->seqno = be32toh(e->seqno); for (j = 0; j < 8; j++) e->params[j] = be32toh(e->params[j]); if (e->timestamp < ftstamp) { ftstamp = e->timestamp; first = i; } } if (buf[first].timestamp == 0) goto done; /* nothing in the log */ sbuf_printf(sb, "%10s %15s %8s %8s %s\n", "Seq#", "Tstamp", "Level", "Facility", "Message"); i = first; do { e = &buf[i]; if (e->timestamp == 0) break; /* end */ sbuf_printf(sb, "%10d %15ju %8s %8s ", e->seqno, e->timestamp, (e->level < nitems(devlog_level_strings) ? devlog_level_strings[e->level] : "UNKNOWN"), (e->facility < nitems(devlog_facility_strings) ? devlog_facility_strings[e->facility] : "UNKNOWN")); sbuf_printf(sb, e->fmt, e->params[0], e->params[1], e->params[2], e->params[3], e->params[4], e->params[5], e->params[6], e->params[7]); if (++i == nentries) i = 0; } while (i != first); done: free(buf, M_CXGBE); return (rc); } static int sysctl_devlog(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc; struct sbuf *sb; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); rc = sbuf_devlog(sc, sb, M_WAITOK); if (rc == 0) rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static void dump_devlog(struct adapter *sc) { int rc; struct sbuf sb; if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb) { log(LOG_DEBUG, "%s: failed to generate devlog dump.\n", device_get_nameunit(sc->dev)); return; } rc = sbuf_devlog(sc, &sb, M_WAITOK); if (rc == 0) { rc = sbuf_finish(&sb); if (rc == 0) { log(LOG_DEBUG, "%s: device log follows.\n%s", device_get_nameunit(sc->dev), sbuf_data(&sb)); } } sbuf_delete(&sb); } static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_fcoe_stats stats[MAX_NCHAN]; int i, nchan = sc->chip_params->nchan; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { for (i = 0; i < nchan; i++) t4_get_fcoe_stats(sc, i, &stats[i], 1); } mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); if (nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3"); sbuf_printf(sb, "\noctetsDDP: %16ju %16ju %16ju %16ju", stats[0].octets_ddp, stats[1].octets_ddp, stats[2].octets_ddp, stats[3].octets_ddp); sbuf_printf(sb, "\nframesDDP: %16u %16u %16u %16u", stats[0].frames_ddp, 
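/*
 * sbuf_devlog() above treats the firmware device log as a ring: it scans
 * for the entry with the smallest non-zero timestamp (the oldest), then
 * prints forward from there, wrapping modulo the number of entries until
 * it is back at the start; a zero timestamp marks the unused tail of a
 * log that has not wrapped yet.  A userland sketch of that traversal with
 * a made-up entry type:
 */
#if 0
#include <stdio.h>
#include <stdint.h>

struct entry {
	uint64_t ts;		/* 0 marks an unused slot */
	const char *msg;
};

static void
dump_ring(const struct entry *e, int n)
{
	uint64_t oldest = UINT64_MAX;
	int i, first = 0;

	for (i = 0; i < n; i++) {
		if (e[i].ts == 0)
			break;		/* unused tail, log never wrapped */
		if (e[i].ts < oldest) {
			oldest = e[i].ts;
			first = i;
		}
	}
	if (e[first].ts == 0)
		return;			/* nothing logged yet */
	i = first;
	do {
		if (e[i].ts == 0)
			break;
		printf("%ju %s\n", (uintmax_t)e[i].ts, e[i].msg);
		if (++i == n)
			i = 0;		/* wrap around */
	} while (i != first);
}

int
main(void)
{
	struct entry ring[] = {
		{ 30, "third" }, { 40, "fourth" }, { 10, "first" },
		{ 20, "second" },
	};

	dump_ring(ring, 4);	/* prints first..fourth, oldest first */
	return (0);
}
#endif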
stats[1].frames_ddp, stats[2].frames_ddp, stats[3].frames_ddp); sbuf_printf(sb, "\nframesDrop: %16u %16u %16u %16u", stats[0].frames_drop, stats[1].frames_drop, stats[2].frames_drop, stats[3].frames_drop); } else { sbuf_printf(sb, " channel 0 channel 1"); sbuf_printf(sb, "\noctetsDDP: %16ju %16ju", stats[0].octets_ddp, stats[1].octets_ddp); sbuf_printf(sb, "\nframesDDP: %16u %16u", stats[0].frames_ddp, stats[1].frames_ddp); sbuf_printf(sb, "\nframesDrop: %16u %16u", stats[0].frames_drop, stats[1].frames_drop); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; unsigned int map, kbps, ipg, mode; unsigned int pace_tab[NTX_SCHED]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 512, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } map = t4_read_reg(sc, A_TP_TX_MOD_QUEUE_REQ_MAP); mode = G_TIMERMODE(t4_read_reg(sc, A_TP_MOD_CONFIG)); t4_read_pace_tbl(sc, pace_tab); sbuf_printf(sb, "Scheduler Mode Channel Rate (Kbps) " "Class IPG (0.1 ns) Flow IPG (us)"); for (i = 0; i < NTX_SCHED; ++i, map >>= 2) { t4_get_tx_sched(sc, i, &kbps, &ipg, 1); sbuf_printf(sb, "\n %u %-5s %u ", i, (mode & (1 << i)) ? "flow" : "class", map & 3); if (kbps) sbuf_printf(sb, "%9u ", kbps); else sbuf_printf(sb, " disabled "); if (ipg) sbuf_printf(sb, "%13u ", ipg); else sbuf_printf(sb, " disabled "); if (pace_tab[i]) sbuf_printf(sb, "%10u", pace_tab[i]); else sbuf_printf(sb, " disabled"); } rc = sbuf_finish(sb); done: mtx_unlock(&sc->reg_lock); sbuf_delete(sb); return (rc); } static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, j; uint64_t *p0, *p1; struct lb_port_stats s[2]; static const char *stat_name[] = { "OctetsOK:", "FramesOK:", "BcastFrames:", "McastFrames:", "UcastFrames:", "ErrorFrames:", "Frames64:", "Frames65To127:", "Frames128To255:", "Frames256To511:", "Frames512To1023:", "Frames1024To1518:", "Frames1519ToMax:", "FramesDropped:", "BG0FramesDropped:", "BG1FramesDropped:", "BG2FramesDropped:", "BG3FramesDropped:", "BG0FramesTrunc:", "BG1FramesTrunc:", "BG2FramesTrunc:", "BG3FramesTrunc:" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); memset(s, 0, sizeof(s)); for (i = 0; i < sc->chip_params->nchan; i += 2) { mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { t4_get_lb_stats(sc, i, &s[0]); t4_get_lb_stats(sc, i + 1, &s[1]); } mtx_unlock(&sc->reg_lock); if (rc != 0) break; p0 = &s[0].octets; p1 = &s[1].octets; sbuf_printf(sb, "%s Loopback %u" " Loopback %u", i == 0 ? 
"" : "\n", i, i + 1); for (j = 0; j < nitems(stat_name); j++) sbuf_printf(sb, "\n%-17s %20ju %20ju", stat_name[j], *p0++, *p1++); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS) { int rc = 0; struct port_info *pi = arg1; struct link_config *lc = &pi->link_cfg; struct sbuf *sb; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 64, req); if (sb == NULL) return (ENOMEM); if (lc->link_ok || lc->link_down_rc == 255) sbuf_printf(sb, "n/a"); else sbuf_printf(sb, "%s", t4_link_down_rc_str(lc->link_down_rc)); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } struct mem_desc { u_int base; u_int limit; u_int idx; }; static int mem_desc_cmp(const void *a, const void *b) { const u_int v1 = ((const struct mem_desc *)a)->base; const u_int v2 = ((const struct mem_desc *)b)->base; if (v1 < v2) return (-1); else if (v1 > v2) return (1); return (0); } static void mem_region_show(struct sbuf *sb, const char *name, unsigned int from, unsigned int to) { unsigned int size; if (from == to) return; size = to - from + 1; if (size == 0) return; /* XXX: need humanize_number(3) in libkern for a more readable 'size' */ sbuf_printf(sb, "%-15s %#x-%#x [%u]\n", name, from, to, size); } static int sysctl_meminfo(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n; uint32_t lo, hi, used, free, alloc; static const char *memory[] = { "EDC0:", "EDC1:", "MC:", "MC0:", "MC1:", "HMA:" }; static const char *region[] = { "DBQ contexts:", "IMSG contexts:", "FLM cache:", "TCBs:", "Pstructs:", "Timers:", "Rx FL:", "Tx FL:", "Pstruct FL:", "Tx payload:", "Rx payload:", "LE hash:", "iSCSI region:", "TDDP region:", "TPT region:", "STAG region:", "RQ region:", "RQUDP region:", "PBL region:", "TXPBL region:", "TLSKey region:", "DBVFIFO region:", "ULPRX state:", "ULPTX state:", "On-chip queues:", }; struct mem_desc avail[4]; struct mem_desc mem[nitems(region) + 3]; /* up to 3 holes */ struct mem_desc *md = mem; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); for (i = 0; i < nitems(mem); i++) { mem[i].limit = 0; mem[i].idx = i; } mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } /* Find and sort the populated memory ranges */ i = 0; lo = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); if (lo & F_EDRAM0_ENABLE) { hi = t4_read_reg(sc, A_MA_EDRAM0_BAR); avail[i].base = G_EDRAM0_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EDRAM0_SIZE(hi) << 20); avail[i].idx = 0; i++; } if (lo & F_EDRAM1_ENABLE) { hi = t4_read_reg(sc, A_MA_EDRAM1_BAR); avail[i].base = G_EDRAM1_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EDRAM1_SIZE(hi) << 20); avail[i].idx = 1; i++; } if (lo & F_EXT_MEM_ENABLE) { hi = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); avail[i].base = G_EXT_MEM_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EXT_MEM_SIZE(hi) << 20); avail[i].idx = is_t5(sc) ? 
3 : 2; /* Call it MC0 for T5 */ i++; } if (is_t5(sc) && lo & F_EXT_MEM1_ENABLE) { hi = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); avail[i].base = G_EXT_MEM1_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EXT_MEM1_SIZE(hi) << 20); avail[i].idx = 4; i++; } if (is_t6(sc) && lo & F_HMA_MUX) { hi = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); avail[i].base = G_EXT_MEM1_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EXT_MEM1_SIZE(hi) << 20); avail[i].idx = 5; i++; } MPASS(i <= nitems(avail)); if (!i) /* no memory available */ goto done; qsort(avail, i, sizeof(struct mem_desc), mem_desc_cmp); (md++)->base = t4_read_reg(sc, A_SGE_DBQ_CTXT_BADDR); (md++)->base = t4_read_reg(sc, A_SGE_IMSG_CTXT_BADDR); (md++)->base = t4_read_reg(sc, A_SGE_FLM_CACHE_BADDR); (md++)->base = t4_read_reg(sc, A_TP_CMM_TCB_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_TIMER_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_RX_FLST_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_TX_FLST_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_PS_FLST_BASE); /* the next few have explicit upper bounds */ md->base = t4_read_reg(sc, A_TP_PMM_TX_BASE); md->limit = md->base - 1 + t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE) * G_PMTXMAXPAGE(t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE)); md++; md->base = t4_read_reg(sc, A_TP_PMM_RX_BASE); md->limit = md->base - 1 + t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) * G_PMRXMAXPAGE(t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE)); md++; if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) { if (chip_id(sc) <= CHELSIO_T5) md->base = t4_read_reg(sc, A_LE_DB_HASH_TID_BASE); else md->base = t4_read_reg(sc, A_LE_DB_HASH_TBL_BASE_ADDR); md->limit = 0; } else { md->base = 0; md->idx = nitems(region); /* hide it */ } md++; #define ulp_region(reg) \ md->base = t4_read_reg(sc, A_ULP_ ## reg ## _LLIMIT);\ (md++)->limit = t4_read_reg(sc, A_ULP_ ## reg ## _ULIMIT) ulp_region(RX_ISCSI); ulp_region(RX_TDDP); ulp_region(TX_TPT); ulp_region(RX_STAG); ulp_region(RX_RQ); ulp_region(RX_RQUDP); ulp_region(RX_PBL); ulp_region(TX_PBL); if (sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS) { ulp_region(RX_TLS_KEY); } #undef ulp_region md->base = 0; if (is_t4(sc)) md->idx = nitems(region); else { uint32_t size = 0; uint32_t sge_ctrl = t4_read_reg(sc, A_SGE_CONTROL2); uint32_t fifo_size = t4_read_reg(sc, A_SGE_DBVFIFO_SIZE); if (is_t5(sc)) { if (sge_ctrl & F_VFIFO_ENABLE) size = fifo_size << 2; } else size = G_T6_DBVFIFO_SIZE(fifo_size) << 6; if (size) { md->base = t4_read_reg(sc, A_SGE_DBVFIFO_BADDR); md->limit = md->base + size - 1; } else md->idx = nitems(region); } md++; md->base = t4_read_reg(sc, A_ULP_RX_CTX_BASE); md->limit = 0; md++; md->base = t4_read_reg(sc, A_ULP_TX_ERR_TABLE_BASE); md->limit = 0; md++; md->base = sc->vres.ocq.start; if (sc->vres.ocq.size) md->limit = md->base + sc->vres.ocq.size - 1; else md->idx = nitems(region); /* hide it */ md++; /* add any address-space holes, there can be up to 3 */ for (n = 0; n < i - 1; n++) if (avail[n].limit < avail[n + 1].base) (md++)->base = avail[n].limit; if (avail[n].limit) (md++)->base = avail[n].limit; n = md - mem; MPASS(n <= nitems(mem)); qsort(mem, n, sizeof(struct mem_desc), mem_desc_cmp); for (lo = 0; lo < i; lo++) mem_region_show(sb, memory[avail[lo].idx], avail[lo].base, avail[lo].limit - 1); sbuf_printf(sb, "\n"); for (i = 0; i < n; i++) { if (mem[i].idx >= nitems(region)) continue; /* skip holes */ if (!mem[i].limit) mem[i].limit = i < n - 1 ? 
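/*
 * sysctl_meminfo() above gathers the populated memory ranges, sorts them
 * by base address with qsort()/mem_desc_cmp(), and records a hole wherever
 * one range's limit falls short of the next range's base.  A userland
 * sketch of that sort-then-find-gaps step; struct range and the sample
 * addresses are made up:
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

struct range {
	unsigned int base;
	unsigned int limit;	/* exclusive, i.e. base + size */
};

static int
range_cmp(const void *a, const void *b)
{
	unsigned int v1 = ((const struct range *)a)->base;
	unsigned int v2 = ((const struct range *)b)->base;

	return (v1 < v2 ? -1 : v1 > v2 ? 1 : 0);
}

int
main(void)
{
	struct range r[] = {
		{ 0x40000000, 0x48000000 },
		{ 0x00000000, 0x08000000 },
		{ 0x20000000, 0x28000000 },
	};
	int i, n = sizeof(r) / sizeof(r[0]);

	qsort(r, n, sizeof(r[0]), range_cmp);
	for (i = 0; i < n; i++) {
		printf("region %#x-%#x\n", r[i].base, r[i].limit - 1);
		if (i < n - 1 && r[i].limit < r[i + 1].base)
			printf("  hole  %#x-%#x\n", r[i].limit,
			    r[i + 1].base - 1);
	}
	return (0);
}
#endif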
mem[i + 1].base - 1 : ~0; mem_region_show(sb, region[mem[i].idx], mem[i].base, mem[i].limit); } sbuf_printf(sb, "\n"); lo = t4_read_reg(sc, A_CIM_SDRAM_BASE_ADDR); hi = t4_read_reg(sc, A_CIM_SDRAM_ADDR_SIZE) + lo - 1; mem_region_show(sb, "uP RAM:", lo, hi); lo = t4_read_reg(sc, A_CIM_EXTMEM2_BASE_ADDR); hi = t4_read_reg(sc, A_CIM_EXTMEM2_ADDR_SIZE) + lo - 1; mem_region_show(sb, "uP Extmem2:", lo, hi); lo = t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE); for (i = 0, free = 0; i < 2; i++) free += G_FREERXPAGECOUNT(t4_read_reg(sc, A_TP_FLM_FREE_RX_CNT)); sbuf_printf(sb, "\n%u Rx pages (%u free) of size %uKiB for %u channels\n", G_PMRXMAXPAGE(lo), free, t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) >> 10, (lo & F_PMRXNUMCHN) ? 2 : 1); lo = t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE); hi = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE); for (i = 0, free = 0; i < 4; i++) free += G_FREETXPAGECOUNT(t4_read_reg(sc, A_TP_FLM_FREE_TX_CNT)); sbuf_printf(sb, "%u Tx pages (%u free) of size %u%ciB for %u channels\n", G_PMTXMAXPAGE(lo), free, hi >= (1 << 20) ? (hi >> 20) : (hi >> 10), hi >= (1 << 20) ? 'M' : 'K', 1 << G_PMTXNUMCHN(lo)); sbuf_printf(sb, "%u p-structs (%u free)\n", t4_read_reg(sc, A_TP_CMM_MM_MAX_PSTRUCT), G_FREEPSTRUCTCOUNT(t4_read_reg(sc, A_TP_FLM_FREE_PS_CNT))); for (i = 0; i < 4; i++) { if (chip_id(sc) > CHELSIO_T5) lo = t4_read_reg(sc, A_MPS_RX_MAC_BG_PG_CNT0 + i * 4); else lo = t4_read_reg(sc, A_MPS_RX_PG_RSV0 + i * 4); if (is_t5(sc)) { used = G_T5_USED(lo); alloc = G_T5_ALLOC(lo); } else { used = G_USED(lo); alloc = G_ALLOC(lo); } /* For T6 these are MAC buffer groups */ sbuf_printf(sb, "\nPort %d using %u pages out of %u allocated", i, used, alloc); } for (i = 0; i < sc->chip_params->nchan; i++) { if (chip_id(sc) > CHELSIO_T5) lo = t4_read_reg(sc, A_MPS_RX_LPBK_BG_PG_CNT0 + i * 4); else lo = t4_read_reg(sc, A_MPS_RX_PG_RSV4 + i * 4); if (is_t5(sc)) { used = G_T5_USED(lo); alloc = G_T5_ALLOC(lo); } else { used = G_USED(lo); alloc = G_ALLOC(lo); } /* For T6 these are MAC buffer groups */ sbuf_printf(sb, "\nLoopback %d using %u pages out of %u allocated", i, used, alloc); } done: mtx_unlock(&sc->reg_lock); if (rc == 0) rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static inline void tcamxy2valmask(uint64_t x, uint64_t y, uint8_t *addr, uint64_t *mask) { *mask = x | y; y = htobe64(y); memcpy(addr, (char *)&y + 2, ETHER_ADDR_LEN); } static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; MPASS(chip_id(sc) <= CHELSIO_T5); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Idx Ethernet address Mask Vld Ports PF" " VF Replication P0 P1 P2 P3 ML"); for (i = 0; i < sc->chip_params->mps_tcam_size; i++) { uint64_t tcamx, tcamy, mask; uint32_t cls_lo, cls_hi; uint8_t addr[ETHER_ADDR_LEN]; mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { tcamy = t4_read_reg64(sc, MPS_CLS_TCAM_Y_L(i)); tcamx = t4_read_reg64(sc, MPS_CLS_TCAM_X_L(i)); } mtx_unlock(&sc->reg_lock); if (rc != 0) break; if (tcamx & tcamy) continue; tcamxy2valmask(tcamx, tcamy, addr, &mask); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i)); cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i)); } mtx_unlock(&sc->reg_lock); if (rc != 0) break; sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x %012jx" " %c %#x%4u%4d", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask, (cls_lo & F_SRAM_VLD) ? 
'Y' : 'N', G_PORTMAP(cls_hi), G_PF(cls_lo), (cls_lo & F_VF_VALID) ? G_VF(cls_lo) : -1); if (cls_lo & F_REPLICATE) { struct fw_ldst_cmd ldst_cmd; memset(&ldst_cmd, 0, sizeof(ldst_cmd)); ldst_cmd.op_to_addrspace = htobe32(V_FW_CMD_OP(FW_LDST_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ | V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS)); ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd)); ldst_cmd.u.mps.rplc.fid_idx = htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) | V_FW_LDST_CMD_IDX(i)); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mps"); if (rc) break; if (hw_off_limits(sc)) rc = ENXIO; else rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd, sizeof(ldst_cmd), &ldst_cmd); end_synchronized_op(sc, 0); if (rc != 0) break; else { sbuf_printf(sb, " %08x %08x %08x %08x", be32toh(ldst_cmd.u.mps.rplc.rplc127_96), be32toh(ldst_cmd.u.mps.rplc.rplc95_64), be32toh(ldst_cmd.u.mps.rplc.rplc63_32), be32toh(ldst_cmd.u.mps.rplc.rplc31_0)); } } else sbuf_printf(sb, "%36s", ""); sbuf_printf(sb, "%4u%3u%3u%3u %#3x", G_SRAM_PRIO0(cls_lo), G_SRAM_PRIO1(cls_lo), G_SRAM_PRIO2(cls_lo), G_SRAM_PRIO3(cls_lo), (cls_lo >> S_MULTILISTEN0) & 0xf); } if (rc) (void) sbuf_finish(sb); else rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; MPASS(chip_id(sc) > CHELSIO_T5); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Idx Ethernet address Mask VNI Mask" " IVLAN Vld DIP_Hit Lookup Port Vld Ports PF VF" " Replication" " P0 P1 P2 P3 ML\n"); for (i = 0; i < sc->chip_params->mps_tcam_size; i++) { uint8_t dip_hit, vlan_vld, lookup_type, port_num; uint16_t ivlan; uint64_t tcamx, tcamy, val, mask; uint32_t cls_lo, cls_hi, ctl, data2, vnix, vniy; uint8_t addr[ETHER_ADDR_LEN]; ctl = V_CTLREQID(1) | V_CTLCMDTYPE(0) | V_CTLXYBITSEL(0); if (i < 256) ctl |= V_CTLTCAMINDEX(i) | V_CTLTCAMSEL(0); else ctl |= V_CTLTCAMINDEX(i - 256) | V_CTLTCAMSEL(1); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { t4_write_reg(sc, A_MPS_CLS_TCAM_DATA2_CTL, ctl); val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1); tcamy = G_DMACH(val) << 32; tcamy |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1); data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1); } mtx_unlock(&sc->reg_lock); if (rc != 0) break; lookup_type = G_DATALKPTYPE(data2); port_num = G_DATAPORTNUM(data2); if (lookup_type && lookup_type != M_DATALKPTYPE) { /* Inner header VNI */ vniy = ((data2 & F_DATAVIDH2) << 23) | (G_DATAVIDH1(data2) << 16) | G_VIDL(val); dip_hit = data2 & F_DATADIPHIT; vlan_vld = 0; } else { vniy = 0; dip_hit = 0; vlan_vld = data2 & F_DATAVIDH2; ivlan = G_VIDL(val); } ctl |= V_CTLXYBITSEL(1); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { t4_write_reg(sc, A_MPS_CLS_TCAM_DATA2_CTL, ctl); val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1); tcamx = G_DMACH(val) << 32; tcamx |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1); data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1); } mtx_unlock(&sc->reg_lock); if (rc != 0) break; if (lookup_type && lookup_type != M_DATALKPTYPE) { /* Inner header VNI mask */ vnix = ((data2 & F_DATAVIDH2) << 23) | (G_DATAVIDH1(data2) << 16) | G_VIDL(val); } else vnix = 0; if (tcamx & tcamy) continue; tcamxy2valmask(tcamx, tcamy, addr, &mask); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i)); cls_hi = t4_read_reg(sc, 
MPS_CLS_SRAM_H(i)); } mtx_unlock(&sc->reg_lock); if (rc != 0) break; if (lookup_type && lookup_type != M_DATALKPTYPE) { sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x " "%012jx %06x %06x - - %3c" " I %4x %3c %#x%4u%4d", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask, vniy, vnix, dip_hit ? 'Y' : 'N', port_num, cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N', G_PORTMAP(cls_hi), G_T6_PF(cls_lo), cls_lo & F_T6_VF_VALID ? G_T6_VF(cls_lo) : -1); } else { sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x " "%012jx - - ", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask); if (vlan_vld) sbuf_printf(sb, "%4u Y ", ivlan); else sbuf_printf(sb, " - N "); sbuf_printf(sb, "- %3c %4x %3c %#x%4u%4d", lookup_type ? 'I' : 'O', port_num, cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N', G_PORTMAP(cls_hi), G_T6_PF(cls_lo), cls_lo & F_T6_VF_VALID ? G_T6_VF(cls_lo) : -1); } if (cls_lo & F_T6_REPLICATE) { struct fw_ldst_cmd ldst_cmd; memset(&ldst_cmd, 0, sizeof(ldst_cmd)); ldst_cmd.op_to_addrspace = htobe32(V_FW_CMD_OP(FW_LDST_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ | V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS)); ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd)); ldst_cmd.u.mps.rplc.fid_idx = htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) | V_FW_LDST_CMD_IDX(i)); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t6mps"); if (rc) break; if (hw_off_limits(sc)) rc = ENXIO; else rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd, sizeof(ldst_cmd), &ldst_cmd); end_synchronized_op(sc, 0); if (rc != 0) break; else { sbuf_printf(sb, " %08x %08x %08x %08x" " %08x %08x %08x %08x", be32toh(ldst_cmd.u.mps.rplc.rplc255_224), be32toh(ldst_cmd.u.mps.rplc.rplc223_192), be32toh(ldst_cmd.u.mps.rplc.rplc191_160), be32toh(ldst_cmd.u.mps.rplc.rplc159_128), be32toh(ldst_cmd.u.mps.rplc.rplc127_96), be32toh(ldst_cmd.u.mps.rplc.rplc95_64), be32toh(ldst_cmd.u.mps.rplc.rplc63_32), be32toh(ldst_cmd.u.mps.rplc.rplc31_0)); } } else sbuf_printf(sb, "%72s", ""); sbuf_printf(sb, "%4u%3u%3u%3u %#x", G_T6_SRAM_PRIO0(cls_lo), G_T6_SRAM_PRIO1(cls_lo), G_T6_SRAM_PRIO2(cls_lo), G_T6_SRAM_PRIO3(cls_lo), (cls_lo >> S_T6_MULTILISTEN0) & 0xf); } if (rc) (void) sbuf_finish(sb); else rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; uint16_t mtus[NMTUS]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_read_mtu_tbl(sc, mtus, NULL); mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "%u %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u", mtus[0], mtus[1], mtus[2], mtus[3], mtus[4], mtus[5], mtus[6], mtus[7], mtus[8], mtus[9], mtus[10], mtus[11], mtus[12], mtus[13], mtus[14], mtus[15]); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint32_t tx_cnt[MAX_PM_NSTATS], rx_cnt[MAX_PM_NSTATS]; uint64_t tx_cyc[MAX_PM_NSTATS], rx_cyc[MAX_PM_NSTATS]; static const char *tx_stats[MAX_PM_NSTATS] = { "Read:", "Write bypass:", "Write mem:", "Bypass + mem:", "Tx FIFO wait", NULL, "Tx latency" }; static const char *rx_stats[MAX_PM_NSTATS] = { "Read:", "Write bypass:", "Write mem:", "Flush:", "Rx FIFO wait", NULL, "Rx latency" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { 
t4_pmtx_get_stats(sc, tx_cnt, tx_cyc); t4_pmrx_get_stats(sc, rx_cnt, rx_cyc); } mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, " Tx pcmds Tx bytes"); for (i = 0; i < 4; i++) { sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i], tx_cyc[i]); } sbuf_printf(sb, "\n Rx pcmds Rx bytes"); for (i = 0; i < 4; i++) { sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i], rx_cyc[i]); } if (chip_id(sc) > CHELSIO_T5) { sbuf_printf(sb, "\n Total wait Total occupancy"); sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i], tx_cyc[i]); sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i], rx_cyc[i]); i += 2; MPASS(i < nitems(tx_stats)); sbuf_printf(sb, "\n Reads Total wait"); sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i], tx_cyc[i]); sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i], rx_cyc[i]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_rdma_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_tp_get_rdma_stats(sc, &stats, 0); mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "NoRQEModDefferals: %u\n", stats.rqe_dfr_mod); sbuf_printf(sb, "NoRQEPktDefferals: %u", stats.rqe_dfr_pkt); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_tcp_stats v4, v6; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_tp_get_tcp_stats(sc, &v4, &v6, 0); mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, " IP IPv6\n"); sbuf_printf(sb, "OutRsts: %20u %20u\n", v4.tcp_out_rsts, v6.tcp_out_rsts); sbuf_printf(sb, "InSegs: %20ju %20ju\n", v4.tcp_in_segs, v6.tcp_in_segs); sbuf_printf(sb, "OutSegs: %20ju %20ju\n", v4.tcp_out_segs, v6.tcp_out_segs); sbuf_printf(sb, "RetransSegs: %20ju %20ju", v4.tcp_retrans_segs, v6.tcp_retrans_segs); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tids(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; uint32_t x, y; struct tid_info *t = &sc->tids; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); if (t->natids) { sbuf_printf(sb, "ATID range: 0-%u, in use: %u\n", t->natids - 1, t->atids_in_use); } if (t->nhpftids) { sbuf_printf(sb, "HPFTID range: %u-%u, in use: %u\n", t->hpftid_base, t->hpftid_end, t->hpftids_in_use); } if (t->ntids) { bool hashen = false; mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) { hashen = true; if (chip_id(sc) <= CHELSIO_T5) { x = t4_read_reg(sc, A_LE_DB_SERVER_INDEX) / 4; y = t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4; } else { x = t4_read_reg(sc, A_LE_DB_SRVR_START_INDEX); y = t4_read_reg(sc, A_T6_LE_DB_HASH_TID_BASE); } } mtx_unlock(&sc->reg_lock); if (rc != 0) goto done; sbuf_printf(sb, "TID range: "); if (hashen) { if (x) sbuf_printf(sb, "%u-%u, ", t->tid_base, x - 1); sbuf_printf(sb, "%u-%u", y, t->ntids - 
1); } else { sbuf_printf(sb, "%u-%u", t->tid_base, t->tid_base + t->ntids - 1); } sbuf_printf(sb, ", in use: %u\n", atomic_load_acq_int(&t->tids_in_use)); } if (t->nstids) { sbuf_printf(sb, "STID range: %u-%u, in use: %u\n", t->stid_base, t->stid_base + t->nstids - 1, t->stids_in_use); } if (t->nftids) { sbuf_printf(sb, "FTID range: %u-%u, in use: %u\n", t->ftid_base, t->ftid_end, t->ftids_in_use); } if (t->netids) { sbuf_printf(sb, "ETID range: %u-%u, in use: %u\n", t->etid_base, t->etid_base + t->netids - 1, t->etids_in_use); } mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { x = t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV4); y = t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV6); } mtx_unlock(&sc->reg_lock); if (rc != 0) goto done; sbuf_printf(sb, "HW TID usage: %u IP users, %u IPv6 users", x, y); done: if (rc == 0) rc = sbuf_finish(sb); else (void)sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_err_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_tp_get_err_stats(sc, &stats, 0); mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3\n"); sbuf_printf(sb, "macInErrs: %10u %10u %10u %10u\n", stats.mac_in_errs[0], stats.mac_in_errs[1], stats.mac_in_errs[2], stats.mac_in_errs[3]); sbuf_printf(sb, "hdrInErrs: %10u %10u %10u %10u\n", stats.hdr_in_errs[0], stats.hdr_in_errs[1], stats.hdr_in_errs[2], stats.hdr_in_errs[3]); sbuf_printf(sb, "tcpInErrs: %10u %10u %10u %10u\n", stats.tcp_in_errs[0], stats.tcp_in_errs[1], stats.tcp_in_errs[2], stats.tcp_in_errs[3]); sbuf_printf(sb, "tcp6InErrs: %10u %10u %10u %10u\n", stats.tcp6_in_errs[0], stats.tcp6_in_errs[1], stats.tcp6_in_errs[2], stats.tcp6_in_errs[3]); sbuf_printf(sb, "tnlCongDrops: %10u %10u %10u %10u\n", stats.tnl_cong_drops[0], stats.tnl_cong_drops[1], stats.tnl_cong_drops[2], stats.tnl_cong_drops[3]); sbuf_printf(sb, "tnlTxDrops: %10u %10u %10u %10u\n", stats.tnl_tx_drops[0], stats.tnl_tx_drops[1], stats.tnl_tx_drops[2], stats.tnl_tx_drops[3]); sbuf_printf(sb, "ofldVlanDrops: %10u %10u %10u %10u\n", stats.ofld_vlan_drops[0], stats.ofld_vlan_drops[1], stats.ofld_vlan_drops[2], stats.ofld_vlan_drops[3]); sbuf_printf(sb, "ofldChanDrops: %10u %10u %10u %10u\n\n", stats.ofld_chan_drops[0], stats.ofld_chan_drops[1], stats.ofld_chan_drops[2], stats.ofld_chan_drops[3]); } else { sbuf_printf(sb, " channel 0 channel 1\n"); sbuf_printf(sb, "macInErrs: %10u %10u\n", stats.mac_in_errs[0], stats.mac_in_errs[1]); sbuf_printf(sb, "hdrInErrs: %10u %10u\n", stats.hdr_in_errs[0], stats.hdr_in_errs[1]); sbuf_printf(sb, "tcpInErrs: %10u %10u\n", stats.tcp_in_errs[0], stats.tcp_in_errs[1]); sbuf_printf(sb, "tcp6InErrs: %10u %10u\n", stats.tcp6_in_errs[0], stats.tcp6_in_errs[1]); sbuf_printf(sb, "tnlCongDrops: %10u %10u\n", stats.tnl_cong_drops[0], stats.tnl_cong_drops[1]); sbuf_printf(sb, "tnlTxDrops: %10u %10u\n", stats.tnl_tx_drops[0], stats.tnl_tx_drops[1]); sbuf_printf(sb, "ofldVlanDrops: %10u %10u\n", stats.ofld_vlan_drops[0], stats.ofld_vlan_drops[1]); sbuf_printf(sb, "ofldChanDrops: %10u %10u\n\n", stats.ofld_chan_drops[0], stats.ofld_chan_drops[1]); } sbuf_printf(sb, "ofldNoNeigh: %u\nofldCongDefer: %u", stats.ofld_no_neigh, stats.ofld_cong_defer); rc = sbuf_finish(sb); 
sbuf_delete(sb); return (rc); } static int sysctl_tnl_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_tnl_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_tp_get_tnl_stats(sc, &stats, 1); mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3\n"); sbuf_printf(sb, "OutPkts: %10u %10u %10u %10u\n", stats.out_pkt[0], stats.out_pkt[1], stats.out_pkt[2], stats.out_pkt[3]); sbuf_printf(sb, "InPkts: %10u %10u %10u %10u", stats.in_pkt[0], stats.in_pkt[1], stats.in_pkt[2], stats.in_pkt[3]); } else { sbuf_printf(sb, " channel 0 channel 1\n"); sbuf_printf(sb, "OutPkts: %10u %10u\n", stats.out_pkt[0], stats.out_pkt[1]); sbuf_printf(sb, "InPkts: %10u %10u", stats.in_pkt[0], stats.in_pkt[1]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct tp_params *tpp = &sc->params.tp; u_int mask; int rc; mask = tpp->la_mask >> 16; rc = sysctl_handle_int(oidp, &mask, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (mask > 0xffff) return (EINVAL); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { tpp->la_mask = mask << 16; t4_set_reg_field(sc, A_TP_DBG_LA_CONFIG, 0xffff0000U, tpp->la_mask); } mtx_unlock(&sc->reg_lock); return (rc); } struct field_desc { const char *name; u_int start; u_int width; }; static void field_desc_show(struct sbuf *sb, uint64_t v, const struct field_desc *f) { char buf[32]; int line_size = 0; while (f->name) { uint64_t mask = (1ULL << f->width) - 1; int len = snprintf(buf, sizeof(buf), "%s: %ju", f->name, ((uintmax_t)v >> f->start) & mask); if (line_size + len >= 79) { line_size = 8; sbuf_printf(sb, "\n "); } sbuf_printf(sb, "%s ", buf); line_size += len + 1; f++; } sbuf_printf(sb, "\n"); } static const struct field_desc tp_la0[] = { { "RcfOpCodeOut", 60, 4 }, { "State", 56, 4 }, { "WcfState", 52, 4 }, { "RcfOpcSrcOut", 50, 2 }, { "CRxError", 49, 1 }, { "ERxError", 48, 1 }, { "SanityFailed", 47, 1 }, { "SpuriousMsg", 46, 1 }, { "FlushInputMsg", 45, 1 }, { "FlushInputCpl", 44, 1 }, { "RssUpBit", 43, 1 }, { "RssFilterHit", 42, 1 }, { "Tid", 32, 10 }, { "InitTcb", 31, 1 }, { "LineNumber", 24, 7 }, { "Emsg", 23, 1 }, { "EdataOut", 22, 1 }, { "Cmsg", 21, 1 }, { "CdataOut", 20, 1 }, { "EreadPdu", 19, 1 }, { "CreadPdu", 18, 1 }, { "TunnelPkt", 17, 1 }, { "RcfPeerFin", 16, 1 }, { "RcfReasonOut", 12, 4 }, { "TxCchannel", 10, 2 }, { "RcfTxChannel", 8, 2 }, { "RxEchannel", 6, 2 }, { "RcfRxChannel", 5, 1 }, { "RcfDataOutSrdy", 4, 1 }, { "RxDvld", 3, 1 }, { "RxOoDvld", 2, 1 }, { "RxCongestion", 1, 1 }, { "TxCongestion", 0, 1 }, { NULL } }; static const struct field_desc tp_la1[] = { { "CplCmdIn", 56, 8 }, { "CplCmdOut", 48, 8 }, { "ESynOut", 47, 1 }, { "EAckOut", 46, 1 }, { "EFinOut", 45, 1 }, { "ERstOut", 44, 1 }, { "SynIn", 43, 1 }, { "AckIn", 42, 1 }, { "FinIn", 41, 1 }, { "RstIn", 40, 1 }, { "DataIn", 39, 1 }, { "DataInVld", 38, 1 }, { "PadIn", 37, 1 }, { "RxBufEmpty", 36, 1 }, { "RxDdp", 35, 1 }, { "RxFbCongestion", 34, 1 }, { "TxFbCongestion", 33, 1 }, { "TxPktSumSrdy", 32, 1 }, { "RcfUlpType", 28, 4 }, { "Eread", 27, 1 }, { "Ebypass", 26, 1 }, { "Esave", 25, 1 }, { "Static0", 24, 1 }, { "Cread", 23, 1 }, { "Cbypass", 22, 1 }, { "Csave", 21, 1 }, { "CPktOut", 20, 1 }, { 
"RxPagePoolFull", 18, 2 }, { "RxLpbkPkt", 17, 1 }, { "TxLpbkPkt", 16, 1 }, { "RxVfValid", 15, 1 }, { "SynLearned", 14, 1 }, { "SetDelEntry", 13, 1 }, { "SetInvEntry", 12, 1 }, { "CpcmdDvld", 11, 1 }, { "CpcmdSave", 10, 1 }, { "RxPstructsFull", 8, 2 }, { "EpcmdDvld", 7, 1 }, { "EpcmdFlush", 6, 1 }, { "EpcmdTrimPrefix", 5, 1 }, { "EpcmdTrimPostfix", 4, 1 }, { "ERssIp4Pkt", 3, 1 }, { "ERssIp6Pkt", 2, 1 }, { "ERssTcpUdpPkt", 1, 1 }, { "ERssFceFipPkt", 0, 1 }, { NULL } }; static const struct field_desc tp_la2[] = { { "CplCmdIn", 56, 8 }, { "MpsVfVld", 55, 1 }, { "MpsPf", 52, 3 }, { "MpsVf", 44, 8 }, { "SynIn", 43, 1 }, { "AckIn", 42, 1 }, { "FinIn", 41, 1 }, { "RstIn", 40, 1 }, { "DataIn", 39, 1 }, { "DataInVld", 38, 1 }, { "PadIn", 37, 1 }, { "RxBufEmpty", 36, 1 }, { "RxDdp", 35, 1 }, { "RxFbCongestion", 34, 1 }, { "TxFbCongestion", 33, 1 }, { "TxPktSumSrdy", 32, 1 }, { "RcfUlpType", 28, 4 }, { "Eread", 27, 1 }, { "Ebypass", 26, 1 }, { "Esave", 25, 1 }, { "Static0", 24, 1 }, { "Cread", 23, 1 }, { "Cbypass", 22, 1 }, { "Csave", 21, 1 }, { "CPktOut", 20, 1 }, { "RxPagePoolFull", 18, 2 }, { "RxLpbkPkt", 17, 1 }, { "TxLpbkPkt", 16, 1 }, { "RxVfValid", 15, 1 }, { "SynLearned", 14, 1 }, { "SetDelEntry", 13, 1 }, { "SetInvEntry", 12, 1 }, { "CpcmdDvld", 11, 1 }, { "CpcmdSave", 10, 1 }, { "RxPstructsFull", 8, 2 }, { "EpcmdDvld", 7, 1 }, { "EpcmdFlush", 6, 1 }, { "EpcmdTrimPrefix", 5, 1 }, { "EpcmdTrimPostfix", 4, 1 }, { "ERssIp4Pkt", 3, 1 }, { "ERssIp6Pkt", 2, 1 }, { "ERssTcpUdpPkt", 1, 1 }, { "ERssFceFipPkt", 0, 1 }, { NULL } }; static void tp_la_show(struct sbuf *sb, uint64_t *p, int idx) { field_desc_show(sb, *p, tp_la0); } static void tp_la_show2(struct sbuf *sb, uint64_t *p, int idx) { if (idx) sbuf_printf(sb, "\n"); field_desc_show(sb, p[0], tp_la0); if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) field_desc_show(sb, p[1], tp_la0); } static void tp_la_show3(struct sbuf *sb, uint64_t *p, int idx) { if (idx) sbuf_printf(sb, "\n"); field_desc_show(sb, p[0], tp_la0); if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) field_desc_show(sb, p[1], (p[0] & (1 << 17)) ? 
tp_la2 : tp_la1); } static int sysctl_tp_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; uint64_t *buf, *p; int rc; u_int i, inc; void (*show_func)(struct sbuf *, uint64_t *, int); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(TPLA_SIZE * sizeof(uint64_t), M_CXGBE, M_ZERO | M_WAITOK); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { t4_tp_read_la(sc, buf, NULL); switch (G_DBGLAMODE(t4_read_reg(sc, A_TP_DBG_LA_CONFIG))) { case 2: inc = 2; show_func = tp_la_show2; break; case 3: inc = 2; show_func = tp_la_show3; break; default: inc = 1; show_func = tp_la_show; } } mtx_unlock(&sc->reg_lock); if (rc != 0) goto done; p = buf; for (i = 0; i < TPLA_SIZE / inc; i++, p += inc) (*show_func)(sb, p, i); rc = sbuf_finish(sb); done: sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; u64 nrate[MAX_NCHAN], orate[MAX_NCHAN]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_get_chan_txrate(sc, nrate, orate); mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3\n"); sbuf_printf(sb, "NIC B/s: %10ju %10ju %10ju %10ju\n", nrate[0], nrate[1], nrate[2], nrate[3]); sbuf_printf(sb, "Offload B/s: %10ju %10ju %10ju %10ju", orate[0], orate[1], orate[2], orate[3]); } else { sbuf_printf(sb, " channel 0 channel 1\n"); sbuf_printf(sb, "NIC B/s: %10ju %10ju\n", nrate[0], nrate[1]); sbuf_printf(sb, "Offload B/s: %10ju %10ju", orate[0], orate[1]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; uint32_t *buf, *p; int rc, i; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(ULPRX_LA_SIZE * 8 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else t4_ulprx_read_la(sc, buf); mtx_unlock(&sc->reg_lock); if (rc != 0) goto done; p = buf; sbuf_printf(sb, " Pcmd Type Message" " Data"); for (i = 0; i < ULPRX_LA_SIZE; i++, p += 8) { sbuf_printf(sb, "\n%08x%08x %4x %08x %08x%08x%08x%08x", p[1], p[0], p[2], p[3], p[7], p[6], p[5], p[4]); } rc = sbuf_finish(sb); done: sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; uint32_t cfg, s1, s2; MPASS(chip_id(sc) >= CHELSIO_T5); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { cfg = t4_read_reg(sc, A_SGE_STAT_CFG); s1 = t4_read_reg(sc, A_SGE_STAT_TOTAL); s2 = t4_read_reg(sc, A_SGE_STAT_MATCH); } mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); if (G_STATSOURCE_T5(cfg) == 7) { int mode; mode = is_t5(sc) ? 
G_STATMODE(cfg) : G_T6_STATMODE(cfg); if (mode == 0) sbuf_printf(sb, "total %d, incomplete %d", s1, s2); else if (mode == 1) sbuf_printf(sb, "total %d, data overflow %d", s1, s2); else sbuf_printf(sb, "unknown mode %d", mode); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_cpus(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; enum cpu_sets op = arg2; cpuset_t cpuset; struct sbuf *sb; int i, rc; MPASS(op == LOCAL_CPUS || op == INTR_CPUS); CPU_ZERO(&cpuset); rc = bus_get_cpus(sc->dev, op, sizeof(cpuset), &cpuset); if (rc != 0) return (rc); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); CPU_FOREACH(i) sbuf_printf(sb, "%d ", i); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_reset(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int val; int rc; val = atomic_load_int(&sc->num_resets); rc = sysctl_handle_int(oidp, &val, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (val == 0) { /* Zero out the counter that tracks reset. */ atomic_store_int(&sc->num_resets, 0); return (0); } if (val != 1) return (EINVAL); /* 0 or 1 are the only legal values */ if (hw_off_limits(sc)) /* harmless race */ return (EALREADY); taskqueue_enqueue(reset_tq, &sc->reset_task); return (0); } #ifdef TCP_OFFLOAD static int sysctl_tls(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int i, j, v, rc; struct vi_info *vi; v = sc->tt.tls; rc = sysctl_handle_int(oidp, &v, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (v != 0 && !(sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS)) return (ENOTSUP); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4stls"); if (rc) return (rc); if (hw_off_limits(sc)) rc = ENXIO; else { sc->tt.tls = !!v; for_each_port(sc, i) { for_each_vi(sc->port[i], j, vi) { if (vi->flags & VI_INIT_DONE) t4_update_fl_bufsize(vi->ifp); } } } end_synchronized_op(sc, 0); return (rc); } static void unit_conv(char *buf, size_t len, u_int val, u_int factor) { u_int rem = val % factor; if (rem == 0) snprintf(buf, len, "%u", val / factor); else { while (rem % 10 == 0) rem /= 10; snprintf(buf, len, "%u.%u", val / factor, rem); } } static int sysctl_tp_tick(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; char buf[16]; u_int res, re; u_int cclk_ps = 1000000000 / sc->params.vpd.cclk; mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) res = (u_int)-1; else res = t4_read_reg(sc, A_TP_TIMER_RESOLUTION); mtx_unlock(&sc->reg_lock); if (res == (u_int)-1) return (ENXIO); switch (arg2) { case 0: /* timer_tick */ re = G_TIMERRESOLUTION(res); break; case 1: /* TCP timestamp tick */ re = G_TIMESTAMPRESOLUTION(res); break; case 2: /* DACK tick */ re = G_DELAYEDACKRESOLUTION(res); break; default: return (EDOOFUS); } unit_conv(buf, sizeof(buf), (cclk_ps << re), 1000000); return (sysctl_handle_string(oidp, buf, sizeof(buf), req)); } static int sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc; u_int dack_tmr, dack_re, v; u_int cclk_ps = 1000000000 / sc->params.vpd.cclk; mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { rc = 0; dack_re = G_DELAYEDACKRESOLUTION(t4_read_reg(sc, A_TP_TIMER_RESOLUTION)); dack_tmr = t4_read_reg(sc, A_TP_DACK_TIMER); } mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); v = ((cclk_ps << dack_re) / 1000000) * dack_tmr; return (sysctl_handle_int(oidp, &v, 0, req)); } static int sysctl_tp_timer(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc, reg = arg2; u_int tre; u_long tp_tick_us, 
v; u_int cclk_ps = 1000000000 / sc->params.vpd.cclk; MPASS(reg == A_TP_RXT_MIN || reg == A_TP_RXT_MAX || reg == A_TP_PERS_MIN || reg == A_TP_PERS_MAX || reg == A_TP_KEEP_IDLE || reg == A_TP_KEEP_INTVL || reg == A_TP_INIT_SRTT || reg == A_TP_FINWAIT2_TIMER); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { rc = 0; tre = G_TIMERRESOLUTION(t4_read_reg(sc, A_TP_TIMER_RESOLUTION)); tp_tick_us = (cclk_ps << tre) / 1000000; if (reg == A_TP_INIT_SRTT) v = tp_tick_us * G_INITSRTT(t4_read_reg(sc, reg)); else v = tp_tick_us * t4_read_reg(sc, reg); } mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); else return (sysctl_handle_long(oidp, &v, 0, req)); } /* * All fields in TP_SHIFT_CNT are 4b and the starting location of the field is * passed to this function. */ static int sysctl_tp_shift_cnt(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc, idx = arg2; u_int v; MPASS(idx >= 0 && idx <= 24); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { rc = 0; v = (t4_read_reg(sc, A_TP_SHIFT_CNT) >> idx) & 0xf; } mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); else return (sysctl_handle_int(oidp, &v, 0, req)); } static int sysctl_tp_backoff(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc, idx = arg2; u_int shift, v, r; MPASS(idx >= 0 && idx < 16); r = A_TP_TCP_BACKOFF_REG0 + (idx & ~3); shift = (idx & 3) << 3; mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else { rc = 0; v = (t4_read_reg(sc, r) >> shift) & M_TIMERBACKOFFINDEX0; } mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); else return (sysctl_handle_int(oidp, &v, 0, req)); } static int sysctl_holdoff_tmr_idx_ofld(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->adapter; int idx, rc, i; struct sge_ofld_rxq *ofld_rxq; uint8_t v; idx = vi->ofld_tmr_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < 0 || idx >= SGE_NTIMERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4otmr"); if (rc) return (rc); v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(vi->ofld_pktc_idx != -1); for_each_ofld_rxq(vi, i, ofld_rxq) { #ifdef atomic_store_rel_8 atomic_store_rel_8(&ofld_rxq->iq.intr_params, v); #else ofld_rxq->iq.intr_params = v; #endif } vi->ofld_tmr_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (0); } static int sysctl_holdoff_pktc_idx_ofld(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->adapter; int idx, rc; idx = vi->ofld_pktc_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < -1 || idx >= SGE_NCOUNTERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4opktc"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->ofld_pktc_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (rc); } #endif static int get_sge_context(struct adapter *sc, struct t4_sge_context *cntxt) { int rc; if (cntxt->cid > M_CTXTQID) return (EINVAL); if (cntxt->mem_id != CTXT_EGRESS && cntxt->mem_id != CTXT_INGRESS && cntxt->mem_id != CTXT_FLM && cntxt->mem_id != CTXT_CNM) return (EINVAL); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ctxt"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } if (sc->flags & FW_OK) { rc = -t4_sge_ctxt_rd(sc, sc->mbox, cntxt->cid, cntxt->mem_id, &cntxt->data[0]); if (rc == 0) goto done; } /* * Read via firmware failed or wasn't even attempted. 
Read directly via * the backdoor. */ rc = -t4_sge_ctxt_rd_bd(sc, cntxt->cid, cntxt->mem_id, &cntxt->data[0]); done: end_synchronized_op(sc, 0); return (rc); } static int load_fw(struct adapter *sc, struct t4_data *fw) { int rc; uint8_t *fw_data; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldfw"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } /* * The firmware, with the sole exception of the memory parity error * handler, runs from memory and not flash. It is almost always safe to * install a new firmware on a running system. Just set bit 1 in * hw.cxgbe.dflags or dev...dflags first. */ if (sc->flags & FULL_INIT_DONE && (sc->debug_flags & DF_LOAD_FW_ANYTIME) == 0) { rc = EBUSY; goto done; } fw_data = malloc(fw->len, M_CXGBE, M_WAITOK); rc = copyin(fw->data, fw_data, fw->len); if (rc == 0) rc = -t4_load_fw(sc, fw_data, fw->len); free(fw_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } static int load_cfg(struct adapter *sc, struct t4_data *cfg) { int rc; uint8_t *cfg_data = NULL; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldcf"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } if (cfg->len == 0) { /* clear */ rc = -t4_load_cfg(sc, NULL, 0); goto done; } cfg_data = malloc(cfg->len, M_CXGBE, M_WAITOK); rc = copyin(cfg->data, cfg_data, cfg->len); if (rc == 0) rc = -t4_load_cfg(sc, cfg_data, cfg->len); free(cfg_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } static int load_boot(struct adapter *sc, struct t4_bootrom *br) { int rc; uint8_t *br_data = NULL; u_int offset; if (br->len > 1024 * 1024) return (EFBIG); if (br->pf_offset == 0) { /* pfidx */ if (br->pfidx_addr > 7) return (EINVAL); offset = G_OFFSET(t4_read_reg(sc, PF_REG(br->pfidx_addr, A_PCIE_PF_EXPROM_OFST))); } else if (br->pf_offset == 1) { /* offset */ offset = G_OFFSET(br->pfidx_addr); } else { return (EINVAL); } rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldbr"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } if (br->len == 0) { /* clear */ rc = -t4_load_boot(sc, NULL, offset, 0); goto done; } br_data = malloc(br->len, M_CXGBE, M_WAITOK); rc = copyin(br->data, br_data, br->len); if (rc == 0) rc = -t4_load_boot(sc, br_data, offset, br->len); free(br_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } static int load_bootcfg(struct adapter *sc, struct t4_data *bc) { int rc; uint8_t *bc_data = NULL; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldcf"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } if (bc->len == 0) { /* clear */ rc = -t4_load_bootcfg(sc, NULL, 0); goto done; } bc_data = malloc(bc->len, M_CXGBE, M_WAITOK); rc = copyin(bc->data, bc_data, bc->len); if (rc == 0) rc = -t4_load_bootcfg(sc, bc_data, bc->len); free(bc_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } static int cudbg_dump(struct adapter *sc, struct t4_cudbg_dump *dump) { int rc; struct cudbg_init *cudbg; void *handle, *buf; /* buf is large, don't block if no memory is available */ buf = malloc(dump->len, M_CXGBE, M_NOWAIT | M_ZERO); if (buf == NULL) return (ENOMEM); handle = cudbg_alloc_handle(); if (handle == NULL) { rc = ENOMEM; goto done; } cudbg = cudbg_get_init(handle); cudbg->adap = sc; cudbg->print = (cudbg_print_cb)printf; #ifndef notyet device_printf(sc->dev, "%s: wr_flash %u, len %u, data %p.\n", __func__, dump->wr_flash, dump->len, dump->data); #endif if (dump->wr_flash) cudbg->use_flash = 1; MPASS(sizeof(cudbg->dbg_bitmap) == 
sizeof(dump->bitmap)); memcpy(cudbg->dbg_bitmap, dump->bitmap, sizeof(cudbg->dbg_bitmap)); rc = cudbg_collect(handle, buf, &dump->len); if (rc != 0) goto done; rc = copyout(buf, dump->data, dump->len); done: cudbg_free_handle(handle); free(buf, M_CXGBE); return (rc); } static void free_offload_policy(struct t4_offload_policy *op) { struct offload_rule *r; int i; if (op == NULL) return; r = &op->rule[0]; for (i = 0; i < op->nrules; i++, r++) { free(r->bpf_prog.bf_insns, M_CXGBE); } free(op->rule, M_CXGBE); free(op, M_CXGBE); } static int set_offload_policy(struct adapter *sc, struct t4_offload_policy *uop) { int i, rc, len; struct t4_offload_policy *op, *old; struct bpf_program *bf; const struct offload_settings *s; struct offload_rule *r; void *u; if (!is_offload(sc)) return (ENODEV); if (uop->nrules == 0) { /* Delete installed policies. */ op = NULL; goto set_policy; } else if (uop->nrules > 256) { /* arbitrary */ return (E2BIG); } /* Copy userspace offload policy to kernel */ op = malloc(sizeof(*op), M_CXGBE, M_ZERO | M_WAITOK); op->nrules = uop->nrules; len = op->nrules * sizeof(struct offload_rule); op->rule = malloc(len, M_CXGBE, M_ZERO | M_WAITOK); rc = copyin(uop->rule, op->rule, len); if (rc) { free(op->rule, M_CXGBE); free(op, M_CXGBE); return (rc); } r = &op->rule[0]; for (i = 0; i < op->nrules; i++, r++) { /* Validate open_type */ if (r->open_type != OPEN_TYPE_LISTEN && r->open_type != OPEN_TYPE_ACTIVE && r->open_type != OPEN_TYPE_PASSIVE && r->open_type != OPEN_TYPE_DONTCARE) { error: /* * Rules 0 to i have malloc'd filters that need to be * freed. Rules i+1 to nrules have userspace pointers * and should be left alone. */ op->nrules = i; free_offload_policy(op); return (rc); } /* Validate settings */ s = &r->settings; if ((s->offload != 0 && s->offload != 1) || s->cong_algo < -1 || s->cong_algo > CONG_ALG_HIGHSPEED || s->sched_class < -1 || s->sched_class >= sc->params.nsched_cls) { rc = EINVAL; goto error; } bf = &r->bpf_prog; u = bf->bf_insns; /* userspace ptr */ bf->bf_insns = NULL; if (bf->bf_len == 0) { /* legal, matches everything */ continue; } len = bf->bf_len * sizeof(*bf->bf_insns); bf->bf_insns = malloc(len, M_CXGBE, M_ZERO | M_WAITOK); rc = copyin(u, bf->bf_insns, len); if (rc != 0) goto error; if (!bpf_validate(bf->bf_insns, bf->bf_len)) { rc = EINVAL; goto error; } } set_policy: rw_wlock(&sc->policy_lock); old = sc->policy; sc->policy = op; rw_wunlock(&sc->policy_lock); free_offload_policy(old); return (0); } #define MAX_READ_BUF_SIZE (128 * 1024) static int read_card_mem(struct adapter *sc, int win, struct t4_mem_range *mr) { uint32_t addr, remaining, n; uint32_t *buf; int rc; uint8_t *dst; mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else rc = validate_mem_range(sc, mr->addr, mr->len); mtx_unlock(&sc->reg_lock); if (rc != 0) return (rc); buf = malloc(min(mr->len, MAX_READ_BUF_SIZE), M_CXGBE, M_WAITOK); addr = mr->addr; remaining = mr->len; dst = (void *)mr->data; while (remaining) { n = min(remaining, MAX_READ_BUF_SIZE); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else read_via_memwin(sc, 2, addr, buf, n); mtx_unlock(&sc->reg_lock); if (rc != 0) break; rc = copyout(buf, dst, n); if (rc != 0) break; dst += n; remaining -= n; addr += n; } free(buf, M_CXGBE); return (rc); } #undef MAX_READ_BUF_SIZE static int read_i2c(struct adapter *sc, struct t4_i2c_data *i2cd) { int rc; if (i2cd->len == 0 || i2cd->port_id >= sc->params.nports) return (EINVAL); if (i2cd->len > sizeof(i2cd->data)) return (EFBIG); rc = begin_synchronized_op(sc, NULL, 
SLEEP_OK | INTR_OK, "t4i2crd"); if (rc) return (rc); if (hw_off_limits(sc)) rc = ENXIO; else rc = -t4_i2c_rd(sc, sc->mbox, i2cd->port_id, i2cd->dev_addr, i2cd->offset, i2cd->len, &i2cd->data[0]); end_synchronized_op(sc, 0); return (rc); } static int clear_stats(struct adapter *sc, u_int port_id) { int i, v, chan_map; struct port_info *pi; struct vi_info *vi; struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *wrq; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) struct sge_ofld_txq *ofld_txq; #endif #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif if (port_id >= sc->params.nports) return (EINVAL); pi = sc->port[port_id]; if (pi == NULL) return (EIO); mtx_lock(&sc->reg_lock); if (!hw_off_limits(sc)) { /* MAC stats */ t4_clr_port_stats(sc, pi->tx_chan); if (is_t6(sc)) { if (pi->fcs_reg != -1) pi->fcs_base = t4_read_reg64(sc, pi->fcs_reg); else pi->stats.rx_fcs_err = 0; } for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE) t4_clr_vi_stats(sc, vi->vin); } chan_map = pi->rx_e_chan_map; v = 0; /* reuse */ while (chan_map) { i = ffs(chan_map) - 1; t4_write_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v, 1, A_TP_MIB_TNL_CNG_DROP_0 + i); chan_map &= ~(1 << i); } } mtx_unlock(&sc->reg_lock); pi->tx_parse_error = 0; pi->tnl_cong_drops = 0; /* * Since this command accepts a port, clear stats for * all VIs on this port. */ for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE) { for_each_rxq(vi, i, rxq) { #if defined(INET) || defined(INET6) rxq->lro.lro_queued = 0; rxq->lro.lro_flushed = 0; #endif rxq->rxcsum = 0; rxq->vlan_extraction = 0; rxq->vxlan_rxcsum = 0; rxq->fl.cl_allocated = 0; rxq->fl.cl_recycled = 0; rxq->fl.cl_fast_recycled = 0; } for_each_txq(vi, i, txq) { txq->txcsum = 0; txq->tso_wrs = 0; txq->vlan_insertion = 0; txq->imm_wrs = 0; txq->sgl_wrs = 0; txq->txpkt_wrs = 0; txq->txpkts0_wrs = 0; txq->txpkts1_wrs = 0; txq->txpkts0_pkts = 0; txq->txpkts1_pkts = 0; txq->txpkts_flush = 0; txq->raw_wrs = 0; txq->vxlan_tso_wrs = 0; txq->vxlan_txcsum = 0; txq->kern_tls_records = 0; txq->kern_tls_short = 0; txq->kern_tls_partial = 0; txq->kern_tls_full = 0; txq->kern_tls_octets = 0; txq->kern_tls_waste = 0; txq->kern_tls_options = 0; txq->kern_tls_header = 0; txq->kern_tls_fin = 0; txq->kern_tls_fin_short = 0; txq->kern_tls_cbc = 0; txq->kern_tls_gcm = 0; mp_ring_reset_stats(txq->r); } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) for_each_ofld_txq(vi, i, ofld_txq) { ofld_txq->wrq.tx_wrs_direct = 0; ofld_txq->wrq.tx_wrs_copied = 0; counter_u64_zero(ofld_txq->tx_iscsi_pdus); counter_u64_zero(ofld_txq->tx_iscsi_octets); counter_u64_zero(ofld_txq->tx_iscsi_iso_wrs); counter_u64_zero(ofld_txq->tx_toe_tls_records); counter_u64_zero(ofld_txq->tx_toe_tls_octets); } #endif #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { ofld_rxq->fl.cl_allocated = 0; ofld_rxq->fl.cl_recycled = 0; ofld_rxq->fl.cl_fast_recycled = 0; counter_u64_zero( ofld_rxq->rx_iscsi_ddp_setup_ok); counter_u64_zero( ofld_rxq->rx_iscsi_ddp_setup_error); ofld_rxq->rx_iscsi_ddp_pdus = 0; ofld_rxq->rx_iscsi_ddp_octets = 0; ofld_rxq->rx_iscsi_fl_pdus = 0; ofld_rxq->rx_iscsi_fl_octets = 0; ofld_rxq->rx_toe_tls_records = 0; ofld_rxq->rx_toe_tls_octets = 0; } #endif if (IS_MAIN_VI(vi)) { wrq = &sc->sge.ctrlq[pi->port_id]; wrq->tx_wrs_direct = 0; wrq->tx_wrs_copied = 0; } } } return (0); } static int hold_clip_addr(struct adapter *sc, struct t4_clip_addr *ca) { #ifdef INET6 struct in6_addr in6; bcopy(&ca->addr[0], &in6.s6_addr[0], sizeof(in6.s6_addr)); if (t4_get_clip_entry(sc, &in6, true) != NULL) return (0); else return 
(EIO); #else return (ENOTSUP); #endif } static int release_clip_addr(struct adapter *sc, struct t4_clip_addr *ca) { #ifdef INET6 struct in6_addr in6; bcopy(&ca->addr[0], &in6.s6_addr[0], sizeof(in6.s6_addr)); return (t4_release_clip_addr(sc, &in6)); #else return (ENOTSUP); #endif } int t4_os_find_pci_capability(struct adapter *sc, int cap) { int i; return (pci_find_cap(sc->dev, cap, &i) == 0 ? i : 0); } int t4_os_pci_save_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_save(dev, dinfo, 0); return (0); } int t4_os_pci_restore_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_restore(dev, dinfo); return (0); } void t4_os_portmod_changed(struct port_info *pi) { struct adapter *sc = pi->adapter; struct vi_info *vi; - struct ifnet *ifp; + if_t ifp; static const char *mod_str[] = { NULL, "LR", "SR", "ER", "TWINAX", "active TWINAX", "LRM" }; KASSERT((pi->flags & FIXED_IFMEDIA) == 0, ("%s: port_type %u", __func__, pi->port_type)); vi = &pi->vi[0]; if (begin_synchronized_op(sc, vi, HOLD_LOCK, "t4mod") == 0) { PORT_LOCK(pi); build_medialist(pi); if (pi->mod_type != FW_PORT_MOD_TYPE_NONE) { fixup_link_config(pi); apply_link_config(pi); } PORT_UNLOCK(pi); end_synchronized_op(sc, LOCK_HELD); } ifp = vi->ifp; if (pi->mod_type == FW_PORT_MOD_TYPE_NONE) if_printf(ifp, "transceiver unplugged.\n"); else if (pi->mod_type == FW_PORT_MOD_TYPE_UNKNOWN) if_printf(ifp, "unknown transceiver inserted.\n"); else if (pi->mod_type == FW_PORT_MOD_TYPE_NOTSUPPORTED) if_printf(ifp, "unsupported transceiver inserted.\n"); else if (pi->mod_type > 0 && pi->mod_type < nitems(mod_str)) { if_printf(ifp, "%dGbps %s transceiver inserted.\n", port_top_speed(pi), mod_str[pi->mod_type]); } else { if_printf(ifp, "transceiver (type %d) inserted.\n", pi->mod_type); } } void t4_os_link_changed(struct port_info *pi) { struct vi_info *vi; - struct ifnet *ifp; + if_t ifp; struct link_config *lc = &pi->link_cfg; struct adapter *sc = pi->adapter; int v; PORT_LOCK_ASSERT_OWNED(pi); if (is_t6(sc)) { if (lc->link_ok) { if (lc->speed > 25000 || (lc->speed == 25000 && lc->fec == FEC_RS)) { pi->fcs_reg = T5_PORT_REG(pi->tx_chan, A_MAC_PORT_AFRAMECHECKSEQUENCEERRORS); } else { pi->fcs_reg = T5_PORT_REG(pi->tx_chan, A_MAC_PORT_MTIP_1G10G_RX_CRCERRORS); } pi->fcs_base = t4_read_reg64(sc, pi->fcs_reg); pi->stats.rx_fcs_err = 0; } else { pi->fcs_reg = -1; } } else { MPASS(pi->fcs_reg != -1); MPASS(pi->fcs_base == 0); } for_each_vi(pi, v, vi) { ifp = vi->ifp; if (ifp == NULL) continue; if (lc->link_ok) { - ifp->if_baudrate = IF_Mbps(lc->speed); + if_setbaudrate(ifp, IF_Mbps(lc->speed)); if_link_state_change(ifp, LINK_STATE_UP); } else { if_link_state_change(ifp, LINK_STATE_DOWN); } } } void t4_iterate(void (*func)(struct adapter *, void *), void *arg) { struct adapter *sc; sx_slock(&t4_list_lock); SLIST_FOREACH(sc, &t4_list, link) { /* * func should not make any assumptions about what state sc is * in - the only guarantee is that sc->sc_lock is a valid lock. 
*/ func(sc, arg); } sx_sunlock(&t4_list_lock); } static int t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, struct thread *td) { int rc; struct adapter *sc = dev->si_drv1; rc = priv_check(td, PRIV_DRIVER); if (rc != 0) return (rc); switch (cmd) { case CHELSIO_T4_GETREG: { struct t4_reg *edata = (struct t4_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else if (edata->size == 4) edata->val = t4_read_reg(sc, edata->addr); else if (edata->size == 8) edata->val = t4_read_reg64(sc, edata->addr); else rc = EINVAL; mtx_unlock(&sc->reg_lock); break; } case CHELSIO_T4_SETREG: { struct t4_reg *edata = (struct t4_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else if (edata->size == 4) { if (edata->val & 0xffffffff00000000) rc = EINVAL; t4_write_reg(sc, edata->addr, (uint32_t) edata->val); } else if (edata->size == 8) t4_write_reg64(sc, edata->addr, edata->val); else rc = EINVAL; mtx_unlock(&sc->reg_lock); break; } case CHELSIO_T4_REGDUMP: { struct t4_regdump *regs = (struct t4_regdump *)data; int reglen = t4_get_regs_len(sc); uint8_t *buf; if (regs->len < reglen) { regs->len = reglen; /* hint to the caller */ return (ENOBUFS); } regs->len = reglen; buf = malloc(reglen, M_CXGBE, M_WAITOK | M_ZERO); mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) rc = ENXIO; else get_regs(sc, regs, buf); mtx_unlock(&sc->reg_lock); if (rc == 0) rc = copyout(buf, regs->data, reglen); free(buf, M_CXGBE); break; } case CHELSIO_T4_GET_FILTER_MODE: rc = get_filter_mode(sc, (uint32_t *)data); break; case CHELSIO_T4_SET_FILTER_MODE: rc = set_filter_mode(sc, *(uint32_t *)data); break; case CHELSIO_T4_SET_FILTER_MASK: rc = set_filter_mask(sc, *(uint32_t *)data); break; case CHELSIO_T4_GET_FILTER: rc = get_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_SET_FILTER: rc = set_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_DEL_FILTER: rc = del_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_GET_SGE_CONTEXT: rc = get_sge_context(sc, (struct t4_sge_context *)data); break; case CHELSIO_T4_LOAD_FW: rc = load_fw(sc, (struct t4_data *)data); break; case CHELSIO_T4_GET_MEM: rc = read_card_mem(sc, 2, (struct t4_mem_range *)data); break; case CHELSIO_T4_GET_I2C: rc = read_i2c(sc, (struct t4_i2c_data *)data); break; case CHELSIO_T4_CLEAR_STATS: rc = clear_stats(sc, *(uint32_t *)data); break; case CHELSIO_T4_SCHED_CLASS: rc = t4_set_sched_class(sc, (struct t4_sched_params *)data); break; case CHELSIO_T4_SCHED_QUEUE: rc = t4_set_sched_queue(sc, (struct t4_sched_queue *)data); break; case CHELSIO_T4_GET_TRACER: rc = t4_get_tracer(sc, (struct t4_tracer *)data); break; case CHELSIO_T4_SET_TRACER: rc = t4_set_tracer(sc, (struct t4_tracer *)data); break; case CHELSIO_T4_LOAD_CFG: rc = load_cfg(sc, (struct t4_data *)data); break; case CHELSIO_T4_LOAD_BOOT: rc = load_boot(sc, (struct t4_bootrom *)data); break; case CHELSIO_T4_LOAD_BOOTCFG: rc = load_bootcfg(sc, (struct t4_data *)data); break; case CHELSIO_T4_CUDBG_DUMP: rc = cudbg_dump(sc, (struct t4_cudbg_dump *)data); break; case CHELSIO_T4_SET_OFLD_POLICY: rc = set_offload_policy(sc, (struct t4_offload_policy *)data); break; case CHELSIO_T4_HOLD_CLIP_ADDR: rc = hold_clip_addr(sc, (struct t4_clip_addr *)data); break; case CHELSIO_T4_RELEASE_CLIP_ADDR: rc = release_clip_addr(sc, (struct t4_clip_addr *)data); break; default: rc = 
ENOTTY; } return (rc); } #ifdef TCP_OFFLOAD static int toe_capability(struct vi_info *vi, bool enable) { int rc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; ASSERT_SYNCHRONIZED_OP(sc); if (!is_offload(sc)) return (ENODEV); if (hw_off_limits(sc)) return (ENXIO); if (enable) { #ifdef KERN_TLS if (sc->flags & KERN_TLS_ON && is_t6(sc)) { int i, j, n; struct port_info *p; struct vi_info *v; /* * Reconfigure hardware for TOE if TXTLS is not enabled * on any ifnet. */ n = 0; for_each_port(sc, i) { p = sc->port[i]; for_each_vi(p, j, v) { - if (v->ifp->if_capenable & IFCAP_TXTLS) { + if (if_getcapenable(v->ifp) & IFCAP_TXTLS) { CH_WARN(sc, "%s has NIC TLS enabled.\n", device_get_nameunit(v->dev)); n++; } } } if (n > 0) { CH_WARN(sc, "Disable NIC TLS on all interfaces " "associated with this adapter before " "trying to enable TOE.\n"); return (EAGAIN); } rc = t6_config_kern_tls(sc, false); if (rc) return (rc); } #endif - if ((vi->ifp->if_capenable & IFCAP_TOE) != 0) { + if ((if_getcapenable(vi->ifp) & IFCAP_TOE) != 0) { /* TOE is already enabled. */ return (0); } /* * We need the port's queues around so that we're able to send * and receive CPLs to/from the TOE even if the ifnet for this * port has never been UP'd administratively. */ if (!(vi->flags & VI_INIT_DONE) && ((rc = vi_init(vi)) != 0)) return (rc); if (!(pi->vi[0].flags & VI_INIT_DONE) && ((rc = vi_init(&pi->vi[0])) != 0)) return (rc); if (isset(&sc->offload_map, pi->port_id)) { /* TOE is enabled on another VI of this port. */ pi->uld_vis++; return (0); } if (!uld_active(sc, ULD_TOM)) { rc = t4_activate_uld(sc, ULD_TOM); if (rc == EAGAIN) { log(LOG_WARNING, "You must kldload t4_tom.ko before trying " "to enable TOE on a cxgbe interface.\n"); } if (rc != 0) return (rc); KASSERT(sc->tom_softc != NULL, ("%s: TOM activated but softc NULL", __func__)); KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM activated but flag not set", __func__)); } /* Activate iWARP and iSCSI too, if the modules are loaded. */ if (!uld_active(sc, ULD_IWARP)) (void) t4_activate_uld(sc, ULD_IWARP); if (!uld_active(sc, ULD_ISCSI)) (void) t4_activate_uld(sc, ULD_ISCSI); pi->uld_vis++; setbit(&sc->offload_map, pi->port_id); } else { pi->uld_vis--; if (!isset(&sc->offload_map, pi->port_id) || pi->uld_vis > 0) return (0); KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM never initialized?", __func__)); clrbit(&sc->offload_map, pi->port_id); } return (0); } /* * Add an upper layer driver to the global list. */ int t4_register_uld(struct uld_info *ui) { int rc = 0; struct uld_info *u; sx_xlock(&t4_uld_list_lock); SLIST_FOREACH(u, &t4_uld_list, link) { if (u->uld_id == ui->uld_id) { rc = EEXIST; goto done; } } SLIST_INSERT_HEAD(&t4_uld_list, ui, link); ui->refcount = 0; done: sx_xunlock(&t4_uld_list_lock); return (rc); } int t4_unregister_uld(struct uld_info *ui) { int rc = EINVAL; struct uld_info *u; sx_xlock(&t4_uld_list_lock); SLIST_FOREACH(u, &t4_uld_list, link) { if (u == ui) { if (ui->refcount > 0) { rc = EBUSY; goto done; } SLIST_REMOVE(&t4_uld_list, ui, uld_info, link); rc = 0; goto done; } } done: sx_xunlock(&t4_uld_list_lock); return (rc); } int t4_activate_uld(struct adapter *sc, int id) { int rc; struct uld_info *ui; ASSERT_SYNCHRONIZED_OP(sc); if (id < 0 || id > ULD_MAX) return (EINVAL); rc = EAGAIN; /* kldoad the module with this ULD and try again. 
*/ sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { if (!(sc->flags & FULL_INIT_DONE)) { rc = adapter_init(sc); if (rc != 0) break; } rc = ui->activate(sc); if (rc == 0) { setbit(&sc->active_ulds, id); ui->refcount++; } break; } } sx_sunlock(&t4_uld_list_lock); return (rc); } int t4_deactivate_uld(struct adapter *sc, int id) { int rc; struct uld_info *ui; ASSERT_SYNCHRONIZED_OP(sc); if (id < 0 || id > ULD_MAX) return (EINVAL); rc = ENXIO; sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { rc = ui->deactivate(sc); if (rc == 0) { clrbit(&sc->active_ulds, id); ui->refcount--; } break; } } sx_sunlock(&t4_uld_list_lock); return (rc); } static int t4_deactivate_all_uld(struct adapter *sc) { int rc; struct uld_info *ui; rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4detuld"); if (rc != 0) return (ENXIO); sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (isset(&sc->active_ulds, ui->uld_id)) { rc = ui->deactivate(sc); if (rc != 0) break; clrbit(&sc->active_ulds, ui->uld_id); ui->refcount--; } } sx_sunlock(&t4_uld_list_lock); end_synchronized_op(sc, 0); return (rc); } static void t4_async_event(struct adapter *sc) { struct uld_info *ui; if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4async") != 0) return; sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == ULD_IWARP) { ui->async_event(sc); break; } } sx_sunlock(&t4_uld_list_lock); end_synchronized_op(sc, 0); } int uld_active(struct adapter *sc, int uld_id) { MPASS(uld_id >= 0 && uld_id <= ULD_MAX); return (isset(&sc->active_ulds, uld_id)); } #endif #ifdef KERN_TLS static int ktls_capability(struct adapter *sc, bool enable) { ASSERT_SYNCHRONIZED_OP(sc); if (!is_ktls(sc)) return (ENODEV); if (!is_t6(sc)) return (0); if (hw_off_limits(sc)) return (ENXIO); if (enable) { if (sc->flags & KERN_TLS_ON) return (0); /* already on */ if (sc->offload_map != 0) { CH_WARN(sc, "Disable TOE on all interfaces associated with " "this adapter before trying to enable NIC TLS.\n"); return (EAGAIN); } return (t6_config_kern_tls(sc, true)); } else { /* * Nothing to do for disable. If TOE is enabled sometime later * then toe_capability will reconfigure the hardware. */ return (0); } } #endif /* * t = ptr to tunable. * nc = number of CPUs. * c = compiled in default for that tunable. */ static void calculate_nqueues(int *t, int nc, const int c) { int nq; if (*t > 0) return; nq = *t < 0 ? -*t : c; *t = min(nc, nq); } /* * Come up with reasonable defaults for some of the tunables, provided they're * not set by the user (in which case we'll use the values as is). 
*/ static void tweak_tunables(void) { int nc = mp_ncpus; /* our snapshot of the number of CPUs */ if (t4_ntxq < 1) { #ifdef RSS t4_ntxq = rss_getnumbuckets(); #else calculate_nqueues(&t4_ntxq, nc, NTXQ); #endif } calculate_nqueues(&t4_ntxq_vi, nc, NTXQ_VI); if (t4_nrxq < 1) { #ifdef RSS t4_nrxq = rss_getnumbuckets(); #else calculate_nqueues(&t4_nrxq, nc, NRXQ); #endif } calculate_nqueues(&t4_nrxq_vi, nc, NRXQ_VI); #if defined(TCP_OFFLOAD) || defined(RATELIMIT) calculate_nqueues(&t4_nofldtxq, nc, NOFLDTXQ); calculate_nqueues(&t4_nofldtxq_vi, nc, NOFLDTXQ_VI); #endif #ifdef TCP_OFFLOAD calculate_nqueues(&t4_nofldrxq, nc, NOFLDRXQ); calculate_nqueues(&t4_nofldrxq_vi, nc, NOFLDRXQ_VI); #endif #if defined(TCP_OFFLOAD) || defined(KERN_TLS) if (t4_toecaps_allowed == -1) t4_toecaps_allowed = FW_CAPS_CONFIG_TOE; #else if (t4_toecaps_allowed == -1) t4_toecaps_allowed = 0; #endif #ifdef TCP_OFFLOAD if (t4_rdmacaps_allowed == -1) { t4_rdmacaps_allowed = FW_CAPS_CONFIG_RDMA_RDDP | FW_CAPS_CONFIG_RDMA_RDMAC; } if (t4_iscsicaps_allowed == -1) { t4_iscsicaps_allowed = FW_CAPS_CONFIG_ISCSI_INITIATOR_PDU | FW_CAPS_CONFIG_ISCSI_TARGET_PDU | FW_CAPS_CONFIG_ISCSI_T10DIF; } if (t4_tmr_idx_ofld < 0 || t4_tmr_idx_ofld >= SGE_NTIMERS) t4_tmr_idx_ofld = TMR_IDX_OFLD; if (t4_pktc_idx_ofld < -1 || t4_pktc_idx_ofld >= SGE_NCOUNTERS) t4_pktc_idx_ofld = PKTC_IDX_OFLD; #else if (t4_rdmacaps_allowed == -1) t4_rdmacaps_allowed = 0; if (t4_iscsicaps_allowed == -1) t4_iscsicaps_allowed = 0; #endif #ifdef DEV_NETMAP calculate_nqueues(&t4_nnmtxq, nc, NNMTXQ); calculate_nqueues(&t4_nnmrxq, nc, NNMRXQ); calculate_nqueues(&t4_nnmtxq_vi, nc, NNMTXQ_VI); calculate_nqueues(&t4_nnmrxq_vi, nc, NNMRXQ_VI); #endif if (t4_tmr_idx < 0 || t4_tmr_idx >= SGE_NTIMERS) t4_tmr_idx = TMR_IDX; if (t4_pktc_idx < -1 || t4_pktc_idx >= SGE_NCOUNTERS) t4_pktc_idx = PKTC_IDX; if (t4_qsize_txq < 128) t4_qsize_txq = 128; if (t4_qsize_rxq < 128) t4_qsize_rxq = 128; while (t4_qsize_rxq & 7) t4_qsize_rxq++; t4_intr_types &= INTR_MSIX | INTR_MSI | INTR_INTX; /* * Number of VIs to create per-port. The first VI is the "main" regular * VI for the port. The rest are additional virtual interfaces on the * same physical port. Note that the main VI does not have native * netmap support but the extra VIs do. * * Limit the number of VIs per port to the number of available * MAC addresses per port. 
*/ if (t4_num_vis < 1) t4_num_vis = 1; if (t4_num_vis > nitems(vi_mac_funcs)) { t4_num_vis = nitems(vi_mac_funcs); printf("cxgbe: number of VIs limited to %d\n", t4_num_vis); } if (pcie_relaxed_ordering < 0 || pcie_relaxed_ordering > 2) { pcie_relaxed_ordering = 1; #if defined(__i386__) || defined(__amd64__) if (cpu_vendor_id == CPU_VENDOR_INTEL) pcie_relaxed_ordering = 0; #endif } } #ifdef DDB static void t4_dump_tcb(struct adapter *sc, int tid) { uint32_t base, i, j, off, pf, reg, save, tcb_addr, win_pos; reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2); save = t4_read_reg(sc, reg); base = sc->memwin[2].mw_base; /* Dump TCB for the tid */ tcb_addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE); tcb_addr += tid * TCB_SIZE; if (is_t4(sc)) { pf = 0; win_pos = tcb_addr & ~0xf; /* start must be 16B aligned */ } else { pf = V_PFNUM(sc->pf); win_pos = tcb_addr & ~0x7f; /* start must be 128B aligned */ } t4_write_reg(sc, reg, win_pos | pf); t4_read_reg(sc, reg); off = tcb_addr - win_pos; for (i = 0; i < 4; i++) { uint32_t buf[8]; for (j = 0; j < 8; j++, off += 4) buf[j] = htonl(t4_read_reg(sc, base + off)); db_printf("%08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); } t4_write_reg(sc, reg, save); t4_read_reg(sc, reg); } static void t4_dump_devlog(struct adapter *sc) { struct devlog_params *dparams = &sc->params.devlog; struct fw_devlog_e e; int i, first, j, m, nentries, rc; uint64_t ftstamp = UINT64_MAX; if (dparams->start == 0) { db_printf("devlog params not valid\n"); return; } nentries = dparams->size / sizeof(struct fw_devlog_e); m = fwmtype_to_hwmtype(dparams->memtype); /* Find the first entry. */ first = -1; for (i = 0; i < nentries && !db_pager_quit; i++) { rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e), sizeof(e), (void *)&e); if (rc != 0) break; if (e.timestamp == 0) break; e.timestamp = be64toh(e.timestamp); if (e.timestamp < ftstamp) { ftstamp = e.timestamp; first = i; } } if (first == -1) return; i = first; do { rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e), sizeof(e), (void *)&e); if (rc != 0) return; if (e.timestamp == 0) return; e.timestamp = be64toh(e.timestamp); e.seqno = be32toh(e.seqno); for (j = 0; j < 8; j++) e.params[j] = be32toh(e.params[j]); db_printf("%10d %15ju %8s %8s ", e.seqno, e.timestamp, (e.level < nitems(devlog_level_strings) ? devlog_level_strings[e.level] : "UNKNOWN"), (e.facility < nitems(devlog_facility_strings) ? 
devlog_facility_strings[e.facility] : "UNKNOWN")); db_printf(e.fmt, e.params[0], e.params[1], e.params[2], e.params[3], e.params[4], e.params[5], e.params[6], e.params[7]); if (++i == nentries) i = 0; } while (i != first && !db_pager_quit); } static struct db_command_table db_t4_table = LIST_HEAD_INITIALIZER(db_t4_table); _DB_SET(_show, t4, NULL, db_show_table, 0, &db_t4_table); DB_FUNC(devlog, db_show_devlog, db_t4_table, CS_OWN, NULL) { device_t dev; int t; bool valid; valid = false; t = db_read_token(); if (t == tIDENT) { dev = device_lookup_by_name(db_tok_string); valid = true; } db_skip_to_eol(); if (!valid) { db_printf("usage: show t4 devlog <nexus>\n"); return; } if (dev == NULL) { db_printf("device not found\n"); return; } t4_dump_devlog(device_get_softc(dev)); } DB_FUNC(tcb, db_show_t4tcb, db_t4_table, CS_OWN, NULL) { device_t dev; int radix, tid, t; bool valid; valid = false; radix = db_radix; db_radix = 10; t = db_read_token(); if (t == tIDENT) { dev = device_lookup_by_name(db_tok_string); t = db_read_token(); if (t == tNUMBER) { tid = db_tok_number; valid = true; } } db_radix = radix; db_skip_to_eol(); if (!valid) { db_printf("usage: show t4 tcb <nexus> <tid>\n"); return; } if (dev == NULL) { db_printf("device not found\n"); return; } if (tid < 0) { db_printf("invalid tid\n"); return; } t4_dump_tcb(device_get_softc(dev), tid); } #endif static eventhandler_tag vxlan_start_evtag; static eventhandler_tag vxlan_stop_evtag; struct vxlan_evargs { - struct ifnet *ifp; + if_t ifp; uint16_t port; }; static void enable_vxlan_rx(struct adapter *sc) { int i, rc; struct port_info *pi; uint8_t match_all_mac[ETHER_ADDR_LEN] = {0}; ASSERT_SYNCHRONIZED_OP(sc); t4_write_reg(sc, A_MPS_RX_VXLAN_TYPE, V_VXLAN(sc->vxlan_port) | F_VXLAN_EN); for_each_port(sc, i) { pi = sc->port[i]; if (pi->vxlan_tcam_entry == true) continue; rc = t4_alloc_raw_mac_filt(sc, pi->vi[0].viid, match_all_mac, match_all_mac, sc->rawf_base + pi->port_id, 1, pi->port_id, true); if (rc < 0) { rc = -rc; CH_ERR(&pi->vi[0], "failed to add VXLAN TCAM entry: %d.\n", rc); } else { MPASS(rc == sc->rawf_base + pi->port_id); pi->vxlan_tcam_entry = true; } } } static void t4_vxlan_start(struct adapter *sc, void *arg) { struct vxlan_evargs *v = arg; if (sc->nrawf == 0 || chip_id(sc) <= CHELSIO_T5) return; if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxst") != 0) return; if (sc->vxlan_refcount == 0) { sc->vxlan_port = v->port; sc->vxlan_refcount = 1; if (!hw_off_limits(sc)) enable_vxlan_rx(sc); } else if (sc->vxlan_port == v->port) { sc->vxlan_refcount++; } else { CH_ERR(sc, "VXLAN already configured on port %d; " "ignoring attempt to configure it on port %d\n", sc->vxlan_port, v->port); } end_synchronized_op(sc, 0); } static void t4_vxlan_stop(struct adapter *sc, void *arg) { struct vxlan_evargs *v = arg; if (sc->nrawf == 0 || chip_id(sc) <= CHELSIO_T5) return; if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxsp") != 0) return; /* * VXLANs may have been configured before the driver was loaded so we * may see more stops than starts. This is not handled cleanly but at * least we keep the refcount sane. */
*/ if (sc->vxlan_port != v->port) goto done; if (sc->vxlan_refcount == 0) { CH_ERR(sc, "VXLAN operation on port %d was stopped earlier; " "ignoring attempt to stop it again.\n", sc->vxlan_port); } else if (--sc->vxlan_refcount == 0 && !hw_off_limits(sc)) t4_set_reg_field(sc, A_MPS_RX_VXLAN_TYPE, F_VXLAN_EN, 0); done: end_synchronized_op(sc, 0); } static void -t4_vxlan_start_handler(void *arg __unused, struct ifnet *ifp, +t4_vxlan_start_handler(void *arg __unused, if_t ifp, sa_family_t family, u_int port) { struct vxlan_evargs v; MPASS(family == AF_INET || family == AF_INET6); v.ifp = ifp; v.port = port; t4_iterate(t4_vxlan_start, &v); } static void -t4_vxlan_stop_handler(void *arg __unused, struct ifnet *ifp, sa_family_t family, +t4_vxlan_stop_handler(void *arg __unused, if_t ifp, sa_family_t family, u_int port) { struct vxlan_evargs v; MPASS(family == AF_INET || family == AF_INET6); v.ifp = ifp; v.port = port; t4_iterate(t4_vxlan_stop, &v); } static struct sx mlu; /* mod load unload */ SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload"); static int mod_event(module_t mod, int cmd, void *arg) { int rc = 0; static int loaded = 0; switch (cmd) { case MOD_LOAD: sx_xlock(&mlu); if (loaded++ == 0) { t4_sge_modload(); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, t4_filter_rpl, CPL_COOKIE_FILTER); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl, CPL_COOKIE_FILTER); t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, t4_hashfilter_ao_rpl, CPL_COOKIE_HASHFILTER); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, t4_hashfilter_tcb_rpl, CPL_COOKIE_HASHFILTER); t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, t4_del_hashfilter_rpl, CPL_COOKIE_HASHFILTER); t4_register_cpl_handler(CPL_TRACE_PKT, t4_trace_pkt); t4_register_cpl_handler(CPL_T5_TRACE_PKT, t5_trace_pkt); t4_register_cpl_handler(CPL_SMT_WRITE_RPL, do_smt_write_rpl); sx_init(&t4_list_lock, "T4/T5 adapters"); SLIST_INIT(&t4_list); callout_init(&fatal_callout, 1); #ifdef TCP_OFFLOAD sx_init(&t4_uld_list_lock, "T4/T5 ULDs"); SLIST_INIT(&t4_uld_list); #endif #ifdef INET6 t4_clip_modload(); #endif #ifdef KERN_TLS t6_ktls_modload(); #endif t4_tracer_modload(); tweak_tunables(); vxlan_start_evtag = EVENTHANDLER_REGISTER(vxlan_start, t4_vxlan_start_handler, NULL, EVENTHANDLER_PRI_ANY); vxlan_stop_evtag = EVENTHANDLER_REGISTER(vxlan_stop, t4_vxlan_stop_handler, NULL, EVENTHANDLER_PRI_ANY); reset_tq = taskqueue_create("t4_rst_tq", M_WAITOK, taskqueue_thread_enqueue, &reset_tq); taskqueue_start_threads(&reset_tq, 1, PI_SOFT, "t4_rst_thr"); } sx_xunlock(&mlu); break; case MOD_UNLOAD: sx_xlock(&mlu); if (--loaded == 0) { int tries; taskqueue_free(reset_tq); sx_slock(&t4_list_lock); if (!SLIST_EMPTY(&t4_list)) { rc = EBUSY; sx_sunlock(&t4_list_lock); goto done_unload; } #ifdef TCP_OFFLOAD sx_slock(&t4_uld_list_lock); if (!SLIST_EMPTY(&t4_uld_list)) { rc = EBUSY; sx_sunlock(&t4_uld_list_lock); sx_sunlock(&t4_list_lock); goto done_unload; } #endif tries = 0; while (tries++ < 5 && t4_sge_extfree_refs() != 0) { uprintf("%ju clusters with custom free routine " "still is use.\n", t4_sge_extfree_refs()); pause("t4unload", 2 * hz); } #ifdef TCP_OFFLOAD sx_sunlock(&t4_uld_list_lock); #endif sx_sunlock(&t4_list_lock); if (t4_sge_extfree_refs() == 0) { EVENTHANDLER_DEREGISTER(vxlan_start, vxlan_start_evtag); EVENTHANDLER_DEREGISTER(vxlan_stop, vxlan_stop_evtag); t4_tracer_modunload(); #ifdef KERN_TLS t6_ktls_modunload(); #endif #ifdef INET6 t4_clip_modunload(); #endif #ifdef TCP_OFFLOAD sx_destroy(&t4_uld_list_lock); #endif 
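			/*
			 * The adapter list lock and the SGE module state are
			 * the last things to go; this point is reached only
			 * after both the adapter and ULD lists were found
			 * empty and all external cluster references were
			 * released.
			 */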
sx_destroy(&t4_list_lock); t4_sge_modunload(); loaded = 0; } else { rc = EBUSY; loaded++; /* undo earlier decrement */ } } done_unload: sx_xunlock(&mlu); break; } return (rc); } DRIVER_MODULE(t4nex, pci, t4_driver, mod_event, 0); MODULE_VERSION(t4nex, 1); MODULE_DEPEND(t4nex, firmware, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(t4nex, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ DRIVER_MODULE(t5nex, pci, t5_driver, mod_event, 0); MODULE_VERSION(t5nex, 1); MODULE_DEPEND(t5nex, firmware, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(t5nex, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ DRIVER_MODULE(t6nex, pci, t6_driver, mod_event, 0); MODULE_VERSION(t6nex, 1); MODULE_DEPEND(t6nex, crypto, 1, 1, 1); MODULE_DEPEND(t6nex, firmware, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(t6nex, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ DRIVER_MODULE(cxgbe, t4nex, cxgbe_driver, 0, 0); MODULE_VERSION(cxgbe, 1); DRIVER_MODULE(cxl, t5nex, cxl_driver, 0, 0); MODULE_VERSION(cxl, 1); DRIVER_MODULE(cc, t6nex, cc_driver, 0, 0); MODULE_VERSION(cc, 1); DRIVER_MODULE(vcxgbe, cxgbe, vcxgbe_driver, 0, 0); MODULE_VERSION(vcxgbe, 1); DRIVER_MODULE(vcxl, cxl, vcxl_driver, 0, 0); MODULE_VERSION(vcxl, 1); DRIVER_MODULE(vcc, cc, vcc_driver, 0, 0); MODULE_VERSION(vcc, 1); diff --git a/sys/dev/cxgbe/t4_netmap.c b/sys/dev/cxgbe/t4_netmap.c index 06c9c98136f5..bc9d5f83a27e 100644 --- a/sys/dev/cxgbe/t4_netmap.c +++ b/sys/dev/cxgbe/t4_netmap.c @@ -1,1444 +1,1444 @@ /*- * Copyright (c) 2014 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #ifdef DEV_NETMAP #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" extern int fl_pad; /* XXXNM */ /* * 0 = normal netmap rx * 1 = black hole * 2 = supermassive black hole (buffer packing enabled) */ int black_hole = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nm_black_hole, CTLFLAG_RWTUN, &black_hole, 0, "Sink incoming packets."); int rx_ndesc = 256; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nm_rx_ndesc, CTLFLAG_RWTUN, &rx_ndesc, 0, "# of rx descriptors after which the hw cidx is updated."); int rx_nframes = 64; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nm_rx_nframes, CTLFLAG_RWTUN, &rx_nframes, 0, "max # of frames received before waking up netmap rx."); int holdoff_tmr_idx = 2; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nm_holdoff_tmr_idx, CTLFLAG_RWTUN, &holdoff_tmr_idx, 0, "Holdoff timer index for netmap rx queues."); /* * Congestion drops. * -1: no congestion feedback (not recommended). * 0: backpressure the channel instead of dropping packets right away. * 1: no backpressure, drop packets for the congested queue immediately. */ static int nm_cong_drop = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nm_cong_drop, CTLFLAG_RWTUN, &nm_cong_drop, 0, "Congestion control for netmap rx queues (0 = backpressure, 1 = drop"); int starve_fl = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, starve_fl, CTLFLAG_RWTUN, &starve_fl, 0, "Don't ring fl db for netmap rx queues."); /* * Try to process tx credits in bulk. This may cause a delay in the return of * tx credits and is suitable for bursty or non-stop tx only. */ int lazy_tx_credit_flush = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, lazy_tx_credit_flush, CTLFLAG_RWTUN, &lazy_tx_credit_flush, 0, "lazy credit flush for netmap tx queues."); /* * Split the netmap rx queues into two groups that populate separate halves of * the RSS indirection table. This allows filters with hashmask to steer to a * particular group of queues. */ static int nm_split_rss = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nm_split_rss, CTLFLAG_RWTUN, &nm_split_rss, 0, "Split the netmap rx queues into two groups."); /* * netmap(4) says "netmap does not use features such as checksum offloading, TCP * segmentation offloading, encryption, VLAN encapsulation/decapsulation, etc." * but this knob can be used to get the hardware to checksum all tx traffic * anyway. 
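 * When nm_txcsum is set, the netmap tx path below leaves cpl->ctrl1 at 0
 * instead of setting F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS, so the hardware
 * inserts IP and L4 checksums on every netmap frame.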
*/ static int nm_txcsum = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nm_txcsum, CTLFLAG_RWTUN, &nm_txcsum, 0, "Enable transmit checksum offloading."); static int free_nm_rxq_hwq(struct vi_info *, struct sge_nm_rxq *); static int free_nm_txq_hwq(struct vi_info *, struct sge_nm_txq *); int alloc_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int intr_idx, int idx) { int rc; struct sysctl_oid *oid; struct sysctl_oid_list *children; struct sysctl_ctx_list *ctx; char name[16]; size_t len; struct adapter *sc = vi->adapter; struct netmap_adapter *na = NA(vi->ifp); MPASS(na != NULL); len = vi->qsize_rxq * IQ_ESIZE; rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map, &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc); if (rc != 0) return (rc); len = na->num_rx_desc * EQ_ESIZE + sc->params.sge.spg_len; rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map, &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc); if (rc != 0) return (rc); nm_rxq->vi = vi; nm_rxq->nid = idx; nm_rxq->iq_cidx = 0; nm_rxq->iq_sidx = vi->qsize_rxq - sc->params.sge.spg_len / IQ_ESIZE; nm_rxq->iq_gen = F_RSPD_GEN; nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0; nm_rxq->fl_sidx = na->num_rx_desc; nm_rxq->fl_sidx2 = nm_rxq->fl_sidx; /* copy for rxsync cacheline */ nm_rxq->intr_idx = intr_idx; nm_rxq->iq_cntxt_id = INVALID_NM_RXQ_CNTXT_ID; ctx = &vi->ctx; children = SYSCTL_CHILDREN(vi->nm_rxq_oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_U16(ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, "absolute id of the queue"); SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &nm_rxq->iq_cidx, 0, "consumer index"); children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "freelist"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, "SGE context id of the freelist"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &nm_rxq->fl_cidx, 0, "consumer index"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &nm_rxq->fl_pidx, 0, "producer index"); return (rc); } int free_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq) { struct adapter *sc = vi->adapter; if (!(vi->flags & VI_INIT_DONE)) return (0); if (nm_rxq->iq_cntxt_id != INVALID_NM_RXQ_CNTXT_ID) free_nm_rxq_hwq(vi, nm_rxq); MPASS(nm_rxq->iq_cntxt_id == INVALID_NM_RXQ_CNTXT_ID); free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba, nm_rxq->iq_desc); free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba, nm_rxq->fl_desc); return (0); } int alloc_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq, int iqidx, int idx) { int rc; size_t len; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct netmap_adapter *na = NA(vi->ifp); char name[16]; struct sysctl_oid *oid; struct sysctl_oid_list *children = SYSCTL_CHILDREN(vi->nm_txq_oid); len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len; rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map, &nm_txq->ba, (void **)&nm_txq->desc); if (rc) return (rc); nm_txq->pidx = nm_txq->cidx = 0; nm_txq->sidx = na->num_tx_desc; nm_txq->nid = idx; nm_txq->iqidx = iqidx; nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | 
V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) | V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld)); if (sc->params.fw_vers >= FW_VERSION32(1, 24, 11, 0)) nm_txq->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS2_WR)); else nm_txq->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)); nm_txq->cntxt_id = INVALID_NM_TXQ_CNTXT_ID; snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "netmap tx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &nm_txq->cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_U16(&vi->ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &nm_txq->cidx, 0, "consumer index"); SYSCTL_ADD_U16(&vi->ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &nm_txq->pidx, 0, "producer index"); return (rc); } int free_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq) { struct adapter *sc = vi->adapter; if (!(vi->flags & VI_INIT_DONE)) return (0); if (nm_txq->cntxt_id != INVALID_NM_TXQ_CNTXT_ID) free_nm_txq_hwq(vi, nm_txq); MPASS(nm_txq->cntxt_id == INVALID_NM_TXQ_CNTXT_ID); free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba, nm_txq->desc); return (0); } static int alloc_nm_rxq_hwq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq) { int rc, cntxt_id; __be32 v; struct adapter *sc = vi->adapter; struct port_info *pi = vi->pi; struct sge_params *sp = &sc->params.sge; struct netmap_adapter *na = NA(vi->ifp); struct fw_iq_cmd c; const int cong_drop = nm_cong_drop; const int cong_map = pi->rx_e_chan_map; MPASS(na != NULL); MPASS(nm_rxq->iq_desc != NULL); MPASS(nm_rxq->fl_desc != NULL); bzero(nm_rxq->iq_desc, vi->qsize_rxq * IQ_ESIZE); bzero(nm_rxq->fl_desc, na->num_rx_desc * EQ_ESIZE + sp->spg_len); bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) | V_FW_IQ_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_IQSTART | FW_LEN16(c)); if (nm_rxq->iq_cntxt_id == INVALID_NM_RXQ_CNTXT_ID) c.alloc_to_len16 |= htobe32(F_FW_IQ_CMD_ALLOC); else { c.iqid = htobe16(nm_rxq->iq_cntxt_id); c.fl0id = htobe16(nm_rxq->fl_cntxt_id); c.fl1id = htobe16(0xffff); c.physiqid = htobe16(nm_rxq->iq_abs_id); } MPASS(!forwarding_intr_to_fwq(sc)); KASSERT(nm_rxq->intr_idx < sc->intr_count, ("%s: invalid direct intr_idx %d", __func__, nm_rxq->intr_idx)); v = V_FW_IQ_CMD_IQANDSTINDEX(nm_rxq->intr_idx); c.type_to_iqandstindex = htobe32(v | V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) | V_FW_IQ_CMD_VIID(vi->viid) | V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT)); c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) | F_FW_IQ_CMD_IQGTSMODE | V_FW_IQ_CMD_IQINTCNTTHRESH(0) | V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4)); c.iqsize = htobe16(vi->qsize_rxq); c.iqaddr = htobe64(nm_rxq->iq_ba); if (cong_drop != -1) { c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN | V_FW_IQ_CMD_FL0CNGCHMAP(cong_map) | F_FW_IQ_CMD_FL0CONGCIF | F_FW_IQ_CMD_FL0CONGEN); } c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_IQ_CMD_IQTYPE(FW_IQ_IQTYPE_NIC) | F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO | (fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) | (black_hole == 2 ? F_FW_IQ_CMD_FL0PACKEN : 0)); c.fl0dcaen_to_fl0cidxfthresh = htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ? X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B_T6) | V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ? 
X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B)); c.fl0size = htobe16(na->num_rx_desc / 8 + sp->spg_len / EQ_ESIZE); c.fl0addr = htobe64(nm_rxq->fl_ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(sc->dev, "failed to create netmap ingress queue: %d\n", rc); return (rc); } nm_rxq->iq_cidx = 0; MPASS(nm_rxq->iq_sidx == vi->qsize_rxq - sp->spg_len / IQ_ESIZE); nm_rxq->iq_gen = F_RSPD_GEN; nm_rxq->iq_cntxt_id = be16toh(c.iqid); nm_rxq->iq_abs_id = be16toh(c.physiqid); cntxt_id = nm_rxq->iq_cntxt_id - sc->sge.iq_start; if (cntxt_id >= sc->sge.iqmap_sz) { panic ("%s: nm_rxq->iq_cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.iqmap_sz - 1); } sc->sge.iqmap[cntxt_id] = (void *)nm_rxq; nm_rxq->fl_cntxt_id = be16toh(c.fl0id); nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0; nm_rxq->fl_db_saved = 0; /* matches the X_FETCHBURSTMAX_512B or X_FETCHBURSTMAX_256B above. */ nm_rxq->fl_db_threshold = chip_id(sc) <= CHELSIO_T5 ? 8 : 4; MPASS(nm_rxq->fl_sidx == na->num_rx_desc); cntxt_id = nm_rxq->fl_cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.eqmap_sz) { panic("%s: nm_rxq->fl_cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.eqmap_sz - 1); } sc->sge.eqmap[cntxt_id] = (void *)nm_rxq; nm_rxq->fl_db_val = V_QID(nm_rxq->fl_cntxt_id) | sc->chip_params->sge_fl_db; if (chip_id(sc) >= CHELSIO_T5 && cong_drop != -1) { t4_sge_set_conm_context(sc, nm_rxq->iq_cntxt_id, cong_drop, cong_map); } t4_write_reg(sc, sc->sge_gts_reg, V_INGRESSQID(nm_rxq->iq_cntxt_id) | V_SEINTARM(V_QINTR_TIMER_IDX(holdoff_tmr_idx))); return (rc); } static int free_nm_rxq_hwq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq) { struct adapter *sc = vi->adapter; int rc; rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, FW_IQ_TYPE_FL_INT_CAP, nm_rxq->iq_cntxt_id, nm_rxq->fl_cntxt_id, 0xffff); if (rc != 0) device_printf(sc->dev, "%s: failed for iq %d, fl %d: %d\n", __func__, nm_rxq->iq_cntxt_id, nm_rxq->fl_cntxt_id, rc); nm_rxq->iq_cntxt_id = INVALID_NM_RXQ_CNTXT_ID; return (rc); } static int alloc_nm_txq_hwq(struct vi_info *vi, struct sge_nm_txq *nm_txq) { int rc, cntxt_id; size_t len; struct adapter *sc = vi->adapter; struct netmap_adapter *na = NA(vi->ifp); struct fw_eq_eth_cmd c; MPASS(na != NULL); MPASS(nm_txq->desc != NULL); len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len; bzero(nm_txq->desc, len); bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) | V_FW_EQ_ETH_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c)); if (nm_txq->cntxt_id == INVALID_NM_TXQ_CNTXT_ID) c.alloc_to_len16 |= htobe32(F_FW_EQ_ETH_CMD_ALLOC); else c.eqid_pkd = htobe32(V_FW_EQ_ETH_CMD_EQID(nm_txq->cntxt_id)); c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE | F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid)); c.fetchszm_to_iqid = htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_ETH_CMD_PCIECHN(vi->pi->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO | V_FW_EQ_ETH_CMD_IQID(sc->sge.nm_rxq[nm_txq->iqidx].iq_cntxt_id)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ? 
X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) | V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_ETH_CMD_EQSIZE(len / EQ_ESIZE)); c.eqaddr = htobe64(nm_txq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(vi->dev, "failed to create netmap egress queue: %d\n", rc); return (rc); } nm_txq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd)); cntxt_id = nm_txq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.eqmap_sz) panic("%s: nm_txq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.eqmap_sz - 1); sc->sge.eqmap[cntxt_id] = (void *)nm_txq; nm_txq->pidx = nm_txq->cidx = 0; MPASS(nm_txq->sidx == na->num_tx_desc); nm_txq->equiqidx = nm_txq->equeqidx = nm_txq->dbidx = 0; nm_txq->doorbells = sc->doorbells; if (isset(&nm_txq->doorbells, DOORBELL_UDB) || isset(&nm_txq->doorbells, DOORBELL_UDBWC) || isset(&nm_txq->doorbells, DOORBELL_WCWR)) { uint32_t s_qpp = sc->params.sge.eq_s_qpp; uint32_t mask = (1 << s_qpp) - 1; volatile uint8_t *udb; udb = sc->udbs_base + UDBS_DB_OFFSET; udb += (nm_txq->cntxt_id >> s_qpp) << PAGE_SHIFT; nm_txq->udb_qid = nm_txq->cntxt_id & mask; if (nm_txq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE) clrbit(&nm_txq->doorbells, DOORBELL_WCWR); else { udb += nm_txq->udb_qid << UDBS_SEG_SHIFT; nm_txq->udb_qid = 0; } nm_txq->udb = (volatile void *)udb; } if (sc->params.fw_vers < FW_VERSION32(1, 25, 1, 0)) { uint32_t param, val; param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) | V_FW_PARAMS_PARAM_YZ(nm_txq->cntxt_id); val = 0xff; rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc != 0) { device_printf(vi->dev, "failed to bind netmap txq %d to class 0xff: %d\n", nm_txq->cntxt_id, rc); rc = 0; } } return (rc); } static int free_nm_txq_hwq(struct vi_info *vi, struct sge_nm_txq *nm_txq) { struct adapter *sc = vi->adapter; int rc; rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, nm_txq->cntxt_id); if (rc != 0) device_printf(sc->dev, "%s: failed for eq %d: %d\n", __func__, nm_txq->cntxt_id, rc); nm_txq->cntxt_id = INVALID_NM_TXQ_CNTXT_ID; return (rc); } static int cxgbe_netmap_simple_rss(struct adapter *sc, struct vi_info *vi, - struct ifnet *ifp, struct netmap_adapter *na) + if_t ifp, struct netmap_adapter *na) { struct netmap_kring *kring; struct sge_nm_rxq *nm_rxq; int rc, i, j, nm_state, defq; uint16_t *rss; /* * Check if there's at least one active (or about to go active) netmap * rx queue. */ defq = -1; for_each_nm_rxq(vi, j, nm_rxq) { nm_state = atomic_load_int(&nm_rxq->nm_state); kring = na->rx_rings[nm_rxq->nid]; if ((nm_state != NM_OFF && !nm_kring_pending_off(kring)) || (nm_state == NM_OFF && nm_kring_pending_on(kring))) { MPASS(nm_rxq->iq_cntxt_id != INVALID_NM_RXQ_CNTXT_ID); if (defq == -1) { defq = nm_rxq->iq_abs_id; break; } } } if (defq == -1) { /* No active netmap queues. Switch back to NIC queues. 
*/ rss = vi->rss; defq = vi->rss[0]; } else { for (i = 0; i < vi->rss_size;) { for_each_nm_rxq(vi, j, nm_rxq) { nm_state = atomic_load_int(&nm_rxq->nm_state); kring = na->rx_rings[nm_rxq->nid]; if ((nm_state != NM_OFF && !nm_kring_pending_off(kring)) || (nm_state == NM_OFF && nm_kring_pending_on(kring))) { MPASS(nm_rxq->iq_cntxt_id != INVALID_NM_RXQ_CNTXT_ID); vi->nm_rss[i++] = nm_rxq->iq_abs_id; if (i == vi->rss_size) break; } } } rss = vi->nm_rss; } rc = -t4_config_rss_range(sc, sc->mbox, vi->viid, 0, vi->rss_size, rss, vi->rss_size); if (rc != 0) if_printf(ifp, "netmap rss_config failed: %d\n", rc); rc = -t4_config_vi_rss(sc, sc->mbox, vi->viid, vi->hashen, defq, 0, 0); if (rc != 0) { if_printf(ifp, "netmap defaultq config failed: %d\n", rc); } return (rc); } /* * Odd number of rx queues work best for split RSS mode as the first queue can * be dedicated for non-RSS traffic and the rest divided into two equal halves. */ static int cxgbe_netmap_split_rss(struct adapter *sc, struct vi_info *vi, - struct ifnet *ifp, struct netmap_adapter *na) + if_t ifp, struct netmap_adapter *na) { struct netmap_kring *kring; struct sge_nm_rxq *nm_rxq; int rc, i, j, nm_state, defq; int nactive[2] = {0, 0}; int dq[2] = {-1, -1}; bool dq_norss; /* default queue should not be in RSS table. */ MPASS(nm_split_rss != 0); MPASS(vi->nnmrxq > 1); for_each_nm_rxq(vi, i, nm_rxq) { j = i / ((vi->nnmrxq + 1) / 2); nm_state = atomic_load_int(&nm_rxq->nm_state); kring = na->rx_rings[nm_rxq->nid]; if ((nm_state != NM_OFF && !nm_kring_pending_off(kring)) || (nm_state == NM_OFF && nm_kring_pending_on(kring))) { MPASS(nm_rxq->iq_cntxt_id != INVALID_NM_RXQ_CNTXT_ID); nactive[j]++; if (dq[j] == -1) { dq[j] = nm_rxq->iq_abs_id; break; } } } if (nactive[0] == 0 || nactive[1] == 0) return (cxgbe_netmap_simple_rss(sc, vi, ifp, na)); MPASS(dq[0] != -1 && dq[1] != -1); if (nactive[0] > nactive[1]) { defq = dq[0]; dq_norss = true; } else if (nactive[0] < nactive[1]) { defq = dq[1]; dq_norss = true; } else { defq = dq[0]; dq_norss = false; } i = 0; nm_rxq = &sc->sge.nm_rxq[vi->first_nm_rxq]; while (i < vi->rss_size / 2) { for (j = 0; j < (vi->nnmrxq + 1) / 2; j++) { nm_state = atomic_load_int(&nm_rxq[j].nm_state); kring = na->rx_rings[nm_rxq[j].nid]; if ((nm_state == NM_OFF && !nm_kring_pending_on(kring)) || (nm_state == NM_ON && nm_kring_pending_off(kring))) { continue; } MPASS(nm_rxq[j].iq_cntxt_id != INVALID_NM_RXQ_CNTXT_ID); if (dq_norss && defq == nm_rxq[j].iq_abs_id) continue; vi->nm_rss[i++] = nm_rxq[j].iq_abs_id; if (i == vi->rss_size / 2) break; } } while (i < vi->rss_size) { for (j = (vi->nnmrxq + 1) / 2; j < vi->nnmrxq; j++) { nm_state = atomic_load_int(&nm_rxq[j].nm_state); kring = na->rx_rings[nm_rxq[j].nid]; if ((nm_state == NM_OFF && !nm_kring_pending_on(kring)) || (nm_state == NM_ON && nm_kring_pending_off(kring))) { continue; } MPASS(nm_rxq[j].iq_cntxt_id != INVALID_NM_RXQ_CNTXT_ID); if (dq_norss && defq == nm_rxq[j].iq_abs_id) continue; vi->nm_rss[i++] = nm_rxq[j].iq_abs_id; if (i == vi->rss_size) break; } } rc = -t4_config_rss_range(sc, sc->mbox, vi->viid, 0, vi->rss_size, vi->nm_rss, vi->rss_size); if (rc != 0) if_printf(ifp, "netmap split_rss_config failed: %d\n", rc); rc = -t4_config_vi_rss(sc, sc->mbox, vi->viid, vi->hashen, defq, 0, 0); if (rc != 0) if_printf(ifp, "netmap defaultq config failed: %d\n", rc); return (rc); } static inline int -cxgbe_netmap_rss(struct adapter *sc, struct vi_info *vi, struct ifnet *ifp, +cxgbe_netmap_rss(struct adapter *sc, struct vi_info *vi, if_t ifp, struct netmap_adapter *na) { 
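	/*
	 * Plain RSS spreads all active netmap rx queues across the entire
	 * indirection table; split mode (nm_split_rss) needs at least two
	 * netmap rx queues so that the two halves of the table can be
	 * populated separately.
	 */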
if (nm_split_rss == 0 || vi->nnmrxq == 1) return (cxgbe_netmap_simple_rss(sc, vi, ifp, na)); else return (cxgbe_netmap_split_rss(sc, vi, ifp, na)); } static int -cxgbe_netmap_on(struct adapter *sc, struct vi_info *vi, struct ifnet *ifp, +cxgbe_netmap_on(struct adapter *sc, struct vi_info *vi, if_t ifp, struct netmap_adapter *na) { struct netmap_slot *slot; struct netmap_kring *kring; struct sge_nm_rxq *nm_rxq; struct sge_nm_txq *nm_txq; int i, j, hwidx; struct rx_buf_info *rxb; ASSERT_SYNCHRONIZED_OP(sc); MPASS(vi->nnmrxq > 0); MPASS(vi->nnmtxq > 0); if ((vi->flags & VI_INIT_DONE) == 0 || - (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + (if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { if_printf(ifp, "cannot enable netmap operation because " "interface is not UP.\n"); return (EAGAIN); } rxb = &sc->sge.rx_buf_info[0]; for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) { if (rxb->size1 == NETMAP_BUF_SIZE(na)) { hwidx = rxb->hwidx1; break; } if (rxb->size2 == NETMAP_BUF_SIZE(na)) { hwidx = rxb->hwidx2; break; } } if (i >= SW_ZONE_SIZES) { if_printf(ifp, "no hwidx for netmap buffer size %d.\n", NETMAP_BUF_SIZE(na)); return (ENXIO); } /* Must set caps before calling netmap_reset */ nm_set_native_flags(na); for_each_nm_rxq(vi, i, nm_rxq) { kring = na->rx_rings[nm_rxq->nid]; if (!nm_kring_pending_on(kring)) continue; alloc_nm_rxq_hwq(vi, nm_rxq); nm_rxq->fl_hwidx = hwidx; slot = netmap_reset(na, NR_RX, i, 0); MPASS(slot != NULL); /* XXXNM: error check, not assert */ /* We deal with 8 bufs at a time */ MPASS((na->num_rx_desc & 7) == 0); MPASS(na->num_rx_desc == nm_rxq->fl_sidx); for (j = 0; j < nm_rxq->fl_sidx; j++) { uint64_t ba; PNMB(na, &slot[j], &ba); MPASS(ba != 0); nm_rxq->fl_desc[j] = htobe64(ba | hwidx); } j = nm_rxq->fl_pidx = nm_rxq->fl_sidx - 8; MPASS((j & 7) == 0); j /= 8; /* driver pidx to hardware pidx */ wmb(); t4_write_reg(sc, sc->sge_kdoorbell_reg, nm_rxq->fl_db_val | V_PIDX(j)); (void) atomic_cmpset_int(&nm_rxq->nm_state, NM_OFF, NM_ON); } for_each_nm_txq(vi, i, nm_txq) { kring = na->tx_rings[nm_txq->nid]; if (!nm_kring_pending_on(kring)) continue; alloc_nm_txq_hwq(vi, nm_txq); slot = netmap_reset(na, NR_TX, i, 0); MPASS(slot != NULL); /* XXXNM: error check, not assert */ } if (vi->nm_rss == NULL) { vi->nm_rss = malloc(vi->rss_size * sizeof(uint16_t), M_CXGBE, M_ZERO | M_WAITOK); } return (cxgbe_netmap_rss(sc, vi, ifp, na)); } static int -cxgbe_netmap_off(struct adapter *sc, struct vi_info *vi, struct ifnet *ifp, +cxgbe_netmap_off(struct adapter *sc, struct vi_info *vi, if_t ifp, struct netmap_adapter *na) { struct netmap_kring *kring; int rc, i, nm_state, nactive; struct sge_nm_txq *nm_txq; struct sge_nm_rxq *nm_rxq; ASSERT_SYNCHRONIZED_OP(sc); MPASS(vi->nnmrxq > 0); MPASS(vi->nnmtxq > 0); if (!nm_netmap_on(na)) return (0); if ((vi->flags & VI_INIT_DONE) == 0) return (0); /* First remove the queues that are stopping from the RSS table. */ rc = cxgbe_netmap_rss(sc, vi, ifp, na); if (rc != 0) return (rc); /* error message logged already. */ for_each_nm_txq(vi, i, nm_txq) { kring = na->tx_rings[nm_txq->nid]; if (!nm_kring_pending_off(kring)) continue; MPASS(nm_txq->cntxt_id != INVALID_NM_TXQ_CNTXT_ID); rc = -t4_eth_eq_stop(sc, sc->mbox, sc->pf, 0, nm_txq->cntxt_id); if (rc != 0) { device_printf(vi->dev, "failed to stop nm_txq[%d]: %d.\n", i, rc); return (rc); } /* XXX: netmap, not the driver, should do this. 
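		 * Resetting the kring indices here leaves netmap with a fully
		 * available tx ring once the hardware queue has been stopped.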
*/ kring->rhead = kring->rcur = kring->nr_hwcur = 0; kring->rtail = kring->nr_hwtail = kring->nkr_num_slots - 1; } nactive = 0; for_each_nm_rxq(vi, i, nm_rxq) { nm_state = atomic_load_int(&nm_rxq->nm_state); kring = na->rx_rings[nm_rxq->nid]; if (nm_state != NM_OFF && !nm_kring_pending_off(kring)) nactive++; if (!nm_kring_pending_off(kring)) continue; MPASS(nm_state != NM_OFF); MPASS(nm_rxq->iq_cntxt_id != INVALID_NM_RXQ_CNTXT_ID); rc = -t4_iq_stop(sc, sc->mbox, sc->pf, 0, FW_IQ_TYPE_FL_INT_CAP, nm_rxq->iq_cntxt_id, nm_rxq->fl_cntxt_id, 0xffff); if (rc != 0) { device_printf(vi->dev, "failed to stop nm_rxq[%d]: %d.\n", i, rc); return (rc); } while (!atomic_cmpset_int(&nm_rxq->nm_state, NM_ON, NM_OFF)) pause("nmst", 1); /* XXX: netmap, not the driver, should do this. */ kring->rhead = kring->rcur = kring->nr_hwcur = 0; kring->rtail = kring->nr_hwtail = 0; } netmap_krings_mode_commit(na, 0); if (nactive == 0) nm_clear_native_flags(na); return (rc); } static int cxgbe_netmap_reg(struct netmap_adapter *na, int on) { - struct ifnet *ifp = na->ifp; - struct vi_info *vi = ifp->if_softc; + if_t ifp = na->ifp; + struct vi_info *vi = if_getsoftc(ifp); struct adapter *sc = vi->adapter; int rc; rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4nmreg"); if (rc != 0) return (rc); if (on) rc = cxgbe_netmap_on(sc, vi, ifp, na); else rc = cxgbe_netmap_off(sc, vi, ifp, na); end_synchronized_op(sc, 0); return (rc); } /* How many packets can a single type1 WR carry in n descriptors */ static inline int ndesc_to_npkt(const int n) { MPASS(n > 0 && n <= SGE_MAX_WR_NDESC); return (n * 2 - 1); } #define MAX_NPKT_IN_TYPE1_WR (ndesc_to_npkt(SGE_MAX_WR_NDESC)) /* * Space (in descriptors) needed for a type1 WR (TX_PKTS or TX_PKTS2) that * carries n packets */ static inline int npkt_to_ndesc(const int n) { MPASS(n > 0 && n <= MAX_NPKT_IN_TYPE1_WR); return ((n + 2) / 2); } /* * Space (in 16B units) needed for a type1 WR (TX_PKTS or TX_PKTS2) that * carries n packets */ static inline int npkt_to_len16(const int n) { MPASS(n > 0 && n <= MAX_NPKT_IN_TYPE1_WR); return (n * 2 + 1); } #define NMIDXDIFF(q, idx) IDXDIFF((q)->pidx, (q)->idx, (q)->sidx) static void ring_nm_txq_db(struct adapter *sc, struct sge_nm_txq *nm_txq) { int n; u_int db = nm_txq->doorbells; MPASS(nm_txq->pidx != nm_txq->dbidx); n = NMIDXDIFF(nm_txq, dbidx); if (n > 1) clrbit(&db, DOORBELL_WCWR); wmb(); switch (ffs(db) - 1) { case DOORBELL_UDB: *nm_txq->udb = htole32(V_QID(nm_txq->udb_qid) | V_PIDX(n)); break; case DOORBELL_WCWR: { volatile uint64_t *dst, *src; /* * Queues whose 128B doorbell segment fits in the page do not * use relative qid (udb_qid is always 0). Only queues with * doorbell segments can do WCWR. */ KASSERT(nm_txq->udb_qid == 0 && n == 1, ("%s: inappropriate doorbell (0x%x, %d, %d) for nm_txq %p", __func__, nm_txq->doorbells, n, nm_txq->pidx, nm_txq)); dst = (volatile void *)((uintptr_t)nm_txq->udb + UDBS_WR_OFFSET - UDBS_DB_OFFSET); src = (void *)&nm_txq->desc[nm_txq->dbidx]; while (src != (void *)&nm_txq->desc[nm_txq->dbidx + 1]) *dst++ = *src++; wmb(); break; } case DOORBELL_UDBWC: *nm_txq->udb = htole32(V_QID(nm_txq->udb_qid) | V_PIDX(n)); wmb(); break; case DOORBELL_KDB: t4_write_reg(sc, sc->sge_kdoorbell_reg, V_QID(nm_txq->cntxt_id) | V_PIDX(n)); break; } nm_txq->dbidx = nm_txq->pidx; } /* * Write work requests to send 'npkt' frames and ring the doorbell to send them * on their way. No need to check for wraparound. 
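 * Each type1 (TX_PKTS/TX_PKTS2) work request packs up to MAX_NPKT_IN_TYPE1_WR
 * frames: n descriptors carry ndesc_to_npkt(n) = 2n - 1 packets and n packets
 * take npkt_to_ndesc(n) = (n + 2) / 2 descriptors, e.g. a 4-descriptor WR
 * carries up to 7 packets.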
*/ static void cxgbe_nm_tx(struct adapter *sc, struct sge_nm_txq *nm_txq, struct netmap_kring *kring, int npkt, int npkt_remaining) { struct netmap_ring *ring = kring->ring; struct netmap_slot *slot; const u_int lim = kring->nkr_num_slots - 1; struct fw_eth_tx_pkts_wr *wr = (void *)&nm_txq->desc[nm_txq->pidx]; uint16_t len; uint64_t ba; struct cpl_tx_pkt_core *cpl; struct ulptx_sgl *usgl; int i, n; while (npkt) { n = min(npkt, MAX_NPKT_IN_TYPE1_WR); len = 0; wr = (void *)&nm_txq->desc[nm_txq->pidx]; wr->op_pkd = nm_txq->op_pkd; wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(npkt_to_len16(n))); wr->npkt = n; wr->r3 = 0; wr->type = 1; cpl = (void *)(wr + 1); for (i = 0; i < n; i++) { slot = &ring->slot[kring->nr_hwcur]; PNMB(kring->na, slot, &ba); MPASS(ba != 0); cpl->ctrl0 = nm_txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(slot->len); cpl->ctrl1 = nm_txcsum ? 0 : htobe64(F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS); usgl = (void *)(cpl + 1); usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(1)); usgl->len0 = htobe32(slot->len); usgl->addr0 = htobe64(ba); slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); cpl = (void *)(usgl + 1); MPASS(slot->len + len <= UINT16_MAX); len += slot->len; kring->nr_hwcur = nm_next(kring->nr_hwcur, lim); } wr->plen = htobe16(len); npkt -= n; nm_txq->pidx += npkt_to_ndesc(n); MPASS(nm_txq->pidx <= nm_txq->sidx); if (__predict_false(nm_txq->pidx == nm_txq->sidx)) { /* * This routine doesn't know how to write WRs that wrap * around. Make sure it wasn't asked to. */ MPASS(npkt == 0); nm_txq->pidx = 0; } if (npkt == 0 && npkt_remaining == 0) { /* All done. */ if (lazy_tx_credit_flush == 0) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | F_FW_WR_EQUIQ); nm_txq->equeqidx = nm_txq->pidx; nm_txq->equiqidx = nm_txq->pidx; } ring_nm_txq_db(sc, nm_txq); return; } if (NMIDXDIFF(nm_txq, equiqidx) >= nm_txq->sidx / 2) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | F_FW_WR_EQUIQ); nm_txq->equeqidx = nm_txq->pidx; nm_txq->equiqidx = nm_txq->pidx; } else if (NMIDXDIFF(nm_txq, equeqidx) >= 64) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); nm_txq->equeqidx = nm_txq->pidx; } if (NMIDXDIFF(nm_txq, dbidx) >= 2 * SGE_MAX_WR_NDESC) ring_nm_txq_db(sc, nm_txq); } /* Will get called again. */ MPASS(npkt_remaining); } /* How many contiguous free descriptors starting at pidx */ static inline int contiguous_ndesc_available(struct sge_nm_txq *nm_txq) { if (nm_txq->cidx > nm_txq->pidx) return (nm_txq->cidx - nm_txq->pidx - 1); else if (nm_txq->cidx > 0) return (nm_txq->sidx - nm_txq->pidx); else return (nm_txq->sidx - nm_txq->pidx - 1); } static int reclaim_nm_tx_desc(struct sge_nm_txq *nm_txq) { struct sge_qstat *spg = (void *)&nm_txq->desc[nm_txq->sidx]; uint16_t hw_cidx = spg->cidx; /* snapshot */ struct fw_eth_tx_pkts_wr *wr; int n = 0; hw_cidx = be16toh(hw_cidx); while (nm_txq->cidx != hw_cidx) { wr = (void *)&nm_txq->desc[nm_txq->cidx]; MPASS(wr->op_pkd == htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)) || wr->op_pkd == htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS2_WR))); MPASS(wr->type == 1); MPASS(wr->npkt > 0 && wr->npkt <= MAX_NPKT_IN_TYPE1_WR); n += wr->npkt; nm_txq->cidx += npkt_to_ndesc(wr->npkt); /* * We never sent a WR that wrapped around so the credits coming * back, WR by WR, should never cause the cidx to wrap around * either. 
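		 * Each completed WR advances cidx by npkt_to_ndesc(wr->npkt),
		 * mirroring how the tx path advanced pidx, so cidx can land on
		 * sidx at most exactly and is folded back to 0 below.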
*/ MPASS(nm_txq->cidx <= nm_txq->sidx); if (__predict_false(nm_txq->cidx == nm_txq->sidx)) nm_txq->cidx = 0; } return (n); } static int cxgbe_netmap_txsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; - struct ifnet *ifp = na->ifp; - struct vi_info *vi = ifp->if_softc; + if_t ifp = na->ifp; + struct vi_info *vi = if_getsoftc(ifp); struct adapter *sc = vi->adapter; struct sge_nm_txq *nm_txq = &sc->sge.nm_txq[vi->first_nm_txq + kring->ring_id]; const u_int head = kring->rhead; u_int reclaimed = 0; int n, d, npkt_remaining, ndesc_remaining; /* * Tx was at kring->nr_hwcur last time around and now we need to advance * to kring->rhead. Note that the driver's pidx moves independent of * netmap's kring->nr_hwcur (pidx counts descriptors and the relation * between descriptors and frames isn't 1:1). */ npkt_remaining = head >= kring->nr_hwcur ? head - kring->nr_hwcur : kring->nkr_num_slots - kring->nr_hwcur + head; while (npkt_remaining) { reclaimed += reclaim_nm_tx_desc(nm_txq); ndesc_remaining = contiguous_ndesc_available(nm_txq); /* Can't run out of descriptors with packets still remaining */ MPASS(ndesc_remaining > 0); /* # of desc needed to tx all remaining packets */ d = (npkt_remaining / MAX_NPKT_IN_TYPE1_WR) * SGE_MAX_WR_NDESC; if (npkt_remaining % MAX_NPKT_IN_TYPE1_WR) d += npkt_to_ndesc(npkt_remaining % MAX_NPKT_IN_TYPE1_WR); if (d <= ndesc_remaining) n = npkt_remaining; else { /* Can't send all, calculate how many can be sent */ n = (ndesc_remaining / SGE_MAX_WR_NDESC) * MAX_NPKT_IN_TYPE1_WR; if (ndesc_remaining % SGE_MAX_WR_NDESC) n += ndesc_to_npkt(ndesc_remaining % SGE_MAX_WR_NDESC); } /* Send n packets and update nm_txq->pidx and kring->nr_hwcur */ npkt_remaining -= n; cxgbe_nm_tx(sc, nm_txq, kring, n, npkt_remaining); } MPASS(npkt_remaining == 0); MPASS(kring->nr_hwcur == head); MPASS(nm_txq->dbidx == nm_txq->pidx); /* * Second part: reclaim buffers for completed transmissions. */ if (reclaimed || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { reclaimed += reclaim_nm_tx_desc(nm_txq); kring->nr_hwtail += reclaimed; if (kring->nr_hwtail >= kring->nkr_num_slots) kring->nr_hwtail -= kring->nkr_num_slots; } return (0); } static int cxgbe_netmap_rxsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct netmap_ring *ring = kring->ring; - struct ifnet *ifp = na->ifp; - struct vi_info *vi = ifp->if_softc; + if_t ifp = na->ifp; + struct vi_info *vi = if_getsoftc(ifp); struct adapter *sc = vi->adapter; struct sge_nm_rxq *nm_rxq = &sc->sge.nm_rxq[vi->first_nm_rxq + kring->ring_id]; u_int const head = kring->rhead; u_int n; int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; if (black_hole) return (0); /* No updates ever. */ if (netmap_no_pendintr || force_update) { kring->nr_hwtail = atomic_load_acq_32(&nm_rxq->fl_cidx); kring->nr_kflags &= ~NKR_PENDINTR; } if (nm_rxq->fl_db_saved > 0 && starve_fl == 0) { wmb(); t4_write_reg(sc, sc->sge_kdoorbell_reg, nm_rxq->fl_db_val | V_PIDX(nm_rxq->fl_db_saved)); nm_rxq->fl_db_saved = 0; } /* Userspace done with buffers from kring->nr_hwcur to head */ n = head >= kring->nr_hwcur ? head - kring->nr_hwcur : kring->nkr_num_slots - kring->nr_hwcur + head; n &= ~7U; if (n > 0) { u_int fl_pidx = nm_rxq->fl_pidx; struct netmap_slot *slot = &ring->slot[fl_pidx]; uint64_t ba; int i, dbinc = 0, hwidx = nm_rxq->fl_hwidx; /* * We always deal with 8 buffers at a time. 
We must have * stopped at an 8B boundary (fl_pidx) last time around and we * must have a multiple of 8B buffers to give to the freelist. */ MPASS((fl_pidx & 7) == 0); MPASS((n & 7) == 0); IDXINCR(kring->nr_hwcur, n, kring->nkr_num_slots); IDXINCR(nm_rxq->fl_pidx, n, nm_rxq->fl_sidx2); while (n > 0) { for (i = 0; i < 8; i++, fl_pidx++, slot++) { PNMB(na, slot, &ba); MPASS(ba != 0); nm_rxq->fl_desc[fl_pidx] = htobe64(ba | hwidx); slot->flags &= ~NS_BUF_CHANGED; MPASS(fl_pidx <= nm_rxq->fl_sidx2); } n -= 8; if (fl_pidx == nm_rxq->fl_sidx2) { fl_pidx = 0; slot = &ring->slot[0]; } if (++dbinc == nm_rxq->fl_db_threshold) { wmb(); if (starve_fl) nm_rxq->fl_db_saved += dbinc; else { t4_write_reg(sc, sc->sge_kdoorbell_reg, nm_rxq->fl_db_val | V_PIDX(dbinc)); } dbinc = 0; } } MPASS(nm_rxq->fl_pidx == fl_pidx); if (dbinc > 0) { wmb(); if (starve_fl) nm_rxq->fl_db_saved += dbinc; else { t4_write_reg(sc, sc->sge_kdoorbell_reg, nm_rxq->fl_db_val | V_PIDX(dbinc)); } } } return (0); } void cxgbe_nm_attach(struct vi_info *vi) { struct port_info *pi; struct adapter *sc; struct netmap_adapter na; MPASS(vi->nnmrxq > 0); MPASS(vi->ifp != NULL); pi = vi->pi; sc = pi->adapter; bzero(&na, sizeof(na)); na.ifp = vi->ifp; na.na_flags = NAF_BDG_MAYSLEEP; /* Netmap doesn't know about the space reserved for the status page. */ na.num_tx_desc = vi->qsize_txq - sc->params.sge.spg_len / EQ_ESIZE; /* * The freelist's cidx/pidx drives netmap's rx cidx/pidx. So * num_rx_desc is based on the number of buffers that can be held in the * freelist, and not the number of entries in the iq. (These two are * not exactly the same due to the space taken up by the status page). */ na.num_rx_desc = rounddown(vi->qsize_rxq, 8); na.nm_txsync = cxgbe_netmap_txsync; na.nm_rxsync = cxgbe_netmap_rxsync; na.nm_register = cxgbe_netmap_reg; na.num_tx_rings = vi->nnmtxq; na.num_rx_rings = vi->nnmrxq; na.rx_buf_maxsize = MAX_MTU; netmap_attach(&na); /* This adds IFCAP_NETMAP to if_capabilities */ } void cxgbe_nm_detach(struct vi_info *vi) { MPASS(vi->nnmrxq > 0); MPASS(vi->ifp != NULL); netmap_detach(vi->ifp); } static inline const void * unwrap_nm_fw6_msg(const struct cpl_fw6_msg *cpl) { MPASS(cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL); /* data[0] is RSS header */ return (&cpl->data[1]); } static void -handle_nm_sge_egr_update(struct adapter *sc, struct ifnet *ifp, +handle_nm_sge_egr_update(struct adapter *sc, if_t ifp, const struct cpl_sge_egr_update *egr) { uint32_t oq; struct sge_nm_txq *nm_txq; oq = be32toh(egr->opcode_qid); MPASS(G_CPL_OPCODE(oq) == CPL_SGE_EGR_UPDATE); nm_txq = (void *)sc->sge.eqmap[G_EGR_QID(oq) - sc->sge.eq_start]; netmap_tx_irq(ifp, nm_txq->nid); } void service_nm_rxq(struct sge_nm_rxq *nm_rxq) { struct vi_info *vi = nm_rxq->vi; struct adapter *sc = vi->adapter; - struct ifnet *ifp = vi->ifp; + if_t ifp = vi->ifp; struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring = na->rx_rings[nm_rxq->nid]; struct netmap_ring *ring = kring->ring; struct iq_desc *d = &nm_rxq->iq_desc[nm_rxq->iq_cidx]; const void *cpl; uint32_t lq; u_int work = 0; uint8_t opcode; uint32_t fl_cidx = atomic_load_acq_32(&nm_rxq->fl_cidx); u_int fl_credits = fl_cidx & 7; u_int ndesc = 0; /* desc processed since last cidx update */ u_int nframes = 0; /* frames processed since last netmap wakeup */ while ((d->rsp.u.type_gen & F_RSPD_GEN) == nm_rxq->iq_gen) { rmb(); lq = be32toh(d->rsp.pldbuflen_qid); opcode = d->rss.opcode; cpl = &d->cpl[0]; switch (G_RSPD_TYPE(d->rsp.u.type_gen)) { case X_RSPD_TYPE_FLBUF: /* fall through */ case 
X_RSPD_TYPE_CPL: MPASS(opcode < NUM_CPL_CMDS); switch (opcode) { case CPL_FW4_MSG: case CPL_FW6_MSG: cpl = unwrap_nm_fw6_msg(cpl); /* fall through */ case CPL_SGE_EGR_UPDATE: handle_nm_sge_egr_update(sc, ifp, cpl); break; case CPL_RX_PKT: ring->slot[fl_cidx].len = G_RSPD_LEN(lq) - sc->params.sge.fl_pktshift; ring->slot[fl_cidx].flags = 0; nframes++; if (!(lq & F_RSPD_NEWBUF)) { MPASS(black_hole == 2); break; } fl_credits++; if (__predict_false(++fl_cidx == nm_rxq->fl_sidx)) fl_cidx = 0; break; default: panic("%s: unexpected opcode 0x%x on nm_rxq %p", __func__, opcode, nm_rxq); } break; case X_RSPD_TYPE_INTR: /* Not equipped to handle forwarded interrupts. */ panic("%s: netmap queue received interrupt for iq %u\n", __func__, lq); default: panic("%s: illegal response type %d on nm_rxq %p", __func__, G_RSPD_TYPE(d->rsp.u.type_gen), nm_rxq); } d++; if (__predict_false(++nm_rxq->iq_cidx == nm_rxq->iq_sidx)) { nm_rxq->iq_cidx = 0; d = &nm_rxq->iq_desc[0]; nm_rxq->iq_gen ^= F_RSPD_GEN; } if (__predict_false(++nframes == rx_nframes) && !black_hole) { atomic_store_rel_32(&nm_rxq->fl_cidx, fl_cidx); netmap_rx_irq(ifp, nm_rxq->nid, &work); nframes = 0; } if (__predict_false(++ndesc == rx_ndesc)) { if (black_hole && fl_credits >= 8) { fl_credits /= 8; IDXINCR(nm_rxq->fl_pidx, fl_credits * 8, nm_rxq->fl_sidx); t4_write_reg(sc, sc->sge_kdoorbell_reg, nm_rxq->fl_db_val | V_PIDX(fl_credits)); fl_credits = fl_cidx & 7; } t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndesc) | V_INGRESSQID(nm_rxq->iq_cntxt_id) | V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX))); ndesc = 0; } } atomic_store_rel_32(&nm_rxq->fl_cidx, fl_cidx); if (black_hole) { fl_credits /= 8; IDXINCR(nm_rxq->fl_pidx, fl_credits * 8, nm_rxq->fl_sidx); t4_write_reg(sc, sc->sge_kdoorbell_reg, nm_rxq->fl_db_val | V_PIDX(fl_credits)); } else if (nframes > 0) netmap_rx_irq(ifp, nm_rxq->nid, &work); t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndesc) | V_INGRESSQID((u32)nm_rxq->iq_cntxt_id) | V_SEINTARM(V_QINTR_TIMER_IDX(holdoff_tmr_idx))); } #endif diff --git a/sys/dev/cxgbe/t4_sched.c b/sys/dev/cxgbe/t4_sched.c index 82f8537bda38..fcbd882098ac 100644 --- a/sys/dev/cxgbe/t4_sched.c +++ b/sys/dev/cxgbe/t4_sched.c @@ -1,991 +1,991 @@ /*- * Copyright (c) 2017 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_msg.h" static int in_range(int val, int lo, int hi) { return (val < 0 || (val <= hi && val >= lo)); } static int set_sched_class_config(struct adapter *sc, int minmax) { int rc; if (minmax < 0) return (EINVAL); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4sscc"); if (rc) return (rc); if (hw_off_limits(sc)) rc = ENXIO; else rc = -t4_sched_config(sc, FW_SCHED_TYPE_PKTSCHED, minmax, 1); end_synchronized_op(sc, 0); return (rc); } static int set_sched_class_params(struct adapter *sc, struct t4_sched_class_params *p, int sleep_ok) { int rc, top_speed, fw_level, fw_mode, fw_rateunit, fw_ratemode; struct port_info *pi; struct tx_cl_rl_params *tc, old; bool check_pktsize = false; if (p->level == SCHED_CLASS_LEVEL_CL_RL) fw_level = FW_SCHED_PARAMS_LEVEL_CL_RL; else if (p->level == SCHED_CLASS_LEVEL_CL_WRR) fw_level = FW_SCHED_PARAMS_LEVEL_CL_WRR; else if (p->level == SCHED_CLASS_LEVEL_CH_RL) fw_level = FW_SCHED_PARAMS_LEVEL_CH_RL; else return (EINVAL); if (p->level == SCHED_CLASS_LEVEL_CL_RL) { if (p->mode == SCHED_CLASS_MODE_CLASS) fw_mode = FW_SCHED_PARAMS_MODE_CLASS; else if (p->mode == SCHED_CLASS_MODE_FLOW) { check_pktsize = true; fw_mode = FW_SCHED_PARAMS_MODE_FLOW; } else return (EINVAL); } else fw_mode = 0; /* Valid channel must always be provided. */ if (p->channel < 0) return (EINVAL); if (!in_range(p->channel, 0, sc->chip_params->nchan - 1)) return (ERANGE); pi = sc->port[sc->chan_map[p->channel]]; if (pi == NULL) return (ENXIO); MPASS(pi->tx_chan == p->channel); top_speed = port_top_speed(pi) * 1000000; /* Gbps -> Kbps */ if (p->level == SCHED_CLASS_LEVEL_CL_RL || p->level == SCHED_CLASS_LEVEL_CH_RL) { /* * Valid rate (mode, unit and values) must be provided. */ if (p->minrate < 0) p->minrate = 0; if (p->maxrate < 0) return (EINVAL); if (p->rateunit == SCHED_CLASS_RATEUNIT_BITS) { fw_rateunit = FW_SCHED_PARAMS_UNIT_BITRATE; /* ratemode could be relative (%) or absolute. */ if (p->ratemode == SCHED_CLASS_RATEMODE_REL) { fw_ratemode = FW_SCHED_PARAMS_RATE_REL; /* maxrate is % of port bandwidth. */ if (!in_range(p->minrate, 0, 100) || !in_range(p->maxrate, 0, 100)) { return (ERANGE); } } else if (p->ratemode == SCHED_CLASS_RATEMODE_ABS) { fw_ratemode = FW_SCHED_PARAMS_RATE_ABS; /* maxrate is absolute value in kbps. */ if (!in_range(p->minrate, 0, top_speed) || !in_range(p->maxrate, 0, top_speed)) { return (ERANGE); } } else return (EINVAL); } else if (p->rateunit == SCHED_CLASS_RATEUNIT_PKTS) { /* maxrate is the absolute value in pps. */ check_pktsize = true; fw_rateunit = FW_SCHED_PARAMS_UNIT_PKTRATE; } else return (EINVAL); } else { MPASS(p->level == SCHED_CLASS_LEVEL_CL_WRR); /* * Valid weight must be provided. 
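		 * Weighted round-robin weights must fall in [1, 99]; anything
		 * else is rejected below.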
*/ if (p->weight < 0) return (EINVAL); if (!in_range(p->weight, 1, 99)) return (ERANGE); fw_rateunit = 0; fw_ratemode = 0; } if (p->level == SCHED_CLASS_LEVEL_CL_RL || p->level == SCHED_CLASS_LEVEL_CL_WRR) { /* * Valid scheduling class must be provided. */ if (p->cl < 0) return (EINVAL); if (!in_range(p->cl, 0, sc->params.nsched_cls - 1)) return (ERANGE); } if (check_pktsize) { if (p->pktsize < 0) return (EINVAL); - if (!in_range(p->pktsize, 64, pi->vi[0].ifp->if_mtu)) + if (!in_range(p->pktsize, 64, if_getmtu(pi->vi[0].ifp))) return (ERANGE); } if (p->level == SCHED_CLASS_LEVEL_CL_RL) { tc = &pi->sched_params->cl_rl[p->cl]; mtx_lock(&sc->tc_lock); if (tc->refcount > 0 || tc->state == CS_HW_UPDATE_IN_PROGRESS) rc = EBUSY; else { old = *tc; tc->flags |= CF_USER; tc->state = CS_HW_UPDATE_IN_PROGRESS; tc->ratemode = fw_ratemode; tc->rateunit = fw_rateunit; tc->mode = fw_mode; tc->maxrate = p->maxrate; tc->pktsize = p->pktsize; rc = 0; } mtx_unlock(&sc->tc_lock); if (rc != 0) return (rc); } rc = begin_synchronized_op(sc, NULL, sleep_ok ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4sscp"); if (rc != 0) { if (p->level == SCHED_CLASS_LEVEL_CL_RL) { mtx_lock(&sc->tc_lock); MPASS(tc->refcount == 0); MPASS(tc->flags & CF_USER); MPASS(tc->state == CS_HW_UPDATE_IN_PROGRESS); *tc = old; mtx_unlock(&sc->tc_lock); } return (rc); } if (!hw_off_limits(sc)) { rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED, fw_level, fw_mode, fw_rateunit, fw_ratemode, p->channel, p->cl, p->minrate, p->maxrate, p->weight, p->pktsize, 0, sleep_ok); } end_synchronized_op(sc, sleep_ok ? 0 : LOCK_HELD); if (p->level == SCHED_CLASS_LEVEL_CL_RL) { mtx_lock(&sc->tc_lock); MPASS(tc->refcount == 0); MPASS(tc->flags & CF_USER); MPASS(tc->state == CS_HW_UPDATE_IN_PROGRESS); if (rc == 0) tc->state = CS_HW_CONFIGURED; else { /* parameters failed so we don't park at params_set */ tc->state = CS_UNINITIALIZED; tc->flags &= ~CF_USER; CH_ERR(pi, "failed to configure traffic class %d: %d. " "params: mode %d, rateunit %d, ratemode %d, " "channel %d, minrate %d, maxrate %d, pktsize %d, " "burstsize %d\n", p->cl, rc, fw_mode, fw_rateunit, fw_ratemode, p->channel, p->minrate, p->maxrate, p->pktsize, 0); } mtx_unlock(&sc->tc_lock); } return (rc); } static void update_tx_sched(void *context, int pending) { int i, j, rc; struct port_info *pi; struct tx_cl_rl_params *tc; struct adapter *sc = context; const int n = sc->params.nsched_cls; mtx_lock(&sc->tc_lock); for_each_port(sc, i) { pi = sc->port[i]; tc = &pi->sched_params->cl_rl[0]; for (j = 0; j < n; j++, tc++) { MPASS(mtx_owned(&sc->tc_lock)); if (tc->state != CS_HW_UPDATE_REQUESTED) continue; mtx_unlock(&sc->tc_lock); if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4utxs") != 0) { mtx_lock(&sc->tc_lock); continue; } rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED, FW_SCHED_PARAMS_LEVEL_CL_RL, tc->mode, tc->rateunit, tc->ratemode, pi->tx_chan, j, 0, tc->maxrate, 0, tc->pktsize, tc->burstsize, 1); end_synchronized_op(sc, 0); mtx_lock(&sc->tc_lock); MPASS(tc->state == CS_HW_UPDATE_REQUESTED); if (rc == 0) { tc->state = CS_HW_CONFIGURED; continue; } /* parameters failed so we try to avoid params_set */ if (tc->refcount > 0) tc->state = CS_PARAMS_SET; else tc->state = CS_UNINITIALIZED; CH_ERR(pi, "failed to configure traffic class %d: %d. 
" "params: mode %d, rateunit %d, ratemode %d, " "channel %d, minrate %d, maxrate %d, pktsize %d, " "burstsize %d\n", j, rc, tc->mode, tc->rateunit, tc->ratemode, pi->tx_chan, 0, tc->maxrate, tc->pktsize, tc->burstsize); } } mtx_unlock(&sc->tc_lock); } int t4_set_sched_class(struct adapter *sc, struct t4_sched_params *p) { if (p->type != SCHED_CLASS_TYPE_PACKET) return (EINVAL); if (p->subcmd == SCHED_CLASS_SUBCMD_CONFIG) return (set_sched_class_config(sc, p->u.config.minmax)); if (p->subcmd == SCHED_CLASS_SUBCMD_PARAMS) return (set_sched_class_params(sc, &p->u.params, 1)); return (EINVAL); } static int bind_txq_to_traffic_class(struct adapter *sc, struct sge_txq *txq, int idx) { struct tx_cl_rl_params *tc0, *tc; int rc, old_idx; uint32_t fw_mnem, fw_class; if (!(txq->eq.flags & EQ_HW_ALLOCATED)) return (ENXIO); mtx_lock(&sc->tc_lock); if (txq->tc_idx == -2) { rc = EBUSY; /* Another bind/unbind in progress already. */ goto done; } if (idx == txq->tc_idx) { rc = 0; /* No change, nothing to do. */ goto done; } tc0 = &sc->port[txq->eq.tx_chan]->sched_params->cl_rl[0]; if (idx != -1) { /* * Bind to a different class at index idx. */ tc = &tc0[idx]; if (tc->state != CS_HW_CONFIGURED) { rc = ENXIO; goto done; } else { /* * Ok to proceed. Place a reference on the new class * while still holding on to the reference on the * previous class, if any. */ tc->refcount++; } } /* Mark as busy before letting go of the lock. */ old_idx = txq->tc_idx; txq->tc_idx = -2; mtx_unlock(&sc->tc_lock); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4btxq"); if (rc == 0) { fw_mnem = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) | V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id)); fw_class = idx < 0 ? 0xffffffff : idx; rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_mnem, &fw_class); end_synchronized_op(sc, 0); } mtx_lock(&sc->tc_lock); MPASS(txq->tc_idx == -2); if (rc == 0) { /* * Unbind, bind, or bind to a different class succeeded. Remove * the reference on the old traffic class, if any. */ if (old_idx != -1) { tc = &tc0[old_idx]; MPASS(tc->refcount > 0); tc->refcount--; } txq->tc_idx = idx; } else { /* * Unbind, bind, or bind to a different class failed. Remove * the anticipatory reference on the new traffic class, if any. */ if (idx != -1) { tc = &tc0[idx]; MPASS(tc->refcount > 0); tc->refcount--; } txq->tc_idx = old_idx; } done: MPASS(txq->tc_idx >= -1 && txq->tc_idx < sc->params.nsched_cls); mtx_unlock(&sc->tc_lock); return (rc); } int t4_set_sched_queue(struct adapter *sc, struct t4_sched_queue *p) { struct port_info *pi = NULL; struct vi_info *vi; struct sge_txq *txq; int i, rc; if (p->port >= sc->params.nports) return (EINVAL); /* * XXX: cxgbetool allows the user to specify the physical port only. So * we always operate on the main VI. */ pi = sc->port[p->port]; vi = &pi->vi[0]; /* Checking VI_INIT_DONE outside a synch-op is a harmless race here. */ if (!(vi->flags & VI_INIT_DONE)) return (EAGAIN); MPASS(vi->ntxq > 0); if (!in_range(p->queue, 0, vi->ntxq - 1) || !in_range(p->cl, 0, sc->params.nsched_cls - 1)) return (EINVAL); if (p->queue < 0) { /* * Change the scheduling on all the TX queues for the * interface. */ for_each_txq(vi, i, txq) { rc = bind_txq_to_traffic_class(sc, txq, p->cl); if (rc != 0) break; } } else { /* * If op.queue is non-negative, then we're only changing the * scheduling on a single specified TX queue. 
*/ txq = &sc->sge.txq[vi->first_txq + p->queue]; rc = bind_txq_to_traffic_class(sc, txq, p->cl); } return (rc); } int t4_init_tx_sched(struct adapter *sc) { int i; const int n = sc->params.nsched_cls; struct port_info *pi; mtx_init(&sc->tc_lock, "tx_sched lock", NULL, MTX_DEF); TASK_INIT(&sc->tc_task, 0, update_tx_sched, sc); for_each_port(sc, i) { pi = sc->port[i]; pi->sched_params = malloc(sizeof(*pi->sched_params) + n * sizeof(struct tx_cl_rl_params), M_CXGBE, M_ZERO | M_WAITOK); } return (0); } int t4_free_tx_sched(struct adapter *sc) { int i; taskqueue_drain(taskqueue_thread, &sc->tc_task); for_each_port(sc, i) { if (sc->port[i] != NULL) free(sc->port[i]->sched_params, M_CXGBE); } if (mtx_initialized(&sc->tc_lock)) mtx_destroy(&sc->tc_lock); return (0); } void t4_update_tx_sched(struct adapter *sc) { taskqueue_enqueue(taskqueue_thread, &sc->tc_task); } int t4_reserve_cl_rl_kbps(struct adapter *sc, int port_id, u_int maxrate, int *tc_idx) { int rc = 0, fa, fa2, i, pktsize, burstsize; bool update; struct tx_cl_rl_params *tc; struct port_info *pi; MPASS(port_id >= 0 && port_id < sc->params.nports); pi = sc->port[port_id]; if (pi->sched_params->pktsize > 0) pktsize = pi->sched_params->pktsize; else - pktsize = pi->vi[0].ifp->if_mtu; + pktsize = if_getmtu(pi->vi[0].ifp); if (pi->sched_params->burstsize > 0) burstsize = pi->sched_params->burstsize; else burstsize = pktsize * 4; tc = &pi->sched_params->cl_rl[0]; update = false; fa = fa2 = -1; mtx_lock(&sc->tc_lock); for (i = 0; i < sc->params.nsched_cls; i++, tc++) { if (tc->state >= CS_PARAMS_SET && tc->ratemode == FW_SCHED_PARAMS_RATE_ABS && tc->rateunit == FW_SCHED_PARAMS_UNIT_BITRATE && tc->mode == FW_SCHED_PARAMS_MODE_FLOW && tc->maxrate == maxrate && tc->pktsize == pktsize && tc->burstsize == burstsize) { tc->refcount++; *tc_idx = i; if (tc->state == CS_PARAMS_SET) { tc->state = CS_HW_UPDATE_REQUESTED; update = true; } goto done; } if (fa < 0 && tc->state == CS_UNINITIALIZED) { MPASS(tc->refcount == 0); fa = i; /* first available, never used. */ } if (fa2 < 0 && tc->refcount == 0 && !(tc->flags & CF_USER)) { fa2 = i; /* first available, used previously. 
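			 * A never-used class (fa) is preferred; fa2 is the
			 * fallback: any class with no references and no
			 * CF_USER flag.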
*/ } } /* Not found */ MPASS(i == sc->params.nsched_cls); if (fa == -1) fa = fa2; if (fa == -1) { *tc_idx = -1; rc = ENOSPC; } else { MPASS(fa >= 0 && fa < sc->params.nsched_cls); tc = &pi->sched_params->cl_rl[fa]; MPASS(!(tc->flags & CF_USER)); MPASS(tc->refcount == 0); tc->refcount = 1; tc->state = CS_HW_UPDATE_REQUESTED; tc->ratemode = FW_SCHED_PARAMS_RATE_ABS; tc->rateunit = FW_SCHED_PARAMS_UNIT_BITRATE; tc->mode = FW_SCHED_PARAMS_MODE_FLOW; tc->maxrate = maxrate; tc->pktsize = pktsize; tc->burstsize = burstsize; *tc_idx = fa; update = true; } done: mtx_unlock(&sc->tc_lock); if (update) t4_update_tx_sched(sc); return (rc); } void t4_release_cl_rl(struct adapter *sc, int port_id, int tc_idx) { struct tx_cl_rl_params *tc; MPASS(port_id >= 0 && port_id < sc->params.nports); MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls); mtx_lock(&sc->tc_lock); tc = &sc->port[port_id]->sched_params->cl_rl[tc_idx]; MPASS(tc->refcount > 0); tc->refcount--; mtx_unlock(&sc->tc_lock); } int sysctl_tc(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->adapter; struct sge_txq *txq; int qidx = arg2, rc, tc_idx; MPASS(qidx >= vi->first_txq && qidx < vi->first_txq + vi->ntxq); txq = &sc->sge.txq[qidx]; tc_idx = txq->tc_idx; rc = sysctl_handle_int(oidp, &tc_idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (sc->flags & IS_VF) return (EPERM); if (!in_range(tc_idx, 0, sc->params.nsched_cls - 1)) return (EINVAL); return (bind_txq_to_traffic_class(sc, txq, tc_idx)); } int sysctl_tc_params(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct tx_cl_rl_params tc; struct sbuf *sb; int i, rc, port_id, mbps, gbps; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); port_id = arg2 >> 16; MPASS(port_id < sc->params.nports); MPASS(sc->port[port_id] != NULL); i = arg2 & 0xffff; MPASS(i < sc->params.nsched_cls); mtx_lock(&sc->tc_lock); tc = sc->port[port_id]->sched_params->cl_rl[i]; mtx_unlock(&sc->tc_lock); if (tc.state < CS_PARAMS_SET) { sbuf_printf(sb, "uninitialized"); goto done; } switch (tc.rateunit) { case SCHED_CLASS_RATEUNIT_BITS: switch (tc.ratemode) { case SCHED_CLASS_RATEMODE_REL: /* XXX: top speed or actual link speed? */ gbps = port_top_speed(sc->port[port_id]); sbuf_printf(sb, "%u%% of %uGbps", tc.maxrate, gbps); break; case SCHED_CLASS_RATEMODE_ABS: mbps = tc.maxrate / 1000; gbps = tc.maxrate / 1000000; if (tc.maxrate == gbps * 1000000) sbuf_printf(sb, "%uGbps", gbps); else if (tc.maxrate == mbps * 1000) sbuf_printf(sb, "%uMbps", mbps); else sbuf_printf(sb, "%uKbps", tc.maxrate); break; default: rc = ENXIO; goto done; } break; case SCHED_CLASS_RATEUNIT_PKTS: sbuf_printf(sb, "%upps", tc.maxrate); break; default: rc = ENXIO; goto done; } switch (tc.mode) { case SCHED_CLASS_MODE_CLASS: /* Note that pktsize and burstsize are not used in this mode. 
*/ sbuf_printf(sb, " aggregate"); break; case SCHED_CLASS_MODE_FLOW: sbuf_printf(sb, " per-flow"); if (tc.pktsize > 0) sbuf_printf(sb, " pkt-size %u", tc.pktsize); if (tc.burstsize > 0) sbuf_printf(sb, " burst-size %u", tc.burstsize); break; default: rc = ENXIO; goto done; } done: if (rc == 0) rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } #ifdef RATELIMIT void t4_init_etid_table(struct adapter *sc) { int i; struct tid_info *t; if (!is_ethoffload(sc)) return; t = &sc->tids; MPASS(t->netids > 0); mtx_init(&t->etid_lock, "etid lock", NULL, MTX_DEF); t->etid_tab = malloc(sizeof(*t->etid_tab) * t->netids, M_CXGBE, M_ZERO | M_WAITOK); t->efree = t->etid_tab; t->etids_in_use = 0; for (i = 1; i < t->netids; i++) t->etid_tab[i - 1].next = &t->etid_tab[i]; t->etid_tab[t->netids - 1].next = NULL; } void t4_free_etid_table(struct adapter *sc) { struct tid_info *t; if (!is_ethoffload(sc)) return; t = &sc->tids; MPASS(t->netids > 0); free(t->etid_tab, M_CXGBE); t->etid_tab = NULL; if (mtx_initialized(&t->etid_lock)) mtx_destroy(&t->etid_lock); } /* etid services */ static int alloc_etid(struct adapter *, struct cxgbe_rate_tag *); static void free_etid(struct adapter *, int); static int alloc_etid(struct adapter *sc, struct cxgbe_rate_tag *cst) { struct tid_info *t = &sc->tids; int etid = -1; mtx_lock(&t->etid_lock); if (t->efree) { union etid_entry *p = t->efree; etid = p - t->etid_tab + t->etid_base; t->efree = p->next; p->cst = cst; t->etids_in_use++; } mtx_unlock(&t->etid_lock); return (etid); } struct cxgbe_rate_tag * lookup_etid(struct adapter *sc, int etid) { struct tid_info *t = &sc->tids; return (t->etid_tab[etid - t->etid_base].cst); } static void free_etid(struct adapter *sc, int etid) { struct tid_info *t = &sc->tids; union etid_entry *p = &t->etid_tab[etid - t->etid_base]; mtx_lock(&t->etid_lock); p->next = t->efree; t->efree = p; t->etids_in_use--; mtx_unlock(&t->etid_lock); } static int cxgbe_rate_tag_modify(struct m_snd_tag *, union if_snd_tag_modify_params *); static int cxgbe_rate_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *); static void cxgbe_rate_tag_free(struct m_snd_tag *); static const struct if_snd_tag_sw cxgbe_rate_tag_sw = { .snd_tag_modify = cxgbe_rate_tag_modify, .snd_tag_query = cxgbe_rate_tag_query, .snd_tag_free = cxgbe_rate_tag_free, .type = IF_SND_TAG_TYPE_RATE_LIMIT }; int -cxgbe_rate_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, +cxgbe_rate_tag_alloc(if_t ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **pt) { int rc, schedcl; - struct vi_info *vi = ifp->if_softc; + struct vi_info *vi = if_getsoftc(ifp); struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct cxgbe_rate_tag *cst; MPASS(params->hdr.type == IF_SND_TAG_TYPE_RATE_LIMIT); rc = t4_reserve_cl_rl_kbps(sc, pi->port_id, (params->rate_limit.max_rate * 8ULL / 1000), &schedcl); if (rc != 0) return (rc); MPASS(schedcl >= 0 && schedcl < sc->params.nsched_cls); cst = malloc(sizeof(*cst), M_CXGBE, M_ZERO | M_NOWAIT); if (cst == NULL) { failed: t4_release_cl_rl(sc, pi->port_id, schedcl); return (ENOMEM); } cst->etid = alloc_etid(sc, cst); if (cst->etid < 0) { free(cst, M_CXGBE); goto failed; } mtx_init(&cst->lock, "cst_lock", NULL, MTX_DEF); mbufq_init(&cst->pending_tx, INT_MAX); mbufq_init(&cst->pending_fwack, INT_MAX); m_snd_tag_init(&cst->com, ifp, &cxgbe_rate_tag_sw); cst->flags |= EO_FLOWC_PENDING | EO_SND_TAG_REF; cst->adapter = sc; cst->port_id = pi->port_id; cst->schedcl = schedcl; cst->max_rate = params->rate_limit.max_rate; 
cst->tx_credits = sc->params.eo_wr_cred; cst->tx_total = cst->tx_credits; cst->plen = 0; cst->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) | V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld)); /* * Queues will be selected later when the connection flowid is available. */ *pt = &cst->com; return (0); } /* * Change in parameters, no change in ifp. */ static int cxgbe_rate_tag_modify(struct m_snd_tag *mst, union if_snd_tag_modify_params *params) { int rc, schedcl; struct cxgbe_rate_tag *cst = mst_to_crt(mst); struct adapter *sc = cst->adapter; /* XXX: is schedcl -1 ok here? */ MPASS(cst->schedcl >= 0 && cst->schedcl < sc->params.nsched_cls); mtx_lock(&cst->lock); MPASS(cst->flags & EO_SND_TAG_REF); rc = t4_reserve_cl_rl_kbps(sc, cst->port_id, (params->rate_limit.max_rate * 8ULL / 1000), &schedcl); if (rc != 0) return (rc); MPASS(schedcl >= 0 && schedcl < sc->params.nsched_cls); t4_release_cl_rl(sc, cst->port_id, cst->schedcl); cst->schedcl = schedcl; cst->max_rate = params->rate_limit.max_rate; mtx_unlock(&cst->lock); return (0); } static int cxgbe_rate_tag_query(struct m_snd_tag *mst, union if_snd_tag_query_params *params) { struct cxgbe_rate_tag *cst = mst_to_crt(mst); params->rate_limit.max_rate = cst->max_rate; #define CST_TO_MST_QLEVEL_SCALE (IF_SND_QUEUE_LEVEL_MAX / cst->tx_total) params->rate_limit.queue_level = (cst->tx_total - cst->tx_credits) * CST_TO_MST_QLEVEL_SCALE; return (0); } /* * Unlocks cst and frees it. */ void cxgbe_rate_tag_free_locked(struct cxgbe_rate_tag *cst) { struct adapter *sc = cst->adapter; mtx_assert(&cst->lock, MA_OWNED); MPASS((cst->flags & EO_SND_TAG_REF) == 0); MPASS(cst->tx_credits == cst->tx_total); MPASS(cst->plen == 0); MPASS(mbufq_first(&cst->pending_tx) == NULL); MPASS(mbufq_first(&cst->pending_fwack) == NULL); if (cst->etid >= 0) free_etid(sc, cst->etid); if (cst->schedcl != -1) t4_release_cl_rl(sc, cst->port_id, cst->schedcl); mtx_unlock(&cst->lock); mtx_destroy(&cst->lock); free(cst, M_CXGBE); } static void cxgbe_rate_tag_free(struct m_snd_tag *mst) { struct cxgbe_rate_tag *cst = mst_to_crt(mst); mtx_lock(&cst->lock); /* The kernel is done with the snd_tag. Remove its reference. */ MPASS(cst->flags & EO_SND_TAG_REF); cst->flags &= ~EO_SND_TAG_REF; if (cst->ncompl == 0) { /* * No fw4_ack in flight. Free the tag right away if there are * no outstanding credits. Request the firmware to return all * credits for the etid otherwise. */ if (cst->tx_credits == cst->tx_total) { cxgbe_rate_tag_free_locked(cst); return; /* cst is gone. */ } send_etid_flush_wr(cst); } mtx_unlock(&cst->lock); } void -cxgbe_ratelimit_query(struct ifnet *ifp, struct if_ratelimit_query_results *q) +cxgbe_ratelimit_query(if_t ifp, struct if_ratelimit_query_results *q) { - struct vi_info *vi = ifp->if_softc; + struct vi_info *vi = if_getsoftc(ifp); struct adapter *sc = vi->adapter; q->rate_table = NULL; q->flags = RT_IS_SELECTABLE; /* * Absolute max limits from the firmware configuration. Practical - * limits depend on the burstsize, pktsize (ifp->if_mtu ultimately) and + * limits depend on the burstsize, pktsize (if_getmtu(ifp) ultimately) and * the card's cclk. */ q->max_flows = sc->tids.netids; q->number_of_rates = sc->params.nsched_cls; q->min_segment_burst = 4; /* matches PKTSCHED_BURST in the firmware. */ #if 1 if (chip_id(sc) < CHELSIO_T6) { /* Based on testing by rrs@ with a T580 at burstsize = 4. */ MPASS(q->min_segment_burst == 4); q->max_flows = min(4000, q->max_flows); } else { /* XXX: TBD, carried forward from T5 for now. 
*/ q->max_flows = min(4000, q->max_flows); } /* * XXX: tcp_ratelimit.c grabs all available rates on link-up before it * even knows whether hw pacing will be used or not. This prevents * other consumers like SO_MAX_PACING_RATE or those using cxgbetool or * the private ioctls from using any of traffic classes. * * Underreport the number of rates to tcp_ratelimit so that it doesn't * hog all of them. This can be removed if/when tcp_ratelimit switches * to making its allocations on first-use rather than link-up. There is * nothing wrong with one particular consumer reserving all the classes * but it should do so only if it'll actually use hw rate limiting. */ q->number_of_rates /= 4; #endif } #endif diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index 8cd9be275126..89d8f1b0462a 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -1,6925 +1,6925 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_NETMAP #include #include #include #include #include #endif #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_msg.h" #include "t4_l2t.h" #include "t4_mp_ring.h" #ifdef T4_PKT_TIMESTAMP #define RX_COPY_THRESHOLD (MINCLSIZE - 8) #else #define RX_COPY_THRESHOLD MINCLSIZE #endif /* * Ethernet frames are DMA'd at this byte offset into the freelist buffer. * 0-7 are valid values. */ static int fl_pktshift = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pktshift, CTLFLAG_RDTUN, &fl_pktshift, 0, "payload DMA offset in rx buffer (bytes)"); /* * Pad ethernet payload up to this boundary. * -1: driver should figure out a good value. * 0: disable padding. * Any power of 2 from 32 to 4096 (both inclusive) is also a valid value. 
*/ int fl_pad = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pad, CTLFLAG_RDTUN, &fl_pad, 0, "payload pad boundary (bytes)"); /* * Status page length. * -1: driver should figure out a good value. * 64 or 128 are the only other valid values. */ static int spg_len = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, spg_len, CTLFLAG_RDTUN, &spg_len, 0, "status page size (bytes)"); /* * Congestion drops. * -1: no congestion feedback (not recommended). * 0: backpressure the channel instead of dropping packets right away. * 1: no backpressure, drop packets for the congested queue immediately. * 2: both backpressure and drop. */ static int cong_drop = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, cong_drop, CTLFLAG_RDTUN, &cong_drop, 0, "Congestion control for NIC RX queues (0 = backpressure, 1 = drop, 2 = both"); #ifdef TCP_OFFLOAD static int ofld_cong_drop = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, ofld_cong_drop, CTLFLAG_RDTUN, &ofld_cong_drop, 0, "Congestion control for TOE RX queues (0 = backpressure, 1 = drop, 2 = both"); #endif /* * Deliver multiple frames in the same free list buffer if they fit. * -1: let the driver decide whether to enable buffer packing or not. * 0: disable buffer packing. * 1: enable buffer packing. */ static int buffer_packing = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, buffer_packing, CTLFLAG_RDTUN, &buffer_packing, 0, "Enable buffer packing"); /* * Start next frame in a packed buffer at this boundary. * -1: driver should figure out a good value. * T4: driver will ignore this and use the same value as fl_pad above. * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value. */ static int fl_pack = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pack, CTLFLAG_RDTUN, &fl_pack, 0, "payload pack boundary (bytes)"); /* * Largest rx cluster size that the driver is allowed to allocate. */ static int largest_rx_cluster = MJUM16BYTES; SYSCTL_INT(_hw_cxgbe, OID_AUTO, largest_rx_cluster, CTLFLAG_RDTUN, &largest_rx_cluster, 0, "Largest rx cluster (bytes)"); /* * Size of cluster allocation that's most likely to succeed. The driver will * fall back to this size if it fails to allocate clusters larger than this. */ static int safest_rx_cluster = PAGE_SIZE; SYSCTL_INT(_hw_cxgbe, OID_AUTO, safest_rx_cluster, CTLFLAG_RDTUN, &safest_rx_cluster, 0, "Safe rx cluster (bytes)"); #ifdef RATELIMIT /* * Knob to control TCP timestamp rewriting, and the granularity of the tick used * for rewriting. -1 and 0-3 are all valid values. * -1: hardware should leave the TCP timestamps alone. * 0: 1ms * 1: 100us * 2: 10us * 3: 1us */ static int tsclk = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, tsclk, CTLFLAG_RDTUN, &tsclk, 0, "Control TCP timestamp rewriting when using pacing"); static int eo_max_backlog = 1024 * 1024; SYSCTL_INT(_hw_cxgbe, OID_AUTO, eo_max_backlog, CTLFLAG_RDTUN, &eo_max_backlog, 0, "Maximum backlog of ratelimited data per flow"); #endif /* * The interrupt holdoff timers are multiplied by this value on T6+. * 1 and 3-17 (both inclusive) are legal values. */ static int tscale = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, tscale, CTLFLAG_RDTUN, &tscale, 0, "Interrupt holdoff timer scale on T6+"); /* * Number of LRO entries in the lro_ctrl structure per rx queue. */ static int lro_entries = TCP_LRO_ENTRIES; SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_entries, CTLFLAG_RDTUN, &lro_entries, 0, "Number of LRO entries per RX queue"); /* * This enables presorting of frames before they're fed into tcp_lro_rx. 
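 * Presorting is in effect only when this is set to a non-zero value.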
*/ static int lro_mbufs = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0, "Enable presorting of LRO frames"); static counter_u64_t pullups; SYSCTL_COUNTER_U64(_hw_cxgbe, OID_AUTO, pullups, CTLFLAG_RD, &pullups, "Number of mbuf pullups performed"); static counter_u64_t defrags; SYSCTL_COUNTER_U64(_hw_cxgbe, OID_AUTO, defrags, CTLFLAG_RD, &defrags, "Number of mbuf defrags performed"); static int t4_tx_coalesce = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce, CTLFLAG_RWTUN, &t4_tx_coalesce, 0, "tx coalescing allowed"); /* * The driver will make aggressive attempts at tx coalescing if it sees these * many packets eligible for coalescing in quick succession, with no more than * the specified gap in between the eth_tx calls that delivered the packets. */ static int t4_tx_coalesce_pkts = 32; SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce_pkts, CTLFLAG_RWTUN, &t4_tx_coalesce_pkts, 0, "# of consecutive packets (1 - 255) that will trigger tx coalescing"); static int t4_tx_coalesce_gap = 5; SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_coalesce_gap, CTLFLAG_RWTUN, &t4_tx_coalesce_gap, 0, "tx gap (in microseconds)"); static int service_iq(struct sge_iq *, int); static int service_iq_fl(struct sge_iq *, int); static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t); static int eth_rx(struct adapter *, struct sge_rxq *, const struct iq_desc *, u_int); static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int, int, int, int); static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *); static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t, struct sge_iq *, char *); static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *, struct sysctl_ctx_list *, struct sysctl_oid *); static void free_iq_fl(struct adapter *, struct sge_iq *, struct sge_fl *); static void add_iq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_iq *); static void add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_fl *); static int alloc_iq_fl_hwq(struct vi_info *, struct sge_iq *, struct sge_fl *); static int free_iq_fl_hwq(struct adapter *, struct sge_iq *, struct sge_fl *); static int alloc_fwq(struct adapter *); static void free_fwq(struct adapter *); static int alloc_ctrlq(struct adapter *, int); static void free_ctrlq(struct adapter *, int); static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int, int); static void free_rxq(struct vi_info *, struct sge_rxq *); static void add_rxq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_rxq *); #ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int, int); static void free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *); static void add_ofld_rxq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_ofld_rxq *); #endif static int ctrl_eq_alloc(struct adapter *, struct sge_eq *); static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *); #if defined(TCP_OFFLOAD) || defined(RATELIMIT) static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *); #endif static int alloc_eq(struct adapter *, struct sge_eq *, struct sysctl_ctx_list *, struct sysctl_oid *); static void free_eq(struct adapter *, struct sge_eq *); static void add_eq_sysctls(struct adapter *, struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_eq *); static int alloc_eq_hwq(struct adapter *, struct vi_info *, struct sge_eq *); static int 
free_eq_hwq(struct adapter *, struct vi_info *, struct sge_eq *); static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *, struct sysctl_ctx_list *, struct sysctl_oid *); static void free_wrq(struct adapter *, struct sge_wrq *); static void add_wrq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_wrq *); static int alloc_txq(struct vi_info *, struct sge_txq *, int); static void free_txq(struct vi_info *, struct sge_txq *); static void add_txq_sysctls(struct vi_info *, struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_txq *); #if defined(TCP_OFFLOAD) || defined(RATELIMIT) static int alloc_ofld_txq(struct vi_info *, struct sge_ofld_txq *, int); static void free_ofld_txq(struct vi_info *, struct sge_ofld_txq *); static void add_ofld_txq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_ofld_txq *); #endif static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int); static inline void ring_fl_db(struct adapter *, struct sge_fl *); static int refill_fl(struct adapter *, struct sge_fl *, int); static void refill_sfl(void *); static int find_refill_source(struct adapter *, int, bool); static void add_fl_to_sfl(struct adapter *, struct sge_fl *); static inline void get_pkt_gl(struct mbuf *, struct sglist *); static inline u_int txpkt_len16(u_int, const u_int); static inline u_int txpkt_vm_len16(u_int, const u_int); static inline void calculate_mbuf_len16(struct mbuf *, bool); static inline u_int txpkts0_len16(u_int); static inline u_int txpkts1_len16(void); static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int); static u_int write_txpkt_wr(struct adapter *, struct sge_txq *, struct mbuf *, u_int); static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *, struct mbuf *); static int add_to_txpkts_vf(struct adapter *, struct sge_txq *, struct mbuf *, int, bool *); static int add_to_txpkts_pf(struct adapter *, struct sge_txq *, struct mbuf *, int, bool *); static u_int write_txpkts_wr(struct adapter *, struct sge_txq *); static u_int write_txpkts_vm_wr(struct adapter *, struct sge_txq *); static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int); static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int); static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int); static inline uint16_t read_hw_cidx(struct sge_eq *); static inline u_int reclaimable_tx_desc(struct sge_eq *); static inline u_int total_available_tx_desc(struct sge_eq *); static u_int reclaim_tx_descs(struct sge_txq *, u_int); static void tx_reclaim(void *, int); static __be64 get_flit(struct sglist_seg *, int, int); static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *, struct mbuf *); static int handle_fw_msg(struct sge_iq *, const struct rss_header *, struct mbuf *); static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *); static void wrq_tx_drain(void *, int); static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *); static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS); #ifdef RATELIMIT #if defined(INET) || defined(INET6) static inline u_int txpkt_eo_len16(u_int, u_int, u_int); #endif static int ethofld_fw4_ack(struct sge_iq *, const struct rss_header *, struct mbuf *); -static int ethofld_transmit(struct ifnet *, struct mbuf *); +static int ethofld_transmit(if_t, struct mbuf *); #endif static counter_u64_t extfree_refs; static counter_u64_t extfree_rels; an_handler_t t4_an_handler; fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES]; cpl_handler_t 
t4_cpl_handler[NUM_CPL_CMDS]; cpl_handler_t set_tcb_rpl_handlers[NUM_CPL_COOKIES]; cpl_handler_t l2t_write_rpl_handlers[NUM_CPL_COOKIES]; cpl_handler_t act_open_rpl_handlers[NUM_CPL_COOKIES]; cpl_handler_t abort_rpl_rss_handlers[NUM_CPL_COOKIES]; cpl_handler_t fw4_ack_handlers[NUM_CPL_COOKIES]; void t4_register_an_handler(an_handler_t h) { uintptr_t *loc; MPASS(h == NULL || t4_an_handler == NULL); loc = (uintptr_t *)&t4_an_handler; atomic_store_rel_ptr(loc, (uintptr_t)h); } void t4_register_fw_msg_handler(int type, fw_msg_handler_t h) { uintptr_t *loc; MPASS(type < nitems(t4_fw_msg_handler)); MPASS(h == NULL || t4_fw_msg_handler[type] == NULL); /* * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL * handler dispatch table. Reject any attempt to install a handler for * this subtype. */ MPASS(type != FW_TYPE_RSSCPL); MPASS(type != FW6_TYPE_RSSCPL); loc = (uintptr_t *)&t4_fw_msg_handler[type]; atomic_store_rel_ptr(loc, (uintptr_t)h); } void t4_register_cpl_handler(int opcode, cpl_handler_t h) { uintptr_t *loc; MPASS(opcode < nitems(t4_cpl_handler)); MPASS(h == NULL || t4_cpl_handler[opcode] == NULL); loc = (uintptr_t *)&t4_cpl_handler[opcode]; atomic_store_rel_ptr(loc, (uintptr_t)h); } static int set_tcb_rpl_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1); u_int tid; int cookie; MPASS(m == NULL); tid = GET_TID(cpl); if (is_hpftid(iq->adapter, tid) || is_ftid(iq->adapter, tid)) { /* * The return code for filter-write is put in the CPL cookie so * we have to rely on the hardware tid (is_ftid) to determine * that this is a response to a filter. */ cookie = CPL_COOKIE_FILTER; } else { cookie = G_COOKIE(cpl->cookie); } MPASS(cookie > CPL_COOKIE_RESERVED); MPASS(cookie < nitems(set_tcb_rpl_handlers)); return (set_tcb_rpl_handlers[cookie](iq, rss, m)); } static int l2t_write_rpl_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1); unsigned int cookie; MPASS(m == NULL); cookie = GET_TID(rpl) & F_SYNC_WR ? 
CPL_COOKIE_TOM : CPL_COOKIE_FILTER; return (l2t_write_rpl_handlers[cookie](iq, rss, m)); } static int act_open_rpl_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); u_int cookie = G_TID_COOKIE(G_AOPEN_ATID(be32toh(cpl->atid_status))); MPASS(m == NULL); MPASS(cookie != CPL_COOKIE_RESERVED); return (act_open_rpl_handlers[cookie](iq, rss, m)); } static int abort_rpl_rss_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; u_int cookie; MPASS(m == NULL); if (is_hashfilter(sc)) cookie = CPL_COOKIE_HASHFILTER; else cookie = CPL_COOKIE_TOM; return (abort_rpl_rss_handlers[cookie](iq, rss, m)); } static int fw4_ack_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); u_int cookie; MPASS(m == NULL); if (is_etid(sc, tid)) cookie = CPL_COOKIE_ETHOFLD; else cookie = CPL_COOKIE_TOM; return (fw4_ack_handlers[cookie](iq, rss, m)); } static void t4_init_shared_cpl_handlers(void) { t4_register_cpl_handler(CPL_SET_TCB_RPL, set_tcb_rpl_handler); t4_register_cpl_handler(CPL_L2T_WRITE_RPL, l2t_write_rpl_handler); t4_register_cpl_handler(CPL_ACT_OPEN_RPL, act_open_rpl_handler); t4_register_cpl_handler(CPL_ABORT_RPL_RSS, abort_rpl_rss_handler); t4_register_cpl_handler(CPL_FW4_ACK, fw4_ack_handler); } void t4_register_shared_cpl_handler(int opcode, cpl_handler_t h, int cookie) { uintptr_t *loc; MPASS(opcode < nitems(t4_cpl_handler)); MPASS(cookie > CPL_COOKIE_RESERVED); MPASS(cookie < NUM_CPL_COOKIES); MPASS(t4_cpl_handler[opcode] != NULL); switch (opcode) { case CPL_SET_TCB_RPL: loc = (uintptr_t *)&set_tcb_rpl_handlers[cookie]; break; case CPL_L2T_WRITE_RPL: loc = (uintptr_t *)&l2t_write_rpl_handlers[cookie]; break; case CPL_ACT_OPEN_RPL: loc = (uintptr_t *)&act_open_rpl_handlers[cookie]; break; case CPL_ABORT_RPL_RSS: loc = (uintptr_t *)&abort_rpl_rss_handlers[cookie]; break; case CPL_FW4_ACK: loc = (uintptr_t *)&fw4_ack_handlers[cookie]; break; default: MPASS(0); return; } MPASS(h == NULL || *loc == (uintptr_t)NULL); atomic_store_rel_ptr(loc, (uintptr_t)h); } /* * Called on MOD_LOAD. Validates and calculates the SGE tunables. */ void t4_sge_modload(void) { if (fl_pktshift < 0 || fl_pktshift > 7) { printf("Invalid hw.cxgbe.fl_pktshift value (%d)," " using 0 instead.\n", fl_pktshift); fl_pktshift = 0; } if (spg_len != 64 && spg_len != 128) { int len; #if defined(__i386__) || defined(__amd64__) len = cpu_clflush_line_size > 64 ? 
128 : 64; #else len = 64; #endif if (spg_len != -1) { printf("Invalid hw.cxgbe.spg_len value (%d)," " using %d instead.\n", spg_len, len); } spg_len = len; } if (cong_drop < -1 || cong_drop > 2) { printf("Invalid hw.cxgbe.cong_drop value (%d)," " using 0 instead.\n", cong_drop); cong_drop = 0; } #ifdef TCP_OFFLOAD if (ofld_cong_drop < -1 || ofld_cong_drop > 2) { printf("Invalid hw.cxgbe.ofld_cong_drop value (%d)," " using 0 instead.\n", ofld_cong_drop); ofld_cong_drop = 0; } #endif if (tscale != 1 && (tscale < 3 || tscale > 17)) { printf("Invalid hw.cxgbe.tscale value (%d)," " using 1 instead.\n", tscale); tscale = 1; } if (largest_rx_cluster != MCLBYTES && largest_rx_cluster != MJUMPAGESIZE && largest_rx_cluster != MJUM9BYTES && largest_rx_cluster != MJUM16BYTES) { printf("Invalid hw.cxgbe.largest_rx_cluster value (%d)," " using %d instead.\n", largest_rx_cluster, MJUM16BYTES); largest_rx_cluster = MJUM16BYTES; } if (safest_rx_cluster != MCLBYTES && safest_rx_cluster != MJUMPAGESIZE && safest_rx_cluster != MJUM9BYTES && safest_rx_cluster != MJUM16BYTES) { printf("Invalid hw.cxgbe.safest_rx_cluster value (%d)," " using %d instead.\n", safest_rx_cluster, MJUMPAGESIZE); safest_rx_cluster = MJUMPAGESIZE; } extfree_refs = counter_u64_alloc(M_WAITOK); extfree_rels = counter_u64_alloc(M_WAITOK); pullups = counter_u64_alloc(M_WAITOK); defrags = counter_u64_alloc(M_WAITOK); counter_u64_zero(extfree_refs); counter_u64_zero(extfree_rels); counter_u64_zero(pullups); counter_u64_zero(defrags); t4_init_shared_cpl_handlers(); t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg); t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg); t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update); #ifdef RATELIMIT t4_register_shared_cpl_handler(CPL_FW4_ACK, ethofld_fw4_ack, CPL_COOKIE_ETHOFLD); #endif t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl); t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl); } void t4_sge_modunload(void) { counter_u64_free(extfree_refs); counter_u64_free(extfree_rels); counter_u64_free(pullups); counter_u64_free(defrags); } uint64_t t4_sge_extfree_refs(void) { uint64_t refs, rels; rels = counter_u64_fetch(extfree_rels); refs = counter_u64_fetch(extfree_refs); return (refs - rels); } /* max 4096 */ #define MAX_PACK_BOUNDARY 512 static inline void setup_pad_and_pack_boundaries(struct adapter *sc) { uint32_t v, m; int pad, pack, pad_shift; pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT : X_INGPADBOUNDARY_SHIFT; pad = fl_pad; if (fl_pad < (1 << pad_shift) || fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) || !powerof2(fl_pad)) { /* * If there is any chance that we might use buffer packing and * the chip is a T4, then pick 64 as the pad/pack boundary. Set * it to the minimum allowed in all other cases. */ pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift; /* * For fl_pad = 0 we'll still write a reasonable value to the * register but all the freelists will opt out of padding. * We'll complain here only if the user tried to set it to a * value greater than 0 that was invalid. */ if (fl_pad > 0) { device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value" " (%d), using %d instead.\n", fl_pad, pad); } } m = V_INGPADBOUNDARY(M_INGPADBOUNDARY); v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift); t4_set_reg_field(sc, A_SGE_CONTROL, m, v); if (is_t4(sc)) { if (fl_pack != -1 && fl_pack != pad) { /* Complain but carry on. 
*/ device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored," " using %d instead.\n", fl_pack, pad); } return; } pack = fl_pack; if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 || !powerof2(fl_pack)) { if (sc->params.pci.mps > MAX_PACK_BOUNDARY) pack = MAX_PACK_BOUNDARY; else pack = max(sc->params.pci.mps, CACHE_LINE_SIZE); MPASS(powerof2(pack)); if (pack < 16) pack = 16; if (pack == 32) pack = 64; if (pack > 4096) pack = 4096; if (fl_pack != -1) { device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value" " (%d), using %d instead.\n", fl_pack, pack); } } m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY); if (pack == 16) v = V_INGPACKBOUNDARY(0); else v = V_INGPACKBOUNDARY(ilog2(pack) - 5); MPASS(!is_t4(sc)); /* T4 doesn't have SGE_CONTROL2 */ t4_set_reg_field(sc, A_SGE_CONTROL2, m, v); } /* * adap->params.vpd.cclk must be set up before this is called. */ void t4_tweak_chip_settings(struct adapter *sc) { int i, reg; uint32_t v, m; int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200}; int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk; int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */ uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); static int sw_buf_sizes[] = { MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES }; KASSERT(sc->flags & MASTER_PF, ("%s: trying to change chip settings when not master.", __func__)); m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE; v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE | V_EGRSTATUSPAGESIZE(spg_len == 128); t4_set_reg_field(sc, A_SGE_CONTROL, m, v); setup_pad_and_pack_boundaries(sc); v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10); t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v); t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0, 4096); t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE1, 65536); reg = A_SGE_FL_BUFFER_SIZE2; for (i = 0; i < nitems(sw_buf_sizes); i++) { MPASS(reg <= A_SGE_FL_BUFFER_SIZE15); t4_write_reg(sc, reg, sw_buf_sizes[i]); reg += 4; MPASS(reg <= A_SGE_FL_BUFFER_SIZE15); t4_write_reg(sc, reg, sw_buf_sizes[i] - CL_METADATA_SIZE); reg += 4; } v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) | V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]); t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v); KASSERT(intr_timer[0] <= timer_max, ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0], timer_max)); for (i = 1; i < nitems(intr_timer); i++) { KASSERT(intr_timer[i] >= intr_timer[i - 1], ("%s: timers not listed in increasing order (%d)", __func__, i)); while (intr_timer[i] > timer_max) { if (i == nitems(intr_timer) - 1) { intr_timer[i] = timer_max; break; } intr_timer[i] += intr_timer[i - 1]; intr_timer[i] /= 2; } } v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) | V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1])); t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v); v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) | V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3])); t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v); v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) | V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5])); t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v); if (chip_id(sc) >= CHELSIO_T6) { m = V_TSCALE(M_TSCALE); if (tscale == 1) v = 0; else v = V_TSCALE(tscale - 2); t4_set_reg_field(sc, A_SGE_ITP_CONTROL, m, v); if 
(sc->debug_flags & DF_DISABLE_TCB_CACHE) { m = V_RDTHRESHOLD(M_RDTHRESHOLD) | F_WRTHRTHRESHEN | V_WRTHRTHRESH(M_WRTHRTHRESH); t4_tp_pio_read(sc, &v, 1, A_TP_CMM_CONFIG, 1); v &= ~m; v |= V_RDTHRESHOLD(1) | F_WRTHRTHRESHEN | V_WRTHRTHRESH(16); t4_tp_pio_write(sc, &v, 1, A_TP_CMM_CONFIG, 1); } } /* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */ v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v); /* * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP. These have been * chosen with MAXPHYS = 128K in mind. The largest DDP buffer that we * may have to deal with is MAXPHYS + 1 page. */ v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4); t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v); /* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */ m = v = F_TDDPTAGTCB | F_ISCSITAGTCB; t4_set_reg_field(sc, A_ULP_RX_CTL, m, v); m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; t4_set_reg_field(sc, A_TP_PARA_REG5, m, v); } /* * SGE wants the buffer to be at least 64B and then a multiple of 16. Its * address mut be 16B aligned. If padding is in use the buffer's start and end * need to be aligned to the pad boundary as well. We'll just make sure that * the size is a multiple of the pad boundary here, it is up to the buffer * allocation code to make sure the start of the buffer is aligned. */ static inline int hwsz_ok(struct adapter *sc, int hwsz) { int mask = fl_pad ? sc->params.sge.pad_boundary - 1 : 16 - 1; return (hwsz >= 64 && (hwsz & mask) == 0); } /* * Initialize the rx buffer sizes and figure out which zones the buffers will * be allocated from. */ void t4_init_rx_buf_info(struct adapter *sc) { struct sge *s = &sc->sge; struct sge_params *sp = &sc->params.sge; int i, j, n; static int sw_buf_sizes[] = { /* Sorted by size */ MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES }; struct rx_buf_info *rxb; s->safe_zidx = -1; rxb = &s->rx_buf_info[0]; for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) { rxb->size1 = sw_buf_sizes[i]; rxb->zone = m_getzone(rxb->size1); rxb->type = m_gettype(rxb->size1); rxb->size2 = 0; rxb->hwidx1 = -1; rxb->hwidx2 = -1; for (j = 0; j < SGE_FLBUF_SIZES; j++) { int hwsize = sp->sge_fl_buffer_size[j]; if (!hwsz_ok(sc, hwsize)) continue; /* hwidx for size1 */ if (rxb->hwidx1 == -1 && rxb->size1 == hwsize) rxb->hwidx1 = j; /* hwidx for size2 (buffer packing) */ if (rxb->size1 - CL_METADATA_SIZE < hwsize) continue; n = rxb->size1 - hwsize - CL_METADATA_SIZE; if (n == 0) { rxb->hwidx2 = j; rxb->size2 = hwsize; break; /* stop looking */ } if (rxb->hwidx2 != -1) { if (n < sp->sge_fl_buffer_size[rxb->hwidx2] - hwsize - CL_METADATA_SIZE) { rxb->hwidx2 = j; rxb->size2 = hwsize; } } else if (n <= 2 * CL_METADATA_SIZE) { rxb->hwidx2 = j; rxb->size2 = hwsize; } } if (rxb->hwidx2 != -1) sc->flags |= BUF_PACKING_OK; if (s->safe_zidx == -1 && rxb->size1 == safest_rx_cluster) s->safe_zidx = i; } } /* * Verify some basic SGE settings for the PF and VF driver, and other * miscellaneous settings for the PF driver. 
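 * Returns EINVAL if a setting the driver relies on has an unexpected value,
 * 0 otherwise.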
*/ int t4_verify_chip_settings(struct adapter *sc) { struct sge_params *sp = &sc->params.sge; uint32_t m, v, r; int rc = 0; const uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); m = F_RXPKTCPLMODE; v = F_RXPKTCPLMODE; r = sp->sge_control; if ((r & m) != v) { device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r); rc = EINVAL; } /* * If this changes then every single use of PAGE_SHIFT in the driver * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift. */ if (sp->page_shift != PAGE_SHIFT) { device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r); rc = EINVAL; } if (sc->flags & IS_VF) return (0); v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ); if (r != v) { device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r); if (sc->vres.ddp.size != 0) rc = EINVAL; } m = v = F_TDDPTAGTCB; r = t4_read_reg(sc, A_ULP_RX_CTL); if ((r & m) != v) { device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r); if (sc->vres.ddp.size != 0) rc = EINVAL; } m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; r = t4_read_reg(sc, A_TP_PARA_REG5); if ((r & m) != v) { device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r); if (sc->vres.ddp.size != 0) rc = EINVAL; } return (rc); } int t4_create_dma_tag(struct adapter *sc) { int rc; rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE, BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->dmat); if (rc != 0) { device_printf(sc->dev, "failed to create main DMA tag: %d\n", rc); } return (rc); } void t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *children) { struct sge_params *sp = &sc->params.sge; SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, sysctl_bufsizes, "A", "freelist buffer sizes"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD, NULL, sp->fl_pktshift, "payload DMA offset in rx buffer (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD, NULL, sp->pad_boundary, "payload pad boundary (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD, NULL, sp->spg_len, "status page size (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD, NULL, cong_drop, "congestion drop setting"); #ifdef TCP_OFFLOAD SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ofld_cong_drop", CTLFLAG_RD, NULL, ofld_cong_drop, "congestion drop setting"); #endif SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD, NULL, sp->pack_boundary, "payload pack boundary (bytes)"); } int t4_destroy_dma_tag(struct adapter *sc) { if (sc->dmat) bus_dma_tag_destroy(sc->dmat); return (0); } /* * Allocate and initialize the firmware event queue, control queues, and special * purpose rx queues owned by the adapter. * * Returns errno on failure. Resources allocated up to that point may still be * allocated. Caller is responsible for cleanup in case this function fails. */ int t4_setup_adapter_queues(struct adapter *sc) { int rc, i; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); /* * Firmware event queue */ rc = alloc_fwq(sc); if (rc != 0) return (rc); /* * That's all for the VF driver. */ if (sc->flags & IS_VF) return (rc); /* * XXX: General purpose rx queues, one per port. */ /* * Control queues, one per port. 
*/ for_each_port(sc, i) { rc = alloc_ctrlq(sc, i); if (rc != 0) return (rc); } return (rc); } /* * Idempotent */ int t4_teardown_adapter_queues(struct adapter *sc) { int i; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); if (sc->sge.ctrlq != NULL) { MPASS(!(sc->flags & IS_VF)); /* VFs don't allocate ctrlq. */ for_each_port(sc, i) free_ctrlq(sc, i); } free_fwq(sc); return (0); } /* Maximum payload that could arrive with a single iq descriptor. */ static inline int -max_rx_payload(struct adapter *sc, struct ifnet *ifp, const bool ofld) +max_rx_payload(struct adapter *sc, if_t ifp, const bool ofld) { int maxp; /* large enough even when hw VLAN extraction is disabled */ maxp = sc->params.sge.fl_pktshift + ETHER_HDR_LEN + - ETHER_VLAN_ENCAP_LEN + ifp->if_mtu; + ETHER_VLAN_ENCAP_LEN + if_getmtu(ifp); if (ofld && sc->tt.tls && sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS && maxp < sc->params.tp.max_rx_pdu) maxp = sc->params.tp.max_rx_pdu; return (maxp); } int t4_setup_vi_queues(struct vi_info *vi) { int rc = 0, i, intr_idx; struct sge_rxq *rxq; struct sge_txq *txq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #if defined(TCP_OFFLOAD) || defined(RATELIMIT) struct sge_ofld_txq *ofld_txq; #endif #ifdef DEV_NETMAP int saved_idx, iqidx; struct sge_nm_rxq *nm_rxq; struct sge_nm_txq *nm_txq; #endif struct adapter *sc = vi->adapter; - struct ifnet *ifp = vi->ifp; + if_t ifp = vi->ifp; int maxp; /* Interrupt vector to start from (when using multiple vectors) */ intr_idx = vi->first_intr; #ifdef DEV_NETMAP saved_idx = intr_idx; - if (ifp->if_capabilities & IFCAP_NETMAP) { + if (if_getcapabilities(ifp) & IFCAP_NETMAP) { /* netmap is supported with direct interrupts only. */ MPASS(!forwarding_intr_to_fwq(sc)); MPASS(vi->first_intr >= 0); /* * We don't have buffers to back the netmap rx queues * right now so we create the queues in a way that * doesn't set off any congestion signal in the chip. */ for_each_nm_rxq(vi, i, nm_rxq) { rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i); if (rc != 0) goto done; intr_idx++; } for_each_nm_txq(vi, i, nm_txq) { iqidx = vi->first_nm_rxq + (i % vi->nnmrxq); rc = alloc_nm_txq(vi, nm_txq, iqidx, i); if (rc != 0) goto done; } } /* Normal rx queues and netmap rx queues share the same interrupts. */ intr_idx = saved_idx; #endif /* * Allocate rx queues first because a default iqid is required when * creating a tx queue. */ maxp = max_rx_payload(sc, ifp, false); for_each_rxq(vi, i, rxq) { rc = alloc_rxq(vi, rxq, i, intr_idx, maxp); if (rc != 0) goto done; if (!forwarding_intr_to_fwq(sc)) intr_idx++; } #ifdef DEV_NETMAP - if (ifp->if_capabilities & IFCAP_NETMAP) + if (if_getcapabilities(ifp) & IFCAP_NETMAP) intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq); #endif #ifdef TCP_OFFLOAD maxp = max_rx_payload(sc, ifp, true); for_each_ofld_rxq(vi, i, ofld_rxq) { rc = alloc_ofld_rxq(vi, ofld_rxq, i, intr_idx, maxp); if (rc != 0) goto done; if (!forwarding_intr_to_fwq(sc)) intr_idx++; } #endif /* * Now the tx queues. 
*/ for_each_txq(vi, i, txq) { rc = alloc_txq(vi, txq, i); if (rc != 0) goto done; } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) for_each_ofld_txq(vi, i, ofld_txq) { rc = alloc_ofld_txq(vi, ofld_txq, i); if (rc != 0) goto done; } #endif done: if (rc) t4_teardown_vi_queues(vi); return (rc); } /* * Idempotent */ int t4_teardown_vi_queues(struct vi_info *vi) { int i; struct sge_rxq *rxq; struct sge_txq *txq; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) struct sge_ofld_txq *ofld_txq; #endif #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #ifdef DEV_NETMAP struct sge_nm_rxq *nm_rxq; struct sge_nm_txq *nm_txq; #endif #ifdef DEV_NETMAP - if (vi->ifp->if_capabilities & IFCAP_NETMAP) { + if (if_getcapabilities(vi->ifp) & IFCAP_NETMAP) { for_each_nm_txq(vi, i, nm_txq) { free_nm_txq(vi, nm_txq); } for_each_nm_rxq(vi, i, nm_rxq) { free_nm_rxq(vi, nm_rxq); } } #endif /* * Take down all the tx queues first, as they reference the rx queues * (for egress updates, etc.). */ for_each_txq(vi, i, txq) { free_txq(vi, txq); } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) for_each_ofld_txq(vi, i, ofld_txq) { free_ofld_txq(vi, ofld_txq); } #endif /* * Then take down the rx queues. */ for_each_rxq(vi, i, rxq) { free_rxq(vi, rxq); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { free_ofld_rxq(vi, ofld_rxq); } #endif return (0); } /* * Interrupt handler when the driver is using only 1 interrupt. This is a very * unusual scenario. * * a) Deals with errors, if any. * b) Services firmware event queue, which is taking interrupts for all other * queues. */ void t4_intr_all(void *arg) { struct adapter *sc = arg; struct sge_iq *fwq = &sc->sge.fwq; MPASS(sc->intr_count == 1); if (sc->intr_type == INTR_INTX) t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0); t4_intr_err(arg); t4_intr_evt(fwq); } /* * Interrupt handler for errors (installed directly when multiple interrupts are * being used, or called by t4_intr_all). */ void t4_intr_err(void *arg) { struct adapter *sc = arg; uint32_t v; const bool verbose = (sc->debug_flags & DF_VERBOSE_SLOWINTR) != 0; if (atomic_load_int(&sc->error_flags) & ADAP_FATAL_ERR) return; v = t4_read_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE)); if (v & F_PFSW) { sc->swintr++; t4_write_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE), v); } if (t4_slow_intr_handler(sc, verbose)) t4_fatal_err(sc, false); } /* * Interrupt handler for iq-only queues. The firmware event queue is the only * such queue right now. */ void t4_intr_evt(void *arg) { struct sge_iq *iq = arg; if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { service_iq(iq, 0); (void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); } } /* * Interrupt handler for iq+fl queues. */ void t4_intr(void *arg) { struct sge_iq *iq = arg; if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { service_iq_fl(iq, 0); (void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); } } #ifdef DEV_NETMAP /* * Interrupt handler for netmap rx queues. */ void t4_nm_intr(void *arg) { struct sge_nm_rxq *nm_rxq = arg; if (atomic_cmpset_int(&nm_rxq->nm_state, NM_ON, NM_BUSY)) { service_nm_rxq(nm_rxq); (void) atomic_cmpset_int(&nm_rxq->nm_state, NM_BUSY, NM_ON); } } /* * Interrupt handler for vectors shared between NIC and netmap rx queues. */ void t4_vi_intr(void *arg) { struct irq *irq = arg; MPASS(irq->nm_rxq != NULL); t4_nm_intr(irq->nm_rxq); MPASS(irq->rxq != NULL); t4_intr(irq->rxq); } #endif /* * Deals with interrupts on an iq-only (no freelist) queue. 
*/ static int service_iq(struct sge_iq *iq, int budget) { struct sge_iq *q; struct adapter *sc = iq->adapter; struct iq_desc *d = &iq->desc[iq->cidx]; int ndescs = 0, limit; int rsp_type; uint32_t lq; STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql); KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq)); KASSERT((iq->flags & IQ_HAS_FL) == 0, ("%s: called for iq %p with fl (iq->flags 0x%x)", __func__, iq, iq->flags)); MPASS((iq->flags & IQ_ADJ_CREDIT) == 0); MPASS((iq->flags & IQ_LRO_ENABLED) == 0); limit = budget ? budget : iq->qsize / 16; /* * We always come back and check the descriptor ring for new indirect * interrupts and other responses after running a single handler. */ for (;;) { while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) { rmb(); rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen); lq = be32toh(d->rsp.pldbuflen_qid); switch (rsp_type) { case X_RSPD_TYPE_FLBUF: panic("%s: data for an iq (%p) with no freelist", __func__, iq); /* NOTREACHED */ case X_RSPD_TYPE_CPL: KASSERT(d->rss.opcode < NUM_CPL_CMDS, ("%s: bad opcode %02x.", __func__, d->rss.opcode)); t4_cpl_handler[d->rss.opcode](iq, &d->rss, NULL); break; case X_RSPD_TYPE_INTR: /* * There are 1K interrupt-capable queues (qids 0 * through 1023). A response type indicating a * forwarded interrupt with a qid >= 1K is an * iWARP async notification. */ if (__predict_true(lq >= 1024)) { t4_an_handler(iq, &d->rsp); break; } q = sc->sge.iqmap[lq - sc->sge.iq_start - sc->sge.iq_base]; if (atomic_cmpset_int(&q->state, IQS_IDLE, IQS_BUSY)) { if (service_iq_fl(q, q->qsize / 16) == 0) { (void) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE); } else { STAILQ_INSERT_TAIL(&iql, q, link); } } break; default: KASSERT(0, ("%s: illegal response type %d on iq %p", __func__, rsp_type, iq)); log(LOG_ERR, "%s: illegal response type %d on iq %p", device_get_nameunit(sc->dev), rsp_type, iq); break; } d++; if (__predict_false(++iq->cidx == iq->sidx)) { iq->cidx = 0; iq->gen ^= F_RSPD_GEN; d = &iq->desc[0]; } if (__predict_false(++ndescs == limit)) { t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | V_INGRESSQID(iq->cntxt_id) | V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX))); ndescs = 0; if (budget) { return (EINPROGRESS); } } } if (STAILQ_EMPTY(&iql)) break; /* * Process the head only, and send it to the back of the list if * it's still not done. */ q = STAILQ_FIRST(&iql); STAILQ_REMOVE_HEAD(&iql, link); if (service_iq_fl(q, q->qsize / 8) == 0) (void) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE); else STAILQ_INSERT_TAIL(&iql, q, link); } t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params)); return (0); } #if defined(INET) || defined(INET6) static inline int sort_before_lro(struct lro_ctrl *lro) { return (lro->lro_mbuf_max != 0); } #endif #define CGBE_SHIFT_SCALE 10 static inline uint64_t t4_tstmp_to_ns(struct adapter *sc, uint64_t lf) { struct clock_sync *cur, dcur; uint64_t hw_clocks; uint64_t hw_clk_div; sbintime_t sbt_cur_to_prev, sbt; uint64_t hw_tstmp = lf & 0xfffffffffffffffULL; /* 60b, not 64b. 
*/ seqc_t gen; for (;;) { cur = &sc->cal_info[sc->cal_current]; gen = seqc_read(&cur->gen); if (gen == 0) return (0); dcur = *cur; if (seqc_consistent(&cur->gen, gen)) break; } /* * Our goal here is to have a result that is: * * ( (cur_time - prev_time) ) * ((hw_tstmp - hw_prev) * ----------------------------- ) + prev_time * ( (hw_cur - hw_prev) ) * * With the constraints that we cannot use float and we * don't want to overflow the uint64_t numbers we are using. */ hw_clocks = hw_tstmp - dcur.hw_prev; sbt_cur_to_prev = (dcur.sbt_cur - dcur.sbt_prev); hw_clk_div = dcur.hw_cur - dcur.hw_prev; sbt = hw_clocks * sbt_cur_to_prev / hw_clk_div + dcur.sbt_prev; return (sbttons(sbt)); } static inline void move_to_next_rxbuf(struct sge_fl *fl) { fl->rx_offset = 0; if (__predict_false((++fl->cidx & 7) == 0)) { uint16_t cidx = fl->cidx >> 3; if (__predict_false(cidx == fl->sidx)) fl->cidx = cidx = 0; fl->hw_cidx = cidx; } } /* * Deals with interrupts on an iq+fl queue. */ static int service_iq_fl(struct sge_iq *iq, int budget) { struct sge_rxq *rxq = iq_to_rxq(iq); struct sge_fl *fl; struct adapter *sc = iq->adapter; struct iq_desc *d = &iq->desc[iq->cidx]; int ndescs, limit; int rsp_type, starved; uint32_t lq; uint16_t fl_hw_cidx; struct mbuf *m0; #if defined(INET) || defined(INET6) const struct timeval lro_timeout = {0, sc->lro_timeout}; struct lro_ctrl *lro = &rxq->lro; #endif KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq)); MPASS(iq->flags & IQ_HAS_FL); ndescs = 0; #if defined(INET) || defined(INET6) if (iq->flags & IQ_ADJ_CREDIT) { MPASS(sort_before_lro(lro)); iq->flags &= ~IQ_ADJ_CREDIT; if ((d->rsp.u.type_gen & F_RSPD_GEN) != iq->gen) { tcp_lro_flush_all(lro); t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(1) | V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params)); return (0); } ndescs = 1; } #else MPASS((iq->flags & IQ_ADJ_CREDIT) == 0); #endif limit = budget ? budget : iq->qsize / 16; fl = &rxq->fl; fl_hw_cidx = fl->hw_cidx; /* stable snapshot */ while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) { rmb(); m0 = NULL; rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen); lq = be32toh(d->rsp.pldbuflen_qid); switch (rsp_type) { case X_RSPD_TYPE_FLBUF: if (lq & F_RSPD_NEWBUF) { if (fl->rx_offset > 0) move_to_next_rxbuf(fl); lq = G_RSPD_LEN(lq); } if (IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 4) { FL_LOCK(fl); refill_fl(sc, fl, 64); FL_UNLOCK(fl); fl_hw_cidx = fl->hw_cidx; } if (d->rss.opcode == CPL_RX_PKT) { if (__predict_true(eth_rx(sc, rxq, d, lq) == 0)) break; goto out; } m0 = get_fl_payload(sc, fl, lq); if (__predict_false(m0 == NULL)) goto out; /* fall through */ case X_RSPD_TYPE_CPL: KASSERT(d->rss.opcode < NUM_CPL_CMDS, ("%s: bad opcode %02x.", __func__, d->rss.opcode)); t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0); break; case X_RSPD_TYPE_INTR: /* * There are 1K interrupt-capable queues (qids 0 * through 1023). A response type indicating a * forwarded interrupt with a qid >= 1K is an * iWARP async notification. That is the only * acceptable indirect interrupt on this queue. 
*/ if (__predict_false(lq < 1024)) { panic("%s: indirect interrupt on iq_fl %p " "with qid %u", __func__, iq, lq); } t4_an_handler(iq, &d->rsp); break; default: KASSERT(0, ("%s: illegal response type %d on iq %p", __func__, rsp_type, iq)); log(LOG_ERR, "%s: illegal response type %d on iq %p", device_get_nameunit(sc->dev), rsp_type, iq); break; } d++; if (__predict_false(++iq->cidx == iq->sidx)) { iq->cidx = 0; iq->gen ^= F_RSPD_GEN; d = &iq->desc[0]; } if (__predict_false(++ndescs == limit)) { t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | V_INGRESSQID(iq->cntxt_id) | V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX))); #if defined(INET) || defined(INET6) if (iq->flags & IQ_LRO_ENABLED && !sort_before_lro(lro) && sc->lro_timeout != 0) { tcp_lro_flush_inactive(lro, &lro_timeout); } #endif if (budget) return (EINPROGRESS); ndescs = 0; } } out: #if defined(INET) || defined(INET6) if (iq->flags & IQ_LRO_ENABLED) { if (ndescs > 0 && lro->lro_mbuf_count > 8) { MPASS(sort_before_lro(lro)); /* hold back one credit and don't flush LRO state */ iq->flags |= IQ_ADJ_CREDIT; ndescs--; } else { tcp_lro_flush_all(lro); } } #endif t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params)); FL_LOCK(fl); starved = refill_fl(sc, fl, 64); FL_UNLOCK(fl); if (__predict_false(starved != 0)) add_fl_to_sfl(sc, fl); return (0); } static inline struct cluster_metadata * cl_metadata(struct fl_sdesc *sd) { return ((void *)(sd->cl + sd->moff)); } static void rxb_free(struct mbuf *m) { struct cluster_metadata *clm = m->m_ext.ext_arg1; uma_zfree(clm->zone, clm->cl); counter_u64_add(extfree_rels, 1); } /* * The mbuf returned comes from zone_muf and carries the payload in one of these * ways * a) complete frame inside the mbuf * b) m_cljset (for clusters without metadata) * d) m_extaddref (cluster with metadata) */ static struct mbuf * get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset, int remaining) { struct mbuf *m; struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx]; struct cluster_metadata *clm; int len, blen; caddr_t payload; if (fl->flags & FL_BUF_PACKING) { u_int l, pad; blen = rxb->size2 - fl->rx_offset; /* max possible in this buf */ len = min(remaining, blen); payload = sd->cl + fl->rx_offset; l = fr_offset + len; pad = roundup2(l, fl->buf_boundary) - l; if (fl->rx_offset + len + pad < rxb->size2) blen = len + pad; MPASS(fl->rx_offset + blen <= rxb->size2); } else { MPASS(fl->rx_offset == 0); /* not packing */ blen = rxb->size1; len = min(remaining, blen); payload = sd->cl; } if (fr_offset == 0) { m = m_gethdr(M_NOWAIT, MT_DATA); if (__predict_false(m == NULL)) return (NULL); m->m_pkthdr.len = remaining; } else { m = m_get(M_NOWAIT, MT_DATA); if (__predict_false(m == NULL)) return (NULL); } m->m_len = len; kmsan_mark(payload, len, KMSAN_STATE_INITED); if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) { /* copy data to mbuf */ bcopy(payload, mtod(m, caddr_t), len); if (fl->flags & FL_BUF_PACKING) { fl->rx_offset += blen; MPASS(fl->rx_offset <= rxb->size2); if (fl->rx_offset < rxb->size2) return (m); /* without advancing the cidx */ } } else if (fl->flags & FL_BUF_PACKING) { clm = cl_metadata(sd); if (sd->nmbuf++ == 0) { clm->refcount = 1; clm->zone = rxb->zone; clm->cl = sd->cl; counter_u64_add(extfree_refs, 1); } m_extaddref(m, payload, blen, &clm->refcount, rxb_free, clm, NULL); fl->rx_offset += blen; MPASS(fl->rx_offset <= rxb->size2); if (fl->rx_offset < rxb->size2) return 
(m); /* without advancing the cidx */ } else { m_cljset(m, sd->cl, rxb->type); sd->cl = NULL; /* consumed, not a recycle candidate */ } move_to_next_rxbuf(fl); return (m); } static struct mbuf * get_fl_payload(struct adapter *sc, struct sge_fl *fl, const u_int plen) { struct mbuf *m0, *m, **pnext; u_int remaining; if (__predict_false(fl->flags & FL_BUF_RESUME)) { M_ASSERTPKTHDR(fl->m0); MPASS(fl->m0->m_pkthdr.len == plen); MPASS(fl->remaining < plen); m0 = fl->m0; pnext = fl->pnext; remaining = fl->remaining; fl->flags &= ~FL_BUF_RESUME; goto get_segment; } /* * Payload starts at rx_offset in the current hw buffer. Its length is * 'len' and it may span multiple hw buffers. */ m0 = get_scatter_segment(sc, fl, 0, plen); if (m0 == NULL) return (NULL); remaining = plen - m0->m_len; pnext = &m0->m_next; while (remaining > 0) { get_segment: MPASS(fl->rx_offset == 0); m = get_scatter_segment(sc, fl, plen - remaining, remaining); if (__predict_false(m == NULL)) { fl->m0 = m0; fl->pnext = pnext; fl->remaining = remaining; fl->flags |= FL_BUF_RESUME; return (NULL); } *pnext = m; pnext = &m->m_next; remaining -= m->m_len; } *pnext = NULL; M_ASSERTPKTHDR(m0); return (m0); } static int skip_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset, int remaining) { struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx]; int len, blen; if (fl->flags & FL_BUF_PACKING) { u_int l, pad; blen = rxb->size2 - fl->rx_offset; /* max possible in this buf */ len = min(remaining, blen); l = fr_offset + len; pad = roundup2(l, fl->buf_boundary) - l; if (fl->rx_offset + len + pad < rxb->size2) blen = len + pad; fl->rx_offset += blen; MPASS(fl->rx_offset <= rxb->size2); if (fl->rx_offset < rxb->size2) return (len); /* without advancing the cidx */ } else { MPASS(fl->rx_offset == 0); /* not packing */ blen = rxb->size1; len = min(remaining, blen); } move_to_next_rxbuf(fl); return (len); } static inline void skip_fl_payload(struct adapter *sc, struct sge_fl *fl, int plen) { int remaining, fr_offset, len; fr_offset = 0; remaining = plen; while (remaining > 0) { len = skip_scatter_segment(sc, fl, fr_offset, remaining); fr_offset += len; remaining -= len; } } static inline int get_segment_len(struct adapter *sc, struct sge_fl *fl, int plen) { int len; struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; struct rx_buf_info *rxb = &sc->sge.rx_buf_info[sd->zidx]; if (fl->flags & FL_BUF_PACKING) len = rxb->size2 - fl->rx_offset; else len = rxb->size1; return (min(plen, len)); } static int eth_rx(struct adapter *sc, struct sge_rxq *rxq, const struct iq_desc *d, u_int plen) { struct mbuf *m0; - struct ifnet *ifp = rxq->ifp; + if_t ifp = rxq->ifp; struct sge_fl *fl = &rxq->fl; - struct vi_info *vi = ifp->if_softc; + struct vi_info *vi = if_getsoftc(ifp); const struct cpl_rx_pkt *cpl; #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxq->lro; #endif uint16_t err_vec, tnl_type, tnlhdr_len; static const int sw_hashtype[4][2] = { {M_HASHTYPE_NONE, M_HASHTYPE_NONE}, {M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6}, {M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6}, {M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6}, }; static const int sw_csum_flags[2][2] = { { /* IP, inner IP */ CSUM_ENCAP_VXLAN | CSUM_L3_CALC | CSUM_L3_VALID | CSUM_L4_CALC | CSUM_L4_VALID | CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID | CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID, /* IP, inner IP6 */ CSUM_ENCAP_VXLAN | CSUM_L3_CALC | CSUM_L3_VALID | CSUM_L4_CALC | CSUM_L4_VALID | CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID, 
}, { /* IP6, inner IP */ CSUM_ENCAP_VXLAN | CSUM_L4_CALC | CSUM_L4_VALID | CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID | CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID, /* IP6, inner IP6 */ CSUM_ENCAP_VXLAN | CSUM_L4_CALC | CSUM_L4_VALID | CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID, }, }; MPASS(plen > sc->params.sge.fl_pktshift); if (vi->pfil != NULL && PFIL_HOOKED_IN(vi->pfil) && __predict_true((fl->flags & FL_BUF_RESUME) == 0)) { struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; caddr_t frame; int rc, slen; slen = get_segment_len(sc, fl, plen) - sc->params.sge.fl_pktshift; frame = sd->cl + fl->rx_offset + sc->params.sge.fl_pktshift; - CURVNET_SET_QUIET(ifp->if_vnet); + CURVNET_SET_QUIET(if_getvnet(ifp)); rc = pfil_mem_in(vi->pfil, frame, slen, ifp, &m0); CURVNET_RESTORE(); if (rc == PFIL_DROPPED || rc == PFIL_CONSUMED) { skip_fl_payload(sc, fl, plen); return (0); } if (rc == PFIL_REALLOCED) { skip_fl_payload(sc, fl, plen); goto have_mbuf; } } m0 = get_fl_payload(sc, fl, plen); if (__predict_false(m0 == NULL)) return (ENOMEM); m0->m_pkthdr.len -= sc->params.sge.fl_pktshift; m0->m_len -= sc->params.sge.fl_pktshift; m0->m_data += sc->params.sge.fl_pktshift; have_mbuf: m0->m_pkthdr.rcvif = ifp; M_HASHTYPE_SET(m0, sw_hashtype[d->rss.hash_type][d->rss.ipv6]); m0->m_pkthdr.flowid = be32toh(d->rss.hash_val); cpl = (const void *)(&d->rss + 1); if (sc->params.tp.rx_pkt_encap) { const uint16_t ev = be16toh(cpl->err_vec); err_vec = G_T6_COMPR_RXERR_VEC(ev); tnl_type = G_T6_RX_TNL_TYPE(ev); tnlhdr_len = G_T6_RX_TNLHDR_LEN(ev); } else { err_vec = be16toh(cpl->err_vec); tnl_type = 0; tnlhdr_len = 0; } if (cpl->csum_calc && err_vec == 0) { int ipv6 = !!(cpl->l2info & htobe32(F_RXF_IP6)); /* checksum(s) calculated and found to be correct. */ MPASS((cpl->l2info & htobe32(F_RXF_IP)) ^ (cpl->l2info & htobe32(F_RXF_IP6))); m0->m_pkthdr.csum_data = be16toh(cpl->csum); if (tnl_type == 0) { - if (!ipv6 && ifp->if_capenable & IFCAP_RXCSUM) { + if (!ipv6 && if_getcapenable(ifp) & IFCAP_RXCSUM) { m0->m_pkthdr.csum_flags = CSUM_L3_CALC | CSUM_L3_VALID | CSUM_L4_CALC | CSUM_L4_VALID; - } else if (ipv6 && ifp->if_capenable & IFCAP_RXCSUM_IPV6) { + } else if (ipv6 && if_getcapenable(ifp) & IFCAP_RXCSUM_IPV6) { m0->m_pkthdr.csum_flags = CSUM_L4_CALC | CSUM_L4_VALID; } rxq->rxcsum++; } else { MPASS(tnl_type == RX_PKT_TNL_TYPE_VXLAN); M_HASHTYPE_SETINNER(m0); if (__predict_false(cpl->ip_frag)) { /* * csum_data is for the inner frame (which is an * IP fragment) and is not 0xffff. There is no * way to pass the inner csum_data to the stack. * We don't want the stack to use the inner * csum_data to validate the outer frame or it * will get rejected. So we fix csum_data here * and let sw do the checksum of inner IP * fragments. * * XXX: Need 32b for csum_data2 in an rx mbuf. * Maybe stuff it into rcv_tstmp? */ m0->m_pkthdr.csum_data = 0xffff; if (ipv6) { m0->m_pkthdr.csum_flags = CSUM_L4_CALC | CSUM_L4_VALID; } else { m0->m_pkthdr.csum_flags = CSUM_L3_CALC | CSUM_L3_VALID | CSUM_L4_CALC | CSUM_L4_VALID; } } else { int outer_ipv6; MPASS(m0->m_pkthdr.csum_data == 0xffff); outer_ipv6 = tnlhdr_len >= sizeof(struct ether_header) + sizeof(struct ip6_hdr); m0->m_pkthdr.csum_flags = sw_csum_flags[outer_ipv6][ipv6]; } rxq->vxlan_rxcsum++; } } if (cpl->vlan_ex) { m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan); m0->m_flags |= M_VLANTAG; rxq->vlan_extraction++; } if (rxq->iq.flags & IQ_RX_TIMESTAMP) { /* * Fill up rcv_tstmp but do not set M_TSTMP as * long as we get a non-zero back from t4_tstmp_to_ns(). 
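* (Put differently, and judging only from the code below: rcv_tstmp is
* always written with whatever t4_tstmp_to_ns() produced from the
* descriptor's last flit, and M_TSTMP is set only when that value is
* non-zero, so the stack is never asked to trust a timestamp the
* conversion could not produce.)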
*/ m0->m_pkthdr.rcv_tstmp = t4_tstmp_to_ns(sc, be64toh(d->rsp.u.last_flit)); if (m0->m_pkthdr.rcv_tstmp != 0) m0->m_flags |= M_TSTMP; } #ifdef NUMA - m0->m_pkthdr.numa_domain = ifp->if_numa_domain; + m0->m_pkthdr.numa_domain = if_getnumadomain(ifp); #endif #if defined(INET) || defined(INET6) if (rxq->iq.flags & IQ_LRO_ENABLED && tnl_type == 0 && (M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV4 || M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV6)) { if (sort_before_lro(lro)) { tcp_lro_queue_mbuf(lro, m0); return (0); /* queued for sort, then LRO */ } if (tcp_lro_rx(lro, m0, 0) == 0) return (0); /* queued for LRO */ } #endif - ifp->if_input(ifp, m0); + if_input(ifp, m0); return (0); } /* * Must drain the wrq or make sure that someone else will. */ static void wrq_tx_drain(void *arg, int n) { struct sge_wrq *wrq = arg; struct sge_eq *eq = &wrq->eq; EQ_LOCK(eq); if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(wrq->adapter, wrq); EQ_UNLOCK(eq); } static void drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq) { struct sge_eq *eq = &wrq->eq; u_int available, dbdiff; /* # of hardware descriptors */ u_int n; struct wrqe *wr; struct fw_eth_tx_pkt_wr *dst; /* any fw WR struct will do */ EQ_LOCK_ASSERT_OWNED(eq); MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs)); wr = STAILQ_FIRST(&wrq->wr_list); MPASS(wr != NULL); /* Must be called with something useful to do */ MPASS(eq->pidx == eq->dbidx); dbdiff = 0; do { eq->cidx = read_hw_cidx(eq); if (eq->pidx == eq->cidx) available = eq->sidx - 1; else available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; MPASS(wr->wrq == wrq); n = howmany(wr->wr_len, EQ_ESIZE); if (available < n) break; dst = (void *)&eq->desc[eq->pidx]; if (__predict_true(eq->sidx - eq->pidx > n)) { /* Won't wrap, won't end exactly at the status page. */ bcopy(&wr->wr[0], dst, wr->wr_len); eq->pidx += n; } else { int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE; bcopy(&wr->wr[0], dst, first_portion); if (wr->wr_len > first_portion) { bcopy(&wr->wr[first_portion], &eq->desc[0], wr->wr_len - first_portion); } eq->pidx = n - (eq->sidx - eq->pidx); } wrq->tx_wrs_copied++; if (available < eq->sidx / 4 && atomic_cmpset_int(&eq->equiq, 0, 1)) { /* * XXX: This is not 100% reliable with some * types of WRs. But this is a very unusual * situation for an ofld/ctrl queue anyway. */ dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | F_FW_WR_EQUEQ); } dbdiff += n; if (dbdiff >= 16) { ring_eq_db(sc, eq, dbdiff); dbdiff = 0; } STAILQ_REMOVE_HEAD(&wrq->wr_list, link); free_wrqe(wr); MPASS(wrq->nwr_pending > 0); wrq->nwr_pending--; MPASS(wrq->ndesc_needed >= n); wrq->ndesc_needed -= n; } while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL); if (dbdiff) ring_eq_db(sc, eq, dbdiff); } /* * Doesn't fail. Holds on to work requests it can't send right away. */ void t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr) { #ifdef INVARIANTS struct sge_eq *eq = &wrq->eq; #endif EQ_LOCK_ASSERT_OWNED(eq); MPASS(wr != NULL); MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN); MPASS((wr->wr_len & 0x7) == 0); STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link); wrq->nwr_pending++; wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE); if (!TAILQ_EMPTY(&wrq->incomplete_wrs)) return; /* commit_wrq_wr will drain wr_list as well. */ drain_wrq_wr_list(sc, wrq); /* Doorbell must have caught up to the pidx. 
*/ MPASS(eq->pidx == eq->dbidx); } void -t4_update_fl_bufsize(struct ifnet *ifp) +t4_update_fl_bufsize(if_t ifp) { - struct vi_info *vi = ifp->if_softc; + struct vi_info *vi = if_getsoftc(ifp); struct adapter *sc = vi->adapter; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif struct sge_fl *fl; int i, maxp; maxp = max_rx_payload(sc, ifp, false); for_each_rxq(vi, i, rxq) { fl = &rxq->fl; FL_LOCK(fl); fl->zidx = find_refill_source(sc, maxp, fl->flags & FL_BUF_PACKING); FL_UNLOCK(fl); } #ifdef TCP_OFFLOAD maxp = max_rx_payload(sc, ifp, true); for_each_ofld_rxq(vi, i, ofld_rxq) { fl = &ofld_rxq->fl; FL_LOCK(fl); fl->zidx = find_refill_source(sc, maxp, fl->flags & FL_BUF_PACKING); FL_UNLOCK(fl); } #endif } #ifdef RATELIMIT static inline int mbuf_eo_nsegs(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_loc.eight[1]); } #if defined(INET) || defined(INET6) static inline void set_mbuf_eo_nsegs(struct mbuf *m, uint8_t nsegs) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_loc.eight[1] = nsegs; } #endif static inline int mbuf_eo_len16(struct mbuf *m) { int n; M_ASSERTPKTHDR(m); n = m->m_pkthdr.PH_loc.eight[2]; MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16); return (n); } #if defined(INET) || defined(INET6) static inline void set_mbuf_eo_len16(struct mbuf *m, uint8_t len16) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_loc.eight[2] = len16; } #endif static inline int mbuf_eo_tsclk_tsoff(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_loc.eight[3]); } #if defined(INET) || defined(INET6) static inline void set_mbuf_eo_tsclk_tsoff(struct mbuf *m, uint8_t tsclk_tsoff) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_loc.eight[3] = tsclk_tsoff; } #endif static inline int needs_eo(struct m_snd_tag *mst) { return (mst != NULL && mst->sw->type == IF_SND_TAG_TYPE_RATE_LIMIT); } #endif /* * Try to allocate an mbuf to contain a raw work request. To make it * easy to construct the work request, don't allocate a chain but a * single mbuf. 
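* For illustration only (the length is hypothetical and error handling
* is elided), a typical caller builds the work request directly in the
* mbuf's data area:
*
*	m = alloc_wr_mbuf(len, M_NOWAIT);	// NULL if len > MCLBYTES
*	if (m != NULL)
*		wr = mtod(m, struct fw_wr_hdr *);	// fill the WR in place
*
* The single contiguous buffer is what makes in-place construction easy.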
*/ struct mbuf * alloc_wr_mbuf(int len, int how) { struct mbuf *m; if (len <= MHLEN) m = m_gethdr(how, MT_DATA); else if (len <= MCLBYTES) m = m_getcl(how, MT_DATA, M_PKTHDR); else m = NULL; if (m == NULL) return (NULL); m->m_pkthdr.len = len; m->m_len = len; set_mbuf_cflags(m, MC_RAW_WR); set_mbuf_len16(m, howmany(len, 16)); return (m); } static inline bool needs_hwcsum(struct mbuf *m) { const uint32_t csum_flags = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_TSO | CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO; M_ASSERTPKTHDR(m); return (m->m_pkthdr.csum_flags & csum_flags); } static inline bool needs_tso(struct mbuf *m) { const uint32_t csum_flags = CSUM_IP_TSO | CSUM_IP6_TSO | CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO; M_ASSERTPKTHDR(m); return (m->m_pkthdr.csum_flags & csum_flags); } static inline bool needs_vxlan_csum(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN); } static inline bool needs_vxlan_tso(struct mbuf *m) { const uint32_t csum_flags = CSUM_ENCAP_VXLAN | CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO; M_ASSERTPKTHDR(m); return ((m->m_pkthdr.csum_flags & csum_flags) != 0 && (m->m_pkthdr.csum_flags & csum_flags) != CSUM_ENCAP_VXLAN); } #if defined(INET) || defined(INET6) static inline bool needs_inner_tcp_csum(struct mbuf *m) { const uint32_t csum_flags = CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO; M_ASSERTPKTHDR(m); return (m->m_pkthdr.csum_flags & csum_flags); } #endif static inline bool needs_l3_csum(struct mbuf *m) { const uint32_t csum_flags = CSUM_IP | CSUM_IP_TSO | CSUM_INNER_IP | CSUM_INNER_IP_TSO; M_ASSERTPKTHDR(m); return (m->m_pkthdr.csum_flags & csum_flags); } static inline bool needs_outer_tcp_csum(struct mbuf *m) { const uint32_t csum_flags = CSUM_IP_TCP | CSUM_IP_TSO | CSUM_IP6_TCP | CSUM_IP6_TSO; M_ASSERTPKTHDR(m); return (m->m_pkthdr.csum_flags & csum_flags); } #ifdef RATELIMIT static inline bool needs_outer_l4_csum(struct mbuf *m) { const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_TSO | CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_TSO; M_ASSERTPKTHDR(m); return (m->m_pkthdr.csum_flags & csum_flags); } static inline bool needs_outer_udp_csum(struct mbuf *m) { const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP6_UDP; M_ASSERTPKTHDR(m); return (m->m_pkthdr.csum_flags & csum_flags); } #endif static inline bool needs_vlan_insertion(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_flags & M_VLANTAG); } #if defined(INET) || defined(INET6) static void * m_advance(struct mbuf **pm, int *poffset, int len) { struct mbuf *m = *pm; int offset = *poffset; uintptr_t p = 0; MPASS(len > 0); for (;;) { if (offset + len < m->m_len) { offset += len; p = mtod(m, uintptr_t) + offset; break; } len -= m->m_len - offset; m = m->m_next; offset = 0; MPASS(m != NULL); } *poffset = offset; *pm = m; return ((void *)p); } #endif static inline int count_mbuf_ext_pgs(struct mbuf *m, int skip, vm_paddr_t *nextaddr) { vm_paddr_t paddr; int i, len, off, pglen, pgoff, seglen, segoff; int nsegs = 0; M_ASSERTEXTPG(m); off = mtod(m, vm_offset_t); len = m->m_len; off += skip; len -= skip; if (m->m_epg_hdrlen != 0) { if (off >= m->m_epg_hdrlen) { off -= m->m_epg_hdrlen; } else { seglen = m->m_epg_hdrlen - off; segoff = off; seglen = min(seglen, len); off = 0; len -= seglen; paddr = pmap_kextract( (vm_offset_t)&m->m_epg_hdr[segoff]); if (*nextaddr != paddr) nsegs++; *nextaddr = paddr + seglen; } } pgoff = m->m_epg_1st_off; for (i = 
0; i < m->m_epg_npgs && len > 0; i++) { pglen = m_epg_pagelen(m, i, pgoff); if (off >= pglen) { off -= pglen; pgoff = 0; continue; } seglen = pglen - off; segoff = pgoff + off; off = 0; seglen = min(seglen, len); len -= seglen; paddr = m->m_epg_pa[i] + segoff; if (*nextaddr != paddr) nsegs++; *nextaddr = paddr + seglen; pgoff = 0; }; if (len != 0) { seglen = min(len, m->m_epg_trllen - off); len -= seglen; paddr = pmap_kextract((vm_offset_t)&m->m_epg_trail[off]); if (*nextaddr != paddr) nsegs++; *nextaddr = paddr + seglen; } return (nsegs); } /* * Can deal with empty mbufs in the chain that have m_len = 0, but the chain * must have at least one mbuf that's not empty. It is possible for this * routine to return 0 if skip accounts for all the contents of the mbuf chain. */ static inline int count_mbuf_nsegs(struct mbuf *m, int skip, uint8_t *cflags) { vm_paddr_t nextaddr, paddr; vm_offset_t va; int len, nsegs; M_ASSERTPKTHDR(m); MPASS(m->m_pkthdr.len > 0); MPASS(m->m_pkthdr.len >= skip); nsegs = 0; nextaddr = 0; for (; m; m = m->m_next) { len = m->m_len; if (__predict_false(len == 0)) continue; if (skip >= len) { skip -= len; continue; } if ((m->m_flags & M_EXTPG) != 0) { *cflags |= MC_NOMAP; nsegs += count_mbuf_ext_pgs(m, skip, &nextaddr); skip = 0; continue; } va = mtod(m, vm_offset_t) + skip; len -= skip; skip = 0; paddr = pmap_kextract(va); nsegs += sglist_count((void *)(uintptr_t)va, len); if (paddr == nextaddr) nsegs--; nextaddr = pmap_kextract(va + len - 1) + 1; } return (nsegs); } /* * The maximum number of segments that can fit in a WR. */ static int max_nsegs_allowed(struct mbuf *m, bool vm_wr) { if (vm_wr) { if (needs_tso(m)) return (TX_SGL_SEGS_VM_TSO); return (TX_SGL_SEGS_VM); } if (needs_tso(m)) { if (needs_vxlan_tso(m)) return (TX_SGL_SEGS_VXLAN_TSO); else return (TX_SGL_SEGS_TSO); } return (TX_SGL_SEGS); } static struct timeval txerr_ratecheck = {0}; static const struct timeval txerr_interval = {3, 0}; /* * Analyze the mbuf to determine its tx needs. The mbuf passed in may change: * a) caller can assume it's been freed if this function returns with an error. * b) it may get defragged up if the gather list is too long for the hardware. */ int parse_pkt(struct mbuf **mp, bool vm_wr) { struct mbuf *m0 = *mp, *m; int rc, nsegs, defragged = 0; struct ether_header *eh; #ifdef INET void *l3hdr; #endif #if defined(INET) || defined(INET6) int offset; struct tcphdr *tcp; #endif #if defined(KERN_TLS) || defined(RATELIMIT) struct m_snd_tag *mst; #endif uint16_t eh_type; uint8_t cflags; cflags = 0; M_ASSERTPKTHDR(m0); if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) { rc = EINVAL; fail: m_freem(m0); *mp = NULL; return (rc); } restart: /* * First count the number of gather list segments in the payload. * Defrag the mbuf if nsegs exceeds the hardware limit. 
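* For example, a chain that maps to more segments than
* max_nsegs_allowed() permits is m_defrag'ed at most once (and the
* 'defrags' counter is bumped); if the defragged chain still needs too
* many segments the packet is dropped with EFBIG.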
*/ M_ASSERTPKTHDR(m0); MPASS(m0->m_pkthdr.len > 0); nsegs = count_mbuf_nsegs(m0, 0, &cflags); #if defined(KERN_TLS) || defined(RATELIMIT) if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) mst = m0->m_pkthdr.snd_tag; else mst = NULL; #endif #ifdef KERN_TLS if (mst != NULL && mst->sw->type == IF_SND_TAG_TYPE_TLS) { cflags |= MC_TLS; set_mbuf_cflags(m0, cflags); rc = t6_ktls_parse_pkt(m0); if (rc != 0) goto fail; return (EINPROGRESS); } #endif if (nsegs > max_nsegs_allowed(m0, vm_wr)) { if (defragged++ > 0) { rc = EFBIG; goto fail; } counter_u64_add(defrags, 1); if ((m = m_defrag(m0, M_NOWAIT)) == NULL) { rc = ENOMEM; goto fail; } *mp = m0 = m; /* update caller's copy after defrag */ goto restart; } if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN && !(cflags & MC_NOMAP))) { counter_u64_add(pullups, 1); m0 = m_pullup(m0, m0->m_pkthdr.len); if (m0 == NULL) { /* Should have left well enough alone. */ rc = EFBIG; goto fail; } *mp = m0; /* update caller's copy after pullup */ goto restart; } set_mbuf_nsegs(m0, nsegs); set_mbuf_cflags(m0, cflags); calculate_mbuf_len16(m0, vm_wr); #ifdef RATELIMIT /* * Ethofld is limited to TCP and UDP for now, and only when L4 hw * checksumming is enabled. needs_outer_l4_csum happens to check for * all the right things. */ if (__predict_false(needs_eo(mst) && !needs_outer_l4_csum(m0))) { m_snd_tag_rele(m0->m_pkthdr.snd_tag); m0->m_pkthdr.snd_tag = NULL; m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; mst = NULL; } #endif if (!needs_hwcsum(m0) #ifdef RATELIMIT && !needs_eo(mst) #endif ) return (0); m = m0; eh = mtod(m, struct ether_header *); eh_type = ntohs(eh->ether_type); if (eh_type == ETHERTYPE_VLAN) { struct ether_vlan_header *evh = (void *)eh; eh_type = ntohs(evh->evl_proto); m0->m_pkthdr.l2hlen = sizeof(*evh); } else m0->m_pkthdr.l2hlen = sizeof(*eh); #if defined(INET) || defined(INET6) offset = 0; #ifdef INET l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen); #else m_advance(&m, &offset, m0->m_pkthdr.l2hlen); #endif #endif switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: m0->m_pkthdr.l3hlen = sizeof(struct ip6_hdr); break; #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip = l3hdr; if (needs_vxlan_csum(m0)) { /* Driver will do the outer IP hdr checksum. */ ip->ip_sum = 0; if (needs_vxlan_tso(m0)) { const uint16_t ipl = ip->ip_len; ip->ip_len = 0; ip->ip_sum = ~in_cksum_hdr(ip); ip->ip_len = ipl; } else ip->ip_sum = in_cksum_hdr(ip); } m0->m_pkthdr.l3hlen = ip->ip_hl << 2; break; } #endif default: if (ratecheck(&txerr_ratecheck, &txerr_interval)) { log(LOG_ERR, "%s: ethertype 0x%04x unknown. " "if_cxgbe must be compiled with the same " "INET/INET6 options as the kernel.\n", __func__, eh_type); } rc = EINVAL; goto fail; } #if defined(INET) || defined(INET6) if (needs_vxlan_csum(m0)) { m0->m_pkthdr.l4hlen = sizeof(struct udphdr); m0->m_pkthdr.l5hlen = sizeof(struct vxlan_header); /* Inner headers. 
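* That is, step over the outer L3 header plus the UDP and VXLAN
* headers that were just accounted for, landing on the inner Ethernet
* header, which is then parsed the same way as the outer one was above.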
*/ eh = m_advance(&m, &offset, m0->m_pkthdr.l3hlen + sizeof(struct udphdr) + sizeof(struct vxlan_header)); eh_type = ntohs(eh->ether_type); if (eh_type == ETHERTYPE_VLAN) { struct ether_vlan_header *evh = (void *)eh; eh_type = ntohs(evh->evl_proto); m0->m_pkthdr.inner_l2hlen = sizeof(*evh); } else m0->m_pkthdr.inner_l2hlen = sizeof(*eh); #ifdef INET l3hdr = m_advance(&m, &offset, m0->m_pkthdr.inner_l2hlen); #else m_advance(&m, &offset, m0->m_pkthdr.inner_l2hlen); #endif switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: m0->m_pkthdr.inner_l3hlen = sizeof(struct ip6_hdr); break; #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip = l3hdr; m0->m_pkthdr.inner_l3hlen = ip->ip_hl << 2; break; } #endif default: if (ratecheck(&txerr_ratecheck, &txerr_interval)) { log(LOG_ERR, "%s: VXLAN hw offload requested" "with unknown ethertype 0x%04x. if_cxgbe " "must be compiled with the same INET/INET6 " "options as the kernel.\n", __func__, eh_type); } rc = EINVAL; goto fail; } if (needs_inner_tcp_csum(m0)) { tcp = m_advance(&m, &offset, m0->m_pkthdr.inner_l3hlen); m0->m_pkthdr.inner_l4hlen = tcp->th_off * 4; } MPASS((m0->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m0->m_pkthdr.csum_flags &= CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN; } if (needs_outer_tcp_csum(m0)) { tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen); m0->m_pkthdr.l4hlen = tcp->th_off * 4; #ifdef RATELIMIT if (tsclk >= 0 && *(uint32_t *)(tcp + 1) == ntohl(0x0101080a)) { set_mbuf_eo_tsclk_tsoff(m0, V_FW_ETH_TX_EO_WR_TSCLK(tsclk) | V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1)); } else set_mbuf_eo_tsclk_tsoff(m0, 0); } else if (needs_outer_udp_csum(m0)) { m0->m_pkthdr.l4hlen = sizeof(struct udphdr); #endif } #ifdef RATELIMIT if (needs_eo(mst)) { u_int immhdrs; /* EO WRs have the headers in the WR and not the GL. 
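* immhdrs below is the number of header bytes (L2 + L3 + L4) that will
* be carried as immediate data inside the work request, so it is passed
* as the 'skip' argument when recounting gather-list segments for the
* payload that stays in the GL.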
*/ immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen; cflags = 0; nsegs = count_mbuf_nsegs(m0, immhdrs, &cflags); MPASS(cflags == mbuf_cflags(m0)); set_mbuf_eo_nsegs(m0, nsegs); set_mbuf_eo_len16(m0, txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0))); rc = ethofld_transmit(mst->ifp, m0); if (rc != 0) goto fail; return (EINPROGRESS); } #endif #endif MPASS(m0 == *mp); return (0); } void * start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie) { struct sge_eq *eq = &wrq->eq; struct adapter *sc = wrq->adapter; int ndesc, available; struct wrqe *wr; void *w; MPASS(len16 > 0); ndesc = tx_len16_to_desc(len16); MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC); EQ_LOCK(eq); if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(sc, wrq); if (!STAILQ_EMPTY(&wrq->wr_list)) { slowpath: EQ_UNLOCK(eq); wr = alloc_wrqe(len16 * 16, wrq); if (__predict_false(wr == NULL)) return (NULL); cookie->pidx = -1; cookie->ndesc = ndesc; return (&wr->wr); } eq->cidx = read_hw_cidx(eq); if (eq->pidx == eq->cidx) available = eq->sidx - 1; else available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; if (available < ndesc) goto slowpath; cookie->pidx = eq->pidx; cookie->ndesc = ndesc; TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link); w = &eq->desc[eq->pidx]; IDXINCR(eq->pidx, ndesc, eq->sidx); if (__predict_false(cookie->pidx + ndesc > eq->sidx)) { w = &wrq->ss[0]; wrq->ss_pidx = cookie->pidx; wrq->ss_len = len16 * 16; } EQ_UNLOCK(eq); return (w); } void commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie) { struct sge_eq *eq = &wrq->eq; struct adapter *sc = wrq->adapter; int ndesc, pidx; struct wrq_cookie *prev, *next; if (cookie->pidx == -1) { struct wrqe *wr = __containerof(w, struct wrqe, wr); t4_wrq_tx(sc, wr); return; } if (__predict_false(w == &wrq->ss[0])) { int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE; MPASS(wrq->ss_len > n); /* WR had better wrap around. */ bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n); bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n); wrq->tx_wrs_ss++; } else wrq->tx_wrs_direct++; EQ_LOCK(eq); ndesc = cookie->ndesc; /* Can be more than SGE_MAX_WR_NDESC here. */ pidx = cookie->pidx; MPASS(pidx >= 0 && pidx < eq->sidx); prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link); next = TAILQ_NEXT(cookie, link); if (prev == NULL) { MPASS(pidx == eq->dbidx); if (next == NULL || ndesc >= 16) { int available; struct fw_eth_tx_pkt_wr *dst; /* any fw WR struct will do */ /* * Note that the WR via which we'll request tx updates * is at pidx and not eq->pidx, which has moved on * already. */ dst = (void *)&eq->desc[pidx]; available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; if (available < eq->sidx / 4 && atomic_cmpset_int(&eq->equiq, 0, 1)) { /* * XXX: This is not 100% reliable with some * types of WRs. But this is a very unusual * situation for an ofld/ctrl queue anyway. */ dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | F_FW_WR_EQUEQ); } ring_eq_db(wrq->adapter, eq, ndesc); } else { MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc); next->pidx = pidx; next->ndesc += ndesc; } } else { MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc); prev->ndesc += ndesc; } TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link); if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(sc, wrq); #ifdef INVARIANTS if (TAILQ_EMPTY(&wrq->incomplete_wrs)) { /* Doorbell must have caught up to the pidx. 
*/ MPASS(wrq->eq.pidx == wrq->eq.dbidx); } #endif EQ_UNLOCK(eq); } static u_int can_resume_eth_tx(struct mp_ring *r) { struct sge_eq *eq = r->cookie; return (total_available_tx_desc(eq) > eq->sidx / 8); } static inline bool cannot_use_txpkts(struct mbuf *m) { /* maybe put a GL limit too, to avoid silliness? */ return (needs_tso(m) || (mbuf_cflags(m) & (MC_RAW_WR | MC_TLS)) != 0); } static inline int discard_tx(struct sge_eq *eq) { return ((eq->flags & (EQ_ENABLED | EQ_QFLUSH)) != EQ_ENABLED); } static inline int wr_can_update_eq(void *p) { struct fw_eth_tx_pkts_wr *wr = p; switch (G_FW_WR_OP(be32toh(wr->op_pkd))) { case FW_ULPTX_WR: case FW_ETH_TX_PKT_WR: case FW_ETH_TX_PKTS_WR: case FW_ETH_TX_PKTS2_WR: case FW_ETH_TX_PKT_VM_WR: case FW_ETH_TX_PKTS_VM_WR: return (1); default: return (0); } } static inline void set_txupdate_flags(struct sge_txq *txq, u_int avail, struct fw_eth_tx_pkt_wr *wr) { struct sge_eq *eq = &txq->eq; struct txpkts *txp = &txq->txp; if ((txp->npkt > 0 || avail < eq->sidx / 2) && atomic_cmpset_int(&eq->equiq, 0, 1)) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | F_FW_WR_EQUIQ); eq->equeqidx = eq->pidx; } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } } #if defined(__i386__) || defined(__amd64__) extern uint64_t tsc_freq; #endif static inline bool record_eth_tx_time(struct sge_txq *txq) { const uint64_t cycles = get_cyclecount(); const uint64_t last_tx = txq->last_tx; #if defined(__i386__) || defined(__amd64__) const uint64_t itg = tsc_freq * t4_tx_coalesce_gap / 1000000; #else const uint64_t itg = 0; #endif MPASS(cycles >= last_tx); txq->last_tx = cycles; return (cycles - last_tx < itg); } /* * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to * be consumed. Return the actual number consumed. 0 indicates a stall. */ static u_int eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing) { struct sge_txq *txq = r->cookie; - struct ifnet *ifp = txq->ifp; + if_t ifp = txq->ifp; struct sge_eq *eq = &txq->eq; struct txpkts *txp = &txq->txp; - struct vi_info *vi = ifp->if_softc; + struct vi_info *vi = if_getsoftc(ifp); struct adapter *sc = vi->adapter; u_int total, remaining; /* # of packets */ u_int n, avail, dbdiff; /* # of hardware descriptors */ int i, rc; struct mbuf *m0; bool snd, recent_tx; void *wr; /* start of the last WR written to the ring */ TXQ_LOCK_ASSERT_OWNED(txq); recent_tx = record_eth_tx_time(txq); remaining = IDXDIFF(pidx, cidx, r->size); if (__predict_false(discard_tx(eq))) { for (i = 0; i < txp->npkt; i++) m_freem(txp->mb[i]); txp->npkt = 0; while (cidx != pidx) { m0 = r->items[cidx]; m_freem(m0); if (++cidx == r->size) cidx = 0; } reclaim_tx_descs(txq, eq->sidx); *coalescing = false; return (remaining); /* emptied */ } /* How many hardware descriptors do we have readily available. 
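* Worked example with arbitrary numbers: sidx = 1024, pidx = 100 and
* cidx = 50 give IDXDIFF(cidx, pidx, sidx) = 1024 - 100 + 50 = 974 and
* avail = 973; one descriptor is always left unused so that
* pidx == cidx unambiguously means the ring is empty.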
*/ if (eq->pidx == eq->cidx) avail = eq->sidx - 1; else avail = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; total = 0; if (remaining == 0) { txp->score = 0; txq->txpkts_flush++; goto send_txpkts; } dbdiff = 0; MPASS(remaining > 0); while (remaining > 0) { m0 = r->items[cidx]; M_ASSERTPKTHDR(m0); MPASS(m0->m_nextpkt == NULL); if (avail < 2 * SGE_MAX_WR_NDESC) avail += reclaim_tx_descs(txq, 64); if (t4_tx_coalesce == 0 && txp->npkt == 0) goto skip_coalescing; if (cannot_use_txpkts(m0)) txp->score = 0; else if (recent_tx) { if (++txp->score == 0) txp->score = UINT8_MAX; } else txp->score = 1; if (txp->npkt > 0 || remaining > 1 || txp->score >= t4_tx_coalesce_pkts || atomic_load_int(&txq->eq.equiq) != 0) { if (vi->flags & TX_USES_VM_WR) rc = add_to_txpkts_vf(sc, txq, m0, avail, &snd); else rc = add_to_txpkts_pf(sc, txq, m0, avail, &snd); } else { snd = false; rc = EINVAL; } if (snd) { MPASS(txp->npkt > 0); for (i = 0; i < txp->npkt; i++) ETHER_BPF_MTAP(ifp, txp->mb[i]); if (txp->npkt > 1) { MPASS(avail >= tx_len16_to_desc(txp->len16)); if (vi->flags & TX_USES_VM_WR) n = write_txpkts_vm_wr(sc, txq); else n = write_txpkts_wr(sc, txq); } else { MPASS(avail >= tx_len16_to_desc(mbuf_len16(txp->mb[0]))); if (vi->flags & TX_USES_VM_WR) n = write_txpkt_vm_wr(sc, txq, txp->mb[0]); else n = write_txpkt_wr(sc, txq, txp->mb[0], avail); } MPASS(n <= SGE_MAX_WR_NDESC); avail -= n; dbdiff += n; wr = &eq->desc[eq->pidx]; IDXINCR(eq->pidx, n, eq->sidx); txp->npkt = 0; /* emptied */ } if (rc == 0) { /* m0 was coalesced into txq->txpkts. */ goto next_mbuf; } if (rc == EAGAIN) { /* * m0 is suitable for tx coalescing but could not be * combined with the existing txq->txpkts, which has now * been transmitted. Start a new txpkts with m0. */ MPASS(snd); MPASS(txp->npkt == 0); continue; } MPASS(rc != 0 && rc != EAGAIN); MPASS(txp->npkt == 0); skip_coalescing: n = tx_len16_to_desc(mbuf_len16(m0)); if (__predict_false(avail < n)) { avail += reclaim_tx_descs(txq, min(n, 32)); if (avail < n) break; /* out of descriptors */ } wr = &eq->desc[eq->pidx]; if (mbuf_cflags(m0) & MC_RAW_WR) { n = write_raw_wr(txq, wr, m0, avail); #ifdef KERN_TLS } else if (mbuf_cflags(m0) & MC_TLS) { ETHER_BPF_MTAP(ifp, m0); n = t6_ktls_write_wr(txq, wr, m0, avail); #endif } else { ETHER_BPF_MTAP(ifp, m0); if (vi->flags & TX_USES_VM_WR) n = write_txpkt_vm_wr(sc, txq, m0); else n = write_txpkt_wr(sc, txq, m0, avail); } MPASS(n >= 1 && n <= avail); if (!(mbuf_cflags(m0) & MC_TLS)) MPASS(n <= SGE_MAX_WR_NDESC); avail -= n; dbdiff += n; IDXINCR(eq->pidx, n, eq->sidx); if (dbdiff >= 512 / EQ_ESIZE) { /* X_FETCHBURSTMAX_512B */ if (wr_can_update_eq(wr)) set_txupdate_flags(txq, avail, wr); ring_eq_db(sc, eq, dbdiff); avail += reclaim_tx_descs(txq, 32); dbdiff = 0; } next_mbuf: total++; remaining--; if (__predict_false(++cidx == r->size)) cidx = 0; } if (dbdiff != 0) { if (wr_can_update_eq(wr)) set_txupdate_flags(txq, avail, wr); ring_eq_db(sc, eq, dbdiff); reclaim_tx_descs(txq, 32); } else if (eq->pidx == eq->cidx && txp->npkt > 0 && atomic_load_int(&txq->eq.equiq) == 0) { /* * If nothing was submitted to the chip for tx (it was coalesced * into txpkts instead) and there is no tx update outstanding * then we need to send txpkts now. 
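* Otherwise the coalesced frames would sit in the purely software txp
* staging area until the next call into eth_tx(), adding latency even
* though the hardware ring is idle and has room for them.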
*/ send_txpkts: MPASS(txp->npkt > 0); for (i = 0; i < txp->npkt; i++) ETHER_BPF_MTAP(ifp, txp->mb[i]); if (txp->npkt > 1) { MPASS(avail >= tx_len16_to_desc(txp->len16)); if (vi->flags & TX_USES_VM_WR) n = write_txpkts_vm_wr(sc, txq); else n = write_txpkts_wr(sc, txq); } else { MPASS(avail >= tx_len16_to_desc(mbuf_len16(txp->mb[0]))); if (vi->flags & TX_USES_VM_WR) n = write_txpkt_vm_wr(sc, txq, txp->mb[0]); else n = write_txpkt_wr(sc, txq, txp->mb[0], avail); } MPASS(n <= SGE_MAX_WR_NDESC); wr = &eq->desc[eq->pidx]; IDXINCR(eq->pidx, n, eq->sidx); txp->npkt = 0; /* emptied */ MPASS(wr_can_update_eq(wr)); set_txupdate_flags(txq, avail - n, wr); ring_eq_db(sc, eq, n); reclaim_tx_descs(txq, 32); } *coalescing = txp->npkt > 0; return (total); } static inline void init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx, int qsize, int intr_idx, int cong, int qtype) { KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS, ("%s: bad tmr_idx %d", __func__, tmr_idx)); KASSERT(pktc_idx < SGE_NCOUNTERS, /* -ve is ok, means don't use */ ("%s: bad pktc_idx %d", __func__, pktc_idx)); KASSERT(intr_idx >= -1 && intr_idx < sc->intr_count, ("%s: bad intr_idx %d", __func__, intr_idx)); KASSERT(qtype == FW_IQ_IQTYPE_OTHER || qtype == FW_IQ_IQTYPE_NIC || qtype == FW_IQ_IQTYPE_OFLD, ("%s: bad qtype %d", __func__, qtype)); iq->flags = 0; iq->state = IQS_DISABLED; iq->adapter = sc; iq->qtype = qtype; iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx); iq->intr_pktc_idx = SGE_NCOUNTERS - 1; if (pktc_idx >= 0) { iq->intr_params |= F_QINTR_CNT_EN; iq->intr_pktc_idx = pktc_idx; } iq->qsize = roundup2(qsize, 16); /* See FW_IQ_CMD/iqsize */ iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE; iq->intr_idx = intr_idx; iq->cong_drop = cong; } static inline void init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name) { struct sge_params *sp = &sc->params.sge; fl->qsize = qsize; fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE; strlcpy(fl->lockname, name, sizeof(fl->lockname)); mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF); if (sc->flags & BUF_PACKING_OK && ((!is_t4(sc) && buffer_packing) || /* T5+: enabled unless 0 */ (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */ fl->flags |= FL_BUF_PACKING; fl->zidx = find_refill_source(sc, maxp, fl->flags & FL_BUF_PACKING); fl->safe_zidx = sc->sge.safe_zidx; if (fl->flags & FL_BUF_PACKING) { fl->lowat = roundup2(sp->fl_starve_threshold2, 8); fl->buf_boundary = sp->pack_boundary; } else { fl->lowat = roundup2(sp->fl_starve_threshold, 8); fl->buf_boundary = 16; } if (fl_pad && fl->buf_boundary < sp->pad_boundary) fl->buf_boundary = sp->pad_boundary; } static inline void init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize, uint8_t tx_chan, struct sge_iq *iq, char *name) { KASSERT(eqtype >= EQ_CTRL && eqtype <= EQ_OFLD, ("%s: bad qtype %d", __func__, eqtype)); eq->type = eqtype; eq->tx_chan = tx_chan; eq->iq = iq; eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE; strlcpy(eq->lockname, name, sizeof(eq->lockname)); mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF); } int alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag, bus_dmamap_t *map, bus_addr_t *pa, void **va) { int rc; rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag); if (rc != 0) { CH_ERR(sc, "cannot allocate DMA tag: %d\n", rc); goto done; } rc = bus_dmamem_alloc(*tag, va, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map); if (rc != 0) { CH_ERR(sc, "cannot 
allocate DMA memory: %d\n", rc); goto done; } rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0); if (rc != 0) { CH_ERR(sc, "cannot load DMA map: %d\n", rc); goto done; } done: if (rc) free_ring(sc, *tag, *map, *pa, *va); return (rc); } int free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map, bus_addr_t pa, void *va) { if (pa) bus_dmamap_unload(tag, map); if (va) bus_dmamem_free(tag, va, map); if (tag) bus_dma_tag_destroy(tag); return (0); } /* * Allocates the software resources (mainly memory and sysctl nodes) for an * ingress queue and an optional freelist. * * Sets IQ_SW_ALLOCATED and returns 0 on success. */ static int alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl, struct sysctl_ctx_list *ctx, struct sysctl_oid *oid) { int rc; size_t len; struct adapter *sc = vi->adapter; MPASS(!(iq->flags & IQ_SW_ALLOCATED)); len = iq->qsize * IQ_ESIZE; rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba, (void **)&iq->desc); if (rc != 0) return (rc); if (fl) { len = fl->qsize * EQ_ESIZE; rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map, &fl->ba, (void **)&fl->desc); if (rc) { free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc); return (rc); } /* Allocate space for one software descriptor per buffer. */ fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE, M_ZERO | M_WAITOK); add_fl_sysctls(sc, ctx, oid, fl); iq->flags |= IQ_HAS_FL; } add_iq_sysctls(ctx, oid, iq); iq->flags |= IQ_SW_ALLOCATED; return (0); } /* * Frees all software resources (memory and locks) associated with an ingress * queue and an optional freelist. */ static void free_iq_fl(struct adapter *sc, struct sge_iq *iq, struct sge_fl *fl) { MPASS(iq->flags & IQ_SW_ALLOCATED); if (fl) { MPASS(iq->flags & IQ_HAS_FL); free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba, fl->desc); free_fl_buffers(sc, fl); free(fl->sdesc, M_CXGBE); mtx_destroy(&fl->fl_lock); bzero(fl, sizeof(*fl)); } free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc); bzero(iq, sizeof(*iq)); } /* * Allocates a hardware ingress queue and an optional freelist that will be * associated with it. * * Returns errno on failure. Resources allocated up to that point may still be * allocated. Caller is responsible for cleanup in case this function fails. 
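* For example, alloc_rxq() below simply logs and propagates the error;
* its caller is then expected to unwind with the matching free routines
* (free_iq_fl_hwq / free_iq_fl) rather than relying on this function to
* clean up after itself.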
*/ static int alloc_iq_fl_hwq(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl) { int rc, cntxt_id, cong_map; struct fw_iq_cmd c; struct adapter *sc = vi->adapter; struct port_info *pi = vi->pi; __be32 v = 0; MPASS (!(iq->flags & IQ_HW_ALLOCATED)); bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) | V_FW_IQ_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART | FW_LEN16(c)); /* Special handling for firmware event queue */ if (iq == &sc->sge.fwq) v |= F_FW_IQ_CMD_IQASYNCH; if (iq->intr_idx < 0) { /* Forwarded interrupts, all headed to fwq */ v |= F_FW_IQ_CMD_IQANDST; v |= V_FW_IQ_CMD_IQANDSTINDEX(sc->sge.fwq.cntxt_id); } else { KASSERT(iq->intr_idx < sc->intr_count, ("%s: invalid direct intr_idx %d", __func__, iq->intr_idx)); v |= V_FW_IQ_CMD_IQANDSTINDEX(iq->intr_idx); } bzero(iq->desc, iq->qsize * IQ_ESIZE); c.type_to_iqandstindex = htobe32(v | V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) | V_FW_IQ_CMD_VIID(vi->viid) | V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT)); c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) | F_FW_IQ_CMD_IQGTSMODE | V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) | V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4)); c.iqsize = htobe16(iq->qsize); c.iqaddr = htobe64(iq->ba); c.iqns_to_fl0congen = htobe32(V_FW_IQ_CMD_IQTYPE(iq->qtype)); if (iq->cong_drop != -1) { cong_map = iq->qtype == IQ_ETH ? pi->rx_e_chan_map : 0; c.iqns_to_fl0congen |= htobe32(F_FW_IQ_CMD_IQFLINTCONGEN); } if (fl) { bzero(fl->desc, fl->sidx * EQ_ESIZE + sc->params.sge.spg_len); c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) | F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO | (fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) | (fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN : 0)); if (iq->cong_drop != -1) { c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong_map) | F_FW_IQ_CMD_FL0CONGCIF | F_FW_IQ_CMD_FL0CONGEN); } c.fl0dcaen_to_fl0cidxfthresh = htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ? X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B_T6) | V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ? 
X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B)); c.fl0size = htobe16(fl->qsize); c.fl0addr = htobe64(fl->ba); } rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { CH_ERR(sc, "failed to create hw ingress queue: %d\n", rc); return (rc); } iq->cidx = 0; iq->gen = F_RSPD_GEN; iq->cntxt_id = be16toh(c.iqid); iq->abs_id = be16toh(c.physiqid); cntxt_id = iq->cntxt_id - sc->sge.iq_start; if (cntxt_id >= sc->sge.iqmap_sz) { panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.iqmap_sz - 1); } sc->sge.iqmap[cntxt_id] = iq; if (fl) { u_int qid; #ifdef INVARIANTS int i; MPASS(!(fl->flags & FL_BUF_RESUME)); for (i = 0; i < fl->sidx * 8; i++) MPASS(fl->sdesc[i].cl == NULL); #endif fl->cntxt_id = be16toh(c.fl0id); fl->pidx = fl->cidx = fl->hw_cidx = fl->dbidx = 0; fl->rx_offset = 0; fl->flags &= ~(FL_STARVING | FL_DOOMED); cntxt_id = fl->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.eqmap_sz) { panic("%s: fl->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.eqmap_sz - 1); } sc->sge.eqmap[cntxt_id] = (void *)fl; qid = fl->cntxt_id; if (isset(&sc->doorbells, DOORBELL_UDB)) { uint32_t s_qpp = sc->params.sge.eq_s_qpp; uint32_t mask = (1 << s_qpp) - 1; volatile uint8_t *udb; udb = sc->udbs_base + UDBS_DB_OFFSET; udb += (qid >> s_qpp) << PAGE_SHIFT; qid &= mask; if (qid < PAGE_SIZE / UDBS_SEG_SIZE) { udb += qid << UDBS_SEG_SHIFT; qid = 0; } fl->udb = (volatile void *)udb; } fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db; FL_LOCK(fl); /* Enough to make sure the SGE doesn't think it's starved */ refill_fl(sc, fl, fl->lowat); FL_UNLOCK(fl); } if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) && iq->cong_drop != -1) { t4_sge_set_conm_context(sc, iq->cntxt_id, iq->cong_drop, cong_map); } /* Enable IQ interrupts */ atomic_store_rel_int(&iq->state, IQS_IDLE); t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) | V_INGRESSQID(iq->cntxt_id)); iq->flags |= IQ_HW_ALLOCATED; return (0); } static int free_iq_fl_hwq(struct adapter *sc, struct sge_iq *iq, struct sge_fl *fl) { int rc; MPASS(iq->flags & IQ_HW_ALLOCATED); rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id, fl ? 
fl->cntxt_id : 0xffff, 0xffff); if (rc != 0) { CH_ERR(sc, "failed to free iq %p: %d\n", iq, rc); return (rc); } iq->flags &= ~IQ_HW_ALLOCATED; return (0); } static void add_iq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, struct sge_iq *iq) { struct sysctl_oid_list *children; if (ctx == NULL || oid == NULL) return; children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &iq->ba, "bus address of descriptor ring"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, iq->qsize * IQ_ESIZE, "descriptor ring size in bytes"); SYSCTL_ADD_U16(ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD, &iq->abs_id, 0, "absolute id of the queue"); SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &iq->cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &iq->cidx, 0, "consumer index"); } static void add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, struct sge_fl *fl) { struct sysctl_oid_list *children; if (ctx == NULL || oid == NULL) return; children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "freelist"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &fl->ba, "bus address of descriptor ring"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, fl->sidx * EQ_ESIZE + sc->params.sge.spg_len, "desc ring size in bytes"); SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &fl->cntxt_id, 0, "SGE context id of the freelist"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL, fl_pad ? 1 : 0, "padding enabled"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL, fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx, 0, "consumer index"); if (fl->flags & FL_BUF_PACKING) { SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset", CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset"); } SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx, 0, "producer index"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated", CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled", CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled", CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)"); } /* * Idempotent. */ static int alloc_fwq(struct adapter *sc) { int rc, intr_idx; struct sge_iq *fwq = &sc->sge.fwq; struct vi_info *vi = &sc->port[0]->vi[0]; if (!(fwq->flags & IQ_SW_ALLOCATED)) { MPASS(!(fwq->flags & IQ_HW_ALLOCATED)); if (sc->flags & IS_VF) intr_idx = 0; else intr_idx = sc->intr_count > 1 ? 1 : 0; init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE, intr_idx, -1, IQ_OTHER); rc = alloc_iq_fl(vi, fwq, NULL, &sc->ctx, sc->fwq_oid); if (rc != 0) { CH_ERR(sc, "failed to allocate fwq: %d\n", rc); return (rc); } MPASS(fwq->flags & IQ_SW_ALLOCATED); } if (!(fwq->flags & IQ_HW_ALLOCATED)) { MPASS(fwq->flags & IQ_SW_ALLOCATED); rc = alloc_iq_fl_hwq(vi, fwq, NULL); if (rc != 0) { CH_ERR(sc, "failed to create hw fwq: %d\n", rc); return (rc); } MPASS(fwq->flags & IQ_HW_ALLOCATED); } return (0); } /* * Idempotent. 
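* The idempotence comes from the IQ_SW_ALLOCATED / IQ_HW_ALLOCATED
* flags: each block below only tears down state whose flag is still
* set, so calling this again, or after a partially failed alloc_fwq(),
* is harmless.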
*/ static void free_fwq(struct adapter *sc) { struct sge_iq *fwq = &sc->sge.fwq; if (fwq->flags & IQ_HW_ALLOCATED) { MPASS(fwq->flags & IQ_SW_ALLOCATED); free_iq_fl_hwq(sc, fwq, NULL); MPASS(!(fwq->flags & IQ_HW_ALLOCATED)); } if (fwq->flags & IQ_SW_ALLOCATED) { MPASS(!(fwq->flags & IQ_HW_ALLOCATED)); free_iq_fl(sc, fwq, NULL); MPASS(!(fwq->flags & IQ_SW_ALLOCATED)); } } /* * Idempotent. */ static int alloc_ctrlq(struct adapter *sc, int idx) { int rc; char name[16]; struct sysctl_oid *oid; struct sge_wrq *ctrlq = &sc->sge.ctrlq[idx]; MPASS(idx < sc->params.nports); if (!(ctrlq->eq.flags & EQ_SW_ALLOCATED)) { MPASS(!(ctrlq->eq.flags & EQ_HW_ALLOCATED)); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&sc->ctx, SYSCTL_CHILDREN(sc->ctrlq_oid), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ctrl queue"); snprintf(name, sizeof(name), "%s ctrlq%d", device_get_nameunit(sc->dev), idx); init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[idx]->tx_chan, &sc->sge.fwq, name); rc = alloc_wrq(sc, NULL, ctrlq, &sc->ctx, oid); if (rc != 0) { CH_ERR(sc, "failed to allocate ctrlq%d: %d\n", idx, rc); sysctl_remove_oid(oid, 1, 1); return (rc); } MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED); } if (!(ctrlq->eq.flags & EQ_HW_ALLOCATED)) { MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED); rc = alloc_eq_hwq(sc, NULL, &ctrlq->eq); if (rc != 0) { CH_ERR(sc, "failed to create hw ctrlq%d: %d\n", idx, rc); return (rc); } MPASS(ctrlq->eq.flags & EQ_HW_ALLOCATED); } return (0); } /* * Idempotent. */ static void free_ctrlq(struct adapter *sc, int idx) { struct sge_wrq *ctrlq = &sc->sge.ctrlq[idx]; if (ctrlq->eq.flags & EQ_HW_ALLOCATED) { MPASS(ctrlq->eq.flags & EQ_SW_ALLOCATED); free_eq_hwq(sc, NULL, &ctrlq->eq); MPASS(!(ctrlq->eq.flags & EQ_HW_ALLOCATED)); } if (ctrlq->eq.flags & EQ_SW_ALLOCATED) { MPASS(!(ctrlq->eq.flags & EQ_HW_ALLOCATED)); free_wrq(sc, ctrlq); MPASS(!(ctrlq->eq.flags & EQ_SW_ALLOCATED)); } } int t4_sge_set_conm_context(struct adapter *sc, int cntxt_id, int cong_drop, int cong_map) { const int cng_ch_bits_log = sc->chip_params->cng_ch_bits_log; uint32_t param, val; uint16_t ch_map; int cong_mode, rc, i; if (chip_id(sc) < CHELSIO_T5) return (ENOTSUP); /* Convert the driver knob to the mode understood by the firmware. */ switch (cong_drop) { case -1: cong_mode = X_CONMCTXT_CNGTPMODE_DISABLE; break; case 0: cong_mode = X_CONMCTXT_CNGTPMODE_CHANNEL; break; case 1: cong_mode = X_CONMCTXT_CNGTPMODE_QUEUE; break; case 2: cong_mode = X_CONMCTXT_CNGTPMODE_BOTH; break; default: MPASS(0); CH_ERR(sc, "cong_drop = %d is invalid (ingress queue %d).\n", cong_drop, cntxt_id); return (EINVAL); } param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) | V_FW_PARAMS_PARAM_YZ(cntxt_id); val = V_CONMCTXT_CNGTPMODE(cong_mode); if (cong_mode == X_CONMCTXT_CNGTPMODE_CHANNEL || cong_mode == X_CONMCTXT_CNGTPMODE_BOTH) { for (i = 0, ch_map = 0; i < 4; i++) { if (cong_map & (1 << i)) ch_map |= 1 << (i << cng_ch_bits_log); } val |= V_CONMCTXT_CNGCHMAP(ch_map); } rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val); if (rc != 0) { CH_ERR(sc, "failed to set congestion manager context " "for ingress queue %d: %d\n", cntxt_id, rc); } return (rc); } /* * Idempotent.
*/ static int alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int idx, int intr_idx, int maxp) { int rc; struct adapter *sc = vi->adapter; - struct ifnet *ifp = vi->ifp; + if_t ifp = vi->ifp; struct sysctl_oid *oid; char name[16]; if (!(rxq->iq.flags & IQ_SW_ALLOCATED)) { MPASS(!(rxq->iq.flags & IQ_HW_ALLOCATED)); #if defined(INET) || defined(INET6) rc = tcp_lro_init_args(&rxq->lro, ifp, lro_entries, lro_mbufs); if (rc != 0) return (rc); MPASS(rxq->lro.ifp == ifp); /* also indicates LRO init'ed */ #endif rxq->ifp = ifp; snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(vi->rxq_oid), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rx queue"); init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq, intr_idx, cong_drop, IQ_ETH); #if defined(INET) || defined(INET6) - if (ifp->if_capenable & IFCAP_LRO) + if (if_getcapenable(ifp) & IFCAP_LRO) rxq->iq.flags |= IQ_LRO_ENABLED; #endif - if (ifp->if_capenable & IFCAP_HWRXTSTMP) + if (if_getcapenable(ifp) & IFCAP_HWRXTSTMP) rxq->iq.flags |= IQ_RX_TIMESTAMP; snprintf(name, sizeof(name), "%s rxq%d-fl", device_get_nameunit(vi->dev), idx); init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name); rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, &vi->ctx, oid); if (rc != 0) { CH_ERR(vi, "failed to allocate rxq%d: %d\n", idx, rc); sysctl_remove_oid(oid, 1, 1); #if defined(INET) || defined(INET6) tcp_lro_free(&rxq->lro); rxq->lro.ifp = NULL; #endif return (rc); } MPASS(rxq->iq.flags & IQ_SW_ALLOCATED); add_rxq_sysctls(&vi->ctx, oid, rxq); } if (!(rxq->iq.flags & IQ_HW_ALLOCATED)) { MPASS(rxq->iq.flags & IQ_SW_ALLOCATED); rc = alloc_iq_fl_hwq(vi, &rxq->iq, &rxq->fl); if (rc != 0) { CH_ERR(vi, "failed to create hw rxq%d: %d\n", idx, rc); return (rc); } MPASS(rxq->iq.flags & IQ_HW_ALLOCATED); if (idx == 0) sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id; else KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id, ("iq_base mismatch")); KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF, ("PF with non-zero iq_base")); /* * The freelist is just barely above the starvation threshold * right now, fill it up a bit more. */ FL_LOCK(&rxq->fl); refill_fl(sc, &rxq->fl, 128); FL_UNLOCK(&rxq->fl); } return (0); } /* * Idempotent. 
*/ static void free_rxq(struct vi_info *vi, struct sge_rxq *rxq) { if (rxq->iq.flags & IQ_HW_ALLOCATED) { MPASS(rxq->iq.flags & IQ_SW_ALLOCATED); free_iq_fl_hwq(vi->adapter, &rxq->iq, &rxq->fl); MPASS(!(rxq->iq.flags & IQ_HW_ALLOCATED)); } if (rxq->iq.flags & IQ_SW_ALLOCATED) { MPASS(!(rxq->iq.flags & IQ_HW_ALLOCATED)); #if defined(INET) || defined(INET6) tcp_lro_free(&rxq->lro); #endif free_iq_fl(vi->adapter, &rxq->iq, &rxq->fl); MPASS(!(rxq->iq.flags & IQ_SW_ALLOCATED)); bzero(rxq, sizeof(*rxq)); } } static void add_rxq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, struct sge_rxq *rxq) { struct sysctl_oid_list *children; if (ctx == NULL || oid == NULL) return; children = SYSCTL_CHILDREN(oid); #if defined(INET) || defined(INET6) SYSCTL_ADD_U64(ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD, &rxq->lro.lro_queued, 0, NULL); SYSCTL_ADD_U64(ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD, &rxq->lro.lro_flushed, 0, NULL); #endif SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD, &rxq->rxcsum, "# of times hardware assisted with checksum"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vlan_extraction", CTLFLAG_RD, &rxq->vlan_extraction, "# of times hardware extracted 802.1Q tag"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vxlan_rxcsum", CTLFLAG_RD, &rxq->vxlan_rxcsum, "# of times hardware assisted with inner checksum (VXLAN)"); } #ifdef TCP_OFFLOAD /* * Idempotent. */ static int alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq, int idx, int intr_idx, int maxp) { int rc; struct adapter *sc = vi->adapter; struct sysctl_oid *oid; char name[16]; if (!(ofld_rxq->iq.flags & IQ_SW_ALLOCATED)) { MPASS(!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED)); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(vi->ofld_rxq_oid), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "offload rx queue"); init_iq(&ofld_rxq->iq, sc, vi->ofld_tmr_idx, vi->ofld_pktc_idx, vi->qsize_rxq, intr_idx, ofld_cong_drop, IQ_OFLD); snprintf(name, sizeof(name), "%s ofld_rxq%d-fl", device_get_nameunit(vi->dev), idx); init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name); rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, &vi->ctx, oid); if (rc != 0) { CH_ERR(vi, "failed to allocate ofld_rxq%d: %d\n", idx, rc); sysctl_remove_oid(oid, 1, 1); return (rc); } MPASS(ofld_rxq->iq.flags & IQ_SW_ALLOCATED); ofld_rxq->rx_iscsi_ddp_setup_ok = counter_u64_alloc(M_WAITOK); ofld_rxq->rx_iscsi_ddp_setup_error = counter_u64_alloc(M_WAITOK); add_ofld_rxq_sysctls(&vi->ctx, oid, ofld_rxq); } if (!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED)) { MPASS(ofld_rxq->iq.flags & IQ_SW_ALLOCATED); rc = alloc_iq_fl_hwq(vi, &ofld_rxq->iq, &ofld_rxq->fl); if (rc != 0) { CH_ERR(vi, "failed to create hw ofld_rxq%d: %d\n", idx, rc); return (rc); } MPASS(ofld_rxq->iq.flags & IQ_HW_ALLOCATED); } return (rc); } /* * Idempotent. 
*/ static void free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq) { if (ofld_rxq->iq.flags & IQ_HW_ALLOCATED) { MPASS(ofld_rxq->iq.flags & IQ_SW_ALLOCATED); free_iq_fl_hwq(vi->adapter, &ofld_rxq->iq, &ofld_rxq->fl); MPASS(!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED)); } if (ofld_rxq->iq.flags & IQ_SW_ALLOCATED) { MPASS(!(ofld_rxq->iq.flags & IQ_HW_ALLOCATED)); free_iq_fl(vi->adapter, &ofld_rxq->iq, &ofld_rxq->fl); MPASS(!(ofld_rxq->iq.flags & IQ_SW_ALLOCATED)); counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_ok); counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_error); bzero(ofld_rxq, sizeof(*ofld_rxq)); } } static void add_ofld_rxq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, struct sge_ofld_rxq *ofld_rxq) { struct sysctl_oid_list *children; if (ctx == NULL || oid == NULL) return; children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "rx_toe_tls_records", CTLFLAG_RD, &ofld_rxq->rx_toe_tls_records, "# of TOE TLS records received"); SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "rx_toe_tls_octets", CTLFLAG_RD, &ofld_rxq->rx_toe_tls_octets, "# of payload octets in received TOE TLS records"); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "iscsi", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE iSCSI statistics"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "ddp_setup_ok", CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_setup_ok, "# of times DDP buffer was setup successfully."); SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "ddp_setup_error", CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_setup_error, "# of times DDP buffer setup failed."); SYSCTL_ADD_U64(ctx, children, OID_AUTO, "ddp_octets", CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_octets, 0, "# of octets placed directly"); SYSCTL_ADD_U64(ctx, children, OID_AUTO, "ddp_pdus", CTLFLAG_RD, &ofld_rxq->rx_iscsi_ddp_pdus, 0, "# of PDUs with data placed directly."); SYSCTL_ADD_U64(ctx, children, OID_AUTO, "fl_octets", CTLFLAG_RD, &ofld_rxq->rx_iscsi_fl_octets, 0, "# of data octets delivered in freelist"); SYSCTL_ADD_U64(ctx, children, OID_AUTO, "fl_pdus", CTLFLAG_RD, &ofld_rxq->rx_iscsi_fl_pdus, 0, "# of PDUs with data delivered in freelist"); SYSCTL_ADD_U64(ctx, children, OID_AUTO, "padding_errors", CTLFLAG_RD, &ofld_rxq->rx_iscsi_padding_errors, 0, "# of PDUs with invalid padding"); SYSCTL_ADD_U64(ctx, children, OID_AUTO, "header_digest_errors", CTLFLAG_RD, &ofld_rxq->rx_iscsi_header_digest_errors, 0, "# of PDUs with invalid header digests"); SYSCTL_ADD_U64(ctx, children, OID_AUTO, "data_digest_errors", CTLFLAG_RD, &ofld_rxq->rx_iscsi_data_digest_errors, 0, "# of PDUs with invalid data digests"); } #endif /* * Returns a reasonable automatic cidx flush threshold for a given queue size. 
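* e.g. qsize = 100 is first rounded up to a power of 2 (128) and
* ilog2(128) = 7 is returned; a 1024-entry queue would yield 10, which
* is then clamped to X_CIDXFLUSHTHRESH_128.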
*/ static u_int qsize_to_fthresh(int qsize) { u_int fthresh; while (!powerof2(qsize)) qsize++; fthresh = ilog2(qsize); if (fthresh > X_CIDXFLUSHTHRESH_128) fthresh = X_CIDXFLUSHTHRESH_128; return (fthresh); } static int ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ctrl_cmd c; int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) | V_FW_EQ_CTRL_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC | F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c)); c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid)); c.physeqid_pkd = htobe32(0); c.fetchszm_to_iqid = htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) | V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_CTRL_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ? X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) | V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_CTRL_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) | V_FW_EQ_CTRL_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { CH_ERR(sc, "failed to create hw ctrlq for tx_chan %d: %d\n", eq->tx_chan, rc); return (rc); } eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid)); eq->abs_id = G_FW_EQ_CTRL_CMD_PHYSEQID(be32toh(c.physeqid_pkd)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.eqmap_sz) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.eqmap_sz - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } static int eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_eth_cmd c; int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) | V_FW_EQ_ETH_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC | F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c)); c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE | F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid)); c.fetchszm_to_iqid = htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO | V_FW_EQ_ETH_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ? 
X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) | V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_ETH_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(vi->dev, "failed to create Ethernet egress queue: %d\n", rc); return (rc); } eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd)); eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.eqmap_sz) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.eqmap_sz - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) static int ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ofld_cmd c; int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) | V_FW_EQ_OFLD_CMD_VFN(0)); c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC | F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c)); c.fetchszm_to_iqid = htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) | V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_OFLD_CMD_FBMIN(chip_id(sc) <= CHELSIO_T5 ? X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) | V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_OFLD_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) | V_FW_EQ_OFLD_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(vi->dev, "failed to create egress queue for TCP offload: %d\n", rc); return (rc); } eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd)); eq->abs_id = G_FW_EQ_OFLD_CMD_PHYSEQID(be32toh(c.physeqid_pkd)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.eqmap_sz) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.eqmap_sz - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } #endif /* SW only */ static int alloc_eq(struct adapter *sc, struct sge_eq *eq, struct sysctl_ctx_list *ctx, struct sysctl_oid *oid) { int rc, qsize; size_t len; MPASS(!(eq->flags & EQ_SW_ALLOCATED)); qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; len = qsize * EQ_ESIZE; rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map, &eq->ba, (void **)&eq->desc); if (rc) return (rc); if (ctx != NULL && oid != NULL) add_eq_sysctls(sc, ctx, oid, eq); eq->flags |= EQ_SW_ALLOCATED; return (0); } /* SW only */ static void free_eq(struct adapter *sc, struct sge_eq *eq) { MPASS(eq->flags & EQ_SW_ALLOCATED); if (eq->type == EQ_ETH) MPASS(eq->pidx == eq->cidx); free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc); mtx_destroy(&eq->eq_lock); bzero(eq, sizeof(*eq)); } static void add_eq_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, struct sge_eq *eq) { struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &eq->ba, "bus address of descriptor ring"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, eq->sidx * EQ_ESIZE + sc->params.sge.spg_len, "desc ring size in bytes"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD, &eq->abs_id, 0, "absolute id of the queue"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &eq->cntxt_id, 0, "SGE context id of the 
queue"); SYSCTL_ADD_U16(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &eq->cidx, 0, "consumer index"); SYSCTL_ADD_U16(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &eq->pidx, 0, "producer index"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL, eq->sidx, "status page index"); } static int alloc_eq_hwq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) { int rc; MPASS(!(eq->flags & EQ_HW_ALLOCATED)); eq->iqid = eq->iq->cntxt_id; eq->pidx = eq->cidx = eq->dbidx = 0; /* Note that equeqidx is not used with sge_wrq (OFLD/CTRL) queues. */ eq->equeqidx = 0; eq->doorbells = sc->doorbells; bzero(eq->desc, eq->sidx * EQ_ESIZE + sc->params.sge.spg_len); switch (eq->type) { case EQ_CTRL: rc = ctrl_eq_alloc(sc, eq); break; case EQ_ETH: rc = eth_eq_alloc(sc, vi, eq); break; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) case EQ_OFLD: rc = ofld_eq_alloc(sc, vi, eq); break; #endif default: panic("%s: invalid eq type %d.", __func__, eq->type); } if (rc != 0) { CH_ERR(sc, "failed to allocate egress queue(%d): %d\n", eq->type, rc); return (rc); } if (isset(&eq->doorbells, DOORBELL_UDB) || isset(&eq->doorbells, DOORBELL_UDBWC) || isset(&eq->doorbells, DOORBELL_WCWR)) { uint32_t s_qpp = sc->params.sge.eq_s_qpp; uint32_t mask = (1 << s_qpp) - 1; volatile uint8_t *udb; udb = sc->udbs_base + UDBS_DB_OFFSET; udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT; /* pg offset */ eq->udb_qid = eq->cntxt_id & mask; /* id in page */ if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE) clrbit(&eq->doorbells, DOORBELL_WCWR); else { udb += eq->udb_qid << UDBS_SEG_SHIFT; /* seg offset */ eq->udb_qid = 0; } eq->udb = (volatile void *)udb; } eq->flags |= EQ_HW_ALLOCATED; return (0); } static int free_eq_hwq(struct adapter *sc, struct vi_info *vi __unused, struct sge_eq *eq) { int rc; MPASS(eq->flags & EQ_HW_ALLOCATED); switch (eq->type) { case EQ_CTRL: rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; case EQ_ETH: rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) case EQ_OFLD: rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; #endif default: panic("%s: invalid eq type %d.", __func__, eq->type); } if (rc != 0) { CH_ERR(sc, "failed to free eq (type %d): %d\n", eq->type, rc); return (rc); } eq->flags &= ~EQ_HW_ALLOCATED; return (0); } static int alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq, struct sysctl_ctx_list *ctx, struct sysctl_oid *oid) { struct sge_eq *eq = &wrq->eq; int rc; MPASS(!(eq->flags & EQ_SW_ALLOCATED)); rc = alloc_eq(sc, eq, ctx, oid); if (rc) return (rc); MPASS(eq->flags & EQ_SW_ALLOCATED); /* Can't fail after this. 
*/ wrq->adapter = sc; TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq); TAILQ_INIT(&wrq->incomplete_wrs); STAILQ_INIT(&wrq->wr_list); wrq->nwr_pending = 0; wrq->ndesc_needed = 0; add_wrq_sysctls(ctx, oid, wrq); return (0); } static void free_wrq(struct adapter *sc, struct sge_wrq *wrq) { free_eq(sc, &wrq->eq); MPASS(wrq->nwr_pending == 0); MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs)); MPASS(STAILQ_EMPTY(&wrq->wr_list)); bzero(wrq, sizeof(*wrq)); } static void add_wrq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, struct sge_wrq *wrq) { struct sysctl_oid_list *children; if (ctx == NULL || oid == NULL) return; children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD, &wrq->tx_wrs_direct, "# of work requests (direct)"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD, &wrq->tx_wrs_copied, "# of work requests (copied)"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD, &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)"); } /* * Idempotent. */ static int alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx) { int rc, iqidx; struct port_info *pi = vi->pi; struct adapter *sc = vi->adapter; struct sge_eq *eq = &txq->eq; struct txpkts *txp; char name[16]; struct sysctl_oid *oid; if (!(eq->flags & EQ_SW_ALLOCATED)) { MPASS(!(eq->flags & EQ_HW_ALLOCATED)); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(vi->txq_oid), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "tx queue"); iqidx = vi->first_rxq + (idx % vi->nrxq); snprintf(name, sizeof(name), "%s txq%d", device_get_nameunit(vi->dev), idx); init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan, &sc->sge.rxq[iqidx].iq, name); rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx, M_CXGBE, &eq->eq_lock, M_WAITOK); if (rc != 0) { CH_ERR(vi, "failed to allocate mp_ring for txq%d: %d\n", idx, rc); failed: sysctl_remove_oid(oid, 1, 1); return (rc); } rc = alloc_eq(sc, eq, &vi->ctx, oid); if (rc) { CH_ERR(vi, "failed to allocate txq%d: %d\n", idx, rc); mp_ring_free(txq->r); goto failed; } MPASS(eq->flags & EQ_SW_ALLOCATED); /* Can't fail after this point. */ TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq); txq->ifp = vi->ifp; txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK); txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE, M_ZERO | M_WAITOK); add_txq_sysctls(vi, &vi->ctx, oid, txq); } if (!(eq->flags & EQ_HW_ALLOCATED)) { MPASS(eq->flags & EQ_SW_ALLOCATED); rc = alloc_eq_hwq(sc, vi, eq); if (rc != 0) { CH_ERR(vi, "failed to create hw txq%d: %d\n", idx, rc); return (rc); } MPASS(eq->flags & EQ_HW_ALLOCATED); /* Can't fail after this point. */ if (idx == 0) sc->sge.eq_base = eq->abs_id - eq->cntxt_id; else KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id, ("eq_base mismatch")); KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF, ("PF with non-zero eq_base")); txp = &txq->txp; MPASS(nitems(txp->mb) >= sc->params.max_pkts_per_eth_tx_pkts_wr); txq->txp.max_npkt = min(nitems(txp->mb), sc->params.max_pkts_per_eth_tx_pkts_wr); if (vi->flags & TX_USES_VM_WR && !(sc->flags & IS_VF)) txq->txp.max_npkt--; if (vi->flags & TX_USES_VM_WR) txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) | V_TXPKT_INTF(pi->tx_chan)); else txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) | V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld)); txq->tc_idx = -1; } return (0); } /* * Idempotent. 
*/ static void free_txq(struct vi_info *vi, struct sge_txq *txq) { struct adapter *sc = vi->adapter; struct sge_eq *eq = &txq->eq; if (eq->flags & EQ_HW_ALLOCATED) { MPASS(eq->flags & EQ_SW_ALLOCATED); free_eq_hwq(sc, NULL, eq); MPASS(!(eq->flags & EQ_HW_ALLOCATED)); } if (eq->flags & EQ_SW_ALLOCATED) { MPASS(!(eq->flags & EQ_HW_ALLOCATED)); sglist_free(txq->gl); free(txq->sdesc, M_CXGBE); mp_ring_free(txq->r); free_eq(sc, eq); MPASS(!(eq->flags & EQ_SW_ALLOCATED)); bzero(txq, sizeof(*txq)); } } static void add_txq_sysctls(struct vi_info *vi, struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, struct sge_txq *txq) { struct adapter *sc; struct sysctl_oid_list *children; if (ctx == NULL || oid == NULL) return; sc = vi->adapter; children = SYSCTL_CHILDREN(oid); mp_ring_sysctls(txq->r, ctx, children); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tc", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, vi, txq - sc->sge.txq, sysctl_tc, "I", "traffic class (-1 means none)"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD, &txq->txcsum, "# of times hardware assisted with checksum"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vlan_insertion", CTLFLAG_RD, &txq->vlan_insertion, "# of times hardware inserted 802.1Q tag"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD, &txq->tso_wrs, "# of TSO work requests"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD, &txq->imm_wrs, "# of work requests with immediate data"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD, &txq->sgl_wrs, "# of work requests with direct SGL"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD, &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts0_wrs", CTLFLAG_RD, &txq->txpkts0_wrs, "# of txpkts (type 0) work requests"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts1_wrs", CTLFLAG_RD, &txq->txpkts1_wrs, "# of txpkts (type 1) work requests"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts0_pkts", CTLFLAG_RD, &txq->txpkts0_pkts, "# of frames tx'd using type0 txpkts work requests"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts1_pkts", CTLFLAG_RD, &txq->txpkts1_pkts, "# of frames tx'd using type1 txpkts work requests"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "txpkts_flush", CTLFLAG_RD, &txq->txpkts_flush, "# of times txpkts had to be flushed out by an egress-update"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD, &txq->raw_wrs, "# of raw work requests (non-packets)"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vxlan_tso_wrs", CTLFLAG_RD, &txq->vxlan_tso_wrs, "# of VXLAN TSO work requests"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "vxlan_txcsum", CTLFLAG_RD, &txq->vxlan_txcsum, "# of times hardware assisted with inner checksums (VXLAN)"); #ifdef KERN_TLS if (is_ktls(sc)) { SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_records", CTLFLAG_RD, &txq->kern_tls_records, "# of NIC TLS records transmitted"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_short", CTLFLAG_RD, &txq->kern_tls_short, "# of short NIC TLS records transmitted"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_partial", CTLFLAG_RD, &txq->kern_tls_partial, "# of partial NIC TLS records transmitted"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_full", CTLFLAG_RD, &txq->kern_tls_full, "# of full NIC TLS records transmitted"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_octets", CTLFLAG_RD, &txq->kern_tls_octets, "# of payload octets in transmitted NIC TLS records"); 
SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_waste", CTLFLAG_RD, &txq->kern_tls_waste, "# of octets DMAd but not transmitted in NIC TLS records"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_options", CTLFLAG_RD, &txq->kern_tls_options, "# of NIC TLS options-only packets transmitted"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_header", CTLFLAG_RD, &txq->kern_tls_header, "# of NIC TLS header-only packets transmitted"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_fin", CTLFLAG_RD, &txq->kern_tls_fin, "# of NIC TLS FIN-only packets transmitted"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_fin_short", CTLFLAG_RD, &txq->kern_tls_fin_short, "# of NIC TLS padded FIN packets on short TLS records"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_cbc", CTLFLAG_RD, &txq->kern_tls_cbc, "# of NIC TLS sessions using AES-CBC"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "kern_tls_gcm", CTLFLAG_RD, &txq->kern_tls_gcm, "# of NIC TLS sessions using AES-GCM"); } #endif } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) /* * Idempotent. */ static int alloc_ofld_txq(struct vi_info *vi, struct sge_ofld_txq *ofld_txq, int idx) { struct sysctl_oid *oid; struct port_info *pi = vi->pi; struct adapter *sc = vi->adapter; struct sge_eq *eq = &ofld_txq->wrq.eq; int rc, iqidx; char name[16]; MPASS(idx >= 0); MPASS(idx < vi->nofldtxq); if (!(eq->flags & EQ_SW_ALLOCATED)) { snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(vi->ofld_txq_oid), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "offload tx queue"); snprintf(name, sizeof(name), "%s ofld_txq%d", device_get_nameunit(vi->dev), idx); if (vi->nofldrxq > 0) { iqidx = vi->first_ofld_rxq + (idx % vi->nofldrxq); init_eq(sc, eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan, &sc->sge.ofld_rxq[iqidx].iq, name); } else { iqidx = vi->first_rxq + (idx % vi->nrxq); init_eq(sc, eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan, &sc->sge.rxq[iqidx].iq, name); } rc = alloc_wrq(sc, vi, &ofld_txq->wrq, &vi->ctx, oid); if (rc != 0) { CH_ERR(vi, "failed to allocate ofld_txq%d: %d\n", idx, rc); sysctl_remove_oid(oid, 1, 1); return (rc); } MPASS(eq->flags & EQ_SW_ALLOCATED); /* Can't fail after this point. */ ofld_txq->tx_iscsi_pdus = counter_u64_alloc(M_WAITOK); ofld_txq->tx_iscsi_octets = counter_u64_alloc(M_WAITOK); ofld_txq->tx_iscsi_iso_wrs = counter_u64_alloc(M_WAITOK); ofld_txq->tx_toe_tls_records = counter_u64_alloc(M_WAITOK); ofld_txq->tx_toe_tls_octets = counter_u64_alloc(M_WAITOK); add_ofld_txq_sysctls(&vi->ctx, oid, ofld_txq); } if (!(eq->flags & EQ_HW_ALLOCATED)) { rc = alloc_eq_hwq(sc, vi, eq); if (rc != 0) { CH_ERR(vi, "failed to create hw ofld_txq%d: %d\n", idx, rc); return (rc); } MPASS(eq->flags & EQ_HW_ALLOCATED); } return (0); } /* * Idempotent. 
*/ static void free_ofld_txq(struct vi_info *vi, struct sge_ofld_txq *ofld_txq) { struct adapter *sc = vi->adapter; struct sge_eq *eq = &ofld_txq->wrq.eq; if (eq->flags & EQ_HW_ALLOCATED) { MPASS(eq->flags & EQ_SW_ALLOCATED); free_eq_hwq(sc, NULL, eq); MPASS(!(eq->flags & EQ_HW_ALLOCATED)); } if (eq->flags & EQ_SW_ALLOCATED) { MPASS(!(eq->flags & EQ_HW_ALLOCATED)); counter_u64_free(ofld_txq->tx_iscsi_pdus); counter_u64_free(ofld_txq->tx_iscsi_octets); counter_u64_free(ofld_txq->tx_iscsi_iso_wrs); counter_u64_free(ofld_txq->tx_toe_tls_records); counter_u64_free(ofld_txq->tx_toe_tls_octets); free_wrq(sc, &ofld_txq->wrq); MPASS(!(eq->flags & EQ_SW_ALLOCATED)); bzero(ofld_txq, sizeof(*ofld_txq)); } } static void add_ofld_txq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, struct sge_ofld_txq *ofld_txq) { struct sysctl_oid_list *children; if (ctx == NULL || oid == NULL) return; children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_iscsi_pdus", CTLFLAG_RD, &ofld_txq->tx_iscsi_pdus, "# of iSCSI PDUs transmitted"); SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_iscsi_octets", CTLFLAG_RD, &ofld_txq->tx_iscsi_octets, "# of payload octets in transmitted iSCSI PDUs"); SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_iscsi_iso_wrs", CTLFLAG_RD, &ofld_txq->tx_iscsi_iso_wrs, "# of iSCSI segmentation offload work requests"); SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_toe_tls_records", CTLFLAG_RD, &ofld_txq->tx_toe_tls_records, "# of TOE TLS records transmitted"); SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_toe_tls_octets", CTLFLAG_RD, &ofld_txq->tx_toe_tls_octets, "# of payload octets in transmitted TOE TLS records"); } #endif static void oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error) { bus_addr_t *ba = arg; KASSERT(nseg == 1, ("%s meant for single segment mappings only.", __func__)); *ba = error ? 0 : segs->ds_addr; } static inline void ring_fl_db(struct adapter *sc, struct sge_fl *fl) { uint32_t n, v; n = IDXDIFF(fl->pidx >> 3, fl->dbidx, fl->sidx); MPASS(n > 0); wmb(); v = fl->dbval | V_PIDX(n); if (fl->udb) *fl->udb = htole32(v); else t4_write_reg(sc, sc->sge_kdoorbell_reg, v); IDXINCR(fl->dbidx, n, fl->sidx); } /* * Fills up the freelist by allocating up to 'n' buffers. Buffers that are * recycled do not count towards this allocation budget. * * Returns non-zero to indicate that this freelist should be added to the list * of starving freelists. */ static int refill_fl(struct adapter *sc, struct sge_fl *fl, int n) { __be64 *d; struct fl_sdesc *sd; uintptr_t pa; caddr_t cl; struct rx_buf_info *rxb; struct cluster_metadata *clm; uint16_t max_pidx, zidx = fl->zidx; uint16_t hw_cidx = fl->hw_cidx; /* stable snapshot */ FL_LOCK_ASSERT_OWNED(fl); /* * We always stop at the beginning of the hardware descriptor that's just * before the one with the hw cidx. This is to avoid hw pidx = hw cidx, * which would mean an empty freelist to the chip. */ max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1; if (fl->pidx == max_pidx * 8) return (0); d = &fl->desc[fl->pidx]; sd = &fl->sdesc[fl->pidx]; rxb = &sc->sge.rx_buf_info[zidx]; while (n > 0) { if (sd->cl != NULL) { if (sd->nmbuf == 0) { /* * Fast recycle without involving any atomics on * the cluster's metadata (if the cluster has * metadata). This happens when all frames * received in the cluster were small enough to * fit within a single mbuf each. */ fl->cl_fast_recycled++; goto recycled; } /* * Cluster is guaranteed to have metadata. 
Clusters * without metadata always take the fast recycle path * when they're recycled. */ clm = cl_metadata(sd); MPASS(clm != NULL); if (atomic_fetchadd_int(&clm->refcount, -1) == 1) { fl->cl_recycled++; counter_u64_add(extfree_rels, 1); goto recycled; } sd->cl = NULL; /* gave up my reference */ } MPASS(sd->cl == NULL); cl = uma_zalloc(rxb->zone, M_NOWAIT); if (__predict_false(cl == NULL)) { if (zidx != fl->safe_zidx) { zidx = fl->safe_zidx; rxb = &sc->sge.rx_buf_info[zidx]; cl = uma_zalloc(rxb->zone, M_NOWAIT); } if (cl == NULL) break; } fl->cl_allocated++; n--; pa = pmap_kextract((vm_offset_t)cl); sd->cl = cl; sd->zidx = zidx; if (fl->flags & FL_BUF_PACKING) { *d = htobe64(pa | rxb->hwidx2); sd->moff = rxb->size2; } else { *d = htobe64(pa | rxb->hwidx1); sd->moff = 0; } recycled: sd->nmbuf = 0; d++; sd++; if (__predict_false((++fl->pidx & 7) == 0)) { uint16_t pidx = fl->pidx >> 3; if (__predict_false(pidx == fl->sidx)) { fl->pidx = 0; pidx = 0; sd = fl->sdesc; d = fl->desc; } if (n < 8 || pidx == max_pidx) break; if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4) ring_fl_db(sc, fl); } } if ((fl->pidx >> 3) != fl->dbidx) ring_fl_db(sc, fl); return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING)); } /* * Attempt to refill all starving freelists. */ static void refill_sfl(void *arg) { struct adapter *sc = arg; struct sge_fl *fl, *fl_temp; mtx_assert(&sc->sfl_lock, MA_OWNED); TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) { FL_LOCK(fl); refill_fl(sc, fl, 64); if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) { TAILQ_REMOVE(&sc->sfl, fl, link); fl->flags &= ~FL_STARVING; } FL_UNLOCK(fl); } if (!TAILQ_EMPTY(&sc->sfl)) callout_schedule(&sc->sfl_callout, hz / 5); } /* * Release the driver's reference on all buffers in the given freelist. Buffers * with kernel references cannot be freed and will prevent the driver from being * unloaded safely. */ void free_fl_buffers(struct adapter *sc, struct sge_fl *fl) { struct fl_sdesc *sd; struct cluster_metadata *clm; int i; sd = fl->sdesc; for (i = 0; i < fl->sidx * 8; i++, sd++) { if (sd->cl == NULL) continue; if (sd->nmbuf == 0) uma_zfree(sc->sge.rx_buf_info[sd->zidx].zone, sd->cl); else if (fl->flags & FL_BUF_PACKING) { clm = cl_metadata(sd); if (atomic_fetchadd_int(&clm->refcount, -1) == 1) { uma_zfree(sc->sge.rx_buf_info[sd->zidx].zone, sd->cl); counter_u64_add(extfree_rels, 1); } } sd->cl = NULL; } if (fl->flags & FL_BUF_RESUME) { m_freem(fl->m0); fl->flags &= ~FL_BUF_RESUME; } } static inline void get_pkt_gl(struct mbuf *m, struct sglist *gl) { int rc; M_ASSERTPKTHDR(m); sglist_reset(gl); rc = sglist_append_mbuf(gl, m); if (__predict_false(rc != 0)) { panic("%s: mbuf %p (%d segs) was vetted earlier but now fails " "with %d.", __func__, m, mbuf_nsegs(m), rc); } KASSERT(gl->sg_nseg == mbuf_nsegs(m), ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m, mbuf_nsegs(m), gl->sg_nseg)); #if 0 /* vm_wr not readily available here. */ KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= max_nsegs_allowed(m, vm_wr), ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__, gl->sg_nseg, max_nsegs_allowed(m, vm_wr))); #endif } /* * len16 for a txpkt WR with a GL. Includes the firmware work request header. */ static inline u_int txpkt_len16(u_int nsegs, const u_int extra) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = extra + sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); return (howmany(n, 16)); } /* * len16 for a txpkt_vm WR with a GL. 
Includes the firmware work * request header. */ static inline u_int txpkt_vm_len16(u_int nsegs, const u_int extra) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = extra + sizeof(struct fw_eth_tx_pkt_vm_wr) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); return (howmany(n, 16)); } static inline void calculate_mbuf_len16(struct mbuf *m, bool vm_wr) { const int lso = sizeof(struct cpl_tx_pkt_lso_core); const int tnl_lso = sizeof(struct cpl_tx_tnl_lso); if (vm_wr) { if (needs_tso(m)) set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), lso)); else set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), 0)); return; } if (needs_tso(m)) { if (needs_vxlan_tso(m)) set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), tnl_lso)); else set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), lso)); } else set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), 0)); } /* * len16 for a txpkts type 0 WR with a GL. Does not include the firmware work * request header. */ static inline u_int txpkts0_len16(u_int nsegs) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); return (howmany(n, 16)); } /* * len16 for a txpkts type 1 WR with a GL. Does not include the firmware work * request header. */ static inline u_int txpkts1_len16(void) { u_int n; n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl); return (howmany(n, 16)); } static inline u_int imm_payload(u_int ndesc) { u_int n; n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) - sizeof(struct cpl_tx_pkt_core); return (n); } static inline uint64_t csum_to_ctrl(struct adapter *sc, struct mbuf *m) { uint64_t ctrl; int csum_type, l2hlen, l3hlen; int x, y; static const int csum_types[3][2] = { {TX_CSUM_TCPIP, TX_CSUM_TCPIP6}, {TX_CSUM_UDPIP, TX_CSUM_UDPIP6}, {TX_CSUM_IP, 0} }; M_ASSERTPKTHDR(m); if (!needs_hwcsum(m)) return (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS); MPASS(m->m_pkthdr.l2hlen >= ETHER_HDR_LEN); MPASS(m->m_pkthdr.l3hlen >= sizeof(struct ip)); if (needs_vxlan_csum(m)) { MPASS(m->m_pkthdr.l4hlen > 0); MPASS(m->m_pkthdr.l5hlen > 0); MPASS(m->m_pkthdr.inner_l2hlen >= ETHER_HDR_LEN); MPASS(m->m_pkthdr.inner_l3hlen >= sizeof(struct ip)); l2hlen = m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + m->m_pkthdr.l4hlen + m->m_pkthdr.l5hlen + m->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN; l3hlen = m->m_pkthdr.inner_l3hlen; } else { l2hlen = m->m_pkthdr.l2hlen - ETHER_HDR_LEN; l3hlen = m->m_pkthdr.l3hlen; } ctrl = 0; if (!needs_l3_csum(m)) ctrl |= F_TXPKT_IPCSUM_DIS; if (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_INNER_IP_TCP | CSUM_IP6_TCP | CSUM_INNER_IP6_TCP)) x = 0; /* TCP */ else if (m->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_INNER_IP_UDP | CSUM_IP6_UDP | CSUM_INNER_IP6_UDP)) x = 1; /* UDP */ else x = 2; if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP | CSUM_INNER_IP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_UDP)) y = 0; /* IPv4 */ else { MPASS(m->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_UDP)); y = 1; /* IPv6 */ } /* * needs_hwcsum returned true earlier so there must be some kind of * checksum to calculate. 
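* For example, CSUM_IP_TCP selects x = 0, y = 0 (TX_CSUM_TCPIP) and CSUM_IP6_UDP selects x = 1, y = 1 (TX_CSUM_UDPIP6); an IP-header-only request ends up with x = 2, y = 0 (TX_CSUM_IP) and has the L4 checksum disabled just below.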
*/ csum_type = csum_types[x][y]; MPASS(csum_type != 0); if (csum_type == TX_CSUM_IP) ctrl |= F_TXPKT_L4CSUM_DIS; ctrl |= V_TXPKT_CSUM_TYPE(csum_type) | V_TXPKT_IPHDR_LEN(l3hlen); if (chip_id(sc) <= CHELSIO_T5) ctrl |= V_TXPKT_ETHHDR_LEN(l2hlen); else ctrl |= V_T6_TXPKT_ETHHDR_LEN(l2hlen); return (ctrl); } static inline void * write_lso_cpl(void *cpl, struct mbuf *m0) { struct cpl_tx_pkt_lso_core *lso; uint32_t ctrl; KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && m0->m_pkthdr.l4hlen > 0, ("%s: mbuf %p needs TSO but missing header lengths", __func__, m0)); ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE | V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2); if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) ctrl |= F_LSO_IPV6; lso = cpl; lso->lso_ctrl = htobe32(ctrl); lso->ipid_ofst = htobe16(0); lso->mss = htobe16(m0->m_pkthdr.tso_segsz); lso->seqno_offset = htobe32(0); lso->len = htobe32(m0->m_pkthdr.len); return (lso + 1); } static void * write_tnl_lso_cpl(void *cpl, struct mbuf *m0) { struct cpl_tx_tnl_lso *tnl_lso = cpl; uint32_t ctrl; KASSERT(m0->m_pkthdr.inner_l2hlen > 0 && m0->m_pkthdr.inner_l3hlen > 0 && m0->m_pkthdr.inner_l4hlen > 0 && m0->m_pkthdr.inner_l5hlen > 0, ("%s: mbuf %p needs VXLAN_TSO but missing inner header lengths", __func__, m0)); KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && m0->m_pkthdr.l4hlen > 0 && m0->m_pkthdr.l5hlen > 0, ("%s: mbuf %p needs VXLAN_TSO but missing outer header lengths", __func__, m0)); /* Outer headers. */ ctrl = V_CPL_TX_TNL_LSO_OPCODE(CPL_TX_TNL_LSO) | F_CPL_TX_TNL_LSO_FIRST | F_CPL_TX_TNL_LSO_LAST | V_CPL_TX_TNL_LSO_ETHHDRLENOUT( (m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) | V_CPL_TX_TNL_LSO_IPHDRLENOUT(m0->m_pkthdr.l3hlen >> 2) | F_CPL_TX_TNL_LSO_IPLENSETOUT; if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) ctrl |= F_CPL_TX_TNL_LSO_IPV6OUT; else { ctrl |= F_CPL_TX_TNL_LSO_IPHDRCHKOUT | F_CPL_TX_TNL_LSO_IPIDINCOUT; } tnl_lso->op_to_IpIdSplitOut = htobe32(ctrl); tnl_lso->IpIdOffsetOut = 0; tnl_lso->UdpLenSetOut_to_TnlHdrLen = htobe16(F_CPL_TX_TNL_LSO_UDPCHKCLROUT | F_CPL_TX_TNL_LSO_UDPLENSETOUT | V_CPL_TX_TNL_LSO_TNLHDRLEN(m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen + m0->m_pkthdr.l5hlen) | V_CPL_TX_TNL_LSO_TNLTYPE(TX_TNL_TYPE_VXLAN)); tnl_lso->r1 = 0; /* Inner headers. */ ctrl = V_CPL_TX_TNL_LSO_ETHHDRLEN( (m0->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN) >> 2) | V_CPL_TX_TNL_LSO_IPHDRLEN(m0->m_pkthdr.inner_l3hlen >> 2) | V_CPL_TX_TNL_LSO_TCPHDRLEN(m0->m_pkthdr.inner_l4hlen >> 2); if (m0->m_pkthdr.inner_l3hlen == sizeof(struct ip6_hdr)) ctrl |= F_CPL_TX_TNL_LSO_IPV6; tnl_lso->Flow_to_TcpHdrLen = htobe32(ctrl); tnl_lso->IpIdOffset = 0; tnl_lso->IpIdSplit_to_Mss = htobe16(V_CPL_TX_TNL_LSO_MSS(m0->m_pkthdr.tso_segsz)); tnl_lso->TCPSeqOffset = 0; tnl_lso->EthLenOffset_Size = htobe32(V_CPL_TX_TNL_LSO_SIZE(m0->m_pkthdr.len)); return (tnl_lso + 1); } #define VM_TX_L2HDR_LEN 16 /* ethmacdst to vlantci */ /* * Write a VM txpkt WR for this packet to the hardware descriptors, update the * software descriptor, and advance the pidx. It is guaranteed that enough * descriptors are available. * * The return value is the # of hardware descriptors used. 
*/ static u_int write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0) { struct sge_eq *eq; struct fw_eth_tx_pkt_vm_wr *wr; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ uint64_t ctrl1; int len16, ndesc, pktlen; caddr_t dst; TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m0); len16 = mbuf_len16(m0); pktlen = m0->m_pkthdr.len; ctrl = sizeof(struct cpl_tx_pkt_core); if (needs_tso(m0)) ctrl += sizeof(struct cpl_tx_pkt_lso_core); ndesc = tx_len16_to_desc(len16); /* Firmware work request header */ eq = &txq->eq; wr = (void *)&eq->desc[eq->pidx]; wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3[0] = 0; wr->r3[1] = 0; /* * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci. * vlantci is ignored unless the ethtype is 0x8100, so it's * simpler to always copy it rather than making it * conditional. Also, it seems that we do not have to set * vlantci or fake the ethtype when doing VLAN tag insertion. */ m_copydata(m0, 0, VM_TX_L2HDR_LEN, wr->ethmacdst); if (needs_tso(m0)) { cpl = write_lso_cpl(wr + 1, m0); txq->tso_wrs++; } else cpl = (void *)(wr + 1); /* Checksum offload */ ctrl1 = csum_to_ctrl(sc, m0); if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ if (needs_vlan_insertion(m0)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); cpl->ctrl1 = htobe64(ctrl1); /* SGL */ dst = (void *)(cpl + 1); /* * A packet using TSO will use up an entire descriptor for the * firmware work request header, LSO CPL, and TX_PKT_XT CPL. * If this descriptor is the last descriptor in the ring, wrap * around to the front of the ring explicitly for the start of * the sgl. */ if (dst == (void *)&eq->desc[eq->sidx]) { dst = (void *)&eq->desc[0]; write_gl_to_txd(txq, m0, &dst, 0); } else write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx); txq->sgl_wrs++; txq->txpkt_wrs++; txsd = &txq->sdesc[eq->pidx]; txsd->m = m0; txsd->desc_used = ndesc; return (ndesc); } /* * Write a raw WR to the hardware descriptors, update the software * descriptor, and advance the pidx. It is guaranteed that enough * descriptors are available. * * The return value is the # of hardware descriptors used. */ static u_int write_raw_wr(struct sge_txq *txq, void *wr, struct mbuf *m0, u_int available) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct mbuf *m; caddr_t dst; int len16, ndesc; len16 = mbuf_len16(m0); ndesc = tx_len16_to_desc(len16); MPASS(ndesc <= available); dst = wr; for (m = m0; m != NULL; m = m->m_next) copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len); txq->raw_wrs++; txsd = &txq->sdesc[eq->pidx]; txsd->m = m0; txsd->desc_used = ndesc; return (ndesc); } /* * Write a txpkt WR for this packet to the hardware descriptors, update the * software descriptor, and advance the pidx. It is guaranteed that enough * descriptors are available. * * The return value is the # of hardware descriptors used. 
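* Packets that are not TSO, do not carry the MC_NOMAP cflag, fit within imm_payload(2), and have at least 2 descriptors available are copied into the WR as immediate data; len16 is then recomputed from the actual payload length and nsegs is forced to 0 so no SGL is written.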
*/ static u_int write_txpkt_wr(struct adapter *sc, struct sge_txq *txq, struct mbuf *m0, u_int available) { struct sge_eq *eq; struct fw_eth_tx_pkt_wr *wr; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ uint64_t ctrl1; int len16, ndesc, pktlen, nsegs; caddr_t dst; TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m0); len16 = mbuf_len16(m0); nsegs = mbuf_nsegs(m0); pktlen = m0->m_pkthdr.len; ctrl = sizeof(struct cpl_tx_pkt_core); if (needs_tso(m0)) { if (needs_vxlan_tso(m0)) ctrl += sizeof(struct cpl_tx_tnl_lso); else ctrl += sizeof(struct cpl_tx_pkt_lso_core); } else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) && available >= 2) { /* Immediate data. Recalculate len16 and set nsegs to 0. */ ctrl += pktlen; len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + pktlen, 16); nsegs = 0; } ndesc = tx_len16_to_desc(len16); MPASS(ndesc <= available); /* Firmware work request header */ eq = &txq->eq; wr = (void *)&eq->desc[eq->pidx]; wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3 = 0; if (needs_tso(m0)) { if (needs_vxlan_tso(m0)) { cpl = write_tnl_lso_cpl(wr + 1, m0); txq->vxlan_tso_wrs++; } else { cpl = write_lso_cpl(wr + 1, m0); txq->tso_wrs++; } } else cpl = (void *)(wr + 1); /* Checksum offload */ ctrl1 = csum_to_ctrl(sc, m0); if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) { /* some hardware assistance provided */ if (needs_vxlan_csum(m0)) txq->vxlan_txcsum++; else txq->txcsum++; } /* VLAN tag insertion */ if (needs_vlan_insertion(m0)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); cpl->ctrl1 = htobe64(ctrl1); /* SGL */ dst = (void *)(cpl + 1); if (__predict_false((uintptr_t)dst == (uintptr_t)&eq->desc[eq->sidx])) dst = (caddr_t)&eq->desc[0]; if (nsegs > 0) { write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx); txq->sgl_wrs++; } else { struct mbuf *m; for (m = m0; m != NULL; m = m->m_next) { copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len); #ifdef INVARIANTS pktlen -= m->m_len; #endif } #ifdef INVARIANTS KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen)); #endif txq->imm_wrs++; } txq->txpkt_wrs++; txsd = &txq->sdesc[eq->pidx]; txsd->m = m0; txsd->desc_used = ndesc; return (ndesc); } static inline bool cmp_l2hdr(struct txpkts *txp, struct mbuf *m) { int len; MPASS(txp->npkt > 0); MPASS(m->m_len >= VM_TX_L2HDR_LEN); if (txp->ethtype == be16toh(ETHERTYPE_VLAN)) len = VM_TX_L2HDR_LEN; else len = sizeof(struct ether_header); return (memcmp(m->m_data, &txp->ethmacdst[0], len) != 0); } static inline void save_l2hdr(struct txpkts *txp, struct mbuf *m) { MPASS(m->m_len >= VM_TX_L2HDR_LEN); memcpy(&txp->ethmacdst[0], mtod(m, const void *), VM_TX_L2HDR_LEN); } static int add_to_txpkts_vf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m, int avail, bool *send) { struct txpkts *txp = &txq->txp; /* Cannot have TSO and coalesce at the same time. 
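* A packet that cannot be coalesced returns EINVAL, and *send tells the caller whether to flush the packets already gathered in txp.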
*/ if (cannot_use_txpkts(m)) { cannot_coalesce: *send = txp->npkt > 0; return (EINVAL); } /* VF allows coalescing of type 1 (1 GL) only */ if (mbuf_nsegs(m) > 1) goto cannot_coalesce; *send = false; if (txp->npkt > 0) { MPASS(tx_len16_to_desc(txp->len16) <= avail); MPASS(txp->npkt < txp->max_npkt); MPASS(txp->wr_type == 1); /* VF supports type 1 only */ if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) > avail) { retry_after_send: *send = true; return (EAGAIN); } if (m->m_pkthdr.len + txp->plen > 65535) goto retry_after_send; if (cmp_l2hdr(txp, m)) goto retry_after_send; txp->len16 += txpkts1_len16(); txp->plen += m->m_pkthdr.len; txp->mb[txp->npkt++] = m; if (txp->npkt == txp->max_npkt) *send = true; } else { txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_vm_wr), 16) + txpkts1_len16(); if (tx_len16_to_desc(txp->len16) > avail) goto cannot_coalesce; txp->npkt = 1; txp->wr_type = 1; txp->plen = m->m_pkthdr.len; txp->mb[0] = m; save_l2hdr(txp, m); } return (0); } static int add_to_txpkts_pf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m, int avail, bool *send) { struct txpkts *txp = &txq->txp; int nsegs; MPASS(!(sc->flags & IS_VF)); /* Cannot have TSO and coalesce at the same time. */ if (cannot_use_txpkts(m)) { cannot_coalesce: *send = txp->npkt > 0; return (EINVAL); } *send = false; nsegs = mbuf_nsegs(m); if (txp->npkt == 0) { if (m->m_pkthdr.len > 65535) goto cannot_coalesce; if (nsegs > 1) { txp->wr_type = 0; txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + txpkts0_len16(nsegs); } else { txp->wr_type = 1; txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + txpkts1_len16(); } if (tx_len16_to_desc(txp->len16) > avail) goto cannot_coalesce; txp->npkt = 1; txp->plen = m->m_pkthdr.len; txp->mb[0] = m; } else { MPASS(tx_len16_to_desc(txp->len16) <= avail); MPASS(txp->npkt < txp->max_npkt); if (m->m_pkthdr.len + txp->plen > 65535) { retry_after_send: *send = true; return (EAGAIN); } MPASS(txp->wr_type == 0 || txp->wr_type == 1); if (txp->wr_type == 0) { if (tx_len16_to_desc(txp->len16 + txpkts0_len16(nsegs)) > min(avail, SGE_MAX_WR_NDESC)) goto retry_after_send; txp->len16 += txpkts0_len16(nsegs); } else { if (nsegs != 1) goto retry_after_send; if (tx_len16_to_desc(txp->len16 + txpkts1_len16()) > avail) goto retry_after_send; txp->len16 += txpkts1_len16(); } txp->plen += m->m_pkthdr.len; txp->mb[txp->npkt++] = m; if (txp->npkt == txp->max_npkt) *send = true; } return (0); } /* * Write a txpkts WR for the packets in txp to the hardware descriptors, update * the software descriptor, and advance the pidx. It is guaranteed that enough * descriptors are available. * * The return value is the # of hardware descriptors used. */ static u_int write_txpkts_wr(struct adapter *sc, struct sge_txq *txq) { const struct txpkts *txp = &txq->txp; struct sge_eq *eq = &txq->eq; struct fw_eth_tx_pkts_wr *wr; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint64_t ctrl1; int ndesc, i, checkwrap; struct mbuf *m, *last; void *flitp; TXQ_LOCK_ASSERT_OWNED(txq); MPASS(txp->npkt > 0); MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16)); wr = (void *)&eq->desc[eq->pidx]; wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)); wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16)); wr->plen = htobe16(txp->plen); wr->npkt = txp->npkt; wr->r3 = 0; wr->type = txp->wr_type; flitp = wr + 1; /* * At this point we are 16B into a hardware descriptor. If checkwrap is * set then we know the WR is going to wrap around somewhere. We'll * check for that at appropriate points. 
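* Specifically, the end of the ring is checked for just before each packet's CPL (type 0 WRs only) and just before its SGL; write_gl_to_txd() handles a wrap inside the SGL itself.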
*/ ndesc = tx_len16_to_desc(txp->len16); last = NULL; checkwrap = eq->sidx - ndesc < eq->pidx; for (i = 0; i < txp->npkt; i++) { m = txp->mb[i]; if (txp->wr_type == 0) { struct ulp_txpkt *ulpmc; struct ulptx_idata *ulpsc; /* ULP master command */ ulpmc = flitp; ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid)); ulpmc->len = htobe32(txpkts0_len16(mbuf_nsegs(m))); /* ULP subcommand */ ulpsc = (void *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) | F_ULP_TX_SC_MORE); ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core)); cpl = (void *)(ulpsc + 1); if (checkwrap && (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx]) cpl = (void *)&eq->desc[0]; } else { cpl = flitp; } /* Checksum offload */ ctrl1 = csum_to_ctrl(sc, m); if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) { /* some hardware assistance provided */ if (needs_vxlan_csum(m)) txq->vxlan_txcsum++; else txq->txcsum++; } /* VLAN tag insertion */ if (needs_vlan_insertion(m)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(m->m_pkthdr.len); cpl->ctrl1 = htobe64(ctrl1); flitp = cpl + 1; if (checkwrap && (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx]) flitp = (void *)&eq->desc[0]; write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap); if (last != NULL) last->m_nextpkt = m; last = m; } txq->sgl_wrs++; if (txp->wr_type == 0) { txq->txpkts0_pkts += txp->npkt; txq->txpkts0_wrs++; } else { txq->txpkts1_pkts += txp->npkt; txq->txpkts1_wrs++; } txsd = &txq->sdesc[eq->pidx]; txsd->m = txp->mb[0]; txsd->desc_used = ndesc; return (ndesc); } static u_int write_txpkts_vm_wr(struct adapter *sc, struct sge_txq *txq) { const struct txpkts *txp = &txq->txp; struct sge_eq *eq = &txq->eq; struct fw_eth_tx_pkts_vm_wr *wr; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint64_t ctrl1; int ndesc, i; struct mbuf *m, *last; void *flitp; TXQ_LOCK_ASSERT_OWNED(txq); MPASS(txp->npkt > 0); MPASS(txp->wr_type == 1); /* VF supports type 1 only */ MPASS(txp->mb[0] != NULL); MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16)); wr = (void *)&eq->desc[eq->pidx]; wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_VM_WR)); wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(txp->len16)); wr->r3 = 0; wr->plen = htobe16(txp->plen); wr->npkt = txp->npkt; wr->r4 = 0; memcpy(&wr->ethmacdst[0], &txp->ethmacdst[0], 16); flitp = wr + 1; /* * At this point we are 32B into a hardware descriptor. Each mbuf in * the WR will take 32B so we check for the end of the descriptor ring * before writing odd mbufs (mb[1], 3, 5, ..) 
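* (Assuming 64B hardware descriptors, even-numbered mbufs always begin 32B into a descriptor and so can never coincide with the end of the ring, which is why the check below runs only for odd i.)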
*/ ndesc = tx_len16_to_desc(txp->len16); last = NULL; for (i = 0; i < txp->npkt; i++) { m = txp->mb[i]; if (i & 1 && (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx]) flitp = &eq->desc[0]; cpl = flitp; /* Checksum offload */ ctrl1 = csum_to_ctrl(sc, m); if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ if (needs_vlan_insertion(m)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(m->m_pkthdr.len); cpl->ctrl1 = htobe64(ctrl1); flitp = cpl + 1; MPASS(mbuf_nsegs(m) == 1); write_gl_to_txd(txq, m, (caddr_t *)(&flitp), 0); if (last != NULL) last->m_nextpkt = m; last = m; } txq->sgl_wrs++; txq->txpkts1_pkts += txp->npkt; txq->txpkts1_wrs++; txsd = &txq->sdesc[eq->pidx]; txsd->m = txp->mb[0]; txsd->desc_used = ndesc; return (ndesc); } /* * If the SGL ends on an address that is not 16 byte aligned, this function will * add a 0 filled flit at the end. */ static void write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap) { struct sge_eq *eq = &txq->eq; struct sglist *gl = txq->gl; struct sglist_seg *seg; __be64 *flitp, *wrap; struct ulptx_sgl *usgl; int i, nflits, nsegs; KASSERT(((uintptr_t)(*to) & 0xf) == 0, ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to)); MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); get_pkt_gl(m, gl); nsegs = gl->sg_nseg; MPASS(nsegs > 0); nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2; flitp = (__be64 *)(*to); wrap = (__be64 *)(&eq->desc[eq->sidx]); seg = &gl->sg_segs[0]; usgl = (void *)flitp; /* * We start at a 16 byte boundary somewhere inside the tx descriptor * ring, so we're at least 16 bytes away from the status page. There is * no chance of a wrap around in the middle of usgl (which is 16 bytes). 
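* For example, nsegs = 3 gives nflits = (3 * 2) / 2 + (2 & 1) + 2 = 5: two flits for the ulptx_sgl itself (cmd_nsge/len0 and addr0) plus three flits packing the remaining two segments.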
*/ usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); usgl->len0 = htobe32(seg->ss_len); usgl->addr0 = htobe64(seg->ss_paddr); seg++; if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) { /* Won't wrap around at all */ for (i = 0; i < nsegs - 1; i++, seg++) { usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr); } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); flitp += nflits; } else { /* Will wrap somewhere in the rest of the SGL */ /* 2 flits already written, write the rest flit by flit */ flitp = (void *)(usgl + 1); for (i = 0; i < nflits - 2; i++) { if (flitp == wrap) flitp = (void *)eq->desc; *flitp++ = get_flit(seg, nsegs - 1, i); } } if (nflits & 1) { MPASS(((uintptr_t)flitp) & 0xf); *flitp++ = 0; } MPASS((((uintptr_t)flitp) & 0xf) == 0); if (__predict_false(flitp == wrap)) *to = (void *)eq->desc; else *to = (void *)flitp; } static inline void copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) { MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)&eq->desc[eq->sidx])) { bcopy(from, *to, len); (*to) += len; } else { int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to); bcopy(from, *to, portion); from += portion; portion = len - portion; /* remaining */ bcopy(from, (void *)eq->desc, portion); (*to) = (caddr_t)eq->desc + portion; } } static inline void ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n) { u_int db; MPASS(n > 0); db = eq->doorbells; if (n > 1) clrbit(&db, DOORBELL_WCWR); wmb(); switch (ffs(db) - 1) { case DOORBELL_UDB: *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); break; case DOORBELL_WCWR: { volatile uint64_t *dst, *src; int i; /* * Queues whose 128B doorbell segment fits in the page do not * use relative qid (udb_qid is always 0). Only queues with * doorbell segments can do WCWR. */ KASSERT(eq->udb_qid == 0 && n == 1, ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p", __func__, eq->doorbells, n, eq->dbidx, eq)); dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET - UDBS_DB_OFFSET); i = eq->dbidx; src = (void *)&eq->desc[i]; while (src != (void *)&eq->desc[i + 1]) *dst++ = *src++; wmb(); break; } case DOORBELL_UDBWC: *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); wmb(); break; case DOORBELL_KDB: t4_write_reg(sc, sc->sge_kdoorbell_reg, V_QID(eq->cntxt_id) | V_PIDX(n)); break; } IDXINCR(eq->dbidx, n, eq->sidx); } static inline u_int reclaimable_tx_desc(struct sge_eq *eq) { uint16_t hw_cidx; hw_cidx = read_hw_cidx(eq); return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx)); } static inline u_int total_available_tx_desc(struct sge_eq *eq) { uint16_t hw_cidx, pidx; hw_cidx = read_hw_cidx(eq); pidx = eq->pidx; if (pidx == hw_cidx) return (eq->sidx - 1); else return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1); } static inline uint16_t read_hw_cidx(struct sge_eq *eq) { struct sge_qstat *spg = (void *)&eq->desc[eq->sidx]; uint16_t cidx = spg->cidx; /* stable snapshot */ return (be16toh(cidx)); } /* * Reclaim 'n' descriptors approximately. 
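* "Approximately" because reclamation happens in whole work requests: the loop stops once at least 'n' descriptors have been reclaimed, so the last WR may push the total past 'n' (e.g. n = 32 with 5-descriptor WRs reclaims 35).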
*/ static u_int reclaim_tx_descs(struct sge_txq *txq, u_int n) { struct tx_sdesc *txsd; struct sge_eq *eq = &txq->eq; u_int can_reclaim, reclaimed; TXQ_LOCK_ASSERT_OWNED(txq); MPASS(n > 0); reclaimed = 0; can_reclaim = reclaimable_tx_desc(eq); while (can_reclaim && reclaimed < n) { int ndesc; struct mbuf *m, *nextpkt; txsd = &txq->sdesc[eq->cidx]; ndesc = txsd->desc_used; /* Firmware doesn't return "partial" credits. */ KASSERT(can_reclaim >= ndesc, ("%s: unexpected number of credits: %d, %d", __func__, can_reclaim, ndesc)); KASSERT(ndesc != 0, ("%s: descriptor with no credits: cidx %d", __func__, eq->cidx)); for (m = txsd->m; m != NULL; m = nextpkt) { nextpkt = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); } reclaimed += ndesc; can_reclaim -= ndesc; IDXINCR(eq->cidx, ndesc, eq->sidx); } return (reclaimed); } static void tx_reclaim(void *arg, int n) { struct sge_txq *txq = arg; struct sge_eq *eq = &txq->eq; do { if (TXQ_TRYLOCK(txq) == 0) break; n = reclaim_tx_descs(txq, 32); if (eq->cidx == eq->pidx) eq->equeqidx = eq->pidx; TXQ_UNLOCK(txq); } while (n > 0); } static __be64 get_flit(struct sglist_seg *segs, int nsegs, int idx) { int i = (idx / 3) * 2; switch (idx % 3) { case 0: { uint64_t rc; rc = (uint64_t)segs[i].ss_len << 32; if (i + 1 < nsegs) rc |= (uint64_t)(segs[i + 1].ss_len); return (htobe64(rc)); } case 1: return (htobe64(segs[i].ss_paddr)); case 2: return (htobe64(segs[i + 1].ss_paddr)); } return (0); } static int find_refill_source(struct adapter *sc, int maxp, bool packing) { int i, zidx = -1; struct rx_buf_info *rxb = &sc->sge.rx_buf_info[0]; if (packing) { for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) { if (rxb->hwidx2 == -1) continue; if (rxb->size1 < PAGE_SIZE && rxb->size1 < largest_rx_cluster) continue; if (rxb->size1 > largest_rx_cluster) break; MPASS(rxb->size1 - rxb->size2 >= CL_METADATA_SIZE); if (rxb->size2 >= maxp) return (i); zidx = i; } } else { for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) { if (rxb->hwidx1 == -1) continue; if (rxb->size1 > largest_rx_cluster) break; if (rxb->size1 >= maxp) return (i); zidx = i; } } return (zidx); } static void add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl) { mtx_lock(&sc->sfl_lock); FL_LOCK(fl); if ((fl->flags & FL_DOOMED) == 0) { fl->flags |= FL_STARVING; TAILQ_INSERT_TAIL(&sc->sfl, fl, link); callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc); } FL_UNLOCK(fl); mtx_unlock(&sc->sfl_lock); } static void handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq) { struct sge_wrq *wrq = (void *)eq; atomic_readandclear_int(&eq->equiq); taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task); } static void handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq) { struct sge_txq *txq = (void *)eq; MPASS(eq->type == EQ_ETH); atomic_readandclear_int(&eq->equiq); if (mp_ring_is_idle(txq->r)) taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task); else mp_ring_check_drainage(txq->r, 64); } static int handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1); unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid)); struct adapter *sc = iq->adapter; struct sge *s = &sc->sge; struct sge_eq *eq; static void (*h[])(struct adapter *, struct sge_eq *) = {NULL, &handle_wrq_egr_update, &handle_eth_egr_update, &handle_wrq_egr_update}; KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); eq = s->eqmap[qid - s->eq_start - s->eq_base]; (*h[eq->type])(sc, eq); return (0); } /* handle_fw_msg works for both fw4_msg 
and fw6_msg because this is valid */ CTASSERT(offsetof(struct cpl_fw4_msg, data) == \ offsetof(struct cpl_fw6_msg, data)); static int handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_fw6_msg *cpl = (const void *)(rss + 1); KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) { const struct rss_header *rss2; rss2 = (const struct rss_header *)&cpl->data[0]; return (t4_cpl_handler[rss2->opcode](iq, rss2, m)); } return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0])); } /** * t4_handle_wrerr_rpl - process a FW work request error message * @adap: the adapter * @rpl: start of the FW message */ static int t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl) { u8 opcode = *(const u8 *)rpl; const struct fw_error_cmd *e = (const void *)rpl; unsigned int i; if (opcode != FW_ERROR_CMD) { log(LOG_ERR, "%s: Received WRERR_RPL message with opcode %#x\n", device_get_nameunit(adap->dev), opcode); return (EINVAL); } log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev), G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" : "non-fatal"); switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) { case FW_ERROR_TYPE_EXCEPTION: log(LOG_ERR, "exception info:\n"); for (i = 0; i < nitems(e->u.exception.info); i++) log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ", be32toh(e->u.exception.info[i])); log(LOG_ERR, "\n"); break; case FW_ERROR_TYPE_HWMODULE: log(LOG_ERR, "HW module regaddr %08x regval %08x\n", be32toh(e->u.hwmodule.regaddr), be32toh(e->u.hwmodule.regval)); break; case FW_ERROR_TYPE_WR: log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n", be16toh(e->u.wr.cidx), G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)), G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)), be32toh(e->u.wr.eqid)); for (i = 0; i < nitems(e->u.wr.wrhdr); i++) log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ", e->u.wr.wrhdr[i]); log(LOG_ERR, "\n"); break; case FW_ERROR_TYPE_ACL: log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s", be16toh(e->u.acl.cidx), G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)), G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)), be32toh(e->u.acl.eqid), G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? "vlanid" : "MAC"); for (i = 0; i < nitems(e->u.acl.val); i++) log(LOG_ERR, " %02x", e->u.acl.val[i]); log(LOG_ERR, "\n"); break; default: log(LOG_ERR, "type %#x\n", G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))); return (EINVAL); } return (0); } static inline bool bufidx_used(struct adapter *sc, int idx) { struct rx_buf_info *rxb = &sc->sge.rx_buf_info[0]; int i; for (i = 0; i < SW_ZONE_SIZES; i++, rxb++) { if (rxb->size1 > largest_rx_cluster) continue; if (rxb->hwidx1 == idx || rxb->hwidx2 == idx) return (true); } return (false); } static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sge_params *sp = &sc->params.sge; int i, rc; struct sbuf sb; char c; sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND); for (i = 0; i < SGE_FLBUF_SIZES; i++) { if (bufidx_used(sc, i)) c = '*'; else c = '\0'; sbuf_printf(&sb, "%u%c ", sp->sge_fl_buffer_size[i], c); } sbuf_trim(&sb); sbuf_finish(&sb); rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (rc); } #ifdef RATELIMIT #if defined(INET) || defined(INET6) /* * len16 for a txpkt WR with a GL. Includes the firmware work request header. 
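* This is the ETHOFLD (EO) variant: immhdrs is the number of header bytes carried as immediate data, and the segment arithmetic mirrors txpkt_len16() (e.g. nsegs = 3 adds 8 * ((3 * 2) / 2 + 0) = 24 bytes on top of the ulptx_sgl for the two segments beyond the first).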
*/ static inline u_int txpkt_eo_len16(u_int nsegs, u_int immhdrs, u_int tso) { u_int n; MPASS(immhdrs > 0); n = roundup2(sizeof(struct fw_eth_tx_eo_wr) + sizeof(struct cpl_tx_pkt_core) + immhdrs, 16); if (__predict_false(nsegs == 0)) goto done; nsegs--; /* first segment is part of ulptx_sgl */ n += sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); if (tso) n += sizeof(struct cpl_tx_pkt_lso_core); done: return (howmany(n, 16)); } #endif #define ETID_FLOWC_NPARAMS 6 #define ETID_FLOWC_LEN (roundup2((sizeof(struct fw_flowc_wr) + \ ETID_FLOWC_NPARAMS * sizeof(struct fw_flowc_mnemval)), 16)) #define ETID_FLOWC_LEN16 (howmany(ETID_FLOWC_LEN, 16)) static int send_etid_flowc_wr(struct cxgbe_rate_tag *cst, struct port_info *pi, struct vi_info *vi) { struct wrq_cookie cookie; u_int pfvf = pi->adapter->pf << S_FW_VIID_PFN; struct fw_flowc_wr *flowc; mtx_assert(&cst->lock, MA_OWNED); MPASS((cst->flags & (EO_FLOWC_PENDING | EO_FLOWC_RPL_PENDING)) == EO_FLOWC_PENDING); flowc = start_wrq_wr(&cst->eo_txq->wrq, ETID_FLOWC_LEN16, &cookie); if (__predict_false(flowc == NULL)) return (ENOMEM); bzero(flowc, ETID_FLOWC_LEN); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(ETID_FLOWC_NPARAMS) | V_FW_WR_COMPL(0)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(ETID_FLOWC_LEN16) | V_FW_WR_FLOWID(cst->etid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; flowc->mnemval[0].val = htobe32(pfvf); flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; flowc->mnemval[1].val = htobe32(pi->tx_chan); flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; flowc->mnemval[2].val = htobe32(pi->tx_chan); flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; flowc->mnemval[3].val = htobe32(cst->iqid); flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_EOSTATE; flowc->mnemval[4].val = htobe32(FW_FLOWC_MNEM_EOSTATE_ESTABLISHED); flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; flowc->mnemval[5].val = htobe32(cst->schedcl); commit_wrq_wr(&cst->eo_txq->wrq, flowc, &cookie); cst->flags &= ~EO_FLOWC_PENDING; cst->flags |= EO_FLOWC_RPL_PENDING; MPASS(cst->tx_credits >= ETID_FLOWC_LEN16); /* flowc is first WR. */ cst->tx_credits -= ETID_FLOWC_LEN16; return (0); } #define ETID_FLUSH_LEN16 (howmany(sizeof (struct fw_flowc_wr), 16)) void send_etid_flush_wr(struct cxgbe_rate_tag *cst) { struct fw_flowc_wr *flowc; struct wrq_cookie cookie; mtx_assert(&cst->lock, MA_OWNED); flowc = start_wrq_wr(&cst->eo_txq->wrq, ETID_FLUSH_LEN16, &cookie); if (__predict_false(flowc == NULL)) CXGBE_UNIMPLEMENTED(__func__); bzero(flowc, ETID_FLUSH_LEN16 * 16); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(0) | F_FW_WR_COMPL); flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLUSH_LEN16) | V_FW_WR_FLOWID(cst->etid)); commit_wrq_wr(&cst->eo_txq->wrq, flowc, &cookie); cst->flags |= EO_FLUSH_RPL_PENDING; MPASS(cst->tx_credits >= ETID_FLUSH_LEN16); cst->tx_credits -= ETID_FLUSH_LEN16; cst->ncompl++; } static void write_ethofld_wr(struct cxgbe_rate_tag *cst, struct fw_eth_tx_eo_wr *wr, struct mbuf *m0, int compl) { struct cpl_tx_pkt_core *cpl; uint64_t ctrl1; uint32_t ctrl; /* used in many unrelated places */ int len16, pktlen, nsegs, immhdrs; uintptr_t p; struct ulptx_sgl *usgl; struct sglist sg; struct sglist_seg segs[38]; /* XXX: find real limit. 
XXX: get off the stack */ mtx_assert(&cst->lock, MA_OWNED); M_ASSERTPKTHDR(m0); KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && m0->m_pkthdr.l4hlen > 0, ("%s: ethofld mbuf %p is missing header lengths", __func__, m0)); len16 = mbuf_eo_len16(m0); nsegs = mbuf_eo_nsegs(m0); pktlen = m0->m_pkthdr.len; ctrl = sizeof(struct cpl_tx_pkt_core); if (needs_tso(m0)) ctrl += sizeof(struct cpl_tx_pkt_lso_core); immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen; ctrl += immhdrs; wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_EO_WR) | V_FW_ETH_TX_EO_WR_IMMDLEN(ctrl) | V_FW_WR_COMPL(!!compl)); wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) | V_FW_WR_FLOWID(cst->etid)); wr->r3 = 0; if (needs_outer_udp_csum(m0)) { wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG; wr->u.udpseg.ethlen = m0->m_pkthdr.l2hlen; wr->u.udpseg.iplen = htobe16(m0->m_pkthdr.l3hlen); wr->u.udpseg.udplen = m0->m_pkthdr.l4hlen; wr->u.udpseg.rtplen = 0; wr->u.udpseg.r4 = 0; wr->u.udpseg.mss = htobe16(pktlen - immhdrs); wr->u.udpseg.schedpktsize = wr->u.udpseg.mss; wr->u.udpseg.plen = htobe32(pktlen - immhdrs); cpl = (void *)(wr + 1); } else { MPASS(needs_outer_tcp_csum(m0)); wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG; wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen; wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen); wr->u.tcpseg.tcplen = m0->m_pkthdr.l4hlen; wr->u.tcpseg.tsclk_tsoff = mbuf_eo_tsclk_tsoff(m0); wr->u.tcpseg.r4 = 0; wr->u.tcpseg.r5 = 0; wr->u.tcpseg.plen = htobe32(pktlen - immhdrs); if (needs_tso(m0)) { struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1); wr->u.tcpseg.mss = htobe16(m0->m_pkthdr.tso_segsz); ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE | V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2); if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) ctrl |= F_LSO_IPV6; lso->lso_ctrl = htobe32(ctrl); lso->ipid_ofst = htobe16(0); lso->mss = htobe16(m0->m_pkthdr.tso_segsz); lso->seqno_offset = htobe32(0); lso->len = htobe32(pktlen); cpl = (void *)(lso + 1); } else { wr->u.tcpseg.mss = htobe16(0xffff); cpl = (void *)(wr + 1); } } /* Checksum offload must be requested for ethofld. */ MPASS(needs_outer_l4_csum(m0)); ctrl1 = csum_to_ctrl(cst->adapter, m0); /* VLAN tag insertion */ if (needs_vlan_insertion(m0)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); } /* CPL header */ cpl->ctrl0 = cst->ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); cpl->ctrl1 = htobe64(ctrl1); /* Copy Ethernet, IP & TCP/UDP hdrs as immediate data */ p = (uintptr_t)(cpl + 1); m_copydata(m0, 0, immhdrs, (void *)p); /* SGL */ if (nsegs > 0) { int i, pad; /* zero-pad upto next 16Byte boundary, if not 16Byte aligned */ p += immhdrs; pad = 16 - (immhdrs & 0xf); bzero((void *)p, pad); usgl = (void *)(p + pad); usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); sglist_init(&sg, nitems(segs), segs); for (; m0 != NULL; m0 = m0->m_next) { if (__predict_false(m0->m_len == 0)) continue; if (immhdrs >= m0->m_len) { immhdrs -= m0->m_len; continue; } if (m0->m_flags & M_EXTPG) sglist_append_mbuf_epg(&sg, m0, mtod(m0, vm_offset_t), m0->m_len); else sglist_append(&sg, mtod(m0, char *) + immhdrs, m0->m_len - immhdrs); immhdrs = 0; } MPASS(sg.sg_nseg == nsegs); /* * Zero pad last 8B in case the WR doesn't end on a 16B * boundary. 
*/ *(uint64_t *)((char *)wr + len16 * 16 - 8) = 0; usgl->len0 = htobe32(segs[0].ss_len); usgl->addr0 = htobe64(segs[0].ss_paddr); for (i = 0; i < nsegs - 1; i++) { usgl->sge[i / 2].len[i & 1] = htobe32(segs[i + 1].ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(segs[i + 1].ss_paddr); } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); } } static void ethofld_tx(struct cxgbe_rate_tag *cst) { struct mbuf *m; struct wrq_cookie cookie; int next_credits, compl; struct fw_eth_tx_eo_wr *wr; mtx_assert(&cst->lock, MA_OWNED); while ((m = mbufq_first(&cst->pending_tx)) != NULL) { M_ASSERTPKTHDR(m); /* How many len16 credits do we need to send this mbuf. */ next_credits = mbuf_eo_len16(m); MPASS(next_credits > 0); if (next_credits > cst->tx_credits) { /* * Tx will make progress eventually because there is at * least one outstanding fw4_ack that will return * credits and kick the tx. */ MPASS(cst->ncompl > 0); return; } wr = start_wrq_wr(&cst->eo_txq->wrq, next_credits, &cookie); if (__predict_false(wr == NULL)) { /* XXX: wishful thinking, not a real assertion. */ MPASS(cst->ncompl > 0); return; } cst->tx_credits -= next_credits; cst->tx_nocompl += next_credits; compl = cst->ncompl == 0 || cst->tx_nocompl >= cst->tx_total / 2; ETHER_BPF_MTAP(cst->com.ifp, m); write_ethofld_wr(cst, wr, m, compl); commit_wrq_wr(&cst->eo_txq->wrq, wr, &cookie); if (compl) { cst->ncompl++; cst->tx_nocompl = 0; } (void) mbufq_dequeue(&cst->pending_tx); /* * Drop the mbuf's reference on the tag now rather * than waiting until m_freem(). This ensures that * cxgbe_rate_tag_free gets called when the inp drops * its reference on the tag and there are no more * mbufs in the pending_tx queue and can flush any * pending requests. Otherwise if the last mbuf * doesn't request a completion the etid will never be * released. */ m->m_pkthdr.snd_tag = NULL; m->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; m_snd_tag_rele(&cst->com); mbufq_enqueue(&cst->pending_fwack, m); } } static int -ethofld_transmit(struct ifnet *ifp, struct mbuf *m0) +ethofld_transmit(if_t ifp, struct mbuf *m0) { struct cxgbe_rate_tag *cst; int rc; MPASS(m0->m_nextpkt == NULL); MPASS(m0->m_pkthdr.csum_flags & CSUM_SND_TAG); MPASS(m0->m_pkthdr.snd_tag != NULL); cst = mst_to_crt(m0->m_pkthdr.snd_tag); mtx_lock(&cst->lock); MPASS(cst->flags & EO_SND_TAG_REF); if (__predict_false(cst->flags & EO_FLOWC_PENDING)) { - struct vi_info *vi = ifp->if_softc; + struct vi_info *vi = if_getsoftc(ifp); struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; const uint32_t rss_mask = vi->rss_size - 1; uint32_t rss_hash; cst->eo_txq = &sc->sge.ofld_txq[vi->first_ofld_txq]; if (M_HASHTYPE_ISHASH(m0)) rss_hash = m0->m_pkthdr.flowid; else rss_hash = arc4random(); /* We assume RSS hashing */ cst->iqid = vi->rss[rss_hash & rss_mask]; cst->eo_txq += rss_hash % vi->nofldtxq; rc = send_etid_flowc_wr(cst, pi, vi); if (rc != 0) goto done; } if (__predict_false(cst->plen + m0->m_pkthdr.len > eo_max_backlog)) { rc = ENOBUFS; goto done; } mbufq_enqueue(&cst->pending_tx, m0); cst->plen += m0->m_pkthdr.len; /* * Hold an extra reference on the tag while generating work * requests to ensure that we don't try to free the tag during * ethofld_tx() in case we are sending the final mbuf after * the inp was freed. 
*/ m_snd_tag_ref(&cst->com); ethofld_tx(cst); mtx_unlock(&cst->lock); m_snd_tag_rele(&cst->com); return (0); done: mtx_unlock(&cst->lock); return (rc); } static int ethofld_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0) { struct adapter *sc = iq->adapter; const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); struct mbuf *m; u_int etid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); struct cxgbe_rate_tag *cst; uint8_t credits = cpl->credits; cst = lookup_etid(sc, etid); mtx_lock(&cst->lock); if (__predict_false(cst->flags & EO_FLOWC_RPL_PENDING)) { MPASS(credits >= ETID_FLOWC_LEN16); credits -= ETID_FLOWC_LEN16; cst->flags &= ~EO_FLOWC_RPL_PENDING; } KASSERT(cst->ncompl > 0, ("%s: etid %u (%p) wasn't expecting completion.", __func__, etid, cst)); cst->ncompl--; while (credits > 0) { m = mbufq_dequeue(&cst->pending_fwack); if (__predict_false(m == NULL)) { /* * The remaining credits are for the final flush that * was issued when the tag was freed by the kernel. */ MPASS((cst->flags & (EO_FLUSH_RPL_PENDING | EO_SND_TAG_REF)) == EO_FLUSH_RPL_PENDING); MPASS(credits == ETID_FLUSH_LEN16); MPASS(cst->tx_credits + cpl->credits == cst->tx_total); MPASS(cst->ncompl == 0); cst->flags &= ~EO_FLUSH_RPL_PENDING; cst->tx_credits += cpl->credits; cxgbe_rate_tag_free_locked(cst); return (0); /* cst is gone. */ } KASSERT(m != NULL, ("%s: too many credits (%u, %u)", __func__, cpl->credits, credits)); KASSERT(credits >= mbuf_eo_len16(m), ("%s: too few credits (%u, %u, %u)", __func__, cpl->credits, credits, mbuf_eo_len16(m))); credits -= mbuf_eo_len16(m); cst->plen -= m->m_pkthdr.len; m_freem(m); } cst->tx_credits += cpl->credits; MPASS(cst->tx_credits <= cst->tx_total); if (cst->flags & EO_SND_TAG_REF) { /* * As with ethofld_transmit(), hold an extra reference * so that the tag is stable across ethold_tx(). */ m_snd_tag_ref(&cst->com); m = mbufq_first(&cst->pending_tx); if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m)) ethofld_tx(cst); mtx_unlock(&cst->lock); m_snd_tag_rele(&cst->com); } else { /* * There shouldn't be any pending packets if the tag * was freed by the kernel since any pending packet * should hold a reference to the tag. */ MPASS(mbufq_first(&cst->pending_tx) == NULL); mtx_unlock(&cst->lock); } return (0); } #endif diff --git a/sys/dev/cxgbe/t4_tracer.c b/sys/dev/cxgbe/t4_tracer.c index c4d6f9d48fbc..3d9d7038bf17 100644 --- a/sys/dev/cxgbe/t4_tracer.c +++ b/sys/dev/cxgbe/t4_tracer.c @@ -1,532 +1,531 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "t4_ioctl.h" /* * Locking notes * ============= * * An interface cloner is registered during mod_load and it can be used to * create or destroy the tracing ifnet for an adapter at any time. It is * possible for the cloned interface to outlive the adapter (adapter disappears * in t4_detach but the tracing ifnet may live till mod_unload when removal of * the cloner finally destroys any remaining cloned interfaces). When tracing * filters are active, this ifnet is also receiving data. There are potential * bad races between ifnet create, ifnet destroy, ifnet rx, ifnet ioctl, * cxgbe_detach/t4_detach, mod_unload. * * a) The driver selects an iq for tracing (sc->traceq) inside a synch op. The * iq is destroyed inside a synch op too (and sc->traceq updated). * b) The cloner looks for an adapter that matches the name of the ifnet it's * been asked to create, starts a synch op on that adapter, and proceeds only * if the adapter has a tracing iq. * c) The cloned ifnet and the adapter are coupled to each other via * ifp->if_softc and sc->ifp. These can be modified only with the global * t4_trace_lock sx as well as the sc->ifp_lock mutex held. Holding either * of these will prevent any change. * * The order in which all the locks involved should be acquired are: * t4_list_lock * adapter lock * (begin synch op and let go of the above two) * t4_trace_lock * sc->ifp_lock */ static struct sx t4_trace_lock; static const char *t4_cloner_name = "tXnex"; static struct if_clone *t4_cloner; /* tracer ifnet routines. mostly no-ops. */ static void tracer_init(void *); -static int tracer_ioctl(struct ifnet *, unsigned long, caddr_t); -static int tracer_transmit(struct ifnet *, struct mbuf *); -static void tracer_qflush(struct ifnet *); -static int tracer_media_change(struct ifnet *); -static void tracer_media_status(struct ifnet *, struct ifmediareq *); +static int tracer_ioctl(if_t, unsigned long, caddr_t); +static int tracer_transmit(if_t, struct mbuf *); +static void tracer_qflush(if_t); +static int tracer_media_change(if_t); +static void tracer_media_status(if_t, struct ifmediareq *); /* match name (request/response) */ struct match_rr { const char *name; int lock; /* set to 1 to returned sc locked. 
*/ struct adapter *sc; int rc; }; static void match_name(struct adapter *sc, void *arg) { struct match_rr *mrr = arg; if (strcmp(device_get_nameunit(sc->dev), mrr->name) != 0) return; KASSERT(mrr->sc == NULL, ("%s: multiple matches (%p, %p) for %s", __func__, mrr->sc, sc, mrr->name)); mrr->sc = sc; if (mrr->lock) mrr->rc = begin_synchronized_op(mrr->sc, NULL, 0, "t4clon"); else mrr->rc = 0; } static int t4_cloner_match(struct if_clone *ifc, const char *name) { if (strncmp(name, "t4nex", 5) != 0 && strncmp(name, "t5nex", 5) != 0 && strncmp(name, "t6nex", 5) != 0) return (0); if (name[5] < '0' || name[5] > '9') return (0); return (1); } static int t4_cloner_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { struct match_rr mrr; struct adapter *sc; - struct ifnet *ifp; + if_t ifp; int rc, unit; const uint8_t lla[ETHER_ADDR_LEN] = {0, 0, 0, 0, 0, 0}; mrr.name = name; mrr.lock = 1; mrr.sc = NULL; mrr.rc = ENOENT; t4_iterate(match_name, &mrr); if (mrr.rc != 0) return (mrr.rc); sc = mrr.sc; KASSERT(sc != NULL, ("%s: name (%s) matched but softc is NULL", __func__, name)); ASSERT_SYNCHRONIZED_OP(sc); sx_xlock(&t4_trace_lock); if (sc->ifp != NULL) { rc = EEXIST; goto done; } if (sc->traceq < 0) { rc = EAGAIN; goto done; } unit = -1; rc = ifc_alloc_unit(ifc, &unit); if (rc != 0) goto done; ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { ifc_free_unit(ifc, unit); rc = ENOMEM; goto done; } /* Note that if_xname is not . */ - strlcpy(ifp->if_xname, name, sizeof(ifp->if_xname)); - ifp->if_dname = t4_cloner_name; - ifp->if_dunit = unit; - ifp->if_init = tracer_init; - ifp->if_flags = IFF_SIMPLEX | IFF_DRV_RUNNING; - ifp->if_ioctl = tracer_ioctl; - ifp->if_transmit = tracer_transmit; - ifp->if_qflush = tracer_qflush; - ifp->if_capabilities = IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU; + if_initname(ifp, name, unit); + if_setdname(ifp, t4_cloner_name); + if_setinitfn(ifp, tracer_init); + if_setflags(ifp, IFF_SIMPLEX | IFF_DRV_RUNNING); + if_setioctlfn(ifp, tracer_ioctl); + if_settransmitfn(ifp, tracer_transmit); + if_setqflushfn(ifp, tracer_qflush); + if_setcapabilities(ifp, IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU); ifmedia_init(&sc->media, IFM_IMASK, tracer_media_change, tracer_media_status); ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | IFM_NONE, 0, NULL); ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | IFM_NONE); ether_ifattach(ifp, lla); mtx_lock(&sc->ifp_lock); - ifp->if_softc = sc; + if_setsoftc(ifp, sc); sc->ifp = ifp; mtx_unlock(&sc->ifp_lock); done: sx_xunlock(&t4_trace_lock); end_synchronized_op(sc, 0); return (rc); } static int -t4_cloner_destroy(struct if_clone *ifc, struct ifnet *ifp) +t4_cloner_destroy(struct if_clone *ifc, if_t ifp) { struct adapter *sc; - int unit = ifp->if_dunit; + int unit = if_getdunit(ifp); sx_xlock(&t4_trace_lock); - sc = ifp->if_softc; + sc = if_getsoftc(ifp); if (sc != NULL) { mtx_lock(&sc->ifp_lock); sc->ifp = NULL; - ifp->if_softc = NULL; + if_setsoftc(ifp, NULL); mtx_unlock(&sc->ifp_lock); ifmedia_removeall(&sc->media); } ether_ifdetach(ifp); if_free(ifp); ifc_free_unit(ifc, unit); sx_xunlock(&t4_trace_lock); return (0); } void t4_tracer_modload(void) { sx_init(&t4_trace_lock, "T4/T5 tracer lock"); t4_cloner = if_clone_advanced(t4_cloner_name, 0, t4_cloner_match, t4_cloner_create, t4_cloner_destroy); } void t4_tracer_modunload(void) { if (t4_cloner != NULL) { /* * The module is being unloaded so the nexus drivers have * detached. The tracing interfaces can not outlive the nexus * (ifp->if_softc is the nexus) and must have been destroyed * already. 
XXX: but if_clone is opaque to us and we can't * assert LIST_EMPTY(&t4_cloner->ifc_iflist) at this time. */ if_clone_detach(t4_cloner); } sx_destroy(&t4_trace_lock); } void t4_tracer_port_detach(struct adapter *sc) { sx_xlock(&t4_trace_lock); if (sc->ifp != NULL) { mtx_lock(&sc->ifp_lock); - sc->ifp->if_softc = NULL; + if_setsoftc(sc->ifp, NULL); sc->ifp = NULL; mtx_unlock(&sc->ifp_lock); } ifmedia_removeall(&sc->media); sx_xunlock(&t4_trace_lock); } int t4_get_tracer(struct adapter *sc, struct t4_tracer *t) { int rc, i, enabled; struct trace_params tp; if (t->idx >= NTRACE) { t->idx = 0xff; t->enabled = 0; t->valid = 0; return (0); } rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4gett"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } for (i = t->idx; i < NTRACE; i++) { if (isset(&sc->tracer_valid, t->idx)) { t4_get_trace_filter(sc, &tp, i, &enabled); t->idx = i; t->enabled = enabled; t->valid = 1; memcpy(&t->tp.data[0], &tp.data[0], sizeof(t->tp.data)); memcpy(&t->tp.mask[0], &tp.mask[0], sizeof(t->tp.mask)); t->tp.snap_len = tp.snap_len; t->tp.min_len = tp.min_len; t->tp.skip_ofst = tp.skip_ofst; t->tp.skip_len = tp.skip_len; t->tp.invert = tp.invert; /* convert channel to port iff 0 <= port < 8. */ if (tp.port < 4) t->tp.port = sc->chan_map[tp.port]; else if (tp.port < 8) t->tp.port = sc->chan_map[tp.port - 4] + 4; else t->tp.port = tp.port; goto done; } } t->idx = 0xff; t->enabled = 0; t->valid = 0; done: end_synchronized_op(sc, LOCK_HELD); return (rc); } int t4_set_tracer(struct adapter *sc, struct t4_tracer *t) { int rc; struct trace_params tp, *tpp; if (t->idx >= NTRACE) return (EINVAL); rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4sett"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } /* * If no tracing filter is specified this time then check if the filter * at the index is valid anyway because it was set previously. If so * then this is a legitimate enable/disable operation. */ if (t->valid == 0) { if (isset(&sc->tracer_valid, t->idx)) tpp = NULL; else rc = EINVAL; goto done; } if (t->tp.port > 19 || t->tp.snap_len > 9600 || t->tp.min_len > M_TFMINPKTSIZE || t->tp.skip_len > M_TFLENGTH || t->tp.skip_ofst > M_TFOFFSET) { rc = EINVAL; goto done; } memcpy(&tp.data[0], &t->tp.data[0], sizeof(tp.data)); memcpy(&tp.mask[0], &t->tp.mask[0], sizeof(tp.mask)); tp.snap_len = t->tp.snap_len; tp.min_len = t->tp.min_len; tp.skip_ofst = t->tp.skip_ofst; tp.skip_len = t->tp.skip_len; tp.invert = !!t->tp.invert; /* convert port to channel iff 0 <= port < 8. 
*/ if (t->tp.port < 4) { if (sc->port[t->tp.port] == NULL) { rc = EINVAL; goto done; } tp.port = sc->port[t->tp.port]->tx_chan; } else if (t->tp.port < 8) { if (sc->port[t->tp.port - 4] == NULL) { rc = EINVAL; goto done; } tp.port = sc->port[t->tp.port - 4]->tx_chan + 4; } tpp = &tp; done: if (rc == 0) { rc = -t4_set_trace_filter(sc, tpp, t->idx, t->enabled); if (rc == 0) { if (t->enabled) { setbit(&sc->tracer_valid, t->idx); if (sc->tracer_enabled == 0) { t4_set_reg_field(sc, A_MPS_TRC_CFG, F_TRCEN, F_TRCEN); } setbit(&sc->tracer_enabled, t->idx); } else { clrbit(&sc->tracer_enabled, t->idx); if (sc->tracer_enabled == 0) { t4_set_reg_field(sc, A_MPS_TRC_CFG, F_TRCEN, 0); } } } } end_synchronized_op(sc, LOCK_HELD); return (rc); } int t4_trace_pkt(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; - struct ifnet *ifp; + if_t ifp; KASSERT(m != NULL, ("%s: no payload with opcode %02x", __func__, rss->opcode)); mtx_lock(&sc->ifp_lock); ifp = sc->ifp; if (sc->ifp) { m_adj(m, sizeof(struct cpl_trace_pkt)); m->m_pkthdr.rcvif = ifp; ETHER_BPF_MTAP(ifp, m); } mtx_unlock(&sc->ifp_lock); m_freem(m); return (0); } int t5_trace_pkt(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; - struct ifnet *ifp; + if_t ifp; KASSERT(m != NULL, ("%s: no payload with opcode %02x", __func__, rss->opcode)); mtx_lock(&sc->ifp_lock); ifp = sc->ifp; if (ifp != NULL) { m_adj(m, sizeof(struct cpl_t5_trace_pkt)); m->m_pkthdr.rcvif = ifp; ETHER_BPF_MTAP(ifp, m); } mtx_unlock(&sc->ifp_lock); m_freem(m); return (0); } static void tracer_init(void *arg) { return; } static int -tracer_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data) +tracer_ioctl(if_t ifp, unsigned long cmd, caddr_t data) { int rc = 0; struct adapter *sc; struct ifreq *ifr = (struct ifreq *)data; switch (cmd) { case SIOCSIFMTU: case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: case SIOCSIFCAP: break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: sx_xlock(&t4_trace_lock); - sc = ifp->if_softc; + sc = if_getsoftc(ifp); if (sc == NULL) rc = EIO; else rc = ifmedia_ioctl(ifp, ifr, &sc->media, cmd); sx_xunlock(&t4_trace_lock); break; default: rc = ether_ioctl(ifp, cmd, data); } return (rc); } static int -tracer_transmit(struct ifnet *ifp, struct mbuf *m) +tracer_transmit(if_t ifp, struct mbuf *m) { m_freem(m); return (0); } static void -tracer_qflush(struct ifnet *ifp) +tracer_qflush(if_t ifp) { return; } static int -tracer_media_change(struct ifnet *ifp) +tracer_media_change(if_t ifp) { return (EOPNOTSUPP); } static void -tracer_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) +tracer_media_status(if_t ifp, struct ifmediareq *ifmr) { ifmr->ifm_status = IFM_AVALID | IFM_ACTIVE; return; } diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c index 2ba27d62686f..6d858ea5eaad 100644 --- a/sys/dev/cxgbe/tom/t4_connect.c +++ b/sys/dev/cxgbe/tom/t4_connect.c @@ -1,414 +1,414 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" /* * Active open succeeded. */ static int do_act_establish(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_act_establish *cpl = (const void *)(rss + 1); u_int tid = GET_TID(cpl); u_int atid = G_TID_TID(ntohl(cpl->tos_atid)); struct toepcb *toep = lookup_atid(sc, atid); struct inpcb *inp = toep->inp; KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: atid %u, tid %u", __func__, atid, tid); free_atid(sc, atid); CURVNET_SET(toep->vnet); INP_WLOCK(inp); toep->tid = tid; insert_tid(sc, tid, toep, inp->inp_vflag & INP_IPV6 ? 2 : 1); if (inp->inp_flags & INP_DROPPED) { /* socket closed by the kernel before hw told us it connected */ send_flowc_wr(toep, NULL); send_reset(sc, toep, be32toh(cpl->snd_isn)); goto done; } make_established(toep, be32toh(cpl->snd_isn) - 1, be32toh(cpl->rcv_isn) - 1, cpl->tcp_opt); inp->inp_flowtype = M_HASHTYPE_OPAQUE; inp->inp_flowid = tid; done: INP_WUNLOCK(inp); CURVNET_RESTORE(); return (0); } void act_open_failure_cleanup(struct adapter *sc, u_int atid, u_int status) { struct toepcb *toep = lookup_atid(sc, atid); struct inpcb *inp = toep->inp; struct toedev *tod = &toep->td->tod; struct epoch_tracker et; free_atid(sc, atid); toep->tid = -1; CURVNET_SET(toep->vnet); if (status != EAGAIN) NET_EPOCH_ENTER(et); INP_WLOCK(inp); toe_connect_failed(tod, inp, status); final_cpl_received(toep); /* unlocks inp */ if (status != EAGAIN) NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } /* * Active open failed. 
*/ static int do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); u_int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status))); u_int status = G_AOPEN_STATUS(be32toh(cpl->atid_status)); struct toepcb *toep = lookup_atid(sc, atid); int rc; KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: atid %u, status %u ", __func__, atid, status); /* Ignore negative advice */ if (negative_advice(status)) return (0); if (status && act_open_has_tid(status)) release_tid(sc, GET_TID(cpl), toep->ctrlq); rc = act_open_rpl_status_to_errno(status); act_open_failure_cleanup(sc, atid, rc); return (0); } void t4_init_connect_cpl_handlers(void) { t4_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl, CPL_COOKIE_TOM); } void t4_uninit_connect_cpl_handlers(void) { t4_register_cpl_handler(CPL_ACT_ESTABLISH, NULL); t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, NULL, CPL_COOKIE_TOM); } #ifdef KTR #define DONT_OFFLOAD_ACTIVE_OPEN(x) do { \ reason = __LINE__; \ rc = (x); \ goto failed; \ } while (0) #else #define DONT_OFFLOAD_ACTIVE_OPEN(x) do { \ rc = (x); \ goto failed; \ } while (0) #endif static inline int act_open_cpl_size(struct adapter *sc, int isipv6) { int idx; static const int sz_table[3][2] = { { sizeof (struct cpl_act_open_req), sizeof (struct cpl_act_open_req6) }, { sizeof (struct cpl_t5_act_open_req), sizeof (struct cpl_t5_act_open_req6) }, { sizeof (struct cpl_t6_act_open_req), sizeof (struct cpl_t6_act_open_req6) }, }; MPASS(chip_id(sc) >= CHELSIO_T4); idx = min(chip_id(sc) - CHELSIO_T4, 2); return (sz_table[idx][!!isipv6]); } /* * active open (soconnect). * * State of affairs on entry: * soisconnecting (so_state |= SS_ISCONNECTING) * tcbinfo not locked (This has changed - used to be WLOCKed) * inp WLOCKed * tp->t_state = TCPS_SYN_SENT * rtalloc1, RT_UNLOCK on rt. 
*/ int t4_connect(struct toedev *tod, struct socket *so, struct nhop_object *nh, struct sockaddr *nam) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = NULL; struct wrqe *wr = NULL; - struct ifnet *rt_ifp = nh->nh_ifp; + if_t rt_ifp = nh->nh_ifp; struct vi_info *vi; int qid_atid, rc, isipv6; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); #ifdef KTR int reason; #endif struct offload_settings settings; struct epoch_tracker et; uint16_t vid = 0xfff, pcp = 0; INP_WLOCK_ASSERT(inp); KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6, ("%s: dest addr %p has family %u", __func__, nam, nam->sa_family)); - if (rt_ifp->if_type == IFT_ETHER) - vi = rt_ifp->if_softc; - else if (rt_ifp->if_type == IFT_L2VLAN) { - struct ifnet *ifp = VLAN_TRUNKDEV(rt_ifp); + if (if_gettype(rt_ifp) == IFT_ETHER) + vi = if_getsoftc(rt_ifp); + else if (if_gettype(rt_ifp) == IFT_L2VLAN) { + if_t ifp = VLAN_TRUNKDEV(rt_ifp); - vi = ifp->if_softc; + vi = if_getsoftc(ifp); VLAN_TAG(rt_ifp, &vid); VLAN_PCP(rt_ifp, &pcp); - } else if (rt_ifp->if_type == IFT_IEEE8023ADLAG) + } else if (if_gettype(rt_ifp) == IFT_IEEE8023ADLAG) DONT_OFFLOAD_ACTIVE_OPEN(ENOSYS); /* XXX: implement lagg+TOE */ else DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); if (sc->flags & KERN_TLS_ON) DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_ACTIVE, NULL, EVL_MAKETAG(vid, pcp, 0), inp); rw_runlock(&sc->policy_lock); if (!settings.offload) DONT_OFFLOAD_ACTIVE_OPEN(EPERM); toep = alloc_toepcb(vi, M_NOWAIT); if (toep == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->tid = alloc_atid(sc, toep); if (toep->tid < 0) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->l2te = t4_l2t_get(vi->pi, rt_ifp, nh->nh_flags & NHF_GATEWAY ? &nh->gw_sa : nam); if (toep->l2te == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->vnet = so->so_vnet; init_conn_params(vi, &settings, &inp->inp_inc, so, NULL, toep->l2te->idx, &toep->params); init_toepcb(vi, toep); isipv6 = nam->sa_family == AF_INET6; wr = alloc_wrqe(act_open_cpl_size(sc, isipv6), toep->ctrlq); if (wr == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); qid_atid = V_TID_QID(toep->ofld_rxq->iq.abs_id) | V_TID_TID(toep->tid) | V_TID_COOKIE(CPL_COOKIE_TOM); if (isipv6) { struct cpl_act_open_req6 *cpl = wrtod(wr); struct cpl_t5_act_open_req6 *cpl5 = (void *)cpl; struct cpl_t6_act_open_req6 *cpl6 = (void *)cpl; if ((inp->inp_vflag & INP_IPV6) == 0) DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); toep->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true); if (toep->ce == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOENT); switch (chip_id(sc)) { case CHELSIO_T4: INIT_TP_WR(cpl, 0); cpl->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T5: INIT_TP_WR(cpl5, 0); cpl5->iss = htobe32(tp->iss); cpl5->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T6: default: INIT_TP_WR(cpl6, 0); cpl6->iss = htobe32(tp->iss); cpl6->params = select_ntuple(vi, toep->l2te); break; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, qid_atid)); cpl->local_port = inp->inp_lport; cpl->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; cpl->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; cpl->peer_port = inp->inp_fport; cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0]; cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8]; cpl->opt0 = calc_options0(vi, &toep->params); cpl->opt2 = calc_options2(vi, &toep->params); CTR6(KTR_CXGBE, "%s: atid %u, toep %p, inp %p, opt0 %#016lx, opt2 %#08x", __func__, toep->tid, toep, inp, be64toh(cpl->opt0), 
be32toh(cpl->opt2)); } else { struct cpl_act_open_req *cpl = wrtod(wr); struct cpl_t5_act_open_req *cpl5 = (void *)cpl; struct cpl_t6_act_open_req *cpl6 = (void *)cpl; switch (chip_id(sc)) { case CHELSIO_T4: INIT_TP_WR(cpl, 0); cpl->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T5: INIT_TP_WR(cpl5, 0); cpl5->iss = htobe32(tp->iss); cpl5->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T6: default: INIT_TP_WR(cpl6, 0); cpl6->iss = htobe32(tp->iss); cpl6->params = select_ntuple(vi, toep->l2te); break; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_atid)); inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, &cpl->peer_port); cpl->opt0 = calc_options0(vi, &toep->params); cpl->opt2 = calc_options2(vi, &toep->params); CTR6(KTR_CXGBE, "%s: atid %u, toep %p, inp %p, opt0 %#016lx, opt2 %#08x", __func__, toep->tid, toep, inp, be64toh(cpl->opt0), be32toh(cpl->opt2)); } offload_socket(so, toep); NET_EPOCH_ENTER(et); rc = t4_l2t_send(sc, wr, toep->l2te); NET_EPOCH_EXIT(et); if (rc == 0) { toep->flags |= TPF_CPL_PENDING; return (0); } undo_offload_socket(so); #if defined(KTR) reason = __LINE__; #endif failed: CTR3(KTR_CXGBE, "%s: not offloading (%d), rc %d", __func__, reason, rc); if (wr) free_wrqe(wr); if (toep) { if (toep->tid >= 0) free_atid(sc, toep->tid); if (toep->l2te) t4_l2t_release(toep->l2te); if (toep->ce) t4_release_clip_entry(sc, toep->ce); free_toepcb(toep); } return (rc); } #endif diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c index e764df14b245..e8d1338291c5 100644 --- a/sys/dev/cxgbe/tom/t4_listen.c +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -1,1604 +1,1604 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" /* stid services */ static int alloc_stid(struct adapter *, struct listen_ctx *, int); static struct listen_ctx *lookup_stid(struct adapter *, int); static void free_stid(struct adapter *, struct listen_ctx *); /* lctx services */ static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, struct vi_info *); static int free_lctx(struct adapter *, struct listen_ctx *); static void hold_lctx(struct listen_ctx *); static void listen_hash_add(struct adapter *, struct listen_ctx *); static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int); static int alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6) { struct tid_info *t = &sc->tids; u_int stid, n, f, mask; struct stid_region *sr = &lctx->stid_region; /* * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in * the TCAM. The start of the stid region is properly aligned (the chip * requires each region to be 128-cell aligned). */ n = isipv6 ? 2 : 1; mask = n - 1; KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0, ("%s: stid region (%u, %u) not properly aligned. n = %u", __func__, t->stid_base, t->nstids, n)); mtx_lock(&t->stid_lock); if (n > t->nstids - t->stids_in_use) { mtx_unlock(&t->stid_lock); return (-1); } if (t->nstids_free_head >= n) { /* * This allocation will definitely succeed because the region * starts at a good alignment and we just checked we have enough * stids free. */ f = t->nstids_free_head & mask; t->nstids_free_head -= n + f; stid = t->nstids_free_head; TAILQ_INSERT_HEAD(&t->stids, sr, link); } else { struct stid_region *s; stid = t->nstids_free_head; TAILQ_FOREACH(s, &t->stids, link) { stid += s->used + s->free; f = stid & mask; if (s->free >= n + f) { stid -= n + f; s->free -= n + f; TAILQ_INSERT_AFTER(&t->stids, s, sr, link); goto allocated; } } if (__predict_false(stid != t->nstids)) { panic("%s: stids TAILQ (%p) corrupt." 
" At %d instead of %d at the end of the queue.", __func__, &t->stids, stid, t->nstids); } mtx_unlock(&t->stid_lock); return (-1); } allocated: sr->used = n; sr->free = f; t->stids_in_use += n; t->stid_tab[stid] = lctx; mtx_unlock(&t->stid_lock); KASSERT(((stid + t->stid_base) & mask) == 0, ("%s: EDOOFUS.", __func__)); return (stid + t->stid_base); } static struct listen_ctx * lookup_stid(struct adapter *sc, int stid) { struct tid_info *t = &sc->tids; return (t->stid_tab[stid - t->stid_base]); } static void free_stid(struct adapter *sc, struct listen_ctx *lctx) { struct tid_info *t = &sc->tids; struct stid_region *sr = &lctx->stid_region; struct stid_region *s; KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used)); mtx_lock(&t->stid_lock); s = TAILQ_PREV(sr, stid_head, link); if (s != NULL) s->free += sr->used + sr->free; else t->nstids_free_head += sr->used + sr->free; KASSERT(t->stids_in_use >= sr->used, ("%s: stids_in_use (%u) < stids being freed (%u)", __func__, t->stids_in_use, sr->used)); t->stids_in_use -= sr->used; TAILQ_REMOVE(&t->stids, sr, link); mtx_unlock(&t->stid_lock); } static struct listen_ctx * alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi) { struct listen_ctx *lctx; INP_WLOCK_ASSERT(inp); lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); if (lctx == NULL) return (NULL); lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6); if (lctx->stid < 0) { free(lctx, M_CXGBE); return (NULL); } if (inp->inp_vflag & INP_IPV6 && !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) { lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true); if (lctx->ce == NULL) { free(lctx, M_CXGBE); return (NULL); } } lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id]; lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq]; refcount_init(&lctx->refcount, 1); lctx->inp = inp; lctx->vnet = inp->inp_socket->so_vnet; in_pcbref(inp); return (lctx); } /* Don't call this directly, use release_lctx instead */ static int free_lctx(struct adapter *sc, struct listen_ctx *lctx) { struct inpcb *inp = lctx->inp; INP_WLOCK_ASSERT(inp); KASSERT(lctx->refcount == 0, ("%s: refcount %d", __func__, lctx->refcount)); KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", __func__, lctx->stid, lctx, lctx->inp); if (lctx->ce) t4_release_clip_entry(sc, lctx->ce); free_stid(sc, lctx); free(lctx, M_CXGBE); return (in_pcbrele_wlocked(inp)); } static void hold_lctx(struct listen_ctx *lctx) { refcount_acquire(&lctx->refcount); } static inline uint32_t listen_hashfn(void *key, u_long mask) { return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); } /* * Add a listen_ctx entry to the listen hash table. */ static void listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) { struct tom_data *td = sc->tom_softc; int bucket = listen_hashfn(lctx->inp, td->listen_mask); mtx_lock(&td->lctx_hash_lock); LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); td->lctx_count++; mtx_unlock(&td->lctx_hash_lock); } /* * Look for the listening socket's context entry in the hash and return it. 
*/ static struct listen_ctx * listen_hash_find(struct adapter *sc, struct inpcb *inp) { struct tom_data *td = sc->tom_softc; int bucket = listen_hashfn(inp, td->listen_mask); struct listen_ctx *lctx; mtx_lock(&td->lctx_hash_lock); LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { if (lctx->inp == inp) break; } mtx_unlock(&td->lctx_hash_lock); return (lctx); } /* * Removes the listen_ctx structure for inp from the hash and returns it. */ static struct listen_ctx * listen_hash_del(struct adapter *sc, struct inpcb *inp) { struct tom_data *td = sc->tom_softc; int bucket = listen_hashfn(inp, td->listen_mask); struct listen_ctx *lctx, *l; mtx_lock(&td->lctx_hash_lock); LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { if (lctx->inp == inp) { LIST_REMOVE(lctx, link); td->lctx_count--; break; } } mtx_unlock(&td->lctx_hash_lock); return (lctx); } /* * Releases a hold on the lctx. Must be called with the listening socket's inp * locked. The inp may be freed by this function and it returns NULL to * indicate this. */ static struct inpcb * release_lctx(struct adapter *sc, struct listen_ctx *lctx) { struct inpcb *inp = lctx->inp; int inp_freed = 0; INP_WLOCK_ASSERT(inp); if (refcount_release(&lctx->refcount)) inp_freed = free_lctx(sc, lctx); return (inp_freed ? NULL : inp); } static void send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe) { struct mbuf *m = synqe->syn; - struct ifnet *ifp = m->m_pkthdr.rcvif; - struct vi_info *vi = ifp->if_softc; + if_t ifp = m->m_pkthdr.rcvif; + struct vi_info *vi = if_getsoftc(ifp); struct port_info *pi = vi->pi; struct wrqe *wr; struct fw_flowc_wr *flowc; struct sge_ofld_txq *ofld_txq; struct sge_ofld_rxq *ofld_rxq; const int nparams = 6; const int flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); const u_int pfvf = sc->pf << S_FW_VIID_PFN; INP_WLOCK_ASSERT(synqe->lctx->inp); MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0); ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx]; wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } flowc = wrtod(wr); memset(flowc, 0, wr->wr_len); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(nparams)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | V_FW_WR_FLOWID(synqe->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; flowc->mnemval[0].val = htobe32(pfvf); flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; flowc->mnemval[1].val = htobe32(pi->tx_chan); flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; flowc->mnemval[2].val = htobe32(pi->tx_chan); flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id); flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF; flowc->mnemval[4].val = htobe32(512); flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS; flowc->mnemval[5].val = htobe32(512); synqe->flags |= TPF_FLOWC_WR_SENT; t4_wrq_tx(sc, wr); } static void send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe, int rst_status) { struct adapter *sc = tod->tod_softc; struct wrqe *wr; struct cpl_abort_req *req; INP_WLOCK_ASSERT(synqe->lctx->inp); CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s", __func__, synqe, synqe->flags, synqe->tid, synqe->flags & TPF_ABORT_SHUTDOWN ? 
" (abort already in progress)" : ""); if (synqe->flags & TPF_ABORT_SHUTDOWN) return; /* abort already in progress */ synqe->flags |= TPF_ABORT_SHUTDOWN; if (!(synqe->flags & TPF_FLOWC_WR_SENT)) send_flowc_wr_synqe(sc, synqe); wr = alloc_wrqe(sizeof(*req), &sc->sge.ofld_txq[synqe->params.txq_idx].wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid); req->rsvd0 = 0; /* don't have a snd_nxt */ req->rsvd1 = 1; /* no data sent yet */ req->cmd = rst_status; t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]); } static int create_server(struct adapter *sc, struct listen_ctx *lctx) { struct wrqe *wr; struct cpl_pass_open_req *req; struct inpcb *inp = lctx->inp; wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); if (wr == NULL) { log(LOG_ERR, "%s: allocation failure", __func__); return (ENOMEM); } req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); req->local_port = inp->inp_lport; req->peer_port = 0; req->local_ip = inp->inp_laddr.s_addr; req->peer_ip = 0; req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); t4_wrq_tx(sc, wr); return (0); } static int create_server6(struct adapter *sc, struct listen_ctx *lctx) { struct wrqe *wr; struct cpl_pass_open_req6 *req; struct inpcb *inp = lctx->inp; wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); if (wr == NULL) { log(LOG_ERR, "%s: allocation failure", __func__); return (ENOMEM); } req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid)); req->local_port = inp->inp_lport; req->peer_port = 0; req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; req->peer_ip_hi = 0; req->peer_ip_lo = 0; req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); t4_wrq_tx(sc, wr); return (0); } static int destroy_server(struct adapter *sc, struct listen_ctx *lctx) { struct wrqe *wr; struct cpl_close_listsvr_req *req; wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, lctx->stid)); req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); req->rsvd = htobe16(0); t4_wrq_tx(sc, wr); return (0); } /* * Start a listening server by sending a passive open request to HW. * * Can't take adapter lock here and access to sc->flags, * sc->offload_map, if_capenable are all race prone. */ int t4_listen_start(struct toedev *tod, struct tcpcb *tp) { struct adapter *sc = tod->tod_softc; struct vi_info *vi; struct port_info *pi; struct inpcb *inp = tptoinpcb(tp); struct listen_ctx *lctx; int i, rc, v; struct offload_settings settings; INP_WLOCK_ASSERT(inp); rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, EVL_MAKETAG(0xfff, 0, 0), inp); rw_runlock(&sc->policy_lock); if (!settings.offload) return (0); /* Don't start a hardware listener for any loopback address. 
*/ if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) return (0); if (!(inp->inp_vflag & INP_IPV6) && IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr))) return (0); if (sc->flags & KERN_TLS_ON) return (0); #if 0 ADAPTER_LOCK(sc); if (IS_BUSY(sc)) { log(LOG_ERR, "%s: listen request ignored, %s is busy", __func__, device_get_nameunit(sc->dev)); goto done; } KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM not initialized", __func__)); #endif /* * Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first * such VI's queues to send the passive open and receive the reply to * it. * * XXX: need a way to mark a port in use by offload. if_cxgbe should * then reject any attempt to bring down such a port (and maybe reject * attempts to disable IFCAP_TOE on that port too?). */ for_each_port(sc, i) { pi = sc->port[i]; for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE && - vi->ifp->if_capenable & IFCAP_TOE) + if_getcapenable(vi->ifp) & IFCAP_TOE) goto found; } } goto done; /* no port that's UP with IFCAP_TOE enabled */ found: if (listen_hash_find(sc, inp) != NULL) goto done; /* already setup */ lctx = alloc_lctx(sc, inp, vi); if (lctx == NULL) { log(LOG_ERR, "%s: listen request ignored, %s couldn't allocate lctx\n", __func__, device_get_nameunit(sc->dev)); goto done; } listen_hash_add(sc, lctx); CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x", __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp, inp->inp_vflag); if (inp->inp_vflag & INP_IPV6) rc = create_server6(sc, lctx); else rc = create_server(sc, lctx); if (rc != 0) { log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n", __func__, device_get_nameunit(sc->dev), rc); (void) listen_hash_del(sc, inp); inp = release_lctx(sc, lctx); /* can't be freed, host stack has a reference */ KASSERT(inp != NULL, ("%s: inp freed", __func__)); goto done; } lctx->flags |= LCTX_RPL_PENDING; done: #if 0 ADAPTER_UNLOCK(sc); #endif return (0); } int t4_listen_stop(struct toedev *tod, struct tcpcb *tp) { struct listen_ctx *lctx; struct adapter *sc = tod->tod_softc; struct inpcb *inp = tptoinpcb(tp); INP_WLOCK_ASSERT(inp); lctx = listen_hash_del(sc, inp); if (lctx == NULL) return (ENOENT); /* no hardware listener for this inp */ CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, lctx, lctx->flags); /* * If the reply to the PASS_OPEN is still pending we'll wait for it to * arrive and clean up when it does. */ if (lctx->flags & LCTX_RPL_PENDING) { return (EINPROGRESS); } destroy_server(sc, lctx); return (0); } static inline struct synq_entry * alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags) { struct synq_entry *synqe; INP_RLOCK_ASSERT(lctx->inp); MPASS(flags == M_WAITOK || flags == M_NOWAIT); synqe = malloc(sizeof(*synqe), M_CXGBE, flags); if (__predict_true(synqe != NULL)) { synqe->flags = TPF_SYNQE; refcount_init(&synqe->refcnt, 1); synqe->lctx = lctx; hold_lctx(lctx); /* Every synqe has a ref on its lctx. 
*/ synqe->syn = NULL; } return (synqe); } static inline void hold_synqe(struct synq_entry *synqe) { refcount_acquire(&synqe->refcnt); } static inline struct inpcb * release_synqe(struct adapter *sc, struct synq_entry *synqe) { struct inpcb *inp; MPASS(synqe->flags & TPF_SYNQE); MPASS(synqe->lctx != NULL); inp = synqe->lctx->inp; MPASS(inp != NULL); INP_WLOCK_ASSERT(inp); if (refcount_release(&synqe->refcnt)) { inp = release_lctx(sc, synqe->lctx); m_freem(synqe->syn); free(synqe, M_CXGBE); } return (inp); } void t4_syncache_added(struct toedev *tod __unused, void *arg) { struct synq_entry *synqe = arg; hold_synqe(synqe); } void t4_syncache_removed(struct toedev *tod, void *arg) { struct adapter *sc = tod->tod_softc; struct synq_entry *synqe = arg; struct inpcb *inp = synqe->lctx->inp; /* * XXX: this is a LOR but harmless when running from the softclock. */ INP_WLOCK(inp); inp = release_synqe(sc, synqe); if (inp != NULL) INP_WUNLOCK(inp); } int t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) { struct synq_entry *synqe = arg; if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) { struct tcpopt to; struct ip *ip = mtod(m, struct ip *); struct tcphdr *th; if (ip->ip_v == IPVERSION) th = (void *)(ip + 1); else th = (void *)((struct ip6_hdr *)ip + 1); bzero(&to, sizeof(to)); tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th), TO_SYN); /* save these for later */ synqe->iss = be32toh(th->th_seq); synqe->irs = be32toh(th->th_ack) - 1; synqe->ts = to.to_tsval; } m_freem(m); /* don't need this any more */ return (0); } static int do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); int stid = GET_TID(cpl); unsigned int status = cpl->status; struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_PASS_OPEN_RPL, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); INP_WLOCK(inp); CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", __func__, stid, status, lctx->flags); lctx->flags &= ~LCTX_RPL_PENDING; if (status != CPL_ERR_NONE) log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status); #ifdef INVARIANTS /* * If the inp has been dropped (listening socket closed) then * listen_stop must have run and taken the inp out of the hash. */ if (inp->inp_flags & INP_DROPPED) { KASSERT(listen_hash_del(sc, inp) == NULL, ("%s: inp %p still in listen hash", __func__, inp)); } #endif if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { if (release_lctx(sc, lctx) != NULL) INP_WUNLOCK(inp); return (status); } /* * Listening socket stopped listening earlier and now the chip tells us * it has started the hardware listener. Stop it; the lctx will be * released in do_close_server_rpl. */ if (inp->inp_flags & INP_DROPPED) { destroy_server(sc, lctx); INP_WUNLOCK(inp); return (status); } /* * Failed to start hardware listener. Take inp out of the hash and * release our reference on it. An error message has been logged * already. 
*/ if (status != CPL_ERR_NONE) { listen_hash_del(sc, inp); if (release_lctx(sc, lctx) != NULL) INP_WUNLOCK(inp); return (status); } /* hardware listener open for business */ INP_WUNLOCK(inp); return (status); } static int do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); int stid = GET_TID(cpl); unsigned int status = cpl->status; struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); if (status != CPL_ERR_NONE) { log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n", __func__, status, stid); return (status); } INP_WLOCK(inp); inp = release_lctx(sc, lctx); if (inp != NULL) INP_WUNLOCK(inp); return (status); } static void done_with_synqe(struct adapter *sc, struct synq_entry *synqe) { struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; int ntids; INP_WLOCK_ASSERT(inp); ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1; remove_tid(sc, synqe->tid, ntids); release_tid(sc, synqe->tid, lctx->ctrlq); t4_l2t_release(e); inp = release_synqe(sc, synqe); if (inp) INP_WUNLOCK(inp); } void synack_failure_cleanup(struct adapter *sc, int tid) { struct synq_entry *synqe = lookup_tid(sc, tid); INP_WLOCK(synqe->lctx->inp); done_with_synqe(sc, synqe); } int do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; struct sge_ofld_txq *ofld_txq; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_REQ_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); if (negative_advice(cpl->status)) return (0); /* Ignore negative advice */ INP_WLOCK(inp); ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; if (!(synqe->flags & TPF_FLOWC_WR_SENT)) send_flowc_wr_synqe(sc, synqe); /* * If we'd initiated an abort earlier the reply to it is responsible for * cleaning up resources. Otherwise we tear everything down right here * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 
*/ if (synqe->flags & TPF_ABORT_SHUTDOWN) { INP_WUNLOCK(inp); goto done; } done_with_synqe(sc, synqe); /* inp lock released by done_with_synqe */ done: send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); return (0); } int do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_ABORT_RPL_RSS, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); INP_WLOCK(inp); KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, ("%s: wasn't expecting abort reply for synqe %p (0x%x)", __func__, synqe, synqe->flags)); done_with_synqe(sc, synqe); /* inp lock released by done_with_synqe */ return (0); } void t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) { struct adapter *sc = tod->tod_softc; struct synq_entry *synqe = arg; struct inpcb *inp = sotoinpcb(so); struct toepcb *toep = synqe->toep; NET_EPOCH_ASSERT(); /* prevents bad race with accept() */ INP_WLOCK_ASSERT(inp); KASSERT(synqe->flags & TPF_SYNQE, ("%s: %p not a synq_entry?", __func__, arg)); MPASS(toep->tid == synqe->tid); offload_socket(so, toep); make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt); toep->flags |= TPF_CPL_PENDING; update_tid(sc, synqe->tid, toep); synqe->flags |= TPF_SYNQE_EXPANDED; inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ? 
M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4; inp->inp_flowid = synqe->rss_hash; } static void t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) { bzero(to, sizeof(*to)); if (t4opt->mss) { to->to_flags |= TOF_MSS; to->to_mss = be16toh(t4opt->mss); } if (t4opt->wsf > 0 && t4opt->wsf < 15) { to->to_flags |= TOF_SCALE; to->to_wscale = t4opt->wsf; } if (t4opt->tstamp) to->to_flags |= TOF_TS; if (t4opt->sack) to->to_flags |= TOF_SACKPERM; } static bool encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl) { u_int hlen = be32toh(cpl->hdr_len); if (chip_id(sc) >= CHELSIO_T6) return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); else return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); } static void pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m, struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos) { const struct cpl_pass_accept_req *cpl = mtod(m, const void *); const struct ether_header *eh; unsigned int hlen = be32toh(cpl->hdr_len); uintptr_t l3hdr; const struct tcphdr *tcp; eh = (const void *)(cpl + 1); if (chip_id(sc) >= CHELSIO_T6) { l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen)); tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen)); } else { l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen)); } /* extract TOS (DiffServ + ECN) byte for AccECN */ if (iptos) { if (((struct ip *)l3hdr)->ip_v == IPVERSION) { const struct ip *ip = (const void *)l3hdr; *iptos = ip->ip_tos; } #ifdef INET6 else if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) { const struct ip6_hdr *ip6 = (const void *)l3hdr; *iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; } #endif /* INET */ } if (inc) { bzero(inc, sizeof(*inc)); inc->inc_fport = tcp->th_sport; inc->inc_lport = tcp->th_dport; if (((struct ip *)l3hdr)->ip_v == IPVERSION) { const struct ip *ip = (const void *)l3hdr; inc->inc_faddr = ip->ip_src; inc->inc_laddr = ip->ip_dst; } else { const struct ip6_hdr *ip6 = (const void *)l3hdr; inc->inc_flags |= INC_ISIPV6; inc->inc6_faddr = ip6->ip6_src; inc->inc6_laddr = ip6->ip6_dst; } } if (th) { bcopy(tcp, th, sizeof(*th)); tcp_fields_to_host(th); /* just like tcp_input */ } } static struct l2t_entry * -get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp, +get_l2te_for_nexthop(struct port_info *pi, if_t ifp, struct in_conninfo *inc) { struct l2t_entry *e; struct sockaddr_in6 sin6; struct sockaddr *dst = (void *)&sin6; struct nhop_object *nh; if (inc->inc_flags & INC_ISIPV6) { bzero(dst, sizeof(struct sockaddr_in6)); dst->sa_len = sizeof(struct sockaddr_in6); dst->sa_family = AF_INET6; if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) { /* no need for route lookup */ e = t4_l2t_get(pi, ifp, dst); return (e); } nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (NULL); if (nh->nh_ifp != ifp) return (NULL); if (nh->nh_flags & NHF_GATEWAY) ((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr; else ((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr; } else { dst->sa_len = sizeof(struct sockaddr_in); dst->sa_family = AF_INET; nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (NULL); if (nh->nh_ifp != ifp) return (NULL); if (nh->nh_flags & NHF_GATEWAY) if (nh->gw_sa.sa_family == AF_INET) ((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr; else *((struct sockaddr_in6 *)dst) = nh->gw6_sa; else ((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr; } e = t4_l2t_get(pi, ifp, dst); 
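
Aside: pass_accept_req_to_protohdrs() above recovers the TOS byte from an IPv6 header by shifting the traffic-class bits out of ip6_flow. A standalone sketch of just that arithmetic; the sample flow word below is made up.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t
ip6_tos(uint32_t ip6_flow)              /* first 32-bit word, network order */
{
        /* version:4 | traffic class:8 | flow label:20 */
        return ((ntohl(ip6_flow) >> 20) & 0xff);
}

int
main(void)
{
        /* version 6, traffic class 0xb8, flow label 0x12345 */
        uint32_t flow = htonl((6u << 28) | (0xb8u << 20) | 0x12345);

        printf("iptos = 0x%02x\n", ip6_tos(flow));
        return (0);
}
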
return (e); } static int send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0, uint32_t opt2, int tid) { struct wrqe *wr; struct cpl_pass_accept_rpl *rpl; struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) : sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]); if (wr == NULL) return (ENOMEM); rpl = wrtod(wr); if (is_t4(sc)) INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid); else { struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl; INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid); rpl5->iss = htobe32(synqe->iss); } rpl->opt0 = opt0; rpl->opt2 = opt2; return (t4_l2t_send(sc, wr, e)); } #define REJECT_PASS_ACCEPT_REQ(tunnel) do { \ if (!tunnel) { \ m_freem(m); \ m = NULL; \ } \ reject_reason = __LINE__; \ goto reject; \ } while (0) /* * The context associated with a tid entry via insert_tid could be a synq_entry * or a toepcb. The only way CPL handlers can tell is via a bit in these flags. */ CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags)); /* * Incoming SYN on a listening socket. * * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe, * etc. */ static int do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct toedev *tod; const struct cpl_pass_accept_req *cpl = mtod(m, const void *); unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); unsigned int tid = GET_TID(cpl); struct listen_ctx *lctx = lookup_stid(sc, stid); struct inpcb *inp; struct socket *so; struct in_conninfo inc; struct tcphdr th; struct tcpopt to; struct port_info *pi; struct vi_info *vi; - struct ifnet *hw_ifp, *ifp; + if_t hw_ifp, ifp; struct l2t_entry *e = NULL; struct synq_entry *synqe = NULL; int reject_reason, v, ntids; uint16_t vid, l2info; struct epoch_tracker et; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif struct offload_settings settings; uint8_t iptos; KASSERT(opcode == CPL_PASS_ACCEPT_REQ, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid, lctx); /* * Figure out the port the SYN arrived on. We'll look for an exact VI * match in a bit but in case we don't find any we'll use the main VI as * the incoming ifnet. */ l2info = be16toh(cpl->l2info); pi = sc->port[G_SYN_INTF(l2info)]; hw_ifp = pi->vi[0].ifp; m->m_pkthdr.rcvif = hw_ifp; CURVNET_SET(lctx->vnet); /* before any potential REJECT */ /* * If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will * also hit the listener. We don't want to offload those. */ if (encapsulated_syn(sc, cpl)) { REJECT_PASS_ACCEPT_REQ(true); } /* * Use the MAC index to lookup the associated VI. If this SYN didn't * match a perfect MAC filter, punt. */ if (!(l2info & F_SYN_XACT_MATCH)) { REJECT_PASS_ACCEPT_REQ(true); } for_each_vi(pi, v, vi) { if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info)) goto found; } REJECT_PASS_ACCEPT_REQ(true); found: hw_ifp = vi->ifp; /* the cxgbe ifnet */ m->m_pkthdr.rcvif = hw_ifp; tod = TOEDEV(hw_ifp); /* * Don't offload if the peer requested a TCP option that's not known to * the silicon. Send the SYN to the kernel instead. */ if (__predict_false(cpl->tcpopt.unknown)) REJECT_PASS_ACCEPT_REQ(true); /* * Figure out if there is a pseudo interface (vlan, lagg, etc.) * involved. 
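
Aside: the CTASSERT above is what lets a CPL handler look at the flags word of a tid's context before it knows whether that context is a toepcb or a synq_entry. A compile-time sketch of the same trick; fake_toepcb and fake_synq_ent are made-up stand-ins, not driver types.

#include <stddef.h>
#include <stdio.h>

#define FLAG_IS_SYNQE   0x01

struct fake_toepcb      { int tid; unsigned int flags; };
struct fake_synq_ent    { int tid; unsigned int flags; char syn[64]; };

_Static_assert(offsetof(struct fake_toepcb, flags) ==
    offsetof(struct fake_synq_ent, flags), "flags must share an offset");

static int
is_synqe(void *ctx)
{
        /* Safe only because of the static assertion above. */
        return ((((struct fake_toepcb *)ctx)->flags & FLAG_IS_SYNQE) != 0);
}

int
main(void)
{
        struct fake_synq_ent se = { .tid = 7, .flags = FLAG_IS_SYNQE };
        struct fake_toepcb tp = { .tid = 9, .flags = 0 };

        printf("%d %d\n", is_synqe(&se), is_synqe(&tp));
        return (0);
}
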
Don't offload if the SYN had a VLAN tag and the vid * doesn't match anything on this interface. * * XXX: lagg support, lagg + vlan support. */ vid = EVL_VLANOFTAG(be16toh(cpl->vlan)); if (vid != 0xfff && vid != 0) { ifp = VLAN_DEVAT(hw_ifp, vid); if (ifp == NULL) REJECT_PASS_ACCEPT_REQ(true); } else ifp = hw_ifp; /* * Don't offload if the ifnet that the SYN came in on is not in the same * vnet as the listening socket. */ - if (lctx->vnet != ifp->if_vnet) + if (lctx->vnet != if_getvnet(ifp)) REJECT_PASS_ACCEPT_REQ(true); pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos); if (inc.inc_flags & INC_ISIPV6) { /* Don't offload if the ifcap isn't enabled */ - if ((ifp->if_capenable & IFCAP_TOE6) == 0) + if ((if_getcapenable(ifp) & IFCAP_TOE6) == 0) REJECT_PASS_ACCEPT_REQ(true); /* * SYN must be directed to an IP6 address on this ifnet. This * is more restrictive than in6_localip. */ NET_EPOCH_ENTER(et); if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } ntids = 2; } else { /* Don't offload if the ifcap isn't enabled */ - if ((ifp->if_capenable & IFCAP_TOE4) == 0) + if ((if_getcapenable(ifp) & IFCAP_TOE4) == 0) REJECT_PASS_ACCEPT_REQ(true); /* * SYN must be directed to an IP address on this ifnet. This * is more restrictive than in_localip. */ NET_EPOCH_ENTER(et); if (!in_ifhasaddr(ifp, inc.inc_laddr)) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } ntids = 1; } e = get_l2te_for_nexthop(pi, ifp, &inc); if (e == NULL) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } /* Don't offload if the 4-tuple is already in use */ if (toe_4tuple_check(&inc, &th, ifp) != 0) { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } inp = lctx->inp; /* listening socket, not owned by TOE */ INP_RLOCK(inp); /* Don't offload if the listening socket has closed */ if (__predict_false(inp->inp_flags & INP_DROPPED)) { INP_RUNLOCK(inp); NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } so = inp->inp_socket; rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, EVL_MAKETAG(0xfff, 0, 0), inp); rw_runlock(&sc->policy_lock); if (!settings.offload) { INP_RUNLOCK(inp); NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); /* Rejected by COP. */ } synqe = alloc_synqe(sc, lctx, M_NOWAIT); if (synqe == NULL) { INP_RUNLOCK(inp); NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } MPASS(rss->hash_type == RSS_HASH_TCP); synqe->rss_hash = be32toh(rss->hash_val); atomic_store_int(&synqe->ok_to_respond, 0); init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx, &synqe->params); /* * If all goes well t4_syncache_respond will get called during * syncache_add. Note that syncache_add releases the pcb lock. 
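
Aside: the vid handling here (and the vtag construction in t4_l2t_get() later in this patch) relies on the standard 802.1Q TCI layout encoded by EVL_VLANOFTAG()/EVL_MAKETAG(): PCP in the top three bits, DEI in bit 12, the 12-bit VLAN id in the low bits, with 0xfff meaning "no vlan". A sketch with stand-in macros; the real ones live in net/ethernet.h.

#include <stdint.h>
#include <stdio.h>

#define VLAN_OF_TAG(tci)        ((tci) & 0x0fff)
#define PCP_OF_TAG(tci)         (((tci) >> 13) & 0x7)
#define MAKE_TAG(vid, pcp, dei) (((pcp) << 13) | ((dei) << 12) | (vid))

int
main(void)
{
        uint16_t tci = MAKE_TAG(100, 5, 0);

        printf("vid %d pcp %d\n", VLAN_OF_TAG(tci), PCP_OF_TAG(tci));
        return (0);
}
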
*/ t4opt_to_tcpopt(&cpl->tcpopt, &to); toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos); if (atomic_load_int(&synqe->ok_to_respond) > 0) { uint64_t opt0; uint32_t opt2; opt0 = calc_options0(vi, &synqe->params); opt2 = calc_options2(vi, &synqe->params); insert_tid(sc, tid, synqe, ntids); synqe->tid = tid; synqe->syn = m; m = NULL; if (send_synack(sc, synqe, opt0, opt2, tid) != 0) { remove_tid(sc, tid, ntids); m = synqe->syn; synqe->syn = NULL; NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(true); } CTR6(KTR_CXGBE, "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x", __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2)); } else { NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); reject: CURVNET_RESTORE(); CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid, reject_reason); if (e) t4_l2t_release(e); release_tid(sc, tid, lctx->ctrlq); if (synqe) { inp = synqe->lctx->inp; INP_WLOCK(inp); inp = release_synqe(sc, synqe); if (inp) INP_WUNLOCK(inp); } if (m) { /* * The connection request hit a TOE listener but is being passed * on to the kernel sw stack instead of getting offloaded. */ m_adj(m, sizeof(*cpl)); m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; - hw_ifp->if_input(hw_ifp, m); + if_input(hw_ifp, m); } return (reject_reason); } static void synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe, const struct cpl_pass_establish *cpl, struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to) { uint16_t tcp_opt = be16toh(cpl->tcp_opt); uint8_t iptos; /* start off with the original SYN */ pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos); /* modify parts to make it look like the ACK to our SYN|ACK */ th->th_flags = TH_ACK; th->th_ack = synqe->iss + 1; th->th_seq = be32toh(cpl->rcv_isn); bzero(to, sizeof(*to)); if (G_TCPOPT_TSTAMP(tcp_opt)) { to->to_flags |= TOF_TS; to->to_tsecr = synqe->ts; } } static int do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct vi_info *vi; - struct ifnet *ifp; + if_t ifp; const struct cpl_pass_establish *cpl = (const void *)(rss + 1); #if defined(KTR) || defined(INVARIANTS) unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); #endif unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; struct inpcb *inp = lctx->inp, *new_inp; struct socket *so; struct tcphdr th; struct tcpopt to; struct in_conninfo inc; struct toepcb *toep; struct epoch_tracker et; int rstreason; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif KASSERT(opcode == CPL_PASS_ESTABLISH, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); KASSERT(synqe->flags & TPF_SYNQE, ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); CURVNET_SET(lctx->vnet); NET_EPOCH_ENTER(et); /* for syncache_expand */ INP_WLOCK(inp); CTR6(KTR_CXGBE, "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x", __func__, stid, tid, synqe, synqe->flags, inp->inp_flags); ifp = synqe->syn->m_pkthdr.rcvif; - vi = ifp->if_softc; + vi = if_getsoftc(ifp); KASSERT(vi->adapter == sc, ("%s: vi %p, sc %p mismatch", __func__, vi, sc)); if (__predict_false(inp->inp_flags & INP_DROPPED)) { reset: send_abort_rpl_synqe(TOEDEV(ifp), synqe, 
CPL_ABORT_SEND_RST); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0], ("%s: CPL arrived on unexpected rxq. %d %d", __func__, synqe->params.rxq_idx, (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0]))); toep = alloc_toepcb(vi, M_NOWAIT); if (toep == NULL) goto reset; toep->tid = tid; toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx]; toep->vnet = lctx->vnet; bcopy(&synqe->params, &toep->params, sizeof(toep->params)); init_toepcb(vi, toep); MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss); MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs); synqe->tcp_opt = cpl->tcp_opt; synqe->toep = toep; /* Come up with something that syncache_expand should be ok with. */ synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to); if (inc.inc_flags & INC_ISIPV6) { if (lctx->ce == NULL) { toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true); if (toep->ce == NULL) { free_toepcb(toep); goto reset; /* RST without a CLIP entry? */ } } else { t4_hold_clip_entry(sc, lctx->ce); toep->ce = lctx->ce; } } so = inp->inp_socket; KASSERT(so != NULL, ("%s: socket is NULL", __func__)); rstreason = toe_syncache_expand(&inc, &to, &th, &so); if (rstreason < 0) { free_toepcb(toep); send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } else if (rstreason == 0 || so == NULL) { free_toepcb(toep); goto reset; } /* New connection inpcb is already locked by syncache_expand(). */ new_inp = sotoinpcb(so); INP_WLOCK_ASSERT(new_inp); MPASS(so->so_vnet == lctx->vnet); /* * This is for expansion from syncookies. * * XXX: we've held the tcbinfo lock throughout so there's no risk of * anyone accept'ing a connection before we've installed our hooks, but * this somewhat defeats the purpose of having a tod_offload_socket :-( */ if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) { tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0); t4_offload_socket(TOEDEV(ifp), synqe, so); } INP_WUNLOCK(new_inp); /* Done with the synqe */ inp = release_synqe(sc, synqe); if (inp != NULL) INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } void t4_init_listen_cpl_handlers(void) { t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl); t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl); t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); } void t4_uninit_listen_cpl_handlers(void) { t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL); t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL); t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL); t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL); } #endif diff --git a/sys/dev/cxgbe/tom/t4_tom_l2t.c b/sys/dev/cxgbe/tom/t4_tom_l2t.c index e91d5c955758..95bb1b5ede86 100644 --- a/sys/dev/cxgbe/tom/t4_tom_l2t.c +++ b/sys/dev/cxgbe/tom/t4_tom_l2t.c @@ -1,457 +1,457 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" #define VLAN_NONE 0xfff static inline void l2t_hold(struct l2t_data *d, struct l2t_entry *e) { if (atomic_fetchadd_int(&e->refcnt, 1) == 0) /* 0 -> 1 transition */ atomic_subtract_int(&d->nfree, 1); } static inline u_int l2_hash(struct l2t_data *d, const struct sockaddr *sa, int ifindex) { u_int hash, half = d->l2t_size / 2, start = 0; const void *key; size_t len; KASSERT(sa->sa_family == AF_INET || sa->sa_family == AF_INET6, ("%s: sa %p has unexpected sa_family %d", __func__, sa, sa->sa_family)); if (sa->sa_family == AF_INET) { const struct sockaddr_in *sin = (const void *)sa; key = &sin->sin_addr; len = sizeof(sin->sin_addr); } else { const struct sockaddr_in6 *sin6 = (const void *)sa; key = &sin6->sin6_addr; len = sizeof(sin6->sin6_addr); start = half; } hash = fnv_32_buf(key, len, FNV1_32_INIT); hash = fnv_32_buf(&ifindex, sizeof(ifindex), hash); hash %= half; return (hash + start); } static inline int l2_cmp(const struct sockaddr *sa, struct l2t_entry *e) { KASSERT(sa->sa_family == AF_INET || sa->sa_family == AF_INET6, ("%s: sa %p has unexpected sa_family %d", __func__, sa, sa->sa_family)); if (sa->sa_family == AF_INET) { const struct sockaddr_in *sin = (const void *)sa; return (e->addr[0] != sin->sin_addr.s_addr); } else { const struct sockaddr_in6 *sin6 = (const void *)sa; return (memcmp(&e->addr[0], &sin6->sin6_addr, sizeof(e->addr))); } } static inline void l2_store(const struct sockaddr *sa, struct l2t_entry *e) { KASSERT(sa->sa_family == AF_INET || sa->sa_family == AF_INET6, ("%s: sa %p has unexpected sa_family %d", __func__, sa, sa->sa_family)); if (sa->sa_family == AF_INET) { const struct sockaddr_in *sin = (const void *)sa; e->addr[0] = sin->sin_addr.s_addr; e->ipv6 = 0; } else { const struct sockaddr_in6 *sin6 = (const void *)sa; memcpy(&e->addr[0], &sin6->sin6_addr, sizeof(e->addr)); e->ipv6 = 1; } } /* * Add a WR to an L2T entry's queue of work requests awaiting resolution. * Must be called with the entry's lock held. 
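
Aside: l2_hash() above spreads entries with an FNV-1 hash over the destination address plus the interface index, keeping IPv4 entries in the lower half of the table and IPv6 entries in the upper half. A userland sketch of the same bucket selection; the FNV constants used here are the widely published 32-bit values (the kernel's own are in sys/fnv_hash.h) and L2T_SIZE below is arbitrary.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define L2T_SIZE        256u

static uint32_t
fnv1_32(const void *buf, size_t len, uint32_t hval)
{
        const unsigned char *p = buf;

        while (len-- > 0) {
                hval *= 0x01000193u;    /* 32-bit FNV prime */
                hval ^= *p++;
        }
        return (hval);
}

static unsigned int
l2_bucket(const void *addr, size_t addrlen, int ifindex, int ipv6)
{
        unsigned int half = L2T_SIZE / 2;
        uint32_t h;

        h = fnv1_32(addr, addrlen, 0x811c9dc5u);        /* offset basis */
        h = fnv1_32(&ifindex, sizeof(ifindex), h);
        return (h % half + (ipv6 ? half : 0));
}

int
main(void)
{
        uint32_t v4 = 0x0a000001;                       /* 10.0.0.1 */
        unsigned char v6[16] = { 0x20, 0x01, 0x0d, 0xb8 };

        printf("v4 bucket %u, v6 bucket %u\n",
            l2_bucket(&v4, sizeof(v4), 3, 0),
            l2_bucket(v6, sizeof(v6), 3, 1));
        return (0);
}
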
*/ static inline void arpq_enqueue(struct l2t_entry *e, struct wrqe *wr) { mtx_assert(&e->lock, MA_OWNED); STAILQ_INSERT_TAIL(&e->wr_list, wr, link); } static inline void send_pending(struct adapter *sc, struct l2t_entry *e) { struct wrqe *wr; mtx_assert(&e->lock, MA_OWNED); while ((wr = STAILQ_FIRST(&e->wr_list)) != NULL) { STAILQ_REMOVE_HEAD(&e->wr_list, link); t4_wrq_tx(sc, wr); } } static void resolution_failed(struct adapter *sc, struct l2t_entry *e) { struct tom_data *td = sc->tom_softc; mtx_assert(&e->lock, MA_OWNED); mtx_lock(&td->unsent_wr_lock); STAILQ_CONCAT(&td->unsent_wr_list, &e->wr_list); mtx_unlock(&td->unsent_wr_lock); taskqueue_enqueue(taskqueue_thread, &td->reclaim_wr_resources); } static void update_entry(struct adapter *sc, struct l2t_entry *e, uint8_t *lladdr, uint16_t vtag) { mtx_assert(&e->lock, MA_OWNED); /* * The entry may be in active use (e->refcount > 0) or not. We update * it even when it's not as this simplifies the case where we decide to * reuse the entry later. */ if (lladdr == NULL && (e->state == L2T_STATE_RESOLVING || e->state == L2T_STATE_FAILED)) { /* * Never got a valid L2 address for this one. Just mark it as * failed instead of removing it from the hash (for which we'd * need to wlock the table). */ e->state = L2T_STATE_FAILED; resolution_failed(sc, e); return; } else if (lladdr == NULL) { /* Valid or already-stale entry was deleted (or expired) */ KASSERT(e->state == L2T_STATE_VALID || e->state == L2T_STATE_STALE, ("%s: lladdr NULL, state %d", __func__, e->state)); e->state = L2T_STATE_STALE; } else { if (e->state == L2T_STATE_RESOLVING || e->state == L2T_STATE_FAILED || memcmp(e->dmac, lladdr, ETHER_ADDR_LEN)) { /* unresolved -> resolved; or dmac changed */ memcpy(e->dmac, lladdr, ETHER_ADDR_LEN); e->vlan = vtag; t4_write_l2e(e, 1); } e->state = L2T_STATE_VALID; } } static int resolve_entry(struct adapter *sc, struct l2t_entry *e) { struct tom_data *td = sc->tom_softc; struct toedev *tod = &td->tod; struct sockaddr_in sin = {0}; struct sockaddr_in6 sin6 = {0}; struct sockaddr *sa; uint8_t dmac[ETHER_HDR_LEN]; uint16_t vtag; int rc; if (e->ipv6 == 0) { sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr.s_addr = e->addr[0]; sa = (void *)&sin; } else { sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); memcpy(&sin6.sin6_addr, &e->addr[0], sizeof(e->addr)); sa = (void *)&sin6; } vtag = EVL_MAKETAG(VLAN_NONE, 0, 0); rc = toe_l2_resolve(tod, e->ifp, sa, dmac, &vtag); if (rc == EWOULDBLOCK) return (rc); mtx_lock(&e->lock); update_entry(sc, e, rc == 0 ? 
dmac : NULL, vtag); mtx_unlock(&e->lock); return (rc); } int t4_l2t_send_slow(struct adapter *sc, struct wrqe *wr, struct l2t_entry *e) { again: switch (e->state) { case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ resolve_entry(sc, e); /* Fall through */ case L2T_STATE_VALID: /* fast-path, send the packet on */ t4_wrq_tx(sc, wr); return (0); case L2T_STATE_RESOLVING: case L2T_STATE_SYNC_WRITE: mtx_lock(&e->lock); if (e->state != L2T_STATE_SYNC_WRITE && e->state != L2T_STATE_RESOLVING) { /* state changed by the time we got here */ mtx_unlock(&e->lock); goto again; } arpq_enqueue(e, wr); mtx_unlock(&e->lock); if (resolve_entry(sc, e) == EWOULDBLOCK) break; mtx_lock(&e->lock); if (e->state == L2T_STATE_VALID && !STAILQ_EMPTY(&e->wr_list)) send_pending(sc, e); if (e->state == L2T_STATE_FAILED) resolution_failed(sc, e); mtx_unlock(&e->lock); break; case L2T_STATE_FAILED: return (EHOSTUNREACH); } return (0); } int do_l2t_write_rpl2(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1); unsigned int tid = GET_TID(rpl); unsigned int idx = tid % L2T_SIZE; if (__predict_false(rpl->status != CPL_ERR_NONE)) { log(LOG_ERR, "Unexpected L2T_WRITE_RPL (%u) for entry at hw_idx %u\n", rpl->status, idx); return (EINVAL); } if (tid & F_SYNC_WR) { struct l2t_entry *e = &sc->l2t->l2tab[idx - sc->vres.l2t.start]; mtx_lock(&e->lock); if (e->state != L2T_STATE_SWITCHING) { send_pending(sc, e); e->state = L2T_STATE_VALID; } mtx_unlock(&e->lock); } return (0); } /* * The TOE wants an L2 table entry that it can use to reach the next hop over * the specified port. Produce such an entry - create one if needed. * * Note that the ifnet could be a pseudo-device like if_vlan, if_lagg, etc. on * top of the real cxgbe interface. */ struct l2t_entry * -t4_l2t_get(struct port_info *pi, struct ifnet *ifp, struct sockaddr *sa) +t4_l2t_get(struct port_info *pi, if_t ifp, struct sockaddr *sa) { struct l2t_entry *e; struct adapter *sc = pi->adapter; struct l2t_data *d = sc->l2t; u_int hash, smt_idx = pi->port_id; uint16_t vid, pcp, vtag; KASSERT(sa->sa_family == AF_INET || sa->sa_family == AF_INET6, ("%s: sa %p has unexpected sa_family %d", __func__, sa, sa->sa_family)); vid = VLAN_NONE; pcp = 0; - if (ifp->if_type == IFT_L2VLAN) { + if (if_gettype(ifp) == IFT_L2VLAN) { VLAN_TAG(ifp, &vid); VLAN_PCP(ifp, &pcp); - } else if (ifp->if_pcp != IFNET_PCP_NONE) { + } else if ((pcp = if_getpcp(ifp)) != IFNET_PCP_NONE) vid = 0; - pcp = ifp->if_pcp; - } + else + pcp = 0; vtag = EVL_MAKETAG(vid, pcp, 0); - hash = l2_hash(d, sa, ifp->if_index); + hash = l2_hash(d, sa, if_getindex(ifp)); rw_wlock(&d->lock); for (e = d->l2tab[hash].first; e; e = e->next) { if (l2_cmp(sa, e) == 0 && e->ifp == ifp && e->vlan == vtag && e->smt_idx == smt_idx) { l2t_hold(d, e); goto done; } } /* Need to allocate a new entry */ e = t4_alloc_l2e(d); if (e) { mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ e->next = d->l2tab[hash].first; d->l2tab[hash].first = e; e->state = L2T_STATE_RESOLVING; l2_store(sa, e); e->ifp = ifp; e->smt_idx = smt_idx; e->hash = hash; e->lport = pi->lport; e->wrq = &sc->sge.ctrlq[pi->port_id]; e->iqid = sc->sge.ofld_rxq[pi->vi[0].first_ofld_rxq].iq.abs_id; atomic_store_rel_int(&e->refcnt, 1); e->vlan = vtag; mtx_unlock(&e->lock); } done: rw_wunlock(&d->lock); return e; } /* * Called when the host's ARP layer makes a change to some entry that is loaded * into the HW L2 table. 
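
Aside: do_l2t_write_rpl2() above recovers the hardware L2T index from the tid echoed in the reply (tid % L2T_SIZE) and uses a separate flag bit to recognize synchronous writes whose queued work requests must now be sent. A sketch of that decoding with assumed, illustrative constants; the real F_SYNC_WR and L2T_SIZE come from the driver headers.

#include <stdio.h>

#define L2T_SIZE        4096u
#define F_SYNC_WR       (1u << 12)      /* illustrative bit position */

struct rpl_cookie {
        unsigned int idx;       /* hardware L2T index */
        int sync;               /* pending WRs waiting on this write */
};

static struct rpl_cookie
decode_l2t_rpl_tid(unsigned int tid)
{
        struct rpl_cookie c;

        c.idx = tid % L2T_SIZE;
        c.sync = (tid & F_SYNC_WR) != 0;
        return (c);
}

int
main(void)
{
        struct rpl_cookie c = decode_l2t_rpl_tid(37 | F_SYNC_WR);

        printf("idx %u sync %d\n", c.idx, c.sync);
        return (0);
}
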
*/ void -t4_l2_update(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa, +t4_l2_update(struct toedev *tod, if_t ifp, struct sockaddr *sa, uint8_t *lladdr, uint16_t vtag) { struct adapter *sc = tod->tod_softc; struct l2t_entry *e; struct l2t_data *d = sc->l2t; u_int hash; KASSERT(d != NULL, ("%s: no L2 table", __func__)); - hash = l2_hash(d, sa, ifp->if_index); + hash = l2_hash(d, sa, if_getindex(ifp)); rw_rlock(&d->lock); for (e = d->l2tab[hash].first; e; e = e->next) { if (l2_cmp(sa, e) == 0 && e->ifp == ifp) { mtx_lock(&e->lock); if (atomic_load_acq_int(&e->refcnt)) goto found; e->state = L2T_STATE_STALE; mtx_unlock(&e->lock); break; } } rw_runlock(&d->lock); /* * This is of no interest to us. We've never had an offloaded * connection to this destination, and we aren't attempting one right * now. */ return; found: rw_runlock(&d->lock); KASSERT(e->state != L2T_STATE_UNUSED, ("%s: unused entry in the hash.", __func__)); update_entry(sc, e, lladdr, vtag); mtx_unlock(&e->lock); } #endif diff --git a/sys/dev/cxgbe/tom/t4_tom_l2t.h b/sys/dev/cxgbe/tom/t4_tom_l2t.h index dc8e04d8d978..1d661fd74f80 100644 --- a/sys/dev/cxgbe/tom/t4_tom_l2t.h +++ b/sys/dev/cxgbe/tom/t4_tom_l2t.h @@ -1,55 +1,55 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef __T4_TOM_L2T_H #define __T4_TOM_L2T_H #include "t4_l2t.h" int t4_l2t_send_slow(struct adapter *, struct wrqe *, struct l2t_entry *); -struct l2t_entry *t4_l2t_get(struct port_info *, struct ifnet *, +struct l2t_entry *t4_l2t_get(struct port_info *, if_t, struct sockaddr *); -void t4_l2_update(struct toedev *, struct ifnet *, struct sockaddr *, +void t4_l2_update(struct toedev *, if_t, struct sockaddr *, uint8_t *, uint16_t); int do_l2t_write_rpl2(struct sge_iq *, const struct rss_header *, struct mbuf *); static inline int t4_l2t_send(struct adapter *sc, struct wrqe *wr, struct l2t_entry *e) { if (__predict_true(e->state == L2T_STATE_VALID)) { t4_wrq_tx(sc, wr); return (0); } else return (t4_l2t_send_slow(sc, wr, e)); } #endif /* __T4_TOM_L2T_H */
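
Aside: the inline t4_l2t_send() at the end of t4_tom_l2t.h splits the send into a fast path for already-resolved entries and an out-of-line slow path. A simplified userland sketch of that split; the entry type, states, and "send" below are placeholders, and the real slow path additionally handles stale revalidation and resolution failure.

#include <stdio.h>

enum l2_state { RESOLVING, VALID, STALE, FAILED };

struct entry {
        enum l2_state state;
};

static int
send_slow(struct entry *e, const char *pkt)
{
        /* Would lock the entry, queue the packet, and kick off resolution. */
        (void)e;
        printf("queued '%s' until the entry resolves\n", pkt);
        return (0);
}

static inline int
send_fast(struct entry *e, const char *pkt)
{
        if (__builtin_expect(e->state == VALID, 1)) {   /* __predict_true */
                printf("sent '%s'\n", pkt);
                return (0);
        }
        return (send_slow(e, pkt));
}

int
main(void)
{
        struct entry e = { .state = RESOLVING };

        send_fast(&e, "syn-ack");
        e.state = VALID;
        send_fast(&e, "payload");
        return (0);
}
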