diff --git a/sys/dev/cxgbe/common/common.h b/sys/dev/cxgbe/common/common.h index 894e0444b710..a49c21576994 100644 --- a/sys/dev/cxgbe/common/common.h +++ b/sys/dev/cxgbe/common/common.h @@ -1,966 +1,1008 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #ifndef __CHELSIO_COMMON_H #define __CHELSIO_COMMON_H #include "t4_hw.h" enum { MAX_NPORTS = 4, /* max # of ports */ SERNUM_LEN = 24, /* Serial # length */ EC_LEN = 16, /* E/C length */ ID_LEN = 16, /* ID length */ PN_LEN = 16, /* Part Number length */ MD_LEN = 16, /* MFG diags version length */ MACADDR_LEN = 12, /* MAC Address length */ }; enum { T4_REGMAP_SIZE = (160 * 1024), T5_REGMAP_SIZE = (332 * 1024), }; enum { MEM_EDC0, MEM_EDC1, MEM_MC, MEM_MC0 = MEM_MC, MEM_MC1, MEM_HMA }; enum dev_master { MASTER_CANT, MASTER_MAY, MASTER_MUST }; enum dev_state { DEV_STATE_UNINIT, DEV_STATE_INIT, DEV_STATE_ERR }; enum { PAUSE_RX = 1 << 0, PAUSE_TX = 1 << 1, PAUSE_AUTONEG = 1 << 2 }; enum { /* * Real FECs. In the same order as the FEC portion of caps32 so that * the code can do (fec & M_FW_PORT_CAP32_FEC) to get all the real FECs. */ FEC_RS = 1 << 0, /* Reed-Solomon */ FEC_BASER_RS = 1 << 1, /* BASE-R, aka Firecode */ FEC_NONE = 1 << 2, /* no FEC */ /* * Pseudo FECs that translate to real FECs. The firmware knows nothing * about these and they start at M_FW_PORT_CAP32_FEC + 1. AUTO should * be set all by itself. */ FEC_AUTO = 1 << 5, FEC_MODULE = 1 << 6, /* FEC suggested by the cable/transceiver. 
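* Like FEC_AUTO, this is a pseudo FEC: the driver resolves it to one of the real FECs above before anything is sent to the firmware.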
*/ }; enum t4_bar2_qtype { T4_BAR2_QTYPE_EGRESS, T4_BAR2_QTYPE_INGRESS }; struct port_stats { u64 tx_octets; /* total # of octets in good frames */ u64 tx_frames; /* all good frames */ u64 tx_bcast_frames; /* all broadcast frames */ u64 tx_mcast_frames; /* all multicast frames */ u64 tx_ucast_frames; /* all unicast frames */ u64 tx_error_frames; /* all error frames */ u64 tx_frames_64; /* # of Tx frames in a particular range */ u64 tx_frames_65_127; u64 tx_frames_128_255; u64 tx_frames_256_511; u64 tx_frames_512_1023; u64 tx_frames_1024_1518; u64 tx_frames_1519_max; u64 tx_drop; /* # of dropped Tx frames */ u64 tx_pause; /* # of transmitted pause frames */ u64 tx_ppp0; /* # of transmitted PPP prio 0 frames */ u64 tx_ppp1; /* # of transmitted PPP prio 1 frames */ u64 tx_ppp2; /* # of transmitted PPP prio 2 frames */ u64 tx_ppp3; /* # of transmitted PPP prio 3 frames */ u64 tx_ppp4; /* # of transmitted PPP prio 4 frames */ u64 tx_ppp5; /* # of transmitted PPP prio 5 frames */ u64 tx_ppp6; /* # of transmitted PPP prio 6 frames */ u64 tx_ppp7; /* # of transmitted PPP prio 7 frames */ u64 rx_octets; /* total # of octets in good frames */ u64 rx_frames; /* all good frames */ u64 rx_bcast_frames; /* all broadcast frames */ u64 rx_mcast_frames; /* all multicast frames */ u64 rx_ucast_frames; /* all unicast frames */ u64 rx_too_long; /* # of frames exceeding MTU */ u64 rx_jabber; /* # of jabber frames */ u64 rx_fcs_err; /* # of received frames with bad FCS */ u64 rx_len_err; /* # of received frames with length error */ u64 rx_symbol_err; /* symbol errors */ u64 rx_runt; /* # of short frames */ u64 rx_frames_64; /* # of Rx frames in a particular range */ u64 rx_frames_65_127; u64 rx_frames_128_255; u64 rx_frames_256_511; u64 rx_frames_512_1023; u64 rx_frames_1024_1518; u64 rx_frames_1519_max; u64 rx_pause; /* # of received pause frames */ u64 rx_ppp0; /* # of received PPP prio 0 frames */ u64 rx_ppp1; /* # of received PPP prio 1 frames */ u64 rx_ppp2; /* # of received PPP prio 2 frames */ u64 rx_ppp3; /* # of received PPP prio 3 frames */ u64 rx_ppp4; /* # of received PPP prio 4 frames */ u64 rx_ppp5; /* # of received PPP prio 5 frames */ u64 rx_ppp6; /* # of received PPP prio 6 frames */ u64 rx_ppp7; /* # of received PPP prio 7 frames */ u64 rx_ovflow0; /* drops due to buffer-group 0 overflows */ u64 rx_ovflow1; /* drops due to buffer-group 1 overflows */ u64 rx_ovflow2; /* drops due to buffer-group 2 overflows */ u64 rx_ovflow3; /* drops due to buffer-group 3 overflows */ u64 rx_trunc0; /* buffer-group 0 truncated packets */ u64 rx_trunc1; /* buffer-group 1 truncated packets */ u64 rx_trunc2; /* buffer-group 2 truncated packets */ u64 rx_trunc3; /* buffer-group 3 truncated packets */ }; struct lb_port_stats { u64 octets; u64 frames; u64 bcast_frames; u64 mcast_frames; u64 ucast_frames; u64 error_frames; u64 frames_64; u64 frames_65_127; u64 frames_128_255; u64 frames_256_511; u64 frames_512_1023; u64 frames_1024_1518; u64 frames_1519_max; u64 drop; u64 ovflow0; u64 ovflow1; u64 ovflow2; u64 ovflow3; u64 trunc0; u64 trunc1; u64 trunc2; u64 trunc3; }; struct tp_tcp_stats { u32 tcp_out_rsts; u64 tcp_in_segs; u64 tcp_out_segs; u64 tcp_retrans_segs; }; struct tp_usm_stats { u32 frames; u32 drops; u64 octets; }; struct tp_tid_stats { u32 del; u32 inv; u32 act; u32 pas; }; struct tp_fcoe_stats { u32 frames_ddp; u32 frames_drop; u64 octets_ddp; }; struct tp_err_stats { u32 mac_in_errs[MAX_NCHAN]; u32 hdr_in_errs[MAX_NCHAN]; u32 tcp_in_errs[MAX_NCHAN]; u32 tnl_cong_drops[MAX_NCHAN]; u32 
ofld_chan_drops[MAX_NCHAN]; u32 tnl_tx_drops[MAX_NCHAN]; u32 ofld_vlan_drops[MAX_NCHAN]; u32 tcp6_in_errs[MAX_NCHAN]; u32 ofld_no_neigh; u32 ofld_cong_defer; }; struct tp_tnl_stats { u32 out_pkt[MAX_NCHAN]; u32 in_pkt[MAX_NCHAN]; }; struct tp_proxy_stats { u32 proxy[MAX_NCHAN]; }; struct tp_cpl_stats { u32 req[MAX_NCHAN]; u32 rsp[MAX_NCHAN]; }; struct tp_rdma_stats { u32 rqe_dfr_pkt; u32 rqe_dfr_mod; }; struct sge_params { int timer_val[SGE_NTIMERS]; /* final, scaled values */ int counter_val[SGE_NCOUNTERS]; int fl_starve_threshold; int fl_starve_threshold2; int page_shift; int eq_s_qpp; int iq_s_qpp; int spg_len; int pad_boundary; int pack_boundary; int fl_pktshift; u32 sge_control; u32 sge_fl_buffer_size[SGE_FLBUF_SIZES]; }; struct tp_params { unsigned int tre; /* log2 of core clocks per TP tick */ unsigned int dack_re; /* DACK timer resolution */ unsigned int la_mask; /* what events are recorded by TP LA */ uint16_t filter_mode; uint16_t filter_mask; /* Used by TOE and hashfilters */ int vnic_mode; uint32_t max_rx_pdu; uint32_t max_tx_pdu; bool rx_pkt_encap; int8_t fcoe_shift; int8_t port_shift; int8_t vnic_shift; int8_t vlan_shift; int8_t tos_shift; int8_t protocol_shift; int8_t ethertype_shift; int8_t macmatch_shift; int8_t matchtype_shift; int8_t frag_shift; }; /* Use same modulation queue as the tx channel. */ #define TX_MODQ(tx_chan) (tx_chan) struct vpd_params { unsigned int cclk; u8 ec[EC_LEN + 1]; u8 sn[SERNUM_LEN + 1]; u8 id[ID_LEN + 1]; u8 pn[PN_LEN + 1]; u8 na[MACADDR_LEN + 1]; u8 md[MD_LEN + 1]; }; struct pci_params { unsigned int vpd_cap_addr; unsigned int mps; unsigned short speed; unsigned short width; }; /* * Firmware device log. */ struct devlog_params { u32 memtype; /* which memory (FW_MEMTYPE_* ) */ u32 start; /* start of log in firmware memory */ u32 size; /* size of log */ u32 addr; /* start address in flat addr space */ }; /* Stores chip specific parameters */ struct chip_params { u8 nchan; u8 pm_stats_cnt; u8 cng_ch_bits_log; /* congestion channel map bits width */ u8 nsched_cls; u8 cim_num_obq; u8 filter_opt_len; u16 mps_rplc_size; u16 vfcount; u32 sge_fl_db; u16 mps_tcam_size; u16 rss_nentries; u16 cim_la_size; }; /* VF-only parameters. */ /* * Global Receive Side Scaling (RSS) parameters in host-native format. */ struct rss_params { unsigned int mode; /* RSS mode */ union { struct { u_int synmapen:1; /* SYN Map Enable */ u_int syn4tupenipv6:1; /* enable hashing 4-tuple IPv6 SYNs */ u_int syn2tupenipv6:1; /* enable hashing 2-tuple IPv6 SYNs */ u_int syn4tupenipv4:1; /* enable hashing 4-tuple IPv4 SYNs */ u_int syn2tupenipv4:1; /* enable hashing 2-tuple IPv4 SYNs */ u_int ofdmapen:1; /* Offload Map Enable */ u_int tnlmapen:1; /* Tunnel Map Enable */ u_int tnlalllookup:1; /* Tunnel All Lookup */ u_int hashtoeplitz:1; /* use Toeplitz hash */ } basicvirtual; } u; }; /* * Maximum resources provisioned for a PCI VF. 
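* These limits are queried from the firmware by t4vf_get_vfres() and kept in adapter_params.vfres (VF-only).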
*/ struct vf_resources { unsigned int nvi; /* N virtual interfaces */ unsigned int neq; /* N egress Qs */ unsigned int nethctrl; /* N egress ETH or CTRL Qs */ unsigned int niqflint; /* N ingress Qs/w free list(s) & intr */ unsigned int niq; /* N ingress Qs */ unsigned int tc; /* PCI-E traffic class */ unsigned int pmask; /* port access rights mask */ unsigned int nexactf; /* N exact MPS filters */ unsigned int r_caps; /* read capabilities */ unsigned int wx_caps; /* write/execute capabilities */ }; struct adapter_params { struct sge_params sge; struct tp_params tp; /* PF-only */ struct vpd_params vpd; struct pci_params pci; struct devlog_params devlog; /* PF-only */ struct rss_params rss; /* VF-only */ struct vf_resources vfres; /* VF-only */ unsigned int core_vdd; unsigned int sf_size; /* serial flash size in bytes */ unsigned int sf_nsec; /* # of flash sectors */ unsigned int fw_vers; /* firmware version */ unsigned int bs_vers; /* bootstrap version */ unsigned int tp_vers; /* TP microcode version */ unsigned int er_vers; /* expansion ROM version */ unsigned int scfg_vers; /* Serial Configuration version */ unsigned int vpd_vers; /* VPD version */ unsigned short mtus[NMTUS]; unsigned short a_wnd[NCCTRL_WIN]; unsigned short b_wnd[NCCTRL_WIN]; unsigned int cim_la_size; uint8_t nports; /* # of ethernet ports */ uint8_t portvec; unsigned int chipid:4; /* chip ID. T4 = 4, T5 = 5, ... */ unsigned int rev:4; /* chip revision */ unsigned int fpga:1; /* this is an FPGA */ unsigned int offload:1; /* hw is TOE capable, fw has divvied up card resources for TOE operation. */ unsigned int bypass:1; /* this is a bypass card */ unsigned int ethoffload:1; unsigned int hash_filter:1; unsigned int filter2_wr_support:1; unsigned int port_caps32:1; unsigned int smac_add_support:1; unsigned int ofldq_wr_cred; unsigned int eo_wr_cred; unsigned int max_ordird_qp; unsigned int max_ird_adapter; /* These values are for all ports (8b/port, upto 4 ports) */ uint32_t mps_bg_map; /* MPS rx buffer group map */ uint32_t tp_ch_map; /* TPCHMAP from firmware */ bool ulptx_memwrite_dsgl; /* use of T5 DSGL allowed */ bool fr_nsmr_tpte_wr_support; /* FW support for FR_NSMR_TPTE_WR */ bool dev_512sgl_mr; /* FW support for 512 SGL per FR MR */ bool viid_smt_extn_support; /* FW returns vin, vfvld & smt index? */ unsigned int max_pkts_per_eth_tx_pkts_wr; uint8_t nsched_cls; /* # of usable sched classes per port */ }; #define CHELSIO_T4 0x4 #define CHELSIO_T5 0x5 #define CHELSIO_T6 0x6 /* * State needed to monitor the forward progress of SGE Ingress DMA activities * and possible hangs. */ struct sge_idma_monitor_state { unsigned int idma_1s_thresh; /* 1s threshold in Core Clock ticks */ unsigned int idma_stalled[2]; /* synthesized stalled timers in HZ */ unsigned int idma_state[2]; /* IDMA Hang detect state */ unsigned int idma_qid[2]; /* IDMA Hung Ingress Queue ID */ unsigned int idma_warn[2]; /* time to warning in HZ */ }; struct trace_params { u32 data[TRACE_LEN / 4]; u32 mask[TRACE_LEN / 4]; unsigned short snap_len; unsigned short min_len; unsigned char skip_ofst; unsigned char skip_len; unsigned char invert; unsigned char port; }; struct link_config { /* OS-specific code owns all the requested_* fields. */ int8_t requested_aneg; /* link autonegotiation */ int8_t requested_fc; /* flow control */ int8_t requested_fec; /* FEC */ int8_t force_fec; /* FORCE_FEC in L1_CFG32 command. */ u_int requested_speed; /* speed (Mbps) */ uint32_t requested_caps;/* rcap in last l1cfg issued by the driver. 
*/ /* These are populated with information from the firmware. */ uint32_t pcaps; /* link capabilities */ uint32_t acaps; /* advertised capabilities */ uint32_t lpacaps; /* peer advertised capabilities */ u_int speed; /* actual link speed (Mbps) */ int8_t fc; /* actual link flow control */ int8_t fec_hint; /* cable/transceiver recommended fec */ int8_t fec; /* actual FEC */ bool link_ok; /* link up? */ uint8_t link_down_rc; /* link down reason */ }; #include "adapter.h" #ifndef PCI_VENDOR_ID_CHELSIO # define PCI_VENDOR_ID_CHELSIO 0x1425 #endif #define for_each_port(adapter, iter) \ for (iter = 0; iter < (adapter)->params.nports; ++iter) static inline int is_ftid(const struct adapter *sc, u_int tid) { return (sc->tids.nftids > 0 && tid >= sc->tids.ftid_base && tid <= sc->tids.ftid_end); } static inline int is_hpftid(const struct adapter *sc, u_int tid) { return (sc->tids.nhpftids > 0 && tid >= sc->tids.hpftid_base && tid <= sc->tids.hpftid_end); } static inline int is_etid(const struct adapter *sc, u_int tid) { return (sc->tids.netids > 0 && tid >= sc->tids.etid_base && tid <= sc->tids.etid_end); } static inline int is_offload(const struct adapter *adap) { return adap->params.offload; } static inline int is_ethoffload(const struct adapter *adap) { return adap->params.ethoffload; } static inline int is_hashfilter(const struct adapter *adap) { return adap->params.hash_filter; } static inline int is_ktls(const struct adapter *adap) { return adap->cryptocaps & FW_CAPS_CONFIG_TLS_HW; } static inline int chip_id(struct adapter *adap) { return adap->params.chipid; } static inline int chip_rev(struct adapter *adap) { return adap->params.rev; } static inline int is_t4(struct adapter *adap) { return adap->params.chipid == CHELSIO_T4; } static inline int is_t5(struct adapter *adap) { return adap->params.chipid == CHELSIO_T5; } static inline int is_t6(struct adapter *adap) { return adap->params.chipid == CHELSIO_T6; } static inline int is_fpga(struct adapter *adap) { return adap->params.fpga; } static inline unsigned int core_ticks_per_usec(const struct adapter *adap) { return adap->params.vpd.cclk / 1000; } static inline unsigned int us_to_core_ticks(const struct adapter *adap, unsigned int us) { return (us * adap->params.vpd.cclk) / 1000; } static inline unsigned int core_ticks_to_us(const struct adapter *adapter, unsigned int ticks) { /* add Core Clock / 2 to round ticks to nearest uS */ return ((ticks * 1000 + adapter->params.vpd.cclk/2) / adapter->params.vpd.cclk); } static inline unsigned int dack_ticks_to_usec(const struct adapter *adap, unsigned int ticks) { return (ticks << adap->params.tp.dack_re) / core_ticks_per_usec(adap); } static inline u_int us_to_tcp_ticks(const struct adapter *adap, u_long us) { return (us * adap->params.vpd.cclk / 1000 >> adap->params.tp.tre); } static inline u_int tcp_ticks_to_us(const struct adapter *adap, u_int ticks) { return ((uint64_t)ticks << adap->params.tp.tre) / core_ticks_per_usec(adap); } void t4_set_reg_field(struct adapter *adap, unsigned int addr, u32 mask, u32 val); int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, int size, void *rpl, bool sleep_ok, int timeout); int t4_wr_mbox_meat(struct adapter *adap, int mbox, const void *cmd, int size, void *rpl, bool sleep_ok); void t4_report_fw_error(struct adapter *adap); static inline int t4_wr_mbox_timeout(struct adapter *adap, int mbox, const void *cmd, int size, void *rpl, int timeout) { return t4_wr_mbox_meat_timeout(adap, mbox, cmd, size, rpl, true, timeout); } static inline 
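/*
 * Convenience wrappers around t4_wr_mbox_meat(): t4_wr_mbox() is allowed to
 * sleep while it waits for the mailbox reply, t4_wr_mbox_ns() is not.
 */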
int t4_wr_mbox(struct adapter *adap, int mbox, const void *cmd, int size, void *rpl) { return t4_wr_mbox_meat(adap, mbox, cmd, size, rpl, true); } static inline int t4_wr_mbox_ns(struct adapter *adap, int mbox, const void *cmd, int size, void *rpl) { return t4_wr_mbox_meat(adap, mbox, cmd, size, rpl, false); } void t4_read_indirect(struct adapter *adap, unsigned int addr_reg, unsigned int data_reg, u32 *vals, unsigned int nregs, unsigned int start_idx); void t4_write_indirect(struct adapter *adap, unsigned int addr_reg, unsigned int data_reg, const u32 *vals, unsigned int nregs, unsigned int start_idx); u32 t4_hw_pci_read_cfg4(adapter_t *adapter, int reg); struct fw_filter_wr; void t4_intr_enable(struct adapter *adapter); void t4_intr_disable(struct adapter *adapter); bool t4_slow_intr_handler(struct adapter *adapter, bool verbose); int t4_hash_mac_addr(const u8 *addr); int t4_link_l1cfg(struct adapter *adap, unsigned int mbox, unsigned int port, struct link_config *lc); int t4_restart_aneg(struct adapter *adap, unsigned int mbox, unsigned int port); int t4_seeprom_read(struct adapter *adapter, u32 addr, u32 *data); int t4_seeprom_write(struct adapter *adapter, u32 addr, u32 data); int t4_eeprom_ptov(unsigned int phys_addr, unsigned int fn, unsigned int sz); int t4_seeprom_wp(struct adapter *adapter, int enable); int t4_read_flash(struct adapter *adapter, unsigned int addr, unsigned int nwords, u32 *data, int byte_oriented); int t4_write_flash(struct adapter *adapter, unsigned int addr, unsigned int n, const u8 *data, int byte_oriented); int t4_load_fw(struct adapter *adapter, const u8 *fw_data, unsigned int size); int t4_fwcache(struct adapter *adap, enum fw_params_param_dev_fwcache op); int t5_fw_init_extern_mem(struct adapter *adap); int t4_load_bootcfg(struct adapter *adapter, const u8 *cfg_data, unsigned int size); int t4_load_boot(struct adapter *adap, u8 *boot_data, unsigned int boot_addr, unsigned int size); int t4_flash_erase_sectors(struct adapter *adapter, int start, int end); int t4_flash_cfg_addr(struct adapter *adapter); int t4_load_cfg(struct adapter *adapter, const u8 *cfg_data, unsigned int size); int t4_get_fw_version(struct adapter *adapter, u32 *vers); int t4_get_fw_hdr(struct adapter *adapter, struct fw_hdr *hdr); int t4_get_bs_version(struct adapter *adapter, u32 *vers); int t4_get_tp_version(struct adapter *adapter, u32 *vers); int t4_get_exprom_version(struct adapter *adapter, u32 *vers); int t4_get_scfg_version(struct adapter *adapter, u32 *vers); int t4_get_vpd_version(struct adapter *adapter, u32 *vers); int t4_get_version_info(struct adapter *adapter); int t4_init_hw(struct adapter *adapter, u32 fw_params); const struct chip_params *t4_get_chip_params(int chipid); int t4_prep_adapter(struct adapter *adapter, u32 *buf); int t4_shutdown_adapter(struct adapter *adapter); int t4_init_devlog_params(struct adapter *adapter, int fw_attach); int t4_init_sge_params(struct adapter *adapter); int t4_init_tp_params(struct adapter *adap); int t4_filter_field_shift(const struct adapter *adap, int filter_sel); int t4_port_init(struct adapter *adap, int mbox, int pf, int vf, int port_id); void t4_fatal_err(struct adapter *adapter, bool fw_error); int t4_set_trace_filter(struct adapter *adapter, const struct trace_params *tp, int filter_index, int enable); void t4_get_trace_filter(struct adapter *adapter, struct trace_params *tp, int filter_index, int *enabled); int t4_config_rss_range(struct adapter *adapter, int mbox, unsigned int viid, int start, int n, const u16 *rspq, 
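/*
 * rspq[] supplies nrspq response-queue ids; slots [start, start + n) of the
 * VI's RSS indirection table are filled from it, repeating the array when
 * n > nrspq.  A hypothetical caller sketch (names are illustrative only):
 *
 *	for (i = 0; i < vi_rss_size; i++)
 *		rss[i] = rxq_abs_id[i % nrxq];
 *	t4_config_rss_range(sc, sc->mbox, viid, 0, vi_rss_size, rss, vi_rss_size);
 */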
unsigned int nrspq); int t4_config_glbl_rss(struct adapter *adapter, int mbox, unsigned int mode, unsigned int flags); int t4_config_vi_rss(struct adapter *adapter, int mbox, unsigned int viid, unsigned int flags, unsigned int defq, unsigned int skeyidx, unsigned int skey); int t4_read_rss(struct adapter *adapter, u16 *entries); void t4_read_rss_key(struct adapter *adapter, u32 *key, bool sleep_ok); void t4_write_rss_key(struct adapter *adap, const u32 *key, int idx, bool sleep_ok); void t4_read_rss_pf_config(struct adapter *adapter, unsigned int index, u32 *valp, bool sleep_ok); void t4_write_rss_pf_config(struct adapter *adapter, unsigned int index, u32 val, bool sleep_ok); void t4_read_rss_vf_config(struct adapter *adapter, unsigned int index, u32 *vfl, u32 *vfh, bool sleep_ok); void t4_write_rss_vf_config(struct adapter *adapter, unsigned int index, u32 vfl, u32 vfh, bool sleep_ok); u32 t4_read_rss_pf_map(struct adapter *adapter, bool sleep_ok); void t4_write_rss_pf_map(struct adapter *adapter, u32 pfmap, bool sleep_ok); u32 t4_read_rss_pf_mask(struct adapter *adapter, bool sleep_ok); void t4_write_rss_pf_mask(struct adapter *adapter, u32 pfmask, bool sleep_ok); int t4_mps_set_active_ports(struct adapter *adap, unsigned int port_mask); void t4_pmtx_get_stats(struct adapter *adap, u32 cnt[], u64 cycles[]); void t4_pmrx_get_stats(struct adapter *adap, u32 cnt[], u64 cycles[]); void t4_read_cimq_cfg(struct adapter *adap, u16 *base, u16 *size, u16 *thres); int t4_read_cim_ibq(struct adapter *adap, unsigned int qid, u32 *data, size_t n); int t4_read_cim_obq(struct adapter *adap, unsigned int qid, u32 *data, size_t n); int t4_cim_read(struct adapter *adap, unsigned int addr, unsigned int n, unsigned int *valp); int t4_cim_write(struct adapter *adap, unsigned int addr, unsigned int n, const unsigned int *valp); int t4_cim_ctl_read(struct adapter *adap, unsigned int addr, unsigned int n, unsigned int *valp); int t4_cim_read_la(struct adapter *adap, u32 *la_buf, unsigned int *wrptr); void t4_cim_read_pif_la(struct adapter *adap, u32 *pif_req, u32 *pif_rsp, unsigned int *pif_req_wrptr, unsigned int *pif_rsp_wrptr); void t4_cim_read_ma_la(struct adapter *adap, u32 *ma_req, u32 *ma_rsp); int t4_get_flash_params(struct adapter *adapter); u32 t4_read_pcie_cfg4(struct adapter *adap, int reg, int drv_fw_attach); int t4_mc_read(struct adapter *adap, int idx, u32 addr, __be32 *data, u64 *parity); int t4_edc_read(struct adapter *adap, int idx, u32 addr, __be32 *data, u64 *parity); int t4_mem_read(struct adapter *adap, int mtype, u32 addr, u32 size, __be32 *data); void t4_idma_monitor_init(struct adapter *adapter, struct sge_idma_monitor_state *idma); void t4_idma_monitor(struct adapter *adapter, struct sge_idma_monitor_state *idma, int hz, int ticks); int t4_set_vf_mac(struct adapter *adapter, unsigned int pf, unsigned int vf, unsigned int naddr, u8 *addr); unsigned int t4_get_regs_len(struct adapter *adapter); void t4_get_regs(struct adapter *adap, u8 *buf, size_t buf_size); u32 t4_port_reg(struct adapter *adap, u8 port, u32 reg); const char *t4_get_port_type_description(enum fw_port_type port_type); void t4_get_port_stats(struct adapter *adap, int idx, struct port_stats *p); void t4_get_port_stats_offset(struct adapter *adap, int idx, struct port_stats *stats, struct port_stats *offset); void t4_get_lb_stats(struct adapter *adap, int idx, struct lb_port_stats *p); void t4_clr_port_stats(struct adapter *adap, int idx); void t4_read_mtu_tbl(struct adapter *adap, u16 *mtus, u8 *mtu_log); void 
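/* reads the congestion control increment table, indexed by MTU and congestion control window */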
t4_read_cong_tbl(struct adapter *adap, u16 incr[NMTUS][NCCTRL_WIN]); void t4_read_pace_tbl(struct adapter *adap, unsigned int pace_vals[NTX_SCHED]); void t4_get_tx_sched(struct adapter *adap, unsigned int sched, unsigned int *kbps, unsigned int *ipg, bool sleep_ok); void t4_tp_wr_bits_indirect(struct adapter *adap, unsigned int addr, unsigned int mask, unsigned int val); void t4_tp_read_la(struct adapter *adap, u64 *la_buf, unsigned int *wrptr); void t4_tp_get_err_stats(struct adapter *adap, struct tp_err_stats *st, bool sleep_ok); void t4_tp_get_tnl_stats(struct adapter *adap, struct tp_tnl_stats *st, bool sleep_ok); void t4_tp_get_proxy_stats(struct adapter *adap, struct tp_proxy_stats *st, bool sleep_ok); void t4_tp_get_cpl_stats(struct adapter *adap, struct tp_cpl_stats *st, bool sleep_ok); void t4_tp_get_rdma_stats(struct adapter *adap, struct tp_rdma_stats *st, bool sleep_ok); void t4_get_usm_stats(struct adapter *adap, struct tp_usm_stats *st, bool sleep_ok); void t4_tp_get_tid_stats(struct adapter *adap, struct tp_tid_stats *st, bool sleep_ok); void t4_tp_get_tcp_stats(struct adapter *adap, struct tp_tcp_stats *v4, struct tp_tcp_stats *v6, bool sleep_ok); void t4_get_fcoe_stats(struct adapter *adap, unsigned int idx, struct tp_fcoe_stats *st, bool sleep_ok); void t4_load_mtus(struct adapter *adap, const unsigned short *mtus, const unsigned short *alpha, const unsigned short *beta); void t4_ulprx_read_la(struct adapter *adap, u32 *la_buf); int t4_set_sched_bps(struct adapter *adap, int sched, unsigned int kbps); int t4_set_sched_ipg(struct adapter *adap, int sched, unsigned int ipg); int t4_set_pace_tbl(struct adapter *adap, const unsigned int *pace_vals, unsigned int start, unsigned int n); void t4_get_chan_txrate(struct adapter *adap, u64 *nic_rate, u64 *ofld_rate); int t4_set_filter_cfg(struct adapter *adap, int mode, int mask, int vnic_mode); void t4_mk_filtdelwr(unsigned int ftid, struct fw_filter_wr *wr, int qid); void t4_wol_magic_enable(struct adapter *adap, unsigned int port, const u8 *addr); int t4_wol_pat_enable(struct adapter *adap, unsigned int port, unsigned int map, u64 mask0, u64 mask1, unsigned int crc, bool enable); int t4_fw_hello(struct adapter *adap, unsigned int mbox, unsigned int evt_mbox, enum dev_master master, enum dev_state *state); int t4_fw_bye(struct adapter *adap, unsigned int mbox); int t4_fw_reset(struct adapter *adap, unsigned int mbox, int reset); int t4_fw_halt(struct adapter *adap, unsigned int mbox, int force); int t4_fw_restart(struct adapter *adap, unsigned int mbox); int t4_fw_upgrade(struct adapter *adap, unsigned int mbox, const u8 *fw_data, unsigned int size, int force); int t4_fw_initialize(struct adapter *adap, unsigned int mbox); int t4_query_params(struct adapter *adap, unsigned int mbox, unsigned int pf, unsigned int vf, unsigned int nparams, const u32 *params, u32 *val); int t4_query_params_rw(struct adapter *adap, unsigned int mbox, unsigned int pf, unsigned int vf, unsigned int nparams, const u32 *params, u32 *val, int rw); int t4_set_params_timeout(struct adapter *adap, unsigned int mbox, unsigned int pf, unsigned int vf, unsigned int nparams, const u32 *params, const u32 *val, int timeout); int t4_set_params(struct adapter *adap, unsigned int mbox, unsigned int pf, unsigned int vf, unsigned int nparams, const u32 *params, const u32 *val); int t4_cfg_pfvf(struct adapter *adap, unsigned int mbox, unsigned int pf, unsigned int vf, unsigned int txq, unsigned int txq_eth_ctrl, unsigned int rxqi, unsigned int rxq, unsigned int tc, 
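/* provisions the given PF/VF with queue, interrupt, VI, traffic-class, port-access and capability limits; compare struct vf_resources above */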
unsigned int vi, unsigned int cmask, unsigned int pmask, unsigned int exactf, unsigned int rcaps, unsigned int wxcaps); int t4_alloc_vi_func(struct adapter *adap, unsigned int mbox, unsigned int port, unsigned int pf, unsigned int vf, unsigned int nmac, u8 *mac, u16 *rss_size, uint8_t *vfvld, uint16_t *vin, unsigned int portfunc, unsigned int idstype); int t4_alloc_vi(struct adapter *adap, unsigned int mbox, unsigned int port, unsigned int pf, unsigned int vf, unsigned int nmac, u8 *mac, u16 *rss_size, uint8_t *vfvld, uint16_t *vin); int t4_free_vi(struct adapter *adap, unsigned int mbox, unsigned int pf, unsigned int vf, unsigned int viid); int t4_set_rxmode(struct adapter *adap, unsigned int mbox, unsigned int viid, int mtu, int promisc, int all_multi, int bcast, int vlanex, bool sleep_ok); int t4_alloc_mac_filt(struct adapter *adap, unsigned int mbox, unsigned int viid, bool free, unsigned int naddr, const u8 **addr, u16 *idx, u64 *hash, bool sleep_ok); int t4_free_mac_filt(struct adapter *adap, unsigned int mbox, unsigned int viid, unsigned int naddr, const u8 **addr, bool sleep_ok); int t4_free_encap_mac_filt(struct adapter *adap, unsigned int viid, int idx, bool sleep_ok); int t4_free_raw_mac_filt(struct adapter *adap, unsigned int viid, const u8 *addr, const u8 *mask, unsigned int idx, u8 lookup_type, u8 port_id, bool sleep_ok); int t4_alloc_raw_mac_filt(struct adapter *adap, unsigned int viid, const u8 *addr, const u8 *mask, unsigned int idx, u8 lookup_type, u8 port_id, bool sleep_ok); int t4_alloc_encap_mac_filt(struct adapter *adap, unsigned int viid, const u8 *addr, const u8 *mask, unsigned int vni, unsigned int vni_mask, u8 dip_hit, u8 lookup_type, bool sleep_ok); int t4_change_mac(struct adapter *adap, unsigned int mbox, unsigned int viid, int idx, const u8 *addr, bool persist, uint16_t *smt_idx); int t4_del_mac(struct adapter *adap, unsigned int mbox, unsigned int viid, const u8 *addr, bool smac); int t4_add_mac(struct adapter *adap, unsigned int mbox, unsigned int viid, int idx, const u8 *addr, bool persist, u8 *smt_idx, bool smac); int t4_set_addr_hash(struct adapter *adap, unsigned int mbox, unsigned int viid, bool ucast, u64 vec, bool sleep_ok); int t4_enable_vi_params(struct adapter *adap, unsigned int mbox, unsigned int viid, bool rx_en, bool tx_en, bool dcb_en); int t4_enable_vi(struct adapter *adap, unsigned int mbox, unsigned int viid, bool rx_en, bool tx_en); int t4_identify_port(struct adapter *adap, unsigned int mbox, unsigned int viid, unsigned int nblinks); int t4_mdio_rd(struct adapter *adap, unsigned int mbox, unsigned int phy_addr, unsigned int mmd, unsigned int reg, unsigned int *valp); int t4_mdio_wr(struct adapter *adap, unsigned int mbox, unsigned int phy_addr, unsigned int mmd, unsigned int reg, unsigned int val); int t4_i2c_io(struct adapter *adap, unsigned int mbox, int port, unsigned int devid, unsigned int offset, unsigned int len, u8 *buf, bool write); int t4_i2c_rd(struct adapter *adap, unsigned int mbox, int port, unsigned int devid, unsigned int offset, unsigned int len, u8 *buf); int t4_i2c_wr(struct adapter *adap, unsigned int mbox, int port, unsigned int devid, unsigned int offset, unsigned int len, u8 *buf); int t4_iq_stop(struct adapter *adap, unsigned int mbox, unsigned int pf, unsigned int vf, unsigned int iqtype, unsigned int iqid, unsigned int fl0id, unsigned int fl1id); int t4_iq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, unsigned int vf, unsigned int iqtype, unsigned int iqid, unsigned int fl0id, unsigned int 
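/* fl0id and fl1id are the free lists bound to the ingress queue being freed */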
fl1id); int t4_eth_eq_stop(struct adapter *adap, unsigned int mbox, unsigned int pf, unsigned int vf, unsigned int eqid); int t4_eth_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, unsigned int vf, unsigned int eqid); int t4_ctrl_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, unsigned int vf, unsigned int eqid); int t4_ofld_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, unsigned int vf, unsigned int eqid); int t4_sge_ctxt_rd(struct adapter *adap, unsigned int mbox, unsigned int cid, enum ctxt_type ctype, u32 *data); int t4_sge_ctxt_rd_bd(struct adapter *adap, unsigned int cid, enum ctxt_type ctype, u32 *data); int t4_sge_ctxt_flush(struct adapter *adap, unsigned int mbox, int ctxt_type); const char *t4_link_down_rc_str(unsigned char link_down_rc); int t4_update_port_info(struct port_info *pi); int t4_handle_fw_rpl(struct adapter *adap, const __be64 *rpl); int t4_fwaddrspace_write(struct adapter *adap, unsigned int mbox, u32 addr, u32 val); int t4_sched_config(struct adapter *adapter, int type, int minmaxen, int sleep_ok); int t4_sched_params(struct adapter *adapter, int type, int level, int mode, int rateunit, int ratemode, int channel, int cl, int minrate, int maxrate, int weight, int pktsize, int burstsize, int sleep_ok); int t4_sched_params_ch_rl(struct adapter *adapter, int channel, int ratemode, unsigned int maxrate, int sleep_ok); int t4_sched_params_cl_wrr(struct adapter *adapter, int channel, int cl, int weight, int sleep_ok); int t4_sched_params_cl_rl_kbps(struct adapter *adapter, int channel, int cl, int mode, unsigned int maxrate, int pktsize, int sleep_ok); int t4_config_watchdog(struct adapter *adapter, unsigned int mbox, unsigned int pf, unsigned int vf, unsigned int timeout, unsigned int action); int t4_get_devlog_level(struct adapter *adapter, unsigned int *level); int t4_set_devlog_level(struct adapter *adapter, unsigned int level); void t4_sge_decode_idma_state(struct adapter *adapter, int state); void t4_tp_pio_read(struct adapter *adap, u32 *buff, u32 nregs, u32 start_index, bool sleep_ok); void t4_tp_pio_write(struct adapter *adap, const u32 *buff, u32 nregs, u32 start_index, bool sleep_ok); void t4_tp_tm_pio_read(struct adapter *adap, u32 *buff, u32 nregs, u32 start_index, bool sleep_ok); void t4_tp_mib_read(struct adapter *adap, u32 *buff, u32 nregs, u32 start_index, bool sleep_ok); int t4_configure_ringbb(struct adapter *adap); int t4_configure_add_smac(struct adapter *adap); int t4_set_vlan_acl(struct adapter *adap, unsigned int mbox, unsigned int vf, u16 vlan); static inline int t4vf_query_params(struct adapter *adapter, unsigned int nparams, const u32 *params, u32 *vals) { return t4_query_params(adapter, 0, 0, 0, nparams, params, vals); } static inline int t4vf_set_params(struct adapter *adapter, unsigned int nparams, const u32 *params, const u32 *vals) { return t4_set_params(adapter, 0, 0, 0, nparams, params, vals); } static inline int t4vf_wr_mbox(struct adapter *adap, const void *cmd, int size, void *rpl) { return t4_wr_mbox(adap, adap->mbox, cmd, size, rpl); } int t4vf_wait_dev_ready(struct adapter *adapter); int t4vf_fw_reset(struct adapter *adapter); int t4vf_get_sge_params(struct adapter *adapter); int t4vf_get_rss_glb_config(struct adapter *adapter); int t4vf_get_vfres(struct adapter *adapter); int t4vf_prep_adapter(struct adapter *adapter); int t4vf_get_vf_mac(struct adapter *adapter, unsigned int port, unsigned int *naddr, u8 *addr); int t4_bar2_sge_qregs(struct adapter *adapter, unsigned int qid, 
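/* computes the BAR2 offset (and BAR2 queue id, if any) that the queue's doorbell or GTS writes should use */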
enum t4_bar2_qtype qtype, int user, u64 *pbar2_qoffset, unsigned int *pbar2_qid); unsigned int fwcap_to_speed(uint32_t caps); uint32_t speed_to_fwcap(unsigned int speed); uint32_t fwcap_top_speed(uint32_t caps); static inline int port_top_speed(const struct port_info *pi) { /* Mbps -> Gbps */ return (fwcap_to_speed(pi->link_cfg.pcaps) / 1000); } +/* SET_TCB_FIELD sent as a ULP command looks like this */ +#define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \ + sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core)) + +static inline void * +mk_set_tcb_field_ulp(struct adapter *sc, void *cur, int tid, uint16_t word, + uint64_t mask, uint64_t val) +{ + struct ulp_txpkt *ulpmc; + struct ulptx_idata *ulpsc; + struct cpl_set_tcb_field_core *req; + + MPASS(((uintptr_t)cur & 7) == 0); + + ulpmc = cur; + ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | + V_ULP_TXPKT_DEST(ULP_TXPKT_DEST_TP)); + ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16)); + + ulpsc = (struct ulptx_idata *)(ulpmc + 1); + ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); + ulpsc->len = htobe32(sizeof(*req)); + + req = (struct cpl_set_tcb_field_core *)(ulpsc + 1); + OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); + req->reply_ctrl = htobe16(F_NO_REPLY); + req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0)); + req->mask = htobe64(mask); + req->val = htobe64(val); + + /* + * ULP_TX is an 8B processor but the firmware transfers WRs in 16B + * chunks. The master command for set_tcb_field does not end at a 16B + * boundary so it needs to be padded with a no-op. + */ + MPASS((LEN__SET_TCB_FIELD_ULP & 0xf) != 0); + ulpsc = (struct ulptx_idata *)(req + 1); + ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); + ulpsc->len = htobe32(0); + + return (ulpsc + 1); +} #endif /* __CHELSIO_COMMON_H */ diff --git a/sys/dev/cxgbe/t4_filter.c b/sys/dev/cxgbe/t4_filter.c index 18fa1093800f..359aae6df24e 100644 --- a/sys/dev/cxgbe/t4_filter.c +++ b/sys/dev/cxgbe/t4_filter.c @@ -1,2063 +1,2030 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2018 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_tcb.h" #include "t4_l2t.h" #include "t4_smt.h" struct filter_entry { LIST_ENTRY(filter_entry) link_4t; LIST_ENTRY(filter_entry) link_tid; uint32_t valid:1; /* filter allocated and valid */ uint32_t locked:1; /* filter is administratively locked or busy */ uint32_t pending:1; /* filter action is pending firmware reply */ int tid; /* tid of the filter TCB */ struct l2t_entry *l2te; /* L2 table entry for DMAC rewrite */ struct smt_entry *smt; /* SMT entry for SMAC rewrite */ struct t4_filter_specification fs; }; static void free_filter_resources(struct filter_entry *); static int get_tcamfilter(struct adapter *, struct t4_filter *); static int get_hashfilter(struct adapter *, struct t4_filter *); static int set_hashfilter(struct adapter *, struct t4_filter *, uint64_t, struct l2t_entry *, struct smt_entry *); static int del_hashfilter(struct adapter *, struct t4_filter *); static int configure_hashfilter_tcb(struct adapter *, struct filter_entry *); static inline bool separate_hpfilter_region(struct adapter *sc) { return (chip_id(sc) >= CHELSIO_T6); } static inline uint32_t hf_hashfn_4t(struct t4_filter_specification *fs) { struct t4_filter_tuple *ft = &fs->val; uint32_t hash; if (fs->type) { /* IPv6 */ hash = fnv_32_buf(&ft->sip[0], 16, FNV1_32_INIT); hash = fnv_32_buf(&ft->dip[0], 16, hash); } else { hash = fnv_32_buf(&ft->sip[0], 4, FNV1_32_INIT); hash = fnv_32_buf(&ft->dip[0], 4, hash); } hash = fnv_32_buf(&ft->sport, sizeof(ft->sport), hash); hash = fnv_32_buf(&ft->dport, sizeof(ft->dport), hash); return (hash); } static inline uint32_t hf_hashfn_tid(int tid) { return (fnv_32_buf(&tid, sizeof(tid), FNV1_32_INIT)); } static int alloc_hftid_hash(struct tid_info *t, int flags) { int n; MPASS(t->ntids > 0); MPASS(t->hftid_hash_4t == NULL); MPASS(t->hftid_hash_tid == NULL); n = max(t->ntids / 1024, 16); t->hftid_hash_4t = hashinit_flags(n, M_CXGBE, &t->hftid_4t_mask, flags); if (t->hftid_hash_4t == NULL) return (ENOMEM); t->hftid_hash_tid = hashinit_flags(n, M_CXGBE, &t->hftid_tid_mask, flags); if (t->hftid_hash_tid == NULL) { hashdestroy(t->hftid_hash_4t, M_CXGBE, t->hftid_4t_mask); t->hftid_hash_4t = NULL; return (ENOMEM); } mtx_init(&t->hftid_lock, "T4 hashfilters", 0, MTX_DEF); cv_init(&t->hftid_cv, "t4hfcv"); return (0); } void free_hftid_hash(struct tid_info *t) { struct filter_entry *f, *ftmp; LIST_HEAD(, filter_entry) *head; int i; #ifdef INVARIANTS int n = 0; #endif if (t->tids_in_use > 0) { /* Remove everything from the tid hash. */ head = t->hftid_hash_tid; for (i = 0; i <= t->hftid_tid_mask; i++) { LIST_FOREACH_SAFE(f, &head[i], link_tid, ftmp) { LIST_REMOVE(f, link_tid); } } /* Remove and then free each filter in the 4t hash. */ head = t->hftid_hash_4t; for (i = 0; i <= t->hftid_4t_mask; i++) { LIST_FOREACH_SAFE(f, &head[i], link_4t, ftmp) { #ifdef INVARIANTS n += f->fs.type ? 
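/* an IPv6 (type != 0) hashfilter accounts for two tids, an IPv4 one for a single tid */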
2 : 1; #endif LIST_REMOVE(f, link_4t); free(f, M_CXGBE); } } MPASS(t->tids_in_use == n); t->tids_in_use = 0; } if (t->hftid_hash_4t) { hashdestroy(t->hftid_hash_4t, M_CXGBE, t->hftid_4t_mask); t->hftid_hash_4t = NULL; } if (t->hftid_hash_tid) { hashdestroy(t->hftid_hash_tid, M_CXGBE, t->hftid_tid_mask); t->hftid_hash_tid = NULL; } if (mtx_initialized(&t->hftid_lock)) { mtx_destroy(&t->hftid_lock); cv_destroy(&t->hftid_cv); } } static void insert_hf(struct adapter *sc, struct filter_entry *f, uint32_t hash) { struct tid_info *t = &sc->tids; LIST_HEAD(, filter_entry) *head = t->hftid_hash_4t; MPASS(head != NULL); if (hash == 0) hash = hf_hashfn_4t(&f->fs); LIST_INSERT_HEAD(&head[hash & t->hftid_4t_mask], f, link_4t); atomic_add_int(&t->tids_in_use, f->fs.type ? 2 : 1); } static void insert_hftid(struct adapter *sc, struct filter_entry *f) { struct tid_info *t = &sc->tids; LIST_HEAD(, filter_entry) *head = t->hftid_hash_tid; uint32_t hash; MPASS(f->tid >= t->tid_base); MPASS(f->tid - t->tid_base < t->ntids); mtx_assert(&t->hftid_lock, MA_OWNED); hash = hf_hashfn_tid(f->tid); LIST_INSERT_HEAD(&head[hash & t->hftid_tid_mask], f, link_tid); } static bool filter_eq(struct t4_filter_specification *fs1, struct t4_filter_specification *fs2) { int n; MPASS(fs1->hash && fs2->hash); if (fs1->type != fs2->type) return (false); n = fs1->type ? 16 : 4; if (bcmp(&fs1->val.sip[0], &fs2->val.sip[0], n) || bcmp(&fs1->val.dip[0], &fs2->val.dip[0], n) || fs1->val.sport != fs2->val.sport || fs1->val.dport != fs2->val.dport) return (false); /* * We know the masks are the same because all hashfilters conform to the * global tp->filter_mask and the driver has verified that already. */ if ((fs1->mask.pfvf_vld || fs1->mask.ovlan_vld) && fs1->val.vnic != fs2->val.vnic) return (false); if (fs1->mask.vlan_vld && fs1->val.vlan != fs2->val.vlan) return (false); if (fs1->mask.macidx && fs1->val.macidx != fs2->val.macidx) return (false); if (fs1->mask.frag && fs1->val.frag != fs2->val.frag) return (false); if (fs1->mask.matchtype && fs1->val.matchtype != fs2->val.matchtype) return (false); if (fs1->mask.iport && fs1->val.iport != fs2->val.iport) return (false); if (fs1->mask.fcoe && fs1->val.fcoe != fs2->val.fcoe) return (false); if (fs1->mask.proto && fs1->val.proto != fs2->val.proto) return (false); if (fs1->mask.tos && fs1->val.tos != fs2->val.tos) return (false); if (fs1->mask.ethtype && fs1->val.ethtype != fs2->val.ethtype) return (false); return (true); } static struct filter_entry * lookup_hf(struct adapter *sc, struct t4_filter_specification *fs, uint32_t hash) { struct tid_info *t = &sc->tids; LIST_HEAD(, filter_entry) *head = t->hftid_hash_4t; struct filter_entry *f; mtx_assert(&t->hftid_lock, MA_OWNED); MPASS(head != NULL); if (hash == 0) hash = hf_hashfn_4t(fs); LIST_FOREACH(f, &head[hash & t->hftid_4t_mask], link_4t) { if (filter_eq(&f->fs, fs)) return (f); } return (NULL); } static struct filter_entry * lookup_hftid(struct adapter *sc, int tid) { struct tid_info *t = &sc->tids; LIST_HEAD(, filter_entry) *head = t->hftid_hash_tid; struct filter_entry *f; uint32_t hash; mtx_assert(&t->hftid_lock, MA_OWNED); MPASS(head != NULL); hash = hf_hashfn_tid(tid); LIST_FOREACH(f, &head[hash & t->hftid_tid_mask], link_tid) { if (f->tid == tid) return (f); } return (NULL); } static void remove_hf(struct adapter *sc, struct filter_entry *f) { struct tid_info *t = &sc->tids; mtx_assert(&t->hftid_lock, MA_OWNED); LIST_REMOVE(f, link_4t); atomic_subtract_int(&t->tids_in_use, f->fs.type ? 
2 : 1); } static void remove_hftid(struct adapter *sc, struct filter_entry *f) { #ifdef INVARIANTS struct tid_info *t = &sc->tids; mtx_assert(&t->hftid_lock, MA_OWNED); #endif LIST_REMOVE(f, link_tid); } /* * Input: driver's 32b filter mode. * Returns: hardware filter mode (bits to set in vlan_pri_map) for the input. */ static uint16_t mode_to_fconf(uint32_t mode) { uint32_t fconf = 0; if (mode & T4_FILTER_IP_FRAGMENT) fconf |= F_FRAGMENTATION; if (mode & T4_FILTER_MPS_HIT_TYPE) fconf |= F_MPSHITTYPE; if (mode & T4_FILTER_MAC_IDX) fconf |= F_MACMATCH; if (mode & T4_FILTER_ETH_TYPE) fconf |= F_ETHERTYPE; if (mode & T4_FILTER_IP_PROTO) fconf |= F_PROTOCOL; if (mode & T4_FILTER_IP_TOS) fconf |= F_TOS; if (mode & T4_FILTER_VLAN) fconf |= F_VLAN; if (mode & T4_FILTER_VNIC) fconf |= F_VNIC_ID; if (mode & T4_FILTER_PORT) fconf |= F_PORT; if (mode & T4_FILTER_FCoE) fconf |= F_FCOE; return (fconf); } /* * Input: driver's 32b filter mode. * Returns: hardware vnic mode (ingress config) matching the input. */ static int mode_to_iconf(uint32_t mode) { if ((mode & T4_FILTER_VNIC) == 0) return (-1); /* ingress config doesn't matter. */ if (mode & T4_FILTER_IC_VNIC) return (FW_VNIC_MODE_PF_VF); else if (mode & T4_FILTER_IC_ENCAP) return (FW_VNIC_MODE_ENCAP_EN); else return (FW_VNIC_MODE_OUTER_VLAN); } static int check_fspec_against_fconf_iconf(struct adapter *sc, struct t4_filter_specification *fs) { struct tp_params *tpp = &sc->params.tp; uint32_t fconf = 0; if (fs->val.frag || fs->mask.frag) fconf |= F_FRAGMENTATION; if (fs->val.matchtype || fs->mask.matchtype) fconf |= F_MPSHITTYPE; if (fs->val.macidx || fs->mask.macidx) fconf |= F_MACMATCH; if (fs->val.ethtype || fs->mask.ethtype) fconf |= F_ETHERTYPE; if (fs->val.proto || fs->mask.proto) fconf |= F_PROTOCOL; if (fs->val.tos || fs->mask.tos) fconf |= F_TOS; if (fs->val.vlan_vld || fs->mask.vlan_vld) fconf |= F_VLAN; if (fs->val.ovlan_vld || fs->mask.ovlan_vld) { if (tpp->vnic_mode != FW_VNIC_MODE_OUTER_VLAN) return (EINVAL); fconf |= F_VNIC_ID; } if (fs->val.pfvf_vld || fs->mask.pfvf_vld) { if (tpp->vnic_mode != FW_VNIC_MODE_PF_VF) return (EINVAL); fconf |= F_VNIC_ID; } #ifdef notyet if (fs->val.encap_vld || fs->mask.encap_vld) { if (tpp->vnic_mode != FW_VNIC_MODE_ENCAP_EN); return (EINVAL); fconf |= F_VNIC_ID; } #endif if (fs->val.iport || fs->mask.iport) fconf |= F_PORT; if (fs->val.fcoe || fs->mask.fcoe) fconf |= F_FCOE; if ((tpp->filter_mode | fconf) != tpp->filter_mode) return (E2BIG); return (0); } /* * Input: hardware filter configuration (filter mode/mask, ingress config). * Input: driver's 32b filter mode matching the input. */ static uint32_t fconf_to_mode(uint16_t hwmode, int vnic_mode) { uint32_t mode = T4_FILTER_IPv4 | T4_FILTER_IPv6 | T4_FILTER_IP_SADDR | T4_FILTER_IP_DADDR | T4_FILTER_IP_SPORT | T4_FILTER_IP_DPORT; if (hwmode & F_FRAGMENTATION) mode |= T4_FILTER_IP_FRAGMENT; if (hwmode & F_MPSHITTYPE) mode |= T4_FILTER_MPS_HIT_TYPE; if (hwmode & F_MACMATCH) mode |= T4_FILTER_MAC_IDX; if (hwmode & F_ETHERTYPE) mode |= T4_FILTER_ETH_TYPE; if (hwmode & F_PROTOCOL) mode |= T4_FILTER_IP_PROTO; if (hwmode & F_TOS) mode |= T4_FILTER_IP_TOS; if (hwmode & F_VLAN) mode |= T4_FILTER_VLAN; if (hwmode & F_VNIC_ID) mode |= T4_FILTER_VNIC; /* real meaning depends on vnic_mode. 
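* It stands for the outer VLAN, the PF/VF ids, or the encapsulation match, depending on the FW_VNIC_MODE_* switch just below.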
*/ if (hwmode & F_PORT) mode |= T4_FILTER_PORT; if (hwmode & F_FCOE) mode |= T4_FILTER_FCoE; switch (vnic_mode) { case FW_VNIC_MODE_PF_VF: mode |= T4_FILTER_IC_VNIC; break; case FW_VNIC_MODE_ENCAP_EN: mode |= T4_FILTER_IC_ENCAP; break; case FW_VNIC_MODE_OUTER_VLAN: default: break; } return (mode); } int get_filter_mode(struct adapter *sc, uint32_t *mode) { struct tp_params *tp = &sc->params.tp; uint16_t filter_mode; /* Filter mask must comply with the global filter mode. */ MPASS((tp->filter_mode | tp->filter_mask) == tp->filter_mode); /* Non-zero incoming value in mode means "hashfilter mode". */ filter_mode = *mode ? tp->filter_mask : tp->filter_mode; *mode = fconf_to_mode(filter_mode, tp->vnic_mode); return (0); } int set_filter_mode(struct adapter *sc, uint32_t mode) { struct tp_params *tp = &sc->params.tp; int rc, iconf; uint16_t fconf; iconf = mode_to_iconf(mode); fconf = mode_to_fconf(mode); if ((iconf == -1 || iconf == tp->vnic_mode) && fconf == tp->filter_mode) return (0); /* Nothing to do */ rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setfm"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } if (sc->tids.ftids_in_use > 0 || /* TCAM filters active */ sc->tids.hpftids_in_use > 0 || /* hi-pri TCAM filters active */ sc->tids.tids_in_use > 0) { /* TOE or hashfilters active */ rc = EBUSY; goto done; } #ifdef TCP_OFFLOAD if (uld_active(sc, ULD_TOM)) { rc = EBUSY; goto done; } #endif /* Note that filter mask will get clipped to the new filter mode. */ rc = -t4_set_filter_cfg(sc, fconf, -1, iconf); done: end_synchronized_op(sc, 0); return (rc); } int set_filter_mask(struct adapter *sc, uint32_t mode) { struct tp_params *tp = &sc->params.tp; int rc, iconf; uint16_t fmask; iconf = mode_to_iconf(mode); fmask = mode_to_fconf(mode); if ((iconf == -1 || iconf == tp->vnic_mode) && fmask == tp->filter_mask) return (0); /* Nothing to do */ /* * We aren't going to change the global filter mode or VNIC mode here. * The given filter mask must conform to them. */ if ((fmask | tp->filter_mode) != tp->filter_mode) return (EINVAL); if (iconf != -1 && iconf != tp->vnic_mode) return (EINVAL); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4sethfm"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } if (sc->tids.tids_in_use > 0) { /* TOE or hashfilters active */ rc = EBUSY; goto done; } #ifdef TCP_OFFLOAD if (uld_active(sc, ULD_TOM)) { rc = EBUSY; goto done; } #endif rc = -t4_set_filter_cfg(sc, -1, fmask, -1); done: end_synchronized_op(sc, 0); return (rc); } static inline uint64_t get_filter_hits(struct adapter *sc, uint32_t tid) { uint32_t tcb_addr; uint64_t hits; tcb_addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE; mtx_lock(&sc->reg_lock); if (hw_off_limits(sc)) hits = 0; else if (is_t4(sc)) { uint64_t t; read_via_memwin(sc, 0, tcb_addr + 16, (uint32_t *)&t, 8); hits = be64toh(t); } else { uint32_t t; read_via_memwin(sc, 0, tcb_addr + 24, &t, 4); hits = be32toh(t); } mtx_unlock(&sc->reg_lock); return (hits); } int get_filter(struct adapter *sc, struct t4_filter *t) { if (t->fs.hash) return (get_hashfilter(sc, t)); else return (get_tcamfilter(sc, t)); } static int set_tcamfilter(struct adapter *sc, struct t4_filter *t, struct l2t_entry *l2te, struct smt_entry *smt) { struct filter_entry *f; struct fw_filter2_wr *fwr; u_int vnic_vld, vnic_vld_mask; struct wrq_cookie cookie; int i, rc, busy, locked; u_int tid; const int ntids = t->fs.type ? 
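/* an IPv6 TCAM filter occupies four consecutive filter slots, an IPv4 filter occupies one */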
4 : 1; MPASS(!t->fs.hash); /* Already validated against fconf, iconf */ MPASS((t->fs.val.pfvf_vld & t->fs.val.ovlan_vld) == 0); MPASS((t->fs.mask.pfvf_vld & t->fs.mask.ovlan_vld) == 0); if (separate_hpfilter_region(sc) && t->fs.prio) { MPASS(t->idx < sc->tids.nhpftids); f = &sc->tids.hpftid_tab[t->idx]; tid = sc->tids.hpftid_base + t->idx; } else { MPASS(t->idx < sc->tids.nftids); f = &sc->tids.ftid_tab[t->idx]; tid = sc->tids.ftid_base + t->idx; } rc = busy = locked = 0; mtx_lock(&sc->tids.ftid_lock); for (i = 0; i < ntids; i++) { busy += f[i].pending + f[i].valid; locked += f[i].locked; } if (locked > 0) rc = EPERM; else if (busy > 0) rc = EBUSY; else { int len16; if (sc->params.filter2_wr_support) len16 = howmany(sizeof(struct fw_filter2_wr), 16); else len16 = howmany(sizeof(struct fw_filter_wr), 16); fwr = start_wrq_wr(&sc->sge.ctrlq[0], len16, &cookie); if (__predict_false(fwr == NULL)) rc = ENOMEM; else { f->pending = 1; if (separate_hpfilter_region(sc) && t->fs.prio) sc->tids.hpftids_in_use++; else sc->tids.ftids_in_use++; } } mtx_unlock(&sc->tids.ftid_lock); if (rc != 0) return (rc); /* * Can't fail now. A set-filter WR will definitely be sent. */ f->tid = tid; f->fs = t->fs; f->l2te = l2te; f->smt = smt; if (t->fs.val.pfvf_vld || t->fs.val.ovlan_vld) vnic_vld = 1; else vnic_vld = 0; if (t->fs.mask.pfvf_vld || t->fs.mask.ovlan_vld) vnic_vld_mask = 1; else vnic_vld_mask = 0; bzero(fwr, sizeof(*fwr)); if (sc->params.filter2_wr_support) fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER2_WR)); else fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR)); fwr->len16_pkd = htobe32(FW_LEN16(*fwr)); fwr->tid_to_iq = htobe32(V_FW_FILTER_WR_TID(f->tid) | V_FW_FILTER_WR_RQTYPE(f->fs.type) | V_FW_FILTER_WR_NOREPLY(0) | V_FW_FILTER_WR_IQ(f->fs.iq)); fwr->del_filter_to_l2tix = htobe32(V_FW_FILTER_WR_RPTTID(f->fs.rpttid) | V_FW_FILTER_WR_DROP(f->fs.action == FILTER_DROP) | V_FW_FILTER_WR_DIRSTEER(f->fs.dirsteer) | V_FW_FILTER_WR_MASKHASH(f->fs.maskhash) | V_FW_FILTER_WR_DIRSTEERHASH(f->fs.dirsteerhash) | V_FW_FILTER_WR_LPBK(f->fs.action == FILTER_SWITCH) | V_FW_FILTER_WR_DMAC(f->fs.newdmac) | V_FW_FILTER_WR_SMAC(f->fs.newsmac) | V_FW_FILTER_WR_INSVLAN(f->fs.newvlan == VLAN_INSERT || f->fs.newvlan == VLAN_REWRITE) | V_FW_FILTER_WR_RMVLAN(f->fs.newvlan == VLAN_REMOVE || f->fs.newvlan == VLAN_REWRITE) | V_FW_FILTER_WR_HITCNTS(f->fs.hitcnts) | V_FW_FILTER_WR_TXCHAN(f->fs.eport) | V_FW_FILTER_WR_PRIO(f->fs.prio) | V_FW_FILTER_WR_L2TIX(f->l2te ? 
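/* L2T index 0 when the filter has no DMAC-rewrite L2T entry */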
f->l2te->idx : 0)); fwr->ethtype = htobe16(f->fs.val.ethtype); fwr->ethtypem = htobe16(f->fs.mask.ethtype); fwr->frag_to_ovlan_vldm = (V_FW_FILTER_WR_FRAG(f->fs.val.frag) | V_FW_FILTER_WR_FRAGM(f->fs.mask.frag) | V_FW_FILTER_WR_IVLAN_VLD(f->fs.val.vlan_vld) | V_FW_FILTER_WR_OVLAN_VLD(vnic_vld) | V_FW_FILTER_WR_IVLAN_VLDM(f->fs.mask.vlan_vld) | V_FW_FILTER_WR_OVLAN_VLDM(vnic_vld_mask)); fwr->smac_sel = 0; fwr->rx_chan_rx_rpl_iq = htobe16(V_FW_FILTER_WR_RX_CHAN(0) | V_FW_FILTER_WR_RX_RPL_IQ(sc->sge.fwq.abs_id)); fwr->maci_to_matchtypem = htobe32(V_FW_FILTER_WR_MACI(f->fs.val.macidx) | V_FW_FILTER_WR_MACIM(f->fs.mask.macidx) | V_FW_FILTER_WR_FCOE(f->fs.val.fcoe) | V_FW_FILTER_WR_FCOEM(f->fs.mask.fcoe) | V_FW_FILTER_WR_PORT(f->fs.val.iport) | V_FW_FILTER_WR_PORTM(f->fs.mask.iport) | V_FW_FILTER_WR_MATCHTYPE(f->fs.val.matchtype) | V_FW_FILTER_WR_MATCHTYPEM(f->fs.mask.matchtype)); fwr->ptcl = f->fs.val.proto; fwr->ptclm = f->fs.mask.proto; fwr->ttyp = f->fs.val.tos; fwr->ttypm = f->fs.mask.tos; fwr->ivlan = htobe16(f->fs.val.vlan); fwr->ivlanm = htobe16(f->fs.mask.vlan); fwr->ovlan = htobe16(f->fs.val.vnic); fwr->ovlanm = htobe16(f->fs.mask.vnic); bcopy(f->fs.val.dip, fwr->lip, sizeof (fwr->lip)); bcopy(f->fs.mask.dip, fwr->lipm, sizeof (fwr->lipm)); bcopy(f->fs.val.sip, fwr->fip, sizeof (fwr->fip)); bcopy(f->fs.mask.sip, fwr->fipm, sizeof (fwr->fipm)); fwr->lp = htobe16(f->fs.val.dport); fwr->lpm = htobe16(f->fs.mask.dport); fwr->fp = htobe16(f->fs.val.sport); fwr->fpm = htobe16(f->fs.mask.sport); /* sma = 0 tells the fw to use SMAC_SEL for source MAC address */ bzero(fwr->sma, sizeof (fwr->sma)); if (sc->params.filter2_wr_support) { fwr->filter_type_swapmac = V_FW_FILTER2_WR_SWAPMAC(f->fs.swapmac); fwr->natmode_to_ulp_type = V_FW_FILTER2_WR_ULP_TYPE(f->fs.nat_mode ? ULP_MODE_TCPDDP : ULP_MODE_NONE) | V_FW_FILTER2_WR_NATFLAGCHECK(f->fs.nat_flag_chk) | V_FW_FILTER2_WR_NATMODE(f->fs.nat_mode); memcpy(fwr->newlip, f->fs.nat_dip, sizeof(fwr->newlip)); memcpy(fwr->newfip, f->fs.nat_sip, sizeof(fwr->newfip)); fwr->newlport = htobe16(f->fs.nat_dport); fwr->newfport = htobe16(f->fs.nat_sport); fwr->natseqcheck = htobe32(f->fs.nat_seq_chk); } commit_wrq_wr(&sc->sge.ctrlq[0], fwr, &cookie); /* Wait for response. */ mtx_lock(&sc->tids.ftid_lock); for (;;) { if (f->pending == 0) { rc = f->valid ? 0 : EIO; break; } if (cv_wait_sig(&sc->tids.ftid_cv, &sc->tids.ftid_lock) != 0) { rc = EINPROGRESS; break; } } mtx_unlock(&sc->tids.ftid_lock); return (rc); } static int hashfilter_ntuple(struct adapter *sc, const struct t4_filter_specification *fs, uint64_t *ftuple) { struct tp_params *tp = &sc->params.tp; uint16_t fmask; *ftuple = fmask = 0; /* * Initialize each of the fields which we care about which are present * in the Compressed Filter Tuple. */ if (tp->vlan_shift >= 0 && fs->mask.vlan) { *ftuple |= (uint64_t)(F_FT_VLAN_VLD | fs->val.vlan) << tp->vlan_shift; fmask |= F_VLAN; } if (tp->port_shift >= 0 && fs->mask.iport) { *ftuple |= (uint64_t)fs->val.iport << tp->port_shift; fmask |= F_PORT; } if (tp->protocol_shift >= 0 && fs->mask.proto) { *ftuple |= (uint64_t)fs->val.proto << tp->protocol_shift; fmask |= F_PROTOCOL; } if (tp->tos_shift >= 0 && fs->mask.tos) { *ftuple |= (uint64_t)(fs->val.tos) << tp->tos_shift; fmask |= F_TOS; } if (tp->vnic_shift >= 0 && fs->mask.vnic) { /* vnic_mode was already validated. 
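* The MPASS checks below only assert that the mask bit matching the active vnic_mode is set.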
*/ if (tp->vnic_mode == FW_VNIC_MODE_PF_VF) MPASS(fs->mask.pfvf_vld); else if (tp->vnic_mode == FW_VNIC_MODE_OUTER_VLAN) MPASS(fs->mask.ovlan_vld); #ifdef notyet else if (tp->vnic_mode == FW_VNIC_MODE_ENCAP_EN) MPASS(fs->mask.encap_vld); #endif *ftuple |= ((1ULL << 16) | fs->val.vnic) << tp->vnic_shift; fmask |= F_VNIC_ID; } if (tp->macmatch_shift >= 0 && fs->mask.macidx) { *ftuple |= (uint64_t)(fs->val.macidx) << tp->macmatch_shift; fmask |= F_MACMATCH; } if (tp->ethertype_shift >= 0 && fs->mask.ethtype) { *ftuple |= (uint64_t)(fs->val.ethtype) << tp->ethertype_shift; fmask |= F_ETHERTYPE; } if (tp->matchtype_shift >= 0 && fs->mask.matchtype) { *ftuple |= (uint64_t)(fs->val.matchtype) << tp->matchtype_shift; fmask |= F_MPSHITTYPE; } if (tp->frag_shift >= 0 && fs->mask.frag) { *ftuple |= (uint64_t)(fs->val.frag) << tp->frag_shift; fmask |= F_FRAGMENTATION; } if (tp->fcoe_shift >= 0 && fs->mask.fcoe) { *ftuple |= (uint64_t)(fs->val.fcoe) << tp->fcoe_shift; fmask |= F_FCOE; } /* A hashfilter must conform to the hardware filter mask. */ if (fmask != tp->filter_mask) return (EINVAL); return (0); } static bool is_4tuple_specified(struct t4_filter_specification *fs) { int i; const int n = fs->type ? 16 : 4; if (fs->mask.sport != 0xffff || fs->mask.dport != 0xffff) return (false); for (i = 0; i < n; i++) { if (fs->mask.sip[i] != 0xff) return (false); if (fs->mask.dip[i] != 0xff) return (false); } return (true); } int set_filter(struct adapter *sc, struct t4_filter *t) { struct tid_info *ti = &sc->tids; struct l2t_entry *l2te = NULL; struct smt_entry *smt = NULL; uint64_t ftuple; int rc; /* * Basic filter checks first. */ if (t->fs.hash) { if (!is_hashfilter(sc) || ti->ntids == 0) return (ENOTSUP); /* Hardware, not user, selects a tid for hashfilters. */ if (t->idx != (uint32_t)-1) return (EINVAL); /* T5 can't count hashfilter hits. */ if (is_t5(sc) && t->fs.hitcnts) return (EINVAL); if (!is_4tuple_specified(&t->fs)) return (EINVAL); rc = hashfilter_ntuple(sc, &t->fs, &ftuple); if (rc != 0) return (rc); } else { if (separate_hpfilter_region(sc) && t->fs.prio) { if (ti->nhpftids == 0) return (ENOTSUP); if (t->idx >= ti->nhpftids) return (EINVAL); } else { if (ti->nftids == 0) return (ENOTSUP); if (t->idx >= ti->nftids) return (EINVAL); } /* IPv6 filter idx must be 4 aligned */ if (t->fs.type == 1 && ((t->idx & 0x3) || t->idx + 4 >= ti->nftids)) return (EINVAL); } /* T4 doesn't support VLAN tag removal or rewrite, swapmac, and NAT. */ if (is_t4(sc) && t->fs.action == FILTER_SWITCH && (t->fs.newvlan == VLAN_REMOVE || t->fs.newvlan == VLAN_REWRITE || t->fs.swapmac || t->fs.nat_mode)) return (ENOTSUP); if (t->fs.action == FILTER_SWITCH && t->fs.eport >= sc->params.nports) return (EINVAL); if (t->fs.val.iport >= sc->params.nports) return (EINVAL); /* Can't specify an iqid/rss_info if not steering. */ if (!t->fs.dirsteer && !t->fs.dirsteerhash && !t->fs.maskhash && t->fs.iq) return (EINVAL); /* Validate against the global filter mode and ingress config */ rc = check_fspec_against_fconf_iconf(sc, &t->fs); if (rc != 0) return (rc); /* * Basic checks passed. Make sure the queues and tid tables are setup. 
*/ rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setf"); if (rc) return (rc); if (hw_off_limits(sc)) { rc = ENXIO; goto done; } if (!(sc->flags & FULL_INIT_DONE) && ((rc = adapter_init(sc)) != 0)) goto done; if (t->fs.hash) { if (__predict_false(ti->hftid_hash_4t == NULL)) { rc = alloc_hftid_hash(&sc->tids, HASH_NOWAIT); if (rc != 0) goto done; } } else if (separate_hpfilter_region(sc) && t->fs.prio && __predict_false(ti->hpftid_tab == NULL)) { MPASS(ti->nhpftids != 0); KASSERT(ti->hpftids_in_use == 0, ("%s: no memory allocated but hpftids_in_use is %u", __func__, ti->hpftids_in_use)); ti->hpftid_tab = malloc(sizeof(struct filter_entry) * ti->nhpftids, M_CXGBE, M_NOWAIT | M_ZERO); if (ti->hpftid_tab == NULL) { rc = ENOMEM; goto done; } if (!mtx_initialized(&sc->tids.ftid_lock)) { mtx_init(&ti->ftid_lock, "T4 filters", 0, MTX_DEF); cv_init(&ti->ftid_cv, "t4fcv"); } } else if (__predict_false(ti->ftid_tab == NULL)) { MPASS(ti->nftids != 0); KASSERT(ti->ftids_in_use == 0, ("%s: no memory allocated but ftids_in_use is %u", __func__, ti->ftids_in_use)); ti->ftid_tab = malloc(sizeof(struct filter_entry) * ti->nftids, M_CXGBE, M_NOWAIT | M_ZERO); if (ti->ftid_tab == NULL) { rc = ENOMEM; goto done; } if (!mtx_initialized(&sc->tids.ftid_lock)) { mtx_init(&ti->ftid_lock, "T4 filters", 0, MTX_DEF); cv_init(&ti->ftid_cv, "t4fcv"); } } done: end_synchronized_op(sc, 0); if (rc != 0) return (rc); /* * Allocate L2T entry, SMT entry, etc. */ if (t->fs.newdmac || t->fs.newvlan) { /* This filter needs an L2T entry; allocate one. */ l2te = t4_l2t_alloc_switching(sc, t->fs.vlan, t->fs.eport, t->fs.dmac); if (__predict_false(l2te == NULL)) { rc = EAGAIN; goto error; } } if (t->fs.newsmac) { /* This filter needs an SMT entry; allocate one. */ smt = t4_smt_alloc_switching(sc->smt, t->fs.smac); if (__predict_false(smt == NULL)) { rc = EAGAIN; goto error; } rc = t4_smt_set_switching(sc, smt, 0x0, t->fs.smac); if (rc) goto error; } if (t->fs.hash) rc = set_hashfilter(sc, t, ftuple, l2te, smt); else rc = set_tcamfilter(sc, t, l2te, smt); if (rc != 0 && rc != EINPROGRESS) { error: if (l2te) t4_l2t_release(l2te); if (smt) t4_smt_release(smt); } return (rc); } static int del_tcamfilter(struct adapter *sc, struct t4_filter *t) { struct filter_entry *f; struct fw_filter_wr *fwr; struct wrq_cookie cookie; int rc, nfilters; #ifdef INVARIANTS u_int tid_base; #endif mtx_lock(&sc->tids.ftid_lock); if (separate_hpfilter_region(sc) && t->fs.prio) { nfilters = sc->tids.nhpftids; f = sc->tids.hpftid_tab; #ifdef INVARIANTS tid_base = sc->tids.hpftid_base; #endif } else { nfilters = sc->tids.nftids; f = sc->tids.ftid_tab; #ifdef INVARIANTS tid_base = sc->tids.ftid_base; #endif } MPASS(f != NULL); /* Caller checked this. */ if (t->idx >= nfilters) { rc = EINVAL; goto done; } f += t->idx; if (f->locked) { rc = EPERM; goto done; } if (f->pending) { rc = EBUSY; goto done; } if (f->valid == 0) { rc = EINVAL; goto done; } MPASS(f->tid == tid_base + t->idx); fwr = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*fwr), 16), &cookie); if (fwr == NULL) { rc = ENOMEM; goto done; } bzero(fwr, sizeof (*fwr)); t4_mk_filtdelwr(f->tid, fwr, sc->sge.fwq.abs_id); f->pending = 1; commit_wrq_wr(&sc->sge.ctrlq[0], fwr, &cookie); t->fs = f->fs; /* extra info for the caller */ for (;;) { if (f->pending == 0) { rc = f->valid ? 
EIO : 0; break; } if (cv_wait_sig(&sc->tids.ftid_cv, &sc->tids.ftid_lock) != 0) { rc = EINPROGRESS; break; } } done: mtx_unlock(&sc->tids.ftid_lock); return (rc); } int del_filter(struct adapter *sc, struct t4_filter *t) { /* No filters possible if not initialized yet. */ if (!(sc->flags & FULL_INIT_DONE)) return (EINVAL); /* * The checks for tid tables ensure that the locks that del_* will reach * for are initialized. */ if (t->fs.hash) { if (sc->tids.hftid_hash_4t != NULL) return (del_hashfilter(sc, t)); } else if (separate_hpfilter_region(sc) && t->fs.prio) { if (sc->tids.hpftid_tab != NULL) return (del_tcamfilter(sc, t)); } else { if (sc->tids.ftid_tab != NULL) return (del_tcamfilter(sc, t)); } return (EINVAL); } /* * Release secondary resources associated with the filter. */ static void free_filter_resources(struct filter_entry *f) { if (f->l2te) { t4_l2t_release(f->l2te); f->l2te = NULL; } if (f->smt) { t4_smt_release(f->smt); f->smt = NULL; } } static int set_tcb_field(struct adapter *sc, u_int tid, uint16_t word, uint64_t mask, uint64_t val, int no_reply) { struct wrq_cookie cookie; struct cpl_set_tcb_field *req; req = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*req), 16), &cookie); if (req == NULL) return (ENOMEM); bzero(req, sizeof(*req)); INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, tid); if (no_reply == 0) { req->reply_ctrl = htobe16(V_QUEUENO(sc->sge.fwq.abs_id) | V_NO_REPLY(0)); } else req->reply_ctrl = htobe16(V_NO_REPLY(1)); req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(CPL_COOKIE_HASHFILTER)); req->mask = htobe64(mask); req->val = htobe64(val); commit_wrq_wr(&sc->sge.ctrlq[0], req, &cookie); return (0); } /* Set one of the t_flags bits in the TCB. */ static inline int set_tcb_tflag(struct adapter *sc, int tid, u_int bit_pos, u_int val, u_int no_reply) { return (set_tcb_field(sc, tid, W_TCB_T_FLAGS, 1ULL << bit_pos, (uint64_t)val << bit_pos, no_reply)); } int t4_filter_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_set_tcb_rpl *rpl = (const void *)(rss + 1); u_int tid = GET_TID(rpl); u_int rc, idx; struct filter_entry *f; KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); if (is_hpftid(sc, tid)) { idx = tid - sc->tids.hpftid_base; f = &sc->tids.hpftid_tab[idx]; } else if (is_ftid(sc, tid)) { idx = tid - sc->tids.ftid_base; f = &sc->tids.ftid_tab[idx]; } else panic("%s: FW reply for invalid TID %d.", __func__, tid); MPASS(f->tid == tid); rc = G_COOKIE(rpl->cookie); mtx_lock(&sc->tids.ftid_lock); KASSERT(f->pending, ("%s: reply %d for filter[%u] that isn't pending.", __func__, rc, tid)); switch(rc) { case FW_FILTER_WR_FLT_ADDED: /* set-filter succeeded */ f->valid = 1; if (f->fs.newsmac) { MPASS(f->smt != NULL); set_tcb_tflag(sc, f->tid, S_TF_CCTRL_CWR, 1, 1); set_tcb_field(sc, f->tid, W_TCB_SMAC_SEL, V_TCB_SMAC_SEL(M_TCB_SMAC_SEL), V_TCB_SMAC_SEL(f->smt->idx), 1); /* XXX: wait for reply to TCB update before !pending */ } break; case FW_FILTER_WR_FLT_DELETED: /* del-filter succeeded */ MPASS(f->valid == 1); f->valid = 0; /* Fall through */ case FW_FILTER_WR_SMT_TBL_FULL: /* set-filter failed due to lack of SMT space. 
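 * The delete-success case above falls through to here on purpose:
 * in both cases the filter ends up invalid and the cleanup is
 * shared: release the L2T/SMT references and return the TCAM slot
 * by decrementing the in-use count.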
*/ MPASS(f->valid == 0); free_filter_resources(f); if (separate_hpfilter_region(sc) && f->fs.prio) sc->tids.hpftids_in_use--; else sc->tids.ftids_in_use--; break; case FW_FILTER_WR_SUCCESS: case FW_FILTER_WR_EINVAL: default: panic("%s: unexpected reply %d for filter[%d].", __func__, rc, idx); } f->pending = 0; cv_broadcast(&sc->tids.ftid_cv); mtx_unlock(&sc->tids.ftid_lock); return (0); } /* * This is the reply to the Active Open that created the filter. Additional TCB * updates may be required to complete the filter configuration. */ int t4_hashfilter_ao_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); u_int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status))); u_int status = G_AOPEN_STATUS(be32toh(cpl->atid_status)); struct filter_entry *f = lookup_atid(sc, atid); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); mtx_lock(&sc->tids.hftid_lock); KASSERT(f->pending, ("%s: hashfilter[%p] isn't pending.", __func__, f)); KASSERT(f->tid == -1, ("%s: hashfilter[%p] has tid %d already.", __func__, f, f->tid)); if (status == CPL_ERR_NONE) { f->tid = GET_TID(cpl); MPASS(lookup_hftid(sc, f->tid) == NULL); insert_hftid(sc, f); /* * Leave the filter pending until it is fully set up, which will * be indicated by the reply to the last TCB update. No need to * unblock the ioctl thread either. */ if (configure_hashfilter_tcb(sc, f) == EINPROGRESS) goto done; f->valid = 1; f->pending = 0; } else { /* provide errno instead of tid to ioctl */ f->tid = act_open_rpl_status_to_errno(status); f->valid = 0; f->pending = 0; if (act_open_has_tid(status)) release_tid(sc, GET_TID(cpl), &sc->sge.ctrlq[0]); free_filter_resources(f); remove_hf(sc, f); if (f->locked == 0) free(f, M_CXGBE); } cv_broadcast(&sc->tids.hftid_cv); done: mtx_unlock(&sc->tids.hftid_lock); free_atid(sc, atid); return (0); } int t4_hashfilter_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_set_tcb_rpl *rpl = (const void *)(rss + 1); u_int tid = GET_TID(rpl); struct filter_entry *f; mtx_lock(&sc->tids.hftid_lock); f = lookup_hftid(sc, tid); KASSERT(f->tid == tid, ("%s: filter tid mismatch", __func__)); KASSERT(f->pending, ("%s: hashfilter %p [%u] isn't pending.", __func__, f, tid)); KASSERT(f->valid == 0, ("%s: hashfilter %p [%u] is valid already.", __func__, f, tid)); f->pending = 0; if (rpl->status == 0) { f->valid = 1; } else { f->tid = EIO; f->valid = 0; free_filter_resources(f); remove_hftid(sc, f); remove_hf(sc, f); release_tid(sc, tid, &sc->sge.ctrlq[0]); if (f->locked == 0) free(f, M_CXGBE); } cv_broadcast(&sc->tids.hftid_cv); mtx_unlock(&sc->tids.hftid_lock); return (0); } int t4_del_hashfilter_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct filter_entry *f; mtx_lock(&sc->tids.hftid_lock); f = lookup_hftid(sc, tid); KASSERT(f->tid == tid, ("%s: filter tid mismatch", __func__)); KASSERT(f->pending, ("%s: hashfilter %p [%u] isn't pending.", __func__, f, tid)); KASSERT(f->valid, ("%s: hashfilter %p [%u] isn't valid.", __func__, f, tid)); f->pending = 0; if (cpl->status == 0) { f->valid = 0; free_filter_resources(f); remove_hftid(sc, f); remove_hf(sc, f); release_tid(sc, tid, &sc->sge.ctrlq[0]); if (f->locked == 0) free(f, M_CXGBE); } cv_broadcast(&sc->tids.hftid_cv); 
mtx_unlock(&sc->tids.hftid_lock); return (0); } static int get_tcamfilter(struct adapter *sc, struct t4_filter *t) { int i, nfilters; struct filter_entry *f; u_int in_use; #ifdef INVARIANTS u_int tid_base; #endif MPASS(!t->fs.hash); if (separate_hpfilter_region(sc) && t->fs.prio) { nfilters = sc->tids.nhpftids; f = sc->tids.hpftid_tab; in_use = sc->tids.hpftids_in_use; #ifdef INVARIANTS tid_base = sc->tids.hpftid_base; #endif } else { nfilters = sc->tids.nftids; f = sc->tids.ftid_tab; in_use = sc->tids.ftids_in_use; #ifdef INVARIANTS tid_base = sc->tids.ftid_base; #endif } if (in_use == 0 || f == NULL || t->idx >= nfilters) { t->idx = 0xffffffff; return (0); } f += t->idx; mtx_lock(&sc->tids.ftid_lock); for (i = t->idx; i < nfilters; i++, f++) { if (f->valid) { MPASS(f->tid == tid_base + i); t->idx = i; t->l2tidx = f->l2te ? f->l2te->idx : 0; t->smtidx = f->smt ? f->smt->idx : 0; if (f->fs.hitcnts) t->hits = get_filter_hits(sc, f->tid); else t->hits = UINT64_MAX; t->fs = f->fs; goto done; } } t->idx = 0xffffffff; done: mtx_unlock(&sc->tids.ftid_lock); return (0); } static int get_hashfilter(struct adapter *sc, struct t4_filter *t) { struct tid_info *ti = &sc->tids; int tid; struct filter_entry *f; const int inv_tid = ti->ntids + ti->tid_base; MPASS(t->fs.hash); if (ti->tids_in_use == 0 || ti->hftid_hash_tid == NULL || t->idx >= inv_tid) { t->idx = 0xffffffff; return (0); } if (t->idx < ti->tid_base) t->idx = ti->tid_base; mtx_lock(&ti->hftid_lock); for (tid = t->idx; tid < inv_tid; tid++) { f = lookup_hftid(sc, tid); if (f != NULL && f->valid) { t->idx = tid; t->l2tidx = f->l2te ? f->l2te->idx : 0; t->smtidx = f->smt ? f->smt->idx : 0; if (f->fs.hitcnts) t->hits = get_filter_hits(sc, tid); else t->hits = UINT64_MAX; t->fs = f->fs; goto done; } } t->idx = 0xffffffff; done: mtx_unlock(&ti->hftid_lock); return (0); } static void mk_act_open_req6(struct adapter *sc, struct filter_entry *f, int atid, uint64_t ftuple, struct cpl_act_open_req6 *cpl) { struct cpl_t5_act_open_req6 *cpl5 = (void *)cpl; struct cpl_t6_act_open_req6 *cpl6 = (void *)cpl; /* Review changes to CPL after cpl_t6_act_open_req if this goes off. */ MPASS(chip_id(sc) >= CHELSIO_T5 && chip_id(sc) <= CHELSIO_T6); MPASS(atid >= 0); if (chip_id(sc) == CHELSIO_T5) { INIT_TP_WR(cpl5, 0); } else { INIT_TP_WR(cpl6, 0); cpl6->rsvd2 = 0; cpl6->opt3 = 0; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) | V_TID_COOKIE(CPL_COOKIE_HASHFILTER))); cpl->local_port = htobe16(f->fs.val.dport); cpl->peer_port = htobe16(f->fs.val.sport); cpl->local_ip_hi = *(uint64_t *)(&f->fs.val.dip); cpl->local_ip_lo = *(((uint64_t *)&f->fs.val.dip) + 1); cpl->peer_ip_hi = *(uint64_t *)(&f->fs.val.sip); cpl->peer_ip_lo = *(((uint64_t *)&f->fs.val.sip) + 1); cpl->opt0 = htobe64(V_NAGLE(f->fs.newvlan == VLAN_REMOVE || f->fs.newvlan == VLAN_REWRITE) | V_DELACK(f->fs.hitcnts) | V_L2T_IDX(f->l2te ? f->l2te->idx : 0) | V_TX_CHAN(f->fs.eport) | V_NO_CONG(f->fs.rpttid) | V_ULP_MODE(f->fs.nat_mode ? ULP_MODE_TCPDDP : ULP_MODE_NONE) | F_TCAM_BYPASS | F_NON_OFFLOAD); cpl6->params = htobe64(V_FILTER_TUPLE(ftuple)); cpl6->opt2 = htobe32(F_RSS_QUEUE_VALID | V_RSS_QUEUE(f->fs.iq) | V_TX_QUEUE(f->fs.nat_mode) | V_WND_SCALE_EN(f->fs.nat_flag_chk) | V_RX_FC_DISABLE(f->fs.nat_seq_chk ? 
1 : 0) | F_T5_OPT_2_VALID | F_RX_CHANNEL | V_SACK_EN(f->fs.swapmac) | V_CONG_CNTRL((f->fs.action == FILTER_DROP) | (f->fs.dirsteer << 1)) | V_PACE(f->fs.maskhash | (f->fs.dirsteerhash << 1))); } static void mk_act_open_req(struct adapter *sc, struct filter_entry *f, int atid, uint64_t ftuple, struct cpl_act_open_req *cpl) { struct cpl_t5_act_open_req *cpl5 = (void *)cpl; struct cpl_t6_act_open_req *cpl6 = (void *)cpl; /* Review changes to CPL after cpl_t6_act_open_req if this goes off. */ MPASS(chip_id(sc) >= CHELSIO_T5 && chip_id(sc) <= CHELSIO_T6); MPASS(atid >= 0); if (chip_id(sc) == CHELSIO_T5) { INIT_TP_WR(cpl5, 0); } else { INIT_TP_WR(cpl6, 0); cpl6->rsvd2 = 0; cpl6->opt3 = 0; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) | V_TID_COOKIE(CPL_COOKIE_HASHFILTER))); cpl->local_port = htobe16(f->fs.val.dport); cpl->peer_port = htobe16(f->fs.val.sport); cpl->local_ip = f->fs.val.dip[0] | f->fs.val.dip[1] << 8 | f->fs.val.dip[2] << 16 | f->fs.val.dip[3] << 24; cpl->peer_ip = f->fs.val.sip[0] | f->fs.val.sip[1] << 8 | f->fs.val.sip[2] << 16 | f->fs.val.sip[3] << 24; cpl->opt0 = htobe64(V_NAGLE(f->fs.newvlan == VLAN_REMOVE || f->fs.newvlan == VLAN_REWRITE) | V_DELACK(f->fs.hitcnts) | V_L2T_IDX(f->l2te ? f->l2te->idx : 0) | V_TX_CHAN(f->fs.eport) | V_NO_CONG(f->fs.rpttid) | V_ULP_MODE(f->fs.nat_mode ? ULP_MODE_TCPDDP : ULP_MODE_NONE) | F_TCAM_BYPASS | F_NON_OFFLOAD); cpl6->params = htobe64(V_FILTER_TUPLE(ftuple)); cpl6->opt2 = htobe32(F_RSS_QUEUE_VALID | V_RSS_QUEUE(f->fs.iq) | V_TX_QUEUE(f->fs.nat_mode) | V_WND_SCALE_EN(f->fs.nat_flag_chk) | V_RX_FC_DISABLE(f->fs.nat_seq_chk ? 1 : 0) | F_T5_OPT_2_VALID | F_RX_CHANNEL | V_SACK_EN(f->fs.swapmac) | V_CONG_CNTRL((f->fs.action == FILTER_DROP) | (f->fs.dirsteer << 1)) | V_PACE(f->fs.maskhash | (f->fs.dirsteerhash << 1))); } static int act_open_cpl_len16(struct adapter *sc, int isipv6) { int idx; static const int sz_table[3][2] = { { howmany(sizeof (struct cpl_act_open_req), 16), howmany(sizeof (struct cpl_act_open_req6), 16) }, { howmany(sizeof (struct cpl_t5_act_open_req), 16), howmany(sizeof (struct cpl_t5_act_open_req6), 16) }, { howmany(sizeof (struct cpl_t6_act_open_req), 16), howmany(sizeof (struct cpl_t6_act_open_req6), 16) }, }; MPASS(chip_id(sc) >= CHELSIO_T4); idx = min(chip_id(sc) - CHELSIO_T4, 2); return (sz_table[idx][!!isipv6]); } static int set_hashfilter(struct adapter *sc, struct t4_filter *t, uint64_t ftuple, struct l2t_entry *l2te, struct smt_entry *smt) { void *wr; struct wrq_cookie cookie; struct filter_entry *f; int rc, atid = -1; uint32_t hash; MPASS(t->fs.hash); /* Already validated against fconf, iconf */ MPASS((t->fs.val.pfvf_vld & t->fs.val.ovlan_vld) == 0); MPASS((t->fs.mask.pfvf_vld & t->fs.mask.ovlan_vld) == 0); hash = hf_hashfn_4t(&t->fs); mtx_lock(&sc->tids.hftid_lock); if (lookup_hf(sc, &t->fs, hash) != NULL) { rc = EEXIST; goto done; } f = malloc(sizeof(*f), M_CXGBE, M_ZERO | M_NOWAIT); if (__predict_false(f == NULL)) { rc = ENOMEM; goto done; } f->fs = t->fs; f->l2te = l2te; f->smt = smt; atid = alloc_atid(sc, f); if (__predict_false(atid) == -1) { free(f, M_CXGBE); rc = EAGAIN; goto done; } MPASS(atid >= 0); wr = start_wrq_wr(&sc->sge.ctrlq[0], act_open_cpl_len16(sc, f->fs.type), &cookie); if (wr == NULL) { free_atid(sc, atid); free(f, M_CXGBE); rc = ENOMEM; goto done; } if (f->fs.type) mk_act_open_req6(sc, f, atid, ftuple, wr); else mk_act_open_req(sc, f, atid, ftuple, wr); f->locked = 1; /* ithread mustn't free f if ioctl is still around. 
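 * Ownership convention, as used by the reply handlers above:
 * 'pending' stays set while a firmware reply is outstanding and
 * 'locked' means the ioctl thread still owns the final free.  If
 * the wait below is interrupted, the ioctl clears 'locked' and
 * returns EINPROGRESS, and the ithread becomes responsible for the
 * entry (it frees it if the setup ultimately fails).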
*/ f->pending = 1; f->tid = -1; insert_hf(sc, f, hash); commit_wrq_wr(&sc->sge.ctrlq[0], wr, &cookie); for (;;) { MPASS(f->locked); if (f->pending == 0) { if (f->valid) { rc = 0; f->locked = 0; t->idx = f->tid; } else { rc = f->tid; free(f, M_CXGBE); } break; } if (cv_wait_sig(&sc->tids.hftid_cv, &sc->tids.hftid_lock) != 0) { f->locked = 0; rc = EINPROGRESS; break; } } done: mtx_unlock(&sc->tids.hftid_lock); return (rc); } -/* SET_TCB_FIELD sent as a ULP command looks like this */ -#define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \ - sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core)) - -static void * -mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, uint64_t word, uint64_t mask, - uint64_t val, uint32_t tid, uint32_t qid) -{ - struct ulptx_idata *ulpsc; - struct cpl_set_tcb_field_core *req; - - ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); - ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16)); - - ulpsc = (struct ulptx_idata *)(ulpmc + 1); - ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); - ulpsc->len = htobe32(sizeof(*req)); - - req = (struct cpl_set_tcb_field_core *)(ulpsc + 1); - OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); - req->reply_ctrl = htobe16(V_NO_REPLY(1) | V_QUEUENO(qid)); - req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0)); - req->mask = htobe64(mask); - req->val = htobe64(val); - - ulpsc = (struct ulptx_idata *)(req + 1); - if (LEN__SET_TCB_FIELD_ULP % 16) { - ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); - ulpsc->len = htobe32(0); - return (ulpsc + 1); - } - return (ulpsc); -} - /* ABORT_REQ sent as a ULP command looks like this */ #define LEN__ABORT_REQ_ULP (sizeof(struct ulp_txpkt) + \ sizeof(struct ulptx_idata) + sizeof(struct cpl_abort_req_core)) static void * mk_abort_req_ulp(struct ulp_txpkt *ulpmc, uint32_t tid) { struct ulptx_idata *ulpsc; struct cpl_abort_req_core *req; ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); ulpmc->len = htobe32(howmany(LEN__ABORT_REQ_ULP, 16)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(sizeof(*req)); req = (struct cpl_abort_req_core *)(ulpsc + 1); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); req->rsvd0 = htonl(0); req->rsvd1 = 0; req->cmd = CPL_ABORT_NO_RST; ulpsc = (struct ulptx_idata *)(req + 1); if (LEN__ABORT_REQ_ULP % 16) { ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); ulpsc->len = htobe32(0); return (ulpsc + 1); } return (ulpsc); } /* ABORT_RPL sent as a ULP command looks like this */ #define LEN__ABORT_RPL_ULP (sizeof(struct ulp_txpkt) + \ sizeof(struct ulptx_idata) + sizeof(struct cpl_abort_rpl_core)) static void * mk_abort_rpl_ulp(struct ulp_txpkt *ulpmc, uint32_t tid) { struct ulptx_idata *ulpsc; struct cpl_abort_rpl_core *rpl; ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); ulpmc->len = htobe32(howmany(LEN__ABORT_RPL_ULP, 16)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(sizeof(*rpl)); rpl = (struct cpl_abort_rpl_core *)(ulpsc + 1); OPCODE_TID(rpl) = htobe32(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); rpl->rsvd0 = htonl(0); rpl->rsvd1 = 0; rpl->cmd = CPL_ABORT_NO_RST; ulpsc = (struct ulptx_idata *)(rpl + 1); if (LEN__ABORT_RPL_ULP % 16) { ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); ulpsc->len = htobe32(0); return (ulpsc + 1); } return (ulpsc); } static inline int del_hashfilter_wrlen(void) { return 
(sizeof(struct work_request_hdr) + roundup2(LEN__SET_TCB_FIELD_ULP, 16) + roundup2(LEN__ABORT_REQ_ULP, 16) + roundup2(LEN__ABORT_RPL_ULP, 16)); } static void -mk_del_hashfilter_wr(int tid, struct work_request_hdr *wrh, int wrlen, int qid) +mk_del_hashfilter_wr(struct adapter *sc, int tid, struct work_request_hdr *wrh, + int wrlen, int qid) { struct ulp_txpkt *ulpmc; INIT_ULPTX_WRH(wrh, wrlen, 0, 0); ulpmc = (struct ulp_txpkt *)(wrh + 1); - ulpmc = mk_set_tcb_field_ulp(ulpmc, W_TCB_RSS_INFO, - V_TCB_RSS_INFO(M_TCB_RSS_INFO), V_TCB_RSS_INFO(qid), tid, 0); + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, tid, W_TCB_RSS_INFO, + V_TCB_RSS_INFO(M_TCB_RSS_INFO), V_TCB_RSS_INFO(qid)); ulpmc = mk_abort_req_ulp(ulpmc, tid); ulpmc = mk_abort_rpl_ulp(ulpmc, tid); } static int del_hashfilter(struct adapter *sc, struct t4_filter *t) { struct tid_info *ti = &sc->tids; void *wr; struct filter_entry *f; struct wrq_cookie cookie; int rc; const int wrlen = del_hashfilter_wrlen(); const int inv_tid = ti->ntids + ti->tid_base; MPASS(sc->tids.hftid_hash_4t != NULL); MPASS(sc->tids.ntids > 0); if (t->idx < sc->tids.tid_base || t->idx >= inv_tid) return (EINVAL); mtx_lock(&ti->hftid_lock); f = lookup_hftid(sc, t->idx); if (f == NULL || f->valid == 0) { rc = EINVAL; goto done; } MPASS(f->tid == t->idx); if (f->locked) { rc = EPERM; goto done; } if (f->pending) { rc = EBUSY; goto done; } wr = start_wrq_wr(&sc->sge.ctrlq[0], howmany(wrlen, 16), &cookie); if (wr == NULL) { rc = ENOMEM; goto done; } - mk_del_hashfilter_wr(t->idx, wr, wrlen, sc->sge.fwq.abs_id); + mk_del_hashfilter_wr(sc, t->idx, wr, wrlen, sc->sge.fwq.abs_id); f->locked = 1; f->pending = 1; commit_wrq_wr(&sc->sge.ctrlq[0], wr, &cookie); t->fs = f->fs; /* extra info for the caller */ for (;;) { MPASS(f->locked); if (f->pending == 0) { if (f->valid) { f->locked = 0; rc = EIO; } else { rc = 0; free(f, M_CXGBE); } break; } if (cv_wait_sig(&ti->hftid_cv, &ti->hftid_lock) != 0) { f->locked = 0; rc = EINPROGRESS; break; } } done: mtx_unlock(&ti->hftid_lock); return (rc); } #define WORD_MASK 0xffffffff static void set_nat_params(struct adapter *sc, struct filter_entry *f, const bool dip, const bool sip, const bool dp, const bool sp) { if (dip) { if (f->fs.type) { set_tcb_field(sc, f->tid, W_TCB_SND_UNA_RAW, WORD_MASK, f->fs.nat_dip[15] | f->fs.nat_dip[14] << 8 | f->fs.nat_dip[13] << 16 | f->fs.nat_dip[12] << 24, 1); set_tcb_field(sc, f->tid, W_TCB_SND_UNA_RAW + 1, WORD_MASK, f->fs.nat_dip[11] | f->fs.nat_dip[10] << 8 | f->fs.nat_dip[9] << 16 | f->fs.nat_dip[8] << 24, 1); set_tcb_field(sc, f->tid, W_TCB_SND_UNA_RAW + 2, WORD_MASK, f->fs.nat_dip[7] | f->fs.nat_dip[6] << 8 | f->fs.nat_dip[5] << 16 | f->fs.nat_dip[4] << 24, 1); set_tcb_field(sc, f->tid, W_TCB_SND_UNA_RAW + 3, WORD_MASK, f->fs.nat_dip[3] | f->fs.nat_dip[2] << 8 | f->fs.nat_dip[1] << 16 | f->fs.nat_dip[0] << 24, 1); } else { set_tcb_field(sc, f->tid, W_TCB_RX_FRAG3_LEN_RAW, WORD_MASK, f->fs.nat_dip[3] | f->fs.nat_dip[2] << 8 | f->fs.nat_dip[1] << 16 | f->fs.nat_dip[0] << 24, 1); } } if (sip) { if (f->fs.type) { set_tcb_field(sc, f->tid, W_TCB_RX_FRAG2_PTR_RAW, WORD_MASK, f->fs.nat_sip[15] | f->fs.nat_sip[14] << 8 | f->fs.nat_sip[13] << 16 | f->fs.nat_sip[12] << 24, 1); set_tcb_field(sc, f->tid, W_TCB_RX_FRAG2_PTR_RAW + 1, WORD_MASK, f->fs.nat_sip[11] | f->fs.nat_sip[10] << 8 | f->fs.nat_sip[9] << 16 | f->fs.nat_sip[8] << 24, 1); set_tcb_field(sc, f->tid, W_TCB_RX_FRAG2_PTR_RAW + 2, WORD_MASK, f->fs.nat_sip[7] | f->fs.nat_sip[6] << 8 | f->fs.nat_sip[5] << 16 | f->fs.nat_sip[4] << 24, 1); set_tcb_field(sc, 
f->tid, W_TCB_RX_FRAG2_PTR_RAW + 3, WORD_MASK, f->fs.nat_sip[3] | f->fs.nat_sip[2] << 8 | f->fs.nat_sip[1] << 16 | f->fs.nat_sip[0] << 24, 1); } else { set_tcb_field(sc, f->tid, W_TCB_RX_FRAG3_START_IDX_OFFSET_RAW, WORD_MASK, f->fs.nat_sip[3] | f->fs.nat_sip[2] << 8 | f->fs.nat_sip[1] << 16 | f->fs.nat_sip[0] << 24, 1); } } set_tcb_field(sc, f->tid, W_TCB_PDU_HDR_LEN, WORD_MASK, (dp ? f->fs.nat_dport : 0) | (sp ? f->fs.nat_sport << 16 : 0), 1); } /* * Returns EINPROGRESS to indicate that at least one TCB update was sent and the * last of the series of updates requested a reply. The reply informs the * driver that the filter is fully setup. */ static int configure_hashfilter_tcb(struct adapter *sc, struct filter_entry *f) { int updated = 0; MPASS(f->tid < sc->tids.ntids); MPASS(f->fs.hash); MPASS(f->pending); MPASS(f->valid == 0); if (f->fs.newdmac) { set_tcb_tflag(sc, f->tid, S_TF_CCTRL_ECE, 1, 1); updated++; } if (f->fs.newvlan == VLAN_INSERT || f->fs.newvlan == VLAN_REWRITE) { set_tcb_tflag(sc, f->tid, S_TF_CCTRL_RFR, 1, 1); updated++; } if (f->fs.newsmac) { MPASS(f->smt != NULL); set_tcb_tflag(sc, f->tid, S_TF_CCTRL_CWR, 1, 1); set_tcb_field(sc, f->tid, W_TCB_SMAC_SEL, V_TCB_SMAC_SEL(M_TCB_SMAC_SEL), V_TCB_SMAC_SEL(f->smt->idx), 1); updated++; } switch(f->fs.nat_mode) { case NAT_MODE_NONE: break; case NAT_MODE_DIP: set_nat_params(sc, f, true, false, false, false); updated++; break; case NAT_MODE_DIP_DP: set_nat_params(sc, f, true, false, true, false); updated++; break; case NAT_MODE_DIP_DP_SIP: set_nat_params(sc, f, true, true, true, false); updated++; break; case NAT_MODE_DIP_DP_SP: set_nat_params(sc, f, true, false, true, true); updated++; break; case NAT_MODE_SIP_SP: set_nat_params(sc, f, false, true, false, true); updated++; break; case NAT_MODE_DIP_SIP_SP: set_nat_params(sc, f, true, true, false, true); updated++; break; case NAT_MODE_ALL: set_nat_params(sc, f, true, true, true, true); updated++; break; default: MPASS(0); /* should have been validated earlier */ break; } if (f->fs.nat_seq_chk) { set_tcb_field(sc, f->tid, W_TCB_RCV_NXT, V_TCB_RCV_NXT(M_TCB_RCV_NXT), V_TCB_RCV_NXT(f->fs.nat_seq_chk), 1); updated++; } if (is_t5(sc) && f->fs.action == FILTER_DROP) { /* * Migrating = 1, Non-offload = 0 to get a T5 hashfilter to drop. */ set_tcb_field(sc, f->tid, W_TCB_T_FLAGS, V_TF_NON_OFFLOAD(1) | V_TF_MIGRATING(1), V_TF_MIGRATING(1), 1); updated++; } /* * Enable switching after all secondary resources (L2T entry, SMT entry, * etc.) are setup so that any switched packet will use correct * values. */ if (f->fs.action == FILTER_SWITCH) { set_tcb_tflag(sc, f->tid, S_TF_CCTRL_ECN, 1, 1); updated++; } if (f->fs.hitcnts || updated > 0) { set_tcb_field(sc, f->tid, W_TCB_TIMESTAMP, V_TCB_TIMESTAMP(M_TCB_TIMESTAMP) | V_TCB_T_RTT_TS_RECENT_AGE(M_TCB_T_RTT_TS_RECENT_AGE), V_TCB_TIMESTAMP(0ULL) | V_TCB_T_RTT_TS_RECENT_AGE(0ULL), 0); return (EINPROGRESS); } return (0); } diff --git a/sys/dev/cxgbe/tom/t4_ddp.c b/sys/dev/cxgbe/tom/t4_ddp.c index c1d4af45fd70..a08ddea00d05 100644 --- a/sys/dev/cxgbe/tom/t4_ddp.c +++ b/sys/dev/cxgbe/tom/t4_ddp.c @@ -1,3051 +1,3019 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_tcb.h" #include "tom/t4_tom.h" /* * Use the 'backend3' field in AIO jobs to store the amount of data * received by the AIO job so far. */ #define aio_received backend3 static void aio_ddp_requeue_task(void *context, int pending); static void ddp_complete_all(struct toepcb *toep, int error); static void t4_aio_cancel_active(struct kaiocb *job); static void t4_aio_cancel_queued(struct kaiocb *job); static int t4_alloc_page_pods_for_rcvbuf(struct ppod_region *pr, struct ddp_rcv_buffer *drb); static int t4_write_page_pods_for_rcvbuf(struct adapter *sc, struct sge_wrq *wrq, int tid, struct ddp_rcv_buffer *drb); static TAILQ_HEAD(, pageset) ddp_orphan_pagesets; static struct mtx ddp_orphan_pagesets_lock; static struct task ddp_orphan_task; #define MAX_DDP_BUFFER_SIZE (M_TCB_RX_DDP_BUF0_LEN) /* * A page set holds information about a user buffer used for AIO DDP. * The page set holds resources such as the VM pages backing the * buffer (either held or wired) and the page pods associated with the * buffer. Recently used page sets are cached to allow for efficient * reuse of buffers (avoiding the need to re-fault in pages, hold * them, etc.). Note that cached page sets keep the backing pages * wired. The number of wired pages is capped by only allowing for * two wired pagesets per connection. This is not a perfect cap, but * is a trade-off for performance. * * If an application ping-pongs two buffers for a connection via * aio_read(2) then those buffers should remain wired and expensive VM * fault lookups should be avoided after each buffer has been used * once. If an application uses more than two buffers then this will * fall back to doing expensive VM fault lookups for each operation. 
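 *
 * For example, an application that queues two aio_read(2) requests
 * up front and, as each one completes, consumes the data and
 * immediately resubmits the same buffer, keeps both pagesets
 * cached and wired and avoids the repeated VM fault lookups
 * described above.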
*/ static void free_pageset(struct tom_data *td, struct pageset *ps) { vm_page_t p; int i; if (ps->prsv.prsv_nppods > 0) t4_free_page_pods(&ps->prsv); for (i = 0; i < ps->npages; i++) { p = ps->pages[i]; vm_page_unwire(p, PQ_INACTIVE); } mtx_lock(&ddp_orphan_pagesets_lock); TAILQ_INSERT_TAIL(&ddp_orphan_pagesets, ps, link); taskqueue_enqueue(taskqueue_thread, &ddp_orphan_task); mtx_unlock(&ddp_orphan_pagesets_lock); } static void ddp_free_orphan_pagesets(void *context, int pending) { struct pageset *ps; mtx_lock(&ddp_orphan_pagesets_lock); while (!TAILQ_EMPTY(&ddp_orphan_pagesets)) { ps = TAILQ_FIRST(&ddp_orphan_pagesets); TAILQ_REMOVE(&ddp_orphan_pagesets, ps, link); mtx_unlock(&ddp_orphan_pagesets_lock); if (ps->vm) vmspace_free(ps->vm); free(ps, M_CXGBE); mtx_lock(&ddp_orphan_pagesets_lock); } mtx_unlock(&ddp_orphan_pagesets_lock); } static void recycle_pageset(struct toepcb *toep, struct pageset *ps) { DDP_ASSERT_LOCKED(toep); if (!(toep->ddp.flags & DDP_DEAD)) { KASSERT(toep->ddp.cached_count + toep->ddp.active_count < nitems(toep->ddp.db), ("too many wired pagesets")); TAILQ_INSERT_HEAD(&toep->ddp.cached_pagesets, ps, link); toep->ddp.cached_count++; } else free_pageset(toep->td, ps); } static void ddp_complete_one(struct kaiocb *job, int error) { long copied; /* * If this job had copied data out of the socket buffer before * it was cancelled, report it as a short read rather than an * error. */ copied = job->aio_received; if (copied != 0 || error == 0) aio_complete(job, copied, 0); else aio_complete(job, -1, error); } static void free_ddp_rcv_buffer(struct toepcb *toep, struct ddp_rcv_buffer *drb) { t4_free_page_pods(&drb->prsv); contigfree(drb->buf, drb->len, M_CXGBE); free(drb, M_CXGBE); counter_u64_add(toep->ofld_rxq->ddp_buffer_free, 1); free_toepcb(toep); } static void recycle_ddp_rcv_buffer(struct toepcb *toep, struct ddp_rcv_buffer *drb) { DDP_CACHE_LOCK(toep); if (!(toep->ddp.flags & DDP_DEAD) && toep->ddp.cached_count < t4_ddp_rcvbuf_cache) { TAILQ_INSERT_HEAD(&toep->ddp.cached_buffers, drb, link); toep->ddp.cached_count++; DDP_CACHE_UNLOCK(toep); } else { DDP_CACHE_UNLOCK(toep); free_ddp_rcv_buffer(toep, drb); } } static struct ddp_rcv_buffer * alloc_cached_ddp_rcv_buffer(struct toepcb *toep) { struct ddp_rcv_buffer *drb; DDP_CACHE_LOCK(toep); if (!TAILQ_EMPTY(&toep->ddp.cached_buffers)) { drb = TAILQ_FIRST(&toep->ddp.cached_buffers); TAILQ_REMOVE(&toep->ddp.cached_buffers, drb, link); toep->ddp.cached_count--; counter_u64_add(toep->ofld_rxq->ddp_buffer_reuse, 1); } else drb = NULL; DDP_CACHE_UNLOCK(toep); return (drb); } static struct ddp_rcv_buffer * alloc_ddp_rcv_buffer(struct toepcb *toep, int how) { struct tom_data *td = toep->td; struct adapter *sc = td_adapter(td); struct ddp_rcv_buffer *drb; int error; drb = malloc(sizeof(*drb), M_CXGBE, how | M_ZERO); if (drb == NULL) return (NULL); drb->buf = contigmalloc(t4_ddp_rcvbuf_len, M_CXGBE, how, 0, ~0, t4_ddp_rcvbuf_len, 0); if (drb->buf == NULL) { free(drb, M_CXGBE); return (NULL); } drb->len = t4_ddp_rcvbuf_len; drb->refs = 1; error = t4_alloc_page_pods_for_rcvbuf(&td->pr, drb); if (error != 0) { contigfree(drb->buf, drb->len, M_CXGBE); free(drb, M_CXGBE); return (NULL); } error = t4_write_page_pods_for_rcvbuf(sc, toep->ctrlq, toep->tid, drb); if (error != 0) { t4_free_page_pods(&drb->prsv); contigfree(drb->buf, drb->len, M_CXGBE); free(drb, M_CXGBE); return (NULL); } hold_toepcb(toep); counter_u64_add(toep->ofld_rxq->ddp_buffer_alloc, 1); return (drb); } static void free_ddp_buffer(struct toepcb *toep, struct 
ddp_buffer *db) { if ((toep->ddp.flags & DDP_RCVBUF) != 0) { if (db->drb != NULL) free_ddp_rcv_buffer(toep, db->drb); #ifdef INVARIANTS db->drb = NULL; #endif return; } if (db->job) { /* * XXX: If we are un-offloading the socket then we * should requeue these on the socket somehow. If we * got a FIN from the remote end, then this completes * any remaining requests with an EOF read. */ if (!aio_clear_cancel_function(db->job)) ddp_complete_one(db->job, 0); #ifdef INVARIANTS db->job = NULL; #endif } if (db->ps) { free_pageset(toep->td, db->ps); #ifdef INVARIANTS db->ps = NULL; #endif } } static void ddp_init_toep(struct toepcb *toep) { toep->ddp.flags = DDP_OK; toep->ddp.active_id = -1; mtx_init(&toep->ddp.lock, "t4 ddp", NULL, MTX_DEF); mtx_init(&toep->ddp.cache_lock, "t4 ddp cache", NULL, MTX_DEF); } void ddp_uninit_toep(struct toepcb *toep) { mtx_destroy(&toep->ddp.lock); mtx_destroy(&toep->ddp.cache_lock); } void release_ddp_resources(struct toepcb *toep) { struct ddp_rcv_buffer *drb; struct pageset *ps; int i; DDP_LOCK(toep); DDP_CACHE_LOCK(toep); toep->ddp.flags |= DDP_DEAD; DDP_CACHE_UNLOCK(toep); for (i = 0; i < nitems(toep->ddp.db); i++) { free_ddp_buffer(toep, &toep->ddp.db[i]); } if ((toep->ddp.flags & DDP_AIO) != 0) { while ((ps = TAILQ_FIRST(&toep->ddp.cached_pagesets)) != NULL) { TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link); free_pageset(toep->td, ps); } ddp_complete_all(toep, 0); } if ((toep->ddp.flags & DDP_RCVBUF) != 0) { DDP_CACHE_LOCK(toep); while ((drb = TAILQ_FIRST(&toep->ddp.cached_buffers)) != NULL) { TAILQ_REMOVE(&toep->ddp.cached_buffers, drb, link); free_ddp_rcv_buffer(toep, drb); } DDP_CACHE_UNLOCK(toep); } DDP_UNLOCK(toep); } #ifdef INVARIANTS void ddp_assert_empty(struct toepcb *toep) { int i; MPASS((toep->ddp.flags & (DDP_TASK_ACTIVE | DDP_DEAD)) != DDP_TASK_ACTIVE); for (i = 0; i < nitems(toep->ddp.db); i++) { if ((toep->ddp.flags & DDP_AIO) != 0) { MPASS(toep->ddp.db[i].job == NULL); MPASS(toep->ddp.db[i].ps == NULL); } else MPASS(toep->ddp.db[i].drb == NULL); } if ((toep->ddp.flags & DDP_AIO) != 0) { MPASS(TAILQ_EMPTY(&toep->ddp.cached_pagesets)); MPASS(TAILQ_EMPTY(&toep->ddp.aiojobq)); } if ((toep->ddp.flags & DDP_RCVBUF) != 0) MPASS(TAILQ_EMPTY(&toep->ddp.cached_buffers)); } #endif static void complete_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db, unsigned int db_idx) { struct ddp_rcv_buffer *drb; unsigned int db_flag; toep->ddp.active_count--; if (toep->ddp.active_id == db_idx) { if (toep->ddp.active_count == 0) { if ((toep->ddp.flags & DDP_AIO) != 0) KASSERT(toep->ddp.db[db_idx ^ 1].job == NULL, ("%s: active_count mismatch", __func__)); else KASSERT(toep->ddp.db[db_idx ^ 1].drb == NULL, ("%s: active_count mismatch", __func__)); toep->ddp.active_id = -1; } else toep->ddp.active_id ^= 1; #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: tid %u, ddp_active_id = %d", __func__, toep->tid, toep->ddp.active_id); #endif } else { KASSERT(toep->ddp.active_count != 0 && toep->ddp.active_id != -1, ("%s: active count mismatch", __func__)); } if ((toep->ddp.flags & DDP_AIO) != 0) { db->cancel_pending = 0; db->job = NULL; recycle_pageset(toep, db->ps); db->ps = NULL; } else { drb = db->drb; if (atomic_fetchadd_int(&drb->refs, -1) == 1) recycle_ddp_rcv_buffer(toep, drb); db->drb = NULL; db->placed = 0; } db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE; KASSERT(toep->ddp.flags & db_flag, ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x", __func__, toep, toep->ddp.flags)); toep->ddp.flags &= ~db_flag; } /* Called when m_free drops the last reference. 
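 * The buffer's lifetime is driven by drb->refs: queue_ddp_rcvbuf()
 * sets it to 1 for the hardware's use of the buffer (dropped in
 * complete_ddp_buffer()) and queue_ddp_rcvbuf_mbuf() takes an
 * additional reference per mbuf via m_extaddref().  Once the socket
 * buffer frees the last such mbuf this callback runs and the buffer
 * is recycled into the per-connection cache or freed.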
*/ static void ddp_rcv_mbuf_done(struct mbuf *m) { struct toepcb *toep = m->m_ext.ext_arg1; struct ddp_rcv_buffer *drb = m->m_ext.ext_arg2; recycle_ddp_rcv_buffer(toep, drb); } static void queue_ddp_rcvbuf_mbuf(struct toepcb *toep, u_int db_idx, u_int len) { struct inpcb *inp = toep->inp; struct sockbuf *sb; struct ddp_buffer *db; struct ddp_rcv_buffer *drb; struct mbuf *m; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { printf("%s: failed to allocate mbuf", __func__); return; } m->m_pkthdr.rcvif = toep->vi->ifp; db = &toep->ddp.db[db_idx]; drb = db->drb; m_extaddref(m, (char *)drb->buf + db->placed, len, &drb->refs, ddp_rcv_mbuf_done, toep, drb); m->m_pkthdr.len = len; m->m_len = len; sb = &inp->inp_socket->so_rcv; SOCKBUF_LOCK_ASSERT(sb); sbappendstream_locked(sb, m, 0); db->placed += len; toep->ofld_rxq->rx_toe_ddp_octets += len; } /* XXX: handle_ddp_data code duplication */ void insert_ddp_data(struct toepcb *toep, uint32_t n) { struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); struct ddp_buffer *db; struct kaiocb *job; size_t placed; long copied; unsigned int db_idx; #ifdef INVARIANTS unsigned int db_flag; #endif bool ddp_rcvbuf; INP_WLOCK_ASSERT(inp); DDP_ASSERT_LOCKED(toep); ddp_rcvbuf = (toep->ddp.flags & DDP_RCVBUF) != 0; tp->rcv_nxt += n; #ifndef USE_DDP_RX_FLOW_CONTROL KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__)); tp->rcv_wnd -= n; #endif CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP", __func__, n); while (toep->ddp.active_count > 0) { MPASS(toep->ddp.active_id != -1); db_idx = toep->ddp.active_id; #ifdef INVARIANTS db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE; #endif MPASS((toep->ddp.flags & db_flag) != 0); db = &toep->ddp.db[db_idx]; if (ddp_rcvbuf) { placed = n; if (placed > db->drb->len - db->placed) placed = db->drb->len - db->placed; if (placed != 0) queue_ddp_rcvbuf_mbuf(toep, db_idx, placed); complete_ddp_buffer(toep, db, db_idx); n -= placed; continue; } job = db->job; copied = job->aio_received; placed = n; if (placed > job->uaiocb.aio_nbytes - copied) placed = job->uaiocb.aio_nbytes - copied; if (placed > 0) { job->msgrcv = 1; toep->ofld_rxq->rx_aio_ddp_jobs++; } toep->ofld_rxq->rx_aio_ddp_octets += placed; if (!aio_clear_cancel_function(job)) { /* * Update the copied length for when * t4_aio_cancel_active() completes this * request. */ job->aio_received += placed; } else if (copied + placed != 0) { CTR4(KTR_CXGBE, "%s: completing %p (copied %ld, placed %lu)", __func__, job, copied, placed); /* XXX: This always completes if there is some data. 
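 * In other words, a job that has already received some data is
 * finished off as a short read when the connection falls out of
 * DDP, while a job with no data yet is put back on the AIO queue
 * below.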
*/ aio_complete(job, copied + placed, 0); } else if (aio_set_cancel_function(job, t4_aio_cancel_queued)) { TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list); toep->ddp.waiting_count++; } else aio_cancel(job); n -= placed; complete_ddp_buffer(toep, db, db_idx); } MPASS(n == 0); } /* SET_TCB_FIELD sent as a ULP command looks like this */ #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \ sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core)) /* RX_DATA_ACK sent as a ULP command looks like this */ #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \ sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core)) -static inline void * -mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep, - uint64_t word, uint64_t mask, uint64_t val) -{ - struct ulptx_idata *ulpsc; - struct cpl_set_tcb_field_core *req; - - ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); - ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16)); - - ulpsc = (struct ulptx_idata *)(ulpmc + 1); - ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); - ulpsc->len = htobe32(sizeof(*req)); - - req = (struct cpl_set_tcb_field_core *)(ulpsc + 1); - OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid)); - req->reply_ctrl = htobe16(V_NO_REPLY(1) | - V_QUEUENO(toep->ofld_rxq->iq.abs_id)); - req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0)); - req->mask = htobe64(mask); - req->val = htobe64(val); - - ulpsc = (struct ulptx_idata *)(req + 1); - if (LEN__SET_TCB_FIELD_ULP % 16) { - ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); - ulpsc->len = htobe32(0); - return (ulpsc + 1); - } - return (ulpsc); -} - static inline void * mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep) { struct ulptx_idata *ulpsc; struct cpl_rx_data_ack_core *req; ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(sizeof(*req)); req = (struct cpl_rx_data_ack_core *)(ulpsc + 1); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid)); req->credit_dack = htobe32(F_RX_MODULATE_RX); ulpsc = (struct ulptx_idata *)(req + 1); if (LEN__RX_DATA_ACK_ULP % 16) { ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); ulpsc->len = htobe32(0); return (ulpsc + 1); } return (ulpsc); } static struct wrqe * mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx, struct ppod_reservation *prsv, int offset, uint32_t len, uint64_t ddp_flags, uint64_t ddp_flags_mask) { struct wrqe *wr; struct work_request_hdr *wrh; struct ulp_txpkt *ulpmc; int wrlen; KASSERT(db_idx == 0 || db_idx == 1, ("%s: bad DDP buffer index %d", __func__, db_idx)); /* * We'll send a compound work request that has 3 SET_TCB_FIELDs and an * RX_DATA_ACK (with RX_MODULATE to speed up delivery). * * The work request header is 16B and always ends at a 16B boundary. * The ULPTX master commands that follow must all end at 16B boundaries * too so we round up the size to 16. 
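 *
 * The resulting work request, with each ULP_TX_PKT padded to a 16B
 * boundary by a trailing NOOP sub-command where needed, looks like:
 *
 *	work_request_hdr (atomic)
 *	ULP_TX_PKT: SET_TCB_FIELD RX_DDP_BUF[01]_TAG
 *	ULP_TX_PKT: SET_TCB_FIELD RX_DDP_BUF[01]_OFFSET / _LEN
 *	ULP_TX_PKT: SET_TCB_FIELD RX_DDP_FLAGS
 *	ULP_TX_PKT: RX_DATA_ACK (RX_MODULATE)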
*/ wrlen = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) + roundup2(LEN__RX_DATA_ACK_ULP, 16); wr = alloc_wrqe(wrlen, toep->ctrlq); if (wr == NULL) return (NULL); wrh = wrtod(wr); INIT_ULPTX_WRH(wrh, wrlen, 1, 0); /* atomic */ ulpmc = (struct ulp_txpkt *)(wrh + 1); /* Write the buffer's tag */ - ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_RX_DDP_BUF0_TAG + db_idx, V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), V_TCB_RX_DDP_BUF0_TAG(prsv->prsv_tag)); /* Update the current offset in the DDP buffer and its total length */ if (db_idx == 0) - ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_RX_DDP_BUF0_OFFSET, V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), V_TCB_RX_DDP_BUF0_OFFSET(offset) | V_TCB_RX_DDP_BUF0_LEN(len)); else - ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_RX_DDP_BUF1_OFFSET, V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32), V_TCB_RX_DDP_BUF1_OFFSET(offset) | V_TCB_RX_DDP_BUF1_LEN((u64)len << 32)); /* Update DDP flags */ - ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS, + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_RX_DDP_FLAGS, ddp_flags_mask, ddp_flags); /* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */ ulpmc = mk_rx_data_ack_ulp(ulpmc, toep); return (wr); } static int handle_ddp_data_aio(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len) { uint32_t report = be32toh(ddp_report); unsigned int db_idx; struct inpcb *inp = toep->inp; struct ddp_buffer *db; struct tcpcb *tp; struct socket *so; struct sockbuf *sb; struct kaiocb *job; long copied; db_idx = report & F_DDP_BUF_IDX ? 1 : 0; if (__predict_false(!(report & F_DDP_INV))) CXGBE_UNIMPLEMENTED("DDP buffer still valid"); INP_WLOCK(inp); so = inp_inpcbtosocket(inp); sb = &so->so_rcv; DDP_LOCK(toep); KASSERT(toep->ddp.active_id == db_idx, ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx, toep->ddp.active_id, toep->tid)); db = &toep->ddp.db[db_idx]; job = db->job; if (__predict_false(inp->inp_flags & INP_DROPPED)) { /* * This can happen due to an administrative tcpdrop(8). * Just fail the request with ECONNRESET. */ CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x", __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags); if (aio_clear_cancel_function(job)) ddp_complete_one(job, ECONNRESET); goto completed; } tp = intotcpcb(inp); /* * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the * sequence number of the next byte to receive. The length of * the data received for this message must be computed by * comparing the new and old values of rcv_nxt. * * For RX_DATA_DDP, len might be non-zero, but it is only the * length of the most recent DMA. It does not include the * total length of the data received since the previous update * for this DDP buffer. rcv_nxt is the sequence number of the * first received byte from the most recent DMA. 
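 *
 * Worked example with made-up sequence numbers: if tp->rcv_nxt is
 * 0x1000 and an RX_DATA_DDP arrives with rcv_nxt 0x1800 and len
 * 0x200, then 0x800 bytes were placed by earlier DMAs and 0x200 by
 * the latest one, so the adjustment below leaves len = 0xa00.  The
 * same expression covers RX_DDP_COMPLETE because len starts out as
 * 0 and rcv_nxt already points one past the last placed byte.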
*/ len += be32toh(rcv_nxt) - tp->rcv_nxt; tp->rcv_nxt += len; tp->t_rcvtime = ticks; #ifndef USE_DDP_RX_FLOW_CONTROL KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__)); tp->rcv_wnd -= len; #endif #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %u, DDP[%d] placed %d bytes (%#x)", __func__, toep->tid, db_idx, len, report); #endif /* receive buffer autosize */ MPASS(toep->vnet == so->so_vnet); CURVNET_SET(toep->vnet); SOCKBUF_LOCK(sb); if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && sb->sb_hiwat < V_tcp_autorcvbuf_max && len > (sbspace(sb) / 8 * 7)) { struct adapter *sc = td_adapter(toep->td); unsigned int hiwat = sb->sb_hiwat; unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, V_tcp_autorcvbuf_max); if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; } SOCKBUF_UNLOCK(sb); CURVNET_RESTORE(); job->msgrcv = 1; toep->ofld_rxq->rx_aio_ddp_jobs++; toep->ofld_rxq->rx_aio_ddp_octets += len; if (db->cancel_pending) { /* * Update the job's length but defer completion to the * TCB_RPL callback. */ job->aio_received += len; goto out; } else if (!aio_clear_cancel_function(job)) { /* * Update the copied length for when * t4_aio_cancel_active() completes this request. */ job->aio_received += len; } else { copied = job->aio_received; #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %u, completing %p (copied %ld, placed %d)", __func__, toep->tid, job, copied, len); #endif aio_complete(job, copied + len, 0); t4_rcvd(&toep->td->tod, tp); } completed: complete_ddp_buffer(toep, db, db_idx); if (toep->ddp.waiting_count > 0) ddp_queue_toep(toep); out: DDP_UNLOCK(toep); INP_WUNLOCK(inp); return (0); } static bool queue_ddp_rcvbuf(struct toepcb *toep, struct ddp_rcv_buffer *drb) { struct adapter *sc = td_adapter(toep->td); struct ddp_buffer *db; struct wrqe *wr; uint64_t ddp_flags, ddp_flags_mask; int buf_flag, db_idx; DDP_ASSERT_LOCKED(toep); KASSERT((toep->ddp.flags & DDP_DEAD) == 0, ("%s: DDP_DEAD", __func__)); KASSERT(toep->ddp.active_count < nitems(toep->ddp.db), ("%s: no empty DDP buffer slot", __func__)); /* Determine which DDP buffer to use. */ if (toep->ddp.db[0].drb == NULL) { db_idx = 0; } else { MPASS(toep->ddp.db[1].drb == NULL); db_idx = 1; } /* * Permit PSH to trigger a partial completion without * invalidating the rest of the buffer, but disable the PUSH * timer. */ ddp_flags = 0; ddp_flags_mask = 0; if (db_idx == 0) { ddp_flags |= V_TF_DDP_PSH_NO_INVALIDATE0(1) | V_TF_DDP_PUSH_DISABLE_0(0) | V_TF_DDP_PSHF_ENABLE_0(1) | V_TF_DDP_BUF0_VALID(1); ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) | V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) | V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1); buf_flag = DDP_BUF0_ACTIVE; } else { ddp_flags |= V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_PUSH_DISABLE_1(0) | V_TF_DDP_PSHF_ENABLE_1(1) | V_TF_DDP_BUF1_VALID(1); ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) | V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1); buf_flag = DDP_BUF1_ACTIVE; } MPASS((toep->ddp.flags & buf_flag) == 0); if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) { MPASS(db_idx == 0); MPASS(toep->ddp.active_id == -1); MPASS(toep->ddp.active_count == 0); ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1); } /* * The TID for this connection should still be valid. If * DDP_DEAD is set, SBS_CANTRCVMORE should be set, so we * shouldn't be this far anyway. 
*/ wr = mk_update_tcb_for_ddp(sc, toep, db_idx, &drb->prsv, 0, drb->len, ddp_flags, ddp_flags_mask); if (wr == NULL) { recycle_ddp_rcv_buffer(toep, drb); printf("%s: mk_update_tcb_for_ddp failed\n", __func__); return (false); } #ifdef VERBOSE_TRACES CTR(KTR_CXGBE, "%s: tid %u, scheduling DDP[%d] (flags %#lx/%#lx)", __func__, toep->tid, db_idx, ddp_flags, ddp_flags_mask); #endif /* * Hold a reference on scheduled buffers that is dropped in * complete_ddp_buffer. */ drb->refs = 1; /* Give the chip the go-ahead. */ t4_wrq_tx(sc, wr); db = &toep->ddp.db[db_idx]; db->drb = drb; toep->ddp.flags |= buf_flag; toep->ddp.active_count++; if (toep->ddp.active_count == 1) { MPASS(toep->ddp.active_id == -1); toep->ddp.active_id = db_idx; CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__, toep->ddp.active_id); } return (true); } static int handle_ddp_data_rcvbuf(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len) { uint32_t report = be32toh(ddp_report); struct inpcb *inp = toep->inp; struct tcpcb *tp; struct socket *so; struct sockbuf *sb; struct ddp_buffer *db; struct ddp_rcv_buffer *drb; unsigned int db_idx; bool invalidated; db_idx = report & F_DDP_BUF_IDX ? 1 : 0; invalidated = (report & F_DDP_INV) != 0; INP_WLOCK(inp); so = inp_inpcbtosocket(inp); sb = &so->so_rcv; DDP_LOCK(toep); KASSERT(toep->ddp.active_id == db_idx, ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx, toep->ddp.active_id, toep->tid)); db = &toep->ddp.db[db_idx]; if (__predict_false(inp->inp_flags & INP_DROPPED)) { /* * This can happen due to an administrative tcpdrop(8). * Just ignore the received data. */ CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x", __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags); if (invalidated) complete_ddp_buffer(toep, db, db_idx); goto out; } tp = intotcpcb(inp); /* * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the * sequence number of the next byte to receive. The length of * the data received for this message must be computed by * comparing the new and old values of rcv_nxt. * * For RX_DATA_DDP, len might be non-zero, but it is only the * length of the most recent DMA. It does not include the * total length of the data received since the previous update * for this DDP buffer. rcv_nxt is the sequence number of the * first received byte from the most recent DMA. 
*/ len += be32toh(rcv_nxt) - tp->rcv_nxt; tp->rcv_nxt += len; tp->t_rcvtime = ticks; #ifndef USE_DDP_RX_FLOW_CONTROL KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__)); tp->rcv_wnd -= len; #endif #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %u, DDP[%d] placed %d bytes (%#x)", __func__, toep->tid, db_idx, len, report); #endif /* receive buffer autosize */ MPASS(toep->vnet == so->so_vnet); CURVNET_SET(toep->vnet); SOCKBUF_LOCK(sb); if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && sb->sb_hiwat < V_tcp_autorcvbuf_max && len > (sbspace(sb) / 8 * 7)) { struct adapter *sc = td_adapter(toep->td); unsigned int hiwat = sb->sb_hiwat; unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, V_tcp_autorcvbuf_max); if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; } if (len > 0) { queue_ddp_rcvbuf_mbuf(toep, db_idx, len); t4_rcvd_locked(&toep->td->tod, tp); } sorwakeup_locked(so); SOCKBUF_UNLOCK_ASSERT(sb); CURVNET_RESTORE(); if (invalidated) complete_ddp_buffer(toep, db, db_idx); else KASSERT(db->placed < db->drb->len, ("%s: full DDP buffer not invalidated", __func__)); if (toep->ddp.active_count != nitems(toep->ddp.db)) { drb = alloc_cached_ddp_rcv_buffer(toep); if (drb == NULL) drb = alloc_ddp_rcv_buffer(toep, M_NOWAIT); if (drb == NULL) ddp_queue_toep(toep); else { if (!queue_ddp_rcvbuf(toep, drb)) { ddp_queue_toep(toep); } } } out: DDP_UNLOCK(toep); INP_WUNLOCK(inp); return (0); } static int handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len) { if ((toep->ddp.flags & DDP_RCVBUF) != 0) return (handle_ddp_data_rcvbuf(toep, ddp_report, rcv_nxt, len)); else return (handle_ddp_data_aio(toep, ddp_report, rcv_nxt, len)); } void handle_ddp_indicate(struct toepcb *toep) { DDP_ASSERT_LOCKED(toep); if ((toep->ddp.flags & DDP_RCVBUF) != 0) { /* * Indicates are not meaningful for RCVBUF since * buffers are activated when the socket option is * set. */ return; } MPASS(toep->ddp.active_count == 0); MPASS((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0); if (toep->ddp.waiting_count == 0) { /* * The pending requests that triggered the request for an * an indicate were cancelled. Those cancels should have * already disabled DDP. Just ignore this as the data is * going into the socket buffer anyway. */ return; } CTR3(KTR_CXGBE, "%s: tid %d indicated (%d waiting)", __func__, toep->tid, toep->ddp.waiting_count); ddp_queue_toep(toep); } CTASSERT(CPL_COOKIE_DDP0 + 1 == CPL_COOKIE_DDP1); static int do_ddp_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); unsigned int db_idx; struct toepcb *toep; struct inpcb *inp; struct ddp_buffer *db; struct kaiocb *job; long copied; if (cpl->status != CPL_ERR_NONE) panic("XXX: tcp_rpl failed: %d", cpl->status); toep = lookup_tid(sc, tid); inp = toep->inp; switch (cpl->cookie) { case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(CPL_COOKIE_DDP0): case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(CPL_COOKIE_DDP1): /* * XXX: This duplicates a lot of code with handle_ddp_data(). */ KASSERT((toep->ddp.flags & DDP_AIO) != 0, ("%s: DDP_RCVBUF", __func__)); db_idx = G_COOKIE(cpl->cookie) - CPL_COOKIE_DDP0; MPASS(db_idx < nitems(toep->ddp.db)); INP_WLOCK(inp); DDP_LOCK(toep); db = &toep->ddp.db[db_idx]; /* * handle_ddp_data() should leave the job around until * this callback runs once a cancel is pending. 
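 * (t4_aio_cancel_active() sets db->cancel_pending and asks the
 * hardware to give the buffer back; handle_ddp_data_aio() then only
 * accumulates the placed bytes in job->aio_received and leaves the
 * final aio_complete()/aio_cancel() to this handler.)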
*/ MPASS(db != NULL); MPASS(db->job != NULL); MPASS(db->cancel_pending); /* * XXX: It's not clear what happens if there is data * placed when the buffer is invalidated. I suspect we * need to read the TCB to see how much data was placed. * * For now this just pretends like nothing was placed. * * XXX: Note that if we did check the PCB we would need to * also take care of updating the tp, etc. */ job = db->job; copied = job->aio_received; if (copied == 0) { CTR2(KTR_CXGBE, "%s: cancelling %p", __func__, job); aio_cancel(job); } else { CTR3(KTR_CXGBE, "%s: completing %p (copied %ld)", __func__, job, copied); aio_complete(job, copied, 0); t4_rcvd(&toep->td->tod, intotcpcb(inp)); } complete_ddp_buffer(toep, db, db_idx); if (toep->ddp.waiting_count > 0) ddp_queue_toep(toep); DDP_UNLOCK(toep); INP_WUNLOCK(inp); break; default: panic("XXX: unknown tcb_rpl offset %#x, cookie %#x", G_WORD(cpl->cookie), G_COOKIE(cpl->cookie)); } return (0); } void handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt) { struct socket *so = toep->inp->inp_socket; struct sockbuf *sb = &so->so_rcv; struct ddp_buffer *db; struct kaiocb *job; long copied; unsigned int db_idx; #ifdef INVARIANTS unsigned int db_flag; #endif int len, placed; bool ddp_rcvbuf; INP_WLOCK_ASSERT(toep->inp); DDP_ASSERT_LOCKED(toep); ddp_rcvbuf = (toep->ddp.flags & DDP_RCVBUF) != 0; /* - 1 is to ignore the byte for FIN */ len = be32toh(rcv_nxt) - tp->rcv_nxt - 1; tp->rcv_nxt += len; CTR(KTR_CXGBE, "%s: tid %d placed %u bytes before FIN", __func__, toep->tid, len); while (toep->ddp.active_count > 0) { MPASS(toep->ddp.active_id != -1); db_idx = toep->ddp.active_id; #ifdef INVARIANTS db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE; #endif MPASS((toep->ddp.flags & db_flag) != 0); db = &toep->ddp.db[db_idx]; if (ddp_rcvbuf) { placed = len; if (placed > db->drb->len - db->placed) placed = db->drb->len - db->placed; if (placed != 0) { SOCKBUF_LOCK(sb); queue_ddp_rcvbuf_mbuf(toep, db_idx, placed); sorwakeup_locked(so); SOCKBUF_UNLOCK_ASSERT(sb); } complete_ddp_buffer(toep, db, db_idx); len -= placed; continue; } job = db->job; copied = job->aio_received; placed = len; if (placed > job->uaiocb.aio_nbytes - copied) placed = job->uaiocb.aio_nbytes - copied; if (placed > 0) { job->msgrcv = 1; toep->ofld_rxq->rx_aio_ddp_jobs++; } toep->ofld_rxq->rx_aio_ddp_octets += placed; if (!aio_clear_cancel_function(job)) { /* * Update the copied length for when * t4_aio_cancel_active() completes this * request. 
*/ job->aio_received += placed; } else { CTR4(KTR_CXGBE, "%s: tid %d completed buf %d len %d", __func__, toep->tid, db_idx, placed); aio_complete(job, copied + placed, 0); } len -= placed; complete_ddp_buffer(toep, db, db_idx); } MPASS(len == 0); if ((toep->ddp.flags & DDP_AIO) != 0) ddp_complete_all(toep, 0); } #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\ F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\ F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR) extern cpl_handler_t t4_cpl_handler[]; static int do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); uint32_t vld; struct toepcb *toep = lookup_tid(sc, tid); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); KASSERT(!(toep->flags & TPF_SYNQE), ("%s: toep %p claims to be a synq entry", __func__, toep)); vld = be32toh(cpl->ddpvld); if (__predict_false(vld & DDP_ERR)) { panic("%s: DDP error 0x%x (tid %d, toep %p)", __func__, vld, tid, toep); } if (ulp_mode(toep) == ULP_MODE_ISCSI) { t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m); return (0); } handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len)); return (0); } static int do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); KASSERT(!(toep->flags & TPF_SYNQE), ("%s: toep %p claims to be a synq entry", __func__, toep)); handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0); return (0); } static bool set_ddp_ulp_mode(struct toepcb *toep) { struct adapter *sc = toep->vi->adapter; struct wrqe *wr; struct work_request_hdr *wrh; struct ulp_txpkt *ulpmc; int fields, len; if (!sc->tt.ddp) return (false); fields = 0; /* Overlay region including W_TCB_RX_DDP_FLAGS */ fields += 3; /* W_TCB_ULP_TYPE */ fields++; #ifdef USE_DDP_RX_FLOW_CONTROL /* W_TCB_T_FLAGS */ fields++; #endif len = sizeof(*wrh) + fields * roundup2(LEN__SET_TCB_FIELD_ULP, 16); KASSERT(len <= SGE_MAX_WR_LEN, ("%s: WR with %d TCB field updates too large", __func__, fields)); wr = alloc_wrqe(len, toep->ctrlq); if (wr == NULL) return (false); CTR(KTR_CXGBE, "%s: tid %u", __func__, toep->tid); wrh = wrtod(wr); INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */ ulpmc = (struct ulp_txpkt *)(wrh + 1); /* * Words 26/27 are zero except for the DDP_OFF flag in * W_TCB_RX_DDP_FLAGS (27). */ - ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 26, + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 26, 0xffffffffffffffff, (uint64_t)V_TF_DDP_OFF(1) << 32); /* Words 28/29 are zero. */ - ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 28, + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 28, 0xffffffffffffffff, 0); /* Words 30/31 are zero. */ - ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 30, + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 30, 0xffffffffffffffff, 0); /* Set the ULP mode to ULP_MODE_TCPDDP. 
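set_ddp_ulp_mode() sizes its atomic ULP_TX work request as a header plus one 16-byte-aligned SET_TCB_FIELD sub-command per TCB field it touches. The sketch below only illustrates that arithmetic; the header and LEN__SET_TCB_FIELD_ULP sizes are assumed placeholders, not the real structure sizes.

#include <stdio.h>

/* Round x up to a power-of-two boundary, as the roundup2() macro does. */
#define ROUNDUP2(x, y)	(((x) + ((y) - 1)) & ~((y) - 1))

/*
 * Illustrative sketch (not driver code): one atomic work request holds
 * a header plus a 16-byte-aligned sub-command for each TCB field.
 */
int
main(void)
{
	int hdr_len = 16;		/* assumed work_request_hdr size */
	int field_cmd_len = 24;		/* assumed LEN__SET_TCB_FIELD_ULP */
	int fields, len;

	fields = 3;	/* overlay region covering W_TCB_RX_DDP_FLAGS */
	fields++;	/* W_TCB_ULP_TYPE */
	fields++;	/* W_TCB_T_FLAGS (flow-control build option) */

	len = hdr_len + fields * ROUNDUP2(field_cmd_len, 16);
	printf("WR length: %d bytes (%d fields)\n", len, fields);
	return (0);
}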
*/ toep->params.ulp_mode = ULP_MODE_TCPDDP; - ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_ULP_TYPE, - V_TCB_ULP_TYPE(M_TCB_ULP_TYPE), - V_TCB_ULP_TYPE(ULP_MODE_TCPDDP)); + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_ULP_TYPE, + V_TCB_ULP_TYPE(M_TCB_ULP_TYPE), V_TCB_ULP_TYPE(ULP_MODE_TCPDDP)); #ifdef USE_DDP_RX_FLOW_CONTROL /* Set TF_RX_FLOW_CONTROL_DDP. */ - ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_T_FLAGS, + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_T_FLAGS, V_TF_RX_FLOW_CONTROL_DDP(1), V_TF_RX_FLOW_CONTROL_DDP(1)); #endif ddp_init_toep(toep); t4_wrq_tx(sc, wr); return (true); } static void enable_ddp(struct adapter *sc, struct toepcb *toep) { uint64_t ddp_flags; KASSERT((toep->ddp.flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK, ("%s: toep %p has bad ddp_flags 0x%x", __func__, toep, toep->ddp.flags)); CTR3(KTR_CXGBE, "%s: tid %u (time %u)", __func__, toep->tid, time_uptime); ddp_flags = 0; if ((toep->ddp.flags & DDP_AIO) != 0) ddp_flags |= V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1); DDP_ASSERT_LOCKED(toep); toep->ddp.flags |= DDP_SC_REQ; t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) | V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) | V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1), ddp_flags, 0, 0); t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0); } static int calculate_hcf(int n1, int n2) { int a, b, t; if (n1 <= n2) { a = n1; b = n2; } else { a = n2; b = n1; } while (a != 0) { t = a; a = b % a; b = t; } return (b); } static inline int pages_to_nppods(int npages, int ddp_page_shift) { MPASS(ddp_page_shift >= PAGE_SHIFT); return (howmany(npages >> (ddp_page_shift - PAGE_SHIFT), PPOD_PAGES)); } static int alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx, struct ppod_reservation *prsv) { vmem_addr_t addr; /* relative to start of region */ if (vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT, &addr) != 0) return (ENOMEM); #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d", __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask, nppods, 1 << pr->pr_page_shift[pgsz_idx]); #endif /* * The hardware tagmask includes an extra invalid bit but the arena was * seeded with valid values only. An allocation out of this arena will * fit inside the tagmask but won't have the invalid bit set. */ MPASS((addr & pr->pr_tag_mask) == addr); MPASS((addr & pr->pr_invalid_bit) == 0); prsv->prsv_pr = pr; prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr; prsv->prsv_nppods = nppods; return (0); } static int t4_alloc_page_pods_for_vmpages(struct ppod_region *pr, vm_page_t *pages, int npages, struct ppod_reservation *prsv) { int i, hcf, seglen, idx, nppods; /* * The DDP page size is unrelated to the VM page size. We combine * contiguous physical pages into larger segments to get the best DDP * page size possible. This is the largest of the four sizes in * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in * the page list. 
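calculate_hcf() above is plain Euclid, and pages_to_nppods() converts a VM page count into a page-pod count once the DDP page size is chosen. The standalone sketch below mirrors both; the 4 KB VM page and the PPOD_PAGES value of 4 are assumptions for the example.

#include <stdio.h>

/*
 * Illustrative sketch (not driver code): Euclid's algorithm for the
 * highest common factor of segment lengths, and the conversion from
 * VM pages to page pods at a given DDP page size.
 */
#define PAGE_SHIFT	12		/* assumed 4 KB VM pages */
#define PPOD_PAGES	4		/* assumed DDP pages per pod */

static int
hcf(int n1, int n2)
{
	int a = n1 < n2 ? n1 : n2, b = n1 < n2 ? n2 : n1, t;

	while (a != 0) {
		t = a;
		a = b % a;
		b = t;
	}
	return (b);
}

static int
pages_to_nppods(int npages, int ddp_page_shift)
{
	int ddp_pages = npages >> (ddp_page_shift - PAGE_SHIFT);

	return ((ddp_pages + PPOD_PAGES - 1) / PPOD_PAGES);	/* howmany() */
}

int
main(void)
{
	/* Two contiguous runs of 64 KB and 24 KB share an HCF of 8 KB. */
	printf("hcf = %d\n", hcf(65536, 24576));
	/* 256 x 4 KB pages at a 16 KB DDP page size -> 64 DDP pages -> 16 pods. */
	printf("nppods = %d\n", pages_to_nppods(256, 14));
	return (0);
}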
*/ hcf = 0; for (i = 0; i < npages; i++) { seglen = PAGE_SIZE; while (i < npages - 1 && VM_PAGE_TO_PHYS(pages[i]) + PAGE_SIZE == VM_PAGE_TO_PHYS(pages[i + 1])) { seglen += PAGE_SIZE; i++; } hcf = calculate_hcf(hcf, seglen); if (hcf < (1 << pr->pr_page_shift[1])) { idx = 0; goto have_pgsz; /* give up, short circuit */ } } #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1) MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */ for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) { if ((hcf & PR_PAGE_MASK(idx)) == 0) break; } #undef PR_PAGE_MASK have_pgsz: MPASS(idx <= M_PPOD_PGSZ); nppods = pages_to_nppods(npages, pr->pr_page_shift[idx]); if (alloc_page_pods(pr, nppods, idx, prsv) != 0) return (ENOMEM); MPASS(prsv->prsv_nppods > 0); return (0); } int t4_alloc_page_pods_for_ps(struct ppod_region *pr, struct pageset *ps) { struct ppod_reservation *prsv = &ps->prsv; KASSERT(prsv->prsv_nppods == 0, ("%s: page pods already allocated", __func__)); return (t4_alloc_page_pods_for_vmpages(pr, ps->pages, ps->npages, prsv)); } int t4_alloc_page_pods_for_bio(struct ppod_region *pr, struct bio *bp, struct ppod_reservation *prsv) { MPASS(bp->bio_flags & BIO_UNMAPPED); return (t4_alloc_page_pods_for_vmpages(pr, bp->bio_ma, bp->bio_ma_n, prsv)); } int t4_alloc_page_pods_for_buf(struct ppod_region *pr, vm_offset_t buf, int len, struct ppod_reservation *prsv) { int hcf, seglen, idx, npages, nppods; uintptr_t start_pva, end_pva, pva, p1; MPASS(buf > 0); MPASS(len > 0); /* * The DDP page size is unrelated to the VM page size. We combine * contiguous physical pages into larger segments to get the best DDP * page size possible. This is the largest of the four sizes in * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes * in the page list. */ hcf = 0; start_pva = trunc_page(buf); end_pva = trunc_page(buf + len - 1); pva = start_pva; while (pva <= end_pva) { seglen = PAGE_SIZE; p1 = pmap_kextract(pva); pva += PAGE_SIZE; while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) { seglen += PAGE_SIZE; pva += PAGE_SIZE; } hcf = calculate_hcf(hcf, seglen); if (hcf < (1 << pr->pr_page_shift[1])) { idx = 0; goto have_pgsz; /* give up, short circuit */ } } #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1) MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */ for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) { if ((hcf & PR_PAGE_MASK(idx)) == 0) break; } #undef PR_PAGE_MASK have_pgsz: MPASS(idx <= M_PPOD_PGSZ); npages = 1; npages += (end_pva - start_pva) >> pr->pr_page_shift[idx]; nppods = howmany(npages, PPOD_PAGES); if (alloc_page_pods(pr, nppods, idx, prsv) != 0) return (ENOMEM); MPASS(prsv->prsv_nppods > 0); return (0); } static int t4_alloc_page_pods_for_rcvbuf(struct ppod_region *pr, struct ddp_rcv_buffer *drb) { struct ppod_reservation *prsv = &drb->prsv; KASSERT(prsv->prsv_nppods == 0, ("%s: page pods already allocated", __func__)); return (t4_alloc_page_pods_for_buf(pr, (vm_offset_t)drb->buf, drb->len, prsv)); } int t4_alloc_page_pods_for_sgl(struct ppod_region *pr, struct ctl_sg_entry *sgl, int entries, struct ppod_reservation *prsv) { int hcf, seglen, idx = 0, npages, nppods, i, len; uintptr_t start_pva, end_pva, pva, p1 ; vm_offset_t buf; struct ctl_sg_entry *sge; MPASS(entries > 0); MPASS(sgl); /* * The DDP page size is unrelated to the VM page size. We combine * contiguous physical pages into larger segments to get the best DDP * page size possible. 
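Once the HCF of all physically contiguous segment lengths is known, the code above picks the largest programmed DDP page size that still divides it evenly. The sketch below walks the same loop with example page shifts; the real shifts are programmed via A_ULP_RX_TDDP_PSZ / A_ULP_RX_ISCSI_PSZ and need not match these.

#include <stdio.h>

/*
 * Illustrative sketch (not driver code): choose the largest of four
 * DDP page sizes that evenly divides the HCF of the segment lengths.
 * The shifts below (4 KB .. 16 MB) are example values only.
 */
int
main(void)
{
	int page_shift[4] = { 12, 16, 20, 24 };	/* assumed HPZ0..HPZ3 */
	int hcf = 1 << 20;			/* e.g. every segment is 1 MB-aligned */
	int idx;

	for (idx = 3; idx > 0; idx--) {
		if ((hcf & ((1 << page_shift[idx]) - 1)) == 0)
			break;
	}
	printf("use DDP page size %d KB (idx %d)\n",
	    (1 << page_shift[idx]) / 1024, idx);
	return (0);
}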
This is the largest of the four sizes in * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes * in the page list. */ hcf = 0; for (i = entries - 1; i >= 0; i--) { sge = sgl + i; buf = (vm_offset_t)sge->addr; len = sge->len; start_pva = trunc_page(buf); end_pva = trunc_page(buf + len - 1); pva = start_pva; while (pva <= end_pva) { seglen = PAGE_SIZE; p1 = pmap_kextract(pva); pva += PAGE_SIZE; while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) { seglen += PAGE_SIZE; pva += PAGE_SIZE; } hcf = calculate_hcf(hcf, seglen); if (hcf < (1 << pr->pr_page_shift[1])) { idx = 0; goto have_pgsz; /* give up, short circuit */ } } } #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1) MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */ for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) { if ((hcf & PR_PAGE_MASK(idx)) == 0) break; } #undef PR_PAGE_MASK have_pgsz: MPASS(idx <= M_PPOD_PGSZ); npages = 0; while (entries--) { npages++; start_pva = trunc_page((vm_offset_t)sgl->addr); end_pva = trunc_page((vm_offset_t)sgl->addr + sgl->len - 1); npages += (end_pva - start_pva) >> pr->pr_page_shift[idx]; sgl = sgl + 1; } nppods = howmany(npages, PPOD_PAGES); if (alloc_page_pods(pr, nppods, idx, prsv) != 0) return (ENOMEM); MPASS(prsv->prsv_nppods > 0); return (0); } void t4_free_page_pods(struct ppod_reservation *prsv) { struct ppod_region *pr = prsv->prsv_pr; vmem_addr_t addr; MPASS(prsv != NULL); MPASS(prsv->prsv_nppods != 0); addr = prsv->prsv_tag & pr->pr_tag_mask; MPASS((addr & pr->pr_invalid_bit) == 0); #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__, pr->pr_arena, addr, prsv->prsv_nppods); #endif vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods)); prsv->prsv_nppods = 0; } #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE) int t4_write_page_pods_for_ps(struct adapter *sc, struct sge_wrq *wrq, int tid, struct pageset *ps) { struct wrqe *wr; struct ulp_mem_io *ulpmc; struct ulptx_idata *ulpsc; struct pagepod *ppod; int i, j, k, n, chunk, len, ddp_pgsz, idx; u_int ppod_addr; uint32_t cmd; struct ppod_reservation *prsv = &ps->prsv; struct ppod_region *pr = prsv->prsv_pr; vm_paddr_t pa; KASSERT(!(ps->flags & PS_PPODS_WRITTEN), ("%s: page pods already written", __func__)); MPASS(prsv->prsv_nppods > 0); cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE)); if (is_t4(sc)) cmd |= htobe32(F_ULP_MEMIO_ORDER); else cmd |= htobe32(F_T5_ULP_MEMIO_IMM); ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)]; ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask); for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) { /* How many page pods are we writing in this cycle */ n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS); chunk = PPOD_SZ(n); len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16); wr = alloc_wrqe(len, wrq); if (wr == NULL) return (ENOMEM); /* ok to just bail out */ ulpmc = wrtod(wr); INIT_ULPTX_WR(ulpmc, len, 0, 0); ulpmc->cmd = cmd; ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32)); ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16)); ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(chunk); ppod = (struct pagepod *)(ulpsc + 1); for (j = 0; j < n; i++, j++, ppod++) { ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID | V_PPOD_TID(tid) | prsv->prsv_tag); ppod->len_offset = htobe64(V_PPOD_LEN(ps->len) | V_PPOD_OFST(ps->offset)); ppod->rsvd = 
0; idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE); for (k = 0; k < nitems(ppod->addr); k++) { if (idx < ps->npages) { pa = VM_PAGE_TO_PHYS(ps->pages[idx]); ppod->addr[k] = htobe64(pa); idx += ddp_pgsz / PAGE_SIZE; } else ppod->addr[k] = 0; #if 0 CTR5(KTR_CXGBE, "%s: tid %d ppod[%d]->addr[%d] = %p", __func__, tid, i, k, be64toh(ppod->addr[k])); #endif } } t4_wrq_tx(sc, wr); } ps->flags |= PS_PPODS_WRITTEN; return (0); } static int t4_write_page_pods_for_rcvbuf(struct adapter *sc, struct sge_wrq *wrq, int tid, struct ddp_rcv_buffer *drb) { struct wrqe *wr; struct ulp_mem_io *ulpmc; struct ulptx_idata *ulpsc; struct pagepod *ppod; int i, j, k, n, chunk, len, ddp_pgsz; u_int ppod_addr, offset; uint32_t cmd; struct ppod_reservation *prsv = &drb->prsv; struct ppod_region *pr = prsv->prsv_pr; uintptr_t end_pva, pva; vm_paddr_t pa; MPASS(prsv->prsv_nppods > 0); cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE)); if (is_t4(sc)) cmd |= htobe32(F_ULP_MEMIO_ORDER); else cmd |= htobe32(F_T5_ULP_MEMIO_IMM); ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)]; offset = (uintptr_t)drb->buf & PAGE_MASK; ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask); pva = trunc_page((uintptr_t)drb->buf); end_pva = trunc_page((uintptr_t)drb->buf + drb->len - 1); for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) { /* How many page pods are we writing in this cycle */ n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS); MPASS(n > 0); chunk = PPOD_SZ(n); len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16); wr = alloc_wrqe(len, wrq); if (wr == NULL) return (ENOMEM); /* ok to just bail out */ ulpmc = wrtod(wr); INIT_ULPTX_WR(ulpmc, len, 0, 0); ulpmc->cmd = cmd; ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32)); ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16)); ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(chunk); ppod = (struct pagepod *)(ulpsc + 1); for (j = 0; j < n; i++, j++, ppod++) { ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID | V_PPOD_TID(tid) | prsv->prsv_tag); ppod->len_offset = htobe64(V_PPOD_LEN(drb->len) | V_PPOD_OFST(offset)); ppod->rsvd = 0; for (k = 0; k < nitems(ppod->addr); k++) { if (pva > end_pva) ppod->addr[k] = 0; else { pa = pmap_kextract(pva); ppod->addr[k] = htobe64(pa); pva += ddp_pgsz; } #if 0 CTR5(KTR_CXGBE, "%s: tid %d ppod[%d]->addr[%d] = %p", __func__, tid, i, k, be64toh(ppod->addr[k])); #endif } /* * Walk back 1 segment so that the first address in the * next pod is the same as the last one in the current * pod. 
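Each page pod carries PPOD_PAGES addresses plus one extra slot, and the writer steps back one DDP page after finishing a pod so its last address repeats as the first address of the next pod (the "walk back 1 segment" comment above). The sketch below shows that overlap for a physically contiguous buffer; the 5-slot pod and 4 KB DDP page size are assumptions.

#include <stdio.h>

/*
 * Illustrative sketch (not driver code): fill successive pods from a
 * contiguous buffer, repeating the last address of each pod as the
 * first address of the next.
 */
#define PPOD_ADDRS	5		/* assumed PPOD_PAGES + 1 overlap slot */

int
main(void)
{
	unsigned long pa = 0x100000;	/* start of a contiguous buffer */
	unsigned long end = 0x100000 + 8 * 4096;
	int ddp_pgsz = 4096;
	int pod, k;

	for (pod = 0; pod < 2; pod++) {
		printf("pod %d:", pod);
		for (k = 0; k < PPOD_ADDRS; k++) {
			if (pa < end) {
				printf(" %#lx", pa);
				pa += ddp_pgsz;
			} else
				printf(" 0");
		}
		pa -= ddp_pgsz;		/* overlap with the next pod */
		printf("\n");
	}
	return (0);
}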
*/ pva -= ddp_pgsz; } t4_wrq_tx(sc, wr); } MPASS(pva <= end_pva); return (0); } static struct mbuf * alloc_raw_wr_mbuf(int len) { struct mbuf *m; if (len <= MHLEN) m = m_gethdr(M_NOWAIT, MT_DATA); else if (len <= MCLBYTES) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else m = NULL; if (m == NULL) return (NULL); m->m_pkthdr.len = len; m->m_len = len; set_mbuf_raw_wr(m, true); return (m); } int t4_write_page_pods_for_bio(struct adapter *sc, struct toepcb *toep, struct ppod_reservation *prsv, struct bio *bp, struct mbufq *wrq) { struct ulp_mem_io *ulpmc; struct ulptx_idata *ulpsc; struct pagepod *ppod; int i, j, k, n, chunk, len, ddp_pgsz, idx; u_int ppod_addr; uint32_t cmd; struct ppod_region *pr = prsv->prsv_pr; vm_paddr_t pa; struct mbuf *m; MPASS(bp->bio_flags & BIO_UNMAPPED); cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE)); if (is_t4(sc)) cmd |= htobe32(F_ULP_MEMIO_ORDER); else cmd |= htobe32(F_T5_ULP_MEMIO_IMM); ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)]; ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask); for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) { /* How many page pods are we writing in this cycle */ n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS); MPASS(n > 0); chunk = PPOD_SZ(n); len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16); m = alloc_raw_wr_mbuf(len); if (m == NULL) return (ENOMEM); ulpmc = mtod(m, struct ulp_mem_io *); INIT_ULPTX_WR(ulpmc, len, 0, toep->tid); ulpmc->cmd = cmd; ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32)); ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16)); ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(chunk); ppod = (struct pagepod *)(ulpsc + 1); for (j = 0; j < n; i++, j++, ppod++) { ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID | V_PPOD_TID(toep->tid) | (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ))); ppod->len_offset = htobe64(V_PPOD_LEN(bp->bio_bcount) | V_PPOD_OFST(bp->bio_ma_offset)); ppod->rsvd = 0; idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE); for (k = 0; k < nitems(ppod->addr); k++) { if (idx < bp->bio_ma_n) { pa = VM_PAGE_TO_PHYS(bp->bio_ma[idx]); ppod->addr[k] = htobe64(pa); idx += ddp_pgsz / PAGE_SIZE; } else ppod->addr[k] = 0; #if 0 CTR5(KTR_CXGBE, "%s: tid %d ppod[%d]->addr[%d] = %p", __func__, toep->tid, i, k, be64toh(ppod->addr[k])); #endif } } mbufq_enqueue(wrq, m); } return (0); } int t4_write_page_pods_for_buf(struct adapter *sc, struct toepcb *toep, struct ppod_reservation *prsv, vm_offset_t buf, int buflen, struct mbufq *wrq) { struct ulp_mem_io *ulpmc; struct ulptx_idata *ulpsc; struct pagepod *ppod; int i, j, k, n, chunk, len, ddp_pgsz; u_int ppod_addr, offset; uint32_t cmd; struct ppod_region *pr = prsv->prsv_pr; uintptr_t end_pva, pva; vm_paddr_t pa; struct mbuf *m; cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE)); if (is_t4(sc)) cmd |= htobe32(F_ULP_MEMIO_ORDER); else cmd |= htobe32(F_T5_ULP_MEMIO_IMM); ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)]; offset = buf & PAGE_MASK; ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask); pva = trunc_page(buf); end_pva = trunc_page(buf + buflen - 1); for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) { /* How many page pods are we writing in this cycle */ n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS); MPASS(n > 0); chunk = PPOD_SZ(n); len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16); m = alloc_raw_wr_mbuf(len); if (m == NULL) return 
(ENOMEM); ulpmc = mtod(m, struct ulp_mem_io *); INIT_ULPTX_WR(ulpmc, len, 0, toep->tid); ulpmc->cmd = cmd; ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32)); ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16)); ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(chunk); ppod = (struct pagepod *)(ulpsc + 1); for (j = 0; j < n; i++, j++, ppod++) { ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID | V_PPOD_TID(toep->tid) | (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ))); ppod->len_offset = htobe64(V_PPOD_LEN(buflen) | V_PPOD_OFST(offset)); ppod->rsvd = 0; for (k = 0; k < nitems(ppod->addr); k++) { if (pva > end_pva) ppod->addr[k] = 0; else { pa = pmap_kextract(pva); ppod->addr[k] = htobe64(pa); pva += ddp_pgsz; } #if 0 CTR5(KTR_CXGBE, "%s: tid %d ppod[%d]->addr[%d] = %p", __func__, toep->tid, i, k, be64toh(ppod->addr[k])); #endif } /* * Walk back 1 segment so that the first address in the * next pod is the same as the last one in the current * pod. */ pva -= ddp_pgsz; } mbufq_enqueue(wrq, m); } MPASS(pva <= end_pva); return (0); } int t4_write_page_pods_for_sgl(struct adapter *sc, struct toepcb *toep, struct ppod_reservation *prsv, struct ctl_sg_entry *sgl, int entries, int xferlen, struct mbufq *wrq) { struct ulp_mem_io *ulpmc; struct ulptx_idata *ulpsc; struct pagepod *ppod; int i, j, k, n, chunk, len, ddp_pgsz; u_int ppod_addr, offset, sg_offset = 0; uint32_t cmd; struct ppod_region *pr = prsv->prsv_pr; uintptr_t pva; vm_paddr_t pa; struct mbuf *m; MPASS(sgl != NULL); MPASS(entries > 0); cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE)); if (is_t4(sc)) cmd |= htobe32(F_ULP_MEMIO_ORDER); else cmd |= htobe32(F_T5_ULP_MEMIO_IMM); ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)]; offset = (vm_offset_t)sgl->addr & PAGE_MASK; ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask); pva = trunc_page((vm_offset_t)sgl->addr); for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) { /* How many page pods are we writing in this cycle */ n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS); MPASS(n > 0); chunk = PPOD_SZ(n); len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16); m = alloc_raw_wr_mbuf(len); if (m == NULL) return (ENOMEM); ulpmc = mtod(m, struct ulp_mem_io *); INIT_ULPTX_WR(ulpmc, len, 0, toep->tid); ulpmc->cmd = cmd; ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32)); ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16)); ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(chunk); ppod = (struct pagepod *)(ulpsc + 1); for (j = 0; j < n; i++, j++, ppod++) { ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID | V_PPOD_TID(toep->tid) | (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ))); ppod->len_offset = htobe64(V_PPOD_LEN(xferlen) | V_PPOD_OFST(offset)); ppod->rsvd = 0; for (k = 0; k < nitems(ppod->addr); k++) { if (entries != 0) { pa = pmap_kextract(pva + sg_offset); ppod->addr[k] = htobe64(pa); } else ppod->addr[k] = 0; #if 0 CTR5(KTR_CXGBE, "%s: tid %d ppod[%d]->addr[%d] = %p", __func__, toep->tid, i, k, be64toh(ppod->addr[k])); #endif /* * If this is the last entry in a pod, * reuse the same entry for first address * in the next pod. */ if (k + 1 == nitems(ppod->addr)) break; /* * Don't move to the next DDP page if the * sgl is already finished. 
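Page pods are flushed to adapter memory in chunks small enough to ride as immediate data in one ULP_TX_MEM_WRITE, and the command encodes both length and address in 32-byte units (the chunk / 32 and ppod_addr >> 5 expressions above). A sketch of that chunking follows; the 64-byte pod size, and therefore the four-pods-per-WR figure, is an assumption.

#include <stdio.h>

/*
 * Illustrative sketch (not driver code): write pods in chunks of up to
 * 256 bytes of immediate data per work request, expressing length and
 * address in 32-byte units.
 */
#define PPOD_SIZE	64		/* assumed pod size in bytes */
#define IMM_PPODS	(256 / PPOD_SIZE)

int
main(void)
{
	unsigned int ppod_addr = 0x20000;	/* placeholder region offset */
	int nppods = 10, i, n, chunk;

	for (i = 0; i < nppods; i += n, ppod_addr += chunk) {
		n = nppods - i < IMM_PPODS ? nppods - i : IMM_PPODS;
		chunk = n * PPOD_SIZE;
		printf("WR: %d pods, dlen=%d (32B units), addr=%#x (32B units)\n",
		    n, chunk / 32, ppod_addr >> 5);
	}
	return (0);
}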
*/ if (entries == 0) continue; sg_offset += ddp_pgsz; if (sg_offset == sgl->len) { /* * This sgl entry is done. Go * to the next. */ entries--; sgl++; sg_offset = 0; if (entries != 0) pva = trunc_page( (vm_offset_t)sgl->addr); } } } mbufq_enqueue(wrq, m); } return (0); } /* * Prepare a pageset for DDP. This sets up page pods. */ static int prep_pageset(struct adapter *sc, struct toepcb *toep, struct pageset *ps) { struct tom_data *td = sc->tom_softc; if (ps->prsv.prsv_nppods == 0 && t4_alloc_page_pods_for_ps(&td->pr, ps) != 0) { return (0); } if (!(ps->flags & PS_PPODS_WRITTEN) && t4_write_page_pods_for_ps(sc, toep->ctrlq, toep->tid, ps) != 0) { return (0); } return (1); } int t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz, const char *name) { int i; MPASS(pr != NULL); MPASS(r->size > 0); pr->pr_start = r->start; pr->pr_len = r->size; pr->pr_page_shift[0] = 12 + G_HPZ0(psz); pr->pr_page_shift[1] = 12 + G_HPZ1(psz); pr->pr_page_shift[2] = 12 + G_HPZ2(psz); pr->pr_page_shift[3] = 12 + G_HPZ3(psz); /* The SGL -> page pod algorithm requires the sizes to be in order. */ for (i = 1; i < nitems(pr->pr_page_shift); i++) { if (pr->pr_page_shift[i] <= pr->pr_page_shift[i - 1]) return (ENXIO); } pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG); pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask; if (pr->pr_tag_mask == 0 || pr->pr_alias_mask == 0) return (ENXIO); pr->pr_alias_shift = fls(pr->pr_tag_mask); pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1); pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0, M_FIRSTFIT | M_NOWAIT); if (pr->pr_arena == NULL) return (ENOMEM); return (0); } void t4_free_ppod_region(struct ppod_region *pr) { MPASS(pr != NULL); if (pr->pr_arena) vmem_destroy(pr->pr_arena); bzero(pr, sizeof(*pr)); } static int pscmp(struct pageset *ps, struct vmspace *vm, vm_offset_t start, int npages, int pgoff, int len) { if (ps->start != start || ps->npages != npages || ps->offset != pgoff || ps->len != len) return (1); return (ps->vm != vm || ps->vm_timestamp != vm->vm_map.timestamp); } static int hold_aio(struct toepcb *toep, struct kaiocb *job, struct pageset **pps) { struct vmspace *vm; vm_map_t map; vm_offset_t start, end, pgoff; struct pageset *ps; int n; DDP_ASSERT_LOCKED(toep); /* * The AIO subsystem will cancel and drain all requests before * permitting a process to exit or exec, so p_vmspace should * be stable here. */ vm = job->userproc->p_vmspace; map = &vm->vm_map; start = (uintptr_t)job->uaiocb.aio_buf; pgoff = start & PAGE_MASK; end = round_page(start + job->uaiocb.aio_nbytes); start = trunc_page(start); if (end - start > MAX_DDP_BUFFER_SIZE) { /* * Truncate the request to a short read. * Alternatively, we could DDP in chunks to the larger * buffer, but that would be quite a bit more work. * * When truncating, round the request down to avoid * crossing a cache line on the final transaction. */ end = rounddown2(start + MAX_DDP_BUFFER_SIZE, CACHE_LINE_SIZE); #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d, truncating size from %lu to %lu", __func__, toep->tid, (unsigned long)job->uaiocb.aio_nbytes, (unsigned long)(end - (start + pgoff))); job->uaiocb.aio_nbytes = end - (start + pgoff); #endif end = round_page(end); } n = atop(end - start); /* * Try to reuse a cached pageset. 
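hold_aio() page-aligns the user buffer and, when it is larger than DDP can describe, truncates the request to a short read ending on a cache-line boundary before wiring the pages. The arithmetic is sketched below with assumed 4 KB pages, 64-byte cache lines, and a 1 MB stand-in for MAX_DDP_BUFFER_SIZE.

#include <stdio.h>

/*
 * Illustrative sketch (not driver code): align the user buffer to page
 * boundaries and truncate oversized requests to a short read that ends
 * on a cache-line boundary.
 */
#define PAGE_SZ		4096UL
#define CACHE_LINE	64UL
#define MAX_DDP		(1UL << 20)	/* assumed MAX_DDP_BUFFER_SIZE */

#define TRUNC(x, a)	((x) & ~((a) - 1))
#define ROUND(x, a)	TRUNC((x) + (a) - 1, (a))

int
main(void)
{
	unsigned long buf = 0x800123;		/* unaligned user address */
	unsigned long nbytes = 3UL << 20;	/* 3 MB request */
	unsigned long start, end, pgoff, npages;

	pgoff = buf & (PAGE_SZ - 1);
	start = TRUNC(buf, PAGE_SZ);
	end = ROUND(buf + nbytes, PAGE_SZ);
	if (end - start > MAX_DDP) {
		end = TRUNC(start + MAX_DDP, CACHE_LINE);
		nbytes = end - (start + pgoff);	/* short read */
		end = ROUND(end, PAGE_SZ);
	}
	npages = (end - start) / PAGE_SZ;
	printf("hold %lu pages, short read of %lu bytes\n", npages, nbytes);
	return (0);
}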
*/ TAILQ_FOREACH(ps, &toep->ddp.cached_pagesets, link) { if (pscmp(ps, vm, start, n, pgoff, job->uaiocb.aio_nbytes) == 0) { TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link); toep->ddp.cached_count--; *pps = ps; return (0); } } /* * If there are too many cached pagesets to create a new one, * free a pageset before creating a new one. */ KASSERT(toep->ddp.active_count + toep->ddp.cached_count <= nitems(toep->ddp.db), ("%s: too many wired pagesets", __func__)); if (toep->ddp.active_count + toep->ddp.cached_count == nitems(toep->ddp.db)) { KASSERT(toep->ddp.cached_count > 0, ("no cached pageset to free")); ps = TAILQ_LAST(&toep->ddp.cached_pagesets, pagesetq); TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link); toep->ddp.cached_count--; free_pageset(toep->td, ps); } DDP_UNLOCK(toep); /* Create a new pageset. */ ps = malloc(sizeof(*ps) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK | M_ZERO); ps->pages = (vm_page_t *)(ps + 1); ps->vm_timestamp = map->timestamp; ps->npages = vm_fault_quick_hold_pages(map, start, end - start, VM_PROT_WRITE, ps->pages, n); DDP_LOCK(toep); if (ps->npages < 0) { free(ps, M_CXGBE); return (EFAULT); } KASSERT(ps->npages == n, ("hold_aio: page count mismatch: %d vs %d", ps->npages, n)); ps->offset = pgoff; ps->len = job->uaiocb.aio_nbytes; refcount_acquire(&vm->vm_refcnt); ps->vm = vm; ps->start = start; CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d", __func__, toep->tid, ps, job, ps->npages); *pps = ps; return (0); } static void ddp_complete_all(struct toepcb *toep, int error) { struct kaiocb *job; DDP_ASSERT_LOCKED(toep); KASSERT((toep->ddp.flags & DDP_AIO) != 0, ("%s: DDP_RCVBUF", __func__)); while (!TAILQ_EMPTY(&toep->ddp.aiojobq)) { job = TAILQ_FIRST(&toep->ddp.aiojobq); TAILQ_REMOVE(&toep->ddp.aiojobq, job, list); toep->ddp.waiting_count--; if (aio_clear_cancel_function(job)) ddp_complete_one(job, error); } } static void aio_ddp_cancel_one(struct kaiocb *job) { long copied; /* * If this job had copied data out of the socket buffer before * it was cancelled, report it as a short read rather than an * error. */ copied = job->aio_received; if (copied != 0) aio_complete(job, copied, 0); else aio_cancel(job); } /* * Called when the main loop wants to requeue a job to retry it later. * Deals with the race of the job being cancelled while it was being * examined. */ static void aio_ddp_requeue_one(struct toepcb *toep, struct kaiocb *job) { DDP_ASSERT_LOCKED(toep); if (!(toep->ddp.flags & DDP_DEAD) && aio_set_cancel_function(job, t4_aio_cancel_queued)) { TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list); toep->ddp.waiting_count++; } else aio_ddp_cancel_one(job); } static void aio_ddp_requeue(struct toepcb *toep) { struct adapter *sc = td_adapter(toep->td); struct socket *so; struct sockbuf *sb; struct inpcb *inp; struct kaiocb *job; struct ddp_buffer *db; size_t copied, offset, resid; struct pageset *ps; struct mbuf *m; uint64_t ddp_flags, ddp_flags_mask; struct wrqe *wr; int buf_flag, db_idx, error; DDP_ASSERT_LOCKED(toep); restart: if (toep->ddp.flags & DDP_DEAD) { MPASS(toep->ddp.waiting_count == 0); MPASS(toep->ddp.active_count == 0); return; } if (toep->ddp.waiting_count == 0 || toep->ddp.active_count == nitems(toep->ddp.db)) { return; } job = TAILQ_FIRST(&toep->ddp.aiojobq); so = job->fd_file->f_data; sb = &so->so_rcv; SOCKBUF_LOCK(sb); /* We will never get anything unless we are or were connected. 
*/ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { SOCKBUF_UNLOCK(sb); ddp_complete_all(toep, ENOTCONN); return; } KASSERT(toep->ddp.active_count == 0 || sbavail(sb) == 0, ("%s: pending sockbuf data and DDP is active", __func__)); /* Abort if socket has reported problems. */ /* XXX: Wait for any queued DDP's to finish and/or flush them? */ if (so->so_error && sbavail(sb) == 0) { toep->ddp.waiting_count--; TAILQ_REMOVE(&toep->ddp.aiojobq, job, list); if (!aio_clear_cancel_function(job)) { SOCKBUF_UNLOCK(sb); goto restart; } /* * If this job has previously copied some data, report * a short read and leave the error to be reported by * a future request. */ copied = job->aio_received; if (copied != 0) { SOCKBUF_UNLOCK(sb); aio_complete(job, copied, 0); goto restart; } error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(sb); aio_complete(job, -1, error); goto restart; } /* * Door is closed. If there is pending data in the socket buffer, * deliver it. If there are pending DDP requests, wait for those * to complete. Once they have completed, return EOF reads. */ if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) { SOCKBUF_UNLOCK(sb); if (toep->ddp.active_count != 0) return; ddp_complete_all(toep, 0); return; } /* * If DDP is not enabled and there is no pending socket buffer * data, try to enable DDP. */ if (sbavail(sb) == 0 && (toep->ddp.flags & DDP_ON) == 0) { SOCKBUF_UNLOCK(sb); /* * Wait for the card to ACK that DDP is enabled before * queueing any buffers. Currently this waits for an * indicate to arrive. This could use a TCB_SET_FIELD_RPL * message to know that DDP was enabled instead of waiting * for the indicate which would avoid copying the indicate * if no data is pending. * * XXX: Might want to limit the indicate size to the size * of the first queued request. */ if ((toep->ddp.flags & DDP_SC_REQ) == 0) enable_ddp(sc, toep); return; } SOCKBUF_UNLOCK(sb); /* * If another thread is queueing a buffer for DDP, let it * drain any work and return. */ if (toep->ddp.queueing != NULL) return; /* Take the next job to prep it for DDP. */ toep->ddp.waiting_count--; TAILQ_REMOVE(&toep->ddp.aiojobq, job, list); if (!aio_clear_cancel_function(job)) goto restart; toep->ddp.queueing = job; /* NB: This drops DDP_LOCK while it holds the backing VM pages. */ error = hold_aio(toep, job, &ps); if (error != 0) { ddp_complete_one(job, error); toep->ddp.queueing = NULL; goto restart; } SOCKBUF_LOCK(sb); if (so->so_error && sbavail(sb) == 0) { copied = job->aio_received; if (copied != 0) { SOCKBUF_UNLOCK(sb); recycle_pageset(toep, ps); aio_complete(job, copied, 0); toep->ddp.queueing = NULL; goto restart; } error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(sb); recycle_pageset(toep, ps); aio_complete(job, -1, error); toep->ddp.queueing = NULL; goto restart; } if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) { SOCKBUF_UNLOCK(sb); recycle_pageset(toep, ps); if (toep->ddp.active_count != 0) { /* * The door is closed, but there are still pending * DDP buffers. Requeue. These jobs will all be * completed once those buffers drain. */ aio_ddp_requeue_one(toep, job); toep->ddp.queueing = NULL; return; } ddp_complete_one(job, 0); ddp_complete_all(toep, 0); toep->ddp.queueing = NULL; return; } sbcopy: /* * If the toep is dead, there shouldn't be any data in the socket * buffer, so the above case should have handled this. 
*/ MPASS(!(toep->ddp.flags & DDP_DEAD)); /* * If there is pending data in the socket buffer (either * from before the requests were queued or a DDP indicate), * copy those mbufs out directly. */ copied = 0; offset = ps->offset + job->aio_received; MPASS(job->aio_received <= job->uaiocb.aio_nbytes); resid = job->uaiocb.aio_nbytes - job->aio_received; m = sb->sb_mb; KASSERT(m == NULL || toep->ddp.active_count == 0, ("%s: sockbuf data with active DDP", __func__)); while (m != NULL && resid > 0) { struct iovec iov[1]; struct uio uio; #ifdef INVARIANTS int error; #endif iov[0].iov_base = mtod(m, void *); iov[0].iov_len = m->m_len; if (iov[0].iov_len > resid) iov[0].iov_len = resid; uio.uio_iov = iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = iov[0].iov_len; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; #ifdef INVARIANTS error = uiomove_fromphys(ps->pages, offset + copied, uio.uio_resid, &uio); #else uiomove_fromphys(ps->pages, offset + copied, uio.uio_resid, &uio); #endif MPASS(error == 0 && uio.uio_resid == 0); copied += uio.uio_offset; resid -= uio.uio_offset; m = m->m_next; } if (copied != 0) { sbdrop_locked(sb, copied); job->aio_received += copied; job->msgrcv = 1; copied = job->aio_received; inp = sotoinpcb(so); if (!INP_TRY_WLOCK(inp)) { /* * The reference on the socket file descriptor in * the AIO job should keep 'sb' and 'inp' stable. * Our caller has a reference on the 'toep' that * keeps it stable. */ SOCKBUF_UNLOCK(sb); DDP_UNLOCK(toep); INP_WLOCK(inp); DDP_LOCK(toep); SOCKBUF_LOCK(sb); /* * If the socket has been closed, we should detect * that and complete this request if needed on * the next trip around the loop. */ } t4_rcvd_locked(&toep->td->tod, intotcpcb(inp)); INP_WUNLOCK(inp); if (resid == 0 || toep->ddp.flags & DDP_DEAD) { /* * We filled the entire buffer with socket * data, DDP is not being used, or the socket * is being shut down, so complete the * request. */ SOCKBUF_UNLOCK(sb); recycle_pageset(toep, ps); aio_complete(job, copied, 0); toep->ddp.queueing = NULL; goto restart; } /* * If DDP is not enabled, requeue this request and restart. * This will either enable DDP or wait for more data to * arrive on the socket buffer. */ if ((toep->ddp.flags & (DDP_ON | DDP_SC_REQ)) != DDP_ON) { SOCKBUF_UNLOCK(sb); recycle_pageset(toep, ps); aio_ddp_requeue_one(toep, job); toep->ddp.queueing = NULL; goto restart; } /* * An indicate might have arrived and been added to * the socket buffer while it was unlocked after the * copy to lock the INP. If so, restart the copy. */ if (sbavail(sb) != 0) goto sbcopy; } SOCKBUF_UNLOCK(sb); if (prep_pageset(sc, toep, ps) == 0) { recycle_pageset(toep, ps); aio_ddp_requeue_one(toep, job); toep->ddp.queueing = NULL; /* * XXX: Need to retry this later. Mostly need a trigger * when page pods are freed up. */ printf("%s: prep_pageset failed\n", __func__); return; } /* Determine which DDP buffer to use. 
*/ if (toep->ddp.db[0].job == NULL) { db_idx = 0; } else { MPASS(toep->ddp.db[1].job == NULL); db_idx = 1; } ddp_flags = 0; ddp_flags_mask = 0; if (db_idx == 0) { ddp_flags |= V_TF_DDP_BUF0_VALID(1); if (so->so_state & SS_NBIO) ddp_flags |= V_TF_DDP_BUF0_FLUSH(1); ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) | V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) | V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1); buf_flag = DDP_BUF0_ACTIVE; } else { ddp_flags |= V_TF_DDP_BUF1_VALID(1); if (so->so_state & SS_NBIO) ddp_flags |= V_TF_DDP_BUF1_FLUSH(1); ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) | V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1); buf_flag = DDP_BUF1_ACTIVE; } MPASS((toep->ddp.flags & buf_flag) == 0); if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) { MPASS(db_idx == 0); MPASS(toep->ddp.active_id == -1); MPASS(toep->ddp.active_count == 0); ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1); } /* * The TID for this connection should still be valid. If DDP_DEAD * is set, SBS_CANTRCVMORE should be set, so we shouldn't be * this far anyway. Even if the socket is closing on the other * end, the AIO job holds a reference on this end of the socket * which will keep it open and keep the TCP PCB attached until * after the job is completed. */ wr = mk_update_tcb_for_ddp(sc, toep, db_idx, &ps->prsv, ps->len, job->aio_received, ddp_flags, ddp_flags_mask); if (wr == NULL) { recycle_pageset(toep, ps); aio_ddp_requeue_one(toep, job); toep->ddp.queueing = NULL; /* * XXX: Need a way to kick a retry here. * * XXX: We know the fixed size needed and could * preallocate this using a blocking request at the * start of the task to avoid having to handle this * edge case. */ printf("%s: mk_update_tcb_for_ddp failed\n", __func__); return; } if (!aio_set_cancel_function(job, t4_aio_cancel_active)) { free_wrqe(wr); recycle_pageset(toep, ps); aio_ddp_cancel_one(job); toep->ddp.queueing = NULL; goto restart; } #ifdef VERBOSE_TRACES CTR6(KTR_CXGBE, "%s: tid %u, scheduling %p for DDP[%d] (flags %#lx/%#lx)", __func__, toep->tid, job, db_idx, ddp_flags, ddp_flags_mask); #endif /* Give the chip the go-ahead. */ t4_wrq_tx(sc, wr); db = &toep->ddp.db[db_idx]; db->cancel_pending = 0; db->job = job; db->ps = ps; toep->ddp.queueing = NULL; toep->ddp.flags |= buf_flag; toep->ddp.active_count++; if (toep->ddp.active_count == 1) { MPASS(toep->ddp.active_id == -1); toep->ddp.active_id = db_idx; CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__, toep->ddp.active_id); } goto restart; } void ddp_queue_toep(struct toepcb *toep) { DDP_ASSERT_LOCKED(toep); if (toep->ddp.flags & DDP_TASK_ACTIVE) return; toep->ddp.flags |= DDP_TASK_ACTIVE; hold_toepcb(toep); soaio_enqueue(&toep->ddp.requeue_task); } static void aio_ddp_requeue_task(void *context, int pending) { struct toepcb *toep = context; DDP_LOCK(toep); aio_ddp_requeue(toep); toep->ddp.flags &= ~DDP_TASK_ACTIVE; DDP_UNLOCK(toep); free_toepcb(toep); } static void t4_aio_cancel_active(struct kaiocb *job) { struct socket *so = job->fd_file->f_data; struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; struct adapter *sc = td_adapter(toep->td); uint64_t valid_flag; int i; DDP_LOCK(toep); if (aio_cancel_cleared(job)) { DDP_UNLOCK(toep); aio_ddp_cancel_one(job); return; } for (i = 0; i < nitems(toep->ddp.db); i++) { if (toep->ddp.db[i].job == job) { /* Should only ever get one cancel request for a job. */ MPASS(toep->ddp.db[i].cancel_pending == 0); /* * Invalidate this buffer. 
It will be * cancelled or partially completed once the * card ACKs the invalidate. */ valid_flag = i == 0 ? V_TF_DDP_BUF0_VALID(1) : V_TF_DDP_BUF1_VALID(1); t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_RX_DDP_FLAGS, valid_flag, 0, 1, CPL_COOKIE_DDP0 + i); toep->ddp.db[i].cancel_pending = 1; CTR2(KTR_CXGBE, "%s: request %p marked pending", __func__, job); break; } } DDP_UNLOCK(toep); } static void t4_aio_cancel_queued(struct kaiocb *job) { struct socket *so = job->fd_file->f_data; struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; DDP_LOCK(toep); if (!aio_cancel_cleared(job)) { TAILQ_REMOVE(&toep->ddp.aiojobq, job, list); toep->ddp.waiting_count--; if (toep->ddp.waiting_count == 0) ddp_queue_toep(toep); } CTR2(KTR_CXGBE, "%s: request %p cancelled", __func__, job); DDP_UNLOCK(toep); aio_ddp_cancel_one(job); } int t4_aio_queue_ddp(struct socket *so, struct kaiocb *job) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; /* Ignore writes. */ if (job->uaiocb.aio_lio_opcode != LIO_READ) return (EOPNOTSUPP); INP_WLOCK(inp); if (__predict_false(ulp_mode(toep) == ULP_MODE_NONE)) { if (!set_ddp_ulp_mode(toep)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } } INP_WUNLOCK(inp); DDP_LOCK(toep); /* * If DDP is being used for all normal receive, don't use it * for AIO. */ if ((toep->ddp.flags & DDP_RCVBUF) != 0) { DDP_UNLOCK(toep); return (EOPNOTSUPP); } /* * XXX: Think about possibly returning errors for ENOTCONN, * etc. Perhaps the caller would only queue the request * if it failed with EOPNOTSUPP? */ #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid); #endif if (!aio_set_cancel_function(job, t4_aio_cancel_queued)) panic("new job was cancelled"); TAILQ_INSERT_TAIL(&toep->ddp.aiojobq, job, list); toep->ddp.waiting_count++; if ((toep->ddp.flags & DDP_AIO) == 0) { toep->ddp.flags |= DDP_AIO; TAILQ_INIT(&toep->ddp.cached_pagesets); TAILQ_INIT(&toep->ddp.aiojobq); TASK_INIT(&toep->ddp.requeue_task, 0, aio_ddp_requeue_task, toep); } /* * Try to handle this request synchronously. If this has * to block because the task is running, it will just bail * and let the task handle it instead. */ aio_ddp_requeue(toep); DDP_UNLOCK(toep); return (0); } static void ddp_rcvbuf_requeue(struct toepcb *toep) { struct socket *so; struct sockbuf *sb; struct inpcb *inp; struct ddp_rcv_buffer *drb; DDP_ASSERT_LOCKED(toep); restart: if ((toep->ddp.flags & DDP_DEAD) != 0) { MPASS(toep->ddp.active_count == 0); return; } /* If both buffers are active, nothing to do. */ if (toep->ddp.active_count == nitems(toep->ddp.db)) { return; } inp = toep->inp; so = inp->inp_socket; sb = &so->so_rcv; drb = alloc_cached_ddp_rcv_buffer(toep); DDP_UNLOCK(toep); if (drb == NULL) { drb = alloc_ddp_rcv_buffer(toep, M_WAITOK); if (drb == NULL) { printf("%s: failed to allocate buffer\n", __func__); DDP_LOCK(toep); return; } } DDP_LOCK(toep); if ((toep->ddp.flags & DDP_DEAD) != 0 || toep->ddp.active_count == nitems(toep->ddp.db)) { recycle_ddp_rcv_buffer(toep, drb); return; } /* We will never get anything unless we are or were connected. */ SOCKBUF_LOCK(sb); if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { SOCKBUF_UNLOCK(sb); recycle_ddp_rcv_buffer(toep, drb); return; } /* Abort if socket has reported problems or is closed. 
*/ if (so->so_error != 0 || (sb->sb_state & SBS_CANTRCVMORE) != 0) { SOCKBUF_UNLOCK(sb); recycle_ddp_rcv_buffer(toep, drb); return; } SOCKBUF_UNLOCK(sb); if (!queue_ddp_rcvbuf(toep, drb)) { /* * XXX: Need a way to kick a retry here. * * XXX: We know the fixed size needed and could * preallocate the work request using a blocking * request at the start of the task to avoid having to * handle this edge case. */ return; } goto restart; } static void ddp_rcvbuf_requeue_task(void *context, int pending) { struct toepcb *toep = context; DDP_LOCK(toep); ddp_rcvbuf_requeue(toep); toep->ddp.flags &= ~DDP_TASK_ACTIVE; DDP_UNLOCK(toep); free_toepcb(toep); } int t4_enable_ddp_rcv(struct socket *so, struct toepcb *toep) { struct inpcb *inp = sotoinpcb(so); struct adapter *sc = td_adapter(toep->td); INP_WLOCK(inp); switch (ulp_mode(toep)) { case ULP_MODE_TCPDDP: break; case ULP_MODE_NONE: if (set_ddp_ulp_mode(toep)) break; /* FALLTHROUGH */ default: INP_WUNLOCK(inp); return (EOPNOTSUPP); } INP_WUNLOCK(inp); DDP_LOCK(toep); /* * If DDP is being used for AIO already, don't use it for * normal receive. */ if ((toep->ddp.flags & DDP_AIO) != 0) { DDP_UNLOCK(toep); return (EOPNOTSUPP); } if ((toep->ddp.flags & DDP_RCVBUF) != 0) { DDP_UNLOCK(toep); return (EBUSY); } toep->ddp.flags |= DDP_RCVBUF; TAILQ_INIT(&toep->ddp.cached_buffers); enable_ddp(sc, toep); TASK_INIT(&toep->ddp.requeue_task, 0, ddp_rcvbuf_requeue_task, toep); ddp_queue_toep(toep); DDP_UNLOCK(toep); return (0); } void t4_ddp_mod_load(void) { if (t4_ddp_rcvbuf_len < PAGE_SIZE) t4_ddp_rcvbuf_len = PAGE_SIZE; if (t4_ddp_rcvbuf_len > MAX_DDP_BUFFER_SIZE) t4_ddp_rcvbuf_len = MAX_DDP_BUFFER_SIZE; if (!powerof2(t4_ddp_rcvbuf_len)) t4_ddp_rcvbuf_len = 1 << fls(t4_ddp_rcvbuf_len); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl, CPL_COOKIE_DDP0); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl, CPL_COOKIE_DDP1); t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); TAILQ_INIT(&ddp_orphan_pagesets); mtx_init(&ddp_orphan_pagesets_lock, "ddp orphans", NULL, MTX_DEF); TASK_INIT(&ddp_orphan_task, 0, ddp_free_orphan_pagesets, NULL); } void t4_ddp_mod_unload(void) { taskqueue_drain(taskqueue_thread, &ddp_orphan_task); MPASS(TAILQ_EMPTY(&ddp_orphan_pagesets)); mtx_destroy(&ddp_orphan_pagesets_lock); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP0); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP1); t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL); t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL); } #endif diff --git a/sys/dev/cxgbe/tom/t4_tls.c b/sys/dev/cxgbe/tom/t4_tls.c index bdd03edd3a6f..c6377980fca9 100644 --- a/sys/dev/cxgbe/tom/t4_tls.c +++ b/sys/dev/cxgbe/tom/t4_tls.c @@ -1,1333 +1,1297 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2017-2018 Chelsio Communications, Inc. * All rights reserved. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_kern_tls.h" #include #ifdef KERN_TLS #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_tcb.h" #include "crypto/t4_crypto.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" /* * The TCP sequence number of a CPL_TLS_DATA mbuf is saved here while * the mbuf is in the ulp_pdu_reclaimq. */ #define tls_tcp_seq PH_loc.thirtytwo[0] static void t4_set_tls_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val) { struct adapter *sc = td_adapter(toep->td); t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, word, mask, val, 0, 0); } /* TLS and DTLS common routines */ bool can_tls_offload(struct adapter *sc) { return (sc->tt.tls && sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS); } int tls_tx_key(struct toepcb *toep) { struct tls_ofld_info *tls_ofld = &toep->tls; return (tls_ofld->tx_key_addr >= 0); } /* Set TF_RX_QUIESCE to pause receive. */ static void t4_set_rx_quiesce(struct toepcb *toep) { struct adapter *sc = td_adapter(toep->td); t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1), V_TF_RX_QUIESCE(1), 1, CPL_COOKIE_TOM); } /* Clear TF_RX_QUIESCE to re-enable receive. */ static void t4_clear_rx_quiesce(struct toepcb *toep) { t4_set_tls_tcb_field(toep, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1), 0); } /* TLS/DTLS content type for CPL SFO */ static inline unsigned char tls_content_type(unsigned char content_type) { switch (content_type) { case CONTENT_TYPE_CCS: return CPL_TX_TLS_SFO_TYPE_CCS; case CONTENT_TYPE_ALERT: return CPL_TX_TLS_SFO_TYPE_ALERT; case CONTENT_TYPE_HANDSHAKE: return CPL_TX_TLS_SFO_TYPE_HANDSHAKE; case CONTENT_TYPE_APP_DATA: return CPL_TX_TLS_SFO_TYPE_DATA; default: return CPL_TX_TLS_SFO_TYPE_CUSTOM; } } /* TLS Key memory management */ static void clear_tls_keyid(struct toepcb *toep) { struct tls_ofld_info *tls_ofld = &toep->tls; struct adapter *sc = td_adapter(toep->td); if (tls_ofld->rx_key_addr >= 0) { t4_free_tls_keyid(sc, tls_ofld->rx_key_addr); tls_ofld->rx_key_addr = -1; } if (tls_ofld->tx_key_addr >= 0) { t4_free_tls_keyid(sc, tls_ofld->tx_key_addr); tls_ofld->tx_key_addr = -1; } } static int get_tp_plen_max(struct ktls_session *tls) { int plen = ((min(3*4096, TP_TX_PG_SZ))/1448) * 1448; return (tls->params.max_frame_len <= 8192 ? 
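get_tp_plen_max() rounds the per-PDU payload limit down to a whole number of 1448-byte segments and falls back to FC_TP_PLEN_MAX for large records. The sketch below reproduces only that arithmetic; the TP_TX_PG_SZ and FC_TP_PLEN_MAX values used are assumptions, not the driver's constants.

#include <stdio.h>

/*
 * Illustrative sketch (not driver code): round the per-PDU payload
 * limit down to a multiple of 1448 bytes.
 */
int
main(void)
{
	int tp_tx_pg_sz = 16384;		/* assumed TP_TX_PG_SZ */
	int fc_tp_plen_max = 17408;		/* assumed FC_TP_PLEN_MAX */
	int cap = 3 * 4096 < tp_tx_pg_sz ? 3 * 4096 : tp_tx_pg_sz;
	int plen = cap / 1448 * 1448;		/* 12288 -> 11584 */
	int max_frame_len = 4096;		/* example TLS record payload cap */

	printf("plen max = %d\n", max_frame_len <= 8192 ? plen : fc_tp_plen_max);
	return (0);
}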
plen : FC_TP_PLEN_MAX); } /* Send request to get the key-id */ static int tls_program_key_id(struct toepcb *toep, struct ktls_session *tls, int direction) { struct tls_ofld_info *tls_ofld = &toep->tls; struct adapter *sc = td_adapter(toep->td); struct ofld_tx_sdesc *txsd; int keyid; struct wrqe *wr; struct tls_key_req *kwr; struct tls_keyctx *kctx; #ifdef INVARIANTS int kwrlen, kctxlen, len; kwrlen = sizeof(*kwr); kctxlen = roundup2(sizeof(*kctx), 32); len = roundup2(kwrlen + kctxlen, 16); MPASS(TLS_KEY_WR_SZ == len); #endif if (toep->txsd_avail == 0) return (EAGAIN); if ((keyid = t4_alloc_tls_keyid(sc)) < 0) { return (ENOSPC); } wr = alloc_wrqe(TLS_KEY_WR_SZ, &toep->ofld_txq->wrq); if (wr == NULL) { t4_free_tls_keyid(sc, keyid); return (ENOMEM); } kwr = wrtod(wr); memset(kwr, 0, TLS_KEY_WR_SZ); t4_write_tlskey_wr(tls, direction, toep->tid, F_FW_WR_COMPL, keyid, kwr); kctx = (struct tls_keyctx *)(kwr + 1); if (direction == KTLS_TX) tls_ofld->tx_key_addr = keyid; else tls_ofld->rx_key_addr = keyid; t4_tls_key_ctx(tls, direction, kctx); txsd = &toep->txsd[toep->txsd_pidx]; txsd->tx_credits = DIV_ROUND_UP(TLS_KEY_WR_SZ, 16); txsd->plen = 0; toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; t4_wrq_tx(sc, wr); return (0); } int tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction) { struct adapter *sc = td_adapter(toep->td); int error, explicit_iv_size, mac_first; if (!can_tls_offload(sc)) return (EINVAL); if (direction == KTLS_RX) { if (ulp_mode(toep) != ULP_MODE_NONE) return (EINVAL); if ((toep->flags & TPF_TLS_STARTING) != 0) return (EINVAL); } else { switch (ulp_mode(toep)) { case ULP_MODE_NONE: case ULP_MODE_TLS: case ULP_MODE_TCPDDP: break; default: return (EINVAL); } } switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: /* XXX: Explicitly ignore any provided IV. */ switch (tls->params.cipher_key_len) { case 128 / 8: case 192 / 8: case 256 / 8: break; default: return (EINVAL); } switch (tls->params.auth_algorithm) { case CRYPTO_SHA1_HMAC: case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: break; default: return (EPROTONOSUPPORT); } explicit_iv_size = AES_BLOCK_LEN; mac_first = 1; break; case CRYPTO_AES_NIST_GCM_16: if (tls->params.iv_len != SALT_SIZE) { return (EINVAL); } switch (tls->params.cipher_key_len) { case 128 / 8: case 192 / 8: case 256 / 8: break; default: return (EINVAL); } explicit_iv_size = 8; mac_first = 0; break; default: return (EPROTONOSUPPORT); } /* Only TLS 1.1 and TLS 1.2 are currently supported. */ if (tls->params.tls_vmajor != TLS_MAJOR_VER_ONE || tls->params.tls_vminor < TLS_MINOR_VER_ONE || tls->params.tls_vminor > TLS_MINOR_VER_TWO) { return (EPROTONOSUPPORT); } /* Bail if we already have a key. 
*/ if (direction == KTLS_TX) { if (toep->tls.tx_key_addr != -1) return (EOPNOTSUPP); } else { if (toep->tls.rx_key_addr != -1) return (EOPNOTSUPP); } error = tls_program_key_id(toep, tls, direction); if (error) return (error); if (direction == KTLS_TX) { toep->tls.scmd0.seqno_numivs = (V_SCMD_SEQ_NO_CTRL(3) | V_SCMD_PROTO_VERSION(t4_tls_proto_ver(tls)) | V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) | V_SCMD_CIPH_AUTH_SEQ_CTRL((mac_first == 0)) | V_SCMD_CIPH_MODE(t4_tls_cipher_mode(tls)) | V_SCMD_AUTH_MODE(t4_tls_auth_mode(tls)) | V_SCMD_HMAC_CTRL(t4_tls_hmac_ctrl(tls)) | V_SCMD_IV_SIZE(explicit_iv_size / 2)); toep->tls.scmd0.ivgen_hdrlen = (V_SCMD_IV_GEN_CTRL(1) | V_SCMD_KEY_CTX_INLINE(0) | V_SCMD_TLS_FRAG_ENABLE(1)); toep->tls.iv_len = explicit_iv_size; toep->tls.frag_size = tls->params.max_frame_len; toep->tls.fcplenmax = get_tp_plen_max(tls); toep->tls.expn_per_ulp = tls->params.tls_hlen + tls->params.tls_tlen; toep->tls.pdus_per_ulp = 1; toep->tls.adjusted_plen = toep->tls.expn_per_ulp + tls->params.max_frame_len; toep->tls.tx_key_info_size = t4_tls_key_info_size(tls); } else { toep->flags |= TPF_TLS_STARTING | TPF_TLS_RX_QUIESCING; toep->tls.rx_version = tls->params.tls_vmajor << 8 | tls->params.tls_vminor; CTR2(KTR_CXGBE, "%s: tid %d setting RX_QUIESCE", __func__, toep->tid); t4_set_rx_quiesce(toep); } return (0); } void tls_init_toep(struct toepcb *toep) { struct tls_ofld_info *tls_ofld = &toep->tls; tls_ofld->rx_key_addr = -1; tls_ofld->tx_key_addr = -1; } void tls_uninit_toep(struct toepcb *toep) { clear_tls_keyid(toep); } #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) #define MIN_OFLD_TLSTX_CREDITS(toep) \ (howmany(sizeof(struct fw_tlstx_data_wr) + \ sizeof(struct cpl_tx_tls_sfo) + sizeof(struct ulptx_idata) + \ sizeof(struct ulptx_sc_memrd) + \ AES_BLOCK_LEN + 1, 16)) static void write_tlstx_wr(struct fw_tlstx_data_wr *txwr, struct toepcb *toep, unsigned int plen, unsigned int expn, uint8_t credits, int shove) { struct tls_ofld_info *tls_ofld = &toep->tls; unsigned int len = plen + expn; txwr->op_to_immdlen = htobe32(V_WR_OP(FW_TLSTX_DATA_WR) | V_FW_TLSTX_DATA_WR_COMPL(1) | V_FW_TLSTX_DATA_WR_IMMDLEN(0)); txwr->flowid_len16 = htobe32(V_FW_TLSTX_DATA_WR_FLOWID(toep->tid) | V_FW_TLSTX_DATA_WR_LEN16(credits)); txwr->plen = htobe32(len); txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ULP_MODE_TLS) | V_TX_URG(0) | /* F_T6_TX_FORCE | */ V_TX_SHOVE(shove)); txwr->ctxloc_to_exp = htobe32(V_FW_TLSTX_DATA_WR_NUMIVS(1) | V_FW_TLSTX_DATA_WR_EXP(expn) | V_FW_TLSTX_DATA_WR_CTXLOC(TLS_SFO_WR_CONTEXTLOC_DDR) | V_FW_TLSTX_DATA_WR_IVDSGL(0) | V_FW_TLSTX_DATA_WR_KEYSIZE(tls_ofld->tx_key_info_size >> 4)); txwr->mfs = htobe16(tls_ofld->frag_size); txwr->adjustedplen_pkd = htobe16( V_FW_TLSTX_DATA_WR_ADJUSTEDPLEN(tls_ofld->adjusted_plen)); txwr->expinplenmax_pkd = htobe16( V_FW_TLSTX_DATA_WR_EXPINPLENMAX(tls_ofld->expn_per_ulp)); txwr->pdusinplenmax_pkd = V_FW_TLSTX_DATA_WR_PDUSINPLENMAX(tls_ofld->pdus_per_ulp); } static void write_tlstx_cpl(struct cpl_tx_tls_sfo *cpl, struct toepcb *toep, struct tls_hdr *tls_hdr, unsigned int plen, uint64_t seqno) { struct tls_ofld_info *tls_ofld = &toep->tls; int data_type, seglen; seglen = plen; data_type = tls_content_type(tls_hdr->type); cpl->op_to_seg_len = htobe32(V_CPL_TX_TLS_SFO_OPCODE(CPL_TX_TLS_SFO) | V_CPL_TX_TLS_SFO_DATA_TYPE(data_type) | V_CPL_TX_TLS_SFO_CPL_LEN(2) | V_CPL_TX_TLS_SFO_SEG_LEN(seglen)); cpl->pld_len = htobe32(plen); if (data_type == CPL_TX_TLS_SFO_TYPE_CUSTOM) cpl->type_protover = htobe32( V_CPL_TX_TLS_SFO_TYPE(tls_hdr->type)); 
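The transmit side accounts for per-record expansion (header plus trailer) separately from payload: expn_per_ulp and adjusted_plen above, and the plen/expn split in write_tlstx_wr(). The sketch below runs those sums for one record, assuming TLS 1.2 AES-GCM framing (13-byte header including the explicit nonce, 16-byte tag); the real values come from the ktls session parameters.

#include <stdio.h>

/*
 * Illustrative sketch (not driver code): per-record expansion is the
 * header plus trailer, and adjusted_plen is that expansion plus the
 * negotiated maximum record payload.  The framing sizes are assumed.
 */
int
main(void)
{
	int tls_hlen = 13, tls_tlen = 16;	/* assumed AES-GCM framing */
	int max_frame_len = 4096;		/* negotiated record payload cap */
	int expn_per_ulp = tls_hlen + tls_tlen;
	int adjusted_plen = expn_per_ulp + max_frame_len;
	int payload = 3000;			/* one record's plaintext */

	printf("expansion %d, adjusted plen %d, wire bytes for record %d\n",
	    expn_per_ulp, adjusted_plen, payload + expn_per_ulp);
	return (0);
}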
cpl->seqno_numivs = htobe32(tls_ofld->scmd0.seqno_numivs | V_SCMD_NUM_IVS(1)); cpl->ivgen_hdrlen = htobe32(tls_ofld->scmd0.ivgen_hdrlen); cpl->scmd1 = htobe64(seqno); } static int count_ext_pgs_segs(struct mbuf *m) { vm_paddr_t nextpa; u_int i, nsegs; MPASS(m->m_epg_npgs > 0); nsegs = 1; nextpa = m->m_epg_pa[0] + PAGE_SIZE; for (i = 1; i < m->m_epg_npgs; i++) { if (nextpa != m->m_epg_pa[i]) nsegs++; nextpa = m->m_epg_pa[i] + PAGE_SIZE; } return (nsegs); } static void write_ktlstx_sgl(void *dst, struct mbuf *m, int nsegs) { struct ulptx_sgl *usgl = dst; vm_paddr_t pa; uint32_t len; int i, j; KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); /* Figure out the first S/G length. */ pa = m->m_epg_pa[0] + m->m_epg_1st_off; usgl->addr0 = htobe64(pa); len = m_epg_pagelen(m, 0, m->m_epg_1st_off); pa += len; for (i = 1; i < m->m_epg_npgs; i++) { if (m->m_epg_pa[i] != pa) break; len += m_epg_pagelen(m, i, 0); pa += m_epg_pagelen(m, i, 0); } usgl->len0 = htobe32(len); #ifdef INVARIANTS nsegs--; #endif j = -1; for (; i < m->m_epg_npgs; i++) { if (j == -1 || m->m_epg_pa[i] != pa) { if (j >= 0) usgl->sge[j / 2].len[j & 1] = htobe32(len); j++; #ifdef INVARIANTS nsegs--; #endif pa = m->m_epg_pa[i]; usgl->sge[j / 2].addr[j & 1] = htobe64(pa); len = m_epg_pagelen(m, i, 0); pa += len; } else { len += m_epg_pagelen(m, i, 0); pa += m_epg_pagelen(m, i, 0); } } if (j >= 0) { usgl->sge[j / 2].len[j & 1] = htobe32(len); if ((j & 1) == 0) usgl->sge[j / 2].len[1] = htobe32(0); } KASSERT(nsegs == 0, ("%s: nsegs %d, m %p", __func__, nsegs, m)); } /* * Similar to t4_push_frames() but handles sockets that contain TLS * record mbufs. */ void t4_push_ktls(struct adapter *sc, struct toepcb *toep, int drop) { struct tls_hdr *thdr; struct fw_tlstx_data_wr *txwr; struct cpl_tx_tls_sfo *cpl; struct ulptx_idata *idata; struct ulptx_sc_memrd *memrd; struct wrqe *wr; struct mbuf *m; u_int nsegs, credits, wr_len; u_int expn_size; struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_snd; int tls_size, tx_credits, shove, sowwakeup; struct ofld_tx_sdesc *txsd; char *buf; INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_FLOWC_WR_SENT, ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); KASSERT(ulp_mode(toep) == ULP_MODE_NONE || ulp_mode(toep) == ULP_MODE_TCPDDP || ulp_mode(toep) == ULP_MODE_TLS, ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep)); KASSERT(tls_tx_key(toep), ("%s: TX key not set for toep %p", __func__, toep)); #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d", __func__, toep->tid, toep->flags, tp->t_flags); #endif if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) return; #ifdef RATELIMIT if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; } #endif /* * This function doesn't resume by itself. Someone else must clear the * flag and call this function. */ if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) { KASSERT(drop == 0, ("%s: drop (%d) != 0 but tx is suspended", __func__, drop)); return; } txsd = &toep->txsd[toep->txsd_pidx]; for (;;) { tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); SOCKBUF_LOCK(sb); sowwakeup = drop; if (drop) { sbdrop_locked(sb, drop); drop = 0; } m = sb->sb_sndptr != NULL ? 
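count_ext_pgs_segs() counts how many physically contiguous runs the pages of an unmapped (M_EXTPG) mbuf form, which is the number of DSGL entries the work request will need. A standalone version of the same scan follows, assuming 4 KB pages.

#include <stdio.h>

/*
 * Illustrative sketch (not driver code): count contiguous physical
 * runs in an array of page addresses; each run becomes one SGL entry.
 */
static int
count_segs(const unsigned long *pa, int npgs)
{
	unsigned long nextpa;
	int i, nsegs = 1;

	nextpa = pa[0] + 4096;
	for (i = 1; i < npgs; i++) {
		if (pa[i] != nextpa)
			nsegs++;		/* discontiguous: new segment */
		nextpa = pa[i] + 4096;
	}
	return (nsegs);
}

int
main(void)
{
	/* Pages 0-1 contiguous, page 2 elsewhere, pages 3-4 contiguous. */
	unsigned long pa[] = { 0x10000, 0x11000, 0x40000, 0x80000, 0x81000 };

	printf("nsegs = %d\n", count_segs(pa, 5));	/* 3 */
	return (0);
}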
sb->sb_sndptr->m_next : sb->sb_mb; /* * Send a FIN if requested, but only if there's no * more data to send. */ if (m == NULL && toep->flags & TPF_SEND_FIN) { if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); t4_close_conn(sc, toep); return; } /* * If there is no ready data to send, wait until more * data arrives. */ if (m == NULL || (m->m_flags & M_NOTAVAIL) != 0) { if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); #ifdef VERBOSE_TRACES CTR2(KTR_CXGBE, "%s: tid %d no ready data to send", __func__, toep->tid); #endif return; } KASSERT(m->m_flags & M_EXTPG, ("%s: mbuf %p is not NOMAP", __func__, m)); KASSERT(m->m_epg_tls != NULL, ("%s: mbuf %p doesn't have TLS session", __func__, m)); /* Calculate WR length. */ wr_len = sizeof(struct fw_tlstx_data_wr) + sizeof(struct cpl_tx_tls_sfo) + sizeof(struct ulptx_idata) + sizeof(struct ulptx_sc_memrd); /* Explicit IVs for AES-CBC and AES-GCM are <= 16. */ MPASS(toep->tls.iv_len <= AES_BLOCK_LEN); wr_len += AES_BLOCK_LEN; /* Account for SGL in work request length. */ nsegs = count_ext_pgs_segs(m); wr_len += sizeof(struct ulptx_sgl) + ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; /* Not enough credits for this work request. */ if (howmany(wr_len, 16) > tx_credits) { if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d mbuf %p requires %d credits, but only %d available", __func__, toep->tid, m, howmany(wr_len, 16), tx_credits); #endif toep->flags |= TPF_TX_SUSPENDED; return; } /* Shove if there is no additional data pending. */ shove = ((m->m_next == NULL || (m->m_next->m_flags & M_NOTAVAIL) != 0)) && (tp->t_flags & TF_MORETOCOME) == 0; if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf && sb->sb_hiwat < V_tcp_autosndbuf_max && sbused(sb) >= sb->sb_hiwat * 7 / 8) { int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max); if (!sbreserve_locked(so, SO_SND, newsize, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; else sowwakeup = 1; /* room available */ } if (sowwakeup) sowwakeup_locked(so); else SOCKBUF_UNLOCK(sb); SOCKBUF_UNLOCK_ASSERT(sb); if (__predict_false(toep->flags & TPF_FIN_SENT)) panic("%s: excess tx.", __func__); wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX: how will we recover from this? */ toep->flags |= TPF_TX_SUSPENDED; return; } thdr = (struct tls_hdr *)&m->m_epg_hdr; #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %d TLS record %ju type %d len %#x", __func__, toep->tid, m->m_epg_seqno, thdr->type, m->m_len); #endif txwr = wrtod(wr); cpl = (struct cpl_tx_tls_sfo *)(txwr + 1); memset(txwr, 0, roundup2(wr_len, 16)); credits = howmany(wr_len, 16); expn_size = m->m_epg_hdrlen + m->m_epg_trllen; tls_size = m->m_len - expn_size; write_tlstx_wr(txwr, toep, tls_size, expn_size, credits, shove); write_tlstx_cpl(cpl, toep, thdr, tls_size, m->m_epg_seqno); idata = (struct ulptx_idata *)(cpl + 1); idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); idata->len = htobe32(0); memrd = (struct ulptx_sc_memrd *)(idata + 1); memrd->cmd_to_len = htobe32(V_ULPTX_CMD(ULP_TX_SC_MEMRD) | V_ULP_TX_SC_MORE(1) | V_ULPTX_LEN16(toep->tls.tx_key_info_size >> 4)); memrd->addr = htobe32(toep->tls.tx_key_addr >> 5); /* Copy IV. 
*/ buf = (char *)(memrd + 1); memcpy(buf, thdr + 1, toep->tls.iv_len); buf += AES_BLOCK_LEN; write_ktlstx_sgl(buf, m, nsegs); KASSERT(toep->tx_credits >= credits, ("%s: not enough credits", __func__)); toep->tx_credits -= credits; tp->snd_nxt += m->m_len; tp->snd_max += m->m_len; SOCKBUF_LOCK(sb); sb->sb_sndptr = m; SOCKBUF_UNLOCK(sb); toep->flags |= TPF_TX_DATA_SENT; if (toep->tx_credits < MIN_OFLD_TLSTX_CREDITS(toep)) toep->flags |= TPF_TX_SUSPENDED; KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); txsd->plen = m->m_len; txsd->tx_credits = credits; txsd++; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { toep->txsd_pidx = 0; txsd = &toep->txsd[0]; } toep->txsd_avail--; counter_u64_add(toep->ofld_txq->tx_toe_tls_records, 1); counter_u64_add(toep->ofld_txq->tx_toe_tls_octets, m->m_len); t4_l2t_send(sc, wr, toep->l2te); } } /* * For TLS data we place received mbufs received via CPL_TLS_DATA into * an mbufq in the TLS offload state. When CPL_RX_TLS_CMP is * received, the completed PDUs are placed into the socket receive * buffer. * * The TLS code reuses the ulp_pdu_reclaimq to hold the pending mbufs. */ static int do_tls_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_tls_data *cpl = mtod(m, const void *); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct tcpcb *tp; int len; /* XXX: Should this match do_rx_data instead? */ KASSERT(!(toep->flags & TPF_SYNQE), ("%s: toep %p claims to be a synq entry", __func__, toep)); KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); /* strip off CPL header */ m_adj(m, sizeof(*cpl)); len = m->m_pkthdr.len; toep->ofld_rxq->rx_toe_tls_octets += len; KASSERT(len == G_CPL_TLS_DATA_LENGTH(be32toh(cpl->length_pkd)), ("%s: payload length mismatch", __func__)); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, len, inp->inp_flags); INP_WUNLOCK(inp); m_freem(m); return (0); } /* Save TCP sequence number. 
*/ m->m_pkthdr.tls_tcp_seq = be32toh(cpl->seq); if (mbufq_enqueue(&toep->ulp_pdu_reclaimq, m)) { #ifdef INVARIANTS panic("Failed to queue TLS data packet"); #else printf("%s: Failed to queue TLS data packet\n", __func__); INP_WUNLOCK(inp); m_freem(m); return (0); #endif } tp = intotcpcb(inp); tp->t_rcvtime = ticks; #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len, be32toh(cpl->seq)); #endif INP_WUNLOCK(inp); return (0); } static int do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rx_tls_cmp *cpl = mtod(m, const void *); struct tlsrx_hdr_pkt *tls_hdr_pkt; unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct tcpcb *tp; struct socket *so; struct sockbuf *sb; struct mbuf *tls_data; struct tls_get_record *tgr; struct mbuf *control; int pdu_length, trailer_len; #if defined(KTR) || defined(INVARIANTS) int len; #endif KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); KASSERT(!(toep->flags & TPF_SYNQE), ("%s: toep %p claims to be a synq entry", __func__, toep)); /* strip off CPL header */ m_adj(m, sizeof(*cpl)); #if defined(KTR) || defined(INVARIANTS) len = m->m_pkthdr.len; #endif toep->ofld_rxq->rx_toe_tls_records++; KASSERT(len == G_CPL_RX_TLS_CMP_LENGTH(be32toh(cpl->pdulength_length)), ("%s: payload length mismatch", __func__)); INP_WLOCK(inp); if (inp->inp_flags & INP_DROPPED) { CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", __func__, tid, len, inp->inp_flags); INP_WUNLOCK(inp); m_freem(m); return (0); } pdu_length = G_CPL_RX_TLS_CMP_PDULENGTH(be32toh(cpl->pdulength_length)); so = inp_inpcbtosocket(inp); tp = intotcpcb(inp); #ifdef VERBOSE_TRACES CTR6(KTR_CXGBE, "%s: tid %u PDU len %d len %d seq %u, rcv_nxt %u", __func__, tid, pdu_length, len, be32toh(cpl->seq), tp->rcv_nxt); #endif tp->rcv_nxt += pdu_length; KASSERT(tp->rcv_wnd >= pdu_length, ("%s: negative window size", __func__)); tp->rcv_wnd -= pdu_length; /* XXX: Not sure what to do about urgent data. */ /* * The payload of this CPL is the TLS header followed by * additional fields. */ KASSERT(m->m_len >= sizeof(*tls_hdr_pkt), ("%s: payload too small", __func__)); tls_hdr_pkt = mtod(m, void *); tls_data = mbufq_dequeue(&toep->ulp_pdu_reclaimq); if (tls_data != NULL) { KASSERT(be32toh(cpl->seq) == tls_data->m_pkthdr.tls_tcp_seq, ("%s: sequence mismatch", __func__)); } /* Report decryption errors as EBADMSG. */ if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) != 0) { CTR4(KTR_CXGBE, "%s: tid %u TLS error %#x ddp_vld %#x", __func__, toep->tid, tls_hdr_pkt->res_to_mac_error, be32toh(cpl->ddp_valid)); m_freem(m); m_freem(tls_data); CURVNET_SET(toep->vnet); so->so_error = EBADMSG; sorwakeup(so); INP_WUNLOCK(inp); CURVNET_RESTORE(); return (0); } /* Handle data received after the socket is closed. 
*/ sb = &so->so_rcv; SOCKBUF_LOCK(sb); if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { struct epoch_tracker et; CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", __func__, tid, pdu_length); m_freem(m); m_freem(tls_data); SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp != NULL) INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } /* * If there is any data in the 'sb_mtls' chain of the socket * or we aren't able to allocate the control mbuf, append the * record as a CSUM_TLS_DECRYPTED packet to 'sb_mtls' rather * than as a decrypted record to 'sb_m'. */ if (sb->sb_mtls != NULL) control = NULL; else control = sbcreatecontrol(NULL, sizeof(*tgr), TLS_GET_RECORD, IPPROTO_TCP, M_NOWAIT); if (control != NULL) { tgr = (struct tls_get_record *) CMSG_DATA(mtod(control, struct cmsghdr *)); memset(tgr, 0, sizeof(*tgr)); tgr->tls_type = tls_hdr_pkt->type; tgr->tls_vmajor = be16toh(tls_hdr_pkt->version) >> 8; tgr->tls_vminor = be16toh(tls_hdr_pkt->version) & 0xff; if (tls_data != NULL) { m_last(tls_data)->m_flags |= M_EOR; tgr->tls_length = htobe16(tls_data->m_pkthdr.len); } else tgr->tls_length = 0; m_freem(m); m = tls_data; } else { M_ASSERTPKTHDR(m); /* It's ok that any explicit IV is missing. */ m->m_len = sb->sb_tls_info->params.tls_hlen; m->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED; m->m_pkthdr.len = m->m_len; if (tls_data != NULL) { m->m_pkthdr.len += tls_data->m_pkthdr.len; m_demote_pkthdr(tls_data); m->m_next = tls_data; } /* * Grow the chain by the trailer, but without * contents. The trailer will be thrown away by * ktls_decrypt. Note that ktls_decrypt assumes the * trailer is tls_tlen bytes long, so append that many * bytes not the actual trailer size computed from * pdu_length. */ trailer_len = sb->sb_tls_info->params.tls_tlen; if (tls_data != NULL) { m_last(tls_data)->m_len += trailer_len; tls_data = NULL; } else m->m_len += trailer_len; m->m_pkthdr.len += trailer_len; tls_hdr_pkt->length = htobe16(m->m_pkthdr.len - sizeof(struct tls_record_layer)); } /* receive buffer autosize */ MPASS(toep->vnet == so->so_vnet); CURVNET_SET(toep->vnet); if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && sb->sb_hiwat < V_tcp_autorcvbuf_max && m->m_pkthdr.len > (sbspace(sb) / 8 * 7)) { unsigned int hiwat = sb->sb_hiwat; unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, V_tcp_autorcvbuf_max); if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; } if (control != NULL) sbappendcontrol_locked(sb, m, control, 0); else sbappendstream_locked(sb, m, 0); t4_rcvd_locked(&toep->td->tod, tp); sorwakeup_locked(so); SOCKBUF_UNLOCK_ASSERT(sb); INP_WUNLOCK(inp); CURVNET_RESTORE(); return (0); } void do_rx_data_tls(const struct cpl_rx_data *cpl, struct toepcb *toep, struct mbuf *m) { struct inpcb *inp = toep->inp; struct tls_ofld_info *tls_ofld = &toep->tls; struct tls_hdr *hdr; struct tcpcb *tp; struct socket *so; struct sockbuf *sb; int len; len = m->m_pkthdr.len; INP_WLOCK_ASSERT(inp); so = inp_inpcbtosocket(inp); tp = intotcpcb(inp); sb = &so->so_rcv; SOCKBUF_LOCK(sb); CURVNET_SET(toep->vnet); tp->rcv_nxt += len; KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__)); tp->rcv_wnd -= len; /* Do we have a full TLS header? */ if (len < sizeof(*hdr)) { CTR3(KTR_CXGBE, "%s: tid %u len %d: too short for a TLS header", __func__, toep->tid, len); so->so_error = EMSGSIZE; goto out; } hdr = mtod(m, struct tls_hdr *); /* Is the header valid? 
*/ if (be16toh(hdr->version) != tls_ofld->rx_version) { CTR3(KTR_CXGBE, "%s: tid %u invalid version %04x", __func__, toep->tid, be16toh(hdr->version)); so->so_error = EINVAL; goto out; } if (be16toh(hdr->length) < sizeof(*hdr)) { CTR3(KTR_CXGBE, "%s: tid %u invalid length %u", __func__, toep->tid, be16toh(hdr->length)); so->so_error = EBADMSG; goto out; } /* Did we get a truncated record? */ if (len < be16toh(hdr->length)) { CTR4(KTR_CXGBE, "%s: tid %u truncated TLS record (%d vs %u)", __func__, toep->tid, len, be16toh(hdr->length)); so->so_error = EMSGSIZE; goto out; } /* Is the header type unknown? */ switch (hdr->type) { case CONTENT_TYPE_CCS: case CONTENT_TYPE_ALERT: case CONTENT_TYPE_APP_DATA: case CONTENT_TYPE_HANDSHAKE: break; default: CTR3(KTR_CXGBE, "%s: tid %u invalid TLS record type %u", __func__, toep->tid, hdr->type); so->so_error = EBADMSG; goto out; } /* * Just punt. Although this could fall back to software * decryption, this case should never really happen. */ CTR4(KTR_CXGBE, "%s: tid %u dropping TLS record type %u, length %u", __func__, toep->tid, hdr->type, be16toh(hdr->length)); so->so_error = EBADMSG; out: sorwakeup_locked(so); SOCKBUF_UNLOCK_ASSERT(sb); INP_WUNLOCK(inp); CURVNET_RESTORE(); m_freem(m); } -/* SET_TCB_FIELD sent as a ULP command looks like this */ -#define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \ - sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core)) - -static inline void * -mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep, - uint64_t word, uint64_t mask, uint64_t val) -{ - struct ulptx_idata *ulpsc; - struct cpl_set_tcb_field_core *req; - - ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); - ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16)); - - ulpsc = (struct ulptx_idata *)(ulpmc + 1); - ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); - ulpsc->len = htobe32(sizeof(*req)); - - req = (struct cpl_set_tcb_field_core *)(ulpsc + 1); - OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid)); - req->reply_ctrl = htobe16(V_NO_REPLY(1) | - V_QUEUENO(toep->ofld_rxq->iq.abs_id)); - req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0)); - req->mask = htobe64(mask); - req->val = htobe64(val); - - ulpsc = (struct ulptx_idata *)(req + 1); - if (LEN__SET_TCB_FIELD_ULP % 16) { - ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); - ulpsc->len = htobe32(0); - return (ulpsc + 1); - } - return (ulpsc); -} - /* * Send a work request setting multiple TCB fields to enable * ULP_MODE_TLS. */ static void tls_update_tcb(struct adapter *sc, struct toepcb *toep, uint64_t seqno) { struct wrqe *wr; struct work_request_hdr *wrh; struct ulp_txpkt *ulpmc; int fields, key_offset, len; KASSERT(ulp_mode(toep) == ULP_MODE_NONE, ("%s: tid %d already ULP_MODE_TLS", __func__, toep->tid)); fields = 0; /* 2 writes for the overlay region */ fields += 2; /* W_TCB_TLS_SEQ */ fields++; /* W_TCB_ULP_RAW */ fields++; /* W_TCB_ULP_TYPE */ fields ++; /* W_TCB_T_FLAGS */ fields++; len = sizeof(*wrh) + fields * roundup2(LEN__SET_TCB_FIELD_ULP, 16); KASSERT(len <= SGE_MAX_WR_LEN, ("%s: WR with %d TCB field updates too large", __func__, fields)); wr = alloc_wrqe(len, toep->ctrlq); if (wr == NULL) { /* XXX */ panic("%s: out of memory", __func__); } wrh = wrtod(wr); INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */ ulpmc = (struct ulp_txpkt *)(wrh + 1); /* * Clear the TLS overlay region: 1023:832. * * Words 26/27 are always set to zero. Words 28/29 * contain seqno and are set when enabling TLS * decryption. 
Word 30 is zero and Word 31 contains * the keyid. */ - ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 26, + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 26, 0xffffffffffffffff, 0); /* * RX key tags are an index into the key portion of MA * memory stored as an offset from the base address in * units of 64 bytes. */ key_offset = toep->tls.rx_key_addr - sc->vres.key.start; - ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 30, + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 30, 0xffffffffffffffff, (uint64_t)V_TCB_RX_TLS_KEY_TAG(key_offset / 64) << 32); CTR3(KTR_CXGBE, "%s: tid %d enable TLS seqno %lu", __func__, toep->tid, seqno); - ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_TLS_SEQ, + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_TLS_SEQ, V_TCB_TLS_SEQ(M_TCB_TLS_SEQ), V_TCB_TLS_SEQ(seqno)); - ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_ULP_RAW, + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_ULP_RAW, V_TCB_ULP_RAW(M_TCB_ULP_RAW), V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) | V_TF_TLS_CONTROL(1) | V_TF_TLS_ACTIVE(1) | V_TF_TLS_ENABLE(1)))); toep->flags &= ~TPF_TLS_STARTING; toep->flags |= TPF_TLS_RECEIVE; /* Set the ULP mode to ULP_MODE_TLS. */ toep->params.ulp_mode = ULP_MODE_TLS; - ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_ULP_TYPE, - V_TCB_ULP_TYPE(M_TCB_ULP_TYPE), - V_TCB_ULP_TYPE(ULP_MODE_TLS)); + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_ULP_TYPE, + V_TCB_ULP_TYPE(M_TCB_ULP_TYPE), V_TCB_ULP_TYPE(ULP_MODE_TLS)); /* Clear TF_RX_QUIESCE. */ - ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_T_FLAGS, + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1), 0); t4_wrq_tx(sc, wr); } /* * Examine the pending data in the socket buffer and either enable TLS * RX or request more encrypted data. */ static void tls_check_rx_sockbuf(struct adapter *sc, struct toepcb *toep, struct sockbuf *sb) { uint64_t seqno; size_t resid; bool have_header; SOCKBUF_LOCK_ASSERT(sb); MPASS(toep->tls.rx_resid == 0); have_header = ktls_pending_rx_info(sb, &seqno, &resid); CTR5(KTR_CXGBE, "%s: tid %d have_header %d seqno %lu resid %zu", __func__, toep->tid, have_header, seqno, resid); /* * If we have a partial header or we need fewer bytes than the * size of a TLS record, re-enable receive and pause again once * we get more data to try again. */ if (!have_header || resid != 0) { CTR(KTR_CXGBE, "%s: tid %d waiting for more data", __func__, toep->tid); toep->flags &= ~TPF_TLS_RX_QUIESCED; t4_clear_rx_quiesce(toep); return; } tls_update_tcb(sc, toep, seqno); } void tls_received_starting_data(struct adapter *sc, struct toepcb *toep, struct sockbuf *sb, int len) { MPASS(toep->flags & TPF_TLS_STARTING); /* Data was received before quiescing took effect. */ if ((toep->flags & TPF_TLS_RX_QUIESCING) != 0) return; /* * A previous call to tls_check_rx_sockbuf needed more data. * Now that more data has arrived, quiesce receive again and * check the state once the quiesce has completed. 
*/ if ((toep->flags & TPF_TLS_RX_QUIESCED) == 0) { CTR(KTR_CXGBE, "%s: tid %d quiescing", __func__, toep->tid); toep->flags |= TPF_TLS_RX_QUIESCING; t4_set_rx_quiesce(toep); return; } KASSERT(len <= toep->tls.rx_resid, ("%s: received excess bytes %d (waiting for %zu)", __func__, len, toep->tls.rx_resid)); toep->tls.rx_resid -= len; if (toep->tls.rx_resid != 0) return; tls_check_rx_sockbuf(sc, toep, sb); } static int do_tls_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct toepcb *toep; struct inpcb *inp; struct socket *so; struct sockbuf *sb; if (cpl->status != CPL_ERR_NONE) panic("XXX: tcp_rpl failed: %d", cpl->status); toep = lookup_tid(sc, tid); inp = toep->inp; switch (cpl->cookie) { case V_WORD(W_TCB_T_FLAGS) | V_COOKIE(CPL_COOKIE_TOM): INP_WLOCK(inp); if ((toep->flags & TPF_TLS_STARTING) == 0) panic("%s: connection is not starting TLS RX\n", __func__); MPASS((toep->flags & TPF_TLS_RX_QUIESCING) != 0); toep->flags &= ~TPF_TLS_RX_QUIESCING; toep->flags |= TPF_TLS_RX_QUIESCED; so = inp->inp_socket; sb = &so->so_rcv; SOCKBUF_LOCK(sb); tls_check_rx_sockbuf(sc, toep, sb); SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); break; default: panic("XXX: unknown tcb_rpl offset %#x, cookie %#x", G_WORD(cpl->cookie), G_COOKIE(cpl->cookie)); } return (0); } void t4_tls_mod_load(void) { t4_register_cpl_handler(CPL_TLS_DATA, do_tls_data); t4_register_cpl_handler(CPL_RX_TLS_CMP, do_rx_tls_cmp); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_tls_tcb_rpl, CPL_COOKIE_TOM); } void t4_tls_mod_unload(void) { t4_register_cpl_handler(CPL_TLS_DATA, NULL); t4_register_cpl_handler(CPL_RX_TLS_CMP, NULL); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_TOM); } #endif /* TCP_OFFLOAD */ #endif /* KERN_TLS */ diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c index ac5bba75f904..3fe34c7c01a3 100644 --- a/sys/dev/cxgbe/tom/t4_tom.c +++ b/sys/dev/cxgbe/tom/t4_tom.c @@ -1,2101 +1,2067 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_tcb.h" #include "t4_clip.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" #include "tom/t4_tls.h" static struct protosw toe_protosw; static struct protosw toe6_protosw; /* Module ops */ static int t4_tom_mod_load(void); static int t4_tom_mod_unload(void); static int t4_tom_modevent(module_t, int, void *); /* ULD ops and helpers */ static int t4_tom_activate(struct adapter *); static int t4_tom_deactivate(struct adapter *); static struct uld_info tom_uld_info = { .uld_id = ULD_TOM, .activate = t4_tom_activate, .deactivate = t4_tom_deactivate, }; static void release_offload_resources(struct toepcb *); static int alloc_tid_tabs(struct tid_info *); static void free_tid_tabs(struct tid_info *); static void free_tom_data(struct adapter *, struct tom_data *); static void reclaim_wr_resources(void *, int); struct toepcb * alloc_toepcb(struct vi_info *vi, int flags) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct toepcb *toep; int tx_credits, txsd_total, len; /* * The firmware counts tx work request credits in units of 16 bytes * each. Reserve room for an ABORT_REQ so the driver never has to worry * about tx credits if it wants to abort a connection. */ tx_credits = sc->params.ofldq_wr_cred; tx_credits -= howmany(sizeof(struct cpl_abort_req), 16); /* * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte * immediate payload, and firmware counts tx work request credits in * units of 16 byte. Calculate the maximum work requests possible. */ txsd_total = tx_credits / howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16); len = offsetof(struct toepcb, txsd) + txsd_total * sizeof(struct ofld_tx_sdesc); toep = malloc(len, M_CXGBE, M_ZERO | flags); if (toep == NULL) return (NULL); refcount_init(&toep->refcount, 1); toep->td = sc->tom_softc; toep->vi = vi; toep->tid = -1; toep->tx_total = tx_credits; toep->tx_credits = tx_credits; mbufq_init(&toep->ulp_pduq, INT_MAX); mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX); toep->txsd_total = txsd_total; toep->txsd_avail = txsd_total; toep->txsd_pidx = 0; toep->txsd_cidx = 0; aiotx_init_toep(toep); return (toep); } /* * Initialize a toepcb after its params have been filled out. 
*/ int init_toepcb(struct vi_info *vi, struct toepcb *toep) { struct conn_params *cp = &toep->params; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct tx_cl_rl_params *tc; if (cp->tc_idx >= 0 && cp->tc_idx < sc->params.nsched_cls) { tc = &pi->sched_params->cl_rl[cp->tc_idx]; mtx_lock(&sc->tc_lock); if (tc->state != CS_HW_CONFIGURED) { CH_ERR(vi, "tid %d cannot be bound to traffic class %d " "because it is not configured (its state is %d)\n", toep->tid, cp->tc_idx, tc->state); cp->tc_idx = -1; } else { tc->refcount++; } mtx_unlock(&sc->tc_lock); } toep->ofld_txq = &sc->sge.ofld_txq[cp->txq_idx]; toep->ofld_rxq = &sc->sge.ofld_rxq[cp->rxq_idx]; toep->ctrlq = &sc->sge.ctrlq[pi->port_id]; tls_init_toep(toep); MPASS(ulp_mode(toep) != ULP_MODE_TCPDDP); toep->flags |= TPF_INITIALIZED; return (0); } struct toepcb * hold_toepcb(struct toepcb *toep) { refcount_acquire(&toep->refcount); return (toep); } void free_toepcb(struct toepcb *toep) { if (refcount_release(&toep->refcount) == 0) return; KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: attached to an inpcb", __func__)); KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: CPL pending", __func__)); if (toep->flags & TPF_INITIALIZED) { if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_uninit_toep(toep); tls_uninit_toep(toep); } free(toep, M_CXGBE); } /* * Set up the socket for TCP offload. */ void offload_socket(struct socket *so, struct toepcb *toep) { struct tom_data *td = toep->td; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct sockbuf *sb; INP_WLOCK_ASSERT(inp); /* Update socket */ sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; if (inp->inp_vflag & INP_IPV6) so->so_proto = &toe6_protosw; else so->so_proto = &toe_protosw; SOCKBUF_UNLOCK(sb); /* Update TCP PCB */ tp->tod = &td->tod; tp->t_toe = toep; tp->t_flags |= TF_TOE; /* Install an extra hold on inp */ toep->inp = inp; toep->flags |= TPF_ATTACHED; in_pcbref(inp); /* Add the TOE PCB to the active list */ mtx_lock(&td->toep_list_lock); TAILQ_INSERT_HEAD(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } void restore_so_proto(struct socket *so, bool v6) { if (v6) so->so_proto = &tcp6_protosw; else so->so_proto = &tcp_protosw; } /* This is _not_ the normal way to "unoffload" a socket. 
*/ void undo_offload_socket(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; struct tom_data *td = toep->td; struct sockbuf *sb; INP_WLOCK_ASSERT(inp); sb = &so->so_snd; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; SOCKBUF_UNLOCK(sb); sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_NOCOALESCE; restore_so_proto(so, inp->inp_vflag & INP_IPV6); SOCKBUF_UNLOCK(sb); tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->inp = NULL; toep->flags &= ~TPF_ATTACHED; if (in_pcbrele_wlocked(inp)) panic("%s: inp freed.", __func__); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); } static void release_offload_resources(struct toepcb *toep) { struct tom_data *td = toep->td; struct adapter *sc = td_adapter(td); int tid = toep->tid; KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: %p has CPL pending.", __func__, toep)); KASSERT(!(toep->flags & TPF_ATTACHED), ("%s: %p is still attached.", __func__, toep)); CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)", __func__, toep, tid, toep->l2te, toep->ce); /* * These queues should have been emptied at approximately the same time * that a normal connection's socket's so_snd would have been purged or * drained. Do _not_ clean up here. */ MPASS(mbufq_empty(&toep->ulp_pduq)); MPASS(mbufq_empty(&toep->ulp_pdu_reclaimq)); #ifdef INVARIANTS if (ulp_mode(toep) == ULP_MODE_TCPDDP) ddp_assert_empty(toep); #endif MPASS(TAILQ_EMPTY(&toep->aiotx_jobq)); if (toep->l2te) t4_l2t_release(toep->l2te); if (tid >= 0) { remove_tid(sc, tid, toep->ce ? 2 : 1); release_tid(sc, tid, toep->ctrlq); } if (toep->ce) t4_release_clip_entry(sc, toep->ce); if (toep->params.tc_idx != -1) t4_release_cl_rl(sc, toep->vi->pi->port_id, toep->params.tc_idx); mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); free_toepcb(toep); } /* * The kernel is done with the TCP PCB and this is our opportunity to unhook the * toepcb hanging off of it. If the TOE driver is also done with the toepcb (no * pending CPL) then it is time to release all resources tied to the toepcb. * * Also gets called when an offloaded active open fails and the TOM wants the * kernel to take the TCP PCB back. */ static void t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) { #if defined(KTR) || defined(INVARIANTS) struct inpcb *inp = tptoinpcb(tp); #endif struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(inp); KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); KASSERT(toep->flags & TPF_ATTACHED, ("%s: not attached", __func__)); #ifdef KTR if (tp->t_state == TCPS_SYN_SENT) { CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); } else { CTR6(KTR_CXGBE, "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)", toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp, inp->inp_flags); } #endif tp->tod = NULL; tp->t_toe = NULL; tp->t_flags &= ~TF_TOE; toep->flags &= ~TPF_ATTACHED; if (!(toep->flags & TPF_CPL_PENDING)) release_offload_resources(toep); } /* * setsockopt handler. 
*/ static void t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; if (dir == SOPT_GET) return; CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name); switch (name) { case TCP_NODELAY: if (tp->t_state != TCPS_ESTABLISHED) break; toep->params.nagle = tp->t_flags & TF_NODELAY ? 0 : 1; t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, V_TF_NAGLE(1), V_TF_NAGLE(toep->params.nagle), 0, 0); break; default: break; } } static inline uint64_t get_tcb_tflags(const uint64_t *tcb) { return ((be64toh(tcb[14]) << 32) | (be64toh(tcb[15]) >> 32)); } static inline uint32_t get_tcb_field(const uint64_t *tcb, u_int word, uint32_t mask, u_int shift) { #define LAST_WORD ((TCB_SIZE / 4) - 1) uint64_t t1, t2; int flit_idx; MPASS(mask != 0); MPASS(word <= LAST_WORD); MPASS(shift < 32); flit_idx = (LAST_WORD - word) / 2; if (word & 0x1) shift += 32; t1 = be64toh(tcb[flit_idx]) >> shift; t2 = 0; if (fls(mask) > 64 - shift) { /* * Will spill over into the next logical flit, which is the flit * before this one. The flit_idx before this one must be valid. */ MPASS(flit_idx > 0); t2 = be64toh(tcb[flit_idx - 1]) << (64 - shift); } return ((t2 | t1) & mask); #undef LAST_WORD } #define GET_TCB_FIELD(tcb, F) \ get_tcb_field(tcb, W_TCB_##F, M_TCB_##F, S_TCB_##F) /* * Issues a CPL_GET_TCB to read the entire TCB for the tid. */ static int send_get_tcb(struct adapter *sc, u_int tid) { struct cpl_get_tcb *cpl; struct wrq_cookie cookie; MPASS(tid >= sc->tids.tid_base); MPASS(tid - sc->tids.tid_base < sc->tids.ntids); cpl = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*cpl), 16), &cookie); if (__predict_false(cpl == NULL)) return (ENOMEM); bzero(cpl, sizeof(*cpl)); INIT_TP_WR(cpl, tid); OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_GET_TCB, tid)); cpl->reply_ctrl = htobe16(V_REPLY_CHAN(0) | V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id)); cpl->cookie = 0xff; commit_wrq_wr(&sc->sge.ctrlq[0], cpl, &cookie); return (0); } static struct tcb_histent * alloc_tcb_histent(struct adapter *sc, u_int tid, int flags) { struct tcb_histent *te; MPASS(flags == M_NOWAIT || flags == M_WAITOK); te = malloc(sizeof(*te), M_CXGBE, M_ZERO | flags); if (te == NULL) return (NULL); mtx_init(&te->te_lock, "TCB entry", NULL, MTX_DEF); callout_init_mtx(&te->te_callout, &te->te_lock, 0); te->te_adapter = sc; te->te_tid = tid; return (te); } static void free_tcb_histent(struct tcb_histent *te) { mtx_destroy(&te->te_lock); free(te, M_CXGBE); } /* * Start tracking the tid in the TCB history. 
*/ int add_tid_to_history(struct adapter *sc, u_int tid) { struct tcb_histent *te = NULL; struct tom_data *td = sc->tom_softc; int rc; MPASS(tid >= sc->tids.tid_base); MPASS(tid - sc->tids.tid_base < sc->tids.ntids); if (td->tcb_history == NULL) return (ENXIO); rw_wlock(&td->tcb_history_lock); if (td->tcb_history[tid] != NULL) { rc = EEXIST; goto done; } te = alloc_tcb_histent(sc, tid, M_NOWAIT); if (te == NULL) { rc = ENOMEM; goto done; } mtx_lock(&te->te_lock); rc = send_get_tcb(sc, tid); if (rc == 0) { te->te_flags |= TE_RPL_PENDING; td->tcb_history[tid] = te; } else { free(te, M_CXGBE); } mtx_unlock(&te->te_lock); done: rw_wunlock(&td->tcb_history_lock); return (rc); } static void remove_tcb_histent(struct tcb_histent *te) { struct adapter *sc = te->te_adapter; struct tom_data *td = sc->tom_softc; rw_assert(&td->tcb_history_lock, RA_WLOCKED); mtx_assert(&te->te_lock, MA_OWNED); MPASS(td->tcb_history[te->te_tid] == te); td->tcb_history[te->te_tid] = NULL; free_tcb_histent(te); rw_wunlock(&td->tcb_history_lock); } static inline struct tcb_histent * lookup_tcb_histent(struct adapter *sc, u_int tid, bool addrem) { struct tcb_histent *te; struct tom_data *td = sc->tom_softc; MPASS(tid >= sc->tids.tid_base); MPASS(tid - sc->tids.tid_base < sc->tids.ntids); if (td->tcb_history == NULL) return (NULL); if (addrem) rw_wlock(&td->tcb_history_lock); else rw_rlock(&td->tcb_history_lock); te = td->tcb_history[tid]; if (te != NULL) { mtx_lock(&te->te_lock); return (te); /* with both locks held */ } if (addrem) rw_wunlock(&td->tcb_history_lock); else rw_runlock(&td->tcb_history_lock); return (te); } static inline void release_tcb_histent(struct tcb_histent *te) { struct adapter *sc = te->te_adapter; struct tom_data *td = sc->tom_softc; mtx_assert(&te->te_lock, MA_OWNED); mtx_unlock(&te->te_lock); rw_assert(&td->tcb_history_lock, RA_RLOCKED); rw_runlock(&td->tcb_history_lock); } static void request_tcb(void *arg) { struct tcb_histent *te = arg; mtx_assert(&te->te_lock, MA_OWNED); /* Noone else is supposed to update the histent. */ MPASS(!(te->te_flags & TE_RPL_PENDING)); if (send_get_tcb(te->te_adapter, te->te_tid) == 0) te->te_flags |= TE_RPL_PENDING; else callout_schedule(&te->te_callout, hz / 100); } static void update_tcb_histent(struct tcb_histent *te, const uint64_t *tcb) { struct tom_data *td = te->te_adapter->tom_softc; uint64_t tflags = get_tcb_tflags(tcb); uint8_t sample = 0; if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != GET_TCB_FIELD(tcb, SND_UNA_RAW)) { if (GET_TCB_FIELD(tcb, T_RXTSHIFT) != 0) sample |= TS_RTO; if (GET_TCB_FIELD(tcb, T_DUPACKS) != 0) sample |= TS_DUPACKS; if (GET_TCB_FIELD(tcb, T_DUPACKS) >= td->dupack_threshold) sample |= TS_FASTREXMT; } if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != 0) { uint32_t snd_wnd; sample |= TS_SND_BACKLOGGED; /* for whatever reason. */ snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); if (tflags & V_TF_RECV_SCALE(1)) snd_wnd <<= GET_TCB_FIELD(tcb, RCV_SCALE); if (GET_TCB_FIELD(tcb, SND_CWND) < snd_wnd) sample |= TS_CWND_LIMITED; /* maybe due to CWND */ } if (tflags & V_TF_CCTRL_ECN(1)) { /* * CE marker on incoming IP hdr, echoing ECE back in the TCP * hdr. Indicates congestion somewhere on the way from the peer * to this node. */ if (tflags & V_TF_CCTRL_ECE(1)) sample |= TS_ECN_ECE; /* * ECE seen and CWR sent (or about to be sent). Might indicate * congestion on the way to the peer. This node is reducing its * congestion window in response. 
*/ if (tflags & (V_TF_CCTRL_CWR(1) | V_TF_CCTRL_RFR(1))) sample |= TS_ECN_CWR; } te->te_sample[te->te_pidx] = sample; if (++te->te_pidx == nitems(te->te_sample)) te->te_pidx = 0; memcpy(te->te_tcb, tcb, TCB_SIZE); te->te_flags |= TE_ACTIVE; } static int do_get_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_get_tcb_rpl *cpl = mtod(m, const void *); const uint64_t *tcb = (const uint64_t *)(const void *)(cpl + 1); struct tcb_histent *te; const u_int tid = GET_TID(cpl); bool remove; remove = GET_TCB_FIELD(tcb, T_STATE) == TCPS_CLOSED; te = lookup_tcb_histent(sc, tid, remove); if (te == NULL) { /* Not in the history. Who issued the GET_TCB for this? */ device_printf(sc->dev, "tcb %u: flags 0x%016jx, state %u, " "srtt %u, sscale %u, rscale %u, cookie 0x%x\n", tid, (uintmax_t)get_tcb_tflags(tcb), GET_TCB_FIELD(tcb, T_STATE), GET_TCB_FIELD(tcb, T_SRTT), GET_TCB_FIELD(tcb, SND_SCALE), GET_TCB_FIELD(tcb, RCV_SCALE), cpl->cookie); goto done; } MPASS(te->te_flags & TE_RPL_PENDING); te->te_flags &= ~TE_RPL_PENDING; if (remove) { remove_tcb_histent(te); } else { update_tcb_histent(te, tcb); callout_reset(&te->te_callout, hz / 10, request_tcb, te); release_tcb_histent(te); } done: m_freem(m); return (0); } static void fill_tcp_info_from_tcb(struct adapter *sc, uint64_t *tcb, struct tcp_info *ti) { uint32_t v; ti->tcpi_state = GET_TCB_FIELD(tcb, T_STATE); v = GET_TCB_FIELD(tcb, T_SRTT); ti->tcpi_rtt = tcp_ticks_to_us(sc, v); v = GET_TCB_FIELD(tcb, T_RTTVAR); ti->tcpi_rttvar = tcp_ticks_to_us(sc, v); ti->tcpi_snd_ssthresh = GET_TCB_FIELD(tcb, SND_SSTHRESH); ti->tcpi_snd_cwnd = GET_TCB_FIELD(tcb, SND_CWND); ti->tcpi_rcv_nxt = GET_TCB_FIELD(tcb, RCV_NXT); ti->tcpi_rcv_adv = GET_TCB_FIELD(tcb, RCV_ADV); ti->tcpi_dupacks = GET_TCB_FIELD(tcb, T_DUPACKS); v = GET_TCB_FIELD(tcb, TX_MAX); ti->tcpi_snd_nxt = v - GET_TCB_FIELD(tcb, SND_NXT_RAW); ti->tcpi_snd_una = v - GET_TCB_FIELD(tcb, SND_UNA_RAW); ti->tcpi_snd_max = v - GET_TCB_FIELD(tcb, SND_MAX_RAW); /* Receive window being advertised by us. */ ti->tcpi_rcv_wscale = GET_TCB_FIELD(tcb, SND_SCALE); /* Yes, SND. */ ti->tcpi_rcv_space = GET_TCB_FIELD(tcb, RCV_WND); /* Send window */ ti->tcpi_snd_wscale = GET_TCB_FIELD(tcb, RCV_SCALE); /* Yes, RCV. */ ti->tcpi_snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV); if (get_tcb_tflags(tcb) & V_TF_RECV_SCALE(1)) ti->tcpi_snd_wnd <<= ti->tcpi_snd_wscale; else ti->tcpi_snd_wscale = 0; } static void fill_tcp_info_from_history(struct adapter *sc, struct tcb_histent *te, struct tcp_info *ti) { fill_tcp_info_from_tcb(sc, te->te_tcb, ti); } /* * Reads the TCB for the given tid using a memory window and copies it to 'buf' * in the same format as CPL_GET_TCB_RPL. 
*/ static void read_tcb_using_memwin(struct adapter *sc, u_int tid, uint64_t *buf) { int i, j, k, rc; uint32_t addr; u_char *tcb, tmp; MPASS(tid >= sc->tids.tid_base); MPASS(tid - sc->tids.tid_base < sc->tids.ntids); addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE; rc = read_via_memwin(sc, 2, addr, (uint32_t *)buf, TCB_SIZE); if (rc != 0) return; tcb = (u_char *)buf; for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) { for (k = 0; k < 16; k++) { tmp = tcb[i + k]; tcb[i + k] = tcb[j + k]; tcb[j + k] = tmp; } } } static void fill_tcp_info(struct adapter *sc, u_int tid, struct tcp_info *ti) { uint64_t tcb[TCB_SIZE / sizeof(uint64_t)]; struct tcb_histent *te; ti->tcpi_toe_tid = tid; te = lookup_tcb_histent(sc, tid, false); if (te != NULL) { fill_tcp_info_from_history(sc, te, ti); release_tcb_histent(te); } else { if (!(sc->debug_flags & DF_DISABLE_TCB_CACHE)) { /* XXX: tell firmware to flush TCB cache. */ } read_tcb_using_memwin(sc, tid, tcb); fill_tcp_info_from_tcb(sc, tcb, ti); } } /* * Called by the kernel to allow the TOE driver to "refine" values filled up in * the tcp_info for an offloaded connection. */ static void t4_tcp_info(struct toedev *tod, const struct tcpcb *tp, struct tcp_info *ti) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = tp->t_toe; INP_LOCK_ASSERT(tptoinpcb(tp)); MPASS(ti != NULL); fill_tcp_info(sc, toep->tid, ti); } #ifdef KERN_TLS static int t4_alloc_tls_session(struct toedev *tod, struct tcpcb *tp, struct ktls_session *tls, int direction) { struct toepcb *toep = tp->t_toe; INP_WLOCK_ASSERT(tptoinpcb(tp)); MPASS(tls != NULL); return (tls_alloc_ktls(toep, tls, direction)); } #endif -/* SET_TCB_FIELD sent as a ULP command looks like this */ -#define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \ - sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core)) - -static void * -mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, uint64_t word, uint64_t mask, - uint64_t val, uint32_t tid) -{ - struct ulptx_idata *ulpsc; - struct cpl_set_tcb_field_core *req; - - ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); - ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16)); - - ulpsc = (struct ulptx_idata *)(ulpmc + 1); - ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); - ulpsc->len = htobe32(sizeof(*req)); - - req = (struct cpl_set_tcb_field_core *)(ulpsc + 1); - OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); - req->reply_ctrl = htobe16(V_NO_REPLY(1)); - req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0)); - req->mask = htobe64(mask); - req->val = htobe64(val); - - ulpsc = (struct ulptx_idata *)(req + 1); - if (LEN__SET_TCB_FIELD_ULP % 16) { - ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); - ulpsc->len = htobe32(0); - return (ulpsc + 1); - } - return (ulpsc); -} - static void send_mss_flowc_wr(struct adapter *sc, struct toepcb *toep) { struct wrq_cookie cookie; struct fw_flowc_wr *flowc; struct ofld_tx_sdesc *txsd; const int flowclen = sizeof(*flowc) + sizeof(struct fw_flowc_mnemval); const int flowclen16 = howmany(flowclen, 16); if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0) { CH_ERR(sc, "%s: tid %u out of tx credits (%d, %d).\n", __func__, toep->tid, toep->tx_credits, toep->txsd_avail); return; } flowc = start_wrq_wr(&toep->ofld_txq->wrq, flowclen16, &cookie); if (__predict_false(flowc == NULL)) { CH_ERR(sc, "ENOMEM in %s for tid %u.\n", __func__, toep->tid); return; } flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(1)); 
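	/*
	 * Single-parameter FLOWC: only the MSS mnemonic is refreshed here so
	 * that the firmware starts segmenting with the updated emss after a
	 * path MTU change.
	 */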
flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | V_FW_WR_FLOWID(toep->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_MSS; flowc->mnemval[0].val = htobe32(toep->params.emss); txsd = &toep->txsd[toep->txsd_pidx]; txsd->tx_credits = flowclen16; txsd->plen = 0; toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; commit_wrq_wr(&toep->ofld_txq->wrq, flowc, &cookie); } static void t4_pmtu_update(struct toedev *tod, struct tcpcb *tp, tcp_seq seq, int mtu) { struct work_request_hdr *wrh; struct ulp_txpkt *ulpmc; int idx, len; struct wrq_cookie cookie; struct inpcb *inp = tptoinpcb(tp); struct toepcb *toep = tp->t_toe; struct adapter *sc = td_adapter(toep->td); unsigned short *mtus = &sc->params.mtus[0]; INP_WLOCK_ASSERT(inp); MPASS(mtu > 0); /* kernel is supposed to provide something usable. */ /* tp->snd_una and snd_max are in host byte order too. */ seq = be32toh(seq); CTR6(KTR_CXGBE, "%s: tid %d, seq 0x%08x, mtu %u, mtu_idx %u (%d)", __func__, toep->tid, seq, mtu, toep->params.mtu_idx, mtus[toep->params.mtu_idx]); if (ulp_mode(toep) == ULP_MODE_NONE && /* XXX: Read TCB otherwise? */ (SEQ_LT(seq, tp->snd_una) || SEQ_GEQ(seq, tp->snd_max))) { CTR5(KTR_CXGBE, "%s: tid %d, seq 0x%08x not in range [0x%08x, 0x%08x).", __func__, toep->tid, seq, tp->snd_una, tp->snd_max); return; } /* Find the best mtu_idx for the suggested MTU. */ for (idx = 0; idx < NMTUS - 1 && mtus[idx + 1] <= mtu; idx++) continue; if (idx >= toep->params.mtu_idx) return; /* Never increase the PMTU (just like the kernel). */ /* * We'll send a compound work request with 2 SET_TCB_FIELDs -- the first * one updates the mtu_idx and the second one triggers a retransmit. */ len = sizeof(*wrh) + 2 * roundup2(LEN__SET_TCB_FIELD_ULP, 16); wrh = start_wrq_wr(toep->ctrlq, howmany(len, 16), &cookie); if (wrh == NULL) { CH_ERR(sc, "failed to change mtu_idx of tid %d (%u -> %u).\n", toep->tid, toep->params.mtu_idx, idx); return; } INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */ ulpmc = (struct ulp_txpkt *)(wrh + 1); - ulpmc = mk_set_tcb_field_ulp(ulpmc, W_TCB_T_MAXSEG, - V_TCB_T_MAXSEG(M_TCB_T_MAXSEG), V_TCB_T_MAXSEG(idx), toep->tid); - ulpmc = mk_set_tcb_field_ulp(ulpmc, W_TCB_TIMESTAMP, - V_TCB_TIMESTAMP(0x7FFFFULL << 11), 0, toep->tid); + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_T_MAXSEG, + V_TCB_T_MAXSEG(M_TCB_T_MAXSEG), V_TCB_T_MAXSEG(idx)); + ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_TIMESTAMP, + V_TCB_TIMESTAMP(0x7FFFFULL << 11), 0); commit_wrq_wr(toep->ctrlq, wrh, &cookie); /* Update the software toepcb and tcpcb. */ toep->params.mtu_idx = idx; tp->t_maxseg = mtus[toep->params.mtu_idx]; if (inp->inp_inc.inc_flags & INC_ISIPV6) tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr); toep->params.emss = tp->t_maxseg; if (tp->t_flags & TF_RCVD_TSTMP) toep->params.emss -= TCPOLEN_TSTAMP_APPA; /* Update the firmware flowc. */ send_mss_flowc_wr(sc, toep); /* Update the MTU in the kernel's hostcache. 
*/ if (sc->tt.update_hc_on_pmtu_change != 0) { struct in_conninfo inc = {0}; inc.inc_fibnum = inp->inp_inc.inc_fibnum; if (inp->inp_inc.inc_flags & INC_ISIPV6) { inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = inp->inp_inc.inc6_faddr; } else { inc.inc_faddr = inp->inp_inc.inc_faddr; } tcp_hc_updatemtu(&inc, mtu); } CTR6(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), t_maxseg %u, emss %u", __func__, toep->tid, toep->params.mtu_idx, mtus[toep->params.mtu_idx], tp->t_maxseg, toep->params.emss); } /* * The TOE driver will not receive any more CPLs for the tid associated with the * toepcb; release the hold on the inpcb. */ void final_cpl_received(struct toepcb *toep) { struct inpcb *inp = toep->inp; bool need_wakeup; KASSERT(inp != NULL, ("%s: inp is NULL", __func__)); INP_WLOCK_ASSERT(inp); KASSERT(toep->flags & TPF_CPL_PENDING, ("%s: CPL not pending already?", __func__)); CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)", __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); if (ulp_mode(toep) == ULP_MODE_TCPDDP) release_ddp_resources(toep); toep->inp = NULL; need_wakeup = (toep->flags & TPF_WAITING_FOR_FINAL) != 0; toep->flags &= ~(TPF_CPL_PENDING | TPF_WAITING_FOR_FINAL); mbufq_drain(&toep->ulp_pduq); mbufq_drain(&toep->ulp_pdu_reclaimq); if (!(toep->flags & TPF_ATTACHED)) release_offload_resources(toep); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); if (need_wakeup) { struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep); mtx_lock(lock); wakeup(toep); mtx_unlock(lock); } } void insert_tid(struct adapter *sc, int tid, void *ctx, int ntids) { struct tid_info *t = &sc->tids; MPASS(tid >= t->tid_base); MPASS(tid - t->tid_base < t->ntids); t->tid_tab[tid - t->tid_base] = ctx; atomic_add_int(&t->tids_in_use, ntids); } void * lookup_tid(struct adapter *sc, int tid) { struct tid_info *t = &sc->tids; return (t->tid_tab[tid - t->tid_base]); } void update_tid(struct adapter *sc, int tid, void *ctx) { struct tid_info *t = &sc->tids; t->tid_tab[tid - t->tid_base] = ctx; } void remove_tid(struct adapter *sc, int tid, int ntids) { struct tid_info *t = &sc->tids; t->tid_tab[tid - t->tid_base] = NULL; atomic_subtract_int(&t->tids_in_use, ntids); } /* * What mtu_idx to use, given a 4-tuple. Note that both s->mss and tcp_mssopt * have the MSS that we should advertise in our SYN. Advertised MSS doesn't * account for any TCP options so the effective MSS (only payload, no headers or * options) could be different. */ static int find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, struct offload_settings *s) { unsigned short *mtus = &sc->params.mtus[0]; int i, mss, mtu; MPASS(inc != NULL); mss = s->mss > 0 ? s->mss : tcp_mssopt(inc); if (inc->inc_flags & INC_ISIPV6) mtu = mss + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else mtu = mss + sizeof(struct ip) + sizeof(struct tcphdr); for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++) continue; return (i); } /* * Determine the receive window size for a socket. 
*/ u_long select_rcv_wnd(struct socket *so) { unsigned long wnd; SOCKBUF_LOCK_ASSERT(&so->so_rcv); wnd = sbspace(&so->so_rcv); if (wnd < MIN_RCV_WND) wnd = MIN_RCV_WND; return min(wnd, MAX_RCV_WND); } int select_rcv_wscale(void) { int wscale = 0; unsigned long space = sb_max; if (space > MAX_RCV_WND) space = MAX_RCV_WND; while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) wscale++; return (wscale); } __be64 calc_options0(struct vi_info *vi, struct conn_params *cp) { uint64_t opt0 = 0; opt0 |= F_TCAM_BYPASS; MPASS(cp->wscale >= 0 && cp->wscale <= M_WND_SCALE); opt0 |= V_WND_SCALE(cp->wscale); MPASS(cp->mtu_idx >= 0 && cp->mtu_idx < NMTUS); opt0 |= V_MSS_IDX(cp->mtu_idx); MPASS(cp->ulp_mode >= 0 && cp->ulp_mode <= M_ULP_MODE); opt0 |= V_ULP_MODE(cp->ulp_mode); MPASS(cp->opt0_bufsize >= 0 && cp->opt0_bufsize <= M_RCV_BUFSIZ); opt0 |= V_RCV_BUFSIZ(cp->opt0_bufsize); MPASS(cp->l2t_idx >= 0 && cp->l2t_idx < vi->adapter->vres.l2t.size); opt0 |= V_L2T_IDX(cp->l2t_idx); opt0 |= V_SMAC_SEL(vi->smt_idx); opt0 |= V_TX_CHAN(vi->pi->tx_chan); MPASS(cp->keepalive == 0 || cp->keepalive == 1); opt0 |= V_KEEP_ALIVE(cp->keepalive); MPASS(cp->nagle == 0 || cp->nagle == 1); opt0 |= V_NAGLE(cp->nagle); return (htobe64(opt0)); } __be32 calc_options2(struct vi_info *vi, struct conn_params *cp) { uint32_t opt2 = 0; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; /* * rx flow control, rx coalesce, congestion control, and tx pace are all * explicitly set by the driver. On T5+ the ISS is also set by the * driver to the value picked by the kernel. */ if (is_t4(sc)) { opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID; opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID; } else { opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */ opt2 |= F_T5_ISS; /* ISS provided in CPL */ } MPASS(cp->sack == 0 || cp->sack == 1); opt2 |= V_SACK_EN(cp->sack); MPASS(cp->tstamp == 0 || cp->tstamp == 1); opt2 |= V_TSTAMPS_EN(cp->tstamp); if (cp->wscale > 0) opt2 |= F_WND_SCALE_EN; MPASS(cp->ecn == 0 || cp->ecn == 1); opt2 |= V_CCTRL_ECN(cp->ecn); opt2 |= V_TX_QUEUE(TX_MODQ(pi->tx_chan)); opt2 |= V_PACE(0); opt2 |= F_RSS_QUEUE_VALID; opt2 |= V_RSS_QUEUE(sc->sge.ofld_rxq[cp->rxq_idx].iq.abs_id); if (chip_id(sc) <= CHELSIO_T6) { MPASS(pi->rx_chan == 0 || pi->rx_chan == 1); opt2 |= V_RX_CHANNEL(pi->rx_chan); } MPASS(cp->cong_algo >= 0 && cp->cong_algo <= M_CONG_CNTRL); opt2 |= V_CONG_CNTRL(cp->cong_algo); MPASS(cp->rx_coalesce == 0 || cp->rx_coalesce == 1); if (cp->rx_coalesce == 1) opt2 |= V_RX_COALESCE(M_RX_COALESCE); opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0); MPASS(cp->ulp_mode != ULP_MODE_TCPDDP); return (htobe32(opt2)); } uint64_t select_ntuple(struct vi_info *vi, struct l2t_entry *e) { struct adapter *sc = vi->adapter; struct tp_params *tp = &sc->params.tp; uint64_t ntuple = 0; /* * Initialize each of the fields which we care about which are present * in the Compressed Filter Tuple. */ if (tp->vlan_shift >= 0 && EVL_VLANOFTAG(e->vlan) != CPL_L2T_VLAN_NONE) ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift; if (tp->port_shift >= 0) ntuple |= (uint64_t)e->lport << tp->port_shift; if (tp->protocol_shift >= 0) ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift; if (tp->vnic_shift >= 0 && tp->vnic_mode == FW_VNIC_MODE_PF_VF) { ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vi->vin) | V_FT_VNID_ID_PF(sc->pf) | V_FT_VNID_ID_VLD(vi->vfvld)) << tp->vnic_shift; } if (is_t4(sc)) return (htobe32((uint32_t)ntuple)); else return (htobe64(V_FILTER_TUPLE(ntuple))); } /* * Initialize various connection parameters. 
*/ void init_conn_params(struct vi_info *vi , struct offload_settings *s, struct in_conninfo *inc, struct socket *so, const struct tcp_options *tcpopt, int16_t l2t_idx, struct conn_params *cp) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct tom_tunables *tt = &sc->tt; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); u_long wnd; u_int q_idx; MPASS(s->offload != 0); /* Congestion control algorithm */ if (s->cong_algo >= 0) cp->cong_algo = s->cong_algo & M_CONG_CNTRL; else if (sc->tt.cong_algorithm >= 0) cp->cong_algo = tt->cong_algorithm & M_CONG_CNTRL; else { struct cc_algo *cc = CC_ALGO(tp); if (strcasecmp(cc->name, "reno") == 0) cp->cong_algo = CONG_ALG_RENO; else if (strcasecmp(cc->name, "tahoe") == 0) cp->cong_algo = CONG_ALG_TAHOE; if (strcasecmp(cc->name, "newreno") == 0) cp->cong_algo = CONG_ALG_NEWRENO; if (strcasecmp(cc->name, "highspeed") == 0) cp->cong_algo = CONG_ALG_HIGHSPEED; else { /* * Use newreno in case the algorithm selected by the * host stack is not supported by the hardware. */ cp->cong_algo = CONG_ALG_NEWRENO; } } /* Tx traffic scheduling class. */ if (s->sched_class >= 0 && s->sched_class < sc->params.nsched_cls) cp->tc_idx = s->sched_class; else cp->tc_idx = -1; /* Nagle's algorithm. */ if (s->nagle >= 0) cp->nagle = s->nagle > 0 ? 1 : 0; else cp->nagle = tp->t_flags & TF_NODELAY ? 0 : 1; /* TCP Keepalive. */ if (V_tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE) cp->keepalive = 1; else cp->keepalive = 0; /* Optimization that's specific to T5 @ 40G. */ if (tt->tx_align >= 0) cp->tx_align = tt->tx_align > 0 ? 1 : 0; else if (chip_id(sc) == CHELSIO_T5 && (port_top_speed(pi) > 10 || sc->params.nports > 2)) cp->tx_align = 1; else cp->tx_align = 0; /* ULP mode. */ cp->ulp_mode = ULP_MODE_NONE; /* Rx coalescing. */ if (s->rx_coalesce >= 0) cp->rx_coalesce = s->rx_coalesce > 0 ? 1 : 0; else if (tt->rx_coalesce >= 0) cp->rx_coalesce = tt->rx_coalesce > 0 ? 1 : 0; else cp->rx_coalesce = 1; /* default */ /* * Index in the PMTU table. This controls the MSS that we announce in * our SYN initially, but after ESTABLISHED it controls the MSS that we * use to send data. */ cp->mtu_idx = find_best_mtu_idx(sc, inc, s); /* Tx queue for this connection. */ if (s->txq == QUEUE_RANDOM) q_idx = arc4random(); else if (s->txq == QUEUE_ROUNDROBIN) q_idx = atomic_fetchadd_int(&vi->txq_rr, 1); else q_idx = s->txq; cp->txq_idx = vi->first_ofld_txq + q_idx % vi->nofldtxq; /* Rx queue for this connection. */ if (s->rxq == QUEUE_RANDOM) q_idx = arc4random(); else if (s->rxq == QUEUE_ROUNDROBIN) q_idx = atomic_fetchadd_int(&vi->rxq_rr, 1); else q_idx = s->rxq; cp->rxq_idx = vi->first_ofld_rxq + q_idx % vi->nofldrxq; if (SOLISTENING(so)) { /* Passive open */ MPASS(tcpopt != NULL); /* TCP timestamp option */ if (tcpopt->tstamp && (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323))) cp->tstamp = 1; else cp->tstamp = 0; /* SACK */ if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_sack))) cp->sack = 1; else cp->sack = 0; /* Receive window scaling. */ if (tcpopt->wsf > 0 && tcpopt->wsf < 15 && V_tcp_do_rfc1323) cp->wscale = select_rcv_wscale(); else cp->wscale = 0; /* ECN */ if (tcpopt->ecn && /* XXX: review. 
*/ (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn))) cp->ecn = 1; else cp->ecn = 0; wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND); cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ); if (tt->sndbuf > 0) cp->sndbuf = tt->sndbuf; else if (so->sol_sbsnd_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) cp->sndbuf = 256 * 1024; else cp->sndbuf = so->sol_sbsnd_hiwat; } else { /* Active open */ /* TCP timestamp option */ if (s->tstamp > 0 || (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP))) cp->tstamp = 1; else cp->tstamp = 0; /* SACK */ if (s->sack > 0 || (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT))) cp->sack = 1; else cp->sack = 0; /* Receive window scaling */ if (tp->t_flags & TF_REQ_SCALE) cp->wscale = select_rcv_wscale(); else cp->wscale = 0; /* ECN */ if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1)) cp->ecn = 1; else cp->ecn = 0; SOCKBUF_LOCK(&so->so_rcv); wnd = max(select_rcv_wnd(so), MIN_RCV_WND); SOCKBUF_UNLOCK(&so->so_rcv); cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ); if (tt->sndbuf > 0) cp->sndbuf = tt->sndbuf; else { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) cp->sndbuf = 256 * 1024; else cp->sndbuf = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); } } cp->l2t_idx = l2t_idx; /* This will be initialized on ESTABLISHED. */ cp->emss = 0; } int negative_advice(int status) { return (status == CPL_ERR_RTX_NEG_ADVICE || status == CPL_ERR_PERSIST_NEG_ADVICE || status == CPL_ERR_KEEPALV_NEG_ADVICE); } static int alloc_tid_tab(struct tid_info *t, int flags) { MPASS(t->ntids > 0); MPASS(t->tid_tab == NULL); t->tid_tab = malloc(t->ntids * sizeof(*t->tid_tab), M_CXGBE, M_ZERO | flags); if (t->tid_tab == NULL) return (ENOMEM); atomic_store_rel_int(&t->tids_in_use, 0); return (0); } static void free_tid_tab(struct tid_info *t) { KASSERT(t->tids_in_use == 0, ("%s: %d tids still in use.", __func__, t->tids_in_use)); free(t->tid_tab, M_CXGBE); t->tid_tab = NULL; } static int alloc_stid_tab(struct tid_info *t, int flags) { MPASS(t->nstids > 0); MPASS(t->stid_tab == NULL); t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE, M_ZERO | flags); if (t->stid_tab == NULL) return (ENOMEM); mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF); t->stids_in_use = 0; TAILQ_INIT(&t->stids); t->nstids_free_head = t->nstids; return (0); } static void free_stid_tab(struct tid_info *t) { KASSERT(t->stids_in_use == 0, ("%s: %d tids still in use.", __func__, t->stids_in_use)); if (mtx_initialized(&t->stid_lock)) mtx_destroy(&t->stid_lock); free(t->stid_tab, M_CXGBE); t->stid_tab = NULL; } static void free_tid_tabs(struct tid_info *t) { free_tid_tab(t); free_stid_tab(t); } static int alloc_tid_tabs(struct tid_info *t) { int rc; rc = alloc_tid_tab(t, M_NOWAIT); if (rc != 0) goto failed; rc = alloc_stid_tab(t, M_NOWAIT); if (rc != 0) goto failed; return (0); failed: free_tid_tabs(t); return (rc); } static inline void alloc_tcb_history(struct adapter *sc, struct tom_data *td) { if (sc->tids.ntids == 0 || sc->tids.ntids > 1024) return; rw_init(&td->tcb_history_lock, "TCB history"); td->tcb_history = malloc(sc->tids.ntids * sizeof(*td->tcb_history), M_CXGBE, M_ZERO | M_NOWAIT); td->dupack_threshold = G_DUPACKTHRESH(t4_read_reg(sc, A_TP_PARA_REG0)); } static inline void free_tcb_history(struct adapter *sc, struct tom_data *td) { #ifdef INVARIANTS int i; if (td->tcb_history != NULL) { for (i = 0; i < sc->tids.ntids; i++) { MPASS(td->tcb_history[i] == NULL); } } #endif free(td->tcb_history, M_CXGBE); if (rw_initialized(&td->tcb_history_lock)) rw_destroy(&td->tcb_history_lock); 
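	/*
	 * Note: the rw_destroy() above is conditional because
	 * alloc_tcb_history() skips rw_init() (and leaves tcb_history NULL)
	 * when the adapter has no tids or more than 1024 of them.
	 */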

static void
free_tom_data(struct adapter *sc, struct tom_data *td)
{
        ASSERT_SYNCHRONIZED_OP(sc);

        KASSERT(TAILQ_EMPTY(&td->toep_list),
            ("%s: TOE PCB list is not empty.", __func__));
        KASSERT(td->lctx_count == 0,
            ("%s: lctx hash table is not empty.", __func__));

        t4_free_ppod_region(&td->pr);

        if (td->listen_mask != 0)
                hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);

        if (mtx_initialized(&td->unsent_wr_lock))
                mtx_destroy(&td->unsent_wr_lock);
        if (mtx_initialized(&td->lctx_hash_lock))
                mtx_destroy(&td->lctx_hash_lock);
        if (mtx_initialized(&td->toep_list_lock))
                mtx_destroy(&td->toep_list_lock);

        free_tcb_history(sc, td);
        free_tid_tabs(&sc->tids);
        free(td, M_CXGBE);
}

static char *
prepare_pkt(int open_type, uint16_t vtag, struct inpcb *inp, int *pktlen,
    int *buflen)
{
        char *pkt;
        struct tcphdr *th;
        int ipv6, len;
        const int maxlen =
            max(sizeof(struct ether_header), sizeof(struct ether_vlan_header)) +
            max(sizeof(struct ip), sizeof(struct ip6_hdr)) +
            sizeof(struct tcphdr);

        MPASS(open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN);

        pkt = malloc(maxlen, M_CXGBE, M_ZERO | M_NOWAIT);
        if (pkt == NULL)
                return (NULL);

        ipv6 = inp->inp_vflag & INP_IPV6;
        len = 0;

        if (EVL_VLANOFTAG(vtag) == 0xfff) {
                struct ether_header *eh = (void *)pkt;

                if (ipv6)
                        eh->ether_type = htons(ETHERTYPE_IPV6);
                else
                        eh->ether_type = htons(ETHERTYPE_IP);
                len += sizeof(*eh);
        } else {
                struct ether_vlan_header *evh = (void *)pkt;

                evh->evl_encap_proto = htons(ETHERTYPE_VLAN);
                evh->evl_tag = htons(vtag);
                if (ipv6)
                        evh->evl_proto = htons(ETHERTYPE_IPV6);
                else
                        evh->evl_proto = htons(ETHERTYPE_IP);
                len += sizeof(*evh);
        }

        if (ipv6) {
                struct ip6_hdr *ip6 = (void *)&pkt[len];

                ip6->ip6_vfc = IPV6_VERSION;
                ip6->ip6_plen = htons(sizeof(struct tcphdr));
                ip6->ip6_nxt = IPPROTO_TCP;
                if (open_type == OPEN_TYPE_ACTIVE) {
                        ip6->ip6_src = inp->in6p_laddr;
                        ip6->ip6_dst = inp->in6p_faddr;
                } else if (open_type == OPEN_TYPE_LISTEN) {
                        ip6->ip6_src = inp->in6p_laddr;
                        ip6->ip6_dst = ip6->ip6_src;
                }
                len += sizeof(*ip6);
        } else {
                struct ip *ip = (void *)&pkt[len];

                ip->ip_v = IPVERSION;
                ip->ip_hl = sizeof(*ip) >> 2;
                ip->ip_tos = inp->inp_ip_tos;
                ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr));
                ip->ip_ttl = inp->inp_ip_ttl;
                ip->ip_p = IPPROTO_TCP;
                if (open_type == OPEN_TYPE_ACTIVE) {
                        ip->ip_src = inp->inp_laddr;
                        ip->ip_dst = inp->inp_faddr;
                } else if (open_type == OPEN_TYPE_LISTEN) {
                        ip->ip_src = inp->inp_laddr;
                        ip->ip_dst = ip->ip_src;
                }
                len += sizeof(*ip);
        }

        th = (void *)&pkt[len];
        if (open_type == OPEN_TYPE_ACTIVE) {
                th->th_sport = inp->inp_lport;  /* network byte order already */
                th->th_dport = inp->inp_fport;  /* ditto */
        } else if (open_type == OPEN_TYPE_LISTEN) {
                th->th_sport = inp->inp_lport;  /* network byte order already */
                th->th_dport = th->th_sport;
        }
        len += sizeof(th);

        *pktlen = *buflen = len;
        return (pkt);
}
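
/*
 * Illustrative note (not part of the driver): for the common case of an
 * untagged IPv4 connection, the synthetic packet built above has a fixed
 * layout, so a COP rule's BPF program sees the classic libpcap offsets:
 *
 *      bytes  0-13    struct ether_header  (only ether_type is filled in)
 *      bytes 14-33    struct ip            (20 bytes, no options)
 *      bytes 34-...   struct tcphdr        (only the ports are filled in)
 *
 * i.e. the IP protocol byte is at offset 23, the source/destination
 * addresses at offsets 26/30, and the TCP source/destination ports at
 * 34/36.  With an 802.1Q tag everything after the Ethernet header shifts
 * by 4 bytes.
 */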

const struct offload_settings *
lookup_offload_policy(struct adapter *sc, int open_type, struct mbuf *m,
    uint16_t vtag, struct inpcb *inp)
{
        const struct t4_offload_policy *op;
        char *pkt;
        struct offload_rule *r;
        int i, matched, pktlen, buflen;
        static const struct offload_settings allow_offloading_settings = {
                .offload = 1,
                .rx_coalesce = -1,
                .cong_algo = -1,
                .sched_class = -1,
                .tstamp = -1,
                .sack = -1,
                .nagle = -1,
                .ecn = -1,
                .ddp = -1,
                .tls = -1,
                .txq = QUEUE_RANDOM,
                .rxq = QUEUE_RANDOM,
                .mss = -1,
        };
        static const struct offload_settings disallow_offloading_settings = {
                .offload = 0,
                /* rest is irrelevant when offload is off. */
        };

        rw_assert(&sc->policy_lock, RA_LOCKED);

        /*
         * If there's no Connection Offloading Policy attached to the device
         * then we need to return a default static policy.  If
         * "cop_managed_offloading" is true, then we need to disallow
         * offloading until a COP is attached to the device.  Otherwise we
         * allow offloading ...
         */
        op = sc->policy;
        if (op == NULL) {
                if (sc->tt.cop_managed_offloading)
                        return (&disallow_offloading_settings);
                else
                        return (&allow_offloading_settings);
        }

        switch (open_type) {
        case OPEN_TYPE_ACTIVE:
        case OPEN_TYPE_LISTEN:
                pkt = prepare_pkt(open_type, vtag, inp, &pktlen, &buflen);
                break;
        case OPEN_TYPE_PASSIVE:
                MPASS(m != NULL);
                pkt = mtod(m, char *);
                MPASS(*pkt == CPL_PASS_ACCEPT_REQ);
                pkt += sizeof(struct cpl_pass_accept_req);
                pktlen = m->m_pkthdr.len - sizeof(struct cpl_pass_accept_req);
                buflen = m->m_len - sizeof(struct cpl_pass_accept_req);
                break;
        default:
                MPASS(0);
                return (&disallow_offloading_settings);
        }

        if (pkt == NULL || pktlen == 0 || buflen == 0)
                return (&disallow_offloading_settings);

        matched = 0;
        r = &op->rule[0];
        for (i = 0; i < op->nrules; i++, r++) {
                if (r->open_type != open_type &&
                    r->open_type != OPEN_TYPE_DONTCARE) {
                        continue;
                }
                matched = bpf_filter(r->bpf_prog.bf_insns, pkt, pktlen, buflen);
                if (matched)
                        break;
        }

        if (open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN)
                free(pkt, M_CXGBE);

        return (matched ? &r->settings : &disallow_offloading_settings);
}
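
/*
 * Illustrative sketch (not part of the driver): each COP rule carries a
 * classic BPF program that bpf_filter() above runs against the synthetic
 * (or received) packet; a nonzero return value means the rule matched.
 * The programs are normally generated from a pcap-style expression by the
 * userspace tooling, but a hand-rolled "tcp dst port 3260" over the
 * untagged IPv4 layout described earlier would look roughly like this:
 */
#if 0
static const struct bpf_insn example_iscsi_match[] = {
        BPF_STMT(BPF_LD + BPF_H + BPF_ABS, 12),                  /* ether_type */
        BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ETHERTYPE_IP, 0, 5),
        BPF_STMT(BPF_LD + BPF_B + BPF_ABS, 23),                  /* IP proto */
        BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_TCP, 0, 3),
        BPF_STMT(BPF_LD + BPF_H + BPF_ABS, 36),                  /* TCP dport */
        BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 3260, 0, 1),
        BPF_STMT(BPF_RET + BPF_K, 0xffffffff),                   /* match */
        BPF_STMT(BPF_RET + BPF_K, 0),                            /* no match */
};
#endif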

static void
reclaim_wr_resources(void *arg, int count)
{
        struct tom_data *td = arg;
        STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list);
        struct cpl_act_open_req *cpl;
        u_int opcode, atid, tid;
        struct wrqe *wr;
        struct adapter *sc = td_adapter(td);

        mtx_lock(&td->unsent_wr_lock);
        STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe);
        mtx_unlock(&td->unsent_wr_lock);

        while ((wr = STAILQ_FIRST(&twr_list)) != NULL) {
                STAILQ_REMOVE_HEAD(&twr_list, link);

                cpl = wrtod(wr);
                opcode = GET_OPCODE(cpl);

                switch (opcode) {
                case CPL_ACT_OPEN_REQ:
                case CPL_ACT_OPEN_REQ6:
                        atid = G_TID_TID(be32toh(OPCODE_TID(cpl)));
                        CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid);
                        act_open_failure_cleanup(sc, atid, EHOSTUNREACH);
                        free(wr, M_CXGBE);
                        break;
                case CPL_PASS_ACCEPT_RPL:
                        tid = GET_TID(cpl);
                        CTR2(KTR_CXGBE, "%s: tid %u ", __func__, tid);
                        synack_failure_cleanup(sc, tid);
                        free(wr, M_CXGBE);
                        break;
                default:
                        log(LOG_ERR, "%s: leaked work request %p, wr_len %d, "
                            "opcode %x\n", __func__, wr, wr->wr_len, opcode);
                        /* WR not freed here; go look at it with a debugger. */
                }
        }
}

/*
 * Ground control to Major TOM
 * Commencing countdown, engines on
 */
static int
t4_tom_activate(struct adapter *sc)
{
        struct tom_data *td;
        struct toedev *tod;
        struct vi_info *vi;
        int i, rc, v;

        ASSERT_SYNCHRONIZED_OP(sc);

        /* per-adapter softc for TOM */
        td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
        if (td == NULL)
                return (ENOMEM);

        /* List of TOE PCBs and associated lock */
        mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
        TAILQ_INIT(&td->toep_list);

        /* Listen context */
        mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
        td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
            &td->listen_mask, HASH_NOWAIT);

        /* List of WRs for which L2 resolution failed */
        mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF);
        STAILQ_INIT(&td->unsent_wr_list);
        TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td);

        /* TID tables */
        rc = alloc_tid_tabs(&sc->tids);
        if (rc != 0)
                goto done;

        rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp,
            t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods");
        if (rc != 0)
                goto done;
        t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK,
            V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask);

        alloc_tcb_history(sc, td);

        /* toedev ops */
        tod = &td->tod;
        init_toedev(tod);
        tod->tod_softc = sc;
        tod->tod_connect = t4_connect;
        tod->tod_listen_start = t4_listen_start;
        tod->tod_listen_stop = t4_listen_stop;
        tod->tod_rcvd = t4_rcvd;
        tod->tod_output = t4_tod_output;
        tod->tod_send_rst = t4_send_rst;
        tod->tod_send_fin = t4_send_fin;
        tod->tod_pcb_detach = t4_pcb_detach;
        tod->tod_l2_update = t4_l2_update;
        tod->tod_syncache_added = t4_syncache_added;
        tod->tod_syncache_removed = t4_syncache_removed;
        tod->tod_syncache_respond = t4_syncache_respond;
        tod->tod_offload_socket = t4_offload_socket;
        tod->tod_ctloutput = t4_ctloutput;
        tod->tod_tcp_info = t4_tcp_info;
#ifdef KERN_TLS
        tod->tod_alloc_tls_session = t4_alloc_tls_session;
#endif
        tod->tod_pmtu_update = t4_pmtu_update;

        for_each_port(sc, i) {
                for_each_vi(sc->port[i], v, vi) {
                        SETTOEDEV(vi->ifp, &td->tod);
                }
        }

        sc->tom_softc = td;
        register_toedev(sc->tom_softc);

done:
        if (rc != 0)
                free_tom_data(sc, td);
        return (rc);
}

static int
t4_tom_deactivate(struct adapter *sc)
{
        int rc = 0;
        struct tom_data *td = sc->tom_softc;

        ASSERT_SYNCHRONIZED_OP(sc);

        if (td == NULL)
                return (0);     /* XXX. KASSERT? */

        if (sc->offload_map != 0)
                return (EBUSY); /* at least one port has IFCAP_TOE enabled */

        if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI))
                return (EBUSY); /* both iWARP and iSCSI rely on the TOE. */

        mtx_lock(&td->toep_list_lock);
        if (!TAILQ_EMPTY(&td->toep_list))
                rc = EBUSY;
        mtx_unlock(&td->toep_list_lock);

        mtx_lock(&td->lctx_hash_lock);
        if (td->lctx_count > 0)
                rc = EBUSY;
        mtx_unlock(&td->lctx_hash_lock);

        taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources);
        mtx_lock(&td->unsent_wr_lock);
        if (!STAILQ_EMPTY(&td->unsent_wr_list))
                rc = EBUSY;
        mtx_unlock(&td->unsent_wr_lock);

        if (rc == 0) {
                unregister_toedev(sc->tom_softc);
                free_tom_data(sc, td);
                sc->tom_softc = NULL;
        }

        return (rc);
}

static int
t4_ctloutput_tom(struct socket *so, struct sockopt *sopt)
{
        struct tcpcb *tp = sototcpcb(so);
        struct toepcb *toep = tp->t_toe;
        int error, optval;

        if (sopt->sopt_level == IPPROTO_TCP && sopt->sopt_name == TCP_USE_DDP) {
                if (sopt->sopt_dir != SOPT_SET)
                        return (EOPNOTSUPP);

                if (sopt->sopt_td != NULL) {
                        /* Only settable by the kernel. */
                        return (EPERM);
                }

                error = sooptcopyin(sopt, &optval, sizeof(optval),
                    sizeof(optval));
                if (error != 0)
                        return (error);

                if (optval != 0)
                        return (t4_enable_ddp_rcv(so, toep));
                else
                        return (EOPNOTSUPP);
        }
        return (tcp_ctloutput(so, sopt));
}

static int
t4_aio_queue_tom(struct socket *so, struct kaiocb *job)
{
        struct tcpcb *tp = sototcpcb(so);
        struct toepcb *toep = tp->t_toe;
        int error;

        /*
         * No lock is needed as TOE sockets never change between
         * active and passive.
         */
        if (SOLISTENING(so))
                return (EINVAL);

        if (ulp_mode(toep) == ULP_MODE_TCPDDP ||
            ulp_mode(toep) == ULP_MODE_NONE) {
                error = t4_aio_queue_ddp(so, job);
                if (error != EOPNOTSUPP)
                        return (error);
        }

        return (t4_aio_queue_aiotx(so, job));
}

static int
t4_tom_mod_load(void)
{
        /* CPL handlers */
        t4_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
        t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl2,
            CPL_COOKIE_TOM);
        t4_init_connect_cpl_handlers();
        t4_init_listen_cpl_handlers();
        t4_init_cpl_io_handlers();

        t4_ddp_mod_load();
        t4_tls_mod_load();

        bcopy(&tcp_protosw, &toe_protosw, sizeof(toe_protosw));
        toe_protosw.pr_ctloutput = t4_ctloutput_tom;
        toe_protosw.pr_aio_queue = t4_aio_queue_tom;

        bcopy(&tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw));
        toe6_protosw.pr_ctloutput = t4_ctloutput_tom;
        toe6_protosw.pr_aio_queue = t4_aio_queue_tom;

        return (t4_register_uld(&tom_uld_info));
}

static void
tom_uninit(struct adapter *sc, void *arg __unused)
{
        if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun"))
                return;

        /* Try to free resources (works only if no port has IFCAP_TOE) */
        if (uld_active(sc, ULD_TOM))
                t4_deactivate_uld(sc, ULD_TOM);

        end_synchronized_op(sc, 0);
}

static int
t4_tom_mod_unload(void)
{
        t4_iterate(tom_uninit, NULL);

        if (t4_unregister_uld(&tom_uld_info) == EBUSY)
                return (EBUSY);

        t4_tls_mod_unload();
        t4_ddp_mod_unload();

        t4_uninit_connect_cpl_handlers();
        t4_uninit_listen_cpl_handlers();
        t4_uninit_cpl_io_handlers();

        t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, NULL, CPL_COOKIE_TOM);
        t4_register_cpl_handler(CPL_GET_TCB_RPL, NULL);

        return (0);
}
#endif  /* TCP_OFFLOAD */

static int
t4_tom_modevent(module_t mod, int cmd, void *arg)
{
        int rc = 0;

#ifdef TCP_OFFLOAD
        switch (cmd) {
        case MOD_LOAD:
                rc = t4_tom_mod_load();
                break;
        case MOD_UNLOAD:
                rc = t4_tom_mod_unload();
                break;
        default:
                rc = EINVAL;
        }
#else
        printf("t4_tom: compiled without TCP_OFFLOAD support.\n");
        rc = EOPNOTSUPP;
#endif

        return (rc);
}

static moduledata_t t4_tom_moddata = {
        "t4_tom",
        t4_tom_modevent,
        0
};

MODULE_VERSION(t4_tom, 1);
MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
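
/*
 * Illustrative sketch (not part of the driver): a Connection Offload Policy
 * arrives from userspace as a struct t4_offload_policy, i.e. an ordered
 * array of rules that lookup_offload_policy() walks.  Reusing the example
 * BPF program from earlier, a minimal hand-built policy that offloads only
 * matching iSCSI connections (connections that match no rule are left
 * un-offloaded) might look roughly like this; the exact structure layout is
 * assumed from the field accesses above and from t4_ioctl.h.
 */
#if 0
static struct offload_rule example_rules[] = {
        {
                .open_type = OPEN_TYPE_DONTCARE, /* active, passive, or listen */
                .bpf_prog = {
                        .bf_len = nitems(example_iscsi_match),
                        .bf_insns = __DECONST(struct bpf_insn *,
                            example_iscsi_match),
                },
                .settings = {
                        .offload = 1,           /* offload matching connections */
                        .rx_coalesce = -1, .cong_algo = -1, .sched_class = -1,
                        .tstamp = -1, .sack = -1, .nagle = -1, .ecn = -1,
                        .ddp = -1, .tls = -1, .mss = -1,
                        .txq = QUEUE_RANDOM, .rxq = QUEUE_RANDOM,
                },
        },
};
#endif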