diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h index d24383f4c927..f4a048a6d2ca 100644 --- a/sys/dev/mlx5/mlx5_en/en.h +++ b/sys/dev/mlx5/mlx5_en/en.h @@ -1,1314 +1,1314 @@ /*- * Copyright (c) 2015-2021 Mellanox Technologies. All rights reserved. * Copyright (c) 2022 NVIDIA corporation & affiliates. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MLX5_EN_H_ #define _MLX5_EN_H_ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RSS #include #include #endif #include #include #include #include #include #include #include #include #include #include #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) #define MLX5E_MAX_PRIORITY 8 #define MLX5E_MAX_FEC_10X_25X 4 #define MLX5E_MAX_FEC_50X 4 /* IEEE 802.1Qaz standard supported values */ #define IEEE_8021QAZ_MAX_TCS 8 #define MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE 0x7 #define MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE 0xa #define MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE 0xe #define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE 0x7 #define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE 0xa #define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE 0xe #define MLX5E_MAX_BUSDMA_RX_SEGS 15 #ifndef MLX5E_MAX_RX_BYTES #define MLX5E_MAX_RX_BYTES MCLBYTES #endif #define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ \ MIN(65535, 7 * MLX5E_MAX_RX_BYTES) #define MLX5E_DIM_DEFAULT_PROFILE 3 #define MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO 16 #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC 0x10 #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE 0x3 #define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS 0x20 #define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC 0x10 #define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS 0x20 #define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES 0x80 #define MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ 0x7 #define MLX5E_CACHELINE_SIZE CACHE_LINE_SIZE #define MLX5E_HW2SW_MTU(hwmtu) \ ((hwmtu) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + ETHER_CRC_LEN)) #define MLX5E_SW2HW_MTU(swmtu) \ ((swmtu) + (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + ETHER_CRC_LEN)) #define MLX5E_SW2MB_MTU(swmtu) \ (MLX5E_SW2HW_MTU(swmtu) + MLX5E_NET_IP_ALIGN) #define MLX5E_MTU_MIN 72 /* Min MTU allowed by the kernel */ #define MLX5E_MTU_MAX MIN(ETHERMTU_JUMBO, MJUM16BYTES) /* Max MTU of Ethernet * jumbo frames */ #define MLX5E_BUDGET_MAX 8192 /* RX and TX */ #define MLX5E_RX_BUDGET_MAX 256 #define MLX5E_SQ_BF_BUDGET 16 #define MLX5E_SQ_TX_QUEUE_SIZE 4096 /* SQ drbr queue size */ #define MLX5E_MAX_TX_NUM_TC 8 /* units */ #define MLX5E_MAX_TX_HEADER 192 /* bytes */ #define MLX5E_MAX_TX_PAYLOAD_SIZE 65536 /* bytes */ #define MLX5E_MAX_TX_MBUF_SIZE 65536 /* bytes */ #define MLX5E_MAX_TX_MBUF_FRAGS \ ((MLX5_SEND_WQE_MAX_WQEBBS * MLX5_SEND_WQEBB_NUM_DS) - \ (MLX5E_MAX_TX_HEADER / MLX5_SEND_WQE_DS) - \ 1 /* the maximum value of the DS counter is 0x3F and not 0x40 */) /* units */ #define MLX5E_MAX_TX_INLINE \ (MLX5E_MAX_TX_HEADER - sizeof(struct mlx5e_tx_wqe) + \ sizeof(((struct mlx5e_tx_wqe *)0)->eth.inline_hdr_start)) /* bytes */ #define MLX5E_100MB (100000) #define MLX5E_1GB (1000000) #define MLX5E_ZERO(ptr, field) \ memset(&(ptr)->field, 0, \ sizeof(*(ptr)) - __offsetof(__typeof(*(ptr)), field)) MALLOC_DECLARE(M_MLX5EN); struct mlx5_core_dev; struct mlx5e_cq; typedef void (mlx5e_cq_comp_t)(struct mlx5_core_cq *, struct mlx5_eqe *); #define mlx5_en_err(_dev, format, ...) \ if_printf(_dev, "ERR: ""%s:%d:(pid %d): " format, \ __func__, __LINE__, curthread->td_proc->p_pid, \ ##__VA_ARGS__) #define mlx5_en_warn(_dev, format, ...) \ if_printf(_dev, "WARN: ""%s:%d:(pid %d): " format, \ __func__, __LINE__, curthread->td_proc->p_pid, \ ##__VA_ARGS__) #define mlx5_en_info(_dev, format, ...) \ if_printf(_dev, "INFO: ""%s:%d:(pid %d): " format, \ __func__, __LINE__, curthread->td_proc->p_pid, \ ##__VA_ARGS__) #define MLX5E_STATS_COUNT(a, ...) a #define MLX5E_STATS_VAR(a, b, c, ...) b c; #define MLX5E_STATS_COUNTER(a, b, c, ...) counter_##b##_t c; #define MLX5E_STATS_DESC(a, b, c, d, e, ...) d, e, #define MLX5E_VPORT_STATS(m) \ /* HW counters */ \ m(+1, u64, rx_packets, "rx_packets", "Received packets") \ m(+1, u64, rx_bytes, "rx_bytes", "Received bytes") \ m(+1, u64, tx_packets, "tx_packets", "Transmitted packets") \ m(+1, u64, tx_bytes, "tx_bytes", "Transmitted bytes") \ m(+1, u64, rx_error_packets, "rx_error_packets", "Received error packets") \ m(+1, u64, rx_error_bytes, "rx_error_bytes", "Received error bytes") \ m(+1, u64, tx_error_packets, "tx_error_packets", "Transmitted error packets") \ m(+1, u64, tx_error_bytes, "tx_error_bytes", "Transmitted error bytes") \ m(+1, u64, rx_unicast_packets, "rx_unicast_packets", "Received unicast packets") \ m(+1, u64, rx_unicast_bytes, "rx_unicast_bytes", "Received unicast bytes") \ m(+1, u64, tx_unicast_packets, "tx_unicast_packets", "Transmitted unicast packets") \ m(+1, u64, tx_unicast_bytes, "tx_unicast_bytes", "Transmitted unicast bytes") \ m(+1, u64, rx_multicast_packets, "rx_multicast_packets", "Received multicast packets") \ m(+1, u64, rx_multicast_bytes, "rx_multicast_bytes", "Received multicast bytes") \ m(+1, u64, tx_multicast_packets, "tx_multicast_packets", "Transmitted multicast packets") \ m(+1, u64, tx_multicast_bytes, "tx_multicast_bytes", "Transmitted multicast bytes") \ m(+1, u64, rx_broadcast_packets, "rx_broadcast_packets", "Received broadcast packets") \ m(+1, u64, rx_broadcast_bytes, "rx_broadcast_bytes", "Received broadcast bytes") \ m(+1, u64, tx_broadcast_packets, "tx_broadcast_packets", "Transmitted broadcast packets") \ m(+1, u64, tx_broadcast_bytes, "tx_broadcast_bytes", "Transmitted broadcast bytes") \ m(+1, u64, rx_out_of_buffer, "rx_out_of_buffer", "Receive out of buffer, no recv wqes events") \ /* SW counters */ \ m(+1, u64, tso_packets, "tso_packets", "Transmitted TSO packets") \ m(+1, u64, tso_bytes, "tso_bytes", "Transmitted TSO bytes") \ m(+1, u64, lro_packets, "lro_packets", "Received LRO packets") \ m(+1, u64, lro_bytes, "lro_bytes", "Received LRO bytes") \ m(+1, u64, sw_lro_queued, "sw_lro_queued", "Packets queued for SW LRO") \ m(+1, u64, sw_lro_flushed, "sw_lro_flushed", "Packets flushed from SW LRO") \ m(+1, u64, rx_csum_good, "rx_csum_good", "Received checksum valid packets") \ m(+1, u64, rx_csum_none, "rx_csum_none", "Received no checksum packets") \ m(+1, u64, tx_csum_offload, "tx_csum_offload", "Transmit checksum offload packets") \ m(+1, u64, tx_queue_dropped, "tx_queue_dropped", "Transmit queue dropped") \ m(+1, u64, tx_defragged, "tx_defragged", "Transmit queue defragged") \ m(+1, u64, rx_wqe_err, "rx_wqe_err", "Receive WQE errors") \ m(+1, u64, tx_jumbo_packets, "tx_jumbo_packets", "TX packets greater than 1518 octets") \ m(+1, u64, rx_steer_missed_packets, "rx_steer_missed_packets", "RX packets dropped by steering rule(s)") \ m(+1, u64, rx_decrypted_ok_packets, "rx_decrypted_ok_packets", "RX packets successfully decrypted by steering rule(s)") \ m(+1, u64, rx_decrypted_error_packets, "rx_decrypted_error_packets", "RX packets not decrypted by steering rule(s)") #define MLX5E_VPORT_STATS_NUM (0 MLX5E_VPORT_STATS(MLX5E_STATS_COUNT)) struct mlx5e_vport_stats { struct sysctl_ctx_list ctx; u64 arg [0]; MLX5E_VPORT_STATS(MLX5E_STATS_VAR) }; #define MLX5E_PPORT_IEEE802_3_STATS(m) \ m(+1, u64, frames_tx, "frames_tx", "Frames transmitted") \ m(+1, u64, frames_rx, "frames_rx", "Frames received") \ m(+1, u64, check_seq_err, "check_seq_err", "Sequence errors") \ m(+1, u64, alignment_err, "alignment_err", "Alignment errors") \ m(+1, u64, octets_tx, "octets_tx", "Bytes transmitted") \ m(+1, u64, octets_received, "octets_received", "Bytes received") \ m(+1, u64, multicast_xmitted, "multicast_xmitted", "Multicast transmitted") \ m(+1, u64, broadcast_xmitted, "broadcast_xmitted", "Broadcast transmitted") \ m(+1, u64, multicast_rx, "multicast_rx", "Multicast received") \ m(+1, u64, broadcast_rx, "broadcast_rx", "Broadcast received") \ m(+1, u64, in_range_len_errors, "in_range_len_errors", "In range length errors") \ m(+1, u64, out_of_range_len, "out_of_range_len", "Out of range length errors") \ m(+1, u64, too_long_errors, "too_long_errors", "Too long errors") \ m(+1, u64, symbol_err, "symbol_err", "Symbol errors") \ m(+1, u64, mac_control_tx, "mac_control_tx", "MAC control transmitted") \ m(+1, u64, mac_control_rx, "mac_control_rx", "MAC control received") \ m(+1, u64, unsupported_op_rx, "unsupported_op_rx", "Unsupported operation received") \ m(+1, u64, pause_ctrl_rx, "pause_ctrl_rx", "Pause control received") \ m(+1, u64, pause_ctrl_tx, "pause_ctrl_tx", "Pause control transmitted") #define MLX5E_PPORT_RFC2819_STATS(m) \ m(+1, u64, drop_events, "drop_events", "Dropped events") \ m(+1, u64, octets, "octets", "Octets") \ m(+1, u64, pkts, "pkts", "Packets") \ m(+1, u64, broadcast_pkts, "broadcast_pkts", "Broadcast packets") \ m(+1, u64, multicast_pkts, "multicast_pkts", "Multicast packets") \ m(+1, u64, crc_align_errors, "crc_align_errors", "CRC alignment errors") \ m(+1, u64, undersize_pkts, "undersize_pkts", "Undersized packets") \ m(+1, u64, oversize_pkts, "oversize_pkts", "Oversized packets") \ m(+1, u64, fragments, "fragments", "Fragments") \ m(+1, u64, jabbers, "jabbers", "Jabbers") \ m(+1, u64, collisions, "collisions", "Collisions") #define MLX5E_PPORT_RFC2819_STATS_DEBUG(m) \ m(+1, u64, p64octets, "p64octets", "Bytes") \ m(+1, u64, p65to127octets, "p65to127octets", "Bytes") \ m(+1, u64, p128to255octets, "p128to255octets", "Bytes") \ m(+1, u64, p256to511octets, "p256to511octets", "Bytes") \ m(+1, u64, p512to1023octets, "p512to1023octets", "Bytes") \ m(+1, u64, p1024to1518octets, "p1024to1518octets", "Bytes") \ m(+1, u64, p1519to2047octets, "p1519to2047octets", "Bytes") \ m(+1, u64, p2048to4095octets, "p2048to4095octets", "Bytes") \ m(+1, u64, p4096to8191octets, "p4096to8191octets", "Bytes") \ m(+1, u64, p8192to10239octets, "p8192to10239octets", "Bytes") #define MLX5E_PPORT_RFC2863_STATS_DEBUG(m) \ m(+1, u64, in_octets, "in_octets", "In octets") \ m(+1, u64, in_ucast_pkts, "in_ucast_pkts", "In unicast packets") \ m(+1, u64, in_discards, "in_discards", "In discards") \ m(+1, u64, in_errors, "in_errors", "In errors") \ m(+1, u64, in_unknown_protos, "in_unknown_protos", "In unknown protocols") \ m(+1, u64, out_octets, "out_octets", "Out octets") \ m(+1, u64, out_ucast_pkts, "out_ucast_pkts", "Out unicast packets") \ m(+1, u64, out_discards, "out_discards", "Out discards") \ m(+1, u64, out_errors, "out_errors", "Out errors") \ m(+1, u64, in_multicast_pkts, "in_multicast_pkts", "In multicast packets") \ m(+1, u64, in_broadcast_pkts, "in_broadcast_pkts", "In broadcast packets") \ m(+1, u64, out_multicast_pkts, "out_multicast_pkts", "Out multicast packets") \ m(+1, u64, out_broadcast_pkts, "out_broadcast_pkts", "Out broadcast packets") #define MLX5E_PPORT_ETHERNET_EXTENDED_STATS_DEBUG(m) \ m(+1, u64, port_transmit_wait, "port_transmit_wait", "Port transmit wait") \ m(+1, u64, ecn_marked, "ecn_marked", "ECN marked") \ m(+1, u64, no_buffer_discard_mc, "no_buffer_discard_mc", "No buffer discard mc") \ m(+1, u64, rx_ebp, "rx_ebp", "RX EBP") \ m(+1, u64, tx_ebp, "tx_ebp", "TX EBP") \ m(+1, u64, rx_buffer_almost_full, "rx_buffer_almost_full", "RX buffer almost full") \ m(+1, u64, rx_buffer_full, "rx_buffer_full", "RX buffer full") \ m(+1, u64, rx_icrc_encapsulated, "rx_icrc_encapsulated", "RX ICRC encapsulated") \ m(+1, u64, ex_reserved_0, "ex_reserved_0", "Reserved") \ m(+1, u64, ex_reserved_1, "ex_reserved_1", "Reserved") \ m(+1, u64, tx_stat_p64octets, "tx_stat_p64octets", "Bytes") \ m(+1, u64, tx_stat_p65to127octets, "tx_stat_p65to127octets", "Bytes") \ m(+1, u64, tx_stat_p128to255octets, "tx_stat_p128to255octets", "Bytes") \ m(+1, u64, tx_stat_p256to511octets, "tx_stat_p256to511octets", "Bytes") \ m(+1, u64, tx_stat_p512to1023octets, "tx_stat_p512to1023octets", "Bytes") \ m(+1, u64, tx_stat_p1024to1518octets, "tx_stat_p1024to1518octets", "Bytes") \ m(+1, u64, tx_stat_p1519to2047octets, "tx_stat_p1519to2047octets", "Bytes") \ m(+1, u64, tx_stat_p2048to4095octets, "tx_stat_p2048to4095octets", "Bytes") \ m(+1, u64, tx_stat_p4096to8191octets, "tx_stat_p4096to8191octets", "Bytes") \ m(+1, u64, tx_stat_p8192to10239octets, "tx_stat_p8192to10239octets", "Bytes") #define MLX5E_PPORT_STATISTICAL_DEBUG(m) \ m(+1, u64, phy_time_since_last_clear, "phy_time_since_last_clear", \ "Time since last clear in milliseconds") \ m(+1, u64, phy_received_bits, "phy_received_bits", \ "Total amount of traffic received in bits before error correction") \ m(+1, u64, phy_symbol_errors, "phy_symbol_errors", \ "Total number of symbol errors before error correction") \ m(+1, u64, phy_corrected_bits, "phy_corrected_bits", \ "Total number of corrected bits ") \ m(+1, u64, phy_corrected_bits_lane0, "phy_corrected_bits_lane0", \ "Total number of corrected bits for lane 0") \ m(+1, u64, phy_corrected_bits_lane1, "phy_corrected_bits_lane1", \ "Total number of corrected bits for lane 1") \ m(+1, u64, phy_corrected_bits_lane2, "phy_corrected_bits_lane2", \ "Total number of corrected bits for lane 2") \ m(+1, u64, phy_corrected_bits_lane3, "phy_corrected_bits_lane3", \ "Total number of corrected bits for lane 3") #define MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG(m) \ m(+1, u64, time_since_last_clear, "time_since_last_clear", \ "Time since the last counters clear event (msec)") \ m(+1, u64, symbol_errors, "symbol_errors", "Symbol errors") \ m(+1, u64, sync_headers_errors, "sync_headers_errors", \ "Sync header error counter") \ m(+1, u64, bip_errors_lane0, "edpl_bip_errors_lane0", \ "Indicates the number of PRBS errors on lane 0") \ m(+1, u64, bip_errors_lane1, "edpl_bip_errors_lane1", \ "Indicates the number of PRBS errors on lane 1") \ m(+1, u64, bip_errors_lane2, "edpl_bip_errors_lane2", \ "Indicates the number of PRBS errors on lane 2") \ m(+1, u64, bip_errors_lane3, "edpl_bip_errors_lane3", \ "Indicates the number of PRBS errors on lane 3") \ m(+1, u64, fc_corrected_blocks_lane0, "fc_corrected_blocks_lane0", \ "FEC correctable block counter lane 0") \ m(+1, u64, fc_corrected_blocks_lane1, "fc_corrected_blocks_lane1", \ "FEC correctable block counter lane 1") \ m(+1, u64, fc_corrected_blocks_lane2, "fc_corrected_blocks_lane2", \ "FEC correctable block counter lane 2") \ m(+1, u64, fc_corrected_blocks_lane3, "fc_corrected_blocks_lane3", \ "FEC correctable block counter lane 3") \ m(+1, u64, rs_corrected_blocks, "rs_corrected_blocks", \ "FEC correcable block counter") \ m(+1, u64, rs_uncorrectable_blocks, "rs_uncorrectable_blocks", \ "FEC uncorrecable block counter") \ m(+1, u64, rs_no_errors_blocks, "rs_no_errors_blocks", \ "The number of RS-FEC blocks received that had no errors") \ m(+1, u64, rs_single_error_blocks, "rs_single_error_blocks", \ "The number of corrected RS-FEC blocks received that had" \ "exactly 1 error symbol") \ m(+1, u64, rs_corrected_symbols_total, "rs_corrected_symbols_total", \ "Port FEC corrected symbol counter") \ m(+1, u64, rs_corrected_symbols_lane0, "rs_corrected_symbols_lane0", \ "FEC corrected symbol counter lane 0") \ m(+1, u64, rs_corrected_symbols_lane1, "rs_corrected_symbols_lane1", \ "FEC corrected symbol counter lane 1") \ m(+1, u64, rs_corrected_symbols_lane2, "rs_corrected_symbols_lane2", \ "FEC corrected symbol counter lane 2") \ m(+1, u64, rs_corrected_symbols_lane3, "rs_corrected_symbols_lane3", \ "FEC corrected symbol counter lane 3") /* Per priority statistics for PFC */ #define MLX5E_PPORT_PER_PRIO_STATS_SUB(m,n,p) \ m(n, p, +1, u64, rx_octets, "rx_octets", "Received octets") \ m(n, p, +1, u64, rx_uc_frames, "rx_uc_frames", "Received unicast frames") \ m(n, p, +1, u64, rx_mc_frames, "rx_mc_frames", "Received multicast frames") \ m(n, p, +1, u64, rx_bc_frames, "rx_bc_frames", "Received broadcast frames") \ m(n, p, +1, u64, rx_frames, "rx_frames", "Received frames") \ m(n, p, +1, u64, tx_octets, "tx_octets", "Transmitted octets") \ m(n, p, +1, u64, tx_uc_frames, "tx_uc_frames", "Transmitted unicast frames") \ m(n, p, +1, u64, tx_mc_frames, "tx_mc_frames", "Transmitted multicast frames") \ m(n, p, +1, u64, tx_bc_frames, "tx_bc_frames", "Transmitted broadcast frames") \ m(n, p, +1, u64, tx_frames, "tx_frames", "Transmitted frames") \ m(n, p, +1, u64, rx_pause, "rx_pause", "Received pause frames") \ m(n, p, +1, u64, rx_pause_duration, "rx_pause_duration", \ "Received pause duration") \ m(n, p, +1, u64, tx_pause, "tx_pause", "Transmitted pause frames") \ m(n, p, +1, u64, tx_pause_duration, "tx_pause_duration", \ "Transmitted pause duration") \ m(n, p, +1, u64, rx_pause_transition, "rx_pause_transition", \ "Received pause transitions") \ m(n, p, +1, u64, rx_discards, "rx_discards", "Discarded received frames") \ m(n, p, +1, u64, device_stall_minor_watermark, \ "device_stall_minor_watermark", "Device stall minor watermark") \ m(n, p, +1, u64, device_stall_critical_watermark, \ "device_stall_critical_watermark", "Device stall critical watermark") #define MLX5E_PPORT_PER_PRIO_STATS_PREFIX(m,p,c,t,f,s,d) \ m(c, t, pri_##p##_##f, "prio" #p "_" s, "Priority " #p " - " d) #define MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO 8 #define MLX5E_PPORT_PER_PRIO_STATS(m) \ MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,0) \ MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,1) \ MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,2) \ MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,3) \ MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,4) \ MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,5) \ MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,6) \ MLX5E_PPORT_PER_PRIO_STATS_SUB(MLX5E_PPORT_PER_PRIO_STATS_PREFIX,m,7) #define MLX5E_PCIE_PERFORMANCE_COUNTERS_64(m) \ m(+1, u64, life_time_counter_high, "life_time_counter", \ "Life time counter.", pcie_perf_counters) \ m(+1, u64, tx_overflow_buffer_pkt, "tx_overflow_buffer_pkt", \ "The number of packets dropped due to lack of PCIe buffers " \ "in receive path from NIC port toward the hosts.", \ pcie_perf_counters) \ m(+1, u64, tx_overflow_buffer_marked_pkt, \ "tx_overflow_buffer_marked_pkt", \ "The number of packets marked due to lack of PCIe buffers " \ "in receive path from NIC port toward the hosts.", \ pcie_perf_counters) #define MLX5E_PCIE_PERFORMANCE_COUNTERS_32(m) \ m(+1, u64, rx_errors, "rx_errors", \ "Number of transitions to recovery due to Framing " \ "errors and CRC errors.", pcie_perf_counters) \ m(+1, u64, tx_errors, "tx_errors", "Number of transitions " \ "to recovery due to EIEOS and TS errors.", pcie_perf_counters) \ m(+1, u64, l0_to_recovery_eieos, "l0_to_recovery_eieos", "Number of " \ "transitions to recovery due to getting EIEOS.", pcie_perf_counters)\ m(+1, u64, l0_to_recovery_ts, "l0_to_recovery_ts", "Number of " \ "transitions to recovery due to getting TS.", pcie_perf_counters) \ m(+1, u64, l0_to_recovery_framing, "l0_to_recovery_framing", "Number "\ "of transitions to recovery due to identifying framing " \ "errors at gen3/4.", pcie_perf_counters) \ m(+1, u64, l0_to_recovery_retrain, "l0_to_recovery_retrain", \ "Number of transitions to recovery due to link retrain request " \ "from data link.", pcie_perf_counters) \ m(+1, u64, crc_error_dllp, "crc_error_dllp", "Number of transitions " \ "to recovery due to identifying CRC DLLP errors.", \ pcie_perf_counters) \ m(+1, u64, crc_error_tlp, "crc_error_tlp", "Number of transitions to "\ "recovery due to identifying CRC TLP errors.", pcie_perf_counters) \ m(+1, u64, outbound_stalled_reads, "outbound_stalled_reads", \ "The percentage of time within the last second that the NIC had " \ "outbound non-posted read requests but could not perform the " \ "operation due to insufficient non-posted credits.", \ pcie_perf_counters) \ m(+1, u64, outbound_stalled_writes, "outbound_stalled_writes", \ "The percentage of time within the last second that the NIC had " \ "outbound posted writes requests but could not perform the " \ "operation due to insufficient posted credits.", \ pcie_perf_counters) \ m(+1, u64, outbound_stalled_reads_events, \ "outbound_stalled_reads_events", "The number of events where " \ "outbound_stalled_reads was above a threshold.", \ pcie_perf_counters) \ m(+1, u64, outbound_stalled_writes_events, \ "outbound_stalled_writes_events", \ "The number of events where outbound_stalled_writes was above " \ "a threshold.", pcie_perf_counters) #define MLX5E_PCIE_TIMERS_AND_STATES_COUNTERS_32(m) \ m(+1, u64, time_to_boot_image_start, "time_to_boot_image_start", \ "Time from start until FW boot image starts running in usec.", \ pcie_timers_states) \ m(+1, u64, time_to_link_image, "time_to_link_image", \ "Time from start until FW pci_link image starts running in usec.", \ pcie_timers_states) \ m(+1, u64, calibration_time, "calibration_time", \ "Time it took FW to do calibration in usec.", \ pcie_timers_states) \ m(+1, u64, time_to_first_perst, "time_to_first_perst", \ "Time form start until FW handle first perst. in usec.", \ pcie_timers_states) \ m(+1, u64, time_to_detect_state, "time_to_detect_state", \ "Time from start until first transition to LTSSM.Detect_Q in usec", \ pcie_timers_states) \ m(+1, u64, time_to_l0, "time_to_l0", \ "Time from start until first transition to LTSSM.L0 in usec", \ pcie_timers_states) \ m(+1, u64, time_to_crs_en, "time_to_crs_en", \ "Time from start until crs is enabled in usec", \ pcie_timers_states) \ m(+1, u64, time_to_plastic_image_start, "time_to_plastic_image_start",\ "Time form start until FW plastic image starts running in usec.", \ pcie_timers_states) \ m(+1, u64, time_to_iron_image_start, "time_to_iron_image_start", \ "Time form start until FW iron image starts running in usec.", \ pcie_timers_states) \ m(+1, u64, perst_handler, "perst_handler", \ "Number of persts arrived.", pcie_timers_states) \ m(+1, u64, times_in_l1, "times_in_l1", \ "Number of times LTSSM entered L1 flow.", pcie_timers_states) \ m(+1, u64, times_in_l23, "times_in_l23", \ "Number of times LTSSM entered L23 flow.", pcie_timers_states) \ m(+1, u64, dl_down, "dl_down", \ "Number of moves for DL_active to DL_down.", pcie_timers_states) \ m(+1, u64, config_cycle1usec, "config_cycle1usec", \ "Number of configuration requests that firmware " \ "handled in less than 1 usec.", pcie_timers_states) \ m(+1, u64, config_cycle2to7usec, "config_cycle2to7usec", \ "Number of configuration requests that firmware " \ "handled within 2 to 7 usec.", pcie_timers_states) \ m(+1, u64, config_cycle8to15usec, "config_cycle8to15usec", \ "Number of configuration requests that firmware " \ "handled within 8 to 15 usec.", pcie_timers_states) \ m(+1, u64, config_cycle16to63usec, "config_cycle16to63usec", \ "Number of configuration requests that firmware " \ "handled within 16 to 63 usec.", pcie_timers_states) \ m(+1, u64, config_cycle64usec, "config_cycle64usec", \ "Number of configuration requests that firmware " \ "handled took more than 64 usec.", pcie_timers_states) \ m(+1, u64, correctable_err_msg_sent, "correctable_err_msg_sent", \ "Number of correctable error messages sent.", pcie_timers_states) \ m(+1, u64, non_fatal_err_msg_sent, "non_fatal_err_msg_sent", \ "Number of non-Fatal error msg sent.", pcie_timers_states) \ m(+1, u64, fatal_err_msg_sent, "fatal_err_msg_sent", \ "Number of fatal error msg sent.", pcie_timers_states) #define MLX5E_PCIE_LANE_COUNTERS_32(m) \ m(+1, u64, error_counter_lane0, "error_counter_lane0", \ "Error counter for PCI lane 0", pcie_lanes_counters) \ m(+1, u64, error_counter_lane1, "error_counter_lane1", \ "Error counter for PCI lane 1", pcie_lanes_counters) \ m(+1, u64, error_counter_lane2, "error_counter_lane2", \ "Error counter for PCI lane 2", pcie_lanes_counters) \ m(+1, u64, error_counter_lane3, "error_counter_lane3", \ "Error counter for PCI lane 3", pcie_lanes_counters) \ m(+1, u64, error_counter_lane4, "error_counter_lane4", \ "Error counter for PCI lane 4", pcie_lanes_counters) \ m(+1, u64, error_counter_lane5, "error_counter_lane5", \ "Error counter for PCI lane 5", pcie_lanes_counters) \ m(+1, u64, error_counter_lane6, "error_counter_lane6", \ "Error counter for PCI lane 6", pcie_lanes_counters) \ m(+1, u64, error_counter_lane7, "error_counter_lane7", \ "Error counter for PCI lane 7", pcie_lanes_counters) \ m(+1, u64, error_counter_lane8, "error_counter_lane8", \ "Error counter for PCI lane 8", pcie_lanes_counters) \ m(+1, u64, error_counter_lane9, "error_counter_lane9", \ "Error counter for PCI lane 9", pcie_lanes_counters) \ m(+1, u64, error_counter_lane10, "error_counter_lane10", \ "Error counter for PCI lane 10", pcie_lanes_counters) \ m(+1, u64, error_counter_lane11, "error_counter_lane11", \ "Error counter for PCI lane 11", pcie_lanes_counters) \ m(+1, u64, error_counter_lane12, "error_counter_lane12", \ "Error counter for PCI lane 12", pcie_lanes_counters) \ m(+1, u64, error_counter_lane13, "error_counter_lane13", \ "Error counter for PCI lane 13", pcie_lanes_counters) \ m(+1, u64, error_counter_lane14, "error_counter_lane14", \ "Error counter for PCI lane 14", pcie_lanes_counters) \ m(+1, u64, error_counter_lane15, "error_counter_lane15", \ "Error counter for PCI lane 15", pcie_lanes_counters) /* * Make sure to update mlx5e_update_pport_counters() * when adding a new MLX5E_PPORT_STATS block */ #define MLX5E_PPORT_STATS(m) \ MLX5E_PPORT_PER_PRIO_STATS(m) \ MLX5E_PPORT_IEEE802_3_STATS(m) \ MLX5E_PPORT_RFC2819_STATS(m) #define MLX5E_PORT_STATS_DEBUG(m) \ MLX5E_PPORT_RFC2819_STATS_DEBUG(m) \ MLX5E_PPORT_RFC2863_STATS_DEBUG(m) \ MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG(m) \ MLX5E_PPORT_ETHERNET_EXTENDED_STATS_DEBUG(m) \ MLX5E_PPORT_STATISTICAL_DEBUG(m) \ MLX5E_PCIE_PERFORMANCE_COUNTERS_64(m) \ MLX5E_PCIE_PERFORMANCE_COUNTERS_32(m) \ MLX5E_PCIE_TIMERS_AND_STATES_COUNTERS_32(m) \ MLX5E_PCIE_LANE_COUNTERS_32(m) #define MLX5E_PPORT_IEEE802_3_STATS_NUM \ (0 MLX5E_PPORT_IEEE802_3_STATS(MLX5E_STATS_COUNT)) #define MLX5E_PPORT_RFC2819_STATS_NUM \ (0 MLX5E_PPORT_RFC2819_STATS(MLX5E_STATS_COUNT)) #define MLX5E_PPORT_STATS_NUM \ (0 MLX5E_PPORT_STATS(MLX5E_STATS_COUNT)) #define MLX5E_PPORT_PER_PRIO_STATS_NUM \ (0 MLX5E_PPORT_PER_PRIO_STATS(MLX5E_STATS_COUNT)) #define MLX5E_PPORT_RFC2819_STATS_DEBUG_NUM \ (0 MLX5E_PPORT_RFC2819_STATS_DEBUG(MLX5E_STATS_COUNT)) #define MLX5E_PPORT_RFC2863_STATS_DEBUG_NUM \ (0 MLX5E_PPORT_RFC2863_STATS_DEBUG(MLX5E_STATS_COUNT)) #define MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG_NUM \ (0 MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG(MLX5E_STATS_COUNT)) #define MLX5E_PPORT_ETHERNET_EXTENDED_STATS_DEBUG_NUM \ (0 MLX5E_PPORT_ETHERNET_EXTENDED_STATS_DEBUG(MLX5E_STATS_COUNT)) #define MLX5E_PPORT_STATISTICAL_DEBUG_NUM \ (0 MLX5E_PPORT_STATISTICAL_DEBUG(MLX5E_STATS_COUNT)) #define MLX5E_PORT_STATS_DEBUG_NUM \ (0 MLX5E_PORT_STATS_DEBUG(MLX5E_STATS_COUNT)) struct mlx5e_pport_stats { struct sysctl_ctx_list ctx; u64 arg [0]; MLX5E_PPORT_STATS(MLX5E_STATS_VAR) }; struct mlx5e_port_stats_debug { struct sysctl_ctx_list ctx; u64 arg [0]; MLX5E_PORT_STATS_DEBUG(MLX5E_STATS_VAR) }; #define MLX5E_RQ_STATS(m) \ m(+1, u64, packets, "packets", "Received packets") \ m(+1, u64, bytes, "bytes", "Received bytes") \ m(+1, u64, csum_none, "csum_none", "Received packets") \ m(+1, u64, lro_packets, "lro_packets", "Received LRO packets") \ m(+1, u64, lro_bytes, "lro_bytes", "Received LRO bytes") \ m(+1, u64, sw_lro_queued, "sw_lro_queued", "Packets queued for SW LRO") \ m(+1, u64, sw_lro_flushed, "sw_lro_flushed", "Packets flushed from SW LRO") \ m(+1, u64, wqe_err, "wqe_err", "Received packets") \ m(+1, u64, decrypted_ok_packets, "decrypted_ok_packets", "Received packets successfully decrypted by steering rule(s)") \ m(+1, u64, decrypted_error_packets, "decrypted_error_packets", "Received packets not decrypted by steering rule(s)") #define MLX5E_RQ_STATS_NUM (0 MLX5E_RQ_STATS(MLX5E_STATS_COUNT)) struct mlx5e_rq_stats { struct sysctl_ctx_list ctx; u64 arg [0]; MLX5E_RQ_STATS(MLX5E_STATS_VAR) }; #define MLX5E_SQ_STATS(m) \ m(+1, u64, packets, "packets", "Transmitted packets") \ m(+1, u64, bytes, "bytes", "Transmitted bytes") \ m(+1, u64, tso_packets, "tso_packets", "Transmitted packets") \ m(+1, u64, tso_bytes, "tso_bytes", "Transmitted bytes") \ m(+1, u64, csum_offload_none, "csum_offload_none", "Transmitted packets") \ m(+1, u64, defragged, "defragged", "Transmitted packets") \ m(+1, u64, dropped, "dropped", "Transmitted packets") \ m(+1, u64, enobuf, "enobuf", "Transmitted packets") \ m(+1, u64, cqe_err, "cqe_err", "Transmit CQE errors") \ m(+1, u64, nop, "nop", "Transmitted packets") #define MLX5E_SQ_STATS_NUM (0 MLX5E_SQ_STATS(MLX5E_STATS_COUNT)) struct mlx5e_sq_stats { struct sysctl_ctx_list ctx; u64 arg [0]; MLX5E_SQ_STATS(MLX5E_STATS_VAR) }; struct mlx5e_stats { struct mlx5e_vport_stats vport; struct mlx5e_pport_stats pport; struct mlx5e_port_stats_debug port_stats_debug; }; struct mlx5e_rq_param { u32 rqc [MLX5_ST_SZ_DW(rqc)]; struct mlx5_wq_param wq; }; struct mlx5e_sq_param { u32 sqc [MLX5_ST_SZ_DW(sqc)]; struct mlx5_wq_param wq; }; struct mlx5e_cq_param { u32 cqc [MLX5_ST_SZ_DW(cqc)]; struct mlx5_wq_param wq; }; struct mlx5e_params { u8 log_sq_size; u8 log_rq_size; u16 num_channels; u8 default_vlan_prio; u8 num_tc; u8 rx_cq_moderation_mode; u8 tx_cq_moderation_mode; u16 rx_cq_moderation_usec; u16 rx_cq_moderation_pkts; u16 tx_cq_moderation_usec; u16 tx_cq_moderation_pkts; u16 min_rx_wqes; bool hw_lro_en; bool cqe_zipping_en; u32 lro_wqe_sz; u16 rx_hash_log_tbl_sz; u32 tx_pauseframe_control __aligned(4); u32 rx_pauseframe_control __aligned(4); u16 tx_max_inline; u8 tx_min_inline_mode; u8 tx_priority_flow_control; u8 rx_priority_flow_control; u8 channels_rsss; }; #define MLX5E_PARAMS(m) \ m(+1, u64, tx_queue_size_max, "tx_queue_size_max", "Max send queue size") \ m(+1, u64, rx_queue_size_max, "rx_queue_size_max", "Max receive queue size") \ m(+1, u64, tx_queue_size, "tx_queue_size", "Default send queue size") \ m(+1, u64, rx_queue_size, "rx_queue_size", "Default receive queue size") \ m(+1, u64, channels, "channels", "Default number of channels") \ m(+1, u64, channels_rsss, "channels_rsss", "Default channels receive side scaling stride") \ m(+1, u64, coalesce_usecs_max, "coalesce_usecs_max", "Maximum usecs for joining packets") \ m(+1, u64, coalesce_pkts_max, "coalesce_pkts_max", "Maximum packets to join") \ m(+1, u64, rx_coalesce_usecs, "rx_coalesce_usecs", "Limit in usec for joining rx packets") \ m(+1, u64, rx_coalesce_pkts, "rx_coalesce_pkts", "Maximum number of rx packets to join") \ m(+1, u64, rx_coalesce_mode, "rx_coalesce_mode", "0: EQE fixed mode 1: CQE fixed mode 2: EQE auto mode 3: CQE auto mode") \ m(+1, u64, tx_coalesce_usecs, "tx_coalesce_usecs", "Limit in usec for joining tx packets") \ m(+1, u64, tx_coalesce_pkts, "tx_coalesce_pkts", "Maximum number of tx packets to join") \ m(+1, u64, tx_coalesce_mode, "tx_coalesce_mode", "0: EQE mode 1: CQE mode") \ m(+1, u64, tx_completion_fact, "tx_completion_fact", "1..MAX: Completion event ratio") \ m(+1, u64, tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio") \ m(+1, u64, hw_lro, "hw_lro", "set to enable hw_lro") \ m(+1, u64, cqe_zipping, "cqe_zipping", "0 : CQE zipping disabled") \ m(+1, u64, modify_tx_dma, "modify_tx_dma", "0: Enable TX 1: Disable TX") \ m(+1, u64, modify_rx_dma, "modify_rx_dma", "0: Enable RX 1: Disable RX") \ m(+1, u64, diag_pci_enable, "diag_pci_enable", "0: Disabled 1: Enabled") \ m(+1, u64, diag_general_enable, "diag_general_enable", "0: Disabled 1: Enabled") \ m(+1, u64, hw_mtu, "hw_mtu", "Current hardware MTU value") \ m(+1, u64, mc_local_lb, "mc_local_lb", "0: Local multicast loopback enabled 1: Disabled") \ m(+1, u64, uc_local_lb, "uc_local_lb", "0: Local unicast loopback enabled 1: Disabled") \ m(+1, s64, irq_cpu_base, "irq_cpu_base", "-1: Don't bind IRQ 0..NCPU-1: select this base CPU when binding IRQs") \ m(+1, s64, irq_cpu_stride, "irq_cpu_stride", "0..NCPU-1: Distance between IRQ vectors when binding them") #define MLX5E_PARAMS_NUM (0 MLX5E_PARAMS(MLX5E_STATS_COUNT)) struct mlx5e_params_ethtool { u64 arg [0]; MLX5E_PARAMS(MLX5E_STATS_VAR) u64 max_bw_value[IEEE_8021QAZ_MAX_TCS]; u8 max_bw_share[IEEE_8021QAZ_MAX_TCS]; u8 prio_tc[MLX5E_MAX_PRIORITY]; u8 dscp2prio[MLX5_MAX_SUPPORTED_DSCP]; u8 trust_state; u8 fec_mask_10x_25x[MLX5E_MAX_FEC_10X_25X]; u16 fec_mask_50x[MLX5E_MAX_FEC_50X]; u8 fec_avail_10x_25x[MLX5E_MAX_FEC_10X_25X]; u16 fec_avail_50x[MLX5E_MAX_FEC_50X]; u32 fec_mode_active; u32 hw_mtu_msb; s32 hw_val_temp[MLX5_MAX_TEMPERATURE]; u32 hw_num_temp; }; struct mlx5e_cq { /* data path - accessed per cqe */ struct mlx5_cqwq wq; /* data path - accessed per HW polling */ struct mlx5_core_cq mcq; /* control */ struct mlx5e_priv *priv; struct mlx5_wq_ctrl wq_ctrl; } __aligned(MLX5E_CACHELINE_SIZE); struct mlx5e_rq_mbuf { bus_dmamap_t dma_map; caddr_t data; struct mbuf *mbuf; }; struct mlx5e_rq { /* persistent fields */ struct mtx mtx; struct mlx5e_rq_stats stats; struct callout watchdog; /* data path */ #define mlx5e_rq_zero_start wq struct mlx5_wq_ll wq; bus_dma_tag_t dma_tag; u32 wqe_sz; u32 nsegs; struct mlx5e_rq_mbuf *mbuf; - struct ifnet *ifp; + if_t ifp; struct mlx5e_cq cq; struct lro_ctrl lro; volatile int enabled; int ix; /* Dynamic Interrupt Moderation */ struct net_dim dim; /* control */ struct mlx5_wq_ctrl wq_ctrl; u32 rqn; struct mlx5e_channel *channel; } __aligned(MLX5E_CACHELINE_SIZE); typedef void (mlx5e_iq_callback_t)(void *arg); struct mlx5e_iq_data { bus_dmamap_t dma_map; mlx5e_iq_callback_t *callback; void *arg; volatile s32 *p_refcount; /* in use refcount, if any */ u32 num_wqebbs; u32 dma_sync; }; struct mlx5e_iq { /* persistant fields */ struct mtx lock; struct mtx comp_lock; int db_inhibit; /* data path */ #define mlx5e_iq_zero_start dma_tag bus_dma_tag_t dma_tag; u16 cc; /* consumer counter */ u16 pc __aligned(MLX5E_CACHELINE_SIZE); u16 running; union { u32 d32[2]; u64 d64; } doorbell; struct mlx5e_cq cq; /* pointers to per request info: write@xmit, read@completion */ struct mlx5e_iq_data *data; /* read only */ struct mlx5_wq_cyc wq; void __iomem *uar_map; u32 sqn; u32 mkey_be; /* control path */ struct mlx5_wq_ctrl wq_ctrl; struct mlx5e_priv *priv; }; struct mlx5e_sq_mbuf { bus_dmamap_t dma_map; struct mbuf *mbuf; struct m_snd_tag *mst; /* if set, unref this send tag on completion */ u32 num_bytes; u32 num_wqebbs; }; enum { MLX5E_SQ_READY, MLX5E_SQ_FULL }; struct mlx5e_sq { /* persistent fields */ struct mtx lock; struct mtx comp_lock; struct mlx5e_sq_stats stats; struct callout cev_callout; int db_inhibit; /* data path */ #define mlx5e_sq_zero_start dma_tag bus_dma_tag_t dma_tag; /* dirtied @completion */ u16 cc; /* dirtied @xmit */ u16 pc __aligned(MLX5E_CACHELINE_SIZE); u16 cev_counter; /* completion event counter */ u16 cev_factor; /* completion event factor */ u16 cev_next_state; /* next completion event state */ #define MLX5E_CEV_STATE_INITIAL 0 /* timer not started */ #define MLX5E_CEV_STATE_SEND_NOPS 1 /* send NOPs */ #define MLX5E_CEV_STATE_HOLD_NOPS 2 /* don't send NOPs yet */ u16 running; /* set if SQ is running */ union { u32 d32[2]; u64 d64; } doorbell; struct mlx5e_cq cq; /* pointers to per packet info: write@xmit, read@completion */ struct mlx5e_sq_mbuf *mbuf; /* read only */ struct mlx5_wq_cyc wq; void __iomem *uar_map; struct ifnet *ifp; u32 sqn; u32 mkey_be; u16 max_inline; u8 min_inline_mode; u8 min_insert_caps; u32 queue_handle; /* SQ remap support */ #define MLX5E_INSERT_VLAN 1 #define MLX5E_INSERT_NON_VLAN 2 /* control path */ struct mlx5_wq_ctrl wq_ctrl; struct mlx5e_priv *priv; int tc; } __aligned(MLX5E_CACHELINE_SIZE); static inline bool mlx5e_sq_has_room_for(struct mlx5e_sq *sq, u16 n) { u16 cc = sq->cc; u16 pc = sq->pc; return ((sq->wq.sz_m1 & (cc - pc)) >= n || cc == pc); } static inline u32 mlx5e_sq_queue_level(struct mlx5e_sq *sq) { u16 cc; u16 pc; if (sq == NULL) return (0); cc = sq->cc; pc = sq->pc; return (((sq->wq.sz_m1 & (pc - cc)) * IF_SND_QUEUE_LEVEL_MAX) / sq->wq.sz_m1); } struct mlx5e_channel { struct mlx5e_rq rq; struct m_snd_tag tag; struct mlx5_sq_bfreg bfreg; struct mlx5e_sq sq[MLX5E_MAX_TX_NUM_TC]; struct mlx5e_iq iq; struct mlx5e_priv *priv; struct completion completion; int ix; u32 rqtn; } __aligned(MLX5E_CACHELINE_SIZE); enum mlx5e_traffic_types { MLX5E_TT_IPV4_TCP, MLX5E_TT_IPV6_TCP, MLX5E_TT_IPV4_UDP, MLX5E_TT_IPV6_UDP, MLX5E_TT_IPV4_IPSEC_AH, MLX5E_TT_IPV6_IPSEC_AH, MLX5E_TT_IPV4_IPSEC_ESP, MLX5E_TT_IPV6_IPSEC_ESP, MLX5E_TT_IPV4, MLX5E_TT_IPV6, MLX5E_TT_ANY, MLX5E_NUM_TT, }; enum { MLX5E_RQT_SPREADING = 0, MLX5E_RQT_DEFAULT_RQ = 1, MLX5E_NUM_RQT = 2, }; struct mlx5_flow_rule; struct mlx5e_eth_addr_info { u8 addr [ETH_ALEN + 2]; u32 tt_vec; /* flow table rule per traffic type */ struct mlx5_flow_rule *ft_rule[MLX5E_NUM_TT]; }; #define MLX5E_ETH_ADDR_HASH_SIZE (1 << BITS_PER_BYTE) struct mlx5e_eth_addr_hash_node; struct mlx5e_eth_addr_hash_head { struct mlx5e_eth_addr_hash_node *lh_first; }; struct mlx5e_eth_addr_db { struct mlx5e_eth_addr_hash_head if_uc[MLX5E_ETH_ADDR_HASH_SIZE]; struct mlx5e_eth_addr_hash_head if_mc[MLX5E_ETH_ADDR_HASH_SIZE]; struct mlx5e_eth_addr_info broadcast; struct mlx5e_eth_addr_info allmulti; struct mlx5e_eth_addr_info promisc; bool broadcast_enabled; bool allmulti_enabled; bool promisc_enabled; }; enum { MLX5E_STATE_ASYNC_EVENTS_ENABLE, MLX5E_STATE_OPENED, MLX5E_STATE_FLOW_RULES_READY, }; enum { MLX5_BW_NO_LIMIT = 0, MLX5_100_MBPS_UNIT = 3, MLX5_GBPS_UNIT = 4, }; struct mlx5e_vlan_db { unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; struct mlx5_flow_rule *active_vlans_ft_rule[VLAN_N_VID]; struct mlx5_flow_rule *untagged_ft_rule; struct mlx5_flow_rule *any_cvlan_ft_rule; struct mlx5_flow_rule *any_svlan_ft_rule; bool filter_disabled; }; struct mlx5e_vxlan_db_el { u_int refcount; u_int proto; u_int port; bool installed; struct mlx5_flow_rule *vxlan_ft_rule; TAILQ_ENTRY(mlx5e_vxlan_db_el) link; }; struct mlx5e_vxlan_db { TAILQ_HEAD(, mlx5e_vxlan_db_el) head; }; struct mlx5e_flow_table { int num_groups; struct mlx5_flow_table *t; struct mlx5_flow_group **g; }; enum accel_fs_tcp_type { MLX5E_ACCEL_FS_IPV4_TCP, MLX5E_ACCEL_FS_IPV6_TCP, MLX5E_ACCEL_FS_TCP_NUM_TYPES, }; struct mlx5e_accel_fs_tcp { struct mlx5_flow_namespace *ns; struct mlx5e_flow_table tables[MLX5E_ACCEL_FS_TCP_NUM_TYPES]; struct mlx5_flow_rule *default_rules[MLX5E_ACCEL_FS_TCP_NUM_TYPES]; }; struct mlx5e_flow_tables { struct mlx5_flow_namespace *ns; struct mlx5e_flow_table vlan; struct mlx5e_flow_table vxlan; struct mlx5_flow_rule *vxlan_catchall_ft_rule; struct mlx5e_flow_table main; struct mlx5e_flow_table main_vxlan; struct mlx5_flow_rule *main_vxlan_rule[MLX5E_NUM_TT]; struct mlx5e_flow_table inner_rss; struct mlx5e_accel_fs_tcp accel_tcp; }; struct mlx5e_xmit_args { struct m_snd_tag *mst; u32 tisn; u16 ihs; }; #include #include #include #define MLX5E_TSTMP_PREC 10 struct mlx5e_clbr_point { uint64_t base_curr; uint64_t base_prev; uint64_t clbr_hw_prev; uint64_t clbr_hw_curr; u_int clbr_gen; }; struct mlx5e_dcbx { u32 cable_len; u32 xoff; }; struct mlx5e_priv { struct mlx5_core_dev *mdev; /* must be first */ /* priv data path fields - start */ int order_base_2_num_channels; int queue_mapping_channel_mask; int num_tc; int default_vlan_prio; /* priv data path fields - end */ unsigned long state; int gone; #define PRIV_LOCK(priv) sx_xlock(&(priv)->state_lock) #define PRIV_UNLOCK(priv) sx_xunlock(&(priv)->state_lock) #define PRIV_LOCKED(priv) sx_xlocked(&(priv)->state_lock) #define PRIV_ASSERT_LOCKED(priv) sx_assert(&(priv)->state_lock, SA_XLOCKED) struct sx state_lock; /* Protects Interface state */ struct mlx5e_rq drop_rq; u32 pdn; u32 tdn; struct mlx5_core_mkey mr; u32 tisn[MLX5E_MAX_TX_NUM_TC]; u32 rqtn; u32 tirn[MLX5E_NUM_TT]; u32 tirn_inner_vxlan[MLX5E_NUM_TT]; struct mlx5e_flow_tables fts; struct mlx5e_eth_addr_db eth_addr; struct mlx5e_vlan_db vlan; struct mlx5e_vxlan_db vxlan; struct mlx5e_params params; struct mlx5e_params_ethtool params_ethtool; union mlx5_core_pci_diagnostics params_pci; union mlx5_core_general_diagnostics params_general; struct mtx async_events_mtx; /* sync hw events */ struct work_struct update_stats_work; struct work_struct update_carrier_work; struct work_struct set_rx_mode_work; MLX5_DECLARE_DOORBELL_LOCK(doorbell_lock) - struct ifnet *ifp; + if_t ifp; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_ifnet; struct sysctl_oid *sysctl_hw; int sysctl_debug; struct mlx5e_stats stats; int counter_set_id; struct workqueue_struct *wq; eventhandler_tag vlan_detach; eventhandler_tag vlan_attach; struct ifmedia media; int media_status_last; int media_active_last; eventhandler_tag vxlan_start; eventhandler_tag vxlan_stop; struct callout watchdog; struct mlx5e_rl_priv_data rl; struct mlx5e_tls tls; struct mlx5e_tls_rx tls_rx; struct callout tstmp_clbr; int clbr_done; int clbr_curr; struct mlx5e_clbr_point clbr_points[2]; u_int clbr_gen; uint64_t cclk; struct mlx5e_dcbx dcbx; bool sw_is_port_buf_owner; struct pfil_head *pfil; struct mlx5e_channel channel[]; }; #define MLX5E_NET_IP_ALIGN 2 struct mlx5e_tx_wqe { struct mlx5_wqe_ctrl_seg ctrl; struct mlx5_wqe_eth_seg eth; }; struct mlx5e_tx_umr_wqe { struct mlx5_wqe_ctrl_seg ctrl; struct mlx5_wqe_umr_ctrl_seg umr; uint8_t mkc[64]; }; struct mlx5e_tx_psv_wqe { struct mlx5_wqe_ctrl_seg ctrl; struct mlx5_seg_set_psv psv; }; struct mlx5e_tx_qos_remap_wqe { struct mlx5_wqe_ctrl_seg ctrl; struct mlx5_wqe_qos_remap_seg qos_remap; }; struct mlx5e_rx_wqe { struct mlx5_wqe_srq_next_seg next; struct mlx5_wqe_data_seg data[]; }; /* the size of the structure above must be power of two */ CTASSERT(powerof2(sizeof(struct mlx5e_rx_wqe))); struct mlx5e_eeprom { int lock_bit; int i2c_addr; int page_num; int device_addr; int module_num; int len; int type; int page_valid; u32 *data; }; #define MLX5E_FLD_MAX(typ, fld) ((1ULL << __mlx5_bit_sz(typ, fld)) - 1ULL) bool mlx5e_do_send_cqe(struct mlx5e_sq *); int mlx5e_get_full_header_size(const struct mbuf *, const struct tcphdr **); -int mlx5e_xmit(struct ifnet *, struct mbuf *); +int mlx5e_xmit(if_t, struct mbuf *); -int mlx5e_open_locked(struct ifnet *); -int mlx5e_close_locked(struct ifnet *); +int mlx5e_open_locked(if_t); +int mlx5e_close_locked(if_t); void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, int event); void mlx5e_dump_err_cqe(struct mlx5e_cq *, u32, const struct mlx5_err_cqe *); mlx5e_cq_comp_t mlx5e_rx_cq_comp; mlx5e_cq_comp_t mlx5e_tx_cq_comp; struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq); void mlx5e_dim_work(struct work_struct *); void mlx5e_dim_build_cq_param(struct mlx5e_priv *, struct mlx5e_cq_param *); int mlx5e_open_flow_tables(struct mlx5e_priv *priv); void mlx5e_close_flow_tables(struct mlx5e_priv *priv); int mlx5e_open_flow_rules(struct mlx5e_priv *priv); void mlx5e_close_flow_rules(struct mlx5e_priv *priv); void mlx5e_set_rx_mode_work(struct work_struct *work); -void mlx5e_vlan_rx_add_vid(void *, struct ifnet *, u16); -void mlx5e_vlan_rx_kill_vid(void *, struct ifnet *, u16); +void mlx5e_vlan_rx_add_vid(void *, if_t, u16); +void mlx5e_vlan_rx_kill_vid(void *, if_t, u16); void mlx5e_enable_vlan_filter(struct mlx5e_priv *priv); void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv); -void mlx5e_vxlan_start(void *arg, struct ifnet *ifp, sa_family_t family, +void mlx5e_vxlan_start(void *arg, if_t ifp, sa_family_t family, u_int port); -void mlx5e_vxlan_stop(void *arg, struct ifnet *ifp, sa_family_t family, +void mlx5e_vxlan_stop(void *arg, if_t ifp, sa_family_t family, u_int port); int mlx5e_add_all_vxlan_rules(struct mlx5e_priv *priv); void mlx5e_del_all_vxlan_rules(struct mlx5e_priv *priv); static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq, bool force) { if (unlikely((force == false && sq->db_inhibit != 0) || sq->doorbell.d64 == 0)) { /* skip writing the doorbell record */ return; } /* ensure wqe is visible to device before updating doorbell record */ wmb(); *sq->wq.db = cpu_to_be32(sq->pc); /* * Ensure the doorbell record is visible to device before ringing * the doorbell: */ wmb(); mlx5_write64(sq->doorbell.d32, sq->uar_map, MLX5_GET_DOORBELL_LOCK(&sq->priv->doorbell_lock)); sq->doorbell.d64 = 0; } static inline void mlx5e_cq_arm(struct mlx5e_cq *cq, spinlock_t *dblock) { struct mlx5_core_cq *mcq; mcq = &cq->mcq; mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, dblock, cq->wq.cc); } #define mlx5e_dbg(_IGN, _priv, ...) mlx5_core_dbg((_priv)->mdev, __VA_ARGS__) extern const struct ethtool_ops mlx5e_ethtool_ops; void mlx5e_create_ethtool(struct mlx5e_priv *); void mlx5e_create_stats(struct sysctl_ctx_list *, struct sysctl_oid_list *, const char *, const char **, unsigned, u64 *); void mlx5e_create_counter_stats(struct sysctl_ctx_list *, struct sysctl_oid_list *, const char *, const char **, unsigned, counter_u64_t *); void mlx5e_send_nop(struct mlx5e_sq *, u32); int mlx5e_sq_dump_xmit(struct mlx5e_sq *, struct mlx5e_xmit_args *, struct mbuf **); int mlx5e_sq_xmit(struct mlx5e_sq *, struct mbuf **); void mlx5e_sq_cev_timeout(void *); int mlx5e_refresh_channel_params(struct mlx5e_priv *); int mlx5e_open_cq(struct mlx5e_priv *, struct mlx5e_cq_param *, struct mlx5e_cq *, mlx5e_cq_comp_t *, int eq_ix); void mlx5e_close_cq(struct mlx5e_cq *); void mlx5e_free_sq_db(struct mlx5e_sq *); int mlx5e_alloc_sq_db(struct mlx5e_sq *); int mlx5e_enable_sq(struct mlx5e_sq *, struct mlx5e_sq_param *, const struct mlx5_sq_bfreg *, int tis_num); int mlx5e_modify_sq(struct mlx5e_sq *, int curr_state, int next_state); void mlx5e_disable_sq(struct mlx5e_sq *); void mlx5e_drain_sq(struct mlx5e_sq *); void mlx5e_modify_tx_dma(struct mlx5e_priv *priv, uint8_t value); void mlx5e_modify_rx_dma(struct mlx5e_priv *priv, uint8_t value); void mlx5e_resume_sq(struct mlx5e_sq *sq); void mlx5e_update_sq_inline(struct mlx5e_sq *sq); void mlx5e_refresh_sq_inline(struct mlx5e_priv *priv); int mlx5e_update_buf_lossy(struct mlx5e_priv *priv); int mlx5e_fec_update(struct mlx5e_priv *priv); int mlx5e_hw_temperature_update(struct mlx5e_priv *priv); /* Internal Queue, IQ, API functions */ void mlx5e_iq_send_nop(struct mlx5e_iq *, u32); int mlx5e_iq_open(struct mlx5e_channel *, struct mlx5e_sq_param *, struct mlx5e_cq_param *, struct mlx5e_iq *); void mlx5e_iq_close(struct mlx5e_iq *); void mlx5e_iq_static_init(struct mlx5e_iq *); void mlx5e_iq_static_destroy(struct mlx5e_iq *); void mlx5e_iq_notify_hw(struct mlx5e_iq *); int mlx5e_iq_get_producer_index(struct mlx5e_iq *); void mlx5e_iq_load_memory_single(struct mlx5e_iq *, u16, void *, size_t, u64 *, u32); #endif /* _MLX5_EN_H_ */ diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c b/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c index f57d70080a46..59a8cbc56bbb 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c @@ -1,1652 +1,1652 @@ /*- * Copyright (c) 2015-2021 Mellanox Technologies. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_rss.h" #include "opt_ratelimit.h" #include #include void mlx5e_create_stats(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *parent, const char *buffer, const char **desc, unsigned num, u64 * arg) { struct sysctl_oid *node; unsigned x; sysctl_ctx_init(ctx); node = SYSCTL_ADD_NODE(ctx, parent, OID_AUTO, buffer, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Statistics"); if (node == NULL) return; for (x = 0; x != num; x++) { SYSCTL_ADD_UQUAD(ctx, SYSCTL_CHILDREN(node), OID_AUTO, desc[2 * x], CTLFLAG_RD, arg + x, desc[2 * x + 1]); } } void mlx5e_create_counter_stats(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *parent, const char *buffer, const char **desc, unsigned num, counter_u64_t *arg) { struct sysctl_oid *node; unsigned x; sysctl_ctx_init(ctx); node = SYSCTL_ADD_NODE(ctx, parent, OID_AUTO, buffer, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Statistics"); if (node == NULL) return; for (x = 0; x != num; x++) { SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, desc[2 * x], CTLFLAG_RD, arg + x, desc[2 * x + 1]); } } static void mlx5e_ethtool_sync_tx_completion_fact(struct mlx5e_priv *priv) { /* * Limit the maximum distance between completion events to * half of the currently set TX queue size. * * The maximum number of queue entries a single IP packet can * consume is given by MLX5_SEND_WQE_MAX_WQEBBS. * * The worst case max value is then given as below: */ uint64_t max = priv->params_ethtool.tx_queue_size / (2 * MLX5_SEND_WQE_MAX_WQEBBS); /* * Update the maximum completion factor value in case the * tx_queue_size field changed. Ensure we don't overflow * 16-bits. */ if (max < 1) max = 1; else if (max > 65535) max = 65535; priv->params_ethtool.tx_completion_fact_max = max; /* * Verify that the current TX completion factor is within the * given limits: */ if (priv->params_ethtool.tx_completion_fact < 1) priv->params_ethtool.tx_completion_fact = 1; else if (priv->params_ethtool.tx_completion_fact > max) priv->params_ethtool.tx_completion_fact = max; } static int mlx5e_getmaxrate(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS]; u8 max_bw_value[IEEE_8021QAZ_MAX_TCS]; int err; int i; PRIV_LOCK(priv); err = -mlx5_query_port_tc_rate_limit(mdev, max_bw_value, max_bw_unit); if (err) goto done; for (i = 0; i <= mlx5_max_tc(mdev); i++) { switch (max_bw_unit[i]) { case MLX5_100_MBPS_UNIT: priv->params_ethtool.max_bw_value[i] = max_bw_value[i] * MLX5E_100MB; break; case MLX5_GBPS_UNIT: priv->params_ethtool.max_bw_value[i] = max_bw_value[i] * MLX5E_1GB; break; case MLX5_BW_NO_LIMIT: priv->params_ethtool.max_bw_value[i] = 0; break; default: priv->params_ethtool.max_bw_value[i] = -1; WARN_ONCE(true, "non-supported BW unit"); break; } } done: PRIV_UNLOCK(priv); return (err); } static int mlx5e_get_max_alloc(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; int err; int x; PRIV_LOCK(priv); err = -mlx5_query_port_tc_bw_alloc(mdev, priv->params_ethtool.max_bw_share); if (err == 0) { /* set default value */ for (x = 0; x != IEEE_8021QAZ_MAX_TCS; x++) { priv->params_ethtool.max_bw_share[x] = 100 / IEEE_8021QAZ_MAX_TCS; } err = -mlx5_set_port_tc_bw_alloc(mdev, priv->params_ethtool.max_bw_share); } PRIV_UNLOCK(priv); return (err); } static int mlx5e_get_dscp(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; int err; if (MLX5_CAP_GEN(mdev, qcam_reg) == 0 || MLX5_CAP_QCAM_REG(mdev, qpts) == 0 || MLX5_CAP_QCAM_REG(mdev, qpdpm) == 0) return (EOPNOTSUPP); PRIV_LOCK(priv); err = -mlx5_query_dscp2prio(mdev, priv->params_ethtool.dscp2prio); if (err) goto done; err = -mlx5_query_trust_state(mdev, &priv->params_ethtool.trust_state); if (err) goto done; done: PRIV_UNLOCK(priv); return (err); } static void mlx5e_tc_get_parameters(struct mlx5e_priv *priv, u64 *new_bw_value, u8 *max_bw_value, u8 *max_bw_unit) { const u64 upper_limit_mbps = 255 * MLX5E_100MB; const u64 upper_limit_gbps = 255 * MLX5E_1GB; u64 temp; int i; memset(max_bw_value, 0, IEEE_8021QAZ_MAX_TCS); memset(max_bw_unit, 0, IEEE_8021QAZ_MAX_TCS); for (i = 0; i <= mlx5_max_tc(priv->mdev); i++) { temp = (new_bw_value != NULL) ? new_bw_value[i] : priv->params_ethtool.max_bw_value[i]; if (!temp) { max_bw_unit[i] = MLX5_BW_NO_LIMIT; } else if (temp > upper_limit_gbps) { max_bw_unit[i] = MLX5_BW_NO_LIMIT; } else if (temp <= upper_limit_mbps) { max_bw_value[i] = howmany(temp, MLX5E_100MB); max_bw_unit[i] = MLX5_100_MBPS_UNIT; } else { max_bw_value[i] = howmany(temp, MLX5E_1GB); max_bw_unit[i] = MLX5_GBPS_UNIT; } } } static int mlx5e_tc_maxrate_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; struct mlx5_core_dev *mdev = priv->mdev; u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS]; u8 max_bw_value[IEEE_8021QAZ_MAX_TCS]; u64 new_bw_value[IEEE_8021QAZ_MAX_TCS]; u8 max_rates = mlx5_max_tc(mdev) + 1; u8 x; int err; PRIV_LOCK(priv); err = SYSCTL_OUT(req, priv->params_ethtool.max_bw_value, sizeof(priv->params_ethtool.max_bw_value[0]) * max_rates); if (err || !req->newptr) goto done; err = SYSCTL_IN(req, new_bw_value, sizeof(new_bw_value[0]) * max_rates); if (err) goto done; /* range check input value */ for (x = 0; x != max_rates; x++) { if (new_bw_value[x] % MLX5E_100MB) { err = ERANGE; goto done; } } mlx5e_tc_get_parameters(priv, new_bw_value, max_bw_value, max_bw_unit); err = -mlx5_modify_port_tc_rate_limit(mdev, max_bw_value, max_bw_unit); if (err) goto done; memcpy(priv->params_ethtool.max_bw_value, new_bw_value, sizeof(priv->params_ethtool.max_bw_value)); done: PRIV_UNLOCK(priv); return (err); } static int mlx5e_tc_rate_share_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; struct mlx5_core_dev *mdev = priv->mdev; u8 max_bw_share[IEEE_8021QAZ_MAX_TCS]; u8 max_rates = mlx5_max_tc(mdev) + 1; int i; int err; int sum; PRIV_LOCK(priv); err = SYSCTL_OUT(req, priv->params_ethtool.max_bw_share, max_rates); if (err || !req->newptr) goto done; err = SYSCTL_IN(req, max_bw_share, max_rates); if (err) goto done; /* range check input value */ for (sum = i = 0; i != max_rates; i++) { if (max_bw_share[i] < 1 || max_bw_share[i] > 100) { err = ERANGE; goto done; } sum += max_bw_share[i]; } /* sum of values should be as close to 100 as possible */ if (sum < (100 - max_rates + 1) || sum > 100) { err = ERANGE; goto done; } err = -mlx5_set_port_tc_bw_alloc(mdev, max_bw_share); if (err) goto done; memcpy(priv->params_ethtool.max_bw_share, max_bw_share, sizeof(priv->params_ethtool.max_bw_share)); done: PRIV_UNLOCK(priv); return (err); } static int mlx5e_get_prio_tc(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; int err = 0; int i; PRIV_LOCK(priv); if (!MLX5_CAP_GEN(priv->mdev, ets)) { PRIV_UNLOCK(priv); return (EOPNOTSUPP); } for (i = 0; i != MLX5E_MAX_PRIORITY; i++) { err = -mlx5_query_port_prio_tc(mdev, i, priv->params_ethtool.prio_tc + i); if (err) break; } PRIV_UNLOCK(priv); return (err); } static int mlx5e_prio_to_tc_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; struct mlx5_core_dev *mdev = priv->mdev; uint8_t temp[MLX5E_MAX_PRIORITY]; int err; int i; PRIV_LOCK(priv); err = SYSCTL_OUT(req, priv->params_ethtool.prio_tc, MLX5E_MAX_PRIORITY); if (err || !req->newptr) goto done; err = SYSCTL_IN(req, temp, MLX5E_MAX_PRIORITY); if (err) goto done; for (i = 0; i != MLX5E_MAX_PRIORITY; i++) { if (temp[i] > mlx5_max_tc(mdev)) { err = ERANGE; goto done; } } for (i = 0; i != MLX5E_MAX_PRIORITY; i++) { if (temp[i] == priv->params_ethtool.prio_tc[i]) continue; err = -mlx5_set_port_prio_tc(mdev, i, temp[i]); if (err) goto done; /* update cached value */ priv->params_ethtool.prio_tc[i] = temp[i]; } done: PRIV_UNLOCK(priv); return (err); } int mlx5e_fec_update(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; u32 in[MLX5_ST_SZ_DW(pplm_reg)] = {}; const int sz = MLX5_ST_SZ_BYTES(pplm_reg); int err; if (!MLX5_CAP_GEN(mdev, pcam_reg)) return (EOPNOTSUPP); if (!MLX5_CAP_PCAM_REG(mdev, pplm)) return (EOPNOTSUPP); MLX5_SET(pplm_reg, in, local_port, 1); err = -mlx5_core_access_reg(mdev, in, sz, in, sz, MLX5_REG_PPLM, 0, 0); if (err) return (err); /* get 10x..25x mask */ priv->params_ethtool.fec_mask_10x_25x[0] = MLX5_GET(pplm_reg, in, fec_override_admin_10g_40g); priv->params_ethtool.fec_mask_10x_25x[1] = MLX5_GET(pplm_reg, in, fec_override_admin_25g) & MLX5_GET(pplm_reg, in, fec_override_admin_50g); priv->params_ethtool.fec_mask_10x_25x[2] = MLX5_GET(pplm_reg, in, fec_override_admin_56g); priv->params_ethtool.fec_mask_10x_25x[3] = MLX5_GET(pplm_reg, in, fec_override_admin_100g); /* get 10x..25x available bits */ priv->params_ethtool.fec_avail_10x_25x[0] = MLX5_GET(pplm_reg, in, fec_override_cap_10g_40g); priv->params_ethtool.fec_avail_10x_25x[1] = MLX5_GET(pplm_reg, in, fec_override_cap_25g) & MLX5_GET(pplm_reg, in, fec_override_cap_50g); priv->params_ethtool.fec_avail_10x_25x[2] = MLX5_GET(pplm_reg, in, fec_override_cap_56g); priv->params_ethtool.fec_avail_10x_25x[3] = MLX5_GET(pplm_reg, in, fec_override_cap_100g); /* get 50x mask */ priv->params_ethtool.fec_mask_50x[0] = MLX5_GET(pplm_reg, in, fec_override_admin_50g_1x); priv->params_ethtool.fec_mask_50x[1] = MLX5_GET(pplm_reg, in, fec_override_admin_100g_2x); priv->params_ethtool.fec_mask_50x[2] = MLX5_GET(pplm_reg, in, fec_override_admin_200g_4x); priv->params_ethtool.fec_mask_50x[3] = MLX5_GET(pplm_reg, in, fec_override_admin_400g_8x); /* get 50x available bits */ priv->params_ethtool.fec_avail_50x[0] = MLX5_GET(pplm_reg, in, fec_override_cap_50g_1x); priv->params_ethtool.fec_avail_50x[1] = MLX5_GET(pplm_reg, in, fec_override_cap_100g_2x); priv->params_ethtool.fec_avail_50x[2] = MLX5_GET(pplm_reg, in, fec_override_cap_200g_4x); priv->params_ethtool.fec_avail_50x[3] = MLX5_GET(pplm_reg, in, fec_override_cap_400g_8x); /* get current FEC mask */ priv->params_ethtool.fec_mode_active = MLX5_GET(pplm_reg, in, fec_mode_active); return (0); } static int mlx5e_fec_mask_10x_25x_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; struct mlx5_core_dev *mdev = priv->mdev; u32 out[MLX5_ST_SZ_DW(pplm_reg)] = {}; u32 in[MLX5_ST_SZ_DW(pplm_reg)] = {}; const int sz = MLX5_ST_SZ_BYTES(pplm_reg); u8 fec_mask_10x_25x[MLX5E_MAX_FEC_10X_25X]; u8 fec_cap_changed = 0; u8 x; int err; PRIV_LOCK(priv); err = SYSCTL_OUT(req, priv->params_ethtool.fec_mask_10x_25x, sizeof(priv->params_ethtool.fec_mask_10x_25x)); if (err || !req->newptr) goto done; err = SYSCTL_IN(req, fec_mask_10x_25x, sizeof(fec_mask_10x_25x)); if (err) goto done; if (!MLX5_CAP_GEN(mdev, pcam_reg)) { err = EOPNOTSUPP; goto done; } if (!MLX5_CAP_PCAM_REG(mdev, pplm)) { err = EOPNOTSUPP; goto done; } MLX5_SET(pplm_reg, in, local_port, 1); err = -mlx5_core_access_reg(mdev, in, sz, in, sz, MLX5_REG_PPLM, 0, 0); if (err) goto done; /* range check input value */ for (x = 0; x != MLX5E_MAX_FEC_10X_25X; x++) { /* check only one bit is set, if any */ if (fec_mask_10x_25x[x] & (fec_mask_10x_25x[x] - 1)) { err = ERANGE; goto done; } /* check a supported bit is set, if any */ if (fec_mask_10x_25x[x] & ~priv->params_ethtool.fec_avail_10x_25x[x]) { err = ERANGE; goto done; } fec_cap_changed |= (fec_mask_10x_25x[x] ^ priv->params_ethtool.fec_mask_10x_25x[x]); } /* check for no changes */ if (fec_cap_changed == 0) goto done; memset(in, 0, sizeof(in)); MLX5_SET(pplm_reg, in, local_port, 1); /* set new values */ MLX5_SET(pplm_reg, in, fec_override_admin_10g_40g, fec_mask_10x_25x[0]); MLX5_SET(pplm_reg, in, fec_override_admin_25g, fec_mask_10x_25x[1]); MLX5_SET(pplm_reg, in, fec_override_admin_50g, fec_mask_10x_25x[1]); MLX5_SET(pplm_reg, in, fec_override_admin_56g, fec_mask_10x_25x[2]); MLX5_SET(pplm_reg, in, fec_override_admin_100g, fec_mask_10x_25x[3]); /* preserve other values */ MLX5_SET(pplm_reg, in, fec_override_admin_50g_1x, priv->params_ethtool.fec_mask_50x[0]); MLX5_SET(pplm_reg, in, fec_override_admin_100g_2x, priv->params_ethtool.fec_mask_50x[1]); MLX5_SET(pplm_reg, in, fec_override_admin_200g_4x, priv->params_ethtool.fec_mask_50x[2]); MLX5_SET(pplm_reg, in, fec_override_admin_400g_8x, priv->params_ethtool.fec_mask_50x[3]); /* send new value to the firmware */ err = -mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPLM, 0, 1); if (err) goto done; memcpy(priv->params_ethtool.fec_mask_10x_25x, fec_mask_10x_25x, sizeof(priv->params_ethtool.fec_mask_10x_25x)); mlx5_toggle_port_link(priv->mdev); done: PRIV_UNLOCK(priv); return (err); } static int mlx5e_fec_avail_10x_25x_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; int err; PRIV_LOCK(priv); err = SYSCTL_OUT(req, priv->params_ethtool.fec_avail_10x_25x, sizeof(priv->params_ethtool.fec_avail_10x_25x)); PRIV_UNLOCK(priv); return (err); } static int mlx5e_fec_mask_50x_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; struct mlx5_core_dev *mdev = priv->mdev; u32 out[MLX5_ST_SZ_DW(pplm_reg)] = {}; u32 in[MLX5_ST_SZ_DW(pplm_reg)] = {}; const int sz = MLX5_ST_SZ_BYTES(pplm_reg); u16 fec_mask_50x[MLX5E_MAX_FEC_50X]; u16 fec_cap_changed = 0; u8 x; int err; PRIV_LOCK(priv); err = SYSCTL_OUT(req, priv->params_ethtool.fec_mask_50x, sizeof(priv->params_ethtool.fec_mask_50x)); if (err || !req->newptr) goto done; err = SYSCTL_IN(req, fec_mask_50x, sizeof(fec_mask_50x)); if (err) goto done; if (!MLX5_CAP_GEN(mdev, pcam_reg)) { err = EOPNOTSUPP; goto done; } if (!MLX5_CAP_PCAM_REG(mdev, pplm)) { err = EOPNOTSUPP; goto done; } MLX5_SET(pplm_reg, in, local_port, 1); err = -mlx5_core_access_reg(mdev, in, sz, in, sz, MLX5_REG_PPLM, 0, 0); if (err) goto done; /* range check input value */ for (x = 0; x != MLX5E_MAX_FEC_50X; x++) { /* check only one bit is set, if any */ if (fec_mask_50x[x] & (fec_mask_50x[x] - 1)) { err = ERANGE; goto done; } /* check a supported bit is set, if any */ if (fec_mask_50x[x] & ~priv->params_ethtool.fec_avail_50x[x]) { err = ERANGE; goto done; } fec_cap_changed |= (fec_mask_50x[x] ^ priv->params_ethtool.fec_mask_50x[x]); } /* check for no changes */ if (fec_cap_changed == 0) goto done; memset(in, 0, sizeof(in)); MLX5_SET(pplm_reg, in, local_port, 1); /* set new values */ MLX5_SET(pplm_reg, in, fec_override_admin_50g_1x, fec_mask_50x[0]); MLX5_SET(pplm_reg, in, fec_override_admin_100g_2x, fec_mask_50x[1]); MLX5_SET(pplm_reg, in, fec_override_admin_200g_4x, fec_mask_50x[2]); MLX5_SET(pplm_reg, in, fec_override_admin_400g_8x, fec_mask_50x[3]); /* preserve other values */ MLX5_SET(pplm_reg, in, fec_override_admin_10g_40g, priv->params_ethtool.fec_mask_10x_25x[0]); MLX5_SET(pplm_reg, in, fec_override_admin_25g, priv->params_ethtool.fec_mask_10x_25x[1]); MLX5_SET(pplm_reg, in, fec_override_admin_50g, priv->params_ethtool.fec_mask_10x_25x[1]); MLX5_SET(pplm_reg, in, fec_override_admin_56g, priv->params_ethtool.fec_mask_10x_25x[2]); MLX5_SET(pplm_reg, in, fec_override_admin_100g, priv->params_ethtool.fec_mask_10x_25x[3]); /* send new value to the firmware */ err = -mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPLM, 0, 1); if (err) goto done; memcpy(priv->params_ethtool.fec_mask_50x, fec_mask_50x, sizeof(priv->params_ethtool.fec_mask_50x)); mlx5_toggle_port_link(priv->mdev); done: PRIV_UNLOCK(priv); return (err); } static int mlx5e_fec_avail_50x_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; int err; PRIV_LOCK(priv); err = SYSCTL_OUT(req, priv->params_ethtool.fec_avail_50x, sizeof(priv->params_ethtool.fec_avail_50x)); PRIV_UNLOCK(priv); return (err); } static int mlx5e_trust_state_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; struct mlx5_core_dev *mdev = priv->mdev; int err; u8 result; PRIV_LOCK(priv); result = priv->params_ethtool.trust_state; err = sysctl_handle_8(oidp, &result, 0, req); if (err || !req->newptr || result == priv->params_ethtool.trust_state) goto done; switch (result) { case MLX5_QPTS_TRUST_PCP: case MLX5_QPTS_TRUST_DSCP: break; case MLX5_QPTS_TRUST_BOTH: if (!MLX5_CAP_QCAM_FEATURE(mdev, qpts_trust_both)) { err = EOPNOTSUPP; goto done; } break; default: err = ERANGE; goto done; } err = -mlx5_set_trust_state(mdev, result); if (err) goto done; priv->params_ethtool.trust_state = result; /* update inline mode */ mlx5e_refresh_sq_inline(priv); #ifdef RATELIMIT mlx5e_rl_refresh_sq_inline(&priv->rl); #endif done: PRIV_UNLOCK(priv); return (err); } static int mlx5e_dscp_prio_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; int prio_index = arg2; struct mlx5_core_dev *mdev = priv->mdev; uint8_t dscp2prio[MLX5_MAX_SUPPORTED_DSCP]; uint8_t x; int err; PRIV_LOCK(priv); err = SYSCTL_OUT(req, priv->params_ethtool.dscp2prio + prio_index, sizeof(priv->params_ethtool.dscp2prio) / 8); if (err || !req->newptr) goto done; memcpy(dscp2prio, priv->params_ethtool.dscp2prio, sizeof(dscp2prio)); err = SYSCTL_IN(req, dscp2prio + prio_index, sizeof(dscp2prio) / 8); if (err) goto done; for (x = 0; x != MLX5_MAX_SUPPORTED_DSCP; x++) { if (dscp2prio[x] > 7) { err = ERANGE; goto done; } } err = -mlx5_set_dscp2prio(mdev, dscp2prio); if (err) goto done; /* update local array */ memcpy(priv->params_ethtool.dscp2prio, dscp2prio, sizeof(priv->params_ethtool.dscp2prio)); done: PRIV_UNLOCK(priv); return (err); } int mlx5e_update_buf_lossy(struct mlx5e_priv *priv) { struct ieee_pfc pfc; PRIV_ASSERT_LOCKED(priv); bzero(&pfc, sizeof(pfc)); pfc.pfc_en = priv->params.rx_priority_flow_control; return (-mlx5e_port_manual_buffer_config(priv, MLX5E_PORT_BUFFER_PFC, priv->params_ethtool.hw_mtu, &pfc, NULL, NULL)); } static int mlx5e_buf_size_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv; u32 buf_size[MLX5E_MAX_BUFFER]; struct mlx5e_port_buffer port_buffer; int error, i; priv = arg1; PRIV_LOCK(priv); error = -mlx5e_port_query_buffer(priv, &port_buffer); if (error != 0) goto done; for (i = 0; i < nitems(buf_size); i++) buf_size[i] = port_buffer.buffer[i].size; error = SYSCTL_OUT(req, buf_size, sizeof(buf_size)); if (error != 0 || req->newptr == NULL) goto done; error = SYSCTL_IN(req, buf_size, sizeof(buf_size)); if (error != 0) goto done; error = -mlx5e_port_manual_buffer_config(priv, MLX5E_PORT_BUFFER_SIZE, priv->params_ethtool.hw_mtu, NULL, buf_size, NULL); done: PRIV_UNLOCK(priv); return (error); } static int mlx5e_buf_prio_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv; struct mlx5_core_dev *mdev; u8 buffer[MLX5E_MAX_BUFFER]; int error; priv = arg1; mdev = priv->mdev; PRIV_LOCK(priv); error = -mlx5e_port_query_priority2buffer(mdev, buffer); if (error != 0) goto done; error = SYSCTL_OUT(req, buffer, MLX5E_MAX_BUFFER); if (error != 0 || req->newptr == NULL) goto done; error = SYSCTL_IN(req, buffer, MLX5E_MAX_BUFFER); if (error != 0) goto done; error = -mlx5e_port_manual_buffer_config(priv, MLX5E_PORT_BUFFER_PRIO2BUFFER, priv->params_ethtool.hw_mtu, NULL, NULL, buffer); if (error == 0) error = mlx5e_update_buf_lossy(priv); done: PRIV_UNLOCK(priv); return (error); } static int mlx5e_cable_length_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv; u_int cable_len; int error; priv = arg1; PRIV_LOCK(priv); cable_len = priv->dcbx.cable_len; error = sysctl_handle_int(oidp, &cable_len, 0, req); if (error == 0 && req->newptr != NULL && cable_len != priv->dcbx.cable_len) { error = -mlx5e_port_manual_buffer_config(priv, MLX5E_PORT_BUFFER_CABLE_LEN, priv->params_ethtool.hw_mtu, NULL, NULL, NULL); if (error == 0) priv->dcbx.cable_len = cable_len; } PRIV_UNLOCK(priv); return (error); } static int mlx5e_hw_temperature_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; int err; PRIV_LOCK(priv); err = SYSCTL_OUT(req, priv->params_ethtool.hw_val_temp, sizeof(priv->params_ethtool.hw_val_temp[0]) * priv->params_ethtool.hw_num_temp); if (err == 0 && req->newptr != NULL) err = EOPNOTSUPP; PRIV_UNLOCK(priv); return (err); } int mlx5e_hw_temperature_update(struct mlx5e_priv *priv) { int err; u32 x; if (priv->params_ethtool.hw_num_temp == 0) { u32 out_cap[MLX5_ST_SZ_DW(mtcap)] = {}; const int sz_cap = MLX5_ST_SZ_BYTES(mtcap); u32 value; err = -mlx5_core_access_reg(priv->mdev, NULL, 0, out_cap, sz_cap, MLX5_ACCESS_REG_SUMMARY_CTRL_ID_MTCAP, 0, 0); if (err) goto done; value = MLX5_GET(mtcap, out_cap, sensor_count); if (value == 0) return (0); if (value > MLX5_MAX_TEMPERATURE) value = MLX5_MAX_TEMPERATURE; /* update number of temperature sensors */ priv->params_ethtool.hw_num_temp = value; } for (x = 0; x != priv->params_ethtool.hw_num_temp; x++) { u32 out_sensor[MLX5_ST_SZ_DW(mtmp_reg)] = {}; const int sz_sensor = MLX5_ST_SZ_BYTES(mtmp_reg); MLX5_SET(mtmp_reg, out_sensor, sensor_index, x); err = -mlx5_core_access_reg(priv->mdev, out_sensor, sz_sensor, out_sensor, sz_sensor, MLX5_ACCESS_REG_SUMMARY_CTRL_ID_MTMP, 0, 0); if (err) goto done; /* convert from 0.125 celsius to millicelsius */ priv->params_ethtool.hw_val_temp[x] = (s16)MLX5_GET(mtmp_reg, out_sensor, temperature) * 125; } done: return (err); } #define MLX5_PARAM_OFFSET(n) \ __offsetof(struct mlx5e_priv, params_ethtool.n) static int mlx5e_ethtool_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; uint64_t value; int mode_modify; int was_opened; int error; PRIV_LOCK(priv); value = priv->params_ethtool.arg[arg2]; if (req != NULL) { error = sysctl_handle_64(oidp, &value, 0, req); if (error || req->newptr == NULL || value == priv->params_ethtool.arg[arg2]) goto done; /* assign new value */ priv->params_ethtool.arg[arg2] = value; } else { error = 0; } /* check if device is gone */ if (priv->gone) { error = ENXIO; goto done; } was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify); switch (MLX5_PARAM_OFFSET(arg[arg2])) { case MLX5_PARAM_OFFSET(rx_coalesce_usecs): /* import RX coal time */ if (priv->params_ethtool.rx_coalesce_usecs < 1) priv->params_ethtool.rx_coalesce_usecs = 0; else if (priv->params_ethtool.rx_coalesce_usecs > MLX5E_FLD_MAX(cqc, cq_period)) { priv->params_ethtool.rx_coalesce_usecs = MLX5E_FLD_MAX(cqc, cq_period); } priv->params.rx_cq_moderation_usec = priv->params_ethtool.rx_coalesce_usecs; /* check to avoid down and up the network interface */ if (was_opened) error = mlx5e_refresh_channel_params(priv); break; case MLX5_PARAM_OFFSET(rx_coalesce_pkts): /* import RX coal pkts */ if (priv->params_ethtool.rx_coalesce_pkts < 1) priv->params_ethtool.rx_coalesce_pkts = 0; else if (priv->params_ethtool.rx_coalesce_pkts > MLX5E_FLD_MAX(cqc, cq_max_count)) { priv->params_ethtool.rx_coalesce_pkts = MLX5E_FLD_MAX(cqc, cq_max_count); } priv->params.rx_cq_moderation_pkts = priv->params_ethtool.rx_coalesce_pkts; /* check to avoid down and up the network interface */ if (was_opened) error = mlx5e_refresh_channel_params(priv); break; case MLX5_PARAM_OFFSET(tx_coalesce_usecs): /* import TX coal time */ if (priv->params_ethtool.tx_coalesce_usecs < 1) priv->params_ethtool.tx_coalesce_usecs = 0; else if (priv->params_ethtool.tx_coalesce_usecs > MLX5E_FLD_MAX(cqc, cq_period)) { priv->params_ethtool.tx_coalesce_usecs = MLX5E_FLD_MAX(cqc, cq_period); } priv->params.tx_cq_moderation_usec = priv->params_ethtool.tx_coalesce_usecs; /* check to avoid down and up the network interface */ if (was_opened) error = mlx5e_refresh_channel_params(priv); break; case MLX5_PARAM_OFFSET(tx_coalesce_pkts): /* import TX coal pkts */ if (priv->params_ethtool.tx_coalesce_pkts < 1) priv->params_ethtool.tx_coalesce_pkts = 0; else if (priv->params_ethtool.tx_coalesce_pkts > MLX5E_FLD_MAX(cqc, cq_max_count)) { priv->params_ethtool.tx_coalesce_pkts = MLX5E_FLD_MAX(cqc, cq_max_count); } priv->params.tx_cq_moderation_pkts = priv->params_ethtool.tx_coalesce_pkts; /* check to avoid down and up the network interface */ if (was_opened) error = mlx5e_refresh_channel_params(priv); break; case MLX5_PARAM_OFFSET(tx_queue_size): /* network interface must be down */ if (was_opened) mlx5e_close_locked(priv->ifp); /* import TX queue size */ if (priv->params_ethtool.tx_queue_size < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) { priv->params_ethtool.tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); } else if (priv->params_ethtool.tx_queue_size > priv->params_ethtool.tx_queue_size_max) { priv->params_ethtool.tx_queue_size = priv->params_ethtool.tx_queue_size_max; } /* store actual TX queue size */ priv->params.log_sq_size = order_base_2(priv->params_ethtool.tx_queue_size); priv->params_ethtool.tx_queue_size = 1 << priv->params.log_sq_size; /* verify TX completion factor */ mlx5e_ethtool_sync_tx_completion_fact(priv); /* restart network interface, if any */ if (was_opened) mlx5e_open_locked(priv->ifp); break; case MLX5_PARAM_OFFSET(rx_queue_size): /* network interface must be down */ if (was_opened) mlx5e_close_locked(priv->ifp); /* import RX queue size */ if (priv->params_ethtool.rx_queue_size < (1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE)) { priv->params_ethtool.rx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE); } else if (priv->params_ethtool.rx_queue_size > priv->params_ethtool.rx_queue_size_max) { priv->params_ethtool.rx_queue_size = priv->params_ethtool.rx_queue_size_max; } /* store actual RX queue size */ priv->params.log_rq_size = order_base_2(priv->params_ethtool.rx_queue_size); priv->params_ethtool.rx_queue_size = 1 << priv->params.log_rq_size; /* update least number of RX WQEs */ priv->params.min_rx_wqes = min( priv->params_ethtool.rx_queue_size - 1, MLX5E_PARAMS_DEFAULT_MIN_RX_WQES); /* restart network interface, if any */ if (was_opened) mlx5e_open_locked(priv->ifp); break; case MLX5_PARAM_OFFSET(channels_rsss): /* network interface must be down */ if (was_opened) mlx5e_close_locked(priv->ifp); /* import number of channels */ if (priv->params_ethtool.channels_rsss < 1) priv->params_ethtool.channels_rsss = 1; else if (priv->params_ethtool.channels_rsss > 128) priv->params_ethtool.channels_rsss = 128; priv->params.channels_rsss = priv->params_ethtool.channels_rsss; /* restart network interface, if any */ if (was_opened) mlx5e_open_locked(priv->ifp); break; case MLX5_PARAM_OFFSET(channels): /* network interface must be down */ if (was_opened) mlx5e_close_locked(priv->ifp); /* import number of channels */ if (priv->params_ethtool.channels < 1) priv->params_ethtool.channels = 1; else if (priv->params_ethtool.channels > (u64) priv->mdev->priv.eq_table.num_comp_vectors) { priv->params_ethtool.channels = (u64) priv->mdev->priv.eq_table.num_comp_vectors; } priv->params.num_channels = priv->params_ethtool.channels; /* restart network interface, if any */ if (was_opened) mlx5e_open_locked(priv->ifp); break; case MLX5_PARAM_OFFSET(rx_coalesce_mode): /* network interface must be down */ if (was_opened != 0 && mode_modify == 0) mlx5e_close_locked(priv->ifp); /* import RX coalesce mode */ if (priv->params_ethtool.rx_coalesce_mode > 3) priv->params_ethtool.rx_coalesce_mode = 3; priv->params.rx_cq_moderation_mode = priv->params_ethtool.rx_coalesce_mode; /* restart network interface, if any */ if (was_opened != 0) { if (mode_modify == 0) mlx5e_open_locked(priv->ifp); else error = mlx5e_refresh_channel_params(priv); } break; case MLX5_PARAM_OFFSET(tx_coalesce_mode): /* network interface must be down */ if (was_opened != 0 && mode_modify == 0) mlx5e_close_locked(priv->ifp); /* import TX coalesce mode */ if (priv->params_ethtool.tx_coalesce_mode != 0) priv->params_ethtool.tx_coalesce_mode = 1; priv->params.tx_cq_moderation_mode = priv->params_ethtool.tx_coalesce_mode; /* restart network interface, if any */ if (was_opened != 0) { if (mode_modify == 0) mlx5e_open_locked(priv->ifp); else error = mlx5e_refresh_channel_params(priv); } break; case MLX5_PARAM_OFFSET(hw_lro): /* network interface must be down */ if (was_opened) mlx5e_close_locked(priv->ifp); /* import HW LRO mode */ if (priv->params_ethtool.hw_lro != 0 && MLX5_CAP_ETH(priv->mdev, lro_cap)) { priv->params_ethtool.hw_lro = 1; /* check if feature should actually be enabled */ - if (priv->ifp->if_capenable & IFCAP_LRO) { + if (if_getcapenable(priv->ifp) & IFCAP_LRO) { priv->params.hw_lro_en = true; } else { priv->params.hw_lro_en = false; mlx5_en_warn(priv->ifp, "To enable HW LRO " "please also enable LRO via ifconfig(8).\n"); } } else { /* return an error if HW does not support this feature */ if (priv->params_ethtool.hw_lro != 0) error = EINVAL; priv->params.hw_lro_en = false; priv->params_ethtool.hw_lro = 0; } /* restart network interface, if any */ if (was_opened) mlx5e_open_locked(priv->ifp); break; case MLX5_PARAM_OFFSET(cqe_zipping): /* network interface must be down */ if (was_opened) mlx5e_close_locked(priv->ifp); /* import CQE zipping mode */ if (priv->params_ethtool.cqe_zipping && MLX5_CAP_GEN(priv->mdev, cqe_compression)) { priv->params.cqe_zipping_en = true; priv->params_ethtool.cqe_zipping = 1; } else { priv->params.cqe_zipping_en = false; priv->params_ethtool.cqe_zipping = 0; } /* restart network interface, if any */ if (was_opened) mlx5e_open_locked(priv->ifp); break; case MLX5_PARAM_OFFSET(tx_completion_fact): /* network interface must be down */ if (was_opened) mlx5e_close_locked(priv->ifp); /* verify parameter */ mlx5e_ethtool_sync_tx_completion_fact(priv); /* restart network interface, if any */ if (was_opened) mlx5e_open_locked(priv->ifp); break; case MLX5_PARAM_OFFSET(modify_tx_dma): /* check if network interface is opened */ if (was_opened) { priv->params_ethtool.modify_tx_dma = priv->params_ethtool.modify_tx_dma ? 1 : 0; /* modify tx according to value */ mlx5e_modify_tx_dma(priv, value != 0); } else { /* if closed force enable tx */ priv->params_ethtool.modify_tx_dma = 0; } break; case MLX5_PARAM_OFFSET(modify_rx_dma): /* check if network interface is opened */ if (was_opened) { priv->params_ethtool.modify_rx_dma = priv->params_ethtool.modify_rx_dma ? 1 : 0; /* modify rx according to value */ mlx5e_modify_rx_dma(priv, value != 0); } else { /* if closed force enable rx */ priv->params_ethtool.modify_rx_dma = 0; } break; case MLX5_PARAM_OFFSET(diag_pci_enable): priv->params_ethtool.diag_pci_enable = priv->params_ethtool.diag_pci_enable ? 1 : 0; error = -mlx5_core_set_diagnostics_full(priv->mdev, priv->params_ethtool.diag_pci_enable, priv->params_ethtool.diag_general_enable); break; case MLX5_PARAM_OFFSET(diag_general_enable): priv->params_ethtool.diag_general_enable = priv->params_ethtool.diag_general_enable ? 1 : 0; error = -mlx5_core_set_diagnostics_full(priv->mdev, priv->params_ethtool.diag_pci_enable, priv->params_ethtool.diag_general_enable); break; case MLX5_PARAM_OFFSET(mc_local_lb): /* check if mlx5ib is managing this feature */ if (MLX5_CAP_GEN(priv->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) { error = EOPNOTSUPP; break; } priv->params_ethtool.mc_local_lb = priv->params_ethtool.mc_local_lb ? 1 : 0; if (MLX5_CAP_GEN(priv->mdev, disable_local_lb_mc)) { error = mlx5_nic_vport_modify_local_lb(priv->mdev, MLX5_LOCAL_MC_LB, priv->params_ethtool.mc_local_lb); } else { error = EOPNOTSUPP; } break; case MLX5_PARAM_OFFSET(uc_local_lb): /* check if mlx5ib is managing this feature */ if (MLX5_CAP_GEN(priv->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) { error = EOPNOTSUPP; break; } priv->params_ethtool.uc_local_lb = priv->params_ethtool.uc_local_lb ? 1 : 0; if (MLX5_CAP_GEN(priv->mdev, disable_local_lb_uc)) { error = mlx5_nic_vport_modify_local_lb(priv->mdev, MLX5_LOCAL_UC_LB, priv->params_ethtool.uc_local_lb); } else { error = EOPNOTSUPP; } break; case MLX5_PARAM_OFFSET(irq_cpu_base): case MLX5_PARAM_OFFSET(irq_cpu_stride): if (was_opened) { /* network interface must toggled */ mlx5e_close_locked(priv->ifp); mlx5e_open_locked(priv->ifp); } break; default: break; } done: PRIV_UNLOCK(priv); return (error); } static const char *mlx5e_params_desc[] = { MLX5E_PARAMS(MLX5E_STATS_DESC) }; static const char *mlx5e_port_stats_debug_desc[] = { MLX5E_PORT_STATS_DEBUG(MLX5E_STATS_DESC) }; static int mlx5e_ethtool_debug_channel_info(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv; struct sbuf sb; struct mlx5e_channel *c; struct mlx5e_sq *sq; struct mlx5e_rq *rq; int error, i, tc; bool opened; priv = arg1; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (sbuf_new_for_sysctl(&sb, NULL, 1024, req) == NULL) return (ENOMEM); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); PRIV_LOCK(priv); opened = test_bit(MLX5E_STATE_OPENED, &priv->state); sbuf_printf(&sb, "pages irq %d\n", priv->mdev->priv.msix_arr[MLX5_EQ_VEC_PAGES].vector); sbuf_printf(&sb, "command irq %d\n", priv->mdev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector); sbuf_printf(&sb, "async irq %d\n", priv->mdev->priv.msix_arr[MLX5_EQ_VEC_ASYNC].vector); for (i = 0; i != priv->params.num_channels; i++) { int eqn_not_used = -1; int irqn = MLX5_EQ_VEC_COMP_BASE; if (mlx5_vector2eqn(priv->mdev, i, &eqn_not_used, &irqn) != 0) continue; c = opened ? &priv->channel[i] : NULL; rq = opened ? &c->rq : NULL; sbuf_printf(&sb, "channel %d rq %d cq %d irq %d\n", i, opened ? rq->rqn : -1, opened ? rq->cq.mcq.cqn : -1, priv->mdev->priv.msix_arr[irqn].vector); for (tc = 0; tc != priv->num_tc; tc++) { sq = opened ? &c->sq[tc] : NULL; sbuf_printf(&sb, "channel %d tc %d sq %d cq %d irq %d\n", i, tc, opened ? sq->sqn : -1, opened ? sq->cq.mcq.cqn : -1, priv->mdev->priv.msix_arr[irqn].vector); } } PRIV_UNLOCK(priv); error = sbuf_finish(&sb); sbuf_delete(&sb); return (error); } static int mlx5e_ethtool_debug_stats(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; int sys_debug; int error; PRIV_LOCK(priv); if (priv->gone != 0) { error = ENODEV; goto done; } sys_debug = priv->sysctl_debug; error = sysctl_handle_int(oidp, &sys_debug, 0, req); if (error != 0 || !req->newptr) goto done; sys_debug = sys_debug ? 1 : 0; if (sys_debug == priv->sysctl_debug) goto done; if ((priv->sysctl_debug = sys_debug)) { mlx5e_create_stats(&priv->stats.port_stats_debug.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), "debug_stats", mlx5e_port_stats_debug_desc, MLX5E_PORT_STATS_DEBUG_NUM, priv->stats.port_stats_debug.arg); SYSCTL_ADD_PROC(&priv->stats.port_stats_debug.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "hw_ctx_debug", CTLFLAG_RD | CTLFLAG_MPSAFE | CTLTYPE_STRING, priv, 0, mlx5e_ethtool_debug_channel_info, "S", ""); } else { sysctl_ctx_free(&priv->stats.port_stats_debug.ctx); } done: PRIV_UNLOCK(priv); return (error); } static void mlx5e_create_diagnostics(struct mlx5e_priv *priv) { struct mlx5_core_diagnostics_entry entry; struct sysctl_ctx_list *ctx; struct sysctl_oid *node; int x; /* sysctl context we are using */ ctx = &priv->sysctl_ctx; /* create root node */ node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "diagnostics", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Diagnostics"); if (node == NULL) return; /* create PCI diagnostics */ for (x = 0; x != MLX5_CORE_PCI_DIAGNOSTICS_NUM; x++) { entry = mlx5_core_pci_diagnostics_table[x]; if (mlx5_core_supports_diagnostics(priv->mdev, entry.counter_id) == 0) continue; SYSCTL_ADD_UQUAD(ctx, SYSCTL_CHILDREN(node), OID_AUTO, entry.desc, CTLFLAG_RD, priv->params_pci.array + x, "PCI diagnostics counter"); } /* create general diagnostics */ for (x = 0; x != MLX5_CORE_GENERAL_DIAGNOSTICS_NUM; x++) { entry = mlx5_core_general_diagnostics_table[x]; if (mlx5_core_supports_diagnostics(priv->mdev, entry.counter_id) == 0) continue; SYSCTL_ADD_UQUAD(ctx, SYSCTL_CHILDREN(node), OID_AUTO, entry.desc, CTLFLAG_RD, priv->params_general.array + x, "General diagnostics counter"); } } void mlx5e_create_ethtool(struct mlx5e_priv *priv) { struct sysctl_oid *fec_node; struct sysctl_oid *qos_node; struct sysctl_oid *node; const char *pnameunit; struct mlx5e_port_buffer port_buffer; unsigned x; int i; /* set some defaults */ priv->params_ethtool.irq_cpu_base = -1; /* disabled */ priv->params_ethtool.irq_cpu_stride = 1; priv->params_ethtool.tx_queue_size_max = 1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE; priv->params_ethtool.rx_queue_size_max = 1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE; priv->params_ethtool.tx_queue_size = 1 << priv->params.log_sq_size; priv->params_ethtool.rx_queue_size = 1 << priv->params.log_rq_size; priv->params_ethtool.channels = priv->params.num_channels; priv->params_ethtool.channels_rsss = priv->params.channels_rsss; priv->params_ethtool.coalesce_pkts_max = MLX5E_FLD_MAX(cqc, cq_max_count); priv->params_ethtool.coalesce_usecs_max = MLX5E_FLD_MAX(cqc, cq_period); priv->params_ethtool.rx_coalesce_mode = priv->params.rx_cq_moderation_mode; priv->params_ethtool.rx_coalesce_usecs = priv->params.rx_cq_moderation_usec; priv->params_ethtool.rx_coalesce_pkts = priv->params.rx_cq_moderation_pkts; priv->params_ethtool.tx_coalesce_mode = priv->params.tx_cq_moderation_mode; priv->params_ethtool.tx_coalesce_usecs = priv->params.tx_cq_moderation_usec; priv->params_ethtool.tx_coalesce_pkts = priv->params.tx_cq_moderation_pkts; priv->params_ethtool.hw_lro = priv->params.hw_lro_en; priv->params_ethtool.cqe_zipping = priv->params.cqe_zipping_en; mlx5e_ethtool_sync_tx_completion_fact(priv); /* get default values for local loopback, if any */ if (MLX5_CAP_GEN(priv->mdev, disable_local_lb_mc) || MLX5_CAP_GEN(priv->mdev, disable_local_lb_uc)) { int err; u8 val; err = mlx5_nic_vport_query_local_lb(priv->mdev, MLX5_LOCAL_MC_LB, &val); if (err == 0) priv->params_ethtool.mc_local_lb = val; err = mlx5_nic_vport_query_local_lb(priv->mdev, MLX5_LOCAL_UC_LB, &val); if (err == 0) priv->params_ethtool.uc_local_lb = val; } /* create root node */ node = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "conf", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Configuration"); if (node == NULL) return; for (x = 0; x != MLX5E_PARAMS_NUM; x++) { /* check for read-only parameter */ if (strstr(mlx5e_params_desc[2 * x], "_max") != NULL || strstr(mlx5e_params_desc[2 * x], "_mtu") != NULL) { SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(node), OID_AUTO, mlx5e_params_desc[2 * x], CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, priv, x, &mlx5e_ethtool_handler, "QU", mlx5e_params_desc[2 * x + 1]); } else if (strcmp(mlx5e_params_desc[2 * x], "hw_lro") == 0) { /* read-only, but tunable parameters */ SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(node), OID_AUTO, mlx5e_params_desc[2 * x], CTLTYPE_U64 | CTLFLAG_RDTUN | CTLFLAG_MPSAFE, priv, x, &mlx5e_ethtool_handler, "QU", mlx5e_params_desc[2 * x + 1]); } else { /* * NOTE: In FreeBSD-11 and newer the * CTLFLAG_RWTUN flag will take care of * loading default sysctl value from the * kernel environment, if any: */ SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(node), OID_AUTO, mlx5e_params_desc[2 * x], CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, x, &mlx5e_ethtool_handler, "QU", mlx5e_params_desc[2 * x + 1]); } } /* create fec node */ fec_node = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_CHILDREN(node), OID_AUTO, "fec", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Forward Error Correction"); if (fec_node == NULL) return; if (mlx5e_fec_update(priv) == 0) { SYSCTL_ADD_U32(&priv->sysctl_ctx, SYSCTL_CHILDREN(fec_node), OID_AUTO, "mode_active", CTLFLAG_RD | CTLFLAG_MPSAFE, &priv->params_ethtool.fec_mode_active, 0, "Current FEC mode bit, if any."); SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(fec_node), OID_AUTO, "mask_10x_25x", CTLTYPE_U8 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, 0, &mlx5e_fec_mask_10x_25x_handler, "CU", "Set FEC masks for 10G_40G, 25G_50G, 56G, 100G respectivly. " "0:Auto " "1:NOFEC " "2:FIRECODE " "4:RS"); SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(fec_node), OID_AUTO, "avail_10x_25x", CTLTYPE_U8 | CTLFLAG_RD | CTLFLAG_MPSAFE, priv, 0, &mlx5e_fec_avail_10x_25x_handler, "CU", "Get available FEC bits for 10G_40G, 25G_50G, 56G, 100G respectivly. " "0:Auto " "1:NOFEC " "2:FIRECODE " "4:RS"); SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(fec_node), OID_AUTO, "mask_50x", CTLTYPE_U16 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, 0, &mlx5e_fec_mask_50x_handler, "SU", "Set FEC masks for 50G 1x, 100G 2x, 200G 4x, 400G 8x respectivly. " "0:Auto " "128:RS " "512:LL RS"); SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(fec_node), OID_AUTO, "avail_50x", CTLTYPE_U16 | CTLFLAG_RD | CTLFLAG_MPSAFE, priv, 0, &mlx5e_fec_avail_50x_handler, "SU", "Get available FEC bits for 50G 1x, 100G 2x, 200G 4x, 400G 8x respectivly. " "0:Auto " "128:RS " "512:LL RS"); } SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(node), OID_AUTO, "debug_stats", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, &mlx5e_ethtool_debug_stats, "I", "Extended debug statistics"); pnameunit = device_get_nameunit(priv->mdev->pdev->dev.bsddev); SYSCTL_ADD_STRING(&priv->sysctl_ctx, SYSCTL_CHILDREN(node), OID_AUTO, "device_name", CTLFLAG_RD, __DECONST(void *, pnameunit), 0, "PCI device name"); /* Diagnostics support */ mlx5e_create_diagnostics(priv); /* create qos node */ qos_node = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_CHILDREN(node), OID_AUTO, "qos", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Quality Of Service configuration"); if (qos_node == NULL) return; /* Priority rate limit support */ if (mlx5e_getmaxrate(priv) == 0) { SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(qos_node), OID_AUTO, "tc_max_rate", CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, 0, mlx5e_tc_maxrate_handler, "QU", "Max rate for priority, specified in kilobits, where kilo=1000, " "max_rate must be divisible by 100000"); } /* Bandwidth limiting by ratio */ if (mlx5e_get_max_alloc(priv) == 0) { SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(qos_node), OID_AUTO, "tc_rate_share", CTLTYPE_U8 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, 0, mlx5e_tc_rate_share_handler, "QU", "Specify bandwidth ratio from 1 to 100 " "for the available traffic classes"); } /* Priority to traffic class mapping */ if (mlx5e_get_prio_tc(priv) == 0) { SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(qos_node), OID_AUTO, "prio_0_7_tc", CTLTYPE_U8 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, 0, mlx5e_prio_to_tc_handler, "CU", "Set traffic class 0 to 7 for priority 0 to 7 inclusivly"); } /* DSCP support */ if (mlx5e_get_dscp(priv) == 0) { for (i = 0; i != MLX5_MAX_SUPPORTED_DSCP; i += 8) { char name[32]; snprintf(name, sizeof(name), "dscp_%d_%d_prio", i, i + 7); SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(qos_node), OID_AUTO, name, CTLTYPE_U8 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, i, mlx5e_dscp_prio_handler, "CU", "Set DSCP to priority mapping, 0..7"); } #define A "Set trust state, 1:PCP 2:DSCP" #define B " 3:BOTH" SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(qos_node), OID_AUTO, "trust_state", CTLTYPE_U8 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, 0, mlx5e_trust_state_handler, "CU", MLX5_CAP_QCAM_FEATURE(priv->mdev, qpts_trust_both) ? A B : A); #undef B #undef A } if (mlx5e_port_query_buffer(priv, &port_buffer) == 0) { SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(qos_node), OID_AUTO, "buffers_size", CTLTYPE_U32 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, 0, mlx5e_buf_size_handler, "IU", "Set buffers sizes"); SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(qos_node), OID_AUTO, "buffers_prio", CTLTYPE_U8 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, 0, mlx5e_buf_prio_handler, "CU", "Set prio to buffers mapping"); SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(qos_node), OID_AUTO, "cable_length", CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, 0, mlx5e_cable_length_handler, "IU", "Set cable length in meters for xoff threshold calculation"); } if (mlx5e_hw_temperature_update(priv) == 0) { SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "hw_temperature", CTLTYPE_S32 | CTLFLAG_RD | CTLFLAG_MPSAFE, priv, 0, mlx5e_hw_temperature_handler, "I", "HW temperature in millicelsius"); } } diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c b/sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c index 5607aad50da5..aec2bb646644 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c @@ -1,2386 +1,2386 @@ /*- * Copyright (c) 2015-2021 Mellanox Technologies. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_rss.h" #include "opt_ratelimit.h" #include #include #include #include #include /* * The flow tables with rules define the packet processing on receive. * Currently the following structure is set up to handle different * offloads like TLS RX offload, VLAN decapsulation, packet * classification, RSS hashing, VxLAN checksum offloading: * * +=========+ +=========+ +=================+ * |TCP/IPv4 | |TCP/IPv4 | |TCP/IPv4 Match | * |Flowtable|------>| |----->|Outer Proto Match|=====> TLS TIR n * | | |Catch-all|\ | | * +=========+ +=========+| +=================+ * | * +------------------------+ * V * +=========+ +=========+ +=================+ * |TCP/IPv6 | |TCP/IPv6 | |TCP/IPv6 Match | * |Flowtable|------>| |----->|Outer Proto Match|=====> TLS TIR n * | | |Catch-all|\ | | * +=========+ +=========+| +=================+ * | * +------------------------+ * V * +=========+ +=========+ +=================+ * |VLAN ft: | |VxLAN | |VxLAN Main | * |CTAG/STAG|------>| VNI|----->|Inner Proto Match|=====> Inner TIR n * |VID/noVID|/ |Catch-all|\ | | * +=========+ +=========+| +=================+ * | * | * | * v * +=================+ * |Main | * |Outer Proto Match|=====> TIR n * | | * +=================+ * * The path through flow rules directs each packet into an appropriate TIR, * according to the: * - VLAN encapsulation * - Outer protocol * - Presence of inner protocol */ #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) enum { MLX5E_FULLMATCH = 0, MLX5E_ALLMULTI = 1, MLX5E_PROMISC = 2, }; enum { MLX5E_UC = 0, MLX5E_MC_IPV4 = 1, MLX5E_MC_IPV6 = 2, MLX5E_MC_OTHER = 3, }; enum { MLX5E_ACTION_NONE = 0, MLX5E_ACTION_ADD = 1, MLX5E_ACTION_DEL = 2, }; struct mlx5e_eth_addr_hash_node { LIST_ENTRY(mlx5e_eth_addr_hash_node) hlist; u8 action; u32 mpfs_index; struct mlx5e_eth_addr_info ai; }; static void mlx5e_del_all_vlan_rules(struct mlx5e_priv *); static inline int mlx5e_hash_eth_addr(const u8 * addr) { return (addr[5]); } static bool mlx5e_add_eth_addr_to_hash(struct mlx5e_eth_addr_hash_head *hash, struct mlx5e_eth_addr_hash_node *hn_new) { struct mlx5e_eth_addr_hash_node *hn; u32 ix = mlx5e_hash_eth_addr(hn_new->ai.addr); LIST_FOREACH(hn, &hash[ix], hlist) { if (bcmp(hn->ai.addr, hn_new->ai.addr, ETHER_ADDR_LEN) == 0) { if (hn->action == MLX5E_ACTION_DEL) hn->action = MLX5E_ACTION_NONE; free(hn_new, M_MLX5EN); return (false); } } LIST_INSERT_HEAD(&hash[ix], hn_new, hlist); return (true); } static void mlx5e_del_eth_addr_from_hash(struct mlx5e_eth_addr_hash_node *hn) { LIST_REMOVE(hn, hlist); free(hn, M_MLX5EN); } static void mlx5e_del_eth_addr_from_flow_table(struct mlx5e_priv *priv, struct mlx5e_eth_addr_info *ai) { if (ai->tt_vec & (1 << MLX5E_TT_IPV6_IPSEC_ESP)) mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV6_IPSEC_ESP]); if (ai->tt_vec & (1 << MLX5E_TT_IPV4_IPSEC_ESP)) mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV4_IPSEC_ESP]); if (ai->tt_vec & (1 << MLX5E_TT_IPV6_IPSEC_AH)) mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV6_IPSEC_AH]); if (ai->tt_vec & (1 << MLX5E_TT_IPV4_IPSEC_AH)) mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV4_IPSEC_AH]); if (ai->tt_vec & (1 << MLX5E_TT_IPV6_TCP)) mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV6_TCP]); if (ai->tt_vec & (1 << MLX5E_TT_IPV4_TCP)) mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV4_TCP]); if (ai->tt_vec & (1 << MLX5E_TT_IPV6_UDP)) mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV6_UDP]); if (ai->tt_vec & (1 << MLX5E_TT_IPV4_UDP)) mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV4_UDP]); if (ai->tt_vec & (1 << MLX5E_TT_IPV6)) mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV6]); if (ai->tt_vec & (1 << MLX5E_TT_IPV4)) mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV4]); if (ai->tt_vec & (1 << MLX5E_TT_ANY)) mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_ANY]); /* ensure the rules are not freed again */ ai->tt_vec = 0; } static int mlx5e_get_eth_addr_type(const u8 * addr) { if (ETHER_IS_MULTICAST(addr) == 0) return (MLX5E_UC); if ((addr[0] == 0x01) && (addr[1] == 0x00) && (addr[2] == 0x5e) && !(addr[3] & 0x80)) return (MLX5E_MC_IPV4); if ((addr[0] == 0x33) && (addr[1] == 0x33)) return (MLX5E_MC_IPV6); return (MLX5E_MC_OTHER); } static u32 mlx5e_get_tt_vec(struct mlx5e_eth_addr_info *ai, int type) { int eth_addr_type; u32 ret; switch (type) { case MLX5E_FULLMATCH: eth_addr_type = mlx5e_get_eth_addr_type(ai->addr); switch (eth_addr_type) { case MLX5E_UC: ret = (1 << MLX5E_TT_IPV4_TCP) | (1 << MLX5E_TT_IPV6_TCP) | (1 << MLX5E_TT_IPV4_UDP) | (1 << MLX5E_TT_IPV6_UDP) | (1 << MLX5E_TT_IPV4) | (1 << MLX5E_TT_IPV6) | (1 << MLX5E_TT_ANY) | 0; break; case MLX5E_MC_IPV4: ret = (1 << MLX5E_TT_IPV4_UDP) | (1 << MLX5E_TT_IPV4) | 0; break; case MLX5E_MC_IPV6: ret = (1 << MLX5E_TT_IPV6_UDP) | (1 << MLX5E_TT_IPV6) | 0; break; default: ret = (1 << MLX5E_TT_ANY) | 0; break; } break; case MLX5E_ALLMULTI: ret = (1 << MLX5E_TT_IPV4_UDP) | (1 << MLX5E_TT_IPV6_UDP) | (1 << MLX5E_TT_IPV4) | (1 << MLX5E_TT_IPV6) | (1 << MLX5E_TT_ANY) | 0; break; default: /* MLX5E_PROMISC */ ret = (1 << MLX5E_TT_IPV4_TCP) | (1 << MLX5E_TT_IPV6_TCP) | (1 << MLX5E_TT_IPV4_UDP) | (1 << MLX5E_TT_IPV6_UDP) | (1 << MLX5E_TT_IPV4) | (1 << MLX5E_TT_IPV6) | (1 << MLX5E_TT_ANY) | 0; break; } return (ret); } static int mlx5e_add_eth_addr_rule_sub(struct mlx5e_priv *priv, struct mlx5e_eth_addr_info *ai, int type, u32 *mc, u32 *mv) { struct mlx5_flow_destination dest = {}; u8 mc_enable = 0; struct mlx5_flow_rule **rule_p; struct mlx5_flow_table *ft = priv->fts.main.t; u8 *mc_dmac = MLX5_ADDR_OF(fte_match_param, mc, outer_headers.dmac_47_16); u8 *mv_dmac = MLX5_ADDR_OF(fte_match_param, mv, outer_headers.dmac_47_16); u32 *tirn = priv->tirn; u32 tt_vec; int err = 0; dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; switch (type) { case MLX5E_FULLMATCH: mc_enable = MLX5_MATCH_OUTER_HEADERS; memset(mc_dmac, 0xff, ETH_ALEN); ether_addr_copy(mv_dmac, ai->addr); break; case MLX5E_ALLMULTI: mc_enable = MLX5_MATCH_OUTER_HEADERS; mc_dmac[0] = 0x01; mv_dmac[0] = 0x01; break; case MLX5E_PROMISC: break; default: break; } tt_vec = mlx5e_get_tt_vec(ai, type); if (tt_vec & BIT(MLX5E_TT_ANY)) { rule_p = &ai->ft_rule[MLX5E_TT_ANY]; dest.tir_num = tirn[MLX5E_TT_ANY]; *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; ai->tt_vec |= BIT(MLX5E_TT_ANY); } mc_enable = MLX5_MATCH_OUTER_HEADERS; MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); if (tt_vec & BIT(MLX5E_TT_IPV4)) { rule_p = &ai->ft_rule[MLX5E_TT_IPV4]; dest.tir_num = tirn[MLX5E_TT_IPV4]; MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETHERTYPE_IP); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; ai->tt_vec |= BIT(MLX5E_TT_IPV4); } if (tt_vec & BIT(MLX5E_TT_IPV6)) { rule_p = &ai->ft_rule[MLX5E_TT_IPV6]; dest.tir_num = tirn[MLX5E_TT_IPV6]; MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETHERTYPE_IPV6); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; ai->tt_vec |= BIT(MLX5E_TT_IPV6); } MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); MLX5_SET(fte_match_param, mv, outer_headers.ip_protocol, IPPROTO_UDP); if (tt_vec & BIT(MLX5E_TT_IPV4_UDP)) { rule_p = &ai->ft_rule[MLX5E_TT_IPV4_UDP]; dest.tir_num = tirn[MLX5E_TT_IPV4_UDP]; MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETHERTYPE_IP); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; ai->tt_vec |= BIT(MLX5E_TT_IPV4_UDP); } if (tt_vec & BIT(MLX5E_TT_IPV6_UDP)) { rule_p = &ai->ft_rule[MLX5E_TT_IPV6_UDP]; dest.tir_num = tirn[MLX5E_TT_IPV6_UDP]; MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETHERTYPE_IPV6); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; ai->tt_vec |= BIT(MLX5E_TT_IPV6_UDP); } MLX5_SET(fte_match_param, mv, outer_headers.ip_protocol, IPPROTO_TCP); if (tt_vec & BIT(MLX5E_TT_IPV4_TCP)) { rule_p = &ai->ft_rule[MLX5E_TT_IPV4_TCP]; dest.tir_num = tirn[MLX5E_TT_IPV4_TCP]; MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETHERTYPE_IP); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; ai->tt_vec |= BIT(MLX5E_TT_IPV4_TCP); } if (tt_vec & BIT(MLX5E_TT_IPV6_TCP)) { rule_p = &ai->ft_rule[MLX5E_TT_IPV6_TCP]; dest.tir_num = tirn[MLX5E_TT_IPV6_TCP]; MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETHERTYPE_IPV6); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; ai->tt_vec |= BIT(MLX5E_TT_IPV6_TCP); } MLX5_SET(fte_match_param, mv, outer_headers.ip_protocol, IPPROTO_AH); if (tt_vec & BIT(MLX5E_TT_IPV4_IPSEC_AH)) { rule_p = &ai->ft_rule[MLX5E_TT_IPV4_IPSEC_AH]; dest.tir_num = tirn[MLX5E_TT_IPV4_IPSEC_AH]; MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETHERTYPE_IP); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; ai->tt_vec |= BIT(MLX5E_TT_IPV4_IPSEC_AH); } if (tt_vec & BIT(MLX5E_TT_IPV6_IPSEC_AH)) { rule_p = &ai->ft_rule[MLX5E_TT_IPV6_IPSEC_AH]; dest.tir_num = tirn[MLX5E_TT_IPV6_IPSEC_AH]; MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETHERTYPE_IPV6); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; ai->tt_vec |= BIT(MLX5E_TT_IPV6_IPSEC_AH); } MLX5_SET(fte_match_param, mv, outer_headers.ip_protocol, IPPROTO_ESP); if (tt_vec & BIT(MLX5E_TT_IPV4_IPSEC_ESP)) { rule_p = &ai->ft_rule[MLX5E_TT_IPV4_IPSEC_ESP]; dest.tir_num = tirn[MLX5E_TT_IPV4_IPSEC_ESP]; MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETHERTYPE_IP); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; ai->tt_vec |= BIT(MLX5E_TT_IPV4_IPSEC_ESP); } if (tt_vec & BIT(MLX5E_TT_IPV6_IPSEC_ESP)) { rule_p = &ai->ft_rule[MLX5E_TT_IPV6_IPSEC_ESP]; dest.tir_num = tirn[MLX5E_TT_IPV6_IPSEC_ESP]; MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETHERTYPE_IPV6); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; ai->tt_vec |= BIT(MLX5E_TT_IPV6_IPSEC_ESP); } return 0; err_del_ai: err = PTR_ERR(*rule_p); *rule_p = NULL; mlx5e_del_eth_addr_from_flow_table(priv, ai); return err; } static int mlx5e_add_eth_addr_rule(struct mlx5e_priv *priv, struct mlx5e_eth_addr_info *ai, int type) { u32 *match_criteria; u32 *match_value; int err = 0; match_value = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); match_criteria = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); if (!match_value || !match_criteria) { mlx5_en_err(priv->ifp, "alloc failed\n"); err = -ENOMEM; goto add_eth_addr_rule_out; } err = mlx5e_add_eth_addr_rule_sub(priv, ai, type, match_criteria, match_value); add_eth_addr_rule_out: kvfree(match_criteria); kvfree(match_value); return (err); } static void mlx5e_del_main_vxlan_rules(struct mlx5e_priv *priv) { struct mlx5_flow_rule **ra = priv->fts.main_vxlan_rule, **r; r = &ra[MLX5E_TT_IPV6_IPSEC_ESP]; if (*r != NULL) { mlx5_del_flow_rule(*r); *r = NULL; } r = &ra[MLX5E_TT_IPV4_IPSEC_ESP]; if (*r != NULL) { mlx5_del_flow_rule(*r); *r = NULL; } r = &ra[MLX5E_TT_IPV6_IPSEC_AH]; if (*r != NULL) { mlx5_del_flow_rule(*r); *r = NULL; } r = &ra[MLX5E_TT_IPV4_IPSEC_AH]; if (*r != NULL) { mlx5_del_flow_rule(*r); *r = NULL; } r = &ra[MLX5E_TT_IPV6_TCP]; if (*r != NULL) { mlx5_del_flow_rule(*r); *r = NULL; } r = &ra[MLX5E_TT_IPV4_TCP]; if (*r != NULL) { mlx5_del_flow_rule(*r); *r = NULL; } r = &ra[MLX5E_TT_IPV6_UDP]; if (*r != NULL) { mlx5_del_flow_rule(*r); *r = NULL; } r = &ra[MLX5E_TT_IPV4_UDP]; if (*r != NULL) { mlx5_del_flow_rule(*r); *r = NULL; } r = &ra[MLX5E_TT_IPV6]; if (*r != NULL) { mlx5_del_flow_rule(*r); *r = NULL; } r = &ra[MLX5E_TT_IPV4]; if (*r != NULL) { mlx5_del_flow_rule(*r); *r = NULL; } r = &ra[MLX5E_TT_ANY]; if (*r != NULL) { mlx5_del_flow_rule(*r); *r = NULL; } } static int mlx5e_add_main_vxlan_rules_sub(struct mlx5e_priv *priv, u32 *mc, u32 *mv) { struct mlx5_flow_destination dest = {}; u8 mc_enable = 0; struct mlx5_flow_rule **rule_p; struct mlx5_flow_table *ft = priv->fts.main_vxlan.t; u32 *tirn = priv->tirn_inner_vxlan; int err = 0; dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; mc_enable = MLX5_MATCH_INNER_HEADERS; MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ethertype); rule_p = &priv->fts.main_vxlan_rule[MLX5E_TT_IPV4]; dest.tir_num = tirn[MLX5E_TT_IPV4]; MLX5_SET(fte_match_param, mv, inner_headers.ethertype, ETHERTYPE_IP); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; rule_p = &priv->fts.main_vxlan_rule[MLX5E_TT_IPV6]; dest.tir_num = tirn[MLX5E_TT_IPV6]; MLX5_SET(fte_match_param, mv, inner_headers.ethertype, ETHERTYPE_IPV6); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_protocol); MLX5_SET(fte_match_param, mv, inner_headers.ip_protocol, IPPROTO_UDP); rule_p = &priv->fts.main_vxlan_rule[MLX5E_TT_IPV4_UDP]; dest.tir_num = tirn[MLX5E_TT_IPV4_UDP]; MLX5_SET(fte_match_param, mv, inner_headers.ethertype, ETHERTYPE_IP); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; rule_p = &priv->fts.main_vxlan_rule[MLX5E_TT_IPV6_UDP]; dest.tir_num = tirn[MLX5E_TT_IPV6_UDP]; MLX5_SET(fte_match_param, mv, inner_headers.ethertype, ETHERTYPE_IPV6); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; MLX5_SET(fte_match_param, mv, inner_headers.ip_protocol, IPPROTO_TCP); rule_p = &priv->fts.main_vxlan_rule[MLX5E_TT_IPV4_TCP]; dest.tir_num = tirn[MLX5E_TT_IPV4_TCP]; MLX5_SET(fte_match_param, mv, inner_headers.ethertype, ETHERTYPE_IP); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; rule_p = &priv->fts.main_vxlan_rule[MLX5E_TT_IPV6_TCP]; dest.tir_num = tirn[MLX5E_TT_IPV6_TCP]; MLX5_SET(fte_match_param, mv, inner_headers.ethertype, ETHERTYPE_IPV6); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; MLX5_SET(fte_match_param, mv, inner_headers.ip_protocol, IPPROTO_AH); rule_p = &priv->fts.main_vxlan_rule[MLX5E_TT_IPV4_IPSEC_AH]; dest.tir_num = tirn[MLX5E_TT_IPV4_IPSEC_AH]; MLX5_SET(fte_match_param, mv, inner_headers.ethertype, ETHERTYPE_IP); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; rule_p = &priv->fts.main_vxlan_rule[MLX5E_TT_IPV6_IPSEC_AH]; dest.tir_num = tirn[MLX5E_TT_IPV6_IPSEC_AH]; MLX5_SET(fte_match_param, mv, inner_headers.ethertype, ETHERTYPE_IPV6); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; MLX5_SET(fte_match_param, mv, inner_headers.ip_protocol, IPPROTO_ESP); rule_p = &priv->fts.main_vxlan_rule[MLX5E_TT_IPV4_IPSEC_ESP]; dest.tir_num = tirn[MLX5E_TT_IPV4_IPSEC_ESP]; MLX5_SET(fte_match_param, mv, inner_headers.ethertype, ETHERTYPE_IP); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; rule_p = &priv->fts.main_vxlan_rule[MLX5E_TT_IPV6_IPSEC_ESP]; dest.tir_num = tirn[MLX5E_TT_IPV6_IPSEC_ESP]; MLX5_SET(fte_match_param, mv, inner_headers.ethertype, ETHERTYPE_IPV6); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; mc_enable = 0; memset(mv, 0, MLX5_ST_SZ_BYTES(fte_match_param)); memset(mc, 0, MLX5_ST_SZ_BYTES(fte_match_param)); rule_p = &priv->fts.main_vxlan_rule[MLX5E_TT_ANY]; dest.tir_num = tirn[MLX5E_TT_ANY]; *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; return (0); err_del_ai: err = PTR_ERR(*rule_p); *rule_p = NULL; mlx5e_del_main_vxlan_rules(priv); return (err); } static int mlx5e_add_main_vxlan_rules(struct mlx5e_priv *priv) { u32 *match_criteria; u32 *match_value; int err = 0; match_value = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); match_criteria = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); if (match_value == NULL || match_criteria == NULL) { mlx5_en_err(priv->ifp, "alloc failed\n"); err = -ENOMEM; goto add_main_vxlan_rules_out; } err = mlx5e_add_main_vxlan_rules_sub(priv, match_criteria, match_value); add_main_vxlan_rules_out: kvfree(match_criteria); kvfree(match_value); return (err); } static int mlx5e_vport_context_update_vlans(struct mlx5e_priv *priv) { - struct ifnet *ifp = priv->ifp; + if_t ifp = priv->ifp; int max_list_size; int list_size; u16 *vlans; int vlan; int err; int i; list_size = 0; for_each_set_bit(vlan, priv->vlan.active_vlans, VLAN_N_VID) list_size++; max_list_size = 1 << MLX5_CAP_GEN(priv->mdev, log_max_vlan_list); if (list_size > max_list_size) { mlx5_en_err(ifp, "ifnet vlans list size (%d) > (%d) max vport list size, some vlans will be dropped\n", list_size, max_list_size); list_size = max_list_size; } vlans = kcalloc(list_size, sizeof(*vlans), GFP_KERNEL); if (!vlans) return -ENOMEM; i = 0; for_each_set_bit(vlan, priv->vlan.active_vlans, VLAN_N_VID) { if (i >= list_size) break; vlans[i++] = vlan; } err = mlx5_modify_nic_vport_vlans(priv->mdev, vlans, list_size); if (err) mlx5_en_err(ifp, "Failed to modify vport vlans list err(%d)\n", err); kfree(vlans); return err; } enum mlx5e_vlan_rule_type { MLX5E_VLAN_RULE_TYPE_UNTAGGED, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID, MLX5E_VLAN_RULE_TYPE_MATCH_VID, }; static int mlx5e_add_vlan_rule_sub(struct mlx5e_priv *priv, enum mlx5e_vlan_rule_type rule_type, u16 vid, u32 *mc, u32 *mv) { struct mlx5_flow_table *ft = priv->fts.vlan.t; struct mlx5_flow_destination dest = {}; u8 mc_enable = 0; struct mlx5_flow_rule **rule_p; int err = 0; dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; dest.ft = priv->fts.vxlan.t; mc_enable = MLX5_MATCH_OUTER_HEADERS; switch (rule_type) { case MLX5E_VLAN_RULE_TYPE_UNTAGGED: rule_p = &priv->vlan.untagged_ft_rule; MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag); break; case MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID: rule_p = &priv->vlan.any_cvlan_ft_rule; MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag); MLX5_SET(fte_match_param, mv, outer_headers.cvlan_tag, 1); break; case MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID: rule_p = &priv->vlan.any_svlan_ft_rule; MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.svlan_tag); MLX5_SET(fte_match_param, mv, outer_headers.svlan_tag, 1); break; default: /* MLX5E_VLAN_RULE_TYPE_MATCH_VID */ rule_p = &priv->vlan.active_vlans_ft_rule[vid]; MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag); MLX5_SET(fte_match_param, mv, outer_headers.cvlan_tag, 1); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.first_vid); MLX5_SET(fte_match_param, mv, outer_headers.first_vid, vid); mlx5e_vport_context_update_vlans(priv); break; } *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR(*rule_p)) { err = PTR_ERR(*rule_p); *rule_p = NULL; mlx5_en_err(priv->ifp, "add rule failed\n"); } return (err); } static int mlx5e_add_vlan_rule(struct mlx5e_priv *priv, enum mlx5e_vlan_rule_type rule_type, u16 vid) { u32 *match_criteria; u32 *match_value; int err = 0; match_value = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); match_criteria = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); if (!match_value || !match_criteria) { mlx5_en_err(priv->ifp, "alloc failed\n"); err = -ENOMEM; goto add_vlan_rule_out; } err = mlx5e_add_vlan_rule_sub(priv, rule_type, vid, match_criteria, match_value); add_vlan_rule_out: kvfree(match_criteria); kvfree(match_value); return (err); } static void mlx5e_del_vlan_rule(struct mlx5e_priv *priv, enum mlx5e_vlan_rule_type rule_type, u16 vid) { switch (rule_type) { case MLX5E_VLAN_RULE_TYPE_UNTAGGED: if (priv->vlan.untagged_ft_rule) { mlx5_del_flow_rule(priv->vlan.untagged_ft_rule); priv->vlan.untagged_ft_rule = NULL; } break; case MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID: if (priv->vlan.any_cvlan_ft_rule) { mlx5_del_flow_rule(priv->vlan.any_cvlan_ft_rule); priv->vlan.any_cvlan_ft_rule = NULL; } break; case MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID: if (priv->vlan.any_svlan_ft_rule) { mlx5_del_flow_rule(priv->vlan.any_svlan_ft_rule); priv->vlan.any_svlan_ft_rule = NULL; } break; case MLX5E_VLAN_RULE_TYPE_MATCH_VID: if (priv->vlan.active_vlans_ft_rule[vid]) { mlx5_del_flow_rule(priv->vlan.active_vlans_ft_rule[vid]); priv->vlan.active_vlans_ft_rule[vid] = NULL; } mlx5e_vport_context_update_vlans(priv); break; default: break; } } static void mlx5e_del_any_vid_rules(struct mlx5e_priv *priv) { mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID, 0); } static int mlx5e_add_any_vid_rules(struct mlx5e_priv *priv) { int err; err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); if (err) return (err); err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID, 0); if (err) mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); return (err); } void mlx5e_enable_vlan_filter(struct mlx5e_priv *priv) { if (priv->vlan.filter_disabled) { priv->vlan.filter_disabled = false; - if (priv->ifp->if_flags & IFF_PROMISC) + if (if_getflags(priv->ifp) & IFF_PROMISC) return; if (test_bit(MLX5E_STATE_FLOW_RULES_READY, &priv->state)) mlx5e_del_any_vid_rules(priv); } } void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv) { if (!priv->vlan.filter_disabled) { priv->vlan.filter_disabled = true; - if (priv->ifp->if_flags & IFF_PROMISC) + if (if_getflags(priv->ifp) & IFF_PROMISC) return; if (test_bit(MLX5E_STATE_FLOW_RULES_READY, &priv->state)) mlx5e_add_any_vid_rules(priv); } } void -mlx5e_vlan_rx_add_vid(void *arg, struct ifnet *ifp, u16 vid) +mlx5e_vlan_rx_add_vid(void *arg, if_t ifp, u16 vid) { struct mlx5e_priv *priv = arg; if (ifp != priv->ifp) return; PRIV_LOCK(priv); if (!test_and_set_bit(vid, priv->vlan.active_vlans) && test_bit(MLX5E_STATE_FLOW_RULES_READY, &priv->state)) mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_VID, vid); PRIV_UNLOCK(priv); } void -mlx5e_vlan_rx_kill_vid(void *arg, struct ifnet *ifp, u16 vid) +mlx5e_vlan_rx_kill_vid(void *arg, if_t ifp, u16 vid) { struct mlx5e_priv *priv = arg; if (ifp != priv->ifp) return; PRIV_LOCK(priv); clear_bit(vid, priv->vlan.active_vlans); if (test_bit(MLX5E_STATE_FLOW_RULES_READY, &priv->state)) mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_VID, vid); PRIV_UNLOCK(priv); } static int mlx5e_add_all_vlan_rules(struct mlx5e_priv *priv) { int err; int i; set_bit(0, priv->vlan.active_vlans); for_each_set_bit(i, priv->vlan.active_vlans, VLAN_N_VID) { err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_VID, i); if (err) goto error; } err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0); if (err) goto error; if (priv->vlan.filter_disabled) { err = mlx5e_add_any_vid_rules(priv); if (err) goto error; } return (0); error: mlx5e_del_all_vlan_rules(priv); return (err); } static void mlx5e_del_all_vlan_rules(struct mlx5e_priv *priv) { int i; if (priv->vlan.filter_disabled) mlx5e_del_any_vid_rules(priv); mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0); for_each_set_bit(i, priv->vlan.active_vlans, VLAN_N_VID) mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_VID, i); clear_bit(0, priv->vlan.active_vlans); } #define mlx5e_for_each_hash_node(hn, tmp, hash, i) \ for (i = 0; i < MLX5E_ETH_ADDR_HASH_SIZE; i++) \ LIST_FOREACH_SAFE(hn, &(hash)[i], hlist, tmp) static void mlx5e_execute_action(struct mlx5e_priv *priv, struct mlx5e_eth_addr_hash_node *hn) { switch (hn->action) { case MLX5E_ACTION_ADD: mlx5e_add_eth_addr_rule(priv, &hn->ai, MLX5E_FULLMATCH); hn->action = MLX5E_ACTION_NONE; break; case MLX5E_ACTION_DEL: mlx5e_del_eth_addr_from_flow_table(priv, &hn->ai); if (hn->mpfs_index != -1U) mlx5_mpfs_del_mac(priv->mdev, hn->mpfs_index); mlx5e_del_eth_addr_from_hash(hn); break; default: break; } } static struct mlx5e_eth_addr_hash_node * mlx5e_move_hn(struct mlx5e_eth_addr_hash_head *fh, struct mlx5e_eth_addr_hash_head *uh) { struct mlx5e_eth_addr_hash_node *hn; hn = LIST_FIRST(fh); if (hn != NULL) { LIST_REMOVE(hn, hlist); LIST_INSERT_HEAD(uh, hn, hlist); } return (hn); } static struct mlx5e_eth_addr_hash_node * mlx5e_remove_hn(struct mlx5e_eth_addr_hash_head *fh) { struct mlx5e_eth_addr_hash_node *hn; hn = LIST_FIRST(fh); if (hn != NULL) LIST_REMOVE(hn, hlist); return (hn); } struct mlx5e_copy_addr_ctx { struct mlx5e_eth_addr_hash_head *free; struct mlx5e_eth_addr_hash_head *fill; bool success; }; static u_int mlx5e_copy_addr(void *arg, struct sockaddr_dl *sdl, u_int cnt) { struct mlx5e_copy_addr_ctx *ctx = arg; struct mlx5e_eth_addr_hash_node *hn; hn = mlx5e_move_hn(ctx->free, ctx->fill); if (hn == NULL) { ctx->success = false; return (0); } ether_addr_copy(hn->ai.addr, LLADDR(sdl)); return (1); } static void mlx5e_sync_ifp_addr(struct mlx5e_priv *priv) { struct mlx5e_copy_addr_ctx ctx; struct mlx5e_eth_addr_hash_head head_free; struct mlx5e_eth_addr_hash_head head_uc; struct mlx5e_eth_addr_hash_head head_mc; struct mlx5e_eth_addr_hash_node *hn; - struct ifnet *ifp = priv->ifp; + if_t ifp = priv->ifp; size_t x; size_t num; PRIV_ASSERT_LOCKED(priv); retry: LIST_INIT(&head_free); LIST_INIT(&head_uc); LIST_INIT(&head_mc); num = 1 + if_lladdr_count(ifp) + if_llmaddr_count(ifp); /* allocate place holders */ for (x = 0; x != num; x++) { hn = malloc(sizeof(*hn), M_MLX5EN, M_WAITOK | M_ZERO); hn->action = MLX5E_ACTION_ADD; hn->mpfs_index = -1U; LIST_INSERT_HEAD(&head_free, hn, hlist); } hn = mlx5e_move_hn(&head_free, &head_uc); MPASS(hn != NULL); - ether_addr_copy(hn->ai.addr, - LLADDR((struct sockaddr_dl *)(ifp->if_addr->ifa_addr))); + ether_addr_copy(hn->ai.addr, if_getlladdr(ifp)); ctx.free = &head_free; ctx.fill = &head_uc; ctx.success = true; if_foreach_lladdr(ifp, mlx5e_copy_addr, &ctx); if (ctx.success == false) goto cleanup; ctx.fill = &head_mc; if_foreach_llmaddr(ifp, mlx5e_copy_addr, &ctx); if (ctx.success == false) goto cleanup; /* insert L2 unicast addresses into hash list */ while ((hn = mlx5e_remove_hn(&head_uc)) != NULL) { if (mlx5e_add_eth_addr_to_hash(priv->eth_addr.if_uc, hn) == 0) continue; if (hn->mpfs_index == -1U) mlx5_mpfs_add_mac(priv->mdev, &hn->mpfs_index, hn->ai.addr, 0, 0); } /* insert L2 multicast addresses into hash list */ while ((hn = mlx5e_remove_hn(&head_mc)) != NULL) { if (mlx5e_add_eth_addr_to_hash(priv->eth_addr.if_mc, hn) == 0) continue; } cleanup: while ((hn = mlx5e_remove_hn(&head_uc)) != NULL) free(hn, M_MLX5EN); while ((hn = mlx5e_remove_hn(&head_mc)) != NULL) free(hn, M_MLX5EN); while ((hn = mlx5e_remove_hn(&head_free)) != NULL) free(hn, M_MLX5EN); if (ctx.success == false) goto retry; } static void mlx5e_fill_addr_array(struct mlx5e_priv *priv, int list_type, u8 addr_array[][ETH_ALEN], int size) { bool is_uc = (list_type == MLX5_NIC_VPORT_LIST_TYPE_UC); - struct ifnet *ifp = priv->ifp; + if_t ifp = priv->ifp; struct mlx5e_eth_addr_hash_node *hn; struct mlx5e_eth_addr_hash_head *addr_list; struct mlx5e_eth_addr_hash_node *tmp; int i = 0; int hi; addr_list = is_uc ? priv->eth_addr.if_uc : priv->eth_addr.if_mc; if (is_uc) /* Make sure our own address is pushed first */ - ether_addr_copy(addr_array[i++], IF_LLADDR(ifp)); + ether_addr_copy(addr_array[i++], if_getlladdr(ifp)); else if (priv->eth_addr.broadcast_enabled) - ether_addr_copy(addr_array[i++], ifp->if_broadcastaddr); + ether_addr_copy(addr_array[i++], if_getbroadcastaddr(ifp)); mlx5e_for_each_hash_node(hn, tmp, addr_list, hi) { - if (ether_addr_equal(IF_LLADDR(ifp), hn->ai.addr)) + if (ether_addr_equal(if_getlladdr(ifp), hn->ai.addr)) continue; if (i >= size) break; ether_addr_copy(addr_array[i++], hn->ai.addr); } } static void mlx5e_vport_context_update_addr_list(struct mlx5e_priv *priv, int list_type) { bool is_uc = (list_type == MLX5_NIC_VPORT_LIST_TYPE_UC); struct mlx5e_eth_addr_hash_node *hn; u8 (*addr_array)[ETH_ALEN] = NULL; struct mlx5e_eth_addr_hash_head *addr_list; struct mlx5e_eth_addr_hash_node *tmp; int max_size; int size; int err; int hi; size = is_uc ? 0 : (priv->eth_addr.broadcast_enabled ? 1 : 0); max_size = is_uc ? 1 << MLX5_CAP_GEN(priv->mdev, log_max_current_uc_list) : 1 << MLX5_CAP_GEN(priv->mdev, log_max_current_mc_list); addr_list = is_uc ? priv->eth_addr.if_uc : priv->eth_addr.if_mc; mlx5e_for_each_hash_node(hn, tmp, addr_list, hi) size++; if (size > max_size) { mlx5_en_err(priv->ifp, "ifp %s list size (%d) > (%d) max vport list size, some addresses will be dropped\n", is_uc ? "UC" : "MC", size, max_size); size = max_size; } if (size) { addr_array = kcalloc(size, ETH_ALEN, GFP_KERNEL); if (!addr_array) { err = -ENOMEM; goto out; } mlx5e_fill_addr_array(priv, list_type, addr_array, size); } err = mlx5_modify_nic_vport_mac_list(priv->mdev, list_type, addr_array, size); out: if (err) mlx5_en_err(priv->ifp, "Failed to modify vport %s list err(%d)\n", is_uc ? "UC" : "MC", err); kfree(addr_array); } static void mlx5e_vport_context_update(struct mlx5e_priv *priv) { struct mlx5e_eth_addr_db *ea = &priv->eth_addr; mlx5e_vport_context_update_addr_list(priv, MLX5_NIC_VPORT_LIST_TYPE_UC); mlx5e_vport_context_update_addr_list(priv, MLX5_NIC_VPORT_LIST_TYPE_MC); mlx5_modify_nic_vport_promisc(priv->mdev, 0, ea->allmulti_enabled, ea->promisc_enabled); } static void mlx5e_apply_ifp_addr(struct mlx5e_priv *priv) { struct mlx5e_eth_addr_hash_node *hn; struct mlx5e_eth_addr_hash_node *tmp; int i; mlx5e_for_each_hash_node(hn, tmp, priv->eth_addr.if_uc, i) mlx5e_execute_action(priv, hn); mlx5e_for_each_hash_node(hn, tmp, priv->eth_addr.if_mc, i) mlx5e_execute_action(priv, hn); } static void mlx5e_handle_ifp_addr(struct mlx5e_priv *priv, bool rx_mode_enable) { struct mlx5e_eth_addr_hash_node *hn; struct mlx5e_eth_addr_hash_node *tmp; int i; mlx5e_for_each_hash_node(hn, tmp, priv->eth_addr.if_uc, i) hn->action = MLX5E_ACTION_DEL; mlx5e_for_each_hash_node(hn, tmp, priv->eth_addr.if_mc, i) hn->action = MLX5E_ACTION_DEL; if (rx_mode_enable) mlx5e_sync_ifp_addr(priv); mlx5e_apply_ifp_addr(priv); } static void mlx5e_set_rx_mode_core(struct mlx5e_priv *priv, bool rx_mode_enable) { struct mlx5e_eth_addr_db *ea = &priv->eth_addr; - struct ifnet *ndev = priv->ifp; + if_t ndev = priv->ifp; + int ndev_flags = if_getflags(ndev); - bool promisc_enabled = rx_mode_enable && (ndev->if_flags & IFF_PROMISC); - bool allmulti_enabled = rx_mode_enable && (ndev->if_flags & IFF_ALLMULTI); + bool promisc_enabled = rx_mode_enable && (ndev_flags & IFF_PROMISC); + bool allmulti_enabled = rx_mode_enable && (ndev_flags & IFF_ALLMULTI); bool broadcast_enabled = rx_mode_enable; bool enable_promisc = !ea->promisc_enabled && promisc_enabled; bool disable_promisc = ea->promisc_enabled && !promisc_enabled; bool enable_allmulti = !ea->allmulti_enabled && allmulti_enabled; bool disable_allmulti = ea->allmulti_enabled && !allmulti_enabled; bool enable_broadcast = !ea->broadcast_enabled && broadcast_enabled; bool disable_broadcast = ea->broadcast_enabled && !broadcast_enabled; /* update broadcast address */ ether_addr_copy(priv->eth_addr.broadcast.addr, - priv->ifp->if_broadcastaddr); + if_getbroadcastaddr(priv->ifp)); if (enable_promisc) { mlx5e_add_eth_addr_rule(priv, &ea->promisc, MLX5E_PROMISC); if (!priv->vlan.filter_disabled) mlx5e_add_any_vid_rules(priv); } if (enable_allmulti) mlx5e_add_eth_addr_rule(priv, &ea->allmulti, MLX5E_ALLMULTI); if (enable_broadcast) mlx5e_add_eth_addr_rule(priv, &ea->broadcast, MLX5E_FULLMATCH); mlx5e_handle_ifp_addr(priv, rx_mode_enable); if (disable_broadcast) mlx5e_del_eth_addr_from_flow_table(priv, &ea->broadcast); if (disable_allmulti) mlx5e_del_eth_addr_from_flow_table(priv, &ea->allmulti); if (disable_promisc) { if (!priv->vlan.filter_disabled) mlx5e_del_any_vid_rules(priv); mlx5e_del_eth_addr_from_flow_table(priv, &ea->promisc); } ea->promisc_enabled = promisc_enabled; ea->allmulti_enabled = allmulti_enabled; ea->broadcast_enabled = broadcast_enabled; mlx5e_vport_context_update(priv); } void mlx5e_set_rx_mode_work(struct work_struct *work) { struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, set_rx_mode_work); PRIV_LOCK(priv); if (test_bit(MLX5E_STATE_FLOW_RULES_READY, &priv->state)) mlx5e_set_rx_mode_core(priv, true); PRIV_UNLOCK(priv); } static void mlx5e_destroy_groups(struct mlx5e_flow_table *ft) { int i; for (i = ft->num_groups - 1; i >= 0; i--) { if (!IS_ERR_OR_NULL(ft->g[i])) mlx5_destroy_flow_group(ft->g[i]); ft->g[i] = NULL; } ft->num_groups = 0; } static void mlx5e_destroy_flow_table(struct mlx5e_flow_table *ft) { mlx5e_destroy_groups(ft); kfree(ft->g); mlx5_destroy_flow_table(ft->t); ft->t = NULL; } #define MLX5E_NUM_MAIN_GROUPS 10 #define MLX5E_MAIN_GROUP0_SIZE BIT(4) #define MLX5E_MAIN_GROUP1_SIZE BIT(3) #define MLX5E_MAIN_GROUP2_SIZE BIT(1) #define MLX5E_MAIN_GROUP3_SIZE BIT(0) #define MLX5E_MAIN_GROUP4_SIZE BIT(14) #define MLX5E_MAIN_GROUP5_SIZE BIT(13) #define MLX5E_MAIN_GROUP6_SIZE BIT(11) #define MLX5E_MAIN_GROUP7_SIZE BIT(2) #define MLX5E_MAIN_GROUP8_SIZE BIT(1) #define MLX5E_MAIN_GROUP9_SIZE BIT(0) #define MLX5E_MAIN_TABLE_SIZE (MLX5E_MAIN_GROUP0_SIZE +\ MLX5E_MAIN_GROUP1_SIZE +\ MLX5E_MAIN_GROUP2_SIZE +\ MLX5E_MAIN_GROUP3_SIZE +\ MLX5E_MAIN_GROUP4_SIZE +\ MLX5E_MAIN_GROUP5_SIZE +\ MLX5E_MAIN_GROUP6_SIZE +\ MLX5E_MAIN_GROUP7_SIZE +\ MLX5E_MAIN_GROUP8_SIZE +\ MLX5E_MAIN_GROUP9_SIZE +\ 0) static int mlx5e_create_main_groups_sub(struct mlx5e_flow_table *ft, u32 *in, int inlen) { u8 *mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); u8 *dmac = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria.outer_headers.dmac_47_16); int err; int ix = 0; /* Tunnel rules need to be first in this list of groups */ /* Start tunnel rules */ memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.udp_dport); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_MAIN_GROUP0_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; /* End Tunnel Rules */ memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_MAIN_GROUP1_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_MAIN_GROUP2_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; memset(in, 0, inlen); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_MAIN_GROUP3_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); memset(dmac, 0xff, ETH_ALEN); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_MAIN_GROUP4_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); memset(dmac, 0xff, ETH_ALEN); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_MAIN_GROUP5_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); memset(dmac, 0xff, ETH_ALEN); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_MAIN_GROUP6_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); dmac[0] = 0x01; MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_MAIN_GROUP7_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); dmac[0] = 0x01; MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_MAIN_GROUP8_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); dmac[0] = 0x01; MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_MAIN_GROUP9_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; return (0); err_destory_groups: err = PTR_ERR(ft->g[ft->num_groups]); ft->g[ft->num_groups] = NULL; mlx5e_destroy_groups(ft); return (err); } static int mlx5e_create_main_groups(struct mlx5e_flow_table *ft) { u32 *in; int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); int err; in = mlx5_vzalloc(inlen); if (!in) return (-ENOMEM); err = mlx5e_create_main_groups_sub(ft, in, inlen); kvfree(in); return (err); } #define MLX5E_MAIN_VXLAN_GROUP0_SIZE BIT(3) #define MLX5E_MAIN_VXLAN_GROUP1_SIZE BIT(3) #define MLX5E_MAIN_VXLAN_GROUP2_SIZE BIT(0) static int mlx5e_create_main_vxlan_groups_sub(struct mlx5e_flow_table *ft, u32 *in, int inlen) { u8 *mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); int err; int ix = 0; memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_INNER_HEADERS); MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ethertype); MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_protocol); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_MAIN_VXLAN_GROUP0_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_INNER_HEADERS); MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ethertype); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_MAIN_VXLAN_GROUP1_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; memset(in, 0, inlen); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_MAIN_VXLAN_GROUP2_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; return (0); err_destory_groups: err = PTR_ERR(ft->g[ft->num_groups]); ft->g[ft->num_groups] = NULL; mlx5e_destroy_groups(ft); return (err); } static int mlx5e_create_main_vxlan_groups(struct mlx5e_flow_table *ft) { u32 *in; int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); int err; in = mlx5_vzalloc(inlen); if (!in) return (-ENOMEM); err = mlx5e_create_main_vxlan_groups_sub(ft, in, inlen); kvfree(in); return (err); } static int mlx5e_create_main_flow_table(struct mlx5e_priv *priv, bool inner_vxlan) { struct mlx5e_flow_table *ft = inner_vxlan ? &priv->fts.main_vxlan : &priv->fts.main; int err; ft->num_groups = 0; ft->t = mlx5_create_flow_table(priv->fts.ns, 0, inner_vxlan ? "vxlan_main" : "main", MLX5E_MAIN_TABLE_SIZE); if (IS_ERR(ft->t)) { err = PTR_ERR(ft->t); ft->t = NULL; return (err); } ft->g = kcalloc(MLX5E_NUM_MAIN_GROUPS, sizeof(*ft->g), GFP_KERNEL); if (!ft->g) { err = -ENOMEM; goto err_destroy_main_flow_table; } err = inner_vxlan ? mlx5e_create_main_vxlan_groups(ft) : mlx5e_create_main_groups(ft); if (err) goto err_free_g; return (0); err_free_g: kfree(ft->g); err_destroy_main_flow_table: mlx5_destroy_flow_table(ft->t); ft->t = NULL; return (err); } static void mlx5e_destroy_main_flow_table(struct mlx5e_priv *priv) { mlx5e_destroy_flow_table(&priv->fts.main); } static void mlx5e_destroy_main_vxlan_flow_table(struct mlx5e_priv *priv) { mlx5e_destroy_flow_table(&priv->fts.main_vxlan); } #define MLX5E_NUM_VLAN_GROUPS 3 #define MLX5E_VLAN_GROUP0_SIZE BIT(12) #define MLX5E_VLAN_GROUP1_SIZE BIT(1) #define MLX5E_VLAN_GROUP2_SIZE BIT(0) #define MLX5E_VLAN_TABLE_SIZE (MLX5E_VLAN_GROUP0_SIZE +\ MLX5E_VLAN_GROUP1_SIZE +\ MLX5E_VLAN_GROUP2_SIZE +\ 0) static int mlx5e_create_vlan_groups_sub(struct mlx5e_flow_table *ft, u32 *in, int inlen) { int err; int ix = 0; u8 *mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.first_vid); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_VLAN_GROUP0_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_VLAN_GROUP1_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.svlan_tag); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_VLAN_GROUP2_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; return (0); err_destory_groups: err = PTR_ERR(ft->g[ft->num_groups]); ft->g[ft->num_groups] = NULL; mlx5e_destroy_groups(ft); return (err); } static int mlx5e_create_vlan_groups(struct mlx5e_flow_table *ft) { u32 *in; int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); int err; in = mlx5_vzalloc(inlen); if (!in) return (-ENOMEM); err = mlx5e_create_vlan_groups_sub(ft, in, inlen); kvfree(in); return (err); } static int mlx5e_create_vlan_flow_table(struct mlx5e_priv *priv) { struct mlx5e_flow_table *ft = &priv->fts.vlan; int err; ft->num_groups = 0; ft->t = mlx5_create_flow_table(priv->fts.ns, 0, "vlan", MLX5E_VLAN_TABLE_SIZE); if (IS_ERR(ft->t)) { err = PTR_ERR(ft->t); ft->t = NULL; return (err); } ft->g = kcalloc(MLX5E_NUM_VLAN_GROUPS, sizeof(*ft->g), GFP_KERNEL); if (!ft->g) { err = -ENOMEM; goto err_destroy_vlan_flow_table; } err = mlx5e_create_vlan_groups(ft); if (err) goto err_free_g; return (0); err_free_g: kfree(ft->g); err_destroy_vlan_flow_table: mlx5_destroy_flow_table(ft->t); ft->t = NULL; return (err); } static void mlx5e_destroy_vlan_flow_table(struct mlx5e_priv *priv) { mlx5e_destroy_flow_table(&priv->fts.vlan); } static int mlx5e_add_vxlan_rule_sub(struct mlx5e_priv *priv, u32 *mc, u32 *mv, struct mlx5e_vxlan_db_el *el) { struct mlx5_flow_table *ft = priv->fts.vxlan.t; struct mlx5_flow_destination dest = {}; u8 mc_enable; struct mlx5_flow_rule **rule_p; int err = 0; dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; dest.ft = priv->fts.main_vxlan.t; mc_enable = MLX5_MATCH_OUTER_HEADERS; rule_p = &el->vxlan_ft_rule; MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); MLX5_SET(fte_match_param, mv, outer_headers.ethertype, el->proto); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); MLX5_SET(fte_match_param, mv, outer_headers.ip_protocol, IPPROTO_UDP); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.udp_dport); MLX5_SET(fte_match_param, mv, outer_headers.udp_dport, el->port); *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR(*rule_p)) { err = PTR_ERR(*rule_p); *rule_p = NULL; mlx5_en_err(priv->ifp, "add rule failed\n"); } return (err); } static struct mlx5e_vxlan_db_el * mlx5e_vxlan_find_db_el(struct mlx5e_priv *priv, u_int proto, u_int port) { struct mlx5e_vxlan_db_el *el; TAILQ_FOREACH(el, &priv->vxlan.head, link) { if (el->proto == proto && el->port == port) return (el); } return (NULL); } static struct mlx5e_vxlan_db_el * mlx5e_vxlan_alloc_db_el(struct mlx5e_priv *priv, u_int proto, u_int port) { struct mlx5e_vxlan_db_el *el; el = mlx5_vzalloc(sizeof(*el)); el->refcount = 1; el->proto = proto; el->port = port; el->vxlan_ft_rule = NULL; return (el); } static int mlx5e_vxlan_family_to_proto(sa_family_t family, u_int *proto) { switch (family) { case AF_INET: *proto = ETHERTYPE_IP; return (0); case AF_INET6: *proto = ETHERTYPE_IPV6; return (0); default: return (-EINVAL); } } static int mlx5e_add_vxlan_rule_from_db(struct mlx5e_priv *priv, struct mlx5e_vxlan_db_el *el) { u32 *match_criteria; u32 *match_value; int err; match_value = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); match_criteria = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); if (match_value == NULL || match_criteria == NULL) { mlx5_en_err(priv->ifp, "alloc failed\n"); err = -ENOMEM; goto add_vxlan_rule_out; } err = mlx5e_add_vxlan_rule_sub(priv, match_criteria, match_value, el); add_vxlan_rule_out: kvfree(match_criteria); kvfree(match_value); return (err); } static int mlx5e_add_vxlan_rule(struct mlx5e_priv *priv, sa_family_t family, u_int port) { struct mlx5e_vxlan_db_el *el; u_int proto; int err; err = mlx5e_vxlan_family_to_proto(family, &proto); if (err != 0) return (err); el = mlx5e_vxlan_find_db_el(priv, proto, port); if (el != NULL) { el->refcount++; if (el->installed) return (0); } el = mlx5e_vxlan_alloc_db_el(priv, proto, port); - if ((priv->ifp->if_capenable & IFCAP_VXLAN_HWCSUM) != 0) { + if ((if_getcapenable(priv->ifp) & IFCAP_VXLAN_HWCSUM) != 0) { err = mlx5e_add_vxlan_rule_from_db(priv, el); if (err == 0) el->installed = true; } if (err == 0) TAILQ_INSERT_TAIL(&priv->vxlan.head, el, link); else kvfree(el); return (err); } static int mlx5e_add_vxlan_catchall_rule_sub(struct mlx5e_priv *priv, u32 *mc, u32 *mv) { struct mlx5_flow_table *ft = priv->fts.vxlan.t; struct mlx5_flow_destination dest = {}; u8 mc_enable = 0; struct mlx5_flow_rule **rule_p; int err = 0; dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; dest.ft = priv->fts.main.t; rule_p = &priv->fts.vxlan_catchall_ft_rule; *rule_p = mlx5_add_flow_rule(ft, mc_enable, mc, mv, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, MLX5_FS_ETH_FLOW_TAG, &dest); if (IS_ERR(*rule_p)) { err = PTR_ERR(*rule_p); *rule_p = NULL; mlx5_en_err(priv->ifp, "add rule failed\n"); } return (err); } static int mlx5e_add_vxlan_catchall_rule(struct mlx5e_priv *priv) { u32 *match_criteria; u32 *match_value; int err; match_value = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); match_criteria = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); if (match_value == NULL || match_criteria == NULL) { mlx5_en_err(priv->ifp, "alloc failed\n"); err = -ENOMEM; goto add_vxlan_rule_out; } err = mlx5e_add_vxlan_catchall_rule_sub(priv, match_criteria, match_value); add_vxlan_rule_out: kvfree(match_criteria); kvfree(match_value); return (err); } int mlx5e_add_all_vxlan_rules(struct mlx5e_priv *priv) { struct mlx5e_vxlan_db_el *el; int err; err = 0; TAILQ_FOREACH(el, &priv->vxlan.head, link) { if (el->installed) continue; err = mlx5e_add_vxlan_rule_from_db(priv, el); if (err != 0) break; el->installed = true; } return (err); } static int mlx5e_del_vxlan_rule(struct mlx5e_priv *priv, sa_family_t family, u_int port) { struct mlx5e_vxlan_db_el *el; u_int proto; int err; err = mlx5e_vxlan_family_to_proto(family, &proto); if (err != 0) return (err); el = mlx5e_vxlan_find_db_el(priv, proto, port); if (el == NULL) return (0); if (el->refcount > 1) { el->refcount--; return (0); } if (el->installed) mlx5_del_flow_rule(el->vxlan_ft_rule); TAILQ_REMOVE(&priv->vxlan.head, el, link); kvfree(el); return (0); } void mlx5e_del_all_vxlan_rules(struct mlx5e_priv *priv) { struct mlx5e_vxlan_db_el *el; TAILQ_FOREACH(el, &priv->vxlan.head, link) { if (!el->installed) continue; mlx5_del_flow_rule(el->vxlan_ft_rule); el->installed = false; } } static void mlx5e_del_vxlan_catchall_rule(struct mlx5e_priv *priv) { mlx5_del_flow_rule(priv->fts.vxlan_catchall_ft_rule); } void -mlx5e_vxlan_start(void *arg, struct ifnet *ifp __unused, sa_family_t family, +mlx5e_vxlan_start(void *arg, if_t ifp __unused, sa_family_t family, u_int port) { struct mlx5e_priv *priv = arg; int err; PRIV_LOCK(priv); err = mlx5_vxlan_udp_port_add(priv->mdev, port); if (err == 0 && test_bit(MLX5E_STATE_FLOW_RULES_READY, &priv->state)) mlx5e_add_vxlan_rule(priv, family, port); PRIV_UNLOCK(priv); } void -mlx5e_vxlan_stop(void *arg, struct ifnet *ifp __unused, sa_family_t family, +mlx5e_vxlan_stop(void *arg, if_t ifp __unused, sa_family_t family, u_int port) { struct mlx5e_priv *priv = arg; PRIV_LOCK(priv); if (test_bit(MLX5E_STATE_FLOW_RULES_READY, &priv->state)) mlx5e_del_vxlan_rule(priv, family, port); (void)mlx5_vxlan_udp_port_delete(priv->mdev, port); PRIV_UNLOCK(priv); } #define MLX5E_VXLAN_GROUP0_SIZE BIT(3) /* XXXKIB */ #define MLX5E_VXLAN_GROUP1_SIZE BIT(0) #define MLX5E_NUM_VXLAN_GROUPS BIT(1) #define MLX5E_VXLAN_TABLE_SIZE \ (MLX5E_VXLAN_GROUP0_SIZE + MLX5E_VXLAN_GROUP1_SIZE) static int mlx5e_create_vxlan_groups_sub(struct mlx5e_flow_table *ft, u32 *in, int inlen) { int err; int ix = 0; u8 *mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.udp_dport); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_VXLAN_GROUP0_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; memset(in, 0, inlen); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_VXLAN_GROUP1_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; return (0); err_destory_groups: err = PTR_ERR(ft->g[ft->num_groups]); ft->g[ft->num_groups] = NULL; mlx5e_destroy_groups(ft); return (err); } static int mlx5e_create_vxlan_groups(struct mlx5e_flow_table *ft) { u32 *in; int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); int err; in = mlx5_vzalloc(inlen); if (!in) return (-ENOMEM); err = mlx5e_create_vxlan_groups_sub(ft, in, inlen); kvfree(in); return (err); } static int mlx5e_create_vxlan_flow_table(struct mlx5e_priv *priv) { struct mlx5e_flow_table *ft = &priv->fts.vxlan; int err; ft->num_groups = 0; ft->t = mlx5_create_flow_table(priv->fts.ns, 0, "vxlan", MLX5E_VXLAN_TABLE_SIZE); if (IS_ERR(ft->t)) { err = PTR_ERR(ft->t); ft->t = NULL; return (err); } ft->g = kcalloc(MLX5E_NUM_VXLAN_GROUPS, sizeof(*ft->g), GFP_KERNEL); if (!ft->g) { err = -ENOMEM; goto err_destroy_vxlan_flow_table; } err = mlx5e_create_vxlan_groups(ft); if (err) goto err_free_g; TAILQ_INIT(&priv->vxlan.head); return (0); err_free_g: kfree(ft->g); err_destroy_vxlan_flow_table: mlx5_destroy_flow_table(ft->t); ft->t = NULL; return (err); } #define MLX5E_NUM_INNER_RSS_GROUPS 3 #define MLX5E_INNER_RSS_GROUP0_SIZE BIT(3) #define MLX5E_INNER_RSS_GROUP1_SIZE BIT(1) #define MLX5E_INNER_RSS_GROUP2_SIZE BIT(0) #define MLX5E_INNER_RSS_TABLE_SIZE (MLX5E_INNER_RSS_GROUP0_SIZE +\ MLX5E_INNER_RSS_GROUP1_SIZE +\ MLX5E_INNER_RSS_GROUP2_SIZE +\ 0) static int mlx5e_create_inner_rss_groups_sub(struct mlx5e_flow_table *ft, u32 *in, int inlen) { u8 *mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); int err; int ix = 0; memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_INNER_HEADERS); MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ethertype); MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_protocol); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_INNER_RSS_GROUP0_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; memset(in, 0, inlen); MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_INNER_HEADERS); MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ethertype); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_INNER_RSS_GROUP1_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; memset(in, 0, inlen); MLX5_SET_CFG(in, start_flow_index, ix); ix += MLX5E_INNER_RSS_GROUP2_SIZE; MLX5_SET_CFG(in, end_flow_index, ix - 1); ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); if (IS_ERR(ft->g[ft->num_groups])) goto err_destory_groups; ft->num_groups++; return (0); err_destory_groups: err = PTR_ERR(ft->g[ft->num_groups]); ft->g[ft->num_groups] = NULL; mlx5e_destroy_groups(ft); return (err); } static int mlx5e_create_inner_rss_groups(struct mlx5e_flow_table *ft) { u32 *in; int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); int err; in = mlx5_vzalloc(inlen); if (!in) return (-ENOMEM); err = mlx5e_create_inner_rss_groups_sub(ft, in, inlen); kvfree(in); return (err); } static int mlx5e_create_inner_rss_flow_table(struct mlx5e_priv *priv) { struct mlx5e_flow_table *ft = &priv->fts.inner_rss; int err; ft->num_groups = 0; ft->t = mlx5_create_flow_table(priv->fts.ns, 0, "inner_rss", MLX5E_INNER_RSS_TABLE_SIZE); if (IS_ERR(ft->t)) { err = PTR_ERR(ft->t); ft->t = NULL; return (err); } ft->g = kcalloc(MLX5E_NUM_INNER_RSS_GROUPS, sizeof(*ft->g), GFP_KERNEL); if (!ft->g) { err = -ENOMEM; goto err_destroy_inner_rss_flow_table; } err = mlx5e_create_inner_rss_groups(ft); if (err) goto err_free_g; return (0); err_free_g: kfree(ft->g); err_destroy_inner_rss_flow_table: mlx5_destroy_flow_table(ft->t); ft->t = NULL; return (err); } static void mlx5e_destroy_inner_rss_flow_table(struct mlx5e_priv *priv) { mlx5e_destroy_flow_table(&priv->fts.inner_rss); } static void mlx5e_destroy_vxlan_flow_table(struct mlx5e_priv *priv) { mlx5e_destroy_flow_table(&priv->fts.vxlan); } int mlx5e_open_flow_tables(struct mlx5e_priv *priv) { int err; /* setup namespace pointer */ priv->fts.ns = mlx5_get_flow_namespace( priv->mdev, MLX5_FLOW_NAMESPACE_KERNEL); err = mlx5e_create_vlan_flow_table(priv); if (err) return (err); err = mlx5e_create_vxlan_flow_table(priv); if (err) goto err_destroy_vlan_flow_table; err = mlx5e_create_main_flow_table(priv, true); if (err) goto err_destroy_vxlan_flow_table; err = mlx5e_create_inner_rss_flow_table(priv); if (err) goto err_destroy_main_flow_table_true; err = mlx5e_create_main_flow_table(priv, false); if (err) goto err_destroy_inner_rss_flow_table; err = mlx5e_add_vxlan_catchall_rule(priv); if (err) goto err_destroy_main_flow_table_false; err = mlx5e_accel_fs_tcp_create(priv); if (err) goto err_del_vxlan_catchall_rule; return (0); err_del_vxlan_catchall_rule: mlx5e_del_vxlan_catchall_rule(priv); err_destroy_main_flow_table_false: mlx5e_destroy_main_flow_table(priv); err_destroy_inner_rss_flow_table: mlx5e_destroy_inner_rss_flow_table(priv); err_destroy_main_flow_table_true: mlx5e_destroy_main_vxlan_flow_table(priv); err_destroy_vxlan_flow_table: mlx5e_destroy_vxlan_flow_table(priv); err_destroy_vlan_flow_table: mlx5e_destroy_vlan_flow_table(priv); return (err); } void mlx5e_close_flow_tables(struct mlx5e_priv *priv) { mlx5e_accel_fs_tcp_destroy(priv); mlx5e_del_vxlan_catchall_rule(priv); mlx5e_destroy_main_flow_table(priv); mlx5e_destroy_inner_rss_flow_table(priv); mlx5e_destroy_main_vxlan_flow_table(priv); mlx5e_destroy_vxlan_flow_table(priv); mlx5e_destroy_vlan_flow_table(priv); } int mlx5e_open_flow_rules(struct mlx5e_priv *priv) { int err; err = mlx5e_add_all_vlan_rules(priv); if (err) return (err); err = mlx5e_add_main_vxlan_rules(priv); if (err) goto err_del_all_vlan_rules; err = mlx5e_add_all_vxlan_rules(priv); if (err) goto err_del_main_vxlan_rules; mlx5e_set_rx_mode_core(priv, true); set_bit(MLX5E_STATE_FLOW_RULES_READY, &priv->state); return (0); err_del_main_vxlan_rules: mlx5e_del_main_vxlan_rules(priv); err_del_all_vlan_rules: mlx5e_del_all_vlan_rules(priv); return (err); } void mlx5e_close_flow_rules(struct mlx5e_priv *priv) { clear_bit(MLX5E_STATE_FLOW_RULES_READY, &priv->state); mlx5e_set_rx_mode_core(priv, false); mlx5e_del_all_vxlan_rules(priv); mlx5e_del_main_vxlan_rules(priv); mlx5e_del_all_vlan_rules(priv); } diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c index e6b4759aeb7a..b99a7c98500c 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls.c @@ -1,822 +1,822 @@ /*- * Copyright (c) 2019-2021 Mellanox Technologies. All rights reserved. * Copyright (c) 2022 NVIDIA corporation & affiliates. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_kern_tls.h" #include "opt_rss.h" #include "opt_ratelimit.h" #include #include #include #include #include #ifdef KERN_TLS #ifdef RATELIMIT static if_snd_tag_modify_t mlx5e_tls_rl_snd_tag_modify; #endif static if_snd_tag_query_t mlx5e_tls_snd_tag_query; static if_snd_tag_free_t mlx5e_tls_snd_tag_free; static const struct if_snd_tag_sw mlx5e_tls_snd_tag_sw = { .snd_tag_query = mlx5e_tls_snd_tag_query, .snd_tag_free = mlx5e_tls_snd_tag_free, .type = IF_SND_TAG_TYPE_TLS }; #ifdef RATELIMIT static const struct if_snd_tag_sw mlx5e_tls_rl_snd_tag_sw = { .snd_tag_modify = mlx5e_tls_rl_snd_tag_modify, .snd_tag_query = mlx5e_tls_snd_tag_query, .snd_tag_free = mlx5e_tls_snd_tag_free, .type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT }; #endif MALLOC_DEFINE(M_MLX5E_TLS, "MLX5E_TLS", "MLX5 ethernet HW TLS"); /* software TLS context */ struct mlx5_ifc_sw_tls_cntx_bits { struct mlx5_ifc_tls_static_params_bits param; struct mlx5_ifc_tls_progress_params_bits progress; struct { uint8_t key_data[8][0x20]; uint8_t key_len[0x20]; } key; }; CTASSERT(MLX5_ST_SZ_BYTES(sw_tls_cntx) <= sizeof(((struct mlx5e_tls_tag *)0)->crypto_params)); CTASSERT(MLX5_ST_SZ_BYTES(mkc) == sizeof(((struct mlx5e_tx_umr_wqe *)0)->mkc)); static const char *mlx5e_tls_stats_desc[] = { MLX5E_TLS_STATS(MLX5E_STATS_DESC) }; static void mlx5e_tls_work(struct work_struct *); static int mlx5e_tls_tag_import(void *arg, void **store, int cnt, int domain, int flags) { struct mlx5e_tls_tag *ptag; int i; for (i = 0; i != cnt; i++) { ptag = malloc_domainset(sizeof(*ptag), M_MLX5E_TLS, mlx5_dev_domainset(arg), flags | M_ZERO); mtx_init(&ptag->mtx, "mlx5-tls-tag-mtx", NULL, MTX_DEF); INIT_WORK(&ptag->work, mlx5e_tls_work); store[i] = ptag; } return (i); } static void mlx5e_tls_tag_release(void *arg, void **store, int cnt) { struct mlx5e_tls_tag *ptag; struct mlx5e_priv *priv; struct mlx5e_tls *ptls; int i; for (i = 0; i != cnt; i++) { ptag = store[i]; ptls = ptag->tls; priv = container_of(ptls, struct mlx5e_priv, tls); flush_work(&ptag->work); if (ptag->tisn != 0) { mlx5_tls_close_tis(priv->mdev, ptag->tisn); atomic_add_32(&ptls->num_resources, -1U); } mtx_destroy(&ptag->mtx); free(ptag, M_MLX5E_TLS); } } static void mlx5e_tls_tag_zfree(struct mlx5e_tls_tag *ptag) { /* make sure any unhandled taskqueue events are ignored */ ptag->state = MLX5E_TLS_ST_FREED; /* reset some variables */ ptag->dek_index = 0; ptag->dek_index_ok = 0; /* avoid leaking keys */ memset(ptag->crypto_params, 0, sizeof(ptag->crypto_params)); /* update number of TIS contexts */ if (ptag->tisn == 0) atomic_add_32(&ptag->tls->num_resources, -1U); /* return tag to UMA */ uma_zfree(ptag->tls->zone, ptag); } int mlx5e_tls_init(struct mlx5e_priv *priv) { struct mlx5e_tls *ptls = &priv->tls; struct sysctl_oid *node; uint32_t x; if (MLX5_CAP_GEN(priv->mdev, tls_tx) == 0 || MLX5_CAP_GEN(priv->mdev, log_max_dek) == 0) return (0); ptls->wq = create_singlethread_workqueue("mlx5-tls-wq"); if (ptls->wq == NULL) return (ENOMEM); sysctl_ctx_init(&ptls->ctx); snprintf(ptls->zname, sizeof(ptls->zname), "mlx5_%u_tls", device_get_unit(priv->mdev->pdev->dev.bsddev)); ptls->zone = uma_zcache_create(ptls->zname, sizeof(struct mlx5e_tls_tag), NULL, NULL, NULL, NULL, mlx5e_tls_tag_import, mlx5e_tls_tag_release, priv->mdev, UMA_ZONE_UNMANAGED); /* shared between RX and TX TLS */ ptls->max_resources = 1U << (MLX5_CAP_GEN(priv->mdev, log_max_dek) - 1); for (x = 0; x != MLX5E_TLS_STATS_NUM; x++) ptls->stats.arg[x] = counter_u64_alloc(M_WAITOK); ptls->init = 1; node = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "tls", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Hardware TLS offload"); if (node == NULL) return (0); mlx5e_create_counter_stats(&ptls->ctx, SYSCTL_CHILDREN(node), "stats", mlx5e_tls_stats_desc, MLX5E_TLS_STATS_NUM, ptls->stats.arg); return (0); } void mlx5e_tls_cleanup(struct mlx5e_priv *priv) { struct mlx5e_tls *ptls = &priv->tls; uint32_t x; if (ptls->init == 0) return; ptls->init = 0; flush_workqueue(ptls->wq); sysctl_ctx_free(&ptls->ctx); uma_zdestroy(ptls->zone); destroy_workqueue(ptls->wq); /* check if all resources are freed */ MPASS(priv->tls.num_resources == 0); for (x = 0; x != MLX5E_TLS_STATS_NUM; x++) counter_u64_free(ptls->stats.arg[x]); } static void mlx5e_tls_work(struct work_struct *work) { struct mlx5e_tls_tag *ptag; struct mlx5e_priv *priv; int err; ptag = container_of(work, struct mlx5e_tls_tag, work); priv = container_of(ptag->tls, struct mlx5e_priv, tls); switch (ptag->state) { case MLX5E_TLS_ST_INIT: /* try to open TIS, if not present */ if (ptag->tisn == 0) { err = mlx5_tls_open_tis(priv->mdev, 0, priv->tdn, priv->pdn, &ptag->tisn); if (err) { MLX5E_TLS_STAT_INC(ptag, tx_error, 1); break; } } MLX5_SET(sw_tls_cntx, ptag->crypto_params, progress.pd, ptag->tisn); /* try to allocate a DEK context ID */ err = mlx5_encryption_key_create(priv->mdev, priv->pdn, MLX5_ADDR_OF(sw_tls_cntx, ptag->crypto_params, key.key_data), MLX5_GET(sw_tls_cntx, ptag->crypto_params, key.key_len), &ptag->dek_index); if (err) { MLX5E_TLS_STAT_INC(ptag, tx_error, 1); break; } MLX5_SET(sw_tls_cntx, ptag->crypto_params, param.dek_index, ptag->dek_index); ptag->dek_index_ok = 1; MLX5E_TLS_TAG_LOCK(ptag); if (ptag->state == MLX5E_TLS_ST_INIT) ptag->state = MLX5E_TLS_ST_SETUP; MLX5E_TLS_TAG_UNLOCK(ptag); break; case MLX5E_TLS_ST_RELEASE: /* try to destroy DEK context by ID */ if (ptag->dek_index_ok) err = mlx5_encryption_key_destroy(priv->mdev, ptag->dek_index); /* free tag */ mlx5e_tls_tag_zfree(ptag); break; default: break; } } static int mlx5e_tls_set_params(void *ctx, const struct tls_session_params *en) { MLX5_SET(sw_tls_cntx, ctx, param.const_2, 2); if (en->tls_vminor == TLS_MINOR_VER_TWO) MLX5_SET(sw_tls_cntx, ctx, param.tls_version, 2); /* v1.2 */ else MLX5_SET(sw_tls_cntx, ctx, param.tls_version, 3); /* v1.3 */ MLX5_SET(sw_tls_cntx, ctx, param.const_1, 1); MLX5_SET(sw_tls_cntx, ctx, param.encryption_standard, 1); /* TLS */ /* copy the initial vector in place */ switch (en->iv_len) { case MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.gcm_iv): case MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.gcm_iv) + MLX5_FLD_SZ_BYTES(sw_tls_cntx, param.implicit_iv): memcpy(MLX5_ADDR_OF(sw_tls_cntx, ctx, param.gcm_iv), en->iv, en->iv_len); break; default: return (EINVAL); } if (en->cipher_key_len <= MLX5_FLD_SZ_BYTES(sw_tls_cntx, key.key_data)) { memcpy(MLX5_ADDR_OF(sw_tls_cntx, ctx, key.key_data), en->cipher_key, en->cipher_key_len); MLX5_SET(sw_tls_cntx, ctx, key.key_len, en->cipher_key_len); } else { return (EINVAL); } return (0); } /* Verify zero default */ CTASSERT(MLX5E_TLS_ST_INIT == 0); int -mlx5e_tls_snd_tag_alloc(struct ifnet *ifp, +mlx5e_tls_snd_tag_alloc(if_t ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { union if_snd_tag_alloc_params rl_params; const struct if_snd_tag_sw *snd_tag_sw; struct mlx5e_priv *priv; struct mlx5e_tls_tag *ptag; const struct tls_session_params *en; int error; - priv = ifp->if_softc; + priv = if_getsoftc(ifp); if (priv->gone != 0 || priv->tls.init == 0) return (EOPNOTSUPP); /* allocate new tag from zone, if any */ ptag = uma_zalloc(priv->tls.zone, M_NOWAIT); if (ptag == NULL) return (ENOMEM); /* sanity check default values */ MPASS(ptag->dek_index == 0); MPASS(ptag->dek_index_ok == 0); /* setup TLS tag */ ptag->tls = &priv->tls; /* check if there is no TIS context */ if (ptag->tisn == 0) { uint32_t value; value = atomic_fetchadd_32(&priv->tls.num_resources, 1U); /* check resource limits */ if (value >= priv->tls.max_resources) { error = ENOMEM; goto failure; } } en = ¶ms->tls.tls->params; /* only TLS v1.2 and v1.3 is currently supported */ if (en->tls_vmajor != TLS_MAJOR_VER_ONE || (en->tls_vminor != TLS_MINOR_VER_TWO #ifdef TLS_MINOR_VER_THREE && en->tls_vminor != TLS_MINOR_VER_THREE #endif )) { error = EPROTONOSUPPORT; goto failure; } switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: switch (en->cipher_key_len) { case 128 / 8: if (en->tls_vminor == TLS_MINOR_VER_TWO) { if (MLX5_CAP_TLS(priv->mdev, tls_1_2_aes_gcm_128) == 0) { error = EPROTONOSUPPORT; goto failure; } } else { if (MLX5_CAP_TLS(priv->mdev, tls_1_3_aes_gcm_128) == 0) { error = EPROTONOSUPPORT; goto failure; } } error = mlx5e_tls_set_params(ptag->crypto_params, en); if (error) goto failure; break; case 256 / 8: if (en->tls_vminor == TLS_MINOR_VER_TWO) { if (MLX5_CAP_TLS(priv->mdev, tls_1_2_aes_gcm_256) == 0) { error = EPROTONOSUPPORT; goto failure; } } else { if (MLX5_CAP_TLS(priv->mdev, tls_1_3_aes_gcm_256) == 0) { error = EPROTONOSUPPORT; goto failure; } } error = mlx5e_tls_set_params(ptag->crypto_params, en); if (error) goto failure; break; default: error = EINVAL; goto failure; } break; default: error = EPROTONOSUPPORT; goto failure; } memset(&rl_params, 0, sizeof(rl_params)); rl_params.hdr = params->hdr; switch (params->hdr.type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_TLS_RATE_LIMIT: rl_params.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT; rl_params.rate_limit.max_rate = params->tls_rate_limit.max_rate; snd_tag_sw = &mlx5e_tls_rl_snd_tag_sw; break; #endif case IF_SND_TAG_TYPE_TLS: rl_params.hdr.type = IF_SND_TAG_TYPE_UNLIMITED; snd_tag_sw = &mlx5e_tls_snd_tag_sw; break; default: error = EOPNOTSUPP; goto failure; } error = m_snd_tag_alloc(ifp, &rl_params, &ptag->rl_tag); if (error) goto failure; /* store pointer to mbuf tag */ MPASS(ptag->tag.refcount == 0); m_snd_tag_init(&ptag->tag, ifp, snd_tag_sw); *ppmt = &ptag->tag; /* reset state */ ptag->state = MLX5E_TLS_ST_INIT; queue_work(priv->tls.wq, &ptag->work); flush_work(&ptag->work); return (0); failure: mlx5e_tls_tag_zfree(ptag); return (error); } #ifdef RATELIMIT static int mlx5e_tls_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params) { union if_snd_tag_modify_params rl_params; struct mlx5e_tls_tag *ptag = container_of(pmt, struct mlx5e_tls_tag, tag); int error; memset(&rl_params, 0, sizeof(rl_params)); rl_params.rate_limit.max_rate = params->tls_rate_limit.max_rate; error = ptag->rl_tag->sw->snd_tag_modify(ptag->rl_tag, &rl_params); return (error); } #endif static int mlx5e_tls_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params) { struct mlx5e_tls_tag *ptag = container_of(pmt, struct mlx5e_tls_tag, tag); return (ptag->rl_tag->sw->snd_tag_query(ptag->rl_tag, params)); } static void mlx5e_tls_snd_tag_free(struct m_snd_tag *pmt) { struct mlx5e_tls_tag *ptag = container_of(pmt, struct mlx5e_tls_tag, tag); struct mlx5e_priv *priv; m_snd_tag_rele(ptag->rl_tag); MLX5E_TLS_TAG_LOCK(ptag); ptag->state = MLX5E_TLS_ST_RELEASE; MLX5E_TLS_TAG_UNLOCK(ptag); - priv = ptag->tag.ifp->if_softc; + priv = if_getsoftc(ptag->tag.ifp); queue_work(priv->tls.wq, &ptag->work); } CTASSERT((MLX5_FLD_SZ_BYTES(sw_tls_cntx, param) % 16) == 0); static void mlx5e_tls_send_static_parameters(struct mlx5e_sq *sq, struct mlx5e_tls_tag *ptag) { const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_umr_wqe) + MLX5_FLD_SZ_BYTES(sw_tls_cntx, param), MLX5_SEND_WQE_DS); struct mlx5e_tx_umr_wqe *wqe; u16 pi; pi = sq->pc & sq->wq.sz_m1; wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); memset(wqe, 0, sizeof(*wqe)); wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_UMR | (MLX5_OPCODE_MOD_UMR_TLS_TIS_STATIC_PARAMS << 24)); wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); wqe->ctrl.imm = cpu_to_be32(ptag->tisn << 8); if (mlx5e_do_send_cqe(sq)) wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL; else wqe->ctrl.fm_ce_se = MLX5_FENCE_MODE_INITIATOR_SMALL; /* fill out UMR control segment */ wqe->umr.flags = 0x80; /* inline data */ wqe->umr.bsf_octowords = cpu_to_be16(MLX5_FLD_SZ_BYTES(sw_tls_cntx, param) / 16); /* copy in the static crypto parameters */ memcpy(wqe + 1, MLX5_ADDR_OF(sw_tls_cntx, ptag->crypto_params, param), MLX5_FLD_SZ_BYTES(sw_tls_cntx, param)); /* copy data for doorbell */ memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32)); sq->mbuf[pi].mbuf = NULL; sq->mbuf[pi].num_bytes = 0; sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); sq->mbuf[pi].mst = m_snd_tag_ref(&ptag->tag); sq->pc += sq->mbuf[pi].num_wqebbs; } CTASSERT(MLX5_FLD_SZ_BYTES(sw_tls_cntx, progress) == sizeof(((struct mlx5e_tx_psv_wqe *)0)->psv)); static void mlx5e_tls_send_progress_parameters(struct mlx5e_sq *sq, struct mlx5e_tls_tag *ptag) { const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_psv_wqe), MLX5_SEND_WQE_DS); struct mlx5e_tx_psv_wqe *wqe; u16 pi; pi = sq->pc & sq->wq.sz_m1; wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); memset(wqe, 0, sizeof(*wqe)); wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_SET_PSV | (MLX5_OPCODE_MOD_PSV_TLS_TIS_PROGRESS_PARAMS << 24)); wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); if (mlx5e_do_send_cqe(sq)) wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; /* copy in the PSV control segment */ memcpy(&wqe->psv, MLX5_ADDR_OF(sw_tls_cntx, ptag->crypto_params, progress), sizeof(wqe->psv)); /* copy data for doorbell */ memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32)); sq->mbuf[pi].mbuf = NULL; sq->mbuf[pi].num_bytes = 0; sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); sq->mbuf[pi].mst = m_snd_tag_ref(&ptag->tag); sq->pc += sq->mbuf[pi].num_wqebbs; } static void mlx5e_tls_send_nop(struct mlx5e_sq *sq, struct mlx5e_tls_tag *ptag) { const u32 ds_cnt = MLX5_SEND_WQEBB_NUM_DS; struct mlx5e_tx_wqe *wqe; u16 pi; pi = sq->pc & sq->wq.sz_m1; wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); memset(&wqe->ctrl, 0, sizeof(wqe->ctrl)); wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP); wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); if (mlx5e_do_send_cqe(sq)) wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL; else wqe->ctrl.fm_ce_se = MLX5_FENCE_MODE_INITIATOR_SMALL; /* Copy data for doorbell */ memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32)); sq->mbuf[pi].mbuf = NULL; sq->mbuf[pi].num_bytes = 0; sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); sq->mbuf[pi].mst = m_snd_tag_ref(&ptag->tag); sq->pc += sq->mbuf[pi].num_wqebbs; } #define SBTLS_MBUF_NO_DATA ((struct mbuf *)1) static struct mbuf * sbtls_recover_record(struct mbuf *mb, int wait, uint32_t tcp_old, uint32_t *ptcp_seq, bool *pis_start) { struct mbuf *mr, *top; uint32_t offset; uint32_t delta; /* check format of incoming mbuf */ if (mb->m_next == NULL || (mb->m_next->m_flags & (M_EXTPG | M_EXT)) != (M_EXTPG | M_EXT)) { top = NULL; goto done; } /* get unmapped data offset */ offset = mtod(mb->m_next, uintptr_t); /* check if we don't need to re-transmit anything */ if (offset == 0) { top = SBTLS_MBUF_NO_DATA; *pis_start = true; goto done; } /* try to get a new packet header */ top = m_gethdr(wait, MT_DATA); if (top == NULL) goto done; mr = m_get(wait, MT_DATA); if (mr == NULL) { m_free(top); top = NULL; goto done; } top->m_next = mr; mb_dupcl(mr, mb->m_next); /* the beginning of the TLS record */ mr->m_data = NULL; /* setup packet header length */ top->m_pkthdr.len = mr->m_len = offset; top->m_len = 0; /* check for partial re-transmit */ delta = *ptcp_seq - tcp_old; if (delta < offset) { m_adj(top, offset - delta); offset = delta; /* continue where we left off */ *pis_start = false; } else { *pis_start = true; } /* * Rewind the TCP sequence number by the amount of data * retransmitted: */ *ptcp_seq -= offset; done: return (top); } static int mlx5e_sq_tls_populate(struct mbuf *mb, uint64_t *pseq) { for (; mb != NULL; mb = mb->m_next) { if (!(mb->m_flags & M_EXTPG)) continue; *pseq = mb->m_epg_seqno; return (1); } return (0); } int mlx5e_sq_tls_xmit(struct mlx5e_sq *sq, struct mlx5e_xmit_args *parg, struct mbuf **ppmb) { struct mlx5e_tls_tag *ptls_tag; struct m_snd_tag *ptag; const struct tcphdr *th; struct mbuf *mb = *ppmb; u64 rcd_sn; u32 header_size; u32 mb_seq; if ((mb->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0) return (MLX5E_TLS_CONTINUE); ptag = mb->m_pkthdr.snd_tag; if ( #ifdef RATELIMIT ptag->sw->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT && #endif ptag->sw->type != IF_SND_TAG_TYPE_TLS) return (MLX5E_TLS_CONTINUE); ptls_tag = container_of(ptag, struct mlx5e_tls_tag, tag); header_size = mlx5e_get_full_header_size(mb, &th); if (unlikely(header_size == 0 || th == NULL)) return (MLX5E_TLS_FAILURE); /* * Send non-TLS TCP packets AS-IS: */ if (header_size == mb->m_pkthdr.len || mlx5e_sq_tls_populate(mb, &rcd_sn) == 0) { parg->tisn = 0; parg->ihs = header_size; return (MLX5E_TLS_CONTINUE); } mb_seq = ntohl(th->th_seq); MLX5E_TLS_TAG_LOCK(ptls_tag); switch (ptls_tag->state) { case MLX5E_TLS_ST_INIT: MLX5E_TLS_TAG_UNLOCK(ptls_tag); return (MLX5E_TLS_FAILURE); case MLX5E_TLS_ST_SETUP: ptls_tag->state = MLX5E_TLS_ST_TXRDY; ptls_tag->expected_seq = ~mb_seq; /* force setup */ default: MLX5E_TLS_TAG_UNLOCK(ptls_tag); break; } if (unlikely(ptls_tag->expected_seq != mb_seq)) { bool is_start; struct mbuf *r_mb; uint32_t tcp_seq = mb_seq; r_mb = sbtls_recover_record(mb, M_NOWAIT, ptls_tag->expected_seq, &tcp_seq, &is_start); if (r_mb == NULL) { MLX5E_TLS_STAT_INC(ptls_tag, tx_error, 1); return (MLX5E_TLS_FAILURE); } MLX5E_TLS_STAT_INC(ptls_tag, tx_packets_ooo, 1); /* check if this is the first fragment of a TLS record */ if (is_start) { /* setup TLS static parameters */ MLX5_SET64(sw_tls_cntx, ptls_tag->crypto_params, param.initial_record_number, rcd_sn); /* * NOTE: The sendqueue should have enough room to * carry both the static and the progress parameters * when we get here! */ mlx5e_tls_send_static_parameters(sq, ptls_tag); mlx5e_tls_send_progress_parameters(sq, ptls_tag); if (r_mb == SBTLS_MBUF_NO_DATA) { mlx5e_tls_send_nop(sq, ptls_tag); ptls_tag->expected_seq = mb_seq; return (MLX5E_TLS_LOOP); } } MLX5E_TLS_STAT_INC(ptls_tag, tx_bytes_ooo, r_mb->m_pkthdr.len); /* setup transmit arguments */ parg->tisn = ptls_tag->tisn; parg->mst = &ptls_tag->tag; /* try to send DUMP data */ if (mlx5e_sq_dump_xmit(sq, parg, &r_mb) != 0) { m_freem(r_mb); ptls_tag->expected_seq = tcp_seq; return (MLX5E_TLS_FAILURE); } else { ptls_tag->expected_seq = mb_seq; return (MLX5E_TLS_LOOP); } } else { MLX5E_TLS_STAT_INC(ptls_tag, tx_packets, 1); MLX5E_TLS_STAT_INC(ptls_tag, tx_bytes, mb->m_pkthdr.len); } ptls_tag->expected_seq += mb->m_pkthdr.len - header_size; parg->tisn = ptls_tag->tisn; parg->ihs = header_size; parg->mst = &ptls_tag->tag; return (MLX5E_TLS_CONTINUE); } #else int mlx5e_tls_init(struct mlx5e_priv *priv) { return (0); } void mlx5e_tls_cleanup(struct mlx5e_priv *priv) { /* NOP */ } #endif /* KERN_TLS */ diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c index 356962cfad99..576eea1d7660 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_hw_tls_rx.c @@ -1,1018 +1,1018 @@ /*- * Copyright (c) 2021-2022 NVIDIA corporation & affiliates. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_kern_tls.h" #include "opt_rss.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #ifdef KERN_TLS static if_snd_tag_free_t mlx5e_tls_rx_snd_tag_free; static if_snd_tag_modify_t mlx5e_tls_rx_snd_tag_modify; static const struct if_snd_tag_sw mlx5e_tls_rx_snd_tag_sw = { .snd_tag_modify = mlx5e_tls_rx_snd_tag_modify, .snd_tag_free = mlx5e_tls_rx_snd_tag_free, .type = IF_SND_TAG_TYPE_TLS_RX }; MALLOC_DEFINE(M_MLX5E_TLS_RX, "MLX5E_TLS_RX", "MLX5 ethernet HW TLS RX"); /* software TLS RX context */ struct mlx5_ifc_sw_tls_rx_cntx_bits { struct mlx5_ifc_tls_static_params_bits param; struct mlx5_ifc_tls_progress_params_bits progress; struct { uint8_t key_data[8][0x20]; uint8_t key_len[0x20]; } key; }; CTASSERT(MLX5_ST_SZ_BYTES(sw_tls_rx_cntx) <= sizeof(((struct mlx5e_tls_rx_tag *)NULL)->crypto_params)); CTASSERT(MLX5_ST_SZ_BYTES(mkc) == sizeof(((struct mlx5e_tx_umr_wqe *)NULL)->mkc)); static const char *mlx5e_tls_rx_stats_desc[] = { MLX5E_TLS_RX_STATS(MLX5E_STATS_DESC) }; static void mlx5e_tls_rx_work(struct work_struct *); static bool mlx5e_tls_rx_snd_tag_find_tcp_sn_and_tls_rcd(struct mlx5e_tls_rx_tag *, uint32_t, uint32_t *, uint64_t *); CTASSERT((MLX5_FLD_SZ_BYTES(sw_tls_rx_cntx, param) % 16) == 0); static uint32_t mlx5e_tls_rx_get_ch(struct mlx5e_priv *priv, uint32_t flowid, uint32_t flowtype) { u32 ch; #ifdef RSS u32 temp; #endif /* keep this code synced with mlx5e_select_queue() */ ch = priv->params.num_channels; #ifdef RSS if (rss_hash2bucket(flowid, flowtype, &temp) == 0) ch = temp % ch; else #endif ch = (flowid % 128) % ch; return (ch); } /* * This function gets a pointer to an internal queue, IQ, based on the * provided "flowid" and "flowtype". The IQ returned may in some rare * cases not be activated or running, but this is all handled by the * "mlx5e_iq_get_producer_index()" function. * * The idea behind this function is to spread the IQ traffic as much * as possible and to avoid congestion on the same IQ when processing * RX traffic. */ static struct mlx5e_iq * mlx5e_tls_rx_get_iq(struct mlx5e_priv *priv, uint32_t flowid, uint32_t flowtype) { /* * NOTE: The channels array is only freed at detach * and it safe to return a pointer to the send tag * inside the channels structure as long as we * reference the priv. */ return (&priv->channel[mlx5e_tls_rx_get_ch(priv, flowid, flowtype)].iq); } static void mlx5e_tls_rx_send_static_parameters_cb(void *arg) { struct mlx5e_tls_rx_tag *ptag; ptag = (struct mlx5e_tls_rx_tag *)arg; m_snd_tag_rele(&ptag->tag); } /* * This function sends the so-called TLS RX static parameters to the * hardware. These parameters are temporarily stored in the * "crypto_params" field of the TLS RX tag. Most importantly this * function sets the TCP sequence number (32-bit) and TLS record * number (64-bit) where the decryption can resume. * * Zero is returned upon success. Else some error happend. */ static int mlx5e_tls_rx_send_static_parameters(struct mlx5e_iq *iq, struct mlx5e_tls_rx_tag *ptag) { const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_umr_wqe) + MLX5_FLD_SZ_BYTES(sw_tls_rx_cntx, param), MLX5_SEND_WQE_DS); struct mlx5e_tx_umr_wqe *wqe; int pi; mtx_lock(&iq->lock); pi = mlx5e_iq_get_producer_index(iq); if (pi < 0) { mtx_unlock(&iq->lock); return (-ENOMEM); } wqe = mlx5_wq_cyc_get_wqe(&iq->wq, pi); memset(wqe, 0, sizeof(*wqe)); wqe->ctrl.opmod_idx_opcode = cpu_to_be32((iq->pc << 8) | MLX5_OPCODE_UMR | (MLX5_OPCODE_MOD_UMR_TLS_TIR_STATIC_PARAMS << 24)); wqe->ctrl.qpn_ds = cpu_to_be32((iq->sqn << 8) | ds_cnt); wqe->ctrl.imm = cpu_to_be32(ptag->tirn << 8); wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL; /* fill out UMR control segment */ wqe->umr.flags = 0x80; /* inline data */ wqe->umr.bsf_octowords = cpu_to_be16(MLX5_FLD_SZ_BYTES(sw_tls_rx_cntx, param) / 16); /* copy in the static crypto parameters */ memcpy(wqe + 1, MLX5_ADDR_OF(sw_tls_rx_cntx, ptag->crypto_params, param), MLX5_FLD_SZ_BYTES(sw_tls_rx_cntx, param)); /* copy data for doorbell */ memcpy(iq->doorbell.d32, &wqe->ctrl, sizeof(iq->doorbell.d32)); iq->data[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); iq->data[pi].callback = &mlx5e_tls_rx_send_static_parameters_cb; iq->data[pi].arg = ptag; m_snd_tag_ref(&ptag->tag); iq->pc += iq->data[pi].num_wqebbs; mlx5e_iq_notify_hw(iq); mtx_unlock(&iq->lock); return (0); /* success */ } static void mlx5e_tls_rx_send_progress_parameters_cb(void *arg) { struct mlx5e_tls_rx_tag *ptag; ptag = (struct mlx5e_tls_rx_tag *)arg; complete(&ptag->progress_complete); } CTASSERT(MLX5_FLD_SZ_BYTES(sw_tls_rx_cntx, progress) == sizeof(((struct mlx5e_tx_psv_wqe *)NULL)->psv)); /* * This function resets the state of the TIR context to start * searching for a valid TLS header and is used only when allocating * the TLS RX tag. * * Zero is returned upon success, else some error happened. */ static int mlx5e_tls_rx_send_progress_parameters_sync(struct mlx5e_iq *iq, struct mlx5e_tls_rx_tag *ptag) { const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_psv_wqe), MLX5_SEND_WQE_DS); struct mlx5e_priv *priv; struct mlx5e_tx_psv_wqe *wqe; int pi; mtx_lock(&iq->lock); pi = mlx5e_iq_get_producer_index(iq); if (pi < 0) { mtx_unlock(&iq->lock); return (-ENOMEM); } wqe = mlx5_wq_cyc_get_wqe(&iq->wq, pi); memset(wqe, 0, sizeof(*wqe)); wqe->ctrl.opmod_idx_opcode = cpu_to_be32((iq->pc << 8) | MLX5_OPCODE_SET_PSV | (MLX5_OPCODE_MOD_PSV_TLS_TIR_PROGRESS_PARAMS << 24)); wqe->ctrl.qpn_ds = cpu_to_be32((iq->sqn << 8) | ds_cnt); wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; /* copy in the PSV control segment */ memcpy(&wqe->psv, MLX5_ADDR_OF(sw_tls_rx_cntx, ptag->crypto_params, progress), sizeof(wqe->psv)); /* copy data for doorbell */ memcpy(iq->doorbell.d32, &wqe->ctrl, sizeof(iq->doorbell.d32)); iq->data[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); iq->data[pi].callback = &mlx5e_tls_rx_send_progress_parameters_cb; iq->data[pi].arg = ptag; iq->pc += iq->data[pi].num_wqebbs; init_completion(&ptag->progress_complete); mlx5e_iq_notify_hw(iq); mtx_unlock(&iq->lock); while (1) { if (wait_for_completion_timeout(&ptag->progress_complete, hz) != 0) break; priv = container_of(iq, struct mlx5e_channel, iq)->priv; if (priv->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || pci_channel_offline(priv->mdev->pdev) != 0) return (-EWOULDBLOCK); } return (0); /* success */ } CTASSERT(MLX5E_TLS_RX_PROGRESS_BUFFER_SIZE >= MLX5_ST_SZ_BYTES(tls_progress_params)); CTASSERT(MLX5E_TLS_RX_PROGRESS_BUFFER_SIZE <= PAGE_SIZE); struct mlx5e_get_tls_progress_params_wqe { struct mlx5_wqe_ctrl_seg ctrl; struct mlx5_seg_get_psv psv; }; static void mlx5e_tls_rx_receive_progress_parameters_cb(void *arg) { struct mlx5e_tls_rx_tag *ptag; struct mlx5e_iq *iq; uint32_t tcp_curr_sn_he; uint32_t tcp_next_sn_he; uint64_t tls_rcd_num; void *buffer; ptag = (struct mlx5e_tls_rx_tag *)arg; buffer = mlx5e_tls_rx_get_progress_buffer(ptag); MLX5E_TLS_RX_TAG_LOCK(ptag); ptag->tcp_resync_pending = 0; switch (MLX5_GET(tls_progress_params, buffer, record_tracker_state)) { case MLX5E_TLS_RX_PROGRESS_PARAMS_RECORD_TRACKER_STATE_TRACKING: break; default: goto done; } switch (MLX5_GET(tls_progress_params, buffer, auth_state)) { case MLX5E_TLS_RX_PROGRESS_PARAMS_AUTH_STATE_NO_OFFLOAD: break; default: goto done; } tcp_curr_sn_he = MLX5_GET(tls_progress_params, buffer, hw_resync_tcp_sn); if (mlx5e_tls_rx_snd_tag_find_tcp_sn_and_tls_rcd(ptag, tcp_curr_sn_he, &tcp_next_sn_he, &tls_rcd_num)) { MLX5_SET64(sw_tls_rx_cntx, ptag->crypto_params, param.initial_record_number, tls_rcd_num); MLX5_SET(sw_tls_rx_cntx, ptag->crypto_params, param.resync_tcp_sn, tcp_curr_sn_he); iq = mlx5e_tls_rx_get_iq( container_of(ptag->tls_rx, struct mlx5e_priv, tls_rx), ptag->flowid, ptag->flowtype); if (mlx5e_tls_rx_send_static_parameters(iq, ptag) != 0) MLX5E_TLS_RX_STAT_INC(ptag, rx_error, 1); } done: MLX5E_TLS_RX_TAG_UNLOCK(ptag); m_snd_tag_rele(&ptag->tag); } /* * This function queries the hardware for the current state of the TIR * in question. It is typically called when encrypted data is received * to re-establish hardware decryption of received TLS data. * * Zero is returned upon success, else some error happened. */ static int mlx5e_tls_rx_receive_progress_parameters(struct mlx5e_iq *iq, struct mlx5e_tls_rx_tag *ptag) { struct mlx5e_get_tls_progress_params_wqe *wqe; const u32 ds_cnt = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS); u64 dma_address; int pi; mtx_lock(&iq->lock); pi = mlx5e_iq_get_producer_index(iq); if (pi < 0) { mtx_unlock(&iq->lock); return (-ENOMEM); } mlx5e_iq_load_memory_single(iq, pi, mlx5e_tls_rx_get_progress_buffer(ptag), MLX5E_TLS_RX_PROGRESS_BUFFER_SIZE, &dma_address, BUS_DMASYNC_PREREAD); wqe = mlx5_wq_cyc_get_wqe(&iq->wq, pi); memset(wqe, 0, sizeof(*wqe)); wqe->ctrl.opmod_idx_opcode = cpu_to_be32((iq->pc << 8) | MLX5_OPCODE_GET_PSV | (MLX5_OPCODE_MOD_PSV_TLS_TIR_PROGRESS_PARAMS << 24)); wqe->ctrl.qpn_ds = cpu_to_be32((iq->sqn << 8) | ds_cnt); wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; wqe->psv.num_psv = 1 << 4; wqe->psv.l_key = iq->mkey_be; wqe->psv.psv_index[0] = cpu_to_be32(ptag->tirn); wqe->psv.va = cpu_to_be64(dma_address); /* copy data for doorbell */ memcpy(iq->doorbell.d32, &wqe->ctrl, sizeof(iq->doorbell.d32)); iq->data[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); iq->data[pi].callback = &mlx5e_tls_rx_receive_progress_parameters_cb; iq->data[pi].arg = ptag; m_snd_tag_ref(&ptag->tag); iq->pc += iq->data[pi].num_wqebbs; mlx5e_iq_notify_hw(iq); mtx_unlock(&iq->lock); return (0); /* success */ } /* * This is the import function for TLS RX tags. */ static int mlx5e_tls_rx_tag_import(void *arg, void **store, int cnt, int domain, int flags) { struct mlx5e_tls_rx_tag *ptag; int i; for (i = 0; i != cnt; i++) { ptag = malloc_domainset(sizeof(*ptag), M_MLX5E_TLS_RX, mlx5_dev_domainset(arg), flags | M_ZERO); mtx_init(&ptag->mtx, "mlx5-tls-rx-tag-mtx", NULL, MTX_DEF); INIT_WORK(&ptag->work, mlx5e_tls_rx_work); store[i] = ptag; } return (i); } /* * This is the release function for TLS RX tags. */ static void mlx5e_tls_rx_tag_release(void *arg, void **store, int cnt) { struct mlx5e_tls_rx_tag *ptag; int i; for (i = 0; i != cnt; i++) { ptag = store[i]; flush_work(&ptag->work); mtx_destroy(&ptag->mtx); free(ptag, M_MLX5E_TLS_RX); } } /* * This is a convenience function to free TLS RX tags. It resets some * selected fields, updates the number of resources and returns the * TLS RX tag to the UMA pool of free tags. */ static void mlx5e_tls_rx_tag_zfree(struct mlx5e_tls_rx_tag *ptag) { /* make sure any unhandled taskqueue events are ignored */ ptag->state = MLX5E_TLS_RX_ST_FREED; /* reset some variables */ ptag->dek_index = 0; ptag->dek_index_ok = 0; ptag->tirn = 0; ptag->flow_rule = NULL; ptag->tcp_resync_active = 0; ptag->tcp_resync_pending = 0; /* avoid leaking keys */ memset(ptag->crypto_params, 0, sizeof(ptag->crypto_params)); /* update number of resources in use */ atomic_add_32(&ptag->tls_rx->num_resources, -1U); /* return tag to UMA */ uma_zfree(ptag->tls_rx->zone, ptag); } /* * This function enables TLS RX support for the given NIC, if all * needed firmware capabilites are present. */ int mlx5e_tls_rx_init(struct mlx5e_priv *priv) { struct mlx5e_tls_rx *ptls = &priv->tls_rx; struct sysctl_oid *node; uint32_t x; if (MLX5_CAP_GEN(priv->mdev, tls_rx) == 0 || MLX5_CAP_GEN(priv->mdev, log_max_dek) == 0 || MLX5_CAP_FLOWTABLE_NIC_RX(priv->mdev, ft_field_support.outer_ip_version) == 0) return (0); ptls->wq = create_singlethread_workqueue("mlx5-tls-rx-wq"); if (ptls->wq == NULL) return (ENOMEM); sysctl_ctx_init(&ptls->ctx); snprintf(ptls->zname, sizeof(ptls->zname), "mlx5_%u_tls_rx", device_get_unit(priv->mdev->pdev->dev.bsddev)); ptls->zone = uma_zcache_create(ptls->zname, sizeof(struct mlx5e_tls_rx_tag), NULL, NULL, NULL, NULL, mlx5e_tls_rx_tag_import, mlx5e_tls_rx_tag_release, priv->mdev, UMA_ZONE_UNMANAGED); /* shared between RX and TX TLS */ ptls->max_resources = 1U << (MLX5_CAP_GEN(priv->mdev, log_max_dek) - 1); for (x = 0; x != MLX5E_TLS_RX_STATS_NUM; x++) ptls->stats.arg[x] = counter_u64_alloc(M_WAITOK); ptls->init = 1; node = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "tls_rx", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Hardware TLS receive offload"); if (node == NULL) return (0); mlx5e_create_counter_stats(&ptls->ctx, SYSCTL_CHILDREN(node), "stats", mlx5e_tls_rx_stats_desc, MLX5E_TLS_RX_STATS_NUM, ptls->stats.arg); return (0); } /* * This function disables TLS RX support for the given NIC. */ void mlx5e_tls_rx_cleanup(struct mlx5e_priv *priv) { struct mlx5e_tls_rx *ptls = &priv->tls_rx; uint32_t x; if (ptls->init == 0) return; ptls->init = 0; flush_workqueue(ptls->wq); sysctl_ctx_free(&ptls->ctx); uma_zdestroy(ptls->zone); destroy_workqueue(ptls->wq); /* check if all resources are freed */ MPASS(priv->tls_rx.num_resources == 0); for (x = 0; x != MLX5E_TLS_RX_STATS_NUM; x++) counter_u64_free(ptls->stats.arg[x]); } /* * This function is used to serialize sleeping firmware operations * needed in order to establish and destroy a TLS RX tag. */ static void mlx5e_tls_rx_work(struct work_struct *work) { struct mlx5e_tls_rx_tag *ptag; struct mlx5e_priv *priv; int err; ptag = container_of(work, struct mlx5e_tls_rx_tag, work); priv = container_of(ptag->tls_rx, struct mlx5e_priv, tls_rx); switch (ptag->state) { case MLX5E_TLS_RX_ST_INIT: /* try to allocate new TIR context */ err = mlx5_tls_open_tir(priv->mdev, priv->tdn, priv->channel[mlx5e_tls_rx_get_ch(priv, ptag->flowid, ptag->flowtype)].rqtn, &ptag->tirn); if (err) { MLX5E_TLS_RX_STAT_INC(ptag, rx_error, 1); break; } MLX5_SET(sw_tls_rx_cntx, ptag->crypto_params, progress.pd, ptag->tirn); /* try to allocate a DEK context ID */ err = mlx5_encryption_key_create(priv->mdev, priv->pdn, MLX5_ADDR_OF(sw_tls_rx_cntx, ptag->crypto_params, key.key_data), MLX5_GET(sw_tls_rx_cntx, ptag->crypto_params, key.key_len), &ptag->dek_index); if (err) { MLX5E_TLS_RX_STAT_INC(ptag, rx_error, 1); break; } MLX5_SET(sw_tls_rx_cntx, ptag->crypto_params, param.dek_index, ptag->dek_index); ptag->dek_index_ok = 1; MLX5E_TLS_RX_TAG_LOCK(ptag); if (ptag->state == MLX5E_TLS_RX_ST_INIT) ptag->state = MLX5E_TLS_RX_ST_SETUP; MLX5E_TLS_RX_TAG_UNLOCK(ptag); break; case MLX5E_TLS_RX_ST_RELEASE: /* remove flow rule for incoming traffic, if any */ if (ptag->flow_rule != NULL) mlx5e_accel_fs_del_inpcb(ptag->flow_rule); /* try to destroy DEK context by ID */ if (ptag->dek_index_ok) mlx5_encryption_key_destroy(priv->mdev, ptag->dek_index); /* try to destroy TIR context by ID */ if (ptag->tirn != 0) mlx5_tls_close_tir(priv->mdev, ptag->tirn); /* free tag */ mlx5e_tls_rx_tag_zfree(ptag); break; default: break; } } /* * This function translates the crypto parameters into the format used * by the firmware and hardware. Currently only AES-128 and AES-256 is * supported for TLS v1.2 and TLS v1.3. * * Returns zero on success, else an error happened. */ static int mlx5e_tls_rx_set_params(void *ctx, struct inpcb *inp, const struct tls_session_params *en) { uint32_t tcp_sn_he; uint64_t tls_sn_he; MLX5_SET(sw_tls_rx_cntx, ctx, param.const_2, 2); if (en->tls_vminor == TLS_MINOR_VER_TWO) MLX5_SET(sw_tls_rx_cntx, ctx, param.tls_version, 2); /* v1.2 */ else MLX5_SET(sw_tls_rx_cntx, ctx, param.tls_version, 3); /* v1.3 */ MLX5_SET(sw_tls_rx_cntx, ctx, param.const_1, 1); MLX5_SET(sw_tls_rx_cntx, ctx, param.encryption_standard, 1); /* TLS */ /* copy the initial vector in place */ switch (en->iv_len) { case MLX5_FLD_SZ_BYTES(sw_tls_rx_cntx, param.gcm_iv): case MLX5_FLD_SZ_BYTES(sw_tls_rx_cntx, param.gcm_iv) + MLX5_FLD_SZ_BYTES(sw_tls_rx_cntx, param.implicit_iv): memcpy(MLX5_ADDR_OF(sw_tls_rx_cntx, ctx, param.gcm_iv), en->iv, en->iv_len); break; default: return (EINVAL); } if (en->cipher_key_len <= MLX5_FLD_SZ_BYTES(sw_tls_rx_cntx, key.key_data)) { memcpy(MLX5_ADDR_OF(sw_tls_rx_cntx, ctx, key.key_data), en->cipher_key, en->cipher_key_len); MLX5_SET(sw_tls_rx_cntx, ctx, key.key_len, en->cipher_key_len); } else { return (EINVAL); } if (__predict_false(inp == NULL || ktls_get_rx_sequence(inp, &tcp_sn_he, &tls_sn_he) != 0)) return (EINVAL); MLX5_SET64(sw_tls_rx_cntx, ctx, param.initial_record_number, tls_sn_he); MLX5_SET(sw_tls_rx_cntx, ctx, param.resync_tcp_sn, tcp_sn_he); return (0); } /* Verify zero default */ CTASSERT(MLX5E_TLS_RX_ST_INIT == 0); /* * This function is responsible for allocating a TLS RX tag. It is a * callback function invoked by the network stack. * * Returns zero on success else an error happened. */ int -mlx5e_tls_rx_snd_tag_alloc(struct ifnet *ifp, +mlx5e_tls_rx_snd_tag_alloc(if_t ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { struct mlx5e_iq *iq; struct mlx5e_priv *priv; struct mlx5e_tls_rx_tag *ptag; struct mlx5_flow_rule *flow_rule; const struct tls_session_params *en; uint32_t value; int error; - priv = ifp->if_softc; + priv = if_getsoftc(ifp); if (unlikely(priv->gone != 0 || priv->tls_rx.init == 0 || params->hdr.flowtype == M_HASHTYPE_NONE)) return (EOPNOTSUPP); /* allocate new tag from zone, if any */ ptag = uma_zalloc(priv->tls_rx.zone, M_NOWAIT); if (ptag == NULL) return (ENOMEM); /* sanity check default values */ MPASS(ptag->dek_index == 0); MPASS(ptag->dek_index_ok == 0); /* setup TLS RX tag */ ptag->tls_rx = &priv->tls_rx; ptag->flowtype = params->hdr.flowtype; ptag->flowid = params->hdr.flowid; value = atomic_fetchadd_32(&priv->tls_rx.num_resources, 1U); /* check resource limits */ if (value >= priv->tls_rx.max_resources) { error = ENOMEM; goto failure; } en = ¶ms->tls_rx.tls->params; /* only TLS v1.2 and v1.3 is currently supported */ if (en->tls_vmajor != TLS_MAJOR_VER_ONE || (en->tls_vminor != TLS_MINOR_VER_TWO #ifdef TLS_MINOR_VER_THREE && en->tls_vminor != TLS_MINOR_VER_THREE #endif )) { error = EPROTONOSUPPORT; goto failure; } switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: switch (en->cipher_key_len) { case 128 / 8: if (en->tls_vminor == TLS_MINOR_VER_TWO) { if (MLX5_CAP_TLS(priv->mdev, tls_1_2_aes_gcm_128) == 0) { error = EPROTONOSUPPORT; goto failure; } } else { if (MLX5_CAP_TLS(priv->mdev, tls_1_3_aes_gcm_128) == 0) { error = EPROTONOSUPPORT; goto failure; } } error = mlx5e_tls_rx_set_params( ptag->crypto_params, params->tls_rx.inp, en); if (error) goto failure; break; case 256 / 8: if (en->tls_vminor == TLS_MINOR_VER_TWO) { if (MLX5_CAP_TLS(priv->mdev, tls_1_2_aes_gcm_256) == 0) { error = EPROTONOSUPPORT; goto failure; } } else { if (MLX5_CAP_TLS(priv->mdev, tls_1_3_aes_gcm_256) == 0) { error = EPROTONOSUPPORT; goto failure; } } error = mlx5e_tls_rx_set_params( ptag->crypto_params, params->tls_rx.inp, en); if (error) goto failure; break; default: error = EINVAL; goto failure; } break; default: error = EPROTONOSUPPORT; goto failure; } /* store pointer to mbuf tag */ MPASS(ptag->tag.refcount == 0); m_snd_tag_init(&ptag->tag, ifp, &mlx5e_tls_rx_snd_tag_sw); *ppmt = &ptag->tag; /* reset state */ ptag->state = MLX5E_TLS_RX_ST_INIT; queue_work(priv->tls_rx.wq, &ptag->work); flush_work(&ptag->work); /* check that worker task completed successfully */ MLX5E_TLS_RX_TAG_LOCK(ptag); if (ptag->state == MLX5E_TLS_RX_ST_SETUP) { ptag->state = MLX5E_TLS_RX_ST_READY; error = 0; } else { error = ENOMEM; } MLX5E_TLS_RX_TAG_UNLOCK(ptag); if (unlikely(error)) goto cleanup; iq = mlx5e_tls_rx_get_iq(priv, ptag->flowid, ptag->flowtype); /* establish connection between DEK and TIR */ if (mlx5e_tls_rx_send_static_parameters(iq, ptag) != 0) { MLX5E_TLS_RX_STAT_INC(ptag, rx_error, 1); error = ENOMEM; goto cleanup; } MLX5_SET(sw_tls_rx_cntx, ptag->crypto_params, progress.auth_state, MLX5E_TLS_RX_PROGRESS_PARAMS_AUTH_STATE_NO_OFFLOAD); MLX5_SET(sw_tls_rx_cntx, ptag->crypto_params, progress.record_tracker_state, MLX5E_TLS_RX_PROGRESS_PARAMS_RECORD_TRACKER_STATE_START); /* reset state to all zeros */ if (mlx5e_tls_rx_send_progress_parameters_sync(iq, ptag) != 0) { MLX5E_TLS_RX_STAT_INC(ptag, rx_error, 1); error = ENOMEM; goto cleanup; } - if (ifp->if_pcp != IFNET_PCP_NONE || params->tls_rx.vlan_id != 0) { + if (if_getpcp(ifp) != IFNET_PCP_NONE || params->tls_rx.vlan_id != 0) { /* create flow rule for TLS RX traffic (tagged) */ flow_rule = mlx5e_accel_fs_add_inpcb(priv, params->tls_rx.inp, ptag->tirn, MLX5_FS_DEFAULT_FLOW_TAG, params->tls_rx.vlan_id); } else { /* create flow rule for TLS RX traffic (untagged) */ flow_rule = mlx5e_accel_fs_add_inpcb(priv, params->tls_rx.inp, ptag->tirn, MLX5_FS_DEFAULT_FLOW_TAG, MLX5E_ACCEL_FS_ADD_INPCB_NO_VLAN); } if (IS_ERR_OR_NULL(flow_rule)) { MLX5E_TLS_RX_STAT_INC(ptag, rx_error, 1); error = ENOMEM; goto cleanup; } ptag->flow_rule = flow_rule; return (0); cleanup: m_snd_tag_rele(&ptag->tag); return (error); failure: mlx5e_tls_rx_tag_zfree(ptag); return (error); } /* * This function adds the TCP sequence number and TLS record number in * host endian format to a small database. When TLS records have the * same length, they are simply accumulated by counting instead of * separated entries in the TLS database. The dimension of the * database is such that it cannot store more than 1GByte of * continuous TCP data to avoid issues with TCP sequence number wrap * around. A record length of zero bytes has special meaning and means * that resync completed and all data in the database can be * discarded. This function is called after the TCP stack has * re-assembled all TCP fragments due to out of order packet reception * and all TCP sequence numbers should be sequential. * * This function returns true if a so-called TLS RX resync operation * is in progress. Else no such operation is in progress. */ static bool mlx5e_tls_rx_snd_tag_add_tcp_sequence(struct mlx5e_tls_rx_tag *ptag, uint32_t tcp_sn_he, uint32_t len, uint64_t tls_rcd) { uint16_t i, j, n; if (ptag->tcp_resync_active == 0 || ptag->tcp_resync_next != tcp_sn_he || len == 0) { /* start over again or terminate */ ptag->tcp_resync_active = (len != 0); ptag->tcp_resync_len[0] = len; ptag->tcp_resync_num[0] = 1; ptag->tcp_resync_pc = (len != 0); ptag->tcp_resync_cc = 0; ptag->tcp_resync_start = tcp_sn_he; ptag->rcd_resync_start = tls_rcd; } else { i = (ptag->tcp_resync_pc - 1) & (MLX5E_TLS_RX_RESYNC_MAX - 1); n = ptag->tcp_resync_pc - ptag->tcp_resync_cc; /* check if same length like last time */ if (ptag->tcp_resync_len[i] == len && ptag->tcp_resync_num[i] != MLX5E_TLS_RX_NUM_MAX) { /* use existing entry */ ptag->tcp_resync_num[i]++; } else if (n == MLX5E_TLS_RX_RESYNC_MAX) { j = ptag->tcp_resync_cc++ & (MLX5E_TLS_RX_RESYNC_MAX - 1); /* adjust starting TCP sequence number */ ptag->rcd_resync_start += ptag->tcp_resync_num[j]; ptag->tcp_resync_start += ptag->tcp_resync_len[j] * ptag->tcp_resync_num[j]; i = ptag->tcp_resync_pc++ & (MLX5E_TLS_RX_RESYNC_MAX - 1); /* store new entry */ ptag->tcp_resync_len[i] = len; ptag->tcp_resync_num[i] = 1; } else { i = ptag->tcp_resync_pc++ & (MLX5E_TLS_RX_RESYNC_MAX - 1); /* add new entry */ ptag->tcp_resync_len[i] = len; ptag->tcp_resync_num[i] = 1; } } /* store next TCP SN in host endian format */ ptag->tcp_resync_next = tcp_sn_he + len; return (ptag->tcp_resync_active); } /* * This function checks if the given TCP sequence number points to the * beginning of a valid TLS header. * * Returns true if a match is found. Else false. */ static bool mlx5e_tls_rx_snd_tag_find_tcp_sn_and_tls_rcd(struct mlx5e_tls_rx_tag *ptag, uint32_t tcp_sn_he, uint32_t *p_next_tcp_sn_he, uint64_t *p_tls_rcd) { uint16_t i, j; uint32_t off = 0; uint32_t rcd = 0; uint32_t delta; uint32_t leap; for (i = ptag->tcp_resync_cc; i != ptag->tcp_resync_pc; i++) { delta = tcp_sn_he - off - ptag->tcp_resync_start; /* check if subtraction went negative */ if ((int32_t)delta < 0) break; j = i & (MLX5E_TLS_RX_RESYNC_MAX - 1); leap = ptag->tcp_resync_len[j] * ptag->tcp_resync_num[j]; if (delta < leap) { if ((delta % ptag->tcp_resync_len[j]) == 0) { *p_next_tcp_sn_he = tcp_sn_he + ptag->tcp_resync_len[j]; *p_tls_rcd = ptag->rcd_resync_start + (uint64_t)rcd + (uint64_t)(delta / ptag->tcp_resync_len[j]); return (true); /* success */ } break; /* invalid offset */ } rcd += ptag->tcp_resync_num[j]; off += leap; } return (false); /* not found */ } /* * This is a callback function from the network stack to keep track of * TLS RX TCP sequence numbers. * * Returns zero on success else an error happened. */ static int mlx5e_tls_rx_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params) { struct mlx5e_tls_rx_tag *ptag; struct mlx5e_priv *priv; struct mlx5e_iq *iq; int err; ptag = container_of(pmt, struct mlx5e_tls_rx_tag, tag); priv = container_of(ptag->tls_rx, struct mlx5e_priv, tls_rx); if (unlikely(priv->gone != 0)) return (ENXIO); iq = mlx5e_tls_rx_get_iq(priv, ptag->flowid, ptag->flowtype); MLX5E_TLS_RX_TAG_LOCK(ptag); if (mlx5e_tls_rx_snd_tag_add_tcp_sequence(ptag, params->tls_rx.tls_hdr_tcp_sn, params->tls_rx.tls_rec_length, params->tls_rx.tls_seq_number) && ptag->tcp_resync_pending == 0) { err = mlx5e_tls_rx_receive_progress_parameters(iq, ptag); if (err != 0) { MLX5E_TLS_RX_STAT_INC(ptag, rx_resync_err, 1); } else { ptag->tcp_resync_pending = 1; MLX5E_TLS_RX_STAT_INC(ptag, rx_resync_ok, 1); } } else { err = 0; } MLX5E_TLS_RX_TAG_UNLOCK(ptag); return (-err); } /* * This function frees a TLS RX tag in a non-blocking way. */ static void mlx5e_tls_rx_snd_tag_free(struct m_snd_tag *pmt) { struct mlx5e_tls_rx_tag *ptag = container_of(pmt, struct mlx5e_tls_rx_tag, tag); struct mlx5e_priv *priv; MLX5E_TLS_RX_TAG_LOCK(ptag); ptag->state = MLX5E_TLS_RX_ST_RELEASE; MLX5E_TLS_RX_TAG_UNLOCK(ptag); - priv = ptag->tag.ifp->if_softc; + priv = if_getsoftc(ptag->tag.ifp); queue_work(priv->tls_rx.wq, &ptag->work); } #else int mlx5e_tls_rx_init(struct mlx5e_priv *priv) { return (0); } void mlx5e_tls_rx_cleanup(struct mlx5e_priv *priv) { /* NOP */ } #endif diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c index 04e3c302005b..84adef8398bb 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c @@ -1,5061 +1,5061 @@ /*- * Copyright (c) 2015-2021 Mellanox Technologies. All rights reserved. * Copyright (c) 2022 NVIDIA corporation & affiliates. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_kern_tls.h" #include "opt_rss.h" #include "opt_ratelimit.h" #include #include #include #include #include static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs); static if_snd_tag_query_t mlx5e_ul_snd_tag_query; static if_snd_tag_free_t mlx5e_ul_snd_tag_free; struct mlx5e_channel_param { struct mlx5e_rq_param rq; struct mlx5e_sq_param sq; struct mlx5e_cq_param rx_cq; struct mlx5e_cq_param tx_cq; }; struct media { u32 subtype; u64 baudrate; }; static const struct media mlx5e_mode_table[MLX5E_LINK_SPEEDS_NUMBER] = { [MLX5E_1000BASE_CX_SGMII] = { .subtype = IFM_1000_CX_SGMII, .baudrate = IF_Mbps(1000ULL), }, [MLX5E_1000BASE_KX] = { .subtype = IFM_1000_KX, .baudrate = IF_Mbps(1000ULL), }, [MLX5E_10GBASE_CX4] = { .subtype = IFM_10G_CX4, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_KX4] = { .subtype = IFM_10G_KX4, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_KR] = { .subtype = IFM_10G_KR, .baudrate = IF_Gbps(10ULL), }, [MLX5E_20GBASE_KR2] = { .subtype = IFM_20G_KR2, .baudrate = IF_Gbps(20ULL), }, [MLX5E_40GBASE_CR4] = { .subtype = IFM_40G_CR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_KR4] = { .subtype = IFM_40G_KR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_56GBASE_R4] = { .subtype = IFM_56G_R4, .baudrate = IF_Gbps(56ULL), }, [MLX5E_10GBASE_CR] = { .subtype = IFM_10G_CR1, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_SR] = { .subtype = IFM_10G_SR, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_ER_LR] = { .subtype = IFM_10G_ER, .baudrate = IF_Gbps(10ULL), }, [MLX5E_40GBASE_SR4] = { .subtype = IFM_40G_SR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_LR4_ER4] = { .subtype = IFM_40G_LR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_100GBASE_CR4] = { .subtype = IFM_100G_CR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GBASE_SR4] = { .subtype = IFM_100G_SR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GBASE_KR4] = { .subtype = IFM_100G_KR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GBASE_LR4] = { .subtype = IFM_100G_LR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100BASE_TX] = { .subtype = IFM_100_TX, .baudrate = IF_Mbps(100ULL), }, [MLX5E_1000BASE_T] = { .subtype = IFM_1000_T, .baudrate = IF_Mbps(1000ULL), }, [MLX5E_10GBASE_T] = { .subtype = IFM_10G_T, .baudrate = IF_Gbps(10ULL), }, [MLX5E_25GBASE_CR] = { .subtype = IFM_25G_CR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GBASE_KR] = { .subtype = IFM_25G_KR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GBASE_SR] = { .subtype = IFM_25G_SR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_50GBASE_CR2] = { .subtype = IFM_50G_CR2, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GBASE_KR2] = { .subtype = IFM_50G_KR2, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GBASE_KR4] = { .subtype = IFM_50G_KR4, .baudrate = IF_Gbps(50ULL), }, }; static const struct media mlx5e_ext_mode_table[MLX5E_EXT_LINK_SPEEDS_NUMBER][MLX5E_CABLE_TYPE_NUMBER] = { /**/ [MLX5E_SGMII_100M][MLX5E_CABLE_TYPE_UNKNOWN] = { .subtype = IFM_100_SGMII, .baudrate = IF_Mbps(100), }, /**/ [MLX5E_1000BASE_X_SGMII][MLX5E_CABLE_TYPE_UNKNOWN] = { .subtype = IFM_1000_CX, .baudrate = IF_Mbps(1000), }, [MLX5E_1000BASE_X_SGMII][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = { .subtype = IFM_1000_SX, .baudrate = IF_Mbps(1000), }, /**/ [MLX5E_5GBASE_R][MLX5E_CABLE_TYPE_UNKNOWN] = { .subtype = IFM_5000_KR, .baudrate = IF_Mbps(5000), }, [MLX5E_5GBASE_R][MLX5E_CABLE_TYPE_TWISTED_PAIR] = { .subtype = IFM_5000_T, .baudrate = IF_Mbps(5000), }, /**/ [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_CABLE_TYPE_UNKNOWN] = { .subtype = IFM_10G_KR, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = { .subtype = IFM_10G_CR1, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = { .subtype = IFM_10G_SR, .baudrate = IF_Gbps(10ULL), }, /**/ [MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_CABLE_TYPE_UNKNOWN] = { .subtype = IFM_40G_KR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = { .subtype = IFM_40G_CR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = { .subtype = IFM_40G_SR4, .baudrate = IF_Gbps(40ULL), }, /**/ [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CABLE_TYPE_UNKNOWN] = { .subtype = IFM_25G_KR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = { .subtype = IFM_25G_CR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = { .subtype = IFM_25G_SR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CABLE_TYPE_TWISTED_PAIR] = { .subtype = IFM_25G_T, .baudrate = IF_Gbps(25ULL), }, /**/ [MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_CABLE_TYPE_UNKNOWN] = { .subtype = IFM_50G_KR2, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = { .subtype = IFM_50G_CR2, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = { .subtype = IFM_50G_SR2, .baudrate = IF_Gbps(50ULL), }, /**/ [MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_CABLE_TYPE_UNKNOWN] = { .subtype = IFM_50G_KR_PAM4, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = { .subtype = IFM_50G_CP, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = { .subtype = IFM_50G_SR, .baudrate = IF_Gbps(50ULL), }, /**/ [MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_CABLE_TYPE_UNKNOWN] = { .subtype = IFM_100G_KR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = { .subtype = IFM_100G_CR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = { .subtype = IFM_100G_SR4, .baudrate = IF_Gbps(100ULL), }, /**/ [MLX5E_100GAUI_1_100GBASE_CR_KR][MLX5E_CABLE_TYPE_UNKNOWN] = { .subtype = IFM_100G_KR_PAM4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GAUI_1_100GBASE_CR_KR][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = { .subtype = IFM_100G_CR_PAM4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GAUI_1_100GBASE_CR_KR][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = { .subtype = IFM_100G_SR2, /* XXX */ .baudrate = IF_Gbps(100ULL), }, /**/ [MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_CABLE_TYPE_UNKNOWN] = { .subtype = IFM_100G_KR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = { .subtype = IFM_100G_CP2, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = { .subtype = IFM_100G_SR2, .baudrate = IF_Gbps(100ULL), }, /**/ [MLX5E_200GAUI_2_200GBASE_CR2_KR2][MLX5E_CABLE_TYPE_UNKNOWN] = { .subtype = IFM_200G_KR4_PAM4, /* XXX */ .baudrate = IF_Gbps(200ULL), }, [MLX5E_200GAUI_2_200GBASE_CR2_KR2][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = { .subtype = IFM_200G_CR4_PAM4, /* XXX */ .baudrate = IF_Gbps(200ULL), }, [MLX5E_200GAUI_2_200GBASE_CR2_KR2][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = { .subtype = IFM_200G_SR4, /* XXX */ .baudrate = IF_Gbps(200ULL), }, /**/ [MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_CABLE_TYPE_UNKNOWN] = { .subtype = IFM_200G_KR4_PAM4, .baudrate = IF_Gbps(200ULL), }, [MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_CABLE_TYPE_PASSIVE_COPPER] = { .subtype = IFM_200G_CR4_PAM4, .baudrate = IF_Gbps(200ULL), }, [MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_CABLE_TYPE_OPTICAL_MODULE] = { .subtype = IFM_200G_SR4, .baudrate = IF_Gbps(200ULL), }, /**/ [MLX5E_400GAUI_8][MLX5E_CABLE_TYPE_UNKNOWN] = { .subtype = IFM_400G_LR8, /* XXX */ .baudrate = IF_Gbps(400ULL), }, /**/ [MLX5E_400GAUI_4_400GBASE_CR4_KR4][MLX5E_CABLE_TYPE_UNKNOWN] = { .subtype = IFM_400G_LR8, /* XXX */ .baudrate = IF_Gbps(400ULL), }, }; static const struct if_snd_tag_sw mlx5e_ul_snd_tag_sw = { .snd_tag_query = mlx5e_ul_snd_tag_query, .snd_tag_free = mlx5e_ul_snd_tag_free, .type = IF_SND_TAG_TYPE_UNLIMITED }; DEBUGNET_DEFINE(mlx5_en); MALLOC_DEFINE(M_MLX5EN, "MLX5EN", "MLX5 Ethernet"); static void mlx5e_update_carrier(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; u32 out[MLX5_ST_SZ_DW(ptys_reg)]; u32 eth_proto_oper; int error; u8 i; u8 cable_type; u8 port_state; u8 is_er_type; bool ext; struct media media_entry = {}; port_state = mlx5_query_vport_state(mdev, MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT, 0); if (port_state == VPORT_STATE_UP) { priv->media_status_last |= IFM_ACTIVE; } else { priv->media_status_last &= ~IFM_ACTIVE; priv->media_active_last = IFM_ETHER; if_link_state_change(priv->ifp, LINK_STATE_DOWN); return; } error = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1); if (error) { priv->media_active_last = IFM_ETHER; - priv->ifp->if_baudrate = 1; + if_setbaudrate(priv->ifp, 1); mlx5_en_err(priv->ifp, "query port ptys failed: 0x%x\n", error); return; } ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); eth_proto_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper); i = ilog2(eth_proto_oper); if (ext) { error = mlx5_query_pddr_cable_type(mdev, 1, &cable_type); if (error != 0) { /* use fallback entry */ media_entry = mlx5e_ext_mode_table[i][MLX5E_CABLE_TYPE_UNKNOWN]; mlx5_en_err(priv->ifp, "query port pddr failed: %d\n", error); } else { media_entry = mlx5e_ext_mode_table[i][cable_type]; /* check if we should use fallback entry */ if (media_entry.subtype == 0) media_entry = mlx5e_ext_mode_table[i][MLX5E_CABLE_TYPE_UNKNOWN]; } } else { media_entry = mlx5e_mode_table[i]; } if (media_entry.subtype == 0) { mlx5_en_err(priv->ifp, "Could not find operational media subtype\n"); return; } switch (media_entry.subtype) { case IFM_10G_ER: error = mlx5_query_pddr_range_info(mdev, 1, &is_er_type); if (error != 0) { mlx5_en_err(priv->ifp, "query port pddr failed: %d\n", error); } if (error != 0 || is_er_type == 0) media_entry.subtype = IFM_10G_LR; break; case IFM_40G_LR4: error = mlx5_query_pddr_range_info(mdev, 1, &is_er_type); if (error != 0) { mlx5_en_err(priv->ifp, "query port pddr failed: %d\n", error); } if (error == 0 && is_er_type != 0) media_entry.subtype = IFM_40G_ER4; break; } priv->media_active_last = media_entry.subtype | IFM_ETHER | IFM_FDX; - priv->ifp->if_baudrate = media_entry.baudrate; + if_setbaudrate(priv->ifp, media_entry.baudrate); if_link_state_change(priv->ifp, LINK_STATE_UP); } static void -mlx5e_media_status(struct ifnet *dev, struct ifmediareq *ifmr) +mlx5e_media_status(if_t dev, struct ifmediareq *ifmr) { - struct mlx5e_priv *priv = dev->if_softc; + struct mlx5e_priv *priv = if_getsoftc(dev); ifmr->ifm_status = priv->media_status_last; ifmr->ifm_current = ifmr->ifm_active = priv->media_active_last | (priv->params.rx_pauseframe_control ? IFM_ETH_RXPAUSE : 0) | (priv->params.tx_pauseframe_control ? IFM_ETH_TXPAUSE : 0); } static u32 mlx5e_find_link_mode(u32 subtype, bool ext) { u32 link_mode = 0; switch (subtype) { case 0: goto done; case IFM_10G_LR: subtype = IFM_10G_ER; break; case IFM_40G_ER4: subtype = IFM_40G_LR4; break; default: break; } if (ext) { for (unsigned i = 0; i != MLX5E_EXT_LINK_SPEEDS_NUMBER; i++) { for (unsigned j = 0; j != MLX5E_CABLE_TYPE_NUMBER; j++) { if (mlx5e_ext_mode_table[i][j].subtype == subtype) link_mode |= MLX5E_PROT_MASK(i); } } } else { for (unsigned i = 0; i != MLX5E_LINK_SPEEDS_NUMBER; i++) { if (mlx5e_mode_table[i].subtype == subtype) link_mode |= MLX5E_PROT_MASK(i); } } done: return (link_mode); } static int mlx5e_set_port_pause_and_pfc(struct mlx5e_priv *priv) { return (mlx5_set_port_pause_and_pfc(priv->mdev, 1, priv->params.rx_pauseframe_control, priv->params.tx_pauseframe_control, priv->params.rx_priority_flow_control, priv->params.tx_priority_flow_control)); } static int mlx5e_set_port_pfc(struct mlx5e_priv *priv) { int error; if (priv->gone != 0) { error = -ENXIO; } else if (priv->params.rx_pauseframe_control || priv->params.tx_pauseframe_control) { mlx5_en_err(priv->ifp, "Global pauseframes must be disabled before enabling PFC.\n"); error = -EINVAL; } else { error = mlx5e_set_port_pause_and_pfc(priv); } return (error); } static int -mlx5e_media_change(struct ifnet *dev) +mlx5e_media_change(if_t dev) { - struct mlx5e_priv *priv = dev->if_softc; + struct mlx5e_priv *priv = if_getsoftc(dev); struct mlx5_core_dev *mdev = priv->mdev; u32 eth_proto_cap; u32 link_mode; u32 out[MLX5_ST_SZ_DW(ptys_reg)]; int was_opened; int locked; int error; bool ext; locked = PRIV_LOCKED(priv); if (!locked) PRIV_LOCK(priv); if (IFM_TYPE(priv->media.ifm_media) != IFM_ETHER) { error = EINVAL; goto done; } error = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1); if (error != 0) { mlx5_en_err(dev, "Query port media capability failed\n"); goto done; } ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); link_mode = mlx5e_find_link_mode(IFM_SUBTYPE(priv->media.ifm_media), ext); /* query supported capabilities */ eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_capability); /* check for autoselect */ if (IFM_SUBTYPE(priv->media.ifm_media) == IFM_AUTO) { link_mode = eth_proto_cap; if (link_mode == 0) { mlx5_en_err(dev, "Port media capability is zero\n"); error = EINVAL; goto done; } } else { link_mode = link_mode & eth_proto_cap; if (link_mode == 0) { mlx5_en_err(dev, "Not supported link mode requested\n"); error = EINVAL; goto done; } } if (priv->media.ifm_media & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE)) { /* check if PFC is enabled */ if (priv->params.rx_priority_flow_control || priv->params.tx_priority_flow_control) { mlx5_en_err(dev, "PFC must be disabled before enabling global pauseframes.\n"); error = EINVAL; goto done; } } /* update pauseframe control bits */ priv->params.rx_pauseframe_control = (priv->media.ifm_media & IFM_ETH_RXPAUSE) ? 1 : 0; priv->params.tx_pauseframe_control = (priv->media.ifm_media & IFM_ETH_TXPAUSE) ? 1 : 0; /* check if device is opened */ was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); /* reconfigure the hardware */ mlx5_set_port_status(mdev, MLX5_PORT_DOWN); mlx5_set_port_proto(mdev, link_mode, MLX5_PTYS_EN, ext); error = -mlx5e_set_port_pause_and_pfc(priv); if (was_opened) mlx5_set_port_status(mdev, MLX5_PORT_UP); done: if (!locked) PRIV_UNLOCK(priv); return (error); } static void mlx5e_update_carrier_work(struct work_struct *work) { struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, update_carrier_work); PRIV_LOCK(priv); if (test_bit(MLX5E_STATE_OPENED, &priv->state)) mlx5e_update_carrier(priv); PRIV_UNLOCK(priv); } #define MLX5E_PCIE_PERF_GET_64(a,b,c,d,e,f) \ s_debug->c = MLX5_GET64(mpcnt_reg, out, counter_set.f.c); #define MLX5E_PCIE_PERF_GET_32(a,b,c,d,e,f) \ s_debug->c = MLX5_GET(mpcnt_reg, out, counter_set.f.c); static void mlx5e_update_pcie_counters(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_port_stats_debug *s_debug = &priv->stats.port_stats_debug; const unsigned sz = MLX5_ST_SZ_BYTES(mpcnt_reg); void *out; void *in; int err; /* allocate firmware request structures */ in = mlx5_vzalloc(sz); out = mlx5_vzalloc(sz); if (in == NULL || out == NULL) goto free_out; MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP); err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0); if (err != 0) goto free_out; MLX5E_PCIE_PERFORMANCE_COUNTERS_64(MLX5E_PCIE_PERF_GET_64) MLX5E_PCIE_PERFORMANCE_COUNTERS_32(MLX5E_PCIE_PERF_GET_32) MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_TIMERS_AND_STATES_COUNTERS_GROUP); err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0); if (err != 0) goto free_out; MLX5E_PCIE_TIMERS_AND_STATES_COUNTERS_32(MLX5E_PCIE_PERF_GET_32) MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_LANE_COUNTERS_GROUP); err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0); if (err != 0) goto free_out; MLX5E_PCIE_LANE_COUNTERS_32(MLX5E_PCIE_PERF_GET_32) free_out: /* free firmware request structures */ kvfree(in); kvfree(out); } /* * This function reads the physical port counters from the firmware * using a pre-defined layout defined by various MLX5E_PPORT_XXX() * macros. The output is converted from big-endian 64-bit values into * host endian ones and stored in the "priv->stats.pport" structure. */ static void mlx5e_update_pport_counters(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_pport_stats *s = &priv->stats.pport; struct mlx5e_port_stats_debug *s_debug = &priv->stats.port_stats_debug; u32 *in; u32 *out; const u64 *ptr; unsigned sz = MLX5_ST_SZ_BYTES(ppcnt_reg); unsigned x; unsigned y; unsigned z; /* allocate firmware request structures */ in = mlx5_vzalloc(sz); out = mlx5_vzalloc(sz); if (in == NULL || out == NULL) goto free_out; /* * Get pointer to the 64-bit counter set which is located at a * fixed offset in the output firmware request structure: */ ptr = (const uint64_t *)MLX5_ADDR_OF(ppcnt_reg, out, counter_set); MLX5_SET(ppcnt_reg, in, local_port, 1); /* read IEEE802_3 counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0, y = MLX5E_PPORT_PER_PRIO_STATS_NUM; x != MLX5E_PPORT_IEEE802_3_STATS_NUM; x++, y++) s->arg[y] = be64toh(ptr[x]); /* read RFC2819 counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM; x++, y++) s->arg[y] = be64toh(ptr[x]); for (y = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM + MLX5E_PPORT_RFC2819_STATS_DEBUG_NUM; x++, y++) s_debug->arg[y] = be64toh(ptr[x]); /* read RFC2863 counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0; x != MLX5E_PPORT_RFC2863_STATS_DEBUG_NUM; x++, y++) s_debug->arg[y] = be64toh(ptr[x]); /* read physical layer stats counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0; x != MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG_NUM; x++, y++) s_debug->arg[y] = be64toh(ptr[x]); /* read Extended Ethernet counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0; x != MLX5E_PPORT_ETHERNET_EXTENDED_STATS_DEBUG_NUM; x++, y++) s_debug->arg[y] = be64toh(ptr[x]); /* read Extended Statistical Group */ if (MLX5_CAP_GEN(mdev, pcam_reg) && MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group) && MLX5_CAP_PCAM_FEATURE(mdev, per_lane_error_counters)) { /* read Extended Statistical counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0; x != MLX5E_PPORT_STATISTICAL_DEBUG_NUM; x++, y++) s_debug->arg[y] = be64toh(ptr[x]); } /* read PCIE counters */ mlx5e_update_pcie_counters(priv); /* read per-priority counters */ MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_PRIORITY_COUNTERS_GROUP); /* iterate all the priorities */ for (y = z = 0; z != MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO; z++) { MLX5_SET(ppcnt_reg, in, prio_tc, z); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); /* read per priority stats counter group using predefined counter layout */ for (x = 0; x != (MLX5E_PPORT_PER_PRIO_STATS_NUM / MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO); x++, y++) s->arg[y] = be64toh(ptr[x]); } free_out: /* free firmware request structures */ kvfree(in); kvfree(out); } static void mlx5e_grp_vnic_env_update_stats(struct mlx5e_priv *priv) { u32 out[MLX5_ST_SZ_DW(query_vnic_env_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {}; if (!MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard)) return; MLX5_SET(query_vnic_env_in, in, opcode, MLX5_CMD_OP_QUERY_VNIC_ENV); MLX5_SET(query_vnic_env_in, in, op_mod, 0); MLX5_SET(query_vnic_env_in, in, other_vport, 0); if (mlx5_cmd_exec(priv->mdev, in, sizeof(in), out, sizeof(out)) != 0) return; priv->stats.vport.rx_steer_missed_packets = MLX5_GET64(query_vnic_env_out, out, vport_env.nic_receive_steering_discard); } /* * This function is called regularly to collect all statistics * counters from the firmware. The values can be viewed through the * sysctl interface. Execution is serialized using the priv's global * configuration lock. */ static void mlx5e_update_stats_locked(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_vport_stats *s = &priv->stats.vport; struct mlx5e_sq_stats *sq_stats; u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)]; u32 *out; int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out); u64 tso_packets = 0; u64 tso_bytes = 0; u64 tx_queue_dropped = 0; u64 tx_defragged = 0; u64 tx_offload_none = 0; u64 lro_packets = 0; u64 lro_bytes = 0; u64 sw_lro_queued = 0; u64 sw_lro_flushed = 0; u64 rx_csum_none = 0; u64 rx_wqe_err = 0; u64 rx_packets = 0; u64 rx_bytes = 0; u64 rx_decrypted_error = 0; u64 rx_decrypted_ok = 0; u32 rx_out_of_buffer = 0; int error; int i; int j; out = mlx5_vzalloc(outlen); if (out == NULL) goto free_out; /* Collect firts the SW counters and then HW for consistency */ for (i = 0; i < priv->params.num_channels; i++) { struct mlx5e_channel *pch = priv->channel + i; struct mlx5e_rq *rq = &pch->rq; struct mlx5e_rq_stats *rq_stats = &pch->rq.stats; /* collect stats from LRO */ rq_stats->sw_lro_queued = rq->lro.lro_queued; rq_stats->sw_lro_flushed = rq->lro.lro_flushed; sw_lro_queued += rq_stats->sw_lro_queued; sw_lro_flushed += rq_stats->sw_lro_flushed; lro_packets += rq_stats->lro_packets; lro_bytes += rq_stats->lro_bytes; rx_csum_none += rq_stats->csum_none; rx_wqe_err += rq_stats->wqe_err; rx_packets += rq_stats->packets; rx_bytes += rq_stats->bytes; rx_decrypted_error += rq_stats->decrypted_error_packets; rx_decrypted_ok += rq_stats->decrypted_ok_packets; for (j = 0; j < priv->num_tc; j++) { sq_stats = &pch->sq[j].stats; tso_packets += sq_stats->tso_packets; tso_bytes += sq_stats->tso_bytes; tx_queue_dropped += sq_stats->dropped; tx_queue_dropped += sq_stats->enobuf; tx_defragged += sq_stats->defragged; tx_offload_none += sq_stats->csum_offload_none; } } #ifdef RATELIMIT /* Collect statistics from all rate-limit queues */ for (j = 0; j < priv->rl.param.tx_worker_threads_def; j++) { struct mlx5e_rl_worker *rlw = priv->rl.workers + j; for (i = 0; i < priv->rl.param.tx_channels_per_worker_def; i++) { struct mlx5e_rl_channel *channel = rlw->channels + i; struct mlx5e_sq *sq = channel->sq; if (sq == NULL) continue; sq_stats = &sq->stats; tso_packets += sq_stats->tso_packets; tso_bytes += sq_stats->tso_bytes; tx_queue_dropped += sq_stats->dropped; tx_queue_dropped += sq_stats->enobuf; tx_defragged += sq_stats->defragged; tx_offload_none += sq_stats->csum_offload_none; } } #endif /* update counters */ s->tso_packets = tso_packets; s->tso_bytes = tso_bytes; s->tx_queue_dropped = tx_queue_dropped; s->tx_defragged = tx_defragged; s->lro_packets = lro_packets; s->lro_bytes = lro_bytes; s->sw_lro_queued = sw_lro_queued; s->sw_lro_flushed = sw_lro_flushed; s->rx_csum_none = rx_csum_none; s->rx_wqe_err = rx_wqe_err; s->rx_packets = rx_packets; s->rx_bytes = rx_bytes; s->rx_decrypted_error_packets = rx_decrypted_error; s->rx_decrypted_ok_packets = rx_decrypted_ok; mlx5e_grp_vnic_env_update_stats(priv); /* HW counters */ memset(in, 0, sizeof(in)); MLX5_SET(query_vport_counter_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_COUNTER); MLX5_SET(query_vport_counter_in, in, op_mod, 0); MLX5_SET(query_vport_counter_in, in, other_vport, 0); memset(out, 0, outlen); /* get number of out-of-buffer drops first */ if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0 && mlx5_vport_query_out_of_rx_buffer(mdev, priv->counter_set_id, &rx_out_of_buffer) == 0) { s->rx_out_of_buffer = rx_out_of_buffer; } /* get port statistics */ if (mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen) == 0) { #define MLX5_GET_CTR(out, x) \ MLX5_GET64(query_vport_counter_out, out, x) s->rx_error_packets = MLX5_GET_CTR(out, received_errors.packets); s->rx_error_bytes = MLX5_GET_CTR(out, received_errors.octets); s->tx_error_packets = MLX5_GET_CTR(out, transmit_errors.packets); s->tx_error_bytes = MLX5_GET_CTR(out, transmit_errors.octets); s->rx_unicast_packets = MLX5_GET_CTR(out, received_eth_unicast.packets); s->rx_unicast_bytes = MLX5_GET_CTR(out, received_eth_unicast.octets); s->tx_unicast_packets = MLX5_GET_CTR(out, transmitted_eth_unicast.packets); s->tx_unicast_bytes = MLX5_GET_CTR(out, transmitted_eth_unicast.octets); s->rx_multicast_packets = MLX5_GET_CTR(out, received_eth_multicast.packets); s->rx_multicast_bytes = MLX5_GET_CTR(out, received_eth_multicast.octets); s->tx_multicast_packets = MLX5_GET_CTR(out, transmitted_eth_multicast.packets); s->tx_multicast_bytes = MLX5_GET_CTR(out, transmitted_eth_multicast.octets); s->rx_broadcast_packets = MLX5_GET_CTR(out, received_eth_broadcast.packets); s->rx_broadcast_bytes = MLX5_GET_CTR(out, received_eth_broadcast.octets); s->tx_broadcast_packets = MLX5_GET_CTR(out, transmitted_eth_broadcast.packets); s->tx_broadcast_bytes = MLX5_GET_CTR(out, transmitted_eth_broadcast.octets); s->tx_packets = s->tx_unicast_packets + s->tx_multicast_packets + s->tx_broadcast_packets; s->tx_bytes = s->tx_unicast_bytes + s->tx_multicast_bytes + s->tx_broadcast_bytes; /* Update calculated offload counters */ s->tx_csum_offload = s->tx_packets - tx_offload_none; s->rx_csum_good = s->rx_packets - s->rx_csum_none; } /* Get physical port counters */ mlx5e_update_pport_counters(priv); s->tx_jumbo_packets = priv->stats.port_stats_debug.tx_stat_p1519to2047octets + priv->stats.port_stats_debug.tx_stat_p2048to4095octets + priv->stats.port_stats_debug.tx_stat_p4096to8191octets + priv->stats.port_stats_debug.tx_stat_p8192to10239octets; free_out: kvfree(out); /* Update diagnostics, if any */ if (priv->params_ethtool.diag_pci_enable || priv->params_ethtool.diag_general_enable) { error = mlx5_core_get_diagnostics_full(mdev, priv->params_ethtool.diag_pci_enable ? &priv->params_pci : NULL, priv->params_ethtool.diag_general_enable ? &priv->params_general : NULL); if (error != 0) mlx5_en_err(priv->ifp, "Failed reading diagnostics: %d\n", error); } /* Update FEC, if any */ error = mlx5e_fec_update(priv); if (error != 0 && error != EOPNOTSUPP) { mlx5_en_err(priv->ifp, "Updating FEC failed: %d\n", error); } /* Update temperature, if any */ if (priv->params_ethtool.hw_num_temp != 0) { error = mlx5e_hw_temperature_update(priv); if (error != 0 && error != EOPNOTSUPP) { mlx5_en_err(priv->ifp, "Updating temperature failed: %d\n", error); } } } static void mlx5e_update_stats_work(struct work_struct *work) { struct mlx5e_priv *priv; priv = container_of(work, struct mlx5e_priv, update_stats_work); PRIV_LOCK(priv); if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0 && !test_bit(MLX5_INTERFACE_STATE_TEARDOWN, &priv->mdev->intf_state)) mlx5e_update_stats_locked(priv); PRIV_UNLOCK(priv); } static void mlx5e_update_stats(void *arg) { struct mlx5e_priv *priv = arg; queue_work(priv->wq, &priv->update_stats_work); callout_reset(&priv->watchdog, hz / 4, &mlx5e_update_stats, priv); } static void mlx5e_async_event_sub(struct mlx5e_priv *priv, enum mlx5_dev_event event) { switch (event) { case MLX5_DEV_EVENT_PORT_UP: case MLX5_DEV_EVENT_PORT_DOWN: queue_work(priv->wq, &priv->update_carrier_work); break; default: break; } } static void mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv, enum mlx5_dev_event event, unsigned long param) { struct mlx5e_priv *priv = vpriv; mtx_lock(&priv->async_events_mtx); if (test_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state)) mlx5e_async_event_sub(priv, event); mtx_unlock(&priv->async_events_mtx); } static void mlx5e_enable_async_events(struct mlx5e_priv *priv) { set_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state); } static void mlx5e_disable_async_events(struct mlx5e_priv *priv) { mtx_lock(&priv->async_events_mtx); clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state); mtx_unlock(&priv->async_events_mtx); } static void mlx5e_calibration_callout(void *arg); static int mlx5e_calibration_duration = 20; static int mlx5e_fast_calibration = 1; static int mlx5e_normal_calibration = 30; static SYSCTL_NODE(_hw_mlx5, OID_AUTO, calibr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "MLX5 timestamp calibration parameters"); SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, duration, CTLFLAG_RWTUN, &mlx5e_calibration_duration, 0, "Duration of initial calibration"); SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, fast, CTLFLAG_RWTUN, &mlx5e_fast_calibration, 0, "Recalibration interval during initial calibration"); SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, normal, CTLFLAG_RWTUN, &mlx5e_normal_calibration, 0, "Recalibration interval during normal operations"); /* * Ignites the calibration process. */ static void mlx5e_reset_calibration_callout(struct mlx5e_priv *priv) { if (priv->clbr_done == 0) mlx5e_calibration_callout(priv); else callout_reset_sbt_curcpu(&priv->tstmp_clbr, (priv->clbr_done < mlx5e_calibration_duration ? mlx5e_fast_calibration : mlx5e_normal_calibration) * SBT_1S, 0, mlx5e_calibration_callout, priv, C_DIRECT_EXEC); } static uint64_t mlx5e_timespec2usec(const struct timespec *ts) { return ((uint64_t)ts->tv_sec * 1000000000 + ts->tv_nsec); } static uint64_t mlx5e_hw_clock(struct mlx5e_priv *priv) { struct mlx5_init_seg *iseg; uint32_t hw_h, hw_h1, hw_l; iseg = priv->mdev->iseg; do { hw_h = ioread32be(&iseg->internal_timer_h); hw_l = ioread32be(&iseg->internal_timer_l); hw_h1 = ioread32be(&iseg->internal_timer_h); } while (hw_h1 != hw_h); return (((uint64_t)hw_h << 32) | hw_l); } /* * The calibration callout, it runs either in the context of the * thread which enables calibration, or in callout. It takes the * snapshot of system and adapter clocks, then advances the pointers to * the calibration point to allow rx path to read the consistent data * lockless. */ static void mlx5e_calibration_callout(void *arg) { struct mlx5e_priv *priv; struct mlx5e_clbr_point *next, *curr; struct timespec ts; int clbr_curr_next; priv = arg; curr = &priv->clbr_points[priv->clbr_curr]; clbr_curr_next = priv->clbr_curr + 1; if (clbr_curr_next >= nitems(priv->clbr_points)) clbr_curr_next = 0; next = &priv->clbr_points[clbr_curr_next]; next->base_prev = curr->base_curr; next->clbr_hw_prev = curr->clbr_hw_curr; next->clbr_hw_curr = mlx5e_hw_clock(priv); if (((next->clbr_hw_curr - curr->clbr_hw_curr) >> MLX5E_TSTMP_PREC) == 0) { if (priv->clbr_done != 0) { mlx5_en_err(priv->ifp, "HW failed tstmp frozen %#jx %#jx, disabling\n", next->clbr_hw_curr, curr->clbr_hw_prev); priv->clbr_done = 0; } atomic_store_rel_int(&curr->clbr_gen, 0); return; } nanouptime(&ts); next->base_curr = mlx5e_timespec2usec(&ts); curr->clbr_gen = 0; atomic_thread_fence_rel(); priv->clbr_curr = clbr_curr_next; atomic_store_rel_int(&next->clbr_gen, ++(priv->clbr_gen)); if (priv->clbr_done < mlx5e_calibration_duration) priv->clbr_done++; mlx5e_reset_calibration_callout(priv); } static const char *mlx5e_rq_stats_desc[] = { MLX5E_RQ_STATS(MLX5E_STATS_DESC) }; static int mlx5e_create_rq(struct mlx5e_channel *c, struct mlx5e_rq_param *param, struct mlx5e_rq *rq) { struct mlx5e_priv *priv = c->priv; struct mlx5_core_dev *mdev = priv->mdev; char buffer[16]; void *rqc = param->rqc; void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq); int wq_sz; int err; int i; u32 nsegs, wqe_sz; err = mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs); if (err != 0) goto done; /* Create DMA descriptor TAG */ if ((err = -bus_dma_tag_create( bus_get_dma_tag(mdev->pdev->dev.bsddev), 1, /* any alignment */ 0, /* no boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ nsegs * MLX5E_MAX_RX_BYTES, /* maxsize */ nsegs, /* nsegments */ nsegs * MLX5E_MAX_RX_BYTES, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockfuncarg */ &rq->dma_tag))) goto done; err = mlx5_wq_ll_create(mdev, ¶m->wq, rqc_wq, &rq->wq, &rq->wq_ctrl); if (err) goto err_free_dma_tag; rq->wq.db = &rq->wq.db[MLX5_RCV_DBR]; err = mlx5e_get_wqe_sz(priv, &rq->wqe_sz, &rq->nsegs); if (err != 0) goto err_rq_wq_destroy; wq_sz = mlx5_wq_ll_get_size(&rq->wq); err = -tcp_lro_init_args(&rq->lro, priv->ifp, TCP_LRO_ENTRIES, wq_sz); if (err) goto err_rq_wq_destroy; rq->mbuf = malloc_domainset(wq_sz * sizeof(rq->mbuf[0]), M_MLX5EN, mlx5_dev_domainset(mdev), M_WAITOK | M_ZERO); for (i = 0; i != wq_sz; i++) { struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i); int j; err = -bus_dmamap_create(rq->dma_tag, 0, &rq->mbuf[i].dma_map); if (err != 0) { while (i--) bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map); goto err_rq_mbuf_free; } /* set value for constant fields */ for (j = 0; j < rq->nsegs; j++) wqe->data[j].lkey = cpu_to_be32(priv->mr.key); } INIT_WORK(&rq->dim.work, mlx5e_dim_work); if (priv->params.rx_cq_moderation_mode < 2) { rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED; } else { void *cqc = container_of(param, struct mlx5e_channel_param, rq)->rx_cq.cqc; switch (MLX5_GET(cqc, cqc, cq_period_mode)) { case MLX5_CQ_PERIOD_MODE_START_FROM_EQE: rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE; break; case MLX5_CQ_PERIOD_MODE_START_FROM_CQE: rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE; break; default: rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED; break; } } rq->ifp = priv->ifp; rq->channel = c; rq->ix = c->ix; snprintf(buffer, sizeof(buffer), "rxstat%d", c->ix); mlx5e_create_stats(&rq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), buffer, mlx5e_rq_stats_desc, MLX5E_RQ_STATS_NUM, rq->stats.arg); return (0); err_rq_mbuf_free: free(rq->mbuf, M_MLX5EN); tcp_lro_free(&rq->lro); err_rq_wq_destroy: mlx5_wq_destroy(&rq->wq_ctrl); err_free_dma_tag: bus_dma_tag_destroy(rq->dma_tag); done: return (err); } static void mlx5e_destroy_rq(struct mlx5e_rq *rq) { int wq_sz; int i; /* destroy all sysctl nodes */ sysctl_ctx_free(&rq->stats.ctx); /* free leftover LRO packets, if any */ tcp_lro_free(&rq->lro); wq_sz = mlx5_wq_ll_get_size(&rq->wq); for (i = 0; i != wq_sz; i++) { if (rq->mbuf[i].mbuf != NULL) { bus_dmamap_unload(rq->dma_tag, rq->mbuf[i].dma_map); m_freem(rq->mbuf[i].mbuf); } bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map); } free(rq->mbuf, M_MLX5EN); mlx5_wq_destroy(&rq->wq_ctrl); bus_dma_tag_destroy(rq->dma_tag); } static int mlx5e_enable_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param) { struct mlx5e_channel *c = rq->channel; struct mlx5e_priv *priv = c->priv; struct mlx5_core_dev *mdev = priv->mdev; void *in; void *rqc; void *wq; int inlen; int err; u8 ts_format; inlen = MLX5_ST_SZ_BYTES(create_rq_in) + sizeof(u64) * rq->wq_ctrl.buf.npages; in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); ts_format = mlx5_get_rq_default_ts(mdev); rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); wq = MLX5_ADDR_OF(rqc, rqc, wq); memcpy(rqc, param->rqc, sizeof(param->rqc)); MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); MLX5_SET(rqc, rqc, ts_format, ts_format); MLX5_SET(rqc, rqc, flush_in_error_en, 1); if (priv->counter_set_id >= 0) MLX5_SET(rqc, rqc, counter_set_id, priv->counter_set_id); MLX5_SET(wq, wq, log_wq_pg_sz, rq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, rq->wq_ctrl.db.dma); mlx5_fill_page_array(&rq->wq_ctrl.buf, (__be64 *) MLX5_ADDR_OF(wq, wq, pas)); err = mlx5_core_create_rq(mdev, in, inlen, &rq->rqn); kvfree(in); return (err); } static int mlx5e_modify_rq(struct mlx5e_rq *rq, int curr_state, int next_state) { struct mlx5e_channel *c = rq->channel; struct mlx5e_priv *priv = c->priv; struct mlx5_core_dev *mdev = priv->mdev; void *in; void *rqc; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(modify_rq_in); in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); MLX5_SET(modify_rq_in, in, rqn, rq->rqn); MLX5_SET(modify_rq_in, in, rq_state, curr_state); MLX5_SET(rqc, rqc, state, next_state); err = mlx5_core_modify_rq(mdev, in, inlen); kvfree(in); return (err); } static void mlx5e_disable_rq(struct mlx5e_rq *rq) { struct mlx5e_channel *c = rq->channel; struct mlx5e_priv *priv = c->priv; struct mlx5_core_dev *mdev = priv->mdev; mlx5_core_destroy_rq(mdev, rq->rqn); } static int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq) { struct mlx5e_channel *c = rq->channel; struct mlx5e_priv *priv = c->priv; struct mlx5_wq_ll *wq = &rq->wq; int i; for (i = 0; i < 1000; i++) { if (wq->cur_sz >= priv->params.min_rx_wqes) return (0); msleep(4); } return (-ETIMEDOUT); } static int mlx5e_open_rq(struct mlx5e_channel *c, struct mlx5e_rq_param *param, struct mlx5e_rq *rq) { int err; err = mlx5e_create_rq(c, param, rq); if (err) return (err); /* set CQN in RQ parameters */ MLX5_SET(rqc, param->rqc, cqn, c->rq.cq.mcq.cqn); err = mlx5e_enable_rq(rq, param); if (err) goto err_destroy_rq; err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); if (err) goto err_disable_rq; c->rq.enabled = 1; return (0); err_disable_rq: mlx5e_disable_rq(rq); err_destroy_rq: mlx5e_destroy_rq(rq); return (err); } static void mlx5e_close_rq(struct mlx5e_rq *rq) { mtx_lock(&rq->mtx); rq->enabled = 0; callout_stop(&rq->watchdog); mtx_unlock(&rq->mtx); mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR); } static void mlx5e_close_rq_wait(struct mlx5e_rq *rq) { mlx5e_disable_rq(rq); mlx5e_close_cq(&rq->cq); cancel_work_sync(&rq->dim.work); mlx5e_destroy_rq(rq); } /* * What is a drop RQ and why is it needed? * * The RSS indirection table, also called the RQT, selects the * destination RQ based on the receive queue number, RQN. The RQT is * frequently referred to by flow steering rules to distribute traffic * among multiple RQs. The problem is that the RQs cannot be destroyed * before the RQT referring them is destroyed too. Further, TLS RX * rules may still be referring to the RQT even if the link went * down. Because there is no magic RQN for dropping packets, we create * a dummy RQ, also called drop RQ, which sole purpose is to drop all * received packets. When the link goes down this RQN is filled in all * RQT entries, of the main RQT, so the real RQs which are about to be * destroyed can be released and the TLS RX rules can be sustained. */ static void mlx5e_open_drop_rq_comp(struct mlx5_core_cq *mcq __unused, struct mlx5_eqe *eqe __unused) { } static int mlx5e_open_drop_rq(struct mlx5e_priv *priv, struct mlx5e_rq *drop_rq) { struct mlx5e_cq_param param_cq = {}; struct mlx5e_rq_param param_rq = {}; void *rqc_wq = MLX5_ADDR_OF(rqc, param_rq.rqc, wq); int err; /* set channel pointer */ drop_rq->channel = priv->channel; /* set basic CQ parameters needed */ MLX5_SET(cqc, param_cq.cqc, log_cq_size, 0); MLX5_SET(cqc, param_cq.cqc, uar_page, priv->mdev->priv.uar->index); /* open receive completion queue */ err = mlx5e_open_cq(priv, ¶m_cq, &drop_rq->cq, &mlx5e_open_drop_rq_comp, 0); if (err) goto err_done; /* set basic WQ parameters needed */ MLX5_SET(wq, rqc_wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST); MLX5_SET(wq, rqc_wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); MLX5_SET(wq, rqc_wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) + sizeof(struct mlx5_wqe_data_seg))); MLX5_SET(wq, rqc_wq, log_wq_sz, 0); MLX5_SET(wq, rqc_wq, pd, priv->pdn); param_rq.wq.linear = 1; err = mlx5_wq_ll_create(priv->mdev, ¶m_rq.wq, rqc_wq, &drop_rq->wq, &drop_rq->wq_ctrl); if (err) goto err_close_cq; /* set CQN in RQ parameters */ MLX5_SET(rqc, param_rq.rqc, cqn, drop_rq->cq.mcq.cqn); err = mlx5e_enable_rq(drop_rq, ¶m_rq); if (err) goto err_wq_destroy; err = mlx5e_modify_rq(drop_rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); if (err) goto err_disable_rq; return (err); err_disable_rq: mlx5e_disable_rq(drop_rq); err_wq_destroy: mlx5_wq_destroy(&drop_rq->wq_ctrl); err_close_cq: mlx5e_close_cq(&drop_rq->cq); err_done: return (err); } static void mlx5e_close_drop_rq(struct mlx5e_rq *drop_rq) { mlx5e_modify_rq(drop_rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR); mlx5e_disable_rq(drop_rq); mlx5_wq_destroy(&drop_rq->wq_ctrl); mlx5e_close_cq(&drop_rq->cq); } void mlx5e_free_sq_db(struct mlx5e_sq *sq) { int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); int x; for (x = 0; x != wq_sz; x++) { if (sq->mbuf[x].mbuf != NULL) { bus_dmamap_unload(sq->dma_tag, sq->mbuf[x].dma_map); m_freem(sq->mbuf[x].mbuf); } if (sq->mbuf[x].mst != NULL) { m_snd_tag_rele(sq->mbuf[x].mst); sq->mbuf[x].mst = NULL; } bus_dmamap_destroy(sq->dma_tag, sq->mbuf[x].dma_map); } free(sq->mbuf, M_MLX5EN); } int mlx5e_alloc_sq_db(struct mlx5e_sq *sq) { int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); int err; int x; sq->mbuf = malloc_domainset(wq_sz * sizeof(sq->mbuf[0]), M_MLX5EN, mlx5_dev_domainset(sq->priv->mdev), M_WAITOK | M_ZERO); /* Create DMA descriptor MAPs */ for (x = 0; x != wq_sz; x++) { err = -bus_dmamap_create(sq->dma_tag, 0, &sq->mbuf[x].dma_map); if (err != 0) { while (x--) bus_dmamap_destroy(sq->dma_tag, sq->mbuf[x].dma_map); free(sq->mbuf, M_MLX5EN); return (err); } } return (0); } static const char *mlx5e_sq_stats_desc[] = { MLX5E_SQ_STATS(MLX5E_STATS_DESC) }; void mlx5e_update_sq_inline(struct mlx5e_sq *sq) { sq->max_inline = sq->priv->params.tx_max_inline; sq->min_inline_mode = sq->priv->params.tx_min_inline_mode; /* * Check if trust state is DSCP or if inline mode is NONE which * indicates CX-5 or newer hardware. */ if (sq->priv->params_ethtool.trust_state != MLX5_QPTS_TRUST_PCP || sq->min_inline_mode == MLX5_INLINE_MODE_NONE) { if (MLX5_CAP_ETH(sq->priv->mdev, wqe_vlan_insert)) sq->min_insert_caps = MLX5E_INSERT_VLAN | MLX5E_INSERT_NON_VLAN; else sq->min_insert_caps = MLX5E_INSERT_NON_VLAN; } else { sq->min_insert_caps = 0; } } static void mlx5e_refresh_sq_inline_sub(struct mlx5e_priv *priv, struct mlx5e_channel *c) { int i; for (i = 0; i != priv->num_tc; i++) { mtx_lock(&c->sq[i].lock); mlx5e_update_sq_inline(&c->sq[i]); mtx_unlock(&c->sq[i].lock); } } void mlx5e_refresh_sq_inline(struct mlx5e_priv *priv) { int i; /* check if channels are closed */ if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) return; for (i = 0; i < priv->params.num_channels; i++) mlx5e_refresh_sq_inline_sub(priv, &priv->channel[i]); } static int mlx5e_create_sq(struct mlx5e_channel *c, int tc, struct mlx5e_sq_param *param, struct mlx5e_sq *sq) { struct mlx5e_priv *priv = c->priv; struct mlx5_core_dev *mdev = priv->mdev; char buffer[16]; void *sqc = param->sqc; void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq); int err; /* Create DMA descriptor TAG */ if ((err = -bus_dma_tag_create( bus_get_dma_tag(mdev->pdev->dev.bsddev), 1, /* any alignment */ 0, /* no boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MLX5E_MAX_TX_PAYLOAD_SIZE, /* maxsize */ MLX5E_MAX_TX_MBUF_FRAGS, /* nsegments */ MLX5E_MAX_TX_MBUF_SIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockfuncarg */ &sq->dma_tag))) goto done; sq->mkey_be = cpu_to_be32(priv->mr.key); sq->ifp = priv->ifp; sq->priv = priv; sq->tc = tc; err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) goto err_free_dma_tag; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; err = mlx5e_alloc_sq_db(sq); if (err) goto err_sq_wq_destroy; mlx5e_update_sq_inline(sq); snprintf(buffer, sizeof(buffer), "txstat%dtc%d", c->ix, tc); mlx5e_create_stats(&sq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), buffer, mlx5e_sq_stats_desc, MLX5E_SQ_STATS_NUM, sq->stats.arg); return (0); err_sq_wq_destroy: mlx5_wq_destroy(&sq->wq_ctrl); err_free_dma_tag: bus_dma_tag_destroy(sq->dma_tag); done: return (err); } static void mlx5e_destroy_sq(struct mlx5e_sq *sq) { /* destroy all sysctl nodes */ sysctl_ctx_free(&sq->stats.ctx); mlx5e_free_sq_db(sq); mlx5_wq_destroy(&sq->wq_ctrl); bus_dma_tag_destroy(sq->dma_tag); } int mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param, const struct mlx5_sq_bfreg *bfreg, int tis_num) { void *in; void *sqc; void *wq; int inlen; int err; u8 ts_format; inlen = MLX5_ST_SZ_BYTES(create_sq_in) + sizeof(u64) * sq->wq_ctrl.buf.npages; in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); sq->uar_map = bfreg->map; ts_format = mlx5_get_sq_default_ts(sq->priv->mdev); sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); wq = MLX5_ADDR_OF(sqc, sqc, wq); memcpy(sqc, param->sqc, sizeof(param->sqc)); MLX5_SET(sqc, sqc, tis_num_0, tis_num); MLX5_SET(sqc, sqc, cqn, sq->cq.mcq.cqn); MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); MLX5_SET(sqc, sqc, ts_format, ts_format); MLX5_SET(sqc, sqc, tis_lst_sz, 1); MLX5_SET(sqc, sqc, flush_in_error_en, 1); MLX5_SET(sqc, sqc, allow_swp, 1); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); MLX5_SET(wq, wq, uar_page, bfreg->index); MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma); mlx5_fill_page_array(&sq->wq_ctrl.buf, (__be64 *) MLX5_ADDR_OF(wq, wq, pas)); err = mlx5_core_create_sq(sq->priv->mdev, in, inlen, &sq->sqn); kvfree(in); return (err); } int mlx5e_modify_sq(struct mlx5e_sq *sq, int curr_state, int next_state) { void *in; void *sqc; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(modify_sq_in); in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); MLX5_SET(modify_sq_in, in, sqn, sq->sqn); MLX5_SET(modify_sq_in, in, sq_state, curr_state); MLX5_SET(sqc, sqc, state, next_state); err = mlx5_core_modify_sq(sq->priv->mdev, in, inlen); kvfree(in); return (err); } void mlx5e_disable_sq(struct mlx5e_sq *sq) { mlx5_core_destroy_sq(sq->priv->mdev, sq->sqn); } static int mlx5e_open_sq(struct mlx5e_channel *c, int tc, struct mlx5e_sq_param *param, struct mlx5e_sq *sq) { int err; sq->cev_factor = c->priv->params_ethtool.tx_completion_fact; /* ensure the TX completion event factor is not zero */ if (sq->cev_factor == 0) sq->cev_factor = 1; err = mlx5e_create_sq(c, tc, param, sq); if (err) return (err); err = mlx5e_enable_sq(sq, param, &c->bfreg, c->priv->tisn[tc]); if (err) goto err_destroy_sq; err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY); if (err) goto err_disable_sq; WRITE_ONCE(sq->running, 1); return (0); err_disable_sq: mlx5e_disable_sq(sq); err_destroy_sq: mlx5e_destroy_sq(sq); return (err); } static void mlx5e_sq_send_nops_locked(struct mlx5e_sq *sq, int can_sleep) { /* fill up remainder with NOPs */ while (sq->cev_counter != 0) { while (!mlx5e_sq_has_room_for(sq, 1)) { if (can_sleep != 0) { mtx_unlock(&sq->lock); msleep(4); mtx_lock(&sq->lock); } else { goto done; } } /* send a single NOP */ mlx5e_send_nop(sq, 1); atomic_thread_fence_rel(); } done: mlx5e_tx_notify_hw(sq, false); } void mlx5e_sq_cev_timeout(void *arg) { struct mlx5e_sq *sq = arg; mtx_assert(&sq->lock, MA_OWNED); /* check next state */ switch (sq->cev_next_state) { case MLX5E_CEV_STATE_SEND_NOPS: /* fill TX ring with NOPs, if any */ mlx5e_sq_send_nops_locked(sq, 0); /* check if completed */ if (sq->cev_counter == 0) { sq->cev_next_state = MLX5E_CEV_STATE_INITIAL; return; } break; default: /* send NOPs on next timeout */ sq->cev_next_state = MLX5E_CEV_STATE_SEND_NOPS; break; } /* restart timer */ callout_reset_curcpu(&sq->cev_callout, hz, mlx5e_sq_cev_timeout, sq); } void mlx5e_drain_sq(struct mlx5e_sq *sq) { int error; struct mlx5_core_dev *mdev= sq->priv->mdev; /* * Check if already stopped. * * NOTE: Serialization of this function is managed by the * caller ensuring the priv's state lock is locked or in case * of rate limit support, a single thread manages drain and * resume of SQs. The "running" variable can therefore safely * be read without any locks. */ if (READ_ONCE(sq->running) == 0) return; /* don't put more packets into the SQ */ WRITE_ONCE(sq->running, 0); /* serialize access to DMA rings */ mtx_lock(&sq->lock); /* teardown event factor timer, if any */ sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS; callout_stop(&sq->cev_callout); /* send dummy NOPs in order to flush the transmit ring */ mlx5e_sq_send_nops_locked(sq, 1); mtx_unlock(&sq->lock); /* wait till SQ is empty or link is down */ mtx_lock(&sq->lock); while (sq->cc != sq->pc && (sq->priv->media_status_last & IFM_ACTIVE) != 0 && mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR && pci_channel_offline(mdev->pdev) == 0) { mtx_unlock(&sq->lock); msleep(1); sq->cq.mcq.comp(&sq->cq.mcq, NULL); mtx_lock(&sq->lock); } mtx_unlock(&sq->lock); /* error out remaining requests */ error = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR); if (error != 0) { mlx5_en_err(sq->ifp, "mlx5e_modify_sq() from RDY to ERR failed: %d\n", error); } /* wait till SQ is empty */ mtx_lock(&sq->lock); while (sq->cc != sq->pc && mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR && pci_channel_offline(mdev->pdev) == 0) { mtx_unlock(&sq->lock); msleep(1); sq->cq.mcq.comp(&sq->cq.mcq, NULL); mtx_lock(&sq->lock); } mtx_unlock(&sq->lock); } static void mlx5e_close_sq_wait(struct mlx5e_sq *sq) { mlx5e_drain_sq(sq); mlx5e_disable_sq(sq); mlx5e_destroy_sq(sq); } static int mlx5e_create_cq(struct mlx5e_priv *priv, struct mlx5e_cq_param *param, struct mlx5e_cq *cq, mlx5e_cq_comp_t *comp, int eq_ix) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5_core_cq *mcq = &cq->mcq; int eqn_not_used; int irqn; int err; u32 i; err = mlx5_vector2eqn(mdev, eq_ix, &eqn_not_used, &irqn); if (err) return (err); err = mlx5_cqwq_create(mdev, ¶m->wq, param->cqc, &cq->wq, &cq->wq_ctrl); if (err) return (err); mcq->cqe_sz = 64; mcq->set_ci_db = cq->wq_ctrl.db.db; mcq->arm_db = cq->wq_ctrl.db.db + 1; *mcq->set_ci_db = 0; *mcq->arm_db = 0; mcq->vector = eq_ix; mcq->comp = comp; mcq->event = mlx5e_cq_error_event; mcq->irqn = irqn; for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i); cqe->op_own = 0xf1; } cq->priv = priv; return (0); } static void mlx5e_destroy_cq(struct mlx5e_cq *cq) { mlx5_wq_destroy(&cq->wq_ctrl); } static int mlx5e_enable_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param, int eq_ix) { struct mlx5_core_cq *mcq = &cq->mcq; u32 out[MLX5_ST_SZ_DW(create_cq_out)]; void *in; void *cqc; int inlen; int irqn_not_used; int eqn; int err; inlen = MLX5_ST_SZ_BYTES(create_cq_in) + sizeof(u64) * cq->wq_ctrl.buf.npages; in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); memcpy(cqc, param->cqc, sizeof(param->cqc)); mlx5_fill_page_array(&cq->wq_ctrl.buf, (__be64 *) MLX5_ADDR_OF(create_cq_in, in, pas)); mlx5_vector2eqn(cq->priv->mdev, eq_ix, &eqn, &irqn_not_used); MLX5_SET(cqc, cqc, c_eqn, eqn); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); err = mlx5_core_create_cq(cq->priv->mdev, mcq, in, inlen, out, sizeof(out)); kvfree(in); if (err) return (err); mlx5e_cq_arm(cq, MLX5_GET_DOORBELL_LOCK(&cq->priv->doorbell_lock)); return (0); } static void mlx5e_disable_cq(struct mlx5e_cq *cq) { mlx5_core_destroy_cq(cq->priv->mdev, &cq->mcq); } int mlx5e_open_cq(struct mlx5e_priv *priv, struct mlx5e_cq_param *param, struct mlx5e_cq *cq, mlx5e_cq_comp_t *comp, int eq_ix) { int err; err = mlx5e_create_cq(priv, param, cq, comp, eq_ix); if (err) return (err); err = mlx5e_enable_cq(cq, param, eq_ix); if (err) goto err_destroy_cq; return (0); err_destroy_cq: mlx5e_destroy_cq(cq); return (err); } void mlx5e_close_cq(struct mlx5e_cq *cq) { mlx5e_disable_cq(cq); mlx5e_destroy_cq(cq); } static int mlx5e_open_tx_cqs(struct mlx5e_channel *c, struct mlx5e_channel_param *cparam) { int err; int tc; for (tc = 0; tc < c->priv->num_tc; tc++) { /* open completion queue */ err = mlx5e_open_cq(c->priv, &cparam->tx_cq, &c->sq[tc].cq, &mlx5e_tx_cq_comp, c->ix); if (err) goto err_close_tx_cqs; } return (0); err_close_tx_cqs: for (tc--; tc >= 0; tc--) mlx5e_close_cq(&c->sq[tc].cq); return (err); } static void mlx5e_close_tx_cqs(struct mlx5e_channel *c) { int tc; for (tc = 0; tc < c->priv->num_tc; tc++) mlx5e_close_cq(&c->sq[tc].cq); } static int mlx5e_open_sqs(struct mlx5e_channel *c, struct mlx5e_channel_param *cparam) { int err; int tc; for (tc = 0; tc < c->priv->num_tc; tc++) { err = mlx5e_open_sq(c, tc, &cparam->sq, &c->sq[tc]); if (err) goto err_close_sqs; } return (0); err_close_sqs: for (tc--; tc >= 0; tc--) mlx5e_close_sq_wait(&c->sq[tc]); return (err); } static void mlx5e_close_sqs_wait(struct mlx5e_channel *c) { int tc; for (tc = 0; tc < c->priv->num_tc; tc++) mlx5e_close_sq_wait(&c->sq[tc]); } static void mlx5e_chan_static_init(struct mlx5e_priv *priv, struct mlx5e_channel *c, int ix) { int tc; /* setup priv and channel number */ c->priv = priv; c->ix = ix; /* setup send tag */ m_snd_tag_init(&c->tag, c->priv->ifp, &mlx5e_ul_snd_tag_sw); init_completion(&c->completion); mtx_init(&c->rq.mtx, "mlx5rx", MTX_NETWORK_LOCK, MTX_DEF); callout_init_mtx(&c->rq.watchdog, &c->rq.mtx, 0); for (tc = 0; tc != MLX5E_MAX_TX_NUM_TC; tc++) { struct mlx5e_sq *sq = c->sq + tc; mtx_init(&sq->lock, "mlx5tx", MTX_NETWORK_LOCK " TX", MTX_DEF); mtx_init(&sq->comp_lock, "mlx5comp", MTX_NETWORK_LOCK " TX", MTX_DEF); callout_init_mtx(&sq->cev_callout, &sq->lock, 0); } mlx5e_iq_static_init(&c->iq); } static void mlx5e_chan_wait_for_completion(struct mlx5e_channel *c) { m_snd_tag_rele(&c->tag); wait_for_completion(&c->completion); } static void mlx5e_priv_wait_for_completion(struct mlx5e_priv *priv, const uint32_t channels) { uint32_t x; for (x = 0; x != channels; x++) mlx5e_chan_wait_for_completion(&priv->channel[x]); } static void mlx5e_chan_static_destroy(struct mlx5e_channel *c) { int tc; callout_drain(&c->rq.watchdog); mtx_destroy(&c->rq.mtx); for (tc = 0; tc != MLX5E_MAX_TX_NUM_TC; tc++) { callout_drain(&c->sq[tc].cev_callout); mtx_destroy(&c->sq[tc].lock); mtx_destroy(&c->sq[tc].comp_lock); } mlx5e_iq_static_destroy(&c->iq); } static int mlx5e_open_channel(struct mlx5e_priv *priv, struct mlx5e_channel_param *cparam, struct mlx5e_channel *c) { struct epoch_tracker et; int i, err; /* zero non-persistent data */ MLX5E_ZERO(&c->rq, mlx5e_rq_zero_start); for (i = 0; i != priv->num_tc; i++) MLX5E_ZERO(&c->sq[i], mlx5e_sq_zero_start); MLX5E_ZERO(&c->iq, mlx5e_iq_zero_start); /* open transmit completion queue */ err = mlx5e_open_tx_cqs(c, cparam); if (err) goto err_free; /* open receive completion queue */ err = mlx5e_open_cq(c->priv, &cparam->rx_cq, &c->rq.cq, &mlx5e_rx_cq_comp, c->ix); if (err) goto err_close_tx_cqs; err = mlx5e_open_sqs(c, cparam); if (err) goto err_close_rx_cq; err = mlx5e_iq_open(c, &cparam->sq, &cparam->tx_cq, &c->iq); if (err) goto err_close_sqs; err = mlx5e_open_rq(c, &cparam->rq, &c->rq); if (err) goto err_close_iq; /* poll receive queue initially */ NET_EPOCH_ENTER(et); c->rq.cq.mcq.comp(&c->rq.cq.mcq, NULL); NET_EPOCH_EXIT(et); return (0); err_close_iq: mlx5e_iq_close(&c->iq); err_close_sqs: mlx5e_close_sqs_wait(c); err_close_rx_cq: mlx5e_close_cq(&c->rq.cq); err_close_tx_cqs: mlx5e_close_tx_cqs(c); err_free: return (err); } static void mlx5e_close_channel(struct mlx5e_channel *c) { mlx5e_close_rq(&c->rq); } static void mlx5e_close_channel_wait(struct mlx5e_channel *c) { mlx5e_close_rq_wait(&c->rq); mlx5e_iq_close(&c->iq); mlx5e_close_sqs_wait(c); mlx5e_close_tx_cqs(c); } static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs) { u32 r, n; r = priv->params.hw_lro_en ? priv->params.lro_wqe_sz : - MLX5E_SW2MB_MTU(priv->ifp->if_mtu); + MLX5E_SW2MB_MTU(if_getmtu(priv->ifp)); if (r > MJUM16BYTES) return (-ENOMEM); if (r > MJUM9BYTES) r = MJUM16BYTES; else if (r > MJUMPAGESIZE) r = MJUM9BYTES; else if (r > MCLBYTES) r = MJUMPAGESIZE; else r = MCLBYTES; /* * n + 1 must be a power of two, because stride size must be. * Stride size is 16 * (n + 1), as the first segment is * control. */ for (n = howmany(r, MLX5E_MAX_RX_BYTES); !powerof2(n + 1); n++) ; if (n > MLX5E_MAX_BUSDMA_RX_SEGS) return (-ENOMEM); *wqe_sz = r; *nsegs = n; return (0); } static void mlx5e_build_rq_param(struct mlx5e_priv *priv, struct mlx5e_rq_param *param) { void *rqc = param->rqc; void *wq = MLX5_ADDR_OF(rqc, rqc, wq); u32 wqe_sz, nsegs; mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST); MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) + nsegs * sizeof(struct mlx5_wqe_data_seg))); MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size); MLX5_SET(wq, wq, pd, priv->pdn); param->wq.linear = 1; } static void mlx5e_build_sq_param(struct mlx5e_priv *priv, struct mlx5e_sq_param *param) { void *sqc = param->sqc; void *wq = MLX5_ADDR_OF(sqc, sqc, wq); MLX5_SET(wq, wq, log_wq_sz, priv->params.log_sq_size); MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); MLX5_SET(wq, wq, pd, priv->pdn); param->wq.linear = 1; } static void mlx5e_build_common_cq_param(struct mlx5e_priv *priv, struct mlx5e_cq_param *param) { void *cqc = param->cqc; MLX5_SET(cqc, cqc, uar_page, priv->mdev->priv.uar->index); } static void mlx5e_get_default_profile(struct mlx5e_priv *priv, int mode, struct net_dim_cq_moder *ptr) { *ptr = net_dim_get_profile(mode, MLX5E_DIM_DEFAULT_PROFILE); /* apply LRO restrictions */ if (priv->params.hw_lro_en && ptr->pkts > MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO) { ptr->pkts = MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO; } } static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv, struct mlx5e_cq_param *param) { struct net_dim_cq_moder curr; void *cqc = param->cqc; /* * We use MLX5_CQE_FORMAT_HASH because the RX hash mini CQE * format is more beneficial for FreeBSD use case. * * Adding support for MLX5_CQE_FORMAT_CSUM will require changes * in mlx5e_decompress_cqe. */ if (priv->params.cqe_zipping_en) { MLX5_SET(cqc, cqc, mini_cqe_res_format, MLX5_CQE_FORMAT_HASH); MLX5_SET(cqc, cqc, cqe_compression_en, 1); } MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_rq_size); switch (priv->params.rx_cq_moderation_mode) { case 0: MLX5_SET(cqc, cqc, cq_period, priv->params.rx_cq_moderation_usec); MLX5_SET(cqc, cqc, cq_max_count, priv->params.rx_cq_moderation_pkts); MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; case 1: MLX5_SET(cqc, cqc, cq_period, priv->params.rx_cq_moderation_usec); MLX5_SET(cqc, cqc, cq_max_count, priv->params.rx_cq_moderation_pkts); if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE); else MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; case 2: mlx5e_get_default_profile(priv, NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE, &curr); MLX5_SET(cqc, cqc, cq_period, curr.usec); MLX5_SET(cqc, cqc, cq_max_count, curr.pkts); MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; case 3: mlx5e_get_default_profile(priv, NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE, &curr); MLX5_SET(cqc, cqc, cq_period, curr.usec); MLX5_SET(cqc, cqc, cq_max_count, curr.pkts); if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE); else MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; default: break; } mlx5e_dim_build_cq_param(priv, param); mlx5e_build_common_cq_param(priv, param); } static void mlx5e_build_tx_cq_param(struct mlx5e_priv *priv, struct mlx5e_cq_param *param) { void *cqc = param->cqc; MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_sq_size); MLX5_SET(cqc, cqc, cq_period, priv->params.tx_cq_moderation_usec); MLX5_SET(cqc, cqc, cq_max_count, priv->params.tx_cq_moderation_pkts); switch (priv->params.tx_cq_moderation_mode) { case 0: MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; default: if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE); else MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; } mlx5e_build_common_cq_param(priv, param); } static void mlx5e_build_channel_param(struct mlx5e_priv *priv, struct mlx5e_channel_param *cparam) { memset(cparam, 0, sizeof(*cparam)); mlx5e_build_rq_param(priv, &cparam->rq); mlx5e_build_sq_param(priv, &cparam->sq); mlx5e_build_rx_cq_param(priv, &cparam->rx_cq); mlx5e_build_tx_cq_param(priv, &cparam->tx_cq); } static int mlx5e_open_channels(struct mlx5e_priv *priv) { struct mlx5e_channel_param *cparam; int err; int i; int j; cparam = malloc(sizeof(*cparam), M_MLX5EN, M_WAITOK); mlx5e_build_channel_param(priv, cparam); for (i = 0; i < priv->params.num_channels; i++) { err = mlx5e_open_channel(priv, cparam, &priv->channel[i]); if (err) goto err_close_channels; /* Bind interrupt vectors, if any. */ if (priv->params_ethtool.irq_cpu_base > -1) { cpuset_t cpuset; int cpu; int irq; int eqn; int nirq; err = mlx5_vector2eqn(priv->mdev, i, &eqn, &nirq); /* error here is non-fatal */ if (err != 0) continue; irq = priv->mdev->priv.msix_arr[nirq].vector; cpu = (unsigned)(priv->params_ethtool.irq_cpu_base + i * priv->params_ethtool.irq_cpu_stride) % (unsigned)mp_ncpus; CPU_ZERO(&cpuset); CPU_SET(cpu, &cpuset); intr_setaffinity(irq, CPU_WHICH_INTRHANDLER, &cpuset); } } for (j = 0; j < priv->params.num_channels; j++) { err = mlx5e_wait_for_min_rx_wqes(&priv->channel[j].rq); if (err) goto err_close_channels; } free(cparam, M_MLX5EN); return (0); err_close_channels: while (i--) { mlx5e_close_channel(&priv->channel[i]); mlx5e_close_channel_wait(&priv->channel[i]); } free(cparam, M_MLX5EN); return (err); } static void mlx5e_close_channels(struct mlx5e_priv *priv) { int i; for (i = 0; i < priv->params.num_channels; i++) mlx5e_close_channel(&priv->channel[i]); for (i = 0; i < priv->params.num_channels; i++) mlx5e_close_channel_wait(&priv->channel[i]); } static int mlx5e_refresh_sq_params(struct mlx5e_priv *priv, struct mlx5e_sq *sq) { if (MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) { uint8_t cq_mode; switch (priv->params.tx_cq_moderation_mode) { case 0: case 2: cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE; break; default: cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE; break; } return (mlx5_core_modify_cq_moderation_mode(priv->mdev, &sq->cq.mcq, priv->params.tx_cq_moderation_usec, priv->params.tx_cq_moderation_pkts, cq_mode)); } return (mlx5_core_modify_cq_moderation(priv->mdev, &sq->cq.mcq, priv->params.tx_cq_moderation_usec, priv->params.tx_cq_moderation_pkts)); } static int mlx5e_refresh_rq_params(struct mlx5e_priv *priv, struct mlx5e_rq *rq) { if (MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) { uint8_t cq_mode; uint8_t dim_mode; int retval; switch (priv->params.rx_cq_moderation_mode) { case 0: case 2: cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE; dim_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE; break; default: cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE; dim_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE; break; } /* tear down dynamic interrupt moderation */ mtx_lock(&rq->mtx); rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED; mtx_unlock(&rq->mtx); /* wait for dynamic interrupt moderation work task, if any */ cancel_work_sync(&rq->dim.work); if (priv->params.rx_cq_moderation_mode >= 2) { struct net_dim_cq_moder curr; mlx5e_get_default_profile(priv, dim_mode, &curr); retval = mlx5_core_modify_cq_moderation_mode(priv->mdev, &rq->cq.mcq, curr.usec, curr.pkts, cq_mode); /* set dynamic interrupt moderation mode and zero defaults */ mtx_lock(&rq->mtx); rq->dim.mode = dim_mode; rq->dim.state = 0; rq->dim.profile_ix = MLX5E_DIM_DEFAULT_PROFILE; mtx_unlock(&rq->mtx); } else { retval = mlx5_core_modify_cq_moderation_mode(priv->mdev, &rq->cq.mcq, priv->params.rx_cq_moderation_usec, priv->params.rx_cq_moderation_pkts, cq_mode); } return (retval); } return (mlx5_core_modify_cq_moderation(priv->mdev, &rq->cq.mcq, priv->params.rx_cq_moderation_usec, priv->params.rx_cq_moderation_pkts)); } static int mlx5e_refresh_channel_params_sub(struct mlx5e_priv *priv, struct mlx5e_channel *c) { int err; int i; err = mlx5e_refresh_rq_params(priv, &c->rq); if (err) goto done; for (i = 0; i != priv->num_tc; i++) { err = mlx5e_refresh_sq_params(priv, &c->sq[i]); if (err) goto done; } done: return (err); } int mlx5e_refresh_channel_params(struct mlx5e_priv *priv) { int i; /* check if channels are closed */ if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) return (EINVAL); for (i = 0; i < priv->params.num_channels; i++) { int err; err = mlx5e_refresh_channel_params_sub(priv, &priv->channel[i]); if (err) return (err); } return (0); } static int mlx5e_open_tis(struct mlx5e_priv *priv, int tc) { struct mlx5_core_dev *mdev = priv->mdev; u32 in[MLX5_ST_SZ_DW(create_tis_in)]; void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); memset(in, 0, sizeof(in)); MLX5_SET(tisc, tisc, prio, tc); MLX5_SET(tisc, tisc, transport_domain, priv->tdn); return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->tisn[tc])); } static void mlx5e_close_tis(struct mlx5e_priv *priv, int tc) { mlx5_core_destroy_tis(priv->mdev, priv->tisn[tc], 0); } static int mlx5e_open_tises(struct mlx5e_priv *priv) { int num_tc = priv->num_tc; int err; int tc; for (tc = 0; tc < num_tc; tc++) { err = mlx5e_open_tis(priv, tc); if (err) goto err_close_tises; } return (0); err_close_tises: for (tc--; tc >= 0; tc--) mlx5e_close_tis(priv, tc); return (err); } static void mlx5e_close_tises(struct mlx5e_priv *priv) { int num_tc = priv->num_tc; int tc; for (tc = 0; tc < num_tc; tc++) mlx5e_close_tis(priv, tc); } static int mlx5e_open_default_rqt(struct mlx5e_priv *priv, u32 *prqtn, int sz) { u32 *in; void *rqtc; int inlen; int err; int i; inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz; in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); MLX5_SET(rqtc, rqtc, rqt_max_size, sz); for (i = 0; i != sz; i++) MLX5_SET(rqtc, rqtc, rq_num[i], priv->drop_rq.rqn); err = mlx5_core_create_rqt(priv->mdev, in, inlen, prqtn); kvfree(in); return (err); } static int mlx5e_open_rqts(struct mlx5e_priv *priv) { int err; int i; err = mlx5e_open_default_rqt(priv, &priv->rqtn, 1 << priv->params.rx_hash_log_tbl_sz); if (err) goto err_default; for (i = 0; i != priv->mdev->priv.eq_table.num_comp_vectors; i++) { err = mlx5e_open_default_rqt(priv, &priv->channel[i].rqtn, 1); if (err) goto err_channel; } return (0); err_channel: while (i--) mlx5_core_destroy_rqt(priv->mdev, priv->channel[i].rqtn, 0); mlx5_core_destroy_rqt(priv->mdev, priv->rqtn, 0); err_default: return (err); } static void mlx5e_close_rqts(struct mlx5e_priv *priv) { int i; for (i = 0; i != priv->mdev->priv.eq_table.num_comp_vectors; i++) mlx5_core_destroy_rqt(priv->mdev, priv->channel[i].rqtn, 0); mlx5_core_destroy_rqt(priv->mdev, priv->rqtn, 0); } static int mlx5e_activate_rqt(struct mlx5e_priv *priv) { u32 *in; void *rqtc; int inlen; int err; int sz; int i; sz = 1 << priv->params.rx_hash_log_tbl_sz; inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + sizeof(u32) * sz; in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx); MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); MLX5_SET(modify_rqt_in, in, bitmask.rqn_list, 1); for (i = 0; i != sz; i++) { int ix; #ifdef RSS ix = rss_get_indirection_to_bucket(i); #else ix = i; #endif /* ensure we don't overflow */ ix %= priv->params.num_channels; /* apply receive side scaling stride, if any */ ix -= ix % (int)priv->params.channels_rsss; MLX5_SET(rqtc, rqtc, rq_num[i], priv->channel[ix].rq.rqn); } err = mlx5_core_modify_rqt(priv->mdev, priv->rqtn, in, inlen); if (err) goto err_modify; inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + sizeof(u32); MLX5_SET(rqtc, rqtc, rqt_actual_size, 1); for (i = 0; i != priv->mdev->priv.eq_table.num_comp_vectors; i++) { int ix; #ifdef RSS ix = rss_get_indirection_to_bucket(i); #else ix = i; #endif /* ensure we don't overflow */ ix %= priv->params.num_channels; /* apply receive side scaling stride, if any */ ix -= ix % (int)priv->params.channels_rsss; MLX5_SET(rqtc, rqtc, rq_num[0], priv->channel[ix].rq.rqn); err = mlx5_core_modify_rqt(priv->mdev, priv->channel[i].rqtn, in, inlen); if (err) goto err_modify; } err_modify: kvfree(in); return (err); } static int mlx5e_deactivate_rqt(struct mlx5e_priv *priv) { u32 *in; void *rqtc; int inlen; int err; int sz; int i; sz = 1 << priv->params.rx_hash_log_tbl_sz; inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + sizeof(u32) * sz; in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx); MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); MLX5_SET(modify_rqt_in, in, bitmask.rqn_list, 1); for (i = 0; i != sz; i++) MLX5_SET(rqtc, rqtc, rq_num[i], priv->drop_rq.rqn); err = mlx5_core_modify_rqt(priv->mdev, priv->rqtn, in, inlen); if (err) goto err_modify; inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + sizeof(u32); MLX5_SET(rqtc, rqtc, rqt_actual_size, 1); for (i = 0; i != priv->mdev->priv.eq_table.num_comp_vectors; i++) { MLX5_SET(rqtc, rqtc, rq_num[0], priv->drop_rq.rqn); err = mlx5_core_modify_rqt(priv->mdev, priv->channel[i].rqtn, in, inlen); if (err) goto err_modify; } err_modify: kvfree(in); return (err); } #define MLX5E_RSS_KEY_SIZE (10 * 4) /* bytes */ static void mlx5e_get_rss_key(void *key_ptr) { #ifdef RSS rss_getkey(key_ptr); #else static const u32 rsskey[] = { cpu_to_be32(0xD181C62C), cpu_to_be32(0xF7F4DB5B), cpu_to_be32(0x1983A2FC), cpu_to_be32(0x943E1ADB), cpu_to_be32(0xD9389E6B), cpu_to_be32(0xD1039C2C), cpu_to_be32(0xA74499AD), cpu_to_be32(0x593D56D9), cpu_to_be32(0xF3253C06), cpu_to_be32(0x2ADC1FFC), }; CTASSERT(sizeof(rsskey) == MLX5E_RSS_KEY_SIZE); memcpy(key_ptr, rsskey, MLX5E_RSS_KEY_SIZE); #endif } static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 * tirc, int tt, bool inner_vxlan) { void *hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); void *hfsi = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner); void *hfs = inner_vxlan ? hfsi : hfso; __be32 *hkey; MLX5_SET(tirc, tirc, transport_domain, priv->tdn); #define ROUGH_MAX_L2_L3_HDR_SZ 256 #define MLX5_HASH_IP (MLX5_HASH_FIELD_SEL_SRC_IP |\ MLX5_HASH_FIELD_SEL_DST_IP) #define MLX5_HASH_ALL (MLX5_HASH_FIELD_SEL_SRC_IP |\ MLX5_HASH_FIELD_SEL_DST_IP |\ MLX5_HASH_FIELD_SEL_L4_SPORT |\ MLX5_HASH_FIELD_SEL_L4_DPORT) #define MLX5_HASH_IP_IPSEC_SPI (MLX5_HASH_FIELD_SEL_SRC_IP |\ MLX5_HASH_FIELD_SEL_DST_IP |\ MLX5_HASH_FIELD_SEL_IPSEC_SPI) if (priv->params.hw_lro_en) { MLX5_SET(tirc, tirc, lro_enable_mask, MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO | MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO); MLX5_SET(tirc, tirc, lro_max_msg_sz, (priv->params.lro_wqe_sz - ROUGH_MAX_L2_L3_HDR_SZ) >> 8); /* TODO: add the option to choose timer value dynamically */ MLX5_SET(tirc, tirc, lro_timeout_period_usecs, MLX5_CAP_ETH(priv->mdev, lro_timer_supported_periods[2])); } if (inner_vxlan) MLX5_SET(tirc, tirc, tunneled_offload_en, 1); /* * All packets must go through the indirection table, RQT, * because it is not possible to modify the RQN of the TIR * for direct dispatchment after it is created, typically * when the link goes up and down. */ MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); MLX5_SET(tirc, tirc, indirect_table, priv->rqtn); MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ); hkey = (__be32 *) MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); CTASSERT(MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key) >= MLX5E_RSS_KEY_SIZE); #ifdef RSS /* * The FreeBSD RSS implementation does currently not * support symmetric Toeplitz hashes: */ MLX5_SET(tirc, tirc, rx_hash_symmetric, 0); #else MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); #endif mlx5e_get_rss_key(hkey); switch (tt) { case MLX5E_TT_IPV4_TCP: MLX5_SET(rx_hash_field_select, hfs, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); MLX5_SET(rx_hash_field_select, hfs, l4_prot_type, MLX5_L4_PROT_TYPE_TCP); #ifdef RSS if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV4)) { MLX5_SET(rx_hash_field_select, hfs, selected_fields, MLX5_HASH_IP); } else #endif MLX5_SET(rx_hash_field_select, hfs, selected_fields, MLX5_HASH_ALL); break; case MLX5E_TT_IPV6_TCP: MLX5_SET(rx_hash_field_select, hfs, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); MLX5_SET(rx_hash_field_select, hfs, l4_prot_type, MLX5_L4_PROT_TYPE_TCP); #ifdef RSS if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV6)) { MLX5_SET(rx_hash_field_select, hfs, selected_fields, MLX5_HASH_IP); } else #endif MLX5_SET(rx_hash_field_select, hfs, selected_fields, MLX5_HASH_ALL); break; case MLX5E_TT_IPV4_UDP: MLX5_SET(rx_hash_field_select, hfs, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); MLX5_SET(rx_hash_field_select, hfs, l4_prot_type, MLX5_L4_PROT_TYPE_UDP); #ifdef RSS if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV4)) { MLX5_SET(rx_hash_field_select, hfs, selected_fields, MLX5_HASH_IP); } else #endif MLX5_SET(rx_hash_field_select, hfs, selected_fields, MLX5_HASH_ALL); break; case MLX5E_TT_IPV6_UDP: MLX5_SET(rx_hash_field_select, hfs, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); MLX5_SET(rx_hash_field_select, hfs, l4_prot_type, MLX5_L4_PROT_TYPE_UDP); #ifdef RSS if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV6)) { MLX5_SET(rx_hash_field_select, hfs, selected_fields, MLX5_HASH_IP); } else #endif MLX5_SET(rx_hash_field_select, hfs, selected_fields, MLX5_HASH_ALL); break; case MLX5E_TT_IPV4_IPSEC_AH: MLX5_SET(rx_hash_field_select, hfs, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); MLX5_SET(rx_hash_field_select, hfs, selected_fields, MLX5_HASH_IP_IPSEC_SPI); break; case MLX5E_TT_IPV6_IPSEC_AH: MLX5_SET(rx_hash_field_select, hfs, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); MLX5_SET(rx_hash_field_select, hfs, selected_fields, MLX5_HASH_IP_IPSEC_SPI); break; case MLX5E_TT_IPV4_IPSEC_ESP: MLX5_SET(rx_hash_field_select, hfs, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); MLX5_SET(rx_hash_field_select, hfs, selected_fields, MLX5_HASH_IP_IPSEC_SPI); break; case MLX5E_TT_IPV6_IPSEC_ESP: MLX5_SET(rx_hash_field_select, hfs, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); MLX5_SET(rx_hash_field_select, hfs, selected_fields, MLX5_HASH_IP_IPSEC_SPI); break; case MLX5E_TT_IPV4: MLX5_SET(rx_hash_field_select, hfs, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); MLX5_SET(rx_hash_field_select, hfs, selected_fields, MLX5_HASH_IP); break; case MLX5E_TT_IPV6: MLX5_SET(rx_hash_field_select, hfs, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); MLX5_SET(rx_hash_field_select, hfs, selected_fields, MLX5_HASH_IP); break; default: break; } } static int mlx5e_open_tir(struct mlx5e_priv *priv, int tt, bool inner_vxlan) { struct mlx5_core_dev *mdev = priv->mdev; u32 *in; void *tirc; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(create_tir_in); in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); tirc = MLX5_ADDR_OF(create_tir_in, in, tir_context); mlx5e_build_tir_ctx(priv, tirc, tt, inner_vxlan); err = mlx5_core_create_tir(mdev, in, inlen, inner_vxlan ? &priv->tirn_inner_vxlan[tt] : &priv->tirn[tt]); kvfree(in); return (err); } static void mlx5e_close_tir(struct mlx5e_priv *priv, int tt, bool inner_vxlan) { mlx5_core_destroy_tir(priv->mdev, inner_vxlan ? priv->tirn_inner_vxlan[tt] : priv->tirn[tt], 0); } static int mlx5e_open_tirs(struct mlx5e_priv *priv) { int err; int i; for (i = 0; i != 2 * MLX5E_NUM_TT; i++) { err = mlx5e_open_tir(priv, i / 2, (i % 2) ? true : false); if (err) goto err_close_tirs; } return (0); err_close_tirs: for (i--; i >= 0; i--) mlx5e_close_tir(priv, i / 2, (i % 2) ? true : false); return (err); } static void mlx5e_close_tirs(struct mlx5e_priv *priv) { int i; for (i = 0; i != 2 * MLX5E_NUM_TT; i++) mlx5e_close_tir(priv, i / 2, (i % 2) ? true : false); } /* * SW MTU does not include headers, * HW MTU includes all headers and checksums. */ static int -mlx5e_set_dev_port_mtu(struct ifnet *ifp, int sw_mtu) +mlx5e_set_dev_port_mtu(if_t ifp, int sw_mtu) { - struct mlx5e_priv *priv = ifp->if_softc; + struct mlx5e_priv *priv = if_getsoftc(ifp); struct mlx5_core_dev *mdev = priv->mdev; int hw_mtu; int err; hw_mtu = MLX5E_SW2HW_MTU(sw_mtu); err = mlx5_set_port_mtu(mdev, hw_mtu); if (err) { mlx5_en_err(ifp, "mlx5_set_port_mtu failed setting %d, err=%d\n", sw_mtu, err); return (err); } /* Update vport context MTU */ err = mlx5_set_vport_mtu(mdev, hw_mtu); if (err) { mlx5_en_err(ifp, "Failed updating vport context with MTU size, err=%d\n", err); } - ifp->if_mtu = sw_mtu; + if_setmtu(ifp, sw_mtu); err = mlx5_query_vport_mtu(mdev, &hw_mtu); if (err || !hw_mtu) { /* fallback to port oper mtu */ err = mlx5_query_port_oper_mtu(mdev, &hw_mtu); } if (err) { mlx5_en_err(ifp, "Query port MTU, after setting new MTU value, failed\n"); return (err); } else if (MLX5E_HW2SW_MTU(hw_mtu) < sw_mtu) { err = -E2BIG, mlx5_en_err(ifp, "Port MTU %d is smaller than ifp mtu %d\n", hw_mtu, sw_mtu); } else if (MLX5E_HW2SW_MTU(hw_mtu) > sw_mtu) { err = -EINVAL; mlx5_en_err(ifp, "Port MTU %d is bigger than ifp mtu %d\n", hw_mtu, sw_mtu); } priv->params_ethtool.hw_mtu = hw_mtu; /* compute MSB */ while (hw_mtu & (hw_mtu - 1)) hw_mtu &= (hw_mtu - 1); priv->params_ethtool.hw_mtu_msb = hw_mtu; return (err); } int -mlx5e_open_locked(struct ifnet *ifp) +mlx5e_open_locked(if_t ifp) { - struct mlx5e_priv *priv = ifp->if_softc; + struct mlx5e_priv *priv = if_getsoftc(ifp); int err; u16 set_id; /* check if already opened */ if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0) return (0); #ifdef RSS if (rss_getnumbuckets() > priv->params.num_channels) { mlx5_en_info(ifp, "NOTE: There are more RSS buckets(%u) than channels(%u) available\n", rss_getnumbuckets(), priv->params.num_channels); } #endif err = mlx5e_open_tises(priv); if (err) { mlx5_en_err(ifp, "mlx5e_open_tises failed, %d\n", err); return (err); } err = mlx5_vport_alloc_q_counter(priv->mdev, MLX5_INTERFACE_PROTOCOL_ETH, &set_id); if (err) { mlx5_en_err(priv->ifp, "mlx5_vport_alloc_q_counter failed: %d\n", err); goto err_close_tises; } /* store counter set ID */ priv->counter_set_id = set_id; err = mlx5e_open_channels(priv); if (err) { mlx5_en_err(ifp, "mlx5e_open_channels failed, %d\n", err); goto err_dalloc_q_counter; } err = mlx5e_activate_rqt(priv); if (err) { mlx5_en_err(ifp, "mlx5e_activate_rqt failed, %d\n", err); goto err_close_channels; } set_bit(MLX5E_STATE_OPENED, &priv->state); mlx5e_update_carrier(priv); return (0); err_close_channels: mlx5e_close_channels(priv); err_dalloc_q_counter: mlx5_vport_dealloc_q_counter(priv->mdev, MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id); err_close_tises: mlx5e_close_tises(priv); return (err); } static void mlx5e_open(void *arg) { struct mlx5e_priv *priv = arg; PRIV_LOCK(priv); if (mlx5_set_port_status(priv->mdev, MLX5_PORT_UP)) mlx5_en_err(priv->ifp, "Setting port status to up failed\n"); mlx5e_open_locked(priv->ifp); - priv->ifp->if_drv_flags |= IFF_DRV_RUNNING; + if_setdrvflagbits(priv->ifp, IFF_DRV_RUNNING, 0); PRIV_UNLOCK(priv); } int -mlx5e_close_locked(struct ifnet *ifp) +mlx5e_close_locked(if_t ifp) { - struct mlx5e_priv *priv = ifp->if_softc; + struct mlx5e_priv *priv = if_getsoftc(ifp); /* check if already closed */ if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) return (0); clear_bit(MLX5E_STATE_OPENED, &priv->state); if_link_state_change(priv->ifp, LINK_STATE_DOWN); mlx5e_deactivate_rqt(priv); mlx5e_close_channels(priv); mlx5_vport_dealloc_q_counter(priv->mdev, MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id); mlx5e_close_tises(priv); return (0); } static uint64_t -mlx5e_get_counter(struct ifnet *ifp, ift_counter cnt) +mlx5e_get_counter(if_t ifp, ift_counter cnt) { - struct mlx5e_priv *priv = ifp->if_softc; + struct mlx5e_priv *priv = if_getsoftc(ifp); u64 retval; /* PRIV_LOCK(priv); XXX not allowed */ switch (cnt) { case IFCOUNTER_IPACKETS: retval = priv->stats.vport.rx_packets; break; case IFCOUNTER_IERRORS: retval = priv->stats.pport.in_range_len_errors + priv->stats.pport.out_of_range_len + priv->stats.pport.too_long_errors + priv->stats.pport.check_seq_err + priv->stats.pport.alignment_err; break; case IFCOUNTER_IQDROPS: retval = priv->stats.vport.rx_out_of_buffer; break; case IFCOUNTER_OPACKETS: retval = priv->stats.vport.tx_packets; break; case IFCOUNTER_OERRORS: retval = priv->stats.port_stats_debug.out_discards; break; case IFCOUNTER_IBYTES: retval = priv->stats.vport.rx_bytes; break; case IFCOUNTER_OBYTES: retval = priv->stats.vport.tx_bytes; break; case IFCOUNTER_IMCASTS: retval = priv->stats.vport.rx_multicast_packets; break; case IFCOUNTER_OMCASTS: retval = priv->stats.vport.tx_multicast_packets; break; case IFCOUNTER_OQDROPS: retval = priv->stats.vport.tx_queue_dropped; break; case IFCOUNTER_COLLISIONS: retval = priv->stats.pport.collisions; break; default: retval = if_get_counter_default(ifp, cnt); break; } /* PRIV_UNLOCK(priv); XXX not allowed */ return (retval); } static void -mlx5e_set_rx_mode(struct ifnet *ifp) +mlx5e_set_rx_mode(if_t ifp) { - struct mlx5e_priv *priv = ifp->if_softc; + struct mlx5e_priv *priv = if_getsoftc(ifp); queue_work(priv->wq, &priv->set_rx_mode_work); } static int -mlx5e_ioctl(struct ifnet *ifp, u_long command, caddr_t data) +mlx5e_ioctl(if_t ifp, u_long command, caddr_t data) { struct mlx5e_priv *priv; struct ifreq *ifr; struct ifdownreason *ifdr; struct ifi2creq i2c; struct ifrsskey *ifrk; struct ifrsshash *ifrh; struct siocsifcapnv_driver_data *drv_ioctl_data, drv_ioctl_data_d; int error = 0; int mask; int size_read = 0; int module_status; int module_num; int max_mtu; uint8_t read_addr; - priv = ifp->if_softc; + priv = if_getsoftc(ifp); /* check if detaching */ if (priv == NULL || priv->gone != 0) return (ENXIO); switch (command) { case SIOCSIFMTU: ifr = (struct ifreq *)data; PRIV_LOCK(priv); mlx5_query_port_max_mtu(priv->mdev, &max_mtu); if (ifr->ifr_mtu >= MLX5E_MTU_MIN && ifr->ifr_mtu <= MIN(MLX5E_MTU_MAX, max_mtu)) { int was_opened; was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); if (was_opened) mlx5e_close_locked(ifp); /* set new MTU */ mlx5e_set_dev_port_mtu(ifp, ifr->ifr_mtu); if (was_opened) mlx5e_open_locked(ifp); } else { error = EINVAL; mlx5_en_err(ifp, "Invalid MTU value. Min val: %d, Max val: %d\n", MLX5E_MTU_MIN, MIN(MLX5E_MTU_MAX, max_mtu)); } PRIV_UNLOCK(priv); break; case SIOCSIFFLAGS: - if ((ifp->if_flags & IFF_UP) && - (ifp->if_drv_flags & IFF_DRV_RUNNING)) { + if ((if_getflags(ifp) & IFF_UP) && + (if_getdrvflags(ifp) & IFF_DRV_RUNNING)) { mlx5e_set_rx_mode(ifp); break; } PRIV_LOCK(priv); - if (ifp->if_flags & IFF_UP) { - if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + if (if_getflags(ifp) & IFF_UP) { + if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) mlx5e_open_locked(ifp); - ifp->if_drv_flags |= IFF_DRV_RUNNING; + if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0); mlx5_set_port_status(priv->mdev, MLX5_PORT_UP); } } else { - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { mlx5_set_port_status(priv->mdev, MLX5_PORT_DOWN); if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0) mlx5e_close_locked(ifp); mlx5e_update_carrier(priv); - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); } } PRIV_UNLOCK(priv); break; case SIOCADDMULTI: case SIOCDELMULTI: mlx5e_set_rx_mode(ifp); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: ifr = (struct ifreq *)data; error = ifmedia_ioctl(ifp, ifr, &priv->media, command); break; case SIOCGIFCAPNV: error = 0; break; case SIOCSIFCAP: ifr = (struct ifreq *)data; drv_ioctl_data = &drv_ioctl_data_d; drv_ioctl_data->reqcap = ifr->ifr_reqcap; PRIV_LOCK(priv); - drv_ioctl_data->reqcap2 = ifp->if_capenable2; + drv_ioctl_data->reqcap2 = if_getcapenable2(ifp); drv_ioctl_data->nvcap = NULL; goto siocsifcap_driver; case SIOCSIFCAPNV: drv_ioctl_data = (struct siocsifcapnv_driver_data *)data; PRIV_LOCK(priv); siocsifcap_driver: - mask = drv_ioctl_data->reqcap ^ ifp->if_capenable; + mask = drv_ioctl_data->reqcap ^ if_getcapenable(ifp); if (mask & IFCAP_TXCSUM) { - ifp->if_capenable ^= IFCAP_TXCSUM; - ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); + if_togglecapenable(ifp, IFCAP_TXCSUM); + if_togglehwassist(ifp, (CSUM_TCP | CSUM_UDP | CSUM_IP)); - if (IFCAP_TSO4 & ifp->if_capenable && - !(IFCAP_TXCSUM & ifp->if_capenable)) { + if (IFCAP_TSO4 & if_getcapenable(ifp) && + !(IFCAP_TXCSUM & if_getcapenable(ifp))) { mask &= ~IFCAP_TSO4; - ifp->if_capenable &= ~IFCAP_TSO4; - ifp->if_hwassist &= ~CSUM_IP_TSO; + if_setcapenablebit(ifp, 0, IFCAP_TSO4); + if_sethwassistbits(ifp, 0, CSUM_IP_TSO); mlx5_en_err(ifp, "tso4 disabled due to -txcsum.\n"); } } if (mask & IFCAP_TXCSUM_IPV6) { - ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; - ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); + if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6); + if_togglehwassist(ifp, (CSUM_UDP_IPV6 | CSUM_TCP_IPV6)); - if (IFCAP_TSO6 & ifp->if_capenable && - !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { + if (IFCAP_TSO6 & if_getcapenable(ifp) && + !(IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp))) { mask &= ~IFCAP_TSO6; - ifp->if_capenable &= ~IFCAP_TSO6; - ifp->if_hwassist &= ~CSUM_IP6_TSO; + if_setcapenablebit(ifp, 0, IFCAP_TSO6); + if_sethwassistbits(ifp, 0, CSUM_IP6_TSO); mlx5_en_err(ifp, "tso6 disabled due to -txcsum6.\n"); } } if (mask & IFCAP_MEXTPG) - ifp->if_capenable ^= IFCAP_MEXTPG; + if_togglecapenable(ifp, IFCAP_MEXTPG); if (mask & IFCAP_TXTLS4) - ifp->if_capenable ^= IFCAP_TXTLS4; + if_togglecapenable(ifp, IFCAP_TXTLS4); if (mask & IFCAP_TXTLS6) - ifp->if_capenable ^= IFCAP_TXTLS6; + if_togglecapenable(ifp, IFCAP_TXTLS6); #ifdef RATELIMIT if (mask & IFCAP_TXTLS_RTLMT) - ifp->if_capenable ^= IFCAP_TXTLS_RTLMT; + if_togglecapenable(ifp, IFCAP_TXTLS_RTLMT); #endif if (mask & IFCAP_RXCSUM) - ifp->if_capenable ^= IFCAP_RXCSUM; + if_togglecapenable(ifp, IFCAP_RXCSUM); if (mask & IFCAP_RXCSUM_IPV6) - ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; + if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6); if (mask & IFCAP_TSO4) { - if (!(IFCAP_TSO4 & ifp->if_capenable) && - !(IFCAP_TXCSUM & ifp->if_capenable)) { + if (!(IFCAP_TSO4 & if_getcapenable(ifp)) && + !(IFCAP_TXCSUM & if_getcapenable(ifp))) { mlx5_en_err(ifp, "enable txcsum first.\n"); error = EAGAIN; goto out; } - ifp->if_capenable ^= IFCAP_TSO4; - ifp->if_hwassist ^= CSUM_IP_TSO; + if_togglecapenable(ifp, IFCAP_TSO4); + if_togglehwassist(ifp, CSUM_IP_TSO); } if (mask & IFCAP_TSO6) { - if (!(IFCAP_TSO6 & ifp->if_capenable) && - !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { + if (!(IFCAP_TSO6 & if_getcapenable(ifp)) && + !(IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp))) { mlx5_en_err(ifp, "enable txcsum6 first.\n"); error = EAGAIN; goto out; } - ifp->if_capenable ^= IFCAP_TSO6; - ifp->if_hwassist ^= CSUM_IP6_TSO; + if_togglecapenable(ifp, IFCAP_TSO6); + if_togglehwassist(ifp, CSUM_IP6_TSO); } if (mask & IFCAP_VLAN_HWTSO) - ifp->if_capenable ^= IFCAP_VLAN_HWTSO; + if_togglecapenable(ifp, IFCAP_VLAN_HWTSO); if (mask & IFCAP_VLAN_HWFILTER) { - if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) + if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) mlx5e_disable_vlan_filter(priv); else mlx5e_enable_vlan_filter(priv); - ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; + if_togglecapenable(ifp, IFCAP_VLAN_HWFILTER); } if (mask & IFCAP_VLAN_HWTAGGING) - ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; + if_togglecapenable(ifp, IFCAP_VLAN_HWTAGGING); if (mask & IFCAP_WOL_MAGIC) - ifp->if_capenable ^= IFCAP_WOL_MAGIC; + if_togglecapenable(ifp, IFCAP_WOL_MAGIC); if (mask & IFCAP_VXLAN_HWCSUM) { const bool was_enabled = - (ifp->if_capenable & IFCAP_VXLAN_HWCSUM) != 0; + (if_getcapenable(ifp) & IFCAP_VXLAN_HWCSUM) != 0; if (was_enabled) mlx5e_del_all_vxlan_rules(priv); - ifp->if_capenable ^= IFCAP_VXLAN_HWCSUM; - ifp->if_hwassist ^= CSUM_INNER_IP | CSUM_INNER_IP_UDP | + if_togglecapenable(ifp, IFCAP_VXLAN_HWCSUM); + if_togglehwassist(ifp, CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | CSUM_INNER_IP6_UDP | - CSUM_INNER_IP6_TCP; + CSUM_INNER_IP6_TCP); if (!was_enabled) { int err = mlx5e_add_all_vxlan_rules(priv); if (err != 0) { mlx5_en_err(ifp, "mlx5e_add_all_vxlan_rules() failed, %d (ignored)\n", err); } } } if (mask & IFCAP_VXLAN_HWTSO) { - ifp->if_capenable ^= IFCAP_VXLAN_HWTSO; - ifp->if_hwassist ^= CSUM_INNER_IP_TSO | - CSUM_INNER_IP6_TSO; + if_togglecapenable(ifp, IFCAP_VXLAN_HWTSO); + if_togglehwassist(ifp, CSUM_INNER_IP_TSO | + CSUM_INNER_IP6_TSO); } VLAN_CAPABILITIES(ifp); /* turn off LRO means also turn of HW LRO - if it's on */ if (mask & IFCAP_LRO) { int was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); bool need_restart = false; - ifp->if_capenable ^= IFCAP_LRO; + if_togglecapenable(ifp, IFCAP_LRO); /* figure out if updating HW LRO is needed */ - if (!(ifp->if_capenable & IFCAP_LRO)) { + if (!(if_getcapenable(ifp) & IFCAP_LRO)) { if (priv->params.hw_lro_en) { priv->params.hw_lro_en = false; need_restart = true; } } else { if (priv->params.hw_lro_en == false && priv->params_ethtool.hw_lro != 0) { priv->params.hw_lro_en = true; need_restart = true; } } if (was_opened && need_restart) { mlx5e_close_locked(ifp); mlx5e_open_locked(ifp); } } if (mask & IFCAP_HWRXTSTMP) { - ifp->if_capenable ^= IFCAP_HWRXTSTMP; - if (ifp->if_capenable & IFCAP_HWRXTSTMP) { + if_togglecapenable(ifp, IFCAP_HWRXTSTMP); + if (if_getcapenable(ifp) & IFCAP_HWRXTSTMP) { if (priv->clbr_done == 0) mlx5e_reset_calibration_callout(priv); } else { callout_drain(&priv->tstmp_clbr); priv->clbr_done = 0; } } - mask = drv_ioctl_data->reqcap2 ^ ifp->if_capenable2; + mask = drv_ioctl_data->reqcap2 ^ if_getcapenable2(ifp); if ((mask & IFCAP2_BIT(IFCAP2_RXTLS4)) != 0) - ifp->if_capenable2 ^= IFCAP2_BIT(IFCAP2_RXTLS4); + if_togglecapenable2(ifp, IFCAP2_BIT(IFCAP2_RXTLS4)); if ((mask & IFCAP2_BIT(IFCAP2_RXTLS6)) != 0) - ifp->if_capenable2 ^= IFCAP2_BIT(IFCAP2_RXTLS6); + if_togglecapenable2(ifp, IFCAP2_BIT(IFCAP2_RXTLS6)); out: PRIV_UNLOCK(priv); break; case SIOCGI2C: ifr = (struct ifreq *)data; /* * Copy from the user-space address ifr_data to the * kernel-space address i2c */ error = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c)); if (error) break; if (i2c.len > sizeof(i2c.data)) { error = EINVAL; break; } PRIV_LOCK(priv); /* Get module_num which is required for the query_eeprom */ error = mlx5_query_module_num(priv->mdev, &module_num); if (error) { mlx5_en_err(ifp, "Query module num failed, eeprom reading is not supported\n"); error = EINVAL; goto err_i2c; } /* Check if module is present before doing an access */ module_status = mlx5_query_module_status(priv->mdev, module_num); if (module_status != MLX5_MODULE_STATUS_PLUGGED_ENABLED) { error = EINVAL; goto err_i2c; } /* * Currently 0XA0 and 0xA2 are the only addresses permitted. * The internal conversion is as follows: */ if (i2c.dev_addr == 0xA0) read_addr = MLX5_I2C_ADDR_LOW; else if (i2c.dev_addr == 0xA2) read_addr = MLX5_I2C_ADDR_HIGH; else { mlx5_en_err(ifp, "Query eeprom failed, Invalid Address: %X\n", i2c.dev_addr); error = EINVAL; goto err_i2c; } error = mlx5_query_eeprom(priv->mdev, read_addr, MLX5_EEPROM_LOW_PAGE, (uint32_t)i2c.offset, (uint32_t)i2c.len, module_num, (uint32_t *)i2c.data, &size_read); if (error) { mlx5_en_err(ifp, "Query eeprom failed, eeprom reading is not supported\n"); error = EINVAL; goto err_i2c; } if (i2c.len > MLX5_EEPROM_MAX_BYTES) { error = mlx5_query_eeprom(priv->mdev, read_addr, MLX5_EEPROM_LOW_PAGE, (uint32_t)(i2c.offset + size_read), (uint32_t)(i2c.len - size_read), module_num, (uint32_t *)(i2c.data + size_read), &size_read); } if (error) { mlx5_en_err(ifp, "Query eeprom failed, eeprom reading is not supported\n"); error = EINVAL; goto err_i2c; } error = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c)); err_i2c: PRIV_UNLOCK(priv); break; case SIOCGIFDOWNREASON: ifdr = (struct ifdownreason *)data; bzero(ifdr->ifdr_msg, sizeof(ifdr->ifdr_msg)); PRIV_LOCK(priv); error = -mlx5_query_pddr_troubleshooting_info(priv->mdev, NULL, ifdr->ifdr_msg, sizeof(ifdr->ifdr_msg)); PRIV_UNLOCK(priv); if (error == 0) ifdr->ifdr_reason = IFDR_REASON_MSG; break; case SIOCGIFRSSKEY: ifrk = (struct ifrsskey *)data; ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; ifrk->ifrk_keylen = MLX5E_RSS_KEY_SIZE; CTASSERT(sizeof(ifrk->ifrk_key) >= MLX5E_RSS_KEY_SIZE); mlx5e_get_rss_key(ifrk->ifrk_key); break; case SIOCGIFRSSHASH: ifrh = (struct ifrsshash *)data; ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; ifrh->ifrh_types = RSS_TYPE_IPV4 | RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4 | RSS_TYPE_IPV6 | RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6; break; default: error = ether_ioctl(ifp, command, data); break; } return (error); } static int mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev) { /* * TODO: uncoment once FW really sets all these bits if * (!mdev->caps.eth.rss_ind_tbl_cap || !mdev->caps.eth.csum_cap || * !mdev->caps.eth.max_lso_cap || !mdev->caps.eth.vlan_cap || * !(mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_SCQE_BRK_MOD)) return * -ENOTSUPP; */ /* TODO: add more must-to-have features */ if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) return (-ENODEV); return (0); } static u16 mlx5e_get_max_inline_cap(struct mlx5_core_dev *mdev) { const int min_size = ETHER_VLAN_ENCAP_LEN + ETHER_HDR_LEN; const int max_size = MLX5E_MAX_TX_INLINE; const int bf_buf_size = ((1U << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2U) - (sizeof(struct mlx5e_tx_wqe) - 2); /* verify against driver limits */ if (bf_buf_size > max_size) return (max_size); else if (bf_buf_size < min_size) return (min_size); else return (bf_buf_size); } static int mlx5e_build_ifp_priv(struct mlx5_core_dev *mdev, struct mlx5e_priv *priv, int num_comp_vectors) { int err; /* * TODO: Consider link speed for setting "log_sq_size", * "log_rq_size" and "cq_moderation_xxx": */ priv->params.log_sq_size = MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE; priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE; priv->params.rx_cq_moderation_usec = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE : MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC; priv->params.rx_cq_moderation_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? 1 : 0; priv->params.rx_cq_moderation_pkts = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS; priv->params.tx_cq_moderation_usec = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC; priv->params.tx_cq_moderation_pkts = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS; priv->params.min_rx_wqes = MLX5E_PARAMS_DEFAULT_MIN_RX_WQES; priv->params.rx_hash_log_tbl_sz = (order_base_2(num_comp_vectors) > MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ) ? order_base_2(num_comp_vectors) : MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ; priv->params.num_tc = 1; priv->params.default_vlan_prio = 0; priv->counter_set_id = -1; priv->params.tx_max_inline = mlx5e_get_max_inline_cap(mdev); err = mlx5_query_min_inline(mdev, &priv->params.tx_min_inline_mode); if (err) return (err); /* * hw lro is currently defaulted to off. when it won't anymore we * will consider the HW capability: "!!MLX5_CAP_ETH(mdev, lro_cap)" */ priv->params.hw_lro_en = false; priv->params.lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ; /* * CQE zipping is currently defaulted to off. when it won't * anymore we will consider the HW capability: * "!!MLX5_CAP_GEN(mdev, cqe_compression)" */ priv->params.cqe_zipping_en = false; priv->mdev = mdev; priv->params.num_channels = num_comp_vectors; priv->params.channels_rsss = 1; priv->order_base_2_num_channels = order_base_2(num_comp_vectors); priv->queue_mapping_channel_mask = roundup_pow_of_two(num_comp_vectors) - 1; priv->num_tc = priv->params.num_tc; priv->default_vlan_prio = priv->params.default_vlan_prio; INIT_WORK(&priv->update_stats_work, mlx5e_update_stats_work); INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work); INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work); return (0); } static void mlx5e_mkey_set_relaxed_ordering(struct mlx5_core_dev *mdev, void *mkc) { bool ro_pci_enable = pci_get_relaxed_ordering_enabled(mdev->pdev->dev.bsddev); bool ro_write = MLX5_CAP_GEN(mdev, relaxed_ordering_write); bool ro_read = MLX5_CAP_GEN(mdev, relaxed_ordering_read); MLX5_SET(mkc, mkc, relaxed_ordering_read, ro_pci_enable && ro_read); MLX5_SET(mkc, mkc, relaxed_ordering_write, ro_pci_enable && ro_write); } static int mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn, struct mlx5_core_mkey *mkey) { - struct ifnet *ifp = priv->ifp; + if_t ifp = priv->ifp; struct mlx5_core_dev *mdev = priv->mdev; int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); void *mkc; u32 *in; int err; in = mlx5_vzalloc(inlen); if (in == NULL) { mlx5_en_err(ifp, "failed to allocate inbox\n"); return (-ENOMEM); } mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_PA); MLX5_SET(mkc, mkc, umr_en, 1); /* used by HW TLS */ MLX5_SET(mkc, mkc, lw, 1); MLX5_SET(mkc, mkc, lr, 1); mlx5e_mkey_set_relaxed_ordering(mdev, mkc); MLX5_SET(mkc, mkc, pd, pdn); MLX5_SET(mkc, mkc, length64, 1); MLX5_SET(mkc, mkc, qpn, 0xffffff); err = mlx5_core_create_mkey(mdev, mkey, in, inlen); if (err) mlx5_en_err(ifp, "mlx5_core_create_mkey failed, %d\n", err); kvfree(in); return (err); } static const char *mlx5e_vport_stats_desc[] = { MLX5E_VPORT_STATS(MLX5E_STATS_DESC) }; static const char *mlx5e_pport_stats_desc[] = { MLX5E_PPORT_STATS(MLX5E_STATS_DESC) }; static int mlx5e_priv_static_init(struct mlx5e_priv *priv, struct mlx5_core_dev *mdev, const uint32_t channels) { uint32_t x; int err; mtx_init(&priv->async_events_mtx, "mlx5async", MTX_NETWORK_LOCK, MTX_DEF); sx_init(&priv->state_lock, "mlx5state"); callout_init_mtx(&priv->watchdog, &priv->async_events_mtx, 0); MLX5_INIT_DOORBELL_LOCK(&priv->doorbell_lock); for (x = 0; x != channels; x++) mlx5e_chan_static_init(priv, &priv->channel[x], x); for (x = 0; x != channels; x++) { err = mlx5_alloc_bfreg(mdev, &priv->channel[x].bfreg, false, false); if (err) goto err_alloc_bfreg; } return (0); err_alloc_bfreg: while (x--) mlx5_free_bfreg(mdev, &priv->channel[x].bfreg); for (x = 0; x != channels; x++) mlx5e_chan_static_destroy(&priv->channel[x]); callout_drain(&priv->watchdog); mtx_destroy(&priv->async_events_mtx); sx_destroy(&priv->state_lock); return (err); } static void mlx5e_priv_static_destroy(struct mlx5e_priv *priv, struct mlx5_core_dev *mdev, const uint32_t channels) { uint32_t x; for (x = 0; x != channels; x++) mlx5_free_bfreg(mdev, &priv->channel[x].bfreg); for (x = 0; x != channels; x++) mlx5e_chan_static_destroy(&priv->channel[x]); callout_drain(&priv->watchdog); mtx_destroy(&priv->async_events_mtx); sx_destroy(&priv->state_lock); } static int sysctl_firmware(SYSCTL_HANDLER_ARGS) { /* * %d.%d%.d the string format. * fw_rev_{maj,min,sub} return u16, 2^16 = 65536. * We need at most 5 chars to store that. * It also has: two "." and NULL at the end, which means we need 18 * (5*3 + 3) chars at most. */ char fw[18]; struct mlx5e_priv *priv = arg1; int error; snprintf(fw, sizeof(fw), "%d.%d.%d", fw_rev_maj(priv->mdev), fw_rev_min(priv->mdev), fw_rev_sub(priv->mdev)); error = sysctl_handle_string(oidp, fw, sizeof(fw), req); return (error); } static void mlx5e_disable_tx_dma(struct mlx5e_channel *ch) { int i; for (i = 0; i < ch->priv->num_tc; i++) mlx5e_drain_sq(&ch->sq[i]); } static void mlx5e_reset_sq_doorbell_record(struct mlx5e_sq *sq) { sq->doorbell.d32[0] = cpu_to_be32(MLX5_OPCODE_NOP); sq->doorbell.d32[1] = cpu_to_be32(sq->sqn << 8); mlx5e_tx_notify_hw(sq, true); } void mlx5e_resume_sq(struct mlx5e_sq *sq) { int err; /* check if already enabled */ if (READ_ONCE(sq->running) != 0) return; err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_ERR, MLX5_SQC_STATE_RST); if (err != 0) { mlx5_en_err(sq->ifp, "mlx5e_modify_sq() from ERR to RST failed: %d\n", err); } sq->cc = 0; sq->pc = 0; /* reset doorbell prior to moving from RST to RDY */ mlx5e_reset_sq_doorbell_record(sq); err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY); if (err != 0) { mlx5_en_err(sq->ifp, "mlx5e_modify_sq() from RST to RDY failed: %d\n", err); } sq->cev_next_state = MLX5E_CEV_STATE_INITIAL; WRITE_ONCE(sq->running, 1); } static void mlx5e_enable_tx_dma(struct mlx5e_channel *ch) { int i; for (i = 0; i < ch->priv->num_tc; i++) mlx5e_resume_sq(&ch->sq[i]); } static void mlx5e_disable_rx_dma(struct mlx5e_channel *ch) { struct mlx5e_rq *rq = &ch->rq; struct epoch_tracker et; int err; mtx_lock(&rq->mtx); rq->enabled = 0; callout_stop(&rq->watchdog); mtx_unlock(&rq->mtx); err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR); if (err != 0) { mlx5_en_err(rq->ifp, "mlx5e_modify_rq() from RDY to RST failed: %d\n", err); } while (!mlx5_wq_ll_is_empty(&rq->wq)) { msleep(1); NET_EPOCH_ENTER(et); rq->cq.mcq.comp(&rq->cq.mcq, NULL); NET_EPOCH_EXIT(et); } /* * Transitioning into RST state will allow the FW to track less ERR state queues, * thus reducing the recv queue flushing time */ err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_ERR, MLX5_RQC_STATE_RST); if (err != 0) { mlx5_en_err(rq->ifp, "mlx5e_modify_rq() from ERR to RST failed: %d\n", err); } } static void mlx5e_enable_rx_dma(struct mlx5e_channel *ch) { struct mlx5e_rq *rq = &ch->rq; struct epoch_tracker et; int err; rq->wq.wqe_ctr = 0; mlx5_wq_ll_update_db_record(&rq->wq); err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); if (err != 0) { mlx5_en_err(rq->ifp, "mlx5e_modify_rq() from RST to RDY failed: %d\n", err); } rq->enabled = 1; NET_EPOCH_ENTER(et); rq->cq.mcq.comp(&rq->cq.mcq, NULL); NET_EPOCH_EXIT(et); } void mlx5e_modify_tx_dma(struct mlx5e_priv *priv, uint8_t value) { int i; if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) return; for (i = 0; i < priv->params.num_channels; i++) { if (value) mlx5e_disable_tx_dma(&priv->channel[i]); else mlx5e_enable_tx_dma(&priv->channel[i]); } } void mlx5e_modify_rx_dma(struct mlx5e_priv *priv, uint8_t value) { int i; if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) return; for (i = 0; i < priv->params.num_channels; i++) { if (value) mlx5e_disable_rx_dma(&priv->channel[i]); else mlx5e_enable_rx_dma(&priv->channel[i]); } } static void mlx5e_add_hw_stats(struct mlx5e_priv *priv) { SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_hw), OID_AUTO, "fw_version", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, priv, 0, sysctl_firmware, "A", "HCA firmware version"); SYSCTL_ADD_STRING(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_hw), OID_AUTO, "board_id", CTLFLAG_RD, priv->mdev->board_id, 0, "Board ID"); } static int mlx5e_sysctl_tx_priority_flow_control(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; uint8_t temp[MLX5E_MAX_PRIORITY]; uint32_t tx_pfc; int err; int i; PRIV_LOCK(priv); tx_pfc = priv->params.tx_priority_flow_control; for (i = 0; i != MLX5E_MAX_PRIORITY; i++) temp[i] = (tx_pfc >> i) & 1; err = SYSCTL_OUT(req, temp, MLX5E_MAX_PRIORITY); if (err || !req->newptr) goto done; err = SYSCTL_IN(req, temp, MLX5E_MAX_PRIORITY); if (err) goto done; priv->params.tx_priority_flow_control = 0; /* range check input value */ for (i = 0; i != MLX5E_MAX_PRIORITY; i++) { if (temp[i] > 1) { err = ERANGE; goto done; } priv->params.tx_priority_flow_control |= (temp[i] << i); } /* check if update is required */ if (tx_pfc != priv->params.tx_priority_flow_control) err = -mlx5e_set_port_pfc(priv); done: if (err != 0) priv->params.tx_priority_flow_control= tx_pfc; PRIV_UNLOCK(priv); return (err); } static int mlx5e_sysctl_rx_priority_flow_control(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; uint8_t temp[MLX5E_MAX_PRIORITY]; uint32_t rx_pfc; int err; int i; PRIV_LOCK(priv); rx_pfc = priv->params.rx_priority_flow_control; for (i = 0; i != MLX5E_MAX_PRIORITY; i++) temp[i] = (rx_pfc >> i) & 1; err = SYSCTL_OUT(req, temp, MLX5E_MAX_PRIORITY); if (err || !req->newptr) goto done; err = SYSCTL_IN(req, temp, MLX5E_MAX_PRIORITY); if (err) goto done; priv->params.rx_priority_flow_control = 0; /* range check input value */ for (i = 0; i != MLX5E_MAX_PRIORITY; i++) { if (temp[i] > 1) { err = ERANGE; goto done; } priv->params.rx_priority_flow_control |= (temp[i] << i); } /* check if update is required */ if (rx_pfc != priv->params.rx_priority_flow_control) { err = -mlx5e_set_port_pfc(priv); if (err == 0 && priv->sw_is_port_buf_owner) err = mlx5e_update_buf_lossy(priv); } done: if (err != 0) priv->params.rx_priority_flow_control= rx_pfc; PRIV_UNLOCK(priv); return (err); } static void mlx5e_setup_pauseframes(struct mlx5e_priv *priv) { int error; /* enable pauseframes by default */ priv->params.tx_pauseframe_control = 1; priv->params.rx_pauseframe_control = 1; /* disable ports flow control, PFC, by default */ priv->params.tx_priority_flow_control = 0; priv->params.rx_priority_flow_control = 0; /* register pauseframe SYSCTLs */ SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "tx_pauseframe_control", CTLFLAG_RDTUN, &priv->params.tx_pauseframe_control, 0, "Set to enable TX pause frames. Clear to disable."); SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "rx_pauseframe_control", CTLFLAG_RDTUN, &priv->params.rx_pauseframe_control, 0, "Set to enable RX pause frames. Clear to disable."); /* register priority flow control, PFC, SYSCTLs */ SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "tx_priority_flow_control", CTLTYPE_U8 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, 0, &mlx5e_sysctl_tx_priority_flow_control, "CU", "Set to enable TX ports flow control frames for priorities 0..7. Clear to disable."); SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "rx_priority_flow_control", CTLTYPE_U8 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, 0, &mlx5e_sysctl_rx_priority_flow_control, "CU", "Set to enable RX ports flow control frames for priorities 0..7. Clear to disable."); PRIV_LOCK(priv); /* range check */ priv->params.tx_pauseframe_control = priv->params.tx_pauseframe_control ? 1 : 0; priv->params.rx_pauseframe_control = priv->params.rx_pauseframe_control ? 1 : 0; /* update firmware */ error = mlx5e_set_port_pause_and_pfc(priv); if (error == -EINVAL) { mlx5_en_err(priv->ifp, "Global pauseframes must be disabled before enabling PFC.\n"); priv->params.rx_priority_flow_control = 0; priv->params.tx_priority_flow_control = 0; /* update firmware */ (void) mlx5e_set_port_pause_and_pfc(priv); } PRIV_UNLOCK(priv); } static int -mlx5e_ul_snd_tag_alloc(struct ifnet *ifp, +mlx5e_ul_snd_tag_alloc(if_t ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { struct mlx5e_priv *priv; struct mlx5e_channel *pch; - priv = ifp->if_softc; + priv = if_getsoftc(ifp); if (unlikely(priv->gone || params->hdr.flowtype == M_HASHTYPE_NONE)) { return (EOPNOTSUPP); } else { /* keep this code synced with mlx5e_select_queue() */ u32 ch = priv->params.num_channels; #ifdef RSS u32 temp; if (rss_hash2bucket(params->hdr.flowid, params->hdr.flowtype, &temp) == 0) ch = temp % ch; else #endif ch = (params->hdr.flowid % 128) % ch; /* * NOTE: The channels array is only freed at detach * and it safe to return a pointer to the send tag * inside the channels structure as long as we * reference the priv. */ pch = priv->channel + ch; /* check if send queue is not running */ if (unlikely(pch->sq[0].running == 0)) return (ENXIO); m_snd_tag_ref(&pch->tag); *ppmt = &pch->tag; return (0); } } static int mlx5e_ul_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params) { struct mlx5e_channel *pch = container_of(pmt, struct mlx5e_channel, tag); params->unlimited.max_rate = -1ULL; params->unlimited.queue_level = mlx5e_sq_queue_level(&pch->sq[0]); return (0); } static void mlx5e_ul_snd_tag_free(struct m_snd_tag *pmt) { struct mlx5e_channel *pch = container_of(pmt, struct mlx5e_channel, tag); complete(&pch->completion); } static int -mlx5e_snd_tag_alloc(struct ifnet *ifp, +mlx5e_snd_tag_alloc(if_t ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { switch (params->hdr.type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: return (mlx5e_rl_snd_tag_alloc(ifp, params, ppmt)); #ifdef KERN_TLS case IF_SND_TAG_TYPE_TLS_RATE_LIMIT: return (mlx5e_tls_snd_tag_alloc(ifp, params, ppmt)); #endif #endif case IF_SND_TAG_TYPE_UNLIMITED: return (mlx5e_ul_snd_tag_alloc(ifp, params, ppmt)); #ifdef KERN_TLS case IF_SND_TAG_TYPE_TLS: return (mlx5e_tls_snd_tag_alloc(ifp, params, ppmt)); case IF_SND_TAG_TYPE_TLS_RX: return (mlx5e_tls_rx_snd_tag_alloc(ifp, params, ppmt)); #endif default: return (EOPNOTSUPP); } } #ifdef RATELIMIT #define NUM_HDWR_RATES_MLX 13 static const uint64_t adapter_rates_mlx[NUM_HDWR_RATES_MLX] = { 135375, /* 1,083,000 */ 180500, /* 1,444,000 */ 270750, /* 2,166,000 */ 361000, /* 2,888,000 */ 541500, /* 4,332,000 */ 721875, /* 5,775,000 */ 1082875, /* 8,663,000 */ 1443875, /* 11,551,000 */ 2165750, /* 17,326,000 */ 2887750, /* 23,102,000 */ 4331625, /* 34,653,000 */ 5775500, /* 46,204,000 */ 8663125 /* 69,305,000 */ }; static void -mlx5e_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q) +mlx5e_ratelimit_query(if_t ifp __unused, struct if_ratelimit_query_results *q) { /* * This function needs updating by the driver maintainer! * For the MLX card there are currently (ConectX-4?) 13 * pre-set rates and others i.e. ConnectX-5, 6, 7?? * * This will change based on later adapters * and this code should be updated to look at ifp * and figure out the specific adapter type * settings i.e. how many rates as well * as if they are fixed (as is shown here) or * if they are dynamic (example chelsio t4). Also if there * is a maximum number of flows that the adapter * can handle that too needs to be updated in * the max_flows field. */ q->rate_table = adapter_rates_mlx; q->flags = RT_IS_FIXED_TABLE; q->max_flows = 0; /* mlx has no limit */ q->number_of_rates = NUM_HDWR_RATES_MLX; q->min_segment_burst = 1; } #endif static void mlx5e_ifm_add(struct mlx5e_priv *priv, int type) { ifmedia_add(&priv->media, type | IFM_ETHER, 0, NULL); ifmedia_add(&priv->media, type | IFM_ETHER | IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE, 0, NULL); ifmedia_add(&priv->media, type | IFM_ETHER | IFM_ETH_RXPAUSE, 0, NULL); ifmedia_add(&priv->media, type | IFM_ETHER | IFM_ETH_TXPAUSE, 0, NULL); ifmedia_add(&priv->media, type | IFM_ETHER | IFM_FDX, 0, NULL); ifmedia_add(&priv->media, type | IFM_ETHER | IFM_FDX | IFM_ETH_RXPAUSE, 0, NULL); ifmedia_add(&priv->media, type | IFM_ETHER | IFM_FDX | IFM_ETH_TXPAUSE, 0, NULL); ifmedia_add(&priv->media, type | IFM_ETHER | IFM_FDX | IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE, 0, NULL); } static void * mlx5e_create_ifp(struct mlx5_core_dev *mdev) { - struct ifnet *ifp; + if_t ifp; struct mlx5e_priv *priv; u8 dev_addr[ETHER_ADDR_LEN] __aligned(4); struct sysctl_oid_list *child; int ncv = mdev->priv.eq_table.num_comp_vectors; char unit[16]; struct pfil_head_args pa; int err; u32 eth_proto_cap; u32 out[MLX5_ST_SZ_DW(ptys_reg)]; bool ext; struct media media_entry = {}; if (mlx5e_check_required_hca_cap(mdev)) { mlx5_core_dbg(mdev, "mlx5e_check_required_hca_cap() failed\n"); return (NULL); } /* * Try to allocate the priv and make room for worst-case * number of channel structures: */ priv = malloc_domainset(sizeof(*priv) + (sizeof(priv->channel[0]) * mdev->priv.eq_table.num_comp_vectors), M_MLX5EN, mlx5_dev_domainset(mdev), M_WAITOK | M_ZERO); ifp = priv->ifp = if_alloc_dev(IFT_ETHER, mdev->pdev->dev.bsddev); if (ifp == NULL) { mlx5_core_err(mdev, "if_alloc() failed\n"); goto err_free_priv; } /* setup all static fields */ if (mlx5e_priv_static_init(priv, mdev, mdev->priv.eq_table.num_comp_vectors)) { mlx5_core_err(mdev, "mlx5e_priv_static_init() failed\n"); goto err_free_ifp; } - ifp->if_softc = priv; + if_setsoftc(ifp, priv); if_initname(ifp, "mce", device_get_unit(mdev->pdev->dev.bsddev)); - ifp->if_mtu = ETHERMTU; - ifp->if_init = mlx5e_open; - ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | - IFF_KNOWSEPOCH; - ifp->if_ioctl = mlx5e_ioctl; - ifp->if_transmit = mlx5e_xmit; - ifp->if_qflush = if_qflush; - ifp->if_get_counter = mlx5e_get_counter; - ifp->if_snd.ifq_maxlen = ifqmaxlen; + if_setmtu(ifp, ETHERMTU); + if_setinitfn(ifp, mlx5e_open); + if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | + IFF_KNOWSEPOCH); + if_setioctlfn(ifp, mlx5e_ioctl); + if_settransmitfn(ifp, mlx5e_xmit); + if_setqflushfn(ifp, if_qflush); + if_setgetcounterfn(ifp, mlx5e_get_counter); + if_setsendqlen(ifp, ifqmaxlen); /* * Set driver features */ - ifp->if_capabilities |= IFCAP_NV; - ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6; - ifp->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING; - ifp->if_capabilities |= IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER; - ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU; - ifp->if_capabilities |= IFCAP_LRO; - ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO; - ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP; - ifp->if_capabilities |= IFCAP_MEXTPG; - ifp->if_capabilities |= IFCAP_TXTLS4 | IFCAP_TXTLS6; + if_setcapabilities(ifp, IFCAP_NV); + if_setcapabilitiesbit(ifp, IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6, 0); + if_setcapabilitiesbit(ifp, IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING, 0); + if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER, 0); + if_setcapabilitiesbit(ifp, IFCAP_LINKSTATE | IFCAP_JUMBO_MTU, 0); + if_setcapabilitiesbit(ifp, IFCAP_LRO, 0); + if_setcapabilitiesbit(ifp, IFCAP_TSO | IFCAP_VLAN_HWTSO, 0); + if_setcapabilitiesbit(ifp, IFCAP_HWSTATS | IFCAP_HWRXTSTMP, 0); + if_setcapabilitiesbit(ifp, IFCAP_MEXTPG, 0); + if_setcapabilitiesbit(ifp, IFCAP_TXTLS4 | IFCAP_TXTLS6, 0); #ifdef RATELIMIT - ifp->if_capabilities |= IFCAP_TXRTLMT | IFCAP_TXTLS_RTLMT; + if_setcapabilitiesbit(ifp, IFCAP_TXRTLMT | IFCAP_TXTLS_RTLMT, 0); #endif - ifp->if_capabilities |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO; - ifp->if_capabilities2 |= IFCAP2_BIT(IFCAP2_RXTLS4) | - IFCAP2_BIT(IFCAP2_RXTLS6); - ifp->if_snd_tag_alloc = mlx5e_snd_tag_alloc; + if_setcapabilitiesbit(ifp, IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO, 0); + if_setcapabilities2bit(ifp, IFCAP2_BIT(IFCAP2_RXTLS4) | + IFCAP2_BIT(IFCAP2_RXTLS6), 0); + if_setsndtagallocfn(ifp, mlx5e_snd_tag_alloc); #ifdef RATELIMIT - ifp->if_ratelimit_query = mlx5e_ratelimit_query; + if_setratelimitqueryfn(ifp, mlx5e_ratelimit_query); #endif /* set TSO limits so that we don't have to drop TX packets */ - ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); - ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */; - ifp->if_hw_tsomaxsegsize = MLX5E_MAX_TX_MBUF_SIZE; - - ifp->if_capenable = ifp->if_capabilities; - ifp->if_capenable2 = ifp->if_capabilities2; - ifp->if_hwassist = 0; - if (ifp->if_capenable & IFCAP_TSO) - ifp->if_hwassist |= CSUM_TSO; - if (ifp->if_capenable & IFCAP_TXCSUM) - ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP); - if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) - ifp->if_hwassist |= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); - if (ifp->if_capabilities & IFCAP_VXLAN_HWCSUM) - ifp->if_hwassist |= CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | + if_sethwtsomax(ifp, MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); + if_sethwtsomaxsegcount(ifp, MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */); + if_sethwtsomaxsegsize(ifp, MLX5E_MAX_TX_MBUF_SIZE); + + if_setcapenable(ifp, if_getcapabilities(ifp)); + if_setcapenable2(ifp, if_getcapabilities2(ifp)); + if_sethwassist(ifp, 0); + if (if_getcapenable(ifp) & IFCAP_TSO) + if_sethwassistbits(ifp, CSUM_TSO, 0); + if (if_getcapenable(ifp) & IFCAP_TXCSUM) + if_sethwassistbits(ifp, (CSUM_TCP | CSUM_UDP | CSUM_IP), 0); + if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) + if_sethwassistbits(ifp, (CSUM_UDP_IPV6 | CSUM_TCP_IPV6), 0); + if (if_getcapabilities(ifp) & IFCAP_VXLAN_HWCSUM) + if_sethwassistbits(ifp, CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | - CSUM_ENCAP_VXLAN; - if (ifp->if_capabilities & IFCAP_VXLAN_HWTSO) - ifp->if_hwassist |= CSUM_INNER_IP6_TSO | CSUM_INNER_IP_TSO; + CSUM_ENCAP_VXLAN, 0); + if (if_getcapabilities(ifp) & IFCAP_VXLAN_HWTSO) + if_sethwassistbits(ifp, CSUM_INNER_IP6_TSO | CSUM_INNER_IP_TSO, 0); /* ifnet sysctl tree */ sysctl_ctx_init(&priv->sysctl_ctx); priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dev), - OID_AUTO, ifp->if_dname, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + OID_AUTO, if_getdname(ifp), CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "MLX5 ethernet - interface name"); if (priv->sysctl_ifnet == NULL) { mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n"); goto err_free_sysctl; } - snprintf(unit, sizeof(unit), "%d", ifp->if_dunit); + snprintf(unit, sizeof(unit), "%d", if_getdunit(ifp)); priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, unit, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "MLX5 ethernet - interface unit"); if (priv->sysctl_ifnet == NULL) { mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n"); goto err_free_sysctl; } /* HW sysctl tree */ child = SYSCTL_CHILDREN(device_get_sysctl_tree(mdev->pdev->dev.bsddev)); priv->sysctl_hw = SYSCTL_ADD_NODE(&priv->sysctl_ctx, child, OID_AUTO, "hw", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "MLX5 ethernet dev hw"); if (priv->sysctl_hw == NULL) { mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n"); goto err_free_sysctl; } err = mlx5e_build_ifp_priv(mdev, priv, ncv); if (err) { mlx5_core_err(mdev, "mlx5e_build_ifp_priv() failed (%d)\n", err); goto err_free_sysctl; } /* reuse mlx5core's watchdog workqueue */ priv->wq = mdev->priv.health.wq_watchdog; err = mlx5_core_alloc_pd(mdev, &priv->pdn, 0); if (err) { mlx5_en_err(ifp, "mlx5_core_alloc_pd failed, %d\n", err); goto err_free_wq; } err = mlx5_alloc_transport_domain(mdev, &priv->tdn, 0); if (err) { mlx5_en_err(ifp, "mlx5_alloc_transport_domain failed, %d\n", err); goto err_dealloc_pd; } err = mlx5e_create_mkey(priv, priv->pdn, &priv->mr); if (err) { mlx5_en_err(ifp, "mlx5e_create_mkey failed, %d\n", err); goto err_dealloc_transport_domain; } mlx5_query_nic_vport_mac_address(priv->mdev, 0, dev_addr); /* check if we should generate a random MAC address */ if (MLX5_CAP_GEN(priv->mdev, vport_group_manager) == 0 && is_zero_ether_addr(dev_addr)) { random_ether_addr(dev_addr); mlx5_en_err(ifp, "Assigned random MAC address\n"); } err = mlx5e_rl_init(priv); if (err) { mlx5_en_err(ifp, "mlx5e_rl_init failed, %d\n", err); goto err_create_mkey; } err = mlx5e_tls_init(priv); if (err) { if_printf(ifp, "%s: mlx5e_tls_init failed\n", __func__); goto err_rl_init; } err = mlx5e_open_drop_rq(priv, &priv->drop_rq); if (err) { if_printf(ifp, "%s: mlx5e_open_drop_rq failed (%d)\n", __func__, err); goto err_tls_init; } err = mlx5e_open_rqts(priv); if (err) { if_printf(ifp, "%s: mlx5e_open_rqts failed (%d)\n", __func__, err); goto err_open_drop_rq; } err = mlx5e_open_tirs(priv); if (err) { mlx5_en_err(ifp, "mlx5e_open_tirs() failed, %d\n", err); goto err_open_rqts; } err = mlx5e_open_flow_tables(priv); if (err) { if_printf(ifp, "%s: mlx5e_open_flow_tables failed (%d)\n", __func__, err); goto err_open_tirs; } err = mlx5e_tls_rx_init(priv); if (err) { if_printf(ifp, "%s: mlx5e_tls_rx_init() failed, %d\n", __func__, err); goto err_open_flow_tables; } /* set default MTU */ - mlx5e_set_dev_port_mtu(ifp, ifp->if_mtu); + mlx5e_set_dev_port_mtu(ifp, if_getmtu(ifp)); /* Set default media status */ priv->media_status_last = IFM_AVALID; priv->media_active_last = IFM_ETHER | IFM_AUTO | IFM_FDX; /* setup default pauseframes configuration */ mlx5e_setup_pauseframes(priv); /* Setup supported medias */ if (!mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1)) { ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_capability); } else { ext = false; eth_proto_cap = 0; mlx5_en_err(ifp, "Query port media capability failed, %d\n", err); } ifmedia_init(&priv->media, IFM_IMASK, mlx5e_media_change, mlx5e_media_status); if (ext) { for (unsigned i = 0; i != MLX5E_EXT_LINK_SPEEDS_NUMBER; i++) { /* check if hardware has the right capability */ if (MLX5E_PROT_MASK(i) & ~eth_proto_cap) continue; for (unsigned j = 0; j != MLX5E_CABLE_TYPE_NUMBER; j++) { media_entry = mlx5e_ext_mode_table[i][j]; if (media_entry.subtype == 0) continue; /* check if this subtype was already added */ for (unsigned k = 0; k != i; k++) { /* check if hardware has the right capability */ if (MLX5E_PROT_MASK(k) & ~eth_proto_cap) continue; for (unsigned m = 0; m != MLX5E_CABLE_TYPE_NUMBER; m++) { if (media_entry.subtype == mlx5e_ext_mode_table[k][m].subtype) goto skip_ext_media; } } mlx5e_ifm_add(priv, media_entry.subtype); skip_ext_media:; } } } else { for (unsigned i = 0; i != MLX5E_LINK_SPEEDS_NUMBER; i++) { media_entry = mlx5e_mode_table[i]; if (media_entry.subtype == 0) continue; if (MLX5E_PROT_MASK(i) & ~eth_proto_cap) continue; /* check if this subtype was already added */ for (unsigned k = 0; k != i; k++) { if (media_entry.subtype == mlx5e_mode_table[k].subtype) goto skip_media; } mlx5e_ifm_add(priv, media_entry.subtype); /* NOTE: 10G ER and LR shares the same entry */ if (media_entry.subtype == IFM_10G_ER) mlx5e_ifm_add(priv, IFM_10G_LR); skip_media:; } } mlx5e_ifm_add(priv, IFM_AUTO); /* Set autoselect by default */ ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO | IFM_FDX | IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE); DEBUGNET_SET(ifp, mlx5_en); ether_ifattach(ifp, dev_addr); /* Register for VLAN events */ priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, mlx5e_vlan_rx_add_vid, priv, EVENTHANDLER_PRI_FIRST); priv->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, mlx5e_vlan_rx_kill_vid, priv, EVENTHANDLER_PRI_FIRST); /* Register for VxLAN events */ priv->vxlan_start = EVENTHANDLER_REGISTER(vxlan_start, mlx5e_vxlan_start, priv, EVENTHANDLER_PRI_ANY); priv->vxlan_stop = EVENTHANDLER_REGISTER(vxlan_stop, mlx5e_vxlan_stop, priv, EVENTHANDLER_PRI_ANY); /* Link is down by default */ if_link_state_change(ifp, LINK_STATE_DOWN); mlx5e_enable_async_events(priv); mlx5e_add_hw_stats(priv); mlx5e_create_stats(&priv->stats.vport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), "vstats", mlx5e_vport_stats_desc, MLX5E_VPORT_STATS_NUM, priv->stats.vport.arg); mlx5e_create_stats(&priv->stats.pport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), "pstats", mlx5e_pport_stats_desc, MLX5E_PPORT_STATS_NUM, priv->stats.pport.arg); mlx5e_create_ethtool(priv); mtx_lock(&priv->async_events_mtx); mlx5e_update_stats(priv); mtx_unlock(&priv->async_events_mtx); SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "rx_clbr_done", CTLFLAG_RD, &priv->clbr_done, 0, "RX timestamps calibration state"); callout_init(&priv->tstmp_clbr, 1); /* Pull out the frequency of the clock in hz */ priv->cclk = (uint64_t)MLX5_CAP_GEN(mdev, device_frequency_khz) * 1000ULL; mlx5e_reset_calibration_callout(priv); pa.pa_version = PFIL_VERSION; pa.pa_flags = PFIL_IN; pa.pa_type = PFIL_TYPE_ETHERNET; - pa.pa_headname = ifp->if_xname; + pa.pa_headname = if_name(ifp); priv->pfil = pfil_head_register(&pa); PRIV_LOCK(priv); err = mlx5e_open_flow_rules(priv); if (err) { mlx5_en_err(ifp, "mlx5e_open_flow_rules() failed, %d (ignored)\n", err); } PRIV_UNLOCK(priv); return (priv); err_open_flow_tables: mlx5e_close_flow_tables(priv); err_open_tirs: mlx5e_close_tirs(priv); err_open_rqts: mlx5e_close_rqts(priv); err_open_drop_rq: mlx5e_close_drop_rq(&priv->drop_rq); err_tls_init: mlx5e_tls_cleanup(priv); err_rl_init: mlx5e_rl_cleanup(priv); err_create_mkey: mlx5_core_destroy_mkey(priv->mdev, &priv->mr); err_dealloc_transport_domain: mlx5_dealloc_transport_domain(mdev, priv->tdn, 0); err_dealloc_pd: mlx5_core_dealloc_pd(mdev, priv->pdn, 0); err_free_wq: flush_workqueue(priv->wq); err_free_sysctl: sysctl_ctx_free(&priv->sysctl_ctx); if (priv->sysctl_debug) sysctl_ctx_free(&priv->stats.port_stats_debug.ctx); mlx5e_priv_static_destroy(priv, mdev, mdev->priv.eq_table.num_comp_vectors); err_free_ifp: if_free(ifp); err_free_priv: free(priv, M_MLX5EN); return (NULL); } static void mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vpriv) { struct mlx5e_priv *priv = vpriv; - struct ifnet *ifp = priv->ifp; + if_t ifp = priv->ifp; /* don't allow more IOCTLs */ priv->gone = 1; /* XXX wait a bit to allow IOCTL handlers to complete */ pause("W", hz); #ifdef RATELIMIT /* * The kernel can have reference(s) via the m_snd_tag's into * the ratelimit channels, and these must go away before * detaching: */ while (READ_ONCE(priv->rl.stats.tx_active_connections) != 0) { mlx5_en_err(priv->ifp, "Waiting for all ratelimit connections to terminate\n"); pause("W", hz); } #endif #ifdef KERN_TLS /* wait for all TLS tags to get freed */ while (priv->tls.init != 0 && uma_zone_get_cur(priv->tls.zone) != 0) { mlx5_en_err(priv->ifp, "Waiting for all TLS connections to terminate\n"); pause("W", hz); } /* wait for all TLS RX tags to get freed */ while (priv->tls_rx.init != 0 && uma_zone_get_cur(priv->tls_rx.zone) != 0) { mlx5_en_err(priv->ifp, "Waiting for all TLS RX connections to terminate\n"); pause("W", hz); } #endif /* wait for all unlimited send tags to complete */ mlx5e_priv_wait_for_completion(priv, mdev->priv.eq_table.num_comp_vectors); /* stop watchdog timer */ callout_drain(&priv->watchdog); callout_drain(&priv->tstmp_clbr); if (priv->vlan_attach != NULL) EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach); if (priv->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach); if (priv->vxlan_start != NULL) EVENTHANDLER_DEREGISTER(vxlan_start, priv->vxlan_start); if (priv->vxlan_stop != NULL) EVENTHANDLER_DEREGISTER(vxlan_stop, priv->vxlan_stop); /* make sure device gets closed */ PRIV_LOCK(priv); mlx5e_close_locked(ifp); mlx5e_close_flow_rules(priv); PRIV_UNLOCK(priv); /* deregister pfil */ if (priv->pfil != NULL) { pfil_head_unregister(priv->pfil); priv->pfil = NULL; } /* unregister device */ ifmedia_removeall(&priv->media); ether_ifdetach(ifp); mlx5e_tls_rx_cleanup(priv); mlx5e_close_flow_tables(priv); mlx5e_close_tirs(priv); mlx5e_close_rqts(priv); mlx5e_close_drop_rq(&priv->drop_rq); mlx5e_tls_cleanup(priv); mlx5e_rl_cleanup(priv); /* destroy all remaining sysctl nodes */ sysctl_ctx_free(&priv->stats.vport.ctx); sysctl_ctx_free(&priv->stats.pport.ctx); if (priv->sysctl_debug) sysctl_ctx_free(&priv->stats.port_stats_debug.ctx); sysctl_ctx_free(&priv->sysctl_ctx); mlx5_core_destroy_mkey(priv->mdev, &priv->mr); mlx5_dealloc_transport_domain(priv->mdev, priv->tdn, 0); mlx5_core_dealloc_pd(priv->mdev, priv->pdn, 0); mlx5e_disable_async_events(priv); flush_workqueue(priv->wq); mlx5e_priv_static_destroy(priv, mdev, mdev->priv.eq_table.num_comp_vectors); if_free(ifp); free(priv, M_MLX5EN); } #ifdef DEBUGNET static void -mlx5_en_debugnet_init(struct ifnet *dev, int *nrxr, int *ncl, int *clsize) +mlx5_en_debugnet_init(if_t dev, int *nrxr, int *ncl, int *clsize) { struct mlx5e_priv *priv = if_getsoftc(dev); PRIV_LOCK(priv); *nrxr = priv->params.num_channels; *ncl = DEBUGNET_MAX_IN_FLIGHT; *clsize = MLX5E_MAX_RX_BYTES; PRIV_UNLOCK(priv); } static void -mlx5_en_debugnet_event(struct ifnet *dev, enum debugnet_ev event) +mlx5_en_debugnet_event(if_t dev, enum debugnet_ev event) { } static int -mlx5_en_debugnet_transmit(struct ifnet *dev, struct mbuf *m) +mlx5_en_debugnet_transmit(if_t dev, struct mbuf *m) { struct mlx5e_priv *priv = if_getsoftc(dev); struct mlx5e_sq *sq; int err; if ((if_getdrvflags(dev) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING || (priv->media_status_last & IFM_ACTIVE) == 0) return (ENOENT); sq = &priv->channel[0].sq[0]; if (sq->running == 0) { m_freem(m); return (ENOENT); } if (mlx5e_sq_xmit(sq, &m) != 0) { m_freem(m); err = ENOBUFS; } else { err = 0; } mlx5e_tx_notify_hw(sq, true); return (err); } static int -mlx5_en_debugnet_poll(struct ifnet *dev, int count) +mlx5_en_debugnet_poll(if_t dev, int count) { struct mlx5e_priv *priv = if_getsoftc(dev); if ((if_getdrvflags(dev) & IFF_DRV_RUNNING) == 0 || (priv->media_status_last & IFM_ACTIVE) == 0) return (ENOENT); mlx5_poll_interrupts(priv->mdev); return (0); } #endif /* DEBUGNET */ static void * mlx5e_get_ifp(void *vpriv) { struct mlx5e_priv *priv = vpriv; return (priv->ifp); } static struct mlx5_interface mlx5e_interface = { .add = mlx5e_create_ifp, .remove = mlx5e_destroy_ifp, .event = mlx5e_async_event, .protocol = MLX5_INTERFACE_PROTOCOL_ETH, .get_dev = mlx5e_get_ifp, }; void mlx5e_init(void) { mlx5_register_interface(&mlx5e_interface); } void mlx5e_cleanup(void) { mlx5_unregister_interface(&mlx5e_interface); } module_init_order(mlx5e_init, SI_ORDER_SIXTH); module_exit_order(mlx5e_cleanup, SI_ORDER_SIXTH); MODULE_DEPEND(mlx5en, linuxkpi, 1, 1, 1); MODULE_DEPEND(mlx5en, mlx5, 1, 1, 1); MODULE_VERSION(mlx5en, 1); diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c index d00e99fce1d4..e4f6c0bdf979 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c @@ -1,1694 +1,1694 @@ /*- * Copyright (c) 2016-2020 Mellanox Technologies. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_rss.h" #include "opt_ratelimit.h" #include #ifdef RATELIMIT static int mlx5e_rl_open_workers(struct mlx5e_priv *); static void mlx5e_rl_close_workers(struct mlx5e_priv *); static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS); static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x, struct sysctl_oid *, const char *name, const char *desc); static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x, struct sysctl_oid *node, const char *name, const char *desc); static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value); static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value); static if_snd_tag_modify_t mlx5e_rl_snd_tag_modify; static if_snd_tag_query_t mlx5e_rl_snd_tag_query; static if_snd_tag_free_t mlx5e_rl_snd_tag_free; static const struct if_snd_tag_sw mlx5e_rl_snd_tag_sw = { .snd_tag_modify = mlx5e_rl_snd_tag_modify, .snd_tag_query = mlx5e_rl_snd_tag_query, .snd_tag_free = mlx5e_rl_snd_tag_free, .type = IF_SND_TAG_TYPE_RATE_LIMIT }; static void mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl, struct mlx5e_sq_param *param) { void *sqc = param->sqc; void *wq = MLX5_ADDR_OF(sqc, sqc, wq); uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size); MLX5_SET(wq, wq, log_wq_sz, log_sq_size); MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); MLX5_SET(wq, wq, pd, rl->priv->pdn); param->wq.linear = 1; } static void mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl, struct mlx5e_cq_param *param) { void *cqc = param->cqc; uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size); MLX5_SET(cqc, cqc, log_cq_size, log_sq_size); MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs); MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts); MLX5_SET(cqc, cqc, uar_page, rl->priv->mdev->priv.uar->index); switch (rl->param.tx_coalesce_mode) { case 0: MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; default: if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe)) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE); else MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; } } static void mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl, struct mlx5e_rl_channel_param *cparam) { memset(cparam, 0, sizeof(*cparam)); mlx5e_rl_build_sq_param(rl, &cparam->sq); mlx5e_rl_build_cq_param(rl, &cparam->cq); } static int mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq, struct mlx5e_sq_param *param, int ix) { struct mlx5_core_dev *mdev = priv->mdev; void *sqc = param->sqc; void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq); int err; /* Create DMA descriptor TAG */ if ((err = -bus_dma_tag_create( bus_get_dma_tag(mdev->pdev->dev.bsddev), 1, /* any alignment */ 0, /* no boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MLX5E_MAX_TX_PAYLOAD_SIZE, /* maxsize */ MLX5E_MAX_TX_MBUF_FRAGS, /* nsegments */ MLX5E_MAX_TX_MBUF_SIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockfuncarg */ &sq->dma_tag))) goto done; sq->mkey_be = cpu_to_be32(priv->mr.key); sq->ifp = priv->ifp; sq->priv = priv; err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) goto err_free_dma_tag; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; err = mlx5e_alloc_sq_db(sq); if (err) goto err_sq_wq_destroy; mlx5e_update_sq_inline(sq); return (0); err_sq_wq_destroy: mlx5_wq_destroy(&sq->wq_ctrl); err_free_dma_tag: bus_dma_tag_destroy(sq->dma_tag); done: return (err); } static void mlx5e_rl_destroy_sq(struct mlx5e_sq *sq) { mlx5e_free_sq_db(sq); mlx5_wq_destroy(&sq->wq_ctrl); bus_dma_tag_destroy(sq->dma_tag); } static int mlx5e_rl_query_sq(struct mlx5e_sq *sq) { void *out; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(query_sq_out); out = mlx5_vzalloc(inlen); if (!out) return -ENOMEM; err = mlx5_core_query_sq(sq->priv->mdev, sq->sqn, out); if (err) goto out; sq->queue_handle = MLX5_GET(query_sq_out, out, sq_context.queue_handle); out: kvfree(out); return err; } static int mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq, struct mlx5e_sq_param *param, int ix) { int err; err = mlx5e_rl_create_sq(priv, sq, param, ix); if (err) return (err); err = mlx5e_enable_sq(sq, param, &priv->channel[ix].bfreg, priv->rl.tisn); if (err) goto err_destroy_sq; err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY); if (err) goto err_disable_sq; if (MLX5_CAP_QOS(priv->mdev, qos_remap_pp)) { err = mlx5e_rl_query_sq(sq); if (err) { mlx5_en_err(priv->ifp, "Failed retrieving send queue handle for" "SQ remap - sqn=%u, err=(%d)\n", sq->sqn, err); sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE; } } else sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE; WRITE_ONCE(sq->running, 1); return (0); err_disable_sq: mlx5e_disable_sq(sq); err_destroy_sq: mlx5e_rl_destroy_sq(sq); return (err); } static void mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq) { mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF); mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF); callout_init_mtx(&sq->cev_callout, &sq->lock, 0); sq->cev_factor = priv->rl.param.tx_completion_fact; /* ensure the TX completion event factor is not zero */ if (sq->cev_factor == 0) sq->cev_factor = 1; } static int mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix, struct mlx5e_rl_channel_param *cparam, struct mlx5e_sq *volatile *ppsq) { struct mlx5e_priv *priv = rlw->priv; struct mlx5e_sq *sq; int err; sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO); /* init mutexes */ mlx5e_rl_chan_mtx_init(priv, sq); /* open TX completion queue */ err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq, &mlx5e_tx_cq_comp, eq_ix); if (err) goto err_free; err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix); if (err) goto err_close_tx_cq; /* store TX channel pointer */ *ppsq = sq; /* poll TX queue initially */ sq->cq.mcq.comp(&sq->cq.mcq, NULL); return (0); err_close_tx_cq: mlx5e_close_cq(&sq->cq); err_free: /* destroy mutexes */ mtx_destroy(&sq->lock); mtx_destroy(&sq->comp_lock); free(sq, M_MLX5EN); atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL); return (err); } static void mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq) { struct mlx5e_sq *sq = *ppsq; /* check if channel is already closed */ if (sq == NULL) return; /* ensure channel pointer is no longer used */ *ppsq = NULL; /* teardown and destroy SQ */ mlx5e_drain_sq(sq); mlx5e_disable_sq(sq); mlx5e_rl_destroy_sq(sq); /* close CQ */ mlx5e_close_cq(&sq->cq); /* destroy mutexes */ mtx_destroy(&sq->lock); mtx_destroy(&sq->comp_lock); free(sq, M_MLX5EN); } static void mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl) { /* * Limit the maximum distance between completion events to * half of the currently set TX queue size. * * The maximum number of queue entries a single IP packet can * consume is given by MLX5_SEND_WQE_MAX_WQEBBS. * * The worst case max value is then given as below: */ uint64_t max = rl->param.tx_queue_size / (2 * MLX5_SEND_WQE_MAX_WQEBBS); /* * Update the maximum completion factor value in case the * tx_queue_size field changed. Ensure we don't overflow * 16-bits. */ if (max < 1) max = 1; else if (max > 65535) max = 65535; rl->param.tx_completion_fact_max = max; /* * Verify that the current TX completion factor is within the * given limits: */ if (rl->param.tx_completion_fact < 1) rl->param.tx_completion_fact = 1; else if (rl->param.tx_completion_fact > max) rl->param.tx_completion_fact = max; } static int mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index) { struct mlx5e_priv *priv = sq->priv; struct mlx5_core_dev *mdev = priv->mdev; void *in; void *sqc; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(modify_sq_in); in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); MLX5_SET(modify_sq_in, in, sqn, sq->sqn); MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY); MLX5_SET64(modify_sq_in, in, modify_bitmask, 1); MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY); MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index); err = mlx5_core_modify_sq(mdev, in, inlen); kvfree(in); return (err); } /* * This function will search the configured rate limit table for the * best match to avoid that a single socket based application can * allocate all the available hardware rates. If the user selected * rate deviates too much from the closes rate available in the rate * limit table, unlimited rate will be selected. */ static uint64_t mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate) { uint64_t distance = -1ULL; uint64_t diff; uint64_t retval = 0; /* unlimited */ uint64_t x; /* search for closest rate */ for (x = 0; x != rl->param.tx_rates_def; x++) { uint64_t rate = rl->rate_limit_table[x]; if (rate == 0) continue; if (rate > user_rate) diff = rate - user_rate; else diff = user_rate - rate; /* check if distance is smaller than previous rate */ if (diff < distance) { distance = diff; retval = rate; } } /* range check for multiplication below */ if (user_rate > rl->param.tx_limit_max) user_rate = rl->param.tx_limit_max; /* fallback to unlimited, if rate deviates too much */ if (distance > howmany(user_rate * rl->param.tx_allowed_deviation, 1000ULL)) retval = 0; return (retval); } static int mlx5e_rl_post_sq_remap_wqe(struct mlx5e_iq *iq, u32 scq_handle, u32 sq_handle, struct mlx5e_rl_channel *sq_channel) { const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_qos_remap_wqe), MLX5_SEND_WQE_DS); struct mlx5e_tx_qos_remap_wqe *wqe; int pi; mtx_lock(&iq->lock); pi = mlx5e_iq_get_producer_index(iq); if (pi < 0) { mtx_unlock(&iq->lock); return (-ENOMEM); } wqe = mlx5_wq_cyc_get_wqe(&iq->wq, pi); memset(wqe, 0, sizeof(*wqe)); wqe->qos_remap.qos_handle = cpu_to_be32(scq_handle); wqe->qos_remap.queue_handle = cpu_to_be32(sq_handle); wqe->ctrl.opmod_idx_opcode = cpu_to_be32((iq->pc << 8) | MLX5_OPCODE_QOS_REMAP); wqe->ctrl.qpn_ds = cpu_to_be32((iq->sqn << 8) | ds_cnt); wqe->ctrl.imm = cpu_to_be32(iq->priv->tisn[0] << 8); wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL; /* copy data for doorbell */ memcpy(iq->doorbell.d32, &wqe->ctrl, sizeof(iq->doorbell.d32)); iq->data[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); iq->data[pi].p_refcount = &sq_channel->refcount; atomic_add_int(iq->data[pi].p_refcount, 1); iq->pc += iq->data[pi].num_wqebbs; mlx5e_iq_notify_hw(iq); mtx_unlock(&iq->lock); return (0); /* success */ } static int mlx5e_rl_remap_sq(struct mlx5e_sq *sq, uint16_t index, struct mlx5e_rl_channel *sq_channel) { struct mlx5e_channel *iq_channel; u32 scq_handle; u32 sq_handle; int error; /* Specific SQ remap operations should be handled by same IQ */ iq_channel = &sq->priv->channel[sq->sqn % sq->priv->params.num_channels]; sq_handle = sq->queue_handle; scq_handle = mlx5_rl_get_scq_handle(sq->priv->mdev, index); if (sq_handle == MLX5_INVALID_QUEUE_HANDLE || scq_handle == MLX5_INVALID_QUEUE_HANDLE) error = -1; else error = mlx5e_rl_post_sq_remap_wqe(&iq_channel->iq, scq_handle, sq_handle, sq_channel); return (error); } /* * This function sets the requested rate for a rate limit channel, in * bits per second. The requested rate will be filtered through the * find best rate function above. */ static int mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate) { struct mlx5e_rl_priv_data *rl = &rlw->priv->rl; struct mlx5e_sq *sq; uint64_t temp; uint16_t index; uint16_t burst; int error; bool use_sq_remap; if (rate != 0) { MLX5E_RL_WORKER_UNLOCK(rlw); MLX5E_RL_RLOCK(rl); /* get current burst size in bytes */ temp = rl->param.tx_burst_size * - MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu); + MLX5E_SW2HW_MTU(if_getmtu(rlw->priv->ifp)); /* limit burst size to 64K currently */ if (temp > 65535) temp = 65535; burst = temp; /* find best rate */ rate = mlx5e_rl_find_best_rate_locked(rl, rate); MLX5E_RL_RUNLOCK(rl); if (rate == 0) { /* rate doesn't exist, fallback to unlimited */ index = 0; rate = 0; atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL); } else { /* get a reference on the new rate */ error = -mlx5_rl_add_rate(rlw->priv->mdev, howmany(rate, 1000), burst, &index); if (error != 0) { /* adding rate failed, fallback to unlimited */ index = 0; rate = 0; atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL); } } MLX5E_RL_WORKER_LOCK(rlw); } else { index = 0; burst = 0; /* default */ } /* paced <--> non-paced transitions must go via FW */ use_sq_remap = MLX5_CAP_QOS(rlw->priv->mdev, qos_remap_pp) && channel->last_rate != 0 && rate != 0; /* atomically swap rates */ temp = channel->last_rate; channel->last_rate = rate; rate = temp; /* atomically swap burst size */ temp = channel->last_burst; channel->last_burst = burst; burst = temp; MLX5E_RL_WORKER_UNLOCK(rlw); /* put reference on the old rate, if any */ if (rate != 0) { mlx5_rl_remove_rate(rlw->priv->mdev, howmany(rate, 1000), burst); } /* set new rate, if SQ is running */ sq = channel->sq; if (sq != NULL && READ_ONCE(sq->running) != 0) { if (!use_sq_remap || mlx5e_rl_remap_sq(sq, index, channel)) { while (atomic_load_int(&channel->refcount) != 0 && rlw->priv->mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR && pci_channel_offline(rlw->priv->mdev->pdev) == 0) pause("W", 1); error = mlx5e_rl_modify_sq(sq, index); if (error != 0) atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL); } } else error = 0; MLX5E_RL_WORKER_LOCK(rlw); return (-error); } static void mlx5e_rl_worker(void *arg) { struct thread *td; struct mlx5e_rl_worker *rlw = arg; struct mlx5e_rl_channel *channel; struct mlx5e_priv *priv; unsigned ix; uint64_t x; int error; /* set thread priority */ td = curthread; thread_lock(td); sched_prio(td, PI_SWI(SWI_NET)); thread_unlock(td); priv = rlw->priv; /* compute completion vector */ ix = (rlw - priv->rl.workers) % priv->mdev->priv.eq_table.num_comp_vectors; /* TODO bind to CPU */ /* open all the SQs */ MLX5E_RL_WORKER_LOCK(rlw); for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) { struct mlx5e_rl_channel *channel = rlw->channels + x; #if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS) if (channel->state == MLX5E_RL_ST_FREE) continue; #endif MLX5E_RL_WORKER_UNLOCK(rlw); MLX5E_RL_RLOCK(&priv->rl); error = mlx5e_rl_open_channel(rlw, ix, &priv->rl.chan_param, &channel->sq); MLX5E_RL_RUNLOCK(&priv->rl); MLX5E_RL_WORKER_LOCK(rlw); if (error != 0) { mlx5_en_err(priv->ifp, "mlx5e_rl_open_channel failed: %d\n", error); break; } mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate); } while (1) { if (STAILQ_FIRST(&rlw->process_head) == NULL) { /* check if we are tearing down */ if (rlw->worker_done != 0) break; cv_wait(&rlw->cv, &rlw->mtx); } /* check if we are tearing down */ if (rlw->worker_done != 0) break; channel = STAILQ_FIRST(&rlw->process_head); if (channel != NULL) { STAILQ_REMOVE_HEAD(&rlw->process_head, entry); switch (channel->state) { case MLX5E_RL_ST_MODIFY: channel->state = MLX5E_RL_ST_USED; MLX5E_RL_WORKER_UNLOCK(rlw); /* create channel by demand */ if (channel->sq == NULL) { MLX5E_RL_RLOCK(&priv->rl); error = mlx5e_rl_open_channel(rlw, ix, &priv->rl.chan_param, &channel->sq); MLX5E_RL_RUNLOCK(&priv->rl); if (error != 0) { mlx5_en_err(priv->ifp, "mlx5e_rl_open_channel failed: %d\n", error); } else { atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL); } } else { mlx5e_resume_sq(channel->sq); } MLX5E_RL_WORKER_LOCK(rlw); /* convert from bytes/s to bits/s and set new rate */ error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->new_rate * 8ULL); if (error != 0) { mlx5_en_err(priv->ifp, "mlx5e_rlw_channel_set_rate_locked failed: %d\n", error); } break; case MLX5E_RL_ST_DESTROY: error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0); if (error != 0) { mlx5_en_err(priv->ifp, "mlx5e_rlw_channel_set_rate_locked failed: %d\n", error); } if (channel->sq != NULL) { /* * Make sure all packets are * transmitted before SQ is * returned to free list: */ MLX5E_RL_WORKER_UNLOCK(rlw); mlx5e_drain_sq(channel->sq); MLX5E_RL_WORKER_LOCK(rlw); } /* put the channel back into the free list */ STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry); channel->state = MLX5E_RL_ST_FREE; atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL); break; default: /* NOP */ break; } } } /* close all the SQs */ for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) { struct mlx5e_rl_channel *channel = rlw->channels + x; /* update the initial rate */ channel->init_rate = channel->last_rate; /* make sure we free up the rate resource */ mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0); if (channel->sq != NULL) { MLX5E_RL_WORKER_UNLOCK(rlw); mlx5e_rl_close_channel(&channel->sq); atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL); MLX5E_RL_WORKER_LOCK(rlw); } } rlw->worker_done = 0; cv_broadcast(&rlw->cv); MLX5E_RL_WORKER_UNLOCK(rlw); kthread_exit(); } static int mlx5e_rl_open_tis(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; u32 in[MLX5_ST_SZ_DW(create_tis_in)]; void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); memset(in, 0, sizeof(in)); MLX5_SET(tisc, tisc, prio, 0); MLX5_SET(tisc, tisc, transport_domain, priv->tdn); return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn)); } static void mlx5e_rl_close_tis(struct mlx5e_priv *priv) { mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn, 0); } static void mlx5e_rl_set_default_params(struct mlx5e_rl_params *param, struct mlx5_core_dev *mdev) { /* ratelimit workers */ param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors; param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS; /* range check */ if (param->tx_worker_threads_def == 0 || param->tx_worker_threads_def > param->tx_worker_threads_max) param->tx_worker_threads_def = param->tx_worker_threads_max; /* ratelimit channels */ param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS / param->tx_worker_threads_def; param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS; /* range check */ if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER) param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER; /* set default burst size */ param->tx_burst_size = 4; /* MTUs */ /* * Set maximum burst size * * The burst size is multiplied by the MTU and clamped to the * range 0 ... 65535 bytes inclusivly before fed into the * firmware. * * NOTE: If the burst size or MTU is changed only ratelimit * connections made after the change will use the new burst * size. */ param->tx_burst_size_max = 255; /* get firmware rate limits in 1000bit/s and convert them to bit/s */ param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL; param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL; /* ratelimit table size */ param->tx_rates_max = mdev->priv.rl_table.max_size; /* range check */ if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES) param->tx_rates_max = MLX5E_RL_MAX_TX_RATES; /* set default number of rates */ param->tx_rates_def = param->tx_rates_max; /* set maximum allowed rate deviation */ if (param->tx_limit_max != 0) { /* * Make sure the deviation multiplication doesn't * overflow unsigned 64-bit: */ param->tx_allowed_deviation_max = -1ULL / param->tx_limit_max; } /* set default rate deviation */ param->tx_allowed_deviation = 50; /* 5.0% */ /* channel parameters */ param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT; param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT; param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT; param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT; } static const char *mlx5e_rl_params_desc[] = { MLX5E_RL_PARAMS(MLX5E_STATS_DESC) }; static const char *mlx5e_rl_table_params_desc[] = { MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC) }; static const char *mlx5e_rl_stats_desc[] = { MLX5E_RL_STATS(MLX5E_STATS_DESC) }; int mlx5e_rl_init(struct mlx5e_priv *priv) { struct mlx5e_rl_priv_data *rl = &priv->rl; struct sysctl_oid *node; struct sysctl_oid *stats; char buf[64]; uint64_t i; uint64_t j; int error; /* check if there is support for packet pacing */ if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing)) return (0); rl->priv = priv; sysctl_ctx_init(&rl->ctx); sx_init(&rl->rl_sxlock, "ratelimit-sxlock"); /* open own TIS domain for ratelimit SQs */ error = mlx5e_rl_open_tis(priv); if (error) goto done; /* setup default value for parameters */ mlx5e_rl_set_default_params(&rl->param, priv->mdev); /* update the completion factor */ mlx5e_rl_sync_tx_completion_fact(rl); /* create root node */ node = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "rate_limit", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Rate limiting support"); if (node != NULL) { /* create SYSCTLs */ for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) { mlx5e_rl_sysctl_add_u64_oid(rl, MLX5E_RL_PARAMS_INDEX(arg[i]), node, mlx5e_rl_params_desc[2 * i], mlx5e_rl_params_desc[2 * i + 1]); } stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Rate limiting statistics"); if (stats != NULL) { /* create SYSCTLs */ for (i = 0; i != MLX5E_RL_STATS_NUM; i++) { mlx5e_rl_sysctl_add_stats_u64_oid(rl, i, stats, mlx5e_rl_stats_desc[2 * i], mlx5e_rl_stats_desc[2 * i + 1]); } } } /* allocate workers array */ rl->workers = malloc(sizeof(rl->workers[0]) * rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO); /* allocate rate limit array */ rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) * rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO); if (node != NULL) { /* create more SYSCTls */ SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table, "A", "Show table of all configured TX rates"); /* try to fetch rate table from kernel environment */ for (i = 0; i != rl->param.tx_rates_def; i++) { /* compute path for tunable */ snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d", device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i); if (TUNABLE_QUAD_FETCH(buf, &j)) mlx5e_rl_tx_limit_add(rl, j); } /* setup rate table sysctls */ for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) { mlx5e_rl_sysctl_add_u64_oid(rl, MLX5E_RL_PARAMS_INDEX(table_arg[i]), node, mlx5e_rl_table_params_desc[2 * i], mlx5e_rl_table_params_desc[2 * i + 1]); } } for (j = 0; j < rl->param.tx_worker_threads_def; j++) { struct mlx5e_rl_worker *rlw = rl->workers + j; rlw->priv = priv; cv_init(&rlw->cv, "mlx5-worker-cv"); mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF); STAILQ_INIT(&rlw->index_list_head); STAILQ_INIT(&rlw->process_head); rlw->channels = malloc(sizeof(rlw->channels[0]) * rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO); MLX5E_RL_WORKER_LOCK(rlw); for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) { struct mlx5e_rl_channel *channel = rlw->channels + i; channel->worker = rlw; STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry); } MLX5E_RL_WORKER_UNLOCK(rlw); } PRIV_LOCK(priv); error = mlx5e_rl_open_workers(priv); PRIV_UNLOCK(priv); if (error != 0) { mlx5_en_err(priv->ifp, "mlx5e_rl_open_workers failed: %d\n", error); } return (0); done: sysctl_ctx_free(&rl->ctx); sx_destroy(&rl->rl_sxlock); return (error); } static int mlx5e_rl_open_workers(struct mlx5e_priv *priv) { struct mlx5e_rl_priv_data *rl = &priv->rl; struct thread *rl_thread = NULL; struct proc *rl_proc = NULL; uint64_t j; int error; if (priv->gone || rl->opened) return (-EINVAL); MLX5E_RL_WLOCK(rl); /* compute channel parameters once */ mlx5e_rl_build_channel_param(rl, &rl->chan_param); MLX5E_RL_WUNLOCK(rl); for (j = 0; j < rl->param.tx_worker_threads_def; j++) { struct mlx5e_rl_worker *rlw = rl->workers + j; /* start worker thread */ error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread, RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j); if (error != 0) { mlx5_en_err(rl->priv->ifp, "kproc_kthread_add failed: %d\n", error); rlw->worker_done = 1; } } rl->opened = 1; return (0); } static void mlx5e_rl_close_workers(struct mlx5e_priv *priv) { struct mlx5e_rl_priv_data *rl = &priv->rl; uint64_t y; if (rl->opened == 0) return; /* tear down worker threads simultaneously */ for (y = 0; y < rl->param.tx_worker_threads_def; y++) { struct mlx5e_rl_worker *rlw = rl->workers + y; /* tear down worker before freeing SQs */ MLX5E_RL_WORKER_LOCK(rlw); if (rlw->worker_done == 0) { rlw->worker_done = 1; cv_broadcast(&rlw->cv); } else { /* XXX thread not started */ rlw->worker_done = 0; } MLX5E_RL_WORKER_UNLOCK(rlw); } /* wait for worker threads to exit */ for (y = 0; y < rl->param.tx_worker_threads_def; y++) { struct mlx5e_rl_worker *rlw = rl->workers + y; /* tear down worker before freeing SQs */ MLX5E_RL_WORKER_LOCK(rlw); while (rlw->worker_done != 0) cv_wait(&rlw->cv, &rlw->mtx); MLX5E_RL_WORKER_UNLOCK(rlw); } rl->opened = 0; } static void mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl) { unsigned x; MLX5E_RL_WLOCK(rl); for (x = 0; x != rl->param.tx_rates_def; x++) rl->rate_limit_table[x] = 0; MLX5E_RL_WUNLOCK(rl); } void mlx5e_rl_cleanup(struct mlx5e_priv *priv) { struct mlx5e_rl_priv_data *rl = &priv->rl; uint64_t y; /* check if there is support for packet pacing */ if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing)) return; /* TODO check if there is support for packet pacing */ sysctl_ctx_free(&rl->ctx); PRIV_LOCK(priv); mlx5e_rl_close_workers(priv); PRIV_UNLOCK(priv); mlx5e_rl_reset_rates(rl); /* close TIS domain */ mlx5e_rl_close_tis(priv); for (y = 0; y < rl->param.tx_worker_threads_def; y++) { struct mlx5e_rl_worker *rlw = rl->workers + y; cv_destroy(&rlw->cv); mtx_destroy(&rlw->mtx); free(rlw->channels, M_MLX5EN); } free(rl->rate_limit_table, M_MLX5EN); free(rl->workers, M_MLX5EN); sx_destroy(&rl->rl_sxlock); } static void mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel) { STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry); cv_broadcast(&rlw->cv); } static void mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel) { if (channel == NULL) return; MLX5E_RL_WORKER_LOCK(rlw); switch (channel->state) { case MLX5E_RL_ST_MODIFY: channel->state = MLX5E_RL_ST_DESTROY; break; case MLX5E_RL_ST_USED: channel->state = MLX5E_RL_ST_DESTROY; mlx5e_rlw_queue_channel_locked(rlw, channel); break; default: break; } MLX5E_RL_WORKER_UNLOCK(rlw); } static int mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate) { MLX5E_RL_WORKER_LOCK(rlw); channel->new_rate = rate; switch (channel->state) { case MLX5E_RL_ST_USED: channel->state = MLX5E_RL_ST_MODIFY; mlx5e_rlw_queue_channel_locked(rlw, channel); break; default: break; } MLX5E_RL_WORKER_UNLOCK(rlw); return (0); } static int mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, union if_snd_tag_query_params *params) { int retval; MLX5E_RL_WORKER_LOCK(rlw); switch (channel->state) { case MLX5E_RL_ST_USED: params->rate_limit.max_rate = channel->last_rate; params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq); retval = 0; break; case MLX5E_RL_ST_MODIFY: params->rate_limit.max_rate = channel->last_rate; params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq); retval = EBUSY; break; default: retval = EINVAL; break; } MLX5E_RL_WORKER_UNLOCK(rlw); return (retval); } static int mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel **pchannel) { struct mlx5e_rl_channel *channel; int retval = ENOMEM; MLX5E_RL_WORKER_LOCK(rlw); /* Check for available channel in free list */ if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) { retval = 0; /* Remove head index from available list */ STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry); channel->state = MLX5E_RL_ST_USED; atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL); } else { atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL); } MLX5E_RL_WORKER_UNLOCK(rlw); *pchannel = channel; #ifdef RATELIMIT_DEBUG mlx5_en_info(rlw->priv->ifp, "Channel pointer for rate limit connection is %p\n", channel); #endif return (retval); } int -mlx5e_rl_snd_tag_alloc(struct ifnet *ifp, +mlx5e_rl_snd_tag_alloc(if_t ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { struct mlx5e_rl_channel *channel; struct mlx5e_rl_worker *rlw; struct mlx5e_priv *priv; int error; - priv = ifp->if_softc; + priv = if_getsoftc(ifp); /* check if there is support for packet pacing or if device is going away */ if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone || params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT) return (EOPNOTSUPP); /* compute worker thread this TCP connection belongs to */ rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) % priv->rl.param.tx_worker_threads_def); error = mlx5e_find_available_tx_ring_index(rlw, &channel); if (error != 0) goto done; error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate); if (error != 0) { mlx5e_rl_free(rlw, channel); goto done; } /* store pointer to mbuf tag */ MPASS(channel->tag.refcount == 0); m_snd_tag_init(&channel->tag, ifp, &mlx5e_rl_snd_tag_sw); *ppmt = &channel->tag; done: return (error); } static int mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params) { struct mlx5e_rl_channel *channel = container_of(pmt, struct mlx5e_rl_channel, tag); return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate)); } static int mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params) { struct mlx5e_rl_channel *channel = container_of(pmt, struct mlx5e_rl_channel, tag); return (mlx5e_rl_query(channel->worker, channel, params)); } static void mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt) { struct mlx5e_rl_channel *channel = container_of(pmt, struct mlx5e_rl_channel, tag); mlx5e_rl_free(channel->worker, channel); } static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS) { struct mlx5e_rl_priv_data *rl = arg1; struct mlx5e_priv *priv = rl->priv; struct sbuf sbuf; unsigned x; int error; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); PRIV_LOCK(priv); sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req); sbuf_printf(&sbuf, "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n" "\t" "--------------------------------------------\n"); MLX5E_RL_RLOCK(rl); for (x = 0; x != rl->param.tx_rates_def; x++) { if (rl->rate_limit_table[x] == 0) continue; sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n", x, (unsigned)rl->param.tx_burst_size, (long long)rl->rate_limit_table[x]); } MLX5E_RL_RUNLOCK(rl); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); PRIV_UNLOCK(priv); return (error); } static int mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl) { uint64_t x; uint64_t y; MLX5E_RL_WLOCK(rl); /* compute channel parameters once */ mlx5e_rl_build_channel_param(rl, &rl->chan_param); MLX5E_RL_WUNLOCK(rl); for (y = 0; y != rl->param.tx_worker_threads_def; y++) { struct mlx5e_rl_worker *rlw = rl->workers + y; for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) { struct mlx5e_rl_channel *channel; struct mlx5e_sq *sq; channel = rlw->channels + x; sq = channel->sq; if (sq == NULL) continue; if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) { mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq, rl->param.tx_coalesce_usecs, rl->param.tx_coalesce_pkts, rl->param.tx_coalesce_mode); } else { mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq, rl->param.tx_coalesce_usecs, rl->param.tx_coalesce_pkts); } } } return (0); } void mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl) { uint64_t x; uint64_t y; for (y = 0; y != rl->param.tx_worker_threads_def; y++) { struct mlx5e_rl_worker *rlw = rl->workers + y; for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) { struct mlx5e_rl_channel *channel; struct mlx5e_sq *sq; channel = rlw->channels + x; sq = channel->sq; if (sq == NULL) continue; mtx_lock(&sq->lock); mlx5e_update_sq_inline(sq); mtx_unlock(&sq->lock); } } } static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value) { unsigned x; int error; if (value < 1000 || mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0) return (EINVAL); MLX5E_RL_WLOCK(rl); error = ENOMEM; /* check if rate already exists */ for (x = 0; x != rl->param.tx_rates_def; x++) { if (rl->rate_limit_table[x] != value) continue; error = EEXIST; break; } /* check if there is a free rate entry */ if (x == rl->param.tx_rates_def) { for (x = 0; x != rl->param.tx_rates_def; x++) { if (rl->rate_limit_table[x] != 0) continue; rl->rate_limit_table[x] = value; error = 0; break; } } MLX5E_RL_WUNLOCK(rl); return (error); } static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value) { unsigned x; int error; if (value == 0) return (EINVAL); MLX5E_RL_WLOCK(rl); /* check if rate already exists */ for (x = 0; x != rl->param.tx_rates_def; x++) { if (rl->rate_limit_table[x] != value) continue; /* free up rate */ rl->rate_limit_table[x] = 0; break; } /* check if there is a free rate entry */ if (x == rl->param.tx_rates_def) error = ENOENT; else error = 0; MLX5E_RL_WUNLOCK(rl); return (error); } static int mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_rl_priv_data *rl = arg1; struct mlx5e_priv *priv = rl->priv; unsigned mode_modify; unsigned was_opened; uint64_t value; int error; PRIV_LOCK(priv); MLX5E_RL_RLOCK(rl); value = rl->param.arg[arg2]; MLX5E_RL_RUNLOCK(rl); if (req != NULL) { error = sysctl_handle_64(oidp, &value, 0, req); if (error || req->newptr == NULL || value == rl->param.arg[arg2]) goto done; } else { error = 0; } /* check if device is gone */ if (priv->gone) { error = ENXIO; goto done; } was_opened = rl->opened; mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify); switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) { case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def): if (value > rl->param.tx_worker_threads_max) value = rl->param.tx_worker_threads_max; else if (value < 1) value = 1; /* store new value */ rl->param.arg[arg2] = value; break; case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def): if (value > rl->param.tx_channels_per_worker_max) value = rl->param.tx_channels_per_worker_max; else if (value < 1) value = 1; /* store new value */ rl->param.arg[arg2] = value; break; case MLX5E_RL_PARAMS_INDEX(tx_rates_def): if (value > rl->param.tx_rates_max) value = rl->param.tx_rates_max; else if (value < 1) value = 1; /* store new value */ rl->param.arg[arg2] = value; break; case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs): /* range check */ if (value < 1) value = 0; else if (value > MLX5E_FLD_MAX(cqc, cq_period)) value = MLX5E_FLD_MAX(cqc, cq_period); /* store new value */ rl->param.arg[arg2] = value; /* check to avoid down and up the network interface */ if (was_opened) error = mlx5e_rl_refresh_channel_params(rl); break; case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts): /* import TX coal pkts */ if (value < 1) value = 0; else if (value > MLX5E_FLD_MAX(cqc, cq_max_count)) value = MLX5E_FLD_MAX(cqc, cq_max_count); /* store new value */ rl->param.arg[arg2] = value; /* check to avoid down and up the network interface */ if (was_opened) error = mlx5e_rl_refresh_channel_params(rl); break; case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode): /* network interface must be down */ if (was_opened != 0 && mode_modify == 0) mlx5e_rl_close_workers(priv); /* import TX coalesce mode */ if (value != 0) value = 1; /* store new value */ rl->param.arg[arg2] = value; /* restart network interface, if any */ if (was_opened != 0) { if (mode_modify == 0) mlx5e_rl_open_workers(priv); else error = mlx5e_rl_refresh_channel_params(rl); } break; case MLX5E_RL_PARAMS_INDEX(tx_queue_size): /* network interface must be down */ if (was_opened) mlx5e_rl_close_workers(priv); /* import TX queue size */ if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); else if (value > priv->params_ethtool.tx_queue_size_max) value = priv->params_ethtool.tx_queue_size_max; /* store actual TX queue size */ value = 1ULL << order_base_2(value); /* store new value */ rl->param.arg[arg2] = value; /* verify TX completion factor */ mlx5e_rl_sync_tx_completion_fact(rl); /* restart network interface, if any */ if (was_opened) mlx5e_rl_open_workers(priv); break; case MLX5E_RL_PARAMS_INDEX(tx_completion_fact): /* network interface must be down */ if (was_opened) mlx5e_rl_close_workers(priv); /* store new value */ rl->param.arg[arg2] = value; /* verify parameter */ mlx5e_rl_sync_tx_completion_fact(rl); /* restart network interface, if any */ if (was_opened) mlx5e_rl_open_workers(priv); break; case MLX5E_RL_PARAMS_INDEX(tx_limit_add): error = mlx5e_rl_tx_limit_add(rl, value); break; case MLX5E_RL_PARAMS_INDEX(tx_limit_clr): error = mlx5e_rl_tx_limit_clr(rl, value); break; case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation): /* range check */ if (value > rl->param.tx_allowed_deviation_max) value = rl->param.tx_allowed_deviation_max; else if (value < rl->param.tx_allowed_deviation_min) value = rl->param.tx_allowed_deviation_min; MLX5E_RL_WLOCK(rl); rl->param.arg[arg2] = value; MLX5E_RL_WUNLOCK(rl); break; case MLX5E_RL_PARAMS_INDEX(tx_burst_size): /* range check */ if (value > rl->param.tx_burst_size_max) value = rl->param.tx_burst_size_max; else if (value < rl->param.tx_burst_size_min) value = rl->param.tx_burst_size_min; MLX5E_RL_WLOCK(rl); rl->param.arg[arg2] = value; MLX5E_RL_WUNLOCK(rl); break; default: break; } done: PRIV_UNLOCK(priv); return (error); } static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x, struct sysctl_oid *node, const char *name, const char *desc) { /* * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will * take care of loading default sysctl value from the kernel * environment, if any: */ if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) { /* read-only SYSCTLs */ SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc); } else { if (strstr(name, "_def") != 0) { #ifdef RATELIMIT_DEBUG /* tunable read-only advanced SYSCTLs */ SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name, CTLTYPE_U64 | CTLFLAG_RDTUN | CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc); #endif } else { /* read-write SYSCTLs */ SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name, CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc); } } } static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x, struct sysctl_oid *node, const char *name, const char *desc) { /* read-only SYSCTLs */ SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name, CTLFLAG_RD, &rl->stats.arg[x], 0, desc); } #else int mlx5e_rl_init(struct mlx5e_priv *priv) { return (0); } void mlx5e_rl_cleanup(struct mlx5e_priv *priv) { /* NOP */ } #endif /* RATELIMIT */ diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c index 4b1fb25e0f82..28a49ec98fb2 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c @@ -1,683 +1,683 @@ /*- * Copyright (c) 2015-2021 Mellanox Technologies. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_rss.h" #include "opt_ratelimit.h" #include #include static inline int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix) { bus_dma_segment_t segs[MLX5E_MAX_BUSDMA_RX_SEGS]; struct mbuf *mb; int nsegs; int err; struct mbuf *mb_head; int i; if (rq->mbuf[ix].mbuf != NULL) return (0); mb_head = mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MLX5E_MAX_RX_BYTES); if (unlikely(mb == NULL)) return (-ENOMEM); mb->m_len = MLX5E_MAX_RX_BYTES; mb->m_pkthdr.len = MLX5E_MAX_RX_BYTES; for (i = 1; i < rq->nsegs; i++) { if (mb_head->m_pkthdr.len >= rq->wqe_sz) break; mb = mb->m_next = m_getjcl(M_NOWAIT, MT_DATA, 0, MLX5E_MAX_RX_BYTES); if (unlikely(mb == NULL)) { m_freem(mb_head); return (-ENOMEM); } mb->m_len = MLX5E_MAX_RX_BYTES; mb_head->m_pkthdr.len += MLX5E_MAX_RX_BYTES; } /* rewind to first mbuf in chain */ mb = mb_head; /* get IP header aligned */ m_adj(mb, MLX5E_NET_IP_ALIGN); err = -bus_dmamap_load_mbuf_sg(rq->dma_tag, rq->mbuf[ix].dma_map, mb, segs, &nsegs, BUS_DMA_NOWAIT); if (err != 0) goto err_free_mbuf; if (unlikely(nsegs == 0)) { bus_dmamap_unload(rq->dma_tag, rq->mbuf[ix].dma_map); err = -ENOMEM; goto err_free_mbuf; } wqe->data[0].addr = cpu_to_be64(segs[0].ds_addr); wqe->data[0].byte_count = cpu_to_be32(segs[0].ds_len | MLX5_HW_START_PADDING); for (i = 1; i != nsegs; i++) { wqe->data[i].addr = cpu_to_be64(segs[i].ds_addr); wqe->data[i].byte_count = cpu_to_be32(segs[i].ds_len); } for (; i < rq->nsegs; i++) { wqe->data[i].addr = 0; wqe->data[i].byte_count = 0; } rq->mbuf[ix].mbuf = mb; rq->mbuf[ix].data = mb->m_data; bus_dmamap_sync(rq->dma_tag, rq->mbuf[ix].dma_map, BUS_DMASYNC_PREREAD); return (0); err_free_mbuf: m_freem(mb); return (err); } static void mlx5e_post_rx_wqes(struct mlx5e_rq *rq) { if (unlikely(rq->enabled == 0)) return; while (!mlx5_wq_ll_is_full(&rq->wq)) { struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, rq->wq.head); if (unlikely(mlx5e_alloc_rx_wqe(rq, wqe, rq->wq.head))) { callout_reset_curcpu(&rq->watchdog, 1, (void *)&mlx5e_post_rx_wqes, rq); break; } mlx5_wq_ll_push(&rq->wq, be16_to_cpu(wqe->next.next_wqe_index)); } /* ensure wqes are visible to device before updating doorbell record */ atomic_thread_fence_rel(); mlx5_wq_ll_update_db_record(&rq->wq); } static void mlx5e_lro_update_hdr(struct mbuf *mb, struct mlx5_cqe64 *cqe) { /* TODO: consider vlans, ip options, ... */ struct ether_header *eh; uint16_t eh_type; uint16_t tot_len; struct ip6_hdr *ip6 = NULL; struct ip *ip4 = NULL; struct tcphdr *th; uint32_t *ts_ptr; uint8_t l4_hdr_type; int tcp_ack; eh = mtod(mb, struct ether_header *); eh_type = ntohs(eh->ether_type); l4_hdr_type = get_cqe_l4_hdr_type(cqe); tcp_ack = ((CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA == l4_hdr_type) || (CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA == l4_hdr_type)); /* TODO: consider vlan */ tot_len = be32_to_cpu(cqe->byte_cnt) - ETHER_HDR_LEN; switch (eh_type) { case ETHERTYPE_IP: ip4 = (struct ip *)(eh + 1); th = (struct tcphdr *)(ip4 + 1); break; case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(eh + 1); th = (struct tcphdr *)(ip6 + 1); break; default: return; } ts_ptr = (uint32_t *)(th + 1); if (get_cqe_lro_tcppsh(cqe)) th->th_flags |= TH_PUSH; if (tcp_ack) { th->th_flags |= TH_ACK; th->th_ack = cqe->lro_ack_seq_num; th->th_win = cqe->lro_tcp_win; /* * FreeBSD handles only 32bit aligned timestamp right after * the TCP hdr * +--------+--------+--------+--------+ * | NOP | NOP | TSopt | 10 | * +--------+--------+--------+--------+ * | TSval timestamp | * +--------+--------+--------+--------+ * | TSecr timestamp | * +--------+--------+--------+--------+ */ if (get_cqe_lro_timestamp_valid(cqe) && (__predict_true(*ts_ptr) == ntohl(TCPOPT_NOP << 24 | TCPOPT_NOP << 16 | TCPOPT_TIMESTAMP << 8 | TCPOLEN_TIMESTAMP))) { /* * cqe->timestamp is 64bit long. * [0-31] - timestamp. * [32-64] - timestamp echo replay. */ ts_ptr[1] = *(uint32_t *)&cqe->timestamp; ts_ptr[2] = *((uint32_t *)&cqe->timestamp + 1); } } if (ip4) { ip4->ip_ttl = cqe->lro_min_ttl; ip4->ip_len = cpu_to_be16(tot_len); ip4->ip_sum = 0; ip4->ip_sum = in_cksum(mb, ip4->ip_hl << 2); } else { ip6->ip6_hlim = cqe->lro_min_ttl; ip6->ip6_plen = cpu_to_be16(tot_len - sizeof(struct ip6_hdr)); } /* TODO: handle tcp checksum */ } static uint64_t mlx5e_mbuf_tstmp(struct mlx5e_priv *priv, uint64_t hw_tstmp) { struct mlx5e_clbr_point *cp, dcp; uint64_t tstmp_sec, tstmp_nsec; uint64_t hw_clocks; uint64_t rt_cur_to_prev, res_s, res_n, res_s_modulo, res; uint64_t hw_clk_div; u_int gen; do { cp = &priv->clbr_points[priv->clbr_curr]; gen = atomic_load_acq_int(&cp->clbr_gen); if (gen == 0) return (0); dcp = *cp; atomic_thread_fence_acq(); } while (gen != dcp.clbr_gen); /* * Our goal here is to have a result that is: * * ( (cur_time - prev_time) ) * ((hw_tstmp - hw_prev) * ----------------------------- ) + prev_time * ( (hw_cur - hw_prev) ) * * With the constraints that we cannot use float and we * don't want to overflow the uint64_t numbers we are using. * * The plan is to take the clocking value of the hw timestamps * and split them into seconds and nanosecond equivalent portions. * Then we operate on the two portions seperately making sure to * bring back the carry over from the seconds when we divide. * * First up lets get the two divided into separate entities * i.e. the seconds. We use the clock frequency for this. * Note that priv->cclk was setup with the clock frequency * in hz so we are all set to go. */ hw_clocks = hw_tstmp - dcp.clbr_hw_prev; tstmp_sec = hw_clocks / priv->cclk; tstmp_nsec = hw_clocks % priv->cclk; /* Now work with them separately */ rt_cur_to_prev = (dcp.base_curr - dcp.base_prev); res_s = tstmp_sec * rt_cur_to_prev; res_n = tstmp_nsec * rt_cur_to_prev; /* Now lets get our divider */ hw_clk_div = dcp.clbr_hw_curr - dcp.clbr_hw_prev; /* Make sure to save the remainder from the seconds divide */ res_s_modulo = res_s % hw_clk_div; res_s /= hw_clk_div; /* scale the remainder to where it should be */ res_s_modulo *= priv->cclk; /* Now add in the remainder */ res_n += res_s_modulo; /* Now do the divide */ res_n /= hw_clk_div; res_s *= priv->cclk; /* Recombine the two */ res = res_s + res_n; /* And now add in the base time to get to the real timestamp */ res += dcp.base_prev; return (res); } static inline void mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe, struct mlx5e_rq *rq, struct mbuf *mb, u32 cqe_bcnt) { - struct ifnet *ifp = rq->ifp; + if_t ifp = rq->ifp; struct mlx5e_channel *c; struct mbuf *mb_head; int lro_num_seg; /* HW LRO session aggregated packets counter */ uint64_t tstmp; lro_num_seg = be32_to_cpu(cqe->srqn) >> 24; if (lro_num_seg > 1) { mlx5e_lro_update_hdr(mb, cqe); rq->stats.lro_packets++; rq->stats.lro_bytes += cqe_bcnt; } mb->m_pkthdr.len = cqe_bcnt; for (mb_head = mb; mb != NULL; mb = mb->m_next) { if (mb->m_len > cqe_bcnt) mb->m_len = cqe_bcnt; cqe_bcnt -= mb->m_len; if (likely(cqe_bcnt == 0)) { if (likely(mb->m_next != NULL)) { /* trim off empty mbufs */ m_freem(mb->m_next); mb->m_next = NULL; } break; } } /* rewind to first mbuf in chain */ mb = mb_head; /* check if a Toeplitz hash was computed */ if (cqe->rss_hash_type != 0) { mb->m_pkthdr.flowid = be32_to_cpu(cqe->rss_hash_result); #ifdef RSS /* decode the RSS hash type */ switch (cqe->rss_hash_type & (CQE_RSS_DST_HTYPE_L4 | CQE_RSS_DST_HTYPE_IP)) { /* IPv4 */ case (CQE_RSS_DST_HTYPE_TCP | CQE_RSS_DST_HTYPE_IPV4): M_HASHTYPE_SET(mb, M_HASHTYPE_RSS_TCP_IPV4); break; case (CQE_RSS_DST_HTYPE_UDP | CQE_RSS_DST_HTYPE_IPV4): M_HASHTYPE_SET(mb, M_HASHTYPE_RSS_UDP_IPV4); break; case CQE_RSS_DST_HTYPE_IPV4: M_HASHTYPE_SET(mb, M_HASHTYPE_RSS_IPV4); break; /* IPv6 */ case (CQE_RSS_DST_HTYPE_TCP | CQE_RSS_DST_HTYPE_IPV6): M_HASHTYPE_SET(mb, M_HASHTYPE_RSS_TCP_IPV6); break; case (CQE_RSS_DST_HTYPE_UDP | CQE_RSS_DST_HTYPE_IPV6): M_HASHTYPE_SET(mb, M_HASHTYPE_RSS_UDP_IPV6); break; case CQE_RSS_DST_HTYPE_IPV6: M_HASHTYPE_SET(mb, M_HASHTYPE_RSS_IPV6); break; default: /* Other */ M_HASHTYPE_SET(mb, M_HASHTYPE_OPAQUE_HASH); break; } #else M_HASHTYPE_SET(mb, M_HASHTYPE_OPAQUE_HASH); #endif #ifdef M_HASHTYPE_SETINNER if (cqe_is_tunneled(cqe)) M_HASHTYPE_SETINNER(mb); #endif } else { mb->m_pkthdr.flowid = rq->ix; M_HASHTYPE_SET(mb, M_HASHTYPE_OPAQUE); } mb->m_pkthdr.rcvif = ifp; mb->m_pkthdr.leaf_rcvif = ifp; if (cqe_is_tunneled(cqe)) { /* * CQE can be tunneled only if TIR is configured to * enable parsing of tunneled payload, so no need to * check for capabilities. */ if (((cqe->hds_ip_ext & (CQE_L2_OK | CQE_L3_OK)) == (CQE_L2_OK | CQE_L3_OK))) { mb->m_pkthdr.csum_flags |= CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID | CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; mb->m_pkthdr.csum_data = htons(0xffff); if (likely((cqe->hds_ip_ext & CQE_L4_OK) == CQE_L4_OK)) { mb->m_pkthdr.csum_flags |= CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID; } } else { rq->stats.csum_none++; } - } else if (likely((ifp->if_capenable & (IFCAP_RXCSUM | + } else if (likely((if_getcapenable(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) != 0) && ((cqe->hds_ip_ext & (CQE_L2_OK | CQE_L3_OK | CQE_L4_OK)) == (CQE_L2_OK | CQE_L3_OK | CQE_L4_OK))) { mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; mb->m_pkthdr.csum_data = htons(0xffff); } else { rq->stats.csum_none++; } if (cqe_has_vlan(cqe)) { mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->vlan_info); mb->m_flags |= M_VLANTAG; } c = container_of(rq, struct mlx5e_channel, rq); if (c->priv->clbr_done >= 2) { tstmp = mlx5e_mbuf_tstmp(c->priv, be64_to_cpu(cqe->timestamp)); if ((tstmp & MLX5_CQE_TSTMP_PTP) != 0) { /* * Timestamp was taken on the packet entrance, * instead of the cqe generation. */ tstmp &= ~MLX5_CQE_TSTMP_PTP; mb->m_flags |= M_TSTMP_HPREC; } if (tstmp != 0) { mb->m_pkthdr.rcv_tstmp = tstmp; mb->m_flags |= M_TSTMP; } } switch (get_cqe_tls_offload(cqe)) { case CQE_TLS_OFFLOAD_DECRYPTED: /* set proper checksum flag for decrypted packets */ mb->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED; rq->stats.decrypted_ok_packets++; break; case CQE_TLS_OFFLOAD_ERROR: rq->stats.decrypted_error_packets++; break; default: break; } } static inline void mlx5e_read_cqe_slot(struct mlx5e_cq *cq, u32 cc, void *data) { memcpy(data, mlx5_cqwq_get_wqe(&cq->wq, (cc & cq->wq.sz_m1)), sizeof(struct mlx5_cqe64)); } static inline void mlx5e_write_cqe_slot(struct mlx5e_cq *cq, u32 cc, void *data) { memcpy(mlx5_cqwq_get_wqe(&cq->wq, cc & cq->wq.sz_m1), data, sizeof(struct mlx5_cqe64)); } static inline void mlx5e_decompress_cqe(struct mlx5e_cq *cq, struct mlx5_cqe64 *title, struct mlx5_mini_cqe8 *mini, u16 wqe_counter, int i) { /* * NOTE: The fields which are not set here are copied from the * initial and common title. See memcpy() in * mlx5e_write_cqe_slot(). */ title->byte_cnt = mini->byte_cnt; title->wqe_counter = cpu_to_be16((wqe_counter + i) & cq->wq.sz_m1); title->rss_hash_result = mini->rx_hash_result; /* * Since we use MLX5_CQE_FORMAT_HASH when creating the RX CQ, * the value of the checksum should be ignored. */ title->check_sum = 0; title->op_own = (title->op_own & 0xf0) | (((cq->wq.cc + i) >> cq->wq.log_sz) & 1); } #define MLX5E_MINI_ARRAY_SZ 8 /* Make sure structs are not packet differently */ CTASSERT(sizeof(struct mlx5_cqe64) == sizeof(struct mlx5_mini_cqe8) * MLX5E_MINI_ARRAY_SZ); static void mlx5e_decompress_cqes(struct mlx5e_cq *cq) { struct mlx5_mini_cqe8 mini_array[MLX5E_MINI_ARRAY_SZ]; struct mlx5_cqe64 title; u32 cqe_count; u32 i = 0; u16 title_wqe_counter; mlx5e_read_cqe_slot(cq, cq->wq.cc, &title); title_wqe_counter = be16_to_cpu(title.wqe_counter); cqe_count = be32_to_cpu(title.byte_cnt); /* Make sure we won't overflow */ KASSERT(cqe_count <= cq->wq.sz_m1, ("%s: cqe_count %u > cq->wq.sz_m1 %u", __func__, cqe_count, cq->wq.sz_m1)); mlx5e_read_cqe_slot(cq, cq->wq.cc + 1, mini_array); while (true) { mlx5e_decompress_cqe(cq, &title, &mini_array[i % MLX5E_MINI_ARRAY_SZ], title_wqe_counter, i); mlx5e_write_cqe_slot(cq, cq->wq.cc + i, &title); i++; if (i == cqe_count) break; if (i % MLX5E_MINI_ARRAY_SZ == 0) mlx5e_read_cqe_slot(cq, cq->wq.cc + i, mini_array); } } static int mlx5e_poll_rx_cq(struct mlx5e_rq *rq, int budget) { struct pfil_head *pfil; int i, rv; - CURVNET_SET_QUIET(rq->ifp->if_vnet); + CURVNET_SET_QUIET(if_getvnet(rq->ifp)); pfil = rq->channel->priv->pfil; for (i = 0; i < budget; i++) { struct mlx5e_rx_wqe *wqe; struct mlx5_cqe64 *cqe; struct mbuf *mb; __be16 wqe_counter_be; u16 wqe_counter; u32 byte_cnt, seglen; cqe = mlx5e_get_cqe(&rq->cq); if (!cqe) break; if (mlx5_get_cqe_format(cqe) == MLX5_COMPRESSED) mlx5e_decompress_cqes(&rq->cq); mlx5_cqwq_pop(&rq->cq.wq); wqe_counter_be = cqe->wqe_counter; wqe_counter = be16_to_cpu(wqe_counter_be); wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter); byte_cnt = be32_to_cpu(cqe->byte_cnt); bus_dmamap_sync(rq->dma_tag, rq->mbuf[wqe_counter].dma_map, BUS_DMASYNC_POSTREAD); if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) { mlx5e_dump_err_cqe(&rq->cq, rq->rqn, (const void *)cqe); rq->stats.wqe_err++; goto wq_ll_pop; } if (pfil != NULL && PFIL_HOOKED_IN(pfil)) { seglen = MIN(byte_cnt, MLX5E_MAX_RX_BYTES); rv = pfil_mem_in(rq->channel->priv->pfil, rq->mbuf[wqe_counter].data, seglen, rq->ifp, &mb); switch (rv) { case PFIL_DROPPED: case PFIL_CONSUMED: /* * Filter dropped or consumed it. In * either case, we can just recycle * buffer; there is no more work to do. */ rq->stats.packets++; goto wq_ll_pop; case PFIL_REALLOCED: /* * Filter copied it; recycle buffer * and receive the new mbuf allocated * by the Filter */ goto rx_common; default: /* * The Filter said it was OK, so * receive like normal. */ KASSERT(rv == PFIL_PASS, ("Filter returned %d!\n", rv)); } } if ((MHLEN - MLX5E_NET_IP_ALIGN) >= byte_cnt && (mb = m_gethdr(M_NOWAIT, MT_DATA)) != NULL) { /* set maximum mbuf length */ mb->m_len = MHLEN - MLX5E_NET_IP_ALIGN; /* get IP header aligned */ mb->m_data += MLX5E_NET_IP_ALIGN; bcopy(rq->mbuf[wqe_counter].data, mtod(mb, caddr_t), byte_cnt); } else { mb = rq->mbuf[wqe_counter].mbuf; rq->mbuf[wqe_counter].mbuf = NULL; /* safety clear */ bus_dmamap_unload(rq->dma_tag, rq->mbuf[wqe_counter].dma_map); } rx_common: mlx5e_build_rx_mbuf(cqe, rq, mb, byte_cnt); rq->stats.bytes += byte_cnt; rq->stats.packets++; #ifdef NUMA - mb->m_pkthdr.numa_domain = rq->ifp->if_numa_domain; + mb->m_pkthdr.numa_domain = if_getnumadomain(rq->ifp); #endif #if !defined(HAVE_TCP_LRO_RX) tcp_lro_queue_mbuf(&rq->lro, mb); #else if (mb->m_pkthdr.csum_flags == 0 || - (rq->ifp->if_capenable & IFCAP_LRO) == 0 || + (if_getcapenable(rq->ifp) & IFCAP_LRO) == 0 || rq->lro.lro_cnt == 0 || tcp_lro_rx(&rq->lro, mb, 0) != 0) { - rq->ifp->if_input(rq->ifp, mb); + if_input(rq->ifp, mb); } #endif wq_ll_pop: mlx5_wq_ll_pop(&rq->wq, wqe_counter_be, &wqe->next.next_wqe_index); } CURVNET_RESTORE(); mlx5_cqwq_update_db_record(&rq->cq.wq); /* ensure cq space is freed before enabling more cqes */ atomic_thread_fence_rel(); return (i); } void mlx5e_rx_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe __unused) { struct mlx5e_channel *c = container_of(mcq, struct mlx5e_channel, rq.cq.mcq); struct mlx5e_rq *rq = container_of(mcq, struct mlx5e_rq, cq.mcq); int i = 0; #ifdef HAVE_PER_CQ_EVENT_PACKET #if (MHLEN < 15) #error "MHLEN is too small" #endif struct mbuf *mb = m_gethdr(M_NOWAIT, MT_DATA); if (mb != NULL) { /* this code is used for debugging purpose only */ mb->m_pkthdr.len = mb->m_len = 15; memset(mb->m_data, 255, 14); mb->m_data[14] = rq->ix; mb->m_pkthdr.rcvif = rq->ifp; mb->m_pkthdr.leaf_rcvif = rq->ifp; - rq->ifp->if_input(rq->ifp, mb); + if_input(rq->ifp, mb); } #endif for (int j = 0; j != MLX5E_MAX_TX_NUM_TC; j++) { mtx_lock(&c->sq[j].lock); c->sq[j].db_inhibit++; mtx_unlock(&c->sq[j].lock); } mtx_lock(&c->iq.lock); c->iq.db_inhibit++; mtx_unlock(&c->iq.lock); mtx_lock(&rq->mtx); /* * Polling the entire CQ without posting new WQEs results in * lack of receive WQEs during heavy traffic scenarios. */ while (1) { if (mlx5e_poll_rx_cq(rq, MLX5E_RX_BUDGET_MAX) != MLX5E_RX_BUDGET_MAX) break; i += MLX5E_RX_BUDGET_MAX; if (i >= MLX5E_BUDGET_MAX) break; mlx5e_post_rx_wqes(rq); } mlx5e_post_rx_wqes(rq); /* check for dynamic interrupt moderation callback */ if (rq->dim.mode != NET_DIM_CQ_PERIOD_MODE_DISABLED) net_dim(&rq->dim, rq->stats.packets, rq->stats.bytes); mlx5e_cq_arm(&rq->cq, MLX5_GET_DOORBELL_LOCK(&rq->channel->priv->doorbell_lock)); tcp_lro_flush_all(&rq->lro); mtx_unlock(&rq->mtx); for (int j = 0; j != MLX5E_MAX_TX_NUM_TC; j++) { mtx_lock(&c->sq[j].lock); c->sq[j].db_inhibit--; /* Update the doorbell record, if any. */ mlx5e_tx_notify_hw(c->sq + j, true); mtx_unlock(&c->sq[j].lock); } mtx_lock(&c->iq.lock); c->iq.db_inhibit--; mlx5e_iq_notify_hw(&c->iq); mtx_unlock(&c->iq.lock); } diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c index 78458ab69f13..90e686bab3d5 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c @@ -1,1179 +1,1179 @@ /*- * Copyright (c) 2015-2021 Mellanox Technologies. All rights reserved. * Copyright (c) 2022 NVIDIA corporation & affiliates. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_kern_tls.h" #include "opt_rss.h" #include "opt_ratelimit.h" #include #include static inline bool mlx5e_do_send_cqe_inline(struct mlx5e_sq *sq) { sq->cev_counter++; /* interleave the CQEs */ if (sq->cev_counter >= sq->cev_factor) { sq->cev_counter = 0; return (true); } return (false); } bool mlx5e_do_send_cqe(struct mlx5e_sq *sq) { return (mlx5e_do_send_cqe_inline(sq)); } void mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt) { u16 pi = sq->pc & sq->wq.sz_m1; struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); memset(&wqe->ctrl, 0, sizeof(wqe->ctrl)); wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP); wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); if (mlx5e_do_send_cqe_inline(sq)) wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; else wqe->ctrl.fm_ce_se = 0; /* Copy data for doorbell */ memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32)); sq->mbuf[pi].mbuf = NULL; sq->mbuf[pi].num_bytes = 0; sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); sq->pc += sq->mbuf[pi].num_wqebbs; } static uint32_t mlx5e_hash_value; static void mlx5e_hash_init(void *arg) { mlx5e_hash_value = m_ether_tcpip_hash_init(); } /* Make kernel call mlx5e_hash_init after the random stack finished initializing */ SYSINIT(mlx5e_hash_init, SI_SUB_RANDOM, SI_ORDER_ANY, &mlx5e_hash_init, NULL); static struct mlx5e_sq * -mlx5e_select_queue_by_send_tag(struct ifnet *ifp, struct mbuf *mb) +mlx5e_select_queue_by_send_tag(if_t ifp, struct mbuf *mb) { struct m_snd_tag *mb_tag; struct mlx5e_sq *sq; mb_tag = mb->m_pkthdr.snd_tag; #ifdef KERN_TLS top: #endif /* get pointer to sendqueue */ switch (mb_tag->sw->type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: sq = container_of(mb_tag, struct mlx5e_rl_channel, tag)->sq; break; #ifdef KERN_TLS case IF_SND_TAG_TYPE_TLS_RATE_LIMIT: mb_tag = container_of(mb_tag, struct mlx5e_tls_tag, tag)->rl_tag; goto top; #endif #endif case IF_SND_TAG_TYPE_UNLIMITED: sq = &container_of(mb_tag, struct mlx5e_channel, tag)->sq[0]; KASSERT((mb_tag->refcount > 0), ("mlx5e_select_queue: Channel refs are zero for unlimited tag")); break; #ifdef KERN_TLS case IF_SND_TAG_TYPE_TLS: mb_tag = container_of(mb_tag, struct mlx5e_tls_tag, tag)->rl_tag; goto top; #endif default: sq = NULL; break; } /* check if valid */ if (sq != NULL && READ_ONCE(sq->running) != 0) return (sq); return (NULL); } static struct mlx5e_sq * -mlx5e_select_queue(struct ifnet *ifp, struct mbuf *mb) +mlx5e_select_queue(if_t ifp, struct mbuf *mb) { - struct mlx5e_priv *priv = ifp->if_softc; + struct mlx5e_priv *priv = if_getsoftc(ifp); struct mlx5e_sq *sq; u32 ch; u32 tc; /* obtain VLAN information if present */ if (mb->m_flags & M_VLANTAG) { tc = (mb->m_pkthdr.ether_vtag >> 13); if (tc >= priv->num_tc) tc = priv->default_vlan_prio; } else { tc = priv->default_vlan_prio; } ch = priv->params.num_channels; /* check if flowid is set */ if (M_HASHTYPE_GET(mb) != M_HASHTYPE_NONE) { #ifdef RSS u32 temp; if (rss_hash2bucket(mb->m_pkthdr.flowid, M_HASHTYPE_GET(mb), &temp) == 0) ch = temp % ch; else #endif ch = (mb->m_pkthdr.flowid % 128) % ch; } else { ch = m_ether_tcpip_hash(MBUF_HASHFLAG_L3 | MBUF_HASHFLAG_L4, mb, mlx5e_hash_value) % ch; } /* check if send queue is running */ sq = &priv->channel[ch].sq[tc]; if (likely(READ_ONCE(sq->running) != 0)) return (sq); return (NULL); } static inline u16 mlx5e_get_l2_header_size(struct mlx5e_sq *sq, struct mbuf *mb) { struct ether_vlan_header *eh; uint16_t eth_type; int min_inline; eh = mtod(mb, struct ether_vlan_header *); if (unlikely(mb->m_len < ETHER_HDR_LEN)) { goto max_inline; } else if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { if (unlikely(mb->m_len < (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN))) goto max_inline; eth_type = ntohs(eh->evl_proto); min_inline = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { eth_type = ntohs(eh->evl_encap_proto); min_inline = ETHER_HDR_LEN; } switch (eth_type) { case ETHERTYPE_IP: case ETHERTYPE_IPV6: /* * Make sure the TOS(IPv4) or traffic class(IPv6) * field gets inlined. Else the SQ may stall. */ min_inline += 4; break; default: goto max_inline; } /* * m_copydata() will be used on the remaining header which * does not need to reside within the first m_len bytes of * data: */ if (mb->m_pkthdr.len < min_inline) goto max_inline; return (min_inline); max_inline: return (MIN(mb->m_pkthdr.len, sq->max_inline)); } /* * This function parse IPv4 and IPv6 packets looking for TCP and UDP * headers. * * Upon return the pointer at which the "ppth" argument points, is set * to the location of the TCP header. NULL is used if no TCP header is * present. * * The return value indicates the number of bytes from the beginning * of the packet until the first byte after the TCP or UDP header. If * this function returns zero, the parsing failed. */ int mlx5e_get_full_header_size(const struct mbuf *mb, const struct tcphdr **ppth) { const struct ether_vlan_header *eh; const struct tcphdr *th; const struct ip *ip; int ip_hlen, tcp_hlen; const struct ip6_hdr *ip6; uint16_t eth_type; int eth_hdr_len; eh = mtod(mb, const struct ether_vlan_header *); if (unlikely(mb->m_len < ETHER_HDR_LEN)) goto failure; if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { if (unlikely(mb->m_len < ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)) goto failure; eth_type = ntohs(eh->evl_proto); eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { eth_type = ntohs(eh->evl_encap_proto); eth_hdr_len = ETHER_HDR_LEN; } switch (eth_type) { case ETHERTYPE_IP: ip = (const struct ip *)(mb->m_data + eth_hdr_len); if (unlikely(mb->m_len < eth_hdr_len + sizeof(*ip))) goto failure; switch (ip->ip_p) { case IPPROTO_TCP: ip_hlen = ip->ip_hl << 2; eth_hdr_len += ip_hlen; goto tcp_packet; case IPPROTO_UDP: ip_hlen = ip->ip_hl << 2; eth_hdr_len += ip_hlen + sizeof(struct udphdr); th = NULL; goto udp_packet; default: goto failure; } break; case ETHERTYPE_IPV6: ip6 = (const struct ip6_hdr *)(mb->m_data + eth_hdr_len); if (unlikely(mb->m_len < eth_hdr_len + sizeof(*ip6))) goto failure; switch (ip6->ip6_nxt) { case IPPROTO_TCP: eth_hdr_len += sizeof(*ip6); goto tcp_packet; case IPPROTO_UDP: eth_hdr_len += sizeof(*ip6) + sizeof(struct udphdr); th = NULL; goto udp_packet; default: goto failure; } break; default: goto failure; } tcp_packet: if (unlikely(mb->m_len < eth_hdr_len + sizeof(*th))) { const struct mbuf *m_th = mb->m_next; if (unlikely(mb->m_len != eth_hdr_len || m_th == NULL || m_th->m_len < sizeof(*th))) goto failure; th = (const struct tcphdr *)(m_th->m_data); } else { th = (const struct tcphdr *)(mb->m_data + eth_hdr_len); } tcp_hlen = th->th_off << 2; eth_hdr_len += tcp_hlen; udp_packet: /* * m_copydata() will be used on the remaining header which * does not need to reside within the first m_len bytes of * data: */ if (unlikely(mb->m_pkthdr.len < eth_hdr_len)) goto failure; if (ppth != NULL) *ppth = th; return (eth_hdr_len); failure: if (ppth != NULL) *ppth = NULL; return (0); } /* * Locate a pointer inside a mbuf chain. Returns NULL upon failure. */ static inline void * mlx5e_parse_mbuf_chain(const struct mbuf **mb, int *poffset, int eth_hdr_len, int min_len) { if (unlikely(mb[0]->m_len == eth_hdr_len)) { poffset[0] = eth_hdr_len; if (unlikely((mb[0] = mb[0]->m_next) == NULL)) return (NULL); } if (unlikely(mb[0]->m_len < eth_hdr_len - poffset[0] + min_len)) return (NULL); return (mb[0]->m_data + eth_hdr_len - poffset[0]); } /* * This function parse IPv4 and IPv6 packets looking for UDP, VXLAN * and TCP headers. * * The return value indicates the number of bytes from the beginning * of the packet until the first byte after the TCP header. If this * function returns zero, the parsing failed. */ static int mlx5e_get_vxlan_header_size(const struct mbuf *mb, struct mlx5e_tx_wqe *wqe, uint8_t cs_mask, uint8_t opcode) { const struct ether_vlan_header *eh; struct ip *ip4; struct ip6_hdr *ip6; struct tcphdr *th; struct udphdr *udp; bool has_outer_vlan_tag; uint16_t eth_type; uint8_t ip_type; int pkt_hdr_len; int eth_hdr_len; int tcp_hlen; int ip_hlen; int offset; pkt_hdr_len = mb->m_pkthdr.len; has_outer_vlan_tag = (mb->m_flags & M_VLANTAG) != 0; offset = 0; eh = mtod(mb, const struct ether_vlan_header *); if (unlikely(mb->m_len < ETHER_HDR_LEN)) return (0); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { if (unlikely(mb->m_len < ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)) return (0); eth_type = eh->evl_proto; eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { eth_type = eh->evl_encap_proto; eth_hdr_len = ETHER_HDR_LEN; } switch (eth_type) { case htons(ETHERTYPE_IP): ip4 = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len, sizeof(*ip4)); if (unlikely(ip4 == NULL)) return (0); ip_type = ip4->ip_p; if (unlikely(ip_type != IPPROTO_UDP)) return (0); wqe->eth.swp_outer_l3_offset = eth_hdr_len / 2; wqe->eth.cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; ip_hlen = ip4->ip_hl << 2; eth_hdr_len += ip_hlen; udp = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len, sizeof(*udp)); if (unlikely(udp == NULL)) return (0); wqe->eth.swp_outer_l4_offset = eth_hdr_len / 2; wqe->eth.swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L4_TYPE; eth_hdr_len += sizeof(*udp); break; case htons(ETHERTYPE_IPV6): ip6 = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len, sizeof(*ip6)); if (unlikely(ip6 == NULL)) return (0); ip_type = ip6->ip6_nxt; if (unlikely(ip_type != IPPROTO_UDP)) return (0); wqe->eth.swp_outer_l3_offset = eth_hdr_len / 2; wqe->eth.cs_flags = MLX5_ETH_WQE_L4_CSUM; eth_hdr_len += sizeof(*ip6); udp = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len, sizeof(*udp)); if (unlikely(udp == NULL)) return (0); wqe->eth.swp_outer_l4_offset = eth_hdr_len / 2; wqe->eth.swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L4_TYPE | MLX5_ETH_WQE_SWP_OUTER_L3_TYPE; eth_hdr_len += sizeof(*udp); break; default: return (0); } /* * If the hardware is not computing inner IP checksum, then * skip inlining the inner outer UDP and VXLAN header: */ if (unlikely((cs_mask & MLX5_ETH_WQE_L3_INNER_CSUM) == 0)) goto done; if (unlikely(mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len, 8) == NULL)) return (0); eth_hdr_len += 8; /* Check for ethernet header again. */ eh = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len, ETHER_HDR_LEN); if (unlikely(eh == NULL)) return (0); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { if (unlikely(mb->m_len < eth_hdr_len - offset + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)) return (0); eth_type = eh->evl_proto; eth_hdr_len += ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { eth_type = eh->evl_encap_proto; eth_hdr_len += ETHER_HDR_LEN; } /* Check for IP header again. */ switch (eth_type) { case htons(ETHERTYPE_IP): ip4 = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len, sizeof(*ip4)); if (unlikely(ip4 == NULL)) return (0); wqe->eth.swp_inner_l3_offset = eth_hdr_len / 2; wqe->eth.cs_flags |= MLX5_ETH_WQE_L3_INNER_CSUM; ip_type = ip4->ip_p; ip_hlen = ip4->ip_hl << 2; eth_hdr_len += ip_hlen; break; case htons(ETHERTYPE_IPV6): ip6 = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len, sizeof(*ip6)); if (unlikely(ip6 == NULL)) return (0); wqe->eth.swp_inner_l3_offset = eth_hdr_len / 2; wqe->eth.swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_TYPE; ip_type = ip6->ip6_nxt; eth_hdr_len += sizeof(*ip6); break; default: return (0); } /* * If the hardware is not computing inner UDP/TCP checksum, * then skip inlining the inner UDP/TCP header: */ if (unlikely((cs_mask & MLX5_ETH_WQE_L4_INNER_CSUM) == 0)) goto done; switch (ip_type) { case IPPROTO_UDP: udp = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len, sizeof(*udp)); if (unlikely(udp == NULL)) return (0); wqe->eth.swp_inner_l4_offset = (eth_hdr_len / 2); wqe->eth.cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM; wqe->eth.swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_TYPE; eth_hdr_len += sizeof(*udp); break; case IPPROTO_TCP: th = mlx5e_parse_mbuf_chain(&mb, &offset, eth_hdr_len, sizeof(*th)); if (unlikely(th == NULL)) return (0); wqe->eth.swp_inner_l4_offset = eth_hdr_len / 2; wqe->eth.cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM; tcp_hlen = th->th_off << 2; eth_hdr_len += tcp_hlen; break; default: return (0); } done: if (unlikely(pkt_hdr_len < eth_hdr_len)) return (0); /* Account for software inserted VLAN tag, if any. */ if (unlikely(has_outer_vlan_tag)) { wqe->eth.swp_outer_l3_offset += ETHER_VLAN_ENCAP_LEN / 2; wqe->eth.swp_outer_l4_offset += ETHER_VLAN_ENCAP_LEN / 2; wqe->eth.swp_inner_l3_offset += ETHER_VLAN_ENCAP_LEN / 2; wqe->eth.swp_inner_l4_offset += ETHER_VLAN_ENCAP_LEN / 2; } /* * When inner checksums are set, outer L4 checksum flag must * be disabled. */ if (wqe->eth.cs_flags & (MLX5_ETH_WQE_L3_INNER_CSUM | MLX5_ETH_WQE_L4_INNER_CSUM)) wqe->eth.cs_flags &= ~MLX5_ETH_WQE_L4_CSUM; return (eth_hdr_len); } struct mlx5_wqe_dump_seg { struct mlx5_wqe_ctrl_seg ctrl; struct mlx5_wqe_data_seg data; } __aligned(MLX5_SEND_WQE_BB); CTASSERT(DIV_ROUND_UP(2, MLX5_SEND_WQEBB_NUM_DS) == 1); int mlx5e_sq_dump_xmit(struct mlx5e_sq *sq, struct mlx5e_xmit_args *parg, struct mbuf **mbp) { bus_dma_segment_t segs[MLX5E_MAX_TX_MBUF_FRAGS]; struct mlx5_wqe_dump_seg *wqe; struct mlx5_wqe_dump_seg *wqe_last; int nsegs; int xsegs; u32 off; u32 msb; int err; int x; struct mbuf *mb; const u32 ds_cnt = 2; u16 pi; const u8 opcode = MLX5_OPCODE_DUMP; /* get pointer to mbuf */ mb = *mbp; /* get producer index */ pi = sq->pc & sq->wq.sz_m1; sq->mbuf[pi].num_bytes = mb->m_pkthdr.len; sq->mbuf[pi].num_wqebbs = 0; /* check number of segments in mbuf */ err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map, mb, segs, &nsegs, BUS_DMA_NOWAIT); if (err == EFBIG) { /* update statistics */ sq->stats.defragged++; /* too many mbuf fragments */ mb = m_defrag(*mbp, M_NOWAIT); if (mb == NULL) { mb = *mbp; goto tx_drop; } /* try again */ err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map, mb, segs, &nsegs, BUS_DMA_NOWAIT); } if (err != 0) goto tx_drop; /* make sure all mbuf data, if any, is visible to the bus */ bus_dmamap_sync(sq->dma_tag, sq->mbuf[pi].dma_map, BUS_DMASYNC_PREWRITE); /* compute number of real DUMP segments */ msb = sq->priv->params_ethtool.hw_mtu_msb; for (x = xsegs = 0; x != nsegs; x++) xsegs += howmany((u32)segs[x].ds_len, msb); /* check if there are no segments */ if (unlikely(xsegs == 0)) { bus_dmamap_unload(sq->dma_tag, sq->mbuf[pi].dma_map); m_freem(mb); *mbp = NULL; /* safety clear */ return (0); } /* return ENOBUFS if the queue is full */ if (unlikely(!mlx5e_sq_has_room_for(sq, xsegs))) { sq->stats.enobuf++; bus_dmamap_unload(sq->dma_tag, sq->mbuf[pi].dma_map); m_freem(mb); *mbp = NULL; /* safety clear */ return (ENOBUFS); } wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); wqe_last = mlx5_wq_cyc_get_wqe(&sq->wq, sq->wq.sz_m1); for (x = 0; x != nsegs; x++) { for (off = 0; off < segs[x].ds_len; off += msb) { u32 len = segs[x].ds_len - off; /* limit length */ if (likely(len > msb)) len = msb; memset(&wqe->ctrl, 0, sizeof(wqe->ctrl)); /* fill control segment */ wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode); wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); wqe->ctrl.imm = cpu_to_be32(parg->tisn << 8); /* fill data segment */ wqe->data.addr = cpu_to_be64((uint64_t)segs[x].ds_addr + off); wqe->data.lkey = sq->mkey_be; wqe->data.byte_count = cpu_to_be32(len); /* advance to next building block */ if (unlikely(wqe == wqe_last)) wqe = mlx5_wq_cyc_get_wqe(&sq->wq, 0); else wqe++; sq->mbuf[pi].num_wqebbs++; sq->pc++; } } wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); wqe_last = mlx5_wq_cyc_get_wqe(&sq->wq, (sq->pc - 1) & sq->wq.sz_m1); /* put in place data fence */ wqe->ctrl.fm_ce_se |= MLX5_FENCE_MODE_INITIATOR_SMALL; /* check if we should generate a completion event */ if (mlx5e_do_send_cqe_inline(sq)) wqe_last->ctrl.fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE; /* copy data for doorbell */ memcpy(sq->doorbell.d32, wqe_last, sizeof(sq->doorbell.d32)); /* store pointer to mbuf */ sq->mbuf[pi].mbuf = mb; sq->mbuf[pi].mst = m_snd_tag_ref(parg->mst); /* count all traffic going out */ sq->stats.packets++; sq->stats.bytes += sq->mbuf[pi].num_bytes; *mbp = NULL; /* safety clear */ return (0); tx_drop: sq->stats.dropped++; *mbp = NULL; m_freem(mb); return err; } int mlx5e_sq_xmit(struct mlx5e_sq *sq, struct mbuf **mbp) { bus_dma_segment_t segs[MLX5E_MAX_TX_MBUF_FRAGS]; struct mlx5e_xmit_args args = {}; struct mlx5_wqe_data_seg *dseg; struct mlx5e_tx_wqe *wqe; - struct ifnet *ifp; + if_t ifp; int nsegs; int err; int x; struct mbuf *mb; u16 ds_cnt; u16 pi; u8 opcode; #ifdef KERN_TLS top: #endif /* Return ENOBUFS if the queue is full */ if (unlikely(!mlx5e_sq_has_room_for(sq, 2 * MLX5_SEND_WQE_MAX_WQEBBS))) { sq->stats.enobuf++; return (ENOBUFS); } /* Align SQ edge with NOPs to avoid WQE wrap around */ pi = ((~sq->pc) & sq->wq.sz_m1); if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1)) { /* Send one multi NOP message instead of many */ mlx5e_send_nop(sq, (pi + 1) * MLX5_SEND_WQEBB_NUM_DS); pi = ((~sq->pc) & sq->wq.sz_m1); if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1)) { sq->stats.enobuf++; return (ENOMEM); } } #ifdef KERN_TLS /* Special handling for TLS packets, if any */ switch (mlx5e_sq_tls_xmit(sq, &args, mbp)) { case MLX5E_TLS_LOOP: goto top; case MLX5E_TLS_FAILURE: mb = *mbp; err = ENOMEM; goto tx_drop; case MLX5E_TLS_DEFERRED: return (0); case MLX5E_TLS_CONTINUE: default: break; } #endif /* Setup local variables */ pi = sq->pc & sq->wq.sz_m1; wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); ifp = sq->ifp; memset(wqe, 0, sizeof(*wqe)); /* get pointer to mbuf */ mb = *mbp; /* Send a copy of the frame to the BPF listener, if any */ - if (ifp != NULL && ifp->if_bpf != NULL) + if (ifp != NULL && if_getbpf(ifp) != NULL) ETHER_BPF_MTAP(ifp, mb); if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) { wqe->eth.cs_flags |= MLX5_ETH_WQE_L3_CSUM; } if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) { wqe->eth.cs_flags |= MLX5_ETH_WQE_L4_CSUM; } if (wqe->eth.cs_flags == 0) { sq->stats.csum_offload_none++; } if (mb->m_pkthdr.csum_flags & CSUM_TSO) { u32 payload_len; u32 mss = mb->m_pkthdr.tso_segsz; u32 num_pkts; wqe->eth.mss = cpu_to_be16(mss); opcode = MLX5_OPCODE_LSO; if (args.ihs == 0) args.ihs = mlx5e_get_full_header_size(mb, NULL); if (unlikely(args.ihs == 0)) { err = EINVAL; goto tx_drop; } payload_len = mb->m_pkthdr.len - args.ihs; if (payload_len == 0) num_pkts = 1; else num_pkts = DIV_ROUND_UP(payload_len, mss); sq->mbuf[pi].num_bytes = payload_len + (num_pkts * args.ihs); sq->stats.tso_packets++; sq->stats.tso_bytes += payload_len; } else if (mb->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN) { /* check for inner TCP TSO first */ if (mb->m_pkthdr.csum_flags & (CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO)) { u32 payload_len; u32 mss = mb->m_pkthdr.tso_segsz; u32 num_pkts; wqe->eth.mss = cpu_to_be16(mss); opcode = MLX5_OPCODE_LSO; if (likely(args.ihs == 0)) { args.ihs = mlx5e_get_vxlan_header_size(mb, wqe, MLX5_ETH_WQE_L3_INNER_CSUM | MLX5_ETH_WQE_L4_INNER_CSUM | MLX5_ETH_WQE_L4_CSUM | MLX5_ETH_WQE_L3_CSUM, opcode); if (unlikely(args.ihs == 0)) { err = EINVAL; goto tx_drop; } } payload_len = mb->m_pkthdr.len - args.ihs; if (payload_len == 0) num_pkts = 1; else num_pkts = DIV_ROUND_UP(payload_len, mss); sq->mbuf[pi].num_bytes = payload_len + num_pkts * args.ihs; sq->stats.tso_packets++; sq->stats.tso_bytes += payload_len; } else { opcode = MLX5_OPCODE_SEND; if (likely(args.ihs == 0)) { uint8_t cs_mask; if (mb->m_pkthdr.csum_flags & (CSUM_INNER_IP_TCP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_UDP)) { cs_mask = MLX5_ETH_WQE_L3_INNER_CSUM | MLX5_ETH_WQE_L4_INNER_CSUM | MLX5_ETH_WQE_L4_CSUM | MLX5_ETH_WQE_L3_CSUM; } else if (mb->m_pkthdr.csum_flags & CSUM_INNER_IP) { cs_mask = MLX5_ETH_WQE_L3_INNER_CSUM | MLX5_ETH_WQE_L4_CSUM | MLX5_ETH_WQE_L3_CSUM; } else { cs_mask = MLX5_ETH_WQE_L4_CSUM | MLX5_ETH_WQE_L3_CSUM; } args.ihs = mlx5e_get_vxlan_header_size(mb, wqe, cs_mask, opcode); if (unlikely(args.ihs == 0)) { err = EINVAL; goto tx_drop; } } sq->mbuf[pi].num_bytes = max_t (unsigned int, mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN); } } else { opcode = MLX5_OPCODE_SEND; if (args.ihs == 0) { switch (sq->min_inline_mode) { case MLX5_INLINE_MODE_IP: case MLX5_INLINE_MODE_TCP_UDP: args.ihs = mlx5e_get_full_header_size(mb, NULL); if (unlikely(args.ihs == 0)) args.ihs = mlx5e_get_l2_header_size(sq, mb); break; case MLX5_INLINE_MODE_L2: args.ihs = mlx5e_get_l2_header_size(sq, mb); break; case MLX5_INLINE_MODE_NONE: /* FALLTHROUGH */ default: if ((mb->m_flags & M_VLANTAG) != 0 && (sq->min_insert_caps & MLX5E_INSERT_VLAN) != 0) { /* inlining VLAN data is not required */ wqe->eth.vlan_cmd = htons(0x8000); /* bit 0 CVLAN */ wqe->eth.vlan_hdr = htons(mb->m_pkthdr.ether_vtag); args.ihs = 0; } else if ((mb->m_flags & M_VLANTAG) == 0 && (sq->min_insert_caps & MLX5E_INSERT_NON_VLAN) != 0) { /* inlining non-VLAN data is not required */ args.ihs = 0; } else { /* we are forced to inlining L2 header, if any */ args.ihs = mlx5e_get_l2_header_size(sq, mb); } break; } } sq->mbuf[pi].num_bytes = max_t (unsigned int, mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN); } if (likely(args.ihs == 0)) { /* nothing to inline */ } else if ((mb->m_flags & M_VLANTAG) != 0) { struct ether_vlan_header *eh = (struct ether_vlan_header *) wqe->eth.inline_hdr_start; /* Range checks */ if (unlikely(args.ihs > (sq->max_inline - ETHER_VLAN_ENCAP_LEN))) { if (mb->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_ENCAP_VXLAN)) { err = EINVAL; goto tx_drop; } args.ihs = (sq->max_inline - ETHER_VLAN_ENCAP_LEN); } else if (unlikely(args.ihs < ETHER_HDR_LEN)) { err = EINVAL; goto tx_drop; } m_copydata(mb, 0, ETHER_HDR_LEN, (caddr_t)eh); m_adj(mb, ETHER_HDR_LEN); /* Insert 4 bytes VLAN tag into data stream */ eh->evl_proto = eh->evl_encap_proto; eh->evl_encap_proto = htons(ETHERTYPE_VLAN); eh->evl_tag = htons(mb->m_pkthdr.ether_vtag); /* Copy rest of header data, if any */ m_copydata(mb, 0, args.ihs - ETHER_HDR_LEN, (caddr_t)(eh + 1)); m_adj(mb, args.ihs - ETHER_HDR_LEN); /* Extend header by 4 bytes */ args.ihs += ETHER_VLAN_ENCAP_LEN; wqe->eth.inline_hdr_sz = cpu_to_be16(args.ihs); } else { /* check if inline header size is too big */ if (unlikely(args.ihs > sq->max_inline)) { if (unlikely(mb->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_ENCAP_VXLAN))) { err = EINVAL; goto tx_drop; } args.ihs = sq->max_inline; } m_copydata(mb, 0, args.ihs, wqe->eth.inline_hdr_start); m_adj(mb, args.ihs); wqe->eth.inline_hdr_sz = cpu_to_be16(args.ihs); } ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; if (args.ihs > sizeof(wqe->eth.inline_hdr_start)) { ds_cnt += DIV_ROUND_UP(args.ihs - sizeof(wqe->eth.inline_hdr_start), MLX5_SEND_WQE_DS); } dseg = ((struct mlx5_wqe_data_seg *)&wqe->ctrl) + ds_cnt; err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map, mb, segs, &nsegs, BUS_DMA_NOWAIT); if (err == EFBIG) { /* Update statistics */ sq->stats.defragged++; /* Too many mbuf fragments */ mb = m_defrag(*mbp, M_NOWAIT); if (mb == NULL) { mb = *mbp; goto tx_drop; } /* Try again */ err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map, mb, segs, &nsegs, BUS_DMA_NOWAIT); } /* Catch errors */ if (err != 0) goto tx_drop; /* Make sure all mbuf data, if any, is visible to the bus */ if (nsegs != 0) { bus_dmamap_sync(sq->dma_tag, sq->mbuf[pi].dma_map, BUS_DMASYNC_PREWRITE); } else { /* All data was inlined, free the mbuf. */ bus_dmamap_unload(sq->dma_tag, sq->mbuf[pi].dma_map); m_freem(mb); mb = NULL; } for (x = 0; x != nsegs; x++) { if (segs[x].ds_len == 0) continue; dseg->addr = cpu_to_be64((uint64_t)segs[x].ds_addr); dseg->lkey = sq->mkey_be; dseg->byte_count = cpu_to_be32((uint32_t)segs[x].ds_len); dseg++; } ds_cnt = (dseg - ((struct mlx5_wqe_data_seg *)&wqe->ctrl)); wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode); wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); wqe->ctrl.imm = cpu_to_be32(args.tisn << 8); if (mlx5e_do_send_cqe_inline(sq)) wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; else wqe->ctrl.fm_ce_se = 0; /* Copy data for doorbell */ memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32)); /* Store pointer to mbuf */ sq->mbuf[pi].mbuf = mb; sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); if (unlikely(args.mst != NULL)) sq->mbuf[pi].mst = m_snd_tag_ref(args.mst); else MPASS(sq->mbuf[pi].mst == NULL); sq->pc += sq->mbuf[pi].num_wqebbs; /* Count all traffic going out */ sq->stats.packets++; sq->stats.bytes += sq->mbuf[pi].num_bytes; *mbp = NULL; /* safety clear */ return (0); tx_drop: sq->stats.dropped++; *mbp = NULL; m_freem(mb); return err; } static void mlx5e_poll_tx_cq(struct mlx5e_sq *sq, int budget) { u16 sqcc; /* * sq->cc must be updated only after mlx5_cqwq_update_db_record(), * otherwise a cq overrun may occur */ sqcc = sq->cc; while (budget > 0) { struct mlx5_cqe64 *cqe; struct m_snd_tag *mst; struct mbuf *mb; bool match; u16 sqcc_this; u16 delta; u16 x; u16 ci; cqe = mlx5e_get_cqe(&sq->cq); if (!cqe) break; mlx5_cqwq_pop(&sq->cq.wq); /* check if the completion event indicates an error */ if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) { mlx5e_dump_err_cqe(&sq->cq, sq->sqn, (const void *)cqe); sq->stats.cqe_err++; } /* setup local variables */ sqcc_this = be16toh(cqe->wqe_counter); match = false; /* update budget according to the event factor */ budget -= sq->cev_factor; for (x = 0;; x++) { if (unlikely(match != false)) { break; } else if (unlikely(x == sq->cev_factor)) { /* WQE counter match not found */ sq->stats.cqe_err++; break; } ci = sqcc & sq->wq.sz_m1; delta = sqcc_this - sqcc; match = (delta < sq->mbuf[ci].num_wqebbs); mb = sq->mbuf[ci].mbuf; sq->mbuf[ci].mbuf = NULL; mst = sq->mbuf[ci].mst; sq->mbuf[ci].mst = NULL; if (unlikely(mb == NULL)) { if (unlikely(sq->mbuf[ci].num_bytes == 0)) sq->stats.nop++; } else { bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map); /* Free transmitted mbuf */ m_freem(mb); } if (unlikely(mst != NULL)) m_snd_tag_rele(mst); sqcc += sq->mbuf[ci].num_wqebbs; } } mlx5_cqwq_update_db_record(&sq->cq.wq); /* Ensure cq space is freed before enabling more cqes */ atomic_thread_fence_rel(); sq->cc = sqcc; } static int -mlx5e_xmit_locked(struct ifnet *ifp, struct mlx5e_sq *sq, struct mbuf *mb) +mlx5e_xmit_locked(if_t ifp, struct mlx5e_sq *sq, struct mbuf *mb) { int err = 0; - if (unlikely((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || + if (unlikely((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || READ_ONCE(sq->running) == 0)) { m_freem(mb); return (ENETDOWN); } /* Do transmit */ if (mlx5e_sq_xmit(sq, &mb) != 0) { /* NOTE: m_freem() is NULL safe */ m_freem(mb); err = ENOBUFS; } /* Write the doorbell record, if any. */ mlx5e_tx_notify_hw(sq, false); /* * Check if we need to start the event timer which flushes the * transmit ring on timeout: */ if (unlikely(sq->cev_next_state == MLX5E_CEV_STATE_INITIAL && sq->cev_factor != 1)) { /* start the timer */ mlx5e_sq_cev_timeout(sq); } else { /* don't send NOPs yet */ sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS; } return (err); } int -mlx5e_xmit(struct ifnet *ifp, struct mbuf *mb) +mlx5e_xmit(if_t ifp, struct mbuf *mb) { struct mlx5e_sq *sq; int ret; if (mb->m_pkthdr.csum_flags & CSUM_SND_TAG) { MPASS(mb->m_pkthdr.snd_tag->ifp == ifp); sq = mlx5e_select_queue_by_send_tag(ifp, mb); if (unlikely(sq == NULL)) { goto select_queue; } } else { select_queue: sq = mlx5e_select_queue(ifp, mb); if (unlikely(sq == NULL)) { /* Free mbuf */ m_freem(mb); /* Invalid send queue */ return (ENXIO); } } mtx_lock(&sq->lock); ret = mlx5e_xmit_locked(ifp, sq, mb); mtx_unlock(&sq->lock); return (ret); } void mlx5e_tx_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe __unused) { struct mlx5e_sq *sq = container_of(mcq, struct mlx5e_sq, cq.mcq); mtx_lock(&sq->comp_lock); mlx5e_poll_tx_cq(sq, MLX5E_BUDGET_MAX); mlx5e_cq_arm(&sq->cq, MLX5_GET_DOORBELL_LOCK(&sq->priv->doorbell_lock)); mtx_unlock(&sq->comp_lock); } diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib.h b/sys/dev/mlx5/mlx5_ib/mlx5_ib.h index 316a45806966..e39e18dd1b39 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib.h +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib.h @@ -1,1213 +1,1213 @@ /*- * Copyright (c) 2013-2020, Mellanox Technologies, Ltd. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef MLX5_IB_H #define MLX5_IB_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define mlx5_ib_dbg(dev, format, arg...) \ pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ __LINE__, current->pid, ##arg) #define mlx5_ib_err(dev, format, arg...) \ pr_err("%s: ERR: %s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ __LINE__, current->pid, ##arg) #define mlx5_ib_warn(dev, format, arg...) \ pr_warn("%s: WARN: %s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ __LINE__, current->pid, ##arg) #define field_avail(type, fld, sz) (offsetof(type, fld) + \ sizeof(((type *)0)->fld) <= (sz)) #define MLX5_IB_DEFAULT_UIDX 0xffffff #define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index) #define MLX5_MKEY_PAGE_SHIFT_MASK __mlx5_mask(mkc, log_page_size) enum { MLX5_IB_MMAP_CMD_SHIFT = 8, MLX5_IB_MMAP_CMD_MASK = 0xff, }; enum { MLX5_RES_SCAT_DATA32_CQE = 0x1, MLX5_RES_SCAT_DATA64_CQE = 0x2, MLX5_REQ_SCAT_DATA32_CQE = 0x11, MLX5_REQ_SCAT_DATA64_CQE = 0x22, }; enum mlx5_ib_latency_class { MLX5_IB_LATENCY_CLASS_LOW, MLX5_IB_LATENCY_CLASS_MEDIUM, MLX5_IB_LATENCY_CLASS_HIGH, MLX5_IB_LATENCY_CLASS_FAST_PATH }; enum mlx5_ib_mad_ifc_flags { MLX5_MAD_IFC_IGNORE_MKEY = 1, MLX5_MAD_IFC_IGNORE_BKEY = 2, MLX5_MAD_IFC_NET_VIEW = 4, }; enum { MLX5_CROSS_CHANNEL_BFREG = 0, }; enum { MLX5_CQE_VERSION_V0, MLX5_CQE_VERSION_V1, }; enum { MLX5_IB_INVALID_UAR_INDEX = BIT(31), MLX5_IB_INVALID_BFREG = BIT(31), }; enum mlx5_ib_mmap_type { MLX5_IB_MMAP_TYPE_MEMIC = 1, MLX5_IB_MMAP_TYPE_VAR = 2, MLX5_IB_MMAP_TYPE_UAR_WC = 3, MLX5_IB_MMAP_TYPE_UAR_NC = 4, }; struct mlx5_bfreg_info { u32 *sys_pages; int num_low_latency_bfregs; unsigned int *count; /* * protect bfreg allocation data structs */ struct mutex lock; u32 ver; u8 lib_uar_4k : 1; u8 lib_uar_dyn : 1; u32 num_sys_pages; u32 num_static_sys_pages; u32 total_num_bfregs; u32 num_dyn_bfregs; }; struct mlx5_ib_ucontext { struct ib_ucontext ibucontext; struct list_head db_page_list; /* protect doorbell record alloc/free */ struct mutex db_page_mutex; struct mlx5_bfreg_info bfregi; u8 cqe_version; /* Transport Domain number */ u32 tdn; u64 lib_caps; u16 devx_uid; }; static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) { return container_of(ibucontext, struct mlx5_ib_ucontext, ibucontext); } struct mlx5_ib_pd { struct ib_pd ibpd; u32 pdn; u16 uid; }; #define MLX5_IB_FLOW_MCAST_PRIO (MLX5_BY_PASS_NUM_PRIOS - 1) #define MLX5_IB_FLOW_LAST_PRIO (MLX5_BY_PASS_NUM_REGULAR_PRIOS - 1) #if (MLX5_IB_FLOW_LAST_PRIO <= 0) #error "Invalid number of bypass priorities" #endif #define MLX5_IB_FLOW_LEFTOVERS_PRIO (MLX5_IB_FLOW_MCAST_PRIO + 1) #define MLX5_IB_NUM_FLOW_FT (MLX5_IB_FLOW_LEFTOVERS_PRIO + 1) #define MLX5_IB_NUM_SNIFFER_FTS 2 struct mlx5_ib_flow_prio { struct mlx5_flow_table *flow_table; unsigned int refcount; }; struct mlx5_ib_flow_handler { struct list_head list; struct ib_flow ibflow; struct mlx5_ib_flow_prio *prio; struct mlx5_flow_rule *rule; }; struct mlx5_ib_flow_db { struct mlx5_ib_flow_prio prios[MLX5_IB_NUM_FLOW_FT]; struct mlx5_ib_flow_prio sniffer[MLX5_IB_NUM_SNIFFER_FTS]; struct mlx5_flow_table *lag_demux_ft; /* Protect flow steering bypass flow tables * when add/del flow rules. * only single add/removal of flow steering rule could be done * simultaneously. */ struct mutex lock; }; /* Use macros here so that don't have to duplicate * enum ib_send_flags and enum ib_qp_type for low-level driver */ #define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START #define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1) #define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2) #define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (IB_SEND_RESERVED_START << 3) #define MLX5_IB_SEND_UMR_UPDATE_PD (IB_SEND_RESERVED_START << 4) #define MLX5_IB_SEND_UMR_UPDATE_ACCESS IB_SEND_RESERVED_END #define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 /* * IB_QPT_GSI creates the software wrapper around GSI, and MLX5_IB_QPT_HW_GSI * creates the actual hardware QP. */ #define MLX5_IB_QPT_HW_GSI IB_QPT_RESERVED2 #define MLX5_IB_QPT_DCI IB_QPT_RESERVED3 #define MLX5_IB_QPT_DCT IB_QPT_RESERVED4 #define MLX5_IB_WR_UMR IB_WR_RESERVED1 /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. * * These flags are intended for internal use by the mlx5_ib driver, and they * rely on the range reserved for that use in the ib_qp_create_flags enum. */ #define MLX5_IB_QP_CREATE_SQPN_QP1 IB_QP_CREATE_RESERVED_START #define MLX5_IB_QP_CREATE_WC_TEST (IB_QP_CREATE_RESERVED_START << 1) struct wr_list { u16 opcode; u16 next; }; struct mlx5_ib_wq { u64 *wrid; u32 *wr_data; struct wr_list *w_list; unsigned *wqe_head; u16 unsig_count; /* serialize post to the work queue */ spinlock_t lock; int wqe_cnt; int max_post; int max_gs; int offset; int wqe_shift; unsigned head; unsigned tail; u16 cur_post; u16 last_poll; void *qend; }; struct mlx5_ib_rwq { struct ib_wq ibwq; struct mlx5_core_qp core_qp; u32 rq_num_pas; u32 log_rq_stride; u32 log_rq_size; u32 rq_page_offset; u32 log_page_size; struct ib_umem *umem; size_t buf_size; unsigned int page_shift; int create_type; struct mlx5_db db; u32 user_index; u32 wqe_count; u32 wqe_shift; int wq_sig; }; enum { MLX5_QP_USER, MLX5_QP_KERNEL, MLX5_QP_EMPTY }; enum { MLX5_WQ_USER, MLX5_WQ_KERNEL }; struct mlx5_ib_rwq_ind_table { struct ib_rwq_ind_table ib_rwq_ind_tbl; u32 rqtn; u16 uid; }; /* * Connect-IB can trigger up to four concurrent pagefaults * per-QP. */ enum mlx5_ib_pagefault_context { MLX5_IB_PAGEFAULT_RESPONDER_READ, MLX5_IB_PAGEFAULT_REQUESTOR_READ, MLX5_IB_PAGEFAULT_RESPONDER_WRITE, MLX5_IB_PAGEFAULT_REQUESTOR_WRITE, MLX5_IB_PAGEFAULT_CONTEXTS }; static inline enum mlx5_ib_pagefault_context mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault) { return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE); } struct mlx5_ib_pfault { struct work_struct work; struct mlx5_pagefault mpfault; }; struct mlx5_ib_ubuffer { struct ib_umem *umem; int buf_size; u64 buf_addr; }; struct mlx5_ib_qp_base { struct mlx5_ib_qp *container_mibqp; struct mlx5_core_qp mqp; struct mlx5_ib_ubuffer ubuffer; }; struct mlx5_ib_qp_trans { struct mlx5_ib_qp_base base; u16 xrcdn; u8 alt_port; u8 atomic_rd_en; u8 resp_depth; }; struct mlx5_ib_rss_qp { u32 tirn; }; struct mlx5_ib_rq { struct mlx5_ib_qp_base base; struct mlx5_ib_wq *rq; struct mlx5_ib_ubuffer ubuffer; struct mlx5_db *doorbell; u32 tirn; u8 state; }; struct mlx5_ib_sq { struct mlx5_ib_qp_base base; struct mlx5_ib_wq *sq; struct mlx5_ib_ubuffer ubuffer; struct mlx5_db *doorbell; u32 tisn; u8 state; }; struct mlx5_ib_raw_packet_qp { struct mlx5_ib_sq sq; struct mlx5_ib_rq rq; }; struct mlx5_bf { int buf_size; unsigned long offset; struct mlx5_sq_bfreg *bfreg; spinlock_t lock32; }; struct mlx5_ib_dct { struct mlx5_core_dct mdct; u32 *in; }; struct mlx5_ib_qp { struct ib_qp ibqp; union { struct mlx5_ib_qp_trans trans_qp; struct mlx5_ib_raw_packet_qp raw_packet_qp; struct mlx5_ib_rss_qp rss_qp; struct mlx5_ib_dct dct; }; struct mlx5_buf buf; struct mlx5_db db; struct mlx5_ib_wq rq; u8 sq_signal_bits; u8 fm_cache; struct mlx5_ib_wq sq; /* serialize qp state modifications */ struct mutex mutex; u32 flags; u8 port; u8 state; int wq_sig; int scat_cqe; int max_inline_data; struct mlx5_bf bf; int has_rq; /* only for user space QPs. For kernel * we have it from the bf object */ int bfregn; int create_type; /* Store signature errors */ bool signature_en; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING /* * A flag that is true for QP's that are in a state that doesn't * allow page faults, and shouldn't schedule any more faults. */ int disable_page_faults; /* * The disable_page_faults_lock protects a QP's disable_page_faults * field, allowing for a thread to atomically check whether the QP * allows page faults, and if so schedule a page fault. */ spinlock_t disable_page_faults_lock; struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS]; #endif struct list_head qps_list; struct list_head cq_recv_list; struct list_head cq_send_list; }; struct mlx5_ib_cq_buf { struct mlx5_buf buf; struct ib_umem *umem; int cqe_size; int nent; }; enum mlx5_ib_qp_flags { MLX5_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, MLX5_IB_QP_CROSS_CHANNEL = IB_QP_CREATE_CROSS_CHANNEL, MLX5_IB_QP_MANAGED_SEND = IB_QP_CREATE_MANAGED_SEND, MLX5_IB_QP_MANAGED_RECV = IB_QP_CREATE_MANAGED_RECV, MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 5, /* QP uses 1 as its source QP number */ MLX5_IB_QP_SQPN_QP1 = 1 << 6, MLX5_IB_QP_CAP_SCATTER_FCS = 1 << 7, MLX5_IB_QP_RSS = 1 << 8, MLX5_IB_QP_UNDERLAY = 1 << 10, }; struct mlx5_umr_wr { struct ib_send_wr wr; union { u64 virt_addr; u64 offset; } target; struct ib_pd *pd; unsigned int page_shift; unsigned int npages; u32 length; int access_flags; u32 mkey; }; static inline const struct mlx5_umr_wr *umr_wr(const struct ib_send_wr *wr) { return container_of(wr, struct mlx5_umr_wr, wr); } struct mlx5_shared_mr_info { int mr_id; struct ib_umem *umem; }; struct mlx5_ib_cq { struct ib_cq ibcq; struct mlx5_core_cq mcq; struct mlx5_ib_cq_buf buf; struct mlx5_db db; /* serialize access to the CQ */ spinlock_t lock; /* protect resize cq */ struct mutex resize_mutex; struct mlx5_ib_cq_buf *resize_buf; struct ib_umem *resize_umem; int cqe_size; struct list_head list_send_qp; struct list_head list_recv_qp; u32 create_flags; struct list_head wc_list; enum ib_cq_notify_flags notify_flags; struct work_struct notify_work; }; struct mlx5_ib_wc { struct ib_wc wc; struct list_head list; }; struct mlx5_ib_srq { struct ib_srq ibsrq; struct mlx5_core_srq msrq; struct mlx5_buf buf; struct mlx5_db db; u64 *wrid; /* protect SRQ hanlding */ spinlock_t lock; int head; int tail; u16 wqe_ctr; struct ib_umem *umem; /* serialize arming a SRQ */ struct mutex mutex; int wq_sig; }; struct mlx5_ib_xrcd { struct ib_xrcd ibxrcd; u32 xrcdn; }; enum mlx5_ib_mtt_access_flags { MLX5_IB_MTT_READ = (1 << 0), MLX5_IB_MTT_WRITE = (1 << 1), }; struct mlx5_user_mmap_entry { struct rdma_user_mmap_entry rdma_entry; u8 mmap_flag; u64 address; u32 page_idx; }; #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) struct mlx5_ib_mr { struct ib_mr ibmr; void *descs; dma_addr_t desc_map; int ndescs; int max_descs; int desc_size; int access_mode; struct mlx5_core_mkey mmkey; struct ib_umem *umem; struct mlx5_shared_mr_info *smr_info; struct list_head list; int order; int umred; int npages; struct mlx5_ib_dev *dev; u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; struct mlx5_core_sig_ctx *sig; int live; void *descs_alloc; int access_flags; /* Needed for rereg MR */ struct mlx5_async_work cb_work; }; struct mlx5_ib_mw { struct ib_mw ibmw; struct mlx5_core_mkey mmkey; }; struct mlx5_ib_devx_mr { struct mlx5_core_mkey mmkey; int ndescs; }; struct mlx5_ib_umr_context { struct ib_cqe cqe; enum ib_wc_status status; struct completion done; }; struct umr_common { struct ib_pd *pd; struct ib_cq *cq; struct ib_qp *qp; /* control access to UMR QP */ struct semaphore sem; }; enum { MLX5_FMR_INVALID, MLX5_FMR_VALID, MLX5_FMR_BUSY, }; struct mlx5_cache_ent { struct list_head head; /* sync access to the cahce entry */ spinlock_t lock; struct dentry *dir; char name[4]; u32 order; u32 size; u32 cur; u32 miss; u32 limit; struct dentry *fsize; struct dentry *fcur; struct dentry *fmiss; struct dentry *flimit; struct mlx5_ib_dev *dev; struct work_struct work; struct delayed_work dwork; int pending; }; struct mlx5_mr_cache { struct workqueue_struct *wq; struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; int stopped; struct dentry *root; unsigned long last_add; }; struct mlx5_ib_gsi_qp; struct mlx5_ib_port_resources { struct mlx5_ib_resources *devr; struct mlx5_ib_gsi_qp *gsi; struct work_struct pkey_change_work; }; struct mlx5_ib_resources { struct ib_cq *c0; struct ib_xrcd *x0; struct ib_xrcd *x1; struct ib_pd *p0; struct ib_srq *s0; struct ib_srq *s1; struct mlx5_ib_port_resources ports[2]; /* Protects changes to the port resources */ struct mutex mutex; }; struct mlx5_ib_port { u16 q_cnt_id; }; struct mlx5_roce { /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL * netdev pointer */ rwlock_t netdev_lock; - struct ifnet *netdev; + if_t netdev; struct notifier_block nb; atomic_t next_port; }; #define MLX5_IB_STATS_COUNT(a,...) a #define MLX5_IB_STATS_VAR(a,b,c,...) b c; #define MLX5_IB_STATS_DESC(a,b,c,d,e,...) d, e, #define MLX5_IB_CONG_PARAMS(m) \ /* ECN RP */ \ m(+1, u64, rp_clamp_tgt_rate, "rp_clamp_tgt_rate", "If set, whenever a CNP is processed, the target rate is updated to be the current rate") \ m(+1, u64, rp_clamp_tgt_rate_ati, "rp_clamp_tgt_rate_ati", "If set, when receiving a CNP, the target rate should be updated if the transission rate was increased due to the timer, and not only due to the byte counter") \ m(+1, u64, rp_time_reset, "rp_time_reset", "Time in microseconds between rate increases if no CNPs are received") \ m(+1, u64, rp_byte_reset, "rp_byte_reset", "Transmitted data in bytes between rate increases if no CNP's are received. A value of zero means disabled.") \ m(+1, u64, rp_threshold, "rp_threshold", "The number of times rpByteStage or rpTimeStage can count before the RP rate control state machine advances states") \ m(+1, u64, rp_ai_rate, "rp_ai_rate", "The rate, in Mbits per second, used to increase rpTargetRate in the active increase state") \ m(+1, u64, rp_hai_rate, "rp_hai_rate", "The rate, in Mbits per second, used to increase rpTargetRate in the hyper increase state") \ m(+1, u64, rp_min_dec_fac, "rp_min_dec_fac", "The minimum factor by which the current transmit rate can be changed when processing a CNP. Value is given as a percentage, [1 .. 100]") \ m(+1, u64, rp_min_rate, "rp_min_rate", "The minimum value, in Mbps per second, for rate to limit") \ m(+1, u64, rp_rate_to_set_on_first_cnp, "rp_rate_to_set_on_first_cnp", "The rate that is set for the flow when a rate limiter is allocated to it upon first CNP received, in Mbps. A value of zero means use full port speed") \ m(+1, u64, rp_dce_tcp_g, "rp_dce_tcp_g", "Used to update the congestion estimator, alpha, once every dce_tcp_rtt once every dce_tcp_rtt microseconds") \ m(+1, u64, rp_dce_tcp_rtt, "rp_dce_tcp_rtt", "The time between updates of the aolpha value, in microseconds") \ m(+1, u64, rp_rate_reduce_monitor_period, "rp_rate_reduce_monitor_period", "The minimum time between two consecutive rate reductions for a single flow") \ m(+1, u64, rp_initial_alpha_value, "rp_initial_alpha_value", "The initial value of alpha to use when receiving the first CNP for a flow") \ m(+1, u64, rp_gd, "rp_gd", "If a CNP is received, the flow rate is reduced at the beginning of the next rate_reduce_monitor_period interval") \ /* ECN NP */ \ m(+1, u64, np_cnp_dscp, "np_cnp_dscp", "The DiffServ Code Point of the generated CNP for this port") \ m(+1, u64, np_cnp_prio_mode, "np_cnp_prio_mode", "The 802.1p priority value of the generated CNP for this port") \ m(+1, u64, np_cnp_prio, "np_cnp_prio", "The 802.1p priority value of the generated CNP for this port") #define MLX5_IB_CONG_PARAMS_NUM (0 MLX5_IB_CONG_PARAMS(MLX5_IB_STATS_COUNT)) #define MLX5_IB_CONG_STATS(m) \ m(+1, u64, syndrome, "syndrome", "Syndrome number") \ m(+1, u64, rp_cur_flows, "rp_cur_flows", "Number of flows limited") \ m(+1, u64, sum_flows, "sum_flows", "Sum of the number of flows limited over time") \ m(+1, u64, rp_cnp_ignored, "rp_cnp_ignored", "Number of CNPs and CNMs ignored") \ m(+1, u64, rp_cnp_handled, "rp_cnp_handled", "Number of CNPs and CNMs successfully handled") \ m(+1, u64, time_stamp, "time_stamp", "Time stamp in microseconds") \ m(+1, u64, accumulators_period, "accumulators_period", "The value of X variable for accumulating counters") \ m(+1, u64, np_ecn_marked_roce_packets, "np_ecn_marked_roce_packets", "Number of ECN marked packets seen") \ m(+1, u64, np_cnp_sent, "np_cnp_sent", "Number of CNPs sent") #define MLX5_IB_CONG_STATS_NUM (0 MLX5_IB_CONG_STATS(MLX5_IB_STATS_COUNT)) #define MLX5_IB_CONG_STATUS(m) \ /* ECN RP */ \ m(+1, u64, rp_0_enable, "rp_0_enable", "Enable reaction point, priority 0", MLX5_IB_RROCE_ECN_RP, 0, enable) \ m(+1, u64, rp_1_enable, "rp_1_enable", "Enable reaction point, priority 1", MLX5_IB_RROCE_ECN_RP, 1, enable) \ m(+1, u64, rp_2_enable, "rp_2_enable", "Enable reaction point, priority 2", MLX5_IB_RROCE_ECN_RP, 2, enable) \ m(+1, u64, rp_3_enable, "rp_3_enable", "Enable reaction point, priority 3", MLX5_IB_RROCE_ECN_RP, 3, enable) \ m(+1, u64, rp_4_enable, "rp_4_enable", "Enable reaction point, priority 4", MLX5_IB_RROCE_ECN_RP, 4, enable) \ m(+1, u64, rp_5_enable, "rp_5_enable", "Enable reaction point, priority 5", MLX5_IB_RROCE_ECN_RP, 5, enable) \ m(+1, u64, rp_6_enable, "rp_6_enable", "Enable reaction point, priority 6", MLX5_IB_RROCE_ECN_RP, 6, enable) \ m(+1, u64, rp_7_enable, "rp_7_enable", "Enable reaction point, priority 7", MLX5_IB_RROCE_ECN_RP, 7, enable) \ m(+1, u64, rp_8_enable, "rp_8_enable", "Enable reaction point, priority 8", MLX5_IB_RROCE_ECN_RP, 8, enable) \ m(+1, u64, rp_9_enable, "rp_9_enable", "Enable reaction point, priority 9", MLX5_IB_RROCE_ECN_RP, 9, enable) \ m(+1, u64, rp_10_enable, "rp_10_enable", "Enable reaction point, priority 10", MLX5_IB_RROCE_ECN_RP, 10, enable) \ m(+1, u64, rp_11_enable, "rp_11_enable", "Enable reaction point, priority 11", MLX5_IB_RROCE_ECN_RP, 11, enable) \ m(+1, u64, rp_12_enable, "rp_12_enable", "Enable reaction point, priority 12", MLX5_IB_RROCE_ECN_RP, 12, enable) \ m(+1, u64, rp_13_enable, "rp_13_enable", "Enable reaction point, priority 13", MLX5_IB_RROCE_ECN_RP, 13, enable) \ m(+1, u64, rp_14_enable, "rp_14_enable", "Enable reaction point, priority 14", MLX5_IB_RROCE_ECN_RP, 14, enable) \ m(+1, u64, rp_15_enable, "rp_15_enable", "Enable reaction point, priority 15", MLX5_IB_RROCE_ECN_RP, 15, enable) \ /* ECN NP */ \ m(+1, u64, np_0_enable, "np_0_enable", "Enable notification point, priority 0", MLX5_IB_RROCE_ECN_NP, 0, enable) \ m(+1, u64, np_1_enable, "np_1_enable", "Enable notification point, priority 1", MLX5_IB_RROCE_ECN_NP, 1, enable) \ m(+1, u64, np_2_enable, "np_2_enable", "Enable notification point, priority 2", MLX5_IB_RROCE_ECN_NP, 2, enable) \ m(+1, u64, np_3_enable, "np_3_enable", "Enable notification point, priority 3", MLX5_IB_RROCE_ECN_NP, 3, enable) \ m(+1, u64, np_4_enable, "np_4_enable", "Enable notification point, priority 4", MLX5_IB_RROCE_ECN_NP, 4, enable) \ m(+1, u64, np_5_enable, "np_5_enable", "Enable notification point, priority 5", MLX5_IB_RROCE_ECN_NP, 5, enable) \ m(+1, u64, np_6_enable, "np_6_enable", "Enable notification point, priority 6", MLX5_IB_RROCE_ECN_NP, 6, enable) \ m(+1, u64, np_7_enable, "np_7_enable", "Enable notification point, priority 7", MLX5_IB_RROCE_ECN_NP, 7, enable) \ m(+1, u64, np_8_enable, "np_8_enable", "Enable notification point, priority 8", MLX5_IB_RROCE_ECN_NP, 8, enable) \ m(+1, u64, np_9_enable, "np_9_enable", "Enable notification point, priority 9", MLX5_IB_RROCE_ECN_NP, 9, enable) \ m(+1, u64, np_10_enable, "np_10_enable", "Enable notification point, priority 10", MLX5_IB_RROCE_ECN_NP, 10, enable) \ m(+1, u64, np_11_enable, "np_11_enable", "Enable notification point, priority 11", MLX5_IB_RROCE_ECN_NP, 11, enable) \ m(+1, u64, np_12_enable, "np_12_enable", "Enable notification point, priority 12", MLX5_IB_RROCE_ECN_NP, 12, enable) \ m(+1, u64, np_13_enable, "np_13_enable", "Enable notification point, priority 13", MLX5_IB_RROCE_ECN_NP, 13, enable) \ m(+1, u64, np_14_enable, "np_14_enable", "Enable notification point, priority 14", MLX5_IB_RROCE_ECN_NP, 14, enable) \ m(+1, u64, np_15_enable, "np_15_enable", "Enable notification point, priority 15", MLX5_IB_RROCE_ECN_NP, 15, enable) \ #define MLX5_IB_CONG_STATUS_NUM (0 MLX5_IB_CONG_STATUS(MLX5_IB_STATS_COUNT)) struct mlx5_ib_congestion { struct sysctl_ctx_list ctx; struct sx lock; struct delayed_work dwork; union { u64 arg[1]; struct { MLX5_IB_CONG_PARAMS(MLX5_IB_STATS_VAR) MLX5_IB_CONG_STATS(MLX5_IB_STATS_VAR) MLX5_IB_CONG_STATUS(MLX5_IB_STATS_VAR) }; }; }; struct mlx5_devx_event_table { /* serialize updating the event_xa */ struct mutex event_xa_lock; struct xarray event_xa; }; struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; struct mlx5_roce roce; MLX5_DECLARE_DOORBELL_LOCK(uar_lock); int num_ports; /* serialize update of capability mask */ struct mutex cap_mask_mutex; u8 ib_active:1; u8 wc_support:1; struct umr_common umrc; /* sync used page count stats */ struct mlx5_ib_resources devr; struct mlx5_mr_cache cache; struct timer_list delay_timer; /* Prevents soft lock on massive reg MRs */ struct mutex slow_path_mutex; int fill_delay; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_odp_caps odp_caps; /* * Sleepable RCU that prevents destruction of MRs while they are still * being used by a page fault handler. */ struct srcu_struct mr_srcu; #endif struct mlx5_ib_flow_db flow_db; /* protect resources needed as part of reset flow */ spinlock_t reset_flow_resource_lock; struct list_head qp_list; /* Array with num_ports elements */ struct mlx5_ib_port *port; struct mlx5_sq_bfreg bfreg; struct mlx5_sq_bfreg wc_bfreg; struct mlx5_sq_bfreg fp_bfreg; struct mlx5_devx_event_table devx_event_table; struct mlx5_ib_congestion congestion; struct mlx5_async_ctx async_ctx; /* protect the user_td */ struct mutex lb_mutex; u32 user_td; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) { return container_of(mcq, struct mlx5_ib_cq, mcq); } static inline struct mlx5_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) { return container_of(ibxrcd, struct mlx5_ib_xrcd, ibxrcd); } static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev) { return container_of(ibdev, struct mlx5_ib_dev, ib_dev); } static inline struct mlx5_ib_dev *mlx5_udata_to_mdev(struct ib_udata *udata) { struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context( udata, struct mlx5_ib_ucontext, ibucontext); return to_mdev(context->ibucontext.device); } static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) { return container_of(ibcq, struct mlx5_ib_cq, ibcq); } static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) { return container_of(mqp, struct mlx5_ib_qp_base, mqp)->container_mibqp; } static inline struct mlx5_ib_rwq *to_mibrwq(struct mlx5_core_qp *core_qp) { return container_of(core_qp, struct mlx5_ib_rwq, core_qp); } static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mkey *mmkey) { return container_of(mmkey, struct mlx5_ib_mr, mmkey); } static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) { return container_of(ibpd, struct mlx5_ib_pd, ibpd); } static inline struct mlx5_ib_srq *to_msrq(struct ib_srq *ibsrq) { return container_of(ibsrq, struct mlx5_ib_srq, ibsrq); } static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp) { return container_of(ibqp, struct mlx5_ib_qp, ibqp); } static inline struct mlx5_ib_rwq *to_mrwq(struct ib_wq *ibwq) { return container_of(ibwq, struct mlx5_ib_rwq, ibwq); } static inline struct mlx5_ib_rwq_ind_table *to_mrwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) { return container_of(ib_rwq_ind_tbl, struct mlx5_ib_rwq_ind_table, ib_rwq_ind_tbl); } static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq) { return container_of(msrq, struct mlx5_ib_srq, msrq); } static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) { return container_of(ibmr, struct mlx5_ib_mr, ibmr); } static inline struct mlx5_ib_mw *to_mmw(struct ib_mw *ibmw) { return container_of(ibmw, struct mlx5_ib_mw, ibmw); } struct mlx5_ib_ah { struct ib_ah ibah; struct mlx5_av av; }; static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah) { return container_of(ibah, struct mlx5_ib_ah, ibah); } static inline struct mlx5_user_mmap_entry * to_mmmap(struct rdma_user_mmap_entry *rdma_entry) { return container_of(rdma_entry, struct mlx5_user_mmap_entry, rdma_entry); } int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, struct mlx5_db *db); void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index); int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const void *in_mad, void *response_mad); int mlx5_ib_create_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr, u32 flags, struct ib_udata *udata); int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); void mlx5_ib_destroy_ah(struct ib_ah *ah, u32 flags); int mlx5_ib_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *init_attr, struct ib_udata *udata); int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); void mlx5_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata); int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int mlx5_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata); int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr); int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, void *buffer, u32 length, struct mlx5_ib_qp_base *base); int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_udata *udata); void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata); int mlx5_ib_dealloc_mw(struct ib_mw *mw); int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, int zap); int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_pd *pd, struct ib_udata *udata); int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg, struct ib_udata *udata); int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *in, size_t in_mad_size, struct ib_mad_hdr *out, size_t *out_mad_size, u16 *out_mad_pkey_index); struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, struct ib_udata *udata); int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata); int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset); int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port); int mlx5_query_mad_ifc_smp_attr_node_info(struct ib_device *ibdev, struct ib_smp *out_mad); int mlx5_query_mad_ifc_system_image_guid(struct ib_device *ibdev, __be64 *sys_image_guid); int mlx5_query_mad_ifc_max_pkeys(struct ib_device *ibdev, u16 *max_pkeys); int mlx5_query_mad_ifc_vendor_id(struct ib_device *ibdev, u32 *vendor_id); int mlx5_query_mad_ifc_node_desc(struct mlx5_ib_dev *dev, char *node_desc); int mlx5_query_mad_ifc_node_guid(struct mlx5_ib_dev *dev, __be64 *node_guid); int mlx5_query_mad_ifc_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey); int mlx5_query_mad_ifc_gids(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid); int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props); int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props); int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, unsigned long max_page_shift, int *count, int *shift, int *ncont, int *order); void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int page_shift, size_t offset, size_t num_pages, __be64 *pas, int access_flags); void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int page_shift, __be64 *pas, int access_flags); void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata); void mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata); int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, u32 wq_attr_mask, struct ib_udata *udata); struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, struct ib_rwq_ind_table_init_attr *init_attr, struct ib_udata *udata); int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING extern struct workqueue_struct *mlx5_ib_page_fault_wq; void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev); void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault); void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp); int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev); int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp); void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp); void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, unsigned long end); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) { return; } static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {} static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} static inline int mlx5_ib_odp_init(void) { return 0; } static inline void mlx5_ib_odp_cleanup(void) {} static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {} static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ int mlx5_ib_get_vf_config(struct ib_device *device, int vf, u8 port, struct ifla_vf_info *info); int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, int state); int mlx5_ib_get_vf_stats(struct ib_device *device, int vf, u8 port, struct ifla_vf_stats *stats); int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, int type); __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, int index); int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, int index, enum ib_gid_type *gid_type); /* GSI QP helper functions */ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr); int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp); int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask); int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int mlx5_ib_gsi_post_send(struct ib_qp *qp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr); int mlx5_ib_gsi_post_recv(struct ib_qp *qp, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi); int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc); void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn); #if 1 /* IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) */ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user); void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid); void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev); void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev); bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type); bool mlx5_ib_devx_is_flow_counter(void *obj, u32 offset, u32 *counter_id); #else static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) { return -EOPNOTSUPP; } static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid) {} static inline void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev) {} static inline void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev) {} static inline bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type) { return false; } #endif static inline void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; mad->class_version = 1; mad->method = IB_MGMT_METHOD_GET; } static inline u8 convert_access(int acc) { return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | MLX5_PERM_LOCAL_READ; } static inline int is_qp1(enum ib_qp_type qp_type) { return qp_type == MLX5_IB_QPT_HW_GSI; } #define MLX5_MAX_UMR_SHIFT 16 #define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) static inline u32 check_cq_create_flags(u32 flags) { /* * It returns non-zero value for unsupported CQ * create flags, otherwise it returns zero. */ return (flags & ~(IB_CQ_FLAGS_IGNORE_OVERRUN | IB_CQ_FLAGS_TIMESTAMP_COMPLETION)); } static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx, u32 *user_index) { if (cqe_version) { if ((cmd_uidx == MLX5_IB_DEFAULT_UIDX) || (cmd_uidx & ~MLX5_USER_ASSIGNED_UIDX_MASK)) return -EINVAL; *user_index = cmd_uidx; } else { *user_index = MLX5_IB_DEFAULT_UIDX; } return 0; } static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext, struct mlx5_ib_create_qp *ucmd, int inlen, u32 *user_index) { u8 cqe_version = ucontext->cqe_version; if (field_avail(struct mlx5_ib_create_qp, uidx, inlen) && !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) return 0; if (!!(field_avail(struct mlx5_ib_create_qp, uidx, inlen) != !!cqe_version)) return -EINVAL; return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); } static inline int get_srq_user_index(struct mlx5_ib_ucontext *ucontext, struct mlx5_ib_create_srq *ucmd, int inlen, u32 *user_index) { u8 cqe_version = ucontext->cqe_version; if (field_avail(struct mlx5_ib_create_srq, uidx, inlen) && !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) return 0; if (!!(field_avail(struct mlx5_ib_create_srq, uidx, inlen) != !!cqe_version)) return -EINVAL; return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); } void mlx5_ib_cleanup_congestion(struct mlx5_ib_dev *); int mlx5_ib_init_congestion(struct mlx5_ib_dev *); static inline int get_uars_per_sys_page(struct mlx5_ib_dev *dev, bool lib_support) { return lib_support && MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1; } static inline int get_num_static_uars(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) { return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_static_sys_pages; } int bfregn_to_uar_index(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, u32 bfregn, bool dyn_bfreg); #endif /* MLX5_IB_H */ diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c index acd74e3ec643..629ecf3e5755 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c @@ -1,3655 +1,3663 @@ /*- * Copyright (c) 2013-2021, Mellanox Technologies, Ltd. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_rss.h" #include "opt_ratelimit.h" #include #include #include #include #include #if defined(CONFIG_X86) #include #endif #include #include #include #undef inode #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DEPEND(mlx5ib, linuxkpi, 1, 1, 1); MODULE_DEPEND(mlx5ib, mlx5, 1, 1, 1); MODULE_DEPEND(mlx5ib, ibcore, 1, 1, 1); MODULE_VERSION(mlx5ib, 1); enum { MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3, }; static enum rdma_link_layer mlx5_port_type_cap_to_rdma_ll(int port_type_cap) { switch (port_type_cap) { case MLX5_CAP_PORT_TYPE_IB: return IB_LINK_LAYER_INFINIBAND; case MLX5_CAP_PORT_TYPE_ETH: return IB_LINK_LAYER_ETHERNET; default: return IB_LINK_LAYER_UNSPECIFIED; } } static enum rdma_link_layer mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) { struct mlx5_ib_dev *dev = to_mdev(device); int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type); return mlx5_port_type_cap_to_rdma_ll(port_type_cap); } -static bool mlx5_netdev_match(struct ifnet *ndev, +static bool mlx5_netdev_match(if_t ndev, struct mlx5_core_dev *mdev, const char *dname) { - return ndev->if_type == IFT_ETHER && - ndev->if_dname != NULL && - strcmp(ndev->if_dname, dname) == 0 && - ndev->if_softc != NULL && - *(struct mlx5_core_dev **)ndev->if_softc == mdev; + return if_gettype(ndev) == IFT_ETHER && + if_getdname(ndev) != NULL && + strcmp(if_getdname(ndev), dname) == 0 && + if_getsoftc(ndev) != NULL && + *(struct mlx5_core_dev **)if_getsoftc(ndev) == mdev; } static int mlx5_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct ifnet *ndev = netdev_notifier_info_to_ifp(ptr); + if_t ndev = netdev_notifier_info_to_ifp(ptr); struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev, roce.nb); switch (event) { case NETDEV_REGISTER: case NETDEV_UNREGISTER: write_lock(&ibdev->roce.netdev_lock); /* check if network interface belongs to mlx5en */ if (mlx5_netdev_match(ndev, ibdev->mdev, "mce")) ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ? NULL : ndev; write_unlock(&ibdev->roce.netdev_lock); break; case NETDEV_UP: case NETDEV_DOWN: { - struct ifnet *upper = NULL; + if_t upper = NULL; if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev)) && ibdev->ib_active) { struct ib_event ibev = {0}; ibev.device = &ibdev->ib_dev; ibev.event = (event == NETDEV_UP) ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; ibev.element.port_num = 1; ib_dispatch_event(&ibev); } break; } default: break; } return NOTIFY_DONE; } -static struct ifnet *mlx5_ib_get_netdev(struct ib_device *device, +static if_t mlx5_ib_get_netdev(struct ib_device *device, u8 port_num) { struct mlx5_ib_dev *ibdev = to_mdev(device); - struct ifnet *ndev; + if_t ndev; /* Ensure ndev does not disappear before we invoke if_ref() */ read_lock(&ibdev->roce.netdev_lock); ndev = ibdev->roce.netdev; if (ndev) if_ref(ndev); read_unlock(&ibdev->roce.netdev_lock); return ndev; } static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed, u8 *active_width) { switch (eth_proto_oper) { case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII): case MLX5E_PROT_MASK(MLX5E_1000BASE_KX): case MLX5E_PROT_MASK(MLX5E_100BASE_TX): case MLX5E_PROT_MASK(MLX5E_1000BASE_T): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_SDR; break; case MLX5E_PROT_MASK(MLX5E_10GBASE_T): case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4): case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4): case MLX5E_PROT_MASK(MLX5E_10GBASE_KR): case MLX5E_PROT_MASK(MLX5E_10GBASE_CR): case MLX5E_PROT_MASK(MLX5E_10GBASE_SR): case MLX5E_PROT_MASK(MLX5E_10GBASE_ER_LR): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_QDR; break; case MLX5E_PROT_MASK(MLX5E_25GBASE_CR): case MLX5E_PROT_MASK(MLX5E_25GBASE_KR): case MLX5E_PROT_MASK(MLX5E_25GBASE_SR): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_EDR; break; case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4): case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4): case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4): case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4_ER4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_QDR; break; case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2): case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2): case MLX5E_PROT_MASK(MLX5E_50GBASE_KR4): case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_HDR; break; case MLX5E_PROT_MASK(MLX5E_56GBASE_R4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_FDR; break; case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4): case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4): case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4): case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_EDR; break; default: *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_QDR; return -EINVAL; } return 0; } static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u8 *active_speed, u8 *active_width) { switch (eth_proto_oper) { case MLX5E_PROT_MASK(MLX5E_SGMII_100M): case MLX5E_PROT_MASK(MLX5E_1000BASE_X_SGMII): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_SDR; break; case MLX5E_PROT_MASK(MLX5E_5GBASE_R): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_DDR; break; case MLX5E_PROT_MASK(MLX5E_10GBASE_XFI_XAUI_1): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_QDR; break; case MLX5E_PROT_MASK(MLX5E_40GBASE_XLAUI_4_XLPPI_4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_QDR; break; case MLX5E_PROT_MASK(MLX5E_25GAUI_1_25GBASE_CR_KR): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_EDR; break; case MLX5E_PROT_MASK(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2): *active_width = IB_WIDTH_2X; *active_speed = IB_SPEED_EDR; break; case MLX5E_PROT_MASK(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_HDR; break; case MLX5E_PROT_MASK(MLX5E_CAUI_4_100GBASE_CR4_KR4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_EDR; break; case MLX5E_PROT_MASK(MLX5E_100GAUI_2_100GBASE_CR2_KR2): *active_width = IB_WIDTH_2X; *active_speed = IB_SPEED_HDR; break; case MLX5E_PROT_MASK(MLX5E_100GAUI_1_100GBASE_CR_KR): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_NDR; break; case MLX5E_PROT_MASK(MLX5E_200GAUI_4_200GBASE_CR4_KR4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_HDR; break; case MLX5E_PROT_MASK(MLX5E_200GAUI_2_200GBASE_CR2_KR2): *active_width = IB_WIDTH_2X; *active_speed = IB_SPEED_NDR; break; case MLX5E_PROT_MASK(MLX5E_400GAUI_4_400GBASE_CR4_KR4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_NDR; break; default: *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_QDR; return -EINVAL; } return 0; } static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, struct ib_port_attr *props) { struct mlx5_ib_dev *dev = to_mdev(device); u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {}; - struct ifnet *ndev; + if_t ndev; enum ib_mtu ndev_ib_mtu; u16 qkey_viol_cntr; u32 eth_prot_oper; bool ext; int err; memset(props, 0, sizeof(*props)); /* Possible bad flows are checked before filling out props so in case * of an error it will still be zeroed out. */ err = mlx5_query_port_ptys(dev->mdev, out, sizeof(out), MLX5_PTYS_EN, port_num); if (err) return err; ext = MLX5_CAP_PCAM_FEATURE(dev->mdev, ptys_extended_ethernet); eth_prot_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper); if (ext) translate_eth_ext_proto_oper(eth_prot_oper, &props->active_speed, &props->active_width); else translate_eth_proto_oper(eth_prot_oper, &props->active_speed, &props->active_width); props->port_cap_flags |= IB_PORT_CM_SUP; props->port_cap_flags |= IB_PORT_IP_BASED_GIDS; props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, roce_address_table_size); props->max_mtu = IB_MTU_4096; props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg); props->pkey_tbl_len = 1; props->state = IB_PORT_DOWN; props->phys_state = IB_PORT_PHYS_STATE_DISABLED; mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr); props->qkey_viol_cntr = qkey_viol_cntr; ndev = mlx5_ib_get_netdev(device, port_num); if (!ndev) return 0; - if (ndev->if_drv_flags & IFF_DRV_RUNNING && - ndev->if_link_state == LINK_STATE_UP) { + if (if_getdrvflags(ndev) & IFF_DRV_RUNNING && + if_getlinkstate(ndev) == LINK_STATE_UP) { props->state = IB_PORT_ACTIVE; props->phys_state = IB_PORT_PHYS_STATE_LINK_UP; } - ndev_ib_mtu = iboe_get_mtu(ndev->if_mtu); + ndev_ib_mtu = iboe_get_mtu(if_getmtu(ndev)); if_rele(ndev); props->active_mtu = min(props->max_mtu, ndev_ib_mtu); return 0; } static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid, const struct ib_gid_attr *attr, void *mlx5_addr) { #define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v) char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, source_l3_address); void *mlx5_addr_mac = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, source_mac_47_32); u16 vlan_id; if (!gid) return; - ether_addr_copy(mlx5_addr_mac, IF_LLADDR(attr->ndev)); + ether_addr_copy(mlx5_addr_mac, if_getlladdr(attr->ndev)); vlan_id = rdma_vlan_dev_vlan_id(attr->ndev); if (vlan_id != 0xffff) { MLX5_SET_RA(mlx5_addr, vlan_valid, 1); MLX5_SET_RA(mlx5_addr, vlan_id, vlan_id); } switch (attr->gid_type) { case IB_GID_TYPE_IB: MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1); break; case IB_GID_TYPE_ROCE_UDP_ENCAP: MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2); break; default: WARN_ON(true); } if (attr->gid_type != IB_GID_TYPE_IB) { if (ipv6_addr_v4mapped((void *)gid)) MLX5_SET_RA(mlx5_addr, roce_l3_type, MLX5_ROCE_L3_TYPE_IPV4); else MLX5_SET_RA(mlx5_addr, roce_l3_type, MLX5_ROCE_L3_TYPE_IPV6); } if ((attr->gid_type == IB_GID_TYPE_IB) || !ipv6_addr_v4mapped((void *)gid)) memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid)); else memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4); } static int set_roce_addr(struct ib_device *device, u8 port_num, unsigned int index, const union ib_gid *gid, const struct ib_gid_attr *attr) { struct mlx5_ib_dev *dev = to_mdev(device); u32 in[MLX5_ST_SZ_DW(set_roce_address_in)] = {0}; u32 out[MLX5_ST_SZ_DW(set_roce_address_out)] = {0}; void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address); enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num); if (ll != IB_LINK_LAYER_ETHERNET) return -EINVAL; ib_gid_to_mlx5_roce_addr(gid, attr, in_addr); MLX5_SET(set_roce_address_in, in, roce_address_index, index); MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); } static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num, unsigned int index, const union ib_gid *gid, const struct ib_gid_attr *attr, __always_unused void **context) { return set_roce_addr(device, port_num, index, gid, attr); } static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num, unsigned int index, __always_unused void **context) { return set_roce_addr(device, port_num, index, NULL, NULL); } __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, int index) { struct ib_gid_attr attr; union ib_gid gid; if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr)) return 0; if (!attr.ndev) return 0; if_rele(attr.ndev); if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) return 0; return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port)); } int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, int index, enum ib_gid_type *gid_type) { struct ib_gid_attr attr; union ib_gid gid; int ret; ret = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr); if (ret) return ret; if (!attr.ndev) return -ENODEV; if_rele(attr.ndev); *gid_type = attr.gid_type; return 0; } static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) { if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) return !MLX5_CAP_GEN(dev->mdev, ib_virt); return 0; } enum { MLX5_VPORT_ACCESS_METHOD_MAD, MLX5_VPORT_ACCESS_METHOD_HCA, MLX5_VPORT_ACCESS_METHOD_NIC, }; static int mlx5_get_vport_access_method(struct ib_device *ibdev) { if (mlx5_use_mad_ifc(to_mdev(ibdev))) return MLX5_VPORT_ACCESS_METHOD_MAD; if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) return MLX5_VPORT_ACCESS_METHOD_NIC; return MLX5_VPORT_ACCESS_METHOD_HCA; } static void get_atomic_caps(struct mlx5_ib_dev *dev, struct ib_device_attr *props) { u8 tmp; u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); u8 atomic_req_8B_endianness_mode = MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode); /* Check if HW supports 8 bytes standard atomic operations and capable * of host endianness respond */ tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD; if (((atomic_operations & tmp) == tmp) && (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) && (atomic_req_8B_endianness_mode)) { props->atomic_cap = IB_ATOMIC_HCA; } else { props->atomic_cap = IB_ATOMIC_NONE; } } static int mlx5_query_system_image_guid(struct ib_device *ibdev, __be64 *sys_image_guid) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; u64 tmp; int err; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_system_image_guid(ibdev, sys_image_guid); case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp); break; case MLX5_VPORT_ACCESS_METHOD_NIC: err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp); break; default: return -EINVAL; } if (!err) *sys_image_guid = cpu_to_be64(tmp); return err; } static int mlx5_query_max_pkeys(struct ib_device *ibdev, u16 *max_pkeys) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); return 0; default: return -EINVAL; } } static int mlx5_query_vendor_id(struct ib_device *ibdev, u32 *vendor_id) { struct mlx5_ib_dev *dev = to_mdev(ibdev); switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: return mlx5_core_query_vendor_id(dev->mdev, vendor_id); default: return -EINVAL; } } static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, __be64 *node_guid) { u64 tmp; int err; switch (mlx5_get_vport_access_method(&dev->ib_dev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_node_guid(dev, node_guid); case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp); break; case MLX5_VPORT_ACCESS_METHOD_NIC: err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); break; default: return -EINVAL; } if (!err) *node_guid = cpu_to_be64(tmp); return err; } struct mlx5_reg_node_desc { u8 desc[IB_DEVICE_NODE_DESC_MAX]; }; static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc) { struct mlx5_reg_node_desc in; if (mlx5_use_mad_ifc(dev)) return mlx5_query_mad_ifc_node_desc(dev, node_desc); memset(&in, 0, sizeof(in)); return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc, sizeof(struct mlx5_reg_node_desc), MLX5_REG_NODE_DESC, 0, 0); } static int mlx5_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; int err = -ENOMEM; int max_sq_desc; int max_rq_sg; int max_sq_sg; u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz); struct mlx5_ib_query_device_resp resp = {}; size_t resp_len; u64 max_tso; resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length); if (uhw->outlen && uhw->outlen < resp_len) return -EINVAL; else resp.response_length = resp_len; if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen)) return -EINVAL; memset(props, 0, sizeof(*props)); err = mlx5_query_system_image_guid(ibdev, &props->sys_image_guid); if (err) return err; err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys); if (err) return err; err = mlx5_query_vendor_id(ibdev, &props->vendor_id); if (err) return err; props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) | ((u32)fw_rev_min(dev->mdev) << 16) | fw_rev_sub(dev->mdev); props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN; if (MLX5_CAP_GEN(mdev, pkv)) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (MLX5_CAP_GEN(mdev, qkv)) props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (MLX5_CAP_GEN(mdev, apm)) props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (MLX5_CAP_GEN(mdev, xrc)) props->device_cap_flags |= IB_DEVICE_XRC; if (MLX5_CAP_GEN(mdev, imaicl)) { props->device_cap_flags |= IB_DEVICE_MEM_WINDOW | IB_DEVICE_MEM_WINDOW_TYPE_2B; props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); /* We support 'Gappy' memory registration too */ props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG; } props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (MLX5_CAP_GEN(mdev, sho)) { props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; /* At this stage no support for signature handover */ props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | IB_PROT_T10DIF_TYPE_2 | IB_PROT_T10DIF_TYPE_3; props->sig_guard_cap = IB_GUARD_T10DIF_CRC | IB_GUARD_T10DIF_CSUM; } if (MLX5_CAP_GEN(mdev, block_lb_mc)) props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) { if (MLX5_CAP_ETH(mdev, csum_cap)) props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; if (field_avail(typeof(resp), tso_caps, uhw->outlen)) { max_tso = MLX5_CAP_ETH(mdev, max_lso_cap); if (max_tso) { resp.tso_caps.max_tso = 1 << max_tso; resp.tso_caps.supported_qpts |= 1 << IB_QPT_RAW_PACKET; resp.response_length += sizeof(resp.tso_caps); } } if (field_avail(typeof(resp), rss_caps, uhw->outlen)) { resp.rss_caps.rx_hash_function = MLX5_RX_HASH_FUNC_TOEPLITZ; resp.rss_caps.rx_hash_fields_mask = MLX5_RX_HASH_SRC_IPV4 | MLX5_RX_HASH_DST_IPV4 | MLX5_RX_HASH_SRC_IPV6 | MLX5_RX_HASH_DST_IPV6 | MLX5_RX_HASH_SRC_PORT_TCP | MLX5_RX_HASH_DST_PORT_TCP | MLX5_RX_HASH_SRC_PORT_UDP | MLX5_RX_HASH_DST_PORT_UDP; resp.response_length += sizeof(resp.rss_caps); } } else { if (field_avail(typeof(resp), tso_caps, uhw->outlen)) resp.response_length += sizeof(resp.tso_caps); if (field_avail(typeof(resp), rss_caps, uhw->outlen)) resp.response_length += sizeof(resp.rss_caps); } if (MLX5_CAP_GEN(mdev, ipoib_ipoib_offloads)) { props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; props->device_cap_flags |= IB_DEVICE_UD_TSO; } if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && MLX5_CAP_ETH(dev->mdev, scatter_fcs)) props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS; if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; props->vendor_part_id = mdev->pdev->device; props->hw_ver = mdev->pdev->revision; props->max_mr_size = ~0ull; props->page_size_cap = ~(min_page_size - 1); props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp); props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) / sizeof(struct mlx5_wqe_data_seg); max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512); max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) - sizeof(struct mlx5_wqe_raddr_seg)) / sizeof(struct mlx5_wqe_data_seg); props->max_sge = min(max_rq_sg, max_sq_sg); props->max_sge_rd = MLX5_MAX_SGE_RD; props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq); props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1; props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); props->max_pd = 1 << MLX5_CAP_GEN(mdev, log_max_pd); props->max_qp_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp); props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp); props->max_srq = 1 << MLX5_CAP_GEN(mdev, log_max_srq); props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1; props->local_ca_ack_delay = MLX5_CAP_GEN(mdev, local_ca_ack_delay); props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq_sge = max_rq_sg - 1; props->max_fast_reg_page_list_len = 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); get_atomic_caps(dev, props); props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz); props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING if (MLX5_CAP_GEN(mdev, pg)) props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; props->odp_caps = dev->odp_caps; #endif if (MLX5_CAP_GEN(mdev, cd)) props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; if (!mlx5_core_is_pf(mdev)) props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION; if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) { props->rss_caps.max_rwq_indirection_tables = 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt); props->rss_caps.max_rwq_indirection_table_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size); props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET; props->max_wq_type_rq = 1 << MLX5_CAP_GEN(dev->mdev, log_max_rq); } if (uhw->outlen) { err = ib_copy_to_udata(uhw, &resp, resp.response_length); if (err) return err; } return 0; } enum mlx5_ib_width { MLX5_IB_WIDTH_1X = 1 << 0, MLX5_IB_WIDTH_2X = 1 << 1, MLX5_IB_WIDTH_4X = 1 << 2, MLX5_IB_WIDTH_8X = 1 << 3, MLX5_IB_WIDTH_12X = 1 << 4 }; static int translate_active_width(struct ib_device *ibdev, u8 active_width, u8 *ib_width) { struct mlx5_ib_dev *dev = to_mdev(ibdev); int err = 0; if (active_width & MLX5_IB_WIDTH_1X) { *ib_width = IB_WIDTH_1X; } else if (active_width & MLX5_IB_WIDTH_2X) { *ib_width = IB_WIDTH_2X; } else if (active_width & MLX5_IB_WIDTH_4X) { *ib_width = IB_WIDTH_4X; } else if (active_width & MLX5_IB_WIDTH_8X) { *ib_width = IB_WIDTH_8X; } else if (active_width & MLX5_IB_WIDTH_12X) { *ib_width = IB_WIDTH_12X; } else { mlx5_ib_dbg(dev, "Invalid active_width %d\n", (int)active_width); err = -EINVAL; } return err; } enum ib_max_vl_num { __IB_MAX_VL_0 = 1, __IB_MAX_VL_0_1 = 2, __IB_MAX_VL_0_3 = 3, __IB_MAX_VL_0_7 = 4, __IB_MAX_VL_0_14 = 5, }; enum mlx5_vl_hw_cap { MLX5_VL_HW_0 = 1, MLX5_VL_HW_0_1 = 2, MLX5_VL_HW_0_2 = 3, MLX5_VL_HW_0_3 = 4, MLX5_VL_HW_0_4 = 5, MLX5_VL_HW_0_5 = 6, MLX5_VL_HW_0_6 = 7, MLX5_VL_HW_0_7 = 8, MLX5_VL_HW_0_14 = 15 }; static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap, u8 *max_vl_num) { switch (vl_hw_cap) { case MLX5_VL_HW_0: *max_vl_num = __IB_MAX_VL_0; break; case MLX5_VL_HW_0_1: *max_vl_num = __IB_MAX_VL_0_1; break; case MLX5_VL_HW_0_3: *max_vl_num = __IB_MAX_VL_0_3; break; case MLX5_VL_HW_0_7: *max_vl_num = __IB_MAX_VL_0_7; break; case MLX5_VL_HW_0_14: *max_vl_num = __IB_MAX_VL_0_14; break; default: return -EINVAL; } return 0; } static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; u32 *rep; int replen = MLX5_ST_SZ_BYTES(query_hca_vport_context_out); struct mlx5_ptys_reg *ptys; struct mlx5_pmtu_reg *pmtu; struct mlx5_pvlc_reg pvlc; void *ctx; int err; rep = mlx5_vzalloc(replen); ptys = kzalloc(sizeof(*ptys), GFP_KERNEL); pmtu = kzalloc(sizeof(*pmtu), GFP_KERNEL); if (!rep || !ptys || !pmtu) { err = -ENOMEM; goto out; } memset(props, 0, sizeof(*props)); err = mlx5_query_hca_vport_context(mdev, port, 0, rep, replen); if (err) goto out; ctx = MLX5_ADDR_OF(query_hca_vport_context_out, rep, hca_vport_context); props->lid = MLX5_GET(hca_vport_context, ctx, lid); props->lmc = MLX5_GET(hca_vport_context, ctx, lmc); props->sm_lid = MLX5_GET(hca_vport_context, ctx, sm_lid); props->sm_sl = MLX5_GET(hca_vport_context, ctx, sm_sl); props->state = MLX5_GET(hca_vport_context, ctx, vport_state); props->phys_state = MLX5_GET(hca_vport_context, ctx, port_physical_state); props->port_cap_flags = MLX5_GET(hca_vport_context, ctx, cap_mask1); props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size)); props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); props->bad_pkey_cntr = MLX5_GET(hca_vport_context, ctx, pkey_violation_counter); props->qkey_viol_cntr = MLX5_GET(hca_vport_context, ctx, qkey_violation_counter); props->subnet_timeout = MLX5_GET(hca_vport_context, ctx, subnet_timeout); props->init_type_reply = MLX5_GET(hca_vport_context, ctx, init_type_reply); props->grh_required = MLX5_GET(hca_vport_context, ctx, grh_required); ptys->proto_mask |= MLX5_PTYS_IB; ptys->local_port = port; err = mlx5_core_access_ptys(mdev, ptys, 0); if (err) goto out; err = translate_active_width(ibdev, ptys->ib_link_width_oper, &props->active_width); if (err) goto out; props->active_speed = (u8)ptys->ib_proto_oper; pmtu->local_port = port; err = mlx5_core_access_pmtu(mdev, pmtu, 0); if (err) goto out; props->max_mtu = pmtu->max_mtu; props->active_mtu = pmtu->oper_mtu; memset(&pvlc, 0, sizeof(pvlc)); pvlc.local_port = port; err = mlx5_core_access_pvlc(mdev, &pvlc, 0); if (err) goto out; err = translate_max_vl_num(ibdev, pvlc.vl_hw_cap, &props->max_vl_num); out: kvfree(rep); kfree(ptys); kfree(pmtu); return err; } int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_port(ibdev, port, props); case MLX5_VPORT_ACCESS_METHOD_HCA: return mlx5_query_hca_port(ibdev, port, props); case MLX5_VPORT_ACCESS_METHOD_NIC: return mlx5_query_port_roce(ibdev, port, props); default: return -EINVAL; } } static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_gids(ibdev, port, index, gid); case MLX5_VPORT_ACCESS_METHOD_HCA: return mlx5_query_hca_vport_gid(mdev, port, 0, index, gid); default: return -EINVAL; } } static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index, pkey); default: return -EINVAL; } } static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_reg_node_desc in; struct mlx5_reg_node_desc out; int err; if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) return 0; /* * If possible, pass node desc to FW, so it can generate * a 144 trap. If cmd fails, just ignore. */ memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX); err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out, sizeof(out), MLX5_REG_NODE_DESC, 0, 1); if (err) return err; memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX); return err; } static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct ib_port_attr attr; u32 tmp; int err; /* * CM layer calls ib_modify_port() regardless of the link * layer. For Ethernet ports, qkey violation and Port * capabilities are meaningless. */ if (mlx5_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_ETHERNET) return 0; mutex_lock(&dev->cap_mask_mutex); err = mlx5_ib_query_port(ibdev, port, &attr); if (err) goto out; tmp = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; err = mlx5_set_port_caps(dev->mdev, port, tmp); out: mutex_unlock(&dev->cap_mask_mutex); return err; } static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps) { mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n", caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n"); } static u16 calc_dynamic_bfregs(int uars_per_sys_page) { /* Large page with non 4k uar support might limit the dynamic size */ if (uars_per_sys_page == 1 && PAGE_SIZE > 4096) return MLX5_MIN_DYN_BFREGS; return MLX5_MAX_DYN_BFREGS; } static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k, struct mlx5_ib_alloc_ucontext_req_v2 *req, struct mlx5_bfreg_info *bfregi) { int uars_per_sys_page; int bfregs_per_sys_page; int ref_bfregs = req->total_num_bfregs; if (req->total_num_bfregs == 0) return -EINVAL; BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE); BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE); if (req->total_num_bfregs > MLX5_MAX_BFREGS) return -ENOMEM; uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k); bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR; /* This holds the required static allocation asked by the user */ req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page); if (req->num_low_latency_bfregs > req->total_num_bfregs - 1) return -EINVAL; bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page; bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page); bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs; bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page; mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n", MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no", lib_uar_4k ? "yes" : "no", ref_bfregs, req->total_num_bfregs, bfregi->total_num_bfregs, bfregi->num_sys_pages); return 0; } static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) { struct mlx5_bfreg_info *bfregi; int err; int i; bfregi = &context->bfregi; for (i = 0; i < bfregi->num_static_sys_pages; i++) { err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]); if (err) goto error; mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]); } for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++) bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX; return 0; error: for (--i; i >= 0; i--) if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i])) mlx5_ib_warn(dev, "failed to free uar %d\n", i); return err; } static void deallocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) { struct mlx5_bfreg_info *bfregi; int i; bfregi = &context->bfregi; for (i = 0; i < bfregi->num_sys_pages; i++) if (i < bfregi->num_static_sys_pages || bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX) mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); } static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn, u16 uid) { int err; if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) return 0; err = mlx5_alloc_transport_domain(dev->mdev, tdn, uid); if (err) return err; if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) && !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) return 0; mutex_lock(&dev->lb_mutex); dev->user_td++; if (dev->user_td == 2) err = mlx5_nic_vport_update_local_lb(dev->mdev, true); mutex_unlock(&dev->lb_mutex); if (err != 0) mlx5_dealloc_transport_domain(dev->mdev, *tdn, uid); return err; } static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn, u16 uid) { if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) return; mlx5_dealloc_transport_domain(dev->mdev, tdn, uid); if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) && !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) return; mutex_lock(&dev->lb_mutex); dev->user_td--; if (dev->user_td < 2) mlx5_nic_vport_update_local_lb(dev->mdev, false); mutex_unlock(&dev->lb_mutex); } static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { struct ib_device *ibdev = uctx->device; struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_alloc_ucontext_req_v2 req = {}; struct mlx5_ib_alloc_ucontext_resp resp = {}; struct mlx5_ib_ucontext *context = to_mucontext(uctx); struct mlx5_bfreg_info *bfregi; int ver; int err; size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2, max_cqe_version); bool lib_uar_4k; bool lib_uar_dyn; if (!dev->ib_active) return -EAGAIN; if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) ver = 0; else if (udata->inlen >= min_req_v2) ver = 2; else return -EINVAL; err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); if (err) return err; if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX) return -EOPNOTSUPP; if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2) return -EOPNOTSUPP; req.total_num_bfregs = ALIGN(req.total_num_bfregs, MLX5_NON_FP_BFREGS_PER_UAR); if (req.num_low_latency_bfregs > req.total_num_bfregs - 1) return -EINVAL; resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); resp.cache_line_size = cache_line_size(); resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); resp.cqe_version = min_t(__u8, (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version), req.max_cqe_version); resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT; resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1; resp.response_length = min(offsetof(typeof(resp), response_length) + sizeof(resp.response_length), udata->outlen); lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR; lib_uar_dyn = req.lib_caps & MLX5_LIB_CAP_DYN_UAR; bfregi = &context->bfregi; if (lib_uar_dyn) { bfregi->lib_uar_dyn = lib_uar_dyn; goto uar_done; } /* updates req->total_num_bfregs */ err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi); if (err) goto out_ctx; mutex_init(&bfregi->lock); bfregi->lib_uar_4k = lib_uar_4k; bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count), GFP_KERNEL); if (!bfregi->count) { err = -ENOMEM; goto out_ctx; } bfregi->sys_pages = kcalloc(bfregi->num_sys_pages, sizeof(*bfregi->sys_pages), GFP_KERNEL); if (!bfregi->sys_pages) { err = -ENOMEM; goto out_count; } err = allocate_uars(dev, context); if (err) goto out_sys_pages; uar_done: if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { err = mlx5_ib_devx_create(dev, true); if (err < 0) goto out_uars; context->devx_uid = err; } err = mlx5_ib_alloc_transport_domain(dev, &context->tdn, context->devx_uid); if (err) goto out_devx; INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); resp.tot_bfregs = lib_uar_dyn ? 0 : req.total_num_bfregs; resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports); if (field_avail(typeof(resp), cqe_version, udata->outlen)) resp.response_length += sizeof(resp.cqe_version); if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) { resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE | MLX5_USER_CMDS_SUPP_UHW_CREATE_AH; resp.response_length += sizeof(resp.cmds_supp_uhw); } /* * We don't want to expose information from the PCI bar that is located * after 4096 bytes, so if the arch only supports larger pages, let's * pretend we don't support reading the HCA's core clock. This is also * forced by mmap function. */ if (offsetofend(typeof(resp), hca_core_clock_offset) <= udata->outlen) { if (PAGE_SIZE <= 4096) { resp.comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; resp.hca_core_clock_offset = offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE; } resp.response_length += sizeof(resp.hca_core_clock_offset); } if (offsetofend(typeof(resp), log_uar_size) <= udata->outlen) resp.response_length += sizeof(resp.log_uar_size); if (offsetofend(typeof(resp), num_uars_per_page) <= udata->outlen) resp.response_length += sizeof(resp.num_uars_per_page); if (offsetofend(typeof(resp), num_dyn_bfregs) <= udata->outlen) { resp.num_dyn_bfregs = bfregi->num_dyn_bfregs; resp.response_length += sizeof(resp.num_dyn_bfregs); } err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) goto out_mdev; bfregi->ver = ver; bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs; context->cqe_version = resp.cqe_version; context->lib_caps = req.lib_caps; print_lib_caps(dev, context->lib_caps); return 0; out_mdev: mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); out_devx: if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) mlx5_ib_devx_destroy(dev, context->devx_uid); out_uars: deallocate_uars(dev, context); out_sys_pages: kfree(bfregi->sys_pages); out_count: kfree(bfregi->count); out_ctx: return err; } static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_bfreg_info *bfregi; bfregi = &context->bfregi; mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); if (context->devx_uid) mlx5_ib_devx_destroy(dev, context->devx_uid); deallocate_uars(dev, context); kfree(bfregi->sys_pages); kfree(bfregi->count); } static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int uar_idx) { int fw_uars_per_page; fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1; return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + uar_idx / fw_uars_per_page; } static int get_command(unsigned long offset) { return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK; } static int get_arg(unsigned long offset) { return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1); } static int get_index(unsigned long offset) { return get_arg(offset); } /* Index resides in an extra byte to enable larger values than 255 */ static int get_extended_index(unsigned long offset) { return get_arg(offset) | ((offset >> 16) & 0xff) << 8; } static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) { } static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) { switch (cmd) { case MLX5_IB_MMAP_WC_PAGE: return "WC"; case MLX5_IB_MMAP_REGULAR_PAGE: return "best effort WC"; case MLX5_IB_MMAP_NC_PAGE: return "NC"; default: return NULL; } } static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, struct vm_area_struct *vma, struct mlx5_ib_ucontext *context) { if ((vma->vm_end - vma->vm_start != PAGE_SIZE) || !(vma->vm_flags & VM_SHARED)) return -EINVAL; if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1) return -EOPNOTSUPP; if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; return -EOPNOTSUPP; } static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry) { struct mlx5_user_mmap_entry *mentry = to_mmmap(entry); struct mlx5_ib_dev *dev = to_mdev(entry->ucontext->device); switch (mentry->mmap_flag) { case MLX5_IB_MMAP_TYPE_UAR_WC: case MLX5_IB_MMAP_TYPE_UAR_NC: mlx5_cmd_free_uar(dev->mdev, mentry->page_idx); kfree(mentry); break; default: WARN_ON(true); } } static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, struct vm_area_struct *vma, struct mlx5_ib_ucontext *context) { struct mlx5_bfreg_info *bfregi = &context->bfregi; int err; unsigned long idx; phys_addr_t pfn; pgprot_t prot; u32 bfreg_dyn_idx = 0; u32 uar_index; int dyn_uar = (cmd == MLX5_IB_MMAP_ALLOC_WC); int max_valid_idx = dyn_uar ? bfregi->num_sys_pages : bfregi->num_static_sys_pages; if (bfregi->lib_uar_dyn) return -EINVAL; if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; if (dyn_uar) idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages; else idx = get_index(vma->vm_pgoff); if (idx >= max_valid_idx) { mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n", idx, max_valid_idx); return -EINVAL; } switch (cmd) { case MLX5_IB_MMAP_WC_PAGE: case MLX5_IB_MMAP_ALLOC_WC: case MLX5_IB_MMAP_REGULAR_PAGE: /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */ prot = pgprot_writecombine(vma->vm_page_prot); break; case MLX5_IB_MMAP_NC_PAGE: prot = pgprot_noncached(vma->vm_page_prot); break; default: return -EINVAL; } if (dyn_uar) { int uars_per_page; uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k); bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR); if (bfreg_dyn_idx >= bfregi->total_num_bfregs) { mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n", bfreg_dyn_idx, bfregi->total_num_bfregs); return -EINVAL; } mutex_lock(&bfregi->lock); /* Fail if uar already allocated, first bfreg index of each * page holds its count. */ if (bfregi->count[bfreg_dyn_idx]) { mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx); mutex_unlock(&bfregi->lock); return -EINVAL; } bfregi->count[bfreg_dyn_idx]++; mutex_unlock(&bfregi->lock); err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index); if (err) { mlx5_ib_warn(dev, "UAR alloc failed\n"); goto free_bfreg; } } else { uar_index = bfregi->sys_pages[idx]; } pfn = uar_index2pfn(dev, uar_index); mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE, prot, NULL); if (err) { mlx5_ib_err(dev, "rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n", err, mmap_cmd2str(cmd)); goto err; } if (dyn_uar) bfregi->sys_pages[idx] = uar_index; return 0; err: if (!dyn_uar) return err; mlx5_cmd_free_uar(dev->mdev, idx); free_bfreg: mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx); return err; } static unsigned long mlx5_vma_to_pgoff(struct vm_area_struct *vma) { unsigned long idx; u8 command; command = get_command(vma->vm_pgoff); idx = get_extended_index(vma->vm_pgoff); return (command << 16 | idx); } static int mlx5_ib_mmap_offset(struct mlx5_ib_dev *dev, struct vm_area_struct *vma, struct ib_ucontext *ucontext) { struct mlx5_user_mmap_entry *mentry; struct rdma_user_mmap_entry *entry; unsigned long pgoff; pgprot_t prot; phys_addr_t pfn; int ret; pgoff = mlx5_vma_to_pgoff(vma); entry = rdma_user_mmap_entry_get_pgoff(ucontext, pgoff); if (!entry) return -EINVAL; mentry = to_mmmap(entry); pfn = (mentry->address >> PAGE_SHIFT); if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR || mentry->mmap_flag == MLX5_IB_MMAP_TYPE_UAR_NC) prot = pgprot_noncached(vma->vm_page_prot); else prot = pgprot_writecombine(vma->vm_page_prot); ret = rdma_user_mmap_io(ucontext, vma, pfn, entry->npages * PAGE_SIZE, prot, entry); rdma_user_mmap_entry_put(&mentry->rdma_entry); return ret; } static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); unsigned long command; phys_addr_t pfn; command = get_command(vma->vm_pgoff); switch (command) { case MLX5_IB_MMAP_WC_PAGE: case MLX5_IB_MMAP_ALLOC_WC: if (!dev->wc_support) return -EPERM; /* FALLTHROUGH */ case MLX5_IB_MMAP_NC_PAGE: case MLX5_IB_MMAP_REGULAR_PAGE: return uar_mmap(dev, command, vma, context); case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: return -ENOSYS; case MLX5_IB_MMAP_CORE_CLOCK: if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; if (vma->vm_flags & VM_WRITE) return -EPERM; /* Don't expose to user-space information it shouldn't have */ if (PAGE_SIZE > 4096) return -EOPNOTSUPP; pfn = (dev->mdev->iseg_base + offsetof(struct mlx5_init_seg, internal_timer_h)) >> PAGE_SHIFT; return rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE, pgprot_noncached(vma->vm_page_prot), NULL); case MLX5_IB_MMAP_CLOCK_INFO: return mlx5_ib_mmap_clock_info_page(dev, vma, context); default: return mlx5_ib_mmap_offset(dev, vma, ibcontext); } return 0; } static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { struct mlx5_ib_pd *pd = to_mpd(ibpd); struct ib_device *ibdev = ibpd->device; struct mlx5_ib_alloc_pd_resp resp; int err; struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context( udata, struct mlx5_ib_ucontext, ibucontext); u16 uid = context ? context->devx_uid : 0; err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn, uid); if (err) return (err); pd->uid = uid; if (udata) { resp.pdn = pd->pdn; if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid); return -EFAULT; } } return 0; } static void mlx5_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata) { struct mlx5_ib_dev *mdev = to_mdev(pd->device); struct mlx5_ib_pd *mpd = to_mpd(pd); mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid); } enum { MATCH_CRITERIA_ENABLE_OUTER_BIT, MATCH_CRITERIA_ENABLE_MISC_BIT, MATCH_CRITERIA_ENABLE_INNER_BIT }; #define HEADER_IS_ZERO(match_criteria, headers) \ !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \ 0, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \ static u8 get_match_criteria_enable(u32 *match_criteria) { u8 match_criteria_enable; match_criteria_enable = (!HEADER_IS_ZERO(match_criteria, outer_headers)) << MATCH_CRITERIA_ENABLE_OUTER_BIT; match_criteria_enable |= (!HEADER_IS_ZERO(match_criteria, misc_parameters)) << MATCH_CRITERIA_ENABLE_MISC_BIT; match_criteria_enable |= (!HEADER_IS_ZERO(match_criteria, inner_headers)) << MATCH_CRITERIA_ENABLE_INNER_BIT; return match_criteria_enable; } static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val) { MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask); MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val); } static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) { MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask); MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val); MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2); MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2); } #define LAST_ETH_FIELD vlan_tag #define LAST_IB_FIELD sl #define LAST_IPV4_FIELD tos #define LAST_IPV6_FIELD traffic_class #define LAST_TCP_UDP_FIELD src_port /* Field is the last supported field */ #define FIELDS_NOT_SUPPORTED(filter, field)\ memchr_inv((void *)&filter.field +\ sizeof(filter.field), 0,\ sizeof(filter) -\ offsetof(typeof(filter), field) -\ sizeof(filter.field)) static int parse_flow_attr(u32 *match_c, u32 *match_v, const union ib_flow_spec *ib_spec) { void *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c, outer_headers); void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v, outer_headers); void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters); void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v, misc_parameters); switch (ib_spec->type) { case IB_FLOW_SPEC_ETH: if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD)) return -ENOTSUPP; ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, dmac_47_16), ib_spec->eth.mask.dst_mac); ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, dmac_47_16), ib_spec->eth.val.dst_mac); ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, smac_47_16), ib_spec->eth.mask.src_mac); ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, smac_47_16), ib_spec->eth.val.src_mac); if (ib_spec->eth.mask.vlan_tag) { MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, cvlan_tag, 1); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, cvlan_tag, 1); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, first_vid, ntohs(ib_spec->eth.mask.vlan_tag)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, first_vid, ntohs(ib_spec->eth.val.vlan_tag)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, first_cfi, ntohs(ib_spec->eth.mask.vlan_tag) >> 12); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, first_cfi, ntohs(ib_spec->eth.val.vlan_tag) >> 12); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, first_prio, ntohs(ib_spec->eth.mask.vlan_tag) >> 13); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, first_prio, ntohs(ib_spec->eth.val.vlan_tag) >> 13); } MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ethertype, ntohs(ib_spec->eth.mask.ether_type)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ethertype, ntohs(ib_spec->eth.val.ether_type)); break; case IB_FLOW_SPEC_IPV4: if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD)) return -ENOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ethertype, 0xffff); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ethertype, ETH_P_IP); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, src_ipv4_src_ipv6.ipv4_layout.ipv4), &ib_spec->ipv4.mask.src_ip, sizeof(ib_spec->ipv4.mask.src_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, src_ipv4_src_ipv6.ipv4_layout.ipv4), &ib_spec->ipv4.val.src_ip, sizeof(ib_spec->ipv4.val.src_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, dst_ipv4_dst_ipv6.ipv4_layout.ipv4), &ib_spec->ipv4.mask.dst_ip, sizeof(ib_spec->ipv4.mask.dst_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, dst_ipv4_dst_ipv6.ipv4_layout.ipv4), &ib_spec->ipv4.val.dst_ip, sizeof(ib_spec->ipv4.val.dst_ip)); set_tos(outer_headers_c, outer_headers_v, ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos); set_proto(outer_headers_c, outer_headers_v, ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto); break; case IB_FLOW_SPEC_IPV6: if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD)) return -ENOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ethertype, 0xffff); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ethertype, IPPROTO_IPV6); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, src_ipv4_src_ipv6.ipv6_layout.ipv6), &ib_spec->ipv6.mask.src_ip, sizeof(ib_spec->ipv6.mask.src_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, src_ipv4_src_ipv6.ipv6_layout.ipv6), &ib_spec->ipv6.val.src_ip, sizeof(ib_spec->ipv6.val.src_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, dst_ipv4_dst_ipv6.ipv6_layout.ipv6), &ib_spec->ipv6.mask.dst_ip, sizeof(ib_spec->ipv6.mask.dst_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, dst_ipv4_dst_ipv6.ipv6_layout.ipv6), &ib_spec->ipv6.val.dst_ip, sizeof(ib_spec->ipv6.val.dst_ip)); set_tos(outer_headers_c, outer_headers_v, ib_spec->ipv6.mask.traffic_class, ib_spec->ipv6.val.traffic_class); set_proto(outer_headers_c, outer_headers_v, ib_spec->ipv6.mask.next_hdr, ib_spec->ipv6.val.next_hdr); MLX5_SET(fte_match_set_misc, misc_params_c, outer_ipv6_flow_label, ntohl(ib_spec->ipv6.mask.flow_label)); MLX5_SET(fte_match_set_misc, misc_params_v, outer_ipv6_flow_label, ntohl(ib_spec->ipv6.val.flow_label)); break; case IB_FLOW_SPEC_TCP: if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, LAST_TCP_UDP_FIELD)) return -ENOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol, 0xff); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol, IPPROTO_TCP); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport, ntohs(ib_spec->tcp_udp.mask.src_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_sport, ntohs(ib_spec->tcp_udp.val.src_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport, ntohs(ib_spec->tcp_udp.mask.dst_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_dport, ntohs(ib_spec->tcp_udp.val.dst_port)); break; case IB_FLOW_SPEC_UDP: if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, LAST_TCP_UDP_FIELD)) return -ENOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol, 0xff); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol, IPPROTO_UDP); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_sport, ntohs(ib_spec->tcp_udp.mask.src_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_sport, ntohs(ib_spec->tcp_udp.val.src_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_dport, ntohs(ib_spec->tcp_udp.mask.dst_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_dport, ntohs(ib_spec->tcp_udp.val.dst_port)); break; default: return -EINVAL; } return 0; } /* If a flow could catch both multicast and unicast packets, * it won't fall into the multicast flow steering table and this rule * could steal other multicast packets. */ static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr) { struct ib_flow_spec_eth *eth_spec; if (ib_attr->type != IB_FLOW_ATTR_NORMAL || ib_attr->size < sizeof(struct ib_flow_attr) + sizeof(struct ib_flow_spec_eth) || ib_attr->num_of_specs < 1) return false; eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1); if (eth_spec->type != IB_FLOW_SPEC_ETH || eth_spec->size != sizeof(*eth_spec)) return false; return is_multicast_ether_addr(eth_spec->mask.dst_mac) && is_multicast_ether_addr(eth_spec->val.dst_mac); } static bool is_valid_attr(const struct ib_flow_attr *flow_attr) { union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1); bool has_ipv4_spec = false; bool eth_type_ipv4 = true; unsigned int spec_index; /* Validate that ethertype is correct */ for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { if (ib_spec->type == IB_FLOW_SPEC_ETH && ib_spec->eth.mask.ether_type) { if (!((ib_spec->eth.mask.ether_type == htons(0xffff)) && ib_spec->eth.val.ether_type == htons(ETH_P_IP))) eth_type_ipv4 = false; } else if (ib_spec->type == IB_FLOW_SPEC_IPV4) { has_ipv4_spec = true; } ib_spec = (void *)ib_spec + ib_spec->size; } return !has_ipv4_spec || eth_type_ipv4; } static void put_flow_table(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *prio, bool ft_added) { prio->refcount -= !!ft_added; if (!prio->refcount) { mlx5_destroy_flow_table(prio->flow_table); prio->flow_table = NULL; } } static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) { struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device); struct mlx5_ib_flow_handler *handler = container_of(flow_id, struct mlx5_ib_flow_handler, ibflow); struct mlx5_ib_flow_handler *iter, *tmp; mutex_lock(&dev->flow_db.lock); list_for_each_entry_safe(iter, tmp, &handler->list, list) { mlx5_del_flow_rule(iter->rule); put_flow_table(dev, iter->prio, true); list_del(&iter->list); kfree(iter); } mlx5_del_flow_rule(handler->rule); put_flow_table(dev, handler->prio, true); mutex_unlock(&dev->flow_db.lock); kfree(handler); return 0; } static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap) { priority *= 2; if (!dont_trap) priority++; return priority; } enum flow_table_type { MLX5_IB_FT_RX, MLX5_IB_FT_TX }; #define MLX5_FS_MAX_TYPES 10 #define MLX5_FS_MAX_ENTRIES 32000UL static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, struct ib_flow_attr *flow_attr, enum flow_table_type ft_type) { bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP; struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio; struct mlx5_flow_table *ft; int num_entries; int num_groups; int priority; int err = 0; if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { if (flow_is_multicast_only(flow_attr) && !dont_trap) priority = MLX5_IB_FLOW_MCAST_PRIO; else priority = ib_prio_to_core_prio(flow_attr->priority, dont_trap); ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS); num_entries = MLX5_FS_MAX_ENTRIES; num_groups = MLX5_FS_MAX_TYPES; prio = &dev->flow_db.prios[priority]; } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_LEFTOVERS); build_leftovers_ft_param("bypass", &priority, &num_entries, &num_groups); prio = &dev->flow_db.prios[MLX5_IB_FLOW_LEFTOVERS_PRIO]; } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { if (!MLX5_CAP_FLOWTABLE(dev->mdev, allow_sniffer_and_nic_rx_shared_tir)) return ERR_PTR(-ENOTSUPP); ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ? MLX5_FLOW_NAMESPACE_SNIFFER_RX : MLX5_FLOW_NAMESPACE_SNIFFER_TX); prio = &dev->flow_db.sniffer[ft_type]; priority = 0; num_entries = 1; num_groups = 1; } if (!ns) return ERR_PTR(-ENOTSUPP); ft = prio->flow_table; if (!ft) { ft = mlx5_create_auto_grouped_flow_table(ns, priority, "bypass", num_entries, num_groups); if (!IS_ERR(ft)) { prio->refcount = 0; prio->flow_table = ft; } else { err = PTR_ERR(ft); } } return err ? ERR_PTR(err) : prio; } static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, const struct ib_flow_attr *flow_attr, struct mlx5_flow_destination *dst) { struct mlx5_flow_table *ft = ft_prio->flow_table; struct mlx5_ib_flow_handler *handler; struct mlx5_flow_spec *spec; const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr); unsigned int spec_index; u32 action; int err = 0; if (!is_valid_attr(flow_attr)) return ERR_PTR(-EINVAL); spec = mlx5_vzalloc(sizeof(*spec)); handler = kzalloc(sizeof(*handler), GFP_KERNEL); if (!handler || !spec) { err = -ENOMEM; goto free; } INIT_LIST_HEAD(&handler->list); for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { err = parse_flow_attr(spec->match_criteria, spec->match_value, ib_flow); if (err < 0) goto free; ib_flow += ((union ib_flow_spec *)ib_flow)->size; } spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; handler->rule = mlx5_add_flow_rule(ft, spec->match_criteria_enable, spec->match_criteria, spec->match_value, action, MLX5_FS_DEFAULT_FLOW_TAG, dst); if (IS_ERR(handler->rule)) { err = PTR_ERR(handler->rule); goto free; } ft_prio->refcount++; handler->prio = ft_prio; ft_prio->flow_table = ft; free: if (err) kfree(handler); kvfree(spec); return err ? ERR_PTR(err) : handler; } static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, struct ib_flow_attr *flow_attr, struct mlx5_flow_destination *dst) { struct mlx5_ib_flow_handler *handler_dst = NULL; struct mlx5_ib_flow_handler *handler = NULL; handler = create_flow_rule(dev, ft_prio, flow_attr, NULL); if (!IS_ERR(handler)) { handler_dst = create_flow_rule(dev, ft_prio, flow_attr, dst); if (IS_ERR(handler_dst)) { mlx5_del_flow_rule(handler->rule); ft_prio->refcount--; kfree(handler); handler = handler_dst; } else { list_add(&handler_dst->list, &handler->list); } } return handler; } enum { LEFTOVERS_MC, LEFTOVERS_UC, }; static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, struct ib_flow_attr *flow_attr, struct mlx5_flow_destination *dst) { struct mlx5_ib_flow_handler *handler_ucast = NULL; struct mlx5_ib_flow_handler *handler = NULL; static struct { struct ib_flow_attr flow_attr; struct ib_flow_spec_eth eth_flow; } leftovers_specs[] = { [LEFTOVERS_MC] = { .flow_attr = { .num_of_specs = 1, .size = sizeof(leftovers_specs[0]) }, .eth_flow = { .type = IB_FLOW_SPEC_ETH, .size = sizeof(struct ib_flow_spec_eth), .mask = {.dst_mac = {0x1} }, .val = {.dst_mac = {0x1} } } }, [LEFTOVERS_UC] = { .flow_attr = { .num_of_specs = 1, .size = sizeof(leftovers_specs[0]) }, .eth_flow = { .type = IB_FLOW_SPEC_ETH, .size = sizeof(struct ib_flow_spec_eth), .mask = {.dst_mac = {0x1} }, .val = {.dst_mac = {} } } } }; handler = create_flow_rule(dev, ft_prio, &leftovers_specs[LEFTOVERS_MC].flow_attr, dst); if (!IS_ERR(handler) && flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) { handler_ucast = create_flow_rule(dev, ft_prio, &leftovers_specs[LEFTOVERS_UC].flow_attr, dst); if (IS_ERR(handler_ucast)) { mlx5_del_flow_rule(handler->rule); ft_prio->refcount--; kfree(handler); handler = handler_ucast; } else { list_add(&handler_ucast->list, &handler->list); } } return handler; } static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_rx, struct mlx5_ib_flow_prio *ft_tx, struct mlx5_flow_destination *dst) { struct mlx5_ib_flow_handler *handler_rx; struct mlx5_ib_flow_handler *handler_tx; int err; static const struct ib_flow_attr flow_attr = { .num_of_specs = 0, .type = IB_FLOW_ATTR_SNIFFER, .size = sizeof(flow_attr) }; handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst); if (IS_ERR(handler_rx)) { err = PTR_ERR(handler_rx); goto err; } handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst); if (IS_ERR(handler_tx)) { err = PTR_ERR(handler_tx); goto err_tx; } list_add(&handler_tx->list, &handler_rx->list); return handler_rx; err_tx: mlx5_del_flow_rule(handler_rx->rule); ft_rx->refcount--; kfree(handler_rx); err: return ERR_PTR(err); } static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_ib_qp *mqp = to_mqp(qp); struct mlx5_ib_flow_handler *handler = NULL; struct mlx5_flow_destination *dst = NULL; struct mlx5_ib_flow_prio *ft_prio_tx = NULL; struct mlx5_ib_flow_prio *ft_prio; struct mlx5_ib_create_flow *ucmd = NULL, ucmd_hdr; size_t min_ucmd_sz, required_ucmd_sz; int err; if (udata && udata->inlen) { min_ucmd_sz = offsetofend(struct mlx5_ib_create_flow, reserved); if (udata->inlen < min_ucmd_sz) return ERR_PTR(-EOPNOTSUPP); err = ib_copy_from_udata(&ucmd_hdr, udata, min_ucmd_sz); if (err) return ERR_PTR(err); /* currently supports only one counters data */ if (ucmd_hdr.ncounters_data > 1) return ERR_PTR(-EINVAL); required_ucmd_sz = min_ucmd_sz + sizeof(struct mlx5_ib_flow_counters_data) * ucmd_hdr.ncounters_data; if (udata->inlen > required_ucmd_sz && !ib_is_udata_cleared(udata, required_ucmd_sz, udata->inlen - required_ucmd_sz)) return ERR_PTR(-EOPNOTSUPP); ucmd = kzalloc(required_ucmd_sz, GFP_KERNEL); if (!ucmd) return ERR_PTR(-ENOMEM); err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz); if (err) goto free_ucmd; } if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) { err = -ENOMEM; goto free_ucmd; } if (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP) { err = -EINVAL; goto free_ucmd; } dst = kzalloc(sizeof(*dst), GFP_KERNEL); if (!dst) { err = -ENOMEM; goto free_ucmd; } mutex_lock(&dev->flow_db.lock); ft_prio = get_flow_table(dev, flow_attr, MLX5_IB_FT_RX); if (IS_ERR(ft_prio)) { err = PTR_ERR(ft_prio); goto unlock; } if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX); if (IS_ERR(ft_prio_tx)) { err = PTR_ERR(ft_prio_tx); ft_prio_tx = NULL; goto destroy_ft; } } dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR; if (mqp->flags & MLX5_IB_QP_RSS) dst->tir_num = mqp->rss_qp.tirn; else dst->tir_num = mqp->raw_packet_qp.rq.tirn; switch (flow_attr->type) { case IB_FLOW_ATTR_NORMAL: if (mqp->flags & IB_QP_CREATE_SOURCE_QPN) { err = -EOPNOTSUPP; goto destroy_ft; } if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) { handler = create_dont_trap_rule(dev, ft_prio, flow_attr, dst); } else { handler = create_flow_rule(dev, ft_prio, flow_attr, dst); } break; case IB_FLOW_ATTR_ALL_DEFAULT: case IB_FLOW_ATTR_MC_DEFAULT: handler = create_leftovers_rule(dev, ft_prio, flow_attr, dst); break; case IB_FLOW_ATTR_SNIFFER: handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst); break; default: err = -EINVAL; goto destroy_ft; } if (IS_ERR(handler)) { err = PTR_ERR(handler); handler = NULL; goto destroy_ft; } mutex_unlock(&dev->flow_db.lock); kfree(dst); kfree(ucmd); return &handler->ibflow; destroy_ft: put_flow_table(dev, ft_prio, false); if (ft_prio_tx) put_flow_table(dev, ft_prio_tx, false); unlock: mutex_unlock(&dev->flow_db.lock); kfree(dst); free_ucmd: kfree(ucmd); return ERR_PTR(err); } static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); int err; err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); if (err) mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); return err; } static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); int err; err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); if (err) mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); return err; } static int init_node_data(struct mlx5_ib_dev *dev) { int err; err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc); if (err) return err; return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid); } static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%lld\n", (long long)dev->mdev->priv.fw_pages); } static ssize_t show_reg_pages(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); } static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); } static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->mdev->pdev->revision); } static ssize_t show_board(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, dev->mdev->board_id); } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); static struct device_attribute *mlx5_class_attributes[] = { &dev_attr_hw_rev, &dev_attr_hca_type, &dev_attr_board_id, &dev_attr_fw_pages, &dev_attr_reg_pages, }; static void pkey_change_handler(struct work_struct *work) { struct mlx5_ib_port_resources *ports = container_of(work, struct mlx5_ib_port_resources, pkey_change_work); mutex_lock(&ports->devr->mutex); mlx5_ib_gsi_pkey_change(ports->gsi); mutex_unlock(&ports->devr->mutex); } static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) { struct mlx5_ib_qp *mqp; struct mlx5_ib_cq *send_mcq, *recv_mcq; struct mlx5_core_cq *mcq; struct list_head cq_armed_list; unsigned long flags_qp; unsigned long flags_cq; unsigned long flags; INIT_LIST_HEAD(&cq_armed_list); /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { spin_lock_irqsave(&mqp->sq.lock, flags_qp); if (mqp->sq.tail != mqp->sq.head) { send_mcq = to_mcq(mqp->ibqp.send_cq); spin_lock_irqsave(&send_mcq->lock, flags_cq); if (send_mcq->mcq.comp && mqp->ibqp.send_cq->comp_handler) { if (!send_mcq->mcq.reset_notify_added) { send_mcq->mcq.reset_notify_added = 1; list_add_tail(&send_mcq->mcq.reset_notify, &cq_armed_list); } } spin_unlock_irqrestore(&send_mcq->lock, flags_cq); } spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); spin_lock_irqsave(&mqp->rq.lock, flags_qp); /* no handling is needed for SRQ */ if (!mqp->ibqp.srq) { if (mqp->rq.tail != mqp->rq.head) { recv_mcq = to_mcq(mqp->ibqp.recv_cq); spin_lock_irqsave(&recv_mcq->lock, flags_cq); if (recv_mcq->mcq.comp && mqp->ibqp.recv_cq->comp_handler) { if (!recv_mcq->mcq.reset_notify_added) { recv_mcq->mcq.reset_notify_added = 1; list_add_tail(&recv_mcq->mcq.reset_notify, &cq_armed_list); } } spin_unlock_irqrestore(&recv_mcq->lock, flags_cq); } } spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); } /*At that point all inflight post send were put to be executed as of we * lock/unlock above locks Now need to arm all involved CQs. */ list_for_each_entry(mcq, &cq_armed_list, reset_notify) { mcq->comp(mcq, NULL); } spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); } static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, enum mlx5_dev_event event, unsigned long param) { struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context; struct ib_event ibev; bool fatal = false; u8 port = (u8)param; switch (event) { case MLX5_DEV_EVENT_SYS_ERROR: ibev.event = IB_EVENT_DEVICE_FATAL; mlx5_ib_handle_internal_error(ibdev); fatal = true; break; case MLX5_DEV_EVENT_PORT_UP: case MLX5_DEV_EVENT_PORT_DOWN: case MLX5_DEV_EVENT_PORT_INITIALIZED: /* In RoCE, port up/down events are handled in * mlx5_netdev_event(). */ if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) == IB_LINK_LAYER_ETHERNET) return; ibev.event = (event == MLX5_DEV_EVENT_PORT_UP) ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; break; case MLX5_DEV_EVENT_LID_CHANGE: ibev.event = IB_EVENT_LID_CHANGE; break; case MLX5_DEV_EVENT_PKEY_CHANGE: ibev.event = IB_EVENT_PKEY_CHANGE; schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work); break; case MLX5_DEV_EVENT_GUID_CHANGE: ibev.event = IB_EVENT_GID_CHANGE; break; case MLX5_DEV_EVENT_CLIENT_REREG: ibev.event = IB_EVENT_CLIENT_REREGISTER; break; default: /* unsupported event */ return; } ibev.device = &ibdev->ib_dev; ibev.element.port_num = port; if (!rdma_is_port_valid(&ibdev->ib_dev, port)) { mlx5_ib_warn(ibdev, "warning: event(%d) on port %d\n", event, port); return; } if (ibdev->ib_active) ib_dispatch_event(&ibev); if (fatal) ibdev->ib_active = false; } static void get_ext_port_caps(struct mlx5_ib_dev *dev) { int port; for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) mlx5_query_ext_port_caps(dev, port); } static int get_port_caps(struct mlx5_ib_dev *dev) { struct ib_device_attr *dprops = NULL; struct ib_port_attr *pprops = NULL; int err = -ENOMEM; int port; struct ib_udata uhw = {.inlen = 0, .outlen = 0}; pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); if (!pprops) goto out; dprops = kmalloc(sizeof(*dprops), GFP_KERNEL); if (!dprops) goto out; err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw); if (err) { mlx5_ib_warn(dev, "query_device failed %d\n", err); goto out; } for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); if (err) { mlx5_ib_warn(dev, "query_port %d failed %d\n", port, err); break; } dev->mdev->port_caps[port - 1].pkey_table_len = dprops->max_pkeys; dev->mdev->port_caps[port - 1].gid_table_len = pprops->gid_tbl_len; mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", dprops->max_pkeys, pprops->gid_tbl_len); } out: kfree(pprops); kfree(dprops); return err; } static void destroy_umrc_res(struct mlx5_ib_dev *dev) { int err; err = mlx5_mr_cache_cleanup(dev); if (err) mlx5_ib_warn(dev, "mr cache cleanup failed\n"); if (dev->umrc.qp) mlx5_ib_destroy_qp(dev->umrc.qp, NULL); if (dev->umrc.cq) ib_free_cq(dev->umrc.cq); if (dev->umrc.pd) ib_dealloc_pd(dev->umrc.pd); } enum { MAX_UMR_WR = 128, }; static int create_umr_res(struct mlx5_ib_dev *dev) { struct ib_qp_init_attr *init_attr = NULL; struct ib_qp_attr *attr = NULL; struct ib_pd *pd; struct ib_cq *cq; struct ib_qp *qp; int ret; attr = kzalloc(sizeof(*attr), GFP_KERNEL); init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL); if (!attr || !init_attr) { ret = -ENOMEM; goto error_0; } pd = ib_alloc_pd(&dev->ib_dev, 0); if (IS_ERR(pd)) { mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); ret = PTR_ERR(pd); goto error_0; } cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); if (IS_ERR(cq)) { mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); ret = PTR_ERR(cq); goto error_2; } init_attr->send_cq = cq; init_attr->recv_cq = cq; init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; init_attr->cap.max_send_wr = MAX_UMR_WR; init_attr->cap.max_send_sge = 1; init_attr->qp_type = MLX5_IB_QPT_REG_UMR; init_attr->port_num = 1; qp = mlx5_ib_create_qp(pd, init_attr, NULL); if (IS_ERR(qp)) { mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); ret = PTR_ERR(qp); goto error_3; } qp->device = &dev->ib_dev; qp->real_qp = qp; qp->uobject = NULL; qp->qp_type = MLX5_IB_QPT_REG_UMR; attr->qp_state = IB_QPS_INIT; attr->port_num = 1; ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT, NULL); if (ret) { mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); goto error_4; } memset(attr, 0, sizeof(*attr)); attr->qp_state = IB_QPS_RTR; attr->path_mtu = IB_MTU_256; ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); if (ret) { mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); goto error_4; } memset(attr, 0, sizeof(*attr)); attr->qp_state = IB_QPS_RTS; ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); if (ret) { mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); goto error_4; } dev->umrc.qp = qp; dev->umrc.cq = cq; dev->umrc.pd = pd; sema_init(&dev->umrc.sem, MAX_UMR_WR); ret = mlx5_mr_cache_init(dev); if (ret) { mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); goto error_4; } kfree(attr); kfree(init_attr); return 0; error_4: mlx5_ib_destroy_qp(qp, NULL); dev->umrc.qp = NULL; error_3: ib_free_cq(cq); dev->umrc.cq = NULL; error_2: ib_dealloc_pd(pd); dev->umrc.pd = NULL; error_0: kfree(attr); kfree(init_attr); return ret; } static int create_dev_resources(struct mlx5_ib_resources *devr) { struct ib_srq_init_attr attr; struct mlx5_ib_dev *dev; struct ib_device *ibdev; struct ib_cq_init_attr cq_attr = {.cqe = 1}; int port; int ret = 0; dev = container_of(devr, struct mlx5_ib_dev, devr); ibdev = &dev->ib_dev; mutex_init(&devr->mutex); devr->p0 = rdma_zalloc_drv_obj(ibdev, ib_pd); if (!devr->p0) return -ENOMEM; devr->p0->device = ibdev; devr->p0->uobject = NULL; atomic_set(&devr->p0->usecnt, 0); ret = mlx5_ib_alloc_pd(devr->p0, NULL); if (ret) goto error0; devr->c0 = rdma_zalloc_drv_obj(ibdev, ib_cq); if (!devr->c0) { ret = -ENOMEM; goto error1; } devr->c0->device = &dev->ib_dev; atomic_set(&devr->c0->usecnt, 0); ret = mlx5_ib_create_cq(devr->c0, &cq_attr, NULL); if (ret) goto err_create_cq; devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL); if (IS_ERR(devr->x0)) { ret = PTR_ERR(devr->x0); goto error2; } devr->x0->device = &dev->ib_dev; devr->x0->inode = NULL; atomic_set(&devr->x0->usecnt, 0); mutex_init(&devr->x0->tgt_qp_mutex); INIT_LIST_HEAD(&devr->x0->tgt_qp_list); devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL); if (IS_ERR(devr->x1)) { ret = PTR_ERR(devr->x1); goto error3; } devr->x1->device = &dev->ib_dev; devr->x1->inode = NULL; atomic_set(&devr->x1->usecnt, 0); mutex_init(&devr->x1->tgt_qp_mutex); INIT_LIST_HEAD(&devr->x1->tgt_qp_list); memset(&attr, 0, sizeof(attr)); attr.attr.max_sge = 1; attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_XRC; attr.ext.cq = devr->c0; attr.ext.xrc.xrcd = devr->x0; devr->s0 = rdma_zalloc_drv_obj(ibdev, ib_srq); if (!devr->s0) { ret = -ENOMEM; goto error4; } devr->s0->device = &dev->ib_dev; devr->s0->pd = devr->p0; devr->s0->srq_type = IB_SRQT_XRC; devr->s0->ext.xrc.xrcd = devr->x0; devr->s0->ext.cq = devr->c0; ret = mlx5_ib_create_srq(devr->s0, &attr, NULL); if (ret) goto err_create; atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); atomic_inc(&devr->s0->ext.cq->usecnt); atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s0->usecnt, 0); memset(&attr, 0, sizeof(attr)); attr.attr.max_sge = 1; attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_BASIC; devr->s1 = rdma_zalloc_drv_obj(ibdev, ib_srq); if (!devr->s1) { ret = -ENOMEM; goto error5; } devr->s1->device = &dev->ib_dev; devr->s1->pd = devr->p0; devr->s1->srq_type = IB_SRQT_BASIC; devr->s1->ext.cq = devr->c0; ret = mlx5_ib_create_srq(devr->s1, &attr, NULL); if (ret) goto error6; atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s1->usecnt, 0); for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) { INIT_WORK(&devr->ports[port].pkey_change_work, pkey_change_handler); devr->ports[port].devr = devr; } return 0; error6: kfree(devr->s1); error5: mlx5_ib_destroy_srq(devr->s0, NULL); err_create: kfree(devr->s0); error4: mlx5_ib_dealloc_xrcd(devr->x1, NULL); error3: mlx5_ib_dealloc_xrcd(devr->x0, NULL); error2: mlx5_ib_destroy_cq(devr->c0, NULL); err_create_cq: kfree(devr->c0); error1: mlx5_ib_dealloc_pd(devr->p0, NULL); error0: kfree(devr->p0); return ret; } static void destroy_dev_resources(struct mlx5_ib_resources *devr) { int port; mlx5_ib_destroy_srq(devr->s1, NULL); kfree(devr->s1); mlx5_ib_destroy_srq(devr->s0, NULL); kfree(devr->s0); mlx5_ib_dealloc_xrcd(devr->x0, NULL); mlx5_ib_dealloc_xrcd(devr->x1, NULL); mlx5_ib_destroy_cq(devr->c0, NULL); kfree(devr->c0); mlx5_ib_dealloc_pd(devr->p0, NULL); kfree(devr->p0); /* Make sure no change P_Key work items are still executing */ for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) cancel_work_sync(&devr->ports[port].pkey_change_work); } static u32 get_core_cap_flags(struct ib_device *ibdev) { struct mlx5_ib_dev *dev = to_mdev(ibdev); enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1); u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type); u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version); u32 ret = 0; if (ll == IB_LINK_LAYER_INFINIBAND) return RDMA_CORE_PORT_IBA_IB; if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP)) return 0; if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP)) return 0; if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP) ret |= RDMA_CORE_PORT_IBA_ROCE; if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP) ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; return ret; } static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { struct ib_port_attr attr; struct mlx5_ib_dev *dev = to_mdev(ibdev); enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num); int err; err = mlx5_ib_query_port(ibdev, port_num, &attr); if (err) return err; immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; immutable->core_cap_flags = get_core_cap_flags(ibdev); if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce)) immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; } static void get_dev_fw_str(struct ib_device *ibdev, char *str, size_t str_len) { struct mlx5_ib_dev *dev = container_of(ibdev, struct mlx5_ib_dev, ib_dev); snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); } static int mlx5_roce_lag_init(struct mlx5_ib_dev *dev) { return 0; } static void mlx5_roce_lag_cleanup(struct mlx5_ib_dev *dev) { } static void mlx5_remove_roce_notifier(struct mlx5_ib_dev *dev) { if (dev->roce.nb.notifier_call) { unregister_netdevice_notifier(&dev->roce.nb); dev->roce.nb.notifier_call = NULL; } } +static int +mlx5_enable_roce_if_cb(if_t ifp, void *arg) +{ + struct mlx5_ib_dev *dev = arg; + + /* check if network interface belongs to mlx5en */ + if (!mlx5_netdev_match(ifp, dev->mdev, "mce")) + return (0); + + write_lock(&dev->roce.netdev_lock); + dev->roce.netdev = ifp; + write_unlock(&dev->roce.netdev_lock); + + return (0); +} + static int mlx5_enable_roce(struct mlx5_ib_dev *dev) { VNET_ITERATOR_DECL(vnet_iter); - struct ifnet *idev; int err; /* Check if mlx5en net device already exists */ VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { IFNET_RLOCK(); CURVNET_SET_QUIET(vnet_iter); - CK_STAILQ_FOREACH(idev, &V_ifnet, if_link) { - /* check if network interface belongs to mlx5en */ - if (!mlx5_netdev_match(idev, dev->mdev, "mce")) - continue; - write_lock(&dev->roce.netdev_lock); - dev->roce.netdev = idev; - write_unlock(&dev->roce.netdev_lock); - } + if_foreach(mlx5_enable_roce_if_cb, dev); CURVNET_RESTORE(); IFNET_RUNLOCK(); } VNET_LIST_RUNLOCK(); dev->roce.nb.notifier_call = mlx5_netdev_event; err = register_netdevice_notifier(&dev->roce.nb); if (err) { dev->roce.nb.notifier_call = NULL; return err; } if (MLX5_CAP_GEN(dev->mdev, roce)) { err = mlx5_nic_vport_enable_roce(dev->mdev); if (err) goto err_unregister_netdevice_notifier; } err = mlx5_roce_lag_init(dev); if (err) goto err_disable_roce; return 0; err_disable_roce: if (MLX5_CAP_GEN(dev->mdev, roce)) mlx5_nic_vport_disable_roce(dev->mdev); err_unregister_netdevice_notifier: mlx5_remove_roce_notifier(dev); return err; } static void mlx5_disable_roce(struct mlx5_ib_dev *dev) { mlx5_roce_lag_cleanup(dev); if (MLX5_CAP_GEN(dev->mdev, roce)) mlx5_nic_vport_disable_roce(dev->mdev); } static void mlx5_ib_dealloc_q_port_counter(struct mlx5_ib_dev *dev, u8 port_num) { mlx5_vport_dealloc_q_counter(dev->mdev, MLX5_INTERFACE_PROTOCOL_IB, dev->port[port_num].q_cnt_id); dev->port[port_num].q_cnt_id = 0; } static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev) { unsigned int i; for (i = 0; i < dev->num_ports; i++) mlx5_ib_dealloc_q_port_counter(dev, i); } static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) { int i; int ret; for (i = 0; i < dev->num_ports; i++) { ret = mlx5_vport_alloc_q_counter(dev->mdev, MLX5_INTERFACE_PROTOCOL_IB, &dev->port[i].q_cnt_id); if (ret) { mlx5_ib_warn(dev, "couldn't allocate queue counter for port %d, err %d\n", i + 1, ret); goto dealloc_counters; } } return 0; dealloc_counters: while (--i >= 0) mlx5_ib_dealloc_q_port_counter(dev, i); return ret; } static const char * const names[] = { "rx_write_requests", "rx_read_requests", "rx_atomic_requests", "out_of_buffer", "out_of_sequence", "duplicate_request", "rnr_nak_retry_err", "packet_seq_err", "implied_nak_seq_err", "local_ack_timeout_err", }; static const size_t stats_offsets[] = { MLX5_BYTE_OFF(query_q_counter_out, rx_write_requests), MLX5_BYTE_OFF(query_q_counter_out, rx_read_requests), MLX5_BYTE_OFF(query_q_counter_out, rx_atomic_requests), MLX5_BYTE_OFF(query_q_counter_out, out_of_buffer), MLX5_BYTE_OFF(query_q_counter_out, out_of_sequence), MLX5_BYTE_OFF(query_q_counter_out, duplicate_request), MLX5_BYTE_OFF(query_q_counter_out, rnr_nak_retry_err), MLX5_BYTE_OFF(query_q_counter_out, packet_seq_err), MLX5_BYTE_OFF(query_q_counter_out, implied_nak_seq_err), MLX5_BYTE_OFF(query_q_counter_out, local_ack_timeout_err), }; static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, u8 port_num) { BUILD_BUG_ON(ARRAY_SIZE(names) != ARRAY_SIZE(stats_offsets)); /* We support only per port stats */ if (port_num == 0) return NULL; return rdma_alloc_hw_stats_struct(names, ARRAY_SIZE(names), RDMA_HW_STATS_DEFAULT_LIFESPAN); } static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u8 port, int index) { struct mlx5_ib_dev *dev = to_mdev(ibdev); int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); void *out; __be32 val; int ret; int i; if (!port || !stats) return -ENOSYS; out = mlx5_vzalloc(outlen); if (!out) return -ENOMEM; ret = mlx5_vport_query_q_counter(dev->mdev, dev->port[port - 1].q_cnt_id, 0, out, outlen); if (ret) goto free; for (i = 0; i < ARRAY_SIZE(names); i++) { val = *(__be32 *)(out + stats_offsets[i]); stats->value[i] = (u64)be32_to_cpu(val); } free: kvfree(out); return ARRAY_SIZE(names); } static int mlx5_ib_stage_bfreg_init(struct mlx5_ib_dev *dev) { int err; err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false); if (err) return err; err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true); if (err) { mlx5_free_bfreg(dev->mdev, &dev->bfreg); return err; } err = mlx5_alloc_bfreg(dev->mdev, &dev->wc_bfreg, true, false); if (err) { mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); mlx5_free_bfreg(dev->mdev, &dev->bfreg); } return err; } static void mlx5_ib_stage_bfreg_cleanup(struct mlx5_ib_dev *dev) { mlx5_free_bfreg(dev->mdev, &dev->wc_bfreg); mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); mlx5_free_bfreg(dev->mdev, &dev->bfreg); } static void *mlx5_ib_add(struct mlx5_core_dev *mdev) { struct mlx5_ib_dev *dev; enum rdma_link_layer ll; int port_type_cap; int err; int i; port_type_cap = MLX5_CAP_GEN(mdev, port_type); ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); if (!dev) return NULL; dev->mdev = mdev; dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port), GFP_KERNEL); if (!dev->port) goto err_dealloc; rwlock_init(&dev->roce.netdev_lock); err = get_port_caps(dev); if (err) goto err_free_port; if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock); mutex_init(&dev->lb_mutex); INIT_IB_DEVICE_OPS(&dev->ib_dev.ops, mlx5, MLX5); snprintf(dev->ib_dev.name, IB_DEVICE_NAME_MAX, "mlx5_%d", device_get_unit(mdev->pdev->dev.bsddev)); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; dev->num_ports = MLX5_CAP_GEN(mdev, num_ports); dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.num_comp_vectors = dev->mdev->priv.eq_table.num_comp_vectors; dev->ib_dev.dma_device = &mdev->pdev->dev; dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_CREATE_AH) | (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_REREG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | (1ull << IB_USER_VERBS_CMD_OPEN_QP); dev->ib_dev.uverbs_ex_cmd_mask = (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP); dev->ib_dev.query_device = mlx5_ib_query_device; dev->ib_dev.query_port = mlx5_ib_query_port; dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer; if (ll == IB_LINK_LAYER_ETHERNET) dev->ib_dev.get_netdev = mlx5_ib_get_netdev; dev->ib_dev.query_gid = mlx5_ib_query_gid; dev->ib_dev.add_gid = mlx5_ib_add_gid; dev->ib_dev.del_gid = mlx5_ib_del_gid; dev->ib_dev.query_pkey = mlx5_ib_query_pkey; dev->ib_dev.modify_device = mlx5_ib_modify_device; dev->ib_dev.modify_port = mlx5_ib_modify_port; dev->ib_dev.alloc_ucontext = mlx5_ib_alloc_ucontext; dev->ib_dev.dealloc_ucontext = mlx5_ib_dealloc_ucontext; dev->ib_dev.mmap = mlx5_ib_mmap; dev->ib_dev.mmap_free = mlx5_ib_mmap_free; dev->ib_dev.alloc_pd = mlx5_ib_alloc_pd; dev->ib_dev.dealloc_pd = mlx5_ib_dealloc_pd; dev->ib_dev.create_ah = mlx5_ib_create_ah; dev->ib_dev.query_ah = mlx5_ib_query_ah; dev->ib_dev.destroy_ah = mlx5_ib_destroy_ah; dev->ib_dev.create_srq = mlx5_ib_create_srq; dev->ib_dev.modify_srq = mlx5_ib_modify_srq; dev->ib_dev.query_srq = mlx5_ib_query_srq; dev->ib_dev.destroy_srq = mlx5_ib_destroy_srq; dev->ib_dev.post_srq_recv = mlx5_ib_post_srq_recv; dev->ib_dev.create_qp = mlx5_ib_create_qp; dev->ib_dev.modify_qp = mlx5_ib_modify_qp; dev->ib_dev.query_qp = mlx5_ib_query_qp; dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp; dev->ib_dev.post_send = mlx5_ib_post_send; dev->ib_dev.post_recv = mlx5_ib_post_recv; dev->ib_dev.create_cq = mlx5_ib_create_cq; dev->ib_dev.modify_cq = mlx5_ib_modify_cq; dev->ib_dev.resize_cq = mlx5_ib_resize_cq; dev->ib_dev.destroy_cq = mlx5_ib_destroy_cq; dev->ib_dev.poll_cq = mlx5_ib_poll_cq; dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; dev->ib_dev.rereg_user_mr = mlx5_ib_rereg_user_mr; dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; dev->ib_dev.process_mad = mlx5_ib_process_mad; dev->ib_dev.alloc_mr = mlx5_ib_alloc_mr; dev->ib_dev.map_mr_sg = mlx5_ib_map_mr_sg; dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; dev->ib_dev.get_port_immutable = mlx5_port_immutable; dev->ib_dev.get_dev_fw_str = get_dev_fw_str; if (mlx5_core_is_pf(mdev)) { dev->ib_dev.get_vf_config = mlx5_ib_get_vf_config; dev->ib_dev.set_vf_link_state = mlx5_ib_set_vf_link_state; dev->ib_dev.get_vf_stats = mlx5_ib_get_vf_stats; dev->ib_dev.set_vf_guid = mlx5_ib_set_vf_guid; } dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext; mlx5_ib_internal_fill_odp_caps(dev); if (MLX5_CAP_GEN(mdev, imaicl)) { dev->ib_dev.alloc_mw = mlx5_ib_alloc_mw; dev->ib_dev.dealloc_mw = mlx5_ib_dealloc_mw; dev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); } if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { dev->ib_dev.get_hw_stats = mlx5_ib_get_hw_stats; dev->ib_dev.alloc_hw_stats = mlx5_ib_alloc_hw_stats; } if (MLX5_CAP_GEN(mdev, xrc)) { dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; dev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) { dev->ib_dev.create_flow = mlx5_ib_create_flow; dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow; dev->ib_dev.create_wq = mlx5_ib_create_wq; dev->ib_dev.modify_wq = mlx5_ib_modify_wq; dev->ib_dev.destroy_wq = mlx5_ib_destroy_wq; dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table; dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table; dev->ib_dev.uverbs_ex_cmd_mask |= (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); } err = init_node_data(dev); if (err) goto err_free_port; mutex_init(&dev->flow_db.lock); mutex_init(&dev->cap_mask_mutex); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); if (ll == IB_LINK_LAYER_ETHERNET) { err = mlx5_enable_roce(dev); if (err) goto err_free_port; } err = create_dev_resources(&dev->devr); if (err) goto err_disable_roce; err = mlx5_ib_odp_init_one(dev); if (err) goto err_rsrc; err = mlx5_ib_alloc_q_counters(dev); if (err) goto err_odp; err = mlx5_ib_stage_bfreg_init(dev); if (err) goto err_q_cnt; err = ib_register_device(&dev->ib_dev, NULL); if (err) goto err_bfreg; err = create_umr_res(dev); if (err) goto err_dev; for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { err = device_create_file(&dev->ib_dev.dev, mlx5_class_attributes[i]); if (err) goto err_umrc; } err = mlx5_ib_init_congestion(dev); if (err) goto err_umrc; dev->ib_active = true; return dev; err_umrc: destroy_umrc_res(dev); err_dev: ib_unregister_device(&dev->ib_dev); err_bfreg: mlx5_ib_stage_bfreg_cleanup(dev); err_q_cnt: mlx5_ib_dealloc_q_counters(dev); err_odp: mlx5_ib_odp_remove_one(dev); err_rsrc: destroy_dev_resources(&dev->devr); err_disable_roce: if (ll == IB_LINK_LAYER_ETHERNET) { mlx5_disable_roce(dev); mlx5_remove_roce_notifier(dev); } err_free_port: kfree(dev->port); err_dealloc: ib_dealloc_device((struct ib_device *)dev); return NULL; } static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) { struct mlx5_ib_dev *dev = context; enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1); mlx5_ib_cleanup_congestion(dev); mlx5_remove_roce_notifier(dev); ib_unregister_device(&dev->ib_dev); mlx5_ib_stage_bfreg_cleanup(dev); mlx5_ib_dealloc_q_counters(dev); destroy_umrc_res(dev); mlx5_ib_odp_remove_one(dev); destroy_dev_resources(&dev->devr); if (ll == IB_LINK_LAYER_ETHERNET) mlx5_disable_roce(dev); kfree(dev->port); ib_dealloc_device(&dev->ib_dev); } static struct mlx5_interface mlx5_ib_interface = { .add = mlx5_ib_add, .remove = mlx5_ib_remove, .event = mlx5_ib_event, .protocol = MLX5_INTERFACE_PROTOCOL_IB, }; static int __init mlx5_ib_init(void) { int err; err = mlx5_ib_odp_init(); if (err) return err; err = mlx5_register_interface(&mlx5_ib_interface); if (err) goto clean_odp; return err; clean_odp: mlx5_ib_odp_cleanup(); return err; } static void __exit mlx5_ib_cleanup(void) { mlx5_unregister_interface(&mlx5_ib_interface); mlx5_ib_odp_cleanup(); } module_init_order(mlx5_ib_init, SI_ORDER_SEVENTH); module_exit_order(mlx5_ib_cleanup, SI_ORDER_SEVENTH);