diff --git a/sys/dev/mlx4/mlx4_en/en.h b/sys/dev/mlx4/mlx4_en/en.h index e63027bc9dc0..4a8b4fbd5fd4 100644 --- a/sys/dev/mlx4/mlx4_en/en.h +++ b/sys/dev/mlx4/mlx4_en/en.h @@ -1,958 +1,958 @@ /* * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #ifndef _MLX4_EN_H_ #define _MLX4_EN_H_ #include #include #include #include #include #include #include #include #ifdef CONFIG_MLX4_EN_DCB #include #endif #include #include #include #include #include #include #include #include #include "en_port.h" #include #define DRV_NAME "mlx4_en" #define MLX4_EN_MSG_LEVEL (NETIF_MSG_LINK | NETIF_MSG_IFDOWN) /* * Device constants */ #define MLX4_EN_PAGE_SHIFT 12 #define MLX4_EN_PAGE_SIZE (1 << MLX4_EN_PAGE_SHIFT) #define MLX4_NET_IP_ALIGN 2 /* bytes */ #define DEF_RX_RINGS 16 #define MAX_RX_RINGS 128 #define MIN_RX_RINGS 4 #define TXBB_SIZE 64 #ifndef MLX4_EN_MAX_RX_SEGS #define MLX4_EN_MAX_RX_SEGS 1 /* or 8 */ #endif #ifndef MLX4_EN_MAX_RX_BYTES #define MLX4_EN_MAX_RX_BYTES MCLBYTES #endif #define HEADROOM (2048 / TXBB_SIZE + 1) #define INIT_OWNER_BIT 0xffffffff #define STAMP_STRIDE 64 #define STAMP_DWORDS (STAMP_STRIDE / 4) #define STAMP_SHIFT 31 #define STAMP_VAL 0x7fffffff #define STATS_DELAY (HZ / 4) #define SERVICE_TASK_DELAY (HZ / 4) #define MAX_NUM_OF_FS_RULES 256 #define MLX4_EN_FILTER_HASH_SHIFT 4 #define MLX4_EN_FILTER_EXPIRY_QUOTA 60 #ifdef CONFIG_NET_RX_BUSY_POLL #define LL_EXTENDED_STATS #endif /* vlan valid range */ #define VLAN_MIN_VALUE 1 #define VLAN_MAX_VALUE 4094 /* * OS related constants and tunables */ #define MLX4_EN_WATCHDOG_TIMEOUT (15 * HZ) #define MLX4_EN_ALLOC_SIZE PAGE_ALIGN(PAGE_SIZE) #define MLX4_EN_ALLOC_ORDER get_order(MLX4_EN_ALLOC_SIZE) enum mlx4_en_alloc_type { MLX4_EN_ALLOC_NEW = 0, MLX4_EN_ALLOC_REPLACEMENT = 1, }; /* Maximum ring sizes */ #define MLX4_EN_DEF_TX_QUEUE_SIZE 4096 /* Minimum packet number till arming the CQ */ #define MLX4_EN_MIN_RX_ARM 2048 #define MLX4_EN_MIN_TX_ARM 2048 /* Maximum ring sizes */ #define MLX4_EN_MAX_TX_SIZE 8192 #define MLX4_EN_MAX_RX_SIZE 8192 /* Minimum ring sizes */ #define MLX4_EN_MIN_RX_SIZE (4096 / TXBB_SIZE) #define MLX4_EN_MIN_TX_SIZE (4096 / TXBB_SIZE) #define MLX4_EN_SMALL_PKT_SIZE 64 #define MLX4_EN_MAX_TX_RING_P_UP 32 #define MLX4_EN_NUM_UP 1 #define MAX_TX_RINGS (MLX4_EN_MAX_TX_RING_P_UP * \ MLX4_EN_NUM_UP) #define MLX4_EN_NO_VLAN 0xffff #define MLX4_EN_DEF_TX_RING_SIZE 1024 #define MLX4_EN_DEF_RX_RING_SIZE 1024 /* Target number of bytes to coalesce with interrupt moderation */ #define MLX4_EN_RX_COAL_TARGET 44 #define MLX4_EN_RX_COAL_TIME 0x10 #define MLX4_EN_TX_COAL_PKTS 64 #define MLX4_EN_TX_COAL_TIME 64 #define MLX4_EN_RX_RATE_LOW 400000 #define MLX4_EN_RX_COAL_TIME_LOW 0 #define MLX4_EN_RX_RATE_HIGH 450000 #define MLX4_EN_RX_COAL_TIME_HIGH 128 #define MLX4_EN_RX_SIZE_THRESH 1024 #define MLX4_EN_RX_RATE_THRESH (1000000 / MLX4_EN_RX_COAL_TIME_HIGH) #define MLX4_EN_SAMPLE_INTERVAL 0 #define MLX4_EN_AVG_PKT_SMALL 256 #define MLX4_EN_AUTO_CONF 0xffff #define MLX4_EN_DEF_RX_PAUSE 1 #define MLX4_EN_DEF_TX_PAUSE 1 /* Interval between successive polls in the Tx routine when polling is used instead of interrupts (in per-core Tx rings) - should be power of 2 */ #define MLX4_EN_TX_POLL_MODER 16 #define MLX4_EN_TX_POLL_TIMEOUT (HZ / 4) #define MLX4_EN_64_ALIGN (64 - NET_SKB_PAD) #define SMALL_PACKET_SIZE (256 - NET_IP_ALIGN) #define HEADER_COPY_SIZE (128) #define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETHER_HDR_LEN) #define MLX4_EN_MIN_MTU 46 #define ETH_BCAST 0xffffffffffffULL #define MLX4_EN_LOOPBACK_RETRIES 5 #define MLX4_EN_LOOPBACK_TIMEOUT 100 #ifdef MLX4_EN_PERF_STAT /* Number of samples to 'average' */ #define AVG_SIZE 128 #define AVG_FACTOR 1024 #define INC_PERF_COUNTER(cnt) (++(cnt)) #define ADD_PERF_COUNTER(cnt, add) ((cnt) += (add)) #define AVG_PERF_COUNTER(cnt, sample) \ ((cnt) = ((cnt) * (AVG_SIZE - 1) + (sample) * AVG_FACTOR) / AVG_SIZE) #define GET_PERF_COUNTER(cnt) (cnt) #define GET_AVG_PERF_COUNTER(cnt) ((cnt) / AVG_FACTOR) #else #define INC_PERF_COUNTER(cnt) do {} while (0) #define ADD_PERF_COUNTER(cnt, add) do {} while (0) #define AVG_PERF_COUNTER(cnt, sample) do {} while (0) #define GET_PERF_COUNTER(cnt) (0) #define GET_AVG_PERF_COUNTER(cnt) (0) #endif /* MLX4_EN_PERF_STAT */ /* Constants for TX flow */ enum { MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */ MAX_BF = 256, MIN_PKT_LEN = 17, }; /* * Configurables */ enum cq_type { RX = 0, TX = 1, }; /* * Useful macros */ #define ROUNDUP_LOG2(x) ilog2(roundup_pow_of_two(x)) #define XNOR(x, y) (!(x) == !(y)) #define ILLEGAL_MAC(addr) (addr == 0xffffffffffffULL || addr == 0x0) struct mlx4_en_tx_info { bus_dmamap_t dma_map; struct mbuf *mb; u32 nr_txbb; u32 nr_bytes; }; #define MLX4_EN_BIT_DESC_OWN 0x80000000 #define CTRL_SIZE sizeof(struct mlx4_wqe_ctrl_seg) #define MLX4_EN_MEMTYPE_PAD 0x100 #define DS_SIZE sizeof(struct mlx4_wqe_data_seg) struct mlx4_en_tx_desc { struct mlx4_wqe_ctrl_seg ctrl; union { struct mlx4_wqe_data_seg data; /* at least one data segment */ struct mlx4_wqe_lso_seg lso; struct mlx4_wqe_inline_seg inl; }; }; #define MLX4_EN_USE_SRQ 0x01000000 #define MLX4_EN_RX_BUDGET 64 #define MLX4_EN_TX_MAX_DESC_SIZE 512 /* bytes */ #define MLX4_EN_TX_MAX_MBUF_SIZE 65536 /* bytes */ #define MLX4_EN_TX_MAX_PAYLOAD_SIZE 65536 /* bytes */ #define MLX4_EN_TX_MAX_MBUF_FRAGS \ ((MLX4_EN_TX_MAX_DESC_SIZE - 128) / DS_SIZE_ALIGNMENT) /* units */ #define MLX4_EN_TX_WQE_MAX_WQEBBS \ (MLX4_EN_TX_MAX_DESC_SIZE / TXBB_SIZE) /* units */ #define MLX4_EN_CX3_LOW_ID 0x1000 #define MLX4_EN_CX3_HIGH_ID 0x1005 struct mlx4_en_tx_ring { spinlock_t tx_lock; bus_dma_tag_t dma_tag; struct mlx4_hwq_resources wqres; u32 size ; /* number of TXBBs */ u32 size_mask; u16 stride; u16 cqn; /* index of port CQ associated with this ring */ u32 prod; u32 cons; u32 buf_size; u32 doorbell_qpn; u8 *buf; u16 poll_cnt; struct mlx4_en_tx_info *tx_info; u8 queue_index; u32 last_nr_txbb; struct mlx4_qp qp; struct mlx4_qp_context context; int qpn; enum mlx4_qp_state qp_state; struct mlx4_srq dummy; u64 bytes; u64 packets; u64 tx_csum; u64 queue_stopped; u64 oversized_packets; u64 wake_queue; u64 tso_packets; u64 defrag_attempts; struct mlx4_bf bf; bool bf_enabled; int hwtstamp_tx_type; spinlock_t comp_lock; int inline_thold; u64 watchdog_time; }; struct mlx4_en_rx_desc { struct mlx4_wqe_data_seg data[MLX4_EN_MAX_RX_SEGS]; }; /* the size of the structure above must be power of two */ CTASSERT(powerof2(sizeof(struct mlx4_en_rx_desc))); struct mlx4_en_rx_mbuf { bus_dmamap_t dma_map; struct mbuf *mbuf; }; struct mlx4_en_rx_spare { bus_dmamap_t dma_map; struct mbuf *mbuf; bus_dma_segment_t segs[MLX4_EN_MAX_RX_SEGS]; }; struct mlx4_en_rx_ring { struct mlx4_hwq_resources wqres; bus_dma_tag_t dma_tag; struct mlx4_en_rx_spare spare; u32 size ; /* number of Rx descs*/ u32 actual_size; u32 size_mask; u16 log_stride; u16 cqn; /* index of port CQ associated with this ring */ u32 prod; u32 cons; u32 buf_size; u8 fcs_del; u32 rx_mb_size; u32 rx_mr_key_be; int qpn; u8 *buf; struct mlx4_en_rx_mbuf *mbuf; u64 errors; u64 bytes; u64 packets; #ifdef LL_EXTENDED_STATS u64 yields; u64 misses; u64 cleaned; #endif u64 csum_ok; u64 csum_none; int hwtstamp_rx_filter; int numa_node; struct lro_ctrl lro; }; static inline int mlx4_en_can_lro(__be16 status) { const __be16 status_all = cpu_to_be16( MLX4_CQE_STATUS_IPV4 | MLX4_CQE_STATUS_IPV4F | MLX4_CQE_STATUS_IPV6 | MLX4_CQE_STATUS_IPV4OPT | MLX4_CQE_STATUS_TCP | MLX4_CQE_STATUS_UDP | MLX4_CQE_STATUS_IPOK); const __be16 status_ipv4_ipok_tcp = cpu_to_be16( MLX4_CQE_STATUS_IPV4 | MLX4_CQE_STATUS_IPOK | MLX4_CQE_STATUS_TCP); const __be16 status_ipv6_ipok_tcp = cpu_to_be16( MLX4_CQE_STATUS_IPV6 | MLX4_CQE_STATUS_IPOK | MLX4_CQE_STATUS_TCP); status &= status_all; return (status == status_ipv4_ipok_tcp || status == status_ipv6_ipok_tcp); } struct mlx4_en_cq { struct mlx4_cq mcq; struct mlx4_hwq_resources wqres; int ring; spinlock_t lock; - struct net_device *dev; + struct ifnet *dev; /* Per-core Tx cq processing support */ struct timer_list timer; int size; int buf_size; unsigned vector; enum cq_type is_tx; u16 moder_time; u16 moder_cnt; struct mlx4_cqe *buf; struct task cq_task; struct taskqueue *tq; #define MLX4_EN_OPCODE_ERROR 0x1e u32 tot_rx; u32 tot_tx; u32 curr_poll_rx_cpu_id; #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int state; #define MLX4_EN_CQ_STATE_IDLE 0 #define MLX4_EN_CQ_STATE_NAPI 1 /* NAPI owns this CQ */ #define MLX4_EN_CQ_STATE_POLL 2 /* poll owns this CQ */ #define MLX4_CQ_LOCKED (MLX4_EN_CQ_STATE_NAPI | MLX4_EN_CQ_STATE_POLL) #define MLX4_EN_CQ_STATE_NAPI_YIELD 4 /* NAPI yielded this CQ */ #define MLX4_EN_CQ_STATE_POLL_YIELD 8 /* poll yielded this CQ */ #define CQ_YIELD (MLX4_EN_CQ_STATE_NAPI_YIELD | MLX4_EN_CQ_STATE_POLL_YIELD) #define CQ_USER_PEND (MLX4_EN_CQ_STATE_POLL | MLX4_EN_CQ_STATE_POLL_YIELD) spinlock_t poll_lock; /* protects from LLS/napi conflicts */ #endif /* CONFIG_NET_RX_BUSY_POLL */ }; struct mlx4_en_port_profile { u32 flags; u32 tx_ring_num; u32 rx_ring_num; u32 tx_ring_size; u32 rx_ring_size; u8 rx_pause; u8 rx_ppp; u8 tx_pause; u8 tx_ppp; int rss_rings; int inline_thold; }; struct mlx4_en_profile { int rss_xor; int udp_rss; u8 rss_mask; u32 active_ports; u32 small_pkt_int; u8 no_reset; u8 num_tx_rings_p_up; struct mlx4_en_port_profile prof[MLX4_MAX_PORTS + 1]; }; struct mlx4_en_dev { struct mlx4_dev *dev; struct pci_dev *pdev; struct mutex state_lock; - struct net_device *pndev[MLX4_MAX_PORTS + 1]; + struct ifnet *pndev[MLX4_MAX_PORTS + 1]; u32 port_cnt; bool device_up; struct mlx4_en_profile profile; u32 LSO_support; struct workqueue_struct *workqueue; struct device *dma_device; void __iomem *uar_map; struct mlx4_uar priv_uar; struct mlx4_mr mr; u32 priv_pdn; spinlock_t uar_lock; u8 mac_removed[MLX4_MAX_PORTS + 1]; unsigned long last_overflow_check; unsigned long overflow_period; }; struct mlx4_en_rss_map { int base_qpn; struct mlx4_qp qps[MAX_RX_RINGS]; enum mlx4_qp_state state[MAX_RX_RINGS]; struct mlx4_qp indir_qp; enum mlx4_qp_state indir_state; }; enum mlx4_en_port_flag { MLX4_EN_PORT_ANC = 1<<0, /* Auto-negotiation complete */ MLX4_EN_PORT_ANE = 1<<1, /* Auto-negotiation enabled */ }; struct mlx4_en_port_state { int link_state; int link_speed; int transceiver; u32 flags; }; enum mlx4_en_addr_list_act { MLX4_ADDR_LIST_NONE, MLX4_ADDR_LIST_REM, MLX4_ADDR_LIST_ADD, }; struct mlx4_en_addr_list { struct list_head list; enum mlx4_en_addr_list_act action; u8 addr[ETH_ALEN]; u64 reg_id; u64 tunnel_reg_id; }; #ifdef CONFIG_MLX4_EN_DCB /* Minimal TC BW - setting to 0 will block traffic */ #define MLX4_EN_BW_MIN 1 #define MLX4_EN_BW_MAX 100 /* Utilize 100% of the line */ #define MLX4_EN_TC_VENDOR 0 #define MLX4_EN_TC_ETS 7 #endif enum { MLX4_EN_FLAG_PROMISC = (1 << 0), MLX4_EN_FLAG_MC_PROMISC = (1 << 1), /* whether we need to enable hardware loopback by putting dmac * in Tx WQE */ MLX4_EN_FLAG_ENABLE_HW_LOOPBACK = (1 << 2), /* whether we need to drop packets that hardware loopback-ed */ MLX4_EN_FLAG_RX_FILTER_NEEDED = (1 << 3), MLX4_EN_FLAG_FORCE_PROMISC = (1 << 4), #ifdef CONFIG_MLX4_EN_DCB MLX4_EN_FLAG_DCB_ENABLED = (1 << 5) #endif }; #define MLX4_EN_MAC_HASH_SIZE (1 << BITS_PER_BYTE) #define MLX4_EN_MAC_HASH_IDX 5 struct en_port { struct kobject kobj; struct mlx4_dev *dev; u8 port_num; u8 vport_num; }; struct mlx4_en_priv { struct mlx4_en_dev *mdev; struct mlx4_en_port_profile *prof; - struct net_device *dev; + struct ifnet *dev; unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; struct mlx4_en_port_state port_state; spinlock_t stats_lock; /* To allow rules removal while port is going down */ struct list_head ethtool_list; unsigned long last_moder_packets[MAX_RX_RINGS]; unsigned long last_moder_tx_packets; unsigned long last_moder_bytes[MAX_RX_RINGS]; unsigned long last_moder_jiffies; int last_moder_time[MAX_RX_RINGS]; u16 rx_usecs; u16 rx_frames; u16 tx_usecs; u16 tx_frames; u32 pkt_rate_low; u32 rx_usecs_low; u32 pkt_rate_high; u32 rx_usecs_high; u32 sample_interval; u32 adaptive_rx_coal; u32 msg_enable; u32 loopback_ok; u32 validate_loopback; struct mlx4_hwq_resources res; int link_state; int last_link_state; bool port_up; int port; int registered; int gone; int allocated; unsigned char current_mac[ETH_ALEN + 2]; u64 mac; int mac_index; unsigned max_mtu; int base_qpn; int cqe_factor; struct mlx4_en_rss_map rss_map; u32 flags; u8 num_tx_rings_p_up; u32 tx_ring_num; u32 rx_ring_num; u32 rx_mb_size; struct mlx4_en_tx_ring **tx_ring; struct mlx4_en_rx_ring *rx_ring[MAX_RX_RINGS]; struct mlx4_en_cq **tx_cq; struct mlx4_en_cq *rx_cq[MAX_RX_RINGS]; struct mlx4_qp drop_qp; struct work_struct rx_mode_task; struct work_struct watchdog_task; struct work_struct linkstate_task; struct delayed_work stats_task; struct delayed_work service_task; struct mlx4_en_perf_stats pstats; struct mlx4_en_pkt_stats pkstats; struct mlx4_en_pkt_stats pkstats_last; struct mlx4_en_flow_stats_rx rx_priority_flowstats[MLX4_NUM_PRIORITIES]; struct mlx4_en_flow_stats_tx tx_priority_flowstats[MLX4_NUM_PRIORITIES]; struct mlx4_en_flow_stats_rx rx_flowstats; struct mlx4_en_flow_stats_tx tx_flowstats; struct mlx4_en_port_stats port_stats; struct mlx4_en_vport_stats vport_stats; struct mlx4_en_vf_stats vf_stats; struct list_head mc_list; struct list_head uc_list; struct list_head curr_mc_list; struct list_head curr_uc_list; u64 broadcast_id; struct mlx4_en_stat_out_mbox hw_stats; int vids[128]; bool wol; struct device *ddev; struct dentry *dev_root; u32 counter_index; eventhandler_tag vlan_attach; eventhandler_tag vlan_detach; struct callout watchdog_timer; struct ifmedia media; volatile int blocked; struct sysctl_oid *conf_sysctl; struct sysctl_oid *stat_sysctl; struct sysctl_ctx_list conf_ctx; struct sysctl_ctx_list stat_ctx; #ifdef CONFIG_MLX4_EN_DCB struct ieee_ets ets; u16 maxrate[IEEE_8021QAZ_MAX_TCS]; u8 dcbx_cap; #endif #ifdef CONFIG_RFS_ACCEL spinlock_t filters_lock; int last_filter_id; struct list_head filters; struct hlist_head filter_hash[1 << MLX4_EN_FILTER_HASH_SHIFT]; #endif u64 tunnel_reg_id; struct en_port *vf_ports[MLX4_MAX_NUM_VF]; unsigned long last_ifq_jiffies; u64 if_counters_rx_errors; u64 if_counters_rx_no_buffer; }; enum mlx4_en_wol { MLX4_EN_WOL_MAGIC = (1ULL << 61), MLX4_EN_WOL_ENABLED = (1ULL << 62), }; struct mlx4_mac_entry { struct hlist_node hlist; unsigned char mac[ETH_ALEN + 2]; u64 reg_id; }; static inline void * -netdev_priv(const struct net_device *dev) +netdev_priv(const struct ifnet *dev) { return (dev->if_softc); } static inline struct mlx4_cqe *mlx4_en_get_cqe(u8 *buf, int idx, int cqe_sz) { return (struct mlx4_cqe *)(buf + idx * cqe_sz); } #ifdef CONFIG_NET_RX_BUSY_POLL static inline void mlx4_en_cq_init_lock(struct mlx4_en_cq *cq) { spin_lock_init(&cq->poll_lock); cq->state = MLX4_EN_CQ_STATE_IDLE; } /* called from the device poll rutine to get ownership of a cq */ static inline bool mlx4_en_cq_lock_napi(struct mlx4_en_cq *cq) { int rc = true; spin_lock(&cq->poll_lock); if (cq->state & MLX4_CQ_LOCKED) { WARN_ON(cq->state & MLX4_EN_CQ_STATE_NAPI); cq->state |= MLX4_EN_CQ_STATE_NAPI_YIELD; rc = false; } else /* we don't care if someone yielded */ cq->state = MLX4_EN_CQ_STATE_NAPI; spin_unlock(&cq->poll_lock); return rc; } /* returns true is someone tried to get the cq while napi had it */ static inline bool mlx4_en_cq_unlock_napi(struct mlx4_en_cq *cq) { int rc = false; spin_lock(&cq->poll_lock); WARN_ON(cq->state & (MLX4_EN_CQ_STATE_POLL | MLX4_EN_CQ_STATE_NAPI_YIELD)); if (cq->state & MLX4_EN_CQ_STATE_POLL_YIELD) rc = true; cq->state = MLX4_EN_CQ_STATE_IDLE; spin_unlock(&cq->poll_lock); return rc; } /* called from mlx4_en_low_latency_poll() */ static inline bool mlx4_en_cq_lock_poll(struct mlx4_en_cq *cq) { int rc = true; spin_lock_bh(&cq->poll_lock); if ((cq->state & MLX4_CQ_LOCKED)) { - struct net_device *dev = cq->dev; + struct ifnet *dev = cq->dev; struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_rx_ring *rx_ring = priv->rx_ring[cq->ring]; cq->state |= MLX4_EN_CQ_STATE_POLL_YIELD; rc = false; #ifdef LL_EXTENDED_STATS rx_ring->yields++; #endif } else /* preserve yield marks */ cq->state |= MLX4_EN_CQ_STATE_POLL; spin_unlock_bh(&cq->poll_lock); return rc; } /* returns true if someone tried to get the cq while it was locked */ static inline bool mlx4_en_cq_unlock_poll(struct mlx4_en_cq *cq) { int rc = false; spin_lock_bh(&cq->poll_lock); WARN_ON(cq->state & (MLX4_EN_CQ_STATE_NAPI)); if (cq->state & MLX4_EN_CQ_STATE_POLL_YIELD) rc = true; cq->state = MLX4_EN_CQ_STATE_IDLE; spin_unlock_bh(&cq->poll_lock); return rc; } /* true if a socket is polling, even if it did not get the lock */ static inline bool mlx4_en_cq_busy_polling(struct mlx4_en_cq *cq) { WARN_ON(!(cq->state & MLX4_CQ_LOCKED)); return cq->state & CQ_USER_PEND; } #else static inline void mlx4_en_cq_init_lock(struct mlx4_en_cq *cq) { } static inline bool mlx4_en_cq_lock_napi(struct mlx4_en_cq *cq) { return true; } static inline bool mlx4_en_cq_unlock_napi(struct mlx4_en_cq *cq) { return false; } static inline bool mlx4_en_cq_lock_poll(struct mlx4_en_cq *cq) { return false; } static inline bool mlx4_en_cq_unlock_poll(struct mlx4_en_cq *cq) { return false; } static inline bool mlx4_en_cq_busy_polling(struct mlx4_en_cq *cq) { return false; } #endif /* CONFIG_NET_RX_BUSY_POLL */ #define MLX4_EN_WOL_DO_MODIFY (1ULL << 63) -void mlx4_en_destroy_netdev(struct net_device *dev); +void mlx4_en_destroy_netdev(struct ifnet *dev); int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, struct mlx4_en_port_profile *prof); -int mlx4_en_start_port(struct net_device *dev); -void mlx4_en_stop_port(struct net_device *dev); +int mlx4_en_start_port(struct ifnet *dev); +void mlx4_en_stop_port(struct ifnet *dev); void mlx4_en_free_resources(struct mlx4_en_priv *priv); int mlx4_en_alloc_resources(struct mlx4_en_priv *priv); int mlx4_en_pre_config(struct mlx4_en_priv *priv); int mlx4_en_create_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq, int entries, int ring, enum cq_type mode, int node); void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq); int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, int cq_idx); void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); void mlx4_en_tx_irq(struct mlx4_cq *mcq); -u16 mlx4_en_select_queue(struct net_device *dev, struct mbuf *mb); +u16 mlx4_en_select_queue(struct ifnet *dev, struct mbuf *mb); int mlx4_en_xmit(struct mlx4_en_priv *priv, int tx_ind, struct mbuf **mbp); int mlx4_en_transmit(struct ifnet *dev, struct mbuf *m); int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring **pring, u32 size, u16 stride, int node, int queue_idx); void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring **pring); int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, int cq, int user_prio); void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring); void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev); void mlx4_en_qflush(struct ifnet *dev); int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring **pring, u32 size, int node); void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring **pring, u32 size); void mlx4_en_rx_que(void *context, int pending); int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv); void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring); -int mlx4_en_process_rx_cq(struct net_device *dev, +int mlx4_en_process_rx_cq(struct ifnet *dev, struct mlx4_en_cq *cq, int budget); void mlx4_en_poll_tx_cq(unsigned long data); void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride, int is_tx, int rss, int qpn, int cqn, int user_prio, struct mlx4_qp_context *context); void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event); int mlx4_en_map_buffer(struct mlx4_buf *buf); void mlx4_en_unmap_buffer(struct mlx4_buf *buf); -void mlx4_en_calc_rx_buf(struct net_device *dev); +void mlx4_en_calc_rx_buf(struct ifnet *dev); const u32 *mlx4_en_get_rss_key(struct mlx4_en_priv *priv, u16 *keylen); u8 mlx4_en_get_rss_mask(struct mlx4_en_priv *priv); int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv); void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv); int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv); void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv); -int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring); +int mlx4_en_free_tx_buf(struct ifnet *dev, struct mlx4_en_tx_ring *ring); void mlx4_en_rx_irq(struct mlx4_cq *mcq); int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv); int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset); int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port); int mlx4_en_get_vport_stats(struct mlx4_en_dev *mdev, u8 port); void mlx4_en_create_debug_files(struct mlx4_en_priv *priv); void mlx4_en_delete_debug_files(struct mlx4_en_priv *priv); int mlx4_en_register_debugfs(void); void mlx4_en_unregister_debugfs(void); #ifdef CONFIG_MLX4_EN_DCB extern const struct dcbnl_rtnl_ops mlx4_en_dcbnl_ops; extern const struct dcbnl_rtnl_ops mlx4_en_dcbnl_pfc_ops; #endif -int mlx4_en_setup_tc(struct net_device *dev, u8 up); +int mlx4_en_setup_tc(struct ifnet *dev, u8 up); #ifdef CONFIG_RFS_ACCEL void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *rx_ring); #endif #define MLX4_EN_NUM_SELF_TEST 5 -void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf); +void mlx4_en_ex_selftest(struct ifnet *dev, u32 *flags, u64 *buf); void mlx4_en_ptp_overflow_check(struct mlx4_en_dev *mdev); /* * Functions for time stamping */ #define SKBTX_HW_TSTAMP (1 << 0) #define SKBTX_IN_PROGRESS (1 << 2) u64 mlx4_en_get_cqe_ts(struct mlx4_cqe *cqe); /* Functions for caching and restoring statistics */ -int mlx4_en_get_sset_count(struct net_device *dev, int sset); +int mlx4_en_get_sset_count(struct ifnet *dev, int sset); void mlx4_en_restore_ethtool_stats(struct mlx4_en_priv *priv, u64 *data); /* * Globals */ extern const struct ethtool_ops mlx4_en_ethtool_ops; /* * Defines for link speed - needed by selftest */ #define MLX4_EN_LINK_SPEED_1G 1000 #define MLX4_EN_LINK_SPEED_10G 10000 #define MLX4_EN_LINK_SPEED_40G 40000 enum { NETIF_MSG_DRV = 0x0001, NETIF_MSG_PROBE = 0x0002, NETIF_MSG_LINK = 0x0004, NETIF_MSG_TIMER = 0x0008, NETIF_MSG_IFDOWN = 0x0010, NETIF_MSG_IFUP = 0x0020, NETIF_MSG_RX_ERR = 0x0040, NETIF_MSG_TX_ERR = 0x0080, NETIF_MSG_TX_QUEUED = 0x0100, NETIF_MSG_INTR = 0x0200, NETIF_MSG_TX_DONE = 0x0400, NETIF_MSG_RX_STATUS = 0x0800, NETIF_MSG_PKTDATA = 0x1000, NETIF_MSG_HW = 0x2000, NETIF_MSG_WOL = 0x4000, }; /* * printk / logging functions */ #define en_print(level, priv, format, arg...) \ { \ if ((priv)->registered) \ printk(level "%s: %s: " format, DRV_NAME, \ (priv)->dev->if_xname, ## arg); \ else \ printk(level "%s: %s: Port %d: " format, \ DRV_NAME, dev_name(&(priv)->mdev->pdev->dev), \ (priv)->port, ## arg); \ } #define en_dbg(mlevel, priv, format, arg...) \ do { \ if (NETIF_MSG_##mlevel & priv->msg_enable) \ en_print(KERN_DEBUG, priv, format, ##arg); \ } while (0) #define en_warn(priv, format, arg...) \ en_print(KERN_WARNING, priv, format, ##arg) #define en_err(priv, format, arg...) \ en_print(KERN_ERR, priv, format, ##arg) #define en_info(priv, format, arg...) \ en_print(KERN_INFO, priv, format, ## arg) #define mlx4_err(mdev, format, arg...) \ pr_err("%s %s: " format, DRV_NAME, \ dev_name(&(mdev)->pdev->dev), ##arg) #define mlx4_info(mdev, format, arg...) \ pr_info("%s %s: " format, DRV_NAME, \ dev_name(&(mdev)->pdev->dev), ##arg) #define mlx4_warn(mdev, format, arg...) \ pr_warning("%s %s: " format, DRV_NAME, \ dev_name(&(mdev)->pdev->dev), ##arg) #endif diff --git a/sys/dev/mlx4/mlx4_en/mlx4_en_netdev.c b/sys/dev/mlx4/mlx4_en/mlx4_en_netdev.c index b36afc66d476..eec620f5b90e 100644 --- a/sys/dev/mlx4/mlx4_en/mlx4_en_netdev.c +++ b/sys/dev/mlx4/mlx4_en/mlx4_en_netdev.c @@ -1,2946 +1,2946 @@ /* * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include #include #include #include #ifdef CONFIG_NET_RX_BUSY_POLL #include #endif #include #include #include #include #include #include #include #include #include #include "en.h" #include "en_port.h" DEBUGNET_DEFINE(mlx4_en); static void mlx4_en_sysctl_stat(struct mlx4_en_priv *priv); static void mlx4_en_sysctl_conf(struct mlx4_en_priv *priv); #ifdef CONFIG_NET_RX_BUSY_POLL /* must be called with local_bh_disable()d */ static int mlx4_en_low_latency_recv(struct napi_struct *napi) { struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi); - struct net_device *dev = cq->dev; + struct ifnet *dev = cq->dev; struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_rx_ring *rx_ring = priv->rx_ring[cq->ring]; int done; if (!priv->port_up) return LL_FLUSH_FAILED; if (!mlx4_en_cq_lock_poll(cq)) return LL_FLUSH_BUSY; done = mlx4_en_process_rx_cq(dev, cq, 4); #ifdef LL_EXTENDED_STATS if (likely(done)) rx_ring->cleaned += done; else rx_ring->misses++; #endif mlx4_en_cq_unlock_poll(cq); return done; } #endif /* CONFIG_NET_RX_BUSY_POLL */ #ifdef CONFIG_RFS_ACCEL struct mlx4_en_filter { struct list_head next; struct work_struct work; u8 ip_proto; __be32 src_ip; __be32 dst_ip; __be16 src_port; __be16 dst_port; int rxq_index; struct mlx4_en_priv *priv; u32 flow_id; /* RFS infrastructure id */ int id; /* mlx4_en driver id */ u64 reg_id; /* Flow steering API id */ u8 activated; /* Used to prevent expiry before filter * is attached */ struct hlist_node filter_chain; }; static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv); static enum mlx4_net_trans_rule_id mlx4_ip_proto_to_trans_rule_id(u8 ip_proto) { switch (ip_proto) { case IPPROTO_UDP: return MLX4_NET_TRANS_RULE_ID_UDP; case IPPROTO_TCP: return MLX4_NET_TRANS_RULE_ID_TCP; default: return MLX4_NET_TRANS_RULE_NUM; } }; static void mlx4_en_filter_work(struct work_struct *work) { struct mlx4_en_filter *filter = container_of(work, struct mlx4_en_filter, work); struct mlx4_en_priv *priv = filter->priv; struct mlx4_spec_list spec_tcp_udp = { .id = mlx4_ip_proto_to_trans_rule_id(filter->ip_proto), { .tcp_udp = { .dst_port = filter->dst_port, .dst_port_msk = (__force __be16)-1, .src_port = filter->src_port, .src_port_msk = (__force __be16)-1, }, }, }; struct mlx4_spec_list spec_ip = { .id = MLX4_NET_TRANS_RULE_ID_IPV4, { .ipv4 = { .dst_ip = filter->dst_ip, .dst_ip_msk = (__force __be32)-1, .src_ip = filter->src_ip, .src_ip_msk = (__force __be32)-1, }, }, }; struct mlx4_spec_list spec_eth = { .id = MLX4_NET_TRANS_RULE_ID_ETH, }; struct mlx4_net_trans_rule rule = { .list = LIST_HEAD_INIT(rule.list), .queue_mode = MLX4_NET_TRANS_Q_LIFO, .exclusive = 1, .allow_loopback = 1, .promisc_mode = MLX4_FS_REGULAR, .port = priv->port, .priority = MLX4_DOMAIN_RFS, }; int rc; __be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16); if (spec_tcp_udp.id >= MLX4_NET_TRANS_RULE_NUM) { en_warn(priv, "RFS: ignoring unsupported ip protocol (%d)\n", filter->ip_proto); goto ignore; } list_add_tail(&spec_eth.list, &rule.list); list_add_tail(&spec_ip.list, &rule.list); list_add_tail(&spec_tcp_udp.list, &rule.list); rule.qpn = priv->rss_map.qps[filter->rxq_index].qpn; memcpy(spec_eth.eth.dst_mac, priv->dev->dev_addr, ETH_ALEN); memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN); filter->activated = 0; if (filter->reg_id) { rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id); if (rc && rc != -ENOENT) en_err(priv, "Error detaching flow. rc = %d\n", rc); } rc = mlx4_flow_attach(priv->mdev->dev, &rule, &filter->reg_id); if (rc) en_err(priv, "Error attaching flow. err = %d\n", rc); ignore: mlx4_en_filter_rfs_expire(priv); filter->activated = 1; } static inline struct hlist_head * filter_hash_bucket(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip, __be16 src_port, __be16 dst_port) { unsigned long l; int bucket_idx; l = (__force unsigned long)src_port | ((__force unsigned long)dst_port << 2); l ^= (__force unsigned long)(src_ip ^ dst_ip); bucket_idx = hash_long(l, MLX4_EN_FILTER_HASH_SHIFT); return &priv->filter_hash[bucket_idx]; } static struct mlx4_en_filter * mlx4_en_filter_alloc(struct mlx4_en_priv *priv, int rxq_index, __be32 src_ip, __be32 dst_ip, u8 ip_proto, __be16 src_port, __be16 dst_port, u32 flow_id) { struct mlx4_en_filter *filter = NULL; filter = kzalloc(sizeof(struct mlx4_en_filter), GFP_ATOMIC); if (!filter) return NULL; filter->priv = priv; filter->rxq_index = rxq_index; INIT_WORK(&filter->work, mlx4_en_filter_work); filter->src_ip = src_ip; filter->dst_ip = dst_ip; filter->ip_proto = ip_proto; filter->src_port = src_port; filter->dst_port = dst_port; filter->flow_id = flow_id; filter->id = priv->last_filter_id++ % RPS_NO_FILTER; list_add_tail(&filter->next, &priv->filters); hlist_add_head(&filter->filter_chain, filter_hash_bucket(priv, src_ip, dst_ip, src_port, dst_port)); return filter; } static void mlx4_en_filter_free(struct mlx4_en_filter *filter) { struct mlx4_en_priv *priv = filter->priv; int rc; list_del(&filter->next); rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id); if (rc && rc != -ENOENT) en_err(priv, "Error detaching flow. rc = %d\n", rc); kfree(filter); } static inline struct mlx4_en_filter * mlx4_en_filter_find(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip, u8 ip_proto, __be16 src_port, __be16 dst_port) { struct mlx4_en_filter *filter; struct mlx4_en_filter *ret = NULL; hlist_for_each_entry(filter, filter_hash_bucket(priv, src_ip, dst_ip, src_port, dst_port), filter_chain) { if (filter->src_ip == src_ip && filter->dst_ip == dst_ip && filter->ip_proto == ip_proto && filter->src_port == src_port && filter->dst_port == dst_port) { ret = filter; break; } } return ret; } static int -mlx4_en_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, +mlx4_en_filter_rfs(struct ifnet *net_dev, const struct sk_buff *skb, u16 rxq_index, u32 flow_id) { struct mlx4_en_priv *priv = netdev_priv(net_dev); struct mlx4_en_filter *filter; const struct iphdr *ip; const __be16 *ports; u8 ip_proto; __be32 src_ip; __be32 dst_ip; __be16 src_port; __be16 dst_port; int nhoff = skb_network_offset(skb); int ret = 0; if (skb->protocol != htons(ETH_P_IP)) return -EPROTONOSUPPORT; ip = (const struct iphdr *)(skb->data + nhoff); if (ip_is_fragment(ip)) return -EPROTONOSUPPORT; if ((ip->protocol != IPPROTO_TCP) && (ip->protocol != IPPROTO_UDP)) return -EPROTONOSUPPORT; ports = (const __be16 *)(skb->data + nhoff + 4 * ip->ihl); ip_proto = ip->protocol; src_ip = ip->saddr; dst_ip = ip->daddr; src_port = ports[0]; dst_port = ports[1]; spin_lock_bh(&priv->filters_lock); filter = mlx4_en_filter_find(priv, src_ip, dst_ip, ip_proto, src_port, dst_port); if (filter) { if (filter->rxq_index == rxq_index) goto out; filter->rxq_index = rxq_index; } else { filter = mlx4_en_filter_alloc(priv, rxq_index, src_ip, dst_ip, ip_proto, src_port, dst_port, flow_id); if (!filter) { ret = -ENOMEM; goto err; } } queue_work(priv->mdev->workqueue, &filter->work); out: ret = filter->id; err: spin_unlock_bh(&priv->filters_lock); return ret; } void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv) { struct mlx4_en_filter *filter, *tmp; LIST_HEAD(del_list); spin_lock_bh(&priv->filters_lock); list_for_each_entry_safe(filter, tmp, &priv->filters, next) { list_move(&filter->next, &del_list); hlist_del(&filter->filter_chain); } spin_unlock_bh(&priv->filters_lock); list_for_each_entry_safe(filter, tmp, &del_list, next) { cancel_work_sync(&filter->work); mlx4_en_filter_free(filter); } } static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv) { struct mlx4_en_filter *filter = NULL, *tmp, *last_filter = NULL; LIST_HEAD(del_list); int i = 0; spin_lock_bh(&priv->filters_lock); list_for_each_entry_safe(filter, tmp, &priv->filters, next) { if (i > MLX4_EN_FILTER_EXPIRY_QUOTA) break; if (filter->activated && !work_pending(&filter->work) && rps_may_expire_flow(priv->dev, filter->rxq_index, filter->flow_id, filter->id)) { list_move(&filter->next, &del_list); hlist_del(&filter->filter_chain); } else last_filter = filter; i++; } if (last_filter && (&last_filter->next != priv->filters.next)) list_move(&priv->filters, &last_filter->next); spin_unlock_bh(&priv->filters_lock); list_for_each_entry_safe(filter, tmp, &del_list, next) mlx4_en_filter_free(filter); } #endif -static void mlx4_en_vlan_rx_add_vid(void *arg, struct net_device *dev, u16 vid) +static void mlx4_en_vlan_rx_add_vid(void *arg, struct ifnet *dev, u16 vid) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; int err; int idx; if (arg != priv) return; en_dbg(HW, priv, "adding VLAN:%d\n", vid); set_bit(vid, priv->active_vlans); /* Add VID to port VLAN filter */ mutex_lock(&mdev->state_lock); if (mdev->device_up && priv->port_up) { err = mlx4_SET_VLAN_FLTR(mdev->dev, priv); if (err) en_err(priv, "Failed configuring VLAN filter\n"); } if (mlx4_register_vlan(mdev->dev, priv->port, vid, &idx)) en_dbg(HW, priv, "failed adding vlan %d\n", vid); mutex_unlock(&mdev->state_lock); } -static void mlx4_en_vlan_rx_kill_vid(void *arg, struct net_device *dev, u16 vid) +static void mlx4_en_vlan_rx_kill_vid(void *arg, struct ifnet *dev, u16 vid) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; int err; if (arg != priv) return; en_dbg(HW, priv, "Killing VID:%d\n", vid); clear_bit(vid, priv->active_vlans); /* Remove VID from port VLAN filter */ mutex_lock(&mdev->state_lock); mlx4_unregister_vlan(mdev->dev, priv->port, vid); if (mdev->device_up && priv->port_up) { err = mlx4_SET_VLAN_FLTR(mdev->dev, priv); if (err) en_err(priv, "Failed configuring VLAN filter\n"); } mutex_unlock(&mdev->state_lock); } static int mlx4_en_tunnel_steer_add(struct mlx4_en_priv *priv, unsigned char *addr, int qpn, u64 *reg_id) { int err; if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN || priv->mdev->dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC) return 0; /* do nothing */ err = mlx4_tunnel_steer_add(priv->mdev->dev, addr, priv->port, qpn, MLX4_DOMAIN_NIC, reg_id); if (err) { en_err(priv, "failed to add vxlan steering rule, err %d\n", err); return err; } en_dbg(DRV, priv, "added vxlan steering rule, mac %pM reg_id %llx\n", addr, (long long)*reg_id); return 0; } static int mlx4_en_uc_steer_add(struct mlx4_en_priv *priv, unsigned char *mac, int *qpn, u64 *reg_id) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_dev *dev = mdev->dev; int err; switch (dev->caps.steering_mode) { case MLX4_STEERING_MODE_B0: { struct mlx4_qp qp; u8 gid[16] = {0}; qp.qpn = *qpn; memcpy(&gid[10], mac, ETH_ALEN); gid[5] = priv->port; err = mlx4_unicast_attach(dev, &qp, gid, 0, MLX4_PROT_ETH); break; } case MLX4_STEERING_MODE_DEVICE_MANAGED: { struct mlx4_spec_list spec_eth = { {NULL} }; __be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16); struct mlx4_net_trans_rule rule = { .queue_mode = MLX4_NET_TRANS_Q_FIFO, .exclusive = 0, .allow_loopback = 1, .promisc_mode = MLX4_FS_REGULAR, .priority = MLX4_DOMAIN_NIC, }; rule.port = priv->port; rule.qpn = *qpn; INIT_LIST_HEAD(&rule.list); spec_eth.id = MLX4_NET_TRANS_RULE_ID_ETH; memcpy(spec_eth.eth.dst_mac, mac, ETH_ALEN); memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN); list_add_tail(&spec_eth.list, &rule.list); err = mlx4_flow_attach(dev, &rule, reg_id); break; } default: return -EINVAL; } if (err) en_warn(priv, "Failed Attaching Unicast\n"); return err; } static void mlx4_en_uc_steer_release(struct mlx4_en_priv *priv, unsigned char *mac, int qpn, u64 reg_id) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_dev *dev = mdev->dev; switch (dev->caps.steering_mode) { case MLX4_STEERING_MODE_B0: { struct mlx4_qp qp; u8 gid[16] = {0}; qp.qpn = qpn; memcpy(&gid[10], mac, ETH_ALEN); gid[5] = priv->port; mlx4_unicast_detach(dev, &qp, gid, MLX4_PROT_ETH); break; } case MLX4_STEERING_MODE_DEVICE_MANAGED: { mlx4_flow_detach(dev, reg_id); break; } default: en_err(priv, "Invalid steering mode.\n"); } } static int mlx4_en_get_qp(struct mlx4_en_priv *priv) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_dev *dev = mdev->dev; int index = 0; int err = 0; int *qpn = &priv->base_qpn; u64 mac = mlx4_mac_to_u64(IF_LLADDR(priv->dev)); en_dbg(DRV, priv, "Registering MAC: %pM for adding\n", IF_LLADDR(priv->dev)); index = mlx4_register_mac(dev, priv->port, mac); if (index < 0) { err = index; en_err(priv, "Failed adding MAC: %pM\n", IF_LLADDR(priv->dev)); return err; } if (dev->caps.steering_mode == MLX4_STEERING_MODE_A0) { int base_qpn = mlx4_get_base_qpn(dev, priv->port); *qpn = base_qpn + index; return 0; } err = mlx4_qp_reserve_range(dev, 1, 1, qpn, MLX4_RESERVE_A0_QP); en_dbg(DRV, priv, "Reserved qp %d\n", *qpn); if (err) { en_err(priv, "Failed to reserve qp for mac registration\n"); mlx4_unregister_mac(dev, priv->port, mac); return err; } return 0; } static void mlx4_en_put_qp(struct mlx4_en_priv *priv) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_dev *dev = mdev->dev; int qpn = priv->base_qpn; if (dev->caps.steering_mode == MLX4_STEERING_MODE_A0) { u64 mac = mlx4_mac_to_u64(IF_LLADDR(priv->dev)); en_dbg(DRV, priv, "Registering MAC: %pM for deleting\n", IF_LLADDR(priv->dev)); mlx4_unregister_mac(dev, priv->port, mac); } else { en_dbg(DRV, priv, "Releasing qp: port %d, qpn %d\n", priv->port, qpn); mlx4_qp_release_range(dev, qpn, 1); priv->flags &= ~MLX4_EN_FLAG_FORCE_PROMISC; } } -static void mlx4_en_clear_uclist(struct net_device *dev) +static void mlx4_en_clear_uclist(struct ifnet *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_addr_list *tmp, *uc_to_del; list_for_each_entry_safe(uc_to_del, tmp, &priv->uc_list, list) { list_del(&uc_to_del->list); kfree(uc_to_del); } } static u_int mlx4_copy_addr(void *arg, struct sockaddr_dl *sdl, u_int cnt) { struct mlx4_en_priv *priv = arg; struct mlx4_en_addr_list *tmp; if (sdl->sdl_alen != ETHER_ADDR_LEN) /* XXXGL: can that happen? */ return (0); tmp = kzalloc(sizeof(struct mlx4_en_addr_list), GFP_ATOMIC); if (tmp == NULL) { en_err(priv, "Failed to allocate address list\n"); return (0); } memcpy(tmp->addr, LLADDR(sdl), ETH_ALEN); list_add_tail(&tmp->list, &priv->uc_list); return (1); } -static void mlx4_en_cache_uclist(struct net_device *dev) +static void mlx4_en_cache_uclist(struct ifnet *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); mlx4_en_clear_uclist(dev); if_foreach_lladdr(dev, mlx4_copy_addr, priv); } -static void mlx4_en_clear_mclist(struct net_device *dev) +static void mlx4_en_clear_mclist(struct ifnet *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_addr_list *tmp, *mc_to_del; list_for_each_entry_safe(mc_to_del, tmp, &priv->mc_list, list) { list_del(&mc_to_del->list); kfree(mc_to_del); } } static u_int mlx4_copy_maddr(void *arg, struct sockaddr_dl *sdl, u_int count) { struct mlx4_en_priv *priv = arg; struct mlx4_en_addr_list *tmp; if (sdl->sdl_alen != ETHER_ADDR_LEN) /* XXXGL: can that happen? */ return (0); tmp = kzalloc(sizeof(struct mlx4_en_addr_list), GFP_ATOMIC); if (tmp == NULL) { en_err(priv, "Failed to allocate address list\n"); return (0); } memcpy(tmp->addr, LLADDR(sdl), ETH_ALEN); list_add_tail(&tmp->list, &priv->mc_list); return (1); } -static void mlx4_en_cache_mclist(struct net_device *dev) +static void mlx4_en_cache_mclist(struct ifnet *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); mlx4_en_clear_mclist(dev); if_foreach_llmaddr(dev, mlx4_copy_maddr, priv); } static void update_addr_list_flags(struct mlx4_en_priv *priv, struct list_head *dst, struct list_head *src) { struct mlx4_en_addr_list *dst_tmp, *src_tmp, *new_mc; bool found; /* Find all the entries that should be removed from dst, * These are the entries that are not found in src */ list_for_each_entry(dst_tmp, dst, list) { found = false; list_for_each_entry(src_tmp, src, list) { if (!memcmp(dst_tmp->addr, src_tmp->addr, ETH_ALEN)) { found = true; break; } } if (!found) dst_tmp->action = MLX4_ADDR_LIST_REM; } /* Add entries that exist in src but not in dst * mark them as need to add */ list_for_each_entry(src_tmp, src, list) { found = false; list_for_each_entry(dst_tmp, dst, list) { if (!memcmp(dst_tmp->addr, src_tmp->addr, ETH_ALEN)) { dst_tmp->action = MLX4_ADDR_LIST_NONE; found = true; break; } } if (!found) { new_mc = kmalloc(sizeof(struct mlx4_en_addr_list), GFP_KERNEL); if (!new_mc) { en_err(priv, "Failed to allocate current multicast list\n"); return; } memcpy(new_mc, src_tmp, sizeof(struct mlx4_en_addr_list)); new_mc->action = MLX4_ADDR_LIST_ADD; list_add_tail(&new_mc->list, dst); } } } -static void mlx4_en_set_rx_mode(struct net_device *dev) +static void mlx4_en_set_rx_mode(struct ifnet *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); if (!priv->port_up) return; queue_work(priv->mdev->workqueue, &priv->rx_mode_task); } static void mlx4_en_set_promisc_mode(struct mlx4_en_priv *priv, struct mlx4_en_dev *mdev) { int err = 0; if (!(priv->flags & MLX4_EN_FLAG_PROMISC)) { priv->flags |= MLX4_EN_FLAG_PROMISC; /* Enable promiscouos mode */ switch (mdev->dev->caps.steering_mode) { case MLX4_STEERING_MODE_DEVICE_MANAGED: err = mlx4_flow_steer_promisc_add(mdev->dev, priv->port, priv->base_qpn, MLX4_FS_ALL_DEFAULT); if (err) en_err(priv, "Failed enabling promiscuous mode\n"); priv->flags |= MLX4_EN_FLAG_MC_PROMISC; break; case MLX4_STEERING_MODE_B0: err = mlx4_unicast_promisc_add(mdev->dev, priv->base_qpn, priv->port); if (err) en_err(priv, "Failed enabling unicast promiscuous mode\n"); /* Add the default qp number as multicast * promisc */ if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) { err = mlx4_multicast_promisc_add(mdev->dev, priv->base_qpn, priv->port); if (err) en_err(priv, "Failed enabling multicast promiscuous mode\n"); priv->flags |= MLX4_EN_FLAG_MC_PROMISC; } break; case MLX4_STEERING_MODE_A0: err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 1); if (err) en_err(priv, "Failed enabling promiscuous mode\n"); break; } /* Disable port multicast filter (unconditionally) */ err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 0, MLX4_MCAST_DISABLE); if (err) en_err(priv, "Failed disabling multicast filter\n"); } } static void mlx4_en_clear_promisc_mode(struct mlx4_en_priv *priv, struct mlx4_en_dev *mdev) { int err = 0; priv->flags &= ~MLX4_EN_FLAG_PROMISC; /* Disable promiscouos mode */ switch (mdev->dev->caps.steering_mode) { case MLX4_STEERING_MODE_DEVICE_MANAGED: err = mlx4_flow_steer_promisc_remove(mdev->dev, priv->port, MLX4_FS_ALL_DEFAULT); if (err) en_err(priv, "Failed disabling promiscuous mode\n"); priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; break; case MLX4_STEERING_MODE_B0: err = mlx4_unicast_promisc_remove(mdev->dev, priv->base_qpn, priv->port); if (err) en_err(priv, "Failed disabling unicast promiscuous mode\n"); /* Disable Multicast promisc */ if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) { err = mlx4_multicast_promisc_remove(mdev->dev, priv->base_qpn, priv->port); if (err) en_err(priv, "Failed disabling multicast promiscuous mode\n"); priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; } break; case MLX4_STEERING_MODE_A0: err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 0); if (err) en_err(priv, "Failed disabling promiscuous mode\n"); break; } } static void mlx4_en_do_multicast(struct mlx4_en_priv *priv, - struct net_device *dev, + struct ifnet *dev, struct mlx4_en_dev *mdev) { struct mlx4_en_addr_list *addr_list, *tmp; u8 mc_list[16] = {0}; int err = 0; u64 mcast_addr = 0; /* * Enable/disable the multicast filter according to * IFF_ALLMULTI and IFF_PROMISC: */ if (dev->if_flags & (IFF_ALLMULTI | IFF_PROMISC)) { err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 0, MLX4_MCAST_DISABLE); if (err) en_err(priv, "Failed disabling multicast filter\n"); /* Add the default qp number as multicast promisc */ if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) { switch (mdev->dev->caps.steering_mode) { case MLX4_STEERING_MODE_DEVICE_MANAGED: err = mlx4_flow_steer_promisc_add(mdev->dev, priv->port, priv->base_qpn, MLX4_FS_MC_DEFAULT); break; case MLX4_STEERING_MODE_B0: err = mlx4_multicast_promisc_add(mdev->dev, priv->base_qpn, priv->port); break; case MLX4_STEERING_MODE_A0: break; } if (err) en_err(priv, "Failed entering multicast promisc mode\n"); priv->flags |= MLX4_EN_FLAG_MC_PROMISC; } } else { /* Disable Multicast promisc */ if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) { switch (mdev->dev->caps.steering_mode) { case MLX4_STEERING_MODE_DEVICE_MANAGED: err = mlx4_flow_steer_promisc_remove(mdev->dev, priv->port, MLX4_FS_MC_DEFAULT); break; case MLX4_STEERING_MODE_B0: err = mlx4_multicast_promisc_remove(mdev->dev, priv->base_qpn, priv->port); break; case MLX4_STEERING_MODE_A0: break; } if (err) en_err(priv, "Failed disabling multicast promiscuous mode\n"); priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; } err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 0, MLX4_MCAST_DISABLE); if (err) en_err(priv, "Failed disabling multicast filter\n"); /* Flush mcast filter and init it with broadcast address */ mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, ETH_BCAST, 1, MLX4_MCAST_CONFIG); /* Update multicast list - we cache all addresses so they won't * change while HW is updated holding the command semaphor */ mlx4_en_cache_mclist(dev); list_for_each_entry(addr_list, &priv->mc_list, list) { mcast_addr = mlx4_mac_to_u64(addr_list->addr); mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, mcast_addr, 0, MLX4_MCAST_CONFIG); } err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 0, MLX4_MCAST_ENABLE); if (err) en_err(priv, "Failed enabling multicast filter\n"); update_addr_list_flags(priv, &priv->curr_mc_list, &priv->mc_list); list_for_each_entry_safe(addr_list, tmp, &priv->curr_mc_list, list) { if (addr_list->action == MLX4_ADDR_LIST_REM) { /* detach this address and delete from list */ memcpy(&mc_list[10], addr_list->addr, ETH_ALEN); mc_list[5] = priv->port; err = mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, mc_list, MLX4_PROT_ETH, addr_list->reg_id); if (err) en_err(priv, "Fail to detach multicast address\n"); if (addr_list->tunnel_reg_id) { err = mlx4_flow_detach(priv->mdev->dev, addr_list->tunnel_reg_id); if (err) en_err(priv, "Failed to detach multicast address\n"); } /* remove from list */ list_del(&addr_list->list); kfree(addr_list); } else if (addr_list->action == MLX4_ADDR_LIST_ADD) { /* attach the address */ memcpy(&mc_list[10], addr_list->addr, ETH_ALEN); /* needed for B0 steering support */ mc_list[5] = priv->port; err = mlx4_multicast_attach(mdev->dev, &priv->rss_map.indir_qp, mc_list, priv->port, 0, MLX4_PROT_ETH, &addr_list->reg_id); if (err) en_err(priv, "Fail to attach multicast address\n"); err = mlx4_en_tunnel_steer_add(priv, &mc_list[10], priv->base_qpn, &addr_list->tunnel_reg_id); if (err) en_err(priv, "Failed to attach multicast address\n"); } } } } static void mlx4_en_do_unicast(struct mlx4_en_priv *priv, - struct net_device *dev, + struct ifnet *dev, struct mlx4_en_dev *mdev) { struct mlx4_en_addr_list *addr_list, *tmp; int err; /* Update unicast list */ mlx4_en_cache_uclist(dev); update_addr_list_flags(priv, &priv->curr_uc_list, &priv->uc_list); list_for_each_entry_safe(addr_list, tmp, &priv->curr_uc_list, list) { if (addr_list->action == MLX4_ADDR_LIST_REM) { mlx4_en_uc_steer_release(priv, addr_list->addr, priv->rss_map.indir_qp.qpn, addr_list->reg_id); /* remove from list */ list_del(&addr_list->list); kfree(addr_list); } else if (addr_list->action == MLX4_ADDR_LIST_ADD) { err = mlx4_en_uc_steer_add(priv, addr_list->addr, &priv->rss_map.indir_qp.qpn, &addr_list->reg_id); if (err) en_err(priv, "Fail to add unicast address\n"); } } } static void mlx4_en_do_set_rx_mode(struct work_struct *work) { struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, rx_mode_task); struct mlx4_en_dev *mdev = priv->mdev; - struct net_device *dev = priv->dev; + struct ifnet *dev = priv->dev; mutex_lock(&mdev->state_lock); if (!mdev->device_up) { en_dbg(HW, priv, "Card is not up, ignoring rx mode change.\n"); goto out; } if (!priv->port_up) { en_dbg(HW, priv, "Port is down, ignoring rx mode change.\n"); goto out; } if (!mlx4_en_QUERY_PORT(mdev, priv->port)) { if (priv->port_state.link_state) { priv->last_link_state = MLX4_DEV_EVENT_PORT_UP; /* update netif baudrate */ priv->dev->if_baudrate = IF_Mbps(priv->port_state.link_speed); /* Important note: the following call for if_link_state_change * is needed for interface up scenario (start port, link state * change) */ if_link_state_change(priv->dev, LINK_STATE_UP); en_dbg(HW, priv, "Link Up\n"); } } /* Set unicast rules */ mlx4_en_do_unicast(priv, dev, mdev); /* Promsicuous mode: disable all filters */ if ((dev->if_flags & IFF_PROMISC) || (priv->flags & MLX4_EN_FLAG_FORCE_PROMISC)) { mlx4_en_set_promisc_mode(priv, mdev); } else if (priv->flags & MLX4_EN_FLAG_PROMISC) { /* Not in promiscuous mode */ mlx4_en_clear_promisc_mode(priv, mdev); } /* Set multicast rules */ mlx4_en_do_multicast(priv, dev, mdev); out: mutex_unlock(&mdev->state_lock); } static void mlx4_en_watchdog_timeout(void *arg) { struct mlx4_en_priv *priv = arg; struct mlx4_en_dev *mdev = priv->mdev; en_dbg(DRV, priv, "Scheduling watchdog\n"); queue_work(mdev->workqueue, &priv->watchdog_task); if (priv->port_up) callout_reset(&priv->watchdog_timer, MLX4_EN_WATCHDOG_TIMEOUT, mlx4_en_watchdog_timeout, priv); } static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv) { struct mlx4_en_cq *cq; int i; /* If we haven't received a specific coalescing setting * (module param), we set the moderation parameters as follows: * - moder_cnt is set to the number of mtu sized packets to * satisfy our coalescing target. * - moder_time is set to a fixed value. */ priv->rx_frames = MLX4_EN_RX_COAL_TARGET; priv->rx_usecs = MLX4_EN_RX_COAL_TIME; priv->tx_frames = MLX4_EN_TX_COAL_PKTS; priv->tx_usecs = MLX4_EN_TX_COAL_TIME; en_dbg(INTR, priv, "Default coalesing params for mtu: %u - " "rx_frames:%d rx_usecs:%d\n", (unsigned)priv->dev->if_mtu, priv->rx_frames, priv->rx_usecs); /* Setup cq moderation params */ for (i = 0; i < priv->rx_ring_num; i++) { cq = priv->rx_cq[i]; cq->moder_cnt = priv->rx_frames; cq->moder_time = priv->rx_usecs; priv->last_moder_time[i] = MLX4_EN_AUTO_CONF; priv->last_moder_packets[i] = 0; priv->last_moder_bytes[i] = 0; } for (i = 0; i < priv->tx_ring_num; i++) { cq = priv->tx_cq[i]; cq->moder_cnt = priv->tx_frames; cq->moder_time = priv->tx_usecs; } /* Reset auto-moderation params */ priv->pkt_rate_low = MLX4_EN_RX_RATE_LOW; priv->rx_usecs_low = MLX4_EN_RX_COAL_TIME_LOW; priv->pkt_rate_high = MLX4_EN_RX_RATE_HIGH; priv->rx_usecs_high = MLX4_EN_RX_COAL_TIME_HIGH; priv->sample_interval = MLX4_EN_SAMPLE_INTERVAL; priv->adaptive_rx_coal = 1; priv->last_moder_jiffies = 0; priv->last_moder_tx_packets = 0; } static void mlx4_en_auto_moderation(struct mlx4_en_priv *priv) { unsigned long period = (unsigned long) (jiffies - priv->last_moder_jiffies); struct mlx4_en_cq *cq; unsigned long packets; unsigned long rate; unsigned long avg_pkt_size; unsigned long rx_packets; unsigned long rx_bytes; unsigned long rx_pkt_diff; int moder_time; int ring, err; if (!priv->adaptive_rx_coal || period < priv->sample_interval * HZ) return; for (ring = 0; ring < priv->rx_ring_num; ring++) { spin_lock(&priv->stats_lock); rx_packets = priv->rx_ring[ring]->packets; rx_bytes = priv->rx_ring[ring]->bytes; spin_unlock(&priv->stats_lock); rx_pkt_diff = ((unsigned long) (rx_packets - priv->last_moder_packets[ring])); packets = rx_pkt_diff; rate = packets * HZ / period; avg_pkt_size = packets ? ((unsigned long) (rx_bytes - priv->last_moder_bytes[ring])) / packets : 0; /* Apply auto-moderation only when packet rate * exceeds a rate that it matters */ if (rate > (MLX4_EN_RX_RATE_THRESH / priv->rx_ring_num) && avg_pkt_size > MLX4_EN_AVG_PKT_SMALL) { if (rate < priv->pkt_rate_low) moder_time = priv->rx_usecs_low; else if (rate > priv->pkt_rate_high) moder_time = priv->rx_usecs_high; else moder_time = (rate - priv->pkt_rate_low) * (priv->rx_usecs_high - priv->rx_usecs_low) / (priv->pkt_rate_high - priv->pkt_rate_low) + priv->rx_usecs_low; } else { moder_time = priv->rx_usecs_low; } if (moder_time != priv->last_moder_time[ring]) { priv->last_moder_time[ring] = moder_time; cq = priv->rx_cq[ring]; cq->moder_time = moder_time; cq->moder_cnt = priv->rx_frames; err = mlx4_en_set_cq_moder(priv, cq); if (err) en_err(priv, "Failed modifying moderation for cq:%d\n", ring); } priv->last_moder_packets[ring] = rx_packets; priv->last_moder_bytes[ring] = rx_bytes; } priv->last_moder_jiffies = jiffies; } static void mlx4_en_do_get_stats(struct work_struct *work) { struct delayed_work *delay = to_delayed_work(work); struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv, stats_task); struct mlx4_en_dev *mdev = priv->mdev; int err; mutex_lock(&mdev->state_lock); if (mdev->device_up) { if (priv->port_up) { if (mlx4_is_slave(mdev->dev)) err = mlx4_en_get_vport_stats(mdev, priv->port); else err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0); if (err) en_dbg(HW, priv, "Could not update stats\n"); mlx4_en_auto_moderation(priv); } queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY); } mutex_unlock(&mdev->state_lock); } /* mlx4_en_service_task - Run service task for tasks that needed to be done * periodically */ static void mlx4_en_service_task(struct work_struct *work) { struct delayed_work *delay = to_delayed_work(work); struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv, service_task); struct mlx4_en_dev *mdev = priv->mdev; mutex_lock(&mdev->state_lock); if (mdev->device_up) { queue_delayed_work(mdev->workqueue, &priv->service_task, SERVICE_TASK_DELAY); } mutex_unlock(&mdev->state_lock); } static void mlx4_en_linkstate(struct work_struct *work) { struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, linkstate_task); struct mlx4_en_dev *mdev = priv->mdev; int linkstate = priv->link_state; mutex_lock(&mdev->state_lock); /* If observable port state changed set carrier state and * report to system log */ if (priv->last_link_state != linkstate) { if (linkstate == MLX4_DEV_EVENT_PORT_DOWN) { en_info(priv, "Link Down\n"); if_link_state_change(priv->dev, LINK_STATE_DOWN); /* update netif baudrate */ priv->dev->if_baudrate = 0; /* make sure the port is up before notifying the OS. * This is tricky since we get here on INIT_PORT and * in such case we can't tell the OS the port is up. * To solve this there is a call to if_link_state_change * in set_rx_mode. * */ } else if (priv->port_up && (linkstate == MLX4_DEV_EVENT_PORT_UP)){ if (mlx4_en_QUERY_PORT(priv->mdev, priv->port)) en_info(priv, "Query port failed\n"); priv->dev->if_baudrate = IF_Mbps(priv->port_state.link_speed); en_info(priv, "Link Up\n"); if_link_state_change(priv->dev, LINK_STATE_UP); } } priv->last_link_state = linkstate; mutex_unlock(&mdev->state_lock); } -int mlx4_en_start_port(struct net_device *dev) +int mlx4_en_start_port(struct ifnet *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_cq *cq; struct mlx4_en_tx_ring *tx_ring; int rx_index = 0; int tx_index = 0; int err = 0; int i; int j; u8 mc_list[16] = {0}; if (priv->port_up) { en_dbg(DRV, priv, "start port called while port already up\n"); return 0; } INIT_LIST_HEAD(&priv->mc_list); INIT_LIST_HEAD(&priv->uc_list); INIT_LIST_HEAD(&priv->curr_mc_list); INIT_LIST_HEAD(&priv->curr_uc_list); INIT_LIST_HEAD(&priv->ethtool_list); /* Calculate Rx buf size */ dev->if_mtu = min(dev->if_mtu, priv->max_mtu); mlx4_en_calc_rx_buf(dev); en_dbg(DRV, priv, "Rx buf size:%d\n", priv->rx_mb_size); /* Configure rx cq's and rings */ err = mlx4_en_activate_rx_rings(priv); if (err) { en_err(priv, "Failed to activate RX rings\n"); return err; } for (i = 0; i < priv->rx_ring_num; i++) { cq = priv->rx_cq[i]; mlx4_en_cq_init_lock(cq); err = mlx4_en_activate_cq(priv, cq, i); if (err) { en_err(priv, "Failed activating Rx CQ\n"); goto cq_err; } for (j = 0; j < cq->size; j++) cq->buf[j].owner_sr_opcode = MLX4_CQE_OWNER_MASK; err = mlx4_en_set_cq_moder(priv, cq); if (err) { en_err(priv, "Failed setting cq moderation parameters"); mlx4_en_deactivate_cq(priv, cq); goto cq_err; } mlx4_en_arm_cq(priv, cq); priv->rx_ring[i]->cqn = cq->mcq.cqn; ++rx_index; } /* Set qp number */ en_dbg(DRV, priv, "Getting qp number for port %d\n", priv->port); err = mlx4_en_get_qp(priv); if (err) { en_err(priv, "Failed getting eth qp\n"); goto cq_err; } mdev->mac_removed[priv->port] = 0; priv->counter_index = mlx4_get_default_counter_index(mdev->dev, priv->port); err = mlx4_en_config_rss_steer(priv); if (err) { en_err(priv, "Failed configuring rss steering\n"); goto mac_err; } err = mlx4_en_create_drop_qp(priv); if (err) goto rss_err; /* Configure tx cq's and rings */ for (i = 0; i < priv->tx_ring_num; i++) { /* Configure cq */ cq = priv->tx_cq[i]; err = mlx4_en_activate_cq(priv, cq, i); if (err) { en_err(priv, "Failed activating Tx CQ\n"); goto tx_err; } err = mlx4_en_set_cq_moder(priv, cq); if (err) { en_err(priv, "Failed setting cq moderation parameters"); mlx4_en_deactivate_cq(priv, cq); goto tx_err; } en_dbg(DRV, priv, "Resetting index of collapsed CQ:%d to -1\n", i); cq->buf->wqe_index = cpu_to_be16(0xffff); /* Configure ring */ tx_ring = priv->tx_ring[i]; err = mlx4_en_activate_tx_ring(priv, tx_ring, cq->mcq.cqn, i / priv->num_tx_rings_p_up); if (err) { en_err(priv, "Failed activating Tx ring %d\n", i); mlx4_en_deactivate_cq(priv, cq); goto tx_err; } /* Arm CQ for TX completions */ mlx4_en_arm_cq(priv, cq); /* Set initial ownership of all Tx TXBBs to SW (1) */ for (j = 0; j < tx_ring->buf_size; j += STAMP_STRIDE) *((u32 *) (tx_ring->buf + j)) = INIT_OWNER_BIT; ++tx_index; } /* Configure port */ err = mlx4_SET_PORT_general(mdev->dev, priv->port, priv->rx_mb_size, priv->prof->tx_pause, priv->prof->tx_ppp, priv->prof->rx_pause, priv->prof->rx_ppp); if (err) { en_err(priv, "Failed setting port general configurations for port %d, with error %d\n", priv->port, err); goto tx_err; } /* Set default qp number */ err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 0); if (err) { en_err(priv, "Failed setting default qp numbers\n"); goto tx_err; } /* Init port */ en_dbg(HW, priv, "Initializing port\n"); err = mlx4_INIT_PORT(mdev->dev, priv->port); if (err) { en_err(priv, "Failed Initializing port\n"); goto tx_err; } /* Attach rx QP to bradcast address */ memset(&mc_list[10], 0xff, ETH_ALEN); mc_list[5] = priv->port; /* needed for B0 steering support */ if (mlx4_multicast_attach(mdev->dev, &priv->rss_map.indir_qp, mc_list, priv->port, 0, MLX4_PROT_ETH, &priv->broadcast_id)) mlx4_warn(mdev, "Failed Attaching Broadcast\n"); /* Must redo promiscuous mode setup. */ priv->flags &= ~(MLX4_EN_FLAG_PROMISC | MLX4_EN_FLAG_MC_PROMISC); /* Schedule multicast task to populate multicast list */ queue_work(mdev->workqueue, &priv->rx_mode_task); priv->port_up = true; /* Enable the queues. */ dev->if_drv_flags &= ~IFF_DRV_OACTIVE; dev->if_drv_flags |= IFF_DRV_RUNNING; #ifdef CONFIG_DEBUG_FS mlx4_en_create_debug_files(priv); #endif callout_reset(&priv->watchdog_timer, MLX4_EN_WATCHDOG_TIMEOUT, mlx4_en_watchdog_timeout, priv); return 0; tx_err: while (tx_index--) { mlx4_en_deactivate_tx_ring(priv, priv->tx_ring[tx_index]); mlx4_en_deactivate_cq(priv, priv->tx_cq[tx_index]); } mlx4_en_destroy_drop_qp(priv); rss_err: mlx4_en_release_rss_steer(priv); mac_err: mlx4_en_put_qp(priv); cq_err: while (rx_index--) mlx4_en_deactivate_cq(priv, priv->rx_cq[rx_index]); for (i = 0; i < priv->rx_ring_num; i++) mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]); return err; /* need to close devices */ } -void mlx4_en_stop_port(struct net_device *dev) +void mlx4_en_stop_port(struct ifnet *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_addr_list *addr_list, *tmp; int i; u8 mc_list[16] = {0}; if (!priv->port_up) { en_dbg(DRV, priv, "stop port called while port already down\n"); return; } #ifdef CONFIG_DEBUG_FS mlx4_en_delete_debug_files(priv); #endif /* close port*/ mlx4_CLOSE_PORT(mdev->dev, priv->port); /* Set port as not active */ priv->port_up = false; priv->counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev); /* Promsicuous mode */ if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { priv->flags &= ~(MLX4_EN_FLAG_PROMISC | MLX4_EN_FLAG_MC_PROMISC); mlx4_flow_steer_promisc_remove(mdev->dev, priv->port, MLX4_FS_ALL_DEFAULT); mlx4_flow_steer_promisc_remove(mdev->dev, priv->port, MLX4_FS_MC_DEFAULT); } else if (priv->flags & MLX4_EN_FLAG_PROMISC) { priv->flags &= ~MLX4_EN_FLAG_PROMISC; /* Disable promiscouos mode */ mlx4_unicast_promisc_remove(mdev->dev, priv->base_qpn, priv->port); /* Disable Multicast promisc */ if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) { mlx4_multicast_promisc_remove(mdev->dev, priv->base_qpn, priv->port); priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; } } /* Detach All unicasts */ list_for_each_entry(addr_list, &priv->curr_uc_list, list) { mlx4_en_uc_steer_release(priv, addr_list->addr, priv->rss_map.indir_qp.qpn, addr_list->reg_id); } mlx4_en_clear_uclist(dev); list_for_each_entry_safe(addr_list, tmp, &priv->curr_uc_list, list) { list_del(&addr_list->list); kfree(addr_list); } /* Detach All multicasts */ memset(&mc_list[10], 0xff, ETH_ALEN); mc_list[5] = priv->port; /* needed for B0 steering support */ mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, mc_list, MLX4_PROT_ETH, priv->broadcast_id); list_for_each_entry(addr_list, &priv->curr_mc_list, list) { memcpy(&mc_list[10], addr_list->addr, ETH_ALEN); mc_list[5] = priv->port; mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, mc_list, MLX4_PROT_ETH, addr_list->reg_id); } mlx4_en_clear_mclist(dev); list_for_each_entry_safe(addr_list, tmp, &priv->curr_mc_list, list) { list_del(&addr_list->list); kfree(addr_list); } /* Flush multicast filter */ mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 1, MLX4_MCAST_CONFIG); mlx4_en_destroy_drop_qp(priv); /* Free TX Rings */ for (i = 0; i < priv->tx_ring_num; i++) { mlx4_en_deactivate_tx_ring(priv, priv->tx_ring[i]); mlx4_en_deactivate_cq(priv, priv->tx_cq[i]); } msleep(10); for (i = 0; i < priv->tx_ring_num; i++) mlx4_en_free_tx_buf(dev, priv->tx_ring[i]); /* Free RSS qps */ mlx4_en_release_rss_steer(priv); /* Unregister Mac address for the port */ mlx4_en_put_qp(priv); mdev->mac_removed[priv->port] = 1; /* Free RX Rings */ for (i = 0; i < priv->rx_ring_num; i++) { struct mlx4_en_cq *cq = priv->rx_cq[i]; mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]); mlx4_en_deactivate_cq(priv, cq); } callout_stop(&priv->watchdog_timer); dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); } static void mlx4_en_restart(struct work_struct *work) { struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, watchdog_task); struct mlx4_en_dev *mdev = priv->mdev; - struct net_device *dev = priv->dev; + struct ifnet *dev = priv->dev; struct mlx4_en_tx_ring *ring; int i; if (priv->blocked == 0 || priv->port_up == 0) return; for (i = 0; i < priv->tx_ring_num; i++) { int watchdog_time; ring = priv->tx_ring[i]; watchdog_time = READ_ONCE(ring->watchdog_time); if (watchdog_time != 0 && time_after(ticks, ring->watchdog_time)) goto reset; } return; reset: priv->port_stats.tx_timeout++; en_dbg(DRV, priv, "Watchdog task called for port %d\n", priv->port); mutex_lock(&mdev->state_lock); if (priv->port_up) { mlx4_en_stop_port(dev); //for (i = 0; i < priv->tx_ring_num; i++) // netdev_tx_reset_queue(priv->tx_ring[i]->tx_queue); if (mlx4_en_start_port(dev)) en_err(priv, "Failed restarting port %d\n", priv->port); } mutex_unlock(&mdev->state_lock); } -static void mlx4_en_clear_stats(struct net_device *dev) +static void mlx4_en_clear_stats(struct ifnet *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; int i; if (!mlx4_is_slave(mdev->dev)) if (mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 1)) en_dbg(HW, priv, "Failed dumping statistics\n"); memset(&priv->pstats, 0, sizeof(priv->pstats)); memset(&priv->pkstats, 0, sizeof(priv->pkstats)); memset(&priv->port_stats, 0, sizeof(priv->port_stats)); memset(&priv->vport_stats, 0, sizeof(priv->vport_stats)); for (i = 0; i < priv->tx_ring_num; i++) { priv->tx_ring[i]->bytes = 0; priv->tx_ring[i]->packets = 0; priv->tx_ring[i]->tx_csum = 0; priv->tx_ring[i]->oversized_packets = 0; } for (i = 0; i < priv->rx_ring_num; i++) { priv->rx_ring[i]->bytes = 0; priv->rx_ring[i]->packets = 0; priv->rx_ring[i]->csum_ok = 0; priv->rx_ring[i]->csum_none = 0; } } static void mlx4_en_open(void* arg) { struct mlx4_en_priv *priv; struct mlx4_en_dev *mdev; - struct net_device *dev; + struct ifnet *dev; int err = 0; priv = arg; mdev = priv->mdev; dev = priv->dev; mutex_lock(&mdev->state_lock); if (!mdev->device_up) { en_err(priv, "Cannot open - device down/disabled\n"); goto out; } /* Reset HW statistics and SW counters */ mlx4_en_clear_stats(dev); err = mlx4_en_start_port(dev); if (err) en_err(priv, "Failed starting port:%d\n", priv->port); out: mutex_unlock(&mdev->state_lock); return; } void mlx4_en_free_resources(struct mlx4_en_priv *priv) { int i; #ifdef CONFIG_RFS_ACCEL if (priv->dev->rx_cpu_rmap) { free_irq_cpu_rmap(priv->dev->rx_cpu_rmap); priv->dev->rx_cpu_rmap = NULL; } #endif for (i = 0; i < priv->tx_ring_num; i++) { if (priv->tx_ring && priv->tx_ring[i]) mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]); if (priv->tx_cq && priv->tx_cq[i]) mlx4_en_destroy_cq(priv, &priv->tx_cq[i]); } for (i = 0; i < priv->rx_ring_num; i++) { if (priv->rx_ring[i]) mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i], priv->prof->rx_ring_size); if (priv->rx_cq[i]) mlx4_en_destroy_cq(priv, &priv->rx_cq[i]); } if (priv->stat_sysctl != NULL) sysctl_ctx_free(&priv->stat_ctx); } int mlx4_en_alloc_resources(struct mlx4_en_priv *priv) { struct mlx4_en_port_profile *prof = priv->prof; int i; int node = 0; /* Create rx Rings */ for (i = 0; i < priv->rx_ring_num; i++) { if (mlx4_en_create_cq(priv, &priv->rx_cq[i], prof->rx_ring_size, i, RX, node)) goto err; if (mlx4_en_create_rx_ring(priv, &priv->rx_ring[i], prof->rx_ring_size, node)) goto err; } /* Create tx Rings */ for (i = 0; i < priv->tx_ring_num; i++) { if (mlx4_en_create_cq(priv, &priv->tx_cq[i], prof->tx_ring_size, i, TX, node)) goto err; if (mlx4_en_create_tx_ring(priv, &priv->tx_ring[i], prof->tx_ring_size, TXBB_SIZE, node, i)) goto err; } #ifdef CONFIG_RFS_ACCEL priv->dev->rx_cpu_rmap = alloc_irq_cpu_rmap(priv->rx_ring_num); if (!priv->dev->rx_cpu_rmap) goto err; #endif /* Re-create stat sysctls in case the number of rings changed. */ mlx4_en_sysctl_stat(priv); return 0; err: en_err(priv, "Failed to allocate NIC resources\n"); for (i = 0; i < priv->rx_ring_num; i++) { if (priv->rx_ring[i]) mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i], prof->rx_ring_size); if (priv->rx_cq[i]) mlx4_en_destroy_cq(priv, &priv->rx_cq[i]); } for (i = 0; i < priv->tx_ring_num; i++) { if (priv->tx_ring[i]) mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]); if (priv->tx_cq[i]) mlx4_en_destroy_cq(priv, &priv->tx_cq[i]); } priv->port_up = false; return -ENOMEM; } struct en_port_attribute { struct attribute attr; ssize_t (*show)(struct en_port *, struct en_port_attribute *, char *buf); ssize_t (*store)(struct en_port *, struct en_port_attribute *, char *buf, size_t count); }; #define PORT_ATTR_RO(_name) \ struct en_port_attribute en_port_attr_##_name = __ATTR_RO(_name) #define EN_PORT_ATTR(_name, _mode, _show, _store) \ struct en_port_attribute en_port_attr_##_name = __ATTR(_name, _mode, _show, _store) -void mlx4_en_destroy_netdev(struct net_device *dev) +void mlx4_en_destroy_netdev(struct ifnet *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port); /* don't allow more IOCTLs */ priv->gone = 1; /* XXX wait a bit to allow IOCTL handlers to complete */ pause("W", hz); if (priv->vlan_attach != NULL) EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach); if (priv->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach); mutex_lock(&mdev->state_lock); mlx4_en_stop_port(dev); mutex_unlock(&mdev->state_lock); /* Unregister device - this will close the port if it was up */ if (priv->registered) ether_ifdetach(dev); if (priv->allocated) mlx4_free_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE); cancel_delayed_work(&priv->stats_task); cancel_delayed_work(&priv->service_task); /* flush any pending task for this netdev */ flush_workqueue(mdev->workqueue); callout_drain(&priv->watchdog_timer); /* Detach the netdev so tasks would not attempt to access it */ mutex_lock(&mdev->state_lock); mdev->pndev[priv->port] = NULL; mutex_unlock(&mdev->state_lock); mlx4_en_free_resources(priv); /* freeing the sysctl conf cannot be called from within mlx4_en_free_resources */ if (priv->conf_sysctl != NULL) sysctl_ctx_free(&priv->conf_ctx); kfree(priv->tx_ring); kfree(priv->tx_cq); kfree(priv); if_free(dev); } -static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) +static int mlx4_en_change_mtu(struct ifnet *dev, int new_mtu) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; int err = 0; en_dbg(DRV, priv, "Change MTU called - current:%u new:%u\n", (unsigned)dev->if_mtu, (unsigned)new_mtu); if ((new_mtu < MLX4_EN_MIN_MTU) || (new_mtu > priv->max_mtu)) { en_err(priv, "Bad MTU size:%d, max %u.\n", new_mtu, priv->max_mtu); return -EPERM; } mutex_lock(&mdev->state_lock); dev->if_mtu = new_mtu; if (dev->if_drv_flags & IFF_DRV_RUNNING) { if (!mdev->device_up) { /* NIC is probably restarting - let watchdog task reset * * the port */ en_dbg(DRV, priv, "Change MTU called with card down!?\n"); } else { mlx4_en_stop_port(dev); err = mlx4_en_start_port(dev); if (err) { en_err(priv, "Failed restarting port:%d\n", priv->port); queue_work(mdev->workqueue, &priv->watchdog_task); } } } mutex_unlock(&mdev->state_lock); return 0; } static int mlx4_en_calc_media(struct mlx4_en_priv *priv) { int trans_type; int active; active = IFM_ETHER; if (priv->last_link_state == MLX4_DEV_EVENT_PORT_DOWN) return (active); active |= IFM_FDX; trans_type = priv->port_state.transceiver; /* XXX I don't know all of the transceiver values. */ switch (priv->port_state.link_speed) { case 100: active |= IFM_100_T; break; case 1000: active |= IFM_1000_T; break; case 10000: if (trans_type > 0 && trans_type <= 0xC) active |= IFM_10G_SR; else if (trans_type == 0x80 || trans_type == 0) active |= IFM_10G_CX4; break; case 40000: active |= IFM_40G_CR4; break; } if (priv->prof->tx_pause) active |= IFM_ETH_TXPAUSE; if (priv->prof->rx_pause) active |= IFM_ETH_RXPAUSE; return (active); } static void mlx4_en_media_status(struct ifnet *dev, struct ifmediareq *ifmr) { struct mlx4_en_priv *priv; priv = dev->if_softc; ifmr->ifm_status = IFM_AVALID; if (priv->last_link_state != MLX4_DEV_EVENT_PORT_DOWN) ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active = mlx4_en_calc_media(priv); return; } static int mlx4_en_media_change(struct ifnet *dev) { struct mlx4_en_priv *priv; struct ifmedia *ifm; int rxpause; int txpause; int error; priv = dev->if_softc; ifm = &priv->media; rxpause = txpause = 0; error = 0; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); switch (IFM_SUBTYPE(ifm->ifm_media)) { case IFM_AUTO: break; case IFM_10G_SR: case IFM_10G_CX4: case IFM_1000_T: case IFM_40G_CR4: if ((IFM_SUBTYPE(ifm->ifm_media) == IFM_SUBTYPE(mlx4_en_calc_media(priv))) && (ifm->ifm_media & IFM_FDX)) break; /* Fallthrough */ default: printf("%s: Only auto media type\n", if_name(dev)); return (EINVAL); } /* Allow user to set/clear pause */ if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_RXPAUSE) rxpause = 1; if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_TXPAUSE) txpause = 1; if (priv->prof->tx_pause != txpause || priv->prof->rx_pause != rxpause) { priv->prof->tx_pause = txpause; priv->prof->rx_pause = rxpause; error = -mlx4_SET_PORT_general(priv->mdev->dev, priv->port, priv->rx_mb_size + ETHER_CRC_LEN, priv->prof->tx_pause, priv->prof->tx_ppp, priv->prof->rx_pause, priv->prof->rx_ppp); } return (error); } static int mlx4_en_ioctl(struct ifnet *dev, u_long command, caddr_t data) { struct mlx4_en_priv *priv; struct mlx4_en_dev *mdev; struct ifreq *ifr; int error; int mask; struct ifrsskey *ifrk; const u32 *key; struct ifrsshash *ifrh; u8 rss_mask; error = 0; mask = 0; priv = dev->if_softc; /* check if detaching */ if (priv == NULL || priv->gone != 0) return (ENXIO); mdev = priv->mdev; ifr = (struct ifreq *) data; switch (command) { case SIOCSIFMTU: error = -mlx4_en_change_mtu(dev, ifr->ifr_mtu); break; case SIOCSIFFLAGS: if (dev->if_flags & IFF_UP) { if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) { mutex_lock(&mdev->state_lock); mlx4_en_start_port(dev); mutex_unlock(&mdev->state_lock); } else { mlx4_en_set_rx_mode(dev); } } else { mutex_lock(&mdev->state_lock); if (dev->if_drv_flags & IFF_DRV_RUNNING) { mlx4_en_stop_port(dev); if_link_state_change(dev, LINK_STATE_DOWN); } mutex_unlock(&mdev->state_lock); } break; case SIOCADDMULTI: case SIOCDELMULTI: mlx4_en_set_rx_mode(dev); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(dev, ifr, &priv->media, command); break; case SIOCSIFCAP: mutex_lock(&mdev->state_lock); mask = ifr->ifr_reqcap ^ dev->if_capenable; if (mask & IFCAP_TXCSUM) { dev->if_capenable ^= IFCAP_TXCSUM; dev->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); if (IFCAP_TSO4 & dev->if_capenable && !(IFCAP_TXCSUM & dev->if_capenable)) { mask &= ~IFCAP_TSO4; dev->if_capenable &= ~IFCAP_TSO4; dev->if_hwassist &= ~CSUM_IP_TSO; if_printf(dev, "tso4 disabled due to -txcsum.\n"); } } if (mask & IFCAP_TXCSUM_IPV6) { dev->if_capenable ^= IFCAP_TXCSUM_IPV6; dev->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); if (IFCAP_TSO6 & dev->if_capenable && !(IFCAP_TXCSUM_IPV6 & dev->if_capenable)) { mask &= ~IFCAP_TSO6; dev->if_capenable &= ~IFCAP_TSO6; dev->if_hwassist &= ~CSUM_IP6_TSO; if_printf(dev, "tso6 disabled due to -txcsum6.\n"); } } if (mask & IFCAP_RXCSUM) dev->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_RXCSUM_IPV6) dev->if_capenable ^= IFCAP_RXCSUM_IPV6; if (mask & IFCAP_TSO4) { if (!(IFCAP_TSO4 & dev->if_capenable) && !(IFCAP_TXCSUM & dev->if_capenable)) { if_printf(dev, "enable txcsum first.\n"); error = EAGAIN; goto out; } dev->if_capenable ^= IFCAP_TSO4; dev->if_hwassist ^= CSUM_IP_TSO; } if (mask & IFCAP_TSO6) { if (!(IFCAP_TSO6 & dev->if_capenable) && !(IFCAP_TXCSUM_IPV6 & dev->if_capenable)) { if_printf(dev, "enable txcsum6 first.\n"); error = EAGAIN; goto out; } dev->if_capenable ^= IFCAP_TSO6; dev->if_hwassist ^= CSUM_IP6_TSO; } if (mask & IFCAP_LRO) dev->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_VLAN_HWTAGGING) dev->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_VLAN_HWFILTER) dev->if_capenable ^= IFCAP_VLAN_HWFILTER; if (mask & IFCAP_WOL_MAGIC) dev->if_capenable ^= IFCAP_WOL_MAGIC; if (dev->if_drv_flags & IFF_DRV_RUNNING) mlx4_en_start_port(dev); out: mutex_unlock(&mdev->state_lock); VLAN_CAPABILITIES(dev); break; #if __FreeBSD_version >= 1100036 case SIOCGI2C: { struct ifi2creq i2c; error = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c)); if (error) break; if (i2c.len > sizeof(i2c.data)) { error = EINVAL; break; } /* * Note that we ignore i2c.addr here. The driver hardcodes * the address to 0x50, while standard expects it to be 0xA0. */ error = mlx4_get_module_info(mdev->dev, priv->port, i2c.offset, i2c.len, i2c.data); if (error < 0) { error = -error; break; } error = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c)); break; } #endif case SIOCGIFRSSKEY: ifrk = (struct ifrsskey *)data; ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; mutex_lock(&mdev->state_lock); key = mlx4_en_get_rss_key(priv, &ifrk->ifrk_keylen); if (ifrk->ifrk_keylen > RSS_KEYLEN) error = EINVAL; else memcpy(ifrk->ifrk_key, key, ifrk->ifrk_keylen); mutex_unlock(&mdev->state_lock); break; case SIOCGIFRSSHASH: mutex_lock(&mdev->state_lock); rss_mask = mlx4_en_get_rss_mask(priv); mutex_unlock(&mdev->state_lock); ifrh = (struct ifrsshash *)data; ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; ifrh->ifrh_types = 0; if (rss_mask & MLX4_RSS_IPV4) ifrh->ifrh_types |= RSS_TYPE_IPV4; if (rss_mask & MLX4_RSS_TCP_IPV4) ifrh->ifrh_types |= RSS_TYPE_TCP_IPV4; if (rss_mask & MLX4_RSS_IPV6) ifrh->ifrh_types |= RSS_TYPE_IPV6; if (rss_mask & MLX4_RSS_TCP_IPV6) ifrh->ifrh_types |= RSS_TYPE_TCP_IPV6; if (rss_mask & MLX4_RSS_UDP_IPV4) ifrh->ifrh_types |= RSS_TYPE_UDP_IPV4; if (rss_mask & MLX4_RSS_UDP_IPV6) ifrh->ifrh_types |= RSS_TYPE_UDP_IPV6; break; default: error = ether_ioctl(dev, command, data); break; } return (error); } int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, struct mlx4_en_port_profile *prof) { - struct net_device *dev; + struct ifnet *dev; struct mlx4_en_priv *priv; uint8_t dev_addr[ETHER_ADDR_LEN]; int err; int i; priv = kzalloc(sizeof(*priv), GFP_KERNEL); dev = priv->dev = if_alloc(IFT_ETHER); if (dev == NULL) { en_err(priv, "Net device allocation failed\n"); kfree(priv); return -ENOMEM; } dev->if_softc = priv; if_initname(dev, "mlxen", (device_get_unit( mdev->pdev->dev.bsddev) * MLX4_MAX_PORTS) + port - 1); dev->if_mtu = ETHERMTU; dev->if_init = mlx4_en_open; dev->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; dev->if_ioctl = mlx4_en_ioctl; dev->if_transmit = mlx4_en_transmit; dev->if_qflush = mlx4_en_qflush; dev->if_snd.ifq_maxlen = prof->tx_ring_size; /* * Initialize driver private data */ priv->counter_index = 0xff; spin_lock_init(&priv->stats_lock); INIT_WORK(&priv->rx_mode_task, mlx4_en_do_set_rx_mode); INIT_WORK(&priv->watchdog_task, mlx4_en_restart); INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate); INIT_DELAYED_WORK(&priv->stats_task, mlx4_en_do_get_stats); INIT_DELAYED_WORK(&priv->service_task, mlx4_en_service_task); callout_init(&priv->watchdog_timer, 1); #ifdef CONFIG_RFS_ACCEL INIT_LIST_HEAD(&priv->filters); spin_lock_init(&priv->filters_lock); #endif priv->msg_enable = MLX4_EN_MSG_LEVEL; priv->dev = dev; priv->mdev = mdev; priv->ddev = &mdev->pdev->dev; priv->prof = prof; priv->port = port; priv->port_up = false; priv->flags = prof->flags; priv->num_tx_rings_p_up = mdev->profile.num_tx_rings_p_up; priv->tx_ring_num = prof->tx_ring_num; priv->tx_ring = kcalloc(MAX_TX_RINGS, sizeof(struct mlx4_en_tx_ring *), GFP_KERNEL); if (!priv->tx_ring) { err = -ENOMEM; goto out; } priv->tx_cq = kcalloc(sizeof(struct mlx4_en_cq *), MAX_TX_RINGS, GFP_KERNEL); if (!priv->tx_cq) { err = -ENOMEM; goto out; } priv->rx_ring_num = prof->rx_ring_num; priv->cqe_factor = (mdev->dev->caps.cqe_size == 64) ? 1 : 0; priv->mac_index = -1; priv->last_ifq_jiffies = 0; priv->if_counters_rx_errors = 0; priv->if_counters_rx_no_buffer = 0; #ifdef CONFIG_MLX4_EN_DCB if (!mlx4_is_slave(priv->mdev->dev)) { priv->dcbx_cap = DCB_CAP_DCBX_HOST; priv->flags |= MLX4_EN_FLAG_DCB_ENABLED; if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ETS_CFG) { dev->dcbnl_ops = &mlx4_en_dcbnl_ops; } else { en_info(priv, "QoS disabled - no HW support\n"); dev->dcbnl_ops = &mlx4_en_dcbnl_pfc_ops; } } #endif /* Query for default mac and max mtu */ priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port]; priv->mac = mdev->dev->caps.def_mac[priv->port]; if (ILLEGAL_MAC(priv->mac)) { #if BITS_PER_LONG == 64 en_err(priv, "Port: %d, invalid mac burned: 0x%lx, quiting\n", priv->port, priv->mac); #elif BITS_PER_LONG == 32 en_err(priv, "Port: %d, invalid mac burned: 0x%llx, quiting\n", priv->port, priv->mac); #endif err = -EINVAL; goto out; } mlx4_en_sysctl_conf(priv); err = mlx4_en_alloc_resources(priv); if (err) goto out; /* Allocate page for receive rings */ err = mlx4_alloc_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE, MLX4_EN_PAGE_SIZE); if (err) { en_err(priv, "Failed to allocate page for rx qps\n"); goto out; } priv->allocated = 1; /* * Set driver features */ dev->if_capabilities |= IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6; dev->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING; dev->if_capabilities |= IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER; dev->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU; dev->if_capabilities |= IFCAP_LRO; dev->if_capabilities |= IFCAP_HWSTATS; if (mdev->LSO_support) dev->if_capabilities |= IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_VLAN_HWTSO; #if __FreeBSD_version >= 1100000 /* set TSO limits so that we don't have to drop TX packets */ dev->if_hw_tsomax = MLX4_EN_TX_MAX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN) /* hdr */; dev->if_hw_tsomaxsegcount = MLX4_EN_TX_MAX_MBUF_FRAGS - 1 /* hdr */; dev->if_hw_tsomaxsegsize = MLX4_EN_TX_MAX_MBUF_SIZE; #endif dev->if_capenable = dev->if_capabilities; dev->if_hwassist = 0; if (dev->if_capenable & (IFCAP_TSO4 | IFCAP_TSO6)) dev->if_hwassist |= CSUM_TSO; if (dev->if_capenable & IFCAP_TXCSUM) dev->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP); if (dev->if_capenable & IFCAP_TXCSUM_IPV6) dev->if_hwassist |= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); /* Register for VLAN events */ priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, mlx4_en_vlan_rx_add_vid, priv, EVENTHANDLER_PRI_FIRST); priv->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, mlx4_en_vlan_rx_kill_vid, priv, EVENTHANDLER_PRI_FIRST); mdev->pndev[priv->port] = dev; priv->last_link_state = MLX4_DEV_EVENT_PORT_DOWN; mlx4_en_set_default_moderation(priv); /* Set default MAC */ for (i = 0; i < ETHER_ADDR_LEN; i++) dev_addr[ETHER_ADDR_LEN - 1 - i] = (u8) (priv->mac >> (8 * i)); ether_ifattach(dev, dev_addr); if_link_state_change(dev, LINK_STATE_DOWN); ifmedia_init(&priv->media, IFM_IMASK | IFM_ETH_FMASK, mlx4_en_media_change, mlx4_en_media_status); ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_1000_T, 0, NULL); ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_10G_SR, 0, NULL); ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_10G_CX4, 0, NULL); ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_40G_CR4, 0, NULL); ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO); DEBUGNET_SET(dev, mlx4_en); en_warn(priv, "Using %d TX rings\n", prof->tx_ring_num); en_warn(priv, "Using %d RX rings\n", prof->rx_ring_num); priv->registered = 1; en_warn(priv, "Using %d TX rings\n", prof->tx_ring_num); en_warn(priv, "Using %d RX rings\n", prof->rx_ring_num); priv->rx_mb_size = dev->if_mtu + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN; err = mlx4_SET_PORT_general(mdev->dev, priv->port, priv->rx_mb_size, prof->tx_pause, prof->tx_ppp, prof->rx_pause, prof->rx_ppp); if (err) { en_err(priv, "Failed setting port general configurations " "for port %d, with error %d\n", priv->port, err); goto out; } /* Init port */ en_warn(priv, "Initializing port\n"); err = mlx4_INIT_PORT(mdev->dev, priv->port); if (err) { en_err(priv, "Failed Initializing port\n"); goto out; } queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY); if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) queue_delayed_work(mdev->workqueue, &priv->service_task, SERVICE_TASK_DELAY); return 0; out: mlx4_en_destroy_netdev(dev); return err; } -static int mlx4_en_set_ring_size(struct net_device *dev, +static int mlx4_en_set_ring_size(struct ifnet *dev, int rx_size, int tx_size) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; int port_up = 0; int err = 0; rx_size = roundup_pow_of_two(rx_size); rx_size = max_t(u32, rx_size, MLX4_EN_MIN_RX_SIZE); rx_size = min_t(u32, rx_size, MLX4_EN_MAX_RX_SIZE); tx_size = roundup_pow_of_two(tx_size); tx_size = max_t(u32, tx_size, MLX4_EN_MIN_TX_SIZE); tx_size = min_t(u32, tx_size, MLX4_EN_MAX_TX_SIZE); if (rx_size == (priv->port_up ? priv->rx_ring[0]->actual_size : priv->rx_ring[0]->size) && tx_size == priv->tx_ring[0]->size) return 0; mutex_lock(&mdev->state_lock); if (priv->port_up) { port_up = 1; mlx4_en_stop_port(dev); } mlx4_en_free_resources(priv); priv->prof->tx_ring_size = tx_size; priv->prof->rx_ring_size = rx_size; err = mlx4_en_alloc_resources(priv); if (err) { en_err(priv, "Failed reallocating port resources\n"); goto out; } if (port_up) { err = mlx4_en_start_port(dev); if (err) en_err(priv, "Failed starting port\n"); } out: mutex_unlock(&mdev->state_lock); return err; } static int mlx4_en_set_rx_ring_size(SYSCTL_HANDLER_ARGS) { struct mlx4_en_priv *priv; int size; int error; priv = arg1; size = priv->prof->rx_ring_size; error = sysctl_handle_int(oidp, &size, 0, req); if (error || !req->newptr) return (error); error = -mlx4_en_set_ring_size(priv->dev, size, priv->prof->tx_ring_size); return (error); } static int mlx4_en_set_tx_ring_size(SYSCTL_HANDLER_ARGS) { struct mlx4_en_priv *priv; int size; int error; priv = arg1; size = priv->prof->tx_ring_size; error = sysctl_handle_int(oidp, &size, 0, req); if (error || !req->newptr) return (error); error = -mlx4_en_set_ring_size(priv->dev, priv->prof->rx_ring_size, size); return (error); } -static int mlx4_en_get_module_info(struct net_device *dev, +static int mlx4_en_get_module_info(struct ifnet *dev, struct ethtool_modinfo *modinfo) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; int ret; u8 data[4]; /* Read first 2 bytes to get Module & REV ID */ ret = mlx4_get_module_info(mdev->dev, priv->port, 0/*offset*/, 2/*size*/, data); if (ret < 2) { en_err(priv, "Failed to read eeprom module first two bytes, error: 0x%x\n", -ret); return -EIO; } switch (data[0] /* identifier */) { case MLX4_MODULE_ID_QSFP: modinfo->type = ETH_MODULE_SFF_8436; modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; break; case MLX4_MODULE_ID_QSFP_PLUS: if (data[1] >= 0x3) { /* revision id */ modinfo->type = ETH_MODULE_SFF_8636; modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN; } else { modinfo->type = ETH_MODULE_SFF_8436; modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; } break; case MLX4_MODULE_ID_QSFP28: modinfo->type = ETH_MODULE_SFF_8636; modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN; break; case MLX4_MODULE_ID_SFP: modinfo->type = ETH_MODULE_SFF_8472; modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; break; default: en_err(priv, "mlx4_en_get_module_info : Not recognized cable type\n"); return -EINVAL; } return 0; } -static int mlx4_en_get_module_eeprom(struct net_device *dev, +static int mlx4_en_get_module_eeprom(struct ifnet *dev, struct ethtool_eeprom *ee, u8 *data) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; int offset = ee->offset; int i = 0, ret; if (ee->len == 0) return -EINVAL; memset(data, 0, ee->len); while (i < ee->len) { en_dbg(DRV, priv, "mlx4_get_module_info i(%d) offset(%d) len(%d)\n", i, offset, ee->len - i); ret = mlx4_get_module_info(mdev->dev, priv->port, offset, ee->len - i, data + i); if (!ret) /* Done reading */ return 0; if (ret < 0) { en_err(priv, "mlx4_get_module_info i(%d) offset(%d) bytes_to_read(%d) - FAILED (0x%x)\n", i, offset, ee->len - i, ret); return -1; } i += ret; offset += ret; } return 0; } static void mlx4_en_print_eeprom(u8 *data, __u32 len) { int i; int j = 0; int row = 0; const int NUM_OF_BYTES = 16; printf("\nOffset\t\tValues\n"); printf("------\t\t------\n"); while(row < len){ printf("0x%04x\t\t",row); for(i=0; i < NUM_OF_BYTES; i++){ printf("%02x ", data[j]); row++; j++; } printf("\n"); } } /* Read cable EEPROM module information by first inspecting the first * two bytes to get the length and then read the rest of the information. * The information is printed to dmesg. */ static int mlx4_en_read_eeprom(SYSCTL_HANDLER_ARGS) { u8* data; int error; int result = 0; struct mlx4_en_priv *priv; - struct net_device *dev; + struct ifnet *dev; struct ethtool_modinfo modinfo; struct ethtool_eeprom ee; error = sysctl_handle_int(oidp, &result, 0, req); if (error || !req->newptr) return (error); if (result == 1) { priv = arg1; dev = priv->dev; data = kmalloc(PAGE_SIZE, GFP_KERNEL); error = mlx4_en_get_module_info(dev, &modinfo); if (error) { en_err(priv, "mlx4_en_get_module_info returned with error - FAILED (0x%x)\n", -error); goto out; } ee.len = modinfo.eeprom_len; ee.offset = 0; error = mlx4_en_get_module_eeprom(dev, &ee, data); if (error) { en_err(priv, "mlx4_en_get_module_eeprom returned with error - FAILED (0x%x)\n", -error); /* Continue printing partial information in case of an error */ } /* EEPROM information will be printed in dmesg */ mlx4_en_print_eeprom(data, ee.len); out: kfree(data); } /* Return zero to prevent sysctl failure. */ return (0); } static int mlx4_en_set_tx_ppp(SYSCTL_HANDLER_ARGS) { struct mlx4_en_priv *priv; int ppp; int error; priv = arg1; ppp = priv->prof->tx_ppp; error = sysctl_handle_int(oidp, &ppp, 0, req); if (error || !req->newptr) return (error); if (ppp > 0xff || ppp < 0) return (-EINVAL); priv->prof->tx_ppp = ppp; error = -mlx4_SET_PORT_general(priv->mdev->dev, priv->port, priv->rx_mb_size + ETHER_CRC_LEN, priv->prof->tx_pause, priv->prof->tx_ppp, priv->prof->rx_pause, priv->prof->rx_ppp); return (error); } static int mlx4_en_set_rx_ppp(SYSCTL_HANDLER_ARGS) { struct mlx4_en_priv *priv; struct mlx4_en_dev *mdev; int ppp; int error; int port_up; port_up = 0; priv = arg1; mdev = priv->mdev; ppp = priv->prof->rx_ppp; error = sysctl_handle_int(oidp, &ppp, 0, req); if (error || !req->newptr) return (error); if (ppp > 0xff || ppp < 0) return (-EINVAL); /* See if we have to change the number of tx queues. */ if (!ppp != !priv->prof->rx_ppp) { mutex_lock(&mdev->state_lock); if (priv->port_up) { port_up = 1; mlx4_en_stop_port(priv->dev); } mlx4_en_free_resources(priv); priv->prof->rx_ppp = ppp; error = -mlx4_en_alloc_resources(priv); if (error) en_err(priv, "Failed reallocating port resources\n"); if (error == 0 && port_up) { error = -mlx4_en_start_port(priv->dev); if (error) en_err(priv, "Failed starting port\n"); } mutex_unlock(&mdev->state_lock); return (error); } priv->prof->rx_ppp = ppp; error = -mlx4_SET_PORT_general(priv->mdev->dev, priv->port, priv->rx_mb_size + ETHER_CRC_LEN, priv->prof->tx_pause, priv->prof->tx_ppp, priv->prof->rx_pause, priv->prof->rx_ppp); return (error); } static void mlx4_en_sysctl_conf(struct mlx4_en_priv *priv) { - struct net_device *dev; + struct ifnet *dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *node; struct sysctl_oid_list *node_list; struct sysctl_oid *coal; struct sysctl_oid_list *coal_list; const char *pnameunit; dev = priv->dev; ctx = &priv->conf_ctx; pnameunit = device_get_nameunit(priv->mdev->pdev->dev.bsddev); sysctl_ctx_init(ctx); priv->conf_sysctl = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw), OID_AUTO, dev->if_xname, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "mlx4 10gig ethernet"); node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(priv->conf_sysctl), OID_AUTO, "conf", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Configuration"); node_list = SYSCTL_CHILDREN(node); SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "msg_enable", CTLFLAG_RW, &priv->msg_enable, 0, "Driver message enable bitfield"); SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "rx_rings", CTLFLAG_RD, &priv->rx_ring_num, 0, "Number of receive rings"); SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_rings", CTLFLAG_RD, &priv->tx_ring_num, 0, "Number of transmit rings"); SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "rx_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, mlx4_en_set_rx_ring_size, "I", "Receive ring size"); SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "tx_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, mlx4_en_set_tx_ring_size, "I", "Transmit ring size"); SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "tx_ppp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, mlx4_en_set_tx_ppp, "I", "TX Per-priority pause"); SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "rx_ppp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, mlx4_en_set_rx_ppp, "I", "RX Per-priority pause"); SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "port_num", CTLFLAG_RD, &priv->port, 0, "Port Number"); SYSCTL_ADD_STRING(ctx, node_list, OID_AUTO, "device_name", CTLFLAG_RD, __DECONST(void *, pnameunit), 0, "PCI device name"); /* Add coalescer configuration. */ coal = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, "coalesce", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Interrupt coalesce configuration"); coal_list = SYSCTL_CHILDREN(coal); SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "pkt_rate_low", CTLFLAG_RW, &priv->pkt_rate_low, 0, "Packets per-second for minimum delay"); SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "rx_usecs_low", CTLFLAG_RW, &priv->rx_usecs_low, 0, "Minimum RX delay in micro-seconds"); SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "pkt_rate_high", CTLFLAG_RW, &priv->pkt_rate_high, 0, "Packets per-second for maximum delay"); SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "rx_usecs_high", CTLFLAG_RW, &priv->rx_usecs_high, 0, "Maximum RX delay in micro-seconds"); SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "sample_interval", CTLFLAG_RW, &priv->sample_interval, 0, "adaptive frequency in units of HZ ticks"); SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "adaptive_rx_coal", CTLFLAG_RW, &priv->adaptive_rx_coal, 0, "Enable adaptive rx coalescing"); /* EEPROM support */ SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "eeprom_info", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, mlx4_en_read_eeprom, "I", "EEPROM information"); } static void mlx4_en_sysctl_stat(struct mlx4_en_priv *priv) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *node_list; struct sysctl_oid *ring_node; struct sysctl_oid_list *ring_list; struct mlx4_en_tx_ring *tx_ring; struct mlx4_en_rx_ring *rx_ring; char namebuf[128]; int i; ctx = &priv->stat_ctx; sysctl_ctx_init(ctx); priv->stat_sysctl = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(priv->conf_sysctl), OID_AUTO, "stat", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Statistics"); node_list = SYSCTL_CHILDREN(priv->stat_sysctl); #ifdef MLX4_EN_PERF_STAT SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_poll", CTLFLAG_RD, &priv->pstats.tx_poll, "TX Poll calls"); SYSCTL_ADD_QUAD(ctx, node_list, OID_AUTO, "tx_pktsz_avg", CTLFLAG_RD, &priv->pstats.tx_pktsz_avg, "TX average packet size"); SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "inflight_avg", CTLFLAG_RD, &priv->pstats.inflight_avg, "TX average packets in-flight"); SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_coal_avg", CTLFLAG_RD, &priv->pstats.tx_coal_avg, "TX average coalesced completions"); SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "rx_coal_avg", CTLFLAG_RD, &priv->pstats.rx_coal_avg, "RX average coalesced completions"); #endif SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tso_packets", CTLFLAG_RD, &priv->port_stats.tso_packets, 0, "TSO packets sent"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "queue_stopped", CTLFLAG_RD, &priv->port_stats.queue_stopped, 0, "Queue full"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "wake_queue", CTLFLAG_RD, &priv->port_stats.wake_queue, 0, "Queue resumed after full"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_timeout", CTLFLAG_RD, &priv->port_stats.tx_timeout, 0, "Transmit timeouts"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_oversized_packets", CTLFLAG_RD, &priv->port_stats.oversized_packets, 0, "TX oversized packets, m_defrag failed"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_alloc_failed", CTLFLAG_RD, &priv->port_stats.rx_alloc_failed, 0, "RX failed to allocate mbuf"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_chksum_good", CTLFLAG_RD, &priv->port_stats.rx_chksum_good, 0, "RX checksum offload success"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_chksum_none", CTLFLAG_RD, &priv->port_stats.rx_chksum_none, 0, "RX without checksum offload"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_chksum_offload", CTLFLAG_RD, &priv->port_stats.tx_chksum_offload, 0, "TX checksum offloads"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "defrag_attempts", CTLFLAG_RD, &priv->port_stats.defrag_attempts, 0, "Oversized chains defragged"); /* Could strdup the names and add in a loop. This is simpler. */ SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_bytes", CTLFLAG_RD, &priv->pkstats.rx_bytes, 0, "RX Bytes"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_packets", CTLFLAG_RD, &priv->pkstats.rx_packets, 0, "RX packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_multicast_packets", CTLFLAG_RD, &priv->pkstats.rx_multicast_packets, 0, "RX Multicast Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_broadcast_packets", CTLFLAG_RD, &priv->pkstats.rx_broadcast_packets, 0, "RX Broadcast Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_errors", CTLFLAG_RD, &priv->pkstats.rx_errors, 0, "RX Errors"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_dropped", CTLFLAG_RD, &priv->pkstats.rx_dropped, 0, "RX Dropped"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_length_errors", CTLFLAG_RD, &priv->pkstats.rx_length_errors, 0, "RX Length Errors"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_over_errors", CTLFLAG_RD, &priv->pkstats.rx_over_errors, 0, "RX Over Errors"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_crc_errors", CTLFLAG_RD, &priv->pkstats.rx_crc_errors, 0, "RX CRC Errors"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_jabbers", CTLFLAG_RD, &priv->pkstats.rx_jabbers, 0, "RX Jabbers"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_in_range_length_error", CTLFLAG_RD, &priv->pkstats.rx_in_range_length_error, 0, "RX IN_Range Length Error"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_out_range_length_error", CTLFLAG_RD, &priv->pkstats.rx_out_range_length_error, 0, "RX Out Range Length Error"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_lt_64_bytes_packets", CTLFLAG_RD, &priv->pkstats.rx_lt_64_bytes_packets, 0, "RX Lt 64 Bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_127_bytes_packets", CTLFLAG_RD, &priv->pkstats.rx_127_bytes_packets, 0, "RX 127 bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_255_bytes_packets", CTLFLAG_RD, &priv->pkstats.rx_255_bytes_packets, 0, "RX 255 bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_511_bytes_packets", CTLFLAG_RD, &priv->pkstats.rx_511_bytes_packets, 0, "RX 511 bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_1023_bytes_packets", CTLFLAG_RD, &priv->pkstats.rx_1023_bytes_packets, 0, "RX 1023 bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_1518_bytes_packets", CTLFLAG_RD, &priv->pkstats.rx_1518_bytes_packets, 0, "RX 1518 bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_1522_bytes_packets", CTLFLAG_RD, &priv->pkstats.rx_1522_bytes_packets, 0, "RX 1522 bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_1548_bytes_packets", CTLFLAG_RD, &priv->pkstats.rx_1548_bytes_packets, 0, "RX 1548 bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "rx_gt_1548_bytes_packets", CTLFLAG_RD, &priv->pkstats.rx_gt_1548_bytes_packets, 0, "RX Greater Then 1548 bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_packets", CTLFLAG_RD, &priv->pkstats.tx_packets, 0, "TX packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_bytes", CTLFLAG_RD, &priv->pkstats.tx_bytes, 0, "TX Bytes"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_multicast_packets", CTLFLAG_RD, &priv->pkstats.tx_multicast_packets, 0, "TX Multicast Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_broadcast_packets", CTLFLAG_RD, &priv->pkstats.tx_broadcast_packets, 0, "TX Broadcast Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_errors", CTLFLAG_RD, &priv->pkstats.tx_errors, 0, "TX Errors"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_dropped", CTLFLAG_RD, &priv->pkstats.tx_dropped, 0, "TX Dropped"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_lt_64_bytes_packets", CTLFLAG_RD, &priv->pkstats.tx_lt_64_bytes_packets, 0, "TX Less Then 64 Bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_127_bytes_packets", CTLFLAG_RD, &priv->pkstats.tx_127_bytes_packets, 0, "TX 127 Bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_255_bytes_packets", CTLFLAG_RD, &priv->pkstats.tx_255_bytes_packets, 0, "TX 255 Bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_511_bytes_packets", CTLFLAG_RD, &priv->pkstats.tx_511_bytes_packets, 0, "TX 511 Bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_1023_bytes_packets", CTLFLAG_RD, &priv->pkstats.tx_1023_bytes_packets, 0, "TX 1023 Bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_1518_bytes_packets", CTLFLAG_RD, &priv->pkstats.tx_1518_bytes_packets, 0, "TX 1518 Bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_1522_bytes_packets", CTLFLAG_RD, &priv->pkstats.tx_1522_bytes_packets, 0, "TX 1522 Bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_1548_bytes_packets", CTLFLAG_RD, &priv->pkstats.tx_1548_bytes_packets, 0, "TX 1548 Bytes Packets"); SYSCTL_ADD_U64(ctx, node_list, OID_AUTO, "tx_gt_1548_bytes_packets", CTLFLAG_RD, &priv->pkstats.tx_gt_1548_bytes_packets, 0, "TX Greater Then 1548 Bytes Packets"); for (i = 0; i < priv->tx_ring_num; i++) { tx_ring = priv->tx_ring[i]; snprintf(namebuf, sizeof(namebuf), "tx_ring%d", i); ring_node = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TX Ring"); ring_list = SYSCTL_CHILDREN(ring_node); SYSCTL_ADD_U64(ctx, ring_list, OID_AUTO, "packets", CTLFLAG_RD, &tx_ring->packets, 0, "TX packets"); SYSCTL_ADD_U64(ctx, ring_list, OID_AUTO, "bytes", CTLFLAG_RD, &tx_ring->bytes, 0, "TX bytes"); SYSCTL_ADD_U64(ctx, ring_list, OID_AUTO, "tso_packets", CTLFLAG_RD, &tx_ring->tso_packets, 0, "TSO packets"); SYSCTL_ADD_U64(ctx, ring_list, OID_AUTO, "defrag_attempts", CTLFLAG_RD, &tx_ring->defrag_attempts, 0, "Oversized chains defragged"); } for (i = 0; i < priv->rx_ring_num; i++) { rx_ring = priv->rx_ring[i]; snprintf(namebuf, sizeof(namebuf), "rx_ring%d", i); ring_node = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "RX Ring"); ring_list = SYSCTL_CHILDREN(ring_node); SYSCTL_ADD_U64(ctx, ring_list, OID_AUTO, "packets", CTLFLAG_RD, &rx_ring->packets, 0, "RX packets"); SYSCTL_ADD_U64(ctx, ring_list, OID_AUTO, "bytes", CTLFLAG_RD, &rx_ring->bytes, 0, "RX bytes"); SYSCTL_ADD_U64(ctx, ring_list, OID_AUTO, "error", CTLFLAG_RD, &rx_ring->errors, 0, "RX soft errors"); } } #ifdef DEBUGNET static void mlx4_en_debugnet_init(struct ifnet *dev, int *nrxr, int *ncl, int *clsize) { struct mlx4_en_priv *priv; priv = if_getsoftc(dev); mutex_lock(&priv->mdev->state_lock); *nrxr = priv->rx_ring_num; *ncl = DEBUGNET_MAX_IN_FLIGHT; *clsize = priv->rx_mb_size; mutex_unlock(&priv->mdev->state_lock); } static void mlx4_en_debugnet_event(struct ifnet *dev, enum debugnet_ev event) { } static int mlx4_en_debugnet_transmit(struct ifnet *dev, struct mbuf *m) { struct mlx4_en_priv *priv; int err; priv = if_getsoftc(dev); if ((if_getdrvflags(dev) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING || !priv->link_state) return (ENOENT); err = mlx4_en_xmit(priv, 0, &m); if (err != 0 && m != NULL) m_freem(m); return (err); } static int mlx4_en_debugnet_poll(struct ifnet *dev, int count) { struct mlx4_en_priv *priv; priv = if_getsoftc(dev); if ((if_getdrvflags(dev) & IFF_DRV_RUNNING) == 0 || !priv->link_state) return (ENOENT); mlx4_poll_interrupts(priv->mdev->dev); return (0); } #endif /* DEBUGNET */ diff --git a/sys/dev/mlx4/mlx4_en/mlx4_en_port.c b/sys/dev/mlx4/mlx4_en/mlx4_en_port.c index 7e8d2c1550e4..9a0edac109f4 100644 --- a/sys/dev/mlx4/mlx4_en/mlx4_en_port.c +++ b/sys/dev/mlx4/mlx4_en/mlx4_en_port.c @@ -1,507 +1,507 @@ /* * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include #include #include #include #include "en_port.h" #include "en.h" int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv) { struct mlx4_cmd_mailbox *mailbox; struct mlx4_set_vlan_fltr_mbox *filter; int i; int j; int index = 0; u32 entry; int err = 0; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); filter = mailbox->buf; for (i = VLAN_FLTR_SIZE - 1; i >= 0; i--) { entry = 0; for (j = 0; j < 32; j++) { if (test_bit(index, priv->active_vlans)) entry |= 1 << j; index++; } filter->entry[i] = cpu_to_be32(entry); } err = mlx4_cmd(dev, mailbox->dma, priv->port, 0, MLX4_CMD_SET_VLAN_FLTR, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); mlx4_free_cmd_mailbox(dev, mailbox); return err; } int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port) { struct mlx4_en_query_port_context *qport_context; struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]); struct mlx4_en_port_state *state = &priv->port_state; struct mlx4_cmd_mailbox *mailbox; int err; mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) goto out; qport_context = mailbox->buf; /* This command is always accessed from Ethtool context * already synchronized, no need in locking */ state->link_state = !!(qport_context->link_up & MLX4_EN_LINK_UP_MASK); switch (qport_context->link_speed & MLX4_EN_SPEED_MASK) { case MLX4_EN_100M_SPEED: state->link_speed = 100; break; case MLX4_EN_1G_SPEED: state->link_speed = 1000; break; case MLX4_EN_10G_SPEED_XAUI: case MLX4_EN_10G_SPEED_XFI: state->link_speed = 10000; break; case MLX4_EN_20G_SPEED: state->link_speed = 20000; break; case MLX4_EN_40G_SPEED: state->link_speed = 40000; break; case MLX4_EN_56G_SPEED: state->link_speed = 56000; break; default: state->link_speed = -1; break; } state->transceiver = qport_context->transceiver; state->flags = 0; /* Reset and recalculate the port flags */ state->flags |= (qport_context->link_up & MLX4_EN_ANC_MASK) ? MLX4_EN_PORT_ANC : 0; state->flags |= (qport_context->autoneg & MLX4_EN_AUTONEG_MASK) ? MLX4_EN_PORT_ANE : 0; out: mlx4_free_cmd_mailbox(mdev->dev, mailbox); return err; } /* Each counter set is located in struct mlx4_en_stat_out_mbox * with a const offset between its prio components. * This function runs over a counter set and sum all of it's prio components. */ static u64 en_stats_adder(__be64 *start, __be64 *next, int num) { __be64 *curr = start; u64 ret = 0; int i; int offset = next - start; for (i = 0; i < num; i++) { ret += be64_to_cpu(*curr); curr += offset; } return ret; } -static void mlx4_en_fold_software_stats(struct net_device *dev) +static void mlx4_en_fold_software_stats(struct ifnet *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; u64 packets, bytes; int i; if (!priv->port_up || mlx4_is_master(mdev->dev)) return; packets = 0; bytes = 0; for (i = 0; i < priv->rx_ring_num; i++) { const struct mlx4_en_rx_ring *ring = priv->rx_ring[i]; packets += READ_ONCE(ring->packets); bytes += READ_ONCE(ring->bytes); } priv->pkstats.rx_packets = packets; priv->pkstats.rx_bytes = bytes; packets = 0; bytes = 0; for (i = 0; i < priv->tx_ring_num; i++) { const struct mlx4_en_tx_ring *ring = priv->tx_ring[i]; packets += READ_ONCE(ring->packets); bytes += READ_ONCE(ring->bytes); } priv->pkstats.tx_packets = packets; priv->pkstats.tx_bytes = bytes; } int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) { struct mlx4_counter tmp_vport_stats; struct mlx4_en_stat_out_mbox *mlx4_en_stats; struct mlx4_en_stat_out_flow_control_mbox *flowstats; - struct net_device *dev = mdev->pndev[port]; + struct ifnet *dev = mdev->pndev[port]; struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_vport_stats *vport_stats = &priv->vport_stats; struct mlx4_cmd_mailbox *mailbox; u64 in_mod = reset << 8 | port; int err; int i, counter_index; mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, in_mod, 0, MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); if (err) goto out; mlx4_en_stats = mailbox->buf; spin_lock(&priv->stats_lock); priv->port_stats.rx_chksum_good = 0; priv->port_stats.rx_chksum_none = 0; for (i = 0; i < priv->rx_ring_num; i++) { priv->port_stats.rx_chksum_good += priv->rx_ring[i]->csum_ok; priv->port_stats.rx_chksum_none += priv->rx_ring[i]->csum_none; } priv->port_stats.tx_chksum_offload = 0; priv->port_stats.queue_stopped = 0; priv->port_stats.wake_queue = 0; priv->port_stats.oversized_packets = 0; priv->port_stats.tso_packets = 0; priv->port_stats.defrag_attempts = 0; for (i = 0; i < priv->tx_ring_num; i++) { const struct mlx4_en_tx_ring *ring; ring = priv->tx_ring[i]; priv->port_stats.tx_chksum_offload += ring->tx_csum; priv->port_stats.queue_stopped += ring->queue_stopped; priv->port_stats.wake_queue += ring->wake_queue; priv->port_stats.oversized_packets += ring->oversized_packets; priv->port_stats.tso_packets += ring->tso_packets; priv->port_stats.defrag_attempts += ring->defrag_attempts; } priv->pkstats.rx_errors = be64_to_cpu(mlx4_en_stats->PCS) + be32_to_cpu(mlx4_en_stats->RJBBR) + be32_to_cpu(mlx4_en_stats->RCRC) + be32_to_cpu(mlx4_en_stats->RRUNT) + be64_to_cpu(mlx4_en_stats->RInRangeLengthErr) + be64_to_cpu(mlx4_en_stats->ROutRangeLengthErr) + be32_to_cpu(mlx4_en_stats->RSHORT) + en_stats_adder(&mlx4_en_stats->RGIANT_prio_0, &mlx4_en_stats->RGIANT_prio_1, NUM_PRIORITIES); priv->pkstats.tx_errors = en_stats_adder(&mlx4_en_stats->TGIANT_prio_0, &mlx4_en_stats->TGIANT_prio_1, NUM_PRIORITIES); priv->pkstats.rx_multicast_packets = en_stats_adder(&mlx4_en_stats->MCAST_prio_0, &mlx4_en_stats->MCAST_prio_1, NUM_PRIORITIES); priv->pkstats.rx_dropped = be32_to_cpu(mlx4_en_stats->RDROP); priv->pkstats.rx_length_errors = be32_to_cpu(mlx4_en_stats->RdropLength); priv->pkstats.rx_over_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw); priv->pkstats.rx_crc_errors = be32_to_cpu(mlx4_en_stats->RCRC); priv->pkstats.tx_dropped = be32_to_cpu(mlx4_en_stats->TDROP); /* RX stats */ priv->pkstats.rx_packets = en_stats_adder(&mlx4_en_stats->RTOT_prio_0, &mlx4_en_stats->RTOT_prio_1, NUM_PRIORITIES); priv->pkstats.rx_bytes = en_stats_adder(&mlx4_en_stats->ROCT_prio_0, &mlx4_en_stats->ROCT_prio_1, NUM_PRIORITIES); priv->pkstats.rx_broadcast_packets = en_stats_adder(&mlx4_en_stats->RBCAST_prio_0, &mlx4_en_stats->RBCAST_prio_1, NUM_PRIORITIES); priv->pkstats.rx_jabbers = be32_to_cpu(mlx4_en_stats->RJBBR); priv->pkstats.rx_in_range_length_error = be64_to_cpu(mlx4_en_stats->RInRangeLengthErr); priv->pkstats.rx_out_range_length_error = be64_to_cpu(mlx4_en_stats->ROutRangeLengthErr); /* Tx stats */ priv->pkstats.tx_packets = en_stats_adder(&mlx4_en_stats->TTOT_prio_0, &mlx4_en_stats->TTOT_prio_1, NUM_PRIORITIES); priv->pkstats.tx_bytes = en_stats_adder(&mlx4_en_stats->TOCT_prio_0, &mlx4_en_stats->TOCT_prio_1, NUM_PRIORITIES); priv->pkstats.tx_multicast_packets = en_stats_adder(&mlx4_en_stats->TMCAST_prio_0, &mlx4_en_stats->TMCAST_prio_1, NUM_PRIORITIES); priv->pkstats.tx_broadcast_packets = en_stats_adder(&mlx4_en_stats->TBCAST_prio_0, &mlx4_en_stats->TBCAST_prio_1, NUM_PRIORITIES); priv->pkstats.rx_prio[0][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_0); priv->pkstats.rx_prio[0][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_0); priv->pkstats.rx_prio[1][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_1); priv->pkstats.rx_prio[1][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_1); priv->pkstats.rx_prio[2][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_2); priv->pkstats.rx_prio[2][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_2); priv->pkstats.rx_prio[3][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_3); priv->pkstats.rx_prio[3][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_3); priv->pkstats.rx_prio[4][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_4); priv->pkstats.rx_prio[4][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_4); priv->pkstats.rx_prio[5][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_5); priv->pkstats.rx_prio[5][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_5); priv->pkstats.rx_prio[6][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_6); priv->pkstats.rx_prio[6][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_6); priv->pkstats.rx_prio[7][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_7); priv->pkstats.rx_prio[7][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_7); priv->pkstats.rx_prio[8][0] = be64_to_cpu(mlx4_en_stats->RTOT_novlan); priv->pkstats.rx_prio[8][1] = be64_to_cpu(mlx4_en_stats->ROCT_novlan); priv->pkstats.tx_prio[0][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_0); priv->pkstats.tx_prio[0][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_0); priv->pkstats.tx_prio[1][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_1); priv->pkstats.tx_prio[1][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_1); priv->pkstats.tx_prio[2][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_2); priv->pkstats.tx_prio[2][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_2); priv->pkstats.tx_prio[3][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_3); priv->pkstats.tx_prio[3][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_3); priv->pkstats.tx_prio[4][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_4); priv->pkstats.tx_prio[4][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_4); priv->pkstats.tx_prio[5][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_5); priv->pkstats.tx_prio[5][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_5); priv->pkstats.tx_prio[6][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_6); priv->pkstats.tx_prio[6][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_6); priv->pkstats.tx_prio[7][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_7); priv->pkstats.tx_prio[7][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_7); priv->pkstats.tx_prio[8][0] = be64_to_cpu(mlx4_en_stats->TTOT_novlan); priv->pkstats.tx_prio[8][1] = be64_to_cpu(mlx4_en_stats->TOCT_novlan); mlx4_en_fold_software_stats(dev); spin_unlock(&priv->stats_lock); memset(&tmp_vport_stats, 0, sizeof(tmp_vport_stats)); counter_index = mlx4_get_default_counter_index(mdev->dev, port); err = mlx4_get_counter_stats(mdev->dev, counter_index, &tmp_vport_stats, reset); spin_lock(&priv->stats_lock); if (!err) { /* ethtool stats format */ vport_stats->rx_bytes = be64_to_cpu(tmp_vport_stats.rx_bytes); vport_stats->rx_frames = be64_to_cpu(tmp_vport_stats.rx_frames); vport_stats->tx_bytes = be64_to_cpu(tmp_vport_stats.tx_bytes); vport_stats->tx_frames = be64_to_cpu(tmp_vport_stats.tx_frames); } #if __FreeBSD_version >= 1100000 if (reset == 0) { if_inc_counter(dev, IFCOUNTER_IPACKETS, priv->pkstats.rx_packets - priv->pkstats_last.rx_packets); if_inc_counter(dev, IFCOUNTER_OPACKETS, priv->pkstats.tx_packets - priv->pkstats_last.tx_packets); if_inc_counter(dev, IFCOUNTER_IBYTES, priv->pkstats.rx_bytes - priv->pkstats_last.rx_bytes); if_inc_counter(dev, IFCOUNTER_OBYTES, priv->pkstats.tx_bytes - priv->pkstats_last.tx_bytes); if_inc_counter(dev, IFCOUNTER_IERRORS, priv->pkstats.rx_errors - priv->pkstats_last.rx_errors); if_inc_counter(dev, IFCOUNTER_IQDROPS, priv->pkstats.rx_dropped - priv->pkstats_last.rx_dropped); if_inc_counter(dev, IFCOUNTER_IMCASTS, priv->pkstats.rx_multicast_packets - priv->pkstats_last.rx_multicast_packets); if_inc_counter(dev, IFCOUNTER_OMCASTS, priv->pkstats.tx_multicast_packets - priv->pkstats_last.tx_multicast_packets); } priv->pkstats_last = priv->pkstats; #else dev->if_ipackets = priv->pkstats.rx_packets; dev->if_opackets = priv->pkstats.tx_packets; dev->if_ibytes = priv->pkstats.rx_bytes; dev->if_obytes = priv->pkstats.tx_bytes; dev->if_ierrors = priv->pkstats.rx_errors; dev->if_iqdrops = priv->pkstats.rx_dropped; dev->if_imcasts = priv->pkstats.rx_multicast_packets; dev->if_omcasts = priv->pkstats.tx_multicast_packets; dev->if_collisions = 0; #endif spin_unlock(&priv->stats_lock); /* 0xffs indicates invalid value */ memset(mailbox->buf, 0xff, sizeof(*flowstats) * MLX4_NUM_PRIORITIES); if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN) { memset(mailbox->buf, 0, sizeof(*flowstats) * MLX4_NUM_PRIORITIES); err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, in_mod | MLX4_DUMP_ETH_STATS_FLOW_CONTROL, 0, MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); if (err) goto out; } flowstats = mailbox->buf; spin_lock(&priv->stats_lock); for (i = 0; i < MLX4_NUM_PRIORITIES; i++) { priv->rx_priority_flowstats[i].rx_pause = be64_to_cpu(flowstats[i].rx_pause); priv->rx_priority_flowstats[i].rx_pause_duration = be64_to_cpu(flowstats[i].rx_pause_duration); priv->rx_priority_flowstats[i].rx_pause_transition = be64_to_cpu(flowstats[i].rx_pause_transition); priv->tx_priority_flowstats[i].tx_pause = be64_to_cpu(flowstats[i].tx_pause); priv->tx_priority_flowstats[i].tx_pause_duration = be64_to_cpu(flowstats[i].tx_pause_duration); priv->tx_priority_flowstats[i].tx_pause_transition = be64_to_cpu(flowstats[i].tx_pause_transition); } /* if pfc is not in use, all priorities counters have the same value */ priv->rx_flowstats.rx_pause = be64_to_cpu(flowstats[0].rx_pause); priv->rx_flowstats.rx_pause_duration = be64_to_cpu(flowstats[0].rx_pause_duration); priv->rx_flowstats.rx_pause_transition = be64_to_cpu(flowstats[0].rx_pause_transition); priv->tx_flowstats.tx_pause = be64_to_cpu(flowstats[0].tx_pause); priv->tx_flowstats.tx_pause_duration = be64_to_cpu(flowstats[0].tx_pause_duration); priv->tx_flowstats.tx_pause_transition = be64_to_cpu(flowstats[0].tx_pause_transition); spin_unlock(&priv->stats_lock); out: mlx4_free_cmd_mailbox(mdev->dev, mailbox); return err; } int mlx4_en_get_vport_stats(struct mlx4_en_dev *mdev, u8 port) { struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]); struct mlx4_counter tmp_vport_stats; struct mlx4_en_vf_stats *vf_stats = &priv->vf_stats; int err, i, counter_index; spin_lock(&priv->stats_lock); priv->pkstats.rx_packets = 0; priv->pkstats.rx_bytes = 0; priv->port_stats.rx_chksum_good = 0; priv->port_stats.rx_chksum_none = 0; for (i = 0; i < priv->rx_ring_num; i++) { priv->pkstats.rx_packets += priv->rx_ring[i]->packets; priv->pkstats.rx_bytes += priv->rx_ring[i]->bytes; priv->port_stats.rx_chksum_good += priv->rx_ring[i]->csum_ok; priv->port_stats.rx_chksum_none += priv->rx_ring[i]->csum_none; } priv->pkstats.tx_packets = 0; priv->pkstats.tx_bytes = 0; priv->port_stats.tx_chksum_offload = 0; priv->port_stats.queue_stopped = 0; priv->port_stats.wake_queue = 0; for (i = 0; i < priv->tx_ring_num; i++) { const struct mlx4_en_tx_ring *ring = priv->tx_ring[i]; priv->pkstats.tx_packets += ring->packets; priv->pkstats.tx_bytes += ring->bytes; priv->port_stats.tx_chksum_offload += ring->tx_csum; priv->port_stats.queue_stopped += ring->queue_stopped; priv->port_stats.wake_queue += ring->wake_queue; priv->port_stats.oversized_packets += priv->tx_ring[i]->oversized_packets; } spin_unlock(&priv->stats_lock); memset(&tmp_vport_stats, 0, sizeof(tmp_vport_stats)); counter_index = mlx4_get_default_counter_index(mdev->dev, port); err = mlx4_get_counter_stats(mdev->dev, counter_index, &tmp_vport_stats, 0); if (!err) { spin_lock(&priv->stats_lock); vf_stats->rx_bytes = be64_to_cpu(tmp_vport_stats.rx_bytes); vf_stats->rx_frames = be64_to_cpu(tmp_vport_stats.rx_frames); vf_stats->tx_bytes = be64_to_cpu(tmp_vport_stats.tx_bytes); vf_stats->tx_frames = be64_to_cpu(tmp_vport_stats.tx_frames); priv->pkstats.rx_packets = vf_stats->rx_frames; priv->pkstats.rx_bytes = vf_stats->rx_bytes; priv->pkstats.tx_packets = vf_stats->tx_frames; priv->pkstats.tx_bytes = vf_stats->tx_bytes; /* PF&VFs are not expected to report errors in ifconfig. * rx_errors will be reprted in PF's ethtool statistics, * see: mlx4_en_DUMP_ETH_STATS */ priv->pkstats.rx_errors = 0; priv->pkstats.rx_dropped = 0; priv->pkstats.tx_dropped = 0; priv->pkstats.rx_multicast_packets = 0; spin_unlock(&priv->stats_lock); } return err; } diff --git a/sys/dev/mlx4/mlx4_en/mlx4_en_resources.c b/sys/dev/mlx4/mlx4_en/mlx4_en_resources.c index db8beb501594..5d5508eebebf 100644 --- a/sys/dev/mlx4/mlx4_en/mlx4_en_resources.c +++ b/sys/dev/mlx4/mlx4_en/mlx4_en_resources.c @@ -1,121 +1,121 @@ /* * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include #include #include #include "en.h" void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride, int is_tx, int rss, int qpn, int cqn, int user_prio, struct mlx4_qp_context *context) { struct mlx4_en_dev *mdev = priv->mdev; - struct net_device *dev = priv->dev; + struct ifnet *dev = priv->dev; memset(context, 0, sizeof *context); context->flags = cpu_to_be32(7 << 16 | rss << MLX4_RSS_QPC_FLAG_OFFSET); context->pd = cpu_to_be32(mdev->priv_pdn); context->mtu_msgmax = 0xff; if (!is_tx && !rss) context->rq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4); if (is_tx) context->sq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4); else context->sq_size_stride = ilog2(TXBB_SIZE) - 4; context->usr_page = cpu_to_be32(mdev->priv_uar.index); context->local_qpn = cpu_to_be32(qpn); context->pri_path.ackto = 1 & 0x07; context->pri_path.sched_queue = 0x83 | (priv->port - 1) << 6; if (user_prio >= 0) { context->pri_path.sched_queue |= user_prio << 3; context->pri_path.feup = 1 << 6; } context->pri_path.counter_index = (u8)(priv->counter_index); if (!rss && (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK) && context->pri_path.counter_index != 0xFF) { /* disable multicast loopback to qp with same counter */ context->pri_path.fl |= MLX4_FL_ETH_SRC_CHECK_MC_LB; context->pri_path.vlan_control |= MLX4_CTRL_ETH_SRC_CHECK_IF_COUNTER; } context->cqn_send = cpu_to_be32(cqn); context->cqn_recv = cpu_to_be32(cqn); context->db_rec_addr = cpu_to_be64(priv->res.db.dma << 2); if (!(dev->if_capabilities & IFCAP_VLAN_HWCSUM)) context->param3 |= cpu_to_be32(1 << 30); } int mlx4_en_map_buffer(struct mlx4_buf *buf) { struct page **pages; int i; // if nbufs == 1 - there is no need to vmap // if buf->direct.buf is not NULL it means that vmap was already done by mlx4_alloc_buff if (buf->direct.buf != NULL || buf->nbufs == 1) return 0; pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL); if (!pages) return -ENOMEM; for (i = 0; i < buf->nbufs; ++i) pages[i] = virt_to_page(buf->page_list[i].buf); buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL); kfree(pages); if (!buf->direct.buf) return -ENOMEM; return 0; } void mlx4_en_unmap_buffer(struct mlx4_buf *buf) { if (BITS_PER_LONG == 64 || buf->nbufs == 1) return; vunmap(buf->direct.buf); } void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event) { return; } diff --git a/sys/dev/mlx4/mlx4_en/mlx4_en_rx.c b/sys/dev/mlx4/mlx4_en/mlx4_en_rx.c index e9f0d2f640e1..9e6f4df107bd 100644 --- a/sys/dev/mlx4/mlx4_en/mlx4_en_rx.c +++ b/sys/dev/mlx4/mlx4_en/mlx4_en_rx.c @@ -1,1134 +1,1134 @@ /* * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include "opt_inet.h" #include #include #include #include #include #include #include #ifdef CONFIG_NET_RX_BUSY_POLL #include #endif #include "en.h" #if (MLX4_EN_MAX_RX_SEGS == 1) static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, int index) { struct mlx4_en_rx_desc *rx_desc = ((struct mlx4_en_rx_desc *)ring->buf) + index; int i; /* Set size and memtype fields */ rx_desc->data[0].byte_count = cpu_to_be32(priv->rx_mb_size - MLX4_NET_IP_ALIGN); rx_desc->data[0].lkey = cpu_to_be32(priv->mdev->mr.key); /* * If the number of used fragments does not fill up the ring * stride, remaining (unused) fragments must be padded with * null address/size and a special memory key: */ for (i = 1; i < MLX4_EN_MAX_RX_SEGS; i++) { rx_desc->data[i].byte_count = 0; rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD); rx_desc->data[i].addr = 0; } } #endif static inline struct mbuf * mlx4_en_alloc_mbuf(struct mlx4_en_rx_ring *ring) { struct mbuf *mb; #if (MLX4_EN_MAX_RX_SEGS == 1) mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, ring->rx_mb_size); if (likely(mb != NULL)) mb->m_pkthdr.len = mb->m_len = ring->rx_mb_size; #else mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MLX4_EN_MAX_RX_BYTES); if (likely(mb != NULL)) { struct mbuf *mb_head = mb; int i; mb->m_len = MLX4_EN_MAX_RX_BYTES; mb->m_pkthdr.len = MLX4_EN_MAX_RX_BYTES; for (i = 1; i != MLX4_EN_MAX_RX_SEGS; i++) { if (mb_head->m_pkthdr.len >= ring->rx_mb_size) break; mb = (mb->m_next = m_getjcl(M_NOWAIT, MT_DATA, 0, MLX4_EN_MAX_RX_BYTES)); if (unlikely(mb == NULL)) { m_freem(mb_head); return (NULL); } mb->m_len = MLX4_EN_MAX_RX_BYTES; mb_head->m_pkthdr.len += MLX4_EN_MAX_RX_BYTES; } /* rewind to first mbuf in chain */ mb = mb_head; } #endif return (mb); } static int mlx4_en_alloc_buf(struct mlx4_en_rx_ring *ring, struct mlx4_en_rx_desc *rx_desc, struct mlx4_en_rx_mbuf *mb_list) { bus_dma_segment_t segs[MLX4_EN_MAX_RX_SEGS]; bus_dmamap_t map; struct mbuf *mb; int nsegs; int err; #if (MLX4_EN_MAX_RX_SEGS != 1) int i; #endif /* try to allocate a new spare mbuf */ if (unlikely(ring->spare.mbuf == NULL)) { mb = mlx4_en_alloc_mbuf(ring); if (unlikely(mb == NULL)) return (-ENOMEM); /* make sure IP header gets aligned */ m_adj(mb, MLX4_NET_IP_ALIGN); /* load spare mbuf into BUSDMA */ err = -bus_dmamap_load_mbuf_sg(ring->dma_tag, ring->spare.dma_map, mb, ring->spare.segs, &nsegs, BUS_DMA_NOWAIT); if (unlikely(err != 0)) { m_freem(mb); return (err); } /* store spare info */ ring->spare.mbuf = mb; #if (MLX4_EN_MAX_RX_SEGS != 1) /* zero remaining segs */ for (i = nsegs; i != MLX4_EN_MAX_RX_SEGS; i++) { ring->spare.segs[i].ds_addr = 0; ring->spare.segs[i].ds_len = 0; } #endif bus_dmamap_sync(ring->dma_tag, ring->spare.dma_map, BUS_DMASYNC_PREREAD); } /* synchronize and unload the current mbuf, if any */ if (likely(mb_list->mbuf != NULL)) { bus_dmamap_sync(ring->dma_tag, mb_list->dma_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(ring->dma_tag, mb_list->dma_map); } mb = mlx4_en_alloc_mbuf(ring); if (unlikely(mb == NULL)) goto use_spare; /* make sure IP header gets aligned */ m_adj(mb, MLX4_NET_IP_ALIGN); err = -bus_dmamap_load_mbuf_sg(ring->dma_tag, mb_list->dma_map, mb, segs, &nsegs, BUS_DMA_NOWAIT); if (unlikely(err != 0)) { m_freem(mb); goto use_spare; } #if (MLX4_EN_MAX_RX_SEGS == 1) rx_desc->data[0].addr = cpu_to_be64(segs[0].ds_addr); #else for (i = 0; i != nsegs; i++) { rx_desc->data[i].byte_count = cpu_to_be32(segs[i].ds_len); rx_desc->data[i].lkey = ring->rx_mr_key_be; rx_desc->data[i].addr = cpu_to_be64(segs[i].ds_addr); } for (; i != MLX4_EN_MAX_RX_SEGS; i++) { rx_desc->data[i].byte_count = 0; rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD); rx_desc->data[i].addr = 0; } #endif mb_list->mbuf = mb; bus_dmamap_sync(ring->dma_tag, mb_list->dma_map, BUS_DMASYNC_PREREAD); return (0); use_spare: /* swap DMA maps */ map = mb_list->dma_map; mb_list->dma_map = ring->spare.dma_map; ring->spare.dma_map = map; /* swap MBUFs */ mb_list->mbuf = ring->spare.mbuf; ring->spare.mbuf = NULL; /* store physical address */ #if (MLX4_EN_MAX_RX_SEGS == 1) rx_desc->data[0].addr = cpu_to_be64(ring->spare.segs[0].ds_addr); #else for (i = 0; i != MLX4_EN_MAX_RX_SEGS; i++) { if (ring->spare.segs[i].ds_len != 0) { rx_desc->data[i].byte_count = cpu_to_be32(ring->spare.segs[i].ds_len); rx_desc->data[i].lkey = ring->rx_mr_key_be; rx_desc->data[i].addr = cpu_to_be64(ring->spare.segs[i].ds_addr); } else { rx_desc->data[i].byte_count = 0; rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD); rx_desc->data[i].addr = 0; } } #endif return (0); } static void mlx4_en_free_buf(struct mlx4_en_rx_ring *ring, struct mlx4_en_rx_mbuf *mb_list) { bus_dmamap_t map = mb_list->dma_map; bus_dmamap_sync(ring->dma_tag, map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(ring->dma_tag, map); m_freem(mb_list->mbuf); mb_list->mbuf = NULL; /* safety clearing */ } static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, int index) { struct mlx4_en_rx_desc *rx_desc = ((struct mlx4_en_rx_desc *)ring->buf) + index; struct mlx4_en_rx_mbuf *mb_list = ring->mbuf + index; mb_list->mbuf = NULL; if (mlx4_en_alloc_buf(ring, rx_desc, mb_list)) { priv->port_stats.rx_alloc_failed++; return (-ENOMEM); } return (0); } static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring) { *ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff); } static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv) { struct mlx4_en_rx_ring *ring; int ring_ind; int buf_ind; int new_size; int err; for (buf_ind = 0; buf_ind < priv->prof->rx_ring_size; buf_ind++) { for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { ring = priv->rx_ring[ring_ind]; err = mlx4_en_prepare_rx_desc(priv, ring, ring->actual_size); if (err) { if (ring->actual_size == 0) { en_err(priv, "Failed to allocate " "enough rx buffers\n"); return -ENOMEM; } else { new_size = rounddown_pow_of_two(ring->actual_size); en_warn(priv, "Only %d buffers allocated " "reducing ring size to %d\n", ring->actual_size, new_size); goto reduce_rings; } } ring->actual_size++; ring->prod++; } } return 0; reduce_rings: for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { ring = priv->rx_ring[ring_ind]; while (ring->actual_size > new_size) { ring->actual_size--; ring->prod--; mlx4_en_free_buf(ring, ring->mbuf + ring->actual_size); } } return 0; } static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring) { int index; en_dbg(DRV, priv, "Freeing Rx buf - cons:%d prod:%d\n", ring->cons, ring->prod); /* Unmap and free Rx buffers */ BUG_ON((u32) (ring->prod - ring->cons) > ring->actual_size); while (ring->cons != ring->prod) { index = ring->cons & ring->size_mask; en_dbg(DRV, priv, "Processing descriptor:%d\n", index); mlx4_en_free_buf(ring, ring->mbuf + index); ++ring->cons; } } void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev) { int i; int num_of_eqs; int num_rx_rings; struct mlx4_dev *dev = mdev->dev; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { num_of_eqs = max_t(int, MIN_RX_RINGS, min_t(int, mlx4_get_eqs_per_port(mdev->dev, i), DEF_RX_RINGS)); num_rx_rings = mlx4_low_memory_profile() ? MIN_RX_RINGS : num_of_eqs; mdev->profile.prof[i].rx_ring_num = rounddown_pow_of_two(num_rx_rings); } } -void mlx4_en_calc_rx_buf(struct net_device *dev) +void mlx4_en_calc_rx_buf(struct ifnet *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); int eff_mtu = dev->if_mtu + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN + MLX4_NET_IP_ALIGN; if (eff_mtu > MJUM16BYTES) { en_err(priv, "MTU(%u) is too big\n", (unsigned)dev->if_mtu); eff_mtu = MJUM16BYTES; } else if (eff_mtu > MJUM9BYTES) { eff_mtu = MJUM16BYTES; } else if (eff_mtu > MJUMPAGESIZE) { eff_mtu = MJUM9BYTES; } else if (eff_mtu > MCLBYTES) { eff_mtu = MJUMPAGESIZE; } else { eff_mtu = MCLBYTES; } priv->rx_mb_size = eff_mtu; en_dbg(DRV, priv, "Effective RX MTU: %d bytes\n", eff_mtu); } int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring **pring, u32 size, int node) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_rx_ring *ring; int err; int tmp; uint32_t x; ring = kzalloc(sizeof(struct mlx4_en_rx_ring), GFP_KERNEL); if (!ring) { en_err(priv, "Failed to allocate RX ring structure\n"); return -ENOMEM; } /* Create DMA descriptor TAG */ if ((err = -bus_dma_tag_create( bus_get_dma_tag(mdev->pdev->dev.bsddev), 1, /* any alignment */ 0, /* no boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MJUM16BYTES, /* maxsize */ MLX4_EN_MAX_RX_SEGS, /* nsegments */ MJUM16BYTES, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockfuncarg */ &ring->dma_tag))) { en_err(priv, "Failed to create DMA tag\n"); goto err_ring; } ring->prod = 0; ring->cons = 0; ring->size = size; ring->size_mask = size - 1; ring->log_stride = ilog2(sizeof(struct mlx4_en_rx_desc)); ring->buf_size = (ring->size * sizeof(struct mlx4_en_rx_desc)) + TXBB_SIZE; tmp = size * sizeof(struct mlx4_en_rx_mbuf); ring->mbuf = kzalloc(tmp, GFP_KERNEL); if (ring->mbuf == NULL) { err = -ENOMEM; goto err_dma_tag; } err = -bus_dmamap_create(ring->dma_tag, 0, &ring->spare.dma_map); if (err != 0) goto err_info; for (x = 0; x != size; x++) { err = -bus_dmamap_create(ring->dma_tag, 0, &ring->mbuf[x].dma_map); if (err != 0) { while (x--) bus_dmamap_destroy(ring->dma_tag, ring->mbuf[x].dma_map); goto err_info; } } en_dbg(DRV, priv, "Allocated MBUF ring at addr:%p size:%d\n", ring->mbuf, tmp); err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size, 2 * PAGE_SIZE); if (err) goto err_dma_map; err = mlx4_en_map_buffer(&ring->wqres.buf); if (err) { en_err(priv, "Failed to map RX buffer\n"); goto err_hwq; } ring->buf = ring->wqres.buf.direct.buf; *pring = ring; return 0; err_hwq: mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); err_dma_map: for (x = 0; x != size; x++) { bus_dmamap_destroy(ring->dma_tag, ring->mbuf[x].dma_map); } bus_dmamap_destroy(ring->dma_tag, ring->spare.dma_map); err_info: vfree(ring->mbuf); err_dma_tag: bus_dma_tag_destroy(ring->dma_tag); err_ring: kfree(ring); return (err); } int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv) { struct mlx4_en_rx_ring *ring; #if (MLX4_EN_MAX_RX_SEGS == 1) int i; #endif int ring_ind; int err; for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { ring = priv->rx_ring[ring_ind]; ring->prod = 0; ring->cons = 0; ring->actual_size = 0; ring->cqn = priv->rx_cq[ring_ind]->mcq.cqn; ring->rx_mb_size = priv->rx_mb_size; if (sizeof(struct mlx4_en_rx_desc) <= TXBB_SIZE) { /* Stamp first unused send wqe */ __be32 *ptr = (__be32 *)ring->buf; __be32 stamp = cpu_to_be32(1 << STAMP_SHIFT); *ptr = stamp; /* Move pointer to start of rx section */ ring->buf += TXBB_SIZE; } ring->log_stride = ilog2(sizeof(struct mlx4_en_rx_desc)); ring->buf_size = ring->size * sizeof(struct mlx4_en_rx_desc); memset(ring->buf, 0, ring->buf_size); mlx4_en_update_rx_prod_db(ring); #if (MLX4_EN_MAX_RX_SEGS == 1) /* Initialize all descriptors */ for (i = 0; i < ring->size; i++) mlx4_en_init_rx_desc(priv, ring, i); #endif ring->rx_mr_key_be = cpu_to_be32(priv->mdev->mr.key); #ifdef INET /* Configure lro mngr */ if (priv->dev->if_capenable & IFCAP_LRO) { if (tcp_lro_init(&ring->lro)) priv->dev->if_capenable &= ~IFCAP_LRO; else ring->lro.ifp = priv->dev; } #endif } err = mlx4_en_fill_rx_buffers(priv); if (err) goto err_buffers; for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { ring = priv->rx_ring[ring_ind]; ring->size_mask = ring->actual_size - 1; mlx4_en_update_rx_prod_db(ring); } return 0; err_buffers: for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) mlx4_en_free_rx_buf(priv, priv->rx_ring[ring_ind]); ring_ind = priv->rx_ring_num - 1; while (ring_ind >= 0) { ring = priv->rx_ring[ring_ind]; if (sizeof(struct mlx4_en_rx_desc) <= TXBB_SIZE) ring->buf -= TXBB_SIZE; ring_ind--; } return err; } void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring **pring, u32 size) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_rx_ring *ring = *pring; uint32_t x; mlx4_en_unmap_buffer(&ring->wqres.buf); mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * sizeof(struct mlx4_en_rx_desc) + TXBB_SIZE); for (x = 0; x != size; x++) bus_dmamap_destroy(ring->dma_tag, ring->mbuf[x].dma_map); /* free spare mbuf, if any */ if (ring->spare.mbuf != NULL) { bus_dmamap_sync(ring->dma_tag, ring->spare.dma_map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(ring->dma_tag, ring->spare.dma_map); m_freem(ring->spare.mbuf); } bus_dmamap_destroy(ring->dma_tag, ring->spare.dma_map); vfree(ring->mbuf); bus_dma_tag_destroy(ring->dma_tag); kfree(ring); *pring = NULL; #ifdef CONFIG_RFS_ACCEL mlx4_en_cleanup_filters(priv, ring); #endif } void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring) { #ifdef INET tcp_lro_free(&ring->lro); #endif mlx4_en_free_rx_buf(priv, ring); if (sizeof(struct mlx4_en_rx_desc) <= TXBB_SIZE) ring->buf -= TXBB_SIZE; } static void validate_loopback(struct mlx4_en_priv *priv, struct mbuf *mb) { int i; int offset = ETHER_HDR_LEN; for (i = 0; i < MLX4_LOOPBACK_TEST_PAYLOAD; i++, offset++) { if (*(mb->m_data + offset) != (unsigned char) (i & 0xff)) goto out_loopback; } /* Loopback found */ priv->loopback_ok = 1; out_loopback: m_freem(mb); } static inline int invalid_cqe(struct mlx4_en_priv *priv, struct mlx4_cqe *cqe) { /* Drop packet on bad receive or bad checksum */ if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_ERROR)) { en_err(priv, "CQE completed in error - vendor syndrom:%d syndrom:%d\n", ((struct mlx4_err_cqe *)cqe)->vendor_err_syndrome, ((struct mlx4_err_cqe *)cqe)->syndrome); return 1; } if (unlikely(cqe->badfcs_enc & MLX4_CQE_BAD_FCS)) { en_dbg(RX_ERR, priv, "Accepted frame with bad FCS\n"); return 1; } return 0; } static struct mbuf * mlx4_en_rx_mb(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, struct mlx4_en_rx_desc *rx_desc, struct mlx4_en_rx_mbuf *mb_list, int length) { #if (MLX4_EN_MAX_RX_SEGS != 1) struct mbuf *mb_head; #endif struct mbuf *mb; /* optimise reception of small packets */ if (length <= (MHLEN - MLX4_NET_IP_ALIGN) && (mb = m_gethdr(M_NOWAIT, MT_DATA)) != NULL) { /* set packet length */ mb->m_pkthdr.len = mb->m_len = length; /* make sure IP header gets aligned */ mb->m_data += MLX4_NET_IP_ALIGN; bus_dmamap_sync(ring->dma_tag, mb_list->dma_map, BUS_DMASYNC_POSTREAD); bcopy(mtod(mb_list->mbuf, caddr_t), mtod(mb, caddr_t), length); return (mb); } /* get mbuf */ mb = mb_list->mbuf; /* collect used fragment while atomically replacing it */ if (mlx4_en_alloc_buf(ring, rx_desc, mb_list)) return (NULL); /* range check hardware computed value */ if (unlikely(length > mb->m_pkthdr.len)) length = mb->m_pkthdr.len; #if (MLX4_EN_MAX_RX_SEGS == 1) /* update total packet length in packet header */ mb->m_len = mb->m_pkthdr.len = length; #else mb->m_pkthdr.len = length; for (mb_head = mb; mb != NULL; mb = mb->m_next) { if (mb->m_len > length) mb->m_len = length; length -= mb->m_len; if (likely(length == 0)) { if (likely(mb->m_next != NULL)) { /* trim off empty mbufs */ m_freem(mb->m_next); mb->m_next = NULL; } break; } } /* rewind to first mbuf in chain */ mb = mb_head; #endif return (mb); } static __inline int mlx4_en_rss_hash(__be16 status, int udp_rss) { enum { status_all = cpu_to_be16( MLX4_CQE_STATUS_IPV4 | MLX4_CQE_STATUS_IPV4F | MLX4_CQE_STATUS_IPV6 | MLX4_CQE_STATUS_TCP | MLX4_CQE_STATUS_UDP), status_ipv4_tcp = cpu_to_be16( MLX4_CQE_STATUS_IPV4 | MLX4_CQE_STATUS_TCP), status_ipv6_tcp = cpu_to_be16( MLX4_CQE_STATUS_IPV6 | MLX4_CQE_STATUS_TCP), status_ipv4_udp = cpu_to_be16( MLX4_CQE_STATUS_IPV4 | MLX4_CQE_STATUS_UDP), status_ipv6_udp = cpu_to_be16( MLX4_CQE_STATUS_IPV6 | MLX4_CQE_STATUS_UDP), status_ipv4 = cpu_to_be16(MLX4_CQE_STATUS_IPV4), status_ipv6 = cpu_to_be16(MLX4_CQE_STATUS_IPV6) }; status &= status_all; switch (status) { case status_ipv4_tcp: return (M_HASHTYPE_RSS_TCP_IPV4); case status_ipv6_tcp: return (M_HASHTYPE_RSS_TCP_IPV6); case status_ipv4_udp: return (udp_rss ? M_HASHTYPE_RSS_UDP_IPV4 : M_HASHTYPE_RSS_IPV4); case status_ipv6_udp: return (udp_rss ? M_HASHTYPE_RSS_UDP_IPV6 : M_HASHTYPE_RSS_IPV6); default: if (status & status_ipv4) return (M_HASHTYPE_RSS_IPV4); if (status & status_ipv6) return (M_HASHTYPE_RSS_IPV6); return (M_HASHTYPE_OPAQUE_HASH); } } /* For cpu arch with cache line of 64B the performance is better when cqe size==64B * To enlarge cqe size from 32B to 64B --> 32B of garbage (i.e. 0xccccccc) * was added in the beginning of each cqe (the real data is in the corresponding 32B). * The following calc ensures that when factor==1, it means we are aligned to 64B * and we get the real cqe data*/ #define CQE_FACTOR_INDEX(index, factor) (((index) << (factor)) + (factor)) -int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget) +int mlx4_en_process_rx_cq(struct ifnet *dev, struct mlx4_en_cq *cq, int budget) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_cqe *cqe; struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring]; struct mlx4_en_rx_mbuf *mb_list; struct mlx4_en_rx_desc *rx_desc; struct mbuf *mb; struct mlx4_cq *mcq = &cq->mcq; struct mlx4_cqe *buf = cq->buf; int index; unsigned int length; int polled = 0; u32 cons_index = mcq->cons_index; u32 size_mask = ring->size_mask; int size = cq->size; int factor = priv->cqe_factor; const int udp_rss = priv->mdev->profile.udp_rss; if (!priv->port_up) return 0; /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx * descriptor offset can be deducted from the CQE index instead of * reading 'cqe->index' */ index = cons_index & size_mask; cqe = &buf[CQE_FACTOR_INDEX(index, factor)]; /* Process all completed CQEs */ while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK, cons_index & size)) { mb_list = ring->mbuf + index; rx_desc = ((struct mlx4_en_rx_desc *)ring->buf) + index; /* * make sure we read the CQE after we read the ownership bit */ rmb(); if (invalid_cqe(priv, cqe)) { goto next; } /* * Packet is OK - process it. */ length = be32_to_cpu(cqe->byte_cnt); length -= ring->fcs_del; mb = mlx4_en_rx_mb(priv, ring, rx_desc, mb_list, length); if (unlikely(!mb)) { ring->errors++; goto next; } ring->bytes += length; ring->packets++; if (unlikely(priv->validate_loopback)) { validate_loopback(priv, mb); goto next; } /* forward Toeplitz compatible hash value */ mb->m_pkthdr.flowid = be32_to_cpu(cqe->immed_rss_invalid); M_HASHTYPE_SET(mb, mlx4_en_rss_hash(cqe->status, udp_rss)); mb->m_pkthdr.rcvif = dev; if (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_CVLAN_PRESENT_MASK) { mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->sl_vid); mb->m_flags |= M_VLANTAG; } if (likely(dev->if_capenable & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) && (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) && (cqe->checksum == cpu_to_be16(0xffff))) { priv->port_stats.rx_chksum_good++; mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; mb->m_pkthdr.csum_data = htons(0xffff); /* This packet is eligible for LRO if it is: * - DIX Ethernet (type interpretation) * - TCP/IP (v4) * - without IP options * - not an IP fragment */ #ifdef INET if (mlx4_en_can_lro(cqe->status) && (dev->if_capenable & IFCAP_LRO)) { if (ring->lro.lro_cnt != 0 && tcp_lro_rx(&ring->lro, mb, 0) == 0) goto next; } #endif /* LRO not possible, complete processing here */ INC_PERF_COUNTER(priv->pstats.lro_misses); } else { mb->m_pkthdr.csum_flags = 0; priv->port_stats.rx_chksum_none++; } /* Push it up the stack */ dev->if_input(dev, mb); next: ++cons_index; index = cons_index & size_mask; cqe = &buf[CQE_FACTOR_INDEX(index, factor)]; if (++polled == budget) goto out; } /* Flush all pending IP reassembly sessions */ out: #ifdef INET tcp_lro_flush_all(&ring->lro); #endif AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled); mcq->cons_index = cons_index; mlx4_cq_set_ci(mcq); wmb(); /* ensure HW sees CQ consumer before we post new buffers */ ring->cons = mcq->cons_index; ring->prod += polled; /* Polled descriptors were realocated in place */ mlx4_en_update_rx_prod_db(ring); return polled; } /* Rx CQ polling - called by NAPI */ static int mlx4_en_poll_rx_cq(struct mlx4_en_cq *cq, int budget) { - struct net_device *dev = cq->dev; + struct ifnet *dev = cq->dev; struct epoch_tracker et; int done; NET_EPOCH_ENTER(et); done = mlx4_en_process_rx_cq(dev, cq, budget); NET_EPOCH_EXIT(et); cq->tot_rx += done; return done; } void mlx4_en_rx_irq(struct mlx4_cq *mcq) { struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq); struct mlx4_en_priv *priv = netdev_priv(cq->dev); int done; // Shoot one within the irq context // Because there is no NAPI in freeBSD done = mlx4_en_poll_rx_cq(cq, MLX4_EN_RX_BUDGET); if (priv->port_up && (done == MLX4_EN_RX_BUDGET) ) { cq->curr_poll_rx_cpu_id = curcpu; taskqueue_enqueue(cq->tq, &cq->cq_task); } else { mlx4_en_arm_cq(priv, cq); } } void mlx4_en_rx_que(void *context, int pending) { struct epoch_tracker et; struct mlx4_en_cq *cq; struct thread *td; cq = context; td = curthread; thread_lock(td); sched_bind(td, cq->curr_poll_rx_cpu_id); thread_unlock(td); NET_EPOCH_ENTER(et); while (mlx4_en_poll_rx_cq(cq, MLX4_EN_RX_BUDGET) == MLX4_EN_RX_BUDGET); NET_EPOCH_EXIT(et); mlx4_en_arm_cq(cq->dev->if_softc, cq); } /* RSS related functions */ static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn, struct mlx4_en_rx_ring *ring, enum mlx4_qp_state *state, struct mlx4_qp *qp) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_qp_context *context; int err = 0; context = kmalloc(sizeof *context , GFP_KERNEL); if (!context) { en_err(priv, "Failed to allocate qp context\n"); return -ENOMEM; } err = mlx4_qp_alloc(mdev->dev, qpn, qp, GFP_KERNEL); if (err) { en_err(priv, "Failed to allocate qp #%x\n", qpn); goto out; } qp->event = mlx4_en_sqp_event; memset(context, 0, sizeof *context); mlx4_en_fill_qp_context(priv, ring->actual_size, sizeof(struct mlx4_en_rx_desc), 0, 0, qpn, ring->cqn, -1, context); context->db_rec_addr = cpu_to_be64(ring->wqres.db.dma); /* Cancel FCS removal if FW allows */ if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP) { context->param3 |= cpu_to_be32(1 << 29); ring->fcs_del = ETH_FCS_LEN; } else ring->fcs_del = 0; err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, context, qp, state); if (err) { mlx4_qp_remove(mdev->dev, qp); mlx4_qp_free(mdev->dev, qp); } mlx4_en_update_rx_prod_db(ring); out: kfree(context); return err; } int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv) { int err; u32 qpn; err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn, 0); if (err) { en_err(priv, "Failed reserving drop qpn\n"); return err; } err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp, GFP_KERNEL); if (err) { en_err(priv, "Failed allocating drop qp\n"); mlx4_qp_release_range(priv->mdev->dev, qpn, 1); return err; } return 0; } void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv) { u32 qpn; qpn = priv->drop_qp.qpn; mlx4_qp_remove(priv->mdev->dev, &priv->drop_qp); mlx4_qp_free(priv->mdev->dev, &priv->drop_qp); mlx4_qp_release_range(priv->mdev->dev, qpn, 1); } const u32 * mlx4_en_get_rss_key(struct mlx4_en_priv *priv __unused, u16 *keylen) { static const u32 rsskey[10] = { cpu_to_be32(0xD181C62C), cpu_to_be32(0xF7F4DB5B), cpu_to_be32(0x1983A2FC), cpu_to_be32(0x943E1ADB), cpu_to_be32(0xD9389E6B), cpu_to_be32(0xD1039C2C), cpu_to_be32(0xA74499AD), cpu_to_be32(0x593D56D9), cpu_to_be32(0xF3253C06), cpu_to_be32(0x2ADC1FFC) }; if (keylen != NULL) *keylen = sizeof(rsskey); return (rsskey); } u8 mlx4_en_get_rss_mask(struct mlx4_en_priv *priv) { u8 rss_mask = (MLX4_RSS_IPV4 | MLX4_RSS_TCP_IPV4 | MLX4_RSS_IPV6 | MLX4_RSS_TCP_IPV6); if (priv->mdev->profile.udp_rss) rss_mask |= MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6; return (rss_mask); } /* Allocate rx qp's and configure them according to rss map */ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_rss_map *rss_map = &priv->rss_map; struct mlx4_qp_context context; struct mlx4_rss_context *rss_context; const u32 *key; int rss_rings; void *ptr; int i; int err = 0; int good_qps = 0; en_dbg(DRV, priv, "Configuring rss steering\n"); err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num, priv->rx_ring_num, &rss_map->base_qpn, 0); if (err) { en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num); return err; } for (i = 0; i < priv->rx_ring_num; i++) { priv->rx_ring[i]->qpn = rss_map->base_qpn + i; err = mlx4_en_config_rss_qp(priv, priv->rx_ring[i]->qpn, priv->rx_ring[i], &rss_map->state[i], &rss_map->qps[i]); if (err) goto rss_err; ++good_qps; } /* Configure RSS indirection qp */ err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp, GFP_KERNEL); if (err) { en_err(priv, "Failed to allocate RSS indirection QP\n"); goto rss_err; } rss_map->indir_qp.event = mlx4_en_sqp_event; mlx4_en_fill_qp_context(priv, 0, 0, 0, 1, priv->base_qpn, priv->rx_ring[0]->cqn, -1, &context); if (!priv->prof->rss_rings || priv->prof->rss_rings > priv->rx_ring_num) rss_rings = priv->rx_ring_num; else rss_rings = priv->prof->rss_rings; ptr = ((u8 *)&context) + offsetof(struct mlx4_qp_context, pri_path) + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH; rss_context = ptr; rss_context->base_qpn = cpu_to_be32(ilog2(rss_rings) << 24 | (rss_map->base_qpn)); rss_context->default_qpn = cpu_to_be32(rss_map->base_qpn); if (priv->mdev->profile.udp_rss) rss_context->base_qpn_udp = rss_context->default_qpn; rss_context->flags = mlx4_en_get_rss_mask(priv); rss_context->hash_fn = MLX4_RSS_HASH_TOP; key = mlx4_en_get_rss_key(priv, NULL); for (i = 0; i < 10; i++) rss_context->rss_key[i] = key[i]; err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, &context, &rss_map->indir_qp, &rss_map->indir_state); if (err) goto indir_err; return 0; indir_err: mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state, MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp); mlx4_qp_remove(mdev->dev, &rss_map->indir_qp); mlx4_qp_free(mdev->dev, &rss_map->indir_qp); rss_err: for (i = 0; i < good_qps; i++) { mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i], MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]); mlx4_qp_remove(mdev->dev, &rss_map->qps[i]); mlx4_qp_free(mdev->dev, &rss_map->qps[i]); } mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num); return err; } void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_rss_map *rss_map = &priv->rss_map; int i; mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state, MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp); mlx4_qp_remove(mdev->dev, &rss_map->indir_qp); mlx4_qp_free(mdev->dev, &rss_map->indir_qp); for (i = 0; i < priv->rx_ring_num; i++) { mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i], MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]); mlx4_qp_remove(mdev->dev, &rss_map->qps[i]); mlx4_qp_free(mdev->dev, &rss_map->qps[i]); } mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num); } diff --git a/sys/dev/mlx4/mlx4_en/mlx4_en_tx.c b/sys/dev/mlx4/mlx4_en/mlx4_en_tx.c index e07964f3225e..f3a41a15f8b6 100644 --- a/sys/dev/mlx4/mlx4_en/mlx4_en_tx.c +++ b/sys/dev/mlx4/mlx4_en/mlx4_en_tx.c @@ -1,1003 +1,1003 @@ /* * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #define LINUXKPI_PARAM_PREFIX mlx4_ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "en.h" int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring **pring, u32 size, u16 stride, int node, int queue_idx) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_tx_ring *ring; uint32_t x; int tmp; int err; ring = kzalloc_node(sizeof(struct mlx4_en_tx_ring), GFP_KERNEL, node); if (!ring) { ring = kzalloc(sizeof(struct mlx4_en_tx_ring), GFP_KERNEL); if (!ring) { en_err(priv, "Failed allocating TX ring\n"); return -ENOMEM; } } /* Create DMA descriptor TAG */ if ((err = -bus_dma_tag_create( bus_get_dma_tag(mdev->pdev->dev.bsddev), 1, /* any alignment */ 0, /* no boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MLX4_EN_TX_MAX_PAYLOAD_SIZE, /* maxsize */ MLX4_EN_TX_MAX_MBUF_FRAGS, /* nsegments */ MLX4_EN_TX_MAX_MBUF_SIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockfuncarg */ &ring->dma_tag))) goto done; ring->size = size; ring->size_mask = size - 1; ring->stride = stride; ring->inline_thold = MAX(MIN_PKT_LEN, MIN(priv->prof->inline_thold, MAX_INLINE)); mtx_init(&ring->tx_lock.m, "mlx4 tx", NULL, MTX_DEF); mtx_init(&ring->comp_lock.m, "mlx4 comp", NULL, MTX_DEF); tmp = size * sizeof(struct mlx4_en_tx_info); ring->tx_info = kzalloc_node(tmp, GFP_KERNEL, node); if (!ring->tx_info) { ring->tx_info = kzalloc(tmp, GFP_KERNEL); if (!ring->tx_info) { err = -ENOMEM; goto err_ring; } } /* Create DMA descriptor MAPs */ for (x = 0; x != size; x++) { err = -bus_dmamap_create(ring->dma_tag, 0, &ring->tx_info[x].dma_map); if (err != 0) { while (x--) { bus_dmamap_destroy(ring->dma_tag, ring->tx_info[x].dma_map); } goto err_info; } } en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n", ring->tx_info, tmp); ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE); /* Allocate HW buffers on provided NUMA node */ err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size, 2 * PAGE_SIZE); if (err) { en_err(priv, "Failed allocating hwq resources\n"); goto err_dma_map; } err = mlx4_en_map_buffer(&ring->wqres.buf); if (err) { en_err(priv, "Failed to map TX buffer\n"); goto err_hwq_res; } ring->buf = ring->wqres.buf.direct.buf; en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d " "buf_size:%d dma:%llx\n", ring, ring->buf, ring->size, ring->buf_size, (unsigned long long) ring->wqres.buf.direct.map); err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn, MLX4_RESERVE_ETH_BF_QP); if (err) { en_err(priv, "failed reserving qp for TX ring\n"); goto err_map; } err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL); if (err) { en_err(priv, "Failed allocating qp %d\n", ring->qpn); goto err_reserve; } ring->qp.event = mlx4_en_sqp_event; err = mlx4_bf_alloc(mdev->dev, &ring->bf, node); if (err) { en_dbg(DRV, priv, "working without blueflame (%d)", err); ring->bf.uar = &mdev->priv_uar; ring->bf.uar->map = mdev->uar_map; ring->bf_enabled = false; } else ring->bf_enabled = true; ring->queue_index = queue_idx; *pring = ring; return 0; err_reserve: mlx4_qp_release_range(mdev->dev, ring->qpn, 1); err_map: mlx4_en_unmap_buffer(&ring->wqres.buf); err_hwq_res: mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); err_dma_map: for (x = 0; x != size; x++) bus_dmamap_destroy(ring->dma_tag, ring->tx_info[x].dma_map); err_info: vfree(ring->tx_info); err_ring: bus_dma_tag_destroy(ring->dma_tag); done: kfree(ring); return err; } void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring **pring) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_tx_ring *ring = *pring; uint32_t x; en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn); if (ring->bf_enabled) mlx4_bf_free(mdev->dev, &ring->bf); mlx4_qp_remove(mdev->dev, &ring->qp); mlx4_qp_free(mdev->dev, &ring->qp); mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1); mlx4_en_unmap_buffer(&ring->wqres.buf); mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); for (x = 0; x != ring->size; x++) bus_dmamap_destroy(ring->dma_tag, ring->tx_info[x].dma_map); vfree(ring->tx_info); mtx_destroy(&ring->tx_lock.m); mtx_destroy(&ring->comp_lock.m); bus_dma_tag_destroy(ring->dma_tag); kfree(ring); *pring = NULL; } int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, int cq, int user_prio) { struct mlx4_en_dev *mdev = priv->mdev; int err; ring->cqn = cq; ring->prod = 0; ring->cons = 0xffffffff; ring->last_nr_txbb = 1; ring->poll_cnt = 0; memset(ring->buf, 0, ring->buf_size); ring->watchdog_time = 0; ring->qp_state = MLX4_QP_STATE_RST; ring->doorbell_qpn = ring->qp.qpn << 8; mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn, ring->cqn, user_prio, &ring->context); if (ring->bf_enabled) ring->context.usr_page = cpu_to_be32(ring->bf.uar->index); err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context, &ring->qp, &ring->qp_state); return err; } void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring) { struct mlx4_en_dev *mdev = priv->mdev; mlx4_qp_modify(mdev->dev, NULL, ring->qp_state, MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp); } static volatile struct mlx4_wqe_data_seg * mlx4_en_store_inline_lso_data(volatile struct mlx4_wqe_data_seg *dseg, struct mbuf *mb, int len, __be32 owner_bit) { uint8_t *inl = __DEVOLATILE(uint8_t *, dseg); /* copy data into place */ m_copydata(mb, 0, len, inl + 4); dseg += DIV_ROUND_UP(4 + len, DS_SIZE_ALIGNMENT); return (dseg); } static void mlx4_en_store_inline_lso_header(volatile struct mlx4_wqe_data_seg *dseg, int len, __be32 owner_bit) { } static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, u32 index, u8 owner) { struct mlx4_en_tx_info *tx_info = &ring->tx_info[index]; struct mlx4_en_tx_desc *tx_desc = (struct mlx4_en_tx_desc *) (ring->buf + (index * TXBB_SIZE)); volatile __be32 *ptr = (__be32 *)tx_desc; const __be32 stamp = cpu_to_be32(STAMP_VAL | ((u32)owner << STAMP_SHIFT)); u32 i; /* Stamp the freed descriptor */ for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) { *ptr = stamp; ptr += STAMP_DWORDS; } } static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, u32 index) { struct mlx4_en_tx_info *tx_info; struct mbuf *mb; tx_info = &ring->tx_info[index]; mb = tx_info->mb; if (mb == NULL) goto done; bus_dmamap_sync(ring->dma_tag, tx_info->dma_map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(ring->dma_tag, tx_info->dma_map); m_freem(mb); done: return (tx_info->nr_txbb); } -int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring) +int mlx4_en_free_tx_buf(struct ifnet *dev, struct mlx4_en_tx_ring *ring) { struct mlx4_en_priv *priv = netdev_priv(dev); int cnt = 0; /* Skip last polled descriptor */ ring->cons += ring->last_nr_txbb; en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n", ring->cons, ring->prod); if ((u32) (ring->prod - ring->cons) > ring->size) { en_warn(priv, "Tx consumer passed producer!\n"); return 0; } while (ring->cons != ring->prod) { ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring, ring->cons & ring->size_mask); ring->cons += ring->last_nr_txbb; cnt++; } if (cnt) en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt); return cnt; } static bool mlx4_en_tx_ring_is_full(struct mlx4_en_tx_ring *ring) { int wqs; wqs = ring->size - (ring->prod - ring->cons); return (wqs < (HEADROOM + (2 * MLX4_EN_TX_WQE_MAX_WQEBBS))); } -static int mlx4_en_process_tx_cq(struct net_device *dev, +static int mlx4_en_process_tx_cq(struct ifnet *dev, struct mlx4_en_cq *cq) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_cq *mcq = &cq->mcq; struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; struct mlx4_cqe *cqe; u16 index; u16 new_index, ring_index, stamp_index; u32 txbbs_skipped = 0; u32 txbbs_stamp = 0; u32 cons_index = mcq->cons_index; int size = cq->size; u32 size_mask = ring->size_mask; struct mlx4_cqe *buf = cq->buf; int factor = priv->cqe_factor; if (!priv->port_up) return 0; index = cons_index & size_mask; cqe = &buf[(index << factor) + factor]; ring_index = ring->cons & size_mask; stamp_index = ring_index; /* Process all completed CQEs */ while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK, cons_index & size)) { /* * make sure we read the CQE after we read the * ownership bit */ rmb(); if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_ERROR)) { en_err(priv, "CQE completed in error - vendor syndrom: 0x%x syndrom: 0x%x\n", ((struct mlx4_err_cqe *)cqe)-> vendor_err_syndrome, ((struct mlx4_err_cqe *)cqe)->syndrome); } /* Skip over last polled CQE */ new_index = be16_to_cpu(cqe->wqe_index) & size_mask; do { txbbs_skipped += ring->last_nr_txbb; ring_index = (ring_index + ring->last_nr_txbb) & size_mask; /* free next descriptor */ ring->last_nr_txbb = mlx4_en_free_tx_desc( priv, ring, ring_index); mlx4_en_stamp_wqe(priv, ring, stamp_index, !!((ring->cons + txbbs_stamp) & ring->size)); stamp_index = ring_index; txbbs_stamp = txbbs_skipped; } while (ring_index != new_index); ++cons_index; index = cons_index & size_mask; cqe = &buf[(index << factor) + factor]; } /* * To prevent CQ overflow we first update CQ consumer and only then * the ring consumer. */ mcq->cons_index = cons_index; mlx4_cq_set_ci(mcq); wmb(); ring->cons += txbbs_skipped; return (0); } void mlx4_en_tx_irq(struct mlx4_cq *mcq) { struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq); struct mlx4_en_priv *priv = netdev_priv(cq->dev); struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; if (priv->port_up == 0 || !spin_trylock(&ring->comp_lock)) return; mlx4_en_process_tx_cq(cq->dev, cq); mod_timer(&cq->timer, jiffies + 1); spin_unlock(&ring->comp_lock); } void mlx4_en_poll_tx_cq(unsigned long data) { struct mlx4_en_cq *cq = (struct mlx4_en_cq *) data; struct mlx4_en_priv *priv = netdev_priv(cq->dev); struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; u32 inflight; INC_PERF_COUNTER(priv->pstats.tx_poll); if (priv->port_up == 0) return; if (!spin_trylock(&ring->comp_lock)) { mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); return; } mlx4_en_process_tx_cq(cq->dev, cq); inflight = (u32) (ring->prod - ring->cons - ring->last_nr_txbb); /* If there are still packets in flight and the timer has not already * been scheduled by the Tx routine then schedule it here to guarantee * completion processing of these packets */ if (inflight && priv->port_up) mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); spin_unlock(&ring->comp_lock); } static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind) { struct mlx4_en_cq *cq = priv->tx_cq[tx_ind]; struct mlx4_en_tx_ring *ring = priv->tx_ring[tx_ind]; if (priv->port_up == 0) return; /* If we don't have a pending timer, set one up to catch our recent post in case the interface becomes idle */ if (!timer_pending(&cq->timer)) mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); /* Poll the CQ every mlx4_en_TX_MODER_POLL packets */ if ((++ring->poll_cnt & (MLX4_EN_TX_POLL_MODER - 1)) == 0) if (spin_trylock(&ring->comp_lock)) { mlx4_en_process_tx_cq(priv->dev, cq); spin_unlock(&ring->comp_lock); } } static u16 mlx4_en_get_inline_hdr_size(struct mlx4_en_tx_ring *ring, struct mbuf *mb) { u16 retval; /* only copy from first fragment, if possible */ retval = MIN(ring->inline_thold, mb->m_len); /* check for too little data */ if (unlikely(retval < MIN_PKT_LEN)) retval = MIN(ring->inline_thold, mb->m_pkthdr.len); return (retval); } static int mlx4_en_get_header_size(struct mbuf *mb) { struct ether_vlan_header *eh; struct tcphdr *th; struct ip *ip; int ip_hlen, tcp_hlen; struct ip6_hdr *ip6; uint16_t eth_type; int eth_hdr_len; eh = mtod(mb, struct ether_vlan_header *); if (mb->m_len < ETHER_HDR_LEN) return (0); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { eth_type = ntohs(eh->evl_proto); eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { eth_type = ntohs(eh->evl_encap_proto); eth_hdr_len = ETHER_HDR_LEN; } if (mb->m_len < eth_hdr_len) return (0); switch (eth_type) { case ETHERTYPE_IP: ip = (struct ip *)(mb->m_data + eth_hdr_len); if (mb->m_len < eth_hdr_len + sizeof(*ip)) return (0); if (ip->ip_p != IPPROTO_TCP) return (0); ip_hlen = ip->ip_hl << 2; eth_hdr_len += ip_hlen; break; case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mb->m_data + eth_hdr_len); if (mb->m_len < eth_hdr_len + sizeof(*ip6)) return (0); if (ip6->ip6_nxt != IPPROTO_TCP) return (0); eth_hdr_len += sizeof(*ip6); break; default: return (0); } if (mb->m_len < eth_hdr_len + sizeof(*th)) return (0); th = (struct tcphdr *)(mb->m_data + eth_hdr_len); tcp_hlen = th->th_off << 2; eth_hdr_len += tcp_hlen; if (mb->m_len < eth_hdr_len) return (0); return (eth_hdr_len); } static volatile struct mlx4_wqe_data_seg * mlx4_en_store_inline_data(volatile struct mlx4_wqe_data_seg *dseg, struct mbuf *mb, int len, __be32 owner_bit) { uint8_t *inl = __DEVOLATILE(uint8_t *, dseg); const int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - 4; if (unlikely(len < MIN_PKT_LEN)) { m_copydata(mb, 0, len, inl + 4); memset(inl + 4 + len, 0, MIN_PKT_LEN - len); dseg += DIV_ROUND_UP(4 + MIN_PKT_LEN, DS_SIZE_ALIGNMENT); } else if (len <= spc) { m_copydata(mb, 0, len, inl + 4); dseg += DIV_ROUND_UP(4 + len, DS_SIZE_ALIGNMENT); } else { m_copydata(mb, 0, spc, inl + 4); m_copydata(mb, spc, len - spc, inl + 8 + spc); dseg += DIV_ROUND_UP(8 + len, DS_SIZE_ALIGNMENT); } return (dseg); } static void mlx4_en_store_inline_header(volatile struct mlx4_wqe_data_seg *dseg, int len, __be32 owner_bit) { uint8_t *inl = __DEVOLATILE(uint8_t *, dseg); const int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - 4; if (unlikely(len < MIN_PKT_LEN)) { *(volatile uint32_t *)inl = SET_BYTE_COUNT((1U << 31) | MIN_PKT_LEN); } else if (len <= spc) { *(volatile uint32_t *)inl = SET_BYTE_COUNT((1U << 31) | len); } else { *(volatile uint32_t *)(inl + 4 + spc) = SET_BYTE_COUNT((1U << 31) | (len - spc)); wmb(); *(volatile uint32_t *)inl = SET_BYTE_COUNT((1U << 31) | spc); } } static uint32_t hashrandom; static void hashrandom_init(void *arg) { /* * It is assumed that the random subsystem has been * initialized when this function is called: */ hashrandom = m_ether_tcpip_hash_init(); } SYSINIT(hashrandom_init, SI_SUB_RANDOM, SI_ORDER_ANY, &hashrandom_init, NULL); -u16 mlx4_en_select_queue(struct net_device *dev, struct mbuf *mb) +u16 mlx4_en_select_queue(struct ifnet *dev, struct mbuf *mb) { struct mlx4_en_priv *priv = netdev_priv(dev); u32 rings_p_up = priv->num_tx_rings_p_up; u32 up = 0; u32 queue_index; #if (MLX4_EN_NUM_UP > 1) /* Obtain VLAN information if present */ if (mb->m_flags & M_VLANTAG) { u32 vlan_tag = mb->m_pkthdr.ether_vtag; up = (vlan_tag >> 13) % MLX4_EN_NUM_UP; } #endif queue_index = m_ether_tcpip_hash(MBUF_HASHFLAG_L3 | MBUF_HASHFLAG_L4, mb, hashrandom); return ((queue_index % rings_p_up) + (up * rings_p_up)); } static void mlx4_bf_copy(void __iomem *dst, volatile unsigned long *src, unsigned bytecnt) { __iowrite64_copy(dst, __DEVOLATILE(void *, src), bytecnt / 8); } int mlx4_en_xmit(struct mlx4_en_priv *priv, int tx_ind, struct mbuf **mbp) { enum { DS_FACT = TXBB_SIZE / DS_SIZE_ALIGNMENT, CTRL_FLAGS = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICITED), }; bus_dma_segment_t segs[MLX4_EN_TX_MAX_MBUF_FRAGS]; volatile struct mlx4_wqe_data_seg *dseg; volatile struct mlx4_wqe_data_seg *dseg_inline; volatile struct mlx4_en_tx_desc *tx_desc; struct mlx4_en_tx_ring *ring = priv->tx_ring[tx_ind]; struct ifnet *ifp = priv->dev; struct mlx4_en_tx_info *tx_info; struct mbuf *mb = *mbp; struct mbuf *m; __be32 owner_bit; int nr_segs; int pad; int err; u32 bf_size; u32 bf_prod; u32 opcode; u16 index; u16 ds_cnt; u16 ihs; if (unlikely(!priv->port_up)) { err = EINVAL; goto tx_drop; } /* check if TX ring is full */ if (unlikely(mlx4_en_tx_ring_is_full(ring))) { /* Use interrupts to find out when queue opened */ mlx4_en_arm_cq(priv, priv->tx_cq[tx_ind]); return (ENOBUFS); } /* sanity check we are not wrapping around */ KASSERT(((~ring->prod) & ring->size_mask) >= (MLX4_EN_TX_WQE_MAX_WQEBBS - 1), ("Wrapping around TX ring")); /* Track current inflight packets for performance analysis */ AVG_PERF_COUNTER(priv->pstats.inflight_avg, (u32) (ring->prod - ring->cons - 1)); /* Track current mbuf packet header length */ AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, mb->m_pkthdr.len); /* Grab an index and try to transmit packet */ owner_bit = (ring->prod & ring->size) ? cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0; index = ring->prod & ring->size_mask; tx_desc = (volatile struct mlx4_en_tx_desc *) (ring->buf + index * TXBB_SIZE); tx_info = &ring->tx_info[index]; dseg = &tx_desc->data; /* send a copy of the frame to the BPF listener, if any */ if (ifp != NULL && ifp->if_bpf != NULL) ETHER_BPF_MTAP(ifp, mb); /* get default flags */ tx_desc->ctrl.srcrb_flags = CTRL_FLAGS; if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM); if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM); /* do statistics */ if (likely(tx_desc->ctrl.srcrb_flags != CTRL_FLAGS)) { priv->port_stats.tx_chksum_offload++; ring->tx_csum++; } /* check for VLAN tag */ if (mb->m_flags & M_VLANTAG) { tx_desc->ctrl.vlan_tag = cpu_to_be16(mb->m_pkthdr.ether_vtag); tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN; } else { tx_desc->ctrl.vlan_tag = 0; tx_desc->ctrl.ins_vlan = 0; } if (unlikely(mlx4_is_mfunc(priv->mdev->dev) || priv->validate_loopback)) { /* * Copy destination MAC address to WQE. This allows * loopback in eSwitch, so that VFs and PF can * communicate with each other: */ m_copydata(mb, 0, 2, __DEVOLATILE(void *, &tx_desc->ctrl.srcrb_flags16[0])); m_copydata(mb, 2, 4, __DEVOLATILE(void *, &tx_desc->ctrl.imm)); } else { /* clear immediate field */ tx_desc->ctrl.imm = 0; } /* Handle LSO (TSO) packets */ if (mb->m_pkthdr.csum_flags & CSUM_TSO) { u32 payload_len; u32 mss = mb->m_pkthdr.tso_segsz; u32 num_pkts; opcode = cpu_to_be32(MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR) | owner_bit; ihs = mlx4_en_get_header_size(mb); if (unlikely(ihs > MAX_INLINE)) { ring->oversized_packets++; err = EINVAL; goto tx_drop; } tx_desc->lso.mss_hdr_size = cpu_to_be32((mss << 16) | ihs); payload_len = mb->m_pkthdr.len - ihs; if (unlikely(payload_len == 0)) num_pkts = 1; else num_pkts = DIV_ROUND_UP(payload_len, mss); ring->bytes += payload_len + (num_pkts * ihs); ring->packets += num_pkts; ring->tso_packets++; /* store pointer to inline header */ dseg_inline = dseg; /* copy data inline */ dseg = mlx4_en_store_inline_lso_data(dseg, mb, ihs, owner_bit); } else { opcode = cpu_to_be32(MLX4_OPCODE_SEND) | owner_bit; ihs = mlx4_en_get_inline_hdr_size(ring, mb); ring->bytes += max_t (unsigned int, mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN); ring->packets++; /* store pointer to inline header */ dseg_inline = dseg; /* copy data inline */ dseg = mlx4_en_store_inline_data(dseg, mb, ihs, owner_bit); } m_adj(mb, ihs); err = bus_dmamap_load_mbuf_sg(ring->dma_tag, tx_info->dma_map, mb, segs, &nr_segs, BUS_DMA_NOWAIT); if (unlikely(err == EFBIG)) { /* Too many mbuf fragments */ ring->defrag_attempts++; m = m_defrag(mb, M_NOWAIT); if (m == NULL) { ring->oversized_packets++; goto tx_drop; } mb = m; /* Try again */ err = bus_dmamap_load_mbuf_sg(ring->dma_tag, tx_info->dma_map, mb, segs, &nr_segs, BUS_DMA_NOWAIT); } /* catch errors */ if (unlikely(err != 0)) { ring->oversized_packets++; goto tx_drop; } /* If there were no errors and we didn't load anything, don't sync. */ if (nr_segs != 0) { /* make sure all mbuf data is written to RAM */ bus_dmamap_sync(ring->dma_tag, tx_info->dma_map, BUS_DMASYNC_PREWRITE); } else { /* All data was inlined, free the mbuf. */ bus_dmamap_unload(ring->dma_tag, tx_info->dma_map); m_freem(mb); mb = NULL; } /* compute number of DS needed */ ds_cnt = (dseg - ((volatile struct mlx4_wqe_data_seg *)tx_desc)) + nr_segs; /* * Check if the next request can wrap around and fill the end * of the current request with zero immediate data: */ pad = DIV_ROUND_UP(ds_cnt, DS_FACT); pad = (~(ring->prod + pad)) & ring->size_mask; if (unlikely(pad < (MLX4_EN_TX_WQE_MAX_WQEBBS - 1))) { /* * Compute the least number of DS blocks we need to * pad in order to achieve a TX ring wraparound: */ pad = (DS_FACT * (pad + 1)); } else { /* * The hardware will automatically jump to the next * TXBB. No need for padding. */ pad = 0; } /* compute total number of DS blocks */ ds_cnt += pad; /* * When modifying this code, please ensure that the following * computation is always less than or equal to 0x3F: * * ((MLX4_EN_TX_WQE_MAX_WQEBBS - 1) * DS_FACT) + * (MLX4_EN_TX_WQE_MAX_WQEBBS * DS_FACT) * * Else the "ds_cnt" variable can become too big. */ tx_desc->ctrl.fence_size = (ds_cnt & 0x3f); /* store pointer to mbuf */ tx_info->mb = mb; tx_info->nr_txbb = DIV_ROUND_UP(ds_cnt, DS_FACT); bf_size = ds_cnt * DS_SIZE_ALIGNMENT; bf_prod = ring->prod; /* compute end of "dseg" array */ dseg += nr_segs + pad; /* pad using zero immediate dseg */ while (pad--) { dseg--; dseg->addr = 0; dseg->lkey = 0; wmb(); dseg->byte_count = SET_BYTE_COUNT((1U << 31)|0); } /* fill segment list */ while (nr_segs--) { if (unlikely(segs[nr_segs].ds_len == 0)) { dseg--; dseg->addr = 0; dseg->lkey = 0; wmb(); dseg->byte_count = SET_BYTE_COUNT((1U << 31)|0); } else { dseg--; dseg->addr = cpu_to_be64((uint64_t)segs[nr_segs].ds_addr); dseg->lkey = cpu_to_be32(priv->mdev->mr.key); wmb(); dseg->byte_count = SET_BYTE_COUNT((uint32_t)segs[nr_segs].ds_len); } } wmb(); /* write owner bits in reverse order */ if ((opcode & cpu_to_be32(0x1F)) == cpu_to_be32(MLX4_OPCODE_LSO)) mlx4_en_store_inline_lso_header(dseg_inline, ihs, owner_bit); else mlx4_en_store_inline_header(dseg_inline, ihs, owner_bit); /* update producer counter */ ring->prod += tx_info->nr_txbb; if (ring->bf_enabled && bf_size <= MAX_BF && (tx_desc->ctrl.ins_vlan != MLX4_WQE_CTRL_INS_CVLAN)) { /* store doorbell number */ *(volatile __be32 *) (&tx_desc->ctrl.vlan_tag) |= cpu_to_be32(ring->doorbell_qpn); /* or in producer number for this WQE */ opcode |= cpu_to_be32((bf_prod & 0xffff) << 8); /* * Ensure the new descriptor hits memory before * setting ownership of this descriptor to HW: */ wmb(); tx_desc->ctrl.owner_opcode = opcode; wmb(); mlx4_bf_copy(((u8 *)ring->bf.reg) + ring->bf.offset, (volatile unsigned long *) &tx_desc->ctrl, bf_size); wmb(); ring->bf.offset ^= ring->bf.buf_size; } else { /* * Ensure the new descriptor hits memory before * setting ownership of this descriptor to HW: */ wmb(); tx_desc->ctrl.owner_opcode = opcode; wmb(); writel(cpu_to_be32(ring->doorbell_qpn), ((u8 *)ring->bf.uar->map) + MLX4_SEND_DOORBELL); } return (0); tx_drop: *mbp = NULL; m_freem(mb); return (err); } static int mlx4_en_transmit_locked(struct ifnet *ifp, int tx_ind, struct mbuf *mb) { struct mlx4_en_priv *priv = netdev_priv(ifp); struct mlx4_en_tx_ring *ring = priv->tx_ring[tx_ind]; int err = 0; if (unlikely((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || READ_ONCE(priv->port_up) == 0)) { m_freem(mb); return (ENETDOWN); } if (mlx4_en_xmit(priv, tx_ind, &mb) != 0) { /* NOTE: m_freem() is NULL safe */ m_freem(mb); err = ENOBUFS; if (ring->watchdog_time == 0) ring->watchdog_time = ticks + MLX4_EN_WATCHDOG_TIMEOUT; } else { ring->watchdog_time = 0; } return (err); } int mlx4_en_transmit(struct ifnet *dev, struct mbuf *m) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_tx_ring *ring; int i, err = 0; if (priv->port_up == 0) { m_freem(m); return (ENETDOWN); } /* Compute which queue to use */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { i = (m->m_pkthdr.flowid % 128) % priv->tx_ring_num; } else { i = mlx4_en_select_queue(dev, m); } ring = priv->tx_ring[i]; spin_lock(&ring->tx_lock); err = mlx4_en_transmit_locked(dev, i, m); spin_unlock(&ring->tx_lock); /* Poll CQ here */ mlx4_en_xmit_poll(priv, i); #if __FreeBSD_version >= 1100000 if (unlikely(err != 0)) if_inc_counter(dev, IFCOUNTER_IQDROPS, 1); #endif return (err); } /* * Flush ring buffers. */ void mlx4_en_qflush(struct ifnet *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); if (priv->port_up == 0) return; if_qflush(dev); } diff --git a/sys/dev/mlx4/mlx4_ib/mlx4_ib.h b/sys/dev/mlx4/mlx4_ib/mlx4_ib.h index d8926110a7a1..100a06b75af7 100644 --- a/sys/dev/mlx4/mlx4_ib/mlx4_ib.h +++ b/sys/dev/mlx4/mlx4_ib/mlx4_ib.h @@ -1,899 +1,899 @@ /* * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved. * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef MLX4_IB_H #define MLX4_IB_H #include #include #include #include #include #include #include #include #include #include #include #include #include #define MLX4_IB_DRV_NAME "mlx4_ib" #ifdef pr_fmt #undef pr_fmt #endif #define pr_fmt(fmt) "<" MLX4_IB_DRV_NAME "> %s: " fmt, __func__ #define mlx4_ib_warn(ibdev, format, arg...) \ dev_warn((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg) enum { MLX4_IB_SQ_MIN_WQE_SHIFT = 6, MLX4_IB_MAX_HEADROOM = 2048 }; #define MLX4_IB_SQ_HEADROOM(shift) ((MLX4_IB_MAX_HEADROOM >> (shift)) + 1) #define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT)) /*module param to indicate if SM assigns the alias_GUID*/ extern int mlx4_ib_sm_guid_assign; extern struct proc_dir_entry *mlx4_mrs_dir_entry; #define MLX4_IB_UC_STEER_QPN_ALIGN 1 #define MLX4_IB_UC_MAX_NUM_QPS 256 enum hw_bar_type { HW_BAR_BF, HW_BAR_DB, HW_BAR_CLOCK, HW_BAR_COUNT }; struct mlx4_ib_vma_private_data { struct vm_area_struct *vma; }; struct mlx4_ib_ucontext { struct ib_ucontext ibucontext; struct mlx4_uar uar; struct list_head db_page_list; struct mutex db_page_mutex; struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT]; }; struct mlx4_ib_pd { struct ib_pd ibpd; u32 pdn; }; struct mlx4_ib_xrcd { struct ib_xrcd ibxrcd; u32 xrcdn; struct ib_pd *pd; struct ib_cq *cq; }; struct mlx4_ib_cq_buf { struct mlx4_buf buf; struct mlx4_mtt mtt; int entry_size; }; struct mlx4_ib_cq_resize { struct mlx4_ib_cq_buf buf; int cqe; }; struct mlx4_ib_cq { struct ib_cq ibcq; struct mlx4_cq mcq; struct mlx4_ib_cq_buf buf; struct mlx4_ib_cq_resize *resize_buf; struct mlx4_db db; spinlock_t lock; struct mutex resize_mutex; struct ib_umem *umem; struct ib_umem *resize_umem; int create_flags; /* List of qps that it serves.*/ struct list_head send_qp_list; struct list_head recv_qp_list; }; #define MLX4_MR_PAGES_ALIGN 0x40 struct mlx4_ib_mr { struct ib_mr ibmr; __be64 *pages; dma_addr_t page_map; u32 npages; u32 max_pages; struct mlx4_mr mmr; struct ib_umem *umem; size_t page_map_size; }; struct mlx4_ib_mw { struct ib_mw ibmw; struct mlx4_mw mmw; }; struct mlx4_ib_fmr { struct ib_fmr ibfmr; struct mlx4_fmr mfmr; }; #define MAX_REGS_PER_FLOW 2 struct mlx4_flow_reg_id { u64 id; u64 mirror; }; struct mlx4_ib_flow { struct ib_flow ibflow; /* translating DMFS verbs sniffer rule to FW API requires two reg IDs */ struct mlx4_flow_reg_id reg_id[MAX_REGS_PER_FLOW]; }; struct mlx4_ib_wq { u64 *wrid; spinlock_t lock; int wqe_cnt; int max_post; int max_gs; int offset; int wqe_shift; unsigned head; unsigned tail; }; enum { MLX4_IB_QP_CREATE_ROCE_V2_GSI = IB_QP_CREATE_RESERVED_START }; enum mlx4_ib_qp_flags { MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO, /* Mellanox specific flags start from IB_QP_CREATE_RESERVED_START */ MLX4_IB_ROCE_V2_GSI_QP = MLX4_IB_QP_CREATE_ROCE_V2_GSI, MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, MLX4_IB_SRIOV_SQP = 1U << 31, }; struct mlx4_ib_gid_entry { struct list_head list; union ib_gid gid; int added; u8 port; }; enum mlx4_ib_qp_type { /* * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries * here (and in that order) since the MAD layer uses them as * indices into a 2-entry table. */ MLX4_IB_QPT_SMI = IB_QPT_SMI, MLX4_IB_QPT_GSI = IB_QPT_GSI, MLX4_IB_QPT_RC = IB_QPT_RC, MLX4_IB_QPT_UC = IB_QPT_UC, MLX4_IB_QPT_UD = IB_QPT_UD, MLX4_IB_QPT_RAW_IPV6 = IB_QPT_RAW_IPV6, MLX4_IB_QPT_RAW_ETHERTYPE = IB_QPT_RAW_ETHERTYPE, MLX4_IB_QPT_RAW_PACKET = IB_QPT_RAW_PACKET, MLX4_IB_QPT_XRC_INI = IB_QPT_XRC_INI, MLX4_IB_QPT_XRC_TGT = IB_QPT_XRC_TGT, MLX4_IB_QPT_PROXY_SMI_OWNER = 1 << 16, MLX4_IB_QPT_PROXY_SMI = 1 << 17, MLX4_IB_QPT_PROXY_GSI = 1 << 18, MLX4_IB_QPT_TUN_SMI_OWNER = 1 << 19, MLX4_IB_QPT_TUN_SMI = 1 << 20, MLX4_IB_QPT_TUN_GSI = 1 << 21, }; #define MLX4_IB_QPT_ANY_SRIOV (MLX4_IB_QPT_PROXY_SMI_OWNER | \ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER | \ MLX4_IB_QPT_TUN_SMI | MLX4_IB_QPT_TUN_GSI) enum mlx4_ib_mad_ifc_flags { MLX4_MAD_IFC_IGNORE_MKEY = 1, MLX4_MAD_IFC_IGNORE_BKEY = 2, MLX4_MAD_IFC_IGNORE_KEYS = (MLX4_MAD_IFC_IGNORE_MKEY | MLX4_MAD_IFC_IGNORE_BKEY), MLX4_MAD_IFC_NET_VIEW = 4, }; enum { MLX4_NUM_TUNNEL_BUFS = 256, }; struct mlx4_ib_tunnel_header { struct mlx4_av av; __be32 remote_qpn; __be32 qkey; __be16 vlan; u8 mac[6]; __be16 pkey_index; u8 reserved[6]; }; struct mlx4_ib_buf { void *addr; dma_addr_t map; }; struct mlx4_rcv_tunnel_hdr { __be32 flags_src_qp; /* flags[6:5] is defined for VLANs: * 0x0 - no vlan was in the packet * 0x01 - C-VLAN was in the packet */ u8 g_ml_path; /* gid bit stands for ipv6/4 header in RoCE */ u8 reserved; __be16 pkey_index; __be16 sl_vid; __be16 slid_mac_47_32; __be32 mac_31_0; }; struct mlx4_ib_proxy_sqp_hdr { struct ib_grh grh; struct mlx4_rcv_tunnel_hdr tun; } __packed; struct mlx4_roce_smac_vlan_info { u64 smac; int smac_index; int smac_port; u64 candidate_smac; int candidate_smac_index; int candidate_smac_port; u16 vid; int vlan_index; int vlan_port; u16 candidate_vid; int candidate_vlan_index; int candidate_vlan_port; int update_vid; }; struct mlx4_ib_qp { struct ib_qp ibqp; struct mlx4_qp mqp; struct mlx4_buf buf; struct mlx4_db db; struct mlx4_ib_wq rq; u32 doorbell_qpn; __be32 sq_signal_bits; unsigned sq_next_wqe; int sq_max_wqes_per_wr; int sq_spare_wqes; struct mlx4_ib_wq sq; enum mlx4_ib_qp_type mlx4_ib_qp_type; struct ib_umem *umem; struct mlx4_mtt mtt; int buf_size; struct mutex mutex; u16 xrcdn; u32 flags; u8 port; u8 alt_port; u8 atomic_rd_en; u8 resp_depth; u8 sq_no_prefetch; u8 state; int mlx_type; struct list_head gid_list; struct list_head steering_rules; struct mlx4_ib_buf *sqp_proxy_rcv; struct mlx4_roce_smac_vlan_info pri; struct mlx4_roce_smac_vlan_info alt; u64 reg_id; struct list_head qps_list; struct list_head cq_recv_list; struct list_head cq_send_list; struct counter_index *counter_index; }; struct mlx4_ib_srq { struct ib_srq ibsrq; struct mlx4_srq msrq; struct mlx4_buf buf; struct mlx4_db db; u64 *wrid; spinlock_t lock; int head; int tail; u16 wqe_ctr; struct ib_umem *umem; struct mlx4_mtt mtt; struct mutex mutex; }; struct mlx4_ib_ah { struct ib_ah ibah; union mlx4_ext_av av; }; /****************************************/ /* alias guid support */ /****************************************/ #define NUM_PORT_ALIAS_GUID 2 #define NUM_ALIAS_GUID_IN_REC 8 #define NUM_ALIAS_GUID_REC_IN_PORT 16 #define GUID_REC_SIZE 8 #define NUM_ALIAS_GUID_PER_PORT 128 #define MLX4_NOT_SET_GUID (0x00LL) #define MLX4_GUID_FOR_DELETE_VAL (~(0x00LL)) enum mlx4_guid_alias_rec_status { MLX4_GUID_INFO_STATUS_IDLE, MLX4_GUID_INFO_STATUS_SET, }; #define GUID_STATE_NEED_PORT_INIT 0x01 enum mlx4_guid_alias_rec_method { MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET, MLX4_GUID_INFO_RECORD_DELETE = IB_SA_METHOD_DELETE, }; struct mlx4_sriov_alias_guid_info_rec_det { u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC]; ib_sa_comp_mask guid_indexes; /*indicates what from the 8 records are valid*/ enum mlx4_guid_alias_rec_status status; /*indicates the administraively status of the record.*/ unsigned int guids_retry_schedule[NUM_ALIAS_GUID_IN_REC]; u64 time_to_run; }; struct mlx4_sriov_alias_guid_port_rec_det { struct mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT]; struct workqueue_struct *wq; struct delayed_work alias_guid_work; u8 port; u32 state_flags; struct mlx4_sriov_alias_guid *parent; struct list_head cb_list; }; struct mlx4_sriov_alias_guid { struct mlx4_sriov_alias_guid_port_rec_det ports_guid[MLX4_MAX_PORTS]; spinlock_t ag_work_lock; struct ib_sa_client *sa_client; }; struct mlx4_ib_demux_work { struct work_struct work; struct mlx4_ib_dev *dev; int slave; int do_init; u8 port; }; struct mlx4_ib_tun_tx_buf { struct mlx4_ib_buf buf; struct ib_ah *ah; }; struct mlx4_ib_demux_pv_qp { struct ib_qp *qp; enum ib_qp_type proxy_qpt; struct mlx4_ib_buf *ring; struct mlx4_ib_tun_tx_buf *tx_ring; spinlock_t tx_lock; unsigned tx_ix_head; unsigned tx_ix_tail; }; enum mlx4_ib_demux_pv_state { DEMUX_PV_STATE_DOWN, DEMUX_PV_STATE_STARTING, DEMUX_PV_STATE_ACTIVE, DEMUX_PV_STATE_DOWNING, }; struct mlx4_ib_demux_pv_ctx { int port; int slave; enum mlx4_ib_demux_pv_state state; int has_smi; struct ib_device *ib_dev; struct ib_cq *cq; struct ib_pd *pd; struct work_struct work; struct workqueue_struct *wq; struct mlx4_ib_demux_pv_qp qp[2]; }; struct mlx4_ib_demux_ctx { struct ib_device *ib_dev; int port; struct workqueue_struct *wq; struct workqueue_struct *ud_wq; spinlock_t ud_lock; atomic64_t subnet_prefix; __be64 guid_cache[128]; struct mlx4_ib_dev *dev; /* the following lock protects both mcg_table and mcg_mgid0_list */ struct mutex mcg_table_lock; struct rb_root mcg_table; struct list_head mcg_mgid0_list; struct workqueue_struct *mcg_wq; struct mlx4_ib_demux_pv_ctx **tun; atomic_t tid; int flushing; /* flushing the work queue */ }; struct mlx4_ib_sriov { struct mlx4_ib_demux_ctx demux[MLX4_MAX_PORTS]; struct mlx4_ib_demux_pv_ctx *sqps[MLX4_MAX_PORTS]; /* when using this spinlock you should use "irq" because * it may be called from interrupt context.*/ spinlock_t going_down_lock; int is_going_down; struct mlx4_sriov_alias_guid alias_guid; /* CM paravirtualization fields */ struct list_head cm_list; spinlock_t id_map_lock; struct rb_root sl_id_map; struct idr pv_id_table; }; struct gid_cache_context { int real_index; int refcount; }; struct gid_entry { union ib_gid gid; enum ib_gid_type gid_type; struct gid_cache_context *ctx; }; struct mlx4_port_gid_table { struct gid_entry gids[MLX4_MAX_PORT_GIDS]; }; struct mlx4_ib_iboe { spinlock_t lock; - struct net_device *netdevs[MLX4_MAX_PORTS]; + struct ifnet *netdevs[MLX4_MAX_PORTS]; atomic64_t mac[MLX4_MAX_PORTS]; struct notifier_block nb; struct mlx4_port_gid_table gids[MLX4_MAX_PORTS]; }; struct pkey_mgt { u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; u16 phys_pkey_cache[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; struct list_head pkey_port_list[MLX4_MFUNC_MAX]; struct kobject *device_parent[MLX4_MFUNC_MAX]; }; struct mlx4_ib_iov_sysfs_attr { void *ctx; struct kobject *kobj; unsigned long data; u32 entry_num; char name[15]; struct device_attribute dentry; struct device *dev; }; struct mlx4_ib_iov_sysfs_attr_ar { struct mlx4_ib_iov_sysfs_attr dentries[3 * NUM_ALIAS_GUID_PER_PORT + 1]; }; struct mlx4_ib_iov_port { char name[100]; u8 num; struct mlx4_ib_dev *dev; struct list_head list; struct mlx4_ib_iov_sysfs_attr_ar *dentr_ar; struct ib_port_attr attr; struct kobject *cur_port; struct kobject *admin_alias_parent; struct kobject *gids_parent; struct kobject *pkeys_parent; struct kobject *mcgs_parent; struct mlx4_ib_iov_sysfs_attr mcg_dentry; }; struct counter_index { struct list_head list; u32 index; u8 allocated; }; struct mlx4_ib_counters { struct list_head counters_list; struct mutex mutex; /* mutex for accessing counters list */ u32 default_counter; }; #define MLX4_DIAG_COUNTERS_TYPES 2 struct mlx4_ib_diag_counters { const char **name; u32 *offset; u32 num_counters; }; struct mlx4_ib_dev { struct ib_device ib_dev; struct mlx4_dev *dev; int num_ports; void __iomem *uar_map; struct mlx4_uar priv_uar; u32 priv_pdn; MLX4_DECLARE_DOORBELL_LOCK(uar_lock); struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2]; struct ib_ah *sm_ah[MLX4_MAX_PORTS]; spinlock_t sm_lock; atomic64_t sl2vl[MLX4_MAX_PORTS]; struct mlx4_ib_sriov sriov; struct mutex cap_mask_mutex; bool ib_active; struct mlx4_ib_iboe iboe; struct mlx4_ib_counters counters_table[MLX4_MAX_PORTS]; int *eq_table; struct kobject *iov_parent; struct kobject *ports_parent; struct kobject *dev_ports_parent[MLX4_MFUNC_MAX]; struct mlx4_ib_iov_port iov_ports[MLX4_MAX_PORTS]; struct pkey_mgt pkeys; unsigned long *ib_uc_qpns_bitmap; int steer_qpn_count; int steer_qpn_base; int steering_support; struct mlx4_ib_qp *qp1_proxy[MLX4_MAX_PORTS]; /* lock when destroying qp1_proxy and getting netdev events */ struct mutex qp1_proxy_lock[MLX4_MAX_PORTS]; u8 bond_next_port; /* protect resources needed as part of reset flow */ spinlock_t reset_flow_resource_lock; struct list_head qp_list; struct mlx4_ib_diag_counters diag_counters[MLX4_DIAG_COUNTERS_TYPES]; }; struct ib_event_work { struct work_struct work; struct mlx4_ib_dev *ib_dev; struct mlx4_eqe ib_eqe; int port; }; struct mlx4_ib_qp_tunnel_init_attr { struct ib_qp_init_attr init_attr; int slave; enum ib_qp_type proxy_qp_type; u8 port; }; struct mlx4_uverbs_ex_query_device { __u32 comp_mask; __u32 reserved; }; enum query_device_resp_mask { QUERY_DEVICE_RESP_MASK_TIMESTAMP = 1UL << 0, }; struct mlx4_uverbs_ex_query_device_resp { __u32 comp_mask; __u32 response_length; __u64 hca_core_clock_offset; }; static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) { return container_of(ibdev, struct mlx4_ib_dev, ib_dev); } static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) { return container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext); } static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd) { return container_of(ibpd, struct mlx4_ib_pd, ibpd); } static inline struct mlx4_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) { return container_of(ibxrcd, struct mlx4_ib_xrcd, ibxrcd); } static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq) { return container_of(ibcq, struct mlx4_ib_cq, ibcq); } static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq) { return container_of(mcq, struct mlx4_ib_cq, mcq); } static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr) { return container_of(ibmr, struct mlx4_ib_mr, ibmr); } static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw) { return container_of(ibmw, struct mlx4_ib_mw, ibmw); } static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) { return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr); } static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow) { return container_of(ibflow, struct mlx4_ib_flow, ibflow); } static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp) { return container_of(ibqp, struct mlx4_ib_qp, ibqp); } static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp) { return container_of(mqp, struct mlx4_ib_qp, mqp); } static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq) { return container_of(ibsrq, struct mlx4_ib_srq, ibsrq); } static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq) { return container_of(msrq, struct mlx4_ib_srq, msrq); } static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah) { return container_of(ibah, struct mlx4_ib_ah, ibah); } static inline u8 mlx4_ib_bond_next_port(struct mlx4_ib_dev *dev) { dev->bond_next_port = (dev->bond_next_port + 1) % dev->num_ports; return dev->bond_next_port + 1; } int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev); void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev); int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, struct mlx4_db *db); void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db); struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc); int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, struct ib_umem *umem); struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); int mlx4_ib_dereg_mr(struct ib_mr *mr); struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata); int mlx4_ib_dealloc_mw(struct ib_mw *mw); struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata); int mlx4_ib_destroy_cq(struct ib_cq *cq); int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, struct ib_udata *udata); int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); int mlx4_ib_destroy_ah(struct ib_ah *ah); struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *init_attr, struct ib_udata *udata); int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); int mlx4_ib_destroy_srq(struct ib_srq *srq); void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index); int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); int mlx4_ib_destroy_qp(struct ib_qp *qp); int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr); int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, int port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const void *in_mad, void *response_mad); int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *in, size_t in_mad_size, struct ib_mad_hdr *out, size_t *out_mad_size, u16 *out_mad_pkey_index); int mlx4_ib_mad_init(struct mlx4_ib_dev *dev); void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev); struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags, struct ib_fmr_attr *fmr_attr); int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, u64 iova); int mlx4_ib_unmap_fmr(struct list_head *fmr_list); int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr); int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view); int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey, int netw_view); int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid, int netw_view); static inline bool mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) { u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3; if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET) return true; return !!(ah->av.ib.g_slid & 0x80); } int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx); void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq); void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave); int mlx4_ib_mcg_init(void); void mlx4_ib_mcg_destroy(void); int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid); int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave, struct ib_sa_mad *sa_mad); int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, struct ib_sa_mad *mad); int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid); void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, enum ib_event_type type); void mlx4_ib_tunnels_update_work(struct work_struct *work); int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type qpt, struct ib_wc *wc, struct ib_grh *grh, struct ib_mad *mad); int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr, u8 *s_mac, u16 vlan_id, struct ib_mad *mad); __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx); int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, struct ib_mad *mad); int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, struct ib_mad *mad); void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev); void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave_id); /* alias guid support */ void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port); int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev); void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev); void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port); void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, int block_num, u8 port_num, u8 *p_data); void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num, u8 port_num, u8 *p_data); int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, struct attribute *attr); void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, struct attribute *attr); ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index); void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave, int port, int slave_init); int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ; void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device); __be64 mlx4_ib_gen_node_guid(void); int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn); void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count); int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, int is_attach); int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_pd *pd, struct ib_udata *udata); int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, u8 port_num, int index); void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev, int port); void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port); #endif /* MLX4_IB_H */ diff --git a/sys/dev/mlx4/mlx4_ib/mlx4_ib_main.c b/sys/dev/mlx4/mlx4_ib/mlx4_ib_main.c index b050272e49d0..62956607974b 100644 --- a/sys/dev/mlx4/mlx4_ib/mlx4_ib_main.c +++ b/sys/dev/mlx4/mlx4_ib/mlx4_ib_main.c @@ -1,3344 +1,3344 @@ /* * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #define LINUXKPI_PARAM_PREFIX mlx4_ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mlx4_ib.h" #include #include "wc.h" #define DRV_NAME MLX4_IB_DRV_NAME #ifndef DRV_VERSION #define DRV_VERSION "3.6.0" #endif #define DRV_RELDATE "December 2020" #define MLX4_IB_FLOW_MAX_PRIO 0xFFF #define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF #define MLX4_IB_CARD_REV_A0 0xA0 MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); MODULE_LICENSE("Dual BSD/GPL"); int mlx4_ib_sm_guid_assign = 0; module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 0)"); static const char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); static struct workqueue_struct *wq; static void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; mad->class_version = 1; mad->method = IB_MGMT_METHOD_GET; } static int check_flow_steering_support(struct mlx4_dev *dev) { int eth_num_ports = 0; int ib_num_ports = 0; int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED; if (dmfs) { int i; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) eth_num_ports++; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) ib_num_ports++; dmfs &= (!ib_num_ports || (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) && (!eth_num_ports || (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)); if (ib_num_ports && mlx4_is_mfunc(dev)) { pr_warn("Device managed flow steering is unavailable for IB port in multifunction env.\n"); dmfs = 0; } } return dmfs; } static int num_ib_ports(struct mlx4_dev *dev) { int ib_ports = 0; int i; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) ib_ports++; return ib_ports; } -static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_num) +static struct ifnet *mlx4_ib_get_netdev(struct ib_device *device, u8 port_num) { struct mlx4_ib_dev *ibdev = to_mdev(device); - struct net_device *dev; + struct ifnet *dev; rcu_read_lock(); dev = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port_num); #if 0 if (dev) { if (mlx4_is_bonded(ibdev->dev)) { - struct net_device *upper = NULL; + struct ifnet *upper = NULL; upper = netdev_master_upper_dev_get_rcu(dev); if (upper) { - struct net_device *active; + struct ifnet *active; active = bond_option_active_slave_get_rcu(netdev_priv(upper)); if (active) dev = active; } } } #endif if (dev) if_ref(dev); rcu_read_unlock(); return dev; } static int mlx4_ib_update_gids_v1(struct gid_entry *gids, struct mlx4_ib_dev *ibdev, u8 port_num) { struct mlx4_cmd_mailbox *mailbox; int err; struct mlx4_dev *dev = ibdev->dev; int i; union ib_gid *gid_tbl; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return -ENOMEM; gid_tbl = mailbox->buf; for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) memcpy(&gid_tbl[i], &gids[i].gid, sizeof(union ib_gid)); err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | port_num, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (mlx4_is_bonded(dev)) err += mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | 2, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); mlx4_free_cmd_mailbox(dev, mailbox); return err; } static int mlx4_ib_update_gids_v1_v2(struct gid_entry *gids, struct mlx4_ib_dev *ibdev, u8 port_num) { struct mlx4_cmd_mailbox *mailbox; int err; struct mlx4_dev *dev = ibdev->dev; int i; struct { union ib_gid gid; __be32 rsrvd1[2]; __be16 rsrvd2; u8 type; u8 version; __be32 rsrvd3; } *gid_tbl; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return -ENOMEM; gid_tbl = mailbox->buf; for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) { memcpy(&gid_tbl[i].gid, &gids[i].gid, sizeof(union ib_gid)); if (gids[i].gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { gid_tbl[i].version = 2; if (!ipv6_addr_v4mapped((struct in6_addr *)&gids[i].gid)) gid_tbl[i].type = 1; else memset(&gid_tbl[i].gid, 0, 12); } } err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_ROCE_ADDR << 8 | port_num, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (mlx4_is_bonded(dev)) err += mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_ROCE_ADDR << 8 | 2, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); mlx4_free_cmd_mailbox(dev, mailbox); return err; } static int mlx4_ib_update_gids(struct gid_entry *gids, struct mlx4_ib_dev *ibdev, u8 port_num) { if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) return mlx4_ib_update_gids_v1_v2(gids, ibdev, port_num); return mlx4_ib_update_gids_v1(gids, ibdev, port_num); } static int mlx4_ib_add_gid(struct ib_device *device, u8 port_num, unsigned int index, const union ib_gid *gid, const struct ib_gid_attr *attr, void **context) { struct mlx4_ib_dev *ibdev = to_mdev(device); struct mlx4_ib_iboe *iboe = &ibdev->iboe; struct mlx4_port_gid_table *port_gid_table; int free = -1, found = -1; int ret = 0; int hw_update = 0; int i; struct gid_entry *gids = NULL; if (!rdma_cap_roce_gid_table(device, port_num)) return -EINVAL; if (port_num > MLX4_MAX_PORTS) return -EINVAL; if (!context) return -EINVAL; port_gid_table = &iboe->gids[port_num - 1]; spin_lock_bh(&iboe->lock); for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) { if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid)) && (port_gid_table->gids[i].gid_type == attr->gid_type)) { found = i; break; } if (free < 0 && !memcmp(&port_gid_table->gids[i].gid, &zgid, sizeof(*gid))) free = i; /* HW has space */ } if (found < 0) { if (free < 0) { ret = -ENOSPC; } else { port_gid_table->gids[free].ctx = kmalloc(sizeof(*port_gid_table->gids[free].ctx), GFP_ATOMIC); if (!port_gid_table->gids[free].ctx) { ret = -ENOMEM; } else { *context = port_gid_table->gids[free].ctx; memcpy(&port_gid_table->gids[free].gid, gid, sizeof(*gid)); port_gid_table->gids[free].gid_type = attr->gid_type; port_gid_table->gids[free].ctx->real_index = free; port_gid_table->gids[free].ctx->refcount = 1; hw_update = 1; } } } else { struct gid_cache_context *ctx = port_gid_table->gids[found].ctx; *context = ctx; ctx->refcount++; } if (!ret && hw_update) { gids = kmalloc(sizeof(*gids) * MLX4_MAX_PORT_GIDS, GFP_ATOMIC); if (!gids) { ret = -ENOMEM; } else { for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) { memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid)); gids[i].gid_type = port_gid_table->gids[i].gid_type; } } } spin_unlock_bh(&iboe->lock); if (!ret && hw_update) { ret = mlx4_ib_update_gids(gids, ibdev, port_num); kfree(gids); } return ret; } static int mlx4_ib_del_gid(struct ib_device *device, u8 port_num, unsigned int index, void **context) { struct gid_cache_context *ctx = *context; struct mlx4_ib_dev *ibdev = to_mdev(device); struct mlx4_ib_iboe *iboe = &ibdev->iboe; struct mlx4_port_gid_table *port_gid_table; int ret = 0; int hw_update = 0; struct gid_entry *gids = NULL; if (!rdma_cap_roce_gid_table(device, port_num)) return -EINVAL; if (port_num > MLX4_MAX_PORTS) return -EINVAL; port_gid_table = &iboe->gids[port_num - 1]; spin_lock_bh(&iboe->lock); if (ctx) { ctx->refcount--; if (!ctx->refcount) { unsigned int real_index = ctx->real_index; memcpy(&port_gid_table->gids[real_index].gid, &zgid, sizeof(zgid)); kfree(port_gid_table->gids[real_index].ctx); port_gid_table->gids[real_index].ctx = NULL; hw_update = 1; } } if (!ret && hw_update) { int i; gids = kmalloc(sizeof(*gids) * MLX4_MAX_PORT_GIDS, GFP_ATOMIC); if (!gids) { ret = -ENOMEM; } else { for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) { memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid)); gids[i].gid_type = port_gid_table->gids[i].gid_type; } } } spin_unlock_bh(&iboe->lock); if (!ret && hw_update) { ret = mlx4_ib_update_gids(gids, ibdev, port_num); kfree(gids); } return ret; } int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, u8 port_num, int index) { struct mlx4_ib_iboe *iboe = &ibdev->iboe; struct gid_cache_context *ctx = NULL; union ib_gid gid; struct mlx4_port_gid_table *port_gid_table; int real_index = -EINVAL; int i; int ret; unsigned long flags; struct ib_gid_attr attr; if (port_num > MLX4_MAX_PORTS) return -EINVAL; if (mlx4_is_bonded(ibdev->dev)) port_num = 1; if (!rdma_cap_roce_gid_table(&ibdev->ib_dev, port_num)) return index; ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid, &attr); if (ret) return ret; if (attr.ndev) if_rele(attr.ndev); if (!memcmp(&gid, &zgid, sizeof(gid))) return -EINVAL; spin_lock_irqsave(&iboe->lock, flags); port_gid_table = &iboe->gids[port_num - 1]; for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid)) && attr.gid_type == port_gid_table->gids[i].gid_type) { ctx = port_gid_table->gids[i].ctx; break; } if (ctx) real_index = ctx->real_index; spin_unlock_irqrestore(&iboe->lock, flags); return real_index; } static int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; int have_ib_ports; struct mlx4_uverbs_ex_query_device cmd; struct mlx4_uverbs_ex_query_device_resp resp = {.comp_mask = 0}; struct mlx4_clock_params clock_params; if (uhw->inlen) { if (uhw->inlen < sizeof(cmd)) return -EINVAL; err = ib_copy_from_udata(&cmd, uhw, sizeof(cmd)); if (err) return err; if (cmd.comp_mask) return -EINVAL; if (cmd.reserved) return -EINVAL; } resp.response_length = offsetof(typeof(resp), response_length) + sizeof(resp.response_length); in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; memset(props, 0, sizeof *props); have_ib_ports = num_ib_ports(dev->dev); props->fw_ver = dev->dev->caps.fw_ver; props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM && have_ib_ports) props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT) props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; if (dev->dev->caps.max_gso_sz && (dev->dev->rev_id != MLX4_IB_CARD_REV_A0) && (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH)) props->device_cap_flags |= IB_DEVICE_UD_TSO; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR)) props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) props->device_cap_flags |= IB_DEVICE_XRC; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) props->device_cap_flags |= IB_DEVICE_MEM_WINDOW; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B) props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; else props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A; } if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; props->vendor_part_id = dev->dev->persist->pdev->device; props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&props->sys_image_guid, out_mad->data + 4, 8); props->max_mr_size = ~0ull; props->page_size_cap = dev->dev->caps.page_size_cap; props->max_qp = dev->dev->quotas.qp; props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); props->max_sge_rd = MLX4_MAX_SGE_RD; props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; props->max_mr = dev->dev->quotas.mpt; props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq = dev->dev->quotas.srq; props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; props->max_srq_sge = dev->dev->caps.max_srq_sge; props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? IB_ATOMIC_HCA : IB_ATOMIC_NONE; props->masked_atomic_cap = props->atomic_cap; props->max_pkeys = dev->dev->caps.pkey_table_len[1]; props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms; props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; props->hca_core_clock = dev->dev->caps.hca_core_clock * 1000UL; props->timestamp_mask = 0xFFFFFFFFFFFFULL; if (!mlx4_is_slave(dev->dev)) err = mlx4_get_internal_clock_params(dev->dev, &clock_params); if (uhw->outlen >= resp.response_length + sizeof(resp.hca_core_clock_offset)) { resp.response_length += sizeof(resp.hca_core_clock_offset); if (!err && !mlx4_is_slave(dev->dev)) { resp.comp_mask |= QUERY_DEVICE_RESP_MASK_TIMESTAMP; resp.hca_core_clock_offset = clock_params.offset % PAGE_SIZE; } } if (uhw->outlen) { err = ib_copy_to_udata(uhw, &resp, resp.response_length); if (err) goto out; } out: kfree(in_mad); kfree(out_mad); return err; } static enum rdma_link_layer mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) { struct mlx4_dev *dev = to_mdev(device)->dev; return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ? IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; } static int ib_link_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int ext_active_speed; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); props->lmc = out_mad->data[34] & 0x7; props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); props->sm_sl = out_mad->data[36] & 0xf; props->state = out_mad->data[32] & 0xf; props->phys_state = out_mad->data[33] >> 4; props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); if (netw_view) props->gid_tbl_len = out_mad->data[50]; else props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port]; props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); props->active_width = out_mad->data[31] & 0xf; props->active_speed = out_mad->data[35] >> 4; props->max_mtu = out_mad->data[41] & 0xf; props->active_mtu = out_mad->data[36] >> 4; props->subnet_timeout = out_mad->data[51] & 0x1f; props->max_vl_num = out_mad->data[37] >> 4; props->init_type_reply = out_mad->data[41] >> 4; /* Check if extended speeds (EDR/FDR/...) are supported */ if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { ext_active_speed = out_mad->data[62] >> 4; switch (ext_active_speed) { case 1: props->active_speed = IB_SPEED_FDR; break; case 2: props->active_speed = IB_SPEED_EDR; break; } } /* If reported active speed is QDR, check if is FDR-10 */ if (props->active_speed == IB_SPEED_QDR) { init_query_mad(in_mad); in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; /* Checking LinkSpeedActive for FDR-10 */ if (out_mad->data[15] & 0x1) props->active_speed = IB_SPEED_FDR10; } /* Avoid wrong speed value returned by FW if the IB link is down. */ if (props->state == IB_PORT_DOWN) props->active_speed = IB_SPEED_SDR; out: kfree(in_mad); kfree(out_mad); return err; } static u8 state_to_phys_state(enum ib_port_state state) { return state == IB_PORT_ACTIVE ? 5 : 3; } static int eth_link_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { struct mlx4_ib_dev *mdev = to_mdev(ibdev); struct mlx4_ib_iboe *iboe = &mdev->iboe; - struct net_device *ndev; + struct ifnet *ndev; enum ib_mtu tmp; struct mlx4_cmd_mailbox *mailbox; int err = 0; int is_bonded = mlx4_is_bonded(mdev->dev); mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) goto out; props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ? IB_WIDTH_4X : IB_WIDTH_1X; props->active_speed = IB_SPEED_QDR; props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS; props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; props->max_msg_sz = mdev->dev->caps.max_msg_sz; props->pkey_tbl_len = 1; props->max_mtu = IB_MTU_4096; props->max_vl_num = 2; props->state = IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); props->active_mtu = IB_MTU_256; spin_lock_bh(&iboe->lock); ndev = iboe->netdevs[port - 1]; if (ndev && is_bonded) { #if 0 rcu_read_lock(); /* required to get upper dev */ ndev = netdev_master_upper_dev_get_rcu(ndev); rcu_read_unlock(); #endif } if (!ndev) goto out_unlock; tmp = iboe_get_mtu(ndev->if_mtu); props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256; props->state = ((ndev->if_drv_flags & IFF_DRV_RUNNING) != 0 && ndev->if_link_state == LINK_STATE_UP) ? IB_PORT_ACTIVE : IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); out_unlock: spin_unlock_bh(&iboe->lock); out: mlx4_free_cmd_mailbox(mdev->dev, mailbox); return err; } int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { int err; memset(props, 0, sizeof *props); err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? ib_link_query_port(ibdev, port, props, netw_view) : eth_link_query_port(ibdev, port, props, netw_view); return err; } static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { /* returns host view */ return __mlx4_ib_query_port(ibdev, port, props, 0); } int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; struct mlx4_ib_dev *dev = to_mdev(ibdev); int clear = 0; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); if (mlx4_is_mfunc(dev->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw, out_mad->data + 8, 8); if (mlx4_is_mfunc(dev->dev) && !netw_view) { if (index) { /* For any index > 0, return the null guid */ err = 0; clear = 1; goto out; } } init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; in_mad->attr_mod = cpu_to_be32(index / 8); err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); out: if (clear) memset(gid->raw + 8, 0, 8); kfree(in_mad); kfree(out_mad); return err; } static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { int ret; if (rdma_protocol_ib(ibdev, port)) return __mlx4_ib_query_gid(ibdev, port, index, gid, 0); if (!rdma_protocol_roce(ibdev, port)) return -ENODEV; if (!rdma_cap_roce_gid_table(ibdev, port)) return -ENODEV; ret = ib_get_cached_gid(ibdev, port, index, gid, NULL); if (ret == -EAGAIN) { memcpy(gid, &zgid, sizeof(*gid)); return 0; } return ret; } static int mlx4_ib_query_sl2vl(struct ib_device *ibdev, u8 port, u64 *sl2vl_tbl) { union sl2vl_tbl_to_u64 sl2vl64; struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; int jj; if (mlx4_is_slave(to_mdev(ibdev)->dev)) { *sl2vl_tbl = 0; return 0; } in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_SL_TO_VL_TABLE; in_mad->attr_mod = 0; if (mlx4_is_mfunc(to_mdev(ibdev)->dev)) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; for (jj = 0; jj < 8; jj++) sl2vl64.sl8[jj] = ((struct ib_smp *)out_mad)->data[jj]; *sl2vl_tbl = sl2vl64.sl64; out: kfree(in_mad); kfree(out_mad); return err; } static void mlx4_init_sl2vl_tbl(struct mlx4_ib_dev *mdev) { u64 sl2vl; int i; int err; for (i = 1; i <= mdev->dev->caps.num_ports; i++) { if (mdev->dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH) continue; err = mlx4_ib_query_sl2vl(&mdev->ib_dev, i, &sl2vl); if (err) { pr_err("Unable to get default sl to vl mapping for port %d. Using all zeroes (%d)\n", i, err); sl2vl = 0; } atomic64_set(&mdev->sl2vl[i - 1], sl2vl); } } int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; in_mad->attr_mod = cpu_to_be32(index / 32); if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); out: kfree(in_mad); kfree(out_mad); return err; } static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0); } static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { struct mlx4_cmd_mailbox *mailbox; unsigned long flags; if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) return 0; if (mlx4_is_slave(to_mdev(ibdev)->dev)) return -EOPNOTSUPP; spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags); memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX); spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags); /* * If possible, pass node desc to FW, so it can generate * a 144 trap. If cmd fails, just ignore. */ mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev); if (IS_ERR(mailbox)) return 0; memcpy(mailbox->buf, props->node_desc, IB_DEVICE_NODE_DESC_MAX); mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox); return 0; } static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, u32 cap_mask) { struct mlx4_cmd_mailbox *mailbox; int err; mailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); } else { ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols; ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); } err = mlx4_cmd(dev->dev, mailbox->dma, port, MLX4_SET_PORT_IB_OPCODE, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); mlx4_free_cmd_mailbox(dev->dev, mailbox); return err; } static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { struct mlx4_ib_dev *mdev = to_mdev(ibdev); u8 is_eth = mdev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; struct ib_port_attr attr; u32 cap_mask; int err; /* return OK if this is RoCE. CM calls ib_modify_port() regardless * of whether port link layer is ETH or IB. For ETH ports, qkey * violations and port capabilities are not meaningful. */ if (is_eth) return 0; mutex_lock(&mdev->cap_mask_mutex); err = mlx4_ib_query_port(ibdev, port, &attr); if (err) goto out; cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; err = mlx4_ib_SET_PORT(mdev, port, !!(mask & IB_PORT_RESET_QKEY_CNTR), cap_mask); out: mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); return err; } static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct mlx4_ib_ucontext *context; struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; struct mlx4_ib_alloc_ucontext_resp resp; int err; if (!dev->ib_active) return ERR_PTR(-EAGAIN); if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { resp_v3.qp_tab_size = dev->dev->caps.num_qps; resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; } else { resp.dev_caps = dev->dev->caps.userspace_caps; resp.qp_tab_size = dev->dev->caps.num_qps; resp.bf_reg_size = dev->dev->caps.bf_reg_size; resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; resp.cqe_size = dev->dev->caps.cqe_size; } context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar); if (err) { kfree(context); return ERR_PTR(err); } INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); else err = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (err) { mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); kfree(context); return ERR_PTR(-EFAULT); } return &context->ibucontext; } static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar); kfree(context); return 0; } static void mlx4_ib_vma_open(struct vm_area_struct *area) { /* vma_open is called when a new VMA is created on top of our VMA. * This is done through either mremap flow or split_vma (usually due * to mlock, madvise, munmap, etc.). We do not support a clone of the * vma, as this VMA is strongly hardware related. Therefore we set the * vm_ops of the newly created/cloned VMA to NULL, to prevent it from * calling us again and trying to do incorrect actions. We assume that * the original vma size is exactly a single page that there will be no * "splitting" operations on. */ area->vm_ops = NULL; } static void mlx4_ib_vma_close(struct vm_area_struct *area) { struct mlx4_ib_vma_private_data *mlx4_ib_vma_priv_data; /* It's guaranteed that all VMAs opened on a FD are closed before the * file itself is closed, therefore no sync is needed with the regular * closing flow. (e.g. mlx4_ib_dealloc_ucontext) However need a sync * with accessing the vma as part of mlx4_ib_disassociate_ucontext. * The close operation is usually called under mm->mmap_sem except when * process is exiting. The exiting case is handled explicitly as part * of mlx4_ib_disassociate_ucontext. */ mlx4_ib_vma_priv_data = (struct mlx4_ib_vma_private_data *) area->vm_private_data; /* set the vma context pointer to null in the mlx4_ib driver's private * data to protect against a race condition in mlx4_ib_dissassociate_ucontext(). */ mlx4_ib_vma_priv_data->vma = NULL; } static const struct vm_operations_struct mlx4_ib_vm_ops = { .open = mlx4_ib_vma_open, .close = mlx4_ib_vma_close }; static void mlx4_ib_set_vma_data(struct vm_area_struct *vma, struct mlx4_ib_vma_private_data *vma_private_data) { vma_private_data->vma = vma; vma->vm_private_data = vma_private_data; vma->vm_ops = &mlx4_ib_vm_ops; } static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct mlx4_ib_dev *dev = to_mdev(context->device); struct mlx4_ib_ucontext *mucontext = to_mucontext(context); if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; if (vma->vm_pgoff == 0) { /* We prevent double mmaping on same context */ if (mucontext->hw_bar_info[HW_BAR_DB].vma) return -EINVAL; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_DB]); } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) { /* We prevent double mmaping on same context */ if (mucontext->hw_bar_info[HW_BAR_BF].vma) return -EINVAL; vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn + dev->dev->caps.num_uars, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_BF]); } else if (vma->vm_pgoff == 3) { struct mlx4_clock_params params; int ret; /* We prevent double mmaping on same context */ if (mucontext->hw_bar_info[HW_BAR_CLOCK].vma) return -EINVAL; ret = mlx4_get_internal_clock_params(dev->dev, ¶ms); if (ret) return ret; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, (pci_resource_start(dev->dev->persist->pdev, params.bar) + params.offset) >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_CLOCK]); } else { return -EINVAL; } return 0; } static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx4_ib_pd *pd; int err; pd = kmalloc(sizeof *pd, GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn); if (err) { kfree(pd); return ERR_PTR(err); } if (context) if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) { mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); kfree(pd); return ERR_PTR(-EFAULT); } return &pd->ibpd; } static int mlx4_ib_dealloc_pd(struct ib_pd *pd) { mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); kfree(pd); return 0; } static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx4_ib_xrcd *xrcd; struct ib_cq_init_attr cq_attr = {}; int err; if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) return ERR_PTR(-ENOSYS); xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL); if (!xrcd) return ERR_PTR(-ENOMEM); err = mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn); if (err) goto err1; xrcd->pd = ib_alloc_pd(ibdev, 0); if (IS_ERR(xrcd->pd)) { err = PTR_ERR(xrcd->pd); goto err2; } cq_attr.cqe = 1; xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, &cq_attr); if (IS_ERR(xrcd->cq)) { err = PTR_ERR(xrcd->cq); goto err3; } return &xrcd->ibxrcd; err3: ib_dealloc_pd(xrcd->pd); err2: mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn); err1: kfree(xrcd); return ERR_PTR(err); } static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd) { ib_destroy_cq(to_mxrcd(xrcd)->cq); ib_dealloc_pd(to_mxrcd(xrcd)->pd); mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn); kfree(xrcd); return 0; } static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) { struct mlx4_ib_qp *mqp = to_mqp(ibqp); struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_gid_entry *ge; ge = kzalloc(sizeof *ge, GFP_KERNEL); if (!ge) return -ENOMEM; ge->gid = *gid; if (mlx4_ib_add_mc(mdev, mqp, gid)) { ge->port = mqp->port; ge->added = 1; } mutex_lock(&mqp->mutex); list_add_tail(&ge->list, &mqp->gid_list); mutex_unlock(&mqp->mutex); return 0; } static void mlx4_ib_delete_counters_table(struct mlx4_ib_dev *ibdev, struct mlx4_ib_counters *ctr_table) { struct counter_index *counter, *tmp_count; mutex_lock(&ctr_table->mutex); list_for_each_entry_safe(counter, tmp_count, &ctr_table->counters_list, list) { if (counter->allocated) mlx4_counter_free(ibdev->dev, counter->index); list_del(&counter->list); kfree(counter); } mutex_unlock(&ctr_table->mutex); } int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid) { - struct net_device *ndev; + struct ifnet *ndev; int ret = 0; if (!mqp->port) return 0; spin_lock_bh(&mdev->iboe.lock); ndev = mdev->iboe.netdevs[mqp->port - 1]; if (ndev) if_ref(ndev); spin_unlock_bh(&mdev->iboe.lock); if (ndev) { ret = 1; if_rele(ndev); } return ret; } struct mlx4_ib_steering { struct list_head list; struct mlx4_flow_reg_id reg_id; union ib_gid gid; }; #define LAST_ETH_FIELD vlan_tag #define LAST_IB_FIELD sl #define LAST_IPV4_FIELD dst_ip #define LAST_TCP_UDP_FIELD src_port /* Field is the last supported field */ #define FIELDS_NOT_SUPPORTED(filter, field)\ memchr_inv((void *)&filter.field +\ sizeof(filter.field), 0,\ sizeof(filter) -\ offsetof(typeof(filter), field) -\ sizeof(filter.field)) static int parse_flow_attr(struct mlx4_dev *dev, u32 qp_num, union ib_flow_spec *ib_spec, struct _rule_hw *mlx4_spec) { enum mlx4_net_trans_rule_id type; switch (ib_spec->type) { case IB_FLOW_SPEC_ETH: if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD)) return -ENOTSUPP; type = MLX4_NET_TRANS_RULE_ID_ETH; memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac, ETH_ALEN); memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac, ETH_ALEN); mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag; mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag; break; case IB_FLOW_SPEC_IB: if (FIELDS_NOT_SUPPORTED(ib_spec->ib.mask, LAST_IB_FIELD)) return -ENOTSUPP; type = MLX4_NET_TRANS_RULE_ID_IB; mlx4_spec->ib.l3_qpn = cpu_to_be32(qp_num); mlx4_spec->ib.qpn_mask = cpu_to_be32(MLX4_IB_FLOW_QPN_MASK); break; case IB_FLOW_SPEC_IPV4: if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD)) return -ENOTSUPP; type = MLX4_NET_TRANS_RULE_ID_IPV4; mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip; mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip; mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip; mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip; break; case IB_FLOW_SPEC_TCP: case IB_FLOW_SPEC_UDP: if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, LAST_TCP_UDP_FIELD)) return -ENOTSUPP; type = ib_spec->type == IB_FLOW_SPEC_TCP ? MLX4_NET_TRANS_RULE_ID_TCP : MLX4_NET_TRANS_RULE_ID_UDP; mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port; mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port; mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port; mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port; break; default: return -EINVAL; } if (mlx4_map_sw_to_hw_steering_id(dev, type) < 0 || mlx4_hw_rule_sz(dev, type) < 0) return -EINVAL; mlx4_spec->id = cpu_to_be16(mlx4_map_sw_to_hw_steering_id(dev, type)); mlx4_spec->size = mlx4_hw_rule_sz(dev, type) >> 2; return mlx4_hw_rule_sz(dev, type); } struct default_rules { __u32 mandatory_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; __u32 mandatory_not_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; __u32 rules_create_list[IB_FLOW_SPEC_SUPPORT_LAYERS]; __u8 link_layer; }; static const struct default_rules default_table[] = { { .mandatory_fields = {IB_FLOW_SPEC_IPV4}, .mandatory_not_fields = {IB_FLOW_SPEC_ETH}, .rules_create_list = {IB_FLOW_SPEC_IB}, .link_layer = IB_LINK_LAYER_INFINIBAND } }; static int __mlx4_ib_default_rules_match(struct ib_qp *qp, struct ib_flow_attr *flow_attr) { int i, j, k; void *ib_flow; const struct default_rules *pdefault_rules = default_table; u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port); for (i = 0; i < ARRAY_SIZE(default_table); i++, pdefault_rules++) { __u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS]; memset(&field_types, 0, sizeof(field_types)); if (link_layer != pdefault_rules->link_layer) continue; ib_flow = flow_attr + 1; /* we assume the specs are sorted */ for (j = 0, k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS && j < flow_attr->num_of_specs; k++) { union ib_flow_spec *current_flow = (union ib_flow_spec *)ib_flow; /* same layer but different type */ if (((current_flow->type & IB_FLOW_SPEC_LAYER_MASK) == (pdefault_rules->mandatory_fields[k] & IB_FLOW_SPEC_LAYER_MASK)) && (current_flow->type != pdefault_rules->mandatory_fields[k])) goto out; /* same layer, try match next one */ if (current_flow->type == pdefault_rules->mandatory_fields[k]) { j++; ib_flow += ((union ib_flow_spec *)ib_flow)->size; } } ib_flow = flow_attr + 1; for (j = 0; j < flow_attr->num_of_specs; j++, ib_flow += ((union ib_flow_spec *)ib_flow)->size) for (k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS; k++) /* same layer and same type */ if (((union ib_flow_spec *)ib_flow)->type == pdefault_rules->mandatory_not_fields[k]) goto out; return i; } out: return -1; } static int __mlx4_ib_create_default_rules( struct mlx4_ib_dev *mdev, struct ib_qp *qp, const struct default_rules *pdefault_rules, struct _rule_hw *mlx4_spec) { int size = 0; int i; for (i = 0; i < ARRAY_SIZE(pdefault_rules->rules_create_list); i++) { int ret; union ib_flow_spec ib_spec; switch (pdefault_rules->rules_create_list[i]) { case 0: /* no rule */ continue; case IB_FLOW_SPEC_IB: ib_spec.type = IB_FLOW_SPEC_IB; ib_spec.size = sizeof(struct ib_flow_spec_ib); break; default: /* invalid rule */ return -EINVAL; } /* We must put empty rule, qpn is being ignored */ ret = parse_flow_attr(mdev->dev, 0, &ib_spec, mlx4_spec); if (ret < 0) { pr_info("invalid parsing\n"); return -EINVAL; } mlx4_spec = (void *)mlx4_spec + ret; size += ret; } return size; } static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain, enum mlx4_net_trans_promisc_mode flow_type, u64 *reg_id) { int ret, i; int size = 0; void *ib_flow; struct mlx4_ib_dev *mdev = to_mdev(qp->device); struct mlx4_cmd_mailbox *mailbox; struct mlx4_net_trans_rule_hw_ctrl *ctrl; int default_flow; static const u16 __mlx4_domain[] = { [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS, [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL, [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS, [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC, }; if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) { pr_err("Invalid priority value %d\n", flow_attr->priority); return -EINVAL; } if (domain >= IB_FLOW_DOMAIN_NUM) { pr_err("Invalid domain value %d\n", domain); return -EINVAL; } if (mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0) return -EINVAL; mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); ctrl = mailbox->buf; ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | flow_attr->priority); ctrl->type = mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type); ctrl->port = flow_attr->port; ctrl->qpn = cpu_to_be32(qp->qp_num); ib_flow = flow_attr + 1; size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); /* Add default flows */ default_flow = __mlx4_ib_default_rules_match(qp, flow_attr); if (default_flow >= 0) { ret = __mlx4_ib_create_default_rules( mdev, qp, default_table + default_flow, mailbox->buf + size); if (ret < 0) { mlx4_free_cmd_mailbox(mdev->dev, mailbox); return -EINVAL; } size += ret; } for (i = 0; i < flow_attr->num_of_specs; i++) { ret = parse_flow_attr(mdev->dev, qp->qp_num, ib_flow, mailbox->buf + size); if (ret < 0) { mlx4_free_cmd_mailbox(mdev->dev, mailbox); return -EINVAL; } ib_flow += ((union ib_flow_spec *) ib_flow)->size; size += ret; } ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0, MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (ret == -ENOMEM) pr_err("mcg table is full. Fail to register network rule.\n"); else if (ret == -ENXIO) pr_err("Device managed flow steering is disabled. Fail to register network rule.\n"); else if (ret) pr_err("Invalid argument. Fail to register network rule.\n"); mlx4_free_cmd_mailbox(mdev->dev, mailbox); return ret; } static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id) { int err; err = mlx4_cmd(dev, reg_id, 0, 0, MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (err) pr_err("Fail to detach network rule. registration id = 0x%llx\n", (long long)reg_id); return err; } static int mlx4_ib_tunnel_steer_add(struct ib_qp *qp, struct ib_flow_attr *flow_attr, u64 *reg_id) { void *ib_flow; union ib_flow_spec *ib_spec; struct mlx4_dev *dev = to_mdev(qp->device)->dev; int err = 0; if (dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN || dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC) return 0; /* do nothing */ ib_flow = flow_attr + 1; ib_spec = (union ib_flow_spec *)ib_flow; if (ib_spec->type != IB_FLOW_SPEC_ETH || flow_attr->num_of_specs != 1) return 0; /* do nothing */ err = mlx4_tunnel_steer_add(to_mdev(qp->device)->dev, ib_spec->eth.val.dst_mac, flow_attr->port, qp->qp_num, MLX4_DOMAIN_UVERBS | (flow_attr->priority & 0xff), reg_id); return err; } static int mlx4_ib_add_dont_trap_rule(struct mlx4_dev *dev, struct ib_flow_attr *flow_attr, enum mlx4_net_trans_promisc_mode *type) { int err = 0; if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER) || (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC) || (flow_attr->num_of_specs > 1) || (flow_attr->priority != 0)) { return -EOPNOTSUPP; } if (flow_attr->num_of_specs == 0) { type[0] = MLX4_FS_MC_SNIFFER; type[1] = MLX4_FS_UC_SNIFFER; } else { union ib_flow_spec *ib_spec; ib_spec = (union ib_flow_spec *)(flow_attr + 1); if (ib_spec->type != IB_FLOW_SPEC_ETH) return -EINVAL; /* if all is zero than MC and UC */ if (is_zero_ether_addr(ib_spec->eth.mask.dst_mac)) { type[0] = MLX4_FS_MC_SNIFFER; type[1] = MLX4_FS_UC_SNIFFER; } else { u8 mac[ETH_ALEN] = {ib_spec->eth.mask.dst_mac[0] ^ 0x01, ib_spec->eth.mask.dst_mac[1], ib_spec->eth.mask.dst_mac[2], ib_spec->eth.mask.dst_mac[3], ib_spec->eth.mask.dst_mac[4], ib_spec->eth.mask.dst_mac[5]}; /* Above xor was only on MC bit, non empty mask is valid * only if this bit is set and rest are zero. */ if (!is_zero_ether_addr(&mac[0])) return -EINVAL; if (is_multicast_ether_addr(ib_spec->eth.val.dst_mac)) type[0] = MLX4_FS_MC_SNIFFER; else type[0] = MLX4_FS_UC_SNIFFER; } } return err; } static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain) { int err = 0, i = 0, j = 0; struct mlx4_ib_flow *mflow; enum mlx4_net_trans_promisc_mode type[2]; struct mlx4_dev *dev = (to_mdev(qp->device))->dev; int is_bonded = mlx4_is_bonded(dev); if (flow_attr->port < 1 || flow_attr->port > qp->device->phys_port_cnt) return ERR_PTR(-EINVAL); if ((flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && (flow_attr->type != IB_FLOW_ATTR_NORMAL)) return ERR_PTR(-EOPNOTSUPP); memset(type, 0, sizeof(type)); mflow = kzalloc(sizeof(*mflow), GFP_KERNEL); if (!mflow) { err = -ENOMEM; goto err_free; } switch (flow_attr->type) { case IB_FLOW_ATTR_NORMAL: /* If dont trap flag (continue match) is set, under specific * condition traffic be replicated to given qp, * without stealing it */ if (unlikely(flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)) { err = mlx4_ib_add_dont_trap_rule(dev, flow_attr, type); if (err) goto err_free; } else { type[0] = MLX4_FS_REGULAR; } break; case IB_FLOW_ATTR_ALL_DEFAULT: type[0] = MLX4_FS_ALL_DEFAULT; break; case IB_FLOW_ATTR_MC_DEFAULT: type[0] = MLX4_FS_MC_DEFAULT; break; case IB_FLOW_ATTR_SNIFFER: type[0] = MLX4_FS_MIRROR_RX_PORT; type[1] = MLX4_FS_MIRROR_SX_PORT; break; default: err = -EINVAL; goto err_free; } while (i < ARRAY_SIZE(type) && type[i]) { err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i], &mflow->reg_id[i].id); if (err) goto err_create_flow; if (is_bonded) { /* Application always sees one port so the mirror rule * must be on port #2 */ flow_attr->port = 2; err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[j], &mflow->reg_id[j].mirror); flow_attr->port = 1; if (err) goto err_create_flow; j++; } i++; } if (i < ARRAY_SIZE(type) && flow_attr->type == IB_FLOW_ATTR_NORMAL) { err = mlx4_ib_tunnel_steer_add(qp, flow_attr, &mflow->reg_id[i].id); if (err) goto err_create_flow; if (is_bonded) { flow_attr->port = 2; err = mlx4_ib_tunnel_steer_add(qp, flow_attr, &mflow->reg_id[j].mirror); flow_attr->port = 1; if (err) goto err_create_flow; j++; } /* function to create mirror rule */ i++; } return &mflow->ibflow; err_create_flow: while (i) { (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev, mflow->reg_id[i].id); i--; } while (j) { (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev, mflow->reg_id[j].mirror); j--; } err_free: kfree(mflow); return ERR_PTR(err); } static int mlx4_ib_destroy_flow(struct ib_flow *flow_id) { int err, ret = 0; int i = 0; struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device); struct mlx4_ib_flow *mflow = to_mflow(flow_id); while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i].id) { err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i].id); if (err) ret = err; if (mflow->reg_id[i].mirror) { err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i].mirror); if (err) ret = err; } i++; } kfree(mflow); return ret; } static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_dev *dev = mdev->dev; struct mlx4_ib_qp *mqp = to_mqp(ibqp); struct mlx4_ib_steering *ib_steering = NULL; enum mlx4_protocol prot = MLX4_PROT_IB_IPV6; struct mlx4_flow_reg_id reg_id; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL); if (!ib_steering) return -ENOMEM; } err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port, !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), prot, ®_id.id); if (err) { pr_err("multicast attach op failed, err %d\n", err); goto err_malloc; } reg_id.mirror = 0; if (mlx4_is_bonded(dev)) { err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, (mqp->port == 1) ? 2 : 1, !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), prot, ®_id.mirror); if (err) goto err_add; } err = add_gid_entry(ibqp, gid); if (err) goto err_add; if (ib_steering) { memcpy(ib_steering->gid.raw, gid->raw, 16); ib_steering->reg_id = reg_id; mutex_lock(&mqp->mutex); list_add(&ib_steering->list, &mqp->steering_rules); mutex_unlock(&mqp->mutex); } return 0; err_add: mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, prot, reg_id.id); if (reg_id.mirror) mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, prot, reg_id.mirror); err_malloc: kfree(ib_steering); return err; } static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) { struct mlx4_ib_gid_entry *ge; struct mlx4_ib_gid_entry *tmp; struct mlx4_ib_gid_entry *ret = NULL; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { if (!memcmp(raw, ge->gid.raw, 16)) { ret = ge; break; } } return ret; } static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_dev *dev = mdev->dev; struct mlx4_ib_qp *mqp = to_mqp(ibqp); - struct net_device *ndev; + struct ifnet *ndev; struct mlx4_ib_gid_entry *ge; struct mlx4_flow_reg_id reg_id = {0, 0}; enum mlx4_protocol prot = MLX4_PROT_IB_IPV6; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { struct mlx4_ib_steering *ib_steering; mutex_lock(&mqp->mutex); list_for_each_entry(ib_steering, &mqp->steering_rules, list) { if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) { list_del(&ib_steering->list); break; } } mutex_unlock(&mqp->mutex); if (&ib_steering->list == &mqp->steering_rules) { pr_err("Couldn't find reg_id for mgid. Steering rule is left attached\n"); return -EINVAL; } reg_id = ib_steering->reg_id; kfree(ib_steering); } err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, prot, reg_id.id); if (err) return err; if (mlx4_is_bonded(dev)) { err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, prot, reg_id.mirror); if (err) return err; } mutex_lock(&mqp->mutex); ge = find_gid_entry(mqp, gid->raw); if (ge) { spin_lock_bh(&mdev->iboe.lock); ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL; if (ndev) if_ref(ndev); spin_unlock_bh(&mdev->iboe.lock); if (ndev) if_rele(ndev); list_del(&ge->list); kfree(ge); } else pr_warn("could not find mgid entry\n"); mutex_unlock(&mqp->mutex); return 0; } static int init_node_data(struct mlx4_ib_dev *dev) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; if (mlx4_is_master(dev->dev)) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(dev->ib_dev.node_desc, out_mad->data, IB_DEVICE_NODE_DESC_MAX); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); out: kfree(in_mad); kfree(out_mad); return err; } static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device); } static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->dev->rev_id); } static ssize_t show_board(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, dev->dev->board_id); } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_hw_rev, &dev_attr_hca_type, &dev_attr_board_id }; struct diag_counter { const char *name; u32 offset; }; #define DIAG_COUNTER(_name, _offset) \ { .name = #_name, .offset = _offset } static const struct diag_counter diag_basic[] = { DIAG_COUNTER(rq_num_lle, 0x00), DIAG_COUNTER(sq_num_lle, 0x04), DIAG_COUNTER(rq_num_lqpoe, 0x08), DIAG_COUNTER(sq_num_lqpoe, 0x0C), DIAG_COUNTER(rq_num_lpe, 0x18), DIAG_COUNTER(sq_num_lpe, 0x1C), DIAG_COUNTER(rq_num_wrfe, 0x20), DIAG_COUNTER(sq_num_wrfe, 0x24), DIAG_COUNTER(sq_num_mwbe, 0x2C), DIAG_COUNTER(sq_num_bre, 0x34), DIAG_COUNTER(sq_num_rire, 0x44), DIAG_COUNTER(rq_num_rire, 0x48), DIAG_COUNTER(sq_num_rae, 0x4C), DIAG_COUNTER(rq_num_rae, 0x50), DIAG_COUNTER(sq_num_roe, 0x54), DIAG_COUNTER(sq_num_tree, 0x5C), DIAG_COUNTER(sq_num_rree, 0x64), DIAG_COUNTER(rq_num_rnr, 0x68), DIAG_COUNTER(sq_num_rnr, 0x6C), DIAG_COUNTER(rq_num_oos, 0x100), DIAG_COUNTER(sq_num_oos, 0x104), }; static const struct diag_counter diag_ext[] = { DIAG_COUNTER(rq_num_dup, 0x130), DIAG_COUNTER(sq_num_to, 0x134), }; static const struct diag_counter diag_device_only[] = { DIAG_COUNTER(num_cqovf, 0x1A0), DIAG_COUNTER(rq_num_udsdprd, 0x118), }; static struct rdma_hw_stats *mlx4_ib_alloc_hw_stats(struct ib_device *ibdev, u8 port_num) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct mlx4_ib_diag_counters *diag = dev->diag_counters; if (!diag[!!port_num].name) return NULL; return rdma_alloc_hw_stats_struct(diag[!!port_num].name, diag[!!port_num].num_counters, RDMA_HW_STATS_DEFAULT_LIFESPAN); } static int mlx4_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u8 port, int index) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct mlx4_ib_diag_counters *diag = dev->diag_counters; u32 hw_value[ARRAY_SIZE(diag_device_only) + ARRAY_SIZE(diag_ext) + ARRAY_SIZE(diag_basic)] = {}; int ret; int i; ret = mlx4_query_diag_counters(dev->dev, MLX4_OP_MOD_QUERY_TRANSPORT_CI_ERRORS, diag[!!port].offset, hw_value, diag[!!port].num_counters, port); if (ret) return ret; for (i = 0; i < diag[!!port].num_counters; i++) stats->value[i] = hw_value[i]; return diag[!!port].num_counters; } static int __mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev, const char ***name, u32 **offset, u32 *num, bool port) { u32 num_counters; num_counters = ARRAY_SIZE(diag_basic); if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT) num_counters += ARRAY_SIZE(diag_ext); if (!port) num_counters += ARRAY_SIZE(diag_device_only); *name = kcalloc(num_counters, sizeof(**name), GFP_KERNEL); if (!*name) return -ENOMEM; *offset = kcalloc(num_counters, sizeof(**offset), GFP_KERNEL); if (!*offset) goto err_name; *num = num_counters; return 0; err_name: kfree(*name); return -ENOMEM; } static void mlx4_ib_fill_diag_counters(struct mlx4_ib_dev *ibdev, const char **name, u32 *offset, bool port) { int i; int j; for (i = 0, j = 0; i < ARRAY_SIZE(diag_basic); i++, j++) { name[i] = diag_basic[i].name; offset[i] = diag_basic[i].offset; } if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT) { for (i = 0; i < ARRAY_SIZE(diag_ext); i++, j++) { name[j] = diag_ext[i].name; offset[j] = diag_ext[i].offset; } } if (!port) { for (i = 0; i < ARRAY_SIZE(diag_device_only); i++, j++) { name[j] = diag_device_only[i].name; offset[j] = diag_device_only[i].offset; } } } static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev) { struct mlx4_ib_diag_counters *diag = ibdev->diag_counters; int i; int ret; bool per_port = !!(ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT); if (mlx4_is_slave(ibdev->dev)) return 0; for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) { /* i == 1 means we are building port counters */ if (i && !per_port) continue; ret = __mlx4_ib_alloc_diag_counters(ibdev, &diag[i].name, &diag[i].offset, &diag[i].num_counters, i); if (ret) goto err_alloc; mlx4_ib_fill_diag_counters(ibdev, diag[i].name, diag[i].offset, i); } ibdev->ib_dev.get_hw_stats = mlx4_ib_get_hw_stats; ibdev->ib_dev.alloc_hw_stats = mlx4_ib_alloc_hw_stats; return 0; err_alloc: if (i) { kfree(diag[i - 1].name); kfree(diag[i - 1].offset); } return ret; } static void mlx4_ib_diag_cleanup(struct mlx4_ib_dev *ibdev) { int i; for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) { kfree(ibdev->diag_counters[i].offset); kfree(ibdev->diag_counters[i].name); } } #define MLX4_IB_INVALID_MAC ((u64)-1) static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev, - struct net_device *dev, + struct ifnet *dev, int port) { u64 new_smac = 0; u64 release_mac = MLX4_IB_INVALID_MAC; struct mlx4_ib_qp *qp; new_smac = mlx4_mac_to_u64(IF_LLADDR(dev)); atomic64_set(&ibdev->iboe.mac[port - 1], new_smac); /* no need for update QP1 and mac registration in non-SRIOV */ if (!mlx4_is_mfunc(ibdev->dev)) return; mutex_lock(&ibdev->qp1_proxy_lock[port - 1]); qp = ibdev->qp1_proxy[port - 1]; if (qp) { int new_smac_index; u64 old_smac; struct mlx4_update_qp_params update_params; mutex_lock(&qp->mutex); old_smac = qp->pri.smac; if (new_smac == old_smac) goto unlock; new_smac_index = mlx4_register_mac(ibdev->dev, port, new_smac); if (new_smac_index < 0) goto unlock; update_params.smac_index = new_smac_index; if (mlx4_update_qp(ibdev->dev, qp->mqp.qpn, MLX4_UPDATE_QP_SMAC, &update_params)) { release_mac = new_smac; goto unlock; } /* if old port was zero, no mac was yet registered for this QP */ if (qp->pri.smac_port) release_mac = old_smac; qp->pri.smac = new_smac; qp->pri.smac_port = port; qp->pri.smac_index = new_smac_index; } unlock: if (release_mac != MLX4_IB_INVALID_MAC) mlx4_unregister_mac(ibdev->dev, port, release_mac); if (qp) mutex_unlock(&qp->mutex); mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]); } static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, - struct net_device *dev, + struct ifnet *dev, unsigned long event) { struct mlx4_ib_iboe *iboe; int update_qps_port = -1; int port; iboe = &ibdev->iboe; spin_lock_bh(&iboe->lock); mlx4_foreach_ib_transport_port(port, ibdev->dev) { iboe->netdevs[port - 1] = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); if (dev == iboe->netdevs[port - 1] && (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER || event == NETDEV_UP || event == NETDEV_CHANGE)) update_qps_port = port; } spin_unlock_bh(&iboe->lock); if (update_qps_port > 0) mlx4_ib_update_qps(ibdev, dev, update_qps_port); } static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct ifnet *dev = netdev_notifier_info_to_ifp(ptr); struct mlx4_ib_dev *ibdev; if (dev->if_vnet != &init_net) return NOTIFY_DONE; ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); mlx4_ib_scan_netdevs(ibdev, dev, event); return NOTIFY_DONE; } static void init_pkeys(struct mlx4_ib_dev *ibdev) { int port; int slave; int i; if (mlx4_is_master(ibdev->dev)) { for (slave = 0; slave <= ibdev->dev->persist->num_vfs; ++slave) { for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; ++i) { ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] = /* master has the identity virt2phys pkey mapping */ (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i : ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1; mlx4_sync_pkey_table(ibdev->dev, slave, port, i, ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]); } } } /* initialize pkey cache */ for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; ++i) ibdev->pkeys.phys_pkey_cache[port-1][i] = (i) ? 0 : 0xFFFF; } } } static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { int i, j, eq = 0, total_eqs = 0; ibdev->eq_table = kcalloc(dev->caps.num_comp_vectors, sizeof(ibdev->eq_table[0]), GFP_KERNEL); if (!ibdev->eq_table) return; for (i = 1; i <= dev->caps.num_ports; i++) { for (j = 0; j < mlx4_get_eqs_per_port(dev, i); j++, total_eqs++) { if (i > 1 && mlx4_is_eq_shared(dev, total_eqs)) continue; ibdev->eq_table[eq] = total_eqs; if (!mlx4_assign_eq(dev, i, &ibdev->eq_table[eq])) eq++; else ibdev->eq_table[eq] = -1; } } for (i = eq; i < dev->caps.num_comp_vectors; ibdev->eq_table[i++] = -1) ; /* Advertise the new number of EQs to clients */ ibdev->ib_dev.num_comp_vectors = eq; } static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { int i; int total_eqs = ibdev->ib_dev.num_comp_vectors; /* no eqs were allocated */ if (!ibdev->eq_table) return; /* Reset the advertised EQ number */ ibdev->ib_dev.num_comp_vectors = 0; for (i = 0; i < total_eqs; i++) mlx4_release_eq(dev, ibdev->eq_table[i]); kfree(ibdev->eq_table); ibdev->eq_table = NULL; } static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { struct ib_port_attr attr; struct mlx4_ib_dev *mdev = to_mdev(ibdev); int err; err = mlx4_ib_query_port(ibdev, port_num, &attr); if (err) return err; immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; if (mlx4_ib_port_link_layer(ibdev, port_num) == IB_LINK_LAYER_INFINIBAND) { immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB; } else { if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE | RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; } immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; } static void get_fw_ver_str(struct ib_device *device, char *str, size_t str_len) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev); snprintf(str, str_len, "%d.%d.%d", (int) (dev->dev->caps.fw_ver >> 32), (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, (int) dev->dev->caps.fw_ver & 0xffff); } static void *mlx4_ib_add(struct mlx4_dev *dev) { struct mlx4_ib_dev *ibdev; int num_ports; int i, j; int err; struct mlx4_ib_iboe *iboe; int ib_num_ports = 0; int num_req_counters; int allocated; u32 counter_index; struct counter_index *new_counter_index = NULL; pr_info_once("%s", mlx4_ib_version); num_ports = 0; mlx4_foreach_ib_transport_port(i, dev) num_ports++; /* No point in registering a device with no ports... */ if (num_ports == 0) return NULL; ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); if (!ibdev) { dev_err(&dev->persist->pdev->dev, "Device struct alloc failed\n"); return NULL; } iboe = &ibdev->iboe; if (mlx4_pd_alloc(dev, &ibdev->priv_pdn)) goto err_dealloc; if (mlx4_uar_alloc(dev, &ibdev->priv_uar)) goto err_pd; ibdev->uar_map = ioremap((phys_addr_t) ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!ibdev->uar_map) goto err_uar; MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); ibdev->dev = dev; ibdev->bond_next_port = 0; strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; ibdev->num_ports = num_ports; ibdev->ib_dev.phys_port_cnt = mlx4_is_bonded(dev) ? 1 : ibdev->num_ports; ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; ibdev->ib_dev.dma_device = &dev->persist->pdev->dev; ibdev->ib_dev.get_netdev = mlx4_ib_get_netdev; ibdev->ib_dev.add_gid = mlx4_ib_add_gid; ibdev->ib_dev.del_gid = mlx4_ib_del_gid; if (dev->caps.userspace_caps) ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; else ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_REREG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | (1ull << IB_USER_VERBS_CMD_OPEN_QP); ibdev->ib_dev.query_device = mlx4_ib_query_device; ibdev->ib_dev.query_port = mlx4_ib_query_port; ibdev->ib_dev.get_link_layer = mlx4_ib_port_link_layer; ibdev->ib_dev.query_gid = mlx4_ib_query_gid; ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey; ibdev->ib_dev.modify_device = mlx4_ib_modify_device; ibdev->ib_dev.modify_port = mlx4_ib_modify_port; ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext; ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext; ibdev->ib_dev.mmap = mlx4_ib_mmap; ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd; ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd; ibdev->ib_dev.create_ah = mlx4_ib_create_ah; ibdev->ib_dev.query_ah = mlx4_ib_query_ah; ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah; ibdev->ib_dev.create_srq = mlx4_ib_create_srq; ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq; ibdev->ib_dev.query_srq = mlx4_ib_query_srq; ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq; ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv; ibdev->ib_dev.create_qp = mlx4_ib_create_qp; ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp; ibdev->ib_dev.query_qp = mlx4_ib_query_qp; ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp; ibdev->ib_dev.post_send = mlx4_ib_post_send; ibdev->ib_dev.post_recv = mlx4_ib_post_recv; ibdev->ib_dev.create_cq = mlx4_ib_create_cq; ibdev->ib_dev.modify_cq = mlx4_ib_modify_cq; ibdev->ib_dev.resize_cq = mlx4_ib_resize_cq; ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq; ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq; ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq; ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr; ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; ibdev->ib_dev.rereg_user_mr = mlx4_ib_rereg_user_mr; ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; ibdev->ib_dev.alloc_mr = mlx4_ib_alloc_mr; ibdev->ib_dev.map_mr_sg = mlx4_ib_map_mr_sg; ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; ibdev->ib_dev.process_mad = mlx4_ib_process_mad; ibdev->ib_dev.get_port_immutable = mlx4_port_immutable; ibdev->ib_dev.get_dev_fw_str = get_fw_ver_str; if (!mlx4_is_slave(ibdev->dev)) { ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; } if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW || dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw; ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw; ibdev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); } if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; ibdev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } if (check_flow_steering_support(dev)) { ibdev->steering_support = MLX4_STEERING_MODE_DEVICE_MANAGED; ibdev->ib_dev.create_flow = mlx4_ib_create_flow; ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; ibdev->ib_dev.uverbs_ex_cmd_mask |= (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); } ibdev->ib_dev.uverbs_ex_cmd_mask |= (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP); mlx4_ib_alloc_eqs(dev, ibdev); spin_lock_init(&iboe->lock); if (init_node_data(ibdev)) goto err_map; mlx4_init_sl2vl_tbl(ibdev); for (i = 0; i < ibdev->num_ports; ++i) { mutex_init(&ibdev->counters_table[i].mutex); INIT_LIST_HEAD(&ibdev->counters_table[i].counters_list); } num_req_counters = mlx4_is_bonded(dev) ? 1 : ibdev->num_ports; for (i = 0; i < num_req_counters; ++i) { mutex_init(&ibdev->qp1_proxy_lock[i]); allocated = 0; if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == IB_LINK_LAYER_ETHERNET) { err = mlx4_counter_alloc(ibdev->dev, &counter_index); /* if failed to allocate a new counter, use default */ if (err) counter_index = mlx4_get_default_counter_index(dev, i + 1); else allocated = 1; } else { /* IB_LINK_LAYER_INFINIBAND use the default counter */ counter_index = mlx4_get_default_counter_index(dev, i + 1); } new_counter_index = kmalloc(sizeof(*new_counter_index), GFP_KERNEL); if (!new_counter_index) { if (allocated) mlx4_counter_free(ibdev->dev, counter_index); goto err_counter; } new_counter_index->index = counter_index; new_counter_index->allocated = allocated; list_add_tail(&new_counter_index->list, &ibdev->counters_table[i].counters_list); ibdev->counters_table[i].default_counter = counter_index; pr_info("counter index %d for port %d allocated %d\n", counter_index, i + 1, allocated); } if (mlx4_is_bonded(dev)) for (i = 1; i < ibdev->num_ports ; ++i) { new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); if (!new_counter_index) goto err_counter; new_counter_index->index = counter_index; new_counter_index->allocated = 0; list_add_tail(&new_counter_index->list, &ibdev->counters_table[i].counters_list); ibdev->counters_table[i].default_counter = counter_index; } mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) ib_num_ports++; spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); INIT_LIST_HEAD(&ibdev->qp_list); spin_lock_init(&ibdev->reset_flow_resource_lock); if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED && ib_num_ports) { ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, MLX4_IB_UC_STEER_QPN_ALIGN, &ibdev->steer_qpn_base, 0); if (err) goto err_counter; ibdev->ib_uc_qpns_bitmap = kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) * sizeof(long), GFP_KERNEL); if (!ibdev->ib_uc_qpns_bitmap) { dev_err(&dev->persist->pdev->dev, "bit map alloc failed\n"); goto err_steer_qp_release; } bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count); err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE( dev, ibdev->steer_qpn_base, ibdev->steer_qpn_base + ibdev->steer_qpn_count - 1); if (err) goto err_steer_free_bitmap; } for (j = 1; j <= ibdev->dev->caps.num_ports; j++) atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]); if (mlx4_ib_alloc_diag_counters(ibdev)) goto err_steer_free_bitmap; if (ib_register_device(&ibdev->ib_dev, NULL)) goto err_diag_counters; if (mlx4_ib_mad_init(ibdev)) goto err_reg; if (mlx4_ib_init_sriov(ibdev)) goto err_mad; if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE || dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { if (!iboe->nb.notifier_call) { iboe->nb.notifier_call = mlx4_ib_netdev_event; err = register_netdevice_notifier(&iboe->nb); if (err) { iboe->nb.notifier_call = NULL; goto err_notif; } } if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { err = mlx4_config_roce_v2_port(dev, ROCE_V2_UDP_DPORT); if (err) { goto err_notif; } } } for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { if (device_create_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j])) goto err_notif; } ibdev->ib_active = true; if (mlx4_is_mfunc(ibdev->dev)) init_pkeys(ibdev); /* create paravirt contexts for any VFs which are active */ if (mlx4_is_master(ibdev->dev)) { for (j = 0; j < MLX4_MFUNC_MAX; j++) { if (j == mlx4_master_func_num(ibdev->dev)) continue; if (mlx4_is_slave_active(ibdev->dev, j)) do_slave_init(ibdev, j, 1); } } return ibdev; err_notif: if (ibdev->iboe.nb.notifier_call) { if (unregister_netdevice_notifier(&ibdev->iboe.nb)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } flush_workqueue(wq); mlx4_ib_close_sriov(ibdev); err_mad: mlx4_ib_mad_cleanup(ibdev); err_reg: ib_unregister_device(&ibdev->ib_dev); err_diag_counters: mlx4_ib_diag_cleanup(ibdev); err_steer_free_bitmap: kfree(ibdev->ib_uc_qpns_bitmap); err_steer_qp_release: if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) mlx4_qp_release_range(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_count); err_counter: for (i = 0; i < ibdev->num_ports; ++i) mlx4_ib_delete_counters_table(ibdev, &ibdev->counters_table[i]); err_map: iounmap(ibdev->uar_map); err_uar: mlx4_uar_free(dev, &ibdev->priv_uar); err_pd: mlx4_pd_free(dev, ibdev->priv_pdn); err_dealloc: ib_dealloc_device(&ibdev->ib_dev); return NULL; } int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) { int offset; WARN_ON(!dev->ib_uc_qpns_bitmap); offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, dev->steer_qpn_count, get_count_order(count)); if (offset < 0) return offset; *qpn = dev->steer_qpn_base + offset; return 0; } void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) { if (!qpn || dev->steering_support != MLX4_STEERING_MODE_DEVICE_MANAGED) return; BUG_ON(qpn < dev->steer_qpn_base); bitmap_release_region(dev->ib_uc_qpns_bitmap, qpn - dev->steer_qpn_base, get_count_order(count)); } int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, int is_attach) { int err; size_t flow_size; struct ib_flow_attr *flow = NULL; struct ib_flow_spec_ib *ib_spec; if (is_attach) { flow_size = sizeof(struct ib_flow_attr) + sizeof(struct ib_flow_spec_ib); flow = kzalloc(flow_size, GFP_KERNEL); if (!flow) return -ENOMEM; flow->port = mqp->port; flow->num_of_specs = 1; flow->size = flow_size; ib_spec = (struct ib_flow_spec_ib *)(flow + 1); ib_spec->type = IB_FLOW_SPEC_IB; ib_spec->size = sizeof(struct ib_flow_spec_ib); /* Add an empty rule for IB L2 */ memset(&ib_spec->mask, 0, sizeof(ib_spec->mask)); err = __mlx4_ib_create_flow(&mqp->ibqp, flow, IB_FLOW_DOMAIN_NIC, MLX4_FS_REGULAR, &mqp->reg_id); } else { err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id); } kfree(flow); return err; } static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) { struct mlx4_ib_dev *ibdev = ibdev_ptr; int p; ibdev->ib_active = false; flush_workqueue(wq); mlx4_ib_close_sriov(ibdev); mlx4_ib_mad_cleanup(ibdev); ib_unregister_device(&ibdev->ib_dev); mlx4_ib_diag_cleanup(ibdev); if (ibdev->iboe.nb.notifier_call) { if (unregister_netdevice_notifier(&ibdev->iboe.nb)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) { mlx4_qp_release_range(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_count); kfree(ibdev->ib_uc_qpns_bitmap); } iounmap(ibdev->uar_map); for (p = 0; p < ibdev->num_ports; ++p) mlx4_ib_delete_counters_table(ibdev, &ibdev->counters_table[p]); mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB) mlx4_CLOSE_PORT(dev, p); mlx4_ib_free_eqs(dev, ibdev); mlx4_uar_free(dev, &ibdev->priv_uar); mlx4_pd_free(dev, ibdev->priv_pdn); ib_dealloc_device(&ibdev->ib_dev); } static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) { struct mlx4_ib_demux_work **dm = NULL; struct mlx4_dev *dev = ibdev->dev; int i; unsigned long flags; struct mlx4_active_ports actv_ports; unsigned int ports; unsigned int first_port; if (!mlx4_is_master(dev)) return; actv_ports = mlx4_get_active_ports(dev, slave); ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports); first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports); dm = kcalloc(ports, sizeof(*dm), GFP_ATOMIC); if (!dm) { pr_err("failed to allocate memory for tunneling qp update\n"); return; } for (i = 0; i < ports; i++) { dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); if (!dm[i]) { pr_err("failed to allocate memory for tunneling qp update work struct\n"); while (--i >= 0) kfree(dm[i]); goto out; } INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); dm[i]->port = first_port + i + 1; dm[i]->slave = slave; dm[i]->do_init = do_init; dm[i]->dev = ibdev; } /* initialize or tear down tunnel QPs for the slave */ spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); if (!ibdev->sriov.is_going_down) { for (i = 0; i < ports; i++) queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); } else { spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); for (i = 0; i < ports; i++) kfree(dm[i]); } out: kfree(dm); return; } static void mlx4_ib_handle_catas_error(struct mlx4_ib_dev *ibdev) { struct mlx4_ib_qp *mqp; unsigned long flags_qp; unsigned long flags_cq; struct mlx4_ib_cq *send_mcq, *recv_mcq; struct list_head cq_notify_list; struct mlx4_cq *mcq; unsigned long flags; pr_warn("mlx4_ib_handle_catas_error was started\n"); INIT_LIST_HEAD(&cq_notify_list); /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { spin_lock_irqsave(&mqp->sq.lock, flags_qp); if (mqp->sq.tail != mqp->sq.head) { send_mcq = to_mcq(mqp->ibqp.send_cq); spin_lock_irqsave(&send_mcq->lock, flags_cq); if (send_mcq->mcq.comp && mqp->ibqp.send_cq->comp_handler) { if (!send_mcq->mcq.reset_notify_added) { send_mcq->mcq.reset_notify_added = 1; list_add_tail(&send_mcq->mcq.reset_notify, &cq_notify_list); } } spin_unlock_irqrestore(&send_mcq->lock, flags_cq); } spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); /* Now, handle the QP's receive queue */ spin_lock_irqsave(&mqp->rq.lock, flags_qp); /* no handling is needed for SRQ */ if (!mqp->ibqp.srq) { if (mqp->rq.tail != mqp->rq.head) { recv_mcq = to_mcq(mqp->ibqp.recv_cq); spin_lock_irqsave(&recv_mcq->lock, flags_cq); if (recv_mcq->mcq.comp && mqp->ibqp.recv_cq->comp_handler) { if (!recv_mcq->mcq.reset_notify_added) { recv_mcq->mcq.reset_notify_added = 1; list_add_tail(&recv_mcq->mcq.reset_notify, &cq_notify_list); } } spin_unlock_irqrestore(&recv_mcq->lock, flags_cq); } } spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); } list_for_each_entry(mcq, &cq_notify_list, reset_notify) { mcq->comp(mcq); } spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); pr_warn("mlx4_ib_handle_catas_error ended\n"); } static void handle_bonded_port_state_event(struct work_struct *work) { struct ib_event_work *ew = container_of(work, struct ib_event_work, work); struct mlx4_ib_dev *ibdev = ew->ib_dev; enum ib_port_state bonded_port_state = IB_PORT_NOP; int i; struct ib_event ibev; kfree(ew); spin_lock_bh(&ibdev->iboe.lock); for (i = 0; i < MLX4_MAX_PORTS; ++i) { - struct net_device *curr_netdev = ibdev->iboe.netdevs[i]; + struct ifnet *curr_netdev = ibdev->iboe.netdevs[i]; enum ib_port_state curr_port_state; if (!curr_netdev) continue; curr_port_state = ((curr_netdev->if_drv_flags & IFF_DRV_RUNNING) != 0 && curr_netdev->if_link_state == LINK_STATE_UP) ? IB_PORT_ACTIVE : IB_PORT_DOWN; bonded_port_state = (bonded_port_state != IB_PORT_ACTIVE) ? curr_port_state : IB_PORT_ACTIVE; } spin_unlock_bh(&ibdev->iboe.lock); ibev.device = &ibdev->ib_dev; ibev.element.port_num = 1; ibev.event = (bonded_port_state == IB_PORT_ACTIVE) ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; ib_dispatch_event(&ibev); } void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port) { u64 sl2vl; int err; err = mlx4_ib_query_sl2vl(&mdev->ib_dev, port, &sl2vl); if (err) { pr_err("Unable to get current sl to vl mapping for port %d. Using all zeroes (%d)\n", port, err); sl2vl = 0; } atomic64_set(&mdev->sl2vl[port - 1], sl2vl); } static void ib_sl2vl_update_work(struct work_struct *work) { struct ib_event_work *ew = container_of(work, struct ib_event_work, work); struct mlx4_ib_dev *mdev = ew->ib_dev; int port = ew->port; mlx4_ib_sl2vl_update(mdev, port); kfree(ew); } void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev, int port) { struct ib_event_work *ew; ew = kmalloc(sizeof(*ew), GFP_ATOMIC); if (ew) { INIT_WORK(&ew->work, ib_sl2vl_update_work); ew->port = port; ew->ib_dev = ibdev; queue_work(wq, &ew->work); } else { pr_err("failed to allocate memory for sl2vl update work\n"); } } static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, enum mlx4_dev_event event, unsigned long param) { struct ib_event ibev; struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr); struct mlx4_eqe *eqe = NULL; struct ib_event_work *ew; int p = 0; if (mlx4_is_bonded(dev) && ((event == MLX4_DEV_EVENT_PORT_UP) || (event == MLX4_DEV_EVENT_PORT_DOWN))) { ew = kmalloc(sizeof(*ew), GFP_ATOMIC); if (!ew) return; INIT_WORK(&ew->work, handle_bonded_port_state_event); ew->ib_dev = ibdev; queue_work(wq, &ew->work); return; } if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE) eqe = (struct mlx4_eqe *)param; else p = (int) param; switch (event) { case MLX4_DEV_EVENT_PORT_UP: if (p > ibdev->num_ports) return; if (!mlx4_is_slave(dev) && rdma_port_get_link_layer(&ibdev->ib_dev, p) == IB_LINK_LAYER_INFINIBAND) { if (mlx4_is_master(dev)) mlx4_ib_invalidate_all_guid_record(ibdev, p); if (ibdev->dev->flags & MLX4_FLAG_SECURE_HOST && !(ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT)) mlx4_sched_ib_sl2vl_update_work(ibdev, p); } ibev.event = IB_EVENT_PORT_ACTIVE; break; case MLX4_DEV_EVENT_PORT_DOWN: if (p > ibdev->num_ports) return; ibev.event = IB_EVENT_PORT_ERR; break; case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; mlx4_ib_handle_catas_error(ibdev); break; case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: ew = kmalloc(sizeof *ew, GFP_ATOMIC); if (!ew) { pr_err("failed to allocate memory for events work\n"); break; } INIT_WORK(&ew->work, handle_port_mgmt_change_event); memcpy(&ew->ib_eqe, eqe, sizeof *eqe); ew->ib_dev = ibdev; /* need to queue only for port owner, which uses GEN_EQE */ if (mlx4_is_master(dev)) queue_work(wq, &ew->work); else handle_port_mgmt_change_event(&ew->work); return; case MLX4_DEV_EVENT_SLAVE_INIT: /* here, p is the slave id */ do_slave_init(ibdev, p, 1); if (mlx4_is_master(dev)) { int i; for (i = 1; i <= ibdev->num_ports; i++) { if (rdma_port_get_link_layer(&ibdev->ib_dev, i) == IB_LINK_LAYER_INFINIBAND) mlx4_ib_slave_alias_guid_event(ibdev, p, i, 1); } } return; case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: if (mlx4_is_master(dev)) { int i; for (i = 1; i <= ibdev->num_ports; i++) { if (rdma_port_get_link_layer(&ibdev->ib_dev, i) == IB_LINK_LAYER_INFINIBAND) mlx4_ib_slave_alias_guid_event(ibdev, p, i, 0); } } /* here, p is the slave id */ do_slave_init(ibdev, p, 0); return; default: return; } ibev.device = ibdev_ptr; ibev.element.port_num = mlx4_is_bonded(ibdev->dev) ? 1 : (u8)p; ib_dispatch_event(&ibev); } static struct mlx4_interface mlx4_ib_interface = { .add = mlx4_ib_add, .remove = mlx4_ib_remove, .event = mlx4_ib_event, .protocol = MLX4_PROT_IB_IPV6, .flags = MLX4_INTFF_BONDING }; static int __init mlx4_ib_init(void) { int err; wq = alloc_ordered_workqueue("mlx4_ib", WQ_MEM_RECLAIM); if (!wq) return -ENOMEM; err = mlx4_ib_mcg_init(); if (err) goto clean_wq; err = mlx4_register_interface(&mlx4_ib_interface); if (err) goto clean_mcg; return 0; clean_mcg: mlx4_ib_mcg_destroy(); clean_wq: destroy_workqueue(wq); return err; } static void __exit mlx4_ib_cleanup(void) { mlx4_unregister_interface(&mlx4_ib_interface); mlx4_ib_mcg_destroy(); destroy_workqueue(wq); } module_init_order(mlx4_ib_init, SI_ORDER_SEVENTH); module_exit_order(mlx4_ib_cleanup, SI_ORDER_SEVENTH); static int mlx4ib_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t mlx4ib_mod = { .name = "mlx4ib", .evhand = mlx4ib_evhand, }; DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_DEPEND(mlx4ib, mlx4, 1, 1, 1); MODULE_DEPEND(mlx4ib, ibcore, 1, 1, 1); MODULE_DEPEND(mlx4ib, linuxkpi, 1, 1, 1); diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c index c2a01b26de1c..68dddeb8f1f1 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c @@ -1,3584 +1,3584 @@ /*- * Copyright (c) 2013-2021, Mellanox Technologies, Ltd. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #if defined(CONFIG_X86) #include #endif #include #include #include #undef inode #include #include #include #include #include #include #include #include #include #include #include #include "mlx5_ib.h" #define DRIVER_NAME "mlx5ib" #ifndef DRIVER_VERSION #define DRIVER_VERSION "3.6.0" #endif #define DRIVER_RELDATE "December 2020" MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DEPEND(mlx5ib, linuxkpi, 1, 1, 1); MODULE_DEPEND(mlx5ib, mlx5, 1, 1, 1); MODULE_DEPEND(mlx5ib, ibcore, 1, 1, 1); MODULE_VERSION(mlx5ib, 1); static const char mlx5_version[] = DRIVER_NAME ": Mellanox Connect-IB Infiniband driver " DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; enum { MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3, }; static enum rdma_link_layer mlx5_port_type_cap_to_rdma_ll(int port_type_cap) { switch (port_type_cap) { case MLX5_CAP_PORT_TYPE_IB: return IB_LINK_LAYER_INFINIBAND; case MLX5_CAP_PORT_TYPE_ETH: return IB_LINK_LAYER_ETHERNET; default: return IB_LINK_LAYER_UNSPECIFIED; } } static enum rdma_link_layer mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) { struct mlx5_ib_dev *dev = to_mdev(device); int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type); return mlx5_port_type_cap_to_rdma_ll(port_type_cap); } static bool mlx5_netdev_match(struct ifnet *ndev, struct mlx5_core_dev *mdev, const char *dname) { return ndev->if_type == IFT_ETHER && ndev->if_dname != NULL && strcmp(ndev->if_dname, dname) == 0 && ndev->if_softc != NULL && *(struct mlx5_core_dev **)ndev->if_softc == mdev; } static int mlx5_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct ifnet *ndev = netdev_notifier_info_to_dev(ptr); + struct ifnet *ndev = netdev_notifier_info_to_ifp(ptr); struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev, roce.nb); switch (event) { case NETDEV_REGISTER: case NETDEV_UNREGISTER: write_lock(&ibdev->roce.netdev_lock); /* check if network interface belongs to mlx5en */ if (mlx5_netdev_match(ndev, ibdev->mdev, "mce")) ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ? NULL : ndev; write_unlock(&ibdev->roce.netdev_lock); break; case NETDEV_UP: case NETDEV_DOWN: { struct ifnet *upper = NULL; if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev)) && ibdev->ib_active) { struct ib_event ibev = {0}; ibev.device = &ibdev->ib_dev; ibev.event = (event == NETDEV_UP) ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; ibev.element.port_num = 1; ib_dispatch_event(&ibev); } break; } default: break; } return NOTIFY_DONE; } static struct ifnet *mlx5_ib_get_netdev(struct ib_device *device, u8 port_num) { struct mlx5_ib_dev *ibdev = to_mdev(device); struct ifnet *ndev; /* Ensure ndev does not disappear before we invoke if_ref() */ read_lock(&ibdev->roce.netdev_lock); ndev = ibdev->roce.netdev; if (ndev) if_ref(ndev); read_unlock(&ibdev->roce.netdev_lock); return ndev; } static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed, u8 *active_width) { switch (eth_proto_oper) { case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII): case MLX5E_PROT_MASK(MLX5E_1000BASE_KX): case MLX5E_PROT_MASK(MLX5E_100BASE_TX): case MLX5E_PROT_MASK(MLX5E_1000BASE_T): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_SDR; break; case MLX5E_PROT_MASK(MLX5E_10GBASE_T): case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4): case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4): case MLX5E_PROT_MASK(MLX5E_10GBASE_KR): case MLX5E_PROT_MASK(MLX5E_10GBASE_CR): case MLX5E_PROT_MASK(MLX5E_10GBASE_SR): case MLX5E_PROT_MASK(MLX5E_10GBASE_ER_LR): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_QDR; break; case MLX5E_PROT_MASK(MLX5E_25GBASE_CR): case MLX5E_PROT_MASK(MLX5E_25GBASE_KR): case MLX5E_PROT_MASK(MLX5E_25GBASE_SR): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_EDR; break; case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4): case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4): case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4): case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4_ER4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_QDR; break; case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2): case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2): case MLX5E_PROT_MASK(MLX5E_50GBASE_KR4): case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_HDR; break; case MLX5E_PROT_MASK(MLX5E_56GBASE_R4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_FDR; break; case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4): case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4): case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4): case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_EDR; break; default: *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_QDR; return -EINVAL; } return 0; } static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u8 *active_speed, u8 *active_width) { switch (eth_proto_oper) { case MLX5E_PROT_MASK(MLX5E_SGMII_100M): case MLX5E_PROT_MASK(MLX5E_1000BASE_X_SGMII): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_SDR; break; case MLX5E_PROT_MASK(MLX5E_5GBASE_R): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_DDR; break; case MLX5E_PROT_MASK(MLX5E_10GBASE_XFI_XAUI_1): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_QDR; break; case MLX5E_PROT_MASK(MLX5E_40GBASE_XLAUI_4_XLPPI_4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_QDR; break; case MLX5E_PROT_MASK(MLX5E_25GAUI_1_25GBASE_CR_KR): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_EDR; break; case MLX5E_PROT_MASK(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2): *active_width = IB_WIDTH_2X; *active_speed = IB_SPEED_EDR; break; case MLX5E_PROT_MASK(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR): *active_width = IB_WIDTH_1X; *active_speed = IB_SPEED_HDR; break; case MLX5E_PROT_MASK(MLX5E_CAUI_4_100GBASE_CR4_KR4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_EDR; break; case MLX5E_PROT_MASK(MLX5E_100GAUI_2_100GBASE_CR2_KR2): *active_width = IB_WIDTH_2X; *active_speed = IB_SPEED_HDR; break; case MLX5E_PROT_MASK(MLX5E_200GAUI_4_200GBASE_CR4_KR4): *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_HDR; break; default: *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_QDR; return -EINVAL; } return 0; } static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, struct ib_port_attr *props) { struct mlx5_ib_dev *dev = to_mdev(device); u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {}; struct ifnet *ndev; enum ib_mtu ndev_ib_mtu; u16 qkey_viol_cntr; u32 eth_prot_oper; bool ext; int err; memset(props, 0, sizeof(*props)); /* Possible bad flows are checked before filling out props so in case * of an error it will still be zeroed out. */ err = mlx5_query_port_ptys(dev->mdev, out, sizeof(out), MLX5_PTYS_EN, port_num); if (err) return err; ext = MLX5_CAP_PCAM_FEATURE(dev->mdev, ptys_extended_ethernet); eth_prot_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper); if (ext) translate_eth_ext_proto_oper(eth_prot_oper, &props->active_speed, &props->active_width); else translate_eth_proto_oper(eth_prot_oper, &props->active_speed, &props->active_width); props->port_cap_flags |= IB_PORT_CM_SUP; props->port_cap_flags |= IB_PORT_IP_BASED_GIDS; props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, roce_address_table_size); props->max_mtu = IB_MTU_4096; props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg); props->pkey_tbl_len = 1; props->state = IB_PORT_DOWN; props->phys_state = 3; mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr); props->qkey_viol_cntr = qkey_viol_cntr; ndev = mlx5_ib_get_netdev(device, port_num); if (!ndev) return 0; if (ndev->if_drv_flags & IFF_DRV_RUNNING && ndev->if_link_state == LINK_STATE_UP) { props->state = IB_PORT_ACTIVE; props->phys_state = 5; } ndev_ib_mtu = iboe_get_mtu(ndev->if_mtu); if_rele(ndev); props->active_mtu = min(props->max_mtu, ndev_ib_mtu); return 0; } static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid, const struct ib_gid_attr *attr, void *mlx5_addr) { #define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v) char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, source_l3_address); void *mlx5_addr_mac = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, source_mac_47_32); u16 vlan_id; if (!gid) return; ether_addr_copy(mlx5_addr_mac, IF_LLADDR(attr->ndev)); vlan_id = rdma_vlan_dev_vlan_id(attr->ndev); if (vlan_id != 0xffff) { MLX5_SET_RA(mlx5_addr, vlan_valid, 1); MLX5_SET_RA(mlx5_addr, vlan_id, vlan_id); } switch (attr->gid_type) { case IB_GID_TYPE_IB: MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1); break; case IB_GID_TYPE_ROCE_UDP_ENCAP: MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2); break; default: WARN_ON(true); } if (attr->gid_type != IB_GID_TYPE_IB) { if (ipv6_addr_v4mapped((void *)gid)) MLX5_SET_RA(mlx5_addr, roce_l3_type, MLX5_ROCE_L3_TYPE_IPV4); else MLX5_SET_RA(mlx5_addr, roce_l3_type, MLX5_ROCE_L3_TYPE_IPV6); } if ((attr->gid_type == IB_GID_TYPE_IB) || !ipv6_addr_v4mapped((void *)gid)) memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid)); else memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4); } static int set_roce_addr(struct ib_device *device, u8 port_num, unsigned int index, const union ib_gid *gid, const struct ib_gid_attr *attr) { struct mlx5_ib_dev *dev = to_mdev(device); u32 in[MLX5_ST_SZ_DW(set_roce_address_in)] = {0}; u32 out[MLX5_ST_SZ_DW(set_roce_address_out)] = {0}; void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address); enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num); if (ll != IB_LINK_LAYER_ETHERNET) return -EINVAL; ib_gid_to_mlx5_roce_addr(gid, attr, in_addr); MLX5_SET(set_roce_address_in, in, roce_address_index, index); MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); } static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num, unsigned int index, const union ib_gid *gid, const struct ib_gid_attr *attr, __always_unused void **context) { return set_roce_addr(device, port_num, index, gid, attr); } static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num, unsigned int index, __always_unused void **context) { return set_roce_addr(device, port_num, index, NULL, NULL); } __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, int index) { struct ib_gid_attr attr; union ib_gid gid; if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr)) return 0; if (!attr.ndev) return 0; if_rele(attr.ndev); if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) return 0; return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port)); } int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, int index, enum ib_gid_type *gid_type) { struct ib_gid_attr attr; union ib_gid gid; int ret; ret = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr); if (ret) return ret; if (!attr.ndev) return -ENODEV; if_rele(attr.ndev); *gid_type = attr.gid_type; return 0; } static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) { if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) return !MLX5_CAP_GEN(dev->mdev, ib_virt); return 0; } enum { MLX5_VPORT_ACCESS_METHOD_MAD, MLX5_VPORT_ACCESS_METHOD_HCA, MLX5_VPORT_ACCESS_METHOD_NIC, }; static int mlx5_get_vport_access_method(struct ib_device *ibdev) { if (mlx5_use_mad_ifc(to_mdev(ibdev))) return MLX5_VPORT_ACCESS_METHOD_MAD; if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) return MLX5_VPORT_ACCESS_METHOD_NIC; return MLX5_VPORT_ACCESS_METHOD_HCA; } static void get_atomic_caps(struct mlx5_ib_dev *dev, struct ib_device_attr *props) { u8 tmp; u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); u8 atomic_req_8B_endianness_mode = MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode); /* Check if HW supports 8 bytes standard atomic operations and capable * of host endianness respond */ tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD; if (((atomic_operations & tmp) == tmp) && (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) && (atomic_req_8B_endianness_mode)) { props->atomic_cap = IB_ATOMIC_HCA; } else { props->atomic_cap = IB_ATOMIC_NONE; } } static int mlx5_query_system_image_guid(struct ib_device *ibdev, __be64 *sys_image_guid) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; u64 tmp; int err; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_system_image_guid(ibdev, sys_image_guid); case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp); break; case MLX5_VPORT_ACCESS_METHOD_NIC: err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp); break; default: return -EINVAL; } if (!err) *sys_image_guid = cpu_to_be64(tmp); return err; } static int mlx5_query_max_pkeys(struct ib_device *ibdev, u16 *max_pkeys) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); return 0; default: return -EINVAL; } } static int mlx5_query_vendor_id(struct ib_device *ibdev, u32 *vendor_id) { struct mlx5_ib_dev *dev = to_mdev(ibdev); switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: return mlx5_core_query_vendor_id(dev->mdev, vendor_id); default: return -EINVAL; } } static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, __be64 *node_guid) { u64 tmp; int err; switch (mlx5_get_vport_access_method(&dev->ib_dev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_node_guid(dev, node_guid); case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp); break; case MLX5_VPORT_ACCESS_METHOD_NIC: err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); break; default: return -EINVAL; } if (!err) *node_guid = cpu_to_be64(tmp); return err; } struct mlx5_reg_node_desc { u8 desc[IB_DEVICE_NODE_DESC_MAX]; }; static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc) { struct mlx5_reg_node_desc in; if (mlx5_use_mad_ifc(dev)) return mlx5_query_mad_ifc_node_desc(dev, node_desc); memset(&in, 0, sizeof(in)); return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc, sizeof(struct mlx5_reg_node_desc), MLX5_REG_NODE_DESC, 0, 0); } static int mlx5_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; int err = -ENOMEM; int max_sq_desc; int max_rq_sg; int max_sq_sg; u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz); struct mlx5_ib_query_device_resp resp = {}; size_t resp_len; u64 max_tso; resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length); if (uhw->outlen && uhw->outlen < resp_len) return -EINVAL; else resp.response_length = resp_len; if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen)) return -EINVAL; memset(props, 0, sizeof(*props)); err = mlx5_query_system_image_guid(ibdev, &props->sys_image_guid); if (err) return err; err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys); if (err) return err; err = mlx5_query_vendor_id(ibdev, &props->vendor_id); if (err) return err; props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) | ((u32)fw_rev_min(dev->mdev) << 16) | fw_rev_sub(dev->mdev); props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN; if (MLX5_CAP_GEN(mdev, pkv)) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (MLX5_CAP_GEN(mdev, qkv)) props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (MLX5_CAP_GEN(mdev, apm)) props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (MLX5_CAP_GEN(mdev, xrc)) props->device_cap_flags |= IB_DEVICE_XRC; if (MLX5_CAP_GEN(mdev, imaicl)) { props->device_cap_flags |= IB_DEVICE_MEM_WINDOW | IB_DEVICE_MEM_WINDOW_TYPE_2B; props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); /* We support 'Gappy' memory registration too */ props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG; } props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (MLX5_CAP_GEN(mdev, sho)) { props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; /* At this stage no support for signature handover */ props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | IB_PROT_T10DIF_TYPE_2 | IB_PROT_T10DIF_TYPE_3; props->sig_guard_cap = IB_GUARD_T10DIF_CRC | IB_GUARD_T10DIF_CSUM; } if (MLX5_CAP_GEN(mdev, block_lb_mc)) props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) { if (MLX5_CAP_ETH(mdev, csum_cap)) props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; if (field_avail(typeof(resp), tso_caps, uhw->outlen)) { max_tso = MLX5_CAP_ETH(mdev, max_lso_cap); if (max_tso) { resp.tso_caps.max_tso = 1 << max_tso; resp.tso_caps.supported_qpts |= 1 << IB_QPT_RAW_PACKET; resp.response_length += sizeof(resp.tso_caps); } } if (field_avail(typeof(resp), rss_caps, uhw->outlen)) { resp.rss_caps.rx_hash_function = MLX5_RX_HASH_FUNC_TOEPLITZ; resp.rss_caps.rx_hash_fields_mask = MLX5_RX_HASH_SRC_IPV4 | MLX5_RX_HASH_DST_IPV4 | MLX5_RX_HASH_SRC_IPV6 | MLX5_RX_HASH_DST_IPV6 | MLX5_RX_HASH_SRC_PORT_TCP | MLX5_RX_HASH_DST_PORT_TCP | MLX5_RX_HASH_SRC_PORT_UDP | MLX5_RX_HASH_DST_PORT_UDP; resp.response_length += sizeof(resp.rss_caps); } } else { if (field_avail(typeof(resp), tso_caps, uhw->outlen)) resp.response_length += sizeof(resp.tso_caps); if (field_avail(typeof(resp), rss_caps, uhw->outlen)) resp.response_length += sizeof(resp.rss_caps); } if (MLX5_CAP_GEN(mdev, ipoib_ipoib_offloads)) { props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; props->device_cap_flags |= IB_DEVICE_UD_TSO; } if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && MLX5_CAP_ETH(dev->mdev, scatter_fcs)) props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS; if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; props->vendor_part_id = mdev->pdev->device; props->hw_ver = mdev->pdev->revision; props->max_mr_size = ~0ull; props->page_size_cap = ~(min_page_size - 1); props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp); props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) / sizeof(struct mlx5_wqe_data_seg); max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512); max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) - sizeof(struct mlx5_wqe_raddr_seg)) / sizeof(struct mlx5_wqe_data_seg); props->max_sge = min(max_rq_sg, max_sq_sg); props->max_sge_rd = MLX5_MAX_SGE_RD; props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq); props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1; props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); props->max_pd = 1 << MLX5_CAP_GEN(mdev, log_max_pd); props->max_qp_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp); props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp); props->max_srq = 1 << MLX5_CAP_GEN(mdev, log_max_srq); props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1; props->local_ca_ack_delay = MLX5_CAP_GEN(mdev, local_ca_ack_delay); props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq_sge = max_rq_sg - 1; props->max_fast_reg_page_list_len = 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); get_atomic_caps(dev, props); props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz); props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING if (MLX5_CAP_GEN(mdev, pg)) props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; props->odp_caps = dev->odp_caps; #endif if (MLX5_CAP_GEN(mdev, cd)) props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; if (!mlx5_core_is_pf(mdev)) props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION; if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) { props->rss_caps.max_rwq_indirection_tables = 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt); props->rss_caps.max_rwq_indirection_table_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size); props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET; props->max_wq_type_rq = 1 << MLX5_CAP_GEN(dev->mdev, log_max_rq); } if (uhw->outlen) { err = ib_copy_to_udata(uhw, &resp, resp.response_length); if (err) return err; } return 0; } enum mlx5_ib_width { MLX5_IB_WIDTH_1X = 1 << 0, MLX5_IB_WIDTH_2X = 1 << 1, MLX5_IB_WIDTH_4X = 1 << 2, MLX5_IB_WIDTH_8X = 1 << 3, MLX5_IB_WIDTH_12X = 1 << 4 }; static int translate_active_width(struct ib_device *ibdev, u8 active_width, u8 *ib_width) { struct mlx5_ib_dev *dev = to_mdev(ibdev); int err = 0; if (active_width & MLX5_IB_WIDTH_1X) { *ib_width = IB_WIDTH_1X; } else if (active_width & MLX5_IB_WIDTH_2X) { *ib_width = IB_WIDTH_2X; } else if (active_width & MLX5_IB_WIDTH_4X) { *ib_width = IB_WIDTH_4X; } else if (active_width & MLX5_IB_WIDTH_8X) { *ib_width = IB_WIDTH_8X; } else if (active_width & MLX5_IB_WIDTH_12X) { *ib_width = IB_WIDTH_12X; } else { mlx5_ib_dbg(dev, "Invalid active_width %d\n", (int)active_width); err = -EINVAL; } return err; } enum ib_max_vl_num { __IB_MAX_VL_0 = 1, __IB_MAX_VL_0_1 = 2, __IB_MAX_VL_0_3 = 3, __IB_MAX_VL_0_7 = 4, __IB_MAX_VL_0_14 = 5, }; enum mlx5_vl_hw_cap { MLX5_VL_HW_0 = 1, MLX5_VL_HW_0_1 = 2, MLX5_VL_HW_0_2 = 3, MLX5_VL_HW_0_3 = 4, MLX5_VL_HW_0_4 = 5, MLX5_VL_HW_0_5 = 6, MLX5_VL_HW_0_6 = 7, MLX5_VL_HW_0_7 = 8, MLX5_VL_HW_0_14 = 15 }; static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap, u8 *max_vl_num) { switch (vl_hw_cap) { case MLX5_VL_HW_0: *max_vl_num = __IB_MAX_VL_0; break; case MLX5_VL_HW_0_1: *max_vl_num = __IB_MAX_VL_0_1; break; case MLX5_VL_HW_0_3: *max_vl_num = __IB_MAX_VL_0_3; break; case MLX5_VL_HW_0_7: *max_vl_num = __IB_MAX_VL_0_7; break; case MLX5_VL_HW_0_14: *max_vl_num = __IB_MAX_VL_0_14; break; default: return -EINVAL; } return 0; } static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; u32 *rep; int replen = MLX5_ST_SZ_BYTES(query_hca_vport_context_out); struct mlx5_ptys_reg *ptys; struct mlx5_pmtu_reg *pmtu; struct mlx5_pvlc_reg pvlc; void *ctx; int err; rep = mlx5_vzalloc(replen); ptys = kzalloc(sizeof(*ptys), GFP_KERNEL); pmtu = kzalloc(sizeof(*pmtu), GFP_KERNEL); if (!rep || !ptys || !pmtu) { err = -ENOMEM; goto out; } memset(props, 0, sizeof(*props)); err = mlx5_query_hca_vport_context(mdev, port, 0, rep, replen); if (err) goto out; ctx = MLX5_ADDR_OF(query_hca_vport_context_out, rep, hca_vport_context); props->lid = MLX5_GET(hca_vport_context, ctx, lid); props->lmc = MLX5_GET(hca_vport_context, ctx, lmc); props->sm_lid = MLX5_GET(hca_vport_context, ctx, sm_lid); props->sm_sl = MLX5_GET(hca_vport_context, ctx, sm_sl); props->state = MLX5_GET(hca_vport_context, ctx, vport_state); props->phys_state = MLX5_GET(hca_vport_context, ctx, port_physical_state); props->port_cap_flags = MLX5_GET(hca_vport_context, ctx, cap_mask1); props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size)); props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); props->bad_pkey_cntr = MLX5_GET(hca_vport_context, ctx, pkey_violation_counter); props->qkey_viol_cntr = MLX5_GET(hca_vport_context, ctx, qkey_violation_counter); props->subnet_timeout = MLX5_GET(hca_vport_context, ctx, subnet_timeout); props->init_type_reply = MLX5_GET(hca_vport_context, ctx, init_type_reply); props->grh_required = MLX5_GET(hca_vport_context, ctx, grh_required); ptys->proto_mask |= MLX5_PTYS_IB; ptys->local_port = port; err = mlx5_core_access_ptys(mdev, ptys, 0); if (err) goto out; err = translate_active_width(ibdev, ptys->ib_link_width_oper, &props->active_width); if (err) goto out; props->active_speed = (u8)ptys->ib_proto_oper; pmtu->local_port = port; err = mlx5_core_access_pmtu(mdev, pmtu, 0); if (err) goto out; props->max_mtu = pmtu->max_mtu; props->active_mtu = pmtu->oper_mtu; memset(&pvlc, 0, sizeof(pvlc)); pvlc.local_port = port; err = mlx5_core_access_pvlc(mdev, &pvlc, 0); if (err) goto out; err = translate_max_vl_num(ibdev, pvlc.vl_hw_cap, &props->max_vl_num); out: kvfree(rep); kfree(ptys); kfree(pmtu); return err; } int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_port(ibdev, port, props); case MLX5_VPORT_ACCESS_METHOD_HCA: return mlx5_query_hca_port(ibdev, port, props); case MLX5_VPORT_ACCESS_METHOD_NIC: return mlx5_query_port_roce(ibdev, port, props); default: return -EINVAL; } } static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_gids(ibdev, port, index, gid); case MLX5_VPORT_ACCESS_METHOD_HCA: return mlx5_query_hca_vport_gid(mdev, port, 0, index, gid); default: return -EINVAL; } } static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index, pkey); default: return -EINVAL; } } static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_reg_node_desc in; struct mlx5_reg_node_desc out; int err; if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) return 0; /* * If possible, pass node desc to FW, so it can generate * a 144 trap. If cmd fails, just ignore. */ memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX); err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out, sizeof(out), MLX5_REG_NODE_DESC, 0, 1); if (err) return err; memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX); return err; } static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct ib_port_attr attr; u32 tmp; int err; /* * CM layer calls ib_modify_port() regardless of the link * layer. For Ethernet ports, qkey violation and Port * capabilities are meaningless. */ if (mlx5_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_ETHERNET) return 0; mutex_lock(&dev->cap_mask_mutex); err = mlx5_ib_query_port(ibdev, port, &attr); if (err) goto out; tmp = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; err = mlx5_set_port_caps(dev->mdev, port, tmp); out: mutex_unlock(&dev->cap_mask_mutex); return err; } static u16 calc_dynamic_bfregs(int uars_per_sys_page) { /* Large page with non 4k uar support might limit the dynamic size */ if (uars_per_sys_page == 1 && PAGE_SIZE > 4096) return MLX5_MIN_DYN_BFREGS; return MLX5_MAX_DYN_BFREGS; } static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k, struct mlx5_ib_alloc_ucontext_req_v2 *req, struct mlx5_bfreg_info *bfregi) { int uars_per_sys_page; int bfregs_per_sys_page; int ref_bfregs = req->total_num_bfregs; if (req->total_num_bfregs == 0) return -EINVAL; BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE); BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE); if (req->total_num_bfregs > MLX5_MAX_BFREGS) return -ENOMEM; uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k); bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR; /* This holds the required static allocation asked by the user */ req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page); if (req->num_low_latency_bfregs > req->total_num_bfregs - 1) return -EINVAL; bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page; bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page); bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs; bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page; mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n", MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no", lib_uar_4k ? "yes" : "no", ref_bfregs, req->total_num_bfregs, bfregi->total_num_bfregs, bfregi->num_sys_pages); return 0; } static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) { struct mlx5_bfreg_info *bfregi; int err; int i; bfregi = &context->bfregi; for (i = 0; i < bfregi->num_static_sys_pages; i++) { err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]); if (err) goto error; mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]); } for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++) bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX; return 0; error: for (--i; i >= 0; i--) if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i])) mlx5_ib_warn(dev, "failed to free uar %d\n", i); return err; } static void deallocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) { struct mlx5_bfreg_info *bfregi; int i; bfregi = &context->bfregi; for (i = 0; i < bfregi->num_sys_pages; i++) if (i < bfregi->num_static_sys_pages || bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX) mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); } static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_alloc_ucontext_req_v2 req = {}; struct mlx5_ib_alloc_ucontext_resp resp = {}; struct mlx5_ib_ucontext *context; struct mlx5_bfreg_info *bfregi; int ver; int err; size_t reqlen; size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2, max_cqe_version); bool lib_uar_4k; bool lib_uar_dyn; if (!dev->ib_active) return ERR_PTR(-EAGAIN); if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr)) return ERR_PTR(-EINVAL); reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) ver = 0; else if (reqlen >= min_req_v2) ver = 2; else return ERR_PTR(-EINVAL); err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req))); if (err) return ERR_PTR(err); if (req.flags) return ERR_PTR(-EINVAL); if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2) return ERR_PTR(-EOPNOTSUPP); req.total_num_bfregs = ALIGN(req.total_num_bfregs, MLX5_NON_FP_BFREGS_PER_UAR); if (req.num_low_latency_bfregs > req.total_num_bfregs - 1) return ERR_PTR(-EINVAL); if (reqlen > sizeof(req) && !ib_is_udata_cleared(udata, sizeof(req), reqlen - sizeof(req))) return ERR_PTR(-EOPNOTSUPP); resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); resp.cache_line_size = cache_line_size(); resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); resp.cqe_version = min_t(__u8, (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version), req.max_cqe_version); resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT; resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1; resp.response_length = min(offsetof(typeof(resp), response_length) + sizeof(resp.response_length), udata->outlen); context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR; lib_uar_dyn = req.lib_caps & MLX5_LIB_CAP_DYN_UAR; bfregi = &context->bfregi; if (lib_uar_dyn) { bfregi->lib_uar_dyn = lib_uar_dyn; goto uar_done; } /* updates req->total_num_bfregs */ err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi); if (err) goto out_ctx; mutex_init(&bfregi->lock); bfregi->lib_uar_4k = lib_uar_4k; bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count), GFP_KERNEL); if (!bfregi->count) { err = -ENOMEM; goto out_ctx; } bfregi->sys_pages = kcalloc(bfregi->num_sys_pages, sizeof(*bfregi->sys_pages), GFP_KERNEL); if (!bfregi->sys_pages) { err = -ENOMEM; goto out_count; } err = allocate_uars(dev, context); if (err) goto out_sys_pages; uar_done: #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; #endif if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) { err = mlx5_alloc_transport_domain(dev->mdev, &context->tdn); if (err) goto out_uars; } INIT_LIST_HEAD(&context->vma_private_list); INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); resp.tot_bfregs = lib_uar_dyn ? 0 : req.total_num_bfregs; resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports); if (field_avail(typeof(resp), cqe_version, udata->outlen)) resp.response_length += sizeof(resp.cqe_version); if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) { resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE | MLX5_USER_CMDS_SUPP_UHW_CREATE_AH; resp.response_length += sizeof(resp.cmds_supp_uhw); } /* * We don't want to expose information from the PCI bar that is located * after 4096 bytes, so if the arch only supports larger pages, let's * pretend we don't support reading the HCA's core clock. This is also * forced by mmap function. */ if (offsetofend(typeof(resp), hca_core_clock_offset) <= udata->outlen) { if (PAGE_SIZE <= 4096) { resp.comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; resp.hca_core_clock_offset = offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE; } resp.response_length += sizeof(resp.hca_core_clock_offset); } if (offsetofend(typeof(resp), log_uar_size) <= udata->outlen) resp.response_length += sizeof(resp.log_uar_size); if (offsetofend(typeof(resp), num_uars_per_page) <= udata->outlen) resp.response_length += sizeof(resp.num_uars_per_page); if (offsetofend(typeof(resp), num_dyn_bfregs) <= udata->outlen) { resp.num_dyn_bfregs = bfregi->num_dyn_bfregs; resp.response_length += sizeof(resp.num_dyn_bfregs); } err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) goto out_td; bfregi->ver = ver; bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs; context->cqe_version = resp.cqe_version; return &context->ibucontext; out_td: if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) mlx5_dealloc_transport_domain(dev->mdev, context->tdn); out_uars: deallocate_uars(dev, context); out_sys_pages: kfree(bfregi->sys_pages); out_count: kfree(bfregi->count); out_ctx: kfree(context); return ERR_PTR(err); } static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_bfreg_info *bfregi; bfregi = &context->bfregi; if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) mlx5_dealloc_transport_domain(dev->mdev, context->tdn); deallocate_uars(dev, context); kfree(bfregi->sys_pages); kfree(bfregi->count); kfree(context); return 0; } static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int uar_idx) { int fw_uars_per_page; fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1; return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + uar_idx / fw_uars_per_page; } static int get_command(unsigned long offset) { return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK; } static int get_arg(unsigned long offset) { return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1); } static int get_index(unsigned long offset) { return get_arg(offset); } /* Index resides in an extra byte to enable larger values than 255 */ static int get_extended_index(unsigned long offset) { return get_arg(offset) | ((offset >> 16) & 0xff) << 8; } static void mlx5_ib_vma_open(struct vm_area_struct *area) { /* vma_open is called when a new VMA is created on top of our VMA. This * is done through either mremap flow or split_vma (usually due to * mlock, madvise, munmap, etc.) We do not support a clone of the VMA, * as this VMA is strongly hardware related. Therefore we set the * vm_ops of the newly created/cloned VMA to NULL, to prevent it from * calling us again and trying to do incorrect actions. We assume that * the original VMA size is exactly a single page, and therefore all * "splitting" operation will not happen to it. */ area->vm_ops = NULL; } static void mlx5_ib_vma_close(struct vm_area_struct *area) { struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data; /* It's guaranteed that all VMAs opened on a FD are closed before the * file itself is closed, therefore no sync is needed with the regular * closing flow. (e.g. mlx5 ib_dealloc_ucontext) * However need a sync with accessing the vma as part of * mlx5_ib_disassociate_ucontext. * The close operation is usually called under mm->mmap_sem except when * process is exiting. * The exiting case is handled explicitly as part of * mlx5_ib_disassociate_ucontext. */ mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data; /* setting the vma context pointer to null in the mlx5_ib driver's * private data, to protect a race condition in * mlx5_ib_disassociate_ucontext(). */ mlx5_ib_vma_priv_data->vma = NULL; list_del(&mlx5_ib_vma_priv_data->list); kfree(mlx5_ib_vma_priv_data); } static const struct vm_operations_struct mlx5_ib_vm_ops = { .open = mlx5_ib_vma_open, .close = mlx5_ib_vma_close }; static int mlx5_ib_set_vma_data(struct vm_area_struct *vma, struct mlx5_ib_ucontext *ctx) { struct mlx5_ib_vma_private_data *vma_prv; struct list_head *vma_head = &ctx->vma_private_list; vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL); if (!vma_prv) return -ENOMEM; vma_prv->vma = vma; vma->vm_private_data = vma_prv; vma->vm_ops = &mlx5_ib_vm_ops; list_add(&vma_prv->list, vma_head); return 0; } static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) { int ret; struct vm_area_struct *vma; struct mlx5_ib_vma_private_data *vma_private, *n; struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = NULL; owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); if (!owning_process) return; owning_mm = get_task_mm(owning_process); if (!owning_mm) { pr_info("no mm, disassociate ucontext is pending task termination\n"); while (1) { put_task_struct(owning_process); usleep_range(1000, 2000); owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); if (!owning_process || owning_process->task_thread-> td_proc->p_state == PRS_ZOMBIE) { pr_info("disassociate ucontext done, task was terminated\n"); /* in case task was dead need to release the * task struct. */ if (owning_process) put_task_struct(owning_process); return; } } } /* need to protect from a race on closing the vma as part of * mlx5_ib_vma_close. */ down_write(&owning_mm->mmap_sem); list_for_each_entry_safe(vma_private, n, &context->vma_private_list, list) { vma = vma_private->vma; ret = zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE); if (ret == -ENOTSUP) { if (bootverbose) WARN_ONCE( "%s: zap_vma_ptes not implemented for unmanaged mappings", __func__); } else { WARN(ret, "%s: zap_vma_ptes failed, error %d", __func__, -ret); } /* context going to be destroyed, should * not access ops any more. */ /* XXXKIB vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); */ vma->vm_ops = NULL; list_del(&vma_private->list); kfree(vma_private); } up_write(&owning_mm->mmap_sem); mmput(owning_mm); put_task_struct(owning_process); } static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) { switch (cmd) { case MLX5_IB_MMAP_WC_PAGE: return "WC"; case MLX5_IB_MMAP_REGULAR_PAGE: return "best effort WC"; case MLX5_IB_MMAP_NC_PAGE: return "NC"; default: return NULL; } } static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, struct vm_area_struct *vma, struct mlx5_ib_ucontext *context) { struct mlx5_bfreg_info *bfregi = &context->bfregi; int err; unsigned long idx; phys_addr_t pfn; pgprot_t prot; u32 bfreg_dyn_idx = 0; u32 uar_index; int dyn_uar = (cmd == MLX5_IB_MMAP_WC_PAGE); int max_valid_idx = dyn_uar ? bfregi->num_sys_pages : bfregi->num_static_sys_pages; if (bfregi->lib_uar_dyn) return -EINVAL; if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; if (dyn_uar) idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages; else idx = get_index(vma->vm_pgoff); if (idx >= max_valid_idx) { mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n", idx, max_valid_idx); return -EINVAL; } switch (cmd) { case MLX5_IB_MMAP_WC_PAGE: case MLX5_IB_MMAP_REGULAR_PAGE: /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */ prot = pgprot_writecombine(vma->vm_page_prot); break; case MLX5_IB_MMAP_NC_PAGE: prot = pgprot_noncached(vma->vm_page_prot); break; default: return -EINVAL; } if (dyn_uar) { int uars_per_page; uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k); bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR); if (bfreg_dyn_idx >= bfregi->total_num_bfregs) { mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n", bfreg_dyn_idx, bfregi->total_num_bfregs); return -EINVAL; } mutex_lock(&bfregi->lock); /* Fail if uar already allocated, first bfreg index of each * page holds its count. */ if (bfregi->count[bfreg_dyn_idx]) { mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx); mutex_unlock(&bfregi->lock); return -EINVAL; } bfregi->count[bfreg_dyn_idx]++; mutex_unlock(&bfregi->lock); err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index); if (err) { mlx5_ib_warn(dev, "UAR alloc failed\n"); goto free_bfreg; } } else { uar_index = bfregi->sys_pages[idx]; } pfn = uar_index2pfn(dev, uar_index); mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); vma->vm_page_prot = prot; err = io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, vma->vm_page_prot); if (err) { mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%llx, pfn=%pa, mmap_cmd=%s\n", err, (unsigned long long)vma->vm_start, &pfn, mmap_cmd2str(cmd)); goto err; } if (dyn_uar) bfregi->sys_pages[idx] = uar_index; return mlx5_ib_set_vma_data(vma, context); err: if (!dyn_uar) return err; mlx5_cmd_free_uar(dev->mdev, idx); free_bfreg: mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx); return err; } static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); unsigned long command; phys_addr_t pfn; command = get_command(vma->vm_pgoff); switch (command) { case MLX5_IB_MMAP_WC_PAGE: case MLX5_IB_MMAP_NC_PAGE: case MLX5_IB_MMAP_REGULAR_PAGE: return uar_mmap(dev, command, vma, context); case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: return -ENOSYS; case MLX5_IB_MMAP_CORE_CLOCK: if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; if (vma->vm_flags & VM_WRITE) return -EPERM; /* Don't expose to user-space information it shouldn't have */ if (PAGE_SIZE > 4096) return -EOPNOTSUPP; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); pfn = (dev->mdev->iseg_base + offsetof(struct mlx5_init_seg, internal_timer_h)) >> PAGE_SHIFT; if (io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; mlx5_ib_dbg(dev, "mapped internal timer at 0x%llx, PA 0x%llx\n", (unsigned long long)vma->vm_start, (unsigned long long)pfn << PAGE_SHIFT); break; default: return -EINVAL; } return 0; } static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx5_ib_alloc_pd_resp resp; struct mlx5_ib_pd *pd; int err; pd = kmalloc(sizeof(*pd), GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn); if (err) { kfree(pd); return ERR_PTR(err); } if (context) { resp.pdn = pd->pdn; if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); kfree(pd); return ERR_PTR(-EFAULT); } } return &pd->ibpd; } static int mlx5_ib_dealloc_pd(struct ib_pd *pd) { struct mlx5_ib_dev *mdev = to_mdev(pd->device); struct mlx5_ib_pd *mpd = to_mpd(pd); mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn); kfree(mpd); return 0; } enum { MATCH_CRITERIA_ENABLE_OUTER_BIT, MATCH_CRITERIA_ENABLE_MISC_BIT, MATCH_CRITERIA_ENABLE_INNER_BIT }; #define HEADER_IS_ZERO(match_criteria, headers) \ !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \ 0, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \ static u8 get_match_criteria_enable(u32 *match_criteria) { u8 match_criteria_enable; match_criteria_enable = (!HEADER_IS_ZERO(match_criteria, outer_headers)) << MATCH_CRITERIA_ENABLE_OUTER_BIT; match_criteria_enable |= (!HEADER_IS_ZERO(match_criteria, misc_parameters)) << MATCH_CRITERIA_ENABLE_MISC_BIT; match_criteria_enable |= (!HEADER_IS_ZERO(match_criteria, inner_headers)) << MATCH_CRITERIA_ENABLE_INNER_BIT; return match_criteria_enable; } static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val) { MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask); MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val); } static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) { MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask); MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val); MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2); MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2); } #define LAST_ETH_FIELD vlan_tag #define LAST_IB_FIELD sl #define LAST_IPV4_FIELD tos #define LAST_IPV6_FIELD traffic_class #define LAST_TCP_UDP_FIELD src_port /* Field is the last supported field */ #define FIELDS_NOT_SUPPORTED(filter, field)\ memchr_inv((void *)&filter.field +\ sizeof(filter.field), 0,\ sizeof(filter) -\ offsetof(typeof(filter), field) -\ sizeof(filter.field)) static int parse_flow_attr(u32 *match_c, u32 *match_v, const union ib_flow_spec *ib_spec) { void *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c, outer_headers); void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v, outer_headers); void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters); void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v, misc_parameters); switch (ib_spec->type) { case IB_FLOW_SPEC_ETH: if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD)) return -ENOTSUPP; ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, dmac_47_16), ib_spec->eth.mask.dst_mac); ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, dmac_47_16), ib_spec->eth.val.dst_mac); ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, smac_47_16), ib_spec->eth.mask.src_mac); ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, smac_47_16), ib_spec->eth.val.src_mac); if (ib_spec->eth.mask.vlan_tag) { MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, cvlan_tag, 1); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, cvlan_tag, 1); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, first_vid, ntohs(ib_spec->eth.mask.vlan_tag)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, first_vid, ntohs(ib_spec->eth.val.vlan_tag)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, first_cfi, ntohs(ib_spec->eth.mask.vlan_tag) >> 12); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, first_cfi, ntohs(ib_spec->eth.val.vlan_tag) >> 12); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, first_prio, ntohs(ib_spec->eth.mask.vlan_tag) >> 13); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, first_prio, ntohs(ib_spec->eth.val.vlan_tag) >> 13); } MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ethertype, ntohs(ib_spec->eth.mask.ether_type)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ethertype, ntohs(ib_spec->eth.val.ether_type)); break; case IB_FLOW_SPEC_IPV4: if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD)) return -ENOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ethertype, 0xffff); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ethertype, ETH_P_IP); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, src_ipv4_src_ipv6.ipv4_layout.ipv4), &ib_spec->ipv4.mask.src_ip, sizeof(ib_spec->ipv4.mask.src_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, src_ipv4_src_ipv6.ipv4_layout.ipv4), &ib_spec->ipv4.val.src_ip, sizeof(ib_spec->ipv4.val.src_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, dst_ipv4_dst_ipv6.ipv4_layout.ipv4), &ib_spec->ipv4.mask.dst_ip, sizeof(ib_spec->ipv4.mask.dst_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, dst_ipv4_dst_ipv6.ipv4_layout.ipv4), &ib_spec->ipv4.val.dst_ip, sizeof(ib_spec->ipv4.val.dst_ip)); set_tos(outer_headers_c, outer_headers_v, ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos); set_proto(outer_headers_c, outer_headers_v, ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto); break; case IB_FLOW_SPEC_IPV6: if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD)) return -ENOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ethertype, 0xffff); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ethertype, IPPROTO_IPV6); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, src_ipv4_src_ipv6.ipv6_layout.ipv6), &ib_spec->ipv6.mask.src_ip, sizeof(ib_spec->ipv6.mask.src_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, src_ipv4_src_ipv6.ipv6_layout.ipv6), &ib_spec->ipv6.val.src_ip, sizeof(ib_spec->ipv6.val.src_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, dst_ipv4_dst_ipv6.ipv6_layout.ipv6), &ib_spec->ipv6.mask.dst_ip, sizeof(ib_spec->ipv6.mask.dst_ip)); memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, dst_ipv4_dst_ipv6.ipv6_layout.ipv6), &ib_spec->ipv6.val.dst_ip, sizeof(ib_spec->ipv6.val.dst_ip)); set_tos(outer_headers_c, outer_headers_v, ib_spec->ipv6.mask.traffic_class, ib_spec->ipv6.val.traffic_class); set_proto(outer_headers_c, outer_headers_v, ib_spec->ipv6.mask.next_hdr, ib_spec->ipv6.val.next_hdr); MLX5_SET(fte_match_set_misc, misc_params_c, outer_ipv6_flow_label, ntohl(ib_spec->ipv6.mask.flow_label)); MLX5_SET(fte_match_set_misc, misc_params_v, outer_ipv6_flow_label, ntohl(ib_spec->ipv6.val.flow_label)); break; case IB_FLOW_SPEC_TCP: if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, LAST_TCP_UDP_FIELD)) return -ENOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol, 0xff); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol, IPPROTO_TCP); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport, ntohs(ib_spec->tcp_udp.mask.src_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_sport, ntohs(ib_spec->tcp_udp.val.src_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport, ntohs(ib_spec->tcp_udp.mask.dst_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_dport, ntohs(ib_spec->tcp_udp.val.dst_port)); break; case IB_FLOW_SPEC_UDP: if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, LAST_TCP_UDP_FIELD)) return -ENOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol, 0xff); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol, IPPROTO_UDP); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_sport, ntohs(ib_spec->tcp_udp.mask.src_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_sport, ntohs(ib_spec->tcp_udp.val.src_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_dport, ntohs(ib_spec->tcp_udp.mask.dst_port)); MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_dport, ntohs(ib_spec->tcp_udp.val.dst_port)); break; default: return -EINVAL; } return 0; } /* If a flow could catch both multicast and unicast packets, * it won't fall into the multicast flow steering table and this rule * could steal other multicast packets. */ static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr) { struct ib_flow_spec_eth *eth_spec; if (ib_attr->type != IB_FLOW_ATTR_NORMAL || ib_attr->size < sizeof(struct ib_flow_attr) + sizeof(struct ib_flow_spec_eth) || ib_attr->num_of_specs < 1) return false; eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1); if (eth_spec->type != IB_FLOW_SPEC_ETH || eth_spec->size != sizeof(*eth_spec)) return false; return is_multicast_ether_addr(eth_spec->mask.dst_mac) && is_multicast_ether_addr(eth_spec->val.dst_mac); } static bool is_valid_attr(const struct ib_flow_attr *flow_attr) { union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1); bool has_ipv4_spec = false; bool eth_type_ipv4 = true; unsigned int spec_index; /* Validate that ethertype is correct */ for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { if (ib_spec->type == IB_FLOW_SPEC_ETH && ib_spec->eth.mask.ether_type) { if (!((ib_spec->eth.mask.ether_type == htons(0xffff)) && ib_spec->eth.val.ether_type == htons(ETH_P_IP))) eth_type_ipv4 = false; } else if (ib_spec->type == IB_FLOW_SPEC_IPV4) { has_ipv4_spec = true; } ib_spec = (void *)ib_spec + ib_spec->size; } return !has_ipv4_spec || eth_type_ipv4; } static void put_flow_table(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *prio, bool ft_added) { prio->refcount -= !!ft_added; if (!prio->refcount) { mlx5_destroy_flow_table(prio->flow_table); prio->flow_table = NULL; } } static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) { struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device); struct mlx5_ib_flow_handler *handler = container_of(flow_id, struct mlx5_ib_flow_handler, ibflow); struct mlx5_ib_flow_handler *iter, *tmp; mutex_lock(&dev->flow_db.lock); list_for_each_entry_safe(iter, tmp, &handler->list, list) { mlx5_del_flow_rule(iter->rule); put_flow_table(dev, iter->prio, true); list_del(&iter->list); kfree(iter); } mlx5_del_flow_rule(handler->rule); put_flow_table(dev, handler->prio, true); mutex_unlock(&dev->flow_db.lock); kfree(handler); return 0; } static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap) { priority *= 2; if (!dont_trap) priority++; return priority; } enum flow_table_type { MLX5_IB_FT_RX, MLX5_IB_FT_TX }; #define MLX5_FS_MAX_TYPES 10 #define MLX5_FS_MAX_ENTRIES 32000UL static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, struct ib_flow_attr *flow_attr, enum flow_table_type ft_type) { bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP; struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio; struct mlx5_flow_table *ft; int num_entries; int num_groups; int priority; int err = 0; if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { if (flow_is_multicast_only(flow_attr) && !dont_trap) priority = MLX5_IB_FLOW_MCAST_PRIO; else priority = ib_prio_to_core_prio(flow_attr->priority, dont_trap); ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS); num_entries = MLX5_FS_MAX_ENTRIES; num_groups = MLX5_FS_MAX_TYPES; prio = &dev->flow_db.prios[priority]; } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_LEFTOVERS); build_leftovers_ft_param("bypass", &priority, &num_entries, &num_groups); prio = &dev->flow_db.prios[MLX5_IB_FLOW_LEFTOVERS_PRIO]; } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { if (!MLX5_CAP_FLOWTABLE(dev->mdev, allow_sniffer_and_nic_rx_shared_tir)) return ERR_PTR(-ENOTSUPP); ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ? MLX5_FLOW_NAMESPACE_SNIFFER_RX : MLX5_FLOW_NAMESPACE_SNIFFER_TX); prio = &dev->flow_db.sniffer[ft_type]; priority = 0; num_entries = 1; num_groups = 1; } if (!ns) return ERR_PTR(-ENOTSUPP); ft = prio->flow_table; if (!ft) { ft = mlx5_create_auto_grouped_flow_table(ns, priority, "bypass", num_entries, num_groups); if (!IS_ERR(ft)) { prio->refcount = 0; prio->flow_table = ft; } else { err = PTR_ERR(ft); } } return err ? ERR_PTR(err) : prio; } static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, const struct ib_flow_attr *flow_attr, struct mlx5_flow_destination *dst) { struct mlx5_flow_table *ft = ft_prio->flow_table; struct mlx5_ib_flow_handler *handler; struct mlx5_flow_spec *spec; const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr); unsigned int spec_index; u32 action; int err = 0; if (!is_valid_attr(flow_attr)) return ERR_PTR(-EINVAL); spec = mlx5_vzalloc(sizeof(*spec)); handler = kzalloc(sizeof(*handler), GFP_KERNEL); if (!handler || !spec) { err = -ENOMEM; goto free; } INIT_LIST_HEAD(&handler->list); for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { err = parse_flow_attr(spec->match_criteria, spec->match_value, ib_flow); if (err < 0) goto free; ib_flow += ((union ib_flow_spec *)ib_flow)->size; } spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; handler->rule = mlx5_add_flow_rule(ft, spec->match_criteria_enable, spec->match_criteria, spec->match_value, action, MLX5_FS_DEFAULT_FLOW_TAG, dst); if (IS_ERR(handler->rule)) { err = PTR_ERR(handler->rule); goto free; } ft_prio->refcount++; handler->prio = ft_prio; ft_prio->flow_table = ft; free: if (err) kfree(handler); kvfree(spec); return err ? ERR_PTR(err) : handler; } static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, struct ib_flow_attr *flow_attr, struct mlx5_flow_destination *dst) { struct mlx5_ib_flow_handler *handler_dst = NULL; struct mlx5_ib_flow_handler *handler = NULL; handler = create_flow_rule(dev, ft_prio, flow_attr, NULL); if (!IS_ERR(handler)) { handler_dst = create_flow_rule(dev, ft_prio, flow_attr, dst); if (IS_ERR(handler_dst)) { mlx5_del_flow_rule(handler->rule); ft_prio->refcount--; kfree(handler); handler = handler_dst; } else { list_add(&handler_dst->list, &handler->list); } } return handler; } enum { LEFTOVERS_MC, LEFTOVERS_UC, }; static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, struct ib_flow_attr *flow_attr, struct mlx5_flow_destination *dst) { struct mlx5_ib_flow_handler *handler_ucast = NULL; struct mlx5_ib_flow_handler *handler = NULL; static struct { struct ib_flow_attr flow_attr; struct ib_flow_spec_eth eth_flow; } leftovers_specs[] = { [LEFTOVERS_MC] = { .flow_attr = { .num_of_specs = 1, .size = sizeof(leftovers_specs[0]) }, .eth_flow = { .type = IB_FLOW_SPEC_ETH, .size = sizeof(struct ib_flow_spec_eth), .mask = {.dst_mac = {0x1} }, .val = {.dst_mac = {0x1} } } }, [LEFTOVERS_UC] = { .flow_attr = { .num_of_specs = 1, .size = sizeof(leftovers_specs[0]) }, .eth_flow = { .type = IB_FLOW_SPEC_ETH, .size = sizeof(struct ib_flow_spec_eth), .mask = {.dst_mac = {0x1} }, .val = {.dst_mac = {} } } } }; handler = create_flow_rule(dev, ft_prio, &leftovers_specs[LEFTOVERS_MC].flow_attr, dst); if (!IS_ERR(handler) && flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) { handler_ucast = create_flow_rule(dev, ft_prio, &leftovers_specs[LEFTOVERS_UC].flow_attr, dst); if (IS_ERR(handler_ucast)) { mlx5_del_flow_rule(handler->rule); ft_prio->refcount--; kfree(handler); handler = handler_ucast; } else { list_add(&handler_ucast->list, &handler->list); } } return handler; } static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_rx, struct mlx5_ib_flow_prio *ft_tx, struct mlx5_flow_destination *dst) { struct mlx5_ib_flow_handler *handler_rx; struct mlx5_ib_flow_handler *handler_tx; int err; static const struct ib_flow_attr flow_attr = { .num_of_specs = 0, .size = sizeof(flow_attr) }; handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst); if (IS_ERR(handler_rx)) { err = PTR_ERR(handler_rx); goto err; } handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst); if (IS_ERR(handler_tx)) { err = PTR_ERR(handler_tx); goto err_tx; } list_add(&handler_tx->list, &handler_rx->list); return handler_rx; err_tx: mlx5_del_flow_rule(handler_rx->rule); ft_rx->refcount--; kfree(handler_rx); err: return ERR_PTR(err); } static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain) { struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_ib_qp *mqp = to_mqp(qp); struct mlx5_ib_flow_handler *handler = NULL; struct mlx5_flow_destination *dst = NULL; struct mlx5_ib_flow_prio *ft_prio_tx = NULL; struct mlx5_ib_flow_prio *ft_prio; int err; if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) return ERR_PTR(-ENOSPC); if (domain != IB_FLOW_DOMAIN_USER || flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) || (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP)) return ERR_PTR(-EINVAL); dst = kzalloc(sizeof(*dst), GFP_KERNEL); if (!dst) return ERR_PTR(-ENOMEM); mutex_lock(&dev->flow_db.lock); ft_prio = get_flow_table(dev, flow_attr, MLX5_IB_FT_RX); if (IS_ERR(ft_prio)) { err = PTR_ERR(ft_prio); goto unlock; } if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX); if (IS_ERR(ft_prio_tx)) { err = PTR_ERR(ft_prio_tx); ft_prio_tx = NULL; goto destroy_ft; } } dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR; if (mqp->flags & MLX5_IB_QP_RSS) dst->tir_num = mqp->rss_qp.tirn; else dst->tir_num = mqp->raw_packet_qp.rq.tirn; if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) { handler = create_dont_trap_rule(dev, ft_prio, flow_attr, dst); } else { handler = create_flow_rule(dev, ft_prio, flow_attr, dst); } } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { handler = create_leftovers_rule(dev, ft_prio, flow_attr, dst); } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst); } else { err = -EINVAL; goto destroy_ft; } if (IS_ERR(handler)) { err = PTR_ERR(handler); handler = NULL; goto destroy_ft; } mutex_unlock(&dev->flow_db.lock); kfree(dst); return &handler->ibflow; destroy_ft: put_flow_table(dev, ft_prio, false); if (ft_prio_tx) put_flow_table(dev, ft_prio_tx, false); unlock: mutex_unlock(&dev->flow_db.lock); kfree(dst); kfree(handler); return ERR_PTR(err); } static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); int err; err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); if (err) mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); return err; } static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); int err; err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); if (err) mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); return err; } static int init_node_data(struct mlx5_ib_dev *dev) { int err; err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc); if (err) return err; return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid); } static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%lld\n", (long long)dev->mdev->priv.fw_pages); } static ssize_t show_reg_pages(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); } static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); } static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->mdev->pdev->revision); } static ssize_t show_board(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, dev->mdev->board_id); } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); static struct device_attribute *mlx5_class_attributes[] = { &dev_attr_hw_rev, &dev_attr_hca_type, &dev_attr_board_id, &dev_attr_fw_pages, &dev_attr_reg_pages, }; static void pkey_change_handler(struct work_struct *work) { struct mlx5_ib_port_resources *ports = container_of(work, struct mlx5_ib_port_resources, pkey_change_work); mutex_lock(&ports->devr->mutex); mlx5_ib_gsi_pkey_change(ports->gsi); mutex_unlock(&ports->devr->mutex); } static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) { struct mlx5_ib_qp *mqp; struct mlx5_ib_cq *send_mcq, *recv_mcq; struct mlx5_core_cq *mcq; struct list_head cq_armed_list; unsigned long flags_qp; unsigned long flags_cq; unsigned long flags; INIT_LIST_HEAD(&cq_armed_list); /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { spin_lock_irqsave(&mqp->sq.lock, flags_qp); if (mqp->sq.tail != mqp->sq.head) { send_mcq = to_mcq(mqp->ibqp.send_cq); spin_lock_irqsave(&send_mcq->lock, flags_cq); if (send_mcq->mcq.comp && mqp->ibqp.send_cq->comp_handler) { if (!send_mcq->mcq.reset_notify_added) { send_mcq->mcq.reset_notify_added = 1; list_add_tail(&send_mcq->mcq.reset_notify, &cq_armed_list); } } spin_unlock_irqrestore(&send_mcq->lock, flags_cq); } spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); spin_lock_irqsave(&mqp->rq.lock, flags_qp); /* no handling is needed for SRQ */ if (!mqp->ibqp.srq) { if (mqp->rq.tail != mqp->rq.head) { recv_mcq = to_mcq(mqp->ibqp.recv_cq); spin_lock_irqsave(&recv_mcq->lock, flags_cq); if (recv_mcq->mcq.comp && mqp->ibqp.recv_cq->comp_handler) { if (!recv_mcq->mcq.reset_notify_added) { recv_mcq->mcq.reset_notify_added = 1; list_add_tail(&recv_mcq->mcq.reset_notify, &cq_armed_list); } } spin_unlock_irqrestore(&recv_mcq->lock, flags_cq); } } spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); } /*At that point all inflight post send were put to be executed as of we * lock/unlock above locks Now need to arm all involved CQs. */ list_for_each_entry(mcq, &cq_armed_list, reset_notify) { mcq->comp(mcq, NULL); } spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); } static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, enum mlx5_dev_event event, unsigned long param) { struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context; struct ib_event ibev; bool fatal = false; u8 port = (u8)param; switch (event) { case MLX5_DEV_EVENT_SYS_ERROR: ibev.event = IB_EVENT_DEVICE_FATAL; mlx5_ib_handle_internal_error(ibdev); fatal = true; break; case MLX5_DEV_EVENT_PORT_UP: case MLX5_DEV_EVENT_PORT_DOWN: case MLX5_DEV_EVENT_PORT_INITIALIZED: /* In RoCE, port up/down events are handled in * mlx5_netdev_event(). */ if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) == IB_LINK_LAYER_ETHERNET) return; ibev.event = (event == MLX5_DEV_EVENT_PORT_UP) ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; break; case MLX5_DEV_EVENT_LID_CHANGE: ibev.event = IB_EVENT_LID_CHANGE; break; case MLX5_DEV_EVENT_PKEY_CHANGE: ibev.event = IB_EVENT_PKEY_CHANGE; schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work); break; case MLX5_DEV_EVENT_GUID_CHANGE: ibev.event = IB_EVENT_GID_CHANGE; break; case MLX5_DEV_EVENT_CLIENT_REREG: ibev.event = IB_EVENT_CLIENT_REREGISTER; break; default: /* unsupported event */ return; } ibev.device = &ibdev->ib_dev; ibev.element.port_num = port; if (!rdma_is_port_valid(&ibdev->ib_dev, port)) { mlx5_ib_warn(ibdev, "warning: event(%d) on port %d\n", event, port); return; } if (ibdev->ib_active) ib_dispatch_event(&ibev); if (fatal) ibdev->ib_active = false; } static void get_ext_port_caps(struct mlx5_ib_dev *dev) { int port; for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) mlx5_query_ext_port_caps(dev, port); } static int get_port_caps(struct mlx5_ib_dev *dev) { struct ib_device_attr *dprops = NULL; struct ib_port_attr *pprops = NULL; int err = -ENOMEM; int port; struct ib_udata uhw = {.inlen = 0, .outlen = 0}; pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); if (!pprops) goto out; dprops = kmalloc(sizeof(*dprops), GFP_KERNEL); if (!dprops) goto out; err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw); if (err) { mlx5_ib_warn(dev, "query_device failed %d\n", err); goto out; } for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); if (err) { mlx5_ib_warn(dev, "query_port %d failed %d\n", port, err); break; } dev->mdev->port_caps[port - 1].pkey_table_len = dprops->max_pkeys; dev->mdev->port_caps[port - 1].gid_table_len = pprops->gid_tbl_len; mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", dprops->max_pkeys, pprops->gid_tbl_len); } out: kfree(pprops); kfree(dprops); return err; } static void destroy_umrc_res(struct mlx5_ib_dev *dev) { int err; err = mlx5_mr_cache_cleanup(dev); if (err) mlx5_ib_warn(dev, "mr cache cleanup failed\n"); mlx5_ib_destroy_qp(dev->umrc.qp); ib_free_cq(dev->umrc.cq); ib_dealloc_pd(dev->umrc.pd); } enum { MAX_UMR_WR = 128, }; static int create_umr_res(struct mlx5_ib_dev *dev) { struct ib_qp_init_attr *init_attr = NULL; struct ib_qp_attr *attr = NULL; struct ib_pd *pd; struct ib_cq *cq; struct ib_qp *qp; int ret; attr = kzalloc(sizeof(*attr), GFP_KERNEL); init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL); if (!attr || !init_attr) { ret = -ENOMEM; goto error_0; } pd = ib_alloc_pd(&dev->ib_dev, 0); if (IS_ERR(pd)) { mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); ret = PTR_ERR(pd); goto error_0; } cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); if (IS_ERR(cq)) { mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); ret = PTR_ERR(cq); goto error_2; } init_attr->send_cq = cq; init_attr->recv_cq = cq; init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; init_attr->cap.max_send_wr = MAX_UMR_WR; init_attr->cap.max_send_sge = 1; init_attr->qp_type = MLX5_IB_QPT_REG_UMR; init_attr->port_num = 1; qp = mlx5_ib_create_qp(pd, init_attr, NULL); if (IS_ERR(qp)) { mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); ret = PTR_ERR(qp); goto error_3; } qp->device = &dev->ib_dev; qp->real_qp = qp; qp->uobject = NULL; qp->qp_type = MLX5_IB_QPT_REG_UMR; attr->qp_state = IB_QPS_INIT; attr->port_num = 1; ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT, NULL); if (ret) { mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); goto error_4; } memset(attr, 0, sizeof(*attr)); attr->qp_state = IB_QPS_RTR; attr->path_mtu = IB_MTU_256; ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); if (ret) { mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); goto error_4; } memset(attr, 0, sizeof(*attr)); attr->qp_state = IB_QPS_RTS; ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); if (ret) { mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); goto error_4; } dev->umrc.qp = qp; dev->umrc.cq = cq; dev->umrc.pd = pd; sema_init(&dev->umrc.sem, MAX_UMR_WR); ret = mlx5_mr_cache_init(dev); if (ret) { mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); goto error_4; } kfree(attr); kfree(init_attr); return 0; error_4: mlx5_ib_destroy_qp(qp); error_3: ib_free_cq(cq); error_2: ib_dealloc_pd(pd); error_0: kfree(attr); kfree(init_attr); return ret; } static int create_dev_resources(struct mlx5_ib_resources *devr) { struct ib_srq_init_attr attr; struct mlx5_ib_dev *dev; struct ib_cq_init_attr cq_attr = {.cqe = 1}; int port; int ret = 0; dev = container_of(devr, struct mlx5_ib_dev, devr); mutex_init(&devr->mutex); devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->p0)) { ret = PTR_ERR(devr->p0); goto error0; } devr->p0->device = &dev->ib_dev; devr->p0->uobject = NULL; atomic_set(&devr->p0->usecnt, 0); devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL); if (IS_ERR(devr->c0)) { ret = PTR_ERR(devr->c0); goto error1; } devr->c0->device = &dev->ib_dev; devr->c0->uobject = NULL; devr->c0->comp_handler = NULL; devr->c0->event_handler = NULL; devr->c0->cq_context = NULL; atomic_set(&devr->c0->usecnt, 0); devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->x0)) { ret = PTR_ERR(devr->x0); goto error2; } devr->x0->device = &dev->ib_dev; devr->x0->inode = NULL; atomic_set(&devr->x0->usecnt, 0); mutex_init(&devr->x0->tgt_qp_mutex); INIT_LIST_HEAD(&devr->x0->tgt_qp_list); devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->x1)) { ret = PTR_ERR(devr->x1); goto error3; } devr->x1->device = &dev->ib_dev; devr->x1->inode = NULL; atomic_set(&devr->x1->usecnt, 0); mutex_init(&devr->x1->tgt_qp_mutex); INIT_LIST_HEAD(&devr->x1->tgt_qp_list); memset(&attr, 0, sizeof(attr)); attr.attr.max_sge = 1; attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_XRC; attr.ext.xrc.cq = devr->c0; attr.ext.xrc.xrcd = devr->x0; devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); if (IS_ERR(devr->s0)) { ret = PTR_ERR(devr->s0); goto error4; } devr->s0->device = &dev->ib_dev; devr->s0->pd = devr->p0; devr->s0->uobject = NULL; devr->s0->event_handler = NULL; devr->s0->srq_context = NULL; devr->s0->srq_type = IB_SRQT_XRC; devr->s0->ext.xrc.xrcd = devr->x0; devr->s0->ext.xrc.cq = devr->c0; atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); atomic_inc(&devr->s0->ext.xrc.cq->usecnt); atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s0->usecnt, 0); memset(&attr, 0, sizeof(attr)); attr.attr.max_sge = 1; attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_BASIC; devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL); if (IS_ERR(devr->s1)) { ret = PTR_ERR(devr->s1); goto error5; } devr->s1->device = &dev->ib_dev; devr->s1->pd = devr->p0; devr->s1->uobject = NULL; devr->s1->event_handler = NULL; devr->s1->srq_context = NULL; devr->s1->srq_type = IB_SRQT_BASIC; devr->s1->ext.xrc.cq = devr->c0; atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s0->usecnt, 0); for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) { INIT_WORK(&devr->ports[port].pkey_change_work, pkey_change_handler); devr->ports[port].devr = devr; } return 0; error5: mlx5_ib_destroy_srq(devr->s0); error4: mlx5_ib_dealloc_xrcd(devr->x1); error3: mlx5_ib_dealloc_xrcd(devr->x0); error2: mlx5_ib_destroy_cq(devr->c0); error1: mlx5_ib_dealloc_pd(devr->p0); error0: return ret; } static void destroy_dev_resources(struct mlx5_ib_resources *devr) { struct mlx5_ib_dev *dev = container_of(devr, struct mlx5_ib_dev, devr); int port; mlx5_ib_destroy_srq(devr->s1); mlx5_ib_destroy_srq(devr->s0); mlx5_ib_dealloc_xrcd(devr->x0); mlx5_ib_dealloc_xrcd(devr->x1); mlx5_ib_destroy_cq(devr->c0); mlx5_ib_dealloc_pd(devr->p0); /* Make sure no change P_Key work items are still executing */ for (port = 0; port < dev->num_ports; ++port) cancel_work_sync(&devr->ports[port].pkey_change_work); } static u32 get_core_cap_flags(struct ib_device *ibdev) { struct mlx5_ib_dev *dev = to_mdev(ibdev); enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1); u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type); u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version); u32 ret = 0; if (ll == IB_LINK_LAYER_INFINIBAND) return RDMA_CORE_PORT_IBA_IB; if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP)) return 0; if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP)) return 0; if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP) ret |= RDMA_CORE_PORT_IBA_ROCE; if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP) ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; return ret; } static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { struct ib_port_attr attr; int err; err = mlx5_ib_query_port(ibdev, port_num, &attr); if (err) return err; immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; immutable->core_cap_flags = get_core_cap_flags(ibdev); immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; } static void get_dev_fw_str(struct ib_device *ibdev, char *str, size_t str_len) { struct mlx5_ib_dev *dev = container_of(ibdev, struct mlx5_ib_dev, ib_dev); snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); } static int mlx5_roce_lag_init(struct mlx5_ib_dev *dev) { return 0; } static void mlx5_roce_lag_cleanup(struct mlx5_ib_dev *dev) { } static void mlx5_remove_roce_notifier(struct mlx5_ib_dev *dev) { if (dev->roce.nb.notifier_call) { unregister_netdevice_notifier(&dev->roce.nb); dev->roce.nb.notifier_call = NULL; } } static int mlx5_enable_roce(struct mlx5_ib_dev *dev) { VNET_ITERATOR_DECL(vnet_iter); struct ifnet *idev; int err; /* Check if mlx5en net device already exists */ VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { IFNET_RLOCK(); CURVNET_SET_QUIET(vnet_iter); CK_STAILQ_FOREACH(idev, &V_ifnet, if_link) { /* check if network interface belongs to mlx5en */ if (!mlx5_netdev_match(idev, dev->mdev, "mce")) continue; write_lock(&dev->roce.netdev_lock); dev->roce.netdev = idev; write_unlock(&dev->roce.netdev_lock); } CURVNET_RESTORE(); IFNET_RUNLOCK(); } VNET_LIST_RUNLOCK(); dev->roce.nb.notifier_call = mlx5_netdev_event; err = register_netdevice_notifier(&dev->roce.nb); if (err) { dev->roce.nb.notifier_call = NULL; return err; } err = mlx5_nic_vport_enable_roce(dev->mdev); if (err) goto err_unregister_netdevice_notifier; err = mlx5_roce_lag_init(dev); if (err) goto err_disable_roce; return 0; err_disable_roce: mlx5_nic_vport_disable_roce(dev->mdev); err_unregister_netdevice_notifier: mlx5_remove_roce_notifier(dev); return err; } static void mlx5_disable_roce(struct mlx5_ib_dev *dev) { mlx5_roce_lag_cleanup(dev); mlx5_nic_vport_disable_roce(dev->mdev); } static void mlx5_ib_dealloc_q_port_counter(struct mlx5_ib_dev *dev, u8 port_num) { mlx5_vport_dealloc_q_counter(dev->mdev, MLX5_INTERFACE_PROTOCOL_IB, dev->port[port_num].q_cnt_id); dev->port[port_num].q_cnt_id = 0; } static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev) { unsigned int i; for (i = 0; i < dev->num_ports; i++) mlx5_ib_dealloc_q_port_counter(dev, i); } static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) { int i; int ret; for (i = 0; i < dev->num_ports; i++) { ret = mlx5_vport_alloc_q_counter(dev->mdev, MLX5_INTERFACE_PROTOCOL_IB, &dev->port[i].q_cnt_id); if (ret) { mlx5_ib_warn(dev, "couldn't allocate queue counter for port %d, err %d\n", i + 1, ret); goto dealloc_counters; } } return 0; dealloc_counters: while (--i >= 0) mlx5_ib_dealloc_q_port_counter(dev, i); return ret; } static const char * const names[] = { "rx_write_requests", "rx_read_requests", "rx_atomic_requests", "out_of_buffer", "out_of_sequence", "duplicate_request", "rnr_nak_retry_err", "packet_seq_err", "implied_nak_seq_err", "local_ack_timeout_err", }; static const size_t stats_offsets[] = { MLX5_BYTE_OFF(query_q_counter_out, rx_write_requests), MLX5_BYTE_OFF(query_q_counter_out, rx_read_requests), MLX5_BYTE_OFF(query_q_counter_out, rx_atomic_requests), MLX5_BYTE_OFF(query_q_counter_out, out_of_buffer), MLX5_BYTE_OFF(query_q_counter_out, out_of_sequence), MLX5_BYTE_OFF(query_q_counter_out, duplicate_request), MLX5_BYTE_OFF(query_q_counter_out, rnr_nak_retry_err), MLX5_BYTE_OFF(query_q_counter_out, packet_seq_err), MLX5_BYTE_OFF(query_q_counter_out, implied_nak_seq_err), MLX5_BYTE_OFF(query_q_counter_out, local_ack_timeout_err), }; static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, u8 port_num) { BUILD_BUG_ON(ARRAY_SIZE(names) != ARRAY_SIZE(stats_offsets)); /* We support only per port stats */ if (port_num == 0) return NULL; return rdma_alloc_hw_stats_struct(names, ARRAY_SIZE(names), RDMA_HW_STATS_DEFAULT_LIFESPAN); } static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u8 port, int index) { struct mlx5_ib_dev *dev = to_mdev(ibdev); int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); void *out; __be32 val; int ret; int i; if (!port || !stats) return -ENOSYS; out = mlx5_vzalloc(outlen); if (!out) return -ENOMEM; ret = mlx5_vport_query_q_counter(dev->mdev, dev->port[port - 1].q_cnt_id, 0, out, outlen); if (ret) goto free; for (i = 0; i < ARRAY_SIZE(names); i++) { val = *(__be32 *)(out + stats_offsets[i]); stats->value[i] = (u64)be32_to_cpu(val); } free: kvfree(out); return ARRAY_SIZE(names); } static int mlx5_ib_stage_bfreg_init(struct mlx5_ib_dev *dev) { int err; err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false); if (err) return err; err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true); if (err) { mlx5_free_bfreg(dev->mdev, &dev->bfreg); return err; } err = mlx5_alloc_bfreg(dev->mdev, &dev->wc_bfreg, true, false); if (err) { mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); mlx5_free_bfreg(dev->mdev, &dev->bfreg); } return err; } static void mlx5_ib_stage_bfreg_cleanup(struct mlx5_ib_dev *dev) { mlx5_free_bfreg(dev->mdev, &dev->wc_bfreg); mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); mlx5_free_bfreg(dev->mdev, &dev->bfreg); } static void *mlx5_ib_add(struct mlx5_core_dev *mdev) { struct mlx5_ib_dev *dev; enum rdma_link_layer ll; int port_type_cap; int err; int i; port_type_cap = MLX5_CAP_GEN(mdev, port_type); ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); if ((ll == IB_LINK_LAYER_ETHERNET) && !MLX5_CAP_GEN(mdev, roce)) return NULL; dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); if (!dev) return NULL; dev->mdev = mdev; dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port), GFP_KERNEL); if (!dev->port) goto err_dealloc; rwlock_init(&dev->roce.netdev_lock); err = get_port_caps(dev); if (err) goto err_free_port; if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock); snprintf(dev->ib_dev.name, IB_DEVICE_NAME_MAX, "mlx5_%d", device_get_unit(mdev->pdev->dev.bsddev)); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; dev->num_ports = MLX5_CAP_GEN(mdev, num_ports); dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.num_comp_vectors = dev->mdev->priv.eq_table.num_comp_vectors; dev->ib_dev.dma_device = &mdev->pdev->dev; dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_CREATE_AH) | (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_REREG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | (1ull << IB_USER_VERBS_CMD_OPEN_QP); dev->ib_dev.uverbs_ex_cmd_mask = (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP); dev->ib_dev.query_device = mlx5_ib_query_device; dev->ib_dev.query_port = mlx5_ib_query_port; dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer; if (ll == IB_LINK_LAYER_ETHERNET) dev->ib_dev.get_netdev = mlx5_ib_get_netdev; dev->ib_dev.query_gid = mlx5_ib_query_gid; dev->ib_dev.add_gid = mlx5_ib_add_gid; dev->ib_dev.del_gid = mlx5_ib_del_gid; dev->ib_dev.query_pkey = mlx5_ib_query_pkey; dev->ib_dev.modify_device = mlx5_ib_modify_device; dev->ib_dev.modify_port = mlx5_ib_modify_port; dev->ib_dev.alloc_ucontext = mlx5_ib_alloc_ucontext; dev->ib_dev.dealloc_ucontext = mlx5_ib_dealloc_ucontext; dev->ib_dev.mmap = mlx5_ib_mmap; dev->ib_dev.alloc_pd = mlx5_ib_alloc_pd; dev->ib_dev.dealloc_pd = mlx5_ib_dealloc_pd; dev->ib_dev.create_ah = mlx5_ib_create_ah; dev->ib_dev.query_ah = mlx5_ib_query_ah; dev->ib_dev.destroy_ah = mlx5_ib_destroy_ah; dev->ib_dev.create_srq = mlx5_ib_create_srq; dev->ib_dev.modify_srq = mlx5_ib_modify_srq; dev->ib_dev.query_srq = mlx5_ib_query_srq; dev->ib_dev.destroy_srq = mlx5_ib_destroy_srq; dev->ib_dev.post_srq_recv = mlx5_ib_post_srq_recv; dev->ib_dev.create_qp = mlx5_ib_create_qp; dev->ib_dev.modify_qp = mlx5_ib_modify_qp; dev->ib_dev.query_qp = mlx5_ib_query_qp; dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp; dev->ib_dev.post_send = mlx5_ib_post_send; dev->ib_dev.post_recv = mlx5_ib_post_recv; dev->ib_dev.create_cq = mlx5_ib_create_cq; dev->ib_dev.modify_cq = mlx5_ib_modify_cq; dev->ib_dev.resize_cq = mlx5_ib_resize_cq; dev->ib_dev.destroy_cq = mlx5_ib_destroy_cq; dev->ib_dev.poll_cq = mlx5_ib_poll_cq; dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; dev->ib_dev.rereg_user_mr = mlx5_ib_rereg_user_mr; dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; dev->ib_dev.process_mad = mlx5_ib_process_mad; dev->ib_dev.alloc_mr = mlx5_ib_alloc_mr; dev->ib_dev.map_mr_sg = mlx5_ib_map_mr_sg; dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; dev->ib_dev.get_port_immutable = mlx5_port_immutable; dev->ib_dev.get_dev_fw_str = get_dev_fw_str; if (mlx5_core_is_pf(mdev)) { dev->ib_dev.get_vf_config = mlx5_ib_get_vf_config; dev->ib_dev.set_vf_link_state = mlx5_ib_set_vf_link_state; dev->ib_dev.get_vf_stats = mlx5_ib_get_vf_stats; dev->ib_dev.set_vf_guid = mlx5_ib_set_vf_guid; } dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext; mlx5_ib_internal_fill_odp_caps(dev); if (MLX5_CAP_GEN(mdev, imaicl)) { dev->ib_dev.alloc_mw = mlx5_ib_alloc_mw; dev->ib_dev.dealloc_mw = mlx5_ib_dealloc_mw; dev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); } if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { dev->ib_dev.get_hw_stats = mlx5_ib_get_hw_stats; dev->ib_dev.alloc_hw_stats = mlx5_ib_alloc_hw_stats; } if (MLX5_CAP_GEN(mdev, xrc)) { dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; dev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) { dev->ib_dev.create_flow = mlx5_ib_create_flow; dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow; dev->ib_dev.create_wq = mlx5_ib_create_wq; dev->ib_dev.modify_wq = mlx5_ib_modify_wq; dev->ib_dev.destroy_wq = mlx5_ib_destroy_wq; dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table; dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table; dev->ib_dev.uverbs_ex_cmd_mask |= (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); } err = init_node_data(dev); if (err) goto err_free_port; mutex_init(&dev->flow_db.lock); mutex_init(&dev->cap_mask_mutex); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); if (ll == IB_LINK_LAYER_ETHERNET) { err = mlx5_enable_roce(dev); if (err) goto err_free_port; } err = create_dev_resources(&dev->devr); if (err) goto err_disable_roce; err = mlx5_ib_odp_init_one(dev); if (err) goto err_rsrc; err = mlx5_ib_alloc_q_counters(dev); if (err) goto err_odp; err = mlx5_ib_stage_bfreg_init(dev); if (err) goto err_q_cnt; err = ib_register_device(&dev->ib_dev, NULL); if (err) goto err_bfreg; err = create_umr_res(dev); if (err) goto err_dev; for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { err = device_create_file(&dev->ib_dev.dev, mlx5_class_attributes[i]); if (err) goto err_umrc; } err = mlx5_ib_init_congestion(dev); if (err) goto err_umrc; dev->ib_active = true; return dev; err_umrc: destroy_umrc_res(dev); err_dev: ib_unregister_device(&dev->ib_dev); err_bfreg: mlx5_ib_stage_bfreg_cleanup(dev); err_q_cnt: mlx5_ib_dealloc_q_counters(dev); err_odp: mlx5_ib_odp_remove_one(dev); err_rsrc: destroy_dev_resources(&dev->devr); err_disable_roce: if (ll == IB_LINK_LAYER_ETHERNET) { mlx5_disable_roce(dev); mlx5_remove_roce_notifier(dev); } err_free_port: kfree(dev->port); err_dealloc: ib_dealloc_device((struct ib_device *)dev); return NULL; } static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) { struct mlx5_ib_dev *dev = context; enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1); mlx5_ib_cleanup_congestion(dev); mlx5_remove_roce_notifier(dev); ib_unregister_device(&dev->ib_dev); mlx5_ib_stage_bfreg_cleanup(dev); mlx5_ib_dealloc_q_counters(dev); destroy_umrc_res(dev); mlx5_ib_odp_remove_one(dev); destroy_dev_resources(&dev->devr); if (ll == IB_LINK_LAYER_ETHERNET) mlx5_disable_roce(dev); kfree(dev->port); ib_dealloc_device(&dev->ib_dev); } static struct mlx5_interface mlx5_ib_interface = { .add = mlx5_ib_add, .remove = mlx5_ib_remove, .event = mlx5_ib_event, .protocol = MLX5_INTERFACE_PROTOCOL_IB, }; static int __init mlx5_ib_init(void) { int err; err = mlx5_ib_odp_init(); if (err) return err; err = mlx5_register_interface(&mlx5_ib_interface); if (err) goto clean_odp; return err; clean_odp: mlx5_ib_odp_cleanup(); return err; } static void __exit mlx5_ib_cleanup(void) { mlx5_unregister_interface(&mlx5_ib_interface); mlx5_ib_odp_cleanup(); } static void mlx5_ib_show_version(void __unused *arg) { printf("%s", mlx5_version); } SYSINIT(mlx5_ib_show_version, SI_SUB_DRIVERS, SI_ORDER_ANY, mlx5_ib_show_version, NULL); module_init_order(mlx5_ib_init, SI_ORDER_SEVENTH); module_exit_order(mlx5_ib_cleanup, SI_ORDER_SEVENTH); diff --git a/sys/ofed/drivers/infiniband/core/core_priv.h b/sys/ofed/drivers/infiniband/core/core_priv.h index 7e3401ddf90a..62772fc2a6c5 100644 --- a/sys/ofed/drivers/infiniband/core/core_priv.h +++ b/sys/ofed/drivers/infiniband/core/core_priv.h @@ -1,140 +1,140 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004 Topspin Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * $FreeBSD$ */ #ifndef _CORE_PRIV_H #define _CORE_PRIV_H #include #include #include #include #ifdef CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS int cma_configfs_init(void); void cma_configfs_exit(void); #else static inline int cma_configfs_init(void) { return 0; } static inline void cma_configfs_exit(void) { } #endif struct cma_device; void cma_ref_dev(struct cma_device *cma_dev); void cma_deref_dev(struct cma_device *cma_dev); typedef bool (*cma_device_filter)(struct ib_device *, void *); struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, void *cookie); int cma_get_default_gid_type(struct cma_device *cma_dev, unsigned int port); int cma_set_default_gid_type(struct cma_device *cma_dev, unsigned int port, enum ib_gid_type default_gid_type); struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev); int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)); void ib_device_unregister_sysfs(struct ib_device *device); void ib_cache_setup(void); void ib_cache_cleanup(void); typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, - struct net_device *idev, void *cookie); + struct ifnet *idev, void *cookie); typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port, - struct net_device *idev, void *cookie); + struct ifnet *idev, void *cookie); void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie); void ib_enum_all_roce_netdevs(roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie); enum ib_cache_gid_default_mode { IB_CACHE_GID_DEFAULT_MODE_SET, IB_CACHE_GID_DEFAULT_MODE_DELETE }; int ib_cache_gid_parse_type_str(const char *buf); const char *ib_cache_gid_type_str(enum ib_gid_type gid_type); void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, - struct net_device *ndev, + struct ifnet *ndev, unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode); int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, union ib_gid *gid, struct ib_gid_attr *attr); int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, union ib_gid *gid, struct ib_gid_attr *attr); int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, - struct net_device *ndev); -void ib_cache_gid_del_all_by_netdev(struct net_device *ndev); + struct ifnet *ndev); +void ib_cache_gid_del_all_by_netdev(struct ifnet *ndev); int roce_gid_mgmt_init(void); void roce_gid_mgmt_cleanup(void); int roce_rescan_device(struct ib_device *ib_dev); unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port); int ib_cache_setup_one(struct ib_device *device); void ib_cache_cleanup_one(struct ib_device *device); void ib_cache_release_one(struct ib_device *device); int addr_init(void); void addr_cleanup(void); int ib_mad_init(void); void ib_mad_cleanup(void); int ib_sa_init(void); void ib_sa_cleanup(void); #endif /* _CORE_PRIV_H */ diff --git a/sys/ofed/drivers/infiniband/core/ib_addr.c b/sys/ofed/drivers/infiniband/core/ib_addr.c index ef5e264577e0..a8e951721b8d 100644 --- a/sys/ofed/drivers/infiniband/core/ib_addr.c +++ b/sys/ofed/drivers/infiniband/core/ib_addr.c @@ -1,907 +1,907 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "core_priv.h" struct addr_req { struct list_head list; struct sockaddr_storage src_addr; struct sockaddr_storage dst_addr; struct rdma_dev_addr *addr; struct rdma_addr_client *client; void *context; void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context); int timeout; int status; }; static void process_req(struct work_struct *work); static DEFINE_MUTEX(lock); static LIST_HEAD(req_list); static DECLARE_DELAYED_WORK(work, process_req); static struct workqueue_struct *addr_wq; int rdma_addr_size(struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return sizeof(struct sockaddr_in); case AF_INET6: return sizeof(struct sockaddr_in6); case AF_IB: return sizeof(struct sockaddr_ib); default: return 0; } } EXPORT_SYMBOL(rdma_addr_size); int rdma_addr_size_in6(struct sockaddr_in6 *addr) { int ret = rdma_addr_size((struct sockaddr *) addr); return ret <= sizeof(*addr) ? ret : 0; } EXPORT_SYMBOL(rdma_addr_size_in6); int rdma_addr_size_kss(struct sockaddr_storage *addr) { int ret = rdma_addr_size((struct sockaddr *) addr); return ret <= sizeof(*addr) ? ret : 0; } EXPORT_SYMBOL(rdma_addr_size_kss); static struct rdma_addr_client self; void rdma_addr_register_client(struct rdma_addr_client *client) { atomic_set(&client->refcount, 1); init_completion(&client->comp); } EXPORT_SYMBOL(rdma_addr_register_client); static inline void put_client(struct rdma_addr_client *client) { if (atomic_dec_and_test(&client->refcount)) complete(&client->comp); } void rdma_addr_unregister_client(struct rdma_addr_client *client) { put_client(client); wait_for_completion(&client->comp); } EXPORT_SYMBOL(rdma_addr_unregister_client); static inline void rdma_copy_addr_sub(u8 *dst, const u8 *src, unsigned min, unsigned max) { if (min > max) min = max; memcpy(dst, src, min); memset(dst + min, 0, max - min); } -int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, +int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev, const unsigned char *dst_dev_addr) { /* check for loopback device */ if (dev->if_flags & IFF_LOOPBACK) { dev_addr->dev_type = ARPHRD_ETHER; memset(dev_addr->src_dev_addr, 0, MAX_ADDR_LEN); memset(dev_addr->broadcast, 0, MAX_ADDR_LEN); memset(dev_addr->dst_dev_addr, 0, MAX_ADDR_LEN); dev_addr->bound_dev_if = dev->if_index; return (0); } else if (dev->if_type == IFT_INFINIBAND) dev_addr->dev_type = ARPHRD_INFINIBAND; else if (dev->if_type == IFT_ETHER) dev_addr->dev_type = ARPHRD_ETHER; else dev_addr->dev_type = 0; rdma_copy_addr_sub(dev_addr->src_dev_addr, IF_LLADDR(dev), dev->if_addrlen, MAX_ADDR_LEN); rdma_copy_addr_sub(dev_addr->broadcast, dev->if_broadcastaddr, dev->if_addrlen, MAX_ADDR_LEN); if (dst_dev_addr != NULL) { rdma_copy_addr_sub(dev_addr->dst_dev_addr, dst_dev_addr, dev->if_addrlen, MAX_ADDR_LEN); } dev_addr->bound_dev_if = dev->if_index; return 0; } EXPORT_SYMBOL(rdma_copy_addr); int rdma_translate_ip(const struct sockaddr *addr, struct rdma_dev_addr *dev_addr) { - struct net_device *dev; + struct ifnet *dev; int ret; if (dev_addr->bound_dev_if) { dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); } else switch (addr->sa_family) { #ifdef INET case AF_INET: dev = ip_ifp_find(dev_addr->net, ((const struct sockaddr_in *)addr)->sin_addr.s_addr); break; #endif #ifdef INET6 case AF_INET6: dev = ip6_ifp_find(dev_addr->net, ((const struct sockaddr_in6 *)addr)->sin6_addr, 0); break; #endif default: dev = NULL; break; } if (dev != NULL) { /* disallow connections through 127.0.0.1 itself */ if (dev->if_flags & IFF_LOOPBACK) ret = -EINVAL; else ret = rdma_copy_addr(dev_addr, dev, NULL); dev_put(dev); } else { ret = -ENODEV; } return ret; } EXPORT_SYMBOL(rdma_translate_ip); static void set_timeout(int time) { int delay; /* under FreeBSD ticks are 32-bit */ delay = time - jiffies; if (delay <= 0) delay = 1; else if (delay > hz) delay = hz; mod_delayed_work(addr_wq, &work, delay); } static void queue_req(struct addr_req *req) { struct addr_req *temp_req; mutex_lock(&lock); list_for_each_entry_reverse(temp_req, &req_list, list) { if (time_after_eq(req->timeout, temp_req->timeout)) break; } list_add(&req->list, &temp_req->list); if (req_list.next == &req->list) set_timeout(req->timeout); mutex_unlock(&lock); } #if defined(INET) || defined(INET6) static int addr_resolve_multi(u8 *edst, struct ifnet *ifp, struct sockaddr *dst_in) { struct sockaddr *llsa; struct sockaddr_dl sdl; int error; sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; if (ifp->if_resolvemulti == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_resolvemulti(ifp, &llsa, dst_in); if (error == 0) { rdma_copy_addr_sub(edst, LLADDR((struct sockaddr_dl *)llsa), ifp->if_addrlen, MAX_ADDR_LEN); } } return (error); } #endif #ifdef INET static int addr4_resolve(struct sockaddr_in *src_in, const struct sockaddr_in *dst_in, struct rdma_dev_addr *addr, u8 *edst, struct ifnet **ifpp) { enum { ADDR_VALID = 0, ADDR_SRC_ANY = 1, ADDR_DST_ANY = 2, }; struct sockaddr_in dst_tmp = *dst_in; in_port_t src_port; struct sockaddr *saddr = NULL; struct nhop_object *nh; struct ifnet *ifp; int error; int type; NET_EPOCH_ASSERT(); /* set VNET, if any */ CURVNET_SET(addr->net); /* set default TTL limit */ addr->hoplimit = V_ip_defttl; type = ADDR_VALID; if (src_in->sin_addr.s_addr == INADDR_ANY) type |= ADDR_SRC_ANY; if (dst_tmp.sin_addr.s_addr == INADDR_ANY) type |= ADDR_DST_ANY; /* * Make sure the socket address length field is set. */ dst_tmp.sin_len = sizeof(dst_tmp); /* Step 1 - lookup destination route if any */ switch (type) { case ADDR_VALID: case ADDR_SRC_ANY: /* regular destination route lookup */ nh = fib4_lookup(RT_DEFAULT_FIB, dst_tmp.sin_addr,0,NHR_NONE,0); if (nh == NULL) { error = EHOSTUNREACH; goto done; } break; default: error = ENETUNREACH; goto done; } /* Step 2 - find outgoing network interface */ switch (type) { case ADDR_VALID: /* get source interface */ if (addr->bound_dev_if != 0) { ifp = dev_get_by_index(addr->net, addr->bound_dev_if); } else { ifp = ip_ifp_find(addr->net, src_in->sin_addr.s_addr); } /* check source interface */ if (ifp == NULL) { error = ENETUNREACH; goto done; } else if (ifp->if_flags & IFF_LOOPBACK) { /* * Source address cannot be a loopback device. */ error = EHOSTUNREACH; goto error_put_ifp; } else if (nh->nh_ifp->if_flags & IFF_LOOPBACK) { if (memcmp(&src_in->sin_addr, &dst_in->sin_addr, sizeof(src_in->sin_addr))) { /* * Destination is loopback, but source * and destination address is not the * same. */ error = EHOSTUNREACH; goto error_put_ifp; } /* get destination network interface from route */ dev_put(ifp); ifp = nh->nh_ifp; dev_hold(ifp); } else if (ifp != nh->nh_ifp) { /* * Source and destination interfaces are * different. */ error = ENETUNREACH; goto error_put_ifp; } break; case ADDR_SRC_ANY: /* check for loopback device */ if (nh->nh_ifp->if_flags & IFF_LOOPBACK) saddr = (struct sockaddr *)&dst_tmp; else saddr = nh->nh_ifa->ifa_addr; /* get destination network interface from route */ ifp = nh->nh_ifp; dev_hold(ifp); break; default: break; } /* * Step 3 - resolve destination MAC address */ if (dst_tmp.sin_addr.s_addr == INADDR_BROADCAST) { rdma_copy_addr_sub(edst, ifp->if_broadcastaddr, ifp->if_addrlen, MAX_ADDR_LEN); error = 0; } else if (IN_MULTICAST(ntohl(dst_tmp.sin_addr.s_addr))) { bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0; error = addr_resolve_multi(edst, ifp, (struct sockaddr *)&dst_tmp); if (error != 0) goto error_put_ifp; else if (is_gw) addr->network = RDMA_NETWORK_IPV4; } else if (ifp->if_flags & IFF_LOOPBACK) { memset(edst, 0, MAX_ADDR_LEN); error = 0; } else { bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0; memset(edst, 0, MAX_ADDR_LEN); error = arpresolve(ifp, is_gw, NULL, is_gw ? &nh->gw_sa : (const struct sockaddr *)&dst_tmp, edst, NULL, NULL); if (error != 0) goto error_put_ifp; else if (is_gw) addr->network = RDMA_NETWORK_IPV4; } /* * Step 4 - update source address, if any */ if (saddr != NULL) { src_port = src_in->sin_port; memcpy(src_in, saddr, rdma_addr_size(saddr)); src_in->sin_port = src_port; /* preserve port number */ } *ifpp = ifp; goto done; error_put_ifp: dev_put(ifp); done: CURVNET_RESTORE(); if (error == EWOULDBLOCK || error == EAGAIN) error = ENODATA; return (-error); } #else static int addr4_resolve(struct sockaddr_in *src_in, const struct sockaddr_in *dst_in, struct rdma_dev_addr *addr, u8 *edst, struct ifnet **ifpp) { return -EADDRNOTAVAIL; } #endif #ifdef INET6 static int addr6_resolve(struct sockaddr_in6 *src_in, const struct sockaddr_in6 *dst_in, struct rdma_dev_addr *addr, u8 *edst, struct ifnet **ifpp) { enum { ADDR_VALID = 0, ADDR_SRC_ANY = 1, ADDR_DST_ANY = 2, }; struct sockaddr_in6 dst_tmp = *dst_in; in_port_t src_port; struct sockaddr *saddr = NULL; struct nhop_object *nh; struct ifnet *ifp; int error; int type; NET_EPOCH_ASSERT(); /* set VNET, if any */ CURVNET_SET(addr->net); /* set default TTL limit */ addr->hoplimit = V_ip_defttl; type = ADDR_VALID; if (ipv6_addr_any(&src_in->sin6_addr)) type |= ADDR_SRC_ANY; if (ipv6_addr_any(&dst_tmp.sin6_addr)) type |= ADDR_DST_ANY; /* * Make sure the socket address length field is set. */ dst_tmp.sin6_len = sizeof(dst_tmp); /* * Make sure the scope ID gets embedded, else nd6_resolve() will * not find the record. */ dst_tmp.sin6_scope_id = addr->bound_dev_if; sa6_embedscope(&dst_tmp, 0); /* Step 1 - lookup destination route if any */ switch (type) { case ADDR_VALID: /* sanity check for IPv4 addresses */ if (ipv6_addr_v4mapped(&src_in->sin6_addr) != ipv6_addr_v4mapped(&dst_tmp.sin6_addr)) { error = EAFNOSUPPORT; goto done; } /* FALLTHROUGH */ case ADDR_SRC_ANY: /* regular destination route lookup */ nh = fib6_lookup(RT_DEFAULT_FIB, &dst_in->sin6_addr, addr->bound_dev_if, NHR_NONE, 0); if (nh == NULL) { error = EHOSTUNREACH; goto done; } break; default: error = ENETUNREACH; goto done; } /* Step 2 - find outgoing network interface */ switch (type) { case ADDR_VALID: /* get source interface */ if (addr->bound_dev_if != 0) { ifp = dev_get_by_index(addr->net, addr->bound_dev_if); } else { ifp = ip6_ifp_find(addr->net, src_in->sin6_addr, 0); } /* check source interface */ if (ifp == NULL) { error = ENETUNREACH; goto done; } else if (ifp->if_flags & IFF_LOOPBACK) { /* * Source address cannot be a loopback device. */ error = EHOSTUNREACH; goto error_put_ifp; } else if (nh->nh_ifp->if_flags & IFF_LOOPBACK) { if (memcmp(&src_in->sin6_addr, &dst_in->sin6_addr, sizeof(src_in->sin6_addr))) { /* * Destination is loopback, but source * and destination address is not the * same. */ error = EHOSTUNREACH; goto error_put_ifp; } /* get destination network interface from route */ dev_put(ifp); ifp = nh->nh_ifp; dev_hold(ifp); } else if (ifp != nh->nh_ifp) { /* * Source and destination interfaces are * different. */ error = ENETUNREACH; goto error_put_ifp; } break; case ADDR_SRC_ANY: /* check for loopback device */ if (nh->nh_ifp->if_flags & IFF_LOOPBACK) saddr = (struct sockaddr *)&dst_tmp; else saddr = nh->nh_ifa->ifa_addr; /* get destination network interface from route */ ifp = nh->nh_ifp; dev_hold(ifp); break; default: break; } /* * Step 3 - resolve destination MAC address */ if (IN6_IS_ADDR_MULTICAST(&dst_tmp.sin6_addr)) { bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0; error = addr_resolve_multi(edst, ifp, (struct sockaddr *)&dst_tmp); if (error != 0) goto error_put_ifp; else if (is_gw) addr->network = RDMA_NETWORK_IPV6; } else if (nh->nh_ifp->if_flags & IFF_LOOPBACK) { memset(edst, 0, MAX_ADDR_LEN); error = 0; } else { bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0; memset(edst, 0, MAX_ADDR_LEN); error = nd6_resolve(ifp, is_gw, NULL, is_gw ? &nh->gw_sa : (const struct sockaddr *)&dst_tmp, edst, NULL, NULL); if (error != 0) goto error_put_ifp; else if (is_gw) addr->network = RDMA_NETWORK_IPV6; } /* * Step 4 - update source address, if any */ if (saddr != NULL) { src_port = src_in->sin6_port; memcpy(src_in, saddr, rdma_addr_size(saddr)); src_in->sin6_port = src_port; /* preserve port number */ } *ifpp = ifp; goto done; error_put_ifp: dev_put(ifp); done: CURVNET_RESTORE(); if (error == EWOULDBLOCK || error == EAGAIN) error = ENODATA; return (-error); } #else static int addr6_resolve(struct sockaddr_in6 *src_in, const struct sockaddr_in6 *dst_in, struct rdma_dev_addr *addr, u8 *edst, struct ifnet **ifpp) { return -EADDRNOTAVAIL; } #endif static int addr_resolve_neigh(struct ifnet *dev, const struct sockaddr *dst_in, u8 *edst, struct rdma_dev_addr *addr) { if (dev->if_flags & IFF_LOOPBACK) { int ret; /* * Binding to a loopback device is not allowed. Make * sure the destination device address is global by * clearing the bound device interface: */ if (addr->bound_dev_if == dev->if_index) addr->bound_dev_if = 0; ret = rdma_translate_ip(dst_in, addr); if (ret == 0) { memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); } return ret; } /* If the device doesn't do ARP internally */ if (!(dev->if_flags & IFF_NOARP)) return rdma_copy_addr(addr, dev, edst); return rdma_copy_addr(addr, dev, NULL); } static int addr_resolve(struct sockaddr *src_in, const struct sockaddr *dst_in, struct rdma_dev_addr *addr) { struct epoch_tracker et; - struct net_device *ndev = NULL; + struct ifnet *ndev = NULL; u8 edst[MAX_ADDR_LEN]; int ret; if (dst_in->sa_family != src_in->sa_family) return -EINVAL; NET_EPOCH_ENTER(et); switch (src_in->sa_family) { case AF_INET: ret = addr4_resolve((struct sockaddr_in *)src_in, (const struct sockaddr_in *)dst_in, addr, edst, &ndev); break; case AF_INET6: ret = addr6_resolve((struct sockaddr_in6 *)src_in, (const struct sockaddr_in6 *)dst_in, addr, edst, &ndev); break; default: ret = -EADDRNOTAVAIL; break; } NET_EPOCH_EXIT(et); /* check for error */ if (ret != 0) return ret; /* store MAC addresses and check for loopback */ ret = addr_resolve_neigh(ndev, dst_in, edst, addr); /* set belonging VNET, if any */ addr->net = dev_net(ndev); dev_put(ndev); return ret; } static void process_req(struct work_struct *work) { struct addr_req *req, *temp_req; struct sockaddr *src_in, *dst_in; struct list_head done_list; INIT_LIST_HEAD(&done_list); mutex_lock(&lock); list_for_each_entry_safe(req, temp_req, &req_list, list) { if (req->status == -ENODATA) { src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; req->status = addr_resolve(src_in, dst_in, req->addr); if (req->status && time_after_eq(jiffies, req->timeout)) req->status = -ETIMEDOUT; else if (req->status == -ENODATA) continue; } list_move_tail(&req->list, &done_list); } if (!list_empty(&req_list)) { req = list_entry(req_list.next, struct addr_req, list); set_timeout(req->timeout); } mutex_unlock(&lock); list_for_each_entry_safe(req, temp_req, &done_list, list) { list_del(&req->list); req->callback(req->status, (struct sockaddr *) &req->src_addr, req->addr, req->context); put_client(req->client); kfree(req); } } int rdma_resolve_ip(struct rdma_addr_client *client, struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), void *context) { struct sockaddr *src_in, *dst_in; struct addr_req *req; int ret = 0; req = kzalloc(sizeof *req, GFP_KERNEL); if (!req) return -ENOMEM; src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; if (src_addr) { if (src_addr->sa_family != dst_addr->sa_family) { ret = -EINVAL; goto err; } memcpy(src_in, src_addr, rdma_addr_size(src_addr)); } else { src_in->sa_family = dst_addr->sa_family; } memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr)); req->addr = addr; req->callback = callback; req->context = context; req->client = client; atomic_inc(&client->refcount); req->status = addr_resolve(src_in, dst_in, addr); switch (req->status) { case 0: req->timeout = jiffies; queue_req(req); break; case -ENODATA: req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; queue_req(req); break; default: ret = req->status; atomic_dec(&client->refcount); goto err; } return ret; err: kfree(req); return ret; } EXPORT_SYMBOL(rdma_resolve_ip); int rdma_resolve_ip_route(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr) { struct sockaddr_storage ssrc_addr = {}; struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr; if (src_addr) { if (src_addr->sa_family != dst_addr->sa_family) return -EINVAL; memcpy(src_in, src_addr, rdma_addr_size(src_addr)); } else { src_in->sa_family = dst_addr->sa_family; } return addr_resolve(src_in, dst_addr, addr); } EXPORT_SYMBOL(rdma_resolve_ip_route); void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; mutex_lock(&lock); list_for_each_entry_safe(req, temp_req, &req_list, list) { if (req->addr == addr) { req->status = -ECANCELED; req->timeout = jiffies; list_move(&req->list, &req_list); set_timeout(req->timeout); break; } } mutex_unlock(&lock); } EXPORT_SYMBOL(rdma_addr_cancel); struct resolve_cb_context { struct rdma_dev_addr *addr; struct completion comp; int status; }; static void resolve_cb(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context) { if (!status) memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct rdma_dev_addr)); ((struct resolve_cb_context *)context)->status = status; complete(&((struct resolve_cb_context *)context)->comp); } int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, struct net_device *dev, + u8 *dmac, struct ifnet *dev, int *hoplimit) { int ret = 0; struct rdma_dev_addr dev_addr; struct resolve_cb_context ctx; union rdma_sockaddr sgid_addr, dgid_addr; rdma_gid2ip(&sgid_addr._sockaddr, sgid); rdma_gid2ip(&dgid_addr._sockaddr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); dev_addr.bound_dev_if = dev->if_index; dev_addr.net = dev_net(dev); ctx.addr = &dev_addr; init_completion(&ctx.comp); ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, &dev_addr, 1000, resolve_cb, &ctx); if (ret) return ret; wait_for_completion(&ctx.comp); ret = ctx.status; if (ret) return ret; memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); if (hoplimit) *hoplimit = dev_addr.hoplimit; return ret; } EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh); int addr_init(void) { addr_wq = alloc_workqueue("ib_addr", WQ_MEM_RECLAIM, 0); if (!addr_wq) return -ENOMEM; rdma_addr_register_client(&self); return 0; } void addr_cleanup(void) { rdma_addr_unregister_client(&self); destroy_workqueue(addr_wq); } diff --git a/sys/ofed/drivers/infiniband/core/ib_cache.c b/sys/ofed/drivers/infiniband/core/ib_cache.c index a5bc498303c6..3680b97ad2bf 100644 --- a/sys/ofed/drivers/infiniband/core/ib_cache.c +++ b/sys/ofed/drivers/infiniband/core/ib_cache.c @@ -1,1258 +1,1258 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "core_priv.h" struct ib_pkey_cache { int table_len; u16 table[0]; }; struct ib_update_work { struct work_struct work; struct ib_device *device; u8 port_num; }; union ib_gid zgid; EXPORT_SYMBOL(zgid); static const struct ib_gid_attr zattr; enum gid_attr_find_mask { GID_ATTR_FIND_MASK_GID = 1UL << 0, GID_ATTR_FIND_MASK_NETDEV = 1UL << 1, GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2, GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3, }; enum gid_table_entry_props { GID_TABLE_ENTRY_INVALID = 1UL << 0, GID_TABLE_ENTRY_DEFAULT = 1UL << 1, }; enum gid_table_write_action { GID_TABLE_WRITE_ACTION_ADD, GID_TABLE_WRITE_ACTION_DEL, /* MODIFY only updates the GID table. Currently only used by * ib_cache_update. */ GID_TABLE_WRITE_ACTION_MODIFY }; struct ib_gid_table_entry { unsigned long props; union ib_gid gid; struct ib_gid_attr attr; void *context; }; struct ib_gid_table { int sz; /* In RoCE, adding a GID to the table requires: * (a) Find if this GID is already exists. * (b) Find a free space. * (c) Write the new GID * * Delete requires different set of operations: * (a) Find the GID * (b) Delete it. * * Add/delete should be carried out atomically. * This is done by locking this mutex from multiple * writers. We don't need this lock for IB, as the MAD * layer replaces all entries. All data_vec entries * are locked by this lock. **/ struct mutex lock; /* This lock protects the table entries from being * read and written simultaneously. */ rwlock_t rwlock; struct ib_gid_table_entry *data_vec; }; static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) { if (rdma_cap_roce_gid_table(ib_dev, port)) { struct ib_event event; event.device = ib_dev; event.element.port_num = port; event.event = IB_EVENT_GID_CHANGE; ib_dispatch_event(&event); } } static const char * const gid_type_str[] = { [IB_GID_TYPE_IB] = "IB/RoCE v1", [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", }; const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) { if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type]) return gid_type_str[gid_type]; return "Invalid GID type"; } EXPORT_SYMBOL(ib_cache_gid_type_str); int ib_cache_gid_parse_type_str(const char *buf) { unsigned int i; size_t len; int err = -EINVAL; len = strlen(buf); if (len == 0) return -EINVAL; if (buf[len - 1] == '\n') len--; for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i) if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) && len == strlen(gid_type_str[i])) { err = i; break; } return err; } EXPORT_SYMBOL(ib_cache_gid_parse_type_str); /* This function expects that rwlock will be write locked in all * scenarios and that lock will be locked in sleep-able (RoCE) * scenarios. */ static int write_gid(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table, int ix, const union ib_gid *gid, const struct ib_gid_attr *attr, enum gid_table_write_action action, bool default_gid) __releases(&table->rwlock) __acquires(&table->rwlock) { int ret = 0; - struct net_device *old_net_dev; + struct ifnet *old_net_dev; enum ib_gid_type old_gid_type; /* in rdma_cap_roce_gid_table, this funciton should be protected by a * sleep-able lock. */ if (rdma_cap_roce_gid_table(ib_dev, port)) { table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID; write_unlock_irq(&table->rwlock); /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by * RoCE providers and thus only updates the cache. */ if (action == GID_TABLE_WRITE_ACTION_ADD) ret = ib_dev->add_gid(ib_dev, port, ix, gid, attr, &table->data_vec[ix].context); else if (action == GID_TABLE_WRITE_ACTION_DEL) ret = ib_dev->del_gid(ib_dev, port, ix, &table->data_vec[ix].context); write_lock_irq(&table->rwlock); } old_net_dev = table->data_vec[ix].attr.ndev; old_gid_type = table->data_vec[ix].attr.gid_type; if (old_net_dev && old_net_dev != attr->ndev) dev_put(old_net_dev); /* if modify_gid failed, just delete the old gid */ if (ret || action == GID_TABLE_WRITE_ACTION_DEL) { gid = &zgid; attr = &zattr; table->data_vec[ix].context = NULL; } memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid)); memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr)); if (default_gid) { table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT; if (action == GID_TABLE_WRITE_ACTION_DEL) table->data_vec[ix].attr.gid_type = old_gid_type; } if (table->data_vec[ix].attr.ndev && table->data_vec[ix].attr.ndev != old_net_dev) dev_hold(table->data_vec[ix].attr.ndev); table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID; return ret; } static int add_gid(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table, int ix, const union ib_gid *gid, const struct ib_gid_attr *attr, bool default_gid) { return write_gid(ib_dev, port, table, ix, gid, attr, GID_TABLE_WRITE_ACTION_ADD, default_gid); } static int modify_gid(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table, int ix, const union ib_gid *gid, const struct ib_gid_attr *attr, bool default_gid) { return write_gid(ib_dev, port, table, ix, gid, attr, GID_TABLE_WRITE_ACTION_MODIFY, default_gid); } static int del_gid(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table, int ix, bool default_gid) { return write_gid(ib_dev, port, table, ix, &zgid, &zattr, GID_TABLE_WRITE_ACTION_DEL, default_gid); } /* rwlock should be read locked */ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, const struct ib_gid_attr *val, bool default_gid, unsigned long mask, int *pempty) { int i = 0; int found = -1; int empty = pempty ? -1 : 0; while (i < table->sz && (found < 0 || empty < 0)) { struct ib_gid_table_entry *data = &table->data_vec[i]; struct ib_gid_attr *attr = &data->attr; int curr_index = i; i++; if (data->props & GID_TABLE_ENTRY_INVALID) continue; if (empty < 0) if (!memcmp(&data->gid, &zgid, sizeof(*gid)) && !memcmp(attr, &zattr, sizeof(*attr)) && !data->props) empty = curr_index; if (found >= 0) continue; if (mask & GID_ATTR_FIND_MASK_GID_TYPE && attr->gid_type != val->gid_type) continue; if (mask & GID_ATTR_FIND_MASK_GID && memcmp(gid, &data->gid, sizeof(*gid))) continue; if (mask & GID_ATTR_FIND_MASK_NETDEV && attr->ndev != val->ndev) continue; if (mask & GID_ATTR_FIND_MASK_DEFAULT && !!(data->props & GID_TABLE_ENTRY_DEFAULT) != default_gid) continue; found = curr_index; } if (pempty) *pempty = empty; return found; } -static void addrconf_ifid_eui48(u8 *eui, struct net_device *dev) +static void addrconf_ifid_eui48(u8 *eui, struct ifnet *dev) { if (dev->if_addrlen != ETH_ALEN) return; memcpy(eui, IF_LLADDR(dev), 3); memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); /* NOTE: The scope ID is added by the GID to IP conversion */ eui[3] = 0xFF; eui[4] = 0xFE; eui[0] ^= 2; } -static void make_default_gid(struct net_device *dev, union ib_gid *gid) +static void make_default_gid(struct ifnet *dev, union ib_gid *gid) { gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); addrconf_ifid_eui48(&gid->raw[8], dev); } int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, union ib_gid *gid, struct ib_gid_attr *attr) { struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; int ix; int ret = 0; int empty; table = ports_table[port - rdma_start_port(ib_dev)]; if (!memcmp(gid, &zgid, sizeof(*gid))) return -EINVAL; mutex_lock(&table->lock); write_lock_irq(&table->rwlock); ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_NETDEV, &empty); if (ix >= 0) goto out_unlock; if (empty < 0) { ret = -ENOSPC; goto out_unlock; } ret = add_gid(ib_dev, port, table, empty, gid, attr, false); if (!ret) dispatch_gid_change_event(ib_dev, port); out_unlock: write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); return ret; } int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, union ib_gid *gid, struct ib_gid_attr *attr) { struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; int ix; table = ports_table[port - rdma_start_port(ib_dev)]; mutex_lock(&table->lock); write_lock_irq(&table->rwlock); ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_NETDEV | GID_ATTR_FIND_MASK_DEFAULT, NULL); if (ix < 0) goto out_unlock; if (!del_gid(ib_dev, port, table, ix, false)) dispatch_gid_change_event(ib_dev, port); out_unlock: write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); return 0; } int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, - struct net_device *ndev) + struct ifnet *ndev) { struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; int ix; bool deleted = false; table = ports_table[port - rdma_start_port(ib_dev)]; mutex_lock(&table->lock); write_lock_irq(&table->rwlock); for (ix = 0; ix < table->sz; ix++) if (table->data_vec[ix].attr.ndev == ndev) if (!del_gid(ib_dev, port, table, ix, !!(table->data_vec[ix].props & GID_TABLE_ENTRY_DEFAULT))) deleted = true; write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); if (deleted) dispatch_gid_change_event(ib_dev, port); return 0; } static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, union ib_gid *gid, struct ib_gid_attr *attr) { struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; table = ports_table[port - rdma_start_port(ib_dev)]; if (index < 0 || index >= table->sz) return -EINVAL; if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) return -EAGAIN; memcpy(gid, &table->data_vec[index].gid, sizeof(*gid)); if (attr) { memcpy(attr, &table->data_vec[index].attr, sizeof(*attr)); /* make sure network device is valid and attached */ if (attr->ndev != NULL && (attr->ndev->if_flags & IFF_DYING) == 0 && attr->ndev->if_addr != NULL) dev_hold(attr->ndev); else attr->ndev = NULL; } return 0; } static int _ib_cache_gid_table_find(struct ib_device *ib_dev, const union ib_gid *gid, const struct ib_gid_attr *val, unsigned long mask, u8 *port, u16 *index) { struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; u8 p; int local_index; unsigned long flags; for (p = 0; p < ib_dev->phys_port_cnt; p++) { table = ports_table[p]; read_lock_irqsave(&table->rwlock, flags); local_index = find_gid(table, gid, val, false, mask, NULL); if (local_index >= 0) { if (index) *index = local_index; if (port) *port = p + rdma_start_port(ib_dev); read_unlock_irqrestore(&table->rwlock, flags); return 0; } read_unlock_irqrestore(&table->rwlock, flags); } return -ENOENT; } static int ib_cache_gid_find(struct ib_device *ib_dev, const union ib_gid *gid, enum ib_gid_type gid_type, - struct net_device *ndev, u8 *port, + struct ifnet *ndev, u8 *port, u16 *index) { unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val, mask, port, index); } int ib_find_cached_gid_by_port(struct ib_device *ib_dev, const union ib_gid *gid, enum ib_gid_type gid_type, - u8 port, struct net_device *ndev, + u8 port, struct ifnet *ndev, u16 *index) { int local_index; struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; unsigned long flags; if (port < rdma_start_port(ib_dev) || port > rdma_end_port(ib_dev)) return -ENOENT; table = ports_table[port - rdma_start_port(ib_dev)]; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; read_lock_irqsave(&table->rwlock, flags); local_index = find_gid(table, gid, &val, false, mask, NULL); if (local_index >= 0) { if (index) *index = local_index; read_unlock_irqrestore(&table->rwlock, flags); return 0; } read_unlock_irqrestore(&table->rwlock, flags); return -ENOENT; } EXPORT_SYMBOL(ib_find_cached_gid_by_port); /** * ib_find_gid_by_filter - Returns the GID table index where a specified * GID value occurs * @device: The device to query. * @gid: The GID value to search for. * @port_num: The port number of the device where the GID value could be * searched. * @filter: The filter function is executed on any matching GID in the table. * If the filter function returns true, the corresponding index is returned, * otherwise, we continue searching the GID table. It's guaranteed that * while filter is executed, ndev field is valid and the structure won't * change. filter is executed in an atomic context. filter must not be NULL. * @index: The index into the cached GID table where the GID was found. This * parameter may be NULL. * * ib_cache_gid_find_by_filter() searches for the specified GID value * of which the filter function returns true in the port's GID table. * This function is only supported on RoCE ports. * */ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, const union ib_gid *gid, u8 port, bool (*filter)(const union ib_gid *, const struct ib_gid_attr *, void *), void *context, u16 *index) { struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; unsigned int i; unsigned long flags; bool found = false; if (!ports_table) return -EOPNOTSUPP; if (port < rdma_start_port(ib_dev) || port > rdma_end_port(ib_dev) || !rdma_protocol_roce(ib_dev, port)) return -EPROTONOSUPPORT; table = ports_table[port - rdma_start_port(ib_dev)]; read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { struct ib_gid_attr attr; if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) goto next; if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) goto next; memcpy(&attr, &table->data_vec[i].attr, sizeof(attr)); if (filter(gid, &attr, context)) found = true; next: if (found) break; } read_unlock_irqrestore(&table->rwlock, flags); if (!found) return -ENOENT; if (index) *index = i; return 0; } static struct ib_gid_table *alloc_gid_table(int sz) { struct ib_gid_table *table = kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL); if (!table) return NULL; table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL); if (!table->data_vec) goto err_free_table; mutex_init(&table->lock); table->sz = sz; rwlock_init(&table->rwlock); return table; err_free_table: kfree(table); return NULL; } static void release_gid_table(struct ib_gid_table *table) { if (table) { kfree(table->data_vec); kfree(table); } } static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table) { int i; bool deleted = false; if (!table) return; write_lock_irq(&table->rwlock); for (i = 0; i < table->sz; ++i) { if (memcmp(&table->data_vec[i].gid, &zgid, sizeof(table->data_vec[i].gid))) if (!del_gid(ib_dev, port, table, i, table->data_vec[i].props & GID_ATTR_FIND_MASK_DEFAULT)) deleted = true; } write_unlock_irq(&table->rwlock); if (deleted) dispatch_gid_change_event(ib_dev, port); } void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, - struct net_device *ndev, + struct ifnet *ndev, unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode) { struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; union ib_gid gid; struct ib_gid_attr gid_attr; struct ib_gid_attr zattr_type = zattr; struct ib_gid_table *table; unsigned int gid_type; table = ports_table[port - rdma_start_port(ib_dev)]; make_default_gid(ndev, &gid); memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; /* Default GID is created using unique GUID and local subnet prefix, * as described in section 4.1.1 and 3.5.10 in IB spec 1.3. * Therefore don't create RoCEv2 default GID based on it that * resembles as IPv6 GID based on link local address when IPv6 is * disabled in kernel. */ #ifndef INET6 gid_type_mask &= ~BIT(IB_GID_TYPE_ROCE_UDP_ENCAP); #endif for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { int ix; union ib_gid current_gid; struct ib_gid_attr current_gid_attr = {}; if (1UL << gid_type & ~gid_type_mask) continue; gid_attr.gid_type = gid_type; mutex_lock(&table->lock); write_lock_irq(&table->rwlock); ix = find_gid(table, NULL, &gid_attr, true, GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_DEFAULT, NULL); /* Coudn't find default GID location */ if (WARN_ON(ix < 0)) goto release; zattr_type.gid_type = gid_type; if (!__ib_cache_gid_get(ib_dev, port, ix, ¤t_gid, ¤t_gid_attr) && mode == IB_CACHE_GID_DEFAULT_MODE_SET && !memcmp(&gid, ¤t_gid, sizeof(gid)) && !memcmp(&gid_attr, ¤t_gid_attr, sizeof(gid_attr))) goto release; if (memcmp(¤t_gid, &zgid, sizeof(current_gid)) || memcmp(¤t_gid_attr, &zattr_type, sizeof(current_gid_attr))) { if (del_gid(ib_dev, port, table, ix, true)) { pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", ix, gid.raw); goto release; } else { dispatch_gid_change_event(ib_dev, port); } } if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) pr_warn("ib_cache_gid: unable to add default gid %pI6\n", gid.raw); else dispatch_gid_change_event(ib_dev, port); } release: if (current_gid_attr.ndev) dev_put(current_gid_attr.ndev); write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); } } static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table) { unsigned int i; unsigned long roce_gid_type_mask; unsigned int num_default_gids; unsigned int current_gid = 0; roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); num_default_gids = hweight_long(roce_gid_type_mask); for (i = 0; i < num_default_gids && i < table->sz; i++) { struct ib_gid_table_entry *entry = &table->data_vec[i]; entry->props |= GID_TABLE_ENTRY_DEFAULT; current_gid = find_next_bit(&roce_gid_type_mask, BITS_PER_LONG, current_gid); entry->attr.gid_type = current_gid++; } return 0; } static int _gid_table_setup_one(struct ib_device *ib_dev) { u8 port; struct ib_gid_table **table; int err = 0; table = kcalloc(ib_dev->phys_port_cnt, sizeof(*table), GFP_KERNEL); if (!table) { pr_warn("failed to allocate ib gid cache for %s\n", ib_dev->name); return -ENOMEM; } for (port = 0; port < ib_dev->phys_port_cnt; port++) { u8 rdma_port = port + rdma_start_port(ib_dev); table[port] = alloc_gid_table( ib_dev->port_immutable[rdma_port].gid_tbl_len); if (!table[port]) { err = -ENOMEM; goto rollback_table_setup; } err = gid_table_reserve_default(ib_dev, port + rdma_start_port(ib_dev), table[port]); if (err) goto rollback_table_setup; } ib_dev->cache.gid_cache = table; return 0; rollback_table_setup: for (port = 0; port < ib_dev->phys_port_cnt; port++) { cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), table[port]); release_gid_table(table[port]); } kfree(table); return err; } static void gid_table_release_one(struct ib_device *ib_dev) { struct ib_gid_table **table = ib_dev->cache.gid_cache; u8 port; if (!table) return; for (port = 0; port < ib_dev->phys_port_cnt; port++) release_gid_table(table[port]); kfree(table); ib_dev->cache.gid_cache = NULL; } static void gid_table_cleanup_one(struct ib_device *ib_dev) { struct ib_gid_table **table = ib_dev->cache.gid_cache; u8 port; if (!table) return; for (port = 0; port < ib_dev->phys_port_cnt; port++) cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), table[port]); } static int gid_table_setup_one(struct ib_device *ib_dev) { int err; err = _gid_table_setup_one(ib_dev); if (err) return err; err = roce_rescan_device(ib_dev); if (err) { gid_table_cleanup_one(ib_dev); gid_table_release_one(ib_dev); } return err; } int ib_get_cached_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid, struct ib_gid_attr *gid_attr) { int res; unsigned long flags; struct ib_gid_table **ports_table = device->cache.gid_cache; struct ib_gid_table *table = ports_table[port_num - rdma_start_port(device)]; if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; read_lock_irqsave(&table->rwlock, flags); res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr); read_unlock_irqrestore(&table->rwlock, flags); return res; } EXPORT_SYMBOL(ib_get_cached_gid); int ib_find_cached_gid(struct ib_device *device, const union ib_gid *gid, enum ib_gid_type gid_type, - struct net_device *ndev, + struct ifnet *ndev, u8 *port_num, u16 *index) { return ib_cache_gid_find(device, gid, gid_type, ndev, port_num, index); } EXPORT_SYMBOL(ib_find_cached_gid); int ib_find_gid_by_filter(struct ib_device *device, const union ib_gid *gid, u8 port_num, bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *, void *), void *context, u16 *index) { /* Only RoCE GID table supports filter function */ if (!rdma_cap_roce_gid_table(device, port_num) && filter) return -EPROTONOSUPPORT; return ib_cache_gid_find_by_filter(device, gid, port_num, filter, context, index); } EXPORT_SYMBOL(ib_find_gid_by_filter); int ib_get_cached_pkey(struct ib_device *device, u8 port_num, int index, u16 *pkey) { struct ib_pkey_cache *cache; unsigned long flags; int ret = 0; if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); cache = device->cache.pkey_cache[port_num - rdma_start_port(device)]; if (index < 0 || index >= cache->table_len) ret = -EINVAL; else *pkey = cache->table[index]; read_unlock_irqrestore(&device->cache.lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_pkey); int ib_find_cached_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index) { struct ib_pkey_cache *cache; unsigned long flags; int i; int ret = -ENOENT; int partial_ix = -1; if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); cache = device->cache.pkey_cache[port_num - rdma_start_port(device)]; *index = -1; for (i = 0; i < cache->table_len; ++i) if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { if (cache->table[i] & 0x8000) { *index = i; ret = 0; break; } else partial_ix = i; } if (ret && partial_ix >= 0) { *index = partial_ix; ret = 0; } read_unlock_irqrestore(&device->cache.lock, flags); return ret; } EXPORT_SYMBOL(ib_find_cached_pkey); int ib_find_exact_cached_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index) { struct ib_pkey_cache *cache; unsigned long flags; int i; int ret = -ENOENT; if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); cache = device->cache.pkey_cache[port_num - rdma_start_port(device)]; *index = -1; for (i = 0; i < cache->table_len; ++i) if (cache->table[i] == pkey) { *index = i; ret = 0; break; } read_unlock_irqrestore(&device->cache.lock, flags); return ret; } EXPORT_SYMBOL(ib_find_exact_cached_pkey); int ib_get_cached_lmc(struct ib_device *device, u8 port_num, u8 *lmc) { unsigned long flags; int ret = 0; if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); *lmc = device->cache.lmc_cache[port_num - rdma_start_port(device)]; read_unlock_irqrestore(&device->cache.lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_lmc); static void ib_cache_update(struct ib_device *device, u8 port) { struct ib_port_attr *tprops = NULL; struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; struct ib_gid_cache { int table_len; union ib_gid table[0]; } *gid_cache = NULL; int i; int ret; struct ib_gid_table *table; struct ib_gid_table **ports_table = device->cache.gid_cache; bool use_roce_gid_table = rdma_cap_roce_gid_table(device, port); if (port < rdma_start_port(device) || port > rdma_end_port(device)) return; table = ports_table[port - rdma_start_port(device)]; tprops = kmalloc(sizeof *tprops, GFP_KERNEL); if (!tprops) return; ret = ib_query_port(device, port, tprops); if (ret) { pr_warn("ib_query_port failed (%d) for %s\n", ret, device->name); goto err; } pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len * sizeof *pkey_cache->table, GFP_KERNEL); if (!pkey_cache) goto err; pkey_cache->table_len = tprops->pkey_tbl_len; if (!use_roce_gid_table) { gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len * sizeof(*gid_cache->table), GFP_KERNEL); if (!gid_cache) goto err; gid_cache->table_len = tprops->gid_tbl_len; } for (i = 0; i < pkey_cache->table_len; ++i) { ret = ib_query_pkey(device, port, i, pkey_cache->table + i); if (ret) { pr_warn("ib_query_pkey failed (%d) for %s (index %d)\n", ret, device->name, i); goto err; } } if (!use_roce_gid_table) { for (i = 0; i < gid_cache->table_len; ++i) { ret = ib_query_gid(device, port, i, gid_cache->table + i, NULL); if (ret) { pr_warn("ib_query_gid failed (%d) for %s (index %d)\n", ret, device->name, i); goto err; } } } write_lock_irq(&device->cache.lock); old_pkey_cache = device->cache.pkey_cache[port - rdma_start_port(device)]; device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache; if (!use_roce_gid_table) { write_lock(&table->rwlock); for (i = 0; i < gid_cache->table_len; i++) { modify_gid(device, port, table, i, gid_cache->table + i, &zattr, false); } write_unlock(&table->rwlock); } device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc; write_unlock_irq(&device->cache.lock); kfree(gid_cache); kfree(old_pkey_cache); kfree(tprops); return; err: kfree(pkey_cache); kfree(gid_cache); kfree(tprops); } static void ib_cache_task(struct work_struct *_work) { struct ib_update_work *work = container_of(_work, struct ib_update_work, work); ib_cache_update(work->device, work->port_num); kfree(work); } static void ib_cache_event(struct ib_event_handler *handler, struct ib_event *event) { struct ib_update_work *work; if (event->event == IB_EVENT_PORT_ERR || event->event == IB_EVENT_PORT_ACTIVE || event->event == IB_EVENT_LID_CHANGE || event->event == IB_EVENT_PKEY_CHANGE || event->event == IB_EVENT_SM_CHANGE || event->event == IB_EVENT_CLIENT_REREGISTER || event->event == IB_EVENT_GID_CHANGE) { work = kmalloc(sizeof *work, GFP_ATOMIC); if (work) { INIT_WORK(&work->work, ib_cache_task); work->device = event->device; work->port_num = event->element.port_num; queue_work(ib_wq, &work->work); } } } int ib_cache_setup_one(struct ib_device *device) { int p; int err; rwlock_init(&device->cache.lock); device->cache.pkey_cache = kzalloc(sizeof *device->cache.pkey_cache * (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL); device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache * (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL); if (!device->cache.pkey_cache || !device->cache.lmc_cache) { pr_warn("Couldn't allocate cache for %s\n", device->name); return -ENOMEM; } err = gid_table_setup_one(device); if (err) /* Allocated memory will be cleaned in the release function */ return err; for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) ib_cache_update(device, p + rdma_start_port(device)); INIT_IB_EVENT_HANDLER(&device->cache.event_handler, device, ib_cache_event); err = ib_register_event_handler(&device->cache.event_handler); if (err) goto err; return 0; err: gid_table_cleanup_one(device); return err; } void ib_cache_release_one(struct ib_device *device) { int p; /* * The release function frees all the cache elements. * This function should be called as part of freeing * all the device's resources when the cache could no * longer be accessed. */ if (device->cache.pkey_cache) for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) kfree(device->cache.pkey_cache[p]); gid_table_release_one(device); kfree(device->cache.pkey_cache); kfree(device->cache.lmc_cache); } void ib_cache_cleanup_one(struct ib_device *device) { /* The cleanup function unregisters the event handler, * waits for all in-progress workqueue elements and cleans * up the GID cache. This function should be called after * the device was removed from the devices list and all * clients were removed, so the cache exists but is * non-functional and shouldn't be updated anymore. */ ib_unregister_event_handler(&device->cache.event_handler); flush_workqueue(ib_wq); gid_table_cleanup_one(device); } void __init ib_cache_setup(void) { roce_gid_mgmt_init(); } void __exit ib_cache_cleanup(void) { roce_gid_mgmt_cleanup(); } diff --git a/sys/ofed/drivers/infiniband/core/ib_cm.c b/sys/ofed/drivers/infiniband/core/ib_cm.c index 7d5197cc4a0d..95b442eb0dd1 100644 --- a/sys/ofed/drivers/infiniband/core/ib_cm.c +++ b/sys/ofed/drivers/infiniband/core/ib_cm.c @@ -1,4185 +1,4185 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004-2007 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cm_msgs.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("InfiniBand CM"); MODULE_LICENSE("Dual BSD/GPL"); static void cm_add_one(struct ib_device *device); static void cm_remove_one(struct ib_device *device, void *client_data); static struct ib_client cm_client = { .name = "cm", .add = cm_add_one, .remove = cm_remove_one }; static struct ib_cm { spinlock_t lock; struct list_head device_list; rwlock_t device_lock; struct rb_root listen_service_table; u64 listen_service_id; /* struct rb_root peer_service_table; todo: fix peer to peer */ struct rb_root remote_qp_table; struct rb_root remote_id_table; struct rb_root remote_sidr_table; struct idr local_id_table; __be32 random_id_operand; struct list_head timewait_list; struct workqueue_struct *wq; /* Sync on cm change port state */ spinlock_t state_lock; } cm; /* Counter indexes ordered by attribute ID */ enum { CM_REQ_COUNTER, CM_MRA_COUNTER, CM_REJ_COUNTER, CM_REP_COUNTER, CM_RTU_COUNTER, CM_DREQ_COUNTER, CM_DREP_COUNTER, CM_SIDR_REQ_COUNTER, CM_SIDR_REP_COUNTER, CM_LAP_COUNTER, CM_APR_COUNTER, CM_ATTR_COUNT, CM_ATTR_ID_OFFSET = 0x0010, }; enum { CM_XMIT, CM_XMIT_RETRIES, CM_RECV, CM_RECV_DUPLICATES, CM_COUNTER_GROUPS }; static char const counter_group_names[CM_COUNTER_GROUPS] [sizeof("cm_rx_duplicates")] = { "cm_tx_msgs", "cm_tx_retries", "cm_rx_msgs", "cm_rx_duplicates" }; struct cm_counter_group { struct kobject obj; atomic_long_t counter[CM_ATTR_COUNT]; }; struct cm_counter_attribute { struct attribute attr; int index; }; #define CM_COUNTER_ATTR(_name, _index) \ struct cm_counter_attribute cm_##_name##_counter_attr = { \ .attr = { .name = __stringify(_name), .mode = 0444 }, \ .index = _index \ } static CM_COUNTER_ATTR(req, CM_REQ_COUNTER); static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER); static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER); static CM_COUNTER_ATTR(rep, CM_REP_COUNTER); static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER); static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER); static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER); static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER); static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER); static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER); static CM_COUNTER_ATTR(apr, CM_APR_COUNTER); static struct attribute *cm_counter_default_attrs[] = { &cm_req_counter_attr.attr, &cm_mra_counter_attr.attr, &cm_rej_counter_attr.attr, &cm_rep_counter_attr.attr, &cm_rtu_counter_attr.attr, &cm_dreq_counter_attr.attr, &cm_drep_counter_attr.attr, &cm_sidr_req_counter_attr.attr, &cm_sidr_rep_counter_attr.attr, &cm_lap_counter_attr.attr, &cm_apr_counter_attr.attr, NULL }; struct cm_port { struct cm_device *cm_dev; struct ib_mad_agent *mad_agent; struct kobject port_obj; u8 port_num; struct list_head cm_priv_prim_list; struct list_head cm_priv_altr_list; struct cm_counter_group counter_group[CM_COUNTER_GROUPS]; }; struct cm_device { struct list_head list; struct ib_device *ib_device; struct device *device; u8 ack_delay; int going_down; struct cm_port *port[0]; }; struct cm_av { struct cm_port *port; union ib_gid dgid; struct ib_ah_attr ah_attr; u16 pkey_index; u8 timeout; }; struct cm_work { struct delayed_work work; struct list_head list; struct cm_port *port; struct ib_mad_recv_wc *mad_recv_wc; /* Received MADs */ __be32 local_id; /* Established / timewait */ __be32 remote_id; struct ib_cm_event cm_event; struct ib_sa_path_rec path[0]; }; struct cm_timewait_info { struct cm_work work; /* Must be first. */ struct list_head list; struct rb_node remote_qp_node; struct rb_node remote_id_node; __be64 remote_ca_guid; __be32 remote_qpn; u8 inserted_remote_qp; u8 inserted_remote_id; }; struct cm_id_private { struct ib_cm_id id; struct rb_node service_node; struct rb_node sidr_id_node; spinlock_t lock; /* Do not acquire inside cm.lock */ struct completion comp; atomic_t refcount; /* Number of clients sharing this ib_cm_id. Only valid for listeners. * Protected by the cm.lock spinlock. */ int listen_sharecount; struct ib_mad_send_buf *msg; struct cm_timewait_info *timewait_info; /* todo: use alternate port on send failure */ struct cm_av av; struct cm_av alt_av; void *private_data; __be64 tid; __be32 local_qpn; __be32 remote_qpn; enum ib_qp_type qp_type; __be32 sq_psn; __be32 rq_psn; int timeout_ms; enum ib_mtu path_mtu; __be16 pkey; u8 private_data_len; u8 max_cm_retries; u8 peer_to_peer; u8 responder_resources; u8 initiator_depth; u8 retry_count; u8 rnr_retry_count; u8 service_timeout; u8 target_ack_delay; struct list_head prim_list; struct list_head altr_list; /* Indicates that the send port mad is registered and av is set */ int prim_send_port_not_ready; int altr_send_port_not_ready; struct list_head work_list; atomic_t work_count; }; static void cm_work_handler(struct work_struct *work); static inline void cm_deref_id(struct cm_id_private *cm_id_priv) { if (atomic_dec_and_test(&cm_id_priv->refcount)) complete(&cm_id_priv->comp); } static int cm_alloc_msg(struct cm_id_private *cm_id_priv, struct ib_mad_send_buf **msg) { struct ib_mad_agent *mad_agent; struct ib_mad_send_buf *m; struct ib_ah *ah; struct cm_av *av; unsigned long flags, flags2; int ret = 0; /* don't let the port to be released till the agent is down */ spin_lock_irqsave(&cm.state_lock, flags2); spin_lock_irqsave(&cm.lock, flags); if (!cm_id_priv->prim_send_port_not_ready) av = &cm_id_priv->av; else if (!cm_id_priv->altr_send_port_not_ready && (cm_id_priv->alt_av.port)) av = &cm_id_priv->alt_av; else { pr_info("%s: not valid CM id\n", __func__); ret = -ENODEV; spin_unlock_irqrestore(&cm.lock, flags); goto out; } spin_unlock_irqrestore(&cm.lock, flags); /* Make sure the port haven't released the mad yet */ mad_agent = cm_id_priv->av.port->mad_agent; if (!mad_agent) { pr_info("%s: not a valid MAD agent\n", __func__); ret = -ENODEV; goto out; } ah = ib_create_ah(mad_agent->qp->pd, &av->ah_attr); if (IS_ERR(ah)) { ret = PTR_ERR(ah); goto out; } m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn, av->pkey_index, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_ATOMIC, IB_MGMT_BASE_VERSION); if (IS_ERR(m)) { ib_destroy_ah(ah); ret = PTR_ERR(m); goto out; } /* Timeout set by caller if response is expected. */ m->ah = ah; m->retries = cm_id_priv->max_cm_retries; atomic_inc(&cm_id_priv->refcount); m->context[0] = cm_id_priv; *msg = m; out: spin_unlock_irqrestore(&cm.state_lock, flags2); return ret; } static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc) { return ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_ATOMIC, IB_MGMT_BASE_VERSION); } static int cm_create_response_msg_ah(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc, struct ib_mad_send_buf *msg) { struct ib_ah *ah; ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc, mad_recv_wc->recv_buf.grh, port->port_num); if (IS_ERR(ah)) return PTR_ERR(ah); msg->ah = ah; return 0; } static void cm_free_msg(struct ib_mad_send_buf *msg) { if (msg->ah) ib_destroy_ah(msg->ah); if (msg->context[0]) cm_deref_id(msg->context[0]); ib_free_send_mad(msg); } static int cm_alloc_response_msg(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc, struct ib_mad_send_buf **msg) { struct ib_mad_send_buf *m; int ret; m = cm_alloc_response_msg_no_ah(port, mad_recv_wc); if (IS_ERR(m)) return PTR_ERR(m); ret = cm_create_response_msg_ah(port, mad_recv_wc, m); if (ret) { cm_free_msg(m); return ret; } *msg = m; return 0; } static void * cm_copy_private_data(const void *private_data, u8 private_data_len) { void *data; if (!private_data || !private_data_len) return NULL; data = kmemdup(private_data, private_data_len, GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); return data; } static void cm_set_private_data(struct cm_id_private *cm_id_priv, void *private_data, u8 private_data_len) { if (cm_id_priv->private_data && cm_id_priv->private_data_len) kfree(cm_id_priv->private_data); cm_id_priv->private_data = private_data; cm_id_priv->private_data_len = private_data_len; } static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, struct ib_grh *grh, struct cm_av *av) { av->port = port; av->pkey_index = wc->pkey_index; return ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num, wc, grh, &av->ah_attr); } static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av, struct cm_id_private *cm_id_priv) { struct cm_device *cm_dev; struct cm_port *port = NULL; unsigned long flags; int ret; u8 p; - struct net_device *ndev = ib_get_ndev_from_path(path); + struct ifnet *ndev = ib_get_ndev_from_path(path); read_lock_irqsave(&cm.device_lock, flags); list_for_each_entry(cm_dev, &cm.device_list, list) { if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, path->gid_type, ndev, &p, NULL)) { port = cm_dev->port[p-1]; break; } } read_unlock_irqrestore(&cm.device_lock, flags); if (ndev) dev_put(ndev); if (!port) return -EINVAL; ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num, be16_to_cpu(path->pkey), &av->pkey_index); if (ret) return ret; av->port = port; ret = ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, &av->ah_attr); if (ret) return ret; av->timeout = path->packet_life_time + 1; spin_lock_irqsave(&cm.lock, flags); if (&cm_id_priv->av == av) list_add_tail(&cm_id_priv->prim_list, &port->cm_priv_prim_list); else if (&cm_id_priv->alt_av == av) list_add_tail(&cm_id_priv->altr_list, &port->cm_priv_altr_list); else ret = -EINVAL; spin_unlock_irqrestore(&cm.lock, flags); return ret; } static int cm_alloc_id(struct cm_id_private *cm_id_priv) { unsigned long flags; int id; idr_preload(GFP_KERNEL); spin_lock_irqsave(&cm.lock, flags); id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT); spin_unlock_irqrestore(&cm.lock, flags); idr_preload_end(); cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand; return id < 0 ? id : 0; } static void cm_free_id(__be32 local_id) { spin_lock_irq(&cm.lock); idr_remove(&cm.local_id_table, (__force int) (local_id ^ cm.random_id_operand)); spin_unlock_irq(&cm.lock); } static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id) { struct cm_id_private *cm_id_priv; cm_id_priv = idr_find(&cm.local_id_table, (__force int) (local_id ^ cm.random_id_operand)); if (cm_id_priv) { if (cm_id_priv->id.remote_id == remote_id) atomic_inc(&cm_id_priv->refcount); else cm_id_priv = NULL; } return cm_id_priv; } static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id) { struct cm_id_private *cm_id_priv; spin_lock_irq(&cm.lock); cm_id_priv = cm_get_id(local_id, remote_id); spin_unlock_irq(&cm.lock); return cm_id_priv; } /* * Trivial helpers to strip endian annotation and compare; the * endianness doesn't actually matter since we just need a stable * order for the RB tree. */ static int be32_lt(__be32 a, __be32 b) { return (__force u32) a < (__force u32) b; } static int be32_gt(__be32 a, __be32 b) { return (__force u32) a > (__force u32) b; } static int be64_lt(__be64 a, __be64 b) { return (__force u64) a < (__force u64) b; } static int be64_gt(__be64 a, __be64 b) { return (__force u64) a > (__force u64) b; } static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) { struct rb_node **link = &cm.listen_service_table.rb_node; struct rb_node *parent = NULL; struct cm_id_private *cur_cm_id_priv; __be64 service_id = cm_id_priv->id.service_id; __be64 service_mask = cm_id_priv->id.service_mask; while (*link) { parent = *link; cur_cm_id_priv = rb_entry(parent, struct cm_id_private, service_node); if ((cur_cm_id_priv->id.service_mask & service_id) == (service_mask & cur_cm_id_priv->id.service_id) && (cm_id_priv->id.device == cur_cm_id_priv->id.device)) return cur_cm_id_priv; if (cm_id_priv->id.device < cur_cm_id_priv->id.device) link = &(*link)->rb_left; else if (cm_id_priv->id.device > cur_cm_id_priv->id.device) link = &(*link)->rb_right; else if (be64_lt(service_id, cur_cm_id_priv->id.service_id)) link = &(*link)->rb_left; else if (be64_gt(service_id, cur_cm_id_priv->id.service_id)) link = &(*link)->rb_right; else link = &(*link)->rb_right; } rb_link_node(&cm_id_priv->service_node, parent, link); rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table); return NULL; } static struct cm_id_private * cm_find_listen(struct ib_device *device, __be64 service_id) { struct rb_node *node = cm.listen_service_table.rb_node; struct cm_id_private *cm_id_priv; while (node) { cm_id_priv = rb_entry(node, struct cm_id_private, service_node); if ((cm_id_priv->id.service_mask & service_id) == cm_id_priv->id.service_id && (cm_id_priv->id.device == device)) return cm_id_priv; if (device < cm_id_priv->id.device) node = node->rb_left; else if (device > cm_id_priv->id.device) node = node->rb_right; else if (be64_lt(service_id, cm_id_priv->id.service_id)) node = node->rb_left; else if (be64_gt(service_id, cm_id_priv->id.service_id)) node = node->rb_right; else node = node->rb_right; } return NULL; } static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info *timewait_info) { struct rb_node **link = &cm.remote_id_table.rb_node; struct rb_node *parent = NULL; struct cm_timewait_info *cur_timewait_info; __be64 remote_ca_guid = timewait_info->remote_ca_guid; __be32 remote_id = timewait_info->work.remote_id; while (*link) { parent = *link; cur_timewait_info = rb_entry(parent, struct cm_timewait_info, remote_id_node); if (be32_lt(remote_id, cur_timewait_info->work.remote_id)) link = &(*link)->rb_left; else if (be32_gt(remote_id, cur_timewait_info->work.remote_id)) link = &(*link)->rb_right; else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_left; else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_right; else return cur_timewait_info; } timewait_info->inserted_remote_id = 1; rb_link_node(&timewait_info->remote_id_node, parent, link); rb_insert_color(&timewait_info->remote_id_node, &cm.remote_id_table); return NULL; } static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid, __be32 remote_id) { struct rb_node *node = cm.remote_id_table.rb_node; struct cm_timewait_info *timewait_info; while (node) { timewait_info = rb_entry(node, struct cm_timewait_info, remote_id_node); if (be32_lt(remote_id, timewait_info->work.remote_id)) node = node->rb_left; else if (be32_gt(remote_id, timewait_info->work.remote_id)) node = node->rb_right; else if (be64_lt(remote_ca_guid, timewait_info->remote_ca_guid)) node = node->rb_left; else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid)) node = node->rb_right; else return timewait_info; } return NULL; } static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info *timewait_info) { struct rb_node **link = &cm.remote_qp_table.rb_node; struct rb_node *parent = NULL; struct cm_timewait_info *cur_timewait_info; __be64 remote_ca_guid = timewait_info->remote_ca_guid; __be32 remote_qpn = timewait_info->remote_qpn; while (*link) { parent = *link; cur_timewait_info = rb_entry(parent, struct cm_timewait_info, remote_qp_node); if (be32_lt(remote_qpn, cur_timewait_info->remote_qpn)) link = &(*link)->rb_left; else if (be32_gt(remote_qpn, cur_timewait_info->remote_qpn)) link = &(*link)->rb_right; else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_left; else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) link = &(*link)->rb_right; else return cur_timewait_info; } timewait_info->inserted_remote_qp = 1; rb_link_node(&timewait_info->remote_qp_node, parent, link); rb_insert_color(&timewait_info->remote_qp_node, &cm.remote_qp_table); return NULL; } static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private *cm_id_priv) { struct rb_node **link = &cm.remote_sidr_table.rb_node; struct rb_node *parent = NULL; struct cm_id_private *cur_cm_id_priv; union ib_gid *port_gid = &cm_id_priv->av.dgid; __be32 remote_id = cm_id_priv->id.remote_id; while (*link) { parent = *link; cur_cm_id_priv = rb_entry(parent, struct cm_id_private, sidr_id_node); if (be32_lt(remote_id, cur_cm_id_priv->id.remote_id)) link = &(*link)->rb_left; else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id)) link = &(*link)->rb_right; else { int cmp; cmp = memcmp(port_gid, &cur_cm_id_priv->av.dgid, sizeof *port_gid); if (cmp < 0) link = &(*link)->rb_left; else if (cmp > 0) link = &(*link)->rb_right; else return cur_cm_id_priv; } } rb_link_node(&cm_id_priv->sidr_id_node, parent, link); rb_insert_color(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); return NULL; } static void cm_reject_sidr_req(struct cm_id_private *cm_id_priv, enum ib_cm_sidr_status status) { struct ib_cm_sidr_rep_param param; memset(¶m, 0, sizeof param); param.status = status; ib_send_cm_sidr_rep(&cm_id_priv->id, ¶m); } struct ib_cm_id *ib_create_cm_id(struct ib_device *device, ib_cm_handler cm_handler, void *context) { struct cm_id_private *cm_id_priv; int ret; cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL); if (!cm_id_priv) return ERR_PTR(-ENOMEM); cm_id_priv->id.state = IB_CM_IDLE; cm_id_priv->id.device = device; cm_id_priv->id.cm_handler = cm_handler; cm_id_priv->id.context = context; cm_id_priv->id.remote_cm_qpn = 1; ret = cm_alloc_id(cm_id_priv); if (ret) goto error; spin_lock_init(&cm_id_priv->lock); init_completion(&cm_id_priv->comp); INIT_LIST_HEAD(&cm_id_priv->work_list); INIT_LIST_HEAD(&cm_id_priv->prim_list); INIT_LIST_HEAD(&cm_id_priv->altr_list); atomic_set(&cm_id_priv->work_count, -1); atomic_set(&cm_id_priv->refcount, 1); return &cm_id_priv->id; error: kfree(cm_id_priv); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(ib_create_cm_id); static struct cm_work * cm_dequeue_work(struct cm_id_private *cm_id_priv) { struct cm_work *work; if (list_empty(&cm_id_priv->work_list)) return NULL; work = list_entry(cm_id_priv->work_list.next, struct cm_work, list); list_del(&work->list); return work; } static void cm_free_work(struct cm_work *work) { if (work->mad_recv_wc) ib_free_recv_mad(work->mad_recv_wc); kfree(work); } static inline int cm_convert_to_ms(int iba_time) { /* approximate conversion to ms from 4.096us x 2^iba_time */ return 1 << max(iba_time - 8, 0); } /* * calculate: 4.096x2^ack_timeout = 4.096x2^ack_delay + 2x4.096x2^life_time * Because of how ack_timeout is stored, adding one doubles the timeout. * To avoid large timeouts, select the max(ack_delay, life_time + 1), and * increment it (round up) only if the other is within 50%. */ static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time) { int ack_timeout = packet_life_time + 1; if (ack_timeout >= ca_ack_delay) ack_timeout += (ca_ack_delay >= (ack_timeout - 1)); else ack_timeout = ca_ack_delay + (ack_timeout >= (ca_ack_delay - 1)); return min(31, ack_timeout); } static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info) { if (timewait_info->inserted_remote_id) { rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table); timewait_info->inserted_remote_id = 0; } if (timewait_info->inserted_remote_qp) { rb_erase(&timewait_info->remote_qp_node, &cm.remote_qp_table); timewait_info->inserted_remote_qp = 0; } } static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id) { struct cm_timewait_info *timewait_info; timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL); if (!timewait_info) return ERR_PTR(-ENOMEM); timewait_info->work.local_id = local_id; INIT_DELAYED_WORK(&timewait_info->work.work, cm_work_handler); timewait_info->work.cm_event.event = IB_CM_TIMEWAIT_EXIT; return timewait_info; } static void cm_enter_timewait(struct cm_id_private *cm_id_priv) { int wait_time; unsigned long flags; struct cm_device *cm_dev; cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client); if (!cm_dev) return; spin_lock_irqsave(&cm.lock, flags); cm_cleanup_timewait(cm_id_priv->timewait_info); list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list); spin_unlock_irqrestore(&cm.lock, flags); /* * The cm_id could be destroyed by the user before we exit timewait. * To protect against this, we search for the cm_id after exiting * timewait before notifying the user that we've exited timewait. */ cm_id_priv->id.state = IB_CM_TIMEWAIT; wait_time = cm_convert_to_ms(cm_id_priv->av.timeout); /* Check if the device started its remove_one */ spin_lock_irqsave(&cm.lock, flags); if (!cm_dev->going_down) queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, msecs_to_jiffies(wait_time)); spin_unlock_irqrestore(&cm.lock, flags); cm_id_priv->timewait_info = NULL; } static void cm_reset_to_idle(struct cm_id_private *cm_id_priv) { unsigned long flags; cm_id_priv->id.state = IB_CM_IDLE; if (cm_id_priv->timewait_info) { spin_lock_irqsave(&cm.lock, flags); cm_cleanup_timewait(cm_id_priv->timewait_info); spin_unlock_irqrestore(&cm.lock, flags); kfree(cm_id_priv->timewait_info); cm_id_priv->timewait_info = NULL; } } static void cm_destroy_id(struct ib_cm_id *cm_id, int err) { struct cm_id_private *cm_id_priv; struct cm_work *work; cm_id_priv = container_of(cm_id, struct cm_id_private, id); retest: spin_lock_irq(&cm_id_priv->lock); switch (cm_id->state) { case IB_CM_LISTEN: spin_unlock_irq(&cm_id_priv->lock); spin_lock_irq(&cm.lock); if (--cm_id_priv->listen_sharecount > 0) { /* The id is still shared. */ cm_deref_id(cm_id_priv); spin_unlock_irq(&cm.lock); return; } rb_erase(&cm_id_priv->service_node, &cm.listen_service_table); spin_unlock_irq(&cm.lock); break; case IB_CM_SIDR_REQ_SENT: cm_id->state = IB_CM_IDLE; ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); spin_unlock_irq(&cm_id_priv->lock); break; case IB_CM_SIDR_REQ_RCVD: spin_unlock_irq(&cm_id_priv->lock); cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT); spin_lock_irq(&cm.lock); if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); spin_unlock_irq(&cm.lock); break; case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); spin_unlock_irq(&cm_id_priv->lock); ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT, &cm_id_priv->id.device->node_guid, sizeof cm_id_priv->id.device->node_guid, NULL, 0); break; case IB_CM_REQ_RCVD: if (err == -ENOMEM) { /* Do not reject to allow future retries. */ cm_reset_to_idle(cm_id_priv); spin_unlock_irq(&cm_id_priv->lock); } else { spin_unlock_irq(&cm_id_priv->lock); ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); } break; case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); /* Fall through */ case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: spin_unlock_irq(&cm_id_priv->lock); ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); break; case IB_CM_ESTABLISHED: spin_unlock_irq(&cm_id_priv->lock); if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) break; ib_send_cm_dreq(cm_id, NULL, 0); goto retest; case IB_CM_DREQ_SENT: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); cm_enter_timewait(cm_id_priv); spin_unlock_irq(&cm_id_priv->lock); break; case IB_CM_DREQ_RCVD: spin_unlock_irq(&cm_id_priv->lock); ib_send_cm_drep(cm_id, NULL, 0); break; default: spin_unlock_irq(&cm_id_priv->lock); break; } spin_lock_irq(&cm.lock); if (!list_empty(&cm_id_priv->altr_list) && (!cm_id_priv->altr_send_port_not_ready)) list_del(&cm_id_priv->altr_list); if (!list_empty(&cm_id_priv->prim_list) && (!cm_id_priv->prim_send_port_not_ready)) list_del(&cm_id_priv->prim_list); spin_unlock_irq(&cm.lock); cm_free_id(cm_id->local_id); cm_deref_id(cm_id_priv); wait_for_completion(&cm_id_priv->comp); while ((work = cm_dequeue_work(cm_id_priv)) != NULL) cm_free_work(work); kfree(cm_id_priv->private_data); kfree(cm_id_priv); } void ib_destroy_cm_id(struct ib_cm_id *cm_id) { cm_destroy_id(cm_id, 0); } EXPORT_SYMBOL(ib_destroy_cm_id); /** * __ib_cm_listen - Initiates listening on the specified service ID for * connection and service ID resolution requests. * @cm_id: Connection identifier associated with the listen request. * @service_id: Service identifier matched against incoming connection * and service ID resolution requests. The service ID should be specified * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will * assign a service ID to the caller. * @service_mask: Mask applied to service ID used to listen across a * range of service IDs. If set to 0, the service ID is matched * exactly. This parameter is ignored if %service_id is set to * IB_CM_ASSIGN_SERVICE_ID. */ static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask) { struct cm_id_private *cm_id_priv, *cur_cm_id_priv; int ret = 0; service_mask = service_mask ? service_mask : ~cpu_to_be64(0); service_id &= service_mask; if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID && (service_id != IB_CM_ASSIGN_SERVICE_ID)) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); if (cm_id->state != IB_CM_IDLE) return -EINVAL; cm_id->state = IB_CM_LISTEN; ++cm_id_priv->listen_sharecount; if (service_id == IB_CM_ASSIGN_SERVICE_ID) { cm_id->service_id = cpu_to_be64(cm.listen_service_id++); cm_id->service_mask = ~cpu_to_be64(0); } else { cm_id->service_id = service_id; cm_id->service_mask = service_mask; } cur_cm_id_priv = cm_insert_listen(cm_id_priv); if (cur_cm_id_priv) { cm_id->state = IB_CM_IDLE; --cm_id_priv->listen_sharecount; ret = -EBUSY; } return ret; } int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm.lock, flags); ret = __ib_cm_listen(cm_id, service_id, service_mask); spin_unlock_irqrestore(&cm.lock, flags); return ret; } EXPORT_SYMBOL(ib_cm_listen); /** * Create a new listening ib_cm_id and listen on the given service ID. * * If there's an existing ID listening on that same device and service ID, * return it. * * @device: Device associated with the cm_id. All related communication will * be associated with the specified device. * @cm_handler: Callback invoked to notify the user of CM events. * @service_id: Service identifier matched against incoming connection * and service ID resolution requests. The service ID should be specified * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will * assign a service ID to the caller. * * Callers should call ib_destroy_cm_id when done with the listener ID. */ struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device, ib_cm_handler cm_handler, __be64 service_id) { struct cm_id_private *cm_id_priv; struct ib_cm_id *cm_id; unsigned long flags; int err = 0; /* Create an ID in advance, since the creation may sleep */ cm_id = ib_create_cm_id(device, cm_handler, NULL); if (IS_ERR(cm_id)) return cm_id; spin_lock_irqsave(&cm.lock, flags); if (service_id == IB_CM_ASSIGN_SERVICE_ID) goto new_id; /* Find an existing ID */ cm_id_priv = cm_find_listen(device, service_id); if (cm_id_priv) { if (cm_id->cm_handler != cm_handler || cm_id->context) { /* Sharing an ib_cm_id with different handlers is not * supported */ spin_unlock_irqrestore(&cm.lock, flags); return ERR_PTR(-EINVAL); } atomic_inc(&cm_id_priv->refcount); ++cm_id_priv->listen_sharecount; spin_unlock_irqrestore(&cm.lock, flags); ib_destroy_cm_id(cm_id); cm_id = &cm_id_priv->id; return cm_id; } new_id: /* Use newly created ID */ err = __ib_cm_listen(cm_id, service_id, 0); spin_unlock_irqrestore(&cm.lock, flags); if (err) { ib_destroy_cm_id(cm_id); return ERR_PTR(err); } return cm_id; } EXPORT_SYMBOL(ib_cm_insert_listen); static __be64 cm_form_tid(struct cm_id_private *cm_id_priv, enum cm_msg_sequence msg_seq) { u64 hi_tid, low_tid; hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32; low_tid = (u64) ((__force u32)cm_id_priv->id.local_id | (msg_seq << 30)); return cpu_to_be64(hi_tid | low_tid); } static void cm_format_mad_hdr(struct ib_mad_hdr *hdr, __be16 attr_id, __be64 tid) { hdr->base_version = IB_MGMT_BASE_VERSION; hdr->mgmt_class = IB_MGMT_CLASS_CM; hdr->class_version = IB_CM_CLASS_VERSION; hdr->method = IB_MGMT_METHOD_SEND; hdr->attr_id = attr_id; hdr->tid = tid; } static void cm_format_req(struct cm_req_msg *req_msg, struct cm_id_private *cm_id_priv, struct ib_cm_req_param *param) { struct ib_sa_path_rec *pri_path = param->primary_path; struct ib_sa_path_rec *alt_path = param->alternate_path; cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ)); req_msg->local_comm_id = cm_id_priv->id.local_id; req_msg->service_id = param->service_id; req_msg->local_ca_guid = cm_id_priv->id.device->node_guid; cm_req_set_local_qpn(req_msg, cpu_to_be32(param->qp_num)); cm_req_set_init_depth(req_msg, param->initiator_depth); cm_req_set_remote_resp_timeout(req_msg, param->remote_cm_response_timeout); cm_req_set_qp_type(req_msg, param->qp_type); cm_req_set_flow_ctrl(req_msg, param->flow_control); cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn)); cm_req_set_local_resp_timeout(req_msg, param->local_cm_response_timeout); req_msg->pkey = param->primary_path->pkey; cm_req_set_path_mtu(req_msg, param->primary_path->mtu); cm_req_set_max_cm_retries(req_msg, param->max_cm_retries); if (param->qp_type != IB_QPT_XRC_INI) { cm_req_set_resp_res(req_msg, param->responder_resources); cm_req_set_retry_count(req_msg, param->retry_count); cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count); cm_req_set_srq(req_msg, param->srq); } if (pri_path->hop_limit <= 1) { req_msg->primary_local_lid = pri_path->slid; req_msg->primary_remote_lid = pri_path->dlid; } else { /* Work-around until there's a way to obtain remote LID info */ req_msg->primary_local_lid = IB_LID_PERMISSIVE; req_msg->primary_remote_lid = IB_LID_PERMISSIVE; } req_msg->primary_local_gid = pri_path->sgid; req_msg->primary_remote_gid = pri_path->dgid; cm_req_set_primary_flow_label(req_msg, pri_path->flow_label); cm_req_set_primary_packet_rate(req_msg, pri_path->rate); req_msg->primary_traffic_class = pri_path->traffic_class; req_msg->primary_hop_limit = pri_path->hop_limit; cm_req_set_primary_sl(req_msg, pri_path->sl); cm_req_set_primary_subnet_local(req_msg, (pri_path->hop_limit <= 1)); cm_req_set_primary_local_ack_timeout(req_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, pri_path->packet_life_time)); if (alt_path) { if (alt_path->hop_limit <= 1) { req_msg->alt_local_lid = alt_path->slid; req_msg->alt_remote_lid = alt_path->dlid; } else { req_msg->alt_local_lid = IB_LID_PERMISSIVE; req_msg->alt_remote_lid = IB_LID_PERMISSIVE; } req_msg->alt_local_gid = alt_path->sgid; req_msg->alt_remote_gid = alt_path->dgid; cm_req_set_alt_flow_label(req_msg, alt_path->flow_label); cm_req_set_alt_packet_rate(req_msg, alt_path->rate); req_msg->alt_traffic_class = alt_path->traffic_class; req_msg->alt_hop_limit = alt_path->hop_limit; cm_req_set_alt_sl(req_msg, alt_path->sl); cm_req_set_alt_subnet_local(req_msg, (alt_path->hop_limit <= 1)); cm_req_set_alt_local_ack_timeout(req_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, alt_path->packet_life_time)); } if (param->private_data && param->private_data_len) memcpy(req_msg->private_data, param->private_data, param->private_data_len); } static int cm_validate_req_param(struct ib_cm_req_param *param) { /* peer-to-peer not supported */ if (param->peer_to_peer) return -EINVAL; if (!param->primary_path) return -EINVAL; if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC && param->qp_type != IB_QPT_XRC_INI) return -EINVAL; if (param->private_data && param->private_data_len > IB_CM_REQ_PRIVATE_DATA_SIZE) return -EINVAL; if (param->alternate_path && (param->alternate_path->pkey != param->primary_path->pkey || param->alternate_path->mtu != param->primary_path->mtu)) return -EINVAL; return 0; } int ib_send_cm_req(struct ib_cm_id *cm_id, struct ib_cm_req_param *param) { struct cm_id_private *cm_id_priv; struct cm_req_msg *req_msg; unsigned long flags; int ret; ret = cm_validate_req_param(param); if (ret) return ret; /* Verify that we're not in timewait. */ cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_IDLE) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = -EINVAL; goto out; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> id.local_id); if (IS_ERR(cm_id_priv->timewait_info)) { ret = PTR_ERR(cm_id_priv->timewait_info); goto out; } ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av, cm_id_priv); if (ret) goto error1; if (param->alternate_path) { ret = cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av, cm_id_priv); if (ret) goto error1; } cm_id->service_id = param->service_id; cm_id->service_mask = ~cpu_to_be64(0); cm_id_priv->timeout_ms = cm_convert_to_ms( param->primary_path->packet_life_time) * 2 + cm_convert_to_ms( param->remote_cm_response_timeout); cm_id_priv->max_cm_retries = param->max_cm_retries; cm_id_priv->initiator_depth = param->initiator_depth; cm_id_priv->responder_resources = param->responder_resources; cm_id_priv->retry_count = param->retry_count; cm_id_priv->path_mtu = param->primary_path->mtu; cm_id_priv->pkey = param->primary_path->pkey; cm_id_priv->qp_type = param->qp_type; ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg); if (ret) goto error1; req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad; cm_format_req(req_msg, cm_id_priv, param); cm_id_priv->tid = req_msg->hdr.tid; cm_id_priv->msg->timeout_ms = cm_id_priv->timeout_ms; cm_id_priv->msg->context[1] = (void *) (unsigned long) IB_CM_REQ_SENT; cm_id_priv->local_qpn = cm_req_get_local_qpn(req_msg); cm_id_priv->rq_psn = cm_req_get_starting_psn(req_msg); spin_lock_irqsave(&cm_id_priv->lock, flags); ret = ib_post_send_mad(cm_id_priv->msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); goto error2; } BUG_ON(cm_id->state != IB_CM_IDLE); cm_id->state = IB_CM_REQ_SENT; spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; error2: cm_free_msg(cm_id_priv->msg); error1: kfree(cm_id_priv->timewait_info); out: return ret; } EXPORT_SYMBOL(ib_send_cm_req); static int cm_issue_rej(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc, enum ib_cm_rej_reason reason, enum cm_msg_response msg_rejected, void *ari, u8 ari_length) { struct ib_mad_send_buf *msg = NULL; struct cm_rej_msg *rej_msg, *rcv_msg; int ret; ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); if (ret) return ret; /* We just need common CM header information. Cast to any message. */ rcv_msg = (struct cm_rej_msg *) mad_recv_wc->recv_buf.mad; rej_msg = (struct cm_rej_msg *) msg->mad; cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, rcv_msg->hdr.tid); rej_msg->remote_comm_id = rcv_msg->local_comm_id; rej_msg->local_comm_id = rcv_msg->remote_comm_id; cm_rej_set_msg_rejected(rej_msg, msg_rejected); rej_msg->reason = cpu_to_be16(reason); if (ari && ari_length) { cm_rej_set_reject_info_len(rej_msg, ari_length); memcpy(rej_msg->ari, ari, ari_length); } ret = ib_post_send_mad(msg, NULL); if (ret) cm_free_msg(msg); return ret; } static void cm_format_paths_from_req(struct cm_req_msg *req_msg, struct ib_sa_path_rec *primary_path, struct ib_sa_path_rec *alt_path) { memset(primary_path, 0, sizeof *primary_path); primary_path->dgid = req_msg->primary_local_gid; primary_path->sgid = req_msg->primary_remote_gid; primary_path->dlid = req_msg->primary_local_lid; primary_path->slid = req_msg->primary_remote_lid; primary_path->flow_label = cm_req_get_primary_flow_label(req_msg); primary_path->hop_limit = req_msg->primary_hop_limit; primary_path->traffic_class = req_msg->primary_traffic_class; primary_path->reversible = 1; primary_path->pkey = req_msg->pkey; primary_path->sl = cm_req_get_primary_sl(req_msg); primary_path->mtu_selector = IB_SA_EQ; primary_path->mtu = cm_req_get_path_mtu(req_msg); primary_path->rate_selector = IB_SA_EQ; primary_path->rate = cm_req_get_primary_packet_rate(req_msg); primary_path->packet_life_time_selector = IB_SA_EQ; primary_path->packet_life_time = cm_req_get_primary_local_ack_timeout(req_msg); primary_path->packet_life_time -= (primary_path->packet_life_time > 0); primary_path->service_id = req_msg->service_id; if (req_msg->alt_local_lid) { memset(alt_path, 0, sizeof *alt_path); alt_path->dgid = req_msg->alt_local_gid; alt_path->sgid = req_msg->alt_remote_gid; alt_path->dlid = req_msg->alt_local_lid; alt_path->slid = req_msg->alt_remote_lid; alt_path->flow_label = cm_req_get_alt_flow_label(req_msg); alt_path->hop_limit = req_msg->alt_hop_limit; alt_path->traffic_class = req_msg->alt_traffic_class; alt_path->reversible = 1; alt_path->pkey = req_msg->pkey; alt_path->sl = cm_req_get_alt_sl(req_msg); alt_path->mtu_selector = IB_SA_EQ; alt_path->mtu = cm_req_get_path_mtu(req_msg); alt_path->rate_selector = IB_SA_EQ; alt_path->rate = cm_req_get_alt_packet_rate(req_msg); alt_path->packet_life_time_selector = IB_SA_EQ; alt_path->packet_life_time = cm_req_get_alt_local_ack_timeout(req_msg); alt_path->packet_life_time -= (alt_path->packet_life_time > 0); alt_path->service_id = req_msg->service_id; } } static u16 cm_get_bth_pkey(struct cm_work *work) { struct ib_device *ib_dev = work->port->cm_dev->ib_device; u8 port_num = work->port->port_num; u16 pkey_index = work->mad_recv_wc->wc->pkey_index; u16 pkey; int ret; ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey); if (ret) { dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n", port_num, pkey_index, ret); return 0; } return pkey; } static void cm_format_req_event(struct cm_work *work, struct cm_id_private *cm_id_priv, struct ib_cm_id *listen_id) { struct cm_req_msg *req_msg; struct ib_cm_req_event_param *param; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.req_rcvd; param->listen_id = listen_id; param->bth_pkey = cm_get_bth_pkey(work); param->port = cm_id_priv->av.port->port_num; param->primary_path = &work->path[0]; if (req_msg->alt_local_lid) param->alternate_path = &work->path[1]; else param->alternate_path = NULL; param->remote_ca_guid = req_msg->local_ca_guid; param->remote_qkey = be32_to_cpu(req_msg->local_qkey); param->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req_msg)); param->qp_type = cm_req_get_qp_type(req_msg); param->starting_psn = be32_to_cpu(cm_req_get_starting_psn(req_msg)); param->responder_resources = cm_req_get_init_depth(req_msg); param->initiator_depth = cm_req_get_resp_res(req_msg); param->local_cm_response_timeout = cm_req_get_remote_resp_timeout(req_msg); param->flow_control = cm_req_get_flow_ctrl(req_msg); param->remote_cm_response_timeout = cm_req_get_local_resp_timeout(req_msg); param->retry_count = cm_req_get_retry_count(req_msg); param->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg); param->srq = cm_req_get_srq(req_msg); param->ppath_sgid_index = cm_id_priv->av.ah_attr.grh.sgid_index; work->cm_event.private_data = &req_msg->private_data; } static void cm_process_work(struct cm_id_private *cm_id_priv, struct cm_work *work) { int ret; /* We will typically only have the current event to report. */ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event); cm_free_work(work); while (!ret && !atomic_add_negative(-1, &cm_id_priv->work_count)) { spin_lock_irq(&cm_id_priv->lock); work = cm_dequeue_work(cm_id_priv); spin_unlock_irq(&cm_id_priv->lock); BUG_ON(!work); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event); cm_free_work(work); } cm_deref_id(cm_id_priv); if (ret) cm_destroy_id(&cm_id_priv->id, ret); } static void cm_format_mra(struct cm_mra_msg *mra_msg, struct cm_id_private *cm_id_priv, enum cm_msg_response msg_mraed, u8 service_timeout, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid); cm_mra_set_msg_mraed(mra_msg, msg_mraed); mra_msg->local_comm_id = cm_id_priv->id.local_id; mra_msg->remote_comm_id = cm_id_priv->id.remote_id; cm_mra_set_service_timeout(mra_msg, service_timeout); if (private_data && private_data_len) memcpy(mra_msg->private_data, private_data, private_data_len); } static void cm_format_rej(struct cm_rej_msg *rej_msg, struct cm_id_private *cm_id_priv, enum ib_cm_rej_reason reason, void *ari, u8 ari_length, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid); rej_msg->remote_comm_id = cm_id_priv->id.remote_id; switch(cm_id_priv->id.state) { case IB_CM_REQ_RCVD: rej_msg->local_comm_id = 0; cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ); break; case IB_CM_MRA_REQ_SENT: rej_msg->local_comm_id = cm_id_priv->id.local_id; cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ); break; case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: rej_msg->local_comm_id = cm_id_priv->id.local_id; cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REP); break; default: rej_msg->local_comm_id = cm_id_priv->id.local_id; cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_OTHER); break; } rej_msg->reason = cpu_to_be16(reason); if (ari && ari_length) { cm_rej_set_reject_info_len(rej_msg, ari_length); memcpy(rej_msg->ari, ari, ari_length); } if (private_data && private_data_len) memcpy(rej_msg->private_data, private_data, private_data_len); } static void cm_dup_req_handler(struct cm_work *work, struct cm_id_private *cm_id_priv) { struct ib_mad_send_buf *msg = NULL; int ret; atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_REQ_COUNTER]); /* Quick state check to discard duplicate REQs. */ if (cm_id_priv->id.state == IB_CM_REQ_RCVD) return; ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); if (ret) return; spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { case IB_CM_MRA_REQ_SENT: cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout, cm_id_priv->private_data, cm_id_priv->private_data_len); break; case IB_CM_TIMEWAIT: cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv, IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0); break; default: goto unlock; } spin_unlock_irq(&cm_id_priv->lock); ret = ib_post_send_mad(msg, NULL); if (ret) goto free; return; unlock: spin_unlock_irq(&cm_id_priv->lock); free: cm_free_msg(msg); } static struct cm_id_private * cm_match_req(struct cm_work *work, struct cm_id_private *cm_id_priv) { struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv; struct cm_timewait_info *timewait_info; struct cm_req_msg *req_msg; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; /* Check for possible duplicate REQ. */ spin_lock_irq(&cm.lock); timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info); if (timewait_info) { cur_cm_id_priv = cm_get_id(timewait_info->work.local_id, timewait_info->work.remote_id); spin_unlock_irq(&cm.lock); if (cur_cm_id_priv) { cm_dup_req_handler(work, cur_cm_id_priv); cm_deref_id(cur_cm_id_priv); } return NULL; } /* Check for stale connections. */ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); if (timewait_info) { cm_cleanup_timewait(cm_id_priv->timewait_info); spin_unlock_irq(&cm.lock); cm_issue_rej(work->port, work->mad_recv_wc, IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ, NULL, 0); return NULL; } /* Find matching listen request. */ listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device, req_msg->service_id); if (!listen_cm_id_priv) { cm_cleanup_timewait(cm_id_priv->timewait_info); spin_unlock_irq(&cm.lock); cm_issue_rej(work->port, work->mad_recv_wc, IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ, NULL, 0); goto out; } atomic_inc(&listen_cm_id_priv->refcount); atomic_inc(&cm_id_priv->refcount); cm_id_priv->id.state = IB_CM_REQ_RCVD; atomic_inc(&cm_id_priv->work_count); spin_unlock_irq(&cm.lock); out: return listen_cm_id_priv; } /* * Work-around for inter-subnet connections. If the LIDs are permissive, * we need to override the LID/SL data in the REQ with the LID information * in the work completion. */ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) { if (!cm_req_get_primary_subnet_local(req_msg)) { if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) { req_msg->primary_local_lid = cpu_to_be16(wc->slid); cm_req_set_primary_sl(req_msg, wc->sl); } if (req_msg->primary_remote_lid == IB_LID_PERMISSIVE) req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits); } if (!cm_req_get_alt_subnet_local(req_msg)) { if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) { req_msg->alt_local_lid = cpu_to_be16(wc->slid); cm_req_set_alt_sl(req_msg, wc->sl); } if (req_msg->alt_remote_lid == IB_LID_PERMISSIVE) req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits); } } static int cm_req_handler(struct cm_work *work) { struct ib_cm_id *cm_id; struct cm_id_private *cm_id_priv, *listen_cm_id_priv; struct cm_req_msg *req_msg; union ib_gid gid; struct ib_gid_attr gid_attr; int ret; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); cm_id_priv = container_of(cm_id, struct cm_id_private, id); cm_id_priv->id.remote_id = req_msg->local_comm_id; ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); if (ret) goto destroy; cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> id.local_id); if (IS_ERR(cm_id_priv->timewait_info)) { ret = PTR_ERR(cm_id_priv->timewait_info); goto destroy; } cm_id_priv->timewait_info->work.remote_id = req_msg->local_comm_id; cm_id_priv->timewait_info->remote_ca_guid = req_msg->local_ca_guid; cm_id_priv->timewait_info->remote_qpn = cm_req_get_local_qpn(req_msg); listen_cm_id_priv = cm_match_req(work, cm_id_priv); if (!listen_cm_id_priv) { ret = -EINVAL; kfree(cm_id_priv->timewait_info); goto destroy; } cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; cm_id_priv->id.context = listen_cm_id_priv->id.context; cm_id_priv->id.service_id = req_msg->service_id; cm_id_priv->id.service_mask = ~cpu_to_be64(0); cm_process_routed_req(req_msg, work->mad_recv_wc->wc); cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN); work->path[0].hop_limit = cm_id_priv->av.ah_attr.grh.hop_limit; ret = ib_get_cached_gid(work->port->cm_dev->ib_device, work->port->port_num, cm_id_priv->av.ah_attr.grh.sgid_index, &gid, &gid_attr); if (!ret) { if (gid_attr.ndev) { work->path[0].ifindex = gid_attr.ndev->if_index; work->path[0].net = dev_net(gid_attr.ndev); dev_put(gid_attr.ndev); } work->path[0].gid_type = gid_attr.gid_type; ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av, cm_id_priv); } if (ret) { int err = ib_get_cached_gid(work->port->cm_dev->ib_device, work->port->port_num, 0, &work->path[0].sgid, &gid_attr); if (!err && gid_attr.ndev) { work->path[0].ifindex = gid_attr.ndev->if_index; work->path[0].net = dev_net(gid_attr.ndev); dev_put(gid_attr.ndev); } work->path[0].gid_type = gid_attr.gid_type; ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); goto rejected; } if (req_msg->alt_local_lid) { ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av, cm_id_priv); if (ret) { ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); goto rejected; } } cm_id_priv->tid = req_msg->hdr.tid; cm_id_priv->timeout_ms = cm_convert_to_ms( cm_req_get_local_resp_timeout(req_msg)); cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg); cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg); cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg); cm_id_priv->responder_resources = cm_req_get_init_depth(req_msg); cm_id_priv->path_mtu = cm_req_get_path_mtu(req_msg); cm_id_priv->pkey = req_msg->pkey; cm_id_priv->sq_psn = cm_req_get_starting_psn(req_msg); cm_id_priv->retry_count = cm_req_get_retry_count(req_msg); cm_id_priv->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg); cm_id_priv->qp_type = cm_req_get_qp_type(req_msg); cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id); cm_process_work(cm_id_priv, work); cm_deref_id(listen_cm_id_priv); return 0; rejected: atomic_dec(&cm_id_priv->refcount); cm_deref_id(listen_cm_id_priv); destroy: ib_destroy_cm_id(cm_id); return ret; } static void cm_format_rep(struct cm_rep_msg *rep_msg, struct cm_id_private *cm_id_priv, struct ib_cm_rep_param *param) { cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid); rep_msg->local_comm_id = cm_id_priv->id.local_id; rep_msg->remote_comm_id = cm_id_priv->id.remote_id; cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn)); rep_msg->resp_resources = param->responder_resources; cm_rep_set_target_ack_delay(rep_msg, cm_id_priv->av.port->cm_dev->ack_delay); cm_rep_set_failover(rep_msg, param->failover_accepted); cm_rep_set_rnr_retry_count(rep_msg, param->rnr_retry_count); rep_msg->local_ca_guid = cm_id_priv->id.device->node_guid; if (cm_id_priv->qp_type != IB_QPT_XRC_TGT) { rep_msg->initiator_depth = param->initiator_depth; cm_rep_set_flow_ctrl(rep_msg, param->flow_control); cm_rep_set_srq(rep_msg, param->srq); cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num)); } else { cm_rep_set_srq(rep_msg, 1); cm_rep_set_local_eecn(rep_msg, cpu_to_be32(param->qp_num)); } if (param->private_data && param->private_data_len) memcpy(rep_msg->private_data, param->private_data, param->private_data_len); } int ib_send_cm_rep(struct ib_cm_id *cm_id, struct ib_cm_rep_param *param) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; struct cm_rep_msg *rep_msg; unsigned long flags; int ret; if (param->private_data && param->private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REQ_RCVD && cm_id->state != IB_CM_MRA_REQ_SENT) { ret = -EINVAL; goto out; } ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto out; rep_msg = (struct cm_rep_msg *) msg->mad; cm_format_rep(rep_msg, cm_id_priv, param); msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT; ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); return ret; } cm_id->state = IB_CM_REP_SENT; cm_id_priv->msg = msg; cm_id_priv->initiator_depth = param->initiator_depth; cm_id_priv->responder_resources = param->responder_resources; cm_id_priv->rq_psn = cm_rep_get_starting_psn(rep_msg); cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_rep); static void cm_format_rtu(struct cm_rtu_msg *rtu_msg, struct cm_id_private *cm_id_priv, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid); rtu_msg->local_comm_id = cm_id_priv->id.local_id; rtu_msg->remote_comm_id = cm_id_priv->id.remote_id; if (private_data && private_data_len) memcpy(rtu_msg->private_data, private_data, private_data_len); } int ib_send_cm_rtu(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; void *data; int ret; if (private_data && private_data_len > IB_CM_RTU_PRIVATE_DATA_SIZE) return -EINVAL; data = cm_copy_private_data(private_data, private_data_len); if (IS_ERR(data)) return PTR_ERR(data); cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REP_RCVD && cm_id->state != IB_CM_MRA_REP_SENT) { ret = -EINVAL; goto error; } ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto error; cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, private_data, private_data_len); ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); kfree(data); return ret; } cm_id->state = IB_CM_ESTABLISHED; cm_set_private_data(cm_id_priv, data, private_data_len); spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; error: spin_unlock_irqrestore(&cm_id_priv->lock, flags); kfree(data); return ret; } EXPORT_SYMBOL(ib_send_cm_rtu); static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type) { struct cm_rep_msg *rep_msg; struct ib_cm_rep_event_param *param; rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.rep_rcvd; param->remote_ca_guid = rep_msg->local_ca_guid; param->remote_qkey = be32_to_cpu(rep_msg->local_qkey); param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type)); param->starting_psn = be32_to_cpu(cm_rep_get_starting_psn(rep_msg)); param->responder_resources = rep_msg->initiator_depth; param->initiator_depth = rep_msg->resp_resources; param->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg); param->failover_accepted = cm_rep_get_failover(rep_msg); param->flow_control = cm_rep_get_flow_ctrl(rep_msg); param->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg); param->srq = cm_rep_get_srq(rep_msg); work->cm_event.private_data = &rep_msg->private_data; } static void cm_dup_rep_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rep_msg *rep_msg; struct ib_mad_send_buf *msg = NULL; int ret; rep_msg = (struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, rep_msg->local_comm_id); if (!cm_id_priv) return; atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_REP_COUNTER]); ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); if (ret) goto deref; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state == IB_CM_ESTABLISHED) cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, cm_id_priv->private_data, cm_id_priv->private_data_len); else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT) cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout, cm_id_priv->private_data, cm_id_priv->private_data_len); else goto unlock; spin_unlock_irq(&cm_id_priv->lock); ret = ib_post_send_mad(msg, NULL); if (ret) goto free; goto deref; unlock: spin_unlock_irq(&cm_id_priv->lock); free: cm_free_msg(msg); deref: cm_deref_id(cm_id_priv); } static int cm_rep_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rep_msg *rep_msg; int ret; rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0); if (!cm_id_priv) { cm_dup_rep_handler(work); return -EINVAL; } cm_format_rep_event(work, cm_id_priv->qp_type); spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: break; default: spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; goto error; } cm_id_priv->timewait_info->work.remote_id = rep_msg->local_comm_id; cm_id_priv->timewait_info->remote_ca_guid = rep_msg->local_ca_guid; cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); spin_lock(&cm.lock); /* Check for duplicate REP. */ if (cm_insert_remote_id(cm_id_priv->timewait_info)) { spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; goto error; } /* Check for a stale connection. */ if (cm_insert_remote_qpn(cm_id_priv->timewait_info)) { rb_erase(&cm_id_priv->timewait_info->remote_id_node, &cm.remote_id_table); cm_id_priv->timewait_info->inserted_remote_id = 0; spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); cm_issue_rej(work->port, work->mad_recv_wc, IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, NULL, 0); ret = -EINVAL; goto error; } spin_unlock(&cm.lock); cm_id_priv->id.state = IB_CM_REP_RCVD; cm_id_priv->id.remote_id = rep_msg->local_comm_id; cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); cm_id_priv->initiator_depth = rep_msg->resp_resources; cm_id_priv->responder_resources = rep_msg->initiator_depth; cm_id_priv->sq_psn = cm_rep_get_starting_psn(rep_msg); cm_id_priv->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg); cm_id_priv->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg); cm_id_priv->av.timeout = cm_ack_timeout(cm_id_priv->target_ack_delay, cm_id_priv->av.timeout - 1); cm_id_priv->alt_av.timeout = cm_ack_timeout(cm_id_priv->target_ack_delay, cm_id_priv->alt_av.timeout - 1); /* todo: handle peer_to_peer */ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; error: cm_deref_id(cm_id_priv); return ret; } static int cm_establish_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; int ret; /* See comment in cm_establish about lookup. */ cm_id_priv = cm_acquire_id(work->local_id, work->remote_id); if (!cm_id_priv) return -EINVAL; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { spin_unlock_irq(&cm_id_priv->lock); goto out; } ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static int cm_rtu_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rtu_msg *rtu_msg; int ret; rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(rtu_msg->remote_comm_id, rtu_msg->local_comm_id); if (!cm_id_priv) return -EINVAL; work->cm_event.private_data = &rtu_msg->private_data; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_REP_SENT && cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) { spin_unlock_irq(&cm_id_priv->lock); atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_RTU_COUNTER]); goto out; } cm_id_priv->id.state = IB_CM_ESTABLISHED; ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static void cm_format_dreq(struct cm_dreq_msg *dreq_msg, struct cm_id_private *cm_id_priv, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_DREQ)); dreq_msg->local_comm_id = cm_id_priv->id.local_id; dreq_msg->remote_comm_id = cm_id_priv->id.remote_id; cm_dreq_set_remote_qpn(dreq_msg, cm_id_priv->remote_qpn); if (private_data && private_data_len) memcpy(dreq_msg->private_data, private_data, private_data_len); } int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; int ret; if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_ESTABLISHED) { ret = -EINVAL; goto out; } if (cm_id->lap_state == IB_CM_LAP_SENT || cm_id->lap_state == IB_CM_MRA_LAP_RCVD) ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) { cm_enter_timewait(cm_id_priv); goto out; } cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, private_data, private_data_len); msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT; ret = ib_post_send_mad(msg, NULL); if (ret) { cm_enter_timewait(cm_id_priv); spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); return ret; } cm_id->state = IB_CM_DREQ_SENT; cm_id_priv->msg = msg; out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_dreq); static void cm_format_drep(struct cm_drep_msg *drep_msg, struct cm_id_private *cm_id_priv, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid); drep_msg->local_comm_id = cm_id_priv->id.local_id; drep_msg->remote_comm_id = cm_id_priv->id.remote_id; if (private_data && private_data_len) memcpy(drep_msg->private_data, private_data, private_data_len); } int ib_send_cm_drep(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; void *data; int ret; if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE) return -EINVAL; data = cm_copy_private_data(private_data, private_data_len); if (IS_ERR(data)) return PTR_ERR(data); cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_DREQ_RCVD) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); kfree(data); return -EINVAL; } cm_set_private_data(cm_id_priv, data, private_data_len); cm_enter_timewait(cm_id_priv); ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto out; cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, private_data, private_data_len); ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); return ret; } out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_drep); static int cm_issue_drep(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_mad_send_buf *msg = NULL; struct cm_dreq_msg *dreq_msg; struct cm_drep_msg *drep_msg; int ret; ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); if (ret) return ret; dreq_msg = (struct cm_dreq_msg *) mad_recv_wc->recv_buf.mad; drep_msg = (struct cm_drep_msg *) msg->mad; cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, dreq_msg->hdr.tid); drep_msg->remote_comm_id = dreq_msg->local_comm_id; drep_msg->local_comm_id = dreq_msg->remote_comm_id; ret = ib_post_send_mad(msg, NULL); if (ret) cm_free_msg(msg); return ret; } static int cm_dreq_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_dreq_msg *dreq_msg; struct ib_mad_send_buf *msg = NULL; int ret; dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id, dreq_msg->local_comm_id); if (!cm_id_priv) { atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_DREQ_COUNTER]); cm_issue_drep(work->port, work->mad_recv_wc); return -EINVAL; } work->cm_event.private_data = &dreq_msg->private_data; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->local_qpn != cm_dreq_get_remote_qpn(dreq_msg)) goto unlock; switch (cm_id_priv->id.state) { case IB_CM_REP_SENT: case IB_CM_DREQ_SENT: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); break; case IB_CM_ESTABLISHED: if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT || cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); break; case IB_CM_MRA_REP_RCVD: break; case IB_CM_TIMEWAIT: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_DREQ_COUNTER]); msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); if (IS_ERR(msg)) goto unlock; cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, cm_id_priv->private_data, cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || ib_post_send_mad(msg, NULL)) cm_free_msg(msg); goto deref; case IB_CM_DREQ_RCVD: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_DREQ_COUNTER]); goto unlock; default: goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_RCVD; cm_id_priv->tid = dreq_msg->hdr.tid; ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; unlock: spin_unlock_irq(&cm_id_priv->lock); deref: cm_deref_id(cm_id_priv); return -EINVAL; } static int cm_drep_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_drep_msg *drep_msg; int ret; drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(drep_msg->remote_comm_id, drep_msg->local_comm_id); if (!cm_id_priv) return -EINVAL; work->cm_event.private_data = &drep_msg->private_data; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_DREQ_SENT && cm_id_priv->id.state != IB_CM_DREQ_RCVD) { spin_unlock_irq(&cm_id_priv->lock); goto out; } cm_enter_timewait(cm_id_priv); ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } int ib_send_cm_rej(struct ib_cm_id *cm_id, enum ib_cm_rej_reason reason, void *ari, u8 ari_length, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; int ret; if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) || (ari && ari_length > IB_CM_REJ_ARI_LENGTH)) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id->state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: ret = cm_alloc_msg(cm_id_priv, &msg); if (!ret) cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv, reason, ari, ari_length, private_data, private_data_len); cm_reset_to_idle(cm_id_priv); break; case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: ret = cm_alloc_msg(cm_id_priv, &msg); if (!ret) cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv, reason, ari, ari_length, private_data, private_data_len); cm_enter_timewait(cm_id_priv); break; default: ret = -EINVAL; goto out; } if (ret) goto out; ret = ib_post_send_mad(msg, NULL); if (ret) cm_free_msg(msg); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_rej); static void cm_format_rej_event(struct cm_work *work) { struct cm_rej_msg *rej_msg; struct ib_cm_rej_event_param *param; rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.rej_rcvd; param->ari = rej_msg->ari; param->ari_length = cm_rej_get_reject_info_len(rej_msg); param->reason = __be16_to_cpu(rej_msg->reason); work->cm_event.private_data = &rej_msg->private_data; } static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) { struct cm_timewait_info *timewait_info; struct cm_id_private *cm_id_priv; __be32 remote_id; remote_id = rej_msg->local_comm_id; if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_TIMEOUT) { spin_lock_irq(&cm.lock); timewait_info = cm_find_remote_id( *((__be64 *) rej_msg->ari), remote_id); if (!timewait_info) { spin_unlock_irq(&cm.lock); return NULL; } cm_id_priv = idr_find(&cm.local_id_table, (__force int) (timewait_info->work.local_id ^ cm.random_id_operand)); if (cm_id_priv) { if (cm_id_priv->id.remote_id == remote_id) atomic_inc(&cm_id_priv->refcount); else cm_id_priv = NULL; } spin_unlock_irq(&cm.lock); } else if (cm_rej_get_msg_rejected(rej_msg) == CM_MSG_RESPONSE_REQ) cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, 0); else cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, remote_id); return cm_id_priv; } static int cm_rej_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rej_msg *rej_msg; int ret; rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_rejected_id(rej_msg); if (!cm_id_priv) return -EINVAL; cm_format_rej_event(work); spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); /* fall through */ case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_STALE_CONN) cm_enter_timewait(cm_id_priv); else cm_reset_to_idle(cm_id_priv); break; case IB_CM_DREQ_SENT: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); /* fall through */ case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: cm_enter_timewait(cm_id_priv); break; case IB_CM_ESTABLISHED: if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT || cm_id_priv->id.lap_state == IB_CM_LAP_SENT) { if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT) ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); cm_enter_timewait(cm_id_priv); break; } /* fall through */ default: spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; goto out; } ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } int ib_send_cm_mra(struct ib_cm_id *cm_id, u8 service_timeout, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; enum ib_cm_state cm_state; enum ib_cm_lap_state lap_state; enum cm_msg_response msg_response; void *data; unsigned long flags; int ret; if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE) return -EINVAL; data = cm_copy_private_data(private_data, private_data_len); if (IS_ERR(data)) return PTR_ERR(data); cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); switch(cm_id_priv->id.state) { case IB_CM_REQ_RCVD: cm_state = IB_CM_MRA_REQ_SENT; lap_state = cm_id->lap_state; msg_response = CM_MSG_RESPONSE_REQ; break; case IB_CM_REP_RCVD: cm_state = IB_CM_MRA_REP_SENT; lap_state = cm_id->lap_state; msg_response = CM_MSG_RESPONSE_REP; break; case IB_CM_ESTABLISHED: if (cm_id->lap_state == IB_CM_LAP_RCVD) { cm_state = cm_id->state; lap_state = IB_CM_MRA_LAP_SENT; msg_response = CM_MSG_RESPONSE_OTHER; break; } default: ret = -EINVAL; goto error1; } if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) { ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto error1; cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, msg_response, service_timeout, private_data, private_data_len); ret = ib_post_send_mad(msg, NULL); if (ret) goto error2; } cm_id->state = cm_state; cm_id->lap_state = lap_state; cm_id_priv->service_timeout = service_timeout; cm_set_private_data(cm_id_priv, data, private_data_len); spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; error1: spin_unlock_irqrestore(&cm_id_priv->lock, flags); kfree(data); return ret; error2: spin_unlock_irqrestore(&cm_id_priv->lock, flags); kfree(data); cm_free_msg(msg); return ret; } EXPORT_SYMBOL(ib_send_cm_mra); static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg) { switch (cm_mra_get_msg_mraed(mra_msg)) { case CM_MSG_RESPONSE_REQ: return cm_acquire_id(mra_msg->remote_comm_id, 0); case CM_MSG_RESPONSE_REP: case CM_MSG_RESPONSE_OTHER: return cm_acquire_id(mra_msg->remote_comm_id, mra_msg->local_comm_id); default: return NULL; } } static int cm_mra_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_mra_msg *mra_msg; int timeout, ret; mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_mraed_id(mra_msg); if (!cm_id_priv) return -EINVAL; work->cm_event.private_data = &mra_msg->private_data; work->cm_event.param.mra_rcvd.service_timeout = cm_mra_get_service_timeout(mra_msg); timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) + cm_convert_to_ms(cm_id_priv->av.timeout); spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { case IB_CM_REQ_SENT: if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REQ || ib_modify_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg, timeout)) goto out; cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD; break; case IB_CM_REP_SENT: if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REP || ib_modify_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg, timeout)) goto out; cm_id_priv->id.state = IB_CM_MRA_REP_RCVD; break; case IB_CM_ESTABLISHED: if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_OTHER || cm_id_priv->id.lap_state != IB_CM_LAP_SENT || ib_modify_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg, timeout)) { if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) atomic_long_inc(&work->port-> counter_group[CM_RECV_DUPLICATES]. counter[CM_MRA_COUNTER]); goto out; } cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD; break; case IB_CM_MRA_REQ_RCVD: case IB_CM_MRA_REP_RCVD: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_MRA_COUNTER]); /* fall through */ default: goto out; } cm_id_priv->msg->context[1] = (void *) (unsigned long) cm_id_priv->id.state; ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; out: spin_unlock_irq(&cm_id_priv->lock); cm_deref_id(cm_id_priv); return -EINVAL; } static void cm_format_lap(struct cm_lap_msg *lap_msg, struct cm_id_private *cm_id_priv, struct ib_sa_path_rec *alternate_path, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP)); lap_msg->local_comm_id = cm_id_priv->id.local_id; lap_msg->remote_comm_id = cm_id_priv->id.remote_id; cm_lap_set_remote_qpn(lap_msg, cm_id_priv->remote_qpn); /* todo: need remote CM response timeout */ cm_lap_set_remote_resp_timeout(lap_msg, 0x1F); lap_msg->alt_local_lid = alternate_path->slid; lap_msg->alt_remote_lid = alternate_path->dlid; lap_msg->alt_local_gid = alternate_path->sgid; lap_msg->alt_remote_gid = alternate_path->dgid; cm_lap_set_flow_label(lap_msg, alternate_path->flow_label); cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class); lap_msg->alt_hop_limit = alternate_path->hop_limit; cm_lap_set_packet_rate(lap_msg, alternate_path->rate); cm_lap_set_sl(lap_msg, alternate_path->sl); cm_lap_set_subnet_local(lap_msg, 1); /* local only... */ cm_lap_set_local_ack_timeout(lap_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, alternate_path->packet_life_time)); if (private_data && private_data_len) memcpy(lap_msg->private_data, private_data, private_data_len); } int ib_send_cm_lap(struct ib_cm_id *cm_id, struct ib_sa_path_rec *alternate_path, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; int ret; if (private_data && private_data_len > IB_CM_LAP_PRIVATE_DATA_SIZE) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_ESTABLISHED || (cm_id->lap_state != IB_CM_LAP_UNINIT && cm_id->lap_state != IB_CM_LAP_IDLE)) { ret = -EINVAL; goto out; } ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av, cm_id_priv); if (ret) goto out; cm_id_priv->alt_av.timeout = cm_ack_timeout(cm_id_priv->target_ack_delay, cm_id_priv->alt_av.timeout - 1); ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto out; cm_format_lap((struct cm_lap_msg *) msg->mad, cm_id_priv, alternate_path, private_data, private_data_len); msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned long) IB_CM_ESTABLISHED; ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); return ret; } cm_id->lap_state = IB_CM_LAP_SENT; cm_id_priv->msg = msg; out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_lap); static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, struct ib_sa_path_rec *path, struct cm_lap_msg *lap_msg) { memset(path, 0, sizeof *path); path->dgid = lap_msg->alt_local_gid; path->sgid = lap_msg->alt_remote_gid; path->dlid = lap_msg->alt_local_lid; path->slid = lap_msg->alt_remote_lid; path->flow_label = cm_lap_get_flow_label(lap_msg); path->hop_limit = lap_msg->alt_hop_limit; path->traffic_class = cm_lap_get_traffic_class(lap_msg); path->reversible = 1; path->pkey = cm_id_priv->pkey; path->sl = cm_lap_get_sl(lap_msg); path->mtu_selector = IB_SA_EQ; path->mtu = cm_id_priv->path_mtu; path->rate_selector = IB_SA_EQ; path->rate = cm_lap_get_packet_rate(lap_msg); path->packet_life_time_selector = IB_SA_EQ; path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg); path->packet_life_time -= (path->packet_life_time > 0); } static int cm_lap_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_lap_msg *lap_msg; struct ib_cm_lap_event_param *param; struct ib_mad_send_buf *msg = NULL; int ret; /* todo: verify LAP request and send reject APR if invalid. */ lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(lap_msg->remote_comm_id, lap_msg->local_comm_id); if (!cm_id_priv) return -EINVAL; param = &work->cm_event.param.lap_rcvd; param->alternate_path = &work->path[0]; cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg); work->cm_event.private_data = &lap_msg->private_data; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_ESTABLISHED) goto unlock; switch (cm_id_priv->id.lap_state) { case IB_CM_LAP_UNINIT: case IB_CM_LAP_IDLE: break; case IB_CM_MRA_LAP_SENT: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_LAP_COUNTER]); msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); if (IS_ERR(msg)) goto unlock; cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, CM_MSG_RESPONSE_OTHER, cm_id_priv->service_timeout, cm_id_priv->private_data, cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || ib_post_send_mad(msg, NULL)) cm_free_msg(msg); goto deref; case IB_CM_LAP_RCVD: atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_LAP_COUNTER]); goto unlock; default: goto unlock; } cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; cm_id_priv->tid = lap_msg->hdr.tid; ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); if (ret) goto unlock; ret = cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av, cm_id_priv); if (ret) goto unlock; ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; unlock: spin_unlock_irq(&cm_id_priv->lock); deref: cm_deref_id(cm_id_priv); return -EINVAL; } static void cm_format_apr(struct cm_apr_msg *apr_msg, struct cm_id_private *cm_id_priv, enum ib_cm_apr_status status, void *info, u8 info_length, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&apr_msg->hdr, CM_APR_ATTR_ID, cm_id_priv->tid); apr_msg->local_comm_id = cm_id_priv->id.local_id; apr_msg->remote_comm_id = cm_id_priv->id.remote_id; apr_msg->ap_status = (u8) status; if (info && info_length) { apr_msg->info_length = info_length; memcpy(apr_msg->info, info, info_length); } if (private_data && private_data_len) memcpy(apr_msg->private_data, private_data, private_data_len); } int ib_send_cm_apr(struct ib_cm_id *cm_id, enum ib_cm_apr_status status, void *info, u8 info_length, const void *private_data, u8 private_data_len) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; int ret; if ((private_data && private_data_len > IB_CM_APR_PRIVATE_DATA_SIZE) || (info && info_length > IB_CM_APR_INFO_LENGTH)) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_ESTABLISHED || (cm_id->lap_state != IB_CM_LAP_RCVD && cm_id->lap_state != IB_CM_MRA_LAP_SENT)) { ret = -EINVAL; goto out; } ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto out; cm_format_apr((struct cm_apr_msg *) msg->mad, cm_id_priv, status, info, info_length, private_data, private_data_len); ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); return ret; } cm_id->lap_state = IB_CM_LAP_IDLE; out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_apr); static int cm_apr_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_apr_msg *apr_msg; int ret; apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(apr_msg->remote_comm_id, apr_msg->local_comm_id); if (!cm_id_priv) return -EINVAL; /* Unmatched reply. */ work->cm_event.param.apr_rcvd.ap_status = apr_msg->ap_status; work->cm_event.param.apr_rcvd.apr_info = &apr_msg->info; work->cm_event.param.apr_rcvd.info_len = apr_msg->info_length; work->cm_event.private_data = &apr_msg->private_data; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_ESTABLISHED || (cm_id_priv->id.lap_state != IB_CM_LAP_SENT && cm_id_priv->id.lap_state != IB_CM_MRA_LAP_RCVD)) { spin_unlock_irq(&cm_id_priv->lock); goto out; } cm_id_priv->id.lap_state = IB_CM_LAP_IDLE; ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); cm_id_priv->msg = NULL; ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static int cm_timewait_handler(struct cm_work *work) { struct cm_timewait_info *timewait_info; struct cm_id_private *cm_id_priv; int ret; timewait_info = (struct cm_timewait_info *)work; spin_lock_irq(&cm.lock); list_del(&timewait_info->list); spin_unlock_irq(&cm.lock); cm_id_priv = cm_acquire_id(timewait_info->work.local_id, timewait_info->work.remote_id); if (!cm_id_priv) return -EINVAL; spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_TIMEWAIT || cm_id_priv->remote_qpn != timewait_info->remote_qpn) { spin_unlock_irq(&cm_id_priv->lock); goto out; } cm_id_priv->id.state = IB_CM_IDLE; ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); spin_unlock_irq(&cm_id_priv->lock); if (ret) cm_process_work(cm_id_priv, work); else cm_deref_id(cm_id_priv); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg, struct cm_id_private *cm_id_priv, struct ib_cm_sidr_req_param *param) { cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_SIDR)); sidr_req_msg->request_id = cm_id_priv->id.local_id; sidr_req_msg->pkey = param->path->pkey; sidr_req_msg->service_id = param->service_id; if (param->private_data && param->private_data_len) memcpy(sidr_req_msg->private_data, param->private_data, param->private_data_len); } int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, struct ib_cm_sidr_req_param *param) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; int ret; if (!param->path || (param->private_data && param->private_data_len > IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE)) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); ret = cm_init_av_by_path(param->path, &cm_id_priv->av, cm_id_priv); if (ret) goto out; cm_id->service_id = param->service_id; cm_id->service_mask = ~cpu_to_be64(0); cm_id_priv->timeout_ms = param->timeout_ms; cm_id_priv->max_cm_retries = param->max_cm_retries; ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto out; cm_format_sidr_req((struct cm_sidr_req_msg *) msg->mad, cm_id_priv, param); msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT; spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state == IB_CM_IDLE) ret = ib_post_send_mad(msg, NULL); else ret = -EINVAL; if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); goto out; } cm_id->state = IB_CM_SIDR_REQ_SENT; cm_id_priv->msg = msg; spin_unlock_irqrestore(&cm_id_priv->lock, flags); out: return ret; } EXPORT_SYMBOL(ib_send_cm_sidr_req); static void cm_format_sidr_req_event(struct cm_work *work, const struct cm_id_private *rx_cm_id, struct ib_cm_id *listen_id) { struct cm_sidr_req_msg *sidr_req_msg; struct ib_cm_sidr_req_event_param *param; sidr_req_msg = (struct cm_sidr_req_msg *) work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.sidr_req_rcvd; param->pkey = __be16_to_cpu(sidr_req_msg->pkey); param->listen_id = listen_id; param->service_id = sidr_req_msg->service_id; param->bth_pkey = cm_get_bth_pkey(work); param->port = work->port->port_num; param->sgid_index = rx_cm_id->av.ah_attr.grh.sgid_index; work->cm_event.private_data = &sidr_req_msg->private_data; } static int cm_sidr_req_handler(struct cm_work *work) { struct ib_cm_id *cm_id; struct cm_id_private *cm_id_priv, *cur_cm_id_priv; struct cm_sidr_req_msg *sidr_req_msg; struct ib_wc *wc; int ret; cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); cm_id_priv = container_of(cm_id, struct cm_id_private, id); /* Record SGID/SLID and request ID for lookup. */ sidr_req_msg = (struct cm_sidr_req_msg *) work->mad_recv_wc->recv_buf.mad; wc = work->mad_recv_wc->wc; cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid); cm_id_priv->av.dgid.global.interface_id = 0; ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); if (ret) goto out; cm_id_priv->id.remote_id = sidr_req_msg->request_id; cm_id_priv->tid = sidr_req_msg->hdr.tid; atomic_inc(&cm_id_priv->work_count); spin_lock_irq(&cm.lock); cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv); if (cur_cm_id_priv) { spin_unlock_irq(&cm.lock); atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_SIDR_REQ_COUNTER]); goto out; /* Duplicate message. */ } cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD; cur_cm_id_priv = cm_find_listen(cm_id->device, sidr_req_msg->service_id); if (!cur_cm_id_priv) { spin_unlock_irq(&cm.lock); cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED); goto out; /* No match. */ } atomic_inc(&cur_cm_id_priv->refcount); atomic_inc(&cm_id_priv->refcount); spin_unlock_irq(&cm.lock); cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler; cm_id_priv->id.context = cur_cm_id_priv->id.context; cm_id_priv->id.service_id = sidr_req_msg->service_id; cm_id_priv->id.service_mask = ~cpu_to_be64(0); cm_format_sidr_req_event(work, cm_id_priv, &cur_cm_id_priv->id); cm_process_work(cm_id_priv, work); cm_deref_id(cur_cm_id_priv); return 0; out: ib_destroy_cm_id(&cm_id_priv->id); return -EINVAL; } static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg, struct cm_id_private *cm_id_priv, struct ib_cm_sidr_rep_param *param) { cm_format_mad_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID, cm_id_priv->tid); sidr_rep_msg->request_id = cm_id_priv->id.remote_id; sidr_rep_msg->status = param->status; cm_sidr_rep_set_qpn(sidr_rep_msg, cpu_to_be32(param->qp_num)); sidr_rep_msg->service_id = cm_id_priv->id.service_id; sidr_rep_msg->qkey = cpu_to_be32(param->qkey); if (param->info && param->info_length) memcpy(sidr_rep_msg->info, param->info, param->info_length); if (param->private_data && param->private_data_len) memcpy(sidr_rep_msg->private_data, param->private_data, param->private_data_len); } int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, struct ib_cm_sidr_rep_param *param) { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; int ret; if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) || (param->private_data && param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE)) return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_SIDR_REQ_RCVD) { ret = -EINVAL; goto error; } ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) goto error; cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv, param); ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); return ret; } cm_id->state = IB_CM_IDLE; spin_unlock_irqrestore(&cm_id_priv->lock, flags); spin_lock_irqsave(&cm.lock, flags); if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) { rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); RB_CLEAR_NODE(&cm_id_priv->sidr_id_node); } spin_unlock_irqrestore(&cm.lock, flags); return 0; error: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_sidr_rep); static void cm_format_sidr_rep_event(struct cm_work *work) { struct cm_sidr_rep_msg *sidr_rep_msg; struct ib_cm_sidr_rep_event_param *param; sidr_rep_msg = (struct cm_sidr_rep_msg *) work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.sidr_rep_rcvd; param->status = sidr_rep_msg->status; param->qkey = be32_to_cpu(sidr_rep_msg->qkey); param->qpn = be32_to_cpu(cm_sidr_rep_get_qpn(sidr_rep_msg)); param->info = &sidr_rep_msg->info; param->info_len = sidr_rep_msg->info_length; work->cm_event.private_data = &sidr_rep_msg->private_data; } static int cm_sidr_rep_handler(struct cm_work *work) { struct cm_sidr_rep_msg *sidr_rep_msg; struct cm_id_private *cm_id_priv; sidr_rep_msg = (struct cm_sidr_rep_msg *) work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(sidr_rep_msg->request_id, 0); if (!cm_id_priv) return -EINVAL; /* Unmatched reply. */ spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_SIDR_REQ_SENT) { spin_unlock_irq(&cm_id_priv->lock); goto out; } cm_id_priv->id.state = IB_CM_IDLE; ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); spin_unlock_irq(&cm_id_priv->lock); cm_format_sidr_rep_event(work); cm_process_work(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } static void cm_process_send_error(struct ib_mad_send_buf *msg, enum ib_wc_status wc_status) { struct cm_id_private *cm_id_priv; struct ib_cm_event cm_event; enum ib_cm_state state; int ret; memset(&cm_event, 0, sizeof cm_event); cm_id_priv = msg->context[0]; /* Discard old sends or ones without a response. */ spin_lock_irq(&cm_id_priv->lock); state = (enum ib_cm_state) (unsigned long) msg->context[1]; if (msg != cm_id_priv->msg || state != cm_id_priv->id.state) goto discard; switch (state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: cm_reset_to_idle(cm_id_priv); cm_event.event = IB_CM_REQ_ERROR; break; case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: cm_reset_to_idle(cm_id_priv); cm_event.event = IB_CM_REP_ERROR; break; case IB_CM_DREQ_SENT: cm_enter_timewait(cm_id_priv); cm_event.event = IB_CM_DREQ_ERROR; break; case IB_CM_SIDR_REQ_SENT: cm_id_priv->id.state = IB_CM_IDLE; cm_event.event = IB_CM_SIDR_REQ_ERROR; break; default: goto discard; } spin_unlock_irq(&cm_id_priv->lock); cm_event.param.send_status = wc_status; /* No other events can occur on the cm_id at this point. */ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event); cm_free_msg(msg); if (ret) ib_destroy_cm_id(&cm_id_priv->id); return; discard: spin_unlock_irq(&cm_id_priv->lock); cm_free_msg(msg); } static void cm_send_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_send_wc) { struct ib_mad_send_buf *msg = mad_send_wc->send_buf; struct cm_port *port; u16 attr_index; port = mad_agent->context; attr_index = be16_to_cpu(((struct ib_mad_hdr *) msg->mad)->attr_id) - CM_ATTR_ID_OFFSET; /* * If the send was in response to a received message (context[0] is not * set to a cm_id), and is not a REJ, then it is a send that was * manually retried. */ if (!msg->context[0] && (attr_index != CM_REJ_COUNTER)) msg->retries = 1; atomic_long_add(1 + msg->retries, &port->counter_group[CM_XMIT].counter[attr_index]); if (msg->retries) atomic_long_add(msg->retries, &port->counter_group[CM_XMIT_RETRIES]. counter[attr_index]); switch (mad_send_wc->status) { case IB_WC_SUCCESS: case IB_WC_WR_FLUSH_ERR: cm_free_msg(msg); break; default: if (msg->context[0] && msg->context[1]) cm_process_send_error(msg, mad_send_wc->status); else cm_free_msg(msg); break; } } static void cm_work_handler(struct work_struct *_work) { struct cm_work *work = container_of(_work, struct cm_work, work.work); int ret; switch (work->cm_event.event) { case IB_CM_REQ_RECEIVED: ret = cm_req_handler(work); break; case IB_CM_MRA_RECEIVED: ret = cm_mra_handler(work); break; case IB_CM_REJ_RECEIVED: ret = cm_rej_handler(work); break; case IB_CM_REP_RECEIVED: ret = cm_rep_handler(work); break; case IB_CM_RTU_RECEIVED: ret = cm_rtu_handler(work); break; case IB_CM_USER_ESTABLISHED: ret = cm_establish_handler(work); break; case IB_CM_DREQ_RECEIVED: ret = cm_dreq_handler(work); break; case IB_CM_DREP_RECEIVED: ret = cm_drep_handler(work); break; case IB_CM_SIDR_REQ_RECEIVED: ret = cm_sidr_req_handler(work); break; case IB_CM_SIDR_REP_RECEIVED: ret = cm_sidr_rep_handler(work); break; case IB_CM_LAP_RECEIVED: ret = cm_lap_handler(work); break; case IB_CM_APR_RECEIVED: ret = cm_apr_handler(work); break; case IB_CM_TIMEWAIT_EXIT: ret = cm_timewait_handler(work); break; default: ret = -EINVAL; break; } if (ret) cm_free_work(work); } static int cm_establish(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; struct cm_work *work; unsigned long flags; int ret = 0; struct cm_device *cm_dev; cm_dev = ib_get_client_data(cm_id->device, &cm_client); if (!cm_dev) return -ENODEV; work = kmalloc(sizeof *work, GFP_ATOMIC); if (!work) return -ENOMEM; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id->state) { case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: cm_id->state = IB_CM_ESTABLISHED; break; case IB_CM_ESTABLISHED: ret = -EISCONN; break; default: ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (ret) { kfree(work); goto out; } /* * The CM worker thread may try to destroy the cm_id before it * can execute this work item. To prevent potential deadlock, * we need to find the cm_id once we're in the context of the * worker thread, rather than holding a reference on it. */ INIT_DELAYED_WORK(&work->work, cm_work_handler); work->local_id = cm_id->local_id; work->remote_id = cm_id->remote_id; work->mad_recv_wc = NULL; work->cm_event.event = IB_CM_USER_ESTABLISHED; /* Check if the device started its remove_one */ spin_lock_irqsave(&cm.lock, flags); if (!cm_dev->going_down) { queue_delayed_work(cm.wq, &work->work, 0); } else { kfree(work); ret = -ENODEV; } spin_unlock_irqrestore(&cm.lock, flags); out: return ret; } static int cm_migrate(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; struct cm_av tmp_av; unsigned long flags; int tmp_send_port_not_ready; int ret = 0; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state == IB_CM_ESTABLISHED && (cm_id->lap_state == IB_CM_LAP_UNINIT || cm_id->lap_state == IB_CM_LAP_IDLE)) { cm_id->lap_state = IB_CM_LAP_IDLE; /* Swap address vector */ tmp_av = cm_id_priv->av; cm_id_priv->av = cm_id_priv->alt_av; cm_id_priv->alt_av = tmp_av; /* Swap port send ready state */ tmp_send_port_not_ready = cm_id_priv->prim_send_port_not_ready; cm_id_priv->prim_send_port_not_ready = cm_id_priv->altr_send_port_not_ready; cm_id_priv->altr_send_port_not_ready = tmp_send_port_not_ready; } else ret = -EINVAL; spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event) { int ret; switch (event) { case IB_EVENT_COMM_EST: ret = cm_establish(cm_id); break; case IB_EVENT_PATH_MIG: ret = cm_migrate(cm_id); break; default: ret = -EINVAL; } return ret; } EXPORT_SYMBOL(ib_cm_notify); static void cm_recv_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct cm_port *port = mad_agent->context; struct cm_work *work; enum ib_cm_event_type event; u16 attr_id; int paths = 0; int going_down = 0; switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) { case CM_REQ_ATTR_ID: paths = 1 + (((struct cm_req_msg *) mad_recv_wc->recv_buf.mad)-> alt_local_lid != 0); event = IB_CM_REQ_RECEIVED; break; case CM_MRA_ATTR_ID: event = IB_CM_MRA_RECEIVED; break; case CM_REJ_ATTR_ID: event = IB_CM_REJ_RECEIVED; break; case CM_REP_ATTR_ID: event = IB_CM_REP_RECEIVED; break; case CM_RTU_ATTR_ID: event = IB_CM_RTU_RECEIVED; break; case CM_DREQ_ATTR_ID: event = IB_CM_DREQ_RECEIVED; break; case CM_DREP_ATTR_ID: event = IB_CM_DREP_RECEIVED; break; case CM_SIDR_REQ_ATTR_ID: event = IB_CM_SIDR_REQ_RECEIVED; break; case CM_SIDR_REP_ATTR_ID: event = IB_CM_SIDR_REP_RECEIVED; break; case CM_LAP_ATTR_ID: paths = 1; event = IB_CM_LAP_RECEIVED; break; case CM_APR_ATTR_ID: event = IB_CM_APR_RECEIVED; break; default: ib_free_recv_mad(mad_recv_wc); return; } attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id); atomic_long_inc(&port->counter_group[CM_RECV]. counter[attr_id - CM_ATTR_ID_OFFSET]); work = kmalloc(sizeof *work + sizeof(struct ib_sa_path_rec) * paths, GFP_KERNEL); if (!work) { ib_free_recv_mad(mad_recv_wc); return; } INIT_DELAYED_WORK(&work->work, cm_work_handler); work->cm_event.event = event; work->mad_recv_wc = mad_recv_wc; work->port = port; /* Check if the device started its remove_one */ spin_lock_irq(&cm.lock); if (!port->cm_dev->going_down) queue_delayed_work(cm.wq, &work->work, 0); else going_down = 1; spin_unlock_irq(&cm.lock); if (going_down) { kfree(work); ib_free_recv_mad(mad_recv_wc); } } static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->id.state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: case IB_CM_ESTABLISHED: *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; if (cm_id_priv->responder_resources) qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_ATOMIC; qp_attr->pkey_index = cm_id_priv->av.pkey_index; qp_attr->port_num = cm_id_priv->av.port->port_num; ret = 0; break; default: ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->id.state) { case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: case IB_CM_ESTABLISHED: *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN; qp_attr->ah_attr = cm_id_priv->av.ah_attr; qp_attr->path_mtu = cm_id_priv->path_mtu; qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn); qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn); if (cm_id_priv->qp_type == IB_QPT_RC || cm_id_priv->qp_type == IB_QPT_XRC_TGT) { *qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER; qp_attr->max_dest_rd_atomic = cm_id_priv->responder_resources; qp_attr->min_rnr_timer = 0; } if (cm_id_priv->alt_av.ah_attr.dlid) { *qp_attr_mask |= IB_QP_ALT_PATH; qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; qp_attr->alt_timeout = cm_id_priv->alt_av.timeout; qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; } ret = 0; break; default: ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->id.state) { /* Allow transition to RTS before sending REP */ case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: case IB_CM_ESTABLISHED: if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) { *qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN; qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn); switch (cm_id_priv->qp_type) { case IB_QPT_RC: case IB_QPT_XRC_INI: *qp_attr_mask |= IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC; qp_attr->retry_cnt = cm_id_priv->retry_count; qp_attr->rnr_retry = cm_id_priv->rnr_retry_count; qp_attr->max_rd_atomic = cm_id_priv->initiator_depth; /* fall through */ case IB_QPT_XRC_TGT: *qp_attr_mask |= IB_QP_TIMEOUT; qp_attr->timeout = cm_id_priv->av.timeout; break; default: break; } if (cm_id_priv->alt_av.ah_attr.dlid) { *qp_attr_mask |= IB_QP_PATH_MIG_STATE; qp_attr->path_mig_state = IB_MIG_REARM; } } else { *qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE; qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; qp_attr->alt_timeout = cm_id_priv->alt_av.timeout; qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; qp_attr->path_mig_state = IB_MIG_REARM; } ret = 0; break; default: ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct cm_id_private *cm_id_priv; int ret; cm_id_priv = container_of(cm_id, struct cm_id_private, id); switch (qp_attr->qp_state) { case IB_QPS_INIT: ret = cm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask); break; case IB_QPS_RTR: ret = cm_init_qp_rtr_attr(cm_id_priv, qp_attr, qp_attr_mask); break; case IB_QPS_RTS: ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask); break; default: ret = -EINVAL; break; } return ret; } EXPORT_SYMBOL(ib_cm_init_qp_attr); static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr, char *buf) { struct cm_counter_group *group; struct cm_counter_attribute *cm_attr; group = container_of(obj, struct cm_counter_group, obj); cm_attr = container_of(attr, struct cm_counter_attribute, attr); return sprintf(buf, "%ld\n", atomic_long_read(&group->counter[cm_attr->index])); } static const struct sysfs_ops cm_counter_ops = { .show = cm_show_counter }; static struct kobj_type cm_counter_obj_type = { .sysfs_ops = &cm_counter_ops, .default_attrs = cm_counter_default_attrs }; static void cm_release_port_obj(struct kobject *obj) { struct cm_port *cm_port; cm_port = container_of(obj, struct cm_port, port_obj); kfree(cm_port); } static struct kobj_type cm_port_obj_type = { .release = cm_release_port_obj }; static char *cm_devnode(struct device *dev, umode_t *mode) { if (mode) *mode = 0666; return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); } struct class cm_class = { .owner = THIS_MODULE, .name = "infiniband_cm", .devnode = cm_devnode, }; EXPORT_SYMBOL(cm_class); static int cm_create_port_fs(struct cm_port *port) { int i, ret; ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type, &port->cm_dev->device->kobj, "%d", port->port_num); if (ret) { kfree(port); return ret; } for (i = 0; i < CM_COUNTER_GROUPS; i++) { ret = kobject_init_and_add(&port->counter_group[i].obj, &cm_counter_obj_type, &port->port_obj, "%s", counter_group_names[i]); if (ret) goto error; } return 0; error: while (i--) kobject_put(&port->counter_group[i].obj); kobject_put(&port->port_obj); return ret; } static void cm_remove_port_fs(struct cm_port *port) { int i; for (i = 0; i < CM_COUNTER_GROUPS; i++) kobject_put(&port->counter_group[i].obj); kobject_put(&port->port_obj); } static void cm_add_one(struct ib_device *ib_device) { struct cm_device *cm_dev; struct cm_port *port; struct ib_mad_reg_req reg_req = { .mgmt_class = IB_MGMT_CLASS_CM, .mgmt_class_version = IB_CM_CLASS_VERSION, }; struct ib_port_modify port_modify = { .set_port_cap_mask = IB_PORT_CM_SUP }; unsigned long flags; int ret; int count = 0; u8 i; cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) * ib_device->phys_port_cnt, GFP_KERNEL); if (!cm_dev) return; cm_dev->ib_device = ib_device; cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; cm_dev->going_down = 0; cm_dev->device = device_create(&cm_class, &ib_device->dev, MKDEV(0, 0), NULL, "%s", ib_device->name); if (IS_ERR(cm_dev->device)) { kfree(cm_dev); return; } set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); for (i = 1; i <= ib_device->phys_port_cnt; i++) { if (!rdma_cap_ib_cm(ib_device, i)) continue; port = kzalloc(sizeof *port, GFP_KERNEL); if (!port) goto error1; cm_dev->port[i-1] = port; port->cm_dev = cm_dev; port->port_num = i; INIT_LIST_HEAD(&port->cm_priv_prim_list); INIT_LIST_HEAD(&port->cm_priv_altr_list); ret = cm_create_port_fs(port); if (ret) goto error1; port->mad_agent = ib_register_mad_agent(ib_device, i, IB_QPT_GSI, ®_req, 0, cm_send_handler, cm_recv_handler, port, 0); if (IS_ERR(port->mad_agent)) goto error2; ret = ib_modify_port(ib_device, i, 0, &port_modify); if (ret) goto error3; count++; } if (!count) goto free; ib_set_client_data(ib_device, &cm_client, cm_dev); write_lock_irqsave(&cm.device_lock, flags); list_add_tail(&cm_dev->list, &cm.device_list); write_unlock_irqrestore(&cm.device_lock, flags); return; error3: ib_unregister_mad_agent(port->mad_agent); error2: cm_remove_port_fs(port); error1: port_modify.set_port_cap_mask = 0; port_modify.clr_port_cap_mask = IB_PORT_CM_SUP; while (--i) { if (!rdma_cap_ib_cm(ib_device, i)) continue; port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); ib_unregister_mad_agent(port->mad_agent); cm_remove_port_fs(port); } free: device_unregister(cm_dev->device); kfree(cm_dev); } static void cm_remove_one(struct ib_device *ib_device, void *client_data) { struct cm_device *cm_dev = client_data; struct cm_port *port; struct cm_id_private *cm_id_priv; struct ib_mad_agent *cur_mad_agent; struct ib_port_modify port_modify = { .clr_port_cap_mask = IB_PORT_CM_SUP }; unsigned long flags; int i; if (!cm_dev) return; write_lock_irqsave(&cm.device_lock, flags); list_del(&cm_dev->list); write_unlock_irqrestore(&cm.device_lock, flags); spin_lock_irq(&cm.lock); cm_dev->going_down = 1; spin_unlock_irq(&cm.lock); for (i = 1; i <= ib_device->phys_port_cnt; i++) { if (!rdma_cap_ib_cm(ib_device, i)) continue; port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); /* Mark all the cm_id's as not valid */ spin_lock_irq(&cm.lock); list_for_each_entry(cm_id_priv, &port->cm_priv_altr_list, altr_list) cm_id_priv->altr_send_port_not_ready = 1; list_for_each_entry(cm_id_priv, &port->cm_priv_prim_list, prim_list) cm_id_priv->prim_send_port_not_ready = 1; spin_unlock_irq(&cm.lock); /* * We flush the queue here after the going_down set, this * verify that no new works will be queued in the recv handler, * after that we can call the unregister_mad_agent */ flush_workqueue(cm.wq); spin_lock_irq(&cm.state_lock); cur_mad_agent = port->mad_agent; port->mad_agent = NULL; spin_unlock_irq(&cm.state_lock); ib_unregister_mad_agent(cur_mad_agent); cm_remove_port_fs(port); } device_unregister(cm_dev->device); kfree(cm_dev); } static int __init ib_cm_init(void) { int ret; memset(&cm, 0, sizeof cm); INIT_LIST_HEAD(&cm.device_list); rwlock_init(&cm.device_lock); spin_lock_init(&cm.lock); spin_lock_init(&cm.state_lock); cm.listen_service_table = RB_ROOT; cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID); cm.remote_id_table = RB_ROOT; cm.remote_qp_table = RB_ROOT; cm.remote_sidr_table = RB_ROOT; idr_init(&cm.local_id_table); get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand); INIT_LIST_HEAD(&cm.timewait_list); ret = class_register(&cm_class); if (ret) { ret = -ENOMEM; goto error1; } cm.wq = create_workqueue("ib_cm"); if (!cm.wq) { ret = -ENOMEM; goto error2; } ret = ib_register_client(&cm_client); if (ret) goto error3; return 0; error3: destroy_workqueue(cm.wq); error2: class_unregister(&cm_class); error1: idr_destroy(&cm.local_id_table); return ret; } static void __exit ib_cm_cleanup(void) { struct cm_timewait_info *timewait_info, *tmp; spin_lock_irq(&cm.lock); list_for_each_entry(timewait_info, &cm.timewait_list, list) cancel_delayed_work(&timewait_info->work.work); spin_unlock_irq(&cm.lock); ib_unregister_client(&cm_client); destroy_workqueue(cm.wq); list_for_each_entry_safe(timewait_info, tmp, &cm.timewait_list, list) { cancel_delayed_work_sync(&timewait_info->work.work); list_del(&timewait_info->list); kfree(timewait_info); } class_unregister(&cm_class); idr_destroy(&cm.local_id_table); } module_init_order(ib_cm_init, SI_ORDER_SECOND); module_exit_order(ib_cm_cleanup, SI_ORDER_SECOND); diff --git a/sys/ofed/drivers/infiniband/core/ib_cma.c b/sys/ofed/drivers/infiniband/core/ib_cma.c index 7c03841d51d7..829d668dbd05 100644 --- a/sys/ofed/drivers/infiniband/core/ib_cma.c +++ b/sys/ofed/drivers/infiniband/core/ib_cma.c @@ -1,4619 +1,4619 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #define LINUXKPI_PARAM_PREFIX ibcore_ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "core_priv.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_QUERY_CLASSPORT_INFO_TIMEOUT 3000 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 18 static const char * const cma_events[] = { [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", [RDMA_CM_EVENT_ADDR_ERROR] = "address error", [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved ", [RDMA_CM_EVENT_ROUTE_ERROR] = "route error", [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request", [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response", [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error", [RDMA_CM_EVENT_UNREACHABLE] = "unreachable", [RDMA_CM_EVENT_REJECTED] = "rejected", [RDMA_CM_EVENT_ESTABLISHED] = "established", [RDMA_CM_EVENT_DISCONNECTED] = "disconnected", [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal", [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join", [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error", [RDMA_CM_EVENT_ADDR_CHANGE] = "address change", [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", }; const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { size_t index = event; return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ? cma_events[index] : "unrecognized event"; } EXPORT_SYMBOL(rdma_event_msg); static int cma_check_linklocal(struct rdma_dev_addr *, struct sockaddr *); static void cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id); static struct ib_client cma_client = { .name = "cma", .add = cma_add_one, .remove = cma_remove_one }; static struct ib_sa_client sa_client; static struct rdma_addr_client addr_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct workqueue_struct *cma_wq; struct cma_pernet { struct idr tcp_ps; struct idr udp_ps; struct idr ipoib_ps; struct idr ib_ps; struct idr sdp_ps; }; VNET_DEFINE(struct cma_pernet, cma_pernet); static struct cma_pernet *cma_pernet_ptr(struct vnet *vnet) { struct cma_pernet *retval; CURVNET_SET_QUIET(vnet); retval = &VNET(cma_pernet); CURVNET_RESTORE(); return (retval); } static struct idr *cma_pernet_idr(struct vnet *net, enum rdma_port_space ps) { struct cma_pernet *pernet = cma_pernet_ptr(net); switch (ps) { case RDMA_PS_TCP: return &pernet->tcp_ps; case RDMA_PS_UDP: return &pernet->udp_ps; case RDMA_PS_IPOIB: return &pernet->ipoib_ps; case RDMA_PS_IB: return &pernet->ib_ps; case RDMA_PS_SDP: return &pernet->sdp_ps; default: return NULL; } } struct cma_device { struct list_head list; struct ib_device *device; struct completion comp; atomic_t refcount; struct list_head id_list; struct sysctl_ctx_list sysctl_ctx; enum ib_gid_type *default_gid_type; }; struct rdma_bind_list { enum rdma_port_space ps; struct hlist_head owners; unsigned short port; }; struct class_port_info_context { struct ib_class_port_info *class_port_info; struct ib_device *device; struct completion done; struct ib_sa_query *sa_query; u8 port_num; }; static int cma_ps_alloc(struct vnet *vnet, enum rdma_port_space ps, struct rdma_bind_list *bind_list, int snum) { struct idr *idr = cma_pernet_idr(vnet, ps); return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL); } static struct rdma_bind_list *cma_ps_find(struct vnet *net, enum rdma_port_space ps, int snum) { struct idr *idr = cma_pernet_idr(net, ps); return idr_find(idr, snum); } static void cma_ps_remove(struct vnet *net, enum rdma_port_space ps, int snum) { struct idr *idr = cma_pernet_idr(net, ps); idr_remove(idr, snum); } enum { CMA_OPTION_AFONLY, }; void cma_ref_dev(struct cma_device *cma_dev) { atomic_inc(&cma_dev->refcount); } struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, void *cookie) { struct cma_device *cma_dev; struct cma_device *found_cma_dev = NULL; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) if (filter(cma_dev->device, cookie)) { found_cma_dev = cma_dev; break; } if (found_cma_dev) cma_ref_dev(found_cma_dev); mutex_unlock(&lock); return found_cma_dev; } int cma_get_default_gid_type(struct cma_device *cma_dev, unsigned int port) { if (port < rdma_start_port(cma_dev->device) || port > rdma_end_port(cma_dev->device)) return -EINVAL; return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; } int cma_set_default_gid_type(struct cma_device *cma_dev, unsigned int port, enum ib_gid_type default_gid_type) { unsigned long supported_gids; if (port < rdma_start_port(cma_dev->device) || port > rdma_end_port(cma_dev->device)) return -EINVAL; supported_gids = roce_gid_type_mask_support(cma_dev->device, port); if (!(supported_gids & 1 << default_gid_type)) return -EINVAL; cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = default_gid_type; return 0; } struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) { return cma_dev->device; } /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. * We do this by disabling removal notification while a callback is in process, * and reporting it after the callback completes. */ struct rdma_id_private { struct rdma_cm_id id; struct rdma_bind_list *bind_list; struct hlist_node node; struct list_head list; /* listen_any_list or cma_device.list */ struct list_head listen_list; /* per device listens */ struct cma_device *cma_dev; struct list_head mc_list; int internal_id; enum rdma_cm_state state; spinlock_t lock; struct mutex qp_mutex; struct completion comp; atomic_t refcount; struct mutex handler_mutex; int backlog; int timeout_ms; struct ib_sa_query *query; int query_id; union { struct ib_cm_id *ib; struct iw_cm_id *iw; } cm_id; u32 seq_num; u32 qkey; u32 qp_num; pid_t owner; u32 options; u8 srq; u8 tos; u8 reuseaddr; u8 afonly; enum ib_gid_type gid_type; }; struct cma_multicast { struct rdma_id_private *id_priv; union { struct ib_sa_multicast *ib; } multicast; struct list_head list; void *context; struct sockaddr_storage addr; struct kref mcref; bool igmp_joined; u8 join_state; }; struct cma_work { struct work_struct work; struct rdma_id_private *id; enum rdma_cm_state old_state; enum rdma_cm_state new_state; struct rdma_cm_event event; }; struct cma_ndev_work { struct work_struct work; struct rdma_id_private *id; struct rdma_cm_event event; }; struct iboe_mcast_work { struct work_struct work; struct rdma_id_private *id; struct cma_multicast *mc; }; struct cma_hdr { u8 cma_version; u8 ip_version; /* IP version: 7:4 */ __be16 port; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; #define CMA_VERSION 0x00 #define SDP_MAJ_VERSION 0x2 struct cma_req_info { struct ib_device *device; int port; union ib_gid local_gid; __be64 service_id; u16 pkey; bool has_gid:1; }; static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) { unsigned long flags; int ret; spin_lock_irqsave(&id_priv->lock, flags); ret = (id_priv->state == comp); spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static int cma_comp_exch(struct rdma_id_private *id_priv, enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv, enum rdma_cm_state exch) { unsigned long flags; enum rdma_cm_state old; spin_lock_irqsave(&id_priv->lock, flags); old = id_priv->state; id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return old; } static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) { return hdr->ip_version >> 4; } static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) { hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } static inline u8 sdp_get_majv(u8 sdp_version) { return sdp_version >> 4; } static inline u8 sdp_get_ip_ver(const struct sdp_hh *hh) { return hh->ipv_cap >> 4; } static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver) { hh->ipv_cap = (ip_ver << 4) | (hh->ipv_cap & 0xF); } -static int cma_igmp_send(struct net_device *ndev, const union ib_gid *mgid, bool join) +static int cma_igmp_send(struct ifnet *ndev, const union ib_gid *mgid, bool join) { int retval; if (ndev) { union rdma_sockaddr addr; rdma_gid2ip(&addr._sockaddr, mgid); CURVNET_SET_QUIET(ndev->if_vnet); if (join) retval = -if_addmulti(ndev, &addr._sockaddr, NULL); else retval = -if_delmulti(ndev, &addr._sockaddr); CURVNET_RESTORE(); } else { retval = -ENODEV; } return retval; } static void _cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { cma_ref_dev(cma_dev); id_priv->cma_dev = cma_dev; id_priv->gid_type = 0; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { _cma_attach_to_dev(id_priv, cma_dev); id_priv->gid_type = cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(cma_dev->device)]; } void cma_deref_dev(struct cma_device *cma_dev) { if (atomic_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); } static inline void release_mc(struct kref *kref) { struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); kfree(mc->multicast.ib); kfree(mc); } static void cma_release_dev(struct rdma_id_private *id_priv) { mutex_lock(&lock); list_del(&id_priv->list); cma_deref_dev(id_priv->cma_dev); id_priv->cma_dev = NULL; mutex_unlock(&lock); } static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *) &id_priv->id.route.addr.src_addr; } static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *) &id_priv->id.route.addr.dst_addr; } static inline unsigned short cma_family(struct rdma_id_private *id_priv) { return id_priv->id.route.addr.src_addr.ss_family; } static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) { struct ib_sa_mcmember_rec rec; int ret = 0; if (id_priv->qkey) { if (qkey && id_priv->qkey != qkey) return -EINVAL; return 0; } if (qkey) { id_priv->qkey = qkey; return 0; } switch (id_priv->id.ps) { case RDMA_PS_UDP: case RDMA_PS_IB: id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (!ret) id_priv->qkey = be32_to_cpu(rec.qkey); break; default: break; } return ret; } static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { dev_addr->dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr); ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey)); } static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) { int ret; if (addr->sa_family != AF_IB) { ret = rdma_translate_ip(addr, dev_addr); } else { cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); ret = 0; } return ret; } static inline int cma_validate_port(struct ib_device *device, u8 port, enum ib_gid_type gid_type, union ib_gid *gid, const struct rdma_dev_addr *dev_addr) { const int dev_type = dev_addr->dev_type; - struct net_device *ndev; + struct ifnet *ndev; int ret = -ENODEV; if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) return ret; if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) return ret; if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); } else { ndev = NULL; gid_type = IB_GID_TYPE_IB; } ret = ib_find_cached_gid_by_port(device, gid, gid_type, port, ndev, NULL); if (ndev) dev_put(ndev); return ret; } static int cma_acquire_dev(struct rdma_id_private *id_priv, struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct cma_device *cma_dev; union ib_gid gid, iboe_gid, *gidp; int ret = -ENODEV; u8 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; mutex_lock(&lock); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof gid); if (listen_id_priv) { cma_dev = listen_id_priv->cma_dev; port = listen_id_priv->id.port_num; if (rdma_is_port_valid(cma_dev->device, port)) { gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; ret = cma_validate_port(cma_dev->device, port, rdma_protocol_ib(cma_dev->device, port) ? IB_GID_TYPE_IB : listen_id_priv->gid_type, gidp, dev_addr); if (!ret) { id_priv->id.port_num = port; goto out; } } } list_for_each_entry(cma_dev, &dev_list, list) { for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { if (listen_id_priv && listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; ret = cma_validate_port(cma_dev->device, port, rdma_protocol_ib(cma_dev->device, port) ? IB_GID_TYPE_IB : cma_dev->default_gid_type[port - 1], gidp, dev_addr); if (!ret) { id_priv->id.port_num = port; goto out; } } } out: if (!ret) cma_attach_to_dev(id_priv, cma_dev); mutex_unlock(&lock); return ret; } /* * Select the source IB device and address to reach the destination IB address. */ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; struct sockaddr_ib *addr; union ib_gid gid, sgid, *dgid; u16 pkey, index; u8 p; int i; cma_dev = NULL; addr = (struct sockaddr_ib *) cma_dst_addr(id_priv); dgid = (union ib_gid *) &addr->sib_addr; pkey = ntohs(addr->sib_pkey); list_for_each_entry(cur_dev, &dev_list, list) { for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { if (!rdma_cap_af_ib(cur_dev->device, p)) continue; if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) continue; for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, &gid, NULL); i++) { if (!memcmp(&gid, dgid, sizeof(gid))) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; goto found; } if (!cma_dev && (gid.global.subnet_prefix == dgid->global.subnet_prefix)) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; } } } } if (!cma_dev) return -ENODEV; found: cma_attach_to_dev(id_priv, cma_dev); addr = (struct sockaddr_ib *) cma_src_addr(id_priv); memcpy(&addr->sib_addr, &sgid, sizeof sgid); cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); return 0; } static void cma_deref_id(struct rdma_id_private *id_priv) { if (atomic_dec_and_test(&id_priv->refcount)) complete(&id_priv->comp); } struct rdma_cm_id *rdma_create_id(struct vnet *net, rdma_cm_event_handler event_handler, void *context, enum rdma_port_space ps, enum ib_qp_type qp_type) { struct rdma_id_private *id_priv; #ifdef VIMAGE if (net == NULL) return ERR_PTR(-EINVAL); #endif id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); if (!id_priv) return ERR_PTR(-ENOMEM); id_priv->owner = task_pid_nr(current); id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); atomic_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); id_priv->id.route.addr.dev_addr.net = net; return &id_priv->id; } EXPORT_SYMBOL(rdma_create_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTS; qp_attr.sq_psn = 0; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); return ret; } static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; return ib_modify_qp(qp, &qp_attr, qp_attr_mask); } int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { struct rdma_id_private *id_priv; struct ib_qp *qp; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id->device != pd->device) return -EINVAL; qp_init_attr->port_num = id->port_num; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) return PTR_ERR(qp); if (id->qp_type == IB_QPT_UD) ret = cma_init_ud_qp(id_priv, qp); else ret = cma_init_conn_qp(id_priv, qp); if (ret) goto err; id->qp = qp; id_priv->qp_num = qp->qp_num; id_priv->srq = (qp->srq != NULL); return 0; err: ib_destroy_qp(qp); return ret; } EXPORT_SYMBOL(rdma_create_qp); void rdma_destroy_qp(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); ib_destroy_qp(id_priv->id.qp); id_priv->id.qp = NULL; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_destroy_qp); static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; union ib_gid sgid; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } /* Need to update QP attributes from default values. */ qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); if (ret) goto out; qp_attr.qp_state = IB_QPS_RTR; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num, qp_attr.ah_attr.grh.sgid_index, &sgid, NULL); if (ret) goto out; BUG_ON(id_priv->cma_dev->device != id_priv->id.device); if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_rts(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_RTS; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; if (conn_param) qp_attr.max_rd_atomic = conn_param->initiator_depth; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_err(struct rdma_id_private *id_priv) { struct ib_qp_attr qp_attr; int ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_ERR; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int ret; u16 pkey; if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num)) pkey = 0xffff; else pkey = ib_addr_get_pkey(dev_addr); ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, pkey, &qp_attr->pkey_index); if (ret) return ret; qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { ret = cma_set_qkey(id_priv, 0); if (ret) return ret; qp_attr->qkey = id_priv->qkey; *qp_attr_mask |= IB_QP_QKEY; } else { qp_attr->qp_access_flags = 0; *qp_attr_mask |= IB_QP_ACCESS_FLAGS; } return 0; } int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_id_private *id_priv; int ret = 0; id_priv = container_of(id, struct rdma_id_private, id); if (rdma_cap_ib_cm(id->device, id->port_num)) { if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; } else if (rdma_cap_iw_cm(id->device, id->port_num)) { if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, qp_attr_mask); qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask |= IB_QP_PORT; } else ret = -ENOSYS; return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); static inline int cma_zero_addr(struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_any(&((struct sockaddr_in6 *) addr)->sin6_addr); case AF_IB: return ib_addr_any(&((struct sockaddr_ib *) addr)->sib_addr); default: return 0; } } static inline int cma_loopback_addr(struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_loopback(&((struct sockaddr_in6 *) addr)->sin6_addr); case AF_IB: return ib_addr_loopback(&((struct sockaddr_ib *) addr)->sib_addr); default: return 0; } } static inline int cma_any_addr(struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) { if (src->sa_family != dst->sa_family) return -1; switch (src->sa_family) { case AF_INET: return ((struct sockaddr_in *) src)->sin_addr.s_addr != ((struct sockaddr_in *) dst)->sin_addr.s_addr; case AF_INET6: return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, &((struct sockaddr_in6 *) dst)->sin6_addr); default: return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, &((struct sockaddr_ib *) dst)->sib_addr); } } static __be16 cma_port(struct sockaddr *addr) { struct sockaddr_ib *sib; switch (addr->sa_family) { case AF_INET: return ((struct sockaddr_in *) addr)->sin_port; case AF_INET6: return ((struct sockaddr_in6 *) addr)->sin6_port; case AF_IB: sib = (struct sockaddr_ib *) addr; return htons((u16) (be64_to_cpu(sib->sib_sid) & be64_to_cpu(sib->sib_sid_mask))); default: return 0; } } static inline int cma_any_port(struct sockaddr *addr) { return !cma_port(addr); } static void cma_save_ib_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_cm_id *listen_id, struct ib_sa_path_rec *path) { struct sockaddr_ib *listen_ib, *ib; listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; if (src_addr) { ib = (struct sockaddr_ib *)src_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->sgid, 16); ib->sib_sid = path->service_id; ib->sib_scope_id = 0; } else { ib->sib_pkey = listen_ib->sib_pkey; ib->sib_flowinfo = listen_ib->sib_flowinfo; ib->sib_addr = listen_ib->sib_addr; ib->sib_sid = listen_ib->sib_sid; ib->sib_scope_id = listen_ib->sib_scope_id; } ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); } if (dst_addr) { ib = (struct sockaddr_ib *)dst_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->dgid, 16); } } } static void cma_save_ip4_info(struct sockaddr_in *src_addr, struct sockaddr_in *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in) { .sin_len = sizeof(struct sockaddr_in), .sin_family = AF_INET, .sin_addr.s_addr = hdr->dst_addr.ip4.addr, .sin_port = local_port, }; } if (dst_addr) { *dst_addr = (struct sockaddr_in) { .sin_len = sizeof(struct sockaddr_in), .sin_family = AF_INET, .sin_addr.s_addr = hdr->src_addr.ip4.addr, .sin_port = hdr->port, }; } } static void cma_ip6_clear_scope_id(struct in6_addr *addr) { /* make sure link local scope ID gets zeroed */ if (IN6_IS_SCOPE_LINKLOCAL(addr) || IN6_IS_ADDR_MC_INTFACELOCAL(addr)) { /* use byte-access to be alignment safe */ addr->s6_addr[2] = 0; addr->s6_addr[3] = 0; } } static void cma_save_ip6_info(struct sockaddr_in6 *src_addr, struct sockaddr_in6 *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in6) { .sin6_len = sizeof(struct sockaddr_in6), .sin6_family = AF_INET6, .sin6_addr = hdr->dst_addr.ip6, .sin6_port = local_port, }; cma_ip6_clear_scope_id(&src_addr->sin6_addr); } if (dst_addr) { *dst_addr = (struct sockaddr_in6) { .sin6_len = sizeof(struct sockaddr_in6), .sin6_family = AF_INET6, .sin6_addr = hdr->src_addr.ip6, .sin6_port = hdr->port, }; cma_ip6_clear_scope_id(&dst_addr->sin6_addr); } } static u16 cma_port_from_service_id(__be64 service_id) { return (u16)be64_to_cpu(service_id); } static int sdp_save_ip_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct sdp_hh *hdr, __be64 service_id) { __be16 local_port; BUG_ON(src_addr == NULL || dst_addr == NULL); if (sdp_get_majv(hdr->majv_minv) != SDP_MAJ_VERSION) return -EINVAL; local_port = htons(cma_port_from_service_id(service_id)); switch (sdp_get_ip_ver(hdr)) { case 4: { struct sockaddr_in *s4, *d4; s4 = (void *)src_addr; d4 = (void *)dst_addr; *s4 = (struct sockaddr_in) { .sin_len = sizeof(*s4), .sin_family = AF_INET, .sin_addr.s_addr = hdr->dst_addr.ip4.addr, .sin_port = local_port, }; *d4 = (struct sockaddr_in) { .sin_len = sizeof(*d4), .sin_family = AF_INET, .sin_addr.s_addr = hdr->src_addr.ip4.addr, .sin_port = hdr->port, }; break; } case 6: { struct sockaddr_in6 *s6, *d6; s6 = (void *)src_addr; d6 = (void *)dst_addr; *s6 = (struct sockaddr_in6) { .sin6_len = sizeof(*s6), .sin6_family = AF_INET6, .sin6_addr = hdr->dst_addr.ip6, .sin6_port = local_port, }; *d6 = (struct sockaddr_in6) { .sin6_len = sizeof(*d6), .sin6_family = AF_INET6, .sin6_addr = hdr->src_addr.ip6, .sin6_port = hdr->port, }; cma_ip6_clear_scope_id(&s6->sin6_addr); cma_ip6_clear_scope_id(&d6->sin6_addr); break; } default: return -EAFNOSUPPORT; } return 0; } static int cma_save_ip_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, struct ib_cm_event *ib_event, __be64 service_id) { struct cma_hdr *hdr; __be16 port; if (rdma_ps_from_service_id(service_id) == RDMA_PS_SDP) return sdp_save_ip_info(src_addr, dst_addr, ib_event->private_data, service_id); hdr = ib_event->private_data; if (hdr->cma_version != CMA_VERSION) return -EINVAL; port = htons(cma_port_from_service_id(service_id)); switch (cma_get_ip_ver(hdr)) { case 4: cma_save_ip4_info((struct sockaddr_in *)src_addr, (struct sockaddr_in *)dst_addr, hdr, port); break; case 6: cma_save_ip6_info((struct sockaddr_in6 *)src_addr, (struct sockaddr_in6 *)dst_addr, hdr, port); break; default: return -EAFNOSUPPORT; } return 0; } static int cma_save_net_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_cm_id *listen_id, struct ib_cm_event *ib_event, sa_family_t sa_family, __be64 service_id) { if (sa_family == AF_IB) { if (ib_event->event == IB_CM_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, ib_event->param.req_rcvd.primary_path); else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, NULL); return 0; } return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id); } static int cma_save_req_info(const struct ib_cm_event *ib_event, struct cma_req_info *req) { const struct ib_cm_req_event_param *req_param = &ib_event->param.req_rcvd; const struct ib_cm_sidr_req_event_param *sidr_param = &ib_event->param.sidr_req_rcvd; switch (ib_event->event) { case IB_CM_REQ_RECEIVED: req->device = req_param->listen_id->device; req->port = req_param->port; memcpy(&req->local_gid, &req_param->primary_path->sgid, sizeof(req->local_gid)); req->has_gid = true; req->service_id = req_param->primary_path->service_id; req->pkey = be16_to_cpu(req_param->primary_path->pkey); if (req->pkey != req_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", req_param->bth_pkey, req->pkey); break; case IB_CM_SIDR_REQ_RECEIVED: req->device = sidr_param->listen_id->device; req->port = sidr_param->port; req->has_gid = false; req->service_id = sidr_param->service_id; req->pkey = sidr_param->pkey; if (req->pkey != sidr_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", sidr_param->bth_pkey, req->pkey); break; default: return -EINVAL; } return 0; } -static bool validate_ipv4_net_dev(struct net_device *net_dev, +static bool validate_ipv4_net_dev(struct ifnet *net_dev, const struct sockaddr_in *dst_addr, const struct sockaddr_in *src_addr) { #ifdef INET __be32 daddr = dst_addr->sin_addr.s_addr, saddr = src_addr->sin_addr.s_addr; - struct net_device *dst_dev; + struct ifnet *dst_dev; struct nhop_object *nh; bool ret; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) || ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) || ipv4_is_loopback(saddr)) return false; dst_dev = ip_ifp_find(net_dev->if_vnet, daddr); if (dst_dev != net_dev) { if (dst_dev != NULL) dev_put(dst_dev); return false; } dev_put(dst_dev); /* * Check for loopback. */ if (saddr == daddr) return true; CURVNET_SET(net_dev->if_vnet); nh = fib4_lookup(RT_DEFAULT_FIB, src_addr->sin_addr, 0, NHR_NONE, 0); if (nh != NULL) ret = (nh->nh_ifp == net_dev); else ret = false; CURVNET_RESTORE(); return ret; #else return false; #endif } -static bool validate_ipv6_net_dev(struct net_device *net_dev, +static bool validate_ipv6_net_dev(struct ifnet *net_dev, const struct sockaddr_in6 *dst_addr, const struct sockaddr_in6 *src_addr) { #ifdef INET6 struct sockaddr_in6 src_tmp = *src_addr; struct sockaddr_in6 dst_tmp = *dst_addr; - struct net_device *dst_dev; + struct ifnet *dst_dev; struct nhop_object *nh; bool ret; dst_dev = ip6_ifp_find(net_dev->if_vnet, dst_tmp.sin6_addr, net_dev->if_index); if (dst_dev != net_dev) { if (dst_dev != NULL) dev_put(dst_dev); return false; } dev_put(dst_dev); CURVNET_SET(net_dev->if_vnet); /* * Make sure the scope ID gets embedded. */ src_tmp.sin6_scope_id = net_dev->if_index; sa6_embedscope(&src_tmp, 0); dst_tmp.sin6_scope_id = net_dev->if_index; sa6_embedscope(&dst_tmp, 0); /* * Check for loopback after scope ID * has been embedded: */ if (memcmp(&src_tmp.sin6_addr, &dst_tmp.sin6_addr, sizeof(dst_tmp.sin6_addr)) == 0) { ret = true; } else { /* non-loopback case */ nh = fib6_lookup(RT_DEFAULT_FIB, &src_addr->sin6_addr, net_dev->if_index, NHR_NONE, 0); if (nh != NULL) ret = (nh->nh_ifp == net_dev); else ret = false; } CURVNET_RESTORE(); return ret; #else return false; #endif } -static bool validate_net_dev(struct net_device *net_dev, +static bool validate_net_dev(struct ifnet *net_dev, const struct sockaddr *daddr, const struct sockaddr *saddr) { const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr; const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr; const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr; switch (daddr->sa_family) { case AF_INET: return saddr->sa_family == AF_INET && validate_ipv4_net_dev(net_dev, daddr4, saddr4); case AF_INET6: return saddr->sa_family == AF_INET6 && validate_ipv6_net_dev(net_dev, daddr6, saddr6); default: return false; } } -static struct net_device * +static struct ifnet * roce_get_net_dev_by_cm_event(struct ib_device *device, u8 port_num, const struct ib_cm_event *ib_event) { struct ib_gid_attr sgid_attr; union ib_gid sgid; int err = -EINVAL; if (ib_event->event == IB_CM_REQ_RECEIVED) { err = ib_get_cached_gid(device, port_num, ib_event->param.req_rcvd.ppath_sgid_index, &sgid, &sgid_attr); } else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { err = ib_get_cached_gid(device, port_num, ib_event->param.sidr_req_rcvd.sgid_index, &sgid, &sgid_attr); } if (err) return (NULL); return (sgid_attr.ndev); } -static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event, +static struct ifnet *cma_get_net_dev(struct ib_cm_event *ib_event, const struct cma_req_info *req) { struct sockaddr_storage listen_addr_storage, src_addr_storage; struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage, *src_addr = (struct sockaddr *)&src_addr_storage; - struct net_device *net_dev; + struct ifnet *net_dev; const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL; struct epoch_tracker et; int err; err = cma_save_ip_info(listen_addr, src_addr, ib_event, req->service_id); if (err) return ERR_PTR(err); if (rdma_protocol_roce(req->device, req->port)) { net_dev = roce_get_net_dev_by_cm_event(req->device, req->port, ib_event); } else { net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey, gid, listen_addr); } if (!net_dev) return ERR_PTR(-ENODEV); NET_EPOCH_ENTER(et); if (!validate_net_dev(net_dev, listen_addr, src_addr)) { NET_EPOCH_EXIT(et); dev_put(net_dev); return ERR_PTR(-EHOSTUNREACH); } NET_EPOCH_EXIT(et); return net_dev; } static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id) { return (be64_to_cpu(service_id) >> 16) & 0xffff; } static bool sdp_match_private_data(struct rdma_id_private *id_priv, const struct sdp_hh *hdr, struct sockaddr *addr) { __be32 ip4_addr; struct in6_addr ip6_addr; switch (addr->sa_family) { case AF_INET: ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr; if (sdp_get_ip_ver(hdr) != 4) return false; if (!cma_any_addr(addr) && hdr->dst_addr.ip4.addr != ip4_addr) return false; break; case AF_INET6: ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr; if (sdp_get_ip_ver(hdr) != 6) return false; cma_ip6_clear_scope_id(&ip6_addr); if (!cma_any_addr(addr) && memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr))) return false; break; case AF_IB: return true; default: return false; } return true; } static bool cma_match_private_data(struct rdma_id_private *id_priv, const void *vhdr) { const struct cma_hdr *hdr = vhdr; struct sockaddr *addr = cma_src_addr(id_priv); __be32 ip4_addr; struct in6_addr ip6_addr; if (cma_any_addr(addr) && !id_priv->afonly) return true; if (id_priv->id.ps == RDMA_PS_SDP) return sdp_match_private_data(id_priv, vhdr, addr); switch (addr->sa_family) { case AF_INET: ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr; if (cma_get_ip_ver(hdr) != 4) return false; if (!cma_any_addr(addr) && hdr->dst_addr.ip4.addr != ip4_addr) return false; break; case AF_INET6: ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr; if (cma_get_ip_ver(hdr) != 6) return false; cma_ip6_clear_scope_id(&ip6_addr); if (!cma_any_addr(addr) && memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr))) return false; break; case AF_IB: return true; default: return false; } return true; } static bool cma_protocol_roce_dev_port(struct ib_device *device, int port_num) { enum rdma_link_layer ll = rdma_port_get_link_layer(device, port_num); enum rdma_transport_type transport = rdma_node_get_transport(device->node_type); return ll == IB_LINK_LAYER_ETHERNET && transport == RDMA_TRANSPORT_IB; } static bool cma_protocol_roce(const struct rdma_cm_id *id) { struct ib_device *device = id->device; const int port_num = id->port_num ?: rdma_start_port(device); return cma_protocol_roce_dev_port(device, port_num); } static bool cma_match_net_dev(const struct rdma_cm_id *id, - const struct net_device *net_dev, + const struct ifnet *net_dev, u8 port_num) { const struct rdma_addr *addr = &id->route.addr; if (!net_dev) { if (id->port_num && id->port_num != port_num) return false; if (id->ps == RDMA_PS_SDP) { if (addr->src_addr.ss_family == AF_INET || addr->src_addr.ss_family == AF_INET6) return true; return false; } /* This request is an AF_IB request or a RoCE request */ return addr->src_addr.ss_family == AF_IB || cma_protocol_roce_dev_port(id->device, port_num); } return !addr->dev_addr.bound_dev_if || (net_eq(dev_net(net_dev), addr->dev_addr.net) && addr->dev_addr.bound_dev_if == net_dev->if_index); } static struct rdma_id_private *cma_find_listener( const struct rdma_bind_list *bind_list, const struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, const struct cma_req_info *req, - const struct net_device *net_dev) + const struct ifnet *net_dev) { struct rdma_id_private *id_priv, *id_priv_dev; if (!bind_list) return ERR_PTR(-EINVAL); hlist_for_each_entry(id_priv, &bind_list->owners, node) { if (cma_match_private_data(id_priv, ib_event->private_data)) { if (id_priv->id.device == cm_id->device && cma_match_net_dev(&id_priv->id, net_dev, req->port)) return id_priv; list_for_each_entry(id_priv_dev, &id_priv->listen_list, listen_list) { if (id_priv_dev->id.device == cm_id->device && cma_match_net_dev(&id_priv_dev->id, net_dev, req->port)) return id_priv_dev; } } } return ERR_PTR(-EINVAL); } static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event, - struct net_device **net_dev) + struct ifnet **net_dev) { struct cma_req_info req; struct rdma_bind_list *bind_list; struct rdma_id_private *id_priv; int err; err = cma_save_req_info(ib_event, &req); if (err) return ERR_PTR(err); if (rdma_ps_from_service_id(cm_id->service_id) == RDMA_PS_SDP) { *net_dev = NULL; goto there_is_no_net_dev; } *net_dev = cma_get_net_dev(ib_event, &req); if (IS_ERR(*net_dev)) { if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */ *net_dev = NULL; } else { return ERR_CAST(*net_dev); } } there_is_no_net_dev: bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net, rdma_ps_from_service_id(req.service_id), cma_port_from_service_id(req.service_id)); id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev); if (IS_ERR(id_priv) && *net_dev) { dev_put(*net_dev); *net_dev = NULL; } return id_priv; } static inline int cma_user_data_offset(struct rdma_id_private *id_priv) { if (cma_family(id_priv) == AF_IB) return 0; if (id_priv->id.ps == RDMA_PS_SDP) return 0; return sizeof(struct cma_hdr); } static void cma_cancel_route(struct rdma_id_private *id_priv) { if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) { if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); } } static void cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. */ mutex_lock(&lock); list_del(&id_priv->list); while (!list_empty(&id_priv->listen_list)) { dev_id_priv = list_entry(id_priv->listen_list.next, struct rdma_id_private, listen_list); /* sync with device removal to avoid duplicate destruction */ list_del_init(&dev_id_priv->list); list_del(&dev_id_priv->listen_list); mutex_unlock(&lock); rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } mutex_unlock(&lock); } static void cma_cancel_operation(struct rdma_id_private *id_priv, enum rdma_cm_state state) { switch (state) { case RDMA_CM_ADDR_QUERY: rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: cma_cancel_route(id_priv); break; case RDMA_CM_LISTEN: if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: break; } } static void cma_release_port(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list = id_priv->bind_list; struct vnet *net = id_priv->id.route.addr.dev_addr.net; if (!bind_list) return; mutex_lock(&lock); hlist_del(&id_priv->node); if (hlist_empty(&bind_list->owners)) { cma_ps_remove(net, bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { mc = container_of(id_priv->mc_list.next, struct cma_multicast, list); list_del(&mc->list); if (rdma_cap_ib_mcast(id_priv->cma_dev->device, id_priv->id.port_num)) { ib_sa_free_multicast(mc->multicast.ib); kfree(mc); } else { if (mc->igmp_joined) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; - struct net_device *ndev = NULL; + struct ifnet *ndev = NULL; if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (ndev) { cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, false); dev_put(ndev); } } kref_put(&mc->mcref, release_mc); } } } void rdma_destroy_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; enum rdma_cm_state state; id_priv = container_of(id, struct rdma_id_private, id); state = cma_exch(id_priv, RDMA_CM_DESTROYING); cma_cancel_operation(id_priv, state); /* * Wait for any active callback to finish. New callbacks will find * the id_priv state set to destroying and abort. */ mutex_lock(&id_priv->handler_mutex); mutex_unlock(&id_priv->handler_mutex); if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) ib_destroy_cm_id(id_priv->cm_id.ib); } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.iw) iw_destroy_cm_id(id_priv->cm_id.iw); } cma_leave_mc_groups(id_priv); cma_release_dev(id_priv); } cma_release_port(id_priv); cma_deref_id(id_priv); wait_for_completion(&id_priv->comp); if (id_priv->internal_id) cma_deref_id(id_priv->id.context); kfree(id_priv->id.route.path_rec); kfree(id_priv); } EXPORT_SYMBOL(rdma_destroy_id); static int cma_rep_recv(struct rdma_id_private *id_priv) { int ret; ret = cma_modify_qp_rtr(id_priv, NULL); if (ret) goto reject; ret = cma_modify_qp_rts(id_priv, NULL); if (ret) goto reject; ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; } static int sdp_verify_rep(const struct sdp_hah *data) { if (sdp_get_majv(data->majv_minv) != SDP_MAJ_VERSION) return -EINVAL; return 0; } static void cma_set_rep_event_data(struct rdma_cm_event *event, struct ib_cm_rep_event_param *rep_data, void *private_data) { event->param.conn.private_data = private_data; event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; event->param.conn.responder_resources = rep_data->responder_resources; event->param.conn.initiator_depth = rep_data->initiator_depth; event->param.conn.flow_control = rep_data->flow_control; event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; } static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event; int ret = 0; mutex_lock(&id_priv->handler_mutex); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && id_priv->state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && id_priv->state != RDMA_CM_DISCONNECT)) goto out; memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_REQ_ERROR: case IB_CM_REP_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: if (id_priv->id.ps == RDMA_PS_SDP) { event.status = sdp_verify_rep(ib_event->private_data); if (event.status) event.event = RDMA_CM_EVENT_CONNECT_ERROR; else event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; } else { if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR : RDMA_CM_EVENT_ESTABLISHED; } else { event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; } } cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, ib_event->private_data); break; case IB_CM_RTU_RECEIVED: case IB_CM_USER_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR: event.status = -ETIMEDOUT; /* fall through */ case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_DISCONNECT)) goto out; event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IB_CM_TIMEWAIT_EXIT: event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; break; case IB_CM_MRA_RECEIVED: /* ignore event */ goto out; case IB_CM_REJ_RECEIVED: cma_modify_qp_err(id_priv); event.status = ib_event->param.rej_rcvd.reason; event.event = RDMA_CM_EVENT_REJECTED; event.param.conn.private_data = ib_event->private_data; event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, struct ib_cm_event *ib_event, - struct net_device *net_dev) + struct ifnet *net_dev) { struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; const __be64 service_id = ib_event->param.req_rcvd.primary_path->service_id; int ret; id = rdma_create_id(listen_id->route.addr.dev_addr.net, listen_id->event_handler, listen_id->context, listen_id->ps, ib_event->param.req_rcvd.qp_type); if (IS_ERR(id)) return NULL; id_priv = container_of(id, struct rdma_id_private, id); if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, service_id)) goto err; rt = &id->route; rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths, GFP_KERNEL); if (!rt->path_rec) goto err; rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path; if (rt->num_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { ret = rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL); if (ret) goto err; } else { if (!cma_protocol_roce(listen_id) && cma_any_addr(cma_src_addr(id_priv))) { rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); } else if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); if (ret) goto err; } } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, struct ib_cm_event *ib_event, - struct net_device *net_dev) + struct ifnet *net_dev) { struct rdma_id_private *id_priv; struct rdma_cm_id *id; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct vnet *net = listen_id->route.addr.dev_addr.net; int ret; id = rdma_create_id(net, listen_id->event_handler, listen_id->context, listen_id->ps, IB_QPT_UD); if (IS_ERR(id)) return NULL; id_priv = container_of(id, struct rdma_id_private, id); if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, ib_event->param.sidr_req_rcvd.service_id)) goto err; if (net_dev) { ret = rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL); if (ret) goto err; } else { if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr); if (ret) goto err; } } id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static void cma_set_req_event_data(struct rdma_cm_event *event, struct ib_cm_req_event_param *req_data, void *private_data, int offset) { event->param.conn.private_data = (char *)private_data + offset; event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; event->param.conn.responder_resources = req_data->responder_resources; event->param.conn.initiator_depth = req_data->initiator_depth; event->param.conn.flow_control = req_data->flow_control; event->param.conn.retry_count = req_data->retry_count; event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; } static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_event) { return (((ib_event->event == IB_CM_REQ_RECEIVED) && (ib_event->param.req_rcvd.qp_type == id->qp_type)) || ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) && (id->qp_type == IB_QPT_UD)) || (!id->qp_type)); } static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event; - struct net_device *net_dev; + struct ifnet *net_dev; int offset, ret; listen_id = cma_id_from_event(cm_id, ib_event, &net_dev); if (IS_ERR(listen_id)) return PTR_ERR(listen_id); if (!cma_check_req_qp_type(&listen_id->id, ib_event)) { ret = -EINVAL; goto net_dev_put; } mutex_lock(&listen_id->handler_mutex); if (listen_id->state != RDMA_CM_LISTEN) { ret = -ECONNABORTED; goto err1; } memset(&event, 0, sizeof event); offset = cma_user_data_offset(listen_id); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev); event.param.ud.private_data = (char *)ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } if (!conn_id) { ret = -ENOMEM; goto err1; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); ret = cma_acquire_dev(conn_id, listen_id); if (ret) goto err2; conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; /* * Protect against the user destroying conn_id from another thread * until we're done accessing it. */ atomic_inc(&conn_id->refcount); ret = conn_id->id.event_handler(&conn_id->id, &event); if (ret) goto err3; /* * Acquire mutex to prevent user executing rdma_destroy_id() * while we're accessing the cm_id. */ mutex_lock(&lock); if (cma_comp(conn_id, RDMA_CM_CONNECT) && (conn_id->id.qp_type != IB_QPT_UD)) ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); mutex_unlock(&lock); mutex_unlock(&conn_id->handler_mutex); mutex_unlock(&listen_id->handler_mutex); cma_deref_id(conn_id); if (net_dev) dev_put(net_dev); return 0; err3: cma_deref_id(conn_id); /* Destroy the CM ID by returning a non-zero value. */ conn_id->cm_id.ib = NULL; err2: cma_exch(conn_id, RDMA_CM_DESTROYING); mutex_unlock(&conn_id->handler_mutex); err1: mutex_unlock(&listen_id->handler_mutex); if (conn_id) rdma_destroy_id(&conn_id->id); net_dev_put: if (net_dev) dev_put(net_dev); return ret; } __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) { if (addr->sa_family == AF_IB) return ((struct sockaddr_ib *) addr)->sib_sid; return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr))); } EXPORT_SYMBOL(rdma_get_service_id); static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; struct rdma_cm_event event; int ret = 0; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; mutex_lock(&id_priv->handler_mutex); if (id_priv->state != RDMA_CM_CONNECT) goto out; memset(&event, 0, sizeof event); switch (iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY: memcpy(cma_src_addr(id_priv), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(id_priv), raddr, rdma_addr_size(raddr)); switch (iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; case -ECONNRESET: case -ECONNREFUSED: event.event = RDMA_CM_EVENT_REJECTED; break; case -ETIMEDOUT: event.event = RDMA_CM_EVENT_UNREACHABLE; break; default: event.event = RDMA_CM_EVENT_CONNECT_ERROR; break; } break; case IW_CM_EVENT_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; default: BUG_ON(1); } event.status = iw_event->status; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event; int ret = -ECONNABORTED; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; listen_id = cm_id->context; mutex_lock(&listen_id->handler_mutex); if (listen_id->state != RDMA_CM_LISTEN) goto out; /* Create a new RDMA id for the new IW CM ID */ new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net, listen_id->id.event_handler, listen_id->id.context, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(new_cm_id)) { ret = -ENOMEM; goto out; } conn_id = container_of(new_cm_id, struct rdma_id_private, id); mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } ret = cma_acquire_dev(conn_id, listen_id); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } conn_id->cm_id.iw = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; /* * Protect against the user destroying conn_id from another thread * until we're done accessing it. */ atomic_inc(&conn_id->refcount); ret = conn_id->id.event_handler(&conn_id->id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; cma_exch(conn_id, RDMA_CM_DESTROYING); mutex_unlock(&conn_id->handler_mutex); cma_deref_id(conn_id); rdma_destroy_id(&conn_id->id); goto out; } mutex_unlock(&conn_id->handler_mutex); cma_deref_id(conn_id); out: mutex_unlock(&listen_id->handler_mutex); return ret; } static int cma_ib_listen(struct rdma_id_private *id_priv) { struct sockaddr *addr; struct ib_cm_id *id; __be64 svc_id; addr = cma_src_addr(id_priv); svc_id = rdma_get_service_id(&id_priv->id, addr); id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id); if (IS_ERR(id)) return PTR_ERR(id); id_priv->cm_id.ib = id; return 0; } static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; struct iw_cm_id *id; id = iw_create_cm_id(id_priv->id.device, iw_conn_req_handler, id_priv); if (IS_ERR(id)) return PTR_ERR(id); id->tos = id_priv->tos; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); ret = iw_cm_listen(id_priv->cm_id.iw, backlog); if (ret) { iw_destroy_cm_id(id_priv->cm_id.iw); id_priv->cm_id.iw = NULL; } return ret; } static int cma_listen_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rdma_id_private *id_priv = id->context; id->context = id_priv->id.context; id->event_handler = id_priv->id.event_handler; return id_priv->id.event_handler(id, event); } static void cma_listen_on_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { struct rdma_id_private *dev_id_priv; struct rdma_cm_id *id; struct vnet *net = id_priv->id.route.addr.dev_addr.net; int ret; if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) return; id = rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, id_priv->id.qp_type); if (IS_ERR(id)) return; dev_id_priv = container_of(id, struct rdma_id_private, id); dev_id_priv->state = RDMA_CM_ADDR_BOUND; memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); _cma_attach_to_dev(dev_id_priv, cma_dev); list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); atomic_inc(&id_priv->refcount); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; ret = rdma_listen(id, id_priv->backlog); if (ret) pr_warn("RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n", ret, cma_dev->device->name); } static void cma_listen_on_all(struct rdma_id_private *id_priv) { struct cma_device *cma_dev; mutex_lock(&lock); list_add_tail(&id_priv->list, &listen_any_list); list_for_each_entry(cma_dev, &dev_list, list) cma_listen_on_dev(id_priv, cma_dev); mutex_unlock(&lock); } void rdma_set_service_type(struct rdma_cm_id *id, int tos) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); id_priv->tos = (u8) tos; } EXPORT_SYMBOL(rdma_set_service_type); static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, void *context) { struct cma_work *work = context; struct rdma_route *route; route = &work->id->id.route; if (!status) { route->num_paths = 1; *route->path_rec = *path_rec; } else { work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; work->event.status = status; } queue_work(cma_wq, &work->work); } static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, struct cma_work *work) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct ib_sa_path_rec path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; struct sockaddr_ib *sib; memset(&path_rec, 0, sizeof path_rec); rdma_addr_get_sgid(dev_addr, &path_rec.sgid); rdma_addr_get_dgid(dev_addr, &path_rec.dgid); path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; switch (cma_family(id_priv)) { case AF_INET: path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; break; case AF_INET6: sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; case AF_IB: sib = (struct sockaddr_ib *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, id_priv->id.port_num, &path_rec, comp_mask, timeout_ms, GFP_KERNEL, cma_query_handler, work, &id_priv->query); return (id_priv->query_id < 0) ? id_priv->query_id : 0; } static void cma_work_handler(struct work_struct *_work) { struct cma_work *work = container_of(_work, struct cma_work, work); struct rdma_id_private *id_priv = work->id; int destroy = 0; mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) goto out; if (id_priv->id.event_handler(&id_priv->id, &work->event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); destroy = 1; } out: mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); if (destroy) rdma_destroy_id(&id_priv->id); kfree(work); } static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } ret = cma_query_ib_route(id_priv, timeout_ms, work); if (ret) goto err2; return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } int rdma_set_ib_paths(struct rdma_cm_id *id, struct ib_sa_path_rec *path_rec, int num_paths) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths, GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } id->route.num_paths = num_paths; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_set_ib_paths); static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) { struct cma_work *work; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; queue_work(cma_wq, &work->work); return 0; } -static int iboe_tos_to_sl(struct net_device *ndev, int tos) +static int iboe_tos_to_sl(struct ifnet *ndev, int tos) { /* get service level, SL, from IPv4 type of service, TOS */ int sl = (tos >> 5) & 0x7; /* final mappings are done by the vendor specific drivers */ return sl; } static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, unsigned long supported_gids, enum ib_gid_type default_gid) { if ((network_type == RDMA_NETWORK_IPV4 || network_type == RDMA_NETWORK_IPV6) && test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) return IB_GID_TYPE_ROCE_UDP_ENCAP; return default_gid; } static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; - struct net_device *ndev = NULL; + struct ifnet *ndev = NULL; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } route->num_paths = 1; if (addr->dev_addr.bound_dev_if) { unsigned long supported_gids; ndev = dev_get_by_index(addr->dev_addr.net, addr->dev_addr.bound_dev_if); if (!ndev) { ret = -ENODEV; goto err2; } route->path_rec->net = ndev->if_vnet; route->path_rec->ifindex = ndev->if_index; supported_gids = roce_gid_type_mask_support(id_priv->id.device, id_priv->id.port_num); route->path_rec->gid_type = cma_route_gid_type(addr->dev_addr.network, supported_gids, id_priv->gid_type); } if (!ndev) { ret = -ENODEV; goto err2; } memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); /* Use the hint from IP Stack to select GID Type */ if (route->path_rec->gid_type < ib_network_to_gid_type(addr->dev_addr.network)) route->path_rec->gid_type = ib_network_to_gid_type(addr->dev_addr.network); if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) /* TODO: get the hoplimit from the inet/inet6 device */ route->path_rec->hop_limit = addr->dev_addr.hoplimit; else route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; route->path_rec->sl = iboe_tos_to_sl(ndev, id_priv->tos); route->path_rec->traffic_class = id_priv->tos; route->path_rec->mtu = iboe_get_mtu(ndev->if_mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = iboe_get_rate(ndev); dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; work->event.status = 0; queue_work(cma_wq, &work->work); return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); if (rdma_cap_ib_sa(id->device, id->port_num)) ret = cma_resolve_ib_route(id_priv, timeout_ms); else if (rdma_protocol_roce(id->device, id->port_num)) ret = cma_resolve_iboe_route(id_priv); else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv, timeout_ms); else ret = -ENOSYS; if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); static void cma_set_loopback(struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); break; case AF_INET6: ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr, 0, 0, 0, htonl(1)); break; default: ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr, 0, 0, 0, htonl(1)); break; } } static int cma_bind_loopback(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; struct ib_port_attr port_attr; union ib_gid gid; u16 pkey; int ret; u8 p; cma_dev = NULL; mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cur_dev->device, 1)) continue; if (!cma_dev) cma_dev = cur_dev; for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { if (!ib_query_port(cur_dev->device, p, &port_attr) && port_attr.state == IB_PORT_ACTIVE) { cma_dev = cur_dev; goto port_found; } } } if (!cma_dev) { ret = -ENODEV; goto out; } p = 1; port_found: ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid, NULL); if (ret) goto out; ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); if (ret) goto out; id_priv->id.route.addr.dev_addr.dev_type = (rdma_protocol_ib(cma_dev->device, p)) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); cma_set_loopback(cma_src_addr(id_priv)); out: mutex_unlock(&lock); return ret; } static void addr_handler(int status, struct sockaddr *src_addr, struct rdma_dev_addr *dev_addr, void *context) { struct rdma_id_private *id_priv = context; struct rdma_cm_event event; memset(&event, 0, sizeof event); mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) goto out; memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) status = cma_acquire_dev(id_priv, NULL); if (status) { if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ADDR_BOUND)) goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; if (id_priv->id.event_handler(&id_priv->id, &event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); rdma_destroy_id(&id_priv->id); return; } out: mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; union ib_gid gid; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_bind_loopback(id_priv); if (ret) goto err; } rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); return 0; err: kfree(work); return ret; } static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) { struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_resolve_ib_dev(id_priv); if (ret) goto err; } rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); return 0; err: kfree(work); return ret; } static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr) { if (!src_addr || !src_addr->sa_family) { src_addr = (struct sockaddr *) &id->route.addr.src_addr; src_addr->sa_family = dst_addr->sa_family; if (dst_addr->sa_family == AF_INET6) { struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr; struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr; src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; if (IN6_IS_SCOPE_LINKLOCAL(&dst_addr6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&dst_addr6->sin6_addr)) id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id; } else if (dst_addr->sa_family == AF_IB) { ((struct sockaddr_ib *) src_addr)->sib_pkey = ((struct sockaddr_ib *) dst_addr)->sib_pkey; } } return rdma_bind_addr(id, src_addr); } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr, int timeout_ms) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id_priv->state == RDMA_CM_IDLE) { ret = cma_bind_addr(id, src_addr, dst_addr); if (ret) return ret; } if (cma_family(id_priv) != dst_addr->sa_family) return -EINVAL; if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); } else { if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { ret = cma_check_linklocal(&id->route.addr.dev_addr, dst_addr); if (ret) goto err; ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv), dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, id_priv); } } if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if (reuse || id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_reuseaddr); int rdma_set_afonly(struct rdma_cm_id *id, int afonly) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) { id_priv->options |= (1 << CMA_OPTION_AFONLY); id_priv->afonly = afonly; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_afonly); static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct sockaddr *addr; struct sockaddr_ib *sib; u64 sid, mask; __be16 port; addr = cma_src_addr(id_priv); port = htons(bind_list->port); switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_port = port; break; case AF_INET6: ((struct sockaddr_in6 *) addr)->sin6_port = port; break; case AF_IB: sib = (struct sockaddr_ib *) addr; sid = be64_to_cpu(sib->sib_sid); mask = be64_to_cpu(sib->sib_sid_mask); sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port)); sib->sib_sid_mask = cpu_to_be64(~0ULL); break; } id_priv->bind_list = bind_list; hlist_add_head(&id_priv->node, &bind_list->owners); } static int cma_alloc_port(enum rdma_port_space ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; int ret; bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list, snum); if (ret < 0) goto err; bind_list->ps = ps; bind_list->port = (unsigned short)ret; cma_bind_port(bind_list, id_priv); return 0; err: kfree(bind_list); return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; } static int cma_alloc_any_port(enum rdma_port_space ps, struct rdma_id_private *id_priv) { static unsigned int last_used_port; int low, high, remaining; unsigned int rover; struct vnet *net = id_priv->id.route.addr.dev_addr.net; u32 rand; inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; get_random_bytes(&rand, sizeof(rand)); rover = rand % remaining + low; retry: if (last_used_port != rover && !cma_ps_find(net, ps, (unsigned short)rover)) { int ret = cma_alloc_port(ps, id_priv, rover); /* * Remember previously used port number in order to avoid * re-using same port immediately after it is closed. */ if (!ret) last_used_port = rover; if (ret != -EADDRNOTAVAIL) return ret; } if (--remaining) { rover++; if ((rover < low) || (rover > high)) rover = low; goto retry; } return -EADDRNOTAVAIL; } /* * Check that the requested port is available. This is called when trying to * bind to a specific port, or when trying to listen on a bound port. In * the latter case, the provided id_priv may already be on the bind_list, but * we still need to check that it's okay to start listening. */ static int cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv, uint8_t reuseaddr) { struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; addr = cma_src_addr(id_priv); hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr && cur_id->reuseaddr) continue; cur_addr = cma_src_addr(cur_id); if (id_priv->afonly && cur_id->afonly && (addr->sa_family != cur_addr->sa_family)) continue; if (cma_any_addr(addr) || cma_any_addr(cur_addr)) return -EADDRNOTAVAIL; if (!cma_addr_cmp(addr, cur_addr)) return -EADDRINUSE; } return 0; } static int cma_use_port(enum rdma_port_space ps, struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; unsigned short snum; int ret; snum = ntohs(cma_port(cma_src_addr(id_priv))); if (snum < IPPORT_RESERVED && priv_check(curthread, PRIV_NETINET_BINDANY) != 0) return -EACCES; bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, snum); } else { ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); if (!ret) cma_bind_port(bind_list, id_priv); } return ret; } static int cma_bind_listen(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list = id_priv->bind_list; int ret = 0; mutex_lock(&lock); if (bind_list->owners.first->next) ret = cma_check_port(bind_list, id_priv, 0); mutex_unlock(&lock); return ret; } static enum rdma_port_space cma_select_inet_ps( struct rdma_id_private *id_priv) { switch (id_priv->id.ps) { case RDMA_PS_TCP: case RDMA_PS_UDP: case RDMA_PS_IPOIB: case RDMA_PS_IB: case RDMA_PS_SDP: return id_priv->id.ps; default: return 0; } } static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv) { enum rdma_port_space ps = 0; struct sockaddr_ib *sib; u64 sid_ps, mask, sid; sib = (struct sockaddr_ib *) cma_src_addr(id_priv); mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK; sid = be64_to_cpu(sib->sib_sid) & mask; if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { sid_ps = RDMA_IB_IP_PS_IB; ps = RDMA_PS_IB; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && (sid == (RDMA_IB_IP_PS_TCP & mask))) { sid_ps = RDMA_IB_IP_PS_TCP; ps = RDMA_PS_TCP; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && (sid == (RDMA_IB_IP_PS_UDP & mask))) { sid_ps = RDMA_IB_IP_PS_UDP; ps = RDMA_PS_UDP; } if (ps) { sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib))); sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | be64_to_cpu(sib->sib_sid_mask)); } return ps; } static int cma_get_port(struct rdma_id_private *id_priv) { enum rdma_port_space ps; int ret; if (cma_family(id_priv) != AF_IB) ps = cma_select_inet_ps(id_priv); else ps = cma_select_ib_ps(id_priv); if (!ps) return -EPROTONOSUPPORT; mutex_lock(&lock); if (cma_any_port(cma_src_addr(id_priv))) ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); return ret; } static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, struct sockaddr *addr) { #ifdef INET6 struct sockaddr_in6 sin6; if (addr->sa_family != AF_INET6) return 0; sin6 = *(struct sockaddr_in6 *)addr; if (IN6_IS_SCOPE_LINKLOCAL(&sin6.sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6.sin6_addr)) { bool failure; CURVNET_SET_QUIET(dev_addr->net); failure = sa6_recoverscope(&sin6) || sin6.sin6_scope_id == 0; CURVNET_RESTORE(); /* check if IPv6 scope ID is not set */ if (failure) return -EINVAL; dev_addr->bound_dev_if = sin6.sin6_scope_id; } #endif return 0; } int rdma_listen(struct rdma_cm_id *id, int backlog) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id_priv->state == RDMA_CM_IDLE) { id->route.addr.src_addr.ss_family = AF_INET; ret = rdma_bind_addr(id, cma_src_addr(id_priv)); if (ret) return ret; } if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) return -EINVAL; if (id_priv->reuseaddr) { ret = cma_bind_listen(id_priv); if (ret) goto err; } id_priv->backlog = backlog; if (id->device) { if (rdma_cap_ib_cm(id->device, 1)) { ret = cma_ib_listen(id_priv); if (ret) goto err; } else if (rdma_cap_iw_cm(id->device, 1)) { ret = cma_iw_listen(id_priv, backlog); if (ret) goto err; } else { ret = -ENOSYS; goto err; } } else cma_listen_on_all(id_priv); return 0; err: id_priv->backlog = 0; cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_listen); int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; int ret; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && addr->sa_family != AF_IB) return -EAFNOSUPPORT; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL; ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); if (ret) goto err1; memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { ret = cma_translate_addr(addr, &id->route.addr.dev_addr); if (ret) goto err1; ret = cma_acquire_dev(id_priv, NULL); if (ret) goto err1; } if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { if (addr->sa_family == AF_INET) id_priv->afonly = 1; #ifdef INET6 else if (addr->sa_family == AF_INET6) { CURVNET_SET_QUIET(id_priv->id.route.addr.dev_addr.net); id_priv->afonly = V_ip6_v6only; CURVNET_RESTORE(); } #endif } ret = cma_get_port(id_priv); if (ret) goto err2; return 0; err2: if (id_priv->cma_dev) cma_release_dev(id_priv); err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; } EXPORT_SYMBOL(rdma_bind_addr); static int sdp_format_hdr(struct sdp_hh *sdp_hdr, struct rdma_id_private *id_priv) { /* * XXXCEM: CMA just sets the version itself rather than relying on * passed in packet to have the major version set. Should we? */ if (sdp_get_majv(sdp_hdr->majv_minv) != SDP_MAJ_VERSION) return -EINVAL; if (cma_family(id_priv) == AF_INET) { struct sockaddr_in *src4, *dst4; src4 = (struct sockaddr_in *) cma_src_addr(id_priv); dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv); sdp_set_ip_ver(sdp_hdr, 4); sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; sdp_hdr->port = src4->sin_port; } else if (cma_family(id_priv) == AF_INET6) { struct sockaddr_in6 *src6, *dst6; src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv); sdp_set_ip_ver(sdp_hdr, 6); sdp_hdr->src_addr.ip6 = src6->sin6_addr; sdp_hdr->dst_addr.ip6 = dst6->sin6_addr; sdp_hdr->port = src6->sin6_port; cma_ip6_clear_scope_id(&sdp_hdr->src_addr.ip6); cma_ip6_clear_scope_id(&sdp_hdr->dst_addr.ip6); } else return -EAFNOSUPPORT; return 0; } static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) { struct cma_hdr *cma_hdr; if (id_priv->id.ps == RDMA_PS_SDP) return sdp_format_hdr(hdr, id_priv); cma_hdr = hdr; cma_hdr->cma_version = CMA_VERSION; if (cma_family(id_priv) == AF_INET) { struct sockaddr_in *src4, *dst4; src4 = (struct sockaddr_in *) cma_src_addr(id_priv); dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv); cma_set_ip_ver(cma_hdr, 4); cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; cma_hdr->port = src4->sin_port; } else if (cma_family(id_priv) == AF_INET6) { struct sockaddr_in6 *src6, *dst6; src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv); cma_set_ip_ver(cma_hdr, 6); cma_hdr->src_addr.ip6 = src6->sin6_addr; cma_hdr->dst_addr.ip6 = dst6->sin6_addr; cma_hdr->port = src6->sin6_port; cma_ip6_clear_scope_id(&cma_hdr->src_addr.ip6); cma_ip6_clear_scope_id(&cma_hdr->dst_addr.ip6); } return 0; } static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event; struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret = 0; mutex_lock(&id_priv->handler_mutex); if (id_priv->state != RDMA_CM_CONNECT) goto out; memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_SIDR_REP_RECEIVED: event.param.ud.private_data = ib_event->private_data; event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; if (rep->status != IB_SIDR_SUCCESS) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = ib_event->param.sidr_rep_rcvd.status; break; } ret = cma_set_qkey(id_priv, rep->qkey); if (ret) { event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = ret; break; } ret = ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num, id_priv->id.route.path_rec, &event.param.ud.ah_attr); if (ret) { event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = ret; break; } event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; event.status = 0; break; default: pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_sidr_req_param req; struct ib_cm_id *id; void *private_data; int offset, ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); req.private_data_len = offset + conn_param->private_data_len; if (req.private_data_len < conn_param->private_data_len) return -EINVAL; if (req.private_data_len) { private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; } else { private_data = NULL; } if (conn_param->private_data && conn_param->private_data_len) memcpy((char *)private_data + offset, conn_param->private_data, conn_param->private_data_len); if (private_data) { ret = cma_format_hdr(private_data, id_priv); if (ret) goto out; req.private_data = private_data; } id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; req.path = id_priv->id.route.path_rec; req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } out: kfree(private_data); return ret; } static int cma_connect_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_req_param req; struct rdma_route *route; void *private_data; struct ib_cm_id *id; int offset, ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); req.private_data_len = offset + conn_param->private_data_len; if (req.private_data_len < conn_param->private_data_len) return -EINVAL; if (req.private_data_len) { private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; } else { private_data = NULL; } if (conn_param->private_data && conn_param->private_data_len) memcpy((char *)private_data + offset, conn_param->private_data, conn_param->private_data_len); id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; route = &id_priv->id.route; if (private_data) { ret = cma_format_hdr(private_data, id_priv); if (ret) goto out; req.private_data = private_data; } req.primary_path = &route->path_rec[0]; if (route->num_paths == 2) req.alternate_path = &route->path_rec[1]; req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.qp_num = id_priv->qp_num; req.qp_type = id_priv->id.qp_type; req.starting_psn = id_priv->seq_num; req.responder_resources = conn_param->responder_resources; req.initiator_depth = conn_param->initiator_depth; req.flow_control = conn_param->flow_control; req.retry_count = min_t(u8, 7, conn_param->retry_count); req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 1 : 0; ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: if (ret && !IS_ERR(id)) { ib_destroy_cm_id(id); id_priv->cm_id.ib = NULL; } kfree(private_data); return ret; } static int cma_connect_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_id *cm_id; int ret; struct iw_cm_conn_param iw_param; cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); cm_id->tos = id_priv->tos; id_priv->cm_id.iw = cm_id; memcpy(&cm_id->local_addr, cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv), rdma_addr_size(cma_dst_addr(id_priv))); ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; if (conn_param) { iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num; } else { memset(&iw_param, 0, sizeof iw_param); iw_param.qpn = id_priv->qp_num; } ret = iw_cm_connect(cm_id, &iw_param); out: if (ret) { iw_destroy_cm_id(cm_id); id_priv->cm_id.iw = NULL; } return ret; } int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); } else if (rdma_cap_iw_cm(id->device, id->port_num)) ret = cma_connect_iw(id_priv, conn_param); else ret = -ENOSYS; if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_connect); static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_rep_param rep; int ret; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; ret = cma_modify_qp_rts(id_priv, conn_param); if (ret) goto out; memset(&rep, 0, sizeof rep); rep.qp_num = id_priv->qp_num; rep.starting_psn = id_priv->seq_num; rep.private_data = conn_param->private_data; rep.private_data_len = conn_param->private_data_len; rep.responder_resources = conn_param->responder_resources; rep.initiator_depth = conn_param->initiator_depth; rep.failover_accepted = 0; rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 1 : 0; ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; } static int cma_accept_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_conn_param iw_param; int ret; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) return ret; iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; if (id_priv->id.qp) { iw_param.qpn = id_priv->qp_num; } else iw_param.qpn = conn_param->qp_num; return iw_cm_accept(id_priv->cm_id.iw, &iw_param); } static int cma_send_sidr_rep(struct rdma_id_private *id_priv, enum ib_cm_sidr_status status, u32 qkey, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; int ret; memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { ret = cma_set_qkey(id_priv, qkey); if (ret) return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; } rep.private_data = private_data; rep.private_data_len = private_data_len; return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); id_priv->owner = task_pid_nr(current); if (!cma_comp(id_priv, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp && conn_param) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { if (conn_param) ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, conn_param->qkey, conn_param->private_data, conn_param->private_data_len); else ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, 0, NULL, 0); } else { if (conn_param) ret = cma_accept_ib(id_priv, conn_param); else ret = cma_rep_recv(id_priv); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) ret = cma_accept_iw(id_priv, conn_param); else ret = -ENOSYS; if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); rdma_reject(id, NULL, 0); return ret; } EXPORT_SYMBOL(rdma_accept); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; switch (id->device->node_type) { case RDMA_NODE_IB_CA: ret = ib_cm_notify(id_priv->cm_id.ib, event); break; default: ret = 0; break; } return ret; } EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, u8 private_data_len) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, private_data, private_data_len); else ret = ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, private_data, private_data_len); } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); } else ret = -ENOSYS; return ret; } EXPORT_SYMBOL(rdma_reject); int rdma_disconnect(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { ret = cma_modify_qp_err(id_priv); if (ret) goto out; /* Initiate or respond to a disconnect. */ if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); } else ret = -EINVAL; out: return ret; } EXPORT_SYMBOL(rdma_disconnect); static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) { struct rdma_id_private *id_priv; struct cma_multicast *mc = multicast->context; struct rdma_cm_event event; int ret = 0; id_priv = mc->id_priv; mutex_lock(&id_priv->handler_mutex); if (id_priv->state != RDMA_CM_ADDR_BOUND && id_priv->state != RDMA_CM_ADDR_RESOLVED) goto out; if (!status) status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); mutex_lock(&id_priv->qp_mutex); if (!status && id_priv->id.qp) status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, be16_to_cpu(multicast->rec.mlid)); mutex_unlock(&id_priv->qp_mutex); memset(&event, 0, sizeof event); event.status = status; event.param.ud.private_data = mc->context; if (!status) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; - struct net_device *ndev = + struct ifnet *ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); enum ib_gid_type gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; event.event = RDMA_CM_EVENT_MULTICAST_JOIN; ret = ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, ndev, gid_type, &event.param.ud.ah_attr); if (ret) event.event = RDMA_CM_EVENT_MULTICAST_ERROR; event.param.ud.qp_num = 0xFFFFFF; event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); if (ndev) dev_put(ndev); } else event.event = RDMA_CM_EVENT_MULTICAST_ERROR; ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return 0; } out: mutex_unlock(&id_priv->handler_mutex); return 0; } static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr *addr, union ib_gid *mgid) { unsigned char mc_map[MAX_ADDR_LEN]; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sockaddr_in *sin = (struct sockaddr_in *) addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if ((addr->sa_family == AF_INET6) && ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. */ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else if (addr->sa_family == AF_IB) { memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } } static void cma_query_sa_classport_info_cb(int status, struct ib_class_port_info *rec, void *context) { struct class_port_info_context *cb_ctx = context; WARN_ON(!context); if (status || !rec) { pr_debug("RDMA CM: %s port %u failed query ClassPortInfo status: %d\n", cb_ctx->device->name, cb_ctx->port_num, status); goto out; } memcpy(cb_ctx->class_port_info, rec, sizeof(struct ib_class_port_info)); out: complete(&cb_ctx->done); } static int cma_query_sa_classport_info(struct ib_device *device, u8 port_num, struct ib_class_port_info *class_port_info) { struct class_port_info_context *cb_ctx; int ret; cb_ctx = kmalloc(sizeof(*cb_ctx), GFP_KERNEL); if (!cb_ctx) return -ENOMEM; cb_ctx->device = device; cb_ctx->class_port_info = class_port_info; cb_ctx->port_num = port_num; init_completion(&cb_ctx->done); ret = ib_sa_classport_info_rec_query(&sa_client, device, port_num, CMA_QUERY_CLASSPORT_INFO_TIMEOUT, GFP_KERNEL, cma_query_sa_classport_info_cb, cb_ctx, &cb_ctx->sa_query); if (ret < 0) { pr_err("RDMA CM: %s port %u failed to send ClassPortInfo query, ret: %d\n", device->name, port_num, ret); goto out; } wait_for_completion(&cb_ctx->done); out: kfree(cb_ctx); return ret; } static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct ib_sa_mcmember_rec rec; struct ib_class_port_info class_port_info; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; int ret; ib_addr_get_mgid(dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (ret) return ret; ret = cma_set_qkey(id_priv, 0); if (ret) return ret; cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); rec.qkey = cpu_to_be32(id_priv->qkey); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = mc->join_state; if (rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) { ret = cma_query_sa_classport_info(id_priv->id.device, id_priv->id.port_num, &class_port_info); if (ret) return ret; if (!(ib_get_cpi_capmask2(&class_port_info) & IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT)) { pr_warn("RDMA CM: %s port %u Unable to multicast join\n" "RDMA CM: SM doesn't support Send Only Full Member option\n", id_priv->id.device->name, id_priv->id.port_num); return -EOPNOTSUPP; } } comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; if (id_priv->id.ps == RDMA_PS_IPOIB) comp_mask |= IB_SA_MCMEMBER_REC_RATE | IB_SA_MCMEMBER_REC_RATE_SELECTOR | IB_SA_MCMEMBER_REC_MTU_SELECTOR | IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_HOP_LIMIT; mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device, id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, cma_ib_mc_handler, mc); return PTR_ERR_OR_ZERO(mc->multicast.ib); } static void iboe_mcast_work_handler(struct work_struct *work) { struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work); struct cma_multicast *mc = mw->mc; struct ib_sa_multicast *m = mc->multicast.ib; mc->multicast.ib->context = mc; cma_ib_mc_handler(0, m); kref_put(&mc->mcref, release_mc); kfree(mw); } static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { mgid->raw[0] = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff; mgid->raw[1] = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0x0e; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; mgid->raw[5] = 0; mgid->raw[6] = 0; mgid->raw[7] = 0; mgid->raw[8] = 0; mgid->raw[9] = 0; mgid->raw[10] = 0xff; mgid->raw[11] = 0xff; *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; } } static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct iboe_mcast_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; - struct net_device *ndev = NULL; + struct ifnet *ndev = NULL; enum ib_gid_type gid_type; bool send_only; send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (cma_zero_addr((struct sockaddr *)&mc->addr)) return -EINVAL; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL); if (!mc->multicast.ib) { err = -ENOMEM; goto out1; } gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid, gid_type); mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); if (id_priv->id.ps == RDMA_PS_UDP) mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!ndev) { err = -ENODEV; goto out2; } mc->multicast.ib->rec.rate = iboe_get_rate(ndev); mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->if_mtu); if (addr->sa_family == AF_INET || addr->sa_family == AF_INET6) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; if (!send_only) { err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, true); if (!err) mc->igmp_joined = true; } } } else { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) err = -ENOTSUPP; } dev_put(ndev); if (err || !mc->multicast.ib->rec.mtu) { if (!err) err = -EINVAL; goto out2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &mc->multicast.ib->rec.port_gid); work->id = id_priv; work->mc = mc; INIT_WORK(&work->work, iboe_mcast_work_handler); kref_get(&mc->mcref); queue_work(cma_wq, &work->work); return 0; out2: kfree(mc->multicast.ib); out1: kfree(work); return err; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, u8 join_state, void *context) { struct rdma_id_private *id_priv; struct cma_multicast *mc; int ret; if (!id->device) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) && !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED)) return -EINVAL; mc = kmalloc(sizeof *mc, GFP_KERNEL); if (!mc) return -ENOMEM; memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; mc->igmp_joined = false; mc->join_state = join_state; spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); if (rdma_protocol_roce(id->device, id->port_num)) { kref_init(&mc->mcref); ret = cma_iboe_join_multicast(id_priv, mc); } else if (rdma_cap_ib_mcast(id->device, id->port_num)) ret = cma_join_ib_multicast(id_priv, mc); else ret = -ENOSYS; if (ret) { spin_lock_irq(&id_priv->lock); list_del(&mc->list); spin_unlock_irq(&id_priv->lock); kfree(mc); } return ret; } EXPORT_SYMBOL(rdma_join_multicast); void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; struct cma_multicast *mc; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { if (!memcmp(&mc->addr, addr, rdma_addr_size(addr))) { list_del(&mc->list); spin_unlock_irq(&id_priv->lock); if (id->qp) ib_detach_mcast(id->qp, &mc->multicast.ib->rec.mgid, be16_to_cpu(mc->multicast.ib->rec.mlid)); BUG_ON(id_priv->cma_dev->device != id->device); if (rdma_cap_ib_mcast(id->device, id->port_num)) { ib_sa_free_multicast(mc->multicast.ib); kfree(mc); } else if (rdma_protocol_roce(id->device, id->port_num)) { if (mc->igmp_joined) { struct rdma_dev_addr *dev_addr = &id->route.addr.dev_addr; - struct net_device *ndev = NULL; + struct ifnet *ndev = NULL; if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (ndev) { cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, false); dev_put(ndev); } mc->igmp_joined = false; } kref_put(&mc->mcref, release_mc); } return; } } spin_unlock_irq(&id_priv->lock); } EXPORT_SYMBOL(rdma_leave_multicast); static int sysctl_cma_default_roce_mode(SYSCTL_HANDLER_ARGS) { struct cma_device *cma_dev = arg1; const int port = arg2; char buf[64]; int error; strlcpy(buf, ib_cache_gid_type_str( cma_get_default_gid_type(cma_dev, port)), sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) goto done; error = ib_cache_gid_parse_type_str(buf); if (error < 0) { error = EINVAL; goto done; } cma_set_default_gid_type(cma_dev, port, error); error = 0; done: return (error); } static void cma_add_one(struct ib_device *device) { struct cma_device *cma_dev; struct rdma_id_private *id_priv; unsigned int i; cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); if (!cma_dev) return; sysctl_ctx_init(&cma_dev->sysctl_ctx); cma_dev->device = device; cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_gid_type), GFP_KERNEL); if (!cma_dev->default_gid_type) { kfree(cma_dev); return; } for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { unsigned long supported_gids; unsigned int default_gid_type; supported_gids = roce_gid_type_mask_support(device, i); if (WARN_ON(!supported_gids)) { /* set something valid */ default_gid_type = 0; } else if (test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) { /* prefer RoCEv2, if supported */ default_gid_type = IB_GID_TYPE_ROCE_UDP_ENCAP; } else { default_gid_type = find_first_bit(&supported_gids, BITS_PER_LONG); } cma_dev->default_gid_type[i - rdma_start_port(device)] = default_gid_type; } init_completion(&cma_dev->comp); atomic_set(&cma_dev->refcount, 1); INIT_LIST_HEAD(&cma_dev->id_list); ib_set_client_data(device, &cma_client, cma_dev); mutex_lock(&lock); list_add_tail(&cma_dev->list, &dev_list); list_for_each_entry(id_priv, &listen_any_list, list) cma_listen_on_dev(id_priv, cma_dev); mutex_unlock(&lock); for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { char buf[64]; snprintf(buf, sizeof(buf), "default_roce_mode_port%d", i); (void) SYSCTL_ADD_PROC(&cma_dev->sysctl_ctx, SYSCTL_CHILDREN(device->ports_parent->parent->oidp), OID_AUTO, buf, CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, cma_dev, i, &sysctl_cma_default_roce_mode, "A", "Default RoCE mode. Valid values: IB/RoCE v1 and RoCE v2"); } } static int cma_remove_id_dev(struct rdma_id_private *id_priv) { struct rdma_cm_event event; enum rdma_cm_state state; int ret = 0; /* Record that we want to remove the device */ state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL); if (state == RDMA_CM_DESTROYING) return 0; cma_cancel_operation(id_priv, state); mutex_lock(&id_priv->handler_mutex); /* Check for destruction from another callback. */ if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL)) goto out; memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_DEVICE_REMOVAL; ret = id_priv->id.event_handler(&id_priv->id, &event); out: mutex_unlock(&id_priv->handler_mutex); return ret; } static void cma_process_remove(struct cma_device *cma_dev) { struct rdma_id_private *id_priv; int ret; mutex_lock(&lock); while (!list_empty(&cma_dev->id_list)) { id_priv = list_entry(cma_dev->id_list.next, struct rdma_id_private, list); list_del(&id_priv->listen_list); list_del_init(&id_priv->list); atomic_inc(&id_priv->refcount); mutex_unlock(&lock); ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv); cma_deref_id(id_priv); if (ret) rdma_destroy_id(&id_priv->id); mutex_lock(&lock); } mutex_unlock(&lock); cma_deref_dev(cma_dev); wait_for_completion(&cma_dev->comp); } static void cma_remove_one(struct ib_device *device, void *client_data) { struct cma_device *cma_dev = client_data; if (!cma_dev) return; mutex_lock(&lock); list_del(&cma_dev->list); mutex_unlock(&lock); cma_process_remove(cma_dev); sysctl_ctx_free(&cma_dev->sysctl_ctx); kfree(cma_dev->default_gid_type); kfree(cma_dev); } static void cma_init_vnet(void *arg) { struct cma_pernet *pernet = &VNET(cma_pernet); idr_init(&pernet->tcp_ps); idr_init(&pernet->udp_ps); idr_init(&pernet->ipoib_ps); idr_init(&pernet->ib_ps); idr_init(&pernet->sdp_ps); } VNET_SYSINIT(cma_init_vnet, SI_SUB_OFED_MODINIT - 1, SI_ORDER_FIRST, cma_init_vnet, NULL); static void cma_destroy_vnet(void *arg) { struct cma_pernet *pernet = &VNET(cma_pernet); idr_destroy(&pernet->tcp_ps); idr_destroy(&pernet->udp_ps); idr_destroy(&pernet->ipoib_ps); idr_destroy(&pernet->ib_ps); idr_destroy(&pernet->sdp_ps); } VNET_SYSUNINIT(cma_destroy_vnet, SI_SUB_OFED_MODINIT - 1, SI_ORDER_SECOND, cma_destroy_vnet, NULL); static int __init cma_init(void) { int ret; cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM); if (!cma_wq) return -ENOMEM; ib_sa_register_client(&sa_client); rdma_addr_register_client(&addr_client); ret = ib_register_client(&cma_client); if (ret) goto err; cma_configfs_init(); return 0; err: rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); destroy_workqueue(cma_wq); return ret; } static void __exit cma_cleanup(void) { cma_configfs_exit(); ib_unregister_client(&cma_client); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); destroy_workqueue(cma_wq); } module_init_order(cma_init, SI_ORDER_FOURTH); module_exit_order(cma_cleanup, SI_ORDER_FOURTH); diff --git a/sys/ofed/drivers/infiniband/core/ib_device.c b/sys/ofed/drivers/infiniband/core/ib_device.c index 4d83b7e7936b..8c6a2d9f1a8a 100644 --- a/sys/ofed/drivers/infiniband/core/ib_device.c +++ b/sys/ofed/drivers/infiniband/core/ib_device.c @@ -1,1084 +1,1084 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include "core_priv.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); struct ib_client_data { struct list_head list; struct ib_client *client; void * data; /* The device or client is going down. Do not call client or device * callbacks other than remove(). */ bool going_down; }; struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); /* The device_list and client_list contain devices and clients after their * registration has completed, and the devices and clients are removed * during unregistration. */ static LIST_HEAD(device_list); static LIST_HEAD(client_list); /* * device_mutex and lists_rwsem protect access to both device_list and * client_list. device_mutex protects writer access by device and client * registration / de-registration. lists_rwsem protects reader access to * these lists. Iterators of these lists must lock it for read, while updates * to the lists must be done with a write lock. A special case is when the * device_mutex is locked. In this case locking the lists for read access is * not necessary as the device_mutex implies it. * * lists_rwsem also protects access to the client data list. */ static DEFINE_MUTEX(device_mutex); static DECLARE_RWSEM(lists_rwsem); static int ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x } static const struct { size_t offset; char *name; } mandatory_table[] = { IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), IB_MANDATORY_FUNC(query_pkey), IB_MANDATORY_FUNC(query_gid), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_ah), IB_MANDATORY_FUNC(destroy_ah), IB_MANDATORY_FUNC(create_qp), IB_MANDATORY_FUNC(modify_qp), IB_MANDATORY_FUNC(destroy_qp), IB_MANDATORY_FUNC(post_send), IB_MANDATORY_FUNC(post_recv), IB_MANDATORY_FUNC(create_cq), IB_MANDATORY_FUNC(destroy_cq), IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), IB_MANDATORY_FUNC(dereg_mr), IB_MANDATORY_FUNC(get_port_immutable) }; int i; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((char *) device + mandatory_table[i].offset)) { pr_warn("Device %s is missing mandatory function %s\n", device->name, mandatory_table[i].name); return -EINVAL; } } return 0; } static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; list_for_each_entry(device, &device_list, core_list) if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX)) return device; return NULL; } static int alloc_name(char *name) { unsigned long *inuse; char buf[IB_DEVICE_NAME_MAX]; struct ib_device *device; int i; inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL); if (!inuse) return -ENOMEM; list_for_each_entry(device, &device_list, core_list) { if (!sscanf(device->name, name, &i)) continue; if (i < 0 || i >= PAGE_SIZE * 8) continue; snprintf(buf, sizeof buf, name, i); if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX)) set_bit(i, inuse); } i = find_first_zero_bit(inuse, PAGE_SIZE * 8); free_page((unsigned long) inuse); snprintf(buf, sizeof buf, name, i); if (__ib_device_get_by_name(buf)) return -ENFILE; strlcpy(name, buf, IB_DEVICE_NAME_MAX); return 0; } static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); WARN_ON(dev->reg_state == IB_DEV_REGISTERED); if (dev->reg_state == IB_DEV_UNREGISTERED) { /* * In IB_DEV_UNINITIALIZED state, cache or port table * is not even created. Free cache and port table only when * device reaches UNREGISTERED state. */ ib_cache_release_one(dev); kfree(dev->port_immutable); } kfree(dev); } static struct class ib_class = { .name = "infiniband", .dev_release = ib_device_release, }; /** * ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, * including any private data used by the low-level driver. * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ struct ib_device *ib_alloc_device(size_t size) { struct ib_device *device; if (WARN_ON(size < sizeof(struct ib_device))) return NULL; device = kzalloc(size, GFP_KERNEL); if (!device) return NULL; device->dev.parent = &linux_root_device; device->dev.class = &ib_class; device_initialize(&device->dev); dev_set_drvdata(&device->dev, device); INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->event_handler_lock); spin_lock_init(&device->client_data_lock); INIT_LIST_HEAD(&device->client_data_list); INIT_LIST_HEAD(&device->port_list); return device; } EXPORT_SYMBOL(ib_alloc_device); /** * ib_dealloc_device - free an IB device struct * @device:structure to free * * Free a structure allocated with ib_alloc_device(). */ void ib_dealloc_device(struct ib_device *device) { WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && device->reg_state != IB_DEV_UNINITIALIZED); kobject_put(&device->dev.kobj); } EXPORT_SYMBOL(ib_dealloc_device); static int add_client_context(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; unsigned long flags; context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) { pr_warn("Couldn't allocate client context for %s/%s\n", device->name, client->name); return -ENOMEM; } context->client = client; context->data = NULL; context->going_down = false; down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_add(&context->list, &device->client_data_list); spin_unlock_irqrestore(&device->client_data_lock, flags); up_write(&lists_rwsem); return 0; } static int verify_immutable(const struct ib_device *dev, u8 port) { return WARN_ON(!rdma_cap_ib_mad(dev, port) && rdma_max_mad_size(dev, port) != 0); } static int read_port_immutable(struct ib_device *device) { int ret; u8 start_port = rdma_start_port(device); u8 end_port = rdma_end_port(device); u8 port; /** * device->port_immutable is indexed directly by the port number to make * access to this data as efficient as possible. * * Therefore port_immutable is declared as a 1 based array with * potential empty slots at the beginning. */ device->port_immutable = kzalloc(sizeof(*device->port_immutable) * (end_port + 1), GFP_KERNEL); if (!device->port_immutable) return -ENOMEM; for (port = start_port; port <= end_port; ++port) { ret = device->get_port_immutable(device, port, &device->port_immutable[port]); if (ret) return ret; if (verify_immutable(device, port)) return -EINVAL; } return 0; } void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len) { if (dev->get_dev_fw_str) dev->get_dev_fw_str(dev, str, str_len); else str[0] = '\0'; } EXPORT_SYMBOL(ib_get_device_fw_str); /** * ib_register_device - Register an IB device with IB core * @device:Device to register * * Low-level drivers use ib_register_device() to register their * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). */ int ib_register_device(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)) { int ret; struct ib_client *client; struct ib_udata uhw = {.outlen = 0, .inlen = 0}; mutex_lock(&device_mutex); if (strchr(device->name, '%')) { ret = alloc_name(device->name); if (ret) goto out; } if (ib_device_check_mandatory(device)) { ret = -EINVAL; goto out; } ret = read_port_immutable(device); if (ret) { pr_warn("Couldn't create per port immutable data %s\n", device->name); goto out; } ret = ib_cache_setup_one(device); if (ret) { pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n"); goto port_cleanup; } memset(&device->attrs, 0, sizeof(device->attrs)); ret = device->query_device(device, &device->attrs, &uhw); if (ret) { pr_warn("Couldn't query the device attributes\n"); goto cache_cleanup; } ret = ib_device_register_sysfs(device, port_callback); if (ret) { pr_warn("Couldn't register device %s with driver model\n", device->name); goto cache_cleanup; } device->reg_state = IB_DEV_REGISTERED; list_for_each_entry(client, &client_list, list) if (client->add && !add_client_context(device, client)) client->add(device); down_write(&lists_rwsem); list_add_tail(&device->core_list, &device_list); up_write(&lists_rwsem); mutex_unlock(&device_mutex); return 0; cache_cleanup: ib_cache_cleanup_one(device); ib_cache_release_one(device); port_cleanup: kfree(device->port_immutable); out: mutex_unlock(&device_mutex); return ret; } EXPORT_SYMBOL(ib_register_device); /** * ib_unregister_device - Unregister an IB device * @device:Device to unregister * * Unregister an IB device. All clients will receive a remove callback. */ void ib_unregister_device(struct ib_device *device) { struct ib_client_data *context, *tmp; unsigned long flags; mutex_lock(&device_mutex); down_write(&lists_rwsem); list_del(&device->core_list); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) context->going_down = true; spin_unlock_irqrestore(&device->client_data_lock, flags); downgrade_write(&lists_rwsem); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) { if (context->client->remove) context->client->remove(device, context->data); } up_read(&lists_rwsem); mutex_unlock(&device_mutex); ib_device_unregister_sysfs(device); ib_cache_cleanup_one(device); down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) kfree(context); spin_unlock_irqrestore(&device->client_data_lock, flags); up_write(&lists_rwsem); device->reg_state = IB_DEV_UNREGISTERED; } EXPORT_SYMBOL(ib_unregister_device); /** * ib_register_client - Register an IB client * @client:Client to register * * Upper level users of the IB drivers can use ib_register_client() to * register callbacks for IB device addition and removal. When an IB * device is added, each registered client's add method will be called * (in the order the clients were registered), and when a device is * removed, each client's remove method will be called (in the reverse * order that clients were registered). In addition, when * ib_register_client() is called, the client will receive an add * callback for all devices already registered. */ int ib_register_client(struct ib_client *client) { struct ib_device *device; mutex_lock(&device_mutex); list_for_each_entry(device, &device_list, core_list) if (client->add && !add_client_context(device, client)) client->add(device); down_write(&lists_rwsem); list_add_tail(&client->list, &client_list); up_write(&lists_rwsem); mutex_unlock(&device_mutex); return 0; } EXPORT_SYMBOL(ib_register_client); /** * ib_unregister_client - Unregister an IB client * @client:Client to unregister * * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. */ void ib_unregister_client(struct ib_client *client) { struct ib_client_data *context, *tmp; struct ib_device *device; unsigned long flags; mutex_lock(&device_mutex); down_write(&lists_rwsem); list_del(&client->list); up_write(&lists_rwsem); list_for_each_entry(device, &device_list, core_list) { struct ib_client_data *found_context = NULL; down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) if (context->client == client) { context->going_down = true; found_context = context; break; } spin_unlock_irqrestore(&device->client_data_lock, flags); up_write(&lists_rwsem); if (client->remove) client->remove(device, found_context ? found_context->data : NULL); if (!found_context) { pr_warn("No client context found for %s/%s\n", device->name, client->name); continue; } down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_del(&found_context->list); kfree(found_context); spin_unlock_irqrestore(&device->client_data_lock, flags); up_write(&lists_rwsem); } mutex_unlock(&device_mutex); } EXPORT_SYMBOL(ib_unregister_client); /** * ib_get_client_data - Get IB client context * @device:Device to get context for * @client:Client to get context for * * ib_get_client_data() returns client context set with * ib_set_client_data(). */ void *ib_get_client_data(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; void *ret = NULL; unsigned long flags; spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { ret = context->data; break; } spin_unlock_irqrestore(&device->client_data_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_client_data); /** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * * ib_set_client_data() sets client context that can be retrieved with * ib_get_client_data(). */ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { struct ib_client_data *context; unsigned long flags; spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->data = data; goto out; } pr_warn("No client context found for %s/%s\n", device->name, client->name); out: spin_unlock_irqrestore(&device->client_data_lock, flags); } EXPORT_SYMBOL(ib_set_client_data); /** * ib_register_event_handler - Register an IB event handler * @event_handler:Handler to register * * ib_register_event_handler() registers an event handler that will be * called back when asynchronous IB events occur (as defined in * chapter 11 of the InfiniBand Architecture Specification). This * callback may occur in interrupt context. */ int ib_register_event_handler (struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); return 0; } EXPORT_SYMBOL(ib_register_event_handler); /** * ib_unregister_event_handler - Unregister an event handler * @event_handler:Handler to unregister * * Unregister an event handler registered with * ib_register_event_handler(). */ int ib_unregister_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_del(&event_handler->list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); return 0; } EXPORT_SYMBOL(ib_unregister_event_handler); /** * ib_dispatch_event - Dispatch an asynchronous event * @event:Event to dispatch * * Low-level drivers must call ib_dispatch_event() to dispatch the * event to all registered event handlers when an asynchronous event * occurs. */ void ib_dispatch_event(struct ib_event *event) { unsigned long flags; struct ib_event_handler *handler; spin_lock_irqsave(&event->device->event_handler_lock, flags); list_for_each_entry(handler, &event->device->event_handler_list, list) handler->handler(handler, event); spin_unlock_irqrestore(&event->device->event_handler_lock, flags); } EXPORT_SYMBOL(ib_dispatch_event); /** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query * @port_attr:Port attributes * * ib_query_port() returns the attributes of a port through the * @port_attr pointer. */ int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr) { union ib_gid gid; int err; if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; memset(port_attr, 0, sizeof(*port_attr)); err = device->query_port(device, port_num, port_attr); if (err || port_attr->subnet_prefix) return err; if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) return 0; err = ib_query_gid(device, port_num, 0, &gid, NULL); if (err) return err; port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix); return 0; } EXPORT_SYMBOL(ib_query_port); /** * ib_query_gid - Get GID table entry * @device:Device to query * @port_num:Port number to query * @index:GID table index to query * @gid:Returned GID * @attr: Returned GID attributes related to this GID index (only in RoCE). * NULL means ignore. * * ib_query_gid() fetches the specified GID table entry. */ int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid, struct ib_gid_attr *attr) { if (rdma_cap_roce_gid_table(device, port_num)) return ib_get_cached_gid(device, port_num, index, gid, attr); if (attr) return -EINVAL; return device->query_gid(device, port_num, index, gid); } EXPORT_SYMBOL(ib_query_gid); /** * ib_enum_roce_netdev - enumerate all RoCE ports * @ib_dev : IB device we want to query * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all of the physical RoCE ports of ib_dev * which are related to netdevice and calls callback() on each * device for which filter() function returns non zero. */ void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { u8 port; for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev); port++) if (rdma_protocol_roce(ib_dev, port)) { - struct net_device *idev = NULL; + struct ifnet *idev = NULL; if (ib_dev->get_netdev) idev = ib_dev->get_netdev(ib_dev, port); if (idev && (idev->if_flags & IFF_DYING)) { dev_put(idev); idev = NULL; } if (filter(ib_dev, port, idev, filter_cookie)) cb(ib_dev, port, idev, cookie); if (idev) dev_put(idev); } } /** * ib_enum_all_roce_netdevs - enumerate all RoCE devices * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all RoCE devices' physical ports which are related * to netdevices and calls callback() on each device for which * filter() function returns non zero. */ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { struct ib_device *dev; down_read(&lists_rwsem); list_for_each_entry(dev, &device_list, core_list) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); up_read(&lists_rwsem); } /** * ib_cache_gid_del_all_by_netdev - delete GIDs belonging a netdevice * * @ndev: Pointer to netdevice */ -void ib_cache_gid_del_all_by_netdev(struct net_device *ndev) +void ib_cache_gid_del_all_by_netdev(struct ifnet *ndev) { struct ib_device *ib_dev; u8 port; down_read(&lists_rwsem); list_for_each_entry(ib_dev, &device_list, core_list) { for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev); port++) { if (rdma_protocol_roce(ib_dev, port) == 0) continue; (void) ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev); } } up_read(&lists_rwsem); } /** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query * @index:P_Key table index to query * @pkey:Returned P_Key * * ib_query_pkey() fetches the specified P_Key table entry. */ int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey) { return device->query_pkey(device, port_num, index, pkey); } EXPORT_SYMBOL(ib_query_pkey); /** * ib_modify_device - Change IB device attributes * @device:Device to modify * @device_modify_mask:Mask of attributes to change * @device_modify:New attribute values * * ib_modify_device() changes a device's attributes as specified by * the @device_modify_mask and @device_modify structure. */ int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify) { if (!device->modify_device) return -ENOSYS; return device->modify_device(device, device_modify_mask, device_modify); } EXPORT_SYMBOL(ib_modify_device); /** * ib_modify_port - Modifies the attributes for the specified port. * @device: The device to modify. * @port_num: The number of the port to modify. * @port_modify_mask: Mask used to specify which attributes of the port * to change. * @port_modify: New attribute values for the port. * * ib_modify_port() changes a port's attributes as specified by the * @port_modify_mask and @port_modify structure. */ int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { if (!device->modify_port) return -ENOSYS; if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; return device->modify_port(device, port_num, port_modify_mask, port_modify); } EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. * @gid_type: Type of GID. * @ndev: The ndev related to the GID to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, - enum ib_gid_type gid_type, struct net_device *ndev, + enum ib_gid_type gid_type, struct ifnet *ndev, u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { if (rdma_cap_roce_gid_table(device, port)) { if (!ib_find_cached_gid_by_port(device, gid, gid_type, port, ndev, index)) { *port_num = port; return 0; } } if (gid_type != IB_GID_TYPE_IB) continue; for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { ret = ib_query_gid(device, port, i, &tmp_gid, NULL); if (ret) return ret; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { *port_num = port; if (index) *index = i; return 0; } } } return -ENOENT; } EXPORT_SYMBOL(ib_find_gid); /** * ib_find_pkey - Returns the PKey table index where a specified * PKey value occurs. * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the PKey table where the PKey was found. */ int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index) { int ret, i; u16 tmp_pkey; int partial_ix = -1; for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { /* if there is full-member pkey take it.*/ if (tmp_pkey & 0x8000) { *index = i; return 0; } if (partial_ix < 0) partial_ix = i; } } /*no full-member, if exists take the limited*/ if (partial_ix >= 0) { *index = partial_ix; return 0; } return -ENOENT; } EXPORT_SYMBOL(ib_find_pkey); /** * ib_get_net_dev_by_params() - Return the appropriate net_dev * for a received CM request * @dev: An RDMA device on which the request has been received. * @port: Port number on the RDMA device. * @pkey: The Pkey the request came on. * @gid: A GID that the net_dev uses to communicate. * @addr: Contains the IP address that the request specified as its * destination. */ -struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, +struct ifnet *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr) { - struct net_device *net_dev = NULL; + struct ifnet *net_dev = NULL; struct ib_client_data *context; if (!rdma_protocol_ib(dev, port)) return NULL; down_read(&lists_rwsem); list_for_each_entry(context, &dev->client_data_list, list) { struct ib_client *client = context->client; if (context->going_down) continue; if (client->get_net_dev_by_params) { net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, addr, context->data); if (net_dev) break; } } up_read(&lists_rwsem); return net_dev; } EXPORT_SYMBOL(ib_get_net_dev_by_params); static int __init ib_core_init(void) { int ret; ib_wq = alloc_workqueue("infiniband", 0, 0); if (!ib_wq) return -ENOMEM; ib_comp_wq = alloc_workqueue("ib-comp-wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM, mp_ncpus * 4 /* WQ_UNBOUND_MAX_ACTIVE */); if (!ib_comp_wq) { ret = -ENOMEM; goto err; } ret = class_register(&ib_class); if (ret) { pr_warn("Couldn't create InfiniBand device class\n"); goto err_comp; } ret = addr_init(); if (ret) { pr_warn("Could't init IB address resolution\n"); goto err_sysfs; } ret = ib_mad_init(); if (ret) { pr_warn("Couldn't init IB MAD\n"); goto err_addr; } ret = ib_sa_init(); if (ret) { pr_warn("Couldn't init SA\n"); goto err_mad; } ib_cache_setup(); return 0; err_mad: ib_mad_cleanup(); err_addr: addr_cleanup(); err_sysfs: class_unregister(&ib_class); err_comp: destroy_workqueue(ib_comp_wq); err: destroy_workqueue(ib_wq); return ret; } static void __exit ib_core_cleanup(void) { ib_cache_cleanup(); ib_sa_cleanup(); ib_mad_cleanup(); addr_cleanup(); class_unregister(&ib_class); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); } /* * Typical loading and unloading order values and their use: * * SI_ORDER_FIRST (default for module_init): * Core modules (PCI, infiniband) * SI_ORDER_SECOND (default for module_exit): * Infiniband core modules (CM) * SI_ORDER_THIRD: * SI_ORDER_FOURTH: * Infiniband core modules (CMA) * SI_ORDER_FIFTH: * Infiniband user-space modules (UCM,UCMA,UMAD,UVERBS,IPOIB) * SI_ORDER_SIXTH: * Network HW driver modules * SI_ORDER_SEVENTH: * Infiniband HW driver modules */ module_init_order(ib_core_init, SI_ORDER_FIRST); module_exit_order(ib_core_cleanup, SI_ORDER_FIRST); MODULE_VERSION(ibcore, 1); MODULE_DEPEND(ibcore, linuxkpi, 1, 1, 1); diff --git a/sys/ofed/drivers/infiniband/core/ib_multicast.c b/sys/ofed/drivers/infiniband/core/ib_multicast.c index 15e2c486bc0f..d81767bcd3a1 100644 --- a/sys/ofed/drivers/infiniband/core/ib_multicast.c +++ b/sys/ofed/drivers/infiniband/core/ib_multicast.c @@ -1,904 +1,904 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #define LINUXKPI_PARAM_PREFIX ibcore_ #include #include #include #include #include #include #include #include #include #include "sa.h" static void mcast_add_one(struct ib_device *device); static void mcast_remove_one(struct ib_device *device, void *client_data); static struct ib_client mcast_client = { .name = "ib_multicast", .add = mcast_add_one, .remove = mcast_remove_one }; static struct ib_sa_client sa_client; static struct workqueue_struct *mcast_wq; static union ib_gid mgid0; struct mcast_device; struct mcast_port { struct mcast_device *dev; spinlock_t lock; struct rb_root table; atomic_t refcount; struct completion comp; u8 port_num; }; struct mcast_device { struct ib_device *device; struct ib_event_handler event_handler; int start_port; int end_port; struct mcast_port port[0]; }; enum mcast_state { MCAST_JOINING, MCAST_MEMBER, MCAST_ERROR, }; enum mcast_group_state { MCAST_IDLE, MCAST_BUSY, MCAST_GROUP_ERROR, MCAST_PKEY_EVENT }; enum { MCAST_INVALID_PKEY_INDEX = 0xFFFF }; struct mcast_member; struct mcast_group { struct ib_sa_mcmember_rec rec; struct rb_node node; struct mcast_port *port; spinlock_t lock; struct work_struct work; struct list_head pending_list; struct list_head active_list; struct mcast_member *last_join; int members[NUM_JOIN_MEMBERSHIP_TYPES]; atomic_t refcount; enum mcast_group_state state; struct ib_sa_query *query; u16 pkey_index; u8 leave_state; int retries; }; struct mcast_member { struct ib_sa_multicast multicast; struct ib_sa_client *client; struct mcast_group *group; struct list_head list; enum mcast_state state; atomic_t refcount; struct completion comp; }; static void join_handler(int status, struct ib_sa_mcmember_rec *rec, void *context); static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, void *context); static struct mcast_group *mcast_find(struct mcast_port *port, union ib_gid *mgid) { struct rb_node *node = port->table.rb_node; struct mcast_group *group; int ret; while (node) { group = rb_entry(node, struct mcast_group, node); ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid); if (!ret) return group; if (ret < 0) node = node->rb_left; else node = node->rb_right; } return NULL; } static struct mcast_group *mcast_insert(struct mcast_port *port, struct mcast_group *group, int allow_duplicates) { struct rb_node **link = &port->table.rb_node; struct rb_node *parent = NULL; struct mcast_group *cur_group; int ret; while (*link) { parent = *link; cur_group = rb_entry(parent, struct mcast_group, node); ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw, sizeof group->rec.mgid); if (ret < 0) link = &(*link)->rb_left; else if (ret > 0) link = &(*link)->rb_right; else if (allow_duplicates) link = &(*link)->rb_left; else return cur_group; } rb_link_node(&group->node, parent, link); rb_insert_color(&group->node, &port->table); return NULL; } static void deref_port(struct mcast_port *port) { if (atomic_dec_and_test(&port->refcount)) complete(&port->comp); } static void release_group(struct mcast_group *group) { struct mcast_port *port = group->port; unsigned long flags; spin_lock_irqsave(&port->lock, flags); if (atomic_dec_and_test(&group->refcount)) { rb_erase(&group->node, &port->table); spin_unlock_irqrestore(&port->lock, flags); kfree(group); deref_port(port); } else spin_unlock_irqrestore(&port->lock, flags); } static void deref_member(struct mcast_member *member) { if (atomic_dec_and_test(&member->refcount)) complete(&member->comp); } static void queue_join(struct mcast_member *member) { struct mcast_group *group = member->group; unsigned long flags; spin_lock_irqsave(&group->lock, flags); list_add_tail(&member->list, &group->pending_list); if (group->state == MCAST_IDLE) { group->state = MCAST_BUSY; atomic_inc(&group->refcount); queue_work(mcast_wq, &group->work); } spin_unlock_irqrestore(&group->lock, flags); } /* * A multicast group has four types of members: full member, non member, * sendonly non member and sendonly full member. * We need to keep track of the number of members of each * type based on their join state. Adjust the number of members the belong to * the specified join states. */ static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) { int i; for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1) if (join_state & 0x1) group->members[i] += inc; } /* * If a multicast group has zero members left for a particular join state, but * the group is still a member with the SA, we need to leave that join state. * Determine which join states we still belong to, but that do not have any * active members. */ static u8 get_leave_state(struct mcast_group *group) { u8 leave_state = 0; int i; for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++) if (!group->members[i]) leave_state |= (0x1 << i); return leave_state & group->rec.join_state; } static int check_selector(ib_sa_comp_mask comp_mask, ib_sa_comp_mask selector_mask, ib_sa_comp_mask value_mask, u8 selector, u8 src_value, u8 dst_value) { int err; if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) return 0; switch (selector) { case IB_SA_GT: err = (src_value <= dst_value); break; case IB_SA_LT: err = (src_value >= dst_value); break; case IB_SA_EQ: err = (src_value != dst_value); break; default: err = 0; break; } return err; } static int cmp_rec(struct ib_sa_mcmember_rec *src, struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask) { /* MGID must already match */ if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID && memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) return -EINVAL; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector, src->mtu, dst->mtu)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && src->traffic_class != dst->traffic_class) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) return -EINVAL; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, IB_SA_MCMEMBER_REC_RATE, dst->rate_selector, src->rate, dst->rate)) return -EINVAL; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, dst->packet_life_time_selector, src->packet_life_time, dst->packet_life_time)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL && src->flow_label != dst->flow_label) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT && src->hop_limit != dst->hop_limit) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope) return -EINVAL; /* join_state checked separately, proxy_join ignored */ return 0; } static int send_join(struct mcast_group *group, struct mcast_member *member) { struct mcast_port *port = group->port; int ret; group->last_join = member; ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, port->port_num, IB_MGMT_METHOD_SET, &member->multicast.rec, member->multicast.comp_mask, 3000, GFP_KERNEL, join_handler, group, &group->query); return (ret > 0) ? 0 : ret; } static int send_leave(struct mcast_group *group, u8 leave_state) { struct mcast_port *port = group->port; struct ib_sa_mcmember_rec rec; int ret; rec = group->rec; rec.join_state = leave_state; group->leave_state = leave_state; ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, port->port_num, IB_SA_METHOD_DELETE, &rec, IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE, 3000, GFP_KERNEL, leave_handler, group, &group->query); return (ret > 0) ? 0 : ret; } static void join_group(struct mcast_group *group, struct mcast_member *member, u8 join_state) { member->state = MCAST_MEMBER; adjust_membership(group, join_state, 1); group->rec.join_state |= join_state; member->multicast.rec = group->rec; member->multicast.rec.join_state = join_state; list_move(&member->list, &group->active_list); } static int fail_join(struct mcast_group *group, struct mcast_member *member, int status) { spin_lock_irq(&group->lock); list_del_init(&member->list); spin_unlock_irq(&group->lock); return member->multicast.callback(status, &member->multicast); } static void process_group_error(struct mcast_group *group) { struct mcast_member *member; int ret = 0; u16 pkey_index; if (group->state == MCAST_PKEY_EVENT) ret = ib_find_pkey(group->port->dev->device, group->port->port_num, be16_to_cpu(group->rec.pkey), &pkey_index); spin_lock_irq(&group->lock); if (group->state == MCAST_PKEY_EVENT && !ret && group->pkey_index == pkey_index) goto out; while (!list_empty(&group->active_list)) { member = list_entry(group->active_list.next, struct mcast_member, list); atomic_inc(&member->refcount); list_del_init(&member->list); adjust_membership(group, member->multicast.rec.join_state, -1); member->state = MCAST_ERROR; spin_unlock_irq(&group->lock); ret = member->multicast.callback(-ENETRESET, &member->multicast); deref_member(member); if (ret) ib_sa_free_multicast(&member->multicast); spin_lock_irq(&group->lock); } group->rec.join_state = 0; out: group->state = MCAST_BUSY; spin_unlock_irq(&group->lock); } static void mcast_work_handler(struct work_struct *work) { struct mcast_group *group; struct mcast_member *member; struct ib_sa_multicast *multicast; int status, ret; u8 join_state; group = container_of(work, typeof(*group), work); retest: spin_lock_irq(&group->lock); while (!list_empty(&group->pending_list) || (group->state != MCAST_BUSY)) { if (group->state != MCAST_BUSY) { spin_unlock_irq(&group->lock); process_group_error(group); goto retest; } member = list_entry(group->pending_list.next, struct mcast_member, list); multicast = &member->multicast; join_state = multicast->rec.join_state; atomic_inc(&member->refcount); if (join_state == (group->rec.join_state & join_state)) { status = cmp_rec(&group->rec, &multicast->rec, multicast->comp_mask); if (!status) join_group(group, member, join_state); else list_del_init(&member->list); spin_unlock_irq(&group->lock); ret = multicast->callback(status, multicast); } else { spin_unlock_irq(&group->lock); status = send_join(group, member); if (!status) { deref_member(member); return; } ret = fail_join(group, member, status); } deref_member(member); if (ret) ib_sa_free_multicast(&member->multicast); spin_lock_irq(&group->lock); } join_state = get_leave_state(group); if (join_state) { group->rec.join_state &= ~join_state; spin_unlock_irq(&group->lock); if (send_leave(group, join_state)) goto retest; } else { group->state = MCAST_IDLE; spin_unlock_irq(&group->lock); release_group(group); } } /* * Fail a join request if it is still active - at the head of the pending queue. */ static void process_join_error(struct mcast_group *group, int status) { struct mcast_member *member; int ret; spin_lock_irq(&group->lock); member = list_entry(group->pending_list.next, struct mcast_member, list); if (group->last_join == member) { atomic_inc(&member->refcount); list_del_init(&member->list); spin_unlock_irq(&group->lock); ret = member->multicast.callback(status, &member->multicast); deref_member(member); if (ret) ib_sa_free_multicast(&member->multicast); } else spin_unlock_irq(&group->lock); } static void join_handler(int status, struct ib_sa_mcmember_rec *rec, void *context) { struct mcast_group *group = context; u16 pkey_index = MCAST_INVALID_PKEY_INDEX; if (status) process_join_error(group, status); else { int mgids_changed, is_mgid0; if (ib_find_pkey(group->port->dev->device, group->port->port_num, be16_to_cpu(rec->pkey), &pkey_index)) pkey_index = MCAST_INVALID_PKEY_INDEX; spin_lock_irq(&group->port->lock); if (group->state == MCAST_BUSY && group->pkey_index == MCAST_INVALID_PKEY_INDEX) group->pkey_index = pkey_index; mgids_changed = memcmp(&rec->mgid, &group->rec.mgid, sizeof(group->rec.mgid)); group->rec = *rec; if (mgids_changed) { rb_erase(&group->node, &group->port->table); is_mgid0 = !memcmp(&mgid0, &group->rec.mgid, sizeof(mgid0)); mcast_insert(group->port, group, is_mgid0); } spin_unlock_irq(&group->port->lock); } mcast_work_handler(&group->work); } static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, void *context) { struct mcast_group *group = context; if (status && group->retries > 0 && !send_leave(group, group->leave_state)) group->retries--; else mcast_work_handler(&group->work); } static struct mcast_group *acquire_group(struct mcast_port *port, union ib_gid *mgid, gfp_t gfp_mask) { struct mcast_group *group, *cur_group; unsigned long flags; int is_mgid0; is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0); if (!is_mgid0) { spin_lock_irqsave(&port->lock, flags); group = mcast_find(port, mgid); if (group) goto found; spin_unlock_irqrestore(&port->lock, flags); } group = kzalloc(sizeof *group, gfp_mask); if (!group) return NULL; group->retries = 3; group->port = port; group->rec.mgid = *mgid; group->pkey_index = MCAST_INVALID_PKEY_INDEX; INIT_LIST_HEAD(&group->pending_list); INIT_LIST_HEAD(&group->active_list); INIT_WORK(&group->work, mcast_work_handler); spin_lock_init(&group->lock); spin_lock_irqsave(&port->lock, flags); cur_group = mcast_insert(port, group, is_mgid0); if (cur_group) { kfree(group); group = cur_group; } else atomic_inc(&port->refcount); found: atomic_inc(&group->refcount); spin_unlock_irqrestore(&port->lock, flags); return group; } /* * We serialize all join requests to a single group to make our lives much * easier. Otherwise, two users could try to join the same group * simultaneously, with different configurations, one could leave while the * join is in progress, etc., which makes locking around error recovery * difficult. */ struct ib_sa_multicast * ib_sa_join_multicast(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, gfp_t gfp_mask, int (*callback)(int status, struct ib_sa_multicast *multicast), void *context) { struct mcast_device *dev; struct mcast_member *member; struct ib_sa_multicast *multicast; int ret; dev = ib_get_client_data(device, &mcast_client); if (!dev) return ERR_PTR(-ENODEV); member = kmalloc(sizeof *member, gfp_mask); if (!member) return ERR_PTR(-ENOMEM); ib_sa_client_get(client); member->client = client; member->multicast.rec = *rec; member->multicast.comp_mask = comp_mask; member->multicast.callback = callback; member->multicast.context = context; init_completion(&member->comp); atomic_set(&member->refcount, 1); member->state = MCAST_JOINING; member->group = acquire_group(&dev->port[port_num - dev->start_port], &rec->mgid, gfp_mask); if (!member->group) { ret = -ENOMEM; goto err; } /* * The user will get the multicast structure in their callback. They * could then free the multicast structure before we can return from * this routine. So we save the pointer to return before queuing * any callback. */ multicast = &member->multicast; queue_join(member); return multicast; err: ib_sa_client_put(client); kfree(member); return ERR_PTR(ret); } EXPORT_SYMBOL(ib_sa_join_multicast); void ib_sa_free_multicast(struct ib_sa_multicast *multicast) { struct mcast_member *member; struct mcast_group *group; member = container_of(multicast, struct mcast_member, multicast); group = member->group; spin_lock_irq(&group->lock); if (member->state == MCAST_MEMBER) adjust_membership(group, multicast->rec.join_state, -1); list_del_init(&member->list); if (group->state == MCAST_IDLE) { group->state = MCAST_BUSY; spin_unlock_irq(&group->lock); /* Continue to hold reference on group until callback */ queue_work(mcast_wq, &group->work); } else { spin_unlock_irq(&group->lock); release_group(group); } deref_member(member); wait_for_completion(&member->comp); ib_sa_client_put(member->client); kfree(member); } EXPORT_SYMBOL(ib_sa_free_multicast); int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num, union ib_gid *mgid, struct ib_sa_mcmember_rec *rec) { struct mcast_device *dev; struct mcast_port *port; struct mcast_group *group; unsigned long flags; int ret = 0; dev = ib_get_client_data(device, &mcast_client); if (!dev) return -ENODEV; port = &dev->port[port_num - dev->start_port]; spin_lock_irqsave(&port->lock, flags); group = mcast_find(port, mgid); if (group) *rec = group->rec; else ret = -EADDRNOTAVAIL; spin_unlock_irqrestore(&port->lock, flags); return ret; } EXPORT_SYMBOL(ib_sa_get_mcmember_rec); int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, - struct net_device *ndev, + struct ifnet *ndev, enum ib_gid_type gid_type, struct ib_ah_attr *ah_attr) { int ret; u16 gid_index; /* GID table is not based on the netdevice for IB link layer, * so ignore ndev during search. */ if (rdma_protocol_ib(device, port_num)) ndev = NULL; else if (!rdma_protocol_roce(device, port_num)) return -EINVAL; ret = ib_find_cached_gid_by_port(device, &rec->port_gid, gid_type, port_num, ndev, &gid_index); if (ret) return ret; memset(ah_attr, 0, sizeof *ah_attr); ah_attr->dlid = be16_to_cpu(rec->mlid); ah_attr->sl = rec->sl; ah_attr->port_num = port_num; ah_attr->static_rate = rec->rate; ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->mgid; ah_attr->grh.sgid_index = (u8) gid_index; ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label); ah_attr->grh.hop_limit = rec->hop_limit; ah_attr->grh.traffic_class = rec->traffic_class; return 0; } EXPORT_SYMBOL(ib_init_ah_from_mcmember); static void mcast_groups_event(struct mcast_port *port, enum mcast_group_state state) { struct mcast_group *group; struct rb_node *node; unsigned long flags; spin_lock_irqsave(&port->lock, flags); for (node = rb_first(&port->table); node; node = rb_next(node)) { group = rb_entry(node, struct mcast_group, node); spin_lock(&group->lock); if (group->state == MCAST_IDLE) { atomic_inc(&group->refcount); queue_work(mcast_wq, &group->work); } if (group->state != MCAST_GROUP_ERROR) group->state = state; spin_unlock(&group->lock); } spin_unlock_irqrestore(&port->lock, flags); } static void mcast_event_handler(struct ib_event_handler *handler, struct ib_event *event) { struct mcast_device *dev; int index; dev = container_of(handler, struct mcast_device, event_handler); if (!rdma_cap_ib_mcast(dev->device, event->element.port_num)) return; index = event->element.port_num - dev->start_port; switch (event->event) { case IB_EVENT_PORT_ERR: case IB_EVENT_LID_CHANGE: case IB_EVENT_SM_CHANGE: case IB_EVENT_CLIENT_REREGISTER: mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR); break; case IB_EVENT_PKEY_CHANGE: mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT); break; default: break; } } static void mcast_add_one(struct ib_device *device) { struct mcast_device *dev; struct mcast_port *port; int i; int count = 0; dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port, GFP_KERNEL); if (!dev) return; dev->start_port = rdma_start_port(device); dev->end_port = rdma_end_port(device); for (i = 0; i <= dev->end_port - dev->start_port; i++) { if (!rdma_cap_ib_mcast(device, dev->start_port + i)) continue; port = &dev->port[i]; port->dev = dev; port->port_num = dev->start_port + i; spin_lock_init(&port->lock); port->table = RB_ROOT; init_completion(&port->comp); atomic_set(&port->refcount, 1); ++count; } if (!count) { kfree(dev); return; } dev->device = device; ib_set_client_data(device, &mcast_client, dev); INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler); ib_register_event_handler(&dev->event_handler); } static void mcast_remove_one(struct ib_device *device, void *client_data) { struct mcast_device *dev = client_data; struct mcast_port *port; int i; if (!dev) return; ib_unregister_event_handler(&dev->event_handler); flush_workqueue(mcast_wq); for (i = 0; i <= dev->end_port - dev->start_port; i++) { if (rdma_cap_ib_mcast(device, dev->start_port + i)) { port = &dev->port[i]; deref_port(port); wait_for_completion(&port->comp); } } kfree(dev); } int mcast_init(void) { int ret; mcast_wq = alloc_ordered_workqueue("ib_mcast", WQ_MEM_RECLAIM); if (!mcast_wq) return -ENOMEM; ib_sa_register_client(&sa_client); ret = ib_register_client(&mcast_client); if (ret) goto err; return 0; err: ib_sa_unregister_client(&sa_client); destroy_workqueue(mcast_wq); return ret; } void mcast_cleanup(void) { ib_unregister_client(&mcast_client); ib_sa_unregister_client(&sa_client); destroy_workqueue(mcast_wq); } diff --git a/sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c b/sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c index 02acf29159e3..fb19f2944180 100644 --- a/sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c +++ b/sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c @@ -1,484 +1,484 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2015-2017, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "core_priv.h" #include #include #include #include #include #include #include static struct workqueue_struct *roce_gid_mgmt_wq; enum gid_op_type { GID_DEL = 0, GID_ADD }; struct roce_netdev_event_work { struct work_struct work; - struct net_device *ndev; + struct ifnet *ndev; }; struct roce_rescan_work { struct work_struct work; struct ib_device *ib_dev; }; static const struct { bool (*is_supported)(const struct ib_device *device, u8 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, }; #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port) { int i; unsigned int ret_flags = 0; if (!rdma_protocol_roce(ib_dev, port)) return 1UL << IB_GID_TYPE_IB; for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; return ret_flags; } EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, - u8 port, union ib_gid *gid, struct net_device *ndev) + u8 port, union ib_gid *gid, struct ifnet *ndev) { int i; unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); struct ib_gid_attr gid_attr; memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; for (i = 0; i != IB_GID_TYPE_SIZE; i++) { if ((1UL << i) & gid_type_mask) { gid_attr.gid_type = i; switch (gid_op) { case GID_ADD: ib_cache_gid_add(ib_dev, port, gid, &gid_attr); break; case GID_DEL: ib_cache_gid_del(ib_dev, port, gid, &gid_attr); break; } } } } static int roce_gid_match_netdev(struct ib_device *ib_dev, u8 port, - struct net_device *idev, void *cookie) + struct ifnet *idev, void *cookie) { - struct net_device *ndev = (struct net_device *)cookie; + struct ifnet *ndev = (struct ifnet *)cookie; if (idev == NULL) return (0); return (ndev == idev); } static int roce_gid_match_all(struct ib_device *ib_dev, u8 port, - struct net_device *idev, void *cookie) + struct ifnet *idev, void *cookie) { if (idev == NULL) return (0); return (1); } static int roce_gid_enum_netdev_default(struct ib_device *ib_dev, - u8 port, struct net_device *idev) + u8 port, struct ifnet *idev) { unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, idev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); return (hweight_long(gid_type_mask)); } static void roce_gid_update_addr_callback(struct ib_device *device, u8 port, - struct net_device *ndev, void *cookie) + struct ifnet *ndev, void *cookie) { struct ipx_entry { STAILQ_ENTRY(ipx_entry) entry; union ipx_addr { struct sockaddr sa[0]; struct sockaddr_in v4; struct sockaddr_in6 v6; } ipx_addr; - struct net_device *ndev; + struct ifnet *ndev; }; struct ipx_entry *entry; - struct net_device *idev; + struct ifnet *idev; #if defined(INET) || defined(INET6) struct ifaddr *ifa; #endif VNET_ITERATOR_DECL(vnet_iter); struct ib_gid_attr gid_attr; union ib_gid gid; int default_gids; u16 index_num; int i; STAILQ_HEAD(, ipx_entry) ipx_head; STAILQ_INIT(&ipx_head); /* make sure default GIDs are in */ default_gids = roce_gid_enum_netdev_default(device, port, ndev); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); IFNET_RLOCK(); CK_STAILQ_FOREACH(idev, &V_ifnet, if_link) { struct epoch_tracker et; if (idev != ndev) { if (idev->if_type != IFT_L2VLAN) continue; if (ndev != rdma_vlan_dev_real_dev(idev)) continue; } /* clone address information for IPv4 and IPv6 */ NET_EPOCH_ENTER(et); #if defined(INET) CK_STAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != AF_INET) continue; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) { pr_warn("roce_gid_update_addr_callback: " "couldn't allocate entry for IPv4 update\n"); continue; } entry->ipx_addr.v4 = *((struct sockaddr_in *)ifa->ifa_addr); entry->ndev = idev; STAILQ_INSERT_TAIL(&ipx_head, entry, entry); } #endif #if defined(INET6) CK_STAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != AF_INET6) continue; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) { pr_warn("roce_gid_update_addr_callback: " "couldn't allocate entry for IPv6 update\n"); continue; } entry->ipx_addr.v6 = *((struct sockaddr_in6 *)ifa->ifa_addr); entry->ndev = idev; /* trash IPv6 scope ID */ sa6_recoverscope(&entry->ipx_addr.v6); entry->ipx_addr.v6.sin6_scope_id = 0; STAILQ_INSERT_TAIL(&ipx_head, entry, entry); } #endif NET_EPOCH_EXIT(et); } IFNET_RUNLOCK(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); /* add missing GIDs, if any */ STAILQ_FOREACH(entry, &ipx_head, entry) { unsigned long gid_type_mask = roce_gid_type_mask_support(device, port); if (rdma_ip2gid(&entry->ipx_addr.sa[0], &gid) != 0) continue; for (i = 0; i != IB_GID_TYPE_SIZE; i++) { if (!((1UL << i) & gid_type_mask)) continue; /* check if entry found */ if (ib_find_cached_gid_by_port(device, &gid, i, port, entry->ndev, &index_num) == 0) break; } if (i != IB_GID_TYPE_SIZE) continue; /* add new GID */ update_gid(GID_ADD, device, port, &gid, entry->ndev); } /* remove stale GIDs, if any */ for (i = default_gids; ib_get_cached_gid(device, port, i, &gid, &gid_attr) == 0; i++) { union ipx_addr ipx; /* check for valid network device pointer */ ndev = gid_attr.ndev; if (ndev == NULL) continue; dev_put(ndev); /* don't delete empty entries */ if (memcmp(&gid, &zgid, sizeof(zgid)) == 0) continue; /* zero default */ memset(&ipx, 0, sizeof(ipx)); rdma_gid2ip(&ipx.sa[0], &gid); STAILQ_FOREACH(entry, &ipx_head, entry) { if (entry->ndev == ndev && memcmp(&entry->ipx_addr, &ipx, sizeof(ipx)) == 0) break; } /* check if entry found */ if (entry != NULL) continue; /* remove GID */ update_gid(GID_DEL, device, port, &gid, ndev); } while ((entry = STAILQ_FIRST(&ipx_head))) { STAILQ_REMOVE_HEAD(&ipx_head, entry); kfree(entry); } } static void roce_gid_queue_scan_event_handler(struct work_struct *_work) { struct roce_netdev_event_work *work = container_of(_work, struct roce_netdev_event_work, work); ib_enum_all_roce_netdevs(roce_gid_match_netdev, work->ndev, roce_gid_update_addr_callback, NULL); dev_put(work->ndev); kfree(work); } static void -roce_gid_queue_scan_event(struct net_device *ndev) +roce_gid_queue_scan_event(struct ifnet *ndev) { struct roce_netdev_event_work *work; retry: switch (ndev->if_type) { case IFT_ETHER: break; case IFT_L2VLAN: ndev = rdma_vlan_dev_real_dev(ndev); if (ndev != NULL) goto retry; /* FALLTHROUGH */ default: return; } work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) { pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n"); return; } INIT_WORK(&work->work, roce_gid_queue_scan_event_handler); dev_hold(ndev); work->ndev = ndev; queue_work(roce_gid_mgmt_wq, &work->work); } static void roce_gid_delete_all_event_handler(struct work_struct *_work) { struct roce_netdev_event_work *work = container_of(_work, struct roce_netdev_event_work, work); ib_cache_gid_del_all_by_netdev(work->ndev); dev_put(work->ndev); kfree(work); } static void -roce_gid_delete_all_event(struct net_device *ndev) +roce_gid_delete_all_event(struct ifnet *ndev) { struct roce_netdev_event_work *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) { pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n"); return; } INIT_WORK(&work->work, roce_gid_delete_all_event_handler); dev_hold(ndev); work->ndev = ndev; queue_work(roce_gid_mgmt_wq, &work->work); /* make sure job is complete before returning */ flush_workqueue(roce_gid_mgmt_wq); } static int inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct ifnet *ndev = netdev_notifier_info_to_ifp(ptr); switch (event) { case NETDEV_UNREGISTER: roce_gid_delete_all_event(ndev); break; case NETDEV_REGISTER: case NETDEV_CHANGEADDR: case NETDEV_CHANGEIFADDR: roce_gid_queue_scan_event(ndev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block nb_inetaddr = { .notifier_call = inetaddr_event }; static eventhandler_tag eh_ifnet_event; static void roce_ifnet_event(void *arg, struct ifnet *ifp, int event) { if (event != IFNET_EVENT_PCP || is_vlan_dev(ifp)) return; /* make sure GID table is reloaded */ roce_gid_delete_all_event(ifp); roce_gid_queue_scan_event(ifp); } static void roce_rescan_device_handler(struct work_struct *_work) { struct roce_rescan_work *work = container_of(_work, struct roce_rescan_work, work); ib_enum_roce_netdev(work->ib_dev, roce_gid_match_all, NULL, roce_gid_update_addr_callback, NULL); kfree(work); } /* Caller must flush system workqueue before removing the ib_device */ int roce_rescan_device(struct ib_device *ib_dev) { struct roce_rescan_work *work = kmalloc(sizeof(*work), GFP_KERNEL); if (!work) return -ENOMEM; work->ib_dev = ib_dev; INIT_WORK(&work->work, roce_rescan_device_handler); queue_work(roce_gid_mgmt_wq, &work->work); return 0; } int __init roce_gid_mgmt_init(void) { roce_gid_mgmt_wq = alloc_ordered_workqueue("roce_gid_mgmt_wq", 0); if (!roce_gid_mgmt_wq) { pr_warn("roce_gid_mgmt: can't allocate work queue\n"); return -ENOMEM; } register_inetaddr_notifier(&nb_inetaddr); /* * We rely on the netdevice notifier to enumerate all existing * devices in the system. Register to this notifier last to * make sure we will not miss any IP add/del callbacks. */ register_netdevice_notifier(&nb_inetaddr); eh_ifnet_event = EVENTHANDLER_REGISTER(ifnet_event, roce_ifnet_event, NULL, EVENTHANDLER_PRI_ANY); return 0; } void __exit roce_gid_mgmt_cleanup(void) { if (eh_ifnet_event != NULL) EVENTHANDLER_DEREGISTER(ifnet_event, eh_ifnet_event); unregister_inetaddr_notifier(&nb_inetaddr); unregister_netdevice_notifier(&nb_inetaddr); /* * Ensure all gid deletion tasks complete before we go down, * to avoid any reference to free'd memory. By the time * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. */ synchronize_rcu(); drain_workqueue(roce_gid_mgmt_wq); destroy_workqueue(roce_gid_mgmt_wq); } diff --git a/sys/ofed/drivers/infiniband/core/ib_sa_query.c b/sys/ofed/drivers/infiniband/core/ib_sa_query.c index 2b19e42020d2..b423d78d0407 100644 --- a/sys/ofed/drivers/infiniband/core/ib_sa_query.c +++ b/sys/ofed/drivers/infiniband/core/ib_sa_query.c @@ -1,1580 +1,1580 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sa.h" #include "core_priv.h" #define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100 #define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000 #define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000 struct ib_sa_sm_ah { struct ib_ah *ah; struct kref ref; u16 pkey_index; u8 src_path_mask; }; struct ib_sa_classport_cache { bool valid; struct ib_class_port_info data; }; struct ib_sa_port { struct ib_mad_agent *agent; struct ib_sa_sm_ah *sm_ah; struct work_struct update_task; struct ib_sa_classport_cache classport_info; spinlock_t classport_lock; /* protects class port info set */ spinlock_t ah_lock; u8 port_num; }; struct ib_sa_device { int start_port, end_port; struct ib_event_handler event_handler; struct ib_sa_port port[0]; }; struct ib_sa_query { void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *); void (*release)(struct ib_sa_query *); struct ib_sa_client *client; struct ib_sa_port *port; struct ib_mad_send_buf *mad_buf; struct ib_sa_sm_ah *sm_ah; int id; u32 flags; struct list_head list; /* Local svc request list */ u32 seq; /* Local svc request sequence number */ unsigned long timeout; /* Local svc timeout */ u8 path_use; /* How will the pathrecord be used */ }; #define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001 #define IB_SA_CANCEL 0x00000002 struct ib_sa_service_query { void (*callback)(int, struct ib_sa_service_rec *, void *); void *context; struct ib_sa_query sa_query; }; struct ib_sa_path_query { void (*callback)(int, struct ib_sa_path_rec *, void *); void *context; struct ib_sa_query sa_query; }; struct ib_sa_guidinfo_query { void (*callback)(int, struct ib_sa_guidinfo_rec *, void *); void *context; struct ib_sa_query sa_query; }; struct ib_sa_classport_info_query { void (*callback)(int, struct ib_class_port_info *, void *); void *context; struct ib_sa_query sa_query; }; struct ib_sa_mcmember_query { void (*callback)(int, struct ib_sa_mcmember_rec *, void *); void *context; struct ib_sa_query sa_query; }; static void ib_sa_add_one(struct ib_device *device); static void ib_sa_remove_one(struct ib_device *device, void *client_data); static struct ib_client sa_client = { .name = "sa", .add = ib_sa_add_one, .remove = ib_sa_remove_one }; static DEFINE_SPINLOCK(idr_lock); static DEFINE_IDR(query_idr); static DEFINE_SPINLOCK(tid_lock); static u32 tid; #define PATH_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_path_rec, field), \ .struct_size_bytes = sizeof ((struct ib_sa_path_rec *) 0)->field, \ .field_name = "sa_path_rec:" #field static const struct ib_field path_rec_table[] = { { PATH_REC_FIELD(service_id), .offset_words = 0, .offset_bits = 0, .size_bits = 64 }, { PATH_REC_FIELD(dgid), .offset_words = 2, .offset_bits = 0, .size_bits = 128 }, { PATH_REC_FIELD(sgid), .offset_words = 6, .offset_bits = 0, .size_bits = 128 }, { PATH_REC_FIELD(dlid), .offset_words = 10, .offset_bits = 0, .size_bits = 16 }, { PATH_REC_FIELD(slid), .offset_words = 10, .offset_bits = 16, .size_bits = 16 }, { PATH_REC_FIELD(raw_traffic), .offset_words = 11, .offset_bits = 0, .size_bits = 1 }, { RESERVED, .offset_words = 11, .offset_bits = 1, .size_bits = 3 }, { PATH_REC_FIELD(flow_label), .offset_words = 11, .offset_bits = 4, .size_bits = 20 }, { PATH_REC_FIELD(hop_limit), .offset_words = 11, .offset_bits = 24, .size_bits = 8 }, { PATH_REC_FIELD(traffic_class), .offset_words = 12, .offset_bits = 0, .size_bits = 8 }, { PATH_REC_FIELD(reversible), .offset_words = 12, .offset_bits = 8, .size_bits = 1 }, { PATH_REC_FIELD(numb_path), .offset_words = 12, .offset_bits = 9, .size_bits = 7 }, { PATH_REC_FIELD(pkey), .offset_words = 12, .offset_bits = 16, .size_bits = 16 }, { PATH_REC_FIELD(qos_class), .offset_words = 13, .offset_bits = 0, .size_bits = 12 }, { PATH_REC_FIELD(sl), .offset_words = 13, .offset_bits = 12, .size_bits = 4 }, { PATH_REC_FIELD(mtu_selector), .offset_words = 13, .offset_bits = 16, .size_bits = 2 }, { PATH_REC_FIELD(mtu), .offset_words = 13, .offset_bits = 18, .size_bits = 6 }, { PATH_REC_FIELD(rate_selector), .offset_words = 13, .offset_bits = 24, .size_bits = 2 }, { PATH_REC_FIELD(rate), .offset_words = 13, .offset_bits = 26, .size_bits = 6 }, { PATH_REC_FIELD(packet_life_time_selector), .offset_words = 14, .offset_bits = 0, .size_bits = 2 }, { PATH_REC_FIELD(packet_life_time), .offset_words = 14, .offset_bits = 2, .size_bits = 6 }, { PATH_REC_FIELD(preference), .offset_words = 14, .offset_bits = 8, .size_bits = 8 }, { RESERVED, .offset_words = 14, .offset_bits = 16, .size_bits = 48 }, }; #define MCMEMBER_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \ .struct_size_bytes = sizeof ((struct ib_sa_mcmember_rec *) 0)->field, \ .field_name = "sa_mcmember_rec:" #field static const struct ib_field mcmember_rec_table[] = { { MCMEMBER_REC_FIELD(mgid), .offset_words = 0, .offset_bits = 0, .size_bits = 128 }, { MCMEMBER_REC_FIELD(port_gid), .offset_words = 4, .offset_bits = 0, .size_bits = 128 }, { MCMEMBER_REC_FIELD(qkey), .offset_words = 8, .offset_bits = 0, .size_bits = 32 }, { MCMEMBER_REC_FIELD(mlid), .offset_words = 9, .offset_bits = 0, .size_bits = 16 }, { MCMEMBER_REC_FIELD(mtu_selector), .offset_words = 9, .offset_bits = 16, .size_bits = 2 }, { MCMEMBER_REC_FIELD(mtu), .offset_words = 9, .offset_bits = 18, .size_bits = 6 }, { MCMEMBER_REC_FIELD(traffic_class), .offset_words = 9, .offset_bits = 24, .size_bits = 8 }, { MCMEMBER_REC_FIELD(pkey), .offset_words = 10, .offset_bits = 0, .size_bits = 16 }, { MCMEMBER_REC_FIELD(rate_selector), .offset_words = 10, .offset_bits = 16, .size_bits = 2 }, { MCMEMBER_REC_FIELD(rate), .offset_words = 10, .offset_bits = 18, .size_bits = 6 }, { MCMEMBER_REC_FIELD(packet_life_time_selector), .offset_words = 10, .offset_bits = 24, .size_bits = 2 }, { MCMEMBER_REC_FIELD(packet_life_time), .offset_words = 10, .offset_bits = 26, .size_bits = 6 }, { MCMEMBER_REC_FIELD(sl), .offset_words = 11, .offset_bits = 0, .size_bits = 4 }, { MCMEMBER_REC_FIELD(flow_label), .offset_words = 11, .offset_bits = 4, .size_bits = 20 }, { MCMEMBER_REC_FIELD(hop_limit), .offset_words = 11, .offset_bits = 24, .size_bits = 8 }, { MCMEMBER_REC_FIELD(scope), .offset_words = 12, .offset_bits = 0, .size_bits = 4 }, { MCMEMBER_REC_FIELD(join_state), .offset_words = 12, .offset_bits = 4, .size_bits = 4 }, { MCMEMBER_REC_FIELD(proxy_join), .offset_words = 12, .offset_bits = 8, .size_bits = 1 }, { RESERVED, .offset_words = 12, .offset_bits = 9, .size_bits = 23 }, }; #define SERVICE_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_service_rec, field), \ .struct_size_bytes = sizeof ((struct ib_sa_service_rec *) 0)->field, \ .field_name = "sa_service_rec:" #field static const struct ib_field service_rec_table[] = { { SERVICE_REC_FIELD(id), .offset_words = 0, .offset_bits = 0, .size_bits = 64 }, { SERVICE_REC_FIELD(gid), .offset_words = 2, .offset_bits = 0, .size_bits = 128 }, { SERVICE_REC_FIELD(pkey), .offset_words = 6, .offset_bits = 0, .size_bits = 16 }, { SERVICE_REC_FIELD(lease), .offset_words = 7, .offset_bits = 0, .size_bits = 32 }, { SERVICE_REC_FIELD(key), .offset_words = 8, .offset_bits = 0, .size_bits = 128 }, { SERVICE_REC_FIELD(name), .offset_words = 12, .offset_bits = 0, .size_bits = 64*8 }, { SERVICE_REC_FIELD(data8), .offset_words = 28, .offset_bits = 0, .size_bits = 16*8 }, { SERVICE_REC_FIELD(data16), .offset_words = 32, .offset_bits = 0, .size_bits = 8*16 }, { SERVICE_REC_FIELD(data32), .offset_words = 36, .offset_bits = 0, .size_bits = 4*32 }, { SERVICE_REC_FIELD(data64), .offset_words = 40, .offset_bits = 0, .size_bits = 2*64 }, }; #define CLASSPORTINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \ .struct_size_bytes = sizeof((struct ib_class_port_info *)0)->field, \ .field_name = "ib_class_port_info:" #field static const struct ib_field classport_info_rec_table[] = { { CLASSPORTINFO_REC_FIELD(base_version), .offset_words = 0, .offset_bits = 0, .size_bits = 8 }, { CLASSPORTINFO_REC_FIELD(class_version), .offset_words = 0, .offset_bits = 8, .size_bits = 8 }, { CLASSPORTINFO_REC_FIELD(capability_mask), .offset_words = 0, .offset_bits = 16, .size_bits = 16 }, { CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time), .offset_words = 1, .offset_bits = 0, .size_bits = 32 }, { CLASSPORTINFO_REC_FIELD(redirect_gid), .offset_words = 2, .offset_bits = 0, .size_bits = 128 }, { CLASSPORTINFO_REC_FIELD(redirect_tcslfl), .offset_words = 6, .offset_bits = 0, .size_bits = 32 }, { CLASSPORTINFO_REC_FIELD(redirect_lid), .offset_words = 7, .offset_bits = 0, .size_bits = 16 }, { CLASSPORTINFO_REC_FIELD(redirect_pkey), .offset_words = 7, .offset_bits = 16, .size_bits = 16 }, { CLASSPORTINFO_REC_FIELD(redirect_qp), .offset_words = 8, .offset_bits = 0, .size_bits = 32 }, { CLASSPORTINFO_REC_FIELD(redirect_qkey), .offset_words = 9, .offset_bits = 0, .size_bits = 32 }, { CLASSPORTINFO_REC_FIELD(trap_gid), .offset_words = 10, .offset_bits = 0, .size_bits = 128 }, { CLASSPORTINFO_REC_FIELD(trap_tcslfl), .offset_words = 14, .offset_bits = 0, .size_bits = 32 }, { CLASSPORTINFO_REC_FIELD(trap_lid), .offset_words = 15, .offset_bits = 0, .size_bits = 16 }, { CLASSPORTINFO_REC_FIELD(trap_pkey), .offset_words = 15, .offset_bits = 16, .size_bits = 16 }, { CLASSPORTINFO_REC_FIELD(trap_hlqp), .offset_words = 16, .offset_bits = 0, .size_bits = 32 }, { CLASSPORTINFO_REC_FIELD(trap_qkey), .offset_words = 17, .offset_bits = 0, .size_bits = 32 }, }; #define GUIDINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \ .field_name = "sa_guidinfo_rec:" #field static const struct ib_field guidinfo_rec_table[] = { { GUIDINFO_REC_FIELD(lid), .offset_words = 0, .offset_bits = 0, .size_bits = 16 }, { GUIDINFO_REC_FIELD(block_num), .offset_words = 0, .offset_bits = 16, .size_bits = 8 }, { GUIDINFO_REC_FIELD(res1), .offset_words = 0, .offset_bits = 24, .size_bits = 8 }, { GUIDINFO_REC_FIELD(res2), .offset_words = 1, .offset_bits = 0, .size_bits = 32 }, { GUIDINFO_REC_FIELD(guid_info_list), .offset_words = 2, .offset_bits = 0, .size_bits = 512 }, }; static inline void ib_sa_disable_local_svc(struct ib_sa_query *query) { query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE; } static void free_sm_ah(struct kref *kref) { struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); ib_destroy_ah(sm_ah->ah); kfree(sm_ah); } static void update_sm_ah(struct work_struct *work) { struct ib_sa_port *port = container_of(work, struct ib_sa_port, update_task); struct ib_sa_sm_ah *new_ah; struct ib_port_attr port_attr; struct ib_ah_attr ah_attr; if (ib_query_port(port->agent->device, port->port_num, &port_attr)) { pr_warn("Couldn't query port\n"); return; } new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL); if (!new_ah) { return; } kref_init(&new_ah->ref); new_ah->src_path_mask = (1 << port_attr.lmc) - 1; new_ah->pkey_index = 0; if (ib_find_pkey(port->agent->device, port->port_num, IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index)) pr_err("Couldn't find index for default PKey\n"); memset(&ah_attr, 0, sizeof ah_attr); ah_attr.dlid = port_attr.sm_lid; ah_attr.sl = port_attr.sm_sl; ah_attr.port_num = port->port_num; if (port_attr.grh_required) { ah_attr.ah_flags = IB_AH_GRH; ah_attr.grh.dgid.global.subnet_prefix = cpu_to_be64(port_attr.subnet_prefix); ah_attr.grh.dgid.global.interface_id = cpu_to_be64(IB_SA_WELL_KNOWN_GUID); } new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr); if (IS_ERR(new_ah->ah)) { pr_warn("Couldn't create new SM AH\n"); kfree(new_ah); return; } spin_lock_irq(&port->ah_lock); if (port->sm_ah) kref_put(&port->sm_ah->ref, free_sm_ah); port->sm_ah = new_ah; spin_unlock_irq(&port->ah_lock); } static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event) { if (event->event == IB_EVENT_PORT_ERR || event->event == IB_EVENT_PORT_ACTIVE || event->event == IB_EVENT_LID_CHANGE || event->event == IB_EVENT_PKEY_CHANGE || event->event == IB_EVENT_SM_CHANGE || event->event == IB_EVENT_CLIENT_REREGISTER) { unsigned long flags; struct ib_sa_device *sa_dev = container_of(handler, typeof(*sa_dev), event_handler); struct ib_sa_port *port = &sa_dev->port[event->element.port_num - sa_dev->start_port]; if (!rdma_cap_ib_sa(handler->device, port->port_num)) return; spin_lock_irqsave(&port->ah_lock, flags); if (port->sm_ah) kref_put(&port->sm_ah->ref, free_sm_ah); port->sm_ah = NULL; spin_unlock_irqrestore(&port->ah_lock, flags); if (event->event == IB_EVENT_SM_CHANGE || event->event == IB_EVENT_CLIENT_REREGISTER || event->event == IB_EVENT_LID_CHANGE) { spin_lock_irqsave(&port->classport_lock, flags); port->classport_info.valid = false; spin_unlock_irqrestore(&port->classport_lock, flags); } queue_work(ib_wq, &sa_dev->port[event->element.port_num - sa_dev->start_port].update_task); } } void ib_sa_register_client(struct ib_sa_client *client) { atomic_set(&client->users, 1); init_completion(&client->comp); } EXPORT_SYMBOL(ib_sa_register_client); void ib_sa_unregister_client(struct ib_sa_client *client) { ib_sa_client_put(client); wait_for_completion(&client->comp); } EXPORT_SYMBOL(ib_sa_unregister_client); /** * ib_sa_cancel_query - try to cancel an SA query * @id:ID of query to cancel * @query:query pointer to cancel * * Try to cancel an SA query. If the id and query don't match up or * the query has already completed, nothing is done. Otherwise the * query is canceled and will complete with a status of -EINTR. */ void ib_sa_cancel_query(int id, struct ib_sa_query *query) { unsigned long flags; struct ib_mad_agent *agent; struct ib_mad_send_buf *mad_buf; spin_lock_irqsave(&idr_lock, flags); if (idr_find(&query_idr, id) != query) { spin_unlock_irqrestore(&idr_lock, flags); return; } agent = query->port->agent; mad_buf = query->mad_buf; spin_unlock_irqrestore(&idr_lock, flags); ib_cancel_mad(agent, mad_buf); } EXPORT_SYMBOL(ib_sa_cancel_query); static u8 get_src_path_mask(struct ib_device *device, u8 port_num) { struct ib_sa_device *sa_dev; struct ib_sa_port *port; unsigned long flags; u8 src_path_mask; sa_dev = ib_get_client_data(device, &sa_client); if (!sa_dev) return 0x7f; port = &sa_dev->port[port_num - sa_dev->start_port]; spin_lock_irqsave(&port->ah_lock, flags); src_path_mask = port->sm_ah ? port->sm_ah->src_path_mask : 0x7f; spin_unlock_irqrestore(&port->ah_lock, flags); return src_path_mask; } int ib_init_ah_from_path(struct ib_device *device, u8 port_num, struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr) { int ret; u16 gid_index; int use_roce; - struct net_device *ndev = NULL; + struct ifnet *ndev = NULL; memset(ah_attr, 0, sizeof *ah_attr); ah_attr->dlid = be16_to_cpu(rec->dlid); ah_attr->sl = rec->sl; ah_attr->src_path_bits = be16_to_cpu(rec->slid) & get_src_path_mask(device, port_num); ah_attr->port_num = port_num; ah_attr->static_rate = rec->rate; use_roce = rdma_cap_eth_ah(device, port_num); if (use_roce) { - struct net_device *idev; - struct net_device *resolved_dev; + struct ifnet *idev; + struct ifnet *resolved_dev; struct rdma_dev_addr dev_addr = {.bound_dev_if = rec->ifindex, .net = rec->net ? rec->net : &init_net}; union rdma_sockaddr sgid_addr, dgid_addr; if (!device->get_netdev) return -EOPNOTSUPP; rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid); rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid); /* validate the route */ ret = rdma_resolve_ip_route(&sgid_addr._sockaddr, &dgid_addr._sockaddr, &dev_addr); if (ret) return ret; if ((dev_addr.network == RDMA_NETWORK_IPV4 || dev_addr.network == RDMA_NETWORK_IPV6) && rec->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) return -EINVAL; idev = device->get_netdev(device, port_num); if (!idev) return -ENODEV; resolved_dev = dev_get_by_index(dev_addr.net, dev_addr.bound_dev_if); if (!resolved_dev) { dev_put(idev); return -ENODEV; } ndev = ib_get_ndev_from_path(rec); if ((ndev && ndev != resolved_dev) || (resolved_dev != idev && rdma_vlan_dev_real_dev(resolved_dev) != idev)) ret = -EHOSTUNREACH; dev_put(idev); dev_put(resolved_dev); if (ret) { if (ndev) dev_put(ndev); return ret; } } if (rec->hop_limit > 0 || use_roce) { ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->dgid; ret = ib_find_cached_gid_by_port(device, &rec->sgid, rec->gid_type, port_num, ndev, &gid_index); if (ret) { if (ndev) dev_put(ndev); return ret; } ah_attr->grh.sgid_index = gid_index; ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label); ah_attr->grh.hop_limit = rec->hop_limit; ah_attr->grh.traffic_class = rec->traffic_class; if (ndev) dev_put(ndev); } if (use_roce) memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN); return 0; } EXPORT_SYMBOL(ib_init_ah_from_path); static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) { unsigned long flags; spin_lock_irqsave(&query->port->ah_lock, flags); if (!query->port->sm_ah) { spin_unlock_irqrestore(&query->port->ah_lock, flags); return -EAGAIN; } kref_get(&query->port->sm_ah->ref); query->sm_ah = query->port->sm_ah; spin_unlock_irqrestore(&query->port->ah_lock, flags); query->mad_buf = ib_create_send_mad(query->port->agent, 1, query->sm_ah->pkey_index, 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, gfp_mask, IB_MGMT_BASE_VERSION); if (IS_ERR(query->mad_buf)) { kref_put(&query->sm_ah->ref, free_sm_ah); return -ENOMEM; } query->mad_buf->ah = query->sm_ah->ah; return 0; } static void free_mad(struct ib_sa_query *query) { ib_free_send_mad(query->mad_buf); kref_put(&query->sm_ah->ref, free_sm_ah); } static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent) { unsigned long flags; memset(mad, 0, sizeof *mad); mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION; mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; mad->mad_hdr.class_version = IB_SA_CLASS_VERSION; spin_lock_irqsave(&tid_lock, flags); mad->mad_hdr.tid = cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++); spin_unlock_irqrestore(&tid_lock, flags); } static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) { bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; int ret, id; if (preload) idr_preload(gfp_mask); spin_lock_irqsave(&idr_lock, flags); id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT); spin_unlock_irqrestore(&idr_lock, flags); if (preload) idr_preload_end(); if (id < 0) return id; query->mad_buf->timeout_ms = timeout_ms; query->mad_buf->context[0] = query; query->id = id; if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) { ib_sa_disable_local_svc(query); } ret = ib_post_send_mad(query->mad_buf, NULL); if (ret) { spin_lock_irqsave(&idr_lock, flags); idr_remove(&query_idr, id); spin_unlock_irqrestore(&idr_lock, flags); } /* * It's not safe to dereference query any more, because the * send may already have completed and freed the query in * another context. */ return ret ? ret : id; } void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec) { ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec); } EXPORT_SYMBOL(ib_sa_unpack_path); void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute) { ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute); } EXPORT_SYMBOL(ib_sa_pack_path); static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { struct ib_sa_path_query *query = container_of(sa_query, struct ib_sa_path_query, sa_query); if (mad) { struct ib_sa_path_rec rec; ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), mad->data, &rec); rec.net = NULL; rec.ifindex = 0; rec.gid_type = IB_GID_TYPE_IB; eth_zero_addr(rec.dmac); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); } static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) { kfree(container_of(sa_query, struct ib_sa_path_query, sa_query)); } /** * ib_sa_path_rec_get - Start a Path get query * @client:SA client * @device:device to send query on * @port_num: port number to send query on * @rec:Path Record to send in query * @comp_mask:component mask to send in query * @timeout_ms:time to wait for response * @gfp_mask:GFP mask to use for internal allocations * @callback:function called when query completes, times out or is * canceled * @context:opaque user context passed to callback * @sa_query:query context, used to cancel query * * Send a Path Record Get query to the SA to look up a path. The * callback function will be called when the query completes (or * fails); status is 0 for a successful response, -EINTR if the query * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error * occurred sending the query. The resp parameter of the callback is * only valid if status is 0. * * If the return value of ib_sa_path_rec_get() is negative, it is an * error code. Otherwise it is a query ID that can be used to cancel * the query. */ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_path_rec *rec, ib_sa_comp_mask comp_mask, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_path_rec *resp, void *context), void *context, struct ib_sa_query **sa_query) { struct ib_sa_path_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; int ret; if (!sa_dev) return -ENODEV; port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; query->sa_query.port = port; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err1; ib_sa_client_get(client); query->sa_query.client = client; query->callback = callback; query->context = context; mad = query->sa_query.mad_buf->mad; init_mad(mad, agent); query->sa_query.callback = callback ? ib_sa_path_rec_callback : NULL; query->sa_query.release = ib_sa_path_rec_release; mad->mad_hdr.method = IB_MGMT_METHOD_GET; mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC); mad->sa_hdr.comp_mask = comp_mask; ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, mad->data); *sa_query = &query->sa_query; query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE; query->sa_query.mad_buf->context[1] = rec; ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err2; return ret; err2: *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); err1: kfree(query); return ret; } EXPORT_SYMBOL(ib_sa_path_rec_get); static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { struct ib_sa_service_query *query = container_of(sa_query, struct ib_sa_service_query, sa_query); if (mad) { struct ib_sa_service_rec rec; ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table), mad->data, &rec); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); } static void ib_sa_service_rec_release(struct ib_sa_query *sa_query) { kfree(container_of(sa_query, struct ib_sa_service_query, sa_query)); } /** * ib_sa_service_rec_query - Start Service Record operation * @client:SA client * @device:device to send request on * @port_num: port number to send request on * @method:SA method - should be get, set, or delete * @rec:Service Record to send in request * @comp_mask:component mask to send in request * @timeout_ms:time to wait for response * @gfp_mask:GFP mask to use for internal allocations * @callback:function called when request completes, times out or is * canceled * @context:opaque user context passed to callback * @sa_query:request context, used to cancel request * * Send a Service Record set/get/delete to the SA to register, * unregister or query a service record. * The callback function will be called when the request completes (or * fails); status is 0 for a successful response, -EINTR if the query * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error * occurred sending the query. The resp parameter of the callback is * only valid if status is 0. * * If the return value of ib_sa_service_rec_query() is negative, it is an * error code. Otherwise it is a request ID that can be used to cancel * the query. */ int ib_sa_service_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_service_rec *rec, ib_sa_comp_mask comp_mask, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_service_rec *resp, void *context), void *context, struct ib_sa_query **sa_query) { struct ib_sa_service_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; int ret; if (!sa_dev) return -ENODEV; port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; if (method != IB_MGMT_METHOD_GET && method != IB_MGMT_METHOD_SET && method != IB_SA_METHOD_DELETE) return -EINVAL; query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; query->sa_query.port = port; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err1; ib_sa_client_get(client); query->sa_query.client = client; query->callback = callback; query->context = context; mad = query->sa_query.mad_buf->mad; init_mad(mad, agent); query->sa_query.callback = callback ? ib_sa_service_rec_callback : NULL; query->sa_query.release = ib_sa_service_rec_release; mad->mad_hdr.method = method; mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC); mad->sa_hdr.comp_mask = comp_mask; ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table), rec, mad->data); *sa_query = &query->sa_query; ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err2; return ret; err2: *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); err1: kfree(query); return ret; } EXPORT_SYMBOL(ib_sa_service_rec_query); static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { struct ib_sa_mcmember_query *query = container_of(sa_query, struct ib_sa_mcmember_query, sa_query); if (mad) { struct ib_sa_mcmember_rec rec; ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table), mad->data, &rec); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); } static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query) { kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query)); } int ib_sa_mcmember_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), void *context, struct ib_sa_query **sa_query) { struct ib_sa_mcmember_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; int ret; if (!sa_dev) return -ENODEV; port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; query->sa_query.port = port; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err1; ib_sa_client_get(client); query->sa_query.client = client; query->callback = callback; query->context = context; mad = query->sa_query.mad_buf->mad; init_mad(mad, agent); query->sa_query.callback = callback ? ib_sa_mcmember_rec_callback : NULL; query->sa_query.release = ib_sa_mcmember_rec_release; mad->mad_hdr.method = method; mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); mad->sa_hdr.comp_mask = comp_mask; ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table), rec, mad->data); *sa_query = &query->sa_query; ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err2; return ret; err2: *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); err1: kfree(query); return ret; } /* Support GuidInfoRecord */ static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { struct ib_sa_guidinfo_query *query = container_of(sa_query, struct ib_sa_guidinfo_query, sa_query); if (mad) { struct ib_sa_guidinfo_rec rec; ib_unpack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), mad->data, &rec); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); } static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query) { kfree(container_of(sa_query, struct ib_sa_guidinfo_query, sa_query)); } int ib_sa_guid_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), void *context, struct ib_sa_query **sa_query) { struct ib_sa_guidinfo_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; int ret; if (!sa_dev) return -ENODEV; if (method != IB_MGMT_METHOD_GET && method != IB_MGMT_METHOD_SET && method != IB_SA_METHOD_DELETE) { return -EINVAL; } port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; query->sa_query.port = port; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err1; ib_sa_client_get(client); query->sa_query.client = client; query->callback = callback; query->context = context; mad = query->sa_query.mad_buf->mad; init_mad(mad, agent); query->sa_query.callback = callback ? ib_sa_guidinfo_rec_callback : NULL; query->sa_query.release = ib_sa_guidinfo_rec_release; mad->mad_hdr.method = method; mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_GUID_INFO_REC); mad->sa_hdr.comp_mask = comp_mask; ib_pack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), rec, mad->data); *sa_query = &query->sa_query; ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err2; return ret; err2: *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); err1: kfree(query); return ret; } EXPORT_SYMBOL(ib_sa_guid_info_rec_query); /* Support get SA ClassPortInfo */ static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { unsigned long flags; struct ib_sa_classport_info_query *query = container_of(sa_query, struct ib_sa_classport_info_query, sa_query); if (mad) { struct ib_class_port_info rec; ib_unpack(classport_info_rec_table, ARRAY_SIZE(classport_info_rec_table), mad->data, &rec); spin_lock_irqsave(&sa_query->port->classport_lock, flags); if (!status && !sa_query->port->classport_info.valid) { memcpy(&sa_query->port->classport_info.data, &rec, sizeof(sa_query->port->classport_info.data)); sa_query->port->classport_info.valid = true; } spin_unlock_irqrestore(&sa_query->port->classport_lock, flags); query->callback(status, &rec, query->context); } else { query->callback(status, NULL, query->context); } } static void ib_sa_portclass_info_rec_release(struct ib_sa_query *sa_query) { kfree(container_of(sa_query, struct ib_sa_classport_info_query, sa_query)); } int ib_sa_classport_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_class_port_info *resp, void *context), void *context, struct ib_sa_query **sa_query) { struct ib_sa_classport_info_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; struct ib_class_port_info cached_class_port_info; int ret; unsigned long flags; if (!sa_dev) return -ENODEV; port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; /* Use cached ClassPortInfo attribute if valid instead of sending mad */ spin_lock_irqsave(&port->classport_lock, flags); if (port->classport_info.valid && callback) { memcpy(&cached_class_port_info, &port->classport_info.data, sizeof(cached_class_port_info)); spin_unlock_irqrestore(&port->classport_lock, flags); callback(0, &cached_class_port_info, context); return 0; } spin_unlock_irqrestore(&port->classport_lock, flags); query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; query->sa_query.port = port; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err1; ib_sa_client_get(client); query->sa_query.client = client; query->callback = callback; query->context = context; mad = query->sa_query.mad_buf->mad; init_mad(mad, agent); query->sa_query.callback = callback ? ib_sa_classport_info_rec_callback : NULL; query->sa_query.release = ib_sa_portclass_info_rec_release; /* support GET only */ mad->mad_hdr.method = IB_MGMT_METHOD_GET; mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO); mad->sa_hdr.comp_mask = 0; *sa_query = &query->sa_query; ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err2; return ret; err2: *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); err1: kfree(query); return ret; } EXPORT_SYMBOL(ib_sa_classport_info_rec_query); static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) { struct ib_sa_query *query = mad_send_wc->send_buf->context[0]; unsigned long flags; if (query->callback) switch (mad_send_wc->status) { case IB_WC_SUCCESS: /* No callback -- already got recv */ break; case IB_WC_RESP_TIMEOUT_ERR: query->callback(query, -ETIMEDOUT, NULL); break; case IB_WC_WR_FLUSH_ERR: query->callback(query, -EINTR, NULL); break; default: query->callback(query, -EIO, NULL); break; } spin_lock_irqsave(&idr_lock, flags); idr_remove(&query_idr, query->id); spin_unlock_irqrestore(&idr_lock, flags); free_mad(query); ib_sa_client_put(query->client); query->release(query); } static void recv_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_sa_query *query; if (!send_buf) return; query = send_buf->context[0]; if (query->callback) { if (mad_recv_wc->wc->status == IB_WC_SUCCESS) query->callback(query, mad_recv_wc->recv_buf.mad->mad_hdr.status ? -EINVAL : 0, (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad); else query->callback(query, -EIO, NULL); } ib_free_recv_mad(mad_recv_wc); } static void ib_sa_add_one(struct ib_device *device) { struct ib_sa_device *sa_dev; int s, e, i; int count = 0; s = rdma_start_port(device); e = rdma_end_port(device); sa_dev = kzalloc(sizeof *sa_dev + (e - s + 1) * sizeof (struct ib_sa_port), GFP_KERNEL); if (!sa_dev) return; sa_dev->start_port = s; sa_dev->end_port = e; for (i = 0; i <= e - s; ++i) { spin_lock_init(&sa_dev->port[i].ah_lock); if (!rdma_cap_ib_sa(device, i + 1)) continue; sa_dev->port[i].sm_ah = NULL; sa_dev->port[i].port_num = i + s; spin_lock_init(&sa_dev->port[i].classport_lock); sa_dev->port[i].classport_info.valid = false; sa_dev->port[i].agent = ib_register_mad_agent(device, i + s, IB_QPT_GSI, NULL, 0, send_handler, recv_handler, sa_dev, 0); if (IS_ERR(sa_dev->port[i].agent)) goto err; INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah); count++; } if (!count) goto free; ib_set_client_data(device, &sa_client, sa_dev); /* * We register our event handler after everything is set up, * and then update our cached info after the event handler is * registered to avoid any problems if a port changes state * during our initialization. */ INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event); if (ib_register_event_handler(&sa_dev->event_handler)) goto err; for (i = 0; i <= e - s; ++i) { if (rdma_cap_ib_sa(device, i + 1)) update_sm_ah(&sa_dev->port[i].update_task); } return; err: while (--i >= 0) { if (rdma_cap_ib_sa(device, i + 1)) ib_unregister_mad_agent(sa_dev->port[i].agent); } free: kfree(sa_dev); return; } static void ib_sa_remove_one(struct ib_device *device, void *client_data) { struct ib_sa_device *sa_dev = client_data; int i; if (!sa_dev) return; ib_unregister_event_handler(&sa_dev->event_handler); flush_workqueue(ib_wq); for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) { if (rdma_cap_ib_sa(device, i + 1)) { ib_unregister_mad_agent(sa_dev->port[i].agent); if (sa_dev->port[i].sm_ah) kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); } } kfree(sa_dev); } int ib_sa_init(void) { int ret; get_random_bytes(&tid, sizeof tid); ret = ib_register_client(&sa_client); if (ret) { pr_err("Couldn't register ib_sa client\n"); goto err1; } ret = mcast_init(); if (ret) { pr_err("Couldn't initialize multicast handling\n"); goto err2; } return 0; err2: ib_unregister_client(&sa_client); err1: return ret; } void ib_sa_cleanup(void) { mcast_cleanup(); ib_unregister_client(&sa_client); idr_destroy(&query_idr); } diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c index 1e4358ba0a15..b0f69842ac02 100644 --- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1,1438 +1,1438 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "ipoib.h" #include #include #include #include #include #include #include #include #include MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); int ipoib_sendq_size = IPOIB_TX_RING_SIZE; int ipoib_recvq_size = IPOIB_RX_RING_SIZE; module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level = 1; module_param_named(debug_level, ipoib_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif struct ipoib_path_iter { struct ipoib_dev_priv *priv; struct ipoib_path path; }; static const u8 ipv4_bcast_addr[] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; struct workqueue_struct *ipoib_workqueue; struct ib_sa_client ipoib_sa_client; static void ipoib_add_one(struct ib_device *device); static void ipoib_remove_one(struct ib_device *device, void *client_data); -static struct net_device *ipoib_get_net_dev_by_params( +static struct ifnet *ipoib_get_net_dev_by_params( struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); static void ipoib_start(struct ifnet *dev); static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data); static struct unrhdr *ipoib_unrhdr; static void ipoib_unrhdr_init(void *arg) { ipoib_unrhdr = new_unrhdr(0, 65535, NULL); } SYSINIT(ipoib_unrhdr_init, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_init, NULL); static void ipoib_unrhdr_uninit(void *arg) { if (ipoib_unrhdr != NULL) { struct unrhdr *hdr; hdr = ipoib_unrhdr; ipoib_unrhdr = NULL; delete_unrhdr(hdr); } } SYSUNINIT(ipoib_unrhdr_uninit, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_uninit, NULL); static struct ib_client ipoib_client = { .name = "ipoib", .add = ipoib_add_one, .remove = ipoib_remove_one, .get_net_dev_by_params = ipoib_get_net_dev_by_params, }; int ipoib_open(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; ipoib_dbg(priv, "bringing up interface\n"); set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_pkey_dev_delay_open(priv)) return 0; if (ipoib_ib_dev_open(priv)) goto err_disable; if (ipoib_ib_dev_up(priv)) goto err_stop; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(cpriv); mutex_unlock(&priv->vlan_mutex); } dev->if_drv_flags |= IFF_DRV_RUNNING; dev->if_drv_flags &= ~IFF_DRV_OACTIVE; return 0; err_stop: ipoib_ib_dev_stop(priv, 1); err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); return -EINVAL; } static void ipoib_init(void *arg) { struct ifnet *dev; struct ipoib_dev_priv *priv; priv = arg; dev = priv->dev; if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(priv); queue_work(ipoib_workqueue, &priv->flush_light); } static int ipoib_stop(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; ipoib_dbg(priv, "stopping interface\n"); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); ipoib_ib_dev_down(priv, 0); ipoib_ib_dev_stop(priv, 0); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) != 0) ipoib_stop(cpriv); mutex_unlock(&priv->vlan_mutex); } return 0; } static int ipoib_propagate_ifnet_mtu(struct ipoib_dev_priv *priv, int new_mtu, bool propagate) { struct ifnet *ifp; struct ifreq ifr; int error; ifp = priv->dev; if (ifp->if_mtu == new_mtu) return (0); if (propagate) { strlcpy(ifr.ifr_name, if_name(ifp), IFNAMSIZ); ifr.ifr_mtu = new_mtu; CURVNET_SET(ifp->if_vnet); error = ifhwioctl(SIOCSIFMTU, ifp, (caddr_t)&ifr, curthread); CURVNET_RESTORE(); } else { ifp->if_mtu = new_mtu; error = 0; } return (error); } int ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu, bool propagate) { int error, prev_admin_mtu; /* dev->if_mtu > 2K ==> connected mode */ if (ipoib_cm_admin_enabled(priv)) { if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv))) return -EINVAL; if (new_mtu > priv->mcast_mtu) ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", priv->mcast_mtu); return (ipoib_propagate_ifnet_mtu(priv, new_mtu, propagate)); } if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) return -EINVAL; prev_admin_mtu = priv->admin_mtu; priv->admin_mtu = new_mtu; error = ipoib_propagate_ifnet_mtu(priv, min(priv->mcast_mtu, priv->admin_mtu), propagate); if (error == 0) { /* check for MTU change to avoid infinite loop */ if (prev_admin_mtu != new_mtu) queue_work(ipoib_workqueue, &priv->flush_light); } else priv->admin_mtu = prev_admin_mtu; return (error); } static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ipoib_dev_priv *priv = ifp->if_softc; struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; /* check if detaching */ if (priv == NULL || priv->gone != 0) return (ENXIO); switch (command) { case SIOCSIFFLAGS: if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) error = -ipoib_open(priv); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) ipoib_stop(priv); break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifp->if_drv_flags & IFF_DRV_RUNNING) queue_work(ipoib_workqueue, &priv->restart_task); break; case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0], INFINIBAND_ALEN); break; case SIOCSIFMTU: /* * Set the interface MTU. */ error = -ipoib_change_mtu(priv, ifr->ifr_mtu, false); break; default: error = EINVAL; break; } return (error); } static struct ipoib_path * __path_find(struct ipoib_dev_priv *priv, void *gid) { struct rb_node *n = priv->path_tree.rb_node; struct ipoib_path *path; int ret; while (n) { path = rb_entry(n, struct ipoib_path, rb_node); ret = memcmp(gid, path->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; else if (ret > 0) n = n->rb_right; else return path; } return NULL; } static int __path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct rb_node **n = &priv->path_tree.rb_node; struct rb_node *pn = NULL; struct ipoib_path *tpath; int ret; while (*n) { pn = *n; tpath = rb_entry(pn, struct ipoib_path, rb_node); ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = &pn->rb_left; else if (ret > 0) n = &pn->rb_right; else return -EEXIST; } rb_link_node(&path->rb_node, pn, n); rb_insert_color(&path->rb_node, &priv->path_tree); list_add_tail(&path->list, &priv->path_list); return 0; } void ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path) { _IF_DRAIN(&path->queue); if (path->ah) ipoib_put_ah(path->ah); if (ipoib_cm_get(path)) ipoib_cm_destroy_tx(ipoib_cm_get(path)); kfree(path); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_path_iter * ipoib_path_iter_init(struct ipoib_dev_priv *priv) { struct ipoib_path_iter *iter; iter = kmalloc(sizeof *iter, GFP_KERNEL); if (!iter) return NULL; iter->priv = priv; memset(iter->path.pathrec.dgid.raw, 0, 16); if (ipoib_path_iter_next(iter)) { kfree(iter); return NULL; } return iter; } int ipoib_path_iter_next(struct ipoib_path_iter *iter) { struct ipoib_dev_priv *priv = iter->priv; struct rb_node *n; struct ipoib_path *path; int ret = 1; spin_lock_irq(&priv->lock); n = rb_first(&priv->path_tree); while (n) { path = rb_entry(n, struct ipoib_path, rb_node); if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, sizeof (union ib_gid)) < 0) { iter->path = *path; ret = 0; break; } n = rb_next(n); } spin_unlock_irq(&priv->lock); return ret; } void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path) { *path = iter->path; } #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ void ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; spin_lock_irq(&priv->lock); list_for_each_entry_safe(path, tp, &priv->path_list, list) { ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n", be16_to_cpu(path->pathrec.dlid), path->pathrec.dgid.raw, ":"); path->valid = 0; } spin_unlock_irq(&priv->lock); } void ipoib_flush_paths(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; LIST_HEAD(remove_list); unsigned long flags; spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->path_list, &remove_list); list_for_each_entry(path, &remove_list, list) rb_erase(&path->rb_node, &priv->path_tree); list_for_each_entry_safe(path, tp, &remove_list, list) { if (path->query) ib_sa_cancel_query(path->query_id, path->query); spin_unlock_irqrestore(&priv->lock, flags); wait_for_completion(&path->done); ipoib_path_free(priv, path); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); } static void path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr) { struct ipoib_path *path = path_ptr; struct ipoib_dev_priv *priv = path->priv; struct ifnet *dev = priv->dev; struct ipoib_ah *ah = NULL; struct ipoib_ah *old_ah = NULL; struct epoch_tracker et; struct ifqueue mbqueue; struct mbuf *mb; unsigned long flags; if (!status) ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n", be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":"); else ipoib_dbg(priv, "PathRec status %d for GID %16D\n", status, path->pathrec.dgid.raw, ":"); bzero(&mbqueue, sizeof(mbqueue)); if (!status) { struct ib_ah_attr av; if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) ah = ipoib_create_ah(priv, priv->pd, &av); } spin_lock_irqsave(&priv->lock, flags); if (ah) { path->pathrec = *pathrec; old_ah = path->ah; path->ah = ah; ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", ah, be16_to_cpu(pathrec->dlid), pathrec->sl); for (;;) { _IF_DEQUEUE(&path->queue, mb); if (mb == NULL) break; _IF_ENQUEUE(&mbqueue, mb); } #ifdef CONFIG_INFINIBAND_IPOIB_CM if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path)) ipoib_cm_set(path, ipoib_cm_create_tx(priv, path)); #endif path->valid = 1; } path->query = NULL; complete(&path->done); spin_unlock_irqrestore(&priv->lock, flags); if (old_ah) ipoib_put_ah(old_ah); NET_EPOCH_ENTER(et); for (;;) { _IF_DEQUEUE(&mbqueue, mb); if (mb == NULL) break; mb->m_pkthdr.rcvif = dev; if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed " "to requeue packet\n"); } NET_EPOCH_EXIT(et); } static struct ipoib_path * path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { struct ipoib_path *path; if (!priv->broadcast) return NULL; path = kzalloc(sizeof *path, GFP_ATOMIC); if (!path) return NULL; path->priv = priv; bzero(&path->queue, sizeof(path->queue)); #ifdef CONFIG_INFINIBAND_IPOIB_CM memcpy(&path->hwaddr, hwaddr, INFINIBAND_ALEN); #endif memcpy(path->pathrec.dgid.raw, &hwaddr[4], sizeof (union ib_gid)); path->pathrec.sgid = priv->local_gid; path->pathrec.pkey = cpu_to_be16(priv->pkey); path->pathrec.numb_path = 1; path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; return path; } static int path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct ifnet *dev = priv->dev; ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU; struct ib_sa_path_rec p_rec; p_rec = path->pathrec; p_rec.mtu_selector = IB_SA_GT; switch (roundup_pow_of_two(dev->if_mtu + IPOIB_ENCAP_LEN)) { case 512: p_rec.mtu = IB_MTU_256; break; case 1024: p_rec.mtu = IB_MTU_512; break; case 2048: p_rec.mtu = IB_MTU_1024; break; case 4096: p_rec.mtu = IB_MTU_2048; break; default: /* Wildcard everything */ comp_mask = 0; p_rec.mtu = 0; p_rec.mtu_selector = 0; } ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n", p_rec.dgid.raw, ":", comp_mask ? ib_mtu_enum_to_int(p_rec.mtu) : 0); init_completion(&path->done); path->query_id = ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, &p_rec, comp_mask | IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_TRAFFIC_CLASS | IB_SA_PATH_REC_PKEY, 1000, GFP_ATOMIC, path_rec_completion, path, &path->query); if (path->query_id < 0) { ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); path->query = NULL; complete(&path->done); return path->query_id; } return 0; } static void ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh) { struct ipoib_path *path; path = __path_find(priv, eh->hwaddr + 4); if (!path || !path->valid) { int new_path = 0; if (!path) { path = path_rec_create(priv, eh->hwaddr); new_path = 1; } if (path) { if (_IF_QLEN(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) _IF_ENQUEUE(&path->queue, mb); else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } if (!path->query && path_rec_start(priv, path)) { if (new_path) ipoib_path_free(priv, path); return; } else __path_add(priv, path); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } return; } if (ipoib_cm_get(path) && ipoib_cm_up(path)) { ipoib_cm_send(priv, mb, ipoib_cm_get(path)); } else if (path->ah) { ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr)); } else if ((path->query || !path_rec_start(priv, path)) && path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) { _IF_ENQUEUE(&path->queue, mb); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } } static int ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb) { struct ipoib_header *eh; eh = mtod(mb, struct ipoib_header *); if (IPOIB_IS_MULTICAST(eh->hwaddr)) { /* Add in the P_Key for multicast*/ eh->hwaddr[8] = (priv->pkey >> 8) & 0xff; eh->hwaddr[9] = priv->pkey & 0xff; ipoib_mcast_send(priv, eh->hwaddr + 4, mb); } else ipoib_unicast_send(mb, priv, eh); return 0; } void ipoib_start_locked(struct ifnet *dev, struct ipoib_dev_priv *priv) { struct mbuf *mb; assert_spin_locked(&priv->lock); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd) && (dev->if_drv_flags & IFF_DRV_OACTIVE) == 0) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; INFINIBAND_BPF_MTAP(dev, mb); ipoib_send_one(priv, mb); } } static void _ipoib_start(struct ifnet *dev, struct ipoib_dev_priv *priv) { if ((dev->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; spin_lock(&priv->lock); ipoib_start_locked(dev, priv); spin_unlock(&priv->lock); } static void ipoib_start(struct ifnet *dev) { _ipoib_start(dev, dev->if_softc); } static void ipoib_vlan_start(struct ifnet *dev) { struct ipoib_dev_priv *priv; struct mbuf *mb; priv = VLAN_COOKIE(dev); if (priv != NULL) return _ipoib_start(dev, priv); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd)) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; m_freem(mb); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); } } int ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port) { /* Allocate RX/TX "rings" to hold queued mbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", ca->name, ipoib_recvq_size); goto out; } priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL); if (!priv->tx_ring) { printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring); /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ if (ipoib_ib_dev_init(priv, ca, port)) goto out_tx_ring_cleanup; return 0; out_tx_ring_cleanup: kfree(priv->tx_ring); out_rx_ring_cleanup: kfree(priv->rx_ring); out: return -ENOMEM; } static void ipoib_detach(struct ipoib_dev_priv *priv) { struct ifnet *dev; dev = priv->dev; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { priv->gone = 1; infiniband_ifdetach(dev); if_free(dev); free_unr(ipoib_unrhdr, priv->unit); } else VLAN_SETCOOKIE(priv->dev, NULL); free(priv, M_TEMP); } void ipoib_dev_cleanup(struct ipoib_dev_priv *priv) { struct ipoib_dev_priv *cpriv, *tcpriv; /* Delete any child interfaces first */ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { ipoib_dev_cleanup(cpriv); ipoib_detach(cpriv); } ipoib_ib_dev_cleanup(priv); kfree(priv->rx_ring); kfree(priv->tx_ring); priv->rx_ring = NULL; priv->tx_ring = NULL; } static struct ipoib_dev_priv * ipoib_priv_alloc(void) { struct ipoib_dev_priv *priv; priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK); spin_lock_init(&priv->lock); spin_lock_init(&priv->drain_lock); mutex_init(&priv->vlan_mutex); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); INIT_LIST_HEAD(&priv->dead_ahs); INIT_LIST_HEAD(&priv->multicast_list); INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); memcpy(priv->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN); return (priv); } struct ipoib_dev_priv * ipoib_intf_alloc(const char *name) { struct ipoib_dev_priv *priv; struct ifnet *dev; priv = ipoib_priv_alloc(); dev = priv->dev = if_alloc(IFT_INFINIBAND); if (!dev) { free(priv, M_TEMP); return NULL; } dev->if_softc = priv; priv->unit = alloc_unr(ipoib_unrhdr); if (priv->unit == -1) { if_free(dev); free(priv, M_TEMP); return NULL; } if_initname(dev, name, priv->unit); dev->if_flags = IFF_BROADCAST | IFF_MULTICAST; infiniband_ifattach(dev, NULL, priv->broadcastaddr); dev->if_init = ipoib_init; dev->if_ioctl = ipoib_ioctl; dev->if_start = ipoib_start; dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2; priv->dev = dev; if_link_state_change(dev, LINK_STATE_DOWN); return dev->if_softc; } int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) { struct ib_device_attr *device_attr = &hca->attrs; priv->hca_caps = device_attr->device_cap_flags; priv->dev->if_hwassist = 0; priv->dev->if_capabilities = 0; #ifndef CONFIG_INFINIBAND_IPOIB_CM if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { set_bit(IPOIB_FLAG_CSUM, &priv->flags); priv->dev->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP; priv->dev->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM; } #if 0 if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) { priv->dev->if_capabilities |= IFCAP_TSO4; priv->dev->if_hwassist |= CSUM_TSO; } #endif #endif priv->dev->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE; priv->dev->if_capenable = priv->dev->if_capabilities; return 0; } static struct ifnet * ipoib_add_port(const char *format, struct ib_device *hca, u8 port) { struct ipoib_dev_priv *priv; struct ib_port_attr attr; int result = -ENOMEM; priv = ipoib_intf_alloc(format); if (!priv) goto alloc_mem_failed; if (!ib_query_port(hca, port, &attr)) priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); else { printk(KERN_WARNING "%s: ib_query_port %d failed\n", hca->name, port); goto device_init_failed; } /* MTU will be reset when mcast join happens */ priv->dev->if_mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = priv->dev->if_mtu; result = ib_query_pkey(hca, port, 0, &priv->pkey); if (result) { printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", hca->name, port, result); goto device_init_failed; } if (ipoib_set_dev_features(priv, hca)) goto device_init_failed; /* * Set the full membership bit, so that we join the right * broadcast group, etc. */ priv->pkey |= 0x8000; priv->broadcastaddr[8] = priv->pkey >> 8; priv->broadcastaddr[9] = priv->pkey & 0xff; result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL); if (result) { printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", hca->name, port, result); goto device_init_failed; } memcpy(IF_LLADDR(priv->dev) + 4, priv->local_gid.raw, sizeof (union ib_gid)); result = ipoib_dev_init(priv, hca, port); if (result < 0) { printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", hca->name, port, result); goto device_init_failed; } if (ipoib_cm_admin_enabled(priv)) priv->dev->if_mtu = IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)); INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); result = ib_register_event_handler(&priv->event_handler); if (result < 0) { printk(KERN_WARNING "%s: ib_register_event_handler failed for " "port %d (ret = %d)\n", hca->name, port, result); goto event_failed; } if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port); return priv->dev; event_failed: ipoib_dev_cleanup(priv); device_init_failed: ipoib_detach(priv); alloc_mem_failed: return ERR_PTR(result); } static void ipoib_add_one(struct ib_device *device) { struct list_head *dev_list; struct ifnet *dev; struct ipoib_dev_priv *priv; int s, e, p; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) return; dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); if (!dev_list) return; INIT_LIST_HEAD(dev_list); if (device->node_type == RDMA_NODE_IB_SWITCH) { s = 0; e = 0; } else { s = 1; e = device->phys_port_cnt; } for (p = s; p <= e; ++p) { if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) continue; dev = ipoib_add_port("ib", device, p); if (!IS_ERR(dev)) { priv = dev->if_softc; list_add_tail(&priv->list, dev_list); } } ib_set_client_data(device, &ipoib_client, dev_list); } static void ipoib_remove_one(struct ib_device *device, void *client_data) { struct ipoib_dev_priv *priv, *tmp; struct list_head *dev_list = client_data; if (!dev_list) return; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) return; list_for_each_entry_safe(priv, tmp, dev_list, list) { if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND) continue; ipoib_stop(priv); ib_unregister_event_handler(&priv->event_handler); /* dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); */ flush_workqueue(ipoib_workqueue); ipoib_dev_cleanup(priv); ipoib_detach(priv); } kfree(dev_list); } static int -ipoib_match_dev_addr(const struct sockaddr *addr, struct net_device *dev) +ipoib_match_dev_addr(const struct sockaddr *addr, struct ifnet *dev) { struct epoch_tracker et; struct ifaddr *ifa; int retval = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != addr->sa_family || ifa->ifa_addr->sa_len != addr->sa_len) { continue; } if (memcmp(ifa->ifa_addr, addr, addr->sa_len) == 0) { retval = 1; break; } } NET_EPOCH_EXIT(et); return (retval); } /* * ipoib_match_gid_pkey_addr - returns the number of IPoIB netdevs on * top a given ipoib device matching a pkey_index and address, if one * exists. * * @found_net_dev: contains a matching net_device if the return value * >= 1, with a reference held. */ static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv, const union ib_gid *gid, u16 pkey_index, const struct sockaddr *addr, - struct net_device **found_net_dev) + struct ifnet **found_net_dev) { struct ipoib_dev_priv *child_priv; int matches = 0; if (priv->pkey_index == pkey_index && (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) { if (addr == NULL || ipoib_match_dev_addr(addr, priv->dev) != 0) { if (*found_net_dev == NULL) { - struct net_device *net_dev; + struct ifnet *net_dev; if (priv->parent != NULL) net_dev = priv->parent; else net_dev = priv->dev; *found_net_dev = net_dev; dev_hold(net_dev); } matches++; } } /* Check child interfaces */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(child_priv, &priv->child_intfs, list) { matches += ipoib_match_gid_pkey_addr(child_priv, gid, pkey_index, addr, found_net_dev); if (matches > 1) break; } mutex_unlock(&priv->vlan_mutex); return matches; } /* * __ipoib_get_net_dev_by_params - returns the number of matching * net_devs found (between 0 and 2). Also return the matching * net_device in the @net_dev parameter, holding a reference to the * net_device, if the number of matches >= 1 */ static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port, u16 pkey_index, const union ib_gid *gid, - const struct sockaddr *addr, struct net_device **net_dev) + const struct sockaddr *addr, struct ifnet **net_dev) { struct ipoib_dev_priv *priv; int matches = 0; *net_dev = NULL; list_for_each_entry(priv, dev_list, list) { if (priv->port != port) continue; matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index, addr, net_dev); if (matches > 1) break; } return matches; } -static struct net_device * +static struct ifnet * ipoib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data) { - struct net_device *net_dev; + struct ifnet *net_dev; struct list_head *dev_list = client_data; u16 pkey_index; int matches; int ret; if (!rdma_protocol_ib(dev, port)) return NULL; ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index); if (ret) return NULL; if (!dev_list) return NULL; /* See if we can find a unique device matching the L2 parameters */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, NULL, &net_dev); switch (matches) { case 0: return NULL; case 1: return net_dev; } dev_put(net_dev); /* Couldn't find a unique device with L2 parameters only. Use L3 * address to uniquely match the net device */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, addr, &net_dev); switch (matches) { case 0: return NULL; default: dev_warn_ratelimited(&dev->dev, "duplicate IP address detected\n"); /* Fall through */ case 1: return net_dev; } } static void ipoib_config_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct ipoib_dev_priv *parent; struct ipoib_dev_priv *priv; struct epoch_tracker et; struct ifnet *dev; uint16_t pkey; int error; if (ifp->if_type != IFT_INFINIBAND) return; NET_EPOCH_ENTER(et); dev = VLAN_DEVAT(ifp, vtag); NET_EPOCH_EXIT(et); if (dev == NULL) return; priv = NULL; error = 0; parent = ifp->if_softc; /* We only support 15 bits of pkey. */ if (vtag & 0x8000) return; pkey = vtag | 0x8000; /* Set full membership bit. */ if (pkey == parent->pkey) return; /* Check for dups */ mutex_lock(&parent->vlan_mutex); list_for_each_entry(priv, &parent->child_intfs, list) { if (priv->pkey == pkey) { priv = NULL; error = EBUSY; goto out; } } priv = ipoib_priv_alloc(); priv->dev = dev; priv->max_ib_mtu = parent->max_ib_mtu; priv->mcast_mtu = priv->admin_mtu = parent->dev->if_mtu; set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); error = ipoib_set_dev_features(priv, parent->ca); if (error) goto out; priv->pkey = pkey; priv->broadcastaddr[8] = pkey >> 8; priv->broadcastaddr[9] = pkey & 0xff; dev->if_broadcastaddr = priv->broadcastaddr; error = ipoib_dev_init(priv, parent->ca, parent->port); if (error) goto out; priv->parent = parent->dev; list_add_tail(&priv->list, &parent->child_intfs); VLAN_SETCOOKIE(dev, priv); dev->if_start = ipoib_vlan_start; dev->if_drv_flags &= ~IFF_DRV_RUNNING; dev->if_hdrlen = IPOIB_HEADER_LEN; if (ifp->if_drv_flags & IFF_DRV_RUNNING) ipoib_open(priv); mutex_unlock(&parent->vlan_mutex); return; out: mutex_unlock(&parent->vlan_mutex); if (priv) free(priv, M_TEMP); if (error) ipoib_warn(parent, "failed to initialize subinterface: device %s, port %d vtag 0x%X", parent->ca->name, parent->port, vtag); return; } static void ipoib_unconfig_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct ipoib_dev_priv *parent; struct ipoib_dev_priv *priv; struct epoch_tracker et; struct ifnet *dev; uint16_t pkey; if (ifp->if_type != IFT_INFINIBAND) return; NET_EPOCH_ENTER(et); dev = VLAN_DEVAT(ifp, vtag); NET_EPOCH_EXIT(et); if (dev) VLAN_SETCOOKIE(dev, NULL); pkey = vtag | 0x8000; parent = ifp->if_softc; mutex_lock(&parent->vlan_mutex); list_for_each_entry(priv, &parent->child_intfs, list) { if (priv->pkey == pkey) { ipoib_dev_cleanup(priv); list_del(&priv->list); break; } } mutex_unlock(&parent->vlan_mutex); } eventhandler_tag ipoib_vlan_attach; eventhandler_tag ipoib_vlan_detach; static int __init ipoib_init_module(void) { int ret; ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE)); #ifdef CONFIG_INFINIBAND_IPOIB_CM ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); #endif ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST); ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST); /* * We create our own workqueue mainly because we want to be * able to flush it when devices are being removed. We can't * use schedule_work()/flush_scheduled_work() because both * unregister_netdev() and linkwatch_event take the rtnl lock, * so flush_scheduled_work() can deadlock during device * removal. */ ipoib_workqueue = create_singlethread_workqueue("ipoib"); if (!ipoib_workqueue) { ret = -ENOMEM; goto err_fs; } ib_sa_register_client(&ipoib_sa_client); ret = ib_register_client(&ipoib_client); if (ret) goto err_sa; return 0; err_sa: ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); err_fs: return ret; } static void __exit ipoib_cleanup_module(void) { EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach); EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach); ib_unregister_client(&ipoib_client); ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); } module_init_order(ipoib_init_module, SI_ORDER_FIFTH); module_exit_order(ipoib_cleanup_module, SI_ORDER_FIFTH); static int ipoib_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t ipoib_mod = { .name = "ipoib", .evhand = ipoib_evhand, }; DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_DEPEND(ipoib, ibcore, 1, 1, 1); MODULE_DEPEND(ipoib, if_infiniband, 1, 1, 1); MODULE_DEPEND(ipoib, linuxkpi, 1, 1, 1); diff --git a/sys/ofed/include/rdma/ib_addr.h b/sys/ofed/include/rdma/ib_addr.h index cea39f288207..1a56970594b2 100644 --- a/sys/ofed/include/rdma/ib_addr.h +++ b/sys/ofed/include/rdma/ib_addr.h @@ -1,379 +1,379 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * $FreeBSD$ */ #if !defined(IB_ADDR_H) #define IB_ADDR_H #include #include #include #include #include #include #include #include #include #include #include #include /* Linux netdevice.h but for working on an ifnet rather than a net_device. */ #define dev_hold(d) if_ref(d) #define dev_put(d) if_rele(d) #define dev_net(d) ((d)->if_vnet) #define net_eq(a,b) ((a) == (b)) struct rdma_addr_client { atomic_t refcount; struct completion comp; }; union rdma_sockaddr { struct sockaddr _sockaddr; struct sockaddr_in _sockaddr_in; struct sockaddr_in6 _sockaddr_in6; struct sockaddr_storage _sockaddr_ss; }; /** * rdma_addr_register_client - Register an address client. */ void rdma_addr_register_client(struct rdma_addr_client *client); /** * rdma_addr_unregister_client - Deregister an address client. * @client: Client object to deregister. */ void rdma_addr_unregister_client(struct rdma_addr_client *client); /** * struct rdma_dev_addr - Contains resolved RDMA hardware addresses * @src_dev_addr: Source MAC address. * @dst_dev_addr: Destination MAC address. * @broadcast: Broadcast address of the device. * @dev_type: The interface hardware type of the device. * @bound_dev_if: An optional device interface index. * @transport: The transport type used. * @net: Network namespace containing the bound_dev_if net_dev. */ struct vnet; struct rdma_dev_addr { unsigned char src_dev_addr[MAX_ADDR_LEN]; unsigned char dst_dev_addr[MAX_ADDR_LEN]; unsigned char broadcast[MAX_ADDR_LEN]; unsigned short dev_type; int bound_dev_if; enum rdma_transport_type transport; struct vnet *net; enum rdma_network_type network; int hoplimit; }; /** * rdma_translate_ip - Translate a local IP address to an RDMA hardware * address. * * The dev_addr->net and dev_addr->bound_dev_if fields must be initialized. */ int rdma_translate_ip(const struct sockaddr *addr, struct rdma_dev_addr *dev_addr); /** * rdma_resolve_ip - Resolve source and destination IP addresses to * RDMA hardware addresses. * @client: Address client associated with request. * @src_addr: An optional source address to use in the resolution. If a * source address is not provided, a usable address will be returned via * the callback. * @dst_addr: The destination address to resolve. * @addr: A reference to a data location that will receive the resolved * addresses. The data location must remain valid until the callback has * been invoked. The net field of the addr struct must be valid. * @timeout_ms: Amount of time to wait for the address resolution to complete. * @callback: Call invoked once address resolution has completed, timed out, * or been canceled. A status of 0 indicates success. * @context: User-specified context associated with the call. */ int rdma_resolve_ip(struct rdma_addr_client *client, struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), void *context); int rdma_resolve_ip_route(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr); void rdma_addr_cancel(struct rdma_dev_addr *addr); -int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, +int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev, const unsigned char *dst_dev_addr); int rdma_addr_size(struct sockaddr *addr); int rdma_addr_size_in6(struct sockaddr_in6 *addr); int rdma_addr_size_kss(struct sockaddr_storage *addr); int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *smac, struct net_device *dev, + u8 *smac, struct ifnet *dev, int *hoplimit); static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) { return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9]; } static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey) { dev_addr->broadcast[8] = pkey >> 8; dev_addr->broadcast[9] = (unsigned char) pkey; } static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->broadcast + 4, sizeof *gid); } static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr) { return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0; } -static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev) +static inline u16 rdma_vlan_dev_vlan_id(const struct ifnet *dev) { uint16_t tag; if (dev->if_type == IFT_ETHER && dev->if_pcp != IFNET_PCP_NONE) return 0x0000; /* prio-tagged traffic */ if (VLAN_TAG(__DECONST(struct ifnet *, dev), &tag) != 0) return 0xffff; return tag; } static inline int rdma_ip2gid(const struct sockaddr *addr, union ib_gid *gid) { switch (addr->sa_family) { case AF_INET: ipv6_addr_set_v4mapped(((const struct sockaddr_in *) addr)->sin_addr.s_addr, (struct in6_addr *)gid); break; case AF_INET6: memcpy(gid->raw, &((const struct sockaddr_in6 *)addr)->sin6_addr, 16); /* make sure scope ID gets zeroed inside GID */ if (IN6_IS_SCOPE_LINKLOCAL((struct in6_addr *)gid->raw) || IN6_IS_ADDR_MC_INTFACELOCAL((struct in6_addr *)gid->raw)) { gid->raw[2] = 0; gid->raw[3] = 0; } break; default: return -EINVAL; } return 0; } /* Important - sockaddr should be a union of sockaddr_in and sockaddr_in6 */ static inline void rdma_gid2ip(struct sockaddr *out, const union ib_gid *gid) { if (ipv6_addr_v4mapped((const struct in6_addr *)gid)) { struct sockaddr_in *out_in = (struct sockaddr_in *)out; memset(out_in, 0, sizeof(*out_in)); out_in->sin_len = sizeof(*out_in); out_in->sin_family = AF_INET; memcpy(&out_in->sin_addr.s_addr, gid->raw + 12, 4); } else { struct sockaddr_in6 *out_in = (struct sockaddr_in6 *)out; memset(out_in, 0, sizeof(*out_in)); out_in->sin6_len = sizeof(*out_in); out_in->sin6_family = AF_INET6; memcpy(&out_in->sin6_addr.s6_addr, gid->raw, 16); } } static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { - struct net_device *dev; + struct ifnet *dev; struct ifaddr *ifa; #ifdef VIMAGE if (dev_addr->net == NULL) return; #endif dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (dev) { CK_STAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != AF_INET) continue; ipv6_addr_set_v4mapped(((struct sockaddr_in *) ifa->ifa_addr)->sin_addr.s_addr, (struct in6_addr *)gid); break; } dev_put(dev); } } static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { if (dev_addr->transport == RDMA_TRANSPORT_IB && dev_addr->dev_type != ARPHRD_INFINIBAND) iboe_addr_get_sgid(dev_addr, gid); else memcpy(gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid); } static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } static inline void rdma_addr_get_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid); } static inline void rdma_addr_set_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } static inline enum ib_mtu iboe_get_mtu(int mtu) { /* * reduce IB headers from effective IBoE MTU. 28 stands for * atomic header which is the biggest possible header after BTH */ mtu = mtu - IB_GRH_BYTES - IB_BTH_BYTES - 28; if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096)) return IB_MTU_4096; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048)) return IB_MTU_2048; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024)) return IB_MTU_1024; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512)) return IB_MTU_512; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256)) return IB_MTU_256; else return 0; } -static inline int iboe_get_rate(struct net_device *dev) +static inline int iboe_get_rate(struct ifnet *dev) { uint64_t baudrate = dev->if_baudrate; #ifdef if_baudrate_pf int exp; for (exp = dev->if_baudrate_pf; exp > 0; exp--) baudrate *= 10; #endif if (baudrate >= IF_Gbps(40)) return IB_RATE_40_GBPS; else if (baudrate >= IF_Gbps(30)) return IB_RATE_30_GBPS; else if (baudrate >= IF_Gbps(20)) return IB_RATE_20_GBPS; else if (baudrate >= IF_Gbps(10)) return IB_RATE_10_GBPS; else return IB_RATE_PORT_CURRENT; } static inline int rdma_link_local_addr(struct in6_addr *addr) { if (addr->s6_addr32[0] == htonl(0xfe800000) && addr->s6_addr32[1] == 0) return 1; return 0; } static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac) { memcpy(mac, &addr->s6_addr[8], 3); memcpy(mac + 3, &addr->s6_addr[13], 3); mac[0] ^= 2; } static inline int rdma_is_multicast_addr(struct in6_addr *addr) { __be32 ipv4_addr; if (addr->s6_addr[0] == 0xff) return 1; ipv4_addr = addr->s6_addr32[3]; return (ipv6_addr_v4mapped(addr) && ipv4_is_multicast(ipv4_addr)); } static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) { int i; mac[0] = 0x33; mac[1] = 0x33; for (i = 2; i < 6; ++i) mac[i] = addr->s6_addr[i + 10]; } static inline u16 rdma_get_vlan_id(union ib_gid *dgid) { u16 vid; vid = dgid->raw[11] << 8 | dgid->raw[12]; return vid < 0x1000 ? vid : 0xffff; } -static inline struct net_device *rdma_vlan_dev_real_dev(struct net_device *dev) +static inline struct ifnet *rdma_vlan_dev_real_dev(struct ifnet *dev) { struct epoch_tracker et; NET_EPOCH_ENTER(et); if (dev->if_type != IFT_ETHER || dev->if_pcp == IFNET_PCP_NONE) dev = VLAN_TRUNKDEV(dev); /* non prio-tagged traffic */ NET_EPOCH_EXIT(et); return (dev); } #endif /* IB_ADDR_H */ diff --git a/sys/ofed/include/rdma/ib_cache.h b/sys/ofed/include/rdma/ib_cache.h index 1286010bcbbd..9a3b7b799548 100644 --- a/sys/ofed/include/rdma/ib_cache.h +++ b/sys/ofed/include/rdma/ib_cache.h @@ -1,172 +1,172 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * $FreeBSD$ */ #ifndef _IB_CACHE_H #define _IB_CACHE_H #include /** * ib_get_cached_gid - Returns a cached GID table entry * @device: The device to query. * @port_num: The port number of the device to query. * @index: The index into the cached GID table to query. * @gid: The GID value found at the specified index. * @attr: The GID attribute found at the specified index (only in RoCE). * NULL means ignore (output parameter). * * ib_get_cached_gid() fetches the specified GID table entry stored in * the local software cache. */ int ib_get_cached_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid, struct ib_gid_attr *attr); /** * ib_find_cached_gid - Returns the port number and GID table index where * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. * @gid_type: The GID type to search for. * @ndev: In RoCE, the net device of the device. NULL means ignore. * @port_num: The port number of the device where the GID value was found. * @index: The index into the cached GID table where the GID was found. This * parameter may be NULL. * * ib_find_cached_gid() searches for the specified GID value in * the local software cache. */ int ib_find_cached_gid(struct ib_device *device, const union ib_gid *gid, enum ib_gid_type gid_type, - struct net_device *ndev, + struct ifnet *ndev, u8 *port_num, u16 *index); /** * ib_find_cached_gid_by_port - Returns the GID table index where a specified * GID value occurs * @device: The device to query. * @gid: The GID value to search for. * @gid_type: The GID type to search for. * @port_num: The port number of the device where the GID value sould be * searched. * @ndev: In RoCE, the net device of the device. Null means ignore. * @index: The index into the cached GID table where the GID was found. This * parameter may be NULL. * * ib_find_cached_gid() searches for the specified GID value in * the local software cache. */ int ib_find_cached_gid_by_port(struct ib_device *device, const union ib_gid *gid, enum ib_gid_type gid_type, u8 port_num, - struct net_device *ndev, + struct ifnet *ndev, u16 *index); int ib_find_gid_by_filter(struct ib_device *device, const union ib_gid *gid, u8 port_num, bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *, void *), void *context, u16 *index); /** * ib_get_cached_pkey - Returns a cached PKey table entry * @device: The device to query. * @port_num: The port number of the device to query. * @index: The index into the cached PKey table to query. * @pkey: The PKey value found at the specified index. * * ib_get_cached_pkey() fetches the specified PKey table entry stored in * the local software cache. */ int ib_get_cached_pkey(struct ib_device *device_handle, u8 port_num, int index, u16 *pkey); /** * ib_find_cached_pkey - Returns the PKey table index where a specified * PKey value occurs. * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the cached PKey table where the PKey was found. * * ib_find_cached_pkey() searches the specified PKey table in * the local software cache. */ int ib_find_cached_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index); /** * ib_find_exact_cached_pkey - Returns the PKey table index where a specified * PKey value occurs. Comparison uses the FULL 16 bits (incl membership bit) * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the cached PKey table where the PKey was found. * * ib_find_exact_cached_pkey() searches the specified PKey table in * the local software cache. */ int ib_find_exact_cached_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index); /** * ib_get_cached_lmc - Returns a cached lmc table entry * @device: The device to query. * @port_num: The port number of the device to query. * @lmc: The lmc value for the specified port for that device. * * ib_get_cached_lmc() fetches the specified lmc table entry stored in * the local software cache. */ int ib_get_cached_lmc(struct ib_device *device, u8 port_num, u8 *lmc); #endif /* _IB_CACHE_H */ diff --git a/sys/ofed/include/rdma/ib_sa.h b/sys/ofed/include/rdma/ib_sa.h index beea96a02e1a..8a0216e50571 100644 --- a/sys/ofed/include/rdma/ib_sa.h +++ b/sys/ofed/include/rdma/ib_sa.h @@ -1,474 +1,474 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * $FreeBSD$ */ #ifndef IB_SA_H #define IB_SA_H #include #include #include #include #include #include enum { IB_SA_CLASS_VERSION = 2, /* IB spec version 1.1/1.2 */ IB_SA_METHOD_GET_TABLE = 0x12, IB_SA_METHOD_GET_TABLE_RESP = 0x92, IB_SA_METHOD_DELETE = 0x15, IB_SA_METHOD_DELETE_RESP = 0x95, IB_SA_METHOD_GET_MULTI = 0x14, IB_SA_METHOD_GET_MULTI_RESP = 0x94, IB_SA_METHOD_GET_TRACE_TBL = 0x13 }; enum { IB_SA_ATTR_CLASS_PORTINFO = 0x01, IB_SA_ATTR_NOTICE = 0x02, IB_SA_ATTR_INFORM_INFO = 0x03, IB_SA_ATTR_NODE_REC = 0x11, IB_SA_ATTR_PORT_INFO_REC = 0x12, IB_SA_ATTR_SL2VL_REC = 0x13, IB_SA_ATTR_SWITCH_REC = 0x14, IB_SA_ATTR_LINEAR_FDB_REC = 0x15, IB_SA_ATTR_RANDOM_FDB_REC = 0x16, IB_SA_ATTR_MCAST_FDB_REC = 0x17, IB_SA_ATTR_SM_INFO_REC = 0x18, IB_SA_ATTR_LINK_REC = 0x20, IB_SA_ATTR_GUID_INFO_REC = 0x30, IB_SA_ATTR_SERVICE_REC = 0x31, IB_SA_ATTR_PARTITION_REC = 0x33, IB_SA_ATTR_PATH_REC = 0x35, IB_SA_ATTR_VL_ARB_REC = 0x36, IB_SA_ATTR_MC_MEMBER_REC = 0x38, IB_SA_ATTR_TRACE_REC = 0x39, IB_SA_ATTR_MULTI_PATH_REC = 0x3a, IB_SA_ATTR_SERVICE_ASSOC_REC = 0x3b, IB_SA_ATTR_INFORM_INFO_REC = 0xf3 }; enum ib_sa_selector { IB_SA_GT = 0, IB_SA_LT = 1, IB_SA_EQ = 2, /* * The meaning of "best" depends on the attribute: for * example, for MTU best will return the largest available * MTU, while for packet life time, best will return the * smallest available life time. */ IB_SA_BEST = 3 }; /* * There are 4 types of join states: * FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember. * The order corresponds to JoinState bits in MCMemberRecord. */ enum ib_sa_mc_join_states { FULLMEMBER_JOIN, NONMEMBER_JOIN, SENDONLY_NONMEBER_JOIN, SENDONLY_FULLMEMBER_JOIN, NUM_JOIN_MEMBERSHIP_TYPES, }; #define IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT BIT(12) /* * Structures for SA records are named "struct ib_sa_xxx_rec." No * attempt is made to pack structures to match the physical layout of * SA records in SA MADs; all packing and unpacking is handled by the * SA query code. * * For a record with structure ib_sa_xxx_rec, the naming convention * for the component mask value for field yyy is IB_SA_XXX_REC_YYY (we * never use different abbreviations or otherwise change the spelling * of xxx/yyy between ib_sa_xxx_rec.yyy and IB_SA_XXX_REC_YYY). * * Reserved rows are indicated with comments to help maintainability. */ #define IB_SA_PATH_REC_SERVICE_ID (IB_SA_COMP_MASK( 0) |\ IB_SA_COMP_MASK( 1)) #define IB_SA_PATH_REC_DGID IB_SA_COMP_MASK( 2) #define IB_SA_PATH_REC_SGID IB_SA_COMP_MASK( 3) #define IB_SA_PATH_REC_DLID IB_SA_COMP_MASK( 4) #define IB_SA_PATH_REC_SLID IB_SA_COMP_MASK( 5) #define IB_SA_PATH_REC_RAW_TRAFFIC IB_SA_COMP_MASK( 6) /* reserved: 7 */ #define IB_SA_PATH_REC_FLOW_LABEL IB_SA_COMP_MASK( 8) #define IB_SA_PATH_REC_HOP_LIMIT IB_SA_COMP_MASK( 9) #define IB_SA_PATH_REC_TRAFFIC_CLASS IB_SA_COMP_MASK(10) #define IB_SA_PATH_REC_REVERSIBLE IB_SA_COMP_MASK(11) #define IB_SA_PATH_REC_NUMB_PATH IB_SA_COMP_MASK(12) #define IB_SA_PATH_REC_PKEY IB_SA_COMP_MASK(13) #define IB_SA_PATH_REC_QOS_CLASS IB_SA_COMP_MASK(14) #define IB_SA_PATH_REC_SL IB_SA_COMP_MASK(15) #define IB_SA_PATH_REC_MTU_SELECTOR IB_SA_COMP_MASK(16) #define IB_SA_PATH_REC_MTU IB_SA_COMP_MASK(17) #define IB_SA_PATH_REC_RATE_SELECTOR IB_SA_COMP_MASK(18) #define IB_SA_PATH_REC_RATE IB_SA_COMP_MASK(19) #define IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(20) #define IB_SA_PATH_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(21) #define IB_SA_PATH_REC_PREFERENCE IB_SA_COMP_MASK(22) struct ib_sa_path_rec { __be64 service_id; union ib_gid dgid; union ib_gid sgid; __be16 dlid; __be16 slid; int raw_traffic; /* reserved */ __be32 flow_label; u8 hop_limit; u8 traffic_class; int reversible; u8 numb_path; __be16 pkey; __be16 qos_class; u8 sl; u8 mtu_selector; u8 mtu; u8 rate_selector; u8 rate; u8 packet_life_time_selector; u8 packet_life_time; u8 preference; u8 dmac[ETH_ALEN]; /* ignored in IB */ int ifindex; /* ignored in IB */ struct vnet *net; enum ib_gid_type gid_type; }; -static inline struct net_device *ib_get_ndev_from_path(struct ib_sa_path_rec *rec) +static inline struct ifnet *ib_get_ndev_from_path(struct ib_sa_path_rec *rec) { #ifdef VIMAGE if (rec->net == NULL) return NULL; #endif return dev_get_by_index(rec->net, rec->ifindex); } #define IB_SA_MCMEMBER_REC_MGID IB_SA_COMP_MASK( 0) #define IB_SA_MCMEMBER_REC_PORT_GID IB_SA_COMP_MASK( 1) #define IB_SA_MCMEMBER_REC_QKEY IB_SA_COMP_MASK( 2) #define IB_SA_MCMEMBER_REC_MLID IB_SA_COMP_MASK( 3) #define IB_SA_MCMEMBER_REC_MTU_SELECTOR IB_SA_COMP_MASK( 4) #define IB_SA_MCMEMBER_REC_MTU IB_SA_COMP_MASK( 5) #define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS IB_SA_COMP_MASK( 6) #define IB_SA_MCMEMBER_REC_PKEY IB_SA_COMP_MASK( 7) #define IB_SA_MCMEMBER_REC_RATE_SELECTOR IB_SA_COMP_MASK( 8) #define IB_SA_MCMEMBER_REC_RATE IB_SA_COMP_MASK( 9) #define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(10) #define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(11) #define IB_SA_MCMEMBER_REC_SL IB_SA_COMP_MASK(12) #define IB_SA_MCMEMBER_REC_FLOW_LABEL IB_SA_COMP_MASK(13) #define IB_SA_MCMEMBER_REC_HOP_LIMIT IB_SA_COMP_MASK(14) #define IB_SA_MCMEMBER_REC_SCOPE IB_SA_COMP_MASK(15) #define IB_SA_MCMEMBER_REC_JOIN_STATE IB_SA_COMP_MASK(16) #define IB_SA_MCMEMBER_REC_PROXY_JOIN IB_SA_COMP_MASK(17) struct ib_sa_mcmember_rec { union ib_gid mgid; union ib_gid port_gid; __be32 qkey; __be16 mlid; u8 mtu_selector; u8 mtu; u8 traffic_class; __be16 pkey; u8 rate_selector; u8 rate; u8 packet_life_time_selector; u8 packet_life_time; u8 sl; __be32 flow_label; u8 hop_limit; u8 scope; u8 join_state; int proxy_join; }; /* Service Record Component Mask Sec 15.2.5.14 Ver 1.1 */ #define IB_SA_SERVICE_REC_SERVICE_ID IB_SA_COMP_MASK( 0) #define IB_SA_SERVICE_REC_SERVICE_GID IB_SA_COMP_MASK( 1) #define IB_SA_SERVICE_REC_SERVICE_PKEY IB_SA_COMP_MASK( 2) /* reserved: 3 */ #define IB_SA_SERVICE_REC_SERVICE_LEASE IB_SA_COMP_MASK( 4) #define IB_SA_SERVICE_REC_SERVICE_KEY IB_SA_COMP_MASK( 5) #define IB_SA_SERVICE_REC_SERVICE_NAME IB_SA_COMP_MASK( 6) #define IB_SA_SERVICE_REC_SERVICE_DATA8_0 IB_SA_COMP_MASK( 7) #define IB_SA_SERVICE_REC_SERVICE_DATA8_1 IB_SA_COMP_MASK( 8) #define IB_SA_SERVICE_REC_SERVICE_DATA8_2 IB_SA_COMP_MASK( 9) #define IB_SA_SERVICE_REC_SERVICE_DATA8_3 IB_SA_COMP_MASK(10) #define IB_SA_SERVICE_REC_SERVICE_DATA8_4 IB_SA_COMP_MASK(11) #define IB_SA_SERVICE_REC_SERVICE_DATA8_5 IB_SA_COMP_MASK(12) #define IB_SA_SERVICE_REC_SERVICE_DATA8_6 IB_SA_COMP_MASK(13) #define IB_SA_SERVICE_REC_SERVICE_DATA8_7 IB_SA_COMP_MASK(14) #define IB_SA_SERVICE_REC_SERVICE_DATA8_8 IB_SA_COMP_MASK(15) #define IB_SA_SERVICE_REC_SERVICE_DATA8_9 IB_SA_COMP_MASK(16) #define IB_SA_SERVICE_REC_SERVICE_DATA8_10 IB_SA_COMP_MASK(17) #define IB_SA_SERVICE_REC_SERVICE_DATA8_11 IB_SA_COMP_MASK(18) #define IB_SA_SERVICE_REC_SERVICE_DATA8_12 IB_SA_COMP_MASK(19) #define IB_SA_SERVICE_REC_SERVICE_DATA8_13 IB_SA_COMP_MASK(20) #define IB_SA_SERVICE_REC_SERVICE_DATA8_14 IB_SA_COMP_MASK(21) #define IB_SA_SERVICE_REC_SERVICE_DATA8_15 IB_SA_COMP_MASK(22) #define IB_SA_SERVICE_REC_SERVICE_DATA16_0 IB_SA_COMP_MASK(23) #define IB_SA_SERVICE_REC_SERVICE_DATA16_1 IB_SA_COMP_MASK(24) #define IB_SA_SERVICE_REC_SERVICE_DATA16_2 IB_SA_COMP_MASK(25) #define IB_SA_SERVICE_REC_SERVICE_DATA16_3 IB_SA_COMP_MASK(26) #define IB_SA_SERVICE_REC_SERVICE_DATA16_4 IB_SA_COMP_MASK(27) #define IB_SA_SERVICE_REC_SERVICE_DATA16_5 IB_SA_COMP_MASK(28) #define IB_SA_SERVICE_REC_SERVICE_DATA16_6 IB_SA_COMP_MASK(29) #define IB_SA_SERVICE_REC_SERVICE_DATA16_7 IB_SA_COMP_MASK(30) #define IB_SA_SERVICE_REC_SERVICE_DATA32_0 IB_SA_COMP_MASK(31) #define IB_SA_SERVICE_REC_SERVICE_DATA32_1 IB_SA_COMP_MASK(32) #define IB_SA_SERVICE_REC_SERVICE_DATA32_2 IB_SA_COMP_MASK(33) #define IB_SA_SERVICE_REC_SERVICE_DATA32_3 IB_SA_COMP_MASK(34) #define IB_SA_SERVICE_REC_SERVICE_DATA64_0 IB_SA_COMP_MASK(35) #define IB_SA_SERVICE_REC_SERVICE_DATA64_1 IB_SA_COMP_MASK(36) #define IB_DEFAULT_SERVICE_LEASE 0xFFFFFFFF struct ib_sa_service_rec { u64 id; union ib_gid gid; __be16 pkey; /* reserved */ u32 lease; u8 key[16]; u8 name[64]; u8 data8[16]; u16 data16[8]; u32 data32[4]; u64 data64[2]; }; #define IB_SA_GUIDINFO_REC_LID IB_SA_COMP_MASK(0) #define IB_SA_GUIDINFO_REC_BLOCK_NUM IB_SA_COMP_MASK(1) #define IB_SA_GUIDINFO_REC_RES1 IB_SA_COMP_MASK(2) #define IB_SA_GUIDINFO_REC_RES2 IB_SA_COMP_MASK(3) #define IB_SA_GUIDINFO_REC_GID0 IB_SA_COMP_MASK(4) #define IB_SA_GUIDINFO_REC_GID1 IB_SA_COMP_MASK(5) #define IB_SA_GUIDINFO_REC_GID2 IB_SA_COMP_MASK(6) #define IB_SA_GUIDINFO_REC_GID3 IB_SA_COMP_MASK(7) #define IB_SA_GUIDINFO_REC_GID4 IB_SA_COMP_MASK(8) #define IB_SA_GUIDINFO_REC_GID5 IB_SA_COMP_MASK(9) #define IB_SA_GUIDINFO_REC_GID6 IB_SA_COMP_MASK(10) #define IB_SA_GUIDINFO_REC_GID7 IB_SA_COMP_MASK(11) struct ib_sa_guidinfo_rec { __be16 lid; u8 block_num; /* reserved */ u8 res1; __be32 res2; u8 guid_info_list[64]; }; struct ib_sa_client { atomic_t users; struct completion comp; }; /** * ib_sa_register_client - Register an SA client. */ void ib_sa_register_client(struct ib_sa_client *client); /** * ib_sa_unregister_client - Deregister an SA client. * @client: Client object to deregister. */ void ib_sa_unregister_client(struct ib_sa_client *client); struct ib_sa_query; void ib_sa_cancel_query(int id, struct ib_sa_query *query); int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_path_rec *rec, ib_sa_comp_mask comp_mask, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_path_rec *resp, void *context), void *context, struct ib_sa_query **query); int ib_sa_service_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_service_rec *rec, ib_sa_comp_mask comp_mask, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_service_rec *resp, void *context), void *context, struct ib_sa_query **sa_query); struct ib_sa_multicast { struct ib_sa_mcmember_rec rec; ib_sa_comp_mask comp_mask; int (*callback)(int status, struct ib_sa_multicast *multicast); void *context; }; /** * ib_sa_join_multicast - Initiates a join request to the specified multicast * group. * @client: SA client * @device: Device associated with the multicast group. * @port_num: Port on the specified device to associate with the multicast * group. * @rec: SA multicast member record specifying group attributes. * @comp_mask: Component mask indicating which group attributes of %rec are * valid. * @gfp_mask: GFP mask for memory allocations. * @callback: User callback invoked once the join operation completes. * @context: User specified context stored with the ib_sa_multicast structure. * * This call initiates a multicast join request with the SA for the specified * multicast group. If the join operation is started successfully, it returns * an ib_sa_multicast structure that is used to track the multicast operation. * Users must free this structure by calling ib_free_multicast, even if the * join operation later fails. (The callback status is non-zero.) * * If the join operation fails; status will be non-zero, with the following * failures possible: * -ETIMEDOUT: The request timed out. * -EIO: An error occurred sending the query. * -EINVAL: The MCMemberRecord values differed from the existing group's. * -ENETRESET: Indicates that an fatal error has occurred on the multicast * group, and the user must rejoin the group to continue using it. */ struct ib_sa_multicast *ib_sa_join_multicast(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, gfp_t gfp_mask, int (*callback)(int status, struct ib_sa_multicast *multicast), void *context); /** * ib_free_multicast - Frees the multicast tracking structure, and releases * any reference on the multicast group. * @multicast: Multicast tracking structure allocated by ib_join_multicast. * * This call blocks until the multicast identifier is destroyed. It may * not be called from within the multicast callback; however, returning a non- * zero value from the callback will result in destroying the multicast * tracking structure. */ void ib_sa_free_multicast(struct ib_sa_multicast *multicast); /** * ib_get_mcmember_rec - Looks up a multicast member record by its MGID and * returns it if found. * @device: Device associated with the multicast group. * @port_num: Port on the specified device to associate with the multicast * group. * @mgid: MGID of multicast group. * @rec: Location to copy SA multicast member record. */ int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num, union ib_gid *mgid, struct ib_sa_mcmember_rec *rec); /** * ib_init_ah_from_mcmember - Initialize address handle attributes based on * an SA multicast member record. */ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, - struct net_device *ndev, + struct ifnet *ndev, enum ib_gid_type gid_type, struct ib_ah_attr *ah_attr); /** * ib_init_ah_from_path - Initialize address handle attributes based on an SA * path record. */ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr); /** * ib_sa_pack_path - Conert a path record from struct ib_sa_path_rec * to IB MAD wire format. */ void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute); /** * ib_sa_unpack_path - Convert a path record from MAD format to struct * ib_sa_path_rec. */ void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec); /* Support GuidInfoRecord */ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), void *context, struct ib_sa_query **sa_query); /* Support get SA ClassPortInfo */ int ib_sa_classport_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_class_port_info *resp, void *context), void *context, struct ib_sa_query **sa_query); #endif /* IB_SA_H */ diff --git a/sys/ofed/include/rdma/ib_verbs.h b/sys/ofed/include/rdma/ib_verbs.h index 06eb71775d5c..2450682ffa66 100644 --- a/sys/ofed/include/rdma/ib_verbs.h +++ b/sys/ofed/include/rdma/ib_verbs.h @@ -1,3381 +1,3381 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * $FreeBSD$ */ #if !defined(IB_VERBS_H) #define IB_VERBS_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct ifla_vf_info; struct ifla_vf_stats; extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; union ib_gid { u8 raw[16]; struct { __be64 subnet_prefix; __be64 interface_id; } global; }; extern union ib_gid zgid; enum ib_gid_type { /* If link layer is Ethernet, this is RoCE V1 */ IB_GID_TYPE_IB = 0, IB_GID_TYPE_ROCE = 0, IB_GID_TYPE_ROCE_UDP_ENCAP = 1, IB_GID_TYPE_SIZE }; #define ROCE_V2_UDP_DPORT 4791 struct ib_gid_attr { enum ib_gid_type gid_type; - struct net_device *ndev; + struct ifnet *ndev; }; enum rdma_node_type { /* IB values map to NodeInfo:NodeType. */ RDMA_NODE_IB_CA = 1, RDMA_NODE_IB_SWITCH, RDMA_NODE_IB_ROUTER, RDMA_NODE_RNIC, RDMA_NODE_USNIC, RDMA_NODE_USNIC_UDP, }; enum { /* set the local administered indication */ IB_SA_WELL_KNOWN_GUID = BIT_ULL(57) | 2, }; enum rdma_transport_type { RDMA_TRANSPORT_IB, RDMA_TRANSPORT_IWARP, RDMA_TRANSPORT_USNIC, RDMA_TRANSPORT_USNIC_UDP }; enum rdma_protocol_type { RDMA_PROTOCOL_IB, RDMA_PROTOCOL_IBOE, RDMA_PROTOCOL_IWARP, RDMA_PROTOCOL_USNIC_UDP }; __attribute_const__ enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type); enum rdma_network_type { RDMA_NETWORK_IB, RDMA_NETWORK_ROCE_V1 = RDMA_NETWORK_IB, RDMA_NETWORK_IPV4, RDMA_NETWORK_IPV6 }; static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type) { if (network_type == RDMA_NETWORK_IPV4 || network_type == RDMA_NETWORK_IPV6) return IB_GID_TYPE_ROCE_UDP_ENCAP; /* IB_GID_TYPE_IB same as RDMA_NETWORK_ROCE_V1 */ return IB_GID_TYPE_IB; } static inline enum rdma_network_type ib_gid_to_network_type(enum ib_gid_type gid_type, union ib_gid *gid) { if (gid_type == IB_GID_TYPE_IB) return RDMA_NETWORK_IB; if (ipv6_addr_v4mapped((struct in6_addr *)gid)) return RDMA_NETWORK_IPV4; else return RDMA_NETWORK_IPV6; } enum rdma_link_layer { IB_LINK_LAYER_UNSPECIFIED, IB_LINK_LAYER_INFINIBAND, IB_LINK_LAYER_ETHERNET, }; enum ib_device_cap_flags { IB_DEVICE_RESIZE_MAX_WR = (1 << 0), IB_DEVICE_BAD_PKEY_CNTR = (1 << 1), IB_DEVICE_BAD_QKEY_CNTR = (1 << 2), IB_DEVICE_RAW_MULTI = (1 << 3), IB_DEVICE_AUTO_PATH_MIG = (1 << 4), IB_DEVICE_CHANGE_PHY_PORT = (1 << 5), IB_DEVICE_UD_AV_PORT_ENFORCE = (1 << 6), IB_DEVICE_CURR_QP_STATE_MOD = (1 << 7), IB_DEVICE_SHUTDOWN_PORT = (1 << 8), IB_DEVICE_INIT_TYPE = (1 << 9), IB_DEVICE_PORT_ACTIVE_EVENT = (1 << 10), IB_DEVICE_SYS_IMAGE_GUID = (1 << 11), IB_DEVICE_RC_RNR_NAK_GEN = (1 << 12), IB_DEVICE_SRQ_RESIZE = (1 << 13), IB_DEVICE_N_NOTIFY_CQ = (1 << 14), /* * This device supports a per-device lkey or stag that can be * used without performing a memory registration for the local * memory. Note that ULPs should never check this flag, but * instead of use the local_dma_lkey flag in the ib_pd structure, * which will always contain a usable lkey. */ IB_DEVICE_LOCAL_DMA_LKEY = (1 << 15), IB_DEVICE_RESERVED /* old SEND_W_INV */ = (1 << 16), IB_DEVICE_MEM_WINDOW = (1 << 17), /* * Devices should set IB_DEVICE_UD_IP_SUM if they support * insertion of UDP and TCP checksum on outgoing UD IPoIB * messages and can verify the validity of checksum for * incoming messages. Setting this flag implies that the * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode. */ IB_DEVICE_UD_IP_CSUM = (1 << 18), IB_DEVICE_UD_TSO = (1 << 19), IB_DEVICE_XRC = (1 << 20), /* * This device supports the IB "base memory management extension", * which includes support for fast registrations (IB_WR_REG_MR, * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs). This flag should * also be set by any iWarp device which must support FRs to comply * to the iWarp verbs spec. iWarp devices also support the * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the * stag. */ IB_DEVICE_MEM_MGT_EXTENSIONS = (1 << 21), IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1 << 22), IB_DEVICE_MEM_WINDOW_TYPE_2A = (1 << 23), IB_DEVICE_MEM_WINDOW_TYPE_2B = (1 << 24), IB_DEVICE_RC_IP_CSUM = (1 << 25), IB_DEVICE_RAW_IP_CSUM = (1 << 26), /* * Devices should set IB_DEVICE_CROSS_CHANNEL if they * support execution of WQEs that involve synchronization * of I/O operations with single completion queue managed * by hardware. */ IB_DEVICE_CROSS_CHANNEL = (1 << 27), IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29), IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30), IB_DEVICE_ON_DEMAND_PAGING = (1ULL << 31), IB_DEVICE_SG_GAPS_REG = (1ULL << 32), IB_DEVICE_VIRTUAL_FUNCTION = (1ULL << 33), IB_DEVICE_RAW_SCATTER_FCS = (1ULL << 34), }; enum ib_signature_prot_cap { IB_PROT_T10DIF_TYPE_1 = 1, IB_PROT_T10DIF_TYPE_2 = 1 << 1, IB_PROT_T10DIF_TYPE_3 = 1 << 2, }; enum ib_signature_guard_cap { IB_GUARD_T10DIF_CRC = 1, IB_GUARD_T10DIF_CSUM = 1 << 1, }; enum ib_atomic_cap { IB_ATOMIC_NONE, IB_ATOMIC_HCA, IB_ATOMIC_GLOB }; enum ib_odp_general_cap_bits { IB_ODP_SUPPORT = 1 << 0, }; enum ib_odp_transport_cap_bits { IB_ODP_SUPPORT_SEND = 1 << 0, IB_ODP_SUPPORT_RECV = 1 << 1, IB_ODP_SUPPORT_WRITE = 1 << 2, IB_ODP_SUPPORT_READ = 1 << 3, IB_ODP_SUPPORT_ATOMIC = 1 << 4, }; struct ib_odp_caps { uint64_t general_caps; struct { uint32_t rc_odp_caps; uint32_t uc_odp_caps; uint32_t ud_odp_caps; } per_transport_caps; }; struct ib_rss_caps { /* Corresponding bit will be set if qp type from * 'enum ib_qp_type' is supported, e.g. * supported_qpts |= 1 << IB_QPT_UD */ u32 supported_qpts; u32 max_rwq_indirection_tables; u32 max_rwq_indirection_table_size; }; enum ib_cq_creation_flags { IB_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, IB_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1, }; struct ib_cq_init_attr { unsigned int cqe; u32 comp_vector; u32 flags; }; struct ib_device_attr { u64 fw_ver; __be64 sys_image_guid; u64 max_mr_size; u64 page_size_cap; u32 vendor_id; u32 vendor_part_id; u32 hw_ver; int max_qp; int max_qp_wr; u64 device_cap_flags; int max_sge; int max_sge_rd; int max_cq; int max_cqe; int max_mr; int max_pd; int max_qp_rd_atom; int max_ee_rd_atom; int max_res_rd_atom; int max_qp_init_rd_atom; int max_ee_init_rd_atom; enum ib_atomic_cap atomic_cap; enum ib_atomic_cap masked_atomic_cap; int max_ee; int max_rdd; int max_mw; int max_raw_ipv6_qp; int max_raw_ethy_qp; int max_mcast_grp; int max_mcast_qp_attach; int max_total_mcast_qp_attach; int max_ah; int max_fmr; int max_map_per_fmr; int max_srq; int max_srq_wr; int max_srq_sge; unsigned int max_fast_reg_page_list_len; u16 max_pkeys; u8 local_ca_ack_delay; int sig_prot_cap; int sig_guard_cap; struct ib_odp_caps odp_caps; uint64_t timestamp_mask; uint64_t hca_core_clock; /* in KHZ */ struct ib_rss_caps rss_caps; u32 max_wq_type_rq; }; enum ib_mtu { IB_MTU_256 = 1, IB_MTU_512 = 2, IB_MTU_1024 = 3, IB_MTU_2048 = 4, IB_MTU_4096 = 5 }; static inline int ib_mtu_enum_to_int(enum ib_mtu mtu) { switch (mtu) { case IB_MTU_256: return 256; case IB_MTU_512: return 512; case IB_MTU_1024: return 1024; case IB_MTU_2048: return 2048; case IB_MTU_4096: return 4096; default: return -1; } } enum ib_port_state { IB_PORT_NOP = 0, IB_PORT_DOWN = 1, IB_PORT_INIT = 2, IB_PORT_ARMED = 3, IB_PORT_ACTIVE = 4, IB_PORT_ACTIVE_DEFER = 5, IB_PORT_DUMMY = -1, /* force enum signed */ }; enum ib_port_cap_flags { IB_PORT_SM = 1 << 1, IB_PORT_NOTICE_SUP = 1 << 2, IB_PORT_TRAP_SUP = 1 << 3, IB_PORT_OPT_IPD_SUP = 1 << 4, IB_PORT_AUTO_MIGR_SUP = 1 << 5, IB_PORT_SL_MAP_SUP = 1 << 6, IB_PORT_MKEY_NVRAM = 1 << 7, IB_PORT_PKEY_NVRAM = 1 << 8, IB_PORT_LED_INFO_SUP = 1 << 9, IB_PORT_SM_DISABLED = 1 << 10, IB_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, IB_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, IB_PORT_CM_SUP = 1 << 16, IB_PORT_SNMP_TUNNEL_SUP = 1 << 17, IB_PORT_REINIT_SUP = 1 << 18, IB_PORT_DEVICE_MGMT_SUP = 1 << 19, IB_PORT_VENDOR_CLASS_SUP = 1 << 20, IB_PORT_DR_NOTICE_SUP = 1 << 21, IB_PORT_CAP_MASK_NOTICE_SUP = 1 << 22, IB_PORT_BOOT_MGMT_SUP = 1 << 23, IB_PORT_LINK_LATENCY_SUP = 1 << 24, IB_PORT_CLIENT_REG_SUP = 1 << 25, IB_PORT_IP_BASED_GIDS = 1 << 26, }; enum ib_port_width { IB_WIDTH_1X = 1, IB_WIDTH_2X = 16, IB_WIDTH_4X = 2, IB_WIDTH_8X = 4, IB_WIDTH_12X = 8 }; static inline int ib_width_enum_to_int(enum ib_port_width width) { switch (width) { case IB_WIDTH_1X: return 1; case IB_WIDTH_2X: return 2; case IB_WIDTH_4X: return 4; case IB_WIDTH_8X: return 8; case IB_WIDTH_12X: return 12; default: return -1; } } enum ib_port_speed { IB_SPEED_SDR = 1, IB_SPEED_DDR = 2, IB_SPEED_QDR = 4, IB_SPEED_FDR10 = 8, IB_SPEED_FDR = 16, IB_SPEED_EDR = 32, IB_SPEED_HDR = 64 }; /** * struct rdma_hw_stats * @timestamp - Used by the core code to track when the last update was * @lifespan - Used by the core code to determine how old the counters * should be before being updated again. Stored in jiffies, defaults * to 10 milliseconds, drivers can override the default be specifying * their own value during their allocation routine. * @name - Array of pointers to static names used for the counters in * directory. * @num_counters - How many hardware counters there are. If name is * shorter than this number, a kernel oops will result. Driver authors * are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters) * in their code to prevent this. * @value - Array of u64 counters that are accessed by the sysfs code and * filled in by the drivers get_stats routine */ struct rdma_hw_stats { unsigned long timestamp; unsigned long lifespan; const char * const *names; int num_counters; u64 value[]; }; #define RDMA_HW_STATS_DEFAULT_LIFESPAN 10 /** * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct * for drivers. * @names - Array of static const char * * @num_counters - How many elements in array * @lifespan - How many milliseconds between updates */ static inline struct rdma_hw_stats *rdma_alloc_hw_stats_struct( const char * const *names, int num_counters, unsigned long lifespan) { struct rdma_hw_stats *stats; stats = kzalloc(sizeof(*stats) + num_counters * sizeof(u64), GFP_KERNEL); if (!stats) return NULL; stats->names = names; stats->num_counters = num_counters; stats->lifespan = msecs_to_jiffies(lifespan); return stats; } /* Define bits for the various functionality this port needs to be supported by * the core. */ /* Management 0x00000FFF */ #define RDMA_CORE_CAP_IB_MAD 0x00000001 #define RDMA_CORE_CAP_IB_SMI 0x00000002 #define RDMA_CORE_CAP_IB_CM 0x00000004 #define RDMA_CORE_CAP_IW_CM 0x00000008 #define RDMA_CORE_CAP_IB_SA 0x00000010 #define RDMA_CORE_CAP_OPA_MAD 0x00000020 /* Address format 0x000FF000 */ #define RDMA_CORE_CAP_AF_IB 0x00001000 #define RDMA_CORE_CAP_ETH_AH 0x00002000 /* Protocol 0xFFF00000 */ #define RDMA_CORE_CAP_PROT_IB 0x00100000 #define RDMA_CORE_CAP_PROT_ROCE 0x00200000 #define RDMA_CORE_CAP_PROT_IWARP 0x00400000 #define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000 #define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_SMI \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_IB_SA \ | RDMA_CORE_CAP_AF_IB) #define RDMA_CORE_PORT_IBA_ROCE (RDMA_CORE_CAP_PROT_ROCE \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_AF_IB \ | RDMA_CORE_CAP_ETH_AH) #define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \ (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_AF_IB \ | RDMA_CORE_CAP_ETH_AH) #define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \ | RDMA_CORE_CAP_IW_CM) #define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \ | RDMA_CORE_CAP_OPA_MAD) struct ib_port_attr { u64 subnet_prefix; enum ib_port_state state; enum ib_mtu max_mtu; enum ib_mtu active_mtu; int gid_tbl_len; u32 port_cap_flags; u32 max_msg_sz; u32 bad_pkey_cntr; u32 qkey_viol_cntr; u16 pkey_tbl_len; u16 lid; u16 sm_lid; u8 lmc; u8 max_vl_num; u8 sm_sl; u8 subnet_timeout; u8 init_type_reply; u8 active_width; u8 active_speed; u8 phys_state; bool grh_required; }; enum ib_device_modify_flags { IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1 << 0, IB_DEVICE_MODIFY_NODE_DESC = 1 << 1 }; #define IB_DEVICE_NODE_DESC_MAX 64 struct ib_device_modify { u64 sys_image_guid; char node_desc[IB_DEVICE_NODE_DESC_MAX]; }; enum ib_port_modify_flags { IB_PORT_SHUTDOWN = 1, IB_PORT_INIT_TYPE = (1<<2), IB_PORT_RESET_QKEY_CNTR = (1<<3) }; struct ib_port_modify { u32 set_port_cap_mask; u32 clr_port_cap_mask; u8 init_type; }; enum ib_event_type { IB_EVENT_CQ_ERR, IB_EVENT_QP_FATAL, IB_EVENT_QP_REQ_ERR, IB_EVENT_QP_ACCESS_ERR, IB_EVENT_COMM_EST, IB_EVENT_SQ_DRAINED, IB_EVENT_PATH_MIG, IB_EVENT_PATH_MIG_ERR, IB_EVENT_DEVICE_FATAL, IB_EVENT_PORT_ACTIVE, IB_EVENT_PORT_ERR, IB_EVENT_LID_CHANGE, IB_EVENT_PKEY_CHANGE, IB_EVENT_SM_CHANGE, IB_EVENT_SRQ_ERR, IB_EVENT_SRQ_LIMIT_REACHED, IB_EVENT_QP_LAST_WQE_REACHED, IB_EVENT_CLIENT_REREGISTER, IB_EVENT_GID_CHANGE, IB_EVENT_WQ_FATAL, }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event); struct ib_event { struct ib_device *device; union { struct ib_cq *cq; struct ib_qp *qp; struct ib_srq *srq; struct ib_wq *wq; u8 port_num; } element; enum ib_event_type event; }; struct ib_event_handler { struct ib_device *device; void (*handler)(struct ib_event_handler *, struct ib_event *); struct list_head list; }; #define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \ do { \ (_ptr)->device = _device; \ (_ptr)->handler = _handler; \ INIT_LIST_HEAD(&(_ptr)->list); \ } while (0) struct ib_global_route { union ib_gid dgid; u32 flow_label; u8 sgid_index; u8 hop_limit; u8 traffic_class; }; struct ib_grh { __be32 version_tclass_flow; __be16 paylen; u8 next_hdr; u8 hop_limit; union ib_gid sgid; union ib_gid dgid; }; union rdma_network_hdr { struct ib_grh ibgrh; struct { /* The IB spec states that if it's IPv4, the header * is located in the last 20 bytes of the header. */ u8 reserved[20]; struct ip roce4grh; }; }; enum { IB_MULTICAST_QPN = 0xffffff }; #define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF) #define IB_MULTICAST_LID_BASE cpu_to_be16(0xC000) enum ib_ah_flags { IB_AH_GRH = 1 }; enum ib_rate { IB_RATE_PORT_CURRENT = 0, IB_RATE_2_5_GBPS = 2, IB_RATE_5_GBPS = 5, IB_RATE_10_GBPS = 3, IB_RATE_20_GBPS = 6, IB_RATE_30_GBPS = 4, IB_RATE_40_GBPS = 7, IB_RATE_60_GBPS = 8, IB_RATE_80_GBPS = 9, IB_RATE_120_GBPS = 10, IB_RATE_14_GBPS = 11, IB_RATE_56_GBPS = 12, IB_RATE_112_GBPS = 13, IB_RATE_168_GBPS = 14, IB_RATE_25_GBPS = 15, IB_RATE_100_GBPS = 16, IB_RATE_200_GBPS = 17, IB_RATE_300_GBPS = 18, IB_RATE_28_GBPS = 19, IB_RATE_50_GBPS = 20, IB_RATE_400_GBPS = 21, IB_RATE_600_GBPS = 22, }; /** * ib_rate_to_mult - Convert the IB rate enum to a multiple of the * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. * @rate: rate to convert. */ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate); /** * ib_rate_to_mbps - Convert the IB rate enum to Mbps. * For example, IB_RATE_2_5_GBPS will be converted to 2500. * @rate: rate to convert. */ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); /** * enum ib_mr_type - memory region type * @IB_MR_TYPE_MEM_REG: memory region that is used for * normal registration * @IB_MR_TYPE_SIGNATURE: memory region that is used for * signature operations (data-integrity * capable regions) * @IB_MR_TYPE_SG_GAPS: memory region that is capable to * register any arbitrary sg lists (without * the normal mr constraints - see * ib_map_mr_sg) */ enum ib_mr_type { IB_MR_TYPE_MEM_REG, IB_MR_TYPE_SIGNATURE, IB_MR_TYPE_SG_GAPS, }; /** * Signature types * IB_SIG_TYPE_NONE: Unprotected. * IB_SIG_TYPE_T10_DIF: Type T10-DIF */ enum ib_signature_type { IB_SIG_TYPE_NONE, IB_SIG_TYPE_T10_DIF, }; /** * Signature T10-DIF block-guard types * IB_T10DIF_CRC: Corresponds to T10-PI mandated CRC checksum rules. * IB_T10DIF_CSUM: Corresponds to IP checksum rules. */ enum ib_t10_dif_bg_type { IB_T10DIF_CRC, IB_T10DIF_CSUM }; /** * struct ib_t10_dif_domain - Parameters specific for T10-DIF * domain. * @bg_type: T10-DIF block guard type (CRC|CSUM) * @pi_interval: protection information interval. * @bg: seed of guard computation. * @app_tag: application tag of guard block * @ref_tag: initial guard block reference tag. * @ref_remap: Indicate wethear the reftag increments each block * @app_escape: Indicate to skip block check if apptag=0xffff * @ref_escape: Indicate to skip block check if reftag=0xffffffff * @apptag_check_mask: check bitmask of application tag. */ struct ib_t10_dif_domain { enum ib_t10_dif_bg_type bg_type; u16 pi_interval; u16 bg; u16 app_tag; u32 ref_tag; bool ref_remap; bool app_escape; bool ref_escape; u16 apptag_check_mask; }; /** * struct ib_sig_domain - Parameters for signature domain * @sig_type: specific signauture type * @sig: union of all signature domain attributes that may * be used to set domain layout. */ struct ib_sig_domain { enum ib_signature_type sig_type; union { struct ib_t10_dif_domain dif; } sig; }; /** * struct ib_sig_attrs - Parameters for signature handover operation * @check_mask: bitmask for signature byte check (8 bytes) * @mem: memory domain layout desciptor. * @wire: wire domain layout desciptor. */ struct ib_sig_attrs { u8 check_mask; struct ib_sig_domain mem; struct ib_sig_domain wire; }; enum ib_sig_err_type { IB_SIG_BAD_GUARD, IB_SIG_BAD_REFTAG, IB_SIG_BAD_APPTAG, }; /** * struct ib_sig_err - signature error descriptor */ struct ib_sig_err { enum ib_sig_err_type err_type; u32 expected; u32 actual; u64 sig_err_offset; u32 key; }; enum ib_mr_status_check { IB_MR_CHECK_SIG_STATUS = 1, }; /** * struct ib_mr_status - Memory region status container * * @fail_status: Bitmask of MR checks status. For each * failed check a corresponding status bit is set. * @sig_err: Additional info for IB_MR_CEHCK_SIG_STATUS * failure. */ struct ib_mr_status { u32 fail_status; struct ib_sig_err sig_err; }; /** * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate * enum. * @mult: multiple to convert. */ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult); struct ib_ah_attr { struct ib_global_route grh; u16 dlid; u8 sl; u8 src_path_bits; u8 static_rate; u8 ah_flags; u8 port_num; u8 dmac[ETH_ALEN]; }; enum ib_wc_status { IB_WC_SUCCESS, IB_WC_LOC_LEN_ERR, IB_WC_LOC_QP_OP_ERR, IB_WC_LOC_EEC_OP_ERR, IB_WC_LOC_PROT_ERR, IB_WC_WR_FLUSH_ERR, IB_WC_MW_BIND_ERR, IB_WC_BAD_RESP_ERR, IB_WC_LOC_ACCESS_ERR, IB_WC_REM_INV_REQ_ERR, IB_WC_REM_ACCESS_ERR, IB_WC_REM_OP_ERR, IB_WC_RETRY_EXC_ERR, IB_WC_RNR_RETRY_EXC_ERR, IB_WC_LOC_RDD_VIOL_ERR, IB_WC_REM_INV_RD_REQ_ERR, IB_WC_REM_ABORT_ERR, IB_WC_INV_EECN_ERR, IB_WC_INV_EEC_STATE_ERR, IB_WC_FATAL_ERR, IB_WC_RESP_TIMEOUT_ERR, IB_WC_GENERAL_ERR }; const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status); enum ib_wc_opcode { IB_WC_SEND, IB_WC_RDMA_WRITE, IB_WC_RDMA_READ, IB_WC_COMP_SWAP, IB_WC_FETCH_ADD, IB_WC_LSO, IB_WC_LOCAL_INV, IB_WC_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, /* * Set value of IB_WC_RECV so consumers can test if a completion is a * receive by testing (opcode & IB_WC_RECV). */ IB_WC_RECV = 1 << 7, IB_WC_RECV_RDMA_WITH_IMM, IB_WC_DUMMY = -1, /* force enum signed */ }; enum ib_wc_flags { IB_WC_GRH = 1, IB_WC_WITH_IMM = (1<<1), IB_WC_WITH_INVALIDATE = (1<<2), IB_WC_IP_CSUM_OK = (1<<3), IB_WC_WITH_SMAC = (1<<4), IB_WC_WITH_VLAN = (1<<5), IB_WC_WITH_NETWORK_HDR_TYPE = (1<<6), }; struct ib_wc { union { u64 wr_id; struct ib_cqe *wr_cqe; }; enum ib_wc_status status; enum ib_wc_opcode opcode; u32 vendor_err; u32 byte_len; struct ib_qp *qp; union { __be32 imm_data; u32 invalidate_rkey; } ex; u32 src_qp; int wc_flags; u16 pkey_index; u16 slid; u8 sl; u8 dlid_path_bits; u8 port_num; /* valid only for DR SMPs on switches */ u8 smac[ETH_ALEN]; u16 vlan_id; u8 network_hdr_type; }; enum ib_cq_notify_flags { IB_CQ_SOLICITED = 1 << 0, IB_CQ_NEXT_COMP = 1 << 1, IB_CQ_SOLICITED_MASK = IB_CQ_SOLICITED | IB_CQ_NEXT_COMP, IB_CQ_REPORT_MISSED_EVENTS = 1 << 2, }; enum ib_srq_type { IB_SRQT_BASIC, IB_SRQT_XRC }; enum ib_srq_attr_mask { IB_SRQ_MAX_WR = 1 << 0, IB_SRQ_LIMIT = 1 << 1, }; struct ib_srq_attr { u32 max_wr; u32 max_sge; u32 srq_limit; }; struct ib_srq_init_attr { void (*event_handler)(struct ib_event *, void *); void *srq_context; struct ib_srq_attr attr; enum ib_srq_type srq_type; union { struct { struct ib_xrcd *xrcd; struct ib_cq *cq; } xrc; } ext; }; struct ib_qp_cap { u32 max_send_wr; u32 max_recv_wr; u32 max_send_sge; u32 max_recv_sge; u32 max_inline_data; /* * Maximum number of rdma_rw_ctx structures in flight at a time. * ib_create_qp() will calculate the right amount of neededed WRs * and MRs based on this. */ u32 max_rdma_ctxs; }; enum ib_sig_type { IB_SIGNAL_ALL_WR, IB_SIGNAL_REQ_WR }; enum ib_qp_type { /* * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries * here (and in that order) since the MAD layer uses them as * indices into a 2-entry table. */ IB_QPT_SMI, IB_QPT_GSI, IB_QPT_RC, IB_QPT_UC, IB_QPT_UD, IB_QPT_RAW_IPV6, IB_QPT_RAW_ETHERTYPE, IB_QPT_RAW_PACKET = 8, IB_QPT_XRC_INI = 9, IB_QPT_XRC_TGT, IB_QPT_MAX, /* Reserve a range for qp types internal to the low level driver. * These qp types will not be visible at the IB core layer, so the * IB_QPT_MAX usages should not be affected in the core layer */ IB_QPT_RESERVED1 = 0x1000, IB_QPT_RESERVED2, IB_QPT_RESERVED3, IB_QPT_RESERVED4, IB_QPT_RESERVED5, IB_QPT_RESERVED6, IB_QPT_RESERVED7, IB_QPT_RESERVED8, IB_QPT_RESERVED9, IB_QPT_RESERVED10, }; enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, IB_QP_CREATE_MANAGED_SEND = 1 << 3, IB_QP_CREATE_MANAGED_RECV = 1 << 4, IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_SIGNATURE_EN = 1 << 6, IB_QP_CREATE_USE_GFP_NOIO = 1 << 7, IB_QP_CREATE_SCATTER_FCS = 1 << 8, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, }; /* * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler * callback to destroy the passed in QP. */ struct ib_qp_init_attr { void (*event_handler)(struct ib_event *, void *); void *qp_context; struct ib_cq *send_cq; struct ib_cq *recv_cq; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct ib_qp_cap cap; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; enum ib_qp_create_flags create_flags; /* * Only needed for special QP types, or when using the RW API. */ u8 port_num; struct ib_rwq_ind_table *rwq_ind_tbl; }; struct ib_qp_open_attr { void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; enum ib_qp_type qp_type; }; enum ib_rnr_timeout { IB_RNR_TIMER_655_36 = 0, IB_RNR_TIMER_000_01 = 1, IB_RNR_TIMER_000_02 = 2, IB_RNR_TIMER_000_03 = 3, IB_RNR_TIMER_000_04 = 4, IB_RNR_TIMER_000_06 = 5, IB_RNR_TIMER_000_08 = 6, IB_RNR_TIMER_000_12 = 7, IB_RNR_TIMER_000_16 = 8, IB_RNR_TIMER_000_24 = 9, IB_RNR_TIMER_000_32 = 10, IB_RNR_TIMER_000_48 = 11, IB_RNR_TIMER_000_64 = 12, IB_RNR_TIMER_000_96 = 13, IB_RNR_TIMER_001_28 = 14, IB_RNR_TIMER_001_92 = 15, IB_RNR_TIMER_002_56 = 16, IB_RNR_TIMER_003_84 = 17, IB_RNR_TIMER_005_12 = 18, IB_RNR_TIMER_007_68 = 19, IB_RNR_TIMER_010_24 = 20, IB_RNR_TIMER_015_36 = 21, IB_RNR_TIMER_020_48 = 22, IB_RNR_TIMER_030_72 = 23, IB_RNR_TIMER_040_96 = 24, IB_RNR_TIMER_061_44 = 25, IB_RNR_TIMER_081_92 = 26, IB_RNR_TIMER_122_88 = 27, IB_RNR_TIMER_163_84 = 28, IB_RNR_TIMER_245_76 = 29, IB_RNR_TIMER_327_68 = 30, IB_RNR_TIMER_491_52 = 31 }; enum ib_qp_attr_mask { IB_QP_STATE = 1, IB_QP_CUR_STATE = (1<<1), IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2), IB_QP_ACCESS_FLAGS = (1<<3), IB_QP_PKEY_INDEX = (1<<4), IB_QP_PORT = (1<<5), IB_QP_QKEY = (1<<6), IB_QP_AV = (1<<7), IB_QP_PATH_MTU = (1<<8), IB_QP_TIMEOUT = (1<<9), IB_QP_RETRY_CNT = (1<<10), IB_QP_RNR_RETRY = (1<<11), IB_QP_RQ_PSN = (1<<12), IB_QP_MAX_QP_RD_ATOMIC = (1<<13), IB_QP_ALT_PATH = (1<<14), IB_QP_MIN_RNR_TIMER = (1<<15), IB_QP_SQ_PSN = (1<<16), IB_QP_MAX_DEST_RD_ATOMIC = (1<<17), IB_QP_PATH_MIG_STATE = (1<<18), IB_QP_CAP = (1<<19), IB_QP_DEST_QPN = (1<<20), IB_QP_RESERVED1 = (1<<21), IB_QP_RESERVED2 = (1<<22), IB_QP_RESERVED3 = (1<<23), IB_QP_RESERVED4 = (1<<24), }; enum ib_qp_state { IB_QPS_RESET, IB_QPS_INIT, IB_QPS_RTR, IB_QPS_RTS, IB_QPS_SQD, IB_QPS_SQE, IB_QPS_ERR, IB_QPS_DUMMY = -1, /* force enum signed */ }; enum ib_mig_state { IB_MIG_MIGRATED, IB_MIG_REARM, IB_MIG_ARMED }; enum ib_mw_type { IB_MW_TYPE_1 = 1, IB_MW_TYPE_2 = 2 }; struct ib_qp_attr { enum ib_qp_state qp_state; enum ib_qp_state cur_qp_state; enum ib_mtu path_mtu; enum ib_mig_state path_mig_state; u32 qkey; u32 rq_psn; u32 sq_psn; u32 dest_qp_num; int qp_access_flags; struct ib_qp_cap cap; struct ib_ah_attr ah_attr; struct ib_ah_attr alt_ah_attr; u16 pkey_index; u16 alt_pkey_index; u8 en_sqd_async_notify; u8 sq_draining; u8 max_rd_atomic; u8 max_dest_rd_atomic; u8 min_rnr_timer; u8 port_num; u8 timeout; u8 retry_cnt; u8 rnr_retry; u8 alt_port_num; u8 alt_timeout; }; enum ib_wr_opcode { IB_WR_RDMA_WRITE, IB_WR_RDMA_WRITE_WITH_IMM, IB_WR_SEND, IB_WR_SEND_WITH_IMM, IB_WR_RDMA_READ, IB_WR_ATOMIC_CMP_AND_SWP, IB_WR_ATOMIC_FETCH_AND_ADD, IB_WR_LSO, IB_WR_SEND_WITH_INV, IB_WR_RDMA_READ_WITH_INV, IB_WR_LOCAL_INV, IB_WR_REG_MR, IB_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, IB_WR_REG_SIG_MR, /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. */ IB_WR_RESERVED1 = 0xf0, IB_WR_RESERVED2, IB_WR_RESERVED3, IB_WR_RESERVED4, IB_WR_RESERVED5, IB_WR_RESERVED6, IB_WR_RESERVED7, IB_WR_RESERVED8, IB_WR_RESERVED9, IB_WR_RESERVED10, IB_WR_DUMMY = -1, /* force enum signed */ }; enum ib_send_flags { IB_SEND_FENCE = 1, IB_SEND_SIGNALED = (1<<1), IB_SEND_SOLICITED = (1<<2), IB_SEND_INLINE = (1<<3), IB_SEND_IP_CSUM = (1<<4), /* reserve bits 26-31 for low level drivers' internal use */ IB_SEND_RESERVED_START = (1 << 26), IB_SEND_RESERVED_END = (1 << 31), }; struct ib_sge { u64 addr; u32 length; u32 lkey; }; struct ib_cqe { void (*done)(struct ib_cq *cq, struct ib_wc *wc); }; struct ib_send_wr { struct ib_send_wr *next; union { u64 wr_id; struct ib_cqe *wr_cqe; }; struct ib_sge *sg_list; int num_sge; enum ib_wr_opcode opcode; int send_flags; union { __be32 imm_data; u32 invalidate_rkey; } ex; }; struct ib_rdma_wr { struct ib_send_wr wr; u64 remote_addr; u32 rkey; }; static inline struct ib_rdma_wr *rdma_wr(struct ib_send_wr *wr) { return container_of(wr, struct ib_rdma_wr, wr); } struct ib_atomic_wr { struct ib_send_wr wr; u64 remote_addr; u64 compare_add; u64 swap; u64 compare_add_mask; u64 swap_mask; u32 rkey; }; static inline struct ib_atomic_wr *atomic_wr(struct ib_send_wr *wr) { return container_of(wr, struct ib_atomic_wr, wr); } struct ib_ud_wr { struct ib_send_wr wr; struct ib_ah *ah; void *header; int hlen; int mss; u32 remote_qpn; u32 remote_qkey; u16 pkey_index; /* valid for GSI only */ u8 port_num; /* valid for DR SMPs on switch only */ }; static inline struct ib_ud_wr *ud_wr(struct ib_send_wr *wr) { return container_of(wr, struct ib_ud_wr, wr); } struct ib_reg_wr { struct ib_send_wr wr; struct ib_mr *mr; u32 key; int access; }; static inline struct ib_reg_wr *reg_wr(struct ib_send_wr *wr) { return container_of(wr, struct ib_reg_wr, wr); } struct ib_sig_handover_wr { struct ib_send_wr wr; struct ib_sig_attrs *sig_attrs; struct ib_mr *sig_mr; int access_flags; struct ib_sge *prot; }; static inline struct ib_sig_handover_wr *sig_handover_wr(struct ib_send_wr *wr) { return container_of(wr, struct ib_sig_handover_wr, wr); } struct ib_recv_wr { struct ib_recv_wr *next; union { u64 wr_id; struct ib_cqe *wr_cqe; }; struct ib_sge *sg_list; int num_sge; }; enum ib_access_flags { IB_ACCESS_LOCAL_WRITE = 1, IB_ACCESS_REMOTE_WRITE = (1<<1), IB_ACCESS_REMOTE_READ = (1<<2), IB_ACCESS_REMOTE_ATOMIC = (1<<3), IB_ACCESS_MW_BIND = (1<<4), IB_ZERO_BASED = (1<<5), IB_ACCESS_ON_DEMAND = (1<<6), }; /* * XXX: these are apparently used for ->rereg_user_mr, no idea why they * are hidden here instead of a uapi header! */ enum ib_mr_rereg_flags { IB_MR_REREG_TRANS = 1, IB_MR_REREG_PD = (1<<1), IB_MR_REREG_ACCESS = (1<<2), IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1) }; struct ib_fmr_attr { int max_pages; int max_maps; u8 page_shift; }; struct ib_umem; struct ib_ucontext { struct ib_device *device; struct list_head pd_list; struct list_head mr_list; struct list_head mw_list; struct list_head cq_list; struct list_head qp_list; struct list_head srq_list; struct list_head ah_list; struct list_head xrcd_list; struct list_head rule_list; struct list_head wq_list; struct list_head rwq_ind_tbl_list; int closing; pid_t tgid; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct rb_root umem_tree; /* * Protects .umem_rbroot and tree, as well as odp_mrs_count and * mmu notifiers registration. */ struct rw_semaphore umem_rwsem; void (*invalidate_range)(struct ib_umem *umem, unsigned long start, unsigned long end); struct mmu_notifier mn; atomic_t notifier_count; /* A list of umems that don't have private mmu notifier counters yet. */ struct list_head no_private_counters; int odp_mrs_count; #endif }; struct ib_uobject { u64 user_handle; /* handle given to us by userspace */ struct ib_ucontext *context; /* associated user context */ void *object; /* containing object */ struct list_head list; /* link to context's list */ int id; /* index into kernel idr */ struct kref ref; struct rw_semaphore mutex; /* protects .live */ struct rcu_head rcu; /* kfree_rcu() overhead */ int live; }; struct ib_udata { const void __user *inbuf; void __user *outbuf; size_t inlen; size_t outlen; }; struct ib_pd { u32 local_dma_lkey; u32 flags; struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; /* count all resources */ u32 unsafe_global_rkey; /* * Implementation details of the RDMA core, don't use in drivers: */ struct ib_mr *__internal_mr; }; struct ib_xrcd { struct ib_device *device; atomic_t usecnt; /* count all exposed resources */ struct inode *inode; struct mutex tgt_qp_mutex; struct list_head tgt_qp_list; }; struct ib_ah { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; }; typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); enum ib_poll_context { IB_POLL_DIRECT, /* caller context, no hw completions */ IB_POLL_SOFTIRQ, /* poll from softirq context */ IB_POLL_WORKQUEUE, /* poll from workqueue */ }; struct ib_cq { struct ib_device *device; struct ib_uobject *uobject; ib_comp_handler comp_handler; void (*event_handler)(struct ib_event *, void *); void *cq_context; int cqe; atomic_t usecnt; /* count number of work queues */ enum ib_poll_context poll_ctx; struct work_struct work; }; struct ib_srq { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; void (*event_handler)(struct ib_event *, void *); void *srq_context; enum ib_srq_type srq_type; atomic_t usecnt; union { struct { struct ib_xrcd *xrcd; struct ib_cq *cq; u32 srq_num; } xrc; } ext; }; enum ib_wq_type { IB_WQT_RQ }; enum ib_wq_state { IB_WQS_RESET, IB_WQS_RDY, IB_WQS_ERR }; struct ib_wq { struct ib_device *device; struct ib_uobject *uobject; void *wq_context; void (*event_handler)(struct ib_event *, void *); struct ib_pd *pd; struct ib_cq *cq; u32 wq_num; enum ib_wq_state state; enum ib_wq_type wq_type; atomic_t usecnt; }; struct ib_wq_init_attr { void *wq_context; enum ib_wq_type wq_type; u32 max_wr; u32 max_sge; struct ib_cq *cq; void (*event_handler)(struct ib_event *, void *); }; enum ib_wq_attr_mask { IB_WQ_STATE = 1 << 0, IB_WQ_CUR_STATE = 1 << 1, }; struct ib_wq_attr { enum ib_wq_state wq_state; enum ib_wq_state curr_wq_state; }; struct ib_rwq_ind_table { struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; u32 ind_tbl_num; u32 log_ind_tbl_size; struct ib_wq **ind_tbl; }; struct ib_rwq_ind_table_init_attr { u32 log_ind_tbl_size; /* Each entry is a pointer to Receive Work Queue */ struct ib_wq **ind_tbl; }; /* * @max_write_sge: Maximum SGE elements per RDMA WRITE request. * @max_read_sge: Maximum SGE elements per RDMA READ request. */ struct ib_qp { struct ib_device *device; struct ib_pd *pd; struct ib_cq *send_cq; struct ib_cq *recv_cq; spinlock_t mr_lock; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct list_head xrcd_list; /* count times opened, mcast attaches, flow attaches */ atomic_t usecnt; struct list_head open_list; struct ib_qp *real_qp; struct ib_uobject *uobject; void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; u32 max_write_sge; u32 max_read_sge; enum ib_qp_type qp_type; struct ib_rwq_ind_table *rwq_ind_tbl; }; struct ib_mr { struct ib_device *device; struct ib_pd *pd; u32 lkey; u32 rkey; u64 iova; u64 length; unsigned int page_size; bool need_inval; union { struct ib_uobject *uobject; /* user */ struct list_head qp_entry; /* FR */ }; }; struct ib_mw { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; u32 rkey; enum ib_mw_type type; }; struct ib_fmr { struct ib_device *device; struct ib_pd *pd; struct list_head list; u32 lkey; u32 rkey; }; /* Supported steering options */ enum ib_flow_attr_type { /* steering according to rule specifications */ IB_FLOW_ATTR_NORMAL = 0x0, /* default unicast and multicast rule - * receive all Eth traffic which isn't steered to any QP */ IB_FLOW_ATTR_ALL_DEFAULT = 0x1, /* default multicast rule - * receive all Eth multicast traffic which isn't steered to any QP */ IB_FLOW_ATTR_MC_DEFAULT = 0x2, /* sniffer rule - receive all port traffic */ IB_FLOW_ATTR_SNIFFER = 0x3 }; /* Supported steering header types */ enum ib_flow_spec_type { /* L2 headers*/ IB_FLOW_SPEC_ETH = 0x20, IB_FLOW_SPEC_IB = 0x22, /* L3 header*/ IB_FLOW_SPEC_IPV4 = 0x30, IB_FLOW_SPEC_IPV6 = 0x31, /* L4 headers*/ IB_FLOW_SPEC_TCP = 0x40, IB_FLOW_SPEC_UDP = 0x41 }; #define IB_FLOW_SPEC_LAYER_MASK 0xF0 #define IB_FLOW_SPEC_SUPPORT_LAYERS 4 /* Flow steering rule priority is set according to it's domain. * Lower domain value means higher priority. */ enum ib_flow_domain { IB_FLOW_DOMAIN_USER, IB_FLOW_DOMAIN_ETHTOOL, IB_FLOW_DOMAIN_RFS, IB_FLOW_DOMAIN_NIC, IB_FLOW_DOMAIN_NUM /* Must be last */ }; enum ib_flow_flags { IB_FLOW_ATTR_FLAGS_DONT_TRAP = 1UL << 1, /* Continue match, no steal */ IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 2 /* Must be last */ }; struct ib_flow_eth_filter { u8 dst_mac[6]; u8 src_mac[6]; __be16 ether_type; __be16 vlan_tag; /* Must be last */ u8 real_sz[0]; }; struct ib_flow_spec_eth { enum ib_flow_spec_type type; u16 size; struct ib_flow_eth_filter val; struct ib_flow_eth_filter mask; }; struct ib_flow_ib_filter { __be16 dlid; __u8 sl; /* Must be last */ u8 real_sz[0]; }; struct ib_flow_spec_ib { enum ib_flow_spec_type type; u16 size; struct ib_flow_ib_filter val; struct ib_flow_ib_filter mask; }; /* IPv4 header flags */ enum ib_ipv4_flags { IB_IPV4_DONT_FRAG = 0x2, /* Don't enable packet fragmentation */ IB_IPV4_MORE_FRAG = 0X4 /* For All fragmented packets except the last have this flag set */ }; struct ib_flow_ipv4_filter { __be32 src_ip; __be32 dst_ip; u8 proto; u8 tos; u8 ttl; u8 flags; /* Must be last */ u8 real_sz[0]; }; struct ib_flow_spec_ipv4 { enum ib_flow_spec_type type; u16 size; struct ib_flow_ipv4_filter val; struct ib_flow_ipv4_filter mask; }; struct ib_flow_ipv6_filter { u8 src_ip[16]; u8 dst_ip[16]; __be32 flow_label; u8 next_hdr; u8 traffic_class; u8 hop_limit; /* Must be last */ u8 real_sz[0]; }; struct ib_flow_spec_ipv6 { enum ib_flow_spec_type type; u16 size; struct ib_flow_ipv6_filter val; struct ib_flow_ipv6_filter mask; }; struct ib_flow_tcp_udp_filter { __be16 dst_port; __be16 src_port; /* Must be last */ u8 real_sz[0]; }; struct ib_flow_spec_tcp_udp { enum ib_flow_spec_type type; u16 size; struct ib_flow_tcp_udp_filter val; struct ib_flow_tcp_udp_filter mask; }; union ib_flow_spec { struct { enum ib_flow_spec_type type; u16 size; }; struct ib_flow_spec_eth eth; struct ib_flow_spec_ib ib; struct ib_flow_spec_ipv4 ipv4; struct ib_flow_spec_tcp_udp tcp_udp; struct ib_flow_spec_ipv6 ipv6; }; struct ib_flow_attr { enum ib_flow_attr_type type; u16 size; u16 priority; u32 flags; u8 num_of_specs; u8 port; /* Following are the optional layers according to user request * struct ib_flow_spec_xxx * struct ib_flow_spec_yyy */ }; struct ib_flow { struct ib_qp *qp; struct ib_uobject *uobject; }; struct ib_mad_hdr; struct ib_grh; enum ib_process_mad_flags { IB_MAD_IGNORE_MKEY = 1, IB_MAD_IGNORE_BKEY = 2, IB_MAD_IGNORE_ALL = IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY }; enum ib_mad_result { IB_MAD_RESULT_FAILURE = 0, /* (!SUCCESS is the important flag) */ IB_MAD_RESULT_SUCCESS = 1 << 0, /* MAD was successfully processed */ IB_MAD_RESULT_REPLY = 1 << 1, /* Reply packet needs to be sent */ IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */ }; #define IB_DEVICE_NAME_MAX 64 struct ib_cache { rwlock_t lock; struct ib_event_handler event_handler; struct ib_pkey_cache **pkey_cache; struct ib_gid_table **gid_cache; u8 *lmc_cache; }; struct ib_dma_mapping_ops { int (*mapping_error)(struct ib_device *dev, u64 dma_addr); u64 (*map_single)(struct ib_device *dev, void *ptr, size_t size, enum dma_data_direction direction); void (*unmap_single)(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction); u64 (*map_page)(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction); void (*unmap_page)(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction); int (*map_sg)(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction); void (*unmap_sg)(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction); int (*map_sg_attrs)(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *attrs); void (*unmap_sg_attrs)(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *attrs); void (*sync_single_for_cpu)(struct ib_device *dev, u64 dma_handle, size_t size, enum dma_data_direction dir); void (*sync_single_for_device)(struct ib_device *dev, u64 dma_handle, size_t size, enum dma_data_direction dir); void *(*alloc_coherent)(struct ib_device *dev, size_t size, u64 *dma_handle, gfp_t flag); void (*free_coherent)(struct ib_device *dev, size_t size, void *cpu_addr, u64 dma_handle); }; struct iw_cm_verbs; struct ib_port_immutable { int pkey_tbl_len; int gid_tbl_len; u32 core_cap_flags; u32 max_mad_size; }; struct ib_device { struct device *dma_device; char name[IB_DEVICE_NAME_MAX]; struct list_head event_handler_list; spinlock_t event_handler_lock; spinlock_t client_data_lock; struct list_head core_list; /* Access to the client_data_list is protected by the client_data_lock * spinlock and the lists_rwsem read-write semaphore */ struct list_head client_data_list; struct ib_cache cache; /** * port_immutable is indexed by port number */ struct ib_port_immutable *port_immutable; int num_comp_vectors; struct iw_cm_verbs *iwcm; /** * alloc_hw_stats - Allocate a struct rdma_hw_stats and fill in the * driver initialized data. The struct is kfree()'ed by the sysfs * core when the device is removed. A lifespan of -1 in the return * struct tells the core to set a default lifespan. */ struct rdma_hw_stats *(*alloc_hw_stats)(struct ib_device *device, u8 port_num); /** * get_hw_stats - Fill in the counter value(s) in the stats struct. * @index - The index in the value array we wish to have updated, or * num_counters if we want all stats updated * Return codes - * < 0 - Error, no counters updated * index - Updated the single counter pointed to by index * num_counters - Updated all counters (will reset the timestamp * and prevent further calls for lifespan milliseconds) * Drivers are allowed to update all counters in leiu of just the * one given in index at their option */ int (*get_hw_stats)(struct ib_device *device, struct rdma_hw_stats *stats, u8 port, int index); int (*query_device)(struct ib_device *device, struct ib_device_attr *device_attr, struct ib_udata *udata); int (*query_port)(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer (*get_link_layer)(struct ib_device *device, u8 port_num); /* When calling get_netdev, the HW vendor's driver should return the * net device of device @device at port @port_num or NULL if such * a net device doesn't exist. The vendor driver should call dev_hold * on this net device. The HW vendor's device driver must guarantee * that this function returns NULL before the net device reaches * NETDEV_UNREGISTER_FINAL state. */ - struct net_device *(*get_netdev)(struct ib_device *device, + struct ifnet *(*get_netdev)(struct ib_device *device, u8 port_num); int (*query_gid)(struct ib_device *device, u8 port_num, int index, union ib_gid *gid); /* When calling add_gid, the HW vendor's driver should * add the gid of device @device at gid index @index of * port @port_num to be @gid. Meta-info of that gid (for example, * the network device related to this gid is available * at @attr. @context allows the HW vendor driver to store extra * information together with a GID entry. The HW vendor may allocate * memory to contain this information and store it in @context when a * new GID entry is written to. Params are consistent until the next * call of add_gid or delete_gid. The function should return 0 on * success or error otherwise. The function could be called * concurrently for different ports. This function is only called * when roce_gid_table is used. */ int (*add_gid)(struct ib_device *device, u8 port_num, unsigned int index, const union ib_gid *gid, const struct ib_gid_attr *attr, void **context); /* When calling del_gid, the HW vendor's driver should delete the * gid of device @device at gid index @index of port @port_num. * Upon the deletion of a GID entry, the HW vendor must free any * allocated memory. The caller will clear @context afterwards. * This function is only called when roce_gid_table is used. */ int (*del_gid)(struct ib_device *device, u8 port_num, unsigned int index, void **context); int (*query_pkey)(struct ib_device *device, u8 port_num, u16 index, u16 *pkey); int (*modify_device)(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); int (*modify_port)(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify); struct ib_ucontext * (*alloc_ucontext)(struct ib_device *device, struct ib_udata *udata); int (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); struct ib_pd * (*alloc_pd)(struct ib_device *device, struct ib_ucontext *context, struct ib_udata *udata); int (*dealloc_pd)(struct ib_pd *pd); struct ib_ah * (*create_ah)(struct ib_pd *pd, struct ib_ah_attr *ah_attr, struct ib_udata *udata); int (*modify_ah)(struct ib_ah *ah, struct ib_ah_attr *ah_attr); int (*query_ah)(struct ib_ah *ah, struct ib_ah_attr *ah_attr); int (*destroy_ah)(struct ib_ah *ah); struct ib_srq * (*create_srq)(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr, struct ib_udata *udata); int (*modify_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask, struct ib_udata *udata); int (*query_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr); int (*destroy_srq)(struct ib_srq *srq); int (*post_srq_recv)(struct ib_srq *srq, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr); struct ib_qp * (*create_qp)(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata); int (*modify_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_udata *udata); int (*query_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int (*destroy_qp)(struct ib_qp *qp); int (*post_send)(struct ib_qp *qp, struct ib_send_wr *send_wr, struct ib_send_wr **bad_send_wr); int (*post_recv)(struct ib_qp *qp, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr); struct ib_cq * (*create_cq)(struct ib_device *device, const struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*destroy_cq)(struct ib_cq *cq); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); int (*poll_cq)(struct ib_cq *cq, int num_entries, struct ib_wc *wc); int (*peek_cq)(struct ib_cq *cq, int wc_cnt); int (*req_notify_cq)(struct ib_cq *cq, enum ib_cq_notify_flags flags); int (*req_ncomp_notif)(struct ib_cq *cq, int wc_cnt); struct ib_mr * (*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); struct ib_mr * (*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_udata *udata); int (*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_pd *pd, struct ib_udata *udata); int (*dereg_mr)(struct ib_mr *mr); struct ib_mr * (*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); int (*map_mr_sg)(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); struct ib_mw * (*alloc_mw)(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata); int (*dealloc_mw)(struct ib_mw *mw); struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd, int mr_access_flags, struct ib_fmr_attr *fmr_attr); int (*map_phys_fmr)(struct ib_fmr *fmr, u64 *page_list, int list_len, u64 iova); int (*unmap_fmr)(struct list_head *fmr_list); int (*dealloc_fmr)(struct ib_fmr *fmr); int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*process_mad)(struct ib_device *device, int process_mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *in_mad, size_t in_mad_size, struct ib_mad_hdr *out_mad, size_t *out_mad_size, u16 *out_mad_pkey_index); struct ib_xrcd * (*alloc_xrcd)(struct ib_device *device, struct ib_ucontext *ucontext, struct ib_udata *udata); int (*dealloc_xrcd)(struct ib_xrcd *xrcd); struct ib_flow * (*create_flow)(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain); int (*destroy_flow)(struct ib_flow *flow_id); int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); void (*drain_rq)(struct ib_qp *qp); void (*drain_sq)(struct ib_qp *qp); int (*set_vf_link_state)(struct ib_device *device, int vf, u8 port, int state); int (*get_vf_config)(struct ib_device *device, int vf, u8 port, struct ifla_vf_info *ivf); int (*get_vf_stats)(struct ib_device *device, int vf, u8 port, struct ifla_vf_stats *stats); int (*set_vf_guid)(struct ib_device *device, int vf, u8 port, u64 guid, int type); struct ib_wq * (*create_wq)(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata); int (*destroy_wq)(struct ib_wq *wq); int (*modify_wq)(struct ib_wq *wq, struct ib_wq_attr *attr, u32 wq_attr_mask, struct ib_udata *udata); struct ib_rwq_ind_table * (*create_rwq_ind_table)(struct ib_device *device, struct ib_rwq_ind_table_init_attr *init_attr, struct ib_udata *udata); int (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table); struct ib_dma_mapping_ops *dma_ops; struct module *owner; struct device dev; struct kobject *ports_parent; struct list_head port_list; enum { IB_DEV_UNINITIALIZED, IB_DEV_REGISTERED, IB_DEV_UNREGISTERED } reg_state; int uverbs_abi_ver; u64 uverbs_cmd_mask; u64 uverbs_ex_cmd_mask; char node_desc[IB_DEVICE_NODE_DESC_MAX]; __be64 node_guid; u32 local_dma_lkey; u16 is_switch:1; u8 node_type; u8 phys_port_cnt; struct ib_device_attr attrs; struct attribute_group *hw_stats_ag; struct rdma_hw_stats *hw_stats; /** * The following mandatory functions are used only at device * registration. Keep functions such as these at the end of this * structure to avoid cache line misses when accessing struct ib_device * in fast paths. */ int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *); void (*get_dev_fw_str)(struct ib_device *, char *str, size_t str_len); }; struct ib_client { char *name; void (*add) (struct ib_device *); void (*remove)(struct ib_device *, void *client_data); /* Returns the net_dev belonging to this ib_client and matching the * given parameters. * @dev: An RDMA device that the net_dev use for communication. * @port: A physical port number on the RDMA device. * @pkey: P_Key that the net_dev uses if applicable. * @gid: A GID that the net_dev uses to communicate. * @addr: An IP address the net_dev is configured with. * @client_data: The device's client data set by ib_set_client_data(). * * An ib_client that implements a net_dev on top of RDMA devices * (such as IP over IB) should implement this callback, allowing the * rdma_cm module to find the right net_dev for a given request. * * The caller is responsible for calling dev_put on the returned * netdev. */ - struct net_device *(*get_net_dev_by_params)( + struct ifnet *(*get_net_dev_by_params)( struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); struct list_head list; }; struct ib_device *ib_alloc_device(size_t size); void ib_dealloc_device(struct ib_device *device); void ib_get_device_fw_str(struct ib_device *device, char *str, size_t str_len); int ib_register_device(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)); void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); void ib_unregister_client(struct ib_client *client); void *ib_get_client_data(struct ib_device *device, struct ib_client *client); void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; } static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) { return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; } static inline bool ib_is_udata_cleared(struct ib_udata *udata, size_t offset, size_t len) { const void __user *p = (const char __user *)udata->inbuf + offset; bool ret; u8 *buf; if (len > USHRT_MAX) return false; buf = memdup_user(p, len); if (IS_ERR(buf)) return false; ret = !memchr_inv(buf, 0, len); kfree(buf); return ret; } /** * ib_modify_qp_is_ok - Check that the supplied attribute mask * contains all required attributes and no attributes not allowed for * the given QP state transition. * @cur_state: Current QP state * @next_state: Next QP state * @type: QP type * @mask: Mask of supplied QP attributes * @ll : link layer of port * * This function is a helper function that a low-level driver's * modify_qp method can use to validate the consumer's input. It * checks that cur_state and next_state are valid QP states, that a * transition from cur_state to next_state is allowed by the IB spec, * and that the attribute mask supplied is allowed for the transition. */ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, enum ib_qp_type type, enum ib_qp_attr_mask mask, enum rdma_link_layer ll); int ib_register_event_handler (struct ib_event_handler *event_handler); int ib_unregister_event_handler(struct ib_event_handler *event_handler); void ib_dispatch_event(struct ib_event *event); int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num); /** * rdma_cap_ib_switch - Check if the device is IB switch * @device: Device to check * * Device driver is responsible for setting is_switch bit on * in ib_device structure at init time. * * Return: true if the device is IB switch. */ static inline bool rdma_cap_ib_switch(const struct ib_device *device) { return device->is_switch; } /** * rdma_start_port - Return the first valid port number for the device * specified * * @device: Device to be checked * * Return start port number */ static inline u8 rdma_start_port(const struct ib_device *device) { return rdma_cap_ib_switch(device) ? 0 : 1; } /** * rdma_end_port - Return the last valid port number for the device * specified * * @device: Device to be checked * * Return last port number */ static inline u8 rdma_end_port(const struct ib_device *device) { return rdma_cap_ib_switch(device) ? 0 : device->phys_port_cnt; } static inline int rdma_is_port_valid(const struct ib_device *device, unsigned int port) { return (port >= rdma_start_port(device) && port <= rdma_end_port(device)); } static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IB; } static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); } static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; } static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; } static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IWARP; } static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num) { return rdma_protocol_ib(device, port_num) || rdma_protocol_roce(device, port_num); } /** * rdma_cap_ib_mad - Check if the port of a device supports Infiniband * Management Datagrams. * @device: Device to check * @port_num: Port number to check * * Management Datagrams (MAD) are a required part of the InfiniBand * specification and are supported on all InfiniBand devices. A slightly * extended version are also supported on OPA interfaces. * * Return: true if the port supports sending/receiving of MAD packets. */ static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_MAD; } /** * rdma_cap_opa_mad - Check if the port of device provides support for OPA * Management Datagrams. * @device: Device to check * @port_num: Port number to check * * Intel OmniPath devices extend and/or replace the InfiniBand Management * datagrams with their own versions. These OPA MADs share many but not all of * the characteristics of InfiniBand MADs. * * OPA MADs differ in the following ways: * * 1) MADs are variable size up to 2K * IBTA defined MADs remain fixed at 256 bytes * 2) OPA SMPs must carry valid PKeys * 3) OPA SMP packets are a different format * * Return: true if the port supports OPA MAD packet formats. */ static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num) { return (device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_OPA_MAD) == RDMA_CORE_CAP_OPA_MAD; } /** * rdma_cap_ib_smi - Check if the port of a device provides an Infiniband * Subnet Management Agent (SMA) on the Subnet Management Interface (SMI). * @device: Device to check * @port_num: Port number to check * * Each InfiniBand node is required to provide a Subnet Management Agent * that the subnet manager can access. Prior to the fabric being fully * configured by the subnet manager, the SMA is accessed via a well known * interface called the Subnet Management Interface (SMI). This interface * uses directed route packets to communicate with the SM to get around the * chicken and egg problem of the SM needing to know what's on the fabric * in order to configure the fabric, and needing to configure the fabric in * order to send packets to the devices on the fabric. These directed * route packets do not need the fabric fully configured in order to reach * their destination. The SMI is the only method allowed to send * directed route packets on an InfiniBand fabric. * * Return: true if the port provides an SMI. */ static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SMI; } /** * rdma_cap_ib_cm - Check if the port of device has the capability Infiniband * Communication Manager. * @device: Device to check * @port_num: Port number to check * * The InfiniBand Communication Manager is one of many pre-defined General * Service Agents (GSA) that are accessed via the General Service * Interface (GSI). It's role is to facilitate establishment of connections * between nodes as well as other management related tasks for established * connections. * * Return: true if the port supports an IB CM (this does not guarantee that * a CM is actually running however). */ static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_CM; } /** * rdma_cap_iw_cm - Check if the port of device has the capability IWARP * Communication Manager. * @device: Device to check * @port_num: Port number to check * * Similar to above, but specific to iWARP connections which have a different * managment protocol than InfiniBand. * * Return: true if the port supports an iWARP CM (this does not guarantee that * a CM is actually running however). */ static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IW_CM; } /** * rdma_cap_ib_sa - Check if the port of device has the capability Infiniband * Subnet Administration. * @device: Device to check * @port_num: Port number to check * * An InfiniBand Subnet Administration (SA) service is a pre-defined General * Service Agent (GSA) provided by the Subnet Manager (SM). On InfiniBand * fabrics, devices should resolve routes to other hosts by contacting the * SA to query the proper route. * * Return: true if the port should act as a client to the fabric Subnet * Administration interface. This does not imply that the SA service is * running locally. */ static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SA; } /** * rdma_cap_ib_mcast - Check if the port of device has the capability Infiniband * Multicast. * @device: Device to check * @port_num: Port number to check * * InfiniBand multicast registration is more complex than normal IPv4 or * IPv6 multicast registration. Each Host Channel Adapter must register * with the Subnet Manager when it wishes to join a multicast group. It * should do so only once regardless of how many queue pairs it subscribes * to this group. And it should leave the group only after all queue pairs * attached to the group have been detached. * * Return: true if the port must undertake the additional adminstrative * overhead of registering/unregistering with the SM and tracking of the * total number of queue pairs attached to the multicast group. */ static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num) { return rdma_cap_ib_sa(device, port_num); } /** * rdma_cap_af_ib - Check if the port of device has the capability * Native Infiniband Address. * @device: Device to check * @port_num: Port number to check * * InfiniBand addressing uses a port's GUID + Subnet Prefix to make a default * GID. RoCE uses a different mechanism, but still generates a GID via * a prescribed mechanism and port specific data. * * Return: true if the port uses a GID address to identify devices on the * network. */ static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_AF_IB; } /** * rdma_cap_eth_ah - Check if the port of device has the capability * Ethernet Address Handle. * @device: Device to check * @port_num: Port number to check * * RoCE is InfiniBand over Ethernet, and it uses a well defined technique * to fabricate GIDs over Ethernet/IP specific addresses native to the * port. Normally, packet headers are generated by the sending host * adapter, but when sending connectionless datagrams, we must manually * inject the proper headers for the fabric we are communicating over. * * Return: true if we are running as a RoCE port and must force the * addition of a Global Route Header built from our Ethernet Address * Handle into our header list for connectionless packets. */ static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_ETH_AH; } /** * rdma_max_mad_size - Return the max MAD size required by this RDMA Port. * * @device: Device * @port_num: Port number * * This MAD size includes the MAD headers and MAD payload. No other headers * are included. * * Return the max MAD size required by the Port. Will return 0 if the port * does not support MADs */ static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_num) { return device->port_immutable[port_num].max_mad_size; } /** * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table * @device: Device to check * @port_num: Port number to check * * RoCE GID table mechanism manages the various GIDs for a device. * * NOTE: if allocating the port's GID table has failed, this call will still * return true, but any RoCE GID table API will fail. * * Return: true if the port uses RoCE GID table mechanism in order to manage * its GIDs. */ static inline bool rdma_cap_roce_gid_table(const struct ib_device *device, u8 port_num) { return rdma_protocol_roce(device, port_num) && device->add_gid && device->del_gid; } /* * Check if the device supports READ W/ INVALIDATE. */ static inline bool rdma_cap_read_inv(struct ib_device *dev, u32 port_num) { /* * iWarp drivers must support READ W/ INVALIDATE. No other protocol * has support for it yet. */ return rdma_protocol_iwarp(dev, port_num); } int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid, struct ib_gid_attr *attr); int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, int state); int ib_get_vf_config(struct ib_device *device, int vf, u8 port, struct ifla_vf_info *info); int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, struct ifla_vf_stats *stats); int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, int type); int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey); int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify); int ib_find_gid(struct ib_device *device, union ib_gid *gid, - enum ib_gid_type gid_type, struct net_device *ndev, + enum ib_gid_type gid_type, struct ifnet *ndev, u8 *port_num, u16 *index); int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index); enum ib_pd_flags { /* * Create a memory registration for all memory in the system and place * the rkey for it into pd->unsafe_global_rkey. This can be used by * ULPs to avoid the overhead of dynamic MRs. * * This flag is generally considered unsafe and must only be used in * extremly trusted environments. Every use of it will log a warning * in the kernel log. */ IB_PD_UNSAFE_GLOBAL_RKEY = 0x01, }; struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, const char *caller); #define ib_alloc_pd(device, flags) \ __ib_alloc_pd((device), (flags), __func__) void ib_dealloc_pd(struct ib_pd *pd); /** * ib_create_ah - Creates an address handle for the given address vector. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); /** * ib_init_ah_from_wc - Initializes address handle attributes from a * work completion. * @device: Device on which the received message arrived. * @port_num: Port on which the received message arrived. * @wc: Work completion associated with the received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @ah_attr: Returned attributes that can be used when creating an address * handle for replying to the message. */ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct ib_ah_attr *ah_attr); /** * ib_create_ah_from_wc - Creates an address handle associated with the * sender of the specified work completion. * @pd: The protection domain associated with the address handle. * @wc: Work completion information associated with a received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @port_num: The outbound port number to associate with the address. * * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, const struct ib_grh *grh, u8 port_num); /** * ib_modify_ah - Modifies the address vector associated with an address * handle. * @ah: The address handle to modify. * @ah_attr: The new address vector attributes to associate with the * address handle. */ int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); /** * ib_query_ah - Queries the address vector associated with an address * handle. * @ah: The address handle to query. * @ah_attr: The address vector attributes associated with the address * handle. */ int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); /** * ib_destroy_ah - Destroys an address handle. * @ah: The address handle to destroy. */ int ib_destroy_ah(struct ib_ah *ah); /** * ib_create_srq - Creates a SRQ associated with the specified protection * domain. * @pd: The protection domain associated with the SRQ. * @srq_init_attr: A list of initial attributes required to create the * SRQ. If SRQ creation succeeds, then the attributes are updated to * the actual capabilities of the created SRQ. * * srq_attr->max_wr and srq_attr->max_sge are read the determine the * requested size of the SRQ, and set to the actual values allocated * on return. If ib_create_srq() succeeds, then max_wr and max_sge * will always be at least as large as the requested values. */ struct ib_srq *ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr); /** * ib_modify_srq - Modifies the attributes for the specified SRQ. * @srq: The SRQ to modify. * @srq_attr: On input, specifies the SRQ attributes to modify. On output, * the current values of selected SRQ attributes are returned. * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ * are being modified. * * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or * IB_SRQ_LIMIT to set the SRQ's limit and request notification when * the number of receives queued drops below the limit. */ int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask); /** * ib_query_srq - Returns the attribute list and current values for the * specified SRQ. * @srq: The SRQ to query. * @srq_attr: The attributes of the specified SRQ. */ int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); /** * ib_destroy_srq - Destroys the specified SRQ. * @srq: The SRQ to destroy. */ int ib_destroy_srq(struct ib_srq *srq); /** * ib_post_srq_recv - Posts a list of work requests to the specified SRQ. * @srq: The SRQ to post the work request on. * @recv_wr: A list of work requests to post on the receive queue. * @bad_recv_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. */ static inline int ib_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr) { return srq->device->post_srq_recv(srq, recv_wr, bad_recv_wr); } /** * ib_create_qp - Creates a QP associated with the specified protection * domain. * @pd: The protection domain associated with the QP. * @qp_init_attr: A list of initial attributes required to create the * QP. If QP creation succeeds, then the attributes are updated to * the actual capabilities of the created QP. */ struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr); /** * ib_modify_qp - Modifies the attributes for the specified QP and then * transitions the QP to the given state. * @qp: The QP to modify. * @qp_attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @qp_attr_mask: A bit-mask used to specify which attributes of the QP * are being modified. */ int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask); /** * ib_query_qp - Returns the attribute list and current values for the * specified QP. * @qp: The QP to query. * @qp_attr: The attributes of the specified QP. * @qp_attr_mask: A bit-mask used to select specific attributes to query. * @qp_init_attr: Additional attributes of the selected QP. * * The qp_attr_mask may be used to limit the query to gathering only the * selected attributes. */ int ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); /** * ib_destroy_qp - Destroys the specified QP. * @qp: The QP to destroy. */ int ib_destroy_qp(struct ib_qp *qp); /** * ib_open_qp - Obtain a reference to an existing sharable QP. * @xrcd - XRC domain * @qp_open_attr: Attributes identifying the QP to open. * * Returns a reference to a sharable QP. */ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, struct ib_qp_open_attr *qp_open_attr); /** * ib_close_qp - Release an external reference to a QP. * @qp: The QP handle to release * * The opened QP handle is released by the caller. The underlying * shared QP is not destroyed until all internal references are released. */ int ib_close_qp(struct ib_qp *qp); /** * ib_post_send - Posts a list of work requests to the send queue of * the specified QP. * @qp: The QP to post the work request on. * @send_wr: A list of work requests to post on the send queue. * @bad_send_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. * * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate * error is returned, the QP state shall not be affected, * ib_post_send() will return an immediate error after queueing any * earlier work requests in the list. */ static inline int ib_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, struct ib_send_wr **bad_send_wr) { return qp->device->post_send(qp, send_wr, bad_send_wr); } /** * ib_post_recv - Posts a list of work requests to the receive queue of * the specified QP. * @qp: The QP to post the work request on. * @recv_wr: A list of work requests to post on the receive queue. * @bad_recv_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. */ static inline int ib_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr) { return qp->device->post_recv(qp, recv_wr, bad_recv_wr); } struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx); void ib_free_cq(struct ib_cq *cq); /** * ib_create_cq - Creates a CQ on the specified device. * @device: The device on which to create the CQ. * @comp_handler: A user-specified callback that is invoked when a * completion event occurs on the CQ. * @event_handler: A user-specified callback that is invoked when an * asynchronous event not associated with a completion occurs on the CQ. * @cq_context: Context associated with the CQ returned to the user via * the associated completion and event handlers. * @cq_attr: The attributes the CQ should be created upon. * * Users can examine the cq structure to determine the actual CQ size. */ struct ib_cq *ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), void *cq_context, const struct ib_cq_init_attr *cq_attr); /** * ib_resize_cq - Modifies the capacity of the CQ. * @cq: The CQ to resize. * @cqe: The minimum size of the CQ. * * Users can examine the cq structure to determine the actual CQ size. */ int ib_resize_cq(struct ib_cq *cq, int cqe); /** * ib_modify_cq - Modifies moderation params of the CQ * @cq: The CQ to modify. * @cq_count: number of CQEs that will trigger an event * @cq_period: max period of time in usec before triggering an event * */ int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); /** * ib_destroy_cq - Destroys the specified CQ. * @cq: The CQ to destroy. */ int ib_destroy_cq(struct ib_cq *cq); /** * ib_poll_cq - poll a CQ for completion(s) * @cq:the CQ being polled * @num_entries:maximum number of completions to return * @wc:array of at least @num_entries &struct ib_wc where completions * will be returned * * Poll a CQ for (possibly multiple) completions. If the return value * is < 0, an error occurred. If the return value is >= 0, it is the * number of completions returned. If the return value is * non-negative and < num_entries, then the CQ was emptied. */ static inline int ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) { return cq->device->poll_cq(cq, num_entries, wc); } /** * ib_peek_cq - Returns the number of unreaped completions currently * on the specified CQ. * @cq: The CQ to peek. * @wc_cnt: A minimum number of unreaped completions to check for. * * If the number of unreaped completions is greater than or equal to wc_cnt, * this function returns wc_cnt, otherwise, it returns the actual number of * unreaped completions. */ int ib_peek_cq(struct ib_cq *cq, int wc_cnt); /** * ib_req_notify_cq - Request completion notification on a CQ. * @cq: The CQ to generate an event for. * @flags: * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP * to request an event on the next solicited event or next work * completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS * may also be |ed in to request a hint about missed events, as * described below. * * Return Value: * < 0 means an error occurred while requesting notification * == 0 means notification was requested successfully, and if * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events * were missed and it is safe to wait for another event. In * this case is it guaranteed that any work completions added * to the CQ since the last CQ poll will trigger a completion * notification event. * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed * in. It means that the consumer must poll the CQ again to * make sure it is empty to avoid missing an event because of a * race between requesting notification and an entry being * added to the CQ. This return value means it is possible * (but not guaranteed) that a work completion has been added * to the CQ since the last poll without triggering a * completion notification event. */ static inline int ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags) { return cq->device->req_notify_cq(cq, flags); } /** * ib_req_ncomp_notif - Request completion notification when there are * at least the specified number of unreaped completions on the CQ. * @cq: The CQ to generate an event for. * @wc_cnt: The number of unreaped completions that should be on the * CQ before an event is generated. */ static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt) { return cq->device->req_ncomp_notif ? cq->device->req_ncomp_notif(cq, wc_cnt) : -ENOSYS; } /** * ib_dma_mapping_error - check a DMA addr for error * @dev: The device for which the dma_addr was created * @dma_addr: The DMA address to check */ static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr) { if (dev->dma_ops) return dev->dma_ops->mapping_error(dev, dma_addr); return dma_mapping_error(dev->dma_device, dma_addr); } /** * ib_dma_map_single - Map a kernel virtual address to DMA address * @dev: The device for which the dma_addr is to be created * @cpu_addr: The kernel virtual address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline u64 ib_dma_map_single(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction) { if (dev->dma_ops) return dev->dma_ops->map_single(dev, cpu_addr, size, direction); return dma_map_single(dev->dma_device, cpu_addr, size, direction); } /** * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single() * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (dev->dma_ops) dev->dma_ops->unmap_single(dev, addr, size, direction); else dma_unmap_single(dev->dma_device, addr, size, direction); } static inline u64 ib_dma_map_single_attrs(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction, struct dma_attrs *dma_attrs) { return dma_map_single_attrs(dev->dma_device, cpu_addr, size, direction, dma_attrs); } static inline void ib_dma_unmap_single_attrs(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction, struct dma_attrs *dma_attrs) { return dma_unmap_single_attrs(dev->dma_device, addr, size, direction, dma_attrs); } /** * ib_dma_map_page - Map a physical page to DMA address * @dev: The device for which the dma_addr is to be created * @page: The page to be mapped * @offset: The offset within the page * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline u64 ib_dma_map_page(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction) { if (dev->dma_ops) return dev->dma_ops->map_page(dev, page, offset, size, direction); return dma_map_page(dev->dma_device, page, offset, size, direction); } /** * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page() * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (dev->dma_ops) dev->dma_ops->unmap_page(dev, addr, size, direction); else dma_unmap_page(dev->dma_device, addr, size, direction); } /** * ib_dma_map_sg - Map a scatter/gather list to DMA addresses * @dev: The device for which the DMA addresses are to be created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline int ib_dma_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { if (dev->dma_ops) return dev->dma_ops->map_sg(dev, sg, nents, direction); return dma_map_sg(dev->dma_device, sg, nents, direction); } /** * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses * @dev: The device for which the DMA addresses were created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline void ib_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { if (dev->dma_ops) dev->dma_ops->unmap_sg(dev, sg, nents, direction); else dma_unmap_sg(dev->dma_device, sg, nents, direction); } static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *dma_attrs) { if (dev->dma_ops) return dev->dma_ops->map_sg_attrs(dev, sg, nents, direction, dma_attrs); else return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs); } static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *dma_attrs) { if (dev->dma_ops) return dev->dma_ops->unmap_sg_attrs(dev, sg, nents, direction, dma_attrs); else dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs); } /** * ib_sg_dma_address - Return the DMA address from a scatter/gather entry * @dev: The device for which the DMA addresses were created * @sg: The scatter/gather entry * * Note: this function is obsolete. To do: change all occurrences of * ib_sg_dma_address() into sg_dma_address(). */ static inline u64 ib_sg_dma_address(struct ib_device *dev, struct scatterlist *sg) { return sg_dma_address(sg); } /** * ib_sg_dma_len - Return the DMA length from a scatter/gather entry * @dev: The device for which the DMA addresses were created * @sg: The scatter/gather entry * * Note: this function is obsolete. To do: change all occurrences of * ib_sg_dma_len() into sg_dma_len(). */ static inline unsigned int ib_sg_dma_len(struct ib_device *dev, struct scatterlist *sg) { return sg_dma_len(sg); } /** * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @dir: The direction of the DMA */ static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { if (dev->dma_ops) dev->dma_ops->sync_single_for_cpu(dev, addr, size, dir); else dma_sync_single_for_cpu(dev->dma_device, addr, size, dir); } /** * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @dir: The direction of the DMA */ static inline void ib_dma_sync_single_for_device(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { if (dev->dma_ops) dev->dma_ops->sync_single_for_device(dev, addr, size, dir); else dma_sync_single_for_device(dev->dma_device, addr, size, dir); } /** * ib_dma_alloc_coherent - Allocate memory and map it for DMA * @dev: The device for which the DMA address is requested * @size: The size of the region to allocate in bytes * @dma_handle: A pointer for returning the DMA address of the region * @flag: memory allocator flags */ static inline void *ib_dma_alloc_coherent(struct ib_device *dev, size_t size, u64 *dma_handle, gfp_t flag) { if (dev->dma_ops) return dev->dma_ops->alloc_coherent(dev, size, dma_handle, flag); else { dma_addr_t handle; void *ret; ret = dma_alloc_coherent(dev->dma_device, size, &handle, flag); *dma_handle = handle; return ret; } } /** * ib_dma_free_coherent - Free memory allocated by ib_dma_alloc_coherent() * @dev: The device for which the DMA addresses were allocated * @size: The size of the region * @cpu_addr: the address returned by ib_dma_alloc_coherent() * @dma_handle: the DMA address returned by ib_dma_alloc_coherent() */ static inline void ib_dma_free_coherent(struct ib_device *dev, size_t size, void *cpu_addr, u64 dma_handle) { if (dev->dma_ops) dev->dma_ops->free_coherent(dev, size, cpu_addr, dma_handle); else dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle); } /** * ib_dereg_mr - Deregisters a memory region and removes it from the * HCA translation table. * @mr: The memory region to deregister. * * This function can fail, if the memory region has memory windows bound to it. */ int ib_dereg_mr(struct ib_mr *mr); struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); /** * ib_update_fast_reg_key - updates the key portion of the fast_reg MR * R_Key and L_Key. * @mr - struct ib_mr pointer to be updated. * @newkey - new key to be used. */ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) { mr->lkey = (mr->lkey & 0xffffff00) | newkey; mr->rkey = (mr->rkey & 0xffffff00) | newkey; } /** * ib_inc_rkey - increments the key portion of the given rkey. Can be used * for calculating a new rkey for type 2 memory windows. * @rkey - the rkey to increment. */ static inline u32 ib_inc_rkey(u32 rkey) { const u32 mask = 0x000000ff; return ((rkey + 1) & mask) | (rkey & ~mask); } /** * ib_alloc_fmr - Allocates a unmapped fast memory region. * @pd: The protection domain associated with the unmapped region. * @mr_access_flags: Specifies the memory access rights. * @fmr_attr: Attributes of the unmapped region. * * A fast memory region must be mapped before it can be used as part of * a work request. */ struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, int mr_access_flags, struct ib_fmr_attr *fmr_attr); /** * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region. * @fmr: The fast memory region to associate with the pages. * @page_list: An array of physical pages to map to the fast memory region. * @list_len: The number of pages in page_list. * @iova: The I/O virtual address to use with the mapped region. */ static inline int ib_map_phys_fmr(struct ib_fmr *fmr, u64 *page_list, int list_len, u64 iova) { return fmr->device->map_phys_fmr(fmr, page_list, list_len, iova); } /** * ib_unmap_fmr - Removes the mapping from a list of fast memory regions. * @fmr_list: A linked list of fast memory regions to unmap. */ int ib_unmap_fmr(struct list_head *fmr_list); /** * ib_dealloc_fmr - Deallocates a fast memory region. * @fmr: The fast memory region to deallocate. */ int ib_dealloc_fmr(struct ib_fmr *fmr); /** * ib_attach_mcast - Attaches the specified QP to a multicast group. * @qp: QP to attach to the multicast group. The QP must be type * IB_QPT_UD. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. * * In order to send and receive multicast packets, subnet * administration must have created the multicast group and configured * the fabric appropriately. The port associated with the specified * QP must also be a member of the multicast group. */ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); /** * ib_detach_mcast - Detaches the specified QP from a multicast group. * @qp: QP to detach from the multicast group. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. */ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); /** * ib_alloc_xrcd - Allocates an XRC domain. * @device: The device on which to allocate the XRC domain. */ struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device); /** * ib_dealloc_xrcd - Deallocates an XRC domain. * @xrcd: The XRC domain to deallocate. */ int ib_dealloc_xrcd(struct ib_xrcd *xrcd); struct ib_flow *ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain); int ib_destroy_flow(struct ib_flow *flow_id); static inline int ib_check_mr_access(int flags) { /* * Local write permission is required if remote write or * remote atomic permission is also requested. */ if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && !(flags & IB_ACCESS_LOCAL_WRITE)) return -EINVAL; return 0; } /** * ib_check_mr_status: lightweight check of MR status. * This routine may provide status checks on a selected * ib_mr. first use is for signature status check. * * @mr: A memory region. * @check_mask: Bitmask of which checks to perform from * ib_mr_status_check enumeration. * @mr_status: The container of relevant status checks. * failed checks will be indicated in the status bitmask * and the relevant info shall be in the error item. */ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); -struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, +struct ifnet *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr); int ib_destroy_wq(struct ib_wq *wq); int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *attr, u32 wq_attr_mask); struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device, struct ib_rwq_ind_table_init_attr* wq_ind_table_init_attr); int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size); static inline int ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size) { int n; n = ib_map_mr_sg(mr, sg, sg_nents, sg_offset, page_size); mr->iova = 0; return n; } int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, unsigned int *sg_offset, int (*set_page)(struct ib_mr *, u64)); void ib_drain_rq(struct ib_qp *qp); void ib_drain_sq(struct ib_qp *qp); void ib_drain_qp(struct ib_qp *qp); int ib_resolve_eth_dmac(struct ib_device *device, struct ib_ah_attr *ah_attr); #endif /* IB_VERBS_H */