Index: head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h =================================================================== --- head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h (revision 341534) +++ head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h (revision 341535) @@ -1,767 +1,767 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * $FreeBSD$ */ #ifndef _IPOIB_H #define _IPOIB_H #define LINUXKPI_PARAM_PREFIX ipoib_ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ofed.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #endif #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include /* constants */ #define INFINIBAND_ALEN 20 /* Octets in IPoIB HW addr */ #ifdef IPOIB_CM #define CONFIG_INFINIBAND_IPOIB_CM #endif #ifdef IPOIB_DEBUG #define CONFIG_INFINIBAND_IPOIB_DEBUG #define CONFIG_INFINIBAND_IPOIB_DEBUG_DATA #endif enum ipoib_flush_level { IPOIB_FLUSH_LIGHT, IPOIB_FLUSH_NORMAL, IPOIB_FLUSH_HEAVY }; enum { IPOIB_ENCAP_LEN = 4, IPOIB_HEADER_LEN = IPOIB_ENCAP_LEN + INFINIBAND_ALEN, IPOIB_UD_MAX_MTU = 4 * 1024, // IPOIB_UD_RX_SG = (IPOIB_UD_MAX_MTU / MJUMPAGESIZE), IPOIB_UD_RX_SG = 2, IPOIB_UD_TX_SG = (IPOIB_UD_MAX_MTU / MCLBYTES) + 2, IPOIB_CM_MAX_MTU = (64 * 1024), IPOIB_CM_TX_SG = (IPOIB_CM_MAX_MTU / MCLBYTES) + 2, IPOIB_CM_RX_SG = (IPOIB_CM_MAX_MTU / MJUMPAGESIZE), IPOIB_RX_RING_SIZE = 256, IPOIB_TX_RING_SIZE = 128, IPOIB_MAX_RX_SG = MAX(IPOIB_CM_RX_SG, IPOIB_UD_RX_SG), IPOIB_MAX_TX_SG = MAX(IPOIB_CM_TX_SG, IPOIB_UD_TX_SG), IPOIB_MAX_QUEUE_SIZE = 8192, IPOIB_MIN_QUEUE_SIZE = 2, IPOIB_CM_MAX_CONN_QP = 4096, IPOIB_NUM_WC = 4, IPOIB_MAX_PATH_REC_QUEUE = 3, IPOIB_MAX_MCAST_QUEUE = 3, IPOIB_FLAG_OPER_UP = 0, IPOIB_FLAG_INITIALIZED = 1, IPOIB_FLAG_ADMIN_UP = 2, IPOIB_PKEY_ASSIGNED = 3, IPOIB_PKEY_STOP = 4, IPOIB_FLAG_SUBINTERFACE = 5, IPOIB_MCAST_RUN = 6, IPOIB_STOP_REAPER = 7, IPOIB_FLAG_UMCAST = 10, IPOIB_FLAG_CSUM = 11, IPOIB_MAX_BACKOFF_SECONDS = 16, IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ IPOIB_MCAST_FLAG_SENDONLY = 1, IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ IPOIB_MCAST_FLAG_ATTACHED = 3, IPOIB_MAX_LRO_DESCRIPTORS = 8, IPOIB_LRO_MAX_AGGR = 64, MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, }; #define IPOIB_OP_RECV (1ul << 31) #ifdef CONFIG_INFINIBAND_IPOIB_CM #define IPOIB_OP_CM (1ul << 30) #else #define IPOIB_OP_CM (0) #endif /* structs */ struct ipoib_header { u8 hwaddr[INFINIBAND_ALEN]; __be16 proto; u16 reserved; }; struct ipoib_pseudoheader { u8 hwaddr[INFINIBAND_ALEN]; }; /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ struct ipoib_mcast { struct ib_sa_mcmember_rec mcmember; struct ib_sa_multicast *mc; struct ipoib_ah *ah; struct rb_node rb_node; struct list_head list; unsigned long created; unsigned long backoff; unsigned long flags; unsigned char logcount; struct ifqueue pkt_queue; struct ipoib_dev_priv *priv; }; struct ipoib_cm_rx_buf { struct mbuf *mb; u64 mapping[IPOIB_CM_RX_SG]; }; struct ipoib_cm_tx_buf { struct mbuf *mb; u64 mapping[IPOIB_CM_TX_SG]; }; struct ipoib_rx_buf { struct mbuf *mb; u64 mapping[IPOIB_UD_RX_SG]; }; struct ipoib_tx_buf { struct mbuf *mb; u64 mapping[IPOIB_UD_TX_SG]; }; struct ib_cm_id; struct ipoib_cm_data { __be32 qpn; /* High byte MUST be ignored on receive */ __be32 mtu; }; /* * Quoting 10.3.1 Queue Pair and EE Context States: * * Note, for QPs that are associated with an SRQ, the Consumer should take the * QP through the Error State before invoking a Destroy QP or a Modify QP to the * Reset State. The Consumer may invoke the Destroy QP without first performing * a Modify QP to the Error State and waiting for the Affiliated Asynchronous * Last WQE Reached Event. However, if the Consumer does not wait for the * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment * leakage may occur. Therefore, it is good programming practice to tear down a * QP that is associated with an SRQ by using the following process: * * - Put the QP in the Error State * - Wait for the Affiliated Asynchronous Last WQE Reached Event; * - either: * drain the CQ by invoking the Poll CQ verb and either wait for CQ * to be empty or the number of Poll CQ operations has exceeded * CQ capacity size; * - or * post another WR that completes on the same CQ and wait for this * WR to return as a WC; * - and then invoke a Destroy QP or Reset QP. * * We use the second option and wait for a completion on the * same CQ before destroying QPs attached to our SRQ. */ enum ipoib_cm_state { IPOIB_CM_RX_LIVE, IPOIB_CM_RX_ERROR, /* Ignored by stale task */ IPOIB_CM_RX_FLUSH /* Last WQE Reached event observed */ }; struct ipoib_cm_rx { struct ib_cm_id *id; struct ib_qp *qp; struct ipoib_cm_rx_buf *rx_ring; struct list_head list; struct ipoib_dev_priv *priv; unsigned long jiffies; enum ipoib_cm_state state; int recv_count; }; struct ipoib_cm_tx { struct ib_cm_id *id; struct ib_qp *qp; struct list_head list; struct ipoib_dev_priv *priv; struct ipoib_path *path; struct ipoib_cm_tx_buf *tx_ring; unsigned tx_head; unsigned tx_tail; unsigned long flags; u32 mtu; /* remote specified mtu, with grh. */ }; struct ipoib_cm_dev_priv { struct ib_srq *srq; struct ipoib_cm_rx_buf *srq_ring; struct ib_cm_id *id; struct list_head passive_ids; /* state: LIVE */ struct list_head rx_error_list; /* state: ERROR */ struct list_head rx_flush_list; /* state: FLUSH, drain not started */ struct list_head rx_drain_list; /* state: FLUSH, drain started */ struct list_head rx_reap_list; /* state: FLUSH, drain done */ struct work_struct start_task; struct work_struct reap_task; struct work_struct mb_task; struct work_struct rx_reap_task; struct delayed_work stale_task; struct ifqueue mb_queue; struct list_head start_list; struct list_head reap_list; struct ib_sge rx_sge[IPOIB_CM_RX_SG]; struct ib_recv_wr rx_wr; int nonsrq_conn_qp; int max_cm_mtu; /* Actual buf size. */ int num_frags; }; struct ipoib_ethtool_st { u16 coalesce_usecs; u16 max_coalesced_frames; }; /* * Device private locking: network stack tx_lock protects members used * in TX fast path, lock protects everything else. lock nests inside * of tx_lock (ie tx_lock must be acquired first if needed). */ struct ipoib_dev_priv { spinlock_t lock; spinlock_t drain_lock; struct ifnet *dev; u8 broadcastaddr[INFINIBAND_ALEN]; unsigned long flags; int gone; int unit; struct mutex vlan_mutex; struct rb_root path_tree; struct list_head path_list; struct ipoib_mcast *broadcast; struct list_head multicast_list; struct rb_root multicast_tree; struct delayed_work pkey_poll_task; struct delayed_work mcast_task; struct work_struct carrier_on_task; struct work_struct flush_light; struct work_struct flush_normal; struct work_struct flush_heavy; struct work_struct restart_task; struct delayed_work ah_reap_task; struct ib_device *ca; u8 port; u16 pkey; u16 pkey_index; struct ib_pd *pd; struct ib_cq *recv_cq; struct ib_cq *send_cq; struct ib_qp *qp; u32 qkey; union ib_gid local_gid; u16 local_lid; unsigned int admin_mtu; /* User selected MTU, no GRH. */ unsigned int mcast_mtu; /* Minus GRH bytes, from mcast group. */ unsigned int max_ib_mtu; /* Without header, actual buf size. */ struct ipoib_rx_buf *rx_ring; struct ipoib_tx_buf *tx_ring; unsigned tx_head; unsigned tx_tail; struct ib_sge tx_sge[IPOIB_MAX_TX_SG]; struct ib_ud_wr tx_wr; unsigned tx_outstanding; struct ib_wc send_wc[MAX_SEND_CQE]; struct ib_recv_wr rx_wr; struct ib_sge rx_sge[IPOIB_MAX_RX_SG]; struct ib_wc ibwc[IPOIB_NUM_WC]; struct list_head dead_ahs; struct ib_event_handler event_handler; struct ifnet *parent; struct list_head child_intfs; struct list_head list; #ifdef CONFIG_INFINIBAND_IPOIB_CM struct ipoib_cm_dev_priv cm; #endif #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct list_head fs_list; struct dentry *mcg_dentry; struct dentry *path_dentry; #endif int hca_caps; struct ipoib_ethtool_st ethtool; struct timer_list poll_timer; }; struct ipoib_ah { struct ipoib_dev_priv *priv; struct ib_ah *ah; struct list_head list; struct kref ref; unsigned last_send; }; struct ipoib_path { struct ipoib_dev_priv *priv; struct rb_node rb_node; struct list_head list; #ifdef CONFIG_INFINIBAND_IPOIB_CM uint8_t hwaddr[INFINIBAND_ALEN]; struct ipoib_cm_tx *cm; #endif struct ipoib_ah *ah; struct ib_sa_path_rec pathrec; struct ifqueue queue; int query_id; struct ib_sa_query *query; struct completion done; int valid; }; /* UD Only transmits encap len but we want the two sizes to be symmetrical. */ #define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) #define IPOIB_CM_MTU(ib_mtu) (ib_mtu - 0x10) #define IPOIB_IS_MULTICAST(addr) ((addr)[4] == 0xff) extern struct workqueue_struct *ipoib_workqueue; #define IPOIB_MTAP_PROTO(_ifp, _m, _proto) \ do { \ if (bpf_peers_present((_ifp)->if_bpf)) { \ M_ASSERTVALID(_m); \ ipoib_mtap_proto((_ifp), (_m), (_proto)); \ } \ } while (0) /* functions */ void ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto); void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr); void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr); struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *, struct ib_pd *pd, struct ib_ah_attr *attr); void ipoib_free_ah(struct kref *kref); static inline void ipoib_put_ah(struct ipoib_ah *ah) { kref_put(&ah->ref, ipoib_free_ah); } int ipoib_open(struct ipoib_dev_priv *priv); int ipoib_add_pkey_attr(struct ipoib_dev_priv *priv); int ipoib_add_umcast_attr(struct ipoib_dev_priv *priv); void ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto); void ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_ah *address, u32 qpn); void ipoib_reap_ah(struct work_struct *work); void ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv); void ipoib_flush_paths(struct ipoib_dev_priv *priv); struct ipoib_dev_priv *ipoib_intf_alloc(const char *format); int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port); void ipoib_ib_dev_flush_light(struct work_struct *work); void ipoib_ib_dev_flush_normal(struct work_struct *work); void ipoib_ib_dev_flush_heavy(struct work_struct *work); void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv); int ipoib_ib_dev_open(struct ipoib_dev_priv *priv); int ipoib_ib_dev_up(struct ipoib_dev_priv *priv); int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush); int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush); int ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port); void ipoib_dev_cleanup(struct ipoib_dev_priv *priv); void ipoib_mcast_join_task(struct work_struct *work); void ipoib_mcast_carrier_on_task(struct work_struct *work); void ipoib_mcast_send(struct ipoib_dev_priv *priv, void *mgid, struct mbuf *mb); void ipoib_mcast_restart_task(struct work_struct *work); void ipoib_mcast_restart(struct ipoib_dev_priv *); int ipoib_mcast_start_thread(struct ipoib_dev_priv *priv); int ipoib_mcast_stop_thread(struct ipoib_dev_priv *priv, int flush); void ipoib_mcast_dev_down(struct ipoib_dev_priv *priv); void ipoib_mcast_dev_flush(struct ipoib_dev_priv *priv); void ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct ipoib_dev_priv *priv); int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter); void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, union ib_gid *gid, unsigned long *created, unsigned int *queuelen, unsigned int *complete, unsigned int *send_only); struct ipoib_path_iter *ipoib_path_iter_init(struct ipoib_dev_priv *priv); int ipoib_path_iter_next(struct ipoib_path_iter *iter); void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path); #endif -int ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu); +int ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu, bool propagate); int ipoib_mcast_attach(struct ipoib_dev_priv *priv, u16 mlid, union ib_gid *mgid, int set_qkey); int ipoib_init_qp(struct ipoib_dev_priv *priv); int ipoib_transport_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca); void ipoib_transport_dev_cleanup(struct ipoib_dev_priv *priv); void ipoib_event(struct ib_event_handler *handler, struct ib_event *record); void ipoib_pkey_poll(struct work_struct *work); int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv); void ipoib_drain_cq(struct ipoib_dev_priv *priv); int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max); void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req); int ipoib_poll_tx(struct ipoib_dev_priv *priv); void ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req); void ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length); struct mbuf *ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req, int size); void ipoib_set_ethtool_ops(struct ifnet *dev); int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); #ifdef CONFIG_INFINIBAND_IPOIB_CM #define IPOIB_FLAGS_RC 0x80 #define IPOIB_FLAGS_UC 0x40 /* We don't support UC connections at the moment */ #define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC)) extern int ipoib_max_conn_qp; static inline int ipoib_cm_admin_enabled(struct ipoib_dev_priv *priv) { return IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev)); } static inline int ipoib_cm_enabled(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { return IPOIB_CM_SUPPORTED(hwaddr); } static inline int ipoib_cm_up(struct ipoib_path *path) { return test_bit(IPOIB_FLAG_OPER_UP, &path->cm->flags); } static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_path *path) { return path->cm; } static inline void ipoib_cm_set(struct ipoib_path *path, struct ipoib_cm_tx *tx) { path->cm = tx; } static inline int ipoib_cm_has_srq(struct ipoib_dev_priv *priv) { return !!priv->cm.srq; } static inline unsigned int ipoib_cm_max_mtu(struct ipoib_dev_priv *priv) { return priv->cm.max_cm_mtu; } void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx); int ipoib_cm_dev_open(struct ipoib_dev_priv *priv); void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv); int ipoib_cm_dev_init(struct ipoib_dev_priv *priv); int ipoib_cm_add_mode_attr(struct ipoib_dev_priv *priv); void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv); struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv, struct ipoib_path *path); void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx); void ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu); void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc); void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc); #else struct ipoib_cm_tx; #define ipoib_max_conn_qp 0 static inline int ipoib_cm_admin_enabled(struct ipoib_dev_priv *priv) { return 0; } static inline int ipoib_cm_enabled(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { return 0; } static inline int ipoib_cm_up(struct ipoib_path *path) { return 0; } static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_path *path) { return NULL; } static inline void ipoib_cm_set(struct ipoib_path *path, struct ipoib_cm_tx *tx) { } static inline int ipoib_cm_has_srq(struct ipoib_dev_priv *priv) { return 0; } static inline unsigned int ipoib_cm_max_mtu(struct ipoib_dev_priv *priv) { return 0; } static inline void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx) { return; } static inline int ipoib_cm_dev_open(struct ipoib_dev_priv *priv) { return 0; } static inline void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv) { return; } static inline int ipoib_cm_dev_init(struct ipoib_dev_priv *priv) { return -ENOSYS; } static inline void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv) { return; } static inline struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv, struct ipoib_path *path) { return NULL; } static inline void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) { return; } static inline int ipoib_cm_add_mode_attr(struct ipoib_dev_priv *priv) { return 0; } static inline void ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu) { m_freem(mb); } static inline void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { } static inline void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { } #endif #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG void ipoib_create_debug_files(struct ipoib_dev_priv *priv); void ipoib_delete_debug_files(struct ipoib_dev_priv *priv); int ipoib_register_debugfs(void); void ipoib_unregister_debugfs(void); #else static inline void ipoib_create_debug_files(struct ipoib_dev_priv *priv) { } static inline void ipoib_delete_debug_files(struct ipoib_dev_priv *priv) { } static inline int ipoib_register_debugfs(void) { return 0; } static inline void ipoib_unregister_debugfs(void) { } #endif #define ipoib_printk(level, priv, format, arg...) \ printk(level "%s: " format, if_name(((struct ipoib_dev_priv *) priv)->dev), ## arg) #define ipoib_warn(priv, format, arg...) \ ipoib_printk(KERN_WARNING, priv, format , ## arg) extern int ipoib_sendq_size; extern int ipoib_recvq_size; extern struct ib_sa_client ipoib_sa_client; #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG extern int ipoib_debug_level; #define ipoib_dbg(priv, format, arg...) \ do { \ if (ipoib_debug_level > 0) \ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ } while (0) #define ipoib_dbg_mcast(priv, format, arg...) \ do { \ if (mcast_debug_level > 0) \ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ } while (0) #else /* CONFIG_INFINIBAND_IPOIB_DEBUG */ #define ipoib_dbg(priv, format, arg...) \ do { (void) (priv); } while (0) #define ipoib_dbg_mcast(priv, format, arg...) \ do { (void) (priv); } while (0) #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA #define ipoib_dbg_data(priv, format, arg...) \ do { \ if (data_debug_level > 0) \ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ } while (0) #else /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ #define ipoib_dbg_data(priv, format, arg...) \ do { (void) (priv); } while (0) #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ #define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) #endif /* _IPOIB_H */ Index: head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c =================================================================== --- head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c (revision 341534) +++ head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c (revision 341535) @@ -1,1722 +1,1747 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "ipoib.h" static int ipoib_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); #include #include #include #include #include /* For ARPHRD_xxx */ #include #include #include #include MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); int ipoib_sendq_size = IPOIB_TX_RING_SIZE; int ipoib_recvq_size = IPOIB_RX_RING_SIZE; module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level = 1; module_param_named(debug_level, ipoib_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif struct ipoib_path_iter { struct ipoib_dev_priv *priv; struct ipoib_path path; }; static const u8 ipv4_bcast_addr[] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; struct workqueue_struct *ipoib_workqueue; struct ib_sa_client ipoib_sa_client; static void ipoib_add_one(struct ib_device *device); static void ipoib_remove_one(struct ib_device *device, void *client_data); static struct net_device *ipoib_get_net_dev_by_params( struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); static void ipoib_start(struct ifnet *dev); static int ipoib_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data); static void ipoib_input(struct ifnet *ifp, struct mbuf *m); #define IPOIB_MTAP(_ifp, _m) \ do { \ if (bpf_peers_present((_ifp)->if_bpf)) { \ M_ASSERTVALID(_m); \ ipoib_mtap_mb((_ifp), (_m)); \ } \ } while (0) static struct unrhdr *ipoib_unrhdr; static void ipoib_unrhdr_init(void *arg) { ipoib_unrhdr = new_unrhdr(0, 65535, NULL); } SYSINIT(ipoib_unrhdr_init, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_init, NULL); static void ipoib_unrhdr_uninit(void *arg) { if (ipoib_unrhdr != NULL) { struct unrhdr *hdr; hdr = ipoib_unrhdr; ipoib_unrhdr = NULL; delete_unrhdr(hdr); } } SYSUNINIT(ipoib_unrhdr_uninit, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_uninit, NULL); /* * This is for clients that have an ipoib_header in the mbuf. */ static void ipoib_mtap_mb(struct ifnet *ifp, struct mbuf *mb) { struct ipoib_header *ih; struct ether_header eh; ih = mtod(mb, struct ipoib_header *); eh.ether_type = ih->proto; bcopy(ih->hwaddr, &eh.ether_dhost, ETHER_ADDR_LEN); bzero(&eh.ether_shost, ETHER_ADDR_LEN); mb->m_data += sizeof(struct ipoib_header); mb->m_len -= sizeof(struct ipoib_header); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); mb->m_data -= sizeof(struct ipoib_header); mb->m_len += sizeof(struct ipoib_header); } void ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto) { struct ether_header eh; eh.ether_type = proto; bzero(&eh.ether_shost, ETHER_ADDR_LEN); bzero(&eh.ether_dhost, ETHER_ADDR_LEN); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); } static struct ib_client ipoib_client = { .name = "ipoib", .add = ipoib_add_one, .remove = ipoib_remove_one, .get_net_dev_by_params = ipoib_get_net_dev_by_params, }; int ipoib_open(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; ipoib_dbg(priv, "bringing up interface\n"); set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_pkey_dev_delay_open(priv)) return 0; if (ipoib_ib_dev_open(priv)) goto err_disable; if (ipoib_ib_dev_up(priv)) goto err_stop; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(cpriv); mutex_unlock(&priv->vlan_mutex); } dev->if_drv_flags |= IFF_DRV_RUNNING; dev->if_drv_flags &= ~IFF_DRV_OACTIVE; return 0; err_stop: ipoib_ib_dev_stop(priv, 1); err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); return -EINVAL; } static void ipoib_init(void *arg) { struct ifnet *dev; struct ipoib_dev_priv *priv; priv = arg; dev = priv->dev; if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(priv); queue_work(ipoib_workqueue, &priv->flush_light); } static int ipoib_stop(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; ipoib_dbg(priv, "stopping interface\n"); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); ipoib_ib_dev_down(priv, 0); ipoib_ib_dev_stop(priv, 0); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) != 0) ipoib_stop(cpriv); mutex_unlock(&priv->vlan_mutex); } return 0; } +static int +ipoib_propagate_ifnet_mtu(struct ipoib_dev_priv *priv, int new_mtu, + bool propagate) +{ + struct ifnet *ifp; + struct ifreq ifr; + int error; + + ifp = priv->dev; + if (ifp->if_mtu == new_mtu) + return (0); + if (propagate) { + strlcpy(ifr.ifr_name, if_name(ifp), IFNAMSIZ); + ifr.ifr_mtu = new_mtu; + CURVNET_SET(ifp->if_vnet); + error = ifhwioctl(SIOCSIFMTU, ifp, (caddr_t)&ifr, curthread); + CURVNET_RESTORE(); + } else { + ifp->if_mtu = new_mtu; + error = 0; + } + return (error); +} + int -ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu) +ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu, bool propagate) { - struct ifnet *dev = priv->dev; + int error, prev_admin_mtu; /* dev->if_mtu > 2K ==> connected mode */ if (ipoib_cm_admin_enabled(priv)) { if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv))) return -EINVAL; if (new_mtu > priv->mcast_mtu) ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", priv->mcast_mtu); - dev->if_mtu = new_mtu; - return 0; + return (ipoib_propagate_ifnet_mtu(priv, new_mtu, propagate)); } if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) return -EINVAL; + prev_admin_mtu = priv->admin_mtu; priv->admin_mtu = new_mtu; - - dev->if_mtu = min(priv->mcast_mtu, priv->admin_mtu); - - queue_work(ipoib_workqueue, &priv->flush_light); - - return 0; + error = ipoib_propagate_ifnet_mtu(priv, min(priv->mcast_mtu, + priv->admin_mtu), propagate); + if (error == 0) + queue_work(ipoib_workqueue, &priv->flush_light); + else + priv->admin_mtu = prev_admin_mtu; + return (error); } static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ipoib_dev_priv *priv = ifp->if_softc; struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; /* check if detaching */ if (priv == NULL || priv->gone != 0) return (ENXIO); switch (command) { case SIOCSIFFLAGS: if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) error = -ipoib_open(priv); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) ipoib_stop(priv); break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifp->if_drv_flags & IFF_DRV_RUNNING) queue_work(ipoib_workqueue, &priv->restart_task); break; case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0], INFINIBAND_ALEN); break; case SIOCSIFMTU: /* * Set the interface MTU. */ - error = -ipoib_change_mtu(priv, ifr->ifr_mtu); + error = -ipoib_change_mtu(priv, ifr->ifr_mtu, false); break; default: error = EINVAL; break; } return (error); } static struct ipoib_path * __path_find(struct ipoib_dev_priv *priv, void *gid) { struct rb_node *n = priv->path_tree.rb_node; struct ipoib_path *path; int ret; while (n) { path = rb_entry(n, struct ipoib_path, rb_node); ret = memcmp(gid, path->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; else if (ret > 0) n = n->rb_right; else return path; } return NULL; } static int __path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct rb_node **n = &priv->path_tree.rb_node; struct rb_node *pn = NULL; struct ipoib_path *tpath; int ret; while (*n) { pn = *n; tpath = rb_entry(pn, struct ipoib_path, rb_node); ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = &pn->rb_left; else if (ret > 0) n = &pn->rb_right; else return -EEXIST; } rb_link_node(&path->rb_node, pn, n); rb_insert_color(&path->rb_node, &priv->path_tree); list_add_tail(&path->list, &priv->path_list); return 0; } void ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path) { _IF_DRAIN(&path->queue); if (path->ah) ipoib_put_ah(path->ah); if (ipoib_cm_get(path)) ipoib_cm_destroy_tx(ipoib_cm_get(path)); kfree(path); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_path_iter * ipoib_path_iter_init(struct ipoib_dev_priv *priv) { struct ipoib_path_iter *iter; iter = kmalloc(sizeof *iter, GFP_KERNEL); if (!iter) return NULL; iter->priv = priv; memset(iter->path.pathrec.dgid.raw, 0, 16); if (ipoib_path_iter_next(iter)) { kfree(iter); return NULL; } return iter; } int ipoib_path_iter_next(struct ipoib_path_iter *iter) { struct ipoib_dev_priv *priv = iter->priv; struct rb_node *n; struct ipoib_path *path; int ret = 1; spin_lock_irq(&priv->lock); n = rb_first(&priv->path_tree); while (n) { path = rb_entry(n, struct ipoib_path, rb_node); if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, sizeof (union ib_gid)) < 0) { iter->path = *path; ret = 0; break; } n = rb_next(n); } spin_unlock_irq(&priv->lock); return ret; } void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path) { *path = iter->path; } #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ void ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; spin_lock_irq(&priv->lock); list_for_each_entry_safe(path, tp, &priv->path_list, list) { ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n", be16_to_cpu(path->pathrec.dlid), path->pathrec.dgid.raw, ":"); path->valid = 0; } spin_unlock_irq(&priv->lock); } void ipoib_flush_paths(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; LIST_HEAD(remove_list); unsigned long flags; spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->path_list, &remove_list); list_for_each_entry(path, &remove_list, list) rb_erase(&path->rb_node, &priv->path_tree); list_for_each_entry_safe(path, tp, &remove_list, list) { if (path->query) ib_sa_cancel_query(path->query_id, path->query); spin_unlock_irqrestore(&priv->lock, flags); wait_for_completion(&path->done); ipoib_path_free(priv, path); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); } static void path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr) { struct ipoib_path *path = path_ptr; struct ipoib_dev_priv *priv = path->priv; struct ifnet *dev = priv->dev; struct ipoib_ah *ah = NULL; struct ipoib_ah *old_ah = NULL; struct ifqueue mbqueue; struct mbuf *mb; unsigned long flags; if (!status) ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n", be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":"); else ipoib_dbg(priv, "PathRec status %d for GID %16D\n", status, path->pathrec.dgid.raw, ":"); bzero(&mbqueue, sizeof(mbqueue)); if (!status) { struct ib_ah_attr av; if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) ah = ipoib_create_ah(priv, priv->pd, &av); } spin_lock_irqsave(&priv->lock, flags); if (ah) { path->pathrec = *pathrec; old_ah = path->ah; path->ah = ah; ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", ah, be16_to_cpu(pathrec->dlid), pathrec->sl); for (;;) { _IF_DEQUEUE(&path->queue, mb); if (mb == NULL) break; _IF_ENQUEUE(&mbqueue, mb); } #ifdef CONFIG_INFINIBAND_IPOIB_CM if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path)) ipoib_cm_set(path, ipoib_cm_create_tx(priv, path)); #endif path->valid = 1; } path->query = NULL; complete(&path->done); spin_unlock_irqrestore(&priv->lock, flags); if (old_ah) ipoib_put_ah(old_ah); for (;;) { _IF_DEQUEUE(&mbqueue, mb); if (mb == NULL) break; mb->m_pkthdr.rcvif = dev; if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed " "to requeue packet\n"); } } static struct ipoib_path * path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { struct ipoib_path *path; if (!priv->broadcast) return NULL; path = kzalloc(sizeof *path, GFP_ATOMIC); if (!path) return NULL; path->priv = priv; bzero(&path->queue, sizeof(path->queue)); #ifdef CONFIG_INFINIBAND_IPOIB_CM memcpy(&path->hwaddr, hwaddr, INFINIBAND_ALEN); #endif memcpy(path->pathrec.dgid.raw, &hwaddr[4], sizeof (union ib_gid)); path->pathrec.sgid = priv->local_gid; path->pathrec.pkey = cpu_to_be16(priv->pkey); path->pathrec.numb_path = 1; path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; return path; } static int path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct ifnet *dev = priv->dev; ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU; struct ib_sa_path_rec p_rec; p_rec = path->pathrec; p_rec.mtu_selector = IB_SA_GT; switch (roundup_pow_of_two(dev->if_mtu + IPOIB_ENCAP_LEN)) { case 512: p_rec.mtu = IB_MTU_256; break; case 1024: p_rec.mtu = IB_MTU_512; break; case 2048: p_rec.mtu = IB_MTU_1024; break; case 4096: p_rec.mtu = IB_MTU_2048; break; default: /* Wildcard everything */ comp_mask = 0; p_rec.mtu = 0; p_rec.mtu_selector = 0; } ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n", p_rec.dgid.raw, ":", comp_mask ? ib_mtu_enum_to_int(p_rec.mtu) : 0); init_completion(&path->done); path->query_id = ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, &p_rec, comp_mask | IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_TRAFFIC_CLASS | IB_SA_PATH_REC_PKEY, 1000, GFP_ATOMIC, path_rec_completion, path, &path->query); if (path->query_id < 0) { ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); path->query = NULL; complete(&path->done); return path->query_id; } return 0; } static void ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh) { struct ipoib_path *path; path = __path_find(priv, eh->hwaddr + 4); if (!path || !path->valid) { int new_path = 0; if (!path) { path = path_rec_create(priv, eh->hwaddr); new_path = 1; } if (path) { if (_IF_QLEN(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) _IF_ENQUEUE(&path->queue, mb); else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } if (!path->query && path_rec_start(priv, path)) { spin_unlock_irqrestore(&priv->lock, flags); if (new_path) ipoib_path_free(priv, path); return; } else __path_add(priv, path); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } return; } if (ipoib_cm_get(path) && ipoib_cm_up(path)) { ipoib_cm_send(priv, mb, ipoib_cm_get(path)); } else if (path->ah) { ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr)); } else if ((path->query || !path_rec_start(priv, path)) && path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) { _IF_ENQUEUE(&path->queue, mb); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } } static int ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb) { struct ipoib_header *eh; eh = mtod(mb, struct ipoib_header *); if (IPOIB_IS_MULTICAST(eh->hwaddr)) { /* Add in the P_Key for multicast*/ eh->hwaddr[8] = (priv->pkey >> 8) & 0xff; eh->hwaddr[9] = priv->pkey & 0xff; ipoib_mcast_send(priv, eh->hwaddr + 4, mb); } else ipoib_unicast_send(mb, priv, eh); return 0; } static void _ipoib_start(struct ifnet *dev, struct ipoib_dev_priv *priv) { struct mbuf *mb; if ((dev->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; spin_lock(&priv->lock); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd) && (dev->if_drv_flags & IFF_DRV_OACTIVE) == 0) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; IPOIB_MTAP(dev, mb); ipoib_send_one(priv, mb); } spin_unlock(&priv->lock); } static void ipoib_start(struct ifnet *dev) { _ipoib_start(dev, dev->if_softc); } static void ipoib_vlan_start(struct ifnet *dev) { struct ipoib_dev_priv *priv; struct mbuf *mb; priv = VLAN_COOKIE(dev); if (priv != NULL) return _ipoib_start(dev, priv); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd)) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; m_freem(mb); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); } } int ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port) { /* Allocate RX/TX "rings" to hold queued mbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", ca->name, ipoib_recvq_size); goto out; } priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL); if (!priv->tx_ring) { printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring); /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ if (ipoib_ib_dev_init(priv, ca, port)) goto out_tx_ring_cleanup; return 0; out_tx_ring_cleanup: kfree(priv->tx_ring); out_rx_ring_cleanup: kfree(priv->rx_ring); out: return -ENOMEM; } static void ipoib_detach(struct ipoib_dev_priv *priv) { struct ifnet *dev; dev = priv->dev; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { priv->gone = 1; bpfdetach(dev); if_detach(dev); if_free(dev); free_unr(ipoib_unrhdr, priv->unit); } else VLAN_SETCOOKIE(priv->dev, NULL); free(priv, M_TEMP); } void ipoib_dev_cleanup(struct ipoib_dev_priv *priv) { struct ipoib_dev_priv *cpriv, *tcpriv; /* Delete any child interfaces first */ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { ipoib_dev_cleanup(cpriv); ipoib_detach(cpriv); } ipoib_ib_dev_cleanup(priv); kfree(priv->rx_ring); kfree(priv->tx_ring); priv->rx_ring = NULL; priv->tx_ring = NULL; } static struct ipoib_dev_priv * ipoib_priv_alloc(void) { struct ipoib_dev_priv *priv; priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK); spin_lock_init(&priv->lock); spin_lock_init(&priv->drain_lock); mutex_init(&priv->vlan_mutex); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); INIT_LIST_HEAD(&priv->dead_ahs); INIT_LIST_HEAD(&priv->multicast_list); INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); memcpy(priv->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN); return (priv); } struct ipoib_dev_priv * ipoib_intf_alloc(const char *name) { struct ipoib_dev_priv *priv; struct sockaddr_dl *sdl; struct ifnet *dev; priv = ipoib_priv_alloc(); dev = priv->dev = if_alloc(IFT_INFINIBAND); if (!dev) { free(priv, M_TEMP); return NULL; } dev->if_softc = priv; priv->unit = alloc_unr(ipoib_unrhdr); if (priv->unit == -1) { if_free(dev); free(priv, M_TEMP); return NULL; } if_initname(dev, name, priv->unit); dev->if_flags = IFF_BROADCAST | IFF_MULTICAST; dev->if_addrlen = INFINIBAND_ALEN; dev->if_hdrlen = IPOIB_HEADER_LEN; if_attach(dev); dev->if_init = ipoib_init; dev->if_ioctl = ipoib_ioctl; dev->if_start = ipoib_start; dev->if_output = ipoib_output; dev->if_input = ipoib_input; dev->if_resolvemulti = ipoib_resolvemulti; dev->if_baudrate = IF_Gbps(10); dev->if_broadcastaddr = priv->broadcastaddr; dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2; sdl = (struct sockaddr_dl *)dev->if_addr->ifa_addr; sdl->sdl_type = IFT_INFINIBAND; sdl->sdl_alen = dev->if_addrlen; priv->dev = dev; if_link_state_change(dev, LINK_STATE_DOWN); bpfattach(dev, DLT_EN10MB, ETHER_HDR_LEN); return dev->if_softc; } int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) { struct ib_device_attr *device_attr = &hca->attrs; priv->hca_caps = device_attr->device_cap_flags; priv->dev->if_hwassist = 0; priv->dev->if_capabilities = 0; #ifndef CONFIG_INFINIBAND_IPOIB_CM if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { set_bit(IPOIB_FLAG_CSUM, &priv->flags); priv->dev->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP; priv->dev->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM; } #if 0 if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) { priv->dev->if_capabilities |= IFCAP_TSO4; priv->dev->if_hwassist |= CSUM_TSO; } #endif #endif priv->dev->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE; priv->dev->if_capenable = priv->dev->if_capabilities; return 0; } static struct ifnet * ipoib_add_port(const char *format, struct ib_device *hca, u8 port) { struct ipoib_dev_priv *priv; struct ib_port_attr attr; int result = -ENOMEM; priv = ipoib_intf_alloc(format); if (!priv) goto alloc_mem_failed; if (!ib_query_port(hca, port, &attr)) priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); else { printk(KERN_WARNING "%s: ib_query_port %d failed\n", hca->name, port); goto device_init_failed; } /* MTU will be reset when mcast join happens */ priv->dev->if_mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = priv->dev->if_mtu; result = ib_query_pkey(hca, port, 0, &priv->pkey); if (result) { printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", hca->name, port, result); goto device_init_failed; } if (ipoib_set_dev_features(priv, hca)) goto device_init_failed; /* * Set the full membership bit, so that we join the right * broadcast group, etc. */ priv->pkey |= 0x8000; priv->broadcastaddr[8] = priv->pkey >> 8; priv->broadcastaddr[9] = priv->pkey & 0xff; result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL); if (result) { printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", hca->name, port, result); goto device_init_failed; } memcpy(IF_LLADDR(priv->dev) + 4, priv->local_gid.raw, sizeof (union ib_gid)); result = ipoib_dev_init(priv, hca, port); if (result < 0) { printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", hca->name, port, result); goto device_init_failed; } if (ipoib_cm_admin_enabled(priv)) priv->dev->if_mtu = IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)); INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); result = ib_register_event_handler(&priv->event_handler); if (result < 0) { printk(KERN_WARNING "%s: ib_register_event_handler failed for " "port %d (ret = %d)\n", hca->name, port, result); goto event_failed; } if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port); return priv->dev; event_failed: ipoib_dev_cleanup(priv); device_init_failed: ipoib_detach(priv); alloc_mem_failed: return ERR_PTR(result); } static void ipoib_add_one(struct ib_device *device) { struct list_head *dev_list; struct ifnet *dev; struct ipoib_dev_priv *priv; int s, e, p; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) return; dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); if (!dev_list) return; INIT_LIST_HEAD(dev_list); if (device->node_type == RDMA_NODE_IB_SWITCH) { s = 0; e = 0; } else { s = 1; e = device->phys_port_cnt; } for (p = s; p <= e; ++p) { if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) continue; dev = ipoib_add_port("ib", device, p); if (!IS_ERR(dev)) { priv = dev->if_softc; list_add_tail(&priv->list, dev_list); } } ib_set_client_data(device, &ipoib_client, dev_list); } static void ipoib_remove_one(struct ib_device *device, void *client_data) { struct ipoib_dev_priv *priv, *tmp; struct list_head *dev_list = client_data; if (!dev_list) return; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) return; list_for_each_entry_safe(priv, tmp, dev_list, list) { if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND) continue; ipoib_stop(priv); ib_unregister_event_handler(&priv->event_handler); /* dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); */ flush_workqueue(ipoib_workqueue); ipoib_dev_cleanup(priv); ipoib_detach(priv); } kfree(dev_list); } static int ipoib_match_dev_addr(const struct sockaddr *addr, struct net_device *dev) { struct ifaddr *ifa; int retval = 0; CURVNET_SET(dev->if_vnet); IF_ADDR_RLOCK(dev); CK_STAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != addr->sa_family || ifa->ifa_addr->sa_len != addr->sa_len) { continue; } if (memcmp(ifa->ifa_addr, addr, addr->sa_len) == 0) { retval = 1; break; } } IF_ADDR_RUNLOCK(dev); CURVNET_RESTORE(); return (retval); } /* * ipoib_match_gid_pkey_addr - returns the number of IPoIB netdevs on * top a given ipoib device matching a pkey_index and address, if one * exists. * * @found_net_dev: contains a matching net_device if the return value * >= 1, with a reference held. */ static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv, const union ib_gid *gid, u16 pkey_index, const struct sockaddr *addr, struct net_device **found_net_dev) { struct ipoib_dev_priv *child_priv; int matches = 0; if (priv->pkey_index == pkey_index && (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) { if (addr == NULL || ipoib_match_dev_addr(addr, priv->dev) != 0) { if (*found_net_dev == NULL) { struct net_device *net_dev; if (priv->parent != NULL) net_dev = priv->parent; else net_dev = priv->dev; *found_net_dev = net_dev; dev_hold(net_dev); } matches++; } } /* Check child interfaces */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(child_priv, &priv->child_intfs, list) { matches += ipoib_match_gid_pkey_addr(child_priv, gid, pkey_index, addr, found_net_dev); if (matches > 1) break; } mutex_unlock(&priv->vlan_mutex); return matches; } /* * __ipoib_get_net_dev_by_params - returns the number of matching * net_devs found (between 0 and 2). Also return the matching * net_device in the @net_dev parameter, holding a reference to the * net_device, if the number of matches >= 1 */ static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port, u16 pkey_index, const union ib_gid *gid, const struct sockaddr *addr, struct net_device **net_dev) { struct ipoib_dev_priv *priv; int matches = 0; *net_dev = NULL; list_for_each_entry(priv, dev_list, list) { if (priv->port != port) continue; matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index, addr, net_dev); if (matches > 1) break; } return matches; } static struct net_device * ipoib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data) { struct net_device *net_dev; struct list_head *dev_list = client_data; u16 pkey_index; int matches; int ret; if (!rdma_protocol_ib(dev, port)) return NULL; ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index); if (ret) return NULL; if (!dev_list) return NULL; /* See if we can find a unique device matching the L2 parameters */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, NULL, &net_dev); switch (matches) { case 0: return NULL; case 1: return net_dev; } dev_put(net_dev); /* Couldn't find a unique device with L2 parameters only. Use L3 * address to uniquely match the net device */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, addr, &net_dev); switch (matches) { case 0: return NULL; default: dev_warn_ratelimited(&dev->dev, "duplicate IP address detected\n"); /* Fall through */ case 1: return net_dev; } } static void ipoib_config_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct ipoib_dev_priv *parent; struct ipoib_dev_priv *priv; struct ifnet *dev; uint16_t pkey; int error; if (ifp->if_type != IFT_INFINIBAND) return; dev = VLAN_DEVAT(ifp, vtag); if (dev == NULL) return; priv = NULL; error = 0; parent = ifp->if_softc; /* We only support 15 bits of pkey. */ if (vtag & 0x8000) return; pkey = vtag | 0x8000; /* Set full membership bit. */ if (pkey == parent->pkey) return; /* Check for dups */ mutex_lock(&parent->vlan_mutex); list_for_each_entry(priv, &parent->child_intfs, list) { if (priv->pkey == pkey) { priv = NULL; error = EBUSY; goto out; } } priv = ipoib_priv_alloc(); priv->dev = dev; priv->max_ib_mtu = parent->max_ib_mtu; priv->mcast_mtu = priv->admin_mtu = parent->dev->if_mtu; set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); error = ipoib_set_dev_features(priv, parent->ca); if (error) goto out; priv->pkey = pkey; priv->broadcastaddr[8] = pkey >> 8; priv->broadcastaddr[9] = pkey & 0xff; dev->if_broadcastaddr = priv->broadcastaddr; error = ipoib_dev_init(priv, parent->ca, parent->port); if (error) goto out; priv->parent = parent->dev; list_add_tail(&priv->list, &parent->child_intfs); VLAN_SETCOOKIE(dev, priv); dev->if_start = ipoib_vlan_start; dev->if_drv_flags &= ~IFF_DRV_RUNNING; dev->if_hdrlen = IPOIB_HEADER_LEN; if (ifp->if_drv_flags & IFF_DRV_RUNNING) ipoib_open(priv); mutex_unlock(&parent->vlan_mutex); return; out: mutex_unlock(&parent->vlan_mutex); if (priv) free(priv, M_TEMP); if (error) ipoib_warn(parent, "failed to initialize subinterface: device %s, port %d vtag 0x%X", parent->ca->name, parent->port, vtag); return; } static void ipoib_unconfig_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct ipoib_dev_priv *parent; struct ipoib_dev_priv *priv; struct ifnet *dev; uint16_t pkey; if (ifp->if_type != IFT_INFINIBAND) return; dev = VLAN_DEVAT(ifp, vtag); if (dev) VLAN_SETCOOKIE(dev, NULL); pkey = vtag | 0x8000; parent = ifp->if_softc; mutex_lock(&parent->vlan_mutex); list_for_each_entry(priv, &parent->child_intfs, list) { if (priv->pkey == pkey) { ipoib_dev_cleanup(priv); list_del(&priv->list); break; } } mutex_unlock(&parent->vlan_mutex); } eventhandler_tag ipoib_vlan_attach; eventhandler_tag ipoib_vlan_detach; static int __init ipoib_init_module(void) { int ret; ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE)); #ifdef CONFIG_INFINIBAND_IPOIB_CM ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); #endif ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST); ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST); /* * We create our own workqueue mainly because we want to be * able to flush it when devices are being removed. We can't * use schedule_work()/flush_scheduled_work() because both * unregister_netdev() and linkwatch_event take the rtnl lock, * so flush_scheduled_work() can deadlock during device * removal. */ ipoib_workqueue = create_singlethread_workqueue("ipoib"); if (!ipoib_workqueue) { ret = -ENOMEM; goto err_fs; } ib_sa_register_client(&ipoib_sa_client); ret = ib_register_client(&ipoib_client); if (ret) goto err_sa; return 0; err_sa: ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); err_fs: return ret; } static void __exit ipoib_cleanup_module(void) { EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach); EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach); ib_unregister_client(&ipoib_client); ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); } /* * Infiniband output routine. */ static int ipoib_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_char edst[INFINIBAND_ALEN]; #if defined(INET) || defined(INET6) struct llentry *lle = NULL; #endif struct ipoib_header *eh; int error = 0, is_gw = 0; short type; if (ro != NULL) is_gw = (ro->ro_flags & RT_HAS_GW) != 0; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) goto bad; #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) { error = ENETDOWN; goto bad; } if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) { error = ENETDOWN; goto bad; } switch (dst->sa_family) { #ifdef INET case AF_INET: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, lle->ll_addr, sizeof(edst)); else if (m->m_flags & M_MCAST) ip_ib_mc_map(((struct sockaddr_in *)dst)->sin_addr.s_addr, ifp->if_broadcastaddr, edst); else error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); type = htons(ETHERTYPE_IP); break; case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_INFINIBAND); switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: type = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: type = htons(ETHERTYPE_ARP); break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, edst, INFINIBAND_ALEN); else bcopy(ar_tha(ah), edst, INFINIBAND_ALEN); } break; #endif #ifdef INET6 case AF_INET6: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, lle->ll_addr, sizeof(edst)); else if (m->m_flags & M_MCAST) ipv6_ib_mc_map(&((struct sockaddr_in6 *)dst)->sin6_addr, ifp->if_broadcastaddr, edst); else error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return error; type = htons(ETHERTYPE_IPV6); break; #endif default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); error = EAFNOSUPPORT; goto bad; } /* * Add local net header. If no space in first mbuf, * allocate another. */ M_PREPEND(m, IPOIB_HEADER_LEN, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } eh = mtod(m, struct ipoib_header *); (void)memcpy(&eh->proto, &type, sizeof(eh->proto)); (void)memcpy(&eh->hwaddr, edst, sizeof (edst)); /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ return ((ifp->if_transmit)(ifp, m)); bad: if (m != NULL) m_freem(m); return (error); } /* * Upper layer processing for a received Infiniband packet. */ void ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto) { int isr; #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { if_printf(ifp, "discard frame at IFF_MONITOR\n"); m_freem(m); return; } /* * Dispatch frame to upper layer. */ switch (proto) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif default: goto discard; } netisr_dispatch(isr, m); return; discard: m_freem(m); } /* * Process a received Infiniband packet. */ static void ipoib_input(struct ifnet *ifp, struct mbuf *m) { struct ipoib_header *eh; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } CURVNET_SET_QUIET(ifp->if_vnet); /* Let BPF have it before we strip the header. */ IPOIB_MTAP(ifp, m); eh = mtod(m, struct ipoib_header *); /* * Reset layer specific mbuf flags to avoid confusing upper layers. * Strip off Infiniband header. */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); m_adj(m, IPOIB_HEADER_LEN); if (IPOIB_IS_MULTICAST(eh->hwaddr)) { if (memcmp(eh->hwaddr, ifp->if_broadcastaddr, ifp->if_addrlen) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } ipoib_demux(ifp, m, ntohs(eh->proto)); CURVNET_RESTORE(); } static int ipoib_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!IPOIB_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ALEN; e_addr = LLADDR(sdl); ip_ib_mc_map(sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; /* * An IP6 address of 0 means listen to all * of the multicast address used for IP6. * This has no meaning in ipoib. */ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) return EADDRNOTAVAIL; if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ALEN; e_addr = LLADDR(sdl); ipv6_ib_mc_map(&sin6->sin6_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: return EAFNOSUPPORT; } } module_init(ipoib_init_module); module_exit(ipoib_cleanup_module); static int ipoib_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t ipoib_mod = { .name = "ipoib", .evhand = ipoib_evhand, }; DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_DEPEND(ipoib, ibcore, 1, 1, 1); MODULE_DEPEND(ipoib, linuxkpi, 1, 1, 1); Index: head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c =================================================================== --- head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c (revision 341534) +++ head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c (revision 341535) @@ -1,920 +1,921 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "ipoib.h" #include #include #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG static int mcast_debug_level = 1; module_param(mcast_debug_level, int, 0644); MODULE_PARM_DESC(mcast_debug_level, "Enable multicast debug tracing if > 0"); #endif static DEFINE_MUTEX(mcast_mutex); struct ipoib_mcast_iter { struct ipoib_dev_priv *priv; union ib_gid mgid; unsigned long created; unsigned int queuelen; unsigned int complete; unsigned int send_only; }; static void ipoib_mcast_free(struct ipoib_mcast *mcast) { struct ifnet *dev = mcast->priv->dev; int tx_dropped = 0; ipoib_dbg_mcast(mcast->priv, "deleting multicast group %16D\n", mcast->mcmember.mgid.raw, ":"); if (mcast->ah) ipoib_put_ah(mcast->ah); tx_dropped = mcast->pkt_queue.ifq_len; _IF_DRAIN(&mcast->pkt_queue); /* XXX Locking. */ if_inc_counter(dev, IFCOUNTER_OERRORS, tx_dropped); kfree(mcast); } static struct ipoib_mcast *ipoib_mcast_alloc(struct ipoib_dev_priv *priv, int can_sleep) { struct ipoib_mcast *mcast; mcast = kzalloc(sizeof *mcast, can_sleep ? GFP_KERNEL : GFP_ATOMIC); if (!mcast) return NULL; mcast->priv = priv; mcast->created = jiffies; mcast->backoff = 1; INIT_LIST_HEAD(&mcast->list); bzero(&mcast->pkt_queue, sizeof(mcast->pkt_queue)); return mcast; } static struct ipoib_mcast *__ipoib_mcast_find(struct ipoib_dev_priv *priv, void *mgid) { struct rb_node *n = priv->multicast_tree.rb_node; while (n) { struct ipoib_mcast *mcast; int ret; mcast = rb_entry(n, struct ipoib_mcast, rb_node); ret = memcmp(mgid, mcast->mcmember.mgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; else if (ret > 0) n = n->rb_right; else return mcast; } return NULL; } static int __ipoib_mcast_add(struct ipoib_dev_priv *priv, struct ipoib_mcast *mcast) { struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL; while (*n) { struct ipoib_mcast *tmcast; int ret; pn = *n; tmcast = rb_entry(pn, struct ipoib_mcast, rb_node); ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw, sizeof (union ib_gid)); if (ret < 0) n = &pn->rb_left; else if (ret > 0) n = &pn->rb_right; else return -EEXIST; } rb_link_node(&mcast->rb_node, pn, n); rb_insert_color(&mcast->rb_node, &priv->multicast_tree); return 0; } static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, struct ib_sa_mcmember_rec *mcmember) { struct ipoib_dev_priv *priv = mcast->priv; struct ifnet *dev = priv->dev; struct ipoib_ah *ah; int ret; int set_qkey = 0; mcast->mcmember = *mcmember; /* Set the cached Q_Key before we attach if it's the broadcast group */ if (!memcmp(mcast->mcmember.mgid.raw, dev->if_broadcastaddr + 4, sizeof (union ib_gid))) { spin_lock_irq(&priv->lock); if (!priv->broadcast) { spin_unlock_irq(&priv->lock); return -EAGAIN; } priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); spin_unlock_irq(&priv->lock); priv->tx_wr.remote_qkey = priv->qkey; set_qkey = 1; } if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { ipoib_warn(priv, "multicast group %16D already attached\n", mcast->mcmember.mgid.raw, ":"); return 0; } ret = ipoib_mcast_attach(priv, be16_to_cpu(mcast->mcmember.mlid), &mcast->mcmember.mgid, set_qkey); if (ret < 0) { ipoib_warn(priv, "couldn't attach QP to multicast group %16D\n", mcast->mcmember.mgid.raw, ":"); clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags); return ret; } } { struct ib_ah_attr av = { .dlid = be16_to_cpu(mcast->mcmember.mlid), .port_num = priv->port, .sl = mcast->mcmember.sl, .ah_flags = IB_AH_GRH, .static_rate = mcast->mcmember.rate, .grh = { .flow_label = be32_to_cpu(mcast->mcmember.flow_label), .hop_limit = mcast->mcmember.hop_limit, .sgid_index = 0, .traffic_class = mcast->mcmember.traffic_class } }; av.grh.dgid = mcast->mcmember.mgid; ah = ipoib_create_ah(priv, priv->pd, &av); if (!ah) { ipoib_warn(priv, "ib_address_create failed\n"); } else { spin_lock_irq(&priv->lock); mcast->ah = ah; spin_unlock_irq(&priv->lock); ipoib_dbg_mcast(priv, "MGID %16D AV %p, LID 0x%04x, SL %d\n", mcast->mcmember.mgid.raw, ":", mcast->ah->ah, be16_to_cpu(mcast->mcmember.mlid), mcast->mcmember.sl); } } /* actually send any queued packets */ while (mcast->pkt_queue.ifq_len) { struct mbuf *mb; _IF_DEQUEUE(&mcast->pkt_queue, mb); mb->m_pkthdr.rcvif = dev; if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n"); } return 0; } static int ipoib_mcast_sendonly_join_complete(int status, struct ib_sa_multicast *multicast) { struct ipoib_mcast *mcast = multicast->context; struct ipoib_dev_priv *priv = mcast->priv; /* We trap for port events ourselves. */ if (status == -ENETRESET) return 0; if (!status) status = ipoib_mcast_join_finish(mcast, &multicast->rec); if (status) { if (mcast->logcount++ < 20) ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n", mcast->mcmember.mgid.raw, ":", status); /* Flush out any queued packets */ if_inc_counter(priv->dev, IFCOUNTER_OERRORS, mcast->pkt_queue.ifq_len); _IF_DRAIN(&mcast->pkt_queue); /* Clear the busy flag so we try again */ status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); } return status; } static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) { struct ipoib_dev_priv *priv = mcast->priv; struct ib_sa_mcmember_rec rec = { #if 0 /* Some SMs don't support send-only yet */ .join_state = 4 #else .join_state = 1 #endif }; int ret = 0; if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n"); return -ENODEV; } if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n"); return -EBUSY; } rec.mgid = mcast->mcmember.mgid; rec.port_gid = priv->local_gid; rec.pkey = cpu_to_be16(priv->pkey); mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE, GFP_ATOMIC, ipoib_mcast_sendonly_join_complete, mcast); if (IS_ERR(mcast->mc)) { ret = PTR_ERR(mcast->mc); clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", ret); } else { ipoib_dbg_mcast(priv, "no multicast record for %16D, starting join\n", mcast->mcmember.mgid.raw, ":"); } return ret; } void ipoib_mcast_carrier_on_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, carrier_on_task); struct ib_port_attr attr; /* * Take rtnl_lock to avoid racing with ipoib_stop() and * turning the carrier back on while a device is being * removed. */ if (ib_query_port(priv->ca, priv->port, &attr) || attr.state != IB_PORT_ACTIVE) { ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); return; } if_link_state_change(priv->dev, LINK_STATE_UP); } static int ipoib_mcast_join_complete(int status, struct ib_sa_multicast *multicast) { struct ipoib_mcast *mcast = multicast->context; struct ipoib_dev_priv *priv = mcast->priv; ipoib_dbg_mcast(priv, "join completion for %16D (status %d)\n", mcast->mcmember.mgid.raw, ":", status); /* We trap for port events ourselves. */ if (status == -ENETRESET) return 0; if (!status) status = ipoib_mcast_join_finish(mcast, &multicast->rec); if (!status) { mcast->backoff = 1; mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); mutex_unlock(&mcast_mutex); /* * Defer carrier on work to ipoib_workqueue to avoid a * deadlock on rtnl_lock here. */ if (mcast == priv->broadcast) queue_work(ipoib_workqueue, &priv->carrier_on_task); return 0; } if (mcast->logcount++ < 20) { if (status == -ETIMEDOUT || status == -EAGAIN) { ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n", mcast->mcmember.mgid.raw, ":", status); } else { ipoib_warn(priv, "multicast join failed for %16D, status %d\n", mcast->mcmember.mgid.raw, ":", status); } } mcast->backoff *= 2; if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; /* Clear the busy flag so we try again */ status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); mutex_lock(&mcast_mutex); spin_lock_irq(&priv->lock); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, mcast->backoff * HZ); spin_unlock_irq(&priv->lock); mutex_unlock(&mcast_mutex); return status; } static void ipoib_mcast_join(struct ipoib_dev_priv *priv, struct ipoib_mcast *mcast, int create) { struct ib_sa_mcmember_rec rec = { .join_state = 1 }; ib_sa_comp_mask comp_mask; int ret = 0; ipoib_dbg_mcast(priv, "joining MGID %16D\n", mcast->mcmember.mgid.raw, ":"); rec.mgid = mcast->mcmember.mgid; rec.port_gid = priv->local_gid; rec.pkey = cpu_to_be16(priv->pkey); comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE; if (create) { comp_mask |= IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_MTU_SELECTOR | IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS | IB_SA_MCMEMBER_REC_RATE_SELECTOR | IB_SA_MCMEMBER_REC_RATE | IB_SA_MCMEMBER_REC_SL | IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_HOP_LIMIT; rec.qkey = priv->broadcast->mcmember.qkey; rec.mtu_selector = IB_SA_EQ; rec.mtu = priv->broadcast->mcmember.mtu; rec.traffic_class = priv->broadcast->mcmember.traffic_class; rec.rate_selector = IB_SA_EQ; rec.rate = priv->broadcast->mcmember.rate; rec.sl = priv->broadcast->mcmember.sl; rec.flow_label = priv->broadcast->mcmember.flow_label; rec.hop_limit = priv->broadcast->mcmember.hop_limit; } set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, comp_mask, GFP_KERNEL, ipoib_mcast_join_complete, mcast); if (IS_ERR(mcast->mc)) { clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); ret = PTR_ERR(mcast->mc); ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); mcast->backoff *= 2; if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, mcast->backoff * HZ); mutex_unlock(&mcast_mutex); } } void ipoib_mcast_join_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, mcast_task.work); struct ifnet *dev = priv->dev; struct ib_port_attr attr; ipoib_dbg_mcast(priv, "Running join task. flags 0x%lX\n", priv->flags); if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) return; if (ib_query_port(priv->ca, priv->port, &attr) || attr.state != IB_PORT_ACTIVE) { ipoib_dbg(priv, "%s: port state is not ACTIVE (state = %d) suspend task.\n", __func__, attr.state); return; } if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL)) ipoib_warn(priv, "ib_query_gid() failed\n"); else memcpy(IF_LLADDR(dev) + 4, priv->local_gid.raw, sizeof (union ib_gid)); { struct ib_port_attr attr; if (!ib_query_port(priv->ca, priv->port, &attr)) priv->local_lid = attr.lid; else ipoib_warn(priv, "ib_query_port failed\n"); } if (!priv->broadcast) { struct ipoib_mcast *broadcast; if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) return; broadcast = ipoib_mcast_alloc(priv, 1); if (!broadcast) { ipoib_warn(priv, "failed to allocate broadcast group\n"); mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, HZ); mutex_unlock(&mcast_mutex); return; } spin_lock_irq(&priv->lock); memcpy(broadcast->mcmember.mgid.raw, dev->if_broadcastaddr + 4, sizeof (union ib_gid)); priv->broadcast = broadcast; __ipoib_mcast_add(priv, priv->broadcast); spin_unlock_irq(&priv->lock); } if (priv->broadcast && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { if (priv->broadcast && !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) ipoib_mcast_join(priv, priv->broadcast, 0); return; } while (1) { struct ipoib_mcast *mcast = NULL; spin_lock_irq(&priv->lock); list_for_each_entry(mcast, &priv->multicast_list, list) { if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { /* Found the next unjoined group */ break; } } spin_unlock_irq(&priv->lock); if (&mcast->list == &priv->multicast_list) { /* All done */ break; } ipoib_mcast_join(priv, mcast, 1); return; } spin_lock_irq(&priv->lock); if (priv->broadcast) priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); else priv->mcast_mtu = priv->admin_mtu; spin_unlock_irq(&priv->lock); if (!ipoib_cm_admin_enabled(priv)) - ipoib_change_mtu(priv, min(priv->mcast_mtu, priv->admin_mtu)); + ipoib_change_mtu(priv, min(priv->mcast_mtu, priv->admin_mtu), + true); ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); clear_bit(IPOIB_MCAST_RUN, &priv->flags); } int ipoib_mcast_start_thread(struct ipoib_dev_priv *priv) { ipoib_dbg_mcast(priv, "starting multicast thread flags 0x%lX\n", priv->flags); mutex_lock(&mcast_mutex); if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); mutex_unlock(&mcast_mutex); return 0; } int ipoib_mcast_stop_thread(struct ipoib_dev_priv *priv, int flush) { ipoib_dbg_mcast(priv, "stopping multicast thread\n"); mutex_lock(&mcast_mutex); clear_bit(IPOIB_MCAST_RUN, &priv->flags); cancel_delayed_work(&priv->mcast_task); mutex_unlock(&mcast_mutex); if (flush) flush_workqueue(ipoib_workqueue); return 0; } static int ipoib_mcast_leave(struct ipoib_dev_priv *priv, struct ipoib_mcast *mcast) { int ret = 0; if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) ib_sa_free_multicast(mcast->mc); if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { ipoib_dbg_mcast(priv, "leaving MGID %16D\n", mcast->mcmember.mgid.raw, ":"); /* Remove ourselves from the multicast group */ ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid, be16_to_cpu(mcast->mcmember.mlid)); if (ret) ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); } return 0; } void ipoib_mcast_send(struct ipoib_dev_priv *priv, void *mgid, struct mbuf *mb) { struct ifnet *dev = priv->dev; struct ipoib_mcast *mcast; if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags) || !priv->broadcast || !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { if_inc_counter(dev, IFCOUNTER_OERRORS, 1); m_freem(mb); return; } mcast = __ipoib_mcast_find(priv, mgid); if (!mcast) { /* Let's create a new send only group now */ ipoib_dbg_mcast(priv, "setting up send only multicast group for %16D\n", mgid, ":"); mcast = ipoib_mcast_alloc(priv, 0); if (!mcast) { ipoib_warn(priv, "unable to allocate memory for " "multicast structure\n"); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); m_freem(mb); goto out; } set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); __ipoib_mcast_add(priv, mcast); list_add_tail(&mcast->list, &priv->multicast_list); } if (!mcast->ah) { if (mcast->pkt_queue.ifq_len < IPOIB_MAX_MCAST_QUEUE) { _IF_ENQUEUE(&mcast->pkt_queue, mb); } else { if_inc_counter(dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) ipoib_dbg_mcast(priv, "no address vector, " "but multicast join already started\n"); else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) ipoib_mcast_sendonly_join(mcast); /* * If lookup completes between here and out:, don't * want to send packet twice. */ mcast = NULL; } out: if (mcast && mcast->ah) ipoib_send(priv, mb, mcast->ah, IB_MULTICAST_QPN); } void ipoib_mcast_dev_flush(struct ipoib_dev_priv *priv) { LIST_HEAD(remove_list); struct ipoib_mcast *mcast, *tmcast; unsigned long flags; ipoib_dbg_mcast(priv, "flushing multicast list\n"); spin_lock_irqsave(&priv->lock, flags); list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { list_del(&mcast->list); rb_erase(&mcast->rb_node, &priv->multicast_tree); list_add_tail(&mcast->list, &remove_list); } if (priv->broadcast) { rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree); list_add_tail(&priv->broadcast->list, &remove_list); priv->broadcast = NULL; } spin_unlock_irqrestore(&priv->lock, flags); list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(priv, mcast); ipoib_mcast_free(mcast); } } static int ipoib_mcast_addr_is_valid(const u8 *addr, unsigned int addrlen, const u8 *broadcast) { if (addrlen != INFINIBAND_ALEN) return 0; /* reserved QPN, prefix, scope */ if (memcmp(addr, broadcast, 6)) return 0; /* signature lower, pkey */ if (memcmp(addr + 7, broadcast + 7, 3)) return 0; return 1; } void ipoib_mcast_restart_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, restart_task); ipoib_mcast_restart(priv); } void ipoib_mcast_restart(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; struct ifmultiaddr *ifma; struct ipoib_mcast *mcast, *tmcast; LIST_HEAD(remove_list); struct ib_sa_mcmember_rec rec; int addrlen; ipoib_dbg_mcast(priv, "restarting multicast task flags 0x%lX\n", priv->flags); ipoib_mcast_stop_thread(priv, 0); if_maddr_rlock(dev); spin_lock(&priv->lock); /* * Unfortunately, the networking core only gives us a list of all of * the multicast hardware addresses. We need to figure out which ones * are new and which ones have been removed */ /* Clear out the found flag */ list_for_each_entry(mcast, &priv->multicast_list, list) clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); /* Mark all of the entries that are found or don't exist */ CK_STAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) { union ib_gid mgid; uint8_t *addr; if (ifma->ifma_addr->sa_family != AF_LINK) continue; addr = LLADDR((struct sockaddr_dl *)ifma->ifma_addr); addrlen = ((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen; if (!ipoib_mcast_addr_is_valid(addr, addrlen, dev->if_broadcastaddr)) continue; memcpy(mgid.raw, addr + 4, sizeof mgid); mcast = __ipoib_mcast_find(priv, &mgid); if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { struct ipoib_mcast *nmcast; /* ignore group which is directly joined by userspace */ if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) && !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) { ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %16D\n", mgid.raw, ":"); continue; } /* Not found or send-only group, let's add a new entry */ ipoib_dbg_mcast(priv, "adding multicast entry for mgid %16D\n", mgid.raw, ":"); nmcast = ipoib_mcast_alloc(priv, 0); if (!nmcast) { ipoib_warn(priv, "unable to allocate memory for multicast structure\n"); continue; } set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags); nmcast->mcmember.mgid = mgid; if (mcast) { /* Destroy the send only entry */ list_move_tail(&mcast->list, &remove_list); rb_replace_node(&mcast->rb_node, &nmcast->rb_node, &priv->multicast_tree); } else __ipoib_mcast_add(priv, nmcast); list_add_tail(&nmcast->list, &priv->multicast_list); } if (mcast) set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); } /* Remove all of the entries don't exist anymore */ list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) && !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { ipoib_dbg_mcast(priv, "deleting multicast group %16D\n", mcast->mcmember.mgid.raw, ":"); rb_erase(&mcast->rb_node, &priv->multicast_tree); /* Move to the remove list */ list_move_tail(&mcast->list, &remove_list); } } spin_unlock(&priv->lock); if_maddr_runlock(dev); /* We have to cancel outside of the spinlock */ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(mcast->priv, mcast); ipoib_mcast_free(mcast); } if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) ipoib_mcast_start_thread(priv); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct ipoib_dev_priv *priv) { struct ipoib_mcast_iter *iter; iter = kmalloc(sizeof *iter, GFP_KERNEL); if (!iter) return NULL; iter->priv = priv; memset(iter->mgid.raw, 0, 16); if (ipoib_mcast_iter_next(iter)) { kfree(iter); return NULL; } return iter; } int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter) { struct ipoib_dev_priv *priv = iter->priv; struct rb_node *n; struct ipoib_mcast *mcast; int ret = 1; spin_lock_irq(&priv->lock); n = rb_first(&priv->multicast_tree); while (n) { mcast = rb_entry(n, struct ipoib_mcast, rb_node); if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw, sizeof (union ib_gid)) < 0) { iter->mgid = mcast->mcmember.mgid; iter->created = mcast->created; iter->queuelen = mcast->pkt_queue.ifq_len; iter->complete = !!mcast->ah; iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY)); ret = 0; break; } n = rb_next(n); } spin_unlock_irq(&priv->lock); return ret; } void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, union ib_gid *mgid, unsigned long *created, unsigned int *queuelen, unsigned int *complete, unsigned int *send_only) { *mgid = iter->mgid; *created = iter->created; *queuelen = iter->queuelen; *complete = iter->complete; *send_only = iter->send_only; } #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */