Index: head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c
===================================================================
--- head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c	(revision 353545)
+++ head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c	(revision 353546)
@@ -1,1459 +1,1462 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0
 *
 * Copyright (c) 2006 Mellanox Technologies. All rights reserved
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "ipoib.h"

#ifdef CONFIG_INFINIBAND_IPOIB_CM

#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp6.h>

#include <rdma/ib_cm.h>
#include <rdma/ib_cache.h>
#include <linux/delay.h>

int ipoib_max_conn_qp = 128;

module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444);
MODULE_PARM_DESC(max_nonsrq_conn_qp,
		 "Max number of connected-mode QPs per interface "
		 "(applied only if shared receive queue is not available)");

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
static int data_debug_level;

module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
MODULE_PARM_DESC(cm_data_debug_level,
		 "Enable data path debug tracing for connected mode if > 0");
#endif

#define IPOIB_CM_IETF_ID 0x1000000000000000ULL

#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
#define IPOIB_CM_RX_TIMEOUT     (2 * 256 * HZ)
#define IPOIB_CM_RX_DELAY       (3 * 256 * HZ)
#define IPOIB_CM_RX_UPDATE_MASK (0x3)

static struct ib_qp_attr ipoib_cm_err_attr = {
	.qp_state = IB_QPS_ERR
};

#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff

static struct ib_send_wr ipoib_cm_rx_drain_wr = {
	.wr_id = IPOIB_CM_RX_DRAIN_WRID,
	.opcode = IB_WR_SEND,
};

static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
			       struct ib_cm_event *event);

static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv,
				  struct ipoib_cm_rx_buf *rx_req)
{
	ipoib_dma_unmap_rx(priv, (struct ipoib_rx_buf *)rx_req);
}

static int ipoib_cm_post_receive_srq(struct ipoib_dev_priv *priv, int id)
{
	struct ib_recv_wr *bad_wr;
	struct ipoib_rx_buf *rx_req;
	struct mbuf *m;
	int ret;
	int i;

	rx_req = (struct ipoib_rx_buf *)&priv->cm.srq_ring[id];
	for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) {
		priv->cm.rx_sge[i].addr = rx_req->mapping[i];
		priv->cm.rx_sge[i].length = m->m_len;
	}

	priv->cm.rx_wr.num_sge = i;
	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;

	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
	if (unlikely(ret)) {
		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
		ipoib_dma_unmap_rx(priv, rx_req);
		m_freem(priv->cm.srq_ring[id].mb);
		priv->cm.srq_ring[id].mb = NULL;
	}

	return ret;
}
ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret); ipoib_dma_unmap_rx(priv, rx_req); m_freem(priv->cm.srq_ring[id].mb); priv->cm.srq_ring[id].mb = NULL; } return ret; } static int ipoib_cm_post_receive_nonsrq(struct ipoib_dev_priv *priv, struct ipoib_cm_rx *rx, struct ib_recv_wr *wr, struct ib_sge *sge, int id) { struct ipoib_rx_buf *rx_req; struct ib_recv_wr *bad_wr; struct mbuf *m; int ret; int i; rx_req = (struct ipoib_rx_buf *)&rx->rx_ring[id]; for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) { sge[i].addr = rx_req->mapping[i]; sge[i].length = m->m_len; } wr->num_sge = i; wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; ret = ib_post_recv(rx->qp, wr, &bad_wr); if (unlikely(ret)) { ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret); ipoib_dma_unmap_rx(priv, rx_req); m_freem(rx->rx_ring[id].mb); rx->rx_ring[id].mb = NULL; } return ret; } static struct mbuf * ipoib_cm_alloc_rx_mb(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req) { return ipoib_alloc_map_mb(priv, (struct ipoib_rx_buf *)rx_req, priv->cm.max_cm_mtu); } static void ipoib_cm_free_rx_ring(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_ring) { int i; for (i = 0; i < ipoib_recvq_size; ++i) if (rx_ring[i].mb) { ipoib_cm_dma_unmap_rx(priv, &rx_ring[i]); m_freem(rx_ring[i].mb); } kfree(rx_ring); } static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) { struct ib_send_wr *bad_wr; struct ipoib_cm_rx *p; /* We only reserved 1 extra slot in CQ for drain WRs, so * make sure we have at most 1 outstanding WR. */ if (list_empty(&priv->cm.rx_flush_list) || !list_empty(&priv->cm.rx_drain_list)) return; /* * QPs on flush list are error state. This way, a "flush * error" WC will be immediately generated for each WR we post. */ p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list); if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr)) ipoib_warn(priv, "failed to post drain wr\n"); list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list); } static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx) { struct ipoib_cm_rx *p = ctx; struct ipoib_dev_priv *priv = p->priv; unsigned long flags; if (event->event != IB_EVENT_QP_LAST_WQE_REACHED) return; spin_lock_irqsave(&priv->lock, flags); list_move(&p->list, &priv->cm.rx_flush_list); p->state = IPOIB_CM_RX_FLUSH; ipoib_cm_start_rx_drain(priv); spin_unlock_irqrestore(&priv->lock, flags); } static struct ib_qp *ipoib_cm_create_rx_qp(struct ipoib_dev_priv *priv, struct ipoib_cm_rx *p) { struct ib_qp_init_attr attr = { .event_handler = ipoib_cm_rx_event_handler, .send_cq = priv->recv_cq, /* For drain WR */ .recv_cq = priv->recv_cq, .srq = priv->cm.srq, .cap.max_send_wr = 1, /* For drain WR */ .cap.max_send_sge = 1, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, .qp_context = p, }; if (!ipoib_cm_has_srq(priv)) { attr.cap.max_recv_wr = ipoib_recvq_size; attr.cap.max_recv_sge = priv->cm.num_frags; } return ib_create_qp(priv->pd, &attr); } static int ipoib_cm_modify_rx_qp(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id, struct ib_qp *qp, unsigned psn) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret); return ret; } ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret); return ret; } qp_attr.qp_state = IB_QPS_RTR; ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); if 
static int ipoib_cm_modify_rx_qp(struct ipoib_dev_priv *priv,
				 struct ib_cm_id *cm_id, struct ib_qp *qp,
				 unsigned psn)
{
	struct ib_qp_attr qp_attr;
	int qp_attr_mask, ret;

	qp_attr.qp_state = IB_QPS_INIT;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
		return ret;
	}
	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
		return ret;
	}
	qp_attr.qp_state = IB_QPS_RTR;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
		return ret;
	}
	qp_attr.rq_psn = psn;
	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
		return ret;
	}

	/*
	 * Current Mellanox HCA firmware won't generate completions
	 * with error for drain WRs unless the QP has been moved to
	 * RTS first. This work-around leaves a window where a QP has
	 * moved to error asynchronously, but this will eventually get
	 * fixed in firmware, so let's not error out if modify QP
	 * fails.
	 */
	qp_attr.qp_state = IB_QPS_RTS;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
		return 0;
	}
	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
		return 0;
	}

	return 0;
}

static void ipoib_cm_init_rx_wr(struct ipoib_dev_priv *priv,
				struct ib_recv_wr *wr,
				struct ib_sge *sge)
{
	int i;

	for (i = 0; i < IPOIB_CM_RX_SG; i++)
		sge[i].lkey = priv->pd->local_dma_lkey;

	wr->next    = NULL;
	wr->sg_list = sge;
	wr->num_sge = 1;
}

static int ipoib_cm_nonsrq_init_rx(struct ipoib_dev_priv *priv,
    struct ib_cm_id *cm_id, struct ipoib_cm_rx *rx)
{
	struct {
		struct ib_recv_wr wr;
		struct ib_sge sge[IPOIB_CM_RX_SG];
	} *t;
	int ret;
	int i;

	rx->rx_ring = kzalloc(ipoib_recvq_size * sizeof *rx->rx_ring, GFP_KERNEL);
	if (!rx->rx_ring) {
		printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n",
		       priv->ca->name, ipoib_recvq_size);
		return -ENOMEM;
	}

	memset(rx->rx_ring, 0, ipoib_recvq_size * sizeof *rx->rx_ring);

	t = kmalloc(sizeof *t, GFP_KERNEL);
	if (!t) {
		ret = -ENOMEM;
		goto err_free;
	}

	ipoib_cm_init_rx_wr(priv, &t->wr, t->sge);

	spin_lock_irq(&priv->lock);

	if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
		spin_unlock_irq(&priv->lock);
		ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
		ret = -EINVAL;
		goto err_free;
	} else
		++priv->cm.nonsrq_conn_qp;

	spin_unlock_irq(&priv->lock);

	for (i = 0; i < ipoib_recvq_size; ++i) {
		if (!ipoib_cm_alloc_rx_mb(priv, &rx->rx_ring[i])) {
			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
			ret = -ENOMEM;
			goto err_count;
		}
		ret = ipoib_cm_post_receive_nonsrq(priv, rx, &t->wr, t->sge, i);
		if (ret) {
			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
				   "failed for buf %d\n", i);
			ret = -EIO;
			goto err_count;
		}
	}

	rx->recv_count = ipoib_recvq_size;

	kfree(t);

	return 0;

err_count:
	spin_lock_irq(&priv->lock);
	--priv->cm.nonsrq_conn_qp;
	spin_unlock_irq(&priv->lock);

err_free:
	kfree(t);
	ipoib_cm_free_rx_ring(priv, rx->rx_ring);

	return ret;
}

static int ipoib_cm_send_rep(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id,
			     struct ib_qp *qp,
			     struct ib_cm_req_event_param *req,
			     unsigned psn)
{
	struct ipoib_cm_data data = {};
	struct ib_cm_rep_param rep = {};

	data.qpn = cpu_to_be32(priv->qp->qp_num);
	data.mtu = cpu_to_be32(priv->cm.max_cm_mtu);

	rep.private_data = &data;
	rep.private_data_len = sizeof data;
	rep.flow_control = 0;
	rep.rnr_retry_count = req->rnr_retry_count;
	rep.srq = ipoib_cm_has_srq(priv);
	rep.qp_num = qp->qp_num;
	rep.starting_psn = psn;

	return ib_send_cm_rep(cm_id, &rep);
}
static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
{
	struct ipoib_dev_priv *priv = cm_id->context;
	struct ipoib_cm_rx *p;
	unsigned psn;
	int ret;

	ipoib_dbg(priv, "REQ arrived\n");
	p = kzalloc(sizeof *p, GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	p->priv = priv;
	p->id = cm_id;
	cm_id->context = p;
	p->state = IPOIB_CM_RX_LIVE;
	p->jiffies = jiffies;
	INIT_LIST_HEAD(&p->list);

	p->qp = ipoib_cm_create_rx_qp(priv, p);
	if (IS_ERR(p->qp)) {
		ret = PTR_ERR(p->qp);
		goto err_qp;
	}

	psn = random() & 0xffffff;
	ret = ipoib_cm_modify_rx_qp(priv, cm_id, p->qp, psn);
	if (ret)
		goto err_modify;

	if (!ipoib_cm_has_srq(priv)) {
		ret = ipoib_cm_nonsrq_init_rx(priv, cm_id, p);
		if (ret)
			goto err_modify;
	}

	spin_lock_irq(&priv->lock);
	queue_delayed_work(ipoib_workqueue,
			   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
	/* Add this entry to passive ids list head, but do not re-add it
	 * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
	p->jiffies = jiffies;
	if (p->state == IPOIB_CM_RX_LIVE)
		list_move(&p->list, &priv->cm.passive_ids);
	spin_unlock_irq(&priv->lock);

	ret = ipoib_cm_send_rep(priv, cm_id, p->qp, &event->param.req_rcvd, psn);
	if (ret) {
		ipoib_warn(priv, "failed to send REP: %d\n", ret);
		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
			ipoib_warn(priv, "unable to move qp to error state\n");
	}
	return 0;

err_modify:
	ib_destroy_qp(p->qp);
err_qp:
	kfree(p);
	return ret;
}

static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
			       struct ib_cm_event *event)
{
	struct ipoib_cm_rx *p;
	struct ipoib_dev_priv *priv;

	switch (event->event) {
	case IB_CM_REQ_RECEIVED:
		return ipoib_cm_req_handler(cm_id, event);
	case IB_CM_DREQ_RECEIVED:
		p = cm_id->context;
		ib_send_cm_drep(cm_id, NULL, 0);
		/* Fall through */
	case IB_CM_REJ_RECEIVED:
		p = cm_id->context;
		priv = p->priv;
		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
			ipoib_warn(priv, "unable to move qp to error state\n");
		/* Fall through */
	default:
		return 0;
	}
}
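/*
 * Editor's sketch, not part of the driver: receive WRs above pack the
 * ring index and two operation-flag bits into the 64-bit wr_id, and the
 * completion handler below masks the flags back off before using the
 * index.  A minimal stand-alone illustration of that encoding:
 */
static inline u64
example_cm_wrid_encode(unsigned int index)
{
	/* Ring index in the low bits, operation flags above them. */
	return (index | IPOIB_OP_CM | IPOIB_OP_RECV);
}

static inline unsigned int
example_cm_wrid_index(u64 wr_id)
{
	/* Strip the flags to recover the ring index. */
	return (wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV));
}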
void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
{
	struct ipoib_cm_rx_buf saverx;
	struct ipoib_cm_rx_buf *rx_ring;
	unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
	struct ifnet *dev = priv->dev;
	struct mbuf *mb, *newmb;
	struct ipoib_cm_rx *p;
	int has_srq;
	u_short proto;

	CURVNET_SET_QUIET(dev->if_vnet);

	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_recvq_size)) {
		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
			spin_lock(&priv->lock);
			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
			ipoib_cm_start_rx_drain(priv);
			if (priv->cm.id != NULL)
				queue_work(ipoib_workqueue,
				    &priv->cm.rx_reap_task);
			spin_unlock(&priv->lock);
		} else
			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
				   wr_id, ipoib_recvq_size);
		goto done;
	}

	p = wc->qp->qp_context;

	has_srq = ipoib_cm_has_srq(priv);
	rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring;

	mb = rx_ring[wr_id].mb;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		ipoib_dbg(priv, "cm recv error "
			   "(status=%d, wrid=%d vend_err %x)\n",
			   wc->status, wr_id, wc->vendor_err);
		if_inc_counter(dev, IFCOUNTER_IERRORS, 1);
		if (has_srq)
			goto repost;
		else {
			if (!--p->recv_count) {
				spin_lock(&priv->lock);
				list_move(&p->list, &priv->cm.rx_reap_list);
				queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
				spin_unlock(&priv->lock);
			}
			goto done;
		}
	}

	if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
		if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
			p->jiffies = jiffies;
			/* Move this entry to list head, but do not re-add it
			 * if it has been moved out of list. */
			if (p->state == IPOIB_CM_RX_LIVE)
				list_move(&p->list, &priv->cm.passive_ids);
		}
	}

	memcpy(&saverx, &rx_ring[wr_id], sizeof(saverx));
	newmb = ipoib_cm_alloc_rx_mb(priv, &rx_ring[wr_id]);
	if (unlikely(!newmb)) {
		/*
		 * If we can't allocate a new RX buffer, dump
		 * this packet and reuse the old buffer.
		 */
		ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
		if_inc_counter(dev, IFCOUNTER_IERRORS, 1);
		memcpy(&rx_ring[wr_id], &saverx, sizeof(saverx));
		goto repost;
	}

	ipoib_cm_dma_unmap_rx(priv, &saverx);

	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
		       wc->byte_len, wc->slid);

	ipoib_dma_mb(priv, mb, wc->byte_len);

	if_inc_counter(dev, IFCOUNTER_IPACKETS, 1);
	if_inc_counter(dev, IFCOUNTER_IBYTES, mb->m_pkthdr.len);

	mb->m_pkthdr.rcvif = dev;
	proto = *mtod(mb, uint16_t *);
	m_adj(mb, IPOIB_ENCAP_LEN);

	IPOIB_MTAP_PROTO(dev, mb, proto);
	ipoib_demux(dev, mb, ntohs(proto));

repost:
	if (has_srq) {
		if (unlikely(ipoib_cm_post_receive_srq(priv, wr_id)))
			ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
				   "for buf %d\n", wr_id);
	} else {
		if (unlikely(ipoib_cm_post_receive_nonsrq(priv, p,
							  &priv->cm.rx_wr,
							  priv->cm.rx_sge,
							  wr_id))) {
			--p->recv_count;
			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
				   "for buf %d\n", wr_id);
		}
	}
done:
	CURVNET_RESTORE();
	return;
}

static inline int post_send(struct ipoib_dev_priv *priv,
			    struct ipoib_cm_tx *tx,
			    struct ipoib_cm_tx_buf *tx_req,
			    unsigned int wr_id)
{
	struct ib_send_wr *bad_wr;
	struct mbuf *mb = tx_req->mb;
	u64 *mapping = tx_req->mapping;
	struct mbuf *m;
	int i;

	for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
		priv->tx_sge[i].addr = mapping[i];
		priv->tx_sge[i].length = m->m_len;
	}
	priv->tx_wr.wr.num_sge = i;
	priv->tx_wr.wr.wr_id = wr_id | IPOIB_OP_CM;
	priv->tx_wr.wr.opcode = IB_WR_SEND;

	return ib_post_send(tx->qp, &priv->tx_wr.wr, &bad_wr);
}

void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx)
{
	struct ipoib_cm_tx_buf *tx_req;
	struct ifnet *dev = priv->dev;

	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) {
		while (ipoib_poll_tx(priv, false))
			; /* nothing */
	}

	m_adj(mb, sizeof(struct ipoib_pseudoheader));
	if (unlikely(mb->m_pkthdr.len > tx->mtu)) {
		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
			   mb->m_pkthdr.len, tx->mtu);
		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
		ipoib_cm_mb_too_long(priv, mb, IPOIB_CM_MTU(tx->mtu));
		return;
	}

	ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
		       tx->tx_head, mb->m_pkthdr.len, tx->qp->qp_num);

	/*
	 * We put the mb into the tx_ring _before_ we call post_send()
	 * because it's entirely possible that the completion handler will
	 * run before we execute anything after the post_send().  That
	 * means we have to make sure everything is properly recorded and
	 * our state is consistent before we call post_send().
	 */
	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
	tx_req->mb = mb;
	if (unlikely(ipoib_dma_map_tx(priv->ca, (struct ipoib_tx_buf *)tx_req,
	    priv->cm.num_frags))) {
		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
		if (tx_req->mb)
			m_freem(tx_req->mb);
		return;
	}

	if (unlikely(post_send(priv, tx, tx_req,
	    tx->tx_head & (ipoib_sendq_size - 1)))) {
		ipoib_warn(priv, "post_send failed\n");
		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
		ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req);
		m_freem(mb);
	} else {
		++tx->tx_head;

		if (++priv->tx_outstanding == ipoib_sendq_size) {
			ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
				  tx->qp->qp_num);
			if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
				ipoib_warn(priv, "request notify on send CQ failed\n");
			dev->if_drv_flags |= IFF_DRV_OACTIVE;
		}
	}
}
void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
{
	struct ipoib_cm_tx *tx = wc->qp->qp_context;
	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
	struct ifnet *dev = priv->dev;
	struct ipoib_cm_tx_buf *tx_req;

	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_sendq_size)) {
		ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_sendq_size);
		return;
	}

	tx_req = &tx->tx_ring[wr_id];

	ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req);

	/* FIXME: is this right? Shouldn't we only increment on success? */
	if_inc_counter(dev, IFCOUNTER_OPACKETS, 1);

	m_freem(tx_req->mb);

	++tx->tx_tail;
	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
	    (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 &&
	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
		dev->if_drv_flags &= ~IFF_DRV_OACTIVE;

	if (wc->status != IB_WC_SUCCESS &&
	    wc->status != IB_WC_WR_FLUSH_ERR) {
		struct ipoib_path *path;

		ipoib_dbg(priv, "failed cm send event "
			   "(status=%d, wrid=%d vend_err %x)\n",
			   wc->status, wr_id, wc->vendor_err);

		path = tx->path;

		if (path) {
			path->cm = NULL;
			rb_erase(&path->rb_node, &priv->path_tree);
			list_del(&path->list);
		}

		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
			list_move(&tx->list, &priv->cm.reap_list);
			queue_work(ipoib_workqueue, &priv->cm.reap_task);
		}

		clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
	}
}
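/*
 * Editor's sketch, not part of the driver: tx_head and tx_tail above are
 * free-running counters; a slot is selected with
 * "counter & (ipoib_sendq_size - 1)", which is correct only because the
 * ring size is a power of two, and "(int)tail - (int)head < 0" tests for
 * outstanding sends without ever resetting the counters.
 */
static inline unsigned int
example_ring_slot(unsigned int counter, unsigned int ring_size)
{
	/* Cheap "counter % ring_size" for power-of-two ring sizes. */
	return (counter & (ring_size - 1));
}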
int ipoib_cm_dev_open(struct ipoib_dev_priv *priv)
{
	int ret;

	if (!IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev)))
		return 0;

	priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, priv);
	if (IS_ERR(priv->cm.id)) {
		printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
		ret = PTR_ERR(priv->cm.id);
		goto err_cm;
	}

	ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), 0);
	if (ret) {
		printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
		       IPOIB_CM_IETF_ID | priv->qp->qp_num);
		goto err_listen;
	}

	return 0;

err_listen:
	ib_destroy_cm_id(priv->cm.id);
err_cm:
	priv->cm.id = NULL;
	return ret;
}

static void ipoib_cm_free_rx_reap_list(struct ipoib_dev_priv *priv)
{
	struct ipoib_cm_rx *rx, *n;
	LIST_HEAD(list);

	spin_lock_irq(&priv->lock);
	list_splice_init(&priv->cm.rx_reap_list, &list);
	spin_unlock_irq(&priv->lock);

	list_for_each_entry_safe(rx, n, &list, list) {
		ib_destroy_cm_id(rx->id);
		ib_destroy_qp(rx->qp);
		if (!ipoib_cm_has_srq(priv)) {
			ipoib_cm_free_rx_ring(priv, rx->rx_ring);
			spin_lock_irq(&priv->lock);
			--priv->cm.nonsrq_conn_qp;
			spin_unlock_irq(&priv->lock);
		}
		kfree(rx);
	}
}

void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv)
{
	struct ipoib_cm_rx *p;
	unsigned long begin;
	int ret;

	if (!IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev)) || !priv->cm.id)
		return;

	ib_destroy_cm_id(priv->cm.id);
	priv->cm.id = NULL;

	cancel_work_sync(&priv->cm.rx_reap_task);

	spin_lock_irq(&priv->lock);
	while (!list_empty(&priv->cm.passive_ids)) {
		p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
		list_move(&p->list, &priv->cm.rx_error_list);
		p->state = IPOIB_CM_RX_ERROR;
		spin_unlock_irq(&priv->lock);
		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
		if (ret)
			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
		spin_lock_irq(&priv->lock);
	}

	/* Wait for all RX to be drained */
	begin = jiffies;

	while (!list_empty(&priv->cm.rx_error_list) ||
	       !list_empty(&priv->cm.rx_flush_list) ||
	       !list_empty(&priv->cm.rx_drain_list)) {
		if (time_after(jiffies, begin + 5 * HZ)) {
			ipoib_warn(priv, "RX drain timing out\n");

			/*
			 * assume the HW is wedged and just free up everything.
			 */
			list_splice_init(&priv->cm.rx_flush_list,
					 &priv->cm.rx_reap_list);
			list_splice_init(&priv->cm.rx_error_list,
					 &priv->cm.rx_reap_list);
			list_splice_init(&priv->cm.rx_drain_list,
					 &priv->cm.rx_reap_list);
			break;
		}
		spin_unlock_irq(&priv->lock);
		msleep(1);
		ipoib_drain_cq(priv);
		spin_lock_irq(&priv->lock);
	}

	spin_unlock_irq(&priv->lock);

	ipoib_cm_free_rx_reap_list(priv);

	cancel_delayed_work_sync(&priv->cm.stale_task);
}

static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
{
	struct ipoib_cm_tx *p = cm_id->context;
	struct ipoib_dev_priv *priv = p->priv;
	struct ipoib_cm_data *data = event->private_data;
+	struct epoch_tracker et;
	struct ifqueue mbqueue;
	struct ib_qp_attr qp_attr;
	int qp_attr_mask, ret;
	struct mbuf *mb;

	ipoib_dbg(priv, "cm rep handler\n");

	p->mtu = be32_to_cpu(data->mtu);

	if (p->mtu <= IPOIB_ENCAP_LEN) {
		ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
			   p->mtu, IPOIB_ENCAP_LEN);
		return -EINVAL;
	}

	qp_attr.qp_state = IB_QPS_RTR;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
		return ret;
	}

	qp_attr.rq_psn = 0 /* FIXME */;
	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
		return ret;
	}

	qp_attr.qp_state = IB_QPS_RTS;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
		return ret;
	}
	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
		return ret;
	}

	bzero(&mbqueue, sizeof(mbqueue));

	spin_lock_irq(&priv->lock);
	set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
	if (p->path)
		for (;;) {
			_IF_DEQUEUE(&p->path->queue, mb);
			if (mb == NULL)
				break;
			_IF_ENQUEUE(&mbqueue, mb);
		}
	spin_unlock_irq(&priv->lock);

+	NET_EPOCH_ENTER(et);
	for (;;) {
		struct ifnet *dev = p->priv->dev;
		_IF_DEQUEUE(&mbqueue, mb);
		if (mb == NULL)
			break;
		mb->m_pkthdr.rcvif = dev;
		if (dev->if_transmit(dev, mb))
			ipoib_warn(priv, "dev_queue_xmit failed "
				   "to requeue packet\n");
	}
+	NET_EPOCH_EXIT(et);

	ret = ib_send_cm_rtu(cm_id, NULL, 0);
	if (ret) {
		ipoib_warn(priv, "failed to send RTU: %d\n", ret);
		return ret;
	}
	return 0;
}
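/*
 * Editor's sketch, not part of the driver: the NET_EPOCH_ENTER()/
 * NET_EPOCH_EXIT() pair added to ipoib_cm_rep_handler() above (and to
 * path_rec_completion() in ipoib_main.c below) exists because
 * if_transmit() may traverse epoch-protected network structures, so a
 * caller reinjecting queued mbufs from a workqueue or callback context
 * must enter the network epoch itself.  The bare pattern, with a
 * hypothetical helper name:
 */
static int
example_transmit_in_epoch(struct ifnet *ifp, struct mbuf *m)
{
	struct epoch_tracker et;
	int error;

	NET_EPOCH_ENTER(et);	/* hold the net epoch across if_transmit */
	error = ifp->if_transmit(ifp, m);
	NET_EPOCH_EXIT(et);

	return (error);
}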
static struct ib_qp *ipoib_cm_create_tx_qp(struct ipoib_dev_priv *priv,
    struct ipoib_cm_tx *tx)
{
	struct ib_qp_init_attr attr = {
		.send_cq = priv->send_cq,
		.recv_cq = priv->recv_cq,
		.srq = priv->cm.srq,
		.cap.max_send_wr = ipoib_sendq_size,
		.cap.max_send_sge = priv->cm.num_frags,
		.sq_sig_type = IB_SIGNAL_ALL_WR,
		.qp_type = IB_QPT_RC,
		.qp_context = tx
	};

	return ib_create_qp(priv->pd, &attr);
}

static int ipoib_cm_send_req(struct ipoib_dev_priv *priv,
			     struct ib_cm_id *id, struct ib_qp *qp,
			     u32 qpn,
			     struct ib_sa_path_rec *pathrec)
{
	struct ipoib_cm_data data = {};
	struct ib_cm_req_param req = {};

	ipoib_dbg(priv, "cm send req\n");

	data.qpn = cpu_to_be32(priv->qp->qp_num);
	data.mtu = cpu_to_be32(priv->cm.max_cm_mtu);

	req.primary_path		= pathrec;
	req.alternate_path		= NULL;
	req.service_id			= cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
	req.qp_num			= qp->qp_num;
	req.qp_type			= qp->qp_type;
	req.private_data		= &data;
	req.private_data_len		= sizeof data;
	req.flow_control		= 0;

	req.starting_psn		= 0; /* FIXME */

	/*
	 * Pick some arbitrary defaults here; we could make these
	 * module parameters if anyone cared about setting them.
	 */
	req.responder_resources		= 4;
	req.remote_cm_response_timeout	= 20;
	req.local_cm_response_timeout	= 20;
	req.retry_count			= 0; /* RFC draft warns against retries */
	req.rnr_retry_count		= 0; /* RFC draft warns against retries */
	req.max_cm_retries		= 15;
	req.srq				= ipoib_cm_has_srq(priv);
	return ib_send_cm_req(id, &req);
}

static int ipoib_cm_modify_tx_init(struct ipoib_dev_priv *priv,
				  struct ib_cm_id *cm_id, struct ib_qp *qp)
{
	struct ib_qp_attr qp_attr;
	int qp_attr_mask, ret;

	ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
	if (ret) {
		ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret);
		return ret;
	}

	qp_attr.qp_state = IB_QPS_INIT;
	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
	qp_attr.port_num = priv->port;
	qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;

	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
		return ret;
	}
	return 0;
}

static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
			    struct ib_sa_path_rec *pathrec)
{
	struct ipoib_dev_priv *priv = p->priv;
	int ret;

	p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring, GFP_KERNEL);
	if (!p->tx_ring) {
		ipoib_warn(priv, "failed to allocate tx ring\n");
		ret = -ENOMEM;
		goto err_tx;
	}
	memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring);

	p->qp = ipoib_cm_create_tx_qp(p->priv, p);
	if (IS_ERR(p->qp)) {
		ret = PTR_ERR(p->qp);
		ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
		goto err_qp;
	}

	p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
	if (IS_ERR(p->id)) {
		ret = PTR_ERR(p->id);
		ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
		goto err_id;
	}

	ret = ipoib_cm_modify_tx_init(p->priv, p->id, p->qp);
	if (ret) {
		ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
		goto err_modify;
	}

	ret = ipoib_cm_send_req(p->priv, p->id, p->qp, qpn, pathrec);
	if (ret) {
		ipoib_warn(priv, "failed to send cm req: %d\n", ret);
		goto err_send_cm;
	}

	ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n",
		  p->qp->qp_num, pathrec->dgid.raw, qpn);

	return 0;

err_send_cm:
err_modify:
	ib_destroy_cm_id(p->id);
err_id:
	p->id = NULL;
	ib_destroy_qp(p->qp);
err_qp:
	p->qp = NULL;
	kfree(p->tx_ring);
err_tx:
	return ret;
}
static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
{
	struct ipoib_dev_priv *priv = p->priv;
	struct ifnet *dev = priv->dev;
	struct ipoib_cm_tx_buf *tx_req;
	unsigned long begin;

	ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
		  p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);

	if (p->path)
		ipoib_path_free(priv, p->path);

	if (p->id)
		ib_destroy_cm_id(p->id);

	if (p->tx_ring) {
		/* Wait for all sends to complete */
		begin = jiffies;
		while ((int) p->tx_tail - (int) p->tx_head < 0) {
			if (time_after(jiffies, begin + 5 * HZ)) {
				ipoib_warn(priv, "timing out; %d sends not completed\n",
					   p->tx_head - p->tx_tail);
				goto timeout;
			}

			msleep(1);
		}
	}

timeout:

	while ((int) p->tx_tail - (int) p->tx_head < 0) {
		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
		ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req);
		m_freem(tx_req->mb);
		++p->tx_tail;
		if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
		    (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 &&
		    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
			dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
	}

	if (p->qp)
		ib_destroy_qp(p->qp);

	kfree(p->tx_ring);
	kfree(p);
}

static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
			       struct ib_cm_event *event)
{
	struct ipoib_cm_tx *tx = cm_id->context;
	struct ipoib_dev_priv *priv = tx->priv;
	struct ipoib_path *path;
	unsigned long flags;
	int ret;

	switch (event->event) {
	case IB_CM_DREQ_RECEIVED:
		ipoib_dbg(priv, "DREQ received.\n");
		ib_send_cm_drep(cm_id, NULL, 0);
		break;
	case IB_CM_REP_RECEIVED:
		ipoib_dbg(priv, "REP received.\n");
		ret = ipoib_cm_rep_handler(cm_id, event);
		if (ret)
			ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
				       NULL, 0, NULL, 0);
		break;
	case IB_CM_REQ_ERROR:
	case IB_CM_REJ_RECEIVED:
	case IB_CM_TIMEWAIT_EXIT:
		ipoib_dbg(priv, "CM error %d.\n", event->event);
		spin_lock_irqsave(&priv->lock, flags);
		path = tx->path;

		if (path) {
			path->cm = NULL;
			tx->path = NULL;
			rb_erase(&path->rb_node, &priv->path_tree);
			list_del(&path->list);
		}

		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
			list_move(&tx->list, &priv->cm.reap_list);
			queue_work(ipoib_workqueue, &priv->cm.reap_task);
		}

		spin_unlock_irqrestore(&priv->lock, flags);
		if (path)
			ipoib_path_free(tx->priv, path);
		break;
	default:
		break;
	}

	return 0;
}

struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv,
    struct ipoib_path *path)
{
	struct ipoib_cm_tx *tx;

	tx = kzalloc(sizeof *tx, GFP_ATOMIC);
	if (!tx)
		return NULL;

	ipoib_dbg(priv, "Creating cm tx\n");
	path->cm = tx;
	tx->path = path;
	tx->priv = priv;
	list_add(&tx->list, &priv->cm.start_list);
	set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
	queue_work(ipoib_workqueue, &priv->cm.start_task);
	return tx;
}

void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
{
	struct ipoib_dev_priv *priv = tx->priv;
	if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
		spin_lock(&priv->lock);
		list_move(&tx->list, &priv->cm.reap_list);
		spin_unlock(&priv->lock);
		queue_work(ipoib_workqueue, &priv->cm.reap_task);
		ipoib_dbg(priv, "Reap connection for gid %pI6\n",
			  tx->path->pathrec.dgid.raw);
		tx->path = NULL;
	}
}
static void ipoib_cm_tx_start(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.start_task);
	struct ipoib_path *path;
	struct ipoib_cm_tx *p;
	unsigned long flags;
	int ret;

	struct ib_sa_path_rec pathrec;
	u32 qpn;

	ipoib_dbg(priv, "cm start task\n");
	spin_lock_irqsave(&priv->lock, flags);

	while (!list_empty(&priv->cm.start_list)) {
		p = list_entry(priv->cm.start_list.next, typeof(*p), list);
		list_del_init(&p->list);
		path = p->path;
		qpn = IPOIB_QPN(path->hwaddr);
		memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);

		spin_unlock_irqrestore(&priv->lock, flags);

		ret = ipoib_cm_tx_init(p, qpn, &pathrec);

		spin_lock_irqsave(&priv->lock, flags);

		if (ret) {
			path = p->path;
			if (path) {
				path->cm = NULL;
				rb_erase(&path->rb_node, &priv->path_tree);
				list_del(&path->list);
				ipoib_path_free(priv, path);
			}
			list_del(&p->list);
			kfree(p);
		}
	}

	spin_unlock_irqrestore(&priv->lock, flags);
}

static void ipoib_cm_tx_reap(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.reap_task);
	struct ipoib_cm_tx *p;
	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);

	while (!list_empty(&priv->cm.reap_list)) {
		p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
		list_del(&p->list);
		spin_unlock_irqrestore(&priv->lock, flags);
		ipoib_cm_tx_destroy(p);
		spin_lock_irqsave(&priv->lock, flags);
	}

	spin_unlock_irqrestore(&priv->lock, flags);
}

static void ipoib_cm_mb_reap(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.mb_task);
	struct mbuf *mb;
	unsigned long flags;
#if defined(INET) || defined(INET6)
	unsigned mtu = priv->mcast_mtu;
#endif
	uint16_t proto;

	spin_lock_irqsave(&priv->lock, flags);

	for (;;) {
		IF_DEQUEUE(&priv->cm.mb_queue, mb);
		if (mb == NULL)
			break;
		spin_unlock_irqrestore(&priv->lock, flags);

		proto = htons(*mtod(mb, uint16_t *));
		m_adj(mb, IPOIB_ENCAP_LEN);
		switch (proto) {
#if defined(INET)
		case ETHERTYPE_IP:
			icmp_error(mb, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu);
			break;
#endif
#if defined(INET6)
		case ETHERTYPE_IPV6:
			icmp6_error(mb, ICMP6_PACKET_TOO_BIG, 0, mtu);
			break;
#endif
		default:
			m_freem(mb);
		}

		spin_lock_irqsave(&priv->lock, flags);
	}

	spin_unlock_irqrestore(&priv->lock, flags);
}

void ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu)
{
	int e = priv->cm.mb_queue.ifq_len;

	IF_ENQUEUE(&priv->cm.mb_queue, mb);
	if (e == 0)
		queue_work(ipoib_workqueue, &priv->cm.mb_task);
}

static void ipoib_cm_rx_reap(struct work_struct *work)
{
	ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv,
						cm.rx_reap_task));
}

static void ipoib_cm_stale_task(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.stale_task.work);
	struct ipoib_cm_rx *p;
	int ret;

	spin_lock_irq(&priv->lock);
	while (!list_empty(&priv->cm.passive_ids)) {
		/* List is sorted by LRU, start from tail,
		 * stop when we see a recently used entry */
		p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
		if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
			break;
		list_move(&p->list, &priv->cm.rx_error_list);
		p->state = IPOIB_CM_RX_ERROR;
		spin_unlock_irq(&priv->lock);
		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
		if (ret)
			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
		spin_lock_irq(&priv->lock);
	}

	if (!list_empty(&priv->cm.passive_ids))
		queue_delayed_work(ipoib_workqueue,
				   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
	spin_unlock_irq(&priv->lock);
}
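/*
 * Editor's sketch, not part of the driver: ipoib_cm_mb_too_long() above
 * uses an "enqueue, then kick the worker only on the empty-to-non-empty
 * transition" idiom, so a burst of oversized packets schedules
 * cm.mb_task at most once; ipoib_cm_mb_reap() then drains the queue
 * until it is empty.  The same idiom with hypothetical names:
 */
static void
example_defer_to_task(struct ifqueue *q, struct work_struct *task,
    struct mbuf *m)
{
	int was_empty = (q->ifq_len == 0);

	IF_ENQUEUE(q, m);		/* IF_ENQUEUE takes the queue mutex */
	if (was_empty)
		queue_work(ipoib_workqueue, task);
}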
static void ipoib_cm_create_srq(struct ipoib_dev_priv *priv, int max_sge)
{
	struct ib_srq_init_attr srq_init_attr = {
		.attr = {
			.max_wr  = ipoib_recvq_size,
			.max_sge = max_sge
		}
	};

	priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
	if (IS_ERR(priv->cm.srq)) {
		if (PTR_ERR(priv->cm.srq) != -ENOSYS)
			printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n",
			       priv->ca->name, PTR_ERR(priv->cm.srq));
		priv->cm.srq = NULL;
		return;
	}

	priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring,
				    GFP_KERNEL);
	if (!priv->cm.srq_ring) {
		printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n",
		       priv->ca->name, ipoib_recvq_size);
		ib_destroy_srq(priv->cm.srq);
		priv->cm.srq = NULL;
		return;
	}

	memset(priv->cm.srq_ring, 0, ipoib_recvq_size * sizeof *priv->cm.srq_ring);
}

int ipoib_cm_dev_init(struct ipoib_dev_priv *priv)
{
	struct ifnet *dev = priv->dev;
	int i;
	int max_srq_sge;

	INIT_LIST_HEAD(&priv->cm.passive_ids);
	INIT_LIST_HEAD(&priv->cm.reap_list);
	INIT_LIST_HEAD(&priv->cm.start_list);
	INIT_LIST_HEAD(&priv->cm.rx_error_list);
	INIT_LIST_HEAD(&priv->cm.rx_flush_list);
	INIT_LIST_HEAD(&priv->cm.rx_drain_list);
	INIT_LIST_HEAD(&priv->cm.rx_reap_list);
	INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
	INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
	INIT_WORK(&priv->cm.mb_task, ipoib_cm_mb_reap);
	INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
	INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);

	bzero(&priv->cm.mb_queue, sizeof(priv->cm.mb_queue));
	mtx_init(&priv->cm.mb_queue.ifq_mtx,
	    dev->if_xname, "if send queue", MTX_DEF);

	max_srq_sge = priv->ca->attrs.max_srq_sge;

	ipoib_dbg(priv, "max_srq_sge=%d\n", max_srq_sge);

	max_srq_sge = min_t(int, IPOIB_CM_RX_SG, max_srq_sge);
	ipoib_cm_create_srq(priv, max_srq_sge);
	if (ipoib_cm_has_srq(priv)) {
		priv->cm.max_cm_mtu = max_srq_sge * MJUMPAGESIZE;
		priv->cm.num_frags = max_srq_sge;
		ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
			  priv->cm.max_cm_mtu, priv->cm.num_frags);
	} else {
		priv->cm.max_cm_mtu = IPOIB_CM_MAX_MTU;
		priv->cm.num_frags = IPOIB_CM_RX_SG;
	}

	ipoib_cm_init_rx_wr(priv, &priv->cm.rx_wr, priv->cm.rx_sge);

	if (ipoib_cm_has_srq(priv)) {
		for (i = 0; i < ipoib_recvq_size; ++i) {
			if (!ipoib_cm_alloc_rx_mb(priv, &priv->cm.srq_ring[i])) {
				ipoib_warn(priv, "failed to allocate "
					   "receive buffer %d\n", i);
				ipoib_cm_dev_cleanup(priv);
				return -ENOMEM;
			}

			if (ipoib_cm_post_receive_srq(priv, i)) {
				ipoib_warn(priv, "ipoib_cm_post_receive_srq "
					   "failed for buf %d\n", i);
				ipoib_cm_dev_cleanup(priv);
				return -EIO;
			}
		}
	}

	IF_LLADDR(priv->dev)[0] = IPOIB_FLAGS_RC;
	return 0;
}

void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv)
{
	int ret;

	if (!priv->cm.srq)
		return;

	ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");

	ret = ib_destroy_srq(priv->cm.srq);
	if (ret)
		ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);

	priv->cm.srq = NULL;
	if (!priv->cm.srq_ring)
		return;

	ipoib_cm_free_rx_ring(priv, priv->cm.srq_ring);
	priv->cm.srq_ring = NULL;

	mtx_destroy(&priv->cm.mb_queue.ifq_mtx);
}

#endif /* CONFIG_INFINIBAND_IPOIB_CM */
Index: head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c	(revision 353545)
+++ head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c	(revision 353546)
@@ -1,1762 +1,1767 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0
 *
 * Copyright (c) 2004 Topspin Communications. All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "ipoib.h"
#include

static int ipoib_resolvemulti(struct ifnet *, struct sockaddr **,
	struct sockaddr *);

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/if_arp.h>	/* For ARPHRD_xxx */
#include
#include
#include
#include

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");

int ipoib_sendq_size = IPOIB_TX_RING_SIZE;
int ipoib_recvq_size = IPOIB_RX_RING_SIZE;

module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");

module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
int ipoib_debug_level = 1;

module_param_named(debug_level, ipoib_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
#endif

struct ipoib_path_iter {
	struct ipoib_dev_priv *priv;
	struct ipoib_path  path;
};

static const u8 ipv4_bcast_addr[] = {
	0x00, 0xff, 0xff, 0xff,
	0xff, 0x12, 0x40, 0x1b,	0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00,	0xff, 0xff, 0xff, 0xff
};

struct workqueue_struct *ipoib_workqueue;

struct ib_sa_client ipoib_sa_client;

static void ipoib_add_one(struct ib_device *device);
static void ipoib_remove_one(struct ib_device *device, void *client_data);
static struct net_device *ipoib_get_net_dev_by_params(
		struct ib_device *dev, u8 port, u16 pkey,
		const union ib_gid *gid, const struct sockaddr *addr,
		void *client_data);
static void ipoib_start(struct ifnet *dev);
static int ipoib_output(struct ifnet *ifp, struct mbuf *m,
	    const struct sockaddr *dst, struct route *ro);
static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data);
static void ipoib_input(struct ifnet *ifp, struct mbuf *m);

#define	IPOIB_MTAP(_ifp, _m)					\
do {								\
	if (bpf_peers_present((_ifp)->if_bpf)) {		\
		M_ASSERTVALID(_m);				\
		ipoib_mtap_mb((_ifp), (_m));			\
	}							\
} while (0)

static struct unrhdr *ipoib_unrhdr;

static void
ipoib_unrhdr_init(void *arg)
{

	ipoib_unrhdr = new_unrhdr(0, 65535, NULL);
}
SYSINIT(ipoib_unrhdr_init, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_init, NULL);

static void
ipoib_unrhdr_uninit(void *arg)
{

	if (ipoib_unrhdr != NULL) {
		struct unrhdr *hdr;

		hdr = ipoib_unrhdr;
		ipoib_unrhdr = NULL;

		delete_unrhdr(hdr);
	}
}
SYSUNINIT(ipoib_unrhdr_uninit, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_uninit, NULL);

/*
 * This is for clients that have an ipoib_header in the mbuf.
 */
static void
ipoib_mtap_mb(struct ifnet *ifp, struct mbuf *mb)
{
	struct ipoib_header *ih;
	struct ether_header eh;

	ih = mtod(mb, struct ipoib_header *);
	eh.ether_type = ih->proto;
	bcopy(ih->hwaddr, &eh.ether_dhost, ETHER_ADDR_LEN);
	bzero(&eh.ether_shost, ETHER_ADDR_LEN);
	mb->m_data += sizeof(struct ipoib_header);
	mb->m_len -= sizeof(struct ipoib_header);
	bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
	mb->m_data -= sizeof(struct ipoib_header);
	mb->m_len += sizeof(struct ipoib_header);
}

void
ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto)
{
	struct ether_header eh;

	eh.ether_type = proto;
	bzero(&eh.ether_shost, ETHER_ADDR_LEN);
	bzero(&eh.ether_dhost, ETHER_ADDR_LEN);
	bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
}

static struct ib_client ipoib_client = {
	.name   = "ipoib",
	.add    = ipoib_add_one,
	.remove = ipoib_remove_one,
	.get_net_dev_by_params = ipoib_get_net_dev_by_params,
};

int
ipoib_open(struct ipoib_dev_priv *priv)
{
	struct ifnet *dev = priv->dev;

	ipoib_dbg(priv, "bringing up interface\n");

	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	if (ipoib_pkey_dev_delay_open(priv))
		return 0;

	if (ipoib_ib_dev_open(priv))
		goto err_disable;

	if (ipoib_ib_dev_up(priv))
		goto err_stop;

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring up any child interfaces too */
		mutex_lock(&priv->vlan_mutex);
		list_for_each_entry(cpriv, &priv->child_intfs, list)
			if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
				ipoib_open(cpriv);
		mutex_unlock(&priv->vlan_mutex);
	}
	dev->if_drv_flags |= IFF_DRV_RUNNING;
	dev->if_drv_flags &= ~IFF_DRV_OACTIVE;

	return 0;

err_stop:
	ipoib_ib_dev_stop(priv, 1);

err_disable:
	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	return -EINVAL;
}

static void
ipoib_init(void *arg)
{
	struct ifnet *dev;
	struct ipoib_dev_priv *priv;

	priv = arg;
	dev = priv->dev;
	if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
		ipoib_open(priv);
	queue_work(ipoib_workqueue, &priv->flush_light);
}

static int
ipoib_stop(struct ipoib_dev_priv *priv)
{
	struct ifnet *dev = priv->dev;

	ipoib_dbg(priv, "stopping interface\n");

	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);

	ipoib_ib_dev_down(priv, 0);
	ipoib_ib_dev_stop(priv, 0);

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring down any child interfaces too */
		mutex_lock(&priv->vlan_mutex);
		list_for_each_entry(cpriv, &priv->child_intfs, list)
			if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) != 0)
				ipoib_stop(cpriv);
		mutex_unlock(&priv->vlan_mutex);
	}

	return 0;
}

static int
ipoib_propagate_ifnet_mtu(struct ipoib_dev_priv *priv, int new_mtu,
    bool propagate)
{
	struct ifnet *ifp;
	struct ifreq ifr;
	int error;

	ifp = priv->dev;
	if (ifp->if_mtu == new_mtu)
		return (0);
	if (propagate) {
		strlcpy(ifr.ifr_name, if_name(ifp), IFNAMSIZ);
		ifr.ifr_mtu = new_mtu;
		CURVNET_SET(ifp->if_vnet);
		error = ifhwioctl(SIOCSIFMTU, ifp, (caddr_t)&ifr, curthread);
		CURVNET_RESTORE();
	} else {
		ifp->if_mtu = new_mtu;
		error = 0;
	}
	return (error);
}
int
ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu, bool propagate)
{
	int error, prev_admin_mtu;

	/* dev->if_mtu > 2K ==> connected mode */
	if (ipoib_cm_admin_enabled(priv)) {
		if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)))
			return -EINVAL;

		if (new_mtu > priv->mcast_mtu)
			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
				   priv->mcast_mtu);

		return (ipoib_propagate_ifnet_mtu(priv, new_mtu, propagate));
	}

	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
		return -EINVAL;

	prev_admin_mtu = priv->admin_mtu;
	priv->admin_mtu = new_mtu;
	error = ipoib_propagate_ifnet_mtu(priv, min(priv->mcast_mtu,
	    priv->admin_mtu), propagate);
	if (error == 0) {
		/* check for MTU change to avoid infinite loop */
		if (prev_admin_mtu != new_mtu)
			queue_work(ipoib_workqueue, &priv->flush_light);
	} else
		priv->admin_mtu = prev_admin_mtu;
	return (error);
}

static int
ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
	struct ipoib_dev_priv *priv = ifp->if_softc;
	struct ifaddr *ifa = (struct ifaddr *) data;
	struct ifreq *ifr = (struct ifreq *) data;
	int error = 0;

	/* check if detaching */
	if (priv == NULL || priv->gone != 0)
		return (ENXIO);

	switch (command) {
	case SIOCSIFFLAGS:
		if (ifp->if_flags & IFF_UP) {
			if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
				error = -ipoib_open(priv);
		} else if (ifp->if_drv_flags & IFF_DRV_RUNNING)
			ipoib_stop(priv);
		break;
	case SIOCADDMULTI:
	case SIOCDELMULTI:
		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
			queue_work(ipoib_workqueue, &priv->restart_task);
		break;
	case SIOCSIFADDR:
		ifp->if_flags |= IFF_UP;

		switch (ifa->ifa_addr->sa_family) {
#ifdef INET
		case AF_INET:
			ifp->if_init(ifp->if_softc);	/* before arpwhohas */
			arp_ifinit(ifp, ifa);
			break;
#endif
		default:
			ifp->if_init(ifp->if_softc);
			break;
		}
		break;

	case SIOCGIFADDR:
		bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0],
		    INFINIBAND_ALEN);
		break;

	case SIOCSIFMTU:
		/*
		 * Set the interface MTU.
		 */
		error = -ipoib_change_mtu(priv, ifr->ifr_mtu, false);
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}

static struct ipoib_path *
__path_find(struct ipoib_dev_priv *priv, void *gid)
{
	struct rb_node *n = priv->path_tree.rb_node;
	struct ipoib_path *path;
	int ret;

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

		ret = memcmp(gid, path->pathrec.dgid.raw,
			     sizeof (union ib_gid));

		if (ret < 0)
			n = n->rb_left;
		else if (ret > 0)
			n = n->rb_right;
		else
			return path;
	}

	return NULL;
}

static int
__path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path)
{
	struct rb_node **n = &priv->path_tree.rb_node;
	struct rb_node *pn = NULL;
	struct ipoib_path *tpath;
	int ret;

	while (*n) {
		pn = *n;
		tpath = rb_entry(pn, struct ipoib_path, rb_node);

		ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
			     sizeof (union ib_gid));
		if (ret < 0)
			n = &pn->rb_left;
		else if (ret > 0)
			n = &pn->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&path->rb_node, pn, n);
	rb_insert_color(&path->rb_node, &priv->path_tree);

	list_add_tail(&path->list, &priv->path_list);

	return 0;
}

void
ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path)
{

	_IF_DRAIN(&path->queue);

	if (path->ah)
		ipoib_put_ah(path->ah);
	if (ipoib_cm_get(path))
		ipoib_cm_destroy_tx(ipoib_cm_get(path));

	kfree(path);
}

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG

struct ipoib_path_iter *
ipoib_path_iter_init(struct ipoib_dev_priv *priv)
{
	struct ipoib_path_iter *iter;

	iter = kmalloc(sizeof *iter, GFP_KERNEL);
	if (!iter)
		return NULL;

	iter->priv = priv;
	memset(iter->path.pathrec.dgid.raw, 0, 16);

	if (ipoib_path_iter_next(iter)) {
		kfree(iter);
		return NULL;
	}

	return iter;
}

int
ipoib_path_iter_next(struct ipoib_path_iter *iter)
{
	struct ipoib_dev_priv *priv = iter->priv;
	struct rb_node *n;
	struct ipoib_path *path;
	int ret = 1;

	spin_lock_irq(&priv->lock);

	n = rb_first(&priv->path_tree);

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

		if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
			   sizeof (union ib_gid)) < 0) {
			iter->path = *path;
			ret = 0;
			break;
		}

		n = rb_next(n);
	}

	spin_unlock_irq(&priv->lock);

	return ret;
}
void
ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path)
{
	*path = iter->path;
}

#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */

void
ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv)
{
	struct ipoib_path *path, *tp;

	spin_lock_irq(&priv->lock);

	list_for_each_entry_safe(path, tp, &priv->path_list, list) {
		ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n",
			be16_to_cpu(path->pathrec.dlid),
			path->pathrec.dgid.raw, ":");
		path->valid = 0;
	}

	spin_unlock_irq(&priv->lock);
}

void
ipoib_flush_paths(struct ipoib_dev_priv *priv)
{
	struct ipoib_path *path, *tp;
	LIST_HEAD(remove_list);
	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);

	list_splice_init(&priv->path_list, &remove_list);

	list_for_each_entry(path, &remove_list, list)
		rb_erase(&path->rb_node, &priv->path_tree);

	list_for_each_entry_safe(path, tp, &remove_list, list) {
		if (path->query)
			ib_sa_cancel_query(path->query_id, path->query);
		spin_unlock_irqrestore(&priv->lock, flags);
		wait_for_completion(&path->done);
		ipoib_path_free(priv, path);
		spin_lock_irqsave(&priv->lock, flags);
	}

	spin_unlock_irqrestore(&priv->lock, flags);
}

static void
path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr)
{
	struct ipoib_path *path = path_ptr;
	struct ipoib_dev_priv *priv = path->priv;
	struct ifnet *dev = priv->dev;
	struct ipoib_ah *ah = NULL;
	struct ipoib_ah *old_ah = NULL;
+	struct epoch_tracker et;
	struct ifqueue mbqueue;
	struct mbuf *mb;
	unsigned long flags;

	if (!status)
		ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n",
			  be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":");
	else
		ipoib_dbg(priv, "PathRec status %d for GID %16D\n",
			  status, path->pathrec.dgid.raw, ":");

	bzero(&mbqueue, sizeof(mbqueue));

	if (!status) {
		struct ib_ah_attr av;

		if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
			ah = ipoib_create_ah(priv, priv->pd, &av);
	}

	spin_lock_irqsave(&priv->lock, flags);

	if (ah) {
		path->pathrec = *pathrec;

		old_ah   = path->ah;
		path->ah = ah;

		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
			  ah, be16_to_cpu(pathrec->dlid), pathrec->sl);

		for (;;) {
			_IF_DEQUEUE(&path->queue, mb);
			if (mb == NULL)
				break;
			_IF_ENQUEUE(&mbqueue, mb);
		}

#ifdef CONFIG_INFINIBAND_IPOIB_CM
		if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path))
			ipoib_cm_set(path, ipoib_cm_create_tx(priv, path));
#endif

		path->valid = 1;
	}

	path->query = NULL;
	complete(&path->done);

	spin_unlock_irqrestore(&priv->lock, flags);

	if (old_ah)
		ipoib_put_ah(old_ah);

+	NET_EPOCH_ENTER(et);
	for (;;) {
		_IF_DEQUEUE(&mbqueue, mb);
		if (mb == NULL)
			break;
		mb->m_pkthdr.rcvif = dev;
		if (dev->if_transmit(dev, mb))
			ipoib_warn(priv, "dev_queue_xmit failed "
				   "to requeue packet\n");
	}
+	NET_EPOCH_EXIT(et);
}

static struct ipoib_path *
path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr)
{
	struct ipoib_path *path;

	if (!priv->broadcast)
		return NULL;

	path = kzalloc(sizeof *path, GFP_ATOMIC);
	if (!path)
		return NULL;

	path->priv = priv;

	bzero(&path->queue, sizeof(path->queue));

#ifdef CONFIG_INFINIBAND_IPOIB_CM
	memcpy(&path->hwaddr, hwaddr, INFINIBAND_ALEN);
#endif
	memcpy(path->pathrec.dgid.raw, &hwaddr[4], sizeof (union ib_gid));
	path->pathrec.sgid	    = priv->local_gid;
	path->pathrec.pkey	    = cpu_to_be16(priv->pkey);
	path->pathrec.numb_path     = 1;
	path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;

	return path;
}
static int
path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path)
{
	struct ifnet *dev = priv->dev;
	ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU;
	struct ib_sa_path_rec p_rec;

	p_rec = path->pathrec;
	p_rec.mtu_selector = IB_SA_GT;

	switch (roundup_pow_of_two(dev->if_mtu + IPOIB_ENCAP_LEN)) {
	case 512:
		p_rec.mtu = IB_MTU_256;
		break;
	case 1024:
		p_rec.mtu = IB_MTU_512;
		break;
	case 2048:
		p_rec.mtu = IB_MTU_1024;
		break;
	case 4096:
		p_rec.mtu = IB_MTU_2048;
		break;
	default:
		/* Wildcard everything */
		comp_mask = 0;
		p_rec.mtu = 0;
		p_rec.mtu_selector = 0;
	}

	ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n",
		  p_rec.dgid.raw, ":",
		  comp_mask ? ib_mtu_enum_to_int(p_rec.mtu) : 0);

	init_completion(&path->done);

	path->query_id =
		ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
				   &p_rec, comp_mask		|
				   IB_SA_PATH_REC_DGID		|
				   IB_SA_PATH_REC_SGID		|
				   IB_SA_PATH_REC_NUMB_PATH	|
				   IB_SA_PATH_REC_TRAFFIC_CLASS |
				   IB_SA_PATH_REC_PKEY,
				   1000, GFP_ATOMIC,
				   path_rec_completion,
				   path, &path->query);
	if (path->query_id < 0) {
		ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
		path->query = NULL;
		complete(&path->done);
		return path->query_id;
	}

	return 0;
}

static void
ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh)
{
	struct ipoib_path *path;

	path = __path_find(priv, eh->hwaddr + 4);
	if (!path || !path->valid) {
		int new_path = 0;

		if (!path) {
			path = path_rec_create(priv, eh->hwaddr);
			new_path = 1;
		}
		if (path) {
			if (_IF_QLEN(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE)
				_IF_ENQUEUE(&path->queue, mb);
			else {
				if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1);
				m_freem(mb);
			}

			if (!path->query && path_rec_start(priv, path)) {
				spin_unlock_irqrestore(&priv->lock, flags);
				if (new_path)
					ipoib_path_free(priv, path);
				return;
			} else
				__path_add(priv, path);
		} else {
			if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1);
			m_freem(mb);
		}

		return;
	}

	if (ipoib_cm_get(path) && ipoib_cm_up(path)) {
		ipoib_cm_send(priv, mb, ipoib_cm_get(path));
	} else if (path->ah) {
		ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr));
	} else if ((path->query || !path_rec_start(priv, path)) &&
		    path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) {
		_IF_ENQUEUE(&path->queue, mb);
	} else {
		if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1);
		m_freem(mb);
	}
}

static int
ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb)
{
	struct ipoib_header *eh;

	eh = mtod(mb, struct ipoib_header *);
	if (IPOIB_IS_MULTICAST(eh->hwaddr)) {
		/* Add in the P_Key for multicast*/
		eh->hwaddr[8] = (priv->pkey >> 8) & 0xff;
		eh->hwaddr[9] = priv->pkey & 0xff;

		ipoib_mcast_send(priv, eh->hwaddr + 4, mb);
	} else
		ipoib_unicast_send(mb, priv, eh);

	return 0;
}

void
ipoib_start_locked(struct ifnet *dev, struct ipoib_dev_priv *priv)
{
	struct mbuf *mb;

	assert_spin_locked(&priv->lock);

	while (!IFQ_DRV_IS_EMPTY(&dev->if_snd) &&
	    (dev->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
		IFQ_DRV_DEQUEUE(&dev->if_snd, mb);
		if (mb == NULL)
			break;
		IPOIB_MTAP(dev, mb);
		ipoib_send_one(priv, mb);
	}
}

static void
_ipoib_start(struct ifnet *dev, struct ipoib_dev_priv *priv)
{

	if ((dev->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING)
		return;

	spin_lock(&priv->lock);
	ipoib_start_locked(dev, priv);
	spin_unlock(&priv->lock);
}

static void
ipoib_start(struct ifnet *dev)
{
	_ipoib_start(dev, dev->if_softc);
}

static void
ipoib_vlan_start(struct ifnet *dev)
{
	struct ipoib_dev_priv *priv;
	struct mbuf *mb;

	priv = VLAN_COOKIE(dev);
	if (priv != NULL)
		return _ipoib_start(dev, priv);
	while (!IFQ_DRV_IS_EMPTY(&dev->if_snd)) {
		IFQ_DRV_DEQUEUE(&dev->if_snd, mb);
		if (mb == NULL)
			break;
		m_freem(mb);
		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
	}
}
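/*
 * Editor's note, not part of the driver: the MTU handling in
 * path_rec_start() above queries the SA with an IB_SA_GT selector, so
 * matching paths must have an IB MTU strictly greater than the enum
 * placed in the query (see the "MTU > %d" debug message).  For example,
 * a 1500-byte if_mtu plus the 4-byte encapsulation header rounds up to
 * 2048, which maps to IB_MTU_1024, i.e. "paths whose MTU exceeds 1024
 * bytes", admitting 2048- and 4096-byte path MTUs.
 */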
int
ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port)
{

	/* Allocate RX/TX "rings" to hold queued mbs */
	priv->rx_ring =	kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
				GFP_KERNEL);
	if (!priv->rx_ring) {
		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
		       ca->name, ipoib_recvq_size);
		goto out;
	}

	priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL);
	if (!priv->tx_ring) {
		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
		       ca->name, ipoib_sendq_size);
		goto out_rx_ring_cleanup;
	}
	memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring);

	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */

	if (ipoib_ib_dev_init(priv, ca, port))
		goto out_tx_ring_cleanup;

	return 0;

out_tx_ring_cleanup:
	kfree(priv->tx_ring);

out_rx_ring_cleanup:
	kfree(priv->rx_ring);

out:
	return -ENOMEM;
}

static void
ipoib_detach(struct ipoib_dev_priv *priv)
{
	struct ifnet *dev;

	dev = priv->dev;
	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		priv->gone = 1;
		bpfdetach(dev);
		if_detach(dev);
		if_free(dev);
		free_unr(ipoib_unrhdr, priv->unit);
	} else
		VLAN_SETCOOKIE(priv->dev, NULL);

	free(priv, M_TEMP);
}

void
ipoib_dev_cleanup(struct ipoib_dev_priv *priv)
{
	struct ipoib_dev_priv *cpriv, *tcpriv;

	/* Delete any child interfaces first */
	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
		ipoib_dev_cleanup(cpriv);
		ipoib_detach(cpriv);
	}

	ipoib_ib_dev_cleanup(priv);

	kfree(priv->rx_ring);
	kfree(priv->tx_ring);

	priv->rx_ring = NULL;
	priv->tx_ring = NULL;
}

static struct ipoib_dev_priv *
ipoib_priv_alloc(void)
{
	struct ipoib_dev_priv *priv;

	priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK);
	spin_lock_init(&priv->lock);
	spin_lock_init(&priv->drain_lock);
	mutex_init(&priv->vlan_mutex);
	INIT_LIST_HEAD(&priv->path_list);
	INIT_LIST_HEAD(&priv->child_intfs);
	INIT_LIST_HEAD(&priv->dead_ahs);
	INIT_LIST_HEAD(&priv->multicast_list);
	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
	INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
	INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);
	INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);
	INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
	memcpy(priv->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN);

	return (priv);
}

struct ipoib_dev_priv *
ipoib_intf_alloc(const char *name)
{
	struct ipoib_dev_priv *priv;
	struct sockaddr_dl *sdl;
	struct ifnet *dev;

	priv = ipoib_priv_alloc();
	dev = priv->dev = if_alloc(IFT_INFINIBAND);
	if (!dev) {
		free(priv, M_TEMP);
		return NULL;
	}
	dev->if_softc = priv;
	priv->unit = alloc_unr(ipoib_unrhdr);
	if (priv->unit == -1) {
		if_free(dev);
		free(priv, M_TEMP);
		return NULL;
	}
	if_initname(dev, name, priv->unit);
	dev->if_flags = IFF_BROADCAST | IFF_MULTICAST;
	dev->if_addrlen = INFINIBAND_ALEN;
	dev->if_hdrlen = IPOIB_HEADER_LEN;
	if_attach(dev);
	dev->if_init = ipoib_init;
	dev->if_ioctl = ipoib_ioctl;
	dev->if_start = ipoib_start;
	dev->if_output = ipoib_output;
	dev->if_input = ipoib_input;
	dev->if_resolvemulti = ipoib_resolvemulti;
	dev->if_baudrate = IF_Gbps(10);
	dev->if_broadcastaddr = priv->broadcastaddr;
	dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2;
	sdl = (struct sockaddr_dl *)dev->if_addr->ifa_addr;
	sdl->sdl_type = IFT_INFINIBAND;
	sdl->sdl_alen = dev->if_addrlen;
	priv->dev = dev;
	if_link_state_change(dev, LINK_STATE_DOWN);
	bpfattach(dev, DLT_EN10MB, ETHER_HDR_LEN);

	return dev->if_softc;
}
struct ib_device *hca) { struct ib_device_attr *device_attr = &hca->attrs; priv->hca_caps = device_attr->device_cap_flags; priv->dev->if_hwassist = 0; priv->dev->if_capabilities = 0; #ifndef CONFIG_INFINIBAND_IPOIB_CM if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { set_bit(IPOIB_FLAG_CSUM, &priv->flags); priv->dev->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP; priv->dev->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM; } #if 0 if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) { priv->dev->if_capabilities |= IFCAP_TSO4; priv->dev->if_hwassist |= CSUM_TSO; } #endif #endif priv->dev->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE; priv->dev->if_capenable = priv->dev->if_capabilities; return 0; } static struct ifnet * ipoib_add_port(const char *format, struct ib_device *hca, u8 port) { struct ipoib_dev_priv *priv; struct ib_port_attr attr; int result = -ENOMEM; priv = ipoib_intf_alloc(format); if (!priv) goto alloc_mem_failed; if (!ib_query_port(hca, port, &attr)) priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); else { printk(KERN_WARNING "%s: ib_query_port %d failed\n", hca->name, port); goto device_init_failed; } /* MTU will be reset when mcast join happens */ priv->dev->if_mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = priv->dev->if_mtu; result = ib_query_pkey(hca, port, 0, &priv->pkey); if (result) { printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", hca->name, port, result); goto device_init_failed; } if (ipoib_set_dev_features(priv, hca)) goto device_init_failed; /* * Set the full membership bit, so that we join the right * broadcast group, etc. */ priv->pkey |= 0x8000; priv->broadcastaddr[8] = priv->pkey >> 8; priv->broadcastaddr[9] = priv->pkey & 0xff; result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL); if (result) { printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", hca->name, port, result); goto device_init_failed; } memcpy(IF_LLADDR(priv->dev) + 4, priv->local_gid.raw, sizeof (union ib_gid)); result = ipoib_dev_init(priv, hca, port); if (result < 0) { printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", hca->name, port, result); goto device_init_failed; } if (ipoib_cm_admin_enabled(priv)) priv->dev->if_mtu = IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)); INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); result = ib_register_event_handler(&priv->event_handler); if (result < 0) { printk(KERN_WARNING "%s: ib_register_event_handler failed for " "port %d (ret = %d)\n", hca->name, port, result); goto event_failed; } if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port); return priv->dev; event_failed: ipoib_dev_cleanup(priv); device_init_failed: ipoib_detach(priv); alloc_mem_failed: return ERR_PTR(result); } static void ipoib_add_one(struct ib_device *device) { struct list_head *dev_list; struct ifnet *dev; struct ipoib_dev_priv *priv; int s, e, p; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) return; dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); if (!dev_list) return; INIT_LIST_HEAD(dev_list); if (device->node_type == RDMA_NODE_IB_SWITCH) { s = 0; e = 0; } else { s = 1; e = device->phys_port_cnt; } for (p = s; p <= e; ++p) { if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) continue; dev = ipoib_add_port("ib", device, p); if (!IS_ERR(dev)) { priv = dev->if_softc; list_add_tail(&priv->list, dev_list); } } ib_set_client_data(device, &ipoib_client, dev_list); } static void 
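/*
 * Tears down every IPoIB interface on a departing HCA in dependency
 * order: stop the interface, unregister the IB event handler, flush
 * the workqueue, release the rings, then detach and free the ifnet.
 */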
ipoib_remove_one(struct ib_device *device, void *client_data)
{
	struct ipoib_dev_priv *priv, *tmp;
	struct list_head *dev_list = client_data;

	if (!dev_list)
		return;

	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
		return;

	list_for_each_entry_safe(priv, tmp, dev_list, list) {
		if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND)
			continue;

		ipoib_stop(priv);

		ib_unregister_event_handler(&priv->event_handler);

		/* dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); */

		flush_workqueue(ipoib_workqueue);

		ipoib_dev_cleanup(priv);
		ipoib_detach(priv);
	}

	kfree(dev_list);
}

static int
ipoib_match_dev_addr(const struct sockaddr *addr, struct net_device *dev)
{
	struct epoch_tracker et;
	struct ifaddr *ifa;
	int retval = 0;

	CURVNET_SET(dev->if_vnet);
	NET_EPOCH_ENTER(et);
	CK_STAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) {
		if (ifa->ifa_addr == NULL ||
		    ifa->ifa_addr->sa_family != addr->sa_family ||
		    ifa->ifa_addr->sa_len != addr->sa_len) {
			continue;
		}
		if (memcmp(ifa->ifa_addr, addr, addr->sa_len) == 0) {
			retval = 1;
			break;
		}
	}
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();

	return (retval);
}

/*
 * ipoib_match_gid_pkey_addr - returns the number of IPoIB netdevs on
 * top of a given ipoib device matching a pkey_index and address, if one
 * exists.
 *
 * @found_net_dev: contains a matching net_device if the return value
 * >= 1, with a reference held.
 */
static int
ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
    const union ib_gid *gid, u16 pkey_index, const struct sockaddr *addr,
    struct net_device **found_net_dev)
{
	struct ipoib_dev_priv *child_priv;
	int matches = 0;

	if (priv->pkey_index == pkey_index &&
	    (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) {
		if (addr == NULL || ipoib_match_dev_addr(addr, priv->dev) != 0) {
			if (*found_net_dev == NULL) {
				struct net_device *net_dev;

				if (priv->parent != NULL)
					net_dev = priv->parent;
				else
					net_dev = priv->dev;
				*found_net_dev = net_dev;
				dev_hold(net_dev);
			}
			matches++;
		}
	}

	/* Check child interfaces */
	mutex_lock(&priv->vlan_mutex);
	list_for_each_entry(child_priv, &priv->child_intfs, list) {
		matches += ipoib_match_gid_pkey_addr(child_priv, gid,
		    pkey_index, addr, found_net_dev);
		if (matches > 1)
			break;
	}
	mutex_unlock(&priv->vlan_mutex);

	return matches;
}

/*
 * __ipoib_get_net_dev_by_params - returns the number of matching
 * net_devs found (between 0 and 2). 
Also return the matching * net_device in the @net_dev parameter, holding a reference to the * net_device, if the number of matches >= 1 */ static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port, u16 pkey_index, const union ib_gid *gid, const struct sockaddr *addr, struct net_device **net_dev) { struct ipoib_dev_priv *priv; int matches = 0; *net_dev = NULL; list_for_each_entry(priv, dev_list, list) { if (priv->port != port) continue; matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index, addr, net_dev); if (matches > 1) break; } return matches; } static struct net_device * ipoib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data) { struct net_device *net_dev; struct list_head *dev_list = client_data; u16 pkey_index; int matches; int ret; if (!rdma_protocol_ib(dev, port)) return NULL; ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index); if (ret) return NULL; if (!dev_list) return NULL; /* See if we can find a unique device matching the L2 parameters */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, NULL, &net_dev); switch (matches) { case 0: return NULL; case 1: return net_dev; } dev_put(net_dev); /* Couldn't find a unique device with L2 parameters only. Use L3 * address to uniquely match the net device */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, addr, &net_dev); switch (matches) { case 0: return NULL; default: dev_warn_ratelimited(&dev->dev, "duplicate IP address detected\n"); /* Fall through */ case 1: return net_dev; } } static void ipoib_config_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct ipoib_dev_priv *parent; struct ipoib_dev_priv *priv; struct ifnet *dev; uint16_t pkey; int error; if (ifp->if_type != IFT_INFINIBAND) return; dev = VLAN_DEVAT(ifp, vtag); if (dev == NULL) return; priv = NULL; error = 0; parent = ifp->if_softc; /* We only support 15 bits of pkey. */ if (vtag & 0x8000) return; pkey = vtag | 0x8000; /* Set full membership bit. 
*/ if (pkey == parent->pkey) return; /* Check for dups */ mutex_lock(&parent->vlan_mutex); list_for_each_entry(priv, &parent->child_intfs, list) { if (priv->pkey == pkey) { priv = NULL; error = EBUSY; goto out; } } priv = ipoib_priv_alloc(); priv->dev = dev; priv->max_ib_mtu = parent->max_ib_mtu; priv->mcast_mtu = priv->admin_mtu = parent->dev->if_mtu; set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); error = ipoib_set_dev_features(priv, parent->ca); if (error) goto out; priv->pkey = pkey; priv->broadcastaddr[8] = pkey >> 8; priv->broadcastaddr[9] = pkey & 0xff; dev->if_broadcastaddr = priv->broadcastaddr; error = ipoib_dev_init(priv, parent->ca, parent->port); if (error) goto out; priv->parent = parent->dev; list_add_tail(&priv->list, &parent->child_intfs); VLAN_SETCOOKIE(dev, priv); dev->if_start = ipoib_vlan_start; dev->if_drv_flags &= ~IFF_DRV_RUNNING; dev->if_hdrlen = IPOIB_HEADER_LEN; if (ifp->if_drv_flags & IFF_DRV_RUNNING) ipoib_open(priv); mutex_unlock(&parent->vlan_mutex); return; out: mutex_unlock(&parent->vlan_mutex); if (priv) free(priv, M_TEMP); if (error) ipoib_warn(parent, "failed to initialize subinterface: device %s, port %d vtag 0x%X", parent->ca->name, parent->port, vtag); return; } static void ipoib_unconfig_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct ipoib_dev_priv *parent; struct ipoib_dev_priv *priv; struct ifnet *dev; uint16_t pkey; if (ifp->if_type != IFT_INFINIBAND) return; dev = VLAN_DEVAT(ifp, vtag); if (dev) VLAN_SETCOOKIE(dev, NULL); pkey = vtag | 0x8000; parent = ifp->if_softc; mutex_lock(&parent->vlan_mutex); list_for_each_entry(priv, &parent->child_intfs, list) { if (priv->pkey == pkey) { ipoib_dev_cleanup(priv); list_del(&priv->list); break; } } mutex_unlock(&parent->vlan_mutex); } eventhandler_tag ipoib_vlan_attach; eventhandler_tag ipoib_vlan_detach; static int __init ipoib_init_module(void) { int ret; ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE)); #ifdef CONFIG_INFINIBAND_IPOIB_CM ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); #endif ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST); ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST); /* * We create our own workqueue mainly because we want to be * able to flush it when devices are being removed. We can't * use schedule_work()/flush_scheduled_work() because both * unregister_netdev() and linkwatch_event take the rtnl lock, * so flush_scheduled_work() can deadlock during device * removal. 
*/ ipoib_workqueue = create_singlethread_workqueue("ipoib"); if (!ipoib_workqueue) { ret = -ENOMEM; goto err_fs; } ib_sa_register_client(&ipoib_sa_client); ret = ib_register_client(&ipoib_client); if (ret) goto err_sa; return 0; err_sa: ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); err_fs: return ret; } static void __exit ipoib_cleanup_module(void) { EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach); EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach); ib_unregister_client(&ipoib_client); ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); } /* * Infiniband output routine. */ static int ipoib_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_char edst[INFINIBAND_ALEN]; #if defined(INET) || defined(INET6) struct llentry *lle = NULL; #endif struct ipoib_header *eh; int error = 0, is_gw = 0; short type; + + NET_EPOCH_ASSERT(); if (ro != NULL) is_gw = (ro->ro_flags & RT_HAS_GW) != 0; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) goto bad; #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) { error = ENETDOWN; goto bad; } if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) { error = ENETDOWN; goto bad; } switch (dst->sa_family) { #ifdef INET case AF_INET: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, lle->ll_addr, sizeof(edst)); else if (m->m_flags & M_MCAST) ip_ib_mc_map(((struct sockaddr_in *)dst)->sin_addr.s_addr, ifp->if_broadcastaddr, edst); else error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); type = htons(ETHERTYPE_IP); break; case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_INFINIBAND); switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: type = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: type = htons(ETHERTYPE_ARP); break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, edst, INFINIBAND_ALEN); else bcopy(ar_tha(ah), edst, INFINIBAND_ALEN); } break; #endif #ifdef INET6 case AF_INET6: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, lle->ll_addr, sizeof(edst)); else if (m->m_flags & M_MCAST) ipv6_ib_mc_map(&((struct sockaddr_in6 *)dst)->sin6_addr, ifp->if_broadcastaddr, edst); else error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return error; type = htons(ETHERTYPE_IPV6); break; #endif default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); error = EAFNOSUPPORT; goto bad; } /* * Add local net header. If no space in first mbuf, * allocate another. */ M_PREPEND(m, IPOIB_HEADER_LEN, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } eh = mtod(m, struct ipoib_header *); (void)memcpy(&eh->proto, &type, sizeof(eh->proto)); (void)memcpy(&eh->hwaddr, edst, sizeof (edst)); /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ return ((ifp->if_transmit)(ifp, m)); bad: if (m != NULL) m_freem(m); return (error); } /* * Upper layer processing for a received Infiniband packet. */ void ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto) { struct epoch_tracker et; int isr; #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* Allow monitor mode to claim this frame, after stats are updated. 
*/ if (ifp->if_flags & IFF_MONITOR) { if_printf(ifp, "discard frame at IFF_MONITOR\n"); m_freem(m); return; } /* * Dispatch frame to upper layer. */ switch (proto) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif default: goto discard; } NET_EPOCH_ENTER(et); netisr_dispatch(isr, m); NET_EPOCH_EXIT(et); return; discard: m_freem(m); } /* * Process a received Infiniband packet. */ static void ipoib_input(struct ifnet *ifp, struct mbuf *m) { struct ipoib_header *eh; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } CURVNET_SET_QUIET(ifp->if_vnet); /* Let BPF have it before we strip the header. */ IPOIB_MTAP(ifp, m); eh = mtod(m, struct ipoib_header *); /* * Reset layer specific mbuf flags to avoid confusing upper layers. * Strip off Infiniband header. */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); m_adj(m, IPOIB_HEADER_LEN); if (IPOIB_IS_MULTICAST(eh->hwaddr)) { if (memcmp(eh->hwaddr, ifp->if_broadcastaddr, ifp->if_addrlen) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } ipoib_demux(ifp, m, ntohs(eh->proto)); CURVNET_RESTORE(); } static int ipoib_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!IPOIB_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ALEN; e_addr = LLADDR(sdl); ip_ib_mc_map(sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; /* * An IP6 address of 0 means listen to all * of the multicast address used for IP6. * This has no meaning in ipoib. */ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) return EADDRNOTAVAIL; if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ALEN; e_addr = LLADDR(sdl); ipv6_ib_mc_map(&sin6->sin6_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: return EAFNOSUPPORT; } } module_init(ipoib_init_module); module_exit(ipoib_cleanup_module); static int ipoib_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t ipoib_mod = { .name = "ipoib", .evhand = ipoib_evhand, }; DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_DEPEND(ipoib, ibcore, 1, 1, 1); MODULE_DEPEND(ipoib, linuxkpi, 1, 1, 1); Index: head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c =================================================================== --- head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c (revision 353545) +++ head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c (revision 353546) @@ -1,930 +1,934 @@ /*- * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0 * * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. 
* Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "ipoib.h" #include #include #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG static int mcast_debug_level = 1; module_param(mcast_debug_level, int, 0644); MODULE_PARM_DESC(mcast_debug_level, "Enable multicast debug tracing if > 0"); #endif static DEFINE_MUTEX(mcast_mutex); struct ipoib_mcast_iter { struct ipoib_dev_priv *priv; union ib_gid mgid; unsigned long created; unsigned int queuelen; unsigned int complete; unsigned int send_only; }; static void ipoib_mcast_free(struct ipoib_mcast *mcast) { struct ifnet *dev = mcast->priv->dev; int tx_dropped = 0; ipoib_dbg_mcast(mcast->priv, "deleting multicast group %16D\n", mcast->mcmember.mgid.raw, ":"); if (mcast->ah) ipoib_put_ah(mcast->ah); tx_dropped = mcast->pkt_queue.ifq_len; _IF_DRAIN(&mcast->pkt_queue); /* XXX Locking. */ if_inc_counter(dev, IFCOUNTER_OERRORS, tx_dropped); kfree(mcast); } static struct ipoib_mcast *ipoib_mcast_alloc(struct ipoib_dev_priv *priv, int can_sleep) { struct ipoib_mcast *mcast; mcast = kzalloc(sizeof *mcast, can_sleep ? 
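/* can_sleep selects a blocking allocation */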
GFP_KERNEL : GFP_ATOMIC); if (!mcast) return NULL; mcast->priv = priv; mcast->created = jiffies; mcast->backoff = 1; INIT_LIST_HEAD(&mcast->list); bzero(&mcast->pkt_queue, sizeof(mcast->pkt_queue)); return mcast; } static struct ipoib_mcast *__ipoib_mcast_find(struct ipoib_dev_priv *priv, void *mgid) { struct rb_node *n = priv->multicast_tree.rb_node; while (n) { struct ipoib_mcast *mcast; int ret; mcast = rb_entry(n, struct ipoib_mcast, rb_node); ret = memcmp(mgid, mcast->mcmember.mgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; else if (ret > 0) n = n->rb_right; else return mcast; } return NULL; } static int __ipoib_mcast_add(struct ipoib_dev_priv *priv, struct ipoib_mcast *mcast) { struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL; while (*n) { struct ipoib_mcast *tmcast; int ret; pn = *n; tmcast = rb_entry(pn, struct ipoib_mcast, rb_node); ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw, sizeof (union ib_gid)); if (ret < 0) n = &pn->rb_left; else if (ret > 0) n = &pn->rb_right; else return -EEXIST; } rb_link_node(&mcast->rb_node, pn, n); rb_insert_color(&mcast->rb_node, &priv->multicast_tree); return 0; } static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, struct ib_sa_mcmember_rec *mcmember) { struct ipoib_dev_priv *priv = mcast->priv; struct ifnet *dev = priv->dev; struct ipoib_ah *ah; + struct epoch_tracker et; int ret; int set_qkey = 0; mcast->mcmember = *mcmember; /* Set the cached Q_Key before we attach if it's the broadcast group */ if (!memcmp(mcast->mcmember.mgid.raw, dev->if_broadcastaddr + 4, sizeof (union ib_gid))) { spin_lock_irq(&priv->lock); if (!priv->broadcast) { spin_unlock_irq(&priv->lock); return -EAGAIN; } priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); spin_unlock_irq(&priv->lock); priv->tx_wr.remote_qkey = priv->qkey; set_qkey = 1; } if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { ipoib_warn(priv, "multicast group %16D already attached\n", mcast->mcmember.mgid.raw, ":"); return 0; } ret = ipoib_mcast_attach(priv, be16_to_cpu(mcast->mcmember.mlid), &mcast->mcmember.mgid, set_qkey); if (ret < 0) { ipoib_warn(priv, "couldn't attach QP to multicast group %16D\n", mcast->mcmember.mgid.raw, ":"); clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags); return ret; } } { struct ib_ah_attr av = { .dlid = be16_to_cpu(mcast->mcmember.mlid), .port_num = priv->port, .sl = mcast->mcmember.sl, .ah_flags = IB_AH_GRH, .static_rate = mcast->mcmember.rate, .grh = { .flow_label = be32_to_cpu(mcast->mcmember.flow_label), .hop_limit = mcast->mcmember.hop_limit, .sgid_index = 0, .traffic_class = mcast->mcmember.traffic_class } }; av.grh.dgid = mcast->mcmember.mgid; ah = ipoib_create_ah(priv, priv->pd, &av); if (!ah) { ipoib_warn(priv, "ib_address_create failed\n"); } else { spin_lock_irq(&priv->lock); mcast->ah = ah; spin_unlock_irq(&priv->lock); ipoib_dbg_mcast(priv, "MGID %16D AV %p, LID 0x%04x, SL %d\n", mcast->mcmember.mgid.raw, ":", mcast->ah->ah, be16_to_cpu(mcast->mcmember.mlid), mcast->mcmember.sl); } } + NET_EPOCH_ENTER(et); + /* actually send any queued packets */ while (mcast->pkt_queue.ifq_len) { struct mbuf *mb; _IF_DEQUEUE(&mcast->pkt_queue, mb); mb->m_pkthdr.rcvif = dev; if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n"); } + NET_EPOCH_EXIT(et); return 0; } static int ipoib_mcast_sendonly_join_complete(int status, struct ib_sa_multicast *multicast) { struct ipoib_mcast *mcast = 
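/* context pointer registered with ib_sa_join_multicast() */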
multicast->context; struct ipoib_dev_priv *priv = mcast->priv; /* We trap for port events ourselves. */ if (status == -ENETRESET) return 0; if (!status) status = ipoib_mcast_join_finish(mcast, &multicast->rec); if (status) { if (mcast->logcount++ < 20) ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n", mcast->mcmember.mgid.raw, ":", status); /* Flush out any queued packets */ if_inc_counter(priv->dev, IFCOUNTER_OERRORS, mcast->pkt_queue.ifq_len); _IF_DRAIN(&mcast->pkt_queue); /* Clear the busy flag so we try again */ status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); } return status; } static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) { struct ipoib_dev_priv *priv = mcast->priv; struct ib_sa_mcmember_rec rec = { #if 0 /* Some SMs don't support send-only yet */ .join_state = 4 #else .join_state = 1 #endif }; int ret = 0; if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n"); return -ENODEV; } if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n"); return -EBUSY; } rec.mgid = mcast->mcmember.mgid; rec.port_gid = priv->local_gid; rec.pkey = cpu_to_be16(priv->pkey); mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE, GFP_ATOMIC, ipoib_mcast_sendonly_join_complete, mcast); if (IS_ERR(mcast->mc)) { ret = PTR_ERR(mcast->mc); clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", ret); } else { ipoib_dbg_mcast(priv, "no multicast record for %16D, starting join\n", mcast->mcmember.mgid.raw, ":"); } return ret; } void ipoib_mcast_carrier_on_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, carrier_on_task); struct ib_port_attr attr; /* * Take rtnl_lock to avoid racing with ipoib_stop() and * turning the carrier back on while a device is being * removed. */ if (ib_query_port(priv->ca, priv->port, &attr) || attr.state != IB_PORT_ACTIVE) { ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); return; } if_link_state_change(priv->dev, LINK_STATE_UP); } static int ipoib_mcast_join_complete(int status, struct ib_sa_multicast *multicast) { struct ipoib_mcast *mcast = multicast->context; struct ipoib_dev_priv *priv = mcast->priv; ipoib_dbg_mcast(priv, "join completion for %16D (status %d)\n", mcast->mcmember.mgid.raw, ":", status); /* We trap for port events ourselves. */ if (status == -ENETRESET) return 0; if (!status) status = ipoib_mcast_join_finish(mcast, &multicast->rec); if (!status) { mcast->backoff = 1; mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); mutex_unlock(&mcast_mutex); /* * Defer carrier on work to ipoib_workqueue to avoid a * deadlock on rtnl_lock here. 
*/ if (mcast == priv->broadcast) queue_work(ipoib_workqueue, &priv->carrier_on_task); return 0; } if (mcast->logcount++ < 20) { if (status == -ETIMEDOUT || status == -EAGAIN) { ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n", mcast->mcmember.mgid.raw, ":", status); } else { ipoib_warn(priv, "multicast join failed for %16D, status %d\n", mcast->mcmember.mgid.raw, ":", status); } } mcast->backoff *= 2; if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; /* Clear the busy flag so we try again */ status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); mutex_lock(&mcast_mutex); spin_lock_irq(&priv->lock); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, mcast->backoff * HZ); spin_unlock_irq(&priv->lock); mutex_unlock(&mcast_mutex); return status; } static void ipoib_mcast_join(struct ipoib_dev_priv *priv, struct ipoib_mcast *mcast, int create) { struct ib_sa_mcmember_rec rec = { .join_state = 1 }; ib_sa_comp_mask comp_mask; int ret = 0; ipoib_dbg_mcast(priv, "joining MGID %16D\n", mcast->mcmember.mgid.raw, ":"); rec.mgid = mcast->mcmember.mgid; rec.port_gid = priv->local_gid; rec.pkey = cpu_to_be16(priv->pkey); comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE; if (create) { comp_mask |= IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_MTU_SELECTOR | IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS | IB_SA_MCMEMBER_REC_RATE_SELECTOR | IB_SA_MCMEMBER_REC_RATE | IB_SA_MCMEMBER_REC_SL | IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_HOP_LIMIT; rec.qkey = priv->broadcast->mcmember.qkey; rec.mtu_selector = IB_SA_EQ; rec.mtu = priv->broadcast->mcmember.mtu; rec.traffic_class = priv->broadcast->mcmember.traffic_class; rec.rate_selector = IB_SA_EQ; rec.rate = priv->broadcast->mcmember.rate; rec.sl = priv->broadcast->mcmember.sl; rec.flow_label = priv->broadcast->mcmember.flow_label; rec.hop_limit = priv->broadcast->mcmember.hop_limit; } set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, comp_mask, GFP_KERNEL, ipoib_mcast_join_complete, mcast); if (IS_ERR(mcast->mc)) { clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); ret = PTR_ERR(mcast->mc); ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); mcast->backoff *= 2; if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, mcast->backoff * HZ); mutex_unlock(&mcast_mutex); } } void ipoib_mcast_join_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, mcast_task.work); struct ifnet *dev = priv->dev; struct ib_port_attr attr; ipoib_dbg_mcast(priv, "Running join task. 
flags 0x%lX\n", priv->flags); if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) return; if (ib_query_port(priv->ca, priv->port, &attr) || attr.state != IB_PORT_ACTIVE) { ipoib_dbg(priv, "%s: port state is not ACTIVE (state = %d) suspend task.\n", __func__, attr.state); return; } if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL)) ipoib_warn(priv, "ib_query_gid() failed\n"); else memcpy(IF_LLADDR(dev) + 4, priv->local_gid.raw, sizeof (union ib_gid)); { struct ib_port_attr attr; if (!ib_query_port(priv->ca, priv->port, &attr)) priv->local_lid = attr.lid; else ipoib_warn(priv, "ib_query_port failed\n"); } if (!priv->broadcast) { struct ipoib_mcast *broadcast; if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) return; broadcast = ipoib_mcast_alloc(priv, 1); if (!broadcast) { ipoib_warn(priv, "failed to allocate broadcast group\n"); mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, HZ); mutex_unlock(&mcast_mutex); return; } spin_lock_irq(&priv->lock); memcpy(broadcast->mcmember.mgid.raw, dev->if_broadcastaddr + 4, sizeof (union ib_gid)); priv->broadcast = broadcast; __ipoib_mcast_add(priv, priv->broadcast); spin_unlock_irq(&priv->lock); } if (priv->broadcast && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { if (priv->broadcast && !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) ipoib_mcast_join(priv, priv->broadcast, 0); return; } while (1) { struct ipoib_mcast *mcast = NULL; spin_lock_irq(&priv->lock); list_for_each_entry(mcast, &priv->multicast_list, list) { if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { /* Found the next unjoined group */ break; } } spin_unlock_irq(&priv->lock); if (&mcast->list == &priv->multicast_list) { /* All done */ break; } ipoib_mcast_join(priv, mcast, 1); return; } spin_lock_irq(&priv->lock); if (priv->broadcast) priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); else priv->mcast_mtu = priv->admin_mtu; spin_unlock_irq(&priv->lock); if (!ipoib_cm_admin_enabled(priv)) ipoib_change_mtu(priv, min(priv->mcast_mtu, priv->admin_mtu), true); ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); clear_bit(IPOIB_MCAST_RUN, &priv->flags); } int ipoib_mcast_start_thread(struct ipoib_dev_priv *priv) { ipoib_dbg_mcast(priv, "starting multicast thread flags 0x%lX\n", priv->flags); mutex_lock(&mcast_mutex); if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); mutex_unlock(&mcast_mutex); return 0; } int ipoib_mcast_stop_thread(struct ipoib_dev_priv *priv, int flush) { ipoib_dbg_mcast(priv, "stopping multicast thread\n"); mutex_lock(&mcast_mutex); clear_bit(IPOIB_MCAST_RUN, &priv->flags); cancel_delayed_work(&priv->mcast_task); mutex_unlock(&mcast_mutex); if (flush) flush_workqueue(ipoib_workqueue); return 0; } static int ipoib_mcast_leave(struct ipoib_dev_priv *priv, struct ipoib_mcast *mcast) { int ret = 0; if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) ib_sa_free_multicast(mcast->mc); if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { ipoib_dbg_mcast(priv, "leaving MGID %16D\n", mcast->mcmember.mgid.raw, ":"); /* Remove ourselves from the multicast group */ ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid, be16_to_cpu(mcast->mcmember.mlid)); if (ret) ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", 
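/* detach failures are only logged; the caller frees the group regardless */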
ret); } return 0; } void ipoib_mcast_send(struct ipoib_dev_priv *priv, void *mgid, struct mbuf *mb) { struct ifnet *dev = priv->dev; struct ipoib_mcast *mcast; if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags) || !priv->broadcast || !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { if_inc_counter(dev, IFCOUNTER_OERRORS, 1); m_freem(mb); return; } mcast = __ipoib_mcast_find(priv, mgid); if (!mcast) { /* Let's create a new send only group now */ ipoib_dbg_mcast(priv, "setting up send only multicast group for %16D\n", mgid, ":"); mcast = ipoib_mcast_alloc(priv, 0); if (!mcast) { ipoib_warn(priv, "unable to allocate memory for " "multicast structure\n"); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); m_freem(mb); goto out; } set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); __ipoib_mcast_add(priv, mcast); list_add_tail(&mcast->list, &priv->multicast_list); } if (!mcast->ah) { if (mcast->pkt_queue.ifq_len < IPOIB_MAX_MCAST_QUEUE) { _IF_ENQUEUE(&mcast->pkt_queue, mb); } else { if_inc_counter(dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) ipoib_dbg_mcast(priv, "no address vector, " "but multicast join already started\n"); else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) ipoib_mcast_sendonly_join(mcast); /* * If lookup completes between here and out:, don't * want to send packet twice. */ mcast = NULL; } out: if (mcast && mcast->ah) ipoib_send(priv, mb, mcast->ah, IB_MULTICAST_QPN); } void ipoib_mcast_dev_flush(struct ipoib_dev_priv *priv) { LIST_HEAD(remove_list); struct ipoib_mcast *mcast, *tmcast; unsigned long flags; ipoib_dbg_mcast(priv, "flushing multicast list\n"); spin_lock_irqsave(&priv->lock, flags); list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { list_del(&mcast->list); rb_erase(&mcast->rb_node, &priv->multicast_tree); list_add_tail(&mcast->list, &remove_list); } if (priv->broadcast) { rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree); list_add_tail(&priv->broadcast->list, &remove_list); priv->broadcast = NULL; } spin_unlock_irqrestore(&priv->lock, flags); list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(priv, mcast); ipoib_mcast_free(mcast); } } static int ipoib_mcast_addr_is_valid(const u8 *addr, unsigned int addrlen, const u8 *broadcast) { if (addrlen != INFINIBAND_ALEN) return 0; /* reserved QPN, prefix, scope */ if (memcmp(addr, broadcast, 6)) return 0; /* signature lower, pkey */ if (memcmp(addr + 7, broadcast + 7, 3)) return 0; return 1; } void ipoib_mcast_restart_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, restart_task); ipoib_mcast_restart(priv); } struct ipoib_mcast_ctx { struct ipoib_dev_priv *priv; struct list_head remove_list; }; static u_int ipoib_process_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt) { struct ipoib_mcast_ctx *ctx = arg; struct ipoib_dev_priv *priv = ctx->priv; struct ipoib_mcast *mcast; struct ib_sa_mcmember_rec rec; union ib_gid mgid; uint8_t *addr; int addrlen; addr = LLADDR(sdl); addrlen = sdl->sdl_alen; if (!ipoib_mcast_addr_is_valid(addr, addrlen, priv->dev->if_broadcastaddr)) return (0); memcpy(mgid.raw, addr + 4, sizeof mgid); mcast = __ipoib_mcast_find(priv, &mgid); if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { struct ipoib_mcast *nmcast; /* ignore group which is directly joined by userspace */ if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) && 
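/* a cached SA record means the group was joined directly from userspace */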
!ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) {
			ipoib_dbg_mcast(priv,
			    "ignoring multicast entry for mgid %16D\n",
			    mgid.raw, ":");
			return (0);
		}

		/* Not found or send-only group, let's add a new entry */
		ipoib_dbg_mcast(priv, "adding multicast entry for mgid %16D\n",
		    mgid.raw, ":");

		nmcast = ipoib_mcast_alloc(priv, 0);
		if (!nmcast) {
			ipoib_warn(priv,
			    "unable to allocate memory for multicast structure\n");
			return (0);
		}

		set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags);

		nmcast->mcmember.mgid = mgid;

		if (mcast) {
			/* Destroy the send only entry */
			list_move_tail(&mcast->list, &ctx->remove_list);

			rb_replace_node(&mcast->rb_node,
			    &nmcast->rb_node, &priv->multicast_tree);
		} else
			__ipoib_mcast_add(priv, nmcast);

		list_add_tail(&nmcast->list, &priv->multicast_list);
	}

	if (mcast)
		set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);

	return (1);
}

void
ipoib_mcast_restart(struct ipoib_dev_priv *priv)
{
	struct ipoib_mcast_ctx ctx = { priv,
	    { &ctx.remove_list, &ctx.remove_list }};
	struct ifnet *dev = priv->dev;
	struct ipoib_mcast *mcast, *tmcast;

	ipoib_dbg_mcast(priv, "restarting multicast task flags 0x%lX\n",
	    priv->flags);

	ipoib_mcast_stop_thread(priv, 0);

	spin_lock(&priv->lock);

	/*
	 * Unfortunately, the networking core only gives us a list of all of
	 * the multicast hardware addresses.  We need to figure out which ones
	 * are new and which ones have been removed.
	 */

	/* Clear out the found flag */
	list_for_each_entry(mcast, &priv->multicast_list, list)
		clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);

	/* Mark the entries that are found and add any that don't exist yet */
	ctx.priv = priv;
	if_foreach_llmaddr(dev, ipoib_process_maddr, &ctx);

	/* Remove all of the entries that don't exist anymore */
	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
		if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) &&
		    !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
			ipoib_dbg_mcast(priv, "deleting multicast group %16D\n",
			    mcast->mcmember.mgid.raw, ":");

			rb_erase(&mcast->rb_node, &priv->multicast_tree);

			/* Move to the remove list */
			list_move_tail(&mcast->list, &ctx.remove_list);
		}
	}

	spin_unlock(&priv->lock);

	/* We have to cancel outside of the spinlock */
	list_for_each_entry_safe(mcast, tmcast, &ctx.remove_list, list) {
		ipoib_mcast_leave(mcast->priv, mcast);
		ipoib_mcast_free(mcast);
	}

	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
		ipoib_mcast_start_thread(priv);
}

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG

struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct ipoib_dev_priv *priv)
{
	struct ipoib_mcast_iter *iter;

	iter = kmalloc(sizeof *iter, GFP_KERNEL);
	if (!iter)
		return NULL;

	iter->priv = priv;
	memset(iter->mgid.raw, 0, 16);

	if (ipoib_mcast_iter_next(iter)) {
		kfree(iter);
		return NULL;
	}

	return iter;
}

int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter)
{
	struct ipoib_dev_priv *priv = iter->priv;
	struct rb_node *n;
	struct ipoib_mcast *mcast;
	int ret = 1;

	spin_lock_irq(&priv->lock);

	n = rb_first(&priv->multicast_tree);

	while (n) {
		mcast = rb_entry(n, struct ipoib_mcast, rb_node);

		if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw,
			   sizeof (union ib_gid)) < 0) {
			iter->mgid = mcast->mcmember.mgid;
			iter->created = mcast->created;
			iter->queuelen = mcast->pkt_queue.ifq_len;
			iter->complete = !!mcast->ah;
			iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY));

			ret = 0;

			break;
		}

		n = rb_next(n);
	}

	spin_unlock_irq(&priv->lock);

	return ret;
}

void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
			   union ib_gid *mgid,
			   unsigned long *created,
			   unsigned int *queuelen,
			   unsigned int *complete,
			   unsigned int 
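/* copies one iterator snapshot out to the caller's buffers */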
*send_only) { *mgid = iter->mgid; *created = iter->created; *queuelen = iter->queuelen; *complete = iter->complete; *send_only = iter->send_only; } #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
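/*
 * Editor's illustration (not part of the driver): the multicast tree
 * above orders groups by raw memcmp() over the 16-byte MGID, as seen in
 * __ipoib_mcast_find() and __ipoib_mcast_add().  The standalone
 * userspace sketch below shows the same total order with qsort() and
 * bsearch() in place of the kernel rb-tree; demo_gid and demo_gid_cmp
 * are invented names.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct demo_gid {
	unsigned char raw[16];	/* stands in for union ib_gid */
};

static int
demo_gid_cmp(const void *a, const void *b)
{
	/* Same ordering the driver uses: memcmp over the whole MGID. */
	return (memcmp(a, b, sizeof(struct demo_gid)));
}

int
main(void)
{
	/* Trailing bytes are zero-filled by the partial initializers. */
	struct demo_gid tbl[] = {
		{ .raw = { 0xff, 0x12, 0x60, 0x1b } },
		{ .raw = { 0xff, 0x12, 0x40, 0x1a } },
		{ .raw = { 0xff, 0x12, 0x40, 0x1b } },
	};
	struct demo_gid key = { .raw = { 0xff, 0x12, 0x40, 0x1b } };
	struct demo_gid *hit;

	qsort(tbl, sizeof(tbl) / sizeof(tbl[0]), sizeof(tbl[0]),
	    demo_gid_cmp);
	hit = bsearch(&key, tbl, sizeof(tbl) / sizeof(tbl[0]),
	    sizeof(tbl[0]), demo_gid_cmp);
	printf("MGID %s\n", hit != NULL ? "found" : "not found");
	return (0);
}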