Index: head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h =================================================================== --- head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h (revision 294609) +++ head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h (revision 294610) @@ -1,177 +1,178 @@ /************************************************************************** Copyright (c) 2007, 2008 Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. $FreeBSD$ ***************************************************************************/ #ifndef __IWCH_H__ #define __IWCH_H__ struct iwch_pd; struct iwch_cq; struct iwch_qp; struct iwch_mr; enum t3ctype { T3A = 0, T3B, T3C }; #define PAGE_MASK_IWARP (~(PAGE_SIZE-1)) struct iwch_rnic_attributes { u32 vendor_id; u32 vendor_part_id; u32 max_qps; u32 max_wrs; /* Max for any SQ/RQ */ u32 max_sge_per_wr; u32 max_sge_per_rdma_write_wr; /* for RDMA Write WR */ u32 max_cqs; u32 max_cqes_per_cq; u32 max_mem_regs; u32 max_phys_buf_entries; /* for phys buf list */ u32 max_pds; /* * The memory page sizes supported by this RNIC. * Bit position i in bitmap indicates page of * size (4k)^i. Phys block list mode unsupported. */ u32 mem_pgsizes_bitmask; u64 max_mr_size; u8 can_resize_wq; /* * The maximum number of RDMA Reads that can be outstanding * per QP with this RNIC as the target. */ u32 max_rdma_reads_per_qp; /* * The maximum number of resources used for RDMA Reads * by this RNIC with this RNIC as the target. */ u32 max_rdma_read_resources; /* * The max depth per QP for initiation of RDMA Read * by this RNIC. */ u32 max_rdma_read_qp_depth; /* * The maximum depth for initiation of RDMA Read * operations by this RNIC on all QPs */ u32 max_rdma_read_depth; u8 rq_overflow_handled; u32 can_modify_ird; u32 can_modify_ord; u32 max_mem_windows; u32 stag0_value; u8 zbva_support; u8 local_invalidate_fence; u32 cq_overflow_detection; }; struct iwch_dev { struct ib_device ibdev; struct cxio_rdev rdev; u32 device_cap_flags; struct iwch_rnic_attributes attr; struct idr cqidr; struct idr qpidr; struct idr mmidr; struct mtx lock; TAILQ_ENTRY(iwch_dev) entry; }; #ifndef container_of #define container_of(p, stype, field) ((stype *)(((uint8_t *)(p)) - offsetof(stype, field))) #endif static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev) { return container_of(ibdev, struct iwch_dev, ibdev); } static inline int t3b_device(const struct iwch_dev *rhp __unused) { return (0); } static inline int t3a_device(const struct iwch_dev *rhp __unused) { return (0); } static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid) { return idr_find(&rhp->cqidr, cqid); } static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid) { return idr_find(&rhp->qpidr, qpid); } static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid) { return idr_find(&rhp->mmidr, mmid); } static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr, void *handle, u32 id) { int ret; u32 newid; do { if (!idr_pre_get(idr, GFP_KERNEL)) { return -ENOMEM; } mtx_lock(&rhp->lock); ret = idr_get_new_above(idr, handle, id, &newid); WARN_ON(ret != 0); WARN_ON(!ret && newid != id); mtx_unlock(&rhp->lock); } while (ret == -EAGAIN); return ret; } static inline void remove_handle(struct iwch_dev *rhp, struct idr *idr, u32 id) { mtx_lock(&rhp->lock); idr_remove(idr, id); mtx_unlock(&rhp->lock); } void iwch_ev_dispatch(struct iwch_dev *, struct mbuf *); +void process_newconn(struct iw_cm_id *parent_cm_id, struct socket *child_so); #endif Index: head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c =================================================================== --- head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c (revision 294609) +++ head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c (revision 294610) @@ -1,1719 +1,1686 @@ /************************************************************************** Copyright (c) 2007, Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ***************************************************************************/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTR static char *states[] = { "idle", "listen", "connecting", "mpa_wait_req", "mpa_req_sent", "mpa_req_rcvd", "mpa_rep_sent", "fpdu_mode", "aborting", "closing", "moribund", "dead", NULL, }; #endif SYSCTL_NODE(_hw, OID_AUTO, iw_cxgb, CTLFLAG_RD, 0, "iw_cxgb driver parameters"); static int ep_timeout_secs = 60; SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, ep_timeout_secs, CTLFLAG_RWTUN, &ep_timeout_secs, 0, "CM Endpoint operation timeout in seconds (default=60)"); static int mpa_rev = 1; SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, mpa_rev, CTLFLAG_RWTUN, &mpa_rev, 0, "MPA Revision, 0 supports amso1100, 1 is spec compliant. (default=1)"); static int markers_enabled = 0; SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, markers_enabled, CTLFLAG_RWTUN, &markers_enabled, 0, "Enable MPA MARKERS (default(0)=disabled)"); static int crc_enabled = 1; SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, crc_enabled, CTLFLAG_RWTUN, &crc_enabled, 0, "Enable MPA CRC (default(1)=enabled)"); static int rcv_win = 256 * 1024; SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, rcv_win, CTLFLAG_RWTUN, &rcv_win, 0, "TCP receive window in bytes (default=256KB)"); static int snd_win = 32 * 1024; SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, snd_win, CTLFLAG_RWTUN, &snd_win, 0, "TCP send window in bytes (default=32KB)"); static unsigned int nocong = 0; SYSCTL_UINT(_hw_iw_cxgb, OID_AUTO, nocong, CTLFLAG_RWTUN, &nocong, 0, "Turn off congestion control (default=0)"); static unsigned int cong_flavor = 1; SYSCTL_UINT(_hw_iw_cxgb, OID_AUTO, cong_flavor, CTLFLAG_RWTUN, &cong_flavor, 0, "TCP Congestion control flavor (default=1)"); static void ep_timeout(void *arg); static void connect_reply_upcall(struct iwch_ep *ep, int status); static int iwch_so_upcall(struct socket *so, void *arg, int waitflag); /* * Cruft to offload socket upcalls onto thread. */ static struct mtx req_lock; static TAILQ_HEAD(iwch_ep_list, iwch_ep_common) req_list; static struct task iw_cxgb_task; static struct taskqueue *iw_cxgb_taskq; static void process_req(void *ctx, int pending); static void start_ep_timer(struct iwch_ep *ep) { CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); if (callout_pending(&ep->timer)) { CTR2(KTR_IW_CXGB, "%s stopped / restarted timer ep %p", __FUNCTION__, ep); callout_deactivate(&ep->timer); callout_drain(&ep->timer); } else { /* * XXX this looks racy */ get_ep(&ep->com); callout_init(&ep->timer, 1); } callout_reset(&ep->timer, ep_timeout_secs * hz, ep_timeout, ep); } static void stop_ep_timer(struct iwch_ep *ep) { CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); if (!callout_pending(&ep->timer)) { CTR3(KTR_IW_CXGB, "%s timer stopped when its not running! ep %p state %u\n", __func__, ep, ep->com.state); return; } callout_drain(&ep->timer); put_ep(&ep->com); } static int set_tcpinfo(struct iwch_ep *ep) { struct socket *so = ep->com.so; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; struct toepcb *toep; int rc = 0; INP_WLOCK(inp); tp = intotcpcb(inp); if ((tp->t_flags & TF_TOE) == 0) { rc = EINVAL; printf("%s: connection NOT OFFLOADED!\n", __func__); goto done; } toep = tp->t_toe; ep->hwtid = toep->tp_tid; ep->snd_seq = tp->snd_nxt; ep->rcv_seq = tp->rcv_nxt; ep->emss = tp->t_maxseg; if (ep->emss < 128) ep->emss = 128; done: INP_WUNLOCK(inp); return (rc); } static enum iwch_ep_state state_read(struct iwch_ep_common *epc) { enum iwch_ep_state state; mtx_lock(&epc->lock); state = epc->state; mtx_unlock(&epc->lock); return state; } static void __state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) { epc->state = new; } static void state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) { mtx_lock(&epc->lock); CTR3(KTR_IW_CXGB, "%s - %s -> %s", __FUNCTION__, states[epc->state], states[new]); __state_set(epc, new); mtx_unlock(&epc->lock); return; } static void * alloc_ep(int size, int flags) { struct iwch_ep_common *epc; epc = malloc(size, M_DEVBUF, flags); if (epc) { memset(epc, 0, size); refcount_init(&epc->refcount, 1); mtx_init(&epc->lock, "iwch_epc lock", NULL, MTX_DEF|MTX_DUPOK); cv_init(&epc->waitq, "iwch_epc cv"); } CTR2(KTR_IW_CXGB, "%s alloc ep %p", __FUNCTION__, epc); return epc; } void __free_ep(struct iwch_ep_common *epc) { CTR3(KTR_IW_CXGB, "%s ep %p state %s", __FUNCTION__, epc, states[state_read(epc)]); - KASSERT(!epc->so, ("%s warning ep->so %p \n", __FUNCTION__, epc->so)); KASSERT(!epc->entry.tqe_prev, ("%s epc %p still on req list!\n", __FUNCTION__, epc)); free(epc, M_DEVBUF); } static int find_route(__be32 local_ip, __be32 peer_ip, __be16 local_port, __be16 peer_port, u8 tos, struct nhop4_extended *pnh4) { struct in_addr addr; addr.s_addr = peer_ip; return (fib4_lookup_nh_ext(RT_DEFAULT_FIB, addr, NHR_REF, 0, pnh4)); } static void close_socket(struct iwch_ep_common *epc, int close) { CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, epc, epc->so, states[epc->state]); SOCK_LOCK(epc->so); soupcall_clear(epc->so, SO_RCV); SOCK_UNLOCK(epc->so); if (close) soclose(epc->so); else soshutdown(epc->so, SHUT_WR|SHUT_RD); epc->so = NULL; } static void shutdown_socket(struct iwch_ep_common *epc) { CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, epc, epc->so, states[epc->state]); soshutdown(epc->so, SHUT_WR); } static void abort_socket(struct iwch_ep *ep) { struct sockopt sopt; int err; struct linger l; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); l.l_onoff = 1; l.l_linger = 0; /* linger_time of 0 forces RST to be sent */ sopt.sopt_dir = SOPT_SET; sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_LINGER; sopt.sopt_val = (caddr_t)&l; sopt.sopt_valsize = sizeof l; sopt.sopt_td = NULL; err = sosetopt(ep->com.so, &sopt); if (err) printf("%s can't set linger to 0, no RST! err %d\n", __FUNCTION__, err); } static void send_mpa_req(struct iwch_ep *ep) { int mpalen; struct mpa_message *mpa; struct mbuf *m; int err; CTR3(KTR_IW_CXGB, "%s ep %p pd_len %d", __FUNCTION__, ep, ep->plen); mpalen = sizeof(*mpa) + ep->plen; m = m_gethdr(mpalen, M_NOWAIT); if (m == NULL) { connect_reply_upcall(ep, -ENOMEM); return; } mpa = mtod(m, struct mpa_message *); m->m_len = mpalen; m->m_pkthdr.len = mpalen; memset(mpa, 0, sizeof(*mpa)); memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); mpa->flags = (crc_enabled ? MPA_CRC : 0) | (markers_enabled ? MPA_MARKERS : 0); mpa->private_data_size = htons(ep->plen); mpa->revision = mpa_rev; if (ep->plen) memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen); err = sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); if (err) { m_freem(m); connect_reply_upcall(ep, -ENOMEM); return; } start_ep_timer(ep); state_set(&ep->com, MPA_REQ_SENT); return; } static int send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen) { int mpalen; struct mpa_message *mpa; struct mbuf *m; int err; CTR3(KTR_IW_CXGB, "%s ep %p plen %d", __FUNCTION__, ep, plen); mpalen = sizeof(*mpa) + plen; m = m_gethdr(mpalen, M_NOWAIT); if (m == NULL) { printf("%s - cannot alloc mbuf!\n", __FUNCTION__); return (-ENOMEM); } mpa = mtod(m, struct mpa_message *); m->m_len = mpalen; m->m_pkthdr.len = mpalen; memset(mpa, 0, sizeof(*mpa)); memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); mpa->flags = MPA_REJECT; mpa->revision = mpa_rev; mpa->private_data_size = htons(plen); if (plen) memcpy(mpa->private_data, pdata, plen); err = sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); PANIC_IF(err); return 0; } static int send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen) { int mpalen; struct mpa_message *mpa; struct mbuf *m; CTR4(KTR_IW_CXGB, "%s ep %p so %p plen %d", __FUNCTION__, ep, ep->com.so, plen); mpalen = sizeof(*mpa) + plen; m = m_gethdr(mpalen, M_NOWAIT); if (m == NULL) { printf("%s - cannot alloc mbuf!\n", __FUNCTION__); return (-ENOMEM); } mpa = mtod(m, struct mpa_message *); m->m_len = mpalen; m->m_pkthdr.len = mpalen; memset(mpa, 0, sizeof(*mpa)); memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) | (markers_enabled ? MPA_MARKERS : 0); mpa->revision = mpa_rev; mpa->private_data_size = htons(plen); if (plen) memcpy(mpa->private_data, pdata, plen); state_set(&ep->com, MPA_REP_SENT); return sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); } static void close_complete_upcall(struct iwch_ep *ep) { struct iw_cm_event event; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CLOSE; if (ep->com.cm_id) { CTR3(KTR_IW_CXGB, "close complete delivered ep %p cm_id %p tid %d", ep, ep->com.cm_id, ep->hwtid); ep->com.cm_id->event_handler(ep->com.cm_id, &event); ep->com.cm_id->rem_ref(ep->com.cm_id); ep->com.cm_id = NULL; ep->com.qp = NULL; } } static void abort_connection(struct iwch_ep *ep) { CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); state_set(&ep->com, ABORTING); abort_socket(ep); close_socket(&ep->com, 0); close_complete_upcall(ep); state_set(&ep->com, DEAD); put_ep(&ep->com); } static void peer_close_upcall(struct iwch_ep *ep) { struct iw_cm_event event; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_DISCONNECT; if (ep->com.cm_id) { CTR3(KTR_IW_CXGB, "peer close delivered ep %p cm_id %p tid %d", ep, ep->com.cm_id, ep->hwtid); ep->com.cm_id->event_handler(ep->com.cm_id, &event); } } static void peer_abort_upcall(struct iwch_ep *ep) { struct iw_cm_event event; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CLOSE; event.status = ECONNRESET; if (ep->com.cm_id) { CTR3(KTR_IW_CXGB, "abort delivered ep %p cm_id %p tid %d", ep, ep->com.cm_id, ep->hwtid); ep->com.cm_id->event_handler(ep->com.cm_id, &event); ep->com.cm_id->rem_ref(ep->com.cm_id); ep->com.cm_id = NULL; ep->com.qp = NULL; } } static void connect_reply_upcall(struct iwch_ep *ep, int status) { struct iw_cm_event event; CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s status %d", __FUNCTION__, ep, ep->com.so, states[ep->com.state], status); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CONNECT_REPLY; event.status = status; event.local_addr = ep->com.local_addr; event.remote_addr = ep->com.remote_addr; if ((status == 0) || (status == ECONNREFUSED)) { event.private_data_len = ep->plen; event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); } if (ep->com.cm_id) { CTR4(KTR_IW_CXGB, "%s ep %p tid %d status %d", __FUNCTION__, ep, ep->hwtid, status); ep->com.cm_id->event_handler(ep->com.cm_id, &event); } if (status < 0) { ep->com.cm_id->rem_ref(ep->com.cm_id); ep->com.cm_id = NULL; ep->com.qp = NULL; } } static void connect_request_upcall(struct iwch_ep *ep) { struct iw_cm_event event; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CONNECT_REQUEST; event.local_addr = ep->com.local_addr; event.remote_addr = ep->com.remote_addr; event.private_data_len = ep->plen; event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); event.provider_data = ep; event.so = ep->com.so; if (state_read(&ep->parent_ep->com) != DEAD) { get_ep(&ep->com); ep->parent_ep->com.cm_id->event_handler( ep->parent_ep->com.cm_id, &event); } put_ep(&ep->parent_ep->com); } static void established_upcall(struct iwch_ep *ep) { struct iw_cm_event event; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_ESTABLISHED; if (ep->com.cm_id) { CTR3(KTR_IW_CXGB, "%s ep %p tid %d", __FUNCTION__, ep, ep->hwtid); ep->com.cm_id->event_handler(ep->com.cm_id, &event); } } static void process_mpa_reply(struct iwch_ep *ep) { struct mpa_message *mpa; u16 plen; struct iwch_qp_attributes attrs; enum iwch_qp_attr_mask mask; int err; struct mbuf *top, *m; int flags = MSG_DONTWAIT; struct uio uio; int len; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); /* * Stop mpa timer. If it expired, then the state has * changed and we bail since ep_timeout already aborted * the connection. */ stop_ep_timer(ep); if (state_read(&ep->com) != MPA_REQ_SENT) return; uio.uio_resid = len = 1000000; uio.uio_td = ep->com.thread; err = soreceive(ep->com.so, NULL, &uio, &top, NULL, &flags); if (err) { if (err == EWOULDBLOCK) { start_ep_timer(ep); return; } err = -err; goto err; } if (ep->com.so->so_rcv.sb_mb) { printf("%s data after soreceive called! so %p sb_mb %p top %p\n", __FUNCTION__, ep->com.so, ep->com.so->so_rcv.sb_mb, top); } m = top; do { /* * If we get more than the supported amount of private data * then we must fail this connection. */ if (ep->mpa_pkt_len + m->m_len > sizeof(ep->mpa_pkt)) { err = (-EINVAL); goto err; } /* * copy the new data into our accumulation buffer. */ m_copydata(m, 0, m->m_len, &(ep->mpa_pkt[ep->mpa_pkt_len])); ep->mpa_pkt_len += m->m_len; if (!m->m_next) m = m->m_nextpkt; else m = m->m_next; } while (m); m_freem(top); /* * if we don't even have the mpa message, then bail. */ if (ep->mpa_pkt_len < sizeof(*mpa)) return; mpa = (struct mpa_message *)ep->mpa_pkt; /* Validate MPA header. */ if (mpa->revision != mpa_rev) { CTR2(KTR_IW_CXGB, "%s bad mpa rev %d", __FUNCTION__, mpa->revision); err = EPROTO; goto err; } if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) { CTR2(KTR_IW_CXGB, "%s bad mpa key |%16s|", __FUNCTION__, mpa->key); err = EPROTO; goto err; } plen = ntohs(mpa->private_data_size); /* * Fail if there's too much private data. */ if (plen > MPA_MAX_PRIVATE_DATA) { CTR2(KTR_IW_CXGB, "%s plen too big %d", __FUNCTION__, plen); err = EPROTO; goto err; } /* * If plen does not account for pkt size */ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { CTR2(KTR_IW_CXGB, "%s pkt too big %d", __FUNCTION__, ep->mpa_pkt_len); err = EPROTO; goto err; } ep->plen = (u8) plen; /* * If we don't have all the pdata yet, then bail. * We'll continue process when more data arrives. */ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) return; if (mpa->flags & MPA_REJECT) { err = ECONNREFUSED; goto err; } /* * If we get here we have accumulated the entire mpa * start reply message including private data. And * the MPA header is valid. */ CTR1(KTR_IW_CXGB, "%s mpa rpl looks good!", __FUNCTION__); state_set(&ep->com, FPDU_MODE); ep->mpa_attr.initiator = 1; ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; ep->mpa_attr.version = mpa_rev; if (set_tcpinfo(ep)) { printf("%s set_tcpinfo error\n", __FUNCTION__); goto err; } CTR5(KTR_IW_CXGB, "%s - crc_enabled=%d, recv_marker_enabled=%d, " "xmit_marker_enabled=%d, version=%d", __FUNCTION__, ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); attrs.mpa_attr = ep->mpa_attr; attrs.max_ird = ep->ird; attrs.max_ord = ep->ord; attrs.llp_stream_handle = ep; attrs.next_state = IWCH_QP_STATE_RTS; mask = IWCH_QP_ATTR_NEXT_STATE | IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR | IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD; /* bind QP and TID with INIT_WR */ err = iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, mask, &attrs, 1); if (!err) goto out; err: abort_connection(ep); out: connect_reply_upcall(ep, err); return; } static void process_mpa_request(struct iwch_ep *ep) { struct mpa_message *mpa; u16 plen; int flags = MSG_DONTWAIT; struct mbuf *top, *m; int err; struct uio uio; int len; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); /* * Stop mpa timer. If it expired, then the state has * changed and we bail since ep_timeout already aborted * the connection. */ stop_ep_timer(ep); if (state_read(&ep->com) != MPA_REQ_WAIT) return; uio.uio_resid = len = 1000000; uio.uio_td = ep->com.thread; err = soreceive(ep->com.so, NULL, &uio, &top, NULL, &flags); if (err) { if (err == EWOULDBLOCK) { start_ep_timer(ep); return; } err = -err; goto err; } m = top; do { /* * If we get more than the supported amount of private data * then we must fail this connection. */ if (ep->mpa_pkt_len + m->m_len > sizeof(ep->mpa_pkt)) { CTR2(KTR_IW_CXGB, "%s mpa message too big %d", __FUNCTION__, ep->mpa_pkt_len + m->m_len); goto err; } /* * Copy the new data into our accumulation buffer. */ m_copydata(m, 0, m->m_len, &(ep->mpa_pkt[ep->mpa_pkt_len])); ep->mpa_pkt_len += m->m_len; if (!m->m_next) m = m->m_nextpkt; else m = m->m_next; } while (m); m_freem(top); /* * If we don't even have the mpa message, then bail. * We'll continue process when more data arrives. */ if (ep->mpa_pkt_len < sizeof(*mpa)) { start_ep_timer(ep); CTR2(KTR_IW_CXGB, "%s not enough header %d...waiting...", __FUNCTION__, ep->mpa_pkt_len); return; } mpa = (struct mpa_message *) ep->mpa_pkt; /* * Validate MPA Header. */ if (mpa->revision != mpa_rev) { CTR2(KTR_IW_CXGB, "%s bad mpa rev %d", __FUNCTION__, mpa->revision); goto err; } if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { CTR2(KTR_IW_CXGB, "%s bad mpa key |%16s|", __FUNCTION__, mpa->key); goto err; } plen = ntohs(mpa->private_data_size); /* * Fail if there's too much private data. */ if (plen > MPA_MAX_PRIVATE_DATA) { CTR2(KTR_IW_CXGB, "%s plen too big %d", __FUNCTION__, plen); goto err; } /* * If plen does not account for pkt size */ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { CTR2(KTR_IW_CXGB, "%s more data after private data %d", __FUNCTION__, ep->mpa_pkt_len); goto err; } ep->plen = (u8) plen; /* * If we don't have all the pdata yet, then bail. */ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) { start_ep_timer(ep); CTR2(KTR_IW_CXGB, "%s more mpa msg to come %d", __FUNCTION__, ep->mpa_pkt_len); return; } /* * If we get here we have accumulated the entire mpa * start reply message including private data. */ ep->mpa_attr.initiator = 0; ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; ep->mpa_attr.version = mpa_rev; if (set_tcpinfo(ep)) { printf("%s set_tcpinfo error\n", __FUNCTION__); goto err; } CTR5(KTR_IW_CXGB, "%s - crc_enabled=%d, recv_marker_enabled=%d, " "xmit_marker_enabled=%d, version=%d", __FUNCTION__, ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); state_set(&ep->com, MPA_REQ_RCVD); /* drive upcall */ connect_request_upcall(ep); return; err: abort_connection(ep); return; } static void process_peer_close(struct iwch_ep *ep) { struct iwch_qp_attributes attrs; int disconnect = 1; int release = 0; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); mtx_lock(&ep->com.lock); switch (ep->com.state) { case MPA_REQ_WAIT: __state_set(&ep->com, CLOSING); break; case MPA_REQ_SENT: __state_set(&ep->com, CLOSING); connect_reply_upcall(ep, -ECONNRESET); break; case MPA_REQ_RCVD: /* * We're gonna mark this puppy DEAD, but keep * the reference on it until the ULP accepts or * rejects the CR. */ __state_set(&ep->com, CLOSING); break; case MPA_REP_SENT: __state_set(&ep->com, CLOSING); break; case FPDU_MODE: start_ep_timer(ep); __state_set(&ep->com, CLOSING); attrs.next_state = IWCH_QP_STATE_CLOSING; iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); peer_close_upcall(ep); break; case ABORTING: disconnect = 0; break; case CLOSING: __state_set(&ep->com, MORIBUND); disconnect = 0; break; case MORIBUND: stop_ep_timer(ep); if (ep->com.cm_id && ep->com.qp) { attrs.next_state = IWCH_QP_STATE_IDLE; iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); } close_socket(&ep->com, 0); close_complete_upcall(ep); __state_set(&ep->com, DEAD); release = 1; disconnect = 0; break; case DEAD: disconnect = 0; break; default: PANIC_IF(1); } mtx_unlock(&ep->com.lock); if (disconnect) iwch_ep_disconnect(ep, 0, M_NOWAIT); if (release) put_ep(&ep->com); return; } static void process_conn_error(struct iwch_ep *ep) { struct iwch_qp_attributes attrs; int ret; mtx_lock(&ep->com.lock); CTR3(KTR_IW_CXGB, "%s ep %p state %u", __func__, ep, ep->com.state); switch (ep->com.state) { case MPA_REQ_WAIT: stop_ep_timer(ep); break; case MPA_REQ_SENT: stop_ep_timer(ep); connect_reply_upcall(ep, -ECONNRESET); break; case MPA_REP_SENT: ep->com.rpl_err = ECONNRESET; CTR1(KTR_IW_CXGB, "waking up ep %p", ep); break; case MPA_REQ_RCVD: /* * We're gonna mark this puppy DEAD, but keep * the reference on it until the ULP accepts or * rejects the CR. */ break; case MORIBUND: case CLOSING: stop_ep_timer(ep); /*FALLTHROUGH*/ case FPDU_MODE: if (ep->com.cm_id && ep->com.qp) { attrs.next_state = IWCH_QP_STATE_ERROR; ret = iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); if (ret) log(LOG_ERR, "%s - qp <- error failed!\n", __FUNCTION__); } peer_abort_upcall(ep); break; case ABORTING: break; case DEAD: mtx_unlock(&ep->com.lock); CTR2(KTR_IW_CXGB, "%s so_error %d IN DEAD STATE!!!!", __FUNCTION__, ep->com.so->so_error); return; default: PANIC_IF(1); break; } if (ep->com.state != ABORTING) { close_socket(&ep->com, 0); __state_set(&ep->com, DEAD); put_ep(&ep->com); } mtx_unlock(&ep->com.lock); return; } static void process_close_complete(struct iwch_ep *ep) { struct iwch_qp_attributes attrs; int release = 0; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); PANIC_IF(!ep); /* The cm_id may be null if we failed to connect */ mtx_lock(&ep->com.lock); switch (ep->com.state) { case CLOSING: __state_set(&ep->com, MORIBUND); break; case MORIBUND: stop_ep_timer(ep); if ((ep->com.cm_id) && (ep->com.qp)) { attrs.next_state = IWCH_QP_STATE_IDLE; iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); } if (ep->parent_ep) close_socket(&ep->com, 1); else close_socket(&ep->com, 0); close_complete_upcall(ep); __state_set(&ep->com, DEAD); release = 1; break; case ABORTING: break; case DEAD: default: PANIC_IF(1); break; } mtx_unlock(&ep->com.lock); if (release) put_ep(&ep->com); return; } /* * T3A does 3 things when a TERM is received: * 1) send up a CPL_RDMA_TERMINATE message with the TERM packet * 2) generate an async event on the QP with the TERMINATE opcode * 3) post a TERMINATE opcde cqe into the associated CQ. * * For (1), we save the message in the qp for later consumer consumption. * For (2), we move the QP into TERMINATE, post a QP event and disconnect. * For (3), we toss the CQE in cxio_poll_cq(). * * terminate() handles case (1)... */ static int terminate(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; uint32_t hash = *((uint32_t *)r + 1); unsigned int tid = ntohl(hash) >> 8 & 0xfffff; struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct socket *so = toep->tp_inp->inp_socket; struct iwch_ep *ep = so->so_rcv.sb_upcallarg; if (state_read(&ep->com) != FPDU_MODE) goto done; m_adj(m, sizeof(struct cpl_rdma_terminate)); CTR4(KTR_IW_CXGB, "%s: tid %u, ep %p, saved %d bytes", __func__, tid, ep, m->m_len); m_copydata(m, 0, m->m_len, ep->com.qp->attr.terminate_buffer); ep->com.qp->attr.terminate_msg_len = m->m_len; ep->com.qp->attr.is_terminate_local = 0; done: m_freem(m); return (0); } static int ec_status(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { struct adapter *sc = qs->adap; struct tom_data *td = sc->tom_softc; struct cpl_rdma_ec_status *rep = mtod(m, void *); unsigned int tid = GET_TID(rep); struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct socket *so = toep->tp_inp->inp_socket; struct iwch_ep *ep = so->so_rcv.sb_upcallarg; if (rep->status) { struct iwch_qp_attributes attrs; CTR1(KTR_IW_CXGB, "%s BAD CLOSE - Aborting", __FUNCTION__); stop_ep_timer(ep); attrs.next_state = IWCH_QP_STATE_ERROR; iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); abort_connection(ep); } m_freem(m); return (0); } static void ep_timeout(void *arg) { struct iwch_ep *ep = (struct iwch_ep *)arg; struct iwch_qp_attributes attrs; int err = 0; int abort = 1; mtx_lock(&ep->com.lock); CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); switch (ep->com.state) { case MPA_REQ_SENT: __state_set(&ep->com, ABORTING); connect_reply_upcall(ep, -ETIMEDOUT); break; case MPA_REQ_WAIT: __state_set(&ep->com, ABORTING); break; case CLOSING: case MORIBUND: if (ep->com.cm_id && ep->com.qp) err = 1; __state_set(&ep->com, ABORTING); break; default: CTR3(KTR_IW_CXGB, "%s unexpected state ep %p state %u\n", __func__, ep, ep->com.state); abort = 0; } mtx_unlock(&ep->com.lock); if (err){ attrs.next_state = IWCH_QP_STATE_ERROR; iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); } if (abort) abort_connection(ep); put_ep(&ep->com); } int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) { int err; struct iwch_ep *ep = to_ep(cm_id); CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); if (state_read(&ep->com) == DEAD) { put_ep(&ep->com); return (-ECONNRESET); } PANIC_IF(state_read(&ep->com) != MPA_REQ_RCVD); if (mpa_rev == 0) { abort_connection(ep); } else { err = send_mpa_reject(ep, pdata, pdata_len); err = soshutdown(ep->com.so, 3); } put_ep(&ep->com); return 0; } int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) { int err; struct iwch_qp_attributes attrs; enum iwch_qp_attr_mask mask; struct iwch_ep *ep = to_ep(cm_id); struct iwch_dev *h = to_iwch_dev(cm_id->device); struct iwch_qp *qp = get_qhp(h, conn_param->qpn); CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); if (state_read(&ep->com) == DEAD) { err = -ECONNRESET; goto err; } PANIC_IF(state_read(&ep->com) != MPA_REQ_RCVD); PANIC_IF(!qp); if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) || (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) { abort_connection(ep); err = -EINVAL; goto err; } cm_id->add_ref(cm_id); ep->com.cm_id = cm_id; ep->com.qp = qp; ep->com.rpl_err = 0; ep->com.rpl_done = 0; ep->ird = conn_param->ird; ep->ord = conn_param->ord; CTR3(KTR_IW_CXGB, "%s ird %d ord %d", __FUNCTION__, ep->ird, ep->ord); /* bind QP to EP and move to RTS */ attrs.mpa_attr = ep->mpa_attr; attrs.max_ird = ep->ird; attrs.max_ord = ep->ord; attrs.llp_stream_handle = ep; attrs.next_state = IWCH_QP_STATE_RTS; /* bind QP and TID with INIT_WR */ mask = IWCH_QP_ATTR_NEXT_STATE | IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR | IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD; err = iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, mask, &attrs, 1); if (err) goto err1; err = send_mpa_reply(ep, conn_param->private_data, conn_param->private_data_len); if (err) goto err1; state_set(&ep->com, FPDU_MODE); established_upcall(ep); put_ep(&ep->com); return 0; err1: ep->com.cm_id = NULL; ep->com.qp = NULL; cm_id->rem_ref(cm_id); err: put_ep(&ep->com); return err; } static int init_sock(struct iwch_ep_common *epc) { int err; struct sockopt sopt; int on=1; SOCK_LOCK(epc->so); soupcall_set(epc->so, SO_RCV, iwch_so_upcall, epc); epc->so->so_state |= SS_NBIO; SOCK_UNLOCK(epc->so); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_TCP; sopt.sopt_name = TCP_NODELAY; sopt.sopt_val = (caddr_t)&on; sopt.sopt_valsize = sizeof on; sopt.sopt_td = NULL; err = sosetopt(epc->so, &sopt); if (err) printf("%s can't set TCP_NODELAY err %d\n", __FUNCTION__, err); return 0; } static int is_loopback_dst(struct iw_cm_id *cm_id) { uint16_t port = cm_id->remote_addr.sin_port; int ifa_present; cm_id->remote_addr.sin_port = 0; ifa_present = ifa_ifwithaddr_check( (struct sockaddr *)&cm_id->remote_addr); cm_id->remote_addr.sin_port = port; return (ifa_present); } int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) { int err = 0; struct iwch_dev *h = to_iwch_dev(cm_id->device); struct iwch_ep *ep; struct nhop4_extended nh4; struct toedev *tdev; if (is_loopback_dst(cm_id)) { err = -ENOSYS; goto out; } ep = alloc_ep(sizeof(*ep), M_NOWAIT); if (!ep) { printf("%s - cannot alloc ep.\n", __FUNCTION__); err = (-ENOMEM); goto out; } callout_init(&ep->timer, 1); ep->plen = conn_param->private_data_len; if (ep->plen) memcpy(ep->mpa_pkt + sizeof(struct mpa_message), conn_param->private_data, ep->plen); ep->ird = conn_param->ird; ep->ord = conn_param->ord; cm_id->add_ref(cm_id); ep->com.cm_id = cm_id; ep->com.qp = get_qhp(h, conn_param->qpn); ep->com.thread = curthread; PANIC_IF(!ep->com.qp); CTR4(KTR_IW_CXGB, "%s qpn 0x%x qp %p cm_id %p", __FUNCTION__, conn_param->qpn, ep->com.qp, cm_id); ep->com.so = cm_id->so; err = init_sock(&ep->com); if (err) goto fail2; /* find a route */ err = find_route(cm_id->local_addr.sin_addr.s_addr, cm_id->remote_addr.sin_addr.s_addr, cm_id->local_addr.sin_port, cm_id->remote_addr.sin_port, IPTOS_LOWDELAY, &nh4); if (err) { printf("%s - cannot find route.\n", __FUNCTION__); err = EHOSTUNREACH; goto fail2; } if (!(nh4.nh_ifp->if_flags & IFCAP_TOE)) { printf("%s - interface not TOE capable.\n", __FUNCTION__); fib4_free_nh_ext(RT_DEFAULT_FIB, &nh4); goto fail2; } tdev = TOEDEV(nh4.nh_ifp); if (tdev == NULL) { printf("%s - No toedev for interface.\n", __FUNCTION__); fib4_free_nh_ext(RT_DEFAULT_FIB, &nh4); goto fail2; } fib4_free_nh_ext(RT_DEFAULT_FIB, &nh4); state_set(&ep->com, CONNECTING); ep->com.local_addr = cm_id->local_addr; ep->com.remote_addr = cm_id->remote_addr; err = soconnect(ep->com.so, (struct sockaddr *)&ep->com.remote_addr, ep->com.thread); if (!err) goto out; fail2: put_ep(&ep->com); out: return err; } int -iwch_create_listen(struct iw_cm_id *cm_id, int backlog) +iwch_create_listen_ep(struct iw_cm_id *cm_id, int backlog) { int err = 0; struct iwch_listen_ep *ep; ep = alloc_ep(sizeof(*ep), M_NOWAIT); if (!ep) { printf("%s - cannot alloc ep.\n", __FUNCTION__); err = ENOMEM; goto out; } CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); cm_id->add_ref(cm_id); ep->com.cm_id = cm_id; ep->backlog = backlog; ep->com.local_addr = cm_id->local_addr; ep->com.thread = curthread; state_set(&ep->com, LISTEN); ep->com.so = cm_id->so; - err = init_sock(&ep->com); - if (err) - goto fail; - - err = solisten(ep->com.so, ep->backlog, ep->com.thread); - if (!err) { - cm_id->provider_data = ep; - goto out; - } - close_socket(&ep->com, 0); -fail: - cm_id->rem_ref(cm_id); - put_ep(&ep->com); + cm_id->provider_data = ep; out: return err; } -int -iwch_destroy_listen(struct iw_cm_id *cm_id) +void +iwch_destroy_listen_ep(struct iw_cm_id *cm_id) { struct iwch_listen_ep *ep = to_listen_ep(cm_id); CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); state_set(&ep->com, DEAD); - close_socket(&ep->com, 0); cm_id->rem_ref(cm_id); put_ep(&ep->com); - return 0; + return; } int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, int flags) { int close = 0; mtx_lock(&ep->com.lock); PANIC_IF(!ep); PANIC_IF(!ep->com.so); CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s, abrupt %d", __FUNCTION__, ep, ep->com.so, states[ep->com.state], abrupt); switch (ep->com.state) { case MPA_REQ_WAIT: case MPA_REQ_SENT: case MPA_REQ_RCVD: case MPA_REP_SENT: case FPDU_MODE: close = 1; if (abrupt) ep->com.state = ABORTING; else { ep->com.state = CLOSING; start_ep_timer(ep); } break; case CLOSING: close = 1; if (abrupt) { stop_ep_timer(ep); ep->com.state = ABORTING; } else ep->com.state = MORIBUND; break; case MORIBUND: case ABORTING: case DEAD: CTR3(KTR_IW_CXGB, "%s ignoring disconnect ep %p state %u\n", __func__, ep, ep->com.state); break; default: panic("unknown state: %d\n", ep->com.state); break; } mtx_unlock(&ep->com.lock); if (close) { if (abrupt) abort_connection(ep); else { if (!ep->parent_ep) __state_set(&ep->com, MORIBUND); shutdown_socket(&ep->com); } } return 0; } static void process_data(struct iwch_ep *ep) { struct sockaddr_in *local, *remote; CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); switch (state_read(&ep->com)) { case MPA_REQ_SENT: process_mpa_reply(ep); break; case MPA_REQ_WAIT: /* * XXX * Set local and remote addrs here because when we * dequeue the newly accepted socket, they aren't set * yet in the pcb! */ in_getsockaddr(ep->com.so, (struct sockaddr **)&local); in_getpeeraddr(ep->com.so, (struct sockaddr **)&remote); CTR3(KTR_IW_CXGB, "%s local %s remote %s", __FUNCTION__, inet_ntoa(local->sin_addr), inet_ntoa(remote->sin_addr)); ep->com.local_addr = *local; ep->com.remote_addr = *remote; free(local, M_SONAME); free(remote, M_SONAME); process_mpa_request(ep); break; default: if (sbavail(&ep->com.so->so_rcv)) printf("%s Unexpected streaming data." " ep %p state %d so %p so_state %x so_rcv.sb_cc %u so_rcv.sb_mb %p\n", __FUNCTION__, ep, state_read(&ep->com), ep->com.so, ep->com.so->so_state, sbavail(&ep->com.so->so_rcv), ep->com.so->so_rcv.sb_mb); break; } return; } static void process_connected(struct iwch_ep *ep) { CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); if ((ep->com.so->so_state & SS_ISCONNECTED) && !ep->com.so->so_error) { send_mpa_req(ep); } else { connect_reply_upcall(ep, -ep->com.so->so_error); close_socket(&ep->com, 0); state_set(&ep->com, DEAD); put_ep(&ep->com); } } -static struct socket * -dequeue_socket(struct socket *head, struct sockaddr_in **remote, struct iwch_ep *child_ep) +void +process_newconn(struct iw_cm_id *parent_cm_id, struct socket *child_so) { - struct socket *so; - - ACCEPT_LOCK(); - so = TAILQ_FIRST(&head->so_comp); - if (!so) { - ACCEPT_UNLOCK(); - return NULL; - } - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - SOCK_LOCK(so); - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - soref(so); - soupcall_set(so, SO_RCV, iwch_so_upcall, child_ep); - so->so_state |= SS_NBIO; - PANIC_IF(!(so->so_state & SS_ISCONNECTED)); - PANIC_IF(so->so_error); - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); - soaccept(so, (struct sockaddr **)remote); - return so; -} - -static void -process_newconn(struct iwch_ep *parent_ep) -{ - struct socket *child_so; struct iwch_ep *child_ep; + struct sockaddr_in *local; struct sockaddr_in *remote; + struct iwch_ep *parent_ep = parent_cm_id->provider_data; CTR3(KTR_IW_CXGB, "%s parent ep %p so %p", __FUNCTION__, parent_ep, parent_ep->com.so); + if (!child_so) { + log(LOG_ERR, "%s - invalid child socket!\n", __func__); + return; + } child_ep = alloc_ep(sizeof(*child_ep), M_NOWAIT); if (!child_ep) { log(LOG_ERR, "%s - failed to allocate ep entry!\n", __FUNCTION__); return; } - child_so = dequeue_socket(parent_ep->com.so, &remote, child_ep); - if (!child_so) { - log(LOG_ERR, "%s - failed to dequeue child socket!\n", - __FUNCTION__); - __free_ep(&child_ep->com); - return; - } + SOCKBUF_LOCK(&child_so->so_rcv); + soupcall_set(child_so, SO_RCV, iwch_so_upcall, child_ep); + SOCKBUF_UNLOCK(&child_so->so_rcv); + + in_getsockaddr(child_so, (struct sockaddr **)&local); + in_getpeeraddr(child_so, (struct sockaddr **)&remote); + CTR3(KTR_IW_CXGB, "%s remote addr %s port %d", __FUNCTION__, inet_ntoa(remote->sin_addr), ntohs(remote->sin_port)); child_ep->com.tdev = parent_ep->com.tdev; child_ep->com.local_addr.sin_family = parent_ep->com.local_addr.sin_family; child_ep->com.local_addr.sin_port = parent_ep->com.local_addr.sin_port; child_ep->com.local_addr.sin_addr.s_addr = parent_ep->com.local_addr.sin_addr.s_addr; child_ep->com.local_addr.sin_len = parent_ep->com.local_addr.sin_len; child_ep->com.remote_addr.sin_family = remote->sin_family; child_ep->com.remote_addr.sin_port = remote->sin_port; child_ep->com.remote_addr.sin_addr.s_addr = remote->sin_addr.s_addr; child_ep->com.remote_addr.sin_len = remote->sin_len; child_ep->com.so = child_so; child_ep->com.cm_id = NULL; child_ep->com.thread = parent_ep->com.thread; child_ep->parent_ep = parent_ep; + free(local, M_SONAME); free(remote, M_SONAME); get_ep(&parent_ep->com); - child_ep->parent_ep = parent_ep; callout_init(&child_ep->timer, 1); state_set(&child_ep->com, MPA_REQ_WAIT); start_ep_timer(child_ep); /* maybe the request has already been queued up on the socket... */ process_mpa_request(child_ep); } static int iwch_so_upcall(struct socket *so, void *arg, int waitflag) { struct iwch_ep *ep = arg; CTR6(KTR_IW_CXGB, "%s so %p so state %x ep %p ep state(%d)=%s", __FUNCTION__, so, so->so_state, ep, ep->com.state, states[ep->com.state]); mtx_lock(&req_lock); if (ep && ep->com.so && !ep->com.entry.tqe_prev) { get_ep(&ep->com); TAILQ_INSERT_TAIL(&req_list, &ep->com, entry); taskqueue_enqueue(iw_cxgb_taskq, &iw_cxgb_task); } mtx_unlock(&req_lock); return (SU_OK); } static void process_socket_event(struct iwch_ep *ep) { int state = state_read(&ep->com); struct socket *so = ep->com.so; CTR6(KTR_IW_CXGB, "%s so %p so state %x ep %p ep state(%d)=%s", __FUNCTION__, so, so->so_state, ep, ep->com.state, states[ep->com.state]); if (state == CONNECTING) { process_connected(ep); return; } if (state == LISTEN) { - process_newconn(ep); + /* socket listening events are handled at IWCM */ + CTR3(KTR_IW_CXGB, "%s Invalid ep state:%u, ep:%p", __func__, + ep->com.state, ep); + BUG(); return; } /* connection error */ if (so->so_error) { process_conn_error(ep); return; } /* peer close */ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) && state < CLOSING) { process_peer_close(ep); return; } /* close complete */ if (so->so_state & (SS_ISDISCONNECTED)) { process_close_complete(ep); return; } /* rx data */ process_data(ep); return; } static void process_req(void *ctx, int pending) { struct iwch_ep_common *epc; CTR1(KTR_IW_CXGB, "%s enter", __FUNCTION__); mtx_lock(&req_lock); while (!TAILQ_EMPTY(&req_list)) { epc = TAILQ_FIRST(&req_list); TAILQ_REMOVE(&req_list, epc, entry); epc->entry.tqe_prev = NULL; mtx_unlock(&req_lock); if (epc->so) process_socket_event((struct iwch_ep *)epc); put_ep(epc); mtx_lock(&req_lock); } mtx_unlock(&req_lock); } int iwch_cm_init(void) { TAILQ_INIT(&req_list); mtx_init(&req_lock, "iw_cxgb req_list lock", NULL, MTX_DEF); iw_cxgb_taskq = taskqueue_create("iw_cxgb_taskq", M_NOWAIT, taskqueue_thread_enqueue, &iw_cxgb_taskq); if (iw_cxgb_taskq == NULL) { printf("failed to allocate iw_cxgb taskqueue\n"); return (ENOMEM); } taskqueue_start_threads(&iw_cxgb_taskq, 1, PI_NET, "iw_cxgb taskq"); TASK_INIT(&iw_cxgb_task, 0, process_req, NULL); return (0); } void iwch_cm_term(void) { taskqueue_drain(iw_cxgb_taskq, &iw_cxgb_task); taskqueue_free(iw_cxgb_taskq); } void iwch_cm_init_cpl(struct adapter *sc) { t3_register_cpl_handler(sc, CPL_RDMA_TERMINATE, terminate); t3_register_cpl_handler(sc, CPL_RDMA_EC_STATUS, ec_status); } void iwch_cm_term_cpl(struct adapter *sc) { t3_register_cpl_handler(sc, CPL_RDMA_TERMINATE, NULL); t3_register_cpl_handler(sc, CPL_RDMA_EC_STATUS, NULL); } #endif Index: head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h =================================================================== --- head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h (revision 294609) +++ head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h (revision 294610) @@ -1,248 +1,248 @@ /************************************************************************** Copyright (c) 2007, 2008 Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. $FreeBSD$ ***************************************************************************/ #ifndef _IWCH_CM_H_ #define _IWCH_CM_H_ #include #include #include #include #include #define MPA_KEY_REQ "MPA ID Req Frame" #define MPA_KEY_REP "MPA ID Rep Frame" #define MPA_MAX_PRIVATE_DATA 256 #define MPA_REV 0 /* XXX - amso1100 uses rev 0 ! */ #define MPA_REJECT 0x20 #define MPA_CRC 0x40 #define MPA_MARKERS 0x80 #define MPA_FLAGS_MASK 0xE0 #define put_ep(ep) { \ CTR4(KTR_IW_CXGB, "put_ep (via %s:%u) ep %p refcnt %d", __FUNCTION__, __LINE__, \ ep, atomic_load_acq_int(&((ep)->refcount))); \ if (refcount_release(&((ep)->refcount))) \ __free_ep(ep); \ } #define get_ep(ep) { \ CTR4(KTR_IW_CXGB, "get_ep (via %s:%u) ep %p, refcnt %d", __FUNCTION__, __LINE__, \ ep, atomic_load_acq_int(&((ep)->refcount))); \ refcount_acquire(&((ep)->refcount)); \ } struct mpa_message { u8 key[16]; u8 flags; u8 revision; __be16 private_data_size; u8 private_data[0]; }; struct terminate_message { u8 layer_etype; u8 ecode; __be16 hdrct_rsvd; u8 len_hdrs[0]; }; #define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28) enum iwch_layers_types { LAYER_RDMAP = 0x00, LAYER_DDP = 0x10, LAYER_MPA = 0x20, RDMAP_LOCAL_CATA = 0x00, RDMAP_REMOTE_PROT = 0x01, RDMAP_REMOTE_OP = 0x02, DDP_LOCAL_CATA = 0x00, DDP_TAGGED_ERR = 0x01, DDP_UNTAGGED_ERR = 0x02, DDP_LLP = 0x03 }; enum iwch_rdma_ecodes { RDMAP_INV_STAG = 0x00, RDMAP_BASE_BOUNDS = 0x01, RDMAP_ACC_VIOL = 0x02, RDMAP_STAG_NOT_ASSOC = 0x03, RDMAP_TO_WRAP = 0x04, RDMAP_INV_VERS = 0x05, RDMAP_INV_OPCODE = 0x06, RDMAP_STREAM_CATA = 0x07, RDMAP_GLOBAL_CATA = 0x08, RDMAP_CANT_INV_STAG = 0x09, RDMAP_UNSPECIFIED = 0xff }; enum iwch_ddp_ecodes { DDPT_INV_STAG = 0x00, DDPT_BASE_BOUNDS = 0x01, DDPT_STAG_NOT_ASSOC = 0x02, DDPT_TO_WRAP = 0x03, DDPT_INV_VERS = 0x04, DDPU_INV_QN = 0x01, DDPU_INV_MSN_NOBUF = 0x02, DDPU_INV_MSN_RANGE = 0x03, DDPU_INV_MO = 0x04, DDPU_MSG_TOOBIG = 0x05, DDPU_INV_VERS = 0x06 }; enum iwch_mpa_ecodes { MPA_CRC_ERR = 0x02, MPA_MARKER_ERR = 0x03 }; enum iwch_ep_state { IDLE = 0, LISTEN, CONNECTING, MPA_REQ_WAIT, MPA_REQ_SENT, MPA_REQ_RCVD, MPA_REP_SENT, FPDU_MODE, ABORTING, CLOSING, MORIBUND, DEAD, }; enum iwch_ep_flags { PEER_ABORT_IN_PROGRESS = (1 << 0), ABORT_REQ_IN_PROGRESS = (1 << 1), }; struct iwch_ep_common { TAILQ_ENTRY(iwch_ep_common) entry; struct iw_cm_id *cm_id; struct iwch_qp *qp; struct toedev *tdev; enum iwch_ep_state state; u_int refcount; struct cv waitq; struct mtx lock; struct sockaddr_in local_addr; struct sockaddr_in remote_addr; int rpl_err; int rpl_done; struct thread *thread; struct socket *so; }; struct iwch_listen_ep { struct iwch_ep_common com; unsigned int stid; int backlog; }; struct iwch_ep { struct iwch_ep_common com; struct iwch_ep *parent_ep; struct callout timer; unsigned int atid; u32 hwtid; u32 snd_seq; u32 rcv_seq; struct l2t_entry *l2t; struct mbuf *mpa_mbuf; struct iwch_mpa_attributes mpa_attr; unsigned int mpa_pkt_len; u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA]; u8 tos; u16 emss; u16 plen; u32 ird; u32 ord; u32 flags; }; static inline struct iwch_ep *to_ep(struct iw_cm_id *cm_id) { return cm_id->provider_data; } static inline struct iwch_listen_ep *to_listen_ep(struct iw_cm_id *cm_id) { return cm_id->provider_data; } static inline int compute_wscale(int win) { int wscale = 0; while (wscale < 14 && (65535< __FBSDID("$FreeBSD$"); #include "opt_inet.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int iwch_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask, struct ib_port_modify *props) { return (-ENOSYS); } static struct ib_ah * iwch_ah_create(struct ib_pd *pd, struct ib_ah_attr *ah_attr) { return ERR_PTR(-ENOSYS); } static int iwch_ah_destroy(struct ib_ah *ah) { return (-ENOSYS); } static int iwch_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { return (-ENOSYS); } static int iwch_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { return (-ENOSYS); } static int iwch_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad) { return (-ENOSYS); } static int iwch_dealloc_ucontext(struct ib_ucontext *context) { struct iwch_dev *rhp = to_iwch_dev(context->device); struct iwch_ucontext *ucontext = to_iwch_ucontext(context); struct iwch_mm_entry *mm, *tmp; CTR2(KTR_IW_CXGB, "%s context %p", __FUNCTION__, context); TAILQ_FOREACH_SAFE(mm, &ucontext->mmaps, entry, tmp) { TAILQ_REMOVE(&ucontext->mmaps, mm, entry); cxfree(mm); } cxio_release_ucontext(&rhp->rdev, &ucontext->uctx); cxfree(ucontext); return 0; } static struct ib_ucontext * iwch_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct iwch_ucontext *context; struct iwch_dev *rhp = to_iwch_dev(ibdev); CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); context = malloc(sizeof(*context), M_DEVBUF, M_ZERO|M_NOWAIT); if (!context) return ERR_PTR(-ENOMEM); cxio_init_ucontext(&rhp->rdev, &context->uctx); TAILQ_INIT(&context->mmaps); mtx_init(&context->mmap_lock, "ucontext mmap", NULL, MTX_DEF); return &context->ibucontext; } static int iwch_destroy_cq(struct ib_cq *ib_cq) { struct iwch_cq *chp; CTR2(KTR_IW_CXGB, "%s ib_cq %p", __FUNCTION__, ib_cq); chp = to_iwch_cq(ib_cq); remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid); mtx_lock(&chp->lock); if (--chp->refcnt) msleep(chp, &chp->lock, 0, "iwch_destroy_cq", 0); mtx_unlock(&chp->lock); cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); cxfree(chp); return 0; } static struct ib_cq * iwch_create_cq(struct ib_device *ibdev, struct ib_cq_init_attr *attr, struct ib_ucontext *ib_context, struct ib_udata *udata) { struct iwch_dev *rhp; struct iwch_cq *chp; struct iwch_create_cq_resp uresp; struct iwch_create_cq_req ureq; struct iwch_ucontext *ucontext = NULL; static int warned; size_t resplen; int entries = attr->cqe; CTR3(KTR_IW_CXGB, "%s ib_dev %p entries %d", __FUNCTION__, ibdev, entries); rhp = to_iwch_dev(ibdev); chp = malloc(sizeof(*chp), M_DEVBUF, M_NOWAIT|M_ZERO); if (!chp) { return ERR_PTR(-ENOMEM); } if (ib_context) { ucontext = to_iwch_ucontext(ib_context); if (!t3a_device(rhp)) { if (ib_copy_from_udata(&ureq, udata, sizeof (ureq))) { cxfree(chp); return ERR_PTR(-EFAULT); } chp->user_rptr_addr = (u32 /*__user */*)(unsigned long)ureq.user_rptr_addr; } } if (t3a_device(rhp)) { /* * T3A: Add some fluff to handle extra CQEs inserted * for various errors. * Additional CQE possibilities: * TERMINATE, * incoming RDMA WRITE Failures * incoming RDMA READ REQUEST FAILUREs * NOTE: We cannot ensure the CQ won't overflow. */ entries += 16; } entries = roundup_pow_of_two(entries); chp->cq.size_log2 = ilog2(entries); if (cxio_create_cq(&rhp->rdev, &chp->cq, !ucontext)) { cxfree(chp); return ERR_PTR(-ENOMEM); } chp->rhp = rhp; chp->ibcq.cqe = 1 << chp->cq.size_log2; mtx_init(&chp->lock, "cxgb cq", NULL, MTX_DEF|MTX_DUPOK); chp->refcnt = 1; if (insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid)) { cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); cxfree(chp); return ERR_PTR(-ENOMEM); } if (ucontext) { struct iwch_mm_entry *mm; mm = kmalloc(sizeof *mm, M_NOWAIT); if (!mm) { iwch_destroy_cq(&chp->ibcq); return ERR_PTR(-ENOMEM); } uresp.cqid = chp->cq.cqid; uresp.size_log2 = chp->cq.size_log2; mtx_lock(&ucontext->mmap_lock); uresp.key = ucontext->key; ucontext->key += PAGE_SIZE; mtx_unlock(&ucontext->mmap_lock); mm->key = uresp.key; mm->addr = vtophys(chp->cq.queue); if (udata->outlen < sizeof uresp) { if (!warned++) CTR1(KTR_IW_CXGB, "%s Warning - " "downlevel libcxgb3 (non-fatal).\n", __func__); mm->len = PAGE_ALIGN((1UL << uresp.size_log2) * sizeof(struct t3_cqe)); resplen = sizeof(struct iwch_create_cq_resp_v0); } else { mm->len = PAGE_ALIGN(((1UL << uresp.size_log2) + 1) * sizeof(struct t3_cqe)); uresp.memsize = mm->len; resplen = sizeof uresp; } if (ib_copy_to_udata(udata, &uresp, resplen)) { cxfree(mm); iwch_destroy_cq(&chp->ibcq); return ERR_PTR(-EFAULT); } insert_mmap(ucontext, mm); } CTR4(KTR_IW_CXGB, "created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx", chp->cq.cqid, chp, (1 << chp->cq.size_log2), (unsigned long long) chp->cq.dma_addr); return &chp->ibcq; } static int iwch_resize_cq(struct ib_cq *cq __unused, int cqe __unused, struct ib_udata *udata __unused) { return (-ENOSYS); } static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct iwch_dev *rhp; struct iwch_cq *chp; enum t3_cq_opcode cq_op; int err; u32 rptr; chp = to_iwch_cq(ibcq); rhp = chp->rhp; if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) cq_op = CQ_ARM_SE; else cq_op = CQ_ARM_AN; if (chp->user_rptr_addr) { if (copyin(chp->user_rptr_addr, &rptr, sizeof(rptr))) return (-EFAULT); mtx_lock(&chp->lock); chp->cq.rptr = rptr; } else mtx_lock(&chp->lock); CTR2(KTR_IW_CXGB, "%s rptr 0x%x", __FUNCTION__, chp->cq.rptr); err = cxio_hal_cq_op(&rhp->rdev, &chp->cq, cq_op, 0); mtx_unlock(&chp->lock); if (err < 0) log(LOG_ERR, "Error %d rearming CQID 0x%x\n", err, chp->cq.cqid); if (err > 0 && !(flags & IB_CQ_REPORT_MISSED_EVENTS)) err = 0; return err; } static int iwch_mmap(struct ib_ucontext *context __unused, struct vm_area_struct *vma __unused) { return (-ENOSYS); } static int iwch_deallocate_pd(struct ib_pd *pd) { struct iwch_dev *rhp; struct iwch_pd *php; php = to_iwch_pd(pd); rhp = php->rhp; CTR3(KTR_IW_CXGB, "%s ibpd %p pdid 0x%x", __FUNCTION__, pd, php->pdid); cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid); cxfree(php); return 0; } static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct iwch_pd *php; u32 pdid; struct iwch_dev *rhp; CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); rhp = (struct iwch_dev *) ibdev; pdid = cxio_hal_get_pdid(rhp->rdev.rscp); if (!pdid) return ERR_PTR(-EINVAL); php = malloc(sizeof(*php), M_DEVBUF, M_ZERO|M_NOWAIT); if (!php) { cxio_hal_put_pdid(rhp->rdev.rscp, pdid); return ERR_PTR(-ENOMEM); } php->pdid = pdid; php->rhp = rhp; if (context) { if (ib_copy_to_udata(udata, &php->pdid, sizeof (__u32))) { iwch_deallocate_pd(&php->ibpd); return ERR_PTR(-EFAULT); } } CTR3(KTR_IW_CXGB, "%s pdid 0x%0x ptr 0x%p", __FUNCTION__, pdid, php); return &php->ibpd; } static int iwch_dereg_mr(struct ib_mr *ib_mr) { struct iwch_dev *rhp; struct iwch_mr *mhp; u32 mmid; CTR2(KTR_IW_CXGB, "%s ib_mr %p", __FUNCTION__, ib_mr); /* There can be no memory windows */ if (atomic_load_acq_int(&ib_mr->usecnt.counter)) return (-EINVAL); mhp = to_iwch_mr(ib_mr); rhp = mhp->rhp; mmid = mhp->attr.stag >> 8; cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, mhp->attr.pbl_addr); iwch_free_pbl(mhp); remove_handle(rhp, &rhp->mmidr, mmid); if (mhp->kva) cxfree((void *) (unsigned long) mhp->kva); if (mhp->umem) ib_umem_release(mhp->umem); CTR3(KTR_IW_CXGB, "%s mmid 0x%x ptr %p", __FUNCTION__, mmid, mhp); cxfree(mhp); return 0; } static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, struct ib_phys_buf *buffer_list, int num_phys_buf, int acc, u64 *iova_start) { __be64 *page_list; int shift; u64 total_size; int npages; struct iwch_dev *rhp; struct iwch_pd *php; struct iwch_mr *mhp; int ret; CTR2(KTR_IW_CXGB, "%s ib_pd %p", __FUNCTION__, pd); php = to_iwch_pd(pd); rhp = php->rhp; mhp = malloc(sizeof(*mhp), M_DEVBUF, M_ZERO|M_NOWAIT); if (!mhp) return ERR_PTR(-ENOMEM); mhp->rhp = rhp; /* First check that we have enough alignment */ if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { ret = -EINVAL; goto err; } if (num_phys_buf > 1 && ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) { ret = -EINVAL; goto err; } ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start, &total_size, &npages, &shift, &page_list); if (ret) goto err; ret = iwch_alloc_pbl(mhp, npages); if (ret) { cxfree(page_list); goto err_pbl; } ret = iwch_write_pbl(mhp, page_list, npages, 0); cxfree(page_list); if (ret) goto err; mhp->attr.pdid = php->pdid; mhp->attr.zbva = 0; mhp->attr.perms = iwch_ib_to_tpt_access(acc); mhp->attr.va_fbo = *iova_start; mhp->attr.page_size = shift - 12; mhp->attr.len = (u32) total_size; mhp->attr.pbl_size = npages; ret = iwch_register_mem(rhp, php, mhp, shift); if (ret) goto err_pbl; return &mhp->ibmr; err_pbl: iwch_free_pbl(mhp); err: cxfree(mhp); return ERR_PTR(ret); } static int iwch_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask, struct ib_pd *pd, struct ib_phys_buf *buffer_list, int num_phys_buf, int acc, u64 * iova_start) { struct iwch_mr mh, *mhp; struct iwch_pd *php; struct iwch_dev *rhp; __be64 *page_list = NULL; int shift = 0; u64 total_size; int npages = 0; int ret; CTR3(KTR_IW_CXGB, "%s ib_mr %p ib_pd %p", __FUNCTION__, mr, pd); /* There can be no memory windows */ if (atomic_load_acq_int(&mr->usecnt.counter)) return (-EINVAL); mhp = to_iwch_mr(mr); rhp = mhp->rhp; php = to_iwch_pd(mr->pd); /* make sure we are on the same adapter */ if (rhp != php->rhp) return (-EINVAL); memcpy(&mh, mhp, sizeof *mhp); if (mr_rereg_mask & IB_MR_REREG_PD) php = to_iwch_pd(pd); if (mr_rereg_mask & IB_MR_REREG_ACCESS) mh.attr.perms = iwch_ib_to_tpt_access(acc); if (mr_rereg_mask & IB_MR_REREG_TRANS) { ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start, &total_size, &npages, &shift, &page_list); if (ret) return ret; } ret = iwch_reregister_mem(rhp, php, &mh, shift, npages); cxfree(page_list); if (ret) { return ret; } if (mr_rereg_mask & IB_MR_REREG_PD) mhp->attr.pdid = php->pdid; if (mr_rereg_mask & IB_MR_REREG_ACCESS) mhp->attr.perms = iwch_ib_to_tpt_access(acc); if (mr_rereg_mask & IB_MR_REREG_TRANS) { mhp->attr.zbva = 0; mhp->attr.va_fbo = *iova_start; mhp->attr.page_size = shift - 12; mhp->attr.len = (u32) total_size; mhp->attr.pbl_size = npages; } return 0; } static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata, int mr_id) { __be64 *pages; int shift, n, len; int i, k, entry; int err = 0; struct iwch_dev *rhp; struct iwch_pd *php; struct iwch_mr *mhp; struct iwch_reg_user_mr_resp uresp; struct scatterlist *sg; CTR2(KTR_IW_CXGB, "%s ib_pd %p", __FUNCTION__, pd); php = to_iwch_pd(pd); rhp = php->rhp; mhp = malloc(sizeof(*mhp), M_DEVBUF, M_NOWAIT|M_ZERO); if (!mhp) return ERR_PTR(-ENOMEM); mhp->rhp = rhp; mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); if (IS_ERR(mhp->umem)) { err = PTR_ERR(mhp->umem); cxfree(mhp); return ERR_PTR(-err); } shift = ffs(mhp->umem->page_size) - 1; n = mhp->umem->nmap; err = iwch_alloc_pbl(mhp, n); if (err) goto err; pages = (__be64 *) kmalloc(n * sizeof(u64), M_NOWAIT); if (!pages) { err = -ENOMEM; goto err_pbl; } i = n = 0; for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { len = sg_dma_len(sg) >> shift; for (k = 0; k < len; ++k) { pages[i++] = cpu_to_be64(sg_dma_address(sg) + mhp->umem->page_size * k); if (i == PAGE_SIZE / sizeof *pages) { err = iwch_write_pbl(mhp, pages, i, n); if (err) goto pbl_done; n += i; i = 0; } } } #if 0 TAILQ_FOREACH(chunk, &mhp->umem->chunk_list, entry) for (j = 0; j < chunk->nmap; ++j) { len = sg_dma_len(&chunk->page_list[j]) >> shift; for (k = 0; k < len; ++k) { pages[i++] = htobe64(sg_dma_address( &chunk->page_list[j]) + mhp->umem->page_size * k); if (i == PAGE_SIZE / sizeof *pages) { err = iwch_write_pbl(mhp, pages, i, n); if (err) goto pbl_done; n += i; i = 0; } } } #endif if (i) err = iwch_write_pbl(mhp, pages, i, n); pbl_done: cxfree(pages); if (err) goto err_pbl; mhp->attr.pdid = php->pdid; mhp->attr.zbva = 0; mhp->attr.perms = iwch_ib_to_tpt_access(acc); mhp->attr.va_fbo = virt; mhp->attr.page_size = shift - 12; mhp->attr.len = (u32) length; err = iwch_register_mem(rhp, php, mhp, shift); if (err) goto err_pbl; if (udata && !t3a_device(rhp)) { uresp.pbl_addr = (mhp->attr.pbl_addr - rhp->rdev.rnic_info.pbl_base) >> 3; CTR2(KTR_IW_CXGB, "%s user resp pbl_addr 0x%x", __FUNCTION__, uresp.pbl_addr); if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { iwch_dereg_mr(&mhp->ibmr); err = EFAULT; goto err; } } return &mhp->ibmr; err_pbl: iwch_free_pbl(mhp); err: ib_umem_release(mhp->umem); cxfree(mhp); return ERR_PTR(-err); } static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc) { struct ib_phys_buf bl; u64 kva; struct ib_mr *ibmr; CTR2(KTR_IW_CXGB, "%s ib_pd %p", __FUNCTION__, pd); /* * T3 only supports 32 bits of size. */ bl.size = 0xffffffff; bl.addr = 0; kva = 0; ibmr = iwch_register_phys_mem(pd, &bl, 1, acc, &kva); return ibmr; } static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) { struct iwch_dev *rhp; struct iwch_pd *php; struct iwch_mw *mhp; u32 mmid; u32 stag = 0; int ret; php = to_iwch_pd(pd); rhp = php->rhp; mhp = malloc(sizeof(*mhp), M_DEVBUF, M_ZERO|M_NOWAIT); if (!mhp) return ERR_PTR(-ENOMEM); ret = cxio_allocate_window(&rhp->rdev, &stag, php->pdid); if (ret) { cxfree(mhp); return ERR_PTR(-ret); } mhp->rhp = rhp; mhp->attr.pdid = php->pdid; mhp->attr.type = TPT_MW; mhp->attr.stag = stag; mmid = (stag) >> 8; mhp->ibmw.rkey = stag; if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) { cxio_deallocate_window(&rhp->rdev, mhp->attr.stag); cxfree(mhp); return ERR_PTR(-ENOMEM); } CTR4(KTR_IW_CXGB, "%s mmid 0x%x mhp %p stag 0x%x", __FUNCTION__, mmid, mhp, stag); return &(mhp->ibmw); } static int iwch_dealloc_mw(struct ib_mw *mw) { struct iwch_dev *rhp; struct iwch_mw *mhp; u32 mmid; mhp = to_iwch_mw(mw); rhp = mhp->rhp; mmid = (mw->rkey) >> 8; cxio_deallocate_window(&rhp->rdev, mhp->attr.stag); remove_handle(rhp, &rhp->mmidr, mmid); cxfree(mhp); CTR4(KTR_IW_CXGB, "%s ib_mw %p mmid 0x%x ptr %p", __FUNCTION__, mw, mmid, mhp); return 0; } static int iwch_destroy_qp(struct ib_qp *ib_qp) { struct iwch_dev *rhp; struct iwch_qp *qhp; struct iwch_qp_attributes attrs; struct iwch_ucontext *ucontext; qhp = to_iwch_qp(ib_qp); rhp = qhp->rhp; attrs.next_state = IWCH_QP_STATE_ERROR; iwch_modify_qp(rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 0); mtx_lock(&qhp->lock); if (qhp->ep) msleep(qhp, &qhp->lock, 0, "iwch_destroy_qp1", 0); mtx_unlock(&qhp->lock); remove_handle(rhp, &rhp->qpidr, qhp->wq.qpid); mtx_lock(&qhp->lock); if (--qhp->refcnt) msleep(qhp, &qhp->lock, 0, "iwch_destroy_qp2", 0); mtx_unlock(&qhp->lock); ucontext = ib_qp->uobject ? to_iwch_ucontext(ib_qp->uobject->context) : NULL; cxio_destroy_qp(&rhp->rdev, &qhp->wq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx); CTR4(KTR_IW_CXGB, "%s ib_qp %p qpid 0x%0x qhp %p", __FUNCTION__, ib_qp, qhp->wq.qpid, qhp); cxfree(qhp); return 0; } static struct ib_qp *iwch_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, struct ib_udata *udata) { struct iwch_dev *rhp; struct iwch_qp *qhp; struct iwch_pd *php; struct iwch_cq *schp; struct iwch_cq *rchp; struct iwch_create_qp_resp uresp; int wqsize, sqsize, rqsize; struct iwch_ucontext *ucontext; CTR2(KTR_IW_CXGB, "%s ib_pd %p", __FUNCTION__, pd); if (attrs->qp_type != IB_QPT_RC) return ERR_PTR(-EINVAL); php = to_iwch_pd(pd); rhp = php->rhp; schp = get_chp(rhp, ((struct iwch_cq *) attrs->send_cq)->cq.cqid); rchp = get_chp(rhp, ((struct iwch_cq *) attrs->recv_cq)->cq.cqid); if (!schp || !rchp) return ERR_PTR(-EINVAL); /* The RQT size must be # of entries + 1 rounded up to a power of two */ rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr); if (rqsize == attrs->cap.max_recv_wr) rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr+1); /* T3 doesn't support RQT depth < 16 */ if (rqsize < 16) rqsize = 16; if (rqsize > T3_MAX_RQ_SIZE) return ERR_PTR(-EINVAL); if (attrs->cap.max_inline_data > T3_MAX_INLINE) return ERR_PTR(-EINVAL); /* * NOTE: The SQ and total WQ sizes don't need to be * a power of two. However, all the code assumes * they are. EG: Q_FREECNT() and friends. */ sqsize = roundup_pow_of_two(attrs->cap.max_send_wr); wqsize = roundup_pow_of_two(rqsize + sqsize); CTR4(KTR_IW_CXGB, "%s wqsize %d sqsize %d rqsize %d", __FUNCTION__, wqsize, sqsize, rqsize); qhp = malloc(sizeof(*qhp), M_DEVBUF, M_ZERO|M_NOWAIT); if (!qhp) return ERR_PTR(-ENOMEM); qhp->wq.size_log2 = ilog2(wqsize); qhp->wq.rq_size_log2 = ilog2(rqsize); qhp->wq.sq_size_log2 = ilog2(sqsize); ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL; if (cxio_create_qp(&rhp->rdev, !udata, &qhp->wq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx)) { cxfree(qhp); return ERR_PTR(-ENOMEM); } attrs->cap.max_recv_wr = rqsize - 1; attrs->cap.max_send_wr = sqsize; attrs->cap.max_inline_data = T3_MAX_INLINE; qhp->rhp = rhp; qhp->attr.pd = php->pdid; qhp->attr.scq = ((struct iwch_cq *) attrs->send_cq)->cq.cqid; qhp->attr.rcq = ((struct iwch_cq *) attrs->recv_cq)->cq.cqid; qhp->attr.sq_num_entries = attrs->cap.max_send_wr; qhp->attr.rq_num_entries = attrs->cap.max_recv_wr; qhp->attr.sq_max_sges = attrs->cap.max_send_sge; qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge; qhp->attr.rq_max_sges = attrs->cap.max_recv_sge; qhp->attr.state = IWCH_QP_STATE_IDLE; qhp->attr.next_state = IWCH_QP_STATE_IDLE; /* * XXX - These don't get passed in from the openib user * at create time. The CM sets them via a QP modify. * Need to fix... I think the CM should */ qhp->attr.enable_rdma_read = 1; qhp->attr.enable_rdma_write = 1; qhp->attr.enable_bind = 1; qhp->attr.max_ord = 1; qhp->attr.max_ird = 1; mtx_init(&qhp->lock, "cxgb qp", NULL, MTX_DEF|MTX_DUPOK); qhp->refcnt = 1; if (insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid)) { cxio_destroy_qp(&rhp->rdev, &qhp->wq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx); cxfree(qhp); return ERR_PTR(-ENOMEM); } if (udata) { struct iwch_mm_entry *mm1, *mm2; mm1 = kmalloc(sizeof *mm1, M_NOWAIT); if (!mm1) { iwch_destroy_qp(&qhp->ibqp); return ERR_PTR(-ENOMEM); } mm2 = kmalloc(sizeof *mm2, M_NOWAIT); if (!mm2) { cxfree(mm1); iwch_destroy_qp(&qhp->ibqp); return ERR_PTR(-ENOMEM); } uresp.qpid = qhp->wq.qpid; uresp.size_log2 = qhp->wq.size_log2; uresp.sq_size_log2 = qhp->wq.sq_size_log2; uresp.rq_size_log2 = qhp->wq.rq_size_log2; mtx_lock(&ucontext->mmap_lock); uresp.key = ucontext->key; ucontext->key += PAGE_SIZE; uresp.db_key = ucontext->key; ucontext->key += PAGE_SIZE; mtx_unlock(&ucontext->mmap_lock); if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { cxfree(mm1); cxfree(mm2); iwch_destroy_qp(&qhp->ibqp); return ERR_PTR(-EFAULT); } mm1->key = uresp.key; mm1->addr = vtophys(qhp->wq.queue); mm1->len = PAGE_ALIGN(wqsize * sizeof (union t3_wr)); insert_mmap(ucontext, mm1); mm2->key = uresp.db_key; mm2->addr = qhp->wq.udb & PAGE_MASK; mm2->len = PAGE_SIZE; insert_mmap(ucontext, mm2); } qhp->ibqp.qp_num = qhp->wq.qpid; callout_init(&(qhp->timer), 1); CTR6(KTR_IW_CXGB, "sq_num_entries %d, rq_num_entries %d " "qpid 0x%0x qhp %p dma_addr 0x%llx size %d", qhp->attr.sq_num_entries, qhp->attr.rq_num_entries, qhp->wq.qpid, qhp, (unsigned long long) qhp->wq.dma_addr, 1 << qhp->wq.size_log2); return &qhp->ibqp; } static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { struct iwch_dev *rhp; struct iwch_qp *qhp; enum iwch_qp_attr_mask mask = 0; struct iwch_qp_attributes attrs; CTR2(KTR_IW_CXGB, "%s ib_qp %p", __FUNCTION__, ibqp); /* iwarp does not support the RTR state */ if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR)) attr_mask &= ~IB_QP_STATE; /* Make sure we still have something left to do */ if (!attr_mask) return 0; memset(&attrs, 0, sizeof attrs); qhp = to_iwch_qp(ibqp); rhp = qhp->rhp; attrs.next_state = iwch_convert_state(attr->qp_state); attrs.enable_rdma_read = (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) ? 1 : 0; attrs.enable_rdma_write = (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0; mask |= (attr_mask & IB_QP_STATE) ? IWCH_QP_ATTR_NEXT_STATE : 0; mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ? (IWCH_QP_ATTR_ENABLE_RDMA_READ | IWCH_QP_ATTR_ENABLE_RDMA_WRITE | IWCH_QP_ATTR_ENABLE_RDMA_BIND) : 0; return iwch_modify_qp(rhp, qhp, mask, &attrs, 0); } void iwch_qp_add_ref(struct ib_qp *qp) { CTR2(KTR_IW_CXGB, "%s ib_qp %p", __FUNCTION__, qp); mtx_lock(&to_iwch_qp(qp)->lock); to_iwch_qp(qp)->refcnt++; mtx_unlock(&to_iwch_qp(qp)->lock); } void iwch_qp_rem_ref(struct ib_qp *qp) { CTR2(KTR_IW_CXGB, "%s ib_qp %p", __FUNCTION__, qp); mtx_lock(&to_iwch_qp(qp)->lock); if (--to_iwch_qp(qp)->refcnt == 0) wakeup(to_iwch_qp(qp)); mtx_unlock(&to_iwch_qp(qp)->lock); } static struct ib_qp *iwch_get_qp(struct ib_device *dev, int qpn) { CTR3(KTR_IW_CXGB, "%s ib_dev %p qpn 0x%x", __FUNCTION__, dev, qpn); return (struct ib_qp *)get_qhp(to_iwch_dev(dev), qpn); } static int iwch_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 * pkey) { CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); *pkey = 0; return 0; } static int iwch_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct iwch_dev *dev; struct port_info *pi; struct adapter *sc; CTR5(KTR_IW_CXGB, "%s ibdev %p, port %d, index %d, gid %p", __FUNCTION__, ibdev, port, index, gid); dev = to_iwch_dev(ibdev); sc = dev->rdev.adap; PANIC_IF(port == 0 || port > 2); pi = &sc->port[port - 1]; memset(&(gid->raw[0]), 0, sizeof(gid->raw)); memcpy(&(gid->raw[0]), pi->hw_addr, 6); return 0; } static int iwch_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { struct iwch_dev *dev; struct adapter *sc; CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); dev = to_iwch_dev(ibdev); sc = dev->rdev.adap; memset(props, 0, sizeof *props); memcpy(&props->sys_image_guid, sc->port[0].hw_addr, 6); props->device_cap_flags = dev->device_cap_flags; props->page_size_cap = dev->attr.mem_pgsizes_bitmask; props->vendor_id = pci_get_vendor(sc->dev); props->vendor_part_id = pci_get_device(sc->dev); props->max_mr_size = dev->attr.max_mr_size; props->max_qp = dev->attr.max_qps; props->max_qp_wr = dev->attr.max_wrs; props->max_sge = dev->attr.max_sge_per_wr; props->max_sge_rd = 1; props->max_qp_rd_atom = dev->attr.max_rdma_reads_per_qp; props->max_qp_init_rd_atom = dev->attr.max_rdma_reads_per_qp; props->max_cq = dev->attr.max_cqs; props->max_cqe = dev->attr.max_cqes_per_cq; props->max_mr = dev->attr.max_mem_regs; props->max_pd = dev->attr.max_pds; props->local_ca_ack_delay = 0; return 0; } static int iwch_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); memset(props, 0, sizeof(struct ib_port_attr)); props->max_mtu = IB_MTU_4096; props->active_mtu = IB_MTU_2048; props->state = IB_PORT_ACTIVE; props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_REINIT_SUP | IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; props->gid_tbl_len = 1; props->pkey_tbl_len = 1; props->active_width = 2; props->active_speed = 2; props->max_msg_sz = -1; return 0; } int iwch_register_device(struct iwch_dev *dev) { int ret; struct adapter *sc = dev->rdev.adap; CTR2(KTR_IW_CXGB, "%s iwch_dev %p", __FUNCTION__, dev); strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX); memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); memcpy(&dev->ibdev.node_guid, sc->port[0].hw_addr, 6); dev->device_cap_flags = (IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW); dev->ibdev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_POLL_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_POST_SEND) | (1ull << IB_USER_VERBS_CMD_POST_RECV); dev->ibdev.node_type = RDMA_NODE_RNIC; memcpy(dev->ibdev.node_desc, IWCH_NODE_DESC, sizeof(IWCH_NODE_DESC)); dev->ibdev.phys_port_cnt = sc->params.nports; dev->ibdev.num_comp_vectors = 1; dev->ibdev.dma_device = dev->rdev.adap->dev; dev->ibdev.query_device = iwch_query_device; dev->ibdev.query_port = iwch_query_port; dev->ibdev.modify_port = iwch_modify_port; dev->ibdev.query_pkey = iwch_query_pkey; dev->ibdev.query_gid = iwch_query_gid; dev->ibdev.alloc_ucontext = iwch_alloc_ucontext; dev->ibdev.dealloc_ucontext = iwch_dealloc_ucontext; dev->ibdev.mmap = iwch_mmap; dev->ibdev.alloc_pd = iwch_allocate_pd; dev->ibdev.dealloc_pd = iwch_deallocate_pd; dev->ibdev.create_ah = iwch_ah_create; dev->ibdev.destroy_ah = iwch_ah_destroy; dev->ibdev.create_qp = iwch_create_qp; dev->ibdev.modify_qp = iwch_ib_modify_qp; dev->ibdev.destroy_qp = iwch_destroy_qp; dev->ibdev.create_cq = iwch_create_cq; dev->ibdev.destroy_cq = iwch_destroy_cq; dev->ibdev.resize_cq = iwch_resize_cq; dev->ibdev.poll_cq = iwch_poll_cq; dev->ibdev.get_dma_mr = iwch_get_dma_mr; dev->ibdev.reg_phys_mr = iwch_register_phys_mem; dev->ibdev.rereg_phys_mr = iwch_reregister_phys_mem; dev->ibdev.reg_user_mr = iwch_reg_user_mr; dev->ibdev.dereg_mr = iwch_dereg_mr; dev->ibdev.alloc_mw = iwch_alloc_mw; dev->ibdev.bind_mw = iwch_bind_mw; dev->ibdev.dealloc_mw = iwch_dealloc_mw; dev->ibdev.attach_mcast = iwch_multicast_attach; dev->ibdev.detach_mcast = iwch_multicast_detach; dev->ibdev.process_mad = iwch_process_mad; dev->ibdev.req_notify_cq = iwch_arm_cq; dev->ibdev.post_send = iwch_post_send; dev->ibdev.post_recv = iwch_post_receive; dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION; dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), M_NOWAIT); if (!dev->ibdev.iwcm) return (ENOMEM); dev->ibdev.iwcm->connect = iwch_connect; dev->ibdev.iwcm->accept = iwch_accept_cr; dev->ibdev.iwcm->reject = iwch_reject_cr; - dev->ibdev.iwcm->create_listen = iwch_create_listen; - dev->ibdev.iwcm->destroy_listen = iwch_destroy_listen; + dev->ibdev.iwcm->create_listen_ep = iwch_create_listen_ep; + dev->ibdev.iwcm->destroy_listen_ep = iwch_destroy_listen_ep; + dev->ibdev.iwcm->newconn = process_newconn; dev->ibdev.iwcm->add_ref = iwch_qp_add_ref; dev->ibdev.iwcm->rem_ref = iwch_qp_rem_ref; dev->ibdev.iwcm->get_qp = iwch_get_qp; ret = ib_register_device(&dev->ibdev, NULL); if (ret) goto bail1; return (0); bail1: cxfree(dev->ibdev.iwcm); return (ret); } void iwch_unregister_device(struct iwch_dev *dev) { ib_unregister_device(&dev->ibdev); cxfree(dev->ibdev.iwcm); return; } #endif Index: head/sys/dev/cxgbe/iw_cxgbe/cm.c =================================================================== --- head/sys/dev/cxgbe/iw_cxgbe/cm.c (revision 294609) +++ head/sys/dev/cxgbe/iw_cxgbe/cm.c (revision 294610) @@ -1,2441 +1,2406 @@ /* - * Copyright (c) 2009-2013 Chelsio, Inc. All rights reserved. + * Copyright (c) 2009-2013, 2016 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct sge_iq; struct rss_header; #include #include "offload.h" #include "tom/t4_tom.h" #define TOEPCB(so) ((struct toepcb *)(so_sototcpcb((so))->t_toe)) #include "iw_cxgbe.h" #include #include #include #include #include #include static spinlock_t req_lock; static TAILQ_HEAD(c4iw_ep_list, c4iw_ep_common) req_list; static struct work_struct c4iw_task; static struct workqueue_struct *c4iw_taskq; static LIST_HEAD(timeout_list); static spinlock_t timeout_lock; static void process_req(struct work_struct *ctx); static void start_ep_timer(struct c4iw_ep *ep); static void stop_ep_timer(struct c4iw_ep *ep); static int set_tcpinfo(struct c4iw_ep *ep); static enum c4iw_ep_state state_read(struct c4iw_ep_common *epc); static void __state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state tostate); static void state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state tostate); static void *alloc_ep(int size, gfp_t flags); void __free_ep(struct c4iw_ep_common *epc); static int find_route(__be32 local_ip, __be32 peer_ip, __be16 local_port, __be16 peer_port, u8 tos, struct nhop4_extended *pnh4); static int close_socket(struct c4iw_ep_common *epc, int close); static int shutdown_socket(struct c4iw_ep_common *epc); static void abort_socket(struct c4iw_ep *ep); static void send_mpa_req(struct c4iw_ep *ep); static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen); static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen); static void close_complete_upcall(struct c4iw_ep *ep, int status); static int abort_connection(struct c4iw_ep *ep); static void peer_close_upcall(struct c4iw_ep *ep); static void peer_abort_upcall(struct c4iw_ep *ep); static void connect_reply_upcall(struct c4iw_ep *ep, int status); static int connect_request_upcall(struct c4iw_ep *ep); static void established_upcall(struct c4iw_ep *ep); static void process_mpa_reply(struct c4iw_ep *ep); static void process_mpa_request(struct c4iw_ep *ep); static void process_peer_close(struct c4iw_ep *ep); static void process_conn_error(struct c4iw_ep *ep); static void process_close_complete(struct c4iw_ep *ep); static void ep_timeout(unsigned long arg); static void init_sock(struct c4iw_ep_common *epc); static void process_data(struct c4iw_ep *ep); static void process_connected(struct c4iw_ep *ep); -static struct socket * dequeue_socket(struct socket *head, struct sockaddr_in **remote, struct c4iw_ep *child_ep); -static void process_newconn(struct c4iw_ep *parent_ep); static int c4iw_so_upcall(struct socket *so, void *arg, int waitflag); static void process_socket_event(struct c4iw_ep *ep); static void release_ep_resources(struct c4iw_ep *ep); #define START_EP_TIMER(ep) \ do { \ CTR3(KTR_IW_CXGBE, "start_ep_timer (%s:%d) ep %p", \ __func__, __LINE__, (ep)); \ start_ep_timer(ep); \ } while (0) #define STOP_EP_TIMER(ep) \ do { \ CTR3(KTR_IW_CXGBE, "stop_ep_timer (%s:%d) ep %p", \ __func__, __LINE__, (ep)); \ stop_ep_timer(ep); \ } while (0) #ifdef KTR static char *states[] = { "idle", "listen", "connecting", "mpa_wait_req", "mpa_req_sent", "mpa_req_rcvd", "mpa_rep_sent", "fpdu_mode", "aborting", "closing", "moribund", "dead", NULL, }; #endif static void process_req(struct work_struct *ctx) { struct c4iw_ep_common *epc; spin_lock(&req_lock); while (!TAILQ_EMPTY(&req_list)) { epc = TAILQ_FIRST(&req_list); TAILQ_REMOVE(&req_list, epc, entry); epc->entry.tqe_prev = NULL; spin_unlock(&req_lock); if (epc->so) process_socket_event((struct c4iw_ep *)epc); c4iw_put_ep(epc); spin_lock(&req_lock); } spin_unlock(&req_lock); } /* * XXX: doesn't belong here in the iWARP driver. * XXX: assumes that the connection was offloaded by cxgbe/t4_tom if TF_TOE is * set. Is this a valid assumption for active open? */ static int set_tcpinfo(struct c4iw_ep *ep) { struct socket *so = ep->com.so; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; struct toepcb *toep; int rc = 0; INP_WLOCK(inp); tp = intotcpcb(inp); if ((tp->t_flags & TF_TOE) == 0) { rc = EINVAL; log(LOG_ERR, "%s: connection not offloaded (so %p, ep %p)\n", __func__, so, ep); goto done; } toep = TOEPCB(so); ep->hwtid = toep->tid; ep->snd_seq = tp->snd_nxt; ep->rcv_seq = tp->rcv_nxt; ep->emss = max(tp->t_maxseg, 128); done: INP_WUNLOCK(inp); return (rc); } static int find_route(__be32 local_ip, __be32 peer_ip, __be16 local_port, __be16 peer_port, u8 tos, struct nhop4_extended *pnh4) { struct in_addr addr; int err; CTR5(KTR_IW_CXGBE, "%s:frtB %x, %x, %d, %d", __func__, local_ip, peer_ip, ntohs(local_port), ntohs(peer_port)); addr.s_addr = peer_ip; err = fib4_lookup_nh_ext(RT_DEFAULT_FIB, addr, NHR_REF, 0, pnh4); CTR2(KTR_IW_CXGBE, "%s:frtE %d", __func__, err); return err; } static int close_socket(struct c4iw_ep_common *epc, int close) { struct socket *so = epc->so; int rc; CTR4(KTR_IW_CXGBE, "%s: so %p, ep %p, state %s", __func__, epc, so, states[epc->state]); SOCK_LOCK(so); soupcall_clear(so, SO_RCV); SOCK_UNLOCK(so); if (close) rc = soclose(so); else rc = soshutdown(so, SHUT_WR | SHUT_RD); epc->so = NULL; return (rc); } static int shutdown_socket(struct c4iw_ep_common *epc) { CTR4(KTR_IW_CXGBE, "%s: so %p, ep %p, state %s", __func__, epc->so, epc, states[epc->state]); return (soshutdown(epc->so, SHUT_WR)); } static void abort_socket(struct c4iw_ep *ep) { struct sockopt sopt; int rc; struct linger l; CTR4(KTR_IW_CXGBE, "%s ep %p so %p state %s", __func__, ep, ep->com.so, states[ep->com.state]); l.l_onoff = 1; l.l_linger = 0; /* linger_time of 0 forces RST to be sent */ sopt.sopt_dir = SOPT_SET; sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_LINGER; sopt.sopt_val = (caddr_t)&l; sopt.sopt_valsize = sizeof l; sopt.sopt_td = NULL; rc = sosetopt(ep->com.so, &sopt); if (rc) { log(LOG_ERR, "%s: can't set linger to 0, no RST! err %d\n", __func__, rc); } } static void process_peer_close(struct c4iw_ep *ep) { struct c4iw_qp_attributes attrs; int disconnect = 1; int release = 0; CTR4(KTR_IW_CXGBE, "%s:ppcB ep %p so %p state %s", __func__, ep, ep->com.so, states[ep->com.state]); mutex_lock(&ep->com.mutex); switch (ep->com.state) { case MPA_REQ_WAIT: CTR2(KTR_IW_CXGBE, "%s:ppc1 %p MPA_REQ_WAIT CLOSING", __func__, ep); __state_set(&ep->com, CLOSING); break; case MPA_REQ_SENT: CTR2(KTR_IW_CXGBE, "%s:ppc2 %p MPA_REQ_SENT CLOSING", __func__, ep); __state_set(&ep->com, DEAD); connect_reply_upcall(ep, -ECONNABORTED); disconnect = 0; STOP_EP_TIMER(ep); close_socket(&ep->com, 0); ep->com.cm_id->rem_ref(ep->com.cm_id); ep->com.cm_id = NULL; ep->com.qp = NULL; release = 1; break; case MPA_REQ_RCVD: /* * We're gonna mark this puppy DEAD, but keep * the reference on it until the ULP accepts or * rejects the CR. */ CTR2(KTR_IW_CXGBE, "%s:ppc3 %p MPA_REQ_RCVD CLOSING", __func__, ep); __state_set(&ep->com, CLOSING); c4iw_get_ep(&ep->com); break; case MPA_REP_SENT: CTR2(KTR_IW_CXGBE, "%s:ppc4 %p MPA_REP_SENT CLOSING", __func__, ep); __state_set(&ep->com, CLOSING); break; case FPDU_MODE: CTR2(KTR_IW_CXGBE, "%s:ppc5 %p FPDU_MODE CLOSING", __func__, ep); START_EP_TIMER(ep); __state_set(&ep->com, CLOSING); attrs.next_state = C4IW_QP_STATE_CLOSING; c4iw_modify_qp(ep->com.dev, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); peer_close_upcall(ep); break; case ABORTING: CTR2(KTR_IW_CXGBE, "%s:ppc6 %p ABORTING (disconn)", __func__, ep); disconnect = 0; break; case CLOSING: CTR2(KTR_IW_CXGBE, "%s:ppc7 %p CLOSING MORIBUND", __func__, ep); __state_set(&ep->com, MORIBUND); disconnect = 0; break; case MORIBUND: CTR2(KTR_IW_CXGBE, "%s:ppc8 %p MORIBUND DEAD", __func__, ep); STOP_EP_TIMER(ep); if (ep->com.cm_id && ep->com.qp) { attrs.next_state = C4IW_QP_STATE_IDLE; c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } close_socket(&ep->com, 0); close_complete_upcall(ep, 0); __state_set(&ep->com, DEAD); release = 1; disconnect = 0; break; case DEAD: CTR2(KTR_IW_CXGBE, "%s:ppc9 %p DEAD (disconn)", __func__, ep); disconnect = 0; break; default: panic("%s: ep %p state %d", __func__, ep, ep->com.state); break; } mutex_unlock(&ep->com.mutex); if (disconnect) { CTR2(KTR_IW_CXGBE, "%s:ppca %p", __func__, ep); c4iw_ep_disconnect(ep, 0, M_NOWAIT); } if (release) { CTR2(KTR_IW_CXGBE, "%s:ppcb %p", __func__, ep); c4iw_put_ep(&ep->com); } CTR2(KTR_IW_CXGBE, "%s:ppcE %p", __func__, ep); return; } static void process_conn_error(struct c4iw_ep *ep) { struct c4iw_qp_attributes attrs; int ret; int state; state = state_read(&ep->com); CTR5(KTR_IW_CXGBE, "%s:pceB ep %p so %p so->so_error %u state %s", __func__, ep, ep->com.so, ep->com.so->so_error, states[ep->com.state]); switch (state) { case MPA_REQ_WAIT: STOP_EP_TIMER(ep); break; case MPA_REQ_SENT: STOP_EP_TIMER(ep); connect_reply_upcall(ep, -ECONNRESET); break; case MPA_REP_SENT: ep->com.rpl_err = ECONNRESET; CTR1(KTR_IW_CXGBE, "waking up ep %p", ep); break; case MPA_REQ_RCVD: /* * We're gonna mark this puppy DEAD, but keep * the reference on it until the ULP accepts or * rejects the CR. */ c4iw_get_ep(&ep->com); break; case MORIBUND: case CLOSING: STOP_EP_TIMER(ep); /*FALLTHROUGH*/ case FPDU_MODE: if (ep->com.cm_id && ep->com.qp) { attrs.next_state = C4IW_QP_STATE_ERROR; ret = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); if (ret) log(LOG_ERR, "%s - qp <- error failed!\n", __func__); } peer_abort_upcall(ep); break; case ABORTING: break; case DEAD: CTR2(KTR_IW_CXGBE, "%s so_error %d IN DEAD STATE!!!!", __func__, ep->com.so->so_error); return; default: panic("%s: ep %p state %d", __func__, ep, state); break; } if (state != ABORTING) { CTR2(KTR_IW_CXGBE, "%s:pce1 %p", __func__, ep); close_socket(&ep->com, 0); state_set(&ep->com, DEAD); c4iw_put_ep(&ep->com); } CTR2(KTR_IW_CXGBE, "%s:pceE %p", __func__, ep); return; } static void process_close_complete(struct c4iw_ep *ep) { struct c4iw_qp_attributes attrs; int release = 0; CTR4(KTR_IW_CXGBE, "%s:pccB ep %p so %p state %s", __func__, ep, ep->com.so, states[ep->com.state]); /* The cm_id may be null if we failed to connect */ mutex_lock(&ep->com.mutex); switch (ep->com.state) { case CLOSING: CTR2(KTR_IW_CXGBE, "%s:pcc1 %p CLOSING MORIBUND", __func__, ep); __state_set(&ep->com, MORIBUND); break; case MORIBUND: CTR2(KTR_IW_CXGBE, "%s:pcc1 %p MORIBUND DEAD", __func__, ep); STOP_EP_TIMER(ep); if ((ep->com.cm_id) && (ep->com.qp)) { CTR2(KTR_IW_CXGBE, "%s:pcc2 %p QP_STATE_IDLE", __func__, ep); attrs.next_state = C4IW_QP_STATE_IDLE; c4iw_modify_qp(ep->com.dev, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } if (ep->parent_ep) { CTR2(KTR_IW_CXGBE, "%s:pcc3 %p", __func__, ep); close_socket(&ep->com, 1); } else { CTR2(KTR_IW_CXGBE, "%s:pcc4 %p", __func__, ep); close_socket(&ep->com, 0); } close_complete_upcall(ep, 0); __state_set(&ep->com, DEAD); release = 1; break; case ABORTING: CTR2(KTR_IW_CXGBE, "%s:pcc5 %p ABORTING", __func__, ep); break; case DEAD: default: CTR2(KTR_IW_CXGBE, "%s:pcc6 %p DEAD", __func__, ep); panic("%s:pcc6 %p DEAD", __func__, ep); break; } mutex_unlock(&ep->com.mutex); if (release) { CTR2(KTR_IW_CXGBE, "%s:pcc7 %p", __func__, ep); c4iw_put_ep(&ep->com); } CTR2(KTR_IW_CXGBE, "%s:pccE %p", __func__, ep); return; } static void init_sock(struct c4iw_ep_common *epc) { int rc; struct sockopt sopt; struct socket *so = epc->so; int on = 1; SOCK_LOCK(so); soupcall_set(so, SO_RCV, c4iw_so_upcall, epc); so->so_state |= SS_NBIO; SOCK_UNLOCK(so); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_TCP; sopt.sopt_name = TCP_NODELAY; sopt.sopt_val = (caddr_t)&on; sopt.sopt_valsize = sizeof on; sopt.sopt_td = NULL; rc = sosetopt(so, &sopt); if (rc) { log(LOG_ERR, "%s: can't set TCP_NODELAY on so %p (%d)\n", __func__, so, rc); } } static void process_data(struct c4iw_ep *ep) { struct sockaddr_in *local, *remote; CTR5(KTR_IW_CXGBE, "%s: so %p, ep %p, state %s, sbused %d", __func__, ep->com.so, ep, states[ep->com.state], sbused(&ep->com.so->so_rcv)); switch (state_read(&ep->com)) { case MPA_REQ_SENT: process_mpa_reply(ep); break; case MPA_REQ_WAIT: in_getsockaddr(ep->com.so, (struct sockaddr **)&local); in_getpeeraddr(ep->com.so, (struct sockaddr **)&remote); ep->com.local_addr = *local; ep->com.remote_addr = *remote; free(local, M_SONAME); free(remote, M_SONAME); process_mpa_request(ep); break; default: if (sbused(&ep->com.so->so_rcv)) log(LOG_ERR, "%s: Unexpected streaming data. ep %p, " "state %d, so %p, so_state 0x%x, sbused %u\n", __func__, ep, state_read(&ep->com), ep->com.so, ep->com.so->so_state, sbused(&ep->com.so->so_rcv)); break; } } static void process_connected(struct c4iw_ep *ep) { if ((ep->com.so->so_state & SS_ISCONNECTED) && !ep->com.so->so_error) send_mpa_req(ep); else { connect_reply_upcall(ep, -ep->com.so->so_error); close_socket(&ep->com, 0); state_set(&ep->com, DEAD); c4iw_put_ep(&ep->com); } } -static struct socket * -dequeue_socket(struct socket *head, struct sockaddr_in **remote, - struct c4iw_ep *child_ep) +void +process_newconn(struct iw_cm_id *parent_cm_id, struct socket *child_so) { - struct socket *so; - - ACCEPT_LOCK(); - so = TAILQ_FIRST(&head->so_comp); - if (!so) { - ACCEPT_UNLOCK(); - return (NULL); - } - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - SOCK_LOCK(so); - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - soref(so); - soupcall_set(so, SO_RCV, c4iw_so_upcall, child_ep); - so->so_state |= SS_NBIO; - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); - soaccept(so, (struct sockaddr **)remote); - - return (so); -} - -static void -process_newconn(struct c4iw_ep *parent_ep) -{ - struct socket *child_so; struct c4iw_ep *child_ep; + struct sockaddr_in *local; struct sockaddr_in *remote; + struct c4iw_ep *parent_ep = parent_cm_id->provider_data; + if (!child_so) { + CTR4(KTR_IW_CXGBE, + "%s: parent so %p, parent ep %p, child so %p, invalid so", + __func__, parent_ep->com.so, parent_ep, child_so); + log(LOG_ERR, "%s: invalid child socket\n", __func__); + return; + } child_ep = alloc_ep(sizeof(*child_ep), M_NOWAIT); if (!child_ep) { CTR3(KTR_IW_CXGBE, "%s: parent so %p, parent ep %p, ENOMEM", __func__, parent_ep->com.so, parent_ep); log(LOG_ERR, "%s: failed to allocate ep entry\n", __func__); return; } + SOCKBUF_LOCK(&child_so->so_rcv); + soupcall_set(child_so, SO_RCV, c4iw_so_upcall, child_ep); + SOCKBUF_UNLOCK(&child_so->so_rcv); - child_so = dequeue_socket(parent_ep->com.so, &remote, child_ep); - if (!child_so) { - CTR4(KTR_IW_CXGBE, - "%s: parent so %p, parent ep %p, child ep %p, dequeue err", - __func__, parent_ep->com.so, parent_ep, child_ep); - log(LOG_ERR, "%s: failed to dequeue child socket\n", __func__); - __free_ep(&child_ep->com); - return; - - } - CTR5(KTR_IW_CXGBE, "%s: parent so %p, parent ep %p, child so %p, child ep %p", __func__, parent_ep->com.so, parent_ep, child_so, child_ep); - child_ep->com.local_addr = parent_ep->com.local_addr; + in_getsockaddr(child_so, (struct sockaddr **)&local); + in_getpeeraddr(child_so, (struct sockaddr **)&remote); + + child_ep->com.local_addr = *local; child_ep->com.remote_addr = *remote; child_ep->com.dev = parent_ep->com.dev; child_ep->com.so = child_so; child_ep->com.cm_id = NULL; child_ep->com.thread = parent_ep->com.thread; child_ep->parent_ep = parent_ep; + free(local, M_SONAME); free(remote, M_SONAME); + c4iw_get_ep(&parent_ep->com); - child_ep->parent_ep = parent_ep; init_timer(&child_ep->timer); state_set(&child_ep->com, MPA_REQ_WAIT); START_EP_TIMER(child_ep); /* maybe the request has already been queued up on the socket... */ process_mpa_request(child_ep); + return; } static int c4iw_so_upcall(struct socket *so, void *arg, int waitflag) { struct c4iw_ep *ep = arg; spin_lock(&req_lock); CTR6(KTR_IW_CXGBE, "%s: so %p, so_state 0x%x, ep %p, ep_state %s, tqe_prev %p", __func__, so, so->so_state, ep, states[ep->com.state], ep->com.entry.tqe_prev); if (ep && ep->com.so && !ep->com.entry.tqe_prev) { KASSERT(ep->com.so == so, ("%s: XXX review.", __func__)); c4iw_get_ep(&ep->com); TAILQ_INSERT_TAIL(&req_list, &ep->com, entry); queue_work(c4iw_taskq, &c4iw_task); } spin_unlock(&req_lock); return (SU_OK); } static void process_socket_event(struct c4iw_ep *ep) { int state = state_read(&ep->com); struct socket *so = ep->com.so; CTR6(KTR_IW_CXGBE, "process_socket_event: so %p, so_state 0x%x, " "so_err %d, sb_state 0x%x, ep %p, ep_state %s", so, so->so_state, so->so_error, so->so_rcv.sb_state, ep, states[state]); if (state == CONNECTING) { process_connected(ep); return; } if (state == LISTEN) { - process_newconn(ep); + /* socket listening events are handled at IWCM */ + CTR3(KTR_IW_CXGBE, "%s Invalid ep state:%u, ep:%p", __func__, + ep->com.state, ep); + BUG(); return; } /* connection error */ if (so->so_error) { process_conn_error(ep); return; } /* peer close */ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) && state < CLOSING) { process_peer_close(ep); return; } /* close complete */ if (so->so_state & SS_ISDISCONNECTED) { process_close_complete(ep); return; } /* rx data */ process_data(ep); } SYSCTL_NODE(_hw, OID_AUTO, iw_cxgbe, CTLFLAG_RD, 0, "iw_cxgbe driver parameters"); int db_delay_usecs = 1; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, db_delay_usecs, CTLFLAG_RWTUN, &db_delay_usecs, 0, "Usecs to delay awaiting db fifo to drain"); static int dack_mode = 1; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, dack_mode, CTLFLAG_RWTUN, &dack_mode, 0, "Delayed ack mode (default = 1)"); int c4iw_max_read_depth = 8; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, c4iw_max_read_depth, CTLFLAG_RWTUN, &c4iw_max_read_depth, 0, "Per-connection max ORD/IRD (default = 8)"); static int enable_tcp_timestamps; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, enable_tcp_timestamps, CTLFLAG_RWTUN, &enable_tcp_timestamps, 0, "Enable tcp timestamps (default = 0)"); static int enable_tcp_sack; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, enable_tcp_sack, CTLFLAG_RWTUN, &enable_tcp_sack, 0, "Enable tcp SACK (default = 0)"); static int enable_tcp_window_scaling = 1; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, enable_tcp_window_scaling, CTLFLAG_RWTUN, &enable_tcp_window_scaling, 0, "Enable tcp window scaling (default = 1)"); int c4iw_debug = 1; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, c4iw_debug, CTLFLAG_RWTUN, &c4iw_debug, 0, "Enable debug logging (default = 0)"); static int peer2peer; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, peer2peer, CTLFLAG_RWTUN, &peer2peer, 0, "Support peer2peer ULPs (default = 0)"); static int p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, p2p_type, CTLFLAG_RWTUN, &p2p_type, 0, "RDMAP opcode to use for the RTR message: 1 = RDMA_READ 0 = RDMA_WRITE (default 1)"); static int ep_timeout_secs = 60; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, ep_timeout_secs, CTLFLAG_RWTUN, &ep_timeout_secs, 0, "CM Endpoint operation timeout in seconds (default = 60)"); static int mpa_rev = 1; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, mpa_rev, CTLFLAG_RWTUN, &mpa_rev, 0, "MPA Revision, 0 supports amso1100, 1 is RFC5044 spec compliant, 2 is IETF MPA Peer Connect Draft compliant (default = 1)"); static int markers_enabled; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, markers_enabled, CTLFLAG_RWTUN, &markers_enabled, 0, "Enable MPA MARKERS (default(0) = disabled)"); static int crc_enabled = 1; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, crc_enabled, CTLFLAG_RWTUN, &crc_enabled, 0, "Enable MPA CRC (default(1) = enabled)"); static int rcv_win = 256 * 1024; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, rcv_win, CTLFLAG_RWTUN, &rcv_win, 0, "TCP receive window in bytes (default = 256KB)"); static int snd_win = 128 * 1024; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, snd_win, CTLFLAG_RWTUN, &snd_win, 0, "TCP send window in bytes (default = 128KB)"); int db_fc_threshold = 2000; SYSCTL_INT(_hw_iw_cxgbe, OID_AUTO, db_fc_threshold, CTLFLAG_RWTUN, &db_fc_threshold, 0, "QP count/threshold that triggers automatic"); static void start_ep_timer(struct c4iw_ep *ep) { if (timer_pending(&ep->timer)) { CTR2(KTR_IW_CXGBE, "%s: ep %p, already started", __func__, ep); printk(KERN_ERR "%s timer already started! ep %p\n", __func__, ep); return; } clear_bit(TIMEOUT, &ep->com.flags); c4iw_get_ep(&ep->com); ep->timer.expires = jiffies + ep_timeout_secs * HZ; ep->timer.data = (unsigned long)ep; ep->timer.function = ep_timeout; add_timer(&ep->timer); } static void stop_ep_timer(struct c4iw_ep *ep) { del_timer_sync(&ep->timer); if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { c4iw_put_ep(&ep->com); } } static enum c4iw_ep_state state_read(struct c4iw_ep_common *epc) { enum c4iw_ep_state state; mutex_lock(&epc->mutex); state = epc->state; mutex_unlock(&epc->mutex); return (state); } static void __state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new) { epc->state = new; } static void state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new) { mutex_lock(&epc->mutex); __state_set(epc, new); mutex_unlock(&epc->mutex); } static void * alloc_ep(int size, gfp_t gfp) { struct c4iw_ep_common *epc; epc = kzalloc(size, gfp); if (epc == NULL) return (NULL); kref_init(&epc->kref); mutex_init(&epc->mutex); c4iw_init_wr_wait(&epc->wr_wait); return (epc); } void __free_ep(struct c4iw_ep_common *epc) { CTR2(KTR_IW_CXGBE, "%s:feB %p", __func__, epc); KASSERT(!epc->so, ("%s warning ep->so %p \n", __func__, epc->so)); KASSERT(!epc->entry.tqe_prev, ("%s epc %p still on req list!\n", __func__, epc)); free(epc, M_DEVBUF); CTR2(KTR_IW_CXGBE, "%s:feE %p", __func__, epc); } void _c4iw_free_ep(struct kref *kref) { struct c4iw_ep *ep; struct c4iw_ep_common *epc; ep = container_of(kref, struct c4iw_ep, com.kref); epc = &ep->com; - KASSERT(!epc->so, ("%s ep->so %p", __func__, epc->so)); KASSERT(!epc->entry.tqe_prev, ("%s epc %p still on req list", __func__, epc)); kfree(ep); } static void release_ep_resources(struct c4iw_ep *ep) { CTR2(KTR_IW_CXGBE, "%s:rerB %p", __func__, ep); set_bit(RELEASE_RESOURCES, &ep->com.flags); c4iw_put_ep(&ep->com); CTR2(KTR_IW_CXGBE, "%s:rerE %p", __func__, ep); } static void send_mpa_req(struct c4iw_ep *ep) { int mpalen; struct mpa_message *mpa; struct mpa_v2_conn_params mpa_v2_params; struct mbuf *m; char mpa_rev_to_use = mpa_rev; int err; if (ep->retry_with_mpa_v1) mpa_rev_to_use = 1; mpalen = sizeof(*mpa) + ep->plen; if (mpa_rev_to_use == 2) mpalen += sizeof(struct mpa_v2_conn_params); mpa = malloc(mpalen, M_CXGBE, M_NOWAIT); if (mpa == NULL) { failed: connect_reply_upcall(ep, -ENOMEM); return; } memset(mpa, 0, mpalen); memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); mpa->flags = (crc_enabled ? MPA_CRC : 0) | (markers_enabled ? MPA_MARKERS : 0) | (mpa_rev_to_use == 2 ? MPA_ENHANCED_RDMA_CONN : 0); mpa->private_data_size = htons(ep->plen); mpa->revision = mpa_rev_to_use; if (mpa_rev_to_use == 1) { ep->tried_with_mpa_v1 = 1; ep->retry_with_mpa_v1 = 0; } if (mpa_rev_to_use == 2) { mpa->private_data_size += htons(sizeof(struct mpa_v2_conn_params)); mpa_v2_params.ird = htons((u16)ep->ird); mpa_v2_params.ord = htons((u16)ep->ord); if (peer2peer) { mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL); if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE) { mpa_v2_params.ord |= htons(MPA_V2_RDMA_WRITE_RTR); } else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) { mpa_v2_params.ord |= htons(MPA_V2_RDMA_READ_RTR); } } memcpy(mpa->private_data, &mpa_v2_params, sizeof(struct mpa_v2_conn_params)); if (ep->plen) { memcpy(mpa->private_data + sizeof(struct mpa_v2_conn_params), ep->mpa_pkt + sizeof(*mpa), ep->plen); } } else { if (ep->plen) memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen); CTR2(KTR_IW_CXGBE, "%s:smr7 %p", __func__, ep); } m = m_getm(NULL, mpalen, M_NOWAIT, MT_DATA); if (m == NULL) { free(mpa, M_CXGBE); goto failed; } m_copyback(m, 0, mpalen, (void *)mpa); free(mpa, M_CXGBE); err = sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); if (err) goto failed; START_EP_TIMER(ep); state_set(&ep->com, MPA_REQ_SENT); ep->mpa_attr.initiator = 1; } static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen) { int mpalen ; struct mpa_message *mpa; struct mpa_v2_conn_params mpa_v2_params; struct mbuf *m; int err; CTR4(KTR_IW_CXGBE, "%s:smrejB %p %u %d", __func__, ep, ep->hwtid, ep->plen); mpalen = sizeof(*mpa) + plen; if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { mpalen += sizeof(struct mpa_v2_conn_params); CTR4(KTR_IW_CXGBE, "%s:smrej1 %p %u %d", __func__, ep, ep->mpa_attr.version, mpalen); } mpa = malloc(mpalen, M_CXGBE, M_NOWAIT); if (mpa == NULL) return (-ENOMEM); memset(mpa, 0, mpalen); memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); mpa->flags = MPA_REJECT; mpa->revision = mpa_rev; mpa->private_data_size = htons(plen); if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { mpa->flags |= MPA_ENHANCED_RDMA_CONN; mpa->private_data_size += htons(sizeof(struct mpa_v2_conn_params)); mpa_v2_params.ird = htons(((u16)ep->ird) | (peer2peer ? MPA_V2_PEER2PEER_MODEL : 0)); mpa_v2_params.ord = htons(((u16)ep->ord) | (peer2peer ? (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE ? MPA_V2_RDMA_WRITE_RTR : p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ ? MPA_V2_RDMA_READ_RTR : 0) : 0)); memcpy(mpa->private_data, &mpa_v2_params, sizeof(struct mpa_v2_conn_params)); if (ep->plen) memcpy(mpa->private_data + sizeof(struct mpa_v2_conn_params), pdata, plen); CTR5(KTR_IW_CXGBE, "%s:smrej3 %p %d %d %d", __func__, ep, mpa_v2_params.ird, mpa_v2_params.ord, ep->plen); } else if (plen) memcpy(mpa->private_data, pdata, plen); m = m_getm(NULL, mpalen, M_NOWAIT, MT_DATA); if (m == NULL) { free(mpa, M_CXGBE); return (-ENOMEM); } m_copyback(m, 0, mpalen, (void *)mpa); free(mpa, M_CXGBE); err = -sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); if (!err) ep->snd_seq += mpalen; CTR4(KTR_IW_CXGBE, "%s:smrejE %p %u %d", __func__, ep, ep->hwtid, err); return err; } static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen) { int mpalen; struct mpa_message *mpa; struct mbuf *m; struct mpa_v2_conn_params mpa_v2_params; int err; CTR2(KTR_IW_CXGBE, "%s:smrepB %p", __func__, ep); mpalen = sizeof(*mpa) + plen; if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { CTR3(KTR_IW_CXGBE, "%s:smrep1 %p %d", __func__, ep, ep->mpa_attr.version); mpalen += sizeof(struct mpa_v2_conn_params); } mpa = malloc(mpalen, M_CXGBE, M_NOWAIT); if (mpa == NULL) return (-ENOMEM); memset(mpa, 0, sizeof(*mpa)); memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) | (markers_enabled ? MPA_MARKERS : 0); mpa->revision = ep->mpa_attr.version; mpa->private_data_size = htons(plen); if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { mpa->flags |= MPA_ENHANCED_RDMA_CONN; mpa->private_data_size += htons(sizeof(struct mpa_v2_conn_params)); mpa_v2_params.ird = htons((u16)ep->ird); mpa_v2_params.ord = htons((u16)ep->ord); CTR5(KTR_IW_CXGBE, "%s:smrep3 %p %d %d %d", __func__, ep, ep->mpa_attr.version, mpa_v2_params.ird, mpa_v2_params.ord); if (peer2peer && (ep->mpa_attr.p2p_type != FW_RI_INIT_P2PTYPE_DISABLED)) { mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL); if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE) { mpa_v2_params.ord |= htons(MPA_V2_RDMA_WRITE_RTR); CTR5(KTR_IW_CXGBE, "%s:smrep4 %p %d %d %d", __func__, ep, p2p_type, mpa_v2_params.ird, mpa_v2_params.ord); } else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) { mpa_v2_params.ord |= htons(MPA_V2_RDMA_READ_RTR); CTR5(KTR_IW_CXGBE, "%s:smrep5 %p %d %d %d", __func__, ep, p2p_type, mpa_v2_params.ird, mpa_v2_params.ord); } } memcpy(mpa->private_data, &mpa_v2_params, sizeof(struct mpa_v2_conn_params)); if (ep->plen) memcpy(mpa->private_data + sizeof(struct mpa_v2_conn_params), pdata, plen); } else if (plen) memcpy(mpa->private_data, pdata, plen); m = m_getm(NULL, mpalen, M_NOWAIT, MT_DATA); if (m == NULL) { free(mpa, M_CXGBE); return (-ENOMEM); } m_copyback(m, 0, mpalen, (void *)mpa); free(mpa, M_CXGBE); state_set(&ep->com, MPA_REP_SENT); ep->snd_seq += mpalen; err = -sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); CTR3(KTR_IW_CXGBE, "%s:smrepE %p %d", __func__, ep, err); return err; } static void close_complete_upcall(struct c4iw_ep *ep, int status) { struct iw_cm_event event; CTR2(KTR_IW_CXGBE, "%s:ccuB %p", __func__, ep); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CLOSE; event.status = status; if (ep->com.cm_id) { CTR2(KTR_IW_CXGBE, "%s:ccu1 %1", __func__, ep); ep->com.cm_id->event_handler(ep->com.cm_id, &event); ep->com.cm_id->rem_ref(ep->com.cm_id); ep->com.cm_id = NULL; ep->com.qp = NULL; set_bit(CLOSE_UPCALL, &ep->com.history); } CTR2(KTR_IW_CXGBE, "%s:ccuE %p", __func__, ep); } static int abort_connection(struct c4iw_ep *ep) { int err; CTR2(KTR_IW_CXGBE, "%s:abB %p", __func__, ep); state_set(&ep->com, ABORTING); abort_socket(ep); err = close_socket(&ep->com, 0); set_bit(ABORT_CONN, &ep->com.history); CTR2(KTR_IW_CXGBE, "%s:abE %p", __func__, ep); return err; } static void peer_close_upcall(struct c4iw_ep *ep) { struct iw_cm_event event; CTR2(KTR_IW_CXGBE, "%s:pcuB %p", __func__, ep); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_DISCONNECT; if (ep->com.cm_id) { CTR2(KTR_IW_CXGBE, "%s:pcu1 %p", __func__, ep); ep->com.cm_id->event_handler(ep->com.cm_id, &event); set_bit(DISCONN_UPCALL, &ep->com.history); } CTR2(KTR_IW_CXGBE, "%s:pcuE %p", __func__, ep); } static void peer_abort_upcall(struct c4iw_ep *ep) { struct iw_cm_event event; CTR2(KTR_IW_CXGBE, "%s:pauB %p", __func__, ep); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CLOSE; event.status = -ECONNRESET; if (ep->com.cm_id) { CTR2(KTR_IW_CXGBE, "%s:pau1 %p", __func__, ep); ep->com.cm_id->event_handler(ep->com.cm_id, &event); ep->com.cm_id->rem_ref(ep->com.cm_id); ep->com.cm_id = NULL; ep->com.qp = NULL; set_bit(ABORT_UPCALL, &ep->com.history); } CTR2(KTR_IW_CXGBE, "%s:pauE %p", __func__, ep); } static void connect_reply_upcall(struct c4iw_ep *ep, int status) { struct iw_cm_event event; CTR3(KTR_IW_CXGBE, "%s:cruB %p", __func__, ep, status); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CONNECT_REPLY; event.status = (status ==-ECONNABORTED)?-ECONNRESET: status; event.local_addr = ep->com.local_addr; event.remote_addr = ep->com.remote_addr; if ((status == 0) || (status == -ECONNREFUSED)) { if (!ep->tried_with_mpa_v1) { CTR2(KTR_IW_CXGBE, "%s:cru1 %p", __func__, ep); /* this means MPA_v2 is used */ event.private_data_len = ep->plen - sizeof(struct mpa_v2_conn_params); event.private_data = ep->mpa_pkt + sizeof(struct mpa_message) + sizeof(struct mpa_v2_conn_params); } else { CTR2(KTR_IW_CXGBE, "%s:cru2 %p", __func__, ep); /* this means MPA_v1 is used */ event.private_data_len = ep->plen; event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); } } if (ep->com.cm_id) { CTR2(KTR_IW_CXGBE, "%s:cru3 %p", __func__, ep); set_bit(CONN_RPL_UPCALL, &ep->com.history); ep->com.cm_id->event_handler(ep->com.cm_id, &event); } if(status == -ECONNABORTED) { CTR3(KTR_IW_CXGBE, "%s:cruE %p %d", __func__, ep, status); return; } if (status < 0) { CTR3(KTR_IW_CXGBE, "%s:cru4 %p %d", __func__, ep, status); ep->com.cm_id->rem_ref(ep->com.cm_id); ep->com.cm_id = NULL; ep->com.qp = NULL; } CTR2(KTR_IW_CXGBE, "%s:cruE %p", __func__, ep); } static int connect_request_upcall(struct c4iw_ep *ep) { struct iw_cm_event event; int ret; CTR3(KTR_IW_CXGBE, "%s: ep %p, mpa_v1 %d", __func__, ep, ep->tried_with_mpa_v1); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_CONNECT_REQUEST; event.local_addr = ep->com.local_addr; event.remote_addr = ep->com.remote_addr; event.provider_data = ep; event.so = ep->com.so; if (!ep->tried_with_mpa_v1) { /* this means MPA_v2 is used */ event.ord = ep->ord; event.ird = ep->ird; event.private_data_len = ep->plen - sizeof(struct mpa_v2_conn_params); event.private_data = ep->mpa_pkt + sizeof(struct mpa_message) + sizeof(struct mpa_v2_conn_params); } else { /* this means MPA_v1 is used. Send max supported */ event.ord = c4iw_max_read_depth; event.ird = c4iw_max_read_depth; event.private_data_len = ep->plen; event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); } c4iw_get_ep(&ep->com); ret = ep->parent_ep->com.cm_id->event_handler(ep->parent_ep->com.cm_id, &event); if(ret) c4iw_put_ep(&ep->com); set_bit(CONNREQ_UPCALL, &ep->com.history); c4iw_put_ep(&ep->parent_ep->com); return ret; } static void established_upcall(struct c4iw_ep *ep) { struct iw_cm_event event; CTR2(KTR_IW_CXGBE, "%s:euB %p", __func__, ep); memset(&event, 0, sizeof(event)); event.event = IW_CM_EVENT_ESTABLISHED; event.ird = ep->ird; event.ord = ep->ord; if (ep->com.cm_id) { CTR2(KTR_IW_CXGBE, "%s:eu1 %p", __func__, ep); ep->com.cm_id->event_handler(ep->com.cm_id, &event); set_bit(ESTAB_UPCALL, &ep->com.history); } CTR2(KTR_IW_CXGBE, "%s:euE %p", __func__, ep); } static void process_mpa_reply(struct c4iw_ep *ep) { struct mpa_message *mpa; struct mpa_v2_conn_params *mpa_v2_params; u16 plen; u16 resp_ird, resp_ord; u8 rtr_mismatch = 0, insuff_ird = 0; struct c4iw_qp_attributes attrs; enum c4iw_qp_attr_mask mask; int err; struct mbuf *top, *m; int flags = MSG_DONTWAIT; struct uio uio; CTR2(KTR_IW_CXGBE, "%s:pmrB %p", __func__, ep); /* * Stop mpa timer. If it expired, then the state has * changed and we bail since ep_timeout already aborted * the connection. */ STOP_EP_TIMER(ep); if (state_read(&ep->com) != MPA_REQ_SENT) return; uio.uio_resid = 1000000; uio.uio_td = ep->com.thread; err = soreceive(ep->com.so, NULL, &uio, &top, NULL, &flags); if (err) { if (err == EWOULDBLOCK) { CTR2(KTR_IW_CXGBE, "%s:pmr1 %p", __func__, ep); START_EP_TIMER(ep); return; } err = -err; CTR2(KTR_IW_CXGBE, "%s:pmr2 %p", __func__, ep); goto err; } if (ep->com.so->so_rcv.sb_mb) { CTR2(KTR_IW_CXGBE, "%s:pmr3 %p", __func__, ep); printf("%s data after soreceive called! so %p sb_mb %p top %p\n", __func__, ep->com.so, ep->com.so->so_rcv.sb_mb, top); } m = top; do { CTR2(KTR_IW_CXGBE, "%s:pmr4 %p", __func__, ep); /* * If we get more than the supported amount of private data * then we must fail this connection. */ if (ep->mpa_pkt_len + m->m_len > sizeof(ep->mpa_pkt)) { CTR3(KTR_IW_CXGBE, "%s:pmr5 %p %d", __func__, ep, ep->mpa_pkt_len + m->m_len); err = (-EINVAL); goto err; } /* * copy the new data into our accumulation buffer. */ m_copydata(m, 0, m->m_len, &(ep->mpa_pkt[ep->mpa_pkt_len])); ep->mpa_pkt_len += m->m_len; if (!m->m_next) m = m->m_nextpkt; else m = m->m_next; } while (m); m_freem(top); /* * if we don't even have the mpa message, then bail. */ if (ep->mpa_pkt_len < sizeof(*mpa)) return; mpa = (struct mpa_message *) ep->mpa_pkt; /* Validate MPA header. */ if (mpa->revision > mpa_rev) { CTR4(KTR_IW_CXGBE, "%s:pmr6 %p %d %d", __func__, ep, mpa->revision, mpa_rev); printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d, " " Received = %d\n", __func__, mpa_rev, mpa->revision); err = -EPROTO; goto err; } if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) { CTR2(KTR_IW_CXGBE, "%s:pmr7 %p", __func__, ep); err = -EPROTO; goto err; } plen = ntohs(mpa->private_data_size); /* * Fail if there's too much private data. */ if (plen > MPA_MAX_PRIVATE_DATA) { CTR2(KTR_IW_CXGBE, "%s:pmr8 %p", __func__, ep); err = -EPROTO; goto err; } /* * If plen does not account for pkt size */ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { CTR2(KTR_IW_CXGBE, "%s:pmr9 %p", __func__, ep); err = -EPROTO; goto err; } ep->plen = (u8) plen; /* * If we don't have all the pdata yet, then bail. * We'll continue process when more data arrives. */ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) { CTR2(KTR_IW_CXGBE, "%s:pmra %p", __func__, ep); return; } if (mpa->flags & MPA_REJECT) { CTR2(KTR_IW_CXGBE, "%s:pmrb %p", __func__, ep); err = -ECONNREFUSED; goto err; } /* * If we get here we have accumulated the entire mpa * start reply message including private data. And * the MPA header is valid. */ state_set(&ep->com, FPDU_MODE); ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; ep->mpa_attr.version = mpa->revision; ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; if (mpa->revision == 2) { CTR2(KTR_IW_CXGBE, "%s:pmrc %p", __func__, ep); ep->mpa_attr.enhanced_rdma_conn = mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0; if (ep->mpa_attr.enhanced_rdma_conn) { CTR2(KTR_IW_CXGBE, "%s:pmrd %p", __func__, ep); mpa_v2_params = (struct mpa_v2_conn_params *) (ep->mpa_pkt + sizeof(*mpa)); resp_ird = ntohs(mpa_v2_params->ird) & MPA_V2_IRD_ORD_MASK; resp_ord = ntohs(mpa_v2_params->ord) & MPA_V2_IRD_ORD_MASK; /* * This is a double-check. Ideally, below checks are * not required since ird/ord stuff has been taken * care of in c4iw_accept_cr */ if ((ep->ird < resp_ord) || (ep->ord > resp_ird)) { CTR2(KTR_IW_CXGBE, "%s:pmre %p", __func__, ep); err = -ENOMEM; ep->ird = resp_ord; ep->ord = resp_ird; insuff_ird = 1; } if (ntohs(mpa_v2_params->ird) & MPA_V2_PEER2PEER_MODEL) { CTR2(KTR_IW_CXGBE, "%s:pmrf %p", __func__, ep); if (ntohs(mpa_v2_params->ord) & MPA_V2_RDMA_WRITE_RTR) { CTR2(KTR_IW_CXGBE, "%s:pmrg %p", __func__, ep); ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_RDMA_WRITE; } else if (ntohs(mpa_v2_params->ord) & MPA_V2_RDMA_READ_RTR) { CTR2(KTR_IW_CXGBE, "%s:pmrh %p", __func__, ep); ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ; } } } } else { CTR2(KTR_IW_CXGBE, "%s:pmri %p", __func__, ep); if (mpa->revision == 1) { CTR2(KTR_IW_CXGBE, "%s:pmrj %p", __func__, ep); if (peer2peer) { CTR2(KTR_IW_CXGBE, "%s:pmrk %p", __func__, ep); ep->mpa_attr.p2p_type = p2p_type; } } } if (set_tcpinfo(ep)) { CTR2(KTR_IW_CXGBE, "%s:pmrl %p", __func__, ep); printf("%s set_tcpinfo error\n", __func__); goto err; } CTR6(KTR_IW_CXGBE, "%s - crc_enabled = %d, recv_marker_enabled = %d, " "xmit_marker_enabled = %d, version = %d p2p_type = %d", __func__, ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version, ep->mpa_attr.p2p_type); /* * If responder's RTR does not match with that of initiator, assign * FW_RI_INIT_P2PTYPE_DISABLED in mpa attributes so that RTR is not * generated when moving QP to RTS state. * A TERM message will be sent after QP has moved to RTS state */ if ((ep->mpa_attr.version == 2) && peer2peer && (ep->mpa_attr.p2p_type != p2p_type)) { CTR2(KTR_IW_CXGBE, "%s:pmrm %p", __func__, ep); ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; rtr_mismatch = 1; } //ep->ofld_txq = TOEPCB(ep->com.so)->ofld_txq; attrs.mpa_attr = ep->mpa_attr; attrs.max_ird = ep->ird; attrs.max_ord = ep->ord; attrs.llp_stream_handle = ep; attrs.next_state = C4IW_QP_STATE_RTS; mask = C4IW_QP_ATTR_NEXT_STATE | C4IW_QP_ATTR_LLP_STREAM_HANDLE | C4IW_QP_ATTR_MPA_ATTR | C4IW_QP_ATTR_MAX_IRD | C4IW_QP_ATTR_MAX_ORD; /* bind QP and TID with INIT_WR */ err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, mask, &attrs, 1); if (err) { CTR2(KTR_IW_CXGBE, "%s:pmrn %p", __func__, ep); goto err; } /* * If responder's RTR requirement did not match with what initiator * supports, generate TERM message */ if (rtr_mismatch) { CTR2(KTR_IW_CXGBE, "%s:pmro %p", __func__, ep); printk(KERN_ERR "%s: RTR mismatch, sending TERM\n", __func__); attrs.layer_etype = LAYER_MPA | DDP_LLP; attrs.ecode = MPA_NOMATCH_RTR; attrs.next_state = C4IW_QP_STATE_TERMINATE; err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); err = -ENOMEM; goto out; } /* * Generate TERM if initiator IRD is not sufficient for responder * provided ORD. Currently, we do the same behaviour even when * responder provided IRD is also not sufficient as regards to * initiator ORD. */ if (insuff_ird) { CTR2(KTR_IW_CXGBE, "%s:pmrp %p", __func__, ep); printk(KERN_ERR "%s: Insufficient IRD, sending TERM\n", __func__); attrs.layer_etype = LAYER_MPA | DDP_LLP; attrs.ecode = MPA_INSUFF_IRD; attrs.next_state = C4IW_QP_STATE_TERMINATE; err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); err = -ENOMEM; goto out; } goto out; err: state_set(&ep->com, ABORTING); abort_connection(ep); out: connect_reply_upcall(ep, err); CTR2(KTR_IW_CXGBE, "%s:pmrE %p", __func__, ep); return; } static void process_mpa_request(struct c4iw_ep *ep) { struct mpa_message *mpa; u16 plen; int flags = MSG_DONTWAIT; int rc; struct iovec iov; struct uio uio; enum c4iw_ep_state state = state_read(&ep->com); CTR3(KTR_IW_CXGBE, "%s: ep %p, state %s", __func__, ep, states[state]); if (state != MPA_REQ_WAIT) return; iov.iov_base = &ep->mpa_pkt[ep->mpa_pkt_len]; iov.iov_len = sizeof(ep->mpa_pkt) - ep->mpa_pkt_len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = sizeof(ep->mpa_pkt) - ep->mpa_pkt_len; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = NULL; /* uio.uio_td = ep->com.thread; */ rc = soreceive(ep->com.so, NULL, &uio, NULL, NULL, &flags); if (rc == EAGAIN) return; else if (rc) { abort: STOP_EP_TIMER(ep); abort_connection(ep); return; } KASSERT(uio.uio_offset > 0, ("%s: sorecieve on so %p read no data", __func__, ep->com.so)); ep->mpa_pkt_len += uio.uio_offset; /* * If we get more than the supported amount of private data then we must * fail this connection. XXX: check so_rcv->sb_cc, or peek with another * soreceive, or increase the size of mpa_pkt by 1 and abort if the last * byte is filled by the soreceive above. */ /* Don't even have the MPA message. Wait for more data to arrive. */ if (ep->mpa_pkt_len < sizeof(*mpa)) return; mpa = (struct mpa_message *) ep->mpa_pkt; /* * Validate MPA Header. */ if (mpa->revision > mpa_rev) { log(LOG_ERR, "%s: MPA version mismatch. Local = %d," " Received = %d\n", __func__, mpa_rev, mpa->revision); goto abort; } if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) goto abort; /* * Fail if there's too much private data. */ plen = ntohs(mpa->private_data_size); if (plen > MPA_MAX_PRIVATE_DATA) goto abort; /* * If plen does not account for pkt size */ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) goto abort; ep->plen = (u8) plen; /* * If we don't have all the pdata yet, then bail. */ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) return; /* * If we get here we have accumulated the entire mpa * start reply message including private data. */ ep->mpa_attr.initiator = 0; ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; ep->mpa_attr.version = mpa->revision; if (mpa->revision == 1) ep->tried_with_mpa_v1 = 1; ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; if (mpa->revision == 2) { ep->mpa_attr.enhanced_rdma_conn = mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0; if (ep->mpa_attr.enhanced_rdma_conn) { struct mpa_v2_conn_params *mpa_v2_params; u16 ird, ord; mpa_v2_params = (void *)&ep->mpa_pkt[sizeof(*mpa)]; ird = ntohs(mpa_v2_params->ird); ord = ntohs(mpa_v2_params->ord); ep->ird = ird & MPA_V2_IRD_ORD_MASK; ep->ord = ord & MPA_V2_IRD_ORD_MASK; if (ird & MPA_V2_PEER2PEER_MODEL && peer2peer) { if (ord & MPA_V2_RDMA_WRITE_RTR) { ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_RDMA_WRITE; } else if (ord & MPA_V2_RDMA_READ_RTR) { ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ; } } } } else if (mpa->revision == 1 && peer2peer) ep->mpa_attr.p2p_type = p2p_type; if (set_tcpinfo(ep)) goto abort; CTR5(KTR_IW_CXGBE, "%s: crc_enabled = %d, recv_marker_enabled = %d, " "xmit_marker_enabled = %d, version = %d", __func__, ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); state_set(&ep->com, MPA_REQ_RCVD); STOP_EP_TIMER(ep); /* drive upcall */ mutex_lock(&ep->parent_ep->com.mutex); if (ep->parent_ep->com.state != DEAD) { if(connect_request_upcall(ep)) { abort_connection(ep); } }else abort_connection(ep); mutex_unlock(&ep->parent_ep->com.mutex); } /* * Upcall from the adapter indicating data has been transmitted. * For us its just the single MPA request or reply. We can now free * the skb holding the mpa message. */ int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) { int err; struct c4iw_ep *ep = to_ep(cm_id); CTR2(KTR_IW_CXGBE, "%s:crcB %p", __func__, ep); if (state_read(&ep->com) == DEAD) { CTR2(KTR_IW_CXGBE, "%s:crc1 %p", __func__, ep); c4iw_put_ep(&ep->com); return -ECONNRESET; } set_bit(ULP_REJECT, &ep->com.history); BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); if (mpa_rev == 0) { CTR2(KTR_IW_CXGBE, "%s:crc2 %p", __func__, ep); abort_connection(ep); } else { CTR2(KTR_IW_CXGBE, "%s:crc3 %p", __func__, ep); err = send_mpa_reject(ep, pdata, pdata_len); err = soshutdown(ep->com.so, 3); } c4iw_put_ep(&ep->com); CTR2(KTR_IW_CXGBE, "%s:crc4 %p", __func__, ep); return 0; } int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) { int err; struct c4iw_qp_attributes attrs; enum c4iw_qp_attr_mask mask; struct c4iw_ep *ep = to_ep(cm_id); struct c4iw_dev *h = to_c4iw_dev(cm_id->device); struct c4iw_qp *qp = get_qhp(h, conn_param->qpn); CTR2(KTR_IW_CXGBE, "%s:cacB %p", __func__, ep); if (state_read(&ep->com) == DEAD) { CTR2(KTR_IW_CXGBE, "%s:cac1 %p", __func__, ep); err = -ECONNRESET; goto err; } BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); BUG_ON(!qp); set_bit(ULP_ACCEPT, &ep->com.history); if ((conn_param->ord > c4iw_max_read_depth) || (conn_param->ird > c4iw_max_read_depth)) { CTR2(KTR_IW_CXGBE, "%s:cac2 %p", __func__, ep); abort_connection(ep); err = -EINVAL; goto err; } if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { CTR2(KTR_IW_CXGBE, "%s:cac3 %p", __func__, ep); if (conn_param->ord > ep->ird) { CTR2(KTR_IW_CXGBE, "%s:cac4 %p", __func__, ep); ep->ird = conn_param->ird; ep->ord = conn_param->ord; send_mpa_reject(ep, conn_param->private_data, conn_param->private_data_len); abort_connection(ep); err = -ENOMEM; goto err; } if (conn_param->ird > ep->ord) { CTR2(KTR_IW_CXGBE, "%s:cac5 %p", __func__, ep); if (!ep->ord) { CTR2(KTR_IW_CXGBE, "%s:cac6 %p", __func__, ep); conn_param->ird = 1; } else { CTR2(KTR_IW_CXGBE, "%s:cac7 %p", __func__, ep); abort_connection(ep); err = -ENOMEM; goto err; } } } ep->ird = conn_param->ird; ep->ord = conn_param->ord; if (ep->mpa_attr.version != 2) { CTR2(KTR_IW_CXGBE, "%s:cac8 %p", __func__, ep); if (peer2peer && ep->ird == 0) { CTR2(KTR_IW_CXGBE, "%s:cac9 %p", __func__, ep); ep->ird = 1; } } cm_id->add_ref(cm_id); ep->com.cm_id = cm_id; ep->com.qp = qp; //ep->ofld_txq = TOEPCB(ep->com.so)->ofld_txq; /* bind QP to EP and move to RTS */ attrs.mpa_attr = ep->mpa_attr; attrs.max_ird = ep->ird; attrs.max_ord = ep->ord; attrs.llp_stream_handle = ep; attrs.next_state = C4IW_QP_STATE_RTS; /* bind QP and TID with INIT_WR */ mask = C4IW_QP_ATTR_NEXT_STATE | C4IW_QP_ATTR_LLP_STREAM_HANDLE | C4IW_QP_ATTR_MPA_ATTR | C4IW_QP_ATTR_MAX_IRD | C4IW_QP_ATTR_MAX_ORD; err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, mask, &attrs, 1); if (err) { CTR2(KTR_IW_CXGBE, "%s:caca %p", __func__, ep); goto err1; } err = send_mpa_reply(ep, conn_param->private_data, conn_param->private_data_len); if (err) { CTR2(KTR_IW_CXGBE, "%s:caca %p", __func__, ep); goto err1; } state_set(&ep->com, FPDU_MODE); established_upcall(ep); c4iw_put_ep(&ep->com); CTR2(KTR_IW_CXGBE, "%s:cacE %p", __func__, ep); return 0; err1: ep->com.cm_id = NULL; ep->com.qp = NULL; cm_id->rem_ref(cm_id); err: c4iw_put_ep(&ep->com); CTR2(KTR_IW_CXGBE, "%s:cacE err %p", __func__, ep); return err; } int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) { int err = 0; struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); struct c4iw_ep *ep = NULL; struct nhop4_extended nh4; struct toedev *tdev; CTR2(KTR_IW_CXGBE, "%s:ccB %p", __func__, cm_id); if ((conn_param->ord > c4iw_max_read_depth) || (conn_param->ird > c4iw_max_read_depth)) { CTR2(KTR_IW_CXGBE, "%s:cc1 %p", __func__, cm_id); err = -EINVAL; goto out; } ep = alloc_ep(sizeof(*ep), M_NOWAIT); if (!ep) { CTR2(KTR_IW_CXGBE, "%s:cc2 %p", __func__, cm_id); printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__); err = -ENOMEM; goto out; } init_timer(&ep->timer); ep->plen = conn_param->private_data_len; if (ep->plen) { CTR2(KTR_IW_CXGBE, "%s:cc3 %p", __func__, ep); memcpy(ep->mpa_pkt + sizeof(struct mpa_message), conn_param->private_data, ep->plen); } ep->ird = conn_param->ird; ep->ord = conn_param->ord; if (peer2peer && ep->ord == 0) { CTR2(KTR_IW_CXGBE, "%s:cc4 %p", __func__, ep); ep->ord = 1; } cm_id->add_ref(cm_id); ep->com.dev = dev; ep->com.cm_id = cm_id; ep->com.qp = get_qhp(dev, conn_param->qpn); if (!ep->com.qp) { CTR2(KTR_IW_CXGBE, "%s:cc5 %p", __func__, ep); err = -EINVAL; goto fail2; } ep->com.thread = curthread; ep->com.so = cm_id->so; init_sock(&ep->com); /* find a route */ err = find_route( cm_id->local_addr.sin_addr.s_addr, cm_id->remote_addr.sin_addr.s_addr, cm_id->local_addr.sin_port, cm_id->remote_addr.sin_port, 0, &nh4); if (err) { CTR2(KTR_IW_CXGBE, "%s:cc7 %p", __func__, ep); printk(KERN_ERR MOD "%s - cannot find route.\n", __func__); err = -EHOSTUNREACH; goto fail2; } if (!(nh4.nh_ifp->if_capenable & IFCAP_TOE)) { CTR2(KTR_IW_CXGBE, "%s:cc8 %p", __func__, ep); printf("%s - interface not TOE capable.\n", __func__); close_socket(&ep->com, 0); err = -ENOPROTOOPT; goto fail3; } tdev = TOEDEV(nh4.nh_ifp); if (tdev == NULL) { CTR2(KTR_IW_CXGBE, "%s:cc9 %p", __func__, ep); printf("%s - No toedev for interface.\n", __func__); goto fail3; } fib4_free_nh_ext(RT_DEFAULT_FIB, &nh4); state_set(&ep->com, CONNECTING); ep->tos = 0; ep->com.local_addr = cm_id->local_addr; ep->com.remote_addr = cm_id->remote_addr; err = soconnect(ep->com.so, (struct sockaddr *)&ep->com.remote_addr, ep->com.thread); if (!err) { CTR2(KTR_IW_CXGBE, "%s:cca %p", __func__, ep); goto out; } else { close_socket(&ep->com, 0); goto fail2; } fail3: CTR2(KTR_IW_CXGBE, "%s:ccb %p", __func__, ep); fib4_free_nh_ext(RT_DEFAULT_FIB, &nh4); fail2: cm_id->rem_ref(cm_id); c4iw_put_ep(&ep->com); out: CTR2(KTR_IW_CXGBE, "%s:ccE %p", __func__, ep); return err; } /* - * iwcm->create_listen. Returns -errno on failure. + * iwcm->create_listen_ep. Returns -errno on failure. */ int -c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) +c4iw_create_listen_ep(struct iw_cm_id *cm_id, int backlog) { int rc; struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); struct c4iw_listen_ep *ep; struct socket *so = cm_id->so; ep = alloc_ep(sizeof(*ep), GFP_KERNEL); CTR5(KTR_IW_CXGBE, "%s: cm_id %p, lso %p, ep %p, inp %p", __func__, cm_id, so, ep, so->so_pcb); if (ep == NULL) { log(LOG_ERR, "%s: failed to alloc memory for endpoint\n", __func__); rc = ENOMEM; goto failed; } cm_id->add_ref(cm_id); ep->com.cm_id = cm_id; ep->com.dev = dev; ep->backlog = backlog; ep->com.local_addr = cm_id->local_addr; ep->com.thread = curthread; state_set(&ep->com, LISTEN); ep->com.so = so; - init_sock(&ep->com); - rc = solisten(so, ep->backlog, ep->com.thread); - if (rc != 0) { - log(LOG_ERR, "%s: failed to start listener: %d\n", __func__, - rc); - close_socket(&ep->com, 0); - cm_id->rem_ref(cm_id); - c4iw_put_ep(&ep->com); - goto failed; - } - cm_id->provider_data = ep; return (0); failed: CTR3(KTR_IW_CXGBE, "%s: cm_id %p, FAILED (%d)", __func__, cm_id, rc); return (-rc); } -int -c4iw_destroy_listen(struct iw_cm_id *cm_id) +void +c4iw_destroy_listen_ep(struct iw_cm_id *cm_id) { - int rc; struct c4iw_listen_ep *ep = to_listen_ep(cm_id); - CTR4(KTR_IW_CXGBE, "%s: cm_id %p, so %p, inp %p", __func__, cm_id, - cm_id->so, cm_id->so->so_pcb); + CTR4(KTR_IW_CXGBE, "%s: cm_id %p, so %p, state %s", __func__, cm_id, + cm_id->so, states[ep->com.state]); state_set(&ep->com, DEAD); - rc = close_socket(&ep->com, 0); cm_id->rem_ref(cm_id); c4iw_put_ep(&ep->com); - return (rc); + return; } int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) { int ret = 0; int close = 0; int fatal = 0; struct c4iw_rdev *rdev; mutex_lock(&ep->com.mutex); CTR2(KTR_IW_CXGBE, "%s:cedB %p", __func__, ep); rdev = &ep->com.dev->rdev; if (c4iw_fatal_error(rdev)) { CTR2(KTR_IW_CXGBE, "%s:ced1 %p", __func__, ep); fatal = 1; close_complete_upcall(ep, -ECONNRESET); ep->com.state = DEAD; } CTR3(KTR_IW_CXGBE, "%s:ced2 %p %s", __func__, ep, states[ep->com.state]); switch (ep->com.state) { case MPA_REQ_WAIT: case MPA_REQ_SENT: case MPA_REQ_RCVD: case MPA_REP_SENT: case FPDU_MODE: close = 1; if (abrupt) ep->com.state = ABORTING; else { ep->com.state = CLOSING; START_EP_TIMER(ep); } set_bit(CLOSE_SENT, &ep->com.flags); break; case CLOSING: if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) { close = 1; if (abrupt) { STOP_EP_TIMER(ep); ep->com.state = ABORTING; } else ep->com.state = MORIBUND; } break; case MORIBUND: case ABORTING: case DEAD: CTR3(KTR_IW_CXGBE, "%s ignoring disconnect ep %p state %u", __func__, ep, ep->com.state); break; default: BUG(); break; } mutex_unlock(&ep->com.mutex); if (close) { CTR2(KTR_IW_CXGBE, "%s:ced3 %p", __func__, ep); if (abrupt) { CTR2(KTR_IW_CXGBE, "%s:ced4 %p", __func__, ep); set_bit(EP_DISC_ABORT, &ep->com.history); ret = abort_connection(ep); } else { CTR2(KTR_IW_CXGBE, "%s:ced5 %p", __func__, ep); set_bit(EP_DISC_CLOSE, &ep->com.history); if (!ep->parent_ep) __state_set(&ep->com, MORIBUND); ret = shutdown_socket(&ep->com); } if (ret) { fatal = 1; } } if (fatal) { release_ep_resources(ep); CTR2(KTR_IW_CXGBE, "%s:ced6 %p", __func__, ep); } CTR2(KTR_IW_CXGBE, "%s:cedE %p", __func__, ep); return ret; } #ifdef C4IW_EP_REDIRECT int c4iw_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, struct l2t_entry *l2t) { struct c4iw_ep *ep = ctx; if (ep->dst != old) return 0; PDBG("%s ep %p redirect to dst %p l2t %p\n", __func__, ep, new, l2t); dst_hold(new); cxgb4_l2t_release(ep->l2t); ep->l2t = l2t; dst_release(old); ep->dst = new; return 1; } #endif static void ep_timeout(unsigned long arg) { struct c4iw_ep *ep = (struct c4iw_ep *)arg; int kickit = 0; CTR2(KTR_IW_CXGBE, "%s:etB %p", __func__, ep); spin_lock(&timeout_lock); if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { list_add_tail(&ep->entry, &timeout_list); kickit = 1; } spin_unlock(&timeout_lock); if (kickit) { CTR2(KTR_IW_CXGBE, "%s:et1 %p", __func__, ep); queue_work(c4iw_taskq, &c4iw_task); } CTR2(KTR_IW_CXGBE, "%s:etE %p", __func__, ep); } static int fw6_wr_rpl(struct adapter *sc, const __be64 *rpl) { uint64_t val = be64toh(*rpl); int ret; struct c4iw_wr_wait *wr_waitp; ret = (int)((val >> 8) & 0xff); wr_waitp = (struct c4iw_wr_wait *)rpl[1]; CTR3(KTR_IW_CXGBE, "%s wr_waitp %p ret %u", __func__, wr_waitp, ret); if (wr_waitp) c4iw_wake_up(wr_waitp, ret ? -ret : 0); return (0); } static int fw6_cqe_handler(struct adapter *sc, const __be64 *rpl) { struct t4_cqe cqe =*(const struct t4_cqe *)(&rpl[0]); CTR2(KTR_IW_CXGBE, "%s rpl %p", __func__, rpl); c4iw_ev_dispatch(sc->iwarp_softc, &cqe); return (0); } static int terminate(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rdma_terminate *cpl = mtod(m, const void *); unsigned int tid = GET_TID(cpl); struct c4iw_qp_attributes attrs; struct toepcb *toep = lookup_tid(sc, tid); struct socket *so; struct c4iw_ep *ep; INP_WLOCK(toep->inp); so = inp_inpcbtosocket(toep->inp); ep = so->so_rcv.sb_upcallarg; INP_WUNLOCK(toep->inp); CTR2(KTR_IW_CXGBE, "%s:tB %p %d", __func__, ep); if (ep && ep->com.qp) { printk(KERN_WARNING MOD "TERM received tid %u qpid %u\n", tid, ep->com.qp->wq.sq.qid); attrs.next_state = C4IW_QP_STATE_TERMINATE; c4iw_modify_qp(ep->com.dev, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } else printk(KERN_WARNING MOD "TERM received tid %u no ep/qp\n", tid); CTR2(KTR_IW_CXGBE, "%s:tE %p %d", __func__, ep); return 0; } void c4iw_cm_init_cpl(struct adapter *sc) { t4_register_cpl_handler(sc, CPL_RDMA_TERMINATE, terminate); t4_register_fw_msg_handler(sc, FW6_TYPE_WR_RPL, fw6_wr_rpl); t4_register_fw_msg_handler(sc, FW6_TYPE_CQE, fw6_cqe_handler); t4_register_an_handler(sc, c4iw_ev_handler); } void c4iw_cm_term_cpl(struct adapter *sc) { t4_register_cpl_handler(sc, CPL_RDMA_TERMINATE, NULL); t4_register_fw_msg_handler(sc, FW6_TYPE_WR_RPL, NULL); t4_register_fw_msg_handler(sc, FW6_TYPE_CQE, NULL); } int __init c4iw_cm_init(void) { TAILQ_INIT(&req_list); spin_lock_init(&req_lock); INIT_LIST_HEAD(&timeout_list); spin_lock_init(&timeout_lock); INIT_WORK(&c4iw_task, process_req); c4iw_taskq = create_singlethread_workqueue("iw_cxgbe"); if (!c4iw_taskq) return -ENOMEM; return 0; } void __exit c4iw_cm_term(void) { WARN_ON(!TAILQ_EMPTY(&req_list)); WARN_ON(!list_empty(&timeout_list)); flush_workqueue(c4iw_taskq); destroy_workqueue(c4iw_taskq); } #endif Index: head/sys/dev/cxgbe/iw_cxgbe/iw_cxgbe.h =================================================================== --- head/sys/dev/cxgbe/iw_cxgbe/iw_cxgbe.h (revision 294609) +++ head/sys/dev/cxgbe/iw_cxgbe/iw_cxgbe.h (revision 294610) @@ -1,1042 +1,1044 @@ /* - * Copyright (c) 2009-2013 Chelsio, Inc. All rights reserved. + * Copyright (c) 2009-2013, 2016 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * $FreeBSD$ */ #ifndef __IW_CXGB4_H__ #define __IW_CXGB4_H__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #undef prefetch #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_tcb.h" #include "t4_l2t.h" #define DRV_NAME "iw_cxgbe" #define MOD DRV_NAME ":" #define KTR_IW_CXGBE KTR_SPARE3 extern int c4iw_debug; #define PDBG(fmt, args...) \ do { \ if (c4iw_debug) \ printf(MOD fmt, ## args); \ } while (0) #include "t4.h" static inline void *cplhdr(struct mbuf *m) { return mtod(m, void*); } #define PBL_OFF(rdev_p, a) ((a) - (rdev_p)->adap->vres.pbl.start) #define RQT_OFF(rdev_p, a) ((a) - (rdev_p)->adap->vres.rq.start) #define C4IW_ID_TABLE_F_RANDOM 1 /* Pseudo-randomize the id's returned */ #define C4IW_ID_TABLE_F_EMPTY 2 /* Table is initially empty */ struct c4iw_id_table { u32 flags; u32 start; /* logical minimal id */ u32 last; /* hint for find */ u32 max; spinlock_t lock; unsigned long *table; }; struct c4iw_resource { struct c4iw_id_table tpt_table; struct c4iw_id_table qid_table; struct c4iw_id_table pdid_table; }; struct c4iw_qid_list { struct list_head entry; u32 qid; }; struct c4iw_dev_ucontext { struct list_head qpids; struct list_head cqids; struct mutex lock; }; enum c4iw_rdev_flags { T4_FATAL_ERROR = (1<<0), }; struct c4iw_stat { u64 total; u64 cur; u64 max; u64 fail; }; struct c4iw_stats { struct mutex lock; struct c4iw_stat qid; struct c4iw_stat pd; struct c4iw_stat stag; struct c4iw_stat pbl; struct c4iw_stat rqt; u64 db_full; u64 db_empty; u64 db_drop; u64 db_state_transitions; }; struct c4iw_rdev { struct adapter *adap; struct c4iw_resource resource; unsigned long qpshift; u32 qpmask; unsigned long cqshift; u32 cqmask; struct c4iw_dev_ucontext uctx; struct gen_pool *pbl_pool; struct gen_pool *rqt_pool; u32 flags; struct c4iw_stats stats; }; static inline int c4iw_fatal_error(struct c4iw_rdev *rdev) { return rdev->flags & T4_FATAL_ERROR; } static inline int c4iw_num_stags(struct c4iw_rdev *rdev) { return min((int)T4_MAX_NUM_STAG, (int)(rdev->adap->vres.stag.size >> 5)); } #define C4IW_WR_TO (10*HZ) struct c4iw_wr_wait { int ret; atomic_t completion; }; static inline void c4iw_init_wr_wait(struct c4iw_wr_wait *wr_waitp) { wr_waitp->ret = 0; atomic_set(&wr_waitp->completion, 0); } static inline void c4iw_wake_up(struct c4iw_wr_wait *wr_waitp, int ret) { wr_waitp->ret = ret; atomic_set(&wr_waitp->completion, 1); wakeup(wr_waitp); } static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev, struct c4iw_wr_wait *wr_waitp, u32 hwtid, u32 qpid, const char *func) { struct adapter *sc = rdev->adap; unsigned to = C4IW_WR_TO; while (!atomic_read(&wr_waitp->completion)) { tsleep(wr_waitp, 0, "c4iw_wait", to); if (SIGPENDING(curthread)) { printf("%s - Device %s not responding - " "tid %u qpid %u\n", func, device_get_nameunit(sc->dev), hwtid, qpid); if (c4iw_fatal_error(rdev)) { wr_waitp->ret = -EIO; break; } to = to << 2; } } if (wr_waitp->ret) CTR4(KTR_IW_CXGBE, "%s: FW reply %d tid %u qpid %u", device_get_nameunit(sc->dev), wr_waitp->ret, hwtid, qpid); return (wr_waitp->ret); } enum db_state { NORMAL = 0, FLOW_CONTROL = 1, RECOVERY = 2 }; struct c4iw_dev { struct ib_device ibdev; struct c4iw_rdev rdev; u32 device_cap_flags; struct idr cqidr; struct idr qpidr; struct idr mmidr; spinlock_t lock; struct dentry *debugfs_root; enum db_state db_state; int qpcnt; }; static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev) { return container_of(ibdev, struct c4iw_dev, ibdev); } static inline struct c4iw_dev *rdev_to_c4iw_dev(struct c4iw_rdev *rdev) { return container_of(rdev, struct c4iw_dev, rdev); } static inline struct c4iw_cq *get_chp(struct c4iw_dev *rhp, u32 cqid) { return idr_find(&rhp->cqidr, cqid); } static inline struct c4iw_qp *get_qhp(struct c4iw_dev *rhp, u32 qpid) { return idr_find(&rhp->qpidr, qpid); } static inline struct c4iw_mr *get_mhp(struct c4iw_dev *rhp, u32 mmid) { return idr_find(&rhp->mmidr, mmid); } static inline int _insert_handle(struct c4iw_dev *rhp, struct idr *idr, void *handle, u32 id, int lock) { int ret; int newid; do { if (!idr_pre_get(idr, lock ? GFP_KERNEL : GFP_ATOMIC)) return -ENOMEM; if (lock) spin_lock_irq(&rhp->lock); ret = idr_get_new_above(idr, handle, id, &newid); BUG_ON(!ret && newid != id); if (lock) spin_unlock_irq(&rhp->lock); } while (ret == -EAGAIN); return ret; } static inline int insert_handle(struct c4iw_dev *rhp, struct idr *idr, void *handle, u32 id) { return _insert_handle(rhp, idr, handle, id, 1); } static inline int insert_handle_nolock(struct c4iw_dev *rhp, struct idr *idr, void *handle, u32 id) { return _insert_handle(rhp, idr, handle, id, 0); } static inline void _remove_handle(struct c4iw_dev *rhp, struct idr *idr, u32 id, int lock) { if (lock) spin_lock_irq(&rhp->lock); idr_remove(idr, id); if (lock) spin_unlock_irq(&rhp->lock); } static inline void remove_handle(struct c4iw_dev *rhp, struct idr *idr, u32 id) { _remove_handle(rhp, idr, id, 1); } static inline void remove_handle_nolock(struct c4iw_dev *rhp, struct idr *idr, u32 id) { _remove_handle(rhp, idr, id, 0); } struct c4iw_pd { struct ib_pd ibpd; u32 pdid; struct c4iw_dev *rhp; }; static inline struct c4iw_pd *to_c4iw_pd(struct ib_pd *ibpd) { return container_of(ibpd, struct c4iw_pd, ibpd); } struct tpt_attributes { u64 len; u64 va_fbo; enum fw_ri_mem_perms perms; u32 stag; u32 pdid; u32 qpid; u32 pbl_addr; u32 pbl_size; u32 state:1; u32 type:2; u32 rsvd:1; u32 remote_invaliate_disable:1; u32 zbva:1; u32 mw_bind_enable:1; u32 page_size:5; }; struct c4iw_mr { struct ib_mr ibmr; struct ib_umem *umem; struct c4iw_dev *rhp; u64 kva; struct tpt_attributes attr; }; static inline struct c4iw_mr *to_c4iw_mr(struct ib_mr *ibmr) { return container_of(ibmr, struct c4iw_mr, ibmr); } struct c4iw_mw { struct ib_mw ibmw; struct c4iw_dev *rhp; u64 kva; struct tpt_attributes attr; }; static inline struct c4iw_mw *to_c4iw_mw(struct ib_mw *ibmw) { return container_of(ibmw, struct c4iw_mw, ibmw); } struct c4iw_fr_page_list { struct ib_fast_reg_page_list ibpl; DECLARE_PCI_UNMAP_ADDR(mapping); dma_addr_t dma_addr; struct c4iw_dev *dev; int size; }; static inline struct c4iw_fr_page_list *to_c4iw_fr_page_list( struct ib_fast_reg_page_list *ibpl) { return container_of(ibpl, struct c4iw_fr_page_list, ibpl); } struct c4iw_cq { struct ib_cq ibcq; struct c4iw_dev *rhp; struct t4_cq cq; spinlock_t lock; spinlock_t comp_handler_lock; atomic_t refcnt; wait_queue_head_t wait; }; static inline struct c4iw_cq *to_c4iw_cq(struct ib_cq *ibcq) { return container_of(ibcq, struct c4iw_cq, ibcq); } struct c4iw_mpa_attributes { u8 initiator; u8 recv_marker_enabled; u8 xmit_marker_enabled; u8 crc_enabled; u8 enhanced_rdma_conn; u8 version; u8 p2p_type; }; struct c4iw_qp_attributes { u32 scq; u32 rcq; u32 sq_num_entries; u32 rq_num_entries; u32 sq_max_sges; u32 sq_max_sges_rdma_write; u32 rq_max_sges; u32 state; u8 enable_rdma_read; u8 enable_rdma_write; u8 enable_bind; u8 enable_mmid0_fastreg; u32 max_ord; u32 max_ird; u32 pd; u32 next_state; char terminate_buffer[52]; u32 terminate_msg_len; u8 is_terminate_local; struct c4iw_mpa_attributes mpa_attr; struct c4iw_ep *llp_stream_handle; u8 layer_etype; u8 ecode; u16 sq_db_inc; u16 rq_db_inc; }; struct c4iw_qp { struct ib_qp ibqp; struct c4iw_dev *rhp; struct c4iw_ep *ep; struct c4iw_qp_attributes attr; struct t4_wq wq; spinlock_t lock; struct mutex mutex; atomic_t refcnt; wait_queue_head_t wait; struct timer_list timer; }; static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp) { return container_of(ibqp, struct c4iw_qp, ibqp); } struct c4iw_ucontext { struct ib_ucontext ibucontext; struct c4iw_dev_ucontext uctx; u32 key; spinlock_t mmap_lock; struct list_head mmaps; }; static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c) { return container_of(c, struct c4iw_ucontext, ibucontext); } struct c4iw_mm_entry { struct list_head entry; u64 addr; u32 key; unsigned len; }; static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext, u32 key, unsigned len) { struct list_head *pos, *nxt; struct c4iw_mm_entry *mm; spin_lock(&ucontext->mmap_lock); list_for_each_safe(pos, nxt, &ucontext->mmaps) { mm = list_entry(pos, struct c4iw_mm_entry, entry); if (mm->key == key && mm->len == len) { list_del_init(&mm->entry); spin_unlock(&ucontext->mmap_lock); CTR4(KTR_IW_CXGBE, "%s key 0x%x addr 0x%llx len %d", __func__, key, (unsigned long long) mm->addr, mm->len); return mm; } } spin_unlock(&ucontext->mmap_lock); return NULL; } static inline void insert_mmap(struct c4iw_ucontext *ucontext, struct c4iw_mm_entry *mm) { spin_lock(&ucontext->mmap_lock); CTR4(KTR_IW_CXGBE, "%s key 0x%x addr 0x%llx len %d", __func__, mm->key, (unsigned long long) mm->addr, mm->len); list_add_tail(&mm->entry, &ucontext->mmaps); spin_unlock(&ucontext->mmap_lock); } enum c4iw_qp_attr_mask { C4IW_QP_ATTR_NEXT_STATE = 1 << 0, C4IW_QP_ATTR_SQ_DB = 1<<1, C4IW_QP_ATTR_RQ_DB = 1<<2, C4IW_QP_ATTR_ENABLE_RDMA_READ = 1 << 7, C4IW_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8, C4IW_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9, C4IW_QP_ATTR_MAX_ORD = 1 << 11, C4IW_QP_ATTR_MAX_IRD = 1 << 12, C4IW_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22, C4IW_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23, C4IW_QP_ATTR_MPA_ATTR = 1 << 24, C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25, C4IW_QP_ATTR_VALID_MODIFY = (C4IW_QP_ATTR_ENABLE_RDMA_READ | C4IW_QP_ATTR_ENABLE_RDMA_WRITE | C4IW_QP_ATTR_MAX_ORD | C4IW_QP_ATTR_MAX_IRD | C4IW_QP_ATTR_LLP_STREAM_HANDLE | C4IW_QP_ATTR_STREAM_MSG_BUFFER | C4IW_QP_ATTR_MPA_ATTR | C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE) }; int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, enum c4iw_qp_attr_mask mask, struct c4iw_qp_attributes *attrs, int internal); enum c4iw_qp_state { C4IW_QP_STATE_IDLE, C4IW_QP_STATE_RTS, C4IW_QP_STATE_ERROR, C4IW_QP_STATE_TERMINATE, C4IW_QP_STATE_CLOSING, C4IW_QP_STATE_TOT }; static inline int c4iw_convert_state(enum ib_qp_state ib_state) { switch (ib_state) { case IB_QPS_RESET: case IB_QPS_INIT: return C4IW_QP_STATE_IDLE; case IB_QPS_RTS: return C4IW_QP_STATE_RTS; case IB_QPS_SQD: return C4IW_QP_STATE_CLOSING; case IB_QPS_SQE: return C4IW_QP_STATE_TERMINATE; case IB_QPS_ERR: return C4IW_QP_STATE_ERROR; default: return -1; } } static inline int to_ib_qp_state(int c4iw_qp_state) { switch (c4iw_qp_state) { case C4IW_QP_STATE_IDLE: return IB_QPS_INIT; case C4IW_QP_STATE_RTS: return IB_QPS_RTS; case C4IW_QP_STATE_CLOSING: return IB_QPS_SQD; case C4IW_QP_STATE_TERMINATE: return IB_QPS_SQE; case C4IW_QP_STATE_ERROR: return IB_QPS_ERR; } return IB_QPS_ERR; } static inline u32 c4iw_ib_to_tpt_access(int a) { return (a & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) | (a & IB_ACCESS_REMOTE_READ ? FW_RI_MEM_ACCESS_REM_READ : 0) | (a & IB_ACCESS_LOCAL_WRITE ? FW_RI_MEM_ACCESS_LOCAL_WRITE : 0) | FW_RI_MEM_ACCESS_LOCAL_READ; } static inline u32 c4iw_ib_to_tpt_bind_access(int acc) { return (acc & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) | (acc & IB_ACCESS_REMOTE_READ ? FW_RI_MEM_ACCESS_REM_READ : 0); } enum c4iw_mmid_state { C4IW_STAG_STATE_VALID, C4IW_STAG_STATE_INVALID }; #define C4IW_NODE_DESC "iw_cxgbe Chelsio Communications" #define MPA_KEY_REQ "MPA ID Req Frame" #define MPA_KEY_REP "MPA ID Rep Frame" #define MPA_MAX_PRIVATE_DATA 256 #define MPA_ENHANCED_RDMA_CONN 0x10 #define MPA_REJECT 0x20 #define MPA_CRC 0x40 #define MPA_MARKERS 0x80 #define MPA_FLAGS_MASK 0xE0 #define MPA_V2_PEER2PEER_MODEL 0x8000 #define MPA_V2_ZERO_LEN_FPDU_RTR 0x4000 #define MPA_V2_RDMA_WRITE_RTR 0x8000 #define MPA_V2_RDMA_READ_RTR 0x4000 #define MPA_V2_IRD_ORD_MASK 0x3FFF #define c4iw_put_ep(ep) { \ CTR4(KTR_IW_CXGBE, "put_ep (%s:%u) ep %p, refcnt %d", \ __func__, __LINE__, ep, atomic_read(&(ep)->kref.refcount)); \ WARN_ON(atomic_read(&(ep)->kref.refcount) < 1); \ kref_put(&((ep)->kref), _c4iw_free_ep); \ } #define c4iw_get_ep(ep) { \ CTR4(KTR_IW_CXGBE, "get_ep (%s:%u) ep %p, refcnt %d", \ __func__, __LINE__, ep, atomic_read(&(ep)->kref.refcount)); \ kref_get(&((ep)->kref)); \ } void _c4iw_free_ep(struct kref *kref); struct mpa_message { u8 key[16]; u8 flags; u8 revision; __be16 private_data_size; u8 private_data[0]; }; struct mpa_v2_conn_params { __be16 ird; __be16 ord; }; struct terminate_message { u8 layer_etype; u8 ecode; __be16 hdrct_rsvd; u8 len_hdrs[0]; }; #define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28) enum c4iw_layers_types { LAYER_RDMAP = 0x00, LAYER_DDP = 0x10, LAYER_MPA = 0x20, RDMAP_LOCAL_CATA = 0x00, RDMAP_REMOTE_PROT = 0x01, RDMAP_REMOTE_OP = 0x02, DDP_LOCAL_CATA = 0x00, DDP_TAGGED_ERR = 0x01, DDP_UNTAGGED_ERR = 0x02, DDP_LLP = 0x03 }; enum c4iw_rdma_ecodes { RDMAP_INV_STAG = 0x00, RDMAP_BASE_BOUNDS = 0x01, RDMAP_ACC_VIOL = 0x02, RDMAP_STAG_NOT_ASSOC = 0x03, RDMAP_TO_WRAP = 0x04, RDMAP_INV_VERS = 0x05, RDMAP_INV_OPCODE = 0x06, RDMAP_STREAM_CATA = 0x07, RDMAP_GLOBAL_CATA = 0x08, RDMAP_CANT_INV_STAG = 0x09, RDMAP_UNSPECIFIED = 0xff }; enum c4iw_ddp_ecodes { DDPT_INV_STAG = 0x00, DDPT_BASE_BOUNDS = 0x01, DDPT_STAG_NOT_ASSOC = 0x02, DDPT_TO_WRAP = 0x03, DDPT_INV_VERS = 0x04, DDPU_INV_QN = 0x01, DDPU_INV_MSN_NOBUF = 0x02, DDPU_INV_MSN_RANGE = 0x03, DDPU_INV_MO = 0x04, DDPU_MSG_TOOBIG = 0x05, DDPU_INV_VERS = 0x06 }; enum c4iw_mpa_ecodes { MPA_CRC_ERR = 0x02, MPA_MARKER_ERR = 0x03, MPA_LOCAL_CATA = 0x05, MPA_INSUFF_IRD = 0x06, MPA_NOMATCH_RTR = 0x07, }; enum c4iw_ep_state { IDLE = 0, LISTEN, CONNECTING, MPA_REQ_WAIT, MPA_REQ_SENT, MPA_REQ_RCVD, MPA_REP_SENT, FPDU_MODE, ABORTING, CLOSING, MORIBUND, DEAD, }; enum c4iw_ep_flags { PEER_ABORT_IN_PROGRESS = 0, ABORT_REQ_IN_PROGRESS = 1, RELEASE_RESOURCES = 2, CLOSE_SENT = 3, TIMEOUT = 4 }; enum c4iw_ep_history { ACT_OPEN_REQ = 0, ACT_OFLD_CONN = 1, ACT_OPEN_RPL = 2, ACT_ESTAB = 3, PASS_ACCEPT_REQ = 4, PASS_ESTAB = 5, ABORT_UPCALL = 6, ESTAB_UPCALL = 7, CLOSE_UPCALL = 8, ULP_ACCEPT = 9, ULP_REJECT = 10, TIMEDOUT = 11, PEER_ABORT = 12, PEER_CLOSE = 13, CONNREQ_UPCALL = 14, ABORT_CONN = 15, DISCONN_UPCALL = 16, EP_DISC_CLOSE = 17, EP_DISC_ABORT = 18, CONN_RPL_UPCALL = 19, ACT_RETRY_NOMEM = 20, ACT_RETRY_INUSE = 21 }; struct c4iw_ep_common { TAILQ_ENTRY(c4iw_ep_common) entry; /* Work queue attachment */ struct iw_cm_id *cm_id; struct c4iw_qp *qp; struct c4iw_dev *dev; enum c4iw_ep_state state; struct kref kref; struct mutex mutex; struct sockaddr_in local_addr; struct sockaddr_in remote_addr; struct c4iw_wr_wait wr_wait; unsigned long flags; unsigned long history; int rpl_err; int rpl_done; struct thread *thread; struct socket *so; }; struct c4iw_listen_ep { struct c4iw_ep_common com; unsigned int stid; int backlog; }; struct c4iw_ep { struct c4iw_ep_common com; struct c4iw_ep *parent_ep; struct timer_list timer; struct list_head entry; unsigned int atid; u32 hwtid; u32 snd_seq; u32 rcv_seq; struct l2t_entry *l2t; struct dst_entry *dst; struct c4iw_mpa_attributes mpa_attr; u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA]; unsigned int mpa_pkt_len; u32 ird; u32 ord; u32 smac_idx; u32 tx_chan; u32 mtu; u16 mss; u16 emss; u16 plen; u16 rss_qid; u16 txq_idx; u16 ctrlq_idx; u8 tos; u8 retry_with_mpa_v1; u8 tried_with_mpa_v1; }; static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id) { return cm_id->provider_data; } static inline struct c4iw_listen_ep *to_listen_ep(struct iw_cm_id *cm_id) { return cm_id->provider_data; } static inline int compute_wscale(int win) { int wscale = 0; while (wscale < 14 && (65535< struct gen_pool { blist_t gen_list; daddr_t gen_base; int gen_chunk_shift; struct mutex gen_lock; }; static __inline struct gen_pool * gen_pool_create(daddr_t base, u_int chunk_shift, u_int len) { struct gen_pool *gp; gp = malloc(sizeof(struct gen_pool), M_DEVBUF, M_NOWAIT); if (gp == NULL) return (NULL); memset(gp, 0, sizeof(struct gen_pool)); gp->gen_list = blist_create(len >> chunk_shift, M_NOWAIT); if (gp->gen_list == NULL) { free(gp, M_DEVBUF); return (NULL); } blist_free(gp->gen_list, 0, len >> chunk_shift); gp->gen_base = base; gp->gen_chunk_shift = chunk_shift; //mutex_init(&gp->gen_lock, "genpool", NULL, MTX_DUPOK|MTX_DEF); mutex_init(&gp->gen_lock); return (gp); } static __inline unsigned long gen_pool_alloc(struct gen_pool *gp, int size) { int chunks; daddr_t blkno; chunks = (size + (1<gen_chunk_shift) - 1) >> gp->gen_chunk_shift; mutex_lock(&gp->gen_lock); blkno = blist_alloc(gp->gen_list, chunks); mutex_unlock(&gp->gen_lock); if (blkno == SWAPBLK_NONE) return (0); return (gp->gen_base + ((1 << gp->gen_chunk_shift) * blkno)); } static __inline void gen_pool_free(struct gen_pool *gp, daddr_t address, int size) { int chunks; daddr_t blkno; chunks = (size + (1<gen_chunk_shift) - 1) >> gp->gen_chunk_shift; blkno = (address - gp->gen_base) / (1 << gp->gen_chunk_shift); mutex_lock(&gp->gen_lock); blist_free(gp->gen_list, blkno, chunks); mutex_unlock(&gp->gen_lock); } static __inline void gen_pool_destroy(struct gen_pool *gp) { blist_destroy(gp->gen_list); free(gp, M_DEVBUF); } #if defined(__i386__) || defined(__amd64__) #define L1_CACHE_BYTES 128 #else #define L1_CACHE_BYTES 32 #endif static inline int idr_for_each(struct idr *idp, int (*fn)(int id, void *p, void *data), void *data) { int n, id, max, error = 0; struct idr_layer *p; struct idr_layer *pa[MAX_LEVEL]; struct idr_layer **paa = &pa[0]; n = idp->layers * IDR_BITS; p = idp->top; max = 1 << n; id = 0; while (id < max) { while (n > 0 && p) { n -= IDR_BITS; *paa++ = p; p = p->ary[(id >> n) & IDR_MASK]; } if (p) { error = fn(id, (void *)p, data); if (error) break; } id += 1 << n; while (n < fls(id)) { n += IDR_BITS; p = *--paa; } } return error; } void c4iw_cm_init_cpl(struct adapter *); void c4iw_cm_term_cpl(struct adapter *); void your_reg_device(struct c4iw_dev *dev); #define SGE_CTRLQ_NUM 0 extern int spg_creds;/* Status Page size in credit units(1 unit = 64) */ #endif Index: head/sys/dev/cxgbe/iw_cxgbe/provider.c =================================================================== --- head/sys/dev/cxgbe/iw_cxgbe/provider.c (revision 294609) +++ head/sys/dev/cxgbe/iw_cxgbe/provider.c (revision 294610) @@ -1,501 +1,502 @@ /* - * Copyright (c) 2009-2013 Chelsio, Inc. All rights reserved. + * Copyright (c) 2009-2013, 2016 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #ifdef TCP_OFFLOAD #include #include #include #include #include "iw_cxgbe.h" #include "user.h" static int fastreg_support = 1; module_param(fastreg_support, int, 0644); MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default = 1)"); static int c4iw_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask, struct ib_port_modify *props) { return -ENOSYS; } static struct ib_ah *c4iw_ah_create(struct ib_pd *pd, struct ib_ah_attr *ah_attr) { return ERR_PTR(-ENOSYS); } static int c4iw_ah_destroy(struct ib_ah *ah) { return -ENOSYS; } static int c4iw_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { return -ENOSYS; } static int c4iw_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { return -ENOSYS; } static int c4iw_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad) { return -ENOSYS; } static int c4iw_dealloc_ucontext(struct ib_ucontext *context) { struct c4iw_dev *rhp = to_c4iw_dev(context->device); struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context); struct c4iw_mm_entry *mm, *tmp; CTR2(KTR_IW_CXGBE, "%s context %p", __func__, context); list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry) kfree(mm); c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx); kfree(ucontext); return 0; } static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct c4iw_ucontext *context; struct c4iw_dev *rhp = to_c4iw_dev(ibdev); CTR2(KTR_IW_CXGBE, "%s ibdev %p", __func__, ibdev); context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx); INIT_LIST_HEAD(&context->mmaps); spin_lock_init(&context->mmap_lock); return &context->ibucontext; } #ifdef DOT5 static inline pgprot_t t4_pgprot_wc(pgprot_t prot) { return pgprot_writecombine(prot); } #endif static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { int len = vma->vm_end - vma->vm_start; u32 key = vma->vm_pgoff << PAGE_SHIFT; struct c4iw_rdev *rdev; int ret = 0; struct c4iw_mm_entry *mm; struct c4iw_ucontext *ucontext; u64 addr, paddr; u64 va_regs_res = 0, va_udbs_res = 0; u64 len_regs_res = 0, len_udbs_res = 0; CTR3(KTR_IW_CXGBE, "%s:1 ctx %p vma %p", __func__, context, vma); CTR4(KTR_IW_CXGBE, "%s:1a pgoff 0x%lx key 0x%x len %d", __func__, vma->vm_pgoff, key, len); if (vma->vm_start & (PAGE_SIZE-1)) { CTR3(KTR_IW_CXGBE, "%s:2 unaligned vm_start %u vma %p", __func__, vma->vm_start, vma); return -EINVAL; } rdev = &(to_c4iw_dev(context->device)->rdev); ucontext = to_c4iw_ucontext(context); mm = remove_mmap(ucontext, key, len); if (!mm) { CTR4(KTR_IW_CXGBE, "%s:3 ucontext %p key %u len %u", __func__, ucontext, key, len); return -EINVAL; } addr = mm->addr; kfree(mm); va_regs_res = (u64)rman_get_virtual(rdev->adap->regs_res); len_regs_res = (u64)rman_get_size(rdev->adap->regs_res); va_udbs_res = (u64)rman_get_virtual(rdev->adap->udbs_res); len_udbs_res = (u64)rman_get_size(rdev->adap->udbs_res); CTR6(KTR_IW_CXGBE, "%s:4 addr %p, masync region %p:%p, udb region %p:%p", __func__, addr, va_regs_res, va_regs_res+len_regs_res, va_udbs_res, va_udbs_res+len_udbs_res); if (addr >= va_regs_res && addr < va_regs_res + len_regs_res) { CTR4(KTR_IW_CXGBE, "%s:5 MA_SYNC addr %p region %p, reglen %u", __func__, addr, va_regs_res, len_regs_res); /* * MA_SYNC register... */ paddr = vtophys(addr); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); ret = io_remap_pfn_range(vma, vma->vm_start, paddr >> PAGE_SHIFT, len, vma->vm_page_prot); } else { if (addr >= va_udbs_res && addr < va_udbs_res + len_udbs_res) { /* * Map user DB or OCQP memory... */ paddr = vtophys(addr); CTR4(KTR_IW_CXGBE, "%s:6 USER DB-GTS addr %p region %p, reglen %u", __func__, addr, va_udbs_res, len_udbs_res); #ifdef DOT5 if (is_t5(rdev->lldi.adapter_type) && map_udb_as_wc) vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot); else #endif vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); ret = io_remap_pfn_range(vma, vma->vm_start, paddr >> PAGE_SHIFT, len, vma->vm_page_prot); } else { /* * Map WQ or CQ contig dma memory... */ CTR4(KTR_IW_CXGBE, "%s:7 WQ/CQ addr %p vm_start %u vma %p", __func__, addr, vma->vm_start, vma); ret = io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, len, vma->vm_page_prot); } } CTR4(KTR_IW_CXGBE, "%s:8 ctx %p vma %p ret %u", __func__, context, vma, ret); return ret; } static int c4iw_deallocate_pd(struct ib_pd *pd) { struct c4iw_pd *php = to_c4iw_pd(pd); struct c4iw_dev *rhp = php->rhp; CTR3(KTR_IW_CXGBE, "%s: pd %p, pdid 0x%x", __func__, pd, php->pdid); c4iw_put_resource(&rhp->rdev.resource.pdid_table, php->pdid); mutex_lock(&rhp->rdev.stats.lock); rhp->rdev.stats.pd.cur--; mutex_unlock(&rhp->rdev.stats.lock); kfree(php); return (0); } static struct ib_pd * c4iw_allocate_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct c4iw_pd *php; u32 pdid; struct c4iw_dev *rhp; CTR4(KTR_IW_CXGBE, "%s: ibdev %p, context %p, data %p", __func__, ibdev, context, udata); rhp = (struct c4iw_dev *) ibdev; pdid = c4iw_get_resource(&rhp->rdev.resource.pdid_table); if (!pdid) return ERR_PTR(-EINVAL); php = kzalloc(sizeof(*php), GFP_KERNEL); if (!php) { c4iw_put_resource(&rhp->rdev.resource.pdid_table, pdid); return ERR_PTR(-ENOMEM); } php->pdid = pdid; php->rhp = rhp; if (context) { if (ib_copy_to_udata(udata, &php->pdid, sizeof(u32))) { c4iw_deallocate_pd(&php->ibpd); return ERR_PTR(-EFAULT); } } mutex_lock(&rhp->rdev.stats.lock); rhp->rdev.stats.pd.cur++; if (rhp->rdev.stats.pd.cur > rhp->rdev.stats.pd.max) rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur; mutex_unlock(&rhp->rdev.stats.lock); CTR6(KTR_IW_CXGBE, "%s: ibdev %p, context %p, data %p, pddid 0x%x, pd %p", __func__, ibdev, context, udata, pdid, php); return (&php->ibpd); } static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { CTR5(KTR_IW_CXGBE, "%s ibdev %p, port %d, index %d, pkey %p", __func__, ibdev, port, index, pkey); *pkey = 0; return (0); } static int c4iw_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct c4iw_dev *dev; struct port_info *pi; struct adapter *sc; CTR5(KTR_IW_CXGBE, "%s ibdev %p, port %d, index %d, gid %p", __func__, ibdev, port, index, gid); memset(&gid->raw[0], 0, sizeof(gid->raw)); dev = to_c4iw_dev(ibdev); sc = dev->rdev.adap; if (port == 0 || port > sc->params.nports) return (-EINVAL); pi = sc->port[port - 1]; memcpy(&gid->raw[0], pi->vi[0].hw_addr, ETHER_ADDR_LEN); return (0); } static int c4iw_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { struct c4iw_dev *dev = to_c4iw_dev(ibdev); struct adapter *sc = dev->rdev.adap; CTR3(KTR_IW_CXGBE, "%s ibdev %p, props %p", __func__, ibdev, props); memset(props, 0, sizeof *props); memcpy(&props->sys_image_guid, sc->port[0]->vi[0].hw_addr, ETHER_ADDR_LEN); props->hw_ver = sc->params.chipid; props->fw_ver = sc->params.fw_vers; props->device_cap_flags = dev->device_cap_flags; props->page_size_cap = T4_PAGESIZE_MASK; props->vendor_id = pci_get_vendor(sc->dev); props->vendor_part_id = pci_get_device(sc->dev); props->max_mr_size = T4_MAX_MR_SIZE; props->max_qp = T4_MAX_NUM_QP; props->max_qp_wr = T4_MAX_QP_DEPTH; props->max_sge = T4_MAX_RECV_SGE; props->max_sge_rd = 1; props->max_qp_rd_atom = c4iw_max_read_depth; props->max_qp_init_rd_atom = c4iw_max_read_depth; props->max_cq = T4_MAX_NUM_CQ; props->max_cqe = T4_MAX_CQ_DEPTH; props->max_mr = c4iw_num_stags(&dev->rdev); props->max_pd = T4_MAX_NUM_PD; props->local_ca_ack_delay = 0; props->max_fast_reg_page_list_len = T4_MAX_FR_DEPTH; return (0); } /* * Returns -errno on failure. */ static int c4iw_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { struct c4iw_dev *dev; struct adapter *sc; struct port_info *pi; struct ifnet *ifp; CTR4(KTR_IW_CXGBE, "%s ibdev %p, port %d, props %p", __func__, ibdev, port, props); dev = to_c4iw_dev(ibdev); sc = dev->rdev.adap; if (port > sc->params.nports) return (-EINVAL); pi = sc->port[port - 1]; ifp = pi->vi[0].ifp; memset(props, 0, sizeof(struct ib_port_attr)); props->max_mtu = IB_MTU_4096; if (ifp->if_mtu >= 4096) props->active_mtu = IB_MTU_4096; else if (ifp->if_mtu >= 2048) props->active_mtu = IB_MTU_2048; else if (ifp->if_mtu >= 1024) props->active_mtu = IB_MTU_1024; else if (ifp->if_mtu >= 512) props->active_mtu = IB_MTU_512; else props->active_mtu = IB_MTU_256; props->state = pi->link_cfg.link_ok ? IB_PORT_ACTIVE : IB_PORT_DOWN; props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_REINIT_SUP | IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; props->gid_tbl_len = 1; props->pkey_tbl_len = 1; props->active_width = 2; props->active_speed = 2; props->max_msg_sz = -1; return 0; } /* * Returns -errno on error. */ int c4iw_register_device(struct c4iw_dev *dev) { struct adapter *sc = dev->rdev.adap; struct ib_device *ibdev = &dev->ibdev; struct iw_cm_verbs *iwcm; int ret; CTR3(KTR_IW_CXGBE, "%s c4iw_dev %p, adapter %p", __func__, dev, sc); BUG_ON(!sc->port[0]); strlcpy(ibdev->name, device_get_nameunit(sc->dev), sizeof(ibdev->name)); memset(&ibdev->node_guid, 0, sizeof(ibdev->node_guid)); memcpy(&ibdev->node_guid, sc->port[0]->vi[0].hw_addr, ETHER_ADDR_LEN); ibdev->owner = THIS_MODULE; dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW; if (fastreg_support) dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; ibdev->local_dma_lkey = 0; ibdev->uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_POLL_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_POST_SEND) | (1ull << IB_USER_VERBS_CMD_POST_RECV); ibdev->node_type = RDMA_NODE_RNIC; strlcpy(ibdev->node_desc, C4IW_NODE_DESC, sizeof(ibdev->node_desc)); ibdev->phys_port_cnt = sc->params.nports; ibdev->num_comp_vectors = 1; ibdev->dma_device = sc->dev; ibdev->query_device = c4iw_query_device; ibdev->query_port = c4iw_query_port; ibdev->modify_port = c4iw_modify_port; ibdev->query_pkey = c4iw_query_pkey; ibdev->query_gid = c4iw_query_gid; ibdev->alloc_ucontext = c4iw_alloc_ucontext; ibdev->dealloc_ucontext = c4iw_dealloc_ucontext; ibdev->mmap = c4iw_mmap; ibdev->alloc_pd = c4iw_allocate_pd; ibdev->dealloc_pd = c4iw_deallocate_pd; ibdev->create_ah = c4iw_ah_create; ibdev->destroy_ah = c4iw_ah_destroy; ibdev->create_qp = c4iw_create_qp; ibdev->modify_qp = c4iw_ib_modify_qp; ibdev->query_qp = c4iw_ib_query_qp; ibdev->destroy_qp = c4iw_destroy_qp; ibdev->create_cq = c4iw_create_cq; ibdev->destroy_cq = c4iw_destroy_cq; ibdev->resize_cq = c4iw_resize_cq; ibdev->poll_cq = c4iw_poll_cq; ibdev->get_dma_mr = c4iw_get_dma_mr; ibdev->reg_phys_mr = c4iw_register_phys_mem; ibdev->rereg_phys_mr = c4iw_reregister_phys_mem; ibdev->reg_user_mr = c4iw_reg_user_mr; ibdev->dereg_mr = c4iw_dereg_mr; ibdev->alloc_mw = c4iw_alloc_mw; ibdev->bind_mw = c4iw_bind_mw; ibdev->dealloc_mw = c4iw_dealloc_mw; ibdev->alloc_fast_reg_mr = c4iw_alloc_fast_reg_mr; ibdev->alloc_fast_reg_page_list = c4iw_alloc_fastreg_pbl; ibdev->free_fast_reg_page_list = c4iw_free_fastreg_pbl; ibdev->attach_mcast = c4iw_multicast_attach; ibdev->detach_mcast = c4iw_multicast_detach; ibdev->process_mad = c4iw_process_mad; ibdev->req_notify_cq = c4iw_arm_cq; ibdev->post_send = c4iw_post_send; ibdev->post_recv = c4iw_post_receive; ibdev->uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; iwcm = kmalloc(sizeof(*iwcm), GFP_KERNEL); if (iwcm == NULL) return (-ENOMEM); iwcm->connect = c4iw_connect; iwcm->accept = c4iw_accept_cr; iwcm->reject = c4iw_reject_cr; - iwcm->create_listen = c4iw_create_listen; - iwcm->destroy_listen = c4iw_destroy_listen; + iwcm->create_listen_ep = c4iw_create_listen_ep; + iwcm->destroy_listen_ep = c4iw_destroy_listen_ep; + iwcm->newconn = process_newconn; iwcm->add_ref = c4iw_qp_add_ref; iwcm->rem_ref = c4iw_qp_rem_ref; iwcm->get_qp = c4iw_get_qp; ibdev->iwcm = iwcm; ret = ib_register_device(&dev->ibdev, NULL); if (ret) kfree(iwcm); return (ret); } void c4iw_unregister_device(struct c4iw_dev *dev) { CTR3(KTR_IW_CXGBE, "%s c4iw_dev %p, adapter %p", __func__, dev, dev->rdev.adap); ib_unregister_device(&dev->ibdev); kfree(dev->ibdev.iwcm); return; } #endif Index: head/sys/ofed/drivers/infiniband/core/cma.c =================================================================== --- head/sys/ofed/drivers/infiniband/core/cma.c (revision 294609) +++ head/sys/ofed/drivers/infiniband/core/cma.c (revision 294610) @@ -1,3747 +1,3824 @@ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 18 static int cma_response_timeout = CMA_CM_RESPONSE_TIMEOUT; module_param_named(cma_response_timeout, cma_response_timeout, int, 0644); MODULE_PARM_DESC(cma_response_timeout, "CMA_CM_RESPONSE_TIMEOUT (default=20)"); static int def_prec2sl = 3; module_param_named(def_prec2sl, def_prec2sl, int, 0644); MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. Valid values 0 - 7"); static int unify_tcp_port_space = 1; module_param(unify_tcp_port_space, int, 0644); MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port " "space allocation (default=1)"); static int debug_level = 0; #define cma_pr(level, priv, format, arg...) \ printk(level "CMA: %p: %s: " format, ((struct rdma_id_priv *) priv) , __func__, ## arg) #define cma_dbg(priv, format, arg...) \ do { if (debug_level) cma_pr(KERN_DEBUG, priv, format, ## arg); } while (0) #define cma_warn(priv, format, arg...) \ cma_pr(KERN_WARNING, priv, format, ## arg) #define CMA_GID_FMT "%2.2x%2.2x:%2.2x%2.2x" #define CMA_GID_RAW_ARG(gid) ((u8 *)(gid))[12],\ ((u8 *)(gid))[13],\ ((u8 *)(gid))[14],\ ((u8 *)(gid))[15] #define CMA_GID_ARG(gid) CMA_GID_RAW_ARG((gid).raw) #define cma_debug_path(priv, pfx, p) \ cma_dbg(priv, pfx "sgid=" CMA_GID_FMT ",dgid=" \ CMA_GID_FMT "\n", CMA_GID_ARG(p.sgid), \ CMA_GID_ARG(p.dgid)) #define cma_debug_gid(priv, g) \ cma_dbg(priv, "gid=" CMA_GID_FMT "\n", CMA_GID_ARG(g) module_param_named(debug_level, debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "debug level default=0"); static void cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device); static struct ib_client cma_client = { .name = "cma", .add = cma_add_one, .remove = cma_remove_one }; static struct ib_sa_client sa_client; static struct rdma_addr_client addr_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct workqueue_struct *cma_wq; static struct workqueue_struct *cma_free_wq; static DEFINE_IDR(sdp_ps); static DEFINE_IDR(tcp_ps); static DEFINE_IDR(udp_ps); static DEFINE_IDR(ipoib_ps); static DEFINE_IDR(ib_ps); struct cma_device { struct list_head list; struct ib_device *device; struct completion comp; atomic_t refcount; struct list_head id_list; }; struct rdma_bind_list { struct idr *ps; struct hlist_head owners; unsigned short port; }; enum { CMA_OPTION_AFONLY, }; /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. * We do this by disabling removal notification while a callback is in process, * and reporting it after the callback completes. */ struct rdma_id_private { struct rdma_cm_id id; struct rdma_bind_list *bind_list; struct socket *sock; struct hlist_node node; struct list_head list; /* listen_any_list or cma_device.list */ struct list_head listen_list; /* per device listens */ struct cma_device *cma_dev; struct list_head mc_list; int internal_id; enum rdma_cm_state state; spinlock_t lock; spinlock_t cm_lock; struct mutex qp_mutex; struct completion comp; atomic_t refcount; struct mutex handler_mutex; struct work_struct work; /* garbage coll */ int backlog; int timeout_ms; struct ib_sa_query *query; int query_id; union { struct ib_cm_id *ib; struct iw_cm_id *iw; } cm_id; u32 seq_num; u32 qkey; u32 qp_num; pid_t owner; u32 options; u8 srq; u8 tos; u8 reuseaddr; u8 afonly; int qp_timeout; /* cache for mc record params */ struct ib_sa_mcmember_rec rec; int is_valid_rec; }; struct cma_multicast { struct rdma_id_private *id_priv; union { struct ib_sa_multicast *ib; } multicast; struct list_head list; void *context; struct sockaddr_storage addr; struct kref mcref; }; struct cma_work { struct work_struct work; struct rdma_id_private *id; enum rdma_cm_state old_state; enum rdma_cm_state new_state; struct rdma_cm_event event; }; struct cma_ndev_work { struct work_struct work; struct rdma_id_private *id; struct rdma_cm_event event; }; struct iboe_mcast_work { struct work_struct work; struct rdma_id_private *id; struct cma_multicast *mc; }; union cma_ip_addr { struct in6_addr ip6; struct { __be32 pad[3]; __be32 addr; } ip4; }; struct cma_hdr { u8 cma_version; u8 ip_version; /* IP version: 7:4 */ __be16 port; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; struct sdp_hh { u8 bsdh[16]; u8 sdp_version; /* Major version: 7:4 */ u8 ip_version; /* IP version: 7:4 */ u8 sdp_specific1[10]; __be16 port; __be16 sdp_specific2; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; struct sdp_hah { u8 bsdh[16]; u8 sdp_version; }; #define CMA_VERSION 0x00 #define SDP_MAJ_VERSION 0x2 static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) { unsigned long flags; int ret; spin_lock_irqsave(&id_priv->lock, flags); ret = (id_priv->state == comp); spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static int cma_comp_exch(struct rdma_id_private *id_priv, enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv, enum rdma_cm_state exch) { unsigned long flags; enum rdma_cm_state old; spin_lock_irqsave(&id_priv->lock, flags); old = id_priv->state; id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return old; } static inline u8 cma_get_ip_ver(struct cma_hdr *hdr) { return hdr->ip_version >> 4; } static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) { hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } static inline u8 sdp_get_majv(u8 sdp_version) { return sdp_version >> 4; } static inline u8 sdp_get_ip_ver(struct sdp_hh *hh) { return hh->ip_version >> 4; } static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver) { hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { atomic_inc(&cma_dev->refcount); id_priv->cma_dev = cma_dev; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); } static inline void cma_deref_dev(struct cma_device *cma_dev) { if (atomic_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); } static inline void release_mc(struct kref *kref) { struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); kfree(mc->multicast.ib); kfree(mc); } static void cma_release_dev(struct rdma_id_private *id_priv) { mutex_lock(&lock); list_del(&id_priv->list); cma_deref_dev(id_priv->cma_dev); id_priv->cma_dev = NULL; mutex_unlock(&lock); } static int cma_set_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; if (id_priv->qkey) return 0; switch (id_priv->id.ps) { case RDMA_PS_UDP: id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (!ret) id_priv->qkey = be32_to_cpu(rec.qkey); break; default: break; } return ret; } static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num) { int i; int err; struct ib_port_attr props; union ib_gid tmp; err = ib_query_port(device, port_num, &props); if (err) return 1; for (i = 0; i < props.gid_tbl_len; ++i) { err = ib_query_gid(device, port_num, i, &tmp); if (err) return 1; if (!memcmp(&tmp, gid, sizeof tmp)) return 0; } return -EAGAIN; } +int +rdma_find_cmid_laddr(struct sockaddr_in *local_addr, unsigned short dev_type, + void **cm_id) +{ + int ret; + u8 port; + int found_dev = 0, found_cmid = 0; + struct rdma_id_private *id_priv; + struct rdma_id_private *dev_id_priv; + struct cma_device *cma_dev; + struct rdma_dev_addr dev_addr; + union ib_gid gid; + enum rdma_link_layer dev_ll = dev_type == ARPHRD_INFINIBAND ? + IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; + + memset(&dev_addr, 0, sizeof(dev_addr)); + + ret = rdma_translate_ip((struct sockaddr *)local_addr, + &dev_addr, NULL); + if (ret) + goto err; + + /* find rdma device based on MAC address/gid */ + mutex_lock(&lock); + + memcpy(&gid, dev_addr.src_dev_addr + + rdma_addr_gid_offset(&dev_addr), sizeof(gid)); + + list_for_each_entry(cma_dev, &dev_list, list) + for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) + if ((rdma_port_get_link_layer(cma_dev->device, port) == + dev_ll) && + (rdma_node_get_transport(cma_dev->device->node_type) == + RDMA_TRANSPORT_IWARP)) { + ret = find_gid_port(cma_dev->device, + &gid, port); + if (!ret) { + found_dev = 1; + goto out; + } else if (ret == 1) { + mutex_unlock(&lock); + goto err; + } + } +out: + mutex_unlock(&lock); + + if (!found_dev) + goto err; + + /* Traverse through the list of listening cm_id's to find the + * desired cm_id based on rdma device & port number. + */ + list_for_each_entry(id_priv, &listen_any_list, list) + list_for_each_entry(dev_id_priv, &id_priv->listen_list, + listen_list) + if (dev_id_priv->cma_dev == cma_dev) + if (dev_id_priv->cm_id.iw->local_addr.sin_port + == local_addr->sin_port) { + *cm_id = (void *)dev_id_priv->cm_id.iw; + found_cmid = 1; + } + return found_cmid ? 0 : -ENODEV; + +err: + return -ENODEV; +} +EXPORT_SYMBOL(rdma_find_cmid_laddr); + static int cma_acquire_dev(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct cma_device *cma_dev; union ib_gid gid, iboe_gid; int ret = -ENODEV; u8 port; enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ? IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; if (dev_ll != IB_LINK_LAYER_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; mutex_lock(&lock); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof gid); list_for_each_entry(cma_dev, &dev_list, list) { for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) { if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) ret = find_gid_port(cma_dev->device, &iboe_gid, port); else ret = find_gid_port(cma_dev->device, &gid, port); if (!ret) { id_priv->id.port_num = port; goto out; } else if (ret == 1) break; } } } out: if (!ret) cma_attach_to_dev(id_priv, cma_dev); mutex_unlock(&lock); return ret; } static void cma_deref_id(struct rdma_id_private *id_priv) { if (atomic_dec_and_test(&id_priv->refcount)) complete(&id_priv->comp); } static int cma_disable_callback(struct rdma_id_private *id_priv, enum rdma_cm_state state) { mutex_lock(&id_priv->handler_mutex); if (id_priv->state != state) { mutex_unlock(&id_priv->handler_mutex); return -EINVAL; } return 0; } struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, void *context, enum rdma_port_space ps, enum ib_qp_type qp_type) { struct rdma_id_private *id_priv; id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); if (!id_priv) return ERR_PTR(-ENOMEM); id_priv->owner = curthread->td_proc->p_pid; id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; spin_lock_init(&id_priv->lock); spin_lock_init(&id_priv->cm_lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); atomic_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); return &id_priv->id; } EXPORT_SYMBOL(rdma_create_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTS; qp_attr.sq_psn = 0; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); return ret; } static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; return ib_modify_qp(qp, &qp_attr, qp_attr_mask); } int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { struct rdma_id_private *id_priv; struct ib_qp *qp; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id->device != pd->device) return -EINVAL; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) return PTR_ERR(qp); if (id->qp_type == IB_QPT_UD) ret = cma_init_ud_qp(id_priv, qp); else ret = cma_init_conn_qp(id_priv, qp); if (ret) goto err; id->qp = qp; id_priv->qp_num = qp->qp_num; id_priv->srq = (qp->srq != NULL); return 0; err: ib_destroy_qp(qp); return ret; } EXPORT_SYMBOL(rdma_create_qp); void rdma_destroy_qp(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); ib_destroy_qp(id_priv->id.qp); id_priv->id.qp = NULL; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_destroy_qp); static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; union ib_gid sgid; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } /* Need to update QP attributes from default values. */ qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); if (ret) goto out; qp_attr.qp_state = IB_QPS_RTR; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num, qp_attr.ah_attr.grh.sgid_index, &sgid); if (ret) goto out; if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB && rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) == IB_LINK_LAYER_ETHERNET) { ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL); if (ret) goto out; } if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_rts(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_RTS; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; if (conn_param) qp_attr.max_rd_atomic = conn_param->initiator_depth; if (id_priv->qp_timeout && id_priv->id.qp->qp_type == IB_QPT_RC) { qp_attr.timeout = id_priv->qp_timeout; qp_attr_mask |= IB_QP_TIMEOUT; } ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_err(struct rdma_id_private *id_priv) { struct ib_qp_attr qp_attr; int ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_ERR; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int ret; u16 pkey; if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) == IB_LINK_LAYER_INFINIBAND) pkey = ib_addr_get_pkey(dev_addr); else pkey = 0xffff; ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, pkey, &qp_attr->pkey_index); if (ret) return ret; qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { ret = cma_set_qkey(id_priv); if (ret) return ret; qp_attr->qkey = id_priv->qkey; *qp_attr_mask |= IB_QP_QKEY; } else { qp_attr->qp_access_flags = 0; *qp_attr_mask |= IB_QP_ACCESS_FLAGS; } return 0; } int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_id_private *id_priv; int ret = 0; id_priv = container_of(id, struct rdma_id_private, id); switch (rdma_node_get_transport(id_priv->id.device->node_type)) { case RDMA_TRANSPORT_IB: if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, qp_attr_mask); break; default: ret = -ENOSYS; break; } return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); static inline int cma_zero_addr(struct sockaddr *addr) { struct in6_addr *ip6; if (addr->sa_family == AF_INET) return ipv4_is_zeronet( ((struct sockaddr_in *)addr)->sin_addr.s_addr); else { ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr; return (ip6->s6_addr32[0] | ip6->s6_addr32[1] | ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0; } } static inline int cma_loopback_addr(struct sockaddr *addr) { if (addr->sa_family == AF_INET) return ipv4_is_loopback( ((struct sockaddr_in *) addr)->sin_addr.s_addr); else return ipv6_addr_loopback( &((struct sockaddr_in6 *) addr)->sin6_addr); } static inline int cma_any_addr(struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } +int +rdma_cma_any_addr(struct sockaddr *addr) +{ + return cma_any_addr(addr); +} +EXPORT_SYMBOL(rdma_cma_any_addr); static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) { if (src->sa_family != dst->sa_family) return -1; switch (src->sa_family) { case AF_INET: return ((struct sockaddr_in *) src)->sin_addr.s_addr != ((struct sockaddr_in *) dst)->sin_addr.s_addr; default: return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, &((struct sockaddr_in6 *) dst)->sin6_addr); } } static inline __be16 cma_port(struct sockaddr *addr) { if (addr->sa_family == AF_INET) return ((struct sockaddr_in *) addr)->sin_port; else return ((struct sockaddr_in6 *) addr)->sin6_port; } static inline int cma_any_port(struct sockaddr *addr) { return !cma_port(addr); } static int cma_get_net_info(void *hdr, enum rdma_port_space ps, u8 *ip_ver, __be16 *port, union cma_ip_addr **src, union cma_ip_addr **dst) { switch (ps) { case RDMA_PS_SDP: if (sdp_get_majv(((struct sdp_hh *) hdr)->sdp_version) != SDP_MAJ_VERSION) return -EINVAL; *ip_ver = sdp_get_ip_ver(hdr); *port = ((struct sdp_hh *) hdr)->port; *src = &((struct sdp_hh *) hdr)->src_addr; *dst = &((struct sdp_hh *) hdr)->dst_addr; break; default: if (((struct cma_hdr *) hdr)->cma_version != CMA_VERSION) return -EINVAL; *ip_ver = cma_get_ip_ver(hdr); *port = ((struct cma_hdr *) hdr)->port; *src = &((struct cma_hdr *) hdr)->src_addr; *dst = &((struct cma_hdr *) hdr)->dst_addr; break; } if (*ip_ver != 4 && *ip_ver != 6) return -EINVAL; return 0; } static void cma_save_net_info(struct rdma_addr *addr, struct rdma_addr *listen_addr, u8 ip_ver, __be16 port, union cma_ip_addr *src, union cma_ip_addr *dst) { struct sockaddr_in *listen4, *ip4; struct sockaddr_in6 *listen6, *ip6; switch (ip_ver) { case 4: listen4 = (struct sockaddr_in *) &listen_addr->src_addr; ip4 = (struct sockaddr_in *) &addr->src_addr; ip4->sin_family = listen4->sin_family; ip4->sin_addr.s_addr = dst->ip4.addr; ip4->sin_port = listen4->sin_port; ip4 = (struct sockaddr_in *) &addr->dst_addr; ip4->sin_family = listen4->sin_family; ip4->sin_addr.s_addr = src->ip4.addr; ip4->sin_port = port; break; case 6: listen6 = (struct sockaddr_in6 *) &listen_addr->src_addr; ip6 = (struct sockaddr_in6 *) &addr->src_addr; ip6->sin6_family = listen6->sin6_family; ip6->sin6_addr = dst->ip6; ip6->sin6_port = listen6->sin6_port; ip6 = (struct sockaddr_in6 *) &addr->dst_addr; ip6->sin6_family = listen6->sin6_family; ip6->sin6_addr = src->ip6; ip6->sin6_port = port; break; default: break; } } static inline int cma_user_data_offset(enum rdma_port_space ps) { switch (ps) { case RDMA_PS_SDP: return 0; default: return sizeof(struct cma_hdr); } } static void cma_cancel_route(struct rdma_id_private *id_priv) { switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) { case IB_LINK_LAYER_INFINIBAND: if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); break; default: break; } } static void cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. */ mutex_lock(&lock); list_del(&id_priv->list); while (!list_empty(&id_priv->listen_list)) { dev_id_priv = list_entry(id_priv->listen_list.next, struct rdma_id_private, listen_list); /* sync with device removal to avoid duplicate destruction */ list_del_init(&dev_id_priv->list); list_del(&dev_id_priv->listen_list); mutex_unlock(&lock); rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } mutex_unlock(&lock); } static void cma_cancel_operation(struct rdma_id_private *id_priv, enum rdma_cm_state state) { switch (state) { case RDMA_CM_ADDR_QUERY: rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: cma_cancel_route(id_priv); break; case RDMA_CM_LISTEN: if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr) && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: break; } } static void cma_release_port(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; mutex_lock(&lock); bind_list = id_priv->bind_list; if (!bind_list) { mutex_unlock(&lock); return; } hlist_del(&id_priv->node); id_priv->bind_list = NULL; if (hlist_empty(&bind_list->owners)) { idr_remove(bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); if (id_priv->sock) sock_release(id_priv->sock); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { mc = container_of(id_priv->mc_list.next, struct cma_multicast, list); list_del(&mc->list); switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) { case IB_LINK_LAYER_INFINIBAND: ib_sa_free_multicast(mc->multicast.ib); kfree(mc); break; case IB_LINK_LAYER_ETHERNET: kref_put(&mc->mcref, release_mc); break; default: break; } } } static void __rdma_free(struct work_struct *work) { struct rdma_id_private *id_priv; id_priv = container_of(work, struct rdma_id_private, work); wait_for_completion(&id_priv->comp); if (id_priv->internal_id) cma_deref_id(id_priv->id.context); kfree(id_priv->id.route.path_rec); kfree(id_priv); } void rdma_destroy_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; enum rdma_cm_state state; unsigned long flags; struct ib_cm_id *ib; id_priv = container_of(id, struct rdma_id_private, id); state = cma_exch(id_priv, RDMA_CM_DESTROYING); cma_cancel_operation(id_priv, state); /* * Wait for any active callback to finish. New callbacks will find * the id_priv state set to destroying and abort. */ mutex_lock(&id_priv->handler_mutex); mutex_unlock(&id_priv->handler_mutex); if (id_priv->cma_dev) { switch (rdma_node_get_transport(id_priv->id.device->node_type)) { case RDMA_TRANSPORT_IB: spin_lock_irqsave(&id_priv->cm_lock, flags); if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) { ib = id_priv->cm_id.ib; id_priv->cm_id.ib = NULL; spin_unlock_irqrestore(&id_priv->cm_lock, flags); ib_destroy_cm_id(ib); } else spin_unlock_irqrestore(&id_priv->cm_lock, flags); break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: if (id_priv->cm_id.iw) iw_destroy_cm_id(id_priv->cm_id.iw); break; default: break; } cma_leave_mc_groups(id_priv); cma_release_dev(id_priv); } cma_release_port(id_priv); cma_deref_id(id_priv); INIT_WORK(&id_priv->work, __rdma_free); queue_work(cma_free_wq, &id_priv->work); } EXPORT_SYMBOL(rdma_destroy_id); static int cma_rep_recv(struct rdma_id_private *id_priv) { int ret; ret = cma_modify_qp_rtr(id_priv, NULL); if (ret) goto reject; ret = cma_modify_qp_rts(id_priv, NULL); if (ret) goto reject; cma_dbg(id_priv, "sending RTU\n"); ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); cma_dbg(id_priv, "sending REJ\n"); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; } static int cma_verify_rep(struct rdma_id_private *id_priv, void *data) { if (id_priv->id.ps == RDMA_PS_SDP && sdp_get_majv(((struct sdp_hah *) data)->sdp_version) != SDP_MAJ_VERSION) return -EINVAL; return 0; } static void cma_set_rep_event_data(struct rdma_cm_event *event, struct ib_cm_rep_event_param *rep_data, void *private_data) { event->param.conn.private_data = private_data; event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; event->param.conn.responder_resources = rep_data->responder_resources; event->param.conn.initiator_depth = rep_data->initiator_depth; event->param.conn.flow_control = rep_data->flow_control; event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; } static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event; int ret = 0; if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && cma_disable_callback(id_priv, RDMA_CM_CONNECT)) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && cma_disable_callback(id_priv, RDMA_CM_DISCONNECT))) return 0; memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_REQ_ERROR: case IB_CM_REP_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: event.status = cma_verify_rep(id_priv, ib_event->private_data); if (event.status) event.event = RDMA_CM_EVENT_CONNECT_ERROR; else if (id_priv->id.qp && id_priv->id.ps != RDMA_PS_SDP) { event.status = cma_rep_recv(id_priv); event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR : RDMA_CM_EVENT_ESTABLISHED; } else event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, ib_event->private_data); break; case IB_CM_RTU_RECEIVED: case IB_CM_USER_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR: event.status = -ETIMEDOUT; /* fall through */ case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_DISCONNECT)) goto out; event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IB_CM_TIMEWAIT_EXIT: event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; break; case IB_CM_MRA_RECEIVED: /* ignore event */ goto out; case IB_CM_REJ_RECEIVED: cma_modify_qp_err(id_priv); event.status = ib_event->param.rej_rcvd.reason; event.event = RDMA_CM_EVENT_REJECTED; event.param.conn.private_data = ib_event->private_data; event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; union cma_ip_addr *src, *dst; __be16 port; u8 ip_ver; int ret; if (cma_get_net_info(ib_event->private_data, listen_id->ps, &ip_ver, &port, &src, &dst)) return NULL; id = rdma_create_id(listen_id->event_handler, listen_id->context, listen_id->ps, ib_event->param.req_rcvd.qp_type); if (IS_ERR(id)) return NULL; cma_save_net_info(&id->route.addr, &listen_id->route.addr, ip_ver, port, src, dst); rt = &id->route; rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths, GFP_KERNEL); if (!rt->path_rec) goto err; rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path; if (rt->num_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (cma_any_addr((struct sockaddr *) &rt->addr.src_addr)) { rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); } else { ret = rdma_translate_ip((struct sockaddr *) &rt->addr.src_addr, &rt->addr.dev_addr, NULL); if (ret) goto err; } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv; struct rdma_cm_id *id; union cma_ip_addr *src, *dst; __be16 port; u8 ip_ver; int ret; id = rdma_create_id(listen_id->event_handler, listen_id->context, listen_id->ps, IB_QPT_UD); if (IS_ERR(id)) return NULL; if (cma_get_net_info(ib_event->private_data, listen_id->ps, &ip_ver, &port, &src, &dst)) goto err; cma_save_net_info(&id->route.addr, &listen_id->route.addr, ip_ver, port, src, dst); if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) { ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, &id->route.addr.dev_addr, NULL); if (ret) goto err; } id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static void cma_set_req_event_data(struct rdma_cm_event *event, struct ib_cm_req_event_param *req_data, void *private_data, int offset) { event->param.conn.private_data = private_data + offset; event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; event->param.conn.responder_resources = req_data->responder_resources; event->param.conn.initiator_depth = req_data->initiator_depth; event->param.conn.flow_control = req_data->flow_control; event->param.conn.retry_count = req_data->retry_count; event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; } static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_event) { return (((ib_event->event == IB_CM_REQ_RECEIVED) && (ib_event->param.req_rcvd.qp_type == id->qp_type)) || ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) && (id->qp_type == IB_QPT_UD)) || (!id->qp_type)); } static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event; int offset, ret; u8 smac[ETH_ALEN]; u8 alt_smac[ETH_ALEN]; u8 *psmac = smac; u8 *palt_smac = alt_smac; int is_iboe = ((rdma_node_get_transport(cm_id->device->node_type) == RDMA_TRANSPORT_IB) && (rdma_port_get_link_layer(cm_id->device, ib_event->param.req_rcvd.port) == IB_LINK_LAYER_ETHERNET)); int is_sidr = 0; listen_id = cm_id->context; if (!cma_check_req_qp_type(&listen_id->id, ib_event)) return -EINVAL; if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) return -ECONNABORTED; memset(&event, 0, sizeof event); offset = cma_user_data_offset(listen_id->id.ps); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { is_sidr = 1; conn_id = cma_new_udp_id(&listen_id->id, ib_event); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { conn_id = cma_new_conn_id(&listen_id->id, ib_event); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } if (!conn_id) { ret = -ENOMEM; goto err1; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); ret = cma_acquire_dev(conn_id); if (ret) goto err2; conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; /* * Protect against the user destroying conn_id from another thread * until we're done accessing it. */ atomic_inc(&conn_id->refcount); ret = conn_id->id.event_handler(&conn_id->id, &event); if (ret) goto err3; if (is_iboe && !is_sidr) { if (ib_event->param.req_rcvd.primary_path != NULL) rdma_addr_find_smac_by_sgid( &ib_event->param.req_rcvd.primary_path->sgid, psmac, NULL); else psmac = NULL; if (ib_event->param.req_rcvd.alternate_path != NULL) rdma_addr_find_smac_by_sgid( &ib_event->param.req_rcvd.alternate_path->sgid, palt_smac, NULL); else palt_smac = NULL; } /* * Acquire mutex to prevent user executing rdma_destroy_id() * while we're accessing the cm_id. */ mutex_lock(&lock); if (is_iboe && !is_sidr) ib_update_cm_av(cm_id, psmac, palt_smac); if (cma_comp(conn_id, RDMA_CM_CONNECT) && (conn_id->id.qp_type != IB_QPT_UD)) { cma_dbg(container_of(&conn_id->id, struct rdma_id_private, id), "sending MRA\n"); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } mutex_unlock(&lock); mutex_unlock(&conn_id->handler_mutex); mutex_unlock(&listen_id->handler_mutex); cma_deref_id(conn_id); return 0; err3: cma_deref_id(conn_id); /* Destroy the CM ID by returning a non-zero value. */ conn_id->cm_id.ib = NULL; err2: cma_exch(conn_id, RDMA_CM_DESTROYING); mutex_unlock(&conn_id->handler_mutex); err1: mutex_unlock(&listen_id->handler_mutex); if (conn_id) rdma_destroy_id(&conn_id->id); return ret; } static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr) { return cpu_to_be64(((u64)ps << 16) + be16_to_cpu(cma_port(addr))); } static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, struct ib_cm_compare_data *compare) { struct cma_hdr *cma_data, *cma_mask; struct sdp_hh *sdp_data, *sdp_mask; __be32 ip4_addr; struct in6_addr ip6_addr; memset(compare, 0, sizeof *compare); cma_data = (void *) compare->data; cma_mask = (void *) compare->mask; sdp_data = (void *) compare->data; sdp_mask = (void *) compare->mask; switch (addr->sa_family) { case AF_INET: ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr; if (ps == RDMA_PS_SDP) { sdp_set_ip_ver(sdp_data, 4); sdp_set_ip_ver(sdp_mask, 0xF); if (!cma_any_addr(addr)) { sdp_data->dst_addr.ip4.addr = ip4_addr; sdp_mask->dst_addr.ip4.addr = htonl(~0); } } else { cma_set_ip_ver(cma_data, 4); cma_set_ip_ver(cma_mask, 0xF); if (!cma_any_addr(addr)) { cma_data->dst_addr.ip4.addr = ip4_addr; cma_mask->dst_addr.ip4.addr = htonl(~0); } } break; case AF_INET6: ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr; if (ps == RDMA_PS_SDP) { sdp_set_ip_ver(sdp_data, 6); sdp_set_ip_ver(sdp_mask, 0xF); if (!cma_any_addr(addr)) { sdp_data->dst_addr.ip6 = ip6_addr; memset(&sdp_mask->dst_addr.ip6, 0xFF, sizeof(sdp_mask->dst_addr.ip6)); } } else { cma_set_ip_ver(cma_data, 6); cma_set_ip_ver(cma_mask, 0xF); if (!cma_any_addr(addr)) { cma_data->dst_addr.ip6 = ip6_addr; memset(&cma_mask->dst_addr.ip6, 0xFF, sizeof(cma_mask->dst_addr.ip6)); } } break; default: break; } } static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; struct rdma_cm_event event; struct sockaddr_in *sin; int ret = 0; if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) return 0; memset(&event, 0, sizeof event); switch (iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY: sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; *sin = iw_event->local_addr; sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr; *sin = iw_event->remote_addr; switch ((int)iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; case -ECONNRESET: case -ECONNREFUSED: event.event = RDMA_CM_EVENT_REJECTED; break; case -ETIMEDOUT: event.event = RDMA_CM_EVENT_UNREACHABLE; break; default: event.event = RDMA_CM_EVENT_CONNECT_ERROR; break; } break; case IW_CM_EVENT_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; default: BUG_ON(1); } event.status = iw_event->status; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } mutex_unlock(&id_priv->handler_mutex); return ret; } static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; struct sockaddr_in *sin; struct net_device *dev = NULL; struct rdma_cm_event event; int ret; struct ib_device_attr attr; listen_id = cm_id->context; if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) return -ECONNABORTED; /* Create a new RDMA id for the new IW CM ID */ new_cm_id = rdma_create_id(listen_id->id.event_handler, listen_id->id.context, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(new_cm_id)) { ret = -ENOMEM; goto out; } conn_id = container_of(new_cm_id, struct rdma_id_private, id); mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr); if (!dev) { ret = -EADDRNOTAVAIL; mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } ret = cma_acquire_dev(conn_id); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } conn_id->cm_id.iw = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; sin = (struct sockaddr_in *) &new_cm_id->route.addr.src_addr; *sin = iw_event->local_addr; sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr; *sin = iw_event->remote_addr; ret = ib_query_device(conn_id->id.device, &attr); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; /* * Protect against the user destroying conn_id from another thread * until we're done accessing it. */ atomic_inc(&conn_id->refcount); ret = conn_id->id.event_handler(&conn_id->id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; cma_exch(conn_id, RDMA_CM_DESTROYING); mutex_unlock(&conn_id->handler_mutex); cma_deref_id(conn_id); rdma_destroy_id(&conn_id->id); goto out; } mutex_unlock(&conn_id->handler_mutex); cma_deref_id(conn_id); out: if (dev) dev_put(dev); mutex_unlock(&listen_id->handler_mutex); return ret; } static int cma_ib_listen(struct rdma_id_private *id_priv) { struct ib_cm_compare_data compare_data; struct sockaddr *addr; struct ib_cm_id *id; __be64 svc_id; int ret; id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv); if (IS_ERR(id)) return PTR_ERR(id); id_priv->cm_id.ib = id; addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; svc_id = cma_get_service_id(id_priv->id.ps, addr); if (cma_any_addr(addr) && !id_priv->afonly) ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL); else { cma_set_compare_data(id_priv->id.ps, addr, &compare_data); ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data); } if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } return ret; } static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; struct sockaddr_in *sin; struct iw_cm_id *id; id = iw_create_cm_id(id_priv->id.device, id_priv->sock, iw_conn_req_handler, id_priv); if (IS_ERR(id)) return PTR_ERR(id); id_priv->cm_id.iw = id; sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; id_priv->cm_id.iw->local_addr = *sin; ret = iw_cm_listen(id_priv->cm_id.iw, backlog); if (ret) { iw_destroy_cm_id(id_priv->cm_id.iw); id_priv->cm_id.iw = NULL; } return ret; } static int cma_listen_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rdma_id_private *id_priv = id->context; id->context = id_priv->id.context; id->event_handler = id_priv->id.event_handler; return id_priv->id.event_handler(id, event); } static void cma_listen_on_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { struct rdma_id_private *dev_id_priv; struct rdma_cm_id *id; int ret; id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps, id_priv->id.qp_type); if (IS_ERR(id)) return; dev_id_priv = container_of(id, struct rdma_id_private, id); dev_id_priv->state = RDMA_CM_ADDR_BOUND; + dev_id_priv->sock = id_priv->sock; memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr, ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); cma_attach_to_dev(dev_id_priv, cma_dev); list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); atomic_inc(&id_priv->refcount); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; ret = rdma_listen(id, id_priv->backlog); if (ret) cma_warn(id_priv, "cma_listen_on_dev, error %d, listening on device %s\n", ret, cma_dev->device->name); } static void cma_listen_on_all(struct rdma_id_private *id_priv) { struct cma_device *cma_dev; mutex_lock(&lock); list_add_tail(&id_priv->list, &listen_any_list); list_for_each_entry(cma_dev, &dev_list, list) cma_listen_on_dev(id_priv, cma_dev); mutex_unlock(&lock); } void rdma_set_service_type(struct rdma_cm_id *id, int tos) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); id_priv->tos = (u8) tos; } EXPORT_SYMBOL(rdma_set_service_type); void rdma_set_timeout(struct rdma_cm_id *id, int timeout) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); id_priv->qp_timeout = (u8) timeout; } EXPORT_SYMBOL(rdma_set_timeout); static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, void *context) { struct cma_work *work = context; struct rdma_route *route; route = &work->id->id.route; if (!status) { route->num_paths = 1; *route->path_rec = *path_rec; } else { work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; work->event.status = status; } queue_work(cma_wq, &work->work); } static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, struct cma_work *work) { struct rdma_addr *addr = &id_priv->id.route.addr; struct ib_sa_path_rec path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; memset(&path_rec, 0, sizeof path_rec); rdma_addr_get_sgid(&addr->dev_addr, &path_rec.sgid); rdma_addr_get_dgid(&addr->dev_addr, &path_rec.dgid); path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; path_rec.service_id = cma_get_service_id(id_priv->id.ps, (struct sockaddr *) &addr->dst_addr); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; if (addr->src_addr.ss_family == AF_INET) { path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; } else { sin6 = (struct sockaddr_in6 *) &addr->src_addr; path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, id_priv->id.port_num, &path_rec, comp_mask, timeout_ms, GFP_KERNEL, cma_query_handler, work, &id_priv->query); return (id_priv->query_id < 0) ? id_priv->query_id : 0; } static void cma_work_handler(struct work_struct *_work) { struct cma_work *work = container_of(_work, struct cma_work, work); struct rdma_id_private *id_priv = work->id; int destroy = 0; mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) goto out; if (id_priv->id.event_handler(&id_priv->id, &work->event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); destroy = 1; } out: mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); if (destroy) rdma_destroy_id(&id_priv->id); kfree(work); } static void cma_ndev_work_handler(struct work_struct *_work) { struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work); struct rdma_id_private *id_priv = work->id; int destroy = 0; mutex_lock(&id_priv->handler_mutex); if (id_priv->state == RDMA_CM_DESTROYING || id_priv->state == RDMA_CM_DEVICE_REMOVAL) goto out; if (id_priv->id.event_handler(&id_priv->id, &work->event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); destroy = 1; } out: mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); if (destroy) rdma_destroy_id(&id_priv->id); kfree(work); } static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } ret = cma_query_ib_route(id_priv, timeout_ms, work); if (ret) goto err2; return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } int rdma_set_ib_paths(struct rdma_cm_id *id, struct ib_sa_path_rec *path_rec, int num_paths) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths, GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } id->route.num_paths = num_paths; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_set_ib_paths); static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) { struct cma_work *work; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; queue_work(cma_wq, &work->work); return 0; } static u8 tos_to_sl(u8 tos) { return def_prec2sl & 7; } static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr; struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr; struct net_device *ndev = NULL; if (src_addr->sin_family != dst_addr->sin_family) return -EINVAL; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } route->num_paths = 1; if (addr->dev_addr.bound_dev_if) ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); if (!ndev) { ret = -ENODEV; goto err2; } route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev); memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN); memcpy(route->path_rec->smac, IF_LLADDR(ndev), ndev->if_addrlen); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; route->path_rec->sl = tos_to_sl(id_priv->tos); route->path_rec->mtu = iboe_get_mtu(ndev->if_mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = iboe_get_rate(ndev); dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; work->event.status = 0; queue_work(cma_wq, &work->work); return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: switch (rdma_port_get_link_layer(id->device, id->port_num)) { case IB_LINK_LAYER_INFINIBAND: ret = cma_resolve_ib_route(id_priv, timeout_ms); break; case IB_LINK_LAYER_ETHERNET: ret = cma_resolve_iboe_route(id_priv); break; default: ret = -ENOSYS; } break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: ret = cma_resolve_iw_route(id_priv, timeout_ms); break; default: ret = -ENOSYS; break; } if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); int rdma_enable_apm(struct rdma_cm_id *id, enum alt_path_type alt_type) { /* APM is not supported yet */ return -EINVAL; } EXPORT_SYMBOL(rdma_enable_apm); static int cma_bind_loopback(struct rdma_id_private *id_priv) { struct cma_device *cma_dev; struct ib_port_attr port_attr; union ib_gid gid; u16 pkey; int ret; u8 p; mutex_lock(&lock); if (list_empty(&dev_list)) { ret = -ENODEV; goto out; } list_for_each_entry(cma_dev, &dev_list, list) for (p = 1; p <= cma_dev->device->phys_port_cnt; ++p) if (!ib_query_port(cma_dev->device, p, &port_attr) && port_attr.state == IB_PORT_ACTIVE) goto port_found; p = 1; cma_dev = list_entry(dev_list.next, struct cma_device, list); port_found: ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid); if (ret) goto out; ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); if (ret) goto out; id_priv->id.route.addr.dev_addr.dev_type = (rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); out: mutex_unlock(&lock); return ret; } static void addr_handler(int status, struct sockaddr *src_addr, struct rdma_dev_addr *dev_addr, void *context) { struct rdma_id_private *id_priv = context; struct rdma_cm_event event; memset(&event, 0, sizeof event); mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) goto out; memcpy(&id_priv->id.route.addr.src_addr, src_addr, ip_addr_size(src_addr)); if (!status && !id_priv->cma_dev) status = cma_acquire_dev(id_priv); if (status) { if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ADDR_BOUND)) goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; if (id_priv->id.event_handler(&id_priv->id, &event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); rdma_destroy_id(&id_priv->id); return; } out: mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; struct sockaddr *src, *dst; union ib_gid gid; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_bind_loopback(id_priv); if (ret) goto err; } rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); src = (struct sockaddr *) &id_priv->id.route.addr.src_addr; if (cma_zero_addr(src)) { dst = (struct sockaddr *) &id_priv->id.route.addr.dst_addr; if ((src->sa_family = dst->sa_family) == AF_INET) { ((struct sockaddr_in *)src)->sin_addr = ((struct sockaddr_in *)dst)->sin_addr; } else { ((struct sockaddr_in6 *)src)->sin6_addr = ((struct sockaddr_in6 *)dst)->sin6_addr; } } work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); return 0; err: kfree(work); return ret; } static int cma_resolve_scif(struct rdma_id_private *id_priv) { struct cma_work *work; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; /* we probably can leave it empty here */ work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); return 0; } static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr) { if (!src_addr || !src_addr->sa_family) { src_addr = (struct sockaddr *) &id->route.addr.src_addr; if ((src_addr->sa_family = dst_addr->sa_family) == AF_INET6) { ((struct sockaddr_in6 *) src_addr)->sin6_scope_id = ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id; } } if (!cma_any_addr(src_addr)) return rdma_bind_addr(id, src_addr); else { struct sockaddr_in addr_in; memset(&addr_in, 0, sizeof addr_in); addr_in.sin_family = dst_addr->sa_family; addr_in.sin_len = sizeof addr_in; return rdma_bind_addr(id, (struct sockaddr *) &addr_in); } } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr, int timeout_ms) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id_priv->state == RDMA_CM_IDLE) { ret = cma_bind_addr(id, src_addr, dst_addr); if (ret) return ret; } if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr)); if (cma_any_addr(dst_addr)) ret = cma_resolve_loopback(id_priv); else if (id_priv->id.device && rdma_node_get_transport(id_priv->id.device->node_type) == RDMA_TRANSPORT_SCIF) ret = cma_resolve_scif(id_priv); else ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr, dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, id_priv); if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if (id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_reuseaddr); int rdma_set_afonly(struct rdma_cm_id *id, int afonly) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) { id_priv->options |= (1 << CMA_OPTION_AFONLY); id_priv->afonly = afonly; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_afonly); static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct sockaddr_in *sin; sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; sin->sin_port = htons(bind_list->port); id_priv->bind_list = bind_list; hlist_add_head(&id_priv->node, &bind_list->owners); } static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; int port, ret; bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; do { ret = idr_get_new_above(ps, bind_list, snum, &port); } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL)); if (ret) goto err1; if (port != snum) { ret = -EADDRNOTAVAIL; goto err2; } bind_list->ps = ps; bind_list->port = (unsigned short) port; cma_bind_port(bind_list, id_priv); return 0; err2: idr_remove(ps, port); err1: kfree(bind_list); return ret; } static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) { static unsigned int last_used_port; int low, high, remaining; unsigned int rover; inet_get_local_port_range(&low, &high); remaining = (high - low) + 1; rover = random() % remaining + low; retry: if (last_used_port != rover && !idr_find(ps, (unsigned short) rover)) { int ret = cma_alloc_port(ps, id_priv, rover); /* * Remember previously used port number in order to avoid * re-using same port immediately after it is closed. */ if (!ret) last_used_port = rover; if (ret != -EADDRNOTAVAIL) return ret; } if (--remaining) { rover++; if ((rover < low) || (rover > high)) rover = low; goto retry; } return -EADDRNOTAVAIL; } /* * Check that the requested port is available. This is called when trying to * bind to a specific port, or when trying to listen on a bound port. In * the latter case, the provided id_priv may already be on the bind_list, but * we still need to check that it's okay to start listening. */ static int cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv, uint8_t reuseaddr) { struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr && cur_id->reuseaddr) continue; cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr; if (id_priv->afonly && cur_id->afonly && (addr->sa_family != cur_addr->sa_family)) continue; if (cma_any_addr(addr) || cma_any_addr(cur_addr)) return -EADDRNOTAVAIL; if (!cma_addr_cmp(addr, cur_addr)) return -EADDRINUSE; } return 0; } static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; unsigned short snum; int ret; snum = ntohs(cma_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)); bind_list = idr_find(ps, snum); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, snum); } else { ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); if (!ret) cma_bind_port(bind_list, id_priv); } return ret; } static int cma_bind_listen(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list = id_priv->bind_list; int ret = 0; mutex_lock(&lock); if (bind_list->owners.first->next) ret = cma_check_port(bind_list, id_priv, 0); mutex_unlock(&lock); return ret; } static int cma_get_tcp_port(struct rdma_id_private *id_priv) { int ret; int size; struct socket *sock; ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret) return ret; #ifdef __linux__ ret = sock->ops->bind(sock, (struct sockaddr *) &id_priv->id.route.addr.src_addr, ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); #else ret = -sobind(sock, (struct sockaddr *)&id_priv->id.route.addr.src_addr, curthread); #endif if (ret) { sock_release(sock); return ret; } size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr); ret = sock_getname(sock, (struct sockaddr *) &id_priv->id.route.addr.src_addr, &size, 0); if (ret) { sock_release(sock); return ret; } id_priv->sock = sock; return 0; } static int cma_get_port(struct rdma_id_private *id_priv) { struct idr *ps; int ret; switch (id_priv->id.ps) { case RDMA_PS_SDP: ps = &sdp_ps; break; case RDMA_PS_TCP: ps = &tcp_ps; if (unify_tcp_port_space) { ret = cma_get_tcp_port(id_priv); if (ret) goto out; } break; case RDMA_PS_UDP: ps = &udp_ps; break; case RDMA_PS_IPOIB: ps = &ipoib_ps; break; case RDMA_PS_IB: ps = &ib_ps; break; default: return -EPROTONOSUPPORT; } mutex_lock(&lock); if (cma_any_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)) ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); out: return ret; } static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, struct sockaddr *addr) { #if defined(INET6) struct sockaddr_in6 *sin6; if (addr->sa_family != AF_INET6) return 0; sin6 = (struct sockaddr_in6 *) addr; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) && !sin6->sin6_scope_id) return -EINVAL; dev_addr->bound_dev_if = sin6->sin6_scope_id; #endif return 0; } int rdma_listen(struct rdma_cm_id *id, int backlog) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id_priv->state == RDMA_CM_IDLE) { ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET; ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr); if (ret) return ret; } if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) return -EINVAL; if (id_priv->reuseaddr) { ret = cma_bind_listen(id_priv); if (ret) goto err; } id_priv->backlog = backlog; if (id->device) { switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: ret = cma_ib_listen(id_priv); if (ret) goto err; break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: ret = cma_iw_listen(id_priv, backlog); if (ret) goto err; break; default: ret = -ENOSYS; goto err; } } else cma_listen_on_all(id_priv); return 0; err: id_priv->backlog = 0; cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_listen); int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; int ret; #if defined(INET6) int ipv6only; size_t var_size = sizeof(int); #endif if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) return -EAFNOSUPPORT; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL; ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); if (ret) goto err1; memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr)); if (!cma_any_addr(addr)) { ret = rdma_translate_ip(addr, &id->route.addr.dev_addr, NULL); if (ret) goto err1; ret = cma_acquire_dev(id_priv); if (ret) goto err1; } if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { if (addr->sa_family == AF_INET) id_priv->afonly = 1; #if defined(INET6) else if (addr->sa_family == AF_INET6) id_priv->afonly = kernel_sysctlbyname(&thread0, "net.inet6.ip6.v6only", &ipv6only, &var_size, NULL, 0, NULL, 0); #endif } ret = cma_get_port(id_priv); if (ret) goto err2; return 0; err2: if (id_priv->cma_dev) cma_release_dev(id_priv); err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; } EXPORT_SYMBOL(rdma_bind_addr); static int cma_format_hdr(void *hdr, enum rdma_port_space ps, struct rdma_route *route) { struct cma_hdr *cma_hdr; struct sdp_hh *sdp_hdr; if (route->addr.src_addr.ss_family == AF_INET) { struct sockaddr_in *src4, *dst4; src4 = (struct sockaddr_in *) &route->addr.src_addr; dst4 = (struct sockaddr_in *) &route->addr.dst_addr; switch (ps) { case RDMA_PS_SDP: sdp_hdr = hdr; if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) return -EINVAL; sdp_set_ip_ver(sdp_hdr, 4); sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; sdp_hdr->port = src4->sin_port; break; default: cma_hdr = hdr; cma_hdr->cma_version = CMA_VERSION; cma_set_ip_ver(cma_hdr, 4); cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; cma_hdr->port = src4->sin_port; break; } } else { struct sockaddr_in6 *src6, *dst6; src6 = (struct sockaddr_in6 *) &route->addr.src_addr; dst6 = (struct sockaddr_in6 *) &route->addr.dst_addr; switch (ps) { case RDMA_PS_SDP: sdp_hdr = hdr; if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) return -EINVAL; sdp_set_ip_ver(sdp_hdr, 6); sdp_hdr->src_addr.ip6 = src6->sin6_addr; sdp_hdr->dst_addr.ip6 = dst6->sin6_addr; sdp_hdr->port = src6->sin6_port; break; default: cma_hdr = hdr; cma_hdr->cma_version = CMA_VERSION; cma_set_ip_ver(cma_hdr, 6); cma_hdr->src_addr.ip6 = src6->sin6_addr; cma_hdr->dst_addr.ip6 = dst6->sin6_addr; cma_hdr->port = src6->sin6_port; break; } } return 0; } static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event; struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret = 0; if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) return 0; memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_SIDR_REP_RECEIVED: event.param.ud.private_data = ib_event->private_data; event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; if (rep->status != IB_SIDR_SUCCESS) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = ib_event->param.sidr_rep_rcvd.status; break; } ret = cma_set_qkey(id_priv); if (ret) { event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = -EINVAL; break; } if (id_priv->qkey != rep->qkey) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -EINVAL; break; } ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num, id_priv->id.route.path_rec, &event.param.ud.ah_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; event.status = 0; break; default: printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_sidr_req_param req; struct rdma_route *route; struct ib_cm_id *id; int ret; req.private_data_len = sizeof(struct cma_hdr) + conn_param->private_data_len; if (req.private_data_len < conn_param->private_data_len) return -EINVAL; req.private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!req.private_data) return -ENOMEM; if (conn_param->private_data && conn_param->private_data_len) memcpy((void *) req.private_data + sizeof(struct cma_hdr), conn_param->private_data, conn_param->private_data_len); route = &id_priv->id.route; ret = cma_format_hdr((void *) req.private_data, id_priv->id.ps, route); if (ret) goto out; id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; req.path = route->path_rec; req.service_id = cma_get_service_id(id_priv->id.ps, (struct sockaddr *) &route->addr.dst_addr); req.timeout_ms = 1 << (cma_response_timeout - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; cma_dbg(id_priv, "sending SIDR\n"); ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } out: kfree(req.private_data); return ret; } static int cma_connect_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_req_param req; struct rdma_route *route; void *private_data; struct ib_cm_id *id; int offset, ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv->id.ps); req.private_data_len = offset + conn_param->private_data_len; if (req.private_data_len < conn_param->private_data_len) return -EINVAL; private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; route = &id_priv->id.route; ret = cma_format_hdr(private_data, id_priv->id.ps, route); if (ret) goto out; req.private_data = private_data; req.primary_path = &route->path_rec[0]; if (route->num_paths == 2) req.alternate_path = &route->path_rec[1]; req.service_id = cma_get_service_id(id_priv->id.ps, (struct sockaddr *) &route->addr.dst_addr); req.qp_num = id_priv->qp_num; req.qp_type = id_priv->id.qp_type; req.starting_psn = id_priv->seq_num; req.responder_resources = conn_param->responder_resources; req.initiator_depth = conn_param->initiator_depth; req.flow_control = conn_param->flow_control; req.retry_count = min_t(u8, 7, conn_param->retry_count); req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); req.remote_cm_response_timeout = cma_response_timeout; req.local_cm_response_timeout = cma_response_timeout; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 1 : 0; cma_dbg(id_priv, "sending REQ\n"); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: if (ret && !IS_ERR(id)) { ib_destroy_cm_id(id); id_priv->cm_id.ib = NULL; } kfree(private_data); return ret; } static int cma_connect_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_id *cm_id; struct sockaddr_in* sin; int ret; struct iw_cm_conn_param iw_param; cm_id = iw_create_cm_id(id_priv->id.device, id_priv->sock, cma_iw_handler, id_priv); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); id_priv->cm_id.iw = cm_id; sin = (struct sockaddr_in*) &id_priv->id.route.addr.src_addr; cm_id->local_addr = *sin; sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr; cm_id->remote_addr = *sin; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; if (conn_param) { iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num; } else { memset(&iw_param, 0, sizeof iw_param); iw_param.qpn = id_priv->qp_num; } ret = iw_cm_connect(cm_id, &iw_param); out: if (ret) { iw_destroy_cm_id(cm_id); id_priv->cm_id.iw = NULL; } return ret; } int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: if (id->qp_type == IB_QPT_UD) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: ret = cma_connect_iw(id_priv, conn_param); break; default: ret = -ENOSYS; break; } if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_connect); static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_rep_param rep; int ret; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; ret = cma_modify_qp_rts(id_priv, conn_param); if (ret) goto out; memset(&rep, 0, sizeof rep); rep.qp_num = id_priv->qp_num; rep.starting_psn = id_priv->seq_num; rep.private_data = conn_param->private_data; rep.private_data_len = conn_param->private_data_len; rep.responder_resources = conn_param->responder_resources; rep.initiator_depth = conn_param->initiator_depth; rep.failover_accepted = 0; rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 1 : 0; cma_dbg(id_priv, "sending REP\n"); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; } static int cma_accept_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_conn_param iw_param; int ret; if (!conn_param) return -EINVAL; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) return ret; iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; if (id_priv->id.qp) { iw_param.qpn = id_priv->qp_num; } else iw_param.qpn = conn_param->qp_num; return iw_cm_accept(id_priv->cm_id.iw, &iw_param); } static int cma_send_sidr_rep(struct rdma_id_private *id_priv, enum ib_cm_sidr_status status, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; int ret; memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { ret = cma_set_qkey(id_priv); if (ret) return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; } rep.private_data = private_data; rep.private_data_len = private_data_len; cma_dbg(id_priv, "sending SIDR\n"); return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); id_priv->owner = curthread->td_proc->p_pid; if (!cma_comp(id_priv, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp && conn_param) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: if (id->qp_type == IB_QPT_UD) { if (conn_param) ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, conn_param->private_data, conn_param->private_data_len); else ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, NULL, 0); } else { if (conn_param) ret = cma_accept_ib(id_priv, conn_param); else ret = cma_rep_recv(id_priv); } break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: ret = cma_accept_iw(id_priv, conn_param); break; default: ret = -ENOSYS; break; } if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); rdma_reject(id, NULL, 0); return ret; } EXPORT_SYMBOL(rdma_accept); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; switch (id->device->node_type) { case RDMA_NODE_IB_CA: ret = ib_cm_notify(id_priv->cm_id.ib, event); break; default: ret = 0; break; } return ret; } EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, u8 private_data_len) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: if (id->qp_type == IB_QPT_UD) ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, private_data, private_data_len); else { cma_dbg(id_priv, "sending REJ\n"); ret = ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, private_data, private_data_len); } break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); break; default: ret = -ENOSYS; break; } return ret; } EXPORT_SYMBOL(rdma_reject); int rdma_disconnect(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: ret = cma_modify_qp_err(id_priv); if (ret) goto out; /* Initiate or respond to a disconnect. */ cma_dbg(id_priv, "sending DREQ\n"); if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { cma_dbg(id_priv, "sending DREP\n"); ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); } break; case RDMA_TRANSPORT_IWARP: case RDMA_TRANSPORT_SCIF: ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); break; default: ret = -EINVAL; break; } out: return ret; } EXPORT_SYMBOL(rdma_disconnect); static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) { struct rdma_id_private *id_priv; struct cma_multicast *mc = multicast->context; struct rdma_cm_event event; struct rdma_dev_addr *dev_addr; int ret; struct net_device *ndev = NULL; u16 vlan; id_priv = mc->id_priv; dev_addr = &id_priv->id.route.addr.dev_addr; if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) && cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED)) return 0; mutex_lock(&id_priv->qp_mutex); if (!status && id_priv->id.qp) status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, be16_to_cpu(multicast->rec.mlid)); mutex_unlock(&id_priv->qp_mutex); memset(&event, 0, sizeof event); event.status = status; event.param.ud.private_data = mc->context; ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); if (!ndev) { status = -ENODEV; } else { vlan = rdma_vlan_dev_vlan_id(ndev); dev_put(ndev); } if (!status) { event.event = RDMA_CM_EVENT_MULTICAST_JOIN; ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, &event.param.ud.ah_attr); event.param.ud.ah_attr.vlan_id = vlan; event.param.ud.qp_num = 0xFFFFFF; event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); } else { event.event = RDMA_CM_EVENT_MULTICAST_ERROR; /* mark that the cached record is no longer valid */ if (status != -ENETRESET && status != -EAGAIN) { spin_lock(&id_priv->lock); id_priv->is_valid_rec = 0; spin_unlock(&id_priv->lock); } } ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return 0; } mutex_unlock(&id_priv->handler_mutex); return 0; } static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr *addr, union ib_gid *mgid) { unsigned char mc_map[MAX_ADDR_LEN]; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sockaddr_in *sin = (struct sockaddr_in *) addr; #if defined(INET6) struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; #endif if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); #if defined(INET6) } else if ((addr->sa_family == AF_INET6) && ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. */ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); #endif } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } } static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct ib_sa_mcmember_rec rec; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; int ret = 0; ib_addr_get_mgid(dev_addr, &id_priv->rec.mgid); /* cache ipoib bc record */ spin_lock(&id_priv->lock); if (!id_priv->is_valid_rec) ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &id_priv->rec.mgid, &id_priv->rec); if (ret) { id_priv->is_valid_rec = 0; spin_unlock(&id_priv->lock); return ret; } else { rec = id_priv->rec; id_priv->is_valid_rec = 1; } spin_unlock(&id_priv->lock); cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); if (id_priv->id.ps == RDMA_PS_UDP) rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = 1; comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; if (id_priv->id.ps == RDMA_PS_IPOIB) comp_mask |= IB_SA_MCMEMBER_REC_RATE | IB_SA_MCMEMBER_REC_RATE_SELECTOR | IB_SA_MCMEMBER_REC_MTU_SELECTOR | IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_HOP_LIMIT; mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device, id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, cma_ib_mc_handler, mc); return PTR_RET(mc->multicast.ib); } static void iboe_mcast_work_handler(struct work_struct *work) { struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work); struct cma_multicast *mc = mw->mc; struct ib_sa_multicast *m = mc->multicast.ib; mc->multicast.ib->context = mc; cma_ib_mc_handler(0, m); kref_put(&mc->mcref, release_mc); kfree(mw); } static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { mgid->raw[0] = 0xff; mgid->raw[1] = 0x0e; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; mgid->raw[5] = 0; mgid->raw[6] = 0; mgid->raw[7] = 0; mgid->raw[8] = 0; mgid->raw[9] = 0; mgid->raw[10] = 0xff; mgid->raw[11] = 0xff; *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; } } static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct iboe_mcast_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int err; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; if (cma_zero_addr((struct sockaddr *)&mc->addr)) return -EINVAL; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL); if (!mc->multicast.ib) { err = -ENOMEM; goto out1; } cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid); mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); if (id_priv->id.ps == RDMA_PS_UDP) mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); if (dev_addr->bound_dev_if) ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); if (!ndev) { err = -ENODEV; goto out2; } mc->multicast.ib->rec.rate = iboe_get_rate(ndev); mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->if_mtu); dev_put(ndev); if (!mc->multicast.ib->rec.mtu) { err = -EINVAL; goto out2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &mc->multicast.ib->rec.port_gid); work->id = id_priv; work->mc = mc; INIT_WORK(&work->work, iboe_mcast_work_handler); kref_get(&mc->mcref); queue_work(cma_wq, &work->work); return 0; out2: kfree(mc->multicast.ib); out1: kfree(work); return err; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, void *context) { struct rdma_id_private *id_priv; struct cma_multicast *mc; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) && !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED)) return -EINVAL; mc = kmalloc(sizeof *mc, GFP_KERNEL); if (!mc) return -ENOMEM; memcpy(&mc->addr, addr, ip_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: switch (rdma_port_get_link_layer(id->device, id->port_num)) { case IB_LINK_LAYER_INFINIBAND: ret = cma_join_ib_multicast(id_priv, mc); break; case IB_LINK_LAYER_ETHERNET: kref_init(&mc->mcref); ret = cma_iboe_join_multicast(id_priv, mc); break; default: ret = -EINVAL; } break; default: ret = -ENOSYS; break; } if (ret) { spin_lock_irq(&id_priv->lock); list_del(&mc->list); spin_unlock_irq(&id_priv->lock); kfree(mc); } return ret; } EXPORT_SYMBOL(rdma_join_multicast); void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; struct cma_multicast *mc; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { if (!memcmp(&mc->addr, addr, ip_addr_size(addr))) { list_del(&mc->list); spin_unlock_irq(&id_priv->lock); if (id->qp) ib_detach_mcast(id->qp, &mc->multicast.ib->rec.mgid, be16_to_cpu(mc->multicast.ib->rec.mlid)); if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) { switch (rdma_port_get_link_layer(id->device, id->port_num)) { case IB_LINK_LAYER_INFINIBAND: ib_sa_free_multicast(mc->multicast.ib); kfree(mc); break; case IB_LINK_LAYER_ETHERNET: kref_put(&mc->mcref, release_mc); break; default: break; } } return; } } spin_unlock_irq(&id_priv->lock); } EXPORT_SYMBOL(rdma_leave_multicast); static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr; struct cma_ndev_work *work; dev_addr = &id_priv->id.route.addr.dev_addr; if ((dev_addr->bound_dev_if == ndev->if_index) && memcmp(dev_addr->src_dev_addr, IF_LLADDR(ndev), ndev->if_addrlen)) { printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", ndev->if_xname, &id_priv->id); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; INIT_WORK(&work->work, cma_ndev_work_handler); work->id = id_priv; work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; atomic_inc(&id_priv->refcount); queue_work(cma_wq, &work->work); } return 0; } static int cma_netdev_callback(struct notifier_block *self, unsigned long event, void *ctx) { struct net_device *ndev = (struct net_device *)ctx; struct cma_device *cma_dev; struct rdma_id_private *id_priv; int ret = NOTIFY_DONE; /* BONDING related, commented out until the bonding is resolved */ #if 0 if (dev_net(ndev) != &init_net) return NOTIFY_DONE; if (event != NETDEV_BONDING_FAILOVER) return NOTIFY_DONE; if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING)) return NOTIFY_DONE; #endif if (event != NETDEV_DOWN && event != NETDEV_UNREGISTER) return NOTIFY_DONE; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) list_for_each_entry(id_priv, &cma_dev->id_list, list) { ret = cma_netdev_change(ndev, id_priv); if (ret) goto out; } out: mutex_unlock(&lock); return ret; } static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; static void cma_add_one(struct ib_device *device) { struct cma_device *cma_dev; struct rdma_id_private *id_priv; cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); if (!cma_dev) return; cma_dev->device = device; init_completion(&cma_dev->comp); atomic_set(&cma_dev->refcount, 1); INIT_LIST_HEAD(&cma_dev->id_list); ib_set_client_data(device, &cma_client, cma_dev); mutex_lock(&lock); list_add_tail(&cma_dev->list, &dev_list); list_for_each_entry(id_priv, &listen_any_list, list) cma_listen_on_dev(id_priv, cma_dev); mutex_unlock(&lock); } static int cma_remove_id_dev(struct rdma_id_private *id_priv) { struct rdma_cm_event event; enum rdma_cm_state state; int ret = 0; /* Record that we want to remove the device */ state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL); if (state == RDMA_CM_DESTROYING) return 0; cma_cancel_operation(id_priv, state); mutex_lock(&id_priv->handler_mutex); /* Check for destruction from another callback. */ if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL)) goto out; memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_DEVICE_REMOVAL; ret = id_priv->id.event_handler(&id_priv->id, &event); out: mutex_unlock(&id_priv->handler_mutex); return ret; } static void cma_process_remove(struct cma_device *cma_dev) { struct rdma_id_private *id_priv; int ret; mutex_lock(&lock); while (!list_empty(&cma_dev->id_list)) { id_priv = list_entry(cma_dev->id_list.next, struct rdma_id_private, list); list_del(&id_priv->listen_list); list_del_init(&id_priv->list); atomic_inc(&id_priv->refcount); mutex_unlock(&lock); ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv); cma_deref_id(id_priv); if (ret) rdma_destroy_id(&id_priv->id); mutex_lock(&lock); } mutex_unlock(&lock); cma_deref_dev(cma_dev); wait_for_completion(&cma_dev->comp); } static void cma_remove_one(struct ib_device *device) { struct cma_device *cma_dev; cma_dev = ib_get_client_data(device, &cma_client); if (!cma_dev) return; mutex_lock(&lock); list_del(&cma_dev->list); mutex_unlock(&lock); cma_process_remove(cma_dev); kfree(cma_dev); } static int __init cma_init(void) { int ret = -ENOMEM; cma_wq = create_singlethread_workqueue("rdma_cm"); if (!cma_wq) return -ENOMEM; cma_free_wq = create_singlethread_workqueue("rdma_cm_fr"); if (!cma_free_wq) goto err1; ib_sa_register_client(&sa_client); rdma_addr_register_client(&addr_client); register_netdevice_notifier(&cma_nb); ret = ib_register_client(&cma_client); if (ret) goto err; return 0; err: unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); destroy_workqueue(cma_free_wq); err1: destroy_workqueue(cma_wq); return ret; } static void __exit cma_cleanup(void) { ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); flush_workqueue(cma_free_wq); destroy_workqueue(cma_free_wq); destroy_workqueue(cma_wq); idr_destroy(&sdp_ps); idr_destroy(&tcp_ps); idr_destroy(&udp_ps); idr_destroy(&ipoib_ps); idr_destroy(&ib_ps); } module_init(cma_init); module_exit(cma_cleanup); Index: head/sys/ofed/drivers/infiniband/core/iwcm.c =================================================================== --- head/sys/ofed/drivers/infiniband/core/iwcm.c (revision 294609) +++ head/sys/ofed/drivers/infiniband/core/iwcm.c (revision 294610) @@ -1,1038 +1,1318 @@ /* * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. + * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ +#include "opt_inet.h" + #include #include #include #include #include #include #include #include #include #include #include #include +#include +#include +#include #include #include #include "iwcm.h" MODULE_AUTHOR("Tom Tucker"); MODULE_DESCRIPTION("iWARP CM"); MODULE_LICENSE("Dual BSD/GPL"); static struct workqueue_struct *iwcm_wq; struct iwcm_work { struct work_struct work; struct iwcm_id_private *cm_id; struct list_head list; struct iw_cm_event event; struct list_head free_list; }; +struct iwcm_listen_work { + struct work_struct work; + struct iw_cm_id *cm_id; +}; +static LIST_HEAD(listen_port_list); + +static DEFINE_MUTEX(listen_port_mutex); +static DEFINE_MUTEX(dequeue_mutex); + +struct listen_port_info { + struct list_head list; + uint16_t port_num; + uint32_t refcnt; +}; + +static int32_t +add_port_to_listenlist(uint16_t port) +{ + struct listen_port_info *port_info; + int err = 0; + + mutex_lock(&listen_port_mutex); + + list_for_each_entry(port_info, &listen_port_list, list) + if (port_info->port_num == port) + goto found_port; + + port_info = kmalloc(sizeof(*port_info), GFP_KERNEL); + if (!port_info) { + err = -ENOMEM; + mutex_unlock(&listen_port_mutex); + goto out; + } + + port_info->port_num = port; + port_info->refcnt = 0; + + list_add(&port_info->list, &listen_port_list); + +found_port: + ++(port_info->refcnt); + mutex_unlock(&listen_port_mutex); + return port_info->refcnt; +out: + return err; +} + +static int32_t +rem_port_from_listenlist(uint16_t port) +{ + struct listen_port_info *port_info; + int ret, found_port = 0; + + mutex_lock(&listen_port_mutex); + + list_for_each_entry(port_info, &listen_port_list, list) + if (port_info->port_num == port) { + found_port = 1; + break; + } + + if (found_port) { + --(port_info->refcnt); + ret = port_info->refcnt; + if (port_info->refcnt == 0) { + /* Remove this entry from the list as there are no + * more listeners for this port_num. + */ + list_del(&port_info->list); + kfree(port_info); + } + } else { + ret = -EINVAL; + } + mutex_unlock(&listen_port_mutex); + return ret; + +} + /* * The following services provide a mechanism for pre-allocating iwcm_work * elements. The design pre-allocates them based on the cm_id type: * LISTENING IDS: Get enough elements preallocated to handle the * listen backlog. * ACTIVE IDS: 4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE * PASSIVE IDS: 3: ESTABLISHED, DISCONNECT, CLOSE * * Allocating them in connect and listen avoids having to deal * with allocation failures on the event upcall from the provider (which * is called in the interrupt context). * * One exception is when creating the cm_id for incoming connection requests. * There are two cases: * 1) in the event upcall, cm_event_handler(), for a listening cm_id. If * the backlog is exceeded, then no more connection request events will * be processed. cm_event_handler() returns -ENOMEM in this case. Its up * to the provider to reject the connection request. * 2) in the connection request workqueue handler, cm_conn_req_handler(). * If work elements cannot be allocated for the new connect request cm_id, * then IWCM will call the provider reject method. This is ok since * cm_conn_req_handler() runs in the workqueue thread context. */ static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv) { struct iwcm_work *work; if (list_empty(&cm_id_priv->work_free_list)) return NULL; work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work, free_list); list_del_init(&work->free_list); return work; } static void put_work(struct iwcm_work *work) { list_add(&work->free_list, &work->cm_id->work_free_list); } static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv) { struct list_head *e, *tmp; list_for_each_safe(e, tmp, &cm_id_priv->work_free_list) kfree(list_entry(e, struct iwcm_work, free_list)); } static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count) { struct iwcm_work *work; BUG_ON(!list_empty(&cm_id_priv->work_free_list)); while (count--) { work = kmalloc(sizeof(struct iwcm_work), GFP_KERNEL); if (!work) { dealloc_work_entries(cm_id_priv); return -ENOMEM; } work->cm_id = cm_id_priv; INIT_LIST_HEAD(&work->list); put_work(work); } return 0; } /* * Save private data from incoming connection requests to * iw_cm_event, so the low level driver doesn't have to. Adjust * the event ptr to point to the local copy. */ static int copy_private_data(struct iw_cm_event *event) { void *p; p = kmemdup(event->private_data, event->private_data_len, GFP_ATOMIC); if (!p) return -ENOMEM; event->private_data = p; return 0; } static void free_cm_id(struct iwcm_id_private *cm_id_priv) { dealloc_work_entries(cm_id_priv); kfree(cm_id_priv); } /* * Release a reference on cm_id. If the last reference is being * released, enable the waiting thread (in iw_destroy_cm_id) to * get woken up, and return 1 if a thread is already waiting. */ static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv) { BUG_ON(atomic_read(&cm_id_priv->refcount)==0); if (atomic_dec_and_test(&cm_id_priv->refcount)) { BUG_ON(!list_empty(&cm_id_priv->work_list)); complete(&cm_id_priv->destroy_comp); return 1; } return 0; } static void add_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); atomic_inc(&cm_id_priv->refcount); } static void rem_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); if (iwcm_deref_id(cm_id_priv) && test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags)) { BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); } } static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event); struct iw_cm_id *iw_create_cm_id(struct ib_device *device, struct socket *so, iw_cm_handler cm_handler, void *context) { struct iwcm_id_private *cm_id_priv; cm_id_priv = kzalloc(sizeof(*cm_id_priv), GFP_KERNEL); if (!cm_id_priv) return ERR_PTR(-ENOMEM); cm_id_priv->state = IW_CM_STATE_IDLE; cm_id_priv->id.device = device; cm_id_priv->id.cm_handler = cm_handler; cm_id_priv->id.context = context; cm_id_priv->id.event_handler = cm_event_handler; cm_id_priv->id.add_ref = add_ref; cm_id_priv->id.rem_ref = rem_ref; cm_id_priv->id.so = so; spin_lock_init(&cm_id_priv->lock); atomic_set(&cm_id_priv->refcount, 1); init_waitqueue_head(&cm_id_priv->connect_wait); init_completion(&cm_id_priv->destroy_comp); INIT_LIST_HEAD(&cm_id_priv->work_list); INIT_LIST_HEAD(&cm_id_priv->work_free_list); return &cm_id_priv->id; } EXPORT_SYMBOL(iw_create_cm_id); static int iwcm_modify_qp_err(struct ib_qp *qp) { struct ib_qp_attr qp_attr; if (!qp) return -EINVAL; qp_attr.qp_state = IB_QPS_ERR; return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); } /* * This is really the RDMAC CLOSING state. It is most similar to the * IB SQD QP state. */ static int iwcm_modify_qp_sqd(struct ib_qp *qp) { struct ib_qp_attr qp_attr; BUG_ON(qp == NULL); qp_attr.qp_state = IB_QPS_SQD; return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); } /* * CM_ID <-- CLOSING * * Block if a passive or active connection is currently being processed. Then * process the event as follows: * - If we are ESTABLISHED, move to CLOSING and modify the QP state * based on the abrupt flag * - If the connection is already in the CLOSING or IDLE state, the peer is * disconnecting concurrently with us and we've already seen the * DISCONNECT event -- ignore the request and return 0 * - Disconnect on a listening endpoint returns -EINVAL */ int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt) { struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret = 0; struct ib_qp *qp = NULL; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); /* Wait if we're currently in a connect or accept downcall */ wait_event(cm_id_priv->connect_wait, !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_ESTABLISHED: cm_id_priv->state = IW_CM_STATE_CLOSING; /* QP could be for user-mode client */ if (cm_id_priv->qp) qp = cm_id_priv->qp; else ret = -EINVAL; break; case IW_CM_STATE_LISTEN: ret = -EINVAL; break; case IW_CM_STATE_CLOSING: /* remote peer closed first */ case IW_CM_STATE_IDLE: /* accept or connect returned !0 */ break; case IW_CM_STATE_CONN_RECV: /* * App called disconnect before/without calling accept after * connect_request event delivered. */ break; case IW_CM_STATE_CONN_SENT: /* Can only get here if wait above fails */ default: BUG(); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (qp) { if (abrupt) ret = iwcm_modify_qp_err(qp); else ret = iwcm_modify_qp_sqd(qp); /* * If both sides are disconnecting the QP could * already be in ERR or SQD states */ ret = 0; } return ret; } EXPORT_SYMBOL(iw_cm_disconnect); +static struct socket * +dequeue_socket(struct socket *head) +{ + struct socket *so; + struct sockaddr_in *remote; + + ACCEPT_LOCK(); + so = TAILQ_FIRST(&head->so_comp); + if (!so) { + ACCEPT_UNLOCK(); + return NULL; + } + + SOCK_LOCK(so); + /* + * Before changing the flags on the socket, we have to bump the + * reference count. Otherwise, if the protocol calls sofree(), + * the socket will be released due to a zero refcount. + */ + soref(so); + TAILQ_REMOVE(&head->so_comp, so, so_list); + head->so_qlen--; + so->so_qstate &= ~SQ_COMP; + so->so_head = NULL; + so->so_state |= SS_NBIO; + SOCK_UNLOCK(so); + ACCEPT_UNLOCK(); + soaccept(so, (struct sockaddr **)&remote); + + free(remote, M_SONAME); + return so; +} +static void +iw_so_event_handler(struct work_struct *_work) +{ +#ifdef INET + struct iwcm_listen_work *work = container_of(_work, + struct iwcm_listen_work, work); + struct iw_cm_id *listen_cm_id = work->cm_id; + struct iwcm_id_private *cm_id_priv; + struct iw_cm_id *real_cm_id; + struct sockaddr_in *local; + struct socket *so; + + cm_id_priv = container_of(listen_cm_id, struct iwcm_id_private, id); + + if (cm_id_priv->state != IW_CM_STATE_LISTEN) { + kfree(work); + return; + } + mutex_lock(&dequeue_mutex); + + /* Dequeue & process all new 'so' connection requests for this cmid */ + while ((so = dequeue_socket(work->cm_id->so)) != NULL) { + if (rdma_cma_any_addr((struct sockaddr *) + &listen_cm_id->local_addr)) { + in_getsockaddr(so, (struct sockaddr **)&local); + if (rdma_find_cmid_laddr(local, ARPHRD_ETHER, + (void **) &real_cm_id)) { + free(local, M_SONAME); + goto err; + } + free(local, M_SONAME); + + real_cm_id->device->iwcm->newconn(real_cm_id, so); + } else { + listen_cm_id->device->iwcm->newconn(listen_cm_id, so); + } + } +err: + mutex_unlock(&dequeue_mutex); + kfree(work); +#endif + return; +} +static int +iw_so_upcall(struct socket *parent_so, void *arg, int waitflag) +{ + struct iwcm_listen_work *work; + struct socket *so; + struct iw_cm_id *cm_id = arg; + + mutex_lock(&dequeue_mutex); + /* check whether iw_so_event_handler() already dequeued this 'so' */ + so = TAILQ_FIRST(&parent_so->so_comp); + if (!so) + return SU_OK; + work = kzalloc(sizeof(*work), M_NOWAIT); + if (!work) + return -ENOMEM; + work->cm_id = cm_id; + + INIT_WORK(&work->work, iw_so_event_handler); + queue_work(iwcm_wq, &work->work); + + mutex_unlock(&dequeue_mutex); + return SU_OK; +} + +static void +iw_init_sock(struct iw_cm_id *cm_id) +{ + struct sockopt sopt; + struct socket *so = cm_id->so; + int on = 1; + + SOCK_LOCK(so); + soupcall_set(so, SO_RCV, iw_so_upcall, cm_id); + so->so_state |= SS_NBIO; + SOCK_UNLOCK(so); + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = IPPROTO_TCP; + sopt.sopt_name = TCP_NODELAY; + sopt.sopt_val = (caddr_t)&on; + sopt.sopt_valsize = sizeof(on); + sopt.sopt_td = NULL; + sosetopt(so, &sopt); +} + +static int +iw_close_socket(struct iw_cm_id *cm_id, int close) +{ + struct socket *so = cm_id->so; + int rc; + + + SOCK_LOCK(so); + soupcall_clear(so, SO_RCV); + SOCK_UNLOCK(so); + + if (close) + rc = soclose(so); + else + rc = soshutdown(so, SHUT_WR | SHUT_RD); + + cm_id->so = NULL; + + return rc; +} + +static int +iw_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + int rc; + + iw_init_sock(cm_id); + rc = solisten(cm_id->so, backlog, curthread); + if (rc != 0) + iw_close_socket(cm_id, 0); + return rc; +} + +static int +iw_destroy_listen(struct iw_cm_id *cm_id) +{ + int rc; + rc = iw_close_socket(cm_id, 0); + return rc; +} + + /* * CM_ID <-- DESTROYING * * Clean up all resources associated with the connection and release * the initial reference taken by iw_create_cm_id. */ static void destroy_cm_id(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; unsigned long flags; - int ret; + int ret = 0, refcnt; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); /* * Wait if we're currently in a connect or accept downcall. A * listening endpoint should never block here. */ wait_event(cm_id_priv->connect_wait, !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_LISTEN: cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - /* destroy the listening endpoint */ - ret = cm_id->device->iwcm->destroy_listen(cm_id); + if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) { + refcnt = + rem_port_from_listenlist(cm_id->local_addr.sin_port); + + if (refcnt == 0) + ret = iw_destroy_listen(cm_id); + + cm_id->device->iwcm->destroy_listen_ep(cm_id); + } else { + ret = iw_destroy_listen(cm_id); + cm_id->device->iwcm->destroy_listen_ep(cm_id); + } spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_ESTABLISHED: cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); /* Abrupt close of the connection */ (void)iwcm_modify_qp_err(cm_id_priv->qp); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_IDLE: case IW_CM_STATE_CLOSING: cm_id_priv->state = IW_CM_STATE_DESTROYING; break; case IW_CM_STATE_CONN_RECV: /* * App called destroy before/without calling accept after * receiving connection request event notification or * returned non zero from the event callback function. * In either case, must tell the provider to reject. */ cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_id->device->iwcm->reject(cm_id, NULL, 0); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_DESTROYING: default: BUG(); break; } if (cm_id_priv->qp) { cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); cm_id_priv->qp = NULL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); (void)iwcm_deref_id(cm_id_priv); } /* * This function is only called by the application thread and cannot * be called by the event thread. The function will wait for all * references to be released on the cm_id and then kfree the cm_id * object. */ void iw_destroy_cm_id(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags)); destroy_cm_id(cm_id); wait_for_completion(&cm_id_priv->destroy_comp); free_cm_id(cm_id_priv); } EXPORT_SYMBOL(iw_destroy_cm_id); /* * CM_ID <-- LISTEN * * Start listening for connect requests. Generates one CONNECT_REQUEST * event for each inbound connect request. */ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) { struct iwcm_id_private *cm_id_priv; unsigned long flags; - int ret; + int ret, refcnt; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); ret = alloc_work_entries(cm_id_priv, backlog); if (ret) return ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_IDLE: cm_id_priv->state = IW_CM_STATE_LISTEN; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = cm_id->device->iwcm->create_listen(cm_id, backlog); - if (ret) + + if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) { + refcnt = + add_port_to_listenlist(cm_id->local_addr.sin_port); + + if (refcnt == 1) { + ret = iw_create_listen(cm_id, backlog); + } else if (refcnt <= 0) { + ret = -EINVAL; + } else { + /* if refcnt > 1, a socket listener created + * already. And we need not create socket + * listener on other rdma devices/listen cm_id's + * due to TOE. That is when a socket listener is + * created with INADDR_ANY all registered TOE + * devices will get a call to start + * hardware listeners. + */ + } + } else { + ret = iw_create_listen(cm_id, backlog); + } + if (!ret) + cm_id->device->iwcm->create_listen_ep(cm_id, backlog); + else cm_id_priv->state = IW_CM_STATE_IDLE; + spin_lock_irqsave(&cm_id_priv->lock, flags); break; default: ret = -EINVAL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(iw_cm_listen); /* * CM_ID <-- IDLE * * Rejects an inbound connection request. No events are generated. */ int iw_cm_reject(struct iw_cm_id *cm_id, const void *private_data, u8 private_data_len) { struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id_priv->state = IW_CM_STATE_IDLE; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id->device->iwcm->reject(cm_id, private_data, private_data_len); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return ret; } EXPORT_SYMBOL(iw_cm_reject); /* * CM_ID <-- ESTABLISHED * * Accepts an inbound connection request and generates an ESTABLISHED * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block * until the ESTABLISHED event is received from the provider. */ int iw_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) { struct iwcm_id_private *cm_id_priv; struct ib_qp *qp; unsigned long flags; int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } /* Get the ib_qp given the QPN */ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); if (!qp) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id->device->iwcm->add_ref(qp); cm_id_priv->qp = qp; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id->device->iwcm->accept(cm_id, iw_param); if (ret) { /* An error on accept precludes provider events */ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); cm_id_priv->state = IW_CM_STATE_IDLE; spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->qp) { cm_id->device->iwcm->rem_ref(qp); cm_id_priv->qp = NULL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); } return ret; } EXPORT_SYMBOL(iw_cm_accept); /* * Active Side: CM_ID <-- CONN_SENT * * If successful, results in the generation of a CONNECT_REPLY * event. iw_cm_disconnect and iw_cm_destroy will block until the * CONNECT_REPLY event is received from the provider. */ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) { struct iwcm_id_private *cm_id_priv; int ret; unsigned long flags; struct ib_qp *qp; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); ret = alloc_work_entries(cm_id_priv, 4); if (ret) return ret; set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state != IW_CM_STATE_IDLE) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } /* Get the ib_qp given the QPN */ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); if (!qp) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id->device->iwcm->add_ref(qp); cm_id_priv->qp = qp; cm_id_priv->state = IW_CM_STATE_CONN_SENT; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id->device->iwcm->connect(cm_id, iw_param); if (ret) { spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->qp) { cm_id->device->iwcm->rem_ref(qp); cm_id_priv->qp = NULL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); cm_id_priv->state = IW_CM_STATE_IDLE; clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); } return ret; } EXPORT_SYMBOL(iw_cm_connect); /* * Passive Side: new CM_ID <-- CONN_RECV * * Handles an inbound connect request. The function creates a new * iw_cm_id to represent the new connection and inherits the client * callback function and other attributes from the listening parent. * * The work item contains a pointer to the listen_cm_id and the event. The * listen_cm_id contains the client cm_handler, context and * device. These are copied when the device is cloned. The event * contains the new four tuple. * * An error on the child should not affect the parent, so this * function does not return a value. */ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; struct iw_cm_id *cm_id; struct iwcm_id_private *cm_id_priv; int ret; /* * The provider should never generate a connection request * event with a bad status. */ BUG_ON(iw_event->status); cm_id = iw_create_cm_id(listen_id_priv->id.device, iw_event->so, listen_id_priv->id.cm_handler, listen_id_priv->id.context); /* If the cm_id could not be created, ignore the request */ if (IS_ERR(cm_id)) goto out; cm_id->provider_data = iw_event->provider_data; cm_id->local_addr = iw_event->local_addr; cm_id->remote_addr = iw_event->remote_addr; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); cm_id_priv->state = IW_CM_STATE_CONN_RECV; /* * We could be destroying the listening id. If so, ignore this * upcall. */ spin_lock_irqsave(&listen_id_priv->lock, flags); if (listen_id_priv->state != IW_CM_STATE_LISTEN) { spin_unlock_irqrestore(&listen_id_priv->lock, flags); iw_cm_reject(cm_id, NULL, 0); iw_destroy_cm_id(cm_id); goto out; } spin_unlock_irqrestore(&listen_id_priv->lock, flags); ret = alloc_work_entries(cm_id_priv, 3); if (ret) { iw_cm_reject(cm_id, NULL, 0); iw_destroy_cm_id(cm_id); goto out; } /* Call the client CM handler */ ret = cm_id->cm_handler(cm_id, iw_event); if (ret) { iw_cm_reject(cm_id, NULL, 0); set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); destroy_cm_id(cm_id); if (atomic_read(&cm_id_priv->refcount)==0) free_cm_id(cm_id_priv); } out: if (iw_event->private_data_len) kfree(iw_event->private_data); } /* * Passive Side: CM_ID <-- ESTABLISHED * * The provider generated an ESTABLISHED event which means that * the MPA negotion has completed successfully and we are now in MPA * FPDU mode. * * This event can only be received in the CONN_RECV state. If the * remote peer closed, the ESTABLISHED event would be received followed * by the CLOSE event. If the app closes, it will block until we wake * it up after processing this event. */ static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); /* * We clear the CONNECT_WAIT bit here to allow the callback * function to call iw_cm_disconnect. Calling iw_destroy_cm_id * from a callback handler is not allowed. */ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); cm_id_priv->state = IW_CM_STATE_ESTABLISHED; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); wake_up_all(&cm_id_priv->connect_wait); return ret; } /* * Active Side: CM_ID <-- ESTABLISHED * * The app has called connect and is waiting for the established event to * post it's requests to the server. This event will wake up anyone * blocked in iw_cm_disconnect or iw_destroy_id. */ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); /* * Clear the connect wait bit so a callback function calling * iw_cm_disconnect will not wait and deadlock this thread */ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); if (iw_event->status == 0) { cm_id_priv->id.local_addr = iw_event->local_addr; cm_id_priv->id.remote_addr = iw_event->remote_addr; cm_id_priv->state = IW_CM_STATE_ESTABLISHED; } else { /* REJECTED or RESET */ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); cm_id_priv->qp = NULL; cm_id_priv->state = IW_CM_STATE_IDLE; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); if (iw_event->private_data_len) kfree(iw_event->private_data); /* Wake up waiters on connect complete */ wake_up_all(&cm_id_priv->connect_wait); return ret; } /* * CM_ID <-- CLOSING * * If in the ESTABLISHED state, move to CLOSING. */ static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED) cm_id_priv->state = IW_CM_STATE_CLOSING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); } /* * CM_ID <-- IDLE * * If in the ESTBLISHED or CLOSING states, the QP will have have been * moved by the provider to the ERR state. Disassociate the CM_ID from * the QP, move to IDLE, and remove the 'connected' reference. * * If in some other state, the cm_id was destroyed asynchronously. * This is the last reference that will result in waking up * the app thread blocked in iw_destroy_cm_id. */ static int cm_close_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; int ret = 0; spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->qp) { cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); cm_id_priv->qp = NULL; } switch (cm_id_priv->state) { case IW_CM_STATE_ESTABLISHED: case IW_CM_STATE_CLOSING: cm_id_priv->state = IW_CM_STATE_IDLE; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_DESTROYING: break; default: BUG(); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int process_event(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { int ret = 0; switch (iw_event->event) { case IW_CM_EVENT_CONNECT_REQUEST: cm_conn_req_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_CONNECT_REPLY: ret = cm_conn_rep_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_ESTABLISHED: ret = cm_conn_est_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_DISCONNECT: cm_disconnect_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_CLOSE: ret = cm_close_handler(cm_id_priv, iw_event); break; default: BUG(); } return ret; } /* * Process events on the work_list for the cm_id. If the callback * function requests that the cm_id be deleted, a flag is set in the * cm_id flags to indicate that when the last reference is * removed, the cm_id is to be destroyed. This is necessary to * distinguish between an object that will be destroyed by the app * thread asleep on the destroy_comp list vs. an object destroyed * here synchronously when the last reference is removed. */ static void cm_work_handler(struct work_struct *_work) { struct iwcm_work *work = container_of(_work, struct iwcm_work, work); struct iw_cm_event levent; struct iwcm_id_private *cm_id_priv = work->cm_id; unsigned long flags; int empty; int ret = 0; int destroy_id; spin_lock_irqsave(&cm_id_priv->lock, flags); empty = list_empty(&cm_id_priv->work_list); while (!empty) { work = list_entry(cm_id_priv->work_list.next, struct iwcm_work, list); list_del_init(&work->list); empty = list_empty(&cm_id_priv->work_list); levent = work->event; put_work(work); spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = process_event(cm_id_priv, &levent); if (ret) { set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); destroy_cm_id(&cm_id_priv->id); } BUG_ON(atomic_read(&cm_id_priv->refcount)==0); destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); if (iwcm_deref_id(cm_id_priv)) { if (destroy_id) { BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); } return; } spin_lock_irqsave(&cm_id_priv->lock, flags); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); } /* * This function is called on interrupt context. Schedule events on * the iwcm_wq thread to allow callback functions to downcall into * the CM and/or block. Events are queued to a per-CM_ID * work_list. If this is the first event on the work_list, the work * element is also queued on the iwcm_wq thread. * * Each event holds a reference on the cm_id. Until the last posted * event has been delivered and processed, the cm_id cannot be * deleted. * * Returns: * 0 - the event was handled. * -ENOMEM - the event was not handled due to lack of resources. */ static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct iwcm_work *work; struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret = 0; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); work = get_work(cm_id_priv); if (!work) { ret = -ENOMEM; goto out; } INIT_WORK(&work->work, cm_work_handler); work->cm_id = cm_id_priv; work->event = *iw_event; if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST || work->event.event == IW_CM_EVENT_CONNECT_REPLY) && work->event.private_data_len) { ret = copy_private_data(&work->event); if (ret) { put_work(work); goto out; } } atomic_inc(&cm_id_priv->refcount); if (list_empty(&cm_id_priv->work_list)) { list_add_tail(&work->list, &cm_id_priv->work_list); queue_work(iwcm_wq, &work->work); } else list_add_tail(&work->list, &cm_id_priv->work_list); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_IDLE: case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_CONN_RECV: case IW_CM_STATE_ESTABLISHED: *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE| IB_ACCESS_REMOTE_READ; ret = 0; break; default: ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_IDLE: case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_CONN_RECV: case IW_CM_STATE_ESTABLISHED: *qp_attr_mask = 0; ret = 0; break; default: ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct iwcm_id_private *cm_id_priv; int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); switch (qp_attr->qp_state) { case IB_QPS_INIT: case IB_QPS_RTR: ret = iwcm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask); break; case IB_QPS_RTS: ret = iwcm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask); break; default: ret = -EINVAL; break; } return ret; } EXPORT_SYMBOL(iw_cm_init_qp_attr); static int __init iw_cm_init(void) { iwcm_wq = create_singlethread_workqueue("iw_cm_wq"); if (!iwcm_wq) return -ENOMEM; return 0; } static void __exit iw_cm_cleanup(void) { destroy_workqueue(iwcm_wq); } module_init(iw_cm_init); module_exit(iw_cm_cleanup); Index: head/sys/ofed/include/rdma/iw_cm.h =================================================================== --- head/sys/ofed/include/rdma/iw_cm.h (revision 294609) +++ head/sys/ofed/include/rdma/iw_cm.h (revision 294610) @@ -1,253 +1,257 @@ /* * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef IW_CM_H #define IW_CM_H #include #include struct iw_cm_id; enum iw_cm_event_type { IW_CM_EVENT_CONNECT_REQUEST = 1, /* connect request received */ IW_CM_EVENT_CONNECT_REPLY, /* reply from active connect request */ IW_CM_EVENT_ESTABLISHED, /* passive side accept successful */ IW_CM_EVENT_DISCONNECT, /* orderly shutdown */ IW_CM_EVENT_CLOSE /* close complete */ }; struct iw_cm_event { enum iw_cm_event_type event; int status; struct sockaddr_in local_addr; struct sockaddr_in remote_addr; void *private_data; void *provider_data; u8 private_data_len; struct socket *so; u8 ord; u8 ird; }; /** * iw_cm_handler - Function to be called by the IW CM when delivering events * to the client. * * @cm_id: The IW CM identifier associated with the event. * @event: Pointer to the event structure. */ typedef int (*iw_cm_handler)(struct iw_cm_id *cm_id, struct iw_cm_event *event); /** * iw_event_handler - Function called by the provider when delivering provider * events to the IW CM. Returns either 0 indicating the event was processed * or -errno if the event could not be processed. * * @cm_id: The IW CM identifier associated with the event. * @event: Pointer to the event structure. */ typedef int (*iw_event_handler)(struct iw_cm_id *cm_id, struct iw_cm_event *event); struct iw_cm_id { iw_cm_handler cm_handler; /* client callback function */ void *context; /* client cb context */ struct ib_device *device; struct sockaddr_in local_addr; struct sockaddr_in remote_addr; void *provider_data; /* provider private data */ iw_event_handler event_handler; /* cb for provider events */ /* Used by provider to add and remove refs on IW cm_id */ void (*add_ref)(struct iw_cm_id *); void (*rem_ref)(struct iw_cm_id *); struct socket *so; }; struct iw_cm_conn_param { const void *private_data; u16 private_data_len; u32 ord; u32 ird; u32 qpn; }; struct iw_cm_verbs { void (*add_ref)(struct ib_qp *qp); void (*rem_ref)(struct ib_qp *qp); struct ib_qp * (*get_qp)(struct ib_device *device, int qpn); int (*connect)(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int (*accept)(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int (*reject)(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); - int (*create_listen)(struct iw_cm_id *cm_id, + int (*create_listen_ep)(struct iw_cm_id *cm_id, int backlog); - int (*destroy_listen)(struct iw_cm_id *cm_id); + void (*destroy_listen_ep)(struct iw_cm_id *cm_id); + + void (*newconn)(struct iw_cm_id *parent_cm_id, + struct socket *so); }; /** * iw_create_cm_id - Create an IW CM identifier. * * @device: The IB device on which to create the IW CM identier. * @event_handler: User callback invoked to report events associated with the * returned IW CM identifier. * @context: User specified context associated with the id. */ struct iw_cm_id *iw_create_cm_id(struct ib_device *device, struct socket *so, iw_cm_handler cm_handler, void *context); /** * iw_destroy_cm_id - Destroy an IW CM identifier. * * @cm_id: The previously created IW CM identifier to destroy. * * The client can assume that no events will be delivered for the CM ID after * this function returns. */ void iw_destroy_cm_id(struct iw_cm_id *cm_id); /** * iw_cm_bind_qp - Unbind the specified IW CM identifier and QP * * @cm_id: The IW CM idenfier to unbind from the QP. * @qp: The QP * * This is called by the provider when destroying the QP to ensure * that any references held by the IWCM are released. It may also * be called by the IWCM when destroying a CM_ID to that any * references held by the provider are released. */ void iw_cm_unbind_qp(struct iw_cm_id *cm_id, struct ib_qp *qp); /** * iw_cm_get_qp - Return the ib_qp associated with a QPN * * @ib_device: The IB device * @qpn: The queue pair number */ struct ib_qp *iw_cm_get_qp(struct ib_device *device, int qpn); /** * iw_cm_listen - Listen for incoming connection requests on the * specified IW CM id. * * @cm_id: The IW CM identifier. * @backlog: The maximum number of outstanding un-accepted inbound listen * requests to queue. * * The source address and port number are specified in the IW CM identifier * structure. */ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog); /** * iw_cm_accept - Called to accept an incoming connect request. * * @cm_id: The IW CM identifier associated with the connection request. * @iw_param: Pointer to a structure containing connection establishment * parameters. * * The specified cm_id will have been provided in the event data for a * CONNECT_REQUEST event. Subsequent events related to this connection will be * delivered to the specified IW CM identifier prior and may occur prior to * the return of this function. If this function returns a non-zero value, the * client can assume that no events will be delivered to the specified IW CM * identifier. */ int iw_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param); /** * iw_cm_reject - Reject an incoming connection request. * * @cm_id: Connection identifier associated with the request. * @private_daa: Pointer to data to deliver to the remote peer as part of the * reject message. * @private_data_len: The number of bytes in the private_data parameter. * * The client can assume that no events will be delivered to the specified IW * CM identifier following the return of this function. The private_data * buffer is available for reuse when this function returns. */ int iw_cm_reject(struct iw_cm_id *cm_id, const void *private_data, u8 private_data_len); /** * iw_cm_connect - Called to request a connection to a remote peer. * * @cm_id: The IW CM identifier for the connection. * @iw_param: Pointer to a structure containing connection establishment * parameters. * * Events may be delivered to the specified IW CM identifier prior to the * return of this function. If this function returns a non-zero value, the * client can assume that no events will be delivered to the specified IW CM * identifier. */ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param); /** * iw_cm_disconnect - Close the specified connection. * * @cm_id: The IW CM identifier to close. * @abrupt: If 0, the connection will be closed gracefully, otherwise, the * connection will be reset. * * The IW CM identifier is still active until the IW_CM_EVENT_CLOSE event is * delivered. */ int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt); /** * iw_cm_init_qp_attr - Called to initialize the attributes of the QP * associated with a IW CM identifier. * * @cm_id: The IW CM identifier associated with the QP * @qp_attr: Pointer to the QP attributes structure. * @qp_attr_mask: Pointer to a bit vector specifying which QP attributes are * valid. */ int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr, int *qp_attr_mask); #endif /* IW_CM_H */ Index: head/sys/ofed/include/rdma/rdma_cm.h =================================================================== --- head/sys/ofed/include/rdma/rdma_cm.h (revision 294609) +++ head/sys/ofed/include/rdma/rdma_cm.h (revision 294610) @@ -1,404 +1,407 @@ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #if !defined(RDMA_CM_H) #define RDMA_CM_H #include #include #include #include /* * Upon receiving a device removal event, users must destroy the associated * RDMA identifier and release all resources allocated with the device. */ enum rdma_cm_event_type { RDMA_CM_EVENT_ADDR_RESOLVED, RDMA_CM_EVENT_ADDR_ERROR, RDMA_CM_EVENT_ROUTE_RESOLVED, RDMA_CM_EVENT_ROUTE_ERROR, RDMA_CM_EVENT_CONNECT_REQUEST, RDMA_CM_EVENT_CONNECT_RESPONSE, RDMA_CM_EVENT_CONNECT_ERROR, RDMA_CM_EVENT_UNREACHABLE, RDMA_CM_EVENT_REJECTED, RDMA_CM_EVENT_ESTABLISHED, RDMA_CM_EVENT_DISCONNECTED, RDMA_CM_EVENT_DEVICE_REMOVAL, RDMA_CM_EVENT_MULTICAST_JOIN, RDMA_CM_EVENT_MULTICAST_ERROR, RDMA_CM_EVENT_ADDR_CHANGE, RDMA_CM_EVENT_TIMEWAIT_EXIT, RDMA_CM_EVENT_ALT_ROUTE_RESOLVED, RDMA_CM_EVENT_ALT_ROUTE_ERROR, RDMA_CM_EVENT_LOAD_ALT_PATH, RDMA_CM_EVENT_ALT_PATH_LOADED, }; enum rdma_port_space { RDMA_PS_SDP = 0x0001, RDMA_PS_IPOIB = 0x0002, RDMA_PS_IB = 0x013F, RDMA_PS_TCP = 0x0106, RDMA_PS_UDP = 0x0111, }; enum alt_path_type { RDMA_ALT_PATH_NONE, RDMA_ALT_PATH_PORT, RDMA_ALT_PATH_LID, RDMA_ALT_PATH_BEST }; struct rdma_addr { struct sockaddr_storage src_addr; struct sockaddr_storage dst_addr; struct rdma_dev_addr dev_addr; }; struct rdma_route { struct rdma_addr addr; struct ib_sa_path_rec *path_rec; int num_paths; }; struct rdma_conn_param { const void *private_data; u8 private_data_len; u8 responder_resources; u8 initiator_depth; u8 flow_control; u8 retry_count; /* ignored when accepting */ u8 rnr_retry_count; /* Fields below ignored if a QP is created on the rdma_cm_id. */ u8 srq; u32 qp_num; }; struct rdma_ud_param { const void *private_data; u8 private_data_len; struct ib_ah_attr ah_attr; u32 qp_num; u32 qkey; u8 alt_path_index; }; struct rdma_cm_event { enum rdma_cm_event_type event; int status; union { struct rdma_conn_param conn; struct rdma_ud_param ud; } param; }; enum rdma_cm_state { RDMA_CM_IDLE, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT, RDMA_CM_DISCONNECT, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN, RDMA_CM_DEVICE_REMOVAL, RDMA_CM_DESTROYING }; struct rdma_cm_id; /** * rdma_cm_event_handler - Callback used to report user events. * * Notes: Users may not call rdma_destroy_id from this callback to destroy * the passed in id, or a corresponding listen id. Returning a * non-zero value from the callback will destroy the passed in id. */ typedef int (*rdma_cm_event_handler)(struct rdma_cm_id *id, struct rdma_cm_event *event); struct rdma_cm_id { struct ib_device *device; void *context; struct ib_qp *qp; rdma_cm_event_handler event_handler; struct rdma_route route; enum rdma_port_space ps; enum ib_qp_type qp_type; u8 port_num; void *ucontext; }; /** * rdma_create_id - Create an RDMA identifier. * * @event_handler: User callback invoked to report events associated with the * returned rdma_id. * @context: User specified context associated with the id. * @ps: RDMA port space. * @qp_type: type of queue pair associated with the id. */ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, void *context, enum rdma_port_space ps, enum ib_qp_type qp_type); /** * rdma_destroy_id - Destroys an RDMA identifier. * * @id: RDMA identifier. * * Note: calling this function has the effect of canceling in-flight * asynchronous operations associated with the id. */ void rdma_destroy_id(struct rdma_cm_id *id); /** * rdma_bind_addr - Bind an RDMA identifier to a source address and * associated RDMA device, if needed. * * @id: RDMA identifier. * @addr: Local address information. Wildcard values are permitted. * * This associates a source address with the RDMA identifier before calling * rdma_listen. If a specific local address is given, the RDMA identifier will * be bound to a local RDMA device. */ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr); /** * rdma_resolve_addr - Resolve destination and optional source addresses * from IP addresses to an RDMA address. If successful, the specified * rdma_cm_id will be bound to a local device. * * @id: RDMA identifier. * @src_addr: Source address information. This parameter may be NULL. * @dst_addr: Destination address information. * @timeout_ms: Time to wait for resolution to complete. */ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr, int timeout_ms); /** * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier * into route information needed to establish a connection. * * This is called on the client side of a connection. * Users must have first called rdma_resolve_addr to resolve a dst_addr * into an RDMA address before calling this routine. */ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); /** * rdma_enable_apm - Get ready to use APM for the given ID. * Actual Alternate path discovery and load will take place only * after a connection has been established. * * Calling this function only has an effect on the connection's client side. * It should be called after rdma_resolve_route and before rdma_connect. * * @id: RDMA identifier. * @alt_type: Alternate path type to resolve. */ int rdma_enable_apm(struct rdma_cm_id *id, enum alt_path_type alt_type); /** * rdma_create_qp - Allocate a QP and associate it with the specified RDMA * identifier. * * QPs allocated to an rdma_cm_id will automatically be transitioned by the CMA * through their states. */ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr); /** * rdma_destroy_qp - Deallocate the QP associated with the specified RDMA * identifier. * * Users must destroy any QP associated with an RDMA identifier before * destroying the RDMA ID. */ void rdma_destroy_qp(struct rdma_cm_id *id); /** * rdma_init_qp_attr - Initializes the QP attributes for use in transitioning * to a specified QP state. * @id: Communication identifier associated with the QP attributes to * initialize. * @qp_attr: On input, specifies the desired QP state. On output, the * mandatory and desired optional attributes will be set in order to * modify the QP to the specified state. * @qp_attr_mask: The QP attribute mask that may be used to transition the * QP to the specified state. * * Users must set the @qp_attr->qp_state to the desired QP state. This call * will set all required attributes for the given transition, along with * known optional attributes. Users may override the attributes returned from * this call before calling ib_modify_qp. * * Users that wish to have their QP automatically transitioned through its * states can associate a QP with the rdma_cm_id by calling rdma_create_qp(). */ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, int *qp_attr_mask); /** * rdma_connect - Initiate an active connection request. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * * Users must have resolved a route for the rdma_cm_id to connect with * by having called rdma_resolve_route before calling this routine. * * This call will either connect to a remote QP or obtain remote QP * information for unconnected rdma_cm_id's. The actual operation is * based on the rdma_cm_id's port space. */ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); /** * rdma_listen - This function is called by the passive side to * listen for incoming connection requests. * * Users must have bound the rdma_cm_id to a local address by calling * rdma_bind_addr before calling this routine. */ int rdma_listen(struct rdma_cm_id *id, int backlog); /** * rdma_accept - Called to accept a connection request or response. * @id: Connection identifier associated with the request. * @conn_param: Information needed to establish the connection. This must be * provided if accepting a connection request. If accepting a connection * response, this parameter must be NULL. * * Typically, this routine is only called by the listener to accept a connection * request. It must also be called on the active side of a connection if the * user is performing their own QP transitions. * * In the case of error, a reject message is sent to the remote side and the * state of the qp associated with the id is modified to error, such that any * previously posted receive buffers would be flushed. */ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); /** * rdma_notify - Notifies the RDMA CM of an asynchronous event that has * occurred on the connection. * @id: Connection identifier to transition to established. * @event: Asynchronous event. * * This routine should be invoked by users to notify the CM of relevant * communication events. Events that should be reported to the CM and * when to report them are: * * IB_EVENT_COMM_EST - Used when a message is received on a connected * QP before an RTU has been received. */ int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event); /** * rdma_reject - Called to reject a connection request or response. */ int rdma_reject(struct rdma_cm_id *id, const void *private_data, u8 private_data_len); /** * rdma_disconnect - This function disconnects the associated QP and * transitions it into the error state. */ int rdma_disconnect(struct rdma_cm_id *id); /** * rdma_join_multicast - Join the multicast group specified by the given * address. * @id: Communication identifier associated with the request. * @addr: Multicast address identifying the group to join. * @context: User-defined context associated with the join request, returned * to the user through the private_data pointer in multicast events. */ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, void *context); /** * rdma_leave_multicast - Leave the multicast group specified by the given * address. */ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr); /** * rdma_set_service_type - Set the type of service associated with a * connection identifier. * @id: Communication identifier to associated with service type. * @tos: Type of service. * * The type of service is interpretted as a differentiated service * field (RFC 2474). The service type should be specified before * performing route resolution, as existing communication on the * connection identifier may be unaffected. The type of service * requested may not be supported by the network to all destinations. */ void rdma_set_service_type(struct rdma_cm_id *id, int tos); /** * rdma_set_reuseaddr - Allow the reuse of local addresses when binding * the rdma_cm_id. * @id: Communication identifier to configure. * @reuse: Value indicating if the bound address is reusable. * * Reuse must be set before an address is bound to the id. */ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse); /** * rdma_set_afonly - Specify that listens are restricted to the * bound address family only. * @id: Communication identifer to configure. * @afonly: Value indicating if listens are restricted. * * Must be set before identifier is in the listening state. */ int rdma_set_afonly(struct rdma_cm_id *id, int afonly); /** * rdma_set_timeout - Set the QP timeout associated with a connection * identifier. * @id: Communication identifier to associated with service type. * @timeout: QP timeout */ void rdma_set_timeout(struct rdma_cm_id *id, int timeout); - +int rdma_cma_any_addr(struct sockaddr *addr); +int rdma_find_cmid_laddr(struct sockaddr_in *local_addr, + unsigned short dev_type, void **cm_id); #endif /* RDMA_CM_H */