Index: stable/10/sys/dev/cxgbe/iw_cxgbe/provider.c =================================================================== --- stable/10/sys/dev/cxgbe/iw_cxgbe/provider.c (revision 325610) +++ stable/10/sys/dev/cxgbe/iw_cxgbe/provider.c (revision 325611) @@ -1,507 +1,526 @@ /* * Copyright (c) 2009-2013, 2016 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include __FBSDID("$FreeBSD$"); #define LINUXKPI_PARAM_PREFIX iw_cxgbe_ #include "opt_inet.h" #ifdef TCP_OFFLOAD #include #include #include #include #include "iw_cxgbe.h" #include "user.h" static int fastreg_support = 1; module_param(fastreg_support, int, 0644); MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default = 1)"); static int c4iw_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask, struct ib_port_modify *props) { return -ENOSYS; } static struct ib_ah *c4iw_ah_create(struct ib_pd *pd, struct ib_ah_attr *ah_attr) { return ERR_PTR(-ENOSYS); } static int c4iw_ah_destroy(struct ib_ah *ah) { return -ENOSYS; } static int c4iw_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { return -ENOSYS; } static int c4iw_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { return -ENOSYS; } static int c4iw_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad) { return -ENOSYS; } static int c4iw_dealloc_ucontext(struct ib_ucontext *context) { struct c4iw_dev *rhp = to_c4iw_dev(context->device); struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context); struct c4iw_mm_entry *mm, *tmp; CTR2(KTR_IW_CXGBE, "%s context %p", __func__, context); list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry) kfree(mm); c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx); kfree(ucontext); return 0; } static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct c4iw_ucontext *context; struct c4iw_dev *rhp = to_c4iw_dev(ibdev); CTR2(KTR_IW_CXGBE, "%s ibdev %p", __func__, ibdev); context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx); INIT_LIST_HEAD(&context->mmaps); spin_lock_init(&context->mmap_lock); return &context->ibucontext; } #ifdef DOT5 static inline pgprot_t t4_pgprot_wc(pgprot_t prot) { 
return pgprot_writecombine(prot); } #endif static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { int len = vma->vm_end - vma->vm_start; u32 key = vma->vm_pgoff << PAGE_SHIFT; struct c4iw_rdev *rdev; int ret = 0; struct c4iw_mm_entry *mm; struct c4iw_ucontext *ucontext; u64 addr, paddr; u64 va_regs_res = 0, va_udbs_res = 0; u64 len_regs_res = 0, len_udbs_res = 0; CTR3(KTR_IW_CXGBE, "%s:1 ctx %p vma %p", __func__, context, vma); CTR4(KTR_IW_CXGBE, "%s:1a pgoff 0x%lx key 0x%x len %d", __func__, vma->vm_pgoff, key, len); if (vma->vm_start & (PAGE_SIZE-1)) { CTR3(KTR_IW_CXGBE, "%s:2 unaligned vm_start %u vma %p", __func__, vma->vm_start, vma); return -EINVAL; } rdev = &(to_c4iw_dev(context->device)->rdev); ucontext = to_c4iw_ucontext(context); mm = remove_mmap(ucontext, key, len); if (!mm) { CTR4(KTR_IW_CXGBE, "%s:3 ucontext %p key %u len %u", __func__, ucontext, key, len); return -EINVAL; } addr = mm->addr; kfree(mm); va_regs_res = (u64)rman_get_virtual(rdev->adap->regs_res); len_regs_res = (u64)rman_get_size(rdev->adap->regs_res); va_udbs_res = (u64)rman_get_virtual(rdev->adap->udbs_res); len_udbs_res = (u64)rman_get_size(rdev->adap->udbs_res); CTR6(KTR_IW_CXGBE, "%s:4 addr %p, masync region %p:%p, udb region %p:%p", __func__, addr, va_regs_res, va_regs_res+len_regs_res, va_udbs_res, va_udbs_res+len_udbs_res); if (addr >= va_regs_res && addr < va_regs_res + len_regs_res) { CTR4(KTR_IW_CXGBE, "%s:5 MA_SYNC addr %p region %p, reglen %u", __func__, addr, va_regs_res, len_regs_res); /* * MA_SYNC register... */ paddr = vtophys(addr); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); ret = io_remap_pfn_range(vma, vma->vm_start, paddr >> PAGE_SHIFT, len, vma->vm_page_prot); } else { if (addr >= va_udbs_res && addr < va_udbs_res + len_udbs_res) { /* * Map user DB or OCQP memory... 
*/ paddr = vtophys(addr); CTR4(KTR_IW_CXGBE, "%s:6 USER DB-GTS addr %p region %p, reglen %u", __func__, addr, va_udbs_res, len_udbs_res); #ifdef DOT5 if (!is_t4(rdev->lldi.adapter_type) && map_udb_as_wc) vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot); else #endif vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); ret = io_remap_pfn_range(vma, vma->vm_start, paddr >> PAGE_SHIFT, len, vma->vm_page_prot); } else { /* * Map WQ or CQ contig dma memory... */ CTR4(KTR_IW_CXGBE, "%s:7 WQ/CQ addr %p vm_start %u vma %p", __func__, addr, vma->vm_start, vma); ret = io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, len, vma->vm_page_prot); } } CTR4(KTR_IW_CXGBE, "%s:8 ctx %p vma %p ret %u", __func__, context, vma, ret); return ret; } static int c4iw_deallocate_pd(struct ib_pd *pd) { struct c4iw_pd *php = to_c4iw_pd(pd); struct c4iw_dev *rhp = php->rhp; CTR3(KTR_IW_CXGBE, "%s: pd %p, pdid 0x%x", __func__, pd, php->pdid); c4iw_put_resource(&rhp->rdev.resource.pdid_table, php->pdid); mutex_lock(&rhp->rdev.stats.lock); rhp->rdev.stats.pd.cur--; mutex_unlock(&rhp->rdev.stats.lock); kfree(php); return (0); } static struct ib_pd * c4iw_allocate_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct c4iw_pd *php; u32 pdid; struct c4iw_dev *rhp; CTR4(KTR_IW_CXGBE, "%s: ibdev %p, context %p, data %p", __func__, ibdev, context, udata); rhp = (struct c4iw_dev *) ibdev; pdid = c4iw_get_resource(&rhp->rdev.resource.pdid_table); if (!pdid) return ERR_PTR(-EINVAL); php = kzalloc(sizeof(*php), GFP_KERNEL); if (!php) { c4iw_put_resource(&rhp->rdev.resource.pdid_table, pdid); return ERR_PTR(-ENOMEM); } php->pdid = pdid; php->rhp = rhp; if (context) { if (ib_copy_to_udata(udata, &php->pdid, sizeof(u32))) { c4iw_deallocate_pd(&php->ibpd); return ERR_PTR(-EFAULT); } } mutex_lock(&rhp->rdev.stats.lock); rhp->rdev.stats.pd.cur++; if (rhp->rdev.stats.pd.cur > rhp->rdev.stats.pd.max) rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur; 
mutex_unlock(&rhp->rdev.stats.lock); CTR6(KTR_IW_CXGBE, "%s: ibdev %p, context %p, data %p, pddid 0x%x, pd %p", __func__, ibdev, context, udata, pdid, php); return (&php->ibpd); } static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { CTR5(KTR_IW_CXGBE, "%s ibdev %p, port %d, index %d, pkey %p", __func__, ibdev, port, index, pkey); *pkey = 0; return (0); } static int c4iw_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct c4iw_dev *dev; struct port_info *pi; struct adapter *sc; CTR5(KTR_IW_CXGBE, "%s ibdev %p, port %d, index %d, gid %p", __func__, ibdev, port, index, gid); memset(&gid->raw[0], 0, sizeof(gid->raw)); dev = to_c4iw_dev(ibdev); sc = dev->rdev.adap; if (port == 0 || port > sc->params.nports) return (-EINVAL); pi = sc->port[port - 1]; memcpy(&gid->raw[0], pi->vi[0].hw_addr, ETHER_ADDR_LEN); return (0); } static int c4iw_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { struct c4iw_dev *dev = to_c4iw_dev(ibdev); struct adapter *sc = dev->rdev.adap; const int spg_ndesc = sc->params.sge.spg_len / EQ_ESIZE; CTR3(KTR_IW_CXGBE, "%s ibdev %p, props %p", __func__, ibdev, props); memset(props, 0, sizeof *props); memcpy(&props->sys_image_guid, sc->port[0]->vi[0].hw_addr, ETHER_ADDR_LEN); props->hw_ver = sc->params.chipid; props->fw_ver = sc->params.fw_vers; props->device_cap_flags = dev->device_cap_flags; props->page_size_cap = T4_PAGESIZE_MASK; props->vendor_id = pci_get_vendor(sc->dev); props->vendor_part_id = pci_get_device(sc->dev); props->max_mr_size = T4_MAX_MR_SIZE; props->max_qp = sc->vres.qp.size / 2; props->max_qp_wr = T4_MAX_QP_DEPTH(spg_ndesc); props->max_sge = T4_MAX_RECV_SGE; props->max_sge_rd = 1; props->max_res_rd_atom = sc->params.max_ird_adapter; props->max_qp_rd_atom = min(sc->params.max_ordird_qp, c4iw_max_read_depth); props->max_qp_init_rd_atom = props->max_qp_rd_atom; props->max_cq = sc->vres.qp.size; props->max_cqe = T4_MAX_CQ_DEPTH; props->max_mr = 
c4iw_num_stags(&dev->rdev); props->max_pd = T4_MAX_NUM_PD; props->local_ca_ack_delay = 0; props->max_fast_reg_page_list_len = T4_MAX_FR_DEPTH; return (0); } /* * Returns -errno on failure. */ static int c4iw_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { struct c4iw_dev *dev; struct adapter *sc; struct port_info *pi; struct ifnet *ifp; CTR4(KTR_IW_CXGBE, "%s ibdev %p, port %d, props %p", __func__, ibdev, port, props); dev = to_c4iw_dev(ibdev); sc = dev->rdev.adap; if (port > sc->params.nports) return (-EINVAL); pi = sc->port[port - 1]; ifp = pi->vi[0].ifp; memset(props, 0, sizeof(struct ib_port_attr)); props->max_mtu = IB_MTU_4096; if (ifp->if_mtu >= 4096) props->active_mtu = IB_MTU_4096; else if (ifp->if_mtu >= 2048) props->active_mtu = IB_MTU_2048; else if (ifp->if_mtu >= 1024) props->active_mtu = IB_MTU_1024; else if (ifp->if_mtu >= 512) props->active_mtu = IB_MTU_512; else props->active_mtu = IB_MTU_256; props->state = pi->link_cfg.link_ok ? IB_PORT_ACTIVE : IB_PORT_DOWN; props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_REINIT_SUP | IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; props->gid_tbl_len = 1; props->pkey_tbl_len = 1; props->active_width = 2; props->active_speed = 2; props->max_msg_sz = -1; return 0; } +static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + /* * Returns -errno on error. 
*/ int c4iw_register_device(struct c4iw_dev *dev) { struct adapter *sc = dev->rdev.adap; struct ib_device *ibdev = &dev->ibdev; struct iw_cm_verbs *iwcm; int ret; CTR3(KTR_IW_CXGBE, "%s c4iw_dev %p, adapter %p", __func__, dev, sc); BUG_ON(!sc->port[0]); strlcpy(ibdev->name, device_get_nameunit(sc->dev), sizeof(ibdev->name)); memset(&ibdev->node_guid, 0, sizeof(ibdev->node_guid)); memcpy(&ibdev->node_guid, sc->port[0]->vi[0].hw_addr, ETHER_ADDR_LEN); ibdev->owner = THIS_MODULE; dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW; if (fastreg_support) dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; ibdev->local_dma_lkey = 0; ibdev->uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_POLL_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_POST_SEND) | (1ull << IB_USER_VERBS_CMD_POST_RECV); ibdev->node_type = RDMA_NODE_RNIC; strlcpy(ibdev->node_desc, C4IW_NODE_DESC, sizeof(ibdev->node_desc)); ibdev->phys_port_cnt = sc->params.nports; ibdev->num_comp_vectors = 1; ibdev->dma_device = NULL; ibdev->query_device = c4iw_query_device; ibdev->query_port = c4iw_query_port; ibdev->modify_port = c4iw_modify_port; ibdev->query_pkey = c4iw_query_pkey; ibdev->query_gid = c4iw_query_gid; ibdev->alloc_ucontext = c4iw_alloc_ucontext; ibdev->dealloc_ucontext = c4iw_dealloc_ucontext; ibdev->mmap = c4iw_mmap; ibdev->alloc_pd = c4iw_allocate_pd; ibdev->dealloc_pd = 
c4iw_deallocate_pd; ibdev->create_ah = c4iw_ah_create; ibdev->destroy_ah = c4iw_ah_destroy; ibdev->create_qp = c4iw_create_qp; ibdev->modify_qp = c4iw_ib_modify_qp; ibdev->query_qp = c4iw_ib_query_qp; ibdev->destroy_qp = c4iw_destroy_qp; ibdev->create_cq = c4iw_create_cq; ibdev->destroy_cq = c4iw_destroy_cq; ibdev->resize_cq = c4iw_resize_cq; ibdev->poll_cq = c4iw_poll_cq; ibdev->get_dma_mr = c4iw_get_dma_mr; ibdev->reg_phys_mr = c4iw_register_phys_mem; ibdev->rereg_phys_mr = c4iw_reregister_phys_mem; ibdev->reg_user_mr = c4iw_reg_user_mr; ibdev->dereg_mr = c4iw_dereg_mr; ibdev->alloc_mw = c4iw_alloc_mw; ibdev->bind_mw = c4iw_bind_mw; ibdev->dealloc_mw = c4iw_dealloc_mw; ibdev->alloc_fast_reg_mr = c4iw_alloc_fast_reg_mr; ibdev->alloc_fast_reg_page_list = c4iw_alloc_fastreg_pbl; ibdev->free_fast_reg_page_list = c4iw_free_fastreg_pbl; ibdev->attach_mcast = c4iw_multicast_attach; ibdev->detach_mcast = c4iw_multicast_detach; ibdev->process_mad = c4iw_process_mad; ibdev->req_notify_cq = c4iw_arm_cq; ibdev->post_send = c4iw_post_send; ibdev->post_recv = c4iw_post_receive; ibdev->uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; + ibdev->get_port_immutable = c4iw_port_immutable; iwcm = kmalloc(sizeof(*iwcm), GFP_KERNEL); if (iwcm == NULL) return (-ENOMEM); iwcm->connect = c4iw_connect; iwcm->accept = c4iw_accept_cr; iwcm->reject = c4iw_reject_cr; iwcm->create_listen_ep = c4iw_create_listen_ep; iwcm->destroy_listen_ep = c4iw_destroy_listen_ep; iwcm->newconn = process_newconn; iwcm->add_ref = c4iw_qp_add_ref; iwcm->rem_ref = c4iw_qp_rem_ref; iwcm->get_qp = c4iw_get_qp; ibdev->iwcm = iwcm; ret = ib_register_device(&dev->ibdev, NULL); if (ret) kfree(iwcm); return (ret); } void c4iw_unregister_device(struct c4iw_dev *dev) { CTR3(KTR_IW_CXGBE, "%s c4iw_dev %p, adapter %p", __func__, dev, dev->rdev.adap); ib_unregister_device(&dev->ibdev); kfree(dev->ibdev.iwcm); return; } #endif Index: stable/10/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c 
=================================================================== --- stable/10/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c (revision 325610) +++ stable/10/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c (revision 325611) @@ -1,2290 +1,2342 @@ /*- * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
 *
 * $FreeBSD$
 */

/*
 * NOTE(review): the bare "#include" directives below have no header name;
 * the <...> operands appear to have been lost when this diff was extracted.
 * The same applies to the e-mail address in MODULE_AUTHOR().
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#undef inode
#include
#include
#include
#include "user.h"
#include "mlx5_ib.h"

#include

#define DRIVER_NAME "mlx5_ib"
#define DRIVER_VERSION "3.2-rc1"
#define DRIVER_RELDATE	"May 2016"

#undef MODULE_VERSION
#include

MODULE_AUTHOR("Eli Cohen ");
MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DEPEND(mlx5ib, mlx5, 1, 1, 1);
MODULE_DEPEND(mlx5ib, ibcore, 1, 1, 1);
MODULE_VERSION(mlx5ib, 1);

/* Kept only as a module parameter; its description marks it deprecated. */
static int deprecated_prof_sel = 2;
module_param_named(prof_sel, deprecated_prof_sel, int, 0444);
MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core");

enum {
	MLX5_STANDARD_ATOMIC_SIZE = 0x8,
};

struct workqueue_struct *mlx5_ib_wq;

static char mlx5_version[] =
	DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
	DRIVER_VERSION " (" DRIVER_RELDATE ")\n";

/*
 * Set props->atomic_cap and props->masked_atomic_cap.
 *
 * Each is IB_ATOMIC_HCA only when both required operations (compare-swap
 * and fetch-add, plain or masked) are reported, bit 8 of atomic_size_qp is
 * set, and either the device reports atomic_req_8B_endianess_mode or the
 * host is not little-endian; otherwise it is IB_ATOMIC_NONE.
 */
static void get_atomic_caps(struct mlx5_ib_dev *dev,
			    struct ib_device_attr *props)
{
	int tmp;
	u8 atomic_operations;
	u8 atomic_size_qp;
	u8 atomic_req_endianess;

	atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
	atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
	atomic_req_endianess = MLX5_CAP_ATOMIC(dev->mdev,
					       atomic_req_8B_endianess_mode) ||
			       !mlx5_host_is_le();

	tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
	if (((atomic_operations & tmp) == tmp)
	    && (atomic_size_qp & 8)) {
		if (atomic_req_endianess) {
			props->atomic_cap = IB_ATOMIC_HCA;
		} else {
			props->atomic_cap = IB_ATOMIC_NONE;
		}
	} else {
		props->atomic_cap = IB_ATOMIC_NONE;
	}

	tmp = MLX5_ATOMIC_OPS_MASKED_CMP_SWAP | MLX5_ATOMIC_OPS_MASKED_FETCH_ADD;
	if (((atomic_operations & tmp) == tmp)
	    &&(atomic_size_qp & 8)) {
		if (atomic_req_endianess)
			props->masked_atomic_cap = IB_ATOMIC_HCA;
		else {
			props->masked_atomic_cap = IB_ATOMIC_NONE;
		}
	} else {
		props->masked_atomic_cap = IB_ATOMIC_NONE;
	}
}

/*
 * Map the device's port_type capability to an RDMA link layer.
 * port_num is not consulted: the capability is read per device.
 */
static enum rdma_link_layer
mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
{
	struct mlx5_ib_dev *dev = to_mdev(device);

	switch (MLX5_CAP_GEN(dev->mdev, port_type)) {
	case MLX5_CAP_PORT_TYPE_IB:
		return IB_LINK_LAYER_INFINIBAND;
	case MLX5_CAP_PORT_TYPE_ETH:
		return IB_LINK_LAYER_ETHERNET;
	default:
		return IB_LINK_LAYER_UNSPECIFIED;
	}
}

/* Non-zero when the MAD interface must be used, i.e. when issi is 0. */
static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
{
	return !dev->mdev->issi;
}

/* How vport attributes are queried; see mlx5_get_vport_access_method(). */
enum {
	MLX5_VPORT_ACCESS_METHOD_MAD,
	MLX5_VPORT_ACCESS_METHOD_HCA,
	MLX5_VPORT_ACCESS_METHOD_NIC,
};

/*
 * Select the vport access method: MAD when mlx5_use_mad_ifc() says so,
 * otherwise NIC when port 1's link layer is Ethernet, otherwise HCA.
 */
static int mlx5_get_vport_access_method(struct ib_device *ibdev)
{
	if (mlx5_use_mad_ifc(to_mdev(ibdev)))
		return MLX5_VPORT_ACCESS_METHOD_MAD;

	if (mlx5_ib_port_link_layer(ibdev, 1) ==
	    IB_LINK_LAYER_ETHERNET)
		return MLX5_VPORT_ACCESS_METHOD_NIC;

	return MLX5_VPORT_ACCESS_METHOD_HCA;
}

/*
 * Query the system image GUID through the selected access method.  For the
 * HCA and NIC methods the value is converted with cpu_to_be64() and stored
 * only on success.  Returns 0 or the query error; -EINVAL for an unknown
 * method.
 */
static int mlx5_query_system_image_guid(struct ib_device *ibdev,
					__be64 *sys_image_guid)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;
	u64 tmp;
	int err;

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_system_image_guid_mad_ifc(ibdev,
							    sys_image_guid);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
		err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
		if (!err)
			*sys_image_guid = cpu_to_be64(tmp);
		return err;

	case MLX5_VPORT_ACCESS_METHOD_NIC:
		err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
		if (!err)
			*sys_image_guid = cpu_to_be64(tmp);
		return err;

	default:
		return -EINVAL;
	}
}

/*
 * Query the maximum number of P_Keys.  The HCA and NIC methods derive it
 * from the pkey_table_size capability.  Returns 0 or an error; -EINVAL for
 * an unknown method.
 */
static int mlx5_query_max_pkeys(struct ib_device *ibdev,
				u16 *max_pkeys)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_max_pkeys_mad_ifc(ibdev, max_pkeys);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
	case MLX5_VPORT_ACCESS_METHOD_NIC:
		*max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
						pkey_table_size));
		return 0;

	default:
		return -EINVAL;
	}
}

/*
 * Query the vendor id through the selected access method.
 * Returns 0 or the query error; -EINVAL for an unknown method.
 */
static int mlx5_query_vendor_id(struct ib_device *ibdev,
				u32 *vendor_id)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_vendor_id_mad_ifc(ibdev, vendor_id);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
	case MLX5_VPORT_ACCESS_METHOD_NIC:
		return mlx5_core_query_vendor_id(dev->mdev, vendor_id);

	default:
		return -EINVAL;
	}
}

/*
 * Query the node GUID through the selected access method.  For the HCA and
 * NIC methods the value is converted with cpu_to_be64() and stored only on
 * success.  Returns 0 or the query error; -EINVAL for an unknown method.
 */
static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
				__be64 *node_guid)
{
	u64 tmp;
	int err;

	switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_node_guid_mad_ifc(dev, node_guid);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
		err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
		if (!err)
			*node_guid = cpu_to_be64(tmp);
		return err;

	case MLX5_VPORT_ACCESS_METHOD_NIC:
		err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
		if (!err)
			*node_guid = cpu_to_be64(tmp);
		return err;

	default:
		return -EINVAL;
	}
}

/* Buffer layout used with the MLX5_REG_NODE_DESC register (64 bytes). */
struct mlx5_reg_node_desc {
	u8	desc[64];
};

/*
 * Read the node description into node_desc, either through the MAD
 * interface or by reading MLX5_REG_NODE_DESC with a zeroed input buffer.
 * node_desc must have room for sizeof(struct mlx5_reg_node_desc) bytes.
 */
static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
{
	struct mlx5_reg_node_desc in;

	if (mlx5_use_mad_ifc(dev))
		return mlx5_query_node_desc_mad_ifc(dev, node_desc);

	memset(&in, 0, sizeof(in));

	return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
				    sizeof(struct mlx5_reg_node_desc),
				    MLX5_REG_NODE_DESC, 0, 0);
}

/*
 * Fill *props with the device attributes.  props is zeroed first; the GUID,
 * max P_Keys and vendor id are queried (any failure is returned at once)
 * and the remaining fields are derived from the device capabilities.
 * Returns 0 on success.
 */
static int mlx5_ib_query_device(struct ib_device *ibdev,
				struct ib_device_attr *props)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;
	int max_sq_desc;
	int max_rq_sg;
	int max_sq_sg;
	int err;


	memset(props, 0, sizeof(*props));

	err = mlx5_query_system_image_guid(ibdev,
					   &props->sys_image_guid);
	if (err)
		return err;

	err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
	if (err)
		return err;

	err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
	if (err)
		return err;

	/* Firmware version packed as major << 32 | minor << 16 | sub. */
	props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
		((u64)fw_rev_min(dev->mdev) << 16) |
		fw_rev_sub(dev->mdev);
	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
		IB_DEVICE_PORT_ACTIVE_EVENT		|
		IB_DEVICE_SYS_IMAGE_GUID		|
		IB_DEVICE_RC_RNR_NAK_GEN;

	if (MLX5_CAP_GEN(mdev, pkv))
		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
	if (MLX5_CAP_GEN(mdev, qkv))
		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
	if (MLX5_CAP_GEN(mdev, apm))
		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
	props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;
	if (MLX5_CAP_GEN(mdev, xrc))
		props->device_cap_flags |= IB_DEVICE_XRC;
	props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
	if (MLX5_CAP_GEN(mdev, block_lb_mc))
		props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;

	props->vendor_part_id	   = mdev->pdev->device;
	props->hw_ver		   = mdev->pdev->revision;

	props->max_mr_size	   = ~0ull;
	props->page_size_cap	   = ~(u32)((1ull << MLX5_CAP_GEN(mdev, log_pg_sz)) -1);
	props->max_qp		   = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
	props->max_qp_wr	   = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
	/* max_sge is the smaller of the receive and send scatter limits. */
	max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
		     sizeof(struct mlx5_wqe_data_seg);
	/* The send WQE size considered here is capped at 512. */
	max_sq_desc = min((int)MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512);
	max_sq_sg = (max_sq_desc -
		     sizeof(struct mlx5_wqe_ctrl_seg) -
		     sizeof(struct mlx5_wqe_raddr_seg)) / sizeof(struct mlx5_wqe_data_seg);
	props->max_sge = min(max_rq_sg, max_sq_sg);
	props->max_cq		   = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
	props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
	props->max_mr		   = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
	props->max_pd		   = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
	props->max_qp_rd_atom	   = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
	props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
	props->max_srq		   = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
	props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
	props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
	props->max_srq_sge	   = max_rq_sg - 1;
	props->max_fast_reg_page_list_len = (unsigned int)-1;
	get_atomic_caps(dev, props);
	props->max_mcast_grp	   = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
	props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
					   props->max_mcast_grp;
	props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
	props->max_ah		= INT_MAX;

	return 0;
}

/* Link-width bits as tested by translate_active_width(). */
enum mlx5_ib_width {
	MLX5_IB_WIDTH_1X	= 1 << 0,
	MLX5_IB_WIDTH_2X	= 1 << 1,
	MLX5_IB_WIDTH_4X	= 1 << 2,
	MLX5_IB_WIDTH_8X	= 1 << 3,
	MLX5_IB_WIDTH_12X	= 1 << 4
};

/*
 * Translate a width bit mask into an IB_WIDTH_* value.  Bits are tested
 * from 1X upwards and the first one set wins.  2X has no IB equivalent and
 * yields -EINVAL, as does a mask with none of the known bits; *ib_width is
 * left untouched in both cases.
 */
static int translate_active_width(struct ib_device *ibdev, u8 active_width,
				  u8 *ib_width)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	int err = 0;

	if (active_width & MLX5_IB_WIDTH_1X) {
		*ib_width = IB_WIDTH_1X;
	} else if (active_width & MLX5_IB_WIDTH_2X) {
		mlx5_ib_warn(dev, "active_width %d is not supported by IB spec\n",
			     (int)active_width);
		err = -EINVAL;
	} else if (active_width & MLX5_IB_WIDTH_4X) {
		*ib_width = IB_WIDTH_4X;
	} else if (active_width & MLX5_IB_WIDTH_8X) {
		*ib_width = IB_WIDTH_8X;
	} else if (active_width & MLX5_IB_WIDTH_12X) {
		*ib_width = IB_WIDTH_12X;
	} else {
		mlx5_ib_dbg(dev, "Invalid active_width %d\n",
			    (int)active_width);
		err = -EINVAL;
	}

	return err;
}

/*
 * TODO: Move to IB core
 */
enum ib_max_vl_num {
	__IB_MAX_VL_0		= 1,
	__IB_MAX_VL_0_1		= 2,
	__IB_MAX_VL_0_3		= 3,
	__IB_MAX_VL_0_7		= 4,
	__IB_MAX_VL_0_14	= 5,
};

/* vl_hw_cap values as consumed by translate_max_vl_num(). */
enum mlx5_vl_hw_cap {
	MLX5_VL_HW_0	= 1,
	MLX5_VL_HW_0_1	= 2,
	MLX5_VL_HW_0_2	= 3,
	MLX5_VL_HW_0_3	= 4,
	MLX5_VL_HW_0_4	= 5,
	MLX5_VL_HW_0_5	= 6,
	MLX5_VL_HW_0_6	= 7,
	MLX5_VL_HW_0_7	= 8,
	MLX5_VL_HW_0_14	= 15
};

/*
 * Translate a vl_hw_cap value into an ib_max_vl_num value.  Only the
 * 0, 0-1, 0-3, 0-7 and 0-14 capabilities have a translation; any other
 * value returns -EINVAL and leaves *max_vl_num untouched.
 */
static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
				u8 *max_vl_num)
{
	switch (vl_hw_cap) {
	case MLX5_VL_HW_0:
		*max_vl_num = __IB_MAX_VL_0;
		break;
	case MLX5_VL_HW_0_1:
		*max_vl_num = __IB_MAX_VL_0_1;
		break;
	case MLX5_VL_HW_0_3:
		*max_vl_num = __IB_MAX_VL_0_3;
		break;
	case MLX5_VL_HW_0_7:
		*max_vl_num = __IB_MAX_VL_0_7;
		break;
	case MLX5_VL_HW_0_14:
		*max_vl_num = __IB_MAX_VL_0_14;
		break;

	default:
		return -EINVAL;
	}

	return 0;
}

/*
 * Query port attributes for the HCA access method.
 *
 * Reads the HCA vport context, then the PTYS (width/speed), PMTU (MTU) and
 * PVLC (virtual lanes) registers, filling *props along the way.  props is
 * zeroed only after the three buffers have been allocated.  All buffers are
 * released at "out" on every path.
 *
 * Returns 0, -ENOMEM, or the first error from a query/translation step;
 * on error *props may be partially filled.
 */
static int mlx5_query_port_ib(struct ib_device *ibdev, u8 port,
			      struct ib_port_attr *props)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;
	u32 *rep;
	int outlen = MLX5_ST_SZ_BYTES(query_hca_vport_context_out);
	struct mlx5_ptys_reg *ptys;
	struct mlx5_pmtu_reg *pmtu;
	struct mlx5_pvlc_reg pvlc;
	void *ctx;
	int err;

	/* Allocate everything first; one check covers all three. */
	rep = mlx5_vzalloc(outlen);
	ptys = kzalloc(sizeof(*ptys), GFP_KERNEL);
	pmtu = kzalloc(sizeof(*pmtu), GFP_KERNEL);
	if (!rep || !ptys || !pmtu) {
		err = -ENOMEM;
		goto out;
	}

	memset(props, 0, sizeof(*props));

	/* what if I am pf with dual port */
	err = mlx5_query_hca_vport_context(mdev, port, 0, rep, outlen);
	if (err)
		goto out;

	ctx = MLX5_ADDR_OF(query_hca_vport_context_out, rep, hca_vport_context);

	props->lid		= MLX5_GET(hca_vport_context, ctx, lid);
	props->lmc		= MLX5_GET(hca_vport_context, ctx, lmc);
	props->sm_lid		= MLX5_GET(hca_vport_context, ctx, sm_lid);
	props->sm_sl		= MLX5_GET(hca_vport_context, ctx, sm_sl);
	props->state		= MLX5_GET(hca_vport_context, ctx,
					   vport_state);
	props->phys_state	= MLX5_GET(hca_vport_context, ctx,
					port_physical_state);
	props->port_cap_flags	= MLX5_GET(hca_vport_context, ctx, cap_mask1);
	props->gid_tbl_len	= mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
	props->max_msg_sz	= 1 << MLX5_CAP_GEN(mdev, log_max_msg);
	props->pkey_tbl_len	= mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
	props->bad_pkey_cntr	= MLX5_GET(hca_vport_context, ctx,
					      pkey_violation_counter);
	props->qkey_viol_cntr	= MLX5_GET(hca_vport_context, ctx,
					      qkey_violation_counter);
	props->subnet_timeout	= MLX5_GET(hca_vport_context, ctx,
					      subnet_timeout);
	props->init_type_reply	= MLX5_GET(hca_vport_context, ctx,
					   init_type_reply);

	/* Link width and speed come from the PTYS register. */
	ptys->proto_mask |= MLX5_PTYS_IB;
	ptys->local_port = port;
	err = mlx5_core_access_ptys(mdev, ptys, 0);
	if (err)
		goto out;

	err = translate_active_width(ibdev, ptys->ib_link_width_oper,
				     &props->active_width);
	if (err)
		goto out;

	props->active_speed	= (u8)ptys->ib_proto_oper;

	/* MTU limits come from the PMTU register. */
	pmtu->local_port = port;
	err = mlx5_core_access_pmtu(mdev, pmtu, 0);
	if (err)
		goto out;

	props->max_mtu		= pmtu->max_mtu;
	props->active_mtu	= pmtu->oper_mtu;

	/* Virtual lane capability comes from the PVLC register. */
	memset(&pvlc, 0, sizeof(pvlc));
	pvlc.local_port = port;
	err = mlx5_core_access_pvlc(mdev, &pvlc, 0);
	if (err)
		goto out;

	err = translate_max_vl_num(ibdev, pvlc.vl_hw_cap,
				   &props->max_vl_num);
out:
	kvfree(rep);
	kfree(ptys);
	kfree(pmtu);
	return err;
}

/*
 * Query port attributes, dispatching on the vport access method.
 * Returns the result of the selected helper; -EINVAL for an unknown method.
 */
int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
		       struct ib_port_attr *props)
{
	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_port_mad_ifc(ibdev, port, props);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
		return mlx5_query_port_ib(ibdev, port, props);

	case MLX5_VPORT_ACCESS_METHOD_NIC:
		return mlx5_query_port_roce(ibdev, port, props);

	default:
		return -EINVAL;
	}
}

/*
 * Build an 8-byte interface identifier in eui[0..7] from the link-layer
 * address of dev: bytes 0-2 and 5-7 come from the address, bytes 3-4 hold
 * the VLAN id when it is below 0x1000 and 0xFF,0xFE otherwise, and bit 1 of
 * byte 0 is flipped.  Nothing is written when the address length is not
 * ETH_ALEN.
 */
static void
mlx5_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev)
{
	if (dev->if_addrlen != ETH_ALEN)
		return;

	memcpy(eui, IF_LLADDR(dev), 3);
	memcpy(eui + 5, IF_LLADDR(dev) + 3, 3);

	if (vlan_id < 0x1000) {
		eui[3] = vlan_id >> 8;
		eui[4] = vlan_id & 0xff;
	} else {
		eui[3] = 0xFF;
		eui[4] = 0xFE;
	}
	eui[0] ^= 2;
}

/*
 * Build the default GID for dev: the fe80::/64 prefix plus the interface
 * identifier computed with vlan_id 0xFFFF (i.e. the 0xFF,0xFE form).
 * Bytes 8-15 are only written when dev has an ETH_ALEN address.
 */
static void
mlx5_make_default_gid(struct net_device *dev, union ib_gid *gid)
{
	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
	mlx5_addrconf_ifid_eui48(&gid->raw[8], 0xFFFF, dev);
}

/*
 * Kernel thread body that keeps the RoCE GID table of one port in sync
 * with the network interfaces.
 *
 * While port->port_gone is 0 it repeats, pausing hz ticks per iteration:
 *   1. look up the Ethernet protocol device; retry later if there is none;
 *   2. entry 0 becomes the default GID of that device;
 *   3. under IFNET_RLOCK, add one MAC/VLAN based GID for the device itself
 *      and for every IFT_L2VLAN interface whose real device it is, skipping
 *      duplicates and stopping at MLX5_IB_GID_MAX entries;
 *   4. if anything changed and the link layer is Ethernet, program the used
 *      entries, clear the stale ones and dispatch IB_EVENT_GID_CHANGE.
 *
 * Once port_gone becomes non-zero a final GID-change event is dispatched,
 * port_gone is set to 2 and the thread exits.
 */
static void
mlx5_ib_roce_port_update(void *arg)
{
	struct mlx5_ib_port *port = (struct mlx5_ib_port *)arg;
	struct mlx5_ib_dev *dev = port->dev;
	struct mlx5_core_dev *mdev = dev->mdev;
	struct net_device *xdev[MLX5_IB_GID_MAX];
	struct net_device *idev;
	struct net_device *ndev;
	union ib_gid gid_temp;

	while (port->port_gone == 0) {
		int update = 0;
		int gid_index = 0;
		int j;
		int error;

		ndev = mlx5_get_protocol_dev(mdev, MLX5_INTERFACE_PROTOCOL_ETH);
		if (ndev == NULL) {
			pause("W", hz);
			continue;
		}

		CURVNET_SET_QUIET(ndev->if_vnet);

		/* Entry 0 is always the default GID of the Ethernet device. */
		memset(&gid_temp, 0, sizeof(gid_temp));
		mlx5_make_default_gid(ndev, &gid_temp);
		if (bcmp(&gid_temp, &port->gid_table[gid_index], sizeof(gid_temp))) {
			port->gid_table[gid_index] = gid_temp;
			update = 1;
		}
		xdev[gid_index] = ndev;
		gid_index++;

		IFNET_RLOCK();
		/* First pass: confirm ndev is on this vnet's interface list. */
		TAILQ_FOREACH(idev, &V_ifnet, if_link) {
			if (idev == ndev)
				break;
		}
		if (idev != NULL) {
		    TAILQ_FOREACH(idev, &V_ifnet, if_link) {
			u16 vid;

			/* Consider ndev itself and VLANs stacked on it. */
			if (idev != ndev) {
				if (idev->if_type != IFT_L2VLAN)
					continue;
				if (ndev != rdma_vlan_dev_real_dev(idev))
					continue;
			}

			/* setup valid MAC-based GID */
			memset(&gid_temp, 0, sizeof(gid_temp));
			gid_temp.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
			vid = rdma_vlan_dev_vlan_id(idev);
			mlx5_addrconf_ifid_eui48(&gid_temp.raw[8], vid, idev);

			/* check for existing entry */
			for (j = 0; j != gid_index; j++) {
				if (bcmp(&gid_temp, &port->gid_table[j], sizeof(gid_temp)) == 0)
					break;
			}

			/* check if new entry should be added */
			if (j == gid_index && gid_index < MLX5_IB_GID_MAX) {
				if (bcmp(&gid_temp, &port->gid_table[gid_index], sizeof(gid_temp))) {
					port->gid_table[gid_index] = gid_temp;
					update = 1;
				}
				xdev[gid_index] = idev;
				gid_index++;
			}
		    }
		}
		IFNET_RUNLOCK();
		CURVNET_RESTORE();

		if (update != 0 &&
		    mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) {
			struct ib_event event = {
			    .device = &dev->ib_dev,
			    .element.port_num = port->port_num + 1,
			    .event = IB_EVENT_GID_CHANGE,
			};

			/* add new entries, if any */
			for (j = 0; j != gid_index; j++) {
				error = modify_gid_roce(&dev->ib_dev, port->port_num, j,
				    port->gid_table + j, xdev[j]);
				if (error != 0)
					printf("mlx5_ib: Failed to update ROCE GID table: %d\n", error);
			}
			memset(&gid_temp, 0, sizeof(gid_temp));

			/* clear old entries, if any */
			for (; j != MLX5_IB_GID_MAX; j++) {
				if (bcmp(&gid_temp, port->gid_table + j, sizeof(gid_temp)) == 0)
					continue;
				port->gid_table[j] = gid_temp;
				(void) modify_gid_roce(&dev->ib_dev, port->port_num, j,
				    port->gid_table + j, ndev);
			}

			/* make sure ibcore gets updated */
			ib_dispatch_event(&event);
		}
		pause("W", hz);
	}
	/* Shutdown path: one last notification before the thread exits. */
	do {
		struct ib_event event = {
			.device = &dev->ib_dev,
			.element.port_num = port->port_num + 1,
			.event = IB_EVENT_GID_CHANGE,
		};
		/* make sure ibcore gets updated */
		ib_dispatch_event(&event);

		/* wait a bit */
		pause("W", hz);
	} while (0);
	/* NOTE(review): 2 presumably tells the waiter the thread is done. */
	port->port_gone = 2;
	kthread_exit();
}

/*
 * Read one GID table entry.  For the NIC method the entry comes from the
 * driver's own table; an out-of-range port or index, or a port whose update
 * thread is going away, yields an all-zero GID with a 0 return rather than
 * an error.  Returns -EINVAL only for an unknown access method.
 */
static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
			     union ib_gid *gid)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_gids_mad_ifc(ibdev, port, index, gid);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
		return mlx5_query_hca_vport_gid(mdev, port, 0, index, gid);

	case MLX5_VPORT_ACCESS_METHOD_NIC:
		if (port == 0 || port > MLX5_CAP_GEN(mdev, num_ports) ||
		    index < 0 || index >= MLX5_IB_GID_MAX ||
		    dev->port[port - 1].port_gone != 0)
			memset(gid, 0, sizeof(*gid));
		else
			*gid = dev->port[port - 1].gid_table[index];
		return 0;

	default:
		return -EINVAL;
	}
}

/*
 * Read one P_Key table entry through the selected access method.
 * Returns 0 or the query error; -EINVAL for an unknown method.
 */
static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
			      u16 *pkey)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_pkey_mad_ifc(ibdev, port, index, pkey);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
	case MLX5_VPORT_ACCESS_METHOD_NIC:
		return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index,
						 pkey);

	default:
		return -EINVAL;
	}
}

/*
 * Modify device attributes.  Only IB_DEVICE_MODIFY_NODE_DESC is supported:
 * any other mask bit returns -EOPNOTSUPP and an empty mask returns 0.
 * The new description is written to MLX5_REG_NODE_DESC and, on success,
 * copied into ibdev->node_desc.
 */
static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
				 struct ib_device_modify *props)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_reg_node_desc in;
	struct mlx5_reg_node_desc out;
	int err;

	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
		return -EOPNOTSUPP;

	if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
		return 0;

	/*
	 * If possible, pass node desc to FW, so it can generate
	 * a 144 trap.
	 * NOTE(review): this comment used to say a failing command is
	 * ignored, but the code below returns the error and leaves
	 * ibdev->node_desc unchanged.
	 */
	memcpy(&in, props->node_desc, 64);
	err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
				   sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
	if (err)
		return err;

	memcpy(ibdev->node_desc, props->node_desc, 64);

	return err;
}

/*
 * Modify the capability mask of a port.  For Ethernet ports this is a no-op
 * returning 0.  Otherwise the current capability flags are read, the set
 * and clear masks applied and the result written back, all under
 * cap_mask_mutex.  Returns 0 or the first error encountered.
 */
static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
			       struct ib_port_modify *props)
{
	u8 is_eth = (mlx5_ib_port_link_layer(ibdev, port) ==
		     IB_LINK_LAYER_ETHERNET);
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct ib_port_attr attr;
	u32 tmp;
	int err;

	/* return OK if this is RoCE. CM calls ib_modify_port() regardless
	 * of whether port link layer is ETH or IB. For ETH ports, qkey
	 * violations and port capabilities are not valid.
	 */
	if (is_eth)
		return 0;

	mutex_lock(&dev->cap_mask_mutex);

	err = mlx5_ib_query_port(ibdev, port, &attr);
	if (err)
		goto out;

	/* Set bits are applied first, so a bit in both masks ends cleared. */
	tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
		~props->clr_port_cap_mask;

	err = mlx5_set_port_caps(dev->mdev, port, tmp);

out:
	mutex_unlock(&dev->cap_mask_mutex);
	return err;
}

enum mlx5_cap_flags {
	MLX5_CAP_COMPACT_AV = 1 << 0,
};

static void set_mlx5_flags(u32 *flags, struct mlx5_core_dev *dev)
{
	*flags |= MLX5_CAP_GEN(dev, compact_address_vector) ?
MLX5_CAP_COMPACT_AV : 0; } static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_alloc_ucontext_req_v2 req; struct mlx5_ib_alloc_ucontext_resp resp; struct mlx5_ib_ucontext *context; struct mlx5_uuar_info *uuari; struct mlx5_uar *uars; int gross_uuars; int num_uars; int ver; int uuarn; int err; int i; size_t reqlen; if (!dev->ib_active) return ERR_PTR(-EAGAIN); memset(&req, 0, sizeof(req)); memset(&resp, 0, sizeof(resp)); reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) ver = 0; else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) ver = 2; else { mlx5_ib_err(dev, "request malformed, reqlen: %ld\n", (long)reqlen); return ERR_PTR(-EINVAL); } err = ib_copy_from_udata(&req, udata, reqlen); if (err) { mlx5_ib_err(dev, "copy failed\n"); return ERR_PTR(err); } if (req.reserved) { mlx5_ib_err(dev, "request corrupted\n"); return ERR_PTR(-EINVAL); } if (req.total_num_uuars == 0 || req.total_num_uuars > MLX5_MAX_UUARS) { mlx5_ib_warn(dev, "wrong num_uuars: %d\n", req.total_num_uuars); return ERR_PTR(-ENOMEM); } req.total_num_uuars = ALIGN(req.total_num_uuars, MLX5_NON_FP_BF_REGS_PER_PAGE); if (req.num_low_latency_uuars > req.total_num_uuars - 1) { mlx5_ib_warn(dev, "wrong num_low_latency_uuars: %d ( > %d)\n", req.total_num_uuars, req.total_num_uuars); return ERR_PTR(-EINVAL); } num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); resp.cache_line_size = L1_CACHE_BYTES; resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, 
log_max_qp_sz); resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); set_mlx5_flags(&resp.flags, dev->mdev); if (offsetof(struct mlx5_ib_alloc_ucontext_resp, max_desc_sz_sq_dc) < udata->outlen) resp.max_desc_sz_sq_dc = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq_dc); if (offsetof(struct mlx5_ib_alloc_ucontext_resp, atomic_arg_sizes_dc) < udata->outlen) resp.atomic_arg_sizes_dc = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); uuari = &context->uuari; mutex_init(&uuari->lock); uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL); if (!uars) { err = -ENOMEM; goto out_ctx; } uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars), sizeof(*uuari->bitmap), GFP_KERNEL); if (!uuari->bitmap) { err = -ENOMEM; goto out_uar_ctx; } /* * clear all fast path uuars */ for (i = 0; i < gross_uuars; i++) { uuarn = i & 3; if (uuarn == 2 || uuarn == 3) set_bit(i, uuari->bitmap); } uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL); if (!uuari->count) { err = -ENOMEM; goto out_bitmap; } for (i = 0; i < num_uars; i++) { err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index); if (err) { mlx5_ib_err(dev, "uar alloc failed at %d\n", i); goto out_uars; } } for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++) context->dynamic_wc_uar_index[i] = MLX5_IB_INVALID_UAR_INDEX; INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); resp.tot_uuars = req.total_num_uuars; resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports); err = ib_copy_to_udata(udata, &resp, min_t(size_t, udata->outlen, sizeof(resp))); if (err) goto out_uars; uuari->ver = ver; uuari->num_low_latency_uuars = req.num_low_latency_uuars; uuari->uars = uars; uuari->num_uars = num_uars; if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) { err = mlx5_alloc_transport_domain(dev->mdev, &context->tdn); if (err) goto out_uars; } return 
&context->ibucontext; out_uars: for (i--; i >= 0; i--) mlx5_cmd_free_uar(dev->mdev, uars[i].index); kfree(uuari->count); out_bitmap: kfree(uuari->bitmap); out_uar_ctx: kfree(uars); out_ctx: kfree(context); return ERR_PTR(err); } static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_uuar_info *uuari = &context->uuari; int i; if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) mlx5_dealloc_transport_domain(dev->mdev, context->tdn); for (i = 0; i < uuari->num_uars; i++) { if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index)) mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); } for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++) { if (context->dynamic_wc_uar_index[i] != MLX5_IB_INVALID_UAR_INDEX) mlx5_cmd_free_uar(dev->mdev, context->dynamic_wc_uar_index[i]); } kfree(uuari->count); kfree(uuari->bitmap); kfree(uuari->uars); kfree(context); return 0; } static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index) { return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index; } static int get_command(unsigned long offset) { return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK; } static int get_arg(unsigned long offset) { return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1); } static int get_index(unsigned long offset) { return get_arg(offset); } static int uar_mmap(struct vm_area_struct *vma, pgprot_t prot, bool is_wc, struct mlx5_uuar_info *uuari, struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) { unsigned long idx; phys_addr_t pfn; if (vma->vm_end - vma->vm_start != PAGE_SIZE) { mlx5_ib_warn(dev, "wrong size, expected PAGE_SIZE(%ld) got %ld\n", (long)PAGE_SIZE, (long)(vma->vm_end - vma->vm_start)); return -EINVAL; } idx = get_index(vma->vm_pgoff); if (idx >= uuari->num_uars) { mlx5_ib_warn(dev, "wrong offset, idx:%ld num_uars:%d\n", idx, 
uuari->num_uars); return -EINVAL; } pfn = uar_index2pfn(dev, uuari->uars[idx].index); mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx, (unsigned long long)pfn); vma->vm_page_prot = prot; if (io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, vma->vm_page_prot)) { mlx5_ib_err(dev, "io remap failed\n"); return -EAGAIN; } mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA 0x%llx\n", is_wc ? "WC" : "NC", (long)vma->vm_start, (unsigned long long)pfn << PAGE_SHIFT); return 0; } static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_uuar_info *uuari = &context->uuari; unsigned long command; command = get_command(vma->vm_pgoff); switch (command) { case MLX5_IB_MMAP_REGULAR_PAGE: return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot), true, uuari, dev, context); break; case MLX5_IB_MMAP_WC_PAGE: return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot), true, uuari, dev, context); break; case MLX5_IB_MMAP_NC_PAGE: return uar_mmap(vma, pgprot_noncached(vma->vm_page_prot), false, uuari, dev, context); break; default: return -EINVAL; } return 0; } static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn) { struct mlx5_create_mkey_mbox_in *in; struct mlx5_mkey_seg *seg; struct mlx5_core_mr mr; int err; in = kzalloc(sizeof(*in), GFP_KERNEL); if (!in) return -ENOMEM; seg = &in->seg; seg->flags = MLX5_PERM_LOCAL_READ | MLX5_ACCESS_MODE_PA; seg->flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64); seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); seg->start_addr = 0; err = mlx5_core_create_mkey(dev->mdev, &mr, in, sizeof(*in), NULL, NULL, NULL); if (err) { mlx5_ib_warn(dev, "failed to create mkey, %d\n", err); goto err_in; } kfree(in); *key = mr.key; return 0; err_in: kfree(in); return err; } static void free_pa_mkey(struct mlx5_ib_dev *dev, u32 key) { struct mlx5_core_mr mr; int err; memset(&mr, 0, sizeof(mr)); mr.key = 
key; err = mlx5_core_destroy_mkey(dev->mdev, &mr); if (err) mlx5_ib_warn(dev, "failed to destroy mkey 0x%x\n", key); } static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_alloc_pd_resp resp; struct mlx5_ib_pd *pd; int err; pd = kmalloc(sizeof(*pd), GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn); if (err) { mlx5_ib_warn(dev, "pd alloc failed\n"); kfree(pd); return ERR_PTR(err); } if (context) { resp.pdn = pd->pdn; if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { mlx5_ib_err(dev, "copy failed\n"); mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); kfree(pd); return ERR_PTR(-EFAULT); } } else { err = alloc_pa_mkey(to_mdev(ibdev), &pd->pa_lkey, pd->pdn); if (err) { mlx5_ib_err(dev, "alloc mkey failed\n"); mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); kfree(pd); return ERR_PTR(err); } } return &pd->ibpd; } static int mlx5_ib_dealloc_pd(struct ib_pd *pd) { struct mlx5_ib_dev *mdev = to_mdev(pd->device); struct mlx5_ib_pd *mpd = to_mpd(pd); if (!pd->uobject) free_pa_mkey(mdev, mpd->pa_lkey); mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn); kfree(mpd); return 0; } static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); int err; if (ibqp->qp_type == IB_QPT_RAW_PACKET) err = -EOPNOTSUPP; else err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); if (err) mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); return err; } static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); int err; if (ibqp->qp_type == IB_QPT_RAW_PACKET) err = -EOPNOTSUPP; else err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); if (err) mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); return err; } 
static int init_node_data(struct mlx5_ib_dev *dev) { int err; err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc); if (err) return err; return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid); } static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%lld\n", (long long)dev->mdev->priv.fw_pages); } static ssize_t show_reg_pages(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); } static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); } static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%d.%d.%04d\n", fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); } static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", (unsigned)dev->mdev->pdev->revision); } static ssize_t show_board(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, dev->mdev->board_id); } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); static 
DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); static struct device_attribute *mlx5_class_attributes[] = { &dev_attr_hw_rev, &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id, &dev_attr_fw_pages, &dev_attr_reg_pages, }; static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) { struct mlx5_ib_qp *mqp; struct mlx5_ib_cq *send_mcq, *recv_mcq; struct mlx5_core_cq *mcq; struct list_head cq_armed_list; unsigned long flags_qp; unsigned long flags_cq; unsigned long flags; mlx5_ib_warn(ibdev, " started\n"); INIT_LIST_HEAD(&cq_armed_list); /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { spin_lock_irqsave(&mqp->sq.lock, flags_qp); if (mqp->sq.tail != mqp->sq.head) { send_mcq = to_mcq(mqp->ibqp.send_cq); spin_lock_irqsave(&send_mcq->lock, flags_cq); if (send_mcq->mcq.comp && mqp->ibqp.send_cq->comp_handler) { if (!send_mcq->mcq.reset_notify_added) { send_mcq->mcq.reset_notify_added = 1; list_add_tail(&send_mcq->mcq.reset_notify, &cq_armed_list); } } spin_unlock_irqrestore(&send_mcq->lock, flags_cq); } spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); spin_lock_irqsave(&mqp->rq.lock, flags_qp); /* no handling is needed for SRQ */ if (!mqp->ibqp.srq) { if (mqp->rq.tail != mqp->rq.head) { recv_mcq = to_mcq(mqp->ibqp.recv_cq); spin_lock_irqsave(&recv_mcq->lock, flags_cq); if (recv_mcq->mcq.comp && mqp->ibqp.recv_cq->comp_handler) { if (!recv_mcq->mcq.reset_notify_added) { recv_mcq->mcq.reset_notify_added = 1; list_add_tail(&recv_mcq->mcq.reset_notify, &cq_armed_list); } } spin_unlock_irqrestore(&recv_mcq->lock, flags_cq); } } spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); } /*At that point all inflight post send were put to be executed as of we * lock/unlock above locks Now need to arm all involved CQs. 
*/ list_for_each_entry(mcq, &cq_armed_list, reset_notify) { mcq->comp(mcq); } spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); mlx5_ib_warn(ibdev, " ended\n"); return; } static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, enum mlx5_dev_event event, unsigned long param) { struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context; struct ib_event ibev; u8 port = 0; switch (event) { case MLX5_DEV_EVENT_SYS_ERROR: ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; mlx5_ib_handle_internal_error(ibdev); break; case MLX5_DEV_EVENT_PORT_UP: ibev.event = IB_EVENT_PORT_ACTIVE; port = (u8)param; break; case MLX5_DEV_EVENT_PORT_DOWN: case MLX5_DEV_EVENT_PORT_INITIALIZED: ibev.event = IB_EVENT_PORT_ERR; port = (u8)param; break; case MLX5_DEV_EVENT_LID_CHANGE: ibev.event = IB_EVENT_LID_CHANGE; port = (u8)param; break; case MLX5_DEV_EVENT_PKEY_CHANGE: ibev.event = IB_EVENT_PKEY_CHANGE; port = (u8)param; break; case MLX5_DEV_EVENT_GUID_CHANGE: ibev.event = IB_EVENT_GID_CHANGE; port = (u8)param; break; case MLX5_DEV_EVENT_CLIENT_REREG: ibev.event = IB_EVENT_CLIENT_REREGISTER; port = (u8)param; break; default: break; } ibev.device = &ibdev->ib_dev; ibev.element.port_num = port; if ((event != MLX5_DEV_EVENT_SYS_ERROR) && (port < 1 || port > ibdev->num_ports)) { mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); return; } if (ibdev->ib_active) ib_dispatch_event(&ibev); } static void get_ext_port_caps(struct mlx5_ib_dev *dev) { int port; for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) mlx5_query_ext_port_caps(dev, port); } static void config_atomic_responder(struct mlx5_ib_dev *dev, struct ib_device_attr *props) { enum ib_atomic_cap cap = props->atomic_cap; #if 0 if (cap == IB_ATOMIC_HCA || cap == IB_ATOMIC_GLOB) #endif dev->enable_atomic_resp = 1; dev->atomic_cap = cap; } enum mlx5_addr_align { MLX5_ADDR_ALIGN_0 = 0, MLX5_ADDR_ALIGN_64 = 64, MLX5_ADDR_ALIGN_128 = 128, }; static int get_port_caps(struct mlx5_ib_dev 
*dev) { struct ib_device_attr *dprops = NULL; struct ib_port_attr *pprops = NULL; int err = -ENOMEM; int port; pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); if (!pprops) goto out; dprops = kmalloc(sizeof(*dprops), GFP_KERNEL); if (!dprops) goto out; err = mlx5_ib_query_device(&dev->ib_dev, dprops); if (err) { mlx5_ib_warn(dev, "query_device failed %d\n", err); goto out; } config_atomic_responder(dev, dprops); for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); if (err) { mlx5_ib_warn(dev, "query_port %d failed %d\n", port, err); break; } dev->mdev->port_caps[port - 1].pkey_table_len = dprops->max_pkeys; dev->mdev->port_caps[port - 1].gid_table_len = pprops->gid_tbl_len; mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", dprops->max_pkeys, pprops->gid_tbl_len); } out: kfree(pprops); kfree(dprops); return err; } static void destroy_umrc_res(struct mlx5_ib_dev *dev) { int err; err = mlx5_mr_cache_cleanup(dev); if (err) mlx5_ib_warn(dev, "mr cache cleanup failed\n"); ib_dereg_mr(dev->umrc.mr); ib_dealloc_pd(dev->umrc.pd); } enum { MAX_UMR_WR = 128, }; static int create_umr_res(struct mlx5_ib_dev *dev) { struct ib_pd *pd; struct ib_mr *mr; int ret; pd = ib_alloc_pd(&dev->ib_dev); if (IS_ERR(pd)) { mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); ret = PTR_ERR(pd); goto error_0; } mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(mr)) { mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n"); ret = PTR_ERR(mr); goto error_1; } dev->umrc.mr = mr; dev->umrc.pd = pd; ret = mlx5_mr_cache_init(dev); if (ret) { mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); goto error_4; } return 0; error_4: ib_dereg_mr(mr); error_1: ib_dealloc_pd(pd); error_0: return ret; } static int create_dev_resources(struct mlx5_ib_resources *devr) { struct ib_srq_init_attr attr; struct mlx5_ib_dev *dev; int ret = 0; dev = container_of(devr, struct mlx5_ib_dev, devr); devr->p0 = 
mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->p0)) { ret = PTR_ERR(devr->p0); goto error0; } devr->p0->device = &dev->ib_dev; devr->p0->uobject = NULL; atomic_set(&devr->p0->usecnt, 0); devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, 1, 0, NULL, NULL); if (IS_ERR(devr->c0)) { ret = PTR_ERR(devr->c0); goto error1; } devr->c0->device = &dev->ib_dev; devr->c0->uobject = NULL; devr->c0->comp_handler = NULL; devr->c0->event_handler = NULL; devr->c0->cq_context = NULL; atomic_set(&devr->c0->usecnt, 0); devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->x0)) { ret = PTR_ERR(devr->x0); goto error2; } devr->x0->device = &dev->ib_dev; devr->x0->inode = NULL; atomic_set(&devr->x0->usecnt, 0); mutex_init(&devr->x0->tgt_qp_mutex); INIT_LIST_HEAD(&devr->x0->tgt_qp_list); devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->x1)) { ret = PTR_ERR(devr->x1); goto error3; } devr->x1->device = &dev->ib_dev; devr->x1->inode = NULL; atomic_set(&devr->x1->usecnt, 0); mutex_init(&devr->x1->tgt_qp_mutex); INIT_LIST_HEAD(&devr->x1->tgt_qp_list); memset(&attr, 0, sizeof(attr)); attr.attr.max_sge = 1; attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_XRC; attr.ext.xrc.cq = devr->c0; attr.ext.xrc.xrcd = devr->x0; devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); if (IS_ERR(devr->s0)) { ret = PTR_ERR(devr->s0); goto error4; } devr->s0->device = &dev->ib_dev; devr->s0->pd = devr->p0; devr->s0->uobject = NULL; devr->s0->event_handler = NULL; devr->s0->srq_context = NULL; devr->s0->srq_type = IB_SRQT_XRC; devr->s0->ext.xrc.xrcd = devr->x0; devr->s0->ext.xrc.cq = devr->c0; atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); atomic_inc(&devr->s0->ext.xrc.cq->usecnt); atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s0->usecnt, 0); memset(&attr, 0, sizeof(attr)); attr.attr.max_sge = 1; attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_BASIC; devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL); if (IS_ERR(devr->s1)) { ret = PTR_ERR(devr->s1); goto error5; } 
devr->s1->device = &dev->ib_dev; devr->s1->pd = devr->p0; devr->s1->uobject = NULL; devr->s1->event_handler = NULL; devr->s1->srq_context = NULL; devr->s1->srq_type = IB_SRQT_BASIC; devr->s1->ext.xrc.cq = devr->c0; atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s1->usecnt, 0); return 0; error5: mlx5_ib_destroy_srq(devr->s0); error4: mlx5_ib_dealloc_xrcd(devr->x1); error3: mlx5_ib_dealloc_xrcd(devr->x0); error2: mlx5_ib_destroy_cq(devr->c0); error1: mlx5_ib_dealloc_pd(devr->p0); error0: return ret; } static void destroy_dev_resources(struct mlx5_ib_resources *devr) { mlx5_ib_destroy_srq(devr->s1); mlx5_ib_destroy_srq(devr->s0); mlx5_ib_dealloc_xrcd(devr->x0); mlx5_ib_dealloc_xrcd(devr->x1); mlx5_ib_destroy_cq(devr->c0); mlx5_ib_dealloc_pd(devr->p0); } +static u32 get_core_cap_flags(struct ib_device *ibdev) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1); + u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type); + u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version); + u32 ret = 0; + + if (ll == IB_LINK_LAYER_INFINIBAND) + return RDMA_CORE_PORT_IBA_IB; + + ret = RDMA_CORE_PORT_RAW_PACKET; + + if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP)) + return ret; + + if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP)) + return ret; + + if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP) + ret |= RDMA_CORE_PORT_IBA_ROCE; + + if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP) + ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + + return ret; +} + +static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num); + int err; + + immutable->core_cap_flags = get_core_cap_flags(ibdev); + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; 
+ immutable->core_cap_flags = get_core_cap_flags(ibdev); + if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce)) + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + static void enable_dc_tracer(struct mlx5_ib_dev *dev) { struct device *device = dev->ib_dev.dma_device; struct mlx5_dc_tracer *dct = &dev->dctr; int order; void *tmp; int size; int err; size = MLX5_CAP_GEN(dev->mdev, num_ports) * 4096; if (size <= PAGE_SIZE) order = 0; else order = 1; dct->pg = alloc_pages(GFP_KERNEL, order); if (!dct->pg) { mlx5_ib_err(dev, "failed to allocate %d pages\n", order); return; } tmp = page_address(dct->pg); memset(tmp, 0xff, size); dct->size = size; dct->order = order; dct->dma = dma_map_page(device, dct->pg, 0, size, DMA_FROM_DEVICE); if (dma_mapping_error(device, dct->dma)) { mlx5_ib_err(dev, "dma mapping error\n"); goto map_err; } err = mlx5_core_set_dc_cnak_trace(dev->mdev, 1, dct->dma); if (err) { mlx5_ib_warn(dev, "failed to enable DC tracer\n"); goto cmd_err; } return; cmd_err: dma_unmap_page(device, dct->dma, size, DMA_FROM_DEVICE); map_err: __free_pages(dct->pg, dct->order); dct->pg = NULL; } static void disable_dc_tracer(struct mlx5_ib_dev *dev) { struct device *device = dev->ib_dev.dma_device; struct mlx5_dc_tracer *dct = &dev->dctr; int err; if (!dct->pg) return; err = mlx5_core_set_dc_cnak_trace(dev->mdev, 0, dct->dma); if (err) { mlx5_ib_warn(dev, "failed to disable DC tracer\n"); return; } dma_unmap_page(device, dct->dma, dct->size, DMA_FROM_DEVICE); __free_pages(dct->pg, dct->order); dct->pg = NULL; } enum { MLX5_DC_CNAK_SIZE = 128, MLX5_NUM_BUF_IN_PAGE = PAGE_SIZE / MLX5_DC_CNAK_SIZE, MLX5_CNAK_TX_CQ_SIGNAL_FACTOR = 128, MLX5_DC_CNAK_SL = 0, MLX5_DC_CNAK_VL = 0, }; static int init_dc_improvements(struct mlx5_ib_dev *dev) { if (!mlx5_core_is_pf(dev->mdev)) return 0; if (!(MLX5_CAP_GEN(dev->mdev, dc_cnak_trace))) return 0; enable_dc_tracer(dev); return 0; } static void cleanup_dc_improvements(struct mlx5_ib_dev *dev) { 
disable_dc_tracer(dev); } static void mlx5_ib_dealloc_q_port_counter(struct mlx5_ib_dev *dev, u8 port_num) { mlx5_vport_dealloc_q_counter(dev->mdev, MLX5_INTERFACE_PROTOCOL_IB, dev->port[port_num].q_cnt_id); dev->port[port_num].q_cnt_id = 0; } static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev) { unsigned int i; for (i = 0; i < dev->num_ports; i++) mlx5_ib_dealloc_q_port_counter(dev, i); } static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) { int i; int ret; for (i = 0; i < dev->num_ports; i++) { ret = mlx5_vport_alloc_q_counter(dev->mdev, MLX5_INTERFACE_PROTOCOL_IB, &dev->port[i].q_cnt_id); if (ret) { mlx5_ib_warn(dev, "couldn't allocate queue counter for port %d\n", i + 1); goto dealloc_counters; } } return 0; dealloc_counters: while (--i >= 0) mlx5_ib_dealloc_q_port_counter(dev, i); return ret; } struct port_attribute { struct attribute attr; ssize_t (*show)(struct mlx5_ib_port *, struct port_attribute *, char *buf); ssize_t (*store)(struct mlx5_ib_port *, struct port_attribute *, const char *buf, size_t count); }; struct port_counter_attribute { struct port_attribute attr; size_t offset; }; static ssize_t port_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct port_attribute *port_attr = container_of(attr, struct port_attribute, attr); struct mlx5_ib_port_sysfs_group *p = container_of(kobj, struct mlx5_ib_port_sysfs_group, kobj); struct mlx5_ib_port *mibport = container_of(p, struct mlx5_ib_port, group); if (!port_attr->show) return -EIO; return port_attr->show(mibport, port_attr, buf); } static ssize_t show_port_counter(struct mlx5_ib_port *p, struct port_attribute *port_attr, char *buf) { int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); struct port_counter_attribute *counter_attr = container_of(port_attr, struct port_counter_attribute, attr); void *out; int ret; out = mlx5_vzalloc(outlen); if (!out) return -ENOMEM; ret = mlx5_vport_query_q_counter(p->dev->mdev, p->q_cnt_id, 0, out, outlen); if (ret) goto free; 
ret = sprintf(buf, "%d\n", be32_to_cpu(*(__be32 *)(out + counter_attr->offset))); free: kfree(out); return ret; } #define PORT_COUNTER_ATTR(_name) \ struct port_counter_attribute port_counter_attr_##_name = { \ .attr = __ATTR(_name, S_IRUGO, show_port_counter, NULL), \ .offset = MLX5_BYTE_OFF(query_q_counter_out, _name) \ } static PORT_COUNTER_ATTR(rx_write_requests); static PORT_COUNTER_ATTR(rx_read_requests); static PORT_COUNTER_ATTR(rx_atomic_requests); static PORT_COUNTER_ATTR(rx_dct_connect); static PORT_COUNTER_ATTR(out_of_buffer); static PORT_COUNTER_ATTR(out_of_sequence); static PORT_COUNTER_ATTR(duplicate_request); static PORT_COUNTER_ATTR(rnr_nak_retry_err); static PORT_COUNTER_ATTR(packet_seq_err); static PORT_COUNTER_ATTR(implied_nak_seq_err); static PORT_COUNTER_ATTR(local_ack_timeout_err); static struct attribute *counter_attrs[] = { &port_counter_attr_rx_write_requests.attr.attr, &port_counter_attr_rx_read_requests.attr.attr, &port_counter_attr_rx_atomic_requests.attr.attr, &port_counter_attr_rx_dct_connect.attr.attr, &port_counter_attr_out_of_buffer.attr.attr, &port_counter_attr_out_of_sequence.attr.attr, &port_counter_attr_duplicate_request.attr.attr, &port_counter_attr_rnr_nak_retry_err.attr.attr, &port_counter_attr_packet_seq_err.attr.attr, &port_counter_attr_implied_nak_seq_err.attr.attr, &port_counter_attr_local_ack_timeout_err.attr.attr, NULL }; static struct attribute_group port_counters_group = { .name = "counters", .attrs = counter_attrs }; static const struct sysfs_ops port_sysfs_ops = { .show = port_attr_show }; static struct kobj_type port_type = { .sysfs_ops = &port_sysfs_ops, }; static int add_port_attrs(struct mlx5_ib_dev *dev, struct kobject *parent, struct mlx5_ib_port_sysfs_group *port, u8 port_num) { int ret; ret = kobject_init_and_add(&port->kobj, &port_type, parent, "%d", port_num); if (ret) return ret; if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { ret = 
sysfs_create_group(&port->kobj, &port_counters_group); if (ret) goto put_kobj; } port->enabled = true; return ret; put_kobj: kobject_put(&port->kobj); return ret; } static void destroy_ports_attrs(struct mlx5_ib_dev *dev, unsigned int num_ports) { unsigned int i; for (i = 0; i < num_ports; i++) { struct mlx5_ib_port_sysfs_group *port = &dev->port[i].group; if (!port->enabled) continue; if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) sysfs_remove_group(&port->kobj, &port_counters_group); kobject_put(&port->kobj); port->enabled = false; } if (dev->ports_parent) { kobject_put(dev->ports_parent); dev->ports_parent = NULL; } } static int create_port_attrs(struct mlx5_ib_dev *dev) { int ret = 0; unsigned int i = 0; struct device *device = &dev->ib_dev.dev; dev->ports_parent = kobject_create_and_add("mlx5_ports", &device->kobj); if (!dev->ports_parent) return -ENOMEM; for (i = 0; i < dev->num_ports; i++) { ret = add_port_attrs(dev, dev->ports_parent, &dev->port[i].group, i + 1); if (ret) goto _destroy_ports_attrs; } return 0; _destroy_ports_attrs: destroy_ports_attrs(dev, i); return ret; } static void *mlx5_ib_add(struct mlx5_core_dev *mdev) { struct mlx5_ib_dev *dev; int err; int i; printk_once(KERN_INFO "%s", mlx5_version); dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); if (!dev) return NULL; dev->mdev = mdev; dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port), GFP_KERNEL); if (!dev->port) goto err_dealloc; for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) { dev->port[i].dev = dev; dev->port[i].port_num = i; dev->port[i].port_gone = 0; memset(dev->port[i].gid_table, 0, sizeof(dev->port[i].gid_table)); } err = get_port_caps(dev); if (err) goto err_free_port; if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) { if (MLX5_CAP_GEN(mdev, roce)) { err = mlx5_nic_vport_enable_roce(mdev); if (err) goto err_free_port; } else { 
goto err_free_port; } } MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock); strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = mdev->special_contexts.resd_lkey; dev->num_ports = MLX5_CAP_GEN(mdev, num_ports); dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.num_comp_vectors = dev->mdev->priv.eq_table.num_comp_vectors; dev->ib_dev.dma_device = &mdev->pdev->dev; dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | (1ull << IB_USER_VERBS_CMD_OPEN_QP); dev->ib_dev.query_device = mlx5_ib_query_device; dev->ib_dev.query_port = mlx5_ib_query_port; dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer; dev->ib_dev.query_gid = mlx5_ib_query_gid; dev->ib_dev.query_pkey = mlx5_ib_query_pkey; dev->ib_dev.modify_device = mlx5_ib_modify_device; dev->ib_dev.modify_port = mlx5_ib_modify_port; dev->ib_dev.alloc_ucontext = mlx5_ib_alloc_ucontext; dev->ib_dev.dealloc_ucontext = mlx5_ib_dealloc_ucontext; dev->ib_dev.mmap = mlx5_ib_mmap; dev->ib_dev.alloc_pd = 
mlx5_ib_alloc_pd; dev->ib_dev.dealloc_pd = mlx5_ib_dealloc_pd; dev->ib_dev.create_ah = mlx5_ib_create_ah; dev->ib_dev.query_ah = mlx5_ib_query_ah; dev->ib_dev.destroy_ah = mlx5_ib_destroy_ah; dev->ib_dev.create_srq = mlx5_ib_create_srq; dev->ib_dev.modify_srq = mlx5_ib_modify_srq; dev->ib_dev.query_srq = mlx5_ib_query_srq; dev->ib_dev.destroy_srq = mlx5_ib_destroy_srq; dev->ib_dev.post_srq_recv = mlx5_ib_post_srq_recv; dev->ib_dev.create_qp = mlx5_ib_create_qp; dev->ib_dev.modify_qp = mlx5_ib_modify_qp; dev->ib_dev.query_qp = mlx5_ib_query_qp; dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp; dev->ib_dev.post_send = mlx5_ib_post_send; dev->ib_dev.post_recv = mlx5_ib_post_recv; dev->ib_dev.create_cq = mlx5_ib_create_cq; dev->ib_dev.modify_cq = mlx5_ib_modify_cq; dev->ib_dev.resize_cq = mlx5_ib_resize_cq; dev->ib_dev.destroy_cq = mlx5_ib_destroy_cq; dev->ib_dev.poll_cq = mlx5_ib_poll_cq; dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; dev->ib_dev.reg_phys_mr = mlx5_ib_reg_phys_mr; dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; dev->ib_dev.process_mad = mlx5_ib_process_mad; + dev->ib_dev.get_port_immutable = mlx5_port_immutable; dev->ib_dev.alloc_fast_reg_mr = mlx5_ib_alloc_fast_reg_mr; dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list; dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; if (MLX5_CAP_GEN(mdev, xrc)) { dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; dev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } err = init_node_data(dev); if (err) goto err_disable_roce; mutex_init(&dev->cap_mask_mutex); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); err = create_dev_resources(&dev->devr); if (err) goto err_disable_roce; 
err = mlx5_ib_alloc_q_counters(dev); if (err) goto err_odp; err = ib_register_device(&dev->ib_dev, NULL); if (err) goto err_q_cnt; err = create_umr_res(dev); if (err) goto err_dev; if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) { if (init_dc_improvements(dev)) mlx5_ib_dbg(dev, "init_dc_improvements - continuing\n"); } err = create_port_attrs(dev); if (err) goto err_dc; for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { err = device_create_file(&dev->ib_dev.dev, mlx5_class_attributes[i]); if (err) goto err_port_attrs; } if (1) { struct thread *rl_thread = NULL; struct proc *rl_proc = NULL; for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) { (void) kproc_kthread_add(mlx5_ib_roce_port_update, dev->port + i, &rl_proc, &rl_thread, RFHIGHPID, 0, "mlx5-ib-roce-port", "mlx5-ib-roce_port-%d", i); } } dev->ib_active = true; return dev; err_port_attrs: destroy_ports_attrs(dev, dev->num_ports); err_dc: if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) cleanup_dc_improvements(dev); destroy_umrc_res(dev); err_dev: ib_unregister_device(&dev->ib_dev); err_q_cnt: mlx5_ib_dealloc_q_counters(dev); err_odp: destroy_dev_resources(&dev->devr); err_disable_roce: if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce)) mlx5_nic_vport_disable_roce(mdev); err_free_port: kfree(dev->port); err_dealloc: ib_dealloc_device((struct ib_device *)dev); return NULL; } static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) { struct mlx5_ib_dev *dev = context; int i; for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) { dev->port[i].port_gone = 1; while (dev->port[i].port_gone != 2) pause("W", hz); } for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { device_remove_file(&dev->ib_dev.dev, mlx5_class_attributes[i]); } destroy_ports_attrs(dev, dev->num_ports); if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) cleanup_dc_improvements(dev); mlx5_ib_dealloc_q_counters(dev); 
ib_unregister_device(&dev->ib_dev); destroy_umrc_res(dev); destroy_dev_resources(&dev->devr); if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce)) mlx5_nic_vport_disable_roce(mdev); kfree(dev->port); ib_dealloc_device(&dev->ib_dev); } static struct mlx5_interface mlx5_ib_interface = { .add = mlx5_ib_add, .remove = mlx5_ib_remove, .event = mlx5_ib_event, .protocol = MLX5_INTERFACE_PROTOCOL_IB, }; static int __init mlx5_ib_init(void) { int err; if (deprecated_prof_sel != 2) printf("mlx5_ib: WARN: ""prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); err = mlx5_register_interface(&mlx5_ib_interface); if (err) goto clean_odp; mlx5_ib_wq = create_singlethread_workqueue("mlx5_ib_wq"); if (!mlx5_ib_wq) { printf("mlx5_ib: ERR: ""%s: failed to create mlx5_ib_wq\n", __func__); goto err_unreg; } return err; err_unreg: mlx5_unregister_interface(&mlx5_ib_interface); clean_odp: return err; } static void __exit mlx5_ib_cleanup(void) { destroy_workqueue(mlx5_ib_wq); mlx5_unregister_interface(&mlx5_ib_interface); } module_init_order(mlx5_ib_init, SI_ORDER_THIRD); module_exit_order(mlx5_ib_cleanup, SI_ORDER_THIRD); Index: stable/10/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c =================================================================== --- stable/10/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c (revision 325610) +++ stable/10/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c (revision 325611) @@ -1,2991 +1,2991 @@ /*- * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include "mlx5_ib.h" #include "user.h" #include #include #define IPV6_DEFAULT_HOPLIMIT 64 static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state); /* not supported currently */ static int workqueue_signature; enum { MLX5_IB_ACK_REQ_FREQ = 8, }; enum { MLX5_IB_DEFAULT_SCHED_QUEUE = 0x83, MLX5_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, MLX5_IB_LINK_TYPE_IB = 0, MLX5_IB_LINK_TYPE_ETH = 1 }; enum { MLX5_IB_SQ_STRIDE = 6, MLX5_IB_CACHE_LINE_SIZE = 64, }; enum { MLX5_RQ_NUM_STATE = MLX5_RQC_STATE_ERR + 1, MLX5_SQ_NUM_STATE = MLX5_SQC_STATE_ERR + 1, MLX5_QP_STATE = MLX5_QP_NUM_STATE + 1, MLX5_QP_STATE_BAD = MLX5_QP_STATE + 1, }; static const u32 mlx5_ib_opcode[] = { [IB_WR_SEND] = MLX5_OPCODE_SEND, [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, [IB_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ, 
[IB_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS, [IB_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA, [IB_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL, [IB_WR_LOCAL_INV] = MLX5_OPCODE_UMR, [IB_WR_FAST_REG_MR] = MLX5_OPCODE_UMR, [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_MASKED_CS, [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_MASKED_FA, }; struct umr_wr { u64 virt_addr; struct ib_pd *pd; unsigned int page_shift; unsigned int npages; u32 length; int access_flags; u32 mkey; }; static int is_qp0(enum ib_qp_type qp_type) { return qp_type == IB_QPT_SMI; } static int is_qp1(enum ib_qp_type qp_type) { return qp_type == IB_QPT_GSI; } static int is_sqp(enum ib_qp_type qp_type) { return is_qp0(qp_type) || is_qp1(qp_type); } static void *get_wqe(struct mlx5_ib_qp *qp, int offset) { return mlx5_buf_offset(&qp->buf, offset); } static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n) { return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); } void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) { return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); } static int query_wqe_idx(struct mlx5_ib_qp *qp) { struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); struct mlx5_query_qp_mbox_out *outb; struct mlx5_qp_context *context; int ret; outb = kzalloc(sizeof(*outb), GFP_KERNEL); if (!outb) return -ENOMEM; context = &outb->ctx; mutex_lock(&qp->mutex); ret = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb)); if (ret) goto out_free; ret = be16_to_cpu(context->hw_sq_wqe_counter) & (qp->sq.wqe_cnt - 1); out_free: mutex_unlock(&qp->mutex); kfree(outb); return ret; } static int mlx5_handle_sig_pipelining(struct mlx5_ib_qp *qp) { int wqe_idx; wqe_idx = query_wqe_idx(qp); if (wqe_idx < 0) { printf("mlx5_ib: ERR: ""Failed to query QP 0x%x wqe index\n", qp->mqp.qpn); return wqe_idx; } if (qp->sq.swr_ctx[wqe_idx].sig_piped) { struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); struct mlx5_wqe_ctrl_seg *cwqe; cwqe = mlx5_get_send_wqe(qp, wqe_idx); 
cwqe->opmod_idx_opcode = cpu_to_be32(be32_to_cpu(cwqe->opmod_idx_opcode) & 0xffffff00); qp->sq.swr_ctx[wqe_idx].w_list.opcode |= MLX5_OPCODE_SIGNATURE_CANCELED; mlx5_ib_dbg(dev, "Cancel QP 0x%x wqe_index 0x%x\n", qp->mqp.qpn, wqe_idx); } return 0; } static void mlx5_ib_sqd_work(struct work_struct *work) { struct mlx5_ib_sqd *sqd; struct mlx5_ib_qp *qp; struct ib_qp_attr qp_attr; sqd = container_of(work, struct mlx5_ib_sqd, work); qp = sqd->qp; if (mlx5_handle_sig_pipelining(qp)) goto out; mutex_lock(&qp->mutex); if (__mlx5_ib_modify_qp(&qp->ibqp, &qp_attr, 0, IB_QPS_SQD, IB_QPS_RTS)) printf("mlx5_ib: ERR: ""Failed to resume QP 0x%x\n", qp->mqp.qpn); mutex_unlock(&qp->mutex); out: kfree(sqd); } static void mlx5_ib_sigerr_sqd_event(struct mlx5_ib_qp *qp) { struct mlx5_ib_sqd *sqd; sqd = kzalloc(sizeof(*sqd), GFP_ATOMIC); if (!sqd) return; sqd->qp = qp; INIT_WORK(&sqd->work, mlx5_ib_sqd_work); queue_work(mlx5_ib_wq, &sqd->work); } static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) { struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; struct ib_event event; if (type == MLX5_EVENT_TYPE_SQ_DRAINED && to_mibqp(qp)->state != IB_QPS_SQD) { mlx5_ib_sigerr_sqd_event(to_mibqp(qp)); return; } if (type == MLX5_EVENT_TYPE_PATH_MIG) to_mibqp(qp)->port = to_mibqp(qp)->alt_port; if (ibqp->event_handler) { event.device = ibqp->device; event.element.qp = ibqp; switch (type) { case MLX5_EVENT_TYPE_PATH_MIG: event.event = IB_EVENT_PATH_MIG; break; case MLX5_EVENT_TYPE_COMM_EST: event.event = IB_EVENT_COMM_EST; break; case MLX5_EVENT_TYPE_SQ_DRAINED: event.event = IB_EVENT_SQ_DRAINED; break; case MLX5_EVENT_TYPE_SRQ_LAST_WQE: event.event = IB_EVENT_QP_LAST_WQE_REACHED; break; case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: event.event = IB_EVENT_QP_FATAL; break; case MLX5_EVENT_TYPE_PATH_MIG_FAILED: event.event = IB_EVENT_PATH_MIG_ERR; break; case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: event.event = IB_EVENT_QP_REQ_ERR; break; case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: event.event = 
IB_EVENT_QP_ACCESS_ERR; break; default: printf("mlx5_ib: WARN: ""mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn); return; } ibqp->event_handler(&event, ibqp->qp_context); } } static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd) { int wqe_size; int wq_size; /* Sanity check RQ size before proceeding */ if (cap->max_recv_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) return -EINVAL; if (!has_rq) { qp->rq.max_gs = 0; qp->rq.wqe_cnt = 0; qp->rq.wqe_shift = 0; cap->max_recv_wr = 0; cap->max_recv_sge = 0; } else { if (ucmd) { qp->rq.wqe_cnt = ucmd->rq_wqe_count; qp->rq.wqe_shift = ucmd->rq_wqe_shift; qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; qp->rq.max_post = qp->rq.wqe_cnt; } else { wqe_size = qp->wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : 0; wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg); wqe_size = roundup_pow_of_two(wqe_size); wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size; wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB); qp->rq.wqe_cnt = wq_size / wqe_size; if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq)) { mlx5_ib_dbg(dev, "wqe_size %d, max %d\n", wqe_size, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq)); return -EINVAL; } qp->rq.wqe_shift = ilog2(wqe_size); qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; qp->rq.max_post = qp->rq.wqe_cnt; } } return 0; } static int sq_overhead(enum ib_qp_type qp_type) { int size = 0; switch (qp_type) { case IB_QPT_XRC_INI: size += sizeof(struct mlx5_wqe_xrc_seg); /* fall through */ case IB_QPT_RC: size += sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_atomic_seg) + sizeof(struct mlx5_wqe_raddr_seg) + sizeof(struct mlx5_wqe_umr_ctrl_seg) + sizeof(struct mlx5_mkey_seg); break; case IB_QPT_XRC_TGT: return 0; case IB_QPT_UC: size += sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_raddr_seg) + 
sizeof(struct mlx5_wqe_umr_ctrl_seg) + sizeof(struct mlx5_mkey_seg); break; case IB_QPT_UD: case IB_QPT_SMI: case IB_QPT_GSI: size += sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_datagram_seg); break; default: return -EINVAL; } return size; } static int calc_send_wqe(struct ib_qp_init_attr *attr) { int inl_size = 0; int size; size = sq_overhead(attr->qp_type); if (size < 0) return size; if (attr->cap.max_inline_data) { inl_size = size + sizeof(struct mlx5_wqe_inline_seg) + attr->cap.max_inline_data; } size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); } static int get_send_sge(struct ib_qp_init_attr *attr, int wqe_size) { int max_sge; if (attr->qp_type == IB_QPT_RC) max_sge = (min_t(int, wqe_size, 512) - sizeof(struct mlx5_wqe_ctrl_seg) - sizeof(struct mlx5_wqe_raddr_seg)) / sizeof(struct mlx5_wqe_data_seg); else if (attr->qp_type == IB_QPT_XRC_INI) max_sge = (min_t(int, wqe_size, 512) - sizeof(struct mlx5_wqe_ctrl_seg) - sizeof(struct mlx5_wqe_xrc_seg) - sizeof(struct mlx5_wqe_raddr_seg)) / sizeof(struct mlx5_wqe_data_seg); else max_sge = (wqe_size - sq_overhead(attr->qp_type)) / sizeof(struct mlx5_wqe_data_seg); return min_t(int, max_sge, wqe_size - sq_overhead(attr->qp_type) / sizeof(struct mlx5_wqe_data_seg)); } static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, struct mlx5_ib_qp *qp) { int wqe_size; int wq_size; if (!attr->cap.max_send_wr) return 0; wqe_size = calc_send_wqe(attr); mlx5_ib_dbg(dev, "wqe_size %d\n", wqe_size); if (wqe_size < 0) return wqe_size; if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) { mlx5_ib_warn(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", wqe_size, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)); return -EINVAL; } qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) - sizeof(struct mlx5_wqe_inline_seg); attr->cap.max_inline_data = qp->max_inline_data; wq_size = roundup_pow_of_two(attr->cap.max_send_wr * 
(u64)wqe_size); qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { mlx5_ib_warn(dev, "wqe count(%d) exceeds limits(%d)\n", qp->sq.wqe_cnt, 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)); return -ENOMEM; } qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); qp->sq.max_gs = get_send_sge(attr, wqe_size); if (qp->sq.max_gs < attr->cap.max_send_sge) { mlx5_ib_warn(dev, "max sge(%d) exceeds limits(%d)\n", qp->sq.max_gs, attr->cap.max_send_sge); return -ENOMEM; } attr->cap.max_send_sge = qp->sq.max_gs; qp->sq.max_post = wq_size / wqe_size; attr->cap.max_send_wr = qp->sq.max_post; return wq_size; }

/*
 * Validate the send queue geometry requested by user space and derive
 * the buffer sizes of the QP.
 *
 * Rejects with -EINVAL a descriptor size above the max_wqe_sz_sq
 * capability, a non-zero sq_wqe_count that is not a power of two, and a
 * count above 1 << log_max_qp_sz.  For raw packet QPs the receive and
 * send buffers are sized separately; otherwise a single buffer covers
 * both queues and sq_buf_size is 0.
 *
 * Returns 0 on success.
 */
static int set_user_buf_size(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd, struct ib_qp_init_attr *attr)
{
	int desc_sz = 1 << qp->sq.wqe_shift;

	if (desc_sz > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) {
		mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n", desc_sz, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq));
		return -EINVAL;
	}

	/* A non-zero count must be an exact power of two. */
	if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) {
		mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n", ucmd->sq_wqe_count, ucmd->sq_wqe_count);
		return -EINVAL;
	}

	qp->sq.wqe_cnt = ucmd->sq_wqe_count;

	if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) {
		mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n", qp->sq.wqe_cnt, 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz));
		return -EINVAL;
	}

	/* NOTE(review): the shift by 6 presumably mirrors MLX5_IB_SQ_STRIDE -- confirm. */
	if (attr->qp_type == IB_QPT_RAW_PACKET) {
		qp->buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
		qp->sq_buf_size = qp->sq.wqe_cnt << 6;
	} else {
		qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + (qp->sq.wqe_cnt << 6);
		qp->sq_buf_size = 0;
	}

	return 0;
}

/*
 * Return 1 when the QP gets a receive queue of its own, 0 for XRC
 * initiator/target QPs, QPs attached to an SRQ, or a max_recv_wr of 0.
 */
static int qp_has_rq(struct ib_qp_init_attr *attr)
{
	if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT || attr->srq || !attr->cap.max_recv_wr)
		return 0;

	return 1;
}

/* Index of the first medium class uuar. */
static int first_med_uuar(void)
{
	return 1;
}

/*
 * Return the next index above n whose value modulo 4 is 0 or 1; indices
 * with bit 1 set in (n % 4) are skipped.
 */
static int next_uuar(int n)
{
	n++;

	while (((n % 4) & 2))
		n++;

	return n;
}

/*
 * Number of medium class uuars: the non fast path registers of all UARs
 * minus the low latency ones and one more, clamped to be non-negative.
 */
static int num_med_uuar(struct mlx5_uuar_info *uuari)
{
	int n;

	n = uuari->num_uars * MLX5_NON_FP_BF_REGS_PER_PAGE - uuari->num_low_latency_uuars - 1;

	return n >= 0 ? n : 0;
}

/* Upper bound (exclusive) used when scanning uuar indices. */
static int max_uuari(struct mlx5_uuar_info *uuari)
{
	return uuari->num_uars * 4;
}

/*
 * Return the index of the first high class uuar: walk num_med_uuar()
 * indices starting at first_med_uuar() and return the one that follows.
 *
 * NOTE(review): t is incremented before the comparison, so a medium
 * count of 0 does not appear to terminate the loop -- confirm that
 * callers never reach this with num_med_uuar() == 0.
 */
static int first_hi_uuar(struct mlx5_uuar_info *uuari)
{
	int med;
	int i;
	int t;

	med = num_med_uuar(uuari);
	for (t = 0, i = first_med_uuar();; i = next_uuar(i)) {
		t++;
		if (t == med)
			return next_uuar(i);
	}

	return 0;
}

/*
 * Claim the first high class uuar whose bitmap bit is clear.  Returns
 * its index, or -ENOMEM when every candidate is taken.
 */
static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari)
{
	int i;

	for (i = first_hi_uuar(uuari); i < max_uuari(uuari); i = next_uuar(i)) {
		if (!test_bit(i, uuari->bitmap)) {
			set_bit(i, uuari->bitmap);
			uuari->count[i]++;
			return i;
		}
	}

	return -ENOMEM;
}

/*
 * Pick the medium class uuar with the lowest use count, bump its count
 * and return its index.  The bitmap is not consulted here.
 */
static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari)
{
	int minidx = first_med_uuar();
	int i;

	for (i = first_med_uuar(); i < first_hi_uuar(uuari); i = next_uuar(i)) {
		if (uuari->count[i] < uuari->count[minidx])
			minidx = i;
	}

	uuari->count[minidx]++;

	return minidx;
}

/*
 * Allocate a uuar of latency class lat while holding uuari->lock.
 *
 * LOW always yields index 0 (with its count bumped), FAST_PATH always
 * yields index 2, MEDIUM and HIGH fail with -ENOMEM when uuari->ver is
 * below 2.  Any other class value returns -EINVAL.
 */
static int alloc_uuar(struct mlx5_uuar_info *uuari, enum mlx5_ib_latency_class lat)
{
	int uuarn = -EINVAL;

	mutex_lock(&uuari->lock);
	switch (lat) {
	case MLX5_IB_LATENCY_CLASS_LOW:
		uuarn = 0;
		uuari->count[uuarn]++;
		break;

	case MLX5_IB_LATENCY_CLASS_MEDIUM:
		if (uuari->ver < 2)
			uuarn = -ENOMEM;
		else
			uuarn = alloc_med_class_uuar(uuari);
		break;

	case MLX5_IB_LATENCY_CLASS_HIGH:
		if (uuari->ver < 2)
			uuarn = -ENOMEM;
		else
			uuarn = alloc_high_class_uuar(uuari);
		break;

	case MLX5_IB_LATENCY_CLASS_FAST_PATH:
		uuarn = 2;
		break;
	}
	mutex_unlock(&uuari->lock);

	return uuarn;
}

/* Release a medium class uuar: clear its bitmap bit and drop its count. */
static void free_med_class_uuar(struct mlx5_uuar_info *uuari, int uuarn)
{
	clear_bit(uuarn, uuari->bitmap);
	--uuari->count[uuarn];
}

/* Release a high class uuar: clear its bitmap bit and drop its count. */
static void free_high_class_uuar(struct mlx5_uuar_info *uuari, int uuarn)
{
	clear_bit(uuarn, uuari->bitmap);
	--uuari->count[uuarn];
}

static void free_uuar(struct mlx5_uuar_info *uuari, int uuarn) { int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE; int high_uuar = nuuars - 
uuari->num_low_latency_uuars; mutex_lock(&uuari->lock); if (uuarn == 0) { --uuari->count[uuarn]; goto out; } if (uuarn < high_uuar) { free_med_class_uuar(uuari, uuarn); goto out; } free_high_class_uuar(uuari, uuarn); out: mutex_unlock(&uuari->lock); } static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state) { switch (state) { case IB_QPS_RESET: return MLX5_QP_STATE_RST; case IB_QPS_INIT: return MLX5_QP_STATE_INIT; case IB_QPS_RTR: return MLX5_QP_STATE_RTR; case IB_QPS_RTS: return MLX5_QP_STATE_RTS; case IB_QPS_SQD: return MLX5_QP_STATE_SQD; case IB_QPS_SQE: return MLX5_QP_STATE_SQER; case IB_QPS_ERR: return MLX5_QP_STATE_ERR; default: return -1; } } static int to_mlx5_st(enum ib_qp_type type) { switch (type) { case IB_QPT_RC: return MLX5_QP_ST_RC; case IB_QPT_UC: return MLX5_QP_ST_UC; case IB_QPT_UD: return MLX5_QP_ST_UD; case IB_QPT_XRC_INI: case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; case IB_QPT_SMI: return MLX5_QP_ST_QP0; case IB_QPT_GSI: return MLX5_QP_ST_QP1; case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; case IB_QPT_RAW_PACKET: case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; case IB_QPT_MAX: default: return -EINVAL; } } static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq); static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq); static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn) { return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index; } static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_udata *udata, struct ib_qp_init_attr *attr, struct mlx5_create_qp_mbox_in **in, int *inlen, struct mlx5_exp_ib_create_qp *ucmd) { struct mlx5_exp_ib_create_qp_resp resp; struct mlx5_ib_ucontext *context; int page_shift = 0; int uar_index; int npages; u32 offset = 0; int uuarn; int ncont = 0; int err; context = to_mucontext(pd->uobject->context); memset(&resp, 0, sizeof(resp)); resp.size_of_prefix = offsetof(struct 
mlx5_exp_ib_create_qp_resp, prefix_reserved); /* * TBD: should come from the verbs when we have the API */ if (ucmd->exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_WC_UAR_IDX) { if (ucmd->exp.wc_uar_index == MLX5_EXP_CREATE_QP_DB_ONLY_UUAR) { /* Assign LATENCY_CLASS_LOW (DB only UUAR) to this QP */ uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); if (uuarn < 0) { mlx5_ib_warn(dev, "DB only uuar allocation failed\n"); return uuarn; } uar_index = uuarn_to_uar_index(&context->uuari, uuarn); } else if (ucmd->exp.wc_uar_index >= MLX5_IB_MAX_CTX_DYNAMIC_UARS || context->dynamic_wc_uar_index[ucmd->exp.wc_uar_index] == MLX5_IB_INVALID_UAR_INDEX) { mlx5_ib_warn(dev, "dynamic uuar allocation failed\n"); return -EINVAL; } else { uar_index = context->dynamic_wc_uar_index[ucmd->exp.wc_uar_index]; uuarn = MLX5_EXP_INVALID_UUAR; } } else { uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); if (uuarn < 0) { mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); mlx5_ib_dbg(dev, "reverting to medium latency\n"); uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); if (uuarn < 0) { mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); mlx5_ib_dbg(dev, "reverting to high latency\n"); uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); if (uuarn < 0) { mlx5_ib_warn(dev, "uuar allocation failed\n"); return uuarn; } } } uar_index = uuarn_to_uar_index(&context->uuari, uuarn); } mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index); qp->rq.offset = 0; qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; err = set_user_buf_size(dev, qp, (struct mlx5_ib_create_qp *)ucmd, attr); if (err) goto err_uuar; if (ucmd->buf_addr && qp->buf_size) { qp->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr, qp->buf_size, 0, 0); if (IS_ERR(qp->umem)) { mlx5_ib_warn(dev, "umem_get failed\n"); err = PTR_ERR(qp->umem); goto err_uuar; } } else { qp->umem = NULL; } if (qp->umem) { 
mlx5_ib_cont_pages(qp->umem, ucmd->buf_addr, &npages, &page_shift, &ncont, NULL); err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift, &offset); if (err) { mlx5_ib_warn(dev, "bad offset\n"); goto err_umem; } mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n", (unsigned long long)ucmd->buf_addr, qp->buf_size, npages, page_shift, ncont, offset); } *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; *in = mlx5_vzalloc(*inlen); if (!*in) { err = -ENOMEM; goto err_umem; } if (qp->umem) mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); (*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); (*in)->ctx.params2 = cpu_to_be32(offset << 6); (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); resp.uuar_index = uuarn; qp->uuarn = uuarn; err = mlx5_ib_db_map_user(context, ucmd->db_addr, &qp->db); if (err) { mlx5_ib_warn(dev, "map failed\n"); goto err_free; } err = ib_copy_to_udata(udata, &resp, sizeof(struct mlx5_ib_create_qp_resp)); if (err) { mlx5_ib_err(dev, "copy failed\n"); goto err_unmap; } qp->create_type = MLX5_QP_USER; return 0; err_unmap: mlx5_ib_db_unmap_user(context, &qp->db); err_free: kvfree(*in); err_umem: if (qp->umem) ib_umem_release(qp->umem); err_uuar: free_uuar(&context->uuari, uuarn); return err; } static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp) { struct mlx5_ib_ucontext *context; context = to_mucontext(pd->uobject->context); mlx5_ib_db_unmap_user(context, &qp->db); if (qp->umem) ib_umem_release(qp->umem); if (qp->sq_umem) ib_umem_release(qp->sq_umem); /* * Free only the UUARs handled by the kernel. * UUARs of UARs allocated dynamically are handled by user. 
*/ if (qp->uuarn != MLX5_EXP_INVALID_UUAR) free_uuar(&context->uuari, qp->uuarn); } static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *init_attr, struct mlx5_ib_qp *qp, struct mlx5_create_qp_mbox_in **in, int *inlen) { enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW; struct mlx5_uuar_info *uuari; int uar_index; int uuarn; int err; uuari = &dev->mdev->priv.uuari; if (init_attr->create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) return -EINVAL; uuarn = alloc_uuar(uuari, lc); if (uuarn < 0) { mlx5_ib_warn(dev, "\n"); return -ENOMEM; } qp->bf = &uuari->bfs[uuarn]; uar_index = qp->bf->uar->index; err = calc_sq_size(dev, init_attr, qp); if (err < 0) { mlx5_ib_warn(dev, "err %d\n", err); goto err_uuar; } qp->rq.offset = 0; qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); err = mlx5_buf_alloc(dev->mdev, qp->buf_size, PAGE_SIZE * 2, &qp->buf); if (err) { mlx5_ib_warn(dev, "err %d\n", err); goto err_uuar; } qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt); *inlen = sizeof(**in) + sizeof(*(*in)->pas) * qp->buf.npages; *in = mlx5_vzalloc(*inlen); if (!*in) { err = -ENOMEM; goto err_buf; } (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); (*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); /* Set "fast registration enabled" for all kernel QPs */ (*in)->ctx.params1 |= cpu_to_be32(1 << 11); (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); mlx5_fill_page_array(&qp->buf, (*in)->pas); err = mlx5_db_alloc(dev->mdev, &qp->db); if (err) { mlx5_ib_warn(dev, "err %d\n", err); goto err_free; } qp->sq.swr_ctx = kcalloc(qp->sq.wqe_cnt, sizeof(*qp->sq.swr_ctx), GFP_KERNEL); qp->rq.rwr_ctx = kcalloc(qp->rq.wqe_cnt, sizeof(*qp->rq.rwr_ctx), GFP_KERNEL); if (!qp->sq.swr_ctx || !qp->rq.rwr_ctx) { err = -ENOMEM; goto err_wrid; } qp->create_type = MLX5_QP_KERNEL; return 0; err_wrid: mlx5_db_free(dev->mdev, &qp->db); 
kfree(qp->sq.swr_ctx); kfree(qp->rq.rwr_ctx); err_free: kvfree(*in); err_buf: mlx5_buf_free(dev->mdev, &qp->buf); err_uuar: free_uuar(&dev->mdev->priv.uuari, uuarn); return err; } static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) { mlx5_db_free(dev->mdev, &qp->db); kfree(qp->sq.swr_ctx); kfree(qp->rq.rwr_ctx); mlx5_buf_free(dev->mdev, &qp->buf); free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn); } static __be32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) { enum ib_qp_type qt = attr->qp_type; if (attr->srq || (qt == IB_QPT_XRC_TGT) || (qt == IB_QPT_XRC_INI)) return cpu_to_be32(MLX5_SRQ_RQ); else if (!qp->has_rq) return cpu_to_be32(MLX5_ZERO_LEN_RQ); else return cpu_to_be32(MLX5_NON_ZERO_RQ); } static int is_connected(enum ib_qp_type qp_type) { if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC) return 1; return 0; } static void get_cqs(enum ib_qp_type qp_type, struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq) { switch (qp_type) { case IB_QPT_XRC_TGT: *send_cq = NULL; *recv_cq = NULL; break; case IB_QPT_XRC_INI: *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; *recv_cq = NULL; break; case IB_QPT_SMI: case IB_QPT_GSI: case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_UD: case IB_QPT_RAW_IPV6: case IB_QPT_RAW_ETHERTYPE: case IB_QPT_RAW_PACKET: *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; *recv_cq = ib_recv_cq ? 
to_mcq(ib_recv_cq) : NULL; break; case IB_QPT_MAX: default: *send_cq = NULL; *recv_cq = NULL; break; } } enum { MLX5_QP_END_PAD_MODE_ALIGN = MLX5_WQ_END_PAD_MODE_ALIGN, MLX5_QP_END_PAD_MODE_NONE = MLX5_WQ_END_PAD_MODE_NONE, }; static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, struct mlx5_ib_qp *qp) { struct mlx5_ib_resources *devr = &dev->devr; struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_create_qp_mbox_in *in = NULL; struct mlx5_exp_ib_create_qp ucmd; struct mlx5_ib_create_qp *pucmd = NULL; struct mlx5_ib_cq *send_cq; struct mlx5_ib_cq *recv_cq; unsigned long flags; int inlen = sizeof(*in); size_t ucmd_size; int err; int st; u32 uidx; void *qpc; mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { if (!MLX5_CAP_GEN(mdev, block_lb_mc)) { mlx5_ib_warn(dev, "block multicast loopback isn't supported\n"); return -EINVAL; } else { qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; } } if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; if (pd && pd->uobject) { memset(&ucmd, 0, sizeof(ucmd)); ucmd_size = sizeof(struct mlx5_ib_create_qp); if (ucmd_size > offsetof(struct mlx5_exp_ib_create_qp, size_of_prefix)) { mlx5_ib_warn(dev, "mlx5_ib_create_qp is too big to fit as prefix of mlx5_exp_ib_create_qp\n"); return -EINVAL; } err = ib_copy_from_udata(&ucmd, udata, min(udata->inlen, ucmd_size)); if (err) { mlx5_ib_err(dev, "copy failed\n"); return err; } pucmd = (struct mlx5_ib_create_qp *)&ucmd; if (ucmd.exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_UIDX) uidx = ucmd.exp.uidx; else uidx = 0xffffff; qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); } else { qp->wq_sig = !!workqueue_signature; uidx = 0xffffff; } qp->has_rq = qp_has_rq(init_attr); err = set_rq_size(dev, &init_attr->cap, qp->has_rq, qp, (pd && pd->uobject) ? 
pucmd : NULL); if (err) { mlx5_ib_warn(dev, "err %d\n", err); return err; } if (pd) { if (pd->uobject) { __u32 max_wqes = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count); if (ucmd.rq_wqe_shift != qp->rq.wqe_shift || ucmd.rq_wqe_count != qp->rq.wqe_cnt) { mlx5_ib_warn(dev, "invalid rq params\n"); return -EINVAL; } if (ucmd.sq_wqe_count > max_wqes) { mlx5_ib_warn(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", ucmd.sq_wqe_count, max_wqes); return -EINVAL; } err = create_user_qp(dev, pd, qp, udata, init_attr, &in, &inlen, &ucmd); if (err) mlx5_ib_warn(dev, "err %d\n", err); } else { if (init_attr->qp_type == IB_QPT_RAW_PACKET) { mlx5_ib_warn(dev, "Raw Eth QP is disabled for Kernel consumers\n"); return -EINVAL; } err = create_kernel_qp(dev, init_attr, qp, &in, &inlen); if (err) mlx5_ib_warn(dev, "err %d\n", err); else qp->pa_lkey = to_mpd(pd)->pa_lkey; } if (err) return err; } else { in = mlx5_vzalloc(sizeof(*in)); if (!in) return -ENOMEM; qp->create_type = MLX5_QP_EMPTY; } if (is_sqp(init_attr->qp_type)) qp->port = init_attr->port_num; st = to_mlx5_st(init_attr->qp_type); if (st < 0) { mlx5_ib_warn(dev, "invalid service type\n"); err = st; goto err_create; } in->ctx.flags |= cpu_to_be32(st << 16 | MLX5_QP_PM_MIGRATED << 11); in->ctx.flags_pd = cpu_to_be32(to_mpd(pd ? 
pd : devr->p0)->pdn); if (qp->wq_sig) in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_ENABLE_SIG); if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST); if (qp->flags & MLX5_IB_QP_CAP_RX_END_PADDING) in->ctx.flags |= cpu_to_be32(MLX5_QP_END_PAD_MODE_ALIGN << 2); else in->ctx.flags |= cpu_to_be32(MLX5_QP_END_PAD_MODE_NONE << 2); if (qp->scat_cqe && is_connected(init_attr->qp_type)) { int rcqe_sz; int scqe_sz; rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq); scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq); if (rcqe_sz == 128) { in->ctx.cs_res = MLX5_RES_SCAT_DATA64_CQE; } else { in->ctx.cs_res = MLX5_RES_SCAT_DATA32_CQE; } if (init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) { in->ctx.cs_req = 0; } else { if (scqe_sz == 128) in->ctx.cs_req = MLX5_REQ_SCAT_DATA64_CQE; else in->ctx.cs_req = MLX5_REQ_SCAT_DATA32_CQE; } } if (qp->rq.wqe_cnt) { in->ctx.rq_size_stride = (qp->rq.wqe_shift - 4); in->ctx.rq_size_stride |= ilog2(qp->rq.wqe_cnt) << 3; } in->ctx.rq_type_srqn = get_rx_type(qp, init_attr); if (qp->sq.wqe_cnt) in->ctx.sq_crq_size |= cpu_to_be16(ilog2(qp->sq.wqe_cnt) << 11); else in->ctx.sq_crq_size |= cpu_to_be16(0x8000); /* Set default resources */ switch (init_attr->qp_type) { case IB_QPT_XRC_TGT: in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); in->ctx.cqn_send = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); in->ctx.xrcd = cpu_to_be32(to_mxrcd(init_attr->xrcd)->xrcdn); break; case IB_QPT_XRC_INI: in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); break; default: if (init_attr->srq) { in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x0)->xrcdn); in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(init_attr->srq)->msrq.srqn); } else { in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); in->ctx.rq_type_srqn |= 
cpu_to_be32(to_msrq(devr->s1)->msrq.srqn); } } if (init_attr->send_cq) in->ctx.cqn_send = cpu_to_be32(to_mcq(init_attr->send_cq)->mcq.cqn); if (init_attr->recv_cq) in->ctx.cqn_recv = cpu_to_be32(to_mcq(init_attr->recv_cq)->mcq.cqn); in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma); if (MLX5_CAP_GEN(mdev, cqe_version)) { qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); /* 0xffffff means we ask to work with cqe version 0 */ MLX5_SET(qpc, qpc, user_index, uidx); } if (init_attr->qp_type == IB_QPT_RAW_PACKET) { if (MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) { mlx5_ib_warn(dev, "Raw Ethernet QP is allowed only for Ethernet link layer\n"); return -ENOSYS; } if (ucmd.exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_SQ_BUFF_ADD) { qp->sq_buf_addr = ucmd.exp.sq_buf_addr; } else { mlx5_ib_warn(dev, "Raw Ethernet QP needs SQ buff address\n"); return -EINVAL; } err = -EOPNOTSUPP; } else { err = mlx5_core_create_qp(dev->mdev, &qp->mqp, in, inlen); qp->mqp.event = mlx5_ib_qp_event; } if (err) { mlx5_ib_warn(dev, "create qp failed\n"); goto err_create; } kvfree(in); /* Hardware wants QPN written in big-endian order (after * shifting) for send doorbell. Precompute this value to save * a little bit when posting sends. 
*/ qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); get_cqs(init_attr->qp_type, init_attr->send_cq, init_attr->recv_cq, &send_cq, &recv_cq); spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); mlx5_ib_lock_cqs(send_cq, recv_cq); /* Maintain device to QPs access, needed for further handling via reset * flow */ list_add_tail(&qp->qps_list, &dev->qp_list); /* Maintain CQ to QPs access, needed for further handling via reset flow */ if (send_cq) list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp); if (recv_cq) list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp); mlx5_ib_unlock_cqs(send_cq, recv_cq); spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); return 0; err_create: if (qp->create_type == MLX5_QP_USER) destroy_qp_user(pd, qp); else if (qp->create_type == MLX5_QP_KERNEL) destroy_qp_kernel(dev, qp); kvfree(in); return err; } static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) __acquires(&send_cq->lock) __acquires(&recv_cq->lock) { if (send_cq) { if (recv_cq) { if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_lock(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { spin_lock(&send_cq->lock); __acquire(&recv_cq->lock); } else { spin_lock(&recv_cq->lock); spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); } } else { spin_lock(&send_cq->lock); __acquire(&recv_cq->lock); } } else if (recv_cq) { spin_lock(&recv_cq->lock); __acquire(&send_cq->lock); } else { __acquire(&send_cq->lock); __acquire(&recv_cq->lock); } } static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) __releases(&send_cq->lock) __releases(&recv_cq->lock) { if (send_cq) { if (recv_cq) { if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_unlock(&recv_cq->lock); spin_unlock(&send_cq->lock); } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { __release(&recv_cq->lock); spin_unlock(&send_cq->lock); } else { spin_unlock(&send_cq->lock); 
spin_unlock(&recv_cq->lock);
			}
		} else {
			__release(&recv_cq->lock);
			spin_unlock(&send_cq->lock);
		}
	} else if (recv_cq) {
		__release(&send_cq->lock);
		spin_unlock(&recv_cq->lock);
	} else {
		__release(&recv_cq->lock);
		__release(&send_cq->lock);
	}
}

/* Return the mlx5 protection domain that the QP's verbs-level PD maps to. */
static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp)
{
	return to_mpd(qp->ibqp.pd);
}

/*
 * Tear a QP down: move it to RESET in firmware (unless it already is, or it
 * is a raw packet QP), unlink it from the device and CQ lists, purge its CQ
 * entries when it is a kernel QP, destroy the firmware object and finally
 * release the user or kernel queue resources.
 *
 * NOTE(review): when the mailbox allocation fails the function returns
 * without doing any of the above -- confirm callers can tolerate that.
 */
static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
{
	struct mlx5_ib_cq *send_cq, *recv_cq;
	struct mlx5_modify_qp_mbox_in *in;
	unsigned long flags;
	int err;

	in = kzalloc(sizeof(*in), GFP_KERNEL);
	if (!in)
		return;

	if (qp->state != IB_QPS_RESET) {
		if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) {
			/* A failed transition is only logged; teardown carries on. */
			if (mlx5_core_qp_modify(dev->mdev, MLX5_CMD_OP_2RST_QP, in, 0,
						&qp->mqp))
				mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n",
					     qp->mqp.qpn);
		}
	}

	get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
		&send_cq, &recv_cq);

	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
	mlx5_ib_lock_cqs(send_cq, recv_cq);
	/* del from lists under both locks above to protect reset flow paths */
	list_del(&qp->qps_list);
	if (send_cq)
		list_del(&qp->cq_send_list);

	if (recv_cq)
		list_del(&qp->cq_recv_list);

	if (qp->create_type == MLX5_QP_KERNEL) {
		__mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn,
				   qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
		/* Avoid cleaning the same CQ twice when send and receive share one. */
		if (send_cq != recv_cq)
			__mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
	}
	mlx5_ib_unlock_cqs(send_cq, recv_cq);
	spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);

	/* Raw packet QPs have no firmware object to destroy here. */
	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
	} else {
		err = mlx5_core_destroy_qp(dev->mdev, &qp->mqp);
		if (err)
			mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n",
				     qp->mqp.qpn);
	}

	kfree(in);

	if (qp->create_type == MLX5_QP_KERNEL)
		destroy_qp_kernel(dev, qp);
	else if (qp->create_type == MLX5_QP_USER)
		destroy_qp_user(&get_pd(qp)->ibpd, qp);
}

/* Map a QP type to its enumerator name for log messages. */
static const char *ib_qp_type_str(enum ib_qp_type type)
{
	switch (type) {
	case IB_QPT_SMI:
		return "IB_QPT_SMI";
	case IB_QPT_GSI:
		return "IB_QPT_GSI";
	case IB_QPT_RC:
		return "IB_QPT_RC";
	case IB_QPT_UC:
		return "IB_QPT_UC";
	case IB_QPT_UD:
		return "IB_QPT_UD";
	case IB_QPT_RAW_IPV6:
		return "IB_QPT_RAW_IPV6";
	case IB_QPT_RAW_ETHERTYPE:
		return "IB_QPT_RAW_ETHERTYPE";
	case IB_QPT_XRC_INI:
		return "IB_QPT_XRC_INI";
	case IB_QPT_XRC_TGT:
		return "IB_QPT_XRC_TGT";
	case IB_QPT_RAW_PACKET:
		return "IB_QPT_RAW_PACKET";
	case IB_QPT_MAX:
	default:
		return "Invalid QP type";
	}
}

/*
 * Verbs entry point for QP creation.
 *
 * @pd may be NULL only for IB_QPT_XRC_TGT, in which case the device is taken
 * from the XRC domain in @init_attr.  Returns the new ib_qp or an ERR_PTR():
 * -EINVAL for a missing PD or an unsupported type, -ENOSYS when XRC is not
 * supported by the device, -ENOMEM on allocation failure, or whatever
 * create_qp_common() reported.
 */
struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
				struct ib_qp_init_attr *init_attr,
				struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev;
	struct mlx5_ib_qp *qp;
	u16 xrcdn = 0;
	int err;
	u32 rcqn;
	u32 scqn;

	init_attr->qpg_type = IB_QPG_NONE;

	if (pd) {
		dev = to_mdev(pd->device);
	} else {
		/* being cautious here */
		if (init_attr->qp_type != IB_QPT_XRC_TGT) {
			printf("mlx5_ib: WARN: ""%s: no PD for transport %s\n", __func__, ib_qp_type_str(init_attr->qp_type));
			return ERR_PTR(-EINVAL);
		}
		dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device);
	}

	switch (init_attr->qp_type) {
	case IB_QPT_XRC_TGT:
	case IB_QPT_XRC_INI:
		if (!MLX5_CAP_GEN(dev->mdev, xrc)) {
			mlx5_ib_warn(dev, "XRC not supported\n");
			return ERR_PTR(-ENOSYS);
		}
		/* XRC QPs never use a receive CQ; targets use no send CQ either. */
		init_attr->recv_cq = NULL;
		if (init_attr->qp_type == IB_QPT_XRC_TGT) {
			xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
			init_attr->send_cq = NULL;
		}

		/* fall through */
	case IB_QPT_RC:
	case IB_QPT_UC:
	case IB_QPT_UD:
	case IB_QPT_SMI:
	case IB_QPT_GSI:
	case IB_QPT_RAW_ETHERTYPE:
	case IB_QPT_RAW_PACKET:
		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
		if (!qp)
			return ERR_PTR(-ENOMEM);

		err = create_qp_common(dev, pd, init_attr, udata, qp);
		if (err) {
			mlx5_ib_warn(dev, "create_qp_common failed\n");
			kfree(qp);
			return ERR_PTR(err);
		}

		/* QP0 and QP1 keep their well-known numbers; others expose the hardware QPN. */
		if (is_qp0(init_attr->qp_type))
			qp->ibqp.qp_num = 0;
		else if (is_qp1(init_attr->qp_type))
			qp->ibqp.qp_num = 1;
		else
			qp->ibqp.qp_num = qp->mqp.qpn;

		/* -1 marks "no CQ" in the debug output below. */
		rcqn = init_attr->recv_cq ? to_mcq(init_attr->recv_cq)->mcq.cqn : -1;
		scqn = init_attr->send_cq ? to_mcq(init_attr->send_cq)->mcq.cqn : -1;
		mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n",
			    qp->ibqp.qp_num, qp->mqp.qpn, rcqn, scqn);

		qp->xrcdn = xrcdn;

		break;

	case IB_QPT_RAW_IPV6:
	case IB_QPT_MAX:
	default:
		mlx5_ib_warn(dev, "unsupported qp type %d\n",
			     init_attr->qp_type);
		/* Don't support raw QPs */
		return ERR_PTR(-EINVAL);
	}

	return &qp->ibqp;
}

/* Verbs entry point for QP destruction: tear the QP down and free it.  Always returns 0. */
int mlx5_ib_destroy_qp(struct ib_qp *qp)
{
	struct mlx5_ib_dev *dev = to_mdev(qp->device);
	struct mlx5_ib_qp *mqp = to_mqp(qp);

	destroy_qp_common(dev, mqp);

	kfree(mqp);

	return 0;
}

/*
 * Derive the atomic-mode bits for a QP from the highest bit that is set in
 * both the atomic_size_qp and atomic_size_dc capabilities.  Returns
 * MLX5_ATOMIC_MODE_NONE when that bit is below 2 or no bit is set, and
 * MLX5_ATOMIC_MODE_CX when it is exactly 2; otherwise the bit index shifted
 * by MLX5_ATOMIC_MODE_OFF.
 */
static u32 atomic_mode_qp(struct mlx5_ib_dev *dev)
{
	unsigned long mask;
	unsigned long tmp;

	mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp) &
		MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc);

	tmp = find_last_bit(&mask, BITS_PER_LONG);
	if (tmp < 2 || tmp >= BITS_PER_LONG)
		return MLX5_ATOMIC_MODE_NONE;

	if (tmp == 2)
		return MLX5_ATOMIC_MODE_CX;

	return tmp << MLX5_ATOMIC_MODE_OFF;
}

/*
 * Build the big-endian hardware access bits (RRE/RAE/RWE plus atomic mode)
 * for a modify-QP command.  Values missing from @attr_mask are taken from the
 * state cached in @qp.
 */
static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr,
				   int attr_mask)
{
	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);
	u32 hw_access_flags = 0;
	u8 dest_rd_atomic;
	u32 access_flags;

	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
		dest_rd_atomic = attr->max_dest_rd_atomic;
	else
		dest_rd_atomic = qp->resp_depth;

	if (attr_mask & IB_QP_ACCESS_FLAGS)
		access_flags = attr->qp_access_flags;
	else
		access_flags = qp->atomic_rd_en;

	if
(!dest_rd_atomic) access_flags &= IB_ACCESS_REMOTE_WRITE; if (access_flags & IB_ACCESS_REMOTE_READ) hw_access_flags |= MLX5_QP_BIT_RRE; if (access_flags & IB_ACCESS_REMOTE_ATOMIC) hw_access_flags |= (MLX5_QP_BIT_RAE | atomic_mode_qp(dev)); if (access_flags & IB_ACCESS_REMOTE_WRITE) hw_access_flags |= MLX5_QP_BIT_RWE; return cpu_to_be32(hw_access_flags); } enum { MLX5_PATH_FLAG_FL = 1 << 0, MLX5_PATH_FLAG_FREE_AR = 1 << 1, MLX5_PATH_FLAG_COUNTER = 1 << 2, }; static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) { if (rate == IB_RATE_PORT_CURRENT) { return 0; } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) { return -EINVAL; } else { while (rate != IB_RATE_2_5_GBPS && !(1 << (rate + MLX5_STAT_RATE_OFFSET) & MLX5_CAP_GEN(dev->mdev, stat_rate_support))) --rate; } return rate + MLX5_STAT_RATE_OFFSET; } static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, struct mlx5_qp_path *path, u8 port, int attr_mask, u32 path_flags, const struct ib_qp_attr *attr, int alt) { enum rdma_link_layer ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); int err; int gid_type; if ((ll == IB_LINK_LAYER_ETHERNET) || (ah->ah_flags & IB_AH_GRH)) { - int len = dev->ib_dev.gid_tbl_len[port - 1]; + int len = dev->mdev->port_caps[port - 1].gid_table_len; if (ah->grh.sgid_index >= len) { printf("mlx5_ib: ERR: ""sgid_index (%u) too large. max is %d\n", ah->grh.sgid_index, len - 1); return -EINVAL; } } if (ll == IB_LINK_LAYER_ETHERNET) { if (!(ah->ah_flags & IB_AH_GRH)) return -EINVAL; err = mlx5_get_roce_gid_type(dev, port, ah->grh.sgid_index, &gid_type); if (err) return err; err = mlx5_ib_resolve_grh(ah, path->rmac, NULL); if (err) return err; path->udp_sport = mlx5_get_roce_udp_sport(dev, port, ah->grh.sgid_index, 0); path->dci_cfi_prio_sl = (ah->sl & 0xf) << 4; } else { path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 
0x80 : 0; path->grh_mlid = ah->src_path_bits & 0x7f; path->rlid = cpu_to_be16(ah->dlid); if (ah->ah_flags & IB_AH_GRH) path->grh_mlid |= 1 << 7; if (attr_mask & IB_QP_PKEY_INDEX) path->pkey_index = cpu_to_be16(alt ? attr->alt_pkey_index : attr->pkey_index); path->dci_cfi_prio_sl = ah->sl & 0xf; } path->fl_free_ar |= (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x40 : 0; if (ah->ah_flags & IB_AH_GRH) { path->mgid_index = ah->grh.sgid_index; path->hop_limit = ah->grh.hop_limit; path->tclass_flowlabel = cpu_to_be32((ah->grh.traffic_class << 20) | (ah->grh.flow_label)); memcpy(path->rgid, ah->grh.dgid.raw, 16); } err = ib_rate_to_mlx5(dev, ah->static_rate); if (err < 0) return err; path->static_rate = err; path->port = port; if (attr_mask & IB_QP_TIMEOUT) path->ackto_lt = alt ? attr->alt_timeout << 3 : attr->timeout << 3; return 0; } static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = { [MLX5_QP_STATE_INIT] = { [MLX5_QP_STATE_INIT] = { [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_PKEY_INDEX | MLX5_QP_OPTPAR_PRI_PORT, [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_PKEY_INDEX | MLX5_QP_OPTPAR_PRI_PORT, [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | MLX5_QP_OPTPAR_Q_KEY | MLX5_QP_OPTPAR_PRI_PORT, [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_PRI_PORT | MLX5_QP_OPTPAR_DC_KEY | MLX5_QP_OPTPAR_PKEY_INDEX | MLX5_QP_OPTPAR_RAE, }, [MLX5_QP_STATE_RTR] = { [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_PKEY_INDEX, [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_PKEY_INDEX, [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | MLX5_QP_OPTPAR_Q_KEY, [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX | MLX5_QP_OPTPAR_Q_KEY, [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_PKEY_INDEX, [MLX5_QP_ST_DCI] = 
MLX5_QP_OPTPAR_PKEY_INDEX	|
					  MLX5_QP_OPTPAR_RAE		|
					  MLX5_QP_OPTPAR_DC_KEY,
		},
	},
	/* Transitions leaving RTR. */
	[MLX5_QP_STATE_RTR] = {
		[MLX5_QP_STATE_RTS] = {
			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH	|
					  MLX5_QP_OPTPAR_RRE		|
					  MLX5_QP_OPTPAR_RAE		|
					  MLX5_QP_OPTPAR_RWE		|
					  MLX5_QP_OPTPAR_PM_STATE	|
					  MLX5_QP_OPTPAR_RNR_TIMEOUT,
			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH	|
					  MLX5_QP_OPTPAR_RWE		|
					  MLX5_QP_OPTPAR_PM_STATE,
			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY,
			[MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_DC_KEY	|
					  MLX5_QP_OPTPAR_PM_STATE	|
					  MLX5_QP_OPTPAR_RAE,
		},
	},
	/* Transitions leaving RTS. */
	[MLX5_QP_STATE_RTS] = {
		[MLX5_QP_STATE_RTS] = {
			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE		|
					  MLX5_QP_OPTPAR_RAE		|
					  MLX5_QP_OPTPAR_RWE		|
					  MLX5_QP_OPTPAR_RNR_TIMEOUT	|
					  MLX5_QP_OPTPAR_PM_STATE	|
					  MLX5_QP_OPTPAR_ALT_ADDR_PATH,
			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE		|
					  MLX5_QP_OPTPAR_PM_STATE	|
					  MLX5_QP_OPTPAR_ALT_ADDR_PATH,
			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY		|
					  MLX5_QP_OPTPAR_SRQN		|
					  MLX5_QP_OPTPAR_CQN_RCV,
			[MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_DC_KEY	|
					  MLX5_QP_OPTPAR_PM_STATE	|
					  MLX5_QP_OPTPAR_RAE,
		},
	},
	/* Transitions leaving SQER. */
	[MLX5_QP_STATE_SQER] = {
		[MLX5_QP_STATE_RTS] = {
			[MLX5_QP_ST_UD]	 = MLX5_QP_OPTPAR_Q_KEY,
			[MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY,
			[MLX5_QP_ST_UC]	 = MLX5_QP_OPTPAR_RWE,
			[MLX5_QP_ST_RC]	 = MLX5_QP_OPTPAR_RNR_TIMEOUT	|
					   MLX5_QP_OPTPAR_RWE		|
					   MLX5_QP_OPTPAR_RAE		|
					   MLX5_QP_OPTPAR_RRE,
			[MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_DC_KEY	|
					   MLX5_QP_OPTPAR_RAE,
		},
	},
	/* Transitions leaving SQD. */
	[MLX5_QP_STATE_SQD] = {
		[MLX5_QP_STATE_RTS] = {
			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY,
			[MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY,
			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE,
			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT	|
					  MLX5_QP_OPTPAR_RWE		|
					  MLX5_QP_OPTPAR_RAE		|
					  MLX5_QP_OPTPAR_RRE,
		},
	},
};

/*
 * Translate one IB_QP_* attribute bit into the MLX5_QP_OPTPAR_* bits it
 * corresponds to.  Attributes with no optional-parameter counterpart, and
 * unknown values, yield 0.
 */
static int ib_nr_to_mlx5_nr(int ib_mask)
{
	switch (ib_mask) {
	case IB_QP_STATE:
		return 0;
	case IB_QP_CUR_STATE:
		return 0;
	case IB_QP_EN_SQD_ASYNC_NOTIFY:
		return 0;
	case IB_QP_ACCESS_FLAGS:
		return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE |
			MLX5_QP_OPTPAR_RAE;
	case IB_QP_PKEY_INDEX:
		return MLX5_QP_OPTPAR_PKEY_INDEX;
	case IB_QP_PORT:
		return
MLX5_QP_OPTPAR_PRI_PORT; case IB_QP_QKEY: return MLX5_QP_OPTPAR_Q_KEY; case IB_QP_AV: return MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH | MLX5_QP_OPTPAR_PRI_PORT; case IB_QP_PATH_MTU: return 0; case IB_QP_TIMEOUT: return MLX5_QP_OPTPAR_ACK_TIMEOUT; case IB_QP_RETRY_CNT: return MLX5_QP_OPTPAR_RETRY_COUNT; case IB_QP_RNR_RETRY: return MLX5_QP_OPTPAR_RNR_RETRY; case IB_QP_RQ_PSN: return 0; case IB_QP_MAX_QP_RD_ATOMIC: return MLX5_QP_OPTPAR_SRA_MAX; case IB_QP_ALT_PATH: return MLX5_QP_OPTPAR_ALT_ADDR_PATH; case IB_QP_MIN_RNR_TIMER: return MLX5_QP_OPTPAR_RNR_TIMEOUT; case IB_QP_SQ_PSN: return 0; case IB_QP_MAX_DEST_RD_ATOMIC: return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE; case IB_QP_PATH_MIG_STATE: return MLX5_QP_OPTPAR_PM_STATE; case IB_QP_CAP: return 0; case IB_QP_DEST_QPN: return 0; } return 0; } static int ib_mask_to_mlx5_opt(int ib_mask) { int result = 0; int i; for (i = 0; i < 8 * sizeof(int); i++) { if ((1 << i) & ib_mask) result |= ib_nr_to_mlx5_nr(1 << i); } return result; } static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) { static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { [MLX5_QP_STATE_RST] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_RST2INIT_QP, }, [MLX5_QP_STATE_INIT] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_INIT2INIT_QP, [MLX5_QP_STATE_RTR] = MLX5_CMD_OP_INIT2RTR_QP, }, [MLX5_QP_STATE_RTR] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTR2RTS_QP, }, [MLX5_QP_STATE_RTS] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP, }, [MLX5_QP_STATE_SQD] = { [MLX5_QP_STATE_RST] = 
MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQD_RTS_QP, }, [MLX5_QP_STATE_SQER] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQERR2RTS_QP, }, [MLX5_QP_STATE_ERR] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, } }; struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_ib_cq *send_cq, *recv_cq; struct mlx5_qp_context *context; struct mlx5_modify_qp_mbox_in *in; struct mlx5_ib_pd *pd; enum mlx5_qp_state mlx5_cur, mlx5_new; enum mlx5_qp_optpar optpar; int sqd_event; int mlx5_st; int err; u16 op; in = kzalloc(sizeof(*in), GFP_KERNEL); if (!in) return -ENOMEM; context = &in->ctx; err = to_mlx5_st(ibqp->qp_type); if (err < 0) goto out; context->flags = cpu_to_be32(err << 16); if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); } else { switch (attr->path_mig_state) { case IB_MIG_MIGRATED: context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); break; case IB_MIG_REARM: context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11); break; case IB_MIG_ARMED: context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11); break; } } if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) { context->mtu_msgmax = (IB_MTU_256 << 5) | 8; } else if (ibqp->qp_type == IB_QPT_UD) { context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; } else if (attr_mask & IB_QP_PATH_MTU) { if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) { mlx5_ib_warn(dev, "invalid mtu %d\n", attr->path_mtu); err = -EINVAL; goto out; } context->mtu_msgmax = (attr->path_mtu << 5) | (u8)MLX5_CAP_GEN(dev->mdev, log_max_msg); } if (attr_mask & IB_QP_DEST_QPN) context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num); if (attr_mask & IB_QP_PKEY_INDEX) context->pri_path.pkey_index = cpu_to_be16(attr->pkey_index); /* todo implement counter_index 
functionality */ if (is_sqp(ibqp->qp_type)) context->pri_path.port = qp->port; if (attr_mask & IB_QP_PORT) context->pri_path.port = attr->port_num; if (attr_mask & IB_QP_AV) { err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path, attr_mask & IB_QP_PORT ? attr->port_num : qp->port, attr_mask, 0, attr, 0); if (err) goto out; } if (attr_mask & IB_QP_TIMEOUT) context->pri_path.ackto_lt |= attr->timeout << 3; if (attr_mask & IB_QP_ALT_PATH) { err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path, attr->alt_port_num, attr_mask | IB_QP_PKEY_INDEX | IB_QP_TIMEOUT, 0, attr, 1); if (err) goto out; } pd = get_pd(qp); get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, &send_cq, &recv_cq); context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0; context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0; context->params1 = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28); if (attr_mask & IB_QP_RNR_RETRY) context->params1 |= cpu_to_be32(attr->rnr_retry << 13); if (attr_mask & IB_QP_RETRY_CNT) context->params1 |= cpu_to_be32(attr->retry_cnt << 16); if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { if (attr->max_rd_atomic) context->params1 |= cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); } if (attr_mask & IB_QP_SQ_PSN) context->next_send_psn = cpu_to_be32(attr->sq_psn & 0xffffff); if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { if (attr->max_dest_rd_atomic) context->params2 |= cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); } if ((attr_mask & IB_QP_ACCESS_FLAGS) && (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) && !dev->enable_atomic_resp) { mlx5_ib_warn(dev, "atomic responder is not supported\n"); err = -EINVAL; goto out; } if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask); if (attr_mask & IB_QP_MIN_RNR_TIMER) context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); if (attr_mask & 
IB_QP_RQ_PSN) context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn & 0xffffff); if (attr_mask & IB_QP_QKEY) context->qkey = cpu_to_be32(attr->qkey); if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->db_rec_addr = cpu_to_be64(qp->db.dma); if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) sqd_event = 1; else sqd_event = 0; if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->sq_crq_size |= cpu_to_be16(1 << 4); if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num : qp->port) - 1; struct mlx5_ib_port *mibport = &dev->port[port_num]; context->qp_counter_set_usr_page |= cpu_to_be32(mibport->q_cnt_id << 24); } mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); mlx5_st = to_mlx5_st(ibqp->qp_type); if (mlx5_st < 0) goto out; if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE || !optab[mlx5_cur][mlx5_new]) return -EINVAL; op = optab[mlx5_cur][mlx5_new]; optpar = ib_mask_to_mlx5_opt(attr_mask); optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; in->optparam = cpu_to_be32(optpar); if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) err = -EOPNOTSUPP; else err = mlx5_core_qp_modify(dev->mdev, op, in, sqd_event, &qp->mqp); if (err) goto out; qp->state = new_state; if (attr_mask & IB_QP_ACCESS_FLAGS) qp->atomic_rd_en = attr->qp_access_flags; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) qp->resp_depth = attr->max_dest_rd_atomic; if (attr_mask & IB_QP_PORT) qp->port = attr->port_num; if (attr_mask & IB_QP_ALT_PATH) qp->alt_port = attr->alt_port_num; /* * If we moved a kernel QP to RESET, clean up all old CQ * entries and reinitialize the QP. */ if (new_state == IB_QPS_RESET && !ibqp->uobject) { mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, ibqp->srq ? 
to_msrq(ibqp->srq) : NULL); if (send_cq != recv_cq) mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); qp->rq.head = 0; qp->rq.tail = 0; qp->sq.head = 0; qp->sq.tail = 0; qp->sq.cur_post = 0; qp->sq.last_poll = 0; if (qp->db.db) { qp->db.db[MLX5_RCV_DBR] = 0; qp->db.db[MLX5_SND_DBR] = 0; } } out: kfree(in); return err; } static int ignored_ts_check(enum ib_qp_type qp_type) { return 0; } int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); enum ib_qp_state cur_state, new_state; int err = -EINVAL; int port; mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; if (!ignored_ts_check(ibqp->qp_type) && !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) goto out; if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) goto out; if (attr_mask & IB_QP_PKEY_INDEX) { port = attr_mask & IB_QP_PORT ? 
attr->port_num : qp->port;
		/* The pkey index must fit the table of the port the QP will use. */
		if (attr->pkey_index >=
		    dev->mdev->port_caps[port - 1].pkey_table_len)
			goto out;
	}

	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
	    attr->max_rd_atomic >
	    (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp)))
		goto out;

	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
	    attr->max_dest_rd_atomic >
	    (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp)))
		goto out;

	/* RESET -> RESET is a no-op that succeeds without touching firmware. */
	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
		err = 0;
		goto out;
	}

	err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);

out:
	mutex_unlock(&qp->mutex);
	return err;
}

/*
 * Return non-zero when posting @nreq more requests would reach
 * wq->max_post.  The cheap unlocked check is repeated with the CQ lock held
 * before an overflow is reported.
 */
static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
{
	struct mlx5_ib_cq *cq;
	unsigned cur;

	cur = wq->head - wq->tail;
	if (likely(cur + nreq < wq->max_post))
		return 0;

	cq = to_mcq(ib_cq);
	spin_lock(&cq->lock);
	cur = wq->head - wq->tail;
	spin_unlock(&cq->lock);

	return cur + nreq >= wq->max_post;
}

/* Fill a remote-address segment with big-endian address and rkey. */
static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg,
					  u64 remote_addr, u32 rkey)
{
	rseg->raddr    = cpu_to_be64(remote_addr);
	rseg->rkey     = cpu_to_be32(rkey);
	rseg->reserved = 0;
}

/*
 * Fill a datagram segment: copy the address vector of the WR's address
 * handle, then set the destination QPN (with MLX5_EXTENDED_UD_AV) and qkey.
 */
static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg,
			     struct ib_send_wr *wr)
{
	memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof(struct mlx5_av));
	dseg->av.dqp_dct = cpu_to_be32(wr->wr.ud.remote_qpn | MLX5_EXTENDED_UD_AV);
	dseg->av.key.qkey.qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
}

/* Fill a data-pointer segment from one scatter/gather entry (big-endian). */
static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg)
{
	dseg->byte_count = cpu_to_be32(sg->length);
	dseg->lkey       = cpu_to_be32(sg->lkey);
	dseg->addr       = cpu_to_be64(sg->addr);
}

/* Round @npages up to a multiple of 8 and return half of that, big-endian. */
static __be16 get_klm_octo(int npages)
{
	return cpu_to_be16(ALIGN(npages, 8) / 2);
}

/* Big-endian mask of the mkey fields a fast-register UMR updates. */
static __be64 frwr_mkey_mask(void)
{
	u64 result;

	result = MLX5_MKEY_MASK_LEN		|
		MLX5_MKEY_MASK_PAGE_SIZE	|
		MLX5_MKEY_MASK_START_ADDR	|
		MLX5_MKEY_MASK_EN_RINVAL	|
		MLX5_MKEY_MASK_KEY		|
		MLX5_MKEY_MASK_LR		|
		MLX5_MKEY_MASK_LW		|
		MLX5_MKEY_MASK_RR		|
		MLX5_MKEY_MASK_RW		|
		MLX5_MKEY_MASK_A		|
		MLX5_MKEY_MASK_SMALL_FENCE	|
		MLX5_MKEY_MASK_FREE;

	return cpu_to_be64(result);
}

/*
 * Fill the UMR control segment.  For a local invalidate (@li non-zero) only
 * the FREE mask and flag bit 7 are set; for a fast register the segment
 * carries the KLM size and the full frwr mkey mask.
 */
static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
				 struct ib_send_wr *wr, int li)
{
	memset(umr, 0, sizeof(*umr));

	if (li) {
		umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE);
		umr->flags = 1 << 7;
		return;
	}

	umr->flags = (1 << 5); /* fail if not free */
	umr->klm_octowords = get_klm_octo(wr->wr.fast_reg.page_list_len);
	umr->mkey_mask = frwr_mkey_mask();
}

/*
 * Convert IB access flags into MLX5_PERM_* bits.  Local read and UMR enable
 * are always granted.
 */
static u8 get_umr_flags(int acc)
{
	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC       : 0) |
	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX5_PERM_REMOTE_WRITE : 0) |
	       (acc & IB_ACCESS_REMOTE_READ   ? MLX5_PERM_REMOTE_READ  : 0) |
	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX5_PERM_LOCAL_WRITE  : 0) |
		MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN;
}

/*
 * Fill the mkey segment of a fast-register or local-invalidate WQE.  For a
 * local invalidate the key is simply marked free.  Otherwise *@writ is set
 * non-zero when the region is writable, for use when building the page list.
 */
static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr,
			     int li, int *writ)
{
	memset(seg, 0, sizeof(*seg));
	if (li) {
		seg->status = MLX5_MKEY_STATUS_FREE;
		return;
	}

	seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags) |
		     MLX5_ACCESS_MODE_MTT;
	/*
	 * NOTE(review): seg->flags holds MLX5_PERM_* bits, yet the mask below
	 * mixes in IB_ACCESS_REMOTE_WRITE -- confirm MLX5_PERM_REMOTE_WRITE
	 * was not intended.
	 */
	*writ = seg->flags & (MLX5_PERM_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE);
	seg->qpn_mkey7_0 = cpu_to_be32((wr->wr.fast_reg.rkey & 0xff) | 0xffffff00);
	seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL);
	seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
	seg->len = cpu_to_be64(wr->wr.fast_reg.length);
	seg->xlt_oct_size = cpu_to_be32((wr->wr.fast_reg.page_list_len + 1) / 2);
	seg->log2_page_size = wr->wr.fast_reg.page_shift;
}

/*
 * Write the fast-register page list into its mapped buffer, OR-ing the
 * read/write enable bits into every entry, and point the data segment at it.
 */
static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg,
			   struct ib_send_wr *wr,
			   struct mlx5_core_dev *mdev,
			   struct mlx5_ib_pd *pd,
			   int writ)
{
	struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list);
	u64 *page_list = wr->wr.fast_reg.page_list->page_list;
	u64 perm = MLX5_EN_RD | (writ ?
MLX5_EN_WR : 0);
	int i;

	for (i = 0; i < wr->wr.fast_reg.page_list_len; i++)
		mfrpl->mapped_page_list[i] = cpu_to_be64(page_list[i] | perm);
	dseg->addr = cpu_to_be64(mfrpl->map);
	/* The byte count covers the list rounded up to a multiple of 64 bytes. */
	dseg->byte_count = cpu_to_be32(ALIGN(sizeof(u64) * wr->wr.fast_reg.page_list_len, 64));
	dseg->lkey = cpu_to_be32(pd->pa_lkey);
}

/*
 * Return the immediate/invalidate field for the control segment: the
 * immediate data as given, the rkey to invalidate converted to big-endian,
 * or 0 for opcodes that carry neither.
 */
static __be32 send_ieth(struct ib_send_wr *wr)
{
	switch (wr->opcode) {
	case IB_WR_SEND_WITH_IMM:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		return wr->ex.imm_data;

	case IB_WR_SEND_WITH_INV:
		return cpu_to_be32(wr->ex.invalidate_rkey);

	default:
		return 0;
	}
}

/* XOR all @size bytes of @wqe together and return the complement. */
static u8 calc_sig(void *wqe, int size)
{
	u8 *p = wqe;
	u8 res = 0;
	int i;

	for (i = 0; i < size; i++)
		res ^= p[i];

	return ~res;
}

/*
 * Signature over a whole WQE.  The length is the low 6 bits of the byte at
 * offset 8, multiplied by 16.
 */
static u8 calc_wq_sig(void *wqe)
{
	return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4);
}

/*
 * Copy the WR's scatter/gather data inline into the WQE starting at @wqe,
 * wrapping to the start of the send queue when its end is reached.  On
 * success *@sz receives the segment size in 16-byte units.  Returns -ENOMEM
 * when the data exceeds qp->max_inline_data.
 */
static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr,
			    void *wqe, int *sz)
{
	struct mlx5_wqe_inline_seg *seg;
	void *qend = qp->sq.qend;
	void *addr;
	int inl = 0;
	int copy;
	int len;
	int i;

	seg = wqe;
	wqe += sizeof(*seg);
	for (i = 0; i < wr->num_sge; i++) {
		addr = (void *)(uintptr_t)(wr->sg_list[i].addr);
		len  = wr->sg_list[i].length;
		inl += len;

		if (unlikely(inl > qp->max_inline_data))
			return -ENOMEM;

		/* Split the copy at the end of the queue buffer and continue from its start. */
		if (unlikely(wqe + len > qend)) {
			copy = (int)(qend - wqe);
			memcpy(wqe, addr, copy);
			addr += copy;
			len -= copy;
			wqe = mlx5_get_send_wqe(qp, 0);
		}
		memcpy(wqe, addr, len);
		wqe += len;
	}

	seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG);

	*sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16;

	return 0;
}

/*
 * Build the segments of a fast-register or local-invalidate work request:
 * UMR control segment, mkey segment and, for a fast register only, the page
 * list data segment.  *@seg and *@size are advanced past what was written,
 * wrapping *@seg at the end of the send queue.  Returns -EINVAL for inline
 * sends and -ENOMEM when the page list is longer than its allocation.
 */
static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size,
			  struct mlx5_core_dev *mdev, struct mlx5_ib_pd *pd, struct mlx5_ib_qp *qp)
{
	int writ = 0;
	int li;

	li = wr->opcode == IB_WR_LOCAL_INV ? 1 : 0;
	if (unlikely(wr->send_flags & IB_SEND_INLINE))
		return -EINVAL;

	set_frwr_umr_segment(*seg, wr, li);
	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
	if (unlikely((*seg == qp->sq.qend)))
		*seg = mlx5_get_send_wqe(qp, 0);
	set_mkey_segment(*seg, wr, li, &writ);
	*seg += sizeof(struct mlx5_mkey_seg);
	*size += sizeof(struct mlx5_mkey_seg) / 16;
	if (unlikely((*seg == qp->sq.qend)))
		*seg = mlx5_get_send_wqe(qp, 0);
	if (!li) {
		if (unlikely(wr->wr.fast_reg.page_list_len >
			     wr->wr.fast_reg.page_list->max_page_list_len))
			return	-ENOMEM;

		set_frwr_pages(*seg, wr, mdev, pd, writ);
		*seg += sizeof(struct mlx5_wqe_data_seg);
		*size += (sizeof(struct mlx5_wqe_data_seg) / 16);
	}
	return 0;
}

/*
 * Debug helper: print @size_16 16-byte units of the WQE that starts at send
 * queue index @idx, four 32-bit words per line.
 */
static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16)
{
	__be32 *p = NULL;
	int tidx = idx;
	int i, j;

	pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx));
	for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) {
		/* Every 16 words move on to the next queue entry, wrapping at wqe_cnt. */
		if ((i & 0xf) == 0) {
			void *buf = mlx5_get_send_wqe(qp, tidx);
			tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1);
			p = buf;
			j = 0;
		}
		pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]),
			 be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]),
			 be32_to_cpu(p[j + 3]));
	}
}

/*
 * Copy @bytecnt bytes of WQE data to the I/O destination @dst, 64 bytes per
 * loop iteration, wrapping the source at the end of the send queue.
 *
 * NOTE(review): bytecnt is unsigned and decremented by 64, so the loop only
 * terminates when it is a multiple of 64 -- confirm callers guarantee that.
 */
static void mlx5_bf_copy(u64 __iomem *dst, u64 *src,
			 unsigned bytecnt, struct mlx5_ib_qp *qp)
{
	while (bytecnt > 0) {
		__iowrite64_copy(dst++, src++, 8);
		__iowrite64_copy(dst++, src++, 8);
		__iowrite64_copy(dst++, src++, 8);
		__iowrite64_copy(dst++, src++, 8);
		__iowrite64_copy(dst++, src++, 8);
		__iowrite64_copy(dst++, src++, 8);
		__iowrite64_copy(dst++, src++, 8);
		__iowrite64_copy(dst++, src++, 8);
		bytecnt -= 64;
		if (unlikely(src == qp->sq.qend))
			src = mlx5_get_send_wqe(qp, 0);
	}
}

/*
 * Choose the fence mode for a WQE: strong ordering for a fenced local
 * invalidate; otherwise, when a fence is pending from the previous WR
 * (@fence), either that fence or the combined small-and-fence mode if this
 * WR also asks for a fence; 0 when no fence is pending.
 */
static u8 get_fence(u8 fence, struct ib_send_wr *wr)
{
	if (unlikely(wr->opcode == IB_WR_LOCAL_INV &&
		     wr->send_flags & IB_SEND_FENCE))
		return MLX5_FENCE_MODE_STRONG_ORDERING;

	if (unlikely(fence)) {
		if (wr->send_flags & IB_SEND_FENCE)
			return MLX5_FENCE_MODE_SMALL_AND_FENCE;
		else
			return fence;

	} else {
		return 0;
	}
}

/*
 * Start a new send WQE: check for queue overflow, locate the WQE slot for
 * the current post index and initialise its control segment.  On return
 * *@seg points just past the control segment and *@size holds its size in
 * 16-byte units.  Returns -ENOMEM when the send queue is full.
 */
static int begin_wqe(struct mlx5_ib_qp *qp, void **seg,
		     struct mlx5_wqe_ctrl_seg **ctrl,
		     struct ib_send_wr *wr, unsigned *idx,
		     int *size, int nreq)
{
	int err = 0;

	if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) {
		mlx5_ib_warn(to_mdev(qp->ibqp.device), "work queue overflow\n");
		err = -ENOMEM;
		return err;
	}

	*idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
	*seg = mlx5_get_send_wqe(qp, *idx);
	*ctrl = *seg;
	/* Clear the 32-bit word at offset 8 of the control segment. */
	*(u32 *)(*seg + 8) = 0;
	(*ctrl)->imm = send_ieth(wr);
	(*ctrl)->fm_ce_se = qp->sq_signal_bits |
		(wr->send_flags & IB_SEND_SIGNALED ?
		 MLX5_WQE_CTRL_CQ_UPDATE : 0) |
		(wr->send_flags & IB_SEND_SOLICITED ?
		 MLX5_WQE_CTRL_SOLICITED : 0);

	*seg += sizeof(**ctrl);
	*size = sizeof(**ctrl) / 16;

	return err;
}

/*
 * Complete a send WQE: write opcode, index, QPN and size into the control
 * segment, apply the fence, sign the WQE when signatures are enabled, record
 * the bookkeeping for the completion path and advance the post index by the
 * number of basic blocks the WQE occupies.
 */
static void finish_wqe(struct mlx5_ib_qp *qp,
		       struct mlx5_wqe_ctrl_seg *ctrl,
		       u8 size, unsigned idx,
		       struct ib_send_wr *wr,
		       int nreq, u8 fence, u8 next_fence,
		       u32 mlx5_opcode)
{
	u8 opmod = 0;

	ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) |
					     mlx5_opcode | ((u32)opmod << 24));
	ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8));
	ctrl->fm_ce_se |= fence;
	qp->fm_cache = next_fence;
	if (unlikely(qp->wq_sig))
		ctrl->signature = calc_wq_sig(ctrl);

	qp->sq.swr_ctx[idx].wrid = wr->wr_id;
	qp->sq.swr_ctx[idx].w_list.opcode = mlx5_opcode;
	qp->sq.swr_ctx[idx].wqe_head = qp->sq.head + nreq;
	qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
	qp->sq.swr_ctx[idx].w_list.next = qp->sq.cur_post;
	qp->sq.swr_ctx[idx].sig_piped = 0;
}

/* Verbs entry point for posting send work requests; the body continues beyond this chunk. */
int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
		      struct ib_send_wr **bad_wr)
{
	struct mlx5_wqe_ctrl_seg *ctrl = NULL;  /* compiler warning */
	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
	struct mlx5_core_dev *mdev = dev->mdev;
	struct mlx5_ib_qp *qp = to_mqp(ibqp);
	struct mlx5_wqe_data_seg *dpseg;
	struct mlx5_wqe_xrc_seg *xrc;
	struct mlx5_bf *bf = qp->bf;
	int uninitialized_var(size);
	void *qend = qp->sq.qend;
	unsigned long flags;
	unsigned idx;
	int err = 0;
	int inl = 0;
	int num_sge;
	void *seg;
	int
nreq; int i; u8 next_fence = 0; u8 fence; spin_lock_irqsave(&qp->sq.lock, flags); if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { err = -EIO; *bad_wr = wr; nreq = 0; goto out; } for (nreq = 0; wr; nreq++, wr = wr->next) { if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { mlx5_ib_warn(dev, "Invalid opcode 0x%x\n", wr->opcode); err = -EINVAL; *bad_wr = wr; goto out; } fence = qp->fm_cache; num_sge = wr->num_sge; if (unlikely(num_sge > qp->sq.max_gs)) { mlx5_ib_warn(dev, "Max gs exceeded %d (max = %d)\n", wr->num_sge, qp->sq.max_gs); err = -ENOMEM; *bad_wr = wr; goto out; } err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq); if (err) { mlx5_ib_warn(dev, "Failed to prepare WQE\n"); err = -ENOMEM; *bad_wr = wr; goto out; } switch (ibqp->qp_type) { case IB_QPT_XRC_INI: xrc = seg; xrc->xrc_srqn = htonl(wr->xrc_remote_srq_num); seg += sizeof(*xrc); size += sizeof(*xrc) / 16; /* fall through */ case IB_QPT_RC: switch (wr->opcode) { case IB_WR_RDMA_READ: case IB_WR_RDMA_WRITE: case IB_WR_RDMA_WRITE_WITH_IMM: set_raddr_seg(seg, wr->wr.rdma.remote_addr, wr->wr.rdma.rkey); seg += sizeof(struct mlx5_wqe_raddr_seg); size += sizeof(struct mlx5_wqe_raddr_seg) / 16; break; case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); err = -ENOSYS; *bad_wr = wr; goto out; case IB_WR_LOCAL_INV: next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; qp->sq.swr_ctx[idx].wr_data = IB_WR_LOCAL_INV; ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey); err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); if (err) { mlx5_ib_warn(dev, "Failed to prepare LOCAL_INV WQE\n"); *bad_wr = wr; goto out; } num_sge = 0; break; case IB_WR_FAST_REG_MR: next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; qp->sq.swr_ctx[idx].wr_data = IB_WR_FAST_REG_MR; ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey); err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); if (err) { 
mlx5_ib_warn(dev, "Failed to prepare FAST_REG_MR WQE\n"); *bad_wr = wr; goto out; } num_sge = 0; break; default: break; } break; case IB_QPT_UC: switch (wr->opcode) { case IB_WR_RDMA_WRITE: case IB_WR_RDMA_WRITE_WITH_IMM: set_raddr_seg(seg, wr->wr.rdma.remote_addr, wr->wr.rdma.rkey); seg += sizeof(struct mlx5_wqe_raddr_seg); size += sizeof(struct mlx5_wqe_raddr_seg) / 16; break; default: break; } break; case IB_QPT_SMI: if (!mlx5_core_is_pf(mdev)) { err = -EINVAL; mlx5_ib_warn(dev, "Only physical function is allowed to send SMP MADs\n"); *bad_wr = wr; goto out; } case IB_QPT_GSI: case IB_QPT_UD: set_datagram_seg(seg, wr); seg += sizeof(struct mlx5_wqe_datagram_seg); size += sizeof(struct mlx5_wqe_datagram_seg) / 16; if (unlikely((seg == qend))) seg = mlx5_get_send_wqe(qp, 0); break; default: break; } if (wr->send_flags & IB_SEND_INLINE && num_sge) { int uninitialized_var(sz); err = set_data_inl_seg(qp, wr, seg, &sz); if (unlikely(err)) { mlx5_ib_warn(dev, "Failed to prepare inline data segment\n"); *bad_wr = wr; goto out; } inl = 1; size += sz; } else { dpseg = seg; for (i = 0; i < num_sge; i++) { if (unlikely(dpseg == qend)) { seg = mlx5_get_send_wqe(qp, 0); dpseg = seg; } if (likely(wr->sg_list[i].length)) { set_data_ptr_seg(dpseg, wr->sg_list + i); size += sizeof(struct mlx5_wqe_data_seg) / 16; dpseg++; } } } finish_wqe(qp, ctrl, size, idx, wr, nreq, get_fence(fence, wr), next_fence, mlx5_ib_opcode[wr->opcode]); if (0) dump_wqe(qp, idx, size); } out: if (likely(nreq)) { qp->sq.head += nreq; /* Make sure that descriptors are written before * updating doorbell record and ringing the doorbell */ wmb(); qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); /* Make sure doorbell record is visible to the HCA before * we hit doorbell */ wmb(); if (bf->need_lock) spin_lock(&bf->lock); else __acquire(&bf->lock); /* TBD enable WC */ if (BF_ENABLE && nreq == 1 && bf->uuarn && inl && size > 1 && size <= bf->buf_size / 16) { mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, 
ALIGN(size * 16, 64), qp); /* wc_wmb(); */ } else { mlx5_write64((__be32 *)ctrl, bf->regreg + bf->offset, MLX5_GET_DOORBELL_LOCK(&bf->lock32)); /* Make sure doorbells don't leak out of SQ spinlock * and reach the HCA out of order. */ mmiowb(); } bf->offset ^= bf->buf_size; if (bf->need_lock) spin_unlock(&bf->lock); else __release(&bf->lock); } spin_unlock_irqrestore(&qp->sq.lock, flags); return err; } static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size) { sig->signature = calc_sig(sig, size); } int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr) { struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_wqe_data_seg *scat; struct mlx5_rwqe_sig *sig; struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_core_dev *mdev = dev->mdev; unsigned long flags; int err = 0; int nreq; int ind; int i; spin_lock_irqsave(&qp->rq.lock, flags); if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { err = -EIO; *bad_wr = wr; nreq = 0; goto out; } ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; nreq++, wr = wr->next) { if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { err = -ENOMEM; *bad_wr = wr; goto out; } if (unlikely(wr->num_sge > qp->rq.max_gs)) { err = -EINVAL; *bad_wr = wr; goto out; } scat = get_recv_wqe(qp, ind); if (qp->wq_sig) scat++; for (i = 0; i < wr->num_sge; i++) set_data_ptr_seg(scat + i, wr->sg_list + i); if (i < qp->rq.max_gs) { scat[i].byte_count = 0; scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); scat[i].addr = 0; } if (qp->wq_sig) { sig = (struct mlx5_rwqe_sig *)scat; set_sig_seg(sig, (qp->rq.max_gs + 1) << 2); } qp->rq.rwr_ctx[ind].wrid = wr->wr_id; ind = (ind + 1) & (qp->rq.wqe_cnt - 1); } out: if (likely(nreq)) { qp->rq.head += nreq; /* Make sure that descriptors are written before * doorbell record. 
*/ wmb(); *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); } spin_unlock_irqrestore(&qp->rq.lock, flags); return err; } static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state) { switch (mlx5_state) { case MLX5_QP_STATE_RST: return IB_QPS_RESET; case MLX5_QP_STATE_INIT: return IB_QPS_INIT; case MLX5_QP_STATE_RTR: return IB_QPS_RTR; case MLX5_QP_STATE_RTS: return IB_QPS_RTS; case MLX5_QP_STATE_SQ_DRAINING: case MLX5_QP_STATE_SQD: return IB_QPS_SQD; case MLX5_QP_STATE_SQER: return IB_QPS_SQE; case MLX5_QP_STATE_ERR: return IB_QPS_ERR; default: return -1; } } static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state) { switch (mlx5_mig_state) { case MLX5_QP_PM_ARMED: return IB_MIG_ARMED; case MLX5_QP_PM_REARM: return IB_MIG_REARM; case MLX5_QP_PM_MIGRATED: return IB_MIG_MIGRATED; default: return -1; } } static int to_ib_qp_access_flags(int mlx5_flags) { int ib_flags = 0; if (mlx5_flags & MLX5_QP_BIT_RRE) ib_flags |= IB_ACCESS_REMOTE_READ; if (mlx5_flags & MLX5_QP_BIT_RWE) ib_flags |= IB_ACCESS_REMOTE_WRITE; if (mlx5_flags & MLX5_QP_BIT_RAE) ib_flags |= IB_ACCESS_REMOTE_ATOMIC; return ib_flags; } static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr, struct mlx5_qp_path *path) { struct mlx5_core_dev *dev = ibdev->mdev; memset(ib_ah_attr, 0, sizeof(*ib_ah_attr)); ib_ah_attr->port_num = path->port; if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > MLX5_CAP_GEN(dev, num_ports)) return; ib_ah_attr->sl = path->dci_cfi_prio_sl & 0xf; ib_ah_attr->dlid = be16_to_cpu(path->rlid); ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f; ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; ib_ah_attr->ah_flags = (path->grh_mlid & (1 << 7)) ? 
IB_AH_GRH : 0; if (ib_ah_attr->ah_flags) { ib_ah_attr->grh.sgid_index = path->mgid_index; ib_ah_attr->grh.hop_limit = path->hop_limit; ib_ah_attr->grh.traffic_class = (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff; ib_ah_attr->grh.flow_label = be32_to_cpu(path->tclass_flowlabel) & 0xfffff; memcpy(ib_ah_attr->grh.dgid.raw, path->rgid, sizeof(ib_ah_attr->grh.dgid.raw)); } } int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_query_qp_mbox_out *outb; struct mlx5_qp_context *context; int mlx5_state; int err = 0; mutex_lock(&qp->mutex); if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { err = -EOPNOTSUPP; goto out; } else { outb = kzalloc(sizeof(*outb), GFP_KERNEL); if (!outb) { err = -ENOMEM; goto out; } context = &outb->ctx; err = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb)); if (err) { kfree(outb); goto out; } mlx5_state = be32_to_cpu(context->flags) >> 28; qp->state = to_ib_qp_state(mlx5_state); qp_attr->path_mtu = context->mtu_msgmax >> 5; qp_attr->path_mig_state = to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); qp_attr->qkey = be32_to_cpu(context->qkey); qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; qp_attr->dest_qp_num = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff; qp_attr->qp_access_flags = to_ib_qp_access_flags(be32_to_cpu(context->params2)); if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); qp_attr->alt_pkey_index = be16_to_cpu(context->alt_path.pkey_index); qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; } qp_attr->pkey_index = be16_to_cpu(context->pri_path.pkey_index); qp_attr->port_num = context->pri_path.port; /* 
qp_attr->en_sqd_async_notify is only applicable in modify qp */ qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING; qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); qp_attr->max_dest_rd_atomic = 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); qp_attr->min_rnr_timer = (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; qp_attr->timeout = context->pri_path.ackto_lt >> 3; qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; kfree(outb); } qp_attr->qp_state = qp->state; qp_attr->cur_qp_state = qp_attr->qp_state; qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; qp_attr->cap.max_recv_sge = qp->rq.max_gs; if (!ibqp->uobject) { qp_attr->cap.max_send_wr = qp->sq.max_post; qp_attr->cap.max_send_sge = qp->sq.max_gs; qp_init_attr->qp_context = ibqp->qp_context; } else { qp_attr->cap.max_send_wr = 0; qp_attr->cap.max_send_sge = 0; } qp_init_attr->qp_type = ibqp->qp_type; qp_init_attr->recv_cq = ibqp->recv_cq; qp_init_attr->send_cq = ibqp->send_cq; qp_init_attr->srq = ibqp->srq; qp_attr->cap.max_inline_data = qp->max_inline_data; qp_init_attr->cap = qp_attr->cap; qp_init_attr->create_flags = 0; if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? 
IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; out: mutex_unlock(&qp->mutex); return err; } struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_xrcd *xrcd; int err; if (!MLX5_CAP_GEN(dev->mdev, xrc)) return ERR_PTR(-ENOSYS); xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL); if (!xrcd) return ERR_PTR(-ENOMEM); err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn); if (err) { kfree(xrcd); return ERR_PTR(-ENOMEM); } return &xrcd->ibxrcd; } int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) { struct mlx5_ib_dev *dev = to_mdev(xrcd->device); u32 xrcdn = to_mxrcd(xrcd)->xrcdn; int err; err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn); if (err) { mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn); return err; } kfree(xrcd); return 0; } Index: stable/10/sys/ofed/drivers/infiniband/core/cma.c =================================================================== --- stable/10/sys/ofed/drivers/infiniband/core/cma.c (revision 325610) +++ stable/10/sys/ofed/drivers/infiniband/core/cma.c (revision 325611) @@ -1,3519 +1,3499 @@ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #define LINUXKPI_PARAM_PREFIX ibcore_ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); static int tavor_quirk = 0; module_param_named(tavor_quirk, tavor_quirk, int, 0644); MODULE_PARM_DESC(tavor_quirk, "Tavor performance quirk: limit MTU to 1K if > 0"); int unify_tcp_port_space = 1; module_param(unify_tcp_port_space, int, 0644); MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port " "space allocation (default=1)"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define IBOE_PACKET_LIFETIME 18 static int cma_response_timeout = CMA_CM_RESPONSE_TIMEOUT; module_param_named(cma_response_timeout, cma_response_timeout, int, 0644); MODULE_PARM_DESC(cma_response_timeout, "CMA_CM_RESPONSE_TIMEOUT default=20"); static int def_prec2sl = 3; module_param_named(def_prec2sl, def_prec2sl, int, 0644); MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. 
Valid values 0 - 7"); static void cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device); static struct ib_client cma_client = { .name = "cma", .add = cma_add_one, .remove = cma_remove_one }; static struct ib_sa_client sa_client; static struct rdma_addr_client addr_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct workqueue_struct *cma_wq; static DEFINE_IDR(sdp_ps); static DEFINE_IDR(tcp_ps); static DEFINE_IDR(udp_ps); static DEFINE_IDR(ipoib_ps); #if defined(INET) static int next_port; #endif struct cma_device { struct list_head list; struct ib_device *device; struct completion comp; atomic_t refcount; struct list_head id_list; }; enum cma_state { CMA_IDLE, CMA_ADDR_QUERY, CMA_ADDR_RESOLVED, CMA_ROUTE_QUERY, CMA_ROUTE_RESOLVED, CMA_CONNECT, CMA_DISCONNECT, CMA_ADDR_BOUND, CMA_LISTEN, CMA_DEVICE_REMOVAL, CMA_DESTROYING }; struct rdma_bind_list { struct idr *ps; struct hlist_head owners; unsigned short port; }; /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. * We do this by disabling removal notification while a callback is in process, * and reporting it after the callback completes. 
*/ struct rdma_id_private { struct rdma_cm_id id; struct rdma_bind_list *bind_list; struct socket *sock; struct hlist_node node; struct list_head list; /* listen_any_list or cma_device.list */ struct list_head listen_list; /* per device listens */ struct cma_device *cma_dev; struct list_head mc_list; int internal_id; enum cma_state state; spinlock_t lock; struct mutex qp_mutex; struct completion comp; atomic_t refcount; struct mutex handler_mutex; int backlog; int timeout_ms; struct ib_sa_query *query; int query_id; union { struct ib_cm_id *ib; struct iw_cm_id *iw; } cm_id; u32 seq_num; u32 qkey; u32 qp_num; u8 srq; u8 tos; + int unify_ps_tcp; }; struct cma_multicast { struct rdma_id_private *id_priv; union { struct ib_sa_multicast *ib; } multicast; struct list_head list; void *context; struct sockaddr_storage addr; struct kref mcref; }; struct cma_work { struct work_struct work; struct rdma_id_private *id; enum cma_state old_state; enum cma_state new_state; struct rdma_cm_event event; }; struct cma_ndev_work { struct work_struct work; struct rdma_id_private *id; struct rdma_cm_event event; }; struct iboe_mcast_work { struct work_struct work; struct rdma_id_private *id; struct cma_multicast *mc; }; union cma_ip_addr { struct in6_addr ip6; struct { __be32 pad[3]; __be32 addr; } ip4; }; struct cma_hdr { u8 cma_version; u8 ip_version; /* IP version: 7:4 */ __be16 port; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; struct sdp_hh { u8 bsdh[16]; u8 sdp_version; /* Major version: 7:4 */ u8 ip_version; /* IP version: 7:4 */ u8 sdp_specific1[10]; __be16 port; __be16 sdp_specific2; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; struct sdp_hah { u8 bsdh[16]; u8 sdp_version; }; #define CMA_VERSION 0x00 #define SDP_MAJ_VERSION 0x2 static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp) { unsigned long flags; int ret; spin_lock_irqsave(&id_priv->lock, flags); ret = (id_priv->state == comp); spin_unlock_irqrestore(&id_priv->lock, flags); 
return ret; } static int cma_comp_exch(struct rdma_id_private *id_priv, enum cma_state comp, enum cma_state exch) { unsigned long flags; int ret; spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static enum cma_state cma_exch(struct rdma_id_private *id_priv, enum cma_state exch) { unsigned long flags; enum cma_state old; spin_lock_irqsave(&id_priv->lock, flags); old = id_priv->state; id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return old; } static inline u8 cma_get_ip_ver(struct cma_hdr *hdr) { return hdr->ip_version >> 4; } static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) { hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } static inline u8 sdp_get_majv(u8 sdp_version) { return sdp_version >> 4; } static inline u8 sdp_get_ip_ver(struct sdp_hh *hh) { return hh->ip_version >> 4; } static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver) { hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF); } static inline int cma_is_ud_ps(enum rdma_port_space ps) { return (ps == RDMA_PS_UDP || ps == RDMA_PS_IPOIB); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { atomic_inc(&cma_dev->refcount); id_priv->cma_dev = cma_dev; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); } static inline void cma_deref_dev(struct cma_device *cma_dev) { if (atomic_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); } static inline void release_mc(struct kref *kref) { struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); kfree(mc->multicast.ib); kfree(mc); } static void cma_detach_from_dev(struct rdma_id_private *id_priv) { list_del(&id_priv->list); cma_deref_dev(id_priv->cma_dev); id_priv->cma_dev = NULL; } static int 
cma_set_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; if (id_priv->qkey) return 0; switch (id_priv->id.ps) { case RDMA_PS_UDP: id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (!ret) id_priv->qkey = be32_to_cpu(rec.qkey); break; default: break; } return ret; } static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num) { int i; int err; struct ib_port_attr props; union ib_gid tmp; err = ib_query_port(device, port_num, &props); if (err) return 1; for (i = 0; i < props.gid_tbl_len; ++i) { err = ib_query_gid(device, port_num, i, &tmp); if (err) return 1; if (!memcmp(&tmp, gid, sizeof tmp)) return 0; } return -EAGAIN; } int rdma_find_cmid_laddr(struct sockaddr_in *local_addr, unsigned short dev_type, void **cm_id) { int ret; u8 port; int found_dev = 0, found_cmid = 0; struct rdma_id_private *id_priv; struct rdma_id_private *dev_id_priv; struct cma_device *cma_dev; struct rdma_dev_addr dev_addr; union ib_gid gid; enum rdma_link_layer dev_ll = dev_type == ARPHRD_INFINIBAND ? 
IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; memset(&dev_addr, 0, sizeof(dev_addr)); ret = rdma_translate_ip((struct sockaddr *)local_addr, &dev_addr); if (ret) goto err; /* find rdma device based on MAC address/gid */ mutex_lock(&lock); memcpy(&gid, dev_addr.src_dev_addr + rdma_addr_gid_offset(&dev_addr), sizeof(gid)); list_for_each_entry(cma_dev, &dev_list, list) for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) if ((rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) && (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IWARP)) { ret = find_gid_port(cma_dev->device, &gid, port); if (!ret) { found_dev = 1; goto out; } else if (ret == 1) { mutex_unlock(&lock); goto err; } } out: mutex_unlock(&lock); if (!found_dev) goto err; /* Traverse through the list of listening cm_id's to find the * desired cm_id based on rdma device & port number. */ list_for_each_entry(id_priv, &listen_any_list, list) list_for_each_entry(dev_id_priv, &id_priv->listen_list, listen_list) if (dev_id_priv->cma_dev == cma_dev) if (dev_id_priv->cm_id.iw->local_addr.sin_port == local_addr->sin_port) { *cm_id = (void *)dev_id_priv->cm_id.iw; found_cmid = 1; } return found_cmid ? 
0 : -ENODEV; err: return -ENODEV; } EXPORT_SYMBOL(rdma_find_cmid_laddr); static int cma_acquire_dev(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct cma_device *cma_dev; union ib_gid gid; int ret = -ENODEV; if (dev_addr->dev_type != ARPHRD_INFINIBAND) { iboe_addr_get_sgid(dev_addr, &gid); list_for_each_entry(cma_dev, &dev_list, list) { ret = ib_find_cached_gid(cma_dev->device, &gid, &id_priv->id.port_num, NULL); if (!ret) goto out; } } memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof gid); list_for_each_entry(cma_dev, &dev_list, list) { ret = ib_find_cached_gid(cma_dev->device, &gid, &id_priv->id.port_num, NULL); if (!ret) break; } out: if (!ret) cma_attach_to_dev(id_priv, cma_dev); return ret; } static void cma_deref_id(struct rdma_id_private *id_priv) { if (atomic_dec_and_test(&id_priv->refcount)) complete(&id_priv->comp); } static int cma_disable_callback(struct rdma_id_private *id_priv, enum cma_state state) { mutex_lock(&id_priv->handler_mutex); if (id_priv->state != state) { mutex_unlock(&id_priv->handler_mutex); return -EINVAL; } return 0; } static int cma_has_cm_dev(struct rdma_id_private *id_priv) { return (id_priv->id.device && id_priv->cm_id.ib); } struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, void *context, enum rdma_port_space ps) { struct rdma_id_private *id_priv; id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); if (!id_priv) return ERR_PTR(-ENOMEM); id_priv->state = CMA_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); atomic_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); return &id_priv->id; } EXPORT_SYMBOL(rdma_create_id); static int 
cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTS; qp_attr.sq_psn = 0; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); return ret; } static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; return ib_modify_qp(qp, &qp_attr, qp_attr_mask); } int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { struct rdma_id_private *id_priv; struct ib_qp *qp; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id->device != pd->device) return -EINVAL; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) return PTR_ERR(qp); if (cma_is_ud_ps(id_priv->id.ps)) ret = cma_init_ud_qp(id_priv, qp); else ret = cma_init_conn_qp(id_priv, qp); if (ret) goto err; id->qp = qp; id_priv->qp_num = qp->qp_num; id_priv->srq = (qp->srq != NULL); return 0; err: ib_destroy_qp(qp); return ret; } EXPORT_SYMBOL(rdma_create_qp); void rdma_destroy_qp(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); ib_destroy_qp(id_priv->id.qp); id_priv->id.qp = NULL; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_destroy_qp); static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } /* Need to update QP attributes 
from default values. */ qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); if (ret) goto out; qp_attr.qp_state = IB_QPS_RTR; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; }
/*
 * cma_modify_qp_rts - under qp_mutex, move the id's QP to RTS.
 *
 * Returns 0 without doing anything when the id has no QP.  When @conn_param
 * is given, its initiator_depth overrides max_rd_atomic.
 */
static int cma_modify_qp_rts(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_RTS; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; if (conn_param) qp_attr.max_rd_atomic = conn_param->initiator_depth; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; }
/*
 * cma_modify_qp_err - under qp_mutex, move the id's QP to the error state.
 * Only qp_state is initialised in qp_attr; the IB_QP_STATE mask limits what
 * ib_modify_qp() is asked to apply.  Returns 0 when the id has no QP.
 */
static int cma_modify_qp_err(struct rdma_id_private *id_priv) { struct ib_qp_attr qp_attr; int ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_ERR; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); out: mutex_unlock(&id_priv->qp_mutex); return ret; }
/*
 * cma_ib_init_qp_attr - fill INIT-time QP attributes without an IB CM id.
 *
 * The P_Key comes from the device address on an InfiniBand link layer and is
 * 0xffff otherwise; its cached index, the port number and the state form the
 * base mask.  UD port spaces additionally get the id's Q_Key (set up through
 * cma_set_qkey()); all others get qp_access_flags = 0.
 */
static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int ret; u16 pkey; if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) == IB_LINK_LAYER_INFINIBAND) pkey = ib_addr_get_pkey(dev_addr); else pkey = 0xffff; ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, pkey, &qp_attr->pkey_index); if (ret) return ret; qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (cma_is_ud_ps(id_priv->id.ps)) { ret =
cma_set_qkey(id_priv); if (ret) return ret; qp_attr->qkey = id_priv->qkey; *qp_attr_mask |= IB_QP_QKEY; } else { qp_attr->qp_access_flags = 0; *qp_attr_mask |= IB_QP_ACCESS_FLAGS; } return 0; }
/*
 * rdma_init_qp_attr - fill @qp_attr/@qp_attr_mask for the transition named
 * by the qp_state the caller has already stored in @qp_attr.
 *
 * IB CM capable ports use cma_ib_init_qp_attr() when there is no IB CM id or
 * the port space is UD, and ib_cm_init_qp_attr() otherwise; for an RTR
 * transition rq_psn is set to the id's seq_num.  iWARP CM capable ports use
 * iw_cm_init_qp_attr(), or a bare state + access-flags mask when no iw CM id
 * exists yet.  Anything else yields -ENOSYS.
 *
 * NOTE(review): the fragments prefixed with "- " and "+ " below look like
 * removed/added lines of a unified diff (transport switch replaced by
 * rdma_cap_ib_cm()/rdma_cap_iw_cm() tests); the description above follows
 * the "+" side - confirm against the applied revision.
 */
int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_id_private *id_priv; int ret = 0; id_priv = container_of(id, struct rdma_id_private, id); - switch (rdma_node_get_transport(id_priv->id.device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (!id_priv->cm_id.ib || cma_is_ud_ps(id_priv->id.ps)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); + if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; - break; - case RDMA_TRANSPORT_IWARP: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, qp_attr_mask); - break; - default: + } else ret = -ENOSYS; - break; - } return ret; } EXPORT_SYMBOL(rdma_init_qp_attr);
/*
 * cma_zero_addr - AF_INET: result of ipv4_is_zeronet() on the address.
 * Any other family is treated as sockaddr_in6 and tests whether all four
 * 32-bit address words are zero.
 */
static inline int cma_zero_addr(struct sockaddr *addr) { struct in6_addr *ip6; if (addr->sa_family == AF_INET) return ipv4_is_zeronet( ((struct sockaddr_in *)addr)->sin_addr.s_addr); else { ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr; return (ip6->s6_addr32[0] | ip6->s6_addr32[1] | ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0; } }
/* Loopback test: ipv4_is_loopback() for AF_INET, ipv6_addr_loopback() otherwise. */
static inline int cma_loopback_addr(struct sockaddr *addr) { if (addr->sa_family == AF_INET) return ipv4_is_loopback( ((struct sockaddr_in *) addr)->sin_addr.s_addr); else return ipv6_addr_loopback( &((struct sockaddr_in6 *) addr)->sin6_addr); }
/* True for a zero address or a loopback address. */
static inline int cma_any_addr(struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); }
/* Exported wrapper around cma_any_addr(). */
int rdma_cma_any_addr(struct sockaddr *addr) { return cma_any_addr(addr); } EXPORT_SYMBOL(rdma_cma_any_addr);
/* cma_port: sin_port for AF_INET, sin6_port for every other family. */
static inline
__be16 cma_port(struct sockaddr *addr) { if (addr->sa_family == AF_INET) return ((struct sockaddr_in *) addr)->sin_port; else return ((struct sockaddr_in6 *) addr)->sin6_port; }
/* True when the address carries port 0. */
static inline int cma_any_port(struct sockaddr *addr) { return !cma_port(addr); }
/*
 * cma_get_net_info - extract IP version, port and src/dst addresses from a
 * connection request header.
 *
 * @hdr is read as struct sdp_hh for RDMA_PS_SDP and as struct cma_hdr for
 * every other port space.  @src and @dst are set to point INTO @hdr (no
 * copy).  Returns -EINVAL on a version mismatch or when the IP version is
 * neither 4 nor 6, 0 otherwise.
 */
static int cma_get_net_info(void *hdr, enum rdma_port_space ps, u8 *ip_ver, __be16 *port, union cma_ip_addr **src, union cma_ip_addr **dst) { switch (ps) { case RDMA_PS_SDP: if (sdp_get_majv(((struct sdp_hh *) hdr)->sdp_version) != SDP_MAJ_VERSION) return -EINVAL; *ip_ver = sdp_get_ip_ver(hdr); *port = ((struct sdp_hh *) hdr)->port; *src = &((struct sdp_hh *) hdr)->src_addr; *dst = &((struct sdp_hh *) hdr)->dst_addr; break; default: if (((struct cma_hdr *) hdr)->cma_version != CMA_VERSION) return -EINVAL; *ip_ver = cma_get_ip_ver(hdr); *port = ((struct cma_hdr *) hdr)->port; *src = &((struct cma_hdr *) hdr)->src_addr; *dst = &((struct cma_hdr *) hdr)->dst_addr; break; } if (*ip_ver != 4 && *ip_ver != 6) return -EINVAL; return 0; }
/*
 * cma_save_net_info - populate a new id's addresses from a request header.
 *
 * The new id's source address takes the listener's family and port plus the
 * header's DESTINATION address; its destination address takes the header's
 * SOURCE address and @port.  IP versions other than 4 and 6 are ignored.
 */
static void cma_save_net_info(struct rdma_addr *addr, struct rdma_addr *listen_addr, u8 ip_ver, __be16 port, union cma_ip_addr *src, union cma_ip_addr *dst) { struct sockaddr_in *listen4, *ip4; struct sockaddr_in6 *listen6, *ip6; switch (ip_ver) { case 4: listen4 = (struct sockaddr_in *) &listen_addr->src_addr; ip4 = (struct sockaddr_in *) &addr->src_addr; ip4->sin_family = listen4->sin_family; ip4->sin_addr.s_addr = dst->ip4.addr; ip4->sin_port = listen4->sin_port; ip4 = (struct sockaddr_in *) &addr->dst_addr; ip4->sin_family = listen4->sin_family; ip4->sin_addr.s_addr = src->ip4.addr; ip4->sin_port = port; break; case 6: listen6 = (struct sockaddr_in6 *) &listen_addr->src_addr; ip6 = (struct sockaddr_in6 *) &addr->src_addr; ip6->sin6_family = listen6->sin6_family; ip6->sin6_addr = dst->ip6; ip6->sin6_port = listen6->sin6_port; ip6 = (struct sockaddr_in6 *) &addr->dst_addr; ip6->sin6_family = listen6->sin6_family; ip6->sin6_addr = src->ip6; ip6->sin6_port = port; break; default:
break; } }
/*
 * Offset of the consumer's private data inside the CM private data:
 * 0 for RDMA_PS_SDP, sizeof(struct cma_hdr) for every other port space.
 */
static inline int cma_user_data_offset(enum rdma_port_space ps) { switch (ps) { case RDMA_PS_SDP: return 0; default: return sizeof(struct cma_hdr); } }
/*
 * cma_cancel_route - on an InfiniBand link layer, cancel the outstanding SA
 * query if one is recorded; other link layers need no action here.
 */
static void cma_cancel_route(struct rdma_id_private *id_priv) { switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) { case IB_LINK_LAYER_INFINIBAND: if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); break; default: break; } }
/*
 * cma_cancel_listens - tear down every per-device listen id hanging off a
 * wildcard listener.  The global lock is dropped around each
 * rdma_destroy_id() call and re-taken afterwards.
 */
static void cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. */ mutex_lock(&lock); list_del(&id_priv->list); while (!list_empty(&id_priv->listen_list)) { dev_id_priv = list_entry(id_priv->listen_list.next, struct rdma_id_private, listen_list); /* sync with device removal to avoid duplicate destruction */ list_del_init(&dev_id_priv->list); list_del(&dev_id_priv->listen_list); mutex_unlock(&lock); rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } mutex_unlock(&lock); }
/*
 * cma_cancel_operation - abort whatever was in flight for the given previous
 * @state: address resolution, route query, or (for a wildcard listener not
 * bound to a device) the per-device listens.  Other states need no action.
 */
static void cma_cancel_operation(struct rdma_id_private *id_priv, enum cma_state state) { switch (state) { case CMA_ADDR_QUERY: rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case CMA_ROUTE_QUERY: cma_cancel_route(id_priv); break; case CMA_LISTEN: if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr) && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: break; } }
/*
 * cma_release_port - unhook the id from its bind list; when it was the last
 * owner, remove the port from the idr and free the bind list.  No-op when
 * the id was never bound.
 */
static void cma_release_port(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list = id_priv->bind_list; if (!bind_list) return; mutex_lock(&lock); hlist_del(&id_priv->node); if (hlist_empty(&bind_list->owners)) { idr_remove(bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); }
/*
 * cma_leave_mc_groups - drain the id's multicast list.  InfiniBand entries
 * are released through ib_sa_free_multicast() and freed directly; Ethernet
 * entries drop a kref whose release function is release_mc.
 */
static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { mc = container_of(id_priv->mc_list.next, struct cma_multicast, list);
list_del(&mc->list); switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) { case IB_LINK_LAYER_INFINIBAND: ib_sa_free_multicast(mc->multicast.ib); kfree(mc); break; case IB_LINK_LAYER_ETHERNET: kref_put(&mc->mcref, release_mc); break; default: break; } } }
/*
 * rdma_destroy_id - tear down an RDMA CM identifier and free it.
 *
 * Sequence: switch the state to CMA_DESTROYING and cancel whatever the
 * previous state had in flight; if attached to a device, destroy the IB or
 * iWARP CM id, leave multicast groups and detach; release the bound port;
 * drop the caller's reference and wait on id_priv->comp; for internal
 * (per-device listen) ids drop the reference held on the parent stored in
 * id.context; finally free the path records and the id itself.
 *
 * NOTE(review): the fragments prefixed with "- " and "+ " below look like
 * removed/added lines of a unified diff.  The "+" side tests
 * rdma_cap_ib_cm()/rdma_cap_iw_cm() on a hard-coded port 1 and adds a
 * sock_release() for ids that own their socket - confirm against the
 * applied revision.
 */
void rdma_destroy_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; enum cma_state state; id_priv = container_of(id, struct rdma_id_private, id); state = cma_exch(id_priv, CMA_DESTROYING); cma_cancel_operation(id_priv, state); mutex_lock(&lock); if (id_priv->cma_dev) { mutex_unlock(&lock); - switch (rdma_node_get_transport(id_priv->id.device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) ib_destroy_cm_id(id_priv->cm_id.ib); - break; - case RDMA_TRANSPORT_IWARP: + } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw)) iw_destroy_cm_id(id_priv->cm_id.iw); - break; - default: - break; } cma_leave_mc_groups(id_priv); mutex_lock(&lock); cma_detach_from_dev(id_priv); } mutex_unlock(&lock); cma_release_port(id_priv); cma_deref_id(id_priv); wait_for_completion(&id_priv->comp); if (id_priv->internal_id) cma_deref_id(id_priv->id.context); + if (id_priv->sock != NULL && !id_priv->internal_id && + !id_priv->unify_ps_tcp) + sock_release(id_priv->sock); + kfree(id_priv->id.route.path_rec); kfree(id_priv); } EXPORT_SYMBOL(rdma_destroy_id);
/*
 * cma_rep_recv - react to a received REP: move the QP to RTR then RTS and
 * send an RTU.  On any failure the QP is moved to the error state, a REJ
 * with IB_CM_REJ_CONSUMER_DEFINED is sent, and the failing step's error is
 * returned.
 */
static int cma_rep_recv(struct rdma_id_private *id_priv) { int ret; ret = cma_modify_qp_rtr(id_priv, NULL); if (ret) goto reject; ret = cma_modify_qp_rts(id_priv, NULL); if (ret) goto reject; ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; }
/*
 * cma_verify_rep - for RDMA_PS_SDP, reject (-EINVAL) a reply whose SDP major
 * version differs from SDP_MAJ_VERSION; every other case returns 0.
 */
static int cma_verify_rep(struct rdma_id_private *id_priv, void *data) { if (id_priv->id.ps == RDMA_PS_SDP &&
sdp_get_majv(((struct sdp_hah *) data)->sdp_version) != SDP_MAJ_VERSION) return -EINVAL; return 0; }
/*
 * cma_set_rep_event_data - copy the connection parameters of a received REP
 * into the RDMA CM event.  private_data is passed through by pointer and its
 * length is always reported as IB_CM_REP_PRIVATE_DATA_SIZE.
 */
static void cma_set_rep_event_data(struct rdma_cm_event *event, struct ib_cm_rep_event_param *rep_data, void *private_data) { event->param.conn.private_data = private_data; event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; event->param.conn.responder_resources = rep_data->responder_resources; event->param.conn.initiator_depth = rep_data->initiator_depth; event->param.conn.flow_control = rep_data->flow_control; event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; }
/*
 * cma_ib_handler - IB CM callback for an active or established connection.
 *
 * Events are only processed while the id is in CMA_CONNECT, except
 * IB_CM_TIMEWAIT_EXIT which requires CMA_DISCONNECT; otherwise the event is
 * dropped and 0 returned.  cma_disable_callback() leaves handler_mutex held
 * on success, and every exit path past that point releases it.  The IB CM
 * event is translated to an RDMA CM event and delivered to the consumer; a
 * non-zero return from the consumer destroys the id and is propagated.
 */
static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event; int ret = 0; if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && cma_disable_callback(id_priv, CMA_CONNECT)) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && cma_disable_callback(id_priv, CMA_DISCONNECT))) return 0; memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_REQ_ERROR: case IB_CM_REP_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: event.status = cma_verify_rep(id_priv, ib_event->private_data); if (event.status) event.event = RDMA_CM_EVENT_CONNECT_ERROR; else if (id_priv->id.qp && id_priv->id.ps != RDMA_PS_SDP) { event.status = cma_rep_recv(id_priv); event.event = event.status ?
RDMA_CM_EVENT_CONNECT_ERROR : RDMA_CM_EVENT_ESTABLISHED; } else event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, ib_event->private_data); break; case IB_CM_RTU_RECEIVED: case IB_CM_USER_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR: event.status = -ETIMEDOUT; /* fall through */ case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, CMA_CONNECT, CMA_DISCONNECT)) goto out; event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IB_CM_TIMEWAIT_EXIT: event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; break; case IB_CM_MRA_RECEIVED: /* ignore event */ goto out; case IB_CM_REJ_RECEIVED: cma_modify_qp_err(id_priv); event.status = ib_event->param.rej_rcvd.reason; event.event = RDMA_CM_EVENT_REJECTED; event.param.conn.private_data = ib_event->private_data; event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; cma_exch(id_priv, CMA_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; }
/*
 * cma_new_conn_id - build a connected-mode id for an incoming IB CM REQ.
 *
 * The new id inherits handler, context and port space from @listen_id, gets
 * its addresses from the request's private-data header, and receives a copy
 * of the primary (and, if present, alternate) path record.  If the resulting
 * source address is a wildcard/loopback address, the device address is
 * filled from the path record; otherwise rdma_translate_ip() resolves it.
 *
 * Returns the new private id in state CMA_CONNECT, or NULL on any failure
 * (the partially built id is destroyed).
 */
static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; union cma_ip_addr *src, *dst; __be16 port; u8 ip_ver; int ret; if (cma_get_net_info(ib_event->private_data, listen_id->ps, &ip_ver, &port, &src, &dst)) goto err; id = rdma_create_id(listen_id->event_handler, listen_id->context, listen_id->ps); if (IS_ERR(id)) goto err; cma_save_net_info(&id->route.addr, &listen_id->route.addr, ip_ver, port, src, dst); rt = &id->route; rt->num_paths = ib_event->param.req_rcvd.alternate_path ?
2 : 1; rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths, GFP_KERNEL); if (!rt->path_rec) goto destroy_id; rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path; if (rt->num_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (cma_any_addr((struct sockaddr *) &rt->addr.src_addr)) { rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); ib_addr_set_pkey(&rt->addr.dev_addr, rt->path_rec[0].pkey); } else { ret = rdma_translate_ip((struct sockaddr *) &rt->addr.src_addr, &rt->addr.dev_addr); if (ret) goto destroy_id; } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = CMA_CONNECT; return id_priv; destroy_id: rdma_destroy_id(id); err: return NULL; }
/*
 * cma_new_udp_id - build an id for an incoming request on a UD port space.
 *
 * Same inheritance and address parsing as the connected-mode variant, but no
 * path records are copied; the device address is resolved with
 * rdma_translate_ip() only when the source address is not a
 * wildcard/loopback address.  Returns the new private id in state
 * CMA_CONNECT, or NULL on failure.
 */
static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv; struct rdma_cm_id *id; union cma_ip_addr *src, *dst; __be16 port; u8 ip_ver; int ret; id = rdma_create_id(listen_id->event_handler, listen_id->context, listen_id->ps); if (IS_ERR(id)) return NULL; if (cma_get_net_info(ib_event->private_data, listen_id->ps, &ip_ver, &port, &src, &dst)) goto err; cma_save_net_info(&id->route.addr, &listen_id->route.addr, ip_ver, port, src, dst); if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) { ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, &id->route.addr.dev_addr); if (ret) goto err; } id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = CMA_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; }
/*
 * cma_set_req_event_data - copy the parameters of a received REQ into the
 * RDMA CM event.  The consumer's private data starts @offset bytes into
 * @private_data and its length is IB_CM_REQ_PRIVATE_DATA_SIZE - @offset.
 */
static void cma_set_req_event_data(struct rdma_cm_event *event, struct ib_cm_req_event_param *req_data, void *private_data, int offset) { event->param.conn.private_data = private_data + offset; event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; event->param.conn.responder_resources =
req_data->responder_resources; event->param.conn.initiator_depth = req_data->initiator_depth; event->param.conn.flow_control = req_data->flow_control; event->param.conn.retry_count = req_data->retry_count; event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; }
/*
 * cma_req_handler - IB CM callback installed on listening ids.
 *
 * Runs only while the listener is in CMA_LISTEN (otherwise -ECONNABORTED);
 * the listener's handler_mutex is held for the whole call.  A new id is
 * created for the request (UD or connected variant, -ENOMEM if that fails),
 * its handler_mutex is taken nested, a device is acquired under the global
 * lock, and the IB CM id is re-pointed at the new id with cma_ib_handler as
 * its callback before RDMA_CM_EVENT_CONNECT_REQUEST is delivered.
 *
 * If the consumer accepts (returns 0) and the id is still connecting on a
 * non-UD port space, an MRA is sent.  A non-zero consumer return or a
 * device-acquire failure destroys the new id and is returned to the IB CM.
 */
static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event; int offset, ret; listen_id = cm_id->context; if (cma_disable_callback(listen_id, CMA_LISTEN)) return -ECONNABORTED; memset(&event, 0, sizeof event); offset = cma_user_data_offset(listen_id->id.ps); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (cma_is_ud_ps(listen_id->id.ps)) { conn_id = cma_new_udp_id(&listen_id->id, ib_event); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { conn_id = cma_new_conn_id(&listen_id->id, ib_event); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } if (!conn_id) { ret = -ENOMEM; goto out; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); mutex_lock(&lock); ret = cma_acquire_dev(conn_id); mutex_unlock(&lock); if (ret) goto release_conn_id; conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; ret = conn_id->id.event_handler(&conn_id->id, &event); if (!ret) { /* * Acquire mutex to prevent user executing rdma_destroy_id() * while we're accessing the cm_id. */ mutex_lock(&lock); if (cma_comp(conn_id, CMA_CONNECT) && !cma_is_ud_ps(conn_id->id.ps)) ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); mutex_unlock(&lock); mutex_unlock(&conn_id->handler_mutex); goto out; } /* Destroy the CM ID by returning a non-zero value.
*/ conn_id->cm_id.ib = NULL; release_conn_id: cma_exch(conn_id, CMA_DESTROYING); mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(&conn_id->id); out: mutex_unlock(&listen_id->handler_mutex); return ret; }
/*
 * cma_get_service_id - build the big-endian 64-bit IB service id:
 * (port space << 16) + host-order port number of @addr.
 */
static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr) { return cpu_to_be64(((u64)ps << 16) + be16_to_cpu(cma_port(addr))); }
/*
 * cma_set_compare_data - build the data/mask pair used to match incoming
 * requests against a specific listen address.
 *
 * The compare buffers are zeroed, then the IP version nibble (mask 0xF) and
 * the full destination address (mask all ones) are set, using the sdp_hh
 * layout for RDMA_PS_SDP and the cma_hdr layout otherwise.  AF_INET6 is only
 * handled when INET6 is defined; other families leave the buffers zeroed.
 */
static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, struct ib_cm_compare_data *compare) { struct cma_hdr *cma_data, *cma_mask; struct sdp_hh *sdp_data, *sdp_mask; __be32 ip4_addr; #ifdef INET6 struct in6_addr ip6_addr; #endif memset(compare, 0, sizeof *compare); cma_data = (void *) compare->data; cma_mask = (void *) compare->mask; sdp_data = (void *) compare->data; sdp_mask = (void *) compare->mask; switch (addr->sa_family) { case AF_INET: ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr; if (ps == RDMA_PS_SDP) { sdp_set_ip_ver(sdp_data, 4); sdp_set_ip_ver(sdp_mask, 0xF); sdp_data->dst_addr.ip4.addr = ip4_addr; sdp_mask->dst_addr.ip4.addr = htonl(~0); } else { cma_set_ip_ver(cma_data, 4); cma_set_ip_ver(cma_mask, 0xF); cma_data->dst_addr.ip4.addr = ip4_addr; cma_mask->dst_addr.ip4.addr = htonl(~0); } break; #ifdef INET6 case AF_INET6: ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr; if (ps == RDMA_PS_SDP) { sdp_set_ip_ver(sdp_data, 6); sdp_set_ip_ver(sdp_mask, 0xF); sdp_data->dst_addr.ip6 = ip6_addr; memset(&sdp_mask->dst_addr.ip6, 0xFF, sizeof sdp_mask->dst_addr.ip6); } else { cma_set_ip_ver(cma_data, 6); cma_set_ip_ver(cma_mask, 0xF); cma_data->dst_addr.ip6 = ip6_addr; memset(&cma_mask->dst_addr.ip6, 0xFF, sizeof cma_mask->dst_addr.ip6); } break; #endif default: break; } }
/*
 * cma_iw_handler - iWARP CM callback for a connecting/connected id.
 *
 * Processed only while the id is in CMA_CONNECT (otherwise the event is
 * dropped and 0 returned); handler_mutex is held from
 * cma_disable_callback() until return.  CLOSE maps to DISCONNECTED,
 * ESTABLISHED to ESTABLISHED, and CONNECT_REPLY first copies the local and
 * remote addresses from the event (as sockaddr_in) and then maps the status
 * to ESTABLISHED / REJECTED / UNREACHABLE / CONNECT_ERROR.  Any other event
 * hits BUG_ON.  A non-zero return from the consumer destroys the id.
 */
static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; struct rdma_cm_event event; struct sockaddr_in *sin; int ret = 0; if (cma_disable_callback(id_priv, CMA_CONNECT)) return 0; memset(&event, 0, sizeof event); switch
(iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY: sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; *sin = iw_event->local_addr; sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr; *sin = iw_event->remote_addr; switch ((int)iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; break; case -ECONNRESET: case -ECONNREFUSED: event.event = RDMA_CM_EVENT_REJECTED; break; case -ETIMEDOUT: event.event = RDMA_CM_EVENT_UNREACHABLE; break; default: event.event = RDMA_CM_EVENT_CONNECT_ERROR; break; } break; case IW_CM_EVENT_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; break; default: BUG_ON(1); } event.status = iw_event->status; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; cma_exch(id_priv, CMA_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } mutex_unlock(&id_priv->handler_mutex); return ret; }
/*
 * iw_conn_req_handler - iWARP CM callback installed on listening ids.
 *
 * Runs only while the listener is in CMA_LISTEN (otherwise -ECONNABORTED).
 * A new RDMA_PS_TCP id is created in state CMA_CONNECT; the net device
 * owning the event's local IPv4 address is looked up and copied into the
 * id's device address; a cma device is acquired under the global lock; the
 * iw CM id is re-pointed at the new id with cma_iw_handler as its callback;
 * and RDMA_CM_EVENT_CONNECT_REQUEST is delivered with initiator depth and
 * responder resources taken from ib_query_device().
 *
 * Every failure path unlocks and destroys the new id; the net device
 * reference, when one was obtained, is dropped at "out".
 */
static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; struct sockaddr_in *sin; struct net_device *dev = NULL; struct rdma_cm_event event; int ret; struct ib_device_attr attr; listen_id = cm_id->context; if (cma_disable_callback(listen_id, CMA_LISTEN)) return -ECONNABORTED; /* Create a new RDMA id for the new IW CM ID */ new_cm_id = rdma_create_id(listen_id->id.event_handler, listen_id->id.context, RDMA_PS_TCP); if (IS_ERR(new_cm_id)) { ret = -ENOMEM; goto out; } conn_id = container_of(new_cm_id, struct rdma_id_private, id); mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = CMA_CONNECT; dev = ip_dev_find(NULL, iw_event->local_addr.sin_addr.s_addr); if (!dev) { ret
= -EADDRNOTAVAIL; mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } mutex_lock(&lock); ret = cma_acquire_dev(conn_id); mutex_unlock(&lock); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } conn_id->cm_id.iw = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; sin = (struct sockaddr_in *) &new_cm_id->route.addr.src_addr; *sin = iw_event->local_addr; sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr; *sin = iw_event->remote_addr; ret = ib_query_device(conn_id->id.device, &attr); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); goto out; } memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; event.param.conn.initiator_depth = attr.max_qp_init_rd_atom; event.param.conn.responder_resources = attr.max_qp_rd_atom; ret = conn_id->id.event_handler(&conn_id->id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; cma_exch(conn_id, CMA_DESTROYING); mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(&conn_id->id); goto out; } mutex_unlock(&conn_id->handler_mutex); out: if (dev) dev_put(dev); mutex_unlock(&listen_id->handler_mutex); return ret; }
/*
 * cma_ib_listen - create an IB CM id for this id and start listening on the
 * service id derived from its port space and source port.
 *
 * A wildcard/loopback source address listens without compare data; a
 * specific address listens with a data/mask pair from
 * cma_set_compare_data().  On listen failure the IB CM id is destroyed and
 * cm_id.ib reset to NULL.
 */
static int cma_ib_listen(struct rdma_id_private *id_priv) { struct ib_cm_compare_data compare_data; struct sockaddr *addr; __be64 svc_id; int ret; id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv); if (IS_ERR(id_priv->cm_id.ib)) return PTR_ERR(id_priv->cm_id.ib); addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; svc_id = cma_get_service_id(id_priv->id.ps, addr); if (cma_any_addr(addr)) ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0,
NULL); else { cma_set_compare_data(id_priv->id.ps, addr, &compare_data); ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data); } if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } return ret; }
/*
 * cma_iw_listen - create an iWARP CM id (passing the id's socket), copy the
 * source address into it as a sockaddr_in, and start listening with
 * @backlog.  On listen failure the iw CM id is destroyed and cm_id.iw reset
 * to NULL.
 */
static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; struct sockaddr_in *sin; id_priv->cm_id.iw = iw_create_cm_id(id_priv->id.device, id_priv->sock, iw_conn_req_handler, id_priv); if (IS_ERR(id_priv->cm_id.iw)) return PTR_ERR(id_priv->cm_id.iw); sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; id_priv->cm_id.iw->local_addr = *sin; ret = iw_cm_listen(id_priv->cm_id.iw, backlog); if (ret) { iw_destroy_cm_id(id_priv->cm_id.iw); id_priv->cm_id.iw = NULL; } return ret; }
/*
 * cma_listen_handler - event handler installed on internal per-device listen
 * ids.  @id->context is the parent (wildcard) listener's private id; the
 * event's id is rewritten to carry the parent's context and handler, and the
 * event is forwarded to the parent's handler.
 */
static int cma_listen_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rdma_id_private *id_priv = id->context; id->context = id_priv->id.context; id->event_handler = id_priv->id.event_handler; return id_priv->id.event_handler(id, event); }
/*
 * cma_listen_on_dev - spawn an internal listen id for one device on behalf
 * of a wildcard listener.
 *
 * The child shares the parent's socket and source address, is attached to
 * @cma_dev, linked on the parent's listen_list, and holds a reference on the
 * parent.  A failing rdma_listen() is only logged; creation failure returns
 * silently.
 */
static void cma_listen_on_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { struct rdma_id_private *dev_id_priv; struct rdma_cm_id *id; int ret; id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps); if (IS_ERR(id)) return; dev_id_priv = container_of(id, struct rdma_id_private, id); dev_id_priv->state = CMA_ADDR_BOUND; dev_id_priv->sock = id_priv->sock; memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr, ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); cma_attach_to_dev(dev_id_priv, cma_dev); list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); atomic_inc(&id_priv->refcount); dev_id_priv->internal_id = 1; ret = rdma_listen(id, id_priv->backlog); if (ret) printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, " "listening on device %s\n", ret, cma_dev->device->name); }
/*
 * cma_listen_on_all - register a wildcard listener on listen_any_list and
 * start a per-device listen on every device currently on dev_list, all under
 * the global lock.
 */
static void cma_listen_on_all(struct rdma_id_private *id_priv) { struct cma_device *cma_dev; mutex_lock(&lock);
list_add_tail(&id_priv->list, &listen_any_list); list_for_each_entry(cma_dev, &dev_list, list) cma_listen_on_dev(id_priv, cma_dev); mutex_unlock(&lock); }
/*
 * rdma_listen - start listening for connection requests on @id.
 *
 * An id still in CMA_IDLE is first bound to its (AF_INET) source address.
 * The id must then move from CMA_ADDR_BOUND to CMA_LISTEN, else -EINVAL.
 * With a device attached, the IB or iWARP listen path is used (-ENOSYS if
 * neither applies); without one, the id listens on every device.  On error
 * the backlog is cleared and the state rolled back to CMA_ADDR_BOUND.
 *
 * NOTE(review): the fragments prefixed with "- " and "+ " below look like
 * removed/added lines of a unified diff; the "+" side queries
 * rdma_cap_ib_cm()/rdma_cap_iw_cm() on a hard-coded port 1 - confirm
 * against the applied revision.
 */
int rdma_listen(struct rdma_cm_id *id, int backlog) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id_priv->state == CMA_IDLE) { ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET; ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr); if (ret) return ret; } if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN)) return -EINVAL; id_priv->backlog = backlog; if (id->device) { - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, 1)) { ret = cma_ib_listen(id_priv); if (ret) goto err; - break; - case RDMA_TRANSPORT_IWARP: + } else if (rdma_cap_iw_cm(id->device, 1)) { ret = cma_iw_listen(id_priv, backlog); if (ret) goto err; - break; - default: + } else { ret = -ENOSYS; goto err; } } else cma_listen_on_all(id_priv); return 0; err: id_priv->backlog = 0; cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_listen);
/* rdma_set_service_type - record @tos (truncated to u8) on the id. */
void rdma_set_service_type(struct rdma_cm_id *id, int tos) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); id_priv->tos = (u8) tos; } EXPORT_SYMBOL(rdma_set_service_type);
/*
 * cma_query_handler - SA path query completion.  On success the returned
 * record is stored as the id's single path; on failure the queued work is
 * rewritten to report RDMA_CM_EVENT_ROUTE_ERROR and fall back from
 * CMA_ROUTE_QUERY to CMA_ADDR_RESOLVED.  Either way the work item is queued
 * on cma_wq.
 */
static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, void *context) { struct cma_work *work = context; struct rdma_route *route; route = &work->id->id.route; if (!status) { route->num_paths = 1; *route->path_rec = *path_rec; } else { work->old_state = CMA_ROUTE_QUERY; work->new_state = CMA_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; work->event.status = status; } queue_work(cma_wq, &work->work); }
/*
 * cma_query_ib_route - issue an SA path record query for the id's
 * source/destination GIDs; completion is reported to cma_query_handler with
 * @work as context.  Returns 0 when the query was started, otherwise the
 * negative value returned by ib_sa_path_rec_get().
 */
static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, struct cma_work *work) { struct rdma_addr *addr = &id_priv->id.route.addr; struct ib_sa_path_rec
path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; memset(&path_rec, 0, sizeof path_rec); rdma_addr_get_sgid(&addr->dev_addr, &path_rec.sgid); rdma_addr_get_dgid(&addr->dev_addr, &path_rec.dgid); path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; path_rec.service_id = cma_get_service_id(id_priv->id.ps, (struct sockaddr *) &addr->dst_addr); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;
/* IPv4 source: request by QoS class from the id's tos.  Otherwise the source is read as sockaddr_in6 and the traffic class is taken from sin6_flowinfo >> 20. */
if (addr->src_addr.ss_family == AF_INET) { path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; } else { sin6 = (struct sockaddr_in6 *) &addr->src_addr; path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; }
/* NOTE(review): mtu_selector/mtu are set here but comp_mask is not extended with any MTU component bits in this function - confirm the SA query actually honours these two fields. */
if (tavor_quirk) { path_rec.mtu_selector = IB_SA_LT; path_rec.mtu = IB_MTU_2048; } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, id_priv->id.port_num, &path_rec, comp_mask, timeout_ms, GFP_KERNEL, cma_query_handler, work, &id_priv->query); return (id_priv->query_id < 0) ?
id_priv->query_id : 0; } static void cma_work_handler(struct work_struct *_work) { struct cma_work *work = container_of(_work, struct cma_work, work); struct rdma_id_private *id_priv = work->id; int destroy = 0; mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) goto out; if (id_priv->id.event_handler(&id_priv->id, &work->event)) { cma_exch(id_priv, CMA_DESTROYING); destroy = 1; } out: mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); if (destroy) rdma_destroy_id(&id_priv->id); kfree(work); } static void cma_ndev_work_handler(struct work_struct *_work) { struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work); struct rdma_id_private *id_priv = work->id; int destroy = 0; mutex_lock(&id_priv->handler_mutex); if (id_priv->state == CMA_DESTROYING || id_priv->state == CMA_DEVICE_REMOVAL) goto out; if (id_priv->id.event_handler(&id_priv->id, &work->event)) { cma_exch(id_priv, CMA_DESTROYING); destroy = 1; } out: mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); if (destroy) rdma_destroy_id(&id_priv->id); kfree(work); } static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = CMA_ROUTE_QUERY; work->new_state = CMA_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } ret = cma_query_ib_route(id_priv, timeout_ms, work); if (ret) goto err2; return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } int rdma_set_ib_paths(struct rdma_cm_id *id, struct ib_sa_path_rec *path_rec, int num_paths) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct 
rdma_id_private, id); if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_RESOLVED)) return -EINVAL; id->route.path_rec = kmalloc(sizeof *path_rec * num_paths, GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } memcpy(id->route.path_rec, path_rec, sizeof *path_rec * num_paths); return 0; err: cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_ADDR_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_set_ib_paths); static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) { struct cma_work *work; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = CMA_ROUTE_QUERY; work->new_state = CMA_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; queue_work(cma_wq, &work->work); return 0; } static u8 tos_to_sl(u8 tos) { return def_prec2sl & 7; } static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr; struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr; struct net_device *ndev = NULL; u16 vid; if (src_addr->sin_family != dst_addr->sin_family) return -EINVAL; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } route->num_paths = 1; if (addr->dev_addr.bound_dev_if) ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); if (!ndev) { ret = -ENODEV; goto err2; } vid = rdma_vlan_dev_vlan_id(ndev); iboe_mac_vlan_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr, vid); iboe_mac_vlan_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr, vid); route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; 
route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; route->path_rec->sl = tos_to_sl(id_priv->tos); #ifdef __linux__ route->path_rec->mtu = iboe_get_mtu(ndev->mtu); #else route->path_rec->mtu = iboe_get_mtu(ndev->if_mtu); #endif route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = iboe_get_rate(ndev); dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; route->path_rec->packet_life_time = IBOE_PACKET_LIFETIME; if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } work->old_state = CMA_ROUTE_QUERY; work->new_state = CMA_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; work->event.status = 0; queue_work(cma_wq, &work->work); return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: - switch (rdma_port_get_link_layer(id->device, id->port_num)) { - case IB_LINK_LAYER_INFINIBAND: - ret = cma_resolve_ib_route(id_priv, timeout_ms); - break; - case IB_LINK_LAYER_ETHERNET: - ret = cma_resolve_iboe_route(id_priv); - break; - default: - ret = -ENOSYS; - } - break; - case RDMA_TRANSPORT_IWARP: + if (rdma_cap_ib_sa(id->device, id->port_num)) + ret = cma_resolve_ib_route(id_priv, timeout_ms); + else if (rdma_protocol_roce(id->device, id->port_num)) + ret = cma_resolve_iboe_route(id_priv); + else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv, timeout_ms); - break; - default: + else ret = -ENOSYS; - break; - } + if (ret) goto err; return 0; err: cma_comp_exch(id_priv, CMA_ROUTE_QUERY, CMA_ADDR_RESOLVED); cma_deref_id(id_priv); return ret; } 
EXPORT_SYMBOL(rdma_resolve_route); static int cma_bind_loopback(struct rdma_id_private *id_priv) { struct cma_device *cma_dev; struct ib_port_attr port_attr; union ib_gid gid; u16 pkey; int ret; u8 p; mutex_lock(&lock); if (list_empty(&dev_list)) { ret = -ENODEV; goto out; } list_for_each_entry(cma_dev, &dev_list, list) for (p = 1; p <= cma_dev->device->phys_port_cnt; ++p) if (!ib_query_port(cma_dev->device, p, &port_attr) && port_attr.state == IB_PORT_ACTIVE) goto port_found; p = 1; cma_dev = list_entry(dev_list.next, struct cma_device, list); port_found: ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid); if (ret) goto out; ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); if (ret) goto out; id_priv->id.route.addr.dev_addr.dev_type = (rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); out: mutex_unlock(&lock); return ret; } static void addr_handler(int status, struct sockaddr *src_addr, struct rdma_dev_addr *dev_addr, void *context) { struct rdma_id_private *id_priv = context; struct rdma_cm_event event; memset(&event, 0, sizeof event); mutex_lock(&id_priv->handler_mutex); /* * Grab mutex to block rdma_destroy_id() from removing the device while * we're trying to acquire it. 
*/ mutex_lock(&lock); if (!cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_RESOLVED)) { mutex_unlock(&lock); goto out; } if (!status && !id_priv->cma_dev) status = cma_acquire_dev(id_priv); mutex_unlock(&lock); if (status) { if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ADDR_BOUND)) goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; } else { memcpy(&id_priv->id.route.addr.src_addr, src_addr, ip_addr_size(src_addr)); event.event = RDMA_CM_EVENT_ADDR_RESOLVED; } if (id_priv->id.event_handler(&id_priv->id, &event)) { cma_exch(id_priv, CMA_DESTROYING); mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); rdma_destroy_id(&id_priv->id); return; } out: mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; struct sockaddr *src, *dst; union ib_gid gid; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_bind_loopback(id_priv); if (ret) goto err; } rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); src = (struct sockaddr *) &id_priv->id.route.addr.src_addr; if (cma_zero_addr(src)) { dst = (struct sockaddr *) &id_priv->id.route.addr.dst_addr; if ((src->sa_family = dst->sa_family) == AF_INET) { ((struct sockaddr_in *) src)->sin_addr.s_addr = ((struct sockaddr_in *) dst)->sin_addr.s_addr; } else { ipv6_addr_copy(&((struct sockaddr_in6 *) src)->sin6_addr, &((struct sockaddr_in6 *) dst)->sin6_addr); } } work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = CMA_ADDR_QUERY; work->new_state = CMA_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); return 0; err: kfree(work); return ret; } static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr) { if (!src_addr || !src_addr->sa_family) { src_addr = (struct 
sockaddr *) &id->route.addr.src_addr; if ((src_addr->sa_family = dst_addr->sa_family) == AF_INET6) { ((struct sockaddr_in6 *) src_addr)->sin6_scope_id = ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id; } } if (!cma_any_addr(src_addr)) return rdma_bind_addr(id, src_addr); else { struct sockaddr_in addr_in; memset(&addr_in, 0, sizeof addr_in); addr_in.sin_family = dst_addr->sa_family; addr_in.sin_len = sizeof addr_in; return rdma_bind_addr(id, (struct sockaddr *) &addr_in); } } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr, int timeout_ms) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id_priv->state == CMA_IDLE) { ret = cma_bind_addr(id, src_addr, dst_addr); if (ret) return ret; } if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_ADDR_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr)); if (cma_any_addr(dst_addr)) ret = cma_resolve_loopback(id_priv); else ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr, dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, id_priv); if (ret) goto err; return 0; err: cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_BOUND); cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct sockaddr_in *sin; sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; sin->sin_port = htons(bind_list->port); id_priv->bind_list = bind_list; hlist_add_head(&id_priv->node, &bind_list->owners); } static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; int port, ret; bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; do { ret = idr_get_new_above(ps, bind_list, snum, &port); } while ((ret == -EAGAIN) && 
idr_pre_get(ps, GFP_KERNEL)); if (ret) goto err1; if (port != snum) { ret = -EADDRNOTAVAIL; goto err2; } bind_list->ps = ps; bind_list->port = (unsigned short) port; cma_bind_port(bind_list, id_priv); return 0; err2: idr_remove(ps, port); err1: kfree(bind_list); return ret; } static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) { #if defined(INET) struct rdma_bind_list *bind_list; int port, ret, low, high; bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; retry: /* FIXME: add proper port randomization per like inet_csk_get_port */ do { ret = idr_get_new_above(ps, bind_list, next_port, &port); } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL)); if (ret) goto err1; inet_get_local_port_range(&low, &high); if (port > high) { if (next_port != low) { idr_remove(ps, port); next_port = low; goto retry; } ret = -EADDRNOTAVAIL; goto err2; } if (port == high) next_port = low; else next_port = port + 1; bind_list->ps = ps; bind_list->port = (unsigned short) port; cma_bind_port(bind_list, id_priv); return 0; err2: idr_remove(ps, port); err1: kfree(bind_list); return ret; #else return -ENOSPC; #endif } static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) { struct rdma_id_private *cur_id; struct sockaddr_in *sin, *cur_sin; struct rdma_bind_list *bind_list; struct hlist_node *node; unsigned short snum; sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; snum = ntohs(sin->sin_port); #ifdef __linux__ if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; #endif bind_list = idr_find(ps, snum); if (!bind_list) return cma_alloc_port(ps, id_priv, snum); /* * We don't support binding to any address if anyone is bound to * a specific address on the same port. 
*/ if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr)) return -EADDRNOTAVAIL; hlist_for_each_entry(cur_id, node, &bind_list->owners, node) { if (cma_any_addr((struct sockaddr *) &cur_id->id.route.addr.src_addr)) return -EADDRNOTAVAIL; cur_sin = (struct sockaddr_in *) &cur_id->id.route.addr.src_addr; if (sin->sin_addr.s_addr == cur_sin->sin_addr.s_addr) return -EADDRINUSE; } cma_bind_port(bind_list, id_priv); return 0; } static int cma_get_tcp_port(struct rdma_id_private *id_priv) { int ret; int size; struct socket *sock; ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret) return ret; #ifdef __linux__ ret = sock->ops->bind(sock, (struct sockaddr *) &id_priv->id.route.addr.src_addr, ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); #else + SOCK_LOCK(sock); + sock->so_options |= SO_REUSEADDR; + SOCK_UNLOCK(sock); + ret = -sobind(sock, (struct sockaddr *)&id_priv->id.route.addr.src_addr, curthread); #endif if (ret) { sock_release(sock); return ret; } size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr); ret = sock_getname(sock, (struct sockaddr *) &id_priv->id.route.addr.src_addr, &size, 0); if (ret) { sock_release(sock); return ret; } id_priv->sock = sock; return 0; } static int cma_get_port(struct rdma_id_private *id_priv) { + struct cma_device *cma_dev; struct idr *ps; int ret; switch (id_priv->id.ps) { case RDMA_PS_SDP: ps = &sdp_ps; break; case RDMA_PS_TCP: ps = &tcp_ps; - if (unify_tcp_port_space) { + + mutex_lock(&lock); + /* check if there are any iWarp IB devices present */ + list_for_each_entry(cma_dev, &dev_list, list) { + if (rdma_protocol_iwarp(cma_dev->device, 1)) { + id_priv->unify_ps_tcp = 1; + break; + } + } + mutex_unlock(&lock); + + if (id_priv->unify_ps_tcp) { ret = cma_get_tcp_port(id_priv); if (ret) goto out; } break; case RDMA_PS_UDP: ps = &udp_ps; break; case RDMA_PS_IPOIB: ps = &ipoib_ps; break; default: return -EPROTONOSUPPORT; } mutex_lock(&lock); if 
(cma_any_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)) ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); out: return ret; } static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, struct sockaddr *addr) { #if defined(INET6) struct sockaddr_in6 *sin6; if (addr->sa_family != AF_INET6) return 0; sin6 = (struct sockaddr_in6 *) addr; #ifdef __linux__ if ((ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) && #else if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) && #endif !sin6->sin6_scope_id) return -EINVAL; dev_addr->bound_dev_if = sin6->sin6_scope_id; #endif return 0; } int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; int ret; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) return -EAFNOSUPPORT; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND)) return -EINVAL; ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); if (ret) goto err1; if (!cma_any_addr(addr)) { ret = rdma_translate_ip(addr, &id->route.addr.dev_addr); if (ret) goto err1; mutex_lock(&lock); ret = cma_acquire_dev(id_priv); mutex_unlock(&lock); if (ret) goto err1; } memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr)); ret = cma_get_port(id_priv); if (ret) goto err2; return 0; err2: if (id_priv->cma_dev) { mutex_lock(&lock); cma_detach_from_dev(id_priv); mutex_unlock(&lock); } err1: cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_IDLE); return ret; } EXPORT_SYMBOL(rdma_bind_addr); static int cma_format_hdr(void *hdr, enum rdma_port_space ps, struct rdma_route *route) { struct cma_hdr *cma_hdr; struct sdp_hh *sdp_hdr; if (route->addr.src_addr.ss_family == AF_INET) { struct sockaddr_in *src4, *dst4; src4 = (struct sockaddr_in *) &route->addr.src_addr; dst4 = (struct sockaddr_in *) &route->addr.dst_addr; switch (ps) { case RDMA_PS_SDP: sdp_hdr = hdr; if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) 
return -EINVAL; sdp_set_ip_ver(sdp_hdr, 4); sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; sdp_hdr->port = src4->sin_port; break; default: cma_hdr = hdr; cma_hdr->cma_version = CMA_VERSION; cma_set_ip_ver(cma_hdr, 4); cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; cma_hdr->port = src4->sin_port; break; } } else { struct sockaddr_in6 *src6, *dst6; src6 = (struct sockaddr_in6 *) &route->addr.src_addr; dst6 = (struct sockaddr_in6 *) &route->addr.dst_addr; switch (ps) { case RDMA_PS_SDP: sdp_hdr = hdr; if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) return -EINVAL; sdp_set_ip_ver(sdp_hdr, 6); sdp_hdr->src_addr.ip6 = src6->sin6_addr; sdp_hdr->dst_addr.ip6 = dst6->sin6_addr; sdp_hdr->port = src6->sin6_port; break; default: cma_hdr = hdr; cma_hdr->cma_version = CMA_VERSION; cma_set_ip_ver(cma_hdr, 6); cma_hdr->src_addr.ip6 = src6->sin6_addr; cma_hdr->dst_addr.ip6 = dst6->sin6_addr; cma_hdr->port = src6->sin6_port; break; } } return 0; } static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event; struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret = 0; if (cma_disable_callback(id_priv, CMA_CONNECT)) return 0; memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_SIDR_REP_RECEIVED: event.param.ud.private_data = ib_event->private_data; event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; if (rep->status != IB_SIDR_SUCCESS) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = ib_event->param.sidr_rep_rcvd.status; break; } ret = cma_set_qkey(id_priv); if (ret) { event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = -EINVAL; break; } if (id_priv->qkey != rep->qkey) { event.event = 
RDMA_CM_EVENT_UNREACHABLE; event.status = -EINVAL; break; } ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num, id_priv->id.route.path_rec, &event.param.ud.ah_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; event.status = 0; break; default: printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; cma_exch(id_priv, CMA_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_sidr_req_param req; struct rdma_route *route; int ret; req.private_data_len = sizeof(struct cma_hdr) + conn_param->private_data_len; req.private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!req.private_data) return -ENOMEM; if (conn_param->private_data && conn_param->private_data_len) memcpy((void *) req.private_data + sizeof(struct cma_hdr), conn_param->private_data, conn_param->private_data_len); route = &id_priv->id.route; ret = cma_format_hdr((void *) req.private_data, id_priv->id.ps, route); if (ret) goto out; id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, id_priv); if (IS_ERR(id_priv->cm_id.ib)) { ret = PTR_ERR(id_priv->cm_id.ib); goto out; } req.path = route->path_rec; req.service_id = cma_get_service_id(id_priv->id.ps, (struct sockaddr *) &route->addr.dst_addr); req.timeout_ms = 1 << (cma_response_timeout - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } out: kfree(req.private_data); return ret; } static int cma_connect_ib(struct rdma_id_private *id_priv, struct 
rdma_conn_param *conn_param) { struct ib_cm_req_param req; struct rdma_route *route; void *private_data; int offset, ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv->id.ps); req.private_data_len = offset + conn_param->private_data_len; private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv); if (IS_ERR(id_priv->cm_id.ib)) { ret = PTR_ERR(id_priv->cm_id.ib); goto out; } route = &id_priv->id.route; ret = cma_format_hdr(private_data, id_priv->id.ps, route); if (ret) goto out; req.private_data = private_data; req.primary_path = &route->path_rec[0]; if (route->num_paths == 2) req.alternate_path = &route->path_rec[1]; req.service_id = cma_get_service_id(id_priv->id.ps, (struct sockaddr *) &route->addr.dst_addr); req.qp_num = id_priv->qp_num; req.qp_type = IB_QPT_RC; req.starting_psn = id_priv->seq_num; req.responder_resources = conn_param->responder_resources; req.initiator_depth = conn_param->initiator_depth; req.flow_control = conn_param->flow_control; req.retry_count = conn_param->retry_count; req.rnr_retry_count = conn_param->rnr_retry_count; req.remote_cm_response_timeout = cma_response_timeout; req.local_cm_response_timeout = cma_response_timeout; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 
1 : 0; ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: if (ret && !IS_ERR(id_priv->cm_id.ib)) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } kfree(private_data); return ret; } static int cma_connect_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_id *cm_id; struct sockaddr_in* sin; int ret; struct iw_cm_conn_param iw_param; cm_id = iw_create_cm_id(id_priv->id.device, id_priv->sock, cma_iw_handler, id_priv); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); goto out; } id_priv->cm_id.iw = cm_id; sin = (struct sockaddr_in*) &id_priv->id.route.addr.src_addr; cm_id->local_addr = *sin; sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr; cm_id->remote_addr = *sin; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; if (id_priv->id.qp) iw_param.qpn = id_priv->qp_num; else iw_param.qpn = conn_param->qp_num; ret = iw_cm_connect(cm_id, &iw_param); out: if (ret && !IS_ERR(cm_id)) { iw_destroy_cm_id(cm_id); id_priv->cm_id.iw = NULL; } return ret; } int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_CONNECT)) return -EINVAL; if (!id->qp) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (cma_is_ud_ps(id->ps)) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); - break; - case RDMA_TRANSPORT_IWARP: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) ret = cma_connect_iw(id_priv, conn_param); - break; - default: + else ret = 
-ENOSYS; - break; - } if (ret) goto err; return 0; err: cma_comp_exch(id_priv, CMA_CONNECT, CMA_ROUTE_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_connect); static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_rep_param rep; int ret; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; ret = cma_modify_qp_rts(id_priv, conn_param); if (ret) goto out; memset(&rep, 0, sizeof rep); rep.qp_num = id_priv->qp_num; rep.starting_psn = id_priv->seq_num; rep.private_data = conn_param->private_data; rep.private_data_len = conn_param->private_data_len; rep.responder_resources = conn_param->responder_resources; rep.initiator_depth = conn_param->initiator_depth; rep.failover_accepted = 0; rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = conn_param->rnr_retry_count; rep.srq = id_priv->srq ? 1 : 0; ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; } static int cma_accept_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_conn_param iw_param; int ret; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) return ret; iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; if (id_priv->id.qp) { iw_param.qpn = id_priv->qp_num; } else iw_param.qpn = conn_param->qp_num; return iw_cm_accept(id_priv->cm_id.iw, &iw_param); } static int cma_send_sidr_rep(struct rdma_id_private *id_priv, enum ib_cm_sidr_status status, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; int ret; memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { ret = cma_set_qkey(id_priv); if (ret) return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; } rep.private_data = private_data; rep.private_data_len = private_data_len; return ib_send_cm_sidr_rep(id_priv->cm_id.ib, 
&rep); } int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp(id_priv, CMA_CONNECT)) return -EINVAL; if (!id->qp && conn_param) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (cma_is_ud_ps(id->ps)) ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, conn_param->private_data, conn_param->private_data_len); else if (conn_param) ret = cma_accept_ib(id_priv, conn_param); else ret = cma_rep_recv(id_priv); - break; - case RDMA_TRANSPORT_IWARP: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) ret = cma_accept_iw(id_priv, conn_param); - break; - default: + else ret = -ENOSYS; - break; - } if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); rdma_reject(id, NULL, 0); return ret; } EXPORT_SYMBOL(rdma_accept); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_has_cm_dev(id_priv)) return -EINVAL; switch (id->device->node_type) { case RDMA_NODE_IB_CA: ret = ib_cm_notify(id_priv->cm_id.ib, event); break; default: ret = 0; break; } return ret; } EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, u8 private_data_len) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_has_cm_dev(id_priv)) return -EINVAL; - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (cma_is_ud_ps(id->ps)) ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, private_data, private_data_len); else ret = ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, private_data, 
private_data_len); - break; - case RDMA_TRANSPORT_IWARP: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); - break; - default: + } else ret = -ENOSYS; - break; - } + return ret; } EXPORT_SYMBOL(rdma_reject); int rdma_disconnect(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_has_cm_dev(id_priv)) return -EINVAL; - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { ret = cma_modify_qp_err(id_priv); if (ret) goto out; /* Initiate or respond to a disconnect. */ if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); - break; - case RDMA_TRANSPORT_IWARP: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); - break; - default: + } else ret = -EINVAL; - break; - } + out: return ret; } EXPORT_SYMBOL(rdma_disconnect); static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) { struct rdma_id_private *id_priv; struct cma_multicast *mc = multicast->context; struct rdma_cm_event event; int ret; id_priv = mc->id_priv; if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) && cma_disable_callback(id_priv, CMA_ADDR_RESOLVED)) return 0; mutex_lock(&id_priv->qp_mutex); if (!status && id_priv->id.qp) status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, multicast->rec.mlid); mutex_unlock(&id_priv->qp_mutex); memset(&event, 0, sizeof event); event.status = status; event.param.ud.private_data = mc->context; if (!status) { event.event = RDMA_CM_EVENT_MULTICAST_JOIN; ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, &event.param.ud.ah_attr); event.param.ud.qp_num = 0xFFFFFF; event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); } else event.event = RDMA_CM_EVENT_MULTICAST_ERROR; ret = 
id_priv->id.event_handler(&id_priv->id, &event); if (ret) { cma_exch(id_priv, CMA_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return 0; } mutex_unlock(&id_priv->handler_mutex); return 0; } static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr *addr, union ib_gid *mgid) { #if defined(INET) || defined(INET6) unsigned char mc_map[MAX_ADDR_LEN]; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; #endif #ifdef INET struct sockaddr_in *sin = (struct sockaddr_in *) addr; #endif #ifdef INET6 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; #endif if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); #ifdef INET6 } else if ((addr->sa_family == AF_INET6) && ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. */ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); #endif #ifdef INET } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); #endif } } static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct ib_sa_mcmember_rec rec; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; int ret; ib_addr_get_mgid(dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (ret) return ret; cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); if (id_priv->id.ps == RDMA_PS_UDP) rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = 1; 
comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; if (id_priv->id.ps == RDMA_PS_IPOIB) comp_mask |= IB_SA_MCMEMBER_REC_RATE | IB_SA_MCMEMBER_REC_RATE_SELECTOR; mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device, id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, cma_ib_mc_handler, mc); if (IS_ERR(mc->multicast.ib)) return PTR_ERR(mc->multicast.ib); return 0; } static void iboe_mcast_work_handler(struct work_struct *work) { struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work); struct cma_multicast *mc = mw->mc; struct ib_sa_multicast *m = mc->multicast.ib; mc->multicast.ib->context = mc; cma_ib_mc_handler(0, m); kref_put(&mc->mcref, release_mc); kfree(mw); } static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if (addr->sa_family == AF_INET6) memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); else { mgid->raw[0] = 0xff; mgid->raw[1] = 0x0e; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; mgid->raw[5] = 0; mgid->raw[6] = 0; mgid->raw[7] = 0; mgid->raw[8] = 0; mgid->raw[9] = 0; mgid->raw[10] = 0xff; mgid->raw[11] = 0xff; *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; } } static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct iboe_mcast_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int err; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; if (cma_zero_addr((struct sockaddr *)&mc->addr)) return -EINVAL; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; mc->multicast.ib = kzalloc(sizeof(struct 
ib_sa_multicast), GFP_KERNEL); if (!mc->multicast.ib) { err = -ENOMEM; goto out1; } cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid); mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); if (id_priv->id.ps == RDMA_PS_UDP) mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); if (dev_addr->bound_dev_if) ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); if (!ndev) { err = -ENODEV; goto out2; } mc->multicast.ib->rec.rate = iboe_get_rate(ndev); mc->multicast.ib->rec.hop_limit = 1; #ifdef __linux__ mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); #else mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->if_mtu); #endif dev_put(ndev); if (!mc->multicast.ib->rec.mtu) { err = -EINVAL; goto out2; } iboe_addr_get_sgid(dev_addr, &mc->multicast.ib->rec.port_gid); work->id = id_priv; work->mc = mc; INIT_WORK(&work->work, iboe_mcast_work_handler); kref_get(&mc->mcref); queue_work(cma_wq, &work->work); return 0; out2: kfree(mc->multicast.ib); out1: kfree(work); return err; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, void *context) { struct rdma_id_private *id_priv; struct cma_multicast *mc; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp(id_priv, CMA_ADDR_BOUND) && !cma_comp(id_priv, CMA_ADDR_RESOLVED)) return -EINVAL; mc = kmalloc(sizeof *mc, GFP_KERNEL); if (!mc) return -ENOMEM; memcpy(&mc->addr, addr, ip_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: switch (rdma_port_get_link_layer(id->device, id->port_num)) { case IB_LINK_LAYER_INFINIBAND: ret = cma_join_ib_multicast(id_priv, mc); break; case IB_LINK_LAYER_ETHERNET: kref_init(&mc->mcref); ret = cma_iboe_join_multicast(id_priv, mc); break; default: ret = -EINVAL; } break; default: ret = -ENOSYS; break; } if (ret) { spin_lock_irq(&id_priv->lock); 
list_del(&mc->list); spin_unlock_irq(&id_priv->lock); kfree(mc); } return ret; } EXPORT_SYMBOL(rdma_join_multicast); void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; struct cma_multicast *mc; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { if (!memcmp(&mc->addr, addr, ip_addr_size(addr))) { list_del(&mc->list); spin_unlock_irq(&id_priv->lock); if (id->qp) ib_detach_mcast(id->qp, &mc->multicast.ib->rec.mgid, mc->multicast.ib->rec.mlid); if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) { switch (rdma_port_get_link_layer(id->device, id->port_num)) { case IB_LINK_LAYER_INFINIBAND: ib_sa_free_multicast(mc->multicast.ib); kfree(mc); break; case IB_LINK_LAYER_ETHERNET: kref_put(&mc->mcref, release_mc); break; default: break; } } return; } } spin_unlock_irq(&id_priv->lock); } EXPORT_SYMBOL(rdma_leave_multicast); static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr; struct cma_ndev_work *work; dev_addr = &id_priv->id.route.addr.dev_addr; #ifdef __linux__ if ((dev_addr->bound_dev_if == ndev->ifindex) && memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", ndev->name, &id_priv->id); #else if ((dev_addr->bound_dev_if == ndev->if_index) && memcmp(dev_addr->src_dev_addr, IF_LLADDR(ndev), ndev->if_addrlen)) { printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", ndev->if_xname, &id_priv->id); #endif work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; INIT_WORK(&work->work, cma_ndev_work_handler); work->id = id_priv; work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; atomic_inc(&id_priv->refcount); queue_work(cma_wq, &work->work); } return 0; } static int cma_netdev_callback(struct notifier_block *self, unsigned long event, 
void *ctx) { struct net_device *ndev = (struct net_device *)ctx; struct cma_device *cma_dev; struct rdma_id_private *id_priv; int ret = NOTIFY_DONE; #ifdef __linux__ if (dev_net(ndev) != &init_net) return NOTIFY_DONE; if (event != NETDEV_BONDING_FAILOVER) return NOTIFY_DONE; if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING)) return NOTIFY_DONE; #else if (event != NETDEV_DOWN && event != NETDEV_UNREGISTER) return NOTIFY_DONE; #endif mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) list_for_each_entry(id_priv, &cma_dev->id_list, list) { ret = cma_netdev_change(ndev, id_priv); if (ret) goto out; } out: mutex_unlock(&lock); return ret; } static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; static void cma_add_one(struct ib_device *device) { struct cma_device *cma_dev; struct rdma_id_private *id_priv; cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); if (!cma_dev) return; cma_dev->device = device; init_completion(&cma_dev->comp); atomic_set(&cma_dev->refcount, 1); INIT_LIST_HEAD(&cma_dev->id_list); ib_set_client_data(device, &cma_client, cma_dev); mutex_lock(&lock); list_add_tail(&cma_dev->list, &dev_list); list_for_each_entry(id_priv, &listen_any_list, list) cma_listen_on_dev(id_priv, cma_dev); mutex_unlock(&lock); } static int cma_remove_id_dev(struct rdma_id_private *id_priv) { struct rdma_cm_event event; enum cma_state state; int ret = 0; /* Record that we want to remove the device */ state = cma_exch(id_priv, CMA_DEVICE_REMOVAL); if (state == CMA_DESTROYING) return 0; cma_cancel_operation(id_priv, state); mutex_lock(&id_priv->handler_mutex); /* Check for destruction from another callback. 
*/ if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL)) goto out; memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_DEVICE_REMOVAL; ret = id_priv->id.event_handler(&id_priv->id, &event); out: mutex_unlock(&id_priv->handler_mutex); return ret; } static void cma_process_remove(struct cma_device *cma_dev) { struct rdma_id_private *id_priv; int ret; mutex_lock(&lock); while (!list_empty(&cma_dev->id_list)) { id_priv = list_entry(cma_dev->id_list.next, struct rdma_id_private, list); list_del(&id_priv->listen_list); list_del_init(&id_priv->list); atomic_inc(&id_priv->refcount); mutex_unlock(&lock); ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv); cma_deref_id(id_priv); if (ret) rdma_destroy_id(&id_priv->id); mutex_lock(&lock); } mutex_unlock(&lock); cma_deref_dev(cma_dev); wait_for_completion(&cma_dev->comp); } static void cma_remove_one(struct ib_device *device) { struct cma_device *cma_dev; cma_dev = ib_get_client_data(device, &cma_client); if (!cma_dev) return; mutex_lock(&lock); list_del(&cma_dev->list); mutex_unlock(&lock); cma_process_remove(cma_dev); kfree(cma_dev); } static int cma_init(void) { int ret; #if defined(INET) int low, high, remaining; get_random_bytes(&next_port, sizeof next_port); inet_get_local_port_range(&low, &high); remaining = (high - low) + 1; next_port = ((unsigned int) next_port % remaining) + low; #endif cma_wq = create_singlethread_workqueue("rdma_cm"); if (!cma_wq) return -ENOMEM; ib_sa_register_client(&sa_client); rdma_addr_register_client(&addr_client); register_netdevice_notifier(&cma_nb); ret = ib_register_client(&cma_client); if (ret) goto err; return 0; err: unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); destroy_workqueue(cma_wq); return ret; } static void cma_cleanup(void) { ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); 
destroy_workqueue(cma_wq); idr_destroy(&sdp_ps); idr_destroy(&tcp_ps); idr_destroy(&udp_ps); idr_destroy(&ipoib_ps); } module_init(cma_init); module_exit(cma_cleanup); Index: stable/10/sys/ofed/drivers/infiniband/core/device.c =================================================================== --- stable/10/sys/ofed/drivers/infiniband/core/device.c (revision 325610) +++ stable/10/sys/ofed/drivers/infiniband/core/device.c (revision 325611) @@ -1,771 +1,769 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include #include #include #include "core_priv.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); #ifdef __ia64__ /* workaround for a bug in hp chipset that would cause kernel panic when dma resources are exhaused */ int dma_map_sg_hp_wa = 0; #endif struct ib_client_data { struct list_head list; struct ib_client *client; void * data; }; static LIST_HEAD(device_list); static LIST_HEAD(client_list); /* * device_mutex protects access to both device_list and client_list. * There's no real point to using multiple locks or something fancier * like an rwsem: we always access both lists, and we're always * modifying one list or the other list. In any case this is not a * hot path so there's no point in trying to optimize. */ static DEFINE_MUTEX(device_mutex); static int ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x } static const struct { size_t offset; char *name; } mandatory_table[] = { IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), IB_MANDATORY_FUNC(query_pkey), IB_MANDATORY_FUNC(query_gid), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_ah), IB_MANDATORY_FUNC(destroy_ah), IB_MANDATORY_FUNC(create_qp), IB_MANDATORY_FUNC(modify_qp), IB_MANDATORY_FUNC(destroy_qp), IB_MANDATORY_FUNC(post_send), IB_MANDATORY_FUNC(post_recv), IB_MANDATORY_FUNC(create_cq), IB_MANDATORY_FUNC(destroy_cq), IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), - IB_MANDATORY_FUNC(dereg_mr) + IB_MANDATORY_FUNC(dereg_mr), + IB_MANDATORY_FUNC(get_port_immutable) }; int i; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((u_char *) device + mandatory_table[i].offset)) { printk(KERN_WARNING "Device %s is missing mandatory function %s\n", device->name, mandatory_table[i].name); return -EINVAL; } } return 0; } static 
struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; list_for_each_entry(device, &device_list, core_list) if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX)) return device; return NULL; } static int alloc_name(char *name) { unsigned long *inuse; char buf[IB_DEVICE_NAME_MAX]; struct ib_device *device; int i; inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL); if (!inuse) return -ENOMEM; list_for_each_entry(device, &device_list, core_list) { if (!sscanf(device->name, name, &i)) continue; if (i < 0 || i >= PAGE_SIZE * 8) continue; snprintf(buf, sizeof buf, name, i); if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX)) set_bit(i, inuse); } i = find_first_zero_bit(inuse, PAGE_SIZE * 8); free_page((unsigned long) inuse); snprintf(buf, sizeof buf, name, i); if (__ib_device_get_by_name(buf)) return -ENFILE; strlcpy(name, buf, IB_DEVICE_NAME_MAX); return 0; } -static int start_port(struct ib_device *device) +static int rdma_start_port(struct ib_device *device) { return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; } -static int end_port(struct ib_device *device) +static int rdma_end_port(struct ib_device *device) { return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : device->phys_port_cnt; } /** * ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, * including any private data used by the low-level driver. * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ struct ib_device *ib_alloc_device(size_t size) { BUG_ON(size < sizeof (struct ib_device)); return kzalloc(size, GFP_KERNEL); } EXPORT_SYMBOL(ib_alloc_device); /** * ib_dealloc_device - free an IB device struct * @device:structure to free * * Free a structure allocated with ib_alloc_device(). 
*/ void ib_dealloc_device(struct ib_device *device) { if (device->reg_state == IB_DEV_UNINITIALIZED) { + kfree(device->port_immutable); kfree(device); return; } BUG_ON(device->reg_state != IB_DEV_UNREGISTERED); kobject_put(&device->dev.kobj); } EXPORT_SYMBOL(ib_dealloc_device); static int add_client_context(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; unsigned long flags; context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) { printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n", device->name, client->name); return -ENOMEM; } context->client = client; context->data = NULL; spin_lock_irqsave(&device->client_data_lock, flags); list_add(&context->list, &device->client_data_list); spin_unlock_irqrestore(&device->client_data_lock, flags); return 0; } -static int read_port_table_lengths(struct ib_device *device) +static int verify_immutable(const struct ib_device *dev, u8 port) { - struct ib_port_attr *tprops = NULL; - int num_ports, ret = -ENOMEM; - u8 port_index; + return WARN_ON(!rdma_cap_ib_mad(dev, port) && + rdma_max_mad_size(dev, port) != 0); +} - tprops = kmalloc(sizeof *tprops, GFP_KERNEL); - if (!tprops) - goto out; +static int read_port_immutable(struct ib_device *device) +{ + int ret; + u8 start_port = rdma_start_port(device); + u8 end_port = rdma_end_port(device); + u8 port; - num_ports = end_port(device) - start_port(device) + 1; + /** + * device->port_immutable is indexed directly by the port number to make + * access to this data as efficient as possible. + * + * Therefore port_immutable is declared as a 1 based array with + * potential empty slots at the beginning. 
+ */ + device->port_immutable = kzalloc(sizeof(*device->port_immutable) + * (end_port + 1), + GFP_KERNEL); + if (!device->port_immutable) + return -ENOMEM; - device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports, - GFP_KERNEL); - device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports, - GFP_KERNEL); - if (!device->pkey_tbl_len || !device->gid_tbl_len) - goto err; - - for (port_index = 0; port_index < num_ports; ++port_index) { - ret = ib_query_port(device, port_index + start_port(device), - tprops); + for (port = start_port; port <= end_port; ++port) { + ret = device->get_port_immutable(device, port, + &device->port_immutable[port]); if (ret) - goto err; - device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len; - device->gid_tbl_len[port_index] = tprops->gid_tbl_len; - } + return ret; - ret = 0; - goto out; - -err: - kfree(device->gid_tbl_len); - kfree(device->pkey_tbl_len); -out: - kfree(tprops); - return ret; + if (verify_immutable(device, port)) + return -EINVAL; + } + return 0; } /** * ib_register_device - Register an IB device with IB core * @device:Device to register * * Low-level drivers use ib_register_device() to register their * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). 
*/ int ib_register_device(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)) { int ret; mutex_lock(&device_mutex); if (strchr(device->name, '%')) { ret = alloc_name(device->name); if (ret) goto out; } if (ib_device_check_mandatory(device)) { ret = -EINVAL; goto out; } INIT_LIST_HEAD(&device->event_handler_list); INIT_LIST_HEAD(&device->client_data_list); spin_lock_init(&device->event_handler_lock); spin_lock_init(&device->client_data_lock); device->ib_uverbs_xrcd_table = RB_ROOT; mutex_init(&device->xrcd_table_mutex); - ret = read_port_table_lengths(device); + + ret = read_port_immutable(device); if (ret) { - printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n", - device->name); + printk(KERN_WARNING "Couldn't create per port immutable data %s\n", + device->name); goto out; } ret = ib_device_register_sysfs(device, port_callback); if (ret) { printk(KERN_WARNING "Couldn't register device %s with driver model\n", device->name); - kfree(device->gid_tbl_len); - kfree(device->pkey_tbl_len); + kfree(device->port_immutable); goto out; } list_add_tail(&device->core_list, &device_list); device->reg_state = IB_DEV_REGISTERED; { struct ib_client *client; list_for_each_entry(client, &client_list, list) if (client->add && !add_client_context(device, client)) client->add(device); } out: mutex_unlock(&device_mutex); return ret; } EXPORT_SYMBOL(ib_register_device); /** * ib_unregister_device - Unregister an IB device * @device:Device to unregister * * Unregister an IB device. All clients will receive a remove callback. 
*/ void ib_unregister_device(struct ib_device *device) { struct ib_client *client; struct ib_client_data *context, *tmp; unsigned long flags; mutex_lock(&device_mutex); list_for_each_entry_reverse(client, &client_list, list) if (client->remove) client->remove(device); list_del(&device->core_list); - kfree(device->gid_tbl_len); - kfree(device->pkey_tbl_len); - mutex_unlock(&device_mutex); ib_device_unregister_sysfs(device); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) kfree(context); spin_unlock_irqrestore(&device->client_data_lock, flags); device->reg_state = IB_DEV_UNREGISTERED; } EXPORT_SYMBOL(ib_unregister_device); /** * ib_register_client - Register an IB client * @client:Client to register * * Upper level users of the IB drivers can use ib_register_client() to * register callbacks for IB device addition and removal. When an IB * device is added, each registered client's add method will be called * (in the order the clients were registered), and when a device is * removed, each client's remove method will be called (in the reverse * order that clients were registered). In addition, when * ib_register_client() is called, the client will receive an add * callback for all devices already registered. */ int ib_register_client(struct ib_client *client) { struct ib_device *device; mutex_lock(&device_mutex); list_add_tail(&client->list, &client_list); list_for_each_entry(device, &device_list, core_list) if (client->add && !add_client_context(device, client)) client->add(device); mutex_unlock(&device_mutex); return 0; } EXPORT_SYMBOL(ib_register_client); /** * ib_unregister_client - Unregister an IB client * @client:Client to unregister * * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. 
*/ void ib_unregister_client(struct ib_client *client) { struct ib_client_data *context, *tmp; struct ib_device *device; unsigned long flags; mutex_lock(&device_mutex); list_for_each_entry(device, &device_list, core_list) { if (client->remove) client->remove(device); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) if (context->client == client) { list_del(&context->list); kfree(context); } spin_unlock_irqrestore(&device->client_data_lock, flags); } list_del(&client->list); mutex_unlock(&device_mutex); } EXPORT_SYMBOL(ib_unregister_client); /** * ib_get_client_data - Get IB client context * @device:Device to get context for * @client:Client to get context for * * ib_get_client_data() returns client context set with * ib_set_client_data(). */ void *ib_get_client_data(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; void *ret = NULL; unsigned long flags; spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { ret = context->data; break; } spin_unlock_irqrestore(&device->client_data_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_client_data); /** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * * ib_set_client_data() sets client context that can be retrieved with * ib_get_client_data(). 
*/ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { struct ib_client_data *context; unsigned long flags; spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->data = data; goto out; } printk(KERN_WARNING "No client context found for %s/%s\n", device->name, client->name); out: spin_unlock_irqrestore(&device->client_data_lock, flags); } EXPORT_SYMBOL(ib_set_client_data); /** * ib_register_event_handler - Register an IB event handler * @event_handler:Handler to register * * ib_register_event_handler() registers an event handler that will be * called back when asynchronous IB events occur (as defined in * chapter 11 of the InfiniBand Architecture Specification). This * callback may occur in interrupt context. */ int ib_register_event_handler (struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); return 0; } EXPORT_SYMBOL(ib_register_event_handler); /** * ib_unregister_event_handler - Unregister an event handler * @event_handler:Handler to unregister * * Unregister an event handler registered with * ib_register_event_handler(). */ int ib_unregister_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_del(&event_handler->list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); return 0; } EXPORT_SYMBOL(ib_unregister_event_handler); /** * ib_dispatch_event - Dispatch an asynchronous event * @event:Event to dispatch * * Low-level drivers must call ib_dispatch_event() to dispatch the * event to all registered event handlers when an asynchronous event * occurs. 
*/ void ib_dispatch_event(struct ib_event *event) { unsigned long flags; struct ib_event_handler *handler; spin_lock_irqsave(&event->device->event_handler_lock, flags); list_for_each_entry(handler, &event->device->event_handler_list, list) handler->handler(handler, event); spin_unlock_irqrestore(&event->device->event_handler_lock, flags); } EXPORT_SYMBOL(ib_dispatch_event); /** * ib_query_device - Query IB device attributes * @device:Device to query * @device_attr:Device attributes * * ib_query_device() returns the attributes of a device through the * @device_attr pointer. */ int ib_query_device(struct ib_device *device, struct ib_device_attr *device_attr) { return device->query_device(device, device_attr); } EXPORT_SYMBOL(ib_query_device); /** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query * @port_attr:Port attributes * * ib_query_port() returns the attributes of a port through the * @port_attr pointer. */ int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr) { - if (port_num < start_port(device) || port_num > end_port(device)) + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; return device->query_port(device, port_num, port_attr); } EXPORT_SYMBOL(ib_query_port); /** * ib_query_gid - Get GID table entry * @device:Device to query * @port_num:Port number to query * @index:GID table index to query * @gid:Returned GID * * ib_query_gid() fetches the specified GID table entry. */ int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid) { return device->query_gid(device, port_num, index, gid); } EXPORT_SYMBOL(ib_query_gid); /** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query * @index:P_Key table index to query * @pkey:Returned P_Key * * ib_query_pkey() fetches the specified P_Key table entry. 
*/ int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey) { return device->query_pkey(device, port_num, index, pkey); } EXPORT_SYMBOL(ib_query_pkey); /** * ib_modify_device - Change IB device attributes * @device:Device to modify * @device_modify_mask:Mask of attributes to change * @device_modify:New attribute values * * ib_modify_device() changes a device's attributes as specified by * the @device_modify_mask and @device_modify structure. */ int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify) { return device->modify_device(device, device_modify_mask, device_modify); } EXPORT_SYMBOL(ib_modify_device); /** * ib_modify_port - Modifies the attributes for the specified port. * @device: The device to modify. * @port_num: The number of the port to modify. * @port_modify_mask: Mask used to specify which attributes of the port * to change. * @port_modify: New attribute values for the port. * * ib_modify_port() changes a port's attributes as specified by the * @port_modify_mask and @port_modify structure. */ int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { - if (port_num < start_port(device) || port_num > end_port(device)) + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; return device->modify_port(device, port_num, port_modify_mask, port_modify); } EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. 
*/ int ib_find_gid(struct ib_device *device, union ib_gid *gid, u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; - for (port = start_port(device); port <= end_port(device); ++port) { - for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) { + for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { + for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { ret = ib_query_gid(device, port, i, &tmp_gid); if (ret) return ret; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { *port_num = port; if (index) *index = i; return 0; } } } return -ENOENT; } EXPORT_SYMBOL(ib_find_gid); /** * ib_find_pkey - Returns the PKey table index where a specified * PKey value occurs. * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the PKey table where the PKey was found. */ int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index) { int ret, i; u16 tmp_pkey; - for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) { + for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { *index = i; return 0; } } return -ENOENT; } EXPORT_SYMBOL(ib_find_pkey); static int __init ib_core_init(void) { int ret; #ifdef __ia64__ if (ia64_platform_is("hpzx1")) dma_map_sg_hp_wa = 1; #endif ret = ib_sysfs_setup(); if (ret) printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); ret = ib_cache_setup(); if (ret) { printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); ib_sysfs_cleanup(); } return ret; } static void __exit ib_core_cleanup(void) { ib_cache_cleanup(); ib_sysfs_cleanup(); /* Make sure that any pending umem accounting work is done. 
*/ flush_scheduled_work(); } module_init(ib_core_init); module_exit(ib_core_cleanup); #undef MODULE_VERSION #include static int ibcore_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t ibcore_mod = { .name = "ibcore", .evhand = ibcore_evhand, }; MODULE_VERSION(ibcore, 1); DECLARE_MODULE(ibcore, ibcore_mod, SI_SUB_SMP, SI_ORDER_ANY); Index: stable/10/sys/ofed/drivers/infiniband/hw/mlx4/main.c =================================================================== --- stable/10/sys/ofed/drivers/infiniband/hw/mlx4/main.c (revision 325610) +++ stable/10/sys/ofed/drivers/infiniband/hw/mlx4/main.c (revision 325611) @@ -1,2424 +1,2457 @@ /* * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #define LINUXKPI_PARAM_PREFIX mlx4_ #include #ifdef __linux__ #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mlx4_ib.h" #include "user.h" #include "wc.h" #define DRV_NAME MLX4_IB_DRV_NAME #define DRV_VERSION "1.0" #define DRV_RELDATE "April 4, 2008" #define MLX4_IB_DRIVER_PROC_DIR_NAME "driver/mlx4_ib" #define MLX4_IB_MRS_PROC_DIR_NAME "mrs" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRV_VERSION); int mlx4_ib_sm_guid_assign = 1; #ifdef __linux__ struct proc_dir_entry *mlx4_mrs_dir_entry; static struct proc_dir_entry *mlx4_ib_driver_dir_entry; #endif module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)"); static char dev_assign_str[512]; //module_param_string(dev_assign_str, dev_assign_str, sizeof(dev_assign_str), 0644); MODULE_PARM_DESC(dev_assign_str, "Map all device function numbers to " "IB device numbers following the pattern: " "bb:dd.f-0,bb:dd.f-1,... (all numbers are hexadecimals)." 
" Max supported devices - 32"); static const char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; struct update_gid_work { struct work_struct work; union ib_gid gids[128]; struct mlx4_ib_dev *dev; int port; }; struct dev_rec { int bus; int dev; int func; int nr; }; #define MAX_DR 32 static struct dev_rec dr[MAX_DR]; static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); static struct workqueue_struct *wq; static void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; mad->class_version = 1; mad->method = IB_MGMT_METHOD_GET; } static union ib_gid zgid; static int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; memset(props, 0, sizeof *props); props->fw_ver = dev->dev->caps.fw_ver; props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK | IB_DEVICE_SHARED_MR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM) props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT) props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) 
props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; if (dev->dev->caps.max_gso_sz && dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH) props->device_cap_flags |= IB_DEVICE_UD_TSO; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR)) props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) props->device_cap_flags |= IB_DEVICE_XRC; props->device_cap_flags |= IB_DEVICE_QPG; if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) { props->device_cap_flags |= IB_DEVICE_UD_RSS; props->max_rss_tbl_sz = dev->dev->caps.max_rss_tbl_sz; } props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; props->vendor_part_id = dev->dev->pdev->device; props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&props->sys_image_guid, out_mad->data + 4, 8); props->max_mr_size = ~0ull; props->page_size_cap = dev->dev->caps.page_size_cap; props->max_qp = dev->dev->quotas.qp; props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; props->max_mr = dev->dev->quotas.mpt; props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq = dev->dev->quotas.srq; props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; props->max_srq_sge = dev->dev->caps.max_srq_sge; props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; props->atomic_cap = dev->dev->caps.flags & 
MLX4_DEV_CAP_FLAG_ATOMIC ? IB_ATOMIC_HCA : IB_ATOMIC_NONE; props->masked_atomic_cap = props->atomic_cap; props->max_pkeys = dev->dev->caps.pkey_table_len[1]; props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms; props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; out: kfree(in_mad); kfree(out_mad); return err; } static enum rdma_link_layer mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) { struct mlx4_dev *dev = to_mdev(device)->dev; return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ? IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; } static int ib_link_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int ext_active_speed; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); props->lmc = out_mad->data[34] & 0x7; props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); props->sm_sl = out_mad->data[36] & 0xf; props->state = out_mad->data[32] & 0xf; props->phys_state = out_mad->data[33] >> 4; props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); if (netw_view) props->gid_tbl_len = out_mad->data[50]; else props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; props->pkey_tbl_len = 
to_mdev(ibdev)->dev->caps.pkey_table_len[port]; props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); props->active_width = out_mad->data[31] & 0xf; props->active_speed = out_mad->data[35] >> 4; props->max_mtu = out_mad->data[41] & 0xf; props->active_mtu = out_mad->data[36] >> 4; props->subnet_timeout = out_mad->data[51] & 0x1f; props->max_vl_num = out_mad->data[37] >> 4; props->init_type_reply = out_mad->data[41] >> 4; /* Check if extended speeds (EDR/FDR/...) are supported */ if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { ext_active_speed = out_mad->data[62] >> 4; switch (ext_active_speed) { case 1: props->active_speed = IB_SPEED_FDR; break; case 2: props->active_speed = IB_SPEED_EDR; break; } } /* If reported active speed is QDR, check if is FDR-10 */ if (props->active_speed == IB_SPEED_QDR) { init_query_mad(in_mad); in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; /* Checking LinkSpeedActive for FDR-10 */ if (out_mad->data[15] & 0x1) props->active_speed = IB_SPEED_FDR10; } /* Avoid wrong speed value returned by FW if the IB link is down. */ if (props->state == IB_PORT_DOWN) props->active_speed = IB_SPEED_SDR; out: kfree(in_mad); kfree(out_mad); return err; } static u8 state_to_phys_state(enum ib_port_state state) { return state == IB_PORT_ACTIVE ? 
5 : 3; } static int eth_link_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { struct mlx4_ib_dev *mdev = to_mdev(ibdev); struct mlx4_ib_iboe *iboe = &mdev->iboe; struct net_device *ndev; enum ib_mtu tmp; struct mlx4_cmd_mailbox *mailbox; int err = 0; mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) goto out; props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ? IB_WIDTH_4X : IB_WIDTH_1X; props->active_speed = IB_SPEED_QDR; props->port_cap_flags = IB_PORT_CM_SUP; if (netw_view) props->gid_tbl_len = MLX4_ROCE_MAX_GIDS; else props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; props->max_msg_sz = mdev->dev->caps.max_msg_sz; props->pkey_tbl_len = 1; props->max_mtu = IB_MTU_4096; props->max_vl_num = 2; props->state = IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); props->active_mtu = IB_MTU_256; spin_lock(&iboe->lock); ndev = iboe->netdevs[port - 1]; if (!ndev) goto out_unlock; tmp = iboe_get_mtu(ndev->if_mtu); props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256; props->state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? IB_PORT_ACTIVE : IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); out_unlock: spin_unlock(&iboe->lock); out: mlx4_free_cmd_mailbox(mdev->dev, mailbox); return err; } int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { int err; memset(props, 0, sizeof *props); err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? 
ib_link_query_port(ibdev, port, props, netw_view) : eth_link_query_port(ibdev, port, props, netw_view); return err; } static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { /* returns host view */ return __mlx4_ib_query_port(ibdev, port, props, 0); } int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; struct mlx4_ib_dev *dev = to_mdev(ibdev); int clear = 0; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); if (mlx4_is_mfunc(dev->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw, out_mad->data + 8, 8); if (mlx4_is_mfunc(dev->dev) && !netw_view) { if (index) { /* For any index > 0, return the null guid */ err = 0; clear = 1; goto out; } } init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; in_mad->attr_mod = cpu_to_be32(index / 8); err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); out: if (clear) memset(gid->raw + 8, 0, 8); kfree(in_mad); kfree(out_mad); return err; } static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct mlx4_ib_dev *dev = to_mdev(ibdev); *gid = dev->iboe.gid_table[port - 1][index]; return 0; } static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) return __mlx4_ib_query_gid(ibdev, port, index, gid, 0); else return iboe_query_gid(ibdev, port, index, gid); } int 
__mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; in_mad->attr_mod = cpu_to_be32(index / 32); if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); out: kfree(in_mad); kfree(out_mad); return err; } static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0); } static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { struct mlx4_cmd_mailbox *mailbox; unsigned long flags; if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) return 0; if (mlx4_is_slave(to_mdev(ibdev)->dev)) return -EOPNOTSUPP; spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags); memcpy(ibdev->node_desc, props->node_desc, 64); spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags); /* * If possible, pass node desc to FW, so it can generate * a 144 trap. If cmd fails, just ignore. 
*/ mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev); if (IS_ERR(mailbox)) return 0; memset(mailbox->buf, 0, 256); memcpy(mailbox->buf, props->node_desc, 64); mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox); return 0; } static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, u32 cap_mask) { struct mlx4_cmd_mailbox *mailbox; int err; u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; mailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memset(mailbox->buf, 0, 256); if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); } else { ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols; ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); } err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev->dev, mailbox); return err; } static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { struct ib_port_attr attr; u32 cap_mask; int err; mutex_lock(&to_mdev(ibdev)->cap_mask_mutex); err = mlx4_ib_query_port(ibdev, port, &attr); if (err) goto out; cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; err = mlx4_SET_PORT(to_mdev(ibdev), port, !!(mask & IB_PORT_RESET_QKEY_CNTR), cap_mask); out: mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); return err; } static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct mlx4_ib_ucontext *context; struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; struct mlx4_ib_alloc_ucontext_resp resp; int err; if (!dev->ib_active) return ERR_PTR(-EAGAIN); if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { 
resp_v3.qp_tab_size = dev->dev->caps.num_qps; if (mlx4_wc_enabled()) { resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; } else { resp_v3.bf_reg_size = 0; resp_v3.bf_regs_per_page = 0; } } else { resp.dev_caps = dev->dev->caps.userspace_caps; resp.qp_tab_size = dev->dev->caps.num_qps; if (mlx4_wc_enabled()) { resp.bf_reg_size = dev->dev->caps.bf_reg_size; resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; } else { resp.bf_reg_size = 0; resp.bf_regs_per_page = 0; } resp.cqe_size = dev->dev->caps.cqe_size; } context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar); if (err) { kfree(context); return ERR_PTR(err); } INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); else err = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (err) { mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); kfree(context); return ERR_PTR(-EFAULT); } return &context->ibucontext; } static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar); kfree(context); return 0; } #ifdef __linux__ static unsigned long mlx4_ib_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm; struct vm_area_struct *vma; unsigned long start_addr; unsigned long page_size_order; unsigned long command; mm = current->mm; if (addr) return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); /* Last 8 bits hold the command others are data per that command */ command = pgoff & MLX4_IB_MMAP_CMD_MASK; if (command != MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) return current->mm->get_unmapped_area(file, addr, len, 
pgoff, flags); page_size_order = pgoff >> MLX4_IB_MMAP_CMD_BITS; /* code is based on the huge-pages get_unmapped_area code */ start_addr = mm->free_area_cache; if (len <= mm->cached_hole_size) start_addr = TASK_UNMAPPED_BASE; full_search: addr = ALIGN(start_addr, 1 << page_size_order); for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ if (TASK_SIZE - len < addr) { /* * Start a new search - just in case we missed * some holes. */ if (start_addr != TASK_UNMAPPED_BASE) { start_addr = TASK_UNMAPPED_BASE; goto full_search; } return -ENOMEM; } if (!vma || addr + len <= vma->vm_start) return addr; addr = ALIGN(vma->vm_end, 1 << page_size_order); } } #endif static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct mlx4_ib_dev *dev = to_mdev(context->device); int err; /* Last 8 bits hold the command others are data per that command */ unsigned long command = vma->vm_pgoff & MLX4_IB_MMAP_CMD_MASK; if (command < MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) { /* compatability handling for commands 0 & 1*/ if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; } if (command == MLX4_IB_MMAP_UAR_PAGE) { vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else if (command == MLX4_IB_MMAP_BLUE_FLAME_PAGE && dev->dev->caps.bf_reg_size != 0) { vma->vm_page_prot = pgprot_wc(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn + dev->dev->caps.num_uars, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else if (command == MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) { /* Getting contiguous physical pages */ unsigned long total_size = vma->vm_end - vma->vm_start; unsigned long page_size_order = (vma->vm_pgoff) >> MLX4_IB_MMAP_CMD_BITS; struct ib_cmem *ib_cmem; ib_cmem = ib_cmem_alloc_contiguous_pages(context, total_size, page_size_order); if 
(IS_ERR(ib_cmem)) { err = PTR_ERR(ib_cmem); return err; } err = ib_cmem_map_contiguous_pages_to_vma(ib_cmem, vma); if (err) { ib_cmem_release_contiguous_pages(ib_cmem); return err; } return 0; } else return -EINVAL; return 0; } static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx4_ib_pd *pd; int err; pd = kmalloc(sizeof *pd, GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn); if (err) { kfree(pd); return ERR_PTR(err); } if (context) if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) { mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); kfree(pd); return ERR_PTR(-EFAULT); } return &pd->ibpd; } static int mlx4_ib_dealloc_pd(struct ib_pd *pd) { mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); kfree(pd); return 0; } static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx4_ib_xrcd *xrcd; int err; if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) return ERR_PTR(-ENOSYS); xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL); if (!xrcd) return ERR_PTR(-ENOMEM); err = mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn); if (err) goto err1; xrcd->pd = ib_alloc_pd(ibdev); if (IS_ERR(xrcd->pd)) { err = PTR_ERR(xrcd->pd); goto err2; } xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, 1, 0); if (IS_ERR(xrcd->cq)) { err = PTR_ERR(xrcd->cq); goto err3; } return &xrcd->ibxrcd; err3: ib_dealloc_pd(xrcd->pd); err2: mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn); err1: kfree(xrcd); return ERR_PTR(err); } static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd) { ib_destroy_cq(to_mxrcd(xrcd)->cq); ib_dealloc_pd(to_mxrcd(xrcd)->pd); mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn); kfree(xrcd); return 0; } static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) { struct mlx4_ib_qp *mqp = to_mqp(ibqp); struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct 
mlx4_ib_gid_entry *ge; ge = kzalloc(sizeof *ge, GFP_KERNEL); if (!ge) return -ENOMEM; ge->gid = *gid; if (mlx4_ib_add_mc(mdev, mqp, gid)) { ge->port = mqp->port; ge->added = 1; } mutex_lock(&mqp->mutex); list_add_tail(&ge->list, &mqp->gid_list); mutex_unlock(&mqp->mutex); return 0; } int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid) { u8 mac[6]; struct net_device *ndev; int ret = 0; if (!mqp->port) return 0; spin_lock(&mdev->iboe.lock); ndev = mdev->iboe.netdevs[mqp->port - 1]; if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); if (ndev) { rdma_get_mcast_mac((struct in6_addr *)gid, mac); rtnl_lock(); dev_mc_add(mdev->iboe.netdevs[mqp->port - 1], mac, 6, 0); ret = 1; rtnl_unlock(); dev_put(ndev); } return ret; } struct mlx4_ib_steering { struct list_head list; u64 reg_id; union ib_gid gid; }; static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); u64 reg_id; struct mlx4_ib_steering *ib_steering = NULL; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL); if (!ib_steering) return -ENOMEM; } err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port, !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), MLX4_PROT_IB_IPV6, ®_id); if (err) goto err_malloc; err = add_gid_entry(ibqp, gid); if (err) goto err_add; if (ib_steering) { memcpy(ib_steering->gid.raw, gid->raw, 16); ib_steering->reg_id = reg_id; mutex_lock(&mqp->mutex); list_add(&ib_steering->list, &mqp->steering_rules); mutex_unlock(&mqp->mutex); } return 0; err_add: mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, MLX4_PROT_IB_IPV6, reg_id); err_malloc: kfree(ib_steering); return err; } enum { IBV_FLOW_L4_NONE = 0, IBV_FLOW_L4_OTHER = 3, IBV_FLOW_L4_UDP = 5, IBV_FLOW_L4_TCP = 6 }; struct mlx4_cm_steering { struct list_head list; u64 reg_id; struct 
ib_flow_spec spec; }; static int flow_spec_to_net_rule(struct ib_device *dev, struct ib_flow_spec *flow_spec, struct list_head *rule_list_h) { struct mlx4_spec_list *spec_l2, *spec_l3, *spec_l4; u64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16); spec_l2 = kzalloc(sizeof *spec_l2, GFP_KERNEL); if (!spec_l2) return -ENOMEM; switch (flow_spec->type) { case IB_FLOW_ETH: spec_l2->id = MLX4_NET_TRANS_RULE_ID_ETH; memcpy(spec_l2->eth.dst_mac, flow_spec->l2_id.eth.mac, ETH_ALEN); memcpy(spec_l2->eth.dst_mac_msk, &mac_msk, ETH_ALEN); spec_l2->eth.ether_type = flow_spec->l2_id.eth.ethertype; if (flow_spec->l2_id.eth.vlan_present) { spec_l2->eth.vlan_id = flow_spec->l2_id.eth.vlan; spec_l2->eth.vlan_id_msk = cpu_to_be16(0x0fff); } break; case IB_FLOW_IB_UC: spec_l2->id = MLX4_NET_TRANS_RULE_ID_IB; if(flow_spec->l2_id.ib_uc.qpn) { spec_l2->ib.l3_qpn = cpu_to_be32(flow_spec->l2_id.ib_uc.qpn); spec_l2->ib.qpn_msk = cpu_to_be32(0xffffff); } break; case IB_FLOW_IB_MC_IPV4: case IB_FLOW_IB_MC_IPV6: spec_l2->id = MLX4_NET_TRANS_RULE_ID_IB; memcpy(spec_l2->ib.dst_gid, flow_spec->l2_id.ib_mc.mgid, 16); memset(spec_l2->ib.dst_gid_msk, 0xff, 16); break; } list_add_tail(&spec_l2->list, rule_list_h); if (flow_spec->l2_id.eth.ethertype == cpu_to_be16(ETH_P_IP) || flow_spec->type != IB_FLOW_ETH) { spec_l3 = kzalloc(sizeof *spec_l3, GFP_KERNEL); if (!spec_l3) return -ENOMEM; spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4; spec_l3->ipv4.src_ip = flow_spec->src_ip; if (flow_spec->type != IB_FLOW_IB_MC_IPV4 && flow_spec->type != IB_FLOW_IB_MC_IPV6) spec_l3->ipv4.dst_ip = flow_spec->dst_ip; if (spec_l3->ipv4.src_ip) spec_l3->ipv4.src_ip_msk = MLX4_BE_WORD_MASK; if (spec_l3->ipv4.dst_ip) spec_l3->ipv4.dst_ip_msk = MLX4_BE_WORD_MASK; list_add_tail(&spec_l3->list, rule_list_h); } if (flow_spec->l4_protocol) { spec_l4 = kzalloc(sizeof(*spec_l4), GFP_KERNEL); if (!spec_l4) return -ENOMEM; spec_l4->tcp_udp.src_port = flow_spec->src_port; spec_l4->tcp_udp.dst_port = flow_spec->dst_port; if 
(spec_l4->tcp_udp.src_port) spec_l4->tcp_udp.src_port_msk = MLX4_BE_SHORT_MASK; if (spec_l4->tcp_udp.dst_port) spec_l4->tcp_udp.dst_port_msk = MLX4_BE_SHORT_MASK; switch (flow_spec->l4_protocol) { case IBV_FLOW_L4_UDP: spec_l4->id = MLX4_NET_TRANS_RULE_ID_UDP; break; case IBV_FLOW_L4_TCP: spec_l4->id = MLX4_NET_TRANS_RULE_ID_TCP; break; default: dev_err(dev->dma_device, "Unsupported l4 protocol.\n"); kfree(spec_l4); return -EPROTONOSUPPORT; } list_add_tail(&spec_l4->list, rule_list_h); } return 0; } static int __mlx4_ib_flow_attach(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, struct ib_flow_spec *flow_spec, int priority, int lock_qp) { u64 reg_id = 0; int err = 0; struct mlx4_cm_steering *cm_flow; struct mlx4_spec_list *spec, *tmp_spec; struct mlx4_net_trans_rule rule = { .queue_mode = MLX4_NET_TRANS_Q_FIFO, .exclusive = 0, }; rule.promisc_mode = flow_spec->rule_type; rule.port = mqp->port; rule.qpn = mqp->mqp.qpn; INIT_LIST_HEAD(&rule.list); cm_flow = kmalloc(sizeof(*cm_flow), GFP_KERNEL); if (!cm_flow) return -ENOMEM; if (rule.promisc_mode == MLX4_FS_REGULAR) { rule.allow_loopback = !flow_spec->block_mc_loopback; rule.priority = MLX4_DOMAIN_UVERBS | priority; err = flow_spec_to_net_rule(&mdev->ib_dev, flow_spec, &rule.list); if (err) goto free_list; } err = mlx4_flow_attach(mdev->dev, &rule, ®_id); if (err) goto free_list; memcpy(&cm_flow->spec, flow_spec, sizeof(*flow_spec)); cm_flow->reg_id = reg_id; if (lock_qp) mutex_lock(&mqp->mutex); list_add(&cm_flow->list, &mqp->rules_list); if (lock_qp) mutex_unlock(&mqp->mutex); free_list: list_for_each_entry_safe(spec, tmp_spec, &rule.list, list) { list_del(&spec->list); kfree(spec); } if (err) { kfree(cm_flow); dev_err(mdev->ib_dev.dma_device, "Fail to attach flow steering rule\n"); } return err; } static int __mlx4_ib_flow_detach(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, struct ib_flow_spec *spec, int priority, int lock_qp) { struct mlx4_cm_steering *cm_flow; int ret; if (lock_qp) 
mutex_lock(&mqp->mutex); list_for_each_entry(cm_flow, &mqp->rules_list, list) { if (!memcmp(&cm_flow->spec, spec, sizeof(*spec))) { list_del(&cm_flow->list); break; } } if (lock_qp) mutex_unlock(&mqp->mutex); if (&cm_flow->list == &mqp->rules_list) { dev_err(mdev->ib_dev.dma_device, "Couldn't find reg_id for flow spec. " "Steering rule is left attached\n"); return -EINVAL; } ret = mlx4_flow_detach(mdev->dev, cm_flow->reg_id); kfree(cm_flow); return ret; } static int mlx4_ib_flow_attach(struct ib_qp *qp, struct ib_flow_spec *flow_spec, int priority) { return __mlx4_ib_flow_attach(to_mdev(qp->device), to_mqp(qp), flow_spec, priority, 1); } static int mlx4_ib_flow_detach(struct ib_qp *qp, struct ib_flow_spec *spec, int priority) { return __mlx4_ib_flow_detach(to_mdev(qp->device), to_mqp(qp), spec, priority, 1); } static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) { struct mlx4_ib_gid_entry *ge; struct mlx4_ib_gid_entry *tmp; struct mlx4_ib_gid_entry *ret = NULL; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { if (!memcmp(raw, ge->gid.raw, 16)) { ret = ge; break; } } return ret; } static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); u8 mac[6]; struct net_device *ndev; struct mlx4_ib_gid_entry *ge; u64 reg_id = 0; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { struct mlx4_ib_steering *ib_steering; mutex_lock(&mqp->mutex); list_for_each_entry(ib_steering, &mqp->steering_rules, list) { if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) { list_del(&ib_steering->list); break; } } mutex_unlock(&mqp->mutex); if (&ib_steering->list == &mqp->steering_rules) { pr_err("Couldn't find reg_id for mgid. 
Steering rule is left attached\n"); return -EINVAL; } reg_id = ib_steering->reg_id; kfree(ib_steering); } err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, MLX4_PROT_IB_IPV6, reg_id); if (err) return err; mutex_lock(&mqp->mutex); ge = find_gid_entry(mqp, gid->raw); if (ge) { spin_lock(&mdev->iboe.lock); ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL; if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); rdma_get_mcast_mac((struct in6_addr *)gid, mac); if (ndev) { rtnl_lock(); dev_mc_delete(mdev->iboe.netdevs[ge->port - 1], mac, 6, 0); rtnl_unlock(); dev_put(ndev); } list_del(&ge->list); kfree(ge); } else pr_warn("could not find mgid entry\n"); mutex_unlock(&mqp->mutex); return 0; } static int init_node_data(struct mlx4_ib_dev *dev) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; if (mlx4_is_master(dev->dev)) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(dev->ib_dev.node_desc, out_mad->data, 64); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); out: kfree(in_mad); kfree(out_mad); return err; } static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->dev->pdev->device); } static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, 
ib_dev.dev); return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32), (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, (int) dev->dev->caps.fw_ver & 0xffff); } static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->dev->rev_id); } static ssize_t show_board(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, dev->dev->board_id); } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_hw_rev, &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id }; static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev) { #ifdef __linux__ memcpy(eui, dev->dev_addr, 3); memcpy(eui + 5, dev->dev_addr + 3, 3); #else memcpy(eui, IF_LLADDR(dev), 3); memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); #endif if (vlan_id < 0x1000) { eui[3] = vlan_id >> 8; eui[4] = vlan_id & 0xff; } else { eui[3] = 0xff; eui[4] = 0xfe; } eui[0] ^= 2; } static void update_gids_task(struct work_struct *work) { struct update_gid_work *gw = container_of(work, struct update_gid_work, work); struct mlx4_cmd_mailbox *mailbox; union ib_gid *gids; int err; struct mlx4_dev *dev = gw->dev->dev; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox)); return; } gids = mailbox->buf; memcpy(gids, gw->gids, sizeof gw->gids); err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) pr_warn("set port command failed\n"); 
else { memcpy(gw->dev->iboe.gid_table[gw->port - 1], gw->gids, sizeof gw->gids); mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE); } mlx4_free_cmd_mailbox(dev, mailbox); kfree(gw); } static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) { struct net_device *ndev = dev->iboe.netdevs[port - 1]; struct update_gid_work *work; struct net_device *tmp; int i; u8 *hits; union ib_gid gid; int index_free; int found; int need_update = 0; int max_gids; u16 vid; work = kzalloc(sizeof *work, GFP_ATOMIC); if (!work) return -ENOMEM; hits = kzalloc(128, GFP_ATOMIC); if (!hits) { kfree(work); return -ENOMEM; } max_gids = dev->dev->caps.gid_table_len[port]; #ifdef __linux__ rcu_read_lock(); for_each_netdev_rcu(&init_net, tmp) { #else IFNET_RLOCK(); TAILQ_FOREACH(tmp, &V_ifnet, if_link) { #endif if (ndev && (tmp == ndev || rdma_vlan_dev_real_dev(tmp) == ndev)) { gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); vid = rdma_vlan_dev_vlan_id(tmp); mlx4_addrconf_ifid_eui48(&gid.raw[8], vid, ndev); found = 0; index_free = -1; for (i = 0; i < max_gids; ++i) { if (index_free < 0 && !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) index_free = i; if (!memcmp(&dev->iboe.gid_table[port - 1][i], &gid, sizeof gid)) { hits[i] = 1; found = 1; break; } } if (!found) { if (tmp == ndev && (memcmp(&dev->iboe.gid_table[port - 1][0], &gid, sizeof gid) || !memcmp(&dev->iboe.gid_table[port - 1][0], &zgid, sizeof gid))) { dev->iboe.gid_table[port - 1][0] = gid; ++need_update; hits[0] = 1; } else if (index_free >= 0) { dev->iboe.gid_table[port - 1][index_free] = gid; hits[index_free] = 1; ++need_update; } } } #ifdef __linux__ } rcu_read_unlock(); #else } IFNET_RUNLOCK(); #endif for (i = 0; i < max_gids; ++i) if (!hits[i]) { if (memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) ++need_update; dev->iboe.gid_table[port - 1][i] = zgid; } if (need_update) { memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof work->gids); 
INIT_WORK(&work->work, update_gids_task); work->port = port; work->dev = dev; queue_work(wq, &work->work); } else kfree(work); kfree(hits); return 0; } static void handle_en_event(struct mlx4_ib_dev *dev, int port, unsigned long event) { switch (event) { case NETDEV_UP: #ifdef __linux__ case NETDEV_CHANGEADDR: #endif update_ipv6_gids(dev, port, 0); break; case NETDEV_DOWN: update_ipv6_gids(dev, port, 1); dev->iboe.netdevs[port - 1] = NULL; } } static void netdev_added(struct mlx4_ib_dev *dev, int port) { update_ipv6_gids(dev, port, 0); } static void netdev_removed(struct mlx4_ib_dev *dev, int port) { update_ipv6_gids(dev, port, 1); } static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; struct mlx4_ib_dev *ibdev; struct net_device *oldnd; struct mlx4_ib_iboe *iboe; int port; #ifdef __linux__ if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; #endif ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); iboe = &ibdev->iboe; spin_lock(&iboe->lock); mlx4_foreach_ib_transport_port(port, ibdev->dev) { oldnd = iboe->netdevs[port - 1]; iboe->netdevs[port - 1] = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); if (oldnd != iboe->netdevs[port - 1]) { if (iboe->netdevs[port - 1]) netdev_added(ibdev, port); else netdev_removed(ibdev, port); } } if (dev == iboe->netdevs[0] || (iboe->netdevs[0] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[0])) handle_en_event(ibdev, 1, event); else if (dev == iboe->netdevs[1] || (iboe->netdevs[1] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[1])) handle_en_event(ibdev, 2, event); spin_unlock(&iboe->lock); return NOTIFY_DONE; } static void init_pkeys(struct mlx4_ib_dev *ibdev) { int port; int slave; int i; if (mlx4_is_master(ibdev->dev)) { for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) { for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; ++i) { 
ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] = /* master has the identity virt2phys pkey mapping */ (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i : ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1; mlx4_sync_pkey_table(ibdev->dev, slave, port, i, ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]); } } } /* initialize pkey cache */ for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; ++i) ibdev->pkeys.phys_pkey_cache[port-1][i] = (i) ? 0 : 0xFFFF; } } } static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { char name[32]; int eq_per_port = 0; int added_eqs = 0; int total_eqs = 0; int i, j, eq; /* Legacy mode or comp_pool is not large enough */ if (dev->caps.comp_pool == 0 || dev->caps.num_ports > dev->caps.comp_pool) return; eq_per_port = rounddown_pow_of_two(dev->caps.comp_pool/ dev->caps.num_ports); /* Init eq table */ added_eqs = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) added_eqs += eq_per_port; total_eqs = dev->caps.num_comp_vectors + added_eqs; ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL); if (!ibdev->eq_table) return; ibdev->eq_added = added_eqs; eq = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { for (j = 0; j < eq_per_port; j++) { snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%d:%d:%d:%d", i, j, pci_get_domain(dev->pdev->dev.bsddev), pci_get_bus(dev->pdev->dev.bsddev), PCI_SLOT(dev->pdev->devfn), PCI_FUNC(dev->pdev->devfn)); /* Set IRQ for specific name (per ring) */ if (mlx4_assign_eq(dev, name, &ibdev->eq_table[eq])) { /* Use legacy (same as mlx4_en driver) */ pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq); ibdev->eq_table[eq] = (eq % dev->caps.num_comp_vectors); } eq++; } } /* Fill the reset of the vector with legacy EQ */ for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++) ibdev->eq_table[eq++] = i; /* Advertise the new number of EQs to clients */ ibdev->ib_dev.num_comp_vectors = total_eqs; } 
static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { int i; /* no additional eqs were added */ if (!ibdev->eq_table) return; /* Reset the advertised EQ number */ ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; /* Free only the added eqs */ for (i = 0; i < ibdev->eq_added; i++) { /* Don't free legacy eqs if used */ if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors) continue; mlx4_release_eq(dev, ibdev->eq_table[i]); } kfree(ibdev->eq_table); } /* * create show function and a device_attribute struct pointing to * the function for _name */ #define DEVICE_DIAG_RPRT_ATTR(_name, _offset, _op_mod) \ static ssize_t show_rprt_##_name(struct device *dev, \ struct device_attribute *attr, \ char *buf){ \ return show_diag_rprt(dev, buf, _offset, _op_mod); \ } \ static DEVICE_ATTR(_name, S_IRUGO, show_rprt_##_name, NULL); #define MLX4_DIAG_RPRT_CLEAR_DIAGS 3 static size_t show_diag_rprt(struct device *device, char *buf, u32 offset, u8 op_modifier) { size_t ret; u32 counter_offset = offset; u32 diag_counter = 0; struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ret = mlx4_query_diag_counters(dev->dev, 1, op_modifier, &counter_offset, &diag_counter); if (ret) return ret; return sprintf(buf, "%d\n", diag_counter); } static ssize_t clear_diag_counters(struct device *device, struct device_attribute *attr, const char *buf, size_t length) { size_t ret; struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ret = mlx4_query_diag_counters(dev->dev, 0, MLX4_DIAG_RPRT_CLEAR_DIAGS, NULL, NULL); if (ret) return ret; return length; } DEVICE_DIAG_RPRT_ATTR(rq_num_lle , 0x00, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lle , 0x04, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lqpoe , 0x08, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lqpoe , 0x0C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lpe , 0x18, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lpe , 0x1C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_wrfe , 0x20, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_wrfe , 0x24, 2); 
DEVICE_DIAG_RPRT_ATTR(sq_num_mwbe , 0x2C, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_bre , 0x34, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lae , 0x38, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rire , 0x44, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rire , 0x48, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rae , 0x4C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rae , 0x50, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_roe , 0x54, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_tree , 0x5C, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rree , 0x64, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rnr , 0x68, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rnr , 0x6C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_oos , 0x100, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_oos , 0x104, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_mce , 0x108, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_udsdprd , 0x118, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_ucsdprd , 0x120, 2); DEVICE_DIAG_RPRT_ATTR(num_cqovf , 0x1A0, 2); DEVICE_DIAG_RPRT_ATTR(num_eqovf , 0x1A4, 2); DEVICE_DIAG_RPRT_ATTR(num_baddb , 0x1A8, 2); static DEVICE_ATTR(clear_diag, S_IWUSR, NULL, clear_diag_counters); static struct attribute *diag_rprt_attrs[] = { &dev_attr_rq_num_lle.attr, &dev_attr_sq_num_lle.attr, &dev_attr_rq_num_lqpoe.attr, &dev_attr_sq_num_lqpoe.attr, &dev_attr_rq_num_lpe.attr, &dev_attr_sq_num_lpe.attr, &dev_attr_rq_num_wrfe.attr, &dev_attr_sq_num_wrfe.attr, &dev_attr_sq_num_mwbe.attr, &dev_attr_sq_num_bre.attr, &dev_attr_rq_num_lae.attr, &dev_attr_sq_num_rire.attr, &dev_attr_rq_num_rire.attr, &dev_attr_sq_num_rae.attr, &dev_attr_rq_num_rae.attr, &dev_attr_sq_num_roe.attr, &dev_attr_sq_num_tree.attr, &dev_attr_sq_num_rree.attr, &dev_attr_rq_num_rnr.attr, &dev_attr_sq_num_rnr.attr, &dev_attr_rq_num_oos.attr, &dev_attr_sq_num_oos.attr, &dev_attr_rq_num_mce.attr, &dev_attr_rq_num_udsdprd.attr, &dev_attr_rq_num_ucsdprd.attr, &dev_attr_num_cqovf.attr, &dev_attr_num_eqovf.attr, &dev_attr_num_baddb.attr, &dev_attr_clear_diag.attr, NULL }; static struct attribute_group diag_counters_group = { .name = "diag_counters", .attrs = diag_rprt_attrs }; #ifdef __linux__ static int mlx4_ib_proc_init(void) 
{ /* Creating procfs directories /proc/drivers/mlx4_ib/ && /proc/drivers/mlx4_ib/mrs for further use by the driver. */ int err; mlx4_ib_driver_dir_entry = proc_mkdir(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL); if (!mlx4_ib_driver_dir_entry) { pr_err("mlx4_ib_proc_init has failed for %s\n", MLX4_IB_DRIVER_PROC_DIR_NAME); err = -ENODEV; goto error; } mlx4_mrs_dir_entry = proc_mkdir(MLX4_IB_MRS_PROC_DIR_NAME, mlx4_ib_driver_dir_entry); if (!mlx4_mrs_dir_entry) { pr_err("mlx4_ib_proc_init has failed for %s\n", MLX4_IB_MRS_PROC_DIR_NAME); err = -ENODEV; goto remove_entry; } return 0; remove_entry: remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL); error: return err; } #endif static void init_dev_assign(void) { int bus, slot, fn, ib_idx; char *p = dev_assign_str, *t; char curr_val[32] = {0}; int ret; int j, i = 0; memset(dr, 0, sizeof dr); if (dev_assign_str[0] == 0) return; while (strlen(p)) { ret = sscanf(p, "%02x:%02x.%x-%x", &bus, &slot, &fn, &ib_idx); if (ret != 4 || ib_idx < 0) goto err; for (j = 0; j < i; j++) if (dr[j].nr == ib_idx) goto err; dr[i].bus = bus; dr[i].dev = slot; dr[i].func = fn; dr[i].nr = ib_idx; t = strchr(p, ','); sprintf(curr_val, "%02x:%02x.%x-%x", bus, slot, fn, ib_idx); if ((!t) && strlen(p) == strlen(curr_val)) return; if (!t || (t + 1) >= dev_assign_str + sizeof dev_assign_str) goto err; ++i; if (i >= MAX_DR) goto err; p = t + 1; } return; err: memset(dr, 0, sizeof dr); printk(KERN_WARNING "mlx4_ib: The value of 'dev_assign_str' parameter " "is incorrect. 
The parameter value is discarded!"); } +static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + int err; + + if (mlx4_ib_port_link_layer(ibdev, port_num) == IB_LINK_LAYER_INFINIBAND) { + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + } else { + if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCEV2) + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE | + RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + immutable->core_cap_flags |= RDMA_CORE_PORT_RAW_PACKET; + if (immutable->core_cap_flags & (RDMA_CORE_PORT_IBA_ROCE | + RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP)) + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + } + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + static void *mlx4_ib_add(struct mlx4_dev *dev) { struct mlx4_ib_dev *ibdev; int num_ports = 0; int i, j; int err; struct mlx4_ib_iboe *iboe; printk(KERN_INFO "%s", mlx4_ib_version); mlx4_foreach_ib_transport_port(i, dev) num_ports++; /* No point in registering a device with no ports... 
*/ if (num_ports == 0) return NULL; ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); if (!ibdev) { dev_err(&dev->pdev->dev, "Device struct alloc failed\n"); return NULL; } iboe = &ibdev->iboe; if (mlx4_pd_alloc(dev, &ibdev->priv_pdn)) goto err_dealloc; if (mlx4_uar_alloc(dev, &ibdev->priv_uar)) goto err_pd; ibdev->priv_uar.map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!ibdev->priv_uar.map) goto err_uar; MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); ibdev->dev = dev; strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; ibdev->num_ports = num_ports; ibdev->ib_dev.phys_port_cnt = ibdev->num_ports; ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; ibdev->ib_dev.dma_device = &dev->pdev->dev; if (dev->caps.userspace_caps) ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; else ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | (1ull << 
IB_USER_VERBS_CMD_OPEN_QP) | (1ull << IB_USER_VERBS_CMD_ATTACH_FLOW) | (1ull << IB_USER_VERBS_CMD_DETACH_FLOW) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); ibdev->ib_dev.query_device = mlx4_ib_query_device; ibdev->ib_dev.query_port = mlx4_ib_query_port; ibdev->ib_dev.get_link_layer = mlx4_ib_port_link_layer; ibdev->ib_dev.query_gid = mlx4_ib_query_gid; ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey; ibdev->ib_dev.modify_device = mlx4_ib_modify_device; ibdev->ib_dev.modify_port = mlx4_ib_modify_port; ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext; ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext; ibdev->ib_dev.mmap = mlx4_ib_mmap; #ifdef __linux__ ibdev->ib_dev.get_unmapped_area = mlx4_ib_get_unmapped_area; #endif ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd; ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd; ibdev->ib_dev.create_ah = mlx4_ib_create_ah; ibdev->ib_dev.query_ah = mlx4_ib_query_ah; ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah; ibdev->ib_dev.create_srq = mlx4_ib_create_srq; ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq; ibdev->ib_dev.query_srq = mlx4_ib_query_srq; ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq; ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv; ibdev->ib_dev.create_qp = mlx4_ib_create_qp; ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp; ibdev->ib_dev.query_qp = mlx4_ib_query_qp; ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp; ibdev->ib_dev.post_send = mlx4_ib_post_send; ibdev->ib_dev.post_recv = mlx4_ib_post_recv; ibdev->ib_dev.create_cq = mlx4_ib_create_cq; ibdev->ib_dev.modify_cq = mlx4_ib_modify_cq; ibdev->ib_dev.resize_cq = mlx4_ib_resize_cq; ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq; ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq; ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq; ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr; ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr; ibdev->ib_dev.alloc_fast_reg_page_list = 
mlx4_ib_alloc_fast_reg_page_list; ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list; ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; ibdev->ib_dev.attach_flow = mlx4_ib_flow_attach; ibdev->ib_dev.detach_flow = mlx4_ib_flow_detach; ibdev->ib_dev.process_mad = mlx4_ib_process_mad; + ibdev->ib_dev.get_port_immutable = mlx4_port_immutable; if (!mlx4_is_slave(ibdev->dev)) { ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; } if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; ibdev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } mlx4_ib_alloc_eqs(dev, ibdev); spin_lock_init(&iboe->lock); if (init_node_data(ibdev)) goto err_map; for (i = 0; i < ibdev->num_ports; ++i) { if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == IB_LINK_LAYER_ETHERNET) { err = mlx4_counter_alloc(ibdev->dev, i + 1, &ibdev->counters[i]); if (err) ibdev->counters[i] = -1; } else ibdev->counters[i] = -1; } spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED && !mlx4_is_slave(dev)) { ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, MLX4_IB_UC_STEER_QPN_ALIGN, &ibdev->steer_qpn_base, 0); if (err) goto err_counter; ibdev->ib_uc_qpns_bitmap = kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) * sizeof(long), GFP_KERNEL); if (!ibdev->ib_uc_qpns_bitmap) { dev_err(&dev->pdev->dev, "bit map alloc failed\n"); goto err_steer_qp_release; } bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count); err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_base + ibdev->steer_qpn_count - 
1); if (err) goto err_steer_free_bitmap; } if (ib_register_device(&ibdev->ib_dev, NULL)) goto err_steer_free_bitmap; if (mlx4_ib_mad_init(ibdev)) goto err_reg; if (mlx4_ib_init_sriov(ibdev)) goto err_mad; if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) { iboe->nb.notifier_call = mlx4_ib_netdev_event; err = register_netdevice_notifier(&iboe->nb); if (err) goto err_sriov; } for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { if (device_create_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j])) goto err_notif; } if (sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group)) goto err_notif; ibdev->ib_active = true; if (mlx4_is_mfunc(ibdev->dev)) init_pkeys(ibdev); /* create paravirt contexts for any VFs which are active */ if (mlx4_is_master(ibdev->dev)) { for (j = 0; j < MLX4_MFUNC_MAX; j++) { if (j == mlx4_master_func_num(ibdev->dev)) continue; if (mlx4_is_slave_active(ibdev->dev, j)) do_slave_init(ibdev, j, 1); } } return ibdev; err_notif: if (unregister_netdevice_notifier(&ibdev->iboe.nb)) pr_warn("failure unregistering notifier\n"); flush_workqueue(wq); err_sriov: mlx4_ib_close_sriov(ibdev); err_mad: mlx4_ib_mad_cleanup(ibdev); err_reg: ib_unregister_device(&ibdev->ib_dev); err_steer_free_bitmap: kfree(ibdev->ib_uc_qpns_bitmap); err_steer_qp_release: if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) mlx4_qp_release_range(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_count); err_counter: for (; i; --i) if (ibdev->counters[i - 1] != -1) mlx4_counter_free(ibdev->dev, i, ibdev->counters[i - 1]); err_map: iounmap(ibdev->priv_uar.map); mlx4_ib_free_eqs(dev, ibdev); err_uar: mlx4_uar_free(dev, &ibdev->priv_uar); err_pd: mlx4_pd_free(dev, ibdev->priv_pdn); err_dealloc: ib_dealloc_device(&ibdev->ib_dev); return NULL; } int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) { int offset; WARN_ON(!dev->ib_uc_qpns_bitmap); offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, dev->steer_qpn_count, 
get_count_order(count)); if (offset < 0) return offset; *qpn = dev->steer_qpn_base + offset; return 0; } void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) { if (!qpn || dev->dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) return; BUG_ON(qpn < dev->steer_qpn_base); bitmap_release_region(dev->ib_uc_qpns_bitmap, qpn - dev->steer_qpn_base, get_count_order(count)); } int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, int is_attach) { struct ib_flow_spec spec = { .type = IB_FLOW_IB_UC, .l2_id.ib_uc.qpn = mqp->ibqp.qp_num, }; return is_attach ? __mlx4_ib_flow_attach(mdev, mqp, &spec, MLX4_DOMAIN_NIC, 0) : __mlx4_ib_flow_detach(mdev, mqp, &spec, MLX4_DOMAIN_NIC, 0); } static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) { struct mlx4_ib_dev *ibdev = ibdev_ptr; int p,j; mlx4_ib_close_sriov(ibdev); sysfs_remove_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group); mlx4_ib_mad_cleanup(ibdev); for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { device_remove_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j]); } ib_unregister_device(&ibdev->ib_dev); if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { mlx4_qp_release_range(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_count); kfree(ibdev->ib_uc_qpns_bitmap); } if (ibdev->iboe.nb.notifier_call) { if (unregister_netdevice_notifier(&ibdev->iboe.nb)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } iounmap(ibdev->priv_uar.map); for (p = 0; p < ibdev->num_ports; ++p) if (ibdev->counters[p] != -1) mlx4_counter_free(ibdev->dev, p + 1, ibdev->counters[p]); mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB) mlx4_CLOSE_PORT(dev, p); mlx4_ib_free_eqs(dev, ibdev); mlx4_uar_free(dev, &ibdev->priv_uar); mlx4_pd_free(dev, ibdev->priv_pdn); ib_dealloc_device(&ibdev->ib_dev); } static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) { struct mlx4_ib_demux_work **dm = NULL; struct mlx4_dev *dev = 
ibdev->dev; int i; unsigned long flags; if (!mlx4_is_master(dev)) return; dm = kcalloc(dev->caps.num_ports, sizeof *dm, GFP_ATOMIC); if (!dm) { pr_err("failed to allocate memory for tunneling qp update\n"); goto out; } for (i = 0; i < dev->caps.num_ports; i++) { dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); if (!dm[i]) { pr_err("failed to allocate memory for tunneling qp update work struct\n"); for (i = 0; i < dev->caps.num_ports; i++) { if (dm[i]) kfree(dm[i]); } goto out; } } /* initialize or tear down tunnel QPs for the slave */ for (i = 0; i < dev->caps.num_ports; i++) { INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); dm[i]->port = i + 1; dm[i]->slave = slave; dm[i]->do_init = do_init; dm[i]->dev = ibdev; spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); if (!ibdev->sriov.is_going_down) queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); } out: if (dm) kfree(dm); return; } static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, enum mlx4_dev_event event, unsigned long param) { struct ib_event ibev; struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr); struct mlx4_eqe *eqe = NULL; struct ib_event_work *ew; int p = 0; if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE) eqe = (struct mlx4_eqe *)param; else p = (int) param; switch (event) { case MLX4_DEV_EVENT_PORT_UP: if (p > ibdev->num_ports) return; if (mlx4_is_master(dev) && rdma_port_get_link_layer(&ibdev->ib_dev, p) == IB_LINK_LAYER_INFINIBAND) { mlx4_ib_invalidate_all_guid_record(ibdev, p); } mlx4_ib_info((struct ib_device *) ibdev_ptr, "Port %d logical link is up\n", p); ibev.event = IB_EVENT_PORT_ACTIVE; break; case MLX4_DEV_EVENT_PORT_DOWN: if (p > ibdev->num_ports) return; mlx4_ib_info((struct ib_device *) ibdev_ptr, "Port %d logical link is down\n", p); ibev.event = IB_EVENT_PORT_ERR; break; case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: ibdev->ib_active = false; ibev.event = 
IB_EVENT_DEVICE_FATAL; break; case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: ew = kmalloc(sizeof *ew, GFP_ATOMIC); if (!ew) { pr_err("failed to allocate memory for events work\n"); break; } INIT_WORK(&ew->work, handle_port_mgmt_change_event); memcpy(&ew->ib_eqe, eqe, sizeof *eqe); ew->ib_dev = ibdev; /* need to queue only for port owner, which uses GEN_EQE */ if (mlx4_is_master(dev)) queue_work(wq, &ew->work); else handle_port_mgmt_change_event(&ew->work); return; case MLX4_DEV_EVENT_SLAVE_INIT: /* here, p is the slave id */ do_slave_init(ibdev, p, 1); return; case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: /* here, p is the slave id */ do_slave_init(ibdev, p, 0); return; default: return; } ibev.device = ibdev_ptr; ibev.element.port_num = (u8) p; ib_dispatch_event(&ibev); } static struct mlx4_interface mlx4_ib_interface = { .add = mlx4_ib_add, .remove = mlx4_ib_remove, .event = mlx4_ib_event, .protocol = MLX4_PROT_IB_IPV6 }; static int __init mlx4_ib_init(void) { int err; wq = create_singlethread_workqueue("mlx4_ib"); if (!wq) return -ENOMEM; #ifdef __linux__ err = mlx4_ib_proc_init(); if (err) goto clean_wq; #endif err = mlx4_ib_mcg_init(); if (err) goto clean_proc; init_dev_assign(); err = mlx4_register_interface(&mlx4_ib_interface); if (err) goto clean_mcg; return 0; clean_mcg: mlx4_ib_mcg_destroy(); clean_proc: #ifdef __linux__ remove_proc_entry(MLX4_IB_MRS_PROC_DIR_NAME, mlx4_ib_driver_dir_entry); remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL); clean_wq: #endif destroy_workqueue(wq); return err; } static void __exit mlx4_ib_cleanup(void) { mlx4_unregister_interface(&mlx4_ib_interface); mlx4_ib_mcg_destroy(); destroy_workqueue(wq); /* Remove proc entries */ #ifdef __linux__ remove_proc_entry(MLX4_IB_MRS_PROC_DIR_NAME, mlx4_ib_driver_dir_entry); remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL); #endif } module_init_order(mlx4_ib_init, SI_ORDER_MIDDLE); module_exit(mlx4_ib_cleanup); #undef MODULE_VERSION #include static int mlx4ib_evhand(module_t mod, int event, void 
*arg) { return (0); } static moduledata_t mlx4ib_mod = { .name = "mlx4ib", .evhand = mlx4ib_evhand, }; DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_OFED_PREINIT, SI_ORDER_ANY); MODULE_DEPEND(mlx4ib, mlx4, 1, 1, 1); MODULE_DEPEND(mlx4ib, ibcore, 1, 1, 1); Index: stable/10/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c =================================================================== --- stable/10/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c (revision 325610) +++ stable/10/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c (revision 325611) @@ -1,1428 +1,1448 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include "mthca_dev.h" #include "mthca_cmd.h" #include "mthca_user.h" #include "mthca_memfree.h" static void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; mad->class_version = 1; mad->method = IB_MGMT_METHOD_GET; } static int mthca_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; struct mthca_dev *mdev = to_mdev(ibdev); u8 status; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; memset(props, 0, sizeof *props); props->fw_ver = mdev->fw_ver; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mthca_MAD_IFC(mdev, 1, 1, 1, NULL, NULL, in_mad, out_mad, &status); if (err) goto out; if (status) { err = -EINVAL; goto out; } props->device_cap_flags = mdev->device_cap_flags; props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; props->vendor_part_id = be16_to_cpup((__be16 *) (out_mad->data + 30)); props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&props->sys_image_guid, out_mad->data + 4, 8); props->max_mr_size = ~0ull; props->page_size_cap = mdev->limits.page_size_cap; props->max_qp = mdev->limits.num_qps - mdev->limits.reserved_qps; props->max_qp_wr = mdev->limits.max_wqes; props->max_sge = mdev->limits.max_sg; props->max_cq = mdev->limits.num_cqs - mdev->limits.reserved_cqs; props->max_cqe = mdev->limits.max_cqes; props->max_mr = mdev->limits.num_mpts - mdev->limits.reserved_mrws; props->max_pd = mdev->limits.num_pds - mdev->limits.reserved_pds; 
props->max_qp_rd_atom = 1 << mdev->qp_table.rdb_shift; props->max_qp_init_rd_atom = mdev->limits.max_qp_init_rdma; props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq = mdev->limits.num_srqs - mdev->limits.reserved_srqs; props->max_srq_wr = mdev->limits.max_srq_wqes; props->max_srq_sge = mdev->limits.max_srq_sge; props->local_ca_ack_delay = mdev->limits.local_ca_ack_delay; props->atomic_cap = mdev->limits.flags & DEV_LIM_FLAG_ATOMIC ? IB_ATOMIC_HCA : IB_ATOMIC_NONE; props->max_pkeys = mdev->limits.pkey_table_len; props->max_mcast_grp = mdev->limits.num_mgms + mdev->limits.num_amgms; props->max_mcast_qp_attach = MTHCA_QP_PER_MGM; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; /* * If Sinai memory key optimization is being used, then only * the 8-bit key portion will change. For other HCAs, the * unused index bits will also be used for FMR remapping. */ if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT) props->max_map_per_fmr = 255; else props->max_map_per_fmr = (1 << (32 - ilog2(mdev->limits.num_mpts))) - 1; err = 0; out: kfree(in_mad); kfree(out_mad); return err; } static int mthca_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; u8 status; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; memset(props, 0, sizeof *props); init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad, &status); if (err) goto out; if (status) { err = -EINVAL; goto out; } props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); props->lmc = out_mad->data[34] & 0x7; props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); props->sm_sl = out_mad->data[36] & 0xf; props->state = out_mad->data[32] & 0xf; props->phys_state = 
out_mad->data[33] >> 4; props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); props->gid_tbl_len = to_mdev(ibdev)->limits.gid_table_len; props->max_msg_sz = 0x80000000; props->pkey_tbl_len = to_mdev(ibdev)->limits.pkey_table_len; props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); props->active_width = out_mad->data[31] & 0xf; props->active_speed = out_mad->data[35] >> 4; props->max_mtu = out_mad->data[41] & 0xf; props->active_mtu = out_mad->data[36] >> 4; props->subnet_timeout = out_mad->data[51] & 0x1f; props->max_vl_num = out_mad->data[37] >> 4; props->init_type_reply = out_mad->data[41] >> 4; out: kfree(in_mad); kfree(out_mad); return err; } static int mthca_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; if (mask & IB_DEVICE_MODIFY_NODE_DESC) { if (mutex_lock_interruptible(&to_mdev(ibdev)->cap_mask_mutex)) return -ERESTARTSYS; memcpy(ibdev->node_desc, props->node_desc, 64); mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); } return 0; } static int mthca_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask, struct ib_port_modify *props) { struct mthca_set_ib_param set_ib; struct ib_port_attr attr; int err; u8 status; if (mutex_lock_interruptible(&to_mdev(ibdev)->cap_mask_mutex)) return -ERESTARTSYS; err = mthca_query_port(ibdev, port, &attr); if (err) goto out; set_ib.set_si_guid = 0; set_ib.reset_qkey_viol = !!(port_modify_mask & IB_PORT_RESET_QKEY_CNTR); set_ib.cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; err = mthca_SET_IB(to_mdev(ibdev), &set_ib, port, &status); if (err) goto out; if (status) { err = -EINVAL; goto out; } out: mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); return err; } static int mthca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { struct ib_smp *in_mad = NULL; struct 
ib_smp *out_mad = NULL; int err = -ENOMEM; u8 status; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; in_mad->attr_mod = cpu_to_be32(index / 32); err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad, &status); if (err) goto out; if (status) { err = -EINVAL; goto out; } *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); out: kfree(in_mad); kfree(out_mad); return err; } static int mthca_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; u8 status; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad, &status); if (err) goto out; if (status) { err = -EINVAL; goto out; } memcpy(gid->raw, out_mad->data + 8, 8); init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; in_mad->attr_mod = cpu_to_be32(index / 8); err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad, &status); if (err) goto out; if (status) { err = -EINVAL; goto out; } memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); out: kfree(in_mad); kfree(out_mad); return err; } static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mthca_alloc_ucontext_resp uresp; struct mthca_ucontext *context; int err; if (!(to_mdev(ibdev)->active)) return ERR_PTR(-EAGAIN); memset(&uresp, 0, sizeof uresp); uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps; if (mthca_is_memfree(to_mdev(ibdev))) uresp.uarc_size = to_mdev(ibdev)->uar_table.uarc_size; else uresp.uarc_size = 0; context = kmalloc(sizeof *context, 
GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); err = mthca_uar_alloc(to_mdev(ibdev), &context->uar); if (err) { kfree(context); return ERR_PTR(err); } context->db_tab = mthca_init_user_db_tab(to_mdev(ibdev)); if (IS_ERR(context->db_tab)) { err = PTR_ERR(context->db_tab); mthca_uar_free(to_mdev(ibdev), &context->uar); kfree(context); return ERR_PTR(err); } if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) { mthca_cleanup_user_db_tab(to_mdev(ibdev), &context->uar, context->db_tab); mthca_uar_free(to_mdev(ibdev), &context->uar); kfree(context); return ERR_PTR(-EFAULT); } context->reg_mr_warned = 0; return &context->ibucontext; } static int mthca_dealloc_ucontext(struct ib_ucontext *context) { mthca_cleanup_user_db_tab(to_mdev(context->device), &to_mucontext(context)->uar, to_mucontext(context)->db_tab); mthca_uar_free(to_mdev(context->device), &to_mucontext(context)->uar); kfree(to_mucontext(context)); return 0; } static int mthca_mmap_uar(struct ib_ucontext *context, struct vm_area_struct *vma) { if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; return 0; } static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mthca_pd *pd; int err; pd = kmalloc(sizeof *pd, GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); err = mthca_pd_alloc(to_mdev(ibdev), !context, pd); if (err) { kfree(pd); return ERR_PTR(err); } if (context) { if (ib_copy_to_udata(udata, &pd->pd_num, sizeof (__u32))) { mthca_pd_free(to_mdev(ibdev), pd); kfree(pd); return ERR_PTR(-EFAULT); } } return &pd->ibpd; } static int mthca_dealloc_pd(struct ib_pd *pd) { mthca_pd_free(to_mdev(pd->device), to_mpd(pd)); kfree(pd); return 0; } static struct ib_ah *mthca_ah_create(struct ib_pd *pd, struct ib_ah_attr *ah_attr) { int err; struct mthca_ah *ah; ah = 
kmalloc(sizeof *ah, GFP_ATOMIC); if (!ah) return ERR_PTR(-ENOMEM); err = mthca_create_ah(to_mdev(pd->device), to_mpd(pd), ah_attr, ah); if (err) { kfree(ah); return ERR_PTR(err); } return &ah->ibah; } static int mthca_ah_destroy(struct ib_ah *ah) { mthca_destroy_ah(to_mdev(ah->device), to_mah(ah)); kfree(ah); return 0; } static struct ib_srq *mthca_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *init_attr, struct ib_udata *udata) { struct mthca_create_srq ucmd; struct mthca_ucontext *context = NULL; struct mthca_srq *srq; int err; srq = kmalloc(sizeof *srq, GFP_KERNEL); if (!srq) return ERR_PTR(-ENOMEM); if (pd->uobject) { context = to_mucontext(pd->uobject->context); if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { err = -EFAULT; goto err_free; } err = mthca_map_user_db(to_mdev(pd->device), &context->uar, context->db_tab, ucmd.db_index, ucmd.db_page); if (err) goto err_free; srq->mr.ibmr.lkey = ucmd.lkey; srq->db_index = ucmd.db_index; } err = mthca_alloc_srq(to_mdev(pd->device), to_mpd(pd), &init_attr->attr, srq); if (err && pd->uobject) mthca_unmap_user_db(to_mdev(pd->device), &context->uar, context->db_tab, ucmd.db_index); if (err) goto err_free; if (context && ib_copy_to_udata(udata, &srq->srqn, sizeof (__u32))) { mthca_free_srq(to_mdev(pd->device), srq); err = -EFAULT; goto err_free; } return &srq->ibsrq; err_free: kfree(srq); return ERR_PTR(err); } static int mthca_destroy_srq(struct ib_srq *srq) { struct mthca_ucontext *context; if (srq->uobject) { context = to_mucontext(srq->uobject->context); mthca_unmap_user_db(to_mdev(srq->device), &context->uar, context->db_tab, to_msrq(srq)->db_index); } mthca_free_srq(to_mdev(srq->device), to_msrq(srq)); kfree(srq); return 0; } static struct ib_qp *mthca_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { struct mthca_create_qp ucmd; struct mthca_qp *qp; int err; if (init_attr->create_flags) return ERR_PTR(-EINVAL); switch (init_attr->qp_type) { case IB_QPT_RC: case 
IB_QPT_UC: case IB_QPT_UD: { struct mthca_ucontext *context; qp = kmalloc(sizeof *qp, GFP_KERNEL); if (!qp) return ERR_PTR(-ENOMEM); if (pd->uobject) { context = to_mucontext(pd->uobject->context); if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { kfree(qp); return ERR_PTR(-EFAULT); } err = mthca_map_user_db(to_mdev(pd->device), &context->uar, context->db_tab, ucmd.sq_db_index, ucmd.sq_db_page); if (err) { kfree(qp); return ERR_PTR(err); } err = mthca_map_user_db(to_mdev(pd->device), &context->uar, context->db_tab, ucmd.rq_db_index, ucmd.rq_db_page); if (err) { mthca_unmap_user_db(to_mdev(pd->device), &context->uar, context->db_tab, ucmd.sq_db_index); kfree(qp); return ERR_PTR(err); } qp->mr.ibmr.lkey = ucmd.lkey; qp->sq.db_index = ucmd.sq_db_index; qp->rq.db_index = ucmd.rq_db_index; } err = mthca_alloc_qp(to_mdev(pd->device), to_mpd(pd), to_mcq(init_attr->send_cq), to_mcq(init_attr->recv_cq), init_attr->qp_type, init_attr->sq_sig_type, &init_attr->cap, qp); if (err && pd->uobject) { context = to_mucontext(pd->uobject->context); mthca_unmap_user_db(to_mdev(pd->device), &context->uar, context->db_tab, ucmd.sq_db_index); mthca_unmap_user_db(to_mdev(pd->device), &context->uar, context->db_tab, ucmd.rq_db_index); } qp->ibqp.qp_num = qp->qpn; break; } case IB_QPT_SMI: case IB_QPT_GSI: { /* Don't allow userspace to create special QPs */ if (pd->uobject) return ERR_PTR(-EINVAL); qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL); if (!qp) return ERR_PTR(-ENOMEM); qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 
0 : 1; err = mthca_alloc_sqp(to_mdev(pd->device), to_mpd(pd), to_mcq(init_attr->send_cq), to_mcq(init_attr->recv_cq), init_attr->sq_sig_type, &init_attr->cap, qp->ibqp.qp_num, init_attr->port_num, to_msqp(qp)); break; } default: /* Don't support raw QPs */ return ERR_PTR(-ENOSYS); } if (err) { kfree(qp); return ERR_PTR(err); } init_attr->cap.max_send_wr = qp->sq.max; init_attr->cap.max_recv_wr = qp->rq.max; init_attr->cap.max_send_sge = qp->sq.max_gs; init_attr->cap.max_recv_sge = qp->rq.max_gs; init_attr->cap.max_inline_data = qp->max_inline_data; return &qp->ibqp; } static int mthca_destroy_qp(struct ib_qp *qp) { if (qp->uobject) { mthca_unmap_user_db(to_mdev(qp->device), &to_mucontext(qp->uobject->context)->uar, to_mucontext(qp->uobject->context)->db_tab, to_mqp(qp)->sq.db_index); mthca_unmap_user_db(to_mdev(qp->device), &to_mucontext(qp->uobject->context)->uar, to_mucontext(qp->uobject->context)->db_tab, to_mqp(qp)->rq.db_index); } mthca_free_qp(to_mdev(qp->device), to_mqp(qp)); kfree(qp); return 0; } static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries, int comp_vector, struct ib_ucontext *context, struct ib_udata *udata) { struct mthca_create_cq ucmd; struct mthca_cq *cq; int nent; int err; if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes) return ERR_PTR(-EINVAL); if (context) { if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) return ERR_PTR(-EFAULT); err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, to_mucontext(context)->db_tab, ucmd.set_db_index, ucmd.set_db_page); if (err) return ERR_PTR(err); err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, to_mucontext(context)->db_tab, ucmd.arm_db_index, ucmd.arm_db_page); if (err) goto err_unmap_set; } cq = kmalloc(sizeof *cq, GFP_KERNEL); if (!cq) { err = -ENOMEM; goto err_unmap_arm; } if (context) { cq->buf.mr.ibmr.lkey = ucmd.lkey; cq->set_ci_db_index = ucmd.set_db_index; cq->arm_db_index = ucmd.arm_db_index; } for (nent = 1; nent <= entries; 
nent <<= 1) ; /* nothing */ err = mthca_init_cq(to_mdev(ibdev), nent, context ? to_mucontext(context) : NULL, context ? ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num, cq); if (err) goto err_free; if (context && ib_copy_to_udata(udata, &cq->cqn, sizeof (__u32))) { mthca_free_cq(to_mdev(ibdev), cq); goto err_free; } cq->resize_buf = NULL; return &cq->ibcq; err_free: kfree(cq); err_unmap_arm: if (context) mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, to_mucontext(context)->db_tab, ucmd.arm_db_index); err_unmap_set: if (context) mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, to_mucontext(context)->db_tab, ucmd.set_db_index); return ERR_PTR(err); } static int mthca_alloc_resize_buf(struct mthca_dev *dev, struct mthca_cq *cq, int entries) { int ret; spin_lock_irq(&cq->lock); if (cq->resize_buf) { ret = -EBUSY; goto unlock; } cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC); if (!cq->resize_buf) { ret = -ENOMEM; goto unlock; } cq->resize_buf->state = CQ_RESIZE_ALLOC; ret = 0; unlock: spin_unlock_irq(&cq->lock); if (ret) return ret; ret = mthca_alloc_cq_buf(dev, &cq->resize_buf->buf, entries); if (ret) { spin_lock_irq(&cq->lock); kfree(cq->resize_buf); cq->resize_buf = NULL; spin_unlock_irq(&cq->lock); return ret; } cq->resize_buf->cqe = entries - 1; spin_lock_irq(&cq->lock); cq->resize_buf->state = CQ_RESIZE_READY; spin_unlock_irq(&cq->lock); return 0; } static int mthca_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) { struct mthca_dev *dev = to_mdev(ibcq->device); struct mthca_cq *cq = to_mcq(ibcq); struct mthca_resize_cq ucmd; u32 lkey; u8 status; int ret; if (entries < 1 || entries > dev->limits.max_cqes) return -EINVAL; mutex_lock(&cq->mutex); entries = roundup_pow_of_two(entries + 1); if (entries == ibcq->cqe + 1) { ret = 0; goto out; } if (cq->is_kernel) { ret = mthca_alloc_resize_buf(dev, cq, entries); if (ret) goto out; lkey = cq->resize_buf->buf.mr.ibmr.lkey; } else { if (ib_copy_from_udata(&ucmd, 
udata, sizeof ucmd)) { ret = -EFAULT; goto out; } lkey = ucmd.lkey; } ret = mthca_RESIZE_CQ(dev, cq->cqn, lkey, ilog2(entries), &status); if (status) ret = -EINVAL; if (ret) { if (cq->resize_buf) { mthca_free_cq_buf(dev, &cq->resize_buf->buf, cq->resize_buf->cqe); kfree(cq->resize_buf); spin_lock_irq(&cq->lock); cq->resize_buf = NULL; spin_unlock_irq(&cq->lock); } goto out; } if (cq->is_kernel) { struct mthca_cq_buf tbuf; int tcqe; spin_lock_irq(&cq->lock); if (cq->resize_buf->state == CQ_RESIZE_READY) { mthca_cq_resize_copy_cqes(cq); tbuf = cq->buf; tcqe = cq->ibcq.cqe; cq->buf = cq->resize_buf->buf; cq->ibcq.cqe = cq->resize_buf->cqe; } else { tbuf = cq->resize_buf->buf; tcqe = cq->resize_buf->cqe; } kfree(cq->resize_buf); cq->resize_buf = NULL; spin_unlock_irq(&cq->lock); mthca_free_cq_buf(dev, &tbuf, tcqe); } else ibcq->cqe = entries - 1; out: mutex_unlock(&cq->mutex); return ret; } static int mthca_destroy_cq(struct ib_cq *cq) { if (cq->uobject) { mthca_unmap_user_db(to_mdev(cq->device), &to_mucontext(cq->uobject->context)->uar, to_mucontext(cq->uobject->context)->db_tab, to_mcq(cq)->arm_db_index); mthca_unmap_user_db(to_mdev(cq->device), &to_mucontext(cq->uobject->context)->uar, to_mucontext(cq->uobject->context)->db_tab, to_mcq(cq)->set_ci_db_index); } mthca_free_cq(to_mdev(cq->device), to_mcq(cq)); kfree(cq); return 0; } static inline u32 convert_access(int acc) { return (acc & IB_ACCESS_REMOTE_ATOMIC ? MTHCA_MPT_FLAG_ATOMIC : 0) | (acc & IB_ACCESS_REMOTE_WRITE ? MTHCA_MPT_FLAG_REMOTE_WRITE : 0) | (acc & IB_ACCESS_REMOTE_READ ? MTHCA_MPT_FLAG_REMOTE_READ : 0) | (acc & IB_ACCESS_LOCAL_WRITE ? 
MTHCA_MPT_FLAG_LOCAL_WRITE : 0) | MTHCA_MPT_FLAG_LOCAL_READ; } static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc) { struct mthca_mr *mr; int err; mr = kmalloc(sizeof *mr, GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); err = mthca_mr_alloc_notrans(to_mdev(pd->device), to_mpd(pd)->pd_num, convert_access(acc), mr); if (err) { kfree(mr); return ERR_PTR(err); } mr->umem = NULL; return &mr->ibmr; } static struct ib_mr *mthca_reg_phys_mr(struct ib_pd *pd, struct ib_phys_buf *buffer_list, int num_phys_buf, int acc, u64 *iova_start) { struct mthca_mr *mr; u64 *page_list; u64 total_size; unsigned long mask; int shift; int npages; int err; int i, j, n; mask = buffer_list[0].addr ^ *iova_start; total_size = 0; for (i = 0; i < num_phys_buf; ++i) { if (i != 0) mask |= buffer_list[i].addr; if (i != num_phys_buf - 1) mask |= buffer_list[i].addr + buffer_list[i].size; total_size += buffer_list[i].size; } if (mask & ~PAGE_MASK) return ERR_PTR(-EINVAL); shift = __ffs(mask | 1 << 31); buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1); buffer_list[0].addr &= ~0ull << shift; mr = kmalloc(sizeof *mr, GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); npages = 0; for (i = 0; i < num_phys_buf; ++i) npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift; if (!npages) return &mr->ibmr; page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL); if (!page_list) { kfree(mr); return ERR_PTR(-ENOMEM); } n = 0; for (i = 0; i < num_phys_buf; ++i) for (j = 0; j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift; ++j) page_list[n++] = buffer_list[i].addr + ((u64) j << shift); mthca_dbg(to_mdev(pd->device), "Registering memory at %llx (iova %llx) " "in PD %x; shift %d, npages %d.\n", (unsigned long long) buffer_list[0].addr, (unsigned long long) *iova_start, to_mpd(pd)->pd_num, shift, npages); err = mthca_mr_alloc_phys(to_mdev(pd->device), to_mpd(pd)->pd_num, page_list, shift, npages, *iova_start, total_size, convert_access(acc), mr); if (err) { 
kfree(page_list); kfree(mr); return ERR_PTR(err); } kfree(page_list); mr->umem = NULL; return &mr->ibmr; } static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata, int mr_id) { struct mthca_dev *dev = to_mdev(pd->device); struct ib_umem_chunk *chunk; struct mthca_mr *mr; struct mthca_reg_mr ucmd; u64 *pages; int shift, n, len; int i, j, k; int err = 0; int write_mtt_size; if (udata->inlen - sizeof (struct ib_uverbs_cmd_hdr) < sizeof ucmd) { if (!to_mucontext(pd->uobject->context)->reg_mr_warned) { mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n", curproc->p_comm); mthca_warn(dev, " Update libmthca to fix this.\n"); } ++to_mucontext(pd->uobject->context)->reg_mr_warned; ucmd.mr_attrs = 0; } else if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) return ERR_PTR(-EFAULT); mr = kmalloc(sizeof *mr, GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, ucmd.mr_attrs & MTHCA_MR_DMASYNC); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err; } shift = ffs(mr->umem->page_size) - 1; n = 0; list_for_each_entry(chunk, &mr->umem->chunk_list, list) n += chunk->nents; mr->mtt = mthca_alloc_mtt(dev, n); if (IS_ERR(mr->mtt)) { err = PTR_ERR(mr->mtt); goto err_umem; } pages = (u64 *) __get_free_page(GFP_KERNEL); if (!pages) { err = -ENOMEM; goto err_mtt; } i = n = 0; write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages)); list_for_each_entry(chunk, &mr->umem->chunk_list, list) for (j = 0; j < chunk->nmap; ++j) { len = sg_dma_len(&chunk->page_list[j]) >> shift; for (k = 0; k < len; ++k) { pages[i++] = sg_dma_address(&chunk->page_list[j]) + mr->umem->page_size * k; /* * Be friendly to write_mtt and pass it chunks * of appropriate size. 
*/ if (i == write_mtt_size) { err = mthca_write_mtt(dev, mr->mtt, n, pages, i); if (err) goto mtt_done; n += i; i = 0; } } } if (i) err = mthca_write_mtt(dev, mr->mtt, n, pages, i); mtt_done: free_page((unsigned long) pages); if (err) goto err_mtt; err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, virt, length, convert_access(acc), mr); if (err) goto err_mtt; return &mr->ibmr; err_mtt: mthca_free_mtt(dev, mr->mtt); err_umem: ib_umem_release(mr->umem); err: kfree(mr); return ERR_PTR(err); } static int mthca_dereg_mr(struct ib_mr *mr) { struct mthca_mr *mmr = to_mmr(mr); mthca_free_mr(to_mdev(mr->device), mmr); if (mmr->umem) ib_umem_release(mmr->umem); kfree(mmr); return 0; } static struct ib_fmr *mthca_alloc_fmr(struct ib_pd *pd, int mr_access_flags, struct ib_fmr_attr *fmr_attr) { struct mthca_fmr *fmr; int err; fmr = kmalloc(sizeof *fmr, GFP_KERNEL); if (!fmr) return ERR_PTR(-ENOMEM); memcpy(&fmr->attr, fmr_attr, sizeof *fmr_attr); err = mthca_fmr_alloc(to_mdev(pd->device), to_mpd(pd)->pd_num, convert_access(mr_access_flags), fmr); if (err) { kfree(fmr); return ERR_PTR(err); } return &fmr->ibmr; } static int mthca_dealloc_fmr(struct ib_fmr *fmr) { struct mthca_fmr *mfmr = to_mfmr(fmr); int err; err = mthca_free_fmr(to_mdev(fmr->device), mfmr); if (err) return err; kfree(mfmr); return 0; } static int mthca_unmap_fmr(struct list_head *fmr_list) { struct ib_fmr *fmr; int err; u8 status; struct mthca_dev *mdev = NULL; list_for_each_entry(fmr, fmr_list, list) { if (mdev && to_mdev(fmr->device) != mdev) return -EINVAL; mdev = to_mdev(fmr->device); } if (!mdev) return 0; if (mthca_is_memfree(mdev)) { list_for_each_entry(fmr, fmr_list, list) mthca_arbel_fmr_unmap(mdev, to_mfmr(fmr)); wmb(); } else list_for_each_entry(fmr, fmr_list, list) mthca_tavor_fmr_unmap(mdev, to_mfmr(fmr)); err = mthca_SYNC_TPT(mdev, &status); if (err) return err; if (status) return -EINVAL; return 0; } static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { 
struct mthca_dev *dev = container_of(device, struct mthca_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->rev_id); } static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, char *buf) { struct mthca_dev *dev = container_of(device, struct mthca_dev, ib_dev.dev); return sprintf(buf, "%d.%d.%d\n", (int) (dev->fw_ver >> 32), (int) (dev->fw_ver >> 16) & 0xffff, (int) dev->fw_ver & 0xffff); } static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { struct mthca_dev *dev = container_of(device, struct mthca_dev, ib_dev.dev); switch (dev->pdev->device) { case PCI_DEVICE_ID_MELLANOX_TAVOR: return sprintf(buf, "MT23108\n"); case PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT: return sprintf(buf, "MT25208 (MT23108 compat mode)\n"); case PCI_DEVICE_ID_MELLANOX_ARBEL: return sprintf(buf, "MT25208\n"); case PCI_DEVICE_ID_MELLANOX_SINAI: case PCI_DEVICE_ID_MELLANOX_SINAI_OLD: return sprintf(buf, "MT25204\n"); default: return sprintf(buf, "unknown\n"); } } static ssize_t show_board(struct device *device, struct device_attribute *attr, char *buf) { struct mthca_dev *dev = container_of(device, struct mthca_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MTHCA_BOARD_ID_LEN, dev->board_id); } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static struct device_attribute *mthca_dev_attributes[] = { &dev_attr_hw_rev, &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id }; static int mthca_init_node_data(struct mthca_dev *dev) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; u8 status; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; err = mthca_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, 
out_mad, &status); if (err) goto out; if (status) { err = -EINVAL; goto out; } memcpy(dev->ib_dev.node_desc, out_mad->data, 64); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mthca_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad, &status); if (err) goto out; if (status) { err = -EINVAL; goto out; } if (mthca_is_memfree(dev)) dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); out: kfree(in_mad); kfree(out_mad); return err; } +static int mthca_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + int mthca_register_device(struct mthca_dev *dev) { int ret; int i; ret = mthca_init_node_data(dev); if (ret) return ret; strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | (1ull << IB_USER_VERBS_CMD_DETACH_MCAST); dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.phys_port_cnt = 
dev->limits.num_ports; dev->ib_dev.num_comp_vectors = 1; dev->ib_dev.dma_device = &dev->pdev->dev; dev->ib_dev.query_device = mthca_query_device; dev->ib_dev.query_port = mthca_query_port; dev->ib_dev.modify_device = mthca_modify_device; dev->ib_dev.modify_port = mthca_modify_port; dev->ib_dev.query_pkey = mthca_query_pkey; dev->ib_dev.query_gid = mthca_query_gid; dev->ib_dev.alloc_ucontext = mthca_alloc_ucontext; dev->ib_dev.dealloc_ucontext = mthca_dealloc_ucontext; dev->ib_dev.mmap = mthca_mmap_uar; dev->ib_dev.alloc_pd = mthca_alloc_pd; dev->ib_dev.dealloc_pd = mthca_dealloc_pd; dev->ib_dev.create_ah = mthca_ah_create; dev->ib_dev.query_ah = mthca_ah_query; dev->ib_dev.destroy_ah = mthca_ah_destroy; if (dev->mthca_flags & MTHCA_FLAG_SRQ) { dev->ib_dev.create_srq = mthca_create_srq; dev->ib_dev.modify_srq = mthca_modify_srq; dev->ib_dev.query_srq = mthca_query_srq; dev->ib_dev.destroy_srq = mthca_destroy_srq; dev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); if (mthca_is_memfree(dev)) dev->ib_dev.post_srq_recv = mthca_arbel_post_srq_recv; else dev->ib_dev.post_srq_recv = mthca_tavor_post_srq_recv; } dev->ib_dev.create_qp = mthca_create_qp; dev->ib_dev.modify_qp = mthca_modify_qp; dev->ib_dev.query_qp = mthca_query_qp; dev->ib_dev.destroy_qp = mthca_destroy_qp; dev->ib_dev.create_cq = mthca_create_cq; dev->ib_dev.resize_cq = mthca_resize_cq; dev->ib_dev.destroy_cq = mthca_destroy_cq; dev->ib_dev.poll_cq = mthca_poll_cq; dev->ib_dev.get_dma_mr = mthca_get_dma_mr; dev->ib_dev.reg_phys_mr = mthca_reg_phys_mr; dev->ib_dev.reg_user_mr = mthca_reg_user_mr; dev->ib_dev.dereg_mr = mthca_dereg_mr; + dev->ib_dev.get_port_immutable = mthca_port_immutable; if (dev->mthca_flags & MTHCA_FLAG_FMR) { dev->ib_dev.alloc_fmr = mthca_alloc_fmr; dev->ib_dev.unmap_fmr = mthca_unmap_fmr; dev->ib_dev.dealloc_fmr = mthca_dealloc_fmr; if 
(mthca_is_memfree(dev)) dev->ib_dev.map_phys_fmr = mthca_arbel_map_phys_fmr; else dev->ib_dev.map_phys_fmr = mthca_tavor_map_phys_fmr; } dev->ib_dev.attach_mcast = mthca_multicast_attach; dev->ib_dev.detach_mcast = mthca_multicast_detach; dev->ib_dev.process_mad = mthca_process_mad; if (mthca_is_memfree(dev)) { dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq; dev->ib_dev.post_send = mthca_arbel_post_send; dev->ib_dev.post_recv = mthca_arbel_post_receive; } else { dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq; dev->ib_dev.post_send = mthca_tavor_post_send; dev->ib_dev.post_recv = mthca_tavor_post_receive; } mutex_init(&dev->cap_mask_mutex); ret = ib_register_device(&dev->ib_dev, NULL); if (ret) return ret; for (i = 0; i < ARRAY_SIZE(mthca_dev_attributes); ++i) { ret = device_create_file(&dev->ib_dev.dev, mthca_dev_attributes[i]); if (ret) { ib_unregister_device(&dev->ib_dev); return ret; } } mthca_start_catas_poll(dev); return 0; } void mthca_unregister_device(struct mthca_dev *dev) { mthca_stop_catas_poll(dev); ib_unregister_device(&dev->ib_dev); } Index: stable/10/sys/ofed/include/linux/kernel.h =================================================================== --- stable/10/sys/ofed/include/linux/kernel.h (revision 325610) +++ stable/10/sys/ofed/include/linux/kernel.h (revision 325611) @@ -1,187 +1,194 @@ /*- * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _LINUX_KERNEL_H_ #define _LINUX_KERNEL_H_ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERN_CONT "" #define KERN_EMERG "<0>" #define KERN_ALERT "<1>" #define KERN_CRIT "<2>" #define KERN_ERR "<3>" #define KERN_WARNING "<4>" #define KERN_NOTICE "<5>" #define KERN_INFO "<6>" #define KERN_DEBUG "<7>" #define BUILD_BUG_ON(x) CTASSERT(!(x)) #define BUG() panic("BUG") #define BUG_ON(condition) do { if (condition) BUG(); } while(0) -#define WARN_ON BUG_ON +#define WARN_ON(cond) ({ \ + bool __ret = (cond); \ + if (__ret) { \ + printf("WARNING %s failed at %s:%d\n", \ + __stringify(cond), __FILE__, __LINE__); \ + } \ + unlikely(__ret); \ +}) #undef ALIGN #define ALIGN(x, y) roundup2((x), (y)) #undef PTR_ALIGN #define PTR_ALIGN(p, a) ((__typeof(p))ALIGN((uintptr_t)(p), (a))) #define DIV_ROUND_UP howmany #define FIELD_SIZEOF(t, f) sizeof(((t *)0)->f) #define printk(X...) 
printf(X) /* * The "pr_debug()" and "pr_devel()" macros should produce zero code * unless DEBUG is defined: */ #ifdef DEBUG #define pr_debug(fmt, ...) \ log(LOG_DEBUG, fmt, ##__VA_ARGS__) #define pr_devel(fmt, ...) \ log(LOG_DEBUG, pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_debug(fmt, ...) \ ({ if (0) log(LOG_DEBUG, fmt, ##__VA_ARGS__); 0; }) #define pr_devel(fmt, ...) \ ({ if (0) log(LOG_DEBUG, pr_fmt(fmt), ##__VA_ARGS__); 0; }) #endif #define udelay(t) DELAY(t) #define usleep_range(min,max) DELAY(min) #ifndef pr_fmt #define pr_fmt(fmt) fmt #endif /* * Print a one-time message (analogous to WARN_ONCE() et al): */ #define printk_once(...) do { \ static bool __print_once; \ \ if (!__print_once) { \ __print_once = true; \ printk(__VA_ARGS__); \ } \ } while (0) /* * Log a one-time message (analogous to WARN_ONCE() et al): */ #define log_once(level,...) do { \ static bool __log_once; \ \ if (!__log_once) { \ __log_once = true; \ log(level, __VA_ARGS__); \ } \ } while (0) #define pr_emerg(fmt, ...) \ log(LOG_EMERG, pr_fmt(fmt), ##__VA_ARGS__) #define pr_alert(fmt, ...) \ log(LOG_ALERT, pr_fmt(fmt), ##__VA_ARGS__) #define pr_crit(fmt, ...) \ log(LOG_CRIT, pr_fmt(fmt), ##__VA_ARGS__) #define pr_err(fmt, ...) \ log(LOG_ERR, pr_fmt(fmt), ##__VA_ARGS__) #define pr_warning(fmt, ...) \ log(LOG_WARNING, pr_fmt(fmt), ##__VA_ARGS__) #define pr_warn pr_warning #define pr_notice(fmt, ...) \ log(LOG_NOTICE, pr_fmt(fmt), ##__VA_ARGS__) #define pr_info(fmt, ...) \ log(LOG_INFO, pr_fmt(fmt), ##__VA_ARGS__) #define pr_info_once(fmt, ...) \ log_once(LOG_INFO, pr_fmt(fmt), ##__VA_ARGS__) #define pr_cont(fmt, ...) \ printk(KERN_CONT fmt, ##__VA_ARGS__) #ifndef WARN #define WARN(condition, format...) 
({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ pr_warning(format); \ unlikely(__ret_warn_on); \ }) #endif #define container_of(ptr, type, member) \ ({ \ __typeof(((type *)0)->member) *_p = (ptr); \ (type *)((char *)_p - offsetof(type, member)); \ }) #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #define simple_strtoul strtoul #define simple_strtol strtol #define kstrtol(a,b,c) ({*(c) = strtol(a,0,b);}) #define min(x, y) ((x) < (y) ? (x) : (y)) #define max(x, y) ((x) > (y) ? (x) : (y)) #define min_t(type, _x, _y) ((type)(_x) < (type)(_y) ? (type)(_x) : (type)(_y)) #define max_t(type, _x, _y) ((type)(_x) > (type)(_y) ? (type)(_x) : (type)(_y)) /* * This looks more complex than it should be. But we need to * get the type for the ~ right in round_down (it needs to be * as wide as the result!), and we want to evaluate the macro * arguments just once each. */ #define __round_mask(x, y) ((__typeof__(x))((y)-1)) #define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) #define round_down(x, y) ((x) & ~__round_mask(x, y)) #define num_possible_cpus() mp_ncpus #define num_online_cpus() mp_ncpus typedef struct pm_message { int event; } pm_message_t; #endif /* _LINUX_KERNEL_H_ */ Index: stable/10/sys/ofed/include/rdma/ib_mad.h =================================================================== --- stable/10/sys/ofed/include/rdma/ib_mad.h (revision 325610) +++ stable/10/sys/ofed/include/rdma/ib_mad.h (revision 325611) @@ -1,655 +1,656 @@ /* * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004-2006 Voltaire Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #if !defined(IB_MAD_H) #define IB_MAD_H #include #include /* Management base version */ #define IB_MGMT_BASE_VERSION 1 /* Management classes */ #define IB_MGMT_CLASS_SUBN_LID_ROUTED 0x01 #define IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE 0x81 #define IB_MGMT_CLASS_SUBN_ADM 0x03 #define IB_MGMT_CLASS_PERF_MGMT 0x04 #define IB_MGMT_CLASS_BM 0x05 #define IB_MGMT_CLASS_DEVICE_MGMT 0x06 #define IB_MGMT_CLASS_CM 0x07 #define IB_MGMT_CLASS_SNMP 0x08 #define IB_MGMT_CLASS_DEVICE_ADM 0x10 #define IB_MGMT_CLASS_BOOT_MGMT 0x11 #define IB_MGMT_CLASS_BIS 0x12 #define IB_MGMT_CLASS_CONG_MGMT 0x21 #define IB_MGMT_CLASS_VENDOR_RANGE2_START 0x30 #define IB_MGMT_CLASS_VENDOR_RANGE2_END 0x4F #define IB_OPENIB_OUI (0x001405) /* Management methods */ #define IB_MGMT_METHOD_GET 0x01 #define IB_MGMT_METHOD_SET 0x02 #define IB_MGMT_METHOD_GET_RESP 0x81 #define IB_MGMT_METHOD_SEND 0x03 #define IB_MGMT_METHOD_TRAP 0x05 #define IB_MGMT_METHOD_REPORT 0x06 #define IB_MGMT_METHOD_REPORT_RESP 0x86 #define IB_MGMT_METHOD_TRAP_REPRESS 0x07 #define IB_MGMT_METHOD_RESP 0x80 #define IB_BM_ATTR_MOD_RESP cpu_to_be32(1) #define IB_MGMT_MAX_METHODS 128 /* RMPP information */ #define IB_MGMT_RMPP_VERSION 1 #define IB_MGMT_RMPP_TYPE_DATA 1 #define IB_MGMT_RMPP_TYPE_ACK 2 #define IB_MGMT_RMPP_TYPE_STOP 3 #define IB_MGMT_RMPP_TYPE_ABORT 4 #define IB_MGMT_RMPP_FLAG_ACTIVE 1 #define IB_MGMT_RMPP_FLAG_FIRST (1<<1) #define IB_MGMT_RMPP_FLAG_LAST (1<<2) #define IB_MGMT_RMPP_NO_RESPTIME 0x1F #define IB_MGMT_RMPP_STATUS_SUCCESS 0 #define IB_MGMT_RMPP_STATUS_RESX 1 #define IB_MGMT_RMPP_STATUS_ABORT_MIN 118 #define IB_MGMT_RMPP_STATUS_T2L 118 #define IB_MGMT_RMPP_STATUS_BAD_LEN 119 #define IB_MGMT_RMPP_STATUS_BAD_SEG 120 #define IB_MGMT_RMPP_STATUS_BADT 121 #define IB_MGMT_RMPP_STATUS_W2S 122 #define IB_MGMT_RMPP_STATUS_S2B 123 #define IB_MGMT_RMPP_STATUS_BAD_STATUS 124 #define IB_MGMT_RMPP_STATUS_UNV 125 #define IB_MGMT_RMPP_STATUS_TMR 126 #define IB_MGMT_RMPP_STATUS_UNSPEC 127 #define IB_MGMT_RMPP_STATUS_ABORT_MAX 127 
#define IB_QP0 0 #define IB_QP1 cpu_to_be32(1) #define IB_QP1_QKEY 0x80010000 #define IB_QP_SET_QKEY 0x80000000 #define IB_DEFAULT_PKEY_PARTIAL 0x7FFF #define IB_DEFAULT_PKEY_FULL 0xFFFF enum { IB_MGMT_MAD_HDR = 24, IB_MGMT_MAD_DATA = 232, IB_MGMT_RMPP_HDR = 36, IB_MGMT_RMPP_DATA = 220, IB_MGMT_VENDOR_HDR = 40, IB_MGMT_VENDOR_DATA = 216, IB_MGMT_SA_HDR = 56, IB_MGMT_SA_DATA = 200, IB_MGMT_DEVICE_HDR = 64, IB_MGMT_DEVICE_DATA = 192, + IB_MGMT_MAD_SIZE = IB_MGMT_MAD_HDR + IB_MGMT_MAD_DATA, }; struct ib_mad_hdr { u8 base_version; u8 mgmt_class; u8 class_version; u8 method; __be16 status; __be16 class_specific; __be64 tid; __be16 attr_id; __be16 resv; __be32 attr_mod; }; struct ib_rmpp_hdr { u8 rmpp_version; u8 rmpp_type; u8 rmpp_rtime_flags; u8 rmpp_status; __be32 seg_num; __be32 paylen_newwin; }; typedef u64 __bitwise ib_sa_comp_mask; #define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << (n))) /* * ib_sa_hdr and ib_sa_mad structures must be packed because they have * 64-bit fields that are only 32-bit aligned. 64-bit architectures will * lay them out wrong otherwise. 
(And unfortunately they are sent on * the wire so we can't change the layout) */ struct ib_sa_hdr { __be64 sm_key; __be16 attr_offset; __be16 reserved; ib_sa_comp_mask comp_mask; } __attribute__ ((packed)); struct ib_mad { struct ib_mad_hdr mad_hdr; u8 data[IB_MGMT_MAD_DATA]; }; struct ib_rmpp_mad { struct ib_mad_hdr mad_hdr; struct ib_rmpp_hdr rmpp_hdr; u8 data[IB_MGMT_RMPP_DATA]; }; struct ib_sa_mad { struct ib_mad_hdr mad_hdr; struct ib_rmpp_hdr rmpp_hdr; struct ib_sa_hdr sa_hdr; u8 data[IB_MGMT_SA_DATA]; } __attribute__ ((packed)); struct ib_vendor_mad { struct ib_mad_hdr mad_hdr; struct ib_rmpp_hdr rmpp_hdr; u8 reserved; u8 oui[3]; u8 data[IB_MGMT_VENDOR_DATA]; }; struct ib_class_port_info { u8 base_version; u8 class_version; __be16 capability_mask; u8 reserved[3]; u8 resp_time_value; u8 redirect_gid[16]; __be32 redirect_tcslfl; __be16 redirect_lid; __be16 redirect_pkey; __be32 redirect_qp; __be32 redirect_qkey; u8 trap_gid[16]; __be32 trap_tcslfl; __be16 trap_lid; __be16 trap_pkey; __be32 trap_hlqp; __be32 trap_qkey; }; /** * ib_mad_send_buf - MAD data buffer and work request for sends. * @next: A pointer used to chain together MADs for posting. * @mad: References an allocated MAD data buffer for MADs that do not have * RMPP active. For MADs using RMPP, references the common and management * class specific headers. * @mad_agent: MAD agent that allocated the buffer. * @ah: The address handle to use when sending the MAD. * @context: User-controlled context fields. * @hdr_len: Indicates the size of the data header of the MAD. This length * includes the common MAD, RMPP, and class specific headers. * @data_len: Indicates the total size of user-transferred data. * @seg_count: The number of RMPP segments allocated for this send. * @seg_size: Size of each RMPP segment. * @timeout_ms: Time to wait for a response. * @retries: Number of times to retry a request for a response. For MADs * using RMPP, this applies per window. 
On completion, returns the number * of retries needed to complete the transfer. * * Users are responsible for initializing the MAD buffer itself, with the * exception of any RMPP header. Additional segment buffer space allocated * beyond data_len is padding. */ struct ib_mad_send_buf { struct ib_mad_send_buf *next; void *mad; struct ib_mad_agent *mad_agent; struct ib_ah *ah; void *context[2]; int hdr_len; int data_len; int seg_count; int seg_size; int timeout_ms; int retries; }; /** * ib_response_mad - Returns if the specified MAD has been generated in * response to a sent request or trap. */ int ib_response_mad(struct ib_mad *mad); /** * ib_get_rmpp_resptime - Returns the RMPP response time. * @rmpp_hdr: An RMPP header. */ static inline u8 ib_get_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr) { return rmpp_hdr->rmpp_rtime_flags >> 3; } /** * ib_get_rmpp_flags - Returns the RMPP flags. * @rmpp_hdr: An RMPP header. */ static inline u8 ib_get_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr) { return rmpp_hdr->rmpp_rtime_flags & 0x7; } /** * ib_set_rmpp_resptime - Sets the response time in an RMPP header. * @rmpp_hdr: An RMPP header. * @rtime: The response time to set. */ static inline void ib_set_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr, u8 rtime) { rmpp_hdr->rmpp_rtime_flags = ib_get_rmpp_flags(rmpp_hdr) | (rtime << 3); } /** * ib_set_rmpp_flags - Sets the flags in an RMPP header. * @rmpp_hdr: An RMPP header. * @flags: The flags to set. */ static inline void ib_set_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr, u8 flags) { rmpp_hdr->rmpp_rtime_flags = (rmpp_hdr->rmpp_rtime_flags & 0xF8) | (flags & 0x7); } struct ib_mad_agent; struct ib_mad_send_wc; struct ib_mad_recv_wc; /** * ib_mad_send_handler - callback handler for a sent MAD. * @mad_agent: MAD agent that sent the MAD. * @mad_send_wc: Send work completion information on the sent MAD. 
*/ typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_send_wc); /** * ib_mad_snoop_handler - Callback handler for snooping sent MADs. * @mad_agent: MAD agent that snooped the MAD. * @send_buf: Send MAD data buffer for the sent MAD. * @mad_send_wc: Work completion information on the sent MAD. Valid * only for snooping that occurs on a send completion. * * Clients snooping MADs should not modify data referenced by the @send_buf * or @mad_send_wc. */ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, struct ib_mad_send_wc *mad_send_wc); /** * ib_mad_recv_handler - callback handler for a received MAD. * @mad_agent: MAD agent requesting the received MAD. * @mad_recv_wc: Received work completion information on the received MAD. * * MADs received in response to a send request operation will be handed to * the user before the send operation completes. All data buffers given * to registered agents through this routine are owned by the receiving * client, except for snooping agents. Clients snooping MADs should not * modify the data referenced by @mad_recv_wc. */ typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent, struct ib_mad_recv_wc *mad_recv_wc); /** * ib_mad_agent - Used to track MAD registration with the access layer. * @device: Reference to device registration is on. * @qp: Reference to QP used for sending and receiving MADs. * @mr: Memory region for system memory usable for DMA. * @recv_handler: Callback handler for a received MAD. * @send_handler: Callback handler for a sent MAD. * @snoop_handler: Callback handler for snooped sent MADs. * @context: User-specified context associated with this registration. * @hi_tid: Access layer assigned transaction ID for this client. * Unsolicited MADs sent by this client will have the upper 32-bits * of their TID set to this value. 
* @port_num: Port number on which QP is registered * @rmpp_version: If set, indicates the RMPP version used by this agent. */ struct ib_mad_agent { struct ib_device *device; struct ib_qp *qp; struct ib_mr *mr; ib_mad_recv_handler recv_handler; ib_mad_send_handler send_handler; ib_mad_snoop_handler snoop_handler; void *context; u32 hi_tid; u8 port_num; u8 rmpp_version; }; /** * ib_mad_send_wc - MAD send completion information. * @send_buf: Send MAD data buffer associated with the send MAD request. * @status: Completion status. * @vendor_err: Optional vendor error information returned with a failed * request. */ struct ib_mad_send_wc { struct ib_mad_send_buf *send_buf; enum ib_wc_status status; u32 vendor_err; }; /** * ib_mad_recv_buf - received MAD buffer information. * @list: Reference to next data buffer for a received RMPP MAD. * @grh: References a data buffer containing the global route header. * The data referenced by this buffer is only valid if the GRH is * valid. * @mad: References the start of the received MAD. */ struct ib_mad_recv_buf { struct list_head list; struct ib_grh *grh; struct ib_mad *mad; }; /** * ib_mad_recv_wc - received MAD information. * @wc: Completion information for the received data. * @recv_buf: Specifies the location of the received data buffer(s). * @rmpp_list: Specifies a list of RMPP reassembled received MAD buffers. * @mad_len: The length of the received MAD, without duplicated headers. * * For received response, the wr_id contains a pointer to the ib_mad_send_buf * for the corresponding send request. */ struct ib_mad_recv_wc { struct ib_wc *wc; struct ib_mad_recv_buf recv_buf; struct list_head rmpp_list; int mad_len; }; /** * ib_mad_reg_req - MAD registration request * @mgmt_class: Indicates which management class of MADs should be received * by the caller. This field is only required if the user wishes to * receive unsolicited MADs, otherwise it should be 0. 
* @mgmt_class_version: Indicates which version of MADs for the given * management class to receive. * @oui: Indicates IEEE OUI when mgmt_class is a vendor class * in the range from 0x30 to 0x4f. Otherwise not used. * @method_mask: The caller will receive unsolicited MADs for any method * where @method_mask = 1. */ struct ib_mad_reg_req { u8 mgmt_class; u8 mgmt_class_version; u8 oui[3]; DECLARE_BITMAP(method_mask, IB_MGMT_MAX_METHODS); }; /** * ib_register_mad_agent - Register to send/receive MADs. * @device: The device to register with. * @port_num: The port on the specified device to use. * @qp_type: Specifies which QP to access. Must be either * IB_QPT_SMI or IB_QPT_GSI. * @mad_reg_req: Specifies which unsolicited MADs should be received * by the caller. This parameter may be NULL if the caller only * wishes to receive solicited responses. * @rmpp_version: If set, indicates that the client will send * and receive MADs that contain the RMPP header for the given version. * If set to 0, indicates that RMPP is not used by this client. * @send_handler: The completion callback routine invoked after a send * request has completed. * @recv_handler: The completion callback routine invoked for a received * MAD. * @context: User specified context associated with the registration. */ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, u8 port_num, enum ib_qp_type qp_type, struct ib_mad_reg_req *mad_reg_req, u8 rmpp_version, ib_mad_send_handler send_handler, ib_mad_recv_handler recv_handler, void *context); enum ib_mad_snoop_flags { /*IB_MAD_SNOOP_POSTED_SENDS = 1,*/ /*IB_MAD_SNOOP_RMPP_SENDS = (1<<1),*/ IB_MAD_SNOOP_SEND_COMPLETIONS = (1<<2), /*IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS = (1<<3),*/ IB_MAD_SNOOP_RECVS = (1<<4) /*IB_MAD_SNOOP_RMPP_RECVS = (1<<5),*/ /*IB_MAD_SNOOP_REDIRECTED_QPS = (1<<6)*/ }; /** * ib_register_mad_snoop - Register to snoop sent and received MADs. * @device: The device to register with. 
* @port_num: The port on the specified device to use. * @qp_type: Specifies which QP traffic to snoop. Must be either * IB_QPT_SMI or IB_QPT_GSI. * @mad_snoop_flags: Specifies information where snooping occurs. * @snoop_handler: The callback routine invoked for a snooped send. * @recv_handler: The callback routine invoked for a snooped receive. * @context: User specified context associated with the registration. */ struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, u8 port_num, enum ib_qp_type qp_type, int mad_snoop_flags, ib_mad_snoop_handler snoop_handler, ib_mad_recv_handler recv_handler, void *context); /** * ib_unregister_mad_agent - Unregisters a client from using MAD services. * @mad_agent: Corresponding MAD registration request to deregister. * * After invoking this routine, MAD services are no longer usable by the * client on the associated QP. */ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent); /** * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated * with the registered client. * @send_buf: Specifies the information needed to send the MAD(s). * @bad_send_buf: Specifies the MAD on which an error was encountered. This * parameter is optional if only a single MAD is posted. * * Sent MADs are not guaranteed to complete in the order that they were posted. * * If the MAD requires RMPP, the data buffer should contain a single copy * of the common MAD, RMPP, and class specific headers, followed by the class * defined data. If the class defined data would not divide evenly into * RMPP segments, then space must be allocated at the end of the referenced * buffer for any required padding. To indicate the amount of class defined * data being transferred, the paylen_newwin field in the RMPP header should * be set to the size of the class specific header plus the amount of class * defined data being transferred. The paylen_newwin field should be * specified in network-byte order. 
*/ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, struct ib_mad_send_buf **bad_send_buf); /** * ib_free_recv_mad - Returns data buffers used to receive a MAD. * @mad_recv_wc: Work completion information for a received MAD. * * Clients receiving MADs through their ib_mad_recv_handler must call this * routine to return the work completion buffers to the access layer. */ void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc); /** * ib_cancel_mad - Cancels an outstanding send MAD operation. * @mad_agent: Specifies the registration associated with sent MAD. * @send_buf: Indicates the MAD to cancel. * * MADs will be returned to the user through the corresponding * ib_mad_send_handler. */ void ib_cancel_mad(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf); /** * ib_modify_mad - Modifies an outstanding send MAD operation. * @mad_agent: Specifies the registration associated with sent MAD. * @send_buf: Indicates the MAD to modify. * @timeout_ms: New timeout value for sent MAD. * * This call will reset the timeout value for a sent MAD to the specified * value. */ int ib_modify_mad(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, u32 timeout_ms); /** * ib_redirect_mad_qp - Registers a QP for MAD services. * @qp: Reference to a QP that requires MAD services. * @rmpp_version: If set, indicates that the client will send * and receive MADs that contain the RMPP header for the given version. * If set to 0, indicates that RMPP is not used by this client. * @send_handler: The completion callback routine invoked after a send * request has completed. * @recv_handler: The completion callback routine invoked for a received * MAD. * @context: User specified context associated with the registration. * * Use of this call allows clients to use MAD services, such as RMPP, * on user-owned QPs. After calling this routine, users may send * MADs on the specified QP by calling ib_post_send_mad. 
*/ struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp, u8 rmpp_version, ib_mad_send_handler send_handler, ib_mad_recv_handler recv_handler, void *context); /** * ib_process_mad_wc - Processes a work completion associated with a * MAD sent or received on a redirected QP. * @mad_agent: Specifies the registered MAD service using the redirected QP. * @wc: References a work completion associated with a sent or received * MAD segment. * * This routine is used to complete or continue processing on a MAD request. * If the work completion is associated with a send operation, calling * this routine is required to continue an RMPP transfer or to wait for a * corresponding response, if it is a request. If the work completion is * associated with a receive operation, calling this routine is required to * process an inbound or outbound RMPP transfer, or to match a response MAD * with its corresponding request. */ int ib_process_mad_wc(struct ib_mad_agent *mad_agent, struct ib_wc *wc); /** * ib_create_send_mad - Allocate and initialize a data buffer and work request * for sending a MAD. * @mad_agent: Specifies the registered MAD service to associate with the MAD. * @remote_qpn: Specifies the QPN of the receiving node. * @pkey_index: Specifies which PKey the MAD will be sent using. This field * is valid only if the remote_qpn is QP 1. * @rmpp_active: Indicates if the send will enable RMPP. * @hdr_len: Indicates the size of the data header of the MAD. This length * should include the common MAD header, RMPP header, plus any class * specific header. * @data_len: Indicates the size of any user-transferred data. The call will * automatically adjust the allocated buffer size to account for any * additional padding that may be necessary. * @gfp_mask: GFP mask used for the memory allocation. * * This routine allocates a MAD for sending. The returned MAD send buffer * will reference a data buffer usable for sending a MAD, along * with an initialized work request structure. 
Users may modify the returned * MAD data buffer before posting the send. * * The returned MAD header, class specific headers, and any padding will be * cleared. Users are responsible for initializing the common MAD header, * any class specific header, and MAD data area. * If @rmpp_active is set, the RMPP header will be initialized for sending. */ struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent, u32 remote_qpn, u16 pkey_index, int rmpp_active, int hdr_len, int data_len, gfp_t gfp_mask); /** * ib_is_mad_class_rmpp - returns whether given management class * supports RMPP. * @mgmt_class: management class * * This routine returns whether the management class supports RMPP. */ int ib_is_mad_class_rmpp(u8 mgmt_class); /** * ib_get_mad_data_offset - returns the data offset for a given * management class. * @mgmt_class: management class * * This routine returns the data offset in the MAD for the management * class requested. */ int ib_get_mad_data_offset(u8 mgmt_class); /** * ib_get_rmpp_segment - returns the data buffer for a given RMPP segment. * @send_buf: Previously allocated send data buffer. * @seg_num: number of segment to return * * This routine returns a pointer to the data buffer of an RMPP MAD. * Users must provide synchronization to @send_buf around this call. */ void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num); /** * ib_free_send_mad - Returns data buffers used to send a MAD. * @send_buf: Previously allocated send data buffer. */ void ib_free_send_mad(struct ib_mad_send_buf *send_buf); #endif /* IB_MAD_H */ Index: stable/10/sys/ofed/include/rdma/ib_verbs.h =================================================================== --- stable/10/sys/ofed/include/rdma/ib_verbs.h (revision 325610) +++ stable/10/sys/ofed/include/rdma/ib_verbs.h (revision 325611) @@ -1,2340 +1,2653 @@ /* * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. 
* Copyright (c) 2004 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #if !defined(IB_VERBS_H) #define IB_VERBS_H #include #include #include #include #include #include #include #include #include #include #include #include extern struct workqueue_struct *ib_wq; union ib_gid { u8 raw[16]; struct { __be64 subnet_prefix; __be64 interface_id; } global; }; enum rdma_node_type { /* IB values map to NodeInfo:NodeType. 
*/ RDMA_NODE_IB_CA = 1, RDMA_NODE_IB_SWITCH, RDMA_NODE_IB_ROUTER, RDMA_NODE_RNIC }; enum rdma_transport_type { RDMA_TRANSPORT_IB, RDMA_TRANSPORT_IWARP }; enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) __attribute_const__; enum rdma_link_layer { IB_LINK_LAYER_UNSPECIFIED, IB_LINK_LAYER_INFINIBAND, IB_LINK_LAYER_ETHERNET, }; enum ib_device_cap_flags { IB_DEVICE_RESIZE_MAX_WR = 1, IB_DEVICE_BAD_PKEY_CNTR = (1<<1), IB_DEVICE_BAD_QKEY_CNTR = (1<<2), IB_DEVICE_RAW_MULTI = (1<<3), IB_DEVICE_AUTO_PATH_MIG = (1<<4), IB_DEVICE_CHANGE_PHY_PORT = (1<<5), IB_DEVICE_UD_AV_PORT_ENFORCE = (1<<6), IB_DEVICE_CURR_QP_STATE_MOD = (1<<7), IB_DEVICE_SHUTDOWN_PORT = (1<<8), IB_DEVICE_INIT_TYPE = (1<<9), IB_DEVICE_PORT_ACTIVE_EVENT = (1<<10), IB_DEVICE_SYS_IMAGE_GUID = (1<<11), IB_DEVICE_RC_RNR_NAK_GEN = (1<<12), IB_DEVICE_SRQ_RESIZE = (1<<13), IB_DEVICE_N_NOTIFY_CQ = (1<<14), IB_DEVICE_LOCAL_DMA_LKEY = (1<<15), IB_DEVICE_RESERVED = (1<<16), /* old SEND_W_INV */ IB_DEVICE_MEM_WINDOW = (1<<17), /* * Devices should set IB_DEVICE_UD_IP_CSUM if they support * insertion of UDP and TCP checksum on outgoing UD IPoIB * messages and can verify the validity of checksum for * incoming messages. Setting this flag implies that the * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode. 
*/ IB_DEVICE_UD_IP_CSUM = (1<<18), IB_DEVICE_UD_TSO = (1<<19), IB_DEVICE_XRC = (1<<20), IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<21), IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22), IB_DEVICE_MR_ALLOCATE = (1<<23), IB_DEVICE_SHARED_MR = (1<<24), IB_DEVICE_QPG = (1<<25), IB_DEVICE_UD_RSS = (1<<26), IB_DEVICE_UD_TSS = (1<<27) }; enum ib_atomic_cap { IB_ATOMIC_NONE, IB_ATOMIC_HCA, IB_ATOMIC_GLOB }; struct ib_device_attr { u64 fw_ver; __be64 sys_image_guid; u64 max_mr_size; u64 page_size_cap; u32 vendor_id; u32 vendor_part_id; u32 hw_ver; int max_qp; int max_qp_wr; int device_cap_flags; int max_sge; int max_sge_rd; int max_cq; int max_cqe; int max_mr; int max_pd; int max_qp_rd_atom; int max_ee_rd_atom; int max_res_rd_atom; int max_qp_init_rd_atom; int max_ee_init_rd_atom; enum ib_atomic_cap atomic_cap; enum ib_atomic_cap masked_atomic_cap; int max_ee; int max_rdd; int max_mw; int max_raw_ipv6_qp; int max_raw_ethy_qp; int max_mcast_grp; int max_mcast_qp_attach; int max_total_mcast_qp_attach; int max_ah; int max_fmr; int max_map_per_fmr; int max_srq; int max_srq_wr; int max_srq_sge; unsigned int max_fast_reg_page_list_len; int max_rss_tbl_sz; u16 max_pkeys; u8 local_ca_ack_delay; }; enum ib_mtu { IB_MTU_256 = 1, IB_MTU_512 = 2, IB_MTU_1024 = 3, IB_MTU_2048 = 4, IB_MTU_4096 = 5 }; static inline int ib_mtu_enum_to_int(enum ib_mtu mtu) { switch (mtu) { case IB_MTU_256: return 256; case IB_MTU_512: return 512; case IB_MTU_1024: return 1024; case IB_MTU_2048: return 2048; case IB_MTU_4096: return 4096; default: return -1; } } enum ib_port_state { IB_PORT_NOP = 0, IB_PORT_DOWN = 1, IB_PORT_INIT = 2, IB_PORT_ARMED = 3, IB_PORT_ACTIVE = 4, IB_PORT_ACTIVE_DEFER = 5 }; enum ib_port_cap_flags { IB_PORT_SM = 1 << 1, IB_PORT_NOTICE_SUP = 1 << 2, IB_PORT_TRAP_SUP = 1 << 3, IB_PORT_OPT_IPD_SUP = 1 << 4, IB_PORT_AUTO_MIGR_SUP = 1 << 5, IB_PORT_SL_MAP_SUP = 1 << 6, IB_PORT_MKEY_NVRAM = 1 << 7, IB_PORT_PKEY_NVRAM = 1 << 8, IB_PORT_LED_INFO_SUP = 1 << 9, IB_PORT_SM_DISABLED = 1 << 10, 
IB_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, IB_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, IB_PORT_CM_SUP = 1 << 16, IB_PORT_SNMP_TUNNEL_SUP = 1 << 17, IB_PORT_REINIT_SUP = 1 << 18, IB_PORT_DEVICE_MGMT_SUP = 1 << 19, IB_PORT_VENDOR_CLASS_SUP = 1 << 20, IB_PORT_DR_NOTICE_SUP = 1 << 21, IB_PORT_CAP_MASK_NOTICE_SUP = 1 << 22, IB_PORT_BOOT_MGMT_SUP = 1 << 23, IB_PORT_LINK_LATENCY_SUP = 1 << 24, IB_PORT_CLIENT_REG_SUP = 1 << 25 }; enum ib_port_width { IB_WIDTH_1X = 1, IB_WIDTH_4X = 2, IB_WIDTH_8X = 4, IB_WIDTH_12X = 8 }; static inline int ib_width_enum_to_int(enum ib_port_width width) { switch (width) { case IB_WIDTH_1X: return 1; case IB_WIDTH_4X: return 4; case IB_WIDTH_8X: return 8; case IB_WIDTH_12X: return 12; default: return -1; } } enum ib_port_speed { IB_SPEED_SDR = 1, IB_SPEED_DDR = 2, IB_SPEED_QDR = 4, IB_SPEED_FDR10 = 8, IB_SPEED_FDR = 16, IB_SPEED_EDR = 32 }; struct ib_protocol_stats { /* TBD... */ }; struct iw_protocol_stats { u64 ipInReceives; u64 ipInHdrErrors; u64 ipInTooBigErrors; u64 ipInNoRoutes; u64 ipInAddrErrors; u64 ipInUnknownProtos; u64 ipInTruncatedPkts; u64 ipInDiscards; u64 ipInDelivers; u64 ipOutForwDatagrams; u64 ipOutRequests; u64 ipOutDiscards; u64 ipOutNoRoutes; u64 ipReasmTimeout; u64 ipReasmReqds; u64 ipReasmOKs; u64 ipReasmFails; u64 ipFragOKs; u64 ipFragFails; u64 ipFragCreates; u64 ipInMcastPkts; u64 ipOutMcastPkts; u64 ipInBcastPkts; u64 ipOutBcastPkts; u64 tcpRtoAlgorithm; u64 tcpRtoMin; u64 tcpRtoMax; u64 tcpMaxConn; u64 tcpActiveOpens; u64 tcpPassiveOpens; u64 tcpAttemptFails; u64 tcpEstabResets; u64 tcpCurrEstab; u64 tcpInSegs; u64 tcpOutSegs; u64 tcpRetransSegs; u64 tcpInErrs; u64 tcpOutRsts; }; union rdma_protocol_stats { struct ib_protocol_stats ib; struct iw_protocol_stats iw; }; +/* Define bits for the various functionality this port needs to be supported by + * the core. 
+ */ +/* Management 0x00000FFF */ +#define RDMA_CORE_CAP_IB_MAD 0x00000001 +#define RDMA_CORE_CAP_IB_SMI 0x00000002 +#define RDMA_CORE_CAP_IB_CM 0x00000004 +#define RDMA_CORE_CAP_IW_CM 0x00000008 +#define RDMA_CORE_CAP_IB_SA 0x00000010 +#define RDMA_CORE_CAP_OPA_MAD 0x00000020 + +/* Address format 0x000FF000 */ +#define RDMA_CORE_CAP_AF_IB 0x00001000 +#define RDMA_CORE_CAP_ETH_AH 0x00002000 +#define RDMA_CORE_CAP_OPA_AH 0x00004000 + +/* Protocol 0xFFF00000 */ +#define RDMA_CORE_CAP_PROT_IB 0x00100000 +#define RDMA_CORE_CAP_PROT_ROCE 0x00200000 +#define RDMA_CORE_CAP_PROT_IWARP 0x00400000 +#define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000 +#define RDMA_CORE_CAP_PROT_RAW_PACKET 0x01000000 +#define RDMA_CORE_CAP_PROT_USNIC 0x02000000 + +#define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_SMI \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_IB_SA \ + | RDMA_CORE_CAP_AF_IB) +#define RDMA_CORE_PORT_IBA_ROCE (RDMA_CORE_CAP_PROT_ROCE \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_AF_IB \ + | RDMA_CORE_CAP_ETH_AH) +#define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \ + (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_AF_IB \ + | RDMA_CORE_CAP_ETH_AH) +#define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \ + | RDMA_CORE_CAP_IW_CM) +#define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \ + | RDMA_CORE_CAP_OPA_MAD) + +#define RDMA_CORE_PORT_RAW_PACKET (RDMA_CORE_CAP_PROT_RAW_PACKET) + +#define RDMA_CORE_PORT_USNIC (RDMA_CORE_CAP_PROT_USNIC) + struct ib_port_attr { enum ib_port_state state; enum ib_mtu max_mtu; enum ib_mtu active_mtu; int gid_tbl_len; u32 port_cap_flags; u32 max_msg_sz; u32 bad_pkey_cntr; u32 qkey_viol_cntr; u16 pkey_tbl_len; u16 lid; u16 sm_lid; u8 lmc; u8 max_vl_num; u8 sm_sl; u8 subnet_timeout; u8 init_type_reply; u8 active_width; u8 active_speed; u8 phys_state; enum rdma_link_layer link_layer; }; enum ib_device_modify_flags { 
IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1 << 0, IB_DEVICE_MODIFY_NODE_DESC = 1 << 1 }; struct ib_device_modify { u64 sys_image_guid; char node_desc[64]; }; enum ib_port_modify_flags { IB_PORT_SHUTDOWN = 1, IB_PORT_INIT_TYPE = (1<<2), IB_PORT_RESET_QKEY_CNTR = (1<<3) }; struct ib_port_modify { u32 set_port_cap_mask; u32 clr_port_cap_mask; u8 init_type; }; enum ib_event_type { IB_EVENT_CQ_ERR, IB_EVENT_QP_FATAL, IB_EVENT_QP_REQ_ERR, IB_EVENT_QP_ACCESS_ERR, IB_EVENT_COMM_EST, IB_EVENT_SQ_DRAINED, IB_EVENT_PATH_MIG, IB_EVENT_PATH_MIG_ERR, IB_EVENT_DEVICE_FATAL, IB_EVENT_PORT_ACTIVE, IB_EVENT_PORT_ERR, IB_EVENT_LID_CHANGE, IB_EVENT_PKEY_CHANGE, IB_EVENT_SM_CHANGE, IB_EVENT_SRQ_ERR, IB_EVENT_SRQ_LIMIT_REACHED, IB_EVENT_QP_LAST_WQE_REACHED, IB_EVENT_CLIENT_REREGISTER, IB_EVENT_GID_CHANGE, }; enum ib_event_flags { IB_XRC_QP_EVENT_FLAG = 0x80000000, }; struct ib_event { struct ib_device *device; union { struct ib_cq *cq; struct ib_qp *qp; struct ib_srq *srq; u8 port_num; u32 xrc_qp_num; } element; enum ib_event_type event; }; struct ib_event_handler { struct ib_device *device; void (*handler)(struct ib_event_handler *, struct ib_event *); struct list_head list; }; #define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \ do { \ (_ptr)->device = _device; \ (_ptr)->handler = _handler; \ INIT_LIST_HEAD(&(_ptr)->list); \ } while (0) struct ib_global_route { union ib_gid dgid; u32 flow_label; u8 sgid_index; u8 hop_limit; u8 traffic_class; }; struct ib_grh { __be32 version_tclass_flow; __be16 paylen; u8 next_hdr; u8 hop_limit; union ib_gid sgid; union ib_gid dgid; }; enum { IB_MULTICAST_QPN = 0xffffff }; #define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF) enum ib_ah_flags { IB_AH_GRH = 1 }; enum ib_rate { IB_RATE_PORT_CURRENT = 0, IB_RATE_2_5_GBPS = 2, IB_RATE_5_GBPS = 5, IB_RATE_10_GBPS = 3, IB_RATE_20_GBPS = 6, IB_RATE_30_GBPS = 4, IB_RATE_40_GBPS = 7, IB_RATE_60_GBPS = 8, IB_RATE_80_GBPS = 9, IB_RATE_120_GBPS = 10, IB_RATE_14_GBPS = 11, IB_RATE_56_GBPS = 12, IB_RATE_112_GBPS = 13, 
IB_RATE_168_GBPS = 14, IB_RATE_25_GBPS = 15, IB_RATE_100_GBPS = 16, IB_RATE_200_GBPS = 17, IB_RATE_300_GBPS = 18 }; /** * ib_rate_to_mult - Convert the IB rate enum to a multiple of the * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. * @rate: rate to convert. */ int ib_rate_to_mult(enum ib_rate rate) __attribute_const__; /** * ib_rate_to_mbps - Convert the IB rate enum to Mbps. * For example, IB_RATE_2_5_GBPS will be converted to 2500. * @rate: rate to convert. */ int ib_rate_to_mbps(enum ib_rate rate) __attribute_const__; /** * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate * enum. * @mult: multiple to convert. */ enum ib_rate mult_to_ib_rate(int mult) __attribute_const__; struct ib_ah_attr { struct ib_global_route grh; u16 dlid; u8 sl; u8 src_path_bits; u8 static_rate; u8 ah_flags; u8 port_num; }; enum ib_wc_status { IB_WC_SUCCESS, IB_WC_LOC_LEN_ERR, IB_WC_LOC_QP_OP_ERR, IB_WC_LOC_EEC_OP_ERR, IB_WC_LOC_PROT_ERR, IB_WC_WR_FLUSH_ERR, IB_WC_MW_BIND_ERR, IB_WC_BAD_RESP_ERR, IB_WC_LOC_ACCESS_ERR, IB_WC_REM_INV_REQ_ERR, IB_WC_REM_ACCESS_ERR, IB_WC_REM_OP_ERR, IB_WC_RETRY_EXC_ERR, IB_WC_RNR_RETRY_EXC_ERR, IB_WC_LOC_RDD_VIOL_ERR, IB_WC_REM_INV_RD_REQ_ERR, IB_WC_REM_ABORT_ERR, IB_WC_INV_EECN_ERR, IB_WC_INV_EEC_STATE_ERR, IB_WC_FATAL_ERR, IB_WC_RESP_TIMEOUT_ERR, IB_WC_GENERAL_ERR }; enum ib_wc_opcode { IB_WC_SEND, IB_WC_RDMA_WRITE, IB_WC_RDMA_READ, IB_WC_COMP_SWAP, IB_WC_FETCH_ADD, IB_WC_BIND_MW, IB_WC_LSO, IB_WC_LOCAL_INV, IB_WC_FAST_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, /* * Set value of IB_WC_RECV so consumers can test if a completion is a * receive by testing (opcode & IB_WC_RECV). 
*/ IB_WC_RECV = 1 << 7, IB_WC_RECV_RDMA_WITH_IMM }; enum ib_wc_flags { IB_WC_GRH = 1, IB_WC_WITH_IMM = (1<<1), IB_WC_WITH_INVALIDATE = (1<<2), IB_WC_IP_CSUM_OK = (1<<3), }; struct ib_wc { u64 wr_id; enum ib_wc_status status; enum ib_wc_opcode opcode; u32 vendor_err; u32 byte_len; struct ib_qp *qp; union { __be32 imm_data; u32 invalidate_rkey; } ex; u32 src_qp; int wc_flags; u16 pkey_index; u16 slid; u8 sl; u8 dlid_path_bits; u8 port_num; /* valid only for DR SMPs on switches */ int csum_ok; }; enum ib_cq_notify_flags { IB_CQ_SOLICITED = 1 << 0, IB_CQ_NEXT_COMP = 1 << 1, IB_CQ_SOLICITED_MASK = IB_CQ_SOLICITED | IB_CQ_NEXT_COMP, IB_CQ_REPORT_MISSED_EVENTS = 1 << 2, }; enum ib_srq_type { IB_SRQT_BASIC, IB_SRQT_XRC }; enum ib_srq_attr_mask { IB_SRQ_MAX_WR = 1 << 0, IB_SRQ_LIMIT = 1 << 1, }; struct ib_srq_attr { u32 max_wr; u32 max_sge; u32 srq_limit; }; struct ib_srq_init_attr { void (*event_handler)(struct ib_event *, void *); void *srq_context; struct ib_srq_attr attr; enum ib_srq_type srq_type; union { struct { struct ib_xrcd *xrcd; struct ib_cq *cq; } xrc; } ext; }; struct ib_qp_cap { u32 max_send_wr; u32 max_recv_wr; u32 max_send_sge; u32 max_recv_sge; u32 max_inline_data; u32 qpg_tss_mask_sz; }; enum ib_sig_type { IB_SIGNAL_ALL_WR, IB_SIGNAL_REQ_WR }; enum ib_qp_type { /* * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries * here (and in that order) since the MAD layer uses them as * indices into a 2-entry table. 
*/ IB_QPT_SMI, IB_QPT_GSI, IB_QPT_RC, IB_QPT_UC, IB_QPT_UD, IB_QPT_XRC, IB_QPT_RAW_IPV6, IB_QPT_RAW_ETHERTYPE, IB_QPT_RAW_PACKET = 8, IB_QPT_XRC_INI = 9, IB_QPT_XRC_TGT, IB_QPT_MAX, }; enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, IB_QP_CREATE_NETIF_QP = 1 << 2, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, }; enum ib_qpg_type { IB_QPG_NONE = 0, IB_QPG_PARENT = (1<<0), IB_QPG_CHILD_RX = (1<<1), IB_QPG_CHILD_TX = (1<<2) }; struct ib_qpg_init_attrib { u32 tss_child_count; u32 rss_child_count; }; struct ib_qp_init_attr { void (*event_handler)(struct ib_event *, void *); void *qp_context; struct ib_cq *send_cq; struct ib_cq *recv_cq; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct ib_qp_cap cap; union { struct ib_qp *qpg_parent; /* see qpg_type */ struct ib_qpg_init_attrib parent_attrib; } pp; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; enum ib_qp_create_flags create_flags; enum ib_qpg_type qpg_type; u8 port_num; /* special QP types only */ }; struct ib_qp_open_attr { void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; enum ib_qp_type qp_type; }; enum ib_rnr_timeout { IB_RNR_TIMER_655_36 = 0, IB_RNR_TIMER_000_01 = 1, IB_RNR_TIMER_000_02 = 2, IB_RNR_TIMER_000_03 = 3, IB_RNR_TIMER_000_04 = 4, IB_RNR_TIMER_000_06 = 5, IB_RNR_TIMER_000_08 = 6, IB_RNR_TIMER_000_12 = 7, IB_RNR_TIMER_000_16 = 8, IB_RNR_TIMER_000_24 = 9, IB_RNR_TIMER_000_32 = 10, IB_RNR_TIMER_000_48 = 11, IB_RNR_TIMER_000_64 = 12, IB_RNR_TIMER_000_96 = 13, IB_RNR_TIMER_001_28 = 14, IB_RNR_TIMER_001_92 = 15, IB_RNR_TIMER_002_56 = 16, IB_RNR_TIMER_003_84 = 17, IB_RNR_TIMER_005_12 = 18, IB_RNR_TIMER_007_68 = 19, IB_RNR_TIMER_010_24 = 20, IB_RNR_TIMER_015_36 = 21, IB_RNR_TIMER_020_48 = 22, IB_RNR_TIMER_030_72 = 23, IB_RNR_TIMER_040_96 = 24, IB_RNR_TIMER_061_44 = 25, IB_RNR_TIMER_081_92 = 26, 
IB_RNR_TIMER_122_88 = 27, IB_RNR_TIMER_163_84 = 28, IB_RNR_TIMER_245_76 = 29, IB_RNR_TIMER_327_68 = 30, IB_RNR_TIMER_491_52 = 31 }; enum ib_qp_attr_mask { IB_QP_STATE = 1, IB_QP_CUR_STATE = (1<<1), IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2), IB_QP_ACCESS_FLAGS = (1<<3), IB_QP_PKEY_INDEX = (1<<4), IB_QP_PORT = (1<<5), IB_QP_QKEY = (1<<6), IB_QP_AV = (1<<7), IB_QP_PATH_MTU = (1<<8), IB_QP_TIMEOUT = (1<<9), IB_QP_RETRY_CNT = (1<<10), IB_QP_RNR_RETRY = (1<<11), IB_QP_RQ_PSN = (1<<12), IB_QP_MAX_QP_RD_ATOMIC = (1<<13), IB_QP_ALT_PATH = (1<<14), IB_QP_MIN_RNR_TIMER = (1<<15), IB_QP_SQ_PSN = (1<<16), IB_QP_MAX_DEST_RD_ATOMIC = (1<<17), IB_QP_PATH_MIG_STATE = (1<<18), IB_QP_CAP = (1<<19), IB_QP_DEST_QPN = (1<<20), IB_QP_GROUP_RSS = (1<<21) }; enum ib_qp_state { IB_QPS_RESET, IB_QPS_INIT, IB_QPS_RTR, IB_QPS_RTS, IB_QPS_SQD, IB_QPS_SQE, IB_QPS_ERR }; enum ib_mig_state { IB_MIG_MIGRATED, IB_MIG_REARM, IB_MIG_ARMED }; struct ib_qp_attr { enum ib_qp_state qp_state; enum ib_qp_state cur_qp_state; enum ib_mtu path_mtu; enum ib_mig_state path_mig_state; u32 qkey; u32 rq_psn; u32 sq_psn; u32 dest_qp_num; int qp_access_flags; struct ib_qp_cap cap; struct ib_ah_attr ah_attr; struct ib_ah_attr alt_ah_attr; u16 pkey_index; u16 alt_pkey_index; u8 en_sqd_async_notify; u8 sq_draining; u8 max_rd_atomic; u8 max_dest_rd_atomic; u8 min_rnr_timer; u8 port_num; u8 timeout; u8 retry_cnt; u8 rnr_retry; u8 alt_port_num; u8 alt_timeout; }; enum ib_wr_opcode { IB_WR_RDMA_WRITE, IB_WR_RDMA_WRITE_WITH_IMM, IB_WR_SEND, IB_WR_SEND_WITH_IMM, IB_WR_RDMA_READ, IB_WR_ATOMIC_CMP_AND_SWP, IB_WR_ATOMIC_FETCH_AND_ADD, IB_WR_LSO, IB_WR_BIG_LSO, IB_WR_SEND_WITH_INV, IB_WR_RDMA_READ_WITH_INV, IB_WR_LOCAL_INV, IB_WR_FAST_REG_MR, IB_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, }; enum ib_send_flags { IB_SEND_FENCE = 1, IB_SEND_SIGNALED = (1<<1), IB_SEND_SOLICITED = (1<<2), IB_SEND_INLINE = (1<<3), IB_SEND_IP_CSUM = (1<<4) }; enum ib_flow_types { IB_FLOW_ETH = 0, IB_FLOW_IB_UC = 1, IB_FLOW_IB_MC_IPV4 = 2, 
IB_FLOW_IB_MC_IPV6 = 3 }; enum { IB_FLOW_L4_NONE = 0, IB_FLOW_L4_OTHER = 3, IB_FLOW_L4_UDP = 5, IB_FLOW_L4_TCP = 6 }; struct ib_sge { u64 addr; u32 length; u32 lkey; }; struct ib_fast_reg_page_list { struct ib_device *device; u64 *page_list; unsigned int max_page_list_len; }; struct ib_send_wr { struct ib_send_wr *next; u64 wr_id; struct ib_sge *sg_list; int num_sge; enum ib_wr_opcode opcode; int send_flags; union { __be32 imm_data; u32 invalidate_rkey; } ex; union { struct { u64 remote_addr; u32 rkey; } rdma; struct { u64 remote_addr; u64 compare_add; u64 swap; u64 compare_add_mask; u64 swap_mask; u32 rkey; } atomic; struct { struct ib_ah *ah; void *header; int hlen; int mss; u32 remote_qpn; u32 remote_qkey; u16 pkey_index; /* valid for GSI only */ u8 port_num; /* valid for DR SMPs on switch only */ } ud; struct { u64 iova_start; struct ib_fast_reg_page_list *page_list; unsigned int page_shift; unsigned int page_list_len; u32 length; int access_flags; u32 rkey; } fast_reg; struct { struct ib_unpacked_lrh *lrh; u32 eth_type; u8 static_rate; } raw_ety; } wr; u32 xrc_remote_srq_num; /* XRC TGT QPs only */ }; struct ib_recv_wr { struct ib_recv_wr *next; u64 wr_id; struct ib_sge *sg_list; int num_sge; }; enum ib_access_flags { IB_ACCESS_LOCAL_WRITE = 1, IB_ACCESS_REMOTE_WRITE = (1<<1), IB_ACCESS_REMOTE_READ = (1<<2), IB_ACCESS_REMOTE_ATOMIC = (1<<3), IB_ACCESS_MW_BIND = (1<<4), IB_ACCESS_ALLOCATE_MR = (1<<5), IB_ACCESS_SHARED_MR_USER_READ = (1<<6), IB_ACCESS_SHARED_MR_USER_WRITE = (1<<7), IB_ACCESS_SHARED_MR_GROUP_READ = (1<<8), IB_ACCESS_SHARED_MR_GROUP_WRITE = (1<<9), IB_ACCESS_SHARED_MR_OTHER_READ = (1<<10), IB_ACCESS_SHARED_MR_OTHER_WRITE = (1<<11) }; struct ib_phys_buf { u64 addr; u64 size; }; struct ib_mr_attr { struct ib_pd *pd; u64 device_virt_addr; u64 size; int mr_access_flags; u32 lkey; u32 rkey; }; enum ib_mr_rereg_flags { IB_MR_REREG_TRANS = 1, IB_MR_REREG_PD = (1<<1), IB_MR_REREG_ACCESS = (1<<2) }; struct ib_mw_bind { struct ib_mr *mr; u64 wr_id; u64 
addr; u32 length; int send_flags; int mw_access_flags; }; struct ib_fmr_attr { int max_pages; int max_maps; u8 page_shift; }; struct ib_ucontext { struct ib_device *device; struct list_head pd_list; struct list_head mr_list; struct list_head mw_list; struct list_head cq_list; struct list_head qp_list; struct list_head srq_list; struct list_head ah_list; struct list_head xrcd_list; int closing; }; struct ib_uobject { u64 user_handle; /* handle given to us by userspace */ struct ib_ucontext *context; /* associated user context */ void *object; /* containing object */ struct list_head list; /* link to context's list */ int id; /* index into kernel idr */ struct kref ref; struct rw_semaphore mutex; /* protects .live */ int live; }; struct ib_udata { void __user *inbuf; void __user *outbuf; size_t inlen; size_t outlen; }; struct ib_uxrc_rcv_object { struct list_head list; /* link to context's list */ u32 qp_num; u32 domain_handle; }; struct ib_pd { struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; /* count all resources */ }; struct ib_xrcd { struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; /* count all exposed resources */ struct inode *inode; struct rb_node node; struct mutex tgt_qp_mutex; struct list_head tgt_qp_list; }; struct ib_ah { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; }; typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); struct ib_cq { struct ib_device *device; struct ib_uobject *uobject; ib_comp_handler comp_handler; void (*event_handler)(struct ib_event *, void *); void *cq_context; int cqe; atomic_t usecnt; /* count number of work queues */ }; struct ib_srq { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; void (*event_handler)(struct ib_event *, void *); void *srq_context; enum ib_srq_type srq_type; atomic_t usecnt; union { struct { struct ib_xrcd *xrcd; struct ib_cq *cq; u32 srq_num; } xrc; } ext; }; struct ib_qp { struct ib_device *device; 
struct ib_pd *pd; struct ib_cq *send_cq; struct ib_cq *recv_cq; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct list_head xrcd_list; atomic_t usecnt; /* count times opened, mcast attaches */ struct list_head open_list; struct ib_qp *real_qp; struct ib_uobject *uobject; void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; enum ib_qp_type qp_type; enum ib_qpg_type qpg_type; }; struct ib_mr { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; u32 lkey; u32 rkey; atomic_t usecnt; /* count number of MWs */ }; struct ib_mw { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; u32 rkey; }; struct ib_fmr { struct ib_device *device; struct ib_pd *pd; struct list_head list; u32 lkey; u32 rkey; }; struct ib_flow_spec { enum ib_flow_types type; union { struct { __be16 ethertype; __be16 vlan; u8 vlan_present; u8 mac[6]; u8 port; } eth; struct { __be32 qpn; } ib_uc; struct { u8 mgid[16]; } ib_mc; } l2_id; __be32 src_ip; __be32 dst_ip; __be16 src_port; __be16 dst_port; u8 l4_protocol; u8 block_mc_loopback; u8 rule_type; }; struct ib_mad; struct ib_grh; enum ib_process_mad_flags { IB_MAD_IGNORE_MKEY = 1, IB_MAD_IGNORE_BKEY = 2, IB_MAD_IGNORE_ALL = IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY }; enum ib_mad_result { IB_MAD_RESULT_FAILURE = 0, /* (!SUCCESS is the important flag) */ IB_MAD_RESULT_SUCCESS = 1 << 0, /* MAD was successfully processed */ IB_MAD_RESULT_REPLY = 1 << 1, /* Reply packet needs to be sent */ IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */ }; #define IB_DEVICE_NAME_MAX 64 struct ib_cache { rwlock_t lock; struct ib_event_handler event_handler; struct ib_pkey_cache **pkey_cache; struct ib_gid_cache **gid_cache; u8 *lmc_cache; }; struct ib_dma_mapping_ops { int (*mapping_error)(struct ib_device *dev, u64 dma_addr); u64 (*map_single)(struct ib_device *dev, void *ptr, size_t size, enum dma_data_direction direction); void (*unmap_single)(struct ib_device 
*dev, u64 addr, size_t size, enum dma_data_direction direction); u64 (*map_page)(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction); void (*unmap_page)(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction); int (*map_sg)(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction); void (*unmap_sg)(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction); u64 (*dma_address)(struct ib_device *dev, struct scatterlist *sg); unsigned int (*dma_len)(struct ib_device *dev, struct scatterlist *sg); void (*sync_single_for_cpu)(struct ib_device *dev, u64 dma_handle, size_t size, enum dma_data_direction dir); void (*sync_single_for_device)(struct ib_device *dev, u64 dma_handle, size_t size, enum dma_data_direction dir); void *(*alloc_coherent)(struct ib_device *dev, size_t size, u64 *dma_handle, gfp_t flag); void (*free_coherent)(struct ib_device *dev, size_t size, void *cpu_addr, u64 dma_handle); }; struct iw_cm_verbs; +struct ib_port_immutable { + int pkey_tbl_len; + int gid_tbl_len; + u32 core_cap_flags; + u32 max_mad_size; +}; + struct ib_device { struct device *dma_device; char name[IB_DEVICE_NAME_MAX]; struct list_head event_handler_list; spinlock_t event_handler_lock; spinlock_t client_data_lock; struct list_head core_list; struct list_head client_data_list; struct ib_cache cache; - int *pkey_tbl_len; - int *gid_tbl_len; + /** + * port_immutable is indexed by port number + */ + struct ib_port_immutable *port_immutable; int num_comp_vectors; struct iw_cm_verbs *iwcm; int (*get_protocol_stats)(struct ib_device *device, union rdma_protocol_stats *stats); int (*query_device)(struct ib_device *device, struct ib_device_attr *device_attr); int (*query_port)(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer (*get_link_layer)(struct ib_device *device, u8 port_num); int 
(*query_gid)(struct ib_device *device, u8 port_num, int index, union ib_gid *gid); int (*query_pkey)(struct ib_device *device, u8 port_num, u16 index, u16 *pkey); int (*modify_device)(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); int (*modify_port)(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify); struct ib_ucontext * (*alloc_ucontext)(struct ib_device *device, struct ib_udata *udata); int (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); struct ib_pd * (*alloc_pd)(struct ib_device *device, struct ib_ucontext *context, struct ib_udata *udata); int (*dealloc_pd)(struct ib_pd *pd); struct ib_ah * (*create_ah)(struct ib_pd *pd, struct ib_ah_attr *ah_attr); int (*modify_ah)(struct ib_ah *ah, struct ib_ah_attr *ah_attr); int (*query_ah)(struct ib_ah *ah, struct ib_ah_attr *ah_attr); int (*destroy_ah)(struct ib_ah *ah); struct ib_srq * (*create_srq)(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr, struct ib_udata *udata); int (*modify_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask, struct ib_udata *udata); int (*query_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr); int (*destroy_srq)(struct ib_srq *srq); int (*post_srq_recv)(struct ib_srq *srq, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr); struct ib_qp * (*create_qp)(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata); int (*modify_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_udata *udata); int (*query_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int (*destroy_qp)(struct ib_qp *qp); int (*post_send)(struct ib_qp *qp, struct ib_send_wr *send_wr, struct ib_send_wr **bad_send_wr); int (*post_recv)(struct ib_qp *qp, struct ib_recv_wr *recv_wr, struct ib_recv_wr 
**bad_recv_wr); struct ib_cq * (*create_cq)(struct ib_device *device, int cqe, int comp_vector, struct ib_ucontext *context, struct ib_udata *udata); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*destroy_cq)(struct ib_cq *cq); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); int (*poll_cq)(struct ib_cq *cq, int num_entries, struct ib_wc *wc); int (*peek_cq)(struct ib_cq *cq, int wc_cnt); int (*req_notify_cq)(struct ib_cq *cq, enum ib_cq_notify_flags flags); int (*req_ncomp_notif)(struct ib_cq *cq, int wc_cnt); struct ib_mr * (*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); struct ib_mr * (*reg_phys_mr)(struct ib_pd *pd, struct ib_phys_buf *phys_buf_array, int num_phys_buf, int mr_access_flags, u64 *iova_start); struct ib_mr * (*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_udata *udata, int mr_id); int (*query_mr)(struct ib_mr *mr, struct ib_mr_attr *mr_attr); int (*dereg_mr)(struct ib_mr *mr); struct ib_mr * (*alloc_fast_reg_mr)(struct ib_pd *pd, int max_page_list_len); struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device, int page_list_len); void (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list); int (*rereg_phys_mr)(struct ib_mr *mr, int mr_rereg_mask, struct ib_pd *pd, struct ib_phys_buf *phys_buf_array, int num_phys_buf, int mr_access_flags, u64 *iova_start); struct ib_mw * (*alloc_mw)(struct ib_pd *pd); int (*bind_mw)(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind); int (*dealloc_mw)(struct ib_mw *mw); struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd, int mr_access_flags, struct ib_fmr_attr *fmr_attr); int (*map_phys_fmr)(struct ib_fmr *fmr, u64 *page_list, int list_len, u64 iova); int (*unmap_fmr)(struct list_head *fmr_list); int (*dealloc_fmr)(struct ib_fmr *fmr); int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 
lid); int (*process_mad)(struct ib_device *device, int process_mad_flags, u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad); struct ib_srq * (*create_xrc_srq)(struct ib_pd *pd, struct ib_cq *xrc_cq, struct ib_xrcd *xrcd, struct ib_srq_init_attr *srq_init_attr, struct ib_udata *udata); struct ib_xrcd * (*alloc_xrcd)(struct ib_device *device, struct ib_ucontext *ucontext, struct ib_udata *udata); int (*dealloc_xrcd)(struct ib_xrcd *xrcd); int (*create_xrc_rcv_qp)(struct ib_qp_init_attr *init_attr, u32 *qp_num); int (*modify_xrc_rcv_qp)(struct ib_xrcd *xrcd, u32 qp_num, struct ib_qp_attr *attr, int attr_mask); int (*query_xrc_rcv_qp)(struct ib_xrcd *xrcd, u32 qp_num, struct ib_qp_attr *attr, int attr_mask, struct ib_qp_init_attr *init_attr); int (*reg_xrc_rcv_qp)(struct ib_xrcd *xrcd, void *context, u32 qp_num); int (*unreg_xrc_rcv_qp)(struct ib_xrcd *xrcd, void *context, u32 qp_num); int (*attach_flow)(struct ib_qp *qp, struct ib_flow_spec *spec, int priority); int (*detach_flow)(struct ib_qp *qp, struct ib_flow_spec *spec, int priority); unsigned long (*get_unmapped_area)(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); struct ib_dma_mapping_ops *dma_ops; struct module *owner; struct device dev; struct kobject *ports_parent; struct list_head port_list; enum { IB_DEV_UNINITIALIZED, IB_DEV_REGISTERED, IB_DEV_UNREGISTERED } reg_state; int uverbs_abi_ver; u64 uverbs_cmd_mask; char node_desc[64]; __be64 node_guid; u32 local_dma_lkey; u8 node_type; u8 phys_port_cnt; struct rb_root ib_uverbs_xrcd_table; struct mutex xrcd_table_mutex; + + /** + * The following mandatory functions are used only at device + * registration. Keep functions such as these at the end of this + * structure to avoid cache line misses when accessing struct ib_device + * in fast paths. 
+ */ + int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *); }; struct ib_client { char *name; void (*add) (struct ib_device *); void (*remove)(struct ib_device *); struct list_head list; }; struct ib_device *ib_alloc_device(size_t size); void ib_dealloc_device(struct ib_device *device); int ib_register_device(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)); void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); void ib_unregister_client(struct ib_client *client); void *ib_get_client_data(struct ib_device *device, struct ib_client *client); void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; } static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) { return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; } /** * ib_modify_qp_is_ok - Check that the supplied attribute mask * contains all required attributes and no attributes not allowed for * the given QP state transition. * @cur_state: Current QP state * @next_state: Next QP state * @type: QP type * @mask: Mask of supplied QP attributes * * This function is a helper function that a low-level driver's * modify_qp method can use to validate the consumer's input. It * checks that cur_state and next_state are valid QP states, that a * transition from cur_state to next_state is allowed by the IB spec, * and that the attribute mask supplied is allowed for the transition. 
*/ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, enum ib_qp_type type, enum ib_qp_attr_mask mask); int ib_register_event_handler (struct ib_event_handler *event_handler); int ib_unregister_event_handler(struct ib_event_handler *event_handler); void ib_dispatch_event(struct ib_event *event); int ib_query_device(struct ib_device *device, struct ib_device_attr *device_attr); int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num); + +static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IB; +} + +static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & + (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); +} + +static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; +} + +static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; +} + +static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IWARP; +} + +static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num) +{ + return rdma_protocol_ib(device, port_num) || + rdma_protocol_roce(device, port_num); +} + +/** + * rdma_cap_ib_mad - Check if the port of a device supports Infiniband + * Management Datagrams. 
+ * @device: Device to check + * @port_num: Port number to check + * + * Management Datagrams (MAD) are a required part of the InfiniBand + * specification and are supported on all InfiniBand devices. A slightly + * extended version is also supported on OPA interfaces. + * + * Return: true if the port supports sending/receiving of MAD packets. + */ +static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_MAD; +} + +/** + * rdma_cap_opa_mad - Check if the port of device provides support for OPA + * Management Datagrams. + * @device: Device to check + * @port_num: Port number to check + * + * Intel OmniPath devices extend and/or replace the InfiniBand Management + * datagrams with their own versions. These OPA MADs share many but not all of + * the characteristics of InfiniBand MADs. + * + * OPA MADs differ in the following ways: + * + * 1) MADs are variable size up to 2K + * IBTA defined MADs remain fixed at 256 bytes + * 2) OPA SMPs must carry valid PKeys + * 3) OPA SMP packets are a different format + * + * Return: true if the port supports OPA MAD packet formats. + */ +static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num) +{ + return (device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_OPA_MAD) + == RDMA_CORE_CAP_OPA_MAD; +} + +/** + * rdma_cap_ib_smi - Check if the port of a device provides an Infiniband + * Subnet Management Agent (SMA) on the Subnet Management Interface (SMI). + * @device: Device to check + * @port_num: Port number to check + * + * Each InfiniBand node is required to provide a Subnet Management Agent + * that the subnet manager can access. Prior to the fabric being fully + * configured by the subnet manager, the SMA is accessed via a well known + * interface called the Subnet Management Interface (SMI). 
This interface + * uses directed route packets to communicate with the SM to get around the + * chicken and egg problem of the SM needing to know what's on the fabric + * in order to configure the fabric, and needing to configure the fabric in + * order to send packets to the devices on the fabric. These directed + * route packets do not need the fabric fully configured in order to reach + * their destination. The SMI is the only method allowed to send + * directed route packets on an InfiniBand fabric. + * + * Return: true if the port provides an SMI. + */ +static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SMI; +} + +/** + * rdma_cap_ib_cm - Check if the port of device has the capability Infiniband + * Communication Manager. + * @device: Device to check + * @port_num: Port number to check + * + * The InfiniBand Communication Manager is one of many pre-defined General + * Service Agents (GSA) that are accessed via the General Service + * Interface (GSI). Its role is to facilitate establishment of connections + * between nodes as well as other management related tasks for established + * connections. + * + * Return: true if the port supports an IB CM (this does not guarantee that + * a CM is actually running however). + */ +static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_CM; +} + +/** + * rdma_cap_iw_cm - Check if the port of device has the capability IWARP + * Communication Manager. + * @device: Device to check + * @port_num: Port number to check + * + * Similar to above, but specific to iWARP connections which have a different + * management protocol than InfiniBand. + * + * Return: true if the port supports an iWARP CM (this does not guarantee that + * a CM is actually running however). 
+ */ +static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IW_CM; +} + +/** + * rdma_cap_ib_sa - Check if the port of device has the capability Infiniband + * Subnet Administration. + * @device: Device to check + * @port_num: Port number to check + * + * An InfiniBand Subnet Administration (SA) service is a pre-defined General + * Service Agent (GSA) provided by the Subnet Manager (SM). On InfiniBand + * fabrics, devices should resolve routes to other hosts by contacting the + * SA to query the proper route. + * + * Return: true if the port should act as a client to the fabric Subnet + * Administration interface. This does not imply that the SA service is + * running locally. + */ +static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SA; +} + +/** + * rdma_cap_ib_mcast - Check if the port of device has the capability Infiniband + * Multicast. + * @device: Device to check + * @port_num: Port number to check + * + * InfiniBand multicast registration is more complex than normal IPv4 or + * IPv6 multicast registration. Each Host Channel Adapter must register + * with the Subnet Manager when it wishes to join a multicast group. It + * should do so only once regardless of how many queue pairs it subscribes + * to this group. And it should leave the group only after all queue pairs + * attached to the group have been detached. + * + * Return: true if the port must undertake the additional administrative + * overhead of registering/unregistering with the SM and tracking of the + * total number of queue pairs attached to the multicast group. 
+ */ +static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num) +{ + return rdma_cap_ib_sa(device, port_num); +} + +/** + * rdma_cap_af_ib - Check if the port of device has the capability + * Native Infiniband Address. + * @device: Device to check + * @port_num: Port number to check + * + * InfiniBand addressing uses a port's GUID + Subnet Prefix to make a default + * GID. RoCE uses a different mechanism, but still generates a GID via + * a prescribed mechanism and port specific data. + * + * Return: true if the port uses a GID address to identify devices on the + * network. + */ +static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_AF_IB; +} + +/** + * rdma_cap_eth_ah - Check if the port of device has the capability + * Ethernet Address Handle. + * @device: Device to check + * @port_num: Port number to check + * + * RoCE is InfiniBand over Ethernet, and it uses a well defined technique + * to fabricate GIDs over Ethernet/IP specific addresses native to the + * port. Normally, packet headers are generated by the sending host + * adapter, but when sending connectionless datagrams, we must manually + * inject the proper headers for the fabric we are communicating over. + * + * Return: true if we are running as a RoCE port and must force the + * addition of a Global Route Header built from our Ethernet Address + * Handle into our header list for connectionless packets. + */ +static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_ETH_AH; +} + +/** + * rdma_max_mad_size - Return the max MAD size required by this RDMA Port. + * + * @device: Device + * @port_num: Port number + * + * This MAD size includes the MAD headers and MAD payload. No other headers + * are included. + * + * Return the max MAD size required by the Port. 
Will return 0 if the port + * does not support MADs + */ +static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].max_mad_size; +} + +/* + * Check if the device supports READ W/ INVALIDATE. + */ +static inline bool rdma_cap_read_inv(struct ib_device *dev, u32 port_num) +{ + /* + * iWarp drivers must support READ W/ INVALIDATE. No other protocol + * has support for it yet. + */ + return rdma_protocol_iwarp(dev, port_num); +} int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid); int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey); int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify); int ib_find_gid(struct ib_device *device, union ib_gid *gid, u8 *port_num, u16 *index); int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index); /** * ib_alloc_pd - Allocates an unused protection domain. * @device: The device on which to allocate the protection domain. * * A protection domain object provides an association between QPs, shared * receive queues, address handles, memory regions, and memory windows. */ struct ib_pd *ib_alloc_pd(struct ib_device *device); /** * ib_dealloc_pd - Deallocates a protection domain. * @pd: The protection domain to deallocate. */ int ib_dealloc_pd(struct ib_pd *pd); /** * ib_create_ah - Creates an address handle for the given address vector. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); /** * ib_init_ah_from_wc - Initializes address handle attributes from a * work completion. 
* @device: Device on which the received message arrived. * @port_num: Port on which the received message arrived. * @wc: Work completion associated with the received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @ah_attr: Returned attributes that can be used when creating an address * handle for replying to the message. */ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, struct ib_grh *grh, struct ib_ah_attr *ah_attr); /** * ib_create_ah_from_wc - Creates an address handle associated with the * sender of the specified work completion. * @pd: The protection domain associated with the address handle. * @wc: Work completion information associated with a received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @port_num: The outbound port number to associate with the address. * * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, struct ib_grh *grh, u8 port_num); /** * ib_modify_ah - Modifies the address vector associated with an address * handle. * @ah: The address handle to modify. * @ah_attr: The new address vector attributes to associate with the * address handle. */ int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); /** * ib_query_ah - Queries the address vector associated with an address * handle. * @ah: The address handle to query. * @ah_attr: The address vector attributes associated with the address * handle. */ int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); /** * ib_destroy_ah - Destroys an address handle. * @ah: The address handle to destroy. 
*/ int ib_destroy_ah(struct ib_ah *ah); /** * ib_create_xrc_srq - Creates an XRC SRQ associated with the specified * protection domain, cq, and xrc domain. * @pd: The protection domain associated with the SRQ. * @xrc_cq: The cq to be associated with the XRC SRQ. * @xrcd: The XRC domain to be associated with the XRC SRQ. * @srq_init_attr: A list of initial attributes required to create the * XRC SRQ. If XRC SRQ creation succeeds, then the attributes are updated * to the actual capabilities of the created XRC SRQ. * * srq_attr->max_wr and srq_attr->max_sge are read to determine the * requested size of the XRC SRQ, and set to the actual values allocated * on return. If ib_create_xrc_srq() succeeds, then max_wr and max_sge * will always be at least as large as the requested values. */ struct ib_srq *ib_create_xrc_srq(struct ib_pd *pd, struct ib_cq *xrc_cq, struct ib_xrcd *xrcd, struct ib_srq_init_attr *srq_init_attr); /** * ib_create_srq - Creates a SRQ associated with the specified protection * domain. * @pd: The protection domain associated with the SRQ. * @srq_init_attr: A list of initial attributes required to create the * SRQ. If SRQ creation succeeds, then the attributes are updated to * the actual capabilities of the created SRQ. * * srq_attr->max_wr and srq_attr->max_sge are read to determine the * requested size of the SRQ, and set to the actual values allocated * on return. If ib_create_srq() succeeds, then max_wr and max_sge * will always be at least as large as the requested values. */ struct ib_srq *ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr); /** * ib_modify_srq - Modifies the attributes for the specified SRQ. * @srq: The SRQ to modify. * @srq_attr: On input, specifies the SRQ attributes to modify. On output, * the current values of selected SRQ attributes are returned. * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ * are being modified.
* * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or * IB_SRQ_LIMIT to set the SRQ's limit and request notification when * the number of receives queued drops below the limit. */ int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask); /** * ib_query_srq - Returns the attribute list and current values for the * specified SRQ. * @srq: The SRQ to query. * @srq_attr: The attributes of the specified SRQ. */ int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); /** * ib_destroy_srq - Destroys the specified SRQ. * @srq: The SRQ to destroy. */ int ib_destroy_srq(struct ib_srq *srq); /** * ib_post_srq_recv - Posts a list of work requests to the specified SRQ. * @srq: The SRQ to post the work request on. * @recv_wr: A list of work requests to post on the receive queue. * @bad_recv_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. */ static inline int ib_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr) { return srq->device->post_srq_recv(srq, recv_wr, bad_recv_wr); } /** * ib_create_qp - Creates a QP associated with the specified protection * domain. * @pd: The protection domain associated with the QP. * @qp_init_attr: A list of initial attributes required to create the * QP. If QP creation succeeds, then the attributes are updated to * the actual capabilities of the created QP. */ struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr); /** * ib_modify_qp - Modifies the attributes for the specified QP and then * transitions the QP to the given state. * @qp: The QP to modify. * @qp_attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @qp_attr_mask: A bit-mask used to specify which attributes of the QP * are being modified. 
*/ int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask); /** * ib_query_qp - Returns the attribute list and current values for the * specified QP. * @qp: The QP to query. * @qp_attr: The attributes of the specified QP. * @qp_attr_mask: A bit-mask used to select specific attributes to query. * @qp_init_attr: Additional attributes of the selected QP. * * The qp_attr_mask may be used to limit the query to gathering only the * selected attributes. */ int ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); /** * ib_destroy_qp - Destroys the specified QP. * @qp: The QP to destroy. */ int ib_destroy_qp(struct ib_qp *qp); /** * ib_open_qp - Obtain a reference to an existing sharable QP. * @xrcd - XRC domain * @qp_open_attr: Attributes identifying the QP to open. * * Returns a reference to a sharable QP. */ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, struct ib_qp_open_attr *qp_open_attr); /** * ib_close_qp - Release an external reference to a QP. * @qp: The QP handle to release * * The opened QP handle is released by the caller. The underlying * shared QP is not destroyed until all internal references are released. */ int ib_close_qp(struct ib_qp *qp); /** * ib_post_send - Posts a list of work requests to the send queue of * the specified QP. * @qp: The QP to post the work request on. * @send_wr: A list of work requests to post on the send queue. * @bad_send_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. * * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate * error is returned, the QP state shall not be affected, * ib_post_send() will return an immediate error after queueing any * earlier work requests in the list. 
*/ static inline int ib_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, struct ib_send_wr **bad_send_wr) { return qp->device->post_send(qp, send_wr, bad_send_wr); } /** * ib_post_recv - Posts a list of work requests to the receive queue of * the specified QP. * @qp: The QP to post the work request on. * @recv_wr: A list of work requests to post on the receive queue. * @bad_recv_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. */ static inline int ib_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr) { return qp->device->post_recv(qp, recv_wr, bad_recv_wr); } /* * IB_CQ_VECTOR_LEAST_ATTACHED: The constant specifies that * the CQ will be attached to the completion vector that has * the least number of CQs already attached to it. */ #define IB_CQ_VECTOR_LEAST_ATTACHED 0xffffffff /** * ib_create_cq - Creates a CQ on the specified device. * @device: The device on which to create the CQ. * @comp_handler: A user-specified callback that is invoked when a * completion event occurs on the CQ. * @event_handler: A user-specified callback that is invoked when an * asynchronous event not associated with a completion occurs on the CQ. * @cq_context: Context associated with the CQ returned to the user via * the associated completion and event handlers. * @cqe: The minimum size of the CQ. * @comp_vector - Completion vector used to signal completion events. * Must be >= 0 and < context->num_comp_vectors. * * Users can examine the cq structure to determine the actual CQ size. */ struct ib_cq *ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), void *cq_context, int cqe, int comp_vector); /** * ib_resize_cq - Modifies the capacity of the CQ. * @cq: The CQ to resize. * @cqe: The minimum size of the CQ. * * Users can examine the cq structure to determine the actual CQ size. 
*/ int ib_resize_cq(struct ib_cq *cq, int cqe); /** * ib_modify_cq - Modifies moderation params of the CQ * @cq: The CQ to modify. * @cq_count: number of CQEs that will trigger an event * @cq_period: max period of time in usec before triggering an event * */ int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); /** * ib_destroy_cq - Destroys the specified CQ. * @cq: The CQ to destroy. */ int ib_destroy_cq(struct ib_cq *cq); /** * ib_poll_cq - poll a CQ for completion(s) * @cq:the CQ being polled * @num_entries:maximum number of completions to return * @wc:array of at least @num_entries &struct ib_wc where completions * will be returned * * Poll a CQ for (possibly multiple) completions. If the return value * is < 0, an error occurred. If the return value is >= 0, it is the * number of completions returned. If the return value is * non-negative and < num_entries, then the CQ was emptied. */ static inline int ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) { return cq->device->poll_cq(cq, num_entries, wc); } /** * ib_peek_cq - Returns the number of unreaped completions currently * on the specified CQ. * @cq: The CQ to peek. * @wc_cnt: A minimum number of unreaped completions to check for. * * If the number of unreaped completions is greater than or equal to wc_cnt, * this function returns wc_cnt, otherwise, it returns the actual number of * unreaped completions. */ int ib_peek_cq(struct ib_cq *cq, int wc_cnt); /** * ib_req_notify_cq - Request completion notification on a CQ. * @cq: The CQ to generate an event for. * @flags: * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP * to request an event on the next solicited event or next work * completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS * may also be |ed in to request a hint about missed events, as * described below. 
* * Return Value: * < 0 means an error occurred while requesting notification * == 0 means notification was requested successfully, and if * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events * were missed and it is safe to wait for another event. In * this case is it guaranteed that any work completions added * to the CQ since the last CQ poll will trigger a completion * notification event. * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed * in. It means that the consumer must poll the CQ again to * make sure it is empty to avoid missing an event because of a * race between requesting notification and an entry being * added to the CQ. This return value means it is possible * (but not guaranteed) that a work completion has been added * to the CQ since the last poll without triggering a * completion notification event. */ static inline int ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags) { return cq->device->req_notify_cq(cq, flags); } /** * ib_req_ncomp_notif - Request completion notification when there are * at least the specified number of unreaped completions on the CQ. * @cq: The CQ to generate an event for. * @wc_cnt: The number of unreaped completions that should be on the * CQ before an event is generated. */ static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt) { return cq->device->req_ncomp_notif ? cq->device->req_ncomp_notif(cq, wc_cnt) : -ENOSYS; } /** * ib_get_dma_mr - Returns a memory region for system memory that is * usable for DMA. * @pd: The protection domain associated with the memory region. * @mr_access_flags: Specifies the memory access rights. * * Note that the ib_dma_*() functions defined below must be used * to create/destroy addresses used with the Lkey or Rkey returned * by ib_get_dma_mr(). 
*/ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags); /** * ib_dma_mapping_error - check a DMA addr for error * @dev: The device for which the dma_addr was created * @dma_addr: The DMA address to check */ static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr) { if (dev->dma_ops) return dev->dma_ops->mapping_error(dev, dma_addr); return dma_mapping_error(dev->dma_device, dma_addr); } /** * ib_dma_map_single - Map a kernel virtual address to DMA address * @dev: The device for which the dma_addr is to be created * @cpu_addr: The kernel virtual address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline u64 ib_dma_map_single(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction) { if (dev->dma_ops) return dev->dma_ops->map_single(dev, cpu_addr, size, direction); return dma_map_single(dev->dma_device, cpu_addr, size, direction); } /** * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single() * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (dev->dma_ops) dev->dma_ops->unmap_single(dev, addr, size, direction); else dma_unmap_single(dev->dma_device, addr, size, direction); } static inline u64 ib_dma_map_single_attrs(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction, struct dma_attrs *attrs) { return dma_map_single_attrs(dev->dma_device, cpu_addr, size, direction, attrs); } static inline void ib_dma_unmap_single_attrs(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction, struct dma_attrs *attrs) { return dma_unmap_single_attrs(dev->dma_device, addr, size, direction, attrs); } /** * ib_dma_map_page - Map a physical page to DMA address * @dev: The 
device for which the dma_addr is to be created * @page: The page to be mapped * @offset: The offset within the page * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline u64 ib_dma_map_page(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction) { if (dev->dma_ops) return dev->dma_ops->map_page(dev, page, offset, size, direction); return dma_map_page(dev->dma_device, page, offset, size, direction); } /** * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page() * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (dev->dma_ops) dev->dma_ops->unmap_page(dev, addr, size, direction); else dma_unmap_page(dev->dma_device, addr, size, direction); } /** * ib_dma_map_sg - Map a scatter/gather list to DMA addresses * @dev: The device for which the DMA addresses are to be created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline int ib_dma_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { if (dev->dma_ops) return dev->dma_ops->map_sg(dev, sg, nents, direction); return dma_map_sg(dev->dma_device, sg, nents, direction); } /** * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses * @dev: The device for which the DMA addresses were created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline void ib_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { if (dev->dma_ops) dev->dma_ops->unmap_sg(dev, sg, nents, direction); else 
dma_unmap_sg(dev->dma_device, sg, nents, direction); } static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *attrs) { return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, attrs); } static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *attrs) { dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, attrs); } /** * ib_sg_dma_address - Return the DMA address from a scatter/gather entry * @dev: The device for which the DMA addresses were created * @sg: The scatter/gather entry */ static inline u64 ib_sg_dma_address(struct ib_device *dev, struct scatterlist *sg) { if (dev->dma_ops) return dev->dma_ops->dma_address(dev, sg); return sg_dma_address(sg); } /** * ib_sg_dma_len - Return the DMA length from a scatter/gather entry * @dev: The device for which the DMA addresses were created * @sg: The scatter/gather entry */ static inline unsigned int ib_sg_dma_len(struct ib_device *dev, struct scatterlist *sg) { if (dev->dma_ops) return dev->dma_ops->dma_len(dev, sg); return sg_dma_len(sg); } /** * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @dir: The direction of the DMA */ static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { if (dev->dma_ops) dev->dma_ops->sync_single_for_cpu(dev, addr, size, dir); else dma_sync_single_for_cpu(dev->dma_device, addr, size, dir); } /** * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @dir: The direction of the DMA */ static inline void ib_dma_sync_single_for_device(struct 
ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { if (dev->dma_ops) dev->dma_ops->sync_single_for_device(dev, addr, size, dir); else dma_sync_single_for_device(dev->dma_device, addr, size, dir); } /** * ib_dma_alloc_coherent - Allocate memory and map it for DMA * @dev: The device for which the DMA address is requested * @size: The size of the region to allocate in bytes * @dma_handle: A pointer for returning the DMA address of the region * @flag: memory allocator flags */ static inline void *ib_dma_alloc_coherent(struct ib_device *dev, size_t size, u64 *dma_handle, gfp_t flag) { if (dev->dma_ops) return dev->dma_ops->alloc_coherent(dev, size, dma_handle, flag); else { dma_addr_t handle; void *ret; ret = dma_alloc_coherent(dev->dma_device, size, &handle, flag); *dma_handle = handle; return ret; } } /** * ib_dma_free_coherent - Free memory allocated by ib_dma_alloc_coherent() * @dev: The device for which the DMA addresses were allocated * @size: The size of the region * @cpu_addr: the address returned by ib_dma_alloc_coherent() * @dma_handle: the DMA address returned by ib_dma_alloc_coherent() */ static inline void ib_dma_free_coherent(struct ib_device *dev, size_t size, void *cpu_addr, u64 dma_handle) { if (dev->dma_ops) dev->dma_ops->free_coherent(dev, size, cpu_addr, dma_handle); else dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle); } /** * ib_reg_phys_mr - Prepares a virtually addressed memory region for use * by an HCA. * @pd: The protection domain associated assigned to the registered region. * @phys_buf_array: Specifies a list of physical buffers to use in the * memory region. * @num_phys_buf: Specifies the size of the phys_buf_array. * @mr_access_flags: Specifies the memory access rights. * @iova_start: The offset of the region's starting I/O virtual address. 
*/ struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, struct ib_phys_buf *phys_buf_array, int num_phys_buf, int mr_access_flags, u64 *iova_start); /** * ib_rereg_phys_mr - Modifies the attributes of an existing memory region. * Conceptually, this call performs the functions deregister memory region * followed by register physical memory region. Where possible, * resources are reused instead of deallocated and reallocated. * @mr: The memory region to modify. * @mr_rereg_mask: A bit-mask used to indicate which of the following * properties of the memory region are being modified. * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies * the new protection domain to associate with the memory region, * otherwise, this parameter is ignored. * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this * field specifies a list of physical buffers to use in the new * translation, otherwise, this parameter is ignored. * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this * field specifies the size of the phys_buf_array, otherwise, this * parameter is ignored. * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this * field specifies the new memory access rights, otherwise, this * parameter is ignored. * @iova_start: The offset of the region's starting I/O virtual address. */ int ib_rereg_phys_mr(struct ib_mr *mr, int mr_rereg_mask, struct ib_pd *pd, struct ib_phys_buf *phys_buf_array, int num_phys_buf, int mr_access_flags, u64 *iova_start); /** * ib_query_mr - Retrieves information about a specific memory region. * @mr: The memory region to retrieve information about. * @mr_attr: The attributes of the specified memory region. */ int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); /** * ib_dereg_mr - Deregisters a memory region and removes it from the * HCA translation table. * @mr: The memory region to deregister.
*/ int ib_dereg_mr(struct ib_mr *mr); /** * ib_alloc_fast_reg_mr - Allocates memory region usable with the * IB_WR_FAST_REG_MR send work request. * @pd: The protection domain associated with the region. * @max_page_list_len: requested max physical buffer list length to be * used with fast register work requests for this MR. */ struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len); /** * ib_alloc_fast_reg_page_list - Allocates a page list array * @device - ib device pointer. * @page_list_len - size of the page list array to be allocated. * * This allocates and returns a struct ib_fast_reg_page_list * and a * page_list array that is at least page_list_len in size. The actual * size is returned in max_page_list_len. The caller is responsible * for initializing the contents of the page_list array before posting * a send work request with the IB_WR_FAST_REG_MR opcode. * * The page_list array entries must be translated using one of the * ib_dma_*() functions just like the addresses passed to * ib_map_phys_fmr(). Once the ib_post_send() is issued, the struct * ib_fast_reg_page_list must not be modified by the caller until the * IB_WR_FAST_REG_MR work request completes. */ struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list( struct ib_device *device, int page_list_len); /** * ib_free_fast_reg_page_list - Deallocates a previously allocated * page list array. * @page_list - struct ib_fast_reg_page_list pointer to be deallocated. */ void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); /** * ib_update_fast_reg_key - updates the key portion of the fast_reg MR * R_Key and L_Key. * @mr - struct ib_mr pointer to be updated. * @newkey - new key to be used. */ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) { mr->lkey = (mr->lkey & 0xffffff00) | newkey; mr->rkey = (mr->rkey & 0xffffff00) | newkey; } /** * ib_alloc_mw - Allocates a memory window. * @pd: The protection domain associated with the memory window.
*/ struct ib_mw *ib_alloc_mw(struct ib_pd *pd); /** * ib_bind_mw - Posts a work request to the send queue of the specified * QP, which binds the memory window to the given address range and * remote access attributes. * @qp: QP to post the bind work request on. * @mw: The memory window to bind. * @mw_bind: Specifies information about the memory window, including * its address range, remote access rights, and associated memory region. */ static inline int ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind) { /* XXX reference counting in corresponding MR? */ return mw->device->bind_mw ? mw->device->bind_mw(qp, mw, mw_bind) : -ENOSYS; } /** * ib_dealloc_mw - Deallocates a memory window. * @mw: The memory window to deallocate. */ int ib_dealloc_mw(struct ib_mw *mw); /** * ib_alloc_fmr - Allocates an unmapped fast memory region. * @pd: The protection domain associated with the unmapped region. * @mr_access_flags: Specifies the memory access rights. * @fmr_attr: Attributes of the unmapped region. * * A fast memory region must be mapped before it can be used as part of * a work request. */ struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, int mr_access_flags, struct ib_fmr_attr *fmr_attr); /** * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region. * @fmr: The fast memory region to associate with the pages. * @page_list: An array of physical pages to map to the fast memory region. * @list_len: The number of pages in page_list. * @iova: The I/O virtual address to use with the mapped region. */ static inline int ib_map_phys_fmr(struct ib_fmr *fmr, u64 *page_list, int list_len, u64 iova) { return fmr->device->map_phys_fmr(fmr, page_list, list_len, iova); } /** * ib_unmap_fmr - Removes the mapping from a list of fast memory regions. * @fmr_list: A linked list of fast memory regions to unmap. */ int ib_unmap_fmr(struct list_head *fmr_list); /** * ib_dealloc_fmr - Deallocates a fast memory region.
* @fmr: The fast memory region to deallocate. */ int ib_dealloc_fmr(struct ib_fmr *fmr); /** * ib_attach_mcast - Attaches the specified QP to a multicast group. * @qp: QP to attach to the multicast group. The QP must be type * IB_QPT_UD. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. * * In order to send and receive multicast packets, subnet * administration must have created the multicast group and configured * the fabric appropriately. The port associated with the specified * QP must also be a member of the multicast group. */ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); /** * ib_detach_mcast - Detaches the specified QP from a multicast group. * @qp: QP to detach from the multicast group. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. */ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); /** * ib_alloc_xrcd - Allocates an XRC domain. * @device: The device on which to allocate the XRC domain. */ struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device); /** * ib_dealloc_xrcd - Deallocates an XRC domain. * @xrcd: The XRC domain to deallocate. */ int ib_dealloc_xrcd(struct ib_xrcd *xrcd); int ib_attach_flow(struct ib_qp *qp, struct ib_flow_spec *spec, int priority); int ib_detach_flow(struct ib_qp *qp, struct ib_flow_spec *spec, int priority); #endif /* IB_VERBS_H */ Index: stable/10 =================================================================== --- stable/10 (revision 325610) +++ stable/10 (revision 325611) Property changes on: stable/10 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r324792